Pārlūkot izejas kodu

Got rid of few more unnecessary files

alex.kopachov 2 gadi atpakaļ
vecāks
revīzija
7b042adb01

+ 0 - 1531
lib/wolfssl/wolfcrypt/src/aes_asm.asm

@@ -1,1531 +0,0 @@
-; /* aes_asm.asm
-;  *
-;  * Copyright (C) 2006-2023 wolfSSL Inc.
-;  *
-;  * This file is part of wolfSSL.
-;  *
-;  * wolfSSL is free software; you can redistribute it and/or modify
-;  * it under the terms of the GNU General Public License as published by
-;  * the Free Software Foundation; either version 2 of the License, or
-;  * (at your option) any later version.
-;  *
-;  * wolfSSL is distributed in the hope that it will be useful,
-;  * but WITHOUT ANY WARRANTY; without even the implied warranty of
-;  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;  * GNU General Public License for more details.
-;  *
-;  * You should have received a copy of the GNU General Public License
-;  * along with this program; if not, write to the Free Software
-;  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
-;  */
-
-
-
-;
-;
-;  /* See Intel Advanced Encryption Standard (AES) Instructions Set White Paper
-;   * by Shay Gueron, Intel Mobility Group, Israel Development Center, Israel
-;   */
-;
-;   /* This file is in intel asm syntax, see .s for at&t syntax */
-;
-
-
-; Derive fips_version from the assembler defines: it stays 0 when HAVE_FIPS
-; is not defined, becomes 1 when only HAVE_FIPS is defined, and takes the
-; value of HAVE_FIPS_VERSION when that is defined as well.
-fips_version = 0
-IFDEF HAVE_FIPS
-  fips_version = 1
-  IFDEF HAVE_FIPS_VERSION
-    fips_version = HAVE_FIPS_VERSION
-  ENDIF
-ENDIF
-
-; Choose the segment the routines below are assembled into: the segment
-; aliased to ".fipsA$h" (class 'CODE') when fips_version >= 2, else _text.
-IF fips_version GE 2
-  fipsAh SEGMENT ALIAS(".fipsA$h") 'CODE'
-ELSE
-  _text SEGMENT
-ENDIF
-
-
-;	/*
-;	AES_CBC_encrypt(const unsigned char* in,
-;	                unsigned char* out,
-;	                unsigned char ivec[16],
-;	                unsigned long length,
-;	                const unsigned char* KS,
-;	                int nr)
-;
-;	CBC encryption, one 16-byte block per loop iteration.  A length that
-;	is not a multiple of 16 is rounded up to whole blocks; a length of 0
-;	returns without touching in, out or ivec.
-;
-;	Entry registers (Microsoft x64): rcx = in, rdx = out, r8 = ivec,
-;	r9 = length, [rsp+40] = KS, [rsp+48] = nr.  They are moved into the
-;	layout listed under the PROC line before the main code runs.
-;
-;	movdqa/pxor/aesenc are used with memory operands on ivec, in and KS,
-;	so those three pointers must be 16-byte aligned; out is written with
-;	movdqu and may be unaligned.  The last ciphertext block is left in
-;	xmm1 only; nothing is stored back through ivec here.
-;	*/
-AES_CBC_encrypt PROC
-;#	parameter	1:	rdi	- in
-;#	parameter	2:	rsi	- out
-;#	parameter	3:	rdx	- ivec
-;#	parameter	4:	rcx	- length
-;#	parameter	5:	r8	- KS
-;#	parameter	6:	r9d	- nr
-
-; save rdi and rsi to rax and r11, restore before ret
-	mov	rax,rdi
-	mov	r11,rsi
-
-; convert to what we had for at&t convention
-	mov	rdi,rcx
-	mov	rsi,rdx
-	mov	rdx,r8
-	mov	rcx,r9
-	mov	r8,[rsp+40]
-	mov	r9d,[rsp+48]
-
-; rcx = number of 16-byte blocks; a partial trailing block counts as one
-	mov	r10,rcx
-	shr	rcx,4
-	shl	r10,60			; ZF set when length is a multiple of 16
-	je	NO_PARTS
-	add	rcx,1
-NO_PARTS:
-; Zero blocks: leave now.  Without this check the dec/jne loop below
-; would wrap rcx around and keep running far past both buffers.
-	test	rcx,rcx
-	je	CBC_ENC_END
-	sub	rsi,16			; pre-bias: the loop adds 16 before storing
-	movdqa	xmm1,[rdx]		; xmm1 = chaining value, starts as the IV
-LOOP_1:
-	pxor	xmm1,[rdi]		; chain: previous ciphertext ^ plaintext
-	pxor	xmm1,[r8]		; round key 0
-	add	rsi,16
-	add	rdi,16
-	cmp	r9d,12			; flags consumed by the first jb below;
-					; aesenc/movdqa leave flags untouched
-	aesenc	xmm1,16[r8]
-	aesenc	xmm1,32[r8]
-	aesenc	xmm1,48[r8]
-	aesenc	xmm1,64[r8]
-	aesenc	xmm1,80[r8]
-	aesenc	xmm1,96[r8]
-	aesenc	xmm1,112[r8]
-	aesenc	xmm1,128[r8]
-	aesenc	xmm1,144[r8]
-	movdqa	xmm2,160[r8]		; last round key when nr < 12
-	jb	LAST
-	cmp	r9d,14			; flags consumed by the second jb below
-
-	aesenc	xmm1,160[r8]
-	aesenc	xmm1,176[r8]
-	movdqa	xmm2,192[r8]		; last round key when nr < 14
-	jb	LAST
-	aesenc	xmm1,192[r8]
-	aesenc	xmm1,208[r8]
-	movdqa	xmm2,224[r8]		; last round key otherwise
-LAST:
-	dec	rcx			; ZF consumed by jne; the two SSE
-					; instructions in between keep flags
-	aesenclast	xmm1,xmm2
-	movdqu	[rsi],xmm1
-	jne	LOOP_1
-CBC_ENC_END:
-	; restore non volatile rdi,rsi
-	mov	rdi,rax
-	mov	rsi,r11
-	ret
-AES_CBC_encrypt ENDP
-
-
-; void AES_CBC_decrypt_by4(const unsigned char* in,
-;                          unsigned char* out,
-;                          unsigned char ivec[16],
-;                          unsigned long length,
-;                          const unsigned char* KS,
-;                          int nr)
-;
-; CBC decryption that handles four 16-byte blocks per main-loop iteration
-; and then finishes any left-over blocks one at a time.  A length that is
-; not a multiple of 16 is rounded up to whole blocks.
-; Arguments arrive in rcx, rdx, r8, r9, [rsp+40], [rsp+48] and are moved
-; into the register layout listed below before the main code runs.
-; The running chaining value lives in xmm5; nothing is stored back
-; through ivec in this routine.
-AES_CBC_decrypt_by4 PROC
-; parameter 1: rdi
-; parameter 2: rsi
-; parameter 3: rdx
-; parameter 4: rcx
-; parameter 5: r8
-; parameter 6: r9d
-
-        ; save rdi and rsi to rax and r11, restore before ret
-        mov         rax, rdi
-        mov         r11, rsi
-        ; convert to what we had for at&t convention
-        mov         rdi, rcx
-        mov         rsi, rdx
-        mov         rdx, r8
-        mov         rcx,r9
-        mov         r8, [rsp+40]
-        mov         r9d, [rsp+48]
-        ; on microsoft xmm6-xmm15 are non volatile,
-        ; let's save on stack and restore at end
-        sub         rsp, 8+8*16  ; 8 = align stack , 8 xmm6-12,15 16 bytes each
-        movdqa      [rsp+0], xmm6
-        movdqa      [rsp+16], xmm7
-        movdqa      [rsp+32], xmm8
-        movdqa      [rsp+48], xmm9
-        movdqa      [rsp+64], xmm10
-        movdqa      [rsp+80], xmm11
-        movdqa      [rsp+96], xmm12
-        movdqa      [rsp+112], xmm15
-        ; back to our original code, more or less
-        ; rcx = number of 16-byte blocks, a partial trailing block counts as one
-        mov         r10, rcx
-        shr         rcx, 4
-        shl         r10, 60      ; ZF set when length is a multiple of 16
-        je          DNO_PARTS_4
-        add         rcx, 1
-DNO_PARTS_4:
-        ; r10 = block count mod 4 (single blocks), rcx = block count / 4 (groups)
-        mov         r10, rcx
-        shl         r10, 62
-        shr         r10, 62
-        shr         rcx, 2       ; ZF from this shift is tested by the je below
-        movdqu      xmm5, [rdx]  ; xmm5 = chaining value, starts as the IV (no flag change)
-        je          DREMAINDER_4
-        sub         rsi, 64      ; pre-bias: the loop adds 64 before storing
-DLOOP_4:
-        ; load four ciphertext blocks
-        movdqu      xmm1, [rdi]
-        movdqu      xmm2, 16[rdi]
-        movdqu      xmm3, 32[rdi]
-        movdqu      xmm4, 48[rdi]
-        ; keep copies of the ciphertext for the CBC chaining xor after decryption
-        movdqa      xmm6, xmm1
-        movdqa      xmm7, xmm2
-        movdqa      xmm8, xmm3
-        movdqa      xmm15, xmm4
-        ; round keys at offsets 0..48
-        movdqa      xmm9, [r8]
-        movdqa      xmm10, 16[r8]
-        movdqa      xmm11, 32[r8]
-        movdqa      xmm12, 48[r8]
-        pxor        xmm1, xmm9
-        pxor        xmm2, xmm9
-        pxor        xmm3, xmm9
-        pxor        xmm4, xmm9
-        aesdec      xmm1, xmm10
-        aesdec      xmm2, xmm10
-        aesdec      xmm3, xmm10
-        aesdec      xmm4, xmm10
-        aesdec      xmm1, xmm11
-        aesdec      xmm2, xmm11
-        aesdec      xmm3, xmm11
-        aesdec      xmm4, xmm11
-        aesdec      xmm1, xmm12
-        aesdec      xmm2, xmm12
-        aesdec      xmm3, xmm12
-        aesdec      xmm4, xmm12
-        ; round keys at offsets 64..112
-        movdqa      xmm9, 64[r8]
-        movdqa      xmm10, 80[r8]
-        movdqa      xmm11, 96[r8]
-        movdqa      xmm12, 112[r8]
-        aesdec      xmm1, xmm9
-        aesdec      xmm2, xmm9
-        aesdec      xmm3, xmm9
-        aesdec      xmm4, xmm9
-        aesdec      xmm1, xmm10
-        aesdec      xmm2, xmm10
-        aesdec      xmm3, xmm10
-        aesdec      xmm4, xmm10
-        aesdec      xmm1, xmm11
-        aesdec      xmm2, xmm11
-        aesdec      xmm3, xmm11
-        aesdec      xmm4, xmm11
-        aesdec      xmm1, xmm12
-        aesdec      xmm2, xmm12
-        aesdec      xmm3, xmm12
-        aesdec      xmm4, xmm12
-        ; round keys at offsets 128, 144; xmm11 = key at 160, the last
-        ; round key when nr < 12
-        movdqa      xmm9, 128[r8]
-        movdqa      xmm10, 144[r8]
-        movdqa      xmm11, 160[r8]
-        cmp         r9d, 12      ; tested by the jb after the aesdec run (flags untouched)
-        aesdec      xmm1, xmm9
-        aesdec      xmm2, xmm9
-        aesdec      xmm3, xmm9
-        aesdec      xmm4, xmm9
-        aesdec      xmm1, xmm10
-        aesdec      xmm2, xmm10
-        aesdec      xmm3, xmm10
-        aesdec      xmm4, xmm10
-        jb          DLAST_4
-        ; nr >= 12: two more rounds; xmm11 = key at 192, last when nr < 14
-        movdqa      xmm9, 160[r8]
-        movdqa      xmm10, 176[r8]
-        movdqa      xmm11, 192[r8]
-        cmp         r9d, 14      ; tested by the jb after the aesdec run
-        aesdec      xmm1, xmm9
-        aesdec      xmm2, xmm9
-        aesdec      xmm3, xmm9
-        aesdec      xmm4, xmm9
-        aesdec      xmm1, xmm10
-        aesdec      xmm2, xmm10
-        aesdec      xmm3, xmm10
-        aesdec      xmm4, xmm10
-        jb          DLAST_4
-        ; nr >= 14: two more rounds; xmm11 = key at 224 is the last round key
-        movdqa      xmm9, 192[r8]
-        movdqa      xmm10, 208[r8]
-        movdqa      xmm11, 224[r8]
-        aesdec      xmm1, xmm9
-        aesdec      xmm2, xmm9
-        aesdec      xmm3, xmm9
-        aesdec      xmm4, xmm9
-        aesdec      xmm1, xmm10
-        aesdec      xmm2, xmm10
-        aesdec      xmm3, xmm10
-        aesdec      xmm4, xmm10
-DLAST_4:
-        add         rdi, 64
-        add         rsi, 64
-        dec         rcx          ; ZF tested by jne DLOOP_4; nothing in between changes flags
-        aesdeclast  xmm1, xmm11
-        aesdeclast  xmm2, xmm11
-        aesdeclast  xmm3, xmm11
-        aesdeclast  xmm4, xmm11
-        ; CBC chaining: xor each block with the ciphertext block before it
-        ; (xmm5 = chaining value carried in, xmm6..xmm8 = copies taken above)
-        pxor        xmm1, xmm5
-        pxor        xmm2, xmm6
-        pxor        xmm3, xmm7
-        pxor        xmm4, xmm8
-        movdqu      [rsi], xmm1
-        movdqu      16[rsi], xmm2
-        movdqu      32[rsi], xmm3
-        movdqu      48[rsi], xmm4
-        movdqa      xmm5, xmm15  ; fourth ciphertext block chains into the next group
-        jne         DLOOP_4
-        add         rsi, 64      ; step past the last group written
-DREMAINDER_4:
-        cmp         r10, 0
-        je          DEND_4
-DLOOP_4_2:
-        ; one block per iteration, r10 iterations
-        movdqu      xmm1, [rdi]
-        movdqa      xmm15, xmm1  ; keep the ciphertext for chaining
-        add         rdi, 16
-        pxor        xmm1, [r8]
-        movdqu      xmm2, 160[r8] ; last round key when nr < 12
-        cmp         r9d, 12
-        aesdec      xmm1, 16[r8]
-        aesdec      xmm1, 32[r8]
-        aesdec      xmm1, 48[r8]
-        aesdec      xmm1, 64[r8]
-        aesdec      xmm1, 80[r8]
-        aesdec      xmm1, 96[r8]
-        aesdec      xmm1, 112[r8]
-        aesdec      xmm1, 128[r8]
-        aesdec      xmm1, 144[r8]
-        jb          DLAST_4_2
-        movdqu      xmm2, 192[r8] ; last round key when nr < 14
-        cmp         r9d, 14
-        aesdec      xmm1, 160[r8]
-        aesdec      xmm1, 176[r8]
-        jb          DLAST_4_2
-        movdqu      xmm2, 224[r8] ; last round key otherwise
-        aesdec      xmm1, 192[r8]
-        aesdec      xmm1, 208[r8]
-DLAST_4_2:
-        aesdeclast  xmm1, xmm2
-        pxor        xmm1, xmm5
-        movdqa      xmm5, xmm15
-        movdqu      [rsi], xmm1
-        add         rsi, 16
-        dec         r10
-        jne         DLOOP_4_2
-DEND_4:
-        ; restore non volatile rdi,rsi
-        mov         rdi, rax
-        mov         rsi, r11
-        ; restore non volatile xmms from stack
-        movdqa      xmm6, [rsp+0]
-        movdqa      xmm7, [rsp+16]
-        movdqa      xmm8, [rsp+32]
-        movdqa      xmm9, [rsp+48]
-        movdqa      xmm10, [rsp+64]
-        movdqa      xmm11, [rsp+80]
-        movdqa      xmm12, [rsp+96]
-        movdqa      xmm15, [rsp+112]
-        add         rsp, 8+8*16 ; 8 = align stack , 8 xmm6-12,15 16 bytes each
-        ret
-AES_CBC_decrypt_by4 ENDP
-
-
-; void AES_CBC_decrypt_by6(const unsigned char *in,
-;                          unsigned char *out,
-;                          unsigned char ivec[16],
-;                          unsigned long length,
-;                          const unsigned char *KS,
-;                          int nr)
-;
-; CBC decryption that handles six 16-byte blocks per main-loop iteration
-; and then finishes any left-over blocks one at a time.  A length that is
-; not a multiple of 16 is rounded up to whole blocks.
-; Arguments arrive in rcx, rdx, r8, r9, [rsp+40], [rsp+48] and are moved
-; into the register layout listed below before the main code runs.
-; The running chaining value lives in xmm7; nothing is stored back
-; through ivec in this routine.
-; Only volatile general-purpose registers are used as scratch (rdi and
-; rsi are saved and restored); rbx and r12-r14 are not touched.
-AES_CBC_decrypt_by6 PROC
-; parameter 1: rdi - in
-; parameter 2: rsi - out
-; parameter 3: rdx - ivec
-; parameter 4: rcx - length
-; parameter 5: r8  - KS
-; parameter 6: r9d - nr
-
-        ; save rdi and rsi to rax and r11, restore before ret
-        mov         rax, rdi
-        mov         r11, rsi
-        ; convert to what we had for at&t convention
-        mov         rdi, rcx
-        mov         rsi, rdx
-        mov         rdx, r8
-        mov         rcx, r9
-        mov         r8, [rsp+40]
-        mov         r9d, [rsp+48]
-        ; on microsoft xmm6-xmm15 are non volatile,
-        ; let's save on stack and restore at end
-        sub         rsp, 8+9*16  ; 8 = align stack , 9 xmm6-14 16 bytes each
-        movdqa      [rsp+0], xmm6
-        movdqa      [rsp+16], xmm7
-        movdqa      [rsp+32], xmm8
-        movdqa      [rsp+48], xmm9
-        movdqa      [rsp+64], xmm10
-        movdqa      [rsp+80], xmm11
-        movdqa      [rsp+96], xmm12
-        movdqa      [rsp+112], xmm13
-        movdqa      [rsp+128], xmm14
-        ; back to our original code, more or less
-        ; rcx = number of 16-byte blocks, a partial trailing block counts as one
-        mov         r10, rcx
-        shr         rcx, 4
-        shl         r10, 60      ; ZF set when length is a multiple of 16
-        je          DNO_PARTS_6
-        add         rcx, 1
-DNO_PARTS_6:
-        ; Split the block count into groups of six (rcx) and left-over
-        ; single blocks (r10).  div needs rax and rdx, so the IV is loaded
-        ; first (rdx is not needed afterwards) and the saved rdi is parked
-        ; in rcx while rax holds the dividend.
-        movdqu      xmm7, [rdx]  ; xmm7 = chaining value, starts as the IV
-        xchg        rax, rcx     ; rax = block count, rcx = caller's rdi
-        xor         edx, edx     ; div divides rdx:rax
-        mov         r10d, 6
-        div         r10          ; rax = count / 6, rdx = count mod 6
-        xchg        rax, rcx     ; rcx = groups of six, rax = caller's rdi
-        mov         r10, rdx     ; r10 = left-over single blocks
-        test        rcx, rcx
-        je          DREMAINDER_6
-        sub         rsi, 96      ; pre-bias: the loop adds 96 before storing
-DLOOP_6:
-        ; load six ciphertext blocks
-        movdqu      xmm1, [rdi]
-        movdqu      xmm2, 16[rdi]
-        movdqu      xmm3, 32[rdi]
-        movdqu      xmm4, 48[rdi]
-        movdqu      xmm5, 64[rdi]
-        movdqu      xmm6, 80[rdi]
-        ; round keys at offsets 0..48
-        movdqa      xmm8, [r8]
-        movdqa      xmm9, 16[r8]
-        movdqa      xmm10, 32[r8]
-        movdqa      xmm11, 48[r8]
-        pxor        xmm1, xmm8
-        pxor        xmm2, xmm8
-        pxor        xmm3, xmm8
-        pxor        xmm4, xmm8
-        pxor        xmm5, xmm8
-        pxor        xmm6, xmm8
-        aesdec      xmm1, xmm9
-        aesdec      xmm2, xmm9
-        aesdec      xmm3, xmm9
-        aesdec      xmm4, xmm9
-        aesdec      xmm5, xmm9
-        aesdec      xmm6, xmm9
-        aesdec      xmm1, xmm10
-        aesdec      xmm2, xmm10
-        aesdec      xmm3, xmm10
-        aesdec      xmm4, xmm10
-        aesdec      xmm5, xmm10
-        aesdec      xmm6, xmm10
-        aesdec      xmm1, xmm11
-        aesdec      xmm2, xmm11
-        aesdec      xmm3, xmm11
-        aesdec      xmm4, xmm11
-        aesdec      xmm5, xmm11
-        aesdec      xmm6, xmm11
-        ; round keys at offsets 64..112
-        movdqa      xmm8, 64[r8]
-        movdqa      xmm9, 80[r8]
-        movdqa      xmm10, 96[r8]
-        movdqa      xmm11, 112[r8]
-        aesdec      xmm1, xmm8
-        aesdec      xmm2, xmm8
-        aesdec      xmm3, xmm8
-        aesdec      xmm4, xmm8
-        aesdec      xmm5, xmm8
-        aesdec      xmm6, xmm8
-        aesdec      xmm1, xmm9
-        aesdec      xmm2, xmm9
-        aesdec      xmm3, xmm9
-        aesdec      xmm4, xmm9
-        aesdec      xmm5, xmm9
-        aesdec      xmm6, xmm9
-        aesdec      xmm1, xmm10
-        aesdec      xmm2, xmm10
-        aesdec      xmm3, xmm10
-        aesdec      xmm4, xmm10
-        aesdec      xmm5, xmm10
-        aesdec      xmm6, xmm10
-        aesdec      xmm1, xmm11
-        aesdec      xmm2, xmm11
-        aesdec      xmm3, xmm11
-        aesdec      xmm4, xmm11
-        aesdec      xmm5, xmm11
-        aesdec      xmm6, xmm11
-        ; round keys at offsets 128, 144; xmm10 = key at 160, the last
-        ; round key when nr < 12
-        movdqa      xmm8, 128[r8]
-        movdqa      xmm9, 144[r8]
-        movdqa      xmm10, 160[r8]
-        cmp         r9d, 12      ; tested by the jb after the aesdec run (flags untouched)
-        aesdec      xmm1, xmm8
-        aesdec      xmm2, xmm8
-        aesdec      xmm3, xmm8
-        aesdec      xmm4, xmm8
-        aesdec      xmm5, xmm8
-        aesdec      xmm6, xmm8
-        aesdec      xmm1, xmm9
-        aesdec      xmm2, xmm9
-        aesdec      xmm3, xmm9
-        aesdec      xmm4, xmm9
-        aesdec      xmm5, xmm9
-        aesdec      xmm6, xmm9
-        jb          DLAST_6
-        ; nr >= 12: two more rounds; xmm10 = key at 192, last when nr < 14
-        movdqa      xmm8, 160[r8]
-        movdqa      xmm9, 176[r8]
-        movdqa      xmm10, 192[r8]
-        cmp         r9d, 14      ; tested by the jb after the aesdec run
-        aesdec      xmm1, xmm8
-        aesdec      xmm2, xmm8
-        aesdec      xmm3, xmm8
-        aesdec      xmm4, xmm8
-        aesdec      xmm5, xmm8
-        aesdec      xmm6, xmm8
-        aesdec      xmm1, xmm9
-        aesdec      xmm2, xmm9
-        aesdec      xmm3, xmm9
-        aesdec      xmm4, xmm9
-        aesdec      xmm5, xmm9
-        aesdec      xmm6, xmm9
-        jb          DLAST_6
-        ; nr >= 14: two more rounds; xmm10 = key at 224 is the last round key
-        movdqa      xmm8, 192[r8]
-        movdqa      xmm9, 208[r8]
-        movdqa      xmm10, 224[r8]
-        aesdec      xmm1, xmm8
-        aesdec      xmm2, xmm8
-        aesdec      xmm3, xmm8
-        aesdec      xmm4, xmm8
-        aesdec      xmm5, xmm8
-        aesdec      xmm6, xmm8
-        aesdec      xmm1, xmm9
-        aesdec      xmm2, xmm9
-        aesdec      xmm3, xmm9
-        aesdec      xmm4, xmm9
-        aesdec      xmm5, xmm9
-        aesdec      xmm6, xmm9
-DLAST_6:
-        add         rsi, 96
-        aesdeclast  xmm1, xmm10
-        aesdeclast  xmm2, xmm10
-        aesdeclast  xmm3, xmm10
-        aesdeclast  xmm4, xmm10
-        aesdeclast  xmm5, xmm10
-        aesdeclast  xmm6, xmm10
-        ; re-read the six ciphertext blocks (rdi has not advanced yet) for
-        ; the CBC chaining xor; this happens before any store to out
-        movdqu      xmm8, [rdi]
-        movdqu      xmm9, 16[rdi]
-        movdqu      xmm10, 32[rdi]
-        movdqu      xmm11, 48[rdi]
-        movdqu      xmm12, 64[rdi]
-        movdqu      xmm13, 80[rdi]
-        pxor        xmm1, xmm7
-        pxor        xmm2, xmm8
-        pxor        xmm3, xmm9
-        pxor        xmm4, xmm10
-        pxor        xmm5, xmm11
-        pxor        xmm6, xmm12
-        movdqu      xmm7, xmm13  ; sixth ciphertext block chains into the next group
-        movdqu      [rsi], xmm1
-        movdqu      16[rsi], xmm2
-        movdqu      32[rsi], xmm3
-        movdqu      48[rsi], xmm4
-        movdqu      64[rsi], xmm5
-        movdqu      80[rsi], xmm6
-        add         rdi, 96
-        dec         rcx
-        jne         DLOOP_6
-        add         rsi, 96      ; step past the last group written
-DREMAINDER_6:
-        cmp         r10, 0
-        je          DEND_6
-DLOOP_6_2:
-        ; one block per iteration, r10 iterations
-        movdqu      xmm1, [rdi]
-        movdqa      xmm10, xmm1  ; keep the ciphertext for chaining
-        add         rdi, 16
-        pxor        xmm1, [r8]
-        movdqu      xmm2, 160[r8] ; last round key when nr < 12
-        cmp         r9d, 12
-        aesdec      xmm1, 16[r8]
-        aesdec      xmm1, 32[r8]
-        aesdec      xmm1, 48[r8]
-        aesdec      xmm1, 64[r8]
-        aesdec      xmm1, 80[r8]
-        aesdec      xmm1, 96[r8]
-        aesdec      xmm1, 112[r8]
-        aesdec      xmm1, 128[r8]
-        aesdec      xmm1, 144[r8]
-        jb          DLAST_6_2
-        movdqu      xmm2, 192[r8] ; last round key when nr < 14
-        cmp         r9d, 14
-        aesdec      xmm1, 160[r8]
-        aesdec      xmm1, 176[r8]
-        jb          DLAST_6_2
-        movdqu      xmm2, 224[r8] ; last round key otherwise
-        aesdec      xmm1, 192[r8]
-        aesdec      xmm1, 208[r8]
-DLAST_6_2:
-        aesdeclast  xmm1, xmm2
-        pxor        xmm1, xmm7
-        movdqa      xmm7, xmm10
-        movdqu      [rsi], xmm1
-        add         rsi, 16
-        dec         r10
-        jne         DLOOP_6_2
-DEND_6:
-        ; restore non volatile rdi,rsi
-        mov         rdi, rax
-        mov         rsi, r11
-        ; restore non volatile xmms from stack
-        movdqa      xmm6, [rsp+0]
-        movdqa      xmm7, [rsp+16]
-        movdqa      xmm8, [rsp+32]
-        movdqa      xmm9, [rsp+48]
-        movdqa      xmm10, [rsp+64]
-        movdqa      xmm11, [rsp+80]
-        movdqa      xmm12, [rsp+96]
-        movdqa      xmm13, [rsp+112]
-        movdqa      xmm14, [rsp+128]
-        add         rsp, 8+9*16 ; 8 = align stack , 9 xmm6-14 16 bytes each
-        ret
-AES_CBC_decrypt_by6 ENDP
-
-
-; void AES_CBC_decrypt_by8(const unsigned char *in,
-;                          unsigned char *out,
-;                          unsigned char ivec[16],
-;                          unsigned long length,
-;                          const unsigned char *KS,
-;                          int nr)
-;
-; CBC decryption that handles eight 16-byte blocks per main-loop
-; iteration and then finishes any left-over blocks one at a time.  A
-; length that is not a multiple of 16 is rounded up to whole blocks.
-; Arguments arrive in rcx, rdx, r8, r9, [rsp+40], [rsp+48] and are moved
-; into the register layout listed below before the main code runs.
-; The running chaining value lives in xmm9; nothing is stored back
-; through ivec in this routine.
-AES_CBC_decrypt_by8 PROC
-; parameter 1: rdi - in
-; parameter 2: rsi - out
-; parameter 3: rdx - ivec
-; parameter 4: rcx - length
-; parameter 5: r8  - KS
-; parameter 6: r9d - nr
-
-        ; save rdi and rsi to rax and r11, restore before ret
-        mov         rax, rdi
-        mov         r11, rsi
-        ; convert to what we had for at&t convention
-        mov         rdi, rcx
-        mov         rsi, rdx
-        mov         rdx, r8
-        mov         rcx,r9
-        mov         r8, [rsp+40]
-        mov         r9d, [rsp+48]
-        ; on microsoft xmm6-xmm15 are non volatile,
-        ; let's save on stack and restore at end
-        sub         rsp, 8+8*16  ; 8 = align stack , 8 xmm6-13 16 bytes each
-        movdqa      [rsp+0], xmm6
-        movdqa      [rsp+16], xmm7
-        movdqa      [rsp+32], xmm8
-        movdqa      [rsp+48], xmm9
-        movdqa      [rsp+64], xmm10
-        movdqa      [rsp+80], xmm11
-        movdqa      [rsp+96], xmm12
-        movdqa      [rsp+112], xmm13
-        ; back to our original code, more or less
-        ; rcx = number of 16-byte blocks, a partial trailing block counts as one
-        mov         r10, rcx
-        shr         rcx, 4
-        shl         r10, 60      ; ZF set when length is a multiple of 16
-        je          DNO_PARTS_8
-        add         rcx, 1
-DNO_PARTS_8:
-        ; r10 = block count mod 8 (single blocks), rcx = block count / 8 (groups)
-        mov         r10, rcx
-        shl         r10, 61
-        shr         r10, 61
-        shr         rcx, 3       ; ZF from this shift is tested by the je below
-        movdqu      xmm9, [rdx]  ; xmm9 = chaining value, starts as the IV (no flag change)
-        je          DREMAINDER_8
-        sub         rsi, 128     ; pre-bias: the loop adds 128 before storing
-DLOOP_8:
-        ; load eight ciphertext blocks
-        movdqu      xmm1, [rdi]
-        movdqu      xmm2, 16[rdi]
-        movdqu      xmm3, 32[rdi]
-        movdqu      xmm4, 48[rdi]
-        movdqu      xmm5, 64[rdi]
-        movdqu      xmm6, 80[rdi]
-        movdqu      xmm7, 96[rdi]
-        movdqu      xmm8, 112[rdi]
-        ; round keys at offsets 0..48
-        movdqa      xmm10, [r8]
-        movdqa      xmm11, 16[r8]
-        movdqa      xmm12, 32[r8]
-        movdqa      xmm13, 48[r8]
-        pxor        xmm1, xmm10
-        pxor        xmm2, xmm10
-        pxor        xmm3, xmm10
-        pxor        xmm4, xmm10
-        pxor        xmm5, xmm10
-        pxor        xmm6, xmm10
-        pxor        xmm7, xmm10
-        pxor        xmm8, xmm10
-        aesdec      xmm1, xmm11
-        aesdec      xmm2, xmm11
-        aesdec      xmm3, xmm11
-        aesdec      xmm4, xmm11
-        aesdec      xmm5, xmm11
-        aesdec      xmm6, xmm11
-        aesdec      xmm7, xmm11
-        aesdec      xmm8, xmm11
-        aesdec      xmm1, xmm12
-        aesdec      xmm2, xmm12
-        aesdec      xmm3, xmm12
-        aesdec      xmm4, xmm12
-        aesdec      xmm5, xmm12
-        aesdec      xmm6, xmm12
-        aesdec      xmm7, xmm12
-        aesdec      xmm8, xmm12
-        aesdec      xmm1, xmm13
-        aesdec      xmm2, xmm13
-        aesdec      xmm3, xmm13
-        aesdec      xmm4, xmm13
-        aesdec      xmm5, xmm13
-        aesdec      xmm6, xmm13
-        aesdec      xmm7, xmm13
-        aesdec      xmm8, xmm13
-        ; round keys at offsets 64..112
-        movdqa      xmm10, 64[r8]
-        movdqa      xmm11, 80[r8]
-        movdqa      xmm12, 96[r8]
-        movdqa      xmm13, 112[r8]
-        aesdec      xmm1, xmm10
-        aesdec      xmm2, xmm10
-        aesdec      xmm3, xmm10
-        aesdec      xmm4, xmm10
-        aesdec      xmm5, xmm10
-        aesdec      xmm6, xmm10
-        aesdec      xmm7, xmm10
-        aesdec      xmm8, xmm10
-        aesdec      xmm1, xmm11
-        aesdec      xmm2, xmm11
-        aesdec      xmm3, xmm11
-        aesdec      xmm4, xmm11
-        aesdec      xmm5, xmm11
-        aesdec      xmm6, xmm11
-        aesdec      xmm7, xmm11
-        aesdec      xmm8, xmm11
-        aesdec      xmm1, xmm12
-        aesdec      xmm2, xmm12
-        aesdec      xmm3, xmm12
-        aesdec      xmm4, xmm12
-        aesdec      xmm5, xmm12
-        aesdec      xmm6, xmm12
-        aesdec      xmm7, xmm12
-        aesdec      xmm8, xmm12
-        aesdec      xmm1, xmm13
-        aesdec      xmm2, xmm13
-        aesdec      xmm3, xmm13
-        aesdec      xmm4, xmm13
-        aesdec      xmm5, xmm13
-        aesdec      xmm6, xmm13
-        aesdec      xmm7, xmm13
-        aesdec      xmm8, xmm13
-        ; round keys at offsets 128, 144; xmm12 = key at 160, the last
-        ; round key when nr < 12
-        movdqa      xmm10, 128[r8]
-        movdqa      xmm11, 144[r8]
-        movdqa      xmm12, 160[r8]
-        cmp         r9d, 12      ; tested by the jb after the aesdec run (flags untouched)
-        aesdec      xmm1, xmm10
-        aesdec      xmm2, xmm10
-        aesdec      xmm3, xmm10
-        aesdec      xmm4, xmm10
-        aesdec      xmm5, xmm10
-        aesdec      xmm6, xmm10
-        aesdec      xmm7, xmm10
-        aesdec      xmm8, xmm10
-        aesdec      xmm1, xmm11
-        aesdec      xmm2, xmm11
-        aesdec      xmm3, xmm11
-        aesdec      xmm4, xmm11
-        aesdec      xmm5, xmm11
-        aesdec      xmm6, xmm11
-        aesdec      xmm7, xmm11
-        aesdec      xmm8, xmm11
-        jb          DLAST_8
-        ; nr >= 12: two more rounds; xmm12 = key at 192, last when nr < 14
-        movdqa      xmm10, 160[r8]
-        movdqa      xmm11, 176[r8]
-        movdqa      xmm12, 192[r8]
-        cmp         r9d, 14      ; tested by the jb after the aesdec run
-        aesdec      xmm1, xmm10
-        aesdec      xmm2, xmm10
-        aesdec      xmm3, xmm10
-        aesdec      xmm4, xmm10
-        aesdec      xmm5, xmm10
-        aesdec      xmm6, xmm10
-        aesdec      xmm7, xmm10
-        aesdec      xmm8, xmm10
-        aesdec      xmm1, xmm11
-        aesdec      xmm2, xmm11
-        aesdec      xmm3, xmm11
-        aesdec      xmm4, xmm11
-        aesdec      xmm5, xmm11
-        aesdec      xmm6, xmm11
-        aesdec      xmm7, xmm11
-        aesdec      xmm8, xmm11
-        jb          DLAST_8
-        ; nr >= 14: two more rounds; xmm12 = key at 224 is the last round key
-        movdqa      xmm10, 192[r8]
-        movdqa      xmm11, 208[r8]
-        movdqa      xmm12, 224[r8]
-        aesdec      xmm1, xmm10
-        aesdec      xmm2, xmm10
-        aesdec      xmm3, xmm10
-        aesdec      xmm4, xmm10
-        aesdec      xmm5, xmm10
-        aesdec      xmm6, xmm10
-        aesdec      xmm7, xmm10
-        aesdec      xmm8, xmm10
-        aesdec      xmm1, xmm11
-        aesdec      xmm2, xmm11
-        aesdec      xmm3, xmm11
-        aesdec      xmm4, xmm11
-        aesdec      xmm5, xmm11
-        aesdec      xmm6, xmm11
-        aesdec      xmm7, xmm11
-        aesdec      xmm8, xmm11
-DLAST_8:
-        add         rsi, 128
-        aesdeclast  xmm1, xmm12
-        aesdeclast  xmm2, xmm12
-        aesdeclast  xmm3, xmm12
-        aesdeclast  xmm4, xmm12
-        aesdeclast  xmm5, xmm12
-        aesdeclast  xmm6, xmm12
-        aesdeclast  xmm7, xmm12
-        aesdeclast  xmm8, xmm12
-        ; re-read the ciphertext (rdi has not advanced yet) for the CBC
-        ; chaining xor, in two batches; this happens before any store to out
-        movdqu      xmm10, [rdi]
-        movdqu      xmm11, 16[rdi]
-        movdqu      xmm12, 32[rdi]
-        movdqu      xmm13, 48[rdi]
-        pxor        xmm1, xmm9
-        pxor        xmm2, xmm10
-        pxor        xmm3, xmm11
-        pxor        xmm4, xmm12
-        pxor        xmm5, xmm13
-        movdqu      xmm10, 64[rdi]
-        movdqu      xmm11, 80[rdi]
-        movdqu      xmm12, 96[rdi]
-        movdqu      xmm9, 112[rdi] ; eighth ciphertext block chains into the next group
-        pxor        xmm6, xmm10
-        pxor        xmm7, xmm11
-        pxor        xmm8, xmm12
-        movdqu      [rsi], xmm1
-        movdqu      16[rsi], xmm2
-        movdqu      32[rsi], xmm3
-        movdqu      48[rsi], xmm4
-        movdqu      64[rsi], xmm5
-        movdqu      80[rsi], xmm6
-        movdqu      96[rsi], xmm7
-        movdqu      112[rsi], xmm8
-        add         rdi, 128
-        dec         rcx
-        jne         DLOOP_8
-        add         rsi, 128     ; step past the last group written
-DREMAINDER_8:
-        cmp         r10, 0 
-        je          DEND_8
-DLOOP_8_2:
-        ; one block per iteration, r10 iterations
-        movdqu      xmm1, [rdi]
-        movdqa      xmm10, xmm1  ; keep the ciphertext for chaining
-        add         rdi, 16
-        pxor        xmm1, [r8]
-        movdqu      xmm2, 160[r8] ; last round key when nr < 12
-        cmp         r9d, 12
-        aesdec      xmm1, 16[r8]
-        aesdec      xmm1, 32[r8]
-        aesdec      xmm1, 48[r8]
-        aesdec      xmm1, 64[r8]
-        aesdec      xmm1, 80[r8]
-        aesdec      xmm1, 96[r8]
-        aesdec      xmm1, 112[r8]
-        aesdec      xmm1, 128[r8]
-        aesdec      xmm1, 144[r8]
-        jb          DLAST_8_2
-        movdqu      xmm2, 192[r8] ; last round key when nr < 14
-        cmp         r9d, 14
-        aesdec      xmm1, 160[r8]
-        aesdec      xmm1, 176[r8]
-        jb          DLAST_8_2
-        movdqu      xmm2, 224[r8] ; last round key otherwise
-        aesdec      xmm1, 192[r8]
-        aesdec      xmm1, 208[r8]
-DLAST_8_2:
-        aesdeclast  xmm1, xmm2
-        pxor        xmm1, xmm9
-        movdqa      xmm9, xmm10
-        movdqu      [rsi], xmm1
-        add         rsi, 16
-        dec         r10
-        jne         DLOOP_8_2
-DEND_8:
-        ; restore non volatile rdi,rsi
-        mov         rdi, rax
-        mov         rsi, r11
-        ; restore non volatile xmms from stack
-        movdqa      xmm6, [rsp+0]
-        movdqa      xmm7, [rsp+16]
-        movdqa      xmm8, [rsp+32]
-        movdqa      xmm9, [rsp+48]
-        movdqa      xmm10, [rsp+64]
-        movdqa      xmm11, [rsp+80]
-        movdqa      xmm12, [rsp+96]
-        movdqa      xmm13, [rsp+112]
-        add         rsp, 8+8*16 ; 8 = align stack , 8 xmm6-13 16 bytes each
-        ret
-AES_CBC_decrypt_by8 ENDP
-
-
-;	/*
-;	AES_ECB_encrypt(const unsigned char* in,
-;	                unsigned char* out,
-;	                unsigned long length,
-;	                const unsigned char* KS,
-;	                int nr)
-;
-;	ECB encryption that handles four 16-byte blocks per main-loop
-;	iteration and then finishes any left-over blocks one at a time.  A
-;	length that is not a multiple of 16 is rounded up to whole blocks.
-;	Arguments arrive in rcx, rdx, r8, r9, [rsp+40] and are moved into
-;	the register layout listed under the PROC line.
-;	*/
-;	.	globl	AES_ECB_encrypt
-AES_ECB_encrypt PROC
-;#	parameter	1:	rdi	- in
-;#	parameter	2:	rsi	- out
-;#	parameter	3:	rdx	- length
-;#	parameter	4:	rcx	- KS
-;#	parameter	5:	r8d	- nr
-
-; save rdi and rsi to rax and r11, restore before ret
-	mov rax,rdi
-	mov r11,rsi
-
-; convert to what we had for at&t convention
-    mov rdi,rcx
-	mov rsi,rdx
-	mov rdx,r8
-	mov rcx,r9
-	mov r8d,[rsp+40]
-
-; on microsoft xmm6-xmm15 are non volatile, let's save on stack and restore at end
-	sub rsp,8+4*16  ; 8 = align stack , 4 xmm9-12, 16 bytes each
-	movdqa [rsp+0], xmm9
-	movdqa [rsp+16], xmm10
-	movdqa [rsp+32], xmm11
-	movdqa [rsp+48], xmm12
-
-
-; rdx = number of 16-byte blocks, a partial trailing block counts as one
-	mov	r10,rdx
-	shr	rdx,4
-	shl	r10,60		; ZF set when length is a multiple of 16
-	je	EECB_NO_PARTS_4
-	add	rdx,1
-EECB_NO_PARTS_4:
-; r10 = block count mod 4 (single blocks), rdx = block count / 4 (groups)
-	mov	r10,rdx
-	shl	r10,62
-	shr	r10,62
-	shr	rdx,2
-	je	EECB_REMAINDER_4
-	sub	rsi,64		; pre-bias: the loop adds 64 before storing
-EECB_LOOP_4:
-; load four plaintext blocks and the round keys at offsets 0..48
-	movdqu  xmm1,[rdi]
-	movdqu	xmm2,16[rdi]
-	movdqu	xmm3,32[rdi]
-	movdqu	xmm4,48[rdi]
-	movdqa  xmm9,[rcx]
-	movdqa	xmm10,16[rcx]
-	movdqa	xmm11,32[rcx]
-	movdqa	xmm12,48[rcx]
-	pxor	xmm1,xmm9
-	pxor	xmm2,xmm9
-	pxor	xmm3,xmm9
-	pxor	xmm4,xmm9
-	aesenc	xmm1,xmm10
-	aesenc	xmm2,xmm10
-	aesenc	xmm3,xmm10
-	aesenc	xmm4,xmm10
-	aesenc	xmm1,xmm11
-	aesenc	xmm2,xmm11
-	aesenc	xmm3,xmm11
-	aesenc	xmm4,xmm11
-	aesenc	xmm1,xmm12
-	aesenc	xmm2,xmm12
-	aesenc	xmm3,xmm12
-	aesenc	xmm4,xmm12
-; round keys at offsets 64..112
-	movdqa	xmm9,64[rcx]
-	movdqa	xmm10,80[rcx]
-	movdqa	xmm11,96[rcx]
-	movdqa	xmm12,112[rcx]
-	aesenc	xmm1,xmm9
-	aesenc	xmm2,xmm9
-	aesenc	xmm3,xmm9
-	aesenc	xmm4,xmm9
-	aesenc	xmm1,xmm10
-	aesenc	xmm2,xmm10
-	aesenc	xmm3,xmm10
-	aesenc	xmm4,xmm10
-	aesenc	xmm1,xmm11
-	aesenc	xmm2,xmm11
-	aesenc	xmm3,xmm11
-	aesenc	xmm4,xmm11
-	aesenc	xmm1,xmm12
-	aesenc	xmm2,xmm12
-	aesenc	xmm3,xmm12
-	aesenc	xmm4,xmm12
-; round keys at offsets 128, 144; xmm11 = key at 160, the last round
-; key when nr < 12
-	movdqa	xmm9,128[rcx]
-	movdqa	xmm10,144[rcx]
-	movdqa	xmm11,160[rcx]
-	cmp	r8d,12		; tested by the jb after the aesenc run (flags untouched)
-	aesenc	xmm1,xmm9
-	aesenc	xmm2,xmm9
-	aesenc	xmm3,xmm9
-	aesenc	xmm4,xmm9
-	aesenc	xmm1,xmm10
-	aesenc	xmm2,xmm10
-	aesenc	xmm3,xmm10
-	aesenc	xmm4,xmm10
-	jb	EECB_LAST_4
-; nr >= 12: two more rounds; xmm11 = key at 192, last when nr < 14
-	movdqa	xmm9,160[rcx]
-	movdqa	xmm10,176[rcx]
-	movdqa	xmm11,192[rcx]
-	cmp	r8d,14		; tested by the jb after the aesenc run
-	aesenc	xmm1,xmm9
-	aesenc	xmm2,xmm9
-	aesenc	xmm3,xmm9
-	aesenc	xmm4,xmm9
-	aesenc	xmm1,xmm10
-	aesenc	xmm2,xmm10
-	aesenc	xmm3,xmm10
-	aesenc	xmm4,xmm10
-	jb	EECB_LAST_4
-; nr >= 14: two more rounds; xmm11 = key at 224 is the last round key
-	movdqa	xmm9,192[rcx]
-	movdqa	xmm10,208[rcx]
-	movdqa	xmm11,224[rcx]
-	aesenc	xmm1,xmm9
-	aesenc	xmm2,xmm9
-	aesenc	xmm3,xmm9
-	aesenc	xmm4,xmm9
-	aesenc	xmm1,xmm10
-	aesenc	xmm2,xmm10
-	aesenc	xmm3,xmm10
-	aesenc	xmm4,xmm10
-EECB_LAST_4:
-	add	rdi,64
-	add	rsi,64
-	dec	rdx		; ZF tested by jne EECB_LOOP_4; nothing in between changes flags
-	aesenclast	xmm1,xmm11
-	aesenclast	xmm2,xmm11
-	aesenclast	xmm3,xmm11
-	aesenclast	xmm4,xmm11
-	movdqu	[rsi],xmm1
-	movdqu	16[rsi],xmm2
-	movdqu	32[rsi],xmm3
-	movdqu	48[rsi],xmm4
-	jne	EECB_LOOP_4
-	add	rsi,64		; step past the last group written
-EECB_REMAINDER_4:
-	cmp	r10,0
-	je	EECB_END_4
-EECB_LOOP_4_2:
-; one block per iteration, r10 iterations
-	movdqu  xmm1,[rdi]
-	add	rdi,16
-	pxor	xmm1,[rcx]
-	movdqu	xmm2,160[rcx]	; last round key when nr < 12
-	aesenc	xmm1,16[rcx]
-	aesenc	xmm1,32[rcx]
-	aesenc	xmm1,48[rcx]
-	aesenc	xmm1,64[rcx]
-	aesenc	xmm1,80[rcx]
-	aesenc	xmm1,96[rcx]
-	aesenc	xmm1,112[rcx]
-	aesenc	xmm1,128[rcx]
-	aesenc	xmm1,144[rcx]
-	cmp	r8d,12
-	jb	EECB_LAST_4_2
-	movdqu	xmm2,192[rcx]	; last round key when nr < 14
-	aesenc	xmm1,160[rcx]
-	aesenc	xmm1,176[rcx]
-	cmp	r8d,14
-	jb	EECB_LAST_4_2
-	movdqu	xmm2,224[rcx]	; last round key otherwise
-	aesenc	xmm1,192[rcx]
-	aesenc	xmm1,208[rcx]
-EECB_LAST_4_2:
-	aesenclast	xmm1,xmm2
-	movdqu	[rsi],xmm1
-	add	rsi,16
-	dec	r10
-	jne	EECB_LOOP_4_2
-EECB_END_4:
-	; restore non volatile rdi,rsi
-	mov rdi,rax
-	mov rsi,r11
-	; restore non volatile xmms from stack
-	movdqa xmm9, [rsp+0]
-	movdqa xmm10, [rsp+16]
-	movdqa xmm11, [rsp+32]
-	movdqa xmm12, [rsp+48]
-	add rsp,8+4*16 ; 8 = align stack , 4 xmm9-12 16 bytes each
-	ret
-AES_ECB_encrypt ENDP
-
-;	/*
-;	AES_ECB_decrypt[const	,unsigned	char*in
-;	unsigned	,char*out
-;	unsigned	,long	length
-;	const	,unsigned	char*KS
-;	int	nr]
-;	*/
-;	.	globl	AES_ECB_decrypt
-AES_ECB_decrypt PROC
-;#	parameter	1:	rdi = in      (arrives in rcx, copied below)
-;#	parameter	2:	rsi = out     (arrives in rdx, copied below)
-;#	parameter	3:	rdx = length  (arrives in r8, copied below)
-;#	parameter	4:	rcx = KS      (arrives in r9, copied below)
-;#	parameter	5:	r8d = nr      (loaded from the stack at [rsp+40])
-
-; rdi and rsi are parked in rax and r11 and put back just before ret
-	mov rax,rdi
-	mov r11,rsi
-
-; move the incoming argument registers into the registers the AT&T-syntax version used
-	mov rdi,rcx	; rdi = in
-	mov rsi,rdx	; rsi = out
-	mov rdx,r8	; rdx = length in bytes
-	mov rcx,r9	; rcx = KS (round keys, 16 bytes apart)
-	mov r8d,[rsp+40]	; r8d = nr, fifth argument read from the caller's stack
-
-; on Microsoft x64 xmm6-xmm15 are non-volatile: save the ones used here on the stack, restore at the end
-	sub rsp,8+4*16  ; 8 = align stack, 4 slots for xmm9-xmm12, 16 bytes each
-	movdqa [rsp+0], xmm9
-	movdqa [rsp+16], xmm10
-	movdqa [rsp+32], xmm11
-	movdqa [rsp+48], xmm12
-
-	mov	r10,rdx	; r10 = length, used only to test the low 4 bits
-	shr	rdx,4	; rdx = length / 16 = number of whole blocks
-	shl	r10,60	; ZF set when length is a multiple of 16
-	je	DECB_NO_PARTS_4
-	add	rdx,1	; partial trailing block: round the block count up
-DECB_NO_PARTS_4:
-	mov	r10,rdx
-	shl	r10,62
-	shr	r10,62	; r10 = block count mod 4 (handled one at a time later)
-	shr	rdx,2	; rdx = number of 4-block groups
-	je	DECB_REMAINDER_4	; no full group: go straight to the single-block loop
-	sub	rsi,64	; pre-bias out: the loop adds 64 before it stores
-DECB_LOOP_4:
-	movdqu  xmm1,[rdi]	; load four input blocks (unaligned loads)
-	movdqu	xmm2,16[rdi]
-	movdqu	xmm3,32[rdi]
-	movdqu	xmm4,48[rdi]
-	movdqa  xmm9,[rcx]	; round keys at offsets 0..48; movdqa needs KS 16-byte aligned
-	movdqa	xmm10,16[rcx]
-	movdqa	xmm11,32[rcx]
-	movdqa	xmm12,48[rcx]
-	pxor	xmm1,xmm9	; xor each block with the key at offset 0
-	pxor	xmm2,xmm9
-	pxor	xmm3,xmm9
-	pxor	xmm4,xmm9
-	aesdec	xmm1,xmm10	; round with key at offset 16
-	aesdec	xmm2,xmm10
-	aesdec	xmm3,xmm10
-	aesdec	xmm4,xmm10
-	aesdec	xmm1,xmm11	; round with key at offset 32
-	aesdec	xmm2,xmm11
-	aesdec	xmm3,xmm11
-	aesdec	xmm4,xmm11
-	aesdec	xmm1,xmm12	; round with key at offset 48
-	aesdec	xmm2,xmm12
-	aesdec	xmm3,xmm12
-	aesdec	xmm4,xmm12
-	movdqa	xmm9,64[rcx]	; next four round keys, offsets 64..112
-	movdqa	xmm10,80[rcx]
-	movdqa	xmm11,96[rcx]
-	movdqa	xmm12,112[rcx]
-	aesdec	xmm1,xmm9
-	aesdec	xmm2,xmm9
-	aesdec	xmm3,xmm9
-	aesdec	xmm4,xmm9
-	aesdec	xmm1,xmm10
-	aesdec	xmm2,xmm10
-	aesdec	xmm3,xmm10
-	aesdec	xmm4,xmm10
-	aesdec	xmm1,xmm11
-	aesdec	xmm2,xmm11
-	aesdec	xmm3,xmm11
-	aesdec	xmm4,xmm11
-	aesdec	xmm1,xmm12
-	aesdec	xmm2,xmm12
-	aesdec	xmm3,xmm12
-	aesdec	xmm4,xmm12
-	movdqa	xmm9,128[rcx]
-	movdqa	xmm10,144[rcx]
-	movdqa	xmm11,160[rcx]	; final-round key when nr < 12
-	cmp	r8d,12	; result is consumed by the jb after the next eight aesdec
-	aesdec	xmm1,xmm9
-	aesdec	xmm2,xmm9
-	aesdec	xmm3,xmm9
-	aesdec	xmm4,xmm9
-	aesdec	xmm1,xmm10
-	aesdec	xmm2,xmm10
-	aesdec	xmm3,xmm10
-	aesdec	xmm4,xmm10
-	jb	DECB_LAST_4	; nr < 12: finish with xmm11 = key at offset 160
-	movdqa	xmm9,160[rcx]
-	movdqa	xmm10,176[rcx]
-	movdqa	xmm11,192[rcx]	; final-round key when 12 <= nr < 14
-	cmp	r8d,14	; result is consumed by the jb after the next eight aesdec
-	aesdec	xmm1,xmm9
-	aesdec	xmm2,xmm9
-	aesdec	xmm3,xmm9
-	aesdec	xmm4,xmm9
-	aesdec	xmm1,xmm10
-	aesdec	xmm2,xmm10
-	aesdec	xmm3,xmm10
-	aesdec	xmm4,xmm10
-	jb	DECB_LAST_4	; nr < 14: finish with xmm11 = key at offset 192
-	movdqa	xmm9,192[rcx]
-	movdqa	xmm10,208[rcx]
-	movdqa	xmm11,224[rcx]	; final-round key when nr >= 14
-	aesdec	xmm1,xmm9
-	aesdec	xmm2,xmm9
-	aesdec	xmm3,xmm9
-	aesdec	xmm4,xmm9
-	aesdec	xmm1,xmm10
-	aesdec	xmm2,xmm10
-	aesdec	xmm3,xmm10
-	aesdec	xmm4,xmm10
-DECB_LAST_4:
-	add	rdi,64	; advance in past the four blocks just read
-	add	rsi,64	; advance out (undoes the pre-bias on the first pass)
-	dec	rdx	; one group done; ZF is consumed by the jne after the stores
-	aesdeclast	xmm1,xmm11	; final round with the key selected above
-	aesdeclast	xmm2,xmm11
-	aesdeclast	xmm3,xmm11
-	aesdeclast	xmm4,xmm11
-	movdqu	[rsi],xmm1	; store four output blocks (unaligned stores)
-	movdqu	16[rsi],xmm2
-	movdqu	32[rsi],xmm3
-	movdqu	48[rsi],xmm4
-	jne	DECB_LOOP_4	; more 4-block groups remain
-	add	rsi,64	; step out past the last group written
-DECB_REMAINDER_4:
-	cmp	r10,0	; r10 = leftover blocks (0..3)
-	je	DECB_END_4
-DECB_LOOP_4_2:
-	movdqu  xmm1,[rdi]	; one input block
-	add	rdi,16
-	pxor	xmm1,[rcx]	; xor with the key at offset 0
-	movdqu	xmm2,160[rcx]	; final-round key when nr < 12
-	cmp	r8d,12	; result is consumed by the jb after the nine aesdec below
-	aesdec	xmm1,16[rcx]
-	aesdec	xmm1,32[rcx]
-	aesdec	xmm1,48[rcx]
-	aesdec	xmm1,64[rcx]
-	aesdec	xmm1,80[rcx]
-	aesdec	xmm1,96[rcx]
-	aesdec	xmm1,112[rcx]
-	aesdec	xmm1,128[rcx]
-	aesdec	xmm1,144[rcx]
-	jb	DECB_LAST_4_2	; nr < 12
-	cmp	r8d,14	; result is consumed by the jb after the next two aesdec
-	movdqu	xmm2,192[rcx]	; final-round key when 12 <= nr < 14
-	aesdec	xmm1,160[rcx]
-	aesdec	xmm1,176[rcx]
-	jb	DECB_LAST_4_2	; nr < 14
-	movdqu	xmm2,224[rcx]	; final-round key when nr >= 14
-	aesdec	xmm1,192[rcx]
-	aesdec	xmm1,208[rcx]
-DECB_LAST_4_2:
-	aesdeclast	xmm1,xmm2	; final round
-	movdqu	[rsi],xmm1	; store one output block
-	add	rsi,16
-	dec	r10	; one leftover block done
-	jne	DECB_LOOP_4_2
-DECB_END_4:
-	; restore non-volatile rdi,rsi from the copies made on entry
-	mov rdi,rax
-	mov rsi,r11
-	; restore non-volatile xmm registers from the stack
-	movdqa xmm9, [rsp+0]
-	movdqa xmm10, [rsp+16]
-	movdqa xmm11, [rsp+32]
-	movdqa xmm12, [rsp+48]
-	add rsp,8+4*16 ; release the save area: 8 = align stack, 4 slots of 16 bytes for xmm9-xmm12
-	ret
-AES_ECB_decrypt ENDP
-
-
-
-;	/*
-;	void AES_128_Key_Expansion(const unsigned char* userkey,
-;	                           unsigned char* key_schedule)
-;	*/
-;	.	align	16,0x90
-;	.	globl	AES_128_Key_Expansion
-AES_128_Key_Expansion PROC
-;#	parameter	1:	rdi = userkey       (arrives in rcx, copied below)
-;#	parameter	2:	rsi = key_schedule  (arrives in rdx, copied below)
-
-; rdi and rsi are parked in rax and r11 and put back just before ret
-	mov rax,rdi
-	mov r11,rsi
-
-; move the incoming argument registers into the registers the AT&T-syntax version used
-	mov rdi,rcx	; rdi = userkey
-	mov rsi,rdx	; rsi = key_schedule
-
-	mov	dword ptr 240[rsi],10	; store 10 at offset 240 of the schedule (presumably the round count - confirm against the C struct)
-
-	movdqu	xmm1,[rdi]	; xmm1 = 16-byte user key (unaligned load)
-	movdqa	[rsi],xmm1	; first schedule entry is the user key; movdqa needs key_schedule 16-byte aligned
-
-
-ASSISTS:
-	aeskeygenassist	xmm2,xmm1,1	; each step: assist with the next constant, fold into xmm1, store at the next 16-byte slot
-	call	PREPARE_ROUNDKEY_128
-	movdqa	16[rsi],xmm1
-
-	aeskeygenassist	xmm2,xmm1,2
-	call	PREPARE_ROUNDKEY_128
-	movdqa	32[rsi],xmm1
-
-	aeskeygenassist	xmm2,xmm1,4
-	call	PREPARE_ROUNDKEY_128
-	movdqa	48[rsi],xmm1
-
-	aeskeygenassist	xmm2,xmm1,8
-	call	PREPARE_ROUNDKEY_128
-	movdqa	64[rsi],xmm1
-
-	aeskeygenassist	xmm2,xmm1,16
-	call	PREPARE_ROUNDKEY_128
-	movdqa	80[rsi],xmm1
-
-	aeskeygenassist	xmm2,xmm1,32
-	call	PREPARE_ROUNDKEY_128
-	movdqa	96[rsi],xmm1
-
-	aeskeygenassist	xmm2,xmm1,64
-	call	PREPARE_ROUNDKEY_128
-	movdqa	112[rsi],xmm1
-	aeskeygenassist	xmm2,xmm1,80h
-	call	PREPARE_ROUNDKEY_128
-	movdqa	128[rsi],xmm1
-	aeskeygenassist	xmm2,xmm1,1bh
-	call	PREPARE_ROUNDKEY_128
-	movdqa	144[rsi],xmm1
-	aeskeygenassist	xmm2,xmm1,36h
-	call	PREPARE_ROUNDKEY_128
-	movdqa	160[rsi],xmm1	; last entry written by this routine (offset 160)
-	; restore non-volatile rdi,rsi from the copies made on entry
-	mov rdi,rax
-	mov rsi,r11
-	ret
-
-PREPARE_ROUNDKEY_128:
-	pshufd	xmm2,xmm2,255	; broadcast dword 3 of the assist result to all four lanes
-	movdqa	xmm3,xmm1	; xmm3 = scratch copy of the previous round key
-	pslldq	xmm3,4	; three shift-by-4-bytes + xor steps: each dword of xmm1 becomes
-	pxor	xmm1,xmm3	; the xor of itself and all lower dwords
-	pslldq	xmm3,4
-	pxor	xmm1,xmm3
-	pslldq	xmm3,4
-	pxor	xmm1,xmm3
-	pxor	xmm1,xmm2	; mix in the broadcast assist word; xmm1 = next round key
-	ret
-AES_128_Key_Expansion ENDP
-
-;	/*
-;	void AES_192_Key_Expansion(const unsigned char* userkey,
-;	                           unsigned char* key)
-;	*/
-;	.	globl	AES_192_Key_Expansion
-AES_192_Key_Expansion PROC
-;#	parameter	1:	rdi = userkey  (arrives in rcx, copied below)
-;#	parameter	2:	rsi = key      (arrives in rdx, copied below)
-
-; rdi and rsi are parked in rax and r11 and put back just before ret
-	mov rax,rdi
-	mov r11,rsi
-
-; move the incoming argument registers into the registers the AT&T-syntax version used
-    mov rdi,rcx
-	mov rsi,rdx
-
-; on Microsoft x64 xmm6-xmm15 are non-volatile: save xmm6 on the stack, restore at the end
-	sub rsp,8+1*16  ; 8 = align stack, 1 slot of 16 bytes for xmm6
-	movdqa [rsp+0], xmm6
-
-	movdqu  xmm1,[rdi]	; xmm1 = user key bytes 0..15
-	movq	xmm3,qword ptr 16[rdi]	; xmm3 low qword = user key bytes 16..23, high qword zeroed
-	movdqa	[rsi],xmm1	; schedule offset 0 = first 16 key bytes; movdqa needs key 16-byte aligned
-	movdqa	xmm5,xmm3	; keep the current xmm3 for the packed store below
-
-	aeskeygenassist	xmm2,xmm3,1h
-	call	PREPARE_ROUNDKEY_192	; updates xmm1 and xmm3 in place
-	shufpd	xmm5,xmm1,0	; xmm5 = { old xmm3 low qword, new xmm1 low qword }
-	movdqa	16[rsi],xmm5
-	movdqa	xmm6,xmm1
-	shufpd	xmm6,xmm3,1	; xmm6 = { new xmm1 high qword, new xmm3 low qword }
-	movdqa	32[rsi],xmm6
-
-	aeskeygenassist	xmm2,xmm3,2h
-	call	PREPARE_ROUNDKEY_192
-	movdqa	48[rsi],xmm1	; this step's xmm1 is stored whole
-	movdqa	xmm5,xmm3	; keep xmm3 for the next packed store
-
-	aeskeygenassist	xmm2,xmm3,4h
-	call	PREPARE_ROUNDKEY_192
-	shufpd	xmm5,xmm1,0
-	movdqa	64[rsi],xmm5
-	movdqa	xmm6,xmm1
-	shufpd	xmm6,xmm3,1
-	movdqa	80[rsi],xmm6
-
-	aeskeygenassist	xmm2,xmm3,8h
-	call	PREPARE_ROUNDKEY_192
-	movdqa	96[rsi],xmm1
-	movdqa	xmm5,xmm3
-
-	aeskeygenassist	xmm2,xmm3,10h
-	call	PREPARE_ROUNDKEY_192
-	shufpd	xmm5,xmm1,0
-	movdqa	112[rsi],xmm5
-	movdqa	xmm6,xmm1
-	shufpd	xmm6,xmm3,1
-	movdqa	128[rsi],xmm6
-
-	aeskeygenassist	xmm2,xmm3,20h
-	call	PREPARE_ROUNDKEY_192
-	movdqa	144[rsi],xmm1
-	movdqa	xmm5,xmm3
-
-	aeskeygenassist	xmm2,xmm3,40h
-	call	PREPARE_ROUNDKEY_192
-	shufpd	xmm5,xmm1,0
-	movdqa	160[rsi],xmm5
-	movdqa	xmm6,xmm1
-	shufpd	xmm6,xmm3,1
-	movdqa	176[rsi],xmm6
-
-	aeskeygenassist	xmm2,xmm3,80h
-	call	PREPARE_ROUNDKEY_192
-	movdqa	192[rsi],xmm1
-	movdqa	208[rsi],xmm3	; full 16-byte store of xmm3 at offset 208
-	; restore non-volatile rdi,rsi from the copies made on entry
-	mov rdi,rax
-	mov rsi,r11
-; restore non-volatile xmm6 from the stack
-	movdqa xmm6, [rsp+0]
-	add rsp,8+1*16 ; release the save area: 8 = align stack, 1 slot of 16 bytes for xmm6
-	ret
-
-PREPARE_ROUNDKEY_192:
-	pshufd	xmm2,xmm2,55h	; broadcast dword 1 of the assist result to all four lanes
-	movdqu	xmm4,xmm1	; xmm4 = scratch copy of xmm1
-	pslldq	xmm4,4	; three shift-by-4-bytes + xor steps: each dword of xmm1 becomes
-	pxor	xmm1,xmm4	; the xor of itself and all lower dwords
-
-	pslldq	xmm4,4
-	pxor	xmm1,xmm4
-	pslldq	xmm4,4
-	pxor	xmm1,xmm4
-	pxor	xmm1,xmm2	; mix in the broadcast assist word
-	pshufd	xmm2,xmm1,0ffh	; broadcast dword 3 of the updated xmm1
-	movdqu	xmm4,xmm3	; xmm4 = scratch copy of xmm3
-	pslldq	xmm4,4
-	pxor	xmm3,xmm4	; xmm3 ^= xmm3 shifted left by one dword
-	pxor	xmm3,xmm2	; then xor with the broadcast word from xmm1
-	ret
-AES_192_Key_Expansion ENDP
-
-;	/*
-;	void AES_256_Key_Expansion(const unsigned char* userkey,
-;	                           unsigned char* key)
-;	*/
-;	.	globl	AES_256_Key_Expansion
-AES_256_Key_Expansion PROC
-;#	parameter	1:	rdi = userkey  (arrives in rcx, copied below)
-;#	parameter	2:	rsi = key      (arrives in rdx, copied below)
-
-; rdi and rsi are parked in rax and r11 and put back just before ret
-	mov rax,rdi
-	mov r11,rsi
-
-; move the incoming argument registers into the registers the AT&T-syntax version used
-    mov rdi,rcx
-	mov rsi,rdx
-
-	movdqu  xmm1,[rdi]	; xmm1 = user key bytes 0..15
-	movdqu	xmm3,16[rdi]	; xmm3 = user key bytes 16..31
-	movdqa	[rsi],xmm1	; the first two schedule entries are the user key itself;
-	movdqa	16[rsi],xmm3	; movdqa needs key 16-byte aligned
-
-	aeskeygenassist	xmm2,xmm3,1h	; "a" step: assist on xmm3 with a non-zero constant, updates xmm1
-	call	MAKE_RK256_a
-	movdqa	32[rsi],xmm1
-	aeskeygenassist	xmm2,xmm1,0h	; "b" step: assist on xmm1 with constant 0, updates xmm3
-	call	MAKE_RK256_b
-	movdqa	48[rsi],xmm3
-	aeskeygenassist	xmm2,xmm3,2h
-	call	MAKE_RK256_a
-	movdqa	64[rsi],xmm1
-	aeskeygenassist	xmm2,xmm1,0h
-	call	MAKE_RK256_b
-	movdqa	80[rsi],xmm3
-	aeskeygenassist	xmm2,xmm3,4h
-	call	MAKE_RK256_a
-	movdqa	96[rsi],xmm1
-	aeskeygenassist	xmm2,xmm1,0h
-	call	MAKE_RK256_b
-	movdqa	112[rsi],xmm3
-	aeskeygenassist	xmm2,xmm3,8h
-	call	MAKE_RK256_a
-	movdqa	128[rsi],xmm1
-	aeskeygenassist	xmm2,xmm1,0h
-	call	MAKE_RK256_b
-	movdqa	144[rsi],xmm3
-	aeskeygenassist	xmm2,xmm3,10h
-	call	MAKE_RK256_a
-	movdqa	160[rsi],xmm1
-	aeskeygenassist	xmm2,xmm1,0h
-	call	MAKE_RK256_b
-	movdqa	176[rsi],xmm3
-	aeskeygenassist	xmm2,xmm3,20h
-	call	MAKE_RK256_a
-	movdqa	192[rsi],xmm1
-
-	aeskeygenassist	xmm2,xmm1,0h
-	call	MAKE_RK256_b
-	movdqa	208[rsi],xmm3
-	aeskeygenassist	xmm2,xmm3,40h
-	call	MAKE_RK256_a
-	movdqa	224[rsi],xmm1	; last entry written by this routine (offset 224)
-
-	; restore non-volatile rdi,rsi from the copies made on entry
-	mov rdi,rax
-	mov rsi,r11
-	ret
-AES_256_Key_Expansion ENDP
-
-MAKE_RK256_a:	; in: xmm1 = key half to update, xmm2 = aeskeygenassist result; out: xmm1 updated; scratch: xmm2, xmm4
-	pshufd	xmm2,xmm2,0ffh	; broadcast dword 3 of the assist result to all four lanes
-	movdqa	xmm4,xmm1	; xmm4 = scratch copy of xmm1
-	pslldq	xmm4,4	; three shift-by-4-bytes + xor steps: each dword of xmm1 becomes
-	pxor	xmm1,xmm4	; the xor of itself and all lower dwords
-	pslldq	xmm4,4
-	pxor	xmm1,xmm4
-	pslldq	xmm4,4
-	pxor	xmm1,xmm4
-	pxor	xmm1,xmm2	; mix in the broadcast assist word
-	ret
-
-MAKE_RK256_b:	; in: xmm3 = key half to update, xmm2 = aeskeygenassist result; out: xmm3 updated; scratch: xmm2, xmm4
-	pshufd	xmm2,xmm2,0aah	; broadcast dword 2 of the assist result to all four lanes
-	movdqa	xmm4,xmm3	; xmm4 = scratch copy of xmm3
-	pslldq	xmm4,4	; three shift-by-4-bytes + xor steps: each dword of xmm3 becomes
-	pxor	xmm3,xmm4	; the xor of itself and all lower dwords
-	pslldq	xmm4,4
-	pxor	xmm3,xmm4
-	pslldq	xmm4,4
-	pxor	xmm3,xmm4
-	pxor	xmm3,xmm2	; mix in the broadcast assist word
-	ret
-
-
-IF fips_version GE 2	; must mirror the IF that opened the code segment at the top of the file
-  fipsAh ENDS	; close the FIPS code segment
-ELSE
-  _text ENDS	; close the default code segment
-ENDIF
-
-END	; end of the assembly source

+ 0 - 15791
lib/wolfssl/wolfcrypt/src/aes_gcm_asm.asm

@@ -1,15791 +0,0 @@
-; /* aes_gcm_asm.asm */
-; /*
-;  * Copyright (C) 2006-2023 wolfSSL Inc.
-;  *
-;  * This file is part of wolfSSL.
-;  *
-;  * wolfSSL is free software; you can redistribute it and/or modify
-;  * it under the terms of the GNU General Public License as published by
-;  * the Free Software Foundation; either version 2 of the License, or
-;  * (at your option) any later version.
-;  *
-;  * wolfSSL is distributed in the hope that it will be useful,
-;  * but WITHOUT ANY WARRANTY; without even the implied warranty of
-;  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;  * GNU General Public License for more details.
-;  *
-;  * You should have received a copy of the GNU General Public License
-;  * along with this program; if not, write to the Free Software
-;  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
-;  */
-IF @Version LT 1200	; assembler version below 1200: switch off the newer instruction sets
-; AVX2 instructions are not recognized by old versions of MASM
-IFNDEF NO_AVX2_SUPPORT
-NO_AVX2_SUPPORT = 1
-ENDIF
-; The MOVBE instruction is not recognized by old versions of MASM
-IFNDEF NO_MOVBE_SUPPORT
-NO_MOVBE_SUPPORT = 1
-ENDIF
-ENDIF
-
-IFNDEF HAVE_INTEL_AVX1	; AVX1 is enabled unless the symbol was already defined
-HAVE_INTEL_AVX1 = 1
-ENDIF
-IFNDEF NO_AVX2_SUPPORT	; AVX2 is enabled only when it was not switched off above or by the build
-HAVE_INTEL_AVX2 = 1
-ENDIF
-
-IFNDEF _WIN64	; define _WIN64 when the build did not supply it
-_WIN64 = 1
-ENDIF
-
-_DATA SEGMENT	; 16-byte constants for the AES-GCM routines; each is followed by a qword holding its address
-ALIGN 16
-L_aes_gcm_one QWORD 0, 1	; low qword 0, high qword 1; added with paddd to step the counter in AES_GCM_encrypt
-ptr_L_aes_gcm_one QWORD L_aes_gcm_one	; address of the constant above (users are outside this view)
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_aes_gcm_two QWORD 0, 2	; same layout as L_aes_gcm_one with the value 2
-ptr_L_aes_gcm_two QWORD L_aes_gcm_two
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_aes_gcm_three QWORD 0, 3	; same layout with the value 3
-ptr_L_aes_gcm_three QWORD L_aes_gcm_three
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_aes_gcm_four QWORD 0, 4	; same layout with the value 4
-ptr_L_aes_gcm_four QWORD L_aes_gcm_four
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_aes_gcm_five QWORD 0, 5	; same layout with the value 5
-ptr_L_aes_gcm_five QWORD L_aes_gcm_five
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_aes_gcm_six QWORD 0, 6	; same layout with the value 6
-ptr_L_aes_gcm_six QWORD L_aes_gcm_six
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_aes_gcm_seven QWORD 0, 7	; same layout with the value 7
-ptr_L_aes_gcm_seven QWORD L_aes_gcm_seven
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_aes_gcm_eight QWORD 0, 8	; same layout with the value 8
-ptr_L_aes_gcm_eight QWORD L_aes_gcm_eight
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_aes_gcm_bswap_epi64 QWORD 283686952306183, 579005069656919567	; 0001020304050607h, 08090a0b0c0d0e0fh: pshufb mask reversing the bytes inside each 64-bit half
-ptr_L_aes_gcm_bswap_epi64 QWORD L_aes_gcm_bswap_epi64
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_aes_gcm_bswap_mask QWORD 579005069656919567, 283686952306183	; 08090a0b0c0d0e0fh, 0001020304050607h: pshufb mask reversing all 16 bytes
-ptr_L_aes_gcm_bswap_mask QWORD L_aes_gcm_bswap_mask
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_aes_gcm_mod2_128 QWORD 1, 13979173243358019584	; 1, 0c200000000000000h: and-ed into H with pand in AES_GCM_encrypt
-ptr_L_aes_gcm_mod2_128 QWORD L_aes_gcm_mod2_128
-_DATA ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_encrypt PROC
-        push	r13
-        push	rdi
-        push	rsi
-        push	r12
-        push	rbx
-        push	r14
-        push	r15
-        mov	rdi, rcx
-        mov	rsi, rdx
-        mov	r12, r8
-        mov	rax, r9
-        mov	r8, QWORD PTR [rsp+96]
-        mov	r9d, DWORD PTR [rsp+104]
-        mov	r11d, DWORD PTR [rsp+112]
-        mov	ebx, DWORD PTR [rsp+120]
-        mov	r14d, DWORD PTR [rsp+128]
-        mov	r15, QWORD PTR [rsp+136]
-        mov	r10d, DWORD PTR [rsp+144]
-        sub	rsp, 320
-        movdqu	[rsp+160], xmm6
-        movdqu	[rsp+176], xmm7
-        movdqu	[rsp+192], xmm8
-        movdqu	[rsp+208], xmm9
-        movdqu	[rsp+224], xmm10
-        movdqu	[rsp+240], xmm11
-        movdqu	[rsp+256], xmm12
-        movdqu	[rsp+272], xmm13
-        movdqu	[rsp+288], xmm14
-        movdqu	[rsp+304], xmm15
-        pxor	xmm4, xmm4
-        pxor	xmm6, xmm6
-        cmp	ebx, 12
-        mov	edx, ebx
-        jne	L_AES_GCM_encrypt_iv_not_12
-        ; # Calculate values when IV is 12 bytes
-        ; Set counter based on IV
-        mov	ecx, 16777216
-        pinsrq	xmm4, QWORD PTR [rax], 0
-        pinsrd	xmm4, DWORD PTR [rax+8], 2
-        pinsrd	xmm4, ecx, 3
-        ; H = Encrypt X(=0) and T = Encrypt counter
-        movdqa	xmm1, xmm4
-        movdqa	xmm5, OWORD PTR [r15]
-        pxor	xmm1, xmm5
-        movdqa	xmm7, OWORD PTR [r15+16]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+32]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+48]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+64]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+80]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+96]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+112]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+128]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+144]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        cmp	r10d, 11
-        movdqa	xmm7, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_calc_iv_12_last
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+176]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        cmp	r10d, 13
-        movdqa	xmm7, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_calc_iv_12_last
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+208]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_calc_iv_12_last:
-        aesenclast	xmm5, xmm7
-        aesenclast	xmm1, xmm7
-        pshufb	xmm5, OWORD PTR L_aes_gcm_bswap_mask
-        movdqu	[rsp+144], xmm1
-        jmp	L_AES_GCM_encrypt_iv_done
-L_AES_GCM_encrypt_iv_not_12:
-        ; Calculate values when IV is not 12 bytes
-        ; H = Encrypt X(=0)
-        movdqa	xmm5, OWORD PTR [r15]
-        aesenc	xmm5, [r15+16]
-        aesenc	xmm5, [r15+32]
-        aesenc	xmm5, [r15+48]
-        aesenc	xmm5, [r15+64]
-        aesenc	xmm5, [r15+80]
-        aesenc	xmm5, [r15+96]
-        aesenc	xmm5, [r15+112]
-        aesenc	xmm5, [r15+128]
-        aesenc	xmm5, [r15+144]
-        cmp	r10d, 11
-        movdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_calc_iv_1_aesenc_avx_last
-        aesenc	xmm5, xmm9
-        aesenc	xmm5, [r15+176]
-        cmp	r10d, 13
-        movdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_calc_iv_1_aesenc_avx_last
-        aesenc	xmm5, xmm9
-        aesenc	xmm5, [r15+208]
-        movdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_calc_iv_1_aesenc_avx_last:
-        aesenclast	xmm5, xmm9
-        pshufb	xmm5, OWORD PTR L_aes_gcm_bswap_mask
-        ; Calc counter
-        ; Initialization vector
-        cmp	edx, 0
-        mov	rcx, 0
-        je	L_AES_GCM_encrypt_calc_iv_done
-        cmp	edx, 16
-        jl	L_AES_GCM_encrypt_calc_iv_lt16
-        and	edx, 4294967280
-L_AES_GCM_encrypt_calc_iv_16_loop:
-        movdqu	xmm8, [rax+rcx]
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm4, xmm8
-        pshufd	xmm1, xmm4, 78
-        pshufd	xmm2, xmm5, 78
-        movdqa	xmm3, xmm5
-        movdqa	xmm0, xmm5
-        pclmulqdq	xmm3, xmm4, 17
-        pclmulqdq	xmm0, xmm4, 0
-        pxor	xmm1, xmm4
-        pxor	xmm2, xmm5
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm7, xmm0
-        movdqa	xmm4, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm7, xmm2
-        pxor	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm4
-        psrld	xmm0, 31
-        psrld	xmm1, 31
-        pslld	xmm7, 1
-        pslld	xmm4, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12
-        pslldq	xmm1, 4
-        por	xmm4, xmm2
-        por	xmm7, xmm0
-        por	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm7
-        movdqa	xmm2, xmm7
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm7, xmm0
-        movdqa	xmm2, xmm7
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm7
-        pxor	xmm4, xmm2
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_encrypt_calc_iv_16_loop
-        mov	edx, ebx
-        cmp	ecx, edx
-        je	L_AES_GCM_encrypt_calc_iv_done
-L_AES_GCM_encrypt_calc_iv_lt16:
-        sub	rsp, 16
-        pxor	xmm8, xmm8
-        xor	ebx, ebx
-        movdqu	[rsp], xmm8
-L_AES_GCM_encrypt_calc_iv_loop:
-        movzx	r13d, BYTE PTR [rax+rcx]
-        mov	BYTE PTR [rsp+rbx], r13b
-        inc	ecx
-        inc	ebx
-        cmp	ecx, edx
-        jl	L_AES_GCM_encrypt_calc_iv_loop
-        movdqu	xmm8, [rsp]
-        add	rsp, 16
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm4, xmm8
-        pshufd	xmm1, xmm4, 78
-        pshufd	xmm2, xmm5, 78
-        movdqa	xmm3, xmm5
-        movdqa	xmm0, xmm5
-        pclmulqdq	xmm3, xmm4, 17
-        pclmulqdq	xmm0, xmm4, 0
-        pxor	xmm1, xmm4
-        pxor	xmm2, xmm5
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm7, xmm0
-        movdqa	xmm4, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm7, xmm2
-        pxor	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm4
-        psrld	xmm0, 31
-        psrld	xmm1, 31
-        pslld	xmm7, 1
-        pslld	xmm4, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12
-        pslldq	xmm1, 4
-        por	xmm4, xmm2
-        por	xmm7, xmm0
-        por	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm7
-        movdqa	xmm2, xmm7
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm7, xmm0
-        movdqa	xmm2, xmm7
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm7
-        pxor	xmm4, xmm2
-L_AES_GCM_encrypt_calc_iv_done:
-        ; T = Encrypt counter
-        pxor	xmm0, xmm0
-        shl	edx, 3
-        pinsrq	xmm0, rdx, 0
-        pxor	xmm4, xmm0
-        pshufd	xmm1, xmm4, 78
-        pshufd	xmm2, xmm5, 78
-        movdqa	xmm3, xmm5
-        movdqa	xmm0, xmm5
-        pclmulqdq	xmm3, xmm4, 17
-        pclmulqdq	xmm0, xmm4, 0
-        pxor	xmm1, xmm4
-        pxor	xmm2, xmm5
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm7, xmm0
-        movdqa	xmm4, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm7, xmm2
-        pxor	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm4
-        psrld	xmm0, 31
-        psrld	xmm1, 31
-        pslld	xmm7, 1
-        pslld	xmm4, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12
-        pslldq	xmm1, 4
-        por	xmm4, xmm2
-        por	xmm7, xmm0
-        por	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm7
-        movdqa	xmm2, xmm7
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm7, xmm0
-        movdqa	xmm2, xmm7
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm7
-        pxor	xmm4, xmm2
-        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_mask
-        ;   Encrypt counter
-        movdqa	xmm8, OWORD PTR [r15]
-        pxor	xmm8, xmm4
-        aesenc	xmm8, [r15+16]
-        aesenc	xmm8, [r15+32]
-        aesenc	xmm8, [r15+48]
-        aesenc	xmm8, [r15+64]
-        aesenc	xmm8, [r15+80]
-        aesenc	xmm8, [r15+96]
-        aesenc	xmm8, [r15+112]
-        aesenc	xmm8, [r15+128]
-        aesenc	xmm8, [r15+144]
-        cmp	r10d, 11
-        movdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_calc_iv_2_aesenc_avx_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [r15+176]
-        cmp	r10d, 13
-        movdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_calc_iv_2_aesenc_avx_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [r15+208]
-        movdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_calc_iv_2_aesenc_avx_last:
-        aesenclast	xmm8, xmm9
-        movdqu	[rsp+144], xmm8
-L_AES_GCM_encrypt_iv_done:
-        ; Additional authentication data
-        mov	edx, r11d
-        cmp	edx, 0
-        je	L_AES_GCM_encrypt_calc_aad_done
-        xor	ecx, ecx
-        cmp	edx, 16
-        jl	L_AES_GCM_encrypt_calc_aad_lt16
-        and	edx, 4294967280
-L_AES_GCM_encrypt_calc_aad_16_loop:
-        movdqu	xmm8, [r12+rcx]
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm6, xmm8
-        pshufd	xmm1, xmm6, 78
-        pshufd	xmm2, xmm5, 78
-        movdqa	xmm3, xmm5
-        movdqa	xmm0, xmm5
-        pclmulqdq	xmm3, xmm6, 17
-        pclmulqdq	xmm0, xmm6, 0
-        pxor	xmm1, xmm6
-        pxor	xmm2, xmm5
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm7, xmm0
-        movdqa	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm7, xmm2
-        pxor	xmm6, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm6
-        psrld	xmm0, 31
-        psrld	xmm1, 31
-        pslld	xmm7, 1
-        pslld	xmm6, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12
-        pslldq	xmm1, 4
-        por	xmm6, xmm2
-        por	xmm7, xmm0
-        por	xmm6, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm7
-        movdqa	xmm2, xmm7
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm7, xmm0
-        movdqa	xmm2, xmm7
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm7
-        pxor	xmm6, xmm2
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_encrypt_calc_aad_16_loop
-        mov	edx, r11d
-        cmp	ecx, edx
-        je	L_AES_GCM_encrypt_calc_aad_done
-L_AES_GCM_encrypt_calc_aad_lt16:
-        sub	rsp, 16
-        pxor	xmm8, xmm8
-        xor	ebx, ebx
-        movdqu	[rsp], xmm8
-L_AES_GCM_encrypt_calc_aad_loop:
-        movzx	r13d, BYTE PTR [r12+rcx]
-        mov	BYTE PTR [rsp+rbx], r13b
-        inc	ecx
-        inc	ebx
-        cmp	ecx, edx
-        jl	L_AES_GCM_encrypt_calc_aad_loop
-        movdqu	xmm8, [rsp]
-        add	rsp, 16
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm6, xmm8
-        pshufd	xmm1, xmm6, 78
-        pshufd	xmm2, xmm5, 78
-        movdqa	xmm3, xmm5
-        movdqa	xmm0, xmm5
-        pclmulqdq	xmm3, xmm6, 17
-        pclmulqdq	xmm0, xmm6, 0
-        pxor	xmm1, xmm6
-        pxor	xmm2, xmm5
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm7, xmm0
-        movdqa	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm7, xmm2
-        pxor	xmm6, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm6
-        psrld	xmm0, 31
-        psrld	xmm1, 31
-        pslld	xmm7, 1
-        pslld	xmm6, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12
-        pslldq	xmm1, 4
-        por	xmm6, xmm2
-        por	xmm7, xmm0
-        por	xmm6, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm7
-        movdqa	xmm2, xmm7
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm7, xmm0
-        movdqa	xmm2, xmm7
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm7
-        pxor	xmm6, xmm2
-L_AES_GCM_encrypt_calc_aad_done:
-        ; Calculate counter and H
-        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_epi64
-        movdqa	xmm9, xmm5
-        paddd	xmm4, OWORD PTR L_aes_gcm_one
-        movdqa	xmm8, xmm5
-        movdqu	[rsp+128], xmm4
-        psrlq	xmm9, 63
-        psllq	xmm8, 1
-        pslldq	xmm9, 8
-        por	xmm8, xmm9
-        pshufd	xmm5, xmm5, 255
-        psrad	xmm5, 31
-        pand	xmm5, OWORD PTR L_aes_gcm_mod2_128
-        pxor	xmm5, xmm8
-        xor	rbx, rbx
-        cmp	r9d, 128
-        mov	r13d, r9d
-        jl	L_AES_GCM_encrypt_done_128
-        and	r13d, 4294967168
-        movdqa	xmm2, xmm6
-        ; H ^ 1
-        movdqu	[rsp], xmm5
-        ; H ^ 2
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm5, 78
-        movdqa	xmm11, xmm5
-        movdqa	xmm8, xmm5
-        pclmulqdq	xmm11, xmm5, 17
-        pclmulqdq	xmm8, xmm5, 0
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm5
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm0, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm0, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm0, xmm14
-        movdqu	[rsp+16], xmm0
-        ; H ^ 3
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm0, 78
-        movdqa	xmm11, xmm0
-        movdqa	xmm8, xmm0
-        pclmulqdq	xmm11, xmm5, 17
-        pclmulqdq	xmm8, xmm5, 0
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm0
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm1, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm1, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm1, xmm14
-        movdqu	[rsp+32], xmm1
-        ; H ^ 4
-        pshufd	xmm9, xmm0, 78
-        pshufd	xmm10, xmm0, 78
-        movdqa	xmm11, xmm0
-        movdqa	xmm8, xmm0
-        pclmulqdq	xmm11, xmm0, 17
-        pclmulqdq	xmm8, xmm0, 0
-        pxor	xmm9, xmm0
-        pxor	xmm10, xmm0
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm3, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm3, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm3, xmm14
-        movdqu	[rsp+48], xmm3
-        ; H ^ 5
-        pshufd	xmm9, xmm0, 78
-        pshufd	xmm10, xmm1, 78
-        movdqa	xmm11, xmm1
-        movdqa	xmm8, xmm1
-        pclmulqdq	xmm11, xmm0, 17
-        pclmulqdq	xmm8, xmm0, 0
-        pxor	xmm9, xmm0
-        pxor	xmm10, xmm1
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+64], xmm7
-        ; H ^ 6
-        pshufd	xmm9, xmm1, 78
-        pshufd	xmm10, xmm1, 78
-        movdqa	xmm11, xmm1
-        movdqa	xmm8, xmm1
-        pclmulqdq	xmm11, xmm1, 17
-        pclmulqdq	xmm8, xmm1, 0
-        pxor	xmm9, xmm1
-        pxor	xmm10, xmm1
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+80], xmm7
-        ; H ^ 7
-        pshufd	xmm9, xmm1, 78
-        pshufd	xmm10, xmm3, 78
-        movdqa	xmm11, xmm3
-        movdqa	xmm8, xmm3
-        pclmulqdq	xmm11, xmm1, 17
-        pclmulqdq	xmm8, xmm1, 0
-        pxor	xmm9, xmm1
-        pxor	xmm10, xmm3
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+96], xmm7
-        ; H ^ 8
-        pshufd	xmm9, xmm3, 78
-        pshufd	xmm10, xmm3, 78
-        movdqa	xmm11, xmm3
-        movdqa	xmm8, xmm3
-        pclmulqdq	xmm11, xmm3, 17
-        pclmulqdq	xmm8, xmm3, 0
-        pxor	xmm9, xmm3
-        pxor	xmm10, xmm3
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+112], xmm7
-        ; First 128 bytes of input
-        movdqu	xmm8, [rsp+128]
-        movdqa	xmm1, OWORD PTR L_aes_gcm_bswap_epi64
-        movdqa	xmm0, xmm8
-        pshufb	xmm8, xmm1
-        movdqa	xmm9, xmm0
-        paddd	xmm9, OWORD PTR L_aes_gcm_one
-        pshufb	xmm9, xmm1
-        movdqa	xmm10, xmm0
-        paddd	xmm10, OWORD PTR L_aes_gcm_two
-        pshufb	xmm10, xmm1
-        movdqa	xmm11, xmm0
-        paddd	xmm11, OWORD PTR L_aes_gcm_three
-        pshufb	xmm11, xmm1
-        movdqa	xmm12, xmm0
-        paddd	xmm12, OWORD PTR L_aes_gcm_four
-        pshufb	xmm12, xmm1
-        movdqa	xmm13, xmm0
-        paddd	xmm13, OWORD PTR L_aes_gcm_five
-        pshufb	xmm13, xmm1
-        movdqa	xmm14, xmm0
-        paddd	xmm14, OWORD PTR L_aes_gcm_six
-        pshufb	xmm14, xmm1
-        movdqa	xmm15, xmm0
-        paddd	xmm15, OWORD PTR L_aes_gcm_seven
-        pshufb	xmm15, xmm1
-        paddd	xmm0, OWORD PTR L_aes_gcm_eight
-        movdqa	xmm7, OWORD PTR [r15]
-        movdqu	[rsp+128], xmm0
-        pxor	xmm8, xmm7
-        pxor	xmm9, xmm7
-        pxor	xmm10, xmm7
-        pxor	xmm11, xmm7
-        pxor	xmm12, xmm7
-        pxor	xmm13, xmm7
-        pxor	xmm14, xmm7
-        pxor	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+16]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+32]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+48]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+64]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+80]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+96]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+112]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+128]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+144]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        cmp	r10d, 11
-        movdqa	xmm7, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_enc_done
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+176]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        cmp	r10d, 13
-        movdqa	xmm7, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_enc_done
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+208]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_enc_done:
-        aesenclast	xmm8, xmm7
-        aesenclast	xmm9, xmm7
-        movdqu	xmm0, [rdi]
-        movdqu	xmm1, [rdi+16]
-        pxor	xmm8, xmm0
-        pxor	xmm9, xmm1
-        movdqu	[rsi], xmm8
-        movdqu	[rsi+16], xmm9
-        aesenclast	xmm10, xmm7
-        aesenclast	xmm11, xmm7
-        movdqu	xmm0, [rdi+32]
-        movdqu	xmm1, [rdi+48]
-        pxor	xmm10, xmm0
-        pxor	xmm11, xmm1
-        movdqu	[rsi+32], xmm10
-        movdqu	[rsi+48], xmm11
-        aesenclast	xmm12, xmm7
-        aesenclast	xmm13, xmm7
-        movdqu	xmm0, [rdi+64]
-        movdqu	xmm1, [rdi+80]
-        pxor	xmm12, xmm0
-        pxor	xmm13, xmm1
-        movdqu	[rsi+64], xmm12
-        movdqu	[rsi+80], xmm13
-        aesenclast	xmm14, xmm7
-        aesenclast	xmm15, xmm7
-        movdqu	xmm0, [rdi+96]
-        movdqu	xmm1, [rdi+112]
-        pxor	xmm14, xmm0
-        pxor	xmm15, xmm1
-        movdqu	[rsi+96], xmm14
-        movdqu	[rsi+112], xmm15
-        cmp	r13d, 128
-        mov	ebx, 128
-        jle	L_AES_GCM_encrypt_end_128
-        ; Next 128 bytes of input (loop: 128 bytes per iteration)
-L_AES_GCM_encrypt_ghash_128:
-        lea	rcx, QWORD PTR [rdi+rbx]
-        lea	rdx, QWORD PTR [rsi+rbx]
-        movdqu	xmm8, [rsp+128]
-        movdqa	xmm1, OWORD PTR L_aes_gcm_bswap_epi64
-        movdqa	xmm0, xmm8
-        pshufb	xmm8, xmm1
-        movdqa	xmm9, xmm0
-        paddd	xmm9, OWORD PTR L_aes_gcm_one
-        pshufb	xmm9, xmm1
-        movdqa	xmm10, xmm0
-        paddd	xmm10, OWORD PTR L_aes_gcm_two
-        pshufb	xmm10, xmm1
-        movdqa	xmm11, xmm0
-        paddd	xmm11, OWORD PTR L_aes_gcm_three
-        pshufb	xmm11, xmm1
-        movdqa	xmm12, xmm0
-        paddd	xmm12, OWORD PTR L_aes_gcm_four
-        pshufb	xmm12, xmm1
-        movdqa	xmm13, xmm0
-        paddd	xmm13, OWORD PTR L_aes_gcm_five
-        pshufb	xmm13, xmm1
-        movdqa	xmm14, xmm0
-        paddd	xmm14, OWORD PTR L_aes_gcm_six
-        pshufb	xmm14, xmm1
-        movdqa	xmm15, xmm0
-        paddd	xmm15, OWORD PTR L_aes_gcm_seven
-        pshufb	xmm15, xmm1
-        paddd	xmm0, OWORD PTR L_aes_gcm_eight
-        movdqa	xmm7, OWORD PTR [r15]
-        movdqu	[rsp+128], xmm0
-        pxor	xmm8, xmm7
-        pxor	xmm9, xmm7
-        pxor	xmm10, xmm7
-        pxor	xmm11, xmm7
-        pxor	xmm12, xmm7
-        pxor	xmm13, xmm7
-        pxor	xmm14, xmm7
-        pxor	xmm15, xmm7
-        movdqu	xmm7, [rsp+112]
-        movdqu	xmm0, [rdx+-128]
-        aesenc	xmm8, [r15+16]
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm0, xmm2
-        pshufd	xmm1, xmm7, 78
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm1, xmm7
-        pxor	xmm5, xmm0
-        movdqa	xmm3, xmm0
-        pclmulqdq	xmm3, xmm7, 17
-        aesenc	xmm9, [r15+16]
-        aesenc	xmm10, [r15+16]
-        movdqa	xmm2, xmm0
-        pclmulqdq	xmm2, xmm7, 0
-        aesenc	xmm11, [r15+16]
-        aesenc	xmm12, [r15+16]
-        pclmulqdq	xmm1, xmm5, 0
-        aesenc	xmm13, [r15+16]
-        aesenc	xmm14, [r15+16]
-        aesenc	xmm15, [r15+16]
-        pxor	xmm1, xmm2
-        pxor	xmm1, xmm3
-        movdqu	xmm7, [rsp+96]
-        movdqu	xmm0, [rdx+-112]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [r15+32]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [r15+32]
-        aesenc	xmm10, [r15+32]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [r15+32]
-        aesenc	xmm12, [r15+32]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [r15+32]
-        aesenc	xmm14, [r15+32]
-        aesenc	xmm15, [r15+32]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+80]
-        movdqu	xmm0, [rdx+-96]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [r15+48]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [r15+48]
-        aesenc	xmm10, [r15+48]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [r15+48]
-        aesenc	xmm12, [r15+48]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [r15+48]
-        aesenc	xmm14, [r15+48]
-        aesenc	xmm15, [r15+48]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+64]
-        movdqu	xmm0, [rdx+-80]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [r15+64]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [r15+64]
-        aesenc	xmm10, [r15+64]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [r15+64]
-        aesenc	xmm12, [r15+64]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [r15+64]
-        aesenc	xmm14, [r15+64]
-        aesenc	xmm15, [r15+64]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+48]
-        movdqu	xmm0, [rdx+-64]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [r15+80]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [r15+80]
-        aesenc	xmm10, [r15+80]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [r15+80]
-        aesenc	xmm12, [r15+80]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [r15+80]
-        aesenc	xmm14, [r15+80]
-        aesenc	xmm15, [r15+80]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+32]
-        movdqu	xmm0, [rdx+-48]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [r15+96]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [r15+96]
-        aesenc	xmm10, [r15+96]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [r15+96]
-        aesenc	xmm12, [r15+96]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [r15+96]
-        aesenc	xmm14, [r15+96]
-        aesenc	xmm15, [r15+96]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+16]
-        movdqu	xmm0, [rdx+-32]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [r15+112]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [r15+112]
-        aesenc	xmm10, [r15+112]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [r15+112]
-        aesenc	xmm12, [r15+112]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [r15+112]
-        aesenc	xmm14, [r15+112]
-        aesenc	xmm15, [r15+112]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp]
-        movdqu	xmm0, [rdx+-16]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [r15+128]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [r15+128]
-        aesenc	xmm10, [r15+128]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [r15+128]
-        aesenc	xmm12, [r15+128]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [r15+128]
-        aesenc	xmm14, [r15+128]
-        aesenc	xmm15, [r15+128]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqa	xmm5, xmm1
-        psrldq	xmm1, 8
-        pslldq	xmm5, 8
-        aesenc	xmm8, [r15+144]
-        pxor	xmm2, xmm5
-        pxor	xmm3, xmm1
-        movdqa	xmm7, xmm2
-        movdqa	xmm4, xmm2
-        movdqa	xmm5, xmm2
-        aesenc	xmm9, [r15+144]
-        pslld	xmm7, 31
-        pslld	xmm4, 30
-        pslld	xmm5, 25
-        aesenc	xmm10, [r15+144]
-        pxor	xmm7, xmm4
-        pxor	xmm7, xmm5
-        aesenc	xmm11, [r15+144]
-        movdqa	xmm4, xmm7
-        pslldq	xmm7, 12
-        psrldq	xmm4, 4
-        aesenc	xmm12, [r15+144]
-        pxor	xmm2, xmm7
-        movdqa	xmm5, xmm2
-        movdqa	xmm1, xmm2
-        movdqa	xmm0, xmm2
-        aesenc	xmm13, [r15+144]
-        psrld	xmm5, 1
-        psrld	xmm1, 2
-        psrld	xmm0, 7
-        aesenc	xmm14, [r15+144]
-        pxor	xmm5, xmm1
-        pxor	xmm5, xmm0
-        aesenc	xmm15, [r15+144]
-        pxor	xmm5, xmm4
-        pxor	xmm2, xmm5
-        pxor	xmm2, xmm3
-        cmp	r10d, 11
-        movdqa	xmm7, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_aesenc_128_ghash_avx_done
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+176]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        cmp	r10d, 13
-        movdqa	xmm7, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_aesenc_128_ghash_avx_done
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+208]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_aesenc_128_ghash_avx_done:
-        aesenclast	xmm8, xmm7
-        aesenclast	xmm9, xmm7
-        movdqu	xmm0, [rcx]
-        movdqu	xmm1, [rcx+16]
-        pxor	xmm8, xmm0
-        pxor	xmm9, xmm1
-        movdqu	[rdx], xmm8
-        movdqu	[rdx+16], xmm9
-        aesenclast	xmm10, xmm7
-        aesenclast	xmm11, xmm7
-        movdqu	xmm0, [rcx+32]
-        movdqu	xmm1, [rcx+48]
-        pxor	xmm10, xmm0
-        pxor	xmm11, xmm1
-        movdqu	[rdx+32], xmm10
-        movdqu	[rdx+48], xmm11
-        aesenclast	xmm12, xmm7
-        aesenclast	xmm13, xmm7
-        movdqu	xmm0, [rcx+64]
-        movdqu	xmm1, [rcx+80]
-        pxor	xmm12, xmm0
-        pxor	xmm13, xmm1
-        movdqu	[rdx+64], xmm12
-        movdqu	[rdx+80], xmm13
-        aesenclast	xmm14, xmm7
-        aesenclast	xmm15, xmm7
-        movdqu	xmm0, [rcx+96]
-        movdqu	xmm1, [rcx+112]
-        pxor	xmm14, xmm0
-        pxor	xmm15, xmm1
-        movdqu	[rdx+96], xmm14
-        movdqu	[rdx+112], xmm15
-        add	ebx, 128
-        cmp	ebx, r13d
-        jl	L_AES_GCM_encrypt_ghash_128
-L_AES_GCM_encrypt_end_128:
-        movdqa	xmm4, OWORD PTR L_aes_gcm_bswap_mask
-        pshufb	xmm8, xmm4
-        pshufb	xmm9, xmm4
-        pshufb	xmm10, xmm4
-        pshufb	xmm11, xmm4
-        pxor	xmm8, xmm2
-        pshufb	xmm12, xmm4
-        pshufb	xmm13, xmm4
-        pshufb	xmm14, xmm4
-        pshufb	xmm15, xmm4
-        movdqu	xmm7, [rsp+112]
-        pshufd	xmm1, xmm8, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm8, 17
-        pclmulqdq	xmm0, xmm8, 0
-        pxor	xmm1, xmm8
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm4, xmm0
-        movdqa	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqu	xmm7, [rsp+96]
-        pshufd	xmm1, xmm9, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm9, 17
-        pclmulqdq	xmm0, xmm9, 0
-        pxor	xmm1, xmm9
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        pxor	xmm4, xmm0
-        pxor	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqu	xmm7, [rsp+80]
-        pshufd	xmm1, xmm10, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm10, 17
-        pclmulqdq	xmm0, xmm10, 0
-        pxor	xmm1, xmm10
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        pxor	xmm4, xmm0
-        pxor	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqu	xmm7, [rsp+64]
-        pshufd	xmm1, xmm11, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm11, 17
-        pclmulqdq	xmm0, xmm11, 0
-        pxor	xmm1, xmm11
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        pxor	xmm4, xmm0
-        pxor	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqu	xmm7, [rsp+48]
-        pshufd	xmm1, xmm12, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm12, 17
-        pclmulqdq	xmm0, xmm12, 0
-        pxor	xmm1, xmm12
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        pxor	xmm4, xmm0
-        pxor	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqu	xmm7, [rsp+32]
-        pshufd	xmm1, xmm13, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm13, 17
-        pclmulqdq	xmm0, xmm13, 0
-        pxor	xmm1, xmm13
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        pxor	xmm4, xmm0
-        pxor	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqu	xmm7, [rsp+16]
-        pshufd	xmm1, xmm14, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm14, 17
-        pclmulqdq	xmm0, xmm14, 0
-        pxor	xmm1, xmm14
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        pxor	xmm4, xmm0
-        pxor	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqu	xmm7, [rsp]
-        pshufd	xmm1, xmm15, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm15, 17
-        pclmulqdq	xmm0, xmm15, 0
-        pxor	xmm1, xmm15
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        pxor	xmm4, xmm0
-        pxor	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqa	xmm0, xmm4
-        movdqa	xmm1, xmm4
-        movdqa	xmm2, xmm4
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm4, xmm0
-        movdqa	xmm2, xmm4
-        movdqa	xmm3, xmm4
-        movdqa	xmm0, xmm4
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm4
-        pxor	xmm6, xmm2
-        movdqu	xmm5, [rsp]
-L_AES_GCM_encrypt_done_128:
-        mov	edx, r9d
-        cmp	ebx, edx
-        jge	L_AES_GCM_encrypt_done_enc
-        mov	r13d, r9d
-        and	r13d, 4294967280
-        cmp	ebx, r13d
-        jge	L_AES_GCM_encrypt_last_block_done
-        lea	rcx, QWORD PTR [rdi+rbx]
-        lea	rdx, QWORD PTR [rsi+rbx]
-        movdqu	xmm8, [rsp+128]
-        movdqa	xmm9, xmm8
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_epi64
-        paddd	xmm9, OWORD PTR L_aes_gcm_one
-        pxor	xmm8, [r15]
-        movdqu	[rsp+128], xmm9
-        aesenc	xmm8, [r15+16]
-        aesenc	xmm8, [r15+32]
-        aesenc	xmm8, [r15+48]
-        aesenc	xmm8, [r15+64]
-        aesenc	xmm8, [r15+80]
-        aesenc	xmm8, [r15+96]
-        aesenc	xmm8, [r15+112]
-        aesenc	xmm8, [r15+128]
-        aesenc	xmm8, [r15+144]
-        cmp	r10d, 11
-        movdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_aesenc_block_aesenc_avx_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [r15+176]
-        cmp	r10d, 13
-        movdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_aesenc_block_aesenc_avx_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [r15+208]
-        movdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_aesenc_block_aesenc_avx_last:
-        aesenclast	xmm8, xmm9
-        movdqu	xmm9, [rcx]
-        pxor	xmm8, xmm9
-        movdqu	[rdx], xmm8
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm6, xmm8
-        add	ebx, 16
-        cmp	ebx, r13d
-        jge	L_AES_GCM_encrypt_last_block_ghash
-L_AES_GCM_encrypt_last_block_start:
-        lea	rcx, QWORD PTR [rdi+rbx]
-        lea	rdx, QWORD PTR [rsi+rbx]
-        movdqu	xmm8, [rsp+128]
-        movdqa	xmm9, xmm8
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_epi64
-        paddd	xmm9, OWORD PTR L_aes_gcm_one
-        pxor	xmm8, [r15]
-        movdqu	[rsp+128], xmm9
-        movdqa	xmm10, xmm6
-        pclmulqdq	xmm10, xmm5, 16
-        aesenc	xmm8, [r15+16]
-        aesenc	xmm8, [r15+32]
-        movdqa	xmm11, xmm6
-        pclmulqdq	xmm11, xmm5, 1
-        aesenc	xmm8, [r15+48]
-        aesenc	xmm8, [r15+64]
-        movdqa	xmm12, xmm6
-        pclmulqdq	xmm12, xmm5, 0
-        aesenc	xmm8, [r15+80]
-        movdqa	xmm1, xmm6
-        pclmulqdq	xmm1, xmm5, 17
-        aesenc	xmm8, [r15+96]
-        pxor	xmm10, xmm11
-        movdqa	xmm2, xmm10
-        psrldq	xmm10, 8
-        pslldq	xmm2, 8
-        aesenc	xmm8, [r15+112]
-        movdqa	xmm3, xmm1
-        pxor	xmm2, xmm12
-        pxor	xmm3, xmm10
-        movdqa	xmm0, OWORD PTR L_aes_gcm_mod2_128
-        movdqa	xmm11, xmm2
-        pclmulqdq	xmm11, xmm0, 16
-        aesenc	xmm8, [r15+128]
-        pshufd	xmm10, xmm2, 78
-        pxor	xmm10, xmm11
-        movdqa	xmm11, xmm10
-        pclmulqdq	xmm11, xmm0, 16
-        aesenc	xmm8, [r15+144]
-        pshufd	xmm6, xmm10, 78
-        pxor	xmm6, xmm11
-        pxor	xmm6, xmm3
-        cmp	r10d, 11
-        movdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_aesenc_gfmul_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [r15+176]
-        cmp	r10d, 13
-        movdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_aesenc_gfmul_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [r15+208]
-        movdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_aesenc_gfmul_last:
-        aesenclast	xmm8, xmm9
-        movdqu	xmm9, [rcx]
-        pxor	xmm8, xmm9
-        movdqu	[rdx], xmm8
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm6, xmm8
-        add	ebx, 16
-        cmp	ebx, r13d
-        jl	L_AES_GCM_encrypt_last_block_start
-L_AES_GCM_encrypt_last_block_ghash:
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm6, 78
-        movdqa	xmm11, xmm6
-        movdqa	xmm8, xmm6
-        pclmulqdq	xmm11, xmm5, 17
-        pclmulqdq	xmm8, xmm5, 0
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm6
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm6, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm6, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm6, xmm14
-L_AES_GCM_encrypt_last_block_done:
-        mov	ecx, r9d
-        mov	edx, ecx
-        and	ecx, 15
-        jz	L_AES_GCM_encrypt_aesenc_last15_enc_avx_done
-        movdqu	xmm4, [rsp+128]
-        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_epi64
-        pxor	xmm4, [r15]
-        aesenc	xmm4, [r15+16]
-        aesenc	xmm4, [r15+32]
-        aesenc	xmm4, [r15+48]
-        aesenc	xmm4, [r15+64]
-        aesenc	xmm4, [r15+80]
-        aesenc	xmm4, [r15+96]
-        aesenc	xmm4, [r15+112]
-        aesenc	xmm4, [r15+128]
-        aesenc	xmm4, [r15+144]
-        cmp	r10d, 11
-        movdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_aesenc_last15_enc_avx_aesenc_avx_last
-        aesenc	xmm4, xmm9
-        aesenc	xmm4, [r15+176]
-        cmp	r10d, 13
-        movdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_aesenc_last15_enc_avx_aesenc_avx_last
-        aesenc	xmm4, xmm9
-        aesenc	xmm4, [r15+208]
-        movdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_aesenc_last15_enc_avx_aesenc_avx_last:
-        aesenclast	xmm4, xmm9
-        sub	rsp, 16
-        xor	ecx, ecx
-        movdqu	[rsp], xmm4
-L_AES_GCM_encrypt_aesenc_last15_enc_avx_loop:
-        movzx	r13d, BYTE PTR [rdi+rbx]
-        xor	r13b, BYTE PTR [rsp+rcx]
-        mov	BYTE PTR [rsi+rbx], r13b
-        mov	BYTE PTR [rsp+rcx], r13b
-        inc	ebx
-        inc	ecx
-        cmp	ebx, edx
-        jl	L_AES_GCM_encrypt_aesenc_last15_enc_avx_loop
-        xor	r13, r13
-        cmp	ecx, 16
-        je	L_AES_GCM_encrypt_aesenc_last15_enc_avx_finish_enc
-L_AES_GCM_encrypt_aesenc_last15_enc_avx_byte_loop:
-        mov	BYTE PTR [rsp+rcx], r13b
-        inc	ecx
-        cmp	ecx, 16
-        jl	L_AES_GCM_encrypt_aesenc_last15_enc_avx_byte_loop
-L_AES_GCM_encrypt_aesenc_last15_enc_avx_finish_enc:
-        movdqu	xmm4, [rsp]
-        add	rsp, 16
-        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm6, xmm4
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm6, 78
-        movdqa	xmm11, xmm6
-        movdqa	xmm8, xmm6
-        pclmulqdq	xmm11, xmm5, 17
-        pclmulqdq	xmm8, xmm5, 0
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm6
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm6, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm6, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm6, xmm14
-L_AES_GCM_encrypt_aesenc_last15_enc_avx_done:
-L_AES_GCM_encrypt_done_enc:
-        mov	edx, r9d
-        mov	ecx, r11d
-        shl	rdx, 3
-        shl	rcx, 3
-        pinsrq	xmm0, rdx, 0
-        pinsrq	xmm0, rcx, 1
-        pxor	xmm6, xmm0
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm6, 78
-        movdqa	xmm11, xmm6
-        movdqa	xmm8, xmm6
-        pclmulqdq	xmm11, xmm5, 17
-        pclmulqdq	xmm8, xmm5, 0
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm6
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm6, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm6, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm6, xmm14
-        pshufb	xmm6, OWORD PTR L_aes_gcm_bswap_mask
-        movdqu	xmm0, [rsp+144]
-        pxor	xmm0, xmm6
-        cmp	r14d, 16
-        je	L_AES_GCM_encrypt_store_tag_16
-        xor	rcx, rcx
-        movdqu	[rsp], xmm0
-L_AES_GCM_encrypt_store_tag_loop:
-        movzx	r13d, BYTE PTR [rsp+rcx]
-        mov	BYTE PTR [r8+rcx], r13b
-        inc	ecx
-        cmp	ecx, r14d
-        jne	L_AES_GCM_encrypt_store_tag_loop
-        jmp	L_AES_GCM_encrypt_store_tag_done
-L_AES_GCM_encrypt_store_tag_16:
-        movdqu	[r8], xmm0
-L_AES_GCM_encrypt_store_tag_done:
-        movdqu	xmm6, [rsp+160]
-        movdqu	xmm7, [rsp+176]
-        movdqu	xmm8, [rsp+192]
-        movdqu	xmm9, [rsp+208]
-        movdqu	xmm10, [rsp+224]
-        movdqu	xmm11, [rsp+240]
-        movdqu	xmm12, [rsp+256]
-        movdqu	xmm13, [rsp+272]
-        movdqu	xmm14, [rsp+288]
-        movdqu	xmm15, [rsp+304]
-        add	rsp, 320
-        pop	r15
-        pop	r14
-        pop	rbx
-        pop	r12
-        pop	rsi
-        pop	rdi
-        pop	r13
-        ret
-AES_GCM_encrypt ENDP
-_text ENDS	; close the _text segment opened earlier; AES_GCM_encrypt ENDP is directly above
-_text SEGMENT READONLY PARA	; reopen _text (READONLY attribute, PARA = paragraph alignment) for AES_GCM_decrypt below
-AES_GCM_decrypt PROC
-        push	r13
-        push	rdi
-        push	rsi
-        push	r12
-        push	rbx
-        push	r14
-        push	r15
-        push	rbp
-        mov	rdi, rcx
-        mov	rsi, rdx
-        mov	r12, r8
-        mov	rax, r9
-        mov	r8, QWORD PTR [rsp+104]
-        mov	r9d, DWORD PTR [rsp+112]
-        mov	r11d, DWORD PTR [rsp+120]
-        mov	ebx, DWORD PTR [rsp+128]
-        mov	r14d, DWORD PTR [rsp+136]
-        mov	r15, QWORD PTR [rsp+144]
-        mov	r10d, DWORD PTR [rsp+152]
-        mov	rbp, QWORD PTR [rsp+160]
-        sub	rsp, 328
-        movdqu	[rsp+168], xmm6
-        movdqu	[rsp+184], xmm7
-        movdqu	[rsp+200], xmm8
-        movdqu	[rsp+216], xmm9
-        movdqu	[rsp+232], xmm10
-        movdqu	[rsp+248], xmm11
-        movdqu	[rsp+264], xmm12
-        movdqu	[rsp+280], xmm13
-        movdqu	[rsp+296], xmm14
-        movdqu	[rsp+312], xmm15
-        pxor	xmm4, xmm4
-        pxor	xmm6, xmm6
-        cmp	ebx, 12
-        mov	edx, ebx
-        jne	L_AES_GCM_decrypt_iv_not_12
-        ; # Calculate values when IV is 12 bytes
-        ; Set counter based on IV
-        mov	ecx, 16777216
-        pinsrq	xmm4, QWORD PTR [rax], 0
-        pinsrd	xmm4, DWORD PTR [rax+8], 2
-        pinsrd	xmm4, ecx, 3
-        ; H = Encrypt X(=0) and T = Encrypt counter
-        movdqa	xmm1, xmm4
-        movdqa	xmm5, OWORD PTR [r15]
-        pxor	xmm1, xmm5
-        movdqa	xmm7, OWORD PTR [r15+16]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+32]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+48]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+64]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+80]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+96]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+112]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+128]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+144]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        cmp	r10d, 11
-        movdqa	xmm7, OWORD PTR [r15+160]
-        jl	L_AES_GCM_decrypt_calc_iv_12_last
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+176]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        cmp	r10d, 13
-        movdqa	xmm7, OWORD PTR [r15+192]
-        jl	L_AES_GCM_decrypt_calc_iv_12_last
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+208]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+224]
-L_AES_GCM_decrypt_calc_iv_12_last:
-        aesenclast	xmm5, xmm7
-        aesenclast	xmm1, xmm7
-        pshufb	xmm5, OWORD PTR L_aes_gcm_bswap_mask
-        movdqu	[rsp+144], xmm1
-        jmp	L_AES_GCM_decrypt_iv_done
-L_AES_GCM_decrypt_iv_not_12:
-        ; Calculate values when IV is not 12 bytes
-        ; H = Encrypt X(=0)
-        movdqa	xmm5, OWORD PTR [r15]
-        aesenc	xmm5, [r15+16]
-        aesenc	xmm5, [r15+32]
-        aesenc	xmm5, [r15+48]
-        aesenc	xmm5, [r15+64]
-        aesenc	xmm5, [r15+80]
-        aesenc	xmm5, [r15+96]
-        aesenc	xmm5, [r15+112]
-        aesenc	xmm5, [r15+128]
-        aesenc	xmm5, [r15+144]
-        cmp	r10d, 11
-        movdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_decrypt_calc_iv_1_aesenc_avx_last
-        aesenc	xmm5, xmm9
-        aesenc	xmm5, [r15+176]
-        cmp	r10d, 13
-        movdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_decrypt_calc_iv_1_aesenc_avx_last
-        aesenc	xmm5, xmm9
-        aesenc	xmm5, [r15+208]
-        movdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_decrypt_calc_iv_1_aesenc_avx_last:
-        aesenclast	xmm5, xmm9
-        pshufb	xmm5, OWORD PTR L_aes_gcm_bswap_mask
-        ; Calc counter
-        ; Initialization vector
-        cmp	edx, 0
-        mov	rcx, 0
-        je	L_AES_GCM_decrypt_calc_iv_done
-        cmp	edx, 16
-        jl	L_AES_GCM_decrypt_calc_iv_lt16
-        and	edx, 4294967280
-L_AES_GCM_decrypt_calc_iv_16_loop:
-        movdqu	xmm8, [rax+rcx]
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm4, xmm8
-        pshufd	xmm1, xmm4, 78
-        pshufd	xmm2, xmm5, 78
-        movdqa	xmm3, xmm5
-        movdqa	xmm0, xmm5
-        pclmulqdq	xmm3, xmm4, 17
-        pclmulqdq	xmm0, xmm4, 0
-        pxor	xmm1, xmm4
-        pxor	xmm2, xmm5
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm7, xmm0
-        movdqa	xmm4, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm7, xmm2
-        pxor	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm4
-        psrld	xmm0, 31
-        psrld	xmm1, 31
-        pslld	xmm7, 1
-        pslld	xmm4, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12
-        pslldq	xmm1, 4
-        por	xmm4, xmm2
-        por	xmm7, xmm0
-        por	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm7
-        movdqa	xmm2, xmm7
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm7, xmm0
-        movdqa	xmm2, xmm7
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm7
-        pxor	xmm4, xmm2
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_decrypt_calc_iv_16_loop
-        mov	edx, ebx
-        cmp	ecx, edx
-        je	L_AES_GCM_decrypt_calc_iv_done
-L_AES_GCM_decrypt_calc_iv_lt16:
-        sub	rsp, 16
-        pxor	xmm8, xmm8
-        xor	ebx, ebx
-        movdqu	[rsp], xmm8
-L_AES_GCM_decrypt_calc_iv_loop:
-        movzx	r13d, BYTE PTR [rax+rcx]
-        mov	BYTE PTR [rsp+rbx], r13b
-        inc	ecx
-        inc	ebx
-        cmp	ecx, edx
-        jl	L_AES_GCM_decrypt_calc_iv_loop
-        movdqu	xmm8, [rsp]
-        add	rsp, 16
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm4, xmm8
-        pshufd	xmm1, xmm4, 78
-        pshufd	xmm2, xmm5, 78
-        movdqa	xmm3, xmm5
-        movdqa	xmm0, xmm5
-        pclmulqdq	xmm3, xmm4, 17
-        pclmulqdq	xmm0, xmm4, 0
-        pxor	xmm1, xmm4
-        pxor	xmm2, xmm5
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm7, xmm0
-        movdqa	xmm4, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm7, xmm2
-        pxor	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm4
-        psrld	xmm0, 31
-        psrld	xmm1, 31
-        pslld	xmm7, 1
-        pslld	xmm4, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12
-        pslldq	xmm1, 4
-        por	xmm4, xmm2
-        por	xmm7, xmm0
-        por	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm7
-        movdqa	xmm2, xmm7
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm7, xmm0
-        movdqa	xmm2, xmm7
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm7
-        pxor	xmm4, xmm2
-L_AES_GCM_decrypt_calc_iv_done:
-        ; T = Encrypt counter
-        pxor	xmm0, xmm0
-        shl	edx, 3
-        pinsrq	xmm0, rdx, 0
-        pxor	xmm4, xmm0
-        pshufd	xmm1, xmm4, 78
-        pshufd	xmm2, xmm5, 78
-        movdqa	xmm3, xmm5
-        movdqa	xmm0, xmm5
-        pclmulqdq	xmm3, xmm4, 17
-        pclmulqdq	xmm0, xmm4, 0
-        pxor	xmm1, xmm4
-        pxor	xmm2, xmm5
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm7, xmm0
-        movdqa	xmm4, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm7, xmm2
-        pxor	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm4
-        psrld	xmm0, 31
-        psrld	xmm1, 31
-        pslld	xmm7, 1
-        pslld	xmm4, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12
-        pslldq	xmm1, 4
-        por	xmm4, xmm2
-        por	xmm7, xmm0
-        por	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm7
-        movdqa	xmm2, xmm7
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm7, xmm0
-        movdqa	xmm2, xmm7
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm7
-        pxor	xmm4, xmm2
-        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_mask
-        ;   Encrypt counter
-        movdqa	xmm8, OWORD PTR [r15]
-        pxor	xmm8, xmm4
-        aesenc	xmm8, [r15+16]
-        aesenc	xmm8, [r15+32]
-        aesenc	xmm8, [r15+48]
-        aesenc	xmm8, [r15+64]
-        aesenc	xmm8, [r15+80]
-        aesenc	xmm8, [r15+96]
-        aesenc	xmm8, [r15+112]
-        aesenc	xmm8, [r15+128]
-        aesenc	xmm8, [r15+144]
-        cmp	r10d, 11
-        movdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_decrypt_calc_iv_2_aesenc_avx_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [r15+176]
-        cmp	r10d, 13
-        movdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_decrypt_calc_iv_2_aesenc_avx_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [r15+208]
-        movdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_decrypt_calc_iv_2_aesenc_avx_last:
-        aesenclast	xmm8, xmm9
-        movdqu	[rsp+144], xmm8
-L_AES_GCM_decrypt_iv_done:
-        ; Additional authentication data
-        mov	edx, r11d
-        cmp	edx, 0
-        je	L_AES_GCM_decrypt_calc_aad_done
-        xor	ecx, ecx
-        cmp	edx, 16
-        jl	L_AES_GCM_decrypt_calc_aad_lt16
-        and	edx, 4294967280
-L_AES_GCM_decrypt_calc_aad_16_loop:
-        movdqu	xmm8, [r12+rcx]
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm6, xmm8
-        pshufd	xmm1, xmm6, 78
-        pshufd	xmm2, xmm5, 78
-        movdqa	xmm3, xmm5
-        movdqa	xmm0, xmm5
-        pclmulqdq	xmm3, xmm6, 17
-        pclmulqdq	xmm0, xmm6, 0
-        pxor	xmm1, xmm6
-        pxor	xmm2, xmm5
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm7, xmm0
-        movdqa	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm7, xmm2
-        pxor	xmm6, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm6
-        psrld	xmm0, 31
-        psrld	xmm1, 31
-        pslld	xmm7, 1
-        pslld	xmm6, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12
-        pslldq	xmm1, 4
-        por	xmm6, xmm2
-        por	xmm7, xmm0
-        por	xmm6, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm7
-        movdqa	xmm2, xmm7
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm7, xmm0
-        movdqa	xmm2, xmm7
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm7
-        pxor	xmm6, xmm2
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_decrypt_calc_aad_16_loop
-        mov	edx, r11d
-        cmp	ecx, edx
-        je	L_AES_GCM_decrypt_calc_aad_done
-L_AES_GCM_decrypt_calc_aad_lt16:
-        sub	rsp, 16
-        pxor	xmm8, xmm8
-        xor	ebx, ebx
-        movdqu	[rsp], xmm8
-L_AES_GCM_decrypt_calc_aad_loop:
-        movzx	r13d, BYTE PTR [r12+rcx]
-        mov	BYTE PTR [rsp+rbx], r13b
-        inc	ecx
-        inc	ebx
-        cmp	ecx, edx
-        jl	L_AES_GCM_decrypt_calc_aad_loop
-        movdqu	xmm8, [rsp]
-        add	rsp, 16
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm6, xmm8
-        pshufd	xmm1, xmm6, 78
-        pshufd	xmm2, xmm5, 78
-        movdqa	xmm3, xmm5
-        movdqa	xmm0, xmm5
-        pclmulqdq	xmm3, xmm6, 17
-        pclmulqdq	xmm0, xmm6, 0
-        pxor	xmm1, xmm6
-        pxor	xmm2, xmm5
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm7, xmm0
-        movdqa	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm7, xmm2
-        pxor	xmm6, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm6
-        psrld	xmm0, 31
-        psrld	xmm1, 31
-        pslld	xmm7, 1
-        pslld	xmm6, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12
-        pslldq	xmm1, 4
-        por	xmm6, xmm2
-        por	xmm7, xmm0
-        por	xmm6, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm7
-        movdqa	xmm2, xmm7
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm7, xmm0
-        movdqa	xmm2, xmm7
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm7
-        pxor	xmm6, xmm2
-L_AES_GCM_decrypt_calc_aad_done:
-        ; Calculate counter and H
-        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_epi64
-        movdqa	xmm9, xmm5
-        paddd	xmm4, OWORD PTR L_aes_gcm_one
-        movdqa	xmm8, xmm5
-        movdqu	[rsp+128], xmm4
-        psrlq	xmm9, 63
-        psllq	xmm8, 1
-        pslldq	xmm9, 8
-        por	xmm8, xmm9
-        pshufd	xmm5, xmm5, 255
-        psrad	xmm5, 31
-        pand	xmm5, OWORD PTR L_aes_gcm_mod2_128
-        pxor	xmm5, xmm8
-        xor	ebx, ebx
-        cmp	r9d, 128
-        mov	r13d, r9d
-        jl	L_AES_GCM_decrypt_done_128
-        and	r13d, 4294967168
-        movdqa	xmm2, xmm6
-        ; H ^ 1
-        movdqu	[rsp], xmm5
-        ; H ^ 2
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm5, 78
-        movdqa	xmm11, xmm5
-        movdqa	xmm8, xmm5
-        pclmulqdq	xmm11, xmm5, 17
-        pclmulqdq	xmm8, xmm5, 0
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm5
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm0, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm0, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm0, xmm14
-        movdqu	[rsp+16], xmm0
-        ; H ^ 3
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm0, 78
-        movdqa	xmm11, xmm0
-        movdqa	xmm8, xmm0
-        pclmulqdq	xmm11, xmm5, 17
-        pclmulqdq	xmm8, xmm5, 0
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm0
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm1, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm1, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm1, xmm14
-        movdqu	[rsp+32], xmm1
-        ; H ^ 4
-        pshufd	xmm9, xmm0, 78
-        pshufd	xmm10, xmm0, 78
-        movdqa	xmm11, xmm0
-        movdqa	xmm8, xmm0
-        pclmulqdq	xmm11, xmm0, 17
-        pclmulqdq	xmm8, xmm0, 0
-        pxor	xmm9, xmm0
-        pxor	xmm10, xmm0
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm3, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm3, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm3, xmm14
-        movdqu	[rsp+48], xmm3
-        ; H ^ 5
-        pshufd	xmm9, xmm0, 78
-        pshufd	xmm10, xmm1, 78
-        movdqa	xmm11, xmm1
-        movdqa	xmm8, xmm1
-        pclmulqdq	xmm11, xmm0, 17
-        pclmulqdq	xmm8, xmm0, 0
-        pxor	xmm9, xmm0
-        pxor	xmm10, xmm1
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+64], xmm7
-        ; H ^ 6
-        pshufd	xmm9, xmm1, 78
-        pshufd	xmm10, xmm1, 78
-        movdqa	xmm11, xmm1
-        movdqa	xmm8, xmm1
-        pclmulqdq	xmm11, xmm1, 17
-        pclmulqdq	xmm8, xmm1, 0
-        pxor	xmm9, xmm1
-        pxor	xmm10, xmm1
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+80], xmm7
-        ; H ^ 7
-        pshufd	xmm9, xmm1, 78
-        pshufd	xmm10, xmm3, 78
-        movdqa	xmm11, xmm3
-        movdqa	xmm8, xmm3
-        pclmulqdq	xmm11, xmm1, 17
-        pclmulqdq	xmm8, xmm1, 0
-        pxor	xmm9, xmm1
-        pxor	xmm10, xmm3
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+96], xmm7
-        ; H ^ 8
-        pshufd	xmm9, xmm3, 78
-        pshufd	xmm10, xmm3, 78
-        movdqa	xmm11, xmm3
-        movdqa	xmm8, xmm3
-        pclmulqdq	xmm11, xmm3, 17
-        pclmulqdq	xmm8, xmm3, 0
-        pxor	xmm9, xmm3
-        pxor	xmm10, xmm3
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+112], xmm7
-L_AES_GCM_decrypt_ghash_128:
-        lea	rcx, QWORD PTR [rdi+rbx]
-        lea	rdx, QWORD PTR [rsi+rbx]
-        movdqu	xmm8, [rsp+128]
-        movdqa	xmm1, OWORD PTR L_aes_gcm_bswap_epi64
-        movdqa	xmm0, xmm8
-        pshufb	xmm8, xmm1
-        movdqa	xmm9, xmm0
-        paddd	xmm9, OWORD PTR L_aes_gcm_one
-        pshufb	xmm9, xmm1
-        movdqa	xmm10, xmm0
-        paddd	xmm10, OWORD PTR L_aes_gcm_two
-        pshufb	xmm10, xmm1
-        movdqa	xmm11, xmm0
-        paddd	xmm11, OWORD PTR L_aes_gcm_three
-        pshufb	xmm11, xmm1
-        movdqa	xmm12, xmm0
-        paddd	xmm12, OWORD PTR L_aes_gcm_four
-        pshufb	xmm12, xmm1
-        movdqa	xmm13, xmm0
-        paddd	xmm13, OWORD PTR L_aes_gcm_five
-        pshufb	xmm13, xmm1
-        movdqa	xmm14, xmm0
-        paddd	xmm14, OWORD PTR L_aes_gcm_six
-        pshufb	xmm14, xmm1
-        movdqa	xmm15, xmm0
-        paddd	xmm15, OWORD PTR L_aes_gcm_seven
-        pshufb	xmm15, xmm1
-        paddd	xmm0, OWORD PTR L_aes_gcm_eight
-        movdqa	xmm7, OWORD PTR [r15]
-        movdqu	[rsp+128], xmm0
-        pxor	xmm8, xmm7
-        pxor	xmm9, xmm7
-        pxor	xmm10, xmm7
-        pxor	xmm11, xmm7
-        pxor	xmm12, xmm7
-        pxor	xmm13, xmm7
-        pxor	xmm14, xmm7
-        pxor	xmm15, xmm7
-        movdqu	xmm7, [rsp+112]
-        movdqu	xmm0, [rcx]
-        aesenc	xmm8, [r15+16]
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm0, xmm2
-        pshufd	xmm1, xmm7, 78
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm1, xmm7
-        pxor	xmm5, xmm0
-        movdqa	xmm3, xmm0
-        pclmulqdq	xmm3, xmm7, 17
-        aesenc	xmm9, [r15+16]
-        aesenc	xmm10, [r15+16]
-        movdqa	xmm2, xmm0
-        pclmulqdq	xmm2, xmm7, 0
-        aesenc	xmm11, [r15+16]
-        aesenc	xmm12, [r15+16]
-        pclmulqdq	xmm1, xmm5, 0
-        aesenc	xmm13, [r15+16]
-        aesenc	xmm14, [r15+16]
-        aesenc	xmm15, [r15+16]
-        pxor	xmm1, xmm2
-        pxor	xmm1, xmm3
-        movdqu	xmm7, [rsp+96]
-        movdqu	xmm0, [rcx+16]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [r15+32]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [r15+32]
-        aesenc	xmm10, [r15+32]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [r15+32]
-        aesenc	xmm12, [r15+32]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [r15+32]
-        aesenc	xmm14, [r15+32]
-        aesenc	xmm15, [r15+32]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+80]
-        movdqu	xmm0, [rcx+32]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [r15+48]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [r15+48]
-        aesenc	xmm10, [r15+48]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [r15+48]
-        aesenc	xmm12, [r15+48]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [r15+48]
-        aesenc	xmm14, [r15+48]
-        aesenc	xmm15, [r15+48]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+64]
-        movdqu	xmm0, [rcx+48]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [r15+64]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [r15+64]
-        aesenc	xmm10, [r15+64]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [r15+64]
-        aesenc	xmm12, [r15+64]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [r15+64]
-        aesenc	xmm14, [r15+64]
-        aesenc	xmm15, [r15+64]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+48]
-        movdqu	xmm0, [rcx+64]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [r15+80]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [r15+80]
-        aesenc	xmm10, [r15+80]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [r15+80]
-        aesenc	xmm12, [r15+80]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [r15+80]
-        aesenc	xmm14, [r15+80]
-        aesenc	xmm15, [r15+80]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+32]
-        movdqu	xmm0, [rcx+80]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [r15+96]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [r15+96]
-        aesenc	xmm10, [r15+96]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [r15+96]
-        aesenc	xmm12, [r15+96]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [r15+96]
-        aesenc	xmm14, [r15+96]
-        aesenc	xmm15, [r15+96]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+16]
-        movdqu	xmm0, [rcx+96]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [r15+112]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [r15+112]
-        aesenc	xmm10, [r15+112]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [r15+112]
-        aesenc	xmm12, [r15+112]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [r15+112]
-        aesenc	xmm14, [r15+112]
-        aesenc	xmm15, [r15+112]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp]
-        movdqu	xmm0, [rcx+112]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [r15+128]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [r15+128]
-        aesenc	xmm10, [r15+128]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [r15+128]
-        aesenc	xmm12, [r15+128]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [r15+128]
-        aesenc	xmm14, [r15+128]
-        aesenc	xmm15, [r15+128]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqa	xmm5, xmm1
-        psrldq	xmm1, 8
-        pslldq	xmm5, 8
-        aesenc	xmm8, [r15+144]
-        pxor	xmm2, xmm5
-        pxor	xmm3, xmm1
-        movdqa	xmm7, xmm2
-        movdqa	xmm4, xmm2
-        movdqa	xmm5, xmm2
-        aesenc	xmm9, [r15+144]
-        pslld	xmm7, 31
-        pslld	xmm4, 30
-        pslld	xmm5, 25
-        aesenc	xmm10, [r15+144]
-        pxor	xmm7, xmm4
-        pxor	xmm7, xmm5
-        aesenc	xmm11, [r15+144]
-        movdqa	xmm4, xmm7
-        pslldq	xmm7, 12
-        psrldq	xmm4, 4
-        aesenc	xmm12, [r15+144]
-        pxor	xmm2, xmm7
-        movdqa	xmm5, xmm2
-        movdqa	xmm1, xmm2
-        movdqa	xmm0, xmm2
-        aesenc	xmm13, [r15+144]
-        psrld	xmm5, 1
-        psrld	xmm1, 2
-        psrld	xmm0, 7
-        aesenc	xmm14, [r15+144]
-        pxor	xmm5, xmm1
-        pxor	xmm5, xmm0
-        aesenc	xmm15, [r15+144]
-        pxor	xmm5, xmm4
-        pxor	xmm2, xmm5
-        pxor	xmm2, xmm3
-        cmp	r10d, 11
-        movdqa	xmm7, OWORD PTR [r15+160]
-        jl	L_AES_GCM_decrypt_aesenc_128_ghash_avx_done
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+176]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        cmp	r10d, 13
-        movdqa	xmm7, OWORD PTR [r15+192]
-        jl	L_AES_GCM_decrypt_aesenc_128_ghash_avx_done
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+208]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+224]
-L_AES_GCM_decrypt_aesenc_128_ghash_avx_done:
-        aesenclast	xmm8, xmm7
-        aesenclast	xmm9, xmm7
-        movdqu	xmm0, [rcx]
-        movdqu	xmm1, [rcx+16]
-        pxor	xmm8, xmm0
-        pxor	xmm9, xmm1
-        movdqu	[rdx], xmm8
-        movdqu	[rdx+16], xmm9
-        aesenclast	xmm10, xmm7
-        aesenclast	xmm11, xmm7
-        movdqu	xmm0, [rcx+32]
-        movdqu	xmm1, [rcx+48]
-        pxor	xmm10, xmm0
-        pxor	xmm11, xmm1
-        movdqu	[rdx+32], xmm10
-        movdqu	[rdx+48], xmm11
-        aesenclast	xmm12, xmm7
-        aesenclast	xmm13, xmm7
-        movdqu	xmm0, [rcx+64]
-        movdqu	xmm1, [rcx+80]
-        pxor	xmm12, xmm0
-        pxor	xmm13, xmm1
-        movdqu	[rdx+64], xmm12
-        movdqu	[rdx+80], xmm13
-        aesenclast	xmm14, xmm7
-        aesenclast	xmm15, xmm7
-        movdqu	xmm0, [rcx+96]
-        movdqu	xmm1, [rcx+112]
-        pxor	xmm14, xmm0
-        pxor	xmm15, xmm1
-        movdqu	[rdx+96], xmm14
-        movdqu	[rdx+112], xmm15
-        add	ebx, 128
-        cmp	ebx, r13d
-        jl	L_AES_GCM_decrypt_ghash_128
-        movdqa	xmm6, xmm2
-        movdqu	xmm5, [rsp]
-L_AES_GCM_decrypt_done_128:
-        mov	edx, r9d
-        cmp	ebx, edx
-        jge	L_AES_GCM_decrypt_done_dec
-        mov	r13d, r9d
-        and	r13d, 4294967280
-        cmp	ebx, r13d
-        jge	L_AES_GCM_decrypt_last_block_done
-L_AES_GCM_decrypt_last_block_start:
-        lea	rcx, QWORD PTR [rdi+rbx]
-        lea	rdx, QWORD PTR [rsi+rbx]
-        movdqu	xmm1, [rcx]
-        movdqa	xmm0, xmm5
-        pshufb	xmm1, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm1, xmm6
-        movdqu	xmm8, [rsp+128]
-        movdqa	xmm9, xmm8
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_epi64
-        paddd	xmm9, OWORD PTR L_aes_gcm_one
-        pxor	xmm8, [r15]
-        movdqu	[rsp+128], xmm9
-        movdqa	xmm10, xmm1
-        pclmulqdq	xmm10, xmm0, 16
-        aesenc	xmm8, [r15+16]
-        aesenc	xmm8, [r15+32]
-        movdqa	xmm11, xmm1
-        pclmulqdq	xmm11, xmm0, 1
-        aesenc	xmm8, [r15+48]
-        aesenc	xmm8, [r15+64]
-        movdqa	xmm12, xmm1
-        pclmulqdq	xmm12, xmm0, 0
-        aesenc	xmm8, [r15+80]
-        movdqa	xmm1, xmm1
-        pclmulqdq	xmm1, xmm0, 17
-        aesenc	xmm8, [r15+96]
-        pxor	xmm10, xmm11
-        movdqa	xmm2, xmm10
-        psrldq	xmm10, 8
-        pslldq	xmm2, 8
-        aesenc	xmm8, [r15+112]
-        movdqa	xmm3, xmm1
-        pxor	xmm2, xmm12
-        pxor	xmm3, xmm10
-        movdqa	xmm0, OWORD PTR L_aes_gcm_mod2_128
-        movdqa	xmm11, xmm2
-        pclmulqdq	xmm11, xmm0, 16
-        aesenc	xmm8, [r15+128]
-        pshufd	xmm10, xmm2, 78
-        pxor	xmm10, xmm11
-        movdqa	xmm11, xmm10
-        pclmulqdq	xmm11, xmm0, 16
-        aesenc	xmm8, [r15+144]
-        pshufd	xmm6, xmm10, 78
-        pxor	xmm6, xmm11
-        pxor	xmm6, xmm3
-        cmp	r10d, 11
-        movdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_decrypt_aesenc_gfmul_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [r15+176]
-        cmp	r10d, 13
-        movdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_decrypt_aesenc_gfmul_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [r15+208]
-        movdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_decrypt_aesenc_gfmul_last:
-        aesenclast	xmm8, xmm9
-        movdqu	xmm9, [rcx]
-        pxor	xmm8, xmm9
-        movdqu	[rdx], xmm8
-        add	ebx, 16
-        cmp	ebx, r13d
-        jl	L_AES_GCM_decrypt_last_block_start
-L_AES_GCM_decrypt_last_block_done:
-        mov	ecx, r9d
-        mov	edx, ecx
-        and	ecx, 15
-        jz	L_AES_GCM_decrypt_aesenc_last15_dec_avx_done
-        movdqu	xmm4, [rsp+128]
-        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_epi64
-        pxor	xmm4, [r15]
-        aesenc	xmm4, [r15+16]
-        aesenc	xmm4, [r15+32]
-        aesenc	xmm4, [r15+48]
-        aesenc	xmm4, [r15+64]
-        aesenc	xmm4, [r15+80]
-        aesenc	xmm4, [r15+96]
-        aesenc	xmm4, [r15+112]
-        aesenc	xmm4, [r15+128]
-        aesenc	xmm4, [r15+144]
-        cmp	r10d, 11
-        movdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_decrypt_aesenc_last15_dec_avx_aesenc_avx_last
-        aesenc	xmm4, xmm9
-        aesenc	xmm4, [r15+176]
-        cmp	r10d, 13
-        movdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_decrypt_aesenc_last15_dec_avx_aesenc_avx_last
-        aesenc	xmm4, xmm9
-        aesenc	xmm4, [r15+208]
-        movdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_decrypt_aesenc_last15_dec_avx_aesenc_avx_last:
-        aesenclast	xmm4, xmm9
-        sub	rsp, 32
-        xor	ecx, ecx
-        movdqu	[rsp], xmm4
-        pxor	xmm0, xmm0
-        movdqu	[rsp+16], xmm0
-L_AES_GCM_decrypt_aesenc_last15_dec_avx_loop:
-        movzx	r13d, BYTE PTR [rdi+rbx]
-        mov	BYTE PTR [rsp+rcx+16], r13b
-        xor	r13b, BYTE PTR [rsp+rcx]
-        mov	BYTE PTR [rsi+rbx], r13b
-        inc	ebx
-        inc	ecx
-        cmp	ebx, edx
-        jl	L_AES_GCM_decrypt_aesenc_last15_dec_avx_loop
-        movdqu	xmm4, [rsp+16]
-        add	rsp, 32
-        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm6, xmm4
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm6, 78
-        movdqa	xmm11, xmm6
-        movdqa	xmm8, xmm6
-        pclmulqdq	xmm11, xmm5, 17
-        pclmulqdq	xmm8, xmm5, 0
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm6
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm6, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm6, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm6, xmm14
-L_AES_GCM_decrypt_aesenc_last15_dec_avx_done:
-L_AES_GCM_decrypt_done_dec:
-        mov	edx, r9d
-        mov	ecx, r11d
-        shl	rdx, 3
-        shl	rcx, 3
-        pinsrq	xmm0, rdx, 0
-        pinsrq	xmm0, rcx, 1
-        pxor	xmm6, xmm0
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm6, 78
-        movdqa	xmm11, xmm6
-        movdqa	xmm8, xmm6
-        pclmulqdq	xmm11, xmm5, 17
-        pclmulqdq	xmm8, xmm5, 0
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm6
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm6, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm6, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm6, xmm14
-        pshufb	xmm6, OWORD PTR L_aes_gcm_bswap_mask
-        movdqu	xmm0, [rsp+144]
-        pxor	xmm0, xmm6
-        cmp	r14d, 16
-        je	L_AES_GCM_decrypt_cmp_tag_16
-        sub	rsp, 16
-        xor	rcx, rcx
-        xor	rbx, rbx
-        movdqu	[rsp], xmm0
-L_AES_GCM_decrypt_cmp_tag_loop:
-        movzx	r13d, BYTE PTR [rsp+rcx]
-        xor	r13b, BYTE PTR [r8+rcx]
-        or	bl, r13b
-        inc	ecx
-        cmp	ecx, r14d
-        jne	L_AES_GCM_decrypt_cmp_tag_loop
-        cmp	rbx, 0
-        sete	bl
-        add	rsp, 16
-        xor	rcx, rcx
-        jmp	L_AES_GCM_decrypt_cmp_tag_done
-L_AES_GCM_decrypt_cmp_tag_16:
-        movdqu	xmm1, [r8]
-        pcmpeqb	xmm0, xmm1
-        pmovmskb	rdx, xmm0
-        ; %%edx == 0xFFFF then return 1 else => return 0
-        xor	ebx, ebx
-        cmp	edx, 65535
-        sete	bl
-L_AES_GCM_decrypt_cmp_tag_done:
-        mov	DWORD PTR [rbp], ebx
-        movdqu	xmm6, [rsp+168]
-        movdqu	xmm7, [rsp+184]
-        movdqu	xmm8, [rsp+200]
-        movdqu	xmm9, [rsp+216]
-        movdqu	xmm10, [rsp+232]
-        movdqu	xmm11, [rsp+248]
-        movdqu	xmm12, [rsp+264]
-        movdqu	xmm13, [rsp+280]
-        movdqu	xmm14, [rsp+296]
-        movdqu	xmm15, [rsp+312]
-        add	rsp, 328
-        pop	rbp
-        pop	r15
-        pop	r14
-        pop	rbx
-        pop	r12
-        pop	rsi
-        pop	rdi
-        pop	r13
-        ret
-AES_GCM_decrypt ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_init_aesni PROC    ; Derives H, the starting counter block and T = Encrypt(counter) from the round keys and the IV, and stores them through three output pointers
-        push	rdi    ; rdi, rsi, r12, r13, r14 are saved here and popped again before ret
-        push	rsi
-        push	r12
-        push	r13
-        push	r14
-        mov	rdi, rcx    ; rdi = 1st argument: base of the AES round keys ([rdi], [rdi+16], ...)
-        mov	rsi, rdx    ; esi = 2nd argument: round count, compared with 11 and 13 below
-        mov	r10, r8    ; r10 = 3rd argument: pointer to the IV bytes
-        mov	r11d, r9d    ; r11d = 4th argument: IV length in bytes
-        mov	rax, QWORD PTR [rsp+80]    ; rax = 5th argument (entry rsp+40, shifted by the five pushes): receives H
-        mov	r8, QWORD PTR [rsp+88]    ; r8 = 6th argument: receives the counter block
-        mov	r9, QWORD PTR [rsp+96]    ; r9 = 7th argument: receives T
-        sub	rsp, 80    ; scratch area holding the xmm registers saved below
-        movdqu	[rsp+16], xmm6    ; xmm6, xmm7, xmm8, xmm15 are reloaded from here before returning
-        movdqu	[rsp+32], xmm7
-        movdqu	[rsp+48], xmm8
-        movdqu	[rsp+64], xmm15
-        pxor	xmm4, xmm4    ; xmm4 = 0: counter block / hash accumulator
-        mov	edx, r11d
-        cmp	edx, 12
-        jne	L_AES_GCM_init_aesni_iv_not_12
-        ; Calculate values when IV is 12 bytes
-        ; Set counter based on IV: bytes 0..11 = IV, bytes 12..15 = 00 00 00 01
-        mov	ecx, 16777216    ; 16777216 = 01000000h, stored as bytes 00 00 00 01
-        pinsrq	xmm4, QWORD PTR [r10], 0
-        pinsrd	xmm4, DWORD PTR [r10+8], 2
-        pinsrd	xmm4, ecx, 3
-        ; H = Encrypt X(=0) and T = Encrypt counter, two blocks run through the rounds together
-        movdqa	xmm1, xmm4
-        movdqa	xmm5, OWORD PTR [rdi]    ; xmm5 = round key 0, i.e. the all-zero block XOR round key 0
-        pxor	xmm1, xmm5    ; xmm1 = counter block XOR round key 0
-        movdqa	xmm6, OWORD PTR [rdi+16]
-        aesenc	xmm5, xmm6
-        aesenc	xmm1, xmm6
-        movdqa	xmm6, OWORD PTR [rdi+32]
-        aesenc	xmm5, xmm6
-        aesenc	xmm1, xmm6
-        movdqa	xmm6, OWORD PTR [rdi+48]
-        aesenc	xmm5, xmm6
-        aesenc	xmm1, xmm6
-        movdqa	xmm6, OWORD PTR [rdi+64]
-        aesenc	xmm5, xmm6
-        aesenc	xmm1, xmm6
-        movdqa	xmm6, OWORD PTR [rdi+80]
-        aesenc	xmm5, xmm6
-        aesenc	xmm1, xmm6
-        movdqa	xmm6, OWORD PTR [rdi+96]
-        aesenc	xmm5, xmm6
-        aesenc	xmm1, xmm6
-        movdqa	xmm6, OWORD PTR [rdi+112]
-        aesenc	xmm5, xmm6
-        aesenc	xmm1, xmm6
-        movdqa	xmm6, OWORD PTR [rdi+128]
-        aesenc	xmm5, xmm6
-        aesenc	xmm1, xmm6
-        movdqa	xmm6, OWORD PTR [rdi+144]
-        aesenc	xmm5, xmm6
-        aesenc	xmm1, xmm6
-        cmp	esi, 11
-        movdqa	xmm6, OWORD PTR [rdi+160]    ; loaded between cmp and jl; movdqa leaves the flags untouched
-        jl	L_AES_GCM_init_aesni_calc_iv_12_last    ; round count below 11: [rdi+160] is the last round key
-        aesenc	xmm5, xmm6
-        aesenc	xmm1, xmm6
-        movdqa	xmm6, OWORD PTR [rdi+176]
-        aesenc	xmm5, xmm6
-        aesenc	xmm1, xmm6
-        cmp	esi, 13
-        movdqa	xmm6, OWORD PTR [rdi+192]
-        jl	L_AES_GCM_init_aesni_calc_iv_12_last    ; round count below 13: [rdi+192] is the last round key
-        aesenc	xmm5, xmm6
-        aesenc	xmm1, xmm6
-        movdqa	xmm6, OWORD PTR [rdi+208]
-        aesenc	xmm5, xmm6
-        aesenc	xmm1, xmm6
-        movdqa	xmm6, OWORD PTR [rdi+224]
-L_AES_GCM_init_aesni_calc_iv_12_last:
-        aesenclast	xmm5, xmm6
-        aesenclast	xmm1, xmm6
-        pshufb	xmm5, OWORD PTR L_aes_gcm_bswap_mask    ; byte shuffle of H; mask is defined outside this procedure (name suggests a byte swap)
-        movdqu	xmm15, xmm1    ; xmm15 = T
-        jmp	L_AES_GCM_init_aesni_iv_done
-L_AES_GCM_init_aesni_iv_not_12:
-        ; Calculate values when IV is not 12 bytes
-        ; H = Encrypt X(=0)
-        movdqa	xmm5, OWORD PTR [rdi]    ; zero block XOR round key 0 is round key 0 itself
-        aesenc	xmm5, [rdi+16]
-        aesenc	xmm5, [rdi+32]
-        aesenc	xmm5, [rdi+48]
-        aesenc	xmm5, [rdi+64]
-        aesenc	xmm5, [rdi+80]
-        aesenc	xmm5, [rdi+96]
-        aesenc	xmm5, [rdi+112]
-        aesenc	xmm5, [rdi+128]
-        aesenc	xmm5, [rdi+144]
-        cmp	esi, 11
-        movdqa	xmm8, OWORD PTR [rdi+160]
-        jl	L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last
-        aesenc	xmm5, xmm8
-        aesenc	xmm5, [rdi+176]
-        cmp	esi, 13
-        movdqa	xmm8, OWORD PTR [rdi+192]
-        jl	L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last
-        aesenc	xmm5, xmm8
-        aesenc	xmm5, [rdi+208]
-        movdqa	xmm8, OWORD PTR [rdi+224]
-L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last:
-        aesenclast	xmm5, xmm8
-        pshufb	xmm5, OWORD PTR L_aes_gcm_bswap_mask
-        ; Calc counter: hash the IV into xmm4 using H (xmm5) as the multiplier
-        ; Initialization vector
-        cmp	edx, 0
-        mov	rcx, 0    ; rcx = byte offset into the IV; mov keeps the flags from the cmp above for the je
-        je	L_AES_GCM_init_aesni_calc_iv_done
-        cmp	edx, 16
-        jl	L_AES_GCM_init_aesni_calc_iv_lt16
-        and	edx, 4294967280    ; 4294967280 = FFFFFFF0h: round the length down to whole 16-byte blocks
-L_AES_GCM_init_aesni_calc_iv_16_loop:
-        movdqu	xmm7, [r10+rcx]    ; next 16 IV bytes
-        pshufb	xmm7, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm4, xmm7    ; fold the block into the accumulator, then multiply xmm4 by xmm5
-        pshufd	xmm1, xmm4, 78    ; 78 = 01001110b: swap the two 64-bit halves
-        pshufd	xmm2, xmm5, 78
-        movdqa	xmm3, xmm5
-        movdqa	xmm0, xmm5
-        pclmulqdq	xmm3, xmm4, 17    ; xmm3 = carry-less product of the two high qwords
-        pclmulqdq	xmm0, xmm4, 0    ; xmm0 = carry-less product of the two low qwords
-        pxor	xmm1, xmm4    ; low qword of xmm1 = high XOR low half of xmm4
-        pxor	xmm2, xmm5    ; low qword of xmm2 = high XOR low half of xmm5
-        pclmulqdq	xmm1, xmm2, 0    ; product of the two XORed halves
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3    ; xmm1 = middle term of the three-multiply (Karatsuba-style) product
-        movdqa	xmm2, xmm1
-        movdqa	xmm6, xmm0    ; xmm6 = low 128 bits of the product
-        movdqa	xmm4, xmm3    ; xmm4 = high 128 bits of the product
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm6, xmm2    ; middle term split across the low ...
-        pxor	xmm4, xmm1    ; ... and high halves
-        movdqa	xmm0, xmm6    ; shift the 256-bit value xmm4:xmm6 left by one bit
-        movdqa	xmm1, xmm4
-        psrld	xmm0, 31    ; top bit of every dword = carry into the next dword
-        psrld	xmm1, 31
-        pslld	xmm6, 1
-        pslld	xmm4, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12    ; carry out of the low 128 bits moves into the high 128 bits
-        pslldq	xmm1, 4
-        por	xmm4, xmm2
-        por	xmm6, xmm0
-        por	xmm4, xmm1
-        movdqa	xmm0, xmm6    ; fold the low 128 bits into the high 128 bits (shift pattern 31/30/25 then 1/2/7; presumably the GCM polynomial reduction)
-        movdqa	xmm1, xmm6
-        movdqa	xmm2, xmm6
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm6, xmm0
-        movdqa	xmm2, xmm6
-        movdqa	xmm3, xmm6
-        movdqa	xmm0, xmm6
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm6
-        pxor	xmm4, xmm2    ; xmm4 = 128-bit result, the new accumulator
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_init_aesni_calc_iv_16_loop
-        mov	edx, r11d    ; restore the full IV length
-        cmp	ecx, edx
-        je	L_AES_GCM_init_aesni_calc_iv_done    ; no partial block left
-L_AES_GCM_init_aesni_calc_iv_lt16:
-        sub	rsp, 16    ; 16-byte stack buffer for the final partial block
-        pxor	xmm7, xmm7
-        xor	r13d, r13d    ; r13 = index into the stack buffer
-        movdqu	[rsp], xmm7    ; zero the buffer so the unused tail is zero padding
-L_AES_GCM_init_aesni_calc_iv_loop:
-        movzx	r12d, BYTE PTR [r10+rcx]    ; copy the remaining IV bytes one at a time
-        mov	BYTE PTR [rsp+r13], r12b
-        inc	ecx
-        inc	r13d
-        cmp	ecx, edx
-        jl	L_AES_GCM_init_aesni_calc_iv_loop
-        movdqu	xmm7, [rsp]
-        add	rsp, 16
-        pshufb	xmm7, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm4, xmm7    ; fold the padded block in, then multiply xmm4 by xmm5 as in the loop above
-        pshufd	xmm1, xmm4, 78
-        pshufd	xmm2, xmm5, 78
-        movdqa	xmm3, xmm5
-        movdqa	xmm0, xmm5
-        pclmulqdq	xmm3, xmm4, 17    ; high x high
-        pclmulqdq	xmm0, xmm4, 0    ; low x low
-        pxor	xmm1, xmm4
-        pxor	xmm2, xmm5
-        pclmulqdq	xmm1, xmm2, 0    ; (high XOR low) x (high XOR low)
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm6, xmm0
-        movdqa	xmm4, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm6, xmm2
-        pxor	xmm4, xmm1
-        movdqa	xmm0, xmm6    ; 256-bit left shift by one of xmm4:xmm6
-        movdqa	xmm1, xmm4
-        psrld	xmm0, 31
-        psrld	xmm1, 31
-        pslld	xmm6, 1
-        pslld	xmm4, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12
-        pslldq	xmm1, 4
-        por	xmm4, xmm2
-        por	xmm6, xmm0
-        por	xmm4, xmm1
-        movdqa	xmm0, xmm6    ; fold the low half into the high half
-        movdqa	xmm1, xmm6
-        movdqa	xmm2, xmm6
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm6, xmm0
-        movdqa	xmm2, xmm6
-        movdqa	xmm3, xmm6
-        movdqa	xmm0, xmm6
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm6
-        pxor	xmm4, xmm2
-L_AES_GCM_init_aesni_calc_iv_done:
-        ; T = Encrypt counter
-        pxor	xmm0, xmm0
-        shl	edx, 3    ; IV length in bytes -> length in bits
-        pinsrq	xmm0, rdx, 0    ; length block: bit length in the low qword, zero in the high qword
-        pxor	xmm4, xmm0    ; fold the length block in, then one more multiply by xmm5
-        pshufd	xmm1, xmm4, 78
-        pshufd	xmm2, xmm5, 78
-        movdqa	xmm3, xmm5
-        movdqa	xmm0, xmm5
-        pclmulqdq	xmm3, xmm4, 17    ; high x high
-        pclmulqdq	xmm0, xmm4, 0    ; low x low
-        pxor	xmm1, xmm4
-        pxor	xmm2, xmm5
-        pclmulqdq	xmm1, xmm2, 0    ; (high XOR low) x (high XOR low)
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm6, xmm0
-        movdqa	xmm4, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm6, xmm2
-        pxor	xmm4, xmm1
-        movdqa	xmm0, xmm6    ; 256-bit left shift by one of xmm4:xmm6
-        movdqa	xmm1, xmm4
-        psrld	xmm0, 31
-        psrld	xmm1, 31
-        pslld	xmm6, 1
-        pslld	xmm4, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12
-        pslldq	xmm1, 4
-        por	xmm4, xmm2
-        por	xmm6, xmm0
-        por	xmm4, xmm1
-        movdqa	xmm0, xmm6    ; fold the low half into the high half
-        movdqa	xmm1, xmm6
-        movdqa	xmm2, xmm6
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm6, xmm0
-        movdqa	xmm2, xmm6
-        movdqa	xmm3, xmm6
-        movdqa	xmm0, xmm6
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm6
-        pxor	xmm4, xmm2
-        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_mask    ; xmm4 = counter block derived from the hashed IV
-        ;   Encrypt counter
-        movdqa	xmm7, OWORD PTR [rdi]
-        pxor	xmm7, xmm4    ; counter block XOR round key 0
-        aesenc	xmm7, [rdi+16]
-        aesenc	xmm7, [rdi+32]
-        aesenc	xmm7, [rdi+48]
-        aesenc	xmm7, [rdi+64]
-        aesenc	xmm7, [rdi+80]
-        aesenc	xmm7, [rdi+96]
-        aesenc	xmm7, [rdi+112]
-        aesenc	xmm7, [rdi+128]
-        aesenc	xmm7, [rdi+144]
-        cmp	esi, 11
-        movdqa	xmm8, OWORD PTR [rdi+160]
-        jl	L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last
-        aesenc	xmm7, xmm8
-        aesenc	xmm7, [rdi+176]
-        cmp	esi, 13
-        movdqa	xmm8, OWORD PTR [rdi+192]
-        jl	L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last
-        aesenc	xmm7, xmm8
-        aesenc	xmm7, [rdi+208]
-        movdqa	xmm8, OWORD PTR [rdi+224]
-L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last:
-        aesenclast	xmm7, xmm8
-        movdqu	xmm15, xmm7    ; xmm15 = T
-L_AES_GCM_init_aesni_iv_done:
-        movdqa	OWORD PTR [r9], xmm15    ; store T (aligned store: [r9] must be 16-byte aligned)
-        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_epi64    ; shuffle the counter block with a mask defined outside this procedure
-        paddd	xmm4, OWORD PTR L_aes_gcm_one    ; add the constant L_aes_gcm_one (presumably a counter increment of one)
-        movdqa	OWORD PTR [rax], xmm5    ; store H (aligned store)
-        movdqa	OWORD PTR [r8], xmm4    ; store the advanced counter block (aligned store)
-        movdqu	xmm6, [rsp+16]    ; reload the xmm registers saved on entry
-        movdqu	xmm7, [rsp+32]
-        movdqu	xmm8, [rsp+48]
-        movdqu	xmm15, [rsp+64]
-        add	rsp, 80
-        pop	r14
-        pop	r13
-        pop	r12
-        pop	rsi
-        pop	rdi
-        ret
-AES_GCM_init_aesni ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_aad_update_aesni PROC    ; Folds 16-byte blocks read from rcx into the 128-bit running value at [r8], multiplying by the value at [r9] after each block; edx = byte count
-        mov	rax, rcx    ; rax = input pointer (1st argument); rcx is reused as the byte offset
-        sub	rsp, 32    ; room to save xmm6 and xmm7
-        movdqu	[rsp], xmm6    ; xmm6 and xmm7 are reloaded from here before returning
-        movdqu	[rsp+16], xmm7
-        movdqa	xmm5, OWORD PTR [r8]    ; xmm5 = running value (aligned load, 3rd argument)
-        movdqa	xmm6, OWORD PTR [r9]    ; xmm6 = multiplier (aligned load, 4th argument; presumably the hash key H)
-        xor	ecx, ecx    ; byte offset = 0
-L_AES_GCM_aad_update_aesni_16_loop:    ; the length test sits at the bottom, so one block is always processed
-        movdqu	xmm7, [rax+rcx]    ; next 16 input bytes
-        pshufb	xmm7, OWORD PTR L_aes_gcm_bswap_mask    ; byte shuffle; mask is defined outside this procedure
-        pxor	xmm5, xmm7    ; fold the block in, then multiply xmm5 by xmm6
-        pshufd	xmm1, xmm5, 78    ; 78 = 01001110b: swap the two 64-bit halves
-        pshufd	xmm2, xmm6, 78
-        movdqa	xmm3, xmm6
-        movdqa	xmm0, xmm6
-        pclmulqdq	xmm3, xmm5, 17    ; xmm3 = carry-less product of the two high qwords
-        pclmulqdq	xmm0, xmm5, 0    ; xmm0 = carry-less product of the two low qwords
-        pxor	xmm1, xmm5
-        pxor	xmm2, xmm6
-        pclmulqdq	xmm1, xmm2, 0    ; (high XOR low) x (high XOR low)
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3    ; xmm1 = middle term of the three-multiply (Karatsuba-style) product
-        movdqa	xmm2, xmm1
-        movdqa	xmm4, xmm0    ; xmm4 = low 128 bits of the product
-        movdqa	xmm5, xmm3    ; xmm5 = high 128 bits of the product
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm5, xmm1
-        movdqa	xmm0, xmm4    ; shift the 256-bit value xmm5:xmm4 left by one bit
-        movdqa	xmm1, xmm5
-        psrld	xmm0, 31    ; top bit of every dword = carry into the next dword
-        psrld	xmm1, 31
-        pslld	xmm4, 1
-        pslld	xmm5, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12    ; carry out of the low 128 bits moves into the high 128 bits
-        pslldq	xmm1, 4
-        por	xmm5, xmm2
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        movdqa	xmm0, xmm4    ; fold the low 128 bits into the high 128 bits (shift pattern 31/30/25 then 1/2/7; presumably the GCM polynomial reduction)
-        movdqa	xmm1, xmm4
-        movdqa	xmm2, xmm4
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm4, xmm0
-        movdqa	xmm2, xmm4
-        movdqa	xmm3, xmm4
-        movdqa	xmm0, xmm4
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm4
-        pxor	xmm5, xmm2    ; xmm5 = new running value
-        add	ecx, 16
-        cmp	ecx, edx    ; edx = byte count (2nd argument)
-        jl	L_AES_GCM_aad_update_aesni_16_loop    ; signed compare: repeat while offset < byte count
-        movdqa	OWORD PTR [r8], xmm5    ; write the running value back (aligned store)
-        movdqu	xmm6, [rsp]
-        movdqu	xmm7, [rsp+16]
-        add	rsp, 32
-        ret
-AES_GCM_aad_update_aesni ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_encrypt_block_aesni PROC    ; Encrypts the counter block with the round keys at rcx, XORs the result with 16 input bytes, stores 16 output bytes and writes back an advanced counter
-        mov	r10, r8    ; r10 = 3rd argument: output block pointer
-        mov	r11, r9    ; r11 = 4th argument: input block pointer
-        mov	rax, QWORD PTR [rsp+40]    ; rax = 5th argument (first stack slot at entry rsp+40): pointer to the counter block
-        movdqu	xmm0, [rax]
-        movdqa	xmm1, xmm0
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_epi64    ; shuffle the current counter with a mask defined outside this procedure
-        paddd	xmm1, OWORD PTR L_aes_gcm_one    ; add the constant L_aes_gcm_one to the unshuffled copy (presumably an increment of one)
-        pxor	xmm0, [rcx]    ; XOR with round key 0
-        movdqu	[rax], xmm1    ; store the advanced counter for the next call
-        aesenc	xmm0, [rcx+16]
-        aesenc	xmm0, [rcx+32]
-        aesenc	xmm0, [rcx+48]
-        aesenc	xmm0, [rcx+64]
-        aesenc	xmm0, [rcx+80]
-        aesenc	xmm0, [rcx+96]
-        aesenc	xmm0, [rcx+112]
-        aesenc	xmm0, [rcx+128]
-        aesenc	xmm0, [rcx+144]
-        cmp	edx, 11    ; edx = 2nd argument: round count
-        movdqa	xmm1, OWORD PTR [rcx+160]    ; loaded between cmp and jl; movdqa leaves the flags untouched
-        jl	L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last    ; round count below 11: [rcx+160] is the last round key
-        aesenc	xmm0, xmm1
-        aesenc	xmm0, [rcx+176]
-        cmp	edx, 13
-        movdqa	xmm1, OWORD PTR [rcx+192]
-        jl	L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last    ; round count below 13: [rcx+192] is the last round key
-        aesenc	xmm0, xmm1
-        aesenc	xmm0, [rcx+208]
-        movdqa	xmm1, OWORD PTR [rcx+224]
-L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last:
-        aesenclast	xmm0, xmm1    ; xmm0 = encrypted counter block
-        movdqu	xmm1, [r11]    ; 16 input bytes
-        pxor	xmm0, xmm1
-        movdqu	[r10], xmm0    ; 16 output bytes
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask    ; shuffles xmm0 after the store; the shuffled value is not written to memory by this procedure
-        ret
-AES_GCM_encrypt_block_aesni ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_ghash_block_aesni PROC    ; Folds one 16-byte block at rcx into the 128-bit running value at [rdx] and multiplies it by the value at [r8]
-        sub	rsp, 32    ; room to save xmm6 and xmm7
-        movdqu	[rsp], xmm6    ; xmm6 and xmm7 are reloaded from here before returning
-        movdqu	[rsp+16], xmm7
-        movdqa	xmm4, OWORD PTR [rdx]    ; xmm4 = running value (aligned load, 2nd argument)
-        movdqa	xmm5, OWORD PTR [r8]    ; xmm5 = multiplier (aligned load, 3rd argument; presumably the hash key H)
-        movdqu	xmm7, [rcx]    ; 16 input bytes (1st argument)
-        pshufb	xmm7, OWORD PTR L_aes_gcm_bswap_mask    ; byte shuffle; mask is defined outside this procedure
-        pxor	xmm4, xmm7    ; fold the block in, then multiply xmm4 by xmm5
-        pshufd	xmm1, xmm4, 78    ; 78 = 01001110b: swap the two 64-bit halves
-        pshufd	xmm2, xmm5, 78
-        movdqa	xmm3, xmm5
-        movdqa	xmm0, xmm5
-        pclmulqdq	xmm3, xmm4, 17    ; xmm3 = carry-less product of the two high qwords
-        pclmulqdq	xmm0, xmm4, 0    ; xmm0 = carry-less product of the two low qwords
-        pxor	xmm1, xmm4
-        pxor	xmm2, xmm5
-        pclmulqdq	xmm1, xmm2, 0    ; (high XOR low) x (high XOR low)
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3    ; xmm1 = middle term of the three-multiply (Karatsuba-style) product
-        movdqa	xmm2, xmm1
-        movdqa	xmm6, xmm0    ; xmm6 = low 128 bits of the product
-        movdqa	xmm4, xmm3    ; xmm4 = high 128 bits of the product
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm6, xmm2
-        pxor	xmm4, xmm1
-        movdqa	xmm0, xmm6    ; shift the 256-bit value xmm4:xmm6 left by one bit
-        movdqa	xmm1, xmm4
-        psrld	xmm0, 31    ; top bit of every dword = carry into the next dword
-        psrld	xmm1, 31
-        pslld	xmm6, 1
-        pslld	xmm4, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12    ; carry out of the low 128 bits moves into the high 128 bits
-        pslldq	xmm1, 4
-        por	xmm4, xmm2
-        por	xmm6, xmm0
-        por	xmm4, xmm1
-        movdqa	xmm0, xmm6    ; fold the low 128 bits into the high 128 bits (shift pattern 31/30/25 then 1/2/7; presumably the GCM polynomial reduction)
-        movdqa	xmm1, xmm6
-        movdqa	xmm2, xmm6
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm6, xmm0
-        movdqa	xmm2, xmm6
-        movdqa	xmm3, xmm6
-        movdqa	xmm0, xmm6
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm6
-        pxor	xmm4, xmm2    ; xmm4 = new running value
-        movdqa	OWORD PTR [rdx], xmm4    ; write the running value back (aligned store)
-        movdqu	xmm6, [rsp]
-        movdqu	xmm7, [rsp+16]
-        add	rsp, 32
-        ret
-AES_GCM_ghash_block_aesni ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_encrypt_update_aesni PROC
-        push	r13
-        push	r12
-        push	r14
-        push	r15
-        push	rdi
-        mov	rax, rcx
-        mov	r10, r8
-        mov	r8d, edx
-        mov	r11, r9
-        mov	r9d, DWORD PTR [rsp+80]
-        mov	r12, QWORD PTR [rsp+88]
-        mov	r14, QWORD PTR [rsp+96]
-        mov	r15, QWORD PTR [rsp+104]
-        sub	rsp, 320
-        movdqu	[rsp+160], xmm6
-        movdqu	[rsp+176], xmm7
-        movdqu	[rsp+192], xmm8
-        movdqu	[rsp+208], xmm9
-        movdqu	[rsp+224], xmm10
-        movdqu	[rsp+240], xmm11
-        movdqu	[rsp+256], xmm12
-        movdqu	[rsp+272], xmm13
-        movdqu	[rsp+288], xmm14
-        movdqu	[rsp+304], xmm15
-        movdqa	xmm6, OWORD PTR [r12]
-        movdqa	xmm5, OWORD PTR [r14]
-        movdqa	xmm9, xmm5
-        movdqa	xmm8, xmm5
-        psrlq	xmm9, 63
-        psllq	xmm8, 1
-        pslldq	xmm9, 8
-        por	xmm8, xmm9
-        pshufd	xmm5, xmm5, 255
-        psrad	xmm5, 31
-        pand	xmm5, OWORD PTR L_aes_gcm_mod2_128
-        pxor	xmm5, xmm8
-        xor	rdi, rdi
-        cmp	r9d, 128
-        mov	r13d, r9d
-        jl	L_AES_GCM_encrypt_update_aesni_done_128
-        and	r13d, 4294967168
-        movdqa	xmm2, xmm6
-        ; H ^ 1
-        movdqu	[rsp], xmm5
-        ; H ^ 2
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm5, 78
-        movdqa	xmm11, xmm5
-        movdqa	xmm8, xmm5
-        pclmulqdq	xmm11, xmm5, 17
-        pclmulqdq	xmm8, xmm5, 0
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm5
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm0, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm0, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm0, xmm14
-        movdqu	[rsp+16], xmm0
-        ; H ^ 3
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm0, 78
-        movdqa	xmm11, xmm0
-        movdqa	xmm8, xmm0
-        pclmulqdq	xmm11, xmm5, 17
-        pclmulqdq	xmm8, xmm5, 0
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm0
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm1, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm1, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm1, xmm14
-        movdqu	[rsp+32], xmm1
-        ; H ^ 4
-        pshufd	xmm9, xmm0, 78
-        pshufd	xmm10, xmm0, 78
-        movdqa	xmm11, xmm0
-        movdqa	xmm8, xmm0
-        pclmulqdq	xmm11, xmm0, 17
-        pclmulqdq	xmm8, xmm0, 0
-        pxor	xmm9, xmm0
-        pxor	xmm10, xmm0
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm3, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm3, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm3, xmm14
-        movdqu	[rsp+48], xmm3
-        ; H ^ 5
-        pshufd	xmm9, xmm0, 78
-        pshufd	xmm10, xmm1, 78
-        movdqa	xmm11, xmm1
-        movdqa	xmm8, xmm1
-        pclmulqdq	xmm11, xmm0, 17
-        pclmulqdq	xmm8, xmm0, 0
-        pxor	xmm9, xmm0
-        pxor	xmm10, xmm1
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+64], xmm7
-        ; H ^ 6
-        pshufd	xmm9, xmm1, 78
-        pshufd	xmm10, xmm1, 78
-        movdqa	xmm11, xmm1
-        movdqa	xmm8, xmm1
-        pclmulqdq	xmm11, xmm1, 17
-        pclmulqdq	xmm8, xmm1, 0
-        pxor	xmm9, xmm1
-        pxor	xmm10, xmm1
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+80], xmm7
-        ; H ^ 7
-        pshufd	xmm9, xmm1, 78
-        pshufd	xmm10, xmm3, 78
-        movdqa	xmm11, xmm3
-        movdqa	xmm8, xmm3
-        pclmulqdq	xmm11, xmm1, 17
-        pclmulqdq	xmm8, xmm1, 0
-        pxor	xmm9, xmm1
-        pxor	xmm10, xmm3
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+96], xmm7
-        ; H ^ 8
-        pshufd	xmm9, xmm3, 78
-        pshufd	xmm10, xmm3, 78
-        movdqa	xmm11, xmm3
-        movdqa	xmm8, xmm3
-        pclmulqdq	xmm11, xmm3, 17
-        pclmulqdq	xmm8, xmm3, 0
-        pxor	xmm9, xmm3
-        pxor	xmm10, xmm3
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+112], xmm7
-        ; First 128 bytes of input
-        movdqu	xmm8, [r15]
-        movdqa	xmm1, OWORD PTR L_aes_gcm_bswap_epi64
-        movdqa	xmm0, xmm8
-        pshufb	xmm8, xmm1
-        movdqa	xmm9, xmm0
-        paddd	xmm9, OWORD PTR L_aes_gcm_one
-        pshufb	xmm9, xmm1
-        movdqa	xmm10, xmm0
-        paddd	xmm10, OWORD PTR L_aes_gcm_two
-        pshufb	xmm10, xmm1
-        movdqa	xmm11, xmm0
-        paddd	xmm11, OWORD PTR L_aes_gcm_three
-        pshufb	xmm11, xmm1
-        movdqa	xmm12, xmm0
-        paddd	xmm12, OWORD PTR L_aes_gcm_four
-        pshufb	xmm12, xmm1
-        movdqa	xmm13, xmm0
-        paddd	xmm13, OWORD PTR L_aes_gcm_five
-        pshufb	xmm13, xmm1
-        movdqa	xmm14, xmm0
-        paddd	xmm14, OWORD PTR L_aes_gcm_six
-        pshufb	xmm14, xmm1
-        movdqa	xmm15, xmm0
-        paddd	xmm15, OWORD PTR L_aes_gcm_seven
-        pshufb	xmm15, xmm1
-        paddd	xmm0, OWORD PTR L_aes_gcm_eight
-        movdqa	xmm7, OWORD PTR [rax]
-        movdqu	[r15], xmm0
-        pxor	xmm8, xmm7
-        pxor	xmm9, xmm7
-        pxor	xmm10, xmm7
-        pxor	xmm11, xmm7
-        pxor	xmm12, xmm7
-        pxor	xmm13, xmm7
-        pxor	xmm14, xmm7
-        pxor	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+16]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+32]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+48]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+64]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+80]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+96]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+112]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+128]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+144]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        cmp	r8d, 11
-        movdqa	xmm7, OWORD PTR [rax+160]
-        jl	L_AES_GCM_encrypt_update_aesni_enc_done
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+176]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        cmp	r8d, 13
-        movdqa	xmm7, OWORD PTR [rax+192]
-        jl	L_AES_GCM_encrypt_update_aesni_enc_done
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+208]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+224]
-L_AES_GCM_encrypt_update_aesni_enc_done:
-        aesenclast	xmm8, xmm7
-        aesenclast	xmm9, xmm7
-        movdqu	xmm0, [r11]
-        movdqu	xmm1, [r11+16]
-        pxor	xmm8, xmm0
-        pxor	xmm9, xmm1
-        movdqu	[r10], xmm8
-        movdqu	[r10+16], xmm9
-        aesenclast	xmm10, xmm7
-        aesenclast	xmm11, xmm7
-        movdqu	xmm0, [r11+32]
-        movdqu	xmm1, [r11+48]
-        pxor	xmm10, xmm0
-        pxor	xmm11, xmm1
-        movdqu	[r10+32], xmm10
-        movdqu	[r10+48], xmm11
-        aesenclast	xmm12, xmm7
-        aesenclast	xmm13, xmm7
-        movdqu	xmm0, [r11+64]
-        movdqu	xmm1, [r11+80]
-        pxor	xmm12, xmm0
-        pxor	xmm13, xmm1
-        movdqu	[r10+64], xmm12
-        movdqu	[r10+80], xmm13
-        aesenclast	xmm14, xmm7
-        aesenclast	xmm15, xmm7
-        movdqu	xmm0, [r11+96]
-        movdqu	xmm1, [r11+112]
-        pxor	xmm14, xmm0
-        pxor	xmm15, xmm1
-        movdqu	[r10+96], xmm14
-        movdqu	[r10+112], xmm15
-        cmp	r13d, 128
-        mov	edi, 128
-        jle	L_AES_GCM_encrypt_update_aesni_end_128
-        ; More 128 bytes of input
-L_AES_GCM_encrypt_update_aesni_ghash_128:
-        lea	rcx, QWORD PTR [r11+rdi]
-        lea	rdx, QWORD PTR [r10+rdi]
-        movdqu	xmm8, [r15]
-        movdqa	xmm1, OWORD PTR L_aes_gcm_bswap_epi64
-        movdqa	xmm0, xmm8
-        pshufb	xmm8, xmm1
-        movdqa	xmm9, xmm0
-        paddd	xmm9, OWORD PTR L_aes_gcm_one
-        pshufb	xmm9, xmm1
-        movdqa	xmm10, xmm0
-        paddd	xmm10, OWORD PTR L_aes_gcm_two
-        pshufb	xmm10, xmm1
-        movdqa	xmm11, xmm0
-        paddd	xmm11, OWORD PTR L_aes_gcm_three
-        pshufb	xmm11, xmm1
-        movdqa	xmm12, xmm0
-        paddd	xmm12, OWORD PTR L_aes_gcm_four
-        pshufb	xmm12, xmm1
-        movdqa	xmm13, xmm0
-        paddd	xmm13, OWORD PTR L_aes_gcm_five
-        pshufb	xmm13, xmm1
-        movdqa	xmm14, xmm0
-        paddd	xmm14, OWORD PTR L_aes_gcm_six
-        pshufb	xmm14, xmm1
-        movdqa	xmm15, xmm0
-        paddd	xmm15, OWORD PTR L_aes_gcm_seven
-        pshufb	xmm15, xmm1
-        paddd	xmm0, OWORD PTR L_aes_gcm_eight
-        movdqa	xmm7, OWORD PTR [rax]
-        movdqu	[r15], xmm0
-        pxor	xmm8, xmm7
-        pxor	xmm9, xmm7
-        pxor	xmm10, xmm7
-        pxor	xmm11, xmm7
-        pxor	xmm12, xmm7
-        pxor	xmm13, xmm7
-        pxor	xmm14, xmm7
-        pxor	xmm15, xmm7
-        movdqu	xmm7, [rsp+112]
-        movdqu	xmm0, [rdx+-128]
-        aesenc	xmm8, [rax+16]
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm0, xmm2
-        pshufd	xmm1, xmm7, 78
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm1, xmm7
-        pxor	xmm5, xmm0
-        movdqa	xmm3, xmm0
-        pclmulqdq	xmm3, xmm7, 17
-        aesenc	xmm9, [rax+16]
-        aesenc	xmm10, [rax+16]
-        movdqa	xmm2, xmm0
-        pclmulqdq	xmm2, xmm7, 0
-        aesenc	xmm11, [rax+16]
-        aesenc	xmm12, [rax+16]
-        pclmulqdq	xmm1, xmm5, 0
-        aesenc	xmm13, [rax+16]
-        aesenc	xmm14, [rax+16]
-        aesenc	xmm15, [rax+16]
-        pxor	xmm1, xmm2
-        pxor	xmm1, xmm3
-        movdqu	xmm7, [rsp+96]
-        movdqu	xmm0, [rdx+-112]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [rax+32]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [rax+32]
-        aesenc	xmm10, [rax+32]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [rax+32]
-        aesenc	xmm12, [rax+32]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [rax+32]
-        aesenc	xmm14, [rax+32]
-        aesenc	xmm15, [rax+32]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+80]
-        movdqu	xmm0, [rdx+-96]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [rax+48]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [rax+48]
-        aesenc	xmm10, [rax+48]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [rax+48]
-        aesenc	xmm12, [rax+48]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [rax+48]
-        aesenc	xmm14, [rax+48]
-        aesenc	xmm15, [rax+48]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+64]
-        movdqu	xmm0, [rdx+-80]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [rax+64]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [rax+64]
-        aesenc	xmm10, [rax+64]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [rax+64]
-        aesenc	xmm12, [rax+64]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [rax+64]
-        aesenc	xmm14, [rax+64]
-        aesenc	xmm15, [rax+64]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+48]
-        movdqu	xmm0, [rdx+-64]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [rax+80]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [rax+80]
-        aesenc	xmm10, [rax+80]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [rax+80]
-        aesenc	xmm12, [rax+80]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [rax+80]
-        aesenc	xmm14, [rax+80]
-        aesenc	xmm15, [rax+80]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+32]
-        movdqu	xmm0, [rdx+-48]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [rax+96]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [rax+96]
-        aesenc	xmm10, [rax+96]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [rax+96]
-        aesenc	xmm12, [rax+96]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [rax+96]
-        aesenc	xmm14, [rax+96]
-        aesenc	xmm15, [rax+96]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+16]
-        movdqu	xmm0, [rdx+-32]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [rax+112]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [rax+112]
-        aesenc	xmm10, [rax+112]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [rax+112]
-        aesenc	xmm12, [rax+112]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [rax+112]
-        aesenc	xmm14, [rax+112]
-        aesenc	xmm15, [rax+112]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp]
-        movdqu	xmm0, [rdx+-16]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [rax+128]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [rax+128]
-        aesenc	xmm10, [rax+128]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [rax+128]
-        aesenc	xmm12, [rax+128]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [rax+128]
-        aesenc	xmm14, [rax+128]
-        aesenc	xmm15, [rax+128]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqa	xmm5, xmm1
-        psrldq	xmm1, 8
-        pslldq	xmm5, 8
-        aesenc	xmm8, [rax+144]
-        pxor	xmm2, xmm5
-        pxor	xmm3, xmm1
-        movdqa	xmm7, xmm2
-        movdqa	xmm4, xmm2
-        movdqa	xmm5, xmm2
-        aesenc	xmm9, [rax+144]
-        pslld	xmm7, 31
-        pslld	xmm4, 30
-        pslld	xmm5, 25
-        aesenc	xmm10, [rax+144]
-        pxor	xmm7, xmm4
-        pxor	xmm7, xmm5
-        aesenc	xmm11, [rax+144]
-        movdqa	xmm4, xmm7
-        pslldq	xmm7, 12
-        psrldq	xmm4, 4
-        aesenc	xmm12, [rax+144]
-        pxor	xmm2, xmm7
-        movdqa	xmm5, xmm2
-        movdqa	xmm1, xmm2
-        movdqa	xmm0, xmm2
-        aesenc	xmm13, [rax+144]
-        psrld	xmm5, 1
-        psrld	xmm1, 2
-        psrld	xmm0, 7
-        aesenc	xmm14, [rax+144]
-        pxor	xmm5, xmm1
-        pxor	xmm5, xmm0
-        aesenc	xmm15, [rax+144]
-        pxor	xmm5, xmm4
-        pxor	xmm2, xmm5
-        pxor	xmm2, xmm3
-        cmp	r8d, 11
-        movdqa	xmm7, OWORD PTR [rax+160]
-        jl	L_AES_GCM_encrypt_update_aesni_aesenc_128_ghash_avx_done
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+176]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        cmp	r8d, 13
-        movdqa	xmm7, OWORD PTR [rax+192]
-        jl	L_AES_GCM_encrypt_update_aesni_aesenc_128_ghash_avx_done
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+208]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+224]
-L_AES_GCM_encrypt_update_aesni_aesenc_128_ghash_avx_done:
-        aesenclast	xmm8, xmm7
-        aesenclast	xmm9, xmm7
-        movdqu	xmm0, [rcx]
-        movdqu	xmm1, [rcx+16]
-        pxor	xmm8, xmm0
-        pxor	xmm9, xmm1
-        movdqu	[rdx], xmm8
-        movdqu	[rdx+16], xmm9
-        aesenclast	xmm10, xmm7
-        aesenclast	xmm11, xmm7
-        movdqu	xmm0, [rcx+32]
-        movdqu	xmm1, [rcx+48]
-        pxor	xmm10, xmm0
-        pxor	xmm11, xmm1
-        movdqu	[rdx+32], xmm10
-        movdqu	[rdx+48], xmm11
-        aesenclast	xmm12, xmm7
-        aesenclast	xmm13, xmm7
-        movdqu	xmm0, [rcx+64]
-        movdqu	xmm1, [rcx+80]
-        pxor	xmm12, xmm0
-        pxor	xmm13, xmm1
-        movdqu	[rdx+64], xmm12
-        movdqu	[rdx+80], xmm13
-        aesenclast	xmm14, xmm7
-        aesenclast	xmm15, xmm7
-        movdqu	xmm0, [rcx+96]
-        movdqu	xmm1, [rcx+112]
-        pxor	xmm14, xmm0
-        pxor	xmm15, xmm1
-        movdqu	[rdx+96], xmm14
-        movdqu	[rdx+112], xmm15
-        add	edi, 128
-        cmp	edi, r13d
-        jl	L_AES_GCM_encrypt_update_aesni_ghash_128
-L_AES_GCM_encrypt_update_aesni_end_128:
-        movdqa	xmm4, OWORD PTR L_aes_gcm_bswap_mask
-        pshufb	xmm8, xmm4
-        pshufb	xmm9, xmm4
-        pshufb	xmm10, xmm4
-        pshufb	xmm11, xmm4
-        pxor	xmm8, xmm2
-        pshufb	xmm12, xmm4
-        pshufb	xmm13, xmm4
-        pshufb	xmm14, xmm4
-        pshufb	xmm15, xmm4
-        movdqu	xmm7, [rsp+112]
-        pshufd	xmm1, xmm8, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm8, 17
-        pclmulqdq	xmm0, xmm8, 0
-        pxor	xmm1, xmm8
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm4, xmm0
-        movdqa	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqu	xmm7, [rsp+96]
-        pshufd	xmm1, xmm9, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm9, 17
-        pclmulqdq	xmm0, xmm9, 0
-        pxor	xmm1, xmm9
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        pxor	xmm4, xmm0
-        pxor	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqu	xmm7, [rsp+80]
-        pshufd	xmm1, xmm10, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm10, 17
-        pclmulqdq	xmm0, xmm10, 0
-        pxor	xmm1, xmm10
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        pxor	xmm4, xmm0
-        pxor	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqu	xmm7, [rsp+64]
-        pshufd	xmm1, xmm11, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm11, 17
-        pclmulqdq	xmm0, xmm11, 0
-        pxor	xmm1, xmm11
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        pxor	xmm4, xmm0
-        pxor	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqu	xmm7, [rsp+48]
-        pshufd	xmm1, xmm12, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm12, 17
-        pclmulqdq	xmm0, xmm12, 0
-        pxor	xmm1, xmm12
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        pxor	xmm4, xmm0
-        pxor	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqu	xmm7, [rsp+32]
-        pshufd	xmm1, xmm13, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm13, 17
-        pclmulqdq	xmm0, xmm13, 0
-        pxor	xmm1, xmm13
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        pxor	xmm4, xmm0
-        pxor	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqu	xmm7, [rsp+16]
-        pshufd	xmm1, xmm14, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm14, 17
-        pclmulqdq	xmm0, xmm14, 0
-        pxor	xmm1, xmm14
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        pxor	xmm4, xmm0
-        pxor	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqu	xmm7, [rsp]
-        pshufd	xmm1, xmm15, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm15, 17
-        pclmulqdq	xmm0, xmm15, 0
-        pxor	xmm1, xmm15
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        pxor	xmm4, xmm0
-        pxor	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqa	xmm0, xmm4
-        movdqa	xmm1, xmm4
-        movdqa	xmm2, xmm4
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm4, xmm0
-        movdqa	xmm2, xmm4
-        movdqa	xmm3, xmm4
-        movdqa	xmm0, xmm4
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm4
-        pxor	xmm6, xmm2
-        movdqu	xmm5, [rsp]
-L_AES_GCM_encrypt_update_aesni_done_128:
-        mov	edx, r9d
-        cmp	edi, edx
-        jge	L_AES_GCM_encrypt_update_aesni_done_enc
-        mov	r13d, r9d
-        and	r13d, 4294967280
-        cmp	edi, r13d
-        jge	L_AES_GCM_encrypt_update_aesni_last_block_done
-        lea	rcx, QWORD PTR [r11+rdi]
-        lea	rdx, QWORD PTR [r10+rdi]
-        movdqu	xmm8, [r15]
-        movdqa	xmm9, xmm8
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_epi64
-        paddd	xmm9, OWORD PTR L_aes_gcm_one
-        pxor	xmm8, [rax]
-        movdqu	[r15], xmm9
-        aesenc	xmm8, [rax+16]
-        aesenc	xmm8, [rax+32]
-        aesenc	xmm8, [rax+48]
-        aesenc	xmm8, [rax+64]
-        aesenc	xmm8, [rax+80]
-        aesenc	xmm8, [rax+96]
-        aesenc	xmm8, [rax+112]
-        aesenc	xmm8, [rax+128]
-        aesenc	xmm8, [rax+144]
-        cmp	r8d, 11
-        movdqa	xmm9, OWORD PTR [rax+160]
-        jl	L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [rax+176]
-        cmp	r8d, 13
-        movdqa	xmm9, OWORD PTR [rax+192]
-        jl	L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [rax+208]
-        movdqa	xmm9, OWORD PTR [rax+224]
-L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last:
-        aesenclast	xmm8, xmm9
-        movdqu	xmm9, [rcx]
-        pxor	xmm8, xmm9
-        movdqu	[rdx], xmm8
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm6, xmm8
-        add	edi, 16
-        cmp	edi, r13d
-        jge	L_AES_GCM_encrypt_update_aesni_last_block_ghash
-L_AES_GCM_encrypt_update_aesni_last_block_start:
-        lea	rcx, QWORD PTR [r11+rdi]
-        lea	rdx, QWORD PTR [r10+rdi]
-        movdqu	xmm8, [r15]
-        movdqa	xmm9, xmm8
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_epi64
-        paddd	xmm9, OWORD PTR L_aes_gcm_one
-        pxor	xmm8, [rax]
-        movdqu	[r15], xmm9
-        movdqa	xmm10, xmm6
-        pclmulqdq	xmm10, xmm5, 16
-        aesenc	xmm8, [rax+16]
-        aesenc	xmm8, [rax+32]
-        movdqa	xmm11, xmm6
-        pclmulqdq	xmm11, xmm5, 1
-        aesenc	xmm8, [rax+48]
-        aesenc	xmm8, [rax+64]
-        movdqa	xmm12, xmm6
-        pclmulqdq	xmm12, xmm5, 0
-        aesenc	xmm8, [rax+80]
-        movdqa	xmm1, xmm6
-        pclmulqdq	xmm1, xmm5, 17
-        aesenc	xmm8, [rax+96]
-        pxor	xmm10, xmm11
-        movdqa	xmm2, xmm10
-        psrldq	xmm10, 8
-        pslldq	xmm2, 8
-        aesenc	xmm8, [rax+112]
-        movdqa	xmm3, xmm1
-        pxor	xmm2, xmm12
-        pxor	xmm3, xmm10
-        movdqa	xmm0, OWORD PTR L_aes_gcm_mod2_128
-        movdqa	xmm11, xmm2
-        pclmulqdq	xmm11, xmm0, 16
-        aesenc	xmm8, [rax+128]
-        pshufd	xmm10, xmm2, 78
-        pxor	xmm10, xmm11
-        movdqa	xmm11, xmm10
-        pclmulqdq	xmm11, xmm0, 16
-        aesenc	xmm8, [rax+144]
-        pshufd	xmm6, xmm10, 78
-        pxor	xmm6, xmm11
-        pxor	xmm6, xmm3
-        cmp	r8d, 11
-        movdqa	xmm9, OWORD PTR [rax+160]
-        jl	L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [rax+176]
-        cmp	r8d, 13
-        movdqa	xmm9, OWORD PTR [rax+192]
-        jl	L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [rax+208]
-        movdqa	xmm9, OWORD PTR [rax+224]
-L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last:
-        aesenclast	xmm8, xmm9
-        movdqu	xmm9, [rcx]
-        pxor	xmm8, xmm9
-        movdqu	[rdx], xmm8
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm6, xmm8
-        add	edi, 16
-        cmp	edi, r13d
-        jl	L_AES_GCM_encrypt_update_aesni_last_block_start
-L_AES_GCM_encrypt_update_aesni_last_block_ghash:
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm6, 78
-        movdqa	xmm11, xmm6
-        movdqa	xmm8, xmm6
-        pclmulqdq	xmm11, xmm5, 17
-        pclmulqdq	xmm8, xmm5, 0
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm6
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm6, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm6, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm6, xmm14
-L_AES_GCM_encrypt_update_aesni_last_block_done:
-L_AES_GCM_encrypt_update_aesni_done_enc:
-        movdqa	OWORD PTR [r12], xmm6
-        movdqu	xmm6, [rsp+160]
-        movdqu	xmm7, [rsp+176]
-        movdqu	xmm8, [rsp+192]
-        movdqu	xmm9, [rsp+208]
-        movdqu	xmm10, [rsp+224]
-        movdqu	xmm11, [rsp+240]
-        movdqu	xmm12, [rsp+256]
-        movdqu	xmm13, [rsp+272]
-        movdqu	xmm14, [rsp+288]
-        movdqu	xmm15, [rsp+304]
-        add	rsp, 320
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r12
-        pop	r13
-        ret
-AES_GCM_encrypt_update_aesni ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_encrypt_final_aesni PROC ; fold the length block into the GHASH value, XOR with the block at arg7, write arg3 bytes of the result to arg2
-        push	r13 ; r13/r12/r14 are used below, so they are saved first (Microsoft x64: callee-saved)
-        push	r12
-        push	r14
-        mov	rax, rcx ; rax = arg1: pointer to the running GHASH value (read with movdqa, so 16-byte aligned)
-        mov	r10d, r9d ; r10d = arg4: a byte count (presumably the message length - confirm against the C prototype)
-        mov	r9, rdx ; r9 = arg2: destination buffer for the tag
-        mov	r11d, DWORD PTR [rsp+64] ; arg5: 24 bytes of pushes + 8 return address + 32 shadow space = 64; second byte count (presumably AAD length)
-        mov	r12, QWORD PTR [rsp+72] ; r12 = arg6: pointer to the hash key H (aligned)
-        mov	r14, QWORD PTR [rsp+80] ; r14 = arg7: pointer to the block XORed into the final value (aligned)
-        sub	rsp, 144 ; [rsp..rsp+15] tag scratch, [rsp+16..rsp+143] save area for xmm6-xmm13
-        movdqu	[rsp+16], xmm6
-        movdqu	[rsp+32], xmm7
-        movdqu	[rsp+48], xmm8
-        movdqu	[rsp+64], xmm9
-        movdqu	[rsp+80], xmm10
-        movdqu	[rsp+96], xmm11
-        movdqu	[rsp+112], xmm12
-        movdqu	[rsp+128], xmm13
-        movdqa	xmm4, OWORD PTR [rax] ; xmm4 = running GHASH value
-        movdqa	xmm5, OWORD PTR [r12] ; xmm5 = H
-        movdqa	xmm6, OWORD PTR [r14] ; xmm6 = block from arg7
-        movdqa	xmm8, xmm5 ; next 10 instructions: xmm5 = (H << 1) over 128 bits, XOR L_aes_gcm_mod2_128 when the top bit of H was set
-        movdqa	xmm7, xmm5
-        psrlq	xmm8, 63 ; bit shifted out of each qword
-        psllq	xmm7, 1
-        pslldq	xmm8, 8 ; move the low qword's carry into the high qword
-        por	xmm7, xmm8
-        pshufd	xmm5, xmm5, 255 ; broadcast the top dword of H
-        psrad	xmm5, 31 ; all-ones when the top bit of H is set, else zero
-        pand	xmm5, OWORD PTR L_aes_gcm_mod2_128
-        pxor	xmm5, xmm7
-        mov	edx, r10d
-        mov	ecx, r11d
-        shl	rdx, 3 ; byte counts -> bit counts
-        shl	rcx, 3
-        pinsrq	xmm0, rdx, 0 ; xmm0 = length block: low qword from arg4, high qword from arg5
-        pinsrq	xmm0, rcx, 1
-        pxor	xmm4, xmm0 ; fold the lengths into the GHASH value
-        pshufd	xmm8, xmm5, 78 ; 78 swaps the two qword halves; start of a 3-multiply (Karatsuba-style) 128x128 carry-less product
-        pshufd	xmm9, xmm4, 78
-        movdqa	xmm10, xmm4
-        movdqa	xmm7, xmm4
-        pclmulqdq	xmm10, xmm5, 17 ; high qword x high qword
-        pclmulqdq	xmm7, xmm5, 0 ; low qword x low qword
-        pxor	xmm8, xmm5
-        pxor	xmm9, xmm4
-        pclmulqdq	xmm8, xmm9, 0 ; (hi^lo) x (hi^lo)
-        pxor	xmm8, xmm7
-        pxor	xmm8, xmm10 ; xmm8 = middle term
-        movdqa	xmm9, xmm8
-        movdqa	xmm4, xmm10
-        pslldq	xmm9, 8
-        psrldq	xmm8, 8
-        pxor	xmm7, xmm9 ; xmm7 = low 128 bits of the product
-        pxor	xmm4, xmm8 ; xmm4 = high 128 bits of the product
-        movdqa	xmm11, xmm7 ; shift-and-XOR reduction of xmm4:xmm7 back to 128 bits (shifts 31/30/25, then 1/2/7)
-        movdqa	xmm12, xmm7
-        movdqa	xmm13, xmm7
-        pslld	xmm11, 31
-        pslld	xmm12, 30
-        pslld	xmm13, 25
-        pxor	xmm11, xmm12
-        pxor	xmm11, xmm13
-        movdqa	xmm12, xmm11
-        psrldq	xmm12, 4
-        pslldq	xmm11, 12
-        pxor	xmm7, xmm11
-        movdqa	xmm13, xmm7
-        movdqa	xmm9, xmm7
-        movdqa	xmm8, xmm7
-        psrld	xmm13, 1
-        psrld	xmm9, 2
-        psrld	xmm8, 7
-        pxor	xmm13, xmm9
-        pxor	xmm13, xmm8
-        pxor	xmm13, xmm12
-        pxor	xmm13, xmm7
-        pxor	xmm4, xmm13 ; xmm4 = reduced GHASH result
-        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_mask ; shuffle with the bswap mask (defined elsewhere in the file; presumably a full 16-byte reversal)
-        movdqu	xmm0, xmm6
-        pxor	xmm0, xmm4 ; xmm0 = tag = arg7 block XOR GHASH result
-        cmp	r8d, 16 ; r8d = arg3: number of tag bytes to write
-        je	L_AES_GCM_encrypt_final_aesni_store_tag_16
-        xor	rcx, rcx ; rcx = byte index
-        movdqu	[rsp], xmm0 ; spill the tag so it can be copied byte by byte
-L_AES_GCM_encrypt_final_aesni_store_tag_loop:
-        movzx	r13d, BYTE PTR [rsp+rcx]
-        mov	BYTE PTR [r9+rcx], r13b
-        inc	ecx
-        cmp	ecx, r8d ; exits only on equality, so a length of 0 would not stop at 0
-        jne	L_AES_GCM_encrypt_final_aesni_store_tag_loop
-        jmp	L_AES_GCM_encrypt_final_aesni_store_tag_done
-L_AES_GCM_encrypt_final_aesni_store_tag_16:
-        movdqu	[r9], xmm0 ; full 16-byte tag in one unaligned store
-L_AES_GCM_encrypt_final_aesni_store_tag_done:
-        movdqu	xmm6, [rsp+16] ; restore xmm6-xmm13
-        movdqu	xmm7, [rsp+32]
-        movdqu	xmm8, [rsp+48]
-        movdqu	xmm9, [rsp+64]
-        movdqu	xmm10, [rsp+80]
-        movdqu	xmm11, [rsp+96]
-        movdqu	xmm12, [rsp+112]
-        movdqu	xmm13, [rsp+128]
-        add	rsp, 144
-        pop	r14
-        pop	r12
-        pop	r13
-        ret
-AES_GCM_encrypt_final_aesni ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_decrypt_update_aesni PROC ; AES-CTR decrypt whole 16-byte blocks from arg4 to arg3 while folding the input blocks into the GHASH value at arg6
-        push	r13 ; six pushes = 48 bytes; rsi is pushed and popped but not otherwise used in this procedure
-        push	r12
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        mov	rax, rcx ; rax = arg1: expanded AES key schedule (round key i at [rax+16*i], aligned)
-        mov	r10, r8 ; r10 = arg3: output buffer
-        mov	r8d, edx ; r8d = arg2: number of AES rounds (compared against 11 and 13 below)
-        mov	r11, r9 ; r11 = arg4: input buffer
-        mov	r9d, DWORD PTR [rsp+88] ; arg5: 48 bytes of pushes + 8 return address + 32 shadow space = 88; byte count
-        mov	r12, QWORD PTR [rsp+96] ; r12 = arg6: running GHASH value, read here and written back at the end (aligned)
-        mov	r14, QWORD PTR [rsp+104] ; r14 = arg7: hash key H (aligned)
-        mov	r15, QWORD PTR [rsp+112] ; r15 = arg8: counter block, updated in place
-        sub	rsp, 328 ; [rsp..rsp+127] = H^1..H^8, [rsp+168..rsp+327] = save area for xmm6-xmm15
-        movdqu	[rsp+168], xmm6
-        movdqu	[rsp+184], xmm7
-        movdqu	[rsp+200], xmm8
-        movdqu	[rsp+216], xmm9
-        movdqu	[rsp+232], xmm10
-        movdqu	[rsp+248], xmm11
-        movdqu	[rsp+264], xmm12
-        movdqu	[rsp+280], xmm13
-        movdqu	[rsp+296], xmm14
-        movdqu	[rsp+312], xmm15
-        movdqa	xmm6, OWORD PTR [r12] ; xmm6 = running GHASH value
-        movdqa	xmm5, OWORD PTR [r14] ; xmm5 = H
-        movdqa	xmm9, xmm5 ; next 10 instructions: xmm5 = (H << 1) over 128 bits, XOR L_aes_gcm_mod2_128 when the top bit of H was set
-        movdqa	xmm8, xmm5
-        psrlq	xmm9, 63
-        psllq	xmm8, 1
-        pslldq	xmm9, 8
-        por	xmm8, xmm9
-        pshufd	xmm5, xmm5, 255
-        psrad	xmm5, 31
-        pand	xmm5, OWORD PTR L_aes_gcm_mod2_128
-        pxor	xmm5, xmm8
-        xor	edi, edi ; edi = byte offset already processed
-        cmp	r9d, 128
-        mov	r13d, r9d
-        jl	L_AES_GCM_decrypt_update_aesni_done_128 ; fewer than 8 blocks: skip the 8-way loop (jl is a signed compare on the byte count)
-        and	r13d, 4294967168 ; r13d = byte count rounded down to a multiple of 128 (mask 0xFFFFFF80)
-        movdqa	xmm2, xmm6 ; xmm2 carries the GHASH value through the 8-way loop
-        ; H ^ 1 (power 1 of the shifted hash key) is kept at [rsp]
-        movdqu	[rsp], xmm5
-        ; H ^ 2 = H * H, result in xmm0 and stored at [rsp+16]
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm5, 78
-        movdqa	xmm11, xmm5
-        movdqa	xmm8, xmm5
-        pclmulqdq	xmm11, xmm5, 17 ; high x high
-        pclmulqdq	xmm8, xmm5, 0 ; low x low
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm5
-        pclmulqdq	xmm9, xmm10, 0 ; (hi^lo) x (hi^lo) - third multiply of the Karatsuba-style product
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm0, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10 ; xmm8 = low 128 bits of the product
-        pxor	xmm0, xmm9 ; xmm0 = high 128 bits of the product
-        movdqa	xmm12, xmm8 ; shift-and-XOR reduction (31/30/25 then 1/2/7); the same sequence follows every multiply below
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm0, xmm14
-        movdqu	[rsp+16], xmm0
-        ; H ^ 3 = H^2 (xmm0) * H (xmm5), result in xmm1 and stored at [rsp+32]
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm0, 78
-        movdqa	xmm11, xmm0
-        movdqa	xmm8, xmm0
-        pclmulqdq	xmm11, xmm5, 17
-        pclmulqdq	xmm8, xmm5, 0
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm0
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm1, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm1, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm1, xmm14
-        movdqu	[rsp+32], xmm1
-        ; H ^ 4 = H^2 (xmm0) squared, result in xmm3 and stored at [rsp+48]
-        pshufd	xmm9, xmm0, 78
-        pshufd	xmm10, xmm0, 78
-        movdqa	xmm11, xmm0
-        movdqa	xmm8, xmm0
-        pclmulqdq	xmm11, xmm0, 17
-        pclmulqdq	xmm8, xmm0, 0
-        pxor	xmm9, xmm0
-        pxor	xmm10, xmm0
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm3, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm3, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm3, xmm14
-        movdqu	[rsp+48], xmm3
-        ; H ^ 5 = H^2 (xmm0) * H^3 (xmm1), result in xmm7 and stored at [rsp+64]
-        pshufd	xmm9, xmm0, 78
-        pshufd	xmm10, xmm1, 78
-        movdqa	xmm11, xmm1
-        movdqa	xmm8, xmm1
-        pclmulqdq	xmm11, xmm0, 17
-        pclmulqdq	xmm8, xmm0, 0
-        pxor	xmm9, xmm0
-        pxor	xmm10, xmm1
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+64], xmm7
-        ; H ^ 6 = H^3 (xmm1) squared, result in xmm7 and stored at [rsp+80]
-        pshufd	xmm9, xmm1, 78
-        pshufd	xmm10, xmm1, 78
-        movdqa	xmm11, xmm1
-        movdqa	xmm8, xmm1
-        pclmulqdq	xmm11, xmm1, 17
-        pclmulqdq	xmm8, xmm1, 0
-        pxor	xmm9, xmm1
-        pxor	xmm10, xmm1
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+80], xmm7
-        ; H ^ 7 = H^3 (xmm1) * H^4 (xmm3), result in xmm7 and stored at [rsp+96]
-        pshufd	xmm9, xmm1, 78
-        pshufd	xmm10, xmm3, 78
-        movdqa	xmm11, xmm3
-        movdqa	xmm8, xmm3
-        pclmulqdq	xmm11, xmm1, 17
-        pclmulqdq	xmm8, xmm1, 0
-        pxor	xmm9, xmm1
-        pxor	xmm10, xmm3
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+96], xmm7
-        ; H ^ 8 = H^4 (xmm3) squared, result in xmm7 and stored at [rsp+112]
-        pshufd	xmm9, xmm3, 78
-        pshufd	xmm10, xmm3, 78
-        movdqa	xmm11, xmm3
-        movdqa	xmm8, xmm3
-        pclmulqdq	xmm11, xmm3, 17
-        pclmulqdq	xmm8, xmm3, 0
-        pxor	xmm9, xmm3
-        pxor	xmm10, xmm3
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+112], xmm7
-L_AES_GCM_decrypt_update_aesni_ghash_128: ; 8-block loop: 128 bytes per iteration, AES rounds interleaved with the GHASH multiplies
-        lea	rcx, QWORD PTR [r11+rdi] ; rcx = current input position
-        lea	rdx, QWORD PTR [r10+rdi] ; rdx = current output position
-        movdqu	xmm8, [r15] ; load the counter block
-        movdqa	xmm1, OWORD PTR L_aes_gcm_bswap_epi64 ; shuffle control applied to every counter before encryption (defined elsewhere in the file)
-        movdqa	xmm0, xmm8
-        pshufb	xmm8, xmm1
-        movdqa	xmm9, xmm0 ; xmm9..xmm15 = counter + 1 .. counter + 7, each shuffled
-        paddd	xmm9, OWORD PTR L_aes_gcm_one
-        pshufb	xmm9, xmm1
-        movdqa	xmm10, xmm0
-        paddd	xmm10, OWORD PTR L_aes_gcm_two
-        pshufb	xmm10, xmm1
-        movdqa	xmm11, xmm0
-        paddd	xmm11, OWORD PTR L_aes_gcm_three
-        pshufb	xmm11, xmm1
-        movdqa	xmm12, xmm0
-        paddd	xmm12, OWORD PTR L_aes_gcm_four
-        pshufb	xmm12, xmm1
-        movdqa	xmm13, xmm0
-        paddd	xmm13, OWORD PTR L_aes_gcm_five
-        pshufb	xmm13, xmm1
-        movdqa	xmm14, xmm0
-        paddd	xmm14, OWORD PTR L_aes_gcm_six
-        pshufb	xmm14, xmm1
-        movdqa	xmm15, xmm0
-        paddd	xmm15, OWORD PTR L_aes_gcm_seven
-        pshufb	xmm15, xmm1
-        paddd	xmm0, OWORD PTR L_aes_gcm_eight
-        movdqa	xmm7, OWORD PTR [rax] ; round key 0
-        movdqu	[r15], xmm0 ; write counter + 8 back for the next iteration
-        pxor	xmm8, xmm7 ; initial AddRoundKey on all eight counter blocks
-        pxor	xmm9, xmm7
-        pxor	xmm10, xmm7
-        pxor	xmm11, xmm7
-        pxor	xmm12, xmm7
-        pxor	xmm13, xmm7
-        pxor	xmm14, xmm7
-        pxor	xmm15, xmm7
-        movdqu	xmm7, [rsp+112] ; H^8 pairs with input block 0
-        movdqu	xmm0, [rcx]
-        aesenc	xmm8, [rax+16]
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm0, xmm2 ; fold the running GHASH value into the first block only
-        pshufd	xmm1, xmm7, 78
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm1, xmm7
-        pxor	xmm5, xmm0
-        movdqa	xmm3, xmm0
-        pclmulqdq	xmm3, xmm7, 17 ; xmm3 = accumulator of high products
-        aesenc	xmm9, [rax+16]
-        aesenc	xmm10, [rax+16]
-        movdqa	xmm2, xmm0
-        pclmulqdq	xmm2, xmm7, 0 ; xmm2 = accumulator of low products
-        aesenc	xmm11, [rax+16]
-        aesenc	xmm12, [rax+16]
-        pclmulqdq	xmm1, xmm5, 0
-        aesenc	xmm13, [rax+16]
-        aesenc	xmm14, [rax+16]
-        aesenc	xmm15, [rax+16]
-        pxor	xmm1, xmm2
-        pxor	xmm1, xmm3 ; xmm1 = accumulator of middle terms
-        movdqu	xmm7, [rsp+96] ; H^7 pairs with input block 1
-        movdqu	xmm0, [rcx+16]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [rax+32]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [rax+32]
-        aesenc	xmm10, [rax+32]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [rax+32]
-        aesenc	xmm12, [rax+32]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [rax+32]
-        aesenc	xmm14, [rax+32]
-        aesenc	xmm15, [rax+32]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+80] ; H^6 pairs with input block 2
-        movdqu	xmm0, [rcx+32]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [rax+48]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [rax+48]
-        aesenc	xmm10, [rax+48]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [rax+48]
-        aesenc	xmm12, [rax+48]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [rax+48]
-        aesenc	xmm14, [rax+48]
-        aesenc	xmm15, [rax+48]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+64] ; H^5 pairs with input block 3
-        movdqu	xmm0, [rcx+48]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [rax+64]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [rax+64]
-        aesenc	xmm10, [rax+64]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [rax+64]
-        aesenc	xmm12, [rax+64]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [rax+64]
-        aesenc	xmm14, [rax+64]
-        aesenc	xmm15, [rax+64]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+48] ; H^4 pairs with input block 4
-        movdqu	xmm0, [rcx+64]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [rax+80]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [rax+80]
-        aesenc	xmm10, [rax+80]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [rax+80]
-        aesenc	xmm12, [rax+80]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [rax+80]
-        aesenc	xmm14, [rax+80]
-        aesenc	xmm15, [rax+80]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+32] ; H^3 pairs with input block 5
-        movdqu	xmm0, [rcx+80]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [rax+96]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [rax+96]
-        aesenc	xmm10, [rax+96]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [rax+96]
-        aesenc	xmm12, [rax+96]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [rax+96]
-        aesenc	xmm14, [rax+96]
-        aesenc	xmm15, [rax+96]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+16] ; H^2 pairs with input block 6
-        movdqu	xmm0, [rcx+96]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [rax+112]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [rax+112]
-        aesenc	xmm10, [rax+112]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [rax+112]
-        aesenc	xmm12, [rax+112]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [rax+112]
-        aesenc	xmm14, [rax+112]
-        aesenc	xmm15, [rax+112]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp] ; H^1 pairs with input block 7
-        movdqu	xmm0, [rcx+112]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [rax+128]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [rax+128]
-        aesenc	xmm10, [rax+128]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [rax+128]
-        aesenc	xmm12, [rax+128]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [rax+128]
-        aesenc	xmm14, [rax+128]
-        aesenc	xmm15, [rax+128]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqa	xmm5, xmm1 ; split the accumulated middle term across the low (xmm2) and high (xmm3) halves
-        psrldq	xmm1, 8
-        pslldq	xmm5, 8
-        aesenc	xmm8, [rax+144]
-        pxor	xmm2, xmm5
-        pxor	xmm3, xmm1
-        movdqa	xmm7, xmm2 ; shift-and-XOR reduction of xmm3:xmm2, interleaved with AES round 9
-        movdqa	xmm4, xmm2
-        movdqa	xmm5, xmm2
-        aesenc	xmm9, [rax+144]
-        pslld	xmm7, 31
-        pslld	xmm4, 30
-        pslld	xmm5, 25
-        aesenc	xmm10, [rax+144]
-        pxor	xmm7, xmm4
-        pxor	xmm7, xmm5
-        aesenc	xmm11, [rax+144]
-        movdqa	xmm4, xmm7
-        pslldq	xmm7, 12
-        psrldq	xmm4, 4
-        aesenc	xmm12, [rax+144]
-        pxor	xmm2, xmm7
-        movdqa	xmm5, xmm2
-        movdqa	xmm1, xmm2
-        movdqa	xmm0, xmm2
-        aesenc	xmm13, [rax+144]
-        psrld	xmm5, 1
-        psrld	xmm1, 2
-        psrld	xmm0, 7
-        aesenc	xmm14, [rax+144]
-        pxor	xmm5, xmm1
-        pxor	xmm5, xmm0
-        aesenc	xmm15, [rax+144]
-        pxor	xmm5, xmm4
-        pxor	xmm2, xmm5
-        pxor	xmm2, xmm3 ; xmm2 = GHASH value after these eight blocks
-        cmp	r8d, 11
-        movdqa	xmm7, OWORD PTR [rax+160] ; movdqa leaves the flags from cmp intact for the jl below
-        jl	L_AES_GCM_decrypt_update_aesni_aesenc_128_ghash_avx_done ; fewer than 11 rounds: round key 10 is the last one
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+176]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        cmp	r8d, 13
-        movdqa	xmm7, OWORD PTR [rax+192]
-        jl	L_AES_GCM_decrypt_update_aesni_aesenc_128_ghash_avx_done ; fewer than 13 rounds: round key 12 is the last one
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+208]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+224] ; round key 14 is the last one otherwise
-L_AES_GCM_decrypt_update_aesni_aesenc_128_ghash_avx_done:
-        aesenclast	xmm8, xmm7 ; xmm7 = final round key; keystream XOR input -> output, two blocks at a time
-        aesenclast	xmm9, xmm7
-        movdqu	xmm0, [rcx]
-        movdqu	xmm1, [rcx+16]
-        pxor	xmm8, xmm0
-        pxor	xmm9, xmm1
-        movdqu	[rdx], xmm8
-        movdqu	[rdx+16], xmm9
-        aesenclast	xmm10, xmm7
-        aesenclast	xmm11, xmm7
-        movdqu	xmm0, [rcx+32]
-        movdqu	xmm1, [rcx+48]
-        pxor	xmm10, xmm0
-        pxor	xmm11, xmm1
-        movdqu	[rdx+32], xmm10
-        movdqu	[rdx+48], xmm11
-        aesenclast	xmm12, xmm7
-        aesenclast	xmm13, xmm7
-        movdqu	xmm0, [rcx+64]
-        movdqu	xmm1, [rcx+80]
-        pxor	xmm12, xmm0
-        pxor	xmm13, xmm1
-        movdqu	[rdx+64], xmm12
-        movdqu	[rdx+80], xmm13
-        aesenclast	xmm14, xmm7
-        aesenclast	xmm15, xmm7
-        movdqu	xmm0, [rcx+96]
-        movdqu	xmm1, [rcx+112]
-        pxor	xmm14, xmm0
-        pxor	xmm15, xmm1
-        movdqu	[rdx+96], xmm14
-        movdqu	[rdx+112], xmm15
-        add	edi, 128
-        cmp	edi, r13d
-        jl	L_AES_GCM_decrypt_update_aesni_ghash_128 ; signed compare of offset against the 128-byte-aligned limit
-        movdqa	xmm6, xmm2 ; hand the GHASH value back to xmm6 for the single-block path
-        movdqu	xmm5, [rsp] ; reload H^1, since xmm5 was used as scratch inside the loop
-L_AES_GCM_decrypt_update_aesni_done_128:
-        mov	edx, r9d
-        cmp	edi, edx
-        jge	L_AES_GCM_decrypt_update_aesni_done_dec ; nothing left (signed compare)
-        mov	r13d, r9d
-        and	r13d, 4294967280 ; r13d = byte count rounded down to a multiple of 16 (mask 0xFFFFFFF0); a trailing partial block is not handled here
-        cmp	edi, r13d
-        jge	L_AES_GCM_decrypt_update_aesni_last_block_done
-L_AES_GCM_decrypt_update_aesni_last_block_start: ; one-block loop: GHASH and decrypt 16 bytes per iteration
-        lea	rcx, QWORD PTR [r11+rdi] ; rcx = current input position
-        lea	rdx, QWORD PTR [r10+rdi] ; rdx = current output position
-        movdqu	xmm1, [rcx]
-        movdqa	xmm0, xmm5 ; xmm0 = H^1
-        pshufb	xmm1, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm1, xmm6 ; xmm1 = input block XOR running GHASH value
-        movdqu	xmm8, [r15]
-        movdqa	xmm9, xmm8
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_epi64
-        paddd	xmm9, OWORD PTR L_aes_gcm_one
-        pxor	xmm8, [rax] ; AddRoundKey with round key 0
-        movdqu	[r15], xmm9 ; write counter + 1 back
-        movdqa	xmm10, xmm1
-        pclmulqdq	xmm10, xmm0, 16 ; four-multiply product: selectors 16 and 1 are the cross terms, 0 is low x low, 17 is high x high
-        aesenc	xmm8, [rax+16]
-        aesenc	xmm8, [rax+32]
-        movdqa	xmm11, xmm1
-        pclmulqdq	xmm11, xmm0, 1
-        aesenc	xmm8, [rax+48]
-        aesenc	xmm8, [rax+64]
-        movdqa	xmm12, xmm1
-        pclmulqdq	xmm12, xmm0, 0
-        aesenc	xmm8, [rax+80]
-        movdqa	xmm1, xmm1 ; copies the register onto itself; has no effect
-        pclmulqdq	xmm1, xmm0, 17
-        aesenc	xmm8, [rax+96]
-        pxor	xmm10, xmm11
-        movdqa	xmm2, xmm10
-        psrldq	xmm10, 8
-        pslldq	xmm2, 8
-        aesenc	xmm8, [rax+112]
-        movdqa	xmm3, xmm1
-        pxor	xmm2, xmm12 ; xmm2 = low 128 bits of the product
-        pxor	xmm3, xmm10 ; xmm3 = high 128 bits of the product
-        movdqa	xmm0, OWORD PTR L_aes_gcm_mod2_128 ; reduction here uses two carry-less multiplies by this constant instead of shifts
-        movdqa	xmm11, xmm2
-        pclmulqdq	xmm11, xmm0, 16
-        aesenc	xmm8, [rax+128]
-        pshufd	xmm10, xmm2, 78
-        pxor	xmm10, xmm11
-        movdqa	xmm11, xmm10
-        pclmulqdq	xmm11, xmm0, 16
-        aesenc	xmm8, [rax+144]
-        pshufd	xmm6, xmm10, 78
-        pxor	xmm6, xmm11
-        pxor	xmm6, xmm3 ; xmm6 = updated GHASH value
-        cmp	r8d, 11
-        movdqa	xmm9, OWORD PTR [rax+160]
-        jl	L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last ; same round-count selection as in the 8-block loop
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [rax+176]
-        cmp	r8d, 13
-        movdqa	xmm9, OWORD PTR [rax+192]
-        jl	L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [rax+208]
-        movdqa	xmm9, OWORD PTR [rax+224]
-L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last:
-        aesenclast	xmm8, xmm9 ; xmm9 = final round key
-        movdqu	xmm9, [rcx]
-        pxor	xmm8, xmm9 ; keystream XOR input
-        movdqu	[rdx], xmm8
-        add	edi, 16
-        cmp	edi, r13d
-        jl	L_AES_GCM_decrypt_update_aesni_last_block_start
-L_AES_GCM_decrypt_update_aesni_last_block_done:
-L_AES_GCM_decrypt_update_aesni_done_dec:
-        movdqa	OWORD PTR [r12], xmm6 ; store the updated GHASH value back through arg6
-        movdqu	xmm6, [rsp+168] ; restore xmm6-xmm15
-        movdqu	xmm7, [rsp+184]
-        movdqu	xmm8, [rsp+200]
-        movdqu	xmm9, [rsp+216]
-        movdqu	xmm10, [rsp+232]
-        movdqu	xmm11, [rsp+248]
-        movdqu	xmm12, [rsp+264]
-        movdqu	xmm13, [rsp+280]
-        movdqu	xmm14, [rsp+296]
-        movdqu	xmm15, [rsp+312]
-        add	rsp, 328
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r12
-        pop	r13
-        ret
-AES_GCM_decrypt_update_aesni ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_decrypt_final_aesni PROC ; compute the final tag like the encrypt variant, compare it with arg3 bytes at arg2, store 1 (match) or 0 at arg8
-        push	r13 ; five pushes = 40 bytes; all five registers are used below
-        push	r12
-        push	r14
-        push	rbp
-        push	r15
-        mov	rax, rcx ; rax = arg1: pointer to the running GHASH value (aligned)
-        mov	r10d, r9d ; r10d = arg4: a byte count (presumably the message length - confirm against the C prototype)
-        mov	r9, rdx ; r9 = arg2: tag to compare against
-        mov	r11d, DWORD PTR [rsp+80] ; arg5: 40 bytes of pushes + 8 return address + 32 shadow space = 80; second byte count (presumably AAD length)
-        mov	r12, QWORD PTR [rsp+88] ; r12 = arg6: pointer to the hash key H (aligned)
-        mov	r14, QWORD PTR [rsp+96] ; r14 = arg7: pointer to the block XORed into the final value (aligned)
-        mov	rbp, QWORD PTR [rsp+104] ; rbp = arg8: pointer to the 32-bit result
-        sub	rsp, 160 ; [rsp+16..rsp+159] save area for xmm6-xmm13 and xmm15
-        movdqu	[rsp+16], xmm6
-        movdqu	[rsp+32], xmm7
-        movdqu	[rsp+48], xmm8
-        movdqu	[rsp+64], xmm9
-        movdqu	[rsp+80], xmm10
-        movdqu	[rsp+96], xmm11
-        movdqu	[rsp+112], xmm12
-        movdqu	[rsp+128], xmm13
-        movdqu	[rsp+144], xmm15
-        movdqa	xmm6, OWORD PTR [rax] ; xmm6 = running GHASH value
-        movdqa	xmm5, OWORD PTR [r12] ; xmm5 = H
-        movdqa	xmm15, OWORD PTR [r14] ; xmm15 = block from arg7
-        movdqa	xmm8, xmm5 ; next 10 instructions: xmm5 = (H << 1) over 128 bits, XOR L_aes_gcm_mod2_128 when the top bit of H was set
-        movdqa	xmm7, xmm5
-        psrlq	xmm8, 63
-        psllq	xmm7, 1
-        pslldq	xmm8, 8
-        por	xmm7, xmm8
-        pshufd	xmm5, xmm5, 255
-        psrad	xmm5, 31
-        pand	xmm5, OWORD PTR L_aes_gcm_mod2_128
-        pxor	xmm5, xmm7
-        mov	edx, r10d
-        mov	ecx, r11d
-        shl	rdx, 3 ; byte counts -> bit counts
-        shl	rcx, 3
-        pinsrq	xmm0, rdx, 0 ; xmm0 = length block: low qword from arg4, high qword from arg5
-        pinsrq	xmm0, rcx, 1
-        pxor	xmm6, xmm0 ; fold the lengths into the GHASH value
-        pshufd	xmm8, xmm5, 78 ; 3-multiply (Karatsuba-style) carry-less product of xmm6 and xmm5
-        pshufd	xmm9, xmm6, 78
-        movdqa	xmm10, xmm6
-        movdqa	xmm7, xmm6
-        pclmulqdq	xmm10, xmm5, 17 ; high x high
-        pclmulqdq	xmm7, xmm5, 0 ; low x low
-        pxor	xmm8, xmm5
-        pxor	xmm9, xmm6
-        pclmulqdq	xmm8, xmm9, 0 ; (hi^lo) x (hi^lo)
-        pxor	xmm8, xmm7
-        pxor	xmm8, xmm10
-        movdqa	xmm9, xmm8
-        movdqa	xmm6, xmm10
-        pslldq	xmm9, 8
-        psrldq	xmm8, 8
-        pxor	xmm7, xmm9 ; xmm7 = low 128 bits of the product
-        pxor	xmm6, xmm8 ; xmm6 = high 128 bits of the product
-        movdqa	xmm11, xmm7 ; shift-and-XOR reduction of xmm6:xmm7 back to 128 bits
-        movdqa	xmm12, xmm7
-        movdqa	xmm13, xmm7
-        pslld	xmm11, 31
-        pslld	xmm12, 30
-        pslld	xmm13, 25
-        pxor	xmm11, xmm12
-        pxor	xmm11, xmm13
-        movdqa	xmm12, xmm11
-        psrldq	xmm12, 4
-        pslldq	xmm11, 12
-        pxor	xmm7, xmm11
-        movdqa	xmm13, xmm7
-        movdqa	xmm9, xmm7
-        movdqa	xmm8, xmm7
-        psrld	xmm13, 1
-        psrld	xmm9, 2
-        psrld	xmm8, 7
-        pxor	xmm13, xmm9
-        pxor	xmm13, xmm8
-        pxor	xmm13, xmm12
-        pxor	xmm13, xmm7
-        pxor	xmm6, xmm13 ; xmm6 = reduced GHASH result
-        pshufb	xmm6, OWORD PTR L_aes_gcm_bswap_mask
-        movdqu	xmm0, xmm15
-        pxor	xmm0, xmm6 ; xmm0 = computed tag
-        cmp	r8d, 16 ; r8d = arg3: number of tag bytes to compare
-        je	L_AES_GCM_decrypt_final_aesni_cmp_tag_16
-        sub	rsp, 16 ; temporary 16-byte scratch for the computed tag; released before leaving this path
-        xor	rcx, rcx ; rcx = byte index
-        xor	r15, r15 ; r15b accumulates the OR of all byte differences
-        movdqu	[rsp], xmm0
-L_AES_GCM_decrypt_final_aesni_cmp_tag_loop: ; always walks all r8d bytes; there is no early exit on a mismatch
-        movzx	r13d, BYTE PTR [rsp+rcx]
-        xor	r13b, BYTE PTR [r9+rcx]
-        or	r15b, r13b
-        inc	ecx
-        cmp	ecx, r8d ; exits only on equality, so a length of 0 would not stop at 0
-        jne	L_AES_GCM_decrypt_final_aesni_cmp_tag_loop
-        cmp	r15, 0
-        sete	r15b ; r15 = 1 when no byte differed, else 0
-        add	rsp, 16
-        xor	rcx, rcx
-        jmp	L_AES_GCM_decrypt_final_aesni_cmp_tag_done
-L_AES_GCM_decrypt_final_aesni_cmp_tag_16:
-        movdqu	xmm1, [r9]
-        pcmpeqb	xmm0, xmm1 ; each byte becomes 0xFF where the tags agree
-        pmovmskb	rdx, xmm0 ; one bit per byte -> 16-bit mask
-        ; edx == 0xFFFF (all 16 bytes equal) => result 1, otherwise result 0
-        xor	r15d, r15d
-        cmp	edx, 65535
-        sete	r15b
-L_AES_GCM_decrypt_final_aesni_cmp_tag_done:
-        mov	DWORD PTR [rbp], r15d ; write the comparison result through arg8
-        movdqu	xmm6, [rsp+16] ; restore xmm6-xmm13 and xmm15
-        movdqu	xmm7, [rsp+32]
-        movdqu	xmm8, [rsp+48]
-        movdqu	xmm9, [rsp+64]
-        movdqu	xmm10, [rsp+80]
-        movdqu	xmm11, [rsp+96]
-        movdqu	xmm12, [rsp+112]
-        movdqu	xmm13, [rsp+128]
-        movdqu	xmm15, [rsp+144]
-        add	rsp, 160
-        pop	r15
-        pop	rbp
-        pop	r14
-        pop	r12
-        pop	r13
-        ret
-AES_GCM_decrypt_final_aesni ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX1
-_DATA SEGMENT ; 16-byte aligned 128-bit constants for the AVX1 GCM routines; each is followed by a QWORD holding its address
-ALIGN 16
-L_avx1_aes_gcm_one QWORD 0, 1 ; low qword 0, high qword 1
-ptr_L_avx1_aes_gcm_one QWORD L_avx1_aes_gcm_one ; address of the constant above
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx1_aes_gcm_two QWORD 0, 2 ; low qword 0, high qword 2
-ptr_L_avx1_aes_gcm_two QWORD L_avx1_aes_gcm_two
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx1_aes_gcm_three QWORD 0, 3 ; low qword 0, high qword 3
-ptr_L_avx1_aes_gcm_three QWORD L_avx1_aes_gcm_three
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx1_aes_gcm_four QWORD 0, 4 ; low qword 0, high qword 4
-ptr_L_avx1_aes_gcm_four QWORD L_avx1_aes_gcm_four
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx1_aes_gcm_five QWORD 0, 5 ; low qword 0, high qword 5
-ptr_L_avx1_aes_gcm_five QWORD L_avx1_aes_gcm_five
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx1_aes_gcm_six QWORD 0, 6 ; low qword 0, high qword 6
-ptr_L_avx1_aes_gcm_six QWORD L_avx1_aes_gcm_six
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx1_aes_gcm_seven QWORD 0, 7 ; low qword 0, high qword 7
-ptr_L_avx1_aes_gcm_seven QWORD L_avx1_aes_gcm_seven
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx1_aes_gcm_eight QWORD 0, 8 ; low qword 0, high qword 8
-ptr_L_avx1_aes_gcm_eight QWORD L_avx1_aes_gcm_eight
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx1_aes_gcm_bswap_epi64 QWORD 283686952306183, 579005069656919567 ; 0x0001020304050607, 0x08090A0B0C0D0E0F: as a byte-shuffle control this reverses the bytes inside each qword
-ptr_L_avx1_aes_gcm_bswap_epi64 QWORD L_avx1_aes_gcm_bswap_epi64
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx1_aes_gcm_bswap_mask QWORD 579005069656919567, 283686952306183 ; 0x08090A0B0C0D0E0F, 0x0001020304050607: as a byte-shuffle control this reverses all 16 bytes
-ptr_L_avx1_aes_gcm_bswap_mask QWORD L_avx1_aes_gcm_bswap_mask
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx1_aes_gcm_mod2_128 QWORD 1, 13979173243358019584 ; low qword 1, high qword 0xC200000000000000 (presumably the GHASH reduction constant, by analogy with L_aes_gcm_mod2_128)
-ptr_L_avx1_aes_gcm_mod2_128 QWORD L_avx1_aes_gcm_mod2_128
-_DATA ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_encrypt_avx1 PROC
-        push	r13
-        push	rdi
-        push	rsi
-        push	r12
-        push	rbx
-        push	r14
-        push	r15
-        mov	rdi, rcx
-        mov	rsi, rdx
-        mov	r12, r8
-        mov	rax, r9
-        mov	r8, QWORD PTR [rsp+96]
-        mov	r9d, DWORD PTR [rsp+104]
-        mov	r11d, DWORD PTR [rsp+112]
-        mov	ebx, DWORD PTR [rsp+120]
-        mov	r14d, DWORD PTR [rsp+128]
-        mov	r15, QWORD PTR [rsp+136]
-        mov	r10d, DWORD PTR [rsp+144]
-        sub	rsp, 320
-        vmovdqu	OWORD PTR [rsp+160], xmm6
-        vmovdqu	OWORD PTR [rsp+176], xmm7
-        vmovdqu	OWORD PTR [rsp+192], xmm8
-        vmovdqu	OWORD PTR [rsp+208], xmm9
-        vmovdqu	OWORD PTR [rsp+224], xmm10
-        vmovdqu	OWORD PTR [rsp+240], xmm11
-        vmovdqu	OWORD PTR [rsp+256], xmm12
-        vmovdqu	OWORD PTR [rsp+272], xmm13
-        vmovdqu	OWORD PTR [rsp+288], xmm14
-        vmovdqu	OWORD PTR [rsp+304], xmm15
-        vpxor	xmm4, xmm4, xmm4
-        vpxor	xmm6, xmm6, xmm6
-        mov	edx, ebx
-        cmp	edx, 12
-        jne	L_AES_GCM_encrypt_avx1_iv_not_12
-        ; # Calculate values when IV is 12 bytes
-        ; Set counter based on IV
-        mov	ecx, 16777216
-        vmovq	xmm4, QWORD PTR [rax]
-        vpinsrd	xmm4, xmm4, DWORD PTR [rax+8], 2
-        vpinsrd	xmm4, xmm4, ecx, 3
-        ; H = Encrypt X(=0) and T = Encrypt counter
-        vmovdqa	xmm5, OWORD PTR [r15]
-        vpxor	xmm1, xmm4, xmm5
-        vmovdqa	xmm7, OWORD PTR [r15+16]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+32]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+48]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+64]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+80]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+96]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+112]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+128]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+144]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        cmp	r10d, 11
-        vmovdqa	xmm7, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_12_last
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+176]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        cmp	r10d, 13
-        vmovdqa	xmm7, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_12_last
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+208]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_avx1_calc_iv_12_last:
-        vaesenclast	xmm5, xmm5, xmm7
-        vaesenclast	xmm1, xmm1, xmm7
-        vpshufb	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vmovdqu	OWORD PTR [rsp+144], xmm1
-        jmp	L_AES_GCM_encrypt_avx1_iv_done
-L_AES_GCM_encrypt_avx1_iv_not_12:
-        ; Calculate values when IV is not 12 bytes
-        ; H = Encrypt X(=0)
-        vmovdqa	xmm5, OWORD PTR [r15]
-        vaesenc	xmm5, xmm5, [r15+16]
-        vaesenc	xmm5, xmm5, [r15+32]
-        vaesenc	xmm5, xmm5, [r15+48]
-        vaesenc	xmm5, xmm5, [r15+64]
-        vaesenc	xmm5, xmm5, [r15+80]
-        vaesenc	xmm5, xmm5, [r15+96]
-        vaesenc	xmm5, xmm5, [r15+112]
-        vaesenc	xmm5, xmm5, [r15+128]
-        vaesenc	xmm5, xmm5, [r15+144]
-        cmp	r10d, 11
-        vmovdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last
-        vaesenc	xmm5, xmm5, xmm9
-        vaesenc	xmm5, xmm5, [r15+176]
-        cmp	r10d, 13
-        vmovdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last
-        vaesenc	xmm5, xmm5, xmm9
-        vaesenc	xmm5, xmm5, [r15+208]
-        vmovdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last:
-        vaesenclast	xmm5, xmm5, xmm9
-        vpshufb	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        ; Calc counter
-        ; Initialization vector
-        cmp	edx, 0
-        mov	rcx, 0
-        je	L_AES_GCM_encrypt_avx1_calc_iv_done
-        cmp	edx, 16
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_lt16
-        and	edx, 4294967280
-L_AES_GCM_encrypt_avx1_calc_iv_16_loop:
-        vmovdqu	xmm8, OWORD PTR [rax+rcx]
-        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm4, xmm4, xmm8
-        ; ghash_gfmul_avx
-        vpshufd	xmm1, xmm4, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpxor	xmm1, xmm1, xmm4
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm7, xmm0
-        vmovdqa	xmm4, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm7, xmm7, xmm2
-        vpxor	xmm4, xmm4, xmm1
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm4, xmm4, xmm1
-        vpslld	xmm0, xmm7, 31
-        vpslld	xmm1, xmm7, 30
-        vpslld	xmm2, xmm7, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm7, xmm7, xmm0
-        vpsrld	xmm2, xmm7, 1
-        vpsrld	xmm3, xmm7, 2
-        vpsrld	xmm0, xmm7, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm4, xmm4, xmm2
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_16_loop
-        mov	edx, ebx
-        cmp	ecx, edx
-        je	L_AES_GCM_encrypt_avx1_calc_iv_done
-L_AES_GCM_encrypt_avx1_calc_iv_lt16:
-        sub	rsp, 16
-        vpxor	xmm8, xmm8, xmm8
-        xor	ebx, ebx
-        vmovdqu	OWORD PTR [rsp], xmm8
-L_AES_GCM_encrypt_avx1_calc_iv_loop:
-        movzx	r13d, BYTE PTR [rax+rcx]
-        mov	BYTE PTR [rsp+rbx], r13b
-        inc	ecx
-        inc	ebx
-        cmp	ecx, edx
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_loop
-        vmovdqu	xmm8, OWORD PTR [rsp]
-        add	rsp, 16
-        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm4, xmm4, xmm8
-        ; ghash_gfmul_avx
-        vpshufd	xmm1, xmm4, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpxor	xmm1, xmm1, xmm4
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm7, xmm0
-        vmovdqa	xmm4, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm7, xmm7, xmm2
-        vpxor	xmm4, xmm4, xmm1
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm4, xmm4, xmm1
-        vpslld	xmm0, xmm7, 31
-        vpslld	xmm1, xmm7, 30
-        vpslld	xmm2, xmm7, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm7, xmm7, xmm0
-        vpsrld	xmm2, xmm7, 1
-        vpsrld	xmm3, xmm7, 2
-        vpsrld	xmm0, xmm7, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm4, xmm4, xmm2
-L_AES_GCM_encrypt_avx1_calc_iv_done:
-        ; T = Encrypt counter
-        vpxor	xmm0, xmm0, xmm0
-        shl	edx, 3
-        vmovq	xmm0, rdx
-        vpxor	xmm4, xmm4, xmm0
-        ; ghash_gfmul_avx
-        vpshufd	xmm1, xmm4, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpxor	xmm1, xmm1, xmm4
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm7, xmm0
-        vmovdqa	xmm4, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm7, xmm7, xmm2
-        vpxor	xmm4, xmm4, xmm1
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm4, xmm4, xmm1
-        vpslld	xmm0, xmm7, 31
-        vpslld	xmm1, xmm7, 30
-        vpslld	xmm2, xmm7, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm7, xmm7, xmm0
-        vpsrld	xmm2, xmm7, 1
-        vpsrld	xmm3, xmm7, 2
-        vpsrld	xmm0, xmm7, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm4, xmm4, xmm2
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        ;   Encrypt counter
-        vmovdqa	xmm8, OWORD PTR [r15]
-        vpxor	xmm8, xmm8, xmm4
-        vaesenc	xmm8, xmm8, [r15+16]
-        vaesenc	xmm8, xmm8, [r15+32]
-        vaesenc	xmm8, xmm8, [r15+48]
-        vaesenc	xmm8, xmm8, [r15+64]
-        vaesenc	xmm8, xmm8, [r15+80]
-        vaesenc	xmm8, xmm8, [r15+96]
-        vaesenc	xmm8, xmm8, [r15+112]
-        vaesenc	xmm8, xmm8, [r15+128]
-        vaesenc	xmm8, xmm8, [r15+144]
-        cmp	r10d, 11
-        vmovdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [r15+176]
-        cmp	r10d, 13
-        vmovdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [r15+208]
-        vmovdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last:
-        vaesenclast	xmm8, xmm8, xmm9
-        vmovdqu	OWORD PTR [rsp+144], xmm8
-L_AES_GCM_encrypt_avx1_iv_done:
-        ; Additional authentication data
-        mov	edx, r11d
-        cmp	edx, 0
-        je	L_AES_GCM_encrypt_avx1_calc_aad_done
-        xor	ecx, ecx
-        cmp	edx, 16
-        jl	L_AES_GCM_encrypt_avx1_calc_aad_lt16
-        and	edx, 4294967280
-L_AES_GCM_encrypt_avx1_calc_aad_16_loop:
-        vmovdqu	xmm8, OWORD PTR [r12+rcx]
-        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm8
-        ; ghash_gfmul_avx
-        vpshufd	xmm1, xmm6, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm6, 17
-        vpclmulqdq	xmm0, xmm5, xmm6, 0
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm7, xmm0
-        vmovdqa	xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm7, xmm7, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm6, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm6, xmm6, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm6, xmm6, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm6, xmm6, xmm1
-        vpslld	xmm0, xmm7, 31
-        vpslld	xmm1, xmm7, 30
-        vpslld	xmm2, xmm7, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm7, xmm7, xmm0
-        vpsrld	xmm2, xmm7, 1
-        vpsrld	xmm3, xmm7, 2
-        vpsrld	xmm0, xmm7, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm6, xmm6, xmm2
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_encrypt_avx1_calc_aad_16_loop
-        mov	edx, r11d
-        cmp	ecx, edx
-        je	L_AES_GCM_encrypt_avx1_calc_aad_done
-L_AES_GCM_encrypt_avx1_calc_aad_lt16:
-        sub	rsp, 16
-        vpxor	xmm8, xmm8, xmm8
-        xor	ebx, ebx
-        vmovdqu	OWORD PTR [rsp], xmm8
-L_AES_GCM_encrypt_avx1_calc_aad_loop:
-        movzx	r13d, BYTE PTR [r12+rcx]
-        mov	BYTE PTR [rsp+rbx], r13b
-        inc	ecx
-        inc	ebx
-        cmp	ecx, edx
-        jl	L_AES_GCM_encrypt_avx1_calc_aad_loop
-        vmovdqu	xmm8, OWORD PTR [rsp]
-        add	rsp, 16
-        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm8
-        ; ghash_gfmul_avx
-        vpshufd	xmm1, xmm6, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm6, 17
-        vpclmulqdq	xmm0, xmm5, xmm6, 0
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm7, xmm0
-        vmovdqa	xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm7, xmm7, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm6, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm6, xmm6, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm6, xmm6, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm6, xmm6, xmm1
-        vpslld	xmm0, xmm7, 31
-        vpslld	xmm1, xmm7, 30
-        vpslld	xmm2, xmm7, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm7, xmm7, xmm0
-        vpsrld	xmm2, xmm7, 1
-        vpsrld	xmm3, xmm7, 2
-        vpsrld	xmm0, xmm7, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm6, xmm6, xmm2
-L_AES_GCM_encrypt_avx1_calc_aad_done:
-        ; Calculate counter and H
-        vpsrlq	xmm9, xmm5, 63
-        vpsllq	xmm8, xmm5, 1
-        vpslldq	xmm9, xmm9, 8
-        vpor	xmm8, xmm8, xmm9
-        vpshufd	xmm5, xmm5, 255
-        vpsrad	xmm5, xmm5, 31
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpand	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_mod2_128
-        vpaddd	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_one
-        vpxor	xmm5, xmm5, xmm8
-        vmovdqu	OWORD PTR [rsp+128], xmm4
-        xor	ebx, ebx
-        cmp	r9d, 128
-        mov	r13d, r9d
-        jl	L_AES_GCM_encrypt_avx1_done_128
-        and	r13d, 4294967168
-        vmovdqa	xmm2, xmm6
-        ; H ^ 1
-        vmovdqu	OWORD PTR [rsp], xmm5
-        ; H ^ 2
-        vpclmulqdq	xmm8, xmm5, xmm5, 0
-        vpclmulqdq	xmm0, xmm5, xmm5, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm0, xmm0, xmm14
-        vmovdqu	OWORD PTR [rsp+16], xmm0
-        ; H ^ 3
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm5, 78
-        vpshufd	xmm10, xmm0, 78
-        vpclmulqdq	xmm11, xmm0, xmm5, 17
-        vpclmulqdq	xmm8, xmm0, xmm5, 0
-        vpxor	xmm9, xmm9, xmm5
-        vpxor	xmm10, xmm10, xmm0
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm1, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm1, xmm1, xmm14
-        vmovdqu	OWORD PTR [rsp+32], xmm1
-        ; H ^ 4
-        vpclmulqdq	xmm8, xmm0, xmm0, 0
-        vpclmulqdq	xmm3, xmm0, xmm0, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm3, xmm3, xmm14
-        vmovdqu	OWORD PTR [rsp+48], xmm3
-        ; H ^ 5
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm0, 78
-        vpshufd	xmm10, xmm1, 78
-        vpclmulqdq	xmm11, xmm1, xmm0, 17
-        vpclmulqdq	xmm8, xmm1, xmm0, 0
-        vpxor	xmm9, xmm9, xmm0
-        vpxor	xmm10, xmm10, xmm1
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm7, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+64], xmm7
-        ; H ^ 6
-        vpclmulqdq	xmm8, xmm1, xmm1, 0
-        vpclmulqdq	xmm7, xmm1, xmm1, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+80], xmm7
-        ; H ^ 7
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm1, 78
-        vpshufd	xmm10, xmm3, 78
-        vpclmulqdq	xmm11, xmm3, xmm1, 17
-        vpclmulqdq	xmm8, xmm3, xmm1, 0
-        vpxor	xmm9, xmm9, xmm1
-        vpxor	xmm10, xmm10, xmm3
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm7, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+96], xmm7
-        ; H ^ 8
-        vpclmulqdq	xmm8, xmm3, xmm3, 0
-        vpclmulqdq	xmm7, xmm3, xmm3, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+112], xmm7
-        ; First 128 bytes of input
-        vmovdqu	xmm0, OWORD PTR [rsp+128]
-        vmovdqa	xmm1, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpshufb	xmm8, xmm0, xmm1
-        vpaddd	xmm9, xmm0, OWORD PTR L_avx1_aes_gcm_one
-        vpshufb	xmm9, xmm9, xmm1
-        vpaddd	xmm10, xmm0, OWORD PTR L_avx1_aes_gcm_two
-        vpshufb	xmm10, xmm10, xmm1
-        vpaddd	xmm11, xmm0, OWORD PTR L_avx1_aes_gcm_three
-        vpshufb	xmm11, xmm11, xmm1
-        vpaddd	xmm12, xmm0, OWORD PTR L_avx1_aes_gcm_four
-        vpshufb	xmm12, xmm12, xmm1
-        vpaddd	xmm13, xmm0, OWORD PTR L_avx1_aes_gcm_five
-        vpshufb	xmm13, xmm13, xmm1
-        vpaddd	xmm14, xmm0, OWORD PTR L_avx1_aes_gcm_six
-        vpshufb	xmm14, xmm14, xmm1
-        vpaddd	xmm15, xmm0, OWORD PTR L_avx1_aes_gcm_seven
-        vpshufb	xmm15, xmm15, xmm1
-        vpaddd	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_eight
-        vmovdqa	xmm7, OWORD PTR [r15]
-        vmovdqu	OWORD PTR [rsp+128], xmm0
-        vpxor	xmm8, xmm8, xmm7
-        vpxor	xmm9, xmm9, xmm7
-        vpxor	xmm10, xmm10, xmm7
-        vpxor	xmm11, xmm11, xmm7
-        vpxor	xmm12, xmm12, xmm7
-        vpxor	xmm13, xmm13, xmm7
-        vpxor	xmm14, xmm14, xmm7
-        vpxor	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+16]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+32]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+48]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+64]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+80]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+96]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+112]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+128]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+144]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r10d, 11
-        vmovdqa	xmm7, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_avx1_aesenc_128_enc_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+176]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r10d, 13
-        vmovdqa	xmm7, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_avx1_aesenc_128_enc_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+208]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_avx1_aesenc_128_enc_done:
-        vaesenclast	xmm8, xmm8, xmm7
-        vaesenclast	xmm9, xmm9, xmm7
-        vmovdqu	xmm0, OWORD PTR [rdi]
-        vmovdqu	xmm1, OWORD PTR [rdi+16]
-        vpxor	xmm8, xmm8, xmm0
-        vpxor	xmm9, xmm9, xmm1
-        vmovdqu	OWORD PTR [rsi], xmm8
-        vmovdqu	OWORD PTR [rsi+16], xmm9
-        vaesenclast	xmm10, xmm10, xmm7
-        vaesenclast	xmm11, xmm11, xmm7
-        vmovdqu	xmm0, OWORD PTR [rdi+32]
-        vmovdqu	xmm1, OWORD PTR [rdi+48]
-        vpxor	xmm10, xmm10, xmm0
-        vpxor	xmm11, xmm11, xmm1
-        vmovdqu	OWORD PTR [rsi+32], xmm10
-        vmovdqu	OWORD PTR [rsi+48], xmm11
-        vaesenclast	xmm12, xmm12, xmm7
-        vaesenclast	xmm13, xmm13, xmm7
-        vmovdqu	xmm0, OWORD PTR [rdi+64]
-        vmovdqu	xmm1, OWORD PTR [rdi+80]
-        vpxor	xmm12, xmm12, xmm0
-        vpxor	xmm13, xmm13, xmm1
-        vmovdqu	OWORD PTR [rsi+64], xmm12
-        vmovdqu	OWORD PTR [rsi+80], xmm13
-        vaesenclast	xmm14, xmm14, xmm7
-        vaesenclast	xmm15, xmm15, xmm7
-        vmovdqu	xmm0, OWORD PTR [rdi+96]
-        vmovdqu	xmm1, OWORD PTR [rdi+112]
-        vpxor	xmm14, xmm14, xmm0
-        vpxor	xmm15, xmm15, xmm1
-        vmovdqu	OWORD PTR [rsi+96], xmm14
-        vmovdqu	OWORD PTR [rsi+112], xmm15
-        cmp	r13d, 128
-        mov	ebx, 128
-        jle	L_AES_GCM_encrypt_avx1_end_128
-        ; More 128 bytes of input
-L_AES_GCM_encrypt_avx1_ghash_128:
-        lea	rcx, QWORD PTR [rdi+rbx]
-        lea	rdx, QWORD PTR [rsi+rbx]
-        vmovdqu	xmm0, OWORD PTR [rsp+128]
-        vmovdqa	xmm1, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpshufb	xmm8, xmm0, xmm1
-        vpaddd	xmm9, xmm0, OWORD PTR L_avx1_aes_gcm_one
-        vpshufb	xmm9, xmm9, xmm1
-        vpaddd	xmm10, xmm0, OWORD PTR L_avx1_aes_gcm_two
-        vpshufb	xmm10, xmm10, xmm1
-        vpaddd	xmm11, xmm0, OWORD PTR L_avx1_aes_gcm_three
-        vpshufb	xmm11, xmm11, xmm1
-        vpaddd	xmm12, xmm0, OWORD PTR L_avx1_aes_gcm_four
-        vpshufb	xmm12, xmm12, xmm1
-        vpaddd	xmm13, xmm0, OWORD PTR L_avx1_aes_gcm_five
-        vpshufb	xmm13, xmm13, xmm1
-        vpaddd	xmm14, xmm0, OWORD PTR L_avx1_aes_gcm_six
-        vpshufb	xmm14, xmm14, xmm1
-        vpaddd	xmm15, xmm0, OWORD PTR L_avx1_aes_gcm_seven
-        vpshufb	xmm15, xmm15, xmm1
-        vpaddd	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_eight
-        vmovdqa	xmm7, OWORD PTR [r15]
-        vmovdqu	OWORD PTR [rsp+128], xmm0
-        vpxor	xmm8, xmm8, xmm7
-        vpxor	xmm9, xmm9, xmm7
-        vpxor	xmm10, xmm10, xmm7
-        vpxor	xmm11, xmm11, xmm7
-        vpxor	xmm12, xmm12, xmm7
-        vpxor	xmm13, xmm13, xmm7
-        vpxor	xmm14, xmm14, xmm7
-        vpxor	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsp+112]
-        vmovdqu	xmm0, OWORD PTR [rdx+-128]
-        vaesenc	xmm8, xmm8, [r15+16]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm0, xmm0, xmm2
-        vpshufd	xmm1, xmm7, 78
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm3, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+16]
-        vaesenc	xmm10, xmm10, [r15+16]
-        vpclmulqdq	xmm2, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+16]
-        vaesenc	xmm12, xmm12, [r15+16]
-        vpclmulqdq	xmm1, xmm1, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+16]
-        vaesenc	xmm14, xmm14, [r15+16]
-        vaesenc	xmm15, xmm15, [r15+16]
-        vpxor	xmm1, xmm1, xmm2
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqu	xmm7, OWORD PTR [rsp+96]
-        vmovdqu	xmm0, OWORD PTR [rdx+-112]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [r15+32]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+32]
-        vaesenc	xmm10, xmm10, [r15+32]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+32]
-        vaesenc	xmm12, xmm12, [r15+32]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+32]
-        vaesenc	xmm14, xmm14, [r15+32]
-        vaesenc	xmm15, xmm15, [r15+32]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+80]
-        vmovdqu	xmm0, OWORD PTR [rdx+-96]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [r15+48]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+48]
-        vaesenc	xmm10, xmm10, [r15+48]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+48]
-        vaesenc	xmm12, xmm12, [r15+48]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+48]
-        vaesenc	xmm14, xmm14, [r15+48]
-        vaesenc	xmm15, xmm15, [r15+48]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+64]
-        vmovdqu	xmm0, OWORD PTR [rdx+-80]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [r15+64]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+64]
-        vaesenc	xmm10, xmm10, [r15+64]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+64]
-        vaesenc	xmm12, xmm12, [r15+64]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+64]
-        vaesenc	xmm14, xmm14, [r15+64]
-        vaesenc	xmm15, xmm15, [r15+64]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+48]
-        vmovdqu	xmm0, OWORD PTR [rdx+-64]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [r15+80]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+80]
-        vaesenc	xmm10, xmm10, [r15+80]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+80]
-        vaesenc	xmm12, xmm12, [r15+80]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+80]
-        vaesenc	xmm14, xmm14, [r15+80]
-        vaesenc	xmm15, xmm15, [r15+80]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+32]
-        vmovdqu	xmm0, OWORD PTR [rdx+-48]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [r15+96]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+96]
-        vaesenc	xmm10, xmm10, [r15+96]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+96]
-        vaesenc	xmm12, xmm12, [r15+96]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+96]
-        vaesenc	xmm14, xmm14, [r15+96]
-        vaesenc	xmm15, xmm15, [r15+96]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vmovdqu	xmm0, OWORD PTR [rdx+-32]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [r15+112]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+112]
-        vaesenc	xmm10, xmm10, [r15+112]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+112]
-        vaesenc	xmm12, xmm12, [r15+112]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+112]
-        vaesenc	xmm14, xmm14, [r15+112]
-        vaesenc	xmm15, xmm15, [r15+112]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp]
-        vmovdqu	xmm0, OWORD PTR [rdx+-16]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [r15+128]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+128]
-        vaesenc	xmm10, xmm10, [r15+128]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+128]
-        vaesenc	xmm12, xmm12, [r15+128]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+128]
-        vaesenc	xmm14, xmm14, [r15+128]
-        vaesenc	xmm15, xmm15, [r15+128]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vpslldq	xmm5, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vaesenc	xmm8, xmm8, [r15+144]
-        vpxor	xmm2, xmm2, xmm5
-        vpxor	xmm3, xmm3, xmm1
-        vaesenc	xmm9, xmm9, [r15+144]
-        vpslld	xmm7, xmm2, 31
-        vpslld	xmm4, xmm2, 30
-        vpslld	xmm5, xmm2, 25
-        vaesenc	xmm10, xmm10, [r15+144]
-        vpxor	xmm7, xmm7, xmm4
-        vpxor	xmm7, xmm7, xmm5
-        vaesenc	xmm11, xmm11, [r15+144]
-        vpsrldq	xmm4, xmm7, 4
-        vpslldq	xmm7, xmm7, 12
-        vaesenc	xmm12, xmm12, [r15+144]
-        vpxor	xmm2, xmm2, xmm7
-        vpsrld	xmm5, xmm2, 1
-        vaesenc	xmm13, xmm13, [r15+144]
-        vpsrld	xmm1, xmm2, 2
-        vpsrld	xmm0, xmm2, 7
-        vaesenc	xmm14, xmm14, [r15+144]
-        vpxor	xmm5, xmm5, xmm1
-        vpxor	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, [r15+144]
-        vpxor	xmm5, xmm5, xmm4
-        vpxor	xmm2, xmm2, xmm5
-        vpxor	xmm2, xmm2, xmm3
-        cmp	r10d, 11
-        vmovdqa	xmm7, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_avx1_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+176]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r10d, 13
-        vmovdqa	xmm7, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_avx1_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+208]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_avx1_aesenc_128_ghash_avx_done:
-        vaesenclast	xmm8, xmm8, xmm7
-        vaesenclast	xmm9, xmm9, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx]
-        vmovdqu	xmm1, OWORD PTR [rcx+16]
-        vpxor	xmm8, xmm8, xmm0
-        vpxor	xmm9, xmm9, xmm1
-        vmovdqu	OWORD PTR [rdx], xmm8
-        vmovdqu	OWORD PTR [rdx+16], xmm9
-        vaesenclast	xmm10, xmm10, xmm7
-        vaesenclast	xmm11, xmm11, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+32]
-        vmovdqu	xmm1, OWORD PTR [rcx+48]
-        vpxor	xmm10, xmm10, xmm0
-        vpxor	xmm11, xmm11, xmm1
-        vmovdqu	OWORD PTR [rdx+32], xmm10
-        vmovdqu	OWORD PTR [rdx+48], xmm11
-        vaesenclast	xmm12, xmm12, xmm7
-        vaesenclast	xmm13, xmm13, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+64]
-        vmovdqu	xmm1, OWORD PTR [rcx+80]
-        vpxor	xmm12, xmm12, xmm0
-        vpxor	xmm13, xmm13, xmm1
-        vmovdqu	OWORD PTR [rdx+64], xmm12
-        vmovdqu	OWORD PTR [rdx+80], xmm13
-        vaesenclast	xmm14, xmm14, xmm7
-        vaesenclast	xmm15, xmm15, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+96]
-        vmovdqu	xmm1, OWORD PTR [rcx+112]
-        vpxor	xmm14, xmm14, xmm0
-        vpxor	xmm15, xmm15, xmm1
-        vmovdqu	OWORD PTR [rdx+96], xmm14
-        vmovdqu	OWORD PTR [rdx+112], xmm15
-        add	ebx, 128
-        cmp	ebx, r13d
-        jl	L_AES_GCM_encrypt_avx1_ghash_128
-L_AES_GCM_encrypt_avx1_end_128:
-        vmovdqa	xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpshufb	xmm8, xmm8, xmm4
-        vpshufb	xmm9, xmm9, xmm4
-        vpshufb	xmm10, xmm10, xmm4
-        vpshufb	xmm11, xmm11, xmm4
-        vpxor	xmm8, xmm8, xmm2
-        vpshufb	xmm12, xmm12, xmm4
-        vpshufb	xmm13, xmm13, xmm4
-        vpshufb	xmm14, xmm14, xmm4
-        vpshufb	xmm15, xmm15, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp]
-        vmovdqu	xmm5, OWORD PTR [rsp+16]
-        ; ghash_gfmul_avx
-        vpshufd	xmm1, xmm15, 78
-        vpshufd	xmm2, xmm7, 78
-        vpclmulqdq	xmm3, xmm7, xmm15, 17
-        vpclmulqdq	xmm0, xmm7, xmm15, 0
-        vpxor	xmm1, xmm1, xmm15
-        vpxor	xmm2, xmm2, xmm7
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm4, xmm0
-        vmovdqa	xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        ; ghash_gfmul_xor_avx
-        vpshufd	xmm1, xmm14, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm14, 17
-        vpclmulqdq	xmm0, xmm5, xmm14, 0
-        vpxor	xmm1, xmm1, xmm14
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vpxor	xmm4, xmm4, xmm0
-        vpxor	xmm6, xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        vmovdqu	xmm7, OWORD PTR [rsp+32]
-        vmovdqu	xmm5, OWORD PTR [rsp+48]
-        ; ghash_gfmul_xor_avx
-        vpshufd	xmm1, xmm13, 78
-        vpshufd	xmm2, xmm7, 78
-        vpclmulqdq	xmm3, xmm7, xmm13, 17
-        vpclmulqdq	xmm0, xmm7, xmm13, 0
-        vpxor	xmm1, xmm1, xmm13
-        vpxor	xmm2, xmm2, xmm7
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vpxor	xmm4, xmm4, xmm0
-        vpxor	xmm6, xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        ; ghash_gfmul_xor_avx
-        vpshufd	xmm1, xmm12, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm12, 17
-        vpclmulqdq	xmm0, xmm5, xmm12, 0
-        vpxor	xmm1, xmm1, xmm12
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vpxor	xmm4, xmm4, xmm0
-        vpxor	xmm6, xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        vmovdqu	xmm7, OWORD PTR [rsp+64]
-        vmovdqu	xmm5, OWORD PTR [rsp+80]
-        ; ghash_gfmul_xor_avx
-        vpshufd	xmm1, xmm11, 78
-        vpshufd	xmm2, xmm7, 78
-        vpclmulqdq	xmm3, xmm7, xmm11, 17
-        vpclmulqdq	xmm0, xmm7, xmm11, 0
-        vpxor	xmm1, xmm1, xmm11
-        vpxor	xmm2, xmm2, xmm7
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vpxor	xmm4, xmm4, xmm0
-        vpxor	xmm6, xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        ; ghash_gfmul_xor_avx
-        vpshufd	xmm1, xmm10, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm10, 17
-        vpclmulqdq	xmm0, xmm5, xmm10, 0
-        vpxor	xmm1, xmm1, xmm10
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vpxor	xmm4, xmm4, xmm0
-        vpxor	xmm6, xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        vmovdqu	xmm7, OWORD PTR [rsp+96]
-        vmovdqu	xmm5, OWORD PTR [rsp+112]
-        ; ghash_gfmul_xor_avx
-        vpshufd	xmm1, xmm9, 78
-        vpshufd	xmm2, xmm7, 78
-        vpclmulqdq	xmm3, xmm7, xmm9, 17
-        vpclmulqdq	xmm0, xmm7, xmm9, 0
-        vpxor	xmm1, xmm1, xmm9
-        vpxor	xmm2, xmm2, xmm7
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vpxor	xmm4, xmm4, xmm0
-        vpxor	xmm6, xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        ; ghash_gfmul_xor_avx
-        vpshufd	xmm1, xmm8, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm8, 17
-        vpclmulqdq	xmm0, xmm5, xmm8, 0
-        vpxor	xmm1, xmm1, xmm8
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vpxor	xmm4, xmm4, xmm0
-        vpxor	xmm6, xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        vpslld	xmm0, xmm4, 31
-        vpslld	xmm1, xmm4, 30
-        vpslld	xmm2, xmm4, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm4, xmm4, xmm0
-        vpsrld	xmm2, xmm4, 1
-        vpsrld	xmm3, xmm4, 2
-        vpsrld	xmm0, xmm4, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm4
-        vpxor	xmm6, xmm6, xmm2
-        vmovdqu	xmm5, OWORD PTR [rsp]
-L_AES_GCM_encrypt_avx1_done_128:
-        mov	edx, r9d
-        cmp	ebx, edx
-        jge	L_AES_GCM_encrypt_avx1_done_enc
-        mov	r13d, r9d
-        and	r13d, 4294967280
-        cmp	ebx, r13d
-        jge	L_AES_GCM_encrypt_avx1_last_block_done
-        vmovdqu	xmm9, OWORD PTR [rsp+128]
-        vpshufb	xmm8, xmm9, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpaddd	xmm9, xmm9, OWORD PTR L_avx1_aes_gcm_one
-        vmovdqu	OWORD PTR [rsp+128], xmm9
-        vpxor	xmm8, xmm8, [r15]
-        vaesenc	xmm8, xmm8, [r15+16]
-        vaesenc	xmm8, xmm8, [r15+32]
-        vaesenc	xmm8, xmm8, [r15+48]
-        vaesenc	xmm8, xmm8, [r15+64]
-        vaesenc	xmm8, xmm8, [r15+80]
-        vaesenc	xmm8, xmm8, [r15+96]
-        vaesenc	xmm8, xmm8, [r15+112]
-        vaesenc	xmm8, xmm8, [r15+128]
-        vaesenc	xmm8, xmm8, [r15+144]
-        cmp	r10d, 11
-        vmovdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_avx1_aesenc_block_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [r15+176]
-        cmp	r10d, 13
-        vmovdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_avx1_aesenc_block_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [r15+208]
-        vmovdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_avx1_aesenc_block_last:
-        vaesenclast	xmm8, xmm8, xmm9
-        vmovdqu	xmm9, OWORD PTR [rdi+rbx]
-        vpxor	xmm8, xmm8, xmm9
-        vmovdqu	OWORD PTR [rsi+rbx], xmm8
-        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm8
-        add	ebx, 16
-        cmp	ebx, r13d
-        jge	L_AES_GCM_encrypt_avx1_last_block_ghash
-L_AES_GCM_encrypt_avx1_last_block_start:
-        vmovdqu	xmm13, OWORD PTR [rdi+rbx]
-        vmovdqu	xmm9, OWORD PTR [rsp+128]
-        vpshufb	xmm8, xmm9, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpaddd	xmm9, xmm9, OWORD PTR L_avx1_aes_gcm_one
-        vmovdqu	OWORD PTR [rsp+128], xmm9
-        vpxor	xmm8, xmm8, [r15]
-        vpclmulqdq	xmm10, xmm6, xmm5, 16
-        vaesenc	xmm8, xmm8, [r15+16]
-        vaesenc	xmm8, xmm8, [r15+32]
-        vpclmulqdq	xmm11, xmm6, xmm5, 1
-        vaesenc	xmm8, xmm8, [r15+48]
-        vaesenc	xmm8, xmm8, [r15+64]
-        vpclmulqdq	xmm12, xmm6, xmm5, 0
-        vaesenc	xmm8, xmm8, [r15+80]
-        vpclmulqdq	xmm1, xmm6, xmm5, 17
-        vaesenc	xmm8, xmm8, [r15+96]
-        vpxor	xmm10, xmm10, xmm11
-        vpslldq	xmm2, xmm10, 8
-        vpsrldq	xmm10, xmm10, 8
-        vaesenc	xmm8, xmm8, [r15+112]
-        vpxor	xmm2, xmm2, xmm12
-        vpxor	xmm3, xmm1, xmm10
-        vmovdqa	xmm0, OWORD PTR L_avx1_aes_gcm_mod2_128
-        vpclmulqdq	xmm11, xmm2, xmm0, 16
-        vaesenc	xmm8, xmm8, [r15+128]
-        vpshufd	xmm10, xmm2, 78
-        vpxor	xmm10, xmm10, xmm11
-        vpclmulqdq	xmm11, xmm10, xmm0, 16
-        vaesenc	xmm8, xmm8, [r15+144]
-        vpshufd	xmm10, xmm10, 78
-        vpxor	xmm10, xmm10, xmm11
-        vpxor	xmm6, xmm10, xmm3
-        cmp	r10d, 11
-        vmovdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_avx1_aesenc_gfmul_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [r15+176]
-        cmp	r10d, 13
-        vmovdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_avx1_aesenc_gfmul_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [r15+208]
-        vmovdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_avx1_aesenc_gfmul_last:
-        vaesenclast	xmm8, xmm8, xmm9
-        vmovdqa	xmm0, xmm13
-        vpxor	xmm8, xmm8, xmm0
-        vmovdqu	OWORD PTR [rsi+rbx], xmm8
-        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        add	ebx, 16
-        vpxor	xmm6, xmm6, xmm8
-        cmp	ebx, r13d
-        jl	L_AES_GCM_encrypt_avx1_last_block_start
-L_AES_GCM_encrypt_avx1_last_block_ghash:
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm5, 78
-        vpshufd	xmm10, xmm6, 78
-        vpclmulqdq	xmm11, xmm6, xmm5, 17
-        vpclmulqdq	xmm8, xmm6, xmm5, 0
-        vpxor	xmm9, xmm9, xmm5
-        vpxor	xmm10, xmm10, xmm6
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm6, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm6, xmm6, xmm14
-L_AES_GCM_encrypt_avx1_last_block_done:
-        mov	ecx, r9d
-        mov	edx, ecx
-        and	ecx, 15
-        jz	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_done
-        vmovdqu	xmm4, OWORD PTR [rsp+128]
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpxor	xmm4, xmm4, [r15]
-        vaesenc	xmm4, xmm4, [r15+16]
-        vaesenc	xmm4, xmm4, [r15+32]
-        vaesenc	xmm4, xmm4, [r15+48]
-        vaesenc	xmm4, xmm4, [r15+64]
-        vaesenc	xmm4, xmm4, [r15+80]
-        vaesenc	xmm4, xmm4, [r15+96]
-        vaesenc	xmm4, xmm4, [r15+112]
-        vaesenc	xmm4, xmm4, [r15+128]
-        vaesenc	xmm4, xmm4, [r15+144]
-        cmp	r10d, 11
-        vmovdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last
-        vaesenc	xmm4, xmm4, xmm9
-        vaesenc	xmm4, xmm4, [r15+176]
-        cmp	r10d, 13
-        vmovdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last
-        vaesenc	xmm4, xmm4, xmm9
-        vaesenc	xmm4, xmm4, [r15+208]
-        vmovdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last:
-        vaesenclast	xmm4, xmm4, xmm9
-        sub	rsp, 16
-        xor	ecx, ecx
-        vmovdqu	OWORD PTR [rsp], xmm4
-L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_loop:
-        movzx	r13d, BYTE PTR [rdi+rbx]
-        xor	r13b, BYTE PTR [rsp+rcx]
-        mov	BYTE PTR [rsi+rbx], r13b
-        mov	BYTE PTR [rsp+rcx], r13b
-        inc	ebx
-        inc	ecx
-        cmp	ebx, edx
-        jl	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_loop
-        xor	r13, r13
-        cmp	ecx, 16
-        je	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_finish_enc
-L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_byte_loop:
-        mov	BYTE PTR [rsp+rcx], r13b
-        inc	ecx
-        cmp	ecx, 16
-        jl	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_byte_loop
-L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_finish_enc:
-        vmovdqu	xmm4, OWORD PTR [rsp]
-        add	rsp, 16
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm4
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm5, 78
-        vpshufd	xmm10, xmm6, 78
-        vpclmulqdq	xmm11, xmm6, xmm5, 17
-        vpclmulqdq	xmm8, xmm6, xmm5, 0
-        vpxor	xmm9, xmm9, xmm5
-        vpxor	xmm10, xmm10, xmm6
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm6, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm6, xmm6, xmm14
-L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_done:
-L_AES_GCM_encrypt_avx1_done_enc:
-        mov	edx, r9d
-        mov	ecx, r11d
-        shl	rdx, 3
-        shl	rcx, 3
-        vmovq	xmm0, rdx
-        vmovq	xmm1, rcx
-        vpunpcklqdq	xmm0, xmm0, xmm1
-        vpxor	xmm6, xmm6, xmm0
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm5, 78
-        vpshufd	xmm10, xmm6, 78
-        vpclmulqdq	xmm11, xmm6, xmm5, 17
-        vpclmulqdq	xmm8, xmm6, xmm5, 0
-        vpxor	xmm9, xmm9, xmm5
-        vpxor	xmm10, xmm10, xmm6
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm6, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm6, xmm6, xmm14
-        vpshufb	xmm6, xmm6, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vmovdqu	xmm0, OWORD PTR [rsp+144]
-        vpxor	xmm0, xmm0, xmm6
-        cmp	r14d, 16
-        je	L_AES_GCM_encrypt_avx1_store_tag_16
-        xor	rcx, rcx
-        vmovdqu	OWORD PTR [rsp], xmm0
-L_AES_GCM_encrypt_avx1_store_tag_loop:
-        movzx	r13d, BYTE PTR [rsp+rcx]
-        mov	BYTE PTR [r8+rcx], r13b
-        inc	ecx
-        cmp	ecx, r14d
-        jne	L_AES_GCM_encrypt_avx1_store_tag_loop
-        jmp	L_AES_GCM_encrypt_avx1_store_tag_done
-L_AES_GCM_encrypt_avx1_store_tag_16:
-        vmovdqu	OWORD PTR [r8], xmm0
-L_AES_GCM_encrypt_avx1_store_tag_done:
-        vzeroupper
-        vmovdqu	xmm6, OWORD PTR [rsp+160]
-        vmovdqu	xmm7, OWORD PTR [rsp+176]
-        vmovdqu	xmm8, OWORD PTR [rsp+192]
-        vmovdqu	xmm9, OWORD PTR [rsp+208]
-        vmovdqu	xmm10, OWORD PTR [rsp+224]
-        vmovdqu	xmm11, OWORD PTR [rsp+240]
-        vmovdqu	xmm12, OWORD PTR [rsp+256]
-        vmovdqu	xmm13, OWORD PTR [rsp+272]
-        vmovdqu	xmm14, OWORD PTR [rsp+288]
-        vmovdqu	xmm15, OWORD PTR [rsp+304]
-        add	rsp, 320
-        pop	r15
-        pop	r14
-        pop	rbx
-        pop	r12
-        pop	rsi
-        pop	rdi
-        pop	r13
-        ret
-AES_GCM_encrypt_avx1 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_decrypt_avx1 PROC
-        push	r13
-        push	rdi
-        push	rsi
-        push	r12
-        push	rbx
-        push	r14
-        push	r15
-        push	rbp
-        mov	rdi, rcx
-        mov	rsi, rdx
-        mov	r12, r8
-        mov	rax, r9
-        mov	r8, QWORD PTR [rsp+104]
-        mov	r9d, DWORD PTR [rsp+112]
-        mov	r11d, DWORD PTR [rsp+120]
-        mov	ebx, DWORD PTR [rsp+128]
-        mov	r14d, DWORD PTR [rsp+136]
-        mov	r15, QWORD PTR [rsp+144]
-        mov	r10d, DWORD PTR [rsp+152]
-        mov	rbp, QWORD PTR [rsp+160]
-        sub	rsp, 328
-        vmovdqu	OWORD PTR [rsp+168], xmm6
-        vmovdqu	OWORD PTR [rsp+184], xmm7
-        vmovdqu	OWORD PTR [rsp+200], xmm8
-        vmovdqu	OWORD PTR [rsp+216], xmm9
-        vmovdqu	OWORD PTR [rsp+232], xmm10
-        vmovdqu	OWORD PTR [rsp+248], xmm11
-        vmovdqu	OWORD PTR [rsp+264], xmm12
-        vmovdqu	OWORD PTR [rsp+280], xmm13
-        vmovdqu	OWORD PTR [rsp+296], xmm14
-        vmovdqu	OWORD PTR [rsp+312], xmm15
-        vpxor	xmm4, xmm4, xmm4
-        vpxor	xmm6, xmm6, xmm6
-        cmp	ebx, 12
-        mov	edx, ebx
-        jne	L_AES_GCM_decrypt_avx1_iv_not_12
-        ; # Calculate values when IV is 12 bytes
-        ; Set counter based on IV
-        mov	ecx, 16777216
-        vmovq	xmm4, QWORD PTR [rax]
-        vpinsrd	xmm4, xmm4, DWORD PTR [rax+8], 2
-        vpinsrd	xmm4, xmm4, ecx, 3
-        ; H = Encrypt X(=0) and T = Encrypt counter
-        vmovdqa	xmm5, OWORD PTR [r15]
-        vpxor	xmm1, xmm4, xmm5
-        vmovdqa	xmm7, OWORD PTR [r15+16]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+32]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+48]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+64]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+80]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+96]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+112]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+128]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+144]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        cmp	r10d, 11
-        vmovdqa	xmm7, OWORD PTR [r15+160]
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_12_last
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+176]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        cmp	r10d, 13
-        vmovdqa	xmm7, OWORD PTR [r15+192]
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_12_last
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+208]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+224]
-L_AES_GCM_decrypt_avx1_calc_iv_12_last:
-        vaesenclast	xmm5, xmm5, xmm7
-        vaesenclast	xmm1, xmm1, xmm7
-        vpshufb	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vmovdqu	OWORD PTR [rsp+144], xmm1
-        jmp	L_AES_GCM_decrypt_avx1_iv_done
-L_AES_GCM_decrypt_avx1_iv_not_12:
-        ; Calculate values when IV is not 12 bytes
-        ; H = Encrypt X(=0)
-        vmovdqa	xmm5, OWORD PTR [r15]
-        vaesenc	xmm5, xmm5, [r15+16]
-        vaesenc	xmm5, xmm5, [r15+32]
-        vaesenc	xmm5, xmm5, [r15+48]
-        vaesenc	xmm5, xmm5, [r15+64]
-        vaesenc	xmm5, xmm5, [r15+80]
-        vaesenc	xmm5, xmm5, [r15+96]
-        vaesenc	xmm5, xmm5, [r15+112]
-        vaesenc	xmm5, xmm5, [r15+128]
-        vaesenc	xmm5, xmm5, [r15+144]
-        cmp	r10d, 11
-        vmovdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last
-        vaesenc	xmm5, xmm5, xmm9
-        vaesenc	xmm5, xmm5, [r15+176]
-        cmp	r10d, 13
-        vmovdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last
-        vaesenc	xmm5, xmm5, xmm9
-        vaesenc	xmm5, xmm5, [r15+208]
-        vmovdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last:
-        vaesenclast	xmm5, xmm5, xmm9
-        vpshufb	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        ; Calc counter
-        ; Initialization vector
-        cmp	edx, 0
-        mov	rcx, 0
-        je	L_AES_GCM_decrypt_avx1_calc_iv_done
-        cmp	edx, 16
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_lt16
-        and	edx, 4294967280
-L_AES_GCM_decrypt_avx1_calc_iv_16_loop:
-        vmovdqu	xmm8, OWORD PTR [rax+rcx]
-        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm4, xmm4, xmm8
-        ; ghash_gfmul_avx
-        vpshufd	xmm1, xmm4, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpxor	xmm1, xmm1, xmm4
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm7, xmm0
-        vmovdqa	xmm4, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm7, xmm7, xmm2
-        vpxor	xmm4, xmm4, xmm1
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm4, xmm4, xmm1
-        vpslld	xmm0, xmm7, 31
-        vpslld	xmm1, xmm7, 30
-        vpslld	xmm2, xmm7, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm7, xmm7, xmm0
-        vpsrld	xmm2, xmm7, 1
-        vpsrld	xmm3, xmm7, 2
-        vpsrld	xmm0, xmm7, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm4, xmm4, xmm2
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_16_loop
-        mov	edx, ebx
-        cmp	ecx, edx
-        je	L_AES_GCM_decrypt_avx1_calc_iv_done
-L_AES_GCM_decrypt_avx1_calc_iv_lt16:
-        sub	rsp, 16
-        vpxor	xmm8, xmm8, xmm8
-        xor	ebx, ebx
-        vmovdqu	OWORD PTR [rsp], xmm8
-L_AES_GCM_decrypt_avx1_calc_iv_loop:
-        movzx	r13d, BYTE PTR [rax+rcx]
-        mov	BYTE PTR [rsp+rbx], r13b
-        inc	ecx
-        inc	ebx
-        cmp	ecx, edx
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_loop
-        vmovdqu	xmm8, OWORD PTR [rsp]
-        add	rsp, 16
-        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm4, xmm4, xmm8
-        ; ghash_gfmul_avx
-        vpshufd	xmm1, xmm4, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpxor	xmm1, xmm1, xmm4
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm7, xmm0
-        vmovdqa	xmm4, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm7, xmm7, xmm2
-        vpxor	xmm4, xmm4, xmm1
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm4, xmm4, xmm1
-        vpslld	xmm0, xmm7, 31
-        vpslld	xmm1, xmm7, 30
-        vpslld	xmm2, xmm7, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm7, xmm7, xmm0
-        vpsrld	xmm2, xmm7, 1
-        vpsrld	xmm3, xmm7, 2
-        vpsrld	xmm0, xmm7, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm4, xmm4, xmm2
-L_AES_GCM_decrypt_avx1_calc_iv_done:
-        ; T = Encrypt counter
-        vpxor	xmm0, xmm0, xmm0
-        shl	edx, 3
-        vmovq	xmm0, rdx
-        vpxor	xmm4, xmm4, xmm0
-        ; ghash_gfmul_avx
-        vpshufd	xmm1, xmm4, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpxor	xmm1, xmm1, xmm4
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm7, xmm0
-        vmovdqa	xmm4, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm7, xmm7, xmm2
-        vpxor	xmm4, xmm4, xmm1
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm4, xmm4, xmm1
-        vpslld	xmm0, xmm7, 31
-        vpslld	xmm1, xmm7, 30
-        vpslld	xmm2, xmm7, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm7, xmm7, xmm0
-        vpsrld	xmm2, xmm7, 1
-        vpsrld	xmm3, xmm7, 2
-        vpsrld	xmm0, xmm7, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm4, xmm4, xmm2
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        ;   Encrypt counter
-        vmovdqa	xmm8, OWORD PTR [r15]
-        vpxor	xmm8, xmm8, xmm4
-        vaesenc	xmm8, xmm8, [r15+16]
-        vaesenc	xmm8, xmm8, [r15+32]
-        vaesenc	xmm8, xmm8, [r15+48]
-        vaesenc	xmm8, xmm8, [r15+64]
-        vaesenc	xmm8, xmm8, [r15+80]
-        vaesenc	xmm8, xmm8, [r15+96]
-        vaesenc	xmm8, xmm8, [r15+112]
-        vaesenc	xmm8, xmm8, [r15+128]
-        vaesenc	xmm8, xmm8, [r15+144]
-        cmp	r10d, 11
-        vmovdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [r15+176]
-        cmp	r10d, 13
-        vmovdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [r15+208]
-        vmovdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last:
-        vaesenclast	xmm8, xmm8, xmm9
-        vmovdqu	OWORD PTR [rsp+144], xmm8
-L_AES_GCM_decrypt_avx1_iv_done:
-        ; Additional authentication data
-        mov	edx, r11d
-        cmp	edx, 0
-        je	L_AES_GCM_decrypt_avx1_calc_aad_done
-        xor	ecx, ecx
-        cmp	edx, 16
-        jl	L_AES_GCM_decrypt_avx1_calc_aad_lt16
-        and	edx, 4294967280
-L_AES_GCM_decrypt_avx1_calc_aad_16_loop:
-        vmovdqu	xmm8, OWORD PTR [r12+rcx]
-        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm8
-        ; ghash_gfmul_avx
-        vpshufd	xmm1, xmm6, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm6, 17
-        vpclmulqdq	xmm0, xmm5, xmm6, 0
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm7, xmm0
-        vmovdqa	xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm7, xmm7, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm6, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm6, xmm6, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm6, xmm6, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm6, xmm6, xmm1
-        vpslld	xmm0, xmm7, 31
-        vpslld	xmm1, xmm7, 30
-        vpslld	xmm2, xmm7, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm7, xmm7, xmm0
-        vpsrld	xmm2, xmm7, 1
-        vpsrld	xmm3, xmm7, 2
-        vpsrld	xmm0, xmm7, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm6, xmm6, xmm2
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_decrypt_avx1_calc_aad_16_loop
-        mov	edx, r11d
-        cmp	ecx, edx
-        je	L_AES_GCM_decrypt_avx1_calc_aad_done
-L_AES_GCM_decrypt_avx1_calc_aad_lt16:
-        sub	rsp, 16
-        vpxor	xmm8, xmm8, xmm8
-        xor	ebx, ebx
-        vmovdqu	OWORD PTR [rsp], xmm8
-L_AES_GCM_decrypt_avx1_calc_aad_loop:
-        movzx	r13d, BYTE PTR [r12+rcx]
-        mov	BYTE PTR [rsp+rbx], r13b
-        inc	ecx
-        inc	ebx
-        cmp	ecx, edx
-        jl	L_AES_GCM_decrypt_avx1_calc_aad_loop
-        vmovdqu	xmm8, OWORD PTR [rsp]
-        add	rsp, 16
-        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm8
-        ; ghash_gfmul_avx
-        vpshufd	xmm1, xmm6, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm6, 17
-        vpclmulqdq	xmm0, xmm5, xmm6, 0
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm7, xmm0
-        vmovdqa	xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm7, xmm7, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm6, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm6, xmm6, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm6, xmm6, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm6, xmm6, xmm1
-        vpslld	xmm0, xmm7, 31
-        vpslld	xmm1, xmm7, 30
-        vpslld	xmm2, xmm7, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm7, xmm7, xmm0
-        vpsrld	xmm2, xmm7, 1
-        vpsrld	xmm3, xmm7, 2
-        vpsrld	xmm0, xmm7, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm6, xmm6, xmm2
-L_AES_GCM_decrypt_avx1_calc_aad_done:
-        ; Calculate counter and H
-        vpsrlq	xmm9, xmm5, 63
-        vpsllq	xmm8, xmm5, 1
-        vpslldq	xmm9, xmm9, 8
-        vpor	xmm8, xmm8, xmm9
-        vpshufd	xmm5, xmm5, 255
-        vpsrad	xmm5, xmm5, 31
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpand	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_mod2_128
-        vpaddd	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_one
-        vpxor	xmm5, xmm5, xmm8
-        vmovdqu	OWORD PTR [rsp+128], xmm4
-        xor	ebx, ebx
-        cmp	r9d, 128
-        mov	r13d, r9d
-        jl	L_AES_GCM_decrypt_avx1_done_128
-        and	r13d, 4294967168
-        vmovdqa	xmm2, xmm6
-        ; H ^ 1
-        vmovdqu	OWORD PTR [rsp], xmm5
-        ; H ^ 2
-        vpclmulqdq	xmm8, xmm5, xmm5, 0
-        vpclmulqdq	xmm0, xmm5, xmm5, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm0, xmm0, xmm14
-        vmovdqu	OWORD PTR [rsp+16], xmm0
-        ; H ^ 3
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm5, 78
-        vpshufd	xmm10, xmm0, 78
-        vpclmulqdq	xmm11, xmm0, xmm5, 17
-        vpclmulqdq	xmm8, xmm0, xmm5, 0
-        vpxor	xmm9, xmm9, xmm5
-        vpxor	xmm10, xmm10, xmm0
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm1, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm1, xmm1, xmm14
-        vmovdqu	OWORD PTR [rsp+32], xmm1
-        ; H ^ 4
-        vpclmulqdq	xmm8, xmm0, xmm0, 0
-        vpclmulqdq	xmm3, xmm0, xmm0, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm3, xmm3, xmm14
-        vmovdqu	OWORD PTR [rsp+48], xmm3
-        ; H ^ 5
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm0, 78
-        vpshufd	xmm10, xmm1, 78
-        vpclmulqdq	xmm11, xmm1, xmm0, 17
-        vpclmulqdq	xmm8, xmm1, xmm0, 0
-        vpxor	xmm9, xmm9, xmm0
-        vpxor	xmm10, xmm10, xmm1
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm7, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+64], xmm7
-        ; H ^ 6
-        vpclmulqdq	xmm8, xmm1, xmm1, 0
-        vpclmulqdq	xmm7, xmm1, xmm1, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+80], xmm7
-        ; H ^ 7
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm1, 78
-        vpshufd	xmm10, xmm3, 78
-        vpclmulqdq	xmm11, xmm3, xmm1, 17
-        vpclmulqdq	xmm8, xmm3, xmm1, 0
-        vpxor	xmm9, xmm9, xmm1
-        vpxor	xmm10, xmm10, xmm3
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm7, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+96], xmm7
-        ; H ^ 8
-        vpclmulqdq	xmm8, xmm3, xmm3, 0
-        vpclmulqdq	xmm7, xmm3, xmm3, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+112], xmm7
-L_AES_GCM_decrypt_avx1_ghash_128:
-        lea	rcx, QWORD PTR [rdi+rbx]
-        lea	rdx, QWORD PTR [rsi+rbx]
-        vmovdqu	xmm0, OWORD PTR [rsp+128]
-        vmovdqa	xmm1, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpshufb	xmm8, xmm0, xmm1
-        vpaddd	xmm9, xmm0, OWORD PTR L_avx1_aes_gcm_one
-        vpshufb	xmm9, xmm9, xmm1
-        vpaddd	xmm10, xmm0, OWORD PTR L_avx1_aes_gcm_two
-        vpshufb	xmm10, xmm10, xmm1
-        vpaddd	xmm11, xmm0, OWORD PTR L_avx1_aes_gcm_three
-        vpshufb	xmm11, xmm11, xmm1
-        vpaddd	xmm12, xmm0, OWORD PTR L_avx1_aes_gcm_four
-        vpshufb	xmm12, xmm12, xmm1
-        vpaddd	xmm13, xmm0, OWORD PTR L_avx1_aes_gcm_five
-        vpshufb	xmm13, xmm13, xmm1
-        vpaddd	xmm14, xmm0, OWORD PTR L_avx1_aes_gcm_six
-        vpshufb	xmm14, xmm14, xmm1
-        vpaddd	xmm15, xmm0, OWORD PTR L_avx1_aes_gcm_seven
-        vpshufb	xmm15, xmm15, xmm1
-        vpaddd	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_eight
-        vmovdqa	xmm7, OWORD PTR [r15]
-        vmovdqu	OWORD PTR [rsp+128], xmm0
-        vpxor	xmm8, xmm8, xmm7
-        vpxor	xmm9, xmm9, xmm7
-        vpxor	xmm10, xmm10, xmm7
-        vpxor	xmm11, xmm11, xmm7
-        vpxor	xmm12, xmm12, xmm7
-        vpxor	xmm13, xmm13, xmm7
-        vpxor	xmm14, xmm14, xmm7
-        vpxor	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsp+112]
-        vmovdqu	xmm0, OWORD PTR [rcx]
-        vaesenc	xmm8, xmm8, [r15+16]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm0, xmm0, xmm2
-        vpshufd	xmm1, xmm7, 78
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm3, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+16]
-        vaesenc	xmm10, xmm10, [r15+16]
-        vpclmulqdq	xmm2, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+16]
-        vaesenc	xmm12, xmm12, [r15+16]
-        vpclmulqdq	xmm1, xmm1, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+16]
-        vaesenc	xmm14, xmm14, [r15+16]
-        vaesenc	xmm15, xmm15, [r15+16]
-        vpxor	xmm1, xmm1, xmm2
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqu	xmm7, OWORD PTR [rsp+96]
-        vmovdqu	xmm0, OWORD PTR [rcx+16]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [r15+32]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+32]
-        vaesenc	xmm10, xmm10, [r15+32]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+32]
-        vaesenc	xmm12, xmm12, [r15+32]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+32]
-        vaesenc	xmm14, xmm14, [r15+32]
-        vaesenc	xmm15, xmm15, [r15+32]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+80]
-        vmovdqu	xmm0, OWORD PTR [rcx+32]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [r15+48]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+48]
-        vaesenc	xmm10, xmm10, [r15+48]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+48]
-        vaesenc	xmm12, xmm12, [r15+48]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+48]
-        vaesenc	xmm14, xmm14, [r15+48]
-        vaesenc	xmm15, xmm15, [r15+48]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+64]
-        vmovdqu	xmm0, OWORD PTR [rcx+48]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [r15+64]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+64]
-        vaesenc	xmm10, xmm10, [r15+64]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+64]
-        vaesenc	xmm12, xmm12, [r15+64]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+64]
-        vaesenc	xmm14, xmm14, [r15+64]
-        vaesenc	xmm15, xmm15, [r15+64]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+48]
-        vmovdqu	xmm0, OWORD PTR [rcx+64]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [r15+80]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+80]
-        vaesenc	xmm10, xmm10, [r15+80]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+80]
-        vaesenc	xmm12, xmm12, [r15+80]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+80]
-        vaesenc	xmm14, xmm14, [r15+80]
-        vaesenc	xmm15, xmm15, [r15+80]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+32]
-        vmovdqu	xmm0, OWORD PTR [rcx+80]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [r15+96]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+96]
-        vaesenc	xmm10, xmm10, [r15+96]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+96]
-        vaesenc	xmm12, xmm12, [r15+96]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+96]
-        vaesenc	xmm14, xmm14, [r15+96]
-        vaesenc	xmm15, xmm15, [r15+96]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vmovdqu	xmm0, OWORD PTR [rcx+96]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [r15+112]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+112]
-        vaesenc	xmm10, xmm10, [r15+112]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+112]
-        vaesenc	xmm12, xmm12, [r15+112]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+112]
-        vaesenc	xmm14, xmm14, [r15+112]
-        vaesenc	xmm15, xmm15, [r15+112]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp]
-        vmovdqu	xmm0, OWORD PTR [rcx+112]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [r15+128]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+128]
-        vaesenc	xmm10, xmm10, [r15+128]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+128]
-        vaesenc	xmm12, xmm12, [r15+128]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+128]
-        vaesenc	xmm14, xmm14, [r15+128]
-        vaesenc	xmm15, xmm15, [r15+128]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vpslldq	xmm5, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vaesenc	xmm8, xmm8, [r15+144]
-        vpxor	xmm2, xmm2, xmm5
-        vpxor	xmm3, xmm3, xmm1
-        vaesenc	xmm9, xmm9, [r15+144]
-        vpslld	xmm7, xmm2, 31
-        vpslld	xmm4, xmm2, 30
-        vpslld	xmm5, xmm2, 25
-        vaesenc	xmm10, xmm10, [r15+144]
-        vpxor	xmm7, xmm7, xmm4
-        vpxor	xmm7, xmm7, xmm5
-        vaesenc	xmm11, xmm11, [r15+144]
-        vpsrldq	xmm4, xmm7, 4
-        vpslldq	xmm7, xmm7, 12
-        vaesenc	xmm12, xmm12, [r15+144]
-        vpxor	xmm2, xmm2, xmm7
-        vpsrld	xmm5, xmm2, 1
-        vaesenc	xmm13, xmm13, [r15+144]
-        vpsrld	xmm1, xmm2, 2
-        vpsrld	xmm0, xmm2, 7
-        vaesenc	xmm14, xmm14, [r15+144]
-        vpxor	xmm5, xmm5, xmm1
-        vpxor	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, [r15+144]
-        vpxor	xmm5, xmm5, xmm4
-        vpxor	xmm2, xmm2, xmm5
-        vpxor	xmm2, xmm2, xmm3
-        cmp	r10d, 11
-        vmovdqa	xmm7, OWORD PTR [r15+160]
-        jl	L_AES_GCM_decrypt_avx1_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+176]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r10d, 13
-        vmovdqa	xmm7, OWORD PTR [r15+192]
-        jl	L_AES_GCM_decrypt_avx1_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+208]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+224]
-L_AES_GCM_decrypt_avx1_aesenc_128_ghash_avx_done:
-        vaesenclast	xmm8, xmm8, xmm7
-        vaesenclast	xmm9, xmm9, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx]
-        vmovdqu	xmm1, OWORD PTR [rcx+16]
-        vpxor	xmm8, xmm8, xmm0
-        vpxor	xmm9, xmm9, xmm1
-        vmovdqu	OWORD PTR [rdx], xmm8
-        vmovdqu	OWORD PTR [rdx+16], xmm9
-        vaesenclast	xmm10, xmm10, xmm7
-        vaesenclast	xmm11, xmm11, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+32]
-        vmovdqu	xmm1, OWORD PTR [rcx+48]
-        vpxor	xmm10, xmm10, xmm0
-        vpxor	xmm11, xmm11, xmm1
-        vmovdqu	OWORD PTR [rdx+32], xmm10
-        vmovdqu	OWORD PTR [rdx+48], xmm11
-        vaesenclast	xmm12, xmm12, xmm7
-        vaesenclast	xmm13, xmm13, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+64]
-        vmovdqu	xmm1, OWORD PTR [rcx+80]
-        vpxor	xmm12, xmm12, xmm0
-        vpxor	xmm13, xmm13, xmm1
-        vmovdqu	OWORD PTR [rdx+64], xmm12
-        vmovdqu	OWORD PTR [rdx+80], xmm13
-        vaesenclast	xmm14, xmm14, xmm7
-        vaesenclast	xmm15, xmm15, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+96]
-        vmovdqu	xmm1, OWORD PTR [rcx+112]
-        vpxor	xmm14, xmm14, xmm0
-        vpxor	xmm15, xmm15, xmm1
-        vmovdqu	OWORD PTR [rdx+96], xmm14
-        vmovdqu	OWORD PTR [rdx+112], xmm15
-        add	ebx, 128
-        cmp	ebx, r13d
-        jl	L_AES_GCM_decrypt_avx1_ghash_128
-        vmovdqa	xmm6, xmm2
-        vmovdqu	xmm5, OWORD PTR [rsp]
-L_AES_GCM_decrypt_avx1_done_128:
-        mov	edx, r9d
-        cmp	ebx, edx
-        jge	L_AES_GCM_decrypt_avx1_done_dec
-        mov	r13d, r9d
-        and	r13d, 4294967280
-        cmp	ebx, r13d
-        jge	L_AES_GCM_decrypt_avx1_last_block_done
-L_AES_GCM_decrypt_avx1_last_block_start:
-        vmovdqu	xmm13, OWORD PTR [rdi+rbx]
-        vmovdqa	xmm0, xmm5
-        vpshufb	xmm1, xmm13, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm1, xmm1, xmm6
-        vmovdqu	xmm9, OWORD PTR [rsp+128]
-        vpshufb	xmm8, xmm9, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpaddd	xmm9, xmm9, OWORD PTR L_avx1_aes_gcm_one
-        vmovdqu	OWORD PTR [rsp+128], xmm9
-        vpxor	xmm8, xmm8, [r15]
-        vpclmulqdq	xmm10, xmm1, xmm0, 16
-        vaesenc	xmm8, xmm8, [r15+16]
-        vaesenc	xmm8, xmm8, [r15+32]
-        vpclmulqdq	xmm11, xmm1, xmm0, 1
-        vaesenc	xmm8, xmm8, [r15+48]
-        vaesenc	xmm8, xmm8, [r15+64]
-        vpclmulqdq	xmm12, xmm1, xmm0, 0
-        vaesenc	xmm8, xmm8, [r15+80]
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vaesenc	xmm8, xmm8, [r15+96]
-        vpxor	xmm10, xmm10, xmm11
-        vpslldq	xmm2, xmm10, 8
-        vpsrldq	xmm10, xmm10, 8
-        vaesenc	xmm8, xmm8, [r15+112]
-        vpxor	xmm2, xmm2, xmm12
-        vpxor	xmm3, xmm1, xmm10
-        vmovdqa	xmm0, OWORD PTR L_avx1_aes_gcm_mod2_128
-        vpclmulqdq	xmm11, xmm2, xmm0, 16
-        vaesenc	xmm8, xmm8, [r15+128]
-        vpshufd	xmm10, xmm2, 78
-        vpxor	xmm10, xmm10, xmm11
-        vpclmulqdq	xmm11, xmm10, xmm0, 16
-        vaesenc	xmm8, xmm8, [r15+144]
-        vpshufd	xmm10, xmm10, 78
-        vpxor	xmm10, xmm10, xmm11
-        vpxor	xmm6, xmm10, xmm3
-        cmp	r10d, 11
-        vmovdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_decrypt_avx1_aesenc_gfmul_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [r15+176]
-        cmp	r10d, 13
-        vmovdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_decrypt_avx1_aesenc_gfmul_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [r15+208]
-        vmovdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_decrypt_avx1_aesenc_gfmul_last:
-        vaesenclast	xmm8, xmm8, xmm9
-        vmovdqa	xmm0, xmm13
-        vpxor	xmm8, xmm8, xmm0
-        vmovdqu	OWORD PTR [rsi+rbx], xmm8
-        add	ebx, 16
-        cmp	ebx, r13d
-        jl	L_AES_GCM_decrypt_avx1_last_block_start
-L_AES_GCM_decrypt_avx1_last_block_done:
-        mov	ecx, r9d
-        mov	edx, ecx
-        and	ecx, 15
-        jz	L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_done
-        vmovdqu	xmm4, OWORD PTR [rsp+128]
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpxor	xmm4, xmm4, [r15]
-        vaesenc	xmm4, xmm4, [r15+16]
-        vaesenc	xmm4, xmm4, [r15+32]
-        vaesenc	xmm4, xmm4, [r15+48]
-        vaesenc	xmm4, xmm4, [r15+64]
-        vaesenc	xmm4, xmm4, [r15+80]
-        vaesenc	xmm4, xmm4, [r15+96]
-        vaesenc	xmm4, xmm4, [r15+112]
-        vaesenc	xmm4, xmm4, [r15+128]
-        vaesenc	xmm4, xmm4, [r15+144]
-        cmp	r10d, 11
-        vmovdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last
-        vaesenc	xmm4, xmm4, xmm9
-        vaesenc	xmm4, xmm4, [r15+176]
-        cmp	r10d, 13
-        vmovdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last
-        vaesenc	xmm4, xmm4, xmm9
-        vaesenc	xmm4, xmm4, [r15+208]
-        vmovdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last:
-        vaesenclast	xmm4, xmm4, xmm9
-        sub	rsp, 32
-        xor	ecx, ecx
-        vmovdqu	OWORD PTR [rsp], xmm4
-        vpxor	xmm0, xmm0, xmm0
-        vmovdqu	OWORD PTR [rsp+16], xmm0
-L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_loop:
-        movzx	r13d, BYTE PTR [rdi+rbx]
-        mov	BYTE PTR [rsp+rcx+16], r13b
-        xor	r13b, BYTE PTR [rsp+rcx]
-        mov	BYTE PTR [rsi+rbx], r13b
-        inc	ebx
-        inc	ecx
-        cmp	ebx, edx
-        jl	L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_loop
-        vmovdqu	xmm4, OWORD PTR [rsp+16]
-        add	rsp, 32
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm4
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm5, 78
-        vpshufd	xmm10, xmm6, 78
-        vpclmulqdq	xmm11, xmm6, xmm5, 17
-        vpclmulqdq	xmm8, xmm6, xmm5, 0
-        vpxor	xmm9, xmm9, xmm5
-        vpxor	xmm10, xmm10, xmm6
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm6, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm6, xmm6, xmm14
-L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_done:
-L_AES_GCM_decrypt_avx1_done_dec:
-        mov	edx, r9d
-        mov	ecx, r11d
-        shl	rdx, 3
-        shl	rcx, 3
-        vmovq	xmm0, rdx
-        vmovq	xmm1, rcx
-        vpunpcklqdq	xmm0, xmm0, xmm1
-        vpxor	xmm6, xmm6, xmm0
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm5, 78
-        vpshufd	xmm10, xmm6, 78
-        vpclmulqdq	xmm11, xmm6, xmm5, 17
-        vpclmulqdq	xmm8, xmm6, xmm5, 0
-        vpxor	xmm9, xmm9, xmm5
-        vpxor	xmm10, xmm10, xmm6
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm6, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm6, xmm6, xmm14
-        vpshufb	xmm6, xmm6, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vmovdqu	xmm0, OWORD PTR [rsp+144]
-        vpxor	xmm0, xmm0, xmm6
-        cmp	r14d, 16
-        je	L_AES_GCM_decrypt_avx1_cmp_tag_16
-        sub	rsp, 16
-        xor	rcx, rcx
-        xor	rbx, rbx
-        vmovdqu	OWORD PTR [rsp], xmm0
-L_AES_GCM_decrypt_avx1_cmp_tag_loop:
-        movzx	r13d, BYTE PTR [rsp+rcx]
-        xor	r13b, BYTE PTR [r8+rcx]
-        or	bl, r13b
-        inc	ecx
-        cmp	ecx, r14d
-        jne	L_AES_GCM_decrypt_avx1_cmp_tag_loop
-        cmp	rbx, 0
-        sete	bl
-        add	rsp, 16
-        xor	rcx, rcx
-        jmp	L_AES_GCM_decrypt_avx1_cmp_tag_done
-L_AES_GCM_decrypt_avx1_cmp_tag_16:
-        vmovdqu	xmm1, OWORD PTR [r8]
-        vpcmpeqb	xmm0, xmm0, xmm1
-        vpmovmskb	rdx, xmm0
-        ; %%edx == 0xFFFF then return 1 else => return 0
-        xor	ebx, ebx
-        cmp	edx, 65535
-        sete	bl
-L_AES_GCM_decrypt_avx1_cmp_tag_done:
-        mov	DWORD PTR [rbp], ebx
-        vzeroupper
-        vmovdqu	xmm6, OWORD PTR [rsp+168]
-        vmovdqu	xmm7, OWORD PTR [rsp+184]
-        vmovdqu	xmm8, OWORD PTR [rsp+200]
-        vmovdqu	xmm9, OWORD PTR [rsp+216]
-        vmovdqu	xmm10, OWORD PTR [rsp+232]
-        vmovdqu	xmm11, OWORD PTR [rsp+248]
-        vmovdqu	xmm12, OWORD PTR [rsp+264]
-        vmovdqu	xmm13, OWORD PTR [rsp+280]
-        vmovdqu	xmm14, OWORD PTR [rsp+296]
-        vmovdqu	xmm15, OWORD PTR [rsp+312]
-        add	rsp, 328
-        pop	rbp
-        pop	r15
-        pop	r14
-        pop	rbx
-        pop	r12
-        pop	rsi
-        pop	rdi
-        pop	r13
-        ret
-AES_GCM_decrypt_avx1 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_init_avx1 PROC  ; AES-GCM setup: from the round keys and the IV, produce the hash key H, the first counter block and T = encrypted initial counter. Arguments arrive in rcx, rdx, r8, r9 and on the stack (Microsoft x64 layout).
-        push	rdi  ; rdi, rsi, r12 and r13 are overwritten below, so their incoming values are preserved
-        push	rsi
-        push	r12
-        push	r13
-        mov	rdi, rcx  ; rdi = arg1: base of the AES round keys (one 16-byte key per round, read with vmovdqa so it must be 16-byte aligned)
-        mov	rsi, rdx  ; esi = arg2: number of rounds; only compared against 11 and 13 below
-        mov	r10, r8  ; r10 = arg3: IV bytes
-        mov	r11d, r9d  ; r11d = arg4: IV length in bytes
-        mov	rax, QWORD PTR [rsp+72]  ; rax = arg5: where H is written (72 = 4 pushes + return address + 32 bytes of shadow space)
-        mov	r8, QWORD PTR [rsp+80]  ; r8 = arg6: where the counter block is written
-        mov	r9, QWORD PTR [rsp+88]  ; r9 = arg7: where T is written
-        sub	rsp, 80  ; the four XMM saves below occupy rsp+16 .. rsp+79; rsp+0 .. rsp+15 is not used
-        vmovdqu	OWORD PTR [rsp+16], xmm6  ; xmm6, xmm7, xmm8 and xmm15 are modified below and restored before ret
-        vmovdqu	OWORD PTR [rsp+32], xmm7
-        vmovdqu	OWORD PTR [rsp+48], xmm8
-        vmovdqu	OWORD PTR [rsp+64], xmm15
-        vpxor	xmm4, xmm4, xmm4  ; xmm4 = 0: becomes the counter (12-byte IV) or the running GHASH value (other lengths)
-        mov	edx, r11d  ; edx = working copy of the IV length
-        cmp	edx, 12
-        jne	L_AES_GCM_init_avx1_iv_not_12
-        ; # Calculate values when IV is 12 bytes
-        ; Set counter based on IV
-        mov	ecx, 16777216  ; 0x01000000: placed in dword 3 it yields the bytes 00 00 00 01
-        vmovq	xmm4, QWORD PTR [r10]  ; IV bytes 0-7
-        vpinsrd	xmm4, xmm4, DWORD PTR [r10+8], 2  ; IV bytes 8-11
-        vpinsrd	xmm4, xmm4, ecx, 3  ; xmm4 = IV followed by 00 00 00 01
-        ; H = Encrypt X(=0) and T = Encrypt counter
-        vmovdqa	xmm5, OWORD PTR [rdi]  ; xmm5 = round key 0, i.e. the all-zero block after the initial key XOR
-        vpxor	xmm1, xmm4, xmm5  ; xmm1 = counter XOR round key 0; both blocks are encrypted in lockstep
-        vmovdqa	xmm6, OWORD PTR [rdi+16]  ; rounds 1-9: load each round key once, apply it to both blocks
-        vaesenc	xmm5, xmm5, xmm6
-        vaesenc	xmm1, xmm1, xmm6
-        vmovdqa	xmm6, OWORD PTR [rdi+32]
-        vaesenc	xmm5, xmm5, xmm6
-        vaesenc	xmm1, xmm1, xmm6
-        vmovdqa	xmm6, OWORD PTR [rdi+48]
-        vaesenc	xmm5, xmm5, xmm6
-        vaesenc	xmm1, xmm1, xmm6
-        vmovdqa	xmm6, OWORD PTR [rdi+64]
-        vaesenc	xmm5, xmm5, xmm6
-        vaesenc	xmm1, xmm1, xmm6
-        vmovdqa	xmm6, OWORD PTR [rdi+80]
-        vaesenc	xmm5, xmm5, xmm6
-        vaesenc	xmm1, xmm1, xmm6
-        vmovdqa	xmm6, OWORD PTR [rdi+96]
-        vaesenc	xmm5, xmm5, xmm6
-        vaesenc	xmm1, xmm1, xmm6
-        vmovdqa	xmm6, OWORD PTR [rdi+112]
-        vaesenc	xmm5, xmm5, xmm6
-        vaesenc	xmm1, xmm1, xmm6
-        vmovdqa	xmm6, OWORD PTR [rdi+128]
-        vaesenc	xmm5, xmm5, xmm6
-        vaesenc	xmm1, xmm1, xmm6
-        vmovdqa	xmm6, OWORD PTR [rdi+144]
-        vaesenc	xmm5, xmm5, xmm6
-        vaesenc	xmm1, xmm1, xmm6
-        cmp	esi, 11  ; the vmovdqa in between leaves the flags alone, so the jl below tests this compare
-        vmovdqa	xmm6, OWORD PTR [rdi+160]
-        jl	L_AES_GCM_init_avx1_calc_iv_12_last  ; fewer than 11 rounds: the key at +160 is the final round key
-        vaesenc	xmm5, xmm5, xmm6
-        vaesenc	xmm1, xmm1, xmm6
-        vmovdqa	xmm6, OWORD PTR [rdi+176]
-        vaesenc	xmm5, xmm5, xmm6
-        vaesenc	xmm1, xmm1, xmm6
-        cmp	esi, 13
-        vmovdqa	xmm6, OWORD PTR [rdi+192]
-        jl	L_AES_GCM_init_avx1_calc_iv_12_last  ; fewer than 13 rounds: the key at +192 is the final round key
-        vaesenc	xmm5, xmm5, xmm6
-        vaesenc	xmm1, xmm1, xmm6
-        vmovdqa	xmm6, OWORD PTR [rdi+208]
-        vaesenc	xmm5, xmm5, xmm6
-        vaesenc	xmm1, xmm1, xmm6
-        vmovdqa	xmm6, OWORD PTR [rdi+224]  ; otherwise the key at +224 is the final round key
-L_AES_GCM_init_avx1_calc_iv_12_last:
-        vaesenclast	xmm5, xmm5, xmm6  ; xmm5 = H
-        vaesenclast	xmm1, xmm1, xmm6  ; xmm1 = T
-        vpshufb	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_bswap_mask  ; reorder the bytes of H with a mask defined outside this procedure (presumably a full 16-byte reversal)
-        vmovdqu	xmm15, xmm1  ; xmm15 carries T to the common exit
-        jmp	L_AES_GCM_init_avx1_iv_done
-L_AES_GCM_init_avx1_iv_not_12:
-        ; Calculate values when IV is not 12 bytes
-        ; H = Encrypt X(=0)
-        vmovdqa	xmm5, OWORD PTR [rdi]  ; round key 0 is the zero block after the initial key XOR
-        vaesenc	xmm5, xmm5, [rdi+16]  ; rounds 1-9 with the round keys taken straight from memory
-        vaesenc	xmm5, xmm5, [rdi+32]
-        vaesenc	xmm5, xmm5, [rdi+48]
-        vaesenc	xmm5, xmm5, [rdi+64]
-        vaesenc	xmm5, xmm5, [rdi+80]
-        vaesenc	xmm5, xmm5, [rdi+96]
-        vaesenc	xmm5, xmm5, [rdi+112]
-        vaesenc	xmm5, xmm5, [rdi+128]
-        vaesenc	xmm5, xmm5, [rdi+144]
-        cmp	esi, 11  ; same round-count dispatch as the 12-byte path: final key at +160, +192 or +224
-        vmovdqa	xmm8, OWORD PTR [rdi+160]
-        jl	L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last
-        vaesenc	xmm5, xmm5, xmm8
-        vaesenc	xmm5, xmm5, [rdi+176]
-        cmp	esi, 13
-        vmovdqa	xmm8, OWORD PTR [rdi+192]
-        jl	L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last
-        vaesenc	xmm5, xmm5, xmm8
-        vaesenc	xmm5, xmm5, [rdi+208]
-        vmovdqa	xmm8, OWORD PTR [rdi+224]
-L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last:
-        vaesenclast	xmm5, xmm5, xmm8  ; xmm5 = H
-        vpshufb	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_bswap_mask  ; same byte reordering of H as on the 12-byte path
-        ; Calc counter
-        ; Initialization vector
-        cmp	edx, 0
-        mov	rcx, 0  ; rcx = byte offset into the IV; mov keeps the flags from the cmp above
-        je	L_AES_GCM_init_avx1_calc_iv_done  ; empty IV: only the length block is hashed
-        cmp	edx, 16
-        jl	L_AES_GCM_init_avx1_calc_iv_lt16  ; shorter than one block: go straight to the padded tail
-        and	edx, 4294967280  ; 0xFFFFFFF0: edx = length rounded down to whole 16-byte blocks
-L_AES_GCM_init_avx1_calc_iv_16_loop:
-        vmovdqu	xmm7, OWORD PTR [r10+rcx]  ; next full 16-byte IV block
-        vpshufb	xmm7, xmm7, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm4, xmm4, xmm7  ; accumulate the block into the running hash
-        ; ghash_gfmul_avx: xmm4 = xmm4 * xmm5 (carry-less multiply, 1-bit left shift, then reduction to 128 bits); uses xmm0-xmm3 and xmm6 as scratch
-        vpshufd	xmm1, xmm4, 78  ; 78 = 0x4E swaps the two 64-bit halves
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm4, 17  ; 17 = 0x11: high qword times high qword
-        vpclmulqdq	xmm0, xmm5, xmm4, 0  ; 0: low qword times low qword
-        vpxor	xmm1, xmm1, xmm4  ; low qword of xmm1 = hi XOR lo of xmm4
-        vpxor	xmm2, xmm2, xmm5  ; low qword of xmm2 = hi XOR lo of xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0  ; product of the two folded halves
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3  ; xmm1 = middle term (Karatsuba: three multiplies instead of four)
-        vmovdqa	xmm6, xmm0  ; xmm6 = low 128 bits of the product
-        vmovdqa	xmm4, xmm3  ; xmm4 = high 128 bits of the product
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm6, xmm6, xmm2  ; low half of the middle term goes into the upper qword of xmm6
-        vpxor	xmm4, xmm4, xmm1  ; high half of the middle term goes into the lower qword of xmm4
-        vpsrld	xmm0, xmm6, 31  ; collect the top bit of every dword: these are the carries of the 1-bit shift
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm6, xmm6, 1  ; shift xmm4:xmm6 left by one bit, dword by dword
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12  ; carry out of the top dword of xmm6 crosses into xmm4
-        vpslldq	xmm0, xmm0, 4  ; remaining carries move up one dword
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm6, xmm6, xmm0
-        vpor	xmm4, xmm4, xmm1
-        vpslld	xmm0, xmm6, 31  ; reduction, first phase: combine xmm6 shifted left by 31, 30 and 25
-        vpslld	xmm1, xmm6, 30
-        vpslld	xmm2, xmm6, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4  ; part kept for the second phase
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm6, xmm6, xmm0
-        vpsrld	xmm2, xmm6, 1  ; reduction, second phase: combine xmm6 shifted right by 1, 2 and 7
-        vpsrld	xmm3, xmm6, 2
-        vpsrld	xmm0, xmm6, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm6
-        vpxor	xmm4, xmm4, xmm2  ; xmm4 = reduced 128-bit result
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_init_avx1_calc_iv_16_loop  ; loop while whole blocks remain
-        mov	edx, r11d  ; restore the unrounded IV length
-        cmp	ecx, edx
-        je	L_AES_GCM_init_avx1_calc_iv_done  ; length was a multiple of 16: no tail to process
-L_AES_GCM_init_avx1_calc_iv_lt16:
-        sub	rsp, 16  ; temporary 16-byte buffer on the stack for the partial block
-        vpxor	xmm7, xmm7, xmm7
-        xor	r13d, r13d  ; r13d = write index inside the buffer
-        vmovdqu	OWORD PTR [rsp], xmm7  ; zero the buffer so the unused tail bytes are 0
-L_AES_GCM_init_avx1_calc_iv_loop:
-        movzx	r12d, BYTE PTR [r10+rcx]  ; copy the remaining IV bytes one at a time
-        mov	BYTE PTR [rsp+r13], r12b
-        inc	ecx
-        inc	r13d
-        cmp	ecx, edx
-        jl	L_AES_GCM_init_avx1_calc_iv_loop
-        vmovdqu	xmm7, OWORD PTR [rsp]  ; xmm7 = zero-padded final block
-        add	rsp, 16  ; release the temporary buffer
-        vpshufb	xmm7, xmm7, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm4, xmm4, xmm7
-        ; ghash_gfmul_avx: xmm4 = xmm4 * xmm5, same instruction sequence as in the 16-byte loop
-        vpshufd	xmm1, xmm4, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpxor	xmm1, xmm1, xmm4
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm6, xmm0
-        vmovdqa	xmm4, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm6, xmm6, xmm2
-        vpxor	xmm4, xmm4, xmm1
-        vpsrld	xmm0, xmm6, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm6, xmm6, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm6, xmm6, xmm0
-        vpor	xmm4, xmm4, xmm1
-        vpslld	xmm0, xmm6, 31
-        vpslld	xmm1, xmm6, 30
-        vpslld	xmm2, xmm6, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm6, xmm6, xmm0
-        vpsrld	xmm2, xmm6, 1
-        vpsrld	xmm3, xmm6, 2
-        vpsrld	xmm0, xmm6, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm6
-        vpxor	xmm4, xmm4, xmm2
-L_AES_GCM_init_avx1_calc_iv_done:
-        ; Hash the IV length (in bits) as the last block to obtain the counter, then T = Encrypt counter
-        vpxor	xmm0, xmm0, xmm0
-        shl	edx, 3  ; bytes to bits; the 32-bit write also clears the upper half of rdx
-        vmovq	xmm0, rdx  ; length sits in the low qword, upper qword is zero
-        vpxor	xmm4, xmm4, xmm0
-        ; ghash_gfmul_avx: xmm4 = xmm4 * xmm5, same instruction sequence as in the 16-byte loop
-        vpshufd	xmm1, xmm4, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpxor	xmm1, xmm1, xmm4
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm6, xmm0
-        vmovdqa	xmm4, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm6, xmm6, xmm2
-        vpxor	xmm4, xmm4, xmm1
-        vpsrld	xmm0, xmm6, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm6, xmm6, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm6, xmm6, xmm0
-        vpor	xmm4, xmm4, xmm1
-        vpslld	xmm0, xmm6, 31
-        vpslld	xmm1, xmm6, 30
-        vpslld	xmm2, xmm6, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm6, xmm6, xmm0
-        vpsrld	xmm2, xmm6, 1
-        vpsrld	xmm3, xmm6, 2
-        vpsrld	xmm0, xmm6, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm6
-        vpxor	xmm4, xmm4, xmm2
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask  ; xmm4 = counter block derived from the hashed IV
-        ;   Encrypt counter
-        vmovdqa	xmm7, OWORD PTR [rdi]
-        vpxor	xmm7, xmm7, xmm4  ; initial key XOR with round key 0
-        vaesenc	xmm7, xmm7, [rdi+16]
-        vaesenc	xmm7, xmm7, [rdi+32]
-        vaesenc	xmm7, xmm7, [rdi+48]
-        vaesenc	xmm7, xmm7, [rdi+64]
-        vaesenc	xmm7, xmm7, [rdi+80]
-        vaesenc	xmm7, xmm7, [rdi+96]
-        vaesenc	xmm7, xmm7, [rdi+112]
-        vaesenc	xmm7, xmm7, [rdi+128]
-        vaesenc	xmm7, xmm7, [rdi+144]
-        cmp	esi, 11  ; round-count dispatch: final key at +160, +192 or +224
-        vmovdqa	xmm8, OWORD PTR [rdi+160]
-        jl	L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last
-        vaesenc	xmm7, xmm7, xmm8
-        vaesenc	xmm7, xmm7, [rdi+176]
-        cmp	esi, 13
-        vmovdqa	xmm8, OWORD PTR [rdi+192]
-        jl	L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last
-        vaesenc	xmm7, xmm7, xmm8
-        vaesenc	xmm7, xmm7, [rdi+208]
-        vmovdqa	xmm8, OWORD PTR [rdi+224]
-L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last:
-        vaesenclast	xmm7, xmm7, xmm8
-        vmovdqu	xmm15, xmm7  ; xmm15 = T, as on the 12-byte path
-L_AES_GCM_init_avx1_iv_done:
-        vmovdqa	OWORD PTR [r9], xmm15  ; write T (aligned store: arg7 must be 16-byte aligned)
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_epi64  ; reorder the counter with a mask defined outside this procedure
-        vpaddd	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_one  ; step the counter using the external constant (presumably +1 in one dword lane)
-        vmovdqa	OWORD PTR [rax], xmm5  ; write H (aligned store: arg5 must be 16-byte aligned)
-        vmovdqa	OWORD PTR [r8], xmm4  ; write the counter (aligned store: arg6 must be 16-byte aligned)
-        vzeroupper
-        vmovdqu	xmm6, OWORD PTR [rsp+16]  ; restore the XMM registers saved on entry
-        vmovdqu	xmm7, OWORD PTR [rsp+32]
-        vmovdqu	xmm8, OWORD PTR [rsp+48]
-        vmovdqu	xmm15, OWORD PTR [rsp+64]
-        add	rsp, 80
-        pop	r13  ; pops mirror the pushes at entry in reverse order
-        pop	r12
-        pop	rsi
-        pop	rdi
-        ret
-AES_GCM_init_avx1 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_aad_update_avx1 PROC  ; Folds a run of 16-byte blocks into a running GHASH value. rcx = data, edx = byte count, r8 = running hash (read and written), r9 = hash key.
-        mov	rax, rcx  ; rax = data pointer, which frees rcx for use as the byte offset
-        sub	rsp, 32  ; space to preserve xmm6 and xmm7, both modified below
-        vmovdqu	OWORD PTR [rsp], xmm6
-        vmovdqu	OWORD PTR [rsp+16], xmm7
-        vmovdqa	xmm5, OWORD PTR [r8]  ; xmm5 = running hash (aligned load: r8 must be 16-byte aligned)
-        vmovdqa	xmm6, OWORD PTR [r9]  ; xmm6 = hash key (aligned load: r9 must be 16-byte aligned)
-        xor	ecx, ecx  ; ecx = byte offset into the data
-L_AES_GCM_aad_update_avx1_16_loop:
-        vmovdqu	xmm7, OWORD PTR [rax+rcx]  ; NOTE(review): the loop tests its bound at the bottom, so 16 bytes are read even when edx is 0 - presumably callers pass at least one block; confirm
-        vpshufb	xmm7, xmm7, OWORD PTR L_avx1_aes_gcm_bswap_mask  ; reorder bytes with a mask defined outside this procedure (presumably a full 16-byte reversal)
-        vpxor	xmm5, xmm5, xmm7  ; accumulate the block into the running hash
-        ; ghash_gfmul_avx: xmm5 = xmm5 * xmm6 (carry-less multiply, 1-bit left shift, then reduction to 128 bits); uses xmm0-xmm4 as scratch
-        vpshufd	xmm1, xmm5, 78  ; 78 = 0x4E swaps the two 64-bit halves
-        vpshufd	xmm2, xmm6, 78
-        vpclmulqdq	xmm3, xmm6, xmm5, 17  ; 17 = 0x11: high qword times high qword
-        vpclmulqdq	xmm0, xmm6, xmm5, 0  ; 0: low qword times low qword
-        vpxor	xmm1, xmm1, xmm5
-        vpxor	xmm2, xmm2, xmm6
-        vpclmulqdq	xmm1, xmm1, xmm2, 0  ; (hi XOR lo) of one operand times (hi XOR lo) of the other
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3  ; xmm1 = middle term (Karatsuba: three multiplies instead of four)
-        vmovdqa	xmm4, xmm0  ; xmm4 = low 128 bits of the product
-        vmovdqa	xmm5, xmm3  ; xmm5 = high 128 bits of the product
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2  ; split the middle term across the two halves
-        vpxor	xmm5, xmm5, xmm1
-        vpsrld	xmm0, xmm4, 31  ; collect the top bit of every dword: these are the carries of the 1-bit shift
-        vpsrld	xmm1, xmm5, 31
-        vpslld	xmm4, xmm4, 1  ; shift xmm5:xmm4 left by one bit, dword by dword
-        vpslld	xmm5, xmm5, 1
-        vpsrldq	xmm2, xmm0, 12  ; carry out of the top dword of xmm4 crosses into xmm5
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm5, xmm5, xmm2
-        vpor	xmm4, xmm4, xmm0
-        vpor	xmm5, xmm5, xmm1
-        vpslld	xmm0, xmm4, 31  ; reduction, first phase: combine xmm4 shifted left by 31, 30 and 25
-        vpslld	xmm1, xmm4, 30
-        vpslld	xmm2, xmm4, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm4, xmm4, xmm0
-        vpsrld	xmm2, xmm4, 1  ; reduction, second phase: combine xmm4 shifted right by 1, 2 and 7
-        vpsrld	xmm3, xmm4, 2
-        vpsrld	xmm0, xmm4, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm4
-        vpxor	xmm5, xmm5, xmm2  ; xmm5 = updated running hash
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_aad_update_avx1_16_loop  ; signed compare of the offset against the byte count
-        vmovdqa	OWORD PTR [r8], xmm5  ; write the running hash back in place
-        vzeroupper
-        vmovdqu	xmm6, OWORD PTR [rsp]  ; restore the XMM registers saved on entry
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        add	rsp, 32
-        ret
-AES_GCM_aad_update_avx1 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_encrypt_block_avx1 PROC  ; Encrypts the current counter block, XORs it with 16 input bytes and advances the stored counter. rcx = round keys, edx = round count, r8 = output, r9 = input, fifth (stack) argument = counter.
-        mov	r10, r8  ; r10 = output pointer
-        mov	r11, r9  ; r11 = input pointer
-        mov	rax, QWORD PTR [rsp+40]  ; rax = fifth argument: counter block (40 = return address + 32 bytes of shadow space)
-        vmovdqu	xmm1, OWORD PTR [rax]  ; xmm1 = stored counter
-        vpshufb	xmm0, xmm1, OWORD PTR L_avx1_aes_gcm_bswap_epi64  ; xmm0 = counter reordered for encryption (mask defined outside this procedure)
-        vpaddd	xmm1, xmm1, OWORD PTR L_avx1_aes_gcm_one  ; step the stored counter using the external constant (presumably +1 in one dword lane)
-        vmovdqu	OWORD PTR [rax], xmm1  ; write the advanced counter back
-        vpxor	xmm0, xmm0, [rcx]  ; initial key XOR with round key 0
-        vaesenc	xmm0, xmm0, [rcx+16]  ; rounds 1-9 with the round keys taken straight from memory
-        vaesenc	xmm0, xmm0, [rcx+32]
-        vaesenc	xmm0, xmm0, [rcx+48]
-        vaesenc	xmm0, xmm0, [rcx+64]
-        vaesenc	xmm0, xmm0, [rcx+80]
-        vaesenc	xmm0, xmm0, [rcx+96]
-        vaesenc	xmm0, xmm0, [rcx+112]
-        vaesenc	xmm0, xmm0, [rcx+128]
-        vaesenc	xmm0, xmm0, [rcx+144]
-        cmp	edx, 11  ; the vmovdqa in between leaves the flags alone, so the jl below tests this compare
-        vmovdqa	xmm1, OWORD PTR [rcx+160]  ; aligned load: the round keys must be 16-byte aligned
-        jl	L_AES_GCM_encrypt_block_avx1_aesenc_block_last  ; fewer than 11 rounds: the key at +160 is the final round key
-        vaesenc	xmm0, xmm0, xmm1
-        vaesenc	xmm0, xmm0, [rcx+176]
-        cmp	edx, 13
-        vmovdqa	xmm1, OWORD PTR [rcx+192]
-        jl	L_AES_GCM_encrypt_block_avx1_aesenc_block_last  ; fewer than 13 rounds: the key at +192 is the final round key
-        vaesenc	xmm0, xmm0, xmm1
-        vaesenc	xmm0, xmm0, [rcx+208]
-        vmovdqa	xmm1, OWORD PTR [rcx+224]  ; otherwise the key at +224 is the final round key
-L_AES_GCM_encrypt_block_avx1_aesenc_block_last:
-        vaesenclast	xmm0, xmm0, xmm1  ; xmm0 = encrypted counter (keystream block)
-        vmovdqu	xmm1, OWORD PTR [r11]  ; 16 bytes of input
-        vpxor	xmm0, xmm0, xmm1  ; output = input XOR keystream
-        vmovdqu	OWORD PTR [r10], xmm0  ; store the 16 output bytes
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask  ; NOTE(review): this reordered copy stays in xmm0 and is never stored here - confirm whether any caller reads xmm0
-        vzeroupper
-        ret
-AES_GCM_encrypt_block_avx1 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_ghash_block_avx1 PROC  ; Folds one 16-byte block into a running GHASH value. rcx = data block, rdx = running hash (read and written), r8 = hash key.
-        sub	rsp, 32  ; space to preserve xmm6 and xmm7, both modified below
-        vmovdqu	OWORD PTR [rsp], xmm6
-        vmovdqu	OWORD PTR [rsp+16], xmm7
-        vmovdqa	xmm4, OWORD PTR [rdx]  ; xmm4 = running hash (aligned load: rdx must be 16-byte aligned)
-        vmovdqa	xmm5, OWORD PTR [r8]  ; xmm5 = hash key (aligned load: r8 must be 16-byte aligned)
-        vmovdqu	xmm7, OWORD PTR [rcx]  ; xmm7 = data block (unaligned load allowed)
-        vpshufb	xmm7, xmm7, OWORD PTR L_avx1_aes_gcm_bswap_mask  ; reorder bytes with a mask defined outside this procedure (presumably a full 16-byte reversal)
-        vpxor	xmm4, xmm4, xmm7  ; accumulate the block into the running hash
-        ; ghash_gfmul_avx: xmm4 = xmm4 * xmm5 (carry-less multiply, 1-bit left shift, then reduction to 128 bits); uses xmm0-xmm3 and xmm6 as scratch
-        vpshufd	xmm1, xmm4, 78  ; 78 = 0x4E swaps the two 64-bit halves
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm4, 17  ; 17 = 0x11: high qword times high qword
-        vpclmulqdq	xmm0, xmm5, xmm4, 0  ; 0: low qword times low qword
-        vpxor	xmm1, xmm1, xmm4
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0  ; (hi XOR lo) of one operand times (hi XOR lo) of the other
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3  ; xmm1 = middle term (Karatsuba: three multiplies instead of four)
-        vmovdqa	xmm6, xmm0  ; xmm6 = low 128 bits of the product
-        vmovdqa	xmm4, xmm3  ; xmm4 = high 128 bits of the product
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm6, xmm6, xmm2  ; split the middle term across the two halves
-        vpxor	xmm4, xmm4, xmm1
-        vpsrld	xmm0, xmm6, 31  ; collect the top bit of every dword: these are the carries of the 1-bit shift
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm6, xmm6, 1  ; shift xmm4:xmm6 left by one bit, dword by dword
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12  ; carry out of the top dword of xmm6 crosses into xmm4
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm6, xmm6, xmm0
-        vpor	xmm4, xmm4, xmm1
-        vpslld	xmm0, xmm6, 31  ; reduction, first phase: combine xmm6 shifted left by 31, 30 and 25
-        vpslld	xmm1, xmm6, 30
-        vpslld	xmm2, xmm6, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm6, xmm6, xmm0
-        vpsrld	xmm2, xmm6, 1  ; reduction, second phase: combine xmm6 shifted right by 1, 2 and 7
-        vpsrld	xmm3, xmm6, 2
-        vpsrld	xmm0, xmm6, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm6
-        vpxor	xmm4, xmm4, xmm2  ; xmm4 = updated running hash
-        vmovdqa	OWORD PTR [rdx], xmm4  ; write the running hash back in place
-        vzeroupper
-        vmovdqu	xmm6, OWORD PTR [rsp]  ; restore the XMM registers saved on entry
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        add	rsp, 32
-        ret
-AES_GCM_ghash_block_avx1 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_encrypt_update_avx1 PROC
-        push	r13
-        push	r12
-        push	r14
-        push	r15
-        push	rdi
-        mov	rax, rcx
-        mov	r10, r8
-        mov	r8d, edx
-        mov	r11, r9
-        mov	r9d, DWORD PTR [rsp+80]
-        mov	r12, QWORD PTR [rsp+88]
-        mov	r14, QWORD PTR [rsp+96]
-        mov	r15, QWORD PTR [rsp+104]
-        sub	rsp, 320
-        vmovdqu	OWORD PTR [rsp+160], xmm6
-        vmovdqu	OWORD PTR [rsp+176], xmm7
-        vmovdqu	OWORD PTR [rsp+192], xmm8
-        vmovdqu	OWORD PTR [rsp+208], xmm9
-        vmovdqu	OWORD PTR [rsp+224], xmm10
-        vmovdqu	OWORD PTR [rsp+240], xmm11
-        vmovdqu	OWORD PTR [rsp+256], xmm12
-        vmovdqu	OWORD PTR [rsp+272], xmm13
-        vmovdqu	OWORD PTR [rsp+288], xmm14
-        vmovdqu	OWORD PTR [rsp+304], xmm15
-        vmovdqa	xmm6, OWORD PTR [r12]
-        vmovdqa	xmm5, OWORD PTR [r14]
-        vpsrlq	xmm9, xmm5, 63
-        vpsllq	xmm8, xmm5, 1
-        vpslldq	xmm9, xmm9, 8
-        vpor	xmm8, xmm8, xmm9
-        vpshufd	xmm5, xmm5, 255
-        vpsrad	xmm5, xmm5, 31
-        vpand	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_mod2_128
-        vpxor	xmm5, xmm5, xmm8
-        xor	edi, edi
-        cmp	r9d, 128
-        mov	r13d, r9d
-        jl	L_AES_GCM_encrypt_update_avx1_done_128
-        and	r13d, 4294967168
-        vmovdqa	xmm2, xmm6
-        ; H ^ 1
-        vmovdqu	OWORD PTR [rsp], xmm5
-        ; H ^ 2
-        vpclmulqdq	xmm8, xmm5, xmm5, 0
-        vpclmulqdq	xmm0, xmm5, xmm5, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm0, xmm0, xmm14
-        vmovdqu	OWORD PTR [rsp+16], xmm0
-        ; H ^ 3
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm5, 78
-        vpshufd	xmm10, xmm0, 78
-        vpclmulqdq	xmm11, xmm0, xmm5, 17
-        vpclmulqdq	xmm8, xmm0, xmm5, 0
-        vpxor	xmm9, xmm9, xmm5
-        vpxor	xmm10, xmm10, xmm0
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm1, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm1, xmm1, xmm14
-        vmovdqu	OWORD PTR [rsp+32], xmm1
-        ; H ^ 4
-        vpclmulqdq	xmm8, xmm0, xmm0, 0
-        vpclmulqdq	xmm3, xmm0, xmm0, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm3, xmm3, xmm14
-        vmovdqu	OWORD PTR [rsp+48], xmm3
-        ; H ^ 5
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm0, 78
-        vpshufd	xmm10, xmm1, 78
-        vpclmulqdq	xmm11, xmm1, xmm0, 17
-        vpclmulqdq	xmm8, xmm1, xmm0, 0
-        vpxor	xmm9, xmm9, xmm0
-        vpxor	xmm10, xmm10, xmm1
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm7, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+64], xmm7
-        ; H ^ 6
-        vpclmulqdq	xmm8, xmm1, xmm1, 0
-        vpclmulqdq	xmm7, xmm1, xmm1, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+80], xmm7
-        ; H ^ 7
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm1, 78
-        vpshufd	xmm10, xmm3, 78
-        vpclmulqdq	xmm11, xmm3, xmm1, 17
-        vpclmulqdq	xmm8, xmm3, xmm1, 0
-        vpxor	xmm9, xmm9, xmm1
-        vpxor	xmm10, xmm10, xmm3
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm7, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+96], xmm7
-        ; H ^ 8
-        vpclmulqdq	xmm8, xmm3, xmm3, 0
-        vpclmulqdq	xmm7, xmm3, xmm3, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+112], xmm7
-        ; First 128 bytes of input
-        vmovdqu	xmm0, OWORD PTR [r15]
-        vmovdqa	xmm1, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpshufb	xmm8, xmm0, xmm1
-        vpaddd	xmm9, xmm0, OWORD PTR L_avx1_aes_gcm_one
-        vpshufb	xmm9, xmm9, xmm1
-        vpaddd	xmm10, xmm0, OWORD PTR L_avx1_aes_gcm_two
-        vpshufb	xmm10, xmm10, xmm1
-        vpaddd	xmm11, xmm0, OWORD PTR L_avx1_aes_gcm_three
-        vpshufb	xmm11, xmm11, xmm1
-        vpaddd	xmm12, xmm0, OWORD PTR L_avx1_aes_gcm_four
-        vpshufb	xmm12, xmm12, xmm1
-        vpaddd	xmm13, xmm0, OWORD PTR L_avx1_aes_gcm_five
-        vpshufb	xmm13, xmm13, xmm1
-        vpaddd	xmm14, xmm0, OWORD PTR L_avx1_aes_gcm_six
-        vpshufb	xmm14, xmm14, xmm1
-        vpaddd	xmm15, xmm0, OWORD PTR L_avx1_aes_gcm_seven
-        vpshufb	xmm15, xmm15, xmm1
-        vpaddd	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_eight
-        vmovdqa	xmm7, OWORD PTR [rax]
-        vmovdqu	OWORD PTR [r15], xmm0
-        vpxor	xmm8, xmm8, xmm7
-        vpxor	xmm9, xmm9, xmm7
-        vpxor	xmm10, xmm10, xmm7
-        vpxor	xmm11, xmm11, xmm7
-        vpxor	xmm12, xmm12, xmm7
-        vpxor	xmm13, xmm13, xmm7
-        vpxor	xmm14, xmm14, xmm7
-        vpxor	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+16]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+32]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+48]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+64]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+80]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+96]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+112]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+128]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+144]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r8d, 11
-        vmovdqa	xmm7, OWORD PTR [rax+160]
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_128_enc_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+176]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r8d, 13
-        vmovdqa	xmm7, OWORD PTR [rax+192]
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_128_enc_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+208]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+224]
-L_AES_GCM_encrypt_update_avx1_aesenc_128_enc_done:
-        vaesenclast	xmm8, xmm8, xmm7
-        vaesenclast	xmm9, xmm9, xmm7
-        vmovdqu	xmm0, OWORD PTR [r11]
-        vmovdqu	xmm1, OWORD PTR [r11+16]
-        vpxor	xmm8, xmm8, xmm0
-        vpxor	xmm9, xmm9, xmm1
-        vmovdqu	OWORD PTR [r10], xmm8
-        vmovdqu	OWORD PTR [r10+16], xmm9
-        vaesenclast	xmm10, xmm10, xmm7
-        vaesenclast	xmm11, xmm11, xmm7
-        vmovdqu	xmm0, OWORD PTR [r11+32]
-        vmovdqu	xmm1, OWORD PTR [r11+48]
-        vpxor	xmm10, xmm10, xmm0
-        vpxor	xmm11, xmm11, xmm1
-        vmovdqu	OWORD PTR [r10+32], xmm10
-        vmovdqu	OWORD PTR [r10+48], xmm11
-        vaesenclast	xmm12, xmm12, xmm7
-        vaesenclast	xmm13, xmm13, xmm7
-        vmovdqu	xmm0, OWORD PTR [r11+64]
-        vmovdqu	xmm1, OWORD PTR [r11+80]
-        vpxor	xmm12, xmm12, xmm0
-        vpxor	xmm13, xmm13, xmm1
-        vmovdqu	OWORD PTR [r10+64], xmm12
-        vmovdqu	OWORD PTR [r10+80], xmm13
-        vaesenclast	xmm14, xmm14, xmm7
-        vaesenclast	xmm15, xmm15, xmm7
-        vmovdqu	xmm0, OWORD PTR [r11+96]
-        vmovdqu	xmm1, OWORD PTR [r11+112]
-        vpxor	xmm14, xmm14, xmm0
-        vpxor	xmm15, xmm15, xmm1
-        vmovdqu	OWORD PTR [r10+96], xmm14
-        vmovdqu	OWORD PTR [r10+112], xmm15
-        cmp	r13d, 128
-        mov	edi, 128
-        jle	L_AES_GCM_encrypt_update_avx1_end_128
-        ; More 128 bytes of input
-L_AES_GCM_encrypt_update_avx1_ghash_128:
-        lea	rcx, QWORD PTR [r11+rdi]
-        lea	rdx, QWORD PTR [r10+rdi]
-        vmovdqu	xmm0, OWORD PTR [r15]
-        vmovdqa	xmm1, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpshufb	xmm8, xmm0, xmm1
-        vpaddd	xmm9, xmm0, OWORD PTR L_avx1_aes_gcm_one
-        vpshufb	xmm9, xmm9, xmm1
-        vpaddd	xmm10, xmm0, OWORD PTR L_avx1_aes_gcm_two
-        vpshufb	xmm10, xmm10, xmm1
-        vpaddd	xmm11, xmm0, OWORD PTR L_avx1_aes_gcm_three
-        vpshufb	xmm11, xmm11, xmm1
-        vpaddd	xmm12, xmm0, OWORD PTR L_avx1_aes_gcm_four
-        vpshufb	xmm12, xmm12, xmm1
-        vpaddd	xmm13, xmm0, OWORD PTR L_avx1_aes_gcm_five
-        vpshufb	xmm13, xmm13, xmm1
-        vpaddd	xmm14, xmm0, OWORD PTR L_avx1_aes_gcm_six
-        vpshufb	xmm14, xmm14, xmm1
-        vpaddd	xmm15, xmm0, OWORD PTR L_avx1_aes_gcm_seven
-        vpshufb	xmm15, xmm15, xmm1
-        vpaddd	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_eight
-        vmovdqa	xmm7, OWORD PTR [rax]
-        vmovdqu	OWORD PTR [r15], xmm0
-        vpxor	xmm8, xmm8, xmm7
-        vpxor	xmm9, xmm9, xmm7
-        vpxor	xmm10, xmm10, xmm7
-        vpxor	xmm11, xmm11, xmm7
-        vpxor	xmm12, xmm12, xmm7
-        vpxor	xmm13, xmm13, xmm7
-        vpxor	xmm14, xmm14, xmm7
-        vpxor	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsp+112]
-        vmovdqu	xmm0, OWORD PTR [rdx+-128]
-        vaesenc	xmm8, xmm8, [rax+16]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm0, xmm0, xmm2
-        vpshufd	xmm1, xmm7, 78
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm3, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+16]
-        vaesenc	xmm10, xmm10, [rax+16]
-        vpclmulqdq	xmm2, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+16]
-        vaesenc	xmm12, xmm12, [rax+16]
-        vpclmulqdq	xmm1, xmm1, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+16]
-        vaesenc	xmm14, xmm14, [rax+16]
-        vaesenc	xmm15, xmm15, [rax+16]
-        vpxor	xmm1, xmm1, xmm2
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqu	xmm7, OWORD PTR [rsp+96]
-        vmovdqu	xmm0, OWORD PTR [rdx+-112]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [rax+32]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+32]
-        vaesenc	xmm10, xmm10, [rax+32]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+32]
-        vaesenc	xmm12, xmm12, [rax+32]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+32]
-        vaesenc	xmm14, xmm14, [rax+32]
-        vaesenc	xmm15, xmm15, [rax+32]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+80]
-        vmovdqu	xmm0, OWORD PTR [rdx+-96]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [rax+48]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+48]
-        vaesenc	xmm10, xmm10, [rax+48]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+48]
-        vaesenc	xmm12, xmm12, [rax+48]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+48]
-        vaesenc	xmm14, xmm14, [rax+48]
-        vaesenc	xmm15, xmm15, [rax+48]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+64]
-        vmovdqu	xmm0, OWORD PTR [rdx+-80]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [rax+64]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+64]
-        vaesenc	xmm10, xmm10, [rax+64]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+64]
-        vaesenc	xmm12, xmm12, [rax+64]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+64]
-        vaesenc	xmm14, xmm14, [rax+64]
-        vaesenc	xmm15, xmm15, [rax+64]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+48]
-        vmovdqu	xmm0, OWORD PTR [rdx+-64]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [rax+80]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+80]
-        vaesenc	xmm10, xmm10, [rax+80]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+80]
-        vaesenc	xmm12, xmm12, [rax+80]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+80]
-        vaesenc	xmm14, xmm14, [rax+80]
-        vaesenc	xmm15, xmm15, [rax+80]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+32]
-        vmovdqu	xmm0, OWORD PTR [rdx+-48]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [rax+96]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+96]
-        vaesenc	xmm10, xmm10, [rax+96]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+96]
-        vaesenc	xmm12, xmm12, [rax+96]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+96]
-        vaesenc	xmm14, xmm14, [rax+96]
-        vaesenc	xmm15, xmm15, [rax+96]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vmovdqu	xmm0, OWORD PTR [rdx+-32]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [rax+112]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+112]
-        vaesenc	xmm10, xmm10, [rax+112]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+112]
-        vaesenc	xmm12, xmm12, [rax+112]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+112]
-        vaesenc	xmm14, xmm14, [rax+112]
-        vaesenc	xmm15, xmm15, [rax+112]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp]
-        vmovdqu	xmm0, OWORD PTR [rdx+-16]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [rax+128]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+128]
-        vaesenc	xmm10, xmm10, [rax+128]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+128]
-        vaesenc	xmm12, xmm12, [rax+128]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+128]
-        vaesenc	xmm14, xmm14, [rax+128]
-        vaesenc	xmm15, xmm15, [rax+128]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vpslldq	xmm5, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vaesenc	xmm8, xmm8, [rax+144]
-        vpxor	xmm2, xmm2, xmm5
-        vpxor	xmm3, xmm3, xmm1
-        vaesenc	xmm9, xmm9, [rax+144]
-        vpslld	xmm7, xmm2, 31
-        vpslld	xmm4, xmm2, 30
-        vpslld	xmm5, xmm2, 25
-        vaesenc	xmm10, xmm10, [rax+144]
-        vpxor	xmm7, xmm7, xmm4
-        vpxor	xmm7, xmm7, xmm5
-        vaesenc	xmm11, xmm11, [rax+144]
-        vpsrldq	xmm4, xmm7, 4
-        vpslldq	xmm7, xmm7, 12
-        vaesenc	xmm12, xmm12, [rax+144]
-        vpxor	xmm2, xmm2, xmm7
-        vpsrld	xmm5, xmm2, 1
-        vaesenc	xmm13, xmm13, [rax+144]
-        vpsrld	xmm1, xmm2, 2
-        vpsrld	xmm0, xmm2, 7
-        vaesenc	xmm14, xmm14, [rax+144]
-        vpxor	xmm5, xmm5, xmm1
-        vpxor	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, [rax+144]
-        vpxor	xmm5, xmm5, xmm4
-        vpxor	xmm2, xmm2, xmm5
-        vpxor	xmm2, xmm2, xmm3
-        cmp	r8d, 11
-        vmovdqa	xmm7, OWORD PTR [rax+160]
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+176]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r8d, 13
-        vmovdqa	xmm7, OWORD PTR [rax+192]
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+208]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+224]
-L_AES_GCM_encrypt_update_avx1_aesenc_128_ghash_avx_done:
-        vaesenclast	xmm8, xmm8, xmm7
-        vaesenclast	xmm9, xmm9, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx]
-        vmovdqu	xmm1, OWORD PTR [rcx+16]
-        vpxor	xmm8, xmm8, xmm0
-        vpxor	xmm9, xmm9, xmm1
-        vmovdqu	OWORD PTR [rdx], xmm8
-        vmovdqu	OWORD PTR [rdx+16], xmm9
-        vaesenclast	xmm10, xmm10, xmm7
-        vaesenclast	xmm11, xmm11, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+32]
-        vmovdqu	xmm1, OWORD PTR [rcx+48]
-        vpxor	xmm10, xmm10, xmm0
-        vpxor	xmm11, xmm11, xmm1
-        vmovdqu	OWORD PTR [rdx+32], xmm10
-        vmovdqu	OWORD PTR [rdx+48], xmm11
-        vaesenclast	xmm12, xmm12, xmm7
-        vaesenclast	xmm13, xmm13, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+64]
-        vmovdqu	xmm1, OWORD PTR [rcx+80]
-        vpxor	xmm12, xmm12, xmm0
-        vpxor	xmm13, xmm13, xmm1
-        vmovdqu	OWORD PTR [rdx+64], xmm12
-        vmovdqu	OWORD PTR [rdx+80], xmm13
-        vaesenclast	xmm14, xmm14, xmm7
-        vaesenclast	xmm15, xmm15, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+96]
-        vmovdqu	xmm1, OWORD PTR [rcx+112]
-        vpxor	xmm14, xmm14, xmm0
-        vpxor	xmm15, xmm15, xmm1
-        vmovdqu	OWORD PTR [rdx+96], xmm14
-        vmovdqu	OWORD PTR [rdx+112], xmm15
-        add	edi, 128
-        cmp	edi, r13d
-        jl	L_AES_GCM_encrypt_update_avx1_ghash_128
-L_AES_GCM_encrypt_update_avx1_end_128:
-        vmovdqa	xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpshufb	xmm8, xmm8, xmm4
-        vpshufb	xmm9, xmm9, xmm4
-        vpshufb	xmm10, xmm10, xmm4
-        vpshufb	xmm11, xmm11, xmm4
-        vpxor	xmm8, xmm8, xmm2
-        vpshufb	xmm12, xmm12, xmm4
-        vpshufb	xmm13, xmm13, xmm4
-        vpshufb	xmm14, xmm14, xmm4
-        vpshufb	xmm15, xmm15, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp]
-        vmovdqu	xmm5, OWORD PTR [rsp+16]
-        ; ghash_gfmul_avx
-        vpshufd	xmm1, xmm15, 78
-        vpshufd	xmm2, xmm7, 78
-        vpclmulqdq	xmm3, xmm7, xmm15, 17
-        vpclmulqdq	xmm0, xmm7, xmm15, 0
-        vpxor	xmm1, xmm1, xmm15
-        vpxor	xmm2, xmm2, xmm7
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm4, xmm0
-        vmovdqa	xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        ; ghash_gfmul_xor_avx
-        vpshufd	xmm1, xmm14, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm14, 17
-        vpclmulqdq	xmm0, xmm5, xmm14, 0
-        vpxor	xmm1, xmm1, xmm14
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vpxor	xmm4, xmm4, xmm0
-        vpxor	xmm6, xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        vmovdqu	xmm7, OWORD PTR [rsp+32]
-        vmovdqu	xmm5, OWORD PTR [rsp+48]
-        ; ghash_gfmul_xor_avx
-        vpshufd	xmm1, xmm13, 78
-        vpshufd	xmm2, xmm7, 78
-        vpclmulqdq	xmm3, xmm7, xmm13, 17
-        vpclmulqdq	xmm0, xmm7, xmm13, 0
-        vpxor	xmm1, xmm1, xmm13
-        vpxor	xmm2, xmm2, xmm7
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vpxor	xmm4, xmm4, xmm0
-        vpxor	xmm6, xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        ; ghash_gfmul_xor_avx
-        vpshufd	xmm1, xmm12, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm12, 17
-        vpclmulqdq	xmm0, xmm5, xmm12, 0
-        vpxor	xmm1, xmm1, xmm12
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vpxor	xmm4, xmm4, xmm0
-        vpxor	xmm6, xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        vmovdqu	xmm7, OWORD PTR [rsp+64]
-        vmovdqu	xmm5, OWORD PTR [rsp+80]
-        ; ghash_gfmul_xor_avx
-        vpshufd	xmm1, xmm11, 78
-        vpshufd	xmm2, xmm7, 78
-        vpclmulqdq	xmm3, xmm7, xmm11, 17
-        vpclmulqdq	xmm0, xmm7, xmm11, 0
-        vpxor	xmm1, xmm1, xmm11
-        vpxor	xmm2, xmm2, xmm7
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vpxor	xmm4, xmm4, xmm0
-        vpxor	xmm6, xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        ; ghash_gfmul_xor_avx
-        vpshufd	xmm1, xmm10, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm10, 17
-        vpclmulqdq	xmm0, xmm5, xmm10, 0
-        vpxor	xmm1, xmm1, xmm10
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vpxor	xmm4, xmm4, xmm0
-        vpxor	xmm6, xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        vmovdqu	xmm7, OWORD PTR [rsp+96]
-        vmovdqu	xmm5, OWORD PTR [rsp+112]
-        ; ghash_gfmul_xor_avx
-        vpshufd	xmm1, xmm9, 78
-        vpshufd	xmm2, xmm7, 78
-        vpclmulqdq	xmm3, xmm7, xmm9, 17
-        vpclmulqdq	xmm0, xmm7, xmm9, 0
-        vpxor	xmm1, xmm1, xmm9
-        vpxor	xmm2, xmm2, xmm7
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vpxor	xmm4, xmm4, xmm0
-        vpxor	xmm6, xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        ; ghash_gfmul_xor_avx
-        vpshufd	xmm1, xmm8, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm8, 17
-        vpclmulqdq	xmm0, xmm5, xmm8, 0
-        vpxor	xmm1, xmm1, xmm8
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vpxor	xmm4, xmm4, xmm0
-        vpxor	xmm6, xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        vpslld	xmm0, xmm4, 31
-        vpslld	xmm1, xmm4, 30
-        vpslld	xmm2, xmm4, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm4, xmm4, xmm0
-        vpsrld	xmm2, xmm4, 1
-        vpsrld	xmm3, xmm4, 2
-        vpsrld	xmm0, xmm4, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm4
-        vpxor	xmm6, xmm6, xmm2
-        vmovdqu	xmm5, OWORD PTR [rsp]
-L_AES_GCM_encrypt_update_avx1_done_128:
-        mov	edx, r9d
-        cmp	edi, edx
-        jge	L_AES_GCM_encrypt_update_avx1_done_enc
-        mov	r13d, r9d
-        and	r13d, 4294967280
-        cmp	edi, r13d
-        jge	L_AES_GCM_encrypt_update_avx1_last_block_done
-        vmovdqu	xmm9, OWORD PTR [r15]
-        vpshufb	xmm8, xmm9, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpaddd	xmm9, xmm9, OWORD PTR L_avx1_aes_gcm_one
-        vmovdqu	OWORD PTR [r15], xmm9
-        vpxor	xmm8, xmm8, [rax]
-        vaesenc	xmm8, xmm8, [rax+16]
-        vaesenc	xmm8, xmm8, [rax+32]
-        vaesenc	xmm8, xmm8, [rax+48]
-        vaesenc	xmm8, xmm8, [rax+64]
-        vaesenc	xmm8, xmm8, [rax+80]
-        vaesenc	xmm8, xmm8, [rax+96]
-        vaesenc	xmm8, xmm8, [rax+112]
-        vaesenc	xmm8, xmm8, [rax+128]
-        vaesenc	xmm8, xmm8, [rax+144]
-        cmp	r8d, 11
-        vmovdqa	xmm9, OWORD PTR [rax+160]
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_block_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [rax+176]
-        cmp	r8d, 13
-        vmovdqa	xmm9, OWORD PTR [rax+192]
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_block_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [rax+208]
-        vmovdqa	xmm9, OWORD PTR [rax+224]
-L_AES_GCM_encrypt_update_avx1_aesenc_block_last:
-        vaesenclast	xmm8, xmm8, xmm9
-        vmovdqu	xmm9, OWORD PTR [r11+rdi]
-        vpxor	xmm8, xmm8, xmm9
-        vmovdqu	OWORD PTR [r10+rdi], xmm8
-        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm8
-        add	edi, 16
-        cmp	edi, r13d
-        jge	L_AES_GCM_encrypt_update_avx1_last_block_ghash
-L_AES_GCM_encrypt_update_avx1_last_block_start:
-        vmovdqu	xmm13, OWORD PTR [r11+rdi]
-        vmovdqu	xmm9, OWORD PTR [r15]
-        vpshufb	xmm8, xmm9, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpaddd	xmm9, xmm9, OWORD PTR L_avx1_aes_gcm_one
-        vmovdqu	OWORD PTR [r15], xmm9
-        vpxor	xmm8, xmm8, [rax]
-        vpclmulqdq	xmm10, xmm6, xmm5, 16
-        vaesenc	xmm8, xmm8, [rax+16]
-        vaesenc	xmm8, xmm8, [rax+32]
-        vpclmulqdq	xmm11, xmm6, xmm5, 1
-        vaesenc	xmm8, xmm8, [rax+48]
-        vaesenc	xmm8, xmm8, [rax+64]
-        vpclmulqdq	xmm12, xmm6, xmm5, 0
-        vaesenc	xmm8, xmm8, [rax+80]
-        vpclmulqdq	xmm1, xmm6, xmm5, 17
-        vaesenc	xmm8, xmm8, [rax+96]
-        vpxor	xmm10, xmm10, xmm11
-        vpslldq	xmm2, xmm10, 8
-        vpsrldq	xmm10, xmm10, 8
-        vaesenc	xmm8, xmm8, [rax+112]
-        vpxor	xmm2, xmm2, xmm12
-        vpxor	xmm3, xmm1, xmm10
-        vmovdqa	xmm0, OWORD PTR L_avx1_aes_gcm_mod2_128
-        vpclmulqdq	xmm11, xmm2, xmm0, 16
-        vaesenc	xmm8, xmm8, [rax+128]
-        vpshufd	xmm10, xmm2, 78
-        vpxor	xmm10, xmm10, xmm11
-        vpclmulqdq	xmm11, xmm10, xmm0, 16
-        vaesenc	xmm8, xmm8, [rax+144]
-        vpshufd	xmm10, xmm10, 78
-        vpxor	xmm10, xmm10, xmm11
-        vpxor	xmm6, xmm10, xmm3
-        cmp	r8d, 11
-        vmovdqa	xmm9, OWORD PTR [rax+160]
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [rax+176]
-        cmp	r8d, 13
-        vmovdqa	xmm9, OWORD PTR [rax+192]
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [rax+208]
-        vmovdqa	xmm9, OWORD PTR [rax+224]
-L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last:
-        vaesenclast	xmm8, xmm8, xmm9
-        vmovdqa	xmm0, xmm13
-        vpxor	xmm8, xmm8, xmm0
-        vmovdqu	OWORD PTR [r10+rdi], xmm8
-        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        add	edi, 16
-        vpxor	xmm6, xmm6, xmm8
-        cmp	edi, r13d
-        jl	L_AES_GCM_encrypt_update_avx1_last_block_start
-L_AES_GCM_encrypt_update_avx1_last_block_ghash:
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm5, 78
-        vpshufd	xmm10, xmm6, 78
-        vpclmulqdq	xmm11, xmm6, xmm5, 17
-        vpclmulqdq	xmm8, xmm6, xmm5, 0
-        vpxor	xmm9, xmm9, xmm5
-        vpxor	xmm10, xmm10, xmm6
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm6, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm6, xmm6, xmm14
-L_AES_GCM_encrypt_update_avx1_last_block_done:
-L_AES_GCM_encrypt_update_avx1_done_enc:
-        vmovdqa	OWORD PTR [r12], xmm6
-        vzeroupper
-        vmovdqu	xmm6, OWORD PTR [rsp+160]
-        vmovdqu	xmm7, OWORD PTR [rsp+176]
-        vmovdqu	xmm8, OWORD PTR [rsp+192]
-        vmovdqu	xmm9, OWORD PTR [rsp+208]
-        vmovdqu	xmm10, OWORD PTR [rsp+224]
-        vmovdqu	xmm11, OWORD PTR [rsp+240]
-        vmovdqu	xmm12, OWORD PTR [rsp+256]
-        vmovdqu	xmm13, OWORD PTR [rsp+272]
-        vmovdqu	xmm14, OWORD PTR [rsp+288]
-        vmovdqu	xmm15, OWORD PTR [rsp+304]
-        add	rsp, 320
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r12
-        pop	r13
-        ret
-AES_GCM_encrypt_update_avx1 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_encrypt_final_avx1 PROC                 ; folds a length block into the hash at [rcx], XORs with the block at [arg 7], stores r8d tag bytes at [rdx]
-        push	r13                             ; r13, r12, r14 are overwritten below and restored before ret
-        push	r12
-        push	r14
-        mov	rax, rcx                        ; rax = arg 1: pointer to the 16-byte running hash (presumably the GHASH state - confirm against caller)
-        mov	r10d, r9d                       ; r10d = arg 4: byte count for the low qword of the length block (presumably ciphertext length)
-        mov	r9, rdx                         ; r9 = arg 2: destination buffer for the tag; arg 3 (tag byte count) stays in r8d
-        mov	r11d, DWORD PTR [rsp+64]        ; arg 5 (offset 64 = 3 pushes + return address + 32-byte shadow space): byte count for the high qword (presumably AAD length)
-        mov	r12, QWORD PTR [rsp+72]         ; arg 6: pointer to the 16-byte hash key (presumably H)
-        mov	r14, QWORD PTR [rsp+80]         ; arg 7: pointer to the 16-byte block XORed into the final tag (presumably the encrypted initial counter)
-        sub	rsp, 144                        ; 16 bytes of scratch at [rsp] + 8 * 16 bytes to save xmm6-xmm13
-        vmovdqu	OWORD PTR [rsp+16], xmm6    ; save xmm6-xmm13 (callee-saved under the Microsoft x64 ABI) before using them
-        vmovdqu	OWORD PTR [rsp+32], xmm7
-        vmovdqu	OWORD PTR [rsp+48], xmm8
-        vmovdqu	OWORD PTR [rsp+64], xmm9
-        vmovdqu	OWORD PTR [rsp+80], xmm10
-        vmovdqu	OWORD PTR [rsp+96], xmm11
-        vmovdqu	OWORD PTR [rsp+112], xmm12
-        vmovdqu	OWORD PTR [rsp+128], xmm13
-        vmovdqa	xmm4, OWORD PTR [rax]       ; aligned loads: all three pointers must be 16-byte aligned
-        vmovdqa	xmm5, OWORD PTR [r12]
-        vmovdqa	xmm6, OWORD PTR [r14]
-        vpsrlq	xmm8, xmm5, 63                ; xmm8 = top bit of each qword of the hash key
-        vpsllq	xmm7, xmm5, 1                 ; xmm7 = each qword shifted left by one
-        vpslldq	xmm8, xmm8, 8               ; move the low qword's carry-out up to bit 0 of the high qword
-        vpor	xmm7, xmm7, xmm8                ; xmm7 = hash key shifted left by one as a 128-bit value
-        vpshufd	xmm5, xmm5, 255             ; broadcast the top dword of the original key
-        vpsrad	xmm5, xmm5, 31                ; all-ones mask when the key's top bit was set, else zero
-        vpand	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_mod2_128 ; keep the constant only when a bit was shifted out
-        vpxor	xmm5, xmm5, xmm7               ; xmm5 = shifted key, conditionally XORed with the mod2_128 constant
-        mov	edx, r10d                       ; 32-bit moves zero-extend the byte counts to 64 bits
-        mov	ecx, r11d
-        shl	rdx, 3                          ; bytes -> bits
-        shl	rcx, 3
-        vmovq	xmm0, rdx
-        vmovq	xmm1, rcx
-        vpunpcklqdq	xmm0, xmm0, xmm1        ; xmm0 = length block: low qword = arg 4 bits, high qword = arg 5 bits
-        vpxor	xmm4, xmm4, xmm0               ; fold the length block into the running hash
-        ; ghash_gfmul_red_avx: xmm4 = xmm4 * xmm5 via three carry-less multiplies (Karatsuba), then reduced to 128 bits
-        vpshufd	xmm8, xmm5, 78              ; imm 78 swaps the two qword halves
-        vpshufd	xmm9, xmm4, 78
-        vpclmulqdq	xmm10, xmm4, xmm5, 17    ; high qword * high qword
-        vpclmulqdq	xmm7, xmm4, xmm5, 0      ; low qword * low qword
-        vpxor	xmm8, xmm8, xmm5               ; low qword = hi ^ lo of xmm5
-        vpxor	xmm9, xmm9, xmm4               ; low qword = hi ^ lo of xmm4
-        vpclmulqdq	xmm8, xmm8, xmm9, 0      ; (hi ^ lo) * (hi ^ lo)
-        vpxor	xmm8, xmm8, xmm7
-        vpxor	xmm8, xmm8, xmm10              ; xmm8 = middle term of the product
-        vpslldq	xmm9, xmm8, 8
-        vpsrldq	xmm8, xmm8, 8
-        vpxor	xmm7, xmm7, xmm9               ; xmm7 = low 128 bits of the 256-bit product
-        vpxor	xmm4, xmm10, xmm8              ; xmm4 = high 128 bits of the 256-bit product
-        vpslld	xmm11, xmm7, 31               ; reduction, first phase: per-dword left shifts of the low half by 31, 30 and 25
-        vpslld	xmm12, xmm7, 30
-        vpslld	xmm13, xmm7, 25
-        vpxor	xmm11, xmm11, xmm12
-        vpxor	xmm11, xmm11, xmm13
-        vpsrldq	xmm12, xmm11, 4             ; xmm12 = part carried into the second phase
-        vpslldq	xmm11, xmm11, 12
-        vpxor	xmm7, xmm7, xmm11
-        vpsrld	xmm13, xmm7, 1                ; reduction, second phase: per-dword right shifts by 1, 2 and 7
-        vpsrld	xmm9, xmm7, 2
-        vpsrld	xmm8, xmm7, 7
-        vpxor	xmm13, xmm13, xmm9
-        vpxor	xmm13, xmm13, xmm8
-        vpxor	xmm13, xmm13, xmm12
-        vpxor	xmm13, xmm13, xmm7
-        vpxor	xmm4, xmm4, xmm13              ; xmm4 = high half ^ folded low half, i.e. the reduced product
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask ; reorder bytes with the bswap mask (a byte reversal, per its name)
-        vpxor	xmm0, xmm4, xmm6               ; xmm0 = final tag = hash ^ block loaded from arg 7
-        cmp	r8d, 16                         ; a full 16-byte tag is written with a single store
-        je	L_AES_GCM_encrypt_final_avx1_store_tag_16
-        xor	rcx, rcx                        ; rcx = byte index for the partial-tag copy
-        vmovdqu	OWORD PTR [rsp], xmm0       ; spill the tag to the stack scratch slot so it can be read bytewise
-L_AES_GCM_encrypt_final_avx1_store_tag_loop:    ; NOTE(review): bound is tested after the copy, so this relies on 1 <= r8d <= 15 - confirm callers validate the tag length
-        movzx	r13d, BYTE PTR [rsp+rcx]
-        mov	BYTE PTR [r9+rcx], r13b
-        inc	ecx
-        cmp	ecx, r8d
-        jne	L_AES_GCM_encrypt_final_avx1_store_tag_loop
-        jmp	L_AES_GCM_encrypt_final_avx1_store_tag_done
-L_AES_GCM_encrypt_final_avx1_store_tag_16:
-        vmovdqu	OWORD PTR [r9], xmm0        ; unaligned store: the tag buffer needs no particular alignment
-L_AES_GCM_encrypt_final_avx1_store_tag_done:
-        vzeroupper                              ; clear upper YMM halves before returning to the caller
-        vmovdqu	xmm6, OWORD PTR [rsp+16]    ; restore xmm6-xmm13 from the save area
-        vmovdqu	xmm7, OWORD PTR [rsp+32]
-        vmovdqu	xmm8, OWORD PTR [rsp+48]
-        vmovdqu	xmm9, OWORD PTR [rsp+64]
-        vmovdqu	xmm10, OWORD PTR [rsp+80]
-        vmovdqu	xmm11, OWORD PTR [rsp+96]
-        vmovdqu	xmm12, OWORD PTR [rsp+112]
-        vmovdqu	xmm13, OWORD PTR [rsp+128]
-        add	rsp, 144                        ; release scratch + save area
-        pop	r14                             ; restore integer registers in reverse push order
-        pop	r12
-        pop	r13
-        ret
-AES_GCM_encrypt_final_avx1 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_decrypt_update_avx1 PROC
-        push	r13
-        push	r12
-        push	r14
-        push	r15
-        push	rdi
-        mov	rax, rcx
-        mov	r10, r8
-        mov	r8d, edx
-        mov	r11, r9
-        mov	r9d, DWORD PTR [rsp+80]
-        mov	r12, QWORD PTR [rsp+88]
-        mov	r14, QWORD PTR [rsp+96]
-        mov	r15, QWORD PTR [rsp+104]
-        sub	rsp, 328
-        vmovdqu	OWORD PTR [rsp+168], xmm6
-        vmovdqu	OWORD PTR [rsp+184], xmm7
-        vmovdqu	OWORD PTR [rsp+200], xmm8
-        vmovdqu	OWORD PTR [rsp+216], xmm9
-        vmovdqu	OWORD PTR [rsp+232], xmm10
-        vmovdqu	OWORD PTR [rsp+248], xmm11
-        vmovdqu	OWORD PTR [rsp+264], xmm12
-        vmovdqu	OWORD PTR [rsp+280], xmm13
-        vmovdqu	OWORD PTR [rsp+296], xmm14
-        vmovdqu	OWORD PTR [rsp+312], xmm15
-        vmovdqa	xmm6, OWORD PTR [r12]
-        vmovdqa	xmm5, OWORD PTR [r14]
-        vpsrlq	xmm9, xmm5, 63
-        vpsllq	xmm8, xmm5, 1
-        vpslldq	xmm9, xmm9, 8
-        vpor	xmm8, xmm8, xmm9
-        vpshufd	xmm5, xmm5, 255
-        vpsrad	xmm5, xmm5, 31
-        vpand	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_mod2_128
-        vpxor	xmm5, xmm5, xmm8
-        xor	edi, edi
-        cmp	r9d, 128
-        mov	r13d, r9d
-        jl	L_AES_GCM_decrypt_update_avx1_done_128
-        and	r13d, 4294967168
-        vmovdqa	xmm2, xmm6
-        ; H ^ 1
-        vmovdqu	OWORD PTR [rsp], xmm5
-        ; H ^ 2
-        vpclmulqdq	xmm8, xmm5, xmm5, 0
-        vpclmulqdq	xmm0, xmm5, xmm5, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm0, xmm0, xmm14
-        vmovdqu	OWORD PTR [rsp+16], xmm0
-        ; H ^ 3
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm5, 78
-        vpshufd	xmm10, xmm0, 78
-        vpclmulqdq	xmm11, xmm0, xmm5, 17
-        vpclmulqdq	xmm8, xmm0, xmm5, 0
-        vpxor	xmm9, xmm9, xmm5
-        vpxor	xmm10, xmm10, xmm0
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm1, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm1, xmm1, xmm14
-        vmovdqu	OWORD PTR [rsp+32], xmm1
-        ; H ^ 4
-        vpclmulqdq	xmm8, xmm0, xmm0, 0
-        vpclmulqdq	xmm3, xmm0, xmm0, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm3, xmm3, xmm14
-        vmovdqu	OWORD PTR [rsp+48], xmm3
-        ; H ^ 5
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm0, 78
-        vpshufd	xmm10, xmm1, 78
-        vpclmulqdq	xmm11, xmm1, xmm0, 17
-        vpclmulqdq	xmm8, xmm1, xmm0, 0
-        vpxor	xmm9, xmm9, xmm0
-        vpxor	xmm10, xmm10, xmm1
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm7, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+64], xmm7
-        ; H ^ 6
-        vpclmulqdq	xmm8, xmm1, xmm1, 0
-        vpclmulqdq	xmm7, xmm1, xmm1, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+80], xmm7
-        ; H ^ 7
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm1, 78
-        vpshufd	xmm10, xmm3, 78
-        vpclmulqdq	xmm11, xmm3, xmm1, 17
-        vpclmulqdq	xmm8, xmm3, xmm1, 0
-        vpxor	xmm9, xmm9, xmm1
-        vpxor	xmm10, xmm10, xmm3
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm7, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+96], xmm7
-        ; H ^ 8
-        vpclmulqdq	xmm8, xmm3, xmm3, 0
-        vpclmulqdq	xmm7, xmm3, xmm3, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+112], xmm7
-L_AES_GCM_decrypt_update_avx1_ghash_128:
-        lea	rcx, QWORD PTR [r11+rdi]
-        lea	rdx, QWORD PTR [r10+rdi]
-        vmovdqu	xmm0, OWORD PTR [r15]
-        vmovdqa	xmm1, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpshufb	xmm8, xmm0, xmm1
-        vpaddd	xmm9, xmm0, OWORD PTR L_avx1_aes_gcm_one
-        vpshufb	xmm9, xmm9, xmm1
-        vpaddd	xmm10, xmm0, OWORD PTR L_avx1_aes_gcm_two
-        vpshufb	xmm10, xmm10, xmm1
-        vpaddd	xmm11, xmm0, OWORD PTR L_avx1_aes_gcm_three
-        vpshufb	xmm11, xmm11, xmm1
-        vpaddd	xmm12, xmm0, OWORD PTR L_avx1_aes_gcm_four
-        vpshufb	xmm12, xmm12, xmm1
-        vpaddd	xmm13, xmm0, OWORD PTR L_avx1_aes_gcm_five
-        vpshufb	xmm13, xmm13, xmm1
-        vpaddd	xmm14, xmm0, OWORD PTR L_avx1_aes_gcm_six
-        vpshufb	xmm14, xmm14, xmm1
-        vpaddd	xmm15, xmm0, OWORD PTR L_avx1_aes_gcm_seven
-        vpshufb	xmm15, xmm15, xmm1
-        vpaddd	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_eight
-        vmovdqa	xmm7, OWORD PTR [rax]
-        vmovdqu	OWORD PTR [r15], xmm0
-        vpxor	xmm8, xmm8, xmm7
-        vpxor	xmm9, xmm9, xmm7
-        vpxor	xmm10, xmm10, xmm7
-        vpxor	xmm11, xmm11, xmm7
-        vpxor	xmm12, xmm12, xmm7
-        vpxor	xmm13, xmm13, xmm7
-        vpxor	xmm14, xmm14, xmm7
-        vpxor	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsp+112]
-        vmovdqu	xmm0, OWORD PTR [rcx]
-        vaesenc	xmm8, xmm8, [rax+16]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm0, xmm0, xmm2
-        vpshufd	xmm1, xmm7, 78
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm3, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+16]
-        vaesenc	xmm10, xmm10, [rax+16]
-        vpclmulqdq	xmm2, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+16]
-        vaesenc	xmm12, xmm12, [rax+16]
-        vpclmulqdq	xmm1, xmm1, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+16]
-        vaesenc	xmm14, xmm14, [rax+16]
-        vaesenc	xmm15, xmm15, [rax+16]
-        vpxor	xmm1, xmm1, xmm2
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqu	xmm7, OWORD PTR [rsp+96]
-        vmovdqu	xmm0, OWORD PTR [rcx+16]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [rax+32]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+32]
-        vaesenc	xmm10, xmm10, [rax+32]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+32]
-        vaesenc	xmm12, xmm12, [rax+32]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+32]
-        vaesenc	xmm14, xmm14, [rax+32]
-        vaesenc	xmm15, xmm15, [rax+32]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+80]
-        vmovdqu	xmm0, OWORD PTR [rcx+32]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [rax+48]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+48]
-        vaesenc	xmm10, xmm10, [rax+48]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+48]
-        vaesenc	xmm12, xmm12, [rax+48]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+48]
-        vaesenc	xmm14, xmm14, [rax+48]
-        vaesenc	xmm15, xmm15, [rax+48]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+64]
-        vmovdqu	xmm0, OWORD PTR [rcx+48]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [rax+64]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+64]
-        vaesenc	xmm10, xmm10, [rax+64]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+64]
-        vaesenc	xmm12, xmm12, [rax+64]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+64]
-        vaesenc	xmm14, xmm14, [rax+64]
-        vaesenc	xmm15, xmm15, [rax+64]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+48]
-        vmovdqu	xmm0, OWORD PTR [rcx+64]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [rax+80]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+80]
-        vaesenc	xmm10, xmm10, [rax+80]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+80]
-        vaesenc	xmm12, xmm12, [rax+80]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+80]
-        vaesenc	xmm14, xmm14, [rax+80]
-        vaesenc	xmm15, xmm15, [rax+80]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+32]
-        vmovdqu	xmm0, OWORD PTR [rcx+80]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [rax+96]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+96]
-        vaesenc	xmm10, xmm10, [rax+96]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+96]
-        vaesenc	xmm12, xmm12, [rax+96]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+96]
-        vaesenc	xmm14, xmm14, [rax+96]
-        vaesenc	xmm15, xmm15, [rax+96]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vmovdqu	xmm0, OWORD PTR [rcx+96]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [rax+112]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+112]
-        vaesenc	xmm10, xmm10, [rax+112]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+112]
-        vaesenc	xmm12, xmm12, [rax+112]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+112]
-        vaesenc	xmm14, xmm14, [rax+112]
-        vaesenc	xmm15, xmm15, [rax+112]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp]
-        vmovdqu	xmm0, OWORD PTR [rcx+112]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [rax+128]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+128]
-        vaesenc	xmm10, xmm10, [rax+128]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+128]
-        vaesenc	xmm12, xmm12, [rax+128]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+128]
-        vaesenc	xmm14, xmm14, [rax+128]
-        vaesenc	xmm15, xmm15, [rax+128]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vpslldq	xmm5, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vaesenc	xmm8, xmm8, [rax+144]
-        vpxor	xmm2, xmm2, xmm5
-        vpxor	xmm3, xmm3, xmm1
-        vaesenc	xmm9, xmm9, [rax+144]
-        vpslld	xmm7, xmm2, 31
-        vpslld	xmm4, xmm2, 30
-        vpslld	xmm5, xmm2, 25
-        vaesenc	xmm10, xmm10, [rax+144]
-        vpxor	xmm7, xmm7, xmm4
-        vpxor	xmm7, xmm7, xmm5
-        vaesenc	xmm11, xmm11, [rax+144]
-        vpsrldq	xmm4, xmm7, 4
-        vpslldq	xmm7, xmm7, 12
-        vaesenc	xmm12, xmm12, [rax+144]
-        vpxor	xmm2, xmm2, xmm7
-        vpsrld	xmm5, xmm2, 1
-        vaesenc	xmm13, xmm13, [rax+144]
-        vpsrld	xmm1, xmm2, 2
-        vpsrld	xmm0, xmm2, 7
-        vaesenc	xmm14, xmm14, [rax+144]
-        vpxor	xmm5, xmm5, xmm1
-        vpxor	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, [rax+144]
-        vpxor	xmm5, xmm5, xmm4
-        vpxor	xmm2, xmm2, xmm5
-        vpxor	xmm2, xmm2, xmm3
-        cmp	r8d, 11
-        vmovdqa	xmm7, OWORD PTR [rax+160]
-        jl	L_AES_GCM_decrypt_update_avx1_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+176]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r8d, 13
-        vmovdqa	xmm7, OWORD PTR [rax+192]
-        jl	L_AES_GCM_decrypt_update_avx1_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+208]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+224]
-L_AES_GCM_decrypt_update_avx1_aesenc_128_ghash_avx_done:
-        vaesenclast	xmm8, xmm8, xmm7
-        vaesenclast	xmm9, xmm9, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx]
-        vmovdqu	xmm1, OWORD PTR [rcx+16]
-        vpxor	xmm8, xmm8, xmm0
-        vpxor	xmm9, xmm9, xmm1
-        vmovdqu	OWORD PTR [rdx], xmm8
-        vmovdqu	OWORD PTR [rdx+16], xmm9
-        vaesenclast	xmm10, xmm10, xmm7
-        vaesenclast	xmm11, xmm11, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+32]
-        vmovdqu	xmm1, OWORD PTR [rcx+48]
-        vpxor	xmm10, xmm10, xmm0
-        vpxor	xmm11, xmm11, xmm1
-        vmovdqu	OWORD PTR [rdx+32], xmm10
-        vmovdqu	OWORD PTR [rdx+48], xmm11
-        vaesenclast	xmm12, xmm12, xmm7
-        vaesenclast	xmm13, xmm13, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+64]
-        vmovdqu	xmm1, OWORD PTR [rcx+80]
-        vpxor	xmm12, xmm12, xmm0
-        vpxor	xmm13, xmm13, xmm1
-        vmovdqu	OWORD PTR [rdx+64], xmm12
-        vmovdqu	OWORD PTR [rdx+80], xmm13
-        vaesenclast	xmm14, xmm14, xmm7
-        vaesenclast	xmm15, xmm15, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+96]
-        vmovdqu	xmm1, OWORD PTR [rcx+112]
-        vpxor	xmm14, xmm14, xmm0
-        vpxor	xmm15, xmm15, xmm1
-        vmovdqu	OWORD PTR [rdx+96], xmm14
-        vmovdqu	OWORD PTR [rdx+112], xmm15
-        add	edi, 128
-        cmp	edi, r13d
-        jl	L_AES_GCM_decrypt_update_avx1_ghash_128
-        vmovdqa	xmm6, xmm2
-        vmovdqu	xmm5, OWORD PTR [rsp]
-L_AES_GCM_decrypt_update_avx1_done_128:
-        mov	edx, r9d
-        cmp	edi, edx
-        jge	L_AES_GCM_decrypt_update_avx1_done_dec
-        mov	r13d, r9d
-        and	r13d, 4294967280
-        cmp	edi, r13d
-        jge	L_AES_GCM_decrypt_update_avx1_last_block_done
-L_AES_GCM_decrypt_update_avx1_last_block_start:
-        vmovdqu	xmm13, OWORD PTR [r11+rdi]
-        vmovdqa	xmm0, xmm5
-        vpshufb	xmm1, xmm13, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm1, xmm1, xmm6
-        vmovdqu	xmm9, OWORD PTR [r15]
-        vpshufb	xmm8, xmm9, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpaddd	xmm9, xmm9, OWORD PTR L_avx1_aes_gcm_one
-        vmovdqu	OWORD PTR [r15], xmm9
-        vpxor	xmm8, xmm8, [rax]
-        vpclmulqdq	xmm10, xmm1, xmm0, 16
-        vaesenc	xmm8, xmm8, [rax+16]
-        vaesenc	xmm8, xmm8, [rax+32]
-        vpclmulqdq	xmm11, xmm1, xmm0, 1
-        vaesenc	xmm8, xmm8, [rax+48]
-        vaesenc	xmm8, xmm8, [rax+64]
-        vpclmulqdq	xmm12, xmm1, xmm0, 0
-        vaesenc	xmm8, xmm8, [rax+80]
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vaesenc	xmm8, xmm8, [rax+96]
-        vpxor	xmm10, xmm10, xmm11
-        vpslldq	xmm2, xmm10, 8
-        vpsrldq	xmm10, xmm10, 8
-        vaesenc	xmm8, xmm8, [rax+112]
-        vpxor	xmm2, xmm2, xmm12
-        vpxor	xmm3, xmm1, xmm10
-        vmovdqa	xmm0, OWORD PTR L_avx1_aes_gcm_mod2_128
-        vpclmulqdq	xmm11, xmm2, xmm0, 16
-        vaesenc	xmm8, xmm8, [rax+128]
-        vpshufd	xmm10, xmm2, 78
-        vpxor	xmm10, xmm10, xmm11
-        vpclmulqdq	xmm11, xmm10, xmm0, 16
-        vaesenc	xmm8, xmm8, [rax+144]
-        vpshufd	xmm10, xmm10, 78
-        vpxor	xmm10, xmm10, xmm11
-        vpxor	xmm6, xmm10, xmm3
-        cmp	r8d, 11
-        vmovdqa	xmm9, OWORD PTR [rax+160]
-        jl	L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [rax+176]
-        cmp	r8d, 13
-        vmovdqa	xmm9, OWORD PTR [rax+192]
-        jl	L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [rax+208]
-        vmovdqa	xmm9, OWORD PTR [rax+224]
-L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last:
-        vaesenclast	xmm8, xmm8, xmm9
-        vmovdqa	xmm0, xmm13
-        vpxor	xmm8, xmm8, xmm0
-        vmovdqu	OWORD PTR [r10+rdi], xmm8
-        add	edi, 16
-        cmp	edi, r13d
-        jl	L_AES_GCM_decrypt_update_avx1_last_block_start
-L_AES_GCM_decrypt_update_avx1_last_block_done:
-L_AES_GCM_decrypt_update_avx1_done_dec:
-        vmovdqa	OWORD PTR [r12], xmm6
-        vzeroupper
-        vmovdqu	xmm6, OWORD PTR [rsp+168]
-        vmovdqu	xmm7, OWORD PTR [rsp+184]
-        vmovdqu	xmm8, OWORD PTR [rsp+200]
-        vmovdqu	xmm9, OWORD PTR [rsp+216]
-        vmovdqu	xmm10, OWORD PTR [rsp+232]
-        vmovdqu	xmm11, OWORD PTR [rsp+248]
-        vmovdqu	xmm12, OWORD PTR [rsp+264]
-        vmovdqu	xmm13, OWORD PTR [rsp+280]
-        vmovdqu	xmm14, OWORD PTR [rsp+296]
-        vmovdqu	xmm15, OWORD PTR [rsp+312]
-        add	rsp, 328
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r12
-        pop	r13
-        ret
-AES_GCM_decrypt_update_avx1 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_decrypt_final_avx1 PROC ; Fold the length block into the hash state, form the tag, compare it with the caller's tag, and store 1 (match) or 0 through the 8th argument.
-        push	r13 ; r13, r12, r14, rbp, r15 are saved here and popped in reverse order before ret
-        push	r12
-        push	r14
-        push	rbp
-        push	r15
-        mov	rax, rcx ; rax = 1st argument (rcx): pointer to the 16-byte hash state loaded into xmm6 below
-        mov	r10d, r9d ; r10d = 4th argument: a byte count (multiplied by 8 below)
-        mov	r9, rdx ; r9 = 2nd argument: pointer to the tag bytes to compare against
-        mov	r11d, DWORD PTR [rsp+80] ; 5th argument (entry rsp+40, plus 40 bytes of pushes): second byte count
-        mov	r12, QWORD PTR [rsp+88] ; 6th argument: pointer to the 16-byte multiplier loaded into xmm5 (presumably the hash key H)
-        mov	r14, QWORD PTR [rsp+96] ; 7th argument: pointer to the 16-byte block XORed into the final hash (presumably the encrypted initial counter)
-        mov	rbp, QWORD PTR [rsp+104] ; 8th argument: pointer to the 32-bit result flag
-        sub	rsp, 160 ; room for nine XMM saves at rsp+16..rsp+159; rsp+0..15 is left unused
-        vmovdqu	OWORD PTR [rsp+16], xmm6 ; save xmm6-xmm13 and xmm15 (xmm14 is never touched in this routine)
-        vmovdqu	OWORD PTR [rsp+32], xmm7
-        vmovdqu	OWORD PTR [rsp+48], xmm8
-        vmovdqu	OWORD PTR [rsp+64], xmm9
-        vmovdqu	OWORD PTR [rsp+80], xmm10
-        vmovdqu	OWORD PTR [rsp+96], xmm11
-        vmovdqu	OWORD PTR [rsp+112], xmm12
-        vmovdqu	OWORD PTR [rsp+128], xmm13
-        vmovdqu	OWORD PTR [rsp+144], xmm15
-        vmovdqa	xmm6, OWORD PTR [rax] ; aligned loads: all three pointers must be 16-byte aligned
-        vmovdqa	xmm5, OWORD PTR [r12]
-        vmovdqa	xmm15, OWORD PTR [r14]
-        vpsrlq	xmm8, xmm5, 63 ; xmm8 = top bit of each qword of xmm5
-        vpsllq	xmm7, xmm5, 1 ; xmm7 = each qword of xmm5 shifted left by one
-        vpslldq	xmm8, xmm8, 8 ; move the low qword's carry bit up into the high qword
-        vpor	xmm7, xmm7, xmm8 ; xmm7 = xmm5 << 1 as a full 128-bit value
-        vpshufd	xmm5, xmm5, 255 ; broadcast the top dword of the original value
-        vpsrad	xmm5, xmm5, 31 ; all-ones if the original top bit was set, else zero
-        vpand	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_mod2_128 ; keep the constant only when that bit was set
-        vpxor	xmm5, xmm5, xmm7 ; xmm5 = (value << 1), conditionally XORed with the constant
-        mov	edx, r10d ; zero-extends, so the 64-bit shifts below cannot lose bits
-        mov	ecx, r11d
-        shl	rdx, 3 ; byte counts * 8 (presumably bit lengths)
-        shl	rcx, 3
-        vmovq	xmm0, rdx
-        vmovq	xmm1, rcx
-        vpunpcklqdq	xmm0, xmm0, xmm1 ; xmm0 = { low qword: 4th arg * 8, high qword: 5th arg * 8 }
-        vpxor	xmm6, xmm6, xmm0 ; fold the length block into the hash state
-        ; ghash_gfmul_red_avx: xmm6 = xmm6 * xmm5 (carry-less), folded back to 128 bits
-        vpshufd	xmm8, xmm5, 78 ; swap the 64-bit halves of each operand
-        vpshufd	xmm9, xmm6, 78
-        vpclmulqdq	xmm10, xmm6, xmm5, 17 ; high qword * high qword
-        vpclmulqdq	xmm7, xmm6, xmm5, 0 ; low qword * low qword
-        vpxor	xmm8, xmm8, xmm5 ; low qword of xmm8 = hi ^ lo of xmm5
-        vpxor	xmm9, xmm9, xmm6 ; low qword of xmm9 = hi ^ lo of xmm6
-        vpclmulqdq	xmm8, xmm8, xmm9, 0 ; (hi ^ lo) * (hi ^ lo)
-        vpxor	xmm8, xmm8, xmm7 ; XOR out both outer products, leaving the middle term
-        vpxor	xmm8, xmm8, xmm10
-        vpslldq	xmm9, xmm8, 8 ; split the middle term across the two 128-bit halves
-        vpsrldq	xmm8, xmm8, 8
-        vpxor	xmm7, xmm7, xmm9 ; xmm7 = low 128 bits of the product
-        vpxor	xmm6, xmm10, xmm8 ; xmm6 = high 128 bits of the product
-        vpslld	xmm11, xmm7, 31 ; reduction, first phase: left shifts by 31/30/25 of the low half
-        vpslld	xmm12, xmm7, 30
-        vpslld	xmm13, xmm7, 25
-        vpxor	xmm11, xmm11, xmm12
-        vpxor	xmm11, xmm11, xmm13
-        vpsrldq	xmm12, xmm11, 4 ; xmm12 = the bytes shifted out by the next instruction, kept for the second phase
-        vpslldq	xmm11, xmm11, 12
-        vpxor	xmm7, xmm7, xmm11
-        vpsrld	xmm13, xmm7, 1 ; reduction, second phase: right shifts by 1/2/7
-        vpsrld	xmm9, xmm7, 2
-        vpsrld	xmm8, xmm7, 7
-        vpxor	xmm13, xmm13, xmm9
-        vpxor	xmm13, xmm13, xmm8
-        vpxor	xmm13, xmm13, xmm12
-        vpxor	xmm13, xmm13, xmm7
-        vpxor	xmm6, xmm6, xmm13 ; xmm6 = reduced 128-bit product
-        vpshufb	xmm6, xmm6, OWORD PTR L_avx1_aes_gcm_bswap_mask ; byte shuffle (a full byte reversal, judging by the constant's name)
-        vpxor	xmm0, xmm6, xmm15 ; xmm0 = computed tag: final hash XOR the block loaded from [r14]
-        cmp	r8d, 16 ; r8d = 3rd argument: number of tag bytes to compare
-        je	L_AES_GCM_decrypt_final_avx1_cmp_tag_16
-        sub	rsp, 16 ; scratch so the computed tag can be read byte by byte
-        xor	rcx, rcx ; rcx = byte index
-        xor	r15, r15 ; r15b accumulates the OR of all byte differences
-        vmovdqu	OWORD PTR [rsp], xmm0
-L_AES_GCM_decrypt_final_avx1_cmp_tag_loop: ; no early exit: every byte is examined regardless of where a mismatch occurs
-        movzx	r13d, BYTE PTR [rsp+rcx]
-        xor	r13b, BYTE PTR [r9+rcx] ; non-zero if this byte differs
-        or	r15b, r13b
-        inc	ecx
-        cmp	ecx, r8d ; NOTE(review): ecx is incremented before this test, so a length of 0 would not stop here - presumably callers never pass 0; confirm
-        jne	L_AES_GCM_decrypt_final_avx1_cmp_tag_loop
-        cmp	r15, 0
-        sete	r15b ; r15 = 1 when no byte differed, else 0 (upper bits are already zero)
-        add	rsp, 16 ; release the scratch area
-        xor	rcx, rcx
-        jmp	L_AES_GCM_decrypt_final_avx1_cmp_tag_done
-L_AES_GCM_decrypt_final_avx1_cmp_tag_16: ; 16-byte tag: compare all bytes with one vector compare
-        vmovdqu	xmm1, OWORD PTR [r9]
-        vpcmpeqb	xmm0, xmm0, xmm1 ; each equal byte becomes 0xFF
-        vpmovmskb	rdx, xmm0 ; one mask bit per byte
-        ; edx == 0xFFFF (all 16 bytes equal) => result 1, otherwise result 0
-        xor	r15d, r15d
-        cmp	edx, 65535
-        sete	r15b
-L_AES_GCM_decrypt_final_avx1_cmp_tag_done:
-        mov	DWORD PTR [rbp], r15d ; store the 1/0 result through the 8th-argument pointer
-        vzeroupper
-        vmovdqu	xmm6, OWORD PTR [rsp+16] ; restore the saved XMM registers
-        vmovdqu	xmm7, OWORD PTR [rsp+32]
-        vmovdqu	xmm8, OWORD PTR [rsp+48]
-        vmovdqu	xmm9, OWORD PTR [rsp+64]
-        vmovdqu	xmm10, OWORD PTR [rsp+80]
-        vmovdqu	xmm11, OWORD PTR [rsp+96]
-        vmovdqu	xmm12, OWORD PTR [rsp+112]
-        vmovdqu	xmm13, OWORD PTR [rsp+128]
-        vmovdqu	xmm15, OWORD PTR [rsp+144]
-        add	rsp, 160
-        pop	r15 ; pops mirror the pushes at entry
-        pop	rbp
-        pop	r14
-        pop	r12
-        pop	r13
-        ret
-AES_GCM_decrypt_final_avx1 ENDP
-_text ENDS
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-_DATA SEGMENT ; 16-byte-aligned 128-bit constants for the AVX2 AES-GCM routines; each is followed by a ptr_* qword holding its address
-ALIGN 16
-L_avx2_aes_gcm_one QWORD 0, 1 ; low qword 0, high qword 1 (presumably a counter increment; its use is outside this view)
-ptr_L_avx2_aes_gcm_one QWORD L_avx2_aes_gcm_one
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx2_aes_gcm_two QWORD 0, 2 ; same layout, high qword 2
-ptr_L_avx2_aes_gcm_two QWORD L_avx2_aes_gcm_two
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx2_aes_gcm_three QWORD 0, 3 ; same layout, high qword 3
-ptr_L_avx2_aes_gcm_three QWORD L_avx2_aes_gcm_three
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx2_aes_gcm_four QWORD 0, 4 ; same layout, high qword 4
-ptr_L_avx2_aes_gcm_four QWORD L_avx2_aes_gcm_four
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx2_aes_gcm_five QWORD 0, 5 ; same layout, high qword 5
-ptr_L_avx2_aes_gcm_five QWORD L_avx2_aes_gcm_five
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx2_aes_gcm_six QWORD 0, 6 ; same layout, high qword 6
-ptr_L_avx2_aes_gcm_six QWORD L_avx2_aes_gcm_six
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx2_aes_gcm_seven QWORD 0, 7 ; same layout, high qword 7
-ptr_L_avx2_aes_gcm_seven QWORD L_avx2_aes_gcm_seven
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx2_aes_gcm_eight QWORD 0, 8 ; same layout, high qword 8
-ptr_L_avx2_aes_gcm_eight QWORD L_avx2_aes_gcm_eight
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx2_aes_gcm_bswap_one QWORD 0, 72057594037927936 ; high qword = 0x0100000000000000; AES_GCM_encrypt_avx2 blends the IV's low three dwords over this to form the initial counter block
-ptr_L_avx2_aes_gcm_bswap_one QWORD L_avx2_aes_gcm_bswap_one
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx2_aes_gcm_bswap_epi64 QWORD 283686952306183, 579005069656919567 ; 0x0001020304050607, 0x08090A0B0C0D0E0F: as a vpshufb control this reverses the bytes within each 64-bit half
-ptr_L_avx2_aes_gcm_bswap_epi64 QWORD L_avx2_aes_gcm_bswap_epi64
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx2_aes_gcm_bswap_mask QWORD 579005069656919567, 283686952306183 ; 0x08090A0B0C0D0E0F, 0x0001020304050607: vpshufb control that reverses all 16 bytes
-ptr_L_avx2_aes_gcm_bswap_mask QWORD L_avx2_aes_gcm_bswap_mask
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx2_aes_gcm_mod2_128 QWORD 1, 13979173243358019584 ; low qword 1, high qword 0xC200000000000000: multiplier used by the "ghash_red" reduction steps
-ptr_L_avx2_aes_gcm_mod2_128 QWORD L_avx2_aes_gcm_mod2_128
-_DATA ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_encrypt_avx2 PROC
-        push	r13
-        push	rdi
-        push	r12
-        push	r15
-        push	rbx
-        push	r14
-        push	rsi
-        mov	rdi, rcx
-        mov	r12, r8
-        mov	rax, r9
-        mov	r15, QWORD PTR [rsp+96]
-        mov	r8, rdx
-        mov	r10d, DWORD PTR [rsp+104]
-        mov	r11d, DWORD PTR [rsp+112]
-        mov	ebx, DWORD PTR [rsp+120]
-        mov	r14d, DWORD PTR [rsp+128]
-        mov	rsi, QWORD PTR [rsp+136]
-        mov	r9d, DWORD PTR [rsp+144]
-        sub	rsp, 320
-        vmovdqu	OWORD PTR [rsp+160], xmm6
-        vmovdqu	OWORD PTR [rsp+176], xmm7
-        vmovdqu	OWORD PTR [rsp+192], xmm8
-        vmovdqu	OWORD PTR [rsp+208], xmm9
-        vmovdqu	OWORD PTR [rsp+224], xmm10
-        vmovdqu	OWORD PTR [rsp+240], xmm11
-        vmovdqu	OWORD PTR [rsp+256], xmm12
-        vmovdqu	OWORD PTR [rsp+272], xmm13
-        vmovdqu	OWORD PTR [rsp+288], xmm14
-        vmovdqu	OWORD PTR [rsp+304], xmm15
-        vpxor	xmm4, xmm4, xmm4
-        vpxor	xmm6, xmm6, xmm6
-        mov	edx, ebx
-        cmp	edx, 12
-        je	L_AES_GCM_encrypt_avx2_iv_12
-        ; Calculate values when IV is not 12 bytes
-        ; H = Encrypt X(=0)
-        vmovdqu	xmm5, OWORD PTR [rsi]
-        vaesenc	xmm5, xmm5, [rsi+16]
-        vaesenc	xmm5, xmm5, [rsi+32]
-        vaesenc	xmm5, xmm5, [rsi+48]
-        vaesenc	xmm5, xmm5, [rsi+64]
-        vaesenc	xmm5, xmm5, [rsi+80]
-        vaesenc	xmm5, xmm5, [rsi+96]
-        vaesenc	xmm5, xmm5, [rsi+112]
-        vaesenc	xmm5, xmm5, [rsi+128]
-        vaesenc	xmm5, xmm5, [rsi+144]
-        cmp	r9d, 11
-        vmovdqu	xmm0, OWORD PTR [rsi+160]
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm5, xmm5, [rsi+176]
-        cmp	r9d, 13
-        vmovdqu	xmm0, OWORD PTR [rsi+192]
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm5, xmm5, [rsi+208]
-        vmovdqu	xmm0, OWORD PTR [rsi+224]
-L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last:
-        vaesenclast	xmm5, xmm5, xmm0
-        vpshufb	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        ; Calc counter
-        ; Initialization vector
-        cmp	edx, 0
-        mov	rcx, 0
-        je	L_AES_GCM_encrypt_avx2_calc_iv_done
-        cmp	edx, 16
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_lt16
-        and	edx, 4294967280
-L_AES_GCM_encrypt_avx2_calc_iv_16_loop:
-        vmovdqu	xmm0, OWORD PTR [rax+rcx]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm4, xmm4, xmm0
-        ; ghash_gfmul_avx
-        vpclmulqdq	xmm2, xmm5, xmm4, 16
-        vpclmulqdq	xmm1, xmm5, xmm4, 1
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm7, xmm0, xmm1
-        vpxor	xmm4, xmm3, xmm2
-        ; ghash_mid
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm4, xmm4, xmm1
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm7, xmm2, 16
-        vpshufd	xmm1, xmm7, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm4, xmm4, xmm1
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_16_loop
-        mov	edx, ebx
-        cmp	ecx, edx
-        je	L_AES_GCM_encrypt_avx2_calc_iv_done
-L_AES_GCM_encrypt_avx2_calc_iv_lt16:
-        vpxor	xmm0, xmm0, xmm0
-        xor	ebx, ebx
-        vmovdqu	OWORD PTR [rsp], xmm0
-L_AES_GCM_encrypt_avx2_calc_iv_loop:
-        movzx	r13d, BYTE PTR [rax+rcx]
-        mov	BYTE PTR [rsp+rbx], r13b
-        inc	ecx
-        inc	ebx
-        cmp	ecx, edx
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_loop
-        vmovdqu	xmm0, OWORD PTR [rsp]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm4, xmm4, xmm0
-        ; ghash_gfmul_avx
-        vpclmulqdq	xmm2, xmm5, xmm4, 16
-        vpclmulqdq	xmm1, xmm5, xmm4, 1
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm7, xmm0, xmm1
-        vpxor	xmm4, xmm3, xmm2
-        ; ghash_mid
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm4, xmm4, xmm1
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm7, xmm2, 16
-        vpshufd	xmm1, xmm7, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm4, xmm4, xmm1
-L_AES_GCM_encrypt_avx2_calc_iv_done:
-        ; T = Encrypt counter
-        vpxor	xmm0, xmm0, xmm0
-        shl	edx, 3
-        vmovq	xmm0, rdx
-        vpxor	xmm4, xmm4, xmm0
-        ; ghash_gfmul_avx
-        vpclmulqdq	xmm2, xmm5, xmm4, 16
-        vpclmulqdq	xmm1, xmm5, xmm4, 1
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm7, xmm0, xmm1
-        vpxor	xmm4, xmm3, xmm2
-        ; ghash_mid
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm4, xmm4, xmm1
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm7, xmm2, 16
-        vpshufd	xmm1, xmm7, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm4, xmm4, xmm1
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        ;   Encrypt counter
-        vmovdqu	xmm15, OWORD PTR [rsi]
-        vpxor	xmm15, xmm15, xmm4
-        vaesenc	xmm15, xmm15, [rsi+16]
-        vaesenc	xmm15, xmm15, [rsi+32]
-        vaesenc	xmm15, xmm15, [rsi+48]
-        vaesenc	xmm15, xmm15, [rsi+64]
-        vaesenc	xmm15, xmm15, [rsi+80]
-        vaesenc	xmm15, xmm15, [rsi+96]
-        vaesenc	xmm15, xmm15, [rsi+112]
-        vaesenc	xmm15, xmm15, [rsi+128]
-        vaesenc	xmm15, xmm15, [rsi+144]
-        cmp	r9d, 11
-        vmovdqu	xmm0, OWORD PTR [rsi+160]
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last
-        vaesenc	xmm15, xmm15, xmm0
-        vaesenc	xmm15, xmm15, [rsi+176]
-        cmp	r9d, 13
-        vmovdqu	xmm0, OWORD PTR [rsi+192]
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last
-        vaesenc	xmm15, xmm15, xmm0
-        vaesenc	xmm15, xmm15, [rsi+208]
-        vmovdqu	xmm0, OWORD PTR [rsi+224]
-L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last:
-        vaesenclast	xmm15, xmm15, xmm0
-        jmp	L_AES_GCM_encrypt_avx2_iv_done
-L_AES_GCM_encrypt_avx2_iv_12:
-        ; # Calculate values when IV is 12 bytes
-        ; Set counter based on IV
-        vmovdqu	xmm4, OWORD PTR L_avx2_aes_gcm_bswap_one
-        vmovdqu	xmm5, OWORD PTR [rsi]
-        vpblendd	xmm4, xmm4, [rax], 7
-        ; H = Encrypt X(=0) and T = Encrypt counter
-        vmovdqu	xmm7, OWORD PTR [rsi+16]
-        vpxor	xmm15, xmm4, xmm5
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm0, OWORD PTR [rsi+32]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+48]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+64]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+80]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+96]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+112]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+128]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+144]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        cmp	r9d, 11
-        vmovdqu	xmm0, OWORD PTR [rsi+160]
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_12_last
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+176]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        cmp	r9d, 13
-        vmovdqu	xmm0, OWORD PTR [rsi+192]
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_12_last
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+208]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+224]
-L_AES_GCM_encrypt_avx2_calc_iv_12_last:
-        vaesenclast	xmm5, xmm5, xmm0
-        vaesenclast	xmm15, xmm15, xmm0
-        vpshufb	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_bswap_mask
-L_AES_GCM_encrypt_avx2_iv_done:
-        ; Additional authentication data
-        mov	edx, r11d
-        cmp	edx, 0
-        je	L_AES_GCM_encrypt_avx2_calc_aad_done
-        xor	ecx, ecx
-        cmp	edx, 16
-        jl	L_AES_GCM_encrypt_avx2_calc_aad_lt16
-        and	edx, 4294967280
-L_AES_GCM_encrypt_avx2_calc_aad_16_loop:
-        vmovdqu	xmm0, OWORD PTR [r12+rcx]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm0
-        ; ghash_gfmul_avx
-        vpclmulqdq	xmm2, xmm5, xmm6, 16
-        vpclmulqdq	xmm1, xmm5, xmm6, 1
-        vpclmulqdq	xmm0, xmm5, xmm6, 0
-        vpclmulqdq	xmm3, xmm5, xmm6, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm7, xmm0, xmm1
-        vpxor	xmm6, xmm3, xmm2
-        ; ghash_mid
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm6, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm6, xmm6, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm6, xmm6, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm6, xmm6, xmm1
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm7, xmm2, 16
-        vpshufd	xmm1, xmm7, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm6, xmm6, xmm1
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_encrypt_avx2_calc_aad_16_loop
-        mov	edx, r11d
-        cmp	ecx, edx
-        je	L_AES_GCM_encrypt_avx2_calc_aad_done
-L_AES_GCM_encrypt_avx2_calc_aad_lt16:
-        vpxor	xmm0, xmm0, xmm0
-        xor	ebx, ebx
-        vmovdqu	OWORD PTR [rsp], xmm0
-L_AES_GCM_encrypt_avx2_calc_aad_loop:
-        movzx	r13d, BYTE PTR [r12+rcx]
-        mov	BYTE PTR [rsp+rbx], r13b
-        inc	ecx
-        inc	ebx
-        cmp	ecx, edx
-        jl	L_AES_GCM_encrypt_avx2_calc_aad_loop
-        vmovdqu	xmm0, OWORD PTR [rsp]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm0
-        ; ghash_gfmul_avx
-        vpclmulqdq	xmm2, xmm5, xmm6, 16
-        vpclmulqdq	xmm1, xmm5, xmm6, 1
-        vpclmulqdq	xmm0, xmm5, xmm6, 0
-        vpclmulqdq	xmm3, xmm5, xmm6, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm7, xmm0, xmm1
-        vpxor	xmm6, xmm3, xmm2
-        ; ghash_mid
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm6, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm6, xmm6, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm6, xmm6, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm6, xmm6, xmm1
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm7, xmm2, 16
-        vpshufd	xmm1, xmm7, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm6, xmm6, xmm1
-L_AES_GCM_encrypt_avx2_calc_aad_done:
-        ; Calculate counter and H
-        vpsrlq	xmm1, xmm5, 63
-        vpsllq	xmm0, xmm5, 1
-        vpslldq	xmm1, xmm1, 8
-        vpor	xmm0, xmm0, xmm1
-        vpshufd	xmm5, xmm5, 255
-        vpsrad	xmm5, xmm5, 31
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpand	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpaddd	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_one
-        vpxor	xmm5, xmm5, xmm0
-        xor	ebx, ebx
-        cmp	r10d, 128
-        mov	r13d, r10d
-        jl	L_AES_GCM_encrypt_avx2_done_128
-        and	r13d, 4294967168
-        vmovdqu	OWORD PTR [rsp+128], xmm4
-        vmovdqu	OWORD PTR [rsp+144], xmm15
-        vmovdqu	xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128
-        ; H ^ 1 and H ^ 2
-        vpclmulqdq	xmm9, xmm5, xmm5, 0
-        vpclmulqdq	xmm10, xmm5, xmm5, 17
-        vpclmulqdq	xmm8, xmm9, xmm3, 16
-        vpshufd	xmm9, xmm9, 78
-        vpxor	xmm9, xmm9, xmm8
-        vpclmulqdq	xmm8, xmm9, xmm3, 16
-        vpshufd	xmm9, xmm9, 78
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm0, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp], xmm5
-        vmovdqu	OWORD PTR [rsp+16], xmm0
-        ; H ^ 3 and H ^ 4
-        vpclmulqdq	xmm11, xmm0, xmm5, 16
-        vpclmulqdq	xmm10, xmm0, xmm5, 1
-        vpclmulqdq	xmm9, xmm0, xmm5, 0
-        vpclmulqdq	xmm12, xmm0, xmm5, 17
-        vpclmulqdq	xmm13, xmm0, xmm0, 0
-        vpclmulqdq	xmm14, xmm0, xmm0, 17
-        vpxor	xmm11, xmm11, xmm10
-        vpslldq	xmm10, xmm11, 8
-        vpsrldq	xmm11, xmm11, 8
-        vpxor	xmm10, xmm10, xmm9
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm10, xmm10, xmm9
-        vpxor	xmm13, xmm13, xmm8
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm12, xmm12, xmm11
-        vpxor	xmm13, xmm13, xmm8
-        vpxor	xmm10, xmm10, xmm12
-        vpxor	xmm2, xmm13, xmm14
-        vpxor	xmm1, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp+32], xmm1
-        vmovdqu	OWORD PTR [rsp+48], xmm2
-        ; H ^ 5 and H ^ 6
-        vpclmulqdq	xmm11, xmm1, xmm0, 16
-        vpclmulqdq	xmm10, xmm1, xmm0, 1
-        vpclmulqdq	xmm9, xmm1, xmm0, 0
-        vpclmulqdq	xmm12, xmm1, xmm0, 17
-        vpclmulqdq	xmm13, xmm1, xmm1, 0
-        vpclmulqdq	xmm14, xmm1, xmm1, 17
-        vpxor	xmm11, xmm11, xmm10
-        vpslldq	xmm10, xmm11, 8
-        vpsrldq	xmm11, xmm11, 8
-        vpxor	xmm10, xmm10, xmm9
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm10, xmm10, xmm9
-        vpxor	xmm13, xmm13, xmm8
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm12, xmm12, xmm11
-        vpxor	xmm13, xmm13, xmm8
-        vpxor	xmm10, xmm10, xmm12
-        vpxor	xmm0, xmm13, xmm14
-        vpxor	xmm7, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp+64], xmm7
-        vmovdqu	OWORD PTR [rsp+80], xmm0
-        ; H ^ 7 and H ^ 8
-        vpclmulqdq	xmm11, xmm2, xmm1, 16
-        vpclmulqdq	xmm10, xmm2, xmm1, 1
-        vpclmulqdq	xmm9, xmm2, xmm1, 0
-        vpclmulqdq	xmm12, xmm2, xmm1, 17
-        vpclmulqdq	xmm13, xmm2, xmm2, 0
-        vpclmulqdq	xmm14, xmm2, xmm2, 17
-        vpxor	xmm11, xmm11, xmm10
-        vpslldq	xmm10, xmm11, 8
-        vpsrldq	xmm11, xmm11, 8
-        vpxor	xmm10, xmm10, xmm9
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm10, xmm10, xmm9
-        vpxor	xmm13, xmm13, xmm8
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm12, xmm12, xmm11
-        vpxor	xmm13, xmm13, xmm8
-        vpxor	xmm10, xmm10, xmm12
-        vpxor	xmm0, xmm13, xmm14
-        vpxor	xmm7, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp+96], xmm7
-        vmovdqu	OWORD PTR [rsp+112], xmm0
-        ; First 128 bytes of input
-        ; aesenc_128
-        ; aesenc_ctr
-        vmovdqu	xmm0, OWORD PTR [rsp+128]
-        vmovdqu	xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpaddd	xmm9, xmm0, OWORD PTR L_avx2_aes_gcm_one
-        vpshufb	xmm8, xmm0, xmm1
-        vpaddd	xmm10, xmm0, OWORD PTR L_avx2_aes_gcm_two
-        vpshufb	xmm9, xmm9, xmm1
-        vpaddd	xmm11, xmm0, OWORD PTR L_avx2_aes_gcm_three
-        vpshufb	xmm10, xmm10, xmm1
-        vpaddd	xmm12, xmm0, OWORD PTR L_avx2_aes_gcm_four
-        vpshufb	xmm11, xmm11, xmm1
-        vpaddd	xmm13, xmm0, OWORD PTR L_avx2_aes_gcm_five
-        vpshufb	xmm12, xmm12, xmm1
-        vpaddd	xmm14, xmm0, OWORD PTR L_avx2_aes_gcm_six
-        vpshufb	xmm13, xmm13, xmm1
-        vpaddd	xmm15, xmm0, OWORD PTR L_avx2_aes_gcm_seven
-        vpshufb	xmm14, xmm14, xmm1
-        vpaddd	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_eight
-        vpshufb	xmm15, xmm15, xmm1
-        ; aesenc_xor
-        vmovdqu	xmm7, OWORD PTR [rsi]
-        vmovdqu	OWORD PTR [rsp+128], xmm0
-        vpxor	xmm8, xmm8, xmm7
-        vpxor	xmm9, xmm9, xmm7
-        vpxor	xmm10, xmm10, xmm7
-        vpxor	xmm11, xmm11, xmm7
-        vpxor	xmm12, xmm12, xmm7
-        vpxor	xmm13, xmm13, xmm7
-        vpxor	xmm14, xmm14, xmm7
-        vpxor	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+16]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+32]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+48]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+64]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+80]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+96]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+112]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+128]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+144]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r9d, 11
-        vmovdqu	xmm7, OWORD PTR [rsi+160]
-        jl	L_AES_GCM_encrypt_avx2_aesenc_128_enc_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+176]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r9d, 13
-        vmovdqu	xmm7, OWORD PTR [rsi+192]
-        jl	L_AES_GCM_encrypt_avx2_aesenc_128_enc_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+208]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+224]
-L_AES_GCM_encrypt_avx2_aesenc_128_enc_done:
-        ; aesenc_last
-        vaesenclast	xmm8, xmm8, xmm7
-        vaesenclast	xmm9, xmm9, xmm7
-        vaesenclast	xmm10, xmm10, xmm7
-        vaesenclast	xmm11, xmm11, xmm7
-        vmovdqu	xmm0, OWORD PTR [rdi]
-        vmovdqu	xmm1, OWORD PTR [rdi+16]
-        vmovdqu	xmm2, OWORD PTR [rdi+32]
-        vmovdqu	xmm3, OWORD PTR [rdi+48]
-        vpxor	xmm8, xmm8, xmm0
-        vpxor	xmm9, xmm9, xmm1
-        vpxor	xmm10, xmm10, xmm2
-        vpxor	xmm11, xmm11, xmm3
-        vmovdqu	OWORD PTR [r8], xmm8
-        vmovdqu	OWORD PTR [r8+16], xmm9
-        vmovdqu	OWORD PTR [r8+32], xmm10
-        vmovdqu	OWORD PTR [r8+48], xmm11
-        vaesenclast	xmm12, xmm12, xmm7
-        vaesenclast	xmm13, xmm13, xmm7
-        vaesenclast	xmm14, xmm14, xmm7
-        vaesenclast	xmm15, xmm15, xmm7
-        vmovdqu	xmm0, OWORD PTR [rdi+64]
-        vmovdqu	xmm1, OWORD PTR [rdi+80]
-        vmovdqu	xmm2, OWORD PTR [rdi+96]
-        vmovdqu	xmm3, OWORD PTR [rdi+112]
-        vpxor	xmm12, xmm12, xmm0
-        vpxor	xmm13, xmm13, xmm1
-        vpxor	xmm14, xmm14, xmm2
-        vpxor	xmm15, xmm15, xmm3
-        vmovdqu	OWORD PTR [r8+64], xmm12
-        vmovdqu	OWORD PTR [r8+80], xmm13
-        vmovdqu	OWORD PTR [r8+96], xmm14
-        vmovdqu	OWORD PTR [r8+112], xmm15
-        cmp	r13d, 128
-        mov	ebx, 128
-        jle	L_AES_GCM_encrypt_avx2_end_128
-        ; More 128 bytes of input
-L_AES_GCM_encrypt_avx2_ghash_128:
-        ; aesenc_128_ghash
-        lea	rcx, QWORD PTR [rdi+rbx]
-        lea	rdx, QWORD PTR [r8+rbx]
-        ; aesenc_ctr
-        vmovdqu	xmm0, OWORD PTR [rsp+128]
-        vmovdqu	xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpaddd	xmm9, xmm0, OWORD PTR L_avx2_aes_gcm_one
-        vpshufb	xmm8, xmm0, xmm1
-        vpaddd	xmm10, xmm0, OWORD PTR L_avx2_aes_gcm_two
-        vpshufb	xmm9, xmm9, xmm1
-        vpaddd	xmm11, xmm0, OWORD PTR L_avx2_aes_gcm_three
-        vpshufb	xmm10, xmm10, xmm1
-        vpaddd	xmm12, xmm0, OWORD PTR L_avx2_aes_gcm_four
-        vpshufb	xmm11, xmm11, xmm1
-        vpaddd	xmm13, xmm0, OWORD PTR L_avx2_aes_gcm_five
-        vpshufb	xmm12, xmm12, xmm1
-        vpaddd	xmm14, xmm0, OWORD PTR L_avx2_aes_gcm_six
-        vpshufb	xmm13, xmm13, xmm1
-        vpaddd	xmm15, xmm0, OWORD PTR L_avx2_aes_gcm_seven
-        vpshufb	xmm14, xmm14, xmm1
-        vpaddd	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_eight
-        vpshufb	xmm15, xmm15, xmm1
-        ; aesenc_xor
-        vmovdqu	xmm7, OWORD PTR [rsi]
-        vmovdqu	OWORD PTR [rsp+128], xmm0
-        vpxor	xmm8, xmm8, xmm7
-        vpxor	xmm9, xmm9, xmm7
-        vpxor	xmm10, xmm10, xmm7
-        vpxor	xmm11, xmm11, xmm7
-        vpxor	xmm12, xmm12, xmm7
-        vpxor	xmm13, xmm13, xmm7
-        vpxor	xmm14, xmm14, xmm7
-        vpxor	xmm15, xmm15, xmm7
-        ; aesenc_pclmul_1
-        vmovdqu	xmm1, OWORD PTR [rdx+-128]
-        vmovdqu	xmm0, OWORD PTR [rsi+16]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vmovdqu	xmm2, OWORD PTR [rsp+112]
-        vpxor	xmm1, xmm1, xmm6
-        vpclmulqdq	xmm5, xmm1, xmm2, 16
-        vpclmulqdq	xmm3, xmm1, xmm2, 1
-        vpclmulqdq	xmm6, xmm1, xmm2, 0
-        vpclmulqdq	xmm7, xmm1, xmm2, 17
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_2
-        vmovdqu	xmm1, OWORD PTR [rdx+-112]
-        vmovdqu	xmm0, OWORD PTR [rsp+96]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rsi+32]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rdx+-96]
-        vmovdqu	xmm0, OWORD PTR [rsp+80]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rsi+48]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rdx+-80]
-        vmovdqu	xmm0, OWORD PTR [rsp+64]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rsi+64]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rdx+-64]
-        vmovdqu	xmm0, OWORD PTR [rsp+48]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rsi+80]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rdx+-48]
-        vmovdqu	xmm0, OWORD PTR [rsp+32]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rsi+96]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rdx+-32]
-        vmovdqu	xmm0, OWORD PTR [rsp+16]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rsi+112]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rdx+-16]
-        vmovdqu	xmm0, OWORD PTR [rsp]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rsi+128]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_l
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm6, xmm6, xmm4
-        vpxor	xmm5, xmm5, xmm3
-        vpslldq	xmm1, xmm5, 8
-        vpsrldq	xmm5, xmm5, 8
-        vmovdqu	xmm4, OWORD PTR [rsi+144]
-        vmovdqu	xmm0, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vaesenc	xmm8, xmm8, xmm4
-        vpxor	xmm6, xmm6, xmm1
-        vpxor	xmm7, xmm7, xmm5
-        vpclmulqdq	xmm3, xmm6, xmm0, 16
-        vaesenc	xmm9, xmm9, xmm4
-        vaesenc	xmm10, xmm10, xmm4
-        vaesenc	xmm11, xmm11, xmm4
-        vpshufd	xmm6, xmm6, 78
-        vpxor	xmm6, xmm6, xmm3
-        vpclmulqdq	xmm3, xmm6, xmm0, 16
-        vaesenc	xmm12, xmm12, xmm4
-        vaesenc	xmm13, xmm13, xmm4
-        vaesenc	xmm14, xmm14, xmm4
-        vpshufd	xmm6, xmm6, 78
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm6, xmm6, xmm7
-        vaesenc	xmm15, xmm15, xmm4
-        cmp	r9d, 11
-        vmovdqu	xmm7, OWORD PTR [rsi+160]
-        jl	L_AES_GCM_encrypt_avx2_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+176]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r9d, 13
-        vmovdqu	xmm7, OWORD PTR [rsi+192]
-        jl	L_AES_GCM_encrypt_avx2_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+208]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+224]
-L_AES_GCM_encrypt_avx2_aesenc_128_ghash_avx_done:
-        ; aesenc_last
-        vaesenclast	xmm8, xmm8, xmm7
-        vaesenclast	xmm9, xmm9, xmm7
-        vaesenclast	xmm10, xmm10, xmm7
-        vaesenclast	xmm11, xmm11, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx]
-        vmovdqu	xmm1, OWORD PTR [rcx+16]
-        vmovdqu	xmm2, OWORD PTR [rcx+32]
-        vmovdqu	xmm3, OWORD PTR [rcx+48]
-        vpxor	xmm8, xmm8, xmm0
-        vpxor	xmm9, xmm9, xmm1
-        vpxor	xmm10, xmm10, xmm2
-        vpxor	xmm11, xmm11, xmm3
-        vmovdqu	OWORD PTR [rdx], xmm8
-        vmovdqu	OWORD PTR [rdx+16], xmm9
-        vmovdqu	OWORD PTR [rdx+32], xmm10
-        vmovdqu	OWORD PTR [rdx+48], xmm11
-        vaesenclast	xmm12, xmm12, xmm7
-        vaesenclast	xmm13, xmm13, xmm7
-        vaesenclast	xmm14, xmm14, xmm7
-        vaesenclast	xmm15, xmm15, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+64]
-        vmovdqu	xmm1, OWORD PTR [rcx+80]
-        vmovdqu	xmm2, OWORD PTR [rcx+96]
-        vmovdqu	xmm3, OWORD PTR [rcx+112]
-        vpxor	xmm12, xmm12, xmm0
-        vpxor	xmm13, xmm13, xmm1
-        vpxor	xmm14, xmm14, xmm2
-        vpxor	xmm15, xmm15, xmm3
-        vmovdqu	OWORD PTR [rdx+64], xmm12
-        vmovdqu	OWORD PTR [rdx+80], xmm13
-        vmovdqu	OWORD PTR [rdx+96], xmm14
-        vmovdqu	OWORD PTR [rdx+112], xmm15
-        ; aesenc_128_ghash - end
-        add	ebx, 128
-        cmp	ebx, r13d
-        jl	L_AES_GCM_encrypt_avx2_ghash_128
-L_AES_GCM_encrypt_avx2_end_128:
-        vmovdqu	xmm4, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpshufb	xmm8, xmm8, xmm4
-        vpshufb	xmm9, xmm9, xmm4
-        vpshufb	xmm10, xmm10, xmm4
-        vpshufb	xmm11, xmm11, xmm4
-        vpshufb	xmm12, xmm12, xmm4
-        vpshufb	xmm13, xmm13, xmm4
-        vpshufb	xmm14, xmm14, xmm4
-        vpshufb	xmm15, xmm15, xmm4
-        vpxor	xmm8, xmm8, xmm6
-        vmovdqu	xmm7, OWORD PTR [rsp]
-        vpclmulqdq	xmm5, xmm7, xmm15, 16
-        vpclmulqdq	xmm1, xmm7, xmm15, 1
-        vpclmulqdq	xmm4, xmm7, xmm15, 0
-        vpclmulqdq	xmm6, xmm7, xmm15, 17
-        vpxor	xmm5, xmm5, xmm1
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vpclmulqdq	xmm2, xmm7, xmm14, 16
-        vpclmulqdq	xmm1, xmm7, xmm14, 1
-        vpclmulqdq	xmm0, xmm7, xmm14, 0
-        vpclmulqdq	xmm3, xmm7, xmm14, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm4, xmm4, xmm0
-        vmovdqu	xmm15, OWORD PTR [rsp+32]
-        vmovdqu	xmm7, OWORD PTR [rsp+48]
-        vpclmulqdq	xmm2, xmm15, xmm13, 16
-        vpclmulqdq	xmm1, xmm15, xmm13, 1
-        vpclmulqdq	xmm0, xmm15, xmm13, 0
-        vpclmulqdq	xmm3, xmm15, xmm13, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm4, xmm4, xmm0
-        vpclmulqdq	xmm2, xmm7, xmm12, 16
-        vpclmulqdq	xmm1, xmm7, xmm12, 1
-        vpclmulqdq	xmm0, xmm7, xmm12, 0
-        vpclmulqdq	xmm3, xmm7, xmm12, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm4, xmm4, xmm0
-        vmovdqu	xmm15, OWORD PTR [rsp+64]
-        vmovdqu	xmm7, OWORD PTR [rsp+80]
-        vpclmulqdq	xmm2, xmm15, xmm11, 16
-        vpclmulqdq	xmm1, xmm15, xmm11, 1
-        vpclmulqdq	xmm0, xmm15, xmm11, 0
-        vpclmulqdq	xmm3, xmm15, xmm11, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm4, xmm4, xmm0
-        vpclmulqdq	xmm2, xmm7, xmm10, 16
-        vpclmulqdq	xmm1, xmm7, xmm10, 1
-        vpclmulqdq	xmm0, xmm7, xmm10, 0
-        vpclmulqdq	xmm3, xmm7, xmm10, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm4, xmm4, xmm0
-        vmovdqu	xmm15, OWORD PTR [rsp+96]
-        vmovdqu	xmm7, OWORD PTR [rsp+112]
-        vpclmulqdq	xmm2, xmm15, xmm9, 16
-        vpclmulqdq	xmm1, xmm15, xmm9, 1
-        vpclmulqdq	xmm0, xmm15, xmm9, 0
-        vpclmulqdq	xmm3, xmm15, xmm9, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm4, xmm4, xmm0
-        vpclmulqdq	xmm2, xmm7, xmm8, 16
-        vpclmulqdq	xmm1, xmm7, xmm8, 1
-        vpclmulqdq	xmm0, xmm7, xmm8, 0
-        vpclmulqdq	xmm3, xmm7, xmm8, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm4, xmm4, xmm0
-        vpslldq	xmm7, xmm5, 8
-        vpsrldq	xmm5, xmm5, 8
-        vpxor	xmm4, xmm4, xmm7
-        vpxor	xmm6, xmm6, xmm5
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm4, xmm2, 16
-        vpshufd	xmm1, xmm4, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm6, xmm6, xmm1
-        vmovdqu	xmm5, OWORD PTR [rsp]
-        vmovdqu	xmm4, OWORD PTR [rsp+128]
-        vmovdqu	xmm15, OWORD PTR [rsp+144]
-L_AES_GCM_encrypt_avx2_done_128:
-        cmp	ebx, r10d
-        je	L_AES_GCM_encrypt_avx2_done_enc
-        mov	r13d, r10d
-        and	r13d, 4294967280
-        cmp	ebx, r13d
-        jge	L_AES_GCM_encrypt_avx2_last_block_done
-        ; aesenc_block
-        vmovdqu	xmm1, xmm4
-        vpshufb	xmm0, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpaddd	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_one
-        vpxor	xmm0, xmm0, [rsi]
-        vmovdqu	xmm2, OWORD PTR [rsi+16]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rsi+32]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rsi+48]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rsi+64]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rsi+80]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rsi+96]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rsi+112]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rsi+128]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rsi+144]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm4, xmm1
-        cmp	r9d, 11
-        vmovdqu	xmm1, OWORD PTR [rsi+160]
-        jl	L_AES_GCM_encrypt_avx2_aesenc_block_last
-        vaesenc	xmm0, xmm0, xmm1
-        vmovdqu	xmm2, OWORD PTR [rsi+176]
-        vaesenc	xmm0, xmm0, xmm2
-        cmp	r9d, 13
-        vmovdqu	xmm1, OWORD PTR [rsi+192]
-        jl	L_AES_GCM_encrypt_avx2_aesenc_block_last
-        vaesenc	xmm0, xmm0, xmm1
-        vmovdqu	xmm2, OWORD PTR [rsi+208]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm1, OWORD PTR [rsi+224]
-L_AES_GCM_encrypt_avx2_aesenc_block_last:
-        vaesenclast	xmm0, xmm0, xmm1
-        vmovdqu	xmm1, OWORD PTR [rdi+rbx]
-        vpxor	xmm0, xmm0, xmm1
-        vmovdqu	OWORD PTR [r8+rbx], xmm0
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm0
-        add	ebx, 16
-        cmp	ebx, r13d
-        jge	L_AES_GCM_encrypt_avx2_last_block_ghash
-L_AES_GCM_encrypt_avx2_last_block_start:
-        vmovdqu	xmm12, OWORD PTR [rdi+rbx]
-        vpshufb	xmm11, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpaddd	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_one
-        ; aesenc_gfmul_sb
-        vpclmulqdq	xmm2, xmm6, xmm5, 1
-        vpclmulqdq	xmm3, xmm6, xmm5, 16
-        vpclmulqdq	xmm1, xmm6, xmm5, 0
-        vpclmulqdq	xmm8, xmm6, xmm5, 17
-        vpxor	xmm11, xmm11, [rsi]
-        vaesenc	xmm11, xmm11, [rsi+16]
-        vpxor	xmm3, xmm3, xmm2
-        vpslldq	xmm2, xmm3, 8
-        vpsrldq	xmm3, xmm3, 8
-        vaesenc	xmm11, xmm11, [rsi+32]
-        vpxor	xmm2, xmm2, xmm1
-        vpclmulqdq	xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vaesenc	xmm11, xmm11, [rsi+48]
-        vaesenc	xmm11, xmm11, [rsi+64]
-        vaesenc	xmm11, xmm11, [rsi+80]
-        vpshufd	xmm2, xmm2, 78
-        vpxor	xmm2, xmm2, xmm1
-        vpclmulqdq	xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vaesenc	xmm11, xmm11, [rsi+96]
-        vaesenc	xmm11, xmm11, [rsi+112]
-        vaesenc	xmm11, xmm11, [rsi+128]
-        vpshufd	xmm2, xmm2, 78
-        vaesenc	xmm11, xmm11, [rsi+144]
-        vpxor	xmm8, xmm8, xmm3
-        vpxor	xmm2, xmm2, xmm8
-        vmovdqu	xmm0, OWORD PTR [rsi+160]
-        cmp	r9d, 11
-        jl	L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm11, xmm11, [rsi+176]
-        vmovdqu	xmm0, OWORD PTR [rsi+192]
-        cmp	r9d, 13
-        jl	L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm11, xmm11, [rsi+208]
-        vmovdqu	xmm0, OWORD PTR [rsi+224]
-L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last:
-        vaesenclast	xmm11, xmm11, xmm0
-        vpxor	xmm6, xmm2, xmm1
-        vpxor	xmm11, xmm11, xmm12
-        vmovdqu	OWORD PTR [r8+rbx], xmm11
-        vpshufb	xmm11, xmm11, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm11
-        add	ebx, 16
-        cmp	ebx, r13d
-        jl	L_AES_GCM_encrypt_avx2_last_block_start
-L_AES_GCM_encrypt_avx2_last_block_ghash:
-        ; ghash_gfmul_red
-        vpclmulqdq	xmm10, xmm6, xmm5, 16
-        vpclmulqdq	xmm9, xmm6, xmm5, 1
-        vpclmulqdq	xmm8, xmm6, xmm5, 0
-        vpxor	xmm10, xmm10, xmm9
-        vpslldq	xmm9, xmm10, 8
-        vpsrldq	xmm10, xmm10, 8
-        vpxor	xmm9, xmm9, xmm8
-        vpclmulqdq	xmm6, xmm6, xmm5, 17
-        vpclmulqdq	xmm8, xmm9, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm9, xmm9, 78
-        vpxor	xmm9, xmm9, xmm8
-        vpclmulqdq	xmm8, xmm9, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm9, xmm9, 78
-        vpxor	xmm6, xmm6, xmm10
-        vpxor	xmm6, xmm6, xmm9
-        vpxor	xmm6, xmm6, xmm8
-L_AES_GCM_encrypt_avx2_last_block_done:
-        mov	ecx, r10d
-        mov	edx, r10d
-        and	ecx, 15
-        jz	L_AES_GCM_encrypt_avx2_done_enc
-        ; aesenc_last15_enc
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpxor	xmm4, xmm4, [rsi]
-        vaesenc	xmm4, xmm4, [rsi+16]
-        vaesenc	xmm4, xmm4, [rsi+32]
-        vaesenc	xmm4, xmm4, [rsi+48]
-        vaesenc	xmm4, xmm4, [rsi+64]
-        vaesenc	xmm4, xmm4, [rsi+80]
-        vaesenc	xmm4, xmm4, [rsi+96]
-        vaesenc	xmm4, xmm4, [rsi+112]
-        vaesenc	xmm4, xmm4, [rsi+128]
-        vaesenc	xmm4, xmm4, [rsi+144]
-        cmp	r9d, 11
-        vmovdqu	xmm0, OWORD PTR [rsi+160]
-        jl	L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last
-        vaesenc	xmm4, xmm4, xmm0
-        vaesenc	xmm4, xmm4, [rsi+176]
-        cmp	r9d, 13
-        vmovdqu	xmm0, OWORD PTR [rsi+192]
-        jl	L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last
-        vaesenc	xmm4, xmm4, xmm0
-        vaesenc	xmm4, xmm4, [rsi+208]
-        vmovdqu	xmm0, OWORD PTR [rsi+224]
-L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last:
-        vaesenclast	xmm4, xmm4, xmm0
-        xor	ecx, ecx
-        vpxor	xmm0, xmm0, xmm0
-        vmovdqu	OWORD PTR [rsp], xmm4
-        vmovdqu	OWORD PTR [rsp+16], xmm0
-L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_loop:
-        movzx	r13d, BYTE PTR [rdi+rbx]
-        xor	r13b, BYTE PTR [rsp+rcx]
-        mov	BYTE PTR [rsp+rcx+16], r13b
-        mov	BYTE PTR [r8+rbx], r13b
-        inc	ebx
-        inc	ecx
-        cmp	ebx, edx
-        jl	L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_loop
-L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_finish_enc:
-        vmovdqu	xmm4, OWORD PTR [rsp+16]
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm4
-        ; ghash_gfmul_red
-        vpclmulqdq	xmm2, xmm6, xmm5, 16
-        vpclmulqdq	xmm1, xmm6, xmm5, 1
-        vpclmulqdq	xmm0, xmm6, xmm5, 0
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm6, xmm6, xmm5, 17
-        vpclmulqdq	xmm0, xmm1, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm6, xmm6, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        vpxor	xmm6, xmm6, xmm0
-L_AES_GCM_encrypt_avx2_done_enc:
-        ; calc_tag
-        shl	r10, 3
-        shl	r11, 3
-        vmovq	xmm0, r10
-        vmovq	xmm1, r11
-        vpunpcklqdq	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm6
-        ; ghash_gfmul_red
-        vpclmulqdq	xmm4, xmm0, xmm5, 16
-        vpclmulqdq	xmm3, xmm0, xmm5, 1
-        vpclmulqdq	xmm2, xmm0, xmm5, 0
-        vpxor	xmm4, xmm4, xmm3
-        vpslldq	xmm3, xmm4, 8
-        vpsrldq	xmm4, xmm4, 8
-        vpxor	xmm3, xmm3, xmm2
-        vpclmulqdq	xmm0, xmm0, xmm5, 17
-        vpclmulqdq	xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm3, xmm3, 78
-        vpxor	xmm3, xmm3, xmm2
-        vpclmulqdq	xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm3, xmm3, 78
-        vpxor	xmm0, xmm0, xmm4
-        vpxor	xmm0, xmm0, xmm3
-        vpxor	xmm0, xmm0, xmm2
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm0, xmm0, xmm15
-        ; store_tag
-        cmp	r14d, 16
-        je	L_AES_GCM_encrypt_avx2_store_tag_16
-        xor	rcx, rcx
-        vmovdqu	OWORD PTR [rsp], xmm0
-L_AES_GCM_encrypt_avx2_store_tag_loop:
-        movzx	r13d, BYTE PTR [rsp+rcx]
-        mov	BYTE PTR [r15+rcx], r13b
-        inc	ecx
-        cmp	ecx, r14d
-        jne	L_AES_GCM_encrypt_avx2_store_tag_loop
-        jmp	L_AES_GCM_encrypt_avx2_store_tag_done
-L_AES_GCM_encrypt_avx2_store_tag_16:
-        vmovdqu	OWORD PTR [r15], xmm0
-L_AES_GCM_encrypt_avx2_store_tag_done:
-        vzeroupper
-        vmovdqu	xmm6, OWORD PTR [rsp+160]
-        vmovdqu	xmm7, OWORD PTR [rsp+176]
-        vmovdqu	xmm8, OWORD PTR [rsp+192]
-        vmovdqu	xmm9, OWORD PTR [rsp+208]
-        vmovdqu	xmm10, OWORD PTR [rsp+224]
-        vmovdqu	xmm11, OWORD PTR [rsp+240]
-        vmovdqu	xmm12, OWORD PTR [rsp+256]
-        vmovdqu	xmm13, OWORD PTR [rsp+272]
-        vmovdqu	xmm14, OWORD PTR [rsp+288]
-        vmovdqu	xmm15, OWORD PTR [rsp+304]
-        add	rsp, 320
-        pop	rsi
-        pop	r14
-        pop	rbx
-        pop	r15
-        pop	r12
-        pop	rdi
-        pop	r13
-        ret
-AES_GCM_encrypt_avx2 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_decrypt_avx2 PROC
-        push	r13
-        push	rdi
-        push	r12
-        push	r14
-        push	rbx
-        push	r15
-        push	rsi
-        push	rbp
-        mov	rdi, rcx
-        mov	r12, r8
-        mov	rax, r9
-        mov	r14, QWORD PTR [rsp+104]
-        mov	r8, rdx
-        mov	r10d, DWORD PTR [rsp+112]
-        mov	r11d, DWORD PTR [rsp+120]
-        mov	ebx, DWORD PTR [rsp+128]
-        mov	r15d, DWORD PTR [rsp+136]
-        mov	rsi, QWORD PTR [rsp+144]
-        mov	r9d, DWORD PTR [rsp+152]
-        mov	rbp, QWORD PTR [rsp+160]
-        sub	rsp, 328
-        vmovdqu	OWORD PTR [rsp+168], xmm6
-        vmovdqu	OWORD PTR [rsp+184], xmm7
-        vmovdqu	OWORD PTR [rsp+200], xmm8
-        vmovdqu	OWORD PTR [rsp+216], xmm9
-        vmovdqu	OWORD PTR [rsp+232], xmm10
-        vmovdqu	OWORD PTR [rsp+248], xmm11
-        vmovdqu	OWORD PTR [rsp+264], xmm12
-        vmovdqu	OWORD PTR [rsp+280], xmm13
-        vmovdqu	OWORD PTR [rsp+296], xmm14
-        vmovdqu	OWORD PTR [rsp+312], xmm15
-        vpxor	xmm4, xmm4, xmm4
-        vpxor	xmm6, xmm6, xmm6
-        mov	edx, ebx
-        cmp	edx, 12
-        je	L_AES_GCM_decrypt_avx2_iv_12
-        ; Calculate values when IV is not 12 bytes
-        ; H = Encrypt X(=0)
-        vmovdqu	xmm5, OWORD PTR [rsi]
-        vaesenc	xmm5, xmm5, [rsi+16]
-        vaesenc	xmm5, xmm5, [rsi+32]
-        vaesenc	xmm5, xmm5, [rsi+48]
-        vaesenc	xmm5, xmm5, [rsi+64]
-        vaesenc	xmm5, xmm5, [rsi+80]
-        vaesenc	xmm5, xmm5, [rsi+96]
-        vaesenc	xmm5, xmm5, [rsi+112]
-        vaesenc	xmm5, xmm5, [rsi+128]
-        vaesenc	xmm5, xmm5, [rsi+144]
-        cmp	r9d, 11
-        vmovdqu	xmm0, OWORD PTR [rsi+160]
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm5, xmm5, [rsi+176]
-        cmp	r9d, 13
-        vmovdqu	xmm0, OWORD PTR [rsi+192]
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm5, xmm5, [rsi+208]
-        vmovdqu	xmm0, OWORD PTR [rsi+224]
-L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last:
-        vaesenclast	xmm5, xmm5, xmm0
-        vpshufb	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        ; Calc counter
-        ; Initialization vector
-        cmp	edx, 0
-        mov	rcx, 0
-        je	L_AES_GCM_decrypt_avx2_calc_iv_done
-        cmp	edx, 16
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_lt16
-        and	edx, 4294967280
-L_AES_GCM_decrypt_avx2_calc_iv_16_loop:
-        vmovdqu	xmm0, OWORD PTR [rax+rcx]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm4, xmm4, xmm0
-        ; ghash_gfmul_avx
-        vpclmulqdq	xmm2, xmm5, xmm4, 16
-        vpclmulqdq	xmm1, xmm5, xmm4, 1
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm7, xmm0, xmm1
-        vpxor	xmm4, xmm3, xmm2
-        ; ghash_mid
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm4, xmm4, xmm1
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm7, xmm2, 16
-        vpshufd	xmm1, xmm7, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm4, xmm4, xmm1
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_16_loop
-        mov	edx, ebx
-        cmp	ecx, edx
-        je	L_AES_GCM_decrypt_avx2_calc_iv_done
-L_AES_GCM_decrypt_avx2_calc_iv_lt16:
-        vpxor	xmm0, xmm0, xmm0
-        xor	ebx, ebx
-        vmovdqu	OWORD PTR [rsp], xmm0
-L_AES_GCM_decrypt_avx2_calc_iv_loop:
-        movzx	r13d, BYTE PTR [rax+rcx]
-        mov	BYTE PTR [rsp+rbx], r13b
-        inc	ecx
-        inc	ebx
-        cmp	ecx, edx
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_loop
-        vmovdqu	xmm0, OWORD PTR [rsp]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm4, xmm4, xmm0
-        ; ghash_gfmul_avx
-        vpclmulqdq	xmm2, xmm5, xmm4, 16
-        vpclmulqdq	xmm1, xmm5, xmm4, 1
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm7, xmm0, xmm1
-        vpxor	xmm4, xmm3, xmm2
-        ; ghash_mid
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm4, xmm4, xmm1
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm7, xmm2, 16
-        vpshufd	xmm1, xmm7, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm4, xmm4, xmm1
-L_AES_GCM_decrypt_avx2_calc_iv_done:
-        ; T = Encrypt counter
-        vpxor	xmm0, xmm0, xmm0
-        shl	edx, 3
-        vmovq	xmm0, rdx
-        vpxor	xmm4, xmm4, xmm0
-        ; ghash_gfmul_avx
-        vpclmulqdq	xmm2, xmm5, xmm4, 16
-        vpclmulqdq	xmm1, xmm5, xmm4, 1
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm7, xmm0, xmm1
-        vpxor	xmm4, xmm3, xmm2
-        ; ghash_mid
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm4, xmm4, xmm1
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm7, xmm2, 16
-        vpshufd	xmm1, xmm7, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm4, xmm4, xmm1
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        ;   Encrypt counter
-        vmovdqu	xmm15, OWORD PTR [rsi]
-        vpxor	xmm15, xmm15, xmm4
-        vaesenc	xmm15, xmm15, [rsi+16]
-        vaesenc	xmm15, xmm15, [rsi+32]
-        vaesenc	xmm15, xmm15, [rsi+48]
-        vaesenc	xmm15, xmm15, [rsi+64]
-        vaesenc	xmm15, xmm15, [rsi+80]
-        vaesenc	xmm15, xmm15, [rsi+96]
-        vaesenc	xmm15, xmm15, [rsi+112]
-        vaesenc	xmm15, xmm15, [rsi+128]
-        vaesenc	xmm15, xmm15, [rsi+144]
-        cmp	r9d, 11
-        vmovdqu	xmm0, OWORD PTR [rsi+160]
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last
-        vaesenc	xmm15, xmm15, xmm0
-        vaesenc	xmm15, xmm15, [rsi+176]
-        cmp	r9d, 13
-        vmovdqu	xmm0, OWORD PTR [rsi+192]
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last
-        vaesenc	xmm15, xmm15, xmm0
-        vaesenc	xmm15, xmm15, [rsi+208]
-        vmovdqu	xmm0, OWORD PTR [rsi+224]
-L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last:
-        vaesenclast	xmm15, xmm15, xmm0
-        jmp	L_AES_GCM_decrypt_avx2_iv_done
-L_AES_GCM_decrypt_avx2_iv_12:
-        ; # Calculate values when IV is 12 bytes
-        ; Set counter based on IV
-        vmovdqu	xmm4, OWORD PTR L_avx2_aes_gcm_bswap_one
-        vmovdqu	xmm5, OWORD PTR [rsi]
-        vpblendd	xmm4, xmm4, [rax], 7
-        ; H = Encrypt X(=0) and T = Encrypt counter
-        vmovdqu	xmm7, OWORD PTR [rsi+16]
-        vpxor	xmm15, xmm4, xmm5
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm0, OWORD PTR [rsi+32]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+48]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+64]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+80]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+96]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+112]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+128]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+144]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        cmp	r9d, 11
-        vmovdqu	xmm0, OWORD PTR [rsi+160]
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_12_last
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+176]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        cmp	r9d, 13
-        vmovdqu	xmm0, OWORD PTR [rsi+192]
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_12_last
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+208]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+224]
-L_AES_GCM_decrypt_avx2_calc_iv_12_last:
-        vaesenclast	xmm5, xmm5, xmm0
-        vaesenclast	xmm15, xmm15, xmm0
-        vpshufb	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_bswap_mask
-L_AES_GCM_decrypt_avx2_iv_done:
-        ; Additional authentication data
-        mov	edx, r11d
-        cmp	edx, 0
-        je	L_AES_GCM_decrypt_avx2_calc_aad_done
-        xor	ecx, ecx
-        cmp	edx, 16
-        jl	L_AES_GCM_decrypt_avx2_calc_aad_lt16
-        and	edx, 4294967280
-L_AES_GCM_decrypt_avx2_calc_aad_16_loop:
-        vmovdqu	xmm0, OWORD PTR [r12+rcx]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm0
-        ; ghash_gfmul_avx
-        vpclmulqdq	xmm2, xmm5, xmm6, 16
-        vpclmulqdq	xmm1, xmm5, xmm6, 1
-        vpclmulqdq	xmm0, xmm5, xmm6, 0
-        vpclmulqdq	xmm3, xmm5, xmm6, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm7, xmm0, xmm1
-        vpxor	xmm6, xmm3, xmm2
-        ; ghash_mid
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm6, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm6, xmm6, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm6, xmm6, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm6, xmm6, xmm1
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm7, xmm2, 16
-        vpshufd	xmm1, xmm7, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm6, xmm6, xmm1
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_decrypt_avx2_calc_aad_16_loop
-        mov	edx, r11d
-        cmp	ecx, edx
-        je	L_AES_GCM_decrypt_avx2_calc_aad_done
-L_AES_GCM_decrypt_avx2_calc_aad_lt16:
-        vpxor	xmm0, xmm0, xmm0
-        xor	ebx, ebx
-        vmovdqu	OWORD PTR [rsp], xmm0
-L_AES_GCM_decrypt_avx2_calc_aad_loop:
-        movzx	r13d, BYTE PTR [r12+rcx]
-        mov	BYTE PTR [rsp+rbx], r13b
-        inc	ecx
-        inc	ebx
-        cmp	ecx, edx
-        jl	L_AES_GCM_decrypt_avx2_calc_aad_loop
-        vmovdqu	xmm0, OWORD PTR [rsp]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm0
-        ; ghash_gfmul_avx
-        vpclmulqdq	xmm2, xmm5, xmm6, 16
-        vpclmulqdq	xmm1, xmm5, xmm6, 1
-        vpclmulqdq	xmm0, xmm5, xmm6, 0
-        vpclmulqdq	xmm3, xmm5, xmm6, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm7, xmm0, xmm1
-        vpxor	xmm6, xmm3, xmm2
-        ; ghash_mid
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm6, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm6, xmm6, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm6, xmm6, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm6, xmm6, xmm1
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm7, xmm2, 16
-        vpshufd	xmm1, xmm7, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm6, xmm6, xmm1
-L_AES_GCM_decrypt_avx2_calc_aad_done:
-        ; Calculate counter and H
-        vpsrlq	xmm1, xmm5, 63
-        vpsllq	xmm0, xmm5, 1
-        vpslldq	xmm1, xmm1, 8
-        vpor	xmm0, xmm0, xmm1
-        vpshufd	xmm5, xmm5, 255
-        vpsrad	xmm5, xmm5, 31
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpand	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpaddd	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_one
-        vpxor	xmm5, xmm5, xmm0
-        xor	ebx, ebx
-        cmp	r10d, 128
-        mov	r13d, r10d
-        jl	L_AES_GCM_decrypt_avx2_done_128
-        and	r13d, 4294967168
-        vmovdqu	OWORD PTR [rsp+128], xmm4
-        vmovdqu	OWORD PTR [rsp+144], xmm15
-        vmovdqu	xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128
-        ; H ^ 1 and H ^ 2
-        vpclmulqdq	xmm9, xmm5, xmm5, 0
-        vpclmulqdq	xmm10, xmm5, xmm5, 17
-        vpclmulqdq	xmm8, xmm9, xmm3, 16
-        vpshufd	xmm9, xmm9, 78
-        vpxor	xmm9, xmm9, xmm8
-        vpclmulqdq	xmm8, xmm9, xmm3, 16
-        vpshufd	xmm9, xmm9, 78
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm0, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp], xmm5
-        vmovdqu	OWORD PTR [rsp+16], xmm0
-        ; H ^ 3 and H ^ 4
-        vpclmulqdq	xmm11, xmm0, xmm5, 16
-        vpclmulqdq	xmm10, xmm0, xmm5, 1
-        vpclmulqdq	xmm9, xmm0, xmm5, 0
-        vpclmulqdq	xmm12, xmm0, xmm5, 17
-        vpclmulqdq	xmm13, xmm0, xmm0, 0
-        vpclmulqdq	xmm14, xmm0, xmm0, 17
-        vpxor	xmm11, xmm11, xmm10
-        vpslldq	xmm10, xmm11, 8
-        vpsrldq	xmm11, xmm11, 8
-        vpxor	xmm10, xmm10, xmm9
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm10, xmm10, xmm9
-        vpxor	xmm13, xmm13, xmm8
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm12, xmm12, xmm11
-        vpxor	xmm13, xmm13, xmm8
-        vpxor	xmm10, xmm10, xmm12
-        vpxor	xmm2, xmm13, xmm14
-        vpxor	xmm1, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp+32], xmm1
-        vmovdqu	OWORD PTR [rsp+48], xmm2
-        ; H ^ 5 and H ^ 6
-        vpclmulqdq	xmm11, xmm1, xmm0, 16
-        vpclmulqdq	xmm10, xmm1, xmm0, 1
-        vpclmulqdq	xmm9, xmm1, xmm0, 0
-        vpclmulqdq	xmm12, xmm1, xmm0, 17
-        vpclmulqdq	xmm13, xmm1, xmm1, 0
-        vpclmulqdq	xmm14, xmm1, xmm1, 17
-        vpxor	xmm11, xmm11, xmm10
-        vpslldq	xmm10, xmm11, 8
-        vpsrldq	xmm11, xmm11, 8
-        vpxor	xmm10, xmm10, xmm9
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm10, xmm10, xmm9
-        vpxor	xmm13, xmm13, xmm8
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm12, xmm12, xmm11
-        vpxor	xmm13, xmm13, xmm8
-        vpxor	xmm10, xmm10, xmm12
-        vpxor	xmm0, xmm13, xmm14
-        vpxor	xmm7, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp+64], xmm7
-        vmovdqu	OWORD PTR [rsp+80], xmm0
-        ; H ^ 7 and H ^ 8
-        vpclmulqdq	xmm11, xmm2, xmm1, 16
-        vpclmulqdq	xmm10, xmm2, xmm1, 1
-        vpclmulqdq	xmm9, xmm2, xmm1, 0
-        vpclmulqdq	xmm12, xmm2, xmm1, 17
-        vpclmulqdq	xmm13, xmm2, xmm2, 0
-        vpclmulqdq	xmm14, xmm2, xmm2, 17
-        vpxor	xmm11, xmm11, xmm10
-        vpslldq	xmm10, xmm11, 8
-        vpsrldq	xmm11, xmm11, 8
-        vpxor	xmm10, xmm10, xmm9
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm10, xmm10, xmm9
-        vpxor	xmm13, xmm13, xmm8
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm12, xmm12, xmm11
-        vpxor	xmm13, xmm13, xmm8
-        vpxor	xmm10, xmm10, xmm12
-        vpxor	xmm0, xmm13, xmm14
-        vpxor	xmm7, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp+96], xmm7
-        vmovdqu	OWORD PTR [rsp+112], xmm0
-L_AES_GCM_decrypt_avx2_ghash_128:
-        ; aesenc_128_ghash
-        lea	rcx, QWORD PTR [rdi+rbx]
-        lea	rdx, QWORD PTR [r8+rbx]
-        ; aesenc_ctr
-        vmovdqu	xmm0, OWORD PTR [rsp+128]
-        vmovdqu	xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpaddd	xmm9, xmm0, OWORD PTR L_avx2_aes_gcm_one
-        vpshufb	xmm8, xmm0, xmm1
-        vpaddd	xmm10, xmm0, OWORD PTR L_avx2_aes_gcm_two
-        vpshufb	xmm9, xmm9, xmm1
-        vpaddd	xmm11, xmm0, OWORD PTR L_avx2_aes_gcm_three
-        vpshufb	xmm10, xmm10, xmm1
-        vpaddd	xmm12, xmm0, OWORD PTR L_avx2_aes_gcm_four
-        vpshufb	xmm11, xmm11, xmm1
-        vpaddd	xmm13, xmm0, OWORD PTR L_avx2_aes_gcm_five
-        vpshufb	xmm12, xmm12, xmm1
-        vpaddd	xmm14, xmm0, OWORD PTR L_avx2_aes_gcm_six
-        vpshufb	xmm13, xmm13, xmm1
-        vpaddd	xmm15, xmm0, OWORD PTR L_avx2_aes_gcm_seven
-        vpshufb	xmm14, xmm14, xmm1
-        vpaddd	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_eight
-        vpshufb	xmm15, xmm15, xmm1
-        ; aesenc_xor
-        vmovdqu	xmm7, OWORD PTR [rsi]
-        vmovdqu	OWORD PTR [rsp+128], xmm0
-        vpxor	xmm8, xmm8, xmm7
-        vpxor	xmm9, xmm9, xmm7
-        vpxor	xmm10, xmm10, xmm7
-        vpxor	xmm11, xmm11, xmm7
-        vpxor	xmm12, xmm12, xmm7
-        vpxor	xmm13, xmm13, xmm7
-        vpxor	xmm14, xmm14, xmm7
-        vpxor	xmm15, xmm15, xmm7
-        ; aesenc_pclmul_1
-        vmovdqu	xmm1, OWORD PTR [rcx]
-        vmovdqu	xmm0, OWORD PTR [rsi+16]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vmovdqu	xmm2, OWORD PTR [rsp+112]
-        vpxor	xmm1, xmm1, xmm6
-        vpclmulqdq	xmm5, xmm1, xmm2, 16
-        vpclmulqdq	xmm3, xmm1, xmm2, 1
-        vpclmulqdq	xmm6, xmm1, xmm2, 0
-        vpclmulqdq	xmm7, xmm1, xmm2, 17
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_2
-        vmovdqu	xmm1, OWORD PTR [rcx+16]
-        vmovdqu	xmm0, OWORD PTR [rsp+96]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rsi+32]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rcx+32]
-        vmovdqu	xmm0, OWORD PTR [rsp+80]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rsi+48]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rcx+48]
-        vmovdqu	xmm0, OWORD PTR [rsp+64]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rsi+64]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rcx+64]
-        vmovdqu	xmm0, OWORD PTR [rsp+48]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rsi+80]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rcx+80]
-        vmovdqu	xmm0, OWORD PTR [rsp+32]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rsi+96]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rcx+96]
-        vmovdqu	xmm0, OWORD PTR [rsp+16]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rsi+112]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rcx+112]
-        vmovdqu	xmm0, OWORD PTR [rsp]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rsi+128]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_l
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm6, xmm6, xmm4
-        vpxor	xmm5, xmm5, xmm3
-        vpslldq	xmm1, xmm5, 8
-        vpsrldq	xmm5, xmm5, 8
-        vmovdqu	xmm4, OWORD PTR [rsi+144]
-        vmovdqu	xmm0, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vaesenc	xmm8, xmm8, xmm4
-        vpxor	xmm6, xmm6, xmm1
-        vpxor	xmm7, xmm7, xmm5
-        vpclmulqdq	xmm3, xmm6, xmm0, 16
-        vaesenc	xmm9, xmm9, xmm4
-        vaesenc	xmm10, xmm10, xmm4
-        vaesenc	xmm11, xmm11, xmm4
-        vpshufd	xmm6, xmm6, 78
-        vpxor	xmm6, xmm6, xmm3
-        vpclmulqdq	xmm3, xmm6, xmm0, 16
-        vaesenc	xmm12, xmm12, xmm4
-        vaesenc	xmm13, xmm13, xmm4
-        vaesenc	xmm14, xmm14, xmm4
-        vpshufd	xmm6, xmm6, 78
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm6, xmm6, xmm7
-        vaesenc	xmm15, xmm15, xmm4
-        cmp	r9d, 11
-        vmovdqu	xmm7, OWORD PTR [rsi+160]
-        jl	L_AES_GCM_decrypt_avx2_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+176]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r9d, 13
-        vmovdqu	xmm7, OWORD PTR [rsi+192]
-        jl	L_AES_GCM_decrypt_avx2_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+208]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+224]
-L_AES_GCM_decrypt_avx2_aesenc_128_ghash_avx_done:
-        ; aesenc_last
-        vaesenclast	xmm8, xmm8, xmm7
-        vaesenclast	xmm9, xmm9, xmm7
-        vaesenclast	xmm10, xmm10, xmm7
-        vaesenclast	xmm11, xmm11, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx]
-        vmovdqu	xmm1, OWORD PTR [rcx+16]
-        vmovdqu	xmm2, OWORD PTR [rcx+32]
-        vmovdqu	xmm3, OWORD PTR [rcx+48]
-        vpxor	xmm8, xmm8, xmm0
-        vpxor	xmm9, xmm9, xmm1
-        vpxor	xmm10, xmm10, xmm2
-        vpxor	xmm11, xmm11, xmm3
-        vmovdqu	OWORD PTR [rdx], xmm8
-        vmovdqu	OWORD PTR [rdx+16], xmm9
-        vmovdqu	OWORD PTR [rdx+32], xmm10
-        vmovdqu	OWORD PTR [rdx+48], xmm11
-        vaesenclast	xmm12, xmm12, xmm7
-        vaesenclast	xmm13, xmm13, xmm7
-        vaesenclast	xmm14, xmm14, xmm7
-        vaesenclast	xmm15, xmm15, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+64]
-        vmovdqu	xmm1, OWORD PTR [rcx+80]
-        vmovdqu	xmm2, OWORD PTR [rcx+96]
-        vmovdqu	xmm3, OWORD PTR [rcx+112]
-        vpxor	xmm12, xmm12, xmm0
-        vpxor	xmm13, xmm13, xmm1
-        vpxor	xmm14, xmm14, xmm2
-        vpxor	xmm15, xmm15, xmm3
-        vmovdqu	OWORD PTR [rdx+64], xmm12
-        vmovdqu	OWORD PTR [rdx+80], xmm13
-        vmovdqu	OWORD PTR [rdx+96], xmm14
-        vmovdqu	OWORD PTR [rdx+112], xmm15
-        ; aesenc_128_ghash - end
-        add	ebx, 128
-        cmp	ebx, r13d
-        jl	L_AES_GCM_decrypt_avx2_ghash_128
-        vmovdqu	xmm5, OWORD PTR [rsp]
-        vmovdqu	xmm4, OWORD PTR [rsp+128]
-        vmovdqu	xmm15, OWORD PTR [rsp+144]
-L_AES_GCM_decrypt_avx2_done_128:
-        cmp	ebx, r10d
-        jge	L_AES_GCM_decrypt_avx2_done_dec
-        mov	r13d, r10d
-        and	r13d, 4294967280
-        cmp	ebx, r13d
-        jge	L_AES_GCM_decrypt_avx2_last_block_done
-L_AES_GCM_decrypt_avx2_last_block_start:
-        vmovdqu	xmm11, OWORD PTR [rdi+rbx]
-        vpshufb	xmm10, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpshufb	xmm12, xmm11, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpaddd	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_one
-        vpxor	xmm12, xmm12, xmm6
-        ; aesenc_gfmul_sb
-        vpclmulqdq	xmm2, xmm12, xmm5, 1
-        vpclmulqdq	xmm3, xmm12, xmm5, 16
-        vpclmulqdq	xmm1, xmm12, xmm5, 0
-        vpclmulqdq	xmm8, xmm12, xmm5, 17
-        vpxor	xmm10, xmm10, [rsi]
-        vaesenc	xmm10, xmm10, [rsi+16]
-        vpxor	xmm3, xmm3, xmm2
-        vpslldq	xmm2, xmm3, 8
-        vpsrldq	xmm3, xmm3, 8
-        vaesenc	xmm10, xmm10, [rsi+32]
-        vpxor	xmm2, xmm2, xmm1
-        vpclmulqdq	xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vaesenc	xmm10, xmm10, [rsi+48]
-        vaesenc	xmm10, xmm10, [rsi+64]
-        vaesenc	xmm10, xmm10, [rsi+80]
-        vpshufd	xmm2, xmm2, 78
-        vpxor	xmm2, xmm2, xmm1
-        vpclmulqdq	xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vaesenc	xmm10, xmm10, [rsi+96]
-        vaesenc	xmm10, xmm10, [rsi+112]
-        vaesenc	xmm10, xmm10, [rsi+128]
-        vpshufd	xmm2, xmm2, 78
-        vaesenc	xmm10, xmm10, [rsi+144]
-        vpxor	xmm8, xmm8, xmm3
-        vpxor	xmm2, xmm2, xmm8
-        vmovdqu	xmm0, OWORD PTR [rsi+160]
-        cmp	r9d, 11
-        jl	L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm10, xmm10, [rsi+176]
-        vmovdqu	xmm0, OWORD PTR [rsi+192]
-        cmp	r9d, 13
-        jl	L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm10, xmm10, [rsi+208]
-        vmovdqu	xmm0, OWORD PTR [rsi+224]
-L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last:
-        vaesenclast	xmm10, xmm10, xmm0
-        vpxor	xmm6, xmm2, xmm1
-        vpxor	xmm10, xmm10, xmm11
-        vmovdqu	OWORD PTR [r8+rbx], xmm10
-        add	ebx, 16
-        cmp	ebx, r13d
-        jl	L_AES_GCM_decrypt_avx2_last_block_start
-L_AES_GCM_decrypt_avx2_last_block_done:
-        mov	ecx, r10d
-        mov	edx, r10d
-        and	ecx, 15
-        jz	L_AES_GCM_decrypt_avx2_done_dec
-        ; aesenc_last15_dec
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpxor	xmm4, xmm4, [rsi]
-        vaesenc	xmm4, xmm4, [rsi+16]
-        vaesenc	xmm4, xmm4, [rsi+32]
-        vaesenc	xmm4, xmm4, [rsi+48]
-        vaesenc	xmm4, xmm4, [rsi+64]
-        vaesenc	xmm4, xmm4, [rsi+80]
-        vaesenc	xmm4, xmm4, [rsi+96]
-        vaesenc	xmm4, xmm4, [rsi+112]
-        vaesenc	xmm4, xmm4, [rsi+128]
-        vaesenc	xmm4, xmm4, [rsi+144]
-        cmp	r9d, 11
-        vmovdqu	xmm1, OWORD PTR [rsi+160]
-        jl	L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last
-        vaesenc	xmm4, xmm4, xmm1
-        vaesenc	xmm4, xmm4, [rsi+176]
-        cmp	r9d, 13
-        vmovdqu	xmm1, OWORD PTR [rsi+192]
-        jl	L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last
-        vaesenc	xmm4, xmm4, xmm1
-        vaesenc	xmm4, xmm4, [rsi+208]
-        vmovdqu	xmm1, OWORD PTR [rsi+224]
-L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last:
-        vaesenclast	xmm4, xmm4, xmm1
-        xor	ecx, ecx
-        vpxor	xmm0, xmm0, xmm0
-        vmovdqu	OWORD PTR [rsp], xmm4
-        vmovdqu	OWORD PTR [rsp+16], xmm0
-L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_loop:
-        movzx	r13d, BYTE PTR [rdi+rbx]
-        mov	BYTE PTR [rsp+rcx+16], r13b
-        xor	r13b, BYTE PTR [rsp+rcx]
-        mov	BYTE PTR [r8+rbx], r13b
-        inc	ebx
-        inc	ecx
-        cmp	ebx, edx
-        jl	L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_loop
-        vmovdqu	xmm4, OWORD PTR [rsp+16]
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm4
-        ; ghash_gfmul_red
-        vpclmulqdq	xmm2, xmm6, xmm5, 16
-        vpclmulqdq	xmm1, xmm6, xmm5, 1
-        vpclmulqdq	xmm0, xmm6, xmm5, 0
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm6, xmm6, xmm5, 17
-        vpclmulqdq	xmm0, xmm1, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm6, xmm6, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        vpxor	xmm6, xmm6, xmm0
-L_AES_GCM_decrypt_avx2_done_dec:
-        ; calc_tag
-        shl	r10, 3
-        shl	r11, 3
-        vmovq	xmm0, r10
-        vmovq	xmm1, r11
-        vpunpcklqdq	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm6
-        ; ghash_gfmul_red
-        vpclmulqdq	xmm4, xmm0, xmm5, 16
-        vpclmulqdq	xmm3, xmm0, xmm5, 1
-        vpclmulqdq	xmm2, xmm0, xmm5, 0
-        vpxor	xmm4, xmm4, xmm3
-        vpslldq	xmm3, xmm4, 8
-        vpsrldq	xmm4, xmm4, 8
-        vpxor	xmm3, xmm3, xmm2
-        vpclmulqdq	xmm0, xmm0, xmm5, 17
-        vpclmulqdq	xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm3, xmm3, 78
-        vpxor	xmm3, xmm3, xmm2
-        vpclmulqdq	xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm3, xmm3, 78
-        vpxor	xmm0, xmm0, xmm4
-        vpxor	xmm0, xmm0, xmm3
-        vpxor	xmm0, xmm0, xmm2
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm0, xmm0, xmm15
-        ; cmp_tag
-        cmp	r15d, 16
-        je	L_AES_GCM_decrypt_avx2_cmp_tag_16
-        xor	rdx, rdx
-        xor	rax, rax
-        vmovdqu	OWORD PTR [rsp], xmm0
-L_AES_GCM_decrypt_avx2_cmp_tag_loop:
-        movzx	r13d, BYTE PTR [rsp+rdx]
-        xor	r13b, BYTE PTR [r14+rdx]
-        or	al, r13b
-        inc	edx
-        cmp	edx, r15d
-        jne	L_AES_GCM_decrypt_avx2_cmp_tag_loop
-        cmp	rax, 0
-        sete	al
-        jmp	L_AES_GCM_decrypt_avx2_cmp_tag_done
-L_AES_GCM_decrypt_avx2_cmp_tag_16:
-        vmovdqu	xmm1, OWORD PTR [r14]
-        vpcmpeqb	xmm0, xmm0, xmm1
-        vpmovmskb	rdx, xmm0
-        ; %%edx == 0xFFFF then return 1 else => return 0
-        xor	eax, eax
-        cmp	edx, 65535
-        sete	al
-L_AES_GCM_decrypt_avx2_cmp_tag_done:
-        mov	DWORD PTR [rbp], eax
-        vzeroupper
-        vmovdqu	xmm6, OWORD PTR [rsp+168]
-        vmovdqu	xmm7, OWORD PTR [rsp+184]
-        vmovdqu	xmm8, OWORD PTR [rsp+200]
-        vmovdqu	xmm9, OWORD PTR [rsp+216]
-        vmovdqu	xmm10, OWORD PTR [rsp+232]
-        vmovdqu	xmm11, OWORD PTR [rsp+248]
-        vmovdqu	xmm12, OWORD PTR [rsp+264]
-        vmovdqu	xmm13, OWORD PTR [rsp+280]
-        vmovdqu	xmm14, OWORD PTR [rsp+296]
-        vmovdqu	xmm15, OWORD PTR [rsp+312]
-        add	rsp, 328
-        pop	rbp
-        pop	rsi
-        pop	r15
-        pop	rbx
-        pop	r14
-        pop	r12
-        pop	rdi
-        pop	r13
-        ret
-AES_GCM_decrypt_avx2 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_init_avx2 PROC	; From an AES key schedule and an IV: computes H, the initial counter block and the encrypted counter block; arguments arrive in rcx, rdx, r8, r9 plus three stack slots
-        push	rbx	; rbx, rdi, rsi and r12 are overwritten below, so save them first
-        push	rdi
-        push	rsi
-        push	r12
-        mov	rdi, rcx	; rdi = arg1: base of the round-key array (read at [rdi + 16*k])
-        mov	rsi, rdx	; esi = arg2: round count (compared against 11 and 13 below)
-        mov	r10, r8	; r10 = arg3: pointer to the IV bytes
-        mov	r11d, r9d	; r11d = arg4: IV length in bytes
-        mov	rax, QWORD PTR [rsp+72]	; rax = arg5: receives H (4 pushes + return address + 32-byte shadow space = 72)
-        mov	r8, QWORD PTR [rsp+80]	; r8 = arg6: receives the counter block
-        mov	r9, QWORD PTR [rsp+88]	; r9 = arg7: receives the encrypted initial counter
-        sub	rsp, 48	; [rsp] = 16-byte scratch block, [rsp+16] / [rsp+32] = saved xmm6 / xmm7
-        vmovdqu	OWORD PTR [rsp+16], xmm6
-        vmovdqu	OWORD PTR [rsp+32], xmm7
-        vpxor	xmm4, xmm4, xmm4	; xmm4 = 0: running hash for the non-12-byte IV path
-        mov	edx, r11d
-        cmp	edx, 12
-        je	L_AES_GCM_init_avx2_iv_12	; 12-byte IV takes the short path that needs no hashing
-        ; Calculate values when IV is not 12 bytes
-        ; H = Encrypt X(=0)
-        vmovdqu	xmm5, OWORD PTR [rdi]	; zero block XOR round key 0 is just round key 0
-        vaesenc	xmm5, xmm5, [rdi+16]
-        vaesenc	xmm5, xmm5, [rdi+32]
-        vaesenc	xmm5, xmm5, [rdi+48]
-        vaesenc	xmm5, xmm5, [rdi+64]
-        vaesenc	xmm5, xmm5, [rdi+80]
-        vaesenc	xmm5, xmm5, [rdi+96]
-        vaesenc	xmm5, xmm5, [rdi+112]
-        vaesenc	xmm5, xmm5, [rdi+128]
-        vaesenc	xmm5, xmm5, [rdi+144]
-        cmp	esi, 11
-        vmovdqu	xmm0, OWORD PTR [rdi+160]	; vmovdqu leaves the flags from cmp untouched
-        jl	L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last	; fewer than 11 rounds: [rdi+160] is the final round key
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm5, xmm5, [rdi+176]
-        cmp	esi, 13
-        vmovdqu	xmm0, OWORD PTR [rdi+192]
-        jl	L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last	; fewer than 13 rounds: [rdi+192] is the final round key
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm5, xmm5, [rdi+208]
-        vmovdqu	xmm0, OWORD PTR [rdi+224]	; otherwise [rdi+224] is the final round key
-L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last:
-        vaesenclast	xmm5, xmm5, xmm0
-        vpshufb	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_bswap_mask	; xmm5 = H, byte-shuffled with the bswap mask (constant defined outside this procedure)
-        ; Calc counter
-        ; Initialization vector
-        cmp	edx, 0
-        mov	rcx, 0	; rcx = byte offset into the IV; mov (unlike xor) keeps the flags from the cmp above for the je
-        je	L_AES_GCM_init_avx2_calc_iv_done	; empty IV: only the length block is hashed
-        cmp	edx, 16
-        jl	L_AES_GCM_init_avx2_calc_iv_lt16
-        and	edx, 4294967280	; 0xFFFFFFF0: round the length down to whole 16-byte blocks
-L_AES_GCM_init_avx2_calc_iv_16_loop:
-        vmovdqu	xmm0, OWORD PTR [r10+rcx]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm4, xmm4, xmm0	; fold the next IV block into the running hash
-        ; ghash_gfmul_avx: 128x128 carry-less multiply of xmm5 (H) by xmm4
-        vpclmulqdq	xmm2, xmm5, xmm4, 16	; cross product
-        vpclmulqdq	xmm1, xmm5, xmm4, 1	; other cross product
-        vpclmulqdq	xmm0, xmm5, xmm4, 0	; low x low
-        vpclmulqdq	xmm3, xmm5, xmm4, 17	; high x high
-        vpxor	xmm2, xmm2, xmm1	; xmm2 = middle term
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm6, xmm0, xmm1	; xmm6 = low 128 bits of the product
-        vpxor	xmm4, xmm3, xmm2	; xmm4 = high 128 bits of the product
-        ; ghash_mid: shift the 256-bit value xmm4:xmm6 left by one bit
-        vpsrld	xmm0, xmm6, 31	; top bit of every dword = carry into the next dword
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm6, xmm6, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12	; carry out of the low half moves into the high half
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm6, xmm6, xmm0
-        vpor	xmm4, xmm4, xmm1
-        ; ghash_red: fold the low half (xmm6) into the high half (xmm4)
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm6, xmm2, 16
-        vpshufd	xmm1, xmm6, 78	; 78 = 0x4E: swap the two 64-bit halves
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm4, xmm4, xmm1	; xmm4 = updated running hash
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_init_avx2_calc_iv_16_loop	; NOTE(review): signed compare on a byte count - confirm lengths stay below 2^31
-        mov	edx, r11d	; restore the unrounded IV length
-        cmp	ecx, edx
-        je	L_AES_GCM_init_avx2_calc_iv_done	; IV was a whole number of blocks
-L_AES_GCM_init_avx2_calc_iv_lt16:
-        vpxor	xmm0, xmm0, xmm0
-        xor	ebx, ebx	; ebx = write index into the scratch block
-        vmovdqu	OWORD PTR [rsp], xmm0	; zero the scratch block so the partial block is zero-padded
-L_AES_GCM_init_avx2_calc_iv_loop:
-        movzx	r12d, BYTE PTR [r10+rcx]	; copy the remaining IV bytes one at a time
-        mov	BYTE PTR [rsp+rbx], r12b
-        inc	ecx
-        inc	ebx
-        cmp	ecx, edx
-        jl	L_AES_GCM_init_avx2_calc_iv_loop
-        vmovdqu	xmm0, OWORD PTR [rsp]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm4, xmm4, xmm0	; fold the padded final block into the running hash
-        ; ghash_gfmul_avx: same multiply / shift / reduce sequence as in the 16-byte loop
-        vpclmulqdq	xmm2, xmm5, xmm4, 16
-        vpclmulqdq	xmm1, xmm5, xmm4, 1
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm6, xmm0, xmm1
-        vpxor	xmm4, xmm3, xmm2
-        ; ghash_mid
-        vpsrld	xmm0, xmm6, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm6, xmm6, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm6, xmm6, xmm0
-        vpor	xmm4, xmm4, xmm1
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm6, xmm2, 16
-        vpshufd	xmm1, xmm6, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm4, xmm4, xmm1
-L_AES_GCM_init_avx2_calc_iv_done:
-        ; T = Encrypt counter
-        vpxor	xmm0, xmm0, xmm0
-        shl	edx, 3	; IV length in bytes -> length in bits (32-bit write also clears the upper half of rdx)
-        vmovq	xmm0, rdx	; xmm0 = bit length in the low qword, zero in the high qword
-        vpxor	xmm4, xmm4, xmm0	; fold the length block into the running hash
-        ; ghash_gfmul_avx: final multiply / shift / reduce for the length block
-        vpclmulqdq	xmm2, xmm5, xmm4, 16
-        vpclmulqdq	xmm1, xmm5, xmm4, 1
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm6, xmm0, xmm1
-        vpxor	xmm4, xmm3, xmm2
-        ; ghash_mid
-        vpsrld	xmm0, xmm6, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm6, xmm6, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm6, xmm6, xmm0
-        vpor	xmm4, xmm4, xmm1
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm6, xmm2, 16
-        vpshufd	xmm1, xmm6, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm4, xmm4, xmm1
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_mask	; xmm4 = hashed IV, used as the initial counter block
-        ;   Encrypt counter
-        vmovdqu	xmm7, OWORD PTR [rdi]
-        vpxor	xmm7, xmm7, xmm4	; counter XOR round key 0
-        vaesenc	xmm7, xmm7, [rdi+16]
-        vaesenc	xmm7, xmm7, [rdi+32]
-        vaesenc	xmm7, xmm7, [rdi+48]
-        vaesenc	xmm7, xmm7, [rdi+64]
-        vaesenc	xmm7, xmm7, [rdi+80]
-        vaesenc	xmm7, xmm7, [rdi+96]
-        vaesenc	xmm7, xmm7, [rdi+112]
-        vaesenc	xmm7, xmm7, [rdi+128]
-        vaesenc	xmm7, xmm7, [rdi+144]
-        cmp	esi, 11
-        vmovdqu	xmm0, OWORD PTR [rdi+160]
-        jl	L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last	; same round-count dispatch as for H above
-        vaesenc	xmm7, xmm7, xmm0
-        vaesenc	xmm7, xmm7, [rdi+176]
-        cmp	esi, 13
-        vmovdqu	xmm0, OWORD PTR [rdi+192]
-        jl	L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last
-        vaesenc	xmm7, xmm7, xmm0
-        vaesenc	xmm7, xmm7, [rdi+208]
-        vmovdqu	xmm0, OWORD PTR [rdi+224]
-L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last:
-        vaesenclast	xmm7, xmm7, xmm0	; xmm7 = encrypted initial counter
-        jmp	L_AES_GCM_init_avx2_iv_done
-L_AES_GCM_init_avx2_iv_12:
-        ; # Calculate values when IV is 12 bytes
-        ; Set counter based on IV
-        vmovdqu	xmm4, OWORD PTR L_avx2_aes_gcm_bswap_one
-        vmovdqu	xmm5, OWORD PTR [rdi]	; xmm5 = round key 0 (start of the zero-block encryption that yields H)
-        vpblendd	xmm4, xmm4, [r10], 7	; mask 7: dwords 0-2 come from the IV, dword 3 stays from the constant. NOTE(review): the memory operand is 16 bytes wide although the IV is 12 - confirm the caller's buffer is readable that far
-        ; H = Encrypt X(=0) and T = Encrypt counter
-        vmovdqu	xmm6, OWORD PTR [rdi+16]
-        vpxor	xmm7, xmm4, xmm5	; xmm7 = counter XOR round key 0
-        vaesenc	xmm5, xmm5, xmm6	; both encryptions advance in lock-step so each round key is loaded once
-        vaesenc	xmm7, xmm7, xmm6
-        vmovdqu	xmm0, OWORD PTR [rdi+32]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm7, xmm7, xmm0
-        vmovdqu	xmm0, OWORD PTR [rdi+48]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm7, xmm7, xmm0
-        vmovdqu	xmm0, OWORD PTR [rdi+64]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm7, xmm7, xmm0
-        vmovdqu	xmm0, OWORD PTR [rdi+80]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm7, xmm7, xmm0
-        vmovdqu	xmm0, OWORD PTR [rdi+96]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm7, xmm7, xmm0
-        vmovdqu	xmm0, OWORD PTR [rdi+112]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm7, xmm7, xmm0
-        vmovdqu	xmm0, OWORD PTR [rdi+128]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm7, xmm7, xmm0
-        vmovdqu	xmm0, OWORD PTR [rdi+144]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm7, xmm7, xmm0
-        cmp	esi, 11
-        vmovdqu	xmm0, OWORD PTR [rdi+160]
-        jl	L_AES_GCM_init_avx2_calc_iv_12_last	; fewer than 11 rounds: [rdi+160] is the final round key
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm7, xmm7, xmm0
-        vmovdqu	xmm0, OWORD PTR [rdi+176]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm7, xmm7, xmm0
-        cmp	esi, 13
-        vmovdqu	xmm0, OWORD PTR [rdi+192]
-        jl	L_AES_GCM_init_avx2_calc_iv_12_last	; fewer than 13 rounds: [rdi+192] is the final round key
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm7, xmm7, xmm0
-        vmovdqu	xmm0, OWORD PTR [rdi+208]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm7, xmm7, xmm0
-        vmovdqu	xmm0, OWORD PTR [rdi+224]
-L_AES_GCM_init_avx2_calc_iv_12_last:
-        vaesenclast	xmm5, xmm5, xmm0
-        vaesenclast	xmm7, xmm7, xmm0
-        vpshufb	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_bswap_mask	; same shuffle of H as on the non-12-byte path
-L_AES_GCM_init_avx2_iv_done:
-        vmovdqu	OWORD PTR [r9], xmm7	; arg7 <- encrypted initial counter
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpaddd	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_one	; advance the counter past the block just encrypted
-        vmovdqu	OWORD PTR [rax], xmm5	; arg5 <- H
-        vmovdqu	OWORD PTR [r8], xmm4	; arg6 <- counter for the next block
-        vzeroupper
-        vmovdqu	xmm6, OWORD PTR [rsp+16]	; restore the registers saved in the prologue
-        vmovdqu	xmm7, OWORD PTR [rsp+32]
-        add	rsp, 48
-        pop	r12
-        pop	rsi
-        pop	rdi
-        pop	rbx
-        ret
-AES_GCM_init_avx2 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_aad_update_avx2 PROC	; Folds 16-byte blocks at rcx (edx = byte count) into the 16-byte hash state at r8, multiplying by the key loaded from r9
-        mov	rax, rcx	; rax = input pointer; rcx is reused as the byte offset
-        sub	rsp, 16
-        vmovdqu	OWORD PTR [rsp], xmm6	; xmm6 is used as a temporary below; save it
-        vmovdqu	xmm4, OWORD PTR [r8]	; xmm4 = current hash state
-        vmovdqu	xmm5, OWORD PTR [r9]	; xmm5 = hash key H
-        xor	ecx, ecx
-L_AES_GCM_aad_update_avx2_16_loop:
-        vmovdqu	xmm0, OWORD PTR [rax+rcx]	; the count is tested only at the bottom, so one block is always processed
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm4, xmm4, xmm0	; fold the block into the state
-        ; ghash_gfmul_avx: 128x128 carry-less multiply of xmm5 (H) by xmm4
-        vpclmulqdq	xmm2, xmm5, xmm4, 16	; cross product
-        vpclmulqdq	xmm1, xmm5, xmm4, 1	; other cross product
-        vpclmulqdq	xmm0, xmm5, xmm4, 0	; low x low
-        vpclmulqdq	xmm3, xmm5, xmm4, 17	; high x high
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm6, xmm0, xmm1	; xmm6 = low 128 bits of the product
-        vpxor	xmm4, xmm3, xmm2	; xmm4 = high 128 bits of the product
-        ; ghash_mid: shift the 256-bit value xmm4:xmm6 left by one bit
-        vpsrld	xmm0, xmm6, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm6, xmm6, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12	; carry out of the low half moves into the high half
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm6, xmm6, xmm0
-        vpor	xmm4, xmm4, xmm1
-        ; ghash_red: fold the low half (xmm6) into the high half (xmm4)
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm6, xmm2, 16
-        vpshufd	xmm1, xmm6, 78	; 78 = 0x4E: swap the two 64-bit halves
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm4, xmm4, xmm1	; xmm4 = updated hash state
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_aad_update_avx2_16_loop	; NOTE(review): signed compare on a byte count - confirm callers never pass 2^31 or more
-        vmovdqu	OWORD PTR [r8], xmm4	; write the state back in place
-        vzeroupper
-        vmovdqu	xmm6, OWORD PTR [rsp]
-        add	rsp, 16
-        ret
-AES_GCM_aad_update_avx2 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_encrypt_block_avx2 PROC	; Encrypts the counter block with the key schedule at rcx (edx = round count), XORs it with 16 bytes at r9, stores the result at r8 and writes the incremented counter back
-        mov	r10, r8	; r10 = output pointer
-        mov	r11, r9	; r11 = input pointer
-        mov	rax, QWORD PTR [rsp+40]	; rax = 5th argument (return address + 32-byte shadow space): pointer to the counter block
-        sub	rsp, 152	; NOTE(review): this reservation is never referenced inside the procedure
-        vmovdqu	xmm3, OWORD PTR [rax]
-        ; aesenc_block
-        vmovdqu	xmm1, xmm3
-        vpshufb	xmm0, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64	; xmm0 = shuffled copy of the counter that gets encrypted
-        vpaddd	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_one	; xmm1 = counter + 1 for the next call
-        vpxor	xmm0, xmm0, [rcx]	; XOR with round key 0
-        vmovdqu	xmm2, OWORD PTR [rcx+16]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rcx+32]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rcx+48]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rcx+64]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rcx+80]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rcx+96]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rcx+112]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rcx+128]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rcx+144]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm3, xmm1	; keep the incremented counter in xmm3; xmm1 is reused for round keys below
-        cmp	edx, 11
-        vmovdqu	xmm1, OWORD PTR [rcx+160]	; vmovdqu leaves the flags from cmp untouched
-        jl	L_AES_GCM_encrypt_block_avx2_aesenc_block_last	; fewer than 11 rounds: [rcx+160] is the final round key
-        vaesenc	xmm0, xmm0, xmm1
-        vmovdqu	xmm2, OWORD PTR [rcx+176]
-        vaesenc	xmm0, xmm0, xmm2
-        cmp	edx, 13
-        vmovdqu	xmm1, OWORD PTR [rcx+192]
-        jl	L_AES_GCM_encrypt_block_avx2_aesenc_block_last	; fewer than 13 rounds: [rcx+192] is the final round key
-        vaesenc	xmm0, xmm0, xmm1
-        vmovdqu	xmm2, OWORD PTR [rcx+208]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm1, OWORD PTR [rcx+224]	; otherwise [rcx+224] is the final round key
-L_AES_GCM_encrypt_block_avx2_aesenc_block_last:
-        vaesenclast	xmm0, xmm0, xmm1	; xmm0 = encrypted counter block
-        vmovdqu	xmm1, OWORD PTR [r11]
-        vpxor	xmm0, xmm0, xmm1	; input block XOR encrypted counter
-        vmovdqu	OWORD PTR [r10], xmm0
-        vmovdqu	OWORD PTR [rax], xmm3	; store the incremented counter back through the 5th argument
-        vzeroupper
-        add	rsp, 152
-        ret
-AES_GCM_encrypt_block_avx2 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_ghash_block_avx2 PROC	; Folds the single 16-byte block at rcx into the hash state at rdx, multiplying by the key loaded from r8
-        sub	rsp, 16
-        vmovdqu	OWORD PTR [rsp], xmm6	; xmm6 is used as a temporary below; save it
-        vmovdqu	xmm4, OWORD PTR [rdx]	; xmm4 = current hash state
-        vmovdqu	xmm5, OWORD PTR [r8]	; xmm5 = hash key H
-        vmovdqu	xmm0, OWORD PTR [rcx]	; xmm0 = data block
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm4, xmm4, xmm0	; fold the block into the state
-        ; ghash_gfmul_avx: 128x128 carry-less multiply of xmm5 (H) by xmm4
-        vpclmulqdq	xmm2, xmm5, xmm4, 16	; cross product
-        vpclmulqdq	xmm1, xmm5, xmm4, 1	; other cross product
-        vpclmulqdq	xmm0, xmm5, xmm4, 0	; low x low
-        vpclmulqdq	xmm3, xmm5, xmm4, 17	; high x high
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm6, xmm0, xmm1	; xmm6 = low 128 bits of the product
-        vpxor	xmm4, xmm3, xmm2	; xmm4 = high 128 bits of the product
-        ; ghash_mid: shift the 256-bit value xmm4:xmm6 left by one bit
-        vpsrld	xmm0, xmm6, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm6, xmm6, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12	; carry out of the low half moves into the high half
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm6, xmm6, xmm0
-        vpor	xmm4, xmm4, xmm1
-        ; ghash_red: fold the low half (xmm6) into the high half (xmm4)
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm6, xmm2, 16
-        vpshufd	xmm1, xmm6, 78	; 78 = 0x4E: swap the two 64-bit halves
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm4, xmm4, xmm1	; xmm4 = updated hash state
-        vmovdqu	OWORD PTR [rdx], xmm4	; write the state back in place
-        vzeroupper
-        vmovdqu	xmm6, OWORD PTR [rsp]
-        add	rsp, 16
-        ret
-AES_GCM_ghash_block_avx2 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_encrypt_update_avx2 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        mov	rax, rcx
-        mov	r10, r8
-        mov	r8d, edx
-        mov	r11, r9
-        mov	r9d, DWORD PTR [rsp+80]
-        mov	r12, QWORD PTR [rsp+88]
-        mov	r13, QWORD PTR [rsp+96]
-        mov	r14, QWORD PTR [rsp+104]
-        sub	rsp, 312
-        vmovdqu	OWORD PTR [rsp+152], xmm6
-        vmovdqu	OWORD PTR [rsp+168], xmm7
-        vmovdqu	OWORD PTR [rsp+184], xmm8
-        vmovdqu	OWORD PTR [rsp+200], xmm9
-        vmovdqu	OWORD PTR [rsp+216], xmm10
-        vmovdqu	OWORD PTR [rsp+232], xmm11
-        vmovdqu	OWORD PTR [rsp+248], xmm12
-        vmovdqu	OWORD PTR [rsp+264], xmm13
-        vmovdqu	OWORD PTR [rsp+280], xmm14
-        vmovdqu	OWORD PTR [rsp+296], xmm15
-        vmovdqu	xmm6, OWORD PTR [r12]
-        vmovdqu	xmm5, OWORD PTR [r13]
-        vmovdqu	xmm4, OWORD PTR [r14]
-        vpsrlq	xmm1, xmm5, 63
-        vpsllq	xmm0, xmm5, 1
-        vpslldq	xmm1, xmm1, 8
-        vpor	xmm0, xmm0, xmm1
-        vpshufd	xmm5, xmm5, 255
-        vpsrad	xmm5, xmm5, 31
-        vpand	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpxor	xmm5, xmm5, xmm0
-        xor	edi, edi
-        cmp	r9d, 128
-        mov	r15d, r9d
-        jl	L_AES_GCM_encrypt_update_avx2_done_128
-        and	r15d, 4294967168
-        vmovdqu	OWORD PTR [rsp+128], xmm4
-        vmovdqu	xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128
-        ; H ^ 1 and H ^ 2
-        vpclmulqdq	xmm9, xmm5, xmm5, 0
-        vpclmulqdq	xmm10, xmm5, xmm5, 17
-        vpclmulqdq	xmm8, xmm9, xmm3, 16
-        vpshufd	xmm9, xmm9, 78
-        vpxor	xmm9, xmm9, xmm8
-        vpclmulqdq	xmm8, xmm9, xmm3, 16
-        vpshufd	xmm9, xmm9, 78
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm0, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp], xmm5
-        vmovdqu	OWORD PTR [rsp+16], xmm0
-        ; H ^ 3 and H ^ 4
-        vpclmulqdq	xmm11, xmm0, xmm5, 16
-        vpclmulqdq	xmm10, xmm0, xmm5, 1
-        vpclmulqdq	xmm9, xmm0, xmm5, 0
-        vpclmulqdq	xmm12, xmm0, xmm5, 17
-        vpclmulqdq	xmm13, xmm0, xmm0, 0
-        vpclmulqdq	xmm14, xmm0, xmm0, 17
-        vpxor	xmm11, xmm11, xmm10
-        vpslldq	xmm10, xmm11, 8
-        vpsrldq	xmm11, xmm11, 8
-        vpxor	xmm10, xmm10, xmm9
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm10, xmm10, xmm9
-        vpxor	xmm13, xmm13, xmm8
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm12, xmm12, xmm11
-        vpxor	xmm13, xmm13, xmm8
-        vpxor	xmm10, xmm10, xmm12
-        vpxor	xmm2, xmm13, xmm14
-        vpxor	xmm1, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp+32], xmm1
-        vmovdqu	OWORD PTR [rsp+48], xmm2
-        ; H ^ 5 and H ^ 6
-        vpclmulqdq	xmm11, xmm1, xmm0, 16
-        vpclmulqdq	xmm10, xmm1, xmm0, 1
-        vpclmulqdq	xmm9, xmm1, xmm0, 0
-        vpclmulqdq	xmm12, xmm1, xmm0, 17
-        vpclmulqdq	xmm13, xmm1, xmm1, 0
-        vpclmulqdq	xmm14, xmm1, xmm1, 17
-        vpxor	xmm11, xmm11, xmm10
-        vpslldq	xmm10, xmm11, 8
-        vpsrldq	xmm11, xmm11, 8
-        vpxor	xmm10, xmm10, xmm9
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm10, xmm10, xmm9
-        vpxor	xmm13, xmm13, xmm8
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm12, xmm12, xmm11
-        vpxor	xmm13, xmm13, xmm8
-        vpxor	xmm10, xmm10, xmm12
-        vpxor	xmm0, xmm13, xmm14
-        vpxor	xmm7, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp+64], xmm7
-        vmovdqu	OWORD PTR [rsp+80], xmm0
-        ; H ^ 7 and H ^ 8
-        vpclmulqdq	xmm11, xmm2, xmm1, 16
-        vpclmulqdq	xmm10, xmm2, xmm1, 1
-        vpclmulqdq	xmm9, xmm2, xmm1, 0
-        vpclmulqdq	xmm12, xmm2, xmm1, 17
-        vpclmulqdq	xmm13, xmm2, xmm2, 0
-        vpclmulqdq	xmm14, xmm2, xmm2, 17
-        vpxor	xmm11, xmm11, xmm10
-        vpslldq	xmm10, xmm11, 8
-        vpsrldq	xmm11, xmm11, 8
-        vpxor	xmm10, xmm10, xmm9
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm10, xmm10, xmm9
-        vpxor	xmm13, xmm13, xmm8
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm12, xmm12, xmm11
-        vpxor	xmm13, xmm13, xmm8
-        vpxor	xmm10, xmm10, xmm12
-        vpxor	xmm0, xmm13, xmm14
-        vpxor	xmm7, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp+96], xmm7
-        vmovdqu	OWORD PTR [rsp+112], xmm0
-        ; First 128 bytes of input
-        ; aesenc_128
-        ; aesenc_ctr
-        vmovdqu	xmm0, OWORD PTR [rsp+128]
-        vmovdqu	xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpaddd	xmm9, xmm0, OWORD PTR L_avx2_aes_gcm_one
-        vpshufb	xmm8, xmm0, xmm1
-        vpaddd	xmm10, xmm0, OWORD PTR L_avx2_aes_gcm_two
-        vpshufb	xmm9, xmm9, xmm1
-        vpaddd	xmm11, xmm0, OWORD PTR L_avx2_aes_gcm_three
-        vpshufb	xmm10, xmm10, xmm1
-        vpaddd	xmm12, xmm0, OWORD PTR L_avx2_aes_gcm_four
-        vpshufb	xmm11, xmm11, xmm1
-        vpaddd	xmm13, xmm0, OWORD PTR L_avx2_aes_gcm_five
-        vpshufb	xmm12, xmm12, xmm1
-        vpaddd	xmm14, xmm0, OWORD PTR L_avx2_aes_gcm_six
-        vpshufb	xmm13, xmm13, xmm1
-        vpaddd	xmm15, xmm0, OWORD PTR L_avx2_aes_gcm_seven
-        vpshufb	xmm14, xmm14, xmm1
-        vpaddd	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_eight
-        vpshufb	xmm15, xmm15, xmm1
-        ; aesenc_xor
-        vmovdqu	xmm7, OWORD PTR [rax]
-        vmovdqu	OWORD PTR [rsp+128], xmm0
-        vpxor	xmm8, xmm8, xmm7
-        vpxor	xmm9, xmm9, xmm7
-        vpxor	xmm10, xmm10, xmm7
-        vpxor	xmm11, xmm11, xmm7
-        vpxor	xmm12, xmm12, xmm7
-        vpxor	xmm13, xmm13, xmm7
-        vpxor	xmm14, xmm14, xmm7
-        vpxor	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+16]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+32]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+48]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+64]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+80]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+96]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+112]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+128]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+144]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r8d, 11
-        vmovdqu	xmm7, OWORD PTR [rax+160]
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_128_enc_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+176]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r8d, 13
-        vmovdqu	xmm7, OWORD PTR [rax+192]
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_128_enc_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+208]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+224]
-L_AES_GCM_encrypt_update_avx2_aesenc_128_enc_done:
-        ; aesenc_last
-        vaesenclast	xmm8, xmm8, xmm7
-        vaesenclast	xmm9, xmm9, xmm7
-        vaesenclast	xmm10, xmm10, xmm7
-        vaesenclast	xmm11, xmm11, xmm7
-        vmovdqu	xmm0, OWORD PTR [r11]
-        vmovdqu	xmm1, OWORD PTR [r11+16]
-        vmovdqu	xmm2, OWORD PTR [r11+32]
-        vmovdqu	xmm3, OWORD PTR [r11+48]
-        vpxor	xmm8, xmm8, xmm0
-        vpxor	xmm9, xmm9, xmm1
-        vpxor	xmm10, xmm10, xmm2
-        vpxor	xmm11, xmm11, xmm3
-        vmovdqu	OWORD PTR [r10], xmm8
-        vmovdqu	OWORD PTR [r10+16], xmm9
-        vmovdqu	OWORD PTR [r10+32], xmm10
-        vmovdqu	OWORD PTR [r10+48], xmm11
-        vaesenclast	xmm12, xmm12, xmm7
-        vaesenclast	xmm13, xmm13, xmm7
-        vaesenclast	xmm14, xmm14, xmm7
-        vaesenclast	xmm15, xmm15, xmm7
-        vmovdqu	xmm0, OWORD PTR [r11+64]
-        vmovdqu	xmm1, OWORD PTR [r11+80]
-        vmovdqu	xmm2, OWORD PTR [r11+96]
-        vmovdqu	xmm3, OWORD PTR [r11+112]
-        vpxor	xmm12, xmm12, xmm0
-        vpxor	xmm13, xmm13, xmm1
-        vpxor	xmm14, xmm14, xmm2
-        vpxor	xmm15, xmm15, xmm3
-        vmovdqu	OWORD PTR [r10+64], xmm12
-        vmovdqu	OWORD PTR [r10+80], xmm13
-        vmovdqu	OWORD PTR [r10+96], xmm14
-        vmovdqu	OWORD PTR [r10+112], xmm15
-        cmp	r15d, 128
-        mov	edi, 128
-        jle	L_AES_GCM_encrypt_update_avx2_end_128
-        ; More 128 bytes of input
-L_AES_GCM_encrypt_update_avx2_ghash_128:
-        ; aesenc_128_ghash
-        lea	rcx, QWORD PTR [r11+rdi]
-        lea	rdx, QWORD PTR [r10+rdi]
-        ; aesenc_ctr
-        vmovdqu	xmm0, OWORD PTR [rsp+128]
-        vmovdqu	xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpaddd	xmm9, xmm0, OWORD PTR L_avx2_aes_gcm_one
-        vpshufb	xmm8, xmm0, xmm1
-        vpaddd	xmm10, xmm0, OWORD PTR L_avx2_aes_gcm_two
-        vpshufb	xmm9, xmm9, xmm1
-        vpaddd	xmm11, xmm0, OWORD PTR L_avx2_aes_gcm_three
-        vpshufb	xmm10, xmm10, xmm1
-        vpaddd	xmm12, xmm0, OWORD PTR L_avx2_aes_gcm_four
-        vpshufb	xmm11, xmm11, xmm1
-        vpaddd	xmm13, xmm0, OWORD PTR L_avx2_aes_gcm_five
-        vpshufb	xmm12, xmm12, xmm1
-        vpaddd	xmm14, xmm0, OWORD PTR L_avx2_aes_gcm_six
-        vpshufb	xmm13, xmm13, xmm1
-        vpaddd	xmm15, xmm0, OWORD PTR L_avx2_aes_gcm_seven
-        vpshufb	xmm14, xmm14, xmm1
-        vpaddd	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_eight
-        vpshufb	xmm15, xmm15, xmm1
-        ; aesenc_xor
-        vmovdqu	xmm7, OWORD PTR [rax]
-        vmovdqu	OWORD PTR [rsp+128], xmm0
-        vpxor	xmm8, xmm8, xmm7
-        vpxor	xmm9, xmm9, xmm7
-        vpxor	xmm10, xmm10, xmm7
-        vpxor	xmm11, xmm11, xmm7
-        vpxor	xmm12, xmm12, xmm7
-        vpxor	xmm13, xmm13, xmm7
-        vpxor	xmm14, xmm14, xmm7
-        vpxor	xmm15, xmm15, xmm7
-        ; aesenc_pclmul_1
-        vmovdqu	xmm1, OWORD PTR [rdx+-128]
-        vmovdqu	xmm0, OWORD PTR [rax+16]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vmovdqu	xmm2, OWORD PTR [rsp+112]
-        vpxor	xmm1, xmm1, xmm6
-        vpclmulqdq	xmm5, xmm1, xmm2, 16
-        vpclmulqdq	xmm3, xmm1, xmm2, 1
-        vpclmulqdq	xmm6, xmm1, xmm2, 0
-        vpclmulqdq	xmm7, xmm1, xmm2, 17
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_2
-        vmovdqu	xmm1, OWORD PTR [rdx+-112]
-        vmovdqu	xmm0, OWORD PTR [rsp+96]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rax+32]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rdx+-96]
-        vmovdqu	xmm0, OWORD PTR [rsp+80]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rax+48]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rdx+-80]
-        vmovdqu	xmm0, OWORD PTR [rsp+64]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rax+64]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rdx+-64]
-        vmovdqu	xmm0, OWORD PTR [rsp+48]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rax+80]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rdx+-48]
-        vmovdqu	xmm0, OWORD PTR [rsp+32]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rax+96]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rdx+-32]
-        vmovdqu	xmm0, OWORD PTR [rsp+16]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rax+112]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rdx+-16]
-        vmovdqu	xmm0, OWORD PTR [rsp]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rax+128]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_l
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm6, xmm6, xmm4
-        vpxor	xmm5, xmm5, xmm3
-        vpslldq	xmm1, xmm5, 8
-        vpsrldq	xmm5, xmm5, 8
-        vmovdqu	xmm4, OWORD PTR [rax+144]
-        vmovdqu	xmm0, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vaesenc	xmm8, xmm8, xmm4
-        vpxor	xmm6, xmm6, xmm1
-        vpxor	xmm7, xmm7, xmm5
-        vpclmulqdq	xmm3, xmm6, xmm0, 16
-        vaesenc	xmm9, xmm9, xmm4
-        vaesenc	xmm10, xmm10, xmm4
-        vaesenc	xmm11, xmm11, xmm4
-        vpshufd	xmm6, xmm6, 78
-        vpxor	xmm6, xmm6, xmm3
-        vpclmulqdq	xmm3, xmm6, xmm0, 16
-        vaesenc	xmm12, xmm12, xmm4
-        vaesenc	xmm13, xmm13, xmm4
-        vaesenc	xmm14, xmm14, xmm4
-        vpshufd	xmm6, xmm6, 78
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm6, xmm6, xmm7
-        vaesenc	xmm15, xmm15, xmm4
-        cmp	r8d, 11
-        vmovdqu	xmm7, OWORD PTR [rax+160]
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+176]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r8d, 13
-        vmovdqu	xmm7, OWORD PTR [rax+192]
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+208]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+224]
-L_AES_GCM_encrypt_update_avx2_aesenc_128_ghash_avx_done:
-        ; aesenc_last
-        vaesenclast	xmm8, xmm8, xmm7
-        vaesenclast	xmm9, xmm9, xmm7
-        vaesenclast	xmm10, xmm10, xmm7
-        vaesenclast	xmm11, xmm11, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx]
-        vmovdqu	xmm1, OWORD PTR [rcx+16]
-        vmovdqu	xmm2, OWORD PTR [rcx+32]
-        vmovdqu	xmm3, OWORD PTR [rcx+48]
-        vpxor	xmm8, xmm8, xmm0
-        vpxor	xmm9, xmm9, xmm1
-        vpxor	xmm10, xmm10, xmm2
-        vpxor	xmm11, xmm11, xmm3
-        vmovdqu	OWORD PTR [rdx], xmm8
-        vmovdqu	OWORD PTR [rdx+16], xmm9
-        vmovdqu	OWORD PTR [rdx+32], xmm10
-        vmovdqu	OWORD PTR [rdx+48], xmm11
-        vaesenclast	xmm12, xmm12, xmm7
-        vaesenclast	xmm13, xmm13, xmm7
-        vaesenclast	xmm14, xmm14, xmm7
-        vaesenclast	xmm15, xmm15, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+64]
-        vmovdqu	xmm1, OWORD PTR [rcx+80]
-        vmovdqu	xmm2, OWORD PTR [rcx+96]
-        vmovdqu	xmm3, OWORD PTR [rcx+112]
-        vpxor	xmm12, xmm12, xmm0
-        vpxor	xmm13, xmm13, xmm1
-        vpxor	xmm14, xmm14, xmm2
-        vpxor	xmm15, xmm15, xmm3
-        vmovdqu	OWORD PTR [rdx+64], xmm12
-        vmovdqu	OWORD PTR [rdx+80], xmm13
-        vmovdqu	OWORD PTR [rdx+96], xmm14
-        vmovdqu	OWORD PTR [rdx+112], xmm15
-        ; aesenc_128_ghash - end
-        add	edi, 128
-        cmp	edi, r15d
-        jl	L_AES_GCM_encrypt_update_avx2_ghash_128
-L_AES_GCM_encrypt_update_avx2_end_128:
-        vmovdqu	xmm4, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpshufb	xmm8, xmm8, xmm4
-        vpshufb	xmm9, xmm9, xmm4
-        vpshufb	xmm10, xmm10, xmm4
-        vpshufb	xmm11, xmm11, xmm4
-        vpshufb	xmm12, xmm12, xmm4
-        vpshufb	xmm13, xmm13, xmm4
-        vpshufb	xmm14, xmm14, xmm4
-        vpshufb	xmm15, xmm15, xmm4
-        vpxor	xmm8, xmm8, xmm6
-        vmovdqu	xmm7, OWORD PTR [rsp]
-        vpclmulqdq	xmm5, xmm7, xmm15, 16
-        vpclmulqdq	xmm1, xmm7, xmm15, 1
-        vpclmulqdq	xmm4, xmm7, xmm15, 0
-        vpclmulqdq	xmm6, xmm7, xmm15, 17
-        vpxor	xmm5, xmm5, xmm1
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vpclmulqdq	xmm2, xmm7, xmm14, 16
-        vpclmulqdq	xmm1, xmm7, xmm14, 1
-        vpclmulqdq	xmm0, xmm7, xmm14, 0
-        vpclmulqdq	xmm3, xmm7, xmm14, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm4, xmm4, xmm0
-        vmovdqu	xmm15, OWORD PTR [rsp+32]
-        vmovdqu	xmm7, OWORD PTR [rsp+48]
-        vpclmulqdq	xmm2, xmm15, xmm13, 16
-        vpclmulqdq	xmm1, xmm15, xmm13, 1
-        vpclmulqdq	xmm0, xmm15, xmm13, 0
-        vpclmulqdq	xmm3, xmm15, xmm13, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm4, xmm4, xmm0
-        vpclmulqdq	xmm2, xmm7, xmm12, 16
-        vpclmulqdq	xmm1, xmm7, xmm12, 1
-        vpclmulqdq	xmm0, xmm7, xmm12, 0
-        vpclmulqdq	xmm3, xmm7, xmm12, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm4, xmm4, xmm0
-        vmovdqu	xmm15, OWORD PTR [rsp+64]
-        vmovdqu	xmm7, OWORD PTR [rsp+80]
-        vpclmulqdq	xmm2, xmm15, xmm11, 16
-        vpclmulqdq	xmm1, xmm15, xmm11, 1
-        vpclmulqdq	xmm0, xmm15, xmm11, 0
-        vpclmulqdq	xmm3, xmm15, xmm11, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm4, xmm4, xmm0
-        vpclmulqdq	xmm2, xmm7, xmm10, 16
-        vpclmulqdq	xmm1, xmm7, xmm10, 1
-        vpclmulqdq	xmm0, xmm7, xmm10, 0
-        vpclmulqdq	xmm3, xmm7, xmm10, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm4, xmm4, xmm0
-        vmovdqu	xmm15, OWORD PTR [rsp+96]
-        vmovdqu	xmm7, OWORD PTR [rsp+112]
-        vpclmulqdq	xmm2, xmm15, xmm9, 16
-        vpclmulqdq	xmm1, xmm15, xmm9, 1
-        vpclmulqdq	xmm0, xmm15, xmm9, 0
-        vpclmulqdq	xmm3, xmm15, xmm9, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm4, xmm4, xmm0
-        vpclmulqdq	xmm2, xmm7, xmm8, 16
-        vpclmulqdq	xmm1, xmm7, xmm8, 1
-        vpclmulqdq	xmm0, xmm7, xmm8, 0
-        vpclmulqdq	xmm3, xmm7, xmm8, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm4, xmm4, xmm0
-        vpslldq	xmm7, xmm5, 8
-        vpsrldq	xmm5, xmm5, 8
-        vpxor	xmm4, xmm4, xmm7
-        vpxor	xmm6, xmm6, xmm5
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm4, xmm2, 16
-        vpshufd	xmm1, xmm4, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm6, xmm6, xmm1
-        vmovdqu	xmm5, OWORD PTR [rsp]
-        vmovdqu	xmm4, OWORD PTR [rsp+128]
-L_AES_GCM_encrypt_update_avx2_done_128:
-        cmp	edi, r9d
-        je	L_AES_GCM_encrypt_update_avx2_done_enc
-        mov	r15d, r9d
-        and	r15d, 4294967280
-        cmp	edi, r15d
-        jge	L_AES_GCM_encrypt_update_avx2_last_block_done
-        ; aesenc_block
-        vmovdqu	xmm1, xmm4
-        vpshufb	xmm0, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpaddd	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_one
-        vpxor	xmm0, xmm0, [rax]
-        vmovdqu	xmm2, OWORD PTR [rax+16]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rax+32]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rax+48]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rax+64]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rax+80]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rax+96]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rax+112]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rax+128]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rax+144]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm4, xmm1
-        cmp	r8d, 11
-        vmovdqu	xmm1, OWORD PTR [rax+160]
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_block_last
-        vaesenc	xmm0, xmm0, xmm1
-        vmovdqu	xmm2, OWORD PTR [rax+176]
-        vaesenc	xmm0, xmm0, xmm2
-        cmp	r8d, 13
-        vmovdqu	xmm1, OWORD PTR [rax+192]
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_block_last
-        vaesenc	xmm0, xmm0, xmm1
-        vmovdqu	xmm2, OWORD PTR [rax+208]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm1, OWORD PTR [rax+224]
-L_AES_GCM_encrypt_update_avx2_aesenc_block_last:
-        vaesenclast	xmm0, xmm0, xmm1
-        vmovdqu	xmm1, OWORD PTR [r11+rdi]
-        vpxor	xmm0, xmm0, xmm1
-        vmovdqu	OWORD PTR [r10+rdi], xmm0
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm0
-        add	edi, 16
-        cmp	edi, r15d
-        jge	L_AES_GCM_encrypt_update_avx2_last_block_ghash
-L_AES_GCM_encrypt_update_avx2_last_block_start:
-        vmovdqu	xmm12, OWORD PTR [r11+rdi]
-        vpshufb	xmm11, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpaddd	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_one
-        ; aesenc_gfmul_sb
-        vpclmulqdq	xmm2, xmm6, xmm5, 1
-        vpclmulqdq	xmm3, xmm6, xmm5, 16
-        vpclmulqdq	xmm1, xmm6, xmm5, 0
-        vpclmulqdq	xmm8, xmm6, xmm5, 17
-        vpxor	xmm11, xmm11, [rax]
-        vaesenc	xmm11, xmm11, [rax+16]
-        vpxor	xmm3, xmm3, xmm2
-        vpslldq	xmm2, xmm3, 8
-        vpsrldq	xmm3, xmm3, 8
-        vaesenc	xmm11, xmm11, [rax+32]
-        vpxor	xmm2, xmm2, xmm1
-        vpclmulqdq	xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vaesenc	xmm11, xmm11, [rax+48]
-        vaesenc	xmm11, xmm11, [rax+64]
-        vaesenc	xmm11, xmm11, [rax+80]
-        vpshufd	xmm2, xmm2, 78
-        vpxor	xmm2, xmm2, xmm1
-        vpclmulqdq	xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vaesenc	xmm11, xmm11, [rax+96]
-        vaesenc	xmm11, xmm11, [rax+112]
-        vaesenc	xmm11, xmm11, [rax+128]
-        vpshufd	xmm2, xmm2, 78
-        vaesenc	xmm11, xmm11, [rax+144]
-        vpxor	xmm8, xmm8, xmm3
-        vpxor	xmm2, xmm2, xmm8
-        vmovdqu	xmm0, OWORD PTR [rax+160]
-        cmp	r8d, 11
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm11, xmm11, [rax+176]
-        vmovdqu	xmm0, OWORD PTR [rax+192]
-        cmp	r8d, 13
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm11, xmm11, [rax+208]
-        vmovdqu	xmm0, OWORD PTR [rax+224]
-L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last:
-        vaesenclast	xmm11, xmm11, xmm0
-        vpxor	xmm6, xmm2, xmm1
-        vpxor	xmm11, xmm11, xmm12
-        vmovdqu	OWORD PTR [r10+rdi], xmm11
-        vpshufb	xmm11, xmm11, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm11
-        add	edi, 16
-        cmp	edi, r15d
-        jl	L_AES_GCM_encrypt_update_avx2_last_block_start
-L_AES_GCM_encrypt_update_avx2_last_block_ghash:
-        ; ghash_gfmul_red
-        vpclmulqdq	xmm10, xmm6, xmm5, 16
-        vpclmulqdq	xmm9, xmm6, xmm5, 1
-        vpclmulqdq	xmm8, xmm6, xmm5, 0
-        vpxor	xmm10, xmm10, xmm9
-        vpslldq	xmm9, xmm10, 8
-        vpsrldq	xmm10, xmm10, 8
-        vpxor	xmm9, xmm9, xmm8
-        vpclmulqdq	xmm6, xmm6, xmm5, 17
-        vpclmulqdq	xmm8, xmm9, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm9, xmm9, 78
-        vpxor	xmm9, xmm9, xmm8
-        vpclmulqdq	xmm8, xmm9, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm9, xmm9, 78
-        vpxor	xmm6, xmm6, xmm10
-        vpxor	xmm6, xmm6, xmm9
-        vpxor	xmm6, xmm6, xmm8
-L_AES_GCM_encrypt_update_avx2_last_block_done:
-L_AES_GCM_encrypt_update_avx2_done_enc:
-        vmovdqu	OWORD PTR [r12], xmm6
-        vmovdqu	OWORD PTR [r14], xmm4
-        vzeroupper
-        vmovdqu	xmm6, OWORD PTR [rsp+152]
-        vmovdqu	xmm7, OWORD PTR [rsp+168]
-        vmovdqu	xmm8, OWORD PTR [rsp+184]
-        vmovdqu	xmm9, OWORD PTR [rsp+200]
-        vmovdqu	xmm10, OWORD PTR [rsp+216]
-        vmovdqu	xmm11, OWORD PTR [rsp+232]
-        vmovdqu	xmm12, OWORD PTR [rsp+248]
-        vmovdqu	xmm13, OWORD PTR [rsp+264]
-        vmovdqu	xmm14, OWORD PTR [rsp+280]
-        vmovdqu	xmm15, OWORD PTR [rsp+296]
-        add	rsp, 312
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-AES_GCM_encrypt_update_avx2 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_encrypt_final_avx2 PROC ; Tag finalisation: XOR a length block into the 16 bytes at [rcx], carry-less multiply by the key derived from arg 6, XOR with the 16 bytes at arg 7, store r8d bytes at [rdx]
-        push	r12 ; preserved here; used below as the byte index of the partial-tag copy loop
-        push	r13 ; preserved here; used below as the byte temporary of the copy loop
-        mov	eax, DWORD PTR [rsp+56] ; 5th argument, read as 32 bits (zero-extends into rax); 56 = 2 pushes + return address + 32 bytes, which matches the Microsoft x64 stack layout
-        mov	r10, QWORD PTR [rsp+64] ; 6th argument: pointer to the 16 bytes loaded into xmm5 below
-        mov	r11, QWORD PTR [rsp+72] ; 7th argument: pointer to the 16 bytes XORed into the result at the end
-        sub	rsp, 48 ; frame: [rsp] = 16-byte scratch for the tag, [rsp+16] and [rsp+32] = xmm6/xmm7 save slots
-        vmovdqu	OWORD PTR [rsp+16], xmm6 ; save xmm6 (restored before ret)
-        vmovdqu	OWORD PTR [rsp+32], xmm7 ; save xmm7 (restored before ret)
-        vmovdqu	xmm4, OWORD PTR [rcx] ; xmm4 = 16-byte value at [rcx] (1st argument); the length block is XORed into it below
-        vmovdqu	xmm5, OWORD PTR [r10] ; xmm5 = 16 bytes from the 6th argument
-        vmovdqu	xmm6, OWORD PTR [r11] ; xmm6 = 16 bytes from the 7th argument
-        vpsrlq	xmm1, xmm5, 63 ; start of an 8-instruction sequence: xmm5 = (xmm5 << 1 as a 128-bit value) XOR (mod2_128 constant if bit 127 was set); the same sequence is labelled "Calculate H" elsewhere in this file
-        vpsllq	xmm0, xmm5, 1 ; shift each qword left by one
-        vpslldq	xmm1, xmm1, 8 ; move the low qword's carried-out bit up to bit 64 (the high qword's carry is dropped)
-        vpor	xmm0, xmm0, xmm1 ; xmm0 = 128-bit left shift by one
-        vpshufd	xmm5, xmm5, 255 ; broadcast the top dword to all four lanes
-        vpsrad	xmm5, xmm5, 31 ; all-ones if bit 127 of the original value was set, else zero
-        vpand	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_mod2_128 ; select the constant or zero
-        vpxor	xmm5, xmm5, xmm0 ; xmm5 = shifted value with the conditional constant folded in
-        ; calc_tag: build the block {r9*8 in the low qword, rax*8 in the high qword} and XOR it into xmm4
-        shl	r9, 3 ; multiply by 8 (presumably bytes -> bits). NOTE(review): all 64 bits of r9 are used while the 5th argument is read as a DWORD; if the 4th argument is a 32-bit value its upper half in r9 is not guaranteed to be zero - confirm against the C prototype
-        shl	rax, 3 ; multiply the 5th argument by 8
-        vmovq	xmm0, r9 ; low qword = r9*8
-        vmovq	xmm1, rax ; low qword = rax*8
-        vpunpcklqdq	xmm0, xmm0, xmm1 ; xmm0 = {low: r9*8, high: rax*8}
-        vpxor	xmm0, xmm0, xmm4 ; XOR the length block with the value loaded from [rcx]
-        ; ghash_gfmul_red: carry-less multiply xmm0 by xmm5 (four 64x64 partial products), then reduce using the mod2_128 constant
-        vpclmulqdq	xmm7, xmm0, xmm5, 16 ; xmm0.lo * xmm5.hi
-        vpclmulqdq	xmm3, xmm0, xmm5, 1 ; xmm0.hi * xmm5.lo
-        vpclmulqdq	xmm2, xmm0, xmm5, 0 ; xmm0.lo * xmm5.lo
-        vpxor	xmm7, xmm7, xmm3 ; sum of the two cross products
-        vpslldq	xmm3, xmm7, 8 ; low half of the cross sum, moved to the upper qword
-        vpsrldq	xmm7, xmm7, 8 ; high half of the cross sum, moved to the lower qword
-        vpxor	xmm3, xmm3, xmm2 ; xmm3 = low 128 bits of the 256-bit product
-        vpclmulqdq	xmm0, xmm0, xmm5, 17 ; xmm0.hi * xmm5.hi
-        vpclmulqdq	xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16 ; first reduction fold: xmm3.lo * upper qword of the constant
-        vpshufd	xmm3, xmm3, 78 ; swap the two qwords
-        vpxor	xmm3, xmm3, xmm2 ; apply the first fold
-        vpclmulqdq	xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16 ; second reduction fold
-        vpshufd	xmm3, xmm3, 78 ; swap the two qwords again
-        vpxor	xmm0, xmm0, xmm7 ; high product XOR high half of the cross sum
-        vpxor	xmm0, xmm0, xmm3 ; fold in the reduced low half ...
-        vpxor	xmm0, xmm0, xmm2 ; ... and the second fold product: xmm0 = reduced 128-bit result
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask ; byte shuffle with the bswap mask constant (defined outside this view; presumably reverses the 16 bytes)
-        vpxor	xmm0, xmm0, xmm6 ; XOR with the 16 bytes from the 7th argument -> tag in xmm0
-        ; store_tag: write r8d (3rd argument) bytes of xmm0 to [rdx] (2nd argument)
-        cmp	r8d, 16
-        je	L_AES_GCM_encrypt_final_avx2_store_tag_16 ; exactly 16 bytes: one unaligned 16-byte store
-        xor	r12, r12 ; byte index = 0
-        vmovdqu	OWORD PTR [rsp], xmm0 ; spill the tag to the scratch slot so it can be read byte by byte
-L_AES_GCM_encrypt_final_avx2_store_tag_loop: ; NOTE(review): the body runs before the count is tested and the scratch slot is 16 bytes, so r8d == 0 or r8d > 16 would copy bytes beyond the tag - confirm callers never pass those
-        movzx	r13d, BYTE PTR [rsp+r12] ; load one tag byte from the scratch slot
-        mov	BYTE PTR [rdx+r12], r13b ; store it to the output buffer
-        inc	r12d ; next byte
-        cmp	r12d, r8d
-        jne	L_AES_GCM_encrypt_final_avx2_store_tag_loop ; loop until r8d bytes have been copied
-        jmp	L_AES_GCM_encrypt_final_avx2_store_tag_done
-L_AES_GCM_encrypt_final_avx2_store_tag_16:
-        vmovdqu	OWORD PTR [rdx], xmm0 ; full 16-byte tag
-L_AES_GCM_encrypt_final_avx2_store_tag_done:
-        vzeroupper ; clear the upper YMM halves before returning
-        vmovdqu	xmm6, OWORD PTR [rsp+16] ; restore xmm6
-        vmovdqu	xmm7, OWORD PTR [rsp+32] ; restore xmm7
-        add	rsp, 48 ; release the frame
-        pop	r13
-        pop	r12
-        ret
-AES_GCM_encrypt_final_avx2 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_decrypt_update_avx2 PROC	; AES-GCM decrypt of whole 16-byte blocks with running GHASH; Microsoft x64 register arguments
-        push	r13	; r13, r12, r14, r15 and rdi are callee-saved under the Microsoft x64 ABI
-        push	r12
-        push	r14
-        push	r15
-        push	rdi
-        mov	rax, rcx	; rax = arg1: base of the AES round keys (read at [rax+16*i])
-        mov	r10, r8	; r10 = arg3: output buffer (plaintext is stored at [r10+rdi])
-        mov	r8d, edx	; r8d = arg2: round count, compared against 11 and 13 below
-        mov	r11, r9	; r11 = arg4: input buffer (ciphertext is loaded from [r11+rdi])
-        mov	r9d, DWORD PTR [rsp+80]	; r9d = arg5: byte count (5 pushes + return address + 32-byte shadow space = 80)
-        mov	r12, QWORD PTR [rsp+88]	; r12 = arg6: pointer to the 16-byte GHASH state (loaded here, written back at exit)
-        mov	r14, QWORD PTR [rsp+96]	; r14 = arg7: pointer to the 16-byte hash key H
-        mov	r15, QWORD PTR [rsp+104]	; r15 = arg8: pointer to the 16-byte counter block (written back at exit)
-        sub	rsp, 328	; frame: [0,128) H^1..H^8, [128] counter, [144] xmm15 spill, [168,328) saved xmm6-xmm15
-        vmovdqu	OWORD PTR [rsp+168], xmm6	; xmm6-xmm15 are callee-saved under the Microsoft x64 ABI
-        vmovdqu	OWORD PTR [rsp+184], xmm7
-        vmovdqu	OWORD PTR [rsp+200], xmm8
-        vmovdqu	OWORD PTR [rsp+216], xmm9
-        vmovdqu	OWORD PTR [rsp+232], xmm10
-        vmovdqu	OWORD PTR [rsp+248], xmm11
-        vmovdqu	OWORD PTR [rsp+264], xmm12
-        vmovdqu	OWORD PTR [rsp+280], xmm13
-        vmovdqu	OWORD PTR [rsp+296], xmm14
-        vmovdqu	OWORD PTR [rsp+312], xmm15
-        vmovdqu	xmm6, OWORD PTR [r12]	; xmm6 = running GHASH state
-        vmovdqu	xmm5, OWORD PTR [r14]	; xmm5 = H as supplied by the caller
-        vmovdqu	xmm4, OWORD PTR [r15]	; xmm4 = current counter block
-        ; Calculate H: shift the 128-bit value left by one bit and, when the top bit was set, XOR in L_avx2_aes_gcm_mod2_128
-        vpsrlq	xmm1, xmm5, 63
-        vpsllq	xmm0, xmm5, 1
-        vpslldq	xmm1, xmm1, 8	; carry bit 63 of the low qword into bit 0 of the high qword
-        vpor	xmm0, xmm0, xmm1
-        vpshufd	xmm5, xmm5, 255	; broadcast the top dword
-        vpsrad	xmm5, xmm5, 31	; all-ones mask if the top bit of H was set, else zero
-        vpand	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpxor	xmm5, xmm5, xmm0
-        xor	edi, edi	; edi = byte offset into the input and output buffers
-        cmp	r9d, 128
-        mov	r13d, r9d
-        jl	L_AES_GCM_decrypt_update_avx2_done_128	; NOTE(review): signed compare, so a byte count of 2^31 or more is treated as negative
-        and	r13d, 4294967168	; r13d = byte count rounded down to a multiple of 128 (mask 0FFFFFF80h)
-        vmovdqu	OWORD PTR [rsp+128], xmm4	; the counter lives in memory during the 8-block loop
-        vmovdqu	OWORD PTR [rsp+144], xmm15	; xmm15 is reused as the eighth AES state; reloaded after the loop
-        vmovdqu	xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128	; xmm3 = reduction constant used for the H powers below
-        ; H ^ 1 and H ^ 2
-        vpclmulqdq	xmm9, xmm5, xmm5, 0	; low qword squared
-        vpclmulqdq	xmm10, xmm5, xmm5, 17	; high qword squared
-        vpclmulqdq	xmm8, xmm9, xmm3, 16
-        vpshufd	xmm9, xmm9, 78	; 78 swaps the two qwords
-        vpxor	xmm9, xmm9, xmm8
-        vpclmulqdq	xmm8, xmm9, xmm3, 16
-        vpshufd	xmm9, xmm9, 78
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm0, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp], xmm5	; table slot 0 = H ^ 1
-        vmovdqu	OWORD PTR [rsp+16], xmm0	; table slot 1 = H ^ 2
-        ; H ^ 3 and H ^ 4
-        vpclmulqdq	xmm11, xmm0, xmm5, 16
-        vpclmulqdq	xmm10, xmm0, xmm5, 1
-        vpclmulqdq	xmm9, xmm0, xmm5, 0
-        vpclmulqdq	xmm12, xmm0, xmm5, 17
-        vpclmulqdq	xmm13, xmm0, xmm0, 0
-        vpclmulqdq	xmm14, xmm0, xmm0, 17
-        vpxor	xmm11, xmm11, xmm10
-        vpslldq	xmm10, xmm11, 8
-        vpsrldq	xmm11, xmm11, 8
-        vpxor	xmm10, xmm10, xmm9
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm10, xmm10, xmm9
-        vpxor	xmm13, xmm13, xmm8
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm12, xmm12, xmm11
-        vpxor	xmm13, xmm13, xmm8
-        vpxor	xmm10, xmm10, xmm12
-        vpxor	xmm2, xmm13, xmm14
-        vpxor	xmm1, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp+32], xmm1	; table slot 2 = H ^ 3
-        vmovdqu	OWORD PTR [rsp+48], xmm2	; table slot 3 = H ^ 4
-        ; H ^ 5 and H ^ 6
-        vpclmulqdq	xmm11, xmm1, xmm0, 16
-        vpclmulqdq	xmm10, xmm1, xmm0, 1
-        vpclmulqdq	xmm9, xmm1, xmm0, 0
-        vpclmulqdq	xmm12, xmm1, xmm0, 17
-        vpclmulqdq	xmm13, xmm1, xmm1, 0
-        vpclmulqdq	xmm14, xmm1, xmm1, 17
-        vpxor	xmm11, xmm11, xmm10
-        vpslldq	xmm10, xmm11, 8
-        vpsrldq	xmm11, xmm11, 8
-        vpxor	xmm10, xmm10, xmm9
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm10, xmm10, xmm9
-        vpxor	xmm13, xmm13, xmm8
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm12, xmm12, xmm11
-        vpxor	xmm13, xmm13, xmm8
-        vpxor	xmm10, xmm10, xmm12
-        vpxor	xmm0, xmm13, xmm14
-        vpxor	xmm7, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp+64], xmm7	; table slot 4 = H ^ 5
-        vmovdqu	OWORD PTR [rsp+80], xmm0	; table slot 5 = H ^ 6
-        ; H ^ 7 and H ^ 8
-        vpclmulqdq	xmm11, xmm2, xmm1, 16
-        vpclmulqdq	xmm10, xmm2, xmm1, 1
-        vpclmulqdq	xmm9, xmm2, xmm1, 0
-        vpclmulqdq	xmm12, xmm2, xmm1, 17
-        vpclmulqdq	xmm13, xmm2, xmm2, 0
-        vpclmulqdq	xmm14, xmm2, xmm2, 17
-        vpxor	xmm11, xmm11, xmm10
-        vpslldq	xmm10, xmm11, 8
-        vpsrldq	xmm11, xmm11, 8
-        vpxor	xmm10, xmm10, xmm9
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm10, xmm10, xmm9
-        vpxor	xmm13, xmm13, xmm8
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm12, xmm12, xmm11
-        vpxor	xmm13, xmm13, xmm8
-        vpxor	xmm10, xmm10, xmm12
-        vpxor	xmm0, xmm13, xmm14
-        vpxor	xmm7, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp+96], xmm7	; table slot 6 = H ^ 7
-        vmovdqu	OWORD PTR [rsp+112], xmm0	; table slot 7 = H ^ 8
-L_AES_GCM_decrypt_update_avx2_ghash_128:	; main loop: 128 bytes (eight blocks) per iteration
-        ; aesenc_128_ghash
-        lea	rcx, QWORD PTR [r11+rdi]	; rcx = current input position
-        lea	rdx, QWORD PTR [r10+rdi]	; rdx = current output position
-        ; aesenc_ctr: build eight counter blocks in xmm8-xmm15 and advance the stored counter
-        vmovdqu	xmm0, OWORD PTR [rsp+128]
-        vmovdqu	xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpaddd	xmm9, xmm0, OWORD PTR L_avx2_aes_gcm_one
-        vpshufb	xmm8, xmm0, xmm1
-        vpaddd	xmm10, xmm0, OWORD PTR L_avx2_aes_gcm_two
-        vpshufb	xmm9, xmm9, xmm1
-        vpaddd	xmm11, xmm0, OWORD PTR L_avx2_aes_gcm_three
-        vpshufb	xmm10, xmm10, xmm1
-        vpaddd	xmm12, xmm0, OWORD PTR L_avx2_aes_gcm_four
-        vpshufb	xmm11, xmm11, xmm1
-        vpaddd	xmm13, xmm0, OWORD PTR L_avx2_aes_gcm_five
-        vpshufb	xmm12, xmm12, xmm1
-        vpaddd	xmm14, xmm0, OWORD PTR L_avx2_aes_gcm_six
-        vpshufb	xmm13, xmm13, xmm1
-        vpaddd	xmm15, xmm0, OWORD PTR L_avx2_aes_gcm_seven
-        vpshufb	xmm14, xmm14, xmm1
-        vpaddd	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_eight
-        vpshufb	xmm15, xmm15, xmm1
-        ; aesenc_xor: XOR every counter block with the first round key
-        vmovdqu	xmm7, OWORD PTR [rax]
-        vmovdqu	OWORD PTR [rsp+128], xmm0	; save the advanced counter for the next iteration
-        vpxor	xmm8, xmm8, xmm7
-        vpxor	xmm9, xmm9, xmm7
-        vpxor	xmm10, xmm10, xmm7
-        vpxor	xmm11, xmm11, xmm7
-        vpxor	xmm12, xmm12, xmm7
-        vpxor	xmm13, xmm13, xmm7
-        vpxor	xmm14, xmm14, xmm7
-        vpxor	xmm15, xmm15, xmm7
-        ; aesenc_pclmul_1: input block 0 (XORed with the running hash) times H ^ 8, interleaved with an AES round
-        vmovdqu	xmm1, OWORD PTR [rcx]
-        vmovdqu	xmm0, OWORD PTR [rax+16]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vmovdqu	xmm2, OWORD PTR [rsp+112]
-        vpxor	xmm1, xmm1, xmm6	; fold the running GHASH state into the first block
-        vpclmulqdq	xmm5, xmm1, xmm2, 16	; xmm5 accumulates the cross products
-        vpclmulqdq	xmm3, xmm1, xmm2, 1
-        vpclmulqdq	xmm6, xmm1, xmm2, 0	; xmm6 accumulates the low products
-        vpclmulqdq	xmm7, xmm1, xmm2, 17	; xmm7 accumulates the high products
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_2: input block 1 times H ^ 7
-        vmovdqu	xmm1, OWORD PTR [rcx+16]
-        vmovdqu	xmm0, OWORD PTR [rsp+96]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rax+32]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n: input block 2 times H ^ 6
-        vmovdqu	xmm1, OWORD PTR [rcx+32]
-        vmovdqu	xmm0, OWORD PTR [rsp+80]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rax+48]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n: input block 3 times H ^ 5
-        vmovdqu	xmm1, OWORD PTR [rcx+48]
-        vmovdqu	xmm0, OWORD PTR [rsp+64]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rax+64]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n: input block 4 times H ^ 4
-        vmovdqu	xmm1, OWORD PTR [rcx+64]
-        vmovdqu	xmm0, OWORD PTR [rsp+48]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rax+80]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n: input block 5 times H ^ 3
-        vmovdqu	xmm1, OWORD PTR [rcx+80]
-        vmovdqu	xmm0, OWORD PTR [rsp+32]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rax+96]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n: input block 6 times H ^ 2
-        vmovdqu	xmm1, OWORD PTR [rcx+96]
-        vmovdqu	xmm0, OWORD PTR [rsp+16]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rax+112]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n: input block 7 times H ^ 1
-        vmovdqu	xmm1, OWORD PTR [rcx+112]
-        vmovdqu	xmm0, OWORD PTR [rsp]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rax+128]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_l: combine the partial products and reduce them into xmm6, interleaved with the next AES round
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm6, xmm6, xmm4
-        vpxor	xmm5, xmm5, xmm3
-        vpslldq	xmm1, xmm5, 8	; low half of the cross products goes into the low 128 bits
-        vpsrldq	xmm5, xmm5, 8	; high half of the cross products goes into the high 128 bits
-        vmovdqu	xmm4, OWORD PTR [rax+144]
-        vmovdqu	xmm0, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vaesenc	xmm8, xmm8, xmm4
-        vpxor	xmm6, xmm6, xmm1
-        vpxor	xmm7, xmm7, xmm5
-        vpclmulqdq	xmm3, xmm6, xmm0, 16
-        vaesenc	xmm9, xmm9, xmm4
-        vaesenc	xmm10, xmm10, xmm4
-        vaesenc	xmm11, xmm11, xmm4
-        vpshufd	xmm6, xmm6, 78
-        vpxor	xmm6, xmm6, xmm3
-        vpclmulqdq	xmm3, xmm6, xmm0, 16
-        vaesenc	xmm12, xmm12, xmm4
-        vaesenc	xmm13, xmm13, xmm4
-        vaesenc	xmm14, xmm14, xmm4
-        vpshufd	xmm6, xmm6, 78
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm6, xmm6, xmm7	; xmm6 = updated GHASH state for these eight blocks
-        vaesenc	xmm15, xmm15, xmm4
-        cmp	r8d, 11
-        vmovdqu	xmm7, OWORD PTR [rax+160]	; last round key when the round count is below 11
-        jl	L_AES_GCM_decrypt_update_avx2_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+176]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r8d, 13
-        vmovdqu	xmm7, OWORD PTR [rax+192]	; last round key when the round count is 11 or 12
-        jl	L_AES_GCM_decrypt_update_avx2_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+208]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+224]	; last round key when the round count is 13 or more
-L_AES_GCM_decrypt_update_avx2_aesenc_128_ghash_avx_done:
-        ; aesenc_last: final AES round, then XOR the keystream with the input and store the result
-        vaesenclast	xmm8, xmm8, xmm7
-        vaesenclast	xmm9, xmm9, xmm7
-        vaesenclast	xmm10, xmm10, xmm7
-        vaesenclast	xmm11, xmm11, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx]
-        vmovdqu	xmm1, OWORD PTR [rcx+16]
-        vmovdqu	xmm2, OWORD PTR [rcx+32]
-        vmovdqu	xmm3, OWORD PTR [rcx+48]
-        vpxor	xmm8, xmm8, xmm0
-        vpxor	xmm9, xmm9, xmm1
-        vpxor	xmm10, xmm10, xmm2
-        vpxor	xmm11, xmm11, xmm3
-        vmovdqu	OWORD PTR [rdx], xmm8
-        vmovdqu	OWORD PTR [rdx+16], xmm9
-        vmovdqu	OWORD PTR [rdx+32], xmm10
-        vmovdqu	OWORD PTR [rdx+48], xmm11
-        vaesenclast	xmm12, xmm12, xmm7
-        vaesenclast	xmm13, xmm13, xmm7
-        vaesenclast	xmm14, xmm14, xmm7
-        vaesenclast	xmm15, xmm15, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+64]
-        vmovdqu	xmm1, OWORD PTR [rcx+80]
-        vmovdqu	xmm2, OWORD PTR [rcx+96]
-        vmovdqu	xmm3, OWORD PTR [rcx+112]
-        vpxor	xmm12, xmm12, xmm0
-        vpxor	xmm13, xmm13, xmm1
-        vpxor	xmm14, xmm14, xmm2
-        vpxor	xmm15, xmm15, xmm3
-        vmovdqu	OWORD PTR [rdx+64], xmm12
-        vmovdqu	OWORD PTR [rdx+80], xmm13
-        vmovdqu	OWORD PTR [rdx+96], xmm14
-        vmovdqu	OWORD PTR [rdx+112], xmm15
-        ; aesenc_128_ghash - end
-        add	edi, 128
-        cmp	edi, r13d
-        jl	L_AES_GCM_decrypt_update_avx2_ghash_128
-        vmovdqu	xmm5, OWORD PTR [rsp]	; reload H ^ 1 for the single-block loop
-        vmovdqu	xmm4, OWORD PTR [rsp+128]	; reload the advanced counter
-        vmovdqu	xmm15, OWORD PTR [rsp+144]
-L_AES_GCM_decrypt_update_avx2_done_128:
-        cmp	edi, r9d
-        jge	L_AES_GCM_decrypt_update_avx2_done_dec	; nothing left to process
-        mov	r13d, r9d
-        and	r13d, 4294967280	; r13d = byte count rounded down to a multiple of 16 (mask 0FFFFFFF0h)
-        cmp	edi, r13d
-        jge	L_AES_GCM_decrypt_update_avx2_last_block_done	; a trailing partial block is not handled in this routine
-L_AES_GCM_decrypt_update_avx2_last_block_start:	; one 16-byte block per iteration
-        vmovdqu	xmm11, OWORD PTR [r11+rdi]	; xmm11 = input block, kept for the final XOR
-        vpshufb	xmm10, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64	; xmm10 = counter block to encrypt
-        vpshufb	xmm12, xmm11, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpaddd	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_one	; advance the counter
-        vpxor	xmm12, xmm12, xmm6	; fold the running GHASH state into the block
-        ; aesenc_gfmul_sb: multiply by H and reduce while encrypting the counter block
-        vpclmulqdq	xmm2, xmm12, xmm5, 1
-        vpclmulqdq	xmm3, xmm12, xmm5, 16
-        vpclmulqdq	xmm1, xmm12, xmm5, 0
-        vpclmulqdq	xmm8, xmm12, xmm5, 17
-        vpxor	xmm10, xmm10, [rax]
-        vaesenc	xmm10, xmm10, [rax+16]
-        vpxor	xmm3, xmm3, xmm2
-        vpslldq	xmm2, xmm3, 8
-        vpsrldq	xmm3, xmm3, 8
-        vaesenc	xmm10, xmm10, [rax+32]
-        vpxor	xmm2, xmm2, xmm1
-        vpclmulqdq	xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vaesenc	xmm10, xmm10, [rax+48]
-        vaesenc	xmm10, xmm10, [rax+64]
-        vaesenc	xmm10, xmm10, [rax+80]
-        vpshufd	xmm2, xmm2, 78
-        vpxor	xmm2, xmm2, xmm1
-        vpclmulqdq	xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vaesenc	xmm10, xmm10, [rax+96]
-        vaesenc	xmm10, xmm10, [rax+112]
-        vaesenc	xmm10, xmm10, [rax+128]
-        vpshufd	xmm2, xmm2, 78
-        vaesenc	xmm10, xmm10, [rax+144]
-        vpxor	xmm8, xmm8, xmm3
-        vpxor	xmm2, xmm2, xmm8
-        vmovdqu	xmm0, OWORD PTR [rax+160]	; last round key when the round count is below 11
-        cmp	r8d, 11
-        jl	L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm10, xmm10, [rax+176]
-        vmovdqu	xmm0, OWORD PTR [rax+192]	; last round key when the round count is 11 or 12
-        cmp	r8d, 13
-        jl	L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm10, xmm10, [rax+208]
-        vmovdqu	xmm0, OWORD PTR [rax+224]	; last round key when the round count is 13 or more
-L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last:
-        vaesenclast	xmm10, xmm10, xmm0
-        vpxor	xmm6, xmm2, xmm1	; xmm6 = updated GHASH state
-        vpxor	xmm10, xmm10, xmm11	; keystream XOR input block
-        vmovdqu	OWORD PTR [r10+rdi], xmm10
-        add	edi, 16
-        cmp	edi, r13d
-        jl	L_AES_GCM_decrypt_update_avx2_last_block_start
-L_AES_GCM_decrypt_update_avx2_last_block_done:
-L_AES_GCM_decrypt_update_avx2_done_dec:
-        vmovdqu	OWORD PTR [r12], xmm6	; write the GHASH state back to the caller
-        vmovdqu	OWORD PTR [r15], xmm4	; write the advanced counter back to the caller
-        vzeroupper
-        vmovdqu	xmm6, OWORD PTR [rsp+168]	; restore the callee-saved vector registers
-        vmovdqu	xmm7, OWORD PTR [rsp+184]
-        vmovdqu	xmm8, OWORD PTR [rsp+200]
-        vmovdqu	xmm9, OWORD PTR [rsp+216]
-        vmovdqu	xmm10, OWORD PTR [rsp+232]
-        vmovdqu	xmm11, OWORD PTR [rsp+248]
-        vmovdqu	xmm12, OWORD PTR [rsp+264]
-        vmovdqu	xmm13, OWORD PTR [rsp+280]
-        vmovdqu	xmm14, OWORD PTR [rsp+296]
-        vmovdqu	xmm15, OWORD PTR [rsp+312]
-        add	rsp, 328
-        pop	rdi	; pops mirror the push order at entry
-        pop	r15
-        pop	r14
-        pop	r12
-        pop	r13
-        ret
-AES_GCM_decrypt_update_avx2 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_decrypt_final_avx2 PROC
-        ; Finish an AES-GCM decryption: fold the length block into the GHASH
-        ; state, derive the expected tag and compare it with the supplied tag.
-        ; ABI: Microsoft x64.
-        ; In:  rcx = pointer to the 16-byte GHASH state
-        ;      rdx = pointer to the tag to verify
-        ;      r8d = number of tag bytes to compare (assumed non-zero)
-        ;      r9d = first byte length (presumably the ciphertext length)
-        ;      [rsp+40] = second byte length (presumably the AAD length)
-        ;      [rsp+48] = pointer to the 16-byte hash key H
-        ;      [rsp+56] = pointer to the 16 bytes XORed into the hash to form the tag
-        ;      [rsp+64] = pointer to an int that receives 1 on match, 0 on mismatch
-        ; Clobbers: rax, r9, r10, r11, xmm0-xmm5, flags.
-        push	r12
-        push	r13
-        push	r14
-        ; Stack arguments: 3 pushes + return address + 32-byte shadow space = 64
-        mov	eax, DWORD PTR [rsp+64]
-        mov	r10, QWORD PTR [rsp+72]
-        mov	r11, QWORD PTR [rsp+80]
-        mov	r12, QWORD PTR [rsp+88]
-        sub	rsp, 48
-        ; xmm6 and xmm7 are callee-saved; [rsp] is scratch for the tag bytes
-        vmovdqu	OWORD PTR [rsp+16], xmm6
-        vmovdqu	OWORD PTR [rsp+32], xmm7
-        vmovdqu	xmm4, OWORD PTR [rcx]
-        vmovdqu	xmm5, OWORD PTR [r10]
-        vmovdqu	xmm6, OWORD PTR [r11]
-        ; Shift H left by one bit and, when the top bit was set, XOR in
-        ; L_avx2_aes_gcm_mod2_128
-        vpsrlq	xmm1, xmm5, 63
-        vpsllq	xmm0, xmm5, 1
-        vpslldq	xmm1, xmm1, 8
-        vpor	xmm0, xmm0, xmm1
-        vpshufd	xmm5, xmm5, 255
-        vpsrad	xmm5, xmm5, 31
-        vpand	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpxor	xmm5, xmm5, xmm0
-        ; calc_tag
-        ; The fourth argument is a 32-bit value and the upper half of r9 is
-        ; undefined on entry, so zero-extend it before the 64-bit shift.
-        mov	r9d, r9d
-        ; Convert both byte lengths to bit lengths (eax was zero-extended by
-        ; the 32-bit load above)
-        shl	r9, 3
-        shl	rax, 3
-        vmovq	xmm0, r9
-        vmovq	xmm1, rax
-        vpunpcklqdq	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm4
-        ; ghash_gfmul_red
-        vpclmulqdq	xmm7, xmm0, xmm5, 16
-        vpclmulqdq	xmm3, xmm0, xmm5, 1
-        vpclmulqdq	xmm2, xmm0, xmm5, 0
-        vpxor	xmm7, xmm7, xmm3
-        vpslldq	xmm3, xmm7, 8
-        vpsrldq	xmm7, xmm7, 8
-        vpxor	xmm3, xmm3, xmm2
-        vpclmulqdq	xmm0, xmm0, xmm5, 17
-        vpclmulqdq	xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm3, xmm3, 78
-        vpxor	xmm3, xmm3, xmm2
-        vpclmulqdq	xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm3, xmm3, 78
-        vpxor	xmm0, xmm0, xmm7
-        vpxor	xmm0, xmm0, xmm3
-        vpxor	xmm0, xmm0, xmm2
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        ; xmm0 = expected tag
-        vpxor	xmm0, xmm0, xmm6
-        ; cmp_tag
-        cmp	r8d, 16
-        je	L_AES_GCM_decrypt_final_avx2_cmp_tag_16
-        ; Short tag: OR together the XOR of every byte pair so the loop does
-        ; not exit early on the first mismatch.
-        ; NOTE(review): a length of 0 would not terminate this loop as written.
-        xor	r13, r13
-        xor	r10, r10
-        vmovdqu	OWORD PTR [rsp], xmm0
-L_AES_GCM_decrypt_final_avx2_cmp_tag_loop:
-        movzx	r14d, BYTE PTR [rsp+r13]
-        xor	r14b, BYTE PTR [rdx+r13]
-        or	r10b, r14b
-        inc	r13d
-        cmp	r13d, r8d
-        jne	L_AES_GCM_decrypt_final_avx2_cmp_tag_loop
-        ; r10 == 0 means every byte matched; turn that into 1, anything else into 0
-        test	r10, r10
-        sete	r10b
-        jmp	L_AES_GCM_decrypt_final_avx2_cmp_tag_done
-L_AES_GCM_decrypt_final_avx2_cmp_tag_16:
-        vmovdqu	xmm1, OWORD PTR [rdx]
-        vpcmpeqb	xmm0, xmm0, xmm1
-        vpmovmskb	r13, xmm0
-        ; r13d == 0xFFFF (all 16 bytes equal) => result 1, otherwise result 0
-        xor	r10d, r10d
-        cmp	r13d, 65535
-        sete	r10b
-L_AES_GCM_decrypt_final_avx2_cmp_tag_done:
-        mov	DWORD PTR [r12], r10d
-        vzeroupper
-        vmovdqu	xmm6, OWORD PTR [rsp+16]
-        vmovdqu	xmm7, OWORD PTR [rsp+32]
-        add	rsp, 48
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-AES_GCM_decrypt_final_avx2 ENDP
-_text ENDS
-ENDIF
-END

+ 0 - 215
lib/wolfssl/wolfcrypt/src/include.am

@@ -1,215 +0,0 @@
-# vim:ft=automake
-# All paths should be given relative to the root
-
-# The files named by ASYNC_FILES are added to BUILT_SOURCES and to
-# MAINTAINERCLEANFILES below; the $(ASYNC_FILES) rule later in this file
-# creates them with touch.
-ASYNC_FILES =						\
-	wolfcrypt/src/port/cavium/cavium_nitrox.c	\
-	wolfcrypt/src/port/intel/quickassist.c		\
-	wolfcrypt/src/port/intel/quickassist_mem.c
-
-BUILT_SOURCES+= $(ASYNC_FILES)
-
-MAINTAINERCLEANFILES+= $(ASYNC_FILES)
-
-EXTRA_DIST += wolfcrypt/src/misc.c
-EXTRA_DIST += wolfcrypt/src/evp.c
-EXTRA_DIST += wolfcrypt/src/asm.c
-EXTRA_DIST += wolfcrypt/src/aes_asm.asm
-EXTRA_DIST += wolfcrypt/src/aes_gcm_asm.asm
-EXTRA_DIST += wolfcrypt/src/wc_dsp.c
-EXTRA_DIST += wolfcrypt/src/sp_dsp32.c
-EXTRA_DIST += wolfcrypt/src/sp_x86_64_asm.asm
-
-EXTRA_DIST += \
-              wolfcrypt/src/ecc_fp.c \
-              wolfcrypt/src/fp_mont_small.i \
-              wolfcrypt/src/fp_mul_comba_12.i \
-              wolfcrypt/src/fp_mul_comba_17.i \
-              wolfcrypt/src/fp_mul_comba_20.i \
-              wolfcrypt/src/fp_mul_comba_24.i \
-              wolfcrypt/src/fp_mul_comba_28.i \
-              wolfcrypt/src/fp_mul_comba_32.i \
-              wolfcrypt/src/fp_mul_comba_3.i \
-              wolfcrypt/src/fp_mul_comba_48.i \
-              wolfcrypt/src/fp_mul_comba_4.i \
-              wolfcrypt/src/fp_mul_comba_64.i \
-              wolfcrypt/src/fp_mul_comba_6.i \
-              wolfcrypt/src/fp_mul_comba_7.i \
-              wolfcrypt/src/fp_mul_comba_8.i \
-              wolfcrypt/src/fp_mul_comba_9.i \
-              wolfcrypt/src/fp_mul_comba_small_set.i \
-              wolfcrypt/src/fp_sqr_comba_12.i \
-              wolfcrypt/src/fp_sqr_comba_17.i \
-              wolfcrypt/src/fp_sqr_comba_20.i \
-              wolfcrypt/src/fp_sqr_comba_24.i \
-              wolfcrypt/src/fp_sqr_comba_28.i \
-              wolfcrypt/src/fp_sqr_comba_32.i \
-              wolfcrypt/src/fp_sqr_comba_3.i \
-              wolfcrypt/src/fp_sqr_comba_48.i \
-              wolfcrypt/src/fp_sqr_comba_4.i \
-              wolfcrypt/src/fp_sqr_comba_64.i \
-              wolfcrypt/src/fp_sqr_comba_6.i \
-              wolfcrypt/src/fp_sqr_comba_7.i \
-              wolfcrypt/src/fp_sqr_comba_8.i \
-              wolfcrypt/src/fp_sqr_comba_9.i \
-              wolfcrypt/src/fp_sqr_comba_small_set.i \
-              wolfcrypt/src/fe_x25519_128.i
-
-EXTRA_DIST += wolfcrypt/src/port/ti/ti-aes.c \
-              wolfcrypt/src/port/ti/ti-des3.c \
-              wolfcrypt/src/port/ti/ti-hash.c \
-              wolfcrypt/src/port/ti/ti-ccm.c \
-              wolfcrypt/src/port/pic32/pic32mz-crypt.c \
-              wolfcrypt/src/port/nrf51.c \
-              wolfcrypt/src/port/arm/armv8-aes.c \
-              wolfcrypt/src/port/arm/armv8-sha256.c \
-              wolfcrypt/src/port/arm/armv8-chacha.c \
-              wolfcrypt/src/port/aria/aria-crypt.c \
-              wolfcrypt/src/port/aria/aria-cryptocb.c \
-              wolfcrypt/src/port/nxp/ksdk_port.c \
-              wolfcrypt/src/port/nxp/dcp_port.c \
-              wolfcrypt/src/port/nxp/se050_port.c \
-              wolfcrypt/src/port/nxp/README.md \
-              wolfcrypt/src/port/atmel/README.md \
-              wolfcrypt/src/port/xilinx/xil-sha3.c \
-              wolfcrypt/src/port/xilinx/xil-aesgcm.c \
-              wolfcrypt/src/port/xilinx/xil-versal-glue.c \
-              wolfcrypt/src/port/xilinx/xil-versal-trng.c \
-              wolfcrypt/src/port/caam/caam_aes.c \
-              wolfcrypt/src/port/caam/caam_driver.c \
-              wolfcrypt/src/port/caam/caam_error.c \
-              wolfcrypt/src/port/caam/caam_qnx.c \
-              wolfcrypt/src/port/caam/caam_integrity.c \
-              wolfcrypt/src/port/caam/caam_sha.c \
-              wolfcrypt/src/port/caam/caam_doc.pdf \
-              wolfcrypt/src/port/caam/wolfcaam_init.c \
-              wolfcrypt/src/port/caam/wolfcaam_seco.c \
-              wolfcrypt/src/port/caam/wolfcaam_qnx.c \
-              wolfcrypt/src/port/caam/wolfcaam_x25519.c \
-              wolfcrypt/src/port/caam/wolfcaam_ecdsa.c \
-              wolfcrypt/src/port/caam/wolfcaam_cmac.c \
-              wolfcrypt/src/port/caam/wolfcaam_hash.c \
-              wolfcrypt/src/port/caam/wolfcaam_rsa.c \
-              wolfcrypt/src/port/caam/wolfcaam_hmac.c \
-              wolfcrypt/src/port/caam/wolfcaam_aes.c \
-              wolfcrypt/src/port/caam/wolfcaam_fsl_nxp.c \
-              wolfcrypt/src/port/silabs/silabs_aes.c \
-              wolfcrypt/src/port/silabs/silabs_ecc.c \
-              wolfcrypt/src/port/silabs/silabs_hash.c \
-              wolfcrypt/src/port/silabs/silabs_random.c \
-              wolfcrypt/src/port/silabs/README.md \
-              wolfcrypt/src/port/st/stm32.c \
-              wolfcrypt/src/port/st/stsafe.c \
-              wolfcrypt/src/port/st/README.md \
-              wolfcrypt/src/port/af_alg/afalg_aes.c \
-              wolfcrypt/src/port/af_alg/afalg_hash.c \
-              wolfcrypt/src/port/kcapi/kcapi_aes.c \
-              wolfcrypt/src/port/kcapi/kcapi_hash.c \
-              wolfcrypt/src/port/kcapi/kcapi_hmac.c \
-              wolfcrypt/src/port/kcapi/kcapi_ecc.c \
-              wolfcrypt/src/port/kcapi/kcapi_rsa.c \
-              wolfcrypt/src/port/kcapi/kcapi_dh.c \
-              wolfcrypt/src/port/kcapi/README.md \
-              wolfcrypt/src/port/devcrypto/devcrypto_hash.c \
-              wolfcrypt/src/port/devcrypto/wc_devcrypto.c \
-              wolfcrypt/src/port/devcrypto/README.md \
-              wolfcrypt/src/port/mynewt/mynewt_port.c \
-              wolfcrypt/src/port/Espressif/esp32_aes.c \
-              wolfcrypt/src/port/Espressif/esp32_sha.c \
-              wolfcrypt/src/port/Espressif/esp32_util.c \
-              wolfcrypt/src/port/Espressif/esp32_mp.c \
-              wolfcrypt/src/port/Espressif/README.md \
-              wolfcrypt/src/port/arm/cryptoCell.c \
-              wolfcrypt/src/port/arm/cryptoCellHash.c \
-              wolfcrypt/src/port/Renesas/renesas_tsip_aes.c \
-              wolfcrypt/src/port/Renesas/renesas_tsip_sha.c \
-              wolfcrypt/src/port/Renesas/renesas_tsip_rsa.c \
-              wolfcrypt/src/port/Renesas/renesas_tsip_util.c \
-              wolfcrypt/src/port/Renesas/renesas_fspsm_util.c \
-              wolfcrypt/src/port/Renesas/renesas_fspsm_aes.c \
-              wolfcrypt/src/port/Renesas/renesas_fspsm_sha.c \
-              wolfcrypt/src/port/Renesas/renesas_fspsm_rsa.c \
-              wolfcrypt/src/port/Renesas/renesas_common.c \
-              wolfcrypt/src/port/Renesas/renesas_rx64_hw_sha.c \
-              wolfcrypt/src/port/Renesas/renesas_rx64_hw_util.c \
-              wolfcrypt/src/port/Renesas/README.md \
-              wolfcrypt/src/port/cypress/psoc6_crypto.c
-
-# Each file listed in ASYNC_FILES is produced by touching it under $(srcdir).
-$(ASYNC_FILES):
-	$(AM_V_at)touch $(srcdir)/$@
-
-if BUILD_CRYPTOCB
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/cryptocb.c
-endif
-
-if BUILD_PKCS11
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/wc_pkcs11.c
-endif
-
-if BUILD_DEVCRYPTO
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/devcrypto/devcrypto_ecdsa.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/devcrypto/devcrypto_x25519.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/devcrypto/devcrypto_rsa.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/devcrypto/devcrypto_hmac.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/devcrypto/devcrypto_hash.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/devcrypto/devcrypto_aes.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/devcrypto/wc_devcrypto.c
-endif
-
-if BUILD_CAVIUM
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/cavium/cavium_nitrox.c
-endif
-EXTRA_DIST += wolfcrypt/src/port/cavium/README.md
-
-if BUILD_OCTEON_SYNC
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/cavium/cavium_octeon_sync.c
-endif
-EXTRA_DIST += wolfcrypt/src/port/cavium/README_Octeon.md
-
-if BUILD_INTEL_QA
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/intel/quickassist.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/intel/quickassist_mem.c
-endif
-EXTRA_DIST += wolfcrypt/src/port/intel/README.md
-
-if BUILD_INTEL_QA_SYNC
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/intel/quickassist_sync.c
-endif
-
-if BUILD_CRYPTOAUTHLIB
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/atmel/atmel.c
-endif
-
-if BUILD_IOTSAFE
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/iotsafe/iotsafe.c
-endif
-
-
-if BUILD_CAAM
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/caam/wolfcaam_init.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/caam/wolfcaam_qnx.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/caam/wolfcaam_seco.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/caam/wolfcaam_x25519.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/caam/wolfcaam_ecdsa.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/caam/wolfcaam_cmac.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/caam/wolfcaam_aes.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/caam/wolfcaam_hash.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/caam/wolfcaam_rsa.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/caam/wolfcaam_hmac.c
-endif
-
-if BUILD_SE050
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/nxp/se050_port.c
-endif
-
-if BUILD_PSA
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/psa/psa.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/psa/psa_hash.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/psa/psa_aes.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/psa/psa_pkcbs.c
-endif
-EXTRA_DIST += wolfcrypt/src/port/psa/README.md
-
-if BUILD_MAXQ10XX
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/maxim/maxq10xx.c
-endif
-EXTRA_DIST += wolfcrypt/src/port/maxim/README.md

+ 0 - 76932
lib/wolfssl/wolfcrypt/src/sp_x86_64_asm.asm

@@ -1,76932 +0,0 @@
-; /* sp_x86_64_asm
-;  *
-;  * Copyright (C) 2006-2023 wolfSSL Inc.
-;  *
-;  * This file is part of wolfSSL.
-;  *
-;  * wolfSSL is free software; you can redistribute it and/or modify
-;  * it under the terms of the GNU General Public License as published by
-;  * the Free Software Foundation; either version 2 of the License, or
-;  * (at your option) any later version.
-;  *
-;  * wolfSSL is distributed in the hope that it will be useful,
-;  * but WITHOUT ANY WARRANTY; without even the implied warranty of
-;  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;  * GNU General Public License for more details.
-;  *
-;  * You should have received a copy of the GNU General Public License
-;  * along with this program; if not, write to the Free Software
-;  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
-;  */
-IF @Version LT 1200  ; taken only when the assembler's version value is below 1200
-; AVX2 instructions are not recognized by old versions of MASM: flag them as unsupported unless already flagged
-IFNDEF NO_AVX2_SUPPORT
-NO_AVX2_SUPPORT = 1
-ENDIF
-; The MOVBE instruction is not recognized by old versions of MASM: flag it as unsupported unless already flagged
-IFNDEF NO_MOVBE_SUPPORT
-NO_MOVBE_SUPPORT = 1
-ENDIF
-ENDIF  ; end of the @Version check
-
-IFNDEF HAVE_INTEL_AVX1  ; AVX1 is switched on whenever the build did not define it already
-HAVE_INTEL_AVX1 = 1
-ENDIF
-IFNDEF NO_AVX2_SUPPORT  ; AVX2 is switched on unless it was ruled out above or by the build
-HAVE_INTEL_AVX2 = 1
-ENDIF
-
-IFNDEF _WIN64  ; guarantee _WIN64 is defined for the rest of the file
-_WIN64 = 1
-ENDIF
-
-IFNDEF WOLFSSL_SP_NO_2048
-IFNDEF WOLFSSL_SP_NO_2048
-; /* Read big endian unsigned byte array into r, least significant 64-bit word first.
-;  * Uses the bswap instruction; words not covered by the input are zeroed up to r + 256 bytes.
-;  *
-;  * r  A single precision integer (destination, addressed through rcx).
-;  * size  Maximum number of bytes to convert (never read here; the end is fixed at r + 256).
-;  * a  Byte array (source, addressed through r8).
-;  * n  Number of bytes in array to read (counted down in r9).
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_from_bin_bswap PROC
-        push	r12  ; r12 and r13 are popped again before ret
-        push	r13
-        mov	r11, r8
-        mov	r12, rcx
-        add	r11, r9  ; r11 = a + n, one past the last input byte
-        add	r12, 256  ; r12 = r + 256, where the zero fill stops
-        xor	r13, r13  ; r13 stays 0: compared against and stored as zero below
-        jmp	L_2048_from_bin_bswap_64_end  ; check the count before the first pass
-L_2048_from_bin_bswap_64_start:  ; 64 input bytes per pass, walking a backwards from its end
-        sub	r11, 64
-        mov	rax, QWORD PTR [r11+56]  ; last 8 bytes of this 64-byte chunk
-        mov	r10, QWORD PTR [r11+48]
-        bswap	rax  ; reverse the byte order of the loaded word
-        bswap	r10
-        mov	QWORD PTR [rcx], rax  ; the tail of the input becomes the lowest output word
-        mov	QWORD PTR [rcx+8], r10
-        mov	rax, QWORD PTR [r11+40]
-        mov	r10, QWORD PTR [r11+32]
-        bswap	rax
-        bswap	r10
-        mov	QWORD PTR [rcx+16], rax
-        mov	QWORD PTR [rcx+24], r10
-        mov	rax, QWORD PTR [r11+24]
-        mov	r10, QWORD PTR [r11+16]
-        bswap	rax
-        bswap	r10
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r10
-        mov	rax, QWORD PTR [r11+8]
-        mov	r10, QWORD PTR [r11]
-        bswap	rax
-        bswap	r10
-        mov	QWORD PTR [rcx+48], rax
-        mov	QWORD PTR [rcx+56], r10
-        add	rcx, 64  ; advance the output by the 8 words just written
-        sub	r9, 64  ; 64 fewer input bytes left
-L_2048_from_bin_bswap_64_end:
-        cmp	r9, 63
-        jg	L_2048_from_bin_bswap_64_start  ; signed compare: loop while at least 64 bytes remain
-        jmp	L_2048_from_bin_bswap_8_end  ; then continue one 8-byte word at a time
-L_2048_from_bin_bswap_8_start:  ; one 8-byte word per pass, still walking backwards
-        sub	r11, 8
-        mov	rax, QWORD PTR [r11]
-        bswap	rax
-        mov	QWORD PTR [rcx], rax
-        add	rcx, 8
-        sub	r9, 8
-L_2048_from_bin_bswap_8_end:
-        cmp	r9, 7
-        jg	L_2048_from_bin_bswap_8_start  ; signed compare: loop while at least 8 bytes remain
-        cmp	r9, r13  ; r13 is 0: are any bytes left over?
-        je	L_2048_from_bin_bswap_hi_end  ; none left: go straight to the zero fill
-        mov	r10, r13  ; r10 = 0, accumulator for the partial top word
-        mov	rax, r13  ; clear rax so only al contributes to the add below
-L_2048_from_bin_bswap_hi_start:  ; leftover bytes are read forward from the start of a (r8)
-        mov	al, BYTE PTR [r8]
-        shl	r10, 8  ; make room for the next byte
-        inc	r8
-        add	r10, rax
-        dec	r9
-        jg	L_2048_from_bin_bswap_hi_start  ; uses the flags from dec: loop while bytes remain
-        mov	QWORD PTR [rcx], r10  ; store the partial top word
-        add	rcx, 8
-L_2048_from_bin_bswap_hi_end:
-        cmp	rcx, r12
-        jge	L_2048_from_bin_bswap_zero_end  ; output already reaches r + 256: nothing to clear
-L_2048_from_bin_bswap_zero_start:
-        mov	QWORD PTR [rcx], r13  ; write a zero word
-        add	rcx, 8
-        cmp	rcx, r12
-        jl	L_2048_from_bin_bswap_zero_start  ; keep clearing until r + 256 is reached
-L_2048_from_bin_bswap_zero_end:
-        pop	r13
-        pop	r12
-        ret
-sp_2048_from_bin_bswap ENDP
-_text ENDS
-IFNDEF NO_MOVBE_SUPPORT  ; assembled only when MOVBE has not been flagged as unsupported
-; /* Read big endian unsigned byte array into r, least significant 64-bit word first.
-;  * Uses the movbe instruction which is an optional instruction.
-;  *
-;  * r  A single precision integer (destination, addressed through rcx; zeroed up to r + 256).
-;  * size  Maximum number of bytes to convert (never read here; the end is fixed at r + 256).
-;  * a  Byte array (source, addressed through r8).
-;  * n  Number of bytes in array to read (counted down in r9).
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_from_bin_movbe PROC
-        push	r12  ; popped again before ret
-        mov	r11, r8
-        mov	r12, rcx
-        add	r11, r9  ; r11 = a + n, one past the last input byte
-        add	r12, 256  ; r12 = r + 256, where the zero fill stops
-        jmp	L_2048_from_bin_movbe_64_end  ; check the count before the first pass
-L_2048_from_bin_movbe_64_start:  ; 64 input bytes per pass, walking a backwards from its end
-        sub	r11, 64
-        movbe	rax, QWORD PTR [r11+56]  ; load the last 8 bytes of the chunk with bytes reversed
-        movbe	r10, QWORD PTR [r11+48]
-        mov	QWORD PTR [rcx], rax  ; the tail of the input becomes the lowest output word
-        mov	QWORD PTR [rcx+8], r10
-        movbe	rax, QWORD PTR [r11+40]
-        movbe	r10, QWORD PTR [r11+32]
-        mov	QWORD PTR [rcx+16], rax
-        mov	QWORD PTR [rcx+24], r10
-        movbe	rax, QWORD PTR [r11+24]
-        movbe	r10, QWORD PTR [r11+16]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r10
-        movbe	rax, QWORD PTR [r11+8]
-        movbe	r10, QWORD PTR [r11]
-        mov	QWORD PTR [rcx+48], rax
-        mov	QWORD PTR [rcx+56], r10
-        add	rcx, 64  ; advance the output by the 8 words just written
-        sub	r9, 64  ; 64 fewer input bytes left
-L_2048_from_bin_movbe_64_end:
-        cmp	r9, 63
-        jg	L_2048_from_bin_movbe_64_start  ; signed compare: loop while at least 64 bytes remain
-        jmp	L_2048_from_bin_movbe_8_end  ; then continue one 8-byte word at a time
-L_2048_from_bin_movbe_8_start:  ; one 8-byte word per pass, still walking backwards
-        sub	r11, 8
-        movbe	rax, QWORD PTR [r11]
-        mov	QWORD PTR [rcx], rax
-        add	rcx, 8
-        sub	r9, 8
-L_2048_from_bin_movbe_8_end:
-        cmp	r9, 7
-        jg	L_2048_from_bin_movbe_8_start  ; signed compare: loop while at least 8 bytes remain
-        cmp	r9, 0  ; are any bytes left over?
-        je	L_2048_from_bin_movbe_hi_end  ; none left: go straight to the zero fill
-        mov	r10, 0  ; accumulator for the partial top word
-        mov	rax, 0  ; clear rax so only al contributes to the add below
-L_2048_from_bin_movbe_hi_start:  ; leftover bytes are read forward from the start of a (r8)
-        mov	al, BYTE PTR [r8]
-        shl	r10, 8  ; make room for the next byte
-        inc	r8
-        add	r10, rax
-        dec	r9
-        jg	L_2048_from_bin_movbe_hi_start  ; uses the flags from dec: loop while bytes remain
-        mov	QWORD PTR [rcx], r10  ; store the partial top word
-        add	rcx, 8
-L_2048_from_bin_movbe_hi_end:
-        cmp	rcx, r12
-        jge	L_2048_from_bin_movbe_zero_end  ; output already reaches r + 256: nothing to clear
-L_2048_from_bin_movbe_zero_start:
-        mov	QWORD PTR [rcx], 0  ; write a zero word
-        add	rcx, 8
-        cmp	rcx, r12
-        jl	L_2048_from_bin_movbe_zero_start  ; keep clearing until r + 256 is reached
-L_2048_from_bin_movbe_zero_end:
-        pop	r12
-        ret
-sp_2048_from_bin_movbe ENDP
-_text ENDS
-ENDIF  ; NO_MOVBE_SUPPORT not defined
-; /* Write r as big endian to byte array, highest 64-bit word first.
-;  * Fixed length number of bytes written: 256 (32 words of 8 bytes, fully unrolled)
-;  * Uses the bswap instruction; rax and r8 are the only scratch registers.
-;  *
-;  * r  A single precision integer (source words, addressed through rcx).
-;  * a  Byte array (destination, addressed through rdx).
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_to_bin_bswap_32 PROC
-        mov	rax, QWORD PTR [rcx+248]  ; highest word of r (offset 248)
-        mov	r8, QWORD PTR [rcx+240]
-        bswap	rax  ; reverse the byte order of the word
-        bswap	r8
-        mov	QWORD PTR [rdx], rax  ; highest word goes to the first 8 output bytes
-        mov	QWORD PTR [rdx+8], r8
-        mov	rax, QWORD PTR [rcx+232]  ; same pattern from here: source offset falls by 16, destination rises by 16
-        mov	r8, QWORD PTR [rcx+224]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+16], rax
-        mov	QWORD PTR [rdx+24], r8
-        mov	rax, QWORD PTR [rcx+216]
-        mov	r8, QWORD PTR [rcx+208]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+32], rax
-        mov	QWORD PTR [rdx+40], r8
-        mov	rax, QWORD PTR [rcx+200]
-        mov	r8, QWORD PTR [rcx+192]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+48], rax
-        mov	QWORD PTR [rdx+56], r8
-        mov	rax, QWORD PTR [rcx+184]
-        mov	r8, QWORD PTR [rcx+176]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+64], rax
-        mov	QWORD PTR [rdx+72], r8
-        mov	rax, QWORD PTR [rcx+168]
-        mov	r8, QWORD PTR [rcx+160]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+80], rax
-        mov	QWORD PTR [rdx+88], r8
-        mov	rax, QWORD PTR [rcx+152]
-        mov	r8, QWORD PTR [rcx+144]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+96], rax
-        mov	QWORD PTR [rdx+104], r8
-        mov	rax, QWORD PTR [rcx+136]
-        mov	r8, QWORD PTR [rcx+128]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+112], rax
-        mov	QWORD PTR [rdx+120], r8
-        mov	rax, QWORD PTR [rcx+120]
-        mov	r8, QWORD PTR [rcx+112]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+128], rax
-        mov	QWORD PTR [rdx+136], r8
-        mov	rax, QWORD PTR [rcx+104]
-        mov	r8, QWORD PTR [rcx+96]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+144], rax
-        mov	QWORD PTR [rdx+152], r8
-        mov	rax, QWORD PTR [rcx+88]
-        mov	r8, QWORD PTR [rcx+80]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+160], rax
-        mov	QWORD PTR [rdx+168], r8
-        mov	rax, QWORD PTR [rcx+72]
-        mov	r8, QWORD PTR [rcx+64]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+176], rax
-        mov	QWORD PTR [rdx+184], r8
-        mov	rax, QWORD PTR [rcx+56]
-        mov	r8, QWORD PTR [rcx+48]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+192], rax
-        mov	QWORD PTR [rdx+200], r8
-        mov	rax, QWORD PTR [rcx+40]
-        mov	r8, QWORD PTR [rcx+32]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+208], rax
-        mov	QWORD PTR [rdx+216], r8
-        mov	rax, QWORD PTR [rcx+24]
-        mov	r8, QWORD PTR [rcx+16]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+224], rax
-        mov	QWORD PTR [rdx+232], r8
-        mov	rax, QWORD PTR [rcx+8]
-        mov	r8, QWORD PTR [rcx]  ; lowest word of r (offset 0)
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+240], rax
-        mov	QWORD PTR [rdx+248], r8  ; lowest word fills the last 8 output bytes
-        ret
-sp_2048_to_bin_bswap_32 ENDP
-_text ENDS
-IFNDEF NO_MOVBE_SUPPORT  ; assembled only when MOVBE has not been flagged as unsupported
-; /* Write r as big endian to byte array, highest 64-bit word first.
-;  * Fixed length number of bytes written: 256 (32 words of 8 bytes, fully unrolled)
-;  * Uses the movbe instruction which is optional; rax and r8 are the only scratch registers.
-;  *
-;  * r  A single precision integer (source words, addressed through rcx).
-;  * a  Byte array (destination, addressed through rdx).
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_to_bin_movbe_32 PROC
-        movbe	rax, QWORD PTR [rcx+248]  ; highest word of r (offset 248), loaded with bytes reversed
-        movbe	r8, QWORD PTR [rcx+240]
-        mov	QWORD PTR [rdx], rax  ; highest word goes to the first 8 output bytes
-        mov	QWORD PTR [rdx+8], r8
-        movbe	rax, QWORD PTR [rcx+232]  ; same pattern from here: source offset falls by 16, destination rises by 16
-        movbe	r8, QWORD PTR [rcx+224]
-        mov	QWORD PTR [rdx+16], rax
-        mov	QWORD PTR [rdx+24], r8
-        movbe	rax, QWORD PTR [rcx+216]
-        movbe	r8, QWORD PTR [rcx+208]
-        mov	QWORD PTR [rdx+32], rax
-        mov	QWORD PTR [rdx+40], r8
-        movbe	rax, QWORD PTR [rcx+200]
-        movbe	r8, QWORD PTR [rcx+192]
-        mov	QWORD PTR [rdx+48], rax
-        mov	QWORD PTR [rdx+56], r8
-        movbe	rax, QWORD PTR [rcx+184]
-        movbe	r8, QWORD PTR [rcx+176]
-        mov	QWORD PTR [rdx+64], rax
-        mov	QWORD PTR [rdx+72], r8
-        movbe	rax, QWORD PTR [rcx+168]
-        movbe	r8, QWORD PTR [rcx+160]
-        mov	QWORD PTR [rdx+80], rax
-        mov	QWORD PTR [rdx+88], r8
-        movbe	rax, QWORD PTR [rcx+152]
-        movbe	r8, QWORD PTR [rcx+144]
-        mov	QWORD PTR [rdx+96], rax
-        mov	QWORD PTR [rdx+104], r8
-        movbe	rax, QWORD PTR [rcx+136]
-        movbe	r8, QWORD PTR [rcx+128]
-        mov	QWORD PTR [rdx+112], rax
-        mov	QWORD PTR [rdx+120], r8
-        movbe	rax, QWORD PTR [rcx+120]
-        movbe	r8, QWORD PTR [rcx+112]
-        mov	QWORD PTR [rdx+128], rax
-        mov	QWORD PTR [rdx+136], r8
-        movbe	rax, QWORD PTR [rcx+104]
-        movbe	r8, QWORD PTR [rcx+96]
-        mov	QWORD PTR [rdx+144], rax
-        mov	QWORD PTR [rdx+152], r8
-        movbe	rax, QWORD PTR [rcx+88]
-        movbe	r8, QWORD PTR [rcx+80]
-        mov	QWORD PTR [rdx+160], rax
-        mov	QWORD PTR [rdx+168], r8
-        movbe	rax, QWORD PTR [rcx+72]
-        movbe	r8, QWORD PTR [rcx+64]
-        mov	QWORD PTR [rdx+176], rax
-        mov	QWORD PTR [rdx+184], r8
-        movbe	rax, QWORD PTR [rcx+56]
-        movbe	r8, QWORD PTR [rcx+48]
-        mov	QWORD PTR [rdx+192], rax
-        mov	QWORD PTR [rdx+200], r8
-        movbe	rax, QWORD PTR [rcx+40]
-        movbe	r8, QWORD PTR [rcx+32]
-        mov	QWORD PTR [rdx+208], rax
-        mov	QWORD PTR [rdx+216], r8
-        movbe	rax, QWORD PTR [rcx+24]
-        movbe	r8, QWORD PTR [rcx+16]
-        mov	QWORD PTR [rdx+224], rax
-        mov	QWORD PTR [rdx+232], r8
-        movbe	rax, QWORD PTR [rcx+8]
-        movbe	r8, QWORD PTR [rcx]  ; lowest word of r (offset 0)
-        mov	QWORD PTR [rdx+240], rax
-        mov	QWORD PTR [rdx+248], r8  ; lowest word fills the last 8 output bytes
-        ret
-sp_2048_to_bin_movbe_32 ENDP
-_text ENDS
-ENDIF  ; NO_MOVBE_SUPPORT not defined
-; /* Multiply a and b into r. (r = a * b)
-;  *
-;  * r  A single precision integer.
-;  * a  A single precision integer.
-;  * b  A single precision integer.
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_mul_16 PROC
-        push	r12
-        mov	r9, rdx
-        sub	rsp, 128
-        ; A[0] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9]
-        xor	r12, r12
-        mov	QWORD PTR [rsp], rax
-        mov	r11, rdx
-        ; A[0] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[1] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+8]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rsp+8], r11
-        ; A[0] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[1] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+8]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[2] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+16]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rsp+16], r12
-        ; A[0] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[1] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+8]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[2] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+16]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[3] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+24]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rsp+24], r10
-        ; A[0] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[1] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+8]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[2] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+16]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[3] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+24]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[4] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+32]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rsp+32], r11
-        ; A[0] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[1] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+8]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[2] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+16]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[3] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+24]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[4] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+32]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[5] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+40]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rsp+40], r12
-        ; A[0] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[1] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+8]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[2] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+16]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[3] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+24]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[4] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+32]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[5] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+40]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[6] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+48]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rsp+48], r10
-        ; A[0] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[1] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+8]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[2] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+16]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[3] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+24]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[4] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+32]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[5] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+40]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[6] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+48]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[7] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+56]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rsp+56], r11
-        ; A[0] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[1] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+8]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[2] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+16]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[3] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+24]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[4] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+32]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[5] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+40]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[6] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+48]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[7] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+56]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[8] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+64]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rsp+64], r12
-        ; A[0] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[1] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+8]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[2] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+16]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[3] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+24]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[4] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+32]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[5] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+40]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[6] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+48]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[7] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+56]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[8] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+64]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[9] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+72]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rsp+72], r10
-        ; A[0] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[1] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+8]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[2] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+16]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[3] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+24]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[4] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+32]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[5] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+40]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[6] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+48]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[7] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+56]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[8] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+64]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[9] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+72]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[10] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+80]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rsp+80], r11
-        ; A[0] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[1] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+8]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[2] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+16]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[3] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+24]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[4] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+32]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[5] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+40]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[6] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+48]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[7] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+56]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[8] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+64]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[9] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+72]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[10] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+80]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[11] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+88]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rsp+88], r12
-        ; A[0] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[1] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+8]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[2] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+16]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[3] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+24]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[4] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+32]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[5] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+40]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[6] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+48]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[7] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+56]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[8] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+64]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[9] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+72]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[10] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+80]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[11] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+88]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[12] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+96]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rsp+96], r10
-        ; A[0] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[1] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+8]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[2] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+16]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[3] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+24]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[4] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+32]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[5] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+40]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[6] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+48]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[7] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+56]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[8] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+64]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[9] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+72]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[10] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+80]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[11] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+88]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[12] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+96]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[13] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+104]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rsp+104], r11
-        ; A[0] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[1] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+8]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[2] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+16]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[3] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+24]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[4] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+32]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[5] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+40]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[6] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+48]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[7] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+56]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[8] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+64]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[9] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+72]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[10] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+80]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[11] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+88]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[12] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+96]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[13] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+104]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[14] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+112]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rsp+112], r12
-        ; A[0] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[1] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+8]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[2] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+16]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[3] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+24]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[4] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+32]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[5] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+40]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[6] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+48]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[7] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+56]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[8] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+64]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[9] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+72]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[10] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+80]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[11] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+88]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[12] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+96]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[13] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+104]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[14] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+112]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[15] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+120]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rsp+120], r10
-        ; A[1] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+8]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[2] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+16]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[3] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+24]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[4] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+32]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[5] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+40]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[6] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+48]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[7] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+56]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[8] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+64]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[9] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+72]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[10] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+80]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[11] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+88]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[12] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+96]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[13] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+104]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[14] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+112]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[15] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+120]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rcx+128], r11
-        ; A[2] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+16]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[3] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+24]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[4] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+32]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[5] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+40]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[6] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+48]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[7] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+56]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[8] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+64]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[9] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+72]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[10] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+80]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[11] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+88]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[12] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+96]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[13] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+104]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[14] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+112]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[15] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+120]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rcx+136], r12
-        ; A[3] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+24]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[4] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+32]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[5] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+40]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[6] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+48]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[7] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+56]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[8] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+64]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[9] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+72]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[10] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+80]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[11] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+88]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[12] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+96]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[13] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+104]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[14] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+112]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[15] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+120]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rcx+144], r10
-        ; A[4] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+32]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[5] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+40]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[6] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+48]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[7] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+56]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[8] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+64]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[9] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+72]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[10] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+80]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[11] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+88]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[12] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+96]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[13] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+104]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[14] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+112]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[15] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+120]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rcx+152], r11
-        ; A[5] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+40]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[6] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+48]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[7] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+56]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[8] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+64]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[9] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+72]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[10] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+80]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[11] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+88]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[12] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+96]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[13] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+104]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[14] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+112]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[15] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+120]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rcx+160], r12
-        ; A[6] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+48]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[7] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+56]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[8] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+64]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[9] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+72]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[10] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+80]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[11] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+88]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[12] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+96]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[13] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+104]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[14] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+112]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[15] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+120]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rcx+168], r10
-        ; A[7] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+56]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[8] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+64]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[9] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+72]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[10] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+80]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[11] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+88]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[12] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+96]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[13] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+104]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[14] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+112]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[15] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+120]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rcx+176], r11
-        ; A[8] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+64]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[9] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+72]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[10] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+80]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[11] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+88]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[12] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+96]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[13] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+104]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[14] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+112]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[15] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+120]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rcx+184], r12
-        ; A[9] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+72]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[10] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+80]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[11] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+88]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[12] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+96]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[13] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+104]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[14] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+112]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[15] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+120]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rcx+192], r10
-        ; A[10] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+80]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[11] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+88]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[12] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+96]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[13] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+104]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[14] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+112]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[15] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+120]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rcx+200], r11
-        ; A[11] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+88]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[12] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+96]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[13] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+104]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[14] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+112]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[15] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+120]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rcx+208], r12
-        ; A[12] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+96]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[13] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+104]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[14] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+112]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[15] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+120]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rcx+216], r10
-        ; A[13] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+104]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[14] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+112]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[15] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+120]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rcx+224], r11
-        ; A[14] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+112]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[15] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+120]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rcx+232], r12
-        ; A[15] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+120]
-        add	r10, rax
-        adc	r11, rdx
-        mov	QWORD PTR [rcx+240], r10
-        mov	QWORD PTR [rcx+248], r11
-        mov	rax, QWORD PTR [rsp]
-        mov	rdx, QWORD PTR [rsp+8]
-        mov	r10, QWORD PTR [rsp+16]
-        mov	r11, QWORD PTR [rsp+24]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], rdx
-        mov	QWORD PTR [rcx+16], r10
-        mov	QWORD PTR [rcx+24], r11
-        mov	rax, QWORD PTR [rsp+32]
-        mov	rdx, QWORD PTR [rsp+40]
-        mov	r10, QWORD PTR [rsp+48]
-        mov	r11, QWORD PTR [rsp+56]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], rdx
-        mov	QWORD PTR [rcx+48], r10
-        mov	QWORD PTR [rcx+56], r11
-        mov	rax, QWORD PTR [rsp+64]
-        mov	rdx, QWORD PTR [rsp+72]
-        mov	r10, QWORD PTR [rsp+80]
-        mov	r11, QWORD PTR [rsp+88]
-        mov	QWORD PTR [rcx+64], rax
-        mov	QWORD PTR [rcx+72], rdx
-        mov	QWORD PTR [rcx+80], r10
-        mov	QWORD PTR [rcx+88], r11
-        mov	rax, QWORD PTR [rsp+96]
-        mov	rdx, QWORD PTR [rsp+104]
-        mov	r10, QWORD PTR [rsp+112]
-        mov	r11, QWORD PTR [rsp+120]
-        mov	QWORD PTR [rcx+96], rax
-        mov	QWORD PTR [rcx+104], rdx
-        mov	QWORD PTR [rcx+112], r10
-        mov	QWORD PTR [rcx+120], r11
-        add	rsp, 128
-        pop	r12
-        ret
-sp_2048_mul_16 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
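-; /* Multiply a and b into r using MULX with ADCX/ADOX carry chains. (r = a * b)
-;  *
-;  * r   Result of multiplication. Passed in rcx.
-;  * a   First number to multiply. Passed in rdx.
-;  * b   Second number to multiply. Passed in r8.
-;  *
-;  * If r is the same pointer as a or b, the low 128 bytes of the product are
-;  * accumulated in a 128-byte stack buffer instead of directly in r.
-;  */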
-_text SEGMENT READONLY PARA
-sp_2048_mul_avx2_16 PROC
-        push	rbx
-        push	rbp
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        mov	rbp, r8
-        mov	r8, rcx
-        mov	r9, rdx
-        sub	rsp, 128
-        cmp	r9, r8
-        mov	rbx, rsp
-        cmovne	rbx, r8
-        cmp	rbp, r8
-        cmove	rbx, rsp
-        add	r8, 128
-        xor	rdi, rdi
-        mov	rdx, QWORD PTR [r9]
-        ; A[0] * B[0]
-        mulx	r11, r10, QWORD PTR [rbp]
-        ; A[0] * B[1]
-        mulx	r12, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx], r10
-        adcx	r11, rax
-        ; A[0] * B[2]
-        mulx	r13, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+8], r11
-        adcx	r12, rax
-        ; A[0] * B[3]
-        mulx	r14, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+16], r12
-        adcx	r13, rax
-        mov	QWORD PTR [rbx+24], r13
-        ; A[0] * B[4]
-        mulx	r10, rax, QWORD PTR [rbp+32]
-        adcx	r14, rax
-        ; A[0] * B[5]
-        mulx	r11, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+32], r14
-        adcx	r10, rax
-        ; A[0] * B[6]
-        mulx	r12, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [rbx+40], r10
-        adcx	r11, rax
-        ; A[0] * B[7]
-        mulx	r13, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+48], r11
-        adcx	r12, rax
-        mov	QWORD PTR [rbx+56], r12
-        ; A[0] * B[8]
-        mulx	r14, rax, QWORD PTR [rbp+64]
-        adcx	r13, rax
-        ; A[0] * B[9]
-        mulx	r10, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [rbx+64], r13
-        adcx	r14, rax
-        ; A[0] * B[10]
-        mulx	r11, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [rbx+72], r14
-        adcx	r10, rax
-        ; A[0] * B[11]
-        mulx	r12, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [rbx+80], r10
-        adcx	r11, rax
-        mov	QWORD PTR [rbx+88], r11
-        ; A[0] * B[12]
-        mulx	r13, rax, QWORD PTR [rbp+96]
-        adcx	r12, rax
-        ; A[0] * B[13]
-        mulx	r14, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [rbx+96], r12
-        adcx	r13, rax
-        ; A[0] * B[14]
-        mulx	r10, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [rbx+104], r13
-        adcx	r14, rax
-        ; A[0] * B[15]
-        mulx	r11, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [rbx+112], r14
-        adcx	r10, rax
-        adcx	r11, rdi
-        mov	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [rbx+120], r10
-        mov	QWORD PTR [r8], r11
-        mov	rdx, QWORD PTR [r9+8]
-        mov	r11, QWORD PTR [rbx+8]
-        mov	r12, QWORD PTR [rbx+16]
-        mov	r13, QWORD PTR [rbx+24]
-        mov	r14, QWORD PTR [rbx+32]
-        mov	r10, QWORD PTR [rbx+40]
-        ; A[1] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[1] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+8], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[1] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+16], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[1] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+24], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+32], r14
-        mov	r11, QWORD PTR [rbx+48]
-        mov	r12, QWORD PTR [rbx+56]
-        mov	r13, QWORD PTR [rbx+64]
-        mov	r14, QWORD PTR [rbx+72]
-        ; A[1] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[1] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+40], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[1] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [rbx+48], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[1] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+56], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        mov	QWORD PTR [rbx+64], r13
-        mov	r10, QWORD PTR [rbx+80]
-        mov	r11, QWORD PTR [rbx+88]
-        mov	r12, QWORD PTR [rbx+96]
-        mov	r13, QWORD PTR [rbx+104]
-        ; A[1] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[1] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [rbx+72], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[1] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [rbx+80], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[1] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [rbx+88], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [rbx+96], r12
-        mov	r14, QWORD PTR [rbx+112]
-        mov	r10, QWORD PTR [rbx+120]
-        mov	r11, QWORD PTR [r8]
-        ; A[1] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[1] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [rbx+104], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[1] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [rbx+112], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[1] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [rbx+120], r10
-        mov	r12, rdi
-        adcx	r11, rax
-        adox	r12, rcx
-        adcx	r12, r15
-        mov	r15, rdi
-        adox	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [r8], r11
-        mov	QWORD PTR [r8+8], r12
-        mov	rdx, QWORD PTR [r9+16]
-        mov	r12, QWORD PTR [rbx+16]
-        mov	r13, QWORD PTR [rbx+24]
-        mov	r14, QWORD PTR [rbx+32]
-        mov	r10, QWORD PTR [rbx+40]
-        mov	r11, QWORD PTR [rbx+48]
-        ; A[2] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[2] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+16], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[2] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+24], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[2] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+32], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbx+40], r10
-        mov	r12, QWORD PTR [rbx+56]
-        mov	r13, QWORD PTR [rbx+64]
-        mov	r14, QWORD PTR [rbx+72]
-        mov	r10, QWORD PTR [rbx+80]
-        ; A[2] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[2] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+48], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[2] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [rbx+56], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[2] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+64], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+72], r14
-        mov	r11, QWORD PTR [rbx+88]
-        mov	r12, QWORD PTR [rbx+96]
-        mov	r13, QWORD PTR [rbx+104]
-        mov	r14, QWORD PTR [rbx+112]
-        ; A[2] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[2] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [rbx+80], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[2] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [rbx+88], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[2] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [rbx+96], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        mov	QWORD PTR [rbx+104], r13
-        mov	r10, QWORD PTR [rbx+120]
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        ; A[2] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[2] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [rbx+112], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[2] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [rbx+120], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[2] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [r8], r11
-        mov	r13, rdi
-        adcx	r12, rax
-        adox	r13, rcx
-        adcx	r13, r15
-        mov	r15, rdi
-        adox	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [r8+8], r12
-        mov	QWORD PTR [r8+16], r13
-        mov	rdx, QWORD PTR [r9+24]
-        mov	r13, QWORD PTR [rbx+24]
-        mov	r14, QWORD PTR [rbx+32]
-        mov	r10, QWORD PTR [rbx+40]
-        mov	r11, QWORD PTR [rbx+48]
-        mov	r12, QWORD PTR [rbx+56]
-        ; A[3] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[3] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+24], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[3] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+32], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[3] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+40], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbx+48], r11
-        mov	r13, QWORD PTR [rbx+64]
-        mov	r14, QWORD PTR [rbx+72]
-        mov	r10, QWORD PTR [rbx+80]
-        mov	r11, QWORD PTR [rbx+88]
-        ; A[3] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[3] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+56], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[3] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [rbx+64], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[3] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+72], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbx+80], r10
-        mov	r12, QWORD PTR [rbx+96]
-        mov	r13, QWORD PTR [rbx+104]
-        mov	r14, QWORD PTR [rbx+112]
-        mov	r10, QWORD PTR [rbx+120]
-        ; A[3] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[3] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [rbx+88], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[3] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [rbx+96], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[3] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [rbx+104], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+112], r14
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        mov	r13, QWORD PTR [r8+16]
-        ; A[3] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[3] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [rbx+120], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[3] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [r8], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[3] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [r8+8], r12
-        mov	r14, rdi
-        adcx	r13, rax
-        adox	r14, rcx
-        adcx	r14, r15
-        mov	r15, rdi
-        adox	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [r8+16], r13
-        mov	QWORD PTR [r8+24], r14
-        mov	rdx, QWORD PTR [r9+32]
-        mov	r14, QWORD PTR [rbx+32]
-        mov	r10, QWORD PTR [rbx+40]
-        mov	r11, QWORD PTR [rbx+48]
-        mov	r12, QWORD PTR [rbx+56]
-        mov	r13, QWORD PTR [rbx+64]
-        ; A[4] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[4] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+32], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[4] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+40], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[4] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+48], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [rbx+56], r12
-        mov	r14, QWORD PTR [rbx+72]
-        mov	r10, QWORD PTR [rbx+80]
-        mov	r11, QWORD PTR [rbx+88]
-        mov	r12, QWORD PTR [rbx+96]
-        ; A[4] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[4] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+64], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[4] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [rbx+72], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[4] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+80], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbx+88], r11
-        mov	r13, QWORD PTR [rbx+104]
-        mov	r14, QWORD PTR [rbx+112]
-        mov	r10, QWORD PTR [rbx+120]
-        mov	r11, QWORD PTR [r8]
-        ; A[4] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[4] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [rbx+96], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[4] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [rbx+104], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[4] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [rbx+112], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbx+120], r10
-        mov	r12, QWORD PTR [r8+8]
-        mov	r13, QWORD PTR [r8+16]
-        mov	r14, QWORD PTR [r8+24]
-        ; A[4] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[4] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [r8], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[4] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [r8+8], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[4] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [r8+16], r13
-        mov	r10, rdi
-        adcx	r14, rax
-        adox	r10, rcx
-        adcx	r10, r15
-        mov	r15, rdi
-        adox	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [r8+24], r14
-        mov	QWORD PTR [r8+32], r10
-        mov	rdx, QWORD PTR [r9+40]
-        mov	r10, QWORD PTR [rbx+40]
-        mov	r11, QWORD PTR [rbx+48]
-        mov	r12, QWORD PTR [rbx+56]
-        mov	r13, QWORD PTR [rbx+64]
-        mov	r14, QWORD PTR [rbx+72]
-        ; A[5] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[5] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+40], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[5] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+48], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[5] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+56], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        mov	QWORD PTR [rbx+64], r13
-        mov	r10, QWORD PTR [rbx+80]
-        mov	r11, QWORD PTR [rbx+88]
-        mov	r12, QWORD PTR [rbx+96]
-        mov	r13, QWORD PTR [rbx+104]
-        ; A[5] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[5] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+72], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[5] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [rbx+80], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[5] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+88], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [rbx+96], r12
-        mov	r14, QWORD PTR [rbx+112]
-        mov	r10, QWORD PTR [rbx+120]
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        ; A[5] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[5] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [rbx+104], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[5] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [rbx+112], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[5] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [rbx+120], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r8], r11
-        mov	r13, QWORD PTR [r8+16]
-        mov	r14, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [r8+32]
-        ; A[5] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[5] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [r8+8], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[5] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [r8+16], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[5] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [r8+24], r14
-        mov	r11, rdi
-        adcx	r10, rax
-        adox	r11, rcx
-        adcx	r11, r15
-        mov	r15, rdi
-        adox	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [r8+32], r10
-        mov	QWORD PTR [r8+40], r11
-        mov	rdx, QWORD PTR [r9+48]
-        mov	r11, QWORD PTR [rbx+48]
-        mov	r12, QWORD PTR [rbx+56]
-        mov	r13, QWORD PTR [rbx+64]
-        mov	r14, QWORD PTR [rbx+72]
-        mov	r10, QWORD PTR [rbx+80]
-        ; A[6] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[6] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+48], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[6] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+56], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[6] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+64], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+72], r14
-        mov	r11, QWORD PTR [rbx+88]
-        mov	r12, QWORD PTR [rbx+96]
-        mov	r13, QWORD PTR [rbx+104]
-        mov	r14, QWORD PTR [rbx+112]
-        ; A[6] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[6] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+80], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[6] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [rbx+88], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[6] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+96], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        mov	QWORD PTR [rbx+104], r13
-        mov	r10, QWORD PTR [rbx+120]
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        mov	r13, QWORD PTR [r8+16]
-        ; A[6] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[6] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [rbx+112], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[6] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [rbx+120], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[6] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r8+8], r12
-        mov	r14, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        ; A[6] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[6] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [r8+16], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[6] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [r8+24], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[6] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [r8+32], r10
-        mov	r12, rdi
-        adcx	r11, rax
-        adox	r12, rcx
-        adcx	r12, r15
-        mov	r15, rdi
-        adox	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [r8+40], r11
-        mov	QWORD PTR [r8+48], r12
-        mov	rdx, QWORD PTR [r9+56]
-        mov	r12, QWORD PTR [rbx+56]
-        mov	r13, QWORD PTR [rbx+64]
-        mov	r14, QWORD PTR [rbx+72]
-        mov	r10, QWORD PTR [rbx+80]
-        mov	r11, QWORD PTR [rbx+88]
-        ; A[7] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[7] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+56], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[7] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+64], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[7] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+72], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbx+80], r10
-        mov	r12, QWORD PTR [rbx+96]
-        mov	r13, QWORD PTR [rbx+104]
-        mov	r14, QWORD PTR [rbx+112]
-        mov	r10, QWORD PTR [rbx+120]
-        ; A[7] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[7] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+88], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[7] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [rbx+96], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[7] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+104], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+112], r14
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        mov	r13, QWORD PTR [r8+16]
-        mov	r14, QWORD PTR [r8+24]
-        ; A[7] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[7] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [rbx+120], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[7] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [r8], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[7] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8+8], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        mov	QWORD PTR [r8+16], r13
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        mov	r12, QWORD PTR [r8+48]
-        ; A[7] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[7] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [r8+24], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[7] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [r8+32], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[7] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [r8+40], r11
-        mov	r13, rdi
-        adcx	r12, rax
-        adox	r13, rcx
-        adcx	r13, r15
-        mov	r15, rdi
-        adox	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [r8+48], r12
-        mov	QWORD PTR [r8+56], r13
-        mov	rdx, QWORD PTR [r9+64]
-        mov	r13, QWORD PTR [rbx+64]
-        mov	r14, QWORD PTR [rbx+72]
-        mov	r10, QWORD PTR [rbx+80]
-        mov	r11, QWORD PTR [rbx+88]
-        mov	r12, QWORD PTR [rbx+96]
-        ; A[8] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[8] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+64], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[8] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+72], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[8] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+80], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbx+88], r11
-        mov	r13, QWORD PTR [rbx+104]
-        mov	r14, QWORD PTR [rbx+112]
-        mov	r10, QWORD PTR [rbx+120]
-        mov	r11, QWORD PTR [r8]
-        ; A[8] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[8] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+96], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[8] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [rbx+104], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[8] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+112], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbx+120], r10
-        mov	r12, QWORD PTR [r8+8]
-        mov	r13, QWORD PTR [r8+16]
-        mov	r14, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [r8+32]
-        ; A[8] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[8] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [r8], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[8] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [r8+8], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[8] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8+16], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+24], r14
-        mov	r11, QWORD PTR [r8+40]
-        mov	r12, QWORD PTR [r8+48]
-        mov	r13, QWORD PTR [r8+56]
-        ; A[8] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[8] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [r8+32], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[8] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [r8+40], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[8] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [r8+48], r12
-        mov	r14, rdi
-        adcx	r13, rax
-        adox	r14, rcx
-        adcx	r14, r15
-        mov	r15, rdi
-        adox	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [r8+56], r13
-        mov	QWORD PTR [r8+64], r14
-        mov	rdx, QWORD PTR [r9+72]
-        mov	r14, QWORD PTR [rbx+72]
-        mov	r10, QWORD PTR [rbx+80]
-        mov	r11, QWORD PTR [rbx+88]
-        mov	r12, QWORD PTR [rbx+96]
-        mov	r13, QWORD PTR [rbx+104]
-        ; A[9] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[9] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+72], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[9] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+80], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[9] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+88], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [rbx+96], r12
-        mov	r14, QWORD PTR [rbx+112]
-        mov	r10, QWORD PTR [rbx+120]
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        ; A[9] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[9] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+104], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[9] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [rbx+112], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[9] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+120], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r8], r11
-        mov	r13, QWORD PTR [r8+16]
-        mov	r14, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        ; A[9] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[9] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [r8+8], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[9] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [r8+16], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[9] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8+24], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+32], r10
-        mov	r12, QWORD PTR [r8+48]
-        mov	r13, QWORD PTR [r8+56]
-        mov	r14, QWORD PTR [r8+64]
-        ; A[9] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[9] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [r8+40], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[9] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [r8+48], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[9] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [r8+56], r13
-        mov	r10, rdi
-        adcx	r14, rax
-        adox	r10, rcx
-        adcx	r10, r15
-        mov	r15, rdi
-        adox	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [r8+64], r14
-        mov	QWORD PTR [r8+72], r10
-        mov	rdx, QWORD PTR [r9+80]
-        mov	r10, QWORD PTR [rbx+80]
-        mov	r11, QWORD PTR [rbx+88]
-        mov	r12, QWORD PTR [rbx+96]
-        mov	r13, QWORD PTR [rbx+104]
-        mov	r14, QWORD PTR [rbx+112]
-        ; A[10] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[10] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+80], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[10] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+88], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[10] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+96], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        mov	QWORD PTR [rbx+104], r13
-        mov	r10, QWORD PTR [rbx+120]
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        mov	r13, QWORD PTR [r8+16]
-        ; A[10] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[10] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+112], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[10] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [rbx+120], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[10] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [r8], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r8+8], r12
-        mov	r14, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        mov	r12, QWORD PTR [r8+48]
-        ; A[10] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[10] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [r8+16], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[10] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [r8+24], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[10] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8+32], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r8+40], r11
-        mov	r13, QWORD PTR [r8+56]
-        mov	r14, QWORD PTR [r8+64]
-        mov	r10, QWORD PTR [r8+72]
-        ; A[10] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[10] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [r8+48], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[10] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [r8+56], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[10] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [r8+64], r14
-        mov	r11, rdi
-        adcx	r10, rax
-        adox	r11, rcx
-        adcx	r11, r15
-        mov	r15, rdi
-        adox	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [r8+72], r10
-        mov	QWORD PTR [r8+80], r11
-        mov	rdx, QWORD PTR [r9+88]
-        mov	r11, QWORD PTR [rbx+88]
-        mov	r12, QWORD PTR [rbx+96]
-        mov	r13, QWORD PTR [rbx+104]
-        mov	r14, QWORD PTR [rbx+112]
-        mov	r10, QWORD PTR [rbx+120]
-        ; A[11] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[11] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+88], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[11] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+96], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[11] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+104], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+112], r14
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        mov	r13, QWORD PTR [r8+16]
-        mov	r14, QWORD PTR [r8+24]
-        ; A[11] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[11] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+120], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[11] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [r8], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[11] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [r8+8], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        mov	QWORD PTR [r8+16], r13
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        mov	r12, QWORD PTR [r8+48]
-        mov	r13, QWORD PTR [r8+56]
-        ; A[11] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[11] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [r8+24], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[11] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [r8+32], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[11] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8+40], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r8+48], r12
-        mov	r14, QWORD PTR [r8+64]
-        mov	r10, QWORD PTR [r8+72]
-        mov	r11, QWORD PTR [r8+80]
-        ; A[11] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[11] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [r8+56], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[11] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [r8+64], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[11] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [r8+72], r10
-        mov	r12, rdi
-        adcx	r11, rax
-        adox	r12, rcx
-        adcx	r12, r15
-        mov	r15, rdi
-        adox	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [r8+80], r11
-        mov	QWORD PTR [r8+88], r12
-        mov	rdx, QWORD PTR [r9+96]
-        mov	r12, QWORD PTR [rbx+96]
-        mov	r13, QWORD PTR [rbx+104]
-        mov	r14, QWORD PTR [rbx+112]
-        mov	r10, QWORD PTR [rbx+120]
-        mov	r11, QWORD PTR [r8]
-        ; A[12] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[12] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+96], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[12] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+104], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[12] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+112], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbx+120], r10
-        mov	r12, QWORD PTR [r8+8]
-        mov	r13, QWORD PTR [r8+16]
-        mov	r14, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [r8+32]
-        ; A[12] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[12] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [r8], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[12] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [r8+8], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[12] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [r8+16], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+24], r14
-        mov	r11, QWORD PTR [r8+40]
-        mov	r12, QWORD PTR [r8+48]
-        mov	r13, QWORD PTR [r8+56]
-        mov	r14, QWORD PTR [r8+64]
-        ; A[12] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[12] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [r8+32], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[12] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [r8+40], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[12] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8+48], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        mov	QWORD PTR [r8+56], r13
-        mov	r10, QWORD PTR [r8+72]
-        mov	r11, QWORD PTR [r8+80]
-        mov	r12, QWORD PTR [r8+88]
-        ; A[12] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[12] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [r8+64], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[12] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [r8+72], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[12] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [r8+80], r11
-        mov	r13, rdi
-        adcx	r12, rax
-        adox	r13, rcx
-        adcx	r13, r15
-        mov	r15, rdi
-        adox	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [r8+88], r12
-        mov	QWORD PTR [r8+96], r13
-        mov	rdx, QWORD PTR [r9+104]
-        mov	r13, QWORD PTR [rbx+104]
-        mov	r14, QWORD PTR [rbx+112]
-        mov	r10, QWORD PTR [rbx+120]
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        ; A[13] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[13] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+104], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[13] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+112], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[13] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+120], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r8], r11
-        mov	r13, QWORD PTR [r8+16]
-        mov	r14, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        ; A[13] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[13] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [r8+8], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[13] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [r8+16], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[13] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [r8+24], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+32], r10
-        mov	r12, QWORD PTR [r8+48]
-        mov	r13, QWORD PTR [r8+56]
-        mov	r14, QWORD PTR [r8+64]
-        mov	r10, QWORD PTR [r8+72]
-        ; A[13] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[13] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [r8+40], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[13] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [r8+48], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[13] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8+56], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+64], r14
-        mov	r11, QWORD PTR [r8+80]
-        mov	r12, QWORD PTR [r8+88]
-        mov	r13, QWORD PTR [r8+96]
-        ; A[13] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[13] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [r8+72], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[13] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [r8+80], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[13] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [r8+88], r12
-        mov	r14, rdi
-        adcx	r13, rax
-        adox	r14, rcx
-        adcx	r14, r15
-        mov	r15, rdi
-        adox	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [r8+96], r13
-        mov	QWORD PTR [r8+104], r14
-        mov	rdx, QWORD PTR [r9+112]
-        mov	r14, QWORD PTR [rbx+112]
-        mov	r10, QWORD PTR [rbx+120]
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        mov	r13, QWORD PTR [r8+16]
-        ; A[14] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[14] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+112], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[14] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+120], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[14] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [r8], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r8+8], r12
-        mov	r14, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        mov	r12, QWORD PTR [r8+48]
-        ; A[14] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[14] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [r8+16], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[14] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [r8+24], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[14] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [r8+32], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r8+40], r11
-        mov	r13, QWORD PTR [r8+56]
-        mov	r14, QWORD PTR [r8+64]
-        mov	r10, QWORD PTR [r8+72]
-        mov	r11, QWORD PTR [r8+80]
-        ; A[14] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[14] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [r8+48], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[14] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [r8+56], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[14] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8+64], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+72], r10
-        mov	r12, QWORD PTR [r8+88]
-        mov	r13, QWORD PTR [r8+96]
-        mov	r14, QWORD PTR [r8+104]
-        ; A[14] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[14] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [r8+80], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[14] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [r8+88], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[14] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [r8+96], r13
-        mov	r10, rdi
-        adcx	r14, rax
-        adox	r10, rcx
-        adcx	r10, r15
-        mov	r15, rdi
-        adox	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [r8+104], r14
-        mov	QWORD PTR [r8+112], r10
-        mov	rdx, QWORD PTR [r9+120]
-        mov	r10, QWORD PTR [rbx+120]
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        mov	r13, QWORD PTR [r8+16]
-        mov	r14, QWORD PTR [r8+24]
-        ; A[15] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[15] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+120], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[15] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [r8], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[15] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [r8+8], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        mov	QWORD PTR [r8+16], r13
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        mov	r12, QWORD PTR [r8+48]
-        mov	r13, QWORD PTR [r8+56]
-        ; A[15] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[15] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [r8+24], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[15] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [r8+32], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[15] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [r8+40], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r8+48], r12
-        mov	r14, QWORD PTR [r8+64]
-        mov	r10, QWORD PTR [r8+72]
-        mov	r11, QWORD PTR [r8+80]
-        mov	r12, QWORD PTR [r8+88]
-        ; A[15] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[15] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [r8+56], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[15] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [r8+64], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[15] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8+72], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r8+80], r11
-        mov	r13, QWORD PTR [r8+96]
-        mov	r14, QWORD PTR [r8+104]
-        mov	r10, QWORD PTR [r8+112]
-        ; A[15] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[15] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [r8+88], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[15] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [r8+96], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[15] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [r8+104], r14
-        mov	r11, rdi
-        adcx	r10, rax
-        adox	r11, rcx
-        adcx	r11, r15
-        mov	QWORD PTR [r8+112], r10
-        mov	QWORD PTR [r8+120], r11
-        sub	r8, 128
-        cmp	r9, r8
-        je	L_start_2048_mul_avx2_16
-        cmp	rbp, r8
-        jne	L_end_2048_mul_avx2_16
-L_start_2048_mul_avx2_16:
-        vmovdqu	xmm0, OWORD PTR [rbx]
-        vmovups	OWORD PTR [r8], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbx+16]
-        vmovups	OWORD PTR [r8+16], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbx+32]
-        vmovups	OWORD PTR [r8+32], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbx+48]
-        vmovups	OWORD PTR [r8+48], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbx+64]
-        vmovups	OWORD PTR [r8+64], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbx+80]
-        vmovups	OWORD PTR [r8+80], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbx+96]
-        vmovups	OWORD PTR [r8+96], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbx+112]
-        vmovups	OWORD PTR [r8+112], xmm0
-L_end_2048_mul_avx2_16:
-        add	rsp, 128
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        pop	rbp
-        pop	rbx
-        ret
-sp_2048_mul_avx2_16 ENDP
-_text ENDS
-ENDIF
-; /* Add b to a into r (r = a + b), 16 quadwords each; rax returns the carry out (0 or 1).
-;  * Only rax, r9, r10 and the flags are written; the stack is not touched.
-;  * r  (rcx) Result: 16 quadwords are stored at offsets 0..120.
-;  * a  (rdx) First operand: 16 quadwords are loaded from offsets 0..120.
-;  * b  (r8)  Second operand: 16 quadwords are used as memory operands of add/adc.
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_add_16 PROC
-        ; Add: one add, then an unbroken adc chain; r9 and r10 alternate as the working register
-        mov	r9, QWORD PTR [rdx]             ; r9 = a[0]
-        xor	rax, rax                        ; clear the return value; the flags set here are replaced by the add below
-        add	r9, QWORD PTR [r8]              ; a[0] + b[0], starts the carry chain
-        mov	r10, QWORD PTR [rdx+8]          ; load a[1] before r[0] is stored
-        mov	QWORD PTR [rcx], r9             ; store r[0]; mov does not modify CF
-        adc	r10, QWORD PTR [r8+8]           ; a[1] + b[1] + carry
-        mov	r9, QWORD PTR [rdx+16]          ; same load / store / adc pattern for words 2..15
-        mov	QWORD PTR [rcx+8], r10
-        adc	r9, QWORD PTR [r8+16]
-        mov	r10, QWORD PTR [rdx+24]
-        mov	QWORD PTR [rcx+16], r9
-        adc	r10, QWORD PTR [r8+24]
-        mov	r9, QWORD PTR [rdx+32]
-        mov	QWORD PTR [rcx+24], r10
-        adc	r9, QWORD PTR [r8+32]
-        mov	r10, QWORD PTR [rdx+40]
-        mov	QWORD PTR [rcx+32], r9
-        adc	r10, QWORD PTR [r8+40]
-        mov	r9, QWORD PTR [rdx+48]
-        mov	QWORD PTR [rcx+40], r10
-        adc	r9, QWORD PTR [r8+48]
-        mov	r10, QWORD PTR [rdx+56]
-        mov	QWORD PTR [rcx+48], r9
-        adc	r10, QWORD PTR [r8+56]
-        mov	r9, QWORD PTR [rdx+64]
-        mov	QWORD PTR [rcx+56], r10
-        adc	r9, QWORD PTR [r8+64]
-        mov	r10, QWORD PTR [rdx+72]
-        mov	QWORD PTR [rcx+64], r9
-        adc	r10, QWORD PTR [r8+72]
-        mov	r9, QWORD PTR [rdx+80]
-        mov	QWORD PTR [rcx+72], r10
-        adc	r9, QWORD PTR [r8+80]
-        mov	r10, QWORD PTR [rdx+88]
-        mov	QWORD PTR [rcx+80], r9
-        adc	r10, QWORD PTR [r8+88]
-        mov	r9, QWORD PTR [rdx+96]
-        mov	QWORD PTR [rcx+88], r10
-        adc	r9, QWORD PTR [r8+96]
-        mov	r10, QWORD PTR [rdx+104]
-        mov	QWORD PTR [rcx+96], r9
-        adc	r10, QWORD PTR [r8+104]
-        mov	r9, QWORD PTR [rdx+112]
-        mov	QWORD PTR [rcx+104], r10
-        adc	r9, QWORD PTR [r8+112]
-        mov	r10, QWORD PTR [rdx+120]
-        mov	QWORD PTR [rcx+112], r9
-        adc	r10, QWORD PTR [r8+120]         ; top word: a[15] + b[15] + carry
-        mov	QWORD PTR [rcx+120], r10        ; store r[15]
-        adc	rax, 0                          ; rax = 0 + carry out of the top word
-        ret
-sp_2048_add_16 ENDP
-_text ENDS
-; /* Sub b from a into a (a -= b), 32 quadwords each; rax returns 0 (no borrow) or all ones (borrow out).
-;  * Only rax, r8, r9 and the flags are written; the stack is not touched.
-;  * a  (rcx) Minuend and result: 32 quadwords at offsets 0..248 are read and overwritten.
-;  * b  (rdx) Subtrahend: 32 quadwords are used as memory operands of sub/sbb.
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_sub_in_place_32 PROC
-        mov	r8, QWORD PTR [rcx]             ; r8 = a[0]
-        sub	r8, QWORD PTR [rdx]             ; a[0] - b[0], starts the borrow chain
-        mov	r9, QWORD PTR [rcx+8]           ; load a[1] before a[0] is overwritten
-        mov	QWORD PTR [rcx], r8             ; store a[0]; mov does not modify CF
-        sbb	r9, QWORD PTR [rdx+8]           ; a[1] - b[1] - borrow
-        mov	r8, QWORD PTR [rcx+16]          ; same load / store / sbb pattern for words 2..31
-        mov	QWORD PTR [rcx+8], r9
-        sbb	r8, QWORD PTR [rdx+16]
-        mov	r9, QWORD PTR [rcx+24]
-        mov	QWORD PTR [rcx+16], r8
-        sbb	r9, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [rcx+32]
-        mov	QWORD PTR [rcx+24], r9
-        sbb	r8, QWORD PTR [rdx+32]
-        mov	r9, QWORD PTR [rcx+40]
-        mov	QWORD PTR [rcx+32], r8
-        sbb	r9, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [rcx+48]
-        mov	QWORD PTR [rcx+40], r9
-        sbb	r8, QWORD PTR [rdx+48]
-        mov	r9, QWORD PTR [rcx+56]
-        mov	QWORD PTR [rcx+48], r8
-        sbb	r9, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [rcx+64]
-        mov	QWORD PTR [rcx+56], r9
-        sbb	r8, QWORD PTR [rdx+64]
-        mov	r9, QWORD PTR [rcx+72]
-        mov	QWORD PTR [rcx+64], r8
-        sbb	r9, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [rcx+80]
-        mov	QWORD PTR [rcx+72], r9
-        sbb	r8, QWORD PTR [rdx+80]
-        mov	r9, QWORD PTR [rcx+88]
-        mov	QWORD PTR [rcx+80], r8
-        sbb	r9, QWORD PTR [rdx+88]
-        mov	r8, QWORD PTR [rcx+96]
-        mov	QWORD PTR [rcx+88], r9
-        sbb	r8, QWORD PTR [rdx+96]
-        mov	r9, QWORD PTR [rcx+104]
-        mov	QWORD PTR [rcx+96], r8
-        sbb	r9, QWORD PTR [rdx+104]
-        mov	r8, QWORD PTR [rcx+112]
-        mov	QWORD PTR [rcx+104], r9
-        sbb	r8, QWORD PTR [rdx+112]
-        mov	r9, QWORD PTR [rcx+120]
-        mov	QWORD PTR [rcx+112], r8
-        sbb	r9, QWORD PTR [rdx+120]
-        mov	r8, QWORD PTR [rcx+128]
-        mov	QWORD PTR [rcx+120], r9
-        sbb	r8, QWORD PTR [rdx+128]
-        mov	r9, QWORD PTR [rcx+136]
-        mov	QWORD PTR [rcx+128], r8
-        sbb	r9, QWORD PTR [rdx+136]
-        mov	r8, QWORD PTR [rcx+144]
-        mov	QWORD PTR [rcx+136], r9
-        sbb	r8, QWORD PTR [rdx+144]
-        mov	r9, QWORD PTR [rcx+152]
-        mov	QWORD PTR [rcx+144], r8
-        sbb	r9, QWORD PTR [rdx+152]
-        mov	r8, QWORD PTR [rcx+160]
-        mov	QWORD PTR [rcx+152], r9
-        sbb	r8, QWORD PTR [rdx+160]
-        mov	r9, QWORD PTR [rcx+168]
-        mov	QWORD PTR [rcx+160], r8
-        sbb	r9, QWORD PTR [rdx+168]
-        mov	r8, QWORD PTR [rcx+176]
-        mov	QWORD PTR [rcx+168], r9
-        sbb	r8, QWORD PTR [rdx+176]
-        mov	r9, QWORD PTR [rcx+184]
-        mov	QWORD PTR [rcx+176], r8
-        sbb	r9, QWORD PTR [rdx+184]
-        mov	r8, QWORD PTR [rcx+192]
-        mov	QWORD PTR [rcx+184], r9
-        sbb	r8, QWORD PTR [rdx+192]
-        mov	r9, QWORD PTR [rcx+200]
-        mov	QWORD PTR [rcx+192], r8
-        sbb	r9, QWORD PTR [rdx+200]
-        mov	r8, QWORD PTR [rcx+208]
-        mov	QWORD PTR [rcx+200], r9
-        sbb	r8, QWORD PTR [rdx+208]
-        mov	r9, QWORD PTR [rcx+216]
-        mov	QWORD PTR [rcx+208], r8
-        sbb	r9, QWORD PTR [rdx+216]
-        mov	r8, QWORD PTR [rcx+224]
-        mov	QWORD PTR [rcx+216], r9
-        sbb	r8, QWORD PTR [rdx+224]
-        mov	r9, QWORD PTR [rcx+232]
-        mov	QWORD PTR [rcx+224], r8
-        sbb	r9, QWORD PTR [rdx+232]
-        mov	r8, QWORD PTR [rcx+240]
-        mov	QWORD PTR [rcx+232], r9
-        sbb	r8, QWORD PTR [rdx+240]
-        mov	r9, QWORD PTR [rcx+248]
-        mov	QWORD PTR [rcx+240], r8
-        sbb	r9, QWORD PTR [rdx+248]         ; top word: a[31] - b[31] - borrow
-        mov	QWORD PTR [rcx+248], r9         ; store a[31]
-        sbb	rax, rax                        ; rax = rax - rax - CF: 0 if no borrow, all ones if the top word borrowed
-        ret
-sp_2048_sub_in_place_32 ENDP
-_text ENDS
-; /* Add b to a into r (r = a + b), 32 quadwords each; rax returns the carry out (0 or 1).
-;  * Only rax, r9, r10 and the flags are written; the stack is not touched.
-;  * r  (rcx) Result: 32 quadwords are stored at offsets 0..248.
-;  * a  (rdx) First operand: 32 quadwords are loaded from offsets 0..248.
-;  * b  (r8)  Second operand: 32 quadwords are used as memory operands of add/adc.
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_add_32 PROC
-        ; Add: one add, then an unbroken adc chain; r9 and r10 alternate as the working register
-        mov	r9, QWORD PTR [rdx]             ; r9 = a[0]
-        xor	rax, rax                        ; clear the return value; the flags set here are replaced by the add below
-        add	r9, QWORD PTR [r8]              ; a[0] + b[0], starts the carry chain
-        mov	r10, QWORD PTR [rdx+8]          ; load a[1] before r[0] is stored
-        mov	QWORD PTR [rcx], r9             ; store r[0]; mov does not modify CF
-        adc	r10, QWORD PTR [r8+8]           ; a[1] + b[1] + carry
-        mov	r9, QWORD PTR [rdx+16]          ; same load / store / adc pattern for words 2..31
-        mov	QWORD PTR [rcx+8], r10
-        adc	r9, QWORD PTR [r8+16]
-        mov	r10, QWORD PTR [rdx+24]
-        mov	QWORD PTR [rcx+16], r9
-        adc	r10, QWORD PTR [r8+24]
-        mov	r9, QWORD PTR [rdx+32]
-        mov	QWORD PTR [rcx+24], r10
-        adc	r9, QWORD PTR [r8+32]
-        mov	r10, QWORD PTR [rdx+40]
-        mov	QWORD PTR [rcx+32], r9
-        adc	r10, QWORD PTR [r8+40]
-        mov	r9, QWORD PTR [rdx+48]
-        mov	QWORD PTR [rcx+40], r10
-        adc	r9, QWORD PTR [r8+48]
-        mov	r10, QWORD PTR [rdx+56]
-        mov	QWORD PTR [rcx+48], r9
-        adc	r10, QWORD PTR [r8+56]
-        mov	r9, QWORD PTR [rdx+64]
-        mov	QWORD PTR [rcx+56], r10
-        adc	r9, QWORD PTR [r8+64]
-        mov	r10, QWORD PTR [rdx+72]
-        mov	QWORD PTR [rcx+64], r9
-        adc	r10, QWORD PTR [r8+72]
-        mov	r9, QWORD PTR [rdx+80]
-        mov	QWORD PTR [rcx+72], r10
-        adc	r9, QWORD PTR [r8+80]
-        mov	r10, QWORD PTR [rdx+88]
-        mov	QWORD PTR [rcx+80], r9
-        adc	r10, QWORD PTR [r8+88]
-        mov	r9, QWORD PTR [rdx+96]
-        mov	QWORD PTR [rcx+88], r10
-        adc	r9, QWORD PTR [r8+96]
-        mov	r10, QWORD PTR [rdx+104]
-        mov	QWORD PTR [rcx+96], r9
-        adc	r10, QWORD PTR [r8+104]
-        mov	r9, QWORD PTR [rdx+112]
-        mov	QWORD PTR [rcx+104], r10
-        adc	r9, QWORD PTR [r8+112]
-        mov	r10, QWORD PTR [rdx+120]
-        mov	QWORD PTR [rcx+112], r9
-        adc	r10, QWORD PTR [r8+120]
-        mov	r9, QWORD PTR [rdx+128]
-        mov	QWORD PTR [rcx+120], r10
-        adc	r9, QWORD PTR [r8+128]
-        mov	r10, QWORD PTR [rdx+136]
-        mov	QWORD PTR [rcx+128], r9
-        adc	r10, QWORD PTR [r8+136]
-        mov	r9, QWORD PTR [rdx+144]
-        mov	QWORD PTR [rcx+136], r10
-        adc	r9, QWORD PTR [r8+144]
-        mov	r10, QWORD PTR [rdx+152]
-        mov	QWORD PTR [rcx+144], r9
-        adc	r10, QWORD PTR [r8+152]
-        mov	r9, QWORD PTR [rdx+160]
-        mov	QWORD PTR [rcx+152], r10
-        adc	r9, QWORD PTR [r8+160]
-        mov	r10, QWORD PTR [rdx+168]
-        mov	QWORD PTR [rcx+160], r9
-        adc	r10, QWORD PTR [r8+168]
-        mov	r9, QWORD PTR [rdx+176]
-        mov	QWORD PTR [rcx+168], r10
-        adc	r9, QWORD PTR [r8+176]
-        mov	r10, QWORD PTR [rdx+184]
-        mov	QWORD PTR [rcx+176], r9
-        adc	r10, QWORD PTR [r8+184]
-        mov	r9, QWORD PTR [rdx+192]
-        mov	QWORD PTR [rcx+184], r10
-        adc	r9, QWORD PTR [r8+192]
-        mov	r10, QWORD PTR [rdx+200]
-        mov	QWORD PTR [rcx+192], r9
-        adc	r10, QWORD PTR [r8+200]
-        mov	r9, QWORD PTR [rdx+208]
-        mov	QWORD PTR [rcx+200], r10
-        adc	r9, QWORD PTR [r8+208]
-        mov	r10, QWORD PTR [rdx+216]
-        mov	QWORD PTR [rcx+208], r9
-        adc	r10, QWORD PTR [r8+216]
-        mov	r9, QWORD PTR [rdx+224]
-        mov	QWORD PTR [rcx+216], r10
-        adc	r9, QWORD PTR [r8+224]
-        mov	r10, QWORD PTR [rdx+232]
-        mov	QWORD PTR [rcx+224], r9
-        adc	r10, QWORD PTR [r8+232]
-        mov	r9, QWORD PTR [rdx+240]
-        mov	QWORD PTR [rcx+232], r10
-        adc	r9, QWORD PTR [r8+240]
-        mov	r10, QWORD PTR [rdx+248]
-        mov	QWORD PTR [rcx+240], r9
-        adc	r10, QWORD PTR [r8+248]         ; top word: a[31] + b[31] + carry
-        mov	QWORD PTR [rcx+248], r10        ; store r[31]
-        adc	rax, 0                          ; rax = 0 + carry out of the top word
-        ret
-sp_2048_add_32 ENDP
-_text ENDS
-; /* Multiply a and b into r. (r = a * b)
-;  *
-;  * r  A single precision integer.
-;  * a  A single precision integer.
-;  * b  A single precision integer.
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_mul_32 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        sub	rsp, 808
-        mov	QWORD PTR [rsp+768], rcx
-        mov	QWORD PTR [rsp+776], rdx
-        mov	QWORD PTR [rsp+784], r8
-        lea	r12, QWORD PTR [rsp+512]
-        lea	r14, QWORD PTR [rdx+128]
-        ; Add
-        mov	rax, QWORD PTR [rdx]
-        xor	r15, r15
-        add	rax, QWORD PTR [r14]
-        mov	r9, QWORD PTR [rdx+8]
-        mov	QWORD PTR [r12], rax
-        adc	r9, QWORD PTR [r14+8]
-        mov	r10, QWORD PTR [rdx+16]
-        mov	QWORD PTR [r12+8], r9
-        adc	r10, QWORD PTR [r14+16]
-        mov	rax, QWORD PTR [rdx+24]
-        mov	QWORD PTR [r12+16], r10
-        adc	rax, QWORD PTR [r14+24]
-        mov	r9, QWORD PTR [rdx+32]
-        mov	QWORD PTR [r12+24], rax
-        adc	r9, QWORD PTR [r14+32]
-        mov	r10, QWORD PTR [rdx+40]
-        mov	QWORD PTR [r12+32], r9
-        adc	r10, QWORD PTR [r14+40]
-        mov	rax, QWORD PTR [rdx+48]
-        mov	QWORD PTR [r12+40], r10
-        adc	rax, QWORD PTR [r14+48]
-        mov	r9, QWORD PTR [rdx+56]
-        mov	QWORD PTR [r12+48], rax
-        adc	r9, QWORD PTR [r14+56]
-        mov	r10, QWORD PTR [rdx+64]
-        mov	QWORD PTR [r12+56], r9
-        adc	r10, QWORD PTR [r14+64]
-        mov	rax, QWORD PTR [rdx+72]
-        mov	QWORD PTR [r12+64], r10
-        adc	rax, QWORD PTR [r14+72]
-        mov	r9, QWORD PTR [rdx+80]
-        mov	QWORD PTR [r12+72], rax
-        adc	r9, QWORD PTR [r14+80]
-        mov	r10, QWORD PTR [rdx+88]
-        mov	QWORD PTR [r12+80], r9
-        adc	r10, QWORD PTR [r14+88]
-        mov	rax, QWORD PTR [rdx+96]
-        mov	QWORD PTR [r12+88], r10
-        adc	rax, QWORD PTR [r14+96]
-        mov	r9, QWORD PTR [rdx+104]
-        mov	QWORD PTR [r12+96], rax
-        adc	r9, QWORD PTR [r14+104]
-        mov	r10, QWORD PTR [rdx+112]
-        mov	QWORD PTR [r12+104], r9
-        adc	r10, QWORD PTR [r14+112]
-        mov	rax, QWORD PTR [rdx+120]
-        mov	QWORD PTR [r12+112], r10
-        adc	rax, QWORD PTR [r14+120]
-        mov	QWORD PTR [r12+120], rax
-        adc	r15, 0
-        mov	QWORD PTR [rsp+792], r15
-        lea	r13, QWORD PTR [rsp+640]
-        lea	r14, QWORD PTR [r8+128]
-        ; Add
-        mov	rax, QWORD PTR [r8]
-        xor	rdi, rdi
-        add	rax, QWORD PTR [r14]
-        mov	r9, QWORD PTR [r8+8]
-        mov	QWORD PTR [r13], rax
-        adc	r9, QWORD PTR [r14+8]
-        mov	r10, QWORD PTR [r8+16]
-        mov	QWORD PTR [r13+8], r9
-        adc	r10, QWORD PTR [r14+16]
-        mov	rax, QWORD PTR [r8+24]
-        mov	QWORD PTR [r13+16], r10
-        adc	rax, QWORD PTR [r14+24]
-        mov	r9, QWORD PTR [r8+32]
-        mov	QWORD PTR [r13+24], rax
-        adc	r9, QWORD PTR [r14+32]
-        mov	r10, QWORD PTR [r8+40]
-        mov	QWORD PTR [r13+32], r9
-        adc	r10, QWORD PTR [r14+40]
-        mov	rax, QWORD PTR [r8+48]
-        mov	QWORD PTR [r13+40], r10
-        adc	rax, QWORD PTR [r14+48]
-        mov	r9, QWORD PTR [r8+56]
-        mov	QWORD PTR [r13+48], rax
-        adc	r9, QWORD PTR [r14+56]
-        mov	r10, QWORD PTR [r8+64]
-        mov	QWORD PTR [r13+56], r9
-        adc	r10, QWORD PTR [r14+64]
-        mov	rax, QWORD PTR [r8+72]
-        mov	QWORD PTR [r13+64], r10
-        adc	rax, QWORD PTR [r14+72]
-        mov	r9, QWORD PTR [r8+80]
-        mov	QWORD PTR [r13+72], rax
-        adc	r9, QWORD PTR [r14+80]
-        mov	r10, QWORD PTR [r8+88]
-        mov	QWORD PTR [r13+80], r9
-        adc	r10, QWORD PTR [r14+88]
-        mov	rax, QWORD PTR [r8+96]
-        mov	QWORD PTR [r13+88], r10
-        adc	rax, QWORD PTR [r14+96]
-        mov	r9, QWORD PTR [r8+104]
-        mov	QWORD PTR [r13+96], rax
-        adc	r9, QWORD PTR [r14+104]
-        mov	r10, QWORD PTR [r8+112]
-        mov	QWORD PTR [r13+104], r9
-        adc	r10, QWORD PTR [r14+112]
-        mov	rax, QWORD PTR [r8+120]
-        mov	QWORD PTR [r13+112], r10
-        adc	rax, QWORD PTR [r14+120]
-        mov	QWORD PTR [r13+120], rax
-        adc	rdi, 0
-        mov	QWORD PTR [rsp+800], rdi
-        mov	r8, r13
-        mov	rdx, r12
-        mov	rcx, rsp
-        call	sp_2048_mul_16
-        mov	r8, QWORD PTR [rsp+784]
-        mov	rdx, QWORD PTR [rsp+776]
-        lea	rcx, QWORD PTR [rsp+256]
-        add	r8, 128
-        add	rdx, 128
-        call	sp_2048_mul_16
-        mov	r8, QWORD PTR [rsp+784]
-        mov	rdx, QWORD PTR [rsp+776]
-        mov	rcx, QWORD PTR [rsp+768]
-        call	sp_2048_mul_16
-IFDEF _WIN64
-        mov	r8, QWORD PTR [rsp+784]
-        mov	rdx, QWORD PTR [rsp+776]
-        mov	rcx, QWORD PTR [rsp+768]
-ENDIF
-        mov	r15, QWORD PTR [rsp+792]
-        mov	rdi, QWORD PTR [rsp+800]
-        mov	rsi, QWORD PTR [rsp+768]
-        mov	r11, r15
-        lea	r12, QWORD PTR [rsp+512]
-        lea	r13, QWORD PTR [rsp+640]
-        and	r11, rdi
-        neg	r15
-        neg	rdi
-        add	rsi, 256
-        mov	rax, QWORD PTR [r12]
-        mov	r9, QWORD PTR [r13]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12], rax
-        mov	QWORD PTR [r13], r9
-        mov	rax, QWORD PTR [r12+8]
-        mov	r9, QWORD PTR [r13+8]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+8], rax
-        mov	QWORD PTR [r13+8], r9
-        mov	rax, QWORD PTR [r12+16]
-        mov	r9, QWORD PTR [r13+16]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+16], rax
-        mov	QWORD PTR [r13+16], r9
-        mov	rax, QWORD PTR [r12+24]
-        mov	r9, QWORD PTR [r13+24]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+24], rax
-        mov	QWORD PTR [r13+24], r9
-        mov	rax, QWORD PTR [r12+32]
-        mov	r9, QWORD PTR [r13+32]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+32], rax
-        mov	QWORD PTR [r13+32], r9
-        mov	rax, QWORD PTR [r12+40]
-        mov	r9, QWORD PTR [r13+40]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+40], rax
-        mov	QWORD PTR [r13+40], r9
-        mov	rax, QWORD PTR [r12+48]
-        mov	r9, QWORD PTR [r13+48]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+48], rax
-        mov	QWORD PTR [r13+48], r9
-        mov	rax, QWORD PTR [r12+56]
-        mov	r9, QWORD PTR [r13+56]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+56], rax
-        mov	QWORD PTR [r13+56], r9
-        mov	rax, QWORD PTR [r12+64]
-        mov	r9, QWORD PTR [r13+64]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+64], rax
-        mov	QWORD PTR [r13+64], r9
-        mov	rax, QWORD PTR [r12+72]
-        mov	r9, QWORD PTR [r13+72]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+72], rax
-        mov	QWORD PTR [r13+72], r9
-        mov	rax, QWORD PTR [r12+80]
-        mov	r9, QWORD PTR [r13+80]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+80], rax
-        mov	QWORD PTR [r13+80], r9
-        mov	rax, QWORD PTR [r12+88]
-        mov	r9, QWORD PTR [r13+88]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+88], rax
-        mov	QWORD PTR [r13+88], r9
-        mov	rax, QWORD PTR [r12+96]
-        mov	r9, QWORD PTR [r13+96]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+96], rax
-        mov	QWORD PTR [r13+96], r9
-        mov	rax, QWORD PTR [r12+104]
-        mov	r9, QWORD PTR [r13+104]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+104], rax
-        mov	QWORD PTR [r13+104], r9
-        mov	rax, QWORD PTR [r12+112]
-        mov	r9, QWORD PTR [r13+112]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+112], rax
-        mov	QWORD PTR [r13+112], r9
-        mov	rax, QWORD PTR [r12+120]
-        mov	r9, QWORD PTR [r13+120]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+120], rax
-        mov	QWORD PTR [r13+120], r9
-        mov	rax, QWORD PTR [r12]
-        add	rax, QWORD PTR [r13]
-        mov	r9, QWORD PTR [r12+8]
-        mov	QWORD PTR [rsi], rax
-        adc	r9, QWORD PTR [r13+8]
-        mov	r10, QWORD PTR [r12+16]
-        mov	QWORD PTR [rsi+8], r9
-        adc	r10, QWORD PTR [r13+16]
-        mov	rax, QWORD PTR [r12+24]
-        mov	QWORD PTR [rsi+16], r10
-        adc	rax, QWORD PTR [r13+24]
-        mov	r9, QWORD PTR [r12+32]
-        mov	QWORD PTR [rsi+24], rax
-        adc	r9, QWORD PTR [r13+32]
-        mov	r10, QWORD PTR [r12+40]
-        mov	QWORD PTR [rsi+32], r9
-        adc	r10, QWORD PTR [r13+40]
-        mov	rax, QWORD PTR [r12+48]
-        mov	QWORD PTR [rsi+40], r10
-        adc	rax, QWORD PTR [r13+48]
-        mov	r9, QWORD PTR [r12+56]
-        mov	QWORD PTR [rsi+48], rax
-        adc	r9, QWORD PTR [r13+56]
-        mov	r10, QWORD PTR [r12+64]
-        mov	QWORD PTR [rsi+56], r9
-        adc	r10, QWORD PTR [r13+64]
-        mov	rax, QWORD PTR [r12+72]
-        mov	QWORD PTR [rsi+64], r10
-        adc	rax, QWORD PTR [r13+72]
-        mov	r9, QWORD PTR [r12+80]
-        mov	QWORD PTR [rsi+72], rax
-        adc	r9, QWORD PTR [r13+80]
-        mov	r10, QWORD PTR [r12+88]
-        mov	QWORD PTR [rsi+80], r9
-        adc	r10, QWORD PTR [r13+88]
-        mov	rax, QWORD PTR [r12+96]
-        mov	QWORD PTR [rsi+88], r10
-        adc	rax, QWORD PTR [r13+96]
-        mov	r9, QWORD PTR [r12+104]
-        mov	QWORD PTR [rsi+96], rax
-        adc	r9, QWORD PTR [r13+104]
-        mov	r10, QWORD PTR [r12+112]
-        mov	QWORD PTR [rsi+104], r9
-        adc	r10, QWORD PTR [r13+112]
-        mov	rax, QWORD PTR [r12+120]
-        mov	QWORD PTR [rsi+112], r10
-        adc	rax, QWORD PTR [r13+120]
-        mov	QWORD PTR [rsi+120], rax
-        adc	r11, 0
-        lea	r13, QWORD PTR [rsp+256]
-        mov	r12, rsp
-        mov	rax, QWORD PTR [r12]
-        sub	rax, QWORD PTR [r13]
-        mov	r9, QWORD PTR [r12+8]
-        mov	QWORD PTR [r12], rax
-        sbb	r9, QWORD PTR [r13+8]
-        mov	r10, QWORD PTR [r12+16]
-        mov	QWORD PTR [r12+8], r9
-        sbb	r10, QWORD PTR [r13+16]
-        mov	rax, QWORD PTR [r12+24]
-        mov	QWORD PTR [r12+16], r10
-        sbb	rax, QWORD PTR [r13+24]
-        mov	r9, QWORD PTR [r12+32]
-        mov	QWORD PTR [r12+24], rax
-        sbb	r9, QWORD PTR [r13+32]
-        mov	r10, QWORD PTR [r12+40]
-        mov	QWORD PTR [r12+32], r9
-        sbb	r10, QWORD PTR [r13+40]
-        mov	rax, QWORD PTR [r12+48]
-        mov	QWORD PTR [r12+40], r10
-        sbb	rax, QWORD PTR [r13+48]
-        mov	r9, QWORD PTR [r12+56]
-        mov	QWORD PTR [r12+48], rax
-        sbb	r9, QWORD PTR [r13+56]
-        mov	r10, QWORD PTR [r12+64]
-        mov	QWORD PTR [r12+56], r9
-        sbb	r10, QWORD PTR [r13+64]
-        mov	rax, QWORD PTR [r12+72]
-        mov	QWORD PTR [r12+64], r10
-        sbb	rax, QWORD PTR [r13+72]
-        mov	r9, QWORD PTR [r12+80]
-        mov	QWORD PTR [r12+72], rax
-        sbb	r9, QWORD PTR [r13+80]
-        mov	r10, QWORD PTR [r12+88]
-        mov	QWORD PTR [r12+80], r9
-        sbb	r10, QWORD PTR [r13+88]
-        mov	rax, QWORD PTR [r12+96]
-        mov	QWORD PTR [r12+88], r10
-        sbb	rax, QWORD PTR [r13+96]
-        mov	r9, QWORD PTR [r12+104]
-        mov	QWORD PTR [r12+96], rax
-        sbb	r9, QWORD PTR [r13+104]
-        mov	r10, QWORD PTR [r12+112]
-        mov	QWORD PTR [r12+104], r9
-        sbb	r10, QWORD PTR [r13+112]
-        mov	rax, QWORD PTR [r12+120]
-        mov	QWORD PTR [r12+112], r10
-        sbb	rax, QWORD PTR [r13+120]
-        mov	r9, QWORD PTR [r12+128]
-        mov	QWORD PTR [r12+120], rax
-        sbb	r9, QWORD PTR [r13+128]
-        mov	r10, QWORD PTR [r12+136]
-        mov	QWORD PTR [r12+128], r9
-        sbb	r10, QWORD PTR [r13+136]
-        mov	rax, QWORD PTR [r12+144]
-        mov	QWORD PTR [r12+136], r10
-        sbb	rax, QWORD PTR [r13+144]
-        mov	r9, QWORD PTR [r12+152]
-        mov	QWORD PTR [r12+144], rax
-        sbb	r9, QWORD PTR [r13+152]
-        mov	r10, QWORD PTR [r12+160]
-        mov	QWORD PTR [r12+152], r9
-        sbb	r10, QWORD PTR [r13+160]
-        mov	rax, QWORD PTR [r12+168]
-        mov	QWORD PTR [r12+160], r10
-        sbb	rax, QWORD PTR [r13+168]
-        mov	r9, QWORD PTR [r12+176]
-        mov	QWORD PTR [r12+168], rax
-        sbb	r9, QWORD PTR [r13+176]
-        mov	r10, QWORD PTR [r12+184]
-        mov	QWORD PTR [r12+176], r9
-        sbb	r10, QWORD PTR [r13+184]
-        mov	rax, QWORD PTR [r12+192]
-        mov	QWORD PTR [r12+184], r10
-        sbb	rax, QWORD PTR [r13+192]
-        mov	r9, QWORD PTR [r12+200]
-        mov	QWORD PTR [r12+192], rax
-        sbb	r9, QWORD PTR [r13+200]
-        mov	r10, QWORD PTR [r12+208]
-        mov	QWORD PTR [r12+200], r9
-        sbb	r10, QWORD PTR [r13+208]
-        mov	rax, QWORD PTR [r12+216]
-        mov	QWORD PTR [r12+208], r10
-        sbb	rax, QWORD PTR [r13+216]
-        mov	r9, QWORD PTR [r12+224]
-        mov	QWORD PTR [r12+216], rax
-        sbb	r9, QWORD PTR [r13+224]
-        mov	r10, QWORD PTR [r12+232]
-        mov	QWORD PTR [r12+224], r9
-        sbb	r10, QWORD PTR [r13+232]
-        mov	rax, QWORD PTR [r12+240]
-        mov	QWORD PTR [r12+232], r10
-        sbb	rax, QWORD PTR [r13+240]
-        mov	r9, QWORD PTR [r12+248]
-        mov	QWORD PTR [r12+240], rax
-        sbb	r9, QWORD PTR [r13+248]
-        mov	QWORD PTR [r12+248], r9
-        sbb	r11, 0
-        mov	rax, QWORD PTR [r12]
-        sub	rax, QWORD PTR [rcx]
-        mov	r9, QWORD PTR [r12+8]
-        mov	QWORD PTR [r12], rax
-        sbb	r9, QWORD PTR [rcx+8]
-        mov	r10, QWORD PTR [r12+16]
-        mov	QWORD PTR [r12+8], r9
-        sbb	r10, QWORD PTR [rcx+16]
-        mov	rax, QWORD PTR [r12+24]
-        mov	QWORD PTR [r12+16], r10
-        sbb	rax, QWORD PTR [rcx+24]
-        mov	r9, QWORD PTR [r12+32]
-        mov	QWORD PTR [r12+24], rax
-        sbb	r9, QWORD PTR [rcx+32]
-        mov	r10, QWORD PTR [r12+40]
-        mov	QWORD PTR [r12+32], r9
-        sbb	r10, QWORD PTR [rcx+40]
-        mov	rax, QWORD PTR [r12+48]
-        mov	QWORD PTR [r12+40], r10
-        sbb	rax, QWORD PTR [rcx+48]
-        mov	r9, QWORD PTR [r12+56]
-        mov	QWORD PTR [r12+48], rax
-        sbb	r9, QWORD PTR [rcx+56]
-        mov	r10, QWORD PTR [r12+64]
-        mov	QWORD PTR [r12+56], r9
-        sbb	r10, QWORD PTR [rcx+64]
-        mov	rax, QWORD PTR [r12+72]
-        mov	QWORD PTR [r12+64], r10
-        sbb	rax, QWORD PTR [rcx+72]
-        mov	r9, QWORD PTR [r12+80]
-        mov	QWORD PTR [r12+72], rax
-        sbb	r9, QWORD PTR [rcx+80]
-        mov	r10, QWORD PTR [r12+88]
-        mov	QWORD PTR [r12+80], r9
-        sbb	r10, QWORD PTR [rcx+88]
-        mov	rax, QWORD PTR [r12+96]
-        mov	QWORD PTR [r12+88], r10
-        sbb	rax, QWORD PTR [rcx+96]
-        mov	r9, QWORD PTR [r12+104]
-        mov	QWORD PTR [r12+96], rax
-        sbb	r9, QWORD PTR [rcx+104]
-        mov	r10, QWORD PTR [r12+112]
-        mov	QWORD PTR [r12+104], r9
-        sbb	r10, QWORD PTR [rcx+112]
-        mov	rax, QWORD PTR [r12+120]
-        mov	QWORD PTR [r12+112], r10
-        sbb	rax, QWORD PTR [rcx+120]
-        mov	r9, QWORD PTR [r12+128]
-        mov	QWORD PTR [r12+120], rax
-        sbb	r9, QWORD PTR [rcx+128]
-        mov	r10, QWORD PTR [r12+136]
-        mov	QWORD PTR [r12+128], r9
-        sbb	r10, QWORD PTR [rcx+136]
-        mov	rax, QWORD PTR [r12+144]
-        mov	QWORD PTR [r12+136], r10
-        sbb	rax, QWORD PTR [rcx+144]
-        mov	r9, QWORD PTR [r12+152]
-        mov	QWORD PTR [r12+144], rax
-        sbb	r9, QWORD PTR [rcx+152]
-        mov	r10, QWORD PTR [r12+160]
-        mov	QWORD PTR [r12+152], r9
-        sbb	r10, QWORD PTR [rcx+160]
-        mov	rax, QWORD PTR [r12+168]
-        mov	QWORD PTR [r12+160], r10
-        sbb	rax, QWORD PTR [rcx+168]
-        mov	r9, QWORD PTR [r12+176]
-        mov	QWORD PTR [r12+168], rax
-        sbb	r9, QWORD PTR [rcx+176]
-        mov	r10, QWORD PTR [r12+184]
-        mov	QWORD PTR [r12+176], r9
-        sbb	r10, QWORD PTR [rcx+184]
-        mov	rax, QWORD PTR [r12+192]
-        mov	QWORD PTR [r12+184], r10
-        sbb	rax, QWORD PTR [rcx+192]
-        mov	r9, QWORD PTR [r12+200]
-        mov	QWORD PTR [r12+192], rax
-        sbb	r9, QWORD PTR [rcx+200]
-        mov	r10, QWORD PTR [r12+208]
-        mov	QWORD PTR [r12+200], r9
-        sbb	r10, QWORD PTR [rcx+208]
-        mov	rax, QWORD PTR [r12+216]
-        mov	QWORD PTR [r12+208], r10
-        sbb	rax, QWORD PTR [rcx+216]
-        mov	r9, QWORD PTR [r12+224]
-        mov	QWORD PTR [r12+216], rax
-        sbb	r9, QWORD PTR [rcx+224]
-        mov	r10, QWORD PTR [r12+232]
-        mov	QWORD PTR [r12+224], r9
-        sbb	r10, QWORD PTR [rcx+232]
-        mov	rax, QWORD PTR [r12+240]
-        mov	QWORD PTR [r12+232], r10
-        sbb	rax, QWORD PTR [rcx+240]
-        mov	r9, QWORD PTR [r12+248]
-        mov	QWORD PTR [r12+240], rax
-        sbb	r9, QWORD PTR [rcx+248]
-        mov	QWORD PTR [r12+248], r9
-        sbb	r11, 0
-        sub	rsi, 128
-        ; Add
-        mov	rax, QWORD PTR [rsi]
-        add	rax, QWORD PTR [r12]
-        mov	r9, QWORD PTR [rsi+8]
-        mov	QWORD PTR [rsi], rax
-        adc	r9, QWORD PTR [r12+8]
-        mov	r10, QWORD PTR [rsi+16]
-        mov	QWORD PTR [rsi+8], r9
-        adc	r10, QWORD PTR [r12+16]
-        mov	rax, QWORD PTR [rsi+24]
-        mov	QWORD PTR [rsi+16], r10
-        adc	rax, QWORD PTR [r12+24]
-        mov	r9, QWORD PTR [rsi+32]
-        mov	QWORD PTR [rsi+24], rax
-        adc	r9, QWORD PTR [r12+32]
-        mov	r10, QWORD PTR [rsi+40]
-        mov	QWORD PTR [rsi+32], r9
-        adc	r10, QWORD PTR [r12+40]
-        mov	rax, QWORD PTR [rsi+48]
-        mov	QWORD PTR [rsi+40], r10
-        adc	rax, QWORD PTR [r12+48]
-        mov	r9, QWORD PTR [rsi+56]
-        mov	QWORD PTR [rsi+48], rax
-        adc	r9, QWORD PTR [r12+56]
-        mov	r10, QWORD PTR [rsi+64]
-        mov	QWORD PTR [rsi+56], r9
-        adc	r10, QWORD PTR [r12+64]
-        mov	rax, QWORD PTR [rsi+72]
-        mov	QWORD PTR [rsi+64], r10
-        adc	rax, QWORD PTR [r12+72]
-        mov	r9, QWORD PTR [rsi+80]
-        mov	QWORD PTR [rsi+72], rax
-        adc	r9, QWORD PTR [r12+80]
-        mov	r10, QWORD PTR [rsi+88]
-        mov	QWORD PTR [rsi+80], r9
-        adc	r10, QWORD PTR [r12+88]
-        mov	rax, QWORD PTR [rsi+96]
-        mov	QWORD PTR [rsi+88], r10
-        adc	rax, QWORD PTR [r12+96]
-        mov	r9, QWORD PTR [rsi+104]
-        mov	QWORD PTR [rsi+96], rax
-        adc	r9, QWORD PTR [r12+104]
-        mov	r10, QWORD PTR [rsi+112]
-        mov	QWORD PTR [rsi+104], r9
-        adc	r10, QWORD PTR [r12+112]
-        mov	rax, QWORD PTR [rsi+120]
-        mov	QWORD PTR [rsi+112], r10
-        adc	rax, QWORD PTR [r12+120]
-        mov	r9, QWORD PTR [rsi+128]
-        mov	QWORD PTR [rsi+120], rax
-        adc	r9, QWORD PTR [r12+128]
-        mov	r10, QWORD PTR [rsi+136]
-        mov	QWORD PTR [rsi+128], r9
-        adc	r10, QWORD PTR [r12+136]
-        mov	rax, QWORD PTR [rsi+144]
-        mov	QWORD PTR [rsi+136], r10
-        adc	rax, QWORD PTR [r12+144]
-        mov	r9, QWORD PTR [rsi+152]
-        mov	QWORD PTR [rsi+144], rax
-        adc	r9, QWORD PTR [r12+152]
-        mov	r10, QWORD PTR [rsi+160]
-        mov	QWORD PTR [rsi+152], r9
-        adc	r10, QWORD PTR [r12+160]
-        mov	rax, QWORD PTR [rsi+168]
-        mov	QWORD PTR [rsi+160], r10
-        adc	rax, QWORD PTR [r12+168]
-        mov	r9, QWORD PTR [rsi+176]
-        mov	QWORD PTR [rsi+168], rax
-        adc	r9, QWORD PTR [r12+176]
-        mov	r10, QWORD PTR [rsi+184]
-        mov	QWORD PTR [rsi+176], r9
-        adc	r10, QWORD PTR [r12+184]
-        mov	rax, QWORD PTR [rsi+192]
-        mov	QWORD PTR [rsi+184], r10
-        adc	rax, QWORD PTR [r12+192]
-        mov	r9, QWORD PTR [rsi+200]
-        mov	QWORD PTR [rsi+192], rax
-        adc	r9, QWORD PTR [r12+200]
-        mov	r10, QWORD PTR [rsi+208]
-        mov	QWORD PTR [rsi+200], r9
-        adc	r10, QWORD PTR [r12+208]
-        mov	rax, QWORD PTR [rsi+216]
-        mov	QWORD PTR [rsi+208], r10
-        adc	rax, QWORD PTR [r12+216]
-        mov	r9, QWORD PTR [rsi+224]
-        mov	QWORD PTR [rsi+216], rax
-        adc	r9, QWORD PTR [r12+224]
-        mov	r10, QWORD PTR [rsi+232]
-        mov	QWORD PTR [rsi+224], r9
-        adc	r10, QWORD PTR [r12+232]
-        mov	rax, QWORD PTR [rsi+240]
-        mov	QWORD PTR [rsi+232], r10
-        adc	rax, QWORD PTR [r12+240]
-        mov	r9, QWORD PTR [rsi+248]
-        mov	QWORD PTR [rsi+240], rax
-        adc	r9, QWORD PTR [r12+248]
-        mov	QWORD PTR [rsi+248], r9
-        adc	r11, 0
-        mov	QWORD PTR [rcx+384], r11
-        add	rsi, 128
-        ; Add
-        mov	rax, QWORD PTR [rsi]
-        add	rax, QWORD PTR [r13]
-        mov	r9, QWORD PTR [rsi+8]
-        mov	QWORD PTR [rsi], rax
-        adc	r9, QWORD PTR [r13+8]
-        mov	r10, QWORD PTR [rsi+16]
-        mov	QWORD PTR [rsi+8], r9
-        adc	r10, QWORD PTR [r13+16]
-        mov	rax, QWORD PTR [rsi+24]
-        mov	QWORD PTR [rsi+16], r10
-        adc	rax, QWORD PTR [r13+24]
-        mov	r9, QWORD PTR [rsi+32]
-        mov	QWORD PTR [rsi+24], rax
-        adc	r9, QWORD PTR [r13+32]
-        mov	r10, QWORD PTR [rsi+40]
-        mov	QWORD PTR [rsi+32], r9
-        adc	r10, QWORD PTR [r13+40]
-        mov	rax, QWORD PTR [rsi+48]
-        mov	QWORD PTR [rsi+40], r10
-        adc	rax, QWORD PTR [r13+48]
-        mov	r9, QWORD PTR [rsi+56]
-        mov	QWORD PTR [rsi+48], rax
-        adc	r9, QWORD PTR [r13+56]
-        mov	r10, QWORD PTR [rsi+64]
-        mov	QWORD PTR [rsi+56], r9
-        adc	r10, QWORD PTR [r13+64]
-        mov	rax, QWORD PTR [rsi+72]
-        mov	QWORD PTR [rsi+64], r10
-        adc	rax, QWORD PTR [r13+72]
-        mov	r9, QWORD PTR [rsi+80]
-        mov	QWORD PTR [rsi+72], rax
-        adc	r9, QWORD PTR [r13+80]
-        mov	r10, QWORD PTR [rsi+88]
-        mov	QWORD PTR [rsi+80], r9
-        adc	r10, QWORD PTR [r13+88]
-        mov	rax, QWORD PTR [rsi+96]
-        mov	QWORD PTR [rsi+88], r10
-        adc	rax, QWORD PTR [r13+96]
-        mov	r9, QWORD PTR [rsi+104]
-        mov	QWORD PTR [rsi+96], rax
-        adc	r9, QWORD PTR [r13+104]
-        mov	r10, QWORD PTR [rsi+112]
-        mov	QWORD PTR [rsi+104], r9
-        adc	r10, QWORD PTR [r13+112]
-        mov	rax, QWORD PTR [rsi+120]
-        mov	QWORD PTR [rsi+112], r10
-        adc	rax, QWORD PTR [r13+120]
-        mov	r9, QWORD PTR [rsi+128]
-        mov	QWORD PTR [rsi+120], rax
-        adc	r9, QWORD PTR [r13+128]
-        mov	QWORD PTR [rsi+128], r9
-        ; Add to zero
-        mov	rax, QWORD PTR [r13+136]
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+144]
-        mov	QWORD PTR [rsi+136], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+152]
-        mov	QWORD PTR [rsi+144], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+160]
-        mov	QWORD PTR [rsi+152], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+168]
-        mov	QWORD PTR [rsi+160], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+176]
-        mov	QWORD PTR [rsi+168], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+184]
-        mov	QWORD PTR [rsi+176], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+192]
-        mov	QWORD PTR [rsi+184], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+200]
-        mov	QWORD PTR [rsi+192], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+208]
-        mov	QWORD PTR [rsi+200], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+216]
-        mov	QWORD PTR [rsi+208], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+224]
-        mov	QWORD PTR [rsi+216], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+232]
-        mov	QWORD PTR [rsi+224], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+240]
-        mov	QWORD PTR [rsi+232], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+248]
-        mov	QWORD PTR [rsi+240], r9
-        adc	r10, 0
-        mov	QWORD PTR [rsi+248], r10
-        add	rsp, 808
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_2048_mul_32 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
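-; /* Multiply a and b into r. (r = a * b)
-;  *
-;  * AVX2 variant; assembled only when HAVE_INTEL_AVX2 is defined.
-;  *
-;  * r  A single precision integer that receives the product (passed in rcx).
-;  * a  A single precision integer, first operand, read only (passed in rdx).
-;  * b  A single precision integer, second operand, read only (passed in r8).
-;  */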
-_text SEGMENT READONLY PARA
-sp_2048_mul_avx2_32 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        sub	rsp, 808
-        mov	QWORD PTR [rsp+768], rcx
-        mov	QWORD PTR [rsp+776], rdx
-        mov	QWORD PTR [rsp+784], r8
-        lea	r12, QWORD PTR [rsp+512]
-        lea	r14, QWORD PTR [rdx+128]
-        ; Add
-        mov	rax, QWORD PTR [rdx]
-        xor	r15, r15
-        add	rax, QWORD PTR [r14]
-        mov	r9, QWORD PTR [rdx+8]
-        mov	QWORD PTR [r12], rax
-        adc	r9, QWORD PTR [r14+8]
-        mov	r10, QWORD PTR [rdx+16]
-        mov	QWORD PTR [r12+8], r9
-        adc	r10, QWORD PTR [r14+16]
-        mov	rax, QWORD PTR [rdx+24]
-        mov	QWORD PTR [r12+16], r10
-        adc	rax, QWORD PTR [r14+24]
-        mov	r9, QWORD PTR [rdx+32]
-        mov	QWORD PTR [r12+24], rax
-        adc	r9, QWORD PTR [r14+32]
-        mov	r10, QWORD PTR [rdx+40]
-        mov	QWORD PTR [r12+32], r9
-        adc	r10, QWORD PTR [r14+40]
-        mov	rax, QWORD PTR [rdx+48]
-        mov	QWORD PTR [r12+40], r10
-        adc	rax, QWORD PTR [r14+48]
-        mov	r9, QWORD PTR [rdx+56]
-        mov	QWORD PTR [r12+48], rax
-        adc	r9, QWORD PTR [r14+56]
-        mov	r10, QWORD PTR [rdx+64]
-        mov	QWORD PTR [r12+56], r9
-        adc	r10, QWORD PTR [r14+64]
-        mov	rax, QWORD PTR [rdx+72]
-        mov	QWORD PTR [r12+64], r10
-        adc	rax, QWORD PTR [r14+72]
-        mov	r9, QWORD PTR [rdx+80]
-        mov	QWORD PTR [r12+72], rax
-        adc	r9, QWORD PTR [r14+80]
-        mov	r10, QWORD PTR [rdx+88]
-        mov	QWORD PTR [r12+80], r9
-        adc	r10, QWORD PTR [r14+88]
-        mov	rax, QWORD PTR [rdx+96]
-        mov	QWORD PTR [r12+88], r10
-        adc	rax, QWORD PTR [r14+96]
-        mov	r9, QWORD PTR [rdx+104]
-        mov	QWORD PTR [r12+96], rax
-        adc	r9, QWORD PTR [r14+104]
-        mov	r10, QWORD PTR [rdx+112]
-        mov	QWORD PTR [r12+104], r9
-        adc	r10, QWORD PTR [r14+112]
-        mov	rax, QWORD PTR [rdx+120]
-        mov	QWORD PTR [r12+112], r10
-        adc	rax, QWORD PTR [r14+120]
-        mov	QWORD PTR [r12+120], rax
-        adc	r15, 0
-        mov	QWORD PTR [rsp+792], r15
-        lea	r13, QWORD PTR [rsp+640]
-        lea	r14, QWORD PTR [r8+128]
-        ; Add
-        mov	rax, QWORD PTR [r8]
-        xor	rdi, rdi
-        add	rax, QWORD PTR [r14]
-        mov	r9, QWORD PTR [r8+8]
-        mov	QWORD PTR [r13], rax
-        adc	r9, QWORD PTR [r14+8]
-        mov	r10, QWORD PTR [r8+16]
-        mov	QWORD PTR [r13+8], r9
-        adc	r10, QWORD PTR [r14+16]
-        mov	rax, QWORD PTR [r8+24]
-        mov	QWORD PTR [r13+16], r10
-        adc	rax, QWORD PTR [r14+24]
-        mov	r9, QWORD PTR [r8+32]
-        mov	QWORD PTR [r13+24], rax
-        adc	r9, QWORD PTR [r14+32]
-        mov	r10, QWORD PTR [r8+40]
-        mov	QWORD PTR [r13+32], r9
-        adc	r10, QWORD PTR [r14+40]
-        mov	rax, QWORD PTR [r8+48]
-        mov	QWORD PTR [r13+40], r10
-        adc	rax, QWORD PTR [r14+48]
-        mov	r9, QWORD PTR [r8+56]
-        mov	QWORD PTR [r13+48], rax
-        adc	r9, QWORD PTR [r14+56]
-        mov	r10, QWORD PTR [r8+64]
-        mov	QWORD PTR [r13+56], r9
-        adc	r10, QWORD PTR [r14+64]
-        mov	rax, QWORD PTR [r8+72]
-        mov	QWORD PTR [r13+64], r10
-        adc	rax, QWORD PTR [r14+72]
-        mov	r9, QWORD PTR [r8+80]
-        mov	QWORD PTR [r13+72], rax
-        adc	r9, QWORD PTR [r14+80]
-        mov	r10, QWORD PTR [r8+88]
-        mov	QWORD PTR [r13+80], r9
-        adc	r10, QWORD PTR [r14+88]
-        mov	rax, QWORD PTR [r8+96]
-        mov	QWORD PTR [r13+88], r10
-        adc	rax, QWORD PTR [r14+96]
-        mov	r9, QWORD PTR [r8+104]
-        mov	QWORD PTR [r13+96], rax
-        adc	r9, QWORD PTR [r14+104]
-        mov	r10, QWORD PTR [r8+112]
-        mov	QWORD PTR [r13+104], r9
-        adc	r10, QWORD PTR [r14+112]
-        mov	rax, QWORD PTR [r8+120]
-        mov	QWORD PTR [r13+112], r10
-        adc	rax, QWORD PTR [r14+120]
-        mov	QWORD PTR [r13+120], rax
-        adc	rdi, 0
-        mov	QWORD PTR [rsp+800], rdi
-        mov	r8, r13
-        mov	rdx, r12
-        mov	rcx, rsp
-        call	sp_2048_mul_avx2_16
-        mov	r8, QWORD PTR [rsp+784]
-        mov	rdx, QWORD PTR [rsp+776]
-        lea	rcx, QWORD PTR [rsp+256]
-        add	r8, 128
-        add	rdx, 128
-        call	sp_2048_mul_avx2_16
-        mov	r8, QWORD PTR [rsp+784]
-        mov	rdx, QWORD PTR [rsp+776]
-        mov	rcx, QWORD PTR [rsp+768]
-        call	sp_2048_mul_avx2_16
-IFDEF _WIN64
-        mov	r8, QWORD PTR [rsp+784]
-        mov	rdx, QWORD PTR [rsp+776]
-        mov	rcx, QWORD PTR [rsp+768]
-ENDIF
-        mov	r15, QWORD PTR [rsp+792]
-        mov	rdi, QWORD PTR [rsp+800]
-        mov	rsi, QWORD PTR [rsp+768]
-        mov	r11, r15
-        lea	r12, QWORD PTR [rsp+512]
-        lea	r13, QWORD PTR [rsp+640]
-        and	r11, rdi
-        neg	r15
-        neg	rdi
-        add	rsi, 256
-        mov	rax, QWORD PTR [r12]
-        mov	r9, QWORD PTR [r13]
-        pext	rax, rax, rdi
-        pext	r9, r9, r15
-        add	rax, r9
-        mov	r9, QWORD PTR [r12+8]
-        mov	r10, QWORD PTR [r13+8]
-        pext	r9, r9, rdi
-        pext	r10, r10, r15
-        mov	QWORD PTR [rsi], rax
-        adc	r9, r10
-        mov	r10, QWORD PTR [r12+16]
-        mov	rax, QWORD PTR [r13+16]
-        pext	r10, r10, rdi
-        pext	rax, rax, r15
-        mov	QWORD PTR [rsi+8], r9
-        adc	r10, rax
-        mov	rax, QWORD PTR [r12+24]
-        mov	r9, QWORD PTR [r13+24]
-        pext	rax, rax, rdi
-        pext	r9, r9, r15
-        mov	QWORD PTR [rsi+16], r10
-        adc	rax, r9
-        mov	r9, QWORD PTR [r12+32]
-        mov	r10, QWORD PTR [r13+32]
-        pext	r9, r9, rdi
-        pext	r10, r10, r15
-        mov	QWORD PTR [rsi+24], rax
-        adc	r9, r10
-        mov	r10, QWORD PTR [r12+40]
-        mov	rax, QWORD PTR [r13+40]
-        pext	r10, r10, rdi
-        pext	rax, rax, r15
-        mov	QWORD PTR [rsi+32], r9
-        adc	r10, rax
-        mov	rax, QWORD PTR [r12+48]
-        mov	r9, QWORD PTR [r13+48]
-        pext	rax, rax, rdi
-        pext	r9, r9, r15
-        mov	QWORD PTR [rsi+40], r10
-        adc	rax, r9
-        mov	r9, QWORD PTR [r12+56]
-        mov	r10, QWORD PTR [r13+56]
-        pext	r9, r9, rdi
-        pext	r10, r10, r15
-        mov	QWORD PTR [rsi+48], rax
-        adc	r9, r10
-        mov	r10, QWORD PTR [r12+64]
-        mov	rax, QWORD PTR [r13+64]
-        pext	r10, r10, rdi
-        pext	rax, rax, r15
-        mov	QWORD PTR [rsi+56], r9
-        adc	r10, rax
-        mov	rax, QWORD PTR [r12+72]
-        mov	r9, QWORD PTR [r13+72]
-        pext	rax, rax, rdi
-        pext	r9, r9, r15
-        mov	QWORD PTR [rsi+64], r10
-        adc	rax, r9
-        mov	r9, QWORD PTR [r12+80]
-        mov	r10, QWORD PTR [r13+80]
-        pext	r9, r9, rdi
-        pext	r10, r10, r15
-        mov	QWORD PTR [rsi+72], rax
-        adc	r9, r10
-        mov	r10, QWORD PTR [r12+88]
-        mov	rax, QWORD PTR [r13+88]
-        pext	r10, r10, rdi
-        pext	rax, rax, r15
-        mov	QWORD PTR [rsi+80], r9
-        adc	r10, rax
-        mov	rax, QWORD PTR [r12+96]
-        mov	r9, QWORD PTR [r13+96]
-        pext	rax, rax, rdi
-        pext	r9, r9, r15
-        mov	QWORD PTR [rsi+88], r10
-        adc	rax, r9
-        mov	r9, QWORD PTR [r12+104]
-        mov	r10, QWORD PTR [r13+104]
-        pext	r9, r9, rdi
-        pext	r10, r10, r15
-        mov	QWORD PTR [rsi+96], rax
-        adc	r9, r10
-        mov	r10, QWORD PTR [r12+112]
-        mov	rax, QWORD PTR [r13+112]
-        pext	r10, r10, rdi
-        pext	rax, rax, r15
-        mov	QWORD PTR [rsi+104], r9
-        adc	r10, rax
-        mov	rax, QWORD PTR [r12+120]
-        mov	r9, QWORD PTR [r13+120]
-        pext	rax, rax, rdi
-        pext	r9, r9, r15
-        mov	QWORD PTR [rsi+112], r10
-        adc	rax, r9
-        mov	QWORD PTR [rsi+120], rax
-        adc	r11, 0
-        lea	r13, QWORD PTR [rsp+256]
-        mov	r12, rsp
-        mov	rax, QWORD PTR [r12]
-        sub	rax, QWORD PTR [r13]
-        mov	r9, QWORD PTR [r12+8]
-        mov	QWORD PTR [r12], rax
-        sbb	r9, QWORD PTR [r13+8]
-        mov	r10, QWORD PTR [r12+16]
-        mov	QWORD PTR [r12+8], r9
-        sbb	r10, QWORD PTR [r13+16]
-        mov	rax, QWORD PTR [r12+24]
-        mov	QWORD PTR [r12+16], r10
-        sbb	rax, QWORD PTR [r13+24]
-        mov	r9, QWORD PTR [r12+32]
-        mov	QWORD PTR [r12+24], rax
-        sbb	r9, QWORD PTR [r13+32]
-        mov	r10, QWORD PTR [r12+40]
-        mov	QWORD PTR [r12+32], r9
-        sbb	r10, QWORD PTR [r13+40]
-        mov	rax, QWORD PTR [r12+48]
-        mov	QWORD PTR [r12+40], r10
-        sbb	rax, QWORD PTR [r13+48]
-        mov	r9, QWORD PTR [r12+56]
-        mov	QWORD PTR [r12+48], rax
-        sbb	r9, QWORD PTR [r13+56]
-        mov	r10, QWORD PTR [r12+64]
-        mov	QWORD PTR [r12+56], r9
-        sbb	r10, QWORD PTR [r13+64]
-        mov	rax, QWORD PTR [r12+72]
-        mov	QWORD PTR [r12+64], r10
-        sbb	rax, QWORD PTR [r13+72]
-        mov	r9, QWORD PTR [r12+80]
-        mov	QWORD PTR [r12+72], rax
-        sbb	r9, QWORD PTR [r13+80]
-        mov	r10, QWORD PTR [r12+88]
-        mov	QWORD PTR [r12+80], r9
-        sbb	r10, QWORD PTR [r13+88]
-        mov	rax, QWORD PTR [r12+96]
-        mov	QWORD PTR [r12+88], r10
-        sbb	rax, QWORD PTR [r13+96]
-        mov	r9, QWORD PTR [r12+104]
-        mov	QWORD PTR [r12+96], rax
-        sbb	r9, QWORD PTR [r13+104]
-        mov	r10, QWORD PTR [r12+112]
-        mov	QWORD PTR [r12+104], r9
-        sbb	r10, QWORD PTR [r13+112]
-        mov	rax, QWORD PTR [r12+120]
-        mov	QWORD PTR [r12+112], r10
-        sbb	rax, QWORD PTR [r13+120]
-        mov	r9, QWORD PTR [r12+128]
-        mov	QWORD PTR [r12+120], rax
-        sbb	r9, QWORD PTR [r13+128]
-        mov	r10, QWORD PTR [r12+136]
-        mov	QWORD PTR [r12+128], r9
-        sbb	r10, QWORD PTR [r13+136]
-        mov	rax, QWORD PTR [r12+144]
-        mov	QWORD PTR [r12+136], r10
-        sbb	rax, QWORD PTR [r13+144]
-        mov	r9, QWORD PTR [r12+152]
-        mov	QWORD PTR [r12+144], rax
-        sbb	r9, QWORD PTR [r13+152]
-        mov	r10, QWORD PTR [r12+160]
-        mov	QWORD PTR [r12+152], r9
-        sbb	r10, QWORD PTR [r13+160]
-        mov	rax, QWORD PTR [r12+168]
-        mov	QWORD PTR [r12+160], r10
-        sbb	rax, QWORD PTR [r13+168]
-        mov	r9, QWORD PTR [r12+176]
-        mov	QWORD PTR [r12+168], rax
-        sbb	r9, QWORD PTR [r13+176]
-        mov	r10, QWORD PTR [r12+184]
-        mov	QWORD PTR [r12+176], r9
-        sbb	r10, QWORD PTR [r13+184]
-        mov	rax, QWORD PTR [r12+192]
-        mov	QWORD PTR [r12+184], r10
-        sbb	rax, QWORD PTR [r13+192]
-        mov	r9, QWORD PTR [r12+200]
-        mov	QWORD PTR [r12+192], rax
-        sbb	r9, QWORD PTR [r13+200]
-        mov	r10, QWORD PTR [r12+208]
-        mov	QWORD PTR [r12+200], r9
-        sbb	r10, QWORD PTR [r13+208]
-        mov	rax, QWORD PTR [r12+216]
-        mov	QWORD PTR [r12+208], r10
-        sbb	rax, QWORD PTR [r13+216]
-        mov	r9, QWORD PTR [r12+224]
-        mov	QWORD PTR [r12+216], rax
-        sbb	r9, QWORD PTR [r13+224]
-        mov	r10, QWORD PTR [r12+232]
-        mov	QWORD PTR [r12+224], r9
-        sbb	r10, QWORD PTR [r13+232]
-        mov	rax, QWORD PTR [r12+240]
-        mov	QWORD PTR [r12+232], r10
-        sbb	rax, QWORD PTR [r13+240]
-        mov	r9, QWORD PTR [r12+248]
-        mov	QWORD PTR [r12+240], rax
-        sbb	r9, QWORD PTR [r13+248]
-        mov	QWORD PTR [r12+248], r9
-        sbb	r11, 0
-        mov	rax, QWORD PTR [r12]
-        sub	rax, QWORD PTR [rcx]
-        mov	r9, QWORD PTR [r12+8]
-        mov	QWORD PTR [r12], rax
-        sbb	r9, QWORD PTR [rcx+8]
-        mov	r10, QWORD PTR [r12+16]
-        mov	QWORD PTR [r12+8], r9
-        sbb	r10, QWORD PTR [rcx+16]
-        mov	rax, QWORD PTR [r12+24]
-        mov	QWORD PTR [r12+16], r10
-        sbb	rax, QWORD PTR [rcx+24]
-        mov	r9, QWORD PTR [r12+32]
-        mov	QWORD PTR [r12+24], rax
-        sbb	r9, QWORD PTR [rcx+32]
-        mov	r10, QWORD PTR [r12+40]
-        mov	QWORD PTR [r12+32], r9
-        sbb	r10, QWORD PTR [rcx+40]
-        mov	rax, QWORD PTR [r12+48]
-        mov	QWORD PTR [r12+40], r10
-        sbb	rax, QWORD PTR [rcx+48]
-        mov	r9, QWORD PTR [r12+56]
-        mov	QWORD PTR [r12+48], rax
-        sbb	r9, QWORD PTR [rcx+56]
-        mov	r10, QWORD PTR [r12+64]
-        mov	QWORD PTR [r12+56], r9
-        sbb	r10, QWORD PTR [rcx+64]
-        mov	rax, QWORD PTR [r12+72]
-        mov	QWORD PTR [r12+64], r10
-        sbb	rax, QWORD PTR [rcx+72]
-        mov	r9, QWORD PTR [r12+80]
-        mov	QWORD PTR [r12+72], rax
-        sbb	r9, QWORD PTR [rcx+80]
-        mov	r10, QWORD PTR [r12+88]
-        mov	QWORD PTR [r12+80], r9
-        sbb	r10, QWORD PTR [rcx+88]
-        mov	rax, QWORD PTR [r12+96]
-        mov	QWORD PTR [r12+88], r10
-        sbb	rax, QWORD PTR [rcx+96]
-        mov	r9, QWORD PTR [r12+104]
-        mov	QWORD PTR [r12+96], rax
-        sbb	r9, QWORD PTR [rcx+104]
-        mov	r10, QWORD PTR [r12+112]
-        mov	QWORD PTR [r12+104], r9
-        sbb	r10, QWORD PTR [rcx+112]
-        mov	rax, QWORD PTR [r12+120]
-        mov	QWORD PTR [r12+112], r10
-        sbb	rax, QWORD PTR [rcx+120]
-        mov	r9, QWORD PTR [r12+128]
-        mov	QWORD PTR [r12+120], rax
-        sbb	r9, QWORD PTR [rcx+128]
-        mov	r10, QWORD PTR [r12+136]
-        mov	QWORD PTR [r12+128], r9
-        sbb	r10, QWORD PTR [rcx+136]
-        mov	rax, QWORD PTR [r12+144]
-        mov	QWORD PTR [r12+136], r10
-        sbb	rax, QWORD PTR [rcx+144]
-        mov	r9, QWORD PTR [r12+152]
-        mov	QWORD PTR [r12+144], rax
-        sbb	r9, QWORD PTR [rcx+152]
-        mov	r10, QWORD PTR [r12+160]
-        mov	QWORD PTR [r12+152], r9
-        sbb	r10, QWORD PTR [rcx+160]
-        mov	rax, QWORD PTR [r12+168]
-        mov	QWORD PTR [r12+160], r10
-        sbb	rax, QWORD PTR [rcx+168]
-        mov	r9, QWORD PTR [r12+176]
-        mov	QWORD PTR [r12+168], rax
-        sbb	r9, QWORD PTR [rcx+176]
-        mov	r10, QWORD PTR [r12+184]
-        mov	QWORD PTR [r12+176], r9
-        sbb	r10, QWORD PTR [rcx+184]
-        mov	rax, QWORD PTR [r12+192]
-        mov	QWORD PTR [r12+184], r10
-        sbb	rax, QWORD PTR [rcx+192]
-        mov	r9, QWORD PTR [r12+200]
-        mov	QWORD PTR [r12+192], rax
-        sbb	r9, QWORD PTR [rcx+200]
-        mov	r10, QWORD PTR [r12+208]
-        mov	QWORD PTR [r12+200], r9
-        sbb	r10, QWORD PTR [rcx+208]
-        mov	rax, QWORD PTR [r12+216]
-        mov	QWORD PTR [r12+208], r10
-        sbb	rax, QWORD PTR [rcx+216]
-        mov	r9, QWORD PTR [r12+224]
-        mov	QWORD PTR [r12+216], rax
-        sbb	r9, QWORD PTR [rcx+224]
-        mov	r10, QWORD PTR [r12+232]
-        mov	QWORD PTR [r12+224], r9
-        sbb	r10, QWORD PTR [rcx+232]
-        mov	rax, QWORD PTR [r12+240]
-        mov	QWORD PTR [r12+232], r10
-        sbb	rax, QWORD PTR [rcx+240]
-        mov	r9, QWORD PTR [r12+248]
-        mov	QWORD PTR [r12+240], rax
-        sbb	r9, QWORD PTR [rcx+248]
-        mov	QWORD PTR [r12+248], r9
-        sbb	r11, 0
-        sub	rsi, 128
-        ; Add
-        mov	rax, QWORD PTR [rsi]
-        add	rax, QWORD PTR [r12]
-        mov	r9, QWORD PTR [rsi+8]
-        mov	QWORD PTR [rsi], rax
-        adc	r9, QWORD PTR [r12+8]
-        mov	r10, QWORD PTR [rsi+16]
-        mov	QWORD PTR [rsi+8], r9
-        adc	r10, QWORD PTR [r12+16]
-        mov	rax, QWORD PTR [rsi+24]
-        mov	QWORD PTR [rsi+16], r10
-        adc	rax, QWORD PTR [r12+24]
-        mov	r9, QWORD PTR [rsi+32]
-        mov	QWORD PTR [rsi+24], rax
-        adc	r9, QWORD PTR [r12+32]
-        mov	r10, QWORD PTR [rsi+40]
-        mov	QWORD PTR [rsi+32], r9
-        adc	r10, QWORD PTR [r12+40]
-        mov	rax, QWORD PTR [rsi+48]
-        mov	QWORD PTR [rsi+40], r10
-        adc	rax, QWORD PTR [r12+48]
-        mov	r9, QWORD PTR [rsi+56]
-        mov	QWORD PTR [rsi+48], rax
-        adc	r9, QWORD PTR [r12+56]
-        mov	r10, QWORD PTR [rsi+64]
-        mov	QWORD PTR [rsi+56], r9
-        adc	r10, QWORD PTR [r12+64]
-        mov	rax, QWORD PTR [rsi+72]
-        mov	QWORD PTR [rsi+64], r10
-        adc	rax, QWORD PTR [r12+72]
-        mov	r9, QWORD PTR [rsi+80]
-        mov	QWORD PTR [rsi+72], rax
-        adc	r9, QWORD PTR [r12+80]
-        mov	r10, QWORD PTR [rsi+88]
-        mov	QWORD PTR [rsi+80], r9
-        adc	r10, QWORD PTR [r12+88]
-        mov	rax, QWORD PTR [rsi+96]
-        mov	QWORD PTR [rsi+88], r10
-        adc	rax, QWORD PTR [r12+96]
-        mov	r9, QWORD PTR [rsi+104]
-        mov	QWORD PTR [rsi+96], rax
-        adc	r9, QWORD PTR [r12+104]
-        mov	r10, QWORD PTR [rsi+112]
-        mov	QWORD PTR [rsi+104], r9
-        adc	r10, QWORD PTR [r12+112]
-        mov	rax, QWORD PTR [rsi+120]
-        mov	QWORD PTR [rsi+112], r10
-        adc	rax, QWORD PTR [r12+120]
-        mov	r9, QWORD PTR [rsi+128]
-        mov	QWORD PTR [rsi+120], rax
-        adc	r9, QWORD PTR [r12+128]
-        mov	r10, QWORD PTR [rsi+136]
-        mov	QWORD PTR [rsi+128], r9
-        adc	r10, QWORD PTR [r12+136]
-        mov	rax, QWORD PTR [rsi+144]
-        mov	QWORD PTR [rsi+136], r10
-        adc	rax, QWORD PTR [r12+144]
-        mov	r9, QWORD PTR [rsi+152]
-        mov	QWORD PTR [rsi+144], rax
-        adc	r9, QWORD PTR [r12+152]
-        mov	r10, QWORD PTR [rsi+160]
-        mov	QWORD PTR [rsi+152], r9
-        adc	r10, QWORD PTR [r12+160]
-        mov	rax, QWORD PTR [rsi+168]
-        mov	QWORD PTR [rsi+160], r10
-        adc	rax, QWORD PTR [r12+168]
-        mov	r9, QWORD PTR [rsi+176]
-        mov	QWORD PTR [rsi+168], rax
-        adc	r9, QWORD PTR [r12+176]
-        mov	r10, QWORD PTR [rsi+184]
-        mov	QWORD PTR [rsi+176], r9
-        adc	r10, QWORD PTR [r12+184]
-        mov	rax, QWORD PTR [rsi+192]
-        mov	QWORD PTR [rsi+184], r10
-        adc	rax, QWORD PTR [r12+192]
-        mov	r9, QWORD PTR [rsi+200]
-        mov	QWORD PTR [rsi+192], rax
-        adc	r9, QWORD PTR [r12+200]
-        mov	r10, QWORD PTR [rsi+208]
-        mov	QWORD PTR [rsi+200], r9
-        adc	r10, QWORD PTR [r12+208]
-        mov	rax, QWORD PTR [rsi+216]
-        mov	QWORD PTR [rsi+208], r10
-        adc	rax, QWORD PTR [r12+216]
-        mov	r9, QWORD PTR [rsi+224]
-        mov	QWORD PTR [rsi+216], rax
-        adc	r9, QWORD PTR [r12+224]
-        mov	r10, QWORD PTR [rsi+232]
-        mov	QWORD PTR [rsi+224], r9
-        adc	r10, QWORD PTR [r12+232]
-        mov	rax, QWORD PTR [rsi+240]
-        mov	QWORD PTR [rsi+232], r10
-        adc	rax, QWORD PTR [r12+240]
-        mov	r9, QWORD PTR [rsi+248]
-        mov	QWORD PTR [rsi+240], rax
-        adc	r9, QWORD PTR [r12+248]
-        mov	QWORD PTR [rsi+248], r9
-        adc	r11, 0
-        mov	QWORD PTR [rcx+384], r11
-        add	rsi, 128
-        ; Add
-        mov	rax, QWORD PTR [rsi]
-        add	rax, QWORD PTR [r13]
-        mov	r9, QWORD PTR [rsi+8]
-        mov	QWORD PTR [rsi], rax
-        adc	r9, QWORD PTR [r13+8]
-        mov	r10, QWORD PTR [rsi+16]
-        mov	QWORD PTR [rsi+8], r9
-        adc	r10, QWORD PTR [r13+16]
-        mov	rax, QWORD PTR [rsi+24]
-        mov	QWORD PTR [rsi+16], r10
-        adc	rax, QWORD PTR [r13+24]
-        mov	r9, QWORD PTR [rsi+32]
-        mov	QWORD PTR [rsi+24], rax
-        adc	r9, QWORD PTR [r13+32]
-        mov	r10, QWORD PTR [rsi+40]
-        mov	QWORD PTR [rsi+32], r9
-        adc	r10, QWORD PTR [r13+40]
-        mov	rax, QWORD PTR [rsi+48]
-        mov	QWORD PTR [rsi+40], r10
-        adc	rax, QWORD PTR [r13+48]
-        mov	r9, QWORD PTR [rsi+56]
-        mov	QWORD PTR [rsi+48], rax
-        adc	r9, QWORD PTR [r13+56]
-        mov	r10, QWORD PTR [rsi+64]
-        mov	QWORD PTR [rsi+56], r9
-        adc	r10, QWORD PTR [r13+64]
-        mov	rax, QWORD PTR [rsi+72]
-        mov	QWORD PTR [rsi+64], r10
-        adc	rax, QWORD PTR [r13+72]
-        mov	r9, QWORD PTR [rsi+80]
-        mov	QWORD PTR [rsi+72], rax
-        adc	r9, QWORD PTR [r13+80]
-        mov	r10, QWORD PTR [rsi+88]
-        mov	QWORD PTR [rsi+80], r9
-        adc	r10, QWORD PTR [r13+88]
-        mov	rax, QWORD PTR [rsi+96]
-        mov	QWORD PTR [rsi+88], r10
-        adc	rax, QWORD PTR [r13+96]
-        mov	r9, QWORD PTR [rsi+104]
-        mov	QWORD PTR [rsi+96], rax
-        adc	r9, QWORD PTR [r13+104]
-        mov	r10, QWORD PTR [rsi+112]
-        mov	QWORD PTR [rsi+104], r9
-        adc	r10, QWORD PTR [r13+112]
-        mov	rax, QWORD PTR [rsi+120]
-        mov	QWORD PTR [rsi+112], r10
-        adc	rax, QWORD PTR [r13+120]
-        mov	r9, QWORD PTR [rsi+128]
-        mov	QWORD PTR [rsi+120], rax
-        adc	r9, QWORD PTR [r13+128]
-        mov	QWORD PTR [rsi+128], r9
-        ; Add to zero
-        mov	rax, QWORD PTR [r13+136]
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+144]
-        mov	QWORD PTR [rsi+136], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+152]
-        mov	QWORD PTR [rsi+144], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+160]
-        mov	QWORD PTR [rsi+152], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+168]
-        mov	QWORD PTR [rsi+160], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+176]
-        mov	QWORD PTR [rsi+168], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+184]
-        mov	QWORD PTR [rsi+176], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+192]
-        mov	QWORD PTR [rsi+184], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+200]
-        mov	QWORD PTR [rsi+192], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+208]
-        mov	QWORD PTR [rsi+200], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+216]
-        mov	QWORD PTR [rsi+208], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+224]
-        mov	QWORD PTR [rsi+216], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+232]
-        mov	QWORD PTR [rsi+224], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+240]
-        mov	QWORD PTR [rsi+232], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+248]
-        mov	QWORD PTR [rsi+240], r9
-        adc	r10, 0
-        mov	QWORD PTR [rsi+248], r10
-        add	rsp, 808
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_2048_mul_avx2_32 ENDP
-_text ENDS
-ENDIF
-; /* Square a and put result in r. (r = a * a)
-;  * Reads 16 quadwords of a ([a]..[a+120]) and writes 32 quadwords of r ([r]..[r+248]).
-;  * r  A single precision integer. Read from rcx on entry.
-;  * a  A single precision integer. Read from rdx on entry and copied to r8.
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_sqr_16 PROC
-        push	r12  ; r12-r14 are used as scratch below and restored before ret
-        push	r13
-        push	r14
-        mov	r8, rdx  ; r8 = a, because rdx is overwritten by every mul
-        sub	rsp, 128  ; stack buffer for result words 0..15; presumably lets r alias a - confirm with callers
-        ; A[0] * A[0]
-        mov	rax, QWORD PTR [r8]
-        mul	rax  ; rdx:rax = rax * rax
-        xor	r11, r11
-        mov	QWORD PTR [rsp], rax  ; result word 0 (buffered on the stack)
-        mov	r10, rdx  ; r10/r11/r9 take turns as the three-word column accumulator
-        ; A[0] * A[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r8]
-        xor	r9, r9
-        add	r10, rax  ; a cross product a[i]*a[j] (i != j) is added once here...
-        adc	r11, rdx
-        adc	r9, 0
-        add	r10, rax  ; ...and a second time here, since it occurs twice in the square
-        adc	r11, rdx
-        adc	r9, 0
-        mov	QWORD PTR [rsp+8], r10  ; result word 1
-        ; A[0] * A[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r8]
-        xor	r10, r10
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        ; A[1] * A[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	rax
-        add	r11, rax  ; a square term a[i]*a[i] is added only once
-        adc	r9, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rsp+16], r11  ; result word 2
-        ; A[0] * A[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r8]
-        xor	r11, r11
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[1] * A[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r8+8]
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rsp+24], r9  ; result word 3
-        ; A[0] * A[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r8]
-        xor	r9, r9
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        ; A[1] * A[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r8+8]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        ; A[2] * A[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	rax
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        mov	QWORD PTR [rsp+32], r10  ; result word 4
-        ; A[0] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8]
-        xor	r10, r10
-        xor	r14, r14
-        mov	r12, rax  ; from this column on, cross products are first summed in r14:r13:r12
-        mov	r13, rdx
-        ; A[1] * A[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12  ; double the whole cross-product sum once
-        adc	r13, r13
-        adc	r14, r14
-        add	r11, r12  ; fold the doubled sum into the column accumulator
-        adc	r9, r13
-        adc	r10, r14
-        mov	QWORD PTR [rsp+40], r11  ; result word 5
-        ; A[0] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8]
-        xor	r11, r11
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	rax
-        add	r12, r12  ; double the cross products only (the square is still in rdx:rax)
-        adc	r13, r13
-        adc	r14, r14
-        add	r12, rax  ; then add the square term once, undoubled
-        adc	r13, rdx
-        adc	r14, 0
-        add	r9, r12
-        adc	r10, r13
-        adc	r11, r14
-        mov	QWORD PTR [rsp+48], r9  ; result word 6
-        ; A[0] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8]
-        xor	r9, r9
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r10, r12
-        adc	r11, r13
-        adc	r9, r14
-        mov	QWORD PTR [rsp+56], r10  ; result word 7
-        ; A[0] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8]
-        xor	r10, r10
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * A[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	rax
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r11, r12
-        adc	r9, r13
-        adc	r10, r14
-        mov	QWORD PTR [rsp+64], r11  ; result word 8
-        ; A[0] * A[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r8]
-        xor	r11, r11
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8+32]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r9, r12
-        adc	r10, r13
-        adc	r11, r14
-        mov	QWORD PTR [rsp+72], r9  ; result word 9
-        ; A[0] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8]
-        xor	r9, r9
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8+32]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[5] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	rax
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r10, r12
-        adc	r11, r13
-        adc	r9, r14
-        mov	QWORD PTR [rsp+80], r10  ; result word 10
-        ; A[0] * A[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8]
-        xor	r10, r10
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+32]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[5] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8+40]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r11, r12
-        adc	r9, r13
-        adc	r10, r14
-        mov	QWORD PTR [rsp+88], r11  ; result word 11
-        ; A[0] * A[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r8]
-        xor	r11, r11
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+32]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[5] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+40]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[6] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	rax
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r9, r12
-        adc	r10, r13
-        adc	r11, r14
-        mov	QWORD PTR [rsp+96], r9  ; result word 12
-        ; A[0] * A[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r8]
-        xor	r9, r9
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * A[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r8+32]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[5] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+40]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[6] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+48]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r10, r12
-        adc	r11, r13
-        adc	r9, r14
-        mov	QWORD PTR [rsp+104], r10  ; result word 13
-        ; A[0] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r8]
-        xor	r10, r10
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8+32]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[5] * A[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r8+40]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[6] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+48]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[7] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	rax
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r11, r12
-        adc	r9, r13
-        adc	r10, r14
-        mov	QWORD PTR [rsp+112], r11  ; result word 14
-        ; A[0] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8]
-        xor	r11, r11
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * A[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+32]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[5] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8+40]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[6] * A[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r8+48]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[7] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+56]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r9, r12
-        adc	r10, r13
-        adc	r11, r14
-        mov	QWORD PTR [rsp+120], r9  ; result word 15 (last one buffered on the stack)
-        ; A[1] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8+8]
-        xor	r9, r9
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[2] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * A[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r8+32]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[5] * A[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+40]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[6] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8+48]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[7] * A[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r8+56]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[8] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	rax
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r10, r12
-        adc	r11, r13
-        adc	r9, r14
-        mov	QWORD PTR [rcx+128], r10  ; result word 16; words 16..31 are stored straight into r
-        ; A[2] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8+16]
-        xor	r10, r10
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[3] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * A[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r8+32]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[5] * A[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r8+40]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[6] * A[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+48]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[7] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8+56]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[8] * A[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r8+64]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r11, r12
-        adc	r9, r13
-        adc	r10, r14
-        mov	QWORD PTR [rcx+136], r11  ; result word 17
-        ; A[3] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8+24]
-        xor	r11, r11
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[4] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r8+32]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[5] * A[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r8+40]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[6] * A[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r8+48]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[7] * A[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+56]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[8] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8+64]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[9] * A[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	rax
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r9, r12
-        adc	r10, r13
-        adc	r11, r14
-        mov	QWORD PTR [rcx+144], r9  ; result word 18
-        ; A[4] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8+32]
-        xor	r9, r9
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[5] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r8+40]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[6] * A[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r8+48]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[7] * A[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r8+56]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[8] * A[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+64]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[9] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8+72]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r10, r12
-        adc	r11, r13
-        adc	r9, r14
-        mov	QWORD PTR [rcx+152], r10  ; result word 19
-        ; A[5] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8+40]
-        xor	r10, r10
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[6] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r8+48]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[7] * A[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r8+56]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[8] * A[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r8+64]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[9] * A[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+72]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[10] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	rax
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r11, r12
-        adc	r9, r13
-        adc	r10, r14
-        mov	QWORD PTR [rcx+160], r11  ; result word 20
-        ; A[6] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8+48]
-        xor	r11, r11
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[7] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r8+56]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[8] * A[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r8+64]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[9] * A[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r8+72]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[10] * A[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+80]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r9, r12
-        adc	r10, r13
-        adc	r11, r14
-        mov	QWORD PTR [rcx+168], r9  ; result word 21
-        ; A[7] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8+56]
-        xor	r9, r9
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[8] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r8+64]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[9] * A[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r8+72]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[10] * A[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r8+80]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[11] * A[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	rax
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r10, r12
-        adc	r11, r13
-        adc	r9, r14
-        mov	QWORD PTR [rcx+176], r10  ; result word 22
-        ; A[8] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8+64]
-        xor	r10, r10
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[9] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r8+72]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[10] * A[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r8+80]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[11] * A[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r8+88]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r11, r12
-        adc	r9, r13
-        adc	r10, r14
-        mov	QWORD PTR [rcx+184], r11  ; result word 23
-        ; A[9] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8+72]
-        xor	r11, r11
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[10] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r8+80]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[11] * A[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r8+88]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[12] * A[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	rax
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r9, r12
-        adc	r10, r13
-        adc	r11, r14
-        mov	QWORD PTR [rcx+192], r9  ; result word 24
-        ; A[10] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8+80]
-        xor	r9, r9
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[11] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r8+88]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[12] * A[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r8+96]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r10, r12
-        adc	r11, r13
-        adc	r9, r14
-        mov	QWORD PTR [rcx+200], r10  ; result word 25
-        ; A[11] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8+88]
-        xor	r10, r10
-        add	r11, rax  ; the remaining columns add each cross product twice directly again
-        adc	r9, rdx
-        adc	r10, 0
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        ; A[12] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r8+96]
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        ; A[13] * A[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	rax
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rcx+208], r11  ; result word 26
-        ; A[12] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8+96]
-        xor	r11, r11
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[13] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r8+104]
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rcx+216], r9  ; result word 27
-        ; A[13] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8+104]
-        xor	r9, r9
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        ; A[14] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	rax
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        mov	QWORD PTR [rcx+224], r10  ; result word 28
-        ; A[14] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8+112]
-        xor	r10, r10
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rcx+232], r11  ; result word 29
-        ; A[15] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	rax
-        add	r9, rax
-        adc	r10, rdx  ; last column: only two words are kept, no third carry word
-        mov	QWORD PTR [rcx+240], r9  ; result word 30
-        mov	QWORD PTR [rcx+248], r10  ; result word 31
-        mov	rax, QWORD PTR [rsp]  ; all reads of a are done: copy buffered words 0..15 from the stack into r, four per group
-        mov	rdx, QWORD PTR [rsp+8]
-        mov	r12, QWORD PTR [rsp+16]
-        mov	r13, QWORD PTR [rsp+24]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], rdx
-        mov	QWORD PTR [rcx+16], r12
-        mov	QWORD PTR [rcx+24], r13
-        mov	rax, QWORD PTR [rsp+32]
-        mov	rdx, QWORD PTR [rsp+40]
-        mov	r12, QWORD PTR [rsp+48]
-        mov	r13, QWORD PTR [rsp+56]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], rdx
-        mov	QWORD PTR [rcx+48], r12
-        mov	QWORD PTR [rcx+56], r13
-        mov	rax, QWORD PTR [rsp+64]
-        mov	rdx, QWORD PTR [rsp+72]
-        mov	r12, QWORD PTR [rsp+80]
-        mov	r13, QWORD PTR [rsp+88]
-        mov	QWORD PTR [rcx+64], rax
-        mov	QWORD PTR [rcx+72], rdx
-        mov	QWORD PTR [rcx+80], r12
-        mov	QWORD PTR [rcx+88], r13
-        mov	rax, QWORD PTR [rsp+96]
-        mov	rdx, QWORD PTR [rsp+104]
-        mov	r12, QWORD PTR [rsp+112]
-        mov	r13, QWORD PTR [rsp+120]
-        mov	QWORD PTR [rcx+96], rax
-        mov	QWORD PTR [rcx+104], rdx
-        mov	QWORD PTR [rcx+112], r12
-        mov	QWORD PTR [rcx+120], r13
-        add	rsp, 128  ; release the 128-byte stack buffer
-        pop	r14  ; restore r14, r13, r12 in reverse push order
-        pop	r13
-        pop	r12
-        ret
-sp_2048_sqr_16 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Square a and put result in r. (r = a * a)
-;  *
-;  * r  A single precision integer.
-;  * a  A single precision integer.
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_sqr_avx2_16 PROC
-        push	rbp
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        push	rbx
-        mov	r8, rcx
-        mov	r9, rdx
-        sub	rsp, 128
-        cmp	r9, r8
-        mov	rbp, rsp
-        cmovne	rbp, r8
-        add	r8, 128
-        xor	r13, r13
-        ; Diagonal 1
-        ; Zero into %r9
-        ; Zero into %r10
-        ; A[1] x A[0]
-        mov	rdx, QWORD PTR [r9]
-        mulx	r11, r10, QWORD PTR [r9+8]
-        ; A[2] x A[0]
-        mulx	r12, rax, QWORD PTR [r9+16]
-        adcx	r11, rax
-        adox	r12, r13
-        mov	QWORD PTR [rbp+8], r10
-        mov	QWORD PTR [rbp+16], r11
-        ; Zero into %r8
-        ; Zero into %r9
-        ; A[3] x A[0]
-        mulx	r10, rax, QWORD PTR [r9+24]
-        adcx	r12, rax
-        adox	r10, r13
-        ; A[4] x A[0]
-        mulx	r11, rax, QWORD PTR [r9+32]
-        adcx	r10, rax
-        adox	r11, r13
-        mov	QWORD PTR [rbp+24], r12
-        mov	QWORD PTR [rbp+32], r10
-        ; Zero into %r10
-        ; Zero into %r8
-        ; A[5] x A[0]
-        mulx	r12, rax, QWORD PTR [r9+40]
-        adcx	r11, rax
-        adox	r12, r13
-        ; A[6] x A[0]
-        mulx	r10, rax, QWORD PTR [r9+48]
-        adcx	r12, rax
-        adox	r10, r13
-        mov	QWORD PTR [rbp+40], r11
-        mov	QWORD PTR [rbp+48], r12
-        ; Zero into %r9
-        ; Zero into %r10
-        ; A[7] x A[0]
-        mulx	r11, rax, QWORD PTR [r9+56]
-        adcx	r10, rax
-        adox	r11, r13
-        ; A[8] x A[0]
-        mulx	r12, rax, QWORD PTR [r9+64]
-        adcx	r11, rax
-        adox	r12, r13
-        mov	QWORD PTR [rbp+56], r10
-        mov	QWORD PTR [rbp+64], r11
-        ; Zero into %r8
-        ; Zero into %r9
-        ; A[9] x A[0]
-        mulx	r10, rax, QWORD PTR [r9+72]
-        adcx	r12, rax
-        adox	r10, r13
-        ; A[10] x A[0]
-        mulx	r11, rax, QWORD PTR [r9+80]
-        adcx	r10, rax
-        adox	r11, r13
-        mov	QWORD PTR [rbp+72], r12
-        mov	QWORD PTR [rbp+80], r10
-        ; No load %r13 - %r10
-        ; A[11] x A[0]
-        mulx	r15, rax, QWORD PTR [r9+88]
-        adcx	r11, rax
-        adox	r15, r13
-        ; A[12] x A[0]
-        mulx	rdi, rax, QWORD PTR [r9+96]
-        adcx	r15, rax
-        adox	rdi, r13
-        mov	QWORD PTR [rbp+88], r11
-        ; No store %r13 - %r10
-        ; No load %r15 - %r9
-        ; A[13] x A[0]
-        mulx	rsi, rax, QWORD PTR [r9+104]
-        adcx	rdi, rax
-        adox	rsi, r13
-        ; A[14] x A[0]
-        mulx	rbx, rax, QWORD PTR [r9+112]
-        adcx	rsi, rax
-        adox	rbx, r13
-        ; No store %r14 - %r8
-        ; No store %r15 - %r9
-        ; Zero into %r8
-        ; Zero into %r9
-        ; A[15] x A[0]
-        mulx	r10, rax, QWORD PTR [r9+120]
-        adcx	rbx, rax
-        adox	r10, r13
-        ; No store %rbx - %r10
-        ;  Carry
-        adcx	r10, r13
-        mov	r14, r13
-        adcx	r14, r13
-        adox	r14, r13
-        mov	QWORD PTR [r8], r10
-        ; Diagonal 2
-        mov	r10, QWORD PTR [rbp+24]
-        mov	r11, QWORD PTR [rbp+32]
-        mov	r12, QWORD PTR [rbp+40]
-        ; A[2] x A[1]
-        mov	rdx, QWORD PTR [r9+8]
-        mulx	rcx, rax, QWORD PTR [r9+16]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[3] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+24]
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbp+24], r10
-        mov	QWORD PTR [rbp+32], r11
-        mov	r10, QWORD PTR [rbp+48]
-        mov	r11, QWORD PTR [rbp+56]
-        ; A[4] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+32]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[5] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+40]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbp+40], r12
-        mov	QWORD PTR [rbp+48], r10
-        mov	r12, QWORD PTR [rbp+64]
-        mov	r10, QWORD PTR [rbp+72]
-        ; A[6] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+48]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[7] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+56]
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbp+56], r11
-        mov	QWORD PTR [rbp+64], r12
-        mov	r11, QWORD PTR [rbp+80]
-        mov	r12, QWORD PTR [rbp+88]
-        ; A[8] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+64]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[9] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+72]
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbp+72], r10
-        mov	QWORD PTR [rbp+80], r11
-        ; No load %r13 - %r8
-        ; A[10] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+80]
-        adcx	r12, rax
-        adox	r15, rcx
-        ; A[11] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+88]
-        adcx	r15, rax
-        adox	rdi, rcx
-        mov	QWORD PTR [rbp+88], r12
-        ; No store %r13 - %r8
-        ; No load %r15 - %r10
-        ; A[12] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+96]
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; A[13] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+104]
-        adcx	rsi, rax
-        adox	rbx, rcx
-        ; No store %r14 - %r9
-        ; No store %r15 - %r10
-        mov	r11, QWORD PTR [r8]
-        ; Zero into %r10
-        ; A[14] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+112]
-        adcx	rbx, rax
-        adox	r11, rcx
-        ; A[15] x A[1]
-        mulx	r12, rax, QWORD PTR [r9+120]
-        adcx	r11, rax
-        adox	r12, r13
-        ; No store %rbx - %r8
-        mov	QWORD PTR [r8], r11
-        ; Zero into %r8
-        ; Zero into %r9
-        ; A[15] x A[2]
-        mov	rdx, QWORD PTR [r9+16]
-        mulx	r10, rax, QWORD PTR [r9+120]
-        adcx	r12, rax
-        adox	r10, r13
-        mov	QWORD PTR [r8+8], r12
-        ;  Carry
-        adcx	r10, r14
-        mov	r14, r13
-        adcx	r14, r13
-        adox	r14, r13
-        mov	QWORD PTR [r8+16], r10
-        ; Diagonal 3
-        mov	r10, QWORD PTR [rbp+40]
-        mov	r11, QWORD PTR [rbp+48]
-        mov	r12, QWORD PTR [rbp+56]
-        ; A[3] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+24]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[4] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+32]
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbp+40], r10
-        mov	QWORD PTR [rbp+48], r11
-        mov	r10, QWORD PTR [rbp+64]
-        mov	r11, QWORD PTR [rbp+72]
-        ; A[5] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+40]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[6] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+48]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbp+56], r12
-        mov	QWORD PTR [rbp+64], r10
-        mov	r12, QWORD PTR [rbp+80]
-        mov	r10, QWORD PTR [rbp+88]
-        ; A[7] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+56]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[8] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+64]
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbp+72], r11
-        mov	QWORD PTR [rbp+80], r12
-        ; No load %r13 - %r9
-        ; A[9] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+72]
-        adcx	r10, rax
-        adox	r15, rcx
-        ; A[10] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+80]
-        adcx	r15, rax
-        adox	rdi, rcx
-        mov	QWORD PTR [rbp+88], r10
-        ; No store %r13 - %r9
-        ; No load %r15 - %r8
-        ; A[11] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+88]
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; A[12] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+96]
-        adcx	rsi, rax
-        adox	rbx, rcx
-        ; No store %r14 - %r10
-        ; No store %r15 - %r8
-        mov	r12, QWORD PTR [r8]
-        mov	r10, QWORD PTR [r8+8]
-        ; A[13] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+104]
-        adcx	rbx, rax
-        adox	r12, rcx
-        ; A[14] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+112]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; No store %rbx - %r9
-        mov	QWORD PTR [r8], r12
-        mov	r11, QWORD PTR [r8+16]
-        ; Zero into %r10
-        ; A[14] x A[3]
-        mov	rdx, QWORD PTR [r9+24]
-        mulx	rcx, rax, QWORD PTR [r9+112]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[14] x A[4]
-        mov	rdx, QWORD PTR [r9+32]
-        mulx	r12, rax, QWORD PTR [r9+112]
-        adcx	r11, rax
-        adox	r12, r13
-        mov	QWORD PTR [r8+8], r10
-        mov	QWORD PTR [r8+16], r11
-        ; Zero into %r8
-        ; Zero into %r9
-        ; A[14] x A[5]
-        mov	rdx, QWORD PTR [r9+40]
-        mulx	r10, rax, QWORD PTR [r9+112]
-        adcx	r12, rax
-        adox	r10, r13
-        mov	QWORD PTR [r8+24], r12
-        ;  Carry
-        adcx	r10, r14
-        mov	r14, r13
-        adcx	r14, r13
-        adox	r14, r13
-        mov	QWORD PTR [r8+32], r10
-        ; Diagonal 4
-        mov	r10, QWORD PTR [rbp+56]
-        mov	r11, QWORD PTR [rbp+64]
-        mov	r12, QWORD PTR [rbp+72]
-        ; A[4] x A[3]
-        mov	rdx, QWORD PTR [r9+24]
-        mulx	rcx, rax, QWORD PTR [r9+32]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[5] x A[3]
-        mulx	rcx, rax, QWORD PTR [r9+40]
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbp+56], r10
-        mov	QWORD PTR [rbp+64], r11
-        mov	r10, QWORD PTR [rbp+80]
-        mov	r11, QWORD PTR [rbp+88]
-        ; A[6] x A[3]
-        mulx	rcx, rax, QWORD PTR [r9+48]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[7] x A[3]
-        mulx	rcx, rax, QWORD PTR [r9+56]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbp+72], r12
-        mov	QWORD PTR [rbp+80], r10
-        ; No load %r13 - %r10
-        ; A[8] x A[3]
-        mulx	rcx, rax, QWORD PTR [r9+64]
-        adcx	r11, rax
-        adox	r15, rcx
-        ; A[9] x A[3]
-        mulx	rcx, rax, QWORD PTR [r9+72]
-        adcx	r15, rax
-        adox	rdi, rcx
-        mov	QWORD PTR [rbp+88], r11
-        ; No store %r13 - %r10
-        ; No load %r15 - %r9
-        ; A[10] x A[3]
-        mulx	rcx, rax, QWORD PTR [r9+80]
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; A[11] x A[3]
-        mulx	rcx, rax, QWORD PTR [r9+88]
-        adcx	rsi, rax
-        adox	rbx, rcx
-        ; No store %r14 - %r8
-        ; No store %r15 - %r9
-        mov	r10, QWORD PTR [r8]
-        mov	r11, QWORD PTR [r8+8]
-        ; A[12] x A[3]
-        mulx	rcx, rax, QWORD PTR [r9+96]
-        adcx	rbx, rax
-        adox	r10, rcx
-        ; A[13] x A[3]
-        mulx	rcx, rax, QWORD PTR [r9+104]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; No store %rbx - %r10
-        mov	QWORD PTR [r8], r10
-        mov	r12, QWORD PTR [r8+16]
-        mov	r10, QWORD PTR [r8+24]
-        ; A[13] x A[4]
-        mov	rdx, QWORD PTR [r9+32]
-        mulx	rcx, rax, QWORD PTR [r9+104]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[13] x A[5]
-        mov	rdx, QWORD PTR [r9+40]
-        mulx	rcx, rax, QWORD PTR [r9+104]
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+8], r11
-        mov	QWORD PTR [r8+16], r12
-        mov	r11, QWORD PTR [r8+32]
-        ; Zero into %r10
-        ; A[13] x A[6]
-        mov	rdx, QWORD PTR [r9+48]
-        mulx	rcx, rax, QWORD PTR [r9+104]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[13] x A[7]
-        mov	rdx, QWORD PTR [r9+56]
-        mulx	r12, rax, QWORD PTR [r9+104]
-        adcx	r11, rax
-        adox	r12, r13
-        mov	QWORD PTR [r8+24], r10
-        mov	QWORD PTR [r8+32], r11
-        ; Zero into %r8
-        ; Zero into %r9
-        ; A[13] x A[8]
-        mov	rdx, QWORD PTR [r9+64]
-        mulx	r10, rax, QWORD PTR [r9+104]
-        adcx	r12, rax
-        adox	r10, r13
-        mov	QWORD PTR [r8+40], r12
-        ;  Carry
-        adcx	r10, r14
-        mov	r14, r13
-        adcx	r14, r13
-        adox	r14, r13
-        mov	QWORD PTR [r8+48], r10
-        ; Diagonal 5
-        mov	r10, QWORD PTR [rbp+72]
-        mov	r11, QWORD PTR [rbp+80]
-        mov	r12, QWORD PTR [rbp+88]
-        ; A[5] x A[4]
-        mov	rdx, QWORD PTR [r9+32]
-        mulx	rcx, rax, QWORD PTR [r9+40]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[6] x A[4]
-        mulx	rcx, rax, QWORD PTR [r9+48]
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbp+72], r10
-        mov	QWORD PTR [rbp+80], r11
-        ; No load %r13 - %r8
-        ; A[7] x A[4]
-        mulx	rcx, rax, QWORD PTR [r9+56]
-        adcx	r12, rax
-        adox	r15, rcx
-        ; A[8] x A[4]
-        mulx	rcx, rax, QWORD PTR [r9+64]
-        adcx	r15, rax
-        adox	rdi, rcx
-        mov	QWORD PTR [rbp+88], r12
-        ; No store %r13 - %r8
-        ; No load %r15 - %r10
-        ; A[9] x A[4]
-        mulx	rcx, rax, QWORD PTR [r9+72]
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; A[10] x A[4]
-        mulx	rcx, rax, QWORD PTR [r9+80]
-        adcx	rsi, rax
-        adox	rbx, rcx
-        ; No store %r14 - %r9
-        ; No store %r15 - %r10
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        ; A[11] x A[4]
-        mulx	rcx, rax, QWORD PTR [r9+88]
-        adcx	rbx, rax
-        adox	r11, rcx
-        ; A[12] x A[4]
-        mulx	rcx, rax, QWORD PTR [r9+96]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; No store %rbx - %r8
-        mov	QWORD PTR [r8], r11
-        mov	r10, QWORD PTR [r8+16]
-        mov	r11, QWORD PTR [r8+24]
-        ; A[12] x A[5]
-        mov	rdx, QWORD PTR [r9+40]
-        mulx	rcx, rax, QWORD PTR [r9+96]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[12] x A[6]
-        mov	rdx, QWORD PTR [r9+48]
-        mulx	rcx, rax, QWORD PTR [r9+96]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+8], r12
-        mov	QWORD PTR [r8+16], r10
-        mov	r12, QWORD PTR [r8+32]
-        mov	r10, QWORD PTR [r8+40]
-        ; A[12] x A[7]
-        mov	rdx, QWORD PTR [r9+56]
-        mulx	rcx, rax, QWORD PTR [r9+96]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[12] x A[8]
-        mov	rdx, QWORD PTR [r9+64]
-        mulx	rcx, rax, QWORD PTR [r9+96]
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+24], r11
-        mov	QWORD PTR [r8+32], r12
-        mov	r11, QWORD PTR [r8+48]
-        ; Zero into %r10
-        ; A[12] x A[9]
-        mov	rdx, QWORD PTR [r9+72]
-        mulx	rcx, rax, QWORD PTR [r9+96]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[12] x A[10]
-        mov	rdx, QWORD PTR [r9+80]
-        mulx	r12, rax, QWORD PTR [r9+96]
-        adcx	r11, rax
-        adox	r12, r13
-        mov	QWORD PTR [r8+40], r10
-        mov	QWORD PTR [r8+48], r11
-        ; Zero into %r8
-        ; Zero into %r9
-        ; A[12] x A[11]
-        mov	rdx, QWORD PTR [r9+88]
-        mulx	r10, rax, QWORD PTR [r9+96]
-        adcx	r12, rax
-        adox	r10, r13
-        mov	QWORD PTR [r8+56], r12
-        ;  Carry
-        adcx	r10, r14
-        mov	r14, r13
-        adcx	r14, r13
-        adox	r14, r13
-        mov	QWORD PTR [r8+64], r10
-        ; Diagonal 6
-        mov	r10, QWORD PTR [rbp+88]
-        ; No load %r13 - %r9
-        ; A[6] x A[5]
-        mov	rdx, QWORD PTR [r9+40]
-        mulx	rcx, rax, QWORD PTR [r9+48]
-        adcx	r10, rax
-        adox	r15, rcx
-        ; A[7] x A[5]
-        mulx	rcx, rax, QWORD PTR [r9+56]
-        adcx	r15, rax
-        adox	rdi, rcx
-        mov	QWORD PTR [rbp+88], r10
-        ; No store %r13 - %r9
-        ; No load %r15 - %r8
-        ; A[8] x A[5]
-        mulx	rcx, rax, QWORD PTR [r9+64]
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; A[9] x A[5]
-        mulx	rcx, rax, QWORD PTR [r9+72]
-        adcx	rsi, rax
-        adox	rbx, rcx
-        ; No store %r14 - %r10
-        ; No store %r15 - %r8
-        mov	r12, QWORD PTR [r8]
-        mov	r10, QWORD PTR [r8+8]
-        ; A[10] x A[5]
-        mulx	rcx, rax, QWORD PTR [r9+80]
-        adcx	rbx, rax
-        adox	r12, rcx
-        ; A[11] x A[5]
-        mulx	rcx, rax, QWORD PTR [r9+88]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; No store %rbx - %r9
-        mov	QWORD PTR [r8], r12
-        mov	r11, QWORD PTR [r8+16]
-        mov	r12, QWORD PTR [r8+24]
-        ; A[11] x A[6]
-        mov	rdx, QWORD PTR [r9+48]
-        mulx	rcx, rax, QWORD PTR [r9+88]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[11] x A[7]
-        mov	rdx, QWORD PTR [r9+56]
-        mulx	rcx, rax, QWORD PTR [r9+88]
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r8+8], r10
-        mov	QWORD PTR [r8+16], r11
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        ; A[11] x A[8]
-        mov	rdx, QWORD PTR [r9+64]
-        mulx	rcx, rax, QWORD PTR [r9+88]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[11] x A[9]
-        mov	rdx, QWORD PTR [r9+72]
-        mulx	rcx, rax, QWORD PTR [r9+88]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+24], r12
-        mov	QWORD PTR [r8+32], r10
-        mov	r12, QWORD PTR [r8+48]
-        mov	r10, QWORD PTR [r8+56]
-        ; A[11] x A[10]
-        mov	rdx, QWORD PTR [r9+80]
-        mulx	rcx, rax, QWORD PTR [r9+88]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[13] x A[9]
-        mov	rdx, QWORD PTR [r9+72]
-        mulx	rcx, rax, QWORD PTR [r9+104]
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+40], r11
-        mov	QWORD PTR [r8+48], r12
-        mov	r11, QWORD PTR [r8+64]
-        ; Zero into %r10
-        ; A[13] x A[10]
-        mov	rdx, QWORD PTR [r9+80]
-        mulx	rcx, rax, QWORD PTR [r9+104]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[13] x A[11]
-        mov	rdx, QWORD PTR [r9+88]
-        mulx	r12, rax, QWORD PTR [r9+104]
-        adcx	r11, rax
-        adox	r12, r13
-        mov	QWORD PTR [r8+56], r10
-        mov	QWORD PTR [r8+64], r11
-        ; Zero into %r8
-        ; Zero into %r9
-        ; A[13] x A[12]
-        mov	rdx, QWORD PTR [r9+96]
-        mulx	r10, rax, QWORD PTR [r9+104]
-        adcx	r12, rax
-        adox	r10, r13
-        mov	QWORD PTR [r8+72], r12
-        ;  Carry
-        adcx	r10, r14
-        mov	r14, r13
-        adcx	r14, r13
-        adox	r14, r13
-        mov	QWORD PTR [r8+80], r10
-        ; Diagonal 7
-        ; No load %r15 - %r9
-        ; A[7] x A[6]
-        mov	rdx, QWORD PTR [r9+48]
-        mulx	rcx, rax, QWORD PTR [r9+56]
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; A[8] x A[6]
-        mulx	rcx, rax, QWORD PTR [r9+64]
-        adcx	rsi, rax
-        adox	rbx, rcx
-        ; No store %r14 - %r8
-        ; No store %r15 - %r9
-        mov	r10, QWORD PTR [r8]
-        mov	r11, QWORD PTR [r8+8]
-        ; A[9] x A[6]
-        mulx	rcx, rax, QWORD PTR [r9+72]
-        adcx	rbx, rax
-        adox	r10, rcx
-        ; A[10] x A[6]
-        mulx	rcx, rax, QWORD PTR [r9+80]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; No store %rbx - %r10
-        mov	QWORD PTR [r8], r10
-        mov	r12, QWORD PTR [r8+16]
-        mov	r10, QWORD PTR [r8+24]
-        ; A[10] x A[7]
-        mov	rdx, QWORD PTR [r9+56]
-        mulx	rcx, rax, QWORD PTR [r9+80]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[10] x A[8]
-        mov	rdx, QWORD PTR [r9+64]
-        mulx	rcx, rax, QWORD PTR [r9+80]
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+8], r11
-        mov	QWORD PTR [r8+16], r12
-        mov	r11, QWORD PTR [r8+32]
-        mov	r12, QWORD PTR [r8+40]
-        ; A[10] x A[9]
-        mov	rdx, QWORD PTR [r9+72]
-        mulx	rcx, rax, QWORD PTR [r9+80]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[14] x A[6]
-        mov	rdx, QWORD PTR [r9+48]
-        mulx	rcx, rax, QWORD PTR [r9+112]
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r8+24], r10
-        mov	QWORD PTR [r8+32], r11
-        mov	r10, QWORD PTR [r8+48]
-        mov	r11, QWORD PTR [r8+56]
-        ; A[14] x A[7]
-        mov	rdx, QWORD PTR [r9+56]
-        mulx	rcx, rax, QWORD PTR [r9+112]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[14] x A[8]
-        mov	rdx, QWORD PTR [r9+64]
-        mulx	rcx, rax, QWORD PTR [r9+112]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+40], r12
-        mov	QWORD PTR [r8+48], r10
-        mov	r12, QWORD PTR [r8+64]
-        mov	r10, QWORD PTR [r8+72]
-        ; A[14] x A[9]
-        mov	rdx, QWORD PTR [r9+72]
-        mulx	rcx, rax, QWORD PTR [r9+112]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[14] x A[10]
-        mov	rdx, QWORD PTR [r9+80]
-        mulx	rcx, rax, QWORD PTR [r9+112]
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+56], r11
-        mov	QWORD PTR [r8+64], r12
-        mov	r11, QWORD PTR [r8+80]
-        ; Zero into %r10
-        ; A[14] x A[11]
-        mov	rdx, QWORD PTR [r9+88]
-        mulx	rcx, rax, QWORD PTR [r9+112]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[14] x A[12]
-        mov	rdx, QWORD PTR [r9+96]
-        mulx	r12, rax, QWORD PTR [r9+112]
-        adcx	r11, rax
-        adox	r12, r13
-        mov	QWORD PTR [r8+72], r10
-        mov	QWORD PTR [r8+80], r11
-        ; Zero into %r8
-        ; Zero into %r9
-        ; A[14] x A[13]
-        mov	rdx, QWORD PTR [r9+104]
-        mulx	r10, rax, QWORD PTR [r9+112]
-        adcx	r12, rax
-        adox	r10, r13
-        mov	QWORD PTR [r8+88], r12
-        ;  Carry
-        adcx	r10, r14
-        mov	r14, r13
-        adcx	r14, r13
-        adox	r14, r13
-        mov	QWORD PTR [r8+96], r10
-        ; Diagonal 8
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        ; A[8] x A[7]
-        mov	rdx, QWORD PTR [r9+56]
-        mulx	rcx, rax, QWORD PTR [r9+64]
-        adcx	rbx, rax
-        adox	r11, rcx
-        ; A[9] x A[7]
-        mulx	rcx, rax, QWORD PTR [r9+72]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; No store %rbx - %r8
-        mov	QWORD PTR [r8], r11
-        mov	r10, QWORD PTR [r8+16]
-        mov	r11, QWORD PTR [r8+24]
-        ; A[9] x A[8]
-        mov	rdx, QWORD PTR [r9+64]
-        mulx	rcx, rax, QWORD PTR [r9+72]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[15] x A[3]
-        mov	rdx, QWORD PTR [r9+24]
-        mulx	rcx, rax, QWORD PTR [r9+120]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+8], r12
-        mov	QWORD PTR [r8+16], r10
-        mov	r12, QWORD PTR [r8+32]
-        mov	r10, QWORD PTR [r8+40]
-        ; A[15] x A[4]
-        mov	rdx, QWORD PTR [r9+32]
-        mulx	rcx, rax, QWORD PTR [r9+120]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[15] x A[5]
-        mov	rdx, QWORD PTR [r9+40]
-        mulx	rcx, rax, QWORD PTR [r9+120]
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+24], r11
-        mov	QWORD PTR [r8+32], r12
-        mov	r11, QWORD PTR [r8+48]
-        mov	r12, QWORD PTR [r8+56]
-        ; A[15] x A[6]
-        mov	rdx, QWORD PTR [r9+48]
-        mulx	rcx, rax, QWORD PTR [r9+120]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[15] x A[7]
-        mov	rdx, QWORD PTR [r9+56]
-        mulx	rcx, rax, QWORD PTR [r9+120]
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r8+40], r10
-        mov	QWORD PTR [r8+48], r11
-        mov	r10, QWORD PTR [r8+64]
-        mov	r11, QWORD PTR [r8+72]
-        ; A[15] x A[8]
-        mov	rdx, QWORD PTR [r9+64]
-        mulx	rcx, rax, QWORD PTR [r9+120]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[15] x A[9]
-        mov	rdx, QWORD PTR [r9+72]
-        mulx	rcx, rax, QWORD PTR [r9+120]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+56], r12
-        mov	QWORD PTR [r8+64], r10
-        mov	r12, QWORD PTR [r8+80]
-        mov	r10, QWORD PTR [r8+88]
-        ; A[15] x A[10]
-        mov	rdx, QWORD PTR [r9+80]
-        mulx	rcx, rax, QWORD PTR [r9+120]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[15] x A[11]
-        mov	rdx, QWORD PTR [r9+88]
-        mulx	rcx, rax, QWORD PTR [r9+120]
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+72], r11
-        mov	QWORD PTR [r8+80], r12
-        mov	r11, QWORD PTR [r8+96]
-        ; Zero into %r10
-        ; A[15] x A[12]
-        mov	rdx, QWORD PTR [r9+96]
-        mulx	rcx, rax, QWORD PTR [r9+120]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[15] x A[13]
-        mov	rdx, QWORD PTR [r9+104]
-        mulx	r12, rax, QWORD PTR [r9+120]
-        adcx	r11, rax
-        adox	r12, r13
-        mov	QWORD PTR [r8+88], r10
-        mov	QWORD PTR [r8+96], r11
-        ; Zero into %r8
-        ; Zero into %r9
-        ; A[15] x A[14]
-        mov	rdx, QWORD PTR [r9+112]
-        mulx	r10, rax, QWORD PTR [r9+120]
-        adcx	r12, rax
-        adox	r10, r13
-        mov	QWORD PTR [r8+104], r12
-        ;  Carry
-        adcx	r10, r14
-        mov	r14, r13
-        adcx	r14, r13
-        adox	r14, r13
-        mov	QWORD PTR [r8+112], r10
-        mov	QWORD PTR [r8+120], r14
-        ; Double and Add in A[i] x A[i]
-        mov	r11, QWORD PTR [rbp+8]
-        ; A[0] x A[0]
-        mov	rdx, QWORD PTR [r9]
-        mulx	rcx, rax, rdx
-        mov	QWORD PTR [rbp], rax
-        adox	r11, r11
-        adcx	r11, rcx
-        mov	QWORD PTR [rbp+8], r11
-        mov	r10, QWORD PTR [rbp+16]
-        mov	r11, QWORD PTR [rbp+24]
-        ; A[1] x A[1]
-        mov	rdx, QWORD PTR [r9+8]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [rbp+16], r10
-        mov	QWORD PTR [rbp+24], r11
-        mov	r10, QWORD PTR [rbp+32]
-        mov	r11, QWORD PTR [rbp+40]
-        ; A[2] x A[2]
-        mov	rdx, QWORD PTR [r9+16]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [rbp+32], r10
-        mov	QWORD PTR [rbp+40], r11
-        mov	r10, QWORD PTR [rbp+48]
-        mov	r11, QWORD PTR [rbp+56]
-        ; A[3] x A[3]
-        mov	rdx, QWORD PTR [r9+24]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [rbp+48], r10
-        mov	QWORD PTR [rbp+56], r11
-        mov	r10, QWORD PTR [rbp+64]
-        mov	r11, QWORD PTR [rbp+72]
-        ; A[4] x A[4]
-        mov	rdx, QWORD PTR [r9+32]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [rbp+64], r10
-        mov	QWORD PTR [rbp+72], r11
-        mov	r10, QWORD PTR [rbp+80]
-        mov	r11, QWORD PTR [rbp+88]
-        ; A[5] x A[5]
-        mov	rdx, QWORD PTR [r9+40]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [rbp+80], r10
-        mov	QWORD PTR [rbp+88], r11
-        ; A[6] x A[6]
-        mov	rdx, QWORD PTR [r9+48]
-        mulx	rcx, rax, rdx
-        adox	r15, r15
-        adox	rdi, rdi
-        adcx	r15, rax
-        adcx	rdi, rcx
-        ; A[7] x A[7]
-        mov	rdx, QWORD PTR [r9+56]
-        mulx	rcx, rax, rdx
-        adox	rsi, rsi
-        adox	rbx, rbx
-        adcx	rsi, rax
-        adcx	rbx, rcx
-        mov	r10, QWORD PTR [r8]
-        mov	r11, QWORD PTR [r8+8]
-        ; A[8] x A[8]
-        mov	rdx, QWORD PTR [r9+64]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [r8], r10
-        mov	QWORD PTR [r8+8], r11
-        mov	r10, QWORD PTR [r8+16]
-        mov	r11, QWORD PTR [r8+24]
-        ; A[9] x A[9]
-        mov	rdx, QWORD PTR [r9+72]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [r8+16], r10
-        mov	QWORD PTR [r8+24], r11
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        ; A[10] x A[10]
-        mov	rdx, QWORD PTR [r9+80]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [r8+32], r10
-        mov	QWORD PTR [r8+40], r11
-        mov	r10, QWORD PTR [r8+48]
-        mov	r11, QWORD PTR [r8+56]
-        ; A[11] x A[11]
-        mov	rdx, QWORD PTR [r9+88]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [r8+48], r10
-        mov	QWORD PTR [r8+56], r11
-        mov	r10, QWORD PTR [r8+64]
-        mov	r11, QWORD PTR [r8+72]
-        ; A[12] x A[12]
-        mov	rdx, QWORD PTR [r9+96]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [r8+64], r10
-        mov	QWORD PTR [r8+72], r11
-        mov	r10, QWORD PTR [r8+80]
-        mov	r11, QWORD PTR [r8+88]
-        ; A[13] x A[13]
-        mov	rdx, QWORD PTR [r9+104]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [r8+80], r10
-        mov	QWORD PTR [r8+88], r11
-        mov	r10, QWORD PTR [r8+96]
-        mov	r11, QWORD PTR [r8+104]
-        ; A[14] x A[14]
-        mov	rdx, QWORD PTR [r9+112]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [r8+96], r10
-        mov	QWORD PTR [r8+104], r11
-        mov	r10, QWORD PTR [r8+112]
-        mov	r11, QWORD PTR [r8+120]
-        ; A[15] x A[15]
-        mov	rdx, QWORD PTR [r9+120]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [r8+112], r10
-        mov	QWORD PTR [r8+120], r11
-        mov	QWORD PTR [r8+-32], r15
-        mov	QWORD PTR [r8+-24], rdi
-        mov	QWORD PTR [r8+-16], rsi
-        mov	QWORD PTR [r8+-8], rbx
-        sub	r8, 128
-        cmp	r9, r8
-        jne	L_end_2048_sqr_avx2_16
-        vmovdqu	xmm0, OWORD PTR [rbp]
-        vmovups	OWORD PTR [r8], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbp+16]
-        vmovups	OWORD PTR [r8+16], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbp+32]
-        vmovups	OWORD PTR [r8+32], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbp+48]
-        vmovups	OWORD PTR [r8+48], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbp+64]
-        vmovups	OWORD PTR [r8+64], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbp+80]
-        vmovups	OWORD PTR [r8+80], xmm0
-L_end_2048_sqr_avx2_16:
-        add	rsp, 128
-        pop	rbx
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        pop	rbp
-        ret
-sp_2048_sqr_avx2_16 ENDP
-_text ENDS
-ENDIF
-; /* Square a and put result in r. (r = a * a)
-;  *
-;  * Karatsuba: ah^2, al^2, (al - ah)^2
-;  *
-;  * r  A single precision integer.
-;  * a  A single precision integer.
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_sqr_32 PROC
-        sub	rsp, 272
-        mov	QWORD PTR [rsp+256], rcx
-        mov	QWORD PTR [rsp+264], rdx
-        mov	r9, 0
-        mov	r10, rsp
-        lea	r11, QWORD PTR [rdx+128]
-        mov	rax, QWORD PTR [rdx]
-        sub	rax, QWORD PTR [r11]
-        mov	r8, QWORD PTR [rdx+8]
-        mov	QWORD PTR [r10], rax
-        sbb	r8, QWORD PTR [r11+8]
-        mov	rax, QWORD PTR [rdx+16]
-        mov	QWORD PTR [r10+8], r8
-        sbb	rax, QWORD PTR [r11+16]
-        mov	r8, QWORD PTR [rdx+24]
-        mov	QWORD PTR [r10+16], rax
-        sbb	r8, QWORD PTR [r11+24]
-        mov	rax, QWORD PTR [rdx+32]
-        mov	QWORD PTR [r10+24], r8
-        sbb	rax, QWORD PTR [r11+32]
-        mov	r8, QWORD PTR [rdx+40]
-        mov	QWORD PTR [r10+32], rax
-        sbb	r8, QWORD PTR [r11+40]
-        mov	rax, QWORD PTR [rdx+48]
-        mov	QWORD PTR [r10+40], r8
-        sbb	rax, QWORD PTR [r11+48]
-        mov	r8, QWORD PTR [rdx+56]
-        mov	QWORD PTR [r10+48], rax
-        sbb	r8, QWORD PTR [r11+56]
-        mov	rax, QWORD PTR [rdx+64]
-        mov	QWORD PTR [r10+56], r8
-        sbb	rax, QWORD PTR [r11+64]
-        mov	r8, QWORD PTR [rdx+72]
-        mov	QWORD PTR [r10+64], rax
-        sbb	r8, QWORD PTR [r11+72]
-        mov	rax, QWORD PTR [rdx+80]
-        mov	QWORD PTR [r10+72], r8
-        sbb	rax, QWORD PTR [r11+80]
-        mov	r8, QWORD PTR [rdx+88]
-        mov	QWORD PTR [r10+80], rax
-        sbb	r8, QWORD PTR [r11+88]
-        mov	rax, QWORD PTR [rdx+96]
-        mov	QWORD PTR [r10+88], r8
-        sbb	rax, QWORD PTR [r11+96]
-        mov	r8, QWORD PTR [rdx+104]
-        mov	QWORD PTR [r10+96], rax
-        sbb	r8, QWORD PTR [r11+104]
-        mov	rax, QWORD PTR [rdx+112]
-        mov	QWORD PTR [r10+104], r8
-        sbb	rax, QWORD PTR [r11+112]
-        mov	r8, QWORD PTR [rdx+120]
-        mov	QWORD PTR [r10+112], rax
-        sbb	r8, QWORD PTR [r11+120]
-        mov	QWORD PTR [r10+120], r8
-        sbb	r9, 0
-        ; Cond Negate
-        mov	rax, QWORD PTR [r10]
-        mov	r11, r9
-        xor	rax, r9
-        neg	r11
-        sub	rax, r9
-        mov	r8, QWORD PTR [r10+8]
-        sbb	r11, 0
-        mov	QWORD PTR [r10], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+16]
-        setc	r11b
-        mov	QWORD PTR [r10+8], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+24]
-        setc	r11b
-        mov	QWORD PTR [r10+16], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+32]
-        setc	r11b
-        mov	QWORD PTR [r10+24], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+40]
-        setc	r11b
-        mov	QWORD PTR [r10+32], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+48]
-        setc	r11b
-        mov	QWORD PTR [r10+40], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+56]
-        setc	r11b
-        mov	QWORD PTR [r10+48], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+64]
-        setc	r11b
-        mov	QWORD PTR [r10+56], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+72]
-        setc	r11b
-        mov	QWORD PTR [r10+64], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+80]
-        setc	r11b
-        mov	QWORD PTR [r10+72], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+88]
-        setc	r11b
-        mov	QWORD PTR [r10+80], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+96]
-        setc	r11b
-        mov	QWORD PTR [r10+88], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+104]
-        setc	r11b
-        mov	QWORD PTR [r10+96], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+112]
-        setc	r11b
-        mov	QWORD PTR [r10+104], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+120]
-        setc	r11b
-        mov	QWORD PTR [r10+112], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	QWORD PTR [r10+120], r8
-        mov	rdx, r10
-        mov	rcx, rsp
-        call	sp_2048_sqr_16
-        mov	rdx, QWORD PTR [rsp+264]
-        mov	rcx, QWORD PTR [rsp+256]
-        add	rdx, 128
-        add	rcx, 256
-        call	sp_2048_sqr_16
-        mov	rdx, QWORD PTR [rsp+264]
-        mov	rcx, QWORD PTR [rsp+256]
-        call	sp_2048_sqr_16
-IFDEF _WIN64
-        mov	rdx, QWORD PTR [rsp+264]
-        mov	rcx, QWORD PTR [rsp+256]
-ENDIF
-        mov	rdx, QWORD PTR [rsp+256]
-        lea	r10, QWORD PTR [rsp+128]
-        add	rdx, 384
-        mov	r9, 0
-        mov	r8, QWORD PTR [r10+-128]
-        sub	r8, QWORD PTR [rdx+-128]
-        mov	rax, QWORD PTR [r10+-120]
-        mov	QWORD PTR [r10+-128], r8
-        sbb	rax, QWORD PTR [rdx+-120]
-        mov	r8, QWORD PTR [r10+-112]
-        mov	QWORD PTR [r10+-120], rax
-        sbb	r8, QWORD PTR [rdx+-112]
-        mov	rax, QWORD PTR [r10+-104]
-        mov	QWORD PTR [r10+-112], r8
-        sbb	rax, QWORD PTR [rdx+-104]
-        mov	r8, QWORD PTR [r10+-96]
-        mov	QWORD PTR [r10+-104], rax
-        sbb	r8, QWORD PTR [rdx+-96]
-        mov	rax, QWORD PTR [r10+-88]
-        mov	QWORD PTR [r10+-96], r8
-        sbb	rax, QWORD PTR [rdx+-88]
-        mov	r8, QWORD PTR [r10+-80]
-        mov	QWORD PTR [r10+-88], rax
-        sbb	r8, QWORD PTR [rdx+-80]
-        mov	rax, QWORD PTR [r10+-72]
-        mov	QWORD PTR [r10+-80], r8
-        sbb	rax, QWORD PTR [rdx+-72]
-        mov	r8, QWORD PTR [r10+-64]
-        mov	QWORD PTR [r10+-72], rax
-        sbb	r8, QWORD PTR [rdx+-64]
-        mov	rax, QWORD PTR [r10+-56]
-        mov	QWORD PTR [r10+-64], r8
-        sbb	rax, QWORD PTR [rdx+-56]
-        mov	r8, QWORD PTR [r10+-48]
-        mov	QWORD PTR [r10+-56], rax
-        sbb	r8, QWORD PTR [rdx+-48]
-        mov	rax, QWORD PTR [r10+-40]
-        mov	QWORD PTR [r10+-48], r8
-        sbb	rax, QWORD PTR [rdx+-40]
-        mov	r8, QWORD PTR [r10+-32]
-        mov	QWORD PTR [r10+-40], rax
-        sbb	r8, QWORD PTR [rdx+-32]
-        mov	rax, QWORD PTR [r10+-24]
-        mov	QWORD PTR [r10+-32], r8
-        sbb	rax, QWORD PTR [rdx+-24]
-        mov	r8, QWORD PTR [r10+-16]
-        mov	QWORD PTR [r10+-24], rax
-        sbb	r8, QWORD PTR [rdx+-16]
-        mov	rax, QWORD PTR [r10+-8]
-        mov	QWORD PTR [r10+-16], r8
-        sbb	rax, QWORD PTR [rdx+-8]
-        mov	r8, QWORD PTR [r10]
-        mov	QWORD PTR [r10+-8], rax
-        sbb	r8, QWORD PTR [rdx]
-        mov	rax, QWORD PTR [r10+8]
-        mov	QWORD PTR [r10], r8
-        sbb	rax, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [r10+16]
-        mov	QWORD PTR [r10+8], rax
-        sbb	r8, QWORD PTR [rdx+16]
-        mov	rax, QWORD PTR [r10+24]
-        mov	QWORD PTR [r10+16], r8
-        sbb	rax, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [r10+32]
-        mov	QWORD PTR [r10+24], rax
-        sbb	r8, QWORD PTR [rdx+32]
-        mov	rax, QWORD PTR [r10+40]
-        mov	QWORD PTR [r10+32], r8
-        sbb	rax, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [r10+48]
-        mov	QWORD PTR [r10+40], rax
-        sbb	r8, QWORD PTR [rdx+48]
-        mov	rax, QWORD PTR [r10+56]
-        mov	QWORD PTR [r10+48], r8
-        sbb	rax, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [r10+64]
-        mov	QWORD PTR [r10+56], rax
-        sbb	r8, QWORD PTR [rdx+64]
-        mov	rax, QWORD PTR [r10+72]
-        mov	QWORD PTR [r10+64], r8
-        sbb	rax, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [r10+80]
-        mov	QWORD PTR [r10+72], rax
-        sbb	r8, QWORD PTR [rdx+80]
-        mov	rax, QWORD PTR [r10+88]
-        mov	QWORD PTR [r10+80], r8
-        sbb	rax, QWORD PTR [rdx+88]
-        mov	r8, QWORD PTR [r10+96]
-        mov	QWORD PTR [r10+88], rax
-        sbb	r8, QWORD PTR [rdx+96]
-        mov	rax, QWORD PTR [r10+104]
-        mov	QWORD PTR [r10+96], r8
-        sbb	rax, QWORD PTR [rdx+104]
-        mov	r8, QWORD PTR [r10+112]
-        mov	QWORD PTR [r10+104], rax
-        sbb	r8, QWORD PTR [rdx+112]
-        mov	rax, QWORD PTR [r10+120]
-        mov	QWORD PTR [r10+112], r8
-        sbb	rax, QWORD PTR [rdx+120]
-        mov	QWORD PTR [r10+120], rax
-        sbb	r9, 0
-        sub	rdx, 256
-        mov	r8, QWORD PTR [r10+-128]
-        sub	r8, QWORD PTR [rdx+-128]
-        mov	rax, QWORD PTR [r10+-120]
-        mov	QWORD PTR [r10+-128], r8
-        sbb	rax, QWORD PTR [rdx+-120]
-        mov	r8, QWORD PTR [r10+-112]
-        mov	QWORD PTR [r10+-120], rax
-        sbb	r8, QWORD PTR [rdx+-112]
-        mov	rax, QWORD PTR [r10+-104]
-        mov	QWORD PTR [r10+-112], r8
-        sbb	rax, QWORD PTR [rdx+-104]
-        mov	r8, QWORD PTR [r10+-96]
-        mov	QWORD PTR [r10+-104], rax
-        sbb	r8, QWORD PTR [rdx+-96]
-        mov	rax, QWORD PTR [r10+-88]
-        mov	QWORD PTR [r10+-96], r8
-        sbb	rax, QWORD PTR [rdx+-88]
-        mov	r8, QWORD PTR [r10+-80]
-        mov	QWORD PTR [r10+-88], rax
-        sbb	r8, QWORD PTR [rdx+-80]
-        mov	rax, QWORD PTR [r10+-72]
-        mov	QWORD PTR [r10+-80], r8
-        sbb	rax, QWORD PTR [rdx+-72]
-        mov	r8, QWORD PTR [r10+-64]
-        mov	QWORD PTR [r10+-72], rax
-        sbb	r8, QWORD PTR [rdx+-64]
-        mov	rax, QWORD PTR [r10+-56]
-        mov	QWORD PTR [r10+-64], r8
-        sbb	rax, QWORD PTR [rdx+-56]
-        mov	r8, QWORD PTR [r10+-48]
-        mov	QWORD PTR [r10+-56], rax
-        sbb	r8, QWORD PTR [rdx+-48]
-        mov	rax, QWORD PTR [r10+-40]
-        mov	QWORD PTR [r10+-48], r8
-        sbb	rax, QWORD PTR [rdx+-40]
-        mov	r8, QWORD PTR [r10+-32]
-        mov	QWORD PTR [r10+-40], rax
-        sbb	r8, QWORD PTR [rdx+-32]
-        mov	rax, QWORD PTR [r10+-24]
-        mov	QWORD PTR [r10+-32], r8
-        sbb	rax, QWORD PTR [rdx+-24]
-        mov	r8, QWORD PTR [r10+-16]
-        mov	QWORD PTR [r10+-24], rax
-        sbb	r8, QWORD PTR [rdx+-16]
-        mov	rax, QWORD PTR [r10+-8]
-        mov	QWORD PTR [r10+-16], r8
-        sbb	rax, QWORD PTR [rdx+-8]
-        mov	r8, QWORD PTR [r10]
-        mov	QWORD PTR [r10+-8], rax
-        sbb	r8, QWORD PTR [rdx]
-        mov	rax, QWORD PTR [r10+8]
-        mov	QWORD PTR [r10], r8
-        sbb	rax, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [r10+16]
-        mov	QWORD PTR [r10+8], rax
-        sbb	r8, QWORD PTR [rdx+16]
-        mov	rax, QWORD PTR [r10+24]
-        mov	QWORD PTR [r10+16], r8
-        sbb	rax, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [r10+32]
-        mov	QWORD PTR [r10+24], rax
-        sbb	r8, QWORD PTR [rdx+32]
-        mov	rax, QWORD PTR [r10+40]
-        mov	QWORD PTR [r10+32], r8
-        sbb	rax, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [r10+48]
-        mov	QWORD PTR [r10+40], rax
-        sbb	r8, QWORD PTR [rdx+48]
-        mov	rax, QWORD PTR [r10+56]
-        mov	QWORD PTR [r10+48], r8
-        sbb	rax, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [r10+64]
-        mov	QWORD PTR [r10+56], rax
-        sbb	r8, QWORD PTR [rdx+64]
-        mov	rax, QWORD PTR [r10+72]
-        mov	QWORD PTR [r10+64], r8
-        sbb	rax, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [r10+80]
-        mov	QWORD PTR [r10+72], rax
-        sbb	r8, QWORD PTR [rdx+80]
-        mov	rax, QWORD PTR [r10+88]
-        mov	QWORD PTR [r10+80], r8
-        sbb	rax, QWORD PTR [rdx+88]
-        mov	r8, QWORD PTR [r10+96]
-        mov	QWORD PTR [r10+88], rax
-        sbb	r8, QWORD PTR [rdx+96]
-        mov	rax, QWORD PTR [r10+104]
-        mov	QWORD PTR [r10+96], r8
-        sbb	rax, QWORD PTR [rdx+104]
-        mov	r8, QWORD PTR [r10+112]
-        mov	QWORD PTR [r10+104], rax
-        sbb	r8, QWORD PTR [rdx+112]
-        mov	rax, QWORD PTR [r10+120]
-        mov	QWORD PTR [r10+112], r8
-        sbb	rax, QWORD PTR [rdx+120]
-        mov	QWORD PTR [r10+120], rax
-        sbb	r9, 0
-        mov	rcx, QWORD PTR [rsp+256]
-        neg	r9
-        add	rcx, 256
-        mov	r8, QWORD PTR [rcx+-128]
-        sub	r8, QWORD PTR [r10+-128]
-        mov	rax, QWORD PTR [rcx+-120]
-        mov	QWORD PTR [rcx+-128], r8
-        sbb	rax, QWORD PTR [r10+-120]
-        mov	r8, QWORD PTR [rcx+-112]
-        mov	QWORD PTR [rcx+-120], rax
-        sbb	r8, QWORD PTR [r10+-112]
-        mov	rax, QWORD PTR [rcx+-104]
-        mov	QWORD PTR [rcx+-112], r8
-        sbb	rax, QWORD PTR [r10+-104]
-        mov	r8, QWORD PTR [rcx+-96]
-        mov	QWORD PTR [rcx+-104], rax
-        sbb	r8, QWORD PTR [r10+-96]
-        mov	rax, QWORD PTR [rcx+-88]
-        mov	QWORD PTR [rcx+-96], r8
-        sbb	rax, QWORD PTR [r10+-88]
-        mov	r8, QWORD PTR [rcx+-80]
-        mov	QWORD PTR [rcx+-88], rax
-        sbb	r8, QWORD PTR [r10+-80]
-        mov	rax, QWORD PTR [rcx+-72]
-        mov	QWORD PTR [rcx+-80], r8
-        sbb	rax, QWORD PTR [r10+-72]
-        mov	r8, QWORD PTR [rcx+-64]
-        mov	QWORD PTR [rcx+-72], rax
-        sbb	r8, QWORD PTR [r10+-64]
-        mov	rax, QWORD PTR [rcx+-56]
-        mov	QWORD PTR [rcx+-64], r8
-        sbb	rax, QWORD PTR [r10+-56]
-        mov	r8, QWORD PTR [rcx+-48]
-        mov	QWORD PTR [rcx+-56], rax
-        sbb	r8, QWORD PTR [r10+-48]
-        mov	rax, QWORD PTR [rcx+-40]
-        mov	QWORD PTR [rcx+-48], r8
-        sbb	rax, QWORD PTR [r10+-40]
-        mov	r8, QWORD PTR [rcx+-32]
-        mov	QWORD PTR [rcx+-40], rax
-        sbb	r8, QWORD PTR [r10+-32]
-        mov	rax, QWORD PTR [rcx+-24]
-        mov	QWORD PTR [rcx+-32], r8
-        sbb	rax, QWORD PTR [r10+-24]
-        mov	r8, QWORD PTR [rcx+-16]
-        mov	QWORD PTR [rcx+-24], rax
-        sbb	r8, QWORD PTR [r10+-16]
-        mov	rax, QWORD PTR [rcx+-8]
-        mov	QWORD PTR [rcx+-16], r8
-        sbb	rax, QWORD PTR [r10+-8]
-        mov	r8, QWORD PTR [rcx]
-        mov	QWORD PTR [rcx+-8], rax
-        sbb	r8, QWORD PTR [r10]
-        mov	rax, QWORD PTR [rcx+8]
-        mov	QWORD PTR [rcx], r8
-        sbb	rax, QWORD PTR [r10+8]
-        mov	r8, QWORD PTR [rcx+16]
-        mov	QWORD PTR [rcx+8], rax
-        sbb	r8, QWORD PTR [r10+16]
-        mov	rax, QWORD PTR [rcx+24]
-        mov	QWORD PTR [rcx+16], r8
-        sbb	rax, QWORD PTR [r10+24]
-        mov	r8, QWORD PTR [rcx+32]
-        mov	QWORD PTR [rcx+24], rax
-        sbb	r8, QWORD PTR [r10+32]
-        mov	rax, QWORD PTR [rcx+40]
-        mov	QWORD PTR [rcx+32], r8
-        sbb	rax, QWORD PTR [r10+40]
-        mov	r8, QWORD PTR [rcx+48]
-        mov	QWORD PTR [rcx+40], rax
-        sbb	r8, QWORD PTR [r10+48]
-        mov	rax, QWORD PTR [rcx+56]
-        mov	QWORD PTR [rcx+48], r8
-        sbb	rax, QWORD PTR [r10+56]
-        mov	r8, QWORD PTR [rcx+64]
-        mov	QWORD PTR [rcx+56], rax
-        sbb	r8, QWORD PTR [r10+64]
-        mov	rax, QWORD PTR [rcx+72]
-        mov	QWORD PTR [rcx+64], r8
-        sbb	rax, QWORD PTR [r10+72]
-        mov	r8, QWORD PTR [rcx+80]
-        mov	QWORD PTR [rcx+72], rax
-        sbb	r8, QWORD PTR [r10+80]
-        mov	rax, QWORD PTR [rcx+88]
-        mov	QWORD PTR [rcx+80], r8
-        sbb	rax, QWORD PTR [r10+88]
-        mov	r8, QWORD PTR [rcx+96]
-        mov	QWORD PTR [rcx+88], rax
-        sbb	r8, QWORD PTR [r10+96]
-        mov	rax, QWORD PTR [rcx+104]
-        mov	QWORD PTR [rcx+96], r8
-        sbb	rax, QWORD PTR [r10+104]
-        mov	r8, QWORD PTR [rcx+112]
-        mov	QWORD PTR [rcx+104], rax
-        sbb	r8, QWORD PTR [r10+112]
-        mov	rax, QWORD PTR [rcx+120]
-        mov	QWORD PTR [rcx+112], r8
-        sbb	rax, QWORD PTR [r10+120]
-        mov	QWORD PTR [rcx+120], rax
-        sbb	r9, 0
-        mov	rcx, QWORD PTR [rsp+256]
-        add	rcx, 384
-        ; Add in word
-        mov	r8, QWORD PTR [rcx]
-        add	r8, r9
-        mov	rax, QWORD PTR [rcx+8]
-        mov	QWORD PTR [rcx], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+16]
-        mov	QWORD PTR [rcx+8], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+24]
-        mov	QWORD PTR [rcx+16], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+32]
-        mov	QWORD PTR [rcx+24], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+40]
-        mov	QWORD PTR [rcx+32], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+48]
-        mov	QWORD PTR [rcx+40], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+56]
-        mov	QWORD PTR [rcx+48], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+64]
-        mov	QWORD PTR [rcx+56], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+72]
-        mov	QWORD PTR [rcx+64], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+80]
-        mov	QWORD PTR [rcx+72], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+88]
-        mov	QWORD PTR [rcx+80], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+96]
-        mov	QWORD PTR [rcx+88], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+104]
-        mov	QWORD PTR [rcx+96], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+112]
-        mov	QWORD PTR [rcx+104], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+120]
-        mov	QWORD PTR [rcx+112], r8
-        adc	rax, 0
-        mov	QWORD PTR [rcx+120], rax
-        mov	rdx, QWORD PTR [rsp+264]
-        mov	rcx, QWORD PTR [rsp+256]
-        add	rsp, 272
-        ret
-sp_2048_sqr_32 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Square a and put result in r. (r = a * a)
-;  * a is read as two halves of 16 64-bit words each: al = a[0..15], ah = a[16..31].
-;  * Karatsuba: ah^2, al^2, (al - ah)^2 - each obtained by a call to sp_2048_sqr_avx2_16, then combined with word-wise sub/sbb and add/adc chains.
-;  * Uses rax, r8, r9, r10, r11 as scratch and 272 bytes of stack (256-byte scratch buffer at [rsp] plus the saved r and a pointers).
-;  * r  A single precision integer. Passed in rcx; result destination (the last store here is to r + 504).
-;  * a  A single precision integer. Passed in rdx; this routine itself only loads from it.
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_sqr_avx2_32 PROC
-        sub	rsp, 272  ; 256-byte scratch buffer at [rsp] + two 8-byte save slots
-        mov	QWORD PTR [rsp+256], rcx  ; save r
-        mov	QWORD PTR [rsp+264], rdx  ; save a
-        mov	r9, 0  ; r9 will capture the borrow of al - ah
-        mov	r10, rsp  ; r10 -> scratch buffer, receives the 16-word difference
-        lea	r11, QWORD PTR [rdx+128]  ; r11 -> ah (upper 16 words of a)
-        mov	rax, QWORD PTR [rdx]
-        sub	rax, QWORD PTR [r11]  ; word 0 of al - ah starts the borrow chain; the interleaved mov's leave CF intact
-        mov	r8, QWORD PTR [rdx+8]
-        mov	QWORD PTR [r10], rax
-        sbb	r8, QWORD PTR [r11+8]
-        mov	rax, QWORD PTR [rdx+16]
-        mov	QWORD PTR [r10+8], r8
-        sbb	rax, QWORD PTR [r11+16]
-        mov	r8, QWORD PTR [rdx+24]
-        mov	QWORD PTR [r10+16], rax
-        sbb	r8, QWORD PTR [r11+24]
-        mov	rax, QWORD PTR [rdx+32]
-        mov	QWORD PTR [r10+24], r8
-        sbb	rax, QWORD PTR [r11+32]
-        mov	r8, QWORD PTR [rdx+40]
-        mov	QWORD PTR [r10+32], rax
-        sbb	r8, QWORD PTR [r11+40]
-        mov	rax, QWORD PTR [rdx+48]
-        mov	QWORD PTR [r10+40], r8
-        sbb	rax, QWORD PTR [r11+48]
-        mov	r8, QWORD PTR [rdx+56]
-        mov	QWORD PTR [r10+48], rax
-        sbb	r8, QWORD PTR [r11+56]
-        mov	rax, QWORD PTR [rdx+64]
-        mov	QWORD PTR [r10+56], r8
-        sbb	rax, QWORD PTR [r11+64]
-        mov	r8, QWORD PTR [rdx+72]
-        mov	QWORD PTR [r10+64], rax
-        sbb	r8, QWORD PTR [r11+72]
-        mov	rax, QWORD PTR [rdx+80]
-        mov	QWORD PTR [r10+72], r8
-        sbb	rax, QWORD PTR [r11+80]
-        mov	r8, QWORD PTR [rdx+88]
-        mov	QWORD PTR [r10+80], rax
-        sbb	r8, QWORD PTR [r11+88]
-        mov	rax, QWORD PTR [rdx+96]
-        mov	QWORD PTR [r10+88], r8
-        sbb	rax, QWORD PTR [r11+96]
-        mov	r8, QWORD PTR [rdx+104]
-        mov	QWORD PTR [r10+96], rax
-        sbb	r8, QWORD PTR [r11+104]
-        mov	rax, QWORD PTR [rdx+112]
-        mov	QWORD PTR [r10+104], r8
-        sbb	rax, QWORD PTR [r11+112]
-        mov	r8, QWORD PTR [rdx+120]
-        mov	QWORD PTR [r10+112], rax
-        sbb	r8, QWORD PTR [r11+120]
-        mov	QWORD PTR [r10+120], r8
-        sbb	r9, 0  ; r9 = 0 if the subtraction did not borrow, all ones if it did
-        ; Cond Negate: when r9 is all ones, replace the 16 words at [r10] by their two's complement (xor with r9, then +1 with the carry rippled through r11); when r9 is 0 every word is stored back unchanged
-        mov	rax, QWORD PTR [r10]
-        mov	r11, r9
-        xor	rax, r9
-        neg	r11  ; r11 = 1 when r9 is all ones, else 0
-        sub	rax, r9  ; r9 all ones: rax += 1 (subtracting -1), CF set unless that +1 wrapped; r9 = 0: no change, CF clear
-        mov	r8, QWORD PTR [r10+8]
-        sbb	r11, 0  ; r11 = carry into word 1 (1 only if the +1 wrapped word 0)
-        mov	QWORD PTR [r10], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+16]
-        setc	r11b  ; r11 = carry out of this word (r11 is 0 or 1 here, so only its low byte needs writing); same pattern repeats per word
-        mov	QWORD PTR [r10+8], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+24]
-        setc	r11b
-        mov	QWORD PTR [r10+16], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+32]
-        setc	r11b
-        mov	QWORD PTR [r10+24], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+40]
-        setc	r11b
-        mov	QWORD PTR [r10+32], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+48]
-        setc	r11b
-        mov	QWORD PTR [r10+40], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+56]
-        setc	r11b
-        mov	QWORD PTR [r10+48], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+64]
-        setc	r11b
-        mov	QWORD PTR [r10+56], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+72]
-        setc	r11b
-        mov	QWORD PTR [r10+64], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+80]
-        setc	r11b
-        mov	QWORD PTR [r10+72], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+88]
-        setc	r11b
-        mov	QWORD PTR [r10+80], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+96]
-        setc	r11b
-        mov	QWORD PTR [r10+88], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+104]
-        setc	r11b
-        mov	QWORD PTR [r10+96], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+112]
-        setc	r11b
-        mov	QWORD PTR [r10+104], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+120]
-        setc	r11b
-        mov	QWORD PTR [r10+112], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	QWORD PTR [r10+120], r8  ; last word: no carry out is needed
-        mov	rdx, r10  ; 1st call: source = scratch (|al - ah|)
-        mov	rcx, rsp  ; 1st call: destination = scratch as well (in place)
-        call	sp_2048_sqr_avx2_16  ; presumably leaves the 32-word square (al - ah)^2 in the scratch buffer - callee not visible here
-        mov	rdx, QWORD PTR [rsp+264]
-        mov	rcx, QWORD PTR [rsp+256]
-        add	rdx, 128  ; 2nd call: source = ah
-        add	rcx, 256  ; 2nd call: destination = r + 256 bytes (word 32 onward)
-        call	sp_2048_sqr_avx2_16
-        mov	rdx, QWORD PTR [rsp+264]  ; 3rd call: source = al
-        mov	rcx, QWORD PTR [rsp+256]  ; 3rd call: destination = r (word 0 onward)
-        call	sp_2048_sqr_avx2_16
-IFDEF _WIN64
-        mov	rdx, QWORD PTR [rsp+264]  ; reload of a after the call; rdx is overwritten again right after ENDIF
-        mov	rcx, QWORD PTR [rsp+256]  ; reload of r after the call; rcx is loaded again before its next use
-ENDIF
-        mov	rdx, QWORD PTR [rsp+256]
-        lea	r10, QWORD PTR [rsp+128]  ; r10 = scratch + 128, so [r10-128 .. r10+120] spans the 32-word scratch
-        add	rdx, 384  ; rdx = r + 384, so [rdx-128 .. rdx+120] spans r words 32..63
-        mov	r9, 0  ; r9 accumulates the borrows of the next two subtractions
-        mov	r8, QWORD PTR [r10+-128]
-        sub	r8, QWORD PTR [rdx+-128]  ; scratch[0..31] -= r[32..63], 32-word borrow chain
-        mov	rax, QWORD PTR [r10+-120]
-        mov	QWORD PTR [r10+-128], r8
-        sbb	rax, QWORD PTR [rdx+-120]
-        mov	r8, QWORD PTR [r10+-112]
-        mov	QWORD PTR [r10+-120], rax
-        sbb	r8, QWORD PTR [rdx+-112]
-        mov	rax, QWORD PTR [r10+-104]
-        mov	QWORD PTR [r10+-112], r8
-        sbb	rax, QWORD PTR [rdx+-104]
-        mov	r8, QWORD PTR [r10+-96]
-        mov	QWORD PTR [r10+-104], rax
-        sbb	r8, QWORD PTR [rdx+-96]
-        mov	rax, QWORD PTR [r10+-88]
-        mov	QWORD PTR [r10+-96], r8
-        sbb	rax, QWORD PTR [rdx+-88]
-        mov	r8, QWORD PTR [r10+-80]
-        mov	QWORD PTR [r10+-88], rax
-        sbb	r8, QWORD PTR [rdx+-80]
-        mov	rax, QWORD PTR [r10+-72]
-        mov	QWORD PTR [r10+-80], r8
-        sbb	rax, QWORD PTR [rdx+-72]
-        mov	r8, QWORD PTR [r10+-64]
-        mov	QWORD PTR [r10+-72], rax
-        sbb	r8, QWORD PTR [rdx+-64]
-        mov	rax, QWORD PTR [r10+-56]
-        mov	QWORD PTR [r10+-64], r8
-        sbb	rax, QWORD PTR [rdx+-56]
-        mov	r8, QWORD PTR [r10+-48]
-        mov	QWORD PTR [r10+-56], rax
-        sbb	r8, QWORD PTR [rdx+-48]
-        mov	rax, QWORD PTR [r10+-40]
-        mov	QWORD PTR [r10+-48], r8
-        sbb	rax, QWORD PTR [rdx+-40]
-        mov	r8, QWORD PTR [r10+-32]
-        mov	QWORD PTR [r10+-40], rax
-        sbb	r8, QWORD PTR [rdx+-32]
-        mov	rax, QWORD PTR [r10+-24]
-        mov	QWORD PTR [r10+-32], r8
-        sbb	rax, QWORD PTR [rdx+-24]
-        mov	r8, QWORD PTR [r10+-16]
-        mov	QWORD PTR [r10+-24], rax
-        sbb	r8, QWORD PTR [rdx+-16]
-        mov	rax, QWORD PTR [r10+-8]
-        mov	QWORD PTR [r10+-16], r8
-        sbb	rax, QWORD PTR [rdx+-8]
-        mov	r8, QWORD PTR [r10]
-        mov	QWORD PTR [r10+-8], rax
-        sbb	r8, QWORD PTR [rdx]
-        mov	rax, QWORD PTR [r10+8]
-        mov	QWORD PTR [r10], r8
-        sbb	rax, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [r10+16]
-        mov	QWORD PTR [r10+8], rax
-        sbb	r8, QWORD PTR [rdx+16]
-        mov	rax, QWORD PTR [r10+24]
-        mov	QWORD PTR [r10+16], r8
-        sbb	rax, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [r10+32]
-        mov	QWORD PTR [r10+24], rax
-        sbb	r8, QWORD PTR [rdx+32]
-        mov	rax, QWORD PTR [r10+40]
-        mov	QWORD PTR [r10+32], r8
-        sbb	rax, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [r10+48]
-        mov	QWORD PTR [r10+40], rax
-        sbb	r8, QWORD PTR [rdx+48]
-        mov	rax, QWORD PTR [r10+56]
-        mov	QWORD PTR [r10+48], r8
-        sbb	rax, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [r10+64]
-        mov	QWORD PTR [r10+56], rax
-        sbb	r8, QWORD PTR [rdx+64]
-        mov	rax, QWORD PTR [r10+72]
-        mov	QWORD PTR [r10+64], r8
-        sbb	rax, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [r10+80]
-        mov	QWORD PTR [r10+72], rax
-        sbb	r8, QWORD PTR [rdx+80]
-        mov	rax, QWORD PTR [r10+88]
-        mov	QWORD PTR [r10+80], r8
-        sbb	rax, QWORD PTR [rdx+88]
-        mov	r8, QWORD PTR [r10+96]
-        mov	QWORD PTR [r10+88], rax
-        sbb	r8, QWORD PTR [rdx+96]
-        mov	rax, QWORD PTR [r10+104]
-        mov	QWORD PTR [r10+96], r8
-        sbb	rax, QWORD PTR [rdx+104]
-        mov	r8, QWORD PTR [r10+112]
-        mov	QWORD PTR [r10+104], rax
-        sbb	r8, QWORD PTR [rdx+112]
-        mov	rax, QWORD PTR [r10+120]
-        mov	QWORD PTR [r10+112], r8
-        sbb	rax, QWORD PTR [rdx+120]
-        mov	QWORD PTR [r10+120], rax
-        sbb	r9, 0  ; record the borrow of this subtraction (r9 = 0 or -1)
-        sub	rdx, 256  ; rdx = r + 128, so [rdx-128 .. rdx+120] now spans r words 0..31
-        mov	r8, QWORD PTR [r10+-128]
-        sub	r8, QWORD PTR [rdx+-128]  ; scratch[0..31] -= r[0..31], 32-word borrow chain
-        mov	rax, QWORD PTR [r10+-120]
-        mov	QWORD PTR [r10+-128], r8
-        sbb	rax, QWORD PTR [rdx+-120]
-        mov	r8, QWORD PTR [r10+-112]
-        mov	QWORD PTR [r10+-120], rax
-        sbb	r8, QWORD PTR [rdx+-112]
-        mov	rax, QWORD PTR [r10+-104]
-        mov	QWORD PTR [r10+-112], r8
-        sbb	rax, QWORD PTR [rdx+-104]
-        mov	r8, QWORD PTR [r10+-96]
-        mov	QWORD PTR [r10+-104], rax
-        sbb	r8, QWORD PTR [rdx+-96]
-        mov	rax, QWORD PTR [r10+-88]
-        mov	QWORD PTR [r10+-96], r8
-        sbb	rax, QWORD PTR [rdx+-88]
-        mov	r8, QWORD PTR [r10+-80]
-        mov	QWORD PTR [r10+-88], rax
-        sbb	r8, QWORD PTR [rdx+-80]
-        mov	rax, QWORD PTR [r10+-72]
-        mov	QWORD PTR [r10+-80], r8
-        sbb	rax, QWORD PTR [rdx+-72]
-        mov	r8, QWORD PTR [r10+-64]
-        mov	QWORD PTR [r10+-72], rax
-        sbb	r8, QWORD PTR [rdx+-64]
-        mov	rax, QWORD PTR [r10+-56]
-        mov	QWORD PTR [r10+-64], r8
-        sbb	rax, QWORD PTR [rdx+-56]
-        mov	r8, QWORD PTR [r10+-48]
-        mov	QWORD PTR [r10+-56], rax
-        sbb	r8, QWORD PTR [rdx+-48]
-        mov	rax, QWORD PTR [r10+-40]
-        mov	QWORD PTR [r10+-48], r8
-        sbb	rax, QWORD PTR [rdx+-40]
-        mov	r8, QWORD PTR [r10+-32]
-        mov	QWORD PTR [r10+-40], rax
-        sbb	r8, QWORD PTR [rdx+-32]
-        mov	rax, QWORD PTR [r10+-24]
-        mov	QWORD PTR [r10+-32], r8
-        sbb	rax, QWORD PTR [rdx+-24]
-        mov	r8, QWORD PTR [r10+-16]
-        mov	QWORD PTR [r10+-24], rax
-        sbb	r8, QWORD PTR [rdx+-16]
-        mov	rax, QWORD PTR [r10+-8]
-        mov	QWORD PTR [r10+-16], r8
-        sbb	rax, QWORD PTR [rdx+-8]
-        mov	r8, QWORD PTR [r10]
-        mov	QWORD PTR [r10+-8], rax
-        sbb	r8, QWORD PTR [rdx]
-        mov	rax, QWORD PTR [r10+8]
-        mov	QWORD PTR [r10], r8
-        sbb	rax, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [r10+16]
-        mov	QWORD PTR [r10+8], rax
-        sbb	r8, QWORD PTR [rdx+16]
-        mov	rax, QWORD PTR [r10+24]
-        mov	QWORD PTR [r10+16], r8
-        sbb	rax, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [r10+32]
-        mov	QWORD PTR [r10+24], rax
-        sbb	r8, QWORD PTR [rdx+32]
-        mov	rax, QWORD PTR [r10+40]
-        mov	QWORD PTR [r10+32], r8
-        sbb	rax, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [r10+48]
-        mov	QWORD PTR [r10+40], rax
-        sbb	r8, QWORD PTR [rdx+48]
-        mov	rax, QWORD PTR [r10+56]
-        mov	QWORD PTR [r10+48], r8
-        sbb	rax, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [r10+64]
-        mov	QWORD PTR [r10+56], rax
-        sbb	r8, QWORD PTR [rdx+64]
-        mov	rax, QWORD PTR [r10+72]
-        mov	QWORD PTR [r10+64], r8
-        sbb	rax, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [r10+80]
-        mov	QWORD PTR [r10+72], rax
-        sbb	r8, QWORD PTR [rdx+80]
-        mov	rax, QWORD PTR [r10+88]
-        mov	QWORD PTR [r10+80], r8
-        sbb	rax, QWORD PTR [rdx+88]
-        mov	r8, QWORD PTR [r10+96]
-        mov	QWORD PTR [r10+88], rax
-        sbb	r8, QWORD PTR [rdx+96]
-        mov	rax, QWORD PTR [r10+104]
-        mov	QWORD PTR [r10+96], r8
-        sbb	rax, QWORD PTR [rdx+104]
-        mov	r8, QWORD PTR [r10+112]
-        mov	QWORD PTR [r10+104], rax
-        sbb	r8, QWORD PTR [rdx+112]
-        mov	rax, QWORD PTR [r10+120]
-        mov	QWORD PTR [r10+112], r8
-        sbb	rax, QWORD PTR [rdx+120]
-        mov	QWORD PTR [r10+120], rax
-        sbb	r9, 0  ; r9 = -(number of borrows from the two subtractions): 0, -1 or -2
-        mov	rcx, QWORD PTR [rsp+256]
-        neg	r9  ; r9 = number of borrows so far (0..2)
-        add	rcx, 256  ; rcx = r + 256, so [rcx-128 .. rcx+120] spans r words 16..47
-        mov	r8, QWORD PTR [rcx+-128]
-        sub	r8, QWORD PTR [r10+-128]  ; r[16..47] -= scratch[0..31], 32-word borrow chain
-        mov	rax, QWORD PTR [rcx+-120]
-        mov	QWORD PTR [rcx+-128], r8
-        sbb	rax, QWORD PTR [r10+-120]
-        mov	r8, QWORD PTR [rcx+-112]
-        mov	QWORD PTR [rcx+-120], rax
-        sbb	r8, QWORD PTR [r10+-112]
-        mov	rax, QWORD PTR [rcx+-104]
-        mov	QWORD PTR [rcx+-112], r8
-        sbb	rax, QWORD PTR [r10+-104]
-        mov	r8, QWORD PTR [rcx+-96]
-        mov	QWORD PTR [rcx+-104], rax
-        sbb	r8, QWORD PTR [r10+-96]
-        mov	rax, QWORD PTR [rcx+-88]
-        mov	QWORD PTR [rcx+-96], r8
-        sbb	rax, QWORD PTR [r10+-88]
-        mov	r8, QWORD PTR [rcx+-80]
-        mov	QWORD PTR [rcx+-88], rax
-        sbb	r8, QWORD PTR [r10+-80]
-        mov	rax, QWORD PTR [rcx+-72]
-        mov	QWORD PTR [rcx+-80], r8
-        sbb	rax, QWORD PTR [r10+-72]
-        mov	r8, QWORD PTR [rcx+-64]
-        mov	QWORD PTR [rcx+-72], rax
-        sbb	r8, QWORD PTR [r10+-64]
-        mov	rax, QWORD PTR [rcx+-56]
-        mov	QWORD PTR [rcx+-64], r8
-        sbb	rax, QWORD PTR [r10+-56]
-        mov	r8, QWORD PTR [rcx+-48]
-        mov	QWORD PTR [rcx+-56], rax
-        sbb	r8, QWORD PTR [r10+-48]
-        mov	rax, QWORD PTR [rcx+-40]
-        mov	QWORD PTR [rcx+-48], r8
-        sbb	rax, QWORD PTR [r10+-40]
-        mov	r8, QWORD PTR [rcx+-32]
-        mov	QWORD PTR [rcx+-40], rax
-        sbb	r8, QWORD PTR [r10+-32]
-        mov	rax, QWORD PTR [rcx+-24]
-        mov	QWORD PTR [rcx+-32], r8
-        sbb	rax, QWORD PTR [r10+-24]
-        mov	r8, QWORD PTR [rcx+-16]
-        mov	QWORD PTR [rcx+-24], rax
-        sbb	r8, QWORD PTR [r10+-16]
-        mov	rax, QWORD PTR [rcx+-8]
-        mov	QWORD PTR [rcx+-16], r8
-        sbb	rax, QWORD PTR [r10+-8]
-        mov	r8, QWORD PTR [rcx]
-        mov	QWORD PTR [rcx+-8], rax
-        sbb	r8, QWORD PTR [r10]
-        mov	rax, QWORD PTR [rcx+8]
-        mov	QWORD PTR [rcx], r8
-        sbb	rax, QWORD PTR [r10+8]
-        mov	r8, QWORD PTR [rcx+16]
-        mov	QWORD PTR [rcx+8], rax
-        sbb	r8, QWORD PTR [r10+16]
-        mov	rax, QWORD PTR [rcx+24]
-        mov	QWORD PTR [rcx+16], r8
-        sbb	rax, QWORD PTR [r10+24]
-        mov	r8, QWORD PTR [rcx+32]
-        mov	QWORD PTR [rcx+24], rax
-        sbb	r8, QWORD PTR [r10+32]
-        mov	rax, QWORD PTR [rcx+40]
-        mov	QWORD PTR [rcx+32], r8
-        sbb	rax, QWORD PTR [r10+40]
-        mov	r8, QWORD PTR [rcx+48]
-        mov	QWORD PTR [rcx+40], rax
-        sbb	r8, QWORD PTR [r10+48]
-        mov	rax, QWORD PTR [rcx+56]
-        mov	QWORD PTR [rcx+48], r8
-        sbb	rax, QWORD PTR [r10+56]
-        mov	r8, QWORD PTR [rcx+64]
-        mov	QWORD PTR [rcx+56], rax
-        sbb	r8, QWORD PTR [r10+64]
-        mov	rax, QWORD PTR [rcx+72]
-        mov	QWORD PTR [rcx+64], r8
-        sbb	rax, QWORD PTR [r10+72]
-        mov	r8, QWORD PTR [rcx+80]
-        mov	QWORD PTR [rcx+72], rax
-        sbb	r8, QWORD PTR [r10+80]
-        mov	rax, QWORD PTR [rcx+88]
-        mov	QWORD PTR [rcx+80], r8
-        sbb	rax, QWORD PTR [r10+88]
-        mov	r8, QWORD PTR [rcx+96]
-        mov	QWORD PTR [rcx+88], rax
-        sbb	r8, QWORD PTR [r10+96]
-        mov	rax, QWORD PTR [rcx+104]
-        mov	QWORD PTR [rcx+96], r8
-        sbb	rax, QWORD PTR [r10+104]
-        mov	r8, QWORD PTR [rcx+112]
-        mov	QWORD PTR [rcx+104], rax
-        sbb	r8, QWORD PTR [r10+112]
-        mov	rax, QWORD PTR [rcx+120]
-        mov	QWORD PTR [rcx+112], r8
-        sbb	rax, QWORD PTR [r10+120]
-        mov	QWORD PTR [rcx+120], rax
-        sbb	r9, 0  ; r9 = earlier borrow count minus the borrow of this subtraction: the word still owed to r[48..]
-        mov	rcx, QWORD PTR [rsp+256]
-        add	rcx, 384  ; rcx -> r word 48
-        ; Add in word: add r9 into r[48] and ripple the carry (adc 0) up through r[63]
-        mov	r8, QWORD PTR [rcx]
-        add	r8, r9
-        mov	rax, QWORD PTR [rcx+8]
-        mov	QWORD PTR [rcx], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+16]
-        mov	QWORD PTR [rcx+8], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+24]
-        mov	QWORD PTR [rcx+16], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+32]
-        mov	QWORD PTR [rcx+24], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+40]
-        mov	QWORD PTR [rcx+32], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+48]
-        mov	QWORD PTR [rcx+40], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+56]
-        mov	QWORD PTR [rcx+48], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+64]
-        mov	QWORD PTR [rcx+56], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+72]
-        mov	QWORD PTR [rcx+64], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+80]
-        mov	QWORD PTR [rcx+72], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+88]
-        mov	QWORD PTR [rcx+80], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+96]
-        mov	QWORD PTR [rcx+88], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+104]
-        mov	QWORD PTR [rcx+96], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+112]
-        mov	QWORD PTR [rcx+104], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+120]
-        mov	QWORD PTR [rcx+112], r8
-        adc	rax, 0
-        mov	QWORD PTR [rcx+120], rax
-        mov	rdx, QWORD PTR [rsp+264]  ; reload the original a pointer
-        mov	rcx, QWORD PTR [rsp+256]  ; reload the original r pointer
-        add	rsp, 272  ; release the scratch buffer and save slots
-        ret
-sp_2048_sqr_avx2_32 ENDP
-_text ENDS
-ENDIF
-; /* Subtract b from a in place over 16 64-bit words. (a -= b)
-;  * Returns 0 in rax, or -1 when the subtraction borrows out of the top word.
-;  * a  (rcx) A single precision integer and result.
-;  * b  (rdx) A single precision integer.
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_sub_in_place_16 PROC
-        mov	r8, QWORD PTR [rcx]
-        sub	r8, QWORD PTR [rdx]	; word 0 starts the borrow chain
-        mov	r9, QWORD PTR [rcx+8]	; mov leaves CF intact between the sbb steps
-        mov	QWORD PTR [rcx], r8
-        sbb	r9, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [rcx+16]
-        mov	QWORD PTR [rcx+8], r9
-        sbb	r8, QWORD PTR [rdx+16]
-        mov	r9, QWORD PTR [rcx+24]
-        mov	QWORD PTR [rcx+16], r8
-        sbb	r9, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [rcx+32]
-        mov	QWORD PTR [rcx+24], r9
-        sbb	r8, QWORD PTR [rdx+32]
-        mov	r9, QWORD PTR [rcx+40]
-        mov	QWORD PTR [rcx+32], r8
-        sbb	r9, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [rcx+48]
-        mov	QWORD PTR [rcx+40], r9
-        sbb	r8, QWORD PTR [rdx+48]
-        mov	r9, QWORD PTR [rcx+56]
-        mov	QWORD PTR [rcx+48], r8
-        sbb	r9, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [rcx+64]
-        mov	QWORD PTR [rcx+56], r9
-        sbb	r8, QWORD PTR [rdx+64]
-        mov	r9, QWORD PTR [rcx+72]
-        mov	QWORD PTR [rcx+64], r8
-        sbb	r9, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [rcx+80]
-        mov	QWORD PTR [rcx+72], r9
-        sbb	r8, QWORD PTR [rdx+80]
-        mov	r9, QWORD PTR [rcx+88]
-        mov	QWORD PTR [rcx+80], r8
-        sbb	r9, QWORD PTR [rdx+88]
-        mov	r8, QWORD PTR [rcx+96]
-        mov	QWORD PTR [rcx+88], r9
-        sbb	r8, QWORD PTR [rdx+96]
-        mov	r9, QWORD PTR [rcx+104]
-        mov	QWORD PTR [rcx+96], r8
-        sbb	r9, QWORD PTR [rdx+104]
-        mov	r8, QWORD PTR [rcx+112]
-        mov	QWORD PTR [rcx+104], r9
-        sbb	r8, QWORD PTR [rdx+112]
-        mov	r9, QWORD PTR [rcx+120]
-        mov	QWORD PTR [rcx+112], r8
-        sbb	r9, QWORD PTR [rdx+120]
-        mov	QWORD PTR [rcx+120], r9
-        sbb	rax, rax	; rax = 0 - CF: 0 when no borrow out, -1 otherwise
-        ret
-sp_2048_sub_in_place_16 ENDP
-_text ENDS
-; /* Multiply a (32 64-bit words) by digit b into r (33 words). (r = a * b)
-;  * r10, r11 and r12 rotate as the low / high / overflow accumulator words.
-;  * r  (rcx) A single precision integer; words at offsets 0..256 are written.
-;  * a  (rdx) A single precision integer.
-;  * b  (r8)  A single precision digit.
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_mul_d_32 PROC
-        push	r12	; r12 is used as an accumulator and restored before ret
-        mov	r9, rdx	; keep a in r9: mul overwrites rdx:rax with the product
-        ; A[0] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9]
-        mov	r10, rax
-        mov	r11, rdx
-        mov	QWORD PTR [rcx], r10
-        ; A[1] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+8]
-        add	r11, rax
-        mov	QWORD PTR [rcx+8], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[2] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+16]
-        add	r12, rax
-        mov	QWORD PTR [rcx+16], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[3] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+24]
-        add	r10, rax
-        mov	QWORD PTR [rcx+24], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[4] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+32]
-        add	r11, rax
-        mov	QWORD PTR [rcx+32], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[5] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+40]
-        add	r12, rax
-        mov	QWORD PTR [rcx+40], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[6] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+48]
-        add	r10, rax
-        mov	QWORD PTR [rcx+48], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[7] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+56]
-        add	r11, rax
-        mov	QWORD PTR [rcx+56], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[8] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+64]
-        add	r12, rax
-        mov	QWORD PTR [rcx+64], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[9] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+72]
-        add	r10, rax
-        mov	QWORD PTR [rcx+72], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[10] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+80]
-        add	r11, rax
-        mov	QWORD PTR [rcx+80], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[11] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+88]
-        add	r12, rax
-        mov	QWORD PTR [rcx+88], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[12] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+96]
-        add	r10, rax
-        mov	QWORD PTR [rcx+96], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[13] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+104]
-        add	r11, rax
-        mov	QWORD PTR [rcx+104], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[14] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+112]
-        add	r12, rax
-        mov	QWORD PTR [rcx+112], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[15] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+120]
-        add	r10, rax
-        mov	QWORD PTR [rcx+120], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[16] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+128]
-        add	r11, rax
-        mov	QWORD PTR [rcx+128], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[17] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+136]
-        add	r12, rax
-        mov	QWORD PTR [rcx+136], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[18] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+144]
-        add	r10, rax
-        mov	QWORD PTR [rcx+144], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[19] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+152]
-        add	r11, rax
-        mov	QWORD PTR [rcx+152], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[20] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+160]
-        add	r12, rax
-        mov	QWORD PTR [rcx+160], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[21] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+168]
-        add	r10, rax
-        mov	QWORD PTR [rcx+168], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[22] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+176]
-        add	r11, rax
-        mov	QWORD PTR [rcx+176], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[23] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+184]
-        add	r12, rax
-        mov	QWORD PTR [rcx+184], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[24] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+192]
-        add	r10, rax
-        mov	QWORD PTR [rcx+192], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[25] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+200]
-        add	r11, rax
-        mov	QWORD PTR [rcx+200], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[26] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+208]
-        add	r12, rax
-        mov	QWORD PTR [rcx+208], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[27] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+216]
-        add	r10, rax
-        mov	QWORD PTR [rcx+216], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[28] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+224]
-        add	r11, rax
-        mov	QWORD PTR [rcx+224], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[29] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+232]
-        add	r12, rax
-        mov	QWORD PTR [rcx+232], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[30] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+240]
-        add	r10, rax
-        mov	QWORD PTR [rcx+240], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[31] * B
-        mov	rax, r8
-        mul	QWORD PTR [r9+248]
-        add	r11, rax
-        adc	r12, rdx
-        mov	QWORD PTR [rcx+248], r11
-        mov	QWORD PTR [rcx+256], r12	; 33rd word: top of the product
-        pop	r12
-        ret
-sp_2048_mul_d_32 ENDP
-_text ENDS
-; /* Conditionally subtract b from a using the mask m. (r = a - (b & m))
-;  * m is -1 to subtract and 0 to leave a unchanged; rax returns 0 or -1 (borrow).
-;  *
-;  * r  (rcx) A single precision number representing condition subtract result.
-;  * a  (rdx) A single precision number to subtract from.
-;  * b  (r8)  A single precision number to subtract.
-;  * m  (r9)  Mask value to apply.
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_cond_sub_16 PROC
-        sub	rsp, 128	; 16-word stack buffer that receives (b & m)
-        mov	r10, QWORD PTR [r8]
-        mov	r11, QWORD PTR [r8+8]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp], r10
-        mov	QWORD PTR [rsp+8], r11
-        mov	r10, QWORD PTR [r8+16]
-        mov	r11, QWORD PTR [r8+24]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+16], r10
-        mov	QWORD PTR [rsp+24], r11
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+32], r10
-        mov	QWORD PTR [rsp+40], r11
-        mov	r10, QWORD PTR [r8+48]
-        mov	r11, QWORD PTR [r8+56]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+48], r10
-        mov	QWORD PTR [rsp+56], r11
-        mov	r10, QWORD PTR [r8+64]
-        mov	r11, QWORD PTR [r8+72]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+64], r10
-        mov	QWORD PTR [rsp+72], r11
-        mov	r10, QWORD PTR [r8+80]
-        mov	r11, QWORD PTR [r8+88]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+80], r10
-        mov	QWORD PTR [rsp+88], r11
-        mov	r10, QWORD PTR [r8+96]
-        mov	r11, QWORD PTR [r8+104]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+96], r10
-        mov	QWORD PTR [rsp+104], r11
-        mov	r10, QWORD PTR [r8+112]
-        mov	r11, QWORD PTR [r8+120]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+112], r10
-        mov	QWORD PTR [rsp+120], r11
-        mov	r10, QWORD PTR [rdx]
-        mov	r8, QWORD PTR [rsp]	; b pointer no longer needed: r8 becomes scratch
-        sub	r10, r8	; word 0 starts the borrow chain
-        mov	r11, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [rsp+8]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx], r10
-        mov	r10, QWORD PTR [rdx+16]
-        mov	r8, QWORD PTR [rsp+16]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+8], r11
-        mov	r11, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [rsp+24]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+16], r10
-        mov	r10, QWORD PTR [rdx+32]
-        mov	r8, QWORD PTR [rsp+32]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+24], r11
-        mov	r11, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [rsp+40]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+32], r10
-        mov	r10, QWORD PTR [rdx+48]
-        mov	r8, QWORD PTR [rsp+48]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+40], r11
-        mov	r11, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [rsp+56]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+48], r10
-        mov	r10, QWORD PTR [rdx+64]
-        mov	r8, QWORD PTR [rsp+64]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+56], r11
-        mov	r11, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [rsp+72]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+64], r10
-        mov	r10, QWORD PTR [rdx+80]
-        mov	r8, QWORD PTR [rsp+80]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+72], r11
-        mov	r11, QWORD PTR [rdx+88]
-        mov	r8, QWORD PTR [rsp+88]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+80], r10
-        mov	r10, QWORD PTR [rdx+96]
-        mov	r8, QWORD PTR [rsp+96]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+88], r11
-        mov	r11, QWORD PTR [rdx+104]
-        mov	r8, QWORD PTR [rsp+104]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+96], r10
-        mov	r10, QWORD PTR [rdx+112]
-        mov	r8, QWORD PTR [rsp+112]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+104], r11
-        mov	r11, QWORD PTR [rdx+120]
-        mov	r8, QWORD PTR [rsp+120]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+112], r10
-        mov	QWORD PTR [rcx+120], r11
-        sbb	rax, rax	; rax = 0 - CF: 0 when no borrow out, -1 otherwise
-        add	rsp, 128
-        ret
-sp_2048_cond_sub_16 ENDP
-_text ENDS
-; /* Reduce the number back to 2048 bits using Montgomery reduction.
-;  *
-;  * Runs 16 rounds; each round adds m * mu to the window starting at a[i]
-;  * and advances the window by one word. Finishes with a masked subtract
-;  * of m via sp_2048_cond_sub_16, writing the result to the start of a.
-;  *
-;  * a   (rcx) A single precision number to reduce in place.
-;  * m   (rdx) The single precision number representing the modulus.
-;  * mp  (r8)  The digit representing the negative inverse of m mod 2^n.
-;  *
-;  * Register roles inside the loop:
-;  *   r9       modulus pointer (rdx is overwritten by mul)
-;  *   r10      remaining round count
-;  *   r13      mu for the current round
-;  *   r15/rdi  cached a[i] / a[i+1]; written back once after the loop
-;  *   r11/r12  alternating carry words
-;  *   rsi      carry out of the top word, kept from one round to the next
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_mont_reduce_16 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        mov	r9, rdx
-        xor	rsi, rsi
-        ; i = 16
-        mov	r10, 16
-        mov	r15, QWORD PTR [rcx]
-        mov	rdi, QWORD PTR [rcx+8]
-L_2048_mont_reduce_16_loop:
-        ; mu = a[i] * mp
-        mov	r13, r15
-        imul	r13, r8
-        ; a[i+0] += m[0] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9]
-        add	r15, rax
-        adc	r12, rdx
-        ; a[i+1] += m[1] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+8]
-        mov	r15, rdi
-        add	r15, rax
-        adc	r11, rdx
-        add	r15, r12
-        adc	r11, 0
-        ; a[i+2] += m[2] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+16]
-        mov	rdi, QWORD PTR [rcx+16]
-        add	rdi, rax
-        adc	r12, rdx
-        add	rdi, r11
-        adc	r12, 0
-        ; a[i+3] += m[3] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+24]
-        mov	r14, QWORD PTR [rcx+24]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+24], r14
-        adc	r11, 0
-        ; a[i+4] += m[4] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+32]
-        mov	r14, QWORD PTR [rcx+32]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+32], r14
-        adc	r12, 0
-        ; a[i+5] += m[5] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+40]
-        mov	r14, QWORD PTR [rcx+40]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+40], r14
-        adc	r11, 0
-        ; a[i+6] += m[6] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+48]
-        mov	r14, QWORD PTR [rcx+48]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+48], r14
-        adc	r12, 0
-        ; a[i+7] += m[7] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+56]
-        mov	r14, QWORD PTR [rcx+56]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+56], r14
-        adc	r11, 0
-        ; a[i+8] += m[8] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+64]
-        mov	r14, QWORD PTR [rcx+64]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+64], r14
-        adc	r12, 0
-        ; a[i+9] += m[9] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+72]
-        mov	r14, QWORD PTR [rcx+72]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+72], r14
-        adc	r11, 0
-        ; a[i+10] += m[10] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+80]
-        mov	r14, QWORD PTR [rcx+80]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+80], r14
-        adc	r12, 0
-        ; a[i+11] += m[11] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+88]
-        mov	r14, QWORD PTR [rcx+88]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+88], r14
-        adc	r11, 0
-        ; a[i+12] += m[12] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+96]
-        mov	r14, QWORD PTR [rcx+96]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+96], r14
-        adc	r12, 0
-        ; a[i+13] += m[13] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+104]
-        mov	r14, QWORD PTR [rcx+104]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+104], r14
-        adc	r11, 0
-        ; a[i+14] += m[14] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+112]
-        mov	r14, QWORD PTR [rcx+112]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+112], r14
-        adc	r12, 0
-        ; a[i+15] += m[15] * mu
-        mov	rax, r13
-        mul	QWORD PTR [r9+120]
-        mov	r14, QWORD PTR [rcx+120]
-        add	r12, rax
-        adc	rdx, rsi
-        ; mov (not xor) so that CF from the adc above survives
-        mov	rsi, 0
-        adc	rsi, 0
-        add	r14, r12
-        mov	QWORD PTR [rcx+120], r14
-        adc	QWORD PTR [rcx+128], rdx
-        adc	rsi, 0
-        ; i -= 1
-        add	rcx, 8
-        dec	r10
-        jnz	L_2048_mont_reduce_16_loop
-        ; Write back the two words that were cached in registers.
-        mov	QWORD PTR [rcx], r15
-        mov	QWORD PTR [rcx+8], rdi
-        ; Turn the saved carry into the subtract mask (0 or -1).
-        neg	rsi
-        ; Arguments for sp_2048_cond_sub_16(r, a, b, m) = (rcx, rdx, r8, r9).
-        ; r9 still holds the modulus pointer, so it must be copied to r8
-        ; before r9 is overwritten with the mask. The callee reads r8/r9
-        ; unconditionally, so this order does not depend on _WIN64.
-        mov	r8, r9
-        mov	r9, rsi
-        ; rcx advanced 16 words in the loop: a = upper half, r = start of a.
-        mov	rdx, rcx
-        sub	rcx, 128
-        call	sp_2048_cond_sub_16
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_2048_mont_reduce_16 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Conditionally subtract b from a using the mask m. (r = a - (b & m))
-;  * m is -1 to subtract and 0 to leave a unchanged; rax returns 0 or -1 (borrow).
-;  * Masking uses pext (a BMI2 instruction) so the sbb carry flag is not disturbed.
-;  * r  (rcx) A single precision number representing condition subtract result.
-;  * a  (rdx) A single precision number to subtract from.
-;  * b  (r8)  A single precision number to subtract.
-;  * m  (r9)  Mask value to apply.
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_cond_sub_avx2_16 PROC
-        push	r12	; r12 is used as a third rotating scratch register
-        mov	r12, QWORD PTR [r8]
-        mov	r10, QWORD PTR [rdx]
-        pext	r12, r12, r9	; all-ones mask keeps the word, zero mask yields 0
-        sub	r10, r12	; word 0 starts the borrow chain
-        mov	r12, QWORD PTR [r8+8]
-        mov	r11, QWORD PTR [rdx+8]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+16]
-        mov	r12, QWORD PTR [rdx+16]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+8], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [rdx+24]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+16], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [rdx+32]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+24], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+40]
-        mov	r12, QWORD PTR [rdx+40]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+32], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+48]
-        mov	r10, QWORD PTR [rdx+48]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+40], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+56]
-        mov	r11, QWORD PTR [rdx+56]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+48], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+64]
-        mov	r12, QWORD PTR [rdx+64]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+56], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+72]
-        mov	r10, QWORD PTR [rdx+72]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+64], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+80]
-        mov	r11, QWORD PTR [rdx+80]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+72], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+88]
-        mov	r12, QWORD PTR [rdx+88]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+80], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+96]
-        mov	r10, QWORD PTR [rdx+96]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+88], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+104]
-        mov	r11, QWORD PTR [rdx+104]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+96], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+112]
-        mov	r12, QWORD PTR [rdx+112]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+104], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+120]
-        mov	r10, QWORD PTR [rdx+120]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+112], r12
-        sbb	r10, r11
-        mov	QWORD PTR [rcx+120], r10
-        sbb	rax, rax	; rax = 0 - CF: 0 when no borrow out, -1 otherwise
-        pop	r12
-        ret
-sp_2048_cond_sub_avx2_16 ENDP
-_text ENDS
-ENDIF
-; /* Multiply a (16 64-bit words) by digit b into r (17 words). (r = a * b)
-;  * r10, r11 and r12 rotate as the low / high / overflow accumulator words.
-;  * r  (rcx) A single precision integer; words at offsets 0..128 are written.
-;  * a  (rdx) A single precision integer.
-;  * b  (r8)  A single precision digit.
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_mul_d_16 PROC
-        push	r12	; r12 is used as an accumulator and restored before ret
-        mov	r9, rdx	; keep a in r9: mul overwrites rdx:rax with the product
-        ; A[0] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9]
-        mov	r10, rax
-        mov	r11, rdx
-        mov	QWORD PTR [rcx], r10
-        ; A[1] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+8]
-        add	r11, rax
-        mov	QWORD PTR [rcx+8], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[2] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+16]
-        add	r12, rax
-        mov	QWORD PTR [rcx+16], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[3] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+24]
-        add	r10, rax
-        mov	QWORD PTR [rcx+24], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[4] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+32]
-        add	r11, rax
-        mov	QWORD PTR [rcx+32], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[5] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+40]
-        add	r12, rax
-        mov	QWORD PTR [rcx+40], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[6] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+48]
-        add	r10, rax
-        mov	QWORD PTR [rcx+48], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[7] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+56]
-        add	r11, rax
-        mov	QWORD PTR [rcx+56], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[8] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+64]
-        add	r12, rax
-        mov	QWORD PTR [rcx+64], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[9] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+72]
-        add	r10, rax
-        mov	QWORD PTR [rcx+72], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[10] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+80]
-        add	r11, rax
-        mov	QWORD PTR [rcx+80], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[11] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+88]
-        add	r12, rax
-        mov	QWORD PTR [rcx+88], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[12] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+96]
-        add	r10, rax
-        mov	QWORD PTR [rcx+96], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[13] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+104]
-        add	r11, rax
-        mov	QWORD PTR [rcx+104], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[14] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+112]
-        add	r12, rax
-        mov	QWORD PTR [rcx+112], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[15] * B
-        mov	rax, r8
-        mul	QWORD PTR [r9+120]
-        add	r10, rax
-        adc	r11, rdx
-        mov	QWORD PTR [rcx+120], r10
-        mov	QWORD PTR [rcx+128], r11	; 17th word: top of the product
-        pop	r12
-        ret
-sp_2048_mul_d_16 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Multiply a (16 64-bit words) by digit b into r (17 words). (r = a * b)
-;  * Uses mulx with two carry chains: adcx (CF) for low halves, adox (OF) for high.
-;  * r  (rcx) A single precision integer; words at offsets 0..128 are written.
-;  * a  (rdx) A single precision integer.
-;  * b  (r8)  A single precision digit.
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_mul_d_avx2_16 PROC
-        push	r12
-        push	r13
-        mov	rax, rdx	; a moves to rax because mulx needs rdx for b
-        ; A[0] * B
-        mov	rdx, r8	; rdx = b, the implicit mulx source operand
-        xor	r13, r13	; r13 = 0; also clears CF and OF before the chains start
-        mulx	r12, r11, QWORD PTR [rax]
-        mov	QWORD PTR [rcx], r11
-        ; A[1] * B
-        mulx	r10, r9, QWORD PTR [rax+8]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+8], r12
-        ; A[2] * B
-        mulx	r10, r9, QWORD PTR [rax+16]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+16], r11
-        ; A[3] * B
-        mulx	r10, r9, QWORD PTR [rax+24]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+24], r12
-        ; A[4] * B
-        mulx	r10, r9, QWORD PTR [rax+32]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+32], r11
-        ; A[5] * B
-        mulx	r10, r9, QWORD PTR [rax+40]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+40], r12
-        ; A[6] * B
-        mulx	r10, r9, QWORD PTR [rax+48]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+48], r11
-        ; A[7] * B
-        mulx	r10, r9, QWORD PTR [rax+56]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+56], r12
-        ; A[8] * B
-        mulx	r10, r9, QWORD PTR [rax+64]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+64], r11
-        ; A[9] * B
-        mulx	r10, r9, QWORD PTR [rax+72]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+72], r12
-        ; A[10] * B
-        mulx	r10, r9, QWORD PTR [rax+80]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+80], r11
-        ; A[11] * B
-        mulx	r10, r9, QWORD PTR [rax+88]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+88], r12
-        ; A[12] * B
-        mulx	r10, r9, QWORD PTR [rax+96]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+96], r11
-        ; A[13] * B
-        mulx	r10, r9, QWORD PTR [rax+104]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+104], r12
-        ; A[14] * B
-        mulx	r10, r9, QWORD PTR [rax+112]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+112], r11
-        ; A[15] * B
-        mulx	r10, r9, QWORD PTR [rax+120]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        adcx	r11, r13	; fold the last CF into the top word
-        mov	QWORD PTR [rcx+120], r12
-        mov	QWORD PTR [rcx+128], r11	; 17th word: top of the product
-        pop	r13
-        pop	r12
-        ret
-sp_2048_mul_d_avx2_16 ENDP
-_text ENDS
-ENDIF
-IFDEF _WIN64
-; /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
-;  *
-;  * d1   (rcx) The high order half of the number to divide.
-;  * d0   (rdx) The low order half of the number to divide.
-;  * div  (r8)  The divisor.
-;  * returns the quotient of the division in rax.
-;  */
-_text SEGMENT READONLY PARA
-div_2048_word_asm_16 PROC
-        mov	r9, rdx
-        mov	rax, r9	; rax = d0 (low half of the dividend)
-        mov	rdx, rcx	; rdx = d1 (high half of the dividend)
-        div	r8	; rax = rdx:rax / r8; faults unless d1 < div (quotient must fit 64 bits)
-        ret
-div_2048_word_asm_16 ENDP
-_text ENDS
-ENDIF
-; /* Compare a with b in constant time (16 64-bit words, most significant first).
-;  * Every word is always read; after the first difference r8 masks both to 0.
-;  * a  (rcx) A single precision integer.
-;  * b  (rdx) A single precision integer.
-;  * return -1, 0 or +1 in rax if a is less than, equal to or greater than b
-;  * respectively.
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_cmp_16 PROC
-        push	r12
-        xor	r9, r9	; r9 = 0, used to clear the mask
-        mov	r8, -1	; r8 = mask: all ones until a differing word is seen
-        mov	rax, -1	; running result
-        mov	r10, 1	; constant for the "a is greater" outcome
-        mov	r11, QWORD PTR [rcx+120]
-        mov	r12, QWORD PTR [rdx+120]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10	; a word above b word (unsigned): result = 1
-        cmovc	rax, r8	; a word below b word: result = mask, which is -1 here
-        cmovnz	r8, r9	; words differ: zero the mask so lower words compare equal
-        mov	r11, QWORD PTR [rcx+112]
-        mov	r12, QWORD PTR [rdx+112]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+104]
-        mov	r12, QWORD PTR [rdx+104]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+96]
-        mov	r12, QWORD PTR [rdx+96]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+88]
-        mov	r12, QWORD PTR [rdx+88]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+80]
-        mov	r12, QWORD PTR [rdx+80]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+72]
-        mov	r12, QWORD PTR [rdx+72]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+64]
-        mov	r12, QWORD PTR [rdx+64]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+56]
-        mov	r12, QWORD PTR [rdx+56]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+48]
-        mov	r12, QWORD PTR [rdx+48]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+40]
-        mov	r12, QWORD PTR [rdx+40]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+32]
-        mov	r12, QWORD PTR [rdx+32]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+24]
-        mov	r12, QWORD PTR [rdx+24]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+16]
-        mov	r12, QWORD PTR [rdx+16]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+8]
-        mov	r12, QWORD PTR [rdx+8]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx]
-        mov	r12, QWORD PTR [rdx]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        xor	rax, r8	; all equal: -1 ^ -1 = 0; otherwise mask is 0 and rax is kept
-        pop	r12
-        ret
-sp_2048_cmp_16 ENDP
-_text ENDS
-IFNDEF WC_NO_CACHE_RESISTANT
-_text SEGMENT READONLY PARA
-; /* Copy one 128-byte entry out of a table of 32 entries without letting the
-;  * selected position influence which addresses are read.
-;  *
-;  * All 32 entries are always loaded. Each one is ANDed with a mask that is
-;  * all ones only when its position equals the requested index, and the
-;  * results are ORed together. An index outside 0..31 matches no entry and
-;  * leaves the destination filled with zeros.
-;  *
-;  * rcx  Destination buffer: 16 qwords (128 bytes) are written.
-;  * rdx  Array of 32 pointers, each to a 128-byte table entry.
-;  * r8   Index of the entry to copy (only the low 32 bits are compared).
-;  *
-;  * Clobbers rax, r9, xmm0-xmm5 and the flags. xmm6, xmm7 and xmm10-xmm13
-;  * are non-volatile in the Microsoft x64 convention, so they are saved on
-;  * the stack and restored before returning. Only SSE2 instructions are
-;  * used, so the routine does not require AVX support.
-;  */
-
-; Select 64 bytes (at byte offset `col` of every entry) into [rcx+col].
-; Expects: xmm10 = index broadcast to 4 dwords, xmm11 = 1 broadcast to 4 dwords.
-; Uses:    xmm13 = running entry counter, xmm12 = per-entry mask,
-;          xmm0-xmm3 = loaded data, xmm4-xmm7 = accumulators, r9 = entry pointer.
-SP_2048_GET_FROM_TABLE_16_HALF MACRO col
-        ; Restart the entry counter and clear the accumulators.
-        pxor	xmm13, xmm13
-        pxor	xmm4, xmm4
-        pxor	xmm5, xmm5
-        pxor	xmm6, xmm6
-        pxor	xmm7, xmm7
-        ; Assembly-time byte offset of the current pointer in the table.
-        sp_2048_gft16_ptr = 0
-        REPEAT 32
-        ; r9 = address of this entry
-        mov	r9, QWORD PTR [rdx+sp_2048_gft16_ptr]
-        ; xmm12 = all ones when counter == index, otherwise zero
-        movdqa	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9+col]
-        movdqu	xmm1, [r9+col+16]
-        movdqu	xmm2, [r9+col+32]
-        movdqu	xmm3, [r9+col+48]
-        ; Keep the data only for the matching entry.
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        ; counter += 1
-        paddd	xmm13, xmm11
-        sp_2048_gft16_ptr = sp_2048_gft16_ptr + 8
-        ENDM
-        ; Write the selected 64 bytes to the destination.
-        movdqu	[rcx+col], xmm4
-        movdqu	[rcx+col+16], xmm5
-        movdqu	[rcx+col+32], xmm6
-        movdqu	[rcx+col+48], xmm7
-ENDM
-
-sp_2048_get_from_table_16 PROC
-        ; Save the non-volatile XMM registers this routine overwrites.
-        ; Legacy SSE2 moves are used so that no AVX support is needed.
-        sub	rsp, 96
-        movdqu	[rsp], xmm6
-        movdqu	[rsp+16], xmm7
-        movdqu	[rsp+32], xmm10
-        movdqu	[rsp+48], xmm11
-        movdqu	[rsp+64], xmm12
-        movdqu	[rsp+80], xmm13
-        ; xmm10 = index in every dword, xmm11 = 1 in every dword
-        mov	rax, 1
-        movd	xmm10, r8
-        movd	xmm11, rax
-        pshufd	xmm11, xmm11, 0
-        pshufd	xmm10, xmm10, 0
-        ; Qwords 0-7 of the entry
-        SP_2048_GET_FROM_TABLE_16_HALF 0
-        ; Qwords 8-15 of the entry
-        SP_2048_GET_FROM_TABLE_16_HALF 64
-        ; Restore the saved registers and release the stack space.
-        movdqu	xmm6, [rsp]
-        movdqu	xmm7, [rsp+16]
-        movdqu	xmm10, [rsp+32]
-        movdqu	xmm11, [rsp+48]
-        movdqu	xmm12, [rsp+64]
-        movdqu	xmm13, [rsp+80]
-        add	rsp, 96
-        ret
-sp_2048_get_from_table_16 ENDP
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 16
-        mov	r9, QWORD PTR [rdx+128]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 17
-        mov	r9, QWORD PTR [rdx+136]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 18
-        mov	r9, QWORD PTR [rdx+144]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 19
-        mov	r9, QWORD PTR [rdx+152]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 20
-        mov	r9, QWORD PTR [rdx+160]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 21
-        mov	r9, QWORD PTR [rdx+168]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 22
-        mov	r9, QWORD PTR [rdx+176]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 23
-        mov	r9, QWORD PTR [rdx+184]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 24
-        mov	r9, QWORD PTR [rdx+192]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 25
-        mov	r9, QWORD PTR [rdx+200]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 26
-        mov	r9, QWORD PTR [rdx+208]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 27
-        mov	r9, QWORD PTR [rdx+216]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 28
-        mov	r9, QWORD PTR [rdx+224]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 29
-        mov	r9, QWORD PTR [rdx+232]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 30
-        mov	r9, QWORD PTR [rdx+240]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 31
-        mov	r9, QWORD PTR [rdx+248]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        movdqu	[rcx], xmm4
-        movdqu	[rcx+16], xmm5
-        movdqu	[rcx+32], xmm6
-        movdqu	[rcx+48], xmm7
-        add	rcx, 64
-        ; END: 0-7
-        ; START: 8-15
-        pxor	xmm13, xmm13
-        pxor	xmm4, xmm4
-        pxor	xmm5, xmm5
-        pxor	xmm6, xmm6
-        pxor	xmm7, xmm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 16
-        mov	r9, QWORD PTR [rdx+128]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 17
-        mov	r9, QWORD PTR [rdx+136]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 18
-        mov	r9, QWORD PTR [rdx+144]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 19
-        mov	r9, QWORD PTR [rdx+152]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 20
-        mov	r9, QWORD PTR [rdx+160]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 21
-        mov	r9, QWORD PTR [rdx+168]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 22
-        mov	r9, QWORD PTR [rdx+176]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 23
-        mov	r9, QWORD PTR [rdx+184]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 24
-        mov	r9, QWORD PTR [rdx+192]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 25
-        mov	r9, QWORD PTR [rdx+200]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 26
-        mov	r9, QWORD PTR [rdx+208]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 27
-        mov	r9, QWORD PTR [rdx+216]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 28
-        mov	r9, QWORD PTR [rdx+224]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 29
-        mov	r9, QWORD PTR [rdx+232]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 30
-        mov	r9, QWORD PTR [rdx+240]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 31
-        mov	r9, QWORD PTR [rdx+248]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        movdqu	[rcx], xmm4
-        movdqu	[rcx+16], xmm5
-        movdqu	[rcx+32], xmm6
-        movdqu	[rcx+48], xmm7
-        ; END: 8-15
-        vmovdqu	xmm6, OWORD PTR [rsp]
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vmovdqu	xmm8, OWORD PTR [rsp+32]
-        vmovdqu	xmm9, OWORD PTR [rsp+48]
-        vmovdqu	xmm10, OWORD PTR [rsp+64]
-        vmovdqu	xmm11, OWORD PTR [rsp+80]
-        vmovdqu	xmm12, OWORD PTR [rsp+96]
-        vmovdqu	xmm13, OWORD PTR [rsp+112]
-        add	rsp, 128
-        ret
-sp_2048_get_from_table_16 ENDP
-_text ENDS
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-; /* Reduce the number back to 2048 bits using Montgomery reduction.
-;  *
-;  * a   A single precision number to reduce in place.
-;  * m   The single precision number representing the modulus.
-;  * mp  The digit representing the negative inverse of m mod 2^n.
-;  *
-;  * Register use as read by the code below: a arrives in rcx, m in rdx and
-;  * mp in r8. The routine touches words a[0..31] and m[0..15] (64-bit words)
-;  * and leaves the 16-word result in a[0..15].
-;  *
-;  * Sixteen reduction steps are performed, two per pass of the loop. Each
-;  * step multiplies m by mu = a[i] * mp (low 64 bits) and adds the product
-;  * into a[i..i+16]. The low product words travel on the CF chain (adcx) and
-;  * the high product words on the OF chain (adox), so both carry chains run
-;  * interleaved; instruction order inside a step must not be changed.
-;  *
-;  * Uses mulx/pext (BMI2) and adcx/adox (ADX).
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_mont_reduce_avx2_16 PROC
-        ; Preserve every callee-saved register that is used as scratch below.
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        push	rbx
-        push	rbp
-        ; r9 = a, r10 = m (rcx and rdx are needed by mulx).
-        mov	r9, rcx
-        mov	r10, rdx
-        ; rbp holds the carry handed from one reduction step to the next.
-        xor	rbp, rbp
-        ; i = 16
-        mov	r11, 16
-        ; Keep the four lowest live words a[i..i+3] in r14, r15, rdi, rsi.
-        mov	r14, QWORD PTR [r9]
-        mov	r15, QWORD PTR [r9+8]
-        mov	rdi, QWORD PTR [r9+16]
-        mov	rsi, QWORD PTR [r9+24]
-        ; Bias the pointer by 64 bytes; [r9-32] is a[4] on the first step.
-        add	r9, 64
-        ; (rbp is already zero here; this second clear is redundant.)
-        xor	rbp, rbp
-L_2048_mont_reduce_avx2_16_loop:
-        ; ---- first reduction step of this pass ----
-        ; mu = a[i] * mp
-        mov	rdx, r14
-        mov	r12, r14
-        imul	rdx, r8
-        ; rbx = 0; the xor also clears CF and OF to start both carry chains.
-        xor	rbx, rbx
-        ; a[i+0] += m[0] * mu
-        ; (the sum in r12 is not stored; only its carry in CF is used)
-        mulx	rcx, rax, QWORD PTR [r10]
-        mov	r14, r15
-        adcx	r12, rax
-        adox	r14, rcx
-        ; a[i+1] += m[1] * mu
-        mulx	rcx, rax, QWORD PTR [r10+8]
-        mov	r15, rdi
-        adcx	r14, rax
-        adox	r15, rcx
-        ; a[i+2] += m[2] * mu
-        mulx	rcx, rax, QWORD PTR [r10+16]
-        mov	rdi, rsi
-        adcx	r15, rax
-        adox	rdi, rcx
-        ; a[i+3] += m[3] * mu
-        mulx	rcx, rax, QWORD PTR [r10+24]
-        mov	rsi, QWORD PTR [r9+-32]
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; a[i+4] += m[4] * mu
-        mulx	rcx, rax, QWORD PTR [r10+32]
-        mov	r13, QWORD PTR [r9+-24]
-        adcx	rsi, rax
-        adox	r13, rcx
-        ; a[i+5] += m[5] * mu
-        mulx	rcx, rax, QWORD PTR [r10+40]
-        mov	r12, QWORD PTR [r9+-16]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-24], r13
-        ; a[i+6] += m[6] * mu
-        mulx	rcx, rax, QWORD PTR [r10+48]
-        mov	r13, QWORD PTR [r9+-8]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-16], r12
-        ; a[i+7] += m[7] * mu
-        mulx	rcx, rax, QWORD PTR [r10+56]
-        mov	r12, QWORD PTR [r9]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-8], r13
-        ; a[i+8] += m[8] * mu
-        mulx	rcx, rax, QWORD PTR [r10+64]
-        mov	r13, QWORD PTR [r9+8]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9], r12
-        ; a[i+9] += m[9] * mu
-        mulx	rcx, rax, QWORD PTR [r10+72]
-        mov	r12, QWORD PTR [r9+16]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+8], r13
-        ; a[i+10] += m[10] * mu
-        mulx	rcx, rax, QWORD PTR [r10+80]
-        mov	r13, QWORD PTR [r9+24]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+16], r12
-        ; a[i+11] += m[11] * mu
-        mulx	rcx, rax, QWORD PTR [r10+88]
-        mov	r12, QWORD PTR [r9+32]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+24], r13
-        ; a[i+12] += m[12] * mu
-        mulx	rcx, rax, QWORD PTR [r10+96]
-        mov	r13, QWORD PTR [r9+40]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+32], r12
-        ; a[i+13] += m[13] * mu
-        mulx	rcx, rax, QWORD PTR [r10+104]
-        mov	r12, QWORD PTR [r9+48]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+40], r13
-        ; a[i+14] += m[14] * mu
-        mulx	rcx, rax, QWORD PTR [r10+112]
-        mov	r13, QWORD PTR [r9+56]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+48], r12
-        ; a[i+15] += m[15] * mu
-        mulx	rcx, rax, QWORD PTR [r10+120]
-        mov	r12, QWORD PTR [r9+64]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+56], r13
-        ; a[i+16] += carry from the previous step (rbp) + CF, then collect
-        ; this step's outgoing carry into rbp as OF + CF (rbx is zero).
-        adcx	r12, rbp
-        mov	rbp, rbx
-        mov	QWORD PTR [r9+64], r12
-        adox	rbp, rbx
-        adcx	rbp, rbx
-        ; ---- second reduction step: same pattern, memory offsets +8 ----
-        ; mu = a[i] * mp
-        mov	rdx, r14
-        mov	r12, r14
-        imul	rdx, r8
-        ; rbx = 0; the xor also clears CF and OF to start both carry chains.
-        xor	rbx, rbx
-        ; a[i+0] += m[0] * mu
-        mulx	rcx, rax, QWORD PTR [r10]
-        mov	r14, r15
-        adcx	r12, rax
-        adox	r14, rcx
-        ; a[i+1] += m[1] * mu
-        mulx	rcx, rax, QWORD PTR [r10+8]
-        mov	r15, rdi
-        adcx	r14, rax
-        adox	r15, rcx
-        ; a[i+2] += m[2] * mu
-        mulx	rcx, rax, QWORD PTR [r10+16]
-        mov	rdi, rsi
-        adcx	r15, rax
-        adox	rdi, rcx
-        ; a[i+3] += m[3] * mu
-        mulx	rcx, rax, QWORD PTR [r10+24]
-        mov	rsi, QWORD PTR [r9+-24]
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; a[i+4] += m[4] * mu
-        mulx	rcx, rax, QWORD PTR [r10+32]
-        mov	r13, QWORD PTR [r9+-16]
-        adcx	rsi, rax
-        adox	r13, rcx
-        ; a[i+5] += m[5] * mu
-        mulx	rcx, rax, QWORD PTR [r10+40]
-        mov	r12, QWORD PTR [r9+-8]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-16], r13
-        ; a[i+6] += m[6] * mu
-        mulx	rcx, rax, QWORD PTR [r10+48]
-        mov	r13, QWORD PTR [r9]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-8], r12
-        ; a[i+7] += m[7] * mu
-        mulx	rcx, rax, QWORD PTR [r10+56]
-        mov	r12, QWORD PTR [r9+8]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9], r13
-        ; a[i+8] += m[8] * mu
-        mulx	rcx, rax, QWORD PTR [r10+64]
-        mov	r13, QWORD PTR [r9+16]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+8], r12
-        ; a[i+9] += m[9] * mu
-        mulx	rcx, rax, QWORD PTR [r10+72]
-        mov	r12, QWORD PTR [r9+24]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+16], r13
-        ; a[i+10] += m[10] * mu
-        mulx	rcx, rax, QWORD PTR [r10+80]
-        mov	r13, QWORD PTR [r9+32]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+24], r12
-        ; a[i+11] += m[11] * mu
-        mulx	rcx, rax, QWORD PTR [r10+88]
-        mov	r12, QWORD PTR [r9+40]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+32], r13
-        ; a[i+12] += m[12] * mu
-        mulx	rcx, rax, QWORD PTR [r10+96]
-        mov	r13, QWORD PTR [r9+48]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+40], r12
-        ; a[i+13] += m[13] * mu
-        mulx	rcx, rax, QWORD PTR [r10+104]
-        mov	r12, QWORD PTR [r9+56]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+48], r13
-        ; a[i+14] += m[14] * mu
-        mulx	rcx, rax, QWORD PTR [r10+112]
-        mov	r13, QWORD PTR [r9+64]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+56], r12
-        ; a[i+15] += m[15] * mu
-        mulx	rcx, rax, QWORD PTR [r10+120]
-        mov	r12, QWORD PTR [r9+72]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+64], r13
-        ; a[i+16] += carry from the previous step (rbp) + CF, then collect
-        ; this step's outgoing carry into rbp as OF + CF (rbx is zero).
-        adcx	r12, rbp
-        mov	rbp, rbx
-        mov	QWORD PTR [r9+72], r12
-        adox	rbp, rbx
-        adcx	rbp, rbx
-        ; a += 2
-        add	r9, 16
-        ; i -= 2
-        sub	r11, 2
-        jnz	L_2048_mont_reduce_avx2_16_loop
-        ; Undo the 64-byte bias: r9 now points at a[16] (the upper half).
-        sub	r9, 64
-        ; Turn the final carry into a mask (0 stays 0; a carry of 1 becomes
-        ; all ones).
-        neg	rbp
-        ; r8 = source (upper half of a), r9 = destination (a[0]).
-        mov	r8, r9
-        sub	r9, 128
-        ; Masked subtraction: a[j] = a[16+j] - (m[j] selected by the mask).
-        ; pext with an all-ones mask returns the word unchanged and with a
-        ; zero mask returns 0. mov and pext leave the flags alone, so the
-        ; borrow survives between sub and the following sbb instructions.
-        ; The four lowest source words are still held in r14, r15, rdi, rsi.
-        mov	rcx, QWORD PTR [r10]
-        mov	rdx, r14
-        pext	rcx, rcx, rbp
-        sub	rdx, rcx
-        mov	rcx, QWORD PTR [r10+8]
-        mov	rax, r15
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+16]
-        mov	rcx, rdi
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+8], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+24]
-        mov	rdx, rsi
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+16], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+32]
-        mov	rax, QWORD PTR [r8+32]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+24], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+40]
-        mov	rcx, QWORD PTR [r8+40]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+32], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+48]
-        mov	rdx, QWORD PTR [r8+48]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+40], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+56]
-        mov	rax, QWORD PTR [r8+56]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+48], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+64]
-        mov	rcx, QWORD PTR [r8+64]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+56], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+72]
-        mov	rdx, QWORD PTR [r8+72]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+64], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+80]
-        mov	rax, QWORD PTR [r8+80]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+72], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+88]
-        mov	rcx, QWORD PTR [r8+88]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+80], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+96]
-        mov	rdx, QWORD PTR [r8+96]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+88], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+104]
-        mov	rax, QWORD PTR [r8+104]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+96], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+112]
-        mov	rcx, QWORD PTR [r8+112]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+104], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+120]
-        mov	rdx, QWORD PTR [r8+120]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+112], rcx
-        sbb	rdx, rax
-        mov	QWORD PTR [r9+120], rdx
-        ; Restore the saved registers in reverse push order.
-        pop	rbp
-        pop	rbx
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_2048_mont_reduce_avx2_16 ENDP
-_text ENDS
-ENDIF
-IFNDEF WC_NO_CACHE_RESISTANT
-; /* Copy one 16-word (128-byte) entry out of a 32-entry table (AVX2 version).
-;  *
-;  * Every one of the 32 entries is loaded on every call and the wanted one is
-;  * kept with a compare mask, so the sequence of memory accesses does not
-;  * depend on the index.
-;  *
-;  * rcx  Destination; 128 bytes are written.
-;  * rdx  Array of 32 pointers; each must reference 128 readable bytes.
-;  * r8   Index of the entry to copy. Only the low 32 bits are compared; if
-;  *      they match none of 0..31 the destination is filled with zeros.
-;  *
-;  * Clobbers rax, r9, ymm0-ymm5 and the upper halves of every ymm register
-;  * (vzeroupper). xmm6-xmm13 are saved and restored.
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_get_from_table_avx2_16 PROC
-        ; xmm6-xmm15 are callee-saved on Win64; spill the range used here.
-        sub	rsp, 128
-        vmovdqu	OWORD PTR [rsp], xmm6
-        vmovdqu	OWORD PTR [rsp+16], xmm7
-        vmovdqu	OWORD PTR [rsp+32], xmm8
-        vmovdqu	OWORD PTR [rsp+48], xmm9
-        vmovdqu	OWORD PTR [rsp+64], xmm10
-        vmovdqu	OWORD PTR [rsp+80], xmm11
-        vmovdqu	OWORD PTR [rsp+96], xmm12
-        vmovdqu	OWORD PTR [rsp+112], xmm13
-        ; ymm10 = index in every dword lane, ymm11 = 1 in every dword lane.
-        ; vpermd with an all-zero selector (ymm13) replicates dword 0.
-        mov	rax, 1
-        movd	xmm10, r8
-        movd	xmm11, rax
-        vpxor	ymm13, ymm13, ymm13
-        vpermd	ymm10, ymm13, ymm10
-        vpermd	ymm11, ymm13, ymm11
-        ; START: 0-15
-        ; ymm13 (still zero) is the running entry number; ymm4-ymm7
-        ; accumulate the selected entry.
-        vpxor	ymm4, ymm4, ymm4
-        vpxor	ymm5, ymm5, ymm5
-        vpxor	ymm6, ymm6, ymm6
-        vpxor	ymm7, ymm7, ymm7
-        ; ENTRY: 0..31 - one identical body per table entry, generated at
-        ; assembly time; the symbol below is the byte offset of the entry's
-        ; pointer within the array at rdx.
-sp_2048_get_from_table_avx2_16_off = 0
-REPT 32
-        mov	r9, QWORD PTR [rdx+sp_2048_get_from_table_avx2_16_off]
-        ; ymm12 = all ones if this is the requested entry, else zero.
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-sp_2048_get_from_table_avx2_16_off = sp_2048_get_from_table_avx2_16_off + 8
-ENDM
-        vmovdqu	YMMWORD PTR [rcx], ymm4
-        vmovdqu	YMMWORD PTR [rcx+32], ymm5
-        vmovdqu	YMMWORD PTR [rcx+64], ymm6
-        vmovdqu	YMMWORD PTR [rcx+96], ymm7
-        ; END: 0-15
-        vmovdqu	xmm6, OWORD PTR [rsp]
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vmovdqu	xmm8, OWORD PTR [rsp+32]
-        vmovdqu	xmm9, OWORD PTR [rsp+48]
-        vmovdqu	xmm10, OWORD PTR [rsp+64]
-        vmovdqu	xmm11, OWORD PTR [rsp+80]
-        vmovdqu	xmm12, OWORD PTR [rsp+96]
-        vmovdqu	xmm13, OWORD PTR [rsp+112]
-        ; Clear the dirty upper YMM halves so SSE code in the caller does not
-        ; pay an AVX/SSE transition penalty. The low 128 bits (including the
-        ; restored xmm6-xmm13) are unaffected.
-        vzeroupper
-        add	rsp, 128
-        ret
-sp_2048_get_from_table_avx2_16 ENDP
-_text ENDS
-ENDIF
-; /* Conditionally subtract b from a using the mask m.
-;  * m is -1 to subtract and 0 when not subtracting.
-;  *
-;  * r  A single precision number representing the conditional subtraction result.
-;  * a  A single precision number to subtract from.
-;  * b  A single precision number to subtract.
-;  * m  Mask value to apply.
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_cond_sub_32 PROC
-        sub	rsp, 256
-        mov	r10, QWORD PTR [r8]
-        mov	r11, QWORD PTR [r8+8]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp], r10
-        mov	QWORD PTR [rsp+8], r11
-        mov	r10, QWORD PTR [r8+16]
-        mov	r11, QWORD PTR [r8+24]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+16], r10
-        mov	QWORD PTR [rsp+24], r11
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+32], r10
-        mov	QWORD PTR [rsp+40], r11
-        mov	r10, QWORD PTR [r8+48]
-        mov	r11, QWORD PTR [r8+56]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+48], r10
-        mov	QWORD PTR [rsp+56], r11
-        mov	r10, QWORD PTR [r8+64]
-        mov	r11, QWORD PTR [r8+72]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+64], r10
-        mov	QWORD PTR [rsp+72], r11
-        mov	r10, QWORD PTR [r8+80]
-        mov	r11, QWORD PTR [r8+88]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+80], r10
-        mov	QWORD PTR [rsp+88], r11
-        mov	r10, QWORD PTR [r8+96]
-        mov	r11, QWORD PTR [r8+104]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+96], r10
-        mov	QWORD PTR [rsp+104], r11
-        mov	r10, QWORD PTR [r8+112]
-        mov	r11, QWORD PTR [r8+120]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+112], r10
-        mov	QWORD PTR [rsp+120], r11
-        mov	r10, QWORD PTR [r8+128]
-        mov	r11, QWORD PTR [r8+136]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+128], r10
-        mov	QWORD PTR [rsp+136], r11
-        mov	r10, QWORD PTR [r8+144]
-        mov	r11, QWORD PTR [r8+152]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+144], r10
-        mov	QWORD PTR [rsp+152], r11
-        mov	r10, QWORD PTR [r8+160]
-        mov	r11, QWORD PTR [r8+168]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+160], r10
-        mov	QWORD PTR [rsp+168], r11
-        mov	r10, QWORD PTR [r8+176]
-        mov	r11, QWORD PTR [r8+184]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+176], r10
-        mov	QWORD PTR [rsp+184], r11
-        mov	r10, QWORD PTR [r8+192]
-        mov	r11, QWORD PTR [r8+200]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+192], r10
-        mov	QWORD PTR [rsp+200], r11
-        mov	r10, QWORD PTR [r8+208]
-        mov	r11, QWORD PTR [r8+216]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+208], r10
-        mov	QWORD PTR [rsp+216], r11
-        mov	r10, QWORD PTR [r8+224]
-        mov	r11, QWORD PTR [r8+232]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+224], r10
-        mov	QWORD PTR [rsp+232], r11
-        mov	r10, QWORD PTR [r8+240]
-        mov	r11, QWORD PTR [r8+248]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+240], r10
-        mov	QWORD PTR [rsp+248], r11
-        mov	r10, QWORD PTR [rdx]
-        mov	r8, QWORD PTR [rsp]
-        sub	r10, r8
-        mov	r11, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [rsp+8]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx], r10
-        mov	r10, QWORD PTR [rdx+16]
-        mov	r8, QWORD PTR [rsp+16]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+8], r11
-        mov	r11, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [rsp+24]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+16], r10
-        mov	r10, QWORD PTR [rdx+32]
-        mov	r8, QWORD PTR [rsp+32]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+24], r11
-        mov	r11, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [rsp+40]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+32], r10
-        mov	r10, QWORD PTR [rdx+48]
-        mov	r8, QWORD PTR [rsp+48]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+40], r11
-        mov	r11, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [rsp+56]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+48], r10
-        mov	r10, QWORD PTR [rdx+64]
-        mov	r8, QWORD PTR [rsp+64]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+56], r11
-        mov	r11, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [rsp+72]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+64], r10
-        mov	r10, QWORD PTR [rdx+80]
-        mov	r8, QWORD PTR [rsp+80]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+72], r11
-        mov	r11, QWORD PTR [rdx+88]
-        mov	r8, QWORD PTR [rsp+88]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+80], r10
-        mov	r10, QWORD PTR [rdx+96]
-        mov	r8, QWORD PTR [rsp+96]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+88], r11
-        mov	r11, QWORD PTR [rdx+104]
-        mov	r8, QWORD PTR [rsp+104]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+96], r10
-        mov	r10, QWORD PTR [rdx+112]
-        mov	r8, QWORD PTR [rsp+112]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+104], r11
-        mov	r11, QWORD PTR [rdx+120]
-        mov	r8, QWORD PTR [rsp+120]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+112], r10
-        mov	r10, QWORD PTR [rdx+128]
-        mov	r8, QWORD PTR [rsp+128]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+120], r11
-        mov	r11, QWORD PTR [rdx+136]
-        mov	r8, QWORD PTR [rsp+136]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+128], r10
-        mov	r10, QWORD PTR [rdx+144]
-        mov	r8, QWORD PTR [rsp+144]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+136], r11
-        mov	r11, QWORD PTR [rdx+152]
-        mov	r8, QWORD PTR [rsp+152]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+144], r10
-        mov	r10, QWORD PTR [rdx+160]
-        mov	r8, QWORD PTR [rsp+160]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+152], r11
-        mov	r11, QWORD PTR [rdx+168]
-        mov	r8, QWORD PTR [rsp+168]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+160], r10
-        mov	r10, QWORD PTR [rdx+176]
-        mov	r8, QWORD PTR [rsp+176]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+168], r11
-        mov	r11, QWORD PTR [rdx+184]
-        mov	r8, QWORD PTR [rsp+184]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+176], r10
-        mov	r10, QWORD PTR [rdx+192]
-        mov	r8, QWORD PTR [rsp+192]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+184], r11
-        mov	r11, QWORD PTR [rdx+200]
-        mov	r8, QWORD PTR [rsp+200]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+192], r10
-        mov	r10, QWORD PTR [rdx+208]
-        mov	r8, QWORD PTR [rsp+208]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+200], r11
-        mov	r11, QWORD PTR [rdx+216]
-        mov	r8, QWORD PTR [rsp+216]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+208], r10
-        mov	r10, QWORD PTR [rdx+224]
-        mov	r8, QWORD PTR [rsp+224]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+216], r11
-        mov	r11, QWORD PTR [rdx+232]
-        mov	r8, QWORD PTR [rsp+232]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+224], r10
-        mov	r10, QWORD PTR [rdx+240]
-        mov	r8, QWORD PTR [rsp+240]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+232], r11
-        mov	r11, QWORD PTR [rdx+248]
-        mov	r8, QWORD PTR [rsp+248]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+240], r10
-        mov	QWORD PTR [rcx+248], r11
-        sbb	rax, rax
-        add	rsp, 256
-        ret
-sp_2048_cond_sub_32 ENDP
-_text ENDS
-; /* Reduce the number back to 2048 bits using Montgomery reduction.
-;  *
-;  * a   A single precision number to reduce in place (passed in rcx).
-;  * m   The single precision number representing the modulus (passed in rdx).
-;  * mp  The digit representing the negative inverse of m mod 2^n (passed in r8).
-;  *
-;  * Each of the 32 loop passes computes mu = a[i] * mp, adds m * mu to the
-;  * 32 words starting at a[i] and folds the carry into the 33rd word, so a
-;  * must provide 64 words. Afterwards the upper 32 words of a are handed to
-;  * sp_2048_cond_sub_32 together with m and a mask derived from the final
-;  * carry; that call writes its result over the low 32 words of a.
-;  *
-;  * Register use: r9 = m, r10 = loop counter, r13 = mu, r11/r12 = alternating
-;  * carry words, r14 = scratch, rsi = carry out of the top word,
-;  * r15/rdi = the two lowest live words of a (kept in registers, not memory).
-;  * r12-r15, rdi and rsi are saved and restored around the body.
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_mont_reduce_32 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        ; MUL clobbers rdx, so keep the modulus pointer in r9
-        mov	r9, rdx
-        xor	rsi, rsi
-        ; i = 32
-        mov	r10, 32
-        mov	r15, QWORD PTR [rcx]
-        mov	rdi, QWORD PTR [rcx+8]
-L_2048_mont_reduce_32_loop:
-        ; mu = a[i] * mp
-        mov	r13, r15
-        imul	r13, r8
-        ; a[i+0] += m[0] * mu
-        ; (only the carry in r12 is kept; r15 is reloaded from rdi below)
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9]
-        add	r15, rax
-        adc	r12, rdx
-        ; a[i+1] += m[1] * mu
-        ; (result stays in r15 and becomes a[i] of the next pass)
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+8]
-        mov	r15, rdi
-        add	r15, rax
-        adc	r11, rdx
-        add	r15, r12
-        adc	r11, 0
-        ; a[i+2] += m[2] * mu
-        ; (result stays in rdi and becomes a[i+1] of the next pass)
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+16]
-        mov	rdi, QWORD PTR [rcx+16]
-        add	rdi, rax
-        adc	r12, rdx
-        add	rdi, r11
-        adc	r12, 0
-        ; a[i+3] += m[3] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+24]
-        mov	r14, QWORD PTR [rcx+24]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+24], r14
-        adc	r11, 0
-        ; a[i+4] += m[4] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+32]
-        mov	r14, QWORD PTR [rcx+32]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+32], r14
-        adc	r12, 0
-        ; a[i+5] += m[5] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+40]
-        mov	r14, QWORD PTR [rcx+40]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+40], r14
-        adc	r11, 0
-        ; a[i+6] += m[6] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+48]
-        mov	r14, QWORD PTR [rcx+48]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+48], r14
-        adc	r12, 0
-        ; a[i+7] += m[7] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+56]
-        mov	r14, QWORD PTR [rcx+56]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+56], r14
-        adc	r11, 0
-        ; a[i+8] += m[8] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+64]
-        mov	r14, QWORD PTR [rcx+64]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+64], r14
-        adc	r12, 0
-        ; a[i+9] += m[9] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+72]
-        mov	r14, QWORD PTR [rcx+72]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+72], r14
-        adc	r11, 0
-        ; a[i+10] += m[10] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+80]
-        mov	r14, QWORD PTR [rcx+80]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+80], r14
-        adc	r12, 0
-        ; a[i+11] += m[11] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+88]
-        mov	r14, QWORD PTR [rcx+88]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+88], r14
-        adc	r11, 0
-        ; a[i+12] += m[12] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+96]
-        mov	r14, QWORD PTR [rcx+96]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+96], r14
-        adc	r12, 0
-        ; a[i+13] += m[13] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+104]
-        mov	r14, QWORD PTR [rcx+104]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+104], r14
-        adc	r11, 0
-        ; a[i+14] += m[14] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+112]
-        mov	r14, QWORD PTR [rcx+112]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+112], r14
-        adc	r12, 0
-        ; a[i+15] += m[15] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+120]
-        mov	r14, QWORD PTR [rcx+120]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+120], r14
-        adc	r11, 0
-        ; a[i+16] += m[16] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+128]
-        mov	r14, QWORD PTR [rcx+128]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+128], r14
-        adc	r12, 0
-        ; a[i+17] += m[17] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+136]
-        mov	r14, QWORD PTR [rcx+136]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+136], r14
-        adc	r11, 0
-        ; a[i+18] += m[18] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+144]
-        mov	r14, QWORD PTR [rcx+144]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+144], r14
-        adc	r12, 0
-        ; a[i+19] += m[19] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+152]
-        mov	r14, QWORD PTR [rcx+152]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+152], r14
-        adc	r11, 0
-        ; a[i+20] += m[20] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+160]
-        mov	r14, QWORD PTR [rcx+160]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+160], r14
-        adc	r12, 0
-        ; a[i+21] += m[21] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+168]
-        mov	r14, QWORD PTR [rcx+168]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+168], r14
-        adc	r11, 0
-        ; a[i+22] += m[22] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+176]
-        mov	r14, QWORD PTR [rcx+176]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+176], r14
-        adc	r12, 0
-        ; a[i+23] += m[23] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+184]
-        mov	r14, QWORD PTR [rcx+184]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+184], r14
-        adc	r11, 0
-        ; a[i+24] += m[24] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+192]
-        mov	r14, QWORD PTR [rcx+192]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+192], r14
-        adc	r12, 0
-        ; a[i+25] += m[25] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+200]
-        mov	r14, QWORD PTR [rcx+200]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+200], r14
-        adc	r11, 0
-        ; a[i+26] += m[26] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+208]
-        mov	r14, QWORD PTR [rcx+208]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+208], r14
-        adc	r12, 0
-        ; a[i+27] += m[27] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+216]
-        mov	r14, QWORD PTR [rcx+216]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+216], r14
-        adc	r11, 0
-        ; a[i+28] += m[28] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+224]
-        mov	r14, QWORD PTR [rcx+224]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+224], r14
-        adc	r12, 0
-        ; a[i+29] += m[29] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+232]
-        mov	r14, QWORD PTR [rcx+232]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+232], r14
-        adc	r11, 0
-        ; a[i+30] += m[30] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+240]
-        mov	r14, QWORD PTR [rcx+240]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+240], r14
-        adc	r12, 0
-        ; a[i+31] += m[31] * mu
-        ; The carry saved in rsi by the previous pass is folded into the high
-        ; product word; MOV leaves the flags alone, so rsi can be reset to 0
-        ; between the two ADCs and then collect the new carry.
-        mov	rax, r13
-        mul	QWORD PTR [r9+248]
-        mov	r14, QWORD PTR [rcx+248]
-        add	r12, rax
-        adc	rdx, rsi
-        mov	rsi, 0
-        adc	rsi, 0
-        add	r14, r12
-        mov	QWORD PTR [rcx+248], r14
-        adc	QWORD PTR [rcx+256], rdx
-        adc	rsi, 0
-        ; i -= 1 (and advance a by one word)
-        add	rcx, 8
-        dec	r10
-        jnz	L_2048_mont_reduce_32_loop
-        ; rcx now points at word 32 of the original a; write back the two
-        ; words that were carried in registers
-        mov	QWORD PTR [rcx], r15
-        mov	QWORD PTR [rcx+8], rdi
-        ; negate the accumulated carry so that a carry of 1 becomes an
-        ; all-ones mask
-        neg	rsi
-        ; sp_2048_cond_sub_32(r = a, a = a + 32 words, b = m, m = mask)
-        ; r9 still holds the modulus pointer, so it has to be copied to r8
-        ; BEFORE the mask is written to r9. This file only uses the Win64
-        ; register assignment, so the order is no longer conditional.
-        mov	r8, r9
-        mov	r9, rsi
-        mov	rdx, rcx
-        sub	rcx, 256
-        ; NOTE(review): no 32-byte home space is reserved for this call; the
-        ; visible part of sp_2048_cond_sub_32 only touches its own frame -
-        ; confirm before changing the callee.
-        call	sp_2048_cond_sub_32
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_2048_mont_reduce_32 ENDP
-_text ENDS
-; /* Subtract two 2048-bit numbers word by word. (r = a - b)
-;  *
-;  * r  Result, 32 64-bit words (pointer in rcx).
-;  * a  Number to subtract from, 32 64-bit words (pointer in rdx).
-;  * b  Number to subtract, 32 64-bit words (pointer in r8).
-;  * returns 0 in rax when no borrow leaves the top word, otherwise -1.
-;  */
-; One link of the borrow chain: fetch the next word of a into regNxt, store
-; the previous result word held in regCur, then subtract the matching word
-; of b with borrow. MOV does not change the flags, so the borrow survives
-; the interleaved load and store.
-SP_2048_SUB_WORD MACRO regCur, regNxt, byteOff
-        mov	regNxt, QWORD PTR [rdx+byteOff]
-        mov	QWORD PTR [rcx+byteOff-8], regCur
-        sbb	regNxt, QWORD PTR [r8+byteOff]
-ENDM
-_text SEGMENT READONLY PARA
-sp_2048_sub_32 PROC
-        ; word 0 opens the chain with a plain SUB
-        mov	r9, QWORD PTR [rdx]
-        sub	r9, QWORD PTR [r8]
-        ; words 1..30: two per pass so that r9 and r10 trade roles each word
-sp_2048_sub_32_off = 8
-REPEAT 15
-        SP_2048_SUB_WORD r9, r10, sp_2048_sub_32_off
-        SP_2048_SUB_WORD r10, r9, sp_2048_sub_32_off+8
-sp_2048_sub_32_off = sp_2048_sub_32_off + 16
-ENDM
-        ; word 31, then flush the last result word
-        SP_2048_SUB_WORD r9, r10, 248
-        mov	QWORD PTR [rcx+248], r10
-        ; rax = 0 - borrow
-        sbb	rax, rax
-        ret
-sp_2048_sub_32 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Multiply a 2048-bit number by one 64-bit digit. (r = a * b)
-;  *
-;  * r  Result, 33 64-bit words (pointer in rcx).
-;  * a  Multiplicand, 32 64-bit words (pointer in rdx).
-;  * b  Multiplier digit (value in r8).
-;  *
-;  * MULX takes its implicit operand from rdx and leaves the flags alone,
-;  * so a is moved to rax and b to rdx. Low product halves are accumulated
-;  * through the CF chain (ADCX) and high halves through the OF chain (ADOX).
-;  * r13 stays zero for the whole routine; r12 and r13 are saved/restored.
-;  */
-; One product word: multiply a[byteOff/8] by the digit, add the low half to
-; the running word in accCur (CF chain), start the next word in accNxt from
-; the high half (OF chain) and store the finished word.
-SP_2048_MUL_D_WORD MACRO accCur, accNxt, byteOff
-        mulx	r10, r9, QWORD PTR [rax+byteOff]
-        mov	accNxt, r13
-        adcx	accCur, r9
-        adox	accNxt, r10
-        mov	QWORD PTR [rcx+byteOff], accCur
-ENDM
-_text SEGMENT READONLY PARA
-sp_2048_mul_d_avx2_32 PROC
-        push	r12
-        push	r13
-        mov	rax, rdx
-        ; A[0] * B
-        mov	rdx, r8
-        ; zero r13; XOR also clears CF and OF before the two chains start
-        xor	r13, r13
-        mulx	r12, r11, QWORD PTR [rax]
-        mov	QWORD PTR [rcx], r11
-        ; A[1] * B .. A[30] * B, two words per pass (r12 and r11 swap roles)
-sp_2048_mul_d_avx2_32_off = 8
-REPEAT 15
-        SP_2048_MUL_D_WORD r12, r11, sp_2048_mul_d_avx2_32_off
-        SP_2048_MUL_D_WORD r11, r12, sp_2048_mul_d_avx2_32_off+8
-sp_2048_mul_d_avx2_32_off = sp_2048_mul_d_avx2_32_off + 16
-ENDM
-        ; A[31] * B: same shape, plus the last CF folded into the top word
-        mulx	r10, r9, QWORD PTR [rax+248]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        adcx	r11, r13
-        mov	QWORD PTR [rcx+248], r12
-        mov	QWORD PTR [rcx+256], r11
-        pop	r13
-        pop	r12
-        ret
-sp_2048_mul_d_avx2_32 ENDP
-_text ENDS
-ENDIF
-IFDEF _WIN64
-; /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
-;  *
-;  * d1   The high order half of the number to divide (value in rcx).
-;  * d0   The low order half of the number to divide (value in rdx).
-;  * div  The divisor (value in r8).
-;  * returns the quotient in rax; DIV leaves the remainder in rdx.
-;  *
-;  * DIV raises a divide error when div is zero or when the quotient does
-;  * not fit in 64 bits, so the caller must ensure d1 < div.
-;  */
-_text SEGMENT READONLY PARA
-div_2048_word_asm_32 PROC
-        ; DIV divides rdx:rax, so move d0 into rax before d1 replaces rdx
-        mov	rax, rdx
-        mov	rdx, rcx
-        div	r8
-        ret
-div_2048_word_asm_32 ENDP
-_text ENDS
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-; /* Conditionally subtract b from a using the mask m. (r = a - (b & m))
-;  * m is -1 to subtract b and 0 to subtract nothing (r then receives a).
-;  *
-;  * r  A single precision number receiving the result (pointer in rcx).
-;  * a  A single precision number to subtract from (pointer in rdx).
-;  * b  A single precision number to subtract (pointer in r8).
-;  * m  Mask value to apply (value in r9).
-;  * returns 0 in rax when no borrow leaves the top word, otherwise -1.
-;  *
-;  * PEXT does the masking: with an all-ones mask it returns the word
-;  * unchanged and with a zero mask it returns 0. Unlike AND it does not
-;  * modify the flags, so the SBB borrow chain is not disturbed. Other mask
-;  * values would extract scattered bits and are not meaningful here.
-;  */
-; One link of the borrow chain: load b[byteOff/8] and a[byteOff/8], mask the
-; b word, store the previous result word held in regPrev, then subtract
-; with borrow leaving the new result word in regA.
-SP_2048_COND_SUB_AVX2_WORD MACRO regB, regA, regPrev, byteOff
-        mov	regB, QWORD PTR [r8+byteOff]
-        mov	regA, QWORD PTR [rdx+byteOff]
-        pext	regB, regB, r9
-        mov	QWORD PTR [rcx+byteOff-8], regPrev
-        sbb	regA, regB
-ENDM
-_text SEGMENT READONLY PARA
-sp_2048_cond_sub_avx2_32 PROC
-        push	r12
-        ; word 0 opens the chain with a plain SUB
-        mov	r12, QWORD PTR [r8]
-        mov	r10, QWORD PTR [rdx]
-        pext	r12, r12, r9
-        sub	r10, r12
-        ; words 1..30: r12, r10 and r11 rotate roles with a period of three
-sp_2048_cond_sub_avx2_32_off = 8
-REPEAT 10
-        SP_2048_COND_SUB_AVX2_WORD r12, r11, r10, sp_2048_cond_sub_avx2_32_off
-        SP_2048_COND_SUB_AVX2_WORD r10, r12, r11, sp_2048_cond_sub_avx2_32_off+8
-        SP_2048_COND_SUB_AVX2_WORD r11, r10, r12, sp_2048_cond_sub_avx2_32_off+16
-sp_2048_cond_sub_avx2_32_off = sp_2048_cond_sub_avx2_32_off + 24
-ENDM
-        ; word 31, then flush the last result word
-        SP_2048_COND_SUB_AVX2_WORD r12, r11, r10, 248
-        mov	QWORD PTR [rcx+248], r11
-        ; rax = 0 - borrow
-        sbb	rax, rax
-        pop	r12
-        ret
-sp_2048_cond_sub_avx2_32 ENDP
-_text ENDS
-ENDIF
-; /* Compare a with b in constant time.
-;  *
-;  * Both operands are read as 32 64-bit limbs (byte offsets 0..248). Limbs are
-;  * compared starting at offset 248 and working down to offset 0, so the limb
-;  * at the highest offset is treated as the most significant one.
-;  * There are no branches: every limb of both operands is always loaded and
-;  * the result is selected with conditional moves only.
-;  *
-;  * a  A single precision integer (pointer in rcx).
-;  * b  A single precision integer (pointer in rdx).
-;  * return -ve, 0 or +ve if a is less than, equal to or greater than b
-;  * respectively (rax is -1, 0 or 1).
-;  *
-;  * Overwrites r8, r9, r10, r11 and the flags; r12 is saved and restored.
-;  * NOTE(review): rcx/rdx as the first two arguments matches the Microsoft
-;  * x64 calling convention - confirm against the C prototype.
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_cmp_32 PROC
-        ; r12 is used as scratch below, so keep the caller's value.
-        push	r12
-        ; r9  = 0, the value the mask is switched to after a difference.
-        xor	r9, r9
-        ; r8  = limb mask: all ones until the first differing limb is seen.
-        mov	r8, -1
-        ; rax = running result; stays -1 while all limbs so far are equal.
-        mov	rax, -1
-        ; r10 = 1, the result used when a is greater than b.
-        mov	r10, 1
-        ; Limb at offset 248 (compared first).
-        mov	r11, QWORD PTR [rcx+248]
-        mov	r12, QWORD PTR [rdx+248]
-        ; Once the mask is 0 both limbs become 0 and compare as equal, so
-        ; later limbs can no longer change the result.
-        and	r11, r8
-        and	r12, r8
-        ; Flags from a-limb minus b-limb drive the three cmovs below.
-        sub	r11, r12
-        ; a-limb > b-limb (unsigned): result = 1.
-        cmova	rax, r10
-        ; a-limb < b-limb (borrow): result = r8, which is still -1 here
-        ; because the mask is only cleared by the next instruction.
-        cmovc	rax, r8
-        ; Limbs differ: clear the mask so the remaining limbs are ignored.
-        cmovnz	r8, r9
-        ; Limb at offset 240: same sequence as above.
-        mov	r11, QWORD PTR [rcx+240]
-        mov	r12, QWORD PTR [rdx+240]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 232.
-        mov	r11, QWORD PTR [rcx+232]
-        mov	r12, QWORD PTR [rdx+232]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 224.
-        mov	r11, QWORD PTR [rcx+224]
-        mov	r12, QWORD PTR [rdx+224]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 216.
-        mov	r11, QWORD PTR [rcx+216]
-        mov	r12, QWORD PTR [rdx+216]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 208.
-        mov	r11, QWORD PTR [rcx+208]
-        mov	r12, QWORD PTR [rdx+208]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 200.
-        mov	r11, QWORD PTR [rcx+200]
-        mov	r12, QWORD PTR [rdx+200]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 192.
-        mov	r11, QWORD PTR [rcx+192]
-        mov	r12, QWORD PTR [rdx+192]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 184.
-        mov	r11, QWORD PTR [rcx+184]
-        mov	r12, QWORD PTR [rdx+184]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 176.
-        mov	r11, QWORD PTR [rcx+176]
-        mov	r12, QWORD PTR [rdx+176]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 168.
-        mov	r11, QWORD PTR [rcx+168]
-        mov	r12, QWORD PTR [rdx+168]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 160.
-        mov	r11, QWORD PTR [rcx+160]
-        mov	r12, QWORD PTR [rdx+160]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 152.
-        mov	r11, QWORD PTR [rcx+152]
-        mov	r12, QWORD PTR [rdx+152]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 144.
-        mov	r11, QWORD PTR [rcx+144]
-        mov	r12, QWORD PTR [rdx+144]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 136.
-        mov	r11, QWORD PTR [rcx+136]
-        mov	r12, QWORD PTR [rdx+136]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 128.
-        mov	r11, QWORD PTR [rcx+128]
-        mov	r12, QWORD PTR [rdx+128]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 120.
-        mov	r11, QWORD PTR [rcx+120]
-        mov	r12, QWORD PTR [rdx+120]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 112.
-        mov	r11, QWORD PTR [rcx+112]
-        mov	r12, QWORD PTR [rdx+112]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 104.
-        mov	r11, QWORD PTR [rcx+104]
-        mov	r12, QWORD PTR [rdx+104]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 96.
-        mov	r11, QWORD PTR [rcx+96]
-        mov	r12, QWORD PTR [rdx+96]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 88.
-        mov	r11, QWORD PTR [rcx+88]
-        mov	r12, QWORD PTR [rdx+88]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 80.
-        mov	r11, QWORD PTR [rcx+80]
-        mov	r12, QWORD PTR [rdx+80]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 72.
-        mov	r11, QWORD PTR [rcx+72]
-        mov	r12, QWORD PTR [rdx+72]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 64.
-        mov	r11, QWORD PTR [rcx+64]
-        mov	r12, QWORD PTR [rdx+64]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 56.
-        mov	r11, QWORD PTR [rcx+56]
-        mov	r12, QWORD PTR [rdx+56]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 48.
-        mov	r11, QWORD PTR [rcx+48]
-        mov	r12, QWORD PTR [rdx+48]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 40.
-        mov	r11, QWORD PTR [rcx+40]
-        mov	r12, QWORD PTR [rdx+40]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 32.
-        mov	r11, QWORD PTR [rcx+32]
-        mov	r12, QWORD PTR [rdx+32]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 24.
-        mov	r11, QWORD PTR [rcx+24]
-        mov	r12, QWORD PTR [rdx+24]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 16.
-        mov	r11, QWORD PTR [rcx+16]
-        mov	r12, QWORD PTR [rdx+16]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 8.
-        mov	r11, QWORD PTR [rcx+8]
-        mov	r12, QWORD PTR [rdx+8]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; Limb at offset 0 (compared last).
-        mov	r11, QWORD PTR [rcx]
-        mov	r12, QWORD PTR [rdx]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; If no limb differed, r8 and rax are both still -1 and the xor
-        ; gives 0; otherwise r8 is 0 and rax keeps its 1 or -1.
-        xor	rax, r8
-        ; Restore the caller's r12.
-        pop	r12
-        ret
-sp_2048_cmp_32 ENDP
-_text ENDS
-IFNDEF WC_NO_CACHE_RESISTANT
-_text SEGMENT READONLY PARA
-sp_2048_get_from_table_32 PROC
-        sub	rsp, 128
-        vmovdqu	OWORD PTR [rsp], xmm6
-        vmovdqu	OWORD PTR [rsp+16], xmm7
-        vmovdqu	OWORD PTR [rsp+32], xmm8
-        vmovdqu	OWORD PTR [rsp+48], xmm9
-        vmovdqu	OWORD PTR [rsp+64], xmm10
-        vmovdqu	OWORD PTR [rsp+80], xmm11
-        vmovdqu	OWORD PTR [rsp+96], xmm12
-        vmovdqu	OWORD PTR [rsp+112], xmm13
-        mov	rax, 1
-        movd	xmm10, r8
-        movd	xmm11, rax
-        pxor	xmm13, xmm13
-        pshufd	xmm11, xmm11, 0
-        pshufd	xmm10, xmm10, 0
-        ; START: 0-7
-        pxor	xmm13, xmm13
-        pxor	xmm4, xmm4
-        pxor	xmm5, xmm5
-        pxor	xmm6, xmm6
-        pxor	xmm7, xmm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 16
-        mov	r9, QWORD PTR [rdx+128]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 17
-        mov	r9, QWORD PTR [rdx+136]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 18
-        mov	r9, QWORD PTR [rdx+144]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 19
-        mov	r9, QWORD PTR [rdx+152]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 20
-        mov	r9, QWORD PTR [rdx+160]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 21
-        mov	r9, QWORD PTR [rdx+168]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 22
-        mov	r9, QWORD PTR [rdx+176]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 23
-        mov	r9, QWORD PTR [rdx+184]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 24
-        mov	r9, QWORD PTR [rdx+192]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 25
-        mov	r9, QWORD PTR [rdx+200]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 26
-        mov	r9, QWORD PTR [rdx+208]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 27
-        mov	r9, QWORD PTR [rdx+216]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 28
-        mov	r9, QWORD PTR [rdx+224]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 29
-        mov	r9, QWORD PTR [rdx+232]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 30
-        mov	r9, QWORD PTR [rdx+240]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 31
-        mov	r9, QWORD PTR [rdx+248]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 32
-        mov	r9, QWORD PTR [rdx+256]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 33
-        mov	r9, QWORD PTR [rdx+264]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 34
-        mov	r9, QWORD PTR [rdx+272]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 35
-        mov	r9, QWORD PTR [rdx+280]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 36
-        mov	r9, QWORD PTR [rdx+288]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 37
-        mov	r9, QWORD PTR [rdx+296]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 38
-        mov	r9, QWORD PTR [rdx+304]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 39
-        mov	r9, QWORD PTR [rdx+312]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 40
-        mov	r9, QWORD PTR [rdx+320]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 41
-        mov	r9, QWORD PTR [rdx+328]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 42
-        mov	r9, QWORD PTR [rdx+336]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 43
-        mov	r9, QWORD PTR [rdx+344]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 44
-        mov	r9, QWORD PTR [rdx+352]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 45
-        mov	r9, QWORD PTR [rdx+360]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 46
-        mov	r9, QWORD PTR [rdx+368]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 47
-        mov	r9, QWORD PTR [rdx+376]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 48
-        mov	r9, QWORD PTR [rdx+384]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 49
-        mov	r9, QWORD PTR [rdx+392]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 50
-        mov	r9, QWORD PTR [rdx+400]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 51
-        mov	r9, QWORD PTR [rdx+408]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 52
-        mov	r9, QWORD PTR [rdx+416]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 53
-        mov	r9, QWORD PTR [rdx+424]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 54
-        mov	r9, QWORD PTR [rdx+432]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 55
-        mov	r9, QWORD PTR [rdx+440]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 56
-        mov	r9, QWORD PTR [rdx+448]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 57
-        mov	r9, QWORD PTR [rdx+456]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 58
-        mov	r9, QWORD PTR [rdx+464]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 59
-        mov	r9, QWORD PTR [rdx+472]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 60
-        mov	r9, QWORD PTR [rdx+480]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 61
-        mov	r9, QWORD PTR [rdx+488]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 62
-        mov	r9, QWORD PTR [rdx+496]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 63
-        mov	r9, QWORD PTR [rdx+504]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        movdqu	[rcx], xmm4
-        movdqu	[rcx+16], xmm5
-        movdqu	[rcx+32], xmm6
-        movdqu	[rcx+48], xmm7
-        add	rcx, 64
-        ; END: 0-7
-        ; START: 8-15
-        pxor	xmm13, xmm13
-        pxor	xmm4, xmm4
-        pxor	xmm5, xmm5
-        pxor	xmm6, xmm6
-        pxor	xmm7, xmm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 16
-        mov	r9, QWORD PTR [rdx+128]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 17
-        mov	r9, QWORD PTR [rdx+136]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 18
-        mov	r9, QWORD PTR [rdx+144]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 19
-        mov	r9, QWORD PTR [rdx+152]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 20
-        mov	r9, QWORD PTR [rdx+160]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 21
-        mov	r9, QWORD PTR [rdx+168]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 22
-        mov	r9, QWORD PTR [rdx+176]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 23
-        mov	r9, QWORD PTR [rdx+184]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 24
-        mov	r9, QWORD PTR [rdx+192]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 25
-        mov	r9, QWORD PTR [rdx+200]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 26
-        mov	r9, QWORD PTR [rdx+208]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 27
-        mov	r9, QWORD PTR [rdx+216]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 28
-        mov	r9, QWORD PTR [rdx+224]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 29
-        mov	r9, QWORD PTR [rdx+232]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 30
-        mov	r9, QWORD PTR [rdx+240]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 31
-        mov	r9, QWORD PTR [rdx+248]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 32
-        mov	r9, QWORD PTR [rdx+256]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 33
-        mov	r9, QWORD PTR [rdx+264]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 34
-        mov	r9, QWORD PTR [rdx+272]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 35
-        mov	r9, QWORD PTR [rdx+280]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 36
-        mov	r9, QWORD PTR [rdx+288]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 37
-        mov	r9, QWORD PTR [rdx+296]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 38
-        mov	r9, QWORD PTR [rdx+304]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 39
-        mov	r9, QWORD PTR [rdx+312]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 40
-        mov	r9, QWORD PTR [rdx+320]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 41
-        mov	r9, QWORD PTR [rdx+328]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 42
-        mov	r9, QWORD PTR [rdx+336]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 43
-        mov	r9, QWORD PTR [rdx+344]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 44
-        mov	r9, QWORD PTR [rdx+352]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 45
-        mov	r9, QWORD PTR [rdx+360]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 46
-        mov	r9, QWORD PTR [rdx+368]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 47
-        mov	r9, QWORD PTR [rdx+376]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 48
-        mov	r9, QWORD PTR [rdx+384]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 49
-        mov	r9, QWORD PTR [rdx+392]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 50
-        mov	r9, QWORD PTR [rdx+400]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 51
-        mov	r9, QWORD PTR [rdx+408]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 52
-        mov	r9, QWORD PTR [rdx+416]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 53
-        mov	r9, QWORD PTR [rdx+424]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 54
-        mov	r9, QWORD PTR [rdx+432]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 55
-        mov	r9, QWORD PTR [rdx+440]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 56
-        mov	r9, QWORD PTR [rdx+448]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 57
-        mov	r9, QWORD PTR [rdx+456]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 58
-        mov	r9, QWORD PTR [rdx+464]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 59
-        mov	r9, QWORD PTR [rdx+472]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 60
-        mov	r9, QWORD PTR [rdx+480]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 61
-        mov	r9, QWORD PTR [rdx+488]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 62
-        mov	r9, QWORD PTR [rdx+496]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 63
-        mov	r9, QWORD PTR [rdx+504]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        movdqu	[rcx], xmm4
-        movdqu	[rcx+16], xmm5
-        movdqu	[rcx+32], xmm6
-        movdqu	[rcx+48], xmm7
-        add	rcx, 64
-        ; END: 8-15
-        ; START: 16-23
-        pxor	xmm13, xmm13
-        pxor	xmm4, xmm4
-        pxor	xmm5, xmm5
-        pxor	xmm6, xmm6
-        pxor	xmm7, xmm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 16
-        mov	r9, QWORD PTR [rdx+128]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 17
-        mov	r9, QWORD PTR [rdx+136]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 18
-        mov	r9, QWORD PTR [rdx+144]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 19
-        mov	r9, QWORD PTR [rdx+152]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 20
-        mov	r9, QWORD PTR [rdx+160]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 21
-        mov	r9, QWORD PTR [rdx+168]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 22
-        mov	r9, QWORD PTR [rdx+176]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 23
-        mov	r9, QWORD PTR [rdx+184]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 24
-        mov	r9, QWORD PTR [rdx+192]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 25
-        mov	r9, QWORD PTR [rdx+200]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 26
-        mov	r9, QWORD PTR [rdx+208]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 27
-        mov	r9, QWORD PTR [rdx+216]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 28
-        mov	r9, QWORD PTR [rdx+224]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 29
-        mov	r9, QWORD PTR [rdx+232]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 30
-        mov	r9, QWORD PTR [rdx+240]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 31
-        mov	r9, QWORD PTR [rdx+248]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 32
-        mov	r9, QWORD PTR [rdx+256]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 33
-        mov	r9, QWORD PTR [rdx+264]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 34
-        mov	r9, QWORD PTR [rdx+272]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 35
-        mov	r9, QWORD PTR [rdx+280]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 36
-        mov	r9, QWORD PTR [rdx+288]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 37
-        mov	r9, QWORD PTR [rdx+296]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 38
-        mov	r9, QWORD PTR [rdx+304]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 39
-        mov	r9, QWORD PTR [rdx+312]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 40
-        mov	r9, QWORD PTR [rdx+320]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 41
-        mov	r9, QWORD PTR [rdx+328]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 42
-        mov	r9, QWORD PTR [rdx+336]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 43
-        mov	r9, QWORD PTR [rdx+344]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 44
-        mov	r9, QWORD PTR [rdx+352]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 45
-        mov	r9, QWORD PTR [rdx+360]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 46
-        mov	r9, QWORD PTR [rdx+368]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 47
-        mov	r9, QWORD PTR [rdx+376]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 48
-        mov	r9, QWORD PTR [rdx+384]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 49
-        mov	r9, QWORD PTR [rdx+392]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 50
-        mov	r9, QWORD PTR [rdx+400]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 51
-        mov	r9, QWORD PTR [rdx+408]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 52
-        mov	r9, QWORD PTR [rdx+416]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 53
-        mov	r9, QWORD PTR [rdx+424]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 54
-        mov	r9, QWORD PTR [rdx+432]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 55
-        mov	r9, QWORD PTR [rdx+440]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 56
-        mov	r9, QWORD PTR [rdx+448]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 57
-        mov	r9, QWORD PTR [rdx+456]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 58
-        mov	r9, QWORD PTR [rdx+464]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 59
-        mov	r9, QWORD PTR [rdx+472]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 60
-        mov	r9, QWORD PTR [rdx+480]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 61
-        mov	r9, QWORD PTR [rdx+488]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 62
-        mov	r9, QWORD PTR [rdx+496]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 63
-        mov	r9, QWORD PTR [rdx+504]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        movdqu	[rcx], xmm4
-        movdqu	[rcx+16], xmm5
-        movdqu	[rcx+32], xmm6
-        movdqu	[rcx+48], xmm7
-        add	rcx, 64
-        ; END: 16-23
-        ; START: 24-31
-        pxor	xmm13, xmm13
-        pxor	xmm4, xmm4
-        pxor	xmm5, xmm5
-        pxor	xmm6, xmm6
-        pxor	xmm7, xmm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 16
-        mov	r9, QWORD PTR [rdx+128]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 17
-        mov	r9, QWORD PTR [rdx+136]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 18
-        mov	r9, QWORD PTR [rdx+144]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 19
-        mov	r9, QWORD PTR [rdx+152]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 20
-        mov	r9, QWORD PTR [rdx+160]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 21
-        mov	r9, QWORD PTR [rdx+168]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 22
-        mov	r9, QWORD PTR [rdx+176]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 23
-        mov	r9, QWORD PTR [rdx+184]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 24
-        mov	r9, QWORD PTR [rdx+192]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 25
-        mov	r9, QWORD PTR [rdx+200]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 26
-        mov	r9, QWORD PTR [rdx+208]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 27
-        mov	r9, QWORD PTR [rdx+216]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 28
-        mov	r9, QWORD PTR [rdx+224]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 29
-        mov	r9, QWORD PTR [rdx+232]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 30
-        mov	r9, QWORD PTR [rdx+240]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 31
-        mov	r9, QWORD PTR [rdx+248]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 32
-        mov	r9, QWORD PTR [rdx+256]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 33
-        mov	r9, QWORD PTR [rdx+264]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 34
-        mov	r9, QWORD PTR [rdx+272]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 35
-        mov	r9, QWORD PTR [rdx+280]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 36
-        mov	r9, QWORD PTR [rdx+288]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 37
-        mov	r9, QWORD PTR [rdx+296]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 38
-        mov	r9, QWORD PTR [rdx+304]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 39
-        mov	r9, QWORD PTR [rdx+312]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 40
-        mov	r9, QWORD PTR [rdx+320]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 41
-        mov	r9, QWORD PTR [rdx+328]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 42
-        mov	r9, QWORD PTR [rdx+336]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 43
-        mov	r9, QWORD PTR [rdx+344]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 44
-        mov	r9, QWORD PTR [rdx+352]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 45
-        mov	r9, QWORD PTR [rdx+360]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 46
-        mov	r9, QWORD PTR [rdx+368]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 47
-        mov	r9, QWORD PTR [rdx+376]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 48
-        mov	r9, QWORD PTR [rdx+384]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 49
-        mov	r9, QWORD PTR [rdx+392]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 50
-        mov	r9, QWORD PTR [rdx+400]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 51
-        mov	r9, QWORD PTR [rdx+408]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 52
-        mov	r9, QWORD PTR [rdx+416]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 53
-        mov	r9, QWORD PTR [rdx+424]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 54
-        mov	r9, QWORD PTR [rdx+432]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 55
-        mov	r9, QWORD PTR [rdx+440]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 56
-        mov	r9, QWORD PTR [rdx+448]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 57
-        mov	r9, QWORD PTR [rdx+456]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 58
-        mov	r9, QWORD PTR [rdx+464]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 59
-        mov	r9, QWORD PTR [rdx+472]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 60
-        mov	r9, QWORD PTR [rdx+480]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 61
-        mov	r9, QWORD PTR [rdx+488]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 62
-        mov	r9, QWORD PTR [rdx+496]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 63
-        mov	r9, QWORD PTR [rdx+504]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        movdqu	[rcx], xmm4
-        movdqu	[rcx+16], xmm5
-        movdqu	[rcx+32], xmm6
-        movdqu	[rcx+48], xmm7
-        ; END: 24-31
-        vmovdqu	xmm6, OWORD PTR [rsp]
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vmovdqu	xmm8, OWORD PTR [rsp+32]
-        vmovdqu	xmm9, OWORD PTR [rsp+48]
-        vmovdqu	xmm10, OWORD PTR [rsp+64]
-        vmovdqu	xmm11, OWORD PTR [rsp+80]
-        vmovdqu	xmm12, OWORD PTR [rsp+96]
-        vmovdqu	xmm13, OWORD PTR [rsp+112]
-        add	rsp, 128
-        ret
-sp_2048_get_from_table_32 ENDP
-_text ENDS
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-; /* Reduce the number back to 2048 bits using Montgomery reduction (MULX/ADCX/ADOX/PEXT implementation).
-;  *
-;  * a   A single precision number to reduce in place. Arrives in rcx; words a[0..63] are read, the result is written to a[0..31].
-;  * m   The single precision number representing the modulus. Arrives in rdx; words m[0..31] are read.
-;  * mp  The digit representing the negative inverse of m mod 2^n. Arrives in r8.
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_mont_reduce_avx2_32 PROC
-        push	r12	; save every register used as scratch below; restored in reverse order before ret
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        push	rbx
-        push	rbp
-        mov	r9, rcx	; r9 = a
-        mov	r10, rdx	; r10 = m (rdx itself is needed as the implicit mulx multiplicand)
-        xor	rbp, rbp	; rbp = carry word handed from one outer iteration to the next, starts at 0
-        ; i = 32 (r11 counts the remaining outer iterations, one per word of m)
-        mov	r11, 32
-        mov	r14, QWORD PTR [r9]	; r14, r15, rdi, rsi cache a[i+0]..a[i+3] in registers
-        mov	r15, QWORD PTR [r9+8]
-        mov	rdi, QWORD PTR [r9+16]
-        mov	rsi, QWORD PTR [r9+24]
-        add	r9, 128	; bias r9 by 16 words: a[i+4]..a[i+32] are then addressed as [r9-96]..[r9+128]
-        xor	rbp, rbp	; repeats the clear of rbp done above
-L_2048_mont_reduce_avx2_32_loop:
-        ; mu = a[i] * mp
-        mov	rdx, r14
-        mov	r12, r14	; r12 = copy of a[i]; it only feeds the carry chain and is never written back
-        imul	rdx, r8	; rdx = low 64 bits of a[i] * mp
-        xor	rbx, rbx	; rbx = 0, and CF/OF are cleared before the adcx/adox chains start
-        ; a[i+0] += m[0] * mu
-        mulx	rcx, rax, QWORD PTR [r10]	; rcx:rax = m[0] * mu (mulx does not touch the flags)
-        mov	r14, r15	; slide the register window: the old a[i+1] becomes the next a[i]
-        adcx	r12, rax	; CF chain: adds the low product halves
-        adox	r14, rcx	; OF chain: adds the high product halves into the following word
-        ; a[i+1] += m[1] * mu
-        mulx	rcx, rax, QWORD PTR [r10+8]
-        mov	r15, rdi
-        adcx	r14, rax
-        adox	r15, rcx
-        ; a[i+2] += m[2] * mu
-        mulx	rcx, rax, QWORD PTR [r10+16]
-        mov	rdi, rsi
-        adcx	r15, rax
-        adox	rdi, rcx
-        ; a[i+3] += m[3] * mu
-        mulx	rcx, rax, QWORD PTR [r10+24]
-        mov	rsi, QWORD PTR [r9+-96]	; rsi = a[i+4], which enters the register window
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; a[i+4] += m[4] * mu
-        mulx	rcx, rax, QWORD PTR [r10+32]
-        mov	r13, QWORD PTR [r9+-88]	; from here on r12/r13 alternate as accumulators for words kept in memory
-        adcx	rsi, rax
-        adox	r13, rcx
-        ; a[i+5] += m[5] * mu
-        mulx	rcx, rax, QWORD PTR [r10+40]
-        mov	r12, QWORD PTR [r9+-80]	; overwrites the a[i] sum held in r12 (that word is dropped)
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-88], r13	; write back a[i+5]; the same load/add/add/store pattern repeats below
-        ; a[i+6] += m[6] * mu
-        mulx	rcx, rax, QWORD PTR [r10+48]
-        mov	r13, QWORD PTR [r9+-72]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-80], r12
-        ; a[i+7] += m[7] * mu
-        mulx	rcx, rax, QWORD PTR [r10+56]
-        mov	r12, QWORD PTR [r9+-64]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-72], r13
-        ; a[i+8] += m[8] * mu
-        mulx	rcx, rax, QWORD PTR [r10+64]
-        mov	r13, QWORD PTR [r9+-56]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-64], r12
-        ; a[i+9] += m[9] * mu
-        mulx	rcx, rax, QWORD PTR [r10+72]
-        mov	r12, QWORD PTR [r9+-48]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-56], r13
-        ; a[i+10] += m[10] * mu
-        mulx	rcx, rax, QWORD PTR [r10+80]
-        mov	r13, QWORD PTR [r9+-40]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-48], r12
-        ; a[i+11] += m[11] * mu
-        mulx	rcx, rax, QWORD PTR [r10+88]
-        mov	r12, QWORD PTR [r9+-32]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-40], r13
-        ; a[i+12] += m[12] * mu
-        mulx	rcx, rax, QWORD PTR [r10+96]
-        mov	r13, QWORD PTR [r9+-24]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-32], r12
-        ; a[i+13] += m[13] * mu
-        mulx	rcx, rax, QWORD PTR [r10+104]
-        mov	r12, QWORD PTR [r9+-16]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-24], r13
-        ; a[i+14] += m[14] * mu
-        mulx	rcx, rax, QWORD PTR [r10+112]
-        mov	r13, QWORD PTR [r9+-8]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-16], r12
-        ; a[i+15] += m[15] * mu
-        mulx	rcx, rax, QWORD PTR [r10+120]
-        mov	r12, QWORD PTR [r9]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-8], r13
-        ; a[i+16] += m[16] * mu
-        mulx	rcx, rax, QWORD PTR [r10+128]
-        mov	r13, QWORD PTR [r9+8]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9], r12
-        ; a[i+17] += m[17] * mu
-        mulx	rcx, rax, QWORD PTR [r10+136]
-        mov	r12, QWORD PTR [r9+16]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+8], r13
-        ; a[i+18] += m[18] * mu
-        mulx	rcx, rax, QWORD PTR [r10+144]
-        mov	r13, QWORD PTR [r9+24]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+16], r12
-        ; a[i+19] += m[19] * mu
-        mulx	rcx, rax, QWORD PTR [r10+152]
-        mov	r12, QWORD PTR [r9+32]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+24], r13
-        ; a[i+20] += m[20] * mu
-        mulx	rcx, rax, QWORD PTR [r10+160]
-        mov	r13, QWORD PTR [r9+40]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+32], r12
-        ; a[i+21] += m[21] * mu
-        mulx	rcx, rax, QWORD PTR [r10+168]
-        mov	r12, QWORD PTR [r9+48]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+40], r13
-        ; a[i+22] += m[22] * mu
-        mulx	rcx, rax, QWORD PTR [r10+176]
-        mov	r13, QWORD PTR [r9+56]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+48], r12
-        ; a[i+23] += m[23] * mu
-        mulx	rcx, rax, QWORD PTR [r10+184]
-        mov	r12, QWORD PTR [r9+64]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+56], r13
-        ; a[i+24] += m[24] * mu
-        mulx	rcx, rax, QWORD PTR [r10+192]
-        mov	r13, QWORD PTR [r9+72]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+64], r12
-        ; a[i+25] += m[25] * mu
-        mulx	rcx, rax, QWORD PTR [r10+200]
-        mov	r12, QWORD PTR [r9+80]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+72], r13
-        ; a[i+26] += m[26] * mu
-        mulx	rcx, rax, QWORD PTR [r10+208]
-        mov	r13, QWORD PTR [r9+88]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+80], r12
-        ; a[i+27] += m[27] * mu
-        mulx	rcx, rax, QWORD PTR [r10+216]
-        mov	r12, QWORD PTR [r9+96]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+88], r13
-        ; a[i+28] += m[28] * mu
-        mulx	rcx, rax, QWORD PTR [r10+224]
-        mov	r13, QWORD PTR [r9+104]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+96], r12
-        ; a[i+29] += m[29] * mu
-        mulx	rcx, rax, QWORD PTR [r10+232]
-        mov	r12, QWORD PTR [r9+112]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+104], r13
-        ; a[i+30] += m[30] * mu
-        mulx	rcx, rax, QWORD PTR [r10+240]
-        mov	r13, QWORD PTR [r9+120]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+112], r12
-        ; a[i+31] += m[31] * mu
-        mulx	rcx, rax, QWORD PTR [r10+248]
-        mov	r12, QWORD PTR [r9+128]	; r12 = a[i+32], the word that absorbs the final high half
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+120], r13
-        adcx	r12, rbp	; a[i+32] += carry word left by the previous iteration (plus the pending CF)
-        mov	rbp, rbx	; rbp = 0; mov leaves CF and OF as they are
-        mov	QWORD PTR [r9+128], r12	; write back a[i+32]
-        adox	rbp, rbx	; rbp += OF
-        adcx	rbp, rbx	; rbp += CF; rbp is now the carry word for the next iteration
-        ; a += 1 (advance one 64-bit word)
-        add	r9, 8
-        ; i -= 1
-        sub	r11, 1
-        jnz	L_2048_mont_reduce_avx2_32_loop
-        sub	r9, 128	; undo the 16-word bias: r9 = &a[32]
-        neg	rbp	; rbp = 0 - carry, used as the pext mask below (all ones when the carry is 1)
-        mov	r8, r9	; r8 = &a[32], source of the upper half (mp is no longer needed)
-        sub	r9, 256	; r9 = &a[0], destination of the result
-        mov	rcx, QWORD PTR [r10]
-        mov	rdx, r14	; a[32]..a[35] are still cached in r14, r15, rdi, rsi
-        pext	rcx, rcx, rbp	; rcx = m[0] when the mask is all ones, 0 when the mask is 0
-        sub	rdx, rcx	; starts the borrow chain; every later word uses sbb
-        mov	rcx, QWORD PTR [r10+8]
-        mov	rax, r15
-        pext	rcx, rcx, rbp	; mov and pext leave CF untouched, so the borrow survives between sub/sbb steps
-        mov	QWORD PTR [r9], rdx	; a[0] = a[32] - (masked m[0])
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+16]
-        mov	rcx, rdi
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+8], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+24]
-        mov	rdx, rsi
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+16], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+32]
-        mov	rax, QWORD PTR [r8+32]	; from a[36] on, the upper-half words are read from memory
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+24], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+40]
-        mov	rcx, QWORD PTR [r8+40]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+32], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+48]
-        mov	rdx, QWORD PTR [r8+48]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+40], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+56]
-        mov	rax, QWORD PTR [r8+56]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+48], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+64]
-        mov	rcx, QWORD PTR [r8+64]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+56], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+72]
-        mov	rdx, QWORD PTR [r8+72]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+64], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+80]
-        mov	rax, QWORD PTR [r8+80]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+72], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+88]
-        mov	rcx, QWORD PTR [r8+88]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+80], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+96]
-        mov	rdx, QWORD PTR [r8+96]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+88], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+104]
-        mov	rax, QWORD PTR [r8+104]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+96], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+112]
-        mov	rcx, QWORD PTR [r8+112]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+104], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+120]
-        mov	rdx, QWORD PTR [r8+120]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+112], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+128]
-        mov	rax, QWORD PTR [r8+128]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+120], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+136]
-        mov	rcx, QWORD PTR [r8+136]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+128], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+144]
-        mov	rdx, QWORD PTR [r8+144]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+136], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+152]
-        mov	rax, QWORD PTR [r8+152]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+144], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+160]
-        mov	rcx, QWORD PTR [r8+160]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+152], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+168]
-        mov	rdx, QWORD PTR [r8+168]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+160], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+176]
-        mov	rax, QWORD PTR [r8+176]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+168], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+184]
-        mov	rcx, QWORD PTR [r8+184]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+176], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+192]
-        mov	rdx, QWORD PTR [r8+192]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+184], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+200]
-        mov	rax, QWORD PTR [r8+200]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+192], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+208]
-        mov	rcx, QWORD PTR [r8+208]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+200], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+216]
-        mov	rdx, QWORD PTR [r8+216]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+208], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+224]
-        mov	rax, QWORD PTR [r8+224]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+216], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+232]
-        mov	rcx, QWORD PTR [r8+232]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+224], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+240]
-        mov	rdx, QWORD PTR [r8+240]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+232], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+248]
-        mov	rax, QWORD PTR [r8+248]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+240], rdx
-        sbb	rax, rcx
-        mov	QWORD PTR [r9+248], rax	; a[31] = a[63] - (masked m[31]) - borrow; the final borrow is not used
-        pop	rbp	; restore the saved registers in reverse push order
-        pop	rbx
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_2048_mont_reduce_avx2_32 ENDP
-_text ENDS
-ENDIF
-IFNDEF WC_NO_CACHE_RESISTANT
-_text SEGMENT READONLY PARA
-sp_2048_get_from_table_avx2_32 PROC
-        sub	rsp, 128
-        vmovdqu	OWORD PTR [rsp], xmm6
-        vmovdqu	OWORD PTR [rsp+16], xmm7
-        vmovdqu	OWORD PTR [rsp+32], xmm8
-        vmovdqu	OWORD PTR [rsp+48], xmm9
-        vmovdqu	OWORD PTR [rsp+64], xmm10
-        vmovdqu	OWORD PTR [rsp+80], xmm11
-        vmovdqu	OWORD PTR [rsp+96], xmm12
-        vmovdqu	OWORD PTR [rsp+112], xmm13
-        mov	rax, 1
-        movd	xmm10, r8
-        movd	xmm11, rax
-        vpxor	ymm13, ymm13, ymm13
-        vpermd	ymm10, ymm13, ymm10
-        vpermd	ymm11, ymm13, ymm11
-        ; START: 0-15
-        vpxor	ymm13, ymm13, ymm13
-        vpxor	ymm4, ymm4, ymm4
-        vpxor	ymm5, ymm5, ymm5
-        vpxor	ymm6, ymm6, ymm6
-        vpxor	ymm7, ymm7, ymm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 16
-        mov	r9, QWORD PTR [rdx+128]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 17
-        mov	r9, QWORD PTR [rdx+136]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 18
-        mov	r9, QWORD PTR [rdx+144]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 19
-        mov	r9, QWORD PTR [rdx+152]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 20
-        mov	r9, QWORD PTR [rdx+160]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 21
-        mov	r9, QWORD PTR [rdx+168]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 22
-        mov	r9, QWORD PTR [rdx+176]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 23
-        mov	r9, QWORD PTR [rdx+184]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 24
-        mov	r9, QWORD PTR [rdx+192]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 25
-        mov	r9, QWORD PTR [rdx+200]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 26
-        mov	r9, QWORD PTR [rdx+208]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 27
-        mov	r9, QWORD PTR [rdx+216]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 28
-        mov	r9, QWORD PTR [rdx+224]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 29
-        mov	r9, QWORD PTR [rdx+232]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 30
-        mov	r9, QWORD PTR [rdx+240]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 31
-        mov	r9, QWORD PTR [rdx+248]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 32
-        mov	r9, QWORD PTR [rdx+256]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 33
-        mov	r9, QWORD PTR [rdx+264]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 34
-        mov	r9, QWORD PTR [rdx+272]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 35
-        mov	r9, QWORD PTR [rdx+280]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 36
-        mov	r9, QWORD PTR [rdx+288]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 37
-        mov	r9, QWORD PTR [rdx+296]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 38
-        mov	r9, QWORD PTR [rdx+304]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 39
-        mov	r9, QWORD PTR [rdx+312]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 40
-        mov	r9, QWORD PTR [rdx+320]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 41
-        mov	r9, QWORD PTR [rdx+328]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 42
-        mov	r9, QWORD PTR [rdx+336]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 43
-        mov	r9, QWORD PTR [rdx+344]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 44
-        mov	r9, QWORD PTR [rdx+352]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 45
-        mov	r9, QWORD PTR [rdx+360]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 46
-        mov	r9, QWORD PTR [rdx+368]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 47
-        mov	r9, QWORD PTR [rdx+376]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 48
-        mov	r9, QWORD PTR [rdx+384]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 49
-        mov	r9, QWORD PTR [rdx+392]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 50
-        mov	r9, QWORD PTR [rdx+400]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 51
-        mov	r9, QWORD PTR [rdx+408]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 52
-        mov	r9, QWORD PTR [rdx+416]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 53
-        mov	r9, QWORD PTR [rdx+424]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 54
-        mov	r9, QWORD PTR [rdx+432]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 55
-        mov	r9, QWORD PTR [rdx+440]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 56
-        mov	r9, QWORD PTR [rdx+448]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 57
-        mov	r9, QWORD PTR [rdx+456]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 58
-        mov	r9, QWORD PTR [rdx+464]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 59
-        mov	r9, QWORD PTR [rdx+472]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 60
-        mov	r9, QWORD PTR [rdx+480]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 61
-        mov	r9, QWORD PTR [rdx+488]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 62
-        mov	r9, QWORD PTR [rdx+496]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 63
-        mov	r9, QWORD PTR [rdx+504]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        vmovdqu	YMMWORD PTR [rcx], ymm4
-        vmovdqu	YMMWORD PTR [rcx+32], ymm5
-        vmovdqu	YMMWORD PTR [rcx+64], ymm6
-        vmovdqu	YMMWORD PTR [rcx+96], ymm7
-        add	rcx, 128
-        ; END: 0-15
-        ; START: 16-31
-        vpxor	ymm13, ymm13, ymm13
-        vpxor	ymm4, ymm4, ymm4
-        vpxor	ymm5, ymm5, ymm5
-        vpxor	ymm6, ymm6, ymm6
-        vpxor	ymm7, ymm7, ymm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 16
-        mov	r9, QWORD PTR [rdx+128]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 17
-        mov	r9, QWORD PTR [rdx+136]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 18
-        mov	r9, QWORD PTR [rdx+144]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 19
-        mov	r9, QWORD PTR [rdx+152]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 20
-        mov	r9, QWORD PTR [rdx+160]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 21
-        mov	r9, QWORD PTR [rdx+168]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 22
-        mov	r9, QWORD PTR [rdx+176]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 23
-        mov	r9, QWORD PTR [rdx+184]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 24
-        mov	r9, QWORD PTR [rdx+192]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 25
-        mov	r9, QWORD PTR [rdx+200]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 26
-        mov	r9, QWORD PTR [rdx+208]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 27
-        mov	r9, QWORD PTR [rdx+216]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 28
-        mov	r9, QWORD PTR [rdx+224]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 29
-        mov	r9, QWORD PTR [rdx+232]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 30
-        mov	r9, QWORD PTR [rdx+240]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 31
-        mov	r9, QWORD PTR [rdx+248]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 32
-        mov	r9, QWORD PTR [rdx+256]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 33
-        mov	r9, QWORD PTR [rdx+264]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 34
-        mov	r9, QWORD PTR [rdx+272]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 35
-        mov	r9, QWORD PTR [rdx+280]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 36
-        mov	r9, QWORD PTR [rdx+288]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 37
-        mov	r9, QWORD PTR [rdx+296]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 38
-        mov	r9, QWORD PTR [rdx+304]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 39
-        mov	r9, QWORD PTR [rdx+312]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 40
-        mov	r9, QWORD PTR [rdx+320]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 41
-        mov	r9, QWORD PTR [rdx+328]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 42
-        mov	r9, QWORD PTR [rdx+336]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 43
-        mov	r9, QWORD PTR [rdx+344]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 44
-        mov	r9, QWORD PTR [rdx+352]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 45
-        mov	r9, QWORD PTR [rdx+360]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 46
-        mov	r9, QWORD PTR [rdx+368]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 47
-        mov	r9, QWORD PTR [rdx+376]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 48
-        mov	r9, QWORD PTR [rdx+384]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 49
-        mov	r9, QWORD PTR [rdx+392]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 50
-        mov	r9, QWORD PTR [rdx+400]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 51
-        mov	r9, QWORD PTR [rdx+408]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 52
-        mov	r9, QWORD PTR [rdx+416]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 53
-        mov	r9, QWORD PTR [rdx+424]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 54
-        mov	r9, QWORD PTR [rdx+432]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 55
-        mov	r9, QWORD PTR [rdx+440]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 56
-        mov	r9, QWORD PTR [rdx+448]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 57
-        mov	r9, QWORD PTR [rdx+456]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 58
-        mov	r9, QWORD PTR [rdx+464]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 59
-        mov	r9, QWORD PTR [rdx+472]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 60
-        mov	r9, QWORD PTR [rdx+480]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 61
-        mov	r9, QWORD PTR [rdx+488]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 62
-        mov	r9, QWORD PTR [rdx+496]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 63
-        mov	r9, QWORD PTR [rdx+504]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        vmovdqu	YMMWORD PTR [rcx], ymm4
-        vmovdqu	YMMWORD PTR [rcx+32], ymm5
-        vmovdqu	YMMWORD PTR [rcx+64], ymm6
-        vmovdqu	YMMWORD PTR [rcx+96], ymm7
-        ; END: 16-31
-        vmovdqu	xmm6, OWORD PTR [rsp]
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vmovdqu	xmm8, OWORD PTR [rsp+32]
-        vmovdqu	xmm9, OWORD PTR [rsp+48]
-        vmovdqu	xmm10, OWORD PTR [rsp+64]
-        vmovdqu	xmm11, OWORD PTR [rsp+80]
-        vmovdqu	xmm12, OWORD PTR [rsp+96]
-        vmovdqu	xmm13, OWORD PTR [rsp+112]
-        add	rsp, 128
-        ret
-sp_2048_get_from_table_avx2_32 ENDP
-_text ENDS
-ENDIF
-; /* Conditionally add a and b using the mask m.
-;  * m is -1 to add and 0 when not.
-;  *
-;  * Computes r = a + (b & m) over 16 64-bit words (byte offsets 0..120)
-;  * and returns the carry out of the top word (0 or 1) in rax.
-;  *
-;  * r  A single precision number representing conditional add result. (rcx)
-;  * a  A single precision number to add with. (rdx)
-;  * b  A single precision number to add. (r8)
-;  * m  Mask value to apply. (r9)
-;  *
-;  * Scratch: rax, r8, r10, r11, the flags and 128 bytes of stack.
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_cond_add_16 PROC
-        ; Reserve a 128-byte temporary (16 words) for the masked copy of b.
-        ; NOTE(review): rsp is adjusted here without PROC FRAME/.allocstack
-        ; unwind directives - confirm that is acceptable for the target.
-        sub	rsp, 128
-        ; rax collects the final carry; it is only touched again by the
-        ; closing "adc rax, 0".
-        mov	rax, 0
-        ; Stage 1: tmp[i] = b[i] & m, two words per step.
-        ; "and" rewrites the flags, so all masking is completed here, before
-        ; the carry chain of stage 2 starts.
-        mov	r10, QWORD PTR [r8]
-        mov	r11, QWORD PTR [r8+8]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp], r10
-        mov	QWORD PTR [rsp+8], r11
-        mov	r10, QWORD PTR [r8+16]
-        mov	r11, QWORD PTR [r8+24]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+16], r10
-        mov	QWORD PTR [rsp+24], r11
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+32], r10
-        mov	QWORD PTR [rsp+40], r11
-        mov	r10, QWORD PTR [r8+48]
-        mov	r11, QWORD PTR [r8+56]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+48], r10
-        mov	QWORD PTR [rsp+56], r11
-        mov	r10, QWORD PTR [r8+64]
-        mov	r11, QWORD PTR [r8+72]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+64], r10
-        mov	QWORD PTR [rsp+72], r11
-        mov	r10, QWORD PTR [r8+80]
-        mov	r11, QWORD PTR [r8+88]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+80], r10
-        mov	QWORD PTR [rsp+88], r11
-        mov	r10, QWORD PTR [r8+96]
-        mov	r11, QWORD PTR [r8+104]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+96], r10
-        mov	QWORD PTR [rsp+104], r11
-        mov	r10, QWORD PTR [r8+112]
-        mov	r11, QWORD PTR [r8+120]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+112], r10
-        mov	QWORD PTR [rsp+120], r11
-        ; Stage 2: r[i] = a[i] + tmp[i] + carry.
-        ; r10 and r11 alternate as the accumulator; r8 is reused as a scratch
-        ; register because every word of b has already been read. Only mov
-        ; instructions sit between the add/adc steps, and mov leaves the
-        ; flags untouched, so the carry survives from one word to the next.
-        ; Each result word is stored one step after it is computed.
-        mov	r10, QWORD PTR [rdx]
-        mov	r8, QWORD PTR [rsp]
-        add	r10, r8
-        mov	r11, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [rsp+8]
-        adc	r11, r8
-        mov	QWORD PTR [rcx], r10
-        mov	r10, QWORD PTR [rdx+16]
-        mov	r8, QWORD PTR [rsp+16]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+8], r11
-        mov	r11, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [rsp+24]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+16], r10
-        mov	r10, QWORD PTR [rdx+32]
-        mov	r8, QWORD PTR [rsp+32]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+24], r11
-        mov	r11, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [rsp+40]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+32], r10
-        mov	r10, QWORD PTR [rdx+48]
-        mov	r8, QWORD PTR [rsp+48]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+40], r11
-        mov	r11, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [rsp+56]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+48], r10
-        mov	r10, QWORD PTR [rdx+64]
-        mov	r8, QWORD PTR [rsp+64]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+56], r11
-        mov	r11, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [rsp+72]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+64], r10
-        mov	r10, QWORD PTR [rdx+80]
-        mov	r8, QWORD PTR [rsp+80]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+72], r11
-        mov	r11, QWORD PTR [rdx+88]
-        mov	r8, QWORD PTR [rsp+88]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+80], r10
-        mov	r10, QWORD PTR [rdx+96]
-        mov	r8, QWORD PTR [rsp+96]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+88], r11
-        mov	r11, QWORD PTR [rdx+104]
-        mov	r8, QWORD PTR [rsp+104]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+96], r10
-        mov	r10, QWORD PTR [rdx+112]
-        mov	r8, QWORD PTR [rsp+112]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+104], r11
-        mov	r11, QWORD PTR [rdx+120]
-        mov	r8, QWORD PTR [rsp+120]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+112], r10
-        mov	QWORD PTR [rcx+120], r11
-        ; rax was zeroed on entry, so this leaves the carry out of the top
-        ; word (0 or 1) as the return value.
-        adc	rax, 0
-        ; Release the temporary; its contents are not cleared.
-        add	rsp, 128
-        ret
-sp_2048_cond_add_16 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Conditionally add a and b using the mask m.
-;  * m is -1 to add and 0 when not.
-;  *
-;  * Computes r = a + (b masked by m) over 16 64-bit words (byte offsets
-;  * 0..120) and returns the carry out of the top word (0 or 1) in rax.
-;  * The masking is done with pext: with m = -1 every bit of the word is
-;  * kept, with m = 0 the result is 0. For any other m, pext would pack the
-;  * selected bits together rather than compute b & m.
-;  *
-;  * r  A single precision number representing conditional add result. (rcx)
-;  * a  A single precision number to add with. (rdx)
-;  * b  A single precision number to add. (r8)
-;  * m  Mask value to apply. (r9)
-;  *
-;  * Scratch: rax, r10, r11 and the flags; r12 is saved and restored.
-;  *
-;  * NOTE(review): pext is a BMI2 instruction while this block is guarded by
-;  * HAVE_INTEL_AVX2 - presumably callers check for BMI2 as well; confirm.
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_cond_add_avx2_16 PROC
-        ; r12 is callee-saved; it is used below as a third scratch register.
-        push	r12
-        ; rax collects the final carry; it is only touched again by the
-        ; closing "adc rax, 0".
-        mov	rax, 0
-        ; Word 0: plain add starts the carry chain.
-        mov	r12, QWORD PTR [r8]
-        mov	r10, QWORD PTR [rdx]
-        pext	r12, r12, r9
-        add	r10, r12
-        ; Words 1..15: r10, r11 and r12 rotate through three roles (masked
-        ; word of b, running sum, result waiting to be stored). Each result
-        ; is stored while the next pair of words is being prepared. Only mov
-        ; and pext sit between consecutive adc instructions; neither changes
-        ; the flags, so the carry survives from one word to the next.
-        mov	r12, QWORD PTR [r8+8]
-        mov	r11, QWORD PTR [rdx+8]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx], r10
-        adc	r11, r12
-        mov	r10, QWORD PTR [r8+16]
-        mov	r12, QWORD PTR [rdx+16]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+8], r11
-        adc	r12, r10
-        mov	r11, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [rdx+24]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+16], r12
-        adc	r10, r11
-        mov	r12, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [rdx+32]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+24], r10
-        adc	r11, r12
-        mov	r10, QWORD PTR [r8+40]
-        mov	r12, QWORD PTR [rdx+40]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+32], r11
-        adc	r12, r10
-        mov	r11, QWORD PTR [r8+48]
-        mov	r10, QWORD PTR [rdx+48]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+40], r12
-        adc	r10, r11
-        mov	r12, QWORD PTR [r8+56]
-        mov	r11, QWORD PTR [rdx+56]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+48], r10
-        adc	r11, r12
-        mov	r10, QWORD PTR [r8+64]
-        mov	r12, QWORD PTR [rdx+64]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+56], r11
-        adc	r12, r10
-        mov	r11, QWORD PTR [r8+72]
-        mov	r10, QWORD PTR [rdx+72]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+64], r12
-        adc	r10, r11
-        mov	r12, QWORD PTR [r8+80]
-        mov	r11, QWORD PTR [rdx+80]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+72], r10
-        adc	r11, r12
-        mov	r10, QWORD PTR [r8+88]
-        mov	r12, QWORD PTR [rdx+88]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+80], r11
-        adc	r12, r10
-        mov	r11, QWORD PTR [r8+96]
-        mov	r10, QWORD PTR [rdx+96]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+88], r12
-        adc	r10, r11
-        mov	r12, QWORD PTR [r8+104]
-        mov	r11, QWORD PTR [rdx+104]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+96], r10
-        adc	r11, r12
-        mov	r10, QWORD PTR [r8+112]
-        mov	r12, QWORD PTR [rdx+112]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+104], r11
-        adc	r12, r10
-        mov	r11, QWORD PTR [r8+120]
-        mov	r10, QWORD PTR [rdx+120]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+112], r12
-        adc	r10, r11
-        mov	QWORD PTR [rcx+120], r10
-        ; rax was zeroed on entry, so this leaves the carry out of the top
-        ; word (0 or 1) as the return value.
-        adc	rax, 0
-        pop	r12
-        ret
-sp_2048_cond_add_avx2_16 ENDP
-_text ENDS
-ENDIF
-; /* Shift number left by n bits. (r = a << n)
-;  *
-;  * r  Result of left shift by n. Written as 33 64-bit words (offsets 0..256).
-;  * a  Number to shift. Read as 32 64-bit words (offsets 0..248).
-;  * n  Amount to shift, in bits. Only the low byte is used, through cl.
-;  *
-;  * Registers as used below: rcx = r, rdx = a, r8b = n.
-;  * The r pointer is copied to rax BEFORE cl is loaded with n: writing cl
-;  * replaces the low byte of rcx, so copying afterwards would corrupt the
-;  * destination address used by every store in this routine.
-;  */
-_text SEGMENT READONLY PARA
-sp_2048_lshift_32 PROC
-        push	r12
-        push	r13
-        mov	rax, rcx                ; rax = r, saved while rcx is still intact
-        mov	cl, r8b                 ; cl = n, the count used by shld/shl below
-        mov	r12, 0                  ; r12 collects the bits shifted out of the top word
-        mov	r13, QWORD PTR [rdx+216]
-        mov	r8, QWORD PTR [rdx+224]
-        mov	r9, QWORD PTR [rdx+232]
-        mov	r10, QWORD PTR [rdx+240]
-        mov	r11, QWORD PTR [rdx+248]
-        shld	r12, r11, cl            ; each shld pulls its low bits from the next lower word
-        shld	r11, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r13, cl             ; r13 stays unshifted; it is finished in the next group
-        mov	QWORD PTR [rax+224], r8
-        mov	QWORD PTR [rax+232], r9
-        mov	QWORD PTR [rax+240], r10
-        mov	QWORD PTR [rax+248], r11
-        mov	QWORD PTR [rax+256], r12 ; extra top word of the result
-        mov	r11, QWORD PTR [rdx+184]
-        mov	r8, QWORD PTR [rdx+192]
-        mov	r9, QWORD PTR [rdx+200]
-        mov	r10, QWORD PTR [rdx+208]
-        shld	r13, r10, cl            ; complete the word carried over from the group above
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r11, cl
-        mov	QWORD PTR [rax+192], r8
-        mov	QWORD PTR [rax+200], r9
-        mov	QWORD PTR [rax+208], r10
-        mov	QWORD PTR [rax+216], r13
-        mov	r13, QWORD PTR [rdx+152]
-        mov	r8, QWORD PTR [rdx+160]
-        mov	r9, QWORD PTR [rdx+168]
-        mov	r10, QWORD PTR [rdx+176]
-        shld	r11, r10, cl            ; r11 and r13 alternate as the carried-over word
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r13, cl
-        mov	QWORD PTR [rax+160], r8
-        mov	QWORD PTR [rax+168], r9
-        mov	QWORD PTR [rax+176], r10
-        mov	QWORD PTR [rax+184], r11
-        mov	r11, QWORD PTR [rdx+120]
-        mov	r8, QWORD PTR [rdx+128]
-        mov	r9, QWORD PTR [rdx+136]
-        mov	r10, QWORD PTR [rdx+144]
-        shld	r13, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r11, cl
-        mov	QWORD PTR [rax+128], r8
-        mov	QWORD PTR [rax+136], r9
-        mov	QWORD PTR [rax+144], r10
-        mov	QWORD PTR [rax+152], r13
-        mov	r13, QWORD PTR [rdx+88]
-        mov	r8, QWORD PTR [rdx+96]
-        mov	r9, QWORD PTR [rdx+104]
-        mov	r10, QWORD PTR [rdx+112]
-        shld	r11, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r13, cl
-        mov	QWORD PTR [rax+96], r8
-        mov	QWORD PTR [rax+104], r9
-        mov	QWORD PTR [rax+112], r10
-        mov	QWORD PTR [rax+120], r11
-        mov	r11, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [rdx+64]
-        mov	r9, QWORD PTR [rdx+72]
-        mov	r10, QWORD PTR [rdx+80]
-        shld	r13, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r11, cl
-        mov	QWORD PTR [rax+64], r8
-        mov	QWORD PTR [rax+72], r9
-        mov	QWORD PTR [rax+80], r10
-        mov	QWORD PTR [rax+88], r13
-        mov	r13, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [rdx+32]
-        mov	r9, QWORD PTR [rdx+40]
-        mov	r10, QWORD PTR [rdx+48]
-        shld	r11, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r13, cl
-        mov	QWORD PTR [rax+32], r8
-        mov	QWORD PTR [rax+40], r9
-        mov	QWORD PTR [rax+48], r10
-        mov	QWORD PTR [rax+56], r11
-        mov	r8, QWORD PTR [rdx]
-        mov	r9, QWORD PTR [rdx+8]
-        mov	r10, QWORD PTR [rdx+16]
-        shld	r13, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shl	r8, cl                  ; lowest word: zeros are shifted in from the right
-        mov	QWORD PTR [rax], r8
-        mov	QWORD PTR [rax+8], r9
-        mov	QWORD PTR [rax+16], r10
-        mov	QWORD PTR [rax+24], r13
-        pop	r13
-        pop	r12
-        ret
-sp_2048_lshift_32 ENDP
-_text ENDS
-ENDIF
-ENDIF
-IFNDEF WOLFSSL_SP_NO_3072
-IFNDEF WOLFSSL_SP_NO_3072
-; /* Read a big endian unsigned byte array into r as 64-bit words, lowest word first.
-;  * Uses the bswap instruction to reverse the bytes of each 8-byte group.
-;  *
-;  * r  A single precision integer (rcx). The tail is zero-filled up to r + 384 bytes.
-;  * size  Maximum number of bytes to convert (rdx). Not read by this routine.
-;  * a  Byte array (r8).
-;  * n  Number of bytes in array to read (r9).
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_from_bin_bswap PROC
-        push	r12
-        push	r13
-        mov	r11, r8
-        mov	r12, rcx
-        add	r11, r9                 ; r11 = a + n, one past the last input byte
-        add	r12, 384                ; r12 = r + 384, end of the output words
-        xor	r13, r13                ; r13 = 0, reused as the zero constant below
-        jmp	L_3072_from_bin_bswap_64_end
-L_3072_from_bin_bswap_64_start:         ; convert 64 input bytes into 8 output words
-        sub	r11, 64
-        mov	rax, QWORD PTR [r11+56] ; last 8 bytes of the chunk become the lowest word
-        mov	r10, QWORD PTR [r11+48]
-        bswap	rax
-        bswap	r10
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r10
-        mov	rax, QWORD PTR [r11+40]
-        mov	r10, QWORD PTR [r11+32]
-        bswap	rax
-        bswap	r10
-        mov	QWORD PTR [rcx+16], rax
-        mov	QWORD PTR [rcx+24], r10
-        mov	rax, QWORD PTR [r11+24]
-        mov	r10, QWORD PTR [r11+16]
-        bswap	rax
-        bswap	r10
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r10
-        mov	rax, QWORD PTR [r11+8]
-        mov	r10, QWORD PTR [r11]
-        bswap	rax
-        bswap	r10
-        mov	QWORD PTR [rcx+48], rax
-        mov	QWORD PTR [rcx+56], r10
-        add	rcx, 64
-        sub	r9, 64
-L_3072_from_bin_bswap_64_end:
-        cmp	r9, 63                  ; signed compare: loop while at least 64 bytes remain
-        jg	L_3072_from_bin_bswap_64_start
-        jmp	L_3072_from_bin_bswap_8_end
-L_3072_from_bin_bswap_8_start:          ; convert 8 input bytes into 1 output word
-        sub	r11, 8
-        mov	rax, QWORD PTR [r11]
-        bswap	rax
-        mov	QWORD PTR [rcx], rax
-        add	rcx, 8
-        sub	r9, 8
-L_3072_from_bin_bswap_8_end:
-        cmp	r9, 7                   ; loop while at least 8 bytes remain
-        jg	L_3072_from_bin_bswap_8_start
-        cmp	r9, r13                 ; no leftover bytes: skip the partial word
-        je	L_3072_from_bin_bswap_hi_end
-        mov	r10, r13                ; r10 = 0, accumulator for the partial top word
-        mov	rax, r13                ; clear rax so the byte loaded into al is zero-extended
-L_3072_from_bin_bswap_hi_start:         ; leftover bytes sit at the start of a (r8)
-        mov	al, BYTE PTR [r8]
-        shl	r10, 8
-        inc	r8
-        add	r10, rax
-        dec	r9
-        jg	L_3072_from_bin_bswap_hi_start
-        mov	QWORD PTR [rcx], r10
-        add	rcx, 8
-L_3072_from_bin_bswap_hi_end:
-        cmp	rcx, r12                ; output already full: nothing to zero
-        jge	L_3072_from_bin_bswap_zero_end
-L_3072_from_bin_bswap_zero_start:       ; zero the remaining output words
-        mov	QWORD PTR [rcx], r13
-        add	rcx, 8
-        cmp	rcx, r12
-        jl	L_3072_from_bin_bswap_zero_start
-L_3072_from_bin_bswap_zero_end:
-        pop	r13
-        pop	r12
-        ret
-sp_3072_from_bin_bswap ENDP
-_text ENDS
-IFNDEF NO_MOVBE_SUPPORT
-; /* Read a big endian unsigned byte array into r as 64-bit words, lowest word first.
-;  * Uses the movbe instruction (optional on x86-64) to load each 8-byte group byte-reversed.
-;  *
-;  * r  A single precision integer (rcx). The tail is zero-filled up to r + 384 bytes.
-;  * size  Maximum number of bytes to convert (rdx). Not read by this routine.
-;  * a  Byte array (r8).
-;  * n  Number of bytes in array to read (r9).
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_from_bin_movbe PROC
-        push	r12
-        mov	r11, r8
-        mov	r12, rcx
-        add	r11, r9                 ; r11 = a + n, one past the last input byte
-        add	r12, 384                ; r12 = r + 384, end of the output words
-        jmp	L_3072_from_bin_movbe_64_end
-L_3072_from_bin_movbe_64_start:         ; convert 64 input bytes into 8 output words
-        sub	r11, 64
-        movbe	rax, QWORD PTR [r11+56] ; last 8 bytes of the chunk become the lowest word
-        movbe	r10, QWORD PTR [r11+48]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r10
-        movbe	rax, QWORD PTR [r11+40]
-        movbe	r10, QWORD PTR [r11+32]
-        mov	QWORD PTR [rcx+16], rax
-        mov	QWORD PTR [rcx+24], r10
-        movbe	rax, QWORD PTR [r11+24]
-        movbe	r10, QWORD PTR [r11+16]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r10
-        movbe	rax, QWORD PTR [r11+8]
-        movbe	r10, QWORD PTR [r11]
-        mov	QWORD PTR [rcx+48], rax
-        mov	QWORD PTR [rcx+56], r10
-        add	rcx, 64
-        sub	r9, 64
-L_3072_from_bin_movbe_64_end:
-        cmp	r9, 63                  ; signed compare: loop while at least 64 bytes remain
-        jg	L_3072_from_bin_movbe_64_start
-        jmp	L_3072_from_bin_movbe_8_end
-L_3072_from_bin_movbe_8_start:          ; convert 8 input bytes into 1 output word
-        sub	r11, 8
-        movbe	rax, QWORD PTR [r11]
-        mov	QWORD PTR [rcx], rax
-        add	rcx, 8
-        sub	r9, 8
-L_3072_from_bin_movbe_8_end:
-        cmp	r9, 7                   ; loop while at least 8 bytes remain
-        jg	L_3072_from_bin_movbe_8_start
-        cmp	r9, 0                   ; no leftover bytes: skip the partial word
-        je	L_3072_from_bin_movbe_hi_end
-        mov	r10, 0                  ; accumulator for the partial top word
-        mov	rax, 0                  ; clear rax so the byte loaded into al is zero-extended
-L_3072_from_bin_movbe_hi_start:         ; leftover bytes sit at the start of a (r8)
-        mov	al, BYTE PTR [r8]
-        shl	r10, 8
-        inc	r8
-        add	r10, rax
-        dec	r9
-        jg	L_3072_from_bin_movbe_hi_start
-        mov	QWORD PTR [rcx], r10
-        add	rcx, 8
-L_3072_from_bin_movbe_hi_end:
-        cmp	rcx, r12                ; output already full: nothing to zero
-        jge	L_3072_from_bin_movbe_zero_end
-L_3072_from_bin_movbe_zero_start:       ; zero the remaining output words
-        mov	QWORD PTR [rcx], 0
-        add	rcx, 8
-        cmp	rcx, r12
-        jl	L_3072_from_bin_movbe_zero_start
-L_3072_from_bin_movbe_zero_end:
-        pop	r12
-        ret
-sp_3072_from_bin_movbe ENDP
-_text ENDS
-ENDIF
-; /* Write r as big endian to byte array, highest word of r first.
-;  * Fixed length number of bytes written: 384 (48 words of 8 bytes, fully unrolled).
-;  * Uses the bswap instruction to reverse the bytes of each word.
-;  *
-;  * r  A single precision integer (rcx), read as 48 64-bit words.
-;  * a  Byte array (rdx), 384 bytes are stored.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_to_bin_bswap_48 PROC
-        mov	rax, QWORD PTR [rcx+376] ; highest word of r goes to the first bytes of a
-        mov	r8, QWORD PTR [rcx+368]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx], rax
-        mov	QWORD PTR [rdx+8], r8
-        mov	rax, QWORD PTR [rcx+360] ; same pattern repeats: two words per step, r descending, a ascending
-        mov	r8, QWORD PTR [rcx+352]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+16], rax
-        mov	QWORD PTR [rdx+24], r8
-        mov	rax, QWORD PTR [rcx+344]
-        mov	r8, QWORD PTR [rcx+336]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+32], rax
-        mov	QWORD PTR [rdx+40], r8
-        mov	rax, QWORD PTR [rcx+328]
-        mov	r8, QWORD PTR [rcx+320]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+48], rax
-        mov	QWORD PTR [rdx+56], r8
-        mov	rax, QWORD PTR [rcx+312]
-        mov	r8, QWORD PTR [rcx+304]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+64], rax
-        mov	QWORD PTR [rdx+72], r8
-        mov	rax, QWORD PTR [rcx+296]
-        mov	r8, QWORD PTR [rcx+288]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+80], rax
-        mov	QWORD PTR [rdx+88], r8
-        mov	rax, QWORD PTR [rcx+280]
-        mov	r8, QWORD PTR [rcx+272]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+96], rax
-        mov	QWORD PTR [rdx+104], r8
-        mov	rax, QWORD PTR [rcx+264]
-        mov	r8, QWORD PTR [rcx+256]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+112], rax
-        mov	QWORD PTR [rdx+120], r8
-        mov	rax, QWORD PTR [rcx+248]
-        mov	r8, QWORD PTR [rcx+240]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+128], rax
-        mov	QWORD PTR [rdx+136], r8
-        mov	rax, QWORD PTR [rcx+232]
-        mov	r8, QWORD PTR [rcx+224]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+144], rax
-        mov	QWORD PTR [rdx+152], r8
-        mov	rax, QWORD PTR [rcx+216]
-        mov	r8, QWORD PTR [rcx+208]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+160], rax
-        mov	QWORD PTR [rdx+168], r8
-        mov	rax, QWORD PTR [rcx+200]
-        mov	r8, QWORD PTR [rcx+192]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+176], rax
-        mov	QWORD PTR [rdx+184], r8
-        mov	rax, QWORD PTR [rcx+184]
-        mov	r8, QWORD PTR [rcx+176]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+192], rax
-        mov	QWORD PTR [rdx+200], r8
-        mov	rax, QWORD PTR [rcx+168]
-        mov	r8, QWORD PTR [rcx+160]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+208], rax
-        mov	QWORD PTR [rdx+216], r8
-        mov	rax, QWORD PTR [rcx+152]
-        mov	r8, QWORD PTR [rcx+144]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+224], rax
-        mov	QWORD PTR [rdx+232], r8
-        mov	rax, QWORD PTR [rcx+136]
-        mov	r8, QWORD PTR [rcx+128]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+240], rax
-        mov	QWORD PTR [rdx+248], r8
-        mov	rax, QWORD PTR [rcx+120]
-        mov	r8, QWORD PTR [rcx+112]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+256], rax
-        mov	QWORD PTR [rdx+264], r8
-        mov	rax, QWORD PTR [rcx+104]
-        mov	r8, QWORD PTR [rcx+96]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+272], rax
-        mov	QWORD PTR [rdx+280], r8
-        mov	rax, QWORD PTR [rcx+88]
-        mov	r8, QWORD PTR [rcx+80]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+288], rax
-        mov	QWORD PTR [rdx+296], r8
-        mov	rax, QWORD PTR [rcx+72]
-        mov	r8, QWORD PTR [rcx+64]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+304], rax
-        mov	QWORD PTR [rdx+312], r8
-        mov	rax, QWORD PTR [rcx+56]
-        mov	r8, QWORD PTR [rcx+48]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+320], rax
-        mov	QWORD PTR [rdx+328], r8
-        mov	rax, QWORD PTR [rcx+40]
-        mov	r8, QWORD PTR [rcx+32]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+336], rax
-        mov	QWORD PTR [rdx+344], r8
-        mov	rax, QWORD PTR [rcx+24]
-        mov	r8, QWORD PTR [rcx+16]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+352], rax
-        mov	QWORD PTR [rdx+360], r8
-        mov	rax, QWORD PTR [rcx+8]
-        mov	r8, QWORD PTR [rcx]     ; lowest word of r goes to the last 8 bytes of a
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+368], rax
-        mov	QWORD PTR [rdx+376], r8
-        ret
-sp_3072_to_bin_bswap_48 ENDP
-_text ENDS
-IFNDEF NO_MOVBE_SUPPORT
-; /* Write r as big endian to byte array, highest word of r first.
-;  * Fixed length number of bytes written: 384 (48 words of 8 bytes, fully unrolled).
-;  * Uses the movbe instruction, which is optional, to load each word byte-reversed.
-;  *
-;  * r  A single precision integer (rcx), read as 48 64-bit words.
-;  * a  Byte array (rdx), 384 bytes are stored.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_to_bin_movbe_48 PROC
-        movbe	rax, QWORD PTR [rcx+376] ; highest word of r goes to the first bytes of a
-        movbe	r8, QWORD PTR [rcx+368]
-        mov	QWORD PTR [rdx], rax
-        mov	QWORD PTR [rdx+8], r8
-        movbe	rax, QWORD PTR [rcx+360] ; same pattern repeats: two words per step, r descending, a ascending
-        movbe	r8, QWORD PTR [rcx+352]
-        mov	QWORD PTR [rdx+16], rax
-        mov	QWORD PTR [rdx+24], r8
-        movbe	rax, QWORD PTR [rcx+344]
-        movbe	r8, QWORD PTR [rcx+336]
-        mov	QWORD PTR [rdx+32], rax
-        mov	QWORD PTR [rdx+40], r8
-        movbe	rax, QWORD PTR [rcx+328]
-        movbe	r8, QWORD PTR [rcx+320]
-        mov	QWORD PTR [rdx+48], rax
-        mov	QWORD PTR [rdx+56], r8
-        movbe	rax, QWORD PTR [rcx+312]
-        movbe	r8, QWORD PTR [rcx+304]
-        mov	QWORD PTR [rdx+64], rax
-        mov	QWORD PTR [rdx+72], r8
-        movbe	rax, QWORD PTR [rcx+296]
-        movbe	r8, QWORD PTR [rcx+288]
-        mov	QWORD PTR [rdx+80], rax
-        mov	QWORD PTR [rdx+88], r8
-        movbe	rax, QWORD PTR [rcx+280]
-        movbe	r8, QWORD PTR [rcx+272]
-        mov	QWORD PTR [rdx+96], rax
-        mov	QWORD PTR [rdx+104], r8
-        movbe	rax, QWORD PTR [rcx+264]
-        movbe	r8, QWORD PTR [rcx+256]
-        mov	QWORD PTR [rdx+112], rax
-        mov	QWORD PTR [rdx+120], r8
-        movbe	rax, QWORD PTR [rcx+248]
-        movbe	r8, QWORD PTR [rcx+240]
-        mov	QWORD PTR [rdx+128], rax
-        mov	QWORD PTR [rdx+136], r8
-        movbe	rax, QWORD PTR [rcx+232]
-        movbe	r8, QWORD PTR [rcx+224]
-        mov	QWORD PTR [rdx+144], rax
-        mov	QWORD PTR [rdx+152], r8
-        movbe	rax, QWORD PTR [rcx+216]
-        movbe	r8, QWORD PTR [rcx+208]
-        mov	QWORD PTR [rdx+160], rax
-        mov	QWORD PTR [rdx+168], r8
-        movbe	rax, QWORD PTR [rcx+200]
-        movbe	r8, QWORD PTR [rcx+192]
-        mov	QWORD PTR [rdx+176], rax
-        mov	QWORD PTR [rdx+184], r8
-        movbe	rax, QWORD PTR [rcx+184]
-        movbe	r8, QWORD PTR [rcx+176]
-        mov	QWORD PTR [rdx+192], rax
-        mov	QWORD PTR [rdx+200], r8
-        movbe	rax, QWORD PTR [rcx+168]
-        movbe	r8, QWORD PTR [rcx+160]
-        mov	QWORD PTR [rdx+208], rax
-        mov	QWORD PTR [rdx+216], r8
-        movbe	rax, QWORD PTR [rcx+152]
-        movbe	r8, QWORD PTR [rcx+144]
-        mov	QWORD PTR [rdx+224], rax
-        mov	QWORD PTR [rdx+232], r8
-        movbe	rax, QWORD PTR [rcx+136]
-        movbe	r8, QWORD PTR [rcx+128]
-        mov	QWORD PTR [rdx+240], rax
-        mov	QWORD PTR [rdx+248], r8
-        movbe	rax, QWORD PTR [rcx+120]
-        movbe	r8, QWORD PTR [rcx+112]
-        mov	QWORD PTR [rdx+256], rax
-        mov	QWORD PTR [rdx+264], r8
-        movbe	rax, QWORD PTR [rcx+104]
-        movbe	r8, QWORD PTR [rcx+96]
-        mov	QWORD PTR [rdx+272], rax
-        mov	QWORD PTR [rdx+280], r8
-        movbe	rax, QWORD PTR [rcx+88]
-        movbe	r8, QWORD PTR [rcx+80]
-        mov	QWORD PTR [rdx+288], rax
-        mov	QWORD PTR [rdx+296], r8
-        movbe	rax, QWORD PTR [rcx+72]
-        movbe	r8, QWORD PTR [rcx+64]
-        mov	QWORD PTR [rdx+304], rax
-        mov	QWORD PTR [rdx+312], r8
-        movbe	rax, QWORD PTR [rcx+56]
-        movbe	r8, QWORD PTR [rcx+48]
-        mov	QWORD PTR [rdx+320], rax
-        mov	QWORD PTR [rdx+328], r8
-        movbe	rax, QWORD PTR [rcx+40]
-        movbe	r8, QWORD PTR [rcx+32]
-        mov	QWORD PTR [rdx+336], rax
-        mov	QWORD PTR [rdx+344], r8
-        movbe	rax, QWORD PTR [rcx+24]
-        movbe	r8, QWORD PTR [rcx+16]
-        mov	QWORD PTR [rdx+352], rax
-        mov	QWORD PTR [rdx+360], r8
-        movbe	rax, QWORD PTR [rcx+8]
-        movbe	r8, QWORD PTR [rcx]     ; lowest word of r goes to the last 8 bytes of a
-        mov	QWORD PTR [rdx+368], rax
-        mov	QWORD PTR [rdx+376], r8
-        ret
-sp_3072_to_bin_movbe_48 ENDP
-_text ENDS
-ENDIF
-; /* Multiply a and b into r. (r = a * b)
-;  *
-;  * r  A single precision integer.
-;  * a  A single precision integer.
-;  * b  A single precision integer.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_mul_12 PROC
-        push	r12
-        mov	r9, rdx
-        sub	rsp, 96
-        ; A[0] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9]
-        xor	r12, r12
-        mov	QWORD PTR [rsp], rax
-        mov	r11, rdx
-        ; A[0] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[1] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+8]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rsp+8], r11
-        ; A[0] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[1] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+8]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[2] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+16]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rsp+16], r12
-        ; A[0] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[1] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+8]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[2] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+16]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[3] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+24]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rsp+24], r10
-        ; A[0] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[1] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+8]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[2] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+16]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[3] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+24]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[4] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+32]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rsp+32], r11
-        ; A[0] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[1] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+8]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[2] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+16]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[3] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+24]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[4] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+32]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[5] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+40]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rsp+40], r12
-        ; A[0] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[1] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+8]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[2] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+16]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[3] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+24]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[4] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+32]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[5] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+40]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[6] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+48]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rsp+48], r10
-        ; A[0] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[1] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+8]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[2] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+16]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[3] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+24]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[4] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+32]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[5] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+40]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[6] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+48]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[7] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+56]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rsp+56], r11
-        ; A[0] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[1] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+8]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[2] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+16]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[3] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+24]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[4] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+32]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[5] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+40]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[6] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+48]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[7] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+56]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[8] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+64]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rsp+64], r12
-        ; A[0] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[1] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+8]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[2] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+16]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[3] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+24]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[4] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+32]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[5] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+40]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[6] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+48]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[7] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+56]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[8] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+64]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[9] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+72]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rsp+72], r10
-        ; A[0] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[1] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+8]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[2] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+16]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[3] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+24]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[4] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+32]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[5] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+40]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[6] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+48]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[7] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+56]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[8] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+64]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[9] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+72]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[10] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+80]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rsp+80], r11
-        ; A[0] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[1] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+8]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[2] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+16]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[3] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+24]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[4] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+32]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[5] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+40]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[6] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+48]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[7] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+56]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[8] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+64]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[9] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+72]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[10] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+80]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[11] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+88]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rsp+88], r12
-        ; A[1] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+8]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[2] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+16]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[3] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+24]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[4] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+32]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[5] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+40]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[6] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+48]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[7] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+56]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[8] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+64]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[9] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+72]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[10] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+80]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[11] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+88]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rcx+96], r10
-        ; A[2] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+16]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[3] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+24]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[4] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+32]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[5] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+40]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[6] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+48]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[7] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+56]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[8] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+64]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[9] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+72]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[10] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+80]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[11] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+88]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rcx+104], r11
-        ; A[3] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+24]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[4] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+32]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[5] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+40]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[6] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+48]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[7] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+56]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[8] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+64]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[9] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+72]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[10] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+80]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[11] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+88]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rcx+112], r12
-        ; A[4] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+32]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[5] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+40]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[6] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+48]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[7] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+56]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[8] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+64]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[9] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+72]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[10] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+80]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[11] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+88]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rcx+120], r10
-        ; A[5] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+40]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[6] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+48]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[7] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+56]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[8] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+64]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[9] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+72]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[10] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+80]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[11] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+88]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rcx+128], r11
-        ; A[6] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+48]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[7] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+56]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[8] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+64]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[9] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+72]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[10] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+80]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[11] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+88]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rcx+136], r12
-        ; A[7] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+56]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[8] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+64]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[9] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+72]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[10] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+80]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[11] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+88]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rcx+144], r10
-        ; A[8] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+64]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[9] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+72]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[10] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+80]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[11] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+88]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rcx+152], r11
-        ; A[9] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+72]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[10] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+80]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[11] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+88]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rcx+160], r12
-        ; A[10] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+80]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[11] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+88]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rcx+168], r10
-        ; A[11] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+88]
-        add	r11, rax
-        adc	r12, rdx
-        mov	QWORD PTR [rcx+176], r11
-        mov	QWORD PTR [rcx+184], r12
-        mov	rax, QWORD PTR [rsp]
-        mov	rdx, QWORD PTR [rsp+8]
-        mov	r10, QWORD PTR [rsp+16]
-        mov	r11, QWORD PTR [rsp+24]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], rdx
-        mov	QWORD PTR [rcx+16], r10
-        mov	QWORD PTR [rcx+24], r11
-        mov	rax, QWORD PTR [rsp+32]
-        mov	rdx, QWORD PTR [rsp+40]
-        mov	r10, QWORD PTR [rsp+48]
-        mov	r11, QWORD PTR [rsp+56]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], rdx
-        mov	QWORD PTR [rcx+48], r10
-        mov	QWORD PTR [rcx+56], r11
-        mov	rax, QWORD PTR [rsp+64]
-        mov	rdx, QWORD PTR [rsp+72]
-        mov	r10, QWORD PTR [rsp+80]
-        mov	r11, QWORD PTR [rsp+88]
-        mov	QWORD PTR [rcx+64], rax
-        mov	QWORD PTR [rcx+72], rdx
-        mov	QWORD PTR [rcx+80], r10
-        mov	QWORD PTR [rcx+88], r11
-        add	rsp, 96
-        pop	r12
-        ret
-sp_3072_mul_12 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
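-; /* Multiply a and b into r using MULX/ADCX/ADOX. (r = a * b)
-;  *
-;  * The low 96 bytes of the result are built in a stack buffer instead of
-;  * directly in r when r is the same pointer as a or b.
-;  *
-;  * r   Result of multiplication (passed in rcx).
-;  * a   First number to multiply (passed in rdx).
-;  * b   Second number to multiply (passed in r8).
-;  */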
-_text SEGMENT READONLY PARA
-sp_3072_mul_avx2_12 PROC
-        push	rbx
-        push	rbp
-        push	r12
-        push	r13
-        push	r14
-        mov	rbp, r8
-        mov	r8, rcx
-        mov	r9, rdx
-        sub	rsp, 96
-        cmp	r9, r8
-        mov	rbx, rsp
-        cmovne	rbx, r8
-        cmp	rbp, r8
-        cmove	rbx, rsp
-        add	r8, 96
-        xor	r14, r14
-        mov	rdx, QWORD PTR [r9]
-        ; A[0] * B[0]
-        mulx	r11, r10, QWORD PTR [rbp]
-        ; A[0] * B[1]
-        mulx	r12, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx], r10
-        adcx	r11, rax
-        mov	QWORD PTR [rbx+8], r11
-        ; A[0] * B[2]
-        mulx	r10, rax, QWORD PTR [rbp+16]
-        adcx	r12, rax
-        ; A[0] * B[3]
-        mulx	r11, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+16], r12
-        adcx	r10, rax
-        mov	QWORD PTR [rbx+24], r10
-        ; A[0] * B[4]
-        mulx	r12, rax, QWORD PTR [rbp+32]
-        adcx	r11, rax
-        ; A[0] * B[5]
-        mulx	r10, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+32], r11
-        adcx	r12, rax
-        mov	QWORD PTR [rbx+40], r12
-        ; A[0] * B[6]
-        mulx	r11, rax, QWORD PTR [rbp+48]
-        adcx	r10, rax
-        ; A[0] * B[7]
-        mulx	r12, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+48], r10
-        adcx	r11, rax
-        mov	QWORD PTR [rbx+56], r11
-        ; A[0] * B[8]
-        mulx	r10, rax, QWORD PTR [rbp+64]
-        adcx	r12, rax
-        ; A[0] * B[9]
-        mulx	r11, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [rbx+64], r12
-        adcx	r10, rax
-        mov	QWORD PTR [rbx+72], r10
-        ; A[0] * B[10]
-        mulx	r12, rax, QWORD PTR [rbp+80]
-        adcx	r11, rax
-        ; A[0] * B[11]
-        mulx	r10, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [rbx+80], r11
-        adcx	r12, rax
-        adcx	r10, r14
-        mov	r13, r14
-        adcx	r13, r14
-        mov	QWORD PTR [rbx+88], r12
-        mov	QWORD PTR [r8], r10
-        mov	rdx, QWORD PTR [r9+8]
-        mov	r11, QWORD PTR [rbx+8]
-        mov	r12, QWORD PTR [rbx+16]
-        mov	r10, QWORD PTR [rbx+24]
-        ; A[1] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[1] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+8], r11
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+16], r12
-        mov	r11, QWORD PTR [rbx+32]
-        mov	r12, QWORD PTR [rbx+40]
-        ; A[1] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[1] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+24], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbx+32], r11
-        mov	r10, QWORD PTR [rbx+48]
-        mov	r11, QWORD PTR [rbx+56]
-        ; A[1] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[1] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+40], r12
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbx+48], r10
-        mov	r12, QWORD PTR [rbx+64]
-        mov	r10, QWORD PTR [rbx+72]
-        ; A[1] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[1] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+56], r11
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+64], r12
-        mov	r11, QWORD PTR [rbx+80]
-        mov	r12, QWORD PTR [rbx+88]
-        ; A[1] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[1] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [rbx+72], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbx+80], r11
-        mov	r10, QWORD PTR [r8]
-        ; A[1] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[1] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [rbx+88], r12
-        mov	r11, r14
-        adcx	r10, rax
-        adox	r11, rcx
-        adcx	r11, r13
-        mov	r13, r14
-        adox	r13, r14
-        adcx	r13, r14
-        mov	QWORD PTR [r8], r10
-        mov	QWORD PTR [r8+8], r11
-        mov	rdx, QWORD PTR [r9+16]
-        mov	r12, QWORD PTR [rbx+16]
-        mov	r10, QWORD PTR [rbx+24]
-        mov	r11, QWORD PTR [rbx+32]
-        ; A[2] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[2] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+16], r12
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbx+24], r10
-        mov	r12, QWORD PTR [rbx+40]
-        mov	r10, QWORD PTR [rbx+48]
-        ; A[2] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[2] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+32], r11
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+40], r12
-        mov	r11, QWORD PTR [rbx+56]
-        mov	r12, QWORD PTR [rbx+64]
-        ; A[2] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[2] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+48], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbx+56], r11
-        mov	r10, QWORD PTR [rbx+72]
-        mov	r11, QWORD PTR [rbx+80]
-        ; A[2] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[2] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+64], r12
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbx+72], r10
-        mov	r12, QWORD PTR [rbx+88]
-        mov	r10, QWORD PTR [r8]
-        ; A[2] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[2] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [rbx+80], r11
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+88], r12
-        mov	r11, QWORD PTR [r8+8]
-        ; A[2] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[2] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8], r10
-        mov	r12, r14
-        adcx	r11, rax
-        adox	r12, rcx
-        adcx	r12, r13
-        mov	r13, r14
-        adox	r13, r14
-        adcx	r13, r14
-        mov	QWORD PTR [r8+8], r11
-        mov	QWORD PTR [r8+16], r12
-        mov	rdx, QWORD PTR [r9+24]
-        mov	r10, QWORD PTR [rbx+24]
-        mov	r11, QWORD PTR [rbx+32]
-        mov	r12, QWORD PTR [rbx+40]
-        ; A[3] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[3] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+24], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbx+32], r11
-        mov	r10, QWORD PTR [rbx+48]
-        mov	r11, QWORD PTR [rbx+56]
-        ; A[3] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[3] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+40], r12
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbx+48], r10
-        mov	r12, QWORD PTR [rbx+64]
-        mov	r10, QWORD PTR [rbx+72]
-        ; A[3] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[3] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+56], r11
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+64], r12
-        mov	r11, QWORD PTR [rbx+80]
-        mov	r12, QWORD PTR [rbx+88]
-        ; A[3] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[3] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+72], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbx+80], r11
-        mov	r10, QWORD PTR [r8]
-        mov	r11, QWORD PTR [r8+8]
-        ; A[3] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[3] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [rbx+88], r12
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8], r10
-        mov	r12, QWORD PTR [r8+16]
-        ; A[3] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[3] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8+8], r11
-        mov	r10, r14
-        adcx	r12, rax
-        adox	r10, rcx
-        adcx	r10, r13
-        mov	r13, r14
-        adox	r13, r14
-        adcx	r13, r14
-        mov	QWORD PTR [r8+16], r12
-        mov	QWORD PTR [r8+24], r10
-        mov	rdx, QWORD PTR [r9+32]
-        mov	r11, QWORD PTR [rbx+32]
-        mov	r12, QWORD PTR [rbx+40]
-        mov	r10, QWORD PTR [rbx+48]
-        ; A[4] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[4] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+32], r11
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+40], r12
-        mov	r11, QWORD PTR [rbx+56]
-        mov	r12, QWORD PTR [rbx+64]
-        ; A[4] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[4] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+48], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbx+56], r11
-        mov	r10, QWORD PTR [rbx+72]
-        mov	r11, QWORD PTR [rbx+80]
-        ; A[4] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[4] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+64], r12
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbx+72], r10
-        mov	r12, QWORD PTR [rbx+88]
-        mov	r10, QWORD PTR [r8]
-        ; A[4] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[4] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+80], r11
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+88], r12
-        mov	r11, QWORD PTR [r8+8]
-        mov	r12, QWORD PTR [r8+16]
-        ; A[4] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[4] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [r8], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r8+8], r11
-        mov	r10, QWORD PTR [r8+24]
-        ; A[4] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[4] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8+16], r12
-        mov	r11, r14
-        adcx	r10, rax
-        adox	r11, rcx
-        adcx	r11, r13
-        mov	r13, r14
-        adox	r13, r14
-        adcx	r13, r14
-        mov	QWORD PTR [r8+24], r10
-        mov	QWORD PTR [r8+32], r11
-        mov	rdx, QWORD PTR [r9+40]
-        mov	r12, QWORD PTR [rbx+40]
-        mov	r10, QWORD PTR [rbx+48]
-        mov	r11, QWORD PTR [rbx+56]
-        ; A[5] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[5] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+40], r12
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbx+48], r10
-        mov	r12, QWORD PTR [rbx+64]
-        mov	r10, QWORD PTR [rbx+72]
-        ; A[5] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[5] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+56], r11
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+64], r12
-        mov	r11, QWORD PTR [rbx+80]
-        mov	r12, QWORD PTR [rbx+88]
-        ; A[5] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[5] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+72], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbx+80], r11
-        mov	r10, QWORD PTR [r8]
-        mov	r11, QWORD PTR [r8+8]
-        ; A[5] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[5] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+88], r12
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8], r10
-        mov	r12, QWORD PTR [r8+16]
-        mov	r10, QWORD PTR [r8+24]
-        ; A[5] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[5] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [r8+8], r11
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+16], r12
-        mov	r11, QWORD PTR [r8+32]
-        ; A[5] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[5] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8+24], r10
-        mov	r12, r14
-        adcx	r11, rax
-        adox	r12, rcx
-        adcx	r12, r13
-        mov	r13, r14
-        adox	r13, r14
-        adcx	r13, r14
-        mov	QWORD PTR [r8+32], r11
-        mov	QWORD PTR [r8+40], r12
-        mov	rdx, QWORD PTR [r9+48]
-        mov	r10, QWORD PTR [rbx+48]
-        mov	r11, QWORD PTR [rbx+56]
-        mov	r12, QWORD PTR [rbx+64]
-        ; A[6] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[6] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+48], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbx+56], r11
-        mov	r10, QWORD PTR [rbx+72]
-        mov	r11, QWORD PTR [rbx+80]
-        ; A[6] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[6] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+64], r12
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbx+72], r10
-        mov	r12, QWORD PTR [rbx+88]
-        mov	r10, QWORD PTR [r8]
-        ; A[6] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[6] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+80], r11
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+88], r12
-        mov	r11, QWORD PTR [r8+8]
-        mov	r12, QWORD PTR [r8+16]
-        ; A[6] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[6] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [r8], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r8+8], r11
-        mov	r10, QWORD PTR [r8+24]
-        mov	r11, QWORD PTR [r8+32]
-        ; A[6] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[6] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [r8+16], r12
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+24], r10
-        mov	r12, QWORD PTR [r8+40]
-        ; A[6] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[6] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8+32], r11
-        mov	r10, r14
-        adcx	r12, rax
-        adox	r10, rcx
-        adcx	r10, r13
-        mov	r13, r14
-        adox	r13, r14
-        adcx	r13, r14
-        mov	QWORD PTR [r8+40], r12
-        mov	QWORD PTR [r8+48], r10
-        mov	rdx, QWORD PTR [r9+56]
-        mov	r11, QWORD PTR [rbx+56]
-        mov	r12, QWORD PTR [rbx+64]
-        mov	r10, QWORD PTR [rbx+72]
-        ; A[7] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[7] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+56], r11
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+64], r12
-        mov	r11, QWORD PTR [rbx+80]
-        mov	r12, QWORD PTR [rbx+88]
-        ; A[7] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[7] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+72], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbx+80], r11
-        mov	r10, QWORD PTR [r8]
-        mov	r11, QWORD PTR [r8+8]
-        ; A[7] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[7] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+88], r12
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8], r10
-        mov	r12, QWORD PTR [r8+16]
-        mov	r10, QWORD PTR [r8+24]
-        ; A[7] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[7] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [r8+8], r11
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+16], r12
-        mov	r11, QWORD PTR [r8+32]
-        mov	r12, QWORD PTR [r8+40]
-        ; A[7] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[7] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [r8+24], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r8+32], r11
-        mov	r10, QWORD PTR [r8+48]
-        ; A[7] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[7] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8+40], r12
-        mov	r11, r14
-        adcx	r10, rax
-        adox	r11, rcx
-        adcx	r11, r13
-        mov	r13, r14
-        adox	r13, r14
-        adcx	r13, r14
-        mov	QWORD PTR [r8+48], r10
-        mov	QWORD PTR [r8+56], r11
-        mov	rdx, QWORD PTR [r9+64]
-        mov	r12, QWORD PTR [rbx+64]
-        mov	r10, QWORD PTR [rbx+72]
-        mov	r11, QWORD PTR [rbx+80]
-        ; A[8] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[8] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+64], r12
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbx+72], r10
-        mov	r12, QWORD PTR [rbx+88]
-        mov	r10, QWORD PTR [r8]
-        ; A[8] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[8] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+80], r11
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+88], r12
-        mov	r11, QWORD PTR [r8+8]
-        mov	r12, QWORD PTR [r8+16]
-        ; A[8] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[8] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [r8], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r8+8], r11
-        mov	r10, QWORD PTR [r8+24]
-        mov	r11, QWORD PTR [r8+32]
-        ; A[8] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[8] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [r8+16], r12
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+24], r10
-        mov	r12, QWORD PTR [r8+40]
-        mov	r10, QWORD PTR [r8+48]
-        ; A[8] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[8] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [r8+32], r11
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+40], r12
-        mov	r11, QWORD PTR [r8+56]
-        ; A[8] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[8] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8+48], r10
-        mov	r12, r14
-        adcx	r11, rax
-        adox	r12, rcx
-        adcx	r12, r13
-        mov	r13, r14
-        adox	r13, r14
-        adcx	r13, r14
-        mov	QWORD PTR [r8+56], r11
-        mov	QWORD PTR [r8+64], r12
-        mov	rdx, QWORD PTR [r9+72]
-        mov	r10, QWORD PTR [rbx+72]
-        mov	r11, QWORD PTR [rbx+80]
-        mov	r12, QWORD PTR [rbx+88]
-        ; A[9] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[9] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+72], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbx+80], r11
-        mov	r10, QWORD PTR [r8]
-        mov	r11, QWORD PTR [r8+8]
-        ; A[9] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[9] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+88], r12
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8], r10
-        mov	r12, QWORD PTR [r8+16]
-        mov	r10, QWORD PTR [r8+24]
-        ; A[9] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[9] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [r8+8], r11
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+16], r12
-        mov	r11, QWORD PTR [r8+32]
-        mov	r12, QWORD PTR [r8+40]
-        ; A[9] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[9] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [r8+24], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r8+32], r11
-        mov	r10, QWORD PTR [r8+48]
-        mov	r11, QWORD PTR [r8+56]
-        ; A[9] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[9] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [r8+40], r12
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+48], r10
-        mov	r12, QWORD PTR [r8+64]
-        ; A[9] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[9] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8+56], r11
-        mov	r10, r14
-        adcx	r12, rax
-        adox	r10, rcx
-        adcx	r10, r13
-        mov	r13, r14
-        adox	r13, r14
-        adcx	r13, r14
-        mov	QWORD PTR [r8+64], r12
-        mov	QWORD PTR [r8+72], r10
-        mov	rdx, QWORD PTR [r9+80]
-        mov	r11, QWORD PTR [rbx+80]
-        mov	r12, QWORD PTR [rbx+88]
-        mov	r10, QWORD PTR [r8]
-        ; A[10] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[10] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+80], r11
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+88], r12
-        mov	r11, QWORD PTR [r8+8]
-        mov	r12, QWORD PTR [r8+16]
-        ; A[10] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[10] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [r8], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r8+8], r11
-        mov	r10, QWORD PTR [r8+24]
-        mov	r11, QWORD PTR [r8+32]
-        ; A[10] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[10] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [r8+16], r12
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+24], r10
-        mov	r12, QWORD PTR [r8+40]
-        mov	r10, QWORD PTR [r8+48]
-        ; A[10] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[10] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [r8+32], r11
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+40], r12
-        mov	r11, QWORD PTR [r8+56]
-        mov	r12, QWORD PTR [r8+64]
-        ; A[10] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[10] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [r8+48], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r8+56], r11
-        mov	r10, QWORD PTR [r8+72]
-        ; A[10] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[10] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8+64], r12
-        mov	r11, r14
-        adcx	r10, rax
-        adox	r11, rcx
-        adcx	r11, r13
-        mov	r13, r14
-        adox	r13, r14
-        adcx	r13, r14
-        mov	QWORD PTR [r8+72], r10
-        mov	QWORD PTR [r8+80], r11
-        mov	rdx, QWORD PTR [r9+88]
-        mov	r12, QWORD PTR [rbx+88]
-        mov	r10, QWORD PTR [r8]
-        mov	r11, QWORD PTR [r8+8]
-        ; A[11] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[11] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+88], r12
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8], r10
-        mov	r12, QWORD PTR [r8+16]
-        mov	r10, QWORD PTR [r8+24]
-        ; A[11] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[11] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [r8+8], r11
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+16], r12
-        mov	r11, QWORD PTR [r8+32]
-        mov	r12, QWORD PTR [r8+40]
-        ; A[11] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[11] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [r8+24], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r8+32], r11
-        mov	r10, QWORD PTR [r8+48]
-        mov	r11, QWORD PTR [r8+56]
-        ; A[11] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[11] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [r8+40], r12
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+48], r10
-        mov	r12, QWORD PTR [r8+64]
-        mov	r10, QWORD PTR [r8+72]
-        ; A[11] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[11] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [r8+56], r11
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+64], r12
-        mov	r11, QWORD PTR [r8+80]
-        ; A[11] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[11] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8+72], r10
-        mov	r12, r14
-        adcx	r11, rax
-        adox	r12, rcx
-        adcx	r12, r13
-        mov	QWORD PTR [r8+80], r11
-        mov	QWORD PTR [r8+88], r12
-        sub	r8, 96
-        cmp	r9, r8
-        je	L_start_3072_mul_avx2_12
-        cmp	rbp, r8
-        jne	L_end_3072_mul_avx2_12
-L_start_3072_mul_avx2_12:
-        vmovdqu	xmm0, OWORD PTR [rbx]
-        vmovups	OWORD PTR [r8], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbx+16]
-        vmovups	OWORD PTR [r8+16], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbx+32]
-        vmovups	OWORD PTR [r8+32], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbx+48]
-        vmovups	OWORD PTR [r8+48], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbx+64]
-        vmovups	OWORD PTR [r8+64], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbx+80]
-        vmovups	OWORD PTR [r8+80], xmm0
-L_end_3072_mul_avx2_12:
-        add	rsp, 96
-        pop	r14
-        pop	r13
-        pop	r12
-        pop	rbp
-        pop	rbx
-        ret
-sp_3072_mul_avx2_12 ENDP
-_text ENDS
-ENDIF
-; /* Add b to a into r. (r = a + b)
-;  *
-;  * r  A single precision integer.
-;  * a  A single precision integer.
-;  * b  A single precision integer.
-;  *
-;  * Register use as seen in the body: the result is stored through rcx (r),
-;  * a is read through rdx and b through r8. Twelve 64-bit words (byte
-;  * offsets 0..88) are processed, starting at offset 0 and propagating the
-;  * carry upwards.
-;  * At ret, rax holds the carry out of the top word (0 or 1).
-;  * Scratch: r9, r10 and the flags. No stack is used.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_add_12 PROC
-        ; Add
-        ; r9 and r10 alternate as the working word. The next word of a is
-        ; loaded before the previous result word is stored; the mov
-        ; instructions in between leave the carry flag untouched.
-        mov	r9, QWORD PTR [rdx]
-        ; Clear the return value before the first add: xor changes the flags,
-        ; so it cannot be placed inside the add/adc chain.
-        xor	rax, rax
-        add	r9, QWORD PTR [r8]
-        mov	r10, QWORD PTR [rdx+8]
-        mov	QWORD PTR [rcx], r9
-        adc	r10, QWORD PTR [r8+8]
-        mov	r9, QWORD PTR [rdx+16]
-        mov	QWORD PTR [rcx+8], r10
-        adc	r9, QWORD PTR [r8+16]
-        mov	r10, QWORD PTR [rdx+24]
-        mov	QWORD PTR [rcx+16], r9
-        adc	r10, QWORD PTR [r8+24]
-        mov	r9, QWORD PTR [rdx+32]
-        mov	QWORD PTR [rcx+24], r10
-        adc	r9, QWORD PTR [r8+32]
-        mov	r10, QWORD PTR [rdx+40]
-        mov	QWORD PTR [rcx+32], r9
-        adc	r10, QWORD PTR [r8+40]
-        mov	r9, QWORD PTR [rdx+48]
-        mov	QWORD PTR [rcx+40], r10
-        adc	r9, QWORD PTR [r8+48]
-        mov	r10, QWORD PTR [rdx+56]
-        mov	QWORD PTR [rcx+48], r9
-        adc	r10, QWORD PTR [r8+56]
-        mov	r9, QWORD PTR [rdx+64]
-        mov	QWORD PTR [rcx+56], r10
-        adc	r9, QWORD PTR [r8+64]
-        mov	r10, QWORD PTR [rdx+72]
-        mov	QWORD PTR [rcx+64], r9
-        adc	r10, QWORD PTR [r8+72]
-        mov	r9, QWORD PTR [rdx+80]
-        mov	QWORD PTR [rcx+72], r10
-        adc	r9, QWORD PTR [r8+80]
-        mov	r10, QWORD PTR [rdx+88]
-        mov	QWORD PTR [rcx+80], r9
-        adc	r10, QWORD PTR [r8+88]
-        mov	QWORD PTR [rcx+88], r10
-        ; rax = 0 + carry out of the last adc.
-        adc	rax, 0
-        ret
-sp_3072_add_12 ENDP
-_text ENDS
-; /* Sub b from a into a. (a -= b)
-;  *
-;  * a  A single precision integer and result.
-;  * b  A single precision integer.
-;  *
-;  * Register use as seen in the body: a is read and written through rcx,
-;  * b is read through rdx. Twenty-four 64-bit words (byte offsets 0..184)
-;  * are processed, starting at offset 0 and propagating the borrow upwards.
-;  * At ret, rax is 0 when there was no borrow out of the top word and
-;  * all ones (-1) when there was.
-;  * Scratch: r8, r9 and the flags. No stack is used.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_sub_in_place_24 PROC
-        ; r8 and r9 alternate as the working word. The next word of a is
-        ; loaded before the previous difference is stored back; the mov
-        ; instructions in between leave the borrow (carry flag) untouched.
-        mov	r8, QWORD PTR [rcx]
-        sub	r8, QWORD PTR [rdx]
-        mov	r9, QWORD PTR [rcx+8]
-        mov	QWORD PTR [rcx], r8
-        sbb	r9, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [rcx+16]
-        mov	QWORD PTR [rcx+8], r9
-        sbb	r8, QWORD PTR [rdx+16]
-        mov	r9, QWORD PTR [rcx+24]
-        mov	QWORD PTR [rcx+16], r8
-        sbb	r9, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [rcx+32]
-        mov	QWORD PTR [rcx+24], r9
-        sbb	r8, QWORD PTR [rdx+32]
-        mov	r9, QWORD PTR [rcx+40]
-        mov	QWORD PTR [rcx+32], r8
-        sbb	r9, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [rcx+48]
-        mov	QWORD PTR [rcx+40], r9
-        sbb	r8, QWORD PTR [rdx+48]
-        mov	r9, QWORD PTR [rcx+56]
-        mov	QWORD PTR [rcx+48], r8
-        sbb	r9, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [rcx+64]
-        mov	QWORD PTR [rcx+56], r9
-        sbb	r8, QWORD PTR [rdx+64]
-        mov	r9, QWORD PTR [rcx+72]
-        mov	QWORD PTR [rcx+64], r8
-        sbb	r9, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [rcx+80]
-        mov	QWORD PTR [rcx+72], r9
-        sbb	r8, QWORD PTR [rdx+80]
-        mov	r9, QWORD PTR [rcx+88]
-        mov	QWORD PTR [rcx+80], r8
-        sbb	r9, QWORD PTR [rdx+88]
-        mov	r8, QWORD PTR [rcx+96]
-        mov	QWORD PTR [rcx+88], r9
-        sbb	r8, QWORD PTR [rdx+96]
-        mov	r9, QWORD PTR [rcx+104]
-        mov	QWORD PTR [rcx+96], r8
-        sbb	r9, QWORD PTR [rdx+104]
-        mov	r8, QWORD PTR [rcx+112]
-        mov	QWORD PTR [rcx+104], r9
-        sbb	r8, QWORD PTR [rdx+112]
-        mov	r9, QWORD PTR [rcx+120]
-        mov	QWORD PTR [rcx+112], r8
-        sbb	r9, QWORD PTR [rdx+120]
-        mov	r8, QWORD PTR [rcx+128]
-        mov	QWORD PTR [rcx+120], r9
-        sbb	r8, QWORD PTR [rdx+128]
-        mov	r9, QWORD PTR [rcx+136]
-        mov	QWORD PTR [rcx+128], r8
-        sbb	r9, QWORD PTR [rdx+136]
-        mov	r8, QWORD PTR [rcx+144]
-        mov	QWORD PTR [rcx+136], r9
-        sbb	r8, QWORD PTR [rdx+144]
-        mov	r9, QWORD PTR [rcx+152]
-        mov	QWORD PTR [rcx+144], r8
-        sbb	r9, QWORD PTR [rdx+152]
-        mov	r8, QWORD PTR [rcx+160]
-        mov	QWORD PTR [rcx+152], r9
-        sbb	r8, QWORD PTR [rdx+160]
-        mov	r9, QWORD PTR [rcx+168]
-        mov	QWORD PTR [rcx+160], r8
-        sbb	r9, QWORD PTR [rdx+168]
-        mov	r8, QWORD PTR [rcx+176]
-        mov	QWORD PTR [rcx+168], r9
-        sbb	r8, QWORD PTR [rdx+176]
-        mov	r9, QWORD PTR [rcx+184]
-        mov	QWORD PTR [rcx+176], r8
-        sbb	r9, QWORD PTR [rdx+184]
-        mov	QWORD PTR [rcx+184], r9
-        ; rax - rax - borrow: 0 without a final borrow, -1 with one. The
-        ; previous value of rax does not matter.
-        sbb	rax, rax
-        ret
-sp_3072_sub_in_place_24 ENDP
-_text ENDS
-; /* Add b to a into r. (r = a + b)
-;  *
-;  * r  A single precision integer.
-;  * a  A single precision integer.
-;  * b  A single precision integer.
-;  *
-;  * Register use as seen in the body: the result is stored through rcx (r),
-;  * a is read through rdx and b through r8. Twenty-four 64-bit words (byte
-;  * offsets 0..184) are processed, starting at offset 0 and propagating the
-;  * carry upwards.
-;  * At ret, rax holds the carry out of the top word (0 or 1).
-;  * Scratch: r9, r10 and the flags. No stack is used.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_add_24 PROC
-        ; Add
-        ; r9 and r10 alternate as the working word. The next word of a is
-        ; loaded before the previous result word is stored; the mov
-        ; instructions in between leave the carry flag untouched.
-        mov	r9, QWORD PTR [rdx]
-        ; Clear the return value before the first add: xor changes the flags,
-        ; so it cannot be placed inside the add/adc chain.
-        xor	rax, rax
-        add	r9, QWORD PTR [r8]
-        mov	r10, QWORD PTR [rdx+8]
-        mov	QWORD PTR [rcx], r9
-        adc	r10, QWORD PTR [r8+8]
-        mov	r9, QWORD PTR [rdx+16]
-        mov	QWORD PTR [rcx+8], r10
-        adc	r9, QWORD PTR [r8+16]
-        mov	r10, QWORD PTR [rdx+24]
-        mov	QWORD PTR [rcx+16], r9
-        adc	r10, QWORD PTR [r8+24]
-        mov	r9, QWORD PTR [rdx+32]
-        mov	QWORD PTR [rcx+24], r10
-        adc	r9, QWORD PTR [r8+32]
-        mov	r10, QWORD PTR [rdx+40]
-        mov	QWORD PTR [rcx+32], r9
-        adc	r10, QWORD PTR [r8+40]
-        mov	r9, QWORD PTR [rdx+48]
-        mov	QWORD PTR [rcx+40], r10
-        adc	r9, QWORD PTR [r8+48]
-        mov	r10, QWORD PTR [rdx+56]
-        mov	QWORD PTR [rcx+48], r9
-        adc	r10, QWORD PTR [r8+56]
-        mov	r9, QWORD PTR [rdx+64]
-        mov	QWORD PTR [rcx+56], r10
-        adc	r9, QWORD PTR [r8+64]
-        mov	r10, QWORD PTR [rdx+72]
-        mov	QWORD PTR [rcx+64], r9
-        adc	r10, QWORD PTR [r8+72]
-        mov	r9, QWORD PTR [rdx+80]
-        mov	QWORD PTR [rcx+72], r10
-        adc	r9, QWORD PTR [r8+80]
-        mov	r10, QWORD PTR [rdx+88]
-        mov	QWORD PTR [rcx+80], r9
-        adc	r10, QWORD PTR [r8+88]
-        mov	r9, QWORD PTR [rdx+96]
-        mov	QWORD PTR [rcx+88], r10
-        adc	r9, QWORD PTR [r8+96]
-        mov	r10, QWORD PTR [rdx+104]
-        mov	QWORD PTR [rcx+96], r9
-        adc	r10, QWORD PTR [r8+104]
-        mov	r9, QWORD PTR [rdx+112]
-        mov	QWORD PTR [rcx+104], r10
-        adc	r9, QWORD PTR [r8+112]
-        mov	r10, QWORD PTR [rdx+120]
-        mov	QWORD PTR [rcx+112], r9
-        adc	r10, QWORD PTR [r8+120]
-        mov	r9, QWORD PTR [rdx+128]
-        mov	QWORD PTR [rcx+120], r10
-        adc	r9, QWORD PTR [r8+128]
-        mov	r10, QWORD PTR [rdx+136]
-        mov	QWORD PTR [rcx+128], r9
-        adc	r10, QWORD PTR [r8+136]
-        mov	r9, QWORD PTR [rdx+144]
-        mov	QWORD PTR [rcx+136], r10
-        adc	r9, QWORD PTR [r8+144]
-        mov	r10, QWORD PTR [rdx+152]
-        mov	QWORD PTR [rcx+144], r9
-        adc	r10, QWORD PTR [r8+152]
-        mov	r9, QWORD PTR [rdx+160]
-        mov	QWORD PTR [rcx+152], r10
-        adc	r9, QWORD PTR [r8+160]
-        mov	r10, QWORD PTR [rdx+168]
-        mov	QWORD PTR [rcx+160], r9
-        adc	r10, QWORD PTR [r8+168]
-        mov	r9, QWORD PTR [rdx+176]
-        mov	QWORD PTR [rcx+168], r10
-        adc	r9, QWORD PTR [r8+176]
-        mov	r10, QWORD PTR [rdx+184]
-        mov	QWORD PTR [rcx+176], r9
-        adc	r10, QWORD PTR [r8+184]
-        mov	QWORD PTR [rcx+184], r10
-        ; rax = 0 + carry out of the last adc.
-        adc	rax, 0
-        ret
-sp_3072_add_24 ENDP
-_text ENDS
-; /* Multiply a and b into r. (r = a * b)
-;  *
-;  * r  A single precision integer.
-;  * a  A single precision integer.
-;  * b  A single precision integer.
-;  *
-;  * On entry rcx = r, rdx = a, r8 = b (spilled to the stack in that order
-;  * below). a and b are each read as two 12-word halves 96 bytes apart
-;  * (24 64-bit words); r is written as 48 words (byte offsets 0..376).
-;  *
-;  * The product is assembled Karatsuba-style from three calls to
-;  * sp_3072_mul_12:
-;  *   z1 = (a_lo + a_hi) * (b_lo + b_hi)   -> [rsp+0]
-;  *   z2 = a_hi * b_hi                     -> [rsp+192]
-;  *   z0 = a_lo * b_lo                     -> r
-;  * NOTE(review): sp_3072_mul_12 is not defined in this part of the file;
-;  * the comments here assume it writes the 24-word product of the 12-word
-;  * operands at rdx and r8 to rcx - confirm against its definition.
-;  *
-;  * Stack frame (616 bytes):
-;  *   [rsp+0   .. rsp+191]  z1, later reduced in place to z1 - z2 - z0
-;  *   [rsp+192 .. rsp+383]  z2
-;  *   [rsp+384 .. rsp+479]  low 12 words of a_lo + a_hi
-;  *   [rsp+480 .. rsp+575]  low 12 words of b_lo + b_hi
-;  *   [rsp+576] r   [rsp+584] a   [rsp+592] b
-;  *   [rsp+600] carry out of a_lo + a_hi (0 or 1)
-;  *   [rsp+608] carry out of b_lo + b_hi (0 or 1)
-;  *
-;  * r12-r15, rdi and rsi are saved and restored; rax, r9, r10, r11 and the
-;  * flags are used as scratch.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_mul_24 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        ; 6 pushes (48 bytes) + 616 = 664 bytes, which is 8 mod 16, so rsp is
-        ; 16-byte aligned at the calls below when it was 8 mod 16 on entry.
-        sub	rsp, 616
-        mov	QWORD PTR [rsp+576], rcx
-        mov	QWORD PTR [rsp+584], rdx
-        mov	QWORD PTR [rsp+592], r8
-        ; r12 -> buffer for the a half sum, r14 -> a_hi (a + 12 words).
-        lea	r12, QWORD PTR [rsp+384]
-        lea	r14, QWORD PTR [rdx+96]
-        ; Add
-        ; [r12] = a_lo + a_hi over 12 words; r15 collects the carry out.
-        ; xor comes before the add because it would otherwise disturb the
-        ; carry chain.
-        mov	rax, QWORD PTR [rdx]
-        xor	r15, r15
-        add	rax, QWORD PTR [r14]
-        mov	r9, QWORD PTR [rdx+8]
-        mov	QWORD PTR [r12], rax
-        adc	r9, QWORD PTR [r14+8]
-        mov	r10, QWORD PTR [rdx+16]
-        mov	QWORD PTR [r12+8], r9
-        adc	r10, QWORD PTR [r14+16]
-        mov	rax, QWORD PTR [rdx+24]
-        mov	QWORD PTR [r12+16], r10
-        adc	rax, QWORD PTR [r14+24]
-        mov	r9, QWORD PTR [rdx+32]
-        mov	QWORD PTR [r12+24], rax
-        adc	r9, QWORD PTR [r14+32]
-        mov	r10, QWORD PTR [rdx+40]
-        mov	QWORD PTR [r12+32], r9
-        adc	r10, QWORD PTR [r14+40]
-        mov	rax, QWORD PTR [rdx+48]
-        mov	QWORD PTR [r12+40], r10
-        adc	rax, QWORD PTR [r14+48]
-        mov	r9, QWORD PTR [rdx+56]
-        mov	QWORD PTR [r12+48], rax
-        adc	r9, QWORD PTR [r14+56]
-        mov	r10, QWORD PTR [rdx+64]
-        mov	QWORD PTR [r12+56], r9
-        adc	r10, QWORD PTR [r14+64]
-        mov	rax, QWORD PTR [rdx+72]
-        mov	QWORD PTR [r12+64], r10
-        adc	rax, QWORD PTR [r14+72]
-        mov	r9, QWORD PTR [rdx+80]
-        mov	QWORD PTR [r12+72], rax
-        adc	r9, QWORD PTR [r14+80]
-        mov	r10, QWORD PTR [rdx+88]
-        mov	QWORD PTR [r12+80], r9
-        adc	r10, QWORD PTR [r14+88]
-        mov	QWORD PTR [r12+88], r10
-        adc	r15, 0
-        ; Keep the carry of the a half sum across the calls.
-        mov	QWORD PTR [rsp+600], r15
-        ; r13 -> buffer for the b half sum, r14 -> b_hi (b + 12 words).
-        lea	r13, QWORD PTR [rsp+480]
-        lea	r14, QWORD PTR [r8+96]
-        ; Add
-        ; [r13] = b_lo + b_hi over 12 words; rdi collects the carry out.
-        mov	rax, QWORD PTR [r8]
-        xor	rdi, rdi
-        add	rax, QWORD PTR [r14]
-        mov	r9, QWORD PTR [r8+8]
-        mov	QWORD PTR [r13], rax
-        adc	r9, QWORD PTR [r14+8]
-        mov	r10, QWORD PTR [r8+16]
-        mov	QWORD PTR [r13+8], r9
-        adc	r10, QWORD PTR [r14+16]
-        mov	rax, QWORD PTR [r8+24]
-        mov	QWORD PTR [r13+16], r10
-        adc	rax, QWORD PTR [r14+24]
-        mov	r9, QWORD PTR [r8+32]
-        mov	QWORD PTR [r13+24], rax
-        adc	r9, QWORD PTR [r14+32]
-        mov	r10, QWORD PTR [r8+40]
-        mov	QWORD PTR [r13+32], r9
-        adc	r10, QWORD PTR [r14+40]
-        mov	rax, QWORD PTR [r8+48]
-        mov	QWORD PTR [r13+40], r10
-        adc	rax, QWORD PTR [r14+48]
-        mov	r9, QWORD PTR [r8+56]
-        mov	QWORD PTR [r13+48], rax
-        adc	r9, QWORD PTR [r14+56]
-        mov	r10, QWORD PTR [r8+64]
-        mov	QWORD PTR [r13+56], r9
-        adc	r10, QWORD PTR [r14+64]
-        mov	rax, QWORD PTR [r8+72]
-        mov	QWORD PTR [r13+64], r10
-        adc	rax, QWORD PTR [r14+72]
-        mov	r9, QWORD PTR [r8+80]
-        mov	QWORD PTR [r13+72], rax
-        adc	r9, QWORD PTR [r14+80]
-        mov	r10, QWORD PTR [r8+88]
-        mov	QWORD PTR [r13+80], r9
-        adc	r10, QWORD PTR [r14+88]
-        mov	QWORD PTR [r13+88], r10
-        adc	rdi, 0
-        ; Keep the carry of the b half sum across the calls.
-        mov	QWORD PTR [rsp+608], rdi
-        ; Call 1: z1 = (a half sum) * (b half sum) -> [rsp+0].
-        ; NOTE(review): the output buffer starts at rsp itself, so no
-        ; separate 32-byte home area is reserved for the callee. That is
-        ; only safe if sp_3072_mul_12 never writes to its caller-provided
-        ; home space - confirm.
-        mov	r8, r13
-        mov	rdx, r12
-        mov	rcx, rsp
-        call	sp_3072_mul_12
-        ; Call 2: z2 = a_hi * b_hi -> [rsp+192].
-        mov	r8, QWORD PTR [rsp+592]
-        mov	rdx, QWORD PTR [rsp+584]
-        lea	rcx, QWORD PTR [rsp+192]
-        add	r8, 96
-        add	rdx, 96
-        call	sp_3072_mul_12
-        ; Call 3: z0 = a_lo * b_lo -> r[0..23].
-        mov	r8, QWORD PTR [rsp+592]
-        mov	rdx, QWORD PTR [rsp+584]
-        mov	rcx, QWORD PTR [rsp+576]
-        call	sp_3072_mul_12
-        ; NOTE(review): rcx is read below as the pointer to r (z0), but it
-        ; is reloaded only when _WIN64 is defined. Without that symbol the
-        ; code depends on sp_3072_mul_12 returning with rcx unchanged -
-        ; confirm. r8 and rdx are not read again in this procedure.
-IFDEF _WIN64
-        mov	r8, QWORD PTR [rsp+592]
-        mov	rdx, QWORD PTR [rsp+584]
-        mov	rcx, QWORD PTR [rsp+576]
-ENDIF
-        ; Carry fix-up for the half sums. With ca = carry of the a sum and
-        ; cb = carry of the b sum:
-        ;   r11 = ca AND cb       (running top word of the middle term)
-        ;   r15 = -ca, rdi = -cb  (all-ones masks when the carry was set)
-        ;   rsi = r + 192 bytes   (r[24])
-        mov	r15, QWORD PTR [rsp+600]
-        mov	rdi, QWORD PTR [rsp+608]
-        mov	rsi, QWORD PTR [rsp+576]
-        mov	r11, r15
-        lea	r12, QWORD PTR [rsp+384]
-        lea	r13, QWORD PTR [rsp+480]
-        and	r11, rdi
-        neg	r15
-        neg	rdi
-        add	rsi, 192
-        ; In place on the stack: a half sum &= -cb, b half sum &= -ca.
-        ; Masking instead of branching keeps the instruction sequence the
-        ; same for every carry value.
-        mov	rax, QWORD PTR [r12]
-        mov	r9, QWORD PTR [r13]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12], rax
-        mov	QWORD PTR [r13], r9
-        mov	rax, QWORD PTR [r12+8]
-        mov	r9, QWORD PTR [r13+8]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+8], rax
-        mov	QWORD PTR [r13+8], r9
-        mov	rax, QWORD PTR [r12+16]
-        mov	r9, QWORD PTR [r13+16]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+16], rax
-        mov	QWORD PTR [r13+16], r9
-        mov	rax, QWORD PTR [r12+24]
-        mov	r9, QWORD PTR [r13+24]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+24], rax
-        mov	QWORD PTR [r13+24], r9
-        mov	rax, QWORD PTR [r12+32]
-        mov	r9, QWORD PTR [r13+32]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+32], rax
-        mov	QWORD PTR [r13+32], r9
-        mov	rax, QWORD PTR [r12+40]
-        mov	r9, QWORD PTR [r13+40]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+40], rax
-        mov	QWORD PTR [r13+40], r9
-        mov	rax, QWORD PTR [r12+48]
-        mov	r9, QWORD PTR [r13+48]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+48], rax
-        mov	QWORD PTR [r13+48], r9
-        mov	rax, QWORD PTR [r12+56]
-        mov	r9, QWORD PTR [r13+56]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+56], rax
-        mov	QWORD PTR [r13+56], r9
-        mov	rax, QWORD PTR [r12+64]
-        mov	r9, QWORD PTR [r13+64]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+64], rax
-        mov	QWORD PTR [r13+64], r9
-        mov	rax, QWORD PTR [r12+72]
-        mov	r9, QWORD PTR [r13+72]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+72], rax
-        mov	QWORD PTR [r13+72], r9
-        mov	rax, QWORD PTR [r12+80]
-        mov	r9, QWORD PTR [r13+80]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+80], rax
-        mov	QWORD PTR [r13+80], r9
-        mov	rax, QWORD PTR [r12+88]
-        mov	r9, QWORD PTR [r13+88]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+88], rax
-        mov	QWORD PTR [r13+88], r9
-        ; r[24..35] = masked a half sum + masked b half sum; the carry out
-        ; is accumulated into r11.
-        mov	rax, QWORD PTR [r12]
-        add	rax, QWORD PTR [r13]
-        mov	r9, QWORD PTR [r12+8]
-        mov	QWORD PTR [rsi], rax
-        adc	r9, QWORD PTR [r13+8]
-        mov	r10, QWORD PTR [r12+16]
-        mov	QWORD PTR [rsi+8], r9
-        adc	r10, QWORD PTR [r13+16]
-        mov	rax, QWORD PTR [r12+24]
-        mov	QWORD PTR [rsi+16], r10
-        adc	rax, QWORD PTR [r13+24]
-        mov	r9, QWORD PTR [r12+32]
-        mov	QWORD PTR [rsi+24], rax
-        adc	r9, QWORD PTR [r13+32]
-        mov	r10, QWORD PTR [r12+40]
-        mov	QWORD PTR [rsi+32], r9
-        adc	r10, QWORD PTR [r13+40]
-        mov	rax, QWORD PTR [r12+48]
-        mov	QWORD PTR [rsi+40], r10
-        adc	rax, QWORD PTR [r13+48]
-        mov	r9, QWORD PTR [r12+56]
-        mov	QWORD PTR [rsi+48], rax
-        adc	r9, QWORD PTR [r13+56]
-        mov	r10, QWORD PTR [r12+64]
-        mov	QWORD PTR [rsi+56], r9
-        adc	r10, QWORD PTR [r13+64]
-        mov	rax, QWORD PTR [r12+72]
-        mov	QWORD PTR [rsi+64], r10
-        adc	rax, QWORD PTR [r13+72]
-        mov	r9, QWORD PTR [r12+80]
-        mov	QWORD PTR [rsi+72], rax
-        adc	r9, QWORD PTR [r13+80]
-        mov	r10, QWORD PTR [r12+88]
-        mov	QWORD PTR [rsi+80], r9
-        adc	r10, QWORD PTR [r13+88]
-        mov	QWORD PTR [rsi+88], r10
-        adc	r11, 0
-        ; From here r12 -> z1 ([rsp+0]) and r13 -> z2 ([rsp+192]).
-        lea	r13, QWORD PTR [rsp+192]
-        mov	r12, rsp
-        ; z1 -= z2 over 24 words, in place; the borrow out is taken from r11.
-        mov	rax, QWORD PTR [r12]
-        sub	rax, QWORD PTR [r13]
-        mov	r9, QWORD PTR [r12+8]
-        mov	QWORD PTR [r12], rax
-        sbb	r9, QWORD PTR [r13+8]
-        mov	r10, QWORD PTR [r12+16]
-        mov	QWORD PTR [r12+8], r9
-        sbb	r10, QWORD PTR [r13+16]
-        mov	rax, QWORD PTR [r12+24]
-        mov	QWORD PTR [r12+16], r10
-        sbb	rax, QWORD PTR [r13+24]
-        mov	r9, QWORD PTR [r12+32]
-        mov	QWORD PTR [r12+24], rax
-        sbb	r9, QWORD PTR [r13+32]
-        mov	r10, QWORD PTR [r12+40]
-        mov	QWORD PTR [r12+32], r9
-        sbb	r10, QWORD PTR [r13+40]
-        mov	rax, QWORD PTR [r12+48]
-        mov	QWORD PTR [r12+40], r10
-        sbb	rax, QWORD PTR [r13+48]
-        mov	r9, QWORD PTR [r12+56]
-        mov	QWORD PTR [r12+48], rax
-        sbb	r9, QWORD PTR [r13+56]
-        mov	r10, QWORD PTR [r12+64]
-        mov	QWORD PTR [r12+56], r9
-        sbb	r10, QWORD PTR [r13+64]
-        mov	rax, QWORD PTR [r12+72]
-        mov	QWORD PTR [r12+64], r10
-        sbb	rax, QWORD PTR [r13+72]
-        mov	r9, QWORD PTR [r12+80]
-        mov	QWORD PTR [r12+72], rax
-        sbb	r9, QWORD PTR [r13+80]
-        mov	r10, QWORD PTR [r12+88]
-        mov	QWORD PTR [r12+80], r9
-        sbb	r10, QWORD PTR [r13+88]
-        mov	rax, QWORD PTR [r12+96]
-        mov	QWORD PTR [r12+88], r10
-        sbb	rax, QWORD PTR [r13+96]
-        mov	r9, QWORD PTR [r12+104]
-        mov	QWORD PTR [r12+96], rax
-        sbb	r9, QWORD PTR [r13+104]
-        mov	r10, QWORD PTR [r12+112]
-        mov	QWORD PTR [r12+104], r9
-        sbb	r10, QWORD PTR [r13+112]
-        mov	rax, QWORD PTR [r12+120]
-        mov	QWORD PTR [r12+112], r10
-        sbb	rax, QWORD PTR [r13+120]
-        mov	r9, QWORD PTR [r12+128]
-        mov	QWORD PTR [r12+120], rax
-        sbb	r9, QWORD PTR [r13+128]
-        mov	r10, QWORD PTR [r12+136]
-        mov	QWORD PTR [r12+128], r9
-        sbb	r10, QWORD PTR [r13+136]
-        mov	rax, QWORD PTR [r12+144]
-        mov	QWORD PTR [r12+136], r10
-        sbb	rax, QWORD PTR [r13+144]
-        mov	r9, QWORD PTR [r12+152]
-        mov	QWORD PTR [r12+144], rax
-        sbb	r9, QWORD PTR [r13+152]
-        mov	r10, QWORD PTR [r12+160]
-        mov	QWORD PTR [r12+152], r9
-        sbb	r10, QWORD PTR [r13+160]
-        mov	rax, QWORD PTR [r12+168]
-        mov	QWORD PTR [r12+160], r10
-        sbb	rax, QWORD PTR [r13+168]
-        mov	r9, QWORD PTR [r12+176]
-        mov	QWORD PTR [r12+168], rax
-        sbb	r9, QWORD PTR [r13+176]
-        mov	r10, QWORD PTR [r12+184]
-        mov	QWORD PTR [r12+176], r9
-        sbb	r10, QWORD PTR [r13+184]
-        mov	QWORD PTR [r12+184], r10
-        sbb	r11, 0
-        ; z1 -= z0 over 24 words, where z0 is read from r[0..23] through
-        ; rcx; the borrow out is again taken from r11.
-        mov	rax, QWORD PTR [r12]
-        sub	rax, QWORD PTR [rcx]
-        mov	r9, QWORD PTR [r12+8]
-        mov	QWORD PTR [r12], rax
-        sbb	r9, QWORD PTR [rcx+8]
-        mov	r10, QWORD PTR [r12+16]
-        mov	QWORD PTR [r12+8], r9
-        sbb	r10, QWORD PTR [rcx+16]
-        mov	rax, QWORD PTR [r12+24]
-        mov	QWORD PTR [r12+16], r10
-        sbb	rax, QWORD PTR [rcx+24]
-        mov	r9, QWORD PTR [r12+32]
-        mov	QWORD PTR [r12+24], rax
-        sbb	r9, QWORD PTR [rcx+32]
-        mov	r10, QWORD PTR [r12+40]
-        mov	QWORD PTR [r12+32], r9
-        sbb	r10, QWORD PTR [rcx+40]
-        mov	rax, QWORD PTR [r12+48]
-        mov	QWORD PTR [r12+40], r10
-        sbb	rax, QWORD PTR [rcx+48]
-        mov	r9, QWORD PTR [r12+56]
-        mov	QWORD PTR [r12+48], rax
-        sbb	r9, QWORD PTR [rcx+56]
-        mov	r10, QWORD PTR [r12+64]
-        mov	QWORD PTR [r12+56], r9
-        sbb	r10, QWORD PTR [rcx+64]
-        mov	rax, QWORD PTR [r12+72]
-        mov	QWORD PTR [r12+64], r10
-        sbb	rax, QWORD PTR [rcx+72]
-        mov	r9, QWORD PTR [r12+80]
-        mov	QWORD PTR [r12+72], rax
-        sbb	r9, QWORD PTR [rcx+80]
-        mov	r10, QWORD PTR [r12+88]
-        mov	QWORD PTR [r12+80], r9
-        sbb	r10, QWORD PTR [rcx+88]
-        mov	rax, QWORD PTR [r12+96]
-        mov	QWORD PTR [r12+88], r10
-        sbb	rax, QWORD PTR [rcx+96]
-        mov	r9, QWORD PTR [r12+104]
-        mov	QWORD PTR [r12+96], rax
-        sbb	r9, QWORD PTR [rcx+104]
-        mov	r10, QWORD PTR [r12+112]
-        mov	QWORD PTR [r12+104], r9
-        sbb	r10, QWORD PTR [rcx+112]
-        mov	rax, QWORD PTR [r12+120]
-        mov	QWORD PTR [r12+112], r10
-        sbb	rax, QWORD PTR [rcx+120]
-        mov	r9, QWORD PTR [r12+128]
-        mov	QWORD PTR [r12+120], rax
-        sbb	r9, QWORD PTR [rcx+128]
-        mov	r10, QWORD PTR [r12+136]
-        mov	QWORD PTR [r12+128], r9
-        sbb	r10, QWORD PTR [rcx+136]
-        mov	rax, QWORD PTR [r12+144]
-        mov	QWORD PTR [r12+136], r10
-        sbb	rax, QWORD PTR [rcx+144]
-        mov	r9, QWORD PTR [r12+152]
-        mov	QWORD PTR [r12+144], rax
-        sbb	r9, QWORD PTR [rcx+152]
-        mov	r10, QWORD PTR [r12+160]
-        mov	QWORD PTR [r12+152], r9
-        sbb	r10, QWORD PTR [rcx+160]
-        mov	rax, QWORD PTR [r12+168]
-        mov	QWORD PTR [r12+160], r10
-        sbb	rax, QWORD PTR [rcx+168]
-        mov	r9, QWORD PTR [r12+176]
-        mov	QWORD PTR [r12+168], rax
-        sbb	r9, QWORD PTR [rcx+176]
-        mov	r10, QWORD PTR [r12+184]
-        mov	QWORD PTR [r12+176], r9
-        sbb	r10, QWORD PTR [rcx+184]
-        mov	QWORD PTR [r12+184], r10
-        sbb	r11, 0
-        ; rsi = r + 96 bytes (r[12]).
-        sub	rsi, 96
-        ; Add
-        ; r[12..35] += z1 (the reduced middle term, 24 words); the carry
-        ; out is accumulated into r11.
-        mov	rax, QWORD PTR [rsi]
-        add	rax, QWORD PTR [r12]
-        mov	r9, QWORD PTR [rsi+8]
-        mov	QWORD PTR [rsi], rax
-        adc	r9, QWORD PTR [r12+8]
-        mov	r10, QWORD PTR [rsi+16]
-        mov	QWORD PTR [rsi+8], r9
-        adc	r10, QWORD PTR [r12+16]
-        mov	rax, QWORD PTR [rsi+24]
-        mov	QWORD PTR [rsi+16], r10
-        adc	rax, QWORD PTR [r12+24]
-        mov	r9, QWORD PTR [rsi+32]
-        mov	QWORD PTR [rsi+24], rax
-        adc	r9, QWORD PTR [r12+32]
-        mov	r10, QWORD PTR [rsi+40]
-        mov	QWORD PTR [rsi+32], r9
-        adc	r10, QWORD PTR [r12+40]
-        mov	rax, QWORD PTR [rsi+48]
-        mov	QWORD PTR [rsi+40], r10
-        adc	rax, QWORD PTR [r12+48]
-        mov	r9, QWORD PTR [rsi+56]
-        mov	QWORD PTR [rsi+48], rax
-        adc	r9, QWORD PTR [r12+56]
-        mov	r10, QWORD PTR [rsi+64]
-        mov	QWORD PTR [rsi+56], r9
-        adc	r10, QWORD PTR [r12+64]
-        mov	rax, QWORD PTR [rsi+72]
-        mov	QWORD PTR [rsi+64], r10
-        adc	rax, QWORD PTR [r12+72]
-        mov	r9, QWORD PTR [rsi+80]
-        mov	QWORD PTR [rsi+72], rax
-        adc	r9, QWORD PTR [r12+80]
-        mov	r10, QWORD PTR [rsi+88]
-        mov	QWORD PTR [rsi+80], r9
-        adc	r10, QWORD PTR [r12+88]
-        mov	rax, QWORD PTR [rsi+96]
-        mov	QWORD PTR [rsi+88], r10
-        adc	rax, QWORD PTR [r12+96]
-        mov	r9, QWORD PTR [rsi+104]
-        mov	QWORD PTR [rsi+96], rax
-        adc	r9, QWORD PTR [r12+104]
-        mov	r10, QWORD PTR [rsi+112]
-        mov	QWORD PTR [rsi+104], r9
-        adc	r10, QWORD PTR [r12+112]
-        mov	rax, QWORD PTR [rsi+120]
-        mov	QWORD PTR [rsi+112], r10
-        adc	rax, QWORD PTR [r12+120]
-        mov	r9, QWORD PTR [rsi+128]
-        mov	QWORD PTR [rsi+120], rax
-        adc	r9, QWORD PTR [r12+128]
-        mov	r10, QWORD PTR [rsi+136]
-        mov	QWORD PTR [rsi+128], r9
-        adc	r10, QWORD PTR [r12+136]
-        mov	rax, QWORD PTR [rsi+144]
-        mov	QWORD PTR [rsi+136], r10
-        adc	rax, QWORD PTR [r12+144]
-        mov	r9, QWORD PTR [rsi+152]
-        mov	QWORD PTR [rsi+144], rax
-        adc	r9, QWORD PTR [r12+152]
-        mov	r10, QWORD PTR [rsi+160]
-        mov	QWORD PTR [rsi+152], r9
-        adc	r10, QWORD PTR [r12+160]
-        mov	rax, QWORD PTR [rsi+168]
-        mov	QWORD PTR [rsi+160], r10
-        adc	rax, QWORD PTR [r12+168]
-        mov	r9, QWORD PTR [rsi+176]
-        mov	QWORD PTR [rsi+168], rax
-        adc	r9, QWORD PTR [r12+176]
-        mov	r10, QWORD PTR [rsi+184]
-        mov	QWORD PTR [rsi+176], r9
-        adc	r10, QWORD PTR [r12+184]
-        mov	QWORD PTR [rsi+184], r10
-        adc	r11, 0
-        ; r[36] = accumulated top word of the middle term.
-        mov	QWORD PTR [rcx+288], r11
-        ; rsi = r + 192 bytes (r[24]) again.
-        add	rsi, 96
-        ; Add
-        ; r[24..36] += z2[0..12] (13 words, including the r[36] word just
-        ; stored).
-        mov	rax, QWORD PTR [rsi]
-        add	rax, QWORD PTR [r13]
-        mov	r9, QWORD PTR [rsi+8]
-        mov	QWORD PTR [rsi], rax
-        adc	r9, QWORD PTR [r13+8]
-        mov	r10, QWORD PTR [rsi+16]
-        mov	QWORD PTR [rsi+8], r9
-        adc	r10, QWORD PTR [r13+16]
-        mov	rax, QWORD PTR [rsi+24]
-        mov	QWORD PTR [rsi+16], r10
-        adc	rax, QWORD PTR [r13+24]
-        mov	r9, QWORD PTR [rsi+32]
-        mov	QWORD PTR [rsi+24], rax
-        adc	r9, QWORD PTR [r13+32]
-        mov	r10, QWORD PTR [rsi+40]
-        mov	QWORD PTR [rsi+32], r9
-        adc	r10, QWORD PTR [r13+40]
-        mov	rax, QWORD PTR [rsi+48]
-        mov	QWORD PTR [rsi+40], r10
-        adc	rax, QWORD PTR [r13+48]
-        mov	r9, QWORD PTR [rsi+56]
-        mov	QWORD PTR [rsi+48], rax
-        adc	r9, QWORD PTR [r13+56]
-        mov	r10, QWORD PTR [rsi+64]
-        mov	QWORD PTR [rsi+56], r9
-        adc	r10, QWORD PTR [r13+64]
-        mov	rax, QWORD PTR [rsi+72]
-        mov	QWORD PTR [rsi+64], r10
-        adc	rax, QWORD PTR [r13+72]
-        mov	r9, QWORD PTR [rsi+80]
-        mov	QWORD PTR [rsi+72], rax
-        adc	r9, QWORD PTR [r13+80]
-        mov	r10, QWORD PTR [rsi+88]
-        mov	QWORD PTR [rsi+80], r9
-        adc	r10, QWORD PTR [r13+88]
-        mov	rax, QWORD PTR [rsi+96]
-        mov	QWORD PTR [rsi+88], r10
-        adc	rax, QWORD PTR [r13+96]
-        mov	QWORD PTR [rsi+96], rax
-        ; Add to zero
-        ; r[37..47] = z2[13..23] + carry. These words of r are only
-        ; written here, never read, so the running carry is simply added
-        ; to the z2 words.
-        mov	rax, QWORD PTR [r13+104]
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+112]
-        mov	QWORD PTR [rsi+104], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+120]
-        mov	QWORD PTR [rsi+112], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+128]
-        mov	QWORD PTR [rsi+120], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+136]
-        mov	QWORD PTR [rsi+128], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+144]
-        mov	QWORD PTR [rsi+136], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+152]
-        mov	QWORD PTR [rsi+144], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+160]
-        mov	QWORD PTR [rsi+152], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+168]
-        mov	QWORD PTR [rsi+160], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+176]
-        mov	QWORD PTR [rsi+168], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+184]
-        mov	QWORD PTR [rsi+176], rax
-        adc	r9, 0
-        mov	QWORD PTR [rsi+184], r9
-        ; Release the frame and restore the saved registers in reverse
-        ; push order. rax is left holding a scratch value, not a result.
-        add	rsp, 616
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_3072_mul_24 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Multiply a and b into r. (r = a * b)
-;  *
-;  * One level of Karatsuba built on sp_3072_mul_avx2_12 (defined elsewhere in
-;  * this file; presumably a 12-word by 12-word multiply producing 24 words -
-;  * confirm against its definition). Writing a = a1*2^768 + a0 and
-;  * b = b1*2^768 + b0, the code forms
-;  *   z0 = a0*b0,  z2 = a1*b1,  z1 = (a0+a1)*(b0+b1)
-;  * and combines them as r = z0 + (z1 - z0 - z2)*2^768 + z2*2^1536.
-;  * In the comments below x[i] means the 64-bit word at byte offset 8*i.
-;  *
-;  * r  (rcx) A single precision integer; words r[0..47] are written.
-;  * a  (rdx) A single precision integer; words a[0..23] are read.
-;  * b  (r8)  A single precision integer; words b[0..23] are read.
-;  *
-;  * Stack frame (616 bytes below the six saved registers):
-;  *   [rsp+0]    z1, 24 words
-;  *   [rsp+192]  z2, 24 words
-;  *   [rsp+384]  a0+a1, 12 words (carry out kept at [rsp+600])
-;  *   [rsp+480]  b0+b1, 12 words (carry out kept at [rsp+608])
-;  *   [rsp+576]  saved r, [rsp+584] saved a, [rsp+592] saved b
-;  *
-;  * Uses pext, so the CPU must support BMI2. r12-r15, rdi and rsi are saved
-;  * and restored; rax, r9, r10, r11 and the flags are modified here.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_mul_avx2_24 PROC
-        ; Save the registers used as long-lived temporaries.
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        sub	rsp, 616
-        ; Keep r, a and b on the stack; they are reloaded after each call.
-        mov	QWORD PTR [rsp+576], rcx
-        mov	QWORD PTR [rsp+584], rdx
-        mov	QWORD PTR [rsp+592], r8
-        ; r12 = buffer for a0+a1, r14 = a1 (upper 12 words of a).
-        lea	r12, QWORD PTR [rsp+384]
-        lea	r14, QWORD PTR [rdx+96]
-        ; Add: [r12] = a0 + a1 over 12 words, carry out collected in r15.
-        ; The xor that clears r15 sits before the first add because xor
-        ; changes the flags and must not break the adc chain.
-        mov	rax, QWORD PTR [rdx]
-        xor	r15, r15
-        add	rax, QWORD PTR [r14]
-        mov	r9, QWORD PTR [rdx+8]
-        mov	QWORD PTR [r12], rax
-        adc	r9, QWORD PTR [r14+8]
-        mov	r10, QWORD PTR [rdx+16]
-        mov	QWORD PTR [r12+8], r9
-        adc	r10, QWORD PTR [r14+16]
-        mov	rax, QWORD PTR [rdx+24]
-        mov	QWORD PTR [r12+16], r10
-        adc	rax, QWORD PTR [r14+24]
-        mov	r9, QWORD PTR [rdx+32]
-        mov	QWORD PTR [r12+24], rax
-        adc	r9, QWORD PTR [r14+32]
-        mov	r10, QWORD PTR [rdx+40]
-        mov	QWORD PTR [r12+32], r9
-        adc	r10, QWORD PTR [r14+40]
-        mov	rax, QWORD PTR [rdx+48]
-        mov	QWORD PTR [r12+40], r10
-        adc	rax, QWORD PTR [r14+48]
-        mov	r9, QWORD PTR [rdx+56]
-        mov	QWORD PTR [r12+48], rax
-        adc	r9, QWORD PTR [r14+56]
-        mov	r10, QWORD PTR [rdx+64]
-        mov	QWORD PTR [r12+56], r9
-        adc	r10, QWORD PTR [r14+64]
-        mov	rax, QWORD PTR [rdx+72]
-        mov	QWORD PTR [r12+64], r10
-        adc	rax, QWORD PTR [r14+72]
-        mov	r9, QWORD PTR [rdx+80]
-        mov	QWORD PTR [r12+72], rax
-        adc	r9, QWORD PTR [r14+80]
-        mov	r10, QWORD PTR [rdx+88]
-        mov	QWORD PTR [r12+80], r9
-        adc	r10, QWORD PTR [r14+88]
-        mov	QWORD PTR [r12+88], r10
-        adc	r15, 0
-        ; Carry out of a0+a1 (0 or 1) is parked on the stack across the calls.
-        mov	QWORD PTR [rsp+600], r15
-        ; r13 = buffer for b0+b1, r14 = b1 (upper 12 words of b).
-        lea	r13, QWORD PTR [rsp+480]
-        lea	r14, QWORD PTR [r8+96]
-        ; Add: [r13] = b0 + b1 over 12 words, carry out collected in rdi.
-        mov	rax, QWORD PTR [r8]
-        xor	rdi, rdi
-        add	rax, QWORD PTR [r14]
-        mov	r9, QWORD PTR [r8+8]
-        mov	QWORD PTR [r13], rax
-        adc	r9, QWORD PTR [r14+8]
-        mov	r10, QWORD PTR [r8+16]
-        mov	QWORD PTR [r13+8], r9
-        adc	r10, QWORD PTR [r14+16]
-        mov	rax, QWORD PTR [r8+24]
-        mov	QWORD PTR [r13+16], r10
-        adc	rax, QWORD PTR [r14+24]
-        mov	r9, QWORD PTR [r8+32]
-        mov	QWORD PTR [r13+24], rax
-        adc	r9, QWORD PTR [r14+32]
-        mov	r10, QWORD PTR [r8+40]
-        mov	QWORD PTR [r13+32], r9
-        adc	r10, QWORD PTR [r14+40]
-        mov	rax, QWORD PTR [r8+48]
-        mov	QWORD PTR [r13+40], r10
-        adc	rax, QWORD PTR [r14+48]
-        mov	r9, QWORD PTR [r8+56]
-        mov	QWORD PTR [r13+48], rax
-        adc	r9, QWORD PTR [r14+56]
-        mov	r10, QWORD PTR [r8+64]
-        mov	QWORD PTR [r13+56], r9
-        adc	r10, QWORD PTR [r14+64]
-        mov	rax, QWORD PTR [r8+72]
-        mov	QWORD PTR [r13+64], r10
-        adc	rax, QWORD PTR [r14+72]
-        mov	r9, QWORD PTR [r8+80]
-        mov	QWORD PTR [r13+72], rax
-        adc	r9, QWORD PTR [r14+80]
-        mov	r10, QWORD PTR [r8+88]
-        mov	QWORD PTR [r13+80], r9
-        adc	r10, QWORD PTR [r14+88]
-        mov	QWORD PTR [r13+88], r10
-        adc	rdi, 0
-        ; Carry out of b0+b1 (0 or 1) is parked on the stack across the calls.
-        mov	QWORD PTR [rsp+608], rdi
-        ; z1 = (a0+a1) * (b0+b1), written to [rsp+0]. The two carry bits are
-        ; not part of this product; they are folded in after the calls.
-        mov	r8, r13
-        mov	rdx, r12
-        mov	rcx, rsp
-        call	sp_3072_mul_avx2_12
-        ; z2 = a1 * b1, written to [rsp+192].
-        mov	r8, QWORD PTR [rsp+592]
-        mov	rdx, QWORD PTR [rsp+584]
-        lea	rcx, QWORD PTR [rsp+192]
-        add	r8, 96
-        add	rdx, 96
-        call	sp_3072_mul_avx2_12
-        ; z0 = a0 * b0, written straight into the result buffer r.
-        mov	r8, QWORD PTR [rsp+592]
-        mov	rdx, QWORD PTR [rsp+584]
-        mov	rcx, QWORD PTR [rsp+576]
-        call	sp_3072_mul_avx2_12
-        ; Restore the argument registers after the call. rcx is dereferenced
-        ; below as r, so it must point at the result buffer from here on.
-        ; NOTE(review): the reload only happens when _WIN64 is defined; confirm
-        ; the build always defines it or that sp_3072_mul_avx2_12 preserves rcx.
-IFDEF _WIN64
-        mov	r8, QWORD PTR [rsp+592]
-        mov	rdx, QWORD PTR [rsp+584]
-        mov	rcx, QWORD PTR [rsp+576]
-ENDIF
-        ; Reload the two saved carries: r15 = ca (from a0+a1), rdi = cb (from
-        ; b0+b1). r11 = ca AND cb starts the top word of the middle term. The
-        ; negs turn r15 and rdi into masks of 0 or all ones. rsi = &r[24].
-        mov	r15, QWORD PTR [rsp+600]
-        mov	rdi, QWORD PTR [rsp+608]
-        mov	rsi, QWORD PTR [rsp+576]
-        mov	r11, r15
-        lea	r12, QWORD PTR [rsp+384]
-        lea	r13, QWORD PTR [rsp+480]
-        and	r11, rdi
-        neg	r15
-        neg	rdi
-        add	rsi, 192
-        ; r[24..35] = (cb ? a0+a1 : 0) + (ca ? b0+b1 : 0). pext with an
-        ; all-ones mask returns its source unchanged and with a zero mask
-        ; returns 0, so it acts as a branch-free select. pext and mov do not
-        ; modify the flags, so the carry runs through the adc chain; the final
-        ; carry is added into r11.
-        mov	rax, QWORD PTR [r12]
-        mov	r9, QWORD PTR [r13]
-        pext	rax, rax, rdi
-        pext	r9, r9, r15
-        add	rax, r9
-        mov	r9, QWORD PTR [r12+8]
-        mov	r10, QWORD PTR [r13+8]
-        pext	r9, r9, rdi
-        pext	r10, r10, r15
-        mov	QWORD PTR [rsi], rax
-        adc	r9, r10
-        mov	r10, QWORD PTR [r12+16]
-        mov	rax, QWORD PTR [r13+16]
-        pext	r10, r10, rdi
-        pext	rax, rax, r15
-        mov	QWORD PTR [rsi+8], r9
-        adc	r10, rax
-        mov	rax, QWORD PTR [r12+24]
-        mov	r9, QWORD PTR [r13+24]
-        pext	rax, rax, rdi
-        pext	r9, r9, r15
-        mov	QWORD PTR [rsi+16], r10
-        adc	rax, r9
-        mov	r9, QWORD PTR [r12+32]
-        mov	r10, QWORD PTR [r13+32]
-        pext	r9, r9, rdi
-        pext	r10, r10, r15
-        mov	QWORD PTR [rsi+24], rax
-        adc	r9, r10
-        mov	r10, QWORD PTR [r12+40]
-        mov	rax, QWORD PTR [r13+40]
-        pext	r10, r10, rdi
-        pext	rax, rax, r15
-        mov	QWORD PTR [rsi+32], r9
-        adc	r10, rax
-        mov	rax, QWORD PTR [r12+48]
-        mov	r9, QWORD PTR [r13+48]
-        pext	rax, rax, rdi
-        pext	r9, r9, r15
-        mov	QWORD PTR [rsi+40], r10
-        adc	rax, r9
-        mov	r9, QWORD PTR [r12+56]
-        mov	r10, QWORD PTR [r13+56]
-        pext	r9, r9, rdi
-        pext	r10, r10, r15
-        mov	QWORD PTR [rsi+48], rax
-        adc	r9, r10
-        mov	r10, QWORD PTR [r12+64]
-        mov	rax, QWORD PTR [r13+64]
-        pext	r10, r10, rdi
-        pext	rax, rax, r15
-        mov	QWORD PTR [rsi+56], r9
-        adc	r10, rax
-        mov	rax, QWORD PTR [r12+72]
-        mov	r9, QWORD PTR [r13+72]
-        pext	rax, rax, rdi
-        pext	r9, r9, r15
-        mov	QWORD PTR [rsi+64], r10
-        adc	rax, r9
-        mov	r9, QWORD PTR [r12+80]
-        mov	r10, QWORD PTR [r13+80]
-        pext	r9, r9, rdi
-        pext	r10, r10, r15
-        mov	QWORD PTR [rsi+72], rax
-        adc	r9, r10
-        mov	r10, QWORD PTR [r12+88]
-        mov	rax, QWORD PTR [r13+88]
-        pext	r10, r10, rdi
-        pext	rax, rax, r15
-        mov	QWORD PTR [rsi+80], r9
-        adc	r10, rax
-        mov	QWORD PTR [rsi+88], r10
-        adc	r11, 0
-        ; z1 -= z2 over 24 words, in place (r12 = z1 at rsp, r13 = z2 at
-        ; rsp+192). The final borrow is taken out of r11.
-        lea	r13, QWORD PTR [rsp+192]
-        mov	r12, rsp
-        mov	rax, QWORD PTR [r12]
-        sub	rax, QWORD PTR [r13]
-        mov	r9, QWORD PTR [r12+8]
-        mov	QWORD PTR [r12], rax
-        sbb	r9, QWORD PTR [r13+8]
-        mov	r10, QWORD PTR [r12+16]
-        mov	QWORD PTR [r12+8], r9
-        sbb	r10, QWORD PTR [r13+16]
-        mov	rax, QWORD PTR [r12+24]
-        mov	QWORD PTR [r12+16], r10
-        sbb	rax, QWORD PTR [r13+24]
-        mov	r9, QWORD PTR [r12+32]
-        mov	QWORD PTR [r12+24], rax
-        sbb	r9, QWORD PTR [r13+32]
-        mov	r10, QWORD PTR [r12+40]
-        mov	QWORD PTR [r12+32], r9
-        sbb	r10, QWORD PTR [r13+40]
-        mov	rax, QWORD PTR [r12+48]
-        mov	QWORD PTR [r12+40], r10
-        sbb	rax, QWORD PTR [r13+48]
-        mov	r9, QWORD PTR [r12+56]
-        mov	QWORD PTR [r12+48], rax
-        sbb	r9, QWORD PTR [r13+56]
-        mov	r10, QWORD PTR [r12+64]
-        mov	QWORD PTR [r12+56], r9
-        sbb	r10, QWORD PTR [r13+64]
-        mov	rax, QWORD PTR [r12+72]
-        mov	QWORD PTR [r12+64], r10
-        sbb	rax, QWORD PTR [r13+72]
-        mov	r9, QWORD PTR [r12+80]
-        mov	QWORD PTR [r12+72], rax
-        sbb	r9, QWORD PTR [r13+80]
-        mov	r10, QWORD PTR [r12+88]
-        mov	QWORD PTR [r12+80], r9
-        sbb	r10, QWORD PTR [r13+88]
-        mov	rax, QWORD PTR [r12+96]
-        mov	QWORD PTR [r12+88], r10
-        sbb	rax, QWORD PTR [r13+96]
-        mov	r9, QWORD PTR [r12+104]
-        mov	QWORD PTR [r12+96], rax
-        sbb	r9, QWORD PTR [r13+104]
-        mov	r10, QWORD PTR [r12+112]
-        mov	QWORD PTR [r12+104], r9
-        sbb	r10, QWORD PTR [r13+112]
-        mov	rax, QWORD PTR [r12+120]
-        mov	QWORD PTR [r12+112], r10
-        sbb	rax, QWORD PTR [r13+120]
-        mov	r9, QWORD PTR [r12+128]
-        mov	QWORD PTR [r12+120], rax
-        sbb	r9, QWORD PTR [r13+128]
-        mov	r10, QWORD PTR [r12+136]
-        mov	QWORD PTR [r12+128], r9
-        sbb	r10, QWORD PTR [r13+136]
-        mov	rax, QWORD PTR [r12+144]
-        mov	QWORD PTR [r12+136], r10
-        sbb	rax, QWORD PTR [r13+144]
-        mov	r9, QWORD PTR [r12+152]
-        mov	QWORD PTR [r12+144], rax
-        sbb	r9, QWORD PTR [r13+152]
-        mov	r10, QWORD PTR [r12+160]
-        mov	QWORD PTR [r12+152], r9
-        sbb	r10, QWORD PTR [r13+160]
-        mov	rax, QWORD PTR [r12+168]
-        mov	QWORD PTR [r12+160], r10
-        sbb	rax, QWORD PTR [r13+168]
-        mov	r9, QWORD PTR [r12+176]
-        mov	QWORD PTR [r12+168], rax
-        sbb	r9, QWORD PTR [r13+176]
-        mov	r10, QWORD PTR [r12+184]
-        mov	QWORD PTR [r12+176], r9
-        sbb	r10, QWORD PTR [r13+184]
-        mov	QWORD PTR [r12+184], r10
-        sbb	r11, 0
-        ; z1 -= z0 over 24 words, in place. z0 is read through rcx, i.e. from
-        ; r[0..23]. The final borrow is again taken out of r11.
-        mov	rax, QWORD PTR [r12]
-        sub	rax, QWORD PTR [rcx]
-        mov	r9, QWORD PTR [r12+8]
-        mov	QWORD PTR [r12], rax
-        sbb	r9, QWORD PTR [rcx+8]
-        mov	r10, QWORD PTR [r12+16]
-        mov	QWORD PTR [r12+8], r9
-        sbb	r10, QWORD PTR [rcx+16]
-        mov	rax, QWORD PTR [r12+24]
-        mov	QWORD PTR [r12+16], r10
-        sbb	rax, QWORD PTR [rcx+24]
-        mov	r9, QWORD PTR [r12+32]
-        mov	QWORD PTR [r12+24], rax
-        sbb	r9, QWORD PTR [rcx+32]
-        mov	r10, QWORD PTR [r12+40]
-        mov	QWORD PTR [r12+32], r9
-        sbb	r10, QWORD PTR [rcx+40]
-        mov	rax, QWORD PTR [r12+48]
-        mov	QWORD PTR [r12+40], r10
-        sbb	rax, QWORD PTR [rcx+48]
-        mov	r9, QWORD PTR [r12+56]
-        mov	QWORD PTR [r12+48], rax
-        sbb	r9, QWORD PTR [rcx+56]
-        mov	r10, QWORD PTR [r12+64]
-        mov	QWORD PTR [r12+56], r9
-        sbb	r10, QWORD PTR [rcx+64]
-        mov	rax, QWORD PTR [r12+72]
-        mov	QWORD PTR [r12+64], r10
-        sbb	rax, QWORD PTR [rcx+72]
-        mov	r9, QWORD PTR [r12+80]
-        mov	QWORD PTR [r12+72], rax
-        sbb	r9, QWORD PTR [rcx+80]
-        mov	r10, QWORD PTR [r12+88]
-        mov	QWORD PTR [r12+80], r9
-        sbb	r10, QWORD PTR [rcx+88]
-        mov	rax, QWORD PTR [r12+96]
-        mov	QWORD PTR [r12+88], r10
-        sbb	rax, QWORD PTR [rcx+96]
-        mov	r9, QWORD PTR [r12+104]
-        mov	QWORD PTR [r12+96], rax
-        sbb	r9, QWORD PTR [rcx+104]
-        mov	r10, QWORD PTR [r12+112]
-        mov	QWORD PTR [r12+104], r9
-        sbb	r10, QWORD PTR [rcx+112]
-        mov	rax, QWORD PTR [r12+120]
-        mov	QWORD PTR [r12+112], r10
-        sbb	rax, QWORD PTR [rcx+120]
-        mov	r9, QWORD PTR [r12+128]
-        mov	QWORD PTR [r12+120], rax
-        sbb	r9, QWORD PTR [rcx+128]
-        mov	r10, QWORD PTR [r12+136]
-        mov	QWORD PTR [r12+128], r9
-        sbb	r10, QWORD PTR [rcx+136]
-        mov	rax, QWORD PTR [r12+144]
-        mov	QWORD PTR [r12+136], r10
-        sbb	rax, QWORD PTR [rcx+144]
-        mov	r9, QWORD PTR [r12+152]
-        mov	QWORD PTR [r12+144], rax
-        sbb	r9, QWORD PTR [rcx+152]
-        mov	r10, QWORD PTR [r12+160]
-        mov	QWORD PTR [r12+152], r9
-        sbb	r10, QWORD PTR [rcx+160]
-        mov	rax, QWORD PTR [r12+168]
-        mov	QWORD PTR [r12+160], r10
-        sbb	rax, QWORD PTR [rcx+168]
-        mov	r9, QWORD PTR [r12+176]
-        mov	QWORD PTR [r12+168], rax
-        sbb	r9, QWORD PTR [rcx+176]
-        mov	r10, QWORD PTR [r12+184]
-        mov	QWORD PTR [r12+176], r9
-        sbb	r10, QWORD PTR [rcx+184]
-        mov	QWORD PTR [r12+184], r10
-        sbb	r11, 0
-        ; Step rsi back from &r[24] to &r[12], where the middle term lands.
-        sub	rsi, 96
-        ; Add: r[12..35] += (z1 - z2 - z0), 24 words; the carry out goes
-        ; into r11.
-        mov	rax, QWORD PTR [rsi]
-        add	rax, QWORD PTR [r12]
-        mov	r9, QWORD PTR [rsi+8]
-        mov	QWORD PTR [rsi], rax
-        adc	r9, QWORD PTR [r12+8]
-        mov	r10, QWORD PTR [rsi+16]
-        mov	QWORD PTR [rsi+8], r9
-        adc	r10, QWORD PTR [r12+16]
-        mov	rax, QWORD PTR [rsi+24]
-        mov	QWORD PTR [rsi+16], r10
-        adc	rax, QWORD PTR [r12+24]
-        mov	r9, QWORD PTR [rsi+32]
-        mov	QWORD PTR [rsi+24], rax
-        adc	r9, QWORD PTR [r12+32]
-        mov	r10, QWORD PTR [rsi+40]
-        mov	QWORD PTR [rsi+32], r9
-        adc	r10, QWORD PTR [r12+40]
-        mov	rax, QWORD PTR [rsi+48]
-        mov	QWORD PTR [rsi+40], r10
-        adc	rax, QWORD PTR [r12+48]
-        mov	r9, QWORD PTR [rsi+56]
-        mov	QWORD PTR [rsi+48], rax
-        adc	r9, QWORD PTR [r12+56]
-        mov	r10, QWORD PTR [rsi+64]
-        mov	QWORD PTR [rsi+56], r9
-        adc	r10, QWORD PTR [r12+64]
-        mov	rax, QWORD PTR [rsi+72]
-        mov	QWORD PTR [rsi+64], r10
-        adc	rax, QWORD PTR [r12+72]
-        mov	r9, QWORD PTR [rsi+80]
-        mov	QWORD PTR [rsi+72], rax
-        adc	r9, QWORD PTR [r12+80]
-        mov	r10, QWORD PTR [rsi+88]
-        mov	QWORD PTR [rsi+80], r9
-        adc	r10, QWORD PTR [r12+88]
-        mov	rax, QWORD PTR [rsi+96]
-        mov	QWORD PTR [rsi+88], r10
-        adc	rax, QWORD PTR [r12+96]
-        mov	r9, QWORD PTR [rsi+104]
-        mov	QWORD PTR [rsi+96], rax
-        adc	r9, QWORD PTR [r12+104]
-        mov	r10, QWORD PTR [rsi+112]
-        mov	QWORD PTR [rsi+104], r9
-        adc	r10, QWORD PTR [r12+112]
-        mov	rax, QWORD PTR [rsi+120]
-        mov	QWORD PTR [rsi+112], r10
-        adc	rax, QWORD PTR [r12+120]
-        mov	r9, QWORD PTR [rsi+128]
-        mov	QWORD PTR [rsi+120], rax
-        adc	r9, QWORD PTR [r12+128]
-        mov	r10, QWORD PTR [rsi+136]
-        mov	QWORD PTR [rsi+128], r9
-        adc	r10, QWORD PTR [r12+136]
-        mov	rax, QWORD PTR [rsi+144]
-        mov	QWORD PTR [rsi+136], r10
-        adc	rax, QWORD PTR [r12+144]
-        mov	r9, QWORD PTR [rsi+152]
-        mov	QWORD PTR [rsi+144], rax
-        adc	r9, QWORD PTR [r12+152]
-        mov	r10, QWORD PTR [rsi+160]
-        mov	QWORD PTR [rsi+152], r9
-        adc	r10, QWORD PTR [r12+160]
-        mov	rax, QWORD PTR [rsi+168]
-        mov	QWORD PTR [rsi+160], r10
-        adc	rax, QWORD PTR [r12+168]
-        mov	r9, QWORD PTR [rsi+176]
-        mov	QWORD PTR [rsi+168], rax
-        adc	r9, QWORD PTR [r12+176]
-        mov	r10, QWORD PTR [rsi+184]
-        mov	QWORD PTR [rsi+176], r9
-        adc	r10, QWORD PTR [r12+184]
-        mov	QWORD PTR [rsi+184], r10
-        adc	r11, 0
-        ; r[36] = r11, the accumulated top word of the middle term.
-        mov	QWORD PTR [rcx+288], r11
-        ; rsi = &r[24] again; r13 still points at z2.
-        add	rsi, 96
-        ; Add: r[24..36] += z2[0..12] (13 words).
-        mov	rax, QWORD PTR [rsi]
-        add	rax, QWORD PTR [r13]
-        mov	r9, QWORD PTR [rsi+8]
-        mov	QWORD PTR [rsi], rax
-        adc	r9, QWORD PTR [r13+8]
-        mov	r10, QWORD PTR [rsi+16]
-        mov	QWORD PTR [rsi+8], r9
-        adc	r10, QWORD PTR [r13+16]
-        mov	rax, QWORD PTR [rsi+24]
-        mov	QWORD PTR [rsi+16], r10
-        adc	rax, QWORD PTR [r13+24]
-        mov	r9, QWORD PTR [rsi+32]
-        mov	QWORD PTR [rsi+24], rax
-        adc	r9, QWORD PTR [r13+32]
-        mov	r10, QWORD PTR [rsi+40]
-        mov	QWORD PTR [rsi+32], r9
-        adc	r10, QWORD PTR [r13+40]
-        mov	rax, QWORD PTR [rsi+48]
-        mov	QWORD PTR [rsi+40], r10
-        adc	rax, QWORD PTR [r13+48]
-        mov	r9, QWORD PTR [rsi+56]
-        mov	QWORD PTR [rsi+48], rax
-        adc	r9, QWORD PTR [r13+56]
-        mov	r10, QWORD PTR [rsi+64]
-        mov	QWORD PTR [rsi+56], r9
-        adc	r10, QWORD PTR [r13+64]
-        mov	rax, QWORD PTR [rsi+72]
-        mov	QWORD PTR [rsi+64], r10
-        adc	rax, QWORD PTR [r13+72]
-        mov	r9, QWORD PTR [rsi+80]
-        mov	QWORD PTR [rsi+72], rax
-        adc	r9, QWORD PTR [r13+80]
-        mov	r10, QWORD PTR [rsi+88]
-        mov	QWORD PTR [rsi+80], r9
-        adc	r10, QWORD PTR [r13+88]
-        mov	rax, QWORD PTR [rsi+96]
-        mov	QWORD PTR [rsi+88], r10
-        adc	rax, QWORD PTR [r13+96]
-        mov	QWORD PTR [rsi+96], rax
-        ; Add to zero: r[37..47] = z2[13..23] plus the carry still running
-        ; from the chain above. These words are stored without reading r.
-        mov	rax, QWORD PTR [r13+104]
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+112]
-        mov	QWORD PTR [rsi+104], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+120]
-        mov	QWORD PTR [rsi+112], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+128]
-        mov	QWORD PTR [rsi+120], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+136]
-        mov	QWORD PTR [rsi+128], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+144]
-        mov	QWORD PTR [rsi+136], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+152]
-        mov	QWORD PTR [rsi+144], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+160]
-        mov	QWORD PTR [rsi+152], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+168]
-        mov	QWORD PTR [rsi+160], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+176]
-        mov	QWORD PTR [rsi+168], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+184]
-        mov	QWORD PTR [rsi+176], rax
-        adc	r9, 0
-        mov	QWORD PTR [rsi+184], r9
-        ; Release the frame and restore the saved registers in reverse order.
-        add	rsp, 616
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_3072_mul_avx2_24 ENDP
-_text ENDS
-ENDIF
-; /* Sub b from a into a. (a -= b)
-;  *
-;  * Fully unrolled sub/sbb chain over the 48 64-bit words at byte offsets
-;  * 0..376. r8 and r9 alternate as scratch: the load of the next word of a
-;  * is placed before the store of the previous result. mov does not change
-;  * the flags, so the borrow survives from one sbb to the next.
-;  *
-;  * a  (rcx) A single precision integer and result.
-;  * b  (rdx) A single precision integer.
-;  *
-;  * Returns rax = 0 when there is no borrow out of the top word, or all
-;  * ones (-1) when there is. Only rax, r8, r9 and the flags are modified;
-;  * the stack is not used.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_sub_in_place_48 PROC
-        ; Word 0 uses sub (no incoming borrow); every later word uses sbb.
-        mov	r8, QWORD PTR [rcx]
-        sub	r8, QWORD PTR [rdx]
-        mov	r9, QWORD PTR [rcx+8]
-        mov	QWORD PTR [rcx], r8
-        sbb	r9, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [rcx+16]
-        mov	QWORD PTR [rcx+8], r9
-        sbb	r8, QWORD PTR [rdx+16]
-        mov	r9, QWORD PTR [rcx+24]
-        mov	QWORD PTR [rcx+16], r8
-        sbb	r9, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [rcx+32]
-        mov	QWORD PTR [rcx+24], r9
-        sbb	r8, QWORD PTR [rdx+32]
-        mov	r9, QWORD PTR [rcx+40]
-        mov	QWORD PTR [rcx+32], r8
-        sbb	r9, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [rcx+48]
-        mov	QWORD PTR [rcx+40], r9
-        sbb	r8, QWORD PTR [rdx+48]
-        mov	r9, QWORD PTR [rcx+56]
-        mov	QWORD PTR [rcx+48], r8
-        sbb	r9, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [rcx+64]
-        mov	QWORD PTR [rcx+56], r9
-        sbb	r8, QWORD PTR [rdx+64]
-        mov	r9, QWORD PTR [rcx+72]
-        mov	QWORD PTR [rcx+64], r8
-        sbb	r9, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [rcx+80]
-        mov	QWORD PTR [rcx+72], r9
-        sbb	r8, QWORD PTR [rdx+80]
-        mov	r9, QWORD PTR [rcx+88]
-        mov	QWORD PTR [rcx+80], r8
-        sbb	r9, QWORD PTR [rdx+88]
-        mov	r8, QWORD PTR [rcx+96]
-        mov	QWORD PTR [rcx+88], r9
-        sbb	r8, QWORD PTR [rdx+96]
-        mov	r9, QWORD PTR [rcx+104]
-        mov	QWORD PTR [rcx+96], r8
-        sbb	r9, QWORD PTR [rdx+104]
-        mov	r8, QWORD PTR [rcx+112]
-        mov	QWORD PTR [rcx+104], r9
-        sbb	r8, QWORD PTR [rdx+112]
-        mov	r9, QWORD PTR [rcx+120]
-        mov	QWORD PTR [rcx+112], r8
-        sbb	r9, QWORD PTR [rdx+120]
-        mov	r8, QWORD PTR [rcx+128]
-        mov	QWORD PTR [rcx+120], r9
-        sbb	r8, QWORD PTR [rdx+128]
-        mov	r9, QWORD PTR [rcx+136]
-        mov	QWORD PTR [rcx+128], r8
-        sbb	r9, QWORD PTR [rdx+136]
-        mov	r8, QWORD PTR [rcx+144]
-        mov	QWORD PTR [rcx+136], r9
-        sbb	r8, QWORD PTR [rdx+144]
-        mov	r9, QWORD PTR [rcx+152]
-        mov	QWORD PTR [rcx+144], r8
-        sbb	r9, QWORD PTR [rdx+152]
-        mov	r8, QWORD PTR [rcx+160]
-        mov	QWORD PTR [rcx+152], r9
-        sbb	r8, QWORD PTR [rdx+160]
-        mov	r9, QWORD PTR [rcx+168]
-        mov	QWORD PTR [rcx+160], r8
-        sbb	r9, QWORD PTR [rdx+168]
-        mov	r8, QWORD PTR [rcx+176]
-        mov	QWORD PTR [rcx+168], r9
-        sbb	r8, QWORD PTR [rdx+176]
-        mov	r9, QWORD PTR [rcx+184]
-        mov	QWORD PTR [rcx+176], r8
-        sbb	r9, QWORD PTR [rdx+184]
-        mov	r8, QWORD PTR [rcx+192]
-        mov	QWORD PTR [rcx+184], r9
-        sbb	r8, QWORD PTR [rdx+192]
-        mov	r9, QWORD PTR [rcx+200]
-        mov	QWORD PTR [rcx+192], r8
-        sbb	r9, QWORD PTR [rdx+200]
-        mov	r8, QWORD PTR [rcx+208]
-        mov	QWORD PTR [rcx+200], r9
-        sbb	r8, QWORD PTR [rdx+208]
-        mov	r9, QWORD PTR [rcx+216]
-        mov	QWORD PTR [rcx+208], r8
-        sbb	r9, QWORD PTR [rdx+216]
-        mov	r8, QWORD PTR [rcx+224]
-        mov	QWORD PTR [rcx+216], r9
-        sbb	r8, QWORD PTR [rdx+224]
-        mov	r9, QWORD PTR [rcx+232]
-        mov	QWORD PTR [rcx+224], r8
-        sbb	r9, QWORD PTR [rdx+232]
-        mov	r8, QWORD PTR [rcx+240]
-        mov	QWORD PTR [rcx+232], r9
-        sbb	r8, QWORD PTR [rdx+240]
-        mov	r9, QWORD PTR [rcx+248]
-        mov	QWORD PTR [rcx+240], r8
-        sbb	r9, QWORD PTR [rdx+248]
-        mov	r8, QWORD PTR [rcx+256]
-        mov	QWORD PTR [rcx+248], r9
-        sbb	r8, QWORD PTR [rdx+256]
-        mov	r9, QWORD PTR [rcx+264]
-        mov	QWORD PTR [rcx+256], r8
-        sbb	r9, QWORD PTR [rdx+264]
-        mov	r8, QWORD PTR [rcx+272]
-        mov	QWORD PTR [rcx+264], r9
-        sbb	r8, QWORD PTR [rdx+272]
-        mov	r9, QWORD PTR [rcx+280]
-        mov	QWORD PTR [rcx+272], r8
-        sbb	r9, QWORD PTR [rdx+280]
-        mov	r8, QWORD PTR [rcx+288]
-        mov	QWORD PTR [rcx+280], r9
-        sbb	r8, QWORD PTR [rdx+288]
-        mov	r9, QWORD PTR [rcx+296]
-        mov	QWORD PTR [rcx+288], r8
-        sbb	r9, QWORD PTR [rdx+296]
-        mov	r8, QWORD PTR [rcx+304]
-        mov	QWORD PTR [rcx+296], r9
-        sbb	r8, QWORD PTR [rdx+304]
-        mov	r9, QWORD PTR [rcx+312]
-        mov	QWORD PTR [rcx+304], r8
-        sbb	r9, QWORD PTR [rdx+312]
-        mov	r8, QWORD PTR [rcx+320]
-        mov	QWORD PTR [rcx+312], r9
-        sbb	r8, QWORD PTR [rdx+320]
-        mov	r9, QWORD PTR [rcx+328]
-        mov	QWORD PTR [rcx+320], r8
-        sbb	r9, QWORD PTR [rdx+328]
-        mov	r8, QWORD PTR [rcx+336]
-        mov	QWORD PTR [rcx+328], r9
-        sbb	r8, QWORD PTR [rdx+336]
-        mov	r9, QWORD PTR [rcx+344]
-        mov	QWORD PTR [rcx+336], r8
-        sbb	r9, QWORD PTR [rdx+344]
-        mov	r8, QWORD PTR [rcx+352]
-        mov	QWORD PTR [rcx+344], r9
-        sbb	r8, QWORD PTR [rdx+352]
-        mov	r9, QWORD PTR [rcx+360]
-        mov	QWORD PTR [rcx+352], r8
-        sbb	r9, QWORD PTR [rdx+360]
-        mov	r8, QWORD PTR [rcx+368]
-        mov	QWORD PTR [rcx+360], r9
-        sbb	r8, QWORD PTR [rdx+368]
-        mov	r9, QWORD PTR [rcx+376]
-        mov	QWORD PTR [rcx+368], r8
-        sbb	r9, QWORD PTR [rdx+376]
-        mov	QWORD PTR [rcx+376], r9
-        ; rax = rax - rax - CF: 0 if no borrow out, all ones if there was.
-        sbb	rax, rax
-        ret
-sp_3072_sub_in_place_48 ENDP
-_text ENDS
-; /* Add b to a into r. (r = a + b)
-;  *
-;  * Fully unrolled add/adc chain over the 48 64-bit words at byte offsets
-;  * 0..376. r9 and r10 alternate as scratch: the load of the next word of a
-;  * is placed before the store of the previous sum. mov does not change
-;  * the flags, so the carry survives from one adc to the next.
-;  *
-;  * r  (rcx) A single precision integer; receives the 48-word sum.
-;  * a  (rdx) A single precision integer.
-;  * b  (r8)  A single precision integer.
-;  *
-;  * Returns rax = carry out of the top word (0 or 1). Only rax, r9, r10 and
-;  * the flags are modified; the stack is not used.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_add_48 PROC
-        ; Add
-        ; rax is cleared before the first add because xor changes the flags
-        ; and would break the carry chain if it came any later.
-        mov	r9, QWORD PTR [rdx]
-        xor	rax, rax
-        add	r9, QWORD PTR [r8]
-        mov	r10, QWORD PTR [rdx+8]
-        mov	QWORD PTR [rcx], r9
-        adc	r10, QWORD PTR [r8+8]
-        mov	r9, QWORD PTR [rdx+16]
-        mov	QWORD PTR [rcx+8], r10
-        adc	r9, QWORD PTR [r8+16]
-        mov	r10, QWORD PTR [rdx+24]
-        mov	QWORD PTR [rcx+16], r9
-        adc	r10, QWORD PTR [r8+24]
-        mov	r9, QWORD PTR [rdx+32]
-        mov	QWORD PTR [rcx+24], r10
-        adc	r9, QWORD PTR [r8+32]
-        mov	r10, QWORD PTR [rdx+40]
-        mov	QWORD PTR [rcx+32], r9
-        adc	r10, QWORD PTR [r8+40]
-        mov	r9, QWORD PTR [rdx+48]
-        mov	QWORD PTR [rcx+40], r10
-        adc	r9, QWORD PTR [r8+48]
-        mov	r10, QWORD PTR [rdx+56]
-        mov	QWORD PTR [rcx+48], r9
-        adc	r10, QWORD PTR [r8+56]
-        mov	r9, QWORD PTR [rdx+64]
-        mov	QWORD PTR [rcx+56], r10
-        adc	r9, QWORD PTR [r8+64]
-        mov	r10, QWORD PTR [rdx+72]
-        mov	QWORD PTR [rcx+64], r9
-        adc	r10, QWORD PTR [r8+72]
-        mov	r9, QWORD PTR [rdx+80]
-        mov	QWORD PTR [rcx+72], r10
-        adc	r9, QWORD PTR [r8+80]
-        mov	r10, QWORD PTR [rdx+88]
-        mov	QWORD PTR [rcx+80], r9
-        adc	r10, QWORD PTR [r8+88]
-        mov	r9, QWORD PTR [rdx+96]
-        mov	QWORD PTR [rcx+88], r10
-        adc	r9, QWORD PTR [r8+96]
-        mov	r10, QWORD PTR [rdx+104]
-        mov	QWORD PTR [rcx+96], r9
-        adc	r10, QWORD PTR [r8+104]
-        mov	r9, QWORD PTR [rdx+112]
-        mov	QWORD PTR [rcx+104], r10
-        adc	r9, QWORD PTR [r8+112]
-        mov	r10, QWORD PTR [rdx+120]
-        mov	QWORD PTR [rcx+112], r9
-        adc	r10, QWORD PTR [r8+120]
-        mov	r9, QWORD PTR [rdx+128]
-        mov	QWORD PTR [rcx+120], r10
-        adc	r9, QWORD PTR [r8+128]
-        mov	r10, QWORD PTR [rdx+136]
-        mov	QWORD PTR [rcx+128], r9
-        adc	r10, QWORD PTR [r8+136]
-        mov	r9, QWORD PTR [rdx+144]
-        mov	QWORD PTR [rcx+136], r10
-        adc	r9, QWORD PTR [r8+144]
-        mov	r10, QWORD PTR [rdx+152]
-        mov	QWORD PTR [rcx+144], r9
-        adc	r10, QWORD PTR [r8+152]
-        mov	r9, QWORD PTR [rdx+160]
-        mov	QWORD PTR [rcx+152], r10
-        adc	r9, QWORD PTR [r8+160]
-        mov	r10, QWORD PTR [rdx+168]
-        mov	QWORD PTR [rcx+160], r9
-        adc	r10, QWORD PTR [r8+168]
-        mov	r9, QWORD PTR [rdx+176]
-        mov	QWORD PTR [rcx+168], r10
-        adc	r9, QWORD PTR [r8+176]
-        mov	r10, QWORD PTR [rdx+184]
-        mov	QWORD PTR [rcx+176], r9
-        adc	r10, QWORD PTR [r8+184]
-        mov	r9, QWORD PTR [rdx+192]
-        mov	QWORD PTR [rcx+184], r10
-        adc	r9, QWORD PTR [r8+192]
-        mov	r10, QWORD PTR [rdx+200]
-        mov	QWORD PTR [rcx+192], r9
-        adc	r10, QWORD PTR [r8+200]
-        mov	r9, QWORD PTR [rdx+208]
-        mov	QWORD PTR [rcx+200], r10
-        adc	r9, QWORD PTR [r8+208]
-        mov	r10, QWORD PTR [rdx+216]
-        mov	QWORD PTR [rcx+208], r9
-        adc	r10, QWORD PTR [r8+216]
-        mov	r9, QWORD PTR [rdx+224]
-        mov	QWORD PTR [rcx+216], r10
-        adc	r9, QWORD PTR [r8+224]
-        mov	r10, QWORD PTR [rdx+232]
-        mov	QWORD PTR [rcx+224], r9
-        adc	r10, QWORD PTR [r8+232]
-        mov	r9, QWORD PTR [rdx+240]
-        mov	QWORD PTR [rcx+232], r10
-        adc	r9, QWORD PTR [r8+240]
-        mov	r10, QWORD PTR [rdx+248]
-        mov	QWORD PTR [rcx+240], r9
-        adc	r10, QWORD PTR [r8+248]
-        mov	r9, QWORD PTR [rdx+256]
-        mov	QWORD PTR [rcx+248], r10
-        adc	r9, QWORD PTR [r8+256]
-        mov	r10, QWORD PTR [rdx+264]
-        mov	QWORD PTR [rcx+256], r9
-        adc	r10, QWORD PTR [r8+264]
-        mov	r9, QWORD PTR [rdx+272]
-        mov	QWORD PTR [rcx+264], r10
-        adc	r9, QWORD PTR [r8+272]
-        mov	r10, QWORD PTR [rdx+280]
-        mov	QWORD PTR [rcx+272], r9
-        adc	r10, QWORD PTR [r8+280]
-        mov	r9, QWORD PTR [rdx+288]
-        mov	QWORD PTR [rcx+280], r10
-        adc	r9, QWORD PTR [r8+288]
-        mov	r10, QWORD PTR [rdx+296]
-        mov	QWORD PTR [rcx+288], r9
-        adc	r10, QWORD PTR [r8+296]
-        mov	r9, QWORD PTR [rdx+304]
-        mov	QWORD PTR [rcx+296], r10
-        adc	r9, QWORD PTR [r8+304]
-        mov	r10, QWORD PTR [rdx+312]
-        mov	QWORD PTR [rcx+304], r9
-        adc	r10, QWORD PTR [r8+312]
-        mov	r9, QWORD PTR [rdx+320]
-        mov	QWORD PTR [rcx+312], r10
-        adc	r9, QWORD PTR [r8+320]
-        mov	r10, QWORD PTR [rdx+328]
-        mov	QWORD PTR [rcx+320], r9
-        adc	r10, QWORD PTR [r8+328]
-        mov	r9, QWORD PTR [rdx+336]
-        mov	QWORD PTR [rcx+328], r10
-        adc	r9, QWORD PTR [r8+336]
-        mov	r10, QWORD PTR [rdx+344]
-        mov	QWORD PTR [rcx+336], r9
-        adc	r10, QWORD PTR [r8+344]
-        mov	r9, QWORD PTR [rdx+352]
-        mov	QWORD PTR [rcx+344], r10
-        adc	r9, QWORD PTR [r8+352]
-        mov	r10, QWORD PTR [rdx+360]
-        mov	QWORD PTR [rcx+352], r9
-        adc	r10, QWORD PTR [r8+360]
-        mov	r9, QWORD PTR [rdx+368]
-        mov	QWORD PTR [rcx+360], r10
-        adc	r9, QWORD PTR [r8+368]
-        mov	r10, QWORD PTR [rdx+376]
-        mov	QWORD PTR [rcx+368], r9
-        adc	r10, QWORD PTR [r8+376]
-        mov	QWORD PTR [rcx+376], r10
-        ; rax was zeroed above, so this leaves the final carry (0 or 1) in rax.
-        adc	rax, 0
-        ret
-sp_3072_add_48 ENDP
-_text ENDS
-; /* Multiply a and b into r. (r = a * b)
-;  *
-;  * r  A single precision integer.
-;  * a  A single precision integer.
-;  * b  A single precision integer.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_mul_48 PROC    ; in: rcx = buffer that receives the result, rdx and r8 = the two source operands (roles as used below)
-        push	r12    ; r12-r15, rdi and rsi are saved here and restored just before ret
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        sub	rsp, 1192    ; scratch: [0,384) and [384,768) product buffers, [768,960) and [960,1152) half sums, [1152,1192) spill slots
-        mov	QWORD PTR [rsp+1152], rcx    ; spill result pointer
-        mov	QWORD PTR [rsp+1160], rdx    ; spill first operand pointer
-        mov	QWORD PTR [rsp+1168], r8    ; spill second operand pointer
-        lea	r12, QWORD PTR [rsp+768]    ; r12 = buffer for the first operand's half sum
-        lea	r14, QWORD PTR [rdx+192]    ; r14 = first operand + 192 bytes (its qwords 24..47)
-        ; Add: [r12+0..184] = [rdx+0..184] + [r14+0..184]; the interleaved mov's leave CF intact for the adc chain
-        mov	rax, QWORD PTR [rdx]
-        xor	r15, r15    ; zero r15 before the chain starts (xor would clobber CF if done later)
-        add	rax, QWORD PTR [r14]
-        mov	r9, QWORD PTR [rdx+8]
-        mov	QWORD PTR [r12], rax
-        adc	r9, QWORD PTR [r14+8]
-        mov	r10, QWORD PTR [rdx+16]
-        mov	QWORD PTR [r12+8], r9
-        adc	r10, QWORD PTR [r14+16]
-        mov	rax, QWORD PTR [rdx+24]
-        mov	QWORD PTR [r12+16], r10
-        adc	rax, QWORD PTR [r14+24]
-        mov	r9, QWORD PTR [rdx+32]
-        mov	QWORD PTR [r12+24], rax
-        adc	r9, QWORD PTR [r14+32]
-        mov	r10, QWORD PTR [rdx+40]
-        mov	QWORD PTR [r12+32], r9
-        adc	r10, QWORD PTR [r14+40]
-        mov	rax, QWORD PTR [rdx+48]
-        mov	QWORD PTR [r12+40], r10
-        adc	rax, QWORD PTR [r14+48]
-        mov	r9, QWORD PTR [rdx+56]
-        mov	QWORD PTR [r12+48], rax
-        adc	r9, QWORD PTR [r14+56]
-        mov	r10, QWORD PTR [rdx+64]
-        mov	QWORD PTR [r12+56], r9
-        adc	r10, QWORD PTR [r14+64]
-        mov	rax, QWORD PTR [rdx+72]
-        mov	QWORD PTR [r12+64], r10
-        adc	rax, QWORD PTR [r14+72]
-        mov	r9, QWORD PTR [rdx+80]
-        mov	QWORD PTR [r12+72], rax
-        adc	r9, QWORD PTR [r14+80]
-        mov	r10, QWORD PTR [rdx+88]
-        mov	QWORD PTR [r12+80], r9
-        adc	r10, QWORD PTR [r14+88]
-        mov	rax, QWORD PTR [rdx+96]
-        mov	QWORD PTR [r12+88], r10
-        adc	rax, QWORD PTR [r14+96]
-        mov	r9, QWORD PTR [rdx+104]
-        mov	QWORD PTR [r12+96], rax
-        adc	r9, QWORD PTR [r14+104]
-        mov	r10, QWORD PTR [rdx+112]
-        mov	QWORD PTR [r12+104], r9
-        adc	r10, QWORD PTR [r14+112]
-        mov	rax, QWORD PTR [rdx+120]
-        mov	QWORD PTR [r12+112], r10
-        adc	rax, QWORD PTR [r14+120]
-        mov	r9, QWORD PTR [rdx+128]
-        mov	QWORD PTR [r12+120], rax
-        adc	r9, QWORD PTR [r14+128]
-        mov	r10, QWORD PTR [rdx+136]
-        mov	QWORD PTR [r12+128], r9
-        adc	r10, QWORD PTR [r14+136]
-        mov	rax, QWORD PTR [rdx+144]
-        mov	QWORD PTR [r12+136], r10
-        adc	rax, QWORD PTR [r14+144]
-        mov	r9, QWORD PTR [rdx+152]
-        mov	QWORD PTR [r12+144], rax
-        adc	r9, QWORD PTR [r14+152]
-        mov	r10, QWORD PTR [rdx+160]
-        mov	QWORD PTR [r12+152], r9
-        adc	r10, QWORD PTR [r14+160]
-        mov	rax, QWORD PTR [rdx+168]
-        mov	QWORD PTR [r12+160], r10
-        adc	rax, QWORD PTR [r14+168]
-        mov	r9, QWORD PTR [rdx+176]
-        mov	QWORD PTR [r12+168], rax
-        adc	r9, QWORD PTR [r14+176]
-        mov	r10, QWORD PTR [rdx+184]
-        mov	QWORD PTR [r12+176], r9
-        adc	r10, QWORD PTR [r14+184]
-        mov	QWORD PTR [r12+184], r10
-        adc	r15, 0    ; r15 = carry out of the 24-qword sum (0 or 1)
-        mov	QWORD PTR [rsp+1176], r15    ; keep that carry across the calls below
-        lea	r13, QWORD PTR [rsp+960]    ; r13 = buffer for the second operand's half sum
-        lea	r14, QWORD PTR [r8+192]    ; r14 = second operand + 192 bytes (its qwords 24..47)
-        ; Add: [r13+0..184] = [r8+0..184] + [r14+0..184], same carry-chain layout as above
-        mov	rax, QWORD PTR [r8]
-        xor	rdi, rdi    ; zero rdi before the chain starts
-        add	rax, QWORD PTR [r14]
-        mov	r9, QWORD PTR [r8+8]
-        mov	QWORD PTR [r13], rax
-        adc	r9, QWORD PTR [r14+8]
-        mov	r10, QWORD PTR [r8+16]
-        mov	QWORD PTR [r13+8], r9
-        adc	r10, QWORD PTR [r14+16]
-        mov	rax, QWORD PTR [r8+24]
-        mov	QWORD PTR [r13+16], r10
-        adc	rax, QWORD PTR [r14+24]
-        mov	r9, QWORD PTR [r8+32]
-        mov	QWORD PTR [r13+24], rax
-        adc	r9, QWORD PTR [r14+32]
-        mov	r10, QWORD PTR [r8+40]
-        mov	QWORD PTR [r13+32], r9
-        adc	r10, QWORD PTR [r14+40]
-        mov	rax, QWORD PTR [r8+48]
-        mov	QWORD PTR [r13+40], r10
-        adc	rax, QWORD PTR [r14+48]
-        mov	r9, QWORD PTR [r8+56]
-        mov	QWORD PTR [r13+48], rax
-        adc	r9, QWORD PTR [r14+56]
-        mov	r10, QWORD PTR [r8+64]
-        mov	QWORD PTR [r13+56], r9
-        adc	r10, QWORD PTR [r14+64]
-        mov	rax, QWORD PTR [r8+72]
-        mov	QWORD PTR [r13+64], r10
-        adc	rax, QWORD PTR [r14+72]
-        mov	r9, QWORD PTR [r8+80]
-        mov	QWORD PTR [r13+72], rax
-        adc	r9, QWORD PTR [r14+80]
-        mov	r10, QWORD PTR [r8+88]
-        mov	QWORD PTR [r13+80], r9
-        adc	r10, QWORD PTR [r14+88]
-        mov	rax, QWORD PTR [r8+96]
-        mov	QWORD PTR [r13+88], r10
-        adc	rax, QWORD PTR [r14+96]
-        mov	r9, QWORD PTR [r8+104]
-        mov	QWORD PTR [r13+96], rax
-        adc	r9, QWORD PTR [r14+104]
-        mov	r10, QWORD PTR [r8+112]
-        mov	QWORD PTR [r13+104], r9
-        adc	r10, QWORD PTR [r14+112]
-        mov	rax, QWORD PTR [r8+120]
-        mov	QWORD PTR [r13+112], r10
-        adc	rax, QWORD PTR [r14+120]
-        mov	r9, QWORD PTR [r8+128]
-        mov	QWORD PTR [r13+120], rax
-        adc	r9, QWORD PTR [r14+128]
-        mov	r10, QWORD PTR [r8+136]
-        mov	QWORD PTR [r13+128], r9
-        adc	r10, QWORD PTR [r14+136]
-        mov	rax, QWORD PTR [r8+144]
-        mov	QWORD PTR [r13+136], r10
-        adc	rax, QWORD PTR [r14+144]
-        mov	r9, QWORD PTR [r8+152]
-        mov	QWORD PTR [r13+144], rax
-        adc	r9, QWORD PTR [r14+152]
-        mov	r10, QWORD PTR [r8+160]
-        mov	QWORD PTR [r13+152], r9
-        adc	r10, QWORD PTR [r14+160]
-        mov	rax, QWORD PTR [r8+168]
-        mov	QWORD PTR [r13+160], r10
-        adc	rax, QWORD PTR [r14+168]
-        mov	r9, QWORD PTR [r8+176]
-        mov	QWORD PTR [r13+168], rax
-        adc	r9, QWORD PTR [r14+176]
-        mov	r10, QWORD PTR [r8+184]
-        mov	QWORD PTR [r13+176], r9
-        adc	r10, QWORD PTR [r14+184]
-        mov	QWORD PTR [r13+184], r10
-        adc	rdi, 0    ; rdi = carry out of this 24-qword sum (0 or 1)
-        mov	QWORD PTR [rsp+1184], rdi    ; keep that carry across the calls below
-        mov	r8, r13    ; call 1 arguments: rcx = rsp, rdx = first half sum, r8 = second half sum
-        mov	rdx, r12
-        mov	rcx, rsp
-        call	sp_3072_mul_24    ; helper defined outside this block; presumably writes the product of the two sums to [rsp, rsp+384) - TODO confirm
-        mov	r8, QWORD PTR [rsp+1168]    ; call 2 arguments: rcx = rsp+384, rdx = first operand + 192, r8 = second operand + 192
-        mov	rdx, QWORD PTR [rsp+1160]
-        lea	rcx, QWORD PTR [rsp+384]
-        add	r8, 192
-        add	rdx, 192
-        call	sp_3072_mul_24
-        mov	r8, QWORD PTR [rsp+1168]    ; call 3 arguments: the original rcx, rdx and r8
-        mov	rdx, QWORD PTR [rsp+1160]
-        mov	rcx, QWORD PTR [rsp+1152]
-        call	sp_3072_mul_24
-IFDEF _WIN64
-        mov	r8, QWORD PTR [rsp+1168]    ; reload the argument registers from their spill slots after the call
-        mov	rdx, QWORD PTR [rsp+1160]
-        mov	rcx, QWORD PTR [rsp+1152]    ; rcx is dereferenced again below; NOTE(review): this reload is only assembled when _WIN64 is defined - confirm the build defines it or that sp_3072_mul_24 preserves rcx
-ENDIF
-        mov	r15, QWORD PTR [rsp+1176]    ; r15 = carry of the first operand's half sum
-        mov	rdi, QWORD PTR [rsp+1184]    ; rdi = carry of the second operand's half sum
-        mov	rsi, QWORD PTR [rsp+1152]    ; rsi = result pointer
-        mov	r11, r15
-        lea	r12, QWORD PTR [rsp+768]    ; r12 = first half sum again
-        lea	r13, QWORD PTR [rsp+960]    ; r13 = second half sum again
-        and	r11, rdi    ; r11 = 1 only when both half sums carried
-        neg	r15    ; turn the 0/1 carry into a 0/all-ones mask
-        neg	rdi    ; likewise for the second carry
-        add	rsi, 384    ; rsi = result + 384
-        mov	rax, QWORD PTR [r12]    ; unrolled masking for offsets 0..184: [r12+i] &= rdi, [r13+i] &= r15 (each sum survives only if the other sum carried)
-        mov	r9, QWORD PTR [r13]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12], rax
-        mov	QWORD PTR [r13], r9
-        mov	rax, QWORD PTR [r12+8]
-        mov	r9, QWORD PTR [r13+8]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+8], rax
-        mov	QWORD PTR [r13+8], r9
-        mov	rax, QWORD PTR [r12+16]
-        mov	r9, QWORD PTR [r13+16]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+16], rax
-        mov	QWORD PTR [r13+16], r9
-        mov	rax, QWORD PTR [r12+24]
-        mov	r9, QWORD PTR [r13+24]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+24], rax
-        mov	QWORD PTR [r13+24], r9
-        mov	rax, QWORD PTR [r12+32]
-        mov	r9, QWORD PTR [r13+32]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+32], rax
-        mov	QWORD PTR [r13+32], r9
-        mov	rax, QWORD PTR [r12+40]
-        mov	r9, QWORD PTR [r13+40]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+40], rax
-        mov	QWORD PTR [r13+40], r9
-        mov	rax, QWORD PTR [r12+48]
-        mov	r9, QWORD PTR [r13+48]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+48], rax
-        mov	QWORD PTR [r13+48], r9
-        mov	rax, QWORD PTR [r12+56]
-        mov	r9, QWORD PTR [r13+56]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+56], rax
-        mov	QWORD PTR [r13+56], r9
-        mov	rax, QWORD PTR [r12+64]
-        mov	r9, QWORD PTR [r13+64]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+64], rax
-        mov	QWORD PTR [r13+64], r9
-        mov	rax, QWORD PTR [r12+72]
-        mov	r9, QWORD PTR [r13+72]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+72], rax
-        mov	QWORD PTR [r13+72], r9
-        mov	rax, QWORD PTR [r12+80]
-        mov	r9, QWORD PTR [r13+80]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+80], rax
-        mov	QWORD PTR [r13+80], r9
-        mov	rax, QWORD PTR [r12+88]
-        mov	r9, QWORD PTR [r13+88]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+88], rax
-        mov	QWORD PTR [r13+88], r9
-        mov	rax, QWORD PTR [r12+96]
-        mov	r9, QWORD PTR [r13+96]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+96], rax
-        mov	QWORD PTR [r13+96], r9
-        mov	rax, QWORD PTR [r12+104]
-        mov	r9, QWORD PTR [r13+104]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+104], rax
-        mov	QWORD PTR [r13+104], r9
-        mov	rax, QWORD PTR [r12+112]
-        mov	r9, QWORD PTR [r13+112]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+112], rax
-        mov	QWORD PTR [r13+112], r9
-        mov	rax, QWORD PTR [r12+120]
-        mov	r9, QWORD PTR [r13+120]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+120], rax
-        mov	QWORD PTR [r13+120], r9
-        mov	rax, QWORD PTR [r12+128]
-        mov	r9, QWORD PTR [r13+128]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+128], rax
-        mov	QWORD PTR [r13+128], r9
-        mov	rax, QWORD PTR [r12+136]
-        mov	r9, QWORD PTR [r13+136]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+136], rax
-        mov	QWORD PTR [r13+136], r9
-        mov	rax, QWORD PTR [r12+144]
-        mov	r9, QWORD PTR [r13+144]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+144], rax
-        mov	QWORD PTR [r13+144], r9
-        mov	rax, QWORD PTR [r12+152]
-        mov	r9, QWORD PTR [r13+152]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+152], rax
-        mov	QWORD PTR [r13+152], r9
-        mov	rax, QWORD PTR [r12+160]
-        mov	r9, QWORD PTR [r13+160]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+160], rax
-        mov	QWORD PTR [r13+160], r9
-        mov	rax, QWORD PTR [r12+168]
-        mov	r9, QWORD PTR [r13+168]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+168], rax
-        mov	QWORD PTR [r13+168], r9
-        mov	rax, QWORD PTR [r12+176]
-        mov	r9, QWORD PTR [r13+176]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+176], rax
-        mov	QWORD PTR [r13+176], r9
-        mov	rax, QWORD PTR [r12+184]
-        mov	r9, QWORD PTR [r13+184]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+184], rax
-        mov	QWORD PTR [r13+184], r9
-        mov	rax, QWORD PTR [r12]    ; [rsi+0..184] = masked [r12+0..184] + masked [r13+0..184]
-        add	rax, QWORD PTR [r13]
-        mov	r9, QWORD PTR [r12+8]
-        mov	QWORD PTR [rsi], rax
-        adc	r9, QWORD PTR [r13+8]
-        mov	r10, QWORD PTR [r12+16]
-        mov	QWORD PTR [rsi+8], r9
-        adc	r10, QWORD PTR [r13+16]
-        mov	rax, QWORD PTR [r12+24]
-        mov	QWORD PTR [rsi+16], r10
-        adc	rax, QWORD PTR [r13+24]
-        mov	r9, QWORD PTR [r12+32]
-        mov	QWORD PTR [rsi+24], rax
-        adc	r9, QWORD PTR [r13+32]
-        mov	r10, QWORD PTR [r12+40]
-        mov	QWORD PTR [rsi+32], r9
-        adc	r10, QWORD PTR [r13+40]
-        mov	rax, QWORD PTR [r12+48]
-        mov	QWORD PTR [rsi+40], r10
-        adc	rax, QWORD PTR [r13+48]
-        mov	r9, QWORD PTR [r12+56]
-        mov	QWORD PTR [rsi+48], rax
-        adc	r9, QWORD PTR [r13+56]
-        mov	r10, QWORD PTR [r12+64]
-        mov	QWORD PTR [rsi+56], r9
-        adc	r10, QWORD PTR [r13+64]
-        mov	rax, QWORD PTR [r12+72]
-        mov	QWORD PTR [rsi+64], r10
-        adc	rax, QWORD PTR [r13+72]
-        mov	r9, QWORD PTR [r12+80]
-        mov	QWORD PTR [rsi+72], rax
-        adc	r9, QWORD PTR [r13+80]
-        mov	r10, QWORD PTR [r12+88]
-        mov	QWORD PTR [rsi+80], r9
-        adc	r10, QWORD PTR [r13+88]
-        mov	rax, QWORD PTR [r12+96]
-        mov	QWORD PTR [rsi+88], r10
-        adc	rax, QWORD PTR [r13+96]
-        mov	r9, QWORD PTR [r12+104]
-        mov	QWORD PTR [rsi+96], rax
-        adc	r9, QWORD PTR [r13+104]
-        mov	r10, QWORD PTR [r12+112]
-        mov	QWORD PTR [rsi+104], r9
-        adc	r10, QWORD PTR [r13+112]
-        mov	rax, QWORD PTR [r12+120]
-        mov	QWORD PTR [rsi+112], r10
-        adc	rax, QWORD PTR [r13+120]
-        mov	r9, QWORD PTR [r12+128]
-        mov	QWORD PTR [rsi+120], rax
-        adc	r9, QWORD PTR [r13+128]
-        mov	r10, QWORD PTR [r12+136]
-        mov	QWORD PTR [rsi+128], r9
-        adc	r10, QWORD PTR [r13+136]
-        mov	rax, QWORD PTR [r12+144]
-        mov	QWORD PTR [rsi+136], r10
-        adc	rax, QWORD PTR [r13+144]
-        mov	r9, QWORD PTR [r12+152]
-        mov	QWORD PTR [rsi+144], rax
-        adc	r9, QWORD PTR [r13+152]
-        mov	r10, QWORD PTR [r12+160]
-        mov	QWORD PTR [rsi+152], r9
-        adc	r10, QWORD PTR [r13+160]
-        mov	rax, QWORD PTR [r12+168]
-        mov	QWORD PTR [rsi+160], r10
-        adc	rax, QWORD PTR [r13+168]
-        mov	r9, QWORD PTR [r12+176]
-        mov	QWORD PTR [rsi+168], rax
-        adc	r9, QWORD PTR [r13+176]
-        mov	r10, QWORD PTR [r12+184]
-        mov	QWORD PTR [rsi+176], r9
-        adc	r10, QWORD PTR [r13+184]
-        mov	QWORD PTR [rsi+184], r10
-        adc	r11, 0    ; r11 += carry out of the sum above
-        lea	r13, QWORD PTR [rsp+384]    ; r13 = buffer written by call 2
-        mov	r12, rsp    ; r12 = buffer written by call 1
-        mov	rax, QWORD PTR [r12]    ; [r12+0..376] -= [r13+0..376] (48 qwords, borrow chained through sbb)
-        sub	rax, QWORD PTR [r13]
-        mov	r9, QWORD PTR [r12+8]
-        mov	QWORD PTR [r12], rax
-        sbb	r9, QWORD PTR [r13+8]
-        mov	r10, QWORD PTR [r12+16]
-        mov	QWORD PTR [r12+8], r9
-        sbb	r10, QWORD PTR [r13+16]
-        mov	rax, QWORD PTR [r12+24]
-        mov	QWORD PTR [r12+16], r10
-        sbb	rax, QWORD PTR [r13+24]
-        mov	r9, QWORD PTR [r12+32]
-        mov	QWORD PTR [r12+24], rax
-        sbb	r9, QWORD PTR [r13+32]
-        mov	r10, QWORD PTR [r12+40]
-        mov	QWORD PTR [r12+32], r9
-        sbb	r10, QWORD PTR [r13+40]
-        mov	rax, QWORD PTR [r12+48]
-        mov	QWORD PTR [r12+40], r10
-        sbb	rax, QWORD PTR [r13+48]
-        mov	r9, QWORD PTR [r12+56]
-        mov	QWORD PTR [r12+48], rax
-        sbb	r9, QWORD PTR [r13+56]
-        mov	r10, QWORD PTR [r12+64]
-        mov	QWORD PTR [r12+56], r9
-        sbb	r10, QWORD PTR [r13+64]
-        mov	rax, QWORD PTR [r12+72]
-        mov	QWORD PTR [r12+64], r10
-        sbb	rax, QWORD PTR [r13+72]
-        mov	r9, QWORD PTR [r12+80]
-        mov	QWORD PTR [r12+72], rax
-        sbb	r9, QWORD PTR [r13+80]
-        mov	r10, QWORD PTR [r12+88]
-        mov	QWORD PTR [r12+80], r9
-        sbb	r10, QWORD PTR [r13+88]
-        mov	rax, QWORD PTR [r12+96]
-        mov	QWORD PTR [r12+88], r10
-        sbb	rax, QWORD PTR [r13+96]
-        mov	r9, QWORD PTR [r12+104]
-        mov	QWORD PTR [r12+96], rax
-        sbb	r9, QWORD PTR [r13+104]
-        mov	r10, QWORD PTR [r12+112]
-        mov	QWORD PTR [r12+104], r9
-        sbb	r10, QWORD PTR [r13+112]
-        mov	rax, QWORD PTR [r12+120]
-        mov	QWORD PTR [r12+112], r10
-        sbb	rax, QWORD PTR [r13+120]
-        mov	r9, QWORD PTR [r12+128]
-        mov	QWORD PTR [r12+120], rax
-        sbb	r9, QWORD PTR [r13+128]
-        mov	r10, QWORD PTR [r12+136]
-        mov	QWORD PTR [r12+128], r9
-        sbb	r10, QWORD PTR [r13+136]
-        mov	rax, QWORD PTR [r12+144]
-        mov	QWORD PTR [r12+136], r10
-        sbb	rax, QWORD PTR [r13+144]
-        mov	r9, QWORD PTR [r12+152]
-        mov	QWORD PTR [r12+144], rax
-        sbb	r9, QWORD PTR [r13+152]
-        mov	r10, QWORD PTR [r12+160]
-        mov	QWORD PTR [r12+152], r9
-        sbb	r10, QWORD PTR [r13+160]
-        mov	rax, QWORD PTR [r12+168]
-        mov	QWORD PTR [r12+160], r10
-        sbb	rax, QWORD PTR [r13+168]
-        mov	r9, QWORD PTR [r12+176]
-        mov	QWORD PTR [r12+168], rax
-        sbb	r9, QWORD PTR [r13+176]
-        mov	r10, QWORD PTR [r12+184]
-        mov	QWORD PTR [r12+176], r9
-        sbb	r10, QWORD PTR [r13+184]
-        mov	rax, QWORD PTR [r12+192]
-        mov	QWORD PTR [r12+184], r10
-        sbb	rax, QWORD PTR [r13+192]
-        mov	r9, QWORD PTR [r12+200]
-        mov	QWORD PTR [r12+192], rax
-        sbb	r9, QWORD PTR [r13+200]
-        mov	r10, QWORD PTR [r12+208]
-        mov	QWORD PTR [r12+200], r9
-        sbb	r10, QWORD PTR [r13+208]
-        mov	rax, QWORD PTR [r12+216]
-        mov	QWORD PTR [r12+208], r10
-        sbb	rax, QWORD PTR [r13+216]
-        mov	r9, QWORD PTR [r12+224]
-        mov	QWORD PTR [r12+216], rax
-        sbb	r9, QWORD PTR [r13+224]
-        mov	r10, QWORD PTR [r12+232]
-        mov	QWORD PTR [r12+224], r9
-        sbb	r10, QWORD PTR [r13+232]
-        mov	rax, QWORD PTR [r12+240]
-        mov	QWORD PTR [r12+232], r10
-        sbb	rax, QWORD PTR [r13+240]
-        mov	r9, QWORD PTR [r12+248]
-        mov	QWORD PTR [r12+240], rax
-        sbb	r9, QWORD PTR [r13+248]
-        mov	r10, QWORD PTR [r12+256]
-        mov	QWORD PTR [r12+248], r9
-        sbb	r10, QWORD PTR [r13+256]
-        mov	rax, QWORD PTR [r12+264]
-        mov	QWORD PTR [r12+256], r10
-        sbb	rax, QWORD PTR [r13+264]
-        mov	r9, QWORD PTR [r12+272]
-        mov	QWORD PTR [r12+264], rax
-        sbb	r9, QWORD PTR [r13+272]
-        mov	r10, QWORD PTR [r12+280]
-        mov	QWORD PTR [r12+272], r9
-        sbb	r10, QWORD PTR [r13+280]
-        mov	rax, QWORD PTR [r12+288]
-        mov	QWORD PTR [r12+280], r10
-        sbb	rax, QWORD PTR [r13+288]
-        mov	r9, QWORD PTR [r12+296]
-        mov	QWORD PTR [r12+288], rax
-        sbb	r9, QWORD PTR [r13+296]
-        mov	r10, QWORD PTR [r12+304]
-        mov	QWORD PTR [r12+296], r9
-        sbb	r10, QWORD PTR [r13+304]
-        mov	rax, QWORD PTR [r12+312]
-        mov	QWORD PTR [r12+304], r10
-        sbb	rax, QWORD PTR [r13+312]
-        mov	r9, QWORD PTR [r12+320]
-        mov	QWORD PTR [r12+312], rax
-        sbb	r9, QWORD PTR [r13+320]
-        mov	r10, QWORD PTR [r12+328]
-        mov	QWORD PTR [r12+320], r9
-        sbb	r10, QWORD PTR [r13+328]
-        mov	rax, QWORD PTR [r12+336]
-        mov	QWORD PTR [r12+328], r10
-        sbb	rax, QWORD PTR [r13+336]
-        mov	r9, QWORD PTR [r12+344]
-        mov	QWORD PTR [r12+336], rax
-        sbb	r9, QWORD PTR [r13+344]
-        mov	r10, QWORD PTR [r12+352]
-        mov	QWORD PTR [r12+344], r9
-        sbb	r10, QWORD PTR [r13+352]
-        mov	rax, QWORD PTR [r12+360]
-        mov	QWORD PTR [r12+352], r10
-        sbb	rax, QWORD PTR [r13+360]
-        mov	r9, QWORD PTR [r12+368]
-        mov	QWORD PTR [r12+360], rax
-        sbb	r9, QWORD PTR [r13+368]
-        mov	r10, QWORD PTR [r12+376]
-        mov	QWORD PTR [r12+368], r9
-        sbb	r10, QWORD PTR [r13+376]
-        mov	QWORD PTR [r12+376], r10
-        sbb	r11, 0    ; r11 -= borrow out of the subtraction above
-        mov	rax, QWORD PTR [r12]    ; [r12+0..376] -= [rcx+0..376] (48 qwords read through rcx)
-        sub	rax, QWORD PTR [rcx]
-        mov	r9, QWORD PTR [r12+8]
-        mov	QWORD PTR [r12], rax
-        sbb	r9, QWORD PTR [rcx+8]
-        mov	r10, QWORD PTR [r12+16]
-        mov	QWORD PTR [r12+8], r9
-        sbb	r10, QWORD PTR [rcx+16]
-        mov	rax, QWORD PTR [r12+24]
-        mov	QWORD PTR [r12+16], r10
-        sbb	rax, QWORD PTR [rcx+24]
-        mov	r9, QWORD PTR [r12+32]
-        mov	QWORD PTR [r12+24], rax
-        sbb	r9, QWORD PTR [rcx+32]
-        mov	r10, QWORD PTR [r12+40]
-        mov	QWORD PTR [r12+32], r9
-        sbb	r10, QWORD PTR [rcx+40]
-        mov	rax, QWORD PTR [r12+48]
-        mov	QWORD PTR [r12+40], r10
-        sbb	rax, QWORD PTR [rcx+48]
-        mov	r9, QWORD PTR [r12+56]
-        mov	QWORD PTR [r12+48], rax
-        sbb	r9, QWORD PTR [rcx+56]
-        mov	r10, QWORD PTR [r12+64]
-        mov	QWORD PTR [r12+56], r9
-        sbb	r10, QWORD PTR [rcx+64]
-        mov	rax, QWORD PTR [r12+72]
-        mov	QWORD PTR [r12+64], r10
-        sbb	rax, QWORD PTR [rcx+72]
-        mov	r9, QWORD PTR [r12+80]
-        mov	QWORD PTR [r12+72], rax
-        sbb	r9, QWORD PTR [rcx+80]
-        mov	r10, QWORD PTR [r12+88]
-        mov	QWORD PTR [r12+80], r9
-        sbb	r10, QWORD PTR [rcx+88]
-        mov	rax, QWORD PTR [r12+96]
-        mov	QWORD PTR [r12+88], r10
-        sbb	rax, QWORD PTR [rcx+96]
-        mov	r9, QWORD PTR [r12+104]
-        mov	QWORD PTR [r12+96], rax
-        sbb	r9, QWORD PTR [rcx+104]
-        mov	r10, QWORD PTR [r12+112]
-        mov	QWORD PTR [r12+104], r9
-        sbb	r10, QWORD PTR [rcx+112]
-        mov	rax, QWORD PTR [r12+120]
-        mov	QWORD PTR [r12+112], r10
-        sbb	rax, QWORD PTR [rcx+120]
-        mov	r9, QWORD PTR [r12+128]
-        mov	QWORD PTR [r12+120], rax
-        sbb	r9, QWORD PTR [rcx+128]
-        mov	r10, QWORD PTR [r12+136]
-        mov	QWORD PTR [r12+128], r9
-        sbb	r10, QWORD PTR [rcx+136]
-        mov	rax, QWORD PTR [r12+144]
-        mov	QWORD PTR [r12+136], r10
-        sbb	rax, QWORD PTR [rcx+144]
-        mov	r9, QWORD PTR [r12+152]
-        mov	QWORD PTR [r12+144], rax
-        sbb	r9, QWORD PTR [rcx+152]
-        mov	r10, QWORD PTR [r12+160]
-        mov	QWORD PTR [r12+152], r9
-        sbb	r10, QWORD PTR [rcx+160]
-        mov	rax, QWORD PTR [r12+168]
-        mov	QWORD PTR [r12+160], r10
-        sbb	rax, QWORD PTR [rcx+168]
-        mov	r9, QWORD PTR [r12+176]
-        mov	QWORD PTR [r12+168], rax
-        sbb	r9, QWORD PTR [rcx+176]
-        mov	r10, QWORD PTR [r12+184]
-        mov	QWORD PTR [r12+176], r9
-        sbb	r10, QWORD PTR [rcx+184]
-        mov	rax, QWORD PTR [r12+192]
-        mov	QWORD PTR [r12+184], r10
-        sbb	rax, QWORD PTR [rcx+192]
-        mov	r9, QWORD PTR [r12+200]
-        mov	QWORD PTR [r12+192], rax
-        sbb	r9, QWORD PTR [rcx+200]
-        mov	r10, QWORD PTR [r12+208]
-        mov	QWORD PTR [r12+200], r9
-        sbb	r10, QWORD PTR [rcx+208]
-        mov	rax, QWORD PTR [r12+216]
-        mov	QWORD PTR [r12+208], r10
-        sbb	rax, QWORD PTR [rcx+216]
-        mov	r9, QWORD PTR [r12+224]
-        mov	QWORD PTR [r12+216], rax
-        sbb	r9, QWORD PTR [rcx+224]
-        mov	r10, QWORD PTR [r12+232]
-        mov	QWORD PTR [r12+224], r9
-        sbb	r10, QWORD PTR [rcx+232]
-        mov	rax, QWORD PTR [r12+240]
-        mov	QWORD PTR [r12+232], r10
-        sbb	rax, QWORD PTR [rcx+240]
-        mov	r9, QWORD PTR [r12+248]
-        mov	QWORD PTR [r12+240], rax
-        sbb	r9, QWORD PTR [rcx+248]
-        mov	r10, QWORD PTR [r12+256]
-        mov	QWORD PTR [r12+248], r9
-        sbb	r10, QWORD PTR [rcx+256]
-        mov	rax, QWORD PTR [r12+264]
-        mov	QWORD PTR [r12+256], r10
-        sbb	rax, QWORD PTR [rcx+264]
-        mov	r9, QWORD PTR [r12+272]
-        mov	QWORD PTR [r12+264], rax
-        sbb	r9, QWORD PTR [rcx+272]
-        mov	r10, QWORD PTR [r12+280]
-        mov	QWORD PTR [r12+272], r9
-        sbb	r10, QWORD PTR [rcx+280]
-        mov	rax, QWORD PTR [r12+288]
-        mov	QWORD PTR [r12+280], r10
-        sbb	rax, QWORD PTR [rcx+288]
-        mov	r9, QWORD PTR [r12+296]
-        mov	QWORD PTR [r12+288], rax
-        sbb	r9, QWORD PTR [rcx+296]
-        mov	r10, QWORD PTR [r12+304]
-        mov	QWORD PTR [r12+296], r9
-        sbb	r10, QWORD PTR [rcx+304]
-        mov	rax, QWORD PTR [r12+312]
-        mov	QWORD PTR [r12+304], r10
-        sbb	rax, QWORD PTR [rcx+312]
-        mov	r9, QWORD PTR [r12+320]
-        mov	QWORD PTR [r12+312], rax
-        sbb	r9, QWORD PTR [rcx+320]
-        mov	r10, QWORD PTR [r12+328]
-        mov	QWORD PTR [r12+320], r9
-        sbb	r10, QWORD PTR [rcx+328]
-        mov	rax, QWORD PTR [r12+336]
-        mov	QWORD PTR [r12+328], r10
-        sbb	rax, QWORD PTR [rcx+336]
-        mov	r9, QWORD PTR [r12+344]
-        mov	QWORD PTR [r12+336], rax
-        sbb	r9, QWORD PTR [rcx+344]
-        mov	r10, QWORD PTR [r12+352]
-        mov	QWORD PTR [r12+344], r9
-        sbb	r10, QWORD PTR [rcx+352]
-        mov	rax, QWORD PTR [r12+360]
-        mov	QWORD PTR [r12+352], r10
-        sbb	rax, QWORD PTR [rcx+360]
-        mov	r9, QWORD PTR [r12+368]
-        mov	QWORD PTR [r12+360], rax
-        sbb	r9, QWORD PTR [rcx+368]
-        mov	r10, QWORD PTR [r12+376]
-        mov	QWORD PTR [r12+368], r9
-        sbb	r10, QWORD PTR [rcx+376]
-        mov	QWORD PTR [r12+376], r10
-        sbb	r11, 0    ; r11 -= borrow out of the subtraction above
-        sub	rsi, 192    ; rsi = result + 192
-        ; Add: [rsi+0..376] += [r12+0..376] (48 qwords accumulated into the result at byte offset 192)
-        mov	rax, QWORD PTR [rsi]
-        add	rax, QWORD PTR [r12]
-        mov	r9, QWORD PTR [rsi+8]
-        mov	QWORD PTR [rsi], rax
-        adc	r9, QWORD PTR [r12+8]
-        mov	r10, QWORD PTR [rsi+16]
-        mov	QWORD PTR [rsi+8], r9
-        adc	r10, QWORD PTR [r12+16]
-        mov	rax, QWORD PTR [rsi+24]
-        mov	QWORD PTR [rsi+16], r10
-        adc	rax, QWORD PTR [r12+24]
-        mov	r9, QWORD PTR [rsi+32]
-        mov	QWORD PTR [rsi+24], rax
-        adc	r9, QWORD PTR [r12+32]
-        mov	r10, QWORD PTR [rsi+40]
-        mov	QWORD PTR [rsi+32], r9
-        adc	r10, QWORD PTR [r12+40]
-        mov	rax, QWORD PTR [rsi+48]
-        mov	QWORD PTR [rsi+40], r10
-        adc	rax, QWORD PTR [r12+48]
-        mov	r9, QWORD PTR [rsi+56]
-        mov	QWORD PTR [rsi+48], rax
-        adc	r9, QWORD PTR [r12+56]
-        mov	r10, QWORD PTR [rsi+64]
-        mov	QWORD PTR [rsi+56], r9
-        adc	r10, QWORD PTR [r12+64]
-        mov	rax, QWORD PTR [rsi+72]
-        mov	QWORD PTR [rsi+64], r10
-        adc	rax, QWORD PTR [r12+72]
-        mov	r9, QWORD PTR [rsi+80]
-        mov	QWORD PTR [rsi+72], rax
-        adc	r9, QWORD PTR [r12+80]
-        mov	r10, QWORD PTR [rsi+88]
-        mov	QWORD PTR [rsi+80], r9
-        adc	r10, QWORD PTR [r12+88]
-        mov	rax, QWORD PTR [rsi+96]
-        mov	QWORD PTR [rsi+88], r10
-        adc	rax, QWORD PTR [r12+96]
-        mov	r9, QWORD PTR [rsi+104]
-        mov	QWORD PTR [rsi+96], rax
-        adc	r9, QWORD PTR [r12+104]
-        mov	r10, QWORD PTR [rsi+112]
-        mov	QWORD PTR [rsi+104], r9
-        adc	r10, QWORD PTR [r12+112]
-        mov	rax, QWORD PTR [rsi+120]
-        mov	QWORD PTR [rsi+112], r10
-        adc	rax, QWORD PTR [r12+120]
-        mov	r9, QWORD PTR [rsi+128]
-        mov	QWORD PTR [rsi+120], rax
-        adc	r9, QWORD PTR [r12+128]
-        mov	r10, QWORD PTR [rsi+136]
-        mov	QWORD PTR [rsi+128], r9
-        adc	r10, QWORD PTR [r12+136]
-        mov	rax, QWORD PTR [rsi+144]
-        mov	QWORD PTR [rsi+136], r10
-        adc	rax, QWORD PTR [r12+144]
-        mov	r9, QWORD PTR [rsi+152]
-        mov	QWORD PTR [rsi+144], rax
-        adc	r9, QWORD PTR [r12+152]
-        mov	r10, QWORD PTR [rsi+160]
-        mov	QWORD PTR [rsi+152], r9
-        adc	r10, QWORD PTR [r12+160]
-        mov	rax, QWORD PTR [rsi+168]
-        mov	QWORD PTR [rsi+160], r10
-        adc	rax, QWORD PTR [r12+168]
-        mov	r9, QWORD PTR [rsi+176]
-        mov	QWORD PTR [rsi+168], rax
-        adc	r9, QWORD PTR [r12+176]
-        mov	r10, QWORD PTR [rsi+184]
-        mov	QWORD PTR [rsi+176], r9
-        adc	r10, QWORD PTR [r12+184]
-        mov	rax, QWORD PTR [rsi+192]
-        mov	QWORD PTR [rsi+184], r10
-        adc	rax, QWORD PTR [r12+192]
-        mov	r9, QWORD PTR [rsi+200]
-        mov	QWORD PTR [rsi+192], rax
-        adc	r9, QWORD PTR [r12+200]
-        mov	r10, QWORD PTR [rsi+208]
-        mov	QWORD PTR [rsi+200], r9
-        adc	r10, QWORD PTR [r12+208]
-        mov	rax, QWORD PTR [rsi+216]
-        mov	QWORD PTR [rsi+208], r10
-        adc	rax, QWORD PTR [r12+216]
-        mov	r9, QWORD PTR [rsi+224]
-        mov	QWORD PTR [rsi+216], rax
-        adc	r9, QWORD PTR [r12+224]
-        mov	r10, QWORD PTR [rsi+232]
-        mov	QWORD PTR [rsi+224], r9
-        adc	r10, QWORD PTR [r12+232]
-        mov	rax, QWORD PTR [rsi+240]
-        mov	QWORD PTR [rsi+232], r10
-        adc	rax, QWORD PTR [r12+240]
-        mov	r9, QWORD PTR [rsi+248]
-        mov	QWORD PTR [rsi+240], rax
-        adc	r9, QWORD PTR [r12+248]
-        mov	r10, QWORD PTR [rsi+256]
-        mov	QWORD PTR [rsi+248], r9
-        adc	r10, QWORD PTR [r12+256]
-        mov	rax, QWORD PTR [rsi+264]
-        mov	QWORD PTR [rsi+256], r10
-        adc	rax, QWORD PTR [r12+264]
-        mov	r9, QWORD PTR [rsi+272]
-        mov	QWORD PTR [rsi+264], rax
-        adc	r9, QWORD PTR [r12+272]
-        mov	r10, QWORD PTR [rsi+280]
-        mov	QWORD PTR [rsi+272], r9
-        adc	r10, QWORD PTR [r12+280]
-        mov	rax, QWORD PTR [rsi+288]
-        mov	QWORD PTR [rsi+280], r10
-        adc	rax, QWORD PTR [r12+288]
-        mov	r9, QWORD PTR [rsi+296]
-        mov	QWORD PTR [rsi+288], rax
-        adc	r9, QWORD PTR [r12+296]
-        mov	r10, QWORD PTR [rsi+304]
-        mov	QWORD PTR [rsi+296], r9
-        adc	r10, QWORD PTR [r12+304]
-        mov	rax, QWORD PTR [rsi+312]
-        mov	QWORD PTR [rsi+304], r10
-        adc	rax, QWORD PTR [r12+312]
-        mov	r9, QWORD PTR [rsi+320]
-        mov	QWORD PTR [rsi+312], rax
-        adc	r9, QWORD PTR [r12+320]
-        mov	r10, QWORD PTR [rsi+328]
-        mov	QWORD PTR [rsi+320], r9
-        adc	r10, QWORD PTR [r12+328]
-        mov	rax, QWORD PTR [rsi+336]
-        mov	QWORD PTR [rsi+328], r10
-        adc	rax, QWORD PTR [r12+336]
-        mov	r9, QWORD PTR [rsi+344]
-        mov	QWORD PTR [rsi+336], rax
-        adc	r9, QWORD PTR [r12+344]
-        mov	r10, QWORD PTR [rsi+352]
-        mov	QWORD PTR [rsi+344], r9
-        adc	r10, QWORD PTR [r12+352]
-        mov	rax, QWORD PTR [rsi+360]
-        mov	QWORD PTR [rsi+352], r10
-        adc	rax, QWORD PTR [r12+360]
-        mov	r9, QWORD PTR [rsi+368]
-        mov	QWORD PTR [rsi+360], rax
-        adc	r9, QWORD PTR [r12+368]
-        mov	r10, QWORD PTR [rsi+376]
-        mov	QWORD PTR [rsi+368], r9
-        adc	r10, QWORD PTR [r12+376]
-        mov	QWORD PTR [rsi+376], r10
-        adc	r11, 0    ; r11 += carry out of the accumulation above
-        mov	QWORD PTR [rcx+576], r11    ; store r11 at rcx+576, i.e. the qword right after the 48 just accumulated (with rcx = result pointer)
-        add	rsi, 192    ; rsi = result + 384
-        ; Add: [rsi+0..192] += [r13+0..192] (25 qwords; the last one is the word at result+576 stored just above)
-        mov	rax, QWORD PTR [rsi]
-        add	rax, QWORD PTR [r13]
-        mov	r9, QWORD PTR [rsi+8]
-        mov	QWORD PTR [rsi], rax
-        adc	r9, QWORD PTR [r13+8]
-        mov	r10, QWORD PTR [rsi+16]
-        mov	QWORD PTR [rsi+8], r9
-        adc	r10, QWORD PTR [r13+16]
-        mov	rax, QWORD PTR [rsi+24]
-        mov	QWORD PTR [rsi+16], r10
-        adc	rax, QWORD PTR [r13+24]
-        mov	r9, QWORD PTR [rsi+32]
-        mov	QWORD PTR [rsi+24], rax
-        adc	r9, QWORD PTR [r13+32]
-        mov	r10, QWORD PTR [rsi+40]
-        mov	QWORD PTR [rsi+32], r9
-        adc	r10, QWORD PTR [r13+40]
-        mov	rax, QWORD PTR [rsi+48]
-        mov	QWORD PTR [rsi+40], r10
-        adc	rax, QWORD PTR [r13+48]
-        mov	r9, QWORD PTR [rsi+56]
-        mov	QWORD PTR [rsi+48], rax
-        adc	r9, QWORD PTR [r13+56]
-        mov	r10, QWORD PTR [rsi+64]
-        mov	QWORD PTR [rsi+56], r9
-        adc	r10, QWORD PTR [r13+64]
-        mov	rax, QWORD PTR [rsi+72]
-        mov	QWORD PTR [rsi+64], r10
-        adc	rax, QWORD PTR [r13+72]
-        mov	r9, QWORD PTR [rsi+80]
-        mov	QWORD PTR [rsi+72], rax
-        adc	r9, QWORD PTR [r13+80]
-        mov	r10, QWORD PTR [rsi+88]
-        mov	QWORD PTR [rsi+80], r9
-        adc	r10, QWORD PTR [r13+88]
-        mov	rax, QWORD PTR [rsi+96]
-        mov	QWORD PTR [rsi+88], r10
-        adc	rax, QWORD PTR [r13+96]
-        mov	r9, QWORD PTR [rsi+104]
-        mov	QWORD PTR [rsi+96], rax
-        adc	r9, QWORD PTR [r13+104]
-        mov	r10, QWORD PTR [rsi+112]
-        mov	QWORD PTR [rsi+104], r9
-        adc	r10, QWORD PTR [r13+112]
-        mov	rax, QWORD PTR [rsi+120]
-        mov	QWORD PTR [rsi+112], r10
-        adc	rax, QWORD PTR [r13+120]
-        mov	r9, QWORD PTR [rsi+128]
-        mov	QWORD PTR [rsi+120], rax
-        adc	r9, QWORD PTR [r13+128]
-        mov	r10, QWORD PTR [rsi+136]
-        mov	QWORD PTR [rsi+128], r9
-        adc	r10, QWORD PTR [r13+136]
-        mov	rax, QWORD PTR [rsi+144]
-        mov	QWORD PTR [rsi+136], r10
-        adc	rax, QWORD PTR [r13+144]
-        mov	r9, QWORD PTR [rsi+152]
-        mov	QWORD PTR [rsi+144], rax
-        adc	r9, QWORD PTR [r13+152]
-        mov	r10, QWORD PTR [rsi+160]
-        mov	QWORD PTR [rsi+152], r9
-        adc	r10, QWORD PTR [r13+160]
-        mov	rax, QWORD PTR [rsi+168]
-        mov	QWORD PTR [rsi+160], r10
-        adc	rax, QWORD PTR [r13+168]
-        mov	r9, QWORD PTR [rsi+176]
-        mov	QWORD PTR [rsi+168], rax
-        adc	r9, QWORD PTR [r13+176]
-        mov	r10, QWORD PTR [rsi+184]
-        mov	QWORD PTR [rsi+176], r9
-        adc	r10, QWORD PTR [r13+184]
-        mov	rax, QWORD PTR [rsi+192]
-        mov	QWORD PTR [rsi+184], r10
-        adc	rax, QWORD PTR [r13+192]
-        mov	QWORD PTR [rsi+192], rax
-        ; Add to zero: [rsi+200..376] = [r13+200..376] + incoming carry; the destination is overwritten, its old contents are never read
-        mov	rax, QWORD PTR [r13+200]
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+208]
-        mov	QWORD PTR [rsi+200], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+216]
-        mov	QWORD PTR [rsi+208], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+224]
-        mov	QWORD PTR [rsi+216], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+232]
-        mov	QWORD PTR [rsi+224], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+240]
-        mov	QWORD PTR [rsi+232], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+248]
-        mov	QWORD PTR [rsi+240], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+256]
-        mov	QWORD PTR [rsi+248], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+264]
-        mov	QWORD PTR [rsi+256], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+272]
-        mov	QWORD PTR [rsi+264], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+280]
-        mov	QWORD PTR [rsi+272], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+288]
-        mov	QWORD PTR [rsi+280], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+296]
-        mov	QWORD PTR [rsi+288], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+304]
-        mov	QWORD PTR [rsi+296], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+312]
-        mov	QWORD PTR [rsi+304], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+320]
-        mov	QWORD PTR [rsi+312], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+328]
-        mov	QWORD PTR [rsi+320], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+336]
-        mov	QWORD PTR [rsi+328], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+344]
-        mov	QWORD PTR [rsi+336], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+352]
-        mov	QWORD PTR [rsi+344], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+360]
-        mov	QWORD PTR [rsi+352], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+368]
-        mov	QWORD PTR [rsi+360], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+376]
-        mov	QWORD PTR [rsi+368], rax
-        adc	r9, 0
-        mov	QWORD PTR [rsi+376], r9
-        add	rsp, 1192    ; release the stack scratch area
-        pop	rsi    ; restore the saved registers in reverse push order
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_3072_mul_48 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
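-; /* Multiply a and b into r. (r = a * b)
-;  *
-;  * Arguments arrive in rcx (r), rdx (a) and r8 (b).
-;  *
-;  * r  A single precision integer; receives the product (96 64-bit words).
-;  * a  A single precision integer; first operand (48 64-bit words).
-;  * b  A single precision integer; second operand (48 64-bit words).
-;  */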
-_text SEGMENT READONLY PARA
-sp_3072_mul_avx2_48 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        sub	rsp, 1192
-        mov	QWORD PTR [rsp+1152], rcx
-        mov	QWORD PTR [rsp+1160], rdx
-        mov	QWORD PTR [rsp+1168], r8
-        lea	r12, QWORD PTR [rsp+768]
-        lea	r14, QWORD PTR [rdx+192]
-        ; Add
-        mov	rax, QWORD PTR [rdx]
-        xor	r15, r15
-        add	rax, QWORD PTR [r14]
-        mov	r9, QWORD PTR [rdx+8]
-        mov	QWORD PTR [r12], rax
-        adc	r9, QWORD PTR [r14+8]
-        mov	r10, QWORD PTR [rdx+16]
-        mov	QWORD PTR [r12+8], r9
-        adc	r10, QWORD PTR [r14+16]
-        mov	rax, QWORD PTR [rdx+24]
-        mov	QWORD PTR [r12+16], r10
-        adc	rax, QWORD PTR [r14+24]
-        mov	r9, QWORD PTR [rdx+32]
-        mov	QWORD PTR [r12+24], rax
-        adc	r9, QWORD PTR [r14+32]
-        mov	r10, QWORD PTR [rdx+40]
-        mov	QWORD PTR [r12+32], r9
-        adc	r10, QWORD PTR [r14+40]
-        mov	rax, QWORD PTR [rdx+48]
-        mov	QWORD PTR [r12+40], r10
-        adc	rax, QWORD PTR [r14+48]
-        mov	r9, QWORD PTR [rdx+56]
-        mov	QWORD PTR [r12+48], rax
-        adc	r9, QWORD PTR [r14+56]
-        mov	r10, QWORD PTR [rdx+64]
-        mov	QWORD PTR [r12+56], r9
-        adc	r10, QWORD PTR [r14+64]
-        mov	rax, QWORD PTR [rdx+72]
-        mov	QWORD PTR [r12+64], r10
-        adc	rax, QWORD PTR [r14+72]
-        mov	r9, QWORD PTR [rdx+80]
-        mov	QWORD PTR [r12+72], rax
-        adc	r9, QWORD PTR [r14+80]
-        mov	r10, QWORD PTR [rdx+88]
-        mov	QWORD PTR [r12+80], r9
-        adc	r10, QWORD PTR [r14+88]
-        mov	rax, QWORD PTR [rdx+96]
-        mov	QWORD PTR [r12+88], r10
-        adc	rax, QWORD PTR [r14+96]
-        mov	r9, QWORD PTR [rdx+104]
-        mov	QWORD PTR [r12+96], rax
-        adc	r9, QWORD PTR [r14+104]
-        mov	r10, QWORD PTR [rdx+112]
-        mov	QWORD PTR [r12+104], r9
-        adc	r10, QWORD PTR [r14+112]
-        mov	rax, QWORD PTR [rdx+120]
-        mov	QWORD PTR [r12+112], r10
-        adc	rax, QWORD PTR [r14+120]
-        mov	r9, QWORD PTR [rdx+128]
-        mov	QWORD PTR [r12+120], rax
-        adc	r9, QWORD PTR [r14+128]
-        mov	r10, QWORD PTR [rdx+136]
-        mov	QWORD PTR [r12+128], r9
-        adc	r10, QWORD PTR [r14+136]
-        mov	rax, QWORD PTR [rdx+144]
-        mov	QWORD PTR [r12+136], r10
-        adc	rax, QWORD PTR [r14+144]
-        mov	r9, QWORD PTR [rdx+152]
-        mov	QWORD PTR [r12+144], rax
-        adc	r9, QWORD PTR [r14+152]
-        mov	r10, QWORD PTR [rdx+160]
-        mov	QWORD PTR [r12+152], r9
-        adc	r10, QWORD PTR [r14+160]
-        mov	rax, QWORD PTR [rdx+168]
-        mov	QWORD PTR [r12+160], r10
-        adc	rax, QWORD PTR [r14+168]
-        mov	r9, QWORD PTR [rdx+176]
-        mov	QWORD PTR [r12+168], rax
-        adc	r9, QWORD PTR [r14+176]
-        mov	r10, QWORD PTR [rdx+184]
-        mov	QWORD PTR [r12+176], r9
-        adc	r10, QWORD PTR [r14+184]
-        mov	QWORD PTR [r12+184], r10
-        adc	r15, 0
-        mov	QWORD PTR [rsp+1176], r15
-        lea	r13, QWORD PTR [rsp+960]
-        lea	r14, QWORD PTR [r8+192]
-        ; Add
-        mov	rax, QWORD PTR [r8]
-        xor	rdi, rdi
-        add	rax, QWORD PTR [r14]
-        mov	r9, QWORD PTR [r8+8]
-        mov	QWORD PTR [r13], rax
-        adc	r9, QWORD PTR [r14+8]
-        mov	r10, QWORD PTR [r8+16]
-        mov	QWORD PTR [r13+8], r9
-        adc	r10, QWORD PTR [r14+16]
-        mov	rax, QWORD PTR [r8+24]
-        mov	QWORD PTR [r13+16], r10
-        adc	rax, QWORD PTR [r14+24]
-        mov	r9, QWORD PTR [r8+32]
-        mov	QWORD PTR [r13+24], rax
-        adc	r9, QWORD PTR [r14+32]
-        mov	r10, QWORD PTR [r8+40]
-        mov	QWORD PTR [r13+32], r9
-        adc	r10, QWORD PTR [r14+40]
-        mov	rax, QWORD PTR [r8+48]
-        mov	QWORD PTR [r13+40], r10
-        adc	rax, QWORD PTR [r14+48]
-        mov	r9, QWORD PTR [r8+56]
-        mov	QWORD PTR [r13+48], rax
-        adc	r9, QWORD PTR [r14+56]
-        mov	r10, QWORD PTR [r8+64]
-        mov	QWORD PTR [r13+56], r9
-        adc	r10, QWORD PTR [r14+64]
-        mov	rax, QWORD PTR [r8+72]
-        mov	QWORD PTR [r13+64], r10
-        adc	rax, QWORD PTR [r14+72]
-        mov	r9, QWORD PTR [r8+80]
-        mov	QWORD PTR [r13+72], rax
-        adc	r9, QWORD PTR [r14+80]
-        mov	r10, QWORD PTR [r8+88]
-        mov	QWORD PTR [r13+80], r9
-        adc	r10, QWORD PTR [r14+88]
-        mov	rax, QWORD PTR [r8+96]
-        mov	QWORD PTR [r13+88], r10
-        adc	rax, QWORD PTR [r14+96]
-        mov	r9, QWORD PTR [r8+104]
-        mov	QWORD PTR [r13+96], rax
-        adc	r9, QWORD PTR [r14+104]
-        mov	r10, QWORD PTR [r8+112]
-        mov	QWORD PTR [r13+104], r9
-        adc	r10, QWORD PTR [r14+112]
-        mov	rax, QWORD PTR [r8+120]
-        mov	QWORD PTR [r13+112], r10
-        adc	rax, QWORD PTR [r14+120]
-        mov	r9, QWORD PTR [r8+128]
-        mov	QWORD PTR [r13+120], rax
-        adc	r9, QWORD PTR [r14+128]
-        mov	r10, QWORD PTR [r8+136]
-        mov	QWORD PTR [r13+128], r9
-        adc	r10, QWORD PTR [r14+136]
-        mov	rax, QWORD PTR [r8+144]
-        mov	QWORD PTR [r13+136], r10
-        adc	rax, QWORD PTR [r14+144]
-        mov	r9, QWORD PTR [r8+152]
-        mov	QWORD PTR [r13+144], rax
-        adc	r9, QWORD PTR [r14+152]
-        mov	r10, QWORD PTR [r8+160]
-        mov	QWORD PTR [r13+152], r9
-        adc	r10, QWORD PTR [r14+160]
-        mov	rax, QWORD PTR [r8+168]
-        mov	QWORD PTR [r13+160], r10
-        adc	rax, QWORD PTR [r14+168]
-        mov	r9, QWORD PTR [r8+176]
-        mov	QWORD PTR [r13+168], rax
-        adc	r9, QWORD PTR [r14+176]
-        mov	r10, QWORD PTR [r8+184]
-        mov	QWORD PTR [r13+176], r9
-        adc	r10, QWORD PTR [r14+184]
-        mov	QWORD PTR [r13+184], r10
-        adc	rdi, 0
-        mov	QWORD PTR [rsp+1184], rdi
-        mov	r8, r13
-        mov	rdx, r12
-        mov	rcx, rsp
-        call	sp_3072_mul_avx2_24
-        mov	r8, QWORD PTR [rsp+1168]
-        mov	rdx, QWORD PTR [rsp+1160]
-        lea	rcx, QWORD PTR [rsp+384]
-        add	r8, 192
-        add	rdx, 192
-        call	sp_3072_mul_avx2_24
-        mov	r8, QWORD PTR [rsp+1168]
-        mov	rdx, QWORD PTR [rsp+1160]
-        mov	rcx, QWORD PTR [rsp+1152]
-        call	sp_3072_mul_avx2_24
-IFDEF _WIN64
-        mov	r8, QWORD PTR [rsp+1168]
-        mov	rdx, QWORD PTR [rsp+1160]
-        mov	rcx, QWORD PTR [rsp+1152]
-ENDIF
-        mov	r15, QWORD PTR [rsp+1176]
-        mov	rdi, QWORD PTR [rsp+1184]
-        mov	rsi, QWORD PTR [rsp+1152]
-        mov	r11, r15
-        lea	r12, QWORD PTR [rsp+768]
-        lea	r13, QWORD PTR [rsp+960]
-        and	r11, rdi
-        neg	r15
-        neg	rdi
-        add	rsi, 384
-        mov	rax, QWORD PTR [r12]
-        mov	r9, QWORD PTR [r13]
-        pext	rax, rax, rdi
-        pext	r9, r9, r15
-        add	rax, r9
-        mov	r9, QWORD PTR [r12+8]
-        mov	r10, QWORD PTR [r13+8]
-        pext	r9, r9, rdi
-        pext	r10, r10, r15
-        mov	QWORD PTR [rsi], rax
-        adc	r9, r10
-        mov	r10, QWORD PTR [r12+16]
-        mov	rax, QWORD PTR [r13+16]
-        pext	r10, r10, rdi
-        pext	rax, rax, r15
-        mov	QWORD PTR [rsi+8], r9
-        adc	r10, rax
-        mov	rax, QWORD PTR [r12+24]
-        mov	r9, QWORD PTR [r13+24]
-        pext	rax, rax, rdi
-        pext	r9, r9, r15
-        mov	QWORD PTR [rsi+16], r10
-        adc	rax, r9
-        mov	r9, QWORD PTR [r12+32]
-        mov	r10, QWORD PTR [r13+32]
-        pext	r9, r9, rdi
-        pext	r10, r10, r15
-        mov	QWORD PTR [rsi+24], rax
-        adc	r9, r10
-        mov	r10, QWORD PTR [r12+40]
-        mov	rax, QWORD PTR [r13+40]
-        pext	r10, r10, rdi
-        pext	rax, rax, r15
-        mov	QWORD PTR [rsi+32], r9
-        adc	r10, rax
-        mov	rax, QWORD PTR [r12+48]
-        mov	r9, QWORD PTR [r13+48]
-        pext	rax, rax, rdi
-        pext	r9, r9, r15
-        mov	QWORD PTR [rsi+40], r10
-        adc	rax, r9
-        mov	r9, QWORD PTR [r12+56]
-        mov	r10, QWORD PTR [r13+56]
-        pext	r9, r9, rdi
-        pext	r10, r10, r15
-        mov	QWORD PTR [rsi+48], rax
-        adc	r9, r10
-        mov	r10, QWORD PTR [r12+64]
-        mov	rax, QWORD PTR [r13+64]
-        pext	r10, r10, rdi
-        pext	rax, rax, r15
-        mov	QWORD PTR [rsi+56], r9
-        adc	r10, rax
-        mov	rax, QWORD PTR [r12+72]
-        mov	r9, QWORD PTR [r13+72]
-        pext	rax, rax, rdi
-        pext	r9, r9, r15
-        mov	QWORD PTR [rsi+64], r10
-        adc	rax, r9
-        mov	r9, QWORD PTR [r12+80]
-        mov	r10, QWORD PTR [r13+80]
-        pext	r9, r9, rdi
-        pext	r10, r10, r15
-        mov	QWORD PTR [rsi+72], rax
-        adc	r9, r10
-        mov	r10, QWORD PTR [r12+88]
-        mov	rax, QWORD PTR [r13+88]
-        pext	r10, r10, rdi
-        pext	rax, rax, r15
-        mov	QWORD PTR [rsi+80], r9
-        adc	r10, rax
-        mov	rax, QWORD PTR [r12+96]
-        mov	r9, QWORD PTR [r13+96]
-        pext	rax, rax, rdi
-        pext	r9, r9, r15
-        mov	QWORD PTR [rsi+88], r10
-        adc	rax, r9
-        mov	r9, QWORD PTR [r12+104]
-        mov	r10, QWORD PTR [r13+104]
-        pext	r9, r9, rdi
-        pext	r10, r10, r15
-        mov	QWORD PTR [rsi+96], rax
-        adc	r9, r10
-        mov	r10, QWORD PTR [r12+112]
-        mov	rax, QWORD PTR [r13+112]
-        pext	r10, r10, rdi
-        pext	rax, rax, r15
-        mov	QWORD PTR [rsi+104], r9
-        adc	r10, rax
-        mov	rax, QWORD PTR [r12+120]
-        mov	r9, QWORD PTR [r13+120]
-        pext	rax, rax, rdi
-        pext	r9, r9, r15
-        mov	QWORD PTR [rsi+112], r10
-        adc	rax, r9
-        mov	r9, QWORD PTR [r12+128]
-        mov	r10, QWORD PTR [r13+128]
-        pext	r9, r9, rdi
-        pext	r10, r10, r15
-        mov	QWORD PTR [rsi+120], rax
-        adc	r9, r10
-        mov	r10, QWORD PTR [r12+136]
-        mov	rax, QWORD PTR [r13+136]
-        pext	r10, r10, rdi
-        pext	rax, rax, r15
-        mov	QWORD PTR [rsi+128], r9
-        adc	r10, rax
-        mov	rax, QWORD PTR [r12+144]
-        mov	r9, QWORD PTR [r13+144]
-        pext	rax, rax, rdi
-        pext	r9, r9, r15
-        mov	QWORD PTR [rsi+136], r10
-        adc	rax, r9
-        mov	r9, QWORD PTR [r12+152]
-        mov	r10, QWORD PTR [r13+152]
-        pext	r9, r9, rdi
-        pext	r10, r10, r15
-        mov	QWORD PTR [rsi+144], rax
-        adc	r9, r10
-        mov	r10, QWORD PTR [r12+160]
-        mov	rax, QWORD PTR [r13+160]
-        pext	r10, r10, rdi
-        pext	rax, rax, r15
-        mov	QWORD PTR [rsi+152], r9
-        adc	r10, rax
-        mov	rax, QWORD PTR [r12+168]
-        mov	r9, QWORD PTR [r13+168]
-        pext	rax, rax, rdi
-        pext	r9, r9, r15
-        mov	QWORD PTR [rsi+160], r10
-        adc	rax, r9
-        mov	r9, QWORD PTR [r12+176]
-        mov	r10, QWORD PTR [r13+176]
-        pext	r9, r9, rdi
-        pext	r10, r10, r15
-        mov	QWORD PTR [rsi+168], rax
-        adc	r9, r10
-        mov	r10, QWORD PTR [r12+184]
-        mov	rax, QWORD PTR [r13+184]
-        pext	r10, r10, rdi
-        pext	rax, rax, r15
-        mov	QWORD PTR [rsi+176], r9
-        adc	r10, rax
-        mov	QWORD PTR [rsi+184], r10
-        adc	r11, 0
-        lea	r13, QWORD PTR [rsp+384]
-        mov	r12, rsp
-        mov	rax, QWORD PTR [r12]
-        sub	rax, QWORD PTR [r13]
-        mov	r9, QWORD PTR [r12+8]
-        mov	QWORD PTR [r12], rax
-        sbb	r9, QWORD PTR [r13+8]
-        mov	r10, QWORD PTR [r12+16]
-        mov	QWORD PTR [r12+8], r9
-        sbb	r10, QWORD PTR [r13+16]
-        mov	rax, QWORD PTR [r12+24]
-        mov	QWORD PTR [r12+16], r10
-        sbb	rax, QWORD PTR [r13+24]
-        mov	r9, QWORD PTR [r12+32]
-        mov	QWORD PTR [r12+24], rax
-        sbb	r9, QWORD PTR [r13+32]
-        mov	r10, QWORD PTR [r12+40]
-        mov	QWORD PTR [r12+32], r9
-        sbb	r10, QWORD PTR [r13+40]
-        mov	rax, QWORD PTR [r12+48]
-        mov	QWORD PTR [r12+40], r10
-        sbb	rax, QWORD PTR [r13+48]
-        mov	r9, QWORD PTR [r12+56]
-        mov	QWORD PTR [r12+48], rax
-        sbb	r9, QWORD PTR [r13+56]
-        mov	r10, QWORD PTR [r12+64]
-        mov	QWORD PTR [r12+56], r9
-        sbb	r10, QWORD PTR [r13+64]
-        mov	rax, QWORD PTR [r12+72]
-        mov	QWORD PTR [r12+64], r10
-        sbb	rax, QWORD PTR [r13+72]
-        mov	r9, QWORD PTR [r12+80]
-        mov	QWORD PTR [r12+72], rax
-        sbb	r9, QWORD PTR [r13+80]
-        mov	r10, QWORD PTR [r12+88]
-        mov	QWORD PTR [r12+80], r9
-        sbb	r10, QWORD PTR [r13+88]
-        mov	rax, QWORD PTR [r12+96]
-        mov	QWORD PTR [r12+88], r10
-        sbb	rax, QWORD PTR [r13+96]
-        mov	r9, QWORD PTR [r12+104]
-        mov	QWORD PTR [r12+96], rax
-        sbb	r9, QWORD PTR [r13+104]
-        mov	r10, QWORD PTR [r12+112]
-        mov	QWORD PTR [r12+104], r9
-        sbb	r10, QWORD PTR [r13+112]
-        mov	rax, QWORD PTR [r12+120]
-        mov	QWORD PTR [r12+112], r10
-        sbb	rax, QWORD PTR [r13+120]
-        mov	r9, QWORD PTR [r12+128]
-        mov	QWORD PTR [r12+120], rax
-        sbb	r9, QWORD PTR [r13+128]
-        mov	r10, QWORD PTR [r12+136]
-        mov	QWORD PTR [r12+128], r9
-        sbb	r10, QWORD PTR [r13+136]
-        mov	rax, QWORD PTR [r12+144]
-        mov	QWORD PTR [r12+136], r10
-        sbb	rax, QWORD PTR [r13+144]
-        mov	r9, QWORD PTR [r12+152]
-        mov	QWORD PTR [r12+144], rax
-        sbb	r9, QWORD PTR [r13+152]
-        mov	r10, QWORD PTR [r12+160]
-        mov	QWORD PTR [r12+152], r9
-        sbb	r10, QWORD PTR [r13+160]
-        mov	rax, QWORD PTR [r12+168]
-        mov	QWORD PTR [r12+160], r10
-        sbb	rax, QWORD PTR [r13+168]
-        mov	r9, QWORD PTR [r12+176]
-        mov	QWORD PTR [r12+168], rax
-        sbb	r9, QWORD PTR [r13+176]
-        mov	r10, QWORD PTR [r12+184]
-        mov	QWORD PTR [r12+176], r9
-        sbb	r10, QWORD PTR [r13+184]
-        mov	rax, QWORD PTR [r12+192]
-        mov	QWORD PTR [r12+184], r10
-        sbb	rax, QWORD PTR [r13+192]
-        mov	r9, QWORD PTR [r12+200]
-        mov	QWORD PTR [r12+192], rax
-        sbb	r9, QWORD PTR [r13+200]
-        mov	r10, QWORD PTR [r12+208]
-        mov	QWORD PTR [r12+200], r9
-        sbb	r10, QWORD PTR [r13+208]
-        mov	rax, QWORD PTR [r12+216]
-        mov	QWORD PTR [r12+208], r10
-        sbb	rax, QWORD PTR [r13+216]
-        mov	r9, QWORD PTR [r12+224]
-        mov	QWORD PTR [r12+216], rax
-        sbb	r9, QWORD PTR [r13+224]
-        mov	r10, QWORD PTR [r12+232]
-        mov	QWORD PTR [r12+224], r9
-        sbb	r10, QWORD PTR [r13+232]
-        mov	rax, QWORD PTR [r12+240]
-        mov	QWORD PTR [r12+232], r10
-        sbb	rax, QWORD PTR [r13+240]
-        mov	r9, QWORD PTR [r12+248]
-        mov	QWORD PTR [r12+240], rax
-        sbb	r9, QWORD PTR [r13+248]
-        mov	r10, QWORD PTR [r12+256]
-        mov	QWORD PTR [r12+248], r9
-        sbb	r10, QWORD PTR [r13+256]
-        mov	rax, QWORD PTR [r12+264]
-        mov	QWORD PTR [r12+256], r10
-        sbb	rax, QWORD PTR [r13+264]
-        mov	r9, QWORD PTR [r12+272]
-        mov	QWORD PTR [r12+264], rax
-        sbb	r9, QWORD PTR [r13+272]
-        mov	r10, QWORD PTR [r12+280]
-        mov	QWORD PTR [r12+272], r9
-        sbb	r10, QWORD PTR [r13+280]
-        mov	rax, QWORD PTR [r12+288]
-        mov	QWORD PTR [r12+280], r10
-        sbb	rax, QWORD PTR [r13+288]
-        mov	r9, QWORD PTR [r12+296]
-        mov	QWORD PTR [r12+288], rax
-        sbb	r9, QWORD PTR [r13+296]
-        mov	r10, QWORD PTR [r12+304]
-        mov	QWORD PTR [r12+296], r9
-        sbb	r10, QWORD PTR [r13+304]
-        mov	rax, QWORD PTR [r12+312]
-        mov	QWORD PTR [r12+304], r10
-        sbb	rax, QWORD PTR [r13+312]
-        mov	r9, QWORD PTR [r12+320]
-        mov	QWORD PTR [r12+312], rax
-        sbb	r9, QWORD PTR [r13+320]
-        mov	r10, QWORD PTR [r12+328]
-        mov	QWORD PTR [r12+320], r9
-        sbb	r10, QWORD PTR [r13+328]
-        mov	rax, QWORD PTR [r12+336]
-        mov	QWORD PTR [r12+328], r10
-        sbb	rax, QWORD PTR [r13+336]
-        mov	r9, QWORD PTR [r12+344]
-        mov	QWORD PTR [r12+336], rax
-        sbb	r9, QWORD PTR [r13+344]
-        mov	r10, QWORD PTR [r12+352]
-        mov	QWORD PTR [r12+344], r9
-        sbb	r10, QWORD PTR [r13+352]
-        mov	rax, QWORD PTR [r12+360]
-        mov	QWORD PTR [r12+352], r10
-        sbb	rax, QWORD PTR [r13+360]
-        mov	r9, QWORD PTR [r12+368]
-        mov	QWORD PTR [r12+360], rax
-        sbb	r9, QWORD PTR [r13+368]
-        mov	r10, QWORD PTR [r12+376]
-        mov	QWORD PTR [r12+368], r9
-        sbb	r10, QWORD PTR [r13+376]
-        mov	QWORD PTR [r12+376], r10
-        sbb	r11, 0
-        mov	rax, QWORD PTR [r12]
-        sub	rax, QWORD PTR [rcx]
-        mov	r9, QWORD PTR [r12+8]
-        mov	QWORD PTR [r12], rax
-        sbb	r9, QWORD PTR [rcx+8]
-        mov	r10, QWORD PTR [r12+16]
-        mov	QWORD PTR [r12+8], r9
-        sbb	r10, QWORD PTR [rcx+16]
-        mov	rax, QWORD PTR [r12+24]
-        mov	QWORD PTR [r12+16], r10
-        sbb	rax, QWORD PTR [rcx+24]
-        mov	r9, QWORD PTR [r12+32]
-        mov	QWORD PTR [r12+24], rax
-        sbb	r9, QWORD PTR [rcx+32]
-        mov	r10, QWORD PTR [r12+40]
-        mov	QWORD PTR [r12+32], r9
-        sbb	r10, QWORD PTR [rcx+40]
-        mov	rax, QWORD PTR [r12+48]
-        mov	QWORD PTR [r12+40], r10
-        sbb	rax, QWORD PTR [rcx+48]
-        mov	r9, QWORD PTR [r12+56]
-        mov	QWORD PTR [r12+48], rax
-        sbb	r9, QWORD PTR [rcx+56]
-        mov	r10, QWORD PTR [r12+64]
-        mov	QWORD PTR [r12+56], r9
-        sbb	r10, QWORD PTR [rcx+64]
-        mov	rax, QWORD PTR [r12+72]
-        mov	QWORD PTR [r12+64], r10
-        sbb	rax, QWORD PTR [rcx+72]
-        mov	r9, QWORD PTR [r12+80]
-        mov	QWORD PTR [r12+72], rax
-        sbb	r9, QWORD PTR [rcx+80]
-        mov	r10, QWORD PTR [r12+88]
-        mov	QWORD PTR [r12+80], r9
-        sbb	r10, QWORD PTR [rcx+88]
-        mov	rax, QWORD PTR [r12+96]
-        mov	QWORD PTR [r12+88], r10
-        sbb	rax, QWORD PTR [rcx+96]
-        mov	r9, QWORD PTR [r12+104]
-        mov	QWORD PTR [r12+96], rax
-        sbb	r9, QWORD PTR [rcx+104]
-        mov	r10, QWORD PTR [r12+112]
-        mov	QWORD PTR [r12+104], r9
-        sbb	r10, QWORD PTR [rcx+112]
-        mov	rax, QWORD PTR [r12+120]
-        mov	QWORD PTR [r12+112], r10
-        sbb	rax, QWORD PTR [rcx+120]
-        mov	r9, QWORD PTR [r12+128]
-        mov	QWORD PTR [r12+120], rax
-        sbb	r9, QWORD PTR [rcx+128]
-        mov	r10, QWORD PTR [r12+136]
-        mov	QWORD PTR [r12+128], r9
-        sbb	r10, QWORD PTR [rcx+136]
-        mov	rax, QWORD PTR [r12+144]
-        mov	QWORD PTR [r12+136], r10
-        sbb	rax, QWORD PTR [rcx+144]
-        mov	r9, QWORD PTR [r12+152]
-        mov	QWORD PTR [r12+144], rax
-        sbb	r9, QWORD PTR [rcx+152]
-        mov	r10, QWORD PTR [r12+160]
-        mov	QWORD PTR [r12+152], r9
-        sbb	r10, QWORD PTR [rcx+160]
-        mov	rax, QWORD PTR [r12+168]
-        mov	QWORD PTR [r12+160], r10
-        sbb	rax, QWORD PTR [rcx+168]
-        mov	r9, QWORD PTR [r12+176]
-        mov	QWORD PTR [r12+168], rax
-        sbb	r9, QWORD PTR [rcx+176]
-        mov	r10, QWORD PTR [r12+184]
-        mov	QWORD PTR [r12+176], r9
-        sbb	r10, QWORD PTR [rcx+184]
-        mov	rax, QWORD PTR [r12+192]
-        mov	QWORD PTR [r12+184], r10
-        sbb	rax, QWORD PTR [rcx+192]
-        mov	r9, QWORD PTR [r12+200]
-        mov	QWORD PTR [r12+192], rax
-        sbb	r9, QWORD PTR [rcx+200]
-        mov	r10, QWORD PTR [r12+208]
-        mov	QWORD PTR [r12+200], r9
-        sbb	r10, QWORD PTR [rcx+208]
-        mov	rax, QWORD PTR [r12+216]
-        mov	QWORD PTR [r12+208], r10
-        sbb	rax, QWORD PTR [rcx+216]
-        mov	r9, QWORD PTR [r12+224]
-        mov	QWORD PTR [r12+216], rax
-        sbb	r9, QWORD PTR [rcx+224]
-        mov	r10, QWORD PTR [r12+232]
-        mov	QWORD PTR [r12+224], r9
-        sbb	r10, QWORD PTR [rcx+232]
-        mov	rax, QWORD PTR [r12+240]
-        mov	QWORD PTR [r12+232], r10
-        sbb	rax, QWORD PTR [rcx+240]
-        mov	r9, QWORD PTR [r12+248]
-        mov	QWORD PTR [r12+240], rax
-        sbb	r9, QWORD PTR [rcx+248]
-        mov	r10, QWORD PTR [r12+256]
-        mov	QWORD PTR [r12+248], r9
-        sbb	r10, QWORD PTR [rcx+256]
-        mov	rax, QWORD PTR [r12+264]
-        mov	QWORD PTR [r12+256], r10
-        sbb	rax, QWORD PTR [rcx+264]
-        mov	r9, QWORD PTR [r12+272]
-        mov	QWORD PTR [r12+264], rax
-        sbb	r9, QWORD PTR [rcx+272]
-        mov	r10, QWORD PTR [r12+280]
-        mov	QWORD PTR [r12+272], r9
-        sbb	r10, QWORD PTR [rcx+280]
-        mov	rax, QWORD PTR [r12+288]
-        mov	QWORD PTR [r12+280], r10
-        sbb	rax, QWORD PTR [rcx+288]
-        mov	r9, QWORD PTR [r12+296]
-        mov	QWORD PTR [r12+288], rax
-        sbb	r9, QWORD PTR [rcx+296]
-        mov	r10, QWORD PTR [r12+304]
-        mov	QWORD PTR [r12+296], r9
-        sbb	r10, QWORD PTR [rcx+304]
-        mov	rax, QWORD PTR [r12+312]
-        mov	QWORD PTR [r12+304], r10
-        sbb	rax, QWORD PTR [rcx+312]
-        mov	r9, QWORD PTR [r12+320]
-        mov	QWORD PTR [r12+312], rax
-        sbb	r9, QWORD PTR [rcx+320]
-        mov	r10, QWORD PTR [r12+328]
-        mov	QWORD PTR [r12+320], r9
-        sbb	r10, QWORD PTR [rcx+328]
-        mov	rax, QWORD PTR [r12+336]
-        mov	QWORD PTR [r12+328], r10
-        sbb	rax, QWORD PTR [rcx+336]
-        mov	r9, QWORD PTR [r12+344]
-        mov	QWORD PTR [r12+336], rax
-        sbb	r9, QWORD PTR [rcx+344]
-        mov	r10, QWORD PTR [r12+352]
-        mov	QWORD PTR [r12+344], r9
-        sbb	r10, QWORD PTR [rcx+352]
-        mov	rax, QWORD PTR [r12+360]
-        mov	QWORD PTR [r12+352], r10
-        sbb	rax, QWORD PTR [rcx+360]
-        mov	r9, QWORD PTR [r12+368]
-        mov	QWORD PTR [r12+360], rax
-        sbb	r9, QWORD PTR [rcx+368]
-        mov	r10, QWORD PTR [r12+376]
-        mov	QWORD PTR [r12+368], r9
-        sbb	r10, QWORD PTR [rcx+376]
-        mov	QWORD PTR [r12+376], r10
-        sbb	r11, 0
-        sub	rsi, 192
-        ; Add
-        mov	rax, QWORD PTR [rsi]
-        add	rax, QWORD PTR [r12]
-        mov	r9, QWORD PTR [rsi+8]
-        mov	QWORD PTR [rsi], rax
-        adc	r9, QWORD PTR [r12+8]
-        mov	r10, QWORD PTR [rsi+16]
-        mov	QWORD PTR [rsi+8], r9
-        adc	r10, QWORD PTR [r12+16]
-        mov	rax, QWORD PTR [rsi+24]
-        mov	QWORD PTR [rsi+16], r10
-        adc	rax, QWORD PTR [r12+24]
-        mov	r9, QWORD PTR [rsi+32]
-        mov	QWORD PTR [rsi+24], rax
-        adc	r9, QWORD PTR [r12+32]
-        mov	r10, QWORD PTR [rsi+40]
-        mov	QWORD PTR [rsi+32], r9
-        adc	r10, QWORD PTR [r12+40]
-        mov	rax, QWORD PTR [rsi+48]
-        mov	QWORD PTR [rsi+40], r10
-        adc	rax, QWORD PTR [r12+48]
-        mov	r9, QWORD PTR [rsi+56]
-        mov	QWORD PTR [rsi+48], rax
-        adc	r9, QWORD PTR [r12+56]
-        mov	r10, QWORD PTR [rsi+64]
-        mov	QWORD PTR [rsi+56], r9
-        adc	r10, QWORD PTR [r12+64]
-        mov	rax, QWORD PTR [rsi+72]
-        mov	QWORD PTR [rsi+64], r10
-        adc	rax, QWORD PTR [r12+72]
-        mov	r9, QWORD PTR [rsi+80]
-        mov	QWORD PTR [rsi+72], rax
-        adc	r9, QWORD PTR [r12+80]
-        mov	r10, QWORD PTR [rsi+88]
-        mov	QWORD PTR [rsi+80], r9
-        adc	r10, QWORD PTR [r12+88]
-        mov	rax, QWORD PTR [rsi+96]
-        mov	QWORD PTR [rsi+88], r10
-        adc	rax, QWORD PTR [r12+96]
-        mov	r9, QWORD PTR [rsi+104]
-        mov	QWORD PTR [rsi+96], rax
-        adc	r9, QWORD PTR [r12+104]
-        mov	r10, QWORD PTR [rsi+112]
-        mov	QWORD PTR [rsi+104], r9
-        adc	r10, QWORD PTR [r12+112]
-        mov	rax, QWORD PTR [rsi+120]
-        mov	QWORD PTR [rsi+112], r10
-        adc	rax, QWORD PTR [r12+120]
-        mov	r9, QWORD PTR [rsi+128]
-        mov	QWORD PTR [rsi+120], rax
-        adc	r9, QWORD PTR [r12+128]
-        mov	r10, QWORD PTR [rsi+136]
-        mov	QWORD PTR [rsi+128], r9
-        adc	r10, QWORD PTR [r12+136]
-        mov	rax, QWORD PTR [rsi+144]
-        mov	QWORD PTR [rsi+136], r10
-        adc	rax, QWORD PTR [r12+144]
-        mov	r9, QWORD PTR [rsi+152]
-        mov	QWORD PTR [rsi+144], rax
-        adc	r9, QWORD PTR [r12+152]
-        mov	r10, QWORD PTR [rsi+160]
-        mov	QWORD PTR [rsi+152], r9
-        adc	r10, QWORD PTR [r12+160]
-        mov	rax, QWORD PTR [rsi+168]
-        mov	QWORD PTR [rsi+160], r10
-        adc	rax, QWORD PTR [r12+168]
-        mov	r9, QWORD PTR [rsi+176]
-        mov	QWORD PTR [rsi+168], rax
-        adc	r9, QWORD PTR [r12+176]
-        mov	r10, QWORD PTR [rsi+184]
-        mov	QWORD PTR [rsi+176], r9
-        adc	r10, QWORD PTR [r12+184]
-        mov	rax, QWORD PTR [rsi+192]
-        mov	QWORD PTR [rsi+184], r10
-        adc	rax, QWORD PTR [r12+192]
-        mov	r9, QWORD PTR [rsi+200]
-        mov	QWORD PTR [rsi+192], rax
-        adc	r9, QWORD PTR [r12+200]
-        mov	r10, QWORD PTR [rsi+208]
-        mov	QWORD PTR [rsi+200], r9
-        adc	r10, QWORD PTR [r12+208]
-        mov	rax, QWORD PTR [rsi+216]
-        mov	QWORD PTR [rsi+208], r10
-        adc	rax, QWORD PTR [r12+216]
-        mov	r9, QWORD PTR [rsi+224]
-        mov	QWORD PTR [rsi+216], rax
-        adc	r9, QWORD PTR [r12+224]
-        mov	r10, QWORD PTR [rsi+232]
-        mov	QWORD PTR [rsi+224], r9
-        adc	r10, QWORD PTR [r12+232]
-        mov	rax, QWORD PTR [rsi+240]
-        mov	QWORD PTR [rsi+232], r10
-        adc	rax, QWORD PTR [r12+240]
-        mov	r9, QWORD PTR [rsi+248]
-        mov	QWORD PTR [rsi+240], rax
-        adc	r9, QWORD PTR [r12+248]
-        mov	r10, QWORD PTR [rsi+256]
-        mov	QWORD PTR [rsi+248], r9
-        adc	r10, QWORD PTR [r12+256]
-        mov	rax, QWORD PTR [rsi+264]
-        mov	QWORD PTR [rsi+256], r10
-        adc	rax, QWORD PTR [r12+264]
-        mov	r9, QWORD PTR [rsi+272]
-        mov	QWORD PTR [rsi+264], rax
-        adc	r9, QWORD PTR [r12+272]
-        mov	r10, QWORD PTR [rsi+280]
-        mov	QWORD PTR [rsi+272], r9
-        adc	r10, QWORD PTR [r12+280]
-        mov	rax, QWORD PTR [rsi+288]
-        mov	QWORD PTR [rsi+280], r10
-        adc	rax, QWORD PTR [r12+288]
-        mov	r9, QWORD PTR [rsi+296]
-        mov	QWORD PTR [rsi+288], rax
-        adc	r9, QWORD PTR [r12+296]
-        mov	r10, QWORD PTR [rsi+304]
-        mov	QWORD PTR [rsi+296], r9
-        adc	r10, QWORD PTR [r12+304]
-        mov	rax, QWORD PTR [rsi+312]
-        mov	QWORD PTR [rsi+304], r10
-        adc	rax, QWORD PTR [r12+312]
-        mov	r9, QWORD PTR [rsi+320]
-        mov	QWORD PTR [rsi+312], rax
-        adc	r9, QWORD PTR [r12+320]
-        mov	r10, QWORD PTR [rsi+328]
-        mov	QWORD PTR [rsi+320], r9
-        adc	r10, QWORD PTR [r12+328]
-        mov	rax, QWORD PTR [rsi+336]
-        mov	QWORD PTR [rsi+328], r10
-        adc	rax, QWORD PTR [r12+336]
-        mov	r9, QWORD PTR [rsi+344]
-        mov	QWORD PTR [rsi+336], rax
-        adc	r9, QWORD PTR [r12+344]
-        mov	r10, QWORD PTR [rsi+352]
-        mov	QWORD PTR [rsi+344], r9
-        adc	r10, QWORD PTR [r12+352]
-        mov	rax, QWORD PTR [rsi+360]
-        mov	QWORD PTR [rsi+352], r10
-        adc	rax, QWORD PTR [r12+360]
-        mov	r9, QWORD PTR [rsi+368]
-        mov	QWORD PTR [rsi+360], rax
-        adc	r9, QWORD PTR [r12+368]
-        mov	r10, QWORD PTR [rsi+376]
-        mov	QWORD PTR [rsi+368], r9
-        adc	r10, QWORD PTR [r12+376]
-        mov	QWORD PTR [rsi+376], r10
-        adc	r11, 0
-        mov	QWORD PTR [rcx+576], r11
-        add	rsi, 192
-        ; Add
-        mov	rax, QWORD PTR [rsi]
-        add	rax, QWORD PTR [r13]
-        mov	r9, QWORD PTR [rsi+8]
-        mov	QWORD PTR [rsi], rax
-        adc	r9, QWORD PTR [r13+8]
-        mov	r10, QWORD PTR [rsi+16]
-        mov	QWORD PTR [rsi+8], r9
-        adc	r10, QWORD PTR [r13+16]
-        mov	rax, QWORD PTR [rsi+24]
-        mov	QWORD PTR [rsi+16], r10
-        adc	rax, QWORD PTR [r13+24]
-        mov	r9, QWORD PTR [rsi+32]
-        mov	QWORD PTR [rsi+24], rax
-        adc	r9, QWORD PTR [r13+32]
-        mov	r10, QWORD PTR [rsi+40]
-        mov	QWORD PTR [rsi+32], r9
-        adc	r10, QWORD PTR [r13+40]
-        mov	rax, QWORD PTR [rsi+48]
-        mov	QWORD PTR [rsi+40], r10
-        adc	rax, QWORD PTR [r13+48]
-        mov	r9, QWORD PTR [rsi+56]
-        mov	QWORD PTR [rsi+48], rax
-        adc	r9, QWORD PTR [r13+56]
-        mov	r10, QWORD PTR [rsi+64]
-        mov	QWORD PTR [rsi+56], r9
-        adc	r10, QWORD PTR [r13+64]
-        mov	rax, QWORD PTR [rsi+72]
-        mov	QWORD PTR [rsi+64], r10
-        adc	rax, QWORD PTR [r13+72]
-        mov	r9, QWORD PTR [rsi+80]
-        mov	QWORD PTR [rsi+72], rax
-        adc	r9, QWORD PTR [r13+80]
-        mov	r10, QWORD PTR [rsi+88]
-        mov	QWORD PTR [rsi+80], r9
-        adc	r10, QWORD PTR [r13+88]
-        mov	rax, QWORD PTR [rsi+96]
-        mov	QWORD PTR [rsi+88], r10
-        adc	rax, QWORD PTR [r13+96]
-        mov	r9, QWORD PTR [rsi+104]
-        mov	QWORD PTR [rsi+96], rax
-        adc	r9, QWORD PTR [r13+104]
-        mov	r10, QWORD PTR [rsi+112]
-        mov	QWORD PTR [rsi+104], r9
-        adc	r10, QWORD PTR [r13+112]
-        mov	rax, QWORD PTR [rsi+120]
-        mov	QWORD PTR [rsi+112], r10
-        adc	rax, QWORD PTR [r13+120]
-        mov	r9, QWORD PTR [rsi+128]
-        mov	QWORD PTR [rsi+120], rax
-        adc	r9, QWORD PTR [r13+128]
-        mov	r10, QWORD PTR [rsi+136]
-        mov	QWORD PTR [rsi+128], r9
-        adc	r10, QWORD PTR [r13+136]
-        mov	rax, QWORD PTR [rsi+144]
-        mov	QWORD PTR [rsi+136], r10
-        adc	rax, QWORD PTR [r13+144]
-        mov	r9, QWORD PTR [rsi+152]
-        mov	QWORD PTR [rsi+144], rax
-        adc	r9, QWORD PTR [r13+152]
-        mov	r10, QWORD PTR [rsi+160]
-        mov	QWORD PTR [rsi+152], r9
-        adc	r10, QWORD PTR [r13+160]
-        mov	rax, QWORD PTR [rsi+168]
-        mov	QWORD PTR [rsi+160], r10
-        adc	rax, QWORD PTR [r13+168]
-        mov	r9, QWORD PTR [rsi+176]
-        mov	QWORD PTR [rsi+168], rax
-        adc	r9, QWORD PTR [r13+176]
-        mov	r10, QWORD PTR [rsi+184]
-        mov	QWORD PTR [rsi+176], r9
-        adc	r10, QWORD PTR [r13+184]
-        mov	rax, QWORD PTR [rsi+192]
-        mov	QWORD PTR [rsi+184], r10
-        adc	rax, QWORD PTR [r13+192]
-        mov	QWORD PTR [rsi+192], rax
-        ; Add to zero
-        mov	rax, QWORD PTR [r13+200]
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+208]
-        mov	QWORD PTR [rsi+200], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+216]
-        mov	QWORD PTR [rsi+208], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+224]
-        mov	QWORD PTR [rsi+216], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+232]
-        mov	QWORD PTR [rsi+224], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+240]
-        mov	QWORD PTR [rsi+232], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+248]
-        mov	QWORD PTR [rsi+240], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+256]
-        mov	QWORD PTR [rsi+248], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+264]
-        mov	QWORD PTR [rsi+256], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+272]
-        mov	QWORD PTR [rsi+264], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+280]
-        mov	QWORD PTR [rsi+272], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+288]
-        mov	QWORD PTR [rsi+280], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+296]
-        mov	QWORD PTR [rsi+288], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+304]
-        mov	QWORD PTR [rsi+296], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+312]
-        mov	QWORD PTR [rsi+304], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+320]
-        mov	QWORD PTR [rsi+312], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+328]
-        mov	QWORD PTR [rsi+320], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+336]
-        mov	QWORD PTR [rsi+328], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+344]
-        mov	QWORD PTR [rsi+336], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+352]
-        mov	QWORD PTR [rsi+344], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+360]
-        mov	QWORD PTR [rsi+352], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+368]
-        mov	QWORD PTR [rsi+360], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+376]
-        mov	QWORD PTR [rsi+368], rax
-        adc	r9, 0
-        mov	QWORD PTR [rsi+376], r9
-        add	rsp, 1192
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_3072_mul_avx2_48 ENDP
-_text ENDS
-ENDIF
-; /* Square a and put result in r. (r = a * a)  The product is built one 64-bit result column at a time.
-;  * Columns 0-11 are staged in a 96-byte stack buffer and copied to r at the end; columns 12-23 are stored straight to r.
-;  * r  A single precision integer. Pointer arrives in rcx; 24 qwords (offsets 0..184) are written.
-;  * a  A single precision integer. Pointer arrives in rdx and is moved to r8; 12 qwords (offsets 0..88) are read.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_sqr_12 PROC                    ; NOTE(review): plain PROC, no unwind directives visible although rsp and r12-r14 change - confirm intended
-        push	r12                     ; r12-r14 hold the cross-product sum from column 5 on; restored before ret
-        push	r13
-        push	r14
-        mov	r8, rdx                 ; r8 = a, because every mul overwrites rdx
-        sub	rsp, 96                 ; 12-qword staging buffer for result words 0..11
-        ; A[0] * A[0]  (column 0)
-        mov	rax, QWORD PTR [r8]
-        mul	rax
-        xor	r11, r11                ; r9/r10/r11 form a rotating 3-word column accumulator
-        mov	QWORD PTR [rsp], rax    ; result word 0 goes to the staging buffer
-        mov	r10, rdx                ; high half carries into column 1
-        ; A[0] * A[1]  (column 1; in columns 1-4 each cross product is simply added twice)
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r8]
-        xor	r9, r9                  ; fresh top word of the accumulator
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        add	r10, rax                ; second addition of the same product (2 * A[0] * A[1])
-        adc	r11, rdx
-        adc	r9, 0
-        mov	QWORD PTR [rsp+8], r10
-        ; A[0] * A[2]  (column 2)
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r8]
-        xor	r10, r10
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        ; A[1] * A[1]  (square term, added once)
-        mov	rax, QWORD PTR [r8+8]
-        mul	rax
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rsp+16], r11
-        ; A[0] * A[3]  (column 3)
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r8]
-        xor	r11, r11
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[1] * A[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r8+8]
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rsp+24], r9
-        ; A[0] * A[4]  (column 4)
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r8]
-        xor	r9, r9
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        ; A[1] * A[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r8+8]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        ; A[2] * A[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	rax
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        mov	QWORD PTR [rsp+32], r10
-        ; A[0] * A[5]  (column 5; in columns 5-17 cross products are summed in r14:r13:r12 and doubled once)
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8]
-        xor	r10, r10                ; fresh top word of the column accumulator
-        xor	r14, r14                ; top word of the cross-product sum
-        mov	r12, rax                ; first product seeds the sum
-        mov	r13, rdx
-        ; A[1] * A[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12                ; double the 3-word cross-product sum
-        adc	r13, r13
-        adc	r14, r14
-        add	r11, r12                ; fold the doubled sum into the accumulator (low word r11, then r9, r10)
-        adc	r9, r13
-        adc	r10, r14
-        mov	QWORD PTR [rsp+40], r11
-        ; A[0] * A[6]  (column 6)
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8]
-        xor	r11, r11
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[3]  (square term: multiplied first, added once after the doubling below)
-        mov	rax, QWORD PTR [r8+24]
-        mul	rax                     ; result sits in rdx:rax; r12-r14 are untouched by mul
-        add	r12, r12                ; double the cross products only
-        adc	r13, r13
-        adc	r14, r14
-        add	r12, rax                ; then add the square term a single time
-        adc	r13, rdx
-        adc	r14, 0
-        add	r9, r12
-        adc	r10, r13
-        adc	r11, r14
-        mov	QWORD PTR [rsp+48], r9
-        ; A[0] * A[7]  (column 7)
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8]
-        xor	r9, r9
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r10, r12
-        adc	r11, r13
-        adc	r9, r14
-        mov	QWORD PTR [rsp+56], r10
-        ; A[0] * A[8]  (column 8)
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8]
-        xor	r10, r10
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * A[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	rax
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r11, r12
-        adc	r9, r13
-        adc	r10, r14
-        mov	QWORD PTR [rsp+64], r11
-        ; A[0] * A[9]  (column 9)
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r8]
-        xor	r11, r11
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8+32]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r9, r12
-        adc	r10, r13
-        adc	r11, r14
-        mov	QWORD PTR [rsp+72], r9
-        ; A[0] * A[10]  (column 10)
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8]
-        xor	r9, r9
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8+32]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[5] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	rax
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r10, r12
-        adc	r11, r13
-        adc	r9, r14
-        mov	QWORD PTR [rsp+80], r10
-        ; A[0] * A[11]  (column 11, last word that goes to the staging buffer)
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8]
-        xor	r10, r10
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+32]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[5] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8+40]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r11, r12
-        adc	r9, r13
-        adc	r10, r14
-        mov	QWORD PTR [rsp+88], r11
-        ; A[1] * A[11]  (column 12, first word stored directly to r)
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+8]
-        xor	r11, r11
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[2] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+32]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[5] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+40]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[6] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	rax
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r9, r12
-        adc	r10, r13
-        adc	r11, r14
-        mov	QWORD PTR [rcx+96], r9  ; a is only read at offsets 0..88, so this store cannot hit an unread input word if r == a
-        ; A[2] * A[11]  (column 13)
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+16]
-        xor	r9, r9
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[3] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * A[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r8+32]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[5] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+40]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[6] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+48]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r10, r12
-        adc	r11, r13
-        adc	r9, r14
-        mov	QWORD PTR [rcx+104], r10
-        ; A[3] * A[11]  (column 14)
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+24]
-        xor	r10, r10
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[4] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8+32]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[5] * A[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r8+40]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[6] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+48]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[7] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	rax
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r11, r12
-        adc	r9, r13
-        adc	r10, r14
-        mov	QWORD PTR [rcx+112], r11
-        ; A[4] * A[11]  (column 15)
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+32]
-        xor	r11, r11
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[5] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8+40]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[6] * A[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r8+48]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[7] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+56]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r9, r12
-        adc	r10, r13
-        adc	r11, r14
-        mov	QWORD PTR [rcx+120], r9
-        ; A[5] * A[11]  (column 16)
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+40]
-        xor	r9, r9
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[6] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8+48]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[7] * A[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r8+56]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[8] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	rax
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r10, r12
-        adc	r11, r13
-        adc	r9, r14
-        mov	QWORD PTR [rcx+128], r10
-        ; A[6] * A[11]  (column 17, last column that uses the r14:r13:r12 sum)
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+48]
-        xor	r10, r10
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[7] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8+56]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[8] * A[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r8+64]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r11, r12
-        adc	r9, r13
-        adc	r10, r14
-        mov	QWORD PTR [rcx+136], r11
-        ; A[7] * A[11]  (column 18; columns 18-21 go back to adding each cross product twice)
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+56]
-        xor	r11, r11
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[8] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8+64]
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[9] * A[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	rax
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rcx+144], r9
-        ; A[8] * A[11]  (column 19)
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+64]
-        xor	r9, r9
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        ; A[9] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8+72]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        mov	QWORD PTR [rcx+152], r10
-        ; A[9] * A[11]  (column 20)
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+72]
-        xor	r10, r10
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        ; A[10] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	rax
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rcx+160], r11
-        ; A[10] * A[11]  (column 21)
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+80]
-        xor	r11, r11
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rcx+168], r9
-        ; A[11] * A[11]  (column 22; its high half plus carry becomes the final word 23)
-        mov	rax, QWORD PTR [r8+88]
-        mul	rax
-        add	r10, rax
-        adc	r11, rdx
-        mov	QWORD PTR [rcx+176], r10
-        mov	QWORD PTR [rcx+184], r11
-        mov	rax, QWORD PTR [rsp]    ; all reads of a are done: copy staged words 0..11 to r, four at a time
-        mov	rdx, QWORD PTR [rsp+8]
-        mov	r12, QWORD PTR [rsp+16]
-        mov	r13, QWORD PTR [rsp+24]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], rdx
-        mov	QWORD PTR [rcx+16], r12
-        mov	QWORD PTR [rcx+24], r13
-        mov	rax, QWORD PTR [rsp+32]
-        mov	rdx, QWORD PTR [rsp+40]
-        mov	r12, QWORD PTR [rsp+48]
-        mov	r13, QWORD PTR [rsp+56]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], rdx
-        mov	QWORD PTR [rcx+48], r12
-        mov	QWORD PTR [rcx+56], r13
-        mov	rax, QWORD PTR [rsp+64]
-        mov	rdx, QWORD PTR [rsp+72]
-        mov	r12, QWORD PTR [rsp+80]
-        mov	r13, QWORD PTR [rsp+88]
-        mov	QWORD PTR [rcx+64], rax
-        mov	QWORD PTR [rcx+72], rdx
-        mov	QWORD PTR [rcx+80], r12
-        mov	QWORD PTR [rcx+88], r13
-        add	rsp, 96                 ; release the staging buffer
-        pop	r14                     ; restore saved registers in reverse push order
-        pop	r13
-        pop	r12
-        ret
-sp_3072_sqr_12 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Square a and put result in r. (r = a * a)
-;  *
-;  * r  A single precision integer.
-;  * a  A single precision integer.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_sqr_avx2_12 PROC
-        push	rbp
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        push	rbx
-        mov	r8, rcx
-        mov	r9, rdx
-        sub	rsp, 96
-        cmp	r9, r8
-        mov	rbp, rsp
-        cmovne	rbp, r8
-        add	r8, 96
-        xor	r12, r12
-        ; Diagonal 1
-        ; Zero into %r9
-        ; A[1] x A[0]
-        mov	rdx, QWORD PTR [r9]
-        mulx	r11, r10, QWORD PTR [r9+8]
-        mov	QWORD PTR [rbp+8], r10
-        ; Zero into %r8
-        ; A[2] x A[0]
-        mulx	r10, rax, QWORD PTR [r9+16]
-        adcx	r11, rax
-        adox	r10, r12
-        mov	QWORD PTR [rbp+16], r11
-        ; Zero into %r9
-        ; A[3] x A[0]
-        mulx	r11, rax, QWORD PTR [r9+24]
-        adcx	r10, rax
-        adox	r11, r12
-        mov	QWORD PTR [rbp+24], r10
-        ; Zero into %r8
-        ; A[4] x A[0]
-        mulx	r10, rax, QWORD PTR [r9+32]
-        adcx	r11, rax
-        adox	r10, r12
-        mov	QWORD PTR [rbp+32], r11
-        ; Zero into %r9
-        ; A[5] x A[0]
-        mulx	r11, rax, QWORD PTR [r9+40]
-        adcx	r10, rax
-        adox	r11, r12
-        mov	QWORD PTR [rbp+40], r10
-        ; No load %r12 - %r8
-        ; A[6] x A[0]
-        mulx	r14, rax, QWORD PTR [r9+48]
-        adcx	r11, rax
-        adox	r14, r12
-        mov	QWORD PTR [rbp+48], r11
-        ; No load %r13 - %r9
-        ; A[7] x A[0]
-        mulx	r15, rax, QWORD PTR [r9+56]
-        adcx	r14, rax
-        adox	r15, r12
-        ; No store %r12 - %r8
-        ; No load %r14 - %r8
-        ; A[8] x A[0]
-        mulx	rdi, rax, QWORD PTR [r9+64]
-        adcx	r15, rax
-        adox	rdi, r12
-        ; No store %r13 - %r9
-        ; No load %r15 - %r9
-        ; A[9] x A[0]
-        mulx	rsi, rax, QWORD PTR [r9+72]
-        adcx	rdi, rax
-        adox	rsi, r12
-        ; No store %r14 - %r8
-        ; No load %rbx - %r8
-        ; A[10] x A[0]
-        mulx	rbx, rax, QWORD PTR [r9+80]
-        adcx	rsi, rax
-        adox	rbx, r12
-        ; No store %r15 - %r9
-        ; Zero into %r9
-        ; A[11] x A[0]
-        mulx	r11, rax, QWORD PTR [r9+88]
-        adcx	rbx, rax
-        adox	r11, r12
-        ; No store %rbx - %r8
-        ;  Carry
-        adcx	r11, r12
-        mov	r13, r12
-        adcx	r13, r12
-        adox	r13, r12
-        mov	QWORD PTR [r8], r11
-        ; Diagonal 2
-        mov	r11, QWORD PTR [rbp+24]
-        mov	r10, QWORD PTR [rbp+32]
-        ; A[2] x A[1]
-        mov	rdx, QWORD PTR [r9+8]
-        mulx	rcx, rax, QWORD PTR [r9+16]
-        adcx	r11, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbp+24], r11
-        mov	r11, QWORD PTR [rbp+40]
-        ; A[3] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+24]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbp+32], r10
-        mov	r10, QWORD PTR [rbp+48]
-        ; A[4] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+32]
-        adcx	r11, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbp+40], r11
-        ; No load %r12 - %r9
-        ; A[5] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+40]
-        adcx	r10, rax
-        adox	r14, rcx
-        mov	QWORD PTR [rbp+48], r10
-        ; No load %r13 - %r8
-        ; A[6] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+48]
-        adcx	r14, rax
-        adox	r15, rcx
-        ; No store %r12 - %r9
-        ; No load %r14 - %r9
-        ; A[7] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+56]
-        adcx	r15, rax
-        adox	rdi, rcx
-        ; No store %r13 - %r8
-        ; No load %r15 - %r8
-        ; A[8] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+64]
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; No store %r14 - %r9
-        ; No load %rbx - %r9
-        ; A[9] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+72]
-        adcx	rsi, rax
-        adox	rbx, rcx
-        ; No store %r15 - %r8
-        mov	r10, QWORD PTR [r8]
-        ; A[10] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+80]
-        adcx	rbx, rax
-        adox	r10, rcx
-        ; No store %rbx - %r9
-        ; Zero into %r9
-        ; A[11] x A[1]
-        mulx	r11, rax, QWORD PTR [r9+88]
-        adcx	r10, rax
-        adox	r11, r12
-        mov	QWORD PTR [r8], r10
-        ; Zero into %r8
-        ; A[11] x A[2]
-        mov	rdx, QWORD PTR [r9+16]
-        mulx	r10, rax, QWORD PTR [r9+88]
-        adcx	r11, rax
-        adox	r10, r12
-        mov	QWORD PTR [r8+8], r11
-        ;  Carry
-        adcx	r10, r13
-        mov	r13, r12
-        adcx	r13, r12
-        adox	r13, r12
-        mov	QWORD PTR [r8+16], r10
-        ; Diagonal 3
-        mov	r10, QWORD PTR [rbp+40]
-        mov	r11, QWORD PTR [rbp+48]
-        ; A[3] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+24]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbp+40], r10
-        ; No load %r12 - %r8
-        ; A[4] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+32]
-        adcx	r11, rax
-        adox	r14, rcx
-        mov	QWORD PTR [rbp+48], r11
-        ; No load %r13 - %r9
-        ; A[5] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+40]
-        adcx	r14, rax
-        adox	r15, rcx
-        ; No store %r12 - %r8
-        ; No load %r14 - %r8
-        ; A[6] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+48]
-        adcx	r15, rax
-        adox	rdi, rcx
-        ; No store %r13 - %r9
-        ; No load %r15 - %r9
-        ; A[7] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+56]
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; No store %r14 - %r8
-        ; No load %rbx - %r8
-        ; A[8] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+64]
-        adcx	rsi, rax
-        adox	rbx, rcx
-        ; No store %r15 - %r9
-        mov	r11, QWORD PTR [r8]
-        ; A[9] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+72]
-        adcx	rbx, rax
-        adox	r11, rcx
-        ; No store %rbx - %r8
-        mov	r10, QWORD PTR [r8+8]
-        ; A[10] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+80]
-        adcx	r11, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8], r11
-        mov	r11, QWORD PTR [r8+16]
-        ; A[10] x A[3]
-        mov	rdx, QWORD PTR [r9+24]
-        mulx	rcx, rax, QWORD PTR [r9+80]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+8], r10
-        ; Zero into %r8
-        ; A[10] x A[4]
-        mov	rdx, QWORD PTR [r9+32]
-        mulx	r10, rax, QWORD PTR [r9+80]
-        adcx	r11, rax
-        adox	r10, r12
-        mov	QWORD PTR [r8+16], r11
-        ; Zero into %r9
-        ; A[10] x A[5]
-        mov	rdx, QWORD PTR [r9+40]
-        mulx	r11, rax, QWORD PTR [r9+80]
-        adcx	r10, rax
-        adox	r11, r12
-        mov	QWORD PTR [r8+24], r10
-        ;  Carry
-        adcx	r11, r13
-        mov	r13, r12
-        adcx	r13, r12
-        adox	r13, r12
-        mov	QWORD PTR [r8+32], r11
-        ; Diagonal 4
-        ; No load %r13 - %r8
-        ; A[4] x A[3]
-        mov	rdx, QWORD PTR [r9+24]
-        mulx	rcx, rax, QWORD PTR [r9+32]
-        adcx	r14, rax
-        adox	r15, rcx
-        ; No store %r12 - %r9
-        ; No load %r14 - %r9
-        ; A[5] x A[3]
-        mulx	rcx, rax, QWORD PTR [r9+40]
-        adcx	r15, rax
-        adox	rdi, rcx
-        ; No store %r13 - %r8
-        ; No load %r15 - %r8
-        ; A[6] x A[3]
-        mulx	rcx, rax, QWORD PTR [r9+48]
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; No store %r14 - %r9
-        ; No load %rbx - %r9
-        ; A[7] x A[3]
-        mulx	rcx, rax, QWORD PTR [r9+56]
-        adcx	rsi, rax
-        adox	rbx, rcx
-        ; No store %r15 - %r8
-        mov	r10, QWORD PTR [r8]
-        ; A[8] x A[3]
-        mulx	rcx, rax, QWORD PTR [r9+64]
-        adcx	rbx, rax
-        adox	r10, rcx
-        ; No store %rbx - %r9
-        mov	r11, QWORD PTR [r8+8]
-        ; A[9] x A[3]
-        mulx	rcx, rax, QWORD PTR [r9+72]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8], r10
-        mov	r10, QWORD PTR [r8+16]
-        ; A[9] x A[4]
-        mov	rdx, QWORD PTR [r9+32]
-        mulx	rcx, rax, QWORD PTR [r9+72]
-        adcx	r11, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+8], r11
-        mov	r11, QWORD PTR [r8+24]
-        ; A[9] x A[5]
-        mov	rdx, QWORD PTR [r9+40]
-        mulx	rcx, rax, QWORD PTR [r9+72]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+16], r10
-        mov	r10, QWORD PTR [r8+32]
-        ; A[9] x A[6]
-        mov	rdx, QWORD PTR [r9+48]
-        mulx	rcx, rax, QWORD PTR [r9+72]
-        adcx	r11, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+24], r11
-        ; Zero into %r9
-        ; A[9] x A[7]
-        mov	rdx, QWORD PTR [r9+56]
-        mulx	r11, rax, QWORD PTR [r9+72]
-        adcx	r10, rax
-        adox	r11, r12
-        mov	QWORD PTR [r8+32], r10
-        ; Zero into %r8
-        ; A[9] x A[8]
-        mov	rdx, QWORD PTR [r9+64]
-        mulx	r10, rax, QWORD PTR [r9+72]
-        adcx	r11, rax
-        adox	r10, r12
-        mov	QWORD PTR [r8+40], r11
-        ;  Carry
-        adcx	r10, r13
-        mov	r13, r12
-        adcx	r13, r12
-        adox	r13, r12
-        mov	QWORD PTR [r8+48], r10
-        ; Diagonal 5
-        ; No load %r15 - %r9
-        ; A[5] x A[4]
-        mov	rdx, QWORD PTR [r9+32]
-        mulx	rcx, rax, QWORD PTR [r9+40]
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; No store %r14 - %r8
-        ; No load %rbx - %r8
-        ; A[6] x A[4]
-        mulx	rcx, rax, QWORD PTR [r9+48]
-        adcx	rsi, rax
-        adox	rbx, rcx
-        ; No store %r15 - %r9
-        mov	r11, QWORD PTR [r8]
-        ; A[7] x A[4]
-        mulx	rcx, rax, QWORD PTR [r9+56]
-        adcx	rbx, rax
-        adox	r11, rcx
-        ; No store %rbx - %r8
-        mov	r10, QWORD PTR [r8+8]
-        ; A[8] x A[4]
-        mulx	rcx, rax, QWORD PTR [r9+64]
-        adcx	r11, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8], r11
-        mov	r11, QWORD PTR [r8+16]
-        ; A[8] x A[5]
-        mov	rdx, QWORD PTR [r9+40]
-        mulx	rcx, rax, QWORD PTR [r9+64]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+8], r10
-        mov	r10, QWORD PTR [r8+24]
-        ; A[8] x A[6]
-        mov	rdx, QWORD PTR [r9+48]
-        mulx	rcx, rax, QWORD PTR [r9+64]
-        adcx	r11, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+16], r11
-        mov	r11, QWORD PTR [r8+32]
-        ; A[8] x A[7]
-        mov	rdx, QWORD PTR [r9+56]
-        mulx	rcx, rax, QWORD PTR [r9+64]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+24], r10
-        mov	r10, QWORD PTR [r8+40]
-        ; A[10] x A[6]
-        mov	rdx, QWORD PTR [r9+48]
-        mulx	rcx, rax, QWORD PTR [r9+80]
-        adcx	r11, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+32], r11
-        mov	r11, QWORD PTR [r8+48]
-        ; A[10] x A[7]
-        mov	rdx, QWORD PTR [r9+56]
-        mulx	rcx, rax, QWORD PTR [r9+80]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+40], r10
-        ; Zero into %r8
-        ; A[10] x A[8]
-        mov	rdx, QWORD PTR [r9+64]
-        mulx	r10, rax, QWORD PTR [r9+80]
-        adcx	r11, rax
-        adox	r10, r12
-        mov	QWORD PTR [r8+48], r11
-        ; Zero into %r9
-        ; A[10] x A[9]
-        mov	rdx, QWORD PTR [r9+72]
-        mulx	r11, rax, QWORD PTR [r9+80]
-        adcx	r10, rax
-        adox	r11, r12
-        mov	QWORD PTR [r8+56], r10
-        ;  Carry
-        adcx	r11, r13
-        mov	r13, r12
-        adcx	r13, r12
-        adox	r13, r12
-        mov	QWORD PTR [r8+64], r11
-        ; Diagonal 6
-        mov	r10, QWORD PTR [r8]
-        ; A[6] x A[5]
-        mov	rdx, QWORD PTR [r9+40]
-        mulx	rcx, rax, QWORD PTR [r9+48]
-        adcx	rbx, rax
-        adox	r10, rcx
-        ; No store %rbx - %r9
-        mov	r11, QWORD PTR [r8+8]
-        ; A[7] x A[5]
-        mulx	rcx, rax, QWORD PTR [r9+56]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8], r10
-        mov	r10, QWORD PTR [r8+16]
-        ; A[7] x A[6]
-        mov	rdx, QWORD PTR [r9+48]
-        mulx	rcx, rax, QWORD PTR [r9+56]
-        adcx	r11, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+8], r11
-        mov	r11, QWORD PTR [r8+24]
-        ; A[11] x A[3]
-        mov	rdx, QWORD PTR [r9+24]
-        mulx	rcx, rax, QWORD PTR [r9+88]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+16], r10
-        mov	r10, QWORD PTR [r8+32]
-        ; A[11] x A[4]
-        mov	rdx, QWORD PTR [r9+32]
-        mulx	rcx, rax, QWORD PTR [r9+88]
-        adcx	r11, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+24], r11
-        mov	r11, QWORD PTR [r8+40]
-        ; A[11] x A[5]
-        mov	rdx, QWORD PTR [r9+40]
-        mulx	rcx, rax, QWORD PTR [r9+88]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+32], r10
-        mov	r10, QWORD PTR [r8+48]
-        ; A[11] x A[6]
-        mov	rdx, QWORD PTR [r9+48]
-        mulx	rcx, rax, QWORD PTR [r9+88]
-        adcx	r11, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+40], r11
-        mov	r11, QWORD PTR [r8+56]
-        ; A[11] x A[7]
-        mov	rdx, QWORD PTR [r9+56]
-        mulx	rcx, rax, QWORD PTR [r9+88]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+48], r10
-        mov	r10, QWORD PTR [r8+64]
-        ; A[11] x A[8]
-        mov	rdx, QWORD PTR [r9+64]
-        mulx	rcx, rax, QWORD PTR [r9+88]
-        adcx	r11, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+56], r11
-        ; Zero into %r9
-        ; A[11] x A[9]
-        mov	rdx, QWORD PTR [r9+72]
-        mulx	r11, rax, QWORD PTR [r9+88]
-        adcx	r10, rax
-        adox	r11, r12
-        mov	QWORD PTR [r8+64], r10
-        ; Zero into %r8
-        ; A[11] x A[10]
-        mov	rdx, QWORD PTR [r9+80]
-        mulx	r10, rax, QWORD PTR [r9+88]
-        adcx	r11, rax
-        adox	r10, r12
-        mov	QWORD PTR [r8+72], r11
-        ;  Carry
-        adcx	r10, r13
-        mov	r13, r12
-        adcx	r13, r12
-        adox	r13, r12
-        mov	QWORD PTR [r8+80], r10
-        mov	QWORD PTR [r8+88], r13
-        ; Double and Add in A[i] x A[i]
-        mov	r11, QWORD PTR [rbp+8]
-        ; A[0] x A[0]
-        mov	rdx, QWORD PTR [r9]
-        mulx	rcx, rax, rdx
-        mov	QWORD PTR [rbp], rax
-        adox	r11, r11
-        adcx	r11, rcx
-        mov	QWORD PTR [rbp+8], r11
-        mov	r10, QWORD PTR [rbp+16]
-        mov	r11, QWORD PTR [rbp+24]
-        ; A[1] x A[1]
-        mov	rdx, QWORD PTR [r9+8]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [rbp+16], r10
-        mov	QWORD PTR [rbp+24], r11
-        mov	r10, QWORD PTR [rbp+32]
-        mov	r11, QWORD PTR [rbp+40]
-        ; A[2] x A[2]
-        mov	rdx, QWORD PTR [r9+16]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [rbp+32], r10
-        mov	QWORD PTR [rbp+40], r11
-        mov	r10, QWORD PTR [rbp+48]
-        ; A[3] x A[3]
-        mov	rdx, QWORD PTR [r9+24]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r14, r14
-        adcx	r10, rax
-        adcx	r14, rcx
-        mov	QWORD PTR [rbp+48], r10
-        ; A[4] x A[4]
-        mov	rdx, QWORD PTR [r9+32]
-        mulx	rcx, rax, rdx
-        adox	r15, r15
-        adox	rdi, rdi
-        adcx	r15, rax
-        adcx	rdi, rcx
-        ; A[5] x A[5]
-        mov	rdx, QWORD PTR [r9+40]
-        mulx	rcx, rax, rdx
-        adox	rsi, rsi
-        adox	rbx, rbx
-        adcx	rsi, rax
-        adcx	rbx, rcx
-        mov	r10, QWORD PTR [r8]
-        mov	r11, QWORD PTR [r8+8]
-        ; A[6] x A[6]
-        mov	rdx, QWORD PTR [r9+48]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [r8], r10
-        mov	QWORD PTR [r8+8], r11
-        mov	r10, QWORD PTR [r8+16]
-        mov	r11, QWORD PTR [r8+24]
-        ; A[7] x A[7]
-        mov	rdx, QWORD PTR [r9+56]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [r8+16], r10
-        mov	QWORD PTR [r8+24], r11
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        ; A[8] x A[8]
-        mov	rdx, QWORD PTR [r9+64]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [r8+32], r10
-        mov	QWORD PTR [r8+40], r11
-        mov	r10, QWORD PTR [r8+48]
-        mov	r11, QWORD PTR [r8+56]
-        ; A[9] x A[9]
-        mov	rdx, QWORD PTR [r9+72]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [r8+48], r10
-        mov	QWORD PTR [r8+56], r11
-        mov	r10, QWORD PTR [r8+64]
-        mov	r11, QWORD PTR [r8+72]
-        ; A[10] x A[10]
-        mov	rdx, QWORD PTR [r9+80]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [r8+64], r10
-        mov	QWORD PTR [r8+72], r11
-        mov	r10, QWORD PTR [r8+80]
-        mov	r11, QWORD PTR [r8+88]
-        ; A[11] x A[11]
-        mov	rdx, QWORD PTR [r9+88]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [r8+80], r10
-        mov	QWORD PTR [r8+88], r11
-        mov	QWORD PTR [r8+-40], r14
-        mov	QWORD PTR [r8+-32], r15
-        mov	QWORD PTR [r8+-24], rdi
-        mov	QWORD PTR [r8+-16], rsi
-        mov	QWORD PTR [r8+-8], rbx
-        sub	r8, 96
-        cmp	r9, r8
-        jne	L_end_3072_sqr_avx2_12
-        vmovdqu	xmm0, OWORD PTR [rbp]
-        vmovups	OWORD PTR [r8], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbp+16]
-        vmovups	OWORD PTR [r8+16], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbp+32]
-        vmovups	OWORD PTR [r8+32], xmm0
-        mov	rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [r8+48], rax
-L_end_3072_sqr_avx2_12:
-        add	rsp, 96
-        pop	rbx
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        pop	rbp
-        ret
-sp_3072_sqr_avx2_12 ENDP
-_text ENDS
-ENDIF
-; /* Square a and put result in r. (r = a * a)
-;  *
-;  * Karatsuba: ah^2, al^2, (al - ah)^2 - each half is 12 64-bit words and is squared by sp_3072_sqr_12.
-;  *
-;  * r  A single precision integer. Result pointer, passed in rcx; this routine writes r words 12..47 directly.
-;  * a  A single precision integer. Input pointer, passed in rdx; al = a words 0..11, ah = a words 12..23.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_sqr_24 PROC
-        sub	rsp, 208	; frame: 192 bytes of scratch at [rsp] + 16 bytes for the saved r and a pointers
-        mov	QWORD PTR [rsp+192], rcx	; save r
-        mov	QWORD PTR [rsp+200], rdx	; save a
-        mov	r9, 0	; r9 becomes the borrow mask of al - ah
-        mov	r10, rsp	; r10 = scratch buffer
-        lea	r11, QWORD PTR [rdx+96]	; r11 = ah (a + 12 words)
-        mov	rax, QWORD PTR [rdx]
-        sub	rax, QWORD PTR [r11]	; scratch = al - ah; word 0 starts the borrow chain
-        mov	r8, QWORD PTR [rdx+8]	; mov leaves flags untouched, so loads/stores can sit between the sbb steps
-        mov	QWORD PTR [r10], rax
-        sbb	r8, QWORD PTR [r11+8]
-        mov	rax, QWORD PTR [rdx+16]
-        mov	QWORD PTR [r10+8], r8
-        sbb	rax, QWORD PTR [r11+16]
-        mov	r8, QWORD PTR [rdx+24]
-        mov	QWORD PTR [r10+16], rax
-        sbb	r8, QWORD PTR [r11+24]
-        mov	rax, QWORD PTR [rdx+32]
-        mov	QWORD PTR [r10+24], r8
-        sbb	rax, QWORD PTR [r11+32]
-        mov	r8, QWORD PTR [rdx+40]
-        mov	QWORD PTR [r10+32], rax
-        sbb	r8, QWORD PTR [r11+40]
-        mov	rax, QWORD PTR [rdx+48]
-        mov	QWORD PTR [r10+40], r8
-        sbb	rax, QWORD PTR [r11+48]
-        mov	r8, QWORD PTR [rdx+56]
-        mov	QWORD PTR [r10+48], rax
-        sbb	r8, QWORD PTR [r11+56]
-        mov	rax, QWORD PTR [rdx+64]
-        mov	QWORD PTR [r10+56], r8
-        sbb	rax, QWORD PTR [r11+64]
-        mov	r8, QWORD PTR [rdx+72]
-        mov	QWORD PTR [r10+64], rax
-        sbb	r8, QWORD PTR [r11+72]
-        mov	rax, QWORD PTR [rdx+80]
-        mov	QWORD PTR [r10+72], r8
-        sbb	rax, QWORD PTR [r11+80]
-        mov	r8, QWORD PTR [rdx+88]
-        mov	QWORD PTR [r10+80], rax
-        sbb	r8, QWORD PTR [r11+88]
-        mov	QWORD PTR [r10+88], r8
-        sbb	r9, 0	; r9 = 0 when there was no borrow out, all ones when al < ah
-        ; Cond Negate: when r9 is all ones, two's-complement the 12 scratch words so they hold |al - ah|; branch-free
-        mov	rax, QWORD PTR [r10]
-        mov	r11, r9
-        xor	rax, r9	; invert the word when the mask is set
-        neg	r11	; r11 = 1 when negating, 0 otherwise
-        sub	rax, r9	; subtracting the all-ones mask adds 1; CF is clear only if that add wrapped (or mask is 0)
-        mov	r8, QWORD PTR [r10+8]
-        sbb	r11, 0	; r11 = carry (0 or 1) to add into the next word
-        mov	QWORD PTR [r10], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+16]
-        setc	r11b	; r11 = carry out of this word (its upper bits are already zero)
-        mov	QWORD PTR [r10+8], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+24]
-        setc	r11b
-        mov	QWORD PTR [r10+16], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+32]
-        setc	r11b
-        mov	QWORD PTR [r10+24], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+40]
-        setc	r11b
-        mov	QWORD PTR [r10+32], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+48]
-        setc	r11b
-        mov	QWORD PTR [r10+40], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+56]
-        setc	r11b
-        mov	QWORD PTR [r10+48], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+64]
-        setc	r11b
-        mov	QWORD PTR [r10+56], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+72]
-        setc	r11b
-        mov	QWORD PTR [r10+64], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+80]
-        setc	r11b
-        mov	QWORD PTR [r10+72], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+88]
-        setc	r11b
-        mov	QWORD PTR [r10+80], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	QWORD PTR [r10+88], r8
-        mov	rdx, r10	; input  = scratch (|al - ah|)
-        mov	rcx, rsp	; output = scratch as well (same address as the input)
-        call	sp_3072_sqr_12	; per the header: scratch = (al - ah)^2
-        mov	rdx, QWORD PTR [rsp+200]
-        mov	rcx, QWORD PTR [rsp+192]
-        add	rdx, 96	; rdx = ah
-        add	rcx, 192	; rcx = r + 24 words
-        call	sp_3072_sqr_12	; per the header: r words 24.. = ah^2
-        mov	rdx, QWORD PTR [rsp+200]	; rdx = al
-        mov	rcx, QWORD PTR [rsp+192]	; rcx = r
-        call	sp_3072_sqr_12	; per the header: r words 0.. = al^2
-IFDEF _WIN64
-        mov	rdx, QWORD PTR [rsp+200]	; NOTE(review): rdx is overwritten by the very next instruction
-        mov	rcx, QWORD PTR [rsp+192]
-ENDIF
-        mov	rdx, QWORD PTR [rsp+192]	; rdx = r
-        lea	r10, QWORD PTR [rsp+96]	; r10 = middle of scratch; offsets -96..+88 reach all 24 words
-        add	rdx, 288	; window [rdx-96 .. rdx+88] = r words 24..47
-        mov	r9, 0	; r9 accumulates the borrows of the two subtractions below
-        mov	r8, QWORD PTR [r10+-96]
-        sub	r8, QWORD PTR [rdx+-96]	; scratch -= r words 24..47
-        mov	rax, QWORD PTR [r10+-88]
-        mov	QWORD PTR [r10+-96], r8
-        sbb	rax, QWORD PTR [rdx+-88]
-        mov	r8, QWORD PTR [r10+-80]
-        mov	QWORD PTR [r10+-88], rax
-        sbb	r8, QWORD PTR [rdx+-80]
-        mov	rax, QWORD PTR [r10+-72]
-        mov	QWORD PTR [r10+-80], r8
-        sbb	rax, QWORD PTR [rdx+-72]
-        mov	r8, QWORD PTR [r10+-64]
-        mov	QWORD PTR [r10+-72], rax
-        sbb	r8, QWORD PTR [rdx+-64]
-        mov	rax, QWORD PTR [r10+-56]
-        mov	QWORD PTR [r10+-64], r8
-        sbb	rax, QWORD PTR [rdx+-56]
-        mov	r8, QWORD PTR [r10+-48]
-        mov	QWORD PTR [r10+-56], rax
-        sbb	r8, QWORD PTR [rdx+-48]
-        mov	rax, QWORD PTR [r10+-40]
-        mov	QWORD PTR [r10+-48], r8
-        sbb	rax, QWORD PTR [rdx+-40]
-        mov	r8, QWORD PTR [r10+-32]
-        mov	QWORD PTR [r10+-40], rax
-        sbb	r8, QWORD PTR [rdx+-32]
-        mov	rax, QWORD PTR [r10+-24]
-        mov	QWORD PTR [r10+-32], r8
-        sbb	rax, QWORD PTR [rdx+-24]
-        mov	r8, QWORD PTR [r10+-16]
-        mov	QWORD PTR [r10+-24], rax
-        sbb	r8, QWORD PTR [rdx+-16]
-        mov	rax, QWORD PTR [r10+-8]
-        mov	QWORD PTR [r10+-16], r8
-        sbb	rax, QWORD PTR [rdx+-8]
-        mov	r8, QWORD PTR [r10]
-        mov	QWORD PTR [r10+-8], rax
-        sbb	r8, QWORD PTR [rdx]
-        mov	rax, QWORD PTR [r10+8]
-        mov	QWORD PTR [r10], r8
-        sbb	rax, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [r10+16]
-        mov	QWORD PTR [r10+8], rax
-        sbb	r8, QWORD PTR [rdx+16]
-        mov	rax, QWORD PTR [r10+24]
-        mov	QWORD PTR [r10+16], r8
-        sbb	rax, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [r10+32]
-        mov	QWORD PTR [r10+24], rax
-        sbb	r8, QWORD PTR [rdx+32]
-        mov	rax, QWORD PTR [r10+40]
-        mov	QWORD PTR [r10+32], r8
-        sbb	rax, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [r10+48]
-        mov	QWORD PTR [r10+40], rax
-        sbb	r8, QWORD PTR [rdx+48]
-        mov	rax, QWORD PTR [r10+56]
-        mov	QWORD PTR [r10+48], r8
-        sbb	rax, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [r10+64]
-        mov	QWORD PTR [r10+56], rax
-        sbb	r8, QWORD PTR [rdx+64]
-        mov	rax, QWORD PTR [r10+72]
-        mov	QWORD PTR [r10+64], r8
-        sbb	rax, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [r10+80]
-        mov	QWORD PTR [r10+72], rax
-        sbb	r8, QWORD PTR [rdx+80]
-        mov	rax, QWORD PTR [r10+88]
-        mov	QWORD PTR [r10+80], r8
-        sbb	rax, QWORD PTR [rdx+88]
-        mov	QWORD PTR [r10+88], rax
-        sbb	r9, 0	; r9 -= borrow out of the subtraction above
-        sub	rdx, 192	; window [rdx-96 .. rdx+88] = r words 0..23
-        mov	r8, QWORD PTR [r10+-96]
-        sub	r8, QWORD PTR [rdx+-96]	; scratch -= r words 0..23
-        mov	rax, QWORD PTR [r10+-88]
-        mov	QWORD PTR [r10+-96], r8
-        sbb	rax, QWORD PTR [rdx+-88]
-        mov	r8, QWORD PTR [r10+-80]
-        mov	QWORD PTR [r10+-88], rax
-        sbb	r8, QWORD PTR [rdx+-80]
-        mov	rax, QWORD PTR [r10+-72]
-        mov	QWORD PTR [r10+-80], r8
-        sbb	rax, QWORD PTR [rdx+-72]
-        mov	r8, QWORD PTR [r10+-64]
-        mov	QWORD PTR [r10+-72], rax
-        sbb	r8, QWORD PTR [rdx+-64]
-        mov	rax, QWORD PTR [r10+-56]
-        mov	QWORD PTR [r10+-64], r8
-        sbb	rax, QWORD PTR [rdx+-56]
-        mov	r8, QWORD PTR [r10+-48]
-        mov	QWORD PTR [r10+-56], rax
-        sbb	r8, QWORD PTR [rdx+-48]
-        mov	rax, QWORD PTR [r10+-40]
-        mov	QWORD PTR [r10+-48], r8
-        sbb	rax, QWORD PTR [rdx+-40]
-        mov	r8, QWORD PTR [r10+-32]
-        mov	QWORD PTR [r10+-40], rax
-        sbb	r8, QWORD PTR [rdx+-32]
-        mov	rax, QWORD PTR [r10+-24]
-        mov	QWORD PTR [r10+-32], r8
-        sbb	rax, QWORD PTR [rdx+-24]
-        mov	r8, QWORD PTR [r10+-16]
-        mov	QWORD PTR [r10+-24], rax
-        sbb	r8, QWORD PTR [rdx+-16]
-        mov	rax, QWORD PTR [r10+-8]
-        mov	QWORD PTR [r10+-16], r8
-        sbb	rax, QWORD PTR [rdx+-8]
-        mov	r8, QWORD PTR [r10]
-        mov	QWORD PTR [r10+-8], rax
-        sbb	r8, QWORD PTR [rdx]
-        mov	rax, QWORD PTR [r10+8]
-        mov	QWORD PTR [r10], r8
-        sbb	rax, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [r10+16]
-        mov	QWORD PTR [r10+8], rax
-        sbb	r8, QWORD PTR [rdx+16]
-        mov	rax, QWORD PTR [r10+24]
-        mov	QWORD PTR [r10+16], r8
-        sbb	rax, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [r10+32]
-        mov	QWORD PTR [r10+24], rax
-        sbb	r8, QWORD PTR [rdx+32]
-        mov	rax, QWORD PTR [r10+40]
-        mov	QWORD PTR [r10+32], r8
-        sbb	rax, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [r10+48]
-        mov	QWORD PTR [r10+40], rax
-        sbb	r8, QWORD PTR [rdx+48]
-        mov	rax, QWORD PTR [r10+56]
-        mov	QWORD PTR [r10+48], r8
-        sbb	rax, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [r10+64]
-        mov	QWORD PTR [r10+56], rax
-        sbb	r8, QWORD PTR [rdx+64]
-        mov	rax, QWORD PTR [r10+72]
-        mov	QWORD PTR [r10+64], r8
-        sbb	rax, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [r10+80]
-        mov	QWORD PTR [r10+72], rax
-        sbb	r8, QWORD PTR [rdx+80]
-        mov	rax, QWORD PTR [r10+88]
-        mov	QWORD PTR [r10+80], r8
-        sbb	rax, QWORD PTR [rdx+88]
-        mov	QWORD PTR [r10+88], rax
-        sbb	r9, 0	; r9 = -(number of borrows so far)
-        mov	rcx, QWORD PTR [rsp+192]	; rcx = r
-        neg	r9	; r9 = number of borrows from the two subtractions (0..2)
-        add	rcx, 192	; window [rcx-96 .. rcx+88] = r words 12..35
-        mov	r8, QWORD PTR [rcx+-96]
-        sub	r8, QWORD PTR [r10+-96]	; r words 12..35 -= scratch (the middle term, stored negated)
-        mov	rax, QWORD PTR [rcx+-88]
-        mov	QWORD PTR [rcx+-96], r8
-        sbb	rax, QWORD PTR [r10+-88]
-        mov	r8, QWORD PTR [rcx+-80]
-        mov	QWORD PTR [rcx+-88], rax
-        sbb	r8, QWORD PTR [r10+-80]
-        mov	rax, QWORD PTR [rcx+-72]
-        mov	QWORD PTR [rcx+-80], r8
-        sbb	rax, QWORD PTR [r10+-72]
-        mov	r8, QWORD PTR [rcx+-64]
-        mov	QWORD PTR [rcx+-72], rax
-        sbb	r8, QWORD PTR [r10+-64]
-        mov	rax, QWORD PTR [rcx+-56]
-        mov	QWORD PTR [rcx+-64], r8
-        sbb	rax, QWORD PTR [r10+-56]
-        mov	r8, QWORD PTR [rcx+-48]
-        mov	QWORD PTR [rcx+-56], rax
-        sbb	r8, QWORD PTR [r10+-48]
-        mov	rax, QWORD PTR [rcx+-40]
-        mov	QWORD PTR [rcx+-48], r8
-        sbb	rax, QWORD PTR [r10+-40]
-        mov	r8, QWORD PTR [rcx+-32]
-        mov	QWORD PTR [rcx+-40], rax
-        sbb	r8, QWORD PTR [r10+-32]
-        mov	rax, QWORD PTR [rcx+-24]
-        mov	QWORD PTR [rcx+-32], r8
-        sbb	rax, QWORD PTR [r10+-24]
-        mov	r8, QWORD PTR [rcx+-16]
-        mov	QWORD PTR [rcx+-24], rax
-        sbb	r8, QWORD PTR [r10+-16]
-        mov	rax, QWORD PTR [rcx+-8]
-        mov	QWORD PTR [rcx+-16], r8
-        sbb	rax, QWORD PTR [r10+-8]
-        mov	r8, QWORD PTR [rcx]
-        mov	QWORD PTR [rcx+-8], rax
-        sbb	r8, QWORD PTR [r10]
-        mov	rax, QWORD PTR [rcx+8]
-        mov	QWORD PTR [rcx], r8
-        sbb	rax, QWORD PTR [r10+8]
-        mov	r8, QWORD PTR [rcx+16]
-        mov	QWORD PTR [rcx+8], rax
-        sbb	r8, QWORD PTR [r10+16]
-        mov	rax, QWORD PTR [rcx+24]
-        mov	QWORD PTR [rcx+16], r8
-        sbb	rax, QWORD PTR [r10+24]
-        mov	r8, QWORD PTR [rcx+32]
-        mov	QWORD PTR [rcx+24], rax
-        sbb	r8, QWORD PTR [r10+32]
-        mov	rax, QWORD PTR [rcx+40]
-        mov	QWORD PTR [rcx+32], r8
-        sbb	rax, QWORD PTR [r10+40]
-        mov	r8, QWORD PTR [rcx+48]
-        mov	QWORD PTR [rcx+40], rax
-        sbb	r8, QWORD PTR [r10+48]
-        mov	rax, QWORD PTR [rcx+56]
-        mov	QWORD PTR [rcx+48], r8
-        sbb	rax, QWORD PTR [r10+56]
-        mov	r8, QWORD PTR [rcx+64]
-        mov	QWORD PTR [rcx+56], rax
-        sbb	r8, QWORD PTR [r10+64]
-        mov	rax, QWORD PTR [rcx+72]
-        mov	QWORD PTR [rcx+64], r8
-        sbb	rax, QWORD PTR [r10+72]
-        mov	r8, QWORD PTR [rcx+80]
-        mov	QWORD PTR [rcx+72], rax
-        sbb	r8, QWORD PTR [r10+80]
-        mov	rax, QWORD PTR [rcx+88]
-        mov	QWORD PTR [rcx+80], r8
-        sbb	rax, QWORD PTR [r10+88]
-        mov	QWORD PTR [rcx+88], rax
-        sbb	r9, 0	; r9 = earlier borrow count minus the borrow out of this subtraction
-        mov	rcx, QWORD PTR [rsp+192]
-        add	rcx, 288	; rcx = r word 36, the first word above the window just updated
-        ; Add in word: add r9 to r word 36 and ripple the carry up through r word 47
-        mov	r8, QWORD PTR [rcx]
-        add	r8, r9
-        mov	rax, QWORD PTR [rcx+8]
-        mov	QWORD PTR [rcx], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+16]
-        mov	QWORD PTR [rcx+8], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+24]
-        mov	QWORD PTR [rcx+16], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+32]
-        mov	QWORD PTR [rcx+24], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+40]
-        mov	QWORD PTR [rcx+32], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+48]
-        mov	QWORD PTR [rcx+40], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+56]
-        mov	QWORD PTR [rcx+48], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+64]
-        mov	QWORD PTR [rcx+56], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+72]
-        mov	QWORD PTR [rcx+64], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+80]
-        mov	QWORD PTR [rcx+72], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+88]
-        mov	QWORD PTR [rcx+80], r8
-        adc	rax, 0
-        mov	QWORD PTR [rcx+88], rax
-        mov	rdx, QWORD PTR [rsp+200]	; leave rdx = a, as on entry
-        mov	rcx, QWORD PTR [rsp+192]	; leave rcx = r, as on entry
-        add	rsp, 208	; release the frame
-        ret
-sp_3072_sqr_24 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Square a and put result in r. (r = a * a)
-;  *
-;  * Karatsuba: ah^2, al^2, (al - ah)^2 - each half is 12 64-bit words and is squared by sp_3072_sqr_avx2_12.
-;  *
-;  * r  A single precision integer. Result pointer, passed in rcx; this routine writes r words 12..47 directly.
-;  * a  A single precision integer. Input pointer, passed in rdx; al = a words 0..11, ah = a words 12..23.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_sqr_avx2_24 PROC
-        sub	rsp, 208	; frame: 192 bytes of scratch at [rsp] + 16 bytes for the saved r and a pointers
-        mov	QWORD PTR [rsp+192], rcx	; save r
-        mov	QWORD PTR [rsp+200], rdx	; save a
-        mov	r9, 0	; r9 becomes the borrow mask of al - ah
-        mov	r10, rsp	; r10 = scratch buffer
-        lea	r11, QWORD PTR [rdx+96]	; r11 = ah (a + 12 words)
-        mov	rax, QWORD PTR [rdx]
-        sub	rax, QWORD PTR [r11]	; scratch = al - ah; word 0 starts the borrow chain
-        mov	r8, QWORD PTR [rdx+8]	; mov leaves flags untouched, so loads/stores can sit between the sbb steps
-        mov	QWORD PTR [r10], rax
-        sbb	r8, QWORD PTR [r11+8]
-        mov	rax, QWORD PTR [rdx+16]
-        mov	QWORD PTR [r10+8], r8
-        sbb	rax, QWORD PTR [r11+16]
-        mov	r8, QWORD PTR [rdx+24]
-        mov	QWORD PTR [r10+16], rax
-        sbb	r8, QWORD PTR [r11+24]
-        mov	rax, QWORD PTR [rdx+32]
-        mov	QWORD PTR [r10+24], r8
-        sbb	rax, QWORD PTR [r11+32]
-        mov	r8, QWORD PTR [rdx+40]
-        mov	QWORD PTR [r10+32], rax
-        sbb	r8, QWORD PTR [r11+40]
-        mov	rax, QWORD PTR [rdx+48]
-        mov	QWORD PTR [r10+40], r8
-        sbb	rax, QWORD PTR [r11+48]
-        mov	r8, QWORD PTR [rdx+56]
-        mov	QWORD PTR [r10+48], rax
-        sbb	r8, QWORD PTR [r11+56]
-        mov	rax, QWORD PTR [rdx+64]
-        mov	QWORD PTR [r10+56], r8
-        sbb	rax, QWORD PTR [r11+64]
-        mov	r8, QWORD PTR [rdx+72]
-        mov	QWORD PTR [r10+64], rax
-        sbb	r8, QWORD PTR [r11+72]
-        mov	rax, QWORD PTR [rdx+80]
-        mov	QWORD PTR [r10+72], r8
-        sbb	rax, QWORD PTR [r11+80]
-        mov	r8, QWORD PTR [rdx+88]
-        mov	QWORD PTR [r10+80], rax
-        sbb	r8, QWORD PTR [r11+88]
-        mov	QWORD PTR [r10+88], r8
-        sbb	r9, 0	; r9 = 0 when there was no borrow out, all ones when al < ah
-        ; Cond Negate: when r9 is all ones, two's-complement the 12 scratch words so they hold |al - ah|; branch-free
-        mov	rax, QWORD PTR [r10]
-        mov	r11, r9
-        xor	rax, r9	; invert the word when the mask is set
-        neg	r11	; r11 = 1 when negating, 0 otherwise
-        sub	rax, r9	; subtracting the all-ones mask adds 1; CF is clear only if that add wrapped (or mask is 0)
-        mov	r8, QWORD PTR [r10+8]
-        sbb	r11, 0	; r11 = carry (0 or 1) to add into the next word
-        mov	QWORD PTR [r10], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+16]
-        setc	r11b	; r11 = carry out of this word (its upper bits are already zero)
-        mov	QWORD PTR [r10+8], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+24]
-        setc	r11b
-        mov	QWORD PTR [r10+16], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+32]
-        setc	r11b
-        mov	QWORD PTR [r10+24], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+40]
-        setc	r11b
-        mov	QWORD PTR [r10+32], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+48]
-        setc	r11b
-        mov	QWORD PTR [r10+40], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+56]
-        setc	r11b
-        mov	QWORD PTR [r10+48], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+64]
-        setc	r11b
-        mov	QWORD PTR [r10+56], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+72]
-        setc	r11b
-        mov	QWORD PTR [r10+64], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+80]
-        setc	r11b
-        mov	QWORD PTR [r10+72], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+88]
-        setc	r11b
-        mov	QWORD PTR [r10+80], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	QWORD PTR [r10+88], r8
-        mov	rdx, r10	; input  = scratch (|al - ah|)
-        mov	rcx, rsp	; output = scratch as well (same address as the input)
-        call	sp_3072_sqr_avx2_12	; per the header: scratch = (al - ah)^2
-        mov	rdx, QWORD PTR [rsp+200]
-        mov	rcx, QWORD PTR [rsp+192]
-        add	rdx, 96	; rdx = ah
-        add	rcx, 192	; rcx = r + 24 words
-        call	sp_3072_sqr_avx2_12	; per the header: r words 24.. = ah^2
-        mov	rdx, QWORD PTR [rsp+200]	; rdx = al
-        mov	rcx, QWORD PTR [rsp+192]	; rcx = r
-        call	sp_3072_sqr_avx2_12	; per the header: r words 0.. = al^2
-IFDEF _WIN64
-        mov	rdx, QWORD PTR [rsp+200]	; NOTE(review): rdx is overwritten by the very next instruction
-        mov	rcx, QWORD PTR [rsp+192]
-ENDIF
-        mov	rdx, QWORD PTR [rsp+192]	; rdx = r
-        lea	r10, QWORD PTR [rsp+96]	; r10 = middle of scratch; offsets -96..+88 reach all 24 words
-        add	rdx, 288	; window [rdx-96 .. rdx+88] = r words 24..47
-        mov	r9, 0	; r9 accumulates the borrows of the two subtractions below
-        mov	r8, QWORD PTR [r10+-96]
-        sub	r8, QWORD PTR [rdx+-96]	; scratch -= r words 24..47
-        mov	rax, QWORD PTR [r10+-88]
-        mov	QWORD PTR [r10+-96], r8
-        sbb	rax, QWORD PTR [rdx+-88]
-        mov	r8, QWORD PTR [r10+-80]
-        mov	QWORD PTR [r10+-88], rax
-        sbb	r8, QWORD PTR [rdx+-80]
-        mov	rax, QWORD PTR [r10+-72]
-        mov	QWORD PTR [r10+-80], r8
-        sbb	rax, QWORD PTR [rdx+-72]
-        mov	r8, QWORD PTR [r10+-64]
-        mov	QWORD PTR [r10+-72], rax
-        sbb	r8, QWORD PTR [rdx+-64]
-        mov	rax, QWORD PTR [r10+-56]
-        mov	QWORD PTR [r10+-64], r8
-        sbb	rax, QWORD PTR [rdx+-56]
-        mov	r8, QWORD PTR [r10+-48]
-        mov	QWORD PTR [r10+-56], rax
-        sbb	r8, QWORD PTR [rdx+-48]
-        mov	rax, QWORD PTR [r10+-40]
-        mov	QWORD PTR [r10+-48], r8
-        sbb	rax, QWORD PTR [rdx+-40]
-        mov	r8, QWORD PTR [r10+-32]
-        mov	QWORD PTR [r10+-40], rax
-        sbb	r8, QWORD PTR [rdx+-32]
-        mov	rax, QWORD PTR [r10+-24]
-        mov	QWORD PTR [r10+-32], r8
-        sbb	rax, QWORD PTR [rdx+-24]
-        mov	r8, QWORD PTR [r10+-16]
-        mov	QWORD PTR [r10+-24], rax
-        sbb	r8, QWORD PTR [rdx+-16]
-        mov	rax, QWORD PTR [r10+-8]
-        mov	QWORD PTR [r10+-16], r8
-        sbb	rax, QWORD PTR [rdx+-8]
-        mov	r8, QWORD PTR [r10]
-        mov	QWORD PTR [r10+-8], rax
-        sbb	r8, QWORD PTR [rdx]
-        mov	rax, QWORD PTR [r10+8]
-        mov	QWORD PTR [r10], r8
-        sbb	rax, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [r10+16]
-        mov	QWORD PTR [r10+8], rax
-        sbb	r8, QWORD PTR [rdx+16]
-        mov	rax, QWORD PTR [r10+24]
-        mov	QWORD PTR [r10+16], r8
-        sbb	rax, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [r10+32]
-        mov	QWORD PTR [r10+24], rax
-        sbb	r8, QWORD PTR [rdx+32]
-        mov	rax, QWORD PTR [r10+40]
-        mov	QWORD PTR [r10+32], r8
-        sbb	rax, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [r10+48]
-        mov	QWORD PTR [r10+40], rax
-        sbb	r8, QWORD PTR [rdx+48]
-        mov	rax, QWORD PTR [r10+56]
-        mov	QWORD PTR [r10+48], r8
-        sbb	rax, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [r10+64]
-        mov	QWORD PTR [r10+56], rax
-        sbb	r8, QWORD PTR [rdx+64]
-        mov	rax, QWORD PTR [r10+72]
-        mov	QWORD PTR [r10+64], r8
-        sbb	rax, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [r10+80]
-        mov	QWORD PTR [r10+72], rax
-        sbb	r8, QWORD PTR [rdx+80]
-        mov	rax, QWORD PTR [r10+88]
-        mov	QWORD PTR [r10+80], r8
-        sbb	rax, QWORD PTR [rdx+88]
-        mov	QWORD PTR [r10+88], rax
-        sbb	r9, 0	; r9 -= borrow out of the subtraction above
-        sub	rdx, 192	; window [rdx-96 .. rdx+88] = r words 0..23
-        mov	r8, QWORD PTR [r10+-96]
-        sub	r8, QWORD PTR [rdx+-96]	; scratch -= r words 0..23
-        mov	rax, QWORD PTR [r10+-88]
-        mov	QWORD PTR [r10+-96], r8
-        sbb	rax, QWORD PTR [rdx+-88]
-        mov	r8, QWORD PTR [r10+-80]
-        mov	QWORD PTR [r10+-88], rax
-        sbb	r8, QWORD PTR [rdx+-80]
-        mov	rax, QWORD PTR [r10+-72]
-        mov	QWORD PTR [r10+-80], r8
-        sbb	rax, QWORD PTR [rdx+-72]
-        mov	r8, QWORD PTR [r10+-64]
-        mov	QWORD PTR [r10+-72], rax
-        sbb	r8, QWORD PTR [rdx+-64]
-        mov	rax, QWORD PTR [r10+-56]
-        mov	QWORD PTR [r10+-64], r8
-        sbb	rax, QWORD PTR [rdx+-56]
-        mov	r8, QWORD PTR [r10+-48]
-        mov	QWORD PTR [r10+-56], rax
-        sbb	r8, QWORD PTR [rdx+-48]
-        mov	rax, QWORD PTR [r10+-40]
-        mov	QWORD PTR [r10+-48], r8
-        sbb	rax, QWORD PTR [rdx+-40]
-        mov	r8, QWORD PTR [r10+-32]
-        mov	QWORD PTR [r10+-40], rax
-        sbb	r8, QWORD PTR [rdx+-32]
-        mov	rax, QWORD PTR [r10+-24]
-        mov	QWORD PTR [r10+-32], r8
-        sbb	rax, QWORD PTR [rdx+-24]
-        mov	r8, QWORD PTR [r10+-16]
-        mov	QWORD PTR [r10+-24], rax
-        sbb	r8, QWORD PTR [rdx+-16]
-        mov	rax, QWORD PTR [r10+-8]
-        mov	QWORD PTR [r10+-16], r8
-        sbb	rax, QWORD PTR [rdx+-8]
-        mov	r8, QWORD PTR [r10]
-        mov	QWORD PTR [r10+-8], rax
-        sbb	r8, QWORD PTR [rdx]
-        mov	rax, QWORD PTR [r10+8]
-        mov	QWORD PTR [r10], r8
-        sbb	rax, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [r10+16]
-        mov	QWORD PTR [r10+8], rax
-        sbb	r8, QWORD PTR [rdx+16]
-        mov	rax, QWORD PTR [r10+24]
-        mov	QWORD PTR [r10+16], r8
-        sbb	rax, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [r10+32]
-        mov	QWORD PTR [r10+24], rax
-        sbb	r8, QWORD PTR [rdx+32]
-        mov	rax, QWORD PTR [r10+40]
-        mov	QWORD PTR [r10+32], r8
-        sbb	rax, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [r10+48]
-        mov	QWORD PTR [r10+40], rax
-        sbb	r8, QWORD PTR [rdx+48]
-        mov	rax, QWORD PTR [r10+56]
-        mov	QWORD PTR [r10+48], r8
-        sbb	rax, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [r10+64]
-        mov	QWORD PTR [r10+56], rax
-        sbb	r8, QWORD PTR [rdx+64]
-        mov	rax, QWORD PTR [r10+72]
-        mov	QWORD PTR [r10+64], r8
-        sbb	rax, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [r10+80]
-        mov	QWORD PTR [r10+72], rax
-        sbb	r8, QWORD PTR [rdx+80]
-        mov	rax, QWORD PTR [r10+88]
-        mov	QWORD PTR [r10+80], r8
-        sbb	rax, QWORD PTR [rdx+88]
-        mov	QWORD PTR [r10+88], rax
-        sbb	r9, 0	; r9 = -(number of borrows so far)
-        mov	rcx, QWORD PTR [rsp+192]	; rcx = r
-        neg	r9	; r9 = number of borrows from the two subtractions (0..2)
-        add	rcx, 192	; window [rcx-96 .. rcx+88] = r words 12..35
-        mov	r8, QWORD PTR [rcx+-96]
-        sub	r8, QWORD PTR [r10+-96]	; r words 12..35 -= scratch (the middle term, stored negated)
-        mov	rax, QWORD PTR [rcx+-88]
-        mov	QWORD PTR [rcx+-96], r8
-        sbb	rax, QWORD PTR [r10+-88]
-        mov	r8, QWORD PTR [rcx+-80]
-        mov	QWORD PTR [rcx+-88], rax
-        sbb	r8, QWORD PTR [r10+-80]
-        mov	rax, QWORD PTR [rcx+-72]
-        mov	QWORD PTR [rcx+-80], r8
-        sbb	rax, QWORD PTR [r10+-72]
-        mov	r8, QWORD PTR [rcx+-64]
-        mov	QWORD PTR [rcx+-72], rax
-        sbb	r8, QWORD PTR [r10+-64]
-        mov	rax, QWORD PTR [rcx+-56]
-        mov	QWORD PTR [rcx+-64], r8
-        sbb	rax, QWORD PTR [r10+-56]
-        mov	r8, QWORD PTR [rcx+-48]
-        mov	QWORD PTR [rcx+-56], rax
-        sbb	r8, QWORD PTR [r10+-48]
-        mov	rax, QWORD PTR [rcx+-40]
-        mov	QWORD PTR [rcx+-48], r8
-        sbb	rax, QWORD PTR [r10+-40]
-        mov	r8, QWORD PTR [rcx+-32]
-        mov	QWORD PTR [rcx+-40], rax
-        sbb	r8, QWORD PTR [r10+-32]
-        mov	rax, QWORD PTR [rcx+-24]
-        mov	QWORD PTR [rcx+-32], r8
-        sbb	rax, QWORD PTR [r10+-24]
-        mov	r8, QWORD PTR [rcx+-16]
-        mov	QWORD PTR [rcx+-24], rax
-        sbb	r8, QWORD PTR [r10+-16]
-        mov	rax, QWORD PTR [rcx+-8]
-        mov	QWORD PTR [rcx+-16], r8
-        sbb	rax, QWORD PTR [r10+-8]
-        mov	r8, QWORD PTR [rcx]
-        mov	QWORD PTR [rcx+-8], rax
-        sbb	r8, QWORD PTR [r10]
-        mov	rax, QWORD PTR [rcx+8]
-        mov	QWORD PTR [rcx], r8
-        sbb	rax, QWORD PTR [r10+8]
-        mov	r8, QWORD PTR [rcx+16]
-        mov	QWORD PTR [rcx+8], rax
-        sbb	r8, QWORD PTR [r10+16]
-        mov	rax, QWORD PTR [rcx+24]
-        mov	QWORD PTR [rcx+16], r8
-        sbb	rax, QWORD PTR [r10+24]
-        mov	r8, QWORD PTR [rcx+32]
-        mov	QWORD PTR [rcx+24], rax
-        sbb	r8, QWORD PTR [r10+32]
-        mov	rax, QWORD PTR [rcx+40]
-        mov	QWORD PTR [rcx+32], r8
-        sbb	rax, QWORD PTR [r10+40]
-        mov	r8, QWORD PTR [rcx+48]
-        mov	QWORD PTR [rcx+40], rax
-        sbb	r8, QWORD PTR [r10+48]
-        mov	rax, QWORD PTR [rcx+56]
-        mov	QWORD PTR [rcx+48], r8
-        sbb	rax, QWORD PTR [r10+56]
-        mov	r8, QWORD PTR [rcx+64]
-        mov	QWORD PTR [rcx+56], rax
-        sbb	r8, QWORD PTR [r10+64]
-        mov	rax, QWORD PTR [rcx+72]
-        mov	QWORD PTR [rcx+64], r8
-        sbb	rax, QWORD PTR [r10+72]
-        mov	r8, QWORD PTR [rcx+80]
-        mov	QWORD PTR [rcx+72], rax
-        sbb	r8, QWORD PTR [r10+80]
-        mov	rax, QWORD PTR [rcx+88]
-        mov	QWORD PTR [rcx+80], r8
-        sbb	rax, QWORD PTR [r10+88]
-        mov	QWORD PTR [rcx+88], rax
-        sbb	r9, 0	; r9 = earlier borrow count minus the borrow out of this subtraction
-        mov	rcx, QWORD PTR [rsp+192]
-        add	rcx, 288	; rcx = r word 36, the first word above the window just updated
-        ; Add in word: add r9 to r word 36 and ripple the carry up through r word 47
-        mov	r8, QWORD PTR [rcx]
-        add	r8, r9
-        mov	rax, QWORD PTR [rcx+8]
-        mov	QWORD PTR [rcx], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+16]
-        mov	QWORD PTR [rcx+8], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+24]
-        mov	QWORD PTR [rcx+16], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+32]
-        mov	QWORD PTR [rcx+24], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+40]
-        mov	QWORD PTR [rcx+32], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+48]
-        mov	QWORD PTR [rcx+40], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+56]
-        mov	QWORD PTR [rcx+48], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+64]
-        mov	QWORD PTR [rcx+56], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+72]
-        mov	QWORD PTR [rcx+64], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+80]
-        mov	QWORD PTR [rcx+72], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+88]
-        mov	QWORD PTR [rcx+80], r8
-        adc	rax, 0
-        mov	QWORD PTR [rcx+88], rax
-        mov	rdx, QWORD PTR [rsp+200]	; leave rdx = a, as on entry
-        mov	rcx, QWORD PTR [rsp+192]	; leave rcx = r, as on entry
-        add	rsp, 208	; release the frame
-        ret
-sp_3072_sqr_avx2_24 ENDP
-_text ENDS
-ENDIF
-; /* Square a and put result in r. (r = a * a)  48 qwords of a are read; qwords 0..95 of r are touched.
-;  * Register args as used below: r in rcx, a in rdx (both are reloaded from the stack before ret). Scratch: rax, r8-r11.
-;  * Karatsuba: ah^2, al^2, (al - ah)^2, i.e. a^2 = ah^2*B^2 + (ah^2 + al^2 - (al - ah)^2)*B + al^2 with B = 2^1536
-;  * NOTE(review): rsp is lowered by 400 and no separate shadow space is reserved before the calls; presumably the callee does not need it - confirm.
-;  * r  A single precision integer.
-;  * a  A single precision integer.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_sqr_48 PROC
-        sub	rsp, 400  ; 384-byte scratch at [rsp] plus two 8-byte save slots
-        mov	QWORD PTR [rsp+384], rcx  ; save r
-        mov	QWORD PTR [rsp+392], rdx  ; save a
-        mov	r9, 0  ; r9 will receive the borrow of al - ah
-        mov	r10, rsp  ; r10 = scratch
-        lea	r11, QWORD PTR [rdx+192]  ; r11 = upper 24 qwords of a (ah); rdx = lower 24 (al)
-        mov	rax, QWORD PTR [rdx]
-        sub	rax, QWORD PTR [r11]  ; scratch[0..23] = al - ah; the borrow is chained through the sbb's (mov leaves CF intact)
-        mov	r8, QWORD PTR [rdx+8]
-        mov	QWORD PTR [r10], rax
-        sbb	r8, QWORD PTR [r11+8]
-        mov	rax, QWORD PTR [rdx+16]
-        mov	QWORD PTR [r10+8], r8
-        sbb	rax, QWORD PTR [r11+16]
-        mov	r8, QWORD PTR [rdx+24]
-        mov	QWORD PTR [r10+16], rax
-        sbb	r8, QWORD PTR [r11+24]
-        mov	rax, QWORD PTR [rdx+32]
-        mov	QWORD PTR [r10+24], r8
-        sbb	rax, QWORD PTR [r11+32]
-        mov	r8, QWORD PTR [rdx+40]
-        mov	QWORD PTR [r10+32], rax
-        sbb	r8, QWORD PTR [r11+40]
-        mov	rax, QWORD PTR [rdx+48]
-        mov	QWORD PTR [r10+40], r8
-        sbb	rax, QWORD PTR [r11+48]
-        mov	r8, QWORD PTR [rdx+56]
-        mov	QWORD PTR [r10+48], rax
-        sbb	r8, QWORD PTR [r11+56]
-        mov	rax, QWORD PTR [rdx+64]
-        mov	QWORD PTR [r10+56], r8
-        sbb	rax, QWORD PTR [r11+64]
-        mov	r8, QWORD PTR [rdx+72]
-        mov	QWORD PTR [r10+64], rax
-        sbb	r8, QWORD PTR [r11+72]
-        mov	rax, QWORD PTR [rdx+80]
-        mov	QWORD PTR [r10+72], r8
-        sbb	rax, QWORD PTR [r11+80]
-        mov	r8, QWORD PTR [rdx+88]
-        mov	QWORD PTR [r10+80], rax
-        sbb	r8, QWORD PTR [r11+88]
-        mov	rax, QWORD PTR [rdx+96]
-        mov	QWORD PTR [r10+88], r8
-        sbb	rax, QWORD PTR [r11+96]
-        mov	r8, QWORD PTR [rdx+104]
-        mov	QWORD PTR [r10+96], rax
-        sbb	r8, QWORD PTR [r11+104]
-        mov	rax, QWORD PTR [rdx+112]
-        mov	QWORD PTR [r10+104], r8
-        sbb	rax, QWORD PTR [r11+112]
-        mov	r8, QWORD PTR [rdx+120]
-        mov	QWORD PTR [r10+112], rax
-        sbb	r8, QWORD PTR [r11+120]
-        mov	rax, QWORD PTR [rdx+128]
-        mov	QWORD PTR [r10+120], r8
-        sbb	rax, QWORD PTR [r11+128]
-        mov	r8, QWORD PTR [rdx+136]
-        mov	QWORD PTR [r10+128], rax
-        sbb	r8, QWORD PTR [r11+136]
-        mov	rax, QWORD PTR [rdx+144]
-        mov	QWORD PTR [r10+136], r8
-        sbb	rax, QWORD PTR [r11+144]
-        mov	r8, QWORD PTR [rdx+152]
-        mov	QWORD PTR [r10+144], rax
-        sbb	r8, QWORD PTR [r11+152]
-        mov	rax, QWORD PTR [rdx+160]
-        mov	QWORD PTR [r10+152], r8
-        sbb	rax, QWORD PTR [r11+160]
-        mov	r8, QWORD PTR [rdx+168]
-        mov	QWORD PTR [r10+160], rax
-        sbb	r8, QWORD PTR [r11+168]
-        mov	rax, QWORD PTR [rdx+176]
-        mov	QWORD PTR [r10+168], r8
-        sbb	rax, QWORD PTR [r11+176]
-        mov	r8, QWORD PTR [rdx+184]
-        mov	QWORD PTR [r10+176], rax
-        sbb	r8, QWORD PTR [r11+184]
-        mov	QWORD PTR [r10+184], r8
-        sbb	r9, 0  ; r9 = all ones if the subtraction borrowed (al < ah), else 0
-        ; Cond Negate: when r9 is all ones, two's-complement negate scratch[0..23] (xor with the mask, then add 1 and ripple the carry)
-        mov	rax, QWORD PTR [r10]
-        mov	r11, r9
-        xor	rax, r9
-        neg	r11  ; r11 = 1 when the mask is set, else 0
-        sub	rax, r9  ; subtracting the mask adds 1 when it is set; CF is clear only if that +1 wrapped (or the mask is 0)
-        mov	r8, QWORD PTR [r10+8]
-        sbb	r11, 0  ; r11 = carry to feed into the next qword (0 or 1)
-        mov	QWORD PTR [r10], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+16]
-        setc	r11b  ; r11 = carry out of this qword; its upper bits are already zero
-        mov	QWORD PTR [r10+8], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+24]
-        setc	r11b
-        mov	QWORD PTR [r10+16], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+32]
-        setc	r11b
-        mov	QWORD PTR [r10+24], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+40]
-        setc	r11b
-        mov	QWORD PTR [r10+32], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+48]
-        setc	r11b
-        mov	QWORD PTR [r10+40], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+56]
-        setc	r11b
-        mov	QWORD PTR [r10+48], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+64]
-        setc	r11b
-        mov	QWORD PTR [r10+56], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+72]
-        setc	r11b
-        mov	QWORD PTR [r10+64], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+80]
-        setc	r11b
-        mov	QWORD PTR [r10+72], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+88]
-        setc	r11b
-        mov	QWORD PTR [r10+80], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+96]
-        setc	r11b
-        mov	QWORD PTR [r10+88], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+104]
-        setc	r11b
-        mov	QWORD PTR [r10+96], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+112]
-        setc	r11b
-        mov	QWORD PTR [r10+104], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+120]
-        setc	r11b
-        mov	QWORD PTR [r10+112], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+128]
-        setc	r11b
-        mov	QWORD PTR [r10+120], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+136]
-        setc	r11b
-        mov	QWORD PTR [r10+128], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+144]
-        setc	r11b
-        mov	QWORD PTR [r10+136], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+152]
-        setc	r11b
-        mov	QWORD PTR [r10+144], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+160]
-        setc	r11b
-        mov	QWORD PTR [r10+152], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+168]
-        setc	r11b
-        mov	QWORD PTR [r10+160], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+176]
-        setc	r11b
-        mov	QWORD PTR [r10+168], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+184]
-        setc	r11b
-        mov	QWORD PTR [r10+176], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	QWORD PTR [r10+184], r8
-        mov	rdx, r10  ; input = |al - ah| in scratch[0..23]
-        mov	rcx, rsp  ; output = scratch (same address as the input, so squared in place)
-        call	sp_3072_sqr_24  ; per the header: (al - ah)^2; callee is defined outside this block
-        mov	rdx, QWORD PTR [rsp+392]
-        mov	rcx, QWORD PTR [rsp+384]
-        add	rdx, 192  ; rdx = ah
-        add	rcx, 384  ; rcx = r + 384 bytes (qword 48 of r)
-        call	sp_3072_sqr_24  ; per the header: ah^2 into the upper half of r
-        mov	rdx, QWORD PTR [rsp+392]
-        mov	rcx, QWORD PTR [rsp+384]
-        call	sp_3072_sqr_24  ; per the header: al^2 into the lower half of r
-IFDEF _WIN64
-        mov	rdx, QWORD PTR [rsp+392]  ; NOTE(review): rdx is overwritten again right after ENDIF, so this reload looks redundant - confirm
-        mov	rcx, QWORD PTR [rsp+384]
-ENDIF
-        mov	rdx, QWORD PTR [rsp+384]  ; rdx = r
-        lea	r10, QWORD PTR [rsp+192]  ; r10 = scratch + 192, so offsets -192..184 span all 48 scratch qwords
-        add	rdx, 576  ; rdx = r + 576, so offsets -192..184 span qwords 48..95 of r
-        mov	r9, 0  ; r9 accumulates the borrows of the next two subtractions
-        mov	r8, QWORD PTR [r10+-192]
-        sub	r8, QWORD PTR [rdx+-192]  ; scratch -= upper half of r
-        mov	rax, QWORD PTR [r10+-184]
-        mov	QWORD PTR [r10+-192], r8
-        sbb	rax, QWORD PTR [rdx+-184]
-        mov	r8, QWORD PTR [r10+-176]
-        mov	QWORD PTR [r10+-184], rax
-        sbb	r8, QWORD PTR [rdx+-176]
-        mov	rax, QWORD PTR [r10+-168]
-        mov	QWORD PTR [r10+-176], r8
-        sbb	rax, QWORD PTR [rdx+-168]
-        mov	r8, QWORD PTR [r10+-160]
-        mov	QWORD PTR [r10+-168], rax
-        sbb	r8, QWORD PTR [rdx+-160]
-        mov	rax, QWORD PTR [r10+-152]
-        mov	QWORD PTR [r10+-160], r8
-        sbb	rax, QWORD PTR [rdx+-152]
-        mov	r8, QWORD PTR [r10+-144]
-        mov	QWORD PTR [r10+-152], rax
-        sbb	r8, QWORD PTR [rdx+-144]
-        mov	rax, QWORD PTR [r10+-136]
-        mov	QWORD PTR [r10+-144], r8
-        sbb	rax, QWORD PTR [rdx+-136]
-        mov	r8, QWORD PTR [r10+-128]
-        mov	QWORD PTR [r10+-136], rax
-        sbb	r8, QWORD PTR [rdx+-128]
-        mov	rax, QWORD PTR [r10+-120]
-        mov	QWORD PTR [r10+-128], r8
-        sbb	rax, QWORD PTR [rdx+-120]
-        mov	r8, QWORD PTR [r10+-112]
-        mov	QWORD PTR [r10+-120], rax
-        sbb	r8, QWORD PTR [rdx+-112]
-        mov	rax, QWORD PTR [r10+-104]
-        mov	QWORD PTR [r10+-112], r8
-        sbb	rax, QWORD PTR [rdx+-104]
-        mov	r8, QWORD PTR [r10+-96]
-        mov	QWORD PTR [r10+-104], rax
-        sbb	r8, QWORD PTR [rdx+-96]
-        mov	rax, QWORD PTR [r10+-88]
-        mov	QWORD PTR [r10+-96], r8
-        sbb	rax, QWORD PTR [rdx+-88]
-        mov	r8, QWORD PTR [r10+-80]
-        mov	QWORD PTR [r10+-88], rax
-        sbb	r8, QWORD PTR [rdx+-80]
-        mov	rax, QWORD PTR [r10+-72]
-        mov	QWORD PTR [r10+-80], r8
-        sbb	rax, QWORD PTR [rdx+-72]
-        mov	r8, QWORD PTR [r10+-64]
-        mov	QWORD PTR [r10+-72], rax
-        sbb	r8, QWORD PTR [rdx+-64]
-        mov	rax, QWORD PTR [r10+-56]
-        mov	QWORD PTR [r10+-64], r8
-        sbb	rax, QWORD PTR [rdx+-56]
-        mov	r8, QWORD PTR [r10+-48]
-        mov	QWORD PTR [r10+-56], rax
-        sbb	r8, QWORD PTR [rdx+-48]
-        mov	rax, QWORD PTR [r10+-40]
-        mov	QWORD PTR [r10+-48], r8
-        sbb	rax, QWORD PTR [rdx+-40]
-        mov	r8, QWORD PTR [r10+-32]
-        mov	QWORD PTR [r10+-40], rax
-        sbb	r8, QWORD PTR [rdx+-32]
-        mov	rax, QWORD PTR [r10+-24]
-        mov	QWORD PTR [r10+-32], r8
-        sbb	rax, QWORD PTR [rdx+-24]
-        mov	r8, QWORD PTR [r10+-16]
-        mov	QWORD PTR [r10+-24], rax
-        sbb	r8, QWORD PTR [rdx+-16]
-        mov	rax, QWORD PTR [r10+-8]
-        mov	QWORD PTR [r10+-16], r8
-        sbb	rax, QWORD PTR [rdx+-8]
-        mov	r8, QWORD PTR [r10]
-        mov	QWORD PTR [r10+-8], rax
-        sbb	r8, QWORD PTR [rdx]
-        mov	rax, QWORD PTR [r10+8]
-        mov	QWORD PTR [r10], r8
-        sbb	rax, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [r10+16]
-        mov	QWORD PTR [r10+8], rax
-        sbb	r8, QWORD PTR [rdx+16]
-        mov	rax, QWORD PTR [r10+24]
-        mov	QWORD PTR [r10+16], r8
-        sbb	rax, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [r10+32]
-        mov	QWORD PTR [r10+24], rax
-        sbb	r8, QWORD PTR [rdx+32]
-        mov	rax, QWORD PTR [r10+40]
-        mov	QWORD PTR [r10+32], r8
-        sbb	rax, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [r10+48]
-        mov	QWORD PTR [r10+40], rax
-        sbb	r8, QWORD PTR [rdx+48]
-        mov	rax, QWORD PTR [r10+56]
-        mov	QWORD PTR [r10+48], r8
-        sbb	rax, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [r10+64]
-        mov	QWORD PTR [r10+56], rax
-        sbb	r8, QWORD PTR [rdx+64]
-        mov	rax, QWORD PTR [r10+72]
-        mov	QWORD PTR [r10+64], r8
-        sbb	rax, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [r10+80]
-        mov	QWORD PTR [r10+72], rax
-        sbb	r8, QWORD PTR [rdx+80]
-        mov	rax, QWORD PTR [r10+88]
-        mov	QWORD PTR [r10+80], r8
-        sbb	rax, QWORD PTR [rdx+88]
-        mov	r8, QWORD PTR [r10+96]
-        mov	QWORD PTR [r10+88], rax
-        sbb	r8, QWORD PTR [rdx+96]
-        mov	rax, QWORD PTR [r10+104]
-        mov	QWORD PTR [r10+96], r8
-        sbb	rax, QWORD PTR [rdx+104]
-        mov	r8, QWORD PTR [r10+112]
-        mov	QWORD PTR [r10+104], rax
-        sbb	r8, QWORD PTR [rdx+112]
-        mov	rax, QWORD PTR [r10+120]
-        mov	QWORD PTR [r10+112], r8
-        sbb	rax, QWORD PTR [rdx+120]
-        mov	r8, QWORD PTR [r10+128]
-        mov	QWORD PTR [r10+120], rax
-        sbb	r8, QWORD PTR [rdx+128]
-        mov	rax, QWORD PTR [r10+136]
-        mov	QWORD PTR [r10+128], r8
-        sbb	rax, QWORD PTR [rdx+136]
-        mov	r8, QWORD PTR [r10+144]
-        mov	QWORD PTR [r10+136], rax
-        sbb	r8, QWORD PTR [rdx+144]
-        mov	rax, QWORD PTR [r10+152]
-        mov	QWORD PTR [r10+144], r8
-        sbb	rax, QWORD PTR [rdx+152]
-        mov	r8, QWORD PTR [r10+160]
-        mov	QWORD PTR [r10+152], rax
-        sbb	r8, QWORD PTR [rdx+160]
-        mov	rax, QWORD PTR [r10+168]
-        mov	QWORD PTR [r10+160], r8
-        sbb	rax, QWORD PTR [rdx+168]
-        mov	r8, QWORD PTR [r10+176]
-        mov	QWORD PTR [r10+168], rax
-        sbb	r8, QWORD PTR [rdx+176]
-        mov	rax, QWORD PTR [r10+184]
-        mov	QWORD PTR [r10+176], r8
-        sbb	rax, QWORD PTR [rdx+184]
-        mov	QWORD PTR [r10+184], rax
-        sbb	r9, 0  ; r9 -= borrow of the first subtraction
-        sub	rdx, 384  ; rdx = r + 192, so offsets -192..184 now span qwords 0..47 of r
-        mov	r8, QWORD PTR [r10+-192]
-        sub	r8, QWORD PTR [rdx+-192]  ; scratch -= lower half of r
-        mov	rax, QWORD PTR [r10+-184]
-        mov	QWORD PTR [r10+-192], r8
-        sbb	rax, QWORD PTR [rdx+-184]
-        mov	r8, QWORD PTR [r10+-176]
-        mov	QWORD PTR [r10+-184], rax
-        sbb	r8, QWORD PTR [rdx+-176]
-        mov	rax, QWORD PTR [r10+-168]
-        mov	QWORD PTR [r10+-176], r8
-        sbb	rax, QWORD PTR [rdx+-168]
-        mov	r8, QWORD PTR [r10+-160]
-        mov	QWORD PTR [r10+-168], rax
-        sbb	r8, QWORD PTR [rdx+-160]
-        mov	rax, QWORD PTR [r10+-152]
-        mov	QWORD PTR [r10+-160], r8
-        sbb	rax, QWORD PTR [rdx+-152]
-        mov	r8, QWORD PTR [r10+-144]
-        mov	QWORD PTR [r10+-152], rax
-        sbb	r8, QWORD PTR [rdx+-144]
-        mov	rax, QWORD PTR [r10+-136]
-        mov	QWORD PTR [r10+-144], r8
-        sbb	rax, QWORD PTR [rdx+-136]
-        mov	r8, QWORD PTR [r10+-128]
-        mov	QWORD PTR [r10+-136], rax
-        sbb	r8, QWORD PTR [rdx+-128]
-        mov	rax, QWORD PTR [r10+-120]
-        mov	QWORD PTR [r10+-128], r8
-        sbb	rax, QWORD PTR [rdx+-120]
-        mov	r8, QWORD PTR [r10+-112]
-        mov	QWORD PTR [r10+-120], rax
-        sbb	r8, QWORD PTR [rdx+-112]
-        mov	rax, QWORD PTR [r10+-104]
-        mov	QWORD PTR [r10+-112], r8
-        sbb	rax, QWORD PTR [rdx+-104]
-        mov	r8, QWORD PTR [r10+-96]
-        mov	QWORD PTR [r10+-104], rax
-        sbb	r8, QWORD PTR [rdx+-96]
-        mov	rax, QWORD PTR [r10+-88]
-        mov	QWORD PTR [r10+-96], r8
-        sbb	rax, QWORD PTR [rdx+-88]
-        mov	r8, QWORD PTR [r10+-80]
-        mov	QWORD PTR [r10+-88], rax
-        sbb	r8, QWORD PTR [rdx+-80]
-        mov	rax, QWORD PTR [r10+-72]
-        mov	QWORD PTR [r10+-80], r8
-        sbb	rax, QWORD PTR [rdx+-72]
-        mov	r8, QWORD PTR [r10+-64]
-        mov	QWORD PTR [r10+-72], rax
-        sbb	r8, QWORD PTR [rdx+-64]
-        mov	rax, QWORD PTR [r10+-56]
-        mov	QWORD PTR [r10+-64], r8
-        sbb	rax, QWORD PTR [rdx+-56]
-        mov	r8, QWORD PTR [r10+-48]
-        mov	QWORD PTR [r10+-56], rax
-        sbb	r8, QWORD PTR [rdx+-48]
-        mov	rax, QWORD PTR [r10+-40]
-        mov	QWORD PTR [r10+-48], r8
-        sbb	rax, QWORD PTR [rdx+-40]
-        mov	r8, QWORD PTR [r10+-32]
-        mov	QWORD PTR [r10+-40], rax
-        sbb	r8, QWORD PTR [rdx+-32]
-        mov	rax, QWORD PTR [r10+-24]
-        mov	QWORD PTR [r10+-32], r8
-        sbb	rax, QWORD PTR [rdx+-24]
-        mov	r8, QWORD PTR [r10+-16]
-        mov	QWORD PTR [r10+-24], rax
-        sbb	r8, QWORD PTR [rdx+-16]
-        mov	rax, QWORD PTR [r10+-8]
-        mov	QWORD PTR [r10+-16], r8
-        sbb	rax, QWORD PTR [rdx+-8]
-        mov	r8, QWORD PTR [r10]
-        mov	QWORD PTR [r10+-8], rax
-        sbb	r8, QWORD PTR [rdx]
-        mov	rax, QWORD PTR [r10+8]
-        mov	QWORD PTR [r10], r8
-        sbb	rax, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [r10+16]
-        mov	QWORD PTR [r10+8], rax
-        sbb	r8, QWORD PTR [rdx+16]
-        mov	rax, QWORD PTR [r10+24]
-        mov	QWORD PTR [r10+16], r8
-        sbb	rax, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [r10+32]
-        mov	QWORD PTR [r10+24], rax
-        sbb	r8, QWORD PTR [rdx+32]
-        mov	rax, QWORD PTR [r10+40]
-        mov	QWORD PTR [r10+32], r8
-        sbb	rax, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [r10+48]
-        mov	QWORD PTR [r10+40], rax
-        sbb	r8, QWORD PTR [rdx+48]
-        mov	rax, QWORD PTR [r10+56]
-        mov	QWORD PTR [r10+48], r8
-        sbb	rax, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [r10+64]
-        mov	QWORD PTR [r10+56], rax
-        sbb	r8, QWORD PTR [rdx+64]
-        mov	rax, QWORD PTR [r10+72]
-        mov	QWORD PTR [r10+64], r8
-        sbb	rax, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [r10+80]
-        mov	QWORD PTR [r10+72], rax
-        sbb	r8, QWORD PTR [rdx+80]
-        mov	rax, QWORD PTR [r10+88]
-        mov	QWORD PTR [r10+80], r8
-        sbb	rax, QWORD PTR [rdx+88]
-        mov	r8, QWORD PTR [r10+96]
-        mov	QWORD PTR [r10+88], rax
-        sbb	r8, QWORD PTR [rdx+96]
-        mov	rax, QWORD PTR [r10+104]
-        mov	QWORD PTR [r10+96], r8
-        sbb	rax, QWORD PTR [rdx+104]
-        mov	r8, QWORD PTR [r10+112]
-        mov	QWORD PTR [r10+104], rax
-        sbb	r8, QWORD PTR [rdx+112]
-        mov	rax, QWORD PTR [r10+120]
-        mov	QWORD PTR [r10+112], r8
-        sbb	rax, QWORD PTR [rdx+120]
-        mov	r8, QWORD PTR [r10+128]
-        mov	QWORD PTR [r10+120], rax
-        sbb	r8, QWORD PTR [rdx+128]
-        mov	rax, QWORD PTR [r10+136]
-        mov	QWORD PTR [r10+128], r8
-        sbb	rax, QWORD PTR [rdx+136]
-        mov	r8, QWORD PTR [r10+144]
-        mov	QWORD PTR [r10+136], rax
-        sbb	r8, QWORD PTR [rdx+144]
-        mov	rax, QWORD PTR [r10+152]
-        mov	QWORD PTR [r10+144], r8
-        sbb	rax, QWORD PTR [rdx+152]
-        mov	r8, QWORD PTR [r10+160]
-        mov	QWORD PTR [r10+152], rax
-        sbb	r8, QWORD PTR [rdx+160]
-        mov	rax, QWORD PTR [r10+168]
-        mov	QWORD PTR [r10+160], r8
-        sbb	rax, QWORD PTR [rdx+168]
-        mov	r8, QWORD PTR [r10+176]
-        mov	QWORD PTR [r10+168], rax
-        sbb	r8, QWORD PTR [rdx+176]
-        mov	rax, QWORD PTR [r10+184]
-        mov	QWORD PTR [r10+176], r8
-        sbb	rax, QWORD PTR [rdx+184]
-        mov	QWORD PTR [r10+184], rax
-        sbb	r9, 0  ; r9 = -(number of borrows from the two subtractions)
-        mov	rcx, QWORD PTR [rsp+384]
-        neg	r9  ; r9 = number of borrows (0..2)
-        add	rcx, 384  ; rcx = r + 384, so offsets -192..184 span qwords 24..71 of r
-        mov	r8, QWORD PTR [rcx+-192]
-        sub	r8, QWORD PTR [r10+-192]  ; qwords 24..71 of r -= scratch (folds the middle Karatsuba term into r)
-        mov	rax, QWORD PTR [rcx+-184]
-        mov	QWORD PTR [rcx+-192], r8
-        sbb	rax, QWORD PTR [r10+-184]
-        mov	r8, QWORD PTR [rcx+-176]
-        mov	QWORD PTR [rcx+-184], rax
-        sbb	r8, QWORD PTR [r10+-176]
-        mov	rax, QWORD PTR [rcx+-168]
-        mov	QWORD PTR [rcx+-176], r8
-        sbb	rax, QWORD PTR [r10+-168]
-        mov	r8, QWORD PTR [rcx+-160]
-        mov	QWORD PTR [rcx+-168], rax
-        sbb	r8, QWORD PTR [r10+-160]
-        mov	rax, QWORD PTR [rcx+-152]
-        mov	QWORD PTR [rcx+-160], r8
-        sbb	rax, QWORD PTR [r10+-152]
-        mov	r8, QWORD PTR [rcx+-144]
-        mov	QWORD PTR [rcx+-152], rax
-        sbb	r8, QWORD PTR [r10+-144]
-        mov	rax, QWORD PTR [rcx+-136]
-        mov	QWORD PTR [rcx+-144], r8
-        sbb	rax, QWORD PTR [r10+-136]
-        mov	r8, QWORD PTR [rcx+-128]
-        mov	QWORD PTR [rcx+-136], rax
-        sbb	r8, QWORD PTR [r10+-128]
-        mov	rax, QWORD PTR [rcx+-120]
-        mov	QWORD PTR [rcx+-128], r8
-        sbb	rax, QWORD PTR [r10+-120]
-        mov	r8, QWORD PTR [rcx+-112]
-        mov	QWORD PTR [rcx+-120], rax
-        sbb	r8, QWORD PTR [r10+-112]
-        mov	rax, QWORD PTR [rcx+-104]
-        mov	QWORD PTR [rcx+-112], r8
-        sbb	rax, QWORD PTR [r10+-104]
-        mov	r8, QWORD PTR [rcx+-96]
-        mov	QWORD PTR [rcx+-104], rax
-        sbb	r8, QWORD PTR [r10+-96]
-        mov	rax, QWORD PTR [rcx+-88]
-        mov	QWORD PTR [rcx+-96], r8
-        sbb	rax, QWORD PTR [r10+-88]
-        mov	r8, QWORD PTR [rcx+-80]
-        mov	QWORD PTR [rcx+-88], rax
-        sbb	r8, QWORD PTR [r10+-80]
-        mov	rax, QWORD PTR [rcx+-72]
-        mov	QWORD PTR [rcx+-80], r8
-        sbb	rax, QWORD PTR [r10+-72]
-        mov	r8, QWORD PTR [rcx+-64]
-        mov	QWORD PTR [rcx+-72], rax
-        sbb	r8, QWORD PTR [r10+-64]
-        mov	rax, QWORD PTR [rcx+-56]
-        mov	QWORD PTR [rcx+-64], r8
-        sbb	rax, QWORD PTR [r10+-56]
-        mov	r8, QWORD PTR [rcx+-48]
-        mov	QWORD PTR [rcx+-56], rax
-        sbb	r8, QWORD PTR [r10+-48]
-        mov	rax, QWORD PTR [rcx+-40]
-        mov	QWORD PTR [rcx+-48], r8
-        sbb	rax, QWORD PTR [r10+-40]
-        mov	r8, QWORD PTR [rcx+-32]
-        mov	QWORD PTR [rcx+-40], rax
-        sbb	r8, QWORD PTR [r10+-32]
-        mov	rax, QWORD PTR [rcx+-24]
-        mov	QWORD PTR [rcx+-32], r8
-        sbb	rax, QWORD PTR [r10+-24]
-        mov	r8, QWORD PTR [rcx+-16]
-        mov	QWORD PTR [rcx+-24], rax
-        sbb	r8, QWORD PTR [r10+-16]
-        mov	rax, QWORD PTR [rcx+-8]
-        mov	QWORD PTR [rcx+-16], r8
-        sbb	rax, QWORD PTR [r10+-8]
-        mov	r8, QWORD PTR [rcx]
-        mov	QWORD PTR [rcx+-8], rax
-        sbb	r8, QWORD PTR [r10]
-        mov	rax, QWORD PTR [rcx+8]
-        mov	QWORD PTR [rcx], r8
-        sbb	rax, QWORD PTR [r10+8]
-        mov	r8, QWORD PTR [rcx+16]
-        mov	QWORD PTR [rcx+8], rax
-        sbb	r8, QWORD PTR [r10+16]
-        mov	rax, QWORD PTR [rcx+24]
-        mov	QWORD PTR [rcx+16], r8
-        sbb	rax, QWORD PTR [r10+24]
-        mov	r8, QWORD PTR [rcx+32]
-        mov	QWORD PTR [rcx+24], rax
-        sbb	r8, QWORD PTR [r10+32]
-        mov	rax, QWORD PTR [rcx+40]
-        mov	QWORD PTR [rcx+32], r8
-        sbb	rax, QWORD PTR [r10+40]
-        mov	r8, QWORD PTR [rcx+48]
-        mov	QWORD PTR [rcx+40], rax
-        sbb	r8, QWORD PTR [r10+48]
-        mov	rax, QWORD PTR [rcx+56]
-        mov	QWORD PTR [rcx+48], r8
-        sbb	rax, QWORD PTR [r10+56]
-        mov	r8, QWORD PTR [rcx+64]
-        mov	QWORD PTR [rcx+56], rax
-        sbb	r8, QWORD PTR [r10+64]
-        mov	rax, QWORD PTR [rcx+72]
-        mov	QWORD PTR [rcx+64], r8
-        sbb	rax, QWORD PTR [r10+72]
-        mov	r8, QWORD PTR [rcx+80]
-        mov	QWORD PTR [rcx+72], rax
-        sbb	r8, QWORD PTR [r10+80]
-        mov	rax, QWORD PTR [rcx+88]
-        mov	QWORD PTR [rcx+80], r8
-        sbb	rax, QWORD PTR [r10+88]
-        mov	r8, QWORD PTR [rcx+96]
-        mov	QWORD PTR [rcx+88], rax
-        sbb	r8, QWORD PTR [r10+96]
-        mov	rax, QWORD PTR [rcx+104]
-        mov	QWORD PTR [rcx+96], r8
-        sbb	rax, QWORD PTR [r10+104]
-        mov	r8, QWORD PTR [rcx+112]
-        mov	QWORD PTR [rcx+104], rax
-        sbb	r8, QWORD PTR [r10+112]
-        mov	rax, QWORD PTR [rcx+120]
-        mov	QWORD PTR [rcx+112], r8
-        sbb	rax, QWORD PTR [r10+120]
-        mov	r8, QWORD PTR [rcx+128]
-        mov	QWORD PTR [rcx+120], rax
-        sbb	r8, QWORD PTR [r10+128]
-        mov	rax, QWORD PTR [rcx+136]
-        mov	QWORD PTR [rcx+128], r8
-        sbb	rax, QWORD PTR [r10+136]
-        mov	r8, QWORD PTR [rcx+144]
-        mov	QWORD PTR [rcx+136], rax
-        sbb	r8, QWORD PTR [r10+144]
-        mov	rax, QWORD PTR [rcx+152]
-        mov	QWORD PTR [rcx+144], r8
-        sbb	rax, QWORD PTR [r10+152]
-        mov	r8, QWORD PTR [rcx+160]
-        mov	QWORD PTR [rcx+152], rax
-        sbb	r8, QWORD PTR [r10+160]
-        mov	rax, QWORD PTR [rcx+168]
-        mov	QWORD PTR [rcx+160], r8
-        sbb	rax, QWORD PTR [r10+168]
-        mov	r8, QWORD PTR [rcx+176]
-        mov	QWORD PTR [rcx+168], rax
-        sbb	r8, QWORD PTR [r10+176]
-        mov	rax, QWORD PTR [rcx+184]
-        mov	QWORD PTR [rcx+176], r8
-        sbb	rax, QWORD PTR [r10+184]
-        mov	QWORD PTR [rcx+184], rax
-        sbb	r9, 0  ; r9 = earlier borrow count minus this subtraction's borrow: the word still owed to qword 72 of r
-        mov	rcx, QWORD PTR [rsp+384]
-        add	rcx, 576  ; rcx = address of qword 72 of r
-        ; Add in word: add r9 at qword 72 of r and ripple the carry through qword 95
-        mov	r8, QWORD PTR [rcx]
-        add	r8, r9
-        mov	rax, QWORD PTR [rcx+8]
-        mov	QWORD PTR [rcx], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+16]
-        mov	QWORD PTR [rcx+8], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+24]
-        mov	QWORD PTR [rcx+16], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+32]
-        mov	QWORD PTR [rcx+24], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+40]
-        mov	QWORD PTR [rcx+32], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+48]
-        mov	QWORD PTR [rcx+40], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+56]
-        mov	QWORD PTR [rcx+48], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+64]
-        mov	QWORD PTR [rcx+56], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+72]
-        mov	QWORD PTR [rcx+64], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+80]
-        mov	QWORD PTR [rcx+72], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+88]
-        mov	QWORD PTR [rcx+80], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+96]
-        mov	QWORD PTR [rcx+88], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+104]
-        mov	QWORD PTR [rcx+96], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+112]
-        mov	QWORD PTR [rcx+104], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+120]
-        mov	QWORD PTR [rcx+112], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+128]
-        mov	QWORD PTR [rcx+120], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+136]
-        mov	QWORD PTR [rcx+128], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+144]
-        mov	QWORD PTR [rcx+136], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+152]
-        mov	QWORD PTR [rcx+144], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+160]
-        mov	QWORD PTR [rcx+152], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+168]
-        mov	QWORD PTR [rcx+160], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+176]
-        mov	QWORD PTR [rcx+168], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+184]
-        mov	QWORD PTR [rcx+176], r8
-        adc	rax, 0
-        mov	QWORD PTR [rcx+184], rax
-        mov	rdx, QWORD PTR [rsp+392]  ; restore the caller's a pointer
-        mov	rcx, QWORD PTR [rsp+384]  ; restore the caller's r pointer
-        add	rsp, 400  ; release scratch and save slots
-        ret
-sp_3072_sqr_48 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Square a and put result in r. (r = a * a)
-;  *
-;  * Karatsuba: ah^2, al^2, (al - ah)^2
-;  *
-;  * r  A single precision integer.
-;  * a  A single precision integer.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_sqr_avx2_48 PROC
-        sub	rsp, 400
-        mov	QWORD PTR [rsp+384], rcx
-        mov	QWORD PTR [rsp+392], rdx
-        mov	r9, 0
-        mov	r10, rsp
-        lea	r11, QWORD PTR [rdx+192]
-        mov	rax, QWORD PTR [rdx]
-        sub	rax, QWORD PTR [r11]
-        mov	r8, QWORD PTR [rdx+8]
-        mov	QWORD PTR [r10], rax
-        sbb	r8, QWORD PTR [r11+8]
-        mov	rax, QWORD PTR [rdx+16]
-        mov	QWORD PTR [r10+8], r8
-        sbb	rax, QWORD PTR [r11+16]
-        mov	r8, QWORD PTR [rdx+24]
-        mov	QWORD PTR [r10+16], rax
-        sbb	r8, QWORD PTR [r11+24]
-        mov	rax, QWORD PTR [rdx+32]
-        mov	QWORD PTR [r10+24], r8
-        sbb	rax, QWORD PTR [r11+32]
-        mov	r8, QWORD PTR [rdx+40]
-        mov	QWORD PTR [r10+32], rax
-        sbb	r8, QWORD PTR [r11+40]
-        mov	rax, QWORD PTR [rdx+48]
-        mov	QWORD PTR [r10+40], r8
-        sbb	rax, QWORD PTR [r11+48]
-        mov	r8, QWORD PTR [rdx+56]
-        mov	QWORD PTR [r10+48], rax
-        sbb	r8, QWORD PTR [r11+56]
-        mov	rax, QWORD PTR [rdx+64]
-        mov	QWORD PTR [r10+56], r8
-        sbb	rax, QWORD PTR [r11+64]
-        mov	r8, QWORD PTR [rdx+72]
-        mov	QWORD PTR [r10+64], rax
-        sbb	r8, QWORD PTR [r11+72]
-        mov	rax, QWORD PTR [rdx+80]
-        mov	QWORD PTR [r10+72], r8
-        sbb	rax, QWORD PTR [r11+80]
-        mov	r8, QWORD PTR [rdx+88]
-        mov	QWORD PTR [r10+80], rax
-        sbb	r8, QWORD PTR [r11+88]
-        mov	rax, QWORD PTR [rdx+96]
-        mov	QWORD PTR [r10+88], r8
-        sbb	rax, QWORD PTR [r11+96]
-        mov	r8, QWORD PTR [rdx+104]
-        mov	QWORD PTR [r10+96], rax
-        sbb	r8, QWORD PTR [r11+104]
-        mov	rax, QWORD PTR [rdx+112]
-        mov	QWORD PTR [r10+104], r8
-        sbb	rax, QWORD PTR [r11+112]
-        mov	r8, QWORD PTR [rdx+120]
-        mov	QWORD PTR [r10+112], rax
-        sbb	r8, QWORD PTR [r11+120]
-        mov	rax, QWORD PTR [rdx+128]
-        mov	QWORD PTR [r10+120], r8
-        sbb	rax, QWORD PTR [r11+128]
-        mov	r8, QWORD PTR [rdx+136]
-        mov	QWORD PTR [r10+128], rax
-        sbb	r8, QWORD PTR [r11+136]
-        mov	rax, QWORD PTR [rdx+144]
-        mov	QWORD PTR [r10+136], r8
-        sbb	rax, QWORD PTR [r11+144]
-        mov	r8, QWORD PTR [rdx+152]
-        mov	QWORD PTR [r10+144], rax
-        sbb	r8, QWORD PTR [r11+152]
-        mov	rax, QWORD PTR [rdx+160]
-        mov	QWORD PTR [r10+152], r8
-        sbb	rax, QWORD PTR [r11+160]
-        mov	r8, QWORD PTR [rdx+168]
-        mov	QWORD PTR [r10+160], rax
-        sbb	r8, QWORD PTR [r11+168]
-        mov	rax, QWORD PTR [rdx+176]
-        mov	QWORD PTR [r10+168], r8
-        sbb	rax, QWORD PTR [r11+176]
-        mov	r8, QWORD PTR [rdx+184]
-        mov	QWORD PTR [r10+176], rax
-        sbb	r8, QWORD PTR [r11+184]
-        mov	QWORD PTR [r10+184], r8
-        sbb	r9, 0
-        ; Cond Negate
-        mov	rax, QWORD PTR [r10]
-        mov	r11, r9
-        xor	rax, r9
-        neg	r11
-        sub	rax, r9
-        mov	r8, QWORD PTR [r10+8]
-        sbb	r11, 0
-        mov	QWORD PTR [r10], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+16]
-        setc	r11b
-        mov	QWORD PTR [r10+8], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+24]
-        setc	r11b
-        mov	QWORD PTR [r10+16], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+32]
-        setc	r11b
-        mov	QWORD PTR [r10+24], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+40]
-        setc	r11b
-        mov	QWORD PTR [r10+32], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+48]
-        setc	r11b
-        mov	QWORD PTR [r10+40], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+56]
-        setc	r11b
-        mov	QWORD PTR [r10+48], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+64]
-        setc	r11b
-        mov	QWORD PTR [r10+56], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+72]
-        setc	r11b
-        mov	QWORD PTR [r10+64], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+80]
-        setc	r11b
-        mov	QWORD PTR [r10+72], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+88]
-        setc	r11b
-        mov	QWORD PTR [r10+80], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+96]
-        setc	r11b
-        mov	QWORD PTR [r10+88], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+104]
-        setc	r11b
-        mov	QWORD PTR [r10+96], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+112]
-        setc	r11b
-        mov	QWORD PTR [r10+104], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+120]
-        setc	r11b
-        mov	QWORD PTR [r10+112], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+128]
-        setc	r11b
-        mov	QWORD PTR [r10+120], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+136]
-        setc	r11b
-        mov	QWORD PTR [r10+128], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+144]
-        setc	r11b
-        mov	QWORD PTR [r10+136], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+152]
-        setc	r11b
-        mov	QWORD PTR [r10+144], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+160]
-        setc	r11b
-        mov	QWORD PTR [r10+152], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+168]
-        setc	r11b
-        mov	QWORD PTR [r10+160], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+176]
-        setc	r11b
-        mov	QWORD PTR [r10+168], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+184]
-        setc	r11b
-        mov	QWORD PTR [r10+176], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	QWORD PTR [r10+184], r8
-        mov	rdx, r10
-        mov	rcx, rsp
-        call	sp_3072_sqr_avx2_24
-        mov	rdx, QWORD PTR [rsp+392]
-        mov	rcx, QWORD PTR [rsp+384]
-        add	rdx, 192
-        add	rcx, 384
-        call	sp_3072_sqr_avx2_24
-        mov	rdx, QWORD PTR [rsp+392]
-        mov	rcx, QWORD PTR [rsp+384]
-        call	sp_3072_sqr_avx2_24
-IFDEF _WIN64
-        mov	rdx, QWORD PTR [rsp+392]
-        mov	rcx, QWORD PTR [rsp+384]
-ENDIF
-        mov	rdx, QWORD PTR [rsp+384]
-        lea	r10, QWORD PTR [rsp+192]
-        add	rdx, 576
-        mov	r9, 0
-        mov	r8, QWORD PTR [r10+-192]
-        sub	r8, QWORD PTR [rdx+-192]
-        mov	rax, QWORD PTR [r10+-184]
-        mov	QWORD PTR [r10+-192], r8
-        sbb	rax, QWORD PTR [rdx+-184]
-        mov	r8, QWORD PTR [r10+-176]
-        mov	QWORD PTR [r10+-184], rax
-        sbb	r8, QWORD PTR [rdx+-176]
-        mov	rax, QWORD PTR [r10+-168]
-        mov	QWORD PTR [r10+-176], r8
-        sbb	rax, QWORD PTR [rdx+-168]
-        mov	r8, QWORD PTR [r10+-160]
-        mov	QWORD PTR [r10+-168], rax
-        sbb	r8, QWORD PTR [rdx+-160]
-        mov	rax, QWORD PTR [r10+-152]
-        mov	QWORD PTR [r10+-160], r8
-        sbb	rax, QWORD PTR [rdx+-152]
-        mov	r8, QWORD PTR [r10+-144]
-        mov	QWORD PTR [r10+-152], rax
-        sbb	r8, QWORD PTR [rdx+-144]
-        mov	rax, QWORD PTR [r10+-136]
-        mov	QWORD PTR [r10+-144], r8
-        sbb	rax, QWORD PTR [rdx+-136]
-        mov	r8, QWORD PTR [r10+-128]
-        mov	QWORD PTR [r10+-136], rax
-        sbb	r8, QWORD PTR [rdx+-128]
-        mov	rax, QWORD PTR [r10+-120]
-        mov	QWORD PTR [r10+-128], r8
-        sbb	rax, QWORD PTR [rdx+-120]
-        mov	r8, QWORD PTR [r10+-112]
-        mov	QWORD PTR [r10+-120], rax
-        sbb	r8, QWORD PTR [rdx+-112]
-        mov	rax, QWORD PTR [r10+-104]
-        mov	QWORD PTR [r10+-112], r8
-        sbb	rax, QWORD PTR [rdx+-104]
-        mov	r8, QWORD PTR [r10+-96]
-        mov	QWORD PTR [r10+-104], rax
-        sbb	r8, QWORD PTR [rdx+-96]
-        mov	rax, QWORD PTR [r10+-88]
-        mov	QWORD PTR [r10+-96], r8
-        sbb	rax, QWORD PTR [rdx+-88]
-        mov	r8, QWORD PTR [r10+-80]
-        mov	QWORD PTR [r10+-88], rax
-        sbb	r8, QWORD PTR [rdx+-80]
-        mov	rax, QWORD PTR [r10+-72]
-        mov	QWORD PTR [r10+-80], r8
-        sbb	rax, QWORD PTR [rdx+-72]
-        mov	r8, QWORD PTR [r10+-64]
-        mov	QWORD PTR [r10+-72], rax
-        sbb	r8, QWORD PTR [rdx+-64]
-        mov	rax, QWORD PTR [r10+-56]
-        mov	QWORD PTR [r10+-64], r8
-        sbb	rax, QWORD PTR [rdx+-56]
-        mov	r8, QWORD PTR [r10+-48]
-        mov	QWORD PTR [r10+-56], rax
-        sbb	r8, QWORD PTR [rdx+-48]
-        mov	rax, QWORD PTR [r10+-40]
-        mov	QWORD PTR [r10+-48], r8
-        sbb	rax, QWORD PTR [rdx+-40]
-        mov	r8, QWORD PTR [r10+-32]
-        mov	QWORD PTR [r10+-40], rax
-        sbb	r8, QWORD PTR [rdx+-32]
-        mov	rax, QWORD PTR [r10+-24]
-        mov	QWORD PTR [r10+-32], r8
-        sbb	rax, QWORD PTR [rdx+-24]
-        mov	r8, QWORD PTR [r10+-16]
-        mov	QWORD PTR [r10+-24], rax
-        sbb	r8, QWORD PTR [rdx+-16]
-        mov	rax, QWORD PTR [r10+-8]
-        mov	QWORD PTR [r10+-16], r8
-        sbb	rax, QWORD PTR [rdx+-8]
-        mov	r8, QWORD PTR [r10]
-        mov	QWORD PTR [r10+-8], rax
-        sbb	r8, QWORD PTR [rdx]
-        mov	rax, QWORD PTR [r10+8]
-        mov	QWORD PTR [r10], r8
-        sbb	rax, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [r10+16]
-        mov	QWORD PTR [r10+8], rax
-        sbb	r8, QWORD PTR [rdx+16]
-        mov	rax, QWORD PTR [r10+24]
-        mov	QWORD PTR [r10+16], r8
-        sbb	rax, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [r10+32]
-        mov	QWORD PTR [r10+24], rax
-        sbb	r8, QWORD PTR [rdx+32]
-        mov	rax, QWORD PTR [r10+40]
-        mov	QWORD PTR [r10+32], r8
-        sbb	rax, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [r10+48]
-        mov	QWORD PTR [r10+40], rax
-        sbb	r8, QWORD PTR [rdx+48]
-        mov	rax, QWORD PTR [r10+56]
-        mov	QWORD PTR [r10+48], r8
-        sbb	rax, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [r10+64]
-        mov	QWORD PTR [r10+56], rax
-        sbb	r8, QWORD PTR [rdx+64]
-        mov	rax, QWORD PTR [r10+72]
-        mov	QWORD PTR [r10+64], r8
-        sbb	rax, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [r10+80]
-        mov	QWORD PTR [r10+72], rax
-        sbb	r8, QWORD PTR [rdx+80]
-        mov	rax, QWORD PTR [r10+88]
-        mov	QWORD PTR [r10+80], r8
-        sbb	rax, QWORD PTR [rdx+88]
-        mov	r8, QWORD PTR [r10+96]
-        mov	QWORD PTR [r10+88], rax
-        sbb	r8, QWORD PTR [rdx+96]
-        mov	rax, QWORD PTR [r10+104]
-        mov	QWORD PTR [r10+96], r8
-        sbb	rax, QWORD PTR [rdx+104]
-        mov	r8, QWORD PTR [r10+112]
-        mov	QWORD PTR [r10+104], rax
-        sbb	r8, QWORD PTR [rdx+112]
-        mov	rax, QWORD PTR [r10+120]
-        mov	QWORD PTR [r10+112], r8
-        sbb	rax, QWORD PTR [rdx+120]
-        mov	r8, QWORD PTR [r10+128]
-        mov	QWORD PTR [r10+120], rax
-        sbb	r8, QWORD PTR [rdx+128]
-        mov	rax, QWORD PTR [r10+136]
-        mov	QWORD PTR [r10+128], r8
-        sbb	rax, QWORD PTR [rdx+136]
-        mov	r8, QWORD PTR [r10+144]
-        mov	QWORD PTR [r10+136], rax
-        sbb	r8, QWORD PTR [rdx+144]
-        mov	rax, QWORD PTR [r10+152]
-        mov	QWORD PTR [r10+144], r8
-        sbb	rax, QWORD PTR [rdx+152]
-        mov	r8, QWORD PTR [r10+160]
-        mov	QWORD PTR [r10+152], rax
-        sbb	r8, QWORD PTR [rdx+160]
-        mov	rax, QWORD PTR [r10+168]
-        mov	QWORD PTR [r10+160], r8
-        sbb	rax, QWORD PTR [rdx+168]
-        mov	r8, QWORD PTR [r10+176]
-        mov	QWORD PTR [r10+168], rax
-        sbb	r8, QWORD PTR [rdx+176]
-        mov	rax, QWORD PTR [r10+184]
-        mov	QWORD PTR [r10+176], r8
-        sbb	rax, QWORD PTR [rdx+184]
-        mov	QWORD PTR [r10+184], rax
-        sbb	r9, 0
-        sub	rdx, 384
-        mov	r8, QWORD PTR [r10+-192]
-        sub	r8, QWORD PTR [rdx+-192]
-        mov	rax, QWORD PTR [r10+-184]
-        mov	QWORD PTR [r10+-192], r8
-        sbb	rax, QWORD PTR [rdx+-184]
-        mov	r8, QWORD PTR [r10+-176]
-        mov	QWORD PTR [r10+-184], rax
-        sbb	r8, QWORD PTR [rdx+-176]
-        mov	rax, QWORD PTR [r10+-168]
-        mov	QWORD PTR [r10+-176], r8
-        sbb	rax, QWORD PTR [rdx+-168]
-        mov	r8, QWORD PTR [r10+-160]
-        mov	QWORD PTR [r10+-168], rax
-        sbb	r8, QWORD PTR [rdx+-160]
-        mov	rax, QWORD PTR [r10+-152]
-        mov	QWORD PTR [r10+-160], r8
-        sbb	rax, QWORD PTR [rdx+-152]
-        mov	r8, QWORD PTR [r10+-144]
-        mov	QWORD PTR [r10+-152], rax
-        sbb	r8, QWORD PTR [rdx+-144]
-        mov	rax, QWORD PTR [r10+-136]
-        mov	QWORD PTR [r10+-144], r8
-        sbb	rax, QWORD PTR [rdx+-136]
-        mov	r8, QWORD PTR [r10+-128]
-        mov	QWORD PTR [r10+-136], rax
-        sbb	r8, QWORD PTR [rdx+-128]
-        mov	rax, QWORD PTR [r10+-120]
-        mov	QWORD PTR [r10+-128], r8
-        sbb	rax, QWORD PTR [rdx+-120]
-        mov	r8, QWORD PTR [r10+-112]
-        mov	QWORD PTR [r10+-120], rax
-        sbb	r8, QWORD PTR [rdx+-112]
-        mov	rax, QWORD PTR [r10+-104]
-        mov	QWORD PTR [r10+-112], r8
-        sbb	rax, QWORD PTR [rdx+-104]
-        mov	r8, QWORD PTR [r10+-96]
-        mov	QWORD PTR [r10+-104], rax
-        sbb	r8, QWORD PTR [rdx+-96]
-        mov	rax, QWORD PTR [r10+-88]
-        mov	QWORD PTR [r10+-96], r8
-        sbb	rax, QWORD PTR [rdx+-88]
-        mov	r8, QWORD PTR [r10+-80]
-        mov	QWORD PTR [r10+-88], rax
-        sbb	r8, QWORD PTR [rdx+-80]
-        mov	rax, QWORD PTR [r10+-72]
-        mov	QWORD PTR [r10+-80], r8
-        sbb	rax, QWORD PTR [rdx+-72]
-        mov	r8, QWORD PTR [r10+-64]
-        mov	QWORD PTR [r10+-72], rax
-        sbb	r8, QWORD PTR [rdx+-64]
-        mov	rax, QWORD PTR [r10+-56]
-        mov	QWORD PTR [r10+-64], r8
-        sbb	rax, QWORD PTR [rdx+-56]
-        mov	r8, QWORD PTR [r10+-48]
-        mov	QWORD PTR [r10+-56], rax
-        sbb	r8, QWORD PTR [rdx+-48]
-        mov	rax, QWORD PTR [r10+-40]
-        mov	QWORD PTR [r10+-48], r8
-        sbb	rax, QWORD PTR [rdx+-40]
-        mov	r8, QWORD PTR [r10+-32]
-        mov	QWORD PTR [r10+-40], rax
-        sbb	r8, QWORD PTR [rdx+-32]
-        mov	rax, QWORD PTR [r10+-24]
-        mov	QWORD PTR [r10+-32], r8
-        sbb	rax, QWORD PTR [rdx+-24]
-        mov	r8, QWORD PTR [r10+-16]
-        mov	QWORD PTR [r10+-24], rax
-        sbb	r8, QWORD PTR [rdx+-16]
-        mov	rax, QWORD PTR [r10+-8]
-        mov	QWORD PTR [r10+-16], r8
-        sbb	rax, QWORD PTR [rdx+-8]
-        mov	r8, QWORD PTR [r10]
-        mov	QWORD PTR [r10+-8], rax
-        sbb	r8, QWORD PTR [rdx]
-        mov	rax, QWORD PTR [r10+8]
-        mov	QWORD PTR [r10], r8
-        sbb	rax, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [r10+16]
-        mov	QWORD PTR [r10+8], rax
-        sbb	r8, QWORD PTR [rdx+16]
-        mov	rax, QWORD PTR [r10+24]
-        mov	QWORD PTR [r10+16], r8
-        sbb	rax, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [r10+32]
-        mov	QWORD PTR [r10+24], rax
-        sbb	r8, QWORD PTR [rdx+32]
-        mov	rax, QWORD PTR [r10+40]
-        mov	QWORD PTR [r10+32], r8
-        sbb	rax, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [r10+48]
-        mov	QWORD PTR [r10+40], rax
-        sbb	r8, QWORD PTR [rdx+48]
-        mov	rax, QWORD PTR [r10+56]
-        mov	QWORD PTR [r10+48], r8
-        sbb	rax, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [r10+64]
-        mov	QWORD PTR [r10+56], rax
-        sbb	r8, QWORD PTR [rdx+64]
-        mov	rax, QWORD PTR [r10+72]
-        mov	QWORD PTR [r10+64], r8
-        sbb	rax, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [r10+80]
-        mov	QWORD PTR [r10+72], rax
-        sbb	r8, QWORD PTR [rdx+80]
-        mov	rax, QWORD PTR [r10+88]
-        mov	QWORD PTR [r10+80], r8
-        sbb	rax, QWORD PTR [rdx+88]
-        mov	r8, QWORD PTR [r10+96]
-        mov	QWORD PTR [r10+88], rax
-        sbb	r8, QWORD PTR [rdx+96]
-        mov	rax, QWORD PTR [r10+104]
-        mov	QWORD PTR [r10+96], r8
-        sbb	rax, QWORD PTR [rdx+104]
-        mov	r8, QWORD PTR [r10+112]
-        mov	QWORD PTR [r10+104], rax
-        sbb	r8, QWORD PTR [rdx+112]
-        mov	rax, QWORD PTR [r10+120]
-        mov	QWORD PTR [r10+112], r8
-        sbb	rax, QWORD PTR [rdx+120]
-        mov	r8, QWORD PTR [r10+128]
-        mov	QWORD PTR [r10+120], rax
-        sbb	r8, QWORD PTR [rdx+128]
-        mov	rax, QWORD PTR [r10+136]
-        mov	QWORD PTR [r10+128], r8
-        sbb	rax, QWORD PTR [rdx+136]
-        mov	r8, QWORD PTR [r10+144]
-        mov	QWORD PTR [r10+136], rax
-        sbb	r8, QWORD PTR [rdx+144]
-        mov	rax, QWORD PTR [r10+152]
-        mov	QWORD PTR [r10+144], r8
-        sbb	rax, QWORD PTR [rdx+152]
-        mov	r8, QWORD PTR [r10+160]
-        mov	QWORD PTR [r10+152], rax
-        sbb	r8, QWORD PTR [rdx+160]
-        mov	rax, QWORD PTR [r10+168]
-        mov	QWORD PTR [r10+160], r8
-        sbb	rax, QWORD PTR [rdx+168]
-        mov	r8, QWORD PTR [r10+176]
-        mov	QWORD PTR [r10+168], rax
-        sbb	r8, QWORD PTR [rdx+176]
-        mov	rax, QWORD PTR [r10+184]
-        mov	QWORD PTR [r10+176], r8
-        sbb	rax, QWORD PTR [rdx+184]
-        mov	QWORD PTR [r10+184], rax
-        sbb	r9, 0
-        mov	rcx, QWORD PTR [rsp+384]
-        neg	r9
-        add	rcx, 384
-        mov	r8, QWORD PTR [rcx+-192]
-        sub	r8, QWORD PTR [r10+-192]
-        mov	rax, QWORD PTR [rcx+-184]
-        mov	QWORD PTR [rcx+-192], r8
-        sbb	rax, QWORD PTR [r10+-184]
-        mov	r8, QWORD PTR [rcx+-176]
-        mov	QWORD PTR [rcx+-184], rax
-        sbb	r8, QWORD PTR [r10+-176]
-        mov	rax, QWORD PTR [rcx+-168]
-        mov	QWORD PTR [rcx+-176], r8
-        sbb	rax, QWORD PTR [r10+-168]
-        mov	r8, QWORD PTR [rcx+-160]
-        mov	QWORD PTR [rcx+-168], rax
-        sbb	r8, QWORD PTR [r10+-160]
-        mov	rax, QWORD PTR [rcx+-152]
-        mov	QWORD PTR [rcx+-160], r8
-        sbb	rax, QWORD PTR [r10+-152]
-        mov	r8, QWORD PTR [rcx+-144]
-        mov	QWORD PTR [rcx+-152], rax
-        sbb	r8, QWORD PTR [r10+-144]
-        mov	rax, QWORD PTR [rcx+-136]
-        mov	QWORD PTR [rcx+-144], r8
-        sbb	rax, QWORD PTR [r10+-136]
-        mov	r8, QWORD PTR [rcx+-128]
-        mov	QWORD PTR [rcx+-136], rax
-        sbb	r8, QWORD PTR [r10+-128]
-        mov	rax, QWORD PTR [rcx+-120]
-        mov	QWORD PTR [rcx+-128], r8
-        sbb	rax, QWORD PTR [r10+-120]
-        mov	r8, QWORD PTR [rcx+-112]
-        mov	QWORD PTR [rcx+-120], rax
-        sbb	r8, QWORD PTR [r10+-112]
-        mov	rax, QWORD PTR [rcx+-104]
-        mov	QWORD PTR [rcx+-112], r8
-        sbb	rax, QWORD PTR [r10+-104]
-        mov	r8, QWORD PTR [rcx+-96]
-        mov	QWORD PTR [rcx+-104], rax
-        sbb	r8, QWORD PTR [r10+-96]
-        mov	rax, QWORD PTR [rcx+-88]
-        mov	QWORD PTR [rcx+-96], r8
-        sbb	rax, QWORD PTR [r10+-88]
-        mov	r8, QWORD PTR [rcx+-80]
-        mov	QWORD PTR [rcx+-88], rax
-        sbb	r8, QWORD PTR [r10+-80]
-        mov	rax, QWORD PTR [rcx+-72]
-        mov	QWORD PTR [rcx+-80], r8
-        sbb	rax, QWORD PTR [r10+-72]
-        mov	r8, QWORD PTR [rcx+-64]
-        mov	QWORD PTR [rcx+-72], rax
-        sbb	r8, QWORD PTR [r10+-64]
-        mov	rax, QWORD PTR [rcx+-56]
-        mov	QWORD PTR [rcx+-64], r8
-        sbb	rax, QWORD PTR [r10+-56]
-        mov	r8, QWORD PTR [rcx+-48]
-        mov	QWORD PTR [rcx+-56], rax
-        sbb	r8, QWORD PTR [r10+-48]
-        mov	rax, QWORD PTR [rcx+-40]
-        mov	QWORD PTR [rcx+-48], r8
-        sbb	rax, QWORD PTR [r10+-40]
-        mov	r8, QWORD PTR [rcx+-32]
-        mov	QWORD PTR [rcx+-40], rax
-        sbb	r8, QWORD PTR [r10+-32]
-        mov	rax, QWORD PTR [rcx+-24]
-        mov	QWORD PTR [rcx+-32], r8
-        sbb	rax, QWORD PTR [r10+-24]
-        mov	r8, QWORD PTR [rcx+-16]
-        mov	QWORD PTR [rcx+-24], rax
-        sbb	r8, QWORD PTR [r10+-16]
-        mov	rax, QWORD PTR [rcx+-8]
-        mov	QWORD PTR [rcx+-16], r8
-        sbb	rax, QWORD PTR [r10+-8]
-        mov	r8, QWORD PTR [rcx]
-        mov	QWORD PTR [rcx+-8], rax
-        sbb	r8, QWORD PTR [r10]
-        mov	rax, QWORD PTR [rcx+8]
-        mov	QWORD PTR [rcx], r8
-        sbb	rax, QWORD PTR [r10+8]
-        mov	r8, QWORD PTR [rcx+16]
-        mov	QWORD PTR [rcx+8], rax
-        sbb	r8, QWORD PTR [r10+16]
-        mov	rax, QWORD PTR [rcx+24]
-        mov	QWORD PTR [rcx+16], r8
-        sbb	rax, QWORD PTR [r10+24]
-        mov	r8, QWORD PTR [rcx+32]
-        mov	QWORD PTR [rcx+24], rax
-        sbb	r8, QWORD PTR [r10+32]
-        mov	rax, QWORD PTR [rcx+40]
-        mov	QWORD PTR [rcx+32], r8
-        sbb	rax, QWORD PTR [r10+40]
-        mov	r8, QWORD PTR [rcx+48]
-        mov	QWORD PTR [rcx+40], rax
-        sbb	r8, QWORD PTR [r10+48]
-        mov	rax, QWORD PTR [rcx+56]
-        mov	QWORD PTR [rcx+48], r8
-        sbb	rax, QWORD PTR [r10+56]
-        mov	r8, QWORD PTR [rcx+64]
-        mov	QWORD PTR [rcx+56], rax
-        sbb	r8, QWORD PTR [r10+64]
-        mov	rax, QWORD PTR [rcx+72]
-        mov	QWORD PTR [rcx+64], r8
-        sbb	rax, QWORD PTR [r10+72]
-        mov	r8, QWORD PTR [rcx+80]
-        mov	QWORD PTR [rcx+72], rax
-        sbb	r8, QWORD PTR [r10+80]
-        mov	rax, QWORD PTR [rcx+88]
-        mov	QWORD PTR [rcx+80], r8
-        sbb	rax, QWORD PTR [r10+88]
-        mov	r8, QWORD PTR [rcx+96]
-        mov	QWORD PTR [rcx+88], rax
-        sbb	r8, QWORD PTR [r10+96]
-        mov	rax, QWORD PTR [rcx+104]
-        mov	QWORD PTR [rcx+96], r8
-        sbb	rax, QWORD PTR [r10+104]
-        mov	r8, QWORD PTR [rcx+112]
-        mov	QWORD PTR [rcx+104], rax
-        sbb	r8, QWORD PTR [r10+112]
-        mov	rax, QWORD PTR [rcx+120]
-        mov	QWORD PTR [rcx+112], r8
-        sbb	rax, QWORD PTR [r10+120]
-        mov	r8, QWORD PTR [rcx+128]
-        mov	QWORD PTR [rcx+120], rax
-        sbb	r8, QWORD PTR [r10+128]
-        mov	rax, QWORD PTR [rcx+136]
-        mov	QWORD PTR [rcx+128], r8
-        sbb	rax, QWORD PTR [r10+136]
-        mov	r8, QWORD PTR [rcx+144]
-        mov	QWORD PTR [rcx+136], rax
-        sbb	r8, QWORD PTR [r10+144]
-        mov	rax, QWORD PTR [rcx+152]
-        mov	QWORD PTR [rcx+144], r8
-        sbb	rax, QWORD PTR [r10+152]
-        mov	r8, QWORD PTR [rcx+160]
-        mov	QWORD PTR [rcx+152], rax
-        sbb	r8, QWORD PTR [r10+160]
-        mov	rax, QWORD PTR [rcx+168]
-        mov	QWORD PTR [rcx+160], r8
-        sbb	rax, QWORD PTR [r10+168]
-        mov	r8, QWORD PTR [rcx+176]
-        mov	QWORD PTR [rcx+168], rax
-        sbb	r8, QWORD PTR [r10+176]
-        mov	rax, QWORD PTR [rcx+184]
-        mov	QWORD PTR [rcx+176], r8
-        sbb	rax, QWORD PTR [r10+184]
-        mov	QWORD PTR [rcx+184], rax
-        sbb	r9, 0
-        mov	rcx, QWORD PTR [rsp+384]
-        add	rcx, 576
-        ; Add in word
-        mov	r8, QWORD PTR [rcx]
-        add	r8, r9
-        mov	rax, QWORD PTR [rcx+8]
-        mov	QWORD PTR [rcx], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+16]
-        mov	QWORD PTR [rcx+8], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+24]
-        mov	QWORD PTR [rcx+16], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+32]
-        mov	QWORD PTR [rcx+24], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+40]
-        mov	QWORD PTR [rcx+32], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+48]
-        mov	QWORD PTR [rcx+40], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+56]
-        mov	QWORD PTR [rcx+48], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+64]
-        mov	QWORD PTR [rcx+56], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+72]
-        mov	QWORD PTR [rcx+64], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+80]
-        mov	QWORD PTR [rcx+72], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+88]
-        mov	QWORD PTR [rcx+80], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+96]
-        mov	QWORD PTR [rcx+88], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+104]
-        mov	QWORD PTR [rcx+96], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+112]
-        mov	QWORD PTR [rcx+104], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+120]
-        mov	QWORD PTR [rcx+112], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+128]
-        mov	QWORD PTR [rcx+120], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+136]
-        mov	QWORD PTR [rcx+128], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+144]
-        mov	QWORD PTR [rcx+136], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+152]
-        mov	QWORD PTR [rcx+144], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+160]
-        mov	QWORD PTR [rcx+152], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+168]
-        mov	QWORD PTR [rcx+160], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+176]
-        mov	QWORD PTR [rcx+168], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+184]
-        mov	QWORD PTR [rcx+176], r8
-        adc	rax, 0
-        mov	QWORD PTR [rcx+184], rax
-        mov	rdx, QWORD PTR [rsp+392]
-        mov	rcx, QWORD PTR [rsp+384]
-        add	rsp, 400
-        ret
-sp_3072_sqr_avx2_48 ENDP
-_text ENDS
-ENDIF
-; /* Multiply a by the single digit b and store the product in r. (r = a * b)
-;  *
-;  * a is read as 48 64-bit words (offsets 0..376). r receives 49 words
-;  * (offsets 0..384): the high half of the last product, plus carry, is
-;  * stored as an extra top word. The routine is fully unrolled straight-line
-;  * code with no loops or branches.
-;  *
-;  * r  A single precision integer. Result; 49 words are written. (rcx)
-;  * a  A single precision integer. 48 words are read.            (rdx)
-;  * b  A single precision digit (one 64-bit word).               (r8)
-;  *
-;  * r12 is saved on entry and restored on exit. rax, rdx, r9, r10, r11 and
-;  * the flags are overwritten.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_mul_d_48 PROC
-        ; r12 serves as a third accumulator below, so save it here and restore
-        ; it just before returning.
-        push	r12
-        ; mul writes its 128-bit product to rdx:rax, so the pointer to a is
-        ; kept in r9 instead of rdx.
-        mov	r9, rdx
-        ; A[0] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9]
-        mov	r10, rax
-        mov	r11, rdx
-        mov	QWORD PTR [rcx], r10
-        ; From here on r10, r11 and r12 rotate through three roles: the word
-        ; being completed, the carry into the next word, and a freshly zeroed
-        ; register that catches the carry out of that second addition.
-        ; A[1] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+8]
-        add	r11, rax
-        ; mov does not modify the flags, so the carry produced by the add
-        ; above is still intact for the adc that follows this store. The same
-        ; add / store / adc / adc shape repeats for every word up to A[46].
-        mov	QWORD PTR [rcx+8], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[2] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+16]
-        add	r12, rax
-        mov	QWORD PTR [rcx+16], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[3] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+24]
-        add	r10, rax
-        mov	QWORD PTR [rcx+24], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[4] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+32]
-        add	r11, rax
-        mov	QWORD PTR [rcx+32], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[5] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+40]
-        add	r12, rax
-        mov	QWORD PTR [rcx+40], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[6] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+48]
-        add	r10, rax
-        mov	QWORD PTR [rcx+48], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[7] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+56]
-        add	r11, rax
-        mov	QWORD PTR [rcx+56], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[8] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+64]
-        add	r12, rax
-        mov	QWORD PTR [rcx+64], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[9] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+72]
-        add	r10, rax
-        mov	QWORD PTR [rcx+72], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[10] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+80]
-        add	r11, rax
-        mov	QWORD PTR [rcx+80], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[11] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+88]
-        add	r12, rax
-        mov	QWORD PTR [rcx+88], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[12] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+96]
-        add	r10, rax
-        mov	QWORD PTR [rcx+96], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[13] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+104]
-        add	r11, rax
-        mov	QWORD PTR [rcx+104], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[14] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+112]
-        add	r12, rax
-        mov	QWORD PTR [rcx+112], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[15] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+120]
-        add	r10, rax
-        mov	QWORD PTR [rcx+120], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[16] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+128]
-        add	r11, rax
-        mov	QWORD PTR [rcx+128], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[17] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+136]
-        add	r12, rax
-        mov	QWORD PTR [rcx+136], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[18] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+144]
-        add	r10, rax
-        mov	QWORD PTR [rcx+144], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[19] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+152]
-        add	r11, rax
-        mov	QWORD PTR [rcx+152], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[20] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+160]
-        add	r12, rax
-        mov	QWORD PTR [rcx+160], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[21] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+168]
-        add	r10, rax
-        mov	QWORD PTR [rcx+168], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[22] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+176]
-        add	r11, rax
-        mov	QWORD PTR [rcx+176], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[23] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+184]
-        add	r12, rax
-        mov	QWORD PTR [rcx+184], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[24] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+192]
-        add	r10, rax
-        mov	QWORD PTR [rcx+192], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[25] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+200]
-        add	r11, rax
-        mov	QWORD PTR [rcx+200], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[26] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+208]
-        add	r12, rax
-        mov	QWORD PTR [rcx+208], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[27] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+216]
-        add	r10, rax
-        mov	QWORD PTR [rcx+216], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[28] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+224]
-        add	r11, rax
-        mov	QWORD PTR [rcx+224], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[29] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+232]
-        add	r12, rax
-        mov	QWORD PTR [rcx+232], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[30] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+240]
-        add	r10, rax
-        mov	QWORD PTR [rcx+240], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[31] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+248]
-        add	r11, rax
-        mov	QWORD PTR [rcx+248], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[32] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+256]
-        add	r12, rax
-        mov	QWORD PTR [rcx+256], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[33] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+264]
-        add	r10, rax
-        mov	QWORD PTR [rcx+264], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[34] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+272]
-        add	r11, rax
-        mov	QWORD PTR [rcx+272], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[35] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+280]
-        add	r12, rax
-        mov	QWORD PTR [rcx+280], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[36] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+288]
-        add	r10, rax
-        mov	QWORD PTR [rcx+288], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[37] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+296]
-        add	r11, rax
-        mov	QWORD PTR [rcx+296], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[38] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+304]
-        add	r12, rax
-        mov	QWORD PTR [rcx+304], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[39] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+312]
-        add	r10, rax
-        mov	QWORD PTR [rcx+312], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[40] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+320]
-        add	r11, rax
-        mov	QWORD PTR [rcx+320], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[41] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+328]
-        add	r12, rax
-        mov	QWORD PTR [rcx+328], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[42] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+336]
-        add	r10, rax
-        mov	QWORD PTR [rcx+336], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[43] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+344]
-        add	r11, rax
-        mov	QWORD PTR [rcx+344], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[44] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+352]
-        add	r12, rax
-        mov	QWORD PTR [rcx+352], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[45] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+360]
-        add	r10, rax
-        mov	QWORD PTR [rcx+360], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[46] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+368]
-        add	r11, rax
-        mov	QWORD PTR [rcx+368], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[47] * B
-        ; Last word of a: no fresh carry register is zeroed here. The high
-        ; half of this product plus the pending carry is stored as the extra
-        ; top word at offset 384.
-        mov	rax, r8
-        mul	QWORD PTR [r9+376]
-        add	r12, rax
-        adc	r10, rdx
-        mov	QWORD PTR [rcx+376], r12
-        mov	QWORD PTR [rcx+384], r10
-        pop	r12
-        ret
-sp_3072_mul_d_48 ENDP
-_text ENDS
-; /* Conditionally subtract b from a using the mask m.
-;  * m is -1 to subtract and 0 when not subtracting.
-;  *
-;  * r  A single precision number representing condition subtract result.
-;  * a  A single precision number to subtract from.
-;  * b  A single precision number to subtract.
-;  * m  Mask value to apply.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_cond_sub_24 PROC
-        ; Register use, as read from the code below:
-        ;   rcx = r  (24 words written)
-        ;   rdx = a  (24 words read)
-        ;   r8  = b  (24 words read, each ANDed with the mask)
-        ;   r9  = m  (mask)
-        ; Returns rax = 0 when the subtraction produced no borrow, all ones
-        ; when it did.
-        ; Scratch: r10, r11, r8 (reused as a temporary once b has been copied).
-        ;
-        ; Reserve 192 bytes (24 x 8) of stack for the masked copy of b.
-        sub	rsp, 192
-        ; Phase 1: tmp[i] = b[i] & m, two words per group. All of b is read
-        ; here, before the first word of r is written in phase 2.
-        mov	r10, QWORD PTR [r8]
-        mov	r11, QWORD PTR [r8+8]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp], r10
-        mov	QWORD PTR [rsp+8], r11
-        mov	r10, QWORD PTR [r8+16]
-        mov	r11, QWORD PTR [r8+24]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+16], r10
-        mov	QWORD PTR [rsp+24], r11
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+32], r10
-        mov	QWORD PTR [rsp+40], r11
-        mov	r10, QWORD PTR [r8+48]
-        mov	r11, QWORD PTR [r8+56]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+48], r10
-        mov	QWORD PTR [rsp+56], r11
-        mov	r10, QWORD PTR [r8+64]
-        mov	r11, QWORD PTR [r8+72]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+64], r10
-        mov	QWORD PTR [rsp+72], r11
-        mov	r10, QWORD PTR [r8+80]
-        mov	r11, QWORD PTR [r8+88]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+80], r10
-        mov	QWORD PTR [rsp+88], r11
-        mov	r10, QWORD PTR [r8+96]
-        mov	r11, QWORD PTR [r8+104]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+96], r10
-        mov	QWORD PTR [rsp+104], r11
-        mov	r10, QWORD PTR [r8+112]
-        mov	r11, QWORD PTR [r8+120]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+112], r10
-        mov	QWORD PTR [rsp+120], r11
-        mov	r10, QWORD PTR [r8+128]
-        mov	r11, QWORD PTR [r8+136]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+128], r10
-        mov	QWORD PTR [rsp+136], r11
-        mov	r10, QWORD PTR [r8+144]
-        mov	r11, QWORD PTR [r8+152]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+144], r10
-        mov	QWORD PTR [rsp+152], r11
-        mov	r10, QWORD PTR [r8+160]
-        mov	r11, QWORD PTR [r8+168]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+160], r10
-        mov	QWORD PTR [rsp+168], r11
-        mov	r10, QWORD PTR [r8+176]
-        mov	r11, QWORD PTR [r8+184]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+176], r10
-        mov	QWORD PTR [rsp+184], r11
-        ; Phase 2: r[i] = a[i] - tmp[i] - borrow. r10 and r11 alternate as the
-        ; result word and r8 holds tmp[i]. Only mov instructions sit between
-        ; the sub/sbb steps, so the carry flag carries the borrow from one
-        ; word to the next. Each result is stored one step late.
-        mov	r10, QWORD PTR [rdx]
-        mov	r8, QWORD PTR [rsp]
-        sub	r10, r8
-        mov	r11, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [rsp+8]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx], r10
-        mov	r10, QWORD PTR [rdx+16]
-        mov	r8, QWORD PTR [rsp+16]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+8], r11
-        mov	r11, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [rsp+24]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+16], r10
-        mov	r10, QWORD PTR [rdx+32]
-        mov	r8, QWORD PTR [rsp+32]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+24], r11
-        mov	r11, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [rsp+40]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+32], r10
-        mov	r10, QWORD PTR [rdx+48]
-        mov	r8, QWORD PTR [rsp+48]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+40], r11
-        mov	r11, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [rsp+56]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+48], r10
-        mov	r10, QWORD PTR [rdx+64]
-        mov	r8, QWORD PTR [rsp+64]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+56], r11
-        mov	r11, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [rsp+72]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+64], r10
-        mov	r10, QWORD PTR [rdx+80]
-        mov	r8, QWORD PTR [rsp+80]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+72], r11
-        mov	r11, QWORD PTR [rdx+88]
-        mov	r8, QWORD PTR [rsp+88]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+80], r10
-        mov	r10, QWORD PTR [rdx+96]
-        mov	r8, QWORD PTR [rsp+96]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+88], r11
-        mov	r11, QWORD PTR [rdx+104]
-        mov	r8, QWORD PTR [rsp+104]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+96], r10
-        mov	r10, QWORD PTR [rdx+112]
-        mov	r8, QWORD PTR [rsp+112]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+104], r11
-        mov	r11, QWORD PTR [rdx+120]
-        mov	r8, QWORD PTR [rsp+120]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+112], r10
-        mov	r10, QWORD PTR [rdx+128]
-        mov	r8, QWORD PTR [rsp+128]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+120], r11
-        mov	r11, QWORD PTR [rdx+136]
-        mov	r8, QWORD PTR [rsp+136]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+128], r10
-        mov	r10, QWORD PTR [rdx+144]
-        mov	r8, QWORD PTR [rsp+144]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+136], r11
-        mov	r11, QWORD PTR [rdx+152]
-        mov	r8, QWORD PTR [rsp+152]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+144], r10
-        mov	r10, QWORD PTR [rdx+160]
-        mov	r8, QWORD PTR [rsp+160]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+152], r11
-        mov	r11, QWORD PTR [rdx+168]
-        mov	r8, QWORD PTR [rsp+168]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+160], r10
-        mov	r10, QWORD PTR [rdx+176]
-        mov	r8, QWORD PTR [rsp+176]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+168], r11
-        mov	r11, QWORD PTR [rdx+184]
-        mov	r8, QWORD PTR [rsp+184]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+176], r10
-        mov	QWORD PTR [rcx+184], r11
-        ; Turn the final borrow into the return value: rax = 0 - CF (0 or -1).
-        sbb	rax, rax
-        ; Releasing the buffer overwrites the flags; the borrow is already in rax.
-        add	rsp, 192
-        ret
-sp_3072_cond_sub_24 ENDP
-_text ENDS
-; /* Reduce the number back to 3072 bits using Montgomery reduction.
-;  *
-;  * a   A single precision number to reduce in place.
-;  * m   The single precision number representing the modulus.
-;  * mp  The digit representing the negative inverse of m mod 2^n.
-;  *
-;  * Register use, as read from the code: rcx = a, rdx = m, r8 = mp.
-;  * The loop runs 24 times; each pass adds mu * m into a[i..i+24] and then
-;  * advances rcx by one word. Afterwards sp_3072_cond_sub_24 is called to
-;  * write (upper 24 words of a) - (m AND mask) into the lower 24 words of a,
-;  * where mask is all ones only when the loop left a carry out of the top word.
-;  *
-;  * NOTE(review): the call is made without reserving shadow space and with
-;  * rsp 8 bytes off 16-byte alignment (six pushes after the return address).
-;  * sp_3072_cond_sub_24 as written in this file only uses plain mov on its
-;  * own stack buffer, so this works; revisit if the callee ever changes.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_mont_reduce_24 PROC
-        ; r12-r15, rdi and rsi are used as scratch below, so save them first.
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        ; mul overwrites rdx, so keep the modulus pointer in r9.
-        mov	r9, rdx
-        ; rsi = carry out of the top word from the previous pass (0 or 1).
-        xor	rsi, rsi
-        ; i = 24
-        mov	r10, 24
-        ; a[i+0] and a[i+1] are cached in r15 and rdi across passes instead
-        ; of being written back to memory every time.
-        mov	r15, QWORD PTR [rcx]
-        mov	rdi, QWORD PTR [rcx+8]
-L_3072_mont_reduce_24_loop:
-        ; mu = a[i] * mp
-        mov	r13, r15
-        imul	r13, r8
-        ; a[i+0] += m[0] * mu
-        ; Only the carry (r12) is kept; r15 is reloaded with the next word.
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9]
-        add	r15, rax
-        adc	r12, rdx
-        ; a[i+1] += m[1] * mu
-        ; The result stays in r15 and becomes a[i+0] of the next pass.
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+8]
-        mov	r15, rdi
-        add	r15, rax
-        adc	r11, rdx
-        add	r15, r12
-        adc	r11, 0
-        ; a[i+2] += m[2] * mu
-        ; The result stays in rdi and becomes a[i+1] of the next pass.
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+16]
-        mov	rdi, QWORD PTR [rcx+16]
-        add	rdi, rax
-        adc	r12, rdx
-        add	rdi, r11
-        adc	r12, 0
-        ; a[i+3] += m[3] * mu
-        ; From here on each word is updated in memory; r11 and r12 alternate
-        ; as the carry into the next word.
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+24]
-        mov	r14, QWORD PTR [rcx+24]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+24], r14
-        adc	r11, 0
-        ; a[i+4] += m[4] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+32]
-        mov	r14, QWORD PTR [rcx+32]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+32], r14
-        adc	r12, 0
-        ; a[i+5] += m[5] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+40]
-        mov	r14, QWORD PTR [rcx+40]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+40], r14
-        adc	r11, 0
-        ; a[i+6] += m[6] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+48]
-        mov	r14, QWORD PTR [rcx+48]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+48], r14
-        adc	r12, 0
-        ; a[i+7] += m[7] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+56]
-        mov	r14, QWORD PTR [rcx+56]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+56], r14
-        adc	r11, 0
-        ; a[i+8] += m[8] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+64]
-        mov	r14, QWORD PTR [rcx+64]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+64], r14
-        adc	r12, 0
-        ; a[i+9] += m[9] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+72]
-        mov	r14, QWORD PTR [rcx+72]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+72], r14
-        adc	r11, 0
-        ; a[i+10] += m[10] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+80]
-        mov	r14, QWORD PTR [rcx+80]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+80], r14
-        adc	r12, 0
-        ; a[i+11] += m[11] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+88]
-        mov	r14, QWORD PTR [rcx+88]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+88], r14
-        adc	r11, 0
-        ; a[i+12] += m[12] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+96]
-        mov	r14, QWORD PTR [rcx+96]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+96], r14
-        adc	r12, 0
-        ; a[i+13] += m[13] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+104]
-        mov	r14, QWORD PTR [rcx+104]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+104], r14
-        adc	r11, 0
-        ; a[i+14] += m[14] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+112]
-        mov	r14, QWORD PTR [rcx+112]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+112], r14
-        adc	r12, 0
-        ; a[i+15] += m[15] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+120]
-        mov	r14, QWORD PTR [rcx+120]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+120], r14
-        adc	r11, 0
-        ; a[i+16] += m[16] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+128]
-        mov	r14, QWORD PTR [rcx+128]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+128], r14
-        adc	r12, 0
-        ; a[i+17] += m[17] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+136]
-        mov	r14, QWORD PTR [rcx+136]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+136], r14
-        adc	r11, 0
-        ; a[i+18] += m[18] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+144]
-        mov	r14, QWORD PTR [rcx+144]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+144], r14
-        adc	r12, 0
-        ; a[i+19] += m[19] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+152]
-        mov	r14, QWORD PTR [rcx+152]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+152], r14
-        adc	r11, 0
-        ; a[i+20] += m[20] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+160]
-        mov	r14, QWORD PTR [rcx+160]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+160], r14
-        adc	r12, 0
-        ; a[i+21] += m[21] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+168]
-        mov	r14, QWORD PTR [rcx+168]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+168], r14
-        adc	r11, 0
-        ; a[i+22] += m[22] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+176]
-        mov	r14, QWORD PTR [rcx+176]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+176], r14
-        adc	r12, 0
-        ; a[i+23] += m[23] * mu
-        ; The high half, plus the carry saved in rsi by the previous pass,
-        ; is added into a[i+24]; rsi then collects this pass's carry out.
-        mov	rax, r13
-        mul	QWORD PTR [r9+184]
-        mov	r14, QWORD PTR [rcx+184]
-        add	r12, rax
-        adc	rdx, rsi
-        ; mov (not xor) is required here: it leaves CF intact for the adc.
-        mov	rsi, 0
-        adc	rsi, 0
-        add	r14, r12
-        mov	QWORD PTR [rcx+184], r14
-        adc	QWORD PTR [rcx+192], rdx
-        adc	rsi, 0
-        ; i -= 1
-        add	rcx, 8
-        dec	r10
-        jnz	L_3072_mont_reduce_24_loop
-        ; rcx now points at word 24 of the original a. Write back the two
-        ; words that were cached in registers.
-        mov	QWORD PTR [rcx], r15
-        mov	QWORD PTR [rcx+8], rdi
-        ; Carry (0 or 1) -> subtraction mask (0 or all ones).
-        neg	rsi
-        ; Arguments for sp_3072_cond_sub_24(r, a, b, m):
-        ;   r8  = b = modulus pointer. It must be copied out of r9 BEFORE r9
-        ;         is overwritten with the mask, otherwise the callee would
-        ;         receive the mask as its b pointer.
-        ;   r9  = m = mask
-        ;   rdx = a = upper half of the working buffer
-        ;   rcx = r = start of the working buffer (24 words = 192 bytes back)
-        mov	r8, r9
-        mov	r9, rsi
-        mov	rdx, rcx
-        sub	rcx, 192
-        call	sp_3072_cond_sub_24
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_3072_mont_reduce_24 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Conditionally subtract b from a using the mask m.
-;  * m is -1 to subtract and 0 when not subtracting.
-;  *
-;  * r  A single precision number representing condition subtract result.
-;  * a  A single precision number to subtract from.
-;  * b  A single precision number to subtract.
-;  * m  Mask value to apply.
-;  *
-;  * Register use, as read from the code: rcx = r, rdx = a, r8 = b, r9 = m.
-;  * Returns rax = 0 when there was no borrow out of the top word, all ones
-;  * when there was.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_cond_sub_avx2_24 PROC
-        ; r12 is used as a third scratch register and restored before ret.
-        push	r12
-        ; Each step loads b[i] and a[i], masks b[i] with pext, stores the
-        ; previous result word, then subtracts with borrow. r10, r11 and r12
-        ; rotate through the roles. pext and mov leave the flags untouched,
-        ; so the borrow survives between the sub/sbb instructions.
-        ; pext (a BMI2 instruction) with an all-ones mask returns the source
-        ; unchanged and with a zero mask returns 0. For any other mask value
-        ; it would compact bits rather than AND them, so m must be 0 or -1 as
-        ; the header states.
-        mov	r12, QWORD PTR [r8]
-        mov	r10, QWORD PTR [rdx]
-        pext	r12, r12, r9
-        sub	r10, r12
-        mov	r12, QWORD PTR [r8+8]
-        mov	r11, QWORD PTR [rdx+8]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+16]
-        mov	r12, QWORD PTR [rdx+16]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+8], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [rdx+24]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+16], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [rdx+32]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+24], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+40]
-        mov	r12, QWORD PTR [rdx+40]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+32], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+48]
-        mov	r10, QWORD PTR [rdx+48]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+40], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+56]
-        mov	r11, QWORD PTR [rdx+56]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+48], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+64]
-        mov	r12, QWORD PTR [rdx+64]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+56], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+72]
-        mov	r10, QWORD PTR [rdx+72]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+64], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+80]
-        mov	r11, QWORD PTR [rdx+80]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+72], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+88]
-        mov	r12, QWORD PTR [rdx+88]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+80], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+96]
-        mov	r10, QWORD PTR [rdx+96]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+88], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+104]
-        mov	r11, QWORD PTR [rdx+104]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+96], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+112]
-        mov	r12, QWORD PTR [rdx+112]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+104], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+120]
-        mov	r10, QWORD PTR [rdx+120]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+112], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+128]
-        mov	r11, QWORD PTR [rdx+128]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+120], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+136]
-        mov	r12, QWORD PTR [rdx+136]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+128], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+144]
-        mov	r10, QWORD PTR [rdx+144]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+136], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+152]
-        mov	r11, QWORD PTR [rdx+152]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+144], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+160]
-        mov	r12, QWORD PTR [rdx+160]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+152], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+168]
-        mov	r10, QWORD PTR [rdx+168]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+160], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+176]
-        mov	r11, QWORD PTR [rdx+176]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+168], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+184]
-        mov	r12, QWORD PTR [rdx+184]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+176], r11
-        sbb	r12, r10
-        mov	QWORD PTR [rcx+184], r12
-        ; Turn the final borrow into the return value: rax = 0 - CF (0 or -1).
-        sbb	rax, rax
-        pop	r12
-        ret
-sp_3072_cond_sub_avx2_24 ENDP
-_text ENDS
-ENDIF
-; /* Mul a by digit b into r. (r = a * b)
-;  *
-;  * r  A single precision integer.
-;  * a  A single precision integer.
-;  * b  A single precision digit.
-;  *
-;  * Register use, as read from the code: rcx = r, rdx = a, r8 = b.
-;  * 24 words of a are read and 25 words of r are written (offsets 0..192).
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_mul_d_24 PROC
-        ; r12 is used as a third accumulator and restored before ret.
-        push	r12
-        ; mul overwrites rdx with the high half of each product, so the
-        ; pointer to a is kept in r9.
-        mov	r9, rdx
-        ; r10, r11 and r12 rotate through three roles: the word being
-        ; finished and stored, the word receiving the high half of the
-        ; product, and a freshly zeroed register that catches the carry.
-        ; A[0] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9]
-        mov	r10, rax
-        mov	r11, rdx
-        mov	QWORD PTR [rcx], r10
-        ; A[1] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+8]
-        add	r11, rax
-        mov	QWORD PTR [rcx+8], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[2] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+16]
-        add	r12, rax
-        mov	QWORD PTR [rcx+16], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[3] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+24]
-        add	r10, rax
-        mov	QWORD PTR [rcx+24], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[4] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+32]
-        add	r11, rax
-        mov	QWORD PTR [rcx+32], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[5] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+40]
-        add	r12, rax
-        mov	QWORD PTR [rcx+40], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[6] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+48]
-        add	r10, rax
-        mov	QWORD PTR [rcx+48], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[7] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+56]
-        add	r11, rax
-        mov	QWORD PTR [rcx+56], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[8] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+64]
-        add	r12, rax
-        mov	QWORD PTR [rcx+64], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[9] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+72]
-        add	r10, rax
-        mov	QWORD PTR [rcx+72], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[10] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+80]
-        add	r11, rax
-        mov	QWORD PTR [rcx+80], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[11] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+88]
-        add	r12, rax
-        mov	QWORD PTR [rcx+88], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[12] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+96]
-        add	r10, rax
-        mov	QWORD PTR [rcx+96], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[13] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+104]
-        add	r11, rax
-        mov	QWORD PTR [rcx+104], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[14] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+112]
-        add	r12, rax
-        mov	QWORD PTR [rcx+112], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[15] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+120]
-        add	r10, rax
-        mov	QWORD PTR [rcx+120], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[16] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+128]
-        add	r11, rax
-        mov	QWORD PTR [rcx+128], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[17] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+136]
-        add	r12, rax
-        mov	QWORD PTR [rcx+136], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[18] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+144]
-        add	r10, rax
-        mov	QWORD PTR [rcx+144], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[19] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+152]
-        add	r11, rax
-        mov	QWORD PTR [rcx+152], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[20] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+160]
-        add	r12, rax
-        mov	QWORD PTR [rcx+160], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[21] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+168]
-        add	r10, rax
-        mov	QWORD PTR [rcx+168], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[22] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+176]
-        add	r11, rax
-        mov	QWORD PTR [rcx+176], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[23] * B
-        ; Last digit: no new carry register is needed; r10 becomes the
-        ; 25th (top) word of the result.
-        mov	rax, r8
-        mul	QWORD PTR [r9+184]
-        add	r12, rax
-        adc	r10, rdx
-        mov	QWORD PTR [rcx+184], r12
-        mov	QWORD PTR [rcx+192], r10
-        pop	r12
-        ret
-sp_3072_mul_d_24 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Mul a by digit b into r. (r = a * b)
-;  *
-;  * r  A single precision integer.
-;  * a  A single precision integer.
-;  * b  A single precision digit.
-;  *
-;  * Register use, as read from the code: rcx = r, rdx = a, r8 = b.
-;  * 24 words of a are read and 25 words of r are written (offsets 0..192).
-;  * Uses mulx (BMI2) and adcx/adox (ADX).
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_mul_d_avx2_24 PROC
-        ; r12 and r13 are used as scratch and restored before ret.
-        push	r12
-        push	r13
-        ; mulx takes one multiplicand implicitly from rdx, so the pointer
-        ; to a moves to rax and the digit b is loaded into rdx.
-        mov	rax, rdx
-        ; A[0] * B
-        mov	rdx, r8
-        ; r13 stays zero for the whole routine; the xor also clears CF and
-        ; OF, which start the adcx and adox chains below. mulx and mov do
-        ; not modify the flags.
-        xor	r13, r13
-        mulx	r12, r11, QWORD PTR [rax]
-        mov	QWORD PTR [rcx], r11
-        ; Per step: mulx gives high half in r10 and low half in r9. The low
-        ; half is added to the pending word with adcx (CF chain) and the
-        ; high half is added into a freshly zeroed register with adox (OF
-        ; chain). r11 and r12 alternate as pending word and next word.
-        ; A[1] * B
-        mulx	r10, r9, QWORD PTR [rax+8]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+8], r12
-        ; A[2] * B
-        mulx	r10, r9, QWORD PTR [rax+16]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+16], r11
-        ; A[3] * B
-        mulx	r10, r9, QWORD PTR [rax+24]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+24], r12
-        ; A[4] * B
-        mulx	r10, r9, QWORD PTR [rax+32]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+32], r11
-        ; A[5] * B
-        mulx	r10, r9, QWORD PTR [rax+40]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+40], r12
-        ; A[6] * B
-        mulx	r10, r9, QWORD PTR [rax+48]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+48], r11
-        ; A[7] * B
-        mulx	r10, r9, QWORD PTR [rax+56]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+56], r12
-        ; A[8] * B
-        mulx	r10, r9, QWORD PTR [rax+64]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+64], r11
-        ; A[9] * B
-        mulx	r10, r9, QWORD PTR [rax+72]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+72], r12
-        ; A[10] * B
-        mulx	r10, r9, QWORD PTR [rax+80]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+80], r11
-        ; A[11] * B
-        mulx	r10, r9, QWORD PTR [rax+88]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+88], r12
-        ; A[12] * B
-        mulx	r10, r9, QWORD PTR [rax+96]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+96], r11
-        ; A[13] * B
-        mulx	r10, r9, QWORD PTR [rax+104]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+104], r12
-        ; A[14] * B
-        mulx	r10, r9, QWORD PTR [rax+112]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+112], r11
-        ; A[15] * B
-        mulx	r10, r9, QWORD PTR [rax+120]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+120], r12
-        ; A[16] * B
-        mulx	r10, r9, QWORD PTR [rax+128]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+128], r11
-        ; A[17] * B
-        mulx	r10, r9, QWORD PTR [rax+136]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+136], r12
-        ; A[18] * B
-        mulx	r10, r9, QWORD PTR [rax+144]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+144], r11
-        ; A[19] * B
-        mulx	r10, r9, QWORD PTR [rax+152]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+152], r12
-        ; A[20] * B
-        mulx	r10, r9, QWORD PTR [rax+160]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+160], r11
-        ; A[21] * B
-        mulx	r10, r9, QWORD PTR [rax+168]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+168], r12
-        ; A[22] * B
-        mulx	r10, r9, QWORD PTR [rax+176]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+176], r11
-        ; A[23] * B
-        mulx	r10, r9, QWORD PTR [rax+184]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        ; Fold the last carry from the adcx chain into the top word
-        ; (r13 is zero, so this adds CF only).
-        adcx	r11, r13
-        mov	QWORD PTR [rcx+184], r12
-        mov	QWORD PTR [rcx+192], r11
-        pop	r13
-        pop	r12
-        ret
-sp_3072_mul_d_avx2_24 ENDP
-_text ENDS
-ENDIF
-IFDEF _WIN64
-; /* Divide the double width number (d1|d0) by a single word divisor.
-;  *
-;  * d1   High word of the number being divided (arrives in rcx).
-;  * d0   Low word of the number being divided (arrives in rdx).
-;  * div  The divisor (arrives in r8).
-;  * returns the quotient of the division in rax.
-;  *
-;  * The hardware DIV faults when the quotient does not fit in 64 bits,
-;  * so callers are expected to pass d1 < div.
-;  */
-_text SEGMENT READONLY PARA
-div_3072_word_asm_24 PROC
-        ; DIV takes its numerator in rdx:rax. d0 is parked in r9 while rdx
-        ; is reloaded with d1, then moved on into rax.
-        mov	r9, rdx
-        mov	rdx, rcx
-        mov	rax, r9
-        ; rax = quotient; rdx = remainder, which this routine does not use.
-        div	r8
-        ret
-div_3072_word_asm_24 ENDP
-_text ENDS
-ENDIF
-; /* Compare a with b in constant time.
-;  *
-;  * a  A single precision integer.
-;  * b  A single precision integer.
-;  * return -ve, 0 or +ve if a is less than, equal to or greater than b
-;  * respectively.
-;  *
-;  * Register use, as read from the code: rcx = a, rdx = b, result in rax
-;  * (-1, 0 or 1). The routine contains no branches and loads all 24 words
-;  * of both operands whatever their values.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_cmp_24 PROC
-        ; r12 is used as scratch and restored before ret.
-        push	r12
-        ; r9  = constant 0
-        ; r8  = "still equal" mask: all ones until the first differing word
-        ;       is seen, zero afterwards
-        ; rax = result accumulator, starts at -1
-        ; r10 = constant 1
-        xor	r9, r9
-        mov	r8, -1
-        mov	rax, -1
-        mov	r10, 1
-        ; Words are compared from most significant (offset 184) down to
-        ; least significant (offset 0). For each word:
-        ;   - both words are ANDed with r8, so once a difference has been
-        ;     found every later pair becomes 0 vs 0 and changes nothing;
-        ;   - sub sets the flags for an unsigned compare;
-        ;   - cmova : a word > b word  -> rax = 1
-        ;   - cmovc : a word < b word  -> rax = r8, which is still -1 at the
-        ;             moment the first difference is found
-        ;   - cmovnz: words differ     -> r8 = 0 (stop considering lower words)
-        mov	r11, QWORD PTR [rcx+184]
-        mov	r12, QWORD PTR [rdx+184]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+176]
-        mov	r12, QWORD PTR [rdx+176]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+168]
-        mov	r12, QWORD PTR [rdx+168]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+160]
-        mov	r12, QWORD PTR [rdx+160]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+152]
-        mov	r12, QWORD PTR [rdx+152]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+144]
-        mov	r12, QWORD PTR [rdx+144]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+136]
-        mov	r12, QWORD PTR [rdx+136]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+128]
-        mov	r12, QWORD PTR [rdx+128]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+120]
-        mov	r12, QWORD PTR [rdx+120]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+112]
-        mov	r12, QWORD PTR [rdx+112]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+104]
-        mov	r12, QWORD PTR [rdx+104]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+96]
-        mov	r12, QWORD PTR [rdx+96]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+88]
-        mov	r12, QWORD PTR [rdx+88]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+80]
-        mov	r12, QWORD PTR [rdx+80]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+72]
-        mov	r12, QWORD PTR [rdx+72]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+64]
-        mov	r12, QWORD PTR [rdx+64]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+56]
-        mov	r12, QWORD PTR [rdx+56]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+48]
-        mov	r12, QWORD PTR [rdx+48]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+40]
-        mov	r12, QWORD PTR [rdx+40]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+32]
-        mov	r12, QWORD PTR [rdx+32]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+24]
-        mov	r12, QWORD PTR [rdx+24]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+16]
-        mov	r12, QWORD PTR [rdx+16]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+8]
-        mov	r12, QWORD PTR [rdx+8]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx]
-        mov	r12, QWORD PTR [rdx]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        ; All words equal: rax = -1 and r8 = -1, so the xor yields 0.
-        ; Otherwise r8 = 0 and rax (1 or -1) passes through unchanged.
-        xor	rax, r8
-        pop	r12
-        ret
-sp_3072_cmp_24 ENDP
-_text ENDS
-IFNDEF WC_NO_CACHE_RESISTANT
-_text SEGMENT READONLY PARA
-sp_3072_get_from_table_24 PROC
-        sub	rsp, 128
-        vmovdqu	OWORD PTR [rsp], xmm6
-        vmovdqu	OWORD PTR [rsp+16], xmm7
-        vmovdqu	OWORD PTR [rsp+32], xmm8
-        vmovdqu	OWORD PTR [rsp+48], xmm9
-        vmovdqu	OWORD PTR [rsp+64], xmm10
-        vmovdqu	OWORD PTR [rsp+80], xmm11
-        vmovdqu	OWORD PTR [rsp+96], xmm12
-        vmovdqu	OWORD PTR [rsp+112], xmm13
-        mov	rax, 1
-        movd	xmm10, r8
-        movd	xmm11, rax
-        pxor	xmm13, xmm13
-        pshufd	xmm11, xmm11, 0
-        pshufd	xmm10, xmm10, 0
-        ; START: 0-7
-        pxor	xmm13, xmm13
-        pxor	xmm4, xmm4
-        pxor	xmm5, xmm5
-        pxor	xmm6, xmm6
-        pxor	xmm7, xmm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 16
-        mov	r9, QWORD PTR [rdx+128]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 17
-        mov	r9, QWORD PTR [rdx+136]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 18
-        mov	r9, QWORD PTR [rdx+144]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 19
-        mov	r9, QWORD PTR [rdx+152]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 20
-        mov	r9, QWORD PTR [rdx+160]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 21
-        mov	r9, QWORD PTR [rdx+168]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 22
-        mov	r9, QWORD PTR [rdx+176]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 23
-        mov	r9, QWORD PTR [rdx+184]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 24
-        mov	r9, QWORD PTR [rdx+192]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 25
-        mov	r9, QWORD PTR [rdx+200]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 26
-        mov	r9, QWORD PTR [rdx+208]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 27
-        mov	r9, QWORD PTR [rdx+216]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 28
-        mov	r9, QWORD PTR [rdx+224]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 29
-        mov	r9, QWORD PTR [rdx+232]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 30
-        mov	r9, QWORD PTR [rdx+240]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 31
-        mov	r9, QWORD PTR [rdx+248]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        movdqu	[rcx], xmm4
-        movdqu	[rcx+16], xmm5
-        movdqu	[rcx+32], xmm6
-        movdqu	[rcx+48], xmm7
-        add	rcx, 64
-        ; END: 0-7
-        ; START: 8-15
-        pxor	xmm13, xmm13
-        pxor	xmm4, xmm4
-        pxor	xmm5, xmm5
-        pxor	xmm6, xmm6
-        pxor	xmm7, xmm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 16
-        mov	r9, QWORD PTR [rdx+128]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 17
-        mov	r9, QWORD PTR [rdx+136]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 18
-        mov	r9, QWORD PTR [rdx+144]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 19
-        mov	r9, QWORD PTR [rdx+152]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 20
-        mov	r9, QWORD PTR [rdx+160]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 21
-        mov	r9, QWORD PTR [rdx+168]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 22
-        mov	r9, QWORD PTR [rdx+176]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 23
-        mov	r9, QWORD PTR [rdx+184]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 24
-        mov	r9, QWORD PTR [rdx+192]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 25
-        mov	r9, QWORD PTR [rdx+200]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 26
-        mov	r9, QWORD PTR [rdx+208]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 27
-        mov	r9, QWORD PTR [rdx+216]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 28
-        mov	r9, QWORD PTR [rdx+224]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 29
-        mov	r9, QWORD PTR [rdx+232]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 30
-        mov	r9, QWORD PTR [rdx+240]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 31
-        mov	r9, QWORD PTR [rdx+248]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        movdqu	[rcx], xmm4
-        movdqu	[rcx+16], xmm5
-        movdqu	[rcx+32], xmm6
-        movdqu	[rcx+48], xmm7
-        add	rcx, 64
-        ; END: 8-15
-        ; START: 16-23
-        pxor	xmm13, xmm13
-        pxor	xmm4, xmm4
-        pxor	xmm5, xmm5
-        pxor	xmm6, xmm6
-        pxor	xmm7, xmm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 16
-        mov	r9, QWORD PTR [rdx+128]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 17
-        mov	r9, QWORD PTR [rdx+136]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 18
-        mov	r9, QWORD PTR [rdx+144]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 19
-        mov	r9, QWORD PTR [rdx+152]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 20
-        mov	r9, QWORD PTR [rdx+160]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 21
-        mov	r9, QWORD PTR [rdx+168]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 22
-        mov	r9, QWORD PTR [rdx+176]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 23
-        mov	r9, QWORD PTR [rdx+184]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 24
-        mov	r9, QWORD PTR [rdx+192]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 25
-        mov	r9, QWORD PTR [rdx+200]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 26
-        mov	r9, QWORD PTR [rdx+208]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 27
-        mov	r9, QWORD PTR [rdx+216]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 28
-        mov	r9, QWORD PTR [rdx+224]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 29
-        mov	r9, QWORD PTR [rdx+232]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 30
-        mov	r9, QWORD PTR [rdx+240]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 31
-        mov	r9, QWORD PTR [rdx+248]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        movdqu	[rcx], xmm4
-        movdqu	[rcx+16], xmm5
-        movdqu	[rcx+32], xmm6
-        movdqu	[rcx+48], xmm7
-        ; END: 16-23
-        vmovdqu	xmm6, OWORD PTR [rsp]
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vmovdqu	xmm8, OWORD PTR [rsp+32]
-        vmovdqu	xmm9, OWORD PTR [rsp+48]
-        vmovdqu	xmm10, OWORD PTR [rsp+64]
-        vmovdqu	xmm11, OWORD PTR [rsp+80]
-        vmovdqu	xmm12, OWORD PTR [rsp+96]
-        vmovdqu	xmm13, OWORD PTR [rsp+112]
-        add	rsp, 128
-        ret
-sp_3072_get_from_table_24 ENDP
-_text ENDS
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-; /* Reduce the number back to 3072 bits using Montgomery reduction.
-;  *
-;  * a   A single precision number to reduce in place.
-;  * m   The single precision number representing the modulus.
-;  * mp  The digit representing the negative inverse of m mod 2^n.
-;  *
-;  * Registers as read by the code: a is taken from rcx, m from rdx and mp
-;  * from r8. NOTE(review): this matches the Microsoft x64 argument order, and
-;  * rdi/rsi are saved as callee-saved registers - confirm against the caller.
-;  *
-;  * The loop runs 24 times (one 64-bit word per pass). Pass i computes
-;  * mu = a[i] * mp and adds m * mu into a[i..i+24], so words a[0..47] are
-;  * touched. Afterwards m, masked by the final carry, is subtracted from
-;  * a[24..47] and the 24-word result is written to a[0..23].
-;  *
-;  * Uses mulx/adcx/adox/pext, so the routine is only assembled when
-;  * HAVE_INTEL_AVX2 is defined.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_mont_reduce_avx2_24 PROC
-        ; Save every general purpose register that is overwritten below and
-        ; restored before ret.
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        push	rbx
-        push	rbp
-        ; r9 = a, r10 = m (rdx is needed as the implicit mulx multiplicand)
-        mov	r9, rcx
-        mov	r10, rdx
-        ; rbp carries the overflow word from one pass to the next; starts at 0
-        xor	rbp, rbp
-        ; i = 24
-        mov	r11, 24
-        ; The four lowest live words a[0..3] are kept in r14, r15, rdi, rsi
-        ; and rotate down by one register on every pass.
-        mov	r14, QWORD PTR [r9]
-        mov	r15, QWORD PTR [r9+8]
-        mov	rdi, QWORD PTR [r9+16]
-        mov	rsi, QWORD PTR [r9+24]
-        ; Bias the pointer by 12 words: [r9-64] is a[i+4], [r9+96] is a[i+24]
-        add	r9, 96
-        ; Repeats the clear of rbp done above (rbp is not written in between)
-        xor	rbp, rbp
-L_3072_mont_reduce_avx2_24_loop:
-        ; mu = a[i] * mp
-        mov	rdx, r14
-        mov	r12, r14
-        imul	rdx, r8
-        ; rbx = 0 for the carry fold at the end of the pass; xor also clears
-        ; CF and OF so both the adcx and the adox chain start without a carry
-        xor	rbx, rbx
-        ; Pattern for every step: mulx gives rcx:rax = m[j] * mu, the low half
-        ; (rax) is added to word j on the CF chain (adcx) and the high half
-        ; (rcx) is added to word j+1 on the OF chain (adox).
-        ; a[i+0] += m[0] * mu
-        mulx	rcx, rax, QWORD PTR [r10]
-        mov	r14, r15
-        ; The sum left in r12 is not stored; only its carry-out is used
-        adcx	r12, rax
-        adox	r14, rcx
-        ; a[i+1] += m[1] * mu
-        mulx	rcx, rax, QWORD PTR [r10+8]
-        mov	r15, rdi
-        adcx	r14, rax
-        adox	r15, rcx
-        ; a[i+2] += m[2] * mu
-        mulx	rcx, rax, QWORD PTR [r10+16]
-        mov	rdi, rsi
-        adcx	r15, rax
-        adox	rdi, rcx
-        ; a[i+3] += m[3] * mu
-        mulx	rcx, rax, QWORD PTR [r10+24]
-        mov	rsi, QWORD PTR [r9+-64]
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; From here on words are loaded from and written back to memory,
-        ; alternating r13 and r12 as the working register.
-        ; a[i+4] += m[4] * mu
-        mulx	rcx, rax, QWORD PTR [r10+32]
-        mov	r13, QWORD PTR [r9+-56]
-        adcx	rsi, rax
-        adox	r13, rcx
-        ; a[i+5] += m[5] * mu
-        mulx	rcx, rax, QWORD PTR [r10+40]
-        mov	r12, QWORD PTR [r9+-48]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-56], r13
-        ; a[i+6] += m[6] * mu
-        mulx	rcx, rax, QWORD PTR [r10+48]
-        mov	r13, QWORD PTR [r9+-40]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-48], r12
-        ; a[i+7] += m[7] * mu
-        mulx	rcx, rax, QWORD PTR [r10+56]
-        mov	r12, QWORD PTR [r9+-32]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-40], r13
-        ; a[i+8] += m[8] * mu
-        mulx	rcx, rax, QWORD PTR [r10+64]
-        mov	r13, QWORD PTR [r9+-24]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-32], r12
-        ; a[i+9] += m[9] * mu
-        mulx	rcx, rax, QWORD PTR [r10+72]
-        mov	r12, QWORD PTR [r9+-16]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-24], r13
-        ; a[i+10] += m[10] * mu
-        mulx	rcx, rax, QWORD PTR [r10+80]
-        mov	r13, QWORD PTR [r9+-8]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-16], r12
-        ; a[i+11] += m[11] * mu
-        mulx	rcx, rax, QWORD PTR [r10+88]
-        mov	r12, QWORD PTR [r9]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-8], r13
-        ; a[i+12] += m[12] * mu
-        mulx	rcx, rax, QWORD PTR [r10+96]
-        mov	r13, QWORD PTR [r9+8]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9], r12
-        ; a[i+13] += m[13] * mu
-        mulx	rcx, rax, QWORD PTR [r10+104]
-        mov	r12, QWORD PTR [r9+16]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+8], r13
-        ; a[i+14] += m[14] * mu
-        mulx	rcx, rax, QWORD PTR [r10+112]
-        mov	r13, QWORD PTR [r9+24]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+16], r12
-        ; a[i+15] += m[15] * mu
-        mulx	rcx, rax, QWORD PTR [r10+120]
-        mov	r12, QWORD PTR [r9+32]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+24], r13
-        ; a[i+16] += m[16] * mu
-        mulx	rcx, rax, QWORD PTR [r10+128]
-        mov	r13, QWORD PTR [r9+40]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+32], r12
-        ; a[i+17] += m[17] * mu
-        mulx	rcx, rax, QWORD PTR [r10+136]
-        mov	r12, QWORD PTR [r9+48]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+40], r13
-        ; a[i+18] += m[18] * mu
-        mulx	rcx, rax, QWORD PTR [r10+144]
-        mov	r13, QWORD PTR [r9+56]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+48], r12
-        ; a[i+19] += m[19] * mu
-        mulx	rcx, rax, QWORD PTR [r10+152]
-        mov	r12, QWORD PTR [r9+64]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+56], r13
-        ; a[i+20] += m[20] * mu
-        mulx	rcx, rax, QWORD PTR [r10+160]
-        mov	r13, QWORD PTR [r9+72]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+64], r12
-        ; a[i+21] += m[21] * mu
-        mulx	rcx, rax, QWORD PTR [r10+168]
-        mov	r12, QWORD PTR [r9+80]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+72], r13
-        ; a[i+22] += m[22] * mu
-        mulx	rcx, rax, QWORD PTR [r10+176]
-        mov	r13, QWORD PTR [r9+88]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+80], r12
-        ; a[i+23] += m[23] * mu
-        mulx	rcx, rax, QWORD PTR [r10+184]
-        mov	r12, QWORD PTR [r9+96]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+88], r13
-        ; a[i+24] += carry word left by the previous pass (plus CF)
-        adcx	r12, rbp
-        ; rbp = 0 + OF + CF: fold both chains into the carry for the next pass
-        mov	rbp, rbx
-        mov	QWORD PTR [r9+96], r12
-        adox	rbp, rbx
-        adcx	rbp, rbx
-        ; a += 1
-        add	r9, 8
-        ; i -= 1
-        sub	r11, 1
-        jnz	L_3072_mont_reduce_avx2_24_loop
-        ; Remove the 12-word bias: r9 now points at a[24]
-        sub	r9, 96
-        ; Turn the final carry word in rbp into a mask via two's-complement
-        ; negation (0 stays 0, 1 becomes all ones)
-        neg	rbp
-        ; r8 = source (a[24..47]), r9 = destination (a[0..23]); mp in r8 is
-        ; no longer needed
-        mov	r8, r9
-        sub	r9, 192
-        ; Masked subtract: pext with an all-ones mask returns m[j] unchanged
-        ; and with a zero mask returns 0. pext and mov leave the flags alone,
-        ; so the borrow from sub/sbb survives between steps.
-        ; Words 0..3 of the source are still in r14, r15, rdi, rsi because the
-        ; loop never wrote them back to memory.
-        mov	rcx, QWORD PTR [r10]
-        mov	rdx, r14
-        pext	rcx, rcx, rbp
-        sub	rdx, rcx
-        mov	rcx, QWORD PTR [r10+8]
-        mov	rax, r15
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+16]
-        mov	rcx, rdi
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+8], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+24]
-        mov	rdx, rsi
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+16], rcx
-        sbb	rdx, rax
-        ; Remaining words are read from memory at r8 (a[28] onwards)
-        mov	rcx, QWORD PTR [r10+32]
-        mov	rax, QWORD PTR [r8+32]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+24], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+40]
-        mov	rcx, QWORD PTR [r8+40]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+32], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+48]
-        mov	rdx, QWORD PTR [r8+48]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+40], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+56]
-        mov	rax, QWORD PTR [r8+56]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+48], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+64]
-        mov	rcx, QWORD PTR [r8+64]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+56], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+72]
-        mov	rdx, QWORD PTR [r8+72]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+64], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+80]
-        mov	rax, QWORD PTR [r8+80]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+72], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+88]
-        mov	rcx, QWORD PTR [r8+88]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+80], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+96]
-        mov	rdx, QWORD PTR [r8+96]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+88], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+104]
-        mov	rax, QWORD PTR [r8+104]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+96], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+112]
-        mov	rcx, QWORD PTR [r8+112]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+104], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+120]
-        mov	rdx, QWORD PTR [r8+120]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+112], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+128]
-        mov	rax, QWORD PTR [r8+128]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+120], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+136]
-        mov	rcx, QWORD PTR [r8+136]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+128], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+144]
-        mov	rdx, QWORD PTR [r8+144]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+136], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+152]
-        mov	rax, QWORD PTR [r8+152]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+144], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+160]
-        mov	rcx, QWORD PTR [r8+160]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+152], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+168]
-        mov	rdx, QWORD PTR [r8+168]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+160], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+176]
-        mov	rax, QWORD PTR [r8+176]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+168], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+184]
-        mov	rcx, QWORD PTR [r8+184]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+176], rax
-        sbb	rcx, rdx
-        mov	QWORD PTR [r9+184], rcx
-        ; Restore the saved registers in reverse push order
-        pop	rbp
-        pop	rbx
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_3072_mont_reduce_avx2_24 ENDP
-_text ENDS
-ENDIF
-IFNDEF WC_NO_CACHE_RESISTANT
-_text SEGMENT READONLY PARA
-sp_3072_get_from_table_avx2_24 PROC
-        sub	rsp, 128
-        vmovdqu	OWORD PTR [rsp], xmm6
-        vmovdqu	OWORD PTR [rsp+16], xmm7
-        vmovdqu	OWORD PTR [rsp+32], xmm8
-        vmovdqu	OWORD PTR [rsp+48], xmm9
-        vmovdqu	OWORD PTR [rsp+64], xmm10
-        vmovdqu	OWORD PTR [rsp+80], xmm11
-        vmovdqu	OWORD PTR [rsp+96], xmm12
-        vmovdqu	OWORD PTR [rsp+112], xmm13
-        mov	rax, 1
-        movd	xmm10, r8
-        movd	xmm11, rax
-        vpxor	ymm13, ymm13, ymm13
-        vpermd	ymm10, ymm13, ymm10
-        vpermd	ymm11, ymm13, ymm11
-        ; START: 0-15
-        vpxor	ymm13, ymm13, ymm13
-        vpxor	ymm4, ymm4, ymm4
-        vpxor	ymm5, ymm5, ymm5
-        vpxor	ymm6, ymm6, ymm6
-        vpxor	ymm7, ymm7, ymm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 16
-        mov	r9, QWORD PTR [rdx+128]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 17
-        mov	r9, QWORD PTR [rdx+136]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 18
-        mov	r9, QWORD PTR [rdx+144]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 19
-        mov	r9, QWORD PTR [rdx+152]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 20
-        mov	r9, QWORD PTR [rdx+160]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 21
-        mov	r9, QWORD PTR [rdx+168]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 22
-        mov	r9, QWORD PTR [rdx+176]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 23
-        mov	r9, QWORD PTR [rdx+184]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 24
-        mov	r9, QWORD PTR [rdx+192]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 25
-        mov	r9, QWORD PTR [rdx+200]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 26
-        mov	r9, QWORD PTR [rdx+208]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 27
-        mov	r9, QWORD PTR [rdx+216]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 28
-        mov	r9, QWORD PTR [rdx+224]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 29
-        mov	r9, QWORD PTR [rdx+232]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 30
-        mov	r9, QWORD PTR [rdx+240]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 31
-        mov	r9, QWORD PTR [rdx+248]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        vmovdqu	YMMWORD PTR [rcx], ymm4
-        vmovdqu	YMMWORD PTR [rcx+32], ymm5
-        vmovdqu	YMMWORD PTR [rcx+64], ymm6
-        vmovdqu	YMMWORD PTR [rcx+96], ymm7
-        add	rcx, 128
-        ; END: 0-15
-        ; START: 16-23
-        vpxor	ymm13, ymm13, ymm13
-        vpxor	ymm4, ymm4, ymm4
-        vpxor	ymm5, ymm5, ymm5
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 16
-        mov	r9, QWORD PTR [rdx+128]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 17
-        mov	r9, QWORD PTR [rdx+136]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 18
-        mov	r9, QWORD PTR [rdx+144]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 19
-        mov	r9, QWORD PTR [rdx+152]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 20
-        mov	r9, QWORD PTR [rdx+160]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 21
-        mov	r9, QWORD PTR [rdx+168]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 22
-        mov	r9, QWORD PTR [rdx+176]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 23
-        mov	r9, QWORD PTR [rdx+184]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 24
-        mov	r9, QWORD PTR [rdx+192]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 25
-        mov	r9, QWORD PTR [rdx+200]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 26
-        mov	r9, QWORD PTR [rdx+208]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 27
-        mov	r9, QWORD PTR [rdx+216]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 28
-        mov	r9, QWORD PTR [rdx+224]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 29
-        mov	r9, QWORD PTR [rdx+232]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 30
-        mov	r9, QWORD PTR [rdx+240]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 31
-        mov	r9, QWORD PTR [rdx+248]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpaddd	ymm13, ymm13, ymm11
-        vmovdqu	YMMWORD PTR [rcx], ymm4
-        vmovdqu	YMMWORD PTR [rcx+32], ymm5
-        ; END: 16-23
-        vmovdqu	xmm6, OWORD PTR [rsp]
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vmovdqu	xmm8, OWORD PTR [rsp+32]
-        vmovdqu	xmm9, OWORD PTR [rsp+48]
-        vmovdqu	xmm10, OWORD PTR [rsp+64]
-        vmovdqu	xmm11, OWORD PTR [rsp+80]
-        vmovdqu	xmm12, OWORD PTR [rsp+96]
-        vmovdqu	xmm13, OWORD PTR [rsp+112]
-        add	rsp, 128
-        ret
-sp_3072_get_from_table_avx2_24 ENDP
-_text ENDS
-ENDIF
-; /* Conditionally subtract b from a using the mask m.
-;  * m is -1 to subtract and 0 when not subtracting.
-;  *
-;  * r  A single precision number representing the conditional subtract result.
-;  * a  A single precision number to subtract from.
-;  * b  A single precision number to subtract.
-;  * m  Mask value to apply.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_cond_sub_48 PROC
-        sub	rsp, 384
-        mov	r10, QWORD PTR [r8]
-        mov	r11, QWORD PTR [r8+8]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp], r10
-        mov	QWORD PTR [rsp+8], r11
-        mov	r10, QWORD PTR [r8+16]
-        mov	r11, QWORD PTR [r8+24]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+16], r10
-        mov	QWORD PTR [rsp+24], r11
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+32], r10
-        mov	QWORD PTR [rsp+40], r11
-        mov	r10, QWORD PTR [r8+48]
-        mov	r11, QWORD PTR [r8+56]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+48], r10
-        mov	QWORD PTR [rsp+56], r11
-        mov	r10, QWORD PTR [r8+64]
-        mov	r11, QWORD PTR [r8+72]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+64], r10
-        mov	QWORD PTR [rsp+72], r11
-        mov	r10, QWORD PTR [r8+80]
-        mov	r11, QWORD PTR [r8+88]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+80], r10
-        mov	QWORD PTR [rsp+88], r11
-        mov	r10, QWORD PTR [r8+96]
-        mov	r11, QWORD PTR [r8+104]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+96], r10
-        mov	QWORD PTR [rsp+104], r11
-        mov	r10, QWORD PTR [r8+112]
-        mov	r11, QWORD PTR [r8+120]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+112], r10
-        mov	QWORD PTR [rsp+120], r11
-        mov	r10, QWORD PTR [r8+128]
-        mov	r11, QWORD PTR [r8+136]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+128], r10
-        mov	QWORD PTR [rsp+136], r11
-        mov	r10, QWORD PTR [r8+144]
-        mov	r11, QWORD PTR [r8+152]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+144], r10
-        mov	QWORD PTR [rsp+152], r11
-        mov	r10, QWORD PTR [r8+160]
-        mov	r11, QWORD PTR [r8+168]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+160], r10
-        mov	QWORD PTR [rsp+168], r11
-        mov	r10, QWORD PTR [r8+176]
-        mov	r11, QWORD PTR [r8+184]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+176], r10
-        mov	QWORD PTR [rsp+184], r11
-        mov	r10, QWORD PTR [r8+192]
-        mov	r11, QWORD PTR [r8+200]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+192], r10
-        mov	QWORD PTR [rsp+200], r11
-        mov	r10, QWORD PTR [r8+208]
-        mov	r11, QWORD PTR [r8+216]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+208], r10
-        mov	QWORD PTR [rsp+216], r11
-        mov	r10, QWORD PTR [r8+224]
-        mov	r11, QWORD PTR [r8+232]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+224], r10
-        mov	QWORD PTR [rsp+232], r11
-        mov	r10, QWORD PTR [r8+240]
-        mov	r11, QWORD PTR [r8+248]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+240], r10
-        mov	QWORD PTR [rsp+248], r11
-        mov	r10, QWORD PTR [r8+256]
-        mov	r11, QWORD PTR [r8+264]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+256], r10
-        mov	QWORD PTR [rsp+264], r11
-        mov	r10, QWORD PTR [r8+272]
-        mov	r11, QWORD PTR [r8+280]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+272], r10
-        mov	QWORD PTR [rsp+280], r11
-        mov	r10, QWORD PTR [r8+288]
-        mov	r11, QWORD PTR [r8+296]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+288], r10
-        mov	QWORD PTR [rsp+296], r11
-        mov	r10, QWORD PTR [r8+304]
-        mov	r11, QWORD PTR [r8+312]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+304], r10
-        mov	QWORD PTR [rsp+312], r11
-        mov	r10, QWORD PTR [r8+320]
-        mov	r11, QWORD PTR [r8+328]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+320], r10
-        mov	QWORD PTR [rsp+328], r11
-        mov	r10, QWORD PTR [r8+336]
-        mov	r11, QWORD PTR [r8+344]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+336], r10
-        mov	QWORD PTR [rsp+344], r11
-        mov	r10, QWORD PTR [r8+352]
-        mov	r11, QWORD PTR [r8+360]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+352], r10
-        mov	QWORD PTR [rsp+360], r11
-        mov	r10, QWORD PTR [r8+368]
-        mov	r11, QWORD PTR [r8+376]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+368], r10
-        mov	QWORD PTR [rsp+376], r11
-        mov	r10, QWORD PTR [rdx]
-        mov	r8, QWORD PTR [rsp]
-        sub	r10, r8
-        mov	r11, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [rsp+8]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx], r10
-        mov	r10, QWORD PTR [rdx+16]
-        mov	r8, QWORD PTR [rsp+16]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+8], r11
-        mov	r11, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [rsp+24]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+16], r10
-        mov	r10, QWORD PTR [rdx+32]
-        mov	r8, QWORD PTR [rsp+32]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+24], r11
-        mov	r11, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [rsp+40]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+32], r10
-        mov	r10, QWORD PTR [rdx+48]
-        mov	r8, QWORD PTR [rsp+48]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+40], r11
-        mov	r11, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [rsp+56]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+48], r10
-        mov	r10, QWORD PTR [rdx+64]
-        mov	r8, QWORD PTR [rsp+64]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+56], r11
-        mov	r11, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [rsp+72]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+64], r10
-        mov	r10, QWORD PTR [rdx+80]
-        mov	r8, QWORD PTR [rsp+80]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+72], r11
-        mov	r11, QWORD PTR [rdx+88]
-        mov	r8, QWORD PTR [rsp+88]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+80], r10
-        mov	r10, QWORD PTR [rdx+96]
-        mov	r8, QWORD PTR [rsp+96]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+88], r11
-        mov	r11, QWORD PTR [rdx+104]
-        mov	r8, QWORD PTR [rsp+104]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+96], r10
-        mov	r10, QWORD PTR [rdx+112]
-        mov	r8, QWORD PTR [rsp+112]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+104], r11
-        mov	r11, QWORD PTR [rdx+120]
-        mov	r8, QWORD PTR [rsp+120]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+112], r10
-        mov	r10, QWORD PTR [rdx+128]
-        mov	r8, QWORD PTR [rsp+128]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+120], r11
-        mov	r11, QWORD PTR [rdx+136]
-        mov	r8, QWORD PTR [rsp+136]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+128], r10
-        mov	r10, QWORD PTR [rdx+144]
-        mov	r8, QWORD PTR [rsp+144]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+136], r11
-        mov	r11, QWORD PTR [rdx+152]
-        mov	r8, QWORD PTR [rsp+152]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+144], r10
-        mov	r10, QWORD PTR [rdx+160]
-        mov	r8, QWORD PTR [rsp+160]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+152], r11
-        mov	r11, QWORD PTR [rdx+168]
-        mov	r8, QWORD PTR [rsp+168]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+160], r10
-        mov	r10, QWORD PTR [rdx+176]
-        mov	r8, QWORD PTR [rsp+176]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+168], r11
-        mov	r11, QWORD PTR [rdx+184]
-        mov	r8, QWORD PTR [rsp+184]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+176], r10
-        mov	r10, QWORD PTR [rdx+192]
-        mov	r8, QWORD PTR [rsp+192]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+184], r11
-        mov	r11, QWORD PTR [rdx+200]
-        mov	r8, QWORD PTR [rsp+200]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+192], r10
-        mov	r10, QWORD PTR [rdx+208]
-        mov	r8, QWORD PTR [rsp+208]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+200], r11
-        mov	r11, QWORD PTR [rdx+216]
-        mov	r8, QWORD PTR [rsp+216]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+208], r10
-        mov	r10, QWORD PTR [rdx+224]
-        mov	r8, QWORD PTR [rsp+224]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+216], r11
-        mov	r11, QWORD PTR [rdx+232]
-        mov	r8, QWORD PTR [rsp+232]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+224], r10
-        mov	r10, QWORD PTR [rdx+240]
-        mov	r8, QWORD PTR [rsp+240]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+232], r11
-        mov	r11, QWORD PTR [rdx+248]
-        mov	r8, QWORD PTR [rsp+248]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+240], r10
-        mov	r10, QWORD PTR [rdx+256]
-        mov	r8, QWORD PTR [rsp+256]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+248], r11
-        mov	r11, QWORD PTR [rdx+264]
-        mov	r8, QWORD PTR [rsp+264]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+256], r10
-        mov	r10, QWORD PTR [rdx+272]
-        mov	r8, QWORD PTR [rsp+272]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+264], r11
-        mov	r11, QWORD PTR [rdx+280]
-        mov	r8, QWORD PTR [rsp+280]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+272], r10
-        mov	r10, QWORD PTR [rdx+288]
-        mov	r8, QWORD PTR [rsp+288]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+280], r11
-        mov	r11, QWORD PTR [rdx+296]
-        mov	r8, QWORD PTR [rsp+296]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+288], r10
-        mov	r10, QWORD PTR [rdx+304]
-        mov	r8, QWORD PTR [rsp+304]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+296], r11
-        mov	r11, QWORD PTR [rdx+312]
-        mov	r8, QWORD PTR [rsp+312]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+304], r10
-        mov	r10, QWORD PTR [rdx+320]
-        mov	r8, QWORD PTR [rsp+320]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+312], r11
-        mov	r11, QWORD PTR [rdx+328]
-        mov	r8, QWORD PTR [rsp+328]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+320], r10
-        mov	r10, QWORD PTR [rdx+336]
-        mov	r8, QWORD PTR [rsp+336]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+328], r11
-        mov	r11, QWORD PTR [rdx+344]
-        mov	r8, QWORD PTR [rsp+344]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+336], r10
-        mov	r10, QWORD PTR [rdx+352]
-        mov	r8, QWORD PTR [rsp+352]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+344], r11
-        mov	r11, QWORD PTR [rdx+360]
-        mov	r8, QWORD PTR [rsp+360]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+352], r10
-        mov	r10, QWORD PTR [rdx+368]
-        mov	r8, QWORD PTR [rsp+368]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+360], r11
-        mov	r11, QWORD PTR [rdx+376]
-        mov	r8, QWORD PTR [rsp+376]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+368], r10
-        mov	QWORD PTR [rcx+376], r11
-        sbb	rax, rax
-        add	rsp, 384
-        ret
-sp_3072_cond_sub_48 ENDP
-_text ENDS
-; /* Reduce the number back to 3072 bits using Montgomery reduction.
-;  *
-;  * a   A single precision number to reduce in place. (rcx)
-;  * m   The single precision number representing the modulus. (rdx)
-;  * mp  The digit representing the negative inverse of m mod 2^n. (r8)
-;  *
-;  * Register use inside the loop:
-;  *   r9       m (rdx is overwritten by every mul)
-;  *   r10      remaining iterations, counts down from 48
-;  *   r13      mu for the current iteration
-;  *   r11/r12  alternate as the carry word between consecutive columns
-;  *   r14      scratch for the word of a being updated
-;  *   r15/rdi  the two lowest live words of a, kept in registers
-;  *   rsi      carry out of the top word, carried into the next iteration
-;  *
-;  * After the loop the upper 48 words of a are handed to
-;  * sp_3072_cond_sub_48 together with m and a mask of -carry; the result is
-;  * written to a[0..47].
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_mont_reduce_48 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        ; mul writes rdx, so keep m in r9
-        mov	r9, rdx
-        ; no carry into the first iteration
-        xor	rsi, rsi
-        ; i = 48
-        mov	r10, 48
-        ; cache a[0] and a[1] in registers
-        mov	r15, QWORD PTR [rcx]
-        mov	rdi, QWORD PTR [rcx+8]
-L_3072_mont_reduce_48_loop:
-        ; mu = a[i] * mp
-        mov	r13, r15
-        imul	r13, r8
-        ; a[i+0] += m[0] * mu
-        ; (only the carry in r12 is kept, the low word is not stored)
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9]
-        add	r15, rax
-        adc	r12, rdx
-        ; a[i+1] += m[1] * mu
-        ; (result stays in r15: it is a[i+0] of the next iteration)
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+8]
-        mov	r15, rdi
-        add	r15, rax
-        adc	r11, rdx
-        add	r15, r12
-        adc	r11, 0
-        ; a[i+2] += m[2] * mu
-        ; (result stays in rdi: it is a[i+1] of the next iteration)
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+16]
-        mov	rdi, QWORD PTR [rcx+16]
-        add	rdi, rax
-        adc	r12, rdx
-        add	rdi, r11
-        adc	r12, 0
-        ; a[i+3] += m[3] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+24]
-        mov	r14, QWORD PTR [rcx+24]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+24], r14
-        adc	r11, 0
-        ; a[i+4] += m[4] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+32]
-        mov	r14, QWORD PTR [rcx+32]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+32], r14
-        adc	r12, 0
-        ; a[i+5] += m[5] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+40]
-        mov	r14, QWORD PTR [rcx+40]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+40], r14
-        adc	r11, 0
-        ; a[i+6] += m[6] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+48]
-        mov	r14, QWORD PTR [rcx+48]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+48], r14
-        adc	r12, 0
-        ; a[i+7] += m[7] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+56]
-        mov	r14, QWORD PTR [rcx+56]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+56], r14
-        adc	r11, 0
-        ; a[i+8] += m[8] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+64]
-        mov	r14, QWORD PTR [rcx+64]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+64], r14
-        adc	r12, 0
-        ; a[i+9] += m[9] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+72]
-        mov	r14, QWORD PTR [rcx+72]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+72], r14
-        adc	r11, 0
-        ; a[i+10] += m[10] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+80]
-        mov	r14, QWORD PTR [rcx+80]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+80], r14
-        adc	r12, 0
-        ; a[i+11] += m[11] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+88]
-        mov	r14, QWORD PTR [rcx+88]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+88], r14
-        adc	r11, 0
-        ; a[i+12] += m[12] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+96]
-        mov	r14, QWORD PTR [rcx+96]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+96], r14
-        adc	r12, 0
-        ; a[i+13] += m[13] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+104]
-        mov	r14, QWORD PTR [rcx+104]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+104], r14
-        adc	r11, 0
-        ; a[i+14] += m[14] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+112]
-        mov	r14, QWORD PTR [rcx+112]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+112], r14
-        adc	r12, 0
-        ; a[i+15] += m[15] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+120]
-        mov	r14, QWORD PTR [rcx+120]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+120], r14
-        adc	r11, 0
-        ; a[i+16] += m[16] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+128]
-        mov	r14, QWORD PTR [rcx+128]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+128], r14
-        adc	r12, 0
-        ; a[i+17] += m[17] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+136]
-        mov	r14, QWORD PTR [rcx+136]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+136], r14
-        adc	r11, 0
-        ; a[i+18] += m[18] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+144]
-        mov	r14, QWORD PTR [rcx+144]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+144], r14
-        adc	r12, 0
-        ; a[i+19] += m[19] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+152]
-        mov	r14, QWORD PTR [rcx+152]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+152], r14
-        adc	r11, 0
-        ; a[i+20] += m[20] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+160]
-        mov	r14, QWORD PTR [rcx+160]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+160], r14
-        adc	r12, 0
-        ; a[i+21] += m[21] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+168]
-        mov	r14, QWORD PTR [rcx+168]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+168], r14
-        adc	r11, 0
-        ; a[i+22] += m[22] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+176]
-        mov	r14, QWORD PTR [rcx+176]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+176], r14
-        adc	r12, 0
-        ; a[i+23] += m[23] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+184]
-        mov	r14, QWORD PTR [rcx+184]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+184], r14
-        adc	r11, 0
-        ; a[i+24] += m[24] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+192]
-        mov	r14, QWORD PTR [rcx+192]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+192], r14
-        adc	r12, 0
-        ; a[i+25] += m[25] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+200]
-        mov	r14, QWORD PTR [rcx+200]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+200], r14
-        adc	r11, 0
-        ; a[i+26] += m[26] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+208]
-        mov	r14, QWORD PTR [rcx+208]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+208], r14
-        adc	r12, 0
-        ; a[i+27] += m[27] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+216]
-        mov	r14, QWORD PTR [rcx+216]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+216], r14
-        adc	r11, 0
-        ; a[i+28] += m[28] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+224]
-        mov	r14, QWORD PTR [rcx+224]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+224], r14
-        adc	r12, 0
-        ; a[i+29] += m[29] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+232]
-        mov	r14, QWORD PTR [rcx+232]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+232], r14
-        adc	r11, 0
-        ; a[i+30] += m[30] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+240]
-        mov	r14, QWORD PTR [rcx+240]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+240], r14
-        adc	r12, 0
-        ; a[i+31] += m[31] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+248]
-        mov	r14, QWORD PTR [rcx+248]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+248], r14
-        adc	r11, 0
-        ; a[i+32] += m[32] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+256]
-        mov	r14, QWORD PTR [rcx+256]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+256], r14
-        adc	r12, 0
-        ; a[i+33] += m[33] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+264]
-        mov	r14, QWORD PTR [rcx+264]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+264], r14
-        adc	r11, 0
-        ; a[i+34] += m[34] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+272]
-        mov	r14, QWORD PTR [rcx+272]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+272], r14
-        adc	r12, 0
-        ; a[i+35] += m[35] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+280]
-        mov	r14, QWORD PTR [rcx+280]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+280], r14
-        adc	r11, 0
-        ; a[i+36] += m[36] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+288]
-        mov	r14, QWORD PTR [rcx+288]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+288], r14
-        adc	r12, 0
-        ; a[i+37] += m[37] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+296]
-        mov	r14, QWORD PTR [rcx+296]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+296], r14
-        adc	r11, 0
-        ; a[i+38] += m[38] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+304]
-        mov	r14, QWORD PTR [rcx+304]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+304], r14
-        adc	r12, 0
-        ; a[i+39] += m[39] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+312]
-        mov	r14, QWORD PTR [rcx+312]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+312], r14
-        adc	r11, 0
-        ; a[i+40] += m[40] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+320]
-        mov	r14, QWORD PTR [rcx+320]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+320], r14
-        adc	r12, 0
-        ; a[i+41] += m[41] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+328]
-        mov	r14, QWORD PTR [rcx+328]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+328], r14
-        adc	r11, 0
-        ; a[i+42] += m[42] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+336]
-        mov	r14, QWORD PTR [rcx+336]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+336], r14
-        adc	r12, 0
-        ; a[i+43] += m[43] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+344]
-        mov	r14, QWORD PTR [rcx+344]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+344], r14
-        adc	r11, 0
-        ; a[i+44] += m[44] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+352]
-        mov	r14, QWORD PTR [rcx+352]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+352], r14
-        adc	r12, 0
-        ; a[i+45] += m[45] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+360]
-        mov	r14, QWORD PTR [rcx+360]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+360], r14
-        adc	r11, 0
-        ; a[i+46] += m[46] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+368]
-        mov	r14, QWORD PTR [rcx+368]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+368], r14
-        adc	r12, 0
-        ; a[i+47] += m[47] * mu
-        ; (also folds in the carry kept in rsi and collects the new one)
-        mov	rax, r13
-        mul	QWORD PTR [r9+376]
-        mov	r14, QWORD PTR [rcx+376]
-        add	r12, rax
-        adc	rdx, rsi
-        ; mov leaves the flags alone, so the carry of the adc above survives
-        mov	rsi, 0
-        adc	rsi, 0
-        add	r14, r12
-        mov	QWORD PTR [rcx+376], r14
-        adc	QWORD PTR [rcx+384], rdx
-        adc	rsi, 0
-        ; i -= 1
-        add	rcx, 8
-        dec	r10
-        jnz	L_3072_mont_reduce_48_loop
-        ; rcx now points 48 words past the original a; write back the two
-        ; words that were kept in registers
-        mov	QWORD PTR [rcx], r15
-        mov	QWORD PTR [rcx+8], rdi
-        ; mask = 0 - carry
-        neg	rsi
-        ; b = m, m = mask. m has to be copied out of r9 before r9 is
-        ; overwritten with the mask; this order is required whether or not
-        ; _WIN64 is defined because the callee reads b from r8 and the mask
-        ; from r9.
-        mov	r8, r9
-        mov	r9, rsi
-        ; a (source) = upper 48 words, r (destination) = original a
-        mov	rdx, rcx
-        mov	rcx, rcx
-        sub	rcx, 384
-        call	sp_3072_cond_sub_48
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_3072_mont_reduce_48 ENDP
-_text ENDS
-; /* Sub b from a into r. (r = a - b)
-;  *
-;  * Subtracts 48 64-bit words, starting at offset 0 and propagating the
-;  * borrow upwards through the carry flag.
-;  *
-;  * r  A single precision integer. (rcx)
-;  * a  A single precision integer. (rdx)
-;  * b  A single precision integer. (r8)
-;  * returns in rax 0 when no borrow came out of the last word, -1 otherwise.
-;  * r9 and r10 are used as scratch and are not preserved.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_sub_48 PROC
-        ; Word 0 uses sub to start the borrow chain; every later word uses
-        ; sbb. The loads and stores placed between them are mov instructions,
-        ; which do not modify the flags, so the borrow is carried through.
-        mov	r9, QWORD PTR [rdx]
-        sub	r9, QWORD PTR [r8]
-        mov	r10, QWORD PTR [rdx+8]
-        mov	QWORD PTR [rcx], r9
-        sbb	r10, QWORD PTR [r8+8]
-        mov	r9, QWORD PTR [rdx+16]
-        mov	QWORD PTR [rcx+8], r10
-        sbb	r9, QWORD PTR [r8+16]
-        mov	r10, QWORD PTR [rdx+24]
-        mov	QWORD PTR [rcx+16], r9
-        sbb	r10, QWORD PTR [r8+24]
-        mov	r9, QWORD PTR [rdx+32]
-        mov	QWORD PTR [rcx+24], r10
-        sbb	r9, QWORD PTR [r8+32]
-        mov	r10, QWORD PTR [rdx+40]
-        mov	QWORD PTR [rcx+32], r9
-        sbb	r10, QWORD PTR [r8+40]
-        mov	r9, QWORD PTR [rdx+48]
-        mov	QWORD PTR [rcx+40], r10
-        sbb	r9, QWORD PTR [r8+48]
-        mov	r10, QWORD PTR [rdx+56]
-        mov	QWORD PTR [rcx+48], r9
-        sbb	r10, QWORD PTR [r8+56]
-        mov	r9, QWORD PTR [rdx+64]
-        mov	QWORD PTR [rcx+56], r10
-        sbb	r9, QWORD PTR [r8+64]
-        mov	r10, QWORD PTR [rdx+72]
-        mov	QWORD PTR [rcx+64], r9
-        sbb	r10, QWORD PTR [r8+72]
-        mov	r9, QWORD PTR [rdx+80]
-        mov	QWORD PTR [rcx+72], r10
-        sbb	r9, QWORD PTR [r8+80]
-        mov	r10, QWORD PTR [rdx+88]
-        mov	QWORD PTR [rcx+80], r9
-        sbb	r10, QWORD PTR [r8+88]
-        mov	r9, QWORD PTR [rdx+96]
-        mov	QWORD PTR [rcx+88], r10
-        sbb	r9, QWORD PTR [r8+96]
-        mov	r10, QWORD PTR [rdx+104]
-        mov	QWORD PTR [rcx+96], r9
-        sbb	r10, QWORD PTR [r8+104]
-        mov	r9, QWORD PTR [rdx+112]
-        mov	QWORD PTR [rcx+104], r10
-        sbb	r9, QWORD PTR [r8+112]
-        mov	r10, QWORD PTR [rdx+120]
-        mov	QWORD PTR [rcx+112], r9
-        sbb	r10, QWORD PTR [r8+120]
-        mov	r9, QWORD PTR [rdx+128]
-        mov	QWORD PTR [rcx+120], r10
-        sbb	r9, QWORD PTR [r8+128]
-        mov	r10, QWORD PTR [rdx+136]
-        mov	QWORD PTR [rcx+128], r9
-        sbb	r10, QWORD PTR [r8+136]
-        mov	r9, QWORD PTR [rdx+144]
-        mov	QWORD PTR [rcx+136], r10
-        sbb	r9, QWORD PTR [r8+144]
-        mov	r10, QWORD PTR [rdx+152]
-        mov	QWORD PTR [rcx+144], r9
-        sbb	r10, QWORD PTR [r8+152]
-        mov	r9, QWORD PTR [rdx+160]
-        mov	QWORD PTR [rcx+152], r10
-        sbb	r9, QWORD PTR [r8+160]
-        mov	r10, QWORD PTR [rdx+168]
-        mov	QWORD PTR [rcx+160], r9
-        sbb	r10, QWORD PTR [r8+168]
-        mov	r9, QWORD PTR [rdx+176]
-        mov	QWORD PTR [rcx+168], r10
-        sbb	r9, QWORD PTR [r8+176]
-        mov	r10, QWORD PTR [rdx+184]
-        mov	QWORD PTR [rcx+176], r9
-        sbb	r10, QWORD PTR [r8+184]
-        mov	r9, QWORD PTR [rdx+192]
-        mov	QWORD PTR [rcx+184], r10
-        sbb	r9, QWORD PTR [r8+192]
-        mov	r10, QWORD PTR [rdx+200]
-        mov	QWORD PTR [rcx+192], r9
-        sbb	r10, QWORD PTR [r8+200]
-        mov	r9, QWORD PTR [rdx+208]
-        mov	QWORD PTR [rcx+200], r10
-        sbb	r9, QWORD PTR [r8+208]
-        mov	r10, QWORD PTR [rdx+216]
-        mov	QWORD PTR [rcx+208], r9
-        sbb	r10, QWORD PTR [r8+216]
-        mov	r9, QWORD PTR [rdx+224]
-        mov	QWORD PTR [rcx+216], r10
-        sbb	r9, QWORD PTR [r8+224]
-        mov	r10, QWORD PTR [rdx+232]
-        mov	QWORD PTR [rcx+224], r9
-        sbb	r10, QWORD PTR [r8+232]
-        mov	r9, QWORD PTR [rdx+240]
-        mov	QWORD PTR [rcx+232], r10
-        sbb	r9, QWORD PTR [r8+240]
-        mov	r10, QWORD PTR [rdx+248]
-        mov	QWORD PTR [rcx+240], r9
-        sbb	r10, QWORD PTR [r8+248]
-        mov	r9, QWORD PTR [rdx+256]
-        mov	QWORD PTR [rcx+248], r10
-        sbb	r9, QWORD PTR [r8+256]
-        mov	r10, QWORD PTR [rdx+264]
-        mov	QWORD PTR [rcx+256], r9
-        sbb	r10, QWORD PTR [r8+264]
-        mov	r9, QWORD PTR [rdx+272]
-        mov	QWORD PTR [rcx+264], r10
-        sbb	r9, QWORD PTR [r8+272]
-        mov	r10, QWORD PTR [rdx+280]
-        mov	QWORD PTR [rcx+272], r9
-        sbb	r10, QWORD PTR [r8+280]
-        mov	r9, QWORD PTR [rdx+288]
-        mov	QWORD PTR [rcx+280], r10
-        sbb	r9, QWORD PTR [r8+288]
-        mov	r10, QWORD PTR [rdx+296]
-        mov	QWORD PTR [rcx+288], r9
-        sbb	r10, QWORD PTR [r8+296]
-        mov	r9, QWORD PTR [rdx+304]
-        mov	QWORD PTR [rcx+296], r10
-        sbb	r9, QWORD PTR [r8+304]
-        mov	r10, QWORD PTR [rdx+312]
-        mov	QWORD PTR [rcx+304], r9
-        sbb	r10, QWORD PTR [r8+312]
-        mov	r9, QWORD PTR [rdx+320]
-        mov	QWORD PTR [rcx+312], r10
-        sbb	r9, QWORD PTR [r8+320]
-        mov	r10, QWORD PTR [rdx+328]
-        mov	QWORD PTR [rcx+320], r9
-        sbb	r10, QWORD PTR [r8+328]
-        mov	r9, QWORD PTR [rdx+336]
-        mov	QWORD PTR [rcx+328], r10
-        sbb	r9, QWORD PTR [r8+336]
-        mov	r10, QWORD PTR [rdx+344]
-        mov	QWORD PTR [rcx+336], r9
-        sbb	r10, QWORD PTR [r8+344]
-        mov	r9, QWORD PTR [rdx+352]
-        mov	QWORD PTR [rcx+344], r10
-        sbb	r9, QWORD PTR [r8+352]
-        mov	r10, QWORD PTR [rdx+360]
-        mov	QWORD PTR [rcx+352], r9
-        sbb	r10, QWORD PTR [r8+360]
-        mov	r9, QWORD PTR [rdx+368]
-        mov	QWORD PTR [rcx+360], r10
-        sbb	r9, QWORD PTR [r8+368]
-        mov	r10, QWORD PTR [rdx+376]
-        mov	QWORD PTR [rcx+368], r9
-        sbb	r10, QWORD PTR [r8+376]
-        mov	QWORD PTR [rcx+376], r10
-        ; rax = rax - rax - borrow: 0 without a final borrow, -1 with one
-        sbb	rax, rax
-        ret
-sp_3072_sub_48 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Mul a by digit b into r. (r = a * b)
-;  *
-;  * r  A single precision integer. (rcx) Words 0..48 are written.
-;  * a  A single precision integer. (rdx) Words 0..47 are read.
-;  * b  A single precision digit. (r8)
-;  *
-;  * Each step multiplies one word of a by b with mulx (low word in r9, high
-;  * word in r10), adds the low word to the running column with adcx (carry
-;  * flag chain) and forms the next column from the high word with adox
-;  * (overflow flag chain). r11 and r12 swap roles every step; r13 holds zero.
-;  * r12 and r13 are saved and restored; r9, r10, r11 and rax are scratch.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_mul_d_avx2_48 PROC
-        push	r12
-        push	r13
-        ; mulx takes one factor implicitly from rdx, so a moves to rax
-        mov	rax, rdx
-        ; A[0] * B
-        mov	rdx, r8
-        ; r13 = 0; the xor also clears CF and OF, which starts both chains
-        xor	r13, r13
-        mulx	r12, r11, QWORD PTR [rax]
-        mov	QWORD PTR [rcx], r11
-        ; A[1] * B
-        mulx	r10, r9, QWORD PTR [rax+8]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+8], r12
-        ; A[2] * B
-        mulx	r10, r9, QWORD PTR [rax+16]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+16], r11
-        ; A[3] * B
-        mulx	r10, r9, QWORD PTR [rax+24]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+24], r12
-        ; A[4] * B
-        mulx	r10, r9, QWORD PTR [rax+32]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+32], r11
-        ; A[5] * B
-        mulx	r10, r9, QWORD PTR [rax+40]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+40], r12
-        ; A[6] * B
-        mulx	r10, r9, QWORD PTR [rax+48]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+48], r11
-        ; A[7] * B
-        mulx	r10, r9, QWORD PTR [rax+56]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+56], r12
-        ; A[8] * B
-        mulx	r10, r9, QWORD PTR [rax+64]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+64], r11
-        ; A[9] * B
-        mulx	r10, r9, QWORD PTR [rax+72]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+72], r12
-        ; A[10] * B
-        mulx	r10, r9, QWORD PTR [rax+80]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+80], r11
-        ; A[11] * B
-        mulx	r10, r9, QWORD PTR [rax+88]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+88], r12
-        ; A[12] * B
-        mulx	r10, r9, QWORD PTR [rax+96]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+96], r11
-        ; A[13] * B
-        mulx	r10, r9, QWORD PTR [rax+104]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+104], r12
-        ; A[14] * B
-        mulx	r10, r9, QWORD PTR [rax+112]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+112], r11
-        ; A[15] * B
-        mulx	r10, r9, QWORD PTR [rax+120]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+120], r12
-        ; A[16] * B
-        mulx	r10, r9, QWORD PTR [rax+128]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+128], r11
-        ; A[17] * B
-        mulx	r10, r9, QWORD PTR [rax+136]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+136], r12
-        ; A[18] * B
-        mulx	r10, r9, QWORD PTR [rax+144]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+144], r11
-        ; A[19] * B
-        mulx	r10, r9, QWORD PTR [rax+152]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+152], r12
-        ; A[20] * B
-        mulx	r10, r9, QWORD PTR [rax+160]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+160], r11
-        ; A[21] * B
-        mulx	r10, r9, QWORD PTR [rax+168]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+168], r12
-        ; A[22] * B
-        mulx	r10, r9, QWORD PTR [rax+176]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+176], r11
-        ; A[23] * B
-        mulx	r10, r9, QWORD PTR [rax+184]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+184], r12
-        ; A[24] * B
-        mulx	r10, r9, QWORD PTR [rax+192]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+192], r11
-        ; A[25] * B
-        mulx	r10, r9, QWORD PTR [rax+200]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+200], r12
-        ; A[26] * B
-        mulx	r10, r9, QWORD PTR [rax+208]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+208], r11
-        ; A[27] * B
-        mulx	r10, r9, QWORD PTR [rax+216]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+216], r12
-        ; A[28] * B
-        mulx	r10, r9, QWORD PTR [rax+224]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+224], r11
-        ; A[29] * B
-        mulx	r10, r9, QWORD PTR [rax+232]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+232], r12
-        ; A[30] * B
-        mulx	r10, r9, QWORD PTR [rax+240]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+240], r11
-        ; A[31] * B
-        mulx	r10, r9, QWORD PTR [rax+248]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+248], r12
-        ; A[32] * B
-        mulx	r10, r9, QWORD PTR [rax+256]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+256], r11
-        ; A[33] * B
-        mulx	r10, r9, QWORD PTR [rax+264]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+264], r12
-        ; A[34] * B
-        mulx	r10, r9, QWORD PTR [rax+272]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+272], r11
-        ; A[35] * B
-        mulx	r10, r9, QWORD PTR [rax+280]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+280], r12
-        ; A[36] * B
-        mulx	r10, r9, QWORD PTR [rax+288]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+288], r11
-        ; A[37] * B
-        mulx	r10, r9, QWORD PTR [rax+296]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+296], r12
-        ; A[38] * B
-        mulx	r10, r9, QWORD PTR [rax+304]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+304], r11
-        ; A[39] * B
-        mulx	r10, r9, QWORD PTR [rax+312]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+312], r12
-        ; A[40] * B
-        mulx	r10, r9, QWORD PTR [rax+320]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+320], r11
-        ; A[41] * B
-        mulx	r10, r9, QWORD PTR [rax+328]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+328], r12
-        ; A[42] * B
-        mulx	r10, r9, QWORD PTR [rax+336]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+336], r11
-        ; A[43] * B
-        mulx	r10, r9, QWORD PTR [rax+344]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+344], r12
-        ; A[44] * B
-        mulx	r10, r9, QWORD PTR [rax+352]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+352], r11
-        ; A[45] * B
-        mulx	r10, r9, QWORD PTR [rax+360]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+360], r12
-        ; A[46] * B
-        mulx	r10, r9, QWORD PTR [rax+368]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+368], r11
-        ; A[47] * B
-        mulx	r10, r9, QWORD PTR [rax+376]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        ; fold the last carry of the adcx chain into the top word
-        adcx	r11, r13
-        mov	QWORD PTR [rcx+376], r12
-        mov	QWORD PTR [rcx+384], r11
-        pop	r13
-        pop	r12
-        ret
-sp_3072_mul_d_avx2_48 ENDP
-_text ENDS
-ENDIF
-IFDEF _WIN64
-; /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
-;  *
-;  * d1   The high order half of the number to divide. (rcx)
-;  * d0   The low order half of the number to divide. (rdx)
-;  * div  The divisor. (r8)
-;  * returns the quotient in rax; the div instruction leaves the remainder
-;  * in rdx. r9 is left holding d0.
-;  *
-;  * The div instruction raises a divide error when div is zero or when the
-;  * quotient does not fit in 64 bits, so callers must pass d1 < div.
-;  */
-_text SEGMENT READONLY PARA
-div_3072_word_asm_48 PROC
-        ; park d0 in r9 so rdx can take the high half
-        mov	r9, rdx
-        ; rdx:rax = d1:d0
-        mov	rdx, rcx
-        mov	rax, r9
-        div	r8
-        ret
-div_3072_word_asm_48 ENDP
-_text ENDS
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-; /* Conditionally subtract b from a using the mask m.
-;  * m is -1 to subtract and 0 when not subtracting.
-;  *
-;  * r  A single precision number representing condition subtract result.
-;  * a  A single precision number to subtract from.
-;  * b  A single precision number to subtract.
-;  * m  Mask value to apply.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_cond_sub_avx2_48 PROC
-        push	r12
-        mov	r12, QWORD PTR [r8]
-        mov	r10, QWORD PTR [rdx]
-        pext	r12, r12, r9
-        sub	r10, r12
-        mov	r12, QWORD PTR [r8+8]
-        mov	r11, QWORD PTR [rdx+8]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+16]
-        mov	r12, QWORD PTR [rdx+16]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+8], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [rdx+24]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+16], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [rdx+32]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+24], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+40]
-        mov	r12, QWORD PTR [rdx+40]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+32], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+48]
-        mov	r10, QWORD PTR [rdx+48]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+40], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+56]
-        mov	r11, QWORD PTR [rdx+56]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+48], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+64]
-        mov	r12, QWORD PTR [rdx+64]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+56], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+72]
-        mov	r10, QWORD PTR [rdx+72]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+64], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+80]
-        mov	r11, QWORD PTR [rdx+80]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+72], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+88]
-        mov	r12, QWORD PTR [rdx+88]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+80], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+96]
-        mov	r10, QWORD PTR [rdx+96]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+88], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+104]
-        mov	r11, QWORD PTR [rdx+104]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+96], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+112]
-        mov	r12, QWORD PTR [rdx+112]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+104], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+120]
-        mov	r10, QWORD PTR [rdx+120]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+112], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+128]
-        mov	r11, QWORD PTR [rdx+128]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+120], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+136]
-        mov	r12, QWORD PTR [rdx+136]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+128], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+144]
-        mov	r10, QWORD PTR [rdx+144]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+136], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+152]
-        mov	r11, QWORD PTR [rdx+152]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+144], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+160]
-        mov	r12, QWORD PTR [rdx+160]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+152], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+168]
-        mov	r10, QWORD PTR [rdx+168]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+160], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+176]
-        mov	r11, QWORD PTR [rdx+176]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+168], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+184]
-        mov	r12, QWORD PTR [rdx+184]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+176], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+192]
-        mov	r10, QWORD PTR [rdx+192]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+184], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+200]
-        mov	r11, QWORD PTR [rdx+200]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+192], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+208]
-        mov	r12, QWORD PTR [rdx+208]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+200], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+216]
-        mov	r10, QWORD PTR [rdx+216]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+208], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+224]
-        mov	r11, QWORD PTR [rdx+224]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+216], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+232]
-        mov	r12, QWORD PTR [rdx+232]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+224], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+240]
-        mov	r10, QWORD PTR [rdx+240]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+232], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+248]
-        mov	r11, QWORD PTR [rdx+248]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+240], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+256]
-        mov	r12, QWORD PTR [rdx+256]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+248], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+264]
-        mov	r10, QWORD PTR [rdx+264]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+256], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+272]
-        mov	r11, QWORD PTR [rdx+272]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+264], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+280]
-        mov	r12, QWORD PTR [rdx+280]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+272], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+288]
-        mov	r10, QWORD PTR [rdx+288]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+280], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+296]
-        mov	r11, QWORD PTR [rdx+296]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+288], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+304]
-        mov	r12, QWORD PTR [rdx+304]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+296], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+312]
-        mov	r10, QWORD PTR [rdx+312]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+304], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+320]
-        mov	r11, QWORD PTR [rdx+320]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+312], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+328]
-        mov	r12, QWORD PTR [rdx+328]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+320], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+336]
-        mov	r10, QWORD PTR [rdx+336]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+328], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+344]
-        mov	r11, QWORD PTR [rdx+344]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+336], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+352]
-        mov	r12, QWORD PTR [rdx+352]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+344], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+360]
-        mov	r10, QWORD PTR [rdx+360]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+352], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+368]
-        mov	r11, QWORD PTR [rdx+368]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+360], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+376]
-        mov	r12, QWORD PTR [rdx+376]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+368], r11
-        sbb	r12, r10
-        mov	QWORD PTR [rcx+376], r12
-        sbb	rax, rax
-        pop	r12
-        ret
-sp_3072_cond_sub_avx2_48 ENDP
-_text ENDS
-ENDIF
-; /* Compare a with b in constant time.
-;  * Branch-free: all 48 64-bit words are always read, from byte offset 376 down to 0, regardless of where a and b differ.
-;  * a  A single precision integer (pointer in rcx).
-;  * b  A single precision integer (pointer in rdx).
-;  * return -1, 0 or 1 in rax if a is less than, equal to or greater than b
-;  * respectively (words compared as unsigned values, highest offset first).
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_cmp_48 PROC
-        push	r12  ; r12 is used as scratch for the b word; restored before ret
-        xor	r9, r9  ; r9 = 0, the value used to clear the mask
-        mov	r8, -1  ; r8 = mask: all ones until the first differing word is seen, then 0
-        mov	rax, -1  ; rax = result accumulator; -1 here is turned into 0 by the final xor if nothing differs
-        mov	r10, 1  ; r10 = constant 1, the "a is greater" result
-        mov	r11, QWORD PTR [rcx+376]  ; word of a at the highest offset
-        mov	r12, QWORD PTR [rdx+376]  ; word of b at the highest offset
-        and	r11, r8  ; once the mask is 0 both words become 0, so later groups change nothing
-        and	r12, r8
-        sub	r11, r12  ; sets CF/ZF for the unsigned comparison of the two words
-        cmova	rax, r10  ; a word above b word: result = 1
-        cmovc	rax, r8  ; a word below b word: result = r8, which is still -1 whenever words can differ
-        cmovnz	r8, r9  ; words differ: clear the mask so the remaining words are ignored
-        mov	r11, QWORD PTR [rcx+368]  ; the same seven-instruction group repeats for each lower word
-        mov	r12, QWORD PTR [rdx+368]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+360]
-        mov	r12, QWORD PTR [rdx+360]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+352]
-        mov	r12, QWORD PTR [rdx+352]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+344]
-        mov	r12, QWORD PTR [rdx+344]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+336]
-        mov	r12, QWORD PTR [rdx+336]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+328]
-        mov	r12, QWORD PTR [rdx+328]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+320]
-        mov	r12, QWORD PTR [rdx+320]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+312]
-        mov	r12, QWORD PTR [rdx+312]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+304]
-        mov	r12, QWORD PTR [rdx+304]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+296]
-        mov	r12, QWORD PTR [rdx+296]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+288]
-        mov	r12, QWORD PTR [rdx+288]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+280]
-        mov	r12, QWORD PTR [rdx+280]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+272]
-        mov	r12, QWORD PTR [rdx+272]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+264]
-        mov	r12, QWORD PTR [rdx+264]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+256]
-        mov	r12, QWORD PTR [rdx+256]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+248]
-        mov	r12, QWORD PTR [rdx+248]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+240]
-        mov	r12, QWORD PTR [rdx+240]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+232]
-        mov	r12, QWORD PTR [rdx+232]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+224]
-        mov	r12, QWORD PTR [rdx+224]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+216]
-        mov	r12, QWORD PTR [rdx+216]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+208]
-        mov	r12, QWORD PTR [rdx+208]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+200]
-        mov	r12, QWORD PTR [rdx+200]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+192]
-        mov	r12, QWORD PTR [rdx+192]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+184]
-        mov	r12, QWORD PTR [rdx+184]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+176]
-        mov	r12, QWORD PTR [rdx+176]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+168]
-        mov	r12, QWORD PTR [rdx+168]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+160]
-        mov	r12, QWORD PTR [rdx+160]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+152]
-        mov	r12, QWORD PTR [rdx+152]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+144]
-        mov	r12, QWORD PTR [rdx+144]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+136]
-        mov	r12, QWORD PTR [rdx+136]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+128]
-        mov	r12, QWORD PTR [rdx+128]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+120]
-        mov	r12, QWORD PTR [rdx+120]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+112]
-        mov	r12, QWORD PTR [rdx+112]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+104]
-        mov	r12, QWORD PTR [rdx+104]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+96]
-        mov	r12, QWORD PTR [rdx+96]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+88]
-        mov	r12, QWORD PTR [rdx+88]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+80]
-        mov	r12, QWORD PTR [rdx+80]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+72]
-        mov	r12, QWORD PTR [rdx+72]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+64]
-        mov	r12, QWORD PTR [rdx+64]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+56]
-        mov	r12, QWORD PTR [rdx+56]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+48]
-        mov	r12, QWORD PTR [rdx+48]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+40]
-        mov	r12, QWORD PTR [rdx+40]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+32]
-        mov	r12, QWORD PTR [rdx+32]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+24]
-        mov	r12, QWORD PTR [rdx+24]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+16]
-        mov	r12, QWORD PTR [rdx+16]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+8]
-        mov	r12, QWORD PTR [rdx+8]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx]  ; word of a at offset 0, compared last
-        mov	r12, QWORD PTR [rdx]  ; word of b at offset 0, compared last
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        xor	rax, r8  ; all words equal: r8 is still -1, so -1 xor -1 = 0; otherwise r8 is 0 and rax keeps 1 or -1
-        pop	r12
-        ret
-sp_3072_cmp_48 ENDP
-_text ENDS
-IFNDEF WC_NO_CACHE_RESISTANT
-_text SEGMENT READONLY PARA
-sp_3072_get_from_table_48 PROC
-        sub	rsp, 128
-        vmovdqu	OWORD PTR [rsp], xmm6
-        vmovdqu	OWORD PTR [rsp+16], xmm7
-        vmovdqu	OWORD PTR [rsp+32], xmm8
-        vmovdqu	OWORD PTR [rsp+48], xmm9
-        vmovdqu	OWORD PTR [rsp+64], xmm10
-        vmovdqu	OWORD PTR [rsp+80], xmm11
-        vmovdqu	OWORD PTR [rsp+96], xmm12
-        vmovdqu	OWORD PTR [rsp+112], xmm13
-        mov	rax, 1
-        movd	xmm10, r8
-        movd	xmm11, rax
-        pxor	xmm13, xmm13
-        pshufd	xmm11, xmm11, 0
-        pshufd	xmm10, xmm10, 0
-        ; START: 0-7
-        pxor	xmm13, xmm13
-        pxor	xmm4, xmm4
-        pxor	xmm5, xmm5
-        pxor	xmm6, xmm6
-        pxor	xmm7, xmm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        movdqu	[rcx], xmm4
-        movdqu	[rcx+16], xmm5
-        movdqu	[rcx+32], xmm6
-        movdqu	[rcx+48], xmm7
-        add	rcx, 64
-        ; END: 0-7
-        ; START: 8-15
-        pxor	xmm13, xmm13
-        pxor	xmm4, xmm4
-        pxor	xmm5, xmm5
-        pxor	xmm6, xmm6
-        pxor	xmm7, xmm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        movdqu	[rcx], xmm4
-        movdqu	[rcx+16], xmm5
-        movdqu	[rcx+32], xmm6
-        movdqu	[rcx+48], xmm7
-        add	rcx, 64
-        ; END: 8-15
-        ; START: 16-23
-        pxor	xmm13, xmm13
-        pxor	xmm4, xmm4
-        pxor	xmm5, xmm5
-        pxor	xmm6, xmm6
-        pxor	xmm7, xmm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        movdqu	[rcx], xmm4
-        movdqu	[rcx+16], xmm5
-        movdqu	[rcx+32], xmm6
-        movdqu	[rcx+48], xmm7
-        add	rcx, 64
-        ; END: 16-23
-        ; START: 24-31
-        pxor	xmm13, xmm13
-        pxor	xmm4, xmm4
-        pxor	xmm5, xmm5
-        pxor	xmm6, xmm6
-        pxor	xmm7, xmm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        movdqu	[rcx], xmm4
-        movdqu	[rcx+16], xmm5
-        movdqu	[rcx+32], xmm6
-        movdqu	[rcx+48], xmm7
-        add	rcx, 64
-        ; END: 24-31
-        ; START: 32-39
-        pxor	xmm13, xmm13
-        pxor	xmm4, xmm4
-        pxor	xmm5, xmm5
-        pxor	xmm6, xmm6
-        pxor	xmm7, xmm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        movdqu	[rcx], xmm4
-        movdqu	[rcx+16], xmm5
-        movdqu	[rcx+32], xmm6
-        movdqu	[rcx+48], xmm7
-        add	rcx, 64
-        ; END: 32-39
-        ; START: 40-47
-        pxor	xmm13, xmm13
-        pxor	xmm4, xmm4
-        pxor	xmm5, xmm5
-        pxor	xmm6, xmm6
-        pxor	xmm7, xmm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        movdqu	[rcx], xmm4
-        movdqu	[rcx+16], xmm5
-        movdqu	[rcx+32], xmm6
-        movdqu	[rcx+48], xmm7
-        ; END: 40-47
-        vmovdqu	xmm6, OWORD PTR [rsp]
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vmovdqu	xmm8, OWORD PTR [rsp+32]
-        vmovdqu	xmm9, OWORD PTR [rsp+48]
-        vmovdqu	xmm10, OWORD PTR [rsp+64]
-        vmovdqu	xmm11, OWORD PTR [rsp+80]
-        vmovdqu	xmm12, OWORD PTR [rsp+96]
-        vmovdqu	xmm13, OWORD PTR [rsp+112]
-        add	rsp, 128
-        ret
-sp_3072_get_from_table_48 ENDP
-_text ENDS
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-; /* Reduce the number back to 3072 bits using Montgomery reduction.
-;  * Arguments are read from rcx (a), rdx (m) and r8 (mp); r12-r15, rdi, rsi, rbx and rbp are pushed on entry and popped before ret.
-;  * a   A single precision number to reduce in place (the loop touches a[0]..a[95]; the final result is stored to a[0]..a[47]).
-;  * m   The single precision number representing the modulus (48 words, m[0]..m[47], are read).
-;  * mp  The digit representing the negative inverse of m mod 2^n.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_mont_reduce_avx2_48 PROC
-        push	r12  ; save every register that the pops before ret restore
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        push	rbx
-        push	rbp
-        mov	r9, rcx  ; r9 = a
-        mov	r10, rdx  ; r10 = m (rdx is reused below as the mulx multiplicand)
-        xor	rbp, rbp  ; rbp = carry into the top word, initially 0
-        ; i = 48 (loop counter in r11, counts down to 0)
-        mov	r11, 48
-        mov	r14, QWORD PTR [r9]  ; r14, r15, rdi, rsi cache a[i+0..i+3] across iterations
-        mov	r15, QWORD PTR [r9+8]
-        mov	rdi, QWORD PTR [r9+16]
-        mov	rsi, QWORD PTR [r9+24]
-        add	r9, 192  ; bias the pointer: a[i+k] is addressed as [r9 + 8*k - 192]
-        xor	rbp, rbp  ; repeats the clear above; rbp is still 0 here
-L_3072_mont_reduce_avx2_48_loop:
-        ; mu = a[i] * mp
-        mov	rdx, r14
-        mov	r12, r14  ; r12 = a[i]; only the carry out of its sum is kept
-        imul	rdx, r8  ; rdx = low 64 bits of a[i] * mp, used implicitly by every mulx below
-        xor	rbx, rbx  ; rbx = 0; also clears CF and OF ahead of the adcx/adox chains
-        ; a[i+0] += m[0] * mu
-        mulx	rcx, rax, QWORD PTR [r10]  ; rcx:rax = high:low of m[0] * mu
-        mov	r14, r15  ; slide the cached window down by one word
-        adcx	r12, rax  ; low halves are accumulated on the CF chain
-        adox	r14, rcx  ; high halves are accumulated on the OF chain
-        ; a[i+1] += m[1] * mu
-        mulx	rcx, rax, QWORD PTR [r10+8]
-        mov	r15, rdi
-        adcx	r14, rax
-        adox	r15, rcx
-        ; a[i+2] += m[2] * mu
-        mulx	rcx, rax, QWORD PTR [r10+16]
-        mov	rdi, rsi
-        adcx	r15, rax
-        adox	rdi, rcx
-        ; a[i+3] += m[3] * mu
-        mulx	rcx, rax, QWORD PTR [r10+24]
-        mov	rsi, QWORD PTR [r9+-160]  ; a[i+4] enters the cached window
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; a[i+4] += m[4] * mu
-        mulx	rcx, rax, QWORD PTR [r10+32]
-        mov	r13, QWORD PTR [r9+-152]  ; from here on r12/r13 alternate as load/accumulate/store temporaries
-        adcx	rsi, rax
-        adox	r13, rcx
-        ; a[i+5] += m[5] * mu
-        mulx	rcx, rax, QWORD PTR [r10+40]
-        mov	r12, QWORD PTR [r9+-144]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-152], r13
-        ; a[i+6] += m[6] * mu
-        mulx	rcx, rax, QWORD PTR [r10+48]
-        mov	r13, QWORD PTR [r9+-136]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-144], r12
-        ; a[i+7] += m[7] * mu
-        mulx	rcx, rax, QWORD PTR [r10+56]
-        mov	r12, QWORD PTR [r9+-128]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-136], r13
-        ; a[i+8] += m[8] * mu
-        mulx	rcx, rax, QWORD PTR [r10+64]
-        mov	r13, QWORD PTR [r9+-120]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-128], r12
-        ; a[i+9] += m[9] * mu
-        mulx	rcx, rax, QWORD PTR [r10+72]
-        mov	r12, QWORD PTR [r9+-112]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-120], r13
-        ; a[i+10] += m[10] * mu
-        mulx	rcx, rax, QWORD PTR [r10+80]
-        mov	r13, QWORD PTR [r9+-104]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-112], r12
-        ; a[i+11] += m[11] * mu
-        mulx	rcx, rax, QWORD PTR [r10+88]
-        mov	r12, QWORD PTR [r9+-96]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-104], r13
-        ; a[i+12] += m[12] * mu
-        mulx	rcx, rax, QWORD PTR [r10+96]
-        mov	r13, QWORD PTR [r9+-88]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-96], r12
-        ; a[i+13] += m[13] * mu
-        mulx	rcx, rax, QWORD PTR [r10+104]
-        mov	r12, QWORD PTR [r9+-80]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-88], r13
-        ; a[i+14] += m[14] * mu
-        mulx	rcx, rax, QWORD PTR [r10+112]
-        mov	r13, QWORD PTR [r9+-72]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-80], r12
-        ; a[i+15] += m[15] * mu
-        mulx	rcx, rax, QWORD PTR [r10+120]
-        mov	r12, QWORD PTR [r9+-64]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-72], r13
-        ; a[i+16] += m[16] * mu
-        mulx	rcx, rax, QWORD PTR [r10+128]
-        mov	r13, QWORD PTR [r9+-56]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-64], r12
-        ; a[i+17] += m[17] * mu
-        mulx	rcx, rax, QWORD PTR [r10+136]
-        mov	r12, QWORD PTR [r9+-48]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-56], r13
-        ; a[i+18] += m[18] * mu
-        mulx	rcx, rax, QWORD PTR [r10+144]
-        mov	r13, QWORD PTR [r9+-40]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-48], r12
-        ; a[i+19] += m[19] * mu
-        mulx	rcx, rax, QWORD PTR [r10+152]
-        mov	r12, QWORD PTR [r9+-32]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-40], r13
-        ; a[i+20] += m[20] * mu
-        mulx	rcx, rax, QWORD PTR [r10+160]
-        mov	r13, QWORD PTR [r9+-24]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-32], r12
-        ; a[i+21] += m[21] * mu
-        mulx	rcx, rax, QWORD PTR [r10+168]
-        mov	r12, QWORD PTR [r9+-16]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-24], r13
-        ; a[i+22] += m[22] * mu
-        mulx	rcx, rax, QWORD PTR [r10+176]
-        mov	r13, QWORD PTR [r9+-8]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-16], r12
-        ; a[i+23] += m[23] * mu
-        mulx	rcx, rax, QWORD PTR [r10+184]
-        mov	r12, QWORD PTR [r9]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-8], r13
-        ; a[i+24] += m[24] * mu
-        mulx	rcx, rax, QWORD PTR [r10+192]
-        mov	r13, QWORD PTR [r9+8]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9], r12
-        ; a[i+25] += m[25] * mu
-        mulx	rcx, rax, QWORD PTR [r10+200]
-        mov	r12, QWORD PTR [r9+16]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+8], r13
-        ; a[i+26] += m[26] * mu
-        mulx	rcx, rax, QWORD PTR [r10+208]
-        mov	r13, QWORD PTR [r9+24]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+16], r12
-        ; a[i+27] += m[27] * mu
-        mulx	rcx, rax, QWORD PTR [r10+216]
-        mov	r12, QWORD PTR [r9+32]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+24], r13
-        ; a[i+28] += m[28] * mu
-        mulx	rcx, rax, QWORD PTR [r10+224]
-        mov	r13, QWORD PTR [r9+40]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+32], r12
-        ; a[i+29] += m[29] * mu
-        mulx	rcx, rax, QWORD PTR [r10+232]
-        mov	r12, QWORD PTR [r9+48]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+40], r13
-        ; a[i+30] += m[30] * mu
-        mulx	rcx, rax, QWORD PTR [r10+240]
-        mov	r13, QWORD PTR [r9+56]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+48], r12
-        ; a[i+31] += m[31] * mu
-        mulx	rcx, rax, QWORD PTR [r10+248]
-        mov	r12, QWORD PTR [r9+64]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+56], r13
-        ; a[i+32] += m[32] * mu
-        mulx	rcx, rax, QWORD PTR [r10+256]
-        mov	r13, QWORD PTR [r9+72]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+64], r12
-        ; a[i+33] += m[33] * mu
-        mulx	rcx, rax, QWORD PTR [r10+264]
-        mov	r12, QWORD PTR [r9+80]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+72], r13
-        ; a[i+34] += m[34] * mu
-        mulx	rcx, rax, QWORD PTR [r10+272]
-        mov	r13, QWORD PTR [r9+88]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+80], r12
-        ; a[i+35] += m[35] * mu
-        mulx	rcx, rax, QWORD PTR [r10+280]
-        mov	r12, QWORD PTR [r9+96]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+88], r13
-        ; a[i+36] += m[36] * mu
-        mulx	rcx, rax, QWORD PTR [r10+288]
-        mov	r13, QWORD PTR [r9+104]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+96], r12
-        ; a[i+37] += m[37] * mu
-        mulx	rcx, rax, QWORD PTR [r10+296]
-        mov	r12, QWORD PTR [r9+112]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+104], r13
-        ; a[i+38] += m[38] * mu
-        mulx	rcx, rax, QWORD PTR [r10+304]
-        mov	r13, QWORD PTR [r9+120]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+112], r12
-        ; a[i+39] += m[39] * mu
-        mulx	rcx, rax, QWORD PTR [r10+312]
-        mov	r12, QWORD PTR [r9+128]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+120], r13
-        ; a[i+40] += m[40] * mu
-        mulx	rcx, rax, QWORD PTR [r10+320]
-        mov	r13, QWORD PTR [r9+136]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+128], r12
-        ; a[i+41] += m[41] * mu
-        mulx	rcx, rax, QWORD PTR [r10+328]
-        mov	r12, QWORD PTR [r9+144]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+136], r13
-        ; a[i+42] += m[42] * mu
-        mulx	rcx, rax, QWORD PTR [r10+336]
-        mov	r13, QWORD PTR [r9+152]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+144], r12
-        ; a[i+43] += m[43] * mu
-        mulx	rcx, rax, QWORD PTR [r10+344]
-        mov	r12, QWORD PTR [r9+160]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+152], r13
-        ; a[i+44] += m[44] * mu
-        mulx	rcx, rax, QWORD PTR [r10+352]
-        mov	r13, QWORD PTR [r9+168]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+160], r12
-        ; a[i+45] += m[45] * mu
-        mulx	rcx, rax, QWORD PTR [r10+360]
-        mov	r12, QWORD PTR [r9+176]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+168], r13
-        ; a[i+46] += m[46] * mu
-        mulx	rcx, rax, QWORD PTR [r10+368]
-        mov	r13, QWORD PTR [r9+184]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+176], r12
-        ; a[i+47] += m[47] * mu
-        mulx	rcx, rax, QWORD PTR [r10+376]
-        mov	r12, QWORD PTR [r9+192]  ; r12 = a[i+48], the word above this row
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+184], r13
-        adcx	r12, rbp  ; fold in the carry word kept in rbp from the previous iteration
-        mov	rbp, rbx  ; rbp = 0; mov does not modify the flags
-        mov	QWORD PTR [r9+192], r12
-        adox	rbp, rbx  ; rbp += OF
-        adcx	rbp, rbx  ; rbp += CF; rbp is now the carry out of a[i+48]
-        ; a += 1 (advance one 64-bit word)
-        add	r9, 8
-        ; i -= 1 (loop until all 48 words have been processed)
-        sub	r11, 1
-        jnz	L_3072_mont_reduce_avx2_48_loop
-        sub	r9, 192  ; remove the bias: r9 = &a[48]
-        neg	rbp  ; carry word becomes a mask: 0 stays 0, 1 becomes all ones
-        mov	r8, r9  ; r8 = &a[48], source of the upper half
-        sub	r9, 384  ; r9 = &a[0], destination of the result
-        mov	rcx, QWORD PTR [r10]
-        mov	rdx, r14  ; a[48] is still cached in r14 (likewise r15, rdi, rsi hold a[49..51])
-        pext	rcx, rcx, rbp  ; mask select: m[0] when rbp is all ones, 0 when rbp is 0
-        sub	rdx, rcx  ; starts the borrow chain; the mov/pext between the sbb steps leave CF unchanged
-        mov	rcx, QWORD PTR [r10+8]
-        mov	rax, r15
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+16]
-        mov	rcx, rdi
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+8], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+24]
-        mov	rdx, rsi
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+16], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+32]
-        mov	rax, QWORD PTR [r8+32]  ; remaining upper-half words are read from memory via r8
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+24], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+40]
-        mov	rcx, QWORD PTR [r8+40]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+32], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+48]
-        mov	rdx, QWORD PTR [r8+48]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+40], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+56]
-        mov	rax, QWORD PTR [r8+56]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+48], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+64]
-        mov	rcx, QWORD PTR [r8+64]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+56], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+72]
-        mov	rdx, QWORD PTR [r8+72]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+64], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+80]
-        mov	rax, QWORD PTR [r8+80]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+72], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+88]
-        mov	rcx, QWORD PTR [r8+88]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+80], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+96]
-        mov	rdx, QWORD PTR [r8+96]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+88], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+104]
-        mov	rax, QWORD PTR [r8+104]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+96], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+112]
-        mov	rcx, QWORD PTR [r8+112]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+104], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+120]
-        mov	rdx, QWORD PTR [r8+120]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+112], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+128]
-        mov	rax, QWORD PTR [r8+128]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+120], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+136]
-        mov	rcx, QWORD PTR [r8+136]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+128], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+144]
-        mov	rdx, QWORD PTR [r8+144]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+136], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+152]
-        mov	rax, QWORD PTR [r8+152]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+144], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+160]
-        mov	rcx, QWORD PTR [r8+160]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+152], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+168]
-        mov	rdx, QWORD PTR [r8+168]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+160], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+176]
-        mov	rax, QWORD PTR [r8+176]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+168], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+184]
-        mov	rcx, QWORD PTR [r8+184]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+176], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+192]
-        mov	rdx, QWORD PTR [r8+192]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+184], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+200]
-        mov	rax, QWORD PTR [r8+200]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+192], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+208]
-        mov	rcx, QWORD PTR [r8+208]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+200], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+216]
-        mov	rdx, QWORD PTR [r8+216]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+208], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+224]
-        mov	rax, QWORD PTR [r8+224]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+216], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+232]
-        mov	rcx, QWORD PTR [r8+232]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+224], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+240]
-        mov	rdx, QWORD PTR [r8+240]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+232], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+248]
-        mov	rax, QWORD PTR [r8+248]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+240], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+256]
-        mov	rcx, QWORD PTR [r8+256]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+248], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+264]
-        mov	rdx, QWORD PTR [r8+264]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+256], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+272]
-        mov	rax, QWORD PTR [r8+272]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+264], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+280]
-        mov	rcx, QWORD PTR [r8+280]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+272], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+288]
-        mov	rdx, QWORD PTR [r8+288]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+280], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+296]
-        mov	rax, QWORD PTR [r8+296]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+288], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+304]
-        mov	rcx, QWORD PTR [r8+304]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+296], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+312]
-        mov	rdx, QWORD PTR [r8+312]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+304], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+320]
-        mov	rax, QWORD PTR [r8+320]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+312], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+328]
-        mov	rcx, QWORD PTR [r8+328]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+320], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+336]
-        mov	rdx, QWORD PTR [r8+336]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+328], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+344]
-        mov	rax, QWORD PTR [r8+344]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+336], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+352]
-        mov	rcx, QWORD PTR [r8+352]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+344], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+360]
-        mov	rdx, QWORD PTR [r8+360]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+352], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+368]
-        mov	rax, QWORD PTR [r8+368]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+360], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+376]
-        mov	rcx, QWORD PTR [r8+376]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+368], rax
-        sbb	rcx, rdx
-        mov	QWORD PTR [r9+376], rcx  ; last result word, a[47]; the final borrow is not used
-        pop	rbp  ; restore saved registers in reverse push order
-        pop	rbx
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_3072_mont_reduce_avx2_48 ENDP
-_text ENDS
-ENDIF
-IFNDEF WC_NO_CACHE_RESISTANT
-_text SEGMENT READONLY PARA
-sp_3072_get_from_table_avx2_48 PROC
-        sub	rsp, 128
-        vmovdqu	OWORD PTR [rsp], xmm6
-        vmovdqu	OWORD PTR [rsp+16], xmm7
-        vmovdqu	OWORD PTR [rsp+32], xmm8
-        vmovdqu	OWORD PTR [rsp+48], xmm9
-        vmovdqu	OWORD PTR [rsp+64], xmm10
-        vmovdqu	OWORD PTR [rsp+80], xmm11
-        vmovdqu	OWORD PTR [rsp+96], xmm12
-        vmovdqu	OWORD PTR [rsp+112], xmm13
-        mov	rax, 1
-        movd	xmm10, r8
-        movd	xmm11, rax
-        vpxor	ymm13, ymm13, ymm13
-        vpermd	ymm10, ymm13, ymm10
-        vpermd	ymm11, ymm13, ymm11
-        ; START: 0-15
-        vpxor	ymm13, ymm13, ymm13
-        vpxor	ymm4, ymm4, ymm4
-        vpxor	ymm5, ymm5, ymm5
-        vpxor	ymm6, ymm6, ymm6
-        vpxor	ymm7, ymm7, ymm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        vmovdqu	YMMWORD PTR [rcx], ymm4
-        vmovdqu	YMMWORD PTR [rcx+32], ymm5
-        vmovdqu	YMMWORD PTR [rcx+64], ymm6
-        vmovdqu	YMMWORD PTR [rcx+96], ymm7
-        add	rcx, 128
-        ; END: 0-15
-        ; START: 16-31
-        vpxor	ymm13, ymm13, ymm13
-        vpxor	ymm4, ymm4, ymm4
-        vpxor	ymm5, ymm5, ymm5
-        vpxor	ymm6, ymm6, ymm6
-        vpxor	ymm7, ymm7, ymm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        vmovdqu	YMMWORD PTR [rcx], ymm4
-        vmovdqu	YMMWORD PTR [rcx+32], ymm5
-        vmovdqu	YMMWORD PTR [rcx+64], ymm6
-        vmovdqu	YMMWORD PTR [rcx+96], ymm7
-        add	rcx, 128
-        ; END: 16-31
-        ; START: 32-47
-        vpxor	ymm13, ymm13, ymm13
-        vpxor	ymm4, ymm4, ymm4
-        vpxor	ymm5, ymm5, ymm5
-        vpxor	ymm6, ymm6, ymm6
-        vpxor	ymm7, ymm7, ymm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        vmovdqu	YMMWORD PTR [rcx], ymm4
-        vmovdqu	YMMWORD PTR [rcx+32], ymm5
-        vmovdqu	YMMWORD PTR [rcx+64], ymm6
-        vmovdqu	YMMWORD PTR [rcx+96], ymm7
-        ; END: 32-47
-        vmovdqu	xmm6, OWORD PTR [rsp]
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vmovdqu	xmm8, OWORD PTR [rsp+32]
-        vmovdqu	xmm9, OWORD PTR [rsp+48]
-        vmovdqu	xmm10, OWORD PTR [rsp+64]
-        vmovdqu	xmm11, OWORD PTR [rsp+80]
-        vmovdqu	xmm12, OWORD PTR [rsp+96]
-        vmovdqu	xmm13, OWORD PTR [rsp+112]
-        add	rsp, 128
-        ret
-sp_3072_get_from_table_avx2_48 ENDP
-_text ENDS
-ENDIF
-; /* Conditionally add a and b using the mask m: r = a + (b & m).
-;  * m is -1 to add and 0 when not (with m = 0 the result is a copy of a).
-;  *
-;  * r  A single precision number representing conditional add result.
-;  *    Passed in rcx; 24 64-bit words are written (offsets 0..184).
-;  * a  A single precision number to add with.
-;  *    Passed in rdx; 24 64-bit words are read.
-;  * b  A single precision number to add.
-;  *    Passed in r8; 24 64-bit words are read, each ANDed with m.
-;  * m  Mask value to apply. Passed in r9.
-;  *
-;  * Returns in rax the carry out of the most significant word (0 or 1).
-;  * Scratch: rax, r8, r10, r11, flags and 192 bytes of stack.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_cond_add_24 PROC
-        ; NOTE(review): rsp is adjusted here but the PROC declares no FRAME /
-        ; .allocstack unwind data - confirm this is acceptable for the build.
-        sub	rsp, 192
-        ; rax = 0 (carry accumulator). The flags written by xor are dead: the
-        ; and/add instructions below rewrite them before they are consumed.
-        xor	eax, eax
-        ; Stage 1: store (b & m) into the 192-byte stack buffer, two words at
-        ; a time. The masking is done up front because `and` rewrites the
-        ; carry flag and therefore cannot sit inside the adc chain of stage 2.
-        mov	r10, QWORD PTR [r8]
-        mov	r11, QWORD PTR [r8+8]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp], r10
-        mov	QWORD PTR [rsp+8], r11
-        mov	r10, QWORD PTR [r8+16]
-        mov	r11, QWORD PTR [r8+24]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+16], r10
-        mov	QWORD PTR [rsp+24], r11
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+32], r10
-        mov	QWORD PTR [rsp+40], r11
-        mov	r10, QWORD PTR [r8+48]
-        mov	r11, QWORD PTR [r8+56]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+48], r10
-        mov	QWORD PTR [rsp+56], r11
-        mov	r10, QWORD PTR [r8+64]
-        mov	r11, QWORD PTR [r8+72]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+64], r10
-        mov	QWORD PTR [rsp+72], r11
-        mov	r10, QWORD PTR [r8+80]
-        mov	r11, QWORD PTR [r8+88]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+80], r10
-        mov	QWORD PTR [rsp+88], r11
-        mov	r10, QWORD PTR [r8+96]
-        mov	r11, QWORD PTR [r8+104]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+96], r10
-        mov	QWORD PTR [rsp+104], r11
-        mov	r10, QWORD PTR [r8+112]
-        mov	r11, QWORD PTR [r8+120]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+112], r10
-        mov	QWORD PTR [rsp+120], r11
-        mov	r10, QWORD PTR [r8+128]
-        mov	r11, QWORD PTR [r8+136]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+128], r10
-        mov	QWORD PTR [rsp+136], r11
-        mov	r10, QWORD PTR [r8+144]
-        mov	r11, QWORD PTR [r8+152]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+144], r10
-        mov	QWORD PTR [rsp+152], r11
-        mov	r10, QWORD PTR [r8+160]
-        mov	r11, QWORD PTR [r8+168]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+160], r10
-        mov	QWORD PTR [rsp+168], r11
-        mov	r10, QWORD PTR [r8+176]
-        mov	r11, QWORD PTR [r8+184]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+176], r10
-        mov	QWORD PTR [rsp+184], r11
-        ; Stage 2: r = a + masked b, least significant word first. r10/r11
-        ; alternate as the sum register and r8 (b is no longer needed) holds
-        ; the masked word. Only mov instructions separate the add/adc steps,
-        ; so the carry flag is preserved along the whole chain. Each result
-        ; word is stored one step after it is computed.
-        mov	r10, QWORD PTR [rdx]
-        mov	r8, QWORD PTR [rsp]
-        add	r10, r8
-        mov	r11, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [rsp+8]
-        adc	r11, r8
-        mov	QWORD PTR [rcx], r10
-        mov	r10, QWORD PTR [rdx+16]
-        mov	r8, QWORD PTR [rsp+16]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+8], r11
-        mov	r11, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [rsp+24]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+16], r10
-        mov	r10, QWORD PTR [rdx+32]
-        mov	r8, QWORD PTR [rsp+32]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+24], r11
-        mov	r11, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [rsp+40]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+32], r10
-        mov	r10, QWORD PTR [rdx+48]
-        mov	r8, QWORD PTR [rsp+48]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+40], r11
-        mov	r11, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [rsp+56]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+48], r10
-        mov	r10, QWORD PTR [rdx+64]
-        mov	r8, QWORD PTR [rsp+64]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+56], r11
-        mov	r11, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [rsp+72]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+64], r10
-        mov	r10, QWORD PTR [rdx+80]
-        mov	r8, QWORD PTR [rsp+80]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+72], r11
-        mov	r11, QWORD PTR [rdx+88]
-        mov	r8, QWORD PTR [rsp+88]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+80], r10
-        mov	r10, QWORD PTR [rdx+96]
-        mov	r8, QWORD PTR [rsp+96]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+88], r11
-        mov	r11, QWORD PTR [rdx+104]
-        mov	r8, QWORD PTR [rsp+104]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+96], r10
-        mov	r10, QWORD PTR [rdx+112]
-        mov	r8, QWORD PTR [rsp+112]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+104], r11
-        mov	r11, QWORD PTR [rdx+120]
-        mov	r8, QWORD PTR [rsp+120]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+112], r10
-        mov	r10, QWORD PTR [rdx+128]
-        mov	r8, QWORD PTR [rsp+128]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+120], r11
-        mov	r11, QWORD PTR [rdx+136]
-        mov	r8, QWORD PTR [rsp+136]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+128], r10
-        mov	r10, QWORD PTR [rdx+144]
-        mov	r8, QWORD PTR [rsp+144]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+136], r11
-        mov	r11, QWORD PTR [rdx+152]
-        mov	r8, QWORD PTR [rsp+152]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+144], r10
-        mov	r10, QWORD PTR [rdx+160]
-        mov	r8, QWORD PTR [rsp+160]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+152], r11
-        mov	r11, QWORD PTR [rdx+168]
-        mov	r8, QWORD PTR [rsp+168]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+160], r10
-        mov	r10, QWORD PTR [rdx+176]
-        mov	r8, QWORD PTR [rsp+176]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+168], r11
-        mov	r11, QWORD PTR [rdx+184]
-        mov	r8, QWORD PTR [rsp+184]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+176], r10
-        mov	QWORD PTR [rcx+184], r11
-        ; Fold the final carry into the return value before `add rsp`
-        ; overwrites the flags.
-        adc	rax, 0
-        add	rsp, 192
-        ret
-sp_3072_cond_add_24 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Conditionally add a and b using the mask m: r = a + (b & m).
-;  * m is -1 to add and 0 when not (with m = 0 the result is a copy of a).
-;  *
-;  * r  A single precision number representing conditional add result.
-;  *    Passed in rcx; 24 64-bit words are written (offsets 0..184).
-;  * a  A single precision number to add with.
-;  *    Passed in rdx; 24 64-bit words are read.
-;  * b  A single precision number to add.
-;  *    Passed in r8; 24 64-bit words are read.
-;  * m  Mask value to apply. Passed in r9.
-;  *
-;  * Returns in rax the carry out of the most significant word (0 or 1).
-;  * Scratch: rax, r10, r11, flags. r12 is saved and restored.
-;  *
-;  * The mask is applied with pext rather than and: for m = -1 pext yields
-;  * the source word unchanged and for m = 0 it yields 0, and pext does not
-;  * modify the flags, so it can be interleaved with the adc chain without
-;  * a stack staging buffer.
-;  * NOTE(review): pext is a BMI2 instruction - presumably the caller checks
-;  * for BMI2 support before selecting this routine; verify.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_cond_add_avx2_24 PROC
-        ; NOTE(review): r12 is pushed but the PROC declares no FRAME /
-        ; .pushreg unwind data - confirm this is acceptable for the build.
-        push	r12
-        ; rax = 0 (carry accumulator). The flags written by xor are dead: the
-        ; first add below rewrites them and nothing in between reads them.
-        xor	eax, eax
-        ; Word 0 starts the carry chain with add; every later word uses adc.
-        ; r12, r10 and r11 rotate through the roles "masked b word", "sum"
-        ; and "pending store", so each result word is stored one step after
-        ; it is computed. Only mov and pext separate the add/adc steps, which
-        ; keeps the carry flag intact.
-        mov	r12, QWORD PTR [r8]
-        mov	r10, QWORD PTR [rdx]
-        pext	r12, r12, r9
-        add	r10, r12
-        mov	r12, QWORD PTR [r8+8]
-        mov	r11, QWORD PTR [rdx+8]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx], r10
-        adc	r11, r12
-        mov	r10, QWORD PTR [r8+16]
-        mov	r12, QWORD PTR [rdx+16]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+8], r11
-        adc	r12, r10
-        mov	r11, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [rdx+24]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+16], r12
-        adc	r10, r11
-        mov	r12, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [rdx+32]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+24], r10
-        adc	r11, r12
-        mov	r10, QWORD PTR [r8+40]
-        mov	r12, QWORD PTR [rdx+40]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+32], r11
-        adc	r12, r10
-        mov	r11, QWORD PTR [r8+48]
-        mov	r10, QWORD PTR [rdx+48]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+40], r12
-        adc	r10, r11
-        mov	r12, QWORD PTR [r8+56]
-        mov	r11, QWORD PTR [rdx+56]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+48], r10
-        adc	r11, r12
-        mov	r10, QWORD PTR [r8+64]
-        mov	r12, QWORD PTR [rdx+64]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+56], r11
-        adc	r12, r10
-        mov	r11, QWORD PTR [r8+72]
-        mov	r10, QWORD PTR [rdx+72]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+64], r12
-        adc	r10, r11
-        mov	r12, QWORD PTR [r8+80]
-        mov	r11, QWORD PTR [rdx+80]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+72], r10
-        adc	r11, r12
-        mov	r10, QWORD PTR [r8+88]
-        mov	r12, QWORD PTR [rdx+88]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+80], r11
-        adc	r12, r10
-        mov	r11, QWORD PTR [r8+96]
-        mov	r10, QWORD PTR [rdx+96]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+88], r12
-        adc	r10, r11
-        mov	r12, QWORD PTR [r8+104]
-        mov	r11, QWORD PTR [rdx+104]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+96], r10
-        adc	r11, r12
-        mov	r10, QWORD PTR [r8+112]
-        mov	r12, QWORD PTR [rdx+112]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+104], r11
-        adc	r12, r10
-        mov	r11, QWORD PTR [r8+120]
-        mov	r10, QWORD PTR [rdx+120]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+112], r12
-        adc	r10, r11
-        mov	r12, QWORD PTR [r8+128]
-        mov	r11, QWORD PTR [rdx+128]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+120], r10
-        adc	r11, r12
-        mov	r10, QWORD PTR [r8+136]
-        mov	r12, QWORD PTR [rdx+136]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+128], r11
-        adc	r12, r10
-        mov	r11, QWORD PTR [r8+144]
-        mov	r10, QWORD PTR [rdx+144]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+136], r12
-        adc	r10, r11
-        mov	r12, QWORD PTR [r8+152]
-        mov	r11, QWORD PTR [rdx+152]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+144], r10
-        adc	r11, r12
-        mov	r10, QWORD PTR [r8+160]
-        mov	r12, QWORD PTR [rdx+160]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+152], r11
-        adc	r12, r10
-        mov	r11, QWORD PTR [r8+168]
-        mov	r10, QWORD PTR [rdx+168]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+160], r12
-        adc	r10, r11
-        mov	r12, QWORD PTR [r8+176]
-        mov	r11, QWORD PTR [rdx+176]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+168], r10
-        adc	r11, r12
-        mov	r10, QWORD PTR [r8+184]
-        mov	r12, QWORD PTR [rdx+184]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+176], r11
-        adc	r12, r10
-        mov	QWORD PTR [rcx+184], r12
-        ; rax = carry out of the most significant word.
-        adc	rax, 0
-        pop	r12
-        ret
-sp_3072_cond_add_avx2_24 ENDP
-_text ENDS
-ENDIF
-; /* Shift number left by n bit. (r = a << n)
-;  *
-;  * r  Result of left shift by n.
-;  * a  Number to shift.
-;  * n  Amount to shift.
-;  */
-_text SEGMENT READONLY PARA
-sp_3072_lshift_48 PROC
-        push	r12
-        push	r13
-        mov	cl, r8b
-        mov	rax, rcx
-        mov	r12, 0
-        mov	r13, QWORD PTR [rdx+344]
-        mov	r8, QWORD PTR [rdx+352]
-        mov	r9, QWORD PTR [rdx+360]
-        mov	r10, QWORD PTR [rdx+368]
-        mov	r11, QWORD PTR [rdx+376]
-        shld	r12, r11, cl
-        shld	r11, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r13, cl
-        mov	QWORD PTR [rax+352], r8
-        mov	QWORD PTR [rax+360], r9
-        mov	QWORD PTR [rax+368], r10
-        mov	QWORD PTR [rax+376], r11
-        mov	QWORD PTR [rax+384], r12
-        mov	r11, QWORD PTR [rdx+312]
-        mov	r8, QWORD PTR [rdx+320]
-        mov	r9, QWORD PTR [rdx+328]
-        mov	r10, QWORD PTR [rdx+336]
-        shld	r13, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r11, cl
-        mov	QWORD PTR [rax+320], r8
-        mov	QWORD PTR [rax+328], r9
-        mov	QWORD PTR [rax+336], r10
-        mov	QWORD PTR [rax+344], r13
-        mov	r13, QWORD PTR [rdx+280]
-        mov	r8, QWORD PTR [rdx+288]
-        mov	r9, QWORD PTR [rdx+296]
-        mov	r10, QWORD PTR [rdx+304]
-        shld	r11, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r13, cl
-        mov	QWORD PTR [rax+288], r8
-        mov	QWORD PTR [rax+296], r9
-        mov	QWORD PTR [rax+304], r10
-        mov	QWORD PTR [rax+312], r11
-        mov	r11, QWORD PTR [rdx+248]
-        mov	r8, QWORD PTR [rdx+256]
-        mov	r9, QWORD PTR [rdx+264]
-        mov	r10, QWORD PTR [rdx+272]
-        shld	r13, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r11, cl
-        mov	QWORD PTR [rax+256], r8
-        mov	QWORD PTR [rax+264], r9
-        mov	QWORD PTR [rax+272], r10
-        mov	QWORD PTR [rax+280], r13
-        mov	r13, QWORD PTR [rdx+216]
-        mov	r8, QWORD PTR [rdx+224]
-        mov	r9, QWORD PTR [rdx+232]
-        mov	r10, QWORD PTR [rdx+240]
-        shld	r11, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r13, cl
-        mov	QWORD PTR [rax+224], r8
-        mov	QWORD PTR [rax+232], r9
-        mov	QWORD PTR [rax+240], r10
-        mov	QWORD PTR [rax+248], r11
-        mov	r11, QWORD PTR [rdx+184]
-        mov	r8, QWORD PTR [rdx+192]
-        mov	r9, QWORD PTR [rdx+200]
-        mov	r10, QWORD PTR [rdx+208]
-        shld	r13, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r11, cl
-        mov	QWORD PTR [rax+192], r8
-        mov	QWORD PTR [rax+200], r9
-        mov	QWORD PTR [rax+208], r10
-        mov	QWORD PTR [rax+216], r13
-        mov	r13, QWORD PTR [rdx+152]
-        mov	r8, QWORD PTR [rdx+160]
-        mov	r9, QWORD PTR [rdx+168]
-        mov	r10, QWORD PTR [rdx+176]
-        shld	r11, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r13, cl
-        mov	QWORD PTR [rax+160], r8
-        mov	QWORD PTR [rax+168], r9
-        mov	QWORD PTR [rax+176], r10
-        mov	QWORD PTR [rax+184], r11
-        mov	r11, QWORD PTR [rdx+120]
-        mov	r8, QWORD PTR [rdx+128]
-        mov	r9, QWORD PTR [rdx+136]
-        mov	r10, QWORD PTR [rdx+144]
-        shld	r13, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r11, cl
-        mov	QWORD PTR [rax+128], r8
-        mov	QWORD PTR [rax+136], r9
-        mov	QWORD PTR [rax+144], r10
-        mov	QWORD PTR [rax+152], r13
-        mov	r13, QWORD PTR [rdx+88]
-        mov	r8, QWORD PTR [rdx+96]
-        mov	r9, QWORD PTR [rdx+104]
-        mov	r10, QWORD PTR [rdx+112]
-        shld	r11, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r13, cl
-        mov	QWORD PTR [rax+96], r8
-        mov	QWORD PTR [rax+104], r9
-        mov	QWORD PTR [rax+112], r10
-        mov	QWORD PTR [rax+120], r11
-        mov	r11, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [rdx+64]
-        mov	r9, QWORD PTR [rdx+72]
-        mov	r10, QWORD PTR [rdx+80]
-        shld	r13, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r11, cl
-        mov	QWORD PTR [rax+64], r8
-        mov	QWORD PTR [rax+72], r9
-        mov	QWORD PTR [rax+80], r10
-        mov	QWORD PTR [rax+88], r13
-        mov	r13, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [rdx+32]
-        mov	r9, QWORD PTR [rdx+40]
-        mov	r10, QWORD PTR [rdx+48]
-        shld	r11, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r13, cl
-        mov	QWORD PTR [rax+32], r8
-        mov	QWORD PTR [rax+40], r9
-        mov	QWORD PTR [rax+48], r10
-        mov	QWORD PTR [rax+56], r11
-        mov	r8, QWORD PTR [rdx]
-        mov	r9, QWORD PTR [rdx+8]
-        mov	r10, QWORD PTR [rdx+16]
-        shld	r13, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shl	r8, cl
-        mov	QWORD PTR [rax], r8
-        mov	QWORD PTR [rax+8], r9
-        mov	QWORD PTR [rax+16], r10
-        mov	QWORD PTR [rax+24], r13
-        pop	r13
-        pop	r12
-        ret
-sp_3072_lshift_48 ENDP
-_text ENDS
-ENDIF
-ENDIF
-IFDEF WOLFSSL_SP_4096
-IFDEF WOLFSSL_SP_4096
-; /* Read big endian unsigned byte array into r.
-;  * Uses the bswap instruction.
-;  *
-;  * Microsoft x64 calling convention. Leaf function: only volatile
-;  * registers (rax, rcx, rdx, r8, r9, r10, r11) and flags are modified and
-;  * the stack is not touched, so no unwind information is required.
-;  *
-;  * r     (rcx) A single precision integer. 512 bytes are always written,
-;  *             least significant 64-bit word first.
-;  * size  (rdx) Not read by this routine; the output length is fixed at
-;  *             512 bytes. The register is reused as scratch.
-;  * a     (r8)  Byte array.
-;  * n     (r9)  Number of bytes in array to read.
-;  */
-_text SEGMENT READONLY PARA
-sp_4096_from_bin_bswap PROC
-        lea	r11, QWORD PTR [r8+r9]
-        lea	rdx, QWORD PTR [rcx+512]
-        ; r11 = one past the last input byte, rdx = one past the end of r.
-        jmp	L_4096_from_bin_bswap_64_end
-        ; Convert 64 input bytes per iteration, walking a from its end
-        ; (least significant bytes) towards its start.
-L_4096_from_bin_bswap_64_start:
-        sub	r11, 64
-        mov	rax, QWORD PTR [r11+56]
-        mov	r10, QWORD PTR [r11+48]
-        bswap	rax
-        bswap	r10
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r10
-        mov	rax, QWORD PTR [r11+40]
-        mov	r10, QWORD PTR [r11+32]
-        bswap	rax
-        bswap	r10
-        mov	QWORD PTR [rcx+16], rax
-        mov	QWORD PTR [rcx+24], r10
-        mov	rax, QWORD PTR [r11+24]
-        mov	r10, QWORD PTR [r11+16]
-        bswap	rax
-        bswap	r10
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r10
-        mov	rax, QWORD PTR [r11+8]
-        mov	r10, QWORD PTR [r11]
-        bswap	rax
-        bswap	r10
-        mov	QWORD PTR [rcx+48], rax
-        mov	QWORD PTR [rcx+56], r10
-        add	rcx, 64
-        sub	r9, 64
-L_4096_from_bin_bswap_64_end:
-        cmp	r9, 63
-        jg	L_4096_from_bin_bswap_64_start
-        jmp	L_4096_from_bin_bswap_8_end
-        ; Convert the remaining whole 8-byte groups one word at a time.
-L_4096_from_bin_bswap_8_start:
-        sub	r11, 8
-        mov	rax, QWORD PTR [r11]
-        bswap	rax
-        mov	QWORD PTR [rcx], rax
-        add	rcx, 8
-        sub	r9, 8
-L_4096_from_bin_bswap_8_end:
-        cmp	r9, 7
-        jg	L_4096_from_bin_bswap_8_start
-        test	r9, r9
-        jz	L_4096_from_bin_bswap_hi_end
-        ; Fewer than 8 bytes remain at the start of a: accumulate them,
-        ; most significant byte first, into one partial top word.
-        xor	r10d, r10d
-        xor	eax, eax
-L_4096_from_bin_bswap_hi_start:
-        mov	al, BYTE PTR [r8]
-        shl	r10, 8
-        inc	r8
-        add	r10, rax
-        dec	r9
-        jg	L_4096_from_bin_bswap_hi_start
-        mov	QWORD PTR [rcx], r10
-        add	rcx, 8
-L_4096_from_bin_bswap_hi_end:
-        ; Zero every word of r that was not written above.
-        cmp	rcx, rdx
-        jae	L_4096_from_bin_bswap_zero_end
-L_4096_from_bin_bswap_zero_start:
-        mov	QWORD PTR [rcx], 0
-        add	rcx, 8
-        cmp	rcx, rdx
-        jb	L_4096_from_bin_bswap_zero_start
-L_4096_from_bin_bswap_zero_end:
-        ret
-sp_4096_from_bin_bswap ENDP
-_text ENDS
-IFNDEF NO_MOVBE_SUPPORT
-; /* Read big endian unsigned byte array into r.
-;  * Uses the movbe instruction which is an optional instruction.
-;  *
-;  * Microsoft x64 calling convention. Leaf function: only volatile
-;  * registers (rax, rcx, rdx, r8, r9, r10, r11) and flags are modified and
-;  * the stack is not touched, so no unwind information is required.
-;  *
-;  * r     (rcx) A single precision integer. 512 bytes are always written,
-;  *             least significant 64-bit word first.
-;  * size  (rdx) Not read by this routine; the output length is fixed at
-;  *             512 bytes. The register is reused as scratch.
-;  * a     (r8)  Byte array.
-;  * n     (r9)  Number of bytes in array to read.
-;  */
-_text SEGMENT READONLY PARA
-sp_4096_from_bin_movbe PROC
-        lea	r11, QWORD PTR [r8+r9]
-        lea	rdx, QWORD PTR [rcx+512]
-        ; r11 = one past the last input byte, rdx = one past the end of r.
-        jmp	L_4096_from_bin_movbe_64_end
-        ; Convert 64 input bytes per iteration, walking a from its end
-        ; (least significant bytes) towards its start.
-L_4096_from_bin_movbe_64_start:
-        sub	r11, 64
-        movbe	rax, QWORD PTR [r11+56]
-        movbe	r10, QWORD PTR [r11+48]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r10
-        movbe	rax, QWORD PTR [r11+40]
-        movbe	r10, QWORD PTR [r11+32]
-        mov	QWORD PTR [rcx+16], rax
-        mov	QWORD PTR [rcx+24], r10
-        movbe	rax, QWORD PTR [r11+24]
-        movbe	r10, QWORD PTR [r11+16]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r10
-        movbe	rax, QWORD PTR [r11+8]
-        movbe	r10, QWORD PTR [r11]
-        mov	QWORD PTR [rcx+48], rax
-        mov	QWORD PTR [rcx+56], r10
-        add	rcx, 64
-        sub	r9, 64
-L_4096_from_bin_movbe_64_end:
-        cmp	r9, 63
-        jg	L_4096_from_bin_movbe_64_start
-        jmp	L_4096_from_bin_movbe_8_end
-        ; Convert the remaining whole 8-byte groups one word at a time.
-L_4096_from_bin_movbe_8_start:
-        sub	r11, 8
-        movbe	rax, QWORD PTR [r11]
-        mov	QWORD PTR [rcx], rax
-        add	rcx, 8
-        sub	r9, 8
-L_4096_from_bin_movbe_8_end:
-        cmp	r9, 7
-        jg	L_4096_from_bin_movbe_8_start
-        test	r9, r9
-        jz	L_4096_from_bin_movbe_hi_end
-        ; Fewer than 8 bytes remain at the start of a: accumulate them,
-        ; most significant byte first, into one partial top word.
-        xor	r10d, r10d
-        xor	eax, eax
-L_4096_from_bin_movbe_hi_start:
-        mov	al, BYTE PTR [r8]
-        shl	r10, 8
-        inc	r8
-        add	r10, rax
-        dec	r9
-        jg	L_4096_from_bin_movbe_hi_start
-        mov	QWORD PTR [rcx], r10
-        add	rcx, 8
-L_4096_from_bin_movbe_hi_end:
-        ; Zero every word of r that was not written above.
-        cmp	rcx, rdx
-        jae	L_4096_from_bin_movbe_zero_end
-L_4096_from_bin_movbe_zero_start:
-        mov	QWORD PTR [rcx], 0
-        add	rcx, 8
-        cmp	rcx, rdx
-        jb	L_4096_from_bin_movbe_zero_start
-L_4096_from_bin_movbe_zero_end:
-        ret
-sp_4096_from_bin_movbe ENDP
-_text ENDS
-ENDIF
-; /* Serialise r into a as a big endian byte string.
-;  * Exactly 512 bytes are written.
-;  * Byte order is reversed with the bswap instruction.
-;  *
-;  * r  (rcx) A single precision integer (64 words of 64 bits).
-;  * a  (rdx) Byte array receiving the 512 output bytes.
-;  *
-;  * Modifies rax and r8 only.
-;  */
-_text SEGMENT READONLY PARA
-sp_4096_to_bin_bswap_64 PROC
-        ; Assembly-time cursors: the source walks down from the top word of
-        ; r while the destination walks up from the first byte of a.
-        to_bin_bswap_4096_src = 504
-        to_bin_bswap_4096_dst = 0
-        ; Fully unrolled at assembly time: 32 pairs of words, no run-time loop.
-        REPT 32
-        mov	rax, QWORD PTR [rcx+to_bin_bswap_4096_src]
-        mov	r8, QWORD PTR [rcx+to_bin_bswap_4096_src-8]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+to_bin_bswap_4096_dst], rax
-        mov	QWORD PTR [rdx+to_bin_bswap_4096_dst+8], r8
-        to_bin_bswap_4096_src = to_bin_bswap_4096_src - 16
-        to_bin_bswap_4096_dst = to_bin_bswap_4096_dst + 16
-        ENDM
-        ret
-sp_4096_to_bin_bswap_64 ENDP
-_text ENDS
-IFNDEF NO_MOVBE_SUPPORT
-; /* Serialise r into a as a big endian byte string.
-;  * Exactly 512 bytes are written.
-;  * Byte order is reversed on load with movbe, an optional instruction.
-;  *
-;  * r  (rcx) A single precision integer (64 words of 64 bits).
-;  * a  (rdx) Byte array receiving the 512 output bytes.
-;  *
-;  * Modifies rax and r8 only.
-;  */
-_text SEGMENT READONLY PARA
-sp_4096_to_bin_movbe_64 PROC
-        ; Assembly-time cursors: the source walks down from the top word of
-        ; r while the destination walks up from the first byte of a.
-        to_bin_movbe_4096_src = 504
-        to_bin_movbe_4096_dst = 0
-        ; Fully unrolled at assembly time: 32 pairs of words, no run-time loop.
-        REPT 32
-        movbe	rax, QWORD PTR [rcx+to_bin_movbe_4096_src]
-        movbe	r8, QWORD PTR [rcx+to_bin_movbe_4096_src-8]
-        mov	QWORD PTR [rdx+to_bin_movbe_4096_dst], rax
-        mov	QWORD PTR [rdx+to_bin_movbe_4096_dst+8], r8
-        to_bin_movbe_4096_src = to_bin_movbe_4096_src - 16
-        to_bin_movbe_4096_dst = to_bin_movbe_4096_dst + 16
-        ENDM
-        ret
-sp_4096_to_bin_movbe_64 ENDP
-_text ENDS
-ENDIF
-; /* In-place subtraction: a = a - b over 64 words of 64 bits.
-;  *
-;  * a  (rcx) A single precision integer; overwritten with the result.
-;  * b  (rdx) A single precision integer.
-;  *
-;  * Returns in rax: 0 when there is no final borrow, all ones (-1) when
-;  * there is. Modifies r8, r9 and flags.
-;  */
-_text SEGMENT READONLY PARA
-sp_4096_sub_in_place_64 PROC
-        ; Word 0 starts the borrow chain with a plain sub.
-        mov	r8, QWORD PTR [rcx]
-        sub	r8, QWORD PTR [rdx]
-        ; Words 1..62, two per expansion. Each result is stored only after
-        ; the next word of a has been loaded; mov leaves the borrow flag
-        ; intact between the sbb instructions. r9 handles odd words and
-        ; r8 handles even words.
-        sub_in_place_4096_off = 8
-        REPT 31
-        mov	r9, QWORD PTR [rcx+sub_in_place_4096_off]
-        mov	QWORD PTR [rcx+sub_in_place_4096_off-8], r8
-        sbb	r9, QWORD PTR [rdx+sub_in_place_4096_off]
-        mov	r8, QWORD PTR [rcx+sub_in_place_4096_off+8]
-        mov	QWORD PTR [rcx+sub_in_place_4096_off], r9
-        sbb	r8, QWORD PTR [rdx+sub_in_place_4096_off+8]
-        sub_in_place_4096_off = sub_in_place_4096_off + 16
-        ENDM
-        ; Word 63 closes the chain.
-        mov	r9, QWORD PTR [rcx+504]
-        mov	QWORD PTR [rcx+496], r8
-        sbb	r9, QWORD PTR [rdx+504]
-        mov	QWORD PTR [rcx+504], r9
-        ; rax = 0 - borrow.
-        sbb	rax, rax
-        ret
-sp_4096_sub_in_place_64 ENDP
-_text ENDS
-; /* Addition: r = a + b over 64 words of 64 bits.
-;  *
-;  * r  (rcx) A single precision integer receiving the sum.
-;  * a  (rdx) A single precision integer.
-;  * b  (r8)  A single precision integer.
-;  *
-;  * Returns in rax the carry out of the top word (0 or 1).
-;  * Modifies r9, r10 and flags.
-;  */
-_text SEGMENT READONLY PARA
-sp_4096_add_64 PROC
-        ; Word 0 starts the carry chain. rax is cleared before the first
-        ; add because xor would destroy the carry flag afterwards.
-        mov	r9, QWORD PTR [rdx]
-        xor	rax, rax
-        add	r9, QWORD PTR [r8]
-        ; Words 1..62, two per expansion. Each sum is stored only after
-        ; the next word of a has been loaded; mov leaves the carry flag
-        ; intact between the adc instructions. r10 handles odd words and
-        ; r9 handles even words.
-        add_4096_off = 8
-        REPT 31
-        mov	r10, QWORD PTR [rdx+add_4096_off]
-        mov	QWORD PTR [rcx+add_4096_off-8], r9
-        adc	r10, QWORD PTR [r8+add_4096_off]
-        mov	r9, QWORD PTR [rdx+add_4096_off+8]
-        mov	QWORD PTR [rcx+add_4096_off], r10
-        adc	r9, QWORD PTR [r8+add_4096_off+8]
-        add_4096_off = add_4096_off + 16
-        ENDM
-        ; Word 63 closes the chain.
-        mov	r10, QWORD PTR [rdx+504]
-        mov	QWORD PTR [rcx+496], r9
-        adc	r10, QWORD PTR [r8+504]
-        mov	QWORD PTR [rcx+504], r10
-        ; rax = final carry.
-        adc	rax, 0
-        ret
-sp_4096_add_64 ENDP
-_text ENDS
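-; /* Multiply a and b into r. (r = a * b)
-;  *
-;  * r  A single precision integer that receives the product (passed in rcx).
-;  * a  A single precision integer; the first operand (passed in rdx).
-;  * b  A single precision integer; the second operand (passed in r8).
-;  */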
-_text SEGMENT READONLY PARA
-sp_4096_mul_64 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        sub	rsp, 1576
-        mov	QWORD PTR [rsp+1536], rcx
-        mov	QWORD PTR [rsp+1544], rdx
-        mov	QWORD PTR [rsp+1552], r8
-        lea	r12, QWORD PTR [rsp+1024]
-        lea	r14, QWORD PTR [rdx+256]
-        ; Add
-        mov	rax, QWORD PTR [rdx]
-        xor	r15, r15
-        add	rax, QWORD PTR [r14]
-        mov	r9, QWORD PTR [rdx+8]
-        mov	QWORD PTR [r12], rax
-        adc	r9, QWORD PTR [r14+8]
-        mov	r10, QWORD PTR [rdx+16]
-        mov	QWORD PTR [r12+8], r9
-        adc	r10, QWORD PTR [r14+16]
-        mov	rax, QWORD PTR [rdx+24]
-        mov	QWORD PTR [r12+16], r10
-        adc	rax, QWORD PTR [r14+24]
-        mov	r9, QWORD PTR [rdx+32]
-        mov	QWORD PTR [r12+24], rax
-        adc	r9, QWORD PTR [r14+32]
-        mov	r10, QWORD PTR [rdx+40]
-        mov	QWORD PTR [r12+32], r9
-        adc	r10, QWORD PTR [r14+40]
-        mov	rax, QWORD PTR [rdx+48]
-        mov	QWORD PTR [r12+40], r10
-        adc	rax, QWORD PTR [r14+48]
-        mov	r9, QWORD PTR [rdx+56]
-        mov	QWORD PTR [r12+48], rax
-        adc	r9, QWORD PTR [r14+56]
-        mov	r10, QWORD PTR [rdx+64]
-        mov	QWORD PTR [r12+56], r9
-        adc	r10, QWORD PTR [r14+64]
-        mov	rax, QWORD PTR [rdx+72]
-        mov	QWORD PTR [r12+64], r10
-        adc	rax, QWORD PTR [r14+72]
-        mov	r9, QWORD PTR [rdx+80]
-        mov	QWORD PTR [r12+72], rax
-        adc	r9, QWORD PTR [r14+80]
-        mov	r10, QWORD PTR [rdx+88]
-        mov	QWORD PTR [r12+80], r9
-        adc	r10, QWORD PTR [r14+88]
-        mov	rax, QWORD PTR [rdx+96]
-        mov	QWORD PTR [r12+88], r10
-        adc	rax, QWORD PTR [r14+96]
-        mov	r9, QWORD PTR [rdx+104]
-        mov	QWORD PTR [r12+96], rax
-        adc	r9, QWORD PTR [r14+104]
-        mov	r10, QWORD PTR [rdx+112]
-        mov	QWORD PTR [r12+104], r9
-        adc	r10, QWORD PTR [r14+112]
-        mov	rax, QWORD PTR [rdx+120]
-        mov	QWORD PTR [r12+112], r10
-        adc	rax, QWORD PTR [r14+120]
-        mov	r9, QWORD PTR [rdx+128]
-        mov	QWORD PTR [r12+120], rax
-        adc	r9, QWORD PTR [r14+128]
-        mov	r10, QWORD PTR [rdx+136]
-        mov	QWORD PTR [r12+128], r9
-        adc	r10, QWORD PTR [r14+136]
-        mov	rax, QWORD PTR [rdx+144]
-        mov	QWORD PTR [r12+136], r10
-        adc	rax, QWORD PTR [r14+144]
-        mov	r9, QWORD PTR [rdx+152]
-        mov	QWORD PTR [r12+144], rax
-        adc	r9, QWORD PTR [r14+152]
-        mov	r10, QWORD PTR [rdx+160]
-        mov	QWORD PTR [r12+152], r9
-        adc	r10, QWORD PTR [r14+160]
-        mov	rax, QWORD PTR [rdx+168]
-        mov	QWORD PTR [r12+160], r10
-        adc	rax, QWORD PTR [r14+168]
-        mov	r9, QWORD PTR [rdx+176]
-        mov	QWORD PTR [r12+168], rax
-        adc	r9, QWORD PTR [r14+176]
-        mov	r10, QWORD PTR [rdx+184]
-        mov	QWORD PTR [r12+176], r9
-        adc	r10, QWORD PTR [r14+184]
-        mov	rax, QWORD PTR [rdx+192]
-        mov	QWORD PTR [r12+184], r10
-        adc	rax, QWORD PTR [r14+192]
-        mov	r9, QWORD PTR [rdx+200]
-        mov	QWORD PTR [r12+192], rax
-        adc	r9, QWORD PTR [r14+200]
-        mov	r10, QWORD PTR [rdx+208]
-        mov	QWORD PTR [r12+200], r9
-        adc	r10, QWORD PTR [r14+208]
-        mov	rax, QWORD PTR [rdx+216]
-        mov	QWORD PTR [r12+208], r10
-        adc	rax, QWORD PTR [r14+216]
-        mov	r9, QWORD PTR [rdx+224]
-        mov	QWORD PTR [r12+216], rax
-        adc	r9, QWORD PTR [r14+224]
-        mov	r10, QWORD PTR [rdx+232]
-        mov	QWORD PTR [r12+224], r9
-        adc	r10, QWORD PTR [r14+232]
-        mov	rax, QWORD PTR [rdx+240]
-        mov	QWORD PTR [r12+232], r10
-        adc	rax, QWORD PTR [r14+240]
-        mov	r9, QWORD PTR [rdx+248]
-        mov	QWORD PTR [r12+240], rax
-        adc	r9, QWORD PTR [r14+248]
-        mov	QWORD PTR [r12+248], r9
-        adc	r15, 0
-        mov	QWORD PTR [rsp+1560], r15
-        lea	r13, QWORD PTR [rsp+1280]
-        lea	r14, QWORD PTR [r8+256]
-        ; Add
-        mov	rax, QWORD PTR [r8]
-        xor	rdi, rdi
-        add	rax, QWORD PTR [r14]
-        mov	r9, QWORD PTR [r8+8]
-        mov	QWORD PTR [r13], rax
-        adc	r9, QWORD PTR [r14+8]
-        mov	r10, QWORD PTR [r8+16]
-        mov	QWORD PTR [r13+8], r9
-        adc	r10, QWORD PTR [r14+16]
-        mov	rax, QWORD PTR [r8+24]
-        mov	QWORD PTR [r13+16], r10
-        adc	rax, QWORD PTR [r14+24]
-        mov	r9, QWORD PTR [r8+32]
-        mov	QWORD PTR [r13+24], rax
-        adc	r9, QWORD PTR [r14+32]
-        mov	r10, QWORD PTR [r8+40]
-        mov	QWORD PTR [r13+32], r9
-        adc	r10, QWORD PTR [r14+40]
-        mov	rax, QWORD PTR [r8+48]
-        mov	QWORD PTR [r13+40], r10
-        adc	rax, QWORD PTR [r14+48]
-        mov	r9, QWORD PTR [r8+56]
-        mov	QWORD PTR [r13+48], rax
-        adc	r9, QWORD PTR [r14+56]
-        mov	r10, QWORD PTR [r8+64]
-        mov	QWORD PTR [r13+56], r9
-        adc	r10, QWORD PTR [r14+64]
-        mov	rax, QWORD PTR [r8+72]
-        mov	QWORD PTR [r13+64], r10
-        adc	rax, QWORD PTR [r14+72]
-        mov	r9, QWORD PTR [r8+80]
-        mov	QWORD PTR [r13+72], rax
-        adc	r9, QWORD PTR [r14+80]
-        mov	r10, QWORD PTR [r8+88]
-        mov	QWORD PTR [r13+80], r9
-        adc	r10, QWORD PTR [r14+88]
-        mov	rax, QWORD PTR [r8+96]
-        mov	QWORD PTR [r13+88], r10
-        adc	rax, QWORD PTR [r14+96]
-        mov	r9, QWORD PTR [r8+104]
-        mov	QWORD PTR [r13+96], rax
-        adc	r9, QWORD PTR [r14+104]
-        mov	r10, QWORD PTR [r8+112]
-        mov	QWORD PTR [r13+104], r9
-        adc	r10, QWORD PTR [r14+112]
-        mov	rax, QWORD PTR [r8+120]
-        mov	QWORD PTR [r13+112], r10
-        adc	rax, QWORD PTR [r14+120]
-        mov	r9, QWORD PTR [r8+128]
-        mov	QWORD PTR [r13+120], rax
-        adc	r9, QWORD PTR [r14+128]
-        mov	r10, QWORD PTR [r8+136]
-        mov	QWORD PTR [r13+128], r9
-        adc	r10, QWORD PTR [r14+136]
-        mov	rax, QWORD PTR [r8+144]
-        mov	QWORD PTR [r13+136], r10
-        adc	rax, QWORD PTR [r14+144]
-        mov	r9, QWORD PTR [r8+152]
-        mov	QWORD PTR [r13+144], rax
-        adc	r9, QWORD PTR [r14+152]
-        mov	r10, QWORD PTR [r8+160]
-        mov	QWORD PTR [r13+152], r9
-        adc	r10, QWORD PTR [r14+160]
-        mov	rax, QWORD PTR [r8+168]
-        mov	QWORD PTR [r13+160], r10
-        adc	rax, QWORD PTR [r14+168]
-        mov	r9, QWORD PTR [r8+176]
-        mov	QWORD PTR [r13+168], rax
-        adc	r9, QWORD PTR [r14+176]
-        mov	r10, QWORD PTR [r8+184]
-        mov	QWORD PTR [r13+176], r9
-        adc	r10, QWORD PTR [r14+184]
-        mov	rax, QWORD PTR [r8+192]
-        mov	QWORD PTR [r13+184], r10
-        adc	rax, QWORD PTR [r14+192]
-        mov	r9, QWORD PTR [r8+200]
-        mov	QWORD PTR [r13+192], rax
-        adc	r9, QWORD PTR [r14+200]
-        mov	r10, QWORD PTR [r8+208]
-        mov	QWORD PTR [r13+200], r9
-        adc	r10, QWORD PTR [r14+208]
-        mov	rax, QWORD PTR [r8+216]
-        mov	QWORD PTR [r13+208], r10
-        adc	rax, QWORD PTR [r14+216]
-        mov	r9, QWORD PTR [r8+224]
-        mov	QWORD PTR [r13+216], rax
-        adc	r9, QWORD PTR [r14+224]
-        mov	r10, QWORD PTR [r8+232]
-        mov	QWORD PTR [r13+224], r9
-        adc	r10, QWORD PTR [r14+232]
-        mov	rax, QWORD PTR [r8+240]
-        mov	QWORD PTR [r13+232], r10
-        adc	rax, QWORD PTR [r14+240]
-        mov	r9, QWORD PTR [r8+248]
-        mov	QWORD PTR [r13+240], rax
-        adc	r9, QWORD PTR [r14+248]
-        mov	QWORD PTR [r13+248], r9
-        adc	rdi, 0
-        mov	QWORD PTR [rsp+1568], rdi
-        mov	r8, r13
-        mov	rdx, r12
-        mov	rcx, rsp
-        call	sp_2048_mul_32
-        mov	r8, QWORD PTR [rsp+1552]
-        mov	rdx, QWORD PTR [rsp+1544]
-        lea	rcx, QWORD PTR [rsp+512]
-        add	r8, 256
-        add	rdx, 256
-        call	sp_2048_mul_32
-        mov	r8, QWORD PTR [rsp+1552]
-        mov	rdx, QWORD PTR [rsp+1544]
-        mov	rcx, QWORD PTR [rsp+1536]
-        call	sp_2048_mul_32
-IFDEF _WIN64
-        mov	r8, QWORD PTR [rsp+1552]
-        mov	rdx, QWORD PTR [rsp+1544]
-        mov	rcx, QWORD PTR [rsp+1536]
-ENDIF
-        mov	r15, QWORD PTR [rsp+1560]
-        mov	rdi, QWORD PTR [rsp+1568]
-        mov	rsi, QWORD PTR [rsp+1536]
-        mov	r11, r15
-        lea	r12, QWORD PTR [rsp+1024]
-        lea	r13, QWORD PTR [rsp+1280]
-        and	r11, rdi
-        neg	r15
-        neg	rdi
-        add	rsi, 512
-        mov	rax, QWORD PTR [r12]
-        mov	r9, QWORD PTR [r13]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12], rax
-        mov	QWORD PTR [r13], r9
-        mov	rax, QWORD PTR [r12+8]
-        mov	r9, QWORD PTR [r13+8]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+8], rax
-        mov	QWORD PTR [r13+8], r9
-        mov	rax, QWORD PTR [r12+16]
-        mov	r9, QWORD PTR [r13+16]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+16], rax
-        mov	QWORD PTR [r13+16], r9
-        mov	rax, QWORD PTR [r12+24]
-        mov	r9, QWORD PTR [r13+24]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+24], rax
-        mov	QWORD PTR [r13+24], r9
-        mov	rax, QWORD PTR [r12+32]
-        mov	r9, QWORD PTR [r13+32]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+32], rax
-        mov	QWORD PTR [r13+32], r9
-        mov	rax, QWORD PTR [r12+40]
-        mov	r9, QWORD PTR [r13+40]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+40], rax
-        mov	QWORD PTR [r13+40], r9
-        mov	rax, QWORD PTR [r12+48]
-        mov	r9, QWORD PTR [r13+48]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+48], rax
-        mov	QWORD PTR [r13+48], r9
-        mov	rax, QWORD PTR [r12+56]
-        mov	r9, QWORD PTR [r13+56]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+56], rax
-        mov	QWORD PTR [r13+56], r9
-        mov	rax, QWORD PTR [r12+64]
-        mov	r9, QWORD PTR [r13+64]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+64], rax
-        mov	QWORD PTR [r13+64], r9
-        mov	rax, QWORD PTR [r12+72]
-        mov	r9, QWORD PTR [r13+72]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+72], rax
-        mov	QWORD PTR [r13+72], r9
-        mov	rax, QWORD PTR [r12+80]
-        mov	r9, QWORD PTR [r13+80]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+80], rax
-        mov	QWORD PTR [r13+80], r9
-        mov	rax, QWORD PTR [r12+88]
-        mov	r9, QWORD PTR [r13+88]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+88], rax
-        mov	QWORD PTR [r13+88], r9
-        mov	rax, QWORD PTR [r12+96]
-        mov	r9, QWORD PTR [r13+96]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+96], rax
-        mov	QWORD PTR [r13+96], r9
-        mov	rax, QWORD PTR [r12+104]
-        mov	r9, QWORD PTR [r13+104]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+104], rax
-        mov	QWORD PTR [r13+104], r9
-        mov	rax, QWORD PTR [r12+112]
-        mov	r9, QWORD PTR [r13+112]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+112], rax
-        mov	QWORD PTR [r13+112], r9
-        mov	rax, QWORD PTR [r12+120]
-        mov	r9, QWORD PTR [r13+120]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+120], rax
-        mov	QWORD PTR [r13+120], r9
-        mov	rax, QWORD PTR [r12+128]
-        mov	r9, QWORD PTR [r13+128]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+128], rax
-        mov	QWORD PTR [r13+128], r9
-        mov	rax, QWORD PTR [r12+136]
-        mov	r9, QWORD PTR [r13+136]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+136], rax
-        mov	QWORD PTR [r13+136], r9
-        mov	rax, QWORD PTR [r12+144]
-        mov	r9, QWORD PTR [r13+144]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+144], rax
-        mov	QWORD PTR [r13+144], r9
-        mov	rax, QWORD PTR [r12+152]
-        mov	r9, QWORD PTR [r13+152]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+152], rax
-        mov	QWORD PTR [r13+152], r9
-        mov	rax, QWORD PTR [r12+160]
-        mov	r9, QWORD PTR [r13+160]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+160], rax
-        mov	QWORD PTR [r13+160], r9
-        mov	rax, QWORD PTR [r12+168]
-        mov	r9, QWORD PTR [r13+168]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+168], rax
-        mov	QWORD PTR [r13+168], r9
-        mov	rax, QWORD PTR [r12+176]
-        mov	r9, QWORD PTR [r13+176]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+176], rax
-        mov	QWORD PTR [r13+176], r9
-        mov	rax, QWORD PTR [r12+184]
-        mov	r9, QWORD PTR [r13+184]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+184], rax
-        mov	QWORD PTR [r13+184], r9
-        mov	rax, QWORD PTR [r12+192]
-        mov	r9, QWORD PTR [r13+192]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+192], rax
-        mov	QWORD PTR [r13+192], r9
-        mov	rax, QWORD PTR [r12+200]
-        mov	r9, QWORD PTR [r13+200]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+200], rax
-        mov	QWORD PTR [r13+200], r9
-        mov	rax, QWORD PTR [r12+208]
-        mov	r9, QWORD PTR [r13+208]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+208], rax
-        mov	QWORD PTR [r13+208], r9
-        mov	rax, QWORD PTR [r12+216]
-        mov	r9, QWORD PTR [r13+216]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+216], rax
-        mov	QWORD PTR [r13+216], r9
-        mov	rax, QWORD PTR [r12+224]
-        mov	r9, QWORD PTR [r13+224]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+224], rax
-        mov	QWORD PTR [r13+224], r9
-        mov	rax, QWORD PTR [r12+232]
-        mov	r9, QWORD PTR [r13+232]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+232], rax
-        mov	QWORD PTR [r13+232], r9
-        mov	rax, QWORD PTR [r12+240]
-        mov	r9, QWORD PTR [r13+240]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+240], rax
-        mov	QWORD PTR [r13+240], r9
-        mov	rax, QWORD PTR [r12+248]
-        mov	r9, QWORD PTR [r13+248]
-        and	rax, rdi
-        and	r9, r15
-        mov	QWORD PTR [r12+248], rax
-        mov	QWORD PTR [r13+248], r9
-        mov	rax, QWORD PTR [r12]
-        add	rax, QWORD PTR [r13]
-        mov	r9, QWORD PTR [r12+8]
-        mov	QWORD PTR [rsi], rax
-        adc	r9, QWORD PTR [r13+8]
-        mov	r10, QWORD PTR [r12+16]
-        mov	QWORD PTR [rsi+8], r9
-        adc	r10, QWORD PTR [r13+16]
-        mov	rax, QWORD PTR [r12+24]
-        mov	QWORD PTR [rsi+16], r10
-        adc	rax, QWORD PTR [r13+24]
-        mov	r9, QWORD PTR [r12+32]
-        mov	QWORD PTR [rsi+24], rax
-        adc	r9, QWORD PTR [r13+32]
-        mov	r10, QWORD PTR [r12+40]
-        mov	QWORD PTR [rsi+32], r9
-        adc	r10, QWORD PTR [r13+40]
-        mov	rax, QWORD PTR [r12+48]
-        mov	QWORD PTR [rsi+40], r10
-        adc	rax, QWORD PTR [r13+48]
-        mov	r9, QWORD PTR [r12+56]
-        mov	QWORD PTR [rsi+48], rax
-        adc	r9, QWORD PTR [r13+56]
-        mov	r10, QWORD PTR [r12+64]
-        mov	QWORD PTR [rsi+56], r9
-        adc	r10, QWORD PTR [r13+64]
-        mov	rax, QWORD PTR [r12+72]
-        mov	QWORD PTR [rsi+64], r10
-        adc	rax, QWORD PTR [r13+72]
-        mov	r9, QWORD PTR [r12+80]
-        mov	QWORD PTR [rsi+72], rax
-        adc	r9, QWORD PTR [r13+80]
-        mov	r10, QWORD PTR [r12+88]
-        mov	QWORD PTR [rsi+80], r9
-        adc	r10, QWORD PTR [r13+88]
-        mov	rax, QWORD PTR [r12+96]
-        mov	QWORD PTR [rsi+88], r10
-        adc	rax, QWORD PTR [r13+96]
-        mov	r9, QWORD PTR [r12+104]
-        mov	QWORD PTR [rsi+96], rax
-        adc	r9, QWORD PTR [r13+104]
-        mov	r10, QWORD PTR [r12+112]
-        mov	QWORD PTR [rsi+104], r9
-        adc	r10, QWORD PTR [r13+112]
-        mov	rax, QWORD PTR [r12+120]
-        mov	QWORD PTR [rsi+112], r10
-        adc	rax, QWORD PTR [r13+120]
-        mov	r9, QWORD PTR [r12+128]
-        mov	QWORD PTR [rsi+120], rax
-        adc	r9, QWORD PTR [r13+128]
-        mov	r10, QWORD PTR [r12+136]
-        mov	QWORD PTR [rsi+128], r9
-        adc	r10, QWORD PTR [r13+136]
-        mov	rax, QWORD PTR [r12+144]
-        mov	QWORD PTR [rsi+136], r10
-        adc	rax, QWORD PTR [r13+144]
-        mov	r9, QWORD PTR [r12+152]
-        mov	QWORD PTR [rsi+144], rax
-        adc	r9, QWORD PTR [r13+152]
-        mov	r10, QWORD PTR [r12+160]
-        mov	QWORD PTR [rsi+152], r9
-        adc	r10, QWORD PTR [r13+160]
-        mov	rax, QWORD PTR [r12+168]
-        mov	QWORD PTR [rsi+160], r10
-        adc	rax, QWORD PTR [r13+168]
-        mov	r9, QWORD PTR [r12+176]
-        mov	QWORD PTR [rsi+168], rax
-        adc	r9, QWORD PTR [r13+176]
-        mov	r10, QWORD PTR [r12+184]
-        mov	QWORD PTR [rsi+176], r9
-        adc	r10, QWORD PTR [r13+184]
-        mov	rax, QWORD PTR [r12+192]
-        mov	QWORD PTR [rsi+184], r10
-        adc	rax, QWORD PTR [r13+192]
-        mov	r9, QWORD PTR [r12+200]
-        mov	QWORD PTR [rsi+192], rax
-        adc	r9, QWORD PTR [r13+200]
-        mov	r10, QWORD PTR [r12+208]
-        mov	QWORD PTR [rsi+200], r9
-        adc	r10, QWORD PTR [r13+208]
-        mov	rax, QWORD PTR [r12+216]
-        mov	QWORD PTR [rsi+208], r10
-        adc	rax, QWORD PTR [r13+216]
-        mov	r9, QWORD PTR [r12+224]
-        mov	QWORD PTR [rsi+216], rax
-        adc	r9, QWORD PTR [r13+224]
-        mov	r10, QWORD PTR [r12+232]
-        mov	QWORD PTR [rsi+224], r9
-        adc	r10, QWORD PTR [r13+232]
-        mov	rax, QWORD PTR [r12+240]
-        mov	QWORD PTR [rsi+232], r10
-        adc	rax, QWORD PTR [r13+240]
-        mov	r9, QWORD PTR [r12+248]
-        mov	QWORD PTR [rsi+240], rax
-        adc	r9, QWORD PTR [r13+248]
-        mov	QWORD PTR [rsi+248], r9
-        adc	r11, 0
-        lea	r13, QWORD PTR [rsp+512]
-        mov	r12, rsp
-        mov	rax, QWORD PTR [r12]
-        sub	rax, QWORD PTR [r13]
-        mov	r9, QWORD PTR [r12+8]
-        mov	QWORD PTR [r12], rax
-        sbb	r9, QWORD PTR [r13+8]
-        mov	r10, QWORD PTR [r12+16]
-        mov	QWORD PTR [r12+8], r9
-        sbb	r10, QWORD PTR [r13+16]
-        mov	rax, QWORD PTR [r12+24]
-        mov	QWORD PTR [r12+16], r10
-        sbb	rax, QWORD PTR [r13+24]
-        mov	r9, QWORD PTR [r12+32]
-        mov	QWORD PTR [r12+24], rax
-        sbb	r9, QWORD PTR [r13+32]
-        mov	r10, QWORD PTR [r12+40]
-        mov	QWORD PTR [r12+32], r9
-        sbb	r10, QWORD PTR [r13+40]
-        mov	rax, QWORD PTR [r12+48]
-        mov	QWORD PTR [r12+40], r10
-        sbb	rax, QWORD PTR [r13+48]
-        mov	r9, QWORD PTR [r12+56]
-        mov	QWORD PTR [r12+48], rax
-        sbb	r9, QWORD PTR [r13+56]
-        mov	r10, QWORD PTR [r12+64]
-        mov	QWORD PTR [r12+56], r9
-        sbb	r10, QWORD PTR [r13+64]
-        mov	rax, QWORD PTR [r12+72]
-        mov	QWORD PTR [r12+64], r10
-        sbb	rax, QWORD PTR [r13+72]
-        mov	r9, QWORD PTR [r12+80]
-        mov	QWORD PTR [r12+72], rax
-        sbb	r9, QWORD PTR [r13+80]
-        mov	r10, QWORD PTR [r12+88]
-        mov	QWORD PTR [r12+80], r9
-        sbb	r10, QWORD PTR [r13+88]
-        mov	rax, QWORD PTR [r12+96]
-        mov	QWORD PTR [r12+88], r10
-        sbb	rax, QWORD PTR [r13+96]
-        mov	r9, QWORD PTR [r12+104]
-        mov	QWORD PTR [r12+96], rax
-        sbb	r9, QWORD PTR [r13+104]
-        mov	r10, QWORD PTR [r12+112]
-        mov	QWORD PTR [r12+104], r9
-        sbb	r10, QWORD PTR [r13+112]
-        mov	rax, QWORD PTR [r12+120]
-        mov	QWORD PTR [r12+112], r10
-        sbb	rax, QWORD PTR [r13+120]
-        mov	r9, QWORD PTR [r12+128]
-        mov	QWORD PTR [r12+120], rax
-        sbb	r9, QWORD PTR [r13+128]
-        mov	r10, QWORD PTR [r12+136]
-        mov	QWORD PTR [r12+128], r9
-        sbb	r10, QWORD PTR [r13+136]
-        mov	rax, QWORD PTR [r12+144]
-        mov	QWORD PTR [r12+136], r10
-        sbb	rax, QWORD PTR [r13+144]
-        mov	r9, QWORD PTR [r12+152]
-        mov	QWORD PTR [r12+144], rax
-        sbb	r9, QWORD PTR [r13+152]
-        mov	r10, QWORD PTR [r12+160]
-        mov	QWORD PTR [r12+152], r9
-        sbb	r10, QWORD PTR [r13+160]
-        mov	rax, QWORD PTR [r12+168]
-        mov	QWORD PTR [r12+160], r10
-        sbb	rax, QWORD PTR [r13+168]
-        mov	r9, QWORD PTR [r12+176]
-        mov	QWORD PTR [r12+168], rax
-        sbb	r9, QWORD PTR [r13+176]
-        mov	r10, QWORD PTR [r12+184]
-        mov	QWORD PTR [r12+176], r9
-        sbb	r10, QWORD PTR [r13+184]
-        mov	rax, QWORD PTR [r12+192]
-        mov	QWORD PTR [r12+184], r10
-        sbb	rax, QWORD PTR [r13+192]
-        mov	r9, QWORD PTR [r12+200]
-        mov	QWORD PTR [r12+192], rax
-        sbb	r9, QWORD PTR [r13+200]
-        mov	r10, QWORD PTR [r12+208]
-        mov	QWORD PTR [r12+200], r9
-        sbb	r10, QWORD PTR [r13+208]
-        mov	rax, QWORD PTR [r12+216]
-        mov	QWORD PTR [r12+208], r10
-        sbb	rax, QWORD PTR [r13+216]
-        mov	r9, QWORD PTR [r12+224]
-        mov	QWORD PTR [r12+216], rax
-        sbb	r9, QWORD PTR [r13+224]
-        mov	r10, QWORD PTR [r12+232]
-        mov	QWORD PTR [r12+224], r9
-        sbb	r10, QWORD PTR [r13+232]
-        mov	rax, QWORD PTR [r12+240]
-        mov	QWORD PTR [r12+232], r10
-        sbb	rax, QWORD PTR [r13+240]
-        mov	r9, QWORD PTR [r12+248]
-        mov	QWORD PTR [r12+240], rax
-        sbb	r9, QWORD PTR [r13+248]
-        mov	r10, QWORD PTR [r12+256]
-        mov	QWORD PTR [r12+248], r9
-        sbb	r10, QWORD PTR [r13+256]
-        mov	rax, QWORD PTR [r12+264]
-        mov	QWORD PTR [r12+256], r10
-        sbb	rax, QWORD PTR [r13+264]
-        mov	r9, QWORD PTR [r12+272]
-        mov	QWORD PTR [r12+264], rax
-        sbb	r9, QWORD PTR [r13+272]
-        mov	r10, QWORD PTR [r12+280]
-        mov	QWORD PTR [r12+272], r9
-        sbb	r10, QWORD PTR [r13+280]
-        mov	rax, QWORD PTR [r12+288]
-        mov	QWORD PTR [r12+280], r10
-        sbb	rax, QWORD PTR [r13+288]
-        mov	r9, QWORD PTR [r12+296]
-        mov	QWORD PTR [r12+288], rax
-        sbb	r9, QWORD PTR [r13+296]
-        mov	r10, QWORD PTR [r12+304]
-        mov	QWORD PTR [r12+296], r9
-        sbb	r10, QWORD PTR [r13+304]
-        mov	rax, QWORD PTR [r12+312]
-        mov	QWORD PTR [r12+304], r10
-        sbb	rax, QWORD PTR [r13+312]
-        mov	r9, QWORD PTR [r12+320]
-        mov	QWORD PTR [r12+312], rax
-        sbb	r9, QWORD PTR [r13+320]
-        mov	r10, QWORD PTR [r12+328]
-        mov	QWORD PTR [r12+320], r9
-        sbb	r10, QWORD PTR [r13+328]
-        mov	rax, QWORD PTR [r12+336]
-        mov	QWORD PTR [r12+328], r10
-        sbb	rax, QWORD PTR [r13+336]
-        mov	r9, QWORD PTR [r12+344]
-        mov	QWORD PTR [r12+336], rax
-        sbb	r9, QWORD PTR [r13+344]
-        mov	r10, QWORD PTR [r12+352]
-        mov	QWORD PTR [r12+344], r9
-        sbb	r10, QWORD PTR [r13+352]
-        mov	rax, QWORD PTR [r12+360]
-        mov	QWORD PTR [r12+352], r10
-        sbb	rax, QWORD PTR [r13+360]
-        mov	r9, QWORD PTR [r12+368]
-        mov	QWORD PTR [r12+360], rax
-        sbb	r9, QWORD PTR [r13+368]
-        mov	r10, QWORD PTR [r12+376]
-        mov	QWORD PTR [r12+368], r9
-        sbb	r10, QWORD PTR [r13+376]
-        mov	rax, QWORD PTR [r12+384]
-        mov	QWORD PTR [r12+376], r10
-        sbb	rax, QWORD PTR [r13+384]
-        mov	r9, QWORD PTR [r12+392]
-        mov	QWORD PTR [r12+384], rax
-        sbb	r9, QWORD PTR [r13+392]
-        mov	r10, QWORD PTR [r12+400]
-        mov	QWORD PTR [r12+392], r9
-        sbb	r10, QWORD PTR [r13+400]
-        mov	rax, QWORD PTR [r12+408]
-        mov	QWORD PTR [r12+400], r10
-        sbb	rax, QWORD PTR [r13+408]
-        mov	r9, QWORD PTR [r12+416]
-        mov	QWORD PTR [r12+408], rax
-        sbb	r9, QWORD PTR [r13+416]
-        mov	r10, QWORD PTR [r12+424]
-        mov	QWORD PTR [r12+416], r9
-        sbb	r10, QWORD PTR [r13+424]
-        mov	rax, QWORD PTR [r12+432]
-        mov	QWORD PTR [r12+424], r10
-        sbb	rax, QWORD PTR [r13+432]
-        mov	r9, QWORD PTR [r12+440]
-        mov	QWORD PTR [r12+432], rax
-        sbb	r9, QWORD PTR [r13+440]
-        mov	r10, QWORD PTR [r12+448]
-        mov	QWORD PTR [r12+440], r9
-        sbb	r10, QWORD PTR [r13+448]
-        mov	rax, QWORD PTR [r12+456]
-        mov	QWORD PTR [r12+448], r10
-        sbb	rax, QWORD PTR [r13+456]
-        mov	r9, QWORD PTR [r12+464]
-        mov	QWORD PTR [r12+456], rax
-        sbb	r9, QWORD PTR [r13+464]
-        mov	r10, QWORD PTR [r12+472]
-        mov	QWORD PTR [r12+464], r9
-        sbb	r10, QWORD PTR [r13+472]
-        mov	rax, QWORD PTR [r12+480]
-        mov	QWORD PTR [r12+472], r10
-        sbb	rax, QWORD PTR [r13+480]
-        mov	r9, QWORD PTR [r12+488]
-        mov	QWORD PTR [r12+480], rax
-        sbb	r9, QWORD PTR [r13+488]
-        mov	r10, QWORD PTR [r12+496]
-        mov	QWORD PTR [r12+488], r9
-        sbb	r10, QWORD PTR [r13+496]
-        mov	rax, QWORD PTR [r12+504]
-        mov	QWORD PTR [r12+496], r10
-        sbb	rax, QWORD PTR [r13+504]
-        mov	QWORD PTR [r12+504], rax
-        sbb	r11, 0
-        mov	rax, QWORD PTR [r12]
-        sub	rax, QWORD PTR [rcx]
-        mov	r9, QWORD PTR [r12+8]
-        mov	QWORD PTR [r12], rax
-        sbb	r9, QWORD PTR [rcx+8]
-        mov	r10, QWORD PTR [r12+16]
-        mov	QWORD PTR [r12+8], r9
-        sbb	r10, QWORD PTR [rcx+16]
-        mov	rax, QWORD PTR [r12+24]
-        mov	QWORD PTR [r12+16], r10
-        sbb	rax, QWORD PTR [rcx+24]
-        mov	r9, QWORD PTR [r12+32]
-        mov	QWORD PTR [r12+24], rax
-        sbb	r9, QWORD PTR [rcx+32]
-        mov	r10, QWORD PTR [r12+40]
-        mov	QWORD PTR [r12+32], r9
-        sbb	r10, QWORD PTR [rcx+40]
-        mov	rax, QWORD PTR [r12+48]
-        mov	QWORD PTR [r12+40], r10
-        sbb	rax, QWORD PTR [rcx+48]
-        mov	r9, QWORD PTR [r12+56]
-        mov	QWORD PTR [r12+48], rax
-        sbb	r9, QWORD PTR [rcx+56]
-        mov	r10, QWORD PTR [r12+64]
-        mov	QWORD PTR [r12+56], r9
-        sbb	r10, QWORD PTR [rcx+64]
-        mov	rax, QWORD PTR [r12+72]
-        mov	QWORD PTR [r12+64], r10
-        sbb	rax, QWORD PTR [rcx+72]
-        mov	r9, QWORD PTR [r12+80]
-        mov	QWORD PTR [r12+72], rax
-        sbb	r9, QWORD PTR [rcx+80]
-        mov	r10, QWORD PTR [r12+88]
-        mov	QWORD PTR [r12+80], r9
-        sbb	r10, QWORD PTR [rcx+88]
-        mov	rax, QWORD PTR [r12+96]
-        mov	QWORD PTR [r12+88], r10
-        sbb	rax, QWORD PTR [rcx+96]
-        mov	r9, QWORD PTR [r12+104]
-        mov	QWORD PTR [r12+96], rax
-        sbb	r9, QWORD PTR [rcx+104]
-        mov	r10, QWORD PTR [r12+112]
-        mov	QWORD PTR [r12+104], r9
-        sbb	r10, QWORD PTR [rcx+112]
-        mov	rax, QWORD PTR [r12+120]
-        mov	QWORD PTR [r12+112], r10
-        sbb	rax, QWORD PTR [rcx+120]
-        mov	r9, QWORD PTR [r12+128]
-        mov	QWORD PTR [r12+120], rax
-        sbb	r9, QWORD PTR [rcx+128]
-        mov	r10, QWORD PTR [r12+136]
-        mov	QWORD PTR [r12+128], r9
-        sbb	r10, QWORD PTR [rcx+136]
-        mov	rax, QWORD PTR [r12+144]
-        mov	QWORD PTR [r12+136], r10
-        sbb	rax, QWORD PTR [rcx+144]
-        mov	r9, QWORD PTR [r12+152]
-        mov	QWORD PTR [r12+144], rax
-        sbb	r9, QWORD PTR [rcx+152]
-        mov	r10, QWORD PTR [r12+160]
-        mov	QWORD PTR [r12+152], r9
-        sbb	r10, QWORD PTR [rcx+160]
-        mov	rax, QWORD PTR [r12+168]
-        mov	QWORD PTR [r12+160], r10
-        sbb	rax, QWORD PTR [rcx+168]
-        mov	r9, QWORD PTR [r12+176]
-        mov	QWORD PTR [r12+168], rax
-        sbb	r9, QWORD PTR [rcx+176]
-        mov	r10, QWORD PTR [r12+184]
-        mov	QWORD PTR [r12+176], r9
-        sbb	r10, QWORD PTR [rcx+184]
-        mov	rax, QWORD PTR [r12+192]
-        mov	QWORD PTR [r12+184], r10
-        sbb	rax, QWORD PTR [rcx+192]
-        mov	r9, QWORD PTR [r12+200]
-        mov	QWORD PTR [r12+192], rax
-        sbb	r9, QWORD PTR [rcx+200]
-        mov	r10, QWORD PTR [r12+208]
-        mov	QWORD PTR [r12+200], r9
-        sbb	r10, QWORD PTR [rcx+208]
-        mov	rax, QWORD PTR [r12+216]
-        mov	QWORD PTR [r12+208], r10
-        sbb	rax, QWORD PTR [rcx+216]
-        mov	r9, QWORD PTR [r12+224]
-        mov	QWORD PTR [r12+216], rax
-        sbb	r9, QWORD PTR [rcx+224]
-        mov	r10, QWORD PTR [r12+232]
-        mov	QWORD PTR [r12+224], r9
-        sbb	r10, QWORD PTR [rcx+232]
-        mov	rax, QWORD PTR [r12+240]
-        mov	QWORD PTR [r12+232], r10
-        sbb	rax, QWORD PTR [rcx+240]
-        mov	r9, QWORD PTR [r12+248]
-        mov	QWORD PTR [r12+240], rax
-        sbb	r9, QWORD PTR [rcx+248]
-        mov	r10, QWORD PTR [r12+256]
-        mov	QWORD PTR [r12+248], r9
-        sbb	r10, QWORD PTR [rcx+256]
-        mov	rax, QWORD PTR [r12+264]
-        mov	QWORD PTR [r12+256], r10
-        sbb	rax, QWORD PTR [rcx+264]
-        mov	r9, QWORD PTR [r12+272]
-        mov	QWORD PTR [r12+264], rax
-        sbb	r9, QWORD PTR [rcx+272]
-        mov	r10, QWORD PTR [r12+280]
-        mov	QWORD PTR [r12+272], r9
-        sbb	r10, QWORD PTR [rcx+280]
-        mov	rax, QWORD PTR [r12+288]
-        mov	QWORD PTR [r12+280], r10
-        sbb	rax, QWORD PTR [rcx+288]
-        mov	r9, QWORD PTR [r12+296]
-        mov	QWORD PTR [r12+288], rax
-        sbb	r9, QWORD PTR [rcx+296]
-        mov	r10, QWORD PTR [r12+304]
-        mov	QWORD PTR [r12+296], r9
-        sbb	r10, QWORD PTR [rcx+304]
-        mov	rax, QWORD PTR [r12+312]
-        mov	QWORD PTR [r12+304], r10
-        sbb	rax, QWORD PTR [rcx+312]
-        mov	r9, QWORD PTR [r12+320]
-        mov	QWORD PTR [r12+312], rax
-        sbb	r9, QWORD PTR [rcx+320]
-        mov	r10, QWORD PTR [r12+328]
-        mov	QWORD PTR [r12+320], r9
-        sbb	r10, QWORD PTR [rcx+328]
-        mov	rax, QWORD PTR [r12+336]
-        mov	QWORD PTR [r12+328], r10
-        sbb	rax, QWORD PTR [rcx+336]
-        mov	r9, QWORD PTR [r12+344]
-        mov	QWORD PTR [r12+336], rax
-        sbb	r9, QWORD PTR [rcx+344]
-        mov	r10, QWORD PTR [r12+352]
-        mov	QWORD PTR [r12+344], r9
-        sbb	r10, QWORD PTR [rcx+352]
-        mov	rax, QWORD PTR [r12+360]
-        mov	QWORD PTR [r12+352], r10
-        sbb	rax, QWORD PTR [rcx+360]
-        mov	r9, QWORD PTR [r12+368]
-        mov	QWORD PTR [r12+360], rax
-        sbb	r9, QWORD PTR [rcx+368]
-        mov	r10, QWORD PTR [r12+376]
-        mov	QWORD PTR [r12+368], r9
-        sbb	r10, QWORD PTR [rcx+376]
-        mov	rax, QWORD PTR [r12+384]
-        mov	QWORD PTR [r12+376], r10
-        sbb	rax, QWORD PTR [rcx+384]
-        mov	r9, QWORD PTR [r12+392]
-        mov	QWORD PTR [r12+384], rax
-        sbb	r9, QWORD PTR [rcx+392]
-        mov	r10, QWORD PTR [r12+400]
-        mov	QWORD PTR [r12+392], r9
-        sbb	r10, QWORD PTR [rcx+400]
-        mov	rax, QWORD PTR [r12+408]
-        mov	QWORD PTR [r12+400], r10
-        sbb	rax, QWORD PTR [rcx+408]
-        mov	r9, QWORD PTR [r12+416]
-        mov	QWORD PTR [r12+408], rax
-        sbb	r9, QWORD PTR [rcx+416]
-        mov	r10, QWORD PTR [r12+424]
-        mov	QWORD PTR [r12+416], r9
-        sbb	r10, QWORD PTR [rcx+424]
-        mov	rax, QWORD PTR [r12+432]
-        mov	QWORD PTR [r12+424], r10
-        sbb	rax, QWORD PTR [rcx+432]
-        mov	r9, QWORD PTR [r12+440]
-        mov	QWORD PTR [r12+432], rax
-        sbb	r9, QWORD PTR [rcx+440]
-        mov	r10, QWORD PTR [r12+448]
-        mov	QWORD PTR [r12+440], r9
-        sbb	r10, QWORD PTR [rcx+448]
-        mov	rax, QWORD PTR [r12+456]
-        mov	QWORD PTR [r12+448], r10
-        sbb	rax, QWORD PTR [rcx+456]
-        mov	r9, QWORD PTR [r12+464]
-        mov	QWORD PTR [r12+456], rax
-        sbb	r9, QWORD PTR [rcx+464]
-        mov	r10, QWORD PTR [r12+472]
-        mov	QWORD PTR [r12+464], r9
-        sbb	r10, QWORD PTR [rcx+472]
-        mov	rax, QWORD PTR [r12+480]
-        mov	QWORD PTR [r12+472], r10
-        sbb	rax, QWORD PTR [rcx+480]
-        mov	r9, QWORD PTR [r12+488]
-        mov	QWORD PTR [r12+480], rax
-        sbb	r9, QWORD PTR [rcx+488]
-        mov	r10, QWORD PTR [r12+496]
-        mov	QWORD PTR [r12+488], r9
-        sbb	r10, QWORD PTR [rcx+496]
-        mov	rax, QWORD PTR [r12+504]
-        mov	QWORD PTR [r12+496], r10
-        sbb	rax, QWORD PTR [rcx+504]
-        mov	QWORD PTR [r12+504], rax
-        sbb	r11, 0
-        sub	rsi, 256
-        ; Add
-        mov	rax, QWORD PTR [rsi]
-        add	rax, QWORD PTR [r12]
-        mov	r9, QWORD PTR [rsi+8]
-        mov	QWORD PTR [rsi], rax
-        adc	r9, QWORD PTR [r12+8]
-        mov	r10, QWORD PTR [rsi+16]
-        mov	QWORD PTR [rsi+8], r9
-        adc	r10, QWORD PTR [r12+16]
-        mov	rax, QWORD PTR [rsi+24]
-        mov	QWORD PTR [rsi+16], r10
-        adc	rax, QWORD PTR [r12+24]
-        mov	r9, QWORD PTR [rsi+32]
-        mov	QWORD PTR [rsi+24], rax
-        adc	r9, QWORD PTR [r12+32]
-        mov	r10, QWORD PTR [rsi+40]
-        mov	QWORD PTR [rsi+32], r9
-        adc	r10, QWORD PTR [r12+40]
-        mov	rax, QWORD PTR [rsi+48]
-        mov	QWORD PTR [rsi+40], r10
-        adc	rax, QWORD PTR [r12+48]
-        mov	r9, QWORD PTR [rsi+56]
-        mov	QWORD PTR [rsi+48], rax
-        adc	r9, QWORD PTR [r12+56]
-        mov	r10, QWORD PTR [rsi+64]
-        mov	QWORD PTR [rsi+56], r9
-        adc	r10, QWORD PTR [r12+64]
-        mov	rax, QWORD PTR [rsi+72]
-        mov	QWORD PTR [rsi+64], r10
-        adc	rax, QWORD PTR [r12+72]
-        mov	r9, QWORD PTR [rsi+80]
-        mov	QWORD PTR [rsi+72], rax
-        adc	r9, QWORD PTR [r12+80]
-        mov	r10, QWORD PTR [rsi+88]
-        mov	QWORD PTR [rsi+80], r9
-        adc	r10, QWORD PTR [r12+88]
-        mov	rax, QWORD PTR [rsi+96]
-        mov	QWORD PTR [rsi+88], r10
-        adc	rax, QWORD PTR [r12+96]
-        mov	r9, QWORD PTR [rsi+104]
-        mov	QWORD PTR [rsi+96], rax
-        adc	r9, QWORD PTR [r12+104]
-        mov	r10, QWORD PTR [rsi+112]
-        mov	QWORD PTR [rsi+104], r9
-        adc	r10, QWORD PTR [r12+112]
-        mov	rax, QWORD PTR [rsi+120]
-        mov	QWORD PTR [rsi+112], r10
-        adc	rax, QWORD PTR [r12+120]
-        mov	r9, QWORD PTR [rsi+128]
-        mov	QWORD PTR [rsi+120], rax
-        adc	r9, QWORD PTR [r12+128]
-        mov	r10, QWORD PTR [rsi+136]
-        mov	QWORD PTR [rsi+128], r9
-        adc	r10, QWORD PTR [r12+136]
-        mov	rax, QWORD PTR [rsi+144]
-        mov	QWORD PTR [rsi+136], r10
-        adc	rax, QWORD PTR [r12+144]
-        mov	r9, QWORD PTR [rsi+152]
-        mov	QWORD PTR [rsi+144], rax
-        adc	r9, QWORD PTR [r12+152]
-        mov	r10, QWORD PTR [rsi+160]
-        mov	QWORD PTR [rsi+152], r9
-        adc	r10, QWORD PTR [r12+160]
-        mov	rax, QWORD PTR [rsi+168]
-        mov	QWORD PTR [rsi+160], r10
-        adc	rax, QWORD PTR [r12+168]
-        mov	r9, QWORD PTR [rsi+176]
-        mov	QWORD PTR [rsi+168], rax
-        adc	r9, QWORD PTR [r12+176]
-        mov	r10, QWORD PTR [rsi+184]
-        mov	QWORD PTR [rsi+176], r9
-        adc	r10, QWORD PTR [r12+184]
-        mov	rax, QWORD PTR [rsi+192]
-        mov	QWORD PTR [rsi+184], r10
-        adc	rax, QWORD PTR [r12+192]
-        mov	r9, QWORD PTR [rsi+200]
-        mov	QWORD PTR [rsi+192], rax
-        adc	r9, QWORD PTR [r12+200]
-        mov	r10, QWORD PTR [rsi+208]
-        mov	QWORD PTR [rsi+200], r9
-        adc	r10, QWORD PTR [r12+208]
-        mov	rax, QWORD PTR [rsi+216]
-        mov	QWORD PTR [rsi+208], r10
-        adc	rax, QWORD PTR [r12+216]
-        mov	r9, QWORD PTR [rsi+224]
-        mov	QWORD PTR [rsi+216], rax
-        adc	r9, QWORD PTR [r12+224]
-        mov	r10, QWORD PTR [rsi+232]
-        mov	QWORD PTR [rsi+224], r9
-        adc	r10, QWORD PTR [r12+232]
-        mov	rax, QWORD PTR [rsi+240]
-        mov	QWORD PTR [rsi+232], r10
-        adc	rax, QWORD PTR [r12+240]
-        mov	r9, QWORD PTR [rsi+248]
-        mov	QWORD PTR [rsi+240], rax
-        adc	r9, QWORD PTR [r12+248]
-        mov	r10, QWORD PTR [rsi+256]
-        mov	QWORD PTR [rsi+248], r9
-        adc	r10, QWORD PTR [r12+256]
-        mov	rax, QWORD PTR [rsi+264]
-        mov	QWORD PTR [rsi+256], r10
-        adc	rax, QWORD PTR [r12+264]
-        mov	r9, QWORD PTR [rsi+272]
-        mov	QWORD PTR [rsi+264], rax
-        adc	r9, QWORD PTR [r12+272]
-        mov	r10, QWORD PTR [rsi+280]
-        mov	QWORD PTR [rsi+272], r9
-        adc	r10, QWORD PTR [r12+280]
-        mov	rax, QWORD PTR [rsi+288]
-        mov	QWORD PTR [rsi+280], r10
-        adc	rax, QWORD PTR [r12+288]
-        mov	r9, QWORD PTR [rsi+296]
-        mov	QWORD PTR [rsi+288], rax
-        adc	r9, QWORD PTR [r12+296]
-        mov	r10, QWORD PTR [rsi+304]
-        mov	QWORD PTR [rsi+296], r9
-        adc	r10, QWORD PTR [r12+304]
-        mov	rax, QWORD PTR [rsi+312]
-        mov	QWORD PTR [rsi+304], r10
-        adc	rax, QWORD PTR [r12+312]
-        mov	r9, QWORD PTR [rsi+320]
-        mov	QWORD PTR [rsi+312], rax
-        adc	r9, QWORD PTR [r12+320]
-        mov	r10, QWORD PTR [rsi+328]
-        mov	QWORD PTR [rsi+320], r9
-        adc	r10, QWORD PTR [r12+328]
-        mov	rax, QWORD PTR [rsi+336]
-        mov	QWORD PTR [rsi+328], r10
-        adc	rax, QWORD PTR [r12+336]
-        mov	r9, QWORD PTR [rsi+344]
-        mov	QWORD PTR [rsi+336], rax
-        adc	r9, QWORD PTR [r12+344]
-        mov	r10, QWORD PTR [rsi+352]
-        mov	QWORD PTR [rsi+344], r9
-        adc	r10, QWORD PTR [r12+352]
-        mov	rax, QWORD PTR [rsi+360]
-        mov	QWORD PTR [rsi+352], r10
-        adc	rax, QWORD PTR [r12+360]
-        mov	r9, QWORD PTR [rsi+368]
-        mov	QWORD PTR [rsi+360], rax
-        adc	r9, QWORD PTR [r12+368]
-        mov	r10, QWORD PTR [rsi+376]
-        mov	QWORD PTR [rsi+368], r9
-        adc	r10, QWORD PTR [r12+376]
-        mov	rax, QWORD PTR [rsi+384]
-        mov	QWORD PTR [rsi+376], r10
-        adc	rax, QWORD PTR [r12+384]
-        mov	r9, QWORD PTR [rsi+392]
-        mov	QWORD PTR [rsi+384], rax
-        adc	r9, QWORD PTR [r12+392]
-        mov	r10, QWORD PTR [rsi+400]
-        mov	QWORD PTR [rsi+392], r9
-        adc	r10, QWORD PTR [r12+400]
-        mov	rax, QWORD PTR [rsi+408]
-        mov	QWORD PTR [rsi+400], r10
-        adc	rax, QWORD PTR [r12+408]
-        mov	r9, QWORD PTR [rsi+416]
-        mov	QWORD PTR [rsi+408], rax
-        adc	r9, QWORD PTR [r12+416]
-        mov	r10, QWORD PTR [rsi+424]
-        mov	QWORD PTR [rsi+416], r9
-        adc	r10, QWORD PTR [r12+424]
-        mov	rax, QWORD PTR [rsi+432]
-        mov	QWORD PTR [rsi+424], r10
-        adc	rax, QWORD PTR [r12+432]
-        mov	r9, QWORD PTR [rsi+440]
-        mov	QWORD PTR [rsi+432], rax
-        adc	r9, QWORD PTR [r12+440]
-        mov	r10, QWORD PTR [rsi+448]
-        mov	QWORD PTR [rsi+440], r9
-        adc	r10, QWORD PTR [r12+448]
-        mov	rax, QWORD PTR [rsi+456]
-        mov	QWORD PTR [rsi+448], r10
-        adc	rax, QWORD PTR [r12+456]
-        mov	r9, QWORD PTR [rsi+464]
-        mov	QWORD PTR [rsi+456], rax
-        adc	r9, QWORD PTR [r12+464]
-        mov	r10, QWORD PTR [rsi+472]
-        mov	QWORD PTR [rsi+464], r9
-        adc	r10, QWORD PTR [r12+472]
-        mov	rax, QWORD PTR [rsi+480]
-        mov	QWORD PTR [rsi+472], r10
-        adc	rax, QWORD PTR [r12+480]
-        mov	r9, QWORD PTR [rsi+488]
-        mov	QWORD PTR [rsi+480], rax
-        adc	r9, QWORD PTR [r12+488]
-        mov	r10, QWORD PTR [rsi+496]
-        mov	QWORD PTR [rsi+488], r9
-        adc	r10, QWORD PTR [r12+496]
-        mov	rax, QWORD PTR [rsi+504]
-        mov	QWORD PTR [rsi+496], r10
-        adc	rax, QWORD PTR [r12+504]
-        mov	QWORD PTR [rsi+504], rax
-        adc	r11, 0
-        mov	QWORD PTR [rcx+768], r11
-        add	rsi, 256
-        ; Add
-        mov	rax, QWORD PTR [rsi]
-        add	rax, QWORD PTR [r13]
-        mov	r9, QWORD PTR [rsi+8]
-        mov	QWORD PTR [rsi], rax
-        adc	r9, QWORD PTR [r13+8]
-        mov	r10, QWORD PTR [rsi+16]
-        mov	QWORD PTR [rsi+8], r9
-        adc	r10, QWORD PTR [r13+16]
-        mov	rax, QWORD PTR [rsi+24]
-        mov	QWORD PTR [rsi+16], r10
-        adc	rax, QWORD PTR [r13+24]
-        mov	r9, QWORD PTR [rsi+32]
-        mov	QWORD PTR [rsi+24], rax
-        adc	r9, QWORD PTR [r13+32]
-        mov	r10, QWORD PTR [rsi+40]
-        mov	QWORD PTR [rsi+32], r9
-        adc	r10, QWORD PTR [r13+40]
-        mov	rax, QWORD PTR [rsi+48]
-        mov	QWORD PTR [rsi+40], r10
-        adc	rax, QWORD PTR [r13+48]
-        mov	r9, QWORD PTR [rsi+56]
-        mov	QWORD PTR [rsi+48], rax
-        adc	r9, QWORD PTR [r13+56]
-        mov	r10, QWORD PTR [rsi+64]
-        mov	QWORD PTR [rsi+56], r9
-        adc	r10, QWORD PTR [r13+64]
-        mov	rax, QWORD PTR [rsi+72]
-        mov	QWORD PTR [rsi+64], r10
-        adc	rax, QWORD PTR [r13+72]
-        mov	r9, QWORD PTR [rsi+80]
-        mov	QWORD PTR [rsi+72], rax
-        adc	r9, QWORD PTR [r13+80]
-        mov	r10, QWORD PTR [rsi+88]
-        mov	QWORD PTR [rsi+80], r9
-        adc	r10, QWORD PTR [r13+88]
-        mov	rax, QWORD PTR [rsi+96]
-        mov	QWORD PTR [rsi+88], r10
-        adc	rax, QWORD PTR [r13+96]
-        mov	r9, QWORD PTR [rsi+104]
-        mov	QWORD PTR [rsi+96], rax
-        adc	r9, QWORD PTR [r13+104]
-        mov	r10, QWORD PTR [rsi+112]
-        mov	QWORD PTR [rsi+104], r9
-        adc	r10, QWORD PTR [r13+112]
-        mov	rax, QWORD PTR [rsi+120]
-        mov	QWORD PTR [rsi+112], r10
-        adc	rax, QWORD PTR [r13+120]
-        mov	r9, QWORD PTR [rsi+128]
-        mov	QWORD PTR [rsi+120], rax
-        adc	r9, QWORD PTR [r13+128]
-        mov	r10, QWORD PTR [rsi+136]
-        mov	QWORD PTR [rsi+128], r9
-        adc	r10, QWORD PTR [r13+136]
-        mov	rax, QWORD PTR [rsi+144]
-        mov	QWORD PTR [rsi+136], r10
-        adc	rax, QWORD PTR [r13+144]
-        mov	r9, QWORD PTR [rsi+152]
-        mov	QWORD PTR [rsi+144], rax
-        adc	r9, QWORD PTR [r13+152]
-        mov	r10, QWORD PTR [rsi+160]
-        mov	QWORD PTR [rsi+152], r9
-        adc	r10, QWORD PTR [r13+160]
-        mov	rax, QWORD PTR [rsi+168]
-        mov	QWORD PTR [rsi+160], r10
-        adc	rax, QWORD PTR [r13+168]
-        mov	r9, QWORD PTR [rsi+176]
-        mov	QWORD PTR [rsi+168], rax
-        adc	r9, QWORD PTR [r13+176]
-        mov	r10, QWORD PTR [rsi+184]
-        mov	QWORD PTR [rsi+176], r9
-        adc	r10, QWORD PTR [r13+184]
-        mov	rax, QWORD PTR [rsi+192]
-        mov	QWORD PTR [rsi+184], r10
-        adc	rax, QWORD PTR [r13+192]
-        mov	r9, QWORD PTR [rsi+200]
-        mov	QWORD PTR [rsi+192], rax
-        adc	r9, QWORD PTR [r13+200]
-        mov	r10, QWORD PTR [rsi+208]
-        mov	QWORD PTR [rsi+200], r9
-        adc	r10, QWORD PTR [r13+208]
-        mov	rax, QWORD PTR [rsi+216]
-        mov	QWORD PTR [rsi+208], r10
-        adc	rax, QWORD PTR [r13+216]
-        mov	r9, QWORD PTR [rsi+224]
-        mov	QWORD PTR [rsi+216], rax
-        adc	r9, QWORD PTR [r13+224]
-        mov	r10, QWORD PTR [rsi+232]
-        mov	QWORD PTR [rsi+224], r9
-        adc	r10, QWORD PTR [r13+232]
-        mov	rax, QWORD PTR [rsi+240]
-        mov	QWORD PTR [rsi+232], r10
-        adc	rax, QWORD PTR [r13+240]
-        mov	r9, QWORD PTR [rsi+248]
-        mov	QWORD PTR [rsi+240], rax
-        adc	r9, QWORD PTR [r13+248]
-        mov	r10, QWORD PTR [rsi+256]
-        mov	QWORD PTR [rsi+248], r9
-        adc	r10, QWORD PTR [r13+256]
-        mov	QWORD PTR [rsi+256], r10
-        ; Add to zero
-        mov	rax, QWORD PTR [r13+264]
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+272]
-        mov	QWORD PTR [rsi+264], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+280]
-        mov	QWORD PTR [rsi+272], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+288]
-        mov	QWORD PTR [rsi+280], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+296]
-        mov	QWORD PTR [rsi+288], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+304]
-        mov	QWORD PTR [rsi+296], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+312]
-        mov	QWORD PTR [rsi+304], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+320]
-        mov	QWORD PTR [rsi+312], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+328]
-        mov	QWORD PTR [rsi+320], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+336]
-        mov	QWORD PTR [rsi+328], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+344]
-        mov	QWORD PTR [rsi+336], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+352]
-        mov	QWORD PTR [rsi+344], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+360]
-        mov	QWORD PTR [rsi+352], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+368]
-        mov	QWORD PTR [rsi+360], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+376]
-        mov	QWORD PTR [rsi+368], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+384]
-        mov	QWORD PTR [rsi+376], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+392]
-        mov	QWORD PTR [rsi+384], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+400]
-        mov	QWORD PTR [rsi+392], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+408]
-        mov	QWORD PTR [rsi+400], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+416]
-        mov	QWORD PTR [rsi+408], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+424]
-        mov	QWORD PTR [rsi+416], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+432]
-        mov	QWORD PTR [rsi+424], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+440]
-        mov	QWORD PTR [rsi+432], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+448]
-        mov	QWORD PTR [rsi+440], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+456]
-        mov	QWORD PTR [rsi+448], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+464]
-        mov	QWORD PTR [rsi+456], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+472]
-        mov	QWORD PTR [rsi+464], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+480]
-        mov	QWORD PTR [rsi+472], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+488]
-        mov	QWORD PTR [rsi+480], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+496]
-        mov	QWORD PTR [rsi+488], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+504]
-        mov	QWORD PTR [rsi+496], r10
-        adc	rax, 0
-        mov	QWORD PTR [rsi+504], rax
-        add	rsp, 1576
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_4096_mul_64 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
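-; /* Multiply a and b into r. (r = a * b)
-;  *
-;  * r  A single precision integer; receives the product (pointer in rcx).
-;  * a  A single precision integer; first operand (pointer in rdx).
-;  * b  A single precision integer; second operand (pointer in r8).
-;  */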
-_text SEGMENT READONLY PARA
-sp_4096_mul_avx2_64 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        sub	rsp, 1576
-        mov	QWORD PTR [rsp+1536], rcx
-        mov	QWORD PTR [rsp+1544], rdx
-        mov	QWORD PTR [rsp+1552], r8
-        lea	r12, QWORD PTR [rsp+1024]
-        lea	r14, QWORD PTR [rdx+256]
-        ; Add
-        mov	rax, QWORD PTR [rdx]
-        xor	r15, r15
-        add	rax, QWORD PTR [r14]
-        mov	r9, QWORD PTR [rdx+8]
-        mov	QWORD PTR [r12], rax
-        adc	r9, QWORD PTR [r14+8]
-        mov	r10, QWORD PTR [rdx+16]
-        mov	QWORD PTR [r12+8], r9
-        adc	r10, QWORD PTR [r14+16]
-        mov	rax, QWORD PTR [rdx+24]
-        mov	QWORD PTR [r12+16], r10
-        adc	rax, QWORD PTR [r14+24]
-        mov	r9, QWORD PTR [rdx+32]
-        mov	QWORD PTR [r12+24], rax
-        adc	r9, QWORD PTR [r14+32]
-        mov	r10, QWORD PTR [rdx+40]
-        mov	QWORD PTR [r12+32], r9
-        adc	r10, QWORD PTR [r14+40]
-        mov	rax, QWORD PTR [rdx+48]
-        mov	QWORD PTR [r12+40], r10
-        adc	rax, QWORD PTR [r14+48]
-        mov	r9, QWORD PTR [rdx+56]
-        mov	QWORD PTR [r12+48], rax
-        adc	r9, QWORD PTR [r14+56]
-        mov	r10, QWORD PTR [rdx+64]
-        mov	QWORD PTR [r12+56], r9
-        adc	r10, QWORD PTR [r14+64]
-        mov	rax, QWORD PTR [rdx+72]
-        mov	QWORD PTR [r12+64], r10
-        adc	rax, QWORD PTR [r14+72]
-        mov	r9, QWORD PTR [rdx+80]
-        mov	QWORD PTR [r12+72], rax
-        adc	r9, QWORD PTR [r14+80]
-        mov	r10, QWORD PTR [rdx+88]
-        mov	QWORD PTR [r12+80], r9
-        adc	r10, QWORD PTR [r14+88]
-        mov	rax, QWORD PTR [rdx+96]
-        mov	QWORD PTR [r12+88], r10
-        adc	rax, QWORD PTR [r14+96]
-        mov	r9, QWORD PTR [rdx+104]
-        mov	QWORD PTR [r12+96], rax
-        adc	r9, QWORD PTR [r14+104]
-        mov	r10, QWORD PTR [rdx+112]
-        mov	QWORD PTR [r12+104], r9
-        adc	r10, QWORD PTR [r14+112]
-        mov	rax, QWORD PTR [rdx+120]
-        mov	QWORD PTR [r12+112], r10
-        adc	rax, QWORD PTR [r14+120]
-        mov	r9, QWORD PTR [rdx+128]
-        mov	QWORD PTR [r12+120], rax
-        adc	r9, QWORD PTR [r14+128]
-        mov	r10, QWORD PTR [rdx+136]
-        mov	QWORD PTR [r12+128], r9
-        adc	r10, QWORD PTR [r14+136]
-        mov	rax, QWORD PTR [rdx+144]
-        mov	QWORD PTR [r12+136], r10
-        adc	rax, QWORD PTR [r14+144]
-        mov	r9, QWORD PTR [rdx+152]
-        mov	QWORD PTR [r12+144], rax
-        adc	r9, QWORD PTR [r14+152]
-        mov	r10, QWORD PTR [rdx+160]
-        mov	QWORD PTR [r12+152], r9
-        adc	r10, QWORD PTR [r14+160]
-        mov	rax, QWORD PTR [rdx+168]
-        mov	QWORD PTR [r12+160], r10
-        adc	rax, QWORD PTR [r14+168]
-        mov	r9, QWORD PTR [rdx+176]
-        mov	QWORD PTR [r12+168], rax
-        adc	r9, QWORD PTR [r14+176]
-        mov	r10, QWORD PTR [rdx+184]
-        mov	QWORD PTR [r12+176], r9
-        adc	r10, QWORD PTR [r14+184]
-        mov	rax, QWORD PTR [rdx+192]
-        mov	QWORD PTR [r12+184], r10
-        adc	rax, QWORD PTR [r14+192]
-        mov	r9, QWORD PTR [rdx+200]
-        mov	QWORD PTR [r12+192], rax
-        adc	r9, QWORD PTR [r14+200]
-        mov	r10, QWORD PTR [rdx+208]
-        mov	QWORD PTR [r12+200], r9
-        adc	r10, QWORD PTR [r14+208]
-        mov	rax, QWORD PTR [rdx+216]
-        mov	QWORD PTR [r12+208], r10
-        adc	rax, QWORD PTR [r14+216]
-        mov	r9, QWORD PTR [rdx+224]
-        mov	QWORD PTR [r12+216], rax
-        adc	r9, QWORD PTR [r14+224]
-        mov	r10, QWORD PTR [rdx+232]
-        mov	QWORD PTR [r12+224], r9
-        adc	r10, QWORD PTR [r14+232]
-        mov	rax, QWORD PTR [rdx+240]
-        mov	QWORD PTR [r12+232], r10
-        adc	rax, QWORD PTR [r14+240]
-        mov	r9, QWORD PTR [rdx+248]
-        mov	QWORD PTR [r12+240], rax
-        adc	r9, QWORD PTR [r14+248]
-        mov	QWORD PTR [r12+248], r9
-        adc	r15, 0
-        mov	QWORD PTR [rsp+1560], r15
-        lea	r13, QWORD PTR [rsp+1280]
-        lea	r14, QWORD PTR [r8+256]
-        ; Add
-        mov	rax, QWORD PTR [r8]
-        xor	rdi, rdi
-        add	rax, QWORD PTR [r14]
-        mov	r9, QWORD PTR [r8+8]
-        mov	QWORD PTR [r13], rax
-        adc	r9, QWORD PTR [r14+8]
-        mov	r10, QWORD PTR [r8+16]
-        mov	QWORD PTR [r13+8], r9
-        adc	r10, QWORD PTR [r14+16]
-        mov	rax, QWORD PTR [r8+24]
-        mov	QWORD PTR [r13+16], r10
-        adc	rax, QWORD PTR [r14+24]
-        mov	r9, QWORD PTR [r8+32]
-        mov	QWORD PTR [r13+24], rax
-        adc	r9, QWORD PTR [r14+32]
-        mov	r10, QWORD PTR [r8+40]
-        mov	QWORD PTR [r13+32], r9
-        adc	r10, QWORD PTR [r14+40]
-        mov	rax, QWORD PTR [r8+48]
-        mov	QWORD PTR [r13+40], r10
-        adc	rax, QWORD PTR [r14+48]
-        mov	r9, QWORD PTR [r8+56]
-        mov	QWORD PTR [r13+48], rax
-        adc	r9, QWORD PTR [r14+56]
-        mov	r10, QWORD PTR [r8+64]
-        mov	QWORD PTR [r13+56], r9
-        adc	r10, QWORD PTR [r14+64]
-        mov	rax, QWORD PTR [r8+72]
-        mov	QWORD PTR [r13+64], r10
-        adc	rax, QWORD PTR [r14+72]
-        mov	r9, QWORD PTR [r8+80]
-        mov	QWORD PTR [r13+72], rax
-        adc	r9, QWORD PTR [r14+80]
-        mov	r10, QWORD PTR [r8+88]
-        mov	QWORD PTR [r13+80], r9
-        adc	r10, QWORD PTR [r14+88]
-        mov	rax, QWORD PTR [r8+96]
-        mov	QWORD PTR [r13+88], r10
-        adc	rax, QWORD PTR [r14+96]
-        mov	r9, QWORD PTR [r8+104]
-        mov	QWORD PTR [r13+96], rax
-        adc	r9, QWORD PTR [r14+104]
-        mov	r10, QWORD PTR [r8+112]
-        mov	QWORD PTR [r13+104], r9
-        adc	r10, QWORD PTR [r14+112]
-        mov	rax, QWORD PTR [r8+120]
-        mov	QWORD PTR [r13+112], r10
-        adc	rax, QWORD PTR [r14+120]
-        mov	r9, QWORD PTR [r8+128]
-        mov	QWORD PTR [r13+120], rax
-        adc	r9, QWORD PTR [r14+128]
-        mov	r10, QWORD PTR [r8+136]
-        mov	QWORD PTR [r13+128], r9
-        adc	r10, QWORD PTR [r14+136]
-        mov	rax, QWORD PTR [r8+144]
-        mov	QWORD PTR [r13+136], r10
-        adc	rax, QWORD PTR [r14+144]
-        mov	r9, QWORD PTR [r8+152]
-        mov	QWORD PTR [r13+144], rax
-        adc	r9, QWORD PTR [r14+152]
-        mov	r10, QWORD PTR [r8+160]
-        mov	QWORD PTR [r13+152], r9
-        adc	r10, QWORD PTR [r14+160]
-        mov	rax, QWORD PTR [r8+168]
-        mov	QWORD PTR [r13+160], r10
-        adc	rax, QWORD PTR [r14+168]
-        mov	r9, QWORD PTR [r8+176]
-        mov	QWORD PTR [r13+168], rax
-        adc	r9, QWORD PTR [r14+176]
-        mov	r10, QWORD PTR [r8+184]
-        mov	QWORD PTR [r13+176], r9
-        adc	r10, QWORD PTR [r14+184]
-        mov	rax, QWORD PTR [r8+192]
-        mov	QWORD PTR [r13+184], r10
-        adc	rax, QWORD PTR [r14+192]
-        mov	r9, QWORD PTR [r8+200]
-        mov	QWORD PTR [r13+192], rax
-        adc	r9, QWORD PTR [r14+200]
-        mov	r10, QWORD PTR [r8+208]
-        mov	QWORD PTR [r13+200], r9
-        adc	r10, QWORD PTR [r14+208]
-        mov	rax, QWORD PTR [r8+216]
-        mov	QWORD PTR [r13+208], r10
-        adc	rax, QWORD PTR [r14+216]
-        mov	r9, QWORD PTR [r8+224]
-        mov	QWORD PTR [r13+216], rax
-        adc	r9, QWORD PTR [r14+224]
-        mov	r10, QWORD PTR [r8+232]
-        mov	QWORD PTR [r13+224], r9
-        adc	r10, QWORD PTR [r14+232]
-        mov	rax, QWORD PTR [r8+240]
-        mov	QWORD PTR [r13+232], r10
-        adc	rax, QWORD PTR [r14+240]
-        mov	r9, QWORD PTR [r8+248]
-        mov	QWORD PTR [r13+240], rax
-        adc	r9, QWORD PTR [r14+248]
-        mov	QWORD PTR [r13+248], r9
-        adc	rdi, 0
-        mov	QWORD PTR [rsp+1568], rdi
-        mov	r8, r13
-        mov	rdx, r12
-        mov	rcx, rsp
-        call	sp_2048_mul_avx2_32
-        mov	r8, QWORD PTR [rsp+1552]
-        mov	rdx, QWORD PTR [rsp+1544]
-        lea	rcx, QWORD PTR [rsp+512]
-        add	r8, 256
-        add	rdx, 256
-        call	sp_2048_mul_avx2_32
-        mov	r8, QWORD PTR [rsp+1552]
-        mov	rdx, QWORD PTR [rsp+1544]
-        mov	rcx, QWORD PTR [rsp+1536]
-        call	sp_2048_mul_avx2_32
-IFDEF _WIN64
-        mov	r8, QWORD PTR [rsp+1552]
-        mov	rdx, QWORD PTR [rsp+1544]
-        mov	rcx, QWORD PTR [rsp+1536]
-ENDIF
-        mov	r15, QWORD PTR [rsp+1560]
-        mov	rdi, QWORD PTR [rsp+1568]
-        mov	rsi, QWORD PTR [rsp+1536]
-        mov	r11, r15
-        lea	r12, QWORD PTR [rsp+1024]
-        lea	r13, QWORD PTR [rsp+1280]
-        and	r11, rdi
-        neg	r15
-        neg	rdi
-        add	rsi, 512
-        mov	rax, QWORD PTR [r12]
-        mov	r9, QWORD PTR [r13]
-        pext	rax, rax, rdi
-        pext	r9, r9, r15
-        add	rax, r9
-        mov	r9, QWORD PTR [r12+8]
-        mov	r10, QWORD PTR [r13+8]
-        pext	r9, r9, rdi
-        pext	r10, r10, r15
-        mov	QWORD PTR [rsi], rax
-        adc	r9, r10
-        mov	r10, QWORD PTR [r12+16]
-        mov	rax, QWORD PTR [r13+16]
-        pext	r10, r10, rdi
-        pext	rax, rax, r15
-        mov	QWORD PTR [rsi+8], r9
-        adc	r10, rax
-        mov	rax, QWORD PTR [r12+24]
-        mov	r9, QWORD PTR [r13+24]
-        pext	rax, rax, rdi
-        pext	r9, r9, r15
-        mov	QWORD PTR [rsi+16], r10
-        adc	rax, r9
-        mov	r9, QWORD PTR [r12+32]
-        mov	r10, QWORD PTR [r13+32]
-        pext	r9, r9, rdi
-        pext	r10, r10, r15
-        mov	QWORD PTR [rsi+24], rax
-        adc	r9, r10
-        mov	r10, QWORD PTR [r12+40]
-        mov	rax, QWORD PTR [r13+40]
-        pext	r10, r10, rdi
-        pext	rax, rax, r15
-        mov	QWORD PTR [rsi+32], r9
-        adc	r10, rax
-        mov	rax, QWORD PTR [r12+48]
-        mov	r9, QWORD PTR [r13+48]
-        pext	rax, rax, rdi
-        pext	r9, r9, r15
-        mov	QWORD PTR [rsi+40], r10
-        adc	rax, r9
-        mov	r9, QWORD PTR [r12+56]
-        mov	r10, QWORD PTR [r13+56]
-        pext	r9, r9, rdi
-        pext	r10, r10, r15
-        mov	QWORD PTR [rsi+48], rax
-        adc	r9, r10
-        mov	r10, QWORD PTR [r12+64]
-        mov	rax, QWORD PTR [r13+64]
-        pext	r10, r10, rdi
-        pext	rax, rax, r15
-        mov	QWORD PTR [rsi+56], r9
-        adc	r10, rax
-        mov	rax, QWORD PTR [r12+72]
-        mov	r9, QWORD PTR [r13+72]
-        pext	rax, rax, rdi
-        pext	r9, r9, r15
-        mov	QWORD PTR [rsi+64], r10
-        adc	rax, r9
-        mov	r9, QWORD PTR [r12+80]
-        mov	r10, QWORD PTR [r13+80]
-        pext	r9, r9, rdi
-        pext	r10, r10, r15
-        mov	QWORD PTR [rsi+72], rax
-        adc	r9, r10
-        mov	r10, QWORD PTR [r12+88]
-        mov	rax, QWORD PTR [r13+88]
-        pext	r10, r10, rdi
-        pext	rax, rax, r15
-        mov	QWORD PTR [rsi+80], r9
-        adc	r10, rax
-        mov	rax, QWORD PTR [r12+96]
-        mov	r9, QWORD PTR [r13+96]
-        pext	rax, rax, rdi
-        pext	r9, r9, r15
-        mov	QWORD PTR [rsi+88], r10
-        adc	rax, r9
-        mov	r9, QWORD PTR [r12+104]
-        mov	r10, QWORD PTR [r13+104]
-        pext	r9, r9, rdi
-        pext	r10, r10, r15
-        mov	QWORD PTR [rsi+96], rax
-        adc	r9, r10
-        mov	r10, QWORD PTR [r12+112]
-        mov	rax, QWORD PTR [r13+112]
-        pext	r10, r10, rdi
-        pext	rax, rax, r15
-        mov	QWORD PTR [rsi+104], r9
-        adc	r10, rax
-        mov	rax, QWORD PTR [r12+120]
-        mov	r9, QWORD PTR [r13+120]
-        pext	rax, rax, rdi
-        pext	r9, r9, r15
-        mov	QWORD PTR [rsi+112], r10
-        adc	rax, r9
-        mov	r9, QWORD PTR [r12+128]
-        mov	r10, QWORD PTR [r13+128]
-        pext	r9, r9, rdi
-        pext	r10, r10, r15
-        mov	QWORD PTR [rsi+120], rax
-        adc	r9, r10
-        mov	r10, QWORD PTR [r12+136]
-        mov	rax, QWORD PTR [r13+136]
-        pext	r10, r10, rdi
-        pext	rax, rax, r15
-        mov	QWORD PTR [rsi+128], r9
-        adc	r10, rax
-        mov	rax, QWORD PTR [r12+144]
-        mov	r9, QWORD PTR [r13+144]
-        pext	rax, rax, rdi
-        pext	r9, r9, r15
-        mov	QWORD PTR [rsi+136], r10
-        adc	rax, r9
-        mov	r9, QWORD PTR [r12+152]
-        mov	r10, QWORD PTR [r13+152]
-        pext	r9, r9, rdi
-        pext	r10, r10, r15
-        mov	QWORD PTR [rsi+144], rax
-        adc	r9, r10
-        mov	r10, QWORD PTR [r12+160]
-        mov	rax, QWORD PTR [r13+160]
-        pext	r10, r10, rdi
-        pext	rax, rax, r15
-        mov	QWORD PTR [rsi+152], r9
-        adc	r10, rax
-        mov	rax, QWORD PTR [r12+168]
-        mov	r9, QWORD PTR [r13+168]
-        pext	rax, rax, rdi
-        pext	r9, r9, r15
-        mov	QWORD PTR [rsi+160], r10
-        adc	rax, r9
-        mov	r9, QWORD PTR [r12+176]
-        mov	r10, QWORD PTR [r13+176]
-        pext	r9, r9, rdi
-        pext	r10, r10, r15
-        mov	QWORD PTR [rsi+168], rax
-        adc	r9, r10
-        mov	r10, QWORD PTR [r12+184]
-        mov	rax, QWORD PTR [r13+184]
-        pext	r10, r10, rdi
-        pext	rax, rax, r15
-        mov	QWORD PTR [rsi+176], r9
-        adc	r10, rax
-        mov	rax, QWORD PTR [r12+192]
-        mov	r9, QWORD PTR [r13+192]
-        pext	rax, rax, rdi
-        pext	r9, r9, r15
-        mov	QWORD PTR [rsi+184], r10
-        adc	rax, r9
-        mov	r9, QWORD PTR [r12+200]
-        mov	r10, QWORD PTR [r13+200]
-        pext	r9, r9, rdi
-        pext	r10, r10, r15
-        mov	QWORD PTR [rsi+192], rax
-        adc	r9, r10
-        mov	r10, QWORD PTR [r12+208]
-        mov	rax, QWORD PTR [r13+208]
-        pext	r10, r10, rdi
-        pext	rax, rax, r15
-        mov	QWORD PTR [rsi+200], r9
-        adc	r10, rax
-        mov	rax, QWORD PTR [r12+216]
-        mov	r9, QWORD PTR [r13+216]
-        pext	rax, rax, rdi
-        pext	r9, r9, r15
-        mov	QWORD PTR [rsi+208], r10
-        adc	rax, r9
-        mov	r9, QWORD PTR [r12+224]
-        mov	r10, QWORD PTR [r13+224]
-        pext	r9, r9, rdi
-        pext	r10, r10, r15
-        mov	QWORD PTR [rsi+216], rax
-        adc	r9, r10
-        mov	r10, QWORD PTR [r12+232]
-        mov	rax, QWORD PTR [r13+232]
-        pext	r10, r10, rdi
-        pext	rax, rax, r15
-        mov	QWORD PTR [rsi+224], r9
-        adc	r10, rax
-        mov	rax, QWORD PTR [r12+240]
-        mov	r9, QWORD PTR [r13+240]
-        pext	rax, rax, rdi
-        pext	r9, r9, r15
-        mov	QWORD PTR [rsi+232], r10
-        adc	rax, r9
-        mov	r9, QWORD PTR [r12+248]
-        mov	r10, QWORD PTR [r13+248]
-        pext	r9, r9, rdi
-        pext	r10, r10, r15
-        mov	QWORD PTR [rsi+240], rax
-        adc	r9, r10
-        mov	QWORD PTR [rsi+248], r9
-        adc	r11, 0
-        lea	r13, QWORD PTR [rsp+512]
-        mov	r12, rsp
-        mov	rax, QWORD PTR [r12]
-        sub	rax, QWORD PTR [r13]
-        mov	r9, QWORD PTR [r12+8]
-        mov	QWORD PTR [r12], rax
-        sbb	r9, QWORD PTR [r13+8]
-        mov	r10, QWORD PTR [r12+16]
-        mov	QWORD PTR [r12+8], r9
-        sbb	r10, QWORD PTR [r13+16]
-        mov	rax, QWORD PTR [r12+24]
-        mov	QWORD PTR [r12+16], r10
-        sbb	rax, QWORD PTR [r13+24]
-        mov	r9, QWORD PTR [r12+32]
-        mov	QWORD PTR [r12+24], rax
-        sbb	r9, QWORD PTR [r13+32]
-        mov	r10, QWORD PTR [r12+40]
-        mov	QWORD PTR [r12+32], r9
-        sbb	r10, QWORD PTR [r13+40]
-        mov	rax, QWORD PTR [r12+48]
-        mov	QWORD PTR [r12+40], r10
-        sbb	rax, QWORD PTR [r13+48]
-        mov	r9, QWORD PTR [r12+56]
-        mov	QWORD PTR [r12+48], rax
-        sbb	r9, QWORD PTR [r13+56]
-        mov	r10, QWORD PTR [r12+64]
-        mov	QWORD PTR [r12+56], r9
-        sbb	r10, QWORD PTR [r13+64]
-        mov	rax, QWORD PTR [r12+72]
-        mov	QWORD PTR [r12+64], r10
-        sbb	rax, QWORD PTR [r13+72]
-        mov	r9, QWORD PTR [r12+80]
-        mov	QWORD PTR [r12+72], rax
-        sbb	r9, QWORD PTR [r13+80]
-        mov	r10, QWORD PTR [r12+88]
-        mov	QWORD PTR [r12+80], r9
-        sbb	r10, QWORD PTR [r13+88]
-        mov	rax, QWORD PTR [r12+96]
-        mov	QWORD PTR [r12+88], r10
-        sbb	rax, QWORD PTR [r13+96]
-        mov	r9, QWORD PTR [r12+104]
-        mov	QWORD PTR [r12+96], rax
-        sbb	r9, QWORD PTR [r13+104]
-        mov	r10, QWORD PTR [r12+112]
-        mov	QWORD PTR [r12+104], r9
-        sbb	r10, QWORD PTR [r13+112]
-        mov	rax, QWORD PTR [r12+120]
-        mov	QWORD PTR [r12+112], r10
-        sbb	rax, QWORD PTR [r13+120]
-        mov	r9, QWORD PTR [r12+128]
-        mov	QWORD PTR [r12+120], rax
-        sbb	r9, QWORD PTR [r13+128]
-        mov	r10, QWORD PTR [r12+136]
-        mov	QWORD PTR [r12+128], r9
-        sbb	r10, QWORD PTR [r13+136]
-        mov	rax, QWORD PTR [r12+144]
-        mov	QWORD PTR [r12+136], r10
-        sbb	rax, QWORD PTR [r13+144]
-        mov	r9, QWORD PTR [r12+152]
-        mov	QWORD PTR [r12+144], rax
-        sbb	r9, QWORD PTR [r13+152]
-        mov	r10, QWORD PTR [r12+160]
-        mov	QWORD PTR [r12+152], r9
-        sbb	r10, QWORD PTR [r13+160]
-        mov	rax, QWORD PTR [r12+168]
-        mov	QWORD PTR [r12+160], r10
-        sbb	rax, QWORD PTR [r13+168]
-        mov	r9, QWORD PTR [r12+176]
-        mov	QWORD PTR [r12+168], rax
-        sbb	r9, QWORD PTR [r13+176]
-        mov	r10, QWORD PTR [r12+184]
-        mov	QWORD PTR [r12+176], r9
-        sbb	r10, QWORD PTR [r13+184]
-        mov	rax, QWORD PTR [r12+192]
-        mov	QWORD PTR [r12+184], r10
-        sbb	rax, QWORD PTR [r13+192]
-        mov	r9, QWORD PTR [r12+200]
-        mov	QWORD PTR [r12+192], rax
-        sbb	r9, QWORD PTR [r13+200]
-        mov	r10, QWORD PTR [r12+208]
-        mov	QWORD PTR [r12+200], r9
-        sbb	r10, QWORD PTR [r13+208]
-        mov	rax, QWORD PTR [r12+216]
-        mov	QWORD PTR [r12+208], r10
-        sbb	rax, QWORD PTR [r13+216]
-        mov	r9, QWORD PTR [r12+224]
-        mov	QWORD PTR [r12+216], rax
-        sbb	r9, QWORD PTR [r13+224]
-        mov	r10, QWORD PTR [r12+232]
-        mov	QWORD PTR [r12+224], r9
-        sbb	r10, QWORD PTR [r13+232]
-        mov	rax, QWORD PTR [r12+240]
-        mov	QWORD PTR [r12+232], r10
-        sbb	rax, QWORD PTR [r13+240]
-        mov	r9, QWORD PTR [r12+248]
-        mov	QWORD PTR [r12+240], rax
-        sbb	r9, QWORD PTR [r13+248]
-        mov	r10, QWORD PTR [r12+256]
-        mov	QWORD PTR [r12+248], r9
-        sbb	r10, QWORD PTR [r13+256]
-        mov	rax, QWORD PTR [r12+264]
-        mov	QWORD PTR [r12+256], r10
-        sbb	rax, QWORD PTR [r13+264]
-        mov	r9, QWORD PTR [r12+272]
-        mov	QWORD PTR [r12+264], rax
-        sbb	r9, QWORD PTR [r13+272]
-        mov	r10, QWORD PTR [r12+280]
-        mov	QWORD PTR [r12+272], r9
-        sbb	r10, QWORD PTR [r13+280]
-        mov	rax, QWORD PTR [r12+288]
-        mov	QWORD PTR [r12+280], r10
-        sbb	rax, QWORD PTR [r13+288]
-        mov	r9, QWORD PTR [r12+296]
-        mov	QWORD PTR [r12+288], rax
-        sbb	r9, QWORD PTR [r13+296]
-        mov	r10, QWORD PTR [r12+304]
-        mov	QWORD PTR [r12+296], r9
-        sbb	r10, QWORD PTR [r13+304]
-        mov	rax, QWORD PTR [r12+312]
-        mov	QWORD PTR [r12+304], r10
-        sbb	rax, QWORD PTR [r13+312]
-        mov	r9, QWORD PTR [r12+320]
-        mov	QWORD PTR [r12+312], rax
-        sbb	r9, QWORD PTR [r13+320]
-        mov	r10, QWORD PTR [r12+328]
-        mov	QWORD PTR [r12+320], r9
-        sbb	r10, QWORD PTR [r13+328]
-        mov	rax, QWORD PTR [r12+336]
-        mov	QWORD PTR [r12+328], r10
-        sbb	rax, QWORD PTR [r13+336]
-        mov	r9, QWORD PTR [r12+344]
-        mov	QWORD PTR [r12+336], rax
-        sbb	r9, QWORD PTR [r13+344]
-        mov	r10, QWORD PTR [r12+352]
-        mov	QWORD PTR [r12+344], r9
-        sbb	r10, QWORD PTR [r13+352]
-        mov	rax, QWORD PTR [r12+360]
-        mov	QWORD PTR [r12+352], r10
-        sbb	rax, QWORD PTR [r13+360]
-        mov	r9, QWORD PTR [r12+368]
-        mov	QWORD PTR [r12+360], rax
-        sbb	r9, QWORD PTR [r13+368]
-        mov	r10, QWORD PTR [r12+376]
-        mov	QWORD PTR [r12+368], r9
-        sbb	r10, QWORD PTR [r13+376]
-        mov	rax, QWORD PTR [r12+384]
-        mov	QWORD PTR [r12+376], r10
-        sbb	rax, QWORD PTR [r13+384]
-        mov	r9, QWORD PTR [r12+392]
-        mov	QWORD PTR [r12+384], rax
-        sbb	r9, QWORD PTR [r13+392]
-        mov	r10, QWORD PTR [r12+400]
-        mov	QWORD PTR [r12+392], r9
-        sbb	r10, QWORD PTR [r13+400]
-        mov	rax, QWORD PTR [r12+408]
-        mov	QWORD PTR [r12+400], r10
-        sbb	rax, QWORD PTR [r13+408]
-        mov	r9, QWORD PTR [r12+416]
-        mov	QWORD PTR [r12+408], rax
-        sbb	r9, QWORD PTR [r13+416]
-        mov	r10, QWORD PTR [r12+424]
-        mov	QWORD PTR [r12+416], r9
-        sbb	r10, QWORD PTR [r13+424]
-        mov	rax, QWORD PTR [r12+432]
-        mov	QWORD PTR [r12+424], r10
-        sbb	rax, QWORD PTR [r13+432]
-        mov	r9, QWORD PTR [r12+440]
-        mov	QWORD PTR [r12+432], rax
-        sbb	r9, QWORD PTR [r13+440]
-        mov	r10, QWORD PTR [r12+448]
-        mov	QWORD PTR [r12+440], r9
-        sbb	r10, QWORD PTR [r13+448]
-        mov	rax, QWORD PTR [r12+456]
-        mov	QWORD PTR [r12+448], r10
-        sbb	rax, QWORD PTR [r13+456]
-        mov	r9, QWORD PTR [r12+464]
-        mov	QWORD PTR [r12+456], rax
-        sbb	r9, QWORD PTR [r13+464]
-        mov	r10, QWORD PTR [r12+472]
-        mov	QWORD PTR [r12+464], r9
-        sbb	r10, QWORD PTR [r13+472]
-        mov	rax, QWORD PTR [r12+480]
-        mov	QWORD PTR [r12+472], r10
-        sbb	rax, QWORD PTR [r13+480]
-        mov	r9, QWORD PTR [r12+488]
-        mov	QWORD PTR [r12+480], rax
-        sbb	r9, QWORD PTR [r13+488]
-        mov	r10, QWORD PTR [r12+496]
-        mov	QWORD PTR [r12+488], r9
-        sbb	r10, QWORD PTR [r13+496]
-        mov	rax, QWORD PTR [r12+504]
-        mov	QWORD PTR [r12+496], r10
-        sbb	rax, QWORD PTR [r13+504]
-        mov	QWORD PTR [r12+504], rax
-        sbb	r11, 0
-        mov	rax, QWORD PTR [r12]
-        sub	rax, QWORD PTR [rcx]
-        mov	r9, QWORD PTR [r12+8]
-        mov	QWORD PTR [r12], rax
-        sbb	r9, QWORD PTR [rcx+8]
-        mov	r10, QWORD PTR [r12+16]
-        mov	QWORD PTR [r12+8], r9
-        sbb	r10, QWORD PTR [rcx+16]
-        mov	rax, QWORD PTR [r12+24]
-        mov	QWORD PTR [r12+16], r10
-        sbb	rax, QWORD PTR [rcx+24]
-        mov	r9, QWORD PTR [r12+32]
-        mov	QWORD PTR [r12+24], rax
-        sbb	r9, QWORD PTR [rcx+32]
-        mov	r10, QWORD PTR [r12+40]
-        mov	QWORD PTR [r12+32], r9
-        sbb	r10, QWORD PTR [rcx+40]
-        mov	rax, QWORD PTR [r12+48]
-        mov	QWORD PTR [r12+40], r10
-        sbb	rax, QWORD PTR [rcx+48]
-        mov	r9, QWORD PTR [r12+56]
-        mov	QWORD PTR [r12+48], rax
-        sbb	r9, QWORD PTR [rcx+56]
-        mov	r10, QWORD PTR [r12+64]
-        mov	QWORD PTR [r12+56], r9
-        sbb	r10, QWORD PTR [rcx+64]
-        mov	rax, QWORD PTR [r12+72]
-        mov	QWORD PTR [r12+64], r10
-        sbb	rax, QWORD PTR [rcx+72]
-        mov	r9, QWORD PTR [r12+80]
-        mov	QWORD PTR [r12+72], rax
-        sbb	r9, QWORD PTR [rcx+80]
-        mov	r10, QWORD PTR [r12+88]
-        mov	QWORD PTR [r12+80], r9
-        sbb	r10, QWORD PTR [rcx+88]
-        mov	rax, QWORD PTR [r12+96]
-        mov	QWORD PTR [r12+88], r10
-        sbb	rax, QWORD PTR [rcx+96]
-        mov	r9, QWORD PTR [r12+104]
-        mov	QWORD PTR [r12+96], rax
-        sbb	r9, QWORD PTR [rcx+104]
-        mov	r10, QWORD PTR [r12+112]
-        mov	QWORD PTR [r12+104], r9
-        sbb	r10, QWORD PTR [rcx+112]
-        mov	rax, QWORD PTR [r12+120]
-        mov	QWORD PTR [r12+112], r10
-        sbb	rax, QWORD PTR [rcx+120]
-        mov	r9, QWORD PTR [r12+128]
-        mov	QWORD PTR [r12+120], rax
-        sbb	r9, QWORD PTR [rcx+128]
-        mov	r10, QWORD PTR [r12+136]
-        mov	QWORD PTR [r12+128], r9
-        sbb	r10, QWORD PTR [rcx+136]
-        mov	rax, QWORD PTR [r12+144]
-        mov	QWORD PTR [r12+136], r10
-        sbb	rax, QWORD PTR [rcx+144]
-        mov	r9, QWORD PTR [r12+152]
-        mov	QWORD PTR [r12+144], rax
-        sbb	r9, QWORD PTR [rcx+152]
-        mov	r10, QWORD PTR [r12+160]
-        mov	QWORD PTR [r12+152], r9
-        sbb	r10, QWORD PTR [rcx+160]
-        mov	rax, QWORD PTR [r12+168]
-        mov	QWORD PTR [r12+160], r10
-        sbb	rax, QWORD PTR [rcx+168]
-        mov	r9, QWORD PTR [r12+176]
-        mov	QWORD PTR [r12+168], rax
-        sbb	r9, QWORD PTR [rcx+176]
-        mov	r10, QWORD PTR [r12+184]
-        mov	QWORD PTR [r12+176], r9
-        sbb	r10, QWORD PTR [rcx+184]
-        mov	rax, QWORD PTR [r12+192]
-        mov	QWORD PTR [r12+184], r10
-        sbb	rax, QWORD PTR [rcx+192]
-        mov	r9, QWORD PTR [r12+200]
-        mov	QWORD PTR [r12+192], rax
-        sbb	r9, QWORD PTR [rcx+200]
-        mov	r10, QWORD PTR [r12+208]
-        mov	QWORD PTR [r12+200], r9
-        sbb	r10, QWORD PTR [rcx+208]
-        mov	rax, QWORD PTR [r12+216]
-        mov	QWORD PTR [r12+208], r10
-        sbb	rax, QWORD PTR [rcx+216]
-        mov	r9, QWORD PTR [r12+224]
-        mov	QWORD PTR [r12+216], rax
-        sbb	r9, QWORD PTR [rcx+224]
-        mov	r10, QWORD PTR [r12+232]
-        mov	QWORD PTR [r12+224], r9
-        sbb	r10, QWORD PTR [rcx+232]
-        mov	rax, QWORD PTR [r12+240]
-        mov	QWORD PTR [r12+232], r10
-        sbb	rax, QWORD PTR [rcx+240]
-        mov	r9, QWORD PTR [r12+248]
-        mov	QWORD PTR [r12+240], rax
-        sbb	r9, QWORD PTR [rcx+248]
-        mov	r10, QWORD PTR [r12+256]
-        mov	QWORD PTR [r12+248], r9
-        sbb	r10, QWORD PTR [rcx+256]
-        mov	rax, QWORD PTR [r12+264]
-        mov	QWORD PTR [r12+256], r10
-        sbb	rax, QWORD PTR [rcx+264]
-        mov	r9, QWORD PTR [r12+272]
-        mov	QWORD PTR [r12+264], rax
-        sbb	r9, QWORD PTR [rcx+272]
-        mov	r10, QWORD PTR [r12+280]
-        mov	QWORD PTR [r12+272], r9
-        sbb	r10, QWORD PTR [rcx+280]
-        mov	rax, QWORD PTR [r12+288]
-        mov	QWORD PTR [r12+280], r10
-        sbb	rax, QWORD PTR [rcx+288]
-        mov	r9, QWORD PTR [r12+296]
-        mov	QWORD PTR [r12+288], rax
-        sbb	r9, QWORD PTR [rcx+296]
-        mov	r10, QWORD PTR [r12+304]
-        mov	QWORD PTR [r12+296], r9
-        sbb	r10, QWORD PTR [rcx+304]
-        mov	rax, QWORD PTR [r12+312]
-        mov	QWORD PTR [r12+304], r10
-        sbb	rax, QWORD PTR [rcx+312]
-        mov	r9, QWORD PTR [r12+320]
-        mov	QWORD PTR [r12+312], rax
-        sbb	r9, QWORD PTR [rcx+320]
-        mov	r10, QWORD PTR [r12+328]
-        mov	QWORD PTR [r12+320], r9
-        sbb	r10, QWORD PTR [rcx+328]
-        mov	rax, QWORD PTR [r12+336]
-        mov	QWORD PTR [r12+328], r10
-        sbb	rax, QWORD PTR [rcx+336]
-        mov	r9, QWORD PTR [r12+344]
-        mov	QWORD PTR [r12+336], rax
-        sbb	r9, QWORD PTR [rcx+344]
-        mov	r10, QWORD PTR [r12+352]
-        mov	QWORD PTR [r12+344], r9
-        sbb	r10, QWORD PTR [rcx+352]
-        mov	rax, QWORD PTR [r12+360]
-        mov	QWORD PTR [r12+352], r10
-        sbb	rax, QWORD PTR [rcx+360]
-        mov	r9, QWORD PTR [r12+368]
-        mov	QWORD PTR [r12+360], rax
-        sbb	r9, QWORD PTR [rcx+368]
-        mov	r10, QWORD PTR [r12+376]
-        mov	QWORD PTR [r12+368], r9
-        sbb	r10, QWORD PTR [rcx+376]
-        mov	rax, QWORD PTR [r12+384]
-        mov	QWORD PTR [r12+376], r10
-        sbb	rax, QWORD PTR [rcx+384]
-        mov	r9, QWORD PTR [r12+392]
-        mov	QWORD PTR [r12+384], rax
-        sbb	r9, QWORD PTR [rcx+392]
-        mov	r10, QWORD PTR [r12+400]
-        mov	QWORD PTR [r12+392], r9
-        sbb	r10, QWORD PTR [rcx+400]
-        mov	rax, QWORD PTR [r12+408]
-        mov	QWORD PTR [r12+400], r10
-        sbb	rax, QWORD PTR [rcx+408]
-        mov	r9, QWORD PTR [r12+416]
-        mov	QWORD PTR [r12+408], rax
-        sbb	r9, QWORD PTR [rcx+416]
-        mov	r10, QWORD PTR [r12+424]
-        mov	QWORD PTR [r12+416], r9
-        sbb	r10, QWORD PTR [rcx+424]
-        mov	rax, QWORD PTR [r12+432]
-        mov	QWORD PTR [r12+424], r10
-        sbb	rax, QWORD PTR [rcx+432]
-        mov	r9, QWORD PTR [r12+440]
-        mov	QWORD PTR [r12+432], rax
-        sbb	r9, QWORD PTR [rcx+440]
-        mov	r10, QWORD PTR [r12+448]
-        mov	QWORD PTR [r12+440], r9
-        sbb	r10, QWORD PTR [rcx+448]
-        mov	rax, QWORD PTR [r12+456]
-        mov	QWORD PTR [r12+448], r10
-        sbb	rax, QWORD PTR [rcx+456]
-        mov	r9, QWORD PTR [r12+464]
-        mov	QWORD PTR [r12+456], rax
-        sbb	r9, QWORD PTR [rcx+464]
-        mov	r10, QWORD PTR [r12+472]
-        mov	QWORD PTR [r12+464], r9
-        sbb	r10, QWORD PTR [rcx+472]
-        mov	rax, QWORD PTR [r12+480]
-        mov	QWORD PTR [r12+472], r10
-        sbb	rax, QWORD PTR [rcx+480]
-        mov	r9, QWORD PTR [r12+488]
-        mov	QWORD PTR [r12+480], rax
-        sbb	r9, QWORD PTR [rcx+488]
-        mov	r10, QWORD PTR [r12+496]
-        mov	QWORD PTR [r12+488], r9
-        sbb	r10, QWORD PTR [rcx+496]
-        mov	rax, QWORD PTR [r12+504]
-        mov	QWORD PTR [r12+496], r10
-        sbb	rax, QWORD PTR [rcx+504]
-        mov	QWORD PTR [r12+504], rax
-        sbb	r11, 0
-        sub	rsi, 256
-        ; Add
-        mov	rax, QWORD PTR [rsi]
-        add	rax, QWORD PTR [r12]
-        mov	r9, QWORD PTR [rsi+8]
-        mov	QWORD PTR [rsi], rax
-        adc	r9, QWORD PTR [r12+8]
-        mov	r10, QWORD PTR [rsi+16]
-        mov	QWORD PTR [rsi+8], r9
-        adc	r10, QWORD PTR [r12+16]
-        mov	rax, QWORD PTR [rsi+24]
-        mov	QWORD PTR [rsi+16], r10
-        adc	rax, QWORD PTR [r12+24]
-        mov	r9, QWORD PTR [rsi+32]
-        mov	QWORD PTR [rsi+24], rax
-        adc	r9, QWORD PTR [r12+32]
-        mov	r10, QWORD PTR [rsi+40]
-        mov	QWORD PTR [rsi+32], r9
-        adc	r10, QWORD PTR [r12+40]
-        mov	rax, QWORD PTR [rsi+48]
-        mov	QWORD PTR [rsi+40], r10
-        adc	rax, QWORD PTR [r12+48]
-        mov	r9, QWORD PTR [rsi+56]
-        mov	QWORD PTR [rsi+48], rax
-        adc	r9, QWORD PTR [r12+56]
-        mov	r10, QWORD PTR [rsi+64]
-        mov	QWORD PTR [rsi+56], r9
-        adc	r10, QWORD PTR [r12+64]
-        mov	rax, QWORD PTR [rsi+72]
-        mov	QWORD PTR [rsi+64], r10
-        adc	rax, QWORD PTR [r12+72]
-        mov	r9, QWORD PTR [rsi+80]
-        mov	QWORD PTR [rsi+72], rax
-        adc	r9, QWORD PTR [r12+80]
-        mov	r10, QWORD PTR [rsi+88]
-        mov	QWORD PTR [rsi+80], r9
-        adc	r10, QWORD PTR [r12+88]
-        mov	rax, QWORD PTR [rsi+96]
-        mov	QWORD PTR [rsi+88], r10
-        adc	rax, QWORD PTR [r12+96]
-        mov	r9, QWORD PTR [rsi+104]
-        mov	QWORD PTR [rsi+96], rax
-        adc	r9, QWORD PTR [r12+104]
-        mov	r10, QWORD PTR [rsi+112]
-        mov	QWORD PTR [rsi+104], r9
-        adc	r10, QWORD PTR [r12+112]
-        mov	rax, QWORD PTR [rsi+120]
-        mov	QWORD PTR [rsi+112], r10
-        adc	rax, QWORD PTR [r12+120]
-        mov	r9, QWORD PTR [rsi+128]
-        mov	QWORD PTR [rsi+120], rax
-        adc	r9, QWORD PTR [r12+128]
-        mov	r10, QWORD PTR [rsi+136]
-        mov	QWORD PTR [rsi+128], r9
-        adc	r10, QWORD PTR [r12+136]
-        mov	rax, QWORD PTR [rsi+144]
-        mov	QWORD PTR [rsi+136], r10
-        adc	rax, QWORD PTR [r12+144]
-        mov	r9, QWORD PTR [rsi+152]
-        mov	QWORD PTR [rsi+144], rax
-        adc	r9, QWORD PTR [r12+152]
-        mov	r10, QWORD PTR [rsi+160]
-        mov	QWORD PTR [rsi+152], r9
-        adc	r10, QWORD PTR [r12+160]
-        mov	rax, QWORD PTR [rsi+168]
-        mov	QWORD PTR [rsi+160], r10
-        adc	rax, QWORD PTR [r12+168]
-        mov	r9, QWORD PTR [rsi+176]
-        mov	QWORD PTR [rsi+168], rax
-        adc	r9, QWORD PTR [r12+176]
-        mov	r10, QWORD PTR [rsi+184]
-        mov	QWORD PTR [rsi+176], r9
-        adc	r10, QWORD PTR [r12+184]
-        mov	rax, QWORD PTR [rsi+192]
-        mov	QWORD PTR [rsi+184], r10
-        adc	rax, QWORD PTR [r12+192]
-        mov	r9, QWORD PTR [rsi+200]
-        mov	QWORD PTR [rsi+192], rax
-        adc	r9, QWORD PTR [r12+200]
-        mov	r10, QWORD PTR [rsi+208]
-        mov	QWORD PTR [rsi+200], r9
-        adc	r10, QWORD PTR [r12+208]
-        mov	rax, QWORD PTR [rsi+216]
-        mov	QWORD PTR [rsi+208], r10
-        adc	rax, QWORD PTR [r12+216]
-        mov	r9, QWORD PTR [rsi+224]
-        mov	QWORD PTR [rsi+216], rax
-        adc	r9, QWORD PTR [r12+224]
-        mov	r10, QWORD PTR [rsi+232]
-        mov	QWORD PTR [rsi+224], r9
-        adc	r10, QWORD PTR [r12+232]
-        mov	rax, QWORD PTR [rsi+240]
-        mov	QWORD PTR [rsi+232], r10
-        adc	rax, QWORD PTR [r12+240]
-        mov	r9, QWORD PTR [rsi+248]
-        mov	QWORD PTR [rsi+240], rax
-        adc	r9, QWORD PTR [r12+248]
-        mov	r10, QWORD PTR [rsi+256]
-        mov	QWORD PTR [rsi+248], r9
-        adc	r10, QWORD PTR [r12+256]
-        mov	rax, QWORD PTR [rsi+264]
-        mov	QWORD PTR [rsi+256], r10
-        adc	rax, QWORD PTR [r12+264]
-        mov	r9, QWORD PTR [rsi+272]
-        mov	QWORD PTR [rsi+264], rax
-        adc	r9, QWORD PTR [r12+272]
-        mov	r10, QWORD PTR [rsi+280]
-        mov	QWORD PTR [rsi+272], r9
-        adc	r10, QWORD PTR [r12+280]
-        mov	rax, QWORD PTR [rsi+288]
-        mov	QWORD PTR [rsi+280], r10
-        adc	rax, QWORD PTR [r12+288]
-        mov	r9, QWORD PTR [rsi+296]
-        mov	QWORD PTR [rsi+288], rax
-        adc	r9, QWORD PTR [r12+296]
-        mov	r10, QWORD PTR [rsi+304]
-        mov	QWORD PTR [rsi+296], r9
-        adc	r10, QWORD PTR [r12+304]
-        mov	rax, QWORD PTR [rsi+312]
-        mov	QWORD PTR [rsi+304], r10
-        adc	rax, QWORD PTR [r12+312]
-        mov	r9, QWORD PTR [rsi+320]
-        mov	QWORD PTR [rsi+312], rax
-        adc	r9, QWORD PTR [r12+320]
-        mov	r10, QWORD PTR [rsi+328]
-        mov	QWORD PTR [rsi+320], r9
-        adc	r10, QWORD PTR [r12+328]
-        mov	rax, QWORD PTR [rsi+336]
-        mov	QWORD PTR [rsi+328], r10
-        adc	rax, QWORD PTR [r12+336]
-        mov	r9, QWORD PTR [rsi+344]
-        mov	QWORD PTR [rsi+336], rax
-        adc	r9, QWORD PTR [r12+344]
-        mov	r10, QWORD PTR [rsi+352]
-        mov	QWORD PTR [rsi+344], r9
-        adc	r10, QWORD PTR [r12+352]
-        mov	rax, QWORD PTR [rsi+360]
-        mov	QWORD PTR [rsi+352], r10
-        adc	rax, QWORD PTR [r12+360]
-        mov	r9, QWORD PTR [rsi+368]
-        mov	QWORD PTR [rsi+360], rax
-        adc	r9, QWORD PTR [r12+368]
-        mov	r10, QWORD PTR [rsi+376]
-        mov	QWORD PTR [rsi+368], r9
-        adc	r10, QWORD PTR [r12+376]
-        mov	rax, QWORD PTR [rsi+384]
-        mov	QWORD PTR [rsi+376], r10
-        adc	rax, QWORD PTR [r12+384]
-        mov	r9, QWORD PTR [rsi+392]
-        mov	QWORD PTR [rsi+384], rax
-        adc	r9, QWORD PTR [r12+392]
-        mov	r10, QWORD PTR [rsi+400]
-        mov	QWORD PTR [rsi+392], r9
-        adc	r10, QWORD PTR [r12+400]
-        mov	rax, QWORD PTR [rsi+408]
-        mov	QWORD PTR [rsi+400], r10
-        adc	rax, QWORD PTR [r12+408]
-        mov	r9, QWORD PTR [rsi+416]
-        mov	QWORD PTR [rsi+408], rax
-        adc	r9, QWORD PTR [r12+416]
-        mov	r10, QWORD PTR [rsi+424]
-        mov	QWORD PTR [rsi+416], r9
-        adc	r10, QWORD PTR [r12+424]
-        mov	rax, QWORD PTR [rsi+432]
-        mov	QWORD PTR [rsi+424], r10
-        adc	rax, QWORD PTR [r12+432]
-        mov	r9, QWORD PTR [rsi+440]
-        mov	QWORD PTR [rsi+432], rax
-        adc	r9, QWORD PTR [r12+440]
-        mov	r10, QWORD PTR [rsi+448]
-        mov	QWORD PTR [rsi+440], r9
-        adc	r10, QWORD PTR [r12+448]
-        mov	rax, QWORD PTR [rsi+456]
-        mov	QWORD PTR [rsi+448], r10
-        adc	rax, QWORD PTR [r12+456]
-        mov	r9, QWORD PTR [rsi+464]
-        mov	QWORD PTR [rsi+456], rax
-        adc	r9, QWORD PTR [r12+464]
-        mov	r10, QWORD PTR [rsi+472]
-        mov	QWORD PTR [rsi+464], r9
-        adc	r10, QWORD PTR [r12+472]
-        mov	rax, QWORD PTR [rsi+480]
-        mov	QWORD PTR [rsi+472], r10
-        adc	rax, QWORD PTR [r12+480]
-        mov	r9, QWORD PTR [rsi+488]
-        mov	QWORD PTR [rsi+480], rax
-        adc	r9, QWORD PTR [r12+488]
-        mov	r10, QWORD PTR [rsi+496]
-        mov	QWORD PTR [rsi+488], r9
-        adc	r10, QWORD PTR [r12+496]
-        mov	rax, QWORD PTR [rsi+504]
-        mov	QWORD PTR [rsi+496], r10
-        adc	rax, QWORD PTR [r12+504]
-        mov	QWORD PTR [rsi+504], rax
-        adc	r11, 0
-        mov	QWORD PTR [rcx+768], r11
-        add	rsi, 256
-        ; Add
-        mov	rax, QWORD PTR [rsi]
-        add	rax, QWORD PTR [r13]
-        mov	r9, QWORD PTR [rsi+8]
-        mov	QWORD PTR [rsi], rax
-        adc	r9, QWORD PTR [r13+8]
-        mov	r10, QWORD PTR [rsi+16]
-        mov	QWORD PTR [rsi+8], r9
-        adc	r10, QWORD PTR [r13+16]
-        mov	rax, QWORD PTR [rsi+24]
-        mov	QWORD PTR [rsi+16], r10
-        adc	rax, QWORD PTR [r13+24]
-        mov	r9, QWORD PTR [rsi+32]
-        mov	QWORD PTR [rsi+24], rax
-        adc	r9, QWORD PTR [r13+32]
-        mov	r10, QWORD PTR [rsi+40]
-        mov	QWORD PTR [rsi+32], r9
-        adc	r10, QWORD PTR [r13+40]
-        mov	rax, QWORD PTR [rsi+48]
-        mov	QWORD PTR [rsi+40], r10
-        adc	rax, QWORD PTR [r13+48]
-        mov	r9, QWORD PTR [rsi+56]
-        mov	QWORD PTR [rsi+48], rax
-        adc	r9, QWORD PTR [r13+56]
-        mov	r10, QWORD PTR [rsi+64]
-        mov	QWORD PTR [rsi+56], r9
-        adc	r10, QWORD PTR [r13+64]
-        mov	rax, QWORD PTR [rsi+72]
-        mov	QWORD PTR [rsi+64], r10
-        adc	rax, QWORD PTR [r13+72]
-        mov	r9, QWORD PTR [rsi+80]
-        mov	QWORD PTR [rsi+72], rax
-        adc	r9, QWORD PTR [r13+80]
-        mov	r10, QWORD PTR [rsi+88]
-        mov	QWORD PTR [rsi+80], r9
-        adc	r10, QWORD PTR [r13+88]
-        mov	rax, QWORD PTR [rsi+96]
-        mov	QWORD PTR [rsi+88], r10
-        adc	rax, QWORD PTR [r13+96]
-        mov	r9, QWORD PTR [rsi+104]
-        mov	QWORD PTR [rsi+96], rax
-        adc	r9, QWORD PTR [r13+104]
-        mov	r10, QWORD PTR [rsi+112]
-        mov	QWORD PTR [rsi+104], r9
-        adc	r10, QWORD PTR [r13+112]
-        mov	rax, QWORD PTR [rsi+120]
-        mov	QWORD PTR [rsi+112], r10
-        adc	rax, QWORD PTR [r13+120]
-        mov	r9, QWORD PTR [rsi+128]
-        mov	QWORD PTR [rsi+120], rax
-        adc	r9, QWORD PTR [r13+128]
-        mov	r10, QWORD PTR [rsi+136]
-        mov	QWORD PTR [rsi+128], r9
-        adc	r10, QWORD PTR [r13+136]
-        mov	rax, QWORD PTR [rsi+144]
-        mov	QWORD PTR [rsi+136], r10
-        adc	rax, QWORD PTR [r13+144]
-        mov	r9, QWORD PTR [rsi+152]
-        mov	QWORD PTR [rsi+144], rax
-        adc	r9, QWORD PTR [r13+152]
-        mov	r10, QWORD PTR [rsi+160]
-        mov	QWORD PTR [rsi+152], r9
-        adc	r10, QWORD PTR [r13+160]
-        mov	rax, QWORD PTR [rsi+168]
-        mov	QWORD PTR [rsi+160], r10
-        adc	rax, QWORD PTR [r13+168]
-        mov	r9, QWORD PTR [rsi+176]
-        mov	QWORD PTR [rsi+168], rax
-        adc	r9, QWORD PTR [r13+176]
-        mov	r10, QWORD PTR [rsi+184]
-        mov	QWORD PTR [rsi+176], r9
-        adc	r10, QWORD PTR [r13+184]
-        mov	rax, QWORD PTR [rsi+192]
-        mov	QWORD PTR [rsi+184], r10
-        adc	rax, QWORD PTR [r13+192]
-        mov	r9, QWORD PTR [rsi+200]
-        mov	QWORD PTR [rsi+192], rax
-        adc	r9, QWORD PTR [r13+200]
-        mov	r10, QWORD PTR [rsi+208]
-        mov	QWORD PTR [rsi+200], r9
-        adc	r10, QWORD PTR [r13+208]
-        mov	rax, QWORD PTR [rsi+216]
-        mov	QWORD PTR [rsi+208], r10
-        adc	rax, QWORD PTR [r13+216]
-        mov	r9, QWORD PTR [rsi+224]
-        mov	QWORD PTR [rsi+216], rax
-        adc	r9, QWORD PTR [r13+224]
-        mov	r10, QWORD PTR [rsi+232]
-        mov	QWORD PTR [rsi+224], r9
-        adc	r10, QWORD PTR [r13+232]
-        mov	rax, QWORD PTR [rsi+240]
-        mov	QWORD PTR [rsi+232], r10
-        adc	rax, QWORD PTR [r13+240]
-        mov	r9, QWORD PTR [rsi+248]
-        mov	QWORD PTR [rsi+240], rax
-        adc	r9, QWORD PTR [r13+248]
-        mov	r10, QWORD PTR [rsi+256]
-        mov	QWORD PTR [rsi+248], r9
-        adc	r10, QWORD PTR [r13+256]
-        mov	QWORD PTR [rsi+256], r10
-        ; Add to zero
-        mov	rax, QWORD PTR [r13+264]
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+272]
-        mov	QWORD PTR [rsi+264], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+280]
-        mov	QWORD PTR [rsi+272], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+288]
-        mov	QWORD PTR [rsi+280], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+296]
-        mov	QWORD PTR [rsi+288], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+304]
-        mov	QWORD PTR [rsi+296], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+312]
-        mov	QWORD PTR [rsi+304], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+320]
-        mov	QWORD PTR [rsi+312], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+328]
-        mov	QWORD PTR [rsi+320], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+336]
-        mov	QWORD PTR [rsi+328], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+344]
-        mov	QWORD PTR [rsi+336], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+352]
-        mov	QWORD PTR [rsi+344], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+360]
-        mov	QWORD PTR [rsi+352], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+368]
-        mov	QWORD PTR [rsi+360], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+376]
-        mov	QWORD PTR [rsi+368], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+384]
-        mov	QWORD PTR [rsi+376], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+392]
-        mov	QWORD PTR [rsi+384], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+400]
-        mov	QWORD PTR [rsi+392], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+408]
-        mov	QWORD PTR [rsi+400], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+416]
-        mov	QWORD PTR [rsi+408], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+424]
-        mov	QWORD PTR [rsi+416], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+432]
-        mov	QWORD PTR [rsi+424], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+440]
-        mov	QWORD PTR [rsi+432], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+448]
-        mov	QWORD PTR [rsi+440], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+456]
-        mov	QWORD PTR [rsi+448], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+464]
-        mov	QWORD PTR [rsi+456], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+472]
-        mov	QWORD PTR [rsi+464], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+480]
-        mov	QWORD PTR [rsi+472], r10
-        adc	rax, 0
-        mov	r9, QWORD PTR [r13+488]
-        mov	QWORD PTR [rsi+480], rax
-        adc	r9, 0
-        mov	r10, QWORD PTR [r13+496]
-        mov	QWORD PTR [rsi+488], r9
-        adc	r10, 0
-        mov	rax, QWORD PTR [r13+504]
-        mov	QWORD PTR [rsi+496], r10
-        adc	rax, 0
-        mov	QWORD PTR [rsi+504], rax
-        add	rsp, 1576
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_4096_mul_avx2_64 ENDP
-_text ENDS
-ENDIF
-; /* Square a and put result in r. (r = a * a)
-;  *
-;  * Karatsuba: ah^2, al^2, (al - ah)^2
-;  *
-;  * r  A single precision integer.
-;  * a  A single precision integer.
-;  */
-_text SEGMENT READONLY PARA
-sp_4096_sqr_64 PROC
-        sub	rsp, 528
-        mov	QWORD PTR [rsp+512], rcx
-        mov	QWORD PTR [rsp+520], rdx
-        mov	r9, 0
-        mov	r10, rsp
-        lea	r11, QWORD PTR [rdx+256]
-        mov	rax, QWORD PTR [rdx]
-        sub	rax, QWORD PTR [r11]
-        mov	r8, QWORD PTR [rdx+8]
-        mov	QWORD PTR [r10], rax
-        sbb	r8, QWORD PTR [r11+8]
-        mov	rax, QWORD PTR [rdx+16]
-        mov	QWORD PTR [r10+8], r8
-        sbb	rax, QWORD PTR [r11+16]
-        mov	r8, QWORD PTR [rdx+24]
-        mov	QWORD PTR [r10+16], rax
-        sbb	r8, QWORD PTR [r11+24]
-        mov	rax, QWORD PTR [rdx+32]
-        mov	QWORD PTR [r10+24], r8
-        sbb	rax, QWORD PTR [r11+32]
-        mov	r8, QWORD PTR [rdx+40]
-        mov	QWORD PTR [r10+32], rax
-        sbb	r8, QWORD PTR [r11+40]
-        mov	rax, QWORD PTR [rdx+48]
-        mov	QWORD PTR [r10+40], r8
-        sbb	rax, QWORD PTR [r11+48]
-        mov	r8, QWORD PTR [rdx+56]
-        mov	QWORD PTR [r10+48], rax
-        sbb	r8, QWORD PTR [r11+56]
-        mov	rax, QWORD PTR [rdx+64]
-        mov	QWORD PTR [r10+56], r8
-        sbb	rax, QWORD PTR [r11+64]
-        mov	r8, QWORD PTR [rdx+72]
-        mov	QWORD PTR [r10+64], rax
-        sbb	r8, QWORD PTR [r11+72]
-        mov	rax, QWORD PTR [rdx+80]
-        mov	QWORD PTR [r10+72], r8
-        sbb	rax, QWORD PTR [r11+80]
-        mov	r8, QWORD PTR [rdx+88]
-        mov	QWORD PTR [r10+80], rax
-        sbb	r8, QWORD PTR [r11+88]
-        mov	rax, QWORD PTR [rdx+96]
-        mov	QWORD PTR [r10+88], r8
-        sbb	rax, QWORD PTR [r11+96]
-        mov	r8, QWORD PTR [rdx+104]
-        mov	QWORD PTR [r10+96], rax
-        sbb	r8, QWORD PTR [r11+104]
-        mov	rax, QWORD PTR [rdx+112]
-        mov	QWORD PTR [r10+104], r8
-        sbb	rax, QWORD PTR [r11+112]
-        mov	r8, QWORD PTR [rdx+120]
-        mov	QWORD PTR [r10+112], rax
-        sbb	r8, QWORD PTR [r11+120]
-        mov	rax, QWORD PTR [rdx+128]
-        mov	QWORD PTR [r10+120], r8
-        sbb	rax, QWORD PTR [r11+128]
-        mov	r8, QWORD PTR [rdx+136]
-        mov	QWORD PTR [r10+128], rax
-        sbb	r8, QWORD PTR [r11+136]
-        mov	rax, QWORD PTR [rdx+144]
-        mov	QWORD PTR [r10+136], r8
-        sbb	rax, QWORD PTR [r11+144]
-        mov	r8, QWORD PTR [rdx+152]
-        mov	QWORD PTR [r10+144], rax
-        sbb	r8, QWORD PTR [r11+152]
-        mov	rax, QWORD PTR [rdx+160]
-        mov	QWORD PTR [r10+152], r8
-        sbb	rax, QWORD PTR [r11+160]
-        mov	r8, QWORD PTR [rdx+168]
-        mov	QWORD PTR [r10+160], rax
-        sbb	r8, QWORD PTR [r11+168]
-        mov	rax, QWORD PTR [rdx+176]
-        mov	QWORD PTR [r10+168], r8
-        sbb	rax, QWORD PTR [r11+176]
-        mov	r8, QWORD PTR [rdx+184]
-        mov	QWORD PTR [r10+176], rax
-        sbb	r8, QWORD PTR [r11+184]
-        mov	rax, QWORD PTR [rdx+192]
-        mov	QWORD PTR [r10+184], r8
-        sbb	rax, QWORD PTR [r11+192]
-        mov	r8, QWORD PTR [rdx+200]
-        mov	QWORD PTR [r10+192], rax
-        sbb	r8, QWORD PTR [r11+200]
-        mov	rax, QWORD PTR [rdx+208]
-        mov	QWORD PTR [r10+200], r8
-        sbb	rax, QWORD PTR [r11+208]
-        mov	r8, QWORD PTR [rdx+216]
-        mov	QWORD PTR [r10+208], rax
-        sbb	r8, QWORD PTR [r11+216]
-        mov	rax, QWORD PTR [rdx+224]
-        mov	QWORD PTR [r10+216], r8
-        sbb	rax, QWORD PTR [r11+224]
-        mov	r8, QWORD PTR [rdx+232]
-        mov	QWORD PTR [r10+224], rax
-        sbb	r8, QWORD PTR [r11+232]
-        mov	rax, QWORD PTR [rdx+240]
-        mov	QWORD PTR [r10+232], r8
-        sbb	rax, QWORD PTR [r11+240]
-        mov	r8, QWORD PTR [rdx+248]
-        mov	QWORD PTR [r10+240], rax
-        sbb	r8, QWORD PTR [r11+248]
-        mov	QWORD PTR [r10+248], r8
-        sbb	r9, 0
-        ; Cond Negate
-        mov	rax, QWORD PTR [r10]
-        mov	r11, r9
-        xor	rax, r9
-        neg	r11
-        sub	rax, r9
-        mov	r8, QWORD PTR [r10+8]
-        sbb	r11, 0
-        mov	QWORD PTR [r10], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+16]
-        setc	r11b
-        mov	QWORD PTR [r10+8], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+24]
-        setc	r11b
-        mov	QWORD PTR [r10+16], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+32]
-        setc	r11b
-        mov	QWORD PTR [r10+24], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+40]
-        setc	r11b
-        mov	QWORD PTR [r10+32], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+48]
-        setc	r11b
-        mov	QWORD PTR [r10+40], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+56]
-        setc	r11b
-        mov	QWORD PTR [r10+48], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+64]
-        setc	r11b
-        mov	QWORD PTR [r10+56], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+72]
-        setc	r11b
-        mov	QWORD PTR [r10+64], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+80]
-        setc	r11b
-        mov	QWORD PTR [r10+72], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+88]
-        setc	r11b
-        mov	QWORD PTR [r10+80], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+96]
-        setc	r11b
-        mov	QWORD PTR [r10+88], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+104]
-        setc	r11b
-        mov	QWORD PTR [r10+96], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+112]
-        setc	r11b
-        mov	QWORD PTR [r10+104], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+120]
-        setc	r11b
-        mov	QWORD PTR [r10+112], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+128]
-        setc	r11b
-        mov	QWORD PTR [r10+120], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+136]
-        setc	r11b
-        mov	QWORD PTR [r10+128], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+144]
-        setc	r11b
-        mov	QWORD PTR [r10+136], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+152]
-        setc	r11b
-        mov	QWORD PTR [r10+144], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+160]
-        setc	r11b
-        mov	QWORD PTR [r10+152], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+168]
-        setc	r11b
-        mov	QWORD PTR [r10+160], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+176]
-        setc	r11b
-        mov	QWORD PTR [r10+168], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+184]
-        setc	r11b
-        mov	QWORD PTR [r10+176], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+192]
-        setc	r11b
-        mov	QWORD PTR [r10+184], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+200]
-        setc	r11b
-        mov	QWORD PTR [r10+192], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+208]
-        setc	r11b
-        mov	QWORD PTR [r10+200], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+216]
-        setc	r11b
-        mov	QWORD PTR [r10+208], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+224]
-        setc	r11b
-        mov	QWORD PTR [r10+216], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+232]
-        setc	r11b
-        mov	QWORD PTR [r10+224], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+240]
-        setc	r11b
-        mov	QWORD PTR [r10+232], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+248]
-        setc	r11b
-        mov	QWORD PTR [r10+240], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	QWORD PTR [r10+248], r8
-        mov	rdx, r10
-        mov	rcx, rsp
-        call	sp_2048_sqr_32
-        mov	rdx, QWORD PTR [rsp+520]
-        mov	rcx, QWORD PTR [rsp+512]
-        add	rdx, 256
-        add	rcx, 512
-        call	sp_2048_sqr_32
-        mov	rdx, QWORD PTR [rsp+520]
-        mov	rcx, QWORD PTR [rsp+512]
-        call	sp_2048_sqr_32
-IFDEF _WIN64
-        mov	rdx, QWORD PTR [rsp+520]
-        mov	rcx, QWORD PTR [rsp+512]
-ENDIF
-        mov	rdx, QWORD PTR [rsp+512]
-        lea	r10, QWORD PTR [rsp+256]
-        add	rdx, 768
-        mov	r9, 0
-        mov	r8, QWORD PTR [r10+-256]
-        sub	r8, QWORD PTR [rdx+-256]
-        mov	rax, QWORD PTR [r10+-248]
-        mov	QWORD PTR [r10+-256], r8
-        sbb	rax, QWORD PTR [rdx+-248]
-        mov	r8, QWORD PTR [r10+-240]
-        mov	QWORD PTR [r10+-248], rax
-        sbb	r8, QWORD PTR [rdx+-240]
-        mov	rax, QWORD PTR [r10+-232]
-        mov	QWORD PTR [r10+-240], r8
-        sbb	rax, QWORD PTR [rdx+-232]
-        mov	r8, QWORD PTR [r10+-224]
-        mov	QWORD PTR [r10+-232], rax
-        sbb	r8, QWORD PTR [rdx+-224]
-        mov	rax, QWORD PTR [r10+-216]
-        mov	QWORD PTR [r10+-224], r8
-        sbb	rax, QWORD PTR [rdx+-216]
-        mov	r8, QWORD PTR [r10+-208]
-        mov	QWORD PTR [r10+-216], rax
-        sbb	r8, QWORD PTR [rdx+-208]
-        mov	rax, QWORD PTR [r10+-200]
-        mov	QWORD PTR [r10+-208], r8
-        sbb	rax, QWORD PTR [rdx+-200]
-        mov	r8, QWORD PTR [r10+-192]
-        mov	QWORD PTR [r10+-200], rax
-        sbb	r8, QWORD PTR [rdx+-192]
-        mov	rax, QWORD PTR [r10+-184]
-        mov	QWORD PTR [r10+-192], r8
-        sbb	rax, QWORD PTR [rdx+-184]
-        mov	r8, QWORD PTR [r10+-176]
-        mov	QWORD PTR [r10+-184], rax
-        sbb	r8, QWORD PTR [rdx+-176]
-        mov	rax, QWORD PTR [r10+-168]
-        mov	QWORD PTR [r10+-176], r8
-        sbb	rax, QWORD PTR [rdx+-168]
-        mov	r8, QWORD PTR [r10+-160]
-        mov	QWORD PTR [r10+-168], rax
-        sbb	r8, QWORD PTR [rdx+-160]
-        mov	rax, QWORD PTR [r10+-152]
-        mov	QWORD PTR [r10+-160], r8
-        sbb	rax, QWORD PTR [rdx+-152]
-        mov	r8, QWORD PTR [r10+-144]
-        mov	QWORD PTR [r10+-152], rax
-        sbb	r8, QWORD PTR [rdx+-144]
-        mov	rax, QWORD PTR [r10+-136]
-        mov	QWORD PTR [r10+-144], r8
-        sbb	rax, QWORD PTR [rdx+-136]
-        mov	r8, QWORD PTR [r10+-128]
-        mov	QWORD PTR [r10+-136], rax
-        sbb	r8, QWORD PTR [rdx+-128]
-        mov	rax, QWORD PTR [r10+-120]
-        mov	QWORD PTR [r10+-128], r8
-        sbb	rax, QWORD PTR [rdx+-120]
-        mov	r8, QWORD PTR [r10+-112]
-        mov	QWORD PTR [r10+-120], rax
-        sbb	r8, QWORD PTR [rdx+-112]
-        mov	rax, QWORD PTR [r10+-104]
-        mov	QWORD PTR [r10+-112], r8
-        sbb	rax, QWORD PTR [rdx+-104]
-        mov	r8, QWORD PTR [r10+-96]
-        mov	QWORD PTR [r10+-104], rax
-        sbb	r8, QWORD PTR [rdx+-96]
-        mov	rax, QWORD PTR [r10+-88]
-        mov	QWORD PTR [r10+-96], r8
-        sbb	rax, QWORD PTR [rdx+-88]
-        mov	r8, QWORD PTR [r10+-80]
-        mov	QWORD PTR [r10+-88], rax
-        sbb	r8, QWORD PTR [rdx+-80]
-        mov	rax, QWORD PTR [r10+-72]
-        mov	QWORD PTR [r10+-80], r8
-        sbb	rax, QWORD PTR [rdx+-72]
-        mov	r8, QWORD PTR [r10+-64]
-        mov	QWORD PTR [r10+-72], rax
-        sbb	r8, QWORD PTR [rdx+-64]
-        mov	rax, QWORD PTR [r10+-56]
-        mov	QWORD PTR [r10+-64], r8
-        sbb	rax, QWORD PTR [rdx+-56]
-        mov	r8, QWORD PTR [r10+-48]
-        mov	QWORD PTR [r10+-56], rax
-        sbb	r8, QWORD PTR [rdx+-48]
-        mov	rax, QWORD PTR [r10+-40]
-        mov	QWORD PTR [r10+-48], r8
-        sbb	rax, QWORD PTR [rdx+-40]
-        mov	r8, QWORD PTR [r10+-32]
-        mov	QWORD PTR [r10+-40], rax
-        sbb	r8, QWORD PTR [rdx+-32]
-        mov	rax, QWORD PTR [r10+-24]
-        mov	QWORD PTR [r10+-32], r8
-        sbb	rax, QWORD PTR [rdx+-24]
-        mov	r8, QWORD PTR [r10+-16]
-        mov	QWORD PTR [r10+-24], rax
-        sbb	r8, QWORD PTR [rdx+-16]
-        mov	rax, QWORD PTR [r10+-8]
-        mov	QWORD PTR [r10+-16], r8
-        sbb	rax, QWORD PTR [rdx+-8]
-        mov	r8, QWORD PTR [r10]
-        mov	QWORD PTR [r10+-8], rax
-        sbb	r8, QWORD PTR [rdx]
-        mov	rax, QWORD PTR [r10+8]
-        mov	QWORD PTR [r10], r8
-        sbb	rax, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [r10+16]
-        mov	QWORD PTR [r10+8], rax
-        sbb	r8, QWORD PTR [rdx+16]
-        mov	rax, QWORD PTR [r10+24]
-        mov	QWORD PTR [r10+16], r8
-        sbb	rax, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [r10+32]
-        mov	QWORD PTR [r10+24], rax
-        sbb	r8, QWORD PTR [rdx+32]
-        mov	rax, QWORD PTR [r10+40]
-        mov	QWORD PTR [r10+32], r8
-        sbb	rax, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [r10+48]
-        mov	QWORD PTR [r10+40], rax
-        sbb	r8, QWORD PTR [rdx+48]
-        mov	rax, QWORD PTR [r10+56]
-        mov	QWORD PTR [r10+48], r8
-        sbb	rax, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [r10+64]
-        mov	QWORD PTR [r10+56], rax
-        sbb	r8, QWORD PTR [rdx+64]
-        mov	rax, QWORD PTR [r10+72]
-        mov	QWORD PTR [r10+64], r8
-        sbb	rax, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [r10+80]
-        mov	QWORD PTR [r10+72], rax
-        sbb	r8, QWORD PTR [rdx+80]
-        mov	rax, QWORD PTR [r10+88]
-        mov	QWORD PTR [r10+80], r8
-        sbb	rax, QWORD PTR [rdx+88]
-        mov	r8, QWORD PTR [r10+96]
-        mov	QWORD PTR [r10+88], rax
-        sbb	r8, QWORD PTR [rdx+96]
-        mov	rax, QWORD PTR [r10+104]
-        mov	QWORD PTR [r10+96], r8
-        sbb	rax, QWORD PTR [rdx+104]
-        mov	r8, QWORD PTR [r10+112]
-        mov	QWORD PTR [r10+104], rax
-        sbb	r8, QWORD PTR [rdx+112]
-        mov	rax, QWORD PTR [r10+120]
-        mov	QWORD PTR [r10+112], r8
-        sbb	rax, QWORD PTR [rdx+120]
-        mov	r8, QWORD PTR [r10+128]
-        mov	QWORD PTR [r10+120], rax
-        sbb	r8, QWORD PTR [rdx+128]
-        mov	rax, QWORD PTR [r10+136]
-        mov	QWORD PTR [r10+128], r8
-        sbb	rax, QWORD PTR [rdx+136]
-        mov	r8, QWORD PTR [r10+144]
-        mov	QWORD PTR [r10+136], rax
-        sbb	r8, QWORD PTR [rdx+144]
-        mov	rax, QWORD PTR [r10+152]
-        mov	QWORD PTR [r10+144], r8
-        sbb	rax, QWORD PTR [rdx+152]
-        mov	r8, QWORD PTR [r10+160]
-        mov	QWORD PTR [r10+152], rax
-        sbb	r8, QWORD PTR [rdx+160]
-        mov	rax, QWORD PTR [r10+168]
-        mov	QWORD PTR [r10+160], r8
-        sbb	rax, QWORD PTR [rdx+168]
-        mov	r8, QWORD PTR [r10+176]
-        mov	QWORD PTR [r10+168], rax
-        sbb	r8, QWORD PTR [rdx+176]
-        mov	rax, QWORD PTR [r10+184]
-        mov	QWORD PTR [r10+176], r8
-        sbb	rax, QWORD PTR [rdx+184]
-        mov	r8, QWORD PTR [r10+192]
-        mov	QWORD PTR [r10+184], rax
-        sbb	r8, QWORD PTR [rdx+192]
-        mov	rax, QWORD PTR [r10+200]
-        mov	QWORD PTR [r10+192], r8
-        sbb	rax, QWORD PTR [rdx+200]
-        mov	r8, QWORD PTR [r10+208]
-        mov	QWORD PTR [r10+200], rax
-        sbb	r8, QWORD PTR [rdx+208]
-        mov	rax, QWORD PTR [r10+216]
-        mov	QWORD PTR [r10+208], r8
-        sbb	rax, QWORD PTR [rdx+216]
-        mov	r8, QWORD PTR [r10+224]
-        mov	QWORD PTR [r10+216], rax
-        sbb	r8, QWORD PTR [rdx+224]
-        mov	rax, QWORD PTR [r10+232]
-        mov	QWORD PTR [r10+224], r8
-        sbb	rax, QWORD PTR [rdx+232]
-        mov	r8, QWORD PTR [r10+240]
-        mov	QWORD PTR [r10+232], rax
-        sbb	r8, QWORD PTR [rdx+240]
-        mov	rax, QWORD PTR [r10+248]
-        mov	QWORD PTR [r10+240], r8
-        sbb	rax, QWORD PTR [rdx+248]
-        mov	QWORD PTR [r10+248], rax
-        sbb	r9, 0
-        sub	rdx, 512
-        mov	r8, QWORD PTR [r10+-256]
-        sub	r8, QWORD PTR [rdx+-256]
-        mov	rax, QWORD PTR [r10+-248]
-        mov	QWORD PTR [r10+-256], r8
-        sbb	rax, QWORD PTR [rdx+-248]
-        mov	r8, QWORD PTR [r10+-240]
-        mov	QWORD PTR [r10+-248], rax
-        sbb	r8, QWORD PTR [rdx+-240]
-        mov	rax, QWORD PTR [r10+-232]
-        mov	QWORD PTR [r10+-240], r8
-        sbb	rax, QWORD PTR [rdx+-232]
-        mov	r8, QWORD PTR [r10+-224]
-        mov	QWORD PTR [r10+-232], rax
-        sbb	r8, QWORD PTR [rdx+-224]
-        mov	rax, QWORD PTR [r10+-216]
-        mov	QWORD PTR [r10+-224], r8
-        sbb	rax, QWORD PTR [rdx+-216]
-        mov	r8, QWORD PTR [r10+-208]
-        mov	QWORD PTR [r10+-216], rax
-        sbb	r8, QWORD PTR [rdx+-208]
-        mov	rax, QWORD PTR [r10+-200]
-        mov	QWORD PTR [r10+-208], r8
-        sbb	rax, QWORD PTR [rdx+-200]
-        mov	r8, QWORD PTR [r10+-192]
-        mov	QWORD PTR [r10+-200], rax
-        sbb	r8, QWORD PTR [rdx+-192]
-        mov	rax, QWORD PTR [r10+-184]
-        mov	QWORD PTR [r10+-192], r8
-        sbb	rax, QWORD PTR [rdx+-184]
-        mov	r8, QWORD PTR [r10+-176]
-        mov	QWORD PTR [r10+-184], rax
-        sbb	r8, QWORD PTR [rdx+-176]
-        mov	rax, QWORD PTR [r10+-168]
-        mov	QWORD PTR [r10+-176], r8
-        sbb	rax, QWORD PTR [rdx+-168]
-        mov	r8, QWORD PTR [r10+-160]
-        mov	QWORD PTR [r10+-168], rax
-        sbb	r8, QWORD PTR [rdx+-160]
-        mov	rax, QWORD PTR [r10+-152]
-        mov	QWORD PTR [r10+-160], r8
-        sbb	rax, QWORD PTR [rdx+-152]
-        mov	r8, QWORD PTR [r10+-144]
-        mov	QWORD PTR [r10+-152], rax
-        sbb	r8, QWORD PTR [rdx+-144]
-        mov	rax, QWORD PTR [r10+-136]
-        mov	QWORD PTR [r10+-144], r8
-        sbb	rax, QWORD PTR [rdx+-136]
-        mov	r8, QWORD PTR [r10+-128]
-        mov	QWORD PTR [r10+-136], rax
-        sbb	r8, QWORD PTR [rdx+-128]
-        mov	rax, QWORD PTR [r10+-120]
-        mov	QWORD PTR [r10+-128], r8
-        sbb	rax, QWORD PTR [rdx+-120]
-        mov	r8, QWORD PTR [r10+-112]
-        mov	QWORD PTR [r10+-120], rax
-        sbb	r8, QWORD PTR [rdx+-112]
-        mov	rax, QWORD PTR [r10+-104]
-        mov	QWORD PTR [r10+-112], r8
-        sbb	rax, QWORD PTR [rdx+-104]
-        mov	r8, QWORD PTR [r10+-96]
-        mov	QWORD PTR [r10+-104], rax
-        sbb	r8, QWORD PTR [rdx+-96]
-        mov	rax, QWORD PTR [r10+-88]
-        mov	QWORD PTR [r10+-96], r8
-        sbb	rax, QWORD PTR [rdx+-88]
-        mov	r8, QWORD PTR [r10+-80]
-        mov	QWORD PTR [r10+-88], rax
-        sbb	r8, QWORD PTR [rdx+-80]
-        mov	rax, QWORD PTR [r10+-72]
-        mov	QWORD PTR [r10+-80], r8
-        sbb	rax, QWORD PTR [rdx+-72]
-        mov	r8, QWORD PTR [r10+-64]
-        mov	QWORD PTR [r10+-72], rax
-        sbb	r8, QWORD PTR [rdx+-64]
-        mov	rax, QWORD PTR [r10+-56]
-        mov	QWORD PTR [r10+-64], r8
-        sbb	rax, QWORD PTR [rdx+-56]
-        mov	r8, QWORD PTR [r10+-48]
-        mov	QWORD PTR [r10+-56], rax
-        sbb	r8, QWORD PTR [rdx+-48]
-        mov	rax, QWORD PTR [r10+-40]
-        mov	QWORD PTR [r10+-48], r8
-        sbb	rax, QWORD PTR [rdx+-40]
-        mov	r8, QWORD PTR [r10+-32]
-        mov	QWORD PTR [r10+-40], rax
-        sbb	r8, QWORD PTR [rdx+-32]
-        mov	rax, QWORD PTR [r10+-24]
-        mov	QWORD PTR [r10+-32], r8
-        sbb	rax, QWORD PTR [rdx+-24]
-        mov	r8, QWORD PTR [r10+-16]
-        mov	QWORD PTR [r10+-24], rax
-        sbb	r8, QWORD PTR [rdx+-16]
-        mov	rax, QWORD PTR [r10+-8]
-        mov	QWORD PTR [r10+-16], r8
-        sbb	rax, QWORD PTR [rdx+-8]
-        mov	r8, QWORD PTR [r10]
-        mov	QWORD PTR [r10+-8], rax
-        sbb	r8, QWORD PTR [rdx]
-        mov	rax, QWORD PTR [r10+8]
-        mov	QWORD PTR [r10], r8
-        sbb	rax, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [r10+16]
-        mov	QWORD PTR [r10+8], rax
-        sbb	r8, QWORD PTR [rdx+16]
-        mov	rax, QWORD PTR [r10+24]
-        mov	QWORD PTR [r10+16], r8
-        sbb	rax, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [r10+32]
-        mov	QWORD PTR [r10+24], rax
-        sbb	r8, QWORD PTR [rdx+32]
-        mov	rax, QWORD PTR [r10+40]
-        mov	QWORD PTR [r10+32], r8
-        sbb	rax, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [r10+48]
-        mov	QWORD PTR [r10+40], rax
-        sbb	r8, QWORD PTR [rdx+48]
-        mov	rax, QWORD PTR [r10+56]
-        mov	QWORD PTR [r10+48], r8
-        sbb	rax, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [r10+64]
-        mov	QWORD PTR [r10+56], rax
-        sbb	r8, QWORD PTR [rdx+64]
-        mov	rax, QWORD PTR [r10+72]
-        mov	QWORD PTR [r10+64], r8
-        sbb	rax, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [r10+80]
-        mov	QWORD PTR [r10+72], rax
-        sbb	r8, QWORD PTR [rdx+80]
-        mov	rax, QWORD PTR [r10+88]
-        mov	QWORD PTR [r10+80], r8
-        sbb	rax, QWORD PTR [rdx+88]
-        mov	r8, QWORD PTR [r10+96]
-        mov	QWORD PTR [r10+88], rax
-        sbb	r8, QWORD PTR [rdx+96]
-        mov	rax, QWORD PTR [r10+104]
-        mov	QWORD PTR [r10+96], r8
-        sbb	rax, QWORD PTR [rdx+104]
-        mov	r8, QWORD PTR [r10+112]
-        mov	QWORD PTR [r10+104], rax
-        sbb	r8, QWORD PTR [rdx+112]
-        mov	rax, QWORD PTR [r10+120]
-        mov	QWORD PTR [r10+112], r8
-        sbb	rax, QWORD PTR [rdx+120]
-        mov	r8, QWORD PTR [r10+128]
-        mov	QWORD PTR [r10+120], rax
-        sbb	r8, QWORD PTR [rdx+128]
-        mov	rax, QWORD PTR [r10+136]
-        mov	QWORD PTR [r10+128], r8
-        sbb	rax, QWORD PTR [rdx+136]
-        mov	r8, QWORD PTR [r10+144]
-        mov	QWORD PTR [r10+136], rax
-        sbb	r8, QWORD PTR [rdx+144]
-        mov	rax, QWORD PTR [r10+152]
-        mov	QWORD PTR [r10+144], r8
-        sbb	rax, QWORD PTR [rdx+152]
-        mov	r8, QWORD PTR [r10+160]
-        mov	QWORD PTR [r10+152], rax
-        sbb	r8, QWORD PTR [rdx+160]
-        mov	rax, QWORD PTR [r10+168]
-        mov	QWORD PTR [r10+160], r8
-        sbb	rax, QWORD PTR [rdx+168]
-        mov	r8, QWORD PTR [r10+176]
-        mov	QWORD PTR [r10+168], rax
-        sbb	r8, QWORD PTR [rdx+176]
-        mov	rax, QWORD PTR [r10+184]
-        mov	QWORD PTR [r10+176], r8
-        sbb	rax, QWORD PTR [rdx+184]
-        mov	r8, QWORD PTR [r10+192]
-        mov	QWORD PTR [r10+184], rax
-        sbb	r8, QWORD PTR [rdx+192]
-        mov	rax, QWORD PTR [r10+200]
-        mov	QWORD PTR [r10+192], r8
-        sbb	rax, QWORD PTR [rdx+200]
-        mov	r8, QWORD PTR [r10+208]
-        mov	QWORD PTR [r10+200], rax
-        sbb	r8, QWORD PTR [rdx+208]
-        mov	rax, QWORD PTR [r10+216]
-        mov	QWORD PTR [r10+208], r8
-        sbb	rax, QWORD PTR [rdx+216]
-        mov	r8, QWORD PTR [r10+224]
-        mov	QWORD PTR [r10+216], rax
-        sbb	r8, QWORD PTR [rdx+224]
-        mov	rax, QWORD PTR [r10+232]
-        mov	QWORD PTR [r10+224], r8
-        sbb	rax, QWORD PTR [rdx+232]
-        mov	r8, QWORD PTR [r10+240]
-        mov	QWORD PTR [r10+232], rax
-        sbb	r8, QWORD PTR [rdx+240]
-        mov	rax, QWORD PTR [r10+248]
-        mov	QWORD PTR [r10+240], r8
-        sbb	rax, QWORD PTR [rdx+248]
-        mov	QWORD PTR [r10+248], rax
-        sbb	r9, 0
-        mov	rcx, QWORD PTR [rsp+512]
-        neg	r9
-        add	rcx, 512
-        mov	r8, QWORD PTR [rcx+-256]
-        sub	r8, QWORD PTR [r10+-256]
-        mov	rax, QWORD PTR [rcx+-248]
-        mov	QWORD PTR [rcx+-256], r8
-        sbb	rax, QWORD PTR [r10+-248]
-        mov	r8, QWORD PTR [rcx+-240]
-        mov	QWORD PTR [rcx+-248], rax
-        sbb	r8, QWORD PTR [r10+-240]
-        mov	rax, QWORD PTR [rcx+-232]
-        mov	QWORD PTR [rcx+-240], r8
-        sbb	rax, QWORD PTR [r10+-232]
-        mov	r8, QWORD PTR [rcx+-224]
-        mov	QWORD PTR [rcx+-232], rax
-        sbb	r8, QWORD PTR [r10+-224]
-        mov	rax, QWORD PTR [rcx+-216]
-        mov	QWORD PTR [rcx+-224], r8
-        sbb	rax, QWORD PTR [r10+-216]
-        mov	r8, QWORD PTR [rcx+-208]
-        mov	QWORD PTR [rcx+-216], rax
-        sbb	r8, QWORD PTR [r10+-208]
-        mov	rax, QWORD PTR [rcx+-200]
-        mov	QWORD PTR [rcx+-208], r8
-        sbb	rax, QWORD PTR [r10+-200]
-        mov	r8, QWORD PTR [rcx+-192]
-        mov	QWORD PTR [rcx+-200], rax
-        sbb	r8, QWORD PTR [r10+-192]
-        mov	rax, QWORD PTR [rcx+-184]
-        mov	QWORD PTR [rcx+-192], r8
-        sbb	rax, QWORD PTR [r10+-184]
-        mov	r8, QWORD PTR [rcx+-176]
-        mov	QWORD PTR [rcx+-184], rax
-        sbb	r8, QWORD PTR [r10+-176]
-        mov	rax, QWORD PTR [rcx+-168]
-        mov	QWORD PTR [rcx+-176], r8
-        sbb	rax, QWORD PTR [r10+-168]
-        mov	r8, QWORD PTR [rcx+-160]
-        mov	QWORD PTR [rcx+-168], rax
-        sbb	r8, QWORD PTR [r10+-160]
-        mov	rax, QWORD PTR [rcx+-152]
-        mov	QWORD PTR [rcx+-160], r8
-        sbb	rax, QWORD PTR [r10+-152]
-        mov	r8, QWORD PTR [rcx+-144]
-        mov	QWORD PTR [rcx+-152], rax
-        sbb	r8, QWORD PTR [r10+-144]
-        mov	rax, QWORD PTR [rcx+-136]
-        mov	QWORD PTR [rcx+-144], r8
-        sbb	rax, QWORD PTR [r10+-136]
-        mov	r8, QWORD PTR [rcx+-128]
-        mov	QWORD PTR [rcx+-136], rax
-        sbb	r8, QWORD PTR [r10+-128]
-        mov	rax, QWORD PTR [rcx+-120]
-        mov	QWORD PTR [rcx+-128], r8
-        sbb	rax, QWORD PTR [r10+-120]
-        mov	r8, QWORD PTR [rcx+-112]
-        mov	QWORD PTR [rcx+-120], rax
-        sbb	r8, QWORD PTR [r10+-112]
-        mov	rax, QWORD PTR [rcx+-104]
-        mov	QWORD PTR [rcx+-112], r8
-        sbb	rax, QWORD PTR [r10+-104]
-        mov	r8, QWORD PTR [rcx+-96]
-        mov	QWORD PTR [rcx+-104], rax
-        sbb	r8, QWORD PTR [r10+-96]
-        mov	rax, QWORD PTR [rcx+-88]
-        mov	QWORD PTR [rcx+-96], r8
-        sbb	rax, QWORD PTR [r10+-88]
-        mov	r8, QWORD PTR [rcx+-80]
-        mov	QWORD PTR [rcx+-88], rax
-        sbb	r8, QWORD PTR [r10+-80]
-        mov	rax, QWORD PTR [rcx+-72]
-        mov	QWORD PTR [rcx+-80], r8
-        sbb	rax, QWORD PTR [r10+-72]
-        mov	r8, QWORD PTR [rcx+-64]
-        mov	QWORD PTR [rcx+-72], rax
-        sbb	r8, QWORD PTR [r10+-64]
-        mov	rax, QWORD PTR [rcx+-56]
-        mov	QWORD PTR [rcx+-64], r8
-        sbb	rax, QWORD PTR [r10+-56]
-        mov	r8, QWORD PTR [rcx+-48]
-        mov	QWORD PTR [rcx+-56], rax
-        sbb	r8, QWORD PTR [r10+-48]
-        mov	rax, QWORD PTR [rcx+-40]
-        mov	QWORD PTR [rcx+-48], r8
-        sbb	rax, QWORD PTR [r10+-40]
-        mov	r8, QWORD PTR [rcx+-32]
-        mov	QWORD PTR [rcx+-40], rax
-        sbb	r8, QWORD PTR [r10+-32]
-        mov	rax, QWORD PTR [rcx+-24]
-        mov	QWORD PTR [rcx+-32], r8
-        sbb	rax, QWORD PTR [r10+-24]
-        mov	r8, QWORD PTR [rcx+-16]
-        mov	QWORD PTR [rcx+-24], rax
-        sbb	r8, QWORD PTR [r10+-16]
-        mov	rax, QWORD PTR [rcx+-8]
-        mov	QWORD PTR [rcx+-16], r8
-        sbb	rax, QWORD PTR [r10+-8]
-        mov	r8, QWORD PTR [rcx]
-        mov	QWORD PTR [rcx+-8], rax
-        sbb	r8, QWORD PTR [r10]
-        mov	rax, QWORD PTR [rcx+8]
-        mov	QWORD PTR [rcx], r8
-        sbb	rax, QWORD PTR [r10+8]
-        mov	r8, QWORD PTR [rcx+16]
-        mov	QWORD PTR [rcx+8], rax
-        sbb	r8, QWORD PTR [r10+16]
-        mov	rax, QWORD PTR [rcx+24]
-        mov	QWORD PTR [rcx+16], r8
-        sbb	rax, QWORD PTR [r10+24]
-        mov	r8, QWORD PTR [rcx+32]
-        mov	QWORD PTR [rcx+24], rax
-        sbb	r8, QWORD PTR [r10+32]
-        mov	rax, QWORD PTR [rcx+40]
-        mov	QWORD PTR [rcx+32], r8
-        sbb	rax, QWORD PTR [r10+40]
-        mov	r8, QWORD PTR [rcx+48]
-        mov	QWORD PTR [rcx+40], rax
-        sbb	r8, QWORD PTR [r10+48]
-        mov	rax, QWORD PTR [rcx+56]
-        mov	QWORD PTR [rcx+48], r8
-        sbb	rax, QWORD PTR [r10+56]
-        mov	r8, QWORD PTR [rcx+64]
-        mov	QWORD PTR [rcx+56], rax
-        sbb	r8, QWORD PTR [r10+64]
-        mov	rax, QWORD PTR [rcx+72]
-        mov	QWORD PTR [rcx+64], r8
-        sbb	rax, QWORD PTR [r10+72]
-        mov	r8, QWORD PTR [rcx+80]
-        mov	QWORD PTR [rcx+72], rax
-        sbb	r8, QWORD PTR [r10+80]
-        mov	rax, QWORD PTR [rcx+88]
-        mov	QWORD PTR [rcx+80], r8
-        sbb	rax, QWORD PTR [r10+88]
-        mov	r8, QWORD PTR [rcx+96]
-        mov	QWORD PTR [rcx+88], rax
-        sbb	r8, QWORD PTR [r10+96]
-        mov	rax, QWORD PTR [rcx+104]
-        mov	QWORD PTR [rcx+96], r8
-        sbb	rax, QWORD PTR [r10+104]
-        mov	r8, QWORD PTR [rcx+112]
-        mov	QWORD PTR [rcx+104], rax
-        sbb	r8, QWORD PTR [r10+112]
-        mov	rax, QWORD PTR [rcx+120]
-        mov	QWORD PTR [rcx+112], r8
-        sbb	rax, QWORD PTR [r10+120]
-        mov	r8, QWORD PTR [rcx+128]
-        mov	QWORD PTR [rcx+120], rax
-        sbb	r8, QWORD PTR [r10+128]
-        mov	rax, QWORD PTR [rcx+136]
-        mov	QWORD PTR [rcx+128], r8
-        sbb	rax, QWORD PTR [r10+136]
-        mov	r8, QWORD PTR [rcx+144]
-        mov	QWORD PTR [rcx+136], rax
-        sbb	r8, QWORD PTR [r10+144]
-        mov	rax, QWORD PTR [rcx+152]
-        mov	QWORD PTR [rcx+144], r8
-        sbb	rax, QWORD PTR [r10+152]
-        mov	r8, QWORD PTR [rcx+160]
-        mov	QWORD PTR [rcx+152], rax
-        sbb	r8, QWORD PTR [r10+160]
-        mov	rax, QWORD PTR [rcx+168]
-        mov	QWORD PTR [rcx+160], r8
-        sbb	rax, QWORD PTR [r10+168]
-        mov	r8, QWORD PTR [rcx+176]
-        mov	QWORD PTR [rcx+168], rax
-        sbb	r8, QWORD PTR [r10+176]
-        mov	rax, QWORD PTR [rcx+184]
-        mov	QWORD PTR [rcx+176], r8
-        sbb	rax, QWORD PTR [r10+184]
-        mov	r8, QWORD PTR [rcx+192]
-        mov	QWORD PTR [rcx+184], rax
-        sbb	r8, QWORD PTR [r10+192]
-        mov	rax, QWORD PTR [rcx+200]
-        mov	QWORD PTR [rcx+192], r8
-        sbb	rax, QWORD PTR [r10+200]
-        mov	r8, QWORD PTR [rcx+208]
-        mov	QWORD PTR [rcx+200], rax
-        sbb	r8, QWORD PTR [r10+208]
-        mov	rax, QWORD PTR [rcx+216]
-        mov	QWORD PTR [rcx+208], r8
-        sbb	rax, QWORD PTR [r10+216]
-        mov	r8, QWORD PTR [rcx+224]
-        mov	QWORD PTR [rcx+216], rax
-        sbb	r8, QWORD PTR [r10+224]
-        mov	rax, QWORD PTR [rcx+232]
-        mov	QWORD PTR [rcx+224], r8
-        sbb	rax, QWORD PTR [r10+232]
-        mov	r8, QWORD PTR [rcx+240]
-        mov	QWORD PTR [rcx+232], rax
-        sbb	r8, QWORD PTR [r10+240]
-        mov	rax, QWORD PTR [rcx+248]
-        mov	QWORD PTR [rcx+240], r8
-        sbb	rax, QWORD PTR [r10+248]
-        mov	QWORD PTR [rcx+248], rax
-        sbb	r9, 0
-        mov	rcx, QWORD PTR [rsp+512]
-        add	rcx, 768
-        ; Add in word
-        mov	r8, QWORD PTR [rcx]
-        add	r8, r9
-        mov	rax, QWORD PTR [rcx+8]
-        mov	QWORD PTR [rcx], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+16]
-        mov	QWORD PTR [rcx+8], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+24]
-        mov	QWORD PTR [rcx+16], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+32]
-        mov	QWORD PTR [rcx+24], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+40]
-        mov	QWORD PTR [rcx+32], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+48]
-        mov	QWORD PTR [rcx+40], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+56]
-        mov	QWORD PTR [rcx+48], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+64]
-        mov	QWORD PTR [rcx+56], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+72]
-        mov	QWORD PTR [rcx+64], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+80]
-        mov	QWORD PTR [rcx+72], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+88]
-        mov	QWORD PTR [rcx+80], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+96]
-        mov	QWORD PTR [rcx+88], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+104]
-        mov	QWORD PTR [rcx+96], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+112]
-        mov	QWORD PTR [rcx+104], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+120]
-        mov	QWORD PTR [rcx+112], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+128]
-        mov	QWORD PTR [rcx+120], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+136]
-        mov	QWORD PTR [rcx+128], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+144]
-        mov	QWORD PTR [rcx+136], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+152]
-        mov	QWORD PTR [rcx+144], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+160]
-        mov	QWORD PTR [rcx+152], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+168]
-        mov	QWORD PTR [rcx+160], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+176]
-        mov	QWORD PTR [rcx+168], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+184]
-        mov	QWORD PTR [rcx+176], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+192]
-        mov	QWORD PTR [rcx+184], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+200]
-        mov	QWORD PTR [rcx+192], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+208]
-        mov	QWORD PTR [rcx+200], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+216]
-        mov	QWORD PTR [rcx+208], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+224]
-        mov	QWORD PTR [rcx+216], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+232]
-        mov	QWORD PTR [rcx+224], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+240]
-        mov	QWORD PTR [rcx+232], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+248]
-        mov	QWORD PTR [rcx+240], r8
-        adc	rax, 0
-        mov	QWORD PTR [rcx+248], rax
-        mov	rdx, QWORD PTR [rsp+520]
-        mov	rcx, QWORD PTR [rsp+512]
-        add	rsp, 528
-        ret
-sp_4096_sqr_64 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
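-; /* Square a and store the result in r. (r = a * a)
-;  *
-;  * Karatsuba: computes ah^2, al^2 and (al - ah)^2, where al and ah are the
-;  * low and high 32-word (256-byte) halves of a.
-;  *
-;  * r  Result of the squaring: a single precision integer.
-;  * a  Value to square: a single precision integer (64 64-bit words).
-;  */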
-_text SEGMENT READONLY PARA
-sp_4096_sqr_avx2_64 PROC
-        sub	rsp, 528
-        mov	QWORD PTR [rsp+512], rcx
-        mov	QWORD PTR [rsp+520], rdx
-        mov	r9, 0
-        mov	r10, rsp
-        lea	r11, QWORD PTR [rdx+256]
-        mov	rax, QWORD PTR [rdx]
-        sub	rax, QWORD PTR [r11]
-        mov	r8, QWORD PTR [rdx+8]
-        mov	QWORD PTR [r10], rax
-        sbb	r8, QWORD PTR [r11+8]
-        mov	rax, QWORD PTR [rdx+16]
-        mov	QWORD PTR [r10+8], r8
-        sbb	rax, QWORD PTR [r11+16]
-        mov	r8, QWORD PTR [rdx+24]
-        mov	QWORD PTR [r10+16], rax
-        sbb	r8, QWORD PTR [r11+24]
-        mov	rax, QWORD PTR [rdx+32]
-        mov	QWORD PTR [r10+24], r8
-        sbb	rax, QWORD PTR [r11+32]
-        mov	r8, QWORD PTR [rdx+40]
-        mov	QWORD PTR [r10+32], rax
-        sbb	r8, QWORD PTR [r11+40]
-        mov	rax, QWORD PTR [rdx+48]
-        mov	QWORD PTR [r10+40], r8
-        sbb	rax, QWORD PTR [r11+48]
-        mov	r8, QWORD PTR [rdx+56]
-        mov	QWORD PTR [r10+48], rax
-        sbb	r8, QWORD PTR [r11+56]
-        mov	rax, QWORD PTR [rdx+64]
-        mov	QWORD PTR [r10+56], r8
-        sbb	rax, QWORD PTR [r11+64]
-        mov	r8, QWORD PTR [rdx+72]
-        mov	QWORD PTR [r10+64], rax
-        sbb	r8, QWORD PTR [r11+72]
-        mov	rax, QWORD PTR [rdx+80]
-        mov	QWORD PTR [r10+72], r8
-        sbb	rax, QWORD PTR [r11+80]
-        mov	r8, QWORD PTR [rdx+88]
-        mov	QWORD PTR [r10+80], rax
-        sbb	r8, QWORD PTR [r11+88]
-        mov	rax, QWORD PTR [rdx+96]
-        mov	QWORD PTR [r10+88], r8
-        sbb	rax, QWORD PTR [r11+96]
-        mov	r8, QWORD PTR [rdx+104]
-        mov	QWORD PTR [r10+96], rax
-        sbb	r8, QWORD PTR [r11+104]
-        mov	rax, QWORD PTR [rdx+112]
-        mov	QWORD PTR [r10+104], r8
-        sbb	rax, QWORD PTR [r11+112]
-        mov	r8, QWORD PTR [rdx+120]
-        mov	QWORD PTR [r10+112], rax
-        sbb	r8, QWORD PTR [r11+120]
-        mov	rax, QWORD PTR [rdx+128]
-        mov	QWORD PTR [r10+120], r8
-        sbb	rax, QWORD PTR [r11+128]
-        mov	r8, QWORD PTR [rdx+136]
-        mov	QWORD PTR [r10+128], rax
-        sbb	r8, QWORD PTR [r11+136]
-        mov	rax, QWORD PTR [rdx+144]
-        mov	QWORD PTR [r10+136], r8
-        sbb	rax, QWORD PTR [r11+144]
-        mov	r8, QWORD PTR [rdx+152]
-        mov	QWORD PTR [r10+144], rax
-        sbb	r8, QWORD PTR [r11+152]
-        mov	rax, QWORD PTR [rdx+160]
-        mov	QWORD PTR [r10+152], r8
-        sbb	rax, QWORD PTR [r11+160]
-        mov	r8, QWORD PTR [rdx+168]
-        mov	QWORD PTR [r10+160], rax
-        sbb	r8, QWORD PTR [r11+168]
-        mov	rax, QWORD PTR [rdx+176]
-        mov	QWORD PTR [r10+168], r8
-        sbb	rax, QWORD PTR [r11+176]
-        mov	r8, QWORD PTR [rdx+184]
-        mov	QWORD PTR [r10+176], rax
-        sbb	r8, QWORD PTR [r11+184]
-        mov	rax, QWORD PTR [rdx+192]
-        mov	QWORD PTR [r10+184], r8
-        sbb	rax, QWORD PTR [r11+192]
-        mov	r8, QWORD PTR [rdx+200]
-        mov	QWORD PTR [r10+192], rax
-        sbb	r8, QWORD PTR [r11+200]
-        mov	rax, QWORD PTR [rdx+208]
-        mov	QWORD PTR [r10+200], r8
-        sbb	rax, QWORD PTR [r11+208]
-        mov	r8, QWORD PTR [rdx+216]
-        mov	QWORD PTR [r10+208], rax
-        sbb	r8, QWORD PTR [r11+216]
-        mov	rax, QWORD PTR [rdx+224]
-        mov	QWORD PTR [r10+216], r8
-        sbb	rax, QWORD PTR [r11+224]
-        mov	r8, QWORD PTR [rdx+232]
-        mov	QWORD PTR [r10+224], rax
-        sbb	r8, QWORD PTR [r11+232]
-        mov	rax, QWORD PTR [rdx+240]
-        mov	QWORD PTR [r10+232], r8
-        sbb	rax, QWORD PTR [r11+240]
-        mov	r8, QWORD PTR [rdx+248]
-        mov	QWORD PTR [r10+240], rax
-        sbb	r8, QWORD PTR [r11+248]
-        mov	QWORD PTR [r10+248], r8
-        sbb	r9, 0
-        ; Cond Negate
-        mov	rax, QWORD PTR [r10]
-        mov	r11, r9
-        xor	rax, r9
-        neg	r11
-        sub	rax, r9
-        mov	r8, QWORD PTR [r10+8]
-        sbb	r11, 0
-        mov	QWORD PTR [r10], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+16]
-        setc	r11b
-        mov	QWORD PTR [r10+8], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+24]
-        setc	r11b
-        mov	QWORD PTR [r10+16], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+32]
-        setc	r11b
-        mov	QWORD PTR [r10+24], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+40]
-        setc	r11b
-        mov	QWORD PTR [r10+32], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+48]
-        setc	r11b
-        mov	QWORD PTR [r10+40], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+56]
-        setc	r11b
-        mov	QWORD PTR [r10+48], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+64]
-        setc	r11b
-        mov	QWORD PTR [r10+56], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+72]
-        setc	r11b
-        mov	QWORD PTR [r10+64], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+80]
-        setc	r11b
-        mov	QWORD PTR [r10+72], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+88]
-        setc	r11b
-        mov	QWORD PTR [r10+80], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+96]
-        setc	r11b
-        mov	QWORD PTR [r10+88], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+104]
-        setc	r11b
-        mov	QWORD PTR [r10+96], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+112]
-        setc	r11b
-        mov	QWORD PTR [r10+104], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+120]
-        setc	r11b
-        mov	QWORD PTR [r10+112], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+128]
-        setc	r11b
-        mov	QWORD PTR [r10+120], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+136]
-        setc	r11b
-        mov	QWORD PTR [r10+128], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+144]
-        setc	r11b
-        mov	QWORD PTR [r10+136], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+152]
-        setc	r11b
-        mov	QWORD PTR [r10+144], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+160]
-        setc	r11b
-        mov	QWORD PTR [r10+152], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+168]
-        setc	r11b
-        mov	QWORD PTR [r10+160], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+176]
-        setc	r11b
-        mov	QWORD PTR [r10+168], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+184]
-        setc	r11b
-        mov	QWORD PTR [r10+176], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+192]
-        setc	r11b
-        mov	QWORD PTR [r10+184], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+200]
-        setc	r11b
-        mov	QWORD PTR [r10+192], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+208]
-        setc	r11b
-        mov	QWORD PTR [r10+200], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+216]
-        setc	r11b
-        mov	QWORD PTR [r10+208], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+224]
-        setc	r11b
-        mov	QWORD PTR [r10+216], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+232]
-        setc	r11b
-        mov	QWORD PTR [r10+224], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	rax, QWORD PTR [r10+240]
-        setc	r11b
-        mov	QWORD PTR [r10+232], r8
-        xor	rax, r9
-        add	rax, r11
-        mov	r8, QWORD PTR [r10+248]
-        setc	r11b
-        mov	QWORD PTR [r10+240], rax
-        xor	r8, r9
-        add	r8, r11
-        mov	QWORD PTR [r10+248], r8
-        mov	rdx, r10
-        mov	rcx, rsp
-        call	sp_2048_sqr_avx2_32
-        mov	rdx, QWORD PTR [rsp+520]
-        mov	rcx, QWORD PTR [rsp+512]
-        add	rdx, 256
-        add	rcx, 512
-        call	sp_2048_sqr_avx2_32
-        mov	rdx, QWORD PTR [rsp+520]
-        mov	rcx, QWORD PTR [rsp+512]
-        call	sp_2048_sqr_avx2_32
-IFDEF _WIN64
-        mov	rdx, QWORD PTR [rsp+520]
-        mov	rcx, QWORD PTR [rsp+512]
-ENDIF
-        mov	rdx, QWORD PTR [rsp+512]
-        lea	r10, QWORD PTR [rsp+256]
-        add	rdx, 768
-        mov	r9, 0
-        mov	r8, QWORD PTR [r10+-256]
-        sub	r8, QWORD PTR [rdx+-256]
-        mov	rax, QWORD PTR [r10+-248]
-        mov	QWORD PTR [r10+-256], r8
-        sbb	rax, QWORD PTR [rdx+-248]
-        mov	r8, QWORD PTR [r10+-240]
-        mov	QWORD PTR [r10+-248], rax
-        sbb	r8, QWORD PTR [rdx+-240]
-        mov	rax, QWORD PTR [r10+-232]
-        mov	QWORD PTR [r10+-240], r8
-        sbb	rax, QWORD PTR [rdx+-232]
-        mov	r8, QWORD PTR [r10+-224]
-        mov	QWORD PTR [r10+-232], rax
-        sbb	r8, QWORD PTR [rdx+-224]
-        mov	rax, QWORD PTR [r10+-216]
-        mov	QWORD PTR [r10+-224], r8
-        sbb	rax, QWORD PTR [rdx+-216]
-        mov	r8, QWORD PTR [r10+-208]
-        mov	QWORD PTR [r10+-216], rax
-        sbb	r8, QWORD PTR [rdx+-208]
-        mov	rax, QWORD PTR [r10+-200]
-        mov	QWORD PTR [r10+-208], r8
-        sbb	rax, QWORD PTR [rdx+-200]
-        mov	r8, QWORD PTR [r10+-192]
-        mov	QWORD PTR [r10+-200], rax
-        sbb	r8, QWORD PTR [rdx+-192]
-        mov	rax, QWORD PTR [r10+-184]
-        mov	QWORD PTR [r10+-192], r8
-        sbb	rax, QWORD PTR [rdx+-184]
-        mov	r8, QWORD PTR [r10+-176]
-        mov	QWORD PTR [r10+-184], rax
-        sbb	r8, QWORD PTR [rdx+-176]
-        mov	rax, QWORD PTR [r10+-168]
-        mov	QWORD PTR [r10+-176], r8
-        sbb	rax, QWORD PTR [rdx+-168]
-        mov	r8, QWORD PTR [r10+-160]
-        mov	QWORD PTR [r10+-168], rax
-        sbb	r8, QWORD PTR [rdx+-160]
-        mov	rax, QWORD PTR [r10+-152]
-        mov	QWORD PTR [r10+-160], r8
-        sbb	rax, QWORD PTR [rdx+-152]
-        mov	r8, QWORD PTR [r10+-144]
-        mov	QWORD PTR [r10+-152], rax
-        sbb	r8, QWORD PTR [rdx+-144]
-        mov	rax, QWORD PTR [r10+-136]
-        mov	QWORD PTR [r10+-144], r8
-        sbb	rax, QWORD PTR [rdx+-136]
-        mov	r8, QWORD PTR [r10+-128]
-        mov	QWORD PTR [r10+-136], rax
-        sbb	r8, QWORD PTR [rdx+-128]
-        mov	rax, QWORD PTR [r10+-120]
-        mov	QWORD PTR [r10+-128], r8
-        sbb	rax, QWORD PTR [rdx+-120]
-        mov	r8, QWORD PTR [r10+-112]
-        mov	QWORD PTR [r10+-120], rax
-        sbb	r8, QWORD PTR [rdx+-112]
-        mov	rax, QWORD PTR [r10+-104]
-        mov	QWORD PTR [r10+-112], r8
-        sbb	rax, QWORD PTR [rdx+-104]
-        mov	r8, QWORD PTR [r10+-96]
-        mov	QWORD PTR [r10+-104], rax
-        sbb	r8, QWORD PTR [rdx+-96]
-        mov	rax, QWORD PTR [r10+-88]
-        mov	QWORD PTR [r10+-96], r8
-        sbb	rax, QWORD PTR [rdx+-88]
-        mov	r8, QWORD PTR [r10+-80]
-        mov	QWORD PTR [r10+-88], rax
-        sbb	r8, QWORD PTR [rdx+-80]
-        mov	rax, QWORD PTR [r10+-72]
-        mov	QWORD PTR [r10+-80], r8
-        sbb	rax, QWORD PTR [rdx+-72]
-        mov	r8, QWORD PTR [r10+-64]
-        mov	QWORD PTR [r10+-72], rax
-        sbb	r8, QWORD PTR [rdx+-64]
-        mov	rax, QWORD PTR [r10+-56]
-        mov	QWORD PTR [r10+-64], r8
-        sbb	rax, QWORD PTR [rdx+-56]
-        mov	r8, QWORD PTR [r10+-48]
-        mov	QWORD PTR [r10+-56], rax
-        sbb	r8, QWORD PTR [rdx+-48]
-        mov	rax, QWORD PTR [r10+-40]
-        mov	QWORD PTR [r10+-48], r8
-        sbb	rax, QWORD PTR [rdx+-40]
-        mov	r8, QWORD PTR [r10+-32]
-        mov	QWORD PTR [r10+-40], rax
-        sbb	r8, QWORD PTR [rdx+-32]
-        mov	rax, QWORD PTR [r10+-24]
-        mov	QWORD PTR [r10+-32], r8
-        sbb	rax, QWORD PTR [rdx+-24]
-        mov	r8, QWORD PTR [r10+-16]
-        mov	QWORD PTR [r10+-24], rax
-        sbb	r8, QWORD PTR [rdx+-16]
-        mov	rax, QWORD PTR [r10+-8]
-        mov	QWORD PTR [r10+-16], r8
-        sbb	rax, QWORD PTR [rdx+-8]
-        mov	r8, QWORD PTR [r10]
-        mov	QWORD PTR [r10+-8], rax
-        sbb	r8, QWORD PTR [rdx]
-        mov	rax, QWORD PTR [r10+8]
-        mov	QWORD PTR [r10], r8
-        sbb	rax, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [r10+16]
-        mov	QWORD PTR [r10+8], rax
-        sbb	r8, QWORD PTR [rdx+16]
-        mov	rax, QWORD PTR [r10+24]
-        mov	QWORD PTR [r10+16], r8
-        sbb	rax, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [r10+32]
-        mov	QWORD PTR [r10+24], rax
-        sbb	r8, QWORD PTR [rdx+32]
-        mov	rax, QWORD PTR [r10+40]
-        mov	QWORD PTR [r10+32], r8
-        sbb	rax, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [r10+48]
-        mov	QWORD PTR [r10+40], rax
-        sbb	r8, QWORD PTR [rdx+48]
-        mov	rax, QWORD PTR [r10+56]
-        mov	QWORD PTR [r10+48], r8
-        sbb	rax, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [r10+64]
-        mov	QWORD PTR [r10+56], rax
-        sbb	r8, QWORD PTR [rdx+64]
-        mov	rax, QWORD PTR [r10+72]
-        mov	QWORD PTR [r10+64], r8
-        sbb	rax, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [r10+80]
-        mov	QWORD PTR [r10+72], rax
-        sbb	r8, QWORD PTR [rdx+80]
-        mov	rax, QWORD PTR [r10+88]
-        mov	QWORD PTR [r10+80], r8
-        sbb	rax, QWORD PTR [rdx+88]
-        mov	r8, QWORD PTR [r10+96]
-        mov	QWORD PTR [r10+88], rax
-        sbb	r8, QWORD PTR [rdx+96]
-        mov	rax, QWORD PTR [r10+104]
-        mov	QWORD PTR [r10+96], r8
-        sbb	rax, QWORD PTR [rdx+104]
-        mov	r8, QWORD PTR [r10+112]
-        mov	QWORD PTR [r10+104], rax
-        sbb	r8, QWORD PTR [rdx+112]
-        mov	rax, QWORD PTR [r10+120]
-        mov	QWORD PTR [r10+112], r8
-        sbb	rax, QWORD PTR [rdx+120]
-        mov	r8, QWORD PTR [r10+128]
-        mov	QWORD PTR [r10+120], rax
-        sbb	r8, QWORD PTR [rdx+128]
-        mov	rax, QWORD PTR [r10+136]
-        mov	QWORD PTR [r10+128], r8
-        sbb	rax, QWORD PTR [rdx+136]
-        mov	r8, QWORD PTR [r10+144]
-        mov	QWORD PTR [r10+136], rax
-        sbb	r8, QWORD PTR [rdx+144]
-        mov	rax, QWORD PTR [r10+152]
-        mov	QWORD PTR [r10+144], r8
-        sbb	rax, QWORD PTR [rdx+152]
-        mov	r8, QWORD PTR [r10+160]
-        mov	QWORD PTR [r10+152], rax
-        sbb	r8, QWORD PTR [rdx+160]
-        mov	rax, QWORD PTR [r10+168]
-        mov	QWORD PTR [r10+160], r8
-        sbb	rax, QWORD PTR [rdx+168]
-        mov	r8, QWORD PTR [r10+176]
-        mov	QWORD PTR [r10+168], rax
-        sbb	r8, QWORD PTR [rdx+176]
-        mov	rax, QWORD PTR [r10+184]
-        mov	QWORD PTR [r10+176], r8
-        sbb	rax, QWORD PTR [rdx+184]
-        mov	r8, QWORD PTR [r10+192]
-        mov	QWORD PTR [r10+184], rax
-        sbb	r8, QWORD PTR [rdx+192]
-        mov	rax, QWORD PTR [r10+200]
-        mov	QWORD PTR [r10+192], r8
-        sbb	rax, QWORD PTR [rdx+200]
-        mov	r8, QWORD PTR [r10+208]
-        mov	QWORD PTR [r10+200], rax
-        sbb	r8, QWORD PTR [rdx+208]
-        mov	rax, QWORD PTR [r10+216]
-        mov	QWORD PTR [r10+208], r8
-        sbb	rax, QWORD PTR [rdx+216]
-        mov	r8, QWORD PTR [r10+224]
-        mov	QWORD PTR [r10+216], rax
-        sbb	r8, QWORD PTR [rdx+224]
-        mov	rax, QWORD PTR [r10+232]
-        mov	QWORD PTR [r10+224], r8
-        sbb	rax, QWORD PTR [rdx+232]
-        mov	r8, QWORD PTR [r10+240]
-        mov	QWORD PTR [r10+232], rax
-        sbb	r8, QWORD PTR [rdx+240]
-        mov	rax, QWORD PTR [r10+248]
-        mov	QWORD PTR [r10+240], r8
-        sbb	rax, QWORD PTR [rdx+248]
-        mov	QWORD PTR [r10+248], rax
-        sbb	r9, 0
-        sub	rdx, 512
-        mov	r8, QWORD PTR [r10+-256]
-        sub	r8, QWORD PTR [rdx+-256]
-        mov	rax, QWORD PTR [r10+-248]
-        mov	QWORD PTR [r10+-256], r8
-        sbb	rax, QWORD PTR [rdx+-248]
-        mov	r8, QWORD PTR [r10+-240]
-        mov	QWORD PTR [r10+-248], rax
-        sbb	r8, QWORD PTR [rdx+-240]
-        mov	rax, QWORD PTR [r10+-232]
-        mov	QWORD PTR [r10+-240], r8
-        sbb	rax, QWORD PTR [rdx+-232]
-        mov	r8, QWORD PTR [r10+-224]
-        mov	QWORD PTR [r10+-232], rax
-        sbb	r8, QWORD PTR [rdx+-224]
-        mov	rax, QWORD PTR [r10+-216]
-        mov	QWORD PTR [r10+-224], r8
-        sbb	rax, QWORD PTR [rdx+-216]
-        mov	r8, QWORD PTR [r10+-208]
-        mov	QWORD PTR [r10+-216], rax
-        sbb	r8, QWORD PTR [rdx+-208]
-        mov	rax, QWORD PTR [r10+-200]
-        mov	QWORD PTR [r10+-208], r8
-        sbb	rax, QWORD PTR [rdx+-200]
-        mov	r8, QWORD PTR [r10+-192]
-        mov	QWORD PTR [r10+-200], rax
-        sbb	r8, QWORD PTR [rdx+-192]
-        mov	rax, QWORD PTR [r10+-184]
-        mov	QWORD PTR [r10+-192], r8
-        sbb	rax, QWORD PTR [rdx+-184]
-        mov	r8, QWORD PTR [r10+-176]
-        mov	QWORD PTR [r10+-184], rax
-        sbb	r8, QWORD PTR [rdx+-176]
-        mov	rax, QWORD PTR [r10+-168]
-        mov	QWORD PTR [r10+-176], r8
-        sbb	rax, QWORD PTR [rdx+-168]
-        mov	r8, QWORD PTR [r10+-160]
-        mov	QWORD PTR [r10+-168], rax
-        sbb	r8, QWORD PTR [rdx+-160]
-        mov	rax, QWORD PTR [r10+-152]
-        mov	QWORD PTR [r10+-160], r8
-        sbb	rax, QWORD PTR [rdx+-152]
-        mov	r8, QWORD PTR [r10+-144]
-        mov	QWORD PTR [r10+-152], rax
-        sbb	r8, QWORD PTR [rdx+-144]
-        mov	rax, QWORD PTR [r10+-136]
-        mov	QWORD PTR [r10+-144], r8
-        sbb	rax, QWORD PTR [rdx+-136]
-        mov	r8, QWORD PTR [r10+-128]
-        mov	QWORD PTR [r10+-136], rax
-        sbb	r8, QWORD PTR [rdx+-128]
-        mov	rax, QWORD PTR [r10+-120]
-        mov	QWORD PTR [r10+-128], r8
-        sbb	rax, QWORD PTR [rdx+-120]
-        mov	r8, QWORD PTR [r10+-112]
-        mov	QWORD PTR [r10+-120], rax
-        sbb	r8, QWORD PTR [rdx+-112]
-        mov	rax, QWORD PTR [r10+-104]
-        mov	QWORD PTR [r10+-112], r8
-        sbb	rax, QWORD PTR [rdx+-104]
-        mov	r8, QWORD PTR [r10+-96]
-        mov	QWORD PTR [r10+-104], rax
-        sbb	r8, QWORD PTR [rdx+-96]
-        mov	rax, QWORD PTR [r10+-88]
-        mov	QWORD PTR [r10+-96], r8
-        sbb	rax, QWORD PTR [rdx+-88]
-        mov	r8, QWORD PTR [r10+-80]
-        mov	QWORD PTR [r10+-88], rax
-        sbb	r8, QWORD PTR [rdx+-80]
-        mov	rax, QWORD PTR [r10+-72]
-        mov	QWORD PTR [r10+-80], r8
-        sbb	rax, QWORD PTR [rdx+-72]
-        mov	r8, QWORD PTR [r10+-64]
-        mov	QWORD PTR [r10+-72], rax
-        sbb	r8, QWORD PTR [rdx+-64]
-        mov	rax, QWORD PTR [r10+-56]
-        mov	QWORD PTR [r10+-64], r8
-        sbb	rax, QWORD PTR [rdx+-56]
-        mov	r8, QWORD PTR [r10+-48]
-        mov	QWORD PTR [r10+-56], rax
-        sbb	r8, QWORD PTR [rdx+-48]
-        mov	rax, QWORD PTR [r10+-40]
-        mov	QWORD PTR [r10+-48], r8
-        sbb	rax, QWORD PTR [rdx+-40]
-        mov	r8, QWORD PTR [r10+-32]
-        mov	QWORD PTR [r10+-40], rax
-        sbb	r8, QWORD PTR [rdx+-32]
-        mov	rax, QWORD PTR [r10+-24]
-        mov	QWORD PTR [r10+-32], r8
-        sbb	rax, QWORD PTR [rdx+-24]
-        mov	r8, QWORD PTR [r10+-16]
-        mov	QWORD PTR [r10+-24], rax
-        sbb	r8, QWORD PTR [rdx+-16]
-        mov	rax, QWORD PTR [r10+-8]
-        mov	QWORD PTR [r10+-16], r8
-        sbb	rax, QWORD PTR [rdx+-8]
-        mov	r8, QWORD PTR [r10]
-        mov	QWORD PTR [r10+-8], rax
-        sbb	r8, QWORD PTR [rdx]
-        mov	rax, QWORD PTR [r10+8]
-        mov	QWORD PTR [r10], r8
-        sbb	rax, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [r10+16]
-        mov	QWORD PTR [r10+8], rax
-        sbb	r8, QWORD PTR [rdx+16]
-        mov	rax, QWORD PTR [r10+24]
-        mov	QWORD PTR [r10+16], r8
-        sbb	rax, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [r10+32]
-        mov	QWORD PTR [r10+24], rax
-        sbb	r8, QWORD PTR [rdx+32]
-        mov	rax, QWORD PTR [r10+40]
-        mov	QWORD PTR [r10+32], r8
-        sbb	rax, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [r10+48]
-        mov	QWORD PTR [r10+40], rax
-        sbb	r8, QWORD PTR [rdx+48]
-        mov	rax, QWORD PTR [r10+56]
-        mov	QWORD PTR [r10+48], r8
-        sbb	rax, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [r10+64]
-        mov	QWORD PTR [r10+56], rax
-        sbb	r8, QWORD PTR [rdx+64]
-        mov	rax, QWORD PTR [r10+72]
-        mov	QWORD PTR [r10+64], r8
-        sbb	rax, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [r10+80]
-        mov	QWORD PTR [r10+72], rax
-        sbb	r8, QWORD PTR [rdx+80]
-        mov	rax, QWORD PTR [r10+88]
-        mov	QWORD PTR [r10+80], r8
-        sbb	rax, QWORD PTR [rdx+88]
-        mov	r8, QWORD PTR [r10+96]
-        mov	QWORD PTR [r10+88], rax
-        sbb	r8, QWORD PTR [rdx+96]
-        mov	rax, QWORD PTR [r10+104]
-        mov	QWORD PTR [r10+96], r8
-        sbb	rax, QWORD PTR [rdx+104]
-        mov	r8, QWORD PTR [r10+112]
-        mov	QWORD PTR [r10+104], rax
-        sbb	r8, QWORD PTR [rdx+112]
-        mov	rax, QWORD PTR [r10+120]
-        mov	QWORD PTR [r10+112], r8
-        sbb	rax, QWORD PTR [rdx+120]
-        mov	r8, QWORD PTR [r10+128]
-        mov	QWORD PTR [r10+120], rax
-        sbb	r8, QWORD PTR [rdx+128]
-        mov	rax, QWORD PTR [r10+136]
-        mov	QWORD PTR [r10+128], r8
-        sbb	rax, QWORD PTR [rdx+136]
-        mov	r8, QWORD PTR [r10+144]
-        mov	QWORD PTR [r10+136], rax
-        sbb	r8, QWORD PTR [rdx+144]
-        mov	rax, QWORD PTR [r10+152]
-        mov	QWORD PTR [r10+144], r8
-        sbb	rax, QWORD PTR [rdx+152]
-        mov	r8, QWORD PTR [r10+160]
-        mov	QWORD PTR [r10+152], rax
-        sbb	r8, QWORD PTR [rdx+160]
-        mov	rax, QWORD PTR [r10+168]
-        mov	QWORD PTR [r10+160], r8
-        sbb	rax, QWORD PTR [rdx+168]
-        mov	r8, QWORD PTR [r10+176]
-        mov	QWORD PTR [r10+168], rax
-        sbb	r8, QWORD PTR [rdx+176]
-        mov	rax, QWORD PTR [r10+184]
-        mov	QWORD PTR [r10+176], r8
-        sbb	rax, QWORD PTR [rdx+184]
-        mov	r8, QWORD PTR [r10+192]
-        mov	QWORD PTR [r10+184], rax
-        sbb	r8, QWORD PTR [rdx+192]
-        mov	rax, QWORD PTR [r10+200]
-        mov	QWORD PTR [r10+192], r8
-        sbb	rax, QWORD PTR [rdx+200]
-        mov	r8, QWORD PTR [r10+208]
-        mov	QWORD PTR [r10+200], rax
-        sbb	r8, QWORD PTR [rdx+208]
-        mov	rax, QWORD PTR [r10+216]
-        mov	QWORD PTR [r10+208], r8
-        sbb	rax, QWORD PTR [rdx+216]
-        mov	r8, QWORD PTR [r10+224]
-        mov	QWORD PTR [r10+216], rax
-        sbb	r8, QWORD PTR [rdx+224]
-        mov	rax, QWORD PTR [r10+232]
-        mov	QWORD PTR [r10+224], r8
-        sbb	rax, QWORD PTR [rdx+232]
-        mov	r8, QWORD PTR [r10+240]
-        mov	QWORD PTR [r10+232], rax
-        sbb	r8, QWORD PTR [rdx+240]
-        mov	rax, QWORD PTR [r10+248]
-        mov	QWORD PTR [r10+240], r8
-        sbb	rax, QWORD PTR [rdx+248]
-        mov	QWORD PTR [r10+248], rax
-        sbb	r9, 0
-        mov	rcx, QWORD PTR [rsp+512]
-        neg	r9
-        add	rcx, 512
-        mov	r8, QWORD PTR [rcx+-256]
-        sub	r8, QWORD PTR [r10+-256]
-        mov	rax, QWORD PTR [rcx+-248]
-        mov	QWORD PTR [rcx+-256], r8
-        sbb	rax, QWORD PTR [r10+-248]
-        mov	r8, QWORD PTR [rcx+-240]
-        mov	QWORD PTR [rcx+-248], rax
-        sbb	r8, QWORD PTR [r10+-240]
-        mov	rax, QWORD PTR [rcx+-232]
-        mov	QWORD PTR [rcx+-240], r8
-        sbb	rax, QWORD PTR [r10+-232]
-        mov	r8, QWORD PTR [rcx+-224]
-        mov	QWORD PTR [rcx+-232], rax
-        sbb	r8, QWORD PTR [r10+-224]
-        mov	rax, QWORD PTR [rcx+-216]
-        mov	QWORD PTR [rcx+-224], r8
-        sbb	rax, QWORD PTR [r10+-216]
-        mov	r8, QWORD PTR [rcx+-208]
-        mov	QWORD PTR [rcx+-216], rax
-        sbb	r8, QWORD PTR [r10+-208]
-        mov	rax, QWORD PTR [rcx+-200]
-        mov	QWORD PTR [rcx+-208], r8
-        sbb	rax, QWORD PTR [r10+-200]
-        mov	r8, QWORD PTR [rcx+-192]
-        mov	QWORD PTR [rcx+-200], rax
-        sbb	r8, QWORD PTR [r10+-192]
-        mov	rax, QWORD PTR [rcx+-184]
-        mov	QWORD PTR [rcx+-192], r8
-        sbb	rax, QWORD PTR [r10+-184]
-        mov	r8, QWORD PTR [rcx+-176]
-        mov	QWORD PTR [rcx+-184], rax
-        sbb	r8, QWORD PTR [r10+-176]
-        mov	rax, QWORD PTR [rcx+-168]
-        mov	QWORD PTR [rcx+-176], r8
-        sbb	rax, QWORD PTR [r10+-168]
-        mov	r8, QWORD PTR [rcx+-160]
-        mov	QWORD PTR [rcx+-168], rax
-        sbb	r8, QWORD PTR [r10+-160]
-        mov	rax, QWORD PTR [rcx+-152]
-        mov	QWORD PTR [rcx+-160], r8
-        sbb	rax, QWORD PTR [r10+-152]
-        mov	r8, QWORD PTR [rcx+-144]
-        mov	QWORD PTR [rcx+-152], rax
-        sbb	r8, QWORD PTR [r10+-144]
-        mov	rax, QWORD PTR [rcx+-136]
-        mov	QWORD PTR [rcx+-144], r8
-        sbb	rax, QWORD PTR [r10+-136]
-        mov	r8, QWORD PTR [rcx+-128]
-        mov	QWORD PTR [rcx+-136], rax
-        sbb	r8, QWORD PTR [r10+-128]
-        mov	rax, QWORD PTR [rcx+-120]
-        mov	QWORD PTR [rcx+-128], r8
-        sbb	rax, QWORD PTR [r10+-120]
-        mov	r8, QWORD PTR [rcx+-112]
-        mov	QWORD PTR [rcx+-120], rax
-        sbb	r8, QWORD PTR [r10+-112]
-        mov	rax, QWORD PTR [rcx+-104]
-        mov	QWORD PTR [rcx+-112], r8
-        sbb	rax, QWORD PTR [r10+-104]
-        mov	r8, QWORD PTR [rcx+-96]
-        mov	QWORD PTR [rcx+-104], rax
-        sbb	r8, QWORD PTR [r10+-96]
-        mov	rax, QWORD PTR [rcx+-88]
-        mov	QWORD PTR [rcx+-96], r8
-        sbb	rax, QWORD PTR [r10+-88]
-        mov	r8, QWORD PTR [rcx+-80]
-        mov	QWORD PTR [rcx+-88], rax
-        sbb	r8, QWORD PTR [r10+-80]
-        mov	rax, QWORD PTR [rcx+-72]
-        mov	QWORD PTR [rcx+-80], r8
-        sbb	rax, QWORD PTR [r10+-72]
-        mov	r8, QWORD PTR [rcx+-64]
-        mov	QWORD PTR [rcx+-72], rax
-        sbb	r8, QWORD PTR [r10+-64]
-        mov	rax, QWORD PTR [rcx+-56]
-        mov	QWORD PTR [rcx+-64], r8
-        sbb	rax, QWORD PTR [r10+-56]
-        mov	r8, QWORD PTR [rcx+-48]
-        mov	QWORD PTR [rcx+-56], rax
-        sbb	r8, QWORD PTR [r10+-48]
-        mov	rax, QWORD PTR [rcx+-40]
-        mov	QWORD PTR [rcx+-48], r8
-        sbb	rax, QWORD PTR [r10+-40]
-        mov	r8, QWORD PTR [rcx+-32]
-        mov	QWORD PTR [rcx+-40], rax
-        sbb	r8, QWORD PTR [r10+-32]
-        mov	rax, QWORD PTR [rcx+-24]
-        mov	QWORD PTR [rcx+-32], r8
-        sbb	rax, QWORD PTR [r10+-24]
-        mov	r8, QWORD PTR [rcx+-16]
-        mov	QWORD PTR [rcx+-24], rax
-        sbb	r8, QWORD PTR [r10+-16]
-        mov	rax, QWORD PTR [rcx+-8]
-        mov	QWORD PTR [rcx+-16], r8
-        sbb	rax, QWORD PTR [r10+-8]
-        mov	r8, QWORD PTR [rcx]
-        mov	QWORD PTR [rcx+-8], rax
-        sbb	r8, QWORD PTR [r10]
-        mov	rax, QWORD PTR [rcx+8]
-        mov	QWORD PTR [rcx], r8
-        sbb	rax, QWORD PTR [r10+8]
-        mov	r8, QWORD PTR [rcx+16]
-        mov	QWORD PTR [rcx+8], rax
-        sbb	r8, QWORD PTR [r10+16]
-        mov	rax, QWORD PTR [rcx+24]
-        mov	QWORD PTR [rcx+16], r8
-        sbb	rax, QWORD PTR [r10+24]
-        mov	r8, QWORD PTR [rcx+32]
-        mov	QWORD PTR [rcx+24], rax
-        sbb	r8, QWORD PTR [r10+32]
-        mov	rax, QWORD PTR [rcx+40]
-        mov	QWORD PTR [rcx+32], r8
-        sbb	rax, QWORD PTR [r10+40]
-        mov	r8, QWORD PTR [rcx+48]
-        mov	QWORD PTR [rcx+40], rax
-        sbb	r8, QWORD PTR [r10+48]
-        mov	rax, QWORD PTR [rcx+56]
-        mov	QWORD PTR [rcx+48], r8
-        sbb	rax, QWORD PTR [r10+56]
-        mov	r8, QWORD PTR [rcx+64]
-        mov	QWORD PTR [rcx+56], rax
-        sbb	r8, QWORD PTR [r10+64]
-        mov	rax, QWORD PTR [rcx+72]
-        mov	QWORD PTR [rcx+64], r8
-        sbb	rax, QWORD PTR [r10+72]
-        mov	r8, QWORD PTR [rcx+80]
-        mov	QWORD PTR [rcx+72], rax
-        sbb	r8, QWORD PTR [r10+80]
-        mov	rax, QWORD PTR [rcx+88]
-        mov	QWORD PTR [rcx+80], r8
-        sbb	rax, QWORD PTR [r10+88]
-        mov	r8, QWORD PTR [rcx+96]
-        mov	QWORD PTR [rcx+88], rax
-        sbb	r8, QWORD PTR [r10+96]
-        mov	rax, QWORD PTR [rcx+104]
-        mov	QWORD PTR [rcx+96], r8
-        sbb	rax, QWORD PTR [r10+104]
-        mov	r8, QWORD PTR [rcx+112]
-        mov	QWORD PTR [rcx+104], rax
-        sbb	r8, QWORD PTR [r10+112]
-        mov	rax, QWORD PTR [rcx+120]
-        mov	QWORD PTR [rcx+112], r8
-        sbb	rax, QWORD PTR [r10+120]
-        mov	r8, QWORD PTR [rcx+128]
-        mov	QWORD PTR [rcx+120], rax
-        sbb	r8, QWORD PTR [r10+128]
-        mov	rax, QWORD PTR [rcx+136]
-        mov	QWORD PTR [rcx+128], r8
-        sbb	rax, QWORD PTR [r10+136]
-        mov	r8, QWORD PTR [rcx+144]
-        mov	QWORD PTR [rcx+136], rax
-        sbb	r8, QWORD PTR [r10+144]
-        mov	rax, QWORD PTR [rcx+152]
-        mov	QWORD PTR [rcx+144], r8
-        sbb	rax, QWORD PTR [r10+152]
-        mov	r8, QWORD PTR [rcx+160]
-        mov	QWORD PTR [rcx+152], rax
-        sbb	r8, QWORD PTR [r10+160]
-        mov	rax, QWORD PTR [rcx+168]
-        mov	QWORD PTR [rcx+160], r8
-        sbb	rax, QWORD PTR [r10+168]
-        mov	r8, QWORD PTR [rcx+176]
-        mov	QWORD PTR [rcx+168], rax
-        sbb	r8, QWORD PTR [r10+176]
-        mov	rax, QWORD PTR [rcx+184]
-        mov	QWORD PTR [rcx+176], r8
-        sbb	rax, QWORD PTR [r10+184]
-        mov	r8, QWORD PTR [rcx+192]
-        mov	QWORD PTR [rcx+184], rax
-        sbb	r8, QWORD PTR [r10+192]
-        mov	rax, QWORD PTR [rcx+200]
-        mov	QWORD PTR [rcx+192], r8
-        sbb	rax, QWORD PTR [r10+200]
-        mov	r8, QWORD PTR [rcx+208]
-        mov	QWORD PTR [rcx+200], rax
-        sbb	r8, QWORD PTR [r10+208]
-        mov	rax, QWORD PTR [rcx+216]
-        mov	QWORD PTR [rcx+208], r8
-        sbb	rax, QWORD PTR [r10+216]
-        mov	r8, QWORD PTR [rcx+224]
-        mov	QWORD PTR [rcx+216], rax
-        sbb	r8, QWORD PTR [r10+224]
-        mov	rax, QWORD PTR [rcx+232]
-        mov	QWORD PTR [rcx+224], r8
-        sbb	rax, QWORD PTR [r10+232]
-        mov	r8, QWORD PTR [rcx+240]
-        mov	QWORD PTR [rcx+232], rax
-        sbb	r8, QWORD PTR [r10+240]
-        mov	rax, QWORD PTR [rcx+248]
-        mov	QWORD PTR [rcx+240], r8
-        sbb	rax, QWORD PTR [r10+248]
-        mov	QWORD PTR [rcx+248], rax
-        sbb	r9, 0
-        mov	rcx, QWORD PTR [rsp+512]
-        add	rcx, 768
-        ; Add in word
-        mov	r8, QWORD PTR [rcx]
-        add	r8, r9
-        mov	rax, QWORD PTR [rcx+8]
-        mov	QWORD PTR [rcx], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+16]
-        mov	QWORD PTR [rcx+8], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+24]
-        mov	QWORD PTR [rcx+16], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+32]
-        mov	QWORD PTR [rcx+24], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+40]
-        mov	QWORD PTR [rcx+32], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+48]
-        mov	QWORD PTR [rcx+40], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+56]
-        mov	QWORD PTR [rcx+48], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+64]
-        mov	QWORD PTR [rcx+56], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+72]
-        mov	QWORD PTR [rcx+64], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+80]
-        mov	QWORD PTR [rcx+72], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+88]
-        mov	QWORD PTR [rcx+80], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+96]
-        mov	QWORD PTR [rcx+88], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+104]
-        mov	QWORD PTR [rcx+96], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+112]
-        mov	QWORD PTR [rcx+104], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+120]
-        mov	QWORD PTR [rcx+112], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+128]
-        mov	QWORD PTR [rcx+120], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+136]
-        mov	QWORD PTR [rcx+128], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+144]
-        mov	QWORD PTR [rcx+136], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+152]
-        mov	QWORD PTR [rcx+144], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+160]
-        mov	QWORD PTR [rcx+152], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+168]
-        mov	QWORD PTR [rcx+160], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+176]
-        mov	QWORD PTR [rcx+168], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+184]
-        mov	QWORD PTR [rcx+176], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+192]
-        mov	QWORD PTR [rcx+184], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+200]
-        mov	QWORD PTR [rcx+192], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+208]
-        mov	QWORD PTR [rcx+200], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+216]
-        mov	QWORD PTR [rcx+208], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+224]
-        mov	QWORD PTR [rcx+216], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+232]
-        mov	QWORD PTR [rcx+224], r8
-        adc	rax, 0
-        mov	r8, QWORD PTR [rcx+240]
-        mov	QWORD PTR [rcx+232], rax
-        adc	r8, 0
-        mov	rax, QWORD PTR [rcx+248]
-        mov	QWORD PTR [rcx+240], r8
-        adc	rax, 0
-        mov	QWORD PTR [rcx+248], rax
-        mov	rdx, QWORD PTR [rsp+520]
-        mov	rcx, QWORD PTR [rsp+512]
-        add	rsp, 528
-        ret
-sp_4096_sqr_avx2_64 ENDP
-_text ENDS
-ENDIF
-; /* Multiply a by the single digit b and store the product in r. (r = a * b)
-;  *
-;  * r  A single precision integer that receives the result.
-;  * a  A single precision integer (the multiplicand).
-;  * b  A single precision digit (the multiplier).
-;  */
-_text SEGMENT READONLY PARA
-sp_4096_mul_d_64 PROC  ; r = a * b: rcx = r (65 qwords written), rdx = a (64 qwords read), r8 = b (one 64-bit digit)
-        push	r12  ; r12 serves as the third accumulator register; saved here and restored before ret
-        mov	r9, rdx  ; keep the pointer to a in r9 because every mul overwrites rdx
-        ; A[0] * B
-        mov	rax, r8  ; mul takes its implicit operand from rax, so reload b before each multiply
-        xor	r12, r12  ; clear the accumulator that will collect the next step's high word
-        mul	QWORD PTR [r9]  ; rdx:rax = a[0] * b
-        mov	r10, rax
-        mov	r11, rdx  ; r11 = high word, carried into r[1]
-        mov	QWORD PTR [rcx], r10  ; r[0] = low word of a[0] * b
-        ; A[1] * B
-        mov	rax, r8
-        xor	r10, r10  ; r10 was just stored; recycle it as the accumulator two words ahead
-        mul	QWORD PTR [r9+8]
-        add	r11, rax  ; pending high word + new low word
-        mov	QWORD PTR [rcx+8], r11  ; mov leaves the flags alone, so CF from the add is still valid below
-        adc	r12, rdx  ; next word = new high word + carry
-        adc	r10, 0  ; pass any carry out of that addition into the word after it
-        ; A[2] * B
-        mov	rax, r8  ; same step again with the roles of r10/r11/r12 rotated by one
-        xor	r11, r11
-        mul	QWORD PTR [r9+16]
-        add	r12, rax
-        mov	QWORD PTR [rcx+16], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[3] * B
-        mov	rax, r8  ; the register roles repeat every three steps from here to A[62]
-        xor	r12, r12
-        mul	QWORD PTR [r9+24]
-        add	r10, rax
-        mov	QWORD PTR [rcx+24], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[4] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+32]
-        add	r11, rax
-        mov	QWORD PTR [rcx+32], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[5] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+40]
-        add	r12, rax
-        mov	QWORD PTR [rcx+40], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[6] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+48]
-        add	r10, rax
-        mov	QWORD PTR [rcx+48], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[7] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+56]
-        add	r11, rax
-        mov	QWORD PTR [rcx+56], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[8] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+64]
-        add	r12, rax
-        mov	QWORD PTR [rcx+64], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[9] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+72]
-        add	r10, rax
-        mov	QWORD PTR [rcx+72], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[10] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+80]
-        add	r11, rax
-        mov	QWORD PTR [rcx+80], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[11] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+88]
-        add	r12, rax
-        mov	QWORD PTR [rcx+88], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[12] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+96]
-        add	r10, rax
-        mov	QWORD PTR [rcx+96], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[13] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+104]
-        add	r11, rax
-        mov	QWORD PTR [rcx+104], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[14] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+112]
-        add	r12, rax
-        mov	QWORD PTR [rcx+112], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[15] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+120]
-        add	r10, rax
-        mov	QWORD PTR [rcx+120], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[16] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+128]
-        add	r11, rax
-        mov	QWORD PTR [rcx+128], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[17] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+136]
-        add	r12, rax
-        mov	QWORD PTR [rcx+136], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[18] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+144]
-        add	r10, rax
-        mov	QWORD PTR [rcx+144], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[19] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+152]
-        add	r11, rax
-        mov	QWORD PTR [rcx+152], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[20] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+160]
-        add	r12, rax
-        mov	QWORD PTR [rcx+160], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[21] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+168]
-        add	r10, rax
-        mov	QWORD PTR [rcx+168], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[22] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+176]
-        add	r11, rax
-        mov	QWORD PTR [rcx+176], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[23] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+184]
-        add	r12, rax
-        mov	QWORD PTR [rcx+184], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[24] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+192]
-        add	r10, rax
-        mov	QWORD PTR [rcx+192], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[25] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+200]
-        add	r11, rax
-        mov	QWORD PTR [rcx+200], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[26] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+208]
-        add	r12, rax
-        mov	QWORD PTR [rcx+208], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[27] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+216]
-        add	r10, rax
-        mov	QWORD PTR [rcx+216], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[28] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+224]
-        add	r11, rax
-        mov	QWORD PTR [rcx+224], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[29] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+232]
-        add	r12, rax
-        mov	QWORD PTR [rcx+232], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[30] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+240]
-        add	r10, rax
-        mov	QWORD PTR [rcx+240], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[31] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+248]
-        add	r11, rax
-        mov	QWORD PTR [rcx+248], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[32] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+256]
-        add	r12, rax
-        mov	QWORD PTR [rcx+256], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[33] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+264]
-        add	r10, rax
-        mov	QWORD PTR [rcx+264], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[34] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+272]
-        add	r11, rax
-        mov	QWORD PTR [rcx+272], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[35] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+280]
-        add	r12, rax
-        mov	QWORD PTR [rcx+280], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[36] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+288]
-        add	r10, rax
-        mov	QWORD PTR [rcx+288], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[37] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+296]
-        add	r11, rax
-        mov	QWORD PTR [rcx+296], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[38] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+304]
-        add	r12, rax
-        mov	QWORD PTR [rcx+304], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[39] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+312]
-        add	r10, rax
-        mov	QWORD PTR [rcx+312], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[40] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+320]
-        add	r11, rax
-        mov	QWORD PTR [rcx+320], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[41] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+328]
-        add	r12, rax
-        mov	QWORD PTR [rcx+328], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[42] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+336]
-        add	r10, rax
-        mov	QWORD PTR [rcx+336], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[43] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+344]
-        add	r11, rax
-        mov	QWORD PTR [rcx+344], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[44] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+352]
-        add	r12, rax
-        mov	QWORD PTR [rcx+352], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[45] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+360]
-        add	r10, rax
-        mov	QWORD PTR [rcx+360], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[46] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+368]
-        add	r11, rax
-        mov	QWORD PTR [rcx+368], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[47] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+376]
-        add	r12, rax
-        mov	QWORD PTR [rcx+376], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[48] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+384]
-        add	r10, rax
-        mov	QWORD PTR [rcx+384], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[49] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+392]
-        add	r11, rax
-        mov	QWORD PTR [rcx+392], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[50] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+400]
-        add	r12, rax
-        mov	QWORD PTR [rcx+400], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[51] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+408]
-        add	r10, rax
-        mov	QWORD PTR [rcx+408], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[52] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+416]
-        add	r11, rax
-        mov	QWORD PTR [rcx+416], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[53] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+424]
-        add	r12, rax
-        mov	QWORD PTR [rcx+424], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[54] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+432]
-        add	r10, rax
-        mov	QWORD PTR [rcx+432], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[55] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+440]
-        add	r11, rax
-        mov	QWORD PTR [rcx+440], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[56] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+448]
-        add	r12, rax
-        mov	QWORD PTR [rcx+448], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[57] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+456]
-        add	r10, rax
-        mov	QWORD PTR [rcx+456], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[58] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+464]
-        add	r11, rax
-        mov	QWORD PTR [rcx+464], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[59] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+472]
-        add	r12, rax
-        mov	QWORD PTR [rcx+472], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[60] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+480]
-        add	r10, rax
-        mov	QWORD PTR [rcx+480], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[61] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+488]
-        add	r11, rax
-        mov	QWORD PTR [rcx+488], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[62] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+496]
-        add	r12, rax
-        mov	QWORD PTR [rcx+496], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[63] * B
-        mov	rax, r8  ; last step: no further accumulator is cleared, only two words remain
-        mul	QWORD PTR [r9+504]
-        add	r10, rax
-        adc	r11, rdx
-        mov	QWORD PTR [rcx+504], r10  ; r[63]
-        mov	QWORD PTR [rcx+512], r11  ; r[64]: the extra top word, so r must have room for 65 qwords
-        pop	r12  ; restore the register saved on entry
-        ret
-sp_4096_mul_d_64 ENDP
-_text ENDS
-; /* Conditionally subtract b from a using the mask m.
-;  * m is -1 to subtract and 0 when not subtracting.
-;  *
-;  * r  A single precision number representing condition subtract result.
-;  * a  A single precision number to subtract from.
-;  * b  A single precision number to subtract.
-;  * m  Mask value to apply.
-;  */
-_text SEGMENT READONLY PARA
-sp_4096_cond_sub_64 PROC  ; r = a - (b AND m): rcx = r, rdx = a, r8 = b, r9 = m; rax returns 0 or -1 (final borrow)
-        sub	rsp, 512  ; 64 qwords of stack scratch for the masked copy of b. NOTE(review): no unwind directives are visible for this rsp adjustment - confirm
-        mov	r10, QWORD PTR [r8]  ; pass 1: copy b two words at a time, ANDing each word with the mask
-        mov	r11, QWORD PTR [r8+8]
-        and	r10, r9  ; m = -1 keeps the word, m = 0 turns it into zero
-        and	r11, r9
-        mov	QWORD PTR [rsp], r10
-        mov	QWORD PTR [rsp+8], r11
-        mov	r10, QWORD PTR [r8+16]
-        mov	r11, QWORD PTR [r8+24]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+16], r10
-        mov	QWORD PTR [rsp+24], r11
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+32], r10
-        mov	QWORD PTR [rsp+40], r11
-        mov	r10, QWORD PTR [r8+48]
-        mov	r11, QWORD PTR [r8+56]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+48], r10
-        mov	QWORD PTR [rsp+56], r11
-        mov	r10, QWORD PTR [r8+64]
-        mov	r11, QWORD PTR [r8+72]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+64], r10
-        mov	QWORD PTR [rsp+72], r11
-        mov	r10, QWORD PTR [r8+80]
-        mov	r11, QWORD PTR [r8+88]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+80], r10
-        mov	QWORD PTR [rsp+88], r11
-        mov	r10, QWORD PTR [r8+96]
-        mov	r11, QWORD PTR [r8+104]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+96], r10
-        mov	QWORD PTR [rsp+104], r11
-        mov	r10, QWORD PTR [r8+112]
-        mov	r11, QWORD PTR [r8+120]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+112], r10
-        mov	QWORD PTR [rsp+120], r11
-        mov	r10, QWORD PTR [r8+128]
-        mov	r11, QWORD PTR [r8+136]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+128], r10
-        mov	QWORD PTR [rsp+136], r11
-        mov	r10, QWORD PTR [r8+144]
-        mov	r11, QWORD PTR [r8+152]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+144], r10
-        mov	QWORD PTR [rsp+152], r11
-        mov	r10, QWORD PTR [r8+160]
-        mov	r11, QWORD PTR [r8+168]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+160], r10
-        mov	QWORD PTR [rsp+168], r11
-        mov	r10, QWORD PTR [r8+176]
-        mov	r11, QWORD PTR [r8+184]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+176], r10
-        mov	QWORD PTR [rsp+184], r11
-        mov	r10, QWORD PTR [r8+192]
-        mov	r11, QWORD PTR [r8+200]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+192], r10
-        mov	QWORD PTR [rsp+200], r11
-        mov	r10, QWORD PTR [r8+208]
-        mov	r11, QWORD PTR [r8+216]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+208], r10
-        mov	QWORD PTR [rsp+216], r11
-        mov	r10, QWORD PTR [r8+224]
-        mov	r11, QWORD PTR [r8+232]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+224], r10
-        mov	QWORD PTR [rsp+232], r11
-        mov	r10, QWORD PTR [r8+240]
-        mov	r11, QWORD PTR [r8+248]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+240], r10
-        mov	QWORD PTR [rsp+248], r11
-        mov	r10, QWORD PTR [r8+256]
-        mov	r11, QWORD PTR [r8+264]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+256], r10
-        mov	QWORD PTR [rsp+264], r11
-        mov	r10, QWORD PTR [r8+272]
-        mov	r11, QWORD PTR [r8+280]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+272], r10
-        mov	QWORD PTR [rsp+280], r11
-        mov	r10, QWORD PTR [r8+288]
-        mov	r11, QWORD PTR [r8+296]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+288], r10
-        mov	QWORD PTR [rsp+296], r11
-        mov	r10, QWORD PTR [r8+304]
-        mov	r11, QWORD PTR [r8+312]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+304], r10
-        mov	QWORD PTR [rsp+312], r11
-        mov	r10, QWORD PTR [r8+320]
-        mov	r11, QWORD PTR [r8+328]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+320], r10
-        mov	QWORD PTR [rsp+328], r11
-        mov	r10, QWORD PTR [r8+336]
-        mov	r11, QWORD PTR [r8+344]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+336], r10
-        mov	QWORD PTR [rsp+344], r11
-        mov	r10, QWORD PTR [r8+352]
-        mov	r11, QWORD PTR [r8+360]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+352], r10
-        mov	QWORD PTR [rsp+360], r11
-        mov	r10, QWORD PTR [r8+368]
-        mov	r11, QWORD PTR [r8+376]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+368], r10
-        mov	QWORD PTR [rsp+376], r11
-        mov	r10, QWORD PTR [r8+384]
-        mov	r11, QWORD PTR [r8+392]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+384], r10
-        mov	QWORD PTR [rsp+392], r11
-        mov	r10, QWORD PTR [r8+400]
-        mov	r11, QWORD PTR [r8+408]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+400], r10
-        mov	QWORD PTR [rsp+408], r11
-        mov	r10, QWORD PTR [r8+416]
-        mov	r11, QWORD PTR [r8+424]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+416], r10
-        mov	QWORD PTR [rsp+424], r11
-        mov	r10, QWORD PTR [r8+432]
-        mov	r11, QWORD PTR [r8+440]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+432], r10
-        mov	QWORD PTR [rsp+440], r11
-        mov	r10, QWORD PTR [r8+448]
-        mov	r11, QWORD PTR [r8+456]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+448], r10
-        mov	QWORD PTR [rsp+456], r11
-        mov	r10, QWORD PTR [r8+464]
-        mov	r11, QWORD PTR [r8+472]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+464], r10
-        mov	QWORD PTR [rsp+472], r11
-        mov	r10, QWORD PTR [r8+480]
-        mov	r11, QWORD PTR [r8+488]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+480], r10
-        mov	QWORD PTR [rsp+488], r11
-        mov	r10, QWORD PTR [r8+496]
-        mov	r11, QWORD PTR [r8+504]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+496], r10
-        mov	QWORD PTR [rsp+504], r11
-        mov	r10, QWORD PTR [rdx]  ; pass 2: subtract the masked copy from a, word by word
-        mov	r8, QWORD PTR [rsp]  ; the b pointer in r8 is no longer needed; r8 becomes scratch for the masked word
-        sub	r10, r8  ; word 0 opens the borrow chain; every later word uses sbb
-        mov	r11, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [rsp+8]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx], r10  ; each result is stored one step late; the mov instructions between do not touch CF
-        mov	r10, QWORD PTR [rdx+16]
-        mov	r8, QWORD PTR [rsp+16]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+8], r11
-        mov	r11, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [rsp+24]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+16], r10
-        mov	r10, QWORD PTR [rdx+32]
-        mov	r8, QWORD PTR [rsp+32]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+24], r11
-        mov	r11, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [rsp+40]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+32], r10
-        mov	r10, QWORD PTR [rdx+48]
-        mov	r8, QWORD PTR [rsp+48]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+40], r11
-        mov	r11, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [rsp+56]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+48], r10
-        mov	r10, QWORD PTR [rdx+64]
-        mov	r8, QWORD PTR [rsp+64]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+56], r11
-        mov	r11, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [rsp+72]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+64], r10
-        mov	r10, QWORD PTR [rdx+80]
-        mov	r8, QWORD PTR [rsp+80]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+72], r11
-        mov	r11, QWORD PTR [rdx+88]
-        mov	r8, QWORD PTR [rsp+88]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+80], r10
-        mov	r10, QWORD PTR [rdx+96]
-        mov	r8, QWORD PTR [rsp+96]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+88], r11
-        mov	r11, QWORD PTR [rdx+104]
-        mov	r8, QWORD PTR [rsp+104]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+96], r10
-        mov	r10, QWORD PTR [rdx+112]
-        mov	r8, QWORD PTR [rsp+112]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+104], r11
-        mov	r11, QWORD PTR [rdx+120]
-        mov	r8, QWORD PTR [rsp+120]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+112], r10
-        mov	r10, QWORD PTR [rdx+128]
-        mov	r8, QWORD PTR [rsp+128]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+120], r11
-        mov	r11, QWORD PTR [rdx+136]
-        mov	r8, QWORD PTR [rsp+136]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+128], r10
-        mov	r10, QWORD PTR [rdx+144]
-        mov	r8, QWORD PTR [rsp+144]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+136], r11
-        mov	r11, QWORD PTR [rdx+152]
-        mov	r8, QWORD PTR [rsp+152]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+144], r10
-        mov	r10, QWORD PTR [rdx+160]
-        mov	r8, QWORD PTR [rsp+160]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+152], r11
-        mov	r11, QWORD PTR [rdx+168]
-        mov	r8, QWORD PTR [rsp+168]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+160], r10
-        mov	r10, QWORD PTR [rdx+176]
-        mov	r8, QWORD PTR [rsp+176]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+168], r11
-        mov	r11, QWORD PTR [rdx+184]
-        mov	r8, QWORD PTR [rsp+184]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+176], r10
-        mov	r10, QWORD PTR [rdx+192]
-        mov	r8, QWORD PTR [rsp+192]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+184], r11
-        mov	r11, QWORD PTR [rdx+200]
-        mov	r8, QWORD PTR [rsp+200]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+192], r10
-        mov	r10, QWORD PTR [rdx+208]
-        mov	r8, QWORD PTR [rsp+208]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+200], r11
-        mov	r11, QWORD PTR [rdx+216]
-        mov	r8, QWORD PTR [rsp+216]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+208], r10
-        mov	r10, QWORD PTR [rdx+224]
-        mov	r8, QWORD PTR [rsp+224]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+216], r11
-        mov	r11, QWORD PTR [rdx+232]
-        mov	r8, QWORD PTR [rsp+232]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+224], r10
-        mov	r10, QWORD PTR [rdx+240]
-        mov	r8, QWORD PTR [rsp+240]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+232], r11
-        mov	r11, QWORD PTR [rdx+248]
-        mov	r8, QWORD PTR [rsp+248]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+240], r10
-        mov	r10, QWORD PTR [rdx+256]
-        mov	r8, QWORD PTR [rsp+256]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+248], r11
-        mov	r11, QWORD PTR [rdx+264]
-        mov	r8, QWORD PTR [rsp+264]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+256], r10
-        mov	r10, QWORD PTR [rdx+272]
-        mov	r8, QWORD PTR [rsp+272]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+264], r11
-        mov	r11, QWORD PTR [rdx+280]
-        mov	r8, QWORD PTR [rsp+280]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+272], r10
-        mov	r10, QWORD PTR [rdx+288]
-        mov	r8, QWORD PTR [rsp+288]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+280], r11
-        mov	r11, QWORD PTR [rdx+296]
-        mov	r8, QWORD PTR [rsp+296]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+288], r10
-        mov	r10, QWORD PTR [rdx+304]
-        mov	r8, QWORD PTR [rsp+304]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+296], r11
-        mov	r11, QWORD PTR [rdx+312]
-        mov	r8, QWORD PTR [rsp+312]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+304], r10
-        mov	r10, QWORD PTR [rdx+320]
-        mov	r8, QWORD PTR [rsp+320]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+312], r11
-        mov	r11, QWORD PTR [rdx+328]
-        mov	r8, QWORD PTR [rsp+328]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+320], r10
-        mov	r10, QWORD PTR [rdx+336]
-        mov	r8, QWORD PTR [rsp+336]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+328], r11
-        mov	r11, QWORD PTR [rdx+344]
-        mov	r8, QWORD PTR [rsp+344]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+336], r10
-        mov	r10, QWORD PTR [rdx+352]
-        mov	r8, QWORD PTR [rsp+352]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+344], r11
-        mov	r11, QWORD PTR [rdx+360]
-        mov	r8, QWORD PTR [rsp+360]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+352], r10
-        mov	r10, QWORD PTR [rdx+368]
-        mov	r8, QWORD PTR [rsp+368]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+360], r11
-        mov	r11, QWORD PTR [rdx+376]
-        mov	r8, QWORD PTR [rsp+376]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+368], r10
-        mov	r10, QWORD PTR [rdx+384]
-        mov	r8, QWORD PTR [rsp+384]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+376], r11
-        mov	r11, QWORD PTR [rdx+392]
-        mov	r8, QWORD PTR [rsp+392]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+384], r10
-        mov	r10, QWORD PTR [rdx+400]
-        mov	r8, QWORD PTR [rsp+400]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+392], r11
-        mov	r11, QWORD PTR [rdx+408]
-        mov	r8, QWORD PTR [rsp+408]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+400], r10
-        mov	r10, QWORD PTR [rdx+416]
-        mov	r8, QWORD PTR [rsp+416]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+408], r11
-        mov	r11, QWORD PTR [rdx+424]
-        mov	r8, QWORD PTR [rsp+424]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+416], r10
-        mov	r10, QWORD PTR [rdx+432]
-        mov	r8, QWORD PTR [rsp+432]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+424], r11
-        mov	r11, QWORD PTR [rdx+440]
-        mov	r8, QWORD PTR [rsp+440]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+432], r10
-        mov	r10, QWORD PTR [rdx+448]
-        mov	r8, QWORD PTR [rsp+448]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+440], r11
-        mov	r11, QWORD PTR [rdx+456]
-        mov	r8, QWORD PTR [rsp+456]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+448], r10
-        mov	r10, QWORD PTR [rdx+464]
-        mov	r8, QWORD PTR [rsp+464]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+456], r11
-        mov	r11, QWORD PTR [rdx+472]
-        mov	r8, QWORD PTR [rsp+472]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+464], r10
-        mov	r10, QWORD PTR [rdx+480]
-        mov	r8, QWORD PTR [rsp+480]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+472], r11
-        mov	r11, QWORD PTR [rdx+488]
-        mov	r8, QWORD PTR [rsp+488]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+480], r10
-        mov	r10, QWORD PTR [rdx+496]
-        mov	r8, QWORD PTR [rsp+496]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+488], r11
-        mov	r11, QWORD PTR [rdx+504]
-        mov	r8, QWORD PTR [rsp+504]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+496], r10
-        mov	QWORD PTR [rcx+504], r11
-        sbb	rax, rax  ; rax = 0 - CF: 0 when the top word produced no borrow, -1 when it did
-        add	rsp, 512  ; release the scratch area; runs after rax has already captured the borrow
-        ret
-sp_4096_cond_sub_64 ENDP
-_text ENDS
-; /* Reduce the number back to 4096 bits using Montgomery reduction.
-;  *
-;  * a   A single precision number to reduce in place.
-;  * m   The single precision number representing the modulus.
-;  * mp  The digit representing the negative inverse of m mod 2^n.
-;  */
-_text SEGMENT READONLY PARA
-sp_4096_mont_reduce_64 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        mov	r9, rdx
-        xor	rsi, rsi
-        ; i = 64
-        mov	r10, 64
-        mov	r15, QWORD PTR [rcx]
-        mov	rdi, QWORD PTR [rcx+8]
-L_4096_mont_reduce_64_loop:
-        ; mu = a[i] * mp
-        mov	r13, r15
-        imul	r13, r8
-        ; a[i+0] += m[0] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9]
-        add	r15, rax
-        adc	r12, rdx
-        ; a[i+1] += m[1] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+8]
-        mov	r15, rdi
-        add	r15, rax
-        adc	r11, rdx
-        add	r15, r12
-        adc	r11, 0
-        ; a[i+2] += m[2] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+16]
-        mov	rdi, QWORD PTR [rcx+16]
-        add	rdi, rax
-        adc	r12, rdx
-        add	rdi, r11
-        adc	r12, 0
-        ; a[i+3] += m[3] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+24]
-        mov	r14, QWORD PTR [rcx+24]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+24], r14
-        adc	r11, 0
-        ; a[i+4] += m[4] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+32]
-        mov	r14, QWORD PTR [rcx+32]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+32], r14
-        adc	r12, 0
-        ; a[i+5] += m[5] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+40]
-        mov	r14, QWORD PTR [rcx+40]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+40], r14
-        adc	r11, 0
-        ; a[i+6] += m[6] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+48]
-        mov	r14, QWORD PTR [rcx+48]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+48], r14
-        adc	r12, 0
-        ; a[i+7] += m[7] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+56]
-        mov	r14, QWORD PTR [rcx+56]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+56], r14
-        adc	r11, 0
-        ; a[i+8] += m[8] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+64]
-        mov	r14, QWORD PTR [rcx+64]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+64], r14
-        adc	r12, 0
-        ; a[i+9] += m[9] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+72]
-        mov	r14, QWORD PTR [rcx+72]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+72], r14
-        adc	r11, 0
-        ; a[i+10] += m[10] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+80]
-        mov	r14, QWORD PTR [rcx+80]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+80], r14
-        adc	r12, 0
-        ; a[i+11] += m[11] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+88]
-        mov	r14, QWORD PTR [rcx+88]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+88], r14
-        adc	r11, 0
-        ; a[i+12] += m[12] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+96]
-        mov	r14, QWORD PTR [rcx+96]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+96], r14
-        adc	r12, 0
-        ; a[i+13] += m[13] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+104]
-        mov	r14, QWORD PTR [rcx+104]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+104], r14
-        adc	r11, 0
-        ; a[i+14] += m[14] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+112]
-        mov	r14, QWORD PTR [rcx+112]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+112], r14
-        adc	r12, 0
-        ; a[i+15] += m[15] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+120]
-        mov	r14, QWORD PTR [rcx+120]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+120], r14
-        adc	r11, 0
-        ; a[i+16] += m[16] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+128]
-        mov	r14, QWORD PTR [rcx+128]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+128], r14
-        adc	r12, 0
-        ; a[i+17] += m[17] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+136]
-        mov	r14, QWORD PTR [rcx+136]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+136], r14
-        adc	r11, 0
-        ; a[i+18] += m[18] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+144]
-        mov	r14, QWORD PTR [rcx+144]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+144], r14
-        adc	r12, 0
-        ; a[i+19] += m[19] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+152]
-        mov	r14, QWORD PTR [rcx+152]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+152], r14
-        adc	r11, 0
-        ; a[i+20] += m[20] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+160]
-        mov	r14, QWORD PTR [rcx+160]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+160], r14
-        adc	r12, 0
-        ; a[i+21] += m[21] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+168]
-        mov	r14, QWORD PTR [rcx+168]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+168], r14
-        adc	r11, 0
-        ; a[i+22] += m[22] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+176]
-        mov	r14, QWORD PTR [rcx+176]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+176], r14
-        adc	r12, 0
-        ; a[i+23] += m[23] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+184]
-        mov	r14, QWORD PTR [rcx+184]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+184], r14
-        adc	r11, 0
-        ; a[i+24] += m[24] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+192]
-        mov	r14, QWORD PTR [rcx+192]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+192], r14
-        adc	r12, 0
-        ; a[i+25] += m[25] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+200]
-        mov	r14, QWORD PTR [rcx+200]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+200], r14
-        adc	r11, 0
-        ; a[i+26] += m[26] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+208]
-        mov	r14, QWORD PTR [rcx+208]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+208], r14
-        adc	r12, 0
-        ; a[i+27] += m[27] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+216]
-        mov	r14, QWORD PTR [rcx+216]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+216], r14
-        adc	r11, 0
-        ; a[i+28] += m[28] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+224]
-        mov	r14, QWORD PTR [rcx+224]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+224], r14
-        adc	r12, 0
-        ; a[i+29] += m[29] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+232]
-        mov	r14, QWORD PTR [rcx+232]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+232], r14
-        adc	r11, 0
-        ; a[i+30] += m[30] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+240]
-        mov	r14, QWORD PTR [rcx+240]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+240], r14
-        adc	r12, 0
-        ; a[i+31] += m[31] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+248]
-        mov	r14, QWORD PTR [rcx+248]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+248], r14
-        adc	r11, 0
-        ; a[i+32] += m[32] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+256]
-        mov	r14, QWORD PTR [rcx+256]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+256], r14
-        adc	r12, 0
-        ; a[i+33] += m[33] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+264]
-        mov	r14, QWORD PTR [rcx+264]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+264], r14
-        adc	r11, 0
-        ; a[i+34] += m[34] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+272]
-        mov	r14, QWORD PTR [rcx+272]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+272], r14
-        adc	r12, 0
-        ; a[i+35] += m[35] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+280]
-        mov	r14, QWORD PTR [rcx+280]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+280], r14
-        adc	r11, 0
-        ; a[i+36] += m[36] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+288]
-        mov	r14, QWORD PTR [rcx+288]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+288], r14
-        adc	r12, 0
-        ; a[i+37] += m[37] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+296]
-        mov	r14, QWORD PTR [rcx+296]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+296], r14
-        adc	r11, 0
-        ; a[i+38] += m[38] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+304]
-        mov	r14, QWORD PTR [rcx+304]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+304], r14
-        adc	r12, 0
-        ; a[i+39] += m[39] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+312]
-        mov	r14, QWORD PTR [rcx+312]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+312], r14
-        adc	r11, 0
-        ; a[i+40] += m[40] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+320]
-        mov	r14, QWORD PTR [rcx+320]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+320], r14
-        adc	r12, 0
-        ; a[i+41] += m[41] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+328]
-        mov	r14, QWORD PTR [rcx+328]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+328], r14
-        adc	r11, 0
-        ; a[i+42] += m[42] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+336]
-        mov	r14, QWORD PTR [rcx+336]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+336], r14
-        adc	r12, 0
-        ; a[i+43] += m[43] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+344]
-        mov	r14, QWORD PTR [rcx+344]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+344], r14
-        adc	r11, 0
-        ; a[i+44] += m[44] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+352]
-        mov	r14, QWORD PTR [rcx+352]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+352], r14
-        adc	r12, 0
-        ; a[i+45] += m[45] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+360]
-        mov	r14, QWORD PTR [rcx+360]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+360], r14
-        adc	r11, 0
-        ; a[i+46] += m[46] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+368]
-        mov	r14, QWORD PTR [rcx+368]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+368], r14
-        adc	r12, 0
-        ; a[i+47] += m[47] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+376]
-        mov	r14, QWORD PTR [rcx+376]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+376], r14
-        adc	r11, 0
-        ; a[i+48] += m[48] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+384]
-        mov	r14, QWORD PTR [rcx+384]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+384], r14
-        adc	r12, 0
-        ; a[i+49] += m[49] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+392]
-        mov	r14, QWORD PTR [rcx+392]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+392], r14
-        adc	r11, 0
-        ; a[i+50] += m[50] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+400]
-        mov	r14, QWORD PTR [rcx+400]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+400], r14
-        adc	r12, 0
-        ; a[i+51] += m[51] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+408]
-        mov	r14, QWORD PTR [rcx+408]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+408], r14
-        adc	r11, 0
-        ; a[i+52] += m[52] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+416]
-        mov	r14, QWORD PTR [rcx+416]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+416], r14
-        adc	r12, 0
-        ; a[i+53] += m[53] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+424]
-        mov	r14, QWORD PTR [rcx+424]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+424], r14
-        adc	r11, 0
-        ; a[i+54] += m[54] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+432]
-        mov	r14, QWORD PTR [rcx+432]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+432], r14
-        adc	r12, 0
-        ; a[i+55] += m[55] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+440]
-        mov	r14, QWORD PTR [rcx+440]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+440], r14
-        adc	r11, 0
-        ; a[i+56] += m[56] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+448]
-        mov	r14, QWORD PTR [rcx+448]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+448], r14
-        adc	r12, 0
-        ; a[i+57] += m[57] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+456]
-        mov	r14, QWORD PTR [rcx+456]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+456], r14
-        adc	r11, 0
-        ; a[i+58] += m[58] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+464]
-        mov	r14, QWORD PTR [rcx+464]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+464], r14
-        adc	r12, 0
-        ; a[i+59] += m[59] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+472]
-        mov	r14, QWORD PTR [rcx+472]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+472], r14
-        adc	r11, 0
-        ; a[i+60] += m[60] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+480]
-        mov	r14, QWORD PTR [rcx+480]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+480], r14
-        adc	r12, 0
-        ; a[i+61] += m[61] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+488]
-        mov	r14, QWORD PTR [rcx+488]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+488], r14
-        adc	r11, 0
-        ; a[i+62] += m[62] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+496]
-        mov	r14, QWORD PTR [rcx+496]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+496], r14
-        adc	r12, 0
-        ; a[i+63] += m[63] * mu
-        mov	rax, r13
-        mul	QWORD PTR [r9+504]
-        mov	r14, QWORD PTR [rcx+504]
-        add	r12, rax
-        adc	rdx, rsi
-        mov	rsi, 0
-        adc	rsi, 0
-        add	r14, r12
-        mov	QWORD PTR [rcx+504], r14
-        adc	QWORD PTR [rcx+512], rdx
-        adc	rsi, 0
-        ; i -= 1
-        add	rcx, 8
-        dec	r10
-        jnz	L_4096_mont_reduce_64_loop
-        mov	QWORD PTR [rcx], r15
-        mov	QWORD PTR [rcx+8], rdi
-        neg	rsi
-IFDEF _WIN64
-        mov	r8, r9
-        mov	r9, rsi
-ELSE
-        mov	r9, rsi
-        mov	r8, r9
-ENDIF
-        mov	rdx, rcx
-        mov	rcx, rcx
-        sub	rcx, 512
-        call	sp_4096_cond_sub_64
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_4096_mont_reduce_64 ENDP
-_text ENDS
-; /* Subtract b from a into r, one qword at a time over 64 qwords. (r = a - b)
-;  * Returns in rax: 0 if the last sbb left no borrow, -1 (all ones) if it did.
-;  * r  (rcx) Result; 64 qwords stored at offsets 0..504.
-;  * a  (rdx) Minuend; 64 qwords loaded from offsets 0..504.
-;  * b  (r8)  Subtrahend; 64 qwords read from offsets 0..504.
-;  */
-_text SEGMENT READONLY PARA
-sp_4096_sub_64 PROC
-        mov	r9, QWORD PTR [rdx]
-        sub	r9, QWORD PTR [r8]        ; word 0: plain sub starts the borrow chain in CF
-        mov	r10, QWORD PTR [rdx+8]        ; r9/r10 alternate as scratch; mov leaves CF intact
-        mov	QWORD PTR [rcx], r9
-        sbb	r10, QWORD PTR [r8+8]        ; every later word subtracts with the borrow from the previous one
-        mov	r9, QWORD PTR [rdx+16]
-        mov	QWORD PTR [rcx+8], r10
-        sbb	r9, QWORD PTR [r8+16]
-        mov	r10, QWORD PTR [rdx+24]
-        mov	QWORD PTR [rcx+16], r9
-        sbb	r10, QWORD PTR [r8+24]
-        mov	r9, QWORD PTR [rdx+32]
-        mov	QWORD PTR [rcx+24], r10
-        sbb	r9, QWORD PTR [r8+32]
-        mov	r10, QWORD PTR [rdx+40]
-        mov	QWORD PTR [rcx+32], r9
-        sbb	r10, QWORD PTR [r8+40]
-        mov	r9, QWORD PTR [rdx+48]
-        mov	QWORD PTR [rcx+40], r10
-        sbb	r9, QWORD PTR [r8+48]
-        mov	r10, QWORD PTR [rdx+56]
-        mov	QWORD PTR [rcx+48], r9
-        sbb	r10, QWORD PTR [r8+56]
-        mov	r9, QWORD PTR [rdx+64]
-        mov	QWORD PTR [rcx+56], r10
-        sbb	r9, QWORD PTR [r8+64]
-        mov	r10, QWORD PTR [rdx+72]
-        mov	QWORD PTR [rcx+64], r9
-        sbb	r10, QWORD PTR [r8+72]
-        mov	r9, QWORD PTR [rdx+80]
-        mov	QWORD PTR [rcx+72], r10
-        sbb	r9, QWORD PTR [r8+80]
-        mov	r10, QWORD PTR [rdx+88]
-        mov	QWORD PTR [rcx+80], r9
-        sbb	r10, QWORD PTR [r8+88]
-        mov	r9, QWORD PTR [rdx+96]
-        mov	QWORD PTR [rcx+88], r10
-        sbb	r9, QWORD PTR [r8+96]
-        mov	r10, QWORD PTR [rdx+104]
-        mov	QWORD PTR [rcx+96], r9
-        sbb	r10, QWORD PTR [r8+104]
-        mov	r9, QWORD PTR [rdx+112]
-        mov	QWORD PTR [rcx+104], r10
-        sbb	r9, QWORD PTR [r8+112]
-        mov	r10, QWORD PTR [rdx+120]
-        mov	QWORD PTR [rcx+112], r9
-        sbb	r10, QWORD PTR [r8+120]
-        mov	r9, QWORD PTR [rdx+128]
-        mov	QWORD PTR [rcx+120], r10
-        sbb	r9, QWORD PTR [r8+128]
-        mov	r10, QWORD PTR [rdx+136]
-        mov	QWORD PTR [rcx+128], r9
-        sbb	r10, QWORD PTR [r8+136]
-        mov	r9, QWORD PTR [rdx+144]
-        mov	QWORD PTR [rcx+136], r10
-        sbb	r9, QWORD PTR [r8+144]
-        mov	r10, QWORD PTR [rdx+152]
-        mov	QWORD PTR [rcx+144], r9
-        sbb	r10, QWORD PTR [r8+152]
-        mov	r9, QWORD PTR [rdx+160]
-        mov	QWORD PTR [rcx+152], r10
-        sbb	r9, QWORD PTR [r8+160]
-        mov	r10, QWORD PTR [rdx+168]
-        mov	QWORD PTR [rcx+160], r9
-        sbb	r10, QWORD PTR [r8+168]
-        mov	r9, QWORD PTR [rdx+176]
-        mov	QWORD PTR [rcx+168], r10
-        sbb	r9, QWORD PTR [r8+176]
-        mov	r10, QWORD PTR [rdx+184]
-        mov	QWORD PTR [rcx+176], r9
-        sbb	r10, QWORD PTR [r8+184]
-        mov	r9, QWORD PTR [rdx+192]
-        mov	QWORD PTR [rcx+184], r10
-        sbb	r9, QWORD PTR [r8+192]
-        mov	r10, QWORD PTR [rdx+200]
-        mov	QWORD PTR [rcx+192], r9
-        sbb	r10, QWORD PTR [r8+200]
-        mov	r9, QWORD PTR [rdx+208]
-        mov	QWORD PTR [rcx+200], r10
-        sbb	r9, QWORD PTR [r8+208]
-        mov	r10, QWORD PTR [rdx+216]
-        mov	QWORD PTR [rcx+208], r9
-        sbb	r10, QWORD PTR [r8+216]
-        mov	r9, QWORD PTR [rdx+224]
-        mov	QWORD PTR [rcx+216], r10
-        sbb	r9, QWORD PTR [r8+224]
-        mov	r10, QWORD PTR [rdx+232]
-        mov	QWORD PTR [rcx+224], r9
-        sbb	r10, QWORD PTR [r8+232]
-        mov	r9, QWORD PTR [rdx+240]
-        mov	QWORD PTR [rcx+232], r10
-        sbb	r9, QWORD PTR [r8+240]
-        mov	r10, QWORD PTR [rdx+248]
-        mov	QWORD PTR [rcx+240], r9
-        sbb	r10, QWORD PTR [r8+248]
-        mov	r9, QWORD PTR [rdx+256]
-        mov	QWORD PTR [rcx+248], r10
-        sbb	r9, QWORD PTR [r8+256]
-        mov	r10, QWORD PTR [rdx+264]
-        mov	QWORD PTR [rcx+256], r9
-        sbb	r10, QWORD PTR [r8+264]
-        mov	r9, QWORD PTR [rdx+272]
-        mov	QWORD PTR [rcx+264], r10
-        sbb	r9, QWORD PTR [r8+272]
-        mov	r10, QWORD PTR [rdx+280]
-        mov	QWORD PTR [rcx+272], r9
-        sbb	r10, QWORD PTR [r8+280]
-        mov	r9, QWORD PTR [rdx+288]
-        mov	QWORD PTR [rcx+280], r10
-        sbb	r9, QWORD PTR [r8+288]
-        mov	r10, QWORD PTR [rdx+296]
-        mov	QWORD PTR [rcx+288], r9
-        sbb	r10, QWORD PTR [r8+296]
-        mov	r9, QWORD PTR [rdx+304]
-        mov	QWORD PTR [rcx+296], r10
-        sbb	r9, QWORD PTR [r8+304]
-        mov	r10, QWORD PTR [rdx+312]
-        mov	QWORD PTR [rcx+304], r9
-        sbb	r10, QWORD PTR [r8+312]
-        mov	r9, QWORD PTR [rdx+320]
-        mov	QWORD PTR [rcx+312], r10
-        sbb	r9, QWORD PTR [r8+320]
-        mov	r10, QWORD PTR [rdx+328]
-        mov	QWORD PTR [rcx+320], r9
-        sbb	r10, QWORD PTR [r8+328]
-        mov	r9, QWORD PTR [rdx+336]
-        mov	QWORD PTR [rcx+328], r10
-        sbb	r9, QWORD PTR [r8+336]
-        mov	r10, QWORD PTR [rdx+344]
-        mov	QWORD PTR [rcx+336], r9
-        sbb	r10, QWORD PTR [r8+344]
-        mov	r9, QWORD PTR [rdx+352]
-        mov	QWORD PTR [rcx+344], r10
-        sbb	r9, QWORD PTR [r8+352]
-        mov	r10, QWORD PTR [rdx+360]
-        mov	QWORD PTR [rcx+352], r9
-        sbb	r10, QWORD PTR [r8+360]
-        mov	r9, QWORD PTR [rdx+368]
-        mov	QWORD PTR [rcx+360], r10
-        sbb	r9, QWORD PTR [r8+368]
-        mov	r10, QWORD PTR [rdx+376]
-        mov	QWORD PTR [rcx+368], r9
-        sbb	r10, QWORD PTR [r8+376]
-        mov	r9, QWORD PTR [rdx+384]
-        mov	QWORD PTR [rcx+376], r10
-        sbb	r9, QWORD PTR [r8+384]
-        mov	r10, QWORD PTR [rdx+392]
-        mov	QWORD PTR [rcx+384], r9
-        sbb	r10, QWORD PTR [r8+392]
-        mov	r9, QWORD PTR [rdx+400]
-        mov	QWORD PTR [rcx+392], r10
-        sbb	r9, QWORD PTR [r8+400]
-        mov	r10, QWORD PTR [rdx+408]
-        mov	QWORD PTR [rcx+400], r9
-        sbb	r10, QWORD PTR [r8+408]
-        mov	r9, QWORD PTR [rdx+416]
-        mov	QWORD PTR [rcx+408], r10
-        sbb	r9, QWORD PTR [r8+416]
-        mov	r10, QWORD PTR [rdx+424]
-        mov	QWORD PTR [rcx+416], r9
-        sbb	r10, QWORD PTR [r8+424]
-        mov	r9, QWORD PTR [rdx+432]
-        mov	QWORD PTR [rcx+424], r10
-        sbb	r9, QWORD PTR [r8+432]
-        mov	r10, QWORD PTR [rdx+440]
-        mov	QWORD PTR [rcx+432], r9
-        sbb	r10, QWORD PTR [r8+440]
-        mov	r9, QWORD PTR [rdx+448]
-        mov	QWORD PTR [rcx+440], r10
-        sbb	r9, QWORD PTR [r8+448]
-        mov	r10, QWORD PTR [rdx+456]
-        mov	QWORD PTR [rcx+448], r9
-        sbb	r10, QWORD PTR [r8+456]
-        mov	r9, QWORD PTR [rdx+464]
-        mov	QWORD PTR [rcx+456], r10
-        sbb	r9, QWORD PTR [r8+464]
-        mov	r10, QWORD PTR [rdx+472]
-        mov	QWORD PTR [rcx+464], r9
-        sbb	r10, QWORD PTR [r8+472]
-        mov	r9, QWORD PTR [rdx+480]
-        mov	QWORD PTR [rcx+472], r10
-        sbb	r9, QWORD PTR [r8+480]
-        mov	r10, QWORD PTR [rdx+488]
-        mov	QWORD PTR [rcx+480], r9
-        sbb	r10, QWORD PTR [r8+488]
-        mov	r9, QWORD PTR [rdx+496]
-        mov	QWORD PTR [rcx+488], r10
-        sbb	r9, QWORD PTR [r8+496]
-        mov	r10, QWORD PTR [rdx+504]
-        mov	QWORD PTR [rcx+496], r9
-        sbb	r10, QWORD PTR [r8+504]
-        mov	QWORD PTR [rcx+504], r10
-        sbb	rax, rax        ; rax = rax - rax - CF = -CF: 0 if no borrow out, -1 otherwise
-        ret
-sp_4096_sub_64 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Multiply a (64 qwords) by the single digit b into r (65 qwords). (r = a * b)
-;  * Uses mulx/adcx/adox; assembled only when HAVE_INTEL_AVX2 is defined.
-;  * r  (rcx) Result; qwords stored at offsets 0..512.
-;  * a  (rdx on entry, copied to rax) Multiplicand; qwords read from offsets 0..504.
-;  * b  (r8, copied to rdx) Multiplier digit; becomes the implicit mulx operand.
-;  */
-_text SEGMENT READONLY PARA
-sp_4096_mul_d_avx2_64 PROC
-        push	r12        ; r12/r13 are used as scratch below and restored before ret
-        push	r13
-        mov	rax, rdx        ; rax = a, freeing rdx for the implicit mulx operand
-        ; A[0] * B
-        mov	rdx, r8        ; rdx = b
-        xor	r13, r13        ; r13 = 0 constant; xor also clears CF and OF for the adcx/adox chains
-        mulx	r12, r11, QWORD PTR [rax]        ; r12:r11 = a[0] * b (high:low)
-        mov	QWORD PTR [rcx], r11
-        ; A[1] * B
-        mulx	r10, r9, QWORD PTR [rax+8]        ; r10:r9 = a[1] * b
-        mov	r11, r13        ; zero the register that will collect the next high word
-        adcx	r12, r9        ; previous high word + this low word, carry kept in CF
-        adox	r11, r10        ; 0 + this high word + OF, carry kept in OF
-        mov	QWORD PTR [rcx+8], r12
-        ; A[2] * B
-        mulx	r10, r9, QWORD PTR [rax+16]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+16], r11
-        ; A[3] * B
-        mulx	r10, r9, QWORD PTR [rax+24]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+24], r12
-        ; A[4] * B
-        mulx	r10, r9, QWORD PTR [rax+32]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+32], r11
-        ; A[5] * B
-        mulx	r10, r9, QWORD PTR [rax+40]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+40], r12
-        ; A[6] * B
-        mulx	r10, r9, QWORD PTR [rax+48]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+48], r11
-        ; A[7] * B
-        mulx	r10, r9, QWORD PTR [rax+56]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+56], r12
-        ; A[8] * B
-        mulx	r10, r9, QWORD PTR [rax+64]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+64], r11
-        ; A[9] * B
-        mulx	r10, r9, QWORD PTR [rax+72]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+72], r12
-        ; A[10] * B
-        mulx	r10, r9, QWORD PTR [rax+80]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+80], r11
-        ; A[11] * B
-        mulx	r10, r9, QWORD PTR [rax+88]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+88], r12
-        ; A[12] * B
-        mulx	r10, r9, QWORD PTR [rax+96]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+96], r11
-        ; A[13] * B
-        mulx	r10, r9, QWORD PTR [rax+104]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+104], r12
-        ; A[14] * B
-        mulx	r10, r9, QWORD PTR [rax+112]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+112], r11
-        ; A[15] * B
-        mulx	r10, r9, QWORD PTR [rax+120]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+120], r12
-        ; A[16] * B
-        mulx	r10, r9, QWORD PTR [rax+128]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+128], r11
-        ; A[17] * B
-        mulx	r10, r9, QWORD PTR [rax+136]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+136], r12
-        ; A[18] * B
-        mulx	r10, r9, QWORD PTR [rax+144]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+144], r11
-        ; A[19] * B
-        mulx	r10, r9, QWORD PTR [rax+152]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+152], r12
-        ; A[20] * B
-        mulx	r10, r9, QWORD PTR [rax+160]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+160], r11
-        ; A[21] * B
-        mulx	r10, r9, QWORD PTR [rax+168]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+168], r12
-        ; A[22] * B
-        mulx	r10, r9, QWORD PTR [rax+176]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+176], r11
-        ; A[23] * B
-        mulx	r10, r9, QWORD PTR [rax+184]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+184], r12
-        ; A[24] * B
-        mulx	r10, r9, QWORD PTR [rax+192]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+192], r11
-        ; A[25] * B
-        mulx	r10, r9, QWORD PTR [rax+200]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+200], r12
-        ; A[26] * B
-        mulx	r10, r9, QWORD PTR [rax+208]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+208], r11
-        ; A[27] * B
-        mulx	r10, r9, QWORD PTR [rax+216]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+216], r12
-        ; A[28] * B
-        mulx	r10, r9, QWORD PTR [rax+224]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+224], r11
-        ; A[29] * B
-        mulx	r10, r9, QWORD PTR [rax+232]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+232], r12
-        ; A[30] * B
-        mulx	r10, r9, QWORD PTR [rax+240]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+240], r11
-        ; A[31] * B
-        mulx	r10, r9, QWORD PTR [rax+248]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+248], r12
-        ; A[32] * B
-        mulx	r10, r9, QWORD PTR [rax+256]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+256], r11
-        ; A[33] * B
-        mulx	r10, r9, QWORD PTR [rax+264]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+264], r12
-        ; A[34] * B
-        mulx	r10, r9, QWORD PTR [rax+272]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+272], r11
-        ; A[35] * B
-        mulx	r10, r9, QWORD PTR [rax+280]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+280], r12
-        ; A[36] * B
-        mulx	r10, r9, QWORD PTR [rax+288]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+288], r11
-        ; A[37] * B
-        mulx	r10, r9, QWORD PTR [rax+296]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+296], r12
-        ; A[38] * B
-        mulx	r10, r9, QWORD PTR [rax+304]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+304], r11
-        ; A[39] * B
-        mulx	r10, r9, QWORD PTR [rax+312]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+312], r12
-        ; A[40] * B
-        mulx	r10, r9, QWORD PTR [rax+320]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+320], r11
-        ; A[41] * B
-        mulx	r10, r9, QWORD PTR [rax+328]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+328], r12
-        ; A[42] * B
-        mulx	r10, r9, QWORD PTR [rax+336]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+336], r11
-        ; A[43] * B
-        mulx	r10, r9, QWORD PTR [rax+344]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+344], r12
-        ; A[44] * B
-        mulx	r10, r9, QWORD PTR [rax+352]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+352], r11
-        ; A[45] * B
-        mulx	r10, r9, QWORD PTR [rax+360]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+360], r12
-        ; A[46] * B
-        mulx	r10, r9, QWORD PTR [rax+368]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+368], r11
-        ; A[47] * B
-        mulx	r10, r9, QWORD PTR [rax+376]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+376], r12
-        ; A[48] * B
-        mulx	r10, r9, QWORD PTR [rax+384]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+384], r11
-        ; A[49] * B
-        mulx	r10, r9, QWORD PTR [rax+392]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+392], r12
-        ; A[50] * B
-        mulx	r10, r9, QWORD PTR [rax+400]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+400], r11
-        ; A[51] * B
-        mulx	r10, r9, QWORD PTR [rax+408]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+408], r12
-        ; A[52] * B
-        mulx	r10, r9, QWORD PTR [rax+416]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+416], r11
-        ; A[53] * B
-        mulx	r10, r9, QWORD PTR [rax+424]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+424], r12
-        ; A[54] * B
-        mulx	r10, r9, QWORD PTR [rax+432]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+432], r11
-        ; A[55] * B
-        mulx	r10, r9, QWORD PTR [rax+440]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+440], r12
-        ; A[56] * B
-        mulx	r10, r9, QWORD PTR [rax+448]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+448], r11
-        ; A[57] * B
-        mulx	r10, r9, QWORD PTR [rax+456]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+456], r12
-        ; A[58] * B
-        mulx	r10, r9, QWORD PTR [rax+464]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+464], r11
-        ; A[59] * B
-        mulx	r10, r9, QWORD PTR [rax+472]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+472], r12
-        ; A[60] * B
-        mulx	r10, r9, QWORD PTR [rax+480]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+480], r11
-        ; A[61] * B
-        mulx	r10, r9, QWORD PTR [rax+488]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+488], r12
-        ; A[62] * B
-        mulx	r10, r9, QWORD PTR [rax+496]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+496], r11
-        ; A[63] * B
-        mulx	r10, r9, QWORD PTR [rax+504]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        adcx	r11, r13        ; fold the final CF into the top word (r13 is still 0)
-        mov	QWORD PTR [rcx+504], r12
-        mov	QWORD PTR [rcx+512], r11        ; 65th result qword
-        pop	r13
-        pop	r12
-        ret
-sp_4096_mul_d_avx2_64 ENDP
-_text ENDS
-ENDIF
-IFDEF _WIN64
-; /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
-;  *
-;  * Win64 register arguments: rcx = d1, rdx = d0, r8 = div.
-;  *
-;  * d1   The high order half of the number to divide.
-;  * d0   The low order half of the number to divide.
-;  * div  The divisor.
-;  * returns the 64-bit quotient in rax (the remainder is left in rdx).
-;  *
-;  * The DIV instruction faults when div is zero or when the quotient does
-;  * not fit in 64 bits, so the caller must ensure d1 < div.
-;  */
-_text SEGMENT READONLY PARA
-div_4096_word_asm_64 PROC
-        ; DIV takes its 128-bit dividend in rdx:rax, so move d0 into rax
-        ; before rdx is overwritten with d1.
-        mov	rax, rdx
-        mov	rdx, rcx
-        div	r8
-        ret
-div_4096_word_asm_64 ENDP
-_text ENDS
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-; /* Conditionally subtract b from a using the mask m.
-;  * m is -1 to subtract and 0 when not subtracting.
-;  *
-;  * r  A single precision number representing the conditional subtraction result.
-;  * a  A single precision number to subtract from.
-;  * b  A single precision number to subtract.
-;  * m  Mask value to apply.
-;  */
-_text SEGMENT READONLY PARA
-sp_4096_cond_sub_avx2_64 PROC
-        push	r12  ; r12 is callee-saved under the Win64 ABI and is used as scratch below
-        mov	r12, QWORD PTR [r8]  ; r12 = b[0]   (rcx = r, rdx = a, r8 = b, r9 = m)
-        mov	r10, QWORD PTR [rdx]  ; r10 = a[0]
-        pext	r12, r12, r9  ; keeps b[0] when m is all ones, yields 0 when m is 0
-        sub	r10, r12  ; starts the borrow chain; limbs 1..63 continue it with sbb
-        mov	r12, QWORD PTR [r8+8]
-        mov	r11, QWORD PTR [rdx+8]
-        pext	r12, r12, r9  ; mov and pext do not modify flags, so the borrow survives to the sbb
-        mov	QWORD PTR [rcx], r10  ; r[0] = a[0] - masked b[0]
-        sbb	r11, r12  ; a[1] - masked b[1] - borrow; the same pattern repeats with r10/r11/r12 rotating
-        mov	r10, QWORD PTR [r8+16]
-        mov	r12, QWORD PTR [rdx+16]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+8], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [rdx+24]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+16], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [rdx+32]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+24], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+40]
-        mov	r12, QWORD PTR [rdx+40]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+32], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+48]
-        mov	r10, QWORD PTR [rdx+48]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+40], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+56]
-        mov	r11, QWORD PTR [rdx+56]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+48], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+64]
-        mov	r12, QWORD PTR [rdx+64]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+56], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+72]
-        mov	r10, QWORD PTR [rdx+72]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+64], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+80]
-        mov	r11, QWORD PTR [rdx+80]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+72], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+88]
-        mov	r12, QWORD PTR [rdx+88]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+80], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+96]
-        mov	r10, QWORD PTR [rdx+96]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+88], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+104]
-        mov	r11, QWORD PTR [rdx+104]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+96], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+112]
-        mov	r12, QWORD PTR [rdx+112]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+104], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+120]
-        mov	r10, QWORD PTR [rdx+120]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+112], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+128]
-        mov	r11, QWORD PTR [rdx+128]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+120], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+136]
-        mov	r12, QWORD PTR [rdx+136]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+128], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+144]
-        mov	r10, QWORD PTR [rdx+144]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+136], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+152]
-        mov	r11, QWORD PTR [rdx+152]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+144], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+160]
-        mov	r12, QWORD PTR [rdx+160]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+152], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+168]
-        mov	r10, QWORD PTR [rdx+168]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+160], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+176]
-        mov	r11, QWORD PTR [rdx+176]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+168], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+184]
-        mov	r12, QWORD PTR [rdx+184]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+176], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+192]
-        mov	r10, QWORD PTR [rdx+192]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+184], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+200]
-        mov	r11, QWORD PTR [rdx+200]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+192], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+208]
-        mov	r12, QWORD PTR [rdx+208]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+200], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+216]
-        mov	r10, QWORD PTR [rdx+216]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+208], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+224]
-        mov	r11, QWORD PTR [rdx+224]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+216], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+232]
-        mov	r12, QWORD PTR [rdx+232]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+224], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+240]
-        mov	r10, QWORD PTR [rdx+240]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+232], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+248]
-        mov	r11, QWORD PTR [rdx+248]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+240], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+256]
-        mov	r12, QWORD PTR [rdx+256]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+248], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+264]
-        mov	r10, QWORD PTR [rdx+264]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+256], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+272]
-        mov	r11, QWORD PTR [rdx+272]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+264], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+280]
-        mov	r12, QWORD PTR [rdx+280]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+272], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+288]
-        mov	r10, QWORD PTR [rdx+288]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+280], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+296]
-        mov	r11, QWORD PTR [rdx+296]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+288], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+304]
-        mov	r12, QWORD PTR [rdx+304]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+296], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+312]
-        mov	r10, QWORD PTR [rdx+312]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+304], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+320]
-        mov	r11, QWORD PTR [rdx+320]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+312], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+328]
-        mov	r12, QWORD PTR [rdx+328]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+320], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+336]
-        mov	r10, QWORD PTR [rdx+336]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+328], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+344]
-        mov	r11, QWORD PTR [rdx+344]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+336], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+352]
-        mov	r12, QWORD PTR [rdx+352]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+344], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+360]
-        mov	r10, QWORD PTR [rdx+360]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+352], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+368]
-        mov	r11, QWORD PTR [rdx+368]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+360], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+376]
-        mov	r12, QWORD PTR [rdx+376]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+368], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+384]
-        mov	r10, QWORD PTR [rdx+384]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+376], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+392]
-        mov	r11, QWORD PTR [rdx+392]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+384], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+400]
-        mov	r12, QWORD PTR [rdx+400]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+392], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+408]
-        mov	r10, QWORD PTR [rdx+408]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+400], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+416]
-        mov	r11, QWORD PTR [rdx+416]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+408], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+424]
-        mov	r12, QWORD PTR [rdx+424]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+416], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+432]
-        mov	r10, QWORD PTR [rdx+432]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+424], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+440]
-        mov	r11, QWORD PTR [rdx+440]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+432], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+448]
-        mov	r12, QWORD PTR [rdx+448]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+440], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+456]
-        mov	r10, QWORD PTR [rdx+456]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+448], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+464]
-        mov	r11, QWORD PTR [rdx+464]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+456], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+472]
-        mov	r12, QWORD PTR [rdx+472]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+464], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+480]
-        mov	r10, QWORD PTR [rdx+480]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+472], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+488]
-        mov	r11, QWORD PTR [rdx+488]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+480], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+496]
-        mov	r12, QWORD PTR [rdx+496]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+488], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+504]
-        mov	r10, QWORD PTR [rdx+504]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+496], r12
-        sbb	r10, r11
-        mov	QWORD PTR [rcx+504], r10  ; r[63], the last of the 64 limbs
-        sbb	rax, rax  ; return value: rax = 0 - borrow, i.e. -1 if the top limb borrowed, else 0
-        pop	r12
-        ret
-sp_4096_cond_sub_avx2_64 ENDP
-_text ENDS
-ENDIF
-; /* Compare a with b in constant time (no branches; all 64 limbs are always read).
-;  *
-;  * a  A single precision integer.
-;  * b  A single precision integer.
-;  * return -1, 0 or 1 in rax if a is less than, equal to or greater than b
-;  * respectively.
-;  */
-_text SEGMENT READONLY PARA
-sp_4096_cmp_64 PROC
-        push	r12  ; r12 is callee-saved under the Win64 ABI; used as scratch for b's limb
-        xor	r9, r9  ; r9 = 0, the value the mask is cleared to
-        mov	r8, -1  ; r8 = mask: all ones until the first differing limb has been seen
-        mov	rax, -1  ; rax = running result; the final xor with r8 turns -1 into 0 when all limbs match
-        mov	r10, 1  ; r10 = constant 1, the "a is greater" result
-        mov	r11, QWORD PTR [rcx+504]  ; start at the most significant limb of a (rcx) ...
-        mov	r12, QWORD PTR [rdx+504]  ; ... and of b (rdx), then walk down to offset 0
-        and	r11, r8  ; once the mask is 0 both limbs become 0 and compare equal
-        and	r12, r8
-        sub	r11, r12  ; only the flags matter; the three cmovs below do not modify them
-        cmova	rax, r10  ; a limb > b limb (unsigned): result = 1
-        cmovc	rax, r8  ; a limb < b limb: result = r8, which is still -1 at this point
-        cmovnz	r8, r9  ; limbs differ: clear the mask so lower limbs cannot change the result
-        mov	r11, QWORD PTR [rcx+496]
-        mov	r12, QWORD PTR [rdx+496]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+488]
-        mov	r12, QWORD PTR [rdx+488]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+480]
-        mov	r12, QWORD PTR [rdx+480]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+472]
-        mov	r12, QWORD PTR [rdx+472]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+464]
-        mov	r12, QWORD PTR [rdx+464]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+456]
-        mov	r12, QWORD PTR [rdx+456]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+448]
-        mov	r12, QWORD PTR [rdx+448]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+440]
-        mov	r12, QWORD PTR [rdx+440]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+432]
-        mov	r12, QWORD PTR [rdx+432]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+424]
-        mov	r12, QWORD PTR [rdx+424]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+416]
-        mov	r12, QWORD PTR [rdx+416]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+408]
-        mov	r12, QWORD PTR [rdx+408]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+400]
-        mov	r12, QWORD PTR [rdx+400]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+392]
-        mov	r12, QWORD PTR [rdx+392]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+384]
-        mov	r12, QWORD PTR [rdx+384]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+376]
-        mov	r12, QWORD PTR [rdx+376]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+368]
-        mov	r12, QWORD PTR [rdx+368]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+360]
-        mov	r12, QWORD PTR [rdx+360]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+352]
-        mov	r12, QWORD PTR [rdx+352]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+344]
-        mov	r12, QWORD PTR [rdx+344]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+336]
-        mov	r12, QWORD PTR [rdx+336]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+328]
-        mov	r12, QWORD PTR [rdx+328]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+320]
-        mov	r12, QWORD PTR [rdx+320]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+312]
-        mov	r12, QWORD PTR [rdx+312]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+304]
-        mov	r12, QWORD PTR [rdx+304]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+296]
-        mov	r12, QWORD PTR [rdx+296]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+288]
-        mov	r12, QWORD PTR [rdx+288]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+280]
-        mov	r12, QWORD PTR [rdx+280]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+272]
-        mov	r12, QWORD PTR [rdx+272]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+264]
-        mov	r12, QWORD PTR [rdx+264]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+256]
-        mov	r12, QWORD PTR [rdx+256]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+248]
-        mov	r12, QWORD PTR [rdx+248]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+240]
-        mov	r12, QWORD PTR [rdx+240]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+232]
-        mov	r12, QWORD PTR [rdx+232]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+224]
-        mov	r12, QWORD PTR [rdx+224]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+216]
-        mov	r12, QWORD PTR [rdx+216]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+208]
-        mov	r12, QWORD PTR [rdx+208]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+200]
-        mov	r12, QWORD PTR [rdx+200]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+192]
-        mov	r12, QWORD PTR [rdx+192]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+184]
-        mov	r12, QWORD PTR [rdx+184]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+176]
-        mov	r12, QWORD PTR [rdx+176]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+168]
-        mov	r12, QWORD PTR [rdx+168]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+160]
-        mov	r12, QWORD PTR [rdx+160]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+152]
-        mov	r12, QWORD PTR [rdx+152]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+144]
-        mov	r12, QWORD PTR [rdx+144]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+136]
-        mov	r12, QWORD PTR [rdx+136]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+128]
-        mov	r12, QWORD PTR [rdx+128]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+120]
-        mov	r12, QWORD PTR [rdx+120]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+112]
-        mov	r12, QWORD PTR [rdx+112]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+104]
-        mov	r12, QWORD PTR [rdx+104]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+96]
-        mov	r12, QWORD PTR [rdx+96]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+88]
-        mov	r12, QWORD PTR [rdx+88]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+80]
-        mov	r12, QWORD PTR [rdx+80]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+72]
-        mov	r12, QWORD PTR [rdx+72]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+64]
-        mov	r12, QWORD PTR [rdx+64]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+56]
-        mov	r12, QWORD PTR [rdx+56]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+48]
-        mov	r12, QWORD PTR [rdx+48]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+40]
-        mov	r12, QWORD PTR [rdx+40]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+32]
-        mov	r12, QWORD PTR [rdx+32]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+24]
-        mov	r12, QWORD PTR [rdx+24]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+16]
-        mov	r12, QWORD PTR [rdx+16]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+8]
-        mov	r12, QWORD PTR [rdx+8]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx]
-        mov	r12, QWORD PTR [rdx]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        xor	rax, r8  ; all limbs equal: -1 xor -1 = 0; otherwise r8 is 0 and rax (1 or -1) is returned as is
-        pop	r12
-        ret
-sp_4096_cmp_64 ENDP
-_text ENDS
-IFNDEF WC_NO_CACHE_RESISTANT
-_text SEGMENT READONLY PARA
-sp_4096_get_from_table_64 PROC
-        sub	rsp, 128
-        vmovdqu	OWORD PTR [rsp], xmm6
-        vmovdqu	OWORD PTR [rsp+16], xmm7
-        vmovdqu	OWORD PTR [rsp+32], xmm8
-        vmovdqu	OWORD PTR [rsp+48], xmm9
-        vmovdqu	OWORD PTR [rsp+64], xmm10
-        vmovdqu	OWORD PTR [rsp+80], xmm11
-        vmovdqu	OWORD PTR [rsp+96], xmm12
-        vmovdqu	OWORD PTR [rsp+112], xmm13
-        mov	rax, 1
-        movd	xmm10, r8
-        movd	xmm11, rax
-        pxor	xmm13, xmm13
-        pshufd	xmm11, xmm11, 0
-        pshufd	xmm10, xmm10, 0
-        ; START: 0-7
-        pxor	xmm13, xmm13
-        pxor	xmm4, xmm4
-        pxor	xmm5, xmm5
-        pxor	xmm6, xmm6
-        pxor	xmm7, xmm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        movdqu	[rcx], xmm4
-        movdqu	[rcx+16], xmm5
-        movdqu	[rcx+32], xmm6
-        movdqu	[rcx+48], xmm7
-        add	rcx, 64
-        ; END: 0-7
-        ; START: 8-15
-        pxor	xmm13, xmm13
-        pxor	xmm4, xmm4
-        pxor	xmm5, xmm5
-        pxor	xmm6, xmm6
-        pxor	xmm7, xmm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        add	r9, 64
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        movdqu	[rcx], xmm4
-        movdqu	[rcx+16], xmm5
-        movdqu	[rcx+32], xmm6
-        movdqu	[rcx+48], xmm7
-        add	rcx, 64
-        ; END: 8-15
-        ; START: 16-23
-        pxor	xmm13, xmm13
-        pxor	xmm4, xmm4
-        pxor	xmm5, xmm5
-        pxor	xmm6, xmm6
-        pxor	xmm7, xmm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        add	r9, 128
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        movdqu	[rcx], xmm4
-        movdqu	[rcx+16], xmm5
-        movdqu	[rcx+32], xmm6
-        movdqu	[rcx+48], xmm7
-        add	rcx, 64
-        ; END: 16-23
-        ; START: 24-31
-        pxor	xmm13, xmm13
-        pxor	xmm4, xmm4
-        pxor	xmm5, xmm5
-        pxor	xmm6, xmm6
-        pxor	xmm7, xmm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        add	r9, 192
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        movdqu	[rcx], xmm4
-        movdqu	[rcx+16], xmm5
-        movdqu	[rcx+32], xmm6
-        movdqu	[rcx+48], xmm7
-        add	rcx, 64
-        ; END: 24-31
-        ; START: 32-39
-        pxor	xmm13, xmm13
-        pxor	xmm4, xmm4
-        pxor	xmm5, xmm5
-        pxor	xmm6, xmm6
-        pxor	xmm7, xmm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        add	r9, 256
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        movdqu	[rcx], xmm4
-        movdqu	[rcx+16], xmm5
-        movdqu	[rcx+32], xmm6
-        movdqu	[rcx+48], xmm7
-        add	rcx, 64
-        ; END: 32-39
-        ; START: 40-47
-        pxor	xmm13, xmm13
-        pxor	xmm4, xmm4
-        pxor	xmm5, xmm5
-        pxor	xmm6, xmm6
-        pxor	xmm7, xmm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        add	r9, 320
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        movdqu	[rcx], xmm4
-        movdqu	[rcx+16], xmm5
-        movdqu	[rcx+32], xmm6
-        movdqu	[rcx+48], xmm7
-        add	rcx, 64
-        ; END: 40-47
-        ; START: 48-55
-        pxor	xmm13, xmm13
-        pxor	xmm4, xmm4
-        pxor	xmm5, xmm5
-        pxor	xmm6, xmm6
-        pxor	xmm7, xmm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        add	r9, 384
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        add	r9, 384
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        add	r9, 384
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        add	r9, 384
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        add	r9, 384
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        add	r9, 384
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        add	r9, 384
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        add	r9, 384
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        add	r9, 384
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        add	r9, 384
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        add	r9, 384
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        add	r9, 384
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        add	r9, 384
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        add	r9, 384
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        add	r9, 384
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        add	r9, 384
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        movdqu	[rcx], xmm4
-        movdqu	[rcx+16], xmm5
-        movdqu	[rcx+32], xmm6
-        movdqu	[rcx+48], xmm7
-        add	rcx, 64
-        ; END: 48-55
-        ; START: 56-63
-        pxor	xmm13, xmm13
-        pxor	xmm4, xmm4
-        pxor	xmm5, xmm5
-        pxor	xmm6, xmm6
-        pxor	xmm7, xmm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        add	r9, 448
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        add	r9, 448
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        add	r9, 448
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        add	r9, 448
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        add	r9, 448
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        add	r9, 448
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        add	r9, 448
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        add	r9, 448
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        add	r9, 448
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        add	r9, 448
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        add	r9, 448
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        add	r9, 448
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        add	r9, 448
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        add	r9, 448
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        add	r9, 448
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        add	r9, 448
-        movdqu	xmm12, xmm13
-        pcmpeqd	xmm12, xmm10
-        movdqu	xmm0, [r9]
-        movdqu	xmm1, [r9+16]
-        movdqu	xmm2, [r9+32]
-        movdqu	xmm3, [r9+48]
-        pand	xmm0, xmm12
-        pand	xmm1, xmm12
-        pand	xmm2, xmm12
-        pand	xmm3, xmm12
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        por	xmm6, xmm2
-        por	xmm7, xmm3
-        paddd	xmm13, xmm11
-        movdqu	[rcx], xmm4
-        movdqu	[rcx+16], xmm5
-        movdqu	[rcx+32], xmm6
-        movdqu	[rcx+48], xmm7
-        ; END: 56-63
-        vmovdqu	xmm6, OWORD PTR [rsp]
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vmovdqu	xmm8, OWORD PTR [rsp+32]
-        vmovdqu	xmm9, OWORD PTR [rsp+48]
-        vmovdqu	xmm10, OWORD PTR [rsp+64]
-        vmovdqu	xmm11, OWORD PTR [rsp+80]
-        vmovdqu	xmm12, OWORD PTR [rsp+96]
-        vmovdqu	xmm13, OWORD PTR [rsp+112]
-        add	rsp, 128
-        ret
-sp_4096_get_from_table_64 ENDP
-_text ENDS
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-; /* Reduce the number back to 4096 bits using Montgomery reduction.
-;  *
-;  * a   A single precision number to reduce in place (passed in rcx).
-;  * m   The single precision number representing the modulus (passed in rdx).
-;  * mp  The digit representing the negative inverse of m modulo 2^64,
-;  *     i.e. a single 64-bit word (passed in r8).
-;  */
-_text SEGMENT READONLY PARA
-sp_4096_mont_reduce_avx2_64 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        push	rbx
-        push	rbp
-        mov	r9, rcx
-        mov	r10, rdx
-        xor	rbp, rbp
-        ; i = 64
-        mov	r11, 64
-        mov	r14, QWORD PTR [r9]
-        mov	r15, QWORD PTR [r9+8]
-        mov	rdi, QWORD PTR [r9+16]
-        mov	rsi, QWORD PTR [r9+24]
-        add	r9, 256
-        xor	rbp, rbp
-L_4096_mont_reduce_avx2_64_loop:
-        ; mu = a[i] * mp
-        mov	rdx, r14
-        mov	r12, r14
-        imul	rdx, r8
-        xor	rbx, rbx
-        ; a[i+0] += m[0] * mu
-        mulx	rcx, rax, QWORD PTR [r10]
-        mov	r14, r15
-        adcx	r12, rax
-        adox	r14, rcx
-        ; a[i+1] += m[1] * mu
-        mulx	rcx, rax, QWORD PTR [r10+8]
-        mov	r15, rdi
-        adcx	r14, rax
-        adox	r15, rcx
-        ; a[i+2] += m[2] * mu
-        mulx	rcx, rax, QWORD PTR [r10+16]
-        mov	rdi, rsi
-        adcx	r15, rax
-        adox	rdi, rcx
-        ; a[i+3] += m[3] * mu
-        mulx	rcx, rax, QWORD PTR [r10+24]
-        mov	rsi, QWORD PTR [r9+-224]
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; a[i+4] += m[4] * mu
-        mulx	rcx, rax, QWORD PTR [r10+32]
-        mov	r13, QWORD PTR [r9+-216]
-        adcx	rsi, rax
-        adox	r13, rcx
-        ; a[i+5] += m[5] * mu
-        mulx	rcx, rax, QWORD PTR [r10+40]
-        mov	r12, QWORD PTR [r9+-208]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-216], r13
-        ; a[i+6] += m[6] * mu
-        mulx	rcx, rax, QWORD PTR [r10+48]
-        mov	r13, QWORD PTR [r9+-200]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-208], r12
-        ; a[i+7] += m[7] * mu
-        mulx	rcx, rax, QWORD PTR [r10+56]
-        mov	r12, QWORD PTR [r9+-192]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-200], r13
-        ; a[i+8] += m[8] * mu
-        mulx	rcx, rax, QWORD PTR [r10+64]
-        mov	r13, QWORD PTR [r9+-184]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-192], r12
-        ; a[i+9] += m[9] * mu
-        mulx	rcx, rax, QWORD PTR [r10+72]
-        mov	r12, QWORD PTR [r9+-176]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-184], r13
-        ; a[i+10] += m[10] * mu
-        mulx	rcx, rax, QWORD PTR [r10+80]
-        mov	r13, QWORD PTR [r9+-168]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-176], r12
-        ; a[i+11] += m[11] * mu
-        mulx	rcx, rax, QWORD PTR [r10+88]
-        mov	r12, QWORD PTR [r9+-160]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-168], r13
-        ; a[i+12] += m[12] * mu
-        mulx	rcx, rax, QWORD PTR [r10+96]
-        mov	r13, QWORD PTR [r9+-152]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-160], r12
-        ; a[i+13] += m[13] * mu
-        mulx	rcx, rax, QWORD PTR [r10+104]
-        mov	r12, QWORD PTR [r9+-144]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-152], r13
-        ; a[i+14] += m[14] * mu
-        mulx	rcx, rax, QWORD PTR [r10+112]
-        mov	r13, QWORD PTR [r9+-136]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-144], r12
-        ; a[i+15] += m[15] * mu
-        mulx	rcx, rax, QWORD PTR [r10+120]
-        mov	r12, QWORD PTR [r9+-128]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-136], r13
-        ; a[i+16] += m[16] * mu
-        mulx	rcx, rax, QWORD PTR [r10+128]
-        mov	r13, QWORD PTR [r9+-120]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-128], r12
-        ; a[i+17] += m[17] * mu
-        mulx	rcx, rax, QWORD PTR [r10+136]
-        mov	r12, QWORD PTR [r9+-112]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-120], r13
-        ; a[i+18] += m[18] * mu
-        mulx	rcx, rax, QWORD PTR [r10+144]
-        mov	r13, QWORD PTR [r9+-104]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-112], r12
-        ; a[i+19] += m[19] * mu
-        mulx	rcx, rax, QWORD PTR [r10+152]
-        mov	r12, QWORD PTR [r9+-96]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-104], r13
-        ; a[i+20] += m[20] * mu
-        mulx	rcx, rax, QWORD PTR [r10+160]
-        mov	r13, QWORD PTR [r9+-88]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-96], r12
-        ; a[i+21] += m[21] * mu
-        mulx	rcx, rax, QWORD PTR [r10+168]
-        mov	r12, QWORD PTR [r9+-80]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-88], r13
-        ; a[i+22] += m[22] * mu
-        mulx	rcx, rax, QWORD PTR [r10+176]
-        mov	r13, QWORD PTR [r9+-72]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-80], r12
-        ; a[i+23] += m[23] * mu
-        mulx	rcx, rax, QWORD PTR [r10+184]
-        mov	r12, QWORD PTR [r9+-64]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-72], r13
-        ; a[i+24] += m[24] * mu
-        mulx	rcx, rax, QWORD PTR [r10+192]
-        mov	r13, QWORD PTR [r9+-56]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-64], r12
-        ; a[i+25] += m[25] * mu
-        mulx	rcx, rax, QWORD PTR [r10+200]
-        mov	r12, QWORD PTR [r9+-48]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-56], r13
-        ; a[i+26] += m[26] * mu
-        mulx	rcx, rax, QWORD PTR [r10+208]
-        mov	r13, QWORD PTR [r9+-40]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-48], r12
-        ; a[i+27] += m[27] * mu
-        mulx	rcx, rax, QWORD PTR [r10+216]
-        mov	r12, QWORD PTR [r9+-32]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-40], r13
-        ; a[i+28] += m[28] * mu
-        mulx	rcx, rax, QWORD PTR [r10+224]
-        mov	r13, QWORD PTR [r9+-24]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-32], r12
-        ; a[i+29] += m[29] * mu
-        mulx	rcx, rax, QWORD PTR [r10+232]
-        mov	r12, QWORD PTR [r9+-16]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-24], r13
-        ; a[i+30] += m[30] * mu
-        mulx	rcx, rax, QWORD PTR [r10+240]
-        mov	r13, QWORD PTR [r9+-8]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-16], r12
-        ; a[i+31] += m[31] * mu
-        mulx	rcx, rax, QWORD PTR [r10+248]
-        mov	r12, QWORD PTR [r9]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-8], r13
-        ; a[i+32] += m[32] * mu
-        mulx	rcx, rax, QWORD PTR [r10+256]
-        mov	r13, QWORD PTR [r9+8]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9], r12
-        ; a[i+33] += m[33] * mu
-        mulx	rcx, rax, QWORD PTR [r10+264]
-        mov	r12, QWORD PTR [r9+16]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+8], r13
-        ; a[i+34] += m[34] * mu
-        mulx	rcx, rax, QWORD PTR [r10+272]
-        mov	r13, QWORD PTR [r9+24]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+16], r12
-        ; a[i+35] += m[35] * mu
-        mulx	rcx, rax, QWORD PTR [r10+280]
-        mov	r12, QWORD PTR [r9+32]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+24], r13
-        ; a[i+36] += m[36] * mu
-        mulx	rcx, rax, QWORD PTR [r10+288]
-        mov	r13, QWORD PTR [r9+40]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+32], r12
-        ; a[i+37] += m[37] * mu
-        mulx	rcx, rax, QWORD PTR [r10+296]
-        mov	r12, QWORD PTR [r9+48]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+40], r13
-        ; a[i+38] += m[38] * mu
-        mulx	rcx, rax, QWORD PTR [r10+304]
-        mov	r13, QWORD PTR [r9+56]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+48], r12
-        ; a[i+39] += m[39] * mu
-        mulx	rcx, rax, QWORD PTR [r10+312]
-        mov	r12, QWORD PTR [r9+64]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+56], r13
-        ; a[i+40] += m[40] * mu
-        mulx	rcx, rax, QWORD PTR [r10+320]
-        mov	r13, QWORD PTR [r9+72]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+64], r12
-        ; a[i+41] += m[41] * mu
-        mulx	rcx, rax, QWORD PTR [r10+328]
-        mov	r12, QWORD PTR [r9+80]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+72], r13
-        ; a[i+42] += m[42] * mu
-        mulx	rcx, rax, QWORD PTR [r10+336]
-        mov	r13, QWORD PTR [r9+88]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+80], r12
-        ; a[i+43] += m[43] * mu
-        mulx	rcx, rax, QWORD PTR [r10+344]
-        mov	r12, QWORD PTR [r9+96]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+88], r13
-        ; a[i+44] += m[44] * mu
-        mulx	rcx, rax, QWORD PTR [r10+352]
-        mov	r13, QWORD PTR [r9+104]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+96], r12
-        ; a[i+45] += m[45] * mu
-        mulx	rcx, rax, QWORD PTR [r10+360]
-        mov	r12, QWORD PTR [r9+112]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+104], r13
-        ; a[i+46] += m[46] * mu
-        mulx	rcx, rax, QWORD PTR [r10+368]
-        mov	r13, QWORD PTR [r9+120]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+112], r12
-        ; a[i+47] += m[47] * mu
-        mulx	rcx, rax, QWORD PTR [r10+376]
-        mov	r12, QWORD PTR [r9+128]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+120], r13
-        ; a[i+48] += m[48] * mu
-        mulx	rcx, rax, QWORD PTR [r10+384]
-        mov	r13, QWORD PTR [r9+136]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+128], r12
-        ; a[i+49] += m[49] * mu
-        mulx	rcx, rax, QWORD PTR [r10+392]
-        mov	r12, QWORD PTR [r9+144]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+136], r13
-        ; a[i+50] += m[50] * mu
-        mulx	rcx, rax, QWORD PTR [r10+400]
-        mov	r13, QWORD PTR [r9+152]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+144], r12
-        ; a[i+51] += m[51] * mu
-        mulx	rcx, rax, QWORD PTR [r10+408]
-        mov	r12, QWORD PTR [r9+160]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+152], r13
-        ; a[i+52] += m[52] * mu
-        mulx	rcx, rax, QWORD PTR [r10+416]
-        mov	r13, QWORD PTR [r9+168]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+160], r12
-        ; a[i+53] += m[53] * mu
-        mulx	rcx, rax, QWORD PTR [r10+424]
-        mov	r12, QWORD PTR [r9+176]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+168], r13
-        ; a[i+54] += m[54] * mu
-        mulx	rcx, rax, QWORD PTR [r10+432]
-        mov	r13, QWORD PTR [r9+184]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+176], r12
-        ; a[i+55] += m[55] * mu
-        mulx	rcx, rax, QWORD PTR [r10+440]
-        mov	r12, QWORD PTR [r9+192]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+184], r13
-        ; a[i+56] += m[56] * mu
-        mulx	rcx, rax, QWORD PTR [r10+448]
-        mov	r13, QWORD PTR [r9+200]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+192], r12
-        ; a[i+57] += m[57] * mu
-        mulx	rcx, rax, QWORD PTR [r10+456]
-        mov	r12, QWORD PTR [r9+208]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+200], r13
-        ; a[i+58] += m[58] * mu
-        mulx	rcx, rax, QWORD PTR [r10+464]
-        mov	r13, QWORD PTR [r9+216]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+208], r12
-        ; a[i+59] += m[59] * mu
-        mulx	rcx, rax, QWORD PTR [r10+472]
-        mov	r12, QWORD PTR [r9+224]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+216], r13
-        ; a[i+60] += m[60] * mu
-        mulx	rcx, rax, QWORD PTR [r10+480]
-        mov	r13, QWORD PTR [r9+232]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+224], r12
-        ; a[i+61] += m[61] * mu
-        mulx	rcx, rax, QWORD PTR [r10+488]
-        mov	r12, QWORD PTR [r9+240]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+232], r13
-        ; a[i+62] += m[62] * mu
-        mulx	rcx, rax, QWORD PTR [r10+496]
-        mov	r13, QWORD PTR [r9+248]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+240], r12
-        ; a[i+63] += m[63] * mu
-        mulx	rcx, rax, QWORD PTR [r10+504]
-        mov	r12, QWORD PTR [r9+256]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+248], r13
-        adcx	r12, rbp
-        mov	rbp, rbx
-        mov	QWORD PTR [r9+256], r12
-        adox	rbp, rbx
-        adcx	rbp, rbx
-        ; a += 1
-        add	r9, 8
-        ; i -= 1
-        sub	r11, 1
-        jnz	L_4096_mont_reduce_avx2_64_loop
-        sub	r9, 256
-        neg	rbp
-        mov	r8, r9
-        sub	r9, 512
-        mov	rcx, QWORD PTR [r10]
-        mov	rdx, r14
-        pext	rcx, rcx, rbp
-        sub	rdx, rcx
-        mov	rcx, QWORD PTR [r10+8]
-        mov	rax, r15
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+16]
-        mov	rcx, rdi
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+8], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+24]
-        mov	rdx, rsi
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+16], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+32]
-        mov	rax, QWORD PTR [r8+32]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+24], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+40]
-        mov	rcx, QWORD PTR [r8+40]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+32], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+48]
-        mov	rdx, QWORD PTR [r8+48]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+40], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+56]
-        mov	rax, QWORD PTR [r8+56]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+48], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+64]
-        mov	rcx, QWORD PTR [r8+64]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+56], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+72]
-        mov	rdx, QWORD PTR [r8+72]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+64], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+80]
-        mov	rax, QWORD PTR [r8+80]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+72], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+88]
-        mov	rcx, QWORD PTR [r8+88]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+80], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+96]
-        mov	rdx, QWORD PTR [r8+96]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+88], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+104]
-        mov	rax, QWORD PTR [r8+104]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+96], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+112]
-        mov	rcx, QWORD PTR [r8+112]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+104], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+120]
-        mov	rdx, QWORD PTR [r8+120]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+112], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+128]
-        mov	rax, QWORD PTR [r8+128]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+120], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+136]
-        mov	rcx, QWORD PTR [r8+136]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+128], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+144]
-        mov	rdx, QWORD PTR [r8+144]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+136], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+152]
-        mov	rax, QWORD PTR [r8+152]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+144], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+160]
-        mov	rcx, QWORD PTR [r8+160]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+152], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+168]
-        mov	rdx, QWORD PTR [r8+168]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+160], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+176]
-        mov	rax, QWORD PTR [r8+176]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+168], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+184]
-        mov	rcx, QWORD PTR [r8+184]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+176], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+192]
-        mov	rdx, QWORD PTR [r8+192]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+184], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+200]
-        mov	rax, QWORD PTR [r8+200]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+192], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+208]
-        mov	rcx, QWORD PTR [r8+208]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+200], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+216]
-        mov	rdx, QWORD PTR [r8+216]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+208], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+224]
-        mov	rax, QWORD PTR [r8+224]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+216], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+232]
-        mov	rcx, QWORD PTR [r8+232]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+224], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+240]
-        mov	rdx, QWORD PTR [r8+240]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+232], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+248]
-        mov	rax, QWORD PTR [r8+248]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+240], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+256]
-        mov	rcx, QWORD PTR [r8+256]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+248], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+264]
-        mov	rdx, QWORD PTR [r8+264]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+256], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+272]
-        mov	rax, QWORD PTR [r8+272]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+264], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+280]
-        mov	rcx, QWORD PTR [r8+280]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+272], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+288]
-        mov	rdx, QWORD PTR [r8+288]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+280], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+296]
-        mov	rax, QWORD PTR [r8+296]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+288], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+304]
-        mov	rcx, QWORD PTR [r8+304]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+296], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+312]
-        mov	rdx, QWORD PTR [r8+312]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+304], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+320]
-        mov	rax, QWORD PTR [r8+320]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+312], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+328]
-        mov	rcx, QWORD PTR [r8+328]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+320], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+336]
-        mov	rdx, QWORD PTR [r8+336]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+328], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+344]
-        mov	rax, QWORD PTR [r8+344]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+336], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+352]
-        mov	rcx, QWORD PTR [r8+352]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+344], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+360]
-        mov	rdx, QWORD PTR [r8+360]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+352], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+368]
-        mov	rax, QWORD PTR [r8+368]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+360], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+376]
-        mov	rcx, QWORD PTR [r8+376]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+368], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+384]
-        mov	rdx, QWORD PTR [r8+384]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+376], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+392]
-        mov	rax, QWORD PTR [r8+392]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+384], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+400]
-        mov	rcx, QWORD PTR [r8+400]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+392], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+408]
-        mov	rdx, QWORD PTR [r8+408]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+400], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+416]
-        mov	rax, QWORD PTR [r8+416]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+408], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+424]
-        mov	rcx, QWORD PTR [r8+424]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+416], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+432]
-        mov	rdx, QWORD PTR [r8+432]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+424], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+440]
-        mov	rax, QWORD PTR [r8+440]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+432], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+448]
-        mov	rcx, QWORD PTR [r8+448]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+440], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+456]
-        mov	rdx, QWORD PTR [r8+456]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+448], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+464]
-        mov	rax, QWORD PTR [r8+464]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+456], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+472]
-        mov	rcx, QWORD PTR [r8+472]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+464], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+480]
-        mov	rdx, QWORD PTR [r8+480]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+472], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+488]
-        mov	rax, QWORD PTR [r8+488]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+480], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+496]
-        mov	rcx, QWORD PTR [r8+496]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+488], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+504]
-        mov	rdx, QWORD PTR [r8+504]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+496], rcx
-        sbb	rdx, rax
-        mov	QWORD PTR [r9+504], rdx
-        pop	rbp
-        pop	rbx
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_4096_mont_reduce_avx2_64 ENDP
-_text ENDS
-ENDIF
-IFNDEF WC_NO_CACHE_RESISTANT
-_text SEGMENT READONLY PARA
-sp_4096_get_from_table_avx2_64 PROC
-        sub	rsp, 128
-        vmovdqu	OWORD PTR [rsp], xmm6
-        vmovdqu	OWORD PTR [rsp+16], xmm7
-        vmovdqu	OWORD PTR [rsp+32], xmm8
-        vmovdqu	OWORD PTR [rsp+48], xmm9
-        vmovdqu	OWORD PTR [rsp+64], xmm10
-        vmovdqu	OWORD PTR [rsp+80], xmm11
-        vmovdqu	OWORD PTR [rsp+96], xmm12
-        vmovdqu	OWORD PTR [rsp+112], xmm13
-        mov	rax, 1
-        movd	xmm10, r8
-        movd	xmm11, rax
-        vpxor	ymm13, ymm13, ymm13
-        vpermd	ymm10, ymm13, ymm10
-        vpermd	ymm11, ymm13, ymm11
-        ; START: 0-15
-        vpxor	ymm13, ymm13, ymm13
-        vpxor	ymm4, ymm4, ymm4
-        vpxor	ymm5, ymm5, ymm5
-        vpxor	ymm6, ymm6, ymm6
-        vpxor	ymm7, ymm7, ymm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        vmovdqu	YMMWORD PTR [rcx], ymm4
-        vmovdqu	YMMWORD PTR [rcx+32], ymm5
-        vmovdqu	YMMWORD PTR [rcx+64], ymm6
-        vmovdqu	YMMWORD PTR [rcx+96], ymm7
-        add	rcx, 128
-        ; END: 0-15
-        ; START: 16-31
-        vpxor	ymm13, ymm13, ymm13
-        vpxor	ymm4, ymm4, ymm4
-        vpxor	ymm5, ymm5, ymm5
-        vpxor	ymm6, ymm6, ymm6
-        vpxor	ymm7, ymm7, ymm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        add	r9, 128
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        vmovdqu	YMMWORD PTR [rcx], ymm4
-        vmovdqu	YMMWORD PTR [rcx+32], ymm5
-        vmovdqu	YMMWORD PTR [rcx+64], ymm6
-        vmovdqu	YMMWORD PTR [rcx+96], ymm7
-        add	rcx, 128
-        ; END: 16-31
-        ; START: 32-47
-        vpxor	ymm13, ymm13, ymm13
-        vpxor	ymm4, ymm4, ymm4
-        vpxor	ymm5, ymm5, ymm5
-        vpxor	ymm6, ymm6, ymm6
-        vpxor	ymm7, ymm7, ymm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        add	r9, 256
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        vmovdqu	YMMWORD PTR [rcx], ymm4
-        vmovdqu	YMMWORD PTR [rcx+32], ymm5
-        vmovdqu	YMMWORD PTR [rcx+64], ymm6
-        vmovdqu	YMMWORD PTR [rcx+96], ymm7
-        add	rcx, 128
-        ; END: 32-47
-        ; START: 48-63
-        vpxor	ymm13, ymm13, ymm13
-        vpxor	ymm4, ymm4, ymm4
-        vpxor	ymm5, ymm5, ymm5
-        vpxor	ymm6, ymm6, ymm6
-        vpxor	ymm7, ymm7, ymm7
-        ; ENTRY: 0
-        mov	r9, QWORD PTR [rdx]
-        add	r9, 384
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 1
-        mov	r9, QWORD PTR [rdx+8]
-        add	r9, 384
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 2
-        mov	r9, QWORD PTR [rdx+16]
-        add	r9, 384
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 3
-        mov	r9, QWORD PTR [rdx+24]
-        add	r9, 384
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 4
-        mov	r9, QWORD PTR [rdx+32]
-        add	r9, 384
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 5
-        mov	r9, QWORD PTR [rdx+40]
-        add	r9, 384
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 6
-        mov	r9, QWORD PTR [rdx+48]
-        add	r9, 384
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 7
-        mov	r9, QWORD PTR [rdx+56]
-        add	r9, 384
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 8
-        mov	r9, QWORD PTR [rdx+64]
-        add	r9, 384
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 9
-        mov	r9, QWORD PTR [rdx+72]
-        add	r9, 384
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 10
-        mov	r9, QWORD PTR [rdx+80]
-        add	r9, 384
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 11
-        mov	r9, QWORD PTR [rdx+88]
-        add	r9, 384
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 12
-        mov	r9, QWORD PTR [rdx+96]
-        add	r9, 384
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 13
-        mov	r9, QWORD PTR [rdx+104]
-        add	r9, 384
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 14
-        mov	r9, QWORD PTR [rdx+112]
-        add	r9, 384
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        ; ENTRY: 15
-        mov	r9, QWORD PTR [rdx+120]
-        add	r9, 384
-        vpcmpeqd	ymm12, ymm13, ymm10
-        vmovdqu	ymm0, YMMWORD PTR [r9]
-        vmovdqu	ymm1, YMMWORD PTR [r9+32]
-        vmovdqu	ymm2, YMMWORD PTR [r9+64]
-        vmovdqu	ymm3, YMMWORD PTR [r9+96]
-        vpand	ymm0, ymm0, ymm12
-        vpand	ymm1, ymm1, ymm12
-        vpand	ymm2, ymm2, ymm12
-        vpand	ymm3, ymm3, ymm12
-        vpor	ymm4, ymm4, ymm0
-        vpor	ymm5, ymm5, ymm1
-        vpor	ymm6, ymm6, ymm2
-        vpor	ymm7, ymm7, ymm3
-        vpaddd	ymm13, ymm13, ymm11
-        vmovdqu	YMMWORD PTR [rcx], ymm4
-        vmovdqu	YMMWORD PTR [rcx+32], ymm5
-        vmovdqu	YMMWORD PTR [rcx+64], ymm6
-        vmovdqu	YMMWORD PTR [rcx+96], ymm7
-        ; END: 48-63
-        vmovdqu	xmm6, OWORD PTR [rsp]
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vmovdqu	xmm8, OWORD PTR [rsp+32]
-        vmovdqu	xmm9, OWORD PTR [rsp+48]
-        vmovdqu	xmm10, OWORD PTR [rsp+64]
-        vmovdqu	xmm11, OWORD PTR [rsp+80]
-        vmovdqu	xmm12, OWORD PTR [rsp+96]
-        vmovdqu	xmm13, OWORD PTR [rsp+112]
-        add	rsp, 128
-        ret
-sp_4096_get_from_table_avx2_64 ENDP
-_text ENDS
-ENDIF
-; /* Conditionally add b to a using the mask m: r = a + (b AND m), over 32 64-bit words.
-;  * m is -1 to add b and 0 to add nothing; the carry out of the top word is returned in rax.
-;  * Pointers/mask are read from rcx (r), rdx (a), r8 (b) and r9 (m); r10, r11 are scratch.
-;  * r  A single precision number representing conditional add result.
-;  * a  A single precision number to add with.
-;  * b  A single precision number to add.
-;  * m  Mask value to apply (ANDed with every word of b).
-;  */
-_text SEGMENT READONLY PARA
-sp_4096_cond_add_32 PROC
-        sub	rsp, 256                        ; 256-byte stack scratch for the 32 masked words of b. NOTE(review): rsp moves with no PROC FRAME/.allocstack unwind data - confirm acceptable for this build
-        mov	rax, 0                          ; rax = 0 now; the final "adc rax, 0" turns it into the carry out
-        mov	r10, QWORD PTR [r8]             ; pass 1: scratch[i] = b[i] AND m, two words per step
-        mov	r11, QWORD PTR [r8+8]
-        and	r10, r9                         ; masking is done up front because "and" overwrites CF and so cannot sit inside the adc chain below
-        and	r11, r9
-        mov	QWORD PTR [rsp], r10
-        mov	QWORD PTR [rsp+8], r11
-        mov	r10, QWORD PTR [r8+16]
-        mov	r11, QWORD PTR [r8+24]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+16], r10
-        mov	QWORD PTR [rsp+24], r11
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+32], r10
-        mov	QWORD PTR [rsp+40], r11
-        mov	r10, QWORD PTR [r8+48]
-        mov	r11, QWORD PTR [r8+56]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+48], r10
-        mov	QWORD PTR [rsp+56], r11
-        mov	r10, QWORD PTR [r8+64]
-        mov	r11, QWORD PTR [r8+72]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+64], r10
-        mov	QWORD PTR [rsp+72], r11
-        mov	r10, QWORD PTR [r8+80]
-        mov	r11, QWORD PTR [r8+88]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+80], r10
-        mov	QWORD PTR [rsp+88], r11
-        mov	r10, QWORD PTR [r8+96]
-        mov	r11, QWORD PTR [r8+104]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+96], r10
-        mov	QWORD PTR [rsp+104], r11
-        mov	r10, QWORD PTR [r8+112]
-        mov	r11, QWORD PTR [r8+120]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+112], r10
-        mov	QWORD PTR [rsp+120], r11
-        mov	r10, QWORD PTR [r8+128]
-        mov	r11, QWORD PTR [r8+136]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+128], r10
-        mov	QWORD PTR [rsp+136], r11
-        mov	r10, QWORD PTR [r8+144]
-        mov	r11, QWORD PTR [r8+152]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+144], r10
-        mov	QWORD PTR [rsp+152], r11
-        mov	r10, QWORD PTR [r8+160]
-        mov	r11, QWORD PTR [r8+168]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+160], r10
-        mov	QWORD PTR [rsp+168], r11
-        mov	r10, QWORD PTR [r8+176]
-        mov	r11, QWORD PTR [r8+184]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+176], r10
-        mov	QWORD PTR [rsp+184], r11
-        mov	r10, QWORD PTR [r8+192]
-        mov	r11, QWORD PTR [r8+200]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+192], r10
-        mov	QWORD PTR [rsp+200], r11
-        mov	r10, QWORD PTR [r8+208]
-        mov	r11, QWORD PTR [r8+216]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+208], r10
-        mov	QWORD PTR [rsp+216], r11
-        mov	r10, QWORD PTR [r8+224]
-        mov	r11, QWORD PTR [r8+232]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+224], r10
-        mov	QWORD PTR [rsp+232], r11
-        mov	r10, QWORD PTR [r8+240]
-        mov	r11, QWORD PTR [r8+248]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+240], r10
-        mov	QWORD PTR [rsp+248], r11        ; all 32 words of b have been read before the first store to r below
-        mov	r10, QWORD PTR [rdx]            ; pass 2: r[i] = a[i] + scratch[i], carry propagated through CF
-        mov	r8, QWORD PTR [rsp]             ; r8 is reused as a temporary from here on (the b pointer is no longer read)
-        add	r10, r8                         ; lowest word uses a plain add, which starts the carry chain
-        mov	r11, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [rsp+8]
-        adc	r11, r8                         ; every later word uses adc; only mov (flag-neutral) sits between the adc instructions
-        mov	QWORD PTR [rcx], r10            ; each store to r trails its add by one step; r10/r11 alternate as accumulators
-        mov	r10, QWORD PTR [rdx+16]
-        mov	r8, QWORD PTR [rsp+16]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+8], r11
-        mov	r11, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [rsp+24]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+16], r10
-        mov	r10, QWORD PTR [rdx+32]
-        mov	r8, QWORD PTR [rsp+32]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+24], r11
-        mov	r11, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [rsp+40]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+32], r10
-        mov	r10, QWORD PTR [rdx+48]
-        mov	r8, QWORD PTR [rsp+48]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+40], r11
-        mov	r11, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [rsp+56]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+48], r10
-        mov	r10, QWORD PTR [rdx+64]
-        mov	r8, QWORD PTR [rsp+64]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+56], r11
-        mov	r11, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [rsp+72]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+64], r10
-        mov	r10, QWORD PTR [rdx+80]
-        mov	r8, QWORD PTR [rsp+80]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+72], r11
-        mov	r11, QWORD PTR [rdx+88]
-        mov	r8, QWORD PTR [rsp+88]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+80], r10
-        mov	r10, QWORD PTR [rdx+96]
-        mov	r8, QWORD PTR [rsp+96]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+88], r11
-        mov	r11, QWORD PTR [rdx+104]
-        mov	r8, QWORD PTR [rsp+104]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+96], r10
-        mov	r10, QWORD PTR [rdx+112]
-        mov	r8, QWORD PTR [rsp+112]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+104], r11
-        mov	r11, QWORD PTR [rdx+120]
-        mov	r8, QWORD PTR [rsp+120]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+112], r10
-        mov	r10, QWORD PTR [rdx+128]
-        mov	r8, QWORD PTR [rsp+128]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+120], r11
-        mov	r11, QWORD PTR [rdx+136]
-        mov	r8, QWORD PTR [rsp+136]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+128], r10
-        mov	r10, QWORD PTR [rdx+144]
-        mov	r8, QWORD PTR [rsp+144]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+136], r11
-        mov	r11, QWORD PTR [rdx+152]
-        mov	r8, QWORD PTR [rsp+152]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+144], r10
-        mov	r10, QWORD PTR [rdx+160]
-        mov	r8, QWORD PTR [rsp+160]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+152], r11
-        mov	r11, QWORD PTR [rdx+168]
-        mov	r8, QWORD PTR [rsp+168]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+160], r10
-        mov	r10, QWORD PTR [rdx+176]
-        mov	r8, QWORD PTR [rsp+176]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+168], r11
-        mov	r11, QWORD PTR [rdx+184]
-        mov	r8, QWORD PTR [rsp+184]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+176], r10
-        mov	r10, QWORD PTR [rdx+192]
-        mov	r8, QWORD PTR [rsp+192]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+184], r11
-        mov	r11, QWORD PTR [rdx+200]
-        mov	r8, QWORD PTR [rsp+200]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+192], r10
-        mov	r10, QWORD PTR [rdx+208]
-        mov	r8, QWORD PTR [rsp+208]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+200], r11
-        mov	r11, QWORD PTR [rdx+216]
-        mov	r8, QWORD PTR [rsp+216]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+208], r10
-        mov	r10, QWORD PTR [rdx+224]
-        mov	r8, QWORD PTR [rsp+224]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+216], r11
-        mov	r11, QWORD PTR [rdx+232]
-        mov	r8, QWORD PTR [rsp+232]
-        adc	r11, r8
-        mov	QWORD PTR [rcx+224], r10
-        mov	r10, QWORD PTR [rdx+240]
-        mov	r8, QWORD PTR [rsp+240]
-        adc	r10, r8
-        mov	QWORD PTR [rcx+232], r11
-        mov	r11, QWORD PTR [rdx+248]
-        mov	r8, QWORD PTR [rsp+248]
-        adc	r11, r8                         ; top word (index 31); CF now holds the carry out of the whole addition
-        mov	QWORD PTR [rcx+240], r10
-        mov	QWORD PTR [rcx+248], r11
-        adc	rax, 0                          ; rax = 0 + 0 + CF, i.e. the carry out (0 or 1), captured before "add rsp" changes the flags
-        add	rsp, 256                        ; release the scratch area (its contents are not cleared)
-        ret
-sp_4096_cond_add_32 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Conditionally add a and b using the mask m.
-;  * m is -1 to add and 0 when not.
-;  *
-;  * r  A single precision number representing conditional add result.
-;  * a  A single precision number to add with.
-;  * b  A single precision number to add.
-;  * m  Mask value to apply.
-;  */
-_text SEGMENT READONLY PARA
-sp_4096_cond_add_avx2_32 PROC
-        ; Computes r = a + (b masked by m) over 32 64-bit words and returns
-        ; the carry out of the top word in rax.
-        ; Registers as used below:
-        ;   rcx = r (written), rdx = a (read), r8 = b (read), r9 = m.
-        ; Each word of b is masked with pext: a mask of all ones keeps the
-        ; word unchanged and a mask of zero yields zero.
-        ; r10/r11/r12 rotate as temporaries so the load, mask and store of
-        ; neighbouring words can sit between the add/adc instructions; mov
-        ; and pext leave CF untouched, so the carry chain stays intact.
-        ; r12 is saved and restored around the body.
-        push	r12
-        mov	rax, 0
-        mov	r12, QWORD PTR [r8]
-        mov	r10, QWORD PTR [rdx]
-        pext	r12, r12, r9
-        ; Word 0 starts the carry chain with add; every later word uses adc.
-        add	r10, r12
-        mov	r12, QWORD PTR [r8+8]
-        mov	r11, QWORD PTR [rdx+8]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx], r10
-        adc	r11, r12
-        mov	r10, QWORD PTR [r8+16]
-        mov	r12, QWORD PTR [rdx+16]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+8], r11
-        adc	r12, r10
-        mov	r11, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [rdx+24]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+16], r12
-        adc	r10, r11
-        mov	r12, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [rdx+32]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+24], r10
-        adc	r11, r12
-        mov	r10, QWORD PTR [r8+40]
-        mov	r12, QWORD PTR [rdx+40]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+32], r11
-        adc	r12, r10
-        mov	r11, QWORD PTR [r8+48]
-        mov	r10, QWORD PTR [rdx+48]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+40], r12
-        adc	r10, r11
-        mov	r12, QWORD PTR [r8+56]
-        mov	r11, QWORD PTR [rdx+56]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+48], r10
-        adc	r11, r12
-        mov	r10, QWORD PTR [r8+64]
-        mov	r12, QWORD PTR [rdx+64]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+56], r11
-        adc	r12, r10
-        mov	r11, QWORD PTR [r8+72]
-        mov	r10, QWORD PTR [rdx+72]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+64], r12
-        adc	r10, r11
-        mov	r12, QWORD PTR [r8+80]
-        mov	r11, QWORD PTR [rdx+80]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+72], r10
-        adc	r11, r12
-        mov	r10, QWORD PTR [r8+88]
-        mov	r12, QWORD PTR [rdx+88]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+80], r11
-        adc	r12, r10
-        mov	r11, QWORD PTR [r8+96]
-        mov	r10, QWORD PTR [rdx+96]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+88], r12
-        adc	r10, r11
-        mov	r12, QWORD PTR [r8+104]
-        mov	r11, QWORD PTR [rdx+104]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+96], r10
-        adc	r11, r12
-        mov	r10, QWORD PTR [r8+112]
-        mov	r12, QWORD PTR [rdx+112]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+104], r11
-        adc	r12, r10
-        mov	r11, QWORD PTR [r8+120]
-        mov	r10, QWORD PTR [rdx+120]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+112], r12
-        adc	r10, r11
-        mov	r12, QWORD PTR [r8+128]
-        mov	r11, QWORD PTR [rdx+128]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+120], r10
-        adc	r11, r12
-        mov	r10, QWORD PTR [r8+136]
-        mov	r12, QWORD PTR [rdx+136]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+128], r11
-        adc	r12, r10
-        mov	r11, QWORD PTR [r8+144]
-        mov	r10, QWORD PTR [rdx+144]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+136], r12
-        adc	r10, r11
-        mov	r12, QWORD PTR [r8+152]
-        mov	r11, QWORD PTR [rdx+152]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+144], r10
-        adc	r11, r12
-        mov	r10, QWORD PTR [r8+160]
-        mov	r12, QWORD PTR [rdx+160]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+152], r11
-        adc	r12, r10
-        mov	r11, QWORD PTR [r8+168]
-        mov	r10, QWORD PTR [rdx+168]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+160], r12
-        adc	r10, r11
-        mov	r12, QWORD PTR [r8+176]
-        mov	r11, QWORD PTR [rdx+176]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+168], r10
-        adc	r11, r12
-        mov	r10, QWORD PTR [r8+184]
-        mov	r12, QWORD PTR [rdx+184]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+176], r11
-        adc	r12, r10
-        mov	r11, QWORD PTR [r8+192]
-        mov	r10, QWORD PTR [rdx+192]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+184], r12
-        adc	r10, r11
-        mov	r12, QWORD PTR [r8+200]
-        mov	r11, QWORD PTR [rdx+200]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+192], r10
-        adc	r11, r12
-        mov	r10, QWORD PTR [r8+208]
-        mov	r12, QWORD PTR [rdx+208]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+200], r11
-        adc	r12, r10
-        mov	r11, QWORD PTR [r8+216]
-        mov	r10, QWORD PTR [rdx+216]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+208], r12
-        adc	r10, r11
-        mov	r12, QWORD PTR [r8+224]
-        mov	r11, QWORD PTR [rdx+224]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+216], r10
-        adc	r11, r12
-        mov	r10, QWORD PTR [r8+232]
-        mov	r12, QWORD PTR [rdx+232]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+224], r11
-        adc	r12, r10
-        mov	r11, QWORD PTR [r8+240]
-        mov	r10, QWORD PTR [rdx+240]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+232], r12
-        adc	r10, r11
-        mov	r12, QWORD PTR [r8+248]
-        mov	r11, QWORD PTR [rdx+248]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+240], r10
-        adc	r11, r12
-        mov	QWORD PTR [rcx+248], r11
-        ; rax was zeroed on entry, so this leaves the final carry (0 or 1).
-        adc	rax, 0
-        pop	r12
-        ret
-sp_4096_cond_add_avx2_32 ENDP
-_text ENDS
-ENDIF
-; /* Shift number left by n bit. (r = a << n)
-;  *
-;  * r  Result of left shift by n.
-;  * a  Number to shift.
-;  * n  Amount to shift.
-;  */
-_text SEGMENT READONLY PARA
-sp_4096_lshift_64 PROC
-        ; Shifts the 64-word number a left by n bits into the 65-word
-        ; result r (words r[0]..r[64] are all written).
-        ; Registers as used below:
-        ;   rcx = r on entry, rdx = a, r8b = n.
-        ; The variable-count shld/shl forms take their count from cl, so the
-        ; result pointer has to be copied out of rcx BEFORE cl is loaded with
-        ; n; otherwise the low byte of the pointer is replaced by the count.
-        ; The number is processed from the top word down: each group loads
-        ; its source words before storing, and the lowest word of a group is
-        ; carried over in r11/r13 as the shift-in source for the next group.
-        push	r12
-        push	r13
-        mov	rax, rcx
-        mov	cl, r8b
-        mov	r12, 0
-        mov	r13, QWORD PTR [rdx+472]
-        mov	r8, QWORD PTR [rdx+480]
-        mov	r9, QWORD PTR [rdx+488]
-        mov	r10, QWORD PTR [rdx+496]
-        mov	r11, QWORD PTR [rdx+504]
-        ; r12 receives the bits shifted out of the top word (r[64]).
-        shld	r12, r11, cl
-        shld	r11, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r13, cl
-        mov	QWORD PTR [rax+480], r8
-        mov	QWORD PTR [rax+488], r9
-        mov	QWORD PTR [rax+496], r10
-        mov	QWORD PTR [rax+504], r11
-        mov	QWORD PTR [rax+512], r12
-        mov	r11, QWORD PTR [rdx+440]
-        mov	r8, QWORD PTR [rdx+448]
-        mov	r9, QWORD PTR [rdx+456]
-        mov	r10, QWORD PTR [rdx+464]
-        shld	r13, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r11, cl
-        mov	QWORD PTR [rax+448], r8
-        mov	QWORD PTR [rax+456], r9
-        mov	QWORD PTR [rax+464], r10
-        mov	QWORD PTR [rax+472], r13
-        mov	r13, QWORD PTR [rdx+408]
-        mov	r8, QWORD PTR [rdx+416]
-        mov	r9, QWORD PTR [rdx+424]
-        mov	r10, QWORD PTR [rdx+432]
-        shld	r11, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r13, cl
-        mov	QWORD PTR [rax+416], r8
-        mov	QWORD PTR [rax+424], r9
-        mov	QWORD PTR [rax+432], r10
-        mov	QWORD PTR [rax+440], r11
-        mov	r11, QWORD PTR [rdx+376]
-        mov	r8, QWORD PTR [rdx+384]
-        mov	r9, QWORD PTR [rdx+392]
-        mov	r10, QWORD PTR [rdx+400]
-        shld	r13, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r11, cl
-        mov	QWORD PTR [rax+384], r8
-        mov	QWORD PTR [rax+392], r9
-        mov	QWORD PTR [rax+400], r10
-        mov	QWORD PTR [rax+408], r13
-        mov	r13, QWORD PTR [rdx+344]
-        mov	r8, QWORD PTR [rdx+352]
-        mov	r9, QWORD PTR [rdx+360]
-        mov	r10, QWORD PTR [rdx+368]
-        shld	r11, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r13, cl
-        mov	QWORD PTR [rax+352], r8
-        mov	QWORD PTR [rax+360], r9
-        mov	QWORD PTR [rax+368], r10
-        mov	QWORD PTR [rax+376], r11
-        mov	r11, QWORD PTR [rdx+312]
-        mov	r8, QWORD PTR [rdx+320]
-        mov	r9, QWORD PTR [rdx+328]
-        mov	r10, QWORD PTR [rdx+336]
-        shld	r13, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r11, cl
-        mov	QWORD PTR [rax+320], r8
-        mov	QWORD PTR [rax+328], r9
-        mov	QWORD PTR [rax+336], r10
-        mov	QWORD PTR [rax+344], r13
-        mov	r13, QWORD PTR [rdx+280]
-        mov	r8, QWORD PTR [rdx+288]
-        mov	r9, QWORD PTR [rdx+296]
-        mov	r10, QWORD PTR [rdx+304]
-        shld	r11, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r13, cl
-        mov	QWORD PTR [rax+288], r8
-        mov	QWORD PTR [rax+296], r9
-        mov	QWORD PTR [rax+304], r10
-        mov	QWORD PTR [rax+312], r11
-        mov	r11, QWORD PTR [rdx+248]
-        mov	r8, QWORD PTR [rdx+256]
-        mov	r9, QWORD PTR [rdx+264]
-        mov	r10, QWORD PTR [rdx+272]
-        shld	r13, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r11, cl
-        mov	QWORD PTR [rax+256], r8
-        mov	QWORD PTR [rax+264], r9
-        mov	QWORD PTR [rax+272], r10
-        mov	QWORD PTR [rax+280], r13
-        mov	r13, QWORD PTR [rdx+216]
-        mov	r8, QWORD PTR [rdx+224]
-        mov	r9, QWORD PTR [rdx+232]
-        mov	r10, QWORD PTR [rdx+240]
-        shld	r11, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r13, cl
-        mov	QWORD PTR [rax+224], r8
-        mov	QWORD PTR [rax+232], r9
-        mov	QWORD PTR [rax+240], r10
-        mov	QWORD PTR [rax+248], r11
-        mov	r11, QWORD PTR [rdx+184]
-        mov	r8, QWORD PTR [rdx+192]
-        mov	r9, QWORD PTR [rdx+200]
-        mov	r10, QWORD PTR [rdx+208]
-        shld	r13, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r11, cl
-        mov	QWORD PTR [rax+192], r8
-        mov	QWORD PTR [rax+200], r9
-        mov	QWORD PTR [rax+208], r10
-        mov	QWORD PTR [rax+216], r13
-        mov	r13, QWORD PTR [rdx+152]
-        mov	r8, QWORD PTR [rdx+160]
-        mov	r9, QWORD PTR [rdx+168]
-        mov	r10, QWORD PTR [rdx+176]
-        shld	r11, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r13, cl
-        mov	QWORD PTR [rax+160], r8
-        mov	QWORD PTR [rax+168], r9
-        mov	QWORD PTR [rax+176], r10
-        mov	QWORD PTR [rax+184], r11
-        mov	r11, QWORD PTR [rdx+120]
-        mov	r8, QWORD PTR [rdx+128]
-        mov	r9, QWORD PTR [rdx+136]
-        mov	r10, QWORD PTR [rdx+144]
-        shld	r13, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r11, cl
-        mov	QWORD PTR [rax+128], r8
-        mov	QWORD PTR [rax+136], r9
-        mov	QWORD PTR [rax+144], r10
-        mov	QWORD PTR [rax+152], r13
-        mov	r13, QWORD PTR [rdx+88]
-        mov	r8, QWORD PTR [rdx+96]
-        mov	r9, QWORD PTR [rdx+104]
-        mov	r10, QWORD PTR [rdx+112]
-        shld	r11, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r13, cl
-        mov	QWORD PTR [rax+96], r8
-        mov	QWORD PTR [rax+104], r9
-        mov	QWORD PTR [rax+112], r10
-        mov	QWORD PTR [rax+120], r11
-        mov	r11, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [rdx+64]
-        mov	r9, QWORD PTR [rdx+72]
-        mov	r10, QWORD PTR [rdx+80]
-        shld	r13, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r11, cl
-        mov	QWORD PTR [rax+64], r8
-        mov	QWORD PTR [rax+72], r9
-        mov	QWORD PTR [rax+80], r10
-        mov	QWORD PTR [rax+88], r13
-        mov	r13, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [rdx+32]
-        mov	r9, QWORD PTR [rdx+40]
-        mov	r10, QWORD PTR [rdx+48]
-        shld	r11, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r13, cl
-        mov	QWORD PTR [rax+32], r8
-        mov	QWORD PTR [rax+40], r9
-        mov	QWORD PTR [rax+48], r10
-        mov	QWORD PTR [rax+56], r11
-        mov	r8, QWORD PTR [rdx]
-        mov	r9, QWORD PTR [rdx+8]
-        mov	r10, QWORD PTR [rdx+16]
-        shld	r13, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        ; Lowest word: nothing below it, so zeros are shifted in.
-        shl	r8, cl
-        mov	QWORD PTR [rax], r8
-        mov	QWORD PTR [rax+8], r9
-        mov	QWORD PTR [rax+16], r10
-        mov	QWORD PTR [rax+24], r13
-        pop	r13
-        pop	r12
-        ret
-sp_4096_lshift_64 ENDP
-_text ENDS
-ENDIF
-ENDIF
-IFNDEF WOLFSSL_SP_NO_256
-; /* Multiply a and b into r. (r = a * b)
-;  *
-;  * r  A single precision integer.
-;  * a  A single precision integer.
-;  * b  A single precision integer.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_mul_4 PROC
-        ; Schoolbook 4x4-word multiply: r (8 words) = a (4 words) * b (4 words).
-        ; Registers as used below:
-        ;   rcx = r, rdx = a on entry (moved to r9), r8 = b.
-        ; a is moved out of rdx because every mul overwrites rdx:rax.
-        ; r10/r11/r12 rotate as a three-word column accumulator: once a
-        ; column's low word is stored, that register is cleared and reused as
-        ; the new top word.
-        ; The low four result words are staged in 32 bytes of stack and only
-        ; copied into r after the last read of a and b - presumably so that r
-        ; may be the same buffer as an input (TODO confirm with callers).
-        push	r12
-        mov	r9, rdx
-        sub	rsp, 32
-        ; A[0] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9]
-        xor	r12, r12
-        mov	QWORD PTR [rsp], rax
-        mov	r11, rdx
-        ; A[0] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[1] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+8]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rsp+8], r11
-        ; A[0] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[1] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+8]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[2] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+16]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rsp+16], r12
-        ; A[0] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[1] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+8]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[2] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+16]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[3] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+24]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rsp+24], r10
-        ; A[1] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+8]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[2] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+16]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[3] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+24]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; From word 4 upwards the result is written straight into r.
-        mov	QWORD PTR [rcx+32], r11
-        ; A[2] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+16]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[3] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+24]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rcx+40], r12
-        ; A[3] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+24]
-        add	r10, rax
-        adc	r11, rdx
-        mov	QWORD PTR [rcx+48], r10
-        mov	QWORD PTR [rcx+56], r11
-        ; Copy the staged low four words from the stack into r[0..3].
-        mov	rax, QWORD PTR [rsp]
-        mov	rdx, QWORD PTR [rsp+8]
-        mov	r10, QWORD PTR [rsp+16]
-        mov	r11, QWORD PTR [rsp+24]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], rdx
-        mov	QWORD PTR [rcx+16], r10
-        mov	QWORD PTR [rcx+24], r11
-        add	rsp, 32
-        pop	r12
-        ret
-sp_256_mul_4 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Multiply a and b into r. (r = a * b)
-;  *
-;  * r   Result of multiplication.
-;  * a   First number to multiply.
-;  * b   Second number to multiply.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_mul_avx2_4 PROC
-        ; 4x4-word multiply using mulx with adcx/adox carry chains:
-        ; r (8 words) = a (4 words) * b (4 words).
-        ; Registers as used below:
-        ;   rcx = r, rdx = a on entry (moved to rax), r8 = b on entry (moved
-        ;   to rbp). rdx is then reloaded with one word of a per row, since
-        ;   mulx takes rdx as its implicit multiplicand.
-        ;   r8..r15 accumulate the eight result words and are stored to r only
-        ;   at the very end, after all reads of a and b.
-        ;   r14 caches B[1] for the first three rows; rdi/rsi/r15 are scratch
-        ;   for partial products; rbx is the zero used to close the chains.
-        ; All non-volatile registers that are modified are saved and restored.
-        push	rbp
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        push	rbx
-        mov	rbp, r8
-        mov	rax, rdx
-        mov	rdx, QWORD PTR [rax]
-        mov	r14, QWORD PTR [rbp+8]
-        ; A[0] * B[0]
-        mulx	r9, r8, QWORD PTR [rbp]
-        ; xor zeroes rbx and clears CF and OF, starting fresh carry chains.
-        xor	rbx, rbx
-        ; A[0] * B[1]
-        mulx	r10, rdi, r14
-        adcx	r9, rdi
-        ; A[0] * B[2]
-        mulx	r11, rdi, QWORD PTR [rbp+16]
-        adcx	r10, rdi
-        ; A[0] * B[3]
-        mulx	r12, rdi, QWORD PTR [rbp+24]
-        adcx	r11, rdi
-        mov	rdx, QWORD PTR [rax+8]
-        adcx	r12, rbx
-        ; A[1] * B[0]
-        mulx	rsi, rdi, QWORD PTR [rbp]
-        xor	rbx, rbx
-        adcx	r9, rdi
-        ; A[1] * B[1]
-        mulx	r15, rdi, r14
-        adox	r10, rsi
-        adcx	r10, rdi
-        ; A[1] * B[2]
-        mulx	rsi, rdi, QWORD PTR [rbp+16]
-        adox	r11, r15
-        adcx	r11, rdi
-        ; A[1] * B[3]
-        mulx	r13, rdi, QWORD PTR [rbp+24]
-        adox	r12, rsi
-        adcx	r12, rdi
-        adox	r13, rbx
-        mov	rdx, QWORD PTR [rax+16]
-        adcx	r13, rbx
-        ; A[2] * B[0]
-        mulx	rsi, rdi, QWORD PTR [rbp]
-        xor	rbx, rbx
-        adcx	r10, rdi
-        ; A[2] * B[1]
-        mulx	r15, rdi, r14
-        adox	r11, rsi
-        adcx	r11, rdi
-        ; A[2] * B[2]
-        mulx	rsi, rdi, QWORD PTR [rbp+16]
-        adox	r12, r15
-        adcx	r12, rdi
-        ; A[2] * B[3]
-        ; r14 is overwritten with a high product word here, so the cached
-        ; B[1] is gone; the last row reads B[1] from memory instead.
-        mulx	r14, rdi, QWORD PTR [rbp+24]
-        adox	r13, rsi
-        adcx	r13, rdi
-        adox	r14, rbx
-        mov	rdx, QWORD PTR [rax+24]
-        adcx	r14, rbx
-        ; A[3] * B[0]
-        mulx	rsi, rdi, QWORD PTR [rbp]
-        xor	rbx, rbx
-        adcx	r11, rdi
-        ; A[3] * B[1]
-        mulx	r15, rdi, QWORD PTR [rbp+8]
-        adox	r12, rsi
-        adcx	r12, rdi
-        ; A[3] * B[2]
-        mulx	rsi, rdi, QWORD PTR [rbp+16]
-        adox	r13, r15
-        adcx	r13, rdi
-        ; A[3] * B[3]
-        mulx	r15, rdi, QWORD PTR [rbp+24]
-        adox	r14, rsi
-        adcx	r14, rdi
-        adox	r15, rbx
-        adcx	r15, rbx
-        mov	QWORD PTR [rcx], r8
-        mov	QWORD PTR [rcx+8], r9
-        mov	QWORD PTR [rcx+16], r10
-        mov	QWORD PTR [rcx+24], r11
-        mov	QWORD PTR [rcx+32], r12
-        mov	QWORD PTR [rcx+40], r13
-        mov	QWORD PTR [rcx+48], r14
-        mov	QWORD PTR [rcx+56], r15
-        pop	rbx
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        pop	rbp
-        ret
-sp_256_mul_avx2_4 ENDP
-_text ENDS
-ENDIF
-; /* Square a and put result in r. (r = a * a)
-;  *
-;  * r  A single precision integer.
-;  * a  A single precision integer.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_sqr_4 PROC
-        ; Schoolbook squaring: r (8 words) = a (4 words) * a.
-        ; Registers as used below:
-        ;   rcx = r, rdx = a on entry (moved to r8 because mul overwrites
-        ;   rdx:rax).
-        ;   r9/r10/r11 rotate as a three-word column accumulator.
-        ; Off-diagonal products A[i]*A[j] (i != j) are added twice; the
-        ; squares A[i]*A[i] are added once.
-        ; The low four result words are staged in 32 bytes of stack and
-        ; copied into r only after the last read of a. The final copy uses
-        ; the volatile registers rax, rdx, r10 and r11 (all free by then), so
-        ; no non-volatile register is modified and none needs saving.
-        mov	r8, rdx
-        sub	rsp, 32
-        ; A[0] * A[0]
-        mov	rax, QWORD PTR [r8]
-        mul	rax
-        xor	r11, r11
-        mov	QWORD PTR [rsp], rax
-        mov	r10, rdx
-        ; A[0] * A[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r8]
-        xor	r9, r9
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        mov	QWORD PTR [rsp+8], r10
-        ; A[0] * A[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r8]
-        xor	r10, r10
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        ; A[1] * A[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	rax
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rsp+16], r11
-        ; A[0] * A[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r8]
-        xor	r11, r11
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[1] * A[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r8+8]
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rsp+24], r9
-        ; A[1] * A[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r8+8]
-        xor	r9, r9
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        ; A[2] * A[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	rax
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        mov	QWORD PTR [rcx+32], r10
-        ; A[2] * A[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r8+16]
-        xor	r10, r10
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rcx+40], r11
-        ; A[3] * A[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	rax
-        add	r9, rax
-        adc	r10, rdx
-        mov	QWORD PTR [rcx+48], r9
-        mov	QWORD PTR [rcx+56], r10
-        ; Copy the staged low four words from the stack into r[0..3].
-        mov	rax, QWORD PTR [rsp]
-        mov	rdx, QWORD PTR [rsp+8]
-        mov	r10, QWORD PTR [rsp+16]
-        mov	r11, QWORD PTR [rsp+24]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], rdx
-        mov	QWORD PTR [rcx+16], r10
-        mov	QWORD PTR [rcx+24], r11
-        add	rsp, 32
-        ret
-sp_256_sqr_4 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Square a and put result in r. (r = a * a)
-;  *
-;  * r   Result of squaring.
-;  * a   Number to square.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_sqr_avx2_4 PROC
-        ; Squaring with mulx and adcx/adox: r (8 words) = a (4 words) * a.
-        ; Registers as used below:
-        ;   rcx = r, rdx = a on entry (moved to rax, since mulx takes rdx as
-        ;   its implicit multiplicand).
-        ; Stage 1 sums the off-diagonal products A[i]*A[j] (i < j) into
-        ; r9..r14. Stage 2 doubles that sum on the adcx (CF) chain while
-        ; adding the squares A[i]*A[i] on the adox (OF) chain.
-        ; The result is held in r8..r15 and stored to r only at the end.
-        ; All non-volatile registers that are modified are saved and restored.
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        push	rbx
-        mov	rax, rdx
-        ; Clears CF and OF before the first adcx/adox.
-        xor	r8, r8
-        mov	rdx, QWORD PTR [rax]
-        mov	rsi, QWORD PTR [rax+8]
-        mov	rbx, QWORD PTR [rax+16]
-        mov	r15, QWORD PTR [rax+24]
-        ; A[0] * A[1]
-        mulx	r10, r9, rsi
-        ; A[0] * A[2]
-        mulx	r11, r8, rbx
-        adox	r10, r8
-        ; A[0] * A[3]
-        mulx	r12, r8, r15
-        mov	rdx, rsi
-        adox	r11, r8
-        ; A[1] * A[2]
-        mulx	rdi, r8, rbx
-        mov	rdx, r15
-        adcx	r11, r8
-        ; A[1] * A[3]
-        mulx	r13, r8, rsi
-        ; mov rather than xor: both carry chains are live and mov leaves
-        ; the flags alone.
-        mov	r15, 0
-        adox	r12, rdi
-        adcx	r12, r8
-        ; A[2] * A[3]
-        mulx	r14, r8, rbx
-        adox	r13, r15
-        adcx	r13, r8
-        adox	r14, r15
-        adcx	r14, r15
-        ; Double with Carry Flag
-        ; xor zeroes r15 and clears CF/OF so both chains restart cleanly.
-        xor	r15, r15
-        ; A[0] * A[0]
-        mov	rdx, QWORD PTR [rax]
-        mulx	rdi, r8, rdx
-        adcx	r9, r9
-        adcx	r10, r10
-        adox	r9, rdi
-        ; A[1] * A[1]
-        mov	rdx, QWORD PTR [rax+8]
-        mulx	rbx, rsi, rdx
-        adcx	r11, r11
-        adox	r10, rsi
-        ; A[2] * A[2]
-        mov	rdx, QWORD PTR [rax+16]
-        mulx	rsi, rdi, rdx
-        adcx	r12, r12
-        adox	r11, rbx
-        adcx	r13, r13
-        adox	r12, rdi
-        adcx	r14, r14
-        ; A[3] * A[3]
-        mov	rdx, QWORD PTR [rax+24]
-        mulx	rbx, rdi, rdx
-        adox	r13, rsi
-        ; r15 is zero here, so this just captures the carry out of the
-        ; doubling chain as the start of the top word.
-        adcx	r15, r15
-        adox	r14, rdi
-        adox	r15, rbx
-        mov	QWORD PTR [rcx], r8
-        mov	QWORD PTR [rcx+8], r9
-        mov	QWORD PTR [rcx+16], r10
-        mov	QWORD PTR [rcx+24], r11
-        mov	QWORD PTR [rcx+32], r12
-        mov	QWORD PTR [rcx+40], r13
-        mov	QWORD PTR [rcx+48], r14
-        mov	QWORD PTR [rcx+56], r15
-        pop	rbx
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_256_sqr_avx2_4 ENDP
-_text ENDS
-ENDIF
-; /* Add b to a into r. (r = a + b)
-;  *
-;  * r  A single precision integer.
-;  * a  A single precision integer.
-;  * b  A single precision integer.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_add_4 PROC
-        ; r = a + b over four 64-bit words; returns the carry out of the top
-        ; word (0 or 1) in rax.
-        ; Registers as used below: rcx = r, rdx = a, r8 = b.
-        ; All of a is loaded before anything is stored to r. The last load
-        ; reuses rdx (the a pointer is not needed afterwards), so only
-        ; volatile registers are touched and nothing has to be saved.
-        xor	rax, rax
-        mov	r9, QWORD PTR [rdx]
-        mov	r10, QWORD PTR [rdx+8]
-        mov	r11, QWORD PTR [rdx+16]
-        mov	rdx, QWORD PTR [rdx+24]
-        add	r9, QWORD PTR [r8]
-        adc	r10, QWORD PTR [r8+8]
-        adc	r11, QWORD PTR [r8+16]
-        adc	rdx, QWORD PTR [r8+24]
-        ; mov does not modify flags, so CF survives the stores below.
-        mov	QWORD PTR [rcx], r9
-        mov	QWORD PTR [rcx+8], r10
-        mov	QWORD PTR [rcx+16], r11
-        mov	QWORD PTR [rcx+24], rdx
-        adc	rax, 0
-        ret
-sp_256_add_4 ENDP
-_text ENDS
-; /* Sub b from a into r. (r = a - b)
-;  *
-;  * r  A single precision integer.
-;  * a  A single precision integer.
-;  * b  A single precision integer.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_sub_4 PROC
-        ; r = a - b over four 64-bit words; returns 0 in rax when there is no
-        ; borrow out of the top word and all ones (-1) when there is.
-        ; Registers as used below: rcx = r, rdx = a, r8 = b.
-        ; All of a is loaded before anything is stored to r. The last load
-        ; reuses rdx (the a pointer is not needed afterwards), so only
-        ; volatile registers are touched and nothing has to be saved.
-        xor	rax, rax
-        mov	r9, QWORD PTR [rdx]
-        mov	r10, QWORD PTR [rdx+8]
-        mov	r11, QWORD PTR [rdx+16]
-        mov	rdx, QWORD PTR [rdx+24]
-        sub	r9, QWORD PTR [r8]
-        sbb	r10, QWORD PTR [r8+8]
-        sbb	r11, QWORD PTR [r8+16]
-        sbb	rdx, QWORD PTR [r8+24]
-        ; mov does not modify flags, so the borrow survives the stores below.
-        mov	QWORD PTR [rcx], r9
-        mov	QWORD PTR [rcx+8], r10
-        mov	QWORD PTR [rcx+16], r11
-        mov	QWORD PTR [rcx+24], rdx
-        ; rax - rax - CF: turns the final borrow into a 0 / -1 mask.
-        sbb	rax, rax
-        ret
-sp_256_sub_4 ENDP
-_text ENDS
-; /* Conditionally copy a into r using the mask m.
-;  * m is -1 to copy and 0 when not.
-;  *
-;  * r  A single precision number to copy over.
-;  * a  A single precision number to copy.
-;  * m  Mask value to apply.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_cond_copy_4 PROC
-        ; Branch-free conditional copy of four 64-bit words.
-        ; Registers as used below: rcx = r, rdx = a, r8 = m.
-        ; For each word the difference (a ^ r) is masked with m and XORed
-        ; back into r: with m = -1 the word becomes a, with m = 0 it is left
-        ; as it was. All four differences are formed before r is written.
-        ; word 0
-        mov	rax, QWORD PTR [rdx]
-        xor	rax, QWORD PTR [rcx]
-        and	rax, r8
-        ; word 1
-        mov	r9, QWORD PTR [rdx+8]
-        xor	r9, QWORD PTR [rcx+8]
-        and	r9, r8
-        ; word 2
-        mov	r10, QWORD PTR [rdx+16]
-        xor	r10, QWORD PTR [rcx+16]
-        and	r10, r8
-        ; word 3
-        mov	r11, QWORD PTR [rdx+24]
-        xor	r11, QWORD PTR [rcx+24]
-        and	r11, r8
-        ; apply the masked differences to r
-        xor	QWORD PTR [rcx], rax
-        xor	QWORD PTR [rcx+8], r9
-        xor	QWORD PTR [rcx+16], r10
-        xor	QWORD PTR [rcx+24], r11
-        ret
-sp_256_cond_copy_4 ENDP
-_text ENDS
-; /* Multiply two Montgomery form numbers mod the modulus (prime).
-;  * (r = a * b mod m)
-;  *
-;  * r   Result of multiplication.
-;  * a   First number to multiply in Montgomery form.
-;  * b   Second number to multiply in Montgomery form.
-;  * m   Modulus (prime).
-;  * mp  Montgomery multiplier.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_mont_mul_4 PROC
-        ; Multiplies two 4-word numbers and reduces the 8-word product to a
-        ; 4-word result written to r.
-        ; Registers as used below:
-        ;   rcx = r, rdx = a on entry (moved to r10 because mul overwrites
-        ;   rdx:rax), r8 = b.
-        ;   The product is accumulated, low word first, in
-        ;   r11, r12, r13, r14, r15, rdi, rsi, rbx.
-        ; The m and mp arguments named in the header are never read by this
-        ; body: the reduction is hard-coded. The limb pattern subtracted at
-        ; the end (-1, 0x00000000FFFFFFFF, 0, 0xFFFFFFFF00000001) matches the
-        ; NIST P-256 prime - NOTE(review): confirm that callers only pass
-        ; that modulus.
-        ; All non-volatile registers that are modified are saved and restored.
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        push	rbx
-        mov	r10, rdx
-        ;  A[0] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r10]
-        mov	r11, rax
-        mov	r12, rdx
-        ;  A[0] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r10]
-        xor	r13, r13
-        add	r12, rax
-        adc	r13, rdx
-        ;  A[1] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r10+8]
-        xor	r14, r14
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ;  A[0] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r10]
-        add	r13, rax
-        adc	r14, rdx
-        ;  A[1] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r10+8]
-        xor	r15, r15
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ;  A[2] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r10+16]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ;  A[0] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r10]
-        xor	rdi, rdi
-        add	r14, rax
-        adc	r15, rdx
-        adc	rdi, 0
-        ;  A[1] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r10+8]
-        add	r14, rax
-        adc	r15, rdx
-        adc	rdi, 0
-        ;  A[2] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r10+16]
-        add	r14, rax
-        adc	r15, rdx
-        adc	rdi, 0
-        ;  A[3] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r10+24]
-        add	r14, rax
-        adc	r15, rdx
-        adc	rdi, 0
-        ;  A[1] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r10+8]
-        xor	rsi, rsi
-        add	r15, rax
-        adc	rdi, rdx
-        adc	rsi, 0
-        ;  A[2] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r10+16]
-        add	r15, rax
-        adc	rdi, rdx
-        adc	rsi, 0
-        ;  A[3] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r10+24]
-        add	r15, rax
-        adc	rdi, rdx
-        adc	rsi, 0
-        ;  A[2] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r10+16]
-        xor	rbx, rbx
-        add	rdi, rax
-        adc	rsi, rdx
-        adc	rbx, 0
-        ;  A[3] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r10+24]
-        add	rdi, rax
-        adc	rsi, rdx
-        adc	rbx, 0
-        ;  A[3] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r10+24]
-        add	rsi, rax
-        adc	rbx, rdx
-        ; Start Reduction
-        ; mu = a[0]-a[3] + a[0]-a[2] << 32 << 64 + (a[0] * 2) << 192
-        ;    - a[0] << 32 << 192
-        ;   a[0]-a[3] + (a[0] * 2) << 192
-        ; From here mu is held in rax (word 0), r10, r8, rdx (word 3); a and
-        ; b are no longer read, so their pointer registers are reused.
-        mov	rax, r11
-        lea	rdx, QWORD PTR [r14+2*r11]
-        mov	r10, r12
-        mov	r8, r13
-        mov	r9, r13
-        ;   a[0]-a[2] << 32
-        shl	r11, 32
-        shld	r9, r10, 32
-        shld	r12, rax, 32
-        ;   - a[0] << 32 << 192
-        sub	rdx, r11
-        ;   + a[0]-a[2] << 32 << 64
-        add	r10, r11
-        adc	r8, r12
-        adc	rdx, r9
-        ; a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu
-        xor	r9, r9
-        ;   a += mu << 256
-        add	r15, rax
-        adc	rdi, r10
-        adc	rsi, r8
-        adc	rbx, rdx
-        ; r11 becomes the running overflow indicator: 0, or -1 when a carry
-        ; left the top word.
-        sbb	r11, r11
-        ;   a += mu << 192
-        add	r14, rax
-        adc	r15, r10
-        ; Keep the unshifted mu word 1 in r12 for the carry trick further
-        ; down (mov does not disturb the carry chain).
-        mov	r12, r10
-        adc	rdi, r8
-        adc	rsi, rdx
-        adc	rbx, 0
-        sbb	r11, 0
-        ; mu <<= 32
-        shld	r9, rdx, 32
-        shld	rdx, r8, 32
-        shld	r8, r10, 32
-        shld	r10, rax, 32
-        shl	rax, 32
-        ;   a -= (mu << 32) << 192
-        sub	r14, rax
-        sbb	r15, r10
-        sbb	rdi, r8
-        sbb	rsi, rdx
-        sbb	rbx, r9
-        adc	r11, 0
-        ;   a += (mu << 32) << 64
-        ; NOTE(review): the sub below appears to be used only for its carry
-        ; flag. r12 holds mu word 1, which was formed above as
-        ; a[1] + (a[0] << 32), and rax now holds a[0] << 32, so the borrow
-        ; from r12 - rax equals the carry that addition produced; the adc
-        ; chain then propagates it upwards. Confirm against the generator.
-        sub	r12, rax
-        adc	r13, r10
-        adc	r14, r8
-        adc	r15, rdx
-        adc	rdi, r9
-        adc	rsi, 0
-        adc	rbx, 0
-        sbb	r11, 0
-        ; 18446744069414584321 = 0xFFFFFFFF00000001, the top modulus word.
-        mov	r10, 18446744069414584321
-        ; mask m and sub from result if overflow
-        ;  m[0] = -1 & mask = mask
-        ;  m[2] =  0 & mask = 0
-        ; Writing eax zero-extends, so rax = mask & 0x00000000FFFFFFFF, which
-        ; is the masked modulus word 1.
-        mov	eax, r11d
-        and	r10, r11
-        sub	r15, r11
-        sbb	rdi, rax
-        mov	QWORD PTR [rcx], r15
-        sbb	rsi, 0
-        mov	QWORD PTR [rcx+8], rdi
-        sbb	rbx, r10
-        mov	QWORD PTR [rcx+16], rsi
-        mov	QWORD PTR [rcx+24], rbx
-        pop	rbx
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_256_mont_mul_4 ENDP
-_text ENDS
-; /* Square the Montgomery form number mod the modulus (prime). (r = a * a mod m)
-;  *
-;  * r   Result of squaring (rcx, 4 limbs of 64 bits).
-;  * a   Number to square in Montgomery form (rdx, 4 limbs of 64 bits).
-;  * m   Modulus (prime). Not read: the reduction below hard-codes its limbs.
-;  * mp  Montgomery multiplier. Not read.
-;  *
-;  * r12-r15, rdi, rsi and rbx are saved on entry and restored before return.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_mont_sqr_4 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        push	rbx
-        mov	r8, rdx  ; r8 = a
-        ; Off-diagonal products A[i] * A[j], i < j, accumulate in
-        ; r11, r12, r13, r14, r15, rdi (limbs 1..6 of the 8-limb square).
-        ;  A[0] * A[1]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r8+8]
-        mov	r11, rax
-        mov	r12, rdx
-        ;  A[0] * A[2]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r8+16]
-        xor	r13, r13
-        add	r12, rax
-        adc	r13, rdx
-        ;  A[0] * A[3]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r8+24]
-        xor	r14, r14
-        add	r13, rax
-        adc	r14, rdx
-        ;  A[1] * A[2]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r8+16]
-        xor	r15, r15
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ;  A[1] * A[3]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r8+24]
-        add	r14, rax
-        adc	r15, rdx
-        ;  A[2] * A[3]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r8+24]
-        xor	rdi, rdi
-        add	r15, rax
-        adc	rdi, rdx
-        ; Double
-        xor	rsi, rsi
-        add	r11, r11
-        adc	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        adc	r15, r15
-        adc	rdi, rdi
-        adc	rsi, 0  ; rsi = bit shifted out of limb 6 (limb 7 so far)
-        ; Diagonal squares: low half lands on an even limb, high half is
-        ; carried in rbx into the next odd limb.
-        ;  A[0] * A[0]
-        mov	rax, QWORD PTR [r8]
-        mul	rax
-        mov	r10, rax  ; limb 0
-        mov	rbx, rdx
-        ;  A[1] * A[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	rax
-        add	r11, rbx
-        adc	r12, rax
-        adc	rdx, 0
-        mov	rbx, rdx
-        ;  A[2] * A[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	rax
-        add	r13, rbx
-        adc	r14, rax
-        adc	rdx, 0
-        mov	rbx, rdx
-        ;  A[3] * A[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	rax
-        add	r15, rbx
-        adc	rdi, rax
-        adc	rsi, rdx
-        ; 8-limb square is now in r10, r11, r12, r13, r14, r15, rdi, rsi.
-        ; Start Reduction
-        ; mu = a[0]-a[3] + a[0]-a[2] << 32 << 64 + (a[0] * 2) << 192
-        ;    - a[0] << 32 << 192
-        ;   a[0]-a[3] + (a[0] * 2) << 192
-        mov	rax, r10
-        lea	rdx, QWORD PTR [r13+2*r10]
-        mov	r8, r11
-        mov	rbx, r12
-        mov	r9, r12
-        ;   a[0]-a[2] << 32
-        shl	r10, 32
-        shld	r9, r8, 32
-        shld	r11, rax, 32
-        ;   - a[0] << 32 << 192
-        sub	rdx, r10
-        ;   + a[0]-a[2] << 32 << 64
-        add	r8, r10
-        adc	rbx, r11
-        adc	rdx, r9
-        ; mu is now in rdx:rbx:r8:rax (most significant limb first).
-        ; a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu
-        xor	r9, r9
-        ;   a += mu << 256
-        add	r14, rax
-        adc	r15, r8
-        adc	rdi, rbx
-        adc	rsi, rdx
-        sbb	r10, r10  ; r10 tracks overflow as a running 0 / -1 style count
-        ;   a += mu << 192
-        add	r13, rax
-        adc	r14, r8
-        mov	r11, r8
-        adc	r15, rbx
-        adc	rdi, rdx
-        adc	rsi, 0
-        sbb	r10, 0
-        ; mu <<= 32
-        shld	r9, rdx, 32
-        shld	rdx, rbx, 32
-        shld	rbx, r8, 32
-        shld	r8, rax, 32
-        shl	rax, 32
-        ;   a -= (mu << 32) << 192
-        sub	r13, rax
-        sbb	r14, r8
-        sbb	r15, rbx
-        sbb	rdi, rdx
-        sbb	rsi, r9
-        adc	r10, 0
-        ;   a += (mu << 32) << 64
-        sub	r11, rax
-        adc	r12, r8
-        adc	r13, rbx
-        adc	r14, rdx
-        adc	r15, r9
-        adc	rdi, 0
-        adc	rsi, 0
-        sbb	r10, 0
-        mov	r8, 18446744069414584321  ; 0xFFFFFFFF00000001
-        ; mask m and sub from result if overflow
-        ;  m[0] = -1 & mask = mask
-        ;  m[2] =  0 & mask = 0
-        mov	eax, r10d  ; 32-bit write zero-extends: rax = mask & 0xFFFFFFFF
-        and	r8, r10
-        sub	r14, r10
-        sbb	r15, rax
-        mov	QWORD PTR [rcx], r14
-        sbb	rdi, 0
-        mov	QWORD PTR [rcx+8], r15
-        sbb	rsi, r8
-        mov	QWORD PTR [rcx+16], rdi
-        mov	QWORD PTR [rcx+24], rsi
-        pop	rbx
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_256_mont_sqr_4 ENDP
-_text ENDS
-; /* Compare a with b in constant time (straight-line code, no branches).
-;  *
-;  * a  A single precision integer (rcx, 4 limbs of 64 bits).
-;  * b  A single precision integer (rdx, 4 limbs of 64 bits).
-;  * return -1, 0 or 1 in rax if a is less than, equal to or greater than b
-;  * respectively. Limbs are compared from most to least significant.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_cmp_4 PROC
-        push	r12
-        xor	r9, r9  ; r9 = 0, used to clear the mask
-        mov	r8, -1  ; r8 = mask: -1 while all limbs so far were equal
-        mov	rax, -1  ; rax = result; turned into 0 at the end if all equal
-        mov	r10, 1  ; r10 = constant 1 for the "greater" result
-        mov	r11, QWORD PTR [rcx+24]
-        mov	r12, QWORD PTR [rdx+24]
-        and	r11, r8  ; once the mask is 0 both limbs read as 0
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10  ; a limb > b limb: result = 1
-        cmovc	rax, r8  ; a limb < b limb: result = mask, which is -1 here
-        cmovnz	r8, r9  ; limbs differ: clear mask so lower limbs are ignored
-        mov	r11, QWORD PTR [rcx+16]
-        mov	r12, QWORD PTR [rdx+16]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+8]
-        mov	r12, QWORD PTR [rdx+8]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx]
-        mov	r12, QWORD PTR [rdx]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        xor	rax, r8  ; mask still -1 means all equal: -1 xor -1 = 0
-        pop	r12
-        ret
-sp_256_cmp_4 ENDP
-_text ENDS
-; /* Conditionally subtract b from a using the mask m.
-;  * m is -1 to subtract and 0 when not subtracting. rax returns 0, or -1 on borrow.
-;  *
-;  * r  A single precision number representing condition subtract result (rcx).
-;  * a  A single precision number to subtract from (rdx).
-;  * b  A single precision number to subtract (r8).
-;  * m  Mask value to apply (r9); ANDed with every limb of b.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_cond_sub_4 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        mov	r14, QWORD PTR [r8]  ; rsi:rdi:r15:r14 = b
-        mov	r15, QWORD PTR [r8+8]
-        mov	rdi, QWORD PTR [r8+16]
-        mov	rsi, QWORD PTR [r8+24]
-        and	r14, r9  ; b &= m, so b becomes 0 when m is 0
-        and	r15, r9
-        and	rdi, r9
-        and	rsi, r9
-        mov	r10, QWORD PTR [rdx]  ; r13:r12:r11:r10 = a
-        mov	r11, QWORD PTR [rdx+8]
-        mov	r12, QWORD PTR [rdx+16]
-        mov	r13, QWORD PTR [rdx+24]
-        sub	r10, r14
-        sbb	r11, r15
-        sbb	r12, rdi
-        sbb	r13, rsi
-        mov	QWORD PTR [rcx], r10  ; mov does not change CF
-        mov	QWORD PTR [rcx+8], r11
-        mov	QWORD PTR [rcx+16], r12
-        mov	QWORD PTR [rcx+24], r13
-        sbb	rax, rax  ; rax = -1 if the 256-bit subtraction borrowed, else 0
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_256_cond_sub_4 ENDP
-_text ENDS
-; /* Reduce the number back to 256 bits using Montgomery reduction.
-;  *
-;  * a   Number to reduce in place (rcx): 8 limbs are read, a[0..3] are written.
-;  * m   The modulus. Not read here: the modulus limbs are hard-coded below.
-;  * mp  The digit representing the negative inverse of m mod 2^n. Not read here.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_mont_reduce_4 PROC
-        push	rbx
-        push	rsi
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        mov	r8, rcx  ; r8 = a; rcx is reused as scratch below
-        mov	r9, QWORD PTR [r8]  ; a[0..7] -> r9, r10, r11, r12, r13, r14, r15, rdi
-        mov	r10, QWORD PTR [r8+8]
-        mov	r11, QWORD PTR [r8+16]
-        mov	r12, QWORD PTR [r8+24]
-        mov	r13, QWORD PTR [r8+32]
-        mov	r14, QWORD PTR [r8+40]
-        mov	r15, QWORD PTR [r8+48]
-        mov	rdi, QWORD PTR [r8+56]
-        ; Start Reduction
-        ; mu = a[0]-a[3] + a[0]-a[2] << 32 << 64 + (a[0] * 2) << 192
-        ;    - a[0] << 32 << 192
-        ;   a[0]-a[3] + (a[0] * 2) << 192
-        mov	rax, r9
-        lea	rdx, QWORD PTR [r12+2*r9]
-        mov	rbx, r10
-        mov	rcx, r11
-        mov	rsi, r11
-        ;   a[0]-a[2] << 32
-        shl	r9, 32
-        shld	rsi, rbx, 32
-        shld	r10, rax, 32
-        ;   - a[0] << 32 << 192
-        sub	rdx, r9
-        ;   + a[0]-a[2] << 32 << 64
-        add	rbx, r9
-        adc	rcx, r10
-        adc	rdx, rsi  ; mu is now rdx:rcx:rbx:rax (most significant first)
-        ; a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu
-        xor	rsi, rsi
-        ;   a += mu << 256
-        add	r13, rax
-        adc	r14, rbx
-        adc	r15, rcx
-        adc	rdi, rdx
-        sbb	r9, r9  ; r9 accumulates the overflow of the top limb (0 / -1)
-        ;   a += mu << 192
-        add	r12, rax
-        adc	r13, rbx
-        mov	r10, rbx
-        adc	r14, rcx
-        adc	r15, rdx
-        adc	rdi, 0
-        sbb	r9, 0
-        ; mu <<= 32
-        shld	rsi, rdx, 32
-        shld	rdx, rcx, 32
-        shld	rcx, rbx, 32
-        shld	rbx, rax, 32
-        shl	rax, 32
-        ;   a -= (mu << 32) << 192
-        sub	r12, rax
-        sbb	r13, rbx
-        sbb	r14, rcx
-        sbb	r15, rdx
-        sbb	rdi, rsi
-        adc	r9, 0
-        ;   a += (mu << 32) << 64
-        sub	r10, rax
-        adc	r11, rbx
-        adc	r12, rcx
-        adc	r13, rdx
-        adc	r14, rsi
-        adc	r15, 0
-        adc	rdi, 0
-        sbb	r9, 0
-        mov	rbx, 18446744069414584321  ; 0xFFFFFFFF00000001
-        ; mask m and sub from result if overflow
-        ;  m[0] = -1 & mask = mask
-        ;  m[2] =  0 & mask = 0
-        mov	eax, r9d  ; 32-bit write zero-extends: rax = mask & 0xFFFFFFFF
-        and	rbx, r9
-        sub	r13, r9
-        sbb	r14, rax
-        mov	QWORD PTR [r8], r13  ; result limbs r13, r14, r15, rdi go to a[0..3]
-        sbb	r15, 0
-        mov	QWORD PTR [r8+8], r14
-        sbb	rdi, rbx
-        mov	QWORD PTR [r8+16], r15
-        mov	QWORD PTR [r8+24], rdi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        pop	rsi
-        pop	rbx
-        ret
-sp_256_mont_reduce_4 ENDP
-_text ENDS
-; /* Reduce the number back to 256 bits using Montgomery reduction.
-;  * Word-by-word variant: unlike sp_256_mont_reduce_4, m and mp are read here.
-;  * a   Number to reduce in place (rcx): 8 limbs are used, a[0..3] get the result.
-;  * m   The single precision number representing the modulus (rdx, 4 limbs).
-;  * mp  The digit representing the negative inverse of m mod 2^n (r8).
-;  */
-_text SEGMENT READONLY PARA
-sp_256_mont_reduce_order_4 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        mov	r9, rdx  ; r9 = m; rdx is needed for mul results
-        ; i = 0
-        xor	rdi, rdi  ; rdi = carry passed between iterations
-        mov	r10, 4  ; r10 = loop counter, one pass per limb
-        mov	r15, rcx  ; r15 = &a[i]
-L_mont_loop_4:
-        ; mu = a[i] * mp
-        mov	r14, QWORD PTR [r15]
-        imul	r14, r8
-        ; a[i+0] += m[0] * mu
-        mov	rax, QWORD PTR [r9]
-        mov	r12, QWORD PTR [r9+8]
-        mul	r14
-        mov	rsi, QWORD PTR [r15]
-        add	rsi, rax
-        mov	r11, rdx
-        mov	QWORD PTR [r15], rsi
-        adc	r11, 0  ; r11 = carry into the next limb
-        ; a[i+1] += m[1] * mu
-        mov	rax, r12
-        mul	r14
-        mov	r12, QWORD PTR [r9+16]
-        mov	rsi, QWORD PTR [r15+8]
-        add	rax, r11
-        mov	r13, rdx
-        adc	r13, 0
-        add	rsi, rax
-        mov	QWORD PTR [r15+8], rsi
-        adc	r13, 0  ; r13 = carry into the next limb
-        ; a[i+2] += m[2] * mu
-        mov	rax, r12
-        mul	r14
-        mov	r12, QWORD PTR [r9+24]
-        mov	rsi, QWORD PTR [r15+16]
-        add	rax, r13
-        mov	r11, rdx
-        adc	r11, 0
-        add	rsi, rax
-        mov	QWORD PTR [r15+16], rsi
-        adc	r11, 0
-        ; a[i+3] += m[3] * mu
-        mov	rax, r12
-        mul	r14
-        mov	rsi, QWORD PTR [r15+24]
-        add	rax, r11
-        adc	rdx, rdi  ; fold in the carry left by the previous iteration
-        mov	rdi, 0  ; mov, not xor: CF from the adc above must survive
-        adc	rdi, 0
-        add	rsi, rax
-        mov	QWORD PTR [r15+24], rsi
-        adc	QWORD PTR [r15+32], rdx
-        adc	rdi, 0  ; rdi = carry out of a[i+4], consumed next iteration
-        ; i += 1
-        add	r15, 8
-        dec	r10
-        jnz	L_mont_loop_4
-        xor	rax, rax
-        mov	rdx, QWORD PTR [rcx+32]  ; upper half a[4..7] -> rdx, r10, rsi, r11
-        mov	r10, QWORD PTR [rcx+40]
-        mov	rsi, QWORD PTR [rcx+48]
-        mov	r11, QWORD PTR [rcx+56]
-        sub	rax, rdi  ; rax = 0 - final carry, used as a mask on m
-        mov	r12, QWORD PTR [r9]
-        mov	r13, QWORD PTR [r9+8]
-        mov	r14, QWORD PTR [r9+16]
-        mov	r15, QWORD PTR [r9+24]
-        and	r12, rax
-        and	r13, rax
-        and	r14, rax
-        and	r15, rax
-        sub	rdx, r12  ; subtract (m & mask) from the upper half
-        sbb	r10, r13
-        sbb	rsi, r14
-        sbb	r11, r15
-        mov	QWORD PTR [rcx], rdx  ; result is stored in a[0..3]
-        mov	QWORD PTR [rcx+8], r10
-        mov	QWORD PTR [rcx+16], rsi
-        mov	QWORD PTR [rcx+24], r11
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_256_mont_reduce_order_4 ENDP
-_text ENDS
-; /* Add two Montgomery form numbers (r = a + b % m).
-;  *
-;  * r   Result of addition (rcx).
-;  * a   First number to add in Montgomery form (rdx).
-;  * b   Second number to add in Montgomery form (r8).
-;  * m   Modulus (prime). Not read: the modulus limbs are hard-coded below.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_mont_add_4 PROC
-        push	r12
-        push	r13
-        mov	rax, QWORD PTR [rdx]  ; r11:r10:r9:rax = a
-        mov	r9, QWORD PTR [rdx+8]
-        mov	r10, QWORD PTR [rdx+16]
-        mov	r11, QWORD PTR [rdx+24]
-        add	rax, QWORD PTR [r8]
-        adc	r9, QWORD PTR [r8+8]
-        mov	r13, 18446744069414584321  ; 0xFFFFFFFF00000001; mov keeps CF intact
-        adc	r10, QWORD PTR [r8+16]
-        adc	r11, QWORD PTR [r8+24]
-        sbb	rdx, rdx  ; rdx = -1 if the addition carried out, else 0
-        mov	r12d, edx  ; 32-bit write zero-extends: r12 = mask & 0xFFFFFFFF
-        and	r13, rdx
-        sub	rax, rdx  ; subtract masked limbs: -1, 0xFFFFFFFF, 0, r13
-        sbb	r9, r12
-        sbb	r10, 0
-        sbb	r11, r13
-        adc	rdx, 0  ; mask + borrow: becomes 0 if that subtraction borrowed
-        and	r12, rdx
-        and	r13, rdx
-        sub	rax, rdx  ; second masked subtraction with the updated mask
-        sbb	r9, r12
-        mov	QWORD PTR [rcx], rax
-        sbb	r10, 0
-        mov	QWORD PTR [rcx+8], r9
-        sbb	r11, r13
-        mov	QWORD PTR [rcx+16], r10
-        mov	QWORD PTR [rcx+24], r11
-        pop	r13
-        pop	r12
-        ret
-sp_256_mont_add_4 ENDP
-_text ENDS
-; /* Double a Montgomery form number (r = a + a % m).
-;  *
-;  * r   Result of doubling (rcx).
-;  * a   Number to double in Montgomery form (rdx).
-;  * m   Modulus (prime). Not read: the modulus limbs are hard-coded below.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_mont_dbl_4 PROC
-        push	r12
-        push	r13
-        mov	rax, QWORD PTR [rdx]  ; r10:r9:r8:rax = a
-        mov	r8, QWORD PTR [rdx+8]
-        mov	r9, QWORD PTR [rdx+16]
-        mov	r10, QWORD PTR [rdx+24]
-        add	rax, rax
-        adc	r8, r8
-        mov	r12, 18446744069414584321  ; 0xFFFFFFFF00000001; mov keeps CF intact
-        adc	r9, r9
-        mov	r13, r10  ; keep the original top limb to derive the mask
-        adc	r10, r10
-        sar	r13, 63  ; r13 = -1 if the top bit of a was set, else 0
-        mov	r11d, r13d  ; 32-bit write zero-extends: r11 = mask & 0xFFFFFFFF
-        and	r12, r13
-        sub	rax, r13  ; subtract masked limbs: -1, 0xFFFFFFFF, 0, r12
-        sbb	r8, r11
-        sbb	r9, 0
-        sbb	r10, r12
-        adc	r13, 0  ; mask + borrow: becomes 0 if that subtraction borrowed
-        and	r11, r13
-        and	r12, r13
-        sub	rax, r13  ; second masked subtraction with the updated mask
-        sbb	r8, r11
-        mov	QWORD PTR [rcx], rax
-        sbb	r9, 0
-        mov	QWORD PTR [rcx+8], r8
-        sbb	r10, r12
-        mov	QWORD PTR [rcx+16], r9
-        mov	QWORD PTR [rcx+24], r10
-        pop	r13
-        pop	r12
-        ret
-sp_256_mont_dbl_4 ENDP
-_text ENDS
-; /* Triple a Montgomery form number (r = a + a + a % m).
-;  * Computed as a doubling with reduction followed by adding a with reduction.
-;  * r   Result of Tripling (rcx).
-;  * a   Number to triple in Montgomery form (rdx).
-;  * m   Modulus (prime). Not read: the modulus limbs are hard-coded below.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_mont_tpl_4 PROC
-        push	r12
-        push	r13
-        mov	rax, QWORD PTR [rdx]  ; r10:r9:r8:rax = a
-        mov	r8, QWORD PTR [rdx+8]
-        mov	r9, QWORD PTR [rdx+16]
-        mov	r10, QWORD PTR [rdx+24]
-        add	rax, rax  ; first step: 2 * a
-        adc	r8, r8
-        mov	r12, 18446744069414584321  ; 0xFFFFFFFF00000001; mov keeps CF intact
-        adc	r9, r9
-        adc	r10, r10
-        sbb	r13, r13  ; r13 = -1 if the doubling carried out, else 0
-        mov	r11d, r13d  ; 32-bit write zero-extends: r11 = mask & 0xFFFFFFFF
-        and	r12, r13
-        sub	rax, r13  ; subtract masked limbs: -1, 0xFFFFFFFF, 0, r12
-        sbb	r8, r11
-        sbb	r9, 0
-        sbb	r10, r12
-        adc	r13, 0  ; mask + borrow: becomes 0 if that subtraction borrowed
-        and	r11, r13
-        and	r12, r13
-        sub	rax, r13  ; second masked subtraction with the updated mask
-        sbb	r8, r11
-        sbb	r9, 0
-        sbb	r10, r12
-        add	rax, QWORD PTR [rdx]  ; second step: add a once more
-        adc	r8, QWORD PTR [rdx+8]
-        mov	r12, 18446744069414584321  ; reload constant; mov keeps CF intact
-        adc	r9, QWORD PTR [rdx+16]
-        adc	r10, QWORD PTR [rdx+24]
-        sbb	r13, 0  ; r13 -= carry. NOTE(review): a 0 / -1 mask needs r13 == 0 here - confirm
-        mov	r11d, r13d
-        and	r12, r13
-        sub	rax, r13
-        sbb	r8, r11
-        sbb	r9, 0
-        sbb	r10, r12
-        adc	r13, 0
-        and	r11, r13
-        and	r12, r13
-        sub	rax, r13
-        sbb	r8, r11
-        mov	QWORD PTR [rcx], rax
-        sbb	r9, 0
-        mov	QWORD PTR [rcx+8], r8
-        sbb	r10, r12
-        mov	QWORD PTR [rcx+16], r9
-        mov	QWORD PTR [rcx+24], r10
-        pop	r13
-        pop	r12
-        ret
-sp_256_mont_tpl_4 ENDP
-_text ENDS
-; /* Subtract two Montgomery form numbers (r = a - b % m).
-;  *
-;  * r   Result of subtraction (rcx).
-;  * a   Number to subtract from in Montgomery form (rdx).
-;  * b   Number to subtract with in Montgomery form (r8).
-;  * m   Modulus (prime). Not read: the modulus limbs are hard-coded below.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_mont_sub_4 PROC
-        push	r12
-        push	r13
-        mov	rax, QWORD PTR [rdx]  ; r11:r10:r9:rax = a
-        mov	r9, QWORD PTR [rdx+8]
-        mov	r10, QWORD PTR [rdx+16]
-        mov	r11, QWORD PTR [rdx+24]
-        sub	rax, QWORD PTR [r8]
-        sbb	r9, QWORD PTR [r8+8]
-        mov	r13, 18446744069414584321  ; 0xFFFFFFFF00000001; mov keeps CF intact
-        sbb	r10, QWORD PTR [r8+16]
-        sbb	r11, QWORD PTR [r8+24]
-        sbb	rdx, rdx  ; rdx = -1 if the subtraction borrowed, else 0
-        mov	r12d, edx  ; 32-bit write zero-extends: r12 = mask & 0xFFFFFFFF
-        and	r13, rdx
-        add	rax, rdx  ; add back masked limbs: -1, 0xFFFFFFFF, 0, r13
-        adc	r9, r12
-        adc	r10, 0
-        adc	r11, r13
-        adc	rdx, 0  ; mask + carry: becomes 0 if that addition carried out
-        and	r12, rdx
-        and	r13, rdx
-        add	rax, rdx  ; second masked addition with the updated mask
-        adc	r9, r12
-        mov	QWORD PTR [rcx], rax
-        adc	r10, 0
-        mov	QWORD PTR [rcx+8], r9
-        adc	r11, r13
-        mov	QWORD PTR [rcx+16], r10
-        mov	QWORD PTR [rcx+24], r11
-        pop	r13
-        pop	r12
-        ret
-sp_256_mont_sub_4 ENDP
-_text ENDS
-; /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m)
-;  * If a is odd the hard-coded modulus is added first, then all is shifted right.
-;  * r  Result of division by 2 (rcx).
-;  * a  Number to divide (rdx).
-;  * m  Modulus (prime). Not read: the modulus limbs are hard-coded below.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_mont_div2_4 PROC
-        push	r12
-        push	r13
-        mov	rax, QWORD PTR [rdx]  ; r10:r9:r8:rax = a
-        mov	r8, QWORD PTR [rdx+8]
-        mov	r9, QWORD PTR [rdx+16]
-        mov	r10, QWORD PTR [rdx+24]
-        mov	r12, 18446744069414584321  ; 0xFFFFFFFF00000001
-        mov	r13, rax
-        and	r13, 1
-        neg	r13  ; r13 = -1 if a is odd, else 0
-        mov	r11d, r13d  ; 32-bit write zero-extends: r11 = mask & 0xFFFFFFFF
-        and	r12, r13
-        add	rax, r13  ; add masked limbs: -1, 0xFFFFFFFF, 0, r12
-        adc	r8, r11
-        adc	r9, 0
-        adc	r10, r12
-        mov	r13, 0  ; mov, not xor: CF from the adc above must survive
-        adc	r13, 0  ; r13 = carry out of the 256-bit addition
-        shrd	rax, r8, 1  ; shift r13:r10:r9:r8:rax right by one bit
-        shrd	r8, r9, 1
-        shrd	r9, r10, 1
-        shrd	r10, r13, 1
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r8
-        mov	QWORD PTR [rcx+16], r9
-        mov	QWORD PTR [rcx+24], r10
-        pop	r13
-        pop	r12
-        ret
-sp_256_mont_div2_4 ENDP
-_text ENDS
-; /* Two Montgomery numbers, subtract double second from first (r = a - 2.b % m).
-;  * Afterwards b is also overwritten in place with (b - r) % m.
-;  * r   Result of subtraction (rcx).
-;  * a   Number to subtract from in Montgomery form (rdx).
-;  * b   Number to double and subtract with in Montgomery form (r8); modified.
-;  * m   Modulus (prime). Not read: the modulus limbs are hard-coded below.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_mont_rsb_sub_dbl_4 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        mov	rax, QWORD PTR [rdx]  ; r11:r10:r9:rax = a
-        mov	r9, QWORD PTR [rdx+8]
-        mov	r10, QWORD PTR [rdx+16]
-        mov	r11, QWORD PTR [rdx+24]
-        mov	r12, QWORD PTR [r8]  ; r15:r14:r13:r12 = b
-        mov	r13, QWORD PTR [r8+8]
-        mov	r14, QWORD PTR [r8+16]
-        mov	r15, QWORD PTR [r8+24]
-        add	r12, r12  ; step 1: 2 * b with reduction
-        adc	r13, r13
-        mov	rsi, 18446744069414584321  ; 0xFFFFFFFF00000001; mov keeps CF intact
-        adc	r14, r14
-        adc	r15, r15
-        sbb	rdx, rdx  ; rdx = -1 if the doubling carried out, else 0
-        mov	edi, edx  ; 32-bit write zero-extends: rdi = mask & 0xFFFFFFFF
-        and	rsi, rdx
-        sub	r12, rdx  ; subtract masked limbs: -1, 0xFFFFFFFF, 0, rsi
-        sbb	r13, rdi
-        sbb	r14, 0
-        sbb	r15, rsi
-        adc	rdx, 0  ; mask + borrow: becomes 0 if that subtraction borrowed
-        and	rdi, rdx
-        and	rsi, rdx
-        sub	r12, rdx  ; second masked subtraction with the updated mask
-        sbb	r13, rdi
-        sbb	r14, 0
-        sbb	r15, rsi
-        sub	rax, r12  ; step 2: a - 2b with reduction
-        sbb	r9, r13
-        mov	rsi, 18446744069414584321  ; reload constant; mov keeps CF intact
-        sbb	r10, r14
-        sbb	r11, r15
-        sbb	rdx, 0  ; rdx -= borrow. NOTE(review): a 0 / -1 mask needs rdx == 0 here - confirm
-        mov	edi, edx
-        and	rsi, rdx
-        add	rax, rdx  ; add back masked modulus limbs
-        adc	r9, rdi
-        adc	r10, 0
-        adc	r11, rsi
-        adc	rdx, 0
-        and	rdi, rdx
-        and	rsi, rdx
-        add	rax, rdx
-        adc	r9, rdi
-        mov	QWORD PTR [rcx], rax  ; store r
-        adc	r10, 0
-        mov	QWORD PTR [rcx+8], r9
-        adc	r11, rsi
-        mov	QWORD PTR [rcx+16], r10
-        mov	QWORD PTR [rcx+24], r11
-        mov	r12, QWORD PTR [r8]  ; step 3: reload b from memory, after r was stored
-        mov	r13, QWORD PTR [r8+8]
-        mov	r14, QWORD PTR [r8+16]
-        mov	r15, QWORD PTR [r8+24]
-        sub	r12, rax  ; b - r with reduction
-        sbb	r13, r9
-        mov	rsi, 18446744069414584321  ; reload constant; mov keeps CF intact
-        sbb	r14, r10
-        sbb	r15, r11
-        sbb	rdx, rdx  ; rdx = -1 if the subtraction borrowed, else 0
-        mov	edi, edx
-        and	rsi, rdx
-        add	r12, rdx
-        adc	r13, rdi
-        adc	r14, 0
-        adc	r15, rsi
-        adc	rdx, 0
-        and	rdi, rdx
-        and	rsi, rdx
-        add	r12, rdx
-        adc	r13, rdi
-        mov	QWORD PTR [r8], r12  ; write the reverse-subtract result back into b
-        adc	r14, 0
-        mov	QWORD PTR [r8+8], r13
-        adc	r15, rsi
-        mov	QWORD PTR [r8+16], r14
-        mov	QWORD PTR [r8+24], r15
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_256_mont_rsb_sub_dbl_4 ENDP
-_text ENDS
-IFNDEF WC_NO_CACHE_RESISTANT
-; /* Touch each possible point that could be being copied.
-;  *
-;  * Every one of 32 table entries is read and masked, so the memory access
-;  * pattern does not depend on idx. Entries are 200 bytes apart; bytes 0-31,
-;  * 64-95 and 128-159 of the selected entry are copied to the same offsets
-;  * of r. Scanning starts at table + 200 with index 1, so an idx outside
-;  * 1..32 leaves those bytes of r zero.
-;  *
-;  * Only SSE2 instructions are used, including for the xmm6-xmm15
-;  * save/restore, so this fallback does not require AVX support.
-;  *
-;  * r      Point to copy into (rcx).
-;  * table  Table - start of the entries to access (rdx).
-;  * idx    Index of point to retrieve (r8d).
-;  */
-_text SEGMENT READONLY PARA
-sp_256_get_point_33_4 PROC
-        sub	rsp, 160
-        ; xmm6-xmm15 are callee-saved in the Microsoft x64 convention
-        movdqu	OWORD PTR [rsp], xmm6
-        movdqu	OWORD PTR [rsp+16], xmm7
-        movdqu	OWORD PTR [rsp+32], xmm8
-        movdqu	OWORD PTR [rsp+48], xmm9
-        movdqu	OWORD PTR [rsp+64], xmm10
-        movdqu	OWORD PTR [rsp+80], xmm11
-        movdqu	OWORD PTR [rsp+96], xmm12
-        movdqu	OWORD PTR [rsp+112], xmm13
-        movdqu	OWORD PTR [rsp+128], xmm14
-        movdqu	OWORD PTR [rsp+144], xmm15
-        mov	rax, 1
-        movd	xmm13, r8d
-        add	rdx, 200  ; skip entry 0
-        movd	xmm15, eax
-        mov	rax, 32  ; rax = number of entries to scan
-        pshufd	xmm15, xmm15, 0  ; xmm15 = 1 in every dword lane
-        pshufd	xmm13, xmm13, 0  ; xmm13 = idx in every dword lane
-        pxor	xmm14, xmm14
-        pxor	xmm0, xmm0  ; xmm0-xmm5 accumulate the selected entry
-        pxor	xmm1, xmm1
-        pxor	xmm2, xmm2
-        pxor	xmm3, xmm3
-        pxor	xmm4, xmm4
-        pxor	xmm5, xmm5
-        movdqa	xmm14, xmm15  ; xmm14 = index of the current entry, starts at 1
-L_256_get_point_33_4_start_1:
-        movdqa	xmm12, xmm14
-        paddd	xmm14, xmm15
-        pcmpeqd	xmm12, xmm13  ; xmm12 = all ones if current index == idx
-        movdqu	xmm6, [rdx]
-        movdqu	xmm7, [rdx+16]
-        movdqu	xmm8, [rdx+64]
-        movdqu	xmm9, [rdx+80]
-        movdqu	xmm10, [rdx+128]
-        movdqu	xmm11, [rdx+144]
-        add	rdx, 200
-        pand	xmm6, xmm12
-        pand	xmm7, xmm12
-        pand	xmm8, xmm12
-        pand	xmm9, xmm12
-        pand	xmm10, xmm12
-        pand	xmm11, xmm12
-        por	xmm0, xmm6
-        por	xmm1, xmm7
-        por	xmm2, xmm8
-        por	xmm3, xmm9
-        por	xmm4, xmm10
-        por	xmm5, xmm11
-        dec	rax
-        jnz	L_256_get_point_33_4_start_1
-        movdqu	[rcx], xmm0
-        movdqu	[rcx+16], xmm1
-        movdqu	[rcx+64], xmm2
-        movdqu	[rcx+80], xmm3
-        movdqu	[rcx+128], xmm4
-        movdqu	[rcx+144], xmm5
-        movdqu	xmm6, OWORD PTR [rsp]
-        movdqu	xmm7, OWORD PTR [rsp+16]
-        movdqu	xmm8, OWORD PTR [rsp+32]
-        movdqu	xmm9, OWORD PTR [rsp+48]
-        movdqu	xmm10, OWORD PTR [rsp+64]
-        movdqu	xmm11, OWORD PTR [rsp+80]
-        movdqu	xmm12, OWORD PTR [rsp+96]
-        movdqu	xmm13, OWORD PTR [rsp+112]
-        movdqu	xmm14, OWORD PTR [rsp+128]
-        movdqu	xmm15, OWORD PTR [rsp+144]
-        add	rsp, 160
-        ret
-sp_256_get_point_33_4 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Touch each possible point that could be being copied.
-;  *
-;  * AVX2 variant: every one of 32 table entries is read and masked, so the
-;  * memory access pattern does not depend on idx. Entries are 200 bytes
-;  * apart; bytes 0-31, 64-95 and 128-159 of the selected entry are copied
-;  * to the same offsets of r. Scanning starts at table + 200 with index 1,
-;  * so an idx outside 1..32 leaves those bytes of r zero.
-;  *
-;  * r      Point to copy into (rcx).
-;  * table  Table - start of the entries to access (rdx).
-;  * idx    Index of point to retrieve (r8d).
-;  */
-_text SEGMENT READONLY PARA
-sp_256_get_point_33_avx2_4 PROC
-        sub	rsp, 64
-        ; xmm6-xmm9 are callee-saved in the Microsoft x64 convention
-        vmovdqu	OWORD PTR [rsp], xmm6
-        vmovdqu	OWORD PTR [rsp+16], xmm7
-        vmovdqu	OWORD PTR [rsp+32], xmm8
-        vmovdqu	OWORD PTR [rsp+48], xmm9
-        mov	rax, 1
-        movd	xmm7, r8d
-        add	rdx, 200  ; skip entry 0
-        movd	xmm9, eax
-        mov	rax, 32  ; rax = number of entries to scan
-        vpxor	ymm8, ymm8, ymm8  ; all-zero permute indices select lane 0
-        vpermd	ymm7, ymm8, ymm7  ; ymm7 = idx in every dword lane
-        vpermd	ymm9, ymm8, ymm9  ; ymm9 = 1 in every dword lane
-        vpxor	ymm0, ymm0, ymm0  ; ymm0-ymm2 accumulate the selected entry
-        vpxor	ymm1, ymm1, ymm1
-        vpxor	ymm2, ymm2, ymm2
-        vmovdqa	ymm8, ymm9  ; ymm8 = index of the current entry, starts at 1
-L_256_get_point_33_avx2_4_start:
-        vpcmpeqd	ymm6, ymm8, ymm7  ; ymm6 = all ones if current index == idx
-        vpaddd	ymm8, ymm8, ymm9
-        vmovupd	ymm3, YMMWORD PTR [rdx]
-        vmovupd	ymm4, YMMWORD PTR [rdx+64]
-        vmovupd	ymm5, YMMWORD PTR [rdx+128]
-        add	rdx, 200
-        vpand	ymm3, ymm3, ymm6
-        vpand	ymm4, ymm4, ymm6
-        vpand	ymm5, ymm5, ymm6
-        vpor	ymm0, ymm0, ymm3
-        vpor	ymm1, ymm1, ymm4
-        vpor	ymm2, ymm2, ymm5
-        dec	rax
-        jnz	L_256_get_point_33_avx2_4_start
-        vmovupd	YMMWORD PTR [rcx], ymm0
-        vmovupd	YMMWORD PTR [rcx+64], ymm1
-        vmovupd	YMMWORD PTR [rcx+128], ymm2
-        vmovdqu	xmm6, OWORD PTR [rsp]
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vmovdqu	xmm8, OWORD PTR [rsp+32]
-        vmovdqu	xmm9, OWORD PTR [rsp+48]
-        ; Clear the upper YMM halves so later legacy-SSE code in the caller
-        ; does not pay an AVX/SSE transition cost. The low 128 bits,
-        ; including the xmm6-xmm9 values just restored, are unaffected.
-        vzeroupper
-        add	rsp, 64
-        ret
-sp_256_get_point_33_avx2_4 ENDP
-_text ENDS
-ENDIF
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-; /* Multiply two Montgomery form numbers mod the modulus (prime).
-;  * (r = a * b mod m). Uses mulx with separate adcx (CF) and adox (OF) chains.
-;  *
-;  * r   Result of multiplication (rcx).
-;  * a   First number to multiply in Montgomery form (rdx, moved to rax).
-;  * b   Second number to multiply in Montgomery form (r8, moved to rbp).
-;  * m   Modulus (prime). Not read: the reduction below hard-codes its limbs.
-;  * mp  Montgomery multiplier. Not read.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_mont_mul_avx2_4 PROC
-        push	rbp
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        push	rbx
-        mov	rbp, r8  ; rbp = b; r8 becomes limb 0 of the product
-        mov	rax, rdx  ; rax = a; rdx is the implicit mulx multiplicand
-        mov	rdx, QWORD PTR [rax]
-        mov	r14, QWORD PTR [rbp+8]  ; r14 caches B[1] until A[2] * B[3] overwrites it
-        ; A[0] * B[0]
-        mulx	r9, r8, QWORD PTR [rbp]
-        xor	rbx, rbx  ; rbx = 0 and CF/OF cleared for the carry chains
-        ; A[0] * B[1]
-        mulx	r10, rdi, r14
-        adcx	r9, rdi
-        ; A[0] * B[2]
-        mulx	r11, rdi, QWORD PTR [rbp+16]
-        adcx	r10, rdi
-        ; A[0] * B[3]
-        mulx	r12, rdi, QWORD PTR [rbp+24]
-        adcx	r11, rdi
-        mov	rdx, QWORD PTR [rax+8]
-        adcx	r12, rbx
-        ; A[1] * B[0]
-        mulx	rsi, rdi, QWORD PTR [rbp]
-        xor	rbx, rbx  ; restart both carry chains
-        adcx	r9, rdi
-        ; A[1] * B[1]
-        mulx	r15, rdi, r14
-        adox	r10, rsi
-        adcx	r10, rdi
-        ; A[1] * B[2]
-        mulx	rsi, rdi, QWORD PTR [rbp+16]
-        adox	r11, r15
-        adcx	r11, rdi
-        ; A[1] * B[3]
-        mulx	r13, rdi, QWORD PTR [rbp+24]
-        adox	r12, rsi
-        adcx	r12, rdi
-        adox	r13, rbx
-        mov	rdx, QWORD PTR [rax+16]
-        adcx	r13, rbx
-        ; A[2] * B[0]
-        mulx	rsi, rdi, QWORD PTR [rbp]
-        xor	rbx, rbx  ; restart both carry chains
-        adcx	r10, rdi
-        ; A[2] * B[1]
-        mulx	r15, rdi, r14
-        adox	r11, rsi
-        adcx	r11, rdi
-        ; A[2] * B[2]
-        mulx	rsi, rdi, QWORD PTR [rbp+16]
-        adox	r12, r15
-        adcx	r12, rdi
-        ; A[2] * B[3]
-        mulx	r14, rdi, QWORD PTR [rbp+24]
-        adox	r13, rsi
-        adcx	r13, rdi
-        adox	r14, rbx
-        mov	rdx, QWORD PTR [rax+24]
-        adcx	r14, rbx
-        ; A[3] * B[0]
-        mulx	rsi, rdi, QWORD PTR [rbp]
-        xor	rbx, rbx  ; restart both carry chains
-        adcx	r11, rdi
-        ; A[3] * B[1]
-        mulx	r15, rdi, QWORD PTR [rbp+8]
-        adox	r12, rsi
-        adcx	r12, rdi
-        ; A[3] * B[2]
-        mulx	rsi, rdi, QWORD PTR [rbp+16]
-        adox	r13, r15
-        adcx	r13, rdi
-        ; A[3] * B[3]
-        mulx	r15, rdi, QWORD PTR [rbp+24]
-        adox	r14, rsi
-        adcx	r14, rdi
-        adox	r15, rbx
-        adcx	r15, rbx
-        ; Start Reduction (8-limb product is in r8, r9, ..., r15)
-        ; mu = a[0]-a[3] + a[0]-a[2] << 32 << 64 + (a[0] * 2) << 192
-        ;    - a[0] << 32 << 192
-        ;   a[0]-a[3] + (a[0] * 2) << 192
-        mov	rdi, r8
-        lea	rdx, QWORD PTR [r11+2*r8]
-        mov	rax, r9
-        mov	rbp, r10
-        mov	rsi, r10
-        ;   a[0]-a[2] << 32
-        shl	r8, 32
-        shld	rsi, rax, 32
-        shld	r9, rdi, 32
-        ;   - a[0] << 32 << 192
-        sub	rdx, r8
-        ;   + a[0]-a[2] << 32 << 64
-        add	rax, r8
-        adc	rbp, r9
-        adc	rdx, rsi  ; mu is now rdx:rbp:rax:rdi (most significant first)
-        ; a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu
-        xor	rsi, rsi
-        ;   a += mu << 256
-        add	r12, rdi
-        adc	r13, rax
-        adc	r14, rbp
-        adc	r15, rdx
-        sbb	r8, r8  ; r8 accumulates the overflow of the top limb (0 / -1)
-        ;   a += mu << 192
-        add	r11, rdi
-        adc	r12, rax
-        mov	r9, rax
-        adc	r13, rbp
-        adc	r14, rdx
-        adc	r15, 0
-        sbb	r8, 0
-        ; mu <<= 32
-        shld	rsi, rdx, 32
-        shld	rdx, rbp, 32
-        shld	rbp, rax, 32
-        shld	rax, rdi, 32
-        shl	rdi, 32
-        ;   a -= (mu << 32) << 192
-        sub	r11, rdi
-        sbb	r12, rax
-        sbb	r13, rbp
-        sbb	r14, rdx
-        sbb	r15, rsi
-        adc	r8, 0
-        ;   a += (mu << 32) << 64
-        sub	r9, rdi
-        adc	r10, rax
-        adc	r11, rbp
-        adc	r12, rdx
-        adc	r13, rsi
-        adc	r14, 0
-        adc	r15, 0
-        sbb	r8, 0
-        mov	rax, 18446744069414584321  ; 0xFFFFFFFF00000001
-        ; mask m and sub from result if overflow
-        ;  m[0] = -1 & mask = mask
-        ;  m[2] =  0 & mask = 0
-        mov	edi, r8d  ; 32-bit write zero-extends: rdi = mask & 0xFFFFFFFF
-        and	rax, r8
-        sub	r12, r8
-        sbb	r13, rdi
-        mov	QWORD PTR [rcx], r12  ; result limbs r12, r13, r14, r15 go to r[0..3]
-        sbb	r14, 0
-        mov	QWORD PTR [rcx+8], r13
-        sbb	r15, rax
-        mov	QWORD PTR [rcx+16], r14
-        mov	QWORD PTR [rcx+24], r15
-        pop	rbx
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        pop	rbp
-        ret
-sp_256_mont_mul_avx2_4 ENDP
-_text ENDS
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-; /* Square the Montgomery form number mod the modulus (prime). (r = a * a mod m)
-;  *
-;  * r   Result of squaring.
-;  * a   Number to square in Montgomery form.
-;  * m   Modulus (prime).
-;  * mp  Montgomery multiplier.
-;  *
-;  * Registers read on entry: rcx = r, rdx = a.
-;  * r8 and r9 (m and mp in the parameter list above) are overwritten before
-;  * they are read; the reduction uses hard-coded shifts and constants.
-;  * r12-r15, rdi, rsi and rbx are saved on entry and restored before ret.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_mont_sqr_avx2_4 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        push	rbx
-        mov	rax, rdx                ; rax = a (rdx is needed as the mulx operand)
-        xor	r8, r8                  ; also clears CF and OF for the adcx/adox chains
-        mov	rdx, QWORD PTR [rax]
-        mov	rsi, QWORD PTR [rax+8]
-        mov	rbx, QWORD PTR [rax+16]
-        mov	r15, QWORD PTR [rax+24]
-        ; A[0] * A[1]
-        mulx	r10, r9, rsi
-        ; A[0] * A[2]
-        mulx	r11, r8, rbx
-        adox	r10, r8
-        ; A[0] * A[3]
-        mulx	r12, r8, r15
-        mov	rdx, rsi
-        adox	r11, r8
-        ; A[1] * A[2]
-        mulx	rdi, r8, rbx
-        mov	rdx, r15
-        adcx	r11, r8
-        ; A[1] * A[3]
-        mulx	r13, r8, rsi
-        mov	r15, 0                  ; mov leaves CF/OF intact, unlike xor
-        adox	r12, rdi
-        adcx	r12, r8
-        ; A[2] * A[3]
-        mulx	r14, r8, rbx
-        adox	r13, r15
-        adcx	r13, r8
-        adox	r14, r15
-        adcx	r14, r15
-        ; Double with Carry Flag
-        ; r9..r14 hold the sum of the cross products; each is doubled on the
-        ; CF chain while the squares A[i] * A[i] are added on the OF chain.
-        xor	r15, r15
-        ; A[0] * A[0]
-        mov	rdx, QWORD PTR [rax]
-        mulx	rdi, r8, rdx
-        adcx	r9, r9
-        adcx	r10, r10
-        adox	r9, rdi
-        ; A[1] * A[1]
-        mov	rdx, QWORD PTR [rax+8]
-        mulx	rbx, rsi, rdx
-        adcx	r11, r11
-        adox	r10, rsi
-        ; A[2] * A[2]
-        mov	rdx, QWORD PTR [rax+16]
-        mulx	rsi, rdi, rdx
-        adcx	r12, r12
-        adox	r11, rbx
-        adcx	r13, r13
-        adox	r12, rdi
-        adcx	r14, r14
-        ; A[3] * A[3]
-        mov	rdx, QWORD PTR [rax+24]
-        mulx	rbx, rdi, rdx
-        adox	r13, rsi
-        adcx	r15, r15
-        adox	r14, rdi
-        adox	r15, rbx
-        ; The eight-word square is now in r8 (lowest) .. r15 (highest).
-        ; Start Reduction
-        ; mu = a[0]-a[3] + a[0]-a[2] << 32 << 64 + (a[0] * 2) << 192
-        ;    - a[0] << 32 << 192
-        ;   a[0]-a[3] + (a[0] * 2) << 192
-        mov	rdi, r8
-        lea	rdx, QWORD PTR [r11+2*r8]
-        mov	rax, r9
-        mov	rsi, r10
-        mov	rbx, r10
-        ;   a[0]-a[2] << 32
-        shl	r8, 32
-        shld	rbx, rax, 32
-        shld	r9, rdi, 32
-        ;   - a[0] << 32 << 192
-        sub	rdx, r8
-        ;   + a[0]-a[2] << 32 << 64
-        add	rax, r8
-        adc	rsi, r9
-        adc	rdx, rbx
-        ; mu is now held in rdi (lowest), rax, rsi, rdx (highest).
-        ; a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu
-        xor	rbx, rbx
-        ;   a += mu << 256
-        add	r12, rdi
-        adc	r13, rax
-        adc	r14, rsi
-        adc	r15, rdx
-        sbb	r8, r8                  ; r8 = -(carry out of the addition)
-        ;   a += mu << 192
-        add	r11, rdi
-        adc	r12, rax
-        mov	r9, rax                 ; keep mu word 1 before rax is shifted below
-        adc	r13, rsi
-        adc	r14, rdx
-        adc	r15, 0
-        sbb	r8, 0                   ; subtract this carry out from r8 as well
-        ; mu <<= 32
-        shld	rbx, rdx, 32
-        shld	rdx, rsi, 32
-        shld	rsi, rax, 32
-        shld	rax, rdi, 32
-        shl	rdi, 32
-        ;   a -= (mu << 32) << 192
-        sub	r11, rdi
-        sbb	r12, rax
-        sbb	r13, rsi
-        sbb	r14, rdx
-        sbb	r15, rbx
-        adc	r8, 0                   ; add the borrow back into r8
-        ;   a += (mu << 32) << 64
-        ; NOTE(review): the first instruction is a sub although the comment
-        ; above describes an addition, and r9 holds the saved mu word here;
-        ; only the flags feed the adc chain - confirm against the generator.
-        sub	r9, rdi
-        adc	r10, rax
-        adc	r11, rsi
-        adc	r12, rdx
-        adc	r13, rbx
-        adc	r14, 0
-        adc	r15, 0
-        sbb	r8, 0
-        mov	rax, 18446744069414584321 ; 0xFFFFFFFF00000001
-        ; mask m and sub from result if overflow
-        ;  m[0] = -1 & mask = mask
-        ;  m[2] =  0 & mask = 0
-        mov	edi, r8d                ; m[1] & mask: low 32 bits of the mask, zero-extended
-        and	rax, r8                 ; m[3] & mask
-        sub	r12, r8
-        sbb	r13, rdi
-        mov	QWORD PTR [rcx], r12
-        sbb	r14, 0
-        mov	QWORD PTR [rcx+8], r13
-        sbb	r15, rax
-        mov	QWORD PTR [rcx+16], r14
-        mov	QWORD PTR [rcx+24], r15
-        pop	rbx
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_256_mont_sqr_avx2_4 ENDP
-_text ENDS
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-; /* Subtract b from a when the mask is set. (r = a - (b & m))
-;  * The mask m is -1 (all ones) to subtract b and 0 to subtract nothing.
-;  *
-;  * r  Receives the four-word result.
-;  * a  Four-word number to subtract from.
-;  * b  Four-word number to subtract.
-;  * m  Mask ANDed with every word of b.
-;  * Returns 0 in rax when the subtraction did not borrow, -1 when it did.
-;  *
-;  * Windows x64 registers: rcx = r, rdx = a, r8 = b, r9 = m.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_cond_sub_avx2_4 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        ; a[0..3]
-        mov	r12, QWORD PTR [rdx]
-        mov	r13, QWORD PTR [rdx+8]
-        mov	r14, QWORD PTR [rdx+16]
-        mov	r15, QWORD PTR [rdx+24]
-        ; b[0..3] & m (all masking is done before the borrow chain starts,
-        ; because and clobbers CF)
-        mov	r10, QWORD PTR [r8]
-        mov	r11, QWORD PTR [r8+8]
-        mov	rsi, QWORD PTR [r8+16]
-        mov	rdi, QWORD PTR [r8+24]
-        and	r10, r9
-        and	r11, r9
-        and	rsi, r9
-        and	rdi, r9
-        ; a - (b & m)
-        sub	r12, r10
-        sbb	r13, r11
-        sbb	r14, rsi
-        sbb	r15, rdi
-        ; Final borrow -> 0 / -1 return value.
-        sbb	rax, rax
-        mov	QWORD PTR [rcx], r12
-        mov	QWORD PTR [rcx+8], r13
-        mov	QWORD PTR [rcx+16], r14
-        mov	QWORD PTR [rcx+24], r15
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_256_cond_sub_avx2_4 ENDP
-_text ENDS
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-; /* Reduce the number back to 256 bits using Montgomery reduction.
-;  *
-;  * a   A single precision number to reduce in place.
-;  * m   The single precision number representing the modulus.
-;  * mp  The digit representing the negative inverse of m mod 2^n.
-;  *
-;  * Registers on entry: rcx = a, rdx = m, r8 = mp.
-;  * Eight words a[0..7] are read; the four-word result is written to a[0..3].
-;  * The multiplications use the words at m, but the final conditional
-;  * subtraction uses hard-coded constants instead of reading m.
-;  * r12-r15, rdi, rsi and rbx are saved on entry and restored before ret.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_mont_reduce_order_avx2_4 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        push	rbx
-        mov	rax, rcx                ; rax = a
-        mov	r10, rdx                ; r10 = m
-        mov	r11, r8                 ; r11 = mp
-        mov	r14, QWORD PTR [rax]
-        mov	r15, QWORD PTR [rax+8]
-        mov	rdi, QWORD PTR [rax+16]
-        mov	rsi, QWORD PTR [rax+24]
-        xor	r13, r13                ; r13 = carry between rounds
-        xor	r12, r12                ; r12 = constant zero; also clears CF and OF
-        ; a[0-4] += m[0-3] * mu = m[0-3] * (a[0] * mp)
-        mov	rbx, QWORD PTR [rax+32]
-        ;   mu = a[0] * mp
-        mov	rdx, r14
-        mulx	rcx, rdx, r11           ; rdx = low word of the product; rcx is overwritten below
-        ;   a[0] += m[0] * mu
-        mulx	r9, r8, QWORD PTR [r10]
-        adcx	r14, r8
-        ;   a[1] += m[1] * mu
-        mulx	rcx, r8, QWORD PTR [r10+8]
-        adox	r15, r9
-        adcx	r15, r8
-        ;   a[2] += m[2] * mu
-        mulx	r9, r8, QWORD PTR [r10+16]
-        adox	rdi, rcx
-        adcx	rdi, r8
-        ;   a[3] += m[3] * mu
-        mulx	rcx, r8, QWORD PTR [r10+24]
-        adox	rsi, r9
-        adcx	rsi, r8
-        ;   a[4] += carry
-        adox	rbx, rcx
-        adcx	rbx, r12
-        ;   carry
-        adox	r13, r12
-        adcx	r13, r12
-        ; a[1-5] += m[0-3] * mu = m[0-3] * (a[1] * mp)
-        mov	r14, QWORD PTR [rax+40] ; r14 is reused for a[5]
-        ;   mu = a[1] * mp
-        mov	rdx, r15
-        mulx	rcx, rdx, r11
-        ;   a[1] += m[0] * mu
-        mulx	r9, r8, QWORD PTR [r10]
-        adcx	r15, r8
-        ;   a[2] += m[1] * mu
-        mulx	rcx, r8, QWORD PTR [r10+8]
-        adox	rdi, r9
-        adcx	rdi, r8
-        ;   a[3] += m[2] * mu
-        mulx	r9, r8, QWORD PTR [r10+16]
-        adox	rsi, rcx
-        adcx	rsi, r8
-        ;   a[4] += m[3] * mu
-        mulx	rcx, r8, QWORD PTR [r10+24]
-        adox	rbx, r9
-        adcx	rbx, r8
-        ;   a[5] += carry
-        adox	r14, rcx
-        adcx	r14, r13
-        mov	r13, r12                ; reset the carry word without touching the flags
-        ;   carry
-        adox	r13, r12
-        adcx	r13, r12
-        ; a[2-6] += m[0-3] * mu = m[0-3] * (a[2] * mp)
-        mov	r15, QWORD PTR [rax+48] ; r15 is reused for a[6]
-        ;   mu = a[2] * mp
-        mov	rdx, rdi
-        mulx	rcx, rdx, r11
-        ;   a[2] += m[0] * mu
-        mulx	r9, r8, QWORD PTR [r10]
-        adcx	rdi, r8
-        ;   a[3] += m[1] * mu
-        mulx	rcx, r8, QWORD PTR [r10+8]
-        adox	rsi, r9
-        adcx	rsi, r8
-        ;   a[4] += m[2] * mu
-        mulx	r9, r8, QWORD PTR [r10+16]
-        adox	rbx, rcx
-        adcx	rbx, r8
-        ;   a[5] += m[3] * mu
-        mulx	rcx, r8, QWORD PTR [r10+24]
-        adox	r14, r9
-        adcx	r14, r8
-        ;   a[6] += carry
-        adox	r15, rcx
-        adcx	r15, r13
-        mov	r13, r12
-        ;   carry
-        adox	r13, r12
-        adcx	r13, r12
-        ; a[3-7] += m[0-3] * mu = m[0-3] * (a[3] * mp)
-        mov	rdi, QWORD PTR [rax+56] ; rdi is reused for a[7]
-        ;   mu = a[3] * mp
-        mov	rdx, rsi
-        mulx	rcx, rdx, r11
-        ;   a[3] += m[0] * mu
-        mulx	r9, r8, QWORD PTR [r10]
-        adcx	rsi, r8
-        ;   a[4] += m[1] * mu
-        mulx	rcx, r8, QWORD PTR [r10+8]
-        adox	rbx, r9
-        adcx	rbx, r8
-        ;   a[5] += m[2] * mu
-        mulx	r9, r8, QWORD PTR [r10+16]
-        adox	r14, rcx
-        adcx	r14, r8
-        ;   a[6] += m[3] * mu
-        mulx	rcx, r8, QWORD PTR [r10+24]
-        adox	r15, r9
-        adcx	r15, r8
-        ;   a[7] += carry
-        adox	rdi, rcx
-        adcx	rdi, r13
-        mov	r13, r12
-        ;   carry
-        adox	r13, r12
-        adcx	r13, r12
-        ; Subtract mod if carry
-        ; The reduced value is in rbx (lowest), r14, r15, rdi (highest).
-        neg	r13                     ; carry word -> subtraction mask
-        mov	r8, 17562291160714782033  ; 0xF3B9CAC2FC632551
-        mov	r9, 13611842547513532036  ; 0xBCE6FAADA7179E84
-        mov	rdx, 18446744069414584320 ; 0xFFFFFFFF00000000
-        and	r8, r13
-        and	r9, r13
-        and	rdx, r13
-        sub	rbx, r8
-        sbb	r14, r9
-        sbb	r15, r13                ; third word subtracted is the mask itself
-        sbb	rdi, rdx
-        mov	QWORD PTR [rax], rbx
-        mov	QWORD PTR [rax+8], r14
-        mov	QWORD PTR [rax+16], r15
-        mov	QWORD PTR [rax+24], rdi
-        pop	rbx
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_256_mont_reduce_order_avx2_4 ENDP
-_text ENDS
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-; /* Halve a number modulo the prime. (r = a / 2 % m)
-;  * When a is odd the modulus is added first so that the sum is even; the
-;  * five-word sum (four words plus carry) is then shifted right by one bit.
-;  *
-;  * r  Result of division by 2.
-;  * a  Number to divide.
-;  * m  Modulus (prime). Not read: the modulus words are hard-coded.
-;  *
-;  * Windows x64 registers: rcx = r, rdx = a.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_mont_div2_avx2_4 PROC
-        push	r12
-        push	r13
-        mov	rax, QWORD PTR [rdx]
-        ; r12 = 0 when a is even, all ones when a is odd
-        mov	r12d, eax
-        and	r12d, 1
-        neg	r12
-        mov	r8, QWORD PTR [rdx+8]
-        mov	r9, QWORD PTR [rdx+16]
-        mov	r10, QWORD PTR [rdx+24]
-        ; Masked modulus words: [0] = mask, [1] = low 32 bits of mask,
-        ; [2] = 0, [3] = 0xFFFFFFFF00000001 & mask
-        mov	r13, 18446744069414584321
-        and	r13, r12
-        mov	r11d, r12d
-        add	rax, r12
-        adc	r8, r11
-        adc	r9, 0
-        adc	r10, r13
-        ; Capture the carry out as the fifth word (mov keeps CF intact).
-        mov	r12d, 0
-        adc	r12d, 0
-        ; Shift the five-word value right by one bit.
-        shrd	rax, r8, 1
-        shrd	r8, r9, 1
-        shrd	r9, r10, 1
-        shrd	r10, r12, 1
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r8
-        mov	QWORD PTR [rcx+16], r9
-        mov	QWORD PTR [rcx+24], r10
-        pop	r13
-        pop	r12
-        ret
-sp_256_mont_div2_avx2_4 ENDP
-_text ENDS
-ENDIF
-IFNDEF WC_NO_CACHE_RESISTANT
-; /* Touch each possible entry that could be being copied.
-;  * Entries 1 to 63 are all read and masked, so the memory access pattern
-;  * does not depend on idx. Index 0 matches no entry and gives an all-zero
-;  * result.
-;  *
-;  * r      Point to copy into.
-;  * table  Table - start of the entries to access
-;  * idx    Index of entry to retrieve.
-;  *
-;  * Windows x64 registers: rcx = r, rdx = table, r8d = idx.
-;  * Each table entry is 64 bytes: its first 32 bytes are stored at r+0 and
-;  * its last 32 bytes at r+64.
-;  * Only legacy SSE2 encodings are used, so this version also runs on CPUs
-;  * without AVX. The callee-saved xmm6-xmm11 are spilled to the stack.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_get_entry_64_4 PROC
-        sub	rsp, 96
-        movdqu	OWORD PTR [rsp], xmm6
-        movdqu	OWORD PTR [rsp+16], xmm7
-        movdqu	OWORD PTR [rsp+32], xmm8
-        movdqu	OWORD PTR [rsp+48], xmm9
-        movdqu	OWORD PTR [rsp+64], xmm10
-        movdqu	OWORD PTR [rsp+80], xmm11
-        ; From entry 1
-        mov	rax, 1
-        movd	xmm9, r8d
-        add	rdx, 64
-        movd	xmm11, eax
-        ; 63 entries remain after skipping entry 0
-        mov	rax, 63
-        ; xmm11 = 1 in every lane, xmm9 = idx in every lane
-        pshufd	xmm11, xmm11, 0
-        pshufd	xmm9, xmm9, 0
-        ; xmm0-xmm3 accumulate the selected entry
-        pxor	xmm0, xmm0
-        pxor	xmm1, xmm1
-        pxor	xmm2, xmm2
-        pxor	xmm3, xmm3
-        ; xmm10 = index of the entry being visited, starting at 1
-        movdqa	xmm10, xmm11
-L_256_get_entry_64_4_start_0:
-        ; xmm8 = all ones when the current index equals idx, else zero
-        movdqa	xmm8, xmm10
-        paddd	xmm10, xmm11
-        pcmpeqd	xmm8, xmm9
-        movdqu	xmm4, [rdx]
-        movdqu	xmm5, [rdx+16]
-        movdqu	xmm6, [rdx+32]
-        movdqu	xmm7, [rdx+48]
-        add	rdx, 64
-        pand	xmm4, xmm8
-        pand	xmm5, xmm8
-        pand	xmm6, xmm8
-        pand	xmm7, xmm8
-        por	xmm0, xmm4
-        por	xmm1, xmm5
-        por	xmm2, xmm6
-        por	xmm3, xmm7
-        dec	rax
-        jnz	L_256_get_entry_64_4_start_0
-        movdqu	[rcx], xmm0
-        movdqu	[rcx+16], xmm1
-        movdqu	[rcx+64], xmm2
-        movdqu	[rcx+80], xmm3
-        movdqu	xmm6, OWORD PTR [rsp]
-        movdqu	xmm7, OWORD PTR [rsp+16]
-        movdqu	xmm8, OWORD PTR [rsp+32]
-        movdqu	xmm9, OWORD PTR [rsp+48]
-        movdqu	xmm10, OWORD PTR [rsp+64]
-        movdqu	xmm11, OWORD PTR [rsp+80]
-        add	rsp, 96
-        ret
-sp_256_get_entry_64_4 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Touch each possible entry that could be being copied.
-;  * Entries 1 to 63 are all read and masked, so the memory access pattern
-;  * does not depend on idx. Index 0 matches no entry and gives an all-zero
-;  * result.
-;  *
-;  * r      Point to copy into.
-;  * table  Table - start of the entries to access
-;  * idx    Index of entry to retrieve.
-;  *
-;  * Windows x64 registers: rcx = r, rdx = table, r8d = idx.
-;  * Each table entry is 64 bytes: its first 32 bytes are stored at r+0 and
-;  * its last 32 bytes at r+64.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_get_entry_64_avx2_4 PROC
-        sub	rsp, 32
-        vmovdqu	OWORD PTR [rsp], xmm6
-        vmovdqu	OWORD PTR [rsp+16], xmm7
-        mov	rax, 1
-        movd	xmm5, r8d
-        ; From entry 1
-        add	rdx, 64
-        movd	xmm7, eax
-        ; 63 entries remain after skipping entry 0. Visiting a 64th would
-        ; read 64 bytes past the last table entry and could never match a
-        ; valid idx.
-        mov	rax, 63
-        ; Broadcast lane 0: ymm5 = idx in every lane, ymm7 = 1 in every lane
-        vpxor	ymm6, ymm6, ymm6
-        vpermd	ymm5, ymm6, ymm5
-        vpermd	ymm7, ymm6, ymm7
-        ; ymm0/ymm1 accumulate the selected entry
-        vpxor	ymm0, ymm0, ymm0
-        vpxor	ymm1, ymm1, ymm1
-        ; ymm6 = index of the entry being visited, starting at 1
-        vmovdqa	ymm6, ymm7
-L_256_get_entry_64_avx2_4_start:
-        ; ymm4 = all ones when the current index equals idx, else zero
-        vpcmpeqd	ymm4, ymm6, ymm5
-        vpaddd	ymm6, ymm6, ymm7
-        vmovupd	ymm2, YMMWORD PTR [rdx]
-        vmovupd	ymm3, YMMWORD PTR [rdx+32]
-        add	rdx, 64
-        vpand	ymm2, ymm2, ymm4
-        vpand	ymm3, ymm3, ymm4
-        vpor	ymm0, ymm0, ymm2
-        vpor	ymm1, ymm1, ymm3
-        dec	rax
-        jnz	L_256_get_entry_64_avx2_4_start
-        vmovupd	YMMWORD PTR [rcx], ymm0
-        vmovupd	YMMWORD PTR [rcx+64], ymm1
-        ; Clear the upper YMM halves so that SSE code run after the return
-        ; does not pay an AVX/SSE transition penalty.
-        vzeroupper
-        vmovdqu	xmm6, OWORD PTR [rsp]
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        add	rsp, 32
-        ret
-sp_256_get_entry_64_avx2_4 ENDP
-_text ENDS
-ENDIF
-ENDIF
-IFNDEF WC_NO_CACHE_RESISTANT
-; /* Touch each possible entry that could be being copied.
-;  * Entries 1 to 64 are all read and masked, so the memory access pattern
-;  * does not depend on idx. Index 0 matches no entry and gives an all-zero
-;  * result.
-;  *
-;  * r      Point to copy into.
-;  * table  Table - start of the entries to access
-;  * idx    Index of entry to retrieve.
-;  *
-;  * Windows x64 registers: rcx = r, rdx = table, r8d = idx.
-;  * Each table entry is 64 bytes: its first 32 bytes are stored at r+0 and
-;  * its last 32 bytes at r+64.
-;  * Only legacy SSE2 encodings are used, so this version also runs on CPUs
-;  * without AVX. The callee-saved xmm6-xmm11 are spilled to the stack.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_get_entry_65_4 PROC
-        sub	rsp, 96
-        movdqu	OWORD PTR [rsp], xmm6
-        movdqu	OWORD PTR [rsp+16], xmm7
-        movdqu	OWORD PTR [rsp+32], xmm8
-        movdqu	OWORD PTR [rsp+48], xmm9
-        movdqu	OWORD PTR [rsp+64], xmm10
-        movdqu	OWORD PTR [rsp+80], xmm11
-        ; From entry 1
-        mov	rax, 1
-        movd	xmm9, r8d
-        add	rdx, 64
-        movd	xmm11, eax
-        ; 64 entries remain after skipping entry 0
-        mov	rax, 64
-        ; xmm11 = 1 in every lane, xmm9 = idx in every lane
-        pshufd	xmm11, xmm11, 0
-        pshufd	xmm9, xmm9, 0
-        ; xmm0-xmm3 accumulate the selected entry
-        pxor	xmm0, xmm0
-        pxor	xmm1, xmm1
-        pxor	xmm2, xmm2
-        pxor	xmm3, xmm3
-        ; xmm10 = index of the entry being visited, starting at 1
-        movdqa	xmm10, xmm11
-L_256_get_entry_65_4_start_0:
-        ; xmm8 = all ones when the current index equals idx, else zero
-        movdqa	xmm8, xmm10
-        paddd	xmm10, xmm11
-        pcmpeqd	xmm8, xmm9
-        movdqu	xmm4, [rdx]
-        movdqu	xmm5, [rdx+16]
-        movdqu	xmm6, [rdx+32]
-        movdqu	xmm7, [rdx+48]
-        add	rdx, 64
-        pand	xmm4, xmm8
-        pand	xmm5, xmm8
-        pand	xmm6, xmm8
-        pand	xmm7, xmm8
-        por	xmm0, xmm4
-        por	xmm1, xmm5
-        por	xmm2, xmm6
-        por	xmm3, xmm7
-        dec	rax
-        jnz	L_256_get_entry_65_4_start_0
-        movdqu	[rcx], xmm0
-        movdqu	[rcx+16], xmm1
-        movdqu	[rcx+64], xmm2
-        movdqu	[rcx+80], xmm3
-        movdqu	xmm6, OWORD PTR [rsp]
-        movdqu	xmm7, OWORD PTR [rsp+16]
-        movdqu	xmm8, OWORD PTR [rsp+32]
-        movdqu	xmm9, OWORD PTR [rsp+48]
-        movdqu	xmm10, OWORD PTR [rsp+64]
-        movdqu	xmm11, OWORD PTR [rsp+80]
-        add	rsp, 96
-        ret
-sp_256_get_entry_65_4 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Touch each possible entry that could be being copied.
-;  * Entries 1 to 64 are all read and masked, so the memory access pattern
-;  * does not depend on idx. Index 0 matches no entry and gives an all-zero
-;  * result.
-;  *
-;  * r      Point to copy into.
-;  * table  Table - start of the entries to access
-;  * idx    Index of entry to retrieve.
-;  *
-;  * Windows x64 registers: rcx = r, rdx = table, r8d = idx.
-;  * Each table entry is 64 bytes: its first 32 bytes are stored at r+0 and
-;  * its last 32 bytes at r+64.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_get_entry_65_avx2_4 PROC
-        sub	rsp, 32
-        vmovdqu	OWORD PTR [rsp], xmm6
-        vmovdqu	OWORD PTR [rsp+16], xmm7
-        mov	rax, 1
-        movd	xmm5, r8d
-        ; From entry 1
-        add	rdx, 64
-        movd	xmm7, eax
-        ; 64 entries remain after skipping entry 0. Visiting a 65th would
-        ; read 64 bytes past the last table entry and could never match a
-        ; valid idx.
-        mov	rax, 64
-        ; Broadcast lane 0: ymm5 = idx in every lane, ymm7 = 1 in every lane
-        vpxor	ymm6, ymm6, ymm6
-        vpermd	ymm5, ymm6, ymm5
-        vpermd	ymm7, ymm6, ymm7
-        ; ymm0/ymm1 accumulate the selected entry
-        vpxor	ymm0, ymm0, ymm0
-        vpxor	ymm1, ymm1, ymm1
-        ; ymm6 = index of the entry being visited, starting at 1
-        vmovdqa	ymm6, ymm7
-L_256_get_entry_65_avx2_4_start:
-        ; ymm4 = all ones when the current index equals idx, else zero
-        vpcmpeqd	ymm4, ymm6, ymm5
-        vpaddd	ymm6, ymm6, ymm7
-        vmovupd	ymm2, YMMWORD PTR [rdx]
-        vmovupd	ymm3, YMMWORD PTR [rdx+32]
-        add	rdx, 64
-        vpand	ymm2, ymm2, ymm4
-        vpand	ymm3, ymm3, ymm4
-        vpor	ymm0, ymm0, ymm2
-        vpor	ymm1, ymm1, ymm3
-        dec	rax
-        jnz	L_256_get_entry_65_avx2_4_start
-        vmovupd	YMMWORD PTR [rcx], ymm0
-        vmovupd	YMMWORD PTR [rcx+64], ymm1
-        ; Clear the upper YMM halves so that SSE code run after the return
-        ; does not pay an AVX/SSE transition penalty.
-        vzeroupper
-        vmovdqu	xmm6, OWORD PTR [rsp]
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        add	rsp, 32
-        ret
-sp_256_get_entry_65_avx2_4 ENDP
-_text ENDS
-ENDIF
-ENDIF
-; /* Increment a four-word number in place. (a = a + 1)
-;  * A carry out of the top word is discarded.
-;  *
-;  * a  A single precision integer (rcx).
-;  */
-_text SEGMENT READONLY PARA
-sp_256_add_one_4 PROC
-        mov	rax, QWORD PTR [rcx]
-        mov	rdx, QWORD PTR [rcx+8]
-        mov	r8, QWORD PTR [rcx+16]
-        mov	r9, QWORD PTR [rcx+24]
-        ; Ripple the carry through all four words.
-        add	rax, 1
-        adc	rdx, 0
-        adc	r8, 0
-        adc	r9, 0
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], rdx
-        mov	QWORD PTR [rcx+16], r8
-        mov	QWORD PTR [rcx+24], r9
-        ret
-sp_256_add_one_4 ENDP
-_text ENDS
-; /* Read big endian unsigned byte array into r.
-;  * Uses the bswap instruction.
-;  *
-;  * r  A single precision integer.
-;  * size  Maximum number of bytes to convert
-;  * a  Byte array.
-;  * n  Number of bytes in array to read.
-;  *
-;  * Registers on entry: rcx = r, rdx = size, r8 = a, r9 = n.
-;  * rdx is never read by this code: zero filling stops at the fixed
-;  * address r + 32.
-;  * All comparisons on r9 are 64-bit and signed.
-;  * NOTE(review): if n is a 32-bit int in the C prototype, the upper half of
-;  * r9 is not guaranteed to be defined on entry - confirm against the caller.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_from_bin_bswap PROC
-        push	r12
-        push	r13
-        mov	r11, r8
-        mov	r12, rcx
-        add	r11, r9                 ; r11 = a + n, one past the last input byte
-        add	r12, 32                 ; r12 = r + 32, where zero filling stops
-        xor	r13, r13                ; r13 = constant zero
-        jmp	L_256_from_bin_bswap_64_end
-        ; While at least 64 bytes remain: take 64 bytes from the end of the
-        ; unread input and store them as eight byte-swapped words.
-L_256_from_bin_bswap_64_start:
-        sub	r11, 64
-        mov	rax, QWORD PTR [r11+56]
-        mov	r10, QWORD PTR [r11+48]
-        bswap	rax
-        bswap	r10
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r10
-        mov	rax, QWORD PTR [r11+40]
-        mov	r10, QWORD PTR [r11+32]
-        bswap	rax
-        bswap	r10
-        mov	QWORD PTR [rcx+16], rax
-        mov	QWORD PTR [rcx+24], r10
-        mov	rax, QWORD PTR [r11+24]
-        mov	r10, QWORD PTR [r11+16]
-        bswap	rax
-        bswap	r10
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r10
-        mov	rax, QWORD PTR [r11+8]
-        mov	r10, QWORD PTR [r11]
-        bswap	rax
-        bswap	r10
-        mov	QWORD PTR [rcx+48], rax
-        mov	QWORD PTR [rcx+56], r10
-        add	rcx, 64
-        sub	r9, 64
-L_256_from_bin_bswap_64_end:
-        cmp	r9, 63
-        jg	L_256_from_bin_bswap_64_start
-        jmp	L_256_from_bin_bswap_8_end
-        ; While at least 8 bytes remain: one byte-swapped word per pass.
-L_256_from_bin_bswap_8_start:
-        sub	r11, 8
-        mov	rax, QWORD PTR [r11]
-        bswap	rax
-        mov	QWORD PTR [rcx], rax
-        add	rcx, 8
-        sub	r9, 8
-L_256_from_bin_bswap_8_end:
-        cmp	r9, 7
-        jg	L_256_from_bin_bswap_8_start
-        cmp	r9, r13
-        je	L_256_from_bin_bswap_hi_end
-        ; 1 to 7 bytes remain at the start of a: accumulate them, most
-        ; significant byte first, into one word.
-        mov	r10, r13
-        mov	rax, r13
-L_256_from_bin_bswap_hi_start:
-        mov	al, BYTE PTR [r8]       ; upper bits of rax stay zero
-        shl	r10, 8
-        inc	r8
-        add	r10, rax
-        dec	r9
-        jg	L_256_from_bin_bswap_hi_start
-        mov	QWORD PTR [rcx], r10
-        add	rcx, 8
-L_256_from_bin_bswap_hi_end:
-        cmp	rcx, r12
-        jge	L_256_from_bin_bswap_zero_end
-        ; Zero the remaining words up to r + 32.
-L_256_from_bin_bswap_zero_start:
-        mov	QWORD PTR [rcx], r13
-        add	rcx, 8
-        cmp	rcx, r12
-        jl	L_256_from_bin_bswap_zero_start
-L_256_from_bin_bswap_zero_end:
-        pop	r13
-        pop	r12
-        ret
-sp_256_from_bin_bswap ENDP
-_text ENDS
-IFNDEF NO_MOVBE_SUPPORT
-; /* Read big endian unsigned byte array into r.
-;  * Uses the movbe instruction which is an optional instruction.
-;  *
-;  * r  A single precision integer.
-;  * size  Maximum number of bytes to convert
-;  * a  Byte array.
-;  * n  Number of bytes in array to read.
-;  *
-;  * Registers on entry: rcx = r, rdx = size, r8 = a, r9 = n.
-;  * rdx is never read by this code: zero filling stops at the fixed
-;  * address r + 32.
-;  * All comparisons on r9 are 64-bit and signed.
-;  * NOTE(review): if n is a 32-bit int in the C prototype, the upper half of
-;  * r9 is not guaranteed to be defined on entry - confirm against the caller.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_from_bin_movbe PROC
-        push	r12
-        mov	r11, r8
-        mov	r12, rcx
-        add	r11, r9                 ; r11 = a + n, one past the last input byte
-        add	r12, 32                 ; r12 = r + 32, where zero filling stops
-        jmp	L_256_from_bin_movbe_64_end
-        ; While at least 64 bytes remain: take 64 bytes from the end of the
-        ; unread input and store them as eight byte-reversed words.
-L_256_from_bin_movbe_64_start:
-        sub	r11, 64
-        movbe	rax, QWORD PTR [r11+56]
-        movbe	r10, QWORD PTR [r11+48]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r10
-        movbe	rax, QWORD PTR [r11+40]
-        movbe	r10, QWORD PTR [r11+32]
-        mov	QWORD PTR [rcx+16], rax
-        mov	QWORD PTR [rcx+24], r10
-        movbe	rax, QWORD PTR [r11+24]
-        movbe	r10, QWORD PTR [r11+16]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r10
-        movbe	rax, QWORD PTR [r11+8]
-        movbe	r10, QWORD PTR [r11]
-        mov	QWORD PTR [rcx+48], rax
-        mov	QWORD PTR [rcx+56], r10
-        add	rcx, 64
-        sub	r9, 64
-L_256_from_bin_movbe_64_end:
-        cmp	r9, 63
-        jg	L_256_from_bin_movbe_64_start
-        jmp	L_256_from_bin_movbe_8_end
-        ; While at least 8 bytes remain: one byte-reversed word per pass.
-L_256_from_bin_movbe_8_start:
-        sub	r11, 8
-        movbe	rax, QWORD PTR [r11]
-        mov	QWORD PTR [rcx], rax
-        add	rcx, 8
-        sub	r9, 8
-L_256_from_bin_movbe_8_end:
-        cmp	r9, 7
-        jg	L_256_from_bin_movbe_8_start
-        cmp	r9, 0
-        je	L_256_from_bin_movbe_hi_end
-        ; 1 to 7 bytes remain at the start of a: accumulate them, most
-        ; significant byte first, into one word.
-        mov	r10, 0
-        mov	rax, 0
-L_256_from_bin_movbe_hi_start:
-        mov	al, BYTE PTR [r8]       ; upper bits of rax stay zero
-        shl	r10, 8
-        inc	r8
-        add	r10, rax
-        dec	r9
-        jg	L_256_from_bin_movbe_hi_start
-        mov	QWORD PTR [rcx], r10
-        add	rcx, 8
-L_256_from_bin_movbe_hi_end:
-        cmp	rcx, r12
-        jge	L_256_from_bin_movbe_zero_end
-        ; Zero the remaining words up to r + 32.
-L_256_from_bin_movbe_zero_start:
-        mov	QWORD PTR [rcx], 0
-        add	rcx, 8
-        cmp	rcx, r12
-        jl	L_256_from_bin_movbe_zero_start
-L_256_from_bin_movbe_zero_end:
-        pop	r12
-        ret
-sp_256_from_bin_movbe ENDP
-_text ENDS
-ENDIF
-; /* Serialize the four-word number r as 32 big-endian bytes.
-;  * The most significant word is written first; bswap reverses the bytes
-;  * of each word.
-;  *
-;  * r  A single precision integer (rcx).
-;  * a  Byte array receiving exactly 32 bytes (rdx).
-;  */
-_text SEGMENT READONLY PARA
-sp_256_to_bin_bswap_4 PROC
-        ; Upper half: r[3], r[2] -> a[0..15]
-        mov	r9, QWORD PTR [rcx+16]
-        mov	r10, QWORD PTR [rcx+24]
-        bswap	r9
-        bswap	r10
-        mov	QWORD PTR [rdx], r10
-        mov	QWORD PTR [rdx+8], r9
-        ; Lower half: r[1], r[0] -> a[16..31]
-        mov	r9, QWORD PTR [rcx]
-        mov	r10, QWORD PTR [rcx+8]
-        bswap	r9
-        bswap	r10
-        mov	QWORD PTR [rdx+16], r10
-        mov	QWORD PTR [rdx+24], r9
-        ret
-sp_256_to_bin_bswap_4 ENDP
-_text ENDS
-IFNDEF NO_MOVBE_SUPPORT
-; /* Serialize the four-word number r as 32 big-endian bytes.
-;  * The most significant word is written first. Uses the optional movbe
-;  * instruction, here in its store form, to reverse the bytes of each word.
-;  *
-;  * r  A single precision integer (rcx).
-;  * a  Byte array receiving exactly 32 bytes (rdx).
-;  */
-_text SEGMENT READONLY PARA
-sp_256_to_bin_movbe_4 PROC
-        ; Upper half: r[3], r[2] -> a[0..15]
-        mov	rax, QWORD PTR [rcx+24]
-        mov	r8, QWORD PTR [rcx+16]
-        movbe	QWORD PTR [rdx], rax
-        movbe	QWORD PTR [rdx+8], r8
-        ; Lower half: r[1], r[0] -> a[16..31]
-        mov	rax, QWORD PTR [rcx+8]
-        mov	r8, QWORD PTR [rcx]
-        movbe	QWORD PTR [rdx+16], rax
-        movbe	QWORD PTR [rdx+24], r8
-        ret
-sp_256_to_bin_movbe_4 ENDP
-_text ENDS
-ENDIF
-; /* Subtract the four-word number b from a, in place. (a -= b)
-;  *
-;  * a  A single precision integer and result (rcx).
-;  * b  A single precision integer (rdx).
-;  * Returns 0 in rax when the subtraction did not borrow, -1 when it did.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_sub_in_place_4 PROC
-        ; Read all of b before any word of a is rewritten.
-        mov	r8, QWORD PTR [rdx]
-        mov	r9, QWORD PTR [rdx+8]
-        mov	r10, QWORD PTR [rdx+16]
-        mov	r11, QWORD PTR [rdx+24]
-        ; Word-by-word subtract; mov does not disturb the borrow in CF.
-        mov	rax, QWORD PTR [rcx]
-        sub	rax, r8
-        mov	QWORD PTR [rcx], rax
-        mov	rax, QWORD PTR [rcx+8]
-        sbb	rax, r9
-        mov	QWORD PTR [rcx+8], rax
-        mov	rax, QWORD PTR [rcx+16]
-        sbb	rax, r10
-        mov	QWORD PTR [rcx+16], rax
-        mov	rax, QWORD PTR [rcx+24]
-        sbb	rax, r11
-        mov	QWORD PTR [rcx+24], rax
-        ; Final borrow -> 0 / -1 return value.
-        sbb	rax, rax
-        ret
-sp_256_sub_in_place_4 ENDP
-_text ENDS
-; /* Multiply the four-word number a by the single word b. (r = a * b)
-;  * Five words are written to r.
-;  *
-;  * r  A single precision integer (rcx).
-;  * a  A single precision integer (rdx).
-;  * b  A single precision digit (r8).
-;  */
-_text SEGMENT READONLY PARA
-sp_256_mul_d_4 PROC
-        ; mul writes rdx, so keep the pointer to a in r9.
-        mov	r9, rdx
-        ; A[0] * B
-        mov	rax, r8
-        mul	QWORD PTR [r9]
-        mov	QWORD PTR [rcx], rax
-        mov	r10, rdx
-        ; A[1] * B, plus the high word of the previous product.
-        ; The high word of a 64x64 product is at most 2^64 - 2, so adding
-        ; the carry to rdx cannot overflow.
-        mov	rax, r8
-        mul	QWORD PTR [r9+8]
-        add	rax, r10
-        adc	rdx, 0
-        mov	QWORD PTR [rcx+8], rax
-        mov	r10, rdx
-        ; A[2] * B
-        mov	rax, r8
-        mul	QWORD PTR [r9+16]
-        add	rax, r10
-        adc	rdx, 0
-        mov	QWORD PTR [rcx+16], rax
-        mov	r10, rdx
-        ; A[3] * B
-        mov	rax, r8
-        mul	QWORD PTR [r9+24]
-        add	rax, r10
-        adc	rdx, 0
-        mov	QWORD PTR [rcx+24], rax
-        mov	QWORD PTR [rcx+32], rdx
-        ret
-sp_256_mul_d_4 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Multiply the four-word number a by the single word b. (r = a * b)
-;  * Five words are written to r. mulx leaves the flags alone, so a single
-;  * carry chain runs across the four products.
-;  *
-;  * r  A single precision integer (rcx).
-;  * a  A single precision integer (rdx).
-;  * b  A single precision digit (r8).
-;  */
-_text SEGMENT READONLY PARA
-sp_256_mul_d_avx2_4 PROC
-        ; rdx is the implicit mulx multiplicand, so move a to rax first.
-        mov	rax, rdx
-        mov	rdx, r8
-        ; A[0] * B
-        mulx	r10, r9, QWORD PTR [rax]
-        mov	QWORD PTR [rcx], r9
-        ; A[1] * B
-        mulx	r11, r9, QWORD PTR [rax+8]
-        add	r9, r10
-        mov	QWORD PTR [rcx+8], r9
-        ; A[2] * B
-        mulx	r10, r9, QWORD PTR [rax+16]
-        adc	r9, r11
-        mov	QWORD PTR [rcx+16], r9
-        ; A[3] * B
-        mulx	r11, r9, QWORD PTR [rax+24]
-        adc	r9, r10
-        adc	r11, 0
-        mov	QWORD PTR [rcx+24], r9
-        mov	QWORD PTR [rcx+32], r11
-        ret
-sp_256_mul_d_avx2_4 ENDP
-_text ENDS
-ENDIF
-IFDEF _WIN64
-; /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
-;  *
-;  * d1   The high order half of the number to divide (rcx).
-;  * d0   The low order half of the number to divide (rdx).
-;  * div  The divisor (r8).
-;  * returns the quotient in rax.
-;  *
-;  * The div instruction raises a divide error when div is zero or when the
-;  * quotient does not fit in 64 bits (that is, when d1 >= div).
-;  */
-_text SEGMENT READONLY PARA
-div_256_word_asm_4 PROC
-        ; div takes its dividend in rdx:rax.
-        mov	rax, rdx
-        mov	rdx, rcx
-        div	r8
-        ret
-div_256_word_asm_4 ENDP
-_text ENDS
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-; /* Multiply two Montgomery form numbers mod the modulus (prime).
-;  * (r = a * b mod m)
-;  *
-;  * r   Result of multiplication.
-;  * a   First number to multiply in Montgomery form.
-;  * b   Second number to multiply in Montgomery form.
-;  *
-;  * Registers on entry: rcx = r, rdx = a, r8 = b.
-;  * The modulus words and the per-word multiplier used by the reduction are
-;  * hard-coded constants; no modulus pointer is read.
-;  * rbp, r12-r15, rdi, rsi and rbx are saved on entry and restored before
-;  * ret.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_mont_mul_order_avx2_4 PROC
-        push	rbp
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        push	rbx
-        mov	rbp, r8                 ; rbp = b
-        mov	rax, rdx                ; rax = a (rdx is needed as the mulx operand)
-        mov	rdx, QWORD PTR [rax]
-        mov	r14, QWORD PTR [rbp+8]  ; r14 = B[1], kept in a register for the first three rows
-        ; A[0] * B[0]
-        mulx	r9, r8, QWORD PTR [rbp]
-        xor	rbx, rbx                ; rbx = 0; also clears CF and OF
-        ; A[0] * B[1]
-        mulx	r10, rdi, r14
-        adcx	r9, rdi
-        ; A[0] * B[2]
-        mulx	r11, rdi, QWORD PTR [rbp+16]
-        adcx	r10, rdi
-        ; A[0] * B[3]
-        mulx	r12, rdi, QWORD PTR [rbp+24]
-        adcx	r11, rdi
-        mov	rdx, QWORD PTR [rax+8]
-        adcx	r12, rbx
-        ; A[1] * B[0]
-        mulx	rsi, rdi, QWORD PTR [rbp]
-        xor	rbx, rbx
-        adcx	r9, rdi
-        ; A[1] * B[1]
-        mulx	r15, rdi, r14
-        adox	r10, rsi
-        adcx	r10, rdi
-        ; A[1] * B[2]
-        mulx	rsi, rdi, QWORD PTR [rbp+16]
-        adox	r11, r15
-        adcx	r11, rdi
-        ; A[1] * B[3]
-        mulx	r13, rdi, QWORD PTR [rbp+24]
-        adox	r12, rsi
-        adcx	r12, rdi
-        adox	r13, rbx
-        mov	rdx, QWORD PTR [rax+16]
-        adcx	r13, rbx
-        ; A[2] * B[0]
-        mulx	rsi, rdi, QWORD PTR [rbp]
-        xor	rbx, rbx
-        adcx	r10, rdi
-        ; A[2] * B[1]
-        mulx	r15, rdi, r14
-        adox	r11, rsi
-        adcx	r11, rdi
-        ; A[2] * B[2]
-        mulx	rsi, rdi, QWORD PTR [rbp+16]
-        adox	r12, r15
-        adcx	r12, rdi
-        ; A[2] * B[3]
-        mulx	r14, rdi, QWORD PTR [rbp+24] ; r14 now receives a product word, so B[1] is reloaded from memory below
-        adox	r13, rsi
-        adcx	r13, rdi
-        adox	r14, rbx
-        mov	rdx, QWORD PTR [rax+24]
-        adcx	r14, rbx
-        ; A[3] * B[0]
-        mulx	rsi, rdi, QWORD PTR [rbp]
-        xor	rbx, rbx
-        adcx	r11, rdi
-        ; A[3] * B[1]
-        mulx	r15, rdi, QWORD PTR [rbp+8]
-        adox	r12, rsi
-        adcx	r12, rdi
-        ; A[3] * B[2]
-        mulx	rsi, rdi, QWORD PTR [rbp+16]
-        adox	r13, r15
-        adcx	r13, rdi
-        ; A[3] * B[3]
-        mulx	r15, rdi, QWORD PTR [rbp+24]
-        adox	r14, rsi
-        adcx	r14, rdi
-        adox	r15, rbx
-        adcx	r15, rbx
-        ; The eight-word product is now in r8 (lowest) .. r15 (highest).
-        ; Start Reduction
-        ; Each round computes mu = rbx * (lowest remaining word) and adds
-        ; mu times the four hard-coded modulus words; r8 then carries the
-        ; round's carry into the next round.
-        mov	rbx, 14758798090332847183 ; 0xCCD1C8AAEE00BC4F (presumably -1/modulus mod 2^64 - confirm)
-        ;  A[0]
-        mov	rdx, rbx
-        imul	rdx, r8
-        mov	rdi, 17562291160714782033 ; 0xF3B9CAC2FC632551, modulus word 0
-        xor	rbp, rbp                ; rbp = 0; also clears CF and OF
-        mulx	rax, rsi, rdi
-        mov	rdi, 13611842547513532036 ; 0xBCE6FAADA7179E84, modulus word 1
-        adcx	r8, rsi
-        adox	r9, rax
-        mulx	rax, rsi, rdi
-        mov	rdi, 18446744073709551615 ; 0xFFFFFFFFFFFFFFFF, modulus word 2
-        adcx	r9, rsi
-        adox	r10, rax
-        mulx	rax, rsi, rdi
-        mov	rdi, 18446744069414584320 ; 0xFFFFFFFF00000000, modulus word 3
-        adcx	r10, rsi
-        adox	r11, rax
-        mulx	rax, rsi, rdi
-        adcx	r11, rsi
-        adox	r12, rax
-        adcx	r12, rbp
-        mov	r8, rbp
-        ;   carry
-        adox	r8, rbp
-        adcx	r8, rbp
-        ;  A[1]
-        mov	rdx, rbx
-        imul	rdx, r9
-        mov	rdi, 17562291160714782033
-        xor	rbp, rbp
-        mulx	rax, rsi, rdi
-        mov	rdi, 13611842547513532036
-        adcx	r9, rsi
-        adox	r10, rax
-        mulx	rax, rsi, rdi
-        mov	rdi, 18446744073709551615
-        adcx	r10, rsi
-        adox	r11, rax
-        mulx	rax, rsi, rdi
-        mov	rdi, 18446744069414584320
-        adcx	r11, rsi
-        adox	r12, rax
-        mulx	rax, rsi, rdi
-        adcx	r12, rsi
-        adox	r13, rax
-        adcx	r13, r8
-        mov	r8, rbp
-        ;   carry
-        adox	r8, rbp
-        adcx	r8, rbp
-        ;  A[2]
-        mov	rdx, rbx
-        imul	rdx, r10
-        mov	rdi, 17562291160714782033
-        xor	rbp, rbp
-        mulx	rax, rsi, rdi
-        mov	rdi, 13611842547513532036
-        adcx	r10, rsi
-        adox	r11, rax
-        mulx	rax, rsi, rdi
-        mov	rdi, 18446744073709551615
-        adcx	r11, rsi
-        adox	r12, rax
-        mulx	rax, rsi, rdi
-        mov	rdi, 18446744069414584320
-        adcx	r12, rsi
-        adox	r13, rax
-        mulx	rax, rsi, rdi
-        adcx	r13, rsi
-        adox	r14, rax
-        adcx	r14, r8
-        mov	r8, rbp
-        ;   carry
-        adox	r8, rbp
-        adcx	r8, rbp
-        ;  A[3]
-        mov	rdx, rbx
-        imul	rdx, r11
-        mov	rdi, 17562291160714782033
-        xor	rbp, rbp
-        mulx	rax, rsi, rdi
-        mov	rdi, 13611842547513532036
-        adcx	r11, rsi
-        adox	r12, rax
-        mulx	rax, rsi, rdi
-        mov	rdi, 18446744073709551615
-        adcx	r12, rsi
-        adox	r13, rax
-        mulx	rax, rsi, rdi
-        mov	rdi, 18446744069414584320
-        adcx	r13, rsi
-        adox	r14, rax
-        mulx	rax, rsi, rdi
-        adcx	r14, rsi
-        adox	r15, rax
-        adcx	r15, r8
-        mov	r8, rbp
-        ;   carry
-        adox	r8, rbp
-        adcx	r8, rbp
-        ; Subtract the modulus, masked by the final carry, from r12..r15.
-        neg	r8                      ; carry word -> subtraction mask
-        mov	rdi, 17562291160714782033
-        mov	rbx, 13611842547513532036
-        and	rdi, r8
-        mov	rbp, 18446744069414584320
-        and	rbx, r8
-        and	rbp, r8
-        sub	r12, rdi
-        sbb	r13, rbx
-        mov	QWORD PTR [rcx], r12
-        sbb	r14, r8                 ; modulus word 2 is all ones, so the mask itself is subtracted
-        mov	QWORD PTR [rcx+8], r13
-        sbb	r15, rbp
-        mov	QWORD PTR [rcx+16], r14
-        mov	QWORD PTR [rcx+24], r15
-        pop	rbx
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        pop	rbp
-        ret
-sp_256_mont_mul_order_avx2_4 ENDP
-_text ENDS
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-; /* Square the Montgomery form number mod the modulus (prime). (r = a * a mod m)
-;  * Uses mulx with two independent carry chains (adcx on CF, adox on OF); the modulus words are immediates in the code.
-;  * r   Result of squaring; four 64-bit words stored through rcx.
-;  * a   Number to square in Montgomery form; four 64-bit words read through rdx.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_mont_sqr_order_avx2_4 PROC
-        push	rbp
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        push	rbx
-        mov	rax, rdx  ; rax = a; rdx is needed as the implicit mulx multiplicand
-        xor	r8, r8  ; clears CF and OF before the adcx/adox chains start
-        mov	rdx, QWORD PTR [rax]  ; rdx = A[0]
-        mov	rsi, QWORD PTR [rax+8]  ; rsi = A[1]
-        mov	rbx, QWORD PTR [rax+16]  ; rbx = A[2]
-        mov	r15, QWORD PTR [rax+24]  ; r15 = A[3]
-        ; A[0] * A[1]  (cross products accumulate in r9..r14)
-        mulx	r10, r9, rsi
-        ; A[0] * A[2]
-        mulx	r11, r8, rbx
-        adox	r10, r8
-        ; A[0] * A[3]
-        mulx	r12, r8, r15
-        mov	rdx, rsi  ; rdx = A[1]
-        adox	r11, r8
-        ; A[1] * A[2]
-        mulx	rdi, r8, rbx
-        mov	rdx, r15  ; rdx = A[3]
-        adcx	r11, r8
-        ; A[1] * A[3]  (rdx = A[3], rsi = A[1])
-        mulx	r13, r8, rsi
-        mov	r15, 0  ; zero via mov so that CF/OF stay intact
-        adox	r12, rdi
-        adcx	r12, r8
-        ; A[2] * A[3]  (rdx = A[3], rbx = A[2])
-        mulx	r14, r8, rbx
-        adox	r13, r15  ; r15 = 0: only propagates OF
-        adcx	r13, r8
-        adox	r14, r15
-        adcx	r14, r15
-        ; Double with Carry Flag: the adcx chain doubles r9..r14 while the adox chain adds the squares A[i]*A[i]
-        xor	r15, r15  ; clears CF/OF; r15 later receives the carry out of the doubling
-        ; A[0] * A[0]
-        mov	rdx, QWORD PTR [rax]
-        mulx	rdi, r8, rdx  ; r8 = lowest word of the 8-word square
-        adcx	r9, r9
-        adcx	r10, r10
-        adox	r9, rdi
-        ; A[1] * A[1]
-        mov	rdx, QWORD PTR [rax+8]
-        mulx	rbx, rsi, rdx
-        adcx	r11, r11
-        adox	r10, rsi
-        ; A[2] * A[2]
-        mov	rdx, QWORD PTR [rax+16]
-        mulx	rsi, rdi, rdx
-        adcx	r12, r12
-        adox	r11, rbx
-        adcx	r13, r13
-        adox	r12, rdi
-        adcx	r14, r14
-        ; A[3] * A[3]
-        mov	rdx, QWORD PTR [rax+24]
-        mulx	rbx, rdi, rdx
-        adox	r13, rsi
-        adcx	r15, r15
-        adox	r14, rdi
-        adox	r15, rbx
-        ; Start Reduction: the square is in r8..r15 (low to high); four rounds each fold in mu * modulus
-        mov	rbx, 14758798090332847183  ; per-round multiplier; presumably -1/m mod 2^64 - confirm
-        ;  A[0]: round 0, eliminates r8
-        mov	rdx, rbx
-        imul	rdx, r8  ; mu = low 64 bits of (multiplier * lowest live word)
-        mov	rdi, 17562291160714782033  ; modulus word 0
-        xor	rbp, rbp  ; rbp = 0 and CF/OF cleared for this round
-        mulx	rax, rsi, rdi
-        mov	rdi, 13611842547513532036  ; modulus word 1
-        adcx	r8, rsi
-        adox	r9, rax
-        mulx	rax, rsi, rdi
-        mov	rdi, 18446744073709551615  ; modulus word 2 (2^64 - 1)
-        adcx	r9, rsi
-        adox	r10, rax
-        mulx	rax, rsi, rdi
-        mov	rdi, 18446744069414584320  ; modulus word 3 (2^64 - 2^32)
-        adcx	r10, rsi
-        adox	r11, rax
-        mulx	rax, rsi, rdi
-        adcx	r11, rsi
-        adox	r12, rax
-        adcx	r12, rbp  ; rbp = 0: folds CF into r12
-        mov	r8, rbp
-        ;   carry: r8 = OF + CF left over from this round, consumed by the next round
-        adox	r8, rbp
-        adcx	r8, rbp
-        ;  A[1]: round 1, eliminates r9
-        mov	rdx, rbx
-        imul	rdx, r9
-        mov	rdi, 17562291160714782033
-        xor	rbp, rbp
-        mulx	rax, rsi, rdi
-        mov	rdi, 13611842547513532036
-        adcx	r9, rsi
-        adox	r10, rax
-        mulx	rax, rsi, rdi
-        mov	rdi, 18446744073709551615
-        adcx	r10, rsi
-        adox	r11, rax
-        mulx	rax, rsi, rdi
-        mov	rdi, 18446744069414584320
-        adcx	r11, rsi
-        adox	r12, rax
-        mulx	rax, rsi, rdi
-        adcx	r12, rsi
-        adox	r13, rax
-        adcx	r13, r8  ; adds the previous round's carry plus CF
-        mov	r8, rbp
-        ;   carry: r8 = OF + CF left over from this round
-        adox	r8, rbp
-        adcx	r8, rbp
-        ;  A[2]: round 2, eliminates r10
-        mov	rdx, rbx
-        imul	rdx, r10
-        mov	rdi, 17562291160714782033
-        xor	rbp, rbp
-        mulx	rax, rsi, rdi
-        mov	rdi, 13611842547513532036
-        adcx	r10, rsi
-        adox	r11, rax
-        mulx	rax, rsi, rdi
-        mov	rdi, 18446744073709551615
-        adcx	r11, rsi
-        adox	r12, rax
-        mulx	rax, rsi, rdi
-        mov	rdi, 18446744069414584320
-        adcx	r12, rsi
-        adox	r13, rax
-        mulx	rax, rsi, rdi
-        adcx	r13, rsi
-        adox	r14, rax
-        adcx	r14, r8  ; adds the previous round's carry plus CF
-        mov	r8, rbp
-        ;   carry: r8 = OF + CF left over from this round
-        adox	r8, rbp
-        adcx	r8, rbp
-        ;  A[3]: round 3, eliminates r11
-        mov	rdx, rbx
-        imul	rdx, r11
-        mov	rdi, 17562291160714782033
-        xor	rbp, rbp
-        mulx	rax, rsi, rdi
-        mov	rdi, 13611842547513532036
-        adcx	r11, rsi
-        adox	r12, rax
-        mulx	rax, rsi, rdi
-        mov	rdi, 18446744073709551615
-        adcx	r12, rsi
-        adox	r13, rax
-        mulx	rax, rsi, rdi
-        mov	rdi, 18446744069414584320
-        adcx	r13, rsi
-        adox	r14, rax
-        mulx	rax, rsi, rdi
-        adcx	r14, rsi
-        adox	r15, rax
-        adcx	r15, r8  ; adds the previous round's carry plus CF
-        mov	r8, rbp
-        ;   carry: r8 = carry out of the reduced value r12..r15
-        adox	r8, rbp
-        adcx	r8, rbp
-        neg	r8  ; turns the carry into a mask: 0, or all ones when the carry is 1
-        mov	rdi, 17562291160714782033
-        mov	rbx, 13611842547513532036
-        and	rdi, r8  ; masked modulus word 0
-        mov	rbp, 18446744069414584320
-        and	rbx, r8  ; masked modulus word 1
-        and	rbp, r8  ; masked modulus word 3
-        sub	r12, rdi  ; branch-free: subtract the modulus only when the mask is set
-        sbb	r13, rbx
-        mov	QWORD PTR [rcx], r12
-        sbb	r14, r8  ; modulus word 2 is all ones, so the mask itself is the masked word
-        mov	QWORD PTR [rcx+8], r13
-        sbb	r15, rbp
-        mov	QWORD PTR [rcx+16], r14
-        mov	QWORD PTR [rcx+24], r15
-        pop	rbx
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        pop	rbp
-        ret
-sp_256_mont_sqr_order_avx2_4 ENDP
-_text ENDS
-ENDIF
-; /* Non-constant time modular inversion.
-;  *
-;  * Two passes:
-;  *   1. Binary GCD on u = m and v = a. Every halving/subtraction step is
-;  *      logged as one byte in a buffer on the stack.
-;  *   2. The log is replayed on the coefficients b (starts as m) and d
-;  *      (starts as 1), doing each step modulo m.
-;  * Log byte values: 0 = halve b, 1 = halve d, 2 = b -= d then halve b,
-;  *                  3 = d -= b then halve d, 7 = end of log.
-;  *
-;  * Microsoft x64 ABI: rcx = r, rdx = a, r8 = m.
-;  * Running time and memory access pattern depend on the value of a.
-;  * NOTE(review): there is no bound check on the log index; the loops rely
-;  * on the GCD reaching 1 (a non-zero and coprime to m) - confirm callers
-;  * guarantee this.
-;  *
-;  * @param  [out]  r   Resulting number (four 64-bit words).
-;  * @param  [in]   a   Number to invert (four 64-bit words).
-;  * @param  [in]   m   Modulus (four 64-bit words).
-;  * @return  MP_OKAY on success. NOTE(review): only al is written (0 or 1,
-;  *          as an internal selector); the rest of rax is never set here -
-;  *          confirm whether any caller uses the return value.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_mod_inv_4 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        ; Operation log buffer. 520 bytes keeps at least 513 bytes for the
-        ; log and, after the six pushes above, leaves rsp 16-byte aligned as
-        ; the Microsoft x64 ABI requires outside the prolog/epilog.
-        sub	rsp, 520
-        ; u = m in r9..r12 (low to high)
-        mov	r9, QWORD PTR [r8]
-        mov	r10, QWORD PTR [r8+8]
-        mov	r11, QWORD PTR [r8+16]
-        mov	r12, QWORD PTR [r8+24]
-        ; v = a in r13, r14, r15, rdi (low to high)
-        mov	r13, QWORD PTR [rdx]
-        mov	r14, QWORD PTR [rdx+8]
-        mov	r15, QWORD PTR [rdx+16]
-        mov	rdi, QWORD PTR [rdx+24]
-        ; rsi = write index into the operation log
-        mov	rsi, 0
-        test	r13b, 1
-        jnz	L_256_mod_inv_4_v_even_end
-        ; While v is even: v >>= 1, log op 1
-L_256_mod_inv_4_v_even_start:
-        shrd	r13, r14, 1
-        shrd	r14, r15, 1
-        shrd	r15, rdi, 1
-        shr	rdi, 1
-        mov	BYTE PTR [rsp+rsi], 1
-        inc	rsi
-        test	r13b, 1
-        jz	L_256_mod_inv_4_v_even_start
-L_256_mod_inv_4_v_even_end:
-        ; Main loop: unsigned compare of u and v from the top word down
-L_256_mod_inv_4_uv_start:
-        cmp	r12, rdi
-        jb	L_256_mod_inv_4_uv_v
-        ja	L_256_mod_inv_4_uv_u
-        cmp	r11, r15
-        jb	L_256_mod_inv_4_uv_v
-        ja	L_256_mod_inv_4_uv_u
-        cmp	r10, r14
-        jb	L_256_mod_inv_4_uv_v
-        ja	L_256_mod_inv_4_uv_u
-        cmp	r9, r13
-        jb	L_256_mod_inv_4_uv_v
-        ; u >= v: u = (u - v) >> 1, log op 2
-L_256_mod_inv_4_uv_u:
-        mov	BYTE PTR [rsp+rsi], 2
-        inc	rsi
-        sub	r9, r13
-        sbb	r10, r14
-        sbb	r11, r15
-        sbb	r12, rdi
-        shrd	r9, r10, 1
-        shrd	r10, r11, 1
-        shrd	r11, r12, 1
-        shr	r12, 1
-        test	r9b, 1
-        jnz	L_256_mod_inv_4_usubv_even_end
-        ; While u is even: u >>= 1, log op 0
-L_256_mod_inv_4_usubv_even_start:
-        shrd	r9, r10, 1
-        shrd	r10, r11, 1
-        shrd	r11, r12, 1
-        shr	r12, 1
-        mov	BYTE PTR [rsp+rsi], 0
-        inc	rsi
-        test	r9b, 1
-        jz	L_256_mod_inv_4_usubv_even_start
-L_256_mod_inv_4_usubv_even_end:
-        ; Finished when u == 1, otherwise go round again
-        cmp	r9, 1
-        jne	L_256_mod_inv_4_uv_start
-        mov	rdx, r10
-        or	rdx, r11
-        jne	L_256_mod_inv_4_uv_start
-        or	rdx, r12
-        jne	L_256_mod_inv_4_uv_start
-        ; al = 1: u reached 1, so the result will be taken from b
-        mov	al, 1
-        jmp	L_256_mod_inv_4_uv_end
-        ; u < v: v = (v - u) >> 1, log op 3
-L_256_mod_inv_4_uv_v:
-        mov	BYTE PTR [rsp+rsi], 3
-        inc	rsi
-        sub	r13, r9
-        sbb	r14, r10
-        sbb	r15, r11
-        sbb	rdi, r12
-        shrd	r13, r14, 1
-        shrd	r14, r15, 1
-        shrd	r15, rdi, 1
-        shr	rdi, 1
-        test	r13b, 1
-        jnz	L_256_mod_inv_4_vsubu_even_end
-        ; While v is even: v >>= 1, log op 1
-L_256_mod_inv_4_vsubu_even_start:
-        shrd	r13, r14, 1
-        shrd	r14, r15, 1
-        shrd	r15, rdi, 1
-        shr	rdi, 1
-        mov	BYTE PTR [rsp+rsi], 1
-        inc	rsi
-        test	r13b, 1
-        jz	L_256_mod_inv_4_vsubu_even_start
-L_256_mod_inv_4_vsubu_even_end:
-        ; Finished when v == 1, otherwise go round again
-        cmp	r13, 1
-        jne	L_256_mod_inv_4_uv_start
-        mov	rdx, r14
-        or	rdx, r15
-        jne	L_256_mod_inv_4_uv_start
-        or	rdx, rdi
-        jne	L_256_mod_inv_4_uv_start
-        ; al = 0: v reached 1, so the result will be taken from d
-        mov	al, 0
-L_256_mod_inv_4_uv_end:
-        ; Replay pass: b = m in r9..r12, d = 1 in r13, r14, r15, rdi
-        mov	r9, QWORD PTR [r8]
-        mov	r10, QWORD PTR [r8+8]
-        mov	r11, QWORD PTR [r8+16]
-        mov	r12, QWORD PTR [r8+24]
-        mov	r13, 1
-        xor	r14, r14
-        xor	r15, r15
-        xor	rdi, rdi
-        ; Terminate the log, then read it back from the start.
-        ; rsi becomes the read index of the next log byte.
-        mov	BYTE PTR [rsp+rsi], 7
-        mov	dl, BYTE PTR [rsp]
-        mov	rsi, 1
-        ; Dispatch on the op byte: 0 -> div2_b, 1 -> div2_d, 2 -> b_sub_d,
-        ; 3 -> d_sub_b, anything larger (7) -> end. All op values are small
-        ; and non-negative, so the signed jl is safe.
-        cmp	dl, 1
-        je	L_256_mod_inv_4_op_div2_d
-        jl	L_256_mod_inv_4_op_div2_b
-        cmp	dl, 3
-        je	L_256_mod_inv_4_op_d_sub_b
-        jl	L_256_mod_inv_4_op_b_sub_d
-        jmp	L_256_mod_inv_4_op_end
-        ; Op 2: b -= d; on borrow add m back; then fall into the halving of b
-L_256_mod_inv_4_op_b_sub_d:
-        sub	r9, r13
-        sbb	r10, r14
-        sbb	r11, r15
-        sbb	r12, rdi
-        jnc	L_256_mod_inv_4_op_div2_b
-        add	r9, QWORD PTR [r8]
-        adc	r10, QWORD PTR [r8+8]
-        adc	r11, QWORD PTR [r8+16]
-        adc	r12, QWORD PTR [r8+24]
-        ; Op 0: b = b / 2 mod m. If b is odd, add m first; rdx catches the
-        ; carry out so it can be shifted back in as the top bit.
-L_256_mod_inv_4_op_div2_b:
-        test	r9b, 1
-        mov	rdx, 0
-        jz	L_256_mod_inv_4_op_div2_b_mod
-        add	r9, QWORD PTR [r8]
-        adc	r10, QWORD PTR [r8+8]
-        adc	r11, QWORD PTR [r8+16]
-        adc	r12, QWORD PTR [r8+24]
-        adc	rdx, 0
-L_256_mod_inv_4_op_div2_b_mod:
-        shrd	r9, r10, 1
-        shrd	r10, r11, 1
-        shrd	r11, r12, 1
-        shrd	r12, rdx, 1
-        ; Fetch and dispatch the next op
-        mov	dl, BYTE PTR [rsp+rsi]
-        inc	rsi
-        cmp	dl, 1
-        je	L_256_mod_inv_4_op_div2_d
-        jl	L_256_mod_inv_4_op_div2_b
-        cmp	dl, 3
-        je	L_256_mod_inv_4_op_d_sub_b
-        jl	L_256_mod_inv_4_op_b_sub_d
-        jmp	L_256_mod_inv_4_op_end
-        ; Op 3: d -= b; on borrow add m back; then fall into the halving of d
-L_256_mod_inv_4_op_d_sub_b:
-        sub	r13, r9
-        sbb	r14, r10
-        sbb	r15, r11
-        sbb	rdi, r12
-        jnc	L_256_mod_inv_4_op_div2_d
-        add	r13, QWORD PTR [r8]
-        adc	r14, QWORD PTR [r8+8]
-        adc	r15, QWORD PTR [r8+16]
-        adc	rdi, QWORD PTR [r8+24]
-        ; Op 1: d = d / 2 mod m, same scheme as for b
-L_256_mod_inv_4_op_div2_d:
-        test	r13b, 1
-        mov	rdx, 0
-        jz	L_256_mod_inv_4_op_div2_d_mod
-        add	r13, QWORD PTR [r8]
-        adc	r14, QWORD PTR [r8+8]
-        adc	r15, QWORD PTR [r8+16]
-        adc	rdi, QWORD PTR [r8+24]
-        adc	rdx, 0
-L_256_mod_inv_4_op_div2_d_mod:
-        shrd	r13, r14, 1
-        shrd	r14, r15, 1
-        shrd	r15, rdi, 1
-        shrd	rdi, rdx, 1
-        ; Fetch and dispatch the next op; op 7 falls through to the end
-        mov	dl, BYTE PTR [rsp+rsi]
-        inc	rsi
-        cmp	dl, 1
-        je	L_256_mod_inv_4_op_div2_d
-        jl	L_256_mod_inv_4_op_div2_b
-        cmp	dl, 3
-        je	L_256_mod_inv_4_op_d_sub_b
-        jl	L_256_mod_inv_4_op_b_sub_d
-L_256_mod_inv_4_op_end:
-        ; al selects which coefficient is the result: 1 -> b, otherwise d
-        cmp	al, 1
-        jne	L_256_mod_inv_4_store_d
-        mov	QWORD PTR [rcx], r9
-        mov	QWORD PTR [rcx+8], r10
-        mov	QWORD PTR [rcx+16], r11
-        mov	QWORD PTR [rcx+24], r12
-        jmp	L_256_mod_inv_4_store_end
-L_256_mod_inv_4_store_d:
-        mov	QWORD PTR [rcx], r13
-        mov	QWORD PTR [rcx+8], r14
-        mov	QWORD PTR [rcx+16], r15
-        mov	QWORD PTR [rcx+24], rdi
-L_256_mod_inv_4_store_end:
-        ; Must match the allocation at the top of the procedure
-        add	rsp, 520
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_256_mod_inv_4 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-_DATA SEGMENT  ; constant modulus as ten 26-bit limbs (top limb 22 bits): first 8 DWORDs hold limbs 0,2,4,6,8, next 8 DWORDs hold limbs 1,3,5,7,9
-ALIGN 16
-L_sp256_mod_inv_avx2_4_order DWORD 6497617,32001851,62711546,67108863,67043328,0,0,0,41070783,45522014,67108863,1023,4194303,0,0,0
-ptr_L_sp256_mod_inv_avx2_4_order QWORD L_sp256_mod_inv_avx2_4_order  ; address of the table; the procedure loads this pointer, then reads through it
-_DATA ENDS
-_DATA SEGMENT  ; 256-bit value 1: initial coefficient, and vptest mask for bit 0 of limb 0
-ALIGN 16
-L_sp256_mod_inv_avx2_4_one QWORD 1, 0,
-    0, 0
-ptr_L_sp256_mod_inv_avx2_4_one QWORD L_sp256_mod_inv_avx2_4_one
-_DATA ENDS
-_DATA SEGMENT  ; bit 0 of every DWORD: extracts the low bit of each odd-indexed limb before a right shift
-ALIGN 16
-L_sp256_mod_inv_avx2_4_all_one DWORD 1,1,1,1,1,1,1,1
-ptr_L_sp256_mod_inv_avx2_4_all_one QWORD L_sp256_mod_inv_avx2_4_all_one
-_DATA ENDS
-_DATA SEGMENT  ; bit 0 of DWORDs 1..4: extracts the low bit of even-indexed limbs 2,4,6,8 before a right shift
-ALIGN 16
-L_sp256_mod_inv_avx2_4_mask01111 DWORD 0,1,1,1,1,0,0,0
-ptr_L_sp256_mod_inv_avx2_4_mask01111 QWORD L_sp256_mod_inv_avx2_4_mask01111
-_DATA ENDS
-_DATA SEGMENT  ; vpermd index vector: destination DWORD i takes source DWORD i+1 (the last index repeats 7)
-ALIGN 16
-L_sp256_mod_inv_avx2_4_down_one_dword DWORD 1,2,3,4,5,6,7,7
-ptr_L_sp256_mod_inv_avx2_4_down_one_dword QWORD L_sp256_mod_inv_avx2_4_down_one_dword
-_DATA ENDS
-_DATA SEGMENT  ; bit 31 of DWORD 4: vptest mask for the sign of the top odd-indexed limb
-ALIGN 16
-L_sp256_mod_inv_avx2_4_neg DWORD 0,0,0,0,2147483648,0,0,0
-ptr_L_sp256_mod_inv_avx2_4_neg QWORD L_sp256_mod_inv_avx2_4_neg
-_DATA ENDS
-_DATA SEGMENT  ; index vector; sp_256_mod_inv_avx2_4 loads it into ymm13 and does not reference it again
-ALIGN 16
-L_sp256_mod_inv_avx2_4_up_one_dword DWORD 7,0,1,2,3,7,7,7
-ptr_L_sp256_mod_inv_avx2_4_up_one_dword QWORD L_sp256_mod_inv_avx2_4_up_one_dword
-_DATA ENDS
-_DATA SEGMENT  ; 2^26 - 1 in DWORDs 0..4; sp_256_mod_inv_avx2_4 loads it into ymm14 and does not reference it again
-ALIGN 16
-L_sp256_mod_inv_avx2_4_mask26 DWORD 67108863,67108863,67108863,67108863,67108863,0,0,0
-ptr_L_sp256_mod_inv_avx2_4_mask26 QWORD L_sp256_mod_inv_avx2_4_mask26
-_DATA ENDS
-; /* Non-constant time modular inversion. AVX2 variant: binary GCD on u, v in integer registers, with the two coefficients updated in step as 26-bit limbs in ymm registers.
-;  * Microsoft x64 ABI: rcx = r, rdx = a, r8 = m. xmm6-xmm14 are saved and restored on the stack.
-;  * @param  [out]  r   Resulting number; four 64-bit words stored through rcx.
-;  * @param  [in]   a   Number to invert; four 64-bit words read through rdx.
-;  * @param  [in]   m   Modulus; read through r8 as the initial u. NOTE(review): the coefficient corrections add the constant L_sp256_mod_inv_avx2_4_order, so m presumably must equal that constant - confirm.
-;  * @return  MP_OKAY on success. NOTE(review): no status is loaded into rax; at ret rax still holds word 0 of the result - confirm whether callers use the return value.
-;  */
-_text SEGMENT READONLY PARA
-sp_256_mod_inv_avx2_4 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        push	rbx
-        sub	rsp, 144  ; 9 x 16 bytes: save area for xmm6..xmm14
-        vmovdqu	OWORD PTR [rsp], xmm6
-        vmovdqu	OWORD PTR [rsp+16], xmm7
-        vmovdqu	OWORD PTR [rsp+32], xmm8
-        vmovdqu	OWORD PTR [rsp+48], xmm9
-        vmovdqu	OWORD PTR [rsp+64], xmm10
-        vmovdqu	OWORD PTR [rsp+80], xmm11
-        vmovdqu	OWORD PTR [rsp+96], xmm12
-        vmovdqu	OWORD PTR [rsp+112], xmm13
-        vmovdqu	OWORD PTR [rsp+128], xmm14
-        mov	rax, QWORD PTR [r8]  ; u = m in rax, r9, r10, r11 (low to high)
-        mov	r9, QWORD PTR [r8+8]
-        mov	r10, QWORD PTR [r8+16]
-        mov	r11, QWORD PTR [r8+24]
-        mov	r12, QWORD PTR [rdx]  ; v = a in r12..r15 (low to high)
-        mov	r13, QWORD PTR [rdx+8]
-        mov	r14, QWORD PTR [rdx+16]
-        mov	r15, QWORD PTR [rdx+24]
-        mov	rbx, ptr_L_sp256_mod_inv_avx2_4_order  ; rbx = table address, read from the pointer variable
-        vmovupd	ymm6, YMMWORD PTR [rbx]  ; ymm6 = modulus limbs 0,2,4,6,8
-        vmovupd	ymm7, YMMWORD PTR [rbx+32]  ; ymm7 = modulus limbs 1,3,5,7,9
-        mov	rbx, ptr_L_sp256_mod_inv_avx2_4_one
-        vmovupd	ymm8, YMMWORD PTR [rbx]  ; ymm8 = 1 (initial value, and mask for bit 0 of limb 0)
-        mov	rbx, ptr_L_sp256_mod_inv_avx2_4_mask01111
-        vmovupd	ymm9, YMMWORD PTR [rbx]  ; ymm9 = low-bit mask for DWORDs 1..4
-        mov	rbx, ptr_L_sp256_mod_inv_avx2_4_all_one
-        vmovupd	ymm10, YMMWORD PTR [rbx]  ; ymm10 = low-bit mask for every DWORD
-        mov	rbx, ptr_L_sp256_mod_inv_avx2_4_down_one_dword
-        vmovupd	ymm11, YMMWORD PTR [rbx]  ; ymm11 = vpermd indices moving DWORD i+1 to DWORD i
-        mov	rbx, ptr_L_sp256_mod_inv_avx2_4_neg
-        vmovupd	ymm12, YMMWORD PTR [rbx]  ; ymm12 = sign-bit mask for DWORD 4
-        mov	rbx, ptr_L_sp256_mod_inv_avx2_4_up_one_dword
-        vmovupd	ymm13, YMMWORD PTR [rbx]  ; ymm13 is not referenced again in this procedure
-        mov	rbx, ptr_L_sp256_mod_inv_avx2_4_mask26
-        vmovupd	ymm14, YMMWORD PTR [rbx]  ; ymm14 is not referenced again in this procedure
-        vpxor	xmm0, xmm0, xmm0  ; ymm0 (even limbs) / ymm1 (odd limbs) = 0: coefficient updated alongside u
-        vpxor	xmm1, xmm1, xmm1
-        vmovdqu	ymm2, ymm8  ; ymm2 (even limbs) / ymm3 (odd limbs) = 1: coefficient updated alongside v
-        vpxor	xmm3, xmm3, xmm3
-        test	r12b, 1
-        jnz	L_256_mod_inv_avx2_4_v_even_end
-L_256_mod_inv_avx2_4_v_even_start:
-        shrd	r12, r13, 1  ; while v is even: v >>= 1 and halve its coefficient
-        shrd	r13, r14, 1
-        shrd	r14, r15, 1
-        shr	r15, 1
-        vptest	ymm2, ymm8  ; ZF = 1 when bit 0 of limb 0 is clear (coefficient even)
-        jz	L_256_mod_inv_avx2_4_v_even_shr1
-        vpaddd	ymm2, ymm2, ymm6  ; odd: add the modulus limb-wise so the sum is even
-        vpaddd	ymm3, ymm3, ymm7
-L_256_mod_inv_avx2_4_v_even_shr1:
-        vpand	ymm4, ymm2, ymm9  ; low bits of even limbs 2,4,6,8
-        vpand	ymm5, ymm3, ymm10  ; low bits of the odd limbs
-        vpermd	ymm4, ymm11, ymm4  ; line each even limb's low bit up with the odd limb below it
-        vpsrad	ymm2, ymm2, 1  ; arithmetic shift keeps the sign of each limb
-        vpsrad	ymm3, ymm3, 1
-        vpslld	ymm5, ymm5, 25  ; low bit of odd limb 2i+1 becomes bit 25 of even limb 2i
-        vpslld	xmm4, xmm4, 25  ; low bit of even limb 2i+2 becomes bit 25 of odd limb 2i+1
-        vpaddd	ymm2, ymm2, ymm5
-        vpaddd	ymm3, ymm3, ymm4
-        test	r12b, 1
-        jz	L_256_mod_inv_avx2_4_v_even_start
-L_256_mod_inv_avx2_4_v_even_end:
-L_256_mod_inv_avx2_4_uv_start:
-        cmp	r11, r15  ; unsigned compare of u and v from the top word down
-        jb	L_256_mod_inv_avx2_4_uv_v
-        ja	L_256_mod_inv_avx2_4_uv_u
-        cmp	r10, r14
-        jb	L_256_mod_inv_avx2_4_uv_v
-        ja	L_256_mod_inv_avx2_4_uv_u
-        cmp	r9, r13
-        jb	L_256_mod_inv_avx2_4_uv_v
-        ja	L_256_mod_inv_avx2_4_uv_u
-        cmp	rax, r12
-        jb	L_256_mod_inv_avx2_4_uv_v
-L_256_mod_inv_avx2_4_uv_u:
-        sub	rax, r12  ; u >= v: u -= v; the vector ops in between do not touch CF
-        sbb	r9, r13
-        vpsubd	ymm0, ymm0, ymm2  ; coefficient of u -= coefficient of v (limb-wise, no carry propagation)
-        sbb	r10, r14
-        vpsubd	ymm1, ymm1, ymm3
-        sbb	r11, r15
-        vptest	ymm1, ymm12  ; is the top limb negative?
-        jz	L_256_mod_inv_avx2_4_usubv_done_neg
-        vpaddd	ymm0, ymm0, ymm6  ; negative: add the modulus
-        vpaddd	ymm1, ymm1, ymm7
-L_256_mod_inv_avx2_4_usubv_done_neg:
-L_256_mod_inv_avx2_4_usubv_shr1:
-        shrd	rax, r9, 1  ; u >>= 1, repeated while u stays even
-        shrd	r9, r10, 1
-        shrd	r10, r11, 1
-        shr	r11, 1
-        vptest	ymm0, ymm8  ; coefficient even?
-        jz	L_256_mod_inv_avx2_4_usubv_sub_shr1
-        vpaddd	ymm0, ymm0, ymm6  ; odd: add the modulus before halving
-        vpaddd	ymm1, ymm1, ymm7
-L_256_mod_inv_avx2_4_usubv_sub_shr1:
-        vpand	ymm4, ymm0, ymm9  ; same cross-limb right shift by one as in the v_even loop
-        vpand	ymm5, ymm1, ymm10
-        vpermd	ymm4, ymm11, ymm4
-        vpsrad	ymm0, ymm0, 1
-        vpsrad	ymm1, ymm1, 1
-        vpslld	ymm5, ymm5, 25
-        vpslld	xmm4, xmm4, 25
-        vpaddd	ymm0, ymm0, ymm5
-        vpaddd	ymm1, ymm1, ymm4
-        test	al, 1
-        jz	L_256_mod_inv_avx2_4_usubv_shr1
-        cmp	rax, 1  ; finished when u == 1
-        jne	L_256_mod_inv_avx2_4_uv_start
-        mov	rdx, r9
-        or	rdx, r10
-        jne	L_256_mod_inv_avx2_4_uv_start
-        or	rdx, r11
-        jne	L_256_mod_inv_avx2_4_uv_start
-        vpextrd	eax, xmm0, 0  ; u == 1: unpack its coefficient; even limbs 0,2,4,6 -> eax, r10d, r12d, r14d
-        vpextrd	r10d, xmm0, 1
-        vpextrd	r12d, xmm0, 2
-        vpextrd	r14d, xmm0, 3
-        vpextrd	r9d, xmm1, 0  ; odd limbs 1,3,5,7 -> r9d, r11d, r13d, r15d
-        vpextrd	r11d, xmm1, 1
-        vpextrd	r13d, xmm1, 2
-        vpextrd	r15d, xmm1, 3
-        vextracti128 	xmm0, ymm0, 1
-        vextracti128 	xmm1, ymm1, 1
-        vpextrd	edi, xmm0, 0  ; limb 8
-        vpextrd	esi, xmm1, 0  ; limb 9
-        jmp	L_256_mod_inv_avx2_4_store_done
-L_256_mod_inv_avx2_4_uv_v:
-        sub	r12, rax  ; u < v: v -= u, mirror image of the branch above
-        sbb	r13, r9
-        vpsubd	ymm2, ymm2, ymm0  ; coefficient of v -= coefficient of u
-        sbb	r14, r10
-        vpsubd	ymm3, ymm3, ymm1
-        sbb	r15, r11
-        vptest	ymm3, ymm12  ; is the top limb negative?
-        jz	L_256_mod_inv_avx2_4_vsubu_done_neg
-        vpaddd	ymm2, ymm2, ymm6  ; negative: add the modulus
-        vpaddd	ymm3, ymm3, ymm7
-L_256_mod_inv_avx2_4_vsubu_done_neg:
-L_256_mod_inv_avx2_4_vsubu_shr1:
-        shrd	r12, r13, 1  ; v >>= 1, repeated while v stays even
-        shrd	r13, r14, 1
-        shrd	r14, r15, 1
-        shr	r15, 1
-        vptest	ymm2, ymm8  ; coefficient even?
-        jz	L_256_mod_inv_avx2_4_vsubu_sub_shr1
-        vpaddd	ymm2, ymm2, ymm6  ; odd: add the modulus before halving
-        vpaddd	ymm3, ymm3, ymm7
-L_256_mod_inv_avx2_4_vsubu_sub_shr1:
-        vpand	ymm4, ymm2, ymm9  ; same cross-limb right shift by one as in the v_even loop
-        vpand	ymm5, ymm3, ymm10
-        vpermd	ymm4, ymm11, ymm4
-        vpsrad	ymm2, ymm2, 1
-        vpsrad	ymm3, ymm3, 1
-        vpslld	ymm5, ymm5, 25
-        vpslld	xmm4, xmm4, 25
-        vpaddd	ymm2, ymm2, ymm5
-        vpaddd	ymm3, ymm3, ymm4
-        test	r12b, 1
-        jz	L_256_mod_inv_avx2_4_vsubu_shr1
-        cmp	r12, 1  ; finished when v == 1
-        jne	L_256_mod_inv_avx2_4_uv_start
-        mov	rdx, r13
-        or	rdx, r14
-        jne	L_256_mod_inv_avx2_4_uv_start
-        or	rdx, r15
-        jne	L_256_mod_inv_avx2_4_uv_start
-        vpextrd	eax, xmm2, 0  ; v == 1: unpack its coefficient into the same registers as above
-        vpextrd	r10d, xmm2, 1
-        vpextrd	r12d, xmm2, 2
-        vpextrd	r14d, xmm2, 3
-        vpextrd	r9d, xmm3, 0
-        vpextrd	r11d, xmm3, 1
-        vpextrd	r13d, xmm3, 2
-        vpextrd	r15d, xmm3, 3
-        vextracti128 	xmm2, ymm2, 1
-        vextracti128 	xmm3, ymm3, 1
-        vpextrd	edi, xmm2, 0
-        vpextrd	esi, xmm3, 0
-L_256_mod_inv_avx2_4_store_done:
-        mov	edx, eax  ; signed carry propagation through limbs 0..9: keep 26 bits, pass the rest upward
-        and	eax, 67108863  ; 67108863 = 2^26 - 1
-        sar	edx, 26
-        add	r9d, edx
-        mov	edx, r9d
-        and	r9d, 67108863
-        sar	edx, 26
-        add	r10d, edx
-        mov	edx, r10d
-        and	r10d, 67108863
-        sar	edx, 26
-        add	r11d, edx
-        mov	edx, r11d
-        and	r11d, 67108863
-        sar	edx, 26
-        add	r12d, edx
-        mov	edx, r12d
-        and	r12d, 67108863
-        sar	edx, 26
-        add	r13d, edx
-        mov	edx, r13d
-        and	r13d, 67108863
-        sar	edx, 26
-        add	r14d, edx
-        mov	edx, r14d
-        and	r14d, 67108863
-        sar	edx, 26
-        add	r15d, edx
-        mov	edx, r15d
-        and	r15d, 67108863
-        sar	edx, 26
-        add	edi, edx
-        mov	edx, edi
-        and	edi, 67108863
-        sar	edx, 26
-        add	esi, edx  ; limb 9 is left unmasked, so it keeps the sign of the whole value
-        movsxd	r9, r9d  ; pair up limbs into five 52-bit limbs: even + (odd << 26)
-        movsxd	r11, r11d
-        movsxd	r13, r13d
-        movsxd	r15, r15d
-        movsxd	rsi, esi
-        shl	r9, 26
-        shl	r11, 26
-        shl	r13, 26
-        shl	r15, 26
-        shl	rsi, 26
-        movsxd	rax, eax
-        add	rax, r9  ; 52-bit limbs end up in rax, r10, r12, r14, rdi
-        movsxd	r10, r10d
-        adc	r10, r11
-        movsxd	r12, r12d
-        adc	r12, r13
-        movsxd	r14, r14d
-        adc	r14, r15
-        movsxd	rdi, edi
-        adc	rdi, rsi
-        jge	L_256_mod_inv_avx2_4_3_no_add_order  ; signed test on the flags of the last adc: skip when the top limb is not negative
-        mov	r9, 2756213597218129  ; negative: add the constant below (presumably the same modulus in 52-bit limbs - confirm)
-        mov	r11, 3054930678533947
-        mov	r13, 4503599622973178
-        mov	r15, 68719476735
-        mov	rsi, 281474976645120
-        add	rax, r9
-        add	r10, r11
-        add	r12, r13
-        add	r14, r15
-        add	rdi, rsi
-        mov	rdx, 4503599627370495  ; 2^52 - 1
-        mov	r9, rax  ; signed carry propagation between the 52-bit limbs
-        and	rax, rdx
-        sar	r9, 52
-        add	r10, r9
-        mov	r11, r10
-        and	r10, rdx
-        sar	r11, 52
-        add	r12, r11
-        mov	r13, r12
-        and	r12, rdx
-        sar	r13, 52
-        add	r14, r13
-        mov	r15, r14
-        and	r14, rdx
-        sar	r15, 52
-        add	rdi, r15
-L_256_mod_inv_avx2_4_3_no_add_order:
-        mov	r9, r10  ; repack five 52-bit limbs into four 64-bit words
-        mov	r11, r12
-        mov	r13, r14
-        shl	r9, 52
-        sar	r10, 12
-        shl	r11, 40
-        sar	r12, 24
-        shl	r13, 28
-        sar	r14, 36
-        shl	rdi, 16
-        add	rax, r9
-        adc	r10, r11
-        adc	r12, r13
-        adc	r14, rdi
-        mov	QWORD PTR [rcx], rax  ; store the result through r
-        mov	QWORD PTR [rcx+8], r10
-        mov	QWORD PTR [rcx+16], r12
-        mov	QWORD PTR [rcx+24], r14
-        vmovdqu	xmm6, OWORD PTR [rsp]  ; restore the saved xmm6..xmm14
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vmovdqu	xmm8, OWORD PTR [rsp+32]
-        vmovdqu	xmm9, OWORD PTR [rsp+48]
-        vmovdqu	xmm10, OWORD PTR [rsp+64]
-        vmovdqu	xmm11, OWORD PTR [rsp+80]
-        vmovdqu	xmm12, OWORD PTR [rsp+96]
-        vmovdqu	xmm13, OWORD PTR [rsp+112]
-        vmovdqu	xmm14, OWORD PTR [rsp+128]
-        add	rsp, 144
-        pop	rbx
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_256_mod_inv_avx2_4 ENDP
-_text ENDS
-ENDIF
-ENDIF
-IFDEF WOLFSSL_SP_384
-; /* Multiply a and b into r. (r = a * b)
-;  * Column-wise schoolbook multiply of two 6-word numbers into 12 words. Microsoft x64 ABI: rcx = r, rdx = a, r8 = b.
-;  * r  A single precision integer; 12 words written. Words 0..5 are buffered on the stack and copied out last (presumably so r may alias a or b - confirm).
-;  * a  A single precision integer; 6 words read (pointer kept in r9).
-;  * b  A single precision integer; 6 words read through r8.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_mul_6 PROC
-        push	r12
-        mov	r9, rdx  ; r9 = a; mul overwrites rdx with the high half of each product
-        sub	rsp, 48  ; 6 x 8 bytes: buffer for result words 0..5
-        ; A[0] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9]
-        xor	r12, r12
-        mov	QWORD PTR [rsp], rax  ; result word 0
-        mov	r11, rdx  ; r11, r12, r10 rotate as the three-word column accumulator
-        ; A[0] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[1] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+8]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rsp+8], r11  ; result word 1
-        ; A[0] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[1] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+8]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[2] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+16]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rsp+16], r12  ; result word 2
-        ; A[0] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[1] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+8]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[2] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+16]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[3] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+24]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rsp+24], r10  ; result word 3
-        ; A[0] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[1] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+8]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[2] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+16]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[3] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+24]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[4] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+32]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rsp+32], r11  ; result word 4
-        ; A[0] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[1] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+8]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[2] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+16]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[3] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+24]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[4] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+32]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[5] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+40]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rsp+40], r12  ; result word 5 (last one buffered on the stack)
-        ; A[1] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+8]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[2] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+16]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[3] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+24]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[4] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+32]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[5] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+40]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rcx+48], r10  ; result word 6: from here on words go straight to r (offsets above the 48 input bytes)
-        ; A[2] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+16]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[3] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+24]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[4] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+32]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[5] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+40]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rcx+56], r11  ; result word 7
-        ; A[3] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+24]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[4] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+32]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[5] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+40]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rcx+64], r12  ; result word 8
-        ; A[4] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+32]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[5] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+40]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rcx+72], r10  ; result word 9
-        ; A[5] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+40]
-        add	r11, rax
-        adc	r12, rdx
-        mov	QWORD PTR [rcx+80], r11  ; result word 10
-        mov	QWORD PTR [rcx+88], r12  ; result word 11
-        mov	rax, QWORD PTR [rsp]  ; all input reads are done: copy buffered words 0..5 into r
-        mov	rdx, QWORD PTR [rsp+8]
-        mov	r10, QWORD PTR [rsp+16]
-        mov	r11, QWORD PTR [rsp+24]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], rdx
-        mov	QWORD PTR [rcx+16], r10
-        mov	QWORD PTR [rcx+24], r11
-        mov	rax, QWORD PTR [rsp+32]
-        mov	rdx, QWORD PTR [rsp+40]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], rdx
-        add	rsp, 48
-        pop	r12
-        ret
-sp_384_mul_6 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Multiply a and b into r. (r = a * b)
-;  *
-;  * r   Result of multiplication.
-;  * a   First number to multiply.
-;  * b   Second number to multiply.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_mul_avx2_6 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        push	rbx
-        mov	rax, rdx
-        sub	rsp, 40
-        xor	rbx, rbx
-        mov	rdx, QWORD PTR [rax]
-        ; A[0] * B[0]
-        mulx	r12, r11, QWORD PTR [r8]
-        ; A[0] * B[1]
-        mulx	r13, r9, QWORD PTR [r8+8]
-        adcx	r12, r9
-        ; A[0] * B[2]
-        mulx	r14, r9, QWORD PTR [r8+16]
-        adcx	r13, r9
-        ; A[0] * B[3]
-        mulx	r15, r9, QWORD PTR [r8+24]
-        adcx	r14, r9
-        ; A[0] * B[4]
-        mulx	rdi, r9, QWORD PTR [r8+32]
-        adcx	r15, r9
-        ; A[0] * B[5]
-        mulx	rsi, r9, QWORD PTR [r8+40]
-        adcx	rdi, r9
-        adcx	rsi, rbx
-        mov	QWORD PTR [rsp], r11
-        mov	r11, 0
-        adcx	r11, rbx
-        xor	rbx, rbx
-        mov	rdx, QWORD PTR [rax+8]
-        ; A[1] * B[0]
-        mulx	r10, r9, QWORD PTR [r8]
-        adcx	r12, r9
-        adox	r13, r10
-        ; A[1] * B[1]
-        mulx	r10, r9, QWORD PTR [r8+8]
-        adcx	r13, r9
-        adox	r14, r10
-        ; A[1] * B[2]
-        mulx	r10, r9, QWORD PTR [r8+16]
-        adcx	r14, r9
-        adox	r15, r10
-        ; A[1] * B[3]
-        mulx	r10, r9, QWORD PTR [r8+24]
-        adcx	r15, r9
-        adox	rdi, r10
-        ; A[1] * B[4]
-        mulx	r10, r9, QWORD PTR [r8+32]
-        adcx	rdi, r9
-        adox	rsi, r10
-        ; A[1] * B[5]
-        mulx	r10, r9, QWORD PTR [r8+40]
-        adcx	rsi, r9
-        adox	r11, r10
-        adcx	r11, rbx
-        mov	QWORD PTR [rsp+8], r12
-        mov	r12, 0
-        adcx	r12, rbx
-        adox	r12, rbx
-        xor	rbx, rbx
-        mov	rdx, QWORD PTR [rax+16]
-        ; A[2] * B[0]
-        mulx	r10, r9, QWORD PTR [r8]
-        adcx	r13, r9
-        adox	r14, r10
-        ; A[2] * B[1]
-        mulx	r10, r9, QWORD PTR [r8+8]
-        adcx	r14, r9
-        adox	r15, r10
-        ; A[2] * B[2]
-        mulx	r10, r9, QWORD PTR [r8+16]
-        adcx	r15, r9
-        adox	rdi, r10
-        ; A[2] * B[3]
-        mulx	r10, r9, QWORD PTR [r8+24]
-        adcx	rdi, r9
-        adox	rsi, r10
-        ; A[2] * B[4]
-        mulx	r10, r9, QWORD PTR [r8+32]
-        adcx	rsi, r9
-        adox	r11, r10
-        ; A[2] * B[5]
-        mulx	r10, r9, QWORD PTR [r8+40]
-        adcx	r11, r9
-        adox	r12, r10
-        adcx	r12, rbx
-        mov	QWORD PTR [rsp+16], r13
-        mov	r13, 0
-        adcx	r13, rbx
-        adox	r13, rbx
-        xor	rbx, rbx
-        mov	rdx, QWORD PTR [rax+24]
-        ; A[3] * B[0]
-        mulx	r10, r9, QWORD PTR [r8]
-        adcx	r14, r9
-        adox	r15, r10
-        ; A[3] * B[1]
-        mulx	r10, r9, QWORD PTR [r8+8]
-        adcx	r15, r9
-        adox	rdi, r10
-        ; A[3] * B[2]
-        mulx	r10, r9, QWORD PTR [r8+16]
-        adcx	rdi, r9
-        adox	rsi, r10
-        ; A[3] * B[3]
-        mulx	r10, r9, QWORD PTR [r8+24]
-        adcx	rsi, r9
-        adox	r11, r10
-        ; A[3] * B[4]
-        mulx	r10, r9, QWORD PTR [r8+32]
-        adcx	r11, r9
-        adox	r12, r10
-        ; A[3] * B[5]
-        mulx	r10, r9, QWORD PTR [r8+40]
-        adcx	r12, r9
-        adox	r13, r10
-        adcx	r13, rbx
-        mov	QWORD PTR [rsp+24], r14
-        mov	r14, 0
-        adcx	r14, rbx
-        adox	r14, rbx
-        xor	rbx, rbx
-        mov	rdx, QWORD PTR [rax+32]
-        ; A[4] * B[0]
-        mulx	r10, r9, QWORD PTR [r8]
-        adcx	r15, r9
-        adox	rdi, r10
-        ; A[4] * B[1]
-        mulx	r10, r9, QWORD PTR [r8+8]
-        adcx	rdi, r9
-        adox	rsi, r10
-        ; A[4] * B[2]
-        mulx	r10, r9, QWORD PTR [r8+16]
-        adcx	rsi, r9
-        adox	r11, r10
-        ; A[4] * B[3]
-        mulx	r10, r9, QWORD PTR [r8+24]
-        adcx	r11, r9
-        adox	r12, r10
-        ; A[4] * B[4]
-        mulx	r10, r9, QWORD PTR [r8+32]
-        adcx	r12, r9
-        adox	r13, r10
-        ; A[4] * B[5]
-        mulx	r10, r9, QWORD PTR [r8+40]
-        adcx	r13, r9
-        adox	r14, r10
-        adcx	r14, rbx
-        mov	QWORD PTR [rsp+32], r15
-        mov	rdx, QWORD PTR [rax+40]
-        ; A[5] * B[0]
-        mulx	r10, r9, QWORD PTR [r8]
-        adcx	rdi, r9
-        adox	rsi, r10
-        ; A[5] * B[1]
-        mulx	r10, r9, QWORD PTR [r8+8]
-        adcx	rsi, r9
-        adox	r11, r10
-        ; A[5] * B[2]
-        mulx	r10, r9, QWORD PTR [r8+16]
-        adcx	r11, r9
-        adox	r12, r10
-        ; A[5] * B[3]
-        mulx	r10, r9, QWORD PTR [r8+24]
-        adcx	r12, r9
-        adox	r13, r10
-        ; A[5] * B[4]
-        mulx	r10, r9, QWORD PTR [r8+32]
-        adcx	r13, r9
-        adox	r14, r10
-        ; A[5] * B[5]
-        mulx	r15, r9, QWORD PTR [r8+40]
-        adcx	r14, r9
-        adox	r15, rbx
-        adcx	r15, rbx
-        mov	QWORD PTR [rcx+40], rdi
-        mov	QWORD PTR [rcx+48], rsi
-        mov	QWORD PTR [rcx+56], r11
-        mov	QWORD PTR [rcx+64], r12
-        mov	QWORD PTR [rcx+72], r13
-        mov	QWORD PTR [rcx+80], r14
-        mov	QWORD PTR [rcx+88], r15
-        mov	r11, QWORD PTR [rsp]
-        mov	r12, QWORD PTR [rsp+8]
-        mov	r13, QWORD PTR [rsp+16]
-        mov	r14, QWORD PTR [rsp+24]
-        mov	r15, QWORD PTR [rsp+32]
-        mov	QWORD PTR [rcx], r11
-        mov	QWORD PTR [rcx+8], r12
-        mov	QWORD PTR [rcx+16], r13
-        mov	QWORD PTR [rcx+24], r14
-        mov	QWORD PTR [rcx+32], r15
-        add	rsp, 40
-        pop	rbx
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_384_mul_avx2_6 ENDP
-_text ENDS
-ENDIF
-; /* Square a and put result in r. (r = a * a)
-;  * Column-by-column schoolbook squaring: each cross product A[i]*A[j] (i != j) is accumulated twice.
-;  * r  Result, twelve 64-bit words; pointer in rcx (stores reach [rcx+88]).
-;  * a  Value to square, six 64-bit words; pointer in rdx, moved to r8 because mul overwrites rdx.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_sqr_6 PROC
-        push	r12
-        push	r13
-        push	r14
-        mov	r8, rdx
-        sub	rsp, 48  ; scratch for result words 0..5; copied into r at the very end
-        ; A[0] * A[0]
-        mov	rax, QWORD PTR [r8]
-        mul	rax
-        xor	r11, r11
-        mov	QWORD PTR [rsp], rax  ; result word 0
-        mov	r10, rdx  ; r9/r10/r11 rotate as the three-word column accumulator
-        ; A[0] * A[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r8]
-        xor	r9, r9
-        add	r10, rax  ; first copy of the cross product
-        adc	r11, rdx
-        adc	r9, 0
-        add	r10, rax  ; second copy of the cross product
-        adc	r11, rdx
-        adc	r9, 0
-        mov	QWORD PTR [rsp+8], r10  ; result word 1
-        ; A[0] * A[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r8]
-        xor	r10, r10
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        ; A[1] * A[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	rax
-        add	r11, rax  ; square terms are added once only
-        adc	r9, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rsp+16], r11  ; result word 2
-        ; A[0] * A[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r8]
-        xor	r11, r11
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[1] * A[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r8+8]
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rsp+24], r9  ; result word 3
-        ; A[0] * A[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r8]
-        xor	r9, r9
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        ; A[1] * A[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r8+8]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        ; A[2] * A[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	rax
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        mov	QWORD PTR [rsp+32], r10  ; result word 4
-        ; A[0] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8]
-        xor	r10, r10
-        xor	r14, r14
-        mov	r12, rax  ; this column sums its three cross products in r14:r13:r12 first
-        mov	r13, rdx
-        ; A[1] * A[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12  ; double the summed cross products once
-        adc	r13, r13
-        adc	r14, r14
-        add	r11, r12  ; then fold them into the column accumulator
-        adc	r9, r13
-        adc	r10, r14
-        mov	QWORD PTR [rsp+40], r11  ; result word 5
-        ; A[1] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8+8]
-        xor	r11, r11
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[2] * A[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r8+16]
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[3] * A[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	rax
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rcx+48], r9  ; result word 6; words 6..11 go straight to r
-        ; A[2] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8+16]
-        xor	r9, r9
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        ; A[3] * A[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r8+24]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        mov	QWORD PTR [rcx+56], r10  ; result word 7
-        ; A[3] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8+24]
-        xor	r10, r10
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        ; A[4] * A[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	rax
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rcx+64], r11  ; result word 8
-        ; A[4] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8+32]
-        xor	r11, r11
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rcx+72], r9  ; result word 9
-        ; A[5] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	rax
-        add	r10, rax
-        adc	r11, rdx
-        mov	QWORD PTR [rcx+80], r10  ; result word 10
-        mov	QWORD PTR [rcx+88], r11  ; result word 11
-        mov	rax, QWORD PTR [rsp]  ; copy staged words 0..5 into r; NOTE(review): staging presumably allows r to alias a - confirm with callers
-        mov	rdx, QWORD PTR [rsp+8]
-        mov	r12, QWORD PTR [rsp+16]
-        mov	r13, QWORD PTR [rsp+24]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], rdx
-        mov	QWORD PTR [rcx+16], r12
-        mov	QWORD PTR [rcx+24], r13
-        mov	rax, QWORD PTR [rsp+32]
-        mov	rdx, QWORD PTR [rsp+40]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], rdx
-        add	rsp, 48
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_384_sqr_6 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Square a and put result in r. (r = a * a)
-;  * mulx/adcx/adox variant: sums the cross products A[i]*A[j] (i > j) once, then doubles them while adding the squares A[i]*A[i].
-;  * r   Result of squaring, twelve 64-bit words; pointer in rcx (kept on the stack while rcx serves as a zero register).
-;  * a   Number to square in Montgomery form, six 64-bit words; pointer in rdx, moved to rax because mulx reads rdx.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_sqr_avx2_6 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        push	rbx
-        push	rbp
-        mov	rax, rdx
-        push	rcx  ; save the result pointer; rcx is reused below
-        xor	rcx, rcx  ; rcx = 0 and CF/OF are cleared for the adcx/adox chains
-        mov	rdx, QWORD PTR [rax]
-        mov	rsi, QWORD PTR [rax+8]
-        mov	rbx, QWORD PTR [rax+16]
-        mov	rbp, QWORD PTR [rax+24]
-        ; Diagonal 0
-        ;   A[1] * A[0]
-        mulx	r11, r10, QWORD PTR [rax+8]
-        ;   A[2] * A[0]
-        mulx	r12, r8, QWORD PTR [rax+16]
-        adcx	r11, r8
-        ;   A[3] * A[0]
-        mulx	r13, r8, QWORD PTR [rax+24]
-        adcx	r12, r8
-        ;   A[4] * A[0]
-        mulx	r14, r8, QWORD PTR [rax+32]
-        adcx	r13, r8
-        ;   A[5] * A[0]
-        mulx	r15, r8, QWORD PTR [rax+40]
-        adcx	r14, r8
-        adcx	r15, rcx
-        ; Diagonal 1
-        mov	rdx, rsi  ; multiplier = A[1]
-        ;   A[2] * A[1]
-        mulx	r9, r8, QWORD PTR [rax+16]
-        adcx	r12, r8  ; low halves ride the CF chain
-        adox	r13, r9  ; high halves ride the OF chain
-        ;   A[3] * A[1]
-        mulx	r9, r8, QWORD PTR [rax+24]
-        adcx	r13, r8
-        adox	r14, r9
-        ;   A[4] * A[1]
-        mulx	r9, r8, QWORD PTR [rax+32]
-        adcx	r14, r8
-        adox	r15, r9
-        ;   A[5] * A[1]
-        mulx	rdi, r8, QWORD PTR [rax+40]
-        adcx	r15, r8
-        adox	rdi, rcx
-        mov	rdx, rbx  ; multiplier = A[2]
-        ;   A[5] * A[2]
-        mulx	rsi, r8, QWORD PTR [rax+40]
-        adcx	rdi, r8
-        adox	rsi, rcx
-        adcx	rsi, rcx
-        adcx	rbx, rcx  ; rbx is not read again before the A[5]*A[3] mulx overwrites it; NOTE(review): appears to exist only to consume CF - confirm
-        ; Diagonal 2
-        ;   A[3] * A[2]
-        mulx	r9, r8, QWORD PTR [rax+24]
-        adcx	r14, r8
-        adox	r15, r9
-        ;   A[4] * A[2]
-        mulx	r9, r8, QWORD PTR [rax+32]
-        adcx	r15, r8
-        adox	rdi, r9
-        mov	rdx, rbp  ; multiplier = A[3]
-        ;   A[4] * A[3]
-        mulx	r9, r8, QWORD PTR [rax+32]
-        adcx	rdi, r8
-        adox	rsi, r9
-        ;   A[5] * A[3]
-        mulx	rbx, r8, QWORD PTR [rax+40]
-        adcx	rsi, r8
-        adox	rbx, rcx
-        mov	rdx, QWORD PTR [rax+32]  ; multiplier = A[4]
-        ;   A[5] * A[4]
-        mulx	rbp, r8, QWORD PTR [rax+40]
-        adcx	rbx, r8
-        adox	rbp, rcx
-        adcx	rbp, rcx
-        adcx	rcx, rcx  ; rcx = CF left over from the cross-product sum
-        ; Doubling previous result as we add in square words results
-        ; A[0] * A[0]
-        mov	rdx, QWORD PTR [rax]
-        mulx	r9, r8, rdx
-        pop	rdx  ; fetch the saved result pointer (push/pop leave CF and OF untouched)
-        mov	QWORD PTR [rdx], r8  ; result word 0
-        adox	r10, r10  ; adox doubles each word, adcx adds the matching square half
-        push	rdx  ; put the result pointer back for the final stores
-        adcx	r10, r9
-        ; A[1] * A[1]
-        mov	rdx, QWORD PTR [rax+8]
-        mulx	r9, r8, rdx
-        adox	r11, r11
-        adcx	r11, r8
-        adox	r12, r12
-        adcx	r12, r9
-        ; A[2] * A[2]
-        mov	rdx, QWORD PTR [rax+16]
-        mulx	r9, r8, rdx
-        adox	r13, r13
-        adcx	r13, r8
-        adox	r14, r14
-        adcx	r14, r9
-        ; A[3] * A[3]
-        mov	rdx, QWORD PTR [rax+24]
-        mulx	r9, r8, rdx
-        adox	r15, r15
-        adcx	r15, r8
-        adox	rdi, rdi
-        adcx	rdi, r9
-        ; A[4] * A[4]
-        mov	rdx, QWORD PTR [rax+32]
-        mulx	r9, r8, rdx
-        adox	rsi, rsi
-        adcx	rsi, r8
-        adox	rbx, rbx
-        adcx	rbx, r9
-        ; A[5] * A[5]
-        mov	rdx, QWORD PTR [rax+40]
-        mulx	r9, r8, rdx
-        adox	rbp, rbp
-        adcx	rbp, r8
-        adcx	r9, rcx  ; top word: add the carry captured in rcx plus CF
-        mov	r8, 0  ; mov (not xor) so that OF is still live for the adox below
-        adox	r9, r8
-        pop	rcx  ; result pointer
-        mov	QWORD PTR [rcx+8], r10  ; result words 1..11
-        mov	QWORD PTR [rcx+16], r11
-        mov	QWORD PTR [rcx+24], r12
-        mov	QWORD PTR [rcx+32], r13
-        mov	QWORD PTR [rcx+40], r14
-        mov	QWORD PTR [rcx+48], r15
-        mov	QWORD PTR [rcx+56], rdi
-        mov	QWORD PTR [rcx+64], rsi
-        mov	QWORD PTR [rcx+72], rbx
-        mov	QWORD PTR [rcx+80], rbp
-        mov	QWORD PTR [rcx+88], r9
-        pop	rbp
-        pop	rbx
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_384_sqr_avx2_6 ENDP
-_text ENDS
-ENDIF
-; /* Add b to a into r. (r = a + b)
-;  * Returns the carry out of the top word in rax (0 or 1).
-;  * r  Result, six 64-bit words; pointer in rcx.
-;  * a  First operand, six 64-bit words; pointer in rdx.
-;  * b  Second operand, six 64-bit words; pointer in r8.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_add_6 PROC
-        push	r12
-        push	r13
-        push	r14
-        xor	rax, rax  ; return value starts at 0
-        mov	r9, QWORD PTR [rdx]  ; load all six words of a before anything is stored
-        mov	r10, QWORD PTR [rdx+8]
-        mov	r11, QWORD PTR [rdx+16]
-        mov	r12, QWORD PTR [rdx+24]
-        mov	r13, QWORD PTR [rdx+32]
-        mov	r14, QWORD PTR [rdx+40]
-        add	r9, QWORD PTR [r8]  ; carry chain across the six words of b
-        adc	r10, QWORD PTR [r8+8]
-        adc	r11, QWORD PTR [r8+16]
-        adc	r12, QWORD PTR [r8+24]
-        adc	r13, QWORD PTR [r8+32]
-        adc	r14, QWORD PTR [r8+40]
-        mov	QWORD PTR [rcx], r9  ; mov leaves CF intact for the adc below
-        mov	QWORD PTR [rcx+8], r10
-        mov	QWORD PTR [rcx+16], r11
-        mov	QWORD PTR [rcx+24], r12
-        mov	QWORD PTR [rcx+32], r13
-        mov	QWORD PTR [rcx+40], r14
-        adc	rax, 0  ; rax = carry out of the top word
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_384_add_6 ENDP
-_text ENDS
-; /* Sub b from a into r. (r = a - b)
-;  * Returns rax = 0 when there is no borrow out of the top word, -1 (all ones) when there is.
-;  * r  Result, six 64-bit words; pointer in rcx.
-;  * a  Value to subtract from, six 64-bit words; pointer in rdx.
-;  * b  Value to subtract, six 64-bit words; pointer in r8.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_sub_6 PROC
-        push	r12
-        push	r13
-        push	r14
-        xor	rax, rax
-        mov	r9, QWORD PTR [rdx]  ; load all six words of a before anything is stored
-        mov	r10, QWORD PTR [rdx+8]
-        mov	r11, QWORD PTR [rdx+16]
-        mov	r12, QWORD PTR [rdx+24]
-        mov	r13, QWORD PTR [rdx+32]
-        mov	r14, QWORD PTR [rdx+40]
-        sub	r9, QWORD PTR [r8]  ; borrow chain across the six words of b
-        sbb	r10, QWORD PTR [r8+8]
-        sbb	r11, QWORD PTR [r8+16]
-        sbb	r12, QWORD PTR [r8+24]
-        sbb	r13, QWORD PTR [r8+32]
-        sbb	r14, QWORD PTR [r8+40]
-        mov	QWORD PTR [rcx], r9  ; mov leaves CF intact for the sbb below
-        mov	QWORD PTR [rcx+8], r10
-        mov	QWORD PTR [rcx+16], r11
-        mov	QWORD PTR [rcx+24], r12
-        mov	QWORD PTR [rcx+32], r13
-        mov	QWORD PTR [rcx+40], r14
-        sbb	rax, rax  ; rax = 0 - CF: 0 without a borrow, -1 with one
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_384_sub_6 ENDP
-_text ENDS
-; /* Conditionally copy a into r using the mask m.
-;  * m is -1 to copy and 0 when not.
-;  * Done without branches as r ^= (r ^ a) & m on each of the six 64-bit words.
-;  * r  A single precision number to copy over; pointer in rcx.
-;  * a  A single precision number to copy; pointer in rdx.
-;  * m  Mask value to apply; passed in r8.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_cond_copy_6 PROC
-        push	r12
-        push	r13
-        mov	rax, QWORD PTR [rcx]  ; load the six words of r
-        mov	r9, QWORD PTR [rcx+8]
-        mov	r10, QWORD PTR [rcx+16]
-        mov	r11, QWORD PTR [rcx+24]
-        mov	r12, QWORD PTR [rcx+32]
-        mov	r13, QWORD PTR [rcx+40]
-        xor	rax, QWORD PTR [rdx]  ; r ^ a: the bits in which the two numbers differ
-        xor	r9, QWORD PTR [rdx+8]
-        xor	r10, QWORD PTR [rdx+16]
-        xor	r11, QWORD PTR [rdx+24]
-        xor	r12, QWORD PTR [rdx+32]
-        xor	r13, QWORD PTR [rdx+40]
-        and	rax, r8  ; keep the difference only where the mask has bits set
-        and	r9, r8
-        and	r10, r8
-        and	r11, r8
-        and	r12, r8
-        and	r13, r8
-        xor	QWORD PTR [rcx], rax  ; r ^= masked difference: r becomes a for m = -1, stays r for m = 0
-        xor	QWORD PTR [rcx+8], r9
-        xor	QWORD PTR [rcx+16], r10
-        xor	QWORD PTR [rcx+24], r11
-        xor	QWORD PTR [rcx+32], r12
-        xor	QWORD PTR [rcx+40], r13
-        pop	r13
-        pop	r12
-        ret
-sp_384_cond_copy_6 ENDP
-_text ENDS
-; /* Conditionally subtract b from a using the mask m. (r = a - (b & m))
-;  * m is -1 to subtract and 0 when not subtracting.
-;  * Returns rax = 0 when there is no borrow out of the top word, -1 (all ones) when there is.
-;  * r  A single precision number representing condition subtract result; pointer in rcx.
-;  * a  A single precision number to subtract from; pointer in rdx.
-;  * b  A single precision number to subtract; pointer in r8.
-;  * m  Mask value to apply; passed in r9.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_cond_sub_6 PROC
-        sub	rsp, 48  ; scratch for the six masked words of b
-        mov	r10, QWORD PTR [r8]
-        mov	r11, QWORD PTR [r8+8]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp], r10
-        mov	QWORD PTR [rsp+8], r11
-        mov	r10, QWORD PTR [r8+16]
-        mov	r11, QWORD PTR [r8+24]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+16], r10
-        mov	QWORD PTR [rsp+24], r11
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+32], r10
-        mov	QWORD PTR [rsp+40], r11
-        mov	r10, QWORD PTR [rdx]
-        mov	r8, QWORD PTR [rsp]  ; b has been fully staged, so r8 is reused as a scratch register
-        sub	r10, r8  ; start of the borrow chain; the interleaved mov instructions do not touch CF
-        mov	r11, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [rsp+8]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx], r10
-        mov	r10, QWORD PTR [rdx+16]
-        mov	r8, QWORD PTR [rsp+16]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+8], r11
-        mov	r11, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [rsp+24]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+16], r10
-        mov	r10, QWORD PTR [rdx+32]
-        mov	r8, QWORD PTR [rsp+32]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+24], r11
-        mov	r11, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [rsp+40]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+32], r10
-        mov	QWORD PTR [rcx+40], r11
-        sbb	rax, rax  ; rax = 0 - CF: 0 without a borrow, -1 with one
-        add	rsp, 48
-        ret
-sp_384_cond_sub_6 ENDP
-_text ENDS
-; /* Reduce the number back to 384 bits using Montgomery reduction.
-;  * Modulus-specific: the multiplications are replaced by shifts and adds, so only a (rcx) is ever read.
-;  * a   A single precision number to reduce in place; twelve words are read from rcx, the six-word result is stored at [rcx]..[rcx+40].
-;  * m   The single precision number representing the modulus (rdx; overwritten before any read, so unused here).
-;  * mp  The digit representing the negative inverse of m mod 2^n (r8; overwritten before any read, so unused here).
-;  */
-_text SEGMENT READONLY PARA
-sp_384_mont_reduce_6 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        push	rbx
-        push	rbp
-        mov	r12, QWORD PTR [rcx]  ; a[0..5] -> r12, r13, r14, r15, rdi, rsi
-        mov	r13, QWORD PTR [rcx+8]
-        mov	r14, QWORD PTR [rcx+16]
-        mov	r15, QWORD PTR [rcx+24]
-        mov	rdi, QWORD PTR [rcx+32]
-        mov	rsi, QWORD PTR [rcx+40]
-        xor	r11, r11  ; r11 = carry word above the current eight-word window
-        ; a[0-7] += m[0-5] * mu[0..1] = m[0-5] * (a[0..1] * mp)
-        mov	rbx, QWORD PTR [rcx+48]  ; a[6], a[7]
-        mov	rbp, QWORD PTR [rcx+56]
-        mov	rdx, r12  ; rax:rdx = a[1]:a[0]
-        mov	rax, r13
-        shld	rax, rdx, 32  ; rax:rdx <<= 32 (mod 2^128)
-        shl	rdx, 32
-        add	rdx, r12  ; += a[1]:a[0]
-        adc	rax, r13
-        add	rax, r12  ; += a[0] << 64, so mu = a[0..1] * (2^64 + 2^32 + 1) mod 2^128
-        mov	r8, rdx  ; r10:r9:r8 = mu << 32 (three words)
-        mov	r9, rax
-        mov	r10, rax
-        shld	r9, r8, 32
-        shl	r8, 32
-        shr	r10, 32
-        add	r12, r8  ; window += mu << 32, and += mu at words 6 and 7
-        adc	r13, r9
-        adc	r14, r10
-        adc	r15, 0
-        adc	rdi, 0
-        adc	rsi, 0
-        adc	rbx, rdx
-        adc	rbp, rax
-        adc	r11, 0
-        add	r8, rax  ; rax:r10:r9:r8 = (mu << 32) + mu * 2^64 (low word r8 is not used further)
-        adc	r9, rdx
-        adc	r10, rax
-        mov	rax, 0  ; mov, not xor, so CF survives for the adc below
-        adc	rax, 0
-        sub	r14, r9  ; subtract the upper three words of that value starting at window word 2
-        sbb	r15, r10
-        sbb	rdi, rax
-        sbb	rsi, 0
-        sbb	rbx, 0
-        sbb	rbp, 0
-        sbb	r11, 0
-        ; a[2-9] += m[0-5] * mu[0..1] = m[0-5] * (a[2..3] * mp)
-        mov	r12, QWORD PTR [rcx+64]  ; a[8], a[9]; r12/r13 are free now
-        mov	r13, QWORD PTR [rcx+72]
-        mov	rdx, r14  ; same mu computation on the words now in r14, r15
-        mov	rax, r15
-        shld	rax, rdx, 32
-        shl	rdx, 32
-        add	rdx, r14
-        adc	rax, r15
-        add	rax, r14
-        mov	r8, rdx
-        mov	r9, rax
-        mov	r10, rax
-        shld	r9, r8, 32
-        shl	r8, 32
-        shr	r10, 32
-        add	r12, r11  ; fold the previous round's carry word into the new top words
-        adc	r13, 0
-        mov	r11, 0
-        adc	r11, 0
-        add	r14, r8
-        adc	r15, r9
-        adc	rdi, r10
-        adc	rsi, 0
-        adc	rbx, 0
-        adc	rbp, 0
-        adc	r12, rdx
-        adc	r13, rax
-        adc	r11, 0
-        add	r8, rax
-        adc	r9, rdx
-        adc	r10, rax
-        mov	rax, 0
-        adc	rax, 0
-        sub	rdi, r9
-        sbb	rsi, r10
-        sbb	rbx, rax
-        sbb	rbp, 0
-        sbb	r12, 0
-        sbb	r13, 0
-        sbb	r11, 0
-        ; a[4-11] += m[0-5] * mu[0..1] = m[0-5] * (a[4..5] * mp)
-        mov	r14, QWORD PTR [rcx+80]  ; a[10], a[11]; r14/r15 are free now
-        mov	r15, QWORD PTR [rcx+88]
-        mov	rdx, rdi  ; same mu computation on the words now in rdi, rsi
-        mov	rax, rsi
-        shld	rax, rdx, 32
-        shl	rdx, 32
-        add	rdx, rdi
-        adc	rax, rsi
-        add	rax, rdi
-        mov	r8, rdx
-        mov	r9, rax
-        mov	r10, rax
-        shld	r9, r8, 32
-        shl	r8, 32
-        shr	r10, 32
-        add	r14, r11
-        adc	r15, 0
-        mov	r11, 0
-        adc	r11, 0
-        add	rdi, r8
-        adc	rsi, r9
-        adc	rbx, r10
-        adc	rbp, 0
-        adc	r12, 0
-        adc	r13, 0
-        adc	r14, rdx
-        adc	r15, rax
-        adc	r11, 0
-        add	r8, rax
-        adc	r9, rdx
-        adc	r10, rax
-        mov	rax, 0
-        adc	rax, 0
-        sub	rbx, r9
-        sbb	rbp, r10
-        sbb	r12, rax
-        sbb	r13, 0
-        sbb	r14, 0
-        sbb	r15, 0
-        sbb	r11, 0
-        ; Subtract mod if carry
-        neg	r11  ; carry word -> mask (0 stays 0, 1 becomes all ones)
-        mov	r10, 18446744073709551614  ; 0xFFFFFFFFFFFFFFFE
-        mov	r8d, r11d  ; r8 = mask & 0x00000000FFFFFFFF (32-bit move zero-extends)
-        mov	r9, r11
-        and	r10, r11  ; r10 = mask & 0xFFFFFFFFFFFFFFFE
-        shl	r9, 32  ; r9 = mask & 0xFFFFFFFF00000000
-        sub	rbx, r8  ; subtract the masked modulus words r8, r9, r10, r11, r11, r11 (the P-384 prime when the mask is set)
-        sbb	rbp, r9
-        sbb	r12, r10
-        sbb	r13, r11
-        sbb	r14, r11
-        sbb	r15, r11
-        mov	QWORD PTR [rcx], rbx  ; the reduced value is stored over a[0..5]
-        mov	QWORD PTR [rcx+8], rbp
-        mov	QWORD PTR [rcx+16], r12
-        mov	QWORD PTR [rcx+24], r13
-        mov	QWORD PTR [rcx+32], r14
-        mov	QWORD PTR [rcx+40], r15
-        pop	rbp
-        pop	rbx
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_384_mont_reduce_6 ENDP
-_text ENDS
-; /* Reduce the number back to 384 bits using Montgomery reduction.
-;  * Generic word-by-word form: six rounds of a += m * (a[i] * mp) << (64 * i), then one masked subtraction of m.
-;  * a   A single precision number to reduce in place; pointer in rcx, twelve words read, six-word result stored at a[0..5].
-;  * m   The single precision number representing the modulus; pointer in rdx, moved to r9 because mul overwrites rdx.
-;  * mp  The digit representing the negative inverse of m mod 2^n; passed in r8.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_mont_reduce_order_6 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        mov	r9, rdx
-        xor	rsi, rsi  ; rsi = carry out of the previous round's top word
-        ; i = 6
-        mov	r10, 6
-        mov	r15, QWORD PTR [rcx]  ; a[i] and a[i+1] are carried in r15 and rdi between rounds
-        mov	rdi, QWORD PTR [rcx+8]
-L_384_mont_reduce_order_6_loop:
-        ; mu = a[i] * mp
-        mov	r13, r15
-        imul	r13, r8
-        ; a[i+0] += m[0] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9]
-        add	r15, rax  ; only the carry is kept; r15 is reloaded from rdi just below
-        adc	r12, rdx
-        ; a[i+1] += m[1] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+8]
-        mov	r15, rdi  ; becomes a[i] of the next round
-        add	r15, rax
-        adc	r11, rdx
-        add	r15, r12
-        adc	r11, 0
-        ; a[i+2] += m[2] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+16]
-        mov	rdi, QWORD PTR [rcx+16]  ; becomes a[i+1] of the next round
-        add	rdi, rax
-        adc	r12, rdx
-        add	rdi, r11
-        adc	r12, 0
-        ; a[i+3] += m[3] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+24]
-        mov	r14, QWORD PTR [rcx+24]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+24], r14
-        adc	r11, 0
-        ; a[i+4] += m[4] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+32]
-        mov	r14, QWORD PTR [rcx+32]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+32], r14
-        adc	r12, 0
-        ; a[i+5] += m[5] * mu
-        mov	rax, r13
-        mul	QWORD PTR [r9+40]
-        mov	r14, QWORD PTR [rcx+40]
-        add	r12, rax
-        adc	rdx, rsi  ; high half + previous round's top carry + CF
-        mov	rsi, 0  ; mov, not xor, so CF survives for the adc below
-        adc	rsi, 0
-        add	r14, r12
-        mov	QWORD PTR [rcx+40], r14
-        adc	QWORD PTR [rcx+48], rdx
-        adc	rsi, 0  ; carry out of a[i+6], consumed by the next round or the final mask
-        ; i -= 1
-        add	rcx, 8  ; slide the window up by one word
-        dec	r10
-        jnz	L_384_mont_reduce_order_6_loop
-        mov	QWORD PTR [rcx], r15  ; rcx now points 48 bytes past the original a; flush the two cached words
-        mov	QWORD PTR [rcx+8], rdi
-        neg	rsi  ; carry (0 or 1) -> mask (0 or all ones)
-        ; Arguments for sp_384_cond_sub_6(r, a, b, m):
-        ;   rcx = original a, rdx = upper half of a, r8 = modulus, r9 = mask.
-        ; The modulus pointer must leave r9 before the mask is written there.
-        ; The former non-_WIN64 path wrote the mask first and then passed it as
-        ; b, which sp_384_cond_sub_6 dereferences. Every routine in this file
-        ; uses the same register convention, so the setup is unconditional.
-        mov	r8, r9
-        mov	r9, rsi
-        mov	rdx, rcx
-        sub	rcx, 48
-        call	sp_384_cond_sub_6
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_384_mont_reduce_order_6 ENDP
-_text ENDS
-; /* Compare a with b in constant time.
-;  * Words are compared from the most significant (offset 40) down to offset 0 using cmov only, no branches.
-;  * a  A single precision integer; pointer in rcx.
-;  * b  A single precision integer; pointer in rdx.
-;  * return -ve, 0 or +ve if a is less than, equal to or greater than b
-;  * respectively (rax = -1, 0 or 1).
-;  */
-_text SEGMENT READONLY PARA
-sp_384_cmp_6 PROC
-        push	r12
-        xor	r9, r9  ; constant 0
-        mov	r8, -1  ; r8 = all ones while every word seen so far is equal, 0 after the first difference
-        mov	rax, -1  ; result; stays -1 only if no word ever differs
-        mov	r10, 1  ; constant 1
-        mov	r11, QWORD PTR [rcx+40]
-        mov	r12, QWORD PTR [rdx+40]
-        and	r11, r8  ; once r8 is 0 both words are zeroed, so later words cannot change the result
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10  ; a word above b word: result 1
-        cmovc	rax, r8  ; a word below b word: result -1 (r8 is still all ones when this can fire)
-        cmovnz	r8, r9  ; words differ: clear the still-equal mask
-        mov	r11, QWORD PTR [rcx+32]
-        mov	r12, QWORD PTR [rdx+32]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+24]
-        mov	r12, QWORD PTR [rdx+24]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+16]
-        mov	r12, QWORD PTR [rdx+16]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+8]
-        mov	r12, QWORD PTR [rdx+8]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx]
-        mov	r12, QWORD PTR [rdx]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        xor	rax, r8  ; all words equal: -1 ^ -1 = 0; otherwise r8 is 0 and rax is returned unchanged
-        pop	r12
-        ret
-sp_384_cmp_6 ENDP
-_text ENDS
-; /* Add two Montgomery form numbers (r = a + b % m).
-;  * The modulus words are rebuilt from constants and a mask, so m (r9) is never read.
-;  * r   Result of addition; pointer in rcx.
-;  * a   First number to add in Montgomery form; pointer in rdx.
-;  * b   Second number to add in Montgomery form; pointer in r8.
-;  * m   Modulus (prime); r9 is overwritten by the load of a[1] before any read.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_mont_add_6 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        mov	rax, QWORD PTR [rdx]  ; a[0..5] -> rax, r9, r10, r11, r12, r13
-        mov	r9, QWORD PTR [rdx+8]
-        mov	r10, QWORD PTR [rdx+16]
-        mov	r11, QWORD PTR [rdx+24]
-        mov	r12, QWORD PTR [rdx+32]
-        mov	r13, QWORD PTR [rdx+40]
-        add	rax, QWORD PTR [r8]
-        adc	r9, QWORD PTR [r8+8]
-        mov	r15, 18446744069414584320  ; 0xFFFFFFFF00000000; mov leaves CF untouched
-        adc	r10, QWORD PTR [r8+16]
-        mov	rdi, 18446744073709551614  ; 0xFFFFFFFFFFFFFFFE
-        adc	r11, QWORD PTR [r8+24]
-        adc	r12, QWORD PTR [r8+32]
-        adc	r13, QWORD PTR [r8+40]
-        sbb	rdx, rdx  ; rdx = mask: all ones if the addition carried out, else 0
-        mov	r14d, edx  ; r14 = mask & 0x00000000FFFFFFFF (32-bit move zero-extends)
-        and	r15, rdx
-        and	rdi, rdx
-        sub	rax, r14  ; subtract the masked modulus words r14, r15, rdi, rdx, rdx, rdx
-        sbb	r9, r15
-        sbb	r10, rdi
-        sbb	r11, rdx
-        sbb	r12, rdx
-        sbb	r13, rdx
-        adc	rdx, 0  ; mask + borrow: drops to 0 when the subtraction above borrowed
-        and	r14, rdx
-        and	r15, rdx
-        and	rdi, rdx
-        sub	rax, r14  ; second masked subtraction, active only while rdx is still all ones
-        sbb	r9, r15
-        mov	QWORD PTR [rcx], rax  ; stores are interleaved; mov leaves the borrow chain intact
-        sbb	r10, rdi
-        mov	QWORD PTR [rcx+8], r9
-        sbb	r11, rdx
-        mov	QWORD PTR [rcx+16], r10
-        sbb	r12, rdx
-        mov	QWORD PTR [rcx+24], r11
-        sbb	r13, rdx
-        mov	QWORD PTR [rcx+32], r12
-        mov	QWORD PTR [rcx+40], r13
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_384_mont_add_6 ENDP
-_text ENDS
-; /* Double a Montgomery form number (r = a + a % m).
-;  * The modulus words are rebuilt from constants and a mask, so m (r8) is never read.
-;  * r   Result of doubling; pointer in rcx.
-;  * a   Number to double in Montgomery form; pointer in rdx.
-;  * m   Modulus (prime); r8 is overwritten by the load of a[1] before any read.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_mont_dbl_6 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        mov	rax, QWORD PTR [rdx]  ; a[0..5] -> rax, r8, r9, r10, r11, r12
-        mov	r8, QWORD PTR [rdx+8]
-        mov	r9, QWORD PTR [rdx+16]
-        mov	r10, QWORD PTR [rdx+24]
-        mov	r11, QWORD PTR [rdx+32]
-        mov	r12, QWORD PTR [rdx+40]
-        add	rax, rax
-        adc	r8, r8
-        mov	r14, 18446744069414584320  ; 0xFFFFFFFF00000000; mov leaves CF untouched
-        adc	r9, r9
-        mov	r15, 18446744073709551614  ; 0xFFFFFFFFFFFFFFFE
-        adc	r10, r10
-        adc	r11, r11
-        mov	rdi, r12  ; keep the undoubled top word: its top bit is the bit the doubling shifts out
-        adc	r12, r12
-        sar	rdi, 63  ; rdi = mask: all ones if that bit was set, else 0
-        mov	r13d, edi  ; r13 = mask & 0x00000000FFFFFFFF (32-bit move zero-extends)
-        and	r14, rdi
-        and	r15, rdi
-        sub	rax, r13  ; subtract the masked modulus words r13, r14, r15, rdi, rdi, rdi
-        sbb	r8, r14
-        sbb	r9, r15
-        sbb	r10, rdi
-        sbb	r11, rdi
-        sbb	r12, rdi
-        adc	rdi, 0  ; mask + borrow: drops to 0 when the subtraction above borrowed
-        and	r13, rdi
-        and	r14, rdi
-        and	r15, rdi
-        sub	rax, r13  ; second masked subtraction, active only while rdi is still all ones
-        sbb	r8, r14
-        mov	QWORD PTR [rcx], rax  ; stores are interleaved; mov leaves the borrow chain intact
-        sbb	r9, r15
-        mov	QWORD PTR [rcx+8], r8
-        sbb	r10, rdi
-        mov	QWORD PTR [rcx+16], r9
-        sbb	r11, rdi
-        mov	QWORD PTR [rcx+24], r10
-        sbb	r12, rdi
-        mov	QWORD PTR [rcx+32], r11
-        mov	QWORD PTR [rcx+40], r12
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_384_mont_dbl_6 ENDP
-_text ENDS
-; /* Triple a Montgomery form number (r = a + a + a % m).
-;  * Doubles a with a masked modulus subtraction, then adds a once more with the same reduction; m (r8) is never read.
-;  * r   Result of tripling; pointer in rcx. May be the same buffer as a: nothing is stored until a has been read for the last time.
-;  * a   Number to triple in Montgomery form; pointer in rdx.
-;  * m   Modulus (prime); r8 is overwritten by the load of a[1] before any read.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_mont_tpl_6 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        mov	rax, QWORD PTR [rdx]  ; a[0..5] -> rax, r8, r9, r10, r11, r12
-        mov	r8, QWORD PTR [rdx+8]
-        mov	r9, QWORD PTR [rdx+16]
-        mov	r10, QWORD PTR [rdx+24]
-        mov	r11, QWORD PTR [rdx+32]
-        mov	r12, QWORD PTR [rdx+40]
-        add	rax, rax  ; step 1: 2a
-        adc	r8, r8
-        mov	r14, 18446744069414584320  ; 0xFFFFFFFF00000000; mov leaves CF untouched
-        adc	r9, r9
-        mov	r15, 18446744073709551614  ; 0xFFFFFFFFFFFFFFFE
-        adc	r10, r10
-        adc	r11, r11
-        adc	r12, r12
-        sbb	rdi, rdi  ; rdi = mask: all ones if the doubling carried out, else 0
-        mov	r13d, edi  ; r13 = mask & 0x00000000FFFFFFFF (32-bit move zero-extends)
-        and	r14, rdi
-        and	r15, rdi
-        sub	rax, r13  ; subtract the masked modulus words r13, r14, r15, rdi, rdi, rdi
-        sbb	r8, r14
-        sbb	r9, r15
-        sbb	r10, rdi
-        sbb	r11, rdi
-        sbb	r12, rdi
-        adc	rdi, 0  ; mask + borrow: drops to 0 when the subtraction above borrowed
-        and	r13, rdi
-        and	r14, rdi
-        and	r15, rdi
-        sub	rax, r13  ; second masked subtraction, active only while rdi is still all ones
-        sbb	r8, r14
-        ; No store to r here: a[0] is read again just below and r may alias a.
-        sbb	r9, r15
-        sbb	r10, rdi
-        sbb	r11, rdi
-        sbb	r12, rdi
-        add	rax, QWORD PTR [rdx]  ; step 2: 2a + a
-        adc	r8, QWORD PTR [rdx+8]
-        mov	r14, 18446744069414584320  ; reload the constants consumed by the masking above
-        adc	r9, QWORD PTR [rdx+16]
-        mov	r15, 18446744073709551614
-        adc	r10, QWORD PTR [rdx+24]
-        adc	r11, QWORD PTR [rdx+32]
-        adc	r12, QWORD PTR [rdx+40]
-        sbb	rdi, rdi  ; mask from the carry out of the addition
-        mov	r13d, edi
-        and	r14, rdi
-        and	r15, rdi
-        sub	rax, r13
-        sbb	r8, r14
-        sbb	r9, r15
-        sbb	r10, rdi
-        sbb	r11, rdi
-        sbb	r12, rdi
-        adc	rdi, 0
-        and	r13, rdi
-        and	r14, rdi
-        and	r15, rdi
-        sub	rax, r13
-        sbb	r8, r14
-        mov	QWORD PTR [rcx], rax  ; stores are interleaved; mov leaves the borrow chain intact
-        sbb	r9, r15
-        mov	QWORD PTR [rcx+8], r8
-        sbb	r10, rdi
-        mov	QWORD PTR [rcx+16], r9
-        sbb	r11, rdi
-        mov	QWORD PTR [rcx+24], r10
-        sbb	r12, rdi
-        mov	QWORD PTR [rcx+32], r11
-        mov	QWORD PTR [rcx+40], r12
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_384_mont_tpl_6 ENDP
-_text ENDS
-; /* Subtract two Montgomery form numbers (r = a - b % m).
-;  * The modulus words are rebuilt from constants and a mask, so m (r9) is never read.
-;  * r   Result of subtraction; pointer in rcx.
-;  * a   Number to subtract from in Montgomery form; pointer in rdx.
-;  * b   Number to subtract with in Montgomery form; pointer in r8.
-;  * m   Modulus (prime); r9 is overwritten by the load of a[1] before any read.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_mont_sub_6 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        mov	rax, QWORD PTR [rdx]  ; a[0..5] -> rax, r9, r10, r11, r12, r13
-        mov	r9, QWORD PTR [rdx+8]
-        mov	r10, QWORD PTR [rdx+16]
-        mov	r11, QWORD PTR [rdx+24]
-        mov	r12, QWORD PTR [rdx+32]
-        mov	r13, QWORD PTR [rdx+40]
-        sub	rax, QWORD PTR [r8]
-        sbb	r9, QWORD PTR [r8+8]
-        mov	r15, 18446744069414584320  ; 0xFFFFFFFF00000000; mov leaves CF untouched
-        sbb	r10, QWORD PTR [r8+16]
-        mov	rdi, 18446744073709551614  ; 0xFFFFFFFFFFFFFFFE
-        sbb	r11, QWORD PTR [r8+24]
-        sbb	r12, QWORD PTR [r8+32]
-        sbb	r13, QWORD PTR [r8+40]
-        sbb	rdx, rdx  ; rdx = mask: all ones if the subtraction borrowed, else 0
-        mov	r14d, edx  ; r14 = mask & 0x00000000FFFFFFFF (32-bit move zero-extends)
-        and	r15, rdx
-        and	rdi, rdx
-        add	rax, r14  ; add back the masked modulus words r14, r15, rdi, rdx, rdx, rdx
-        adc	r9, r15
-        adc	r10, rdi
-        adc	r11, rdx
-        adc	r12, rdx
-        adc	r13, rdx
-        adc	rdx, 0  ; mask + carry: drops to 0 when the addition above carried out
-        and	r14, rdx
-        and	r15, rdx
-        and	rdi, rdx
-        add	rax, r14  ; second masked addition, active only while rdx is still all ones
-        adc	r9, r15
-        mov	QWORD PTR [rcx], rax  ; stores are interleaved; mov leaves the carry chain intact
-        adc	r10, rdi
-        mov	QWORD PTR [rcx+8], r9
-        adc	r11, rdx
-        mov	QWORD PTR [rcx+16], r10
-        adc	r12, rdx
-        mov	QWORD PTR [rcx+24], r11
-        adc	r13, rdx
-        mov	QWORD PTR [rcx+32], r12
-        mov	QWORD PTR [rcx+40], r13
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_384_mont_sub_6 ENDP
-_text ENDS
-; /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m)
-;  *
-;  * When the low bit of a is set, m is added to a first; the 385-bit sum
-;  * (six words plus the carry) is then shifted right by one bit.
-;  * r is accessed through rcx, a through rdx and m through r8.  A 48-byte
-;  * scratch area on the stack holds the intermediate sum.
-;  *
-;  * r  Result of division by 2.
-;  * a  Number to divide.
-;  * m  Modulus (prime).
-;  */
-_text SEGMENT READONLY PARA
-sp_384_mont_div2_6 PROC
-        push	r12
-        push	r13
-        ; Six words of scratch space.
-        sub	rsp, 48
-        mov	r13, QWORD PTR [rdx]
-        ; r12 collects the carry out of the addition below.
-        xor	r12, r12
-        ; rax keeps a[0]; r13 becomes 0 - (a[0] & 1): all ones when a is odd.
-        mov	rax, r13
-        and	r13, 1
-        neg	r13
-        ; scratch[i] = m[i] & mask
-        mov	r10, QWORD PTR [r8]
-        and	r10, r13
-        mov	QWORD PTR [rsp], r10
-        mov	r10, QWORD PTR [r8+8]
-        and	r10, r13
-        mov	QWORD PTR [rsp+8], r10
-        mov	r10, QWORD PTR [r8+16]
-        and	r10, r13
-        mov	QWORD PTR [rsp+16], r10
-        mov	r10, QWORD PTR [r8+24]
-        and	r10, r13
-        mov	QWORD PTR [rsp+24], r10
-        mov	r10, QWORD PTR [r8+32]
-        and	r10, r13
-        mov	QWORD PTR [rsp+32], r10
-        mov	r10, QWORD PTR [r8+40]
-        and	r10, r13
-        mov	QWORD PTR [rsp+40], r10
-        ; scratch += a, carrying from word to word (mov leaves CF intact).
-        add	QWORD PTR [rsp], rax
-        mov	rax, QWORD PTR [rdx+8]
-        adc	QWORD PTR [rsp+8], rax
-        mov	rax, QWORD PTR [rdx+16]
-        adc	QWORD PTR [rsp+16], rax
-        mov	rax, QWORD PTR [rdx+24]
-        adc	QWORD PTR [rsp+24], rax
-        mov	rax, QWORD PTR [rdx+32]
-        adc	QWORD PTR [rsp+32], rax
-        mov	rax, QWORD PTR [rdx+40]
-        adc	QWORD PTR [rsp+40], rax
-        ; r12 = carry out of the top word.
-        adc	r12, 0
-        ; Shift the sum right by one: each shrd pulls the low bit of the next
-        ; higher word into the top bit of the word being written.
-        mov	rax, QWORD PTR [rsp]
-        mov	r9, QWORD PTR [rsp+8]
-        shrd	rax, r9, 1
-        mov	QWORD PTR [rcx], rax
-        mov	rax, QWORD PTR [rsp+16]
-        shrd	r9, rax, 1
-        mov	QWORD PTR [rcx+8], r9
-        mov	r9, QWORD PTR [rsp+24]
-        shrd	rax, r9, 1
-        mov	QWORD PTR [rcx+16], rax
-        mov	rax, QWORD PTR [rsp+32]
-        shrd	r9, rax, 1
-        mov	QWORD PTR [rcx+24], r9
-        mov	r9, QWORD PTR [rsp+40]
-        shrd	rax, r9, 1
-        mov	QWORD PTR [rcx+32], rax
-        ; The top word takes the carry bit as its new most significant bit.
-        shrd	r9, r12, 1
-        mov	QWORD PTR [rcx+40], r9
-        add	rsp, 48
-        pop	r13
-        pop	r12
-        ret
-sp_384_mont_div2_6 ENDP
-_text ENDS
-IFNDEF WC_NO_CACHE_RESISTANT
-; /* Touch each possible point that could be being copied.
-;  *
-;  * Table entries 1..32 (296 bytes apart) are all read and masked, and only
-;  * the one whose position equals idx is kept, so the addresses accessed do
-;  * not depend on idx.  Three 48-byte fields, at offsets 0, 96 and 192 of an
-;  * entry, are copied to the same offsets in r.  An idx outside 1..32 leaves
-;  * those fields of r zero.
-;  *
-;  * r is accessed through rcx, table through rdx and idx is taken from r8d.
-;  * xmm6-xmm15 are saved and restored with movdqu so that this non-AVX
-;  * variant contains SSE2 instructions only and can run on CPUs without AVX.
-;  *
-;  * r      Point to copy into.
-;  * table  Table - start of the entries to access
-;  * idx    Index of point to retrieve.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_get_point_33_6 PROC
-        ; Save xmm6-xmm15, which are used as scratch below.  rsp is not
-        ; adjusted to a 16-byte boundary here, hence the unaligned moves.
-        sub	rsp, 160
-        movdqu	OWORD PTR [rsp], xmm6
-        movdqu	OWORD PTR [rsp+16], xmm7
-        movdqu	OWORD PTR [rsp+32], xmm8
-        movdqu	OWORD PTR [rsp+48], xmm9
-        movdqu	OWORD PTR [rsp+64], xmm10
-        movdqu	OWORD PTR [rsp+80], xmm11
-        movdqu	OWORD PTR [rsp+96], xmm12
-        movdqu	OWORD PTR [rsp+112], xmm13
-        movdqu	OWORD PTR [rsp+128], xmm14
-        movdqu	OWORD PTR [rsp+144], xmm15
-        ; Pass 1: fields at offsets 0 and 96.
-        ; xmm13 = idx in every 32-bit lane, xmm15 = 1 in every lane,
-        ; xmm14 = running entry number (starts at 1), rax = 32 iterations.
-        mov	rax, 1
-        movd	xmm13, r8d
-        ; Skip entry 0.
-        add	rdx, 296
-        movd	xmm15, eax
-        mov	rax, 32
-        pshufd	xmm15, xmm15, 0
-        pshufd	xmm13, xmm13, 0
-        ; xmm0-xmm5 accumulate the selected entry.
-        pxor	xmm0, xmm0
-        pxor	xmm1, xmm1
-        pxor	xmm2, xmm2
-        pxor	xmm3, xmm3
-        pxor	xmm4, xmm4
-        pxor	xmm5, xmm5
-        movdqa	xmm14, xmm15
-L_384_get_point_33_6_start_1:
-        ; xmm12 = all ones when the entry number equals idx, else zero.
-        movdqa	xmm12, xmm14
-        paddd	xmm14, xmm15
-        pcmpeqd	xmm12, xmm13
-        movdqu	xmm6, [rdx]
-        movdqu	xmm7, [rdx+16]
-        movdqu	xmm8, [rdx+32]
-        movdqu	xmm9, [rdx+96]
-        movdqu	xmm10, [rdx+112]
-        movdqu	xmm11, [rdx+128]
-        add	rdx, 296
-        pand	xmm6, xmm12
-        pand	xmm7, xmm12
-        pand	xmm8, xmm12
-        pand	xmm9, xmm12
-        pand	xmm10, xmm12
-        pand	xmm11, xmm12
-        por	xmm0, xmm6
-        por	xmm1, xmm7
-        por	xmm2, xmm8
-        por	xmm3, xmm9
-        por	xmm4, xmm10
-        por	xmm5, xmm11
-        dec	rax
-        jnz	L_384_get_point_33_6_start_1
-        movdqu	[rcx], xmm0
-        movdqu	[rcx+16], xmm1
-        movdqu	[rcx+32], xmm2
-        movdqu	[rcx+96], xmm3
-        movdqu	[rcx+112], xmm4
-        movdqu	[rcx+128], xmm5
-        ; Pass 2: field at offset 192.  Rewind rdx by 32 * 296 = 9472 bytes so
-        ; that it points at entry 1 again, then repeat the masked scan.
-        mov	rax, 1
-        movd	xmm13, r8d
-        sub	rdx, 9472
-        movd	xmm15, eax
-        mov	rax, 32
-        pshufd	xmm15, xmm15, 0
-        pshufd	xmm13, xmm13, 0
-        pxor	xmm0, xmm0
-        pxor	xmm1, xmm1
-        pxor	xmm2, xmm2
-        movdqa	xmm14, xmm15
-L_384_get_point_33_6_start_2:
-        movdqa	xmm12, xmm14
-        paddd	xmm14, xmm15
-        pcmpeqd	xmm12, xmm13
-        movdqu	xmm6, [rdx+192]
-        movdqu	xmm7, [rdx+208]
-        movdqu	xmm8, [rdx+224]
-        add	rdx, 296
-        pand	xmm6, xmm12
-        pand	xmm7, xmm12
-        pand	xmm8, xmm12
-        por	xmm0, xmm6
-        por	xmm1, xmm7
-        por	xmm2, xmm8
-        dec	rax
-        jnz	L_384_get_point_33_6_start_2
-        movdqu	[rcx+192], xmm0
-        movdqu	[rcx+208], xmm1
-        movdqu	[rcx+224], xmm2
-        ; Restore xmm6-xmm15.
-        movdqu	xmm6, OWORD PTR [rsp]
-        movdqu	xmm7, OWORD PTR [rsp+16]
-        movdqu	xmm8, OWORD PTR [rsp+32]
-        movdqu	xmm9, OWORD PTR [rsp+48]
-        movdqu	xmm10, OWORD PTR [rsp+64]
-        movdqu	xmm11, OWORD PTR [rsp+80]
-        movdqu	xmm12, OWORD PTR [rsp+96]
-        movdqu	xmm13, OWORD PTR [rsp+112]
-        movdqu	xmm14, OWORD PTR [rsp+128]
-        movdqu	xmm15, OWORD PTR [rsp+144]
-        add	rsp, 160
-        ret
-sp_384_get_point_33_6 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Touch each possible point that could be being copied.
-;  *
-;  * AVX2 variant.  Table entries 1..32 (296 bytes apart) are all read and
-;  * masked, and only the one whose position equals idx is kept, so the
-;  * addresses accessed do not depend on idx.  Three 48-byte fields, at
-;  * offsets 0, 96 and 192 of an entry, are copied to the same offsets in r.
-;  * An idx outside 1..32 leaves those fields of r zero.
-;  *
-;  * r is accessed through rcx, table through rdx and idx is taken from r8d.
-;  * vzeroupper is issued before returning so that callers executing legacy
-;  * SSE code do not see dirty upper YMM halves.
-;  *
-;  * r      Point to copy into.
-;  * table  Table - start of the entries to access
-;  * idx    Index of point to retrieve.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_get_point_33_avx2_6 PROC
-        ; Save xmm6-xmm15, which are used as scratch below.
-        sub	rsp, 160
-        vmovdqu	OWORD PTR [rsp], xmm6
-        vmovdqu	OWORD PTR [rsp+16], xmm7
-        vmovdqu	OWORD PTR [rsp+32], xmm8
-        vmovdqu	OWORD PTR [rsp+48], xmm9
-        vmovdqu	OWORD PTR [rsp+64], xmm10
-        vmovdqu	OWORD PTR [rsp+80], xmm11
-        vmovdqu	OWORD PTR [rsp+96], xmm12
-        vmovdqu	OWORD PTR [rsp+112], xmm13
-        vmovdqu	OWORD PTR [rsp+128], xmm14
-        vmovdqu	OWORD PTR [rsp+144], xmm15
-        mov	rax, 1
-        movd	xmm13, r8d
-        ; Skip entry 0.
-        add	rdx, 296
-        movd	xmm15, eax
-        mov	rax, 32
-        ; With an all-zero index vector, vpermd copies lane 0 to every lane:
-        ; ymm13 = idx in all lanes, ymm15 = 1 in all lanes.
-        vpxor	ymm14, ymm14, ymm14
-        vpermd	ymm13, ymm14, ymm13
-        vpermd	ymm15, ymm14, ymm15
-        ; ymm0/xmm1, ymm2/xmm3 and ymm4/xmm5 accumulate the three fields.
-        vpxor	ymm0, ymm0, ymm0
-        vpxor	xmm1, xmm1, xmm1
-        vpxor	ymm2, ymm2, ymm2
-        vpxor	xmm3, xmm3, xmm3
-        vpxor	ymm4, ymm4, ymm4
-        vpxor	xmm5, xmm5, xmm5
-        ; ymm14 = running entry number, starting at 1.
-        vmovdqa	ymm14, ymm15
-L_384_get_point_33_avx2_6_start:
-        ; ymm12 = all ones when the entry number equals idx, else zero.
-        vpcmpeqd	ymm12, ymm14, ymm13
-        vpaddd	ymm14, ymm14, ymm15
-        vmovupd	ymm6, YMMWORD PTR [rdx]
-        vmovdqu	xmm7, OWORD PTR [rdx+32]
-        vmovupd	ymm8, YMMWORD PTR [rdx+96]
-        vmovdqu	xmm9, OWORD PTR [rdx+128]
-        vmovupd	ymm10, YMMWORD PTR [rdx+192]
-        vmovdqu	xmm11, OWORD PTR [rdx+224]
-        add	rdx, 296
-        vpand	ymm6, ymm6, ymm12
-        vpand	xmm7, xmm7, xmm12
-        vpand	ymm8, ymm8, ymm12
-        vpand	xmm9, xmm9, xmm12
-        vpand	ymm10, ymm10, ymm12
-        vpand	xmm11, xmm11, xmm12
-        vpor	ymm0, ymm0, ymm6
-        vpor	xmm1, xmm1, xmm7
-        vpor	ymm2, ymm2, ymm8
-        vpor	xmm3, xmm3, xmm9
-        vpor	ymm4, ymm4, ymm10
-        vpor	xmm5, xmm5, xmm11
-        dec	rax
-        jnz	L_384_get_point_33_avx2_6_start
-        vmovupd	YMMWORD PTR [rcx], ymm0
-        vmovdqu	OWORD PTR [rcx+32], xmm1
-        vmovupd	YMMWORD PTR [rcx+96], ymm2
-        vmovdqu	OWORD PTR [rcx+128], xmm3
-        vmovupd	YMMWORD PTR [rcx+192], ymm4
-        vmovdqu	OWORD PTR [rcx+224], xmm5
-        ; Restore xmm6-xmm15.
-        vmovdqu	xmm6, OWORD PTR [rsp]
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vmovdqu	xmm8, OWORD PTR [rsp+32]
-        vmovdqu	xmm9, OWORD PTR [rsp+48]
-        vmovdqu	xmm10, OWORD PTR [rsp+64]
-        vmovdqu	xmm11, OWORD PTR [rsp+80]
-        vmovdqu	xmm12, OWORD PTR [rsp+96]
-        vmovdqu	xmm13, OWORD PTR [rsp+112]
-        vmovdqu	xmm14, OWORD PTR [rsp+128]
-        vmovdqu	xmm15, OWORD PTR [rsp+144]
-        ; Clear the upper YMM halves (the low 128 bits are left untouched).
-        vzeroupper
-        add	rsp, 160
-        ret
-sp_384_get_point_33_avx2_6 ENDP
-_text ENDS
-ENDIF
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-; /* Reduce the number back to 384 bits using Montgomery reduction.
-;  *
-;  * a is accessed through rcx, m through rdx (copied to rax, because mulx
-;  * takes rdx as its implicit multiplicand) and mp is taken from r8.
-;  * The six reduction steps are fully unrolled.  Words 6..11 of a are read
-;  * and the reduced result is written to words 0..5.
-;  *
-;  * a   A single precision number to reduce in place.
-;  * m   The single precision number representing the modulus.
-;  * mp  The digit representing the negative inverse of m mod 2^n.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_mont_reduce_order_avx2_6 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        ; rax = m, freeing rdx for mu.
-        mov	rax, rdx
-        ; r15 = carry word passed from one step to the next (starts at 0).
-        xor	r15, r15
-        ; r14 always holds the current a[i]; it is kept in a register from
-        ; step to step rather than being stored back.
-        mov	r14, QWORD PTR [rcx]
-        xor	r13, r13
-        ; This label is not the target of any jump in this procedure.
-L_mont_loop_order_avx2_6:
-        ; Step 0.
-        ; mu = a[i] * mp
-        mov	rdx, r14
-        mov	r11, r14
-        imul	rdx, r8
-        ; Zero r13 and clear both CF and OF: adcx carries through CF only and
-        ; adox through OF only, giving two independent carry chains.
-        xor	r13, r13
-        ; a[i+0] += m[0] * mu
-        mulx	r10, r9, QWORD PTR [rax]
-        mov	r14, QWORD PTR [rcx+8]
-        ; Only the carry out of this addition is needed; r11 is reloaded below.
-        adcx	r11, r9
-        adox	r14, r10
-        ; a[i+1] += m[1] * mu
-        mulx	r10, r9, QWORD PTR [rax+8]
-        mov	r11, QWORD PTR [rcx+16]
-        adcx	r14, r9
-        adox	r11, r10
-        ; a[i+2] += m[2] * mu
-        mulx	r10, r9, QWORD PTR [rax+16]
-        mov	r12, QWORD PTR [rcx+24]
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+16], r11
-        ; a[i+3] += m[3] * mu
-        mulx	r10, r9, QWORD PTR [rax+24]
-        mov	r11, QWORD PTR [rcx+32]
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+24], r12
-        ; a[i+4] += m[4] * mu
-        mulx	r10, r9, QWORD PTR [rax+32]
-        mov	r12, QWORD PTR [rcx+40]
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+32], r11
-        ; a[i+5] += m[5] * mu
-        mulx	r10, r9, QWORD PTR [rax+40]
-        mov	r11, QWORD PTR [rcx+48]
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+40], r12
-        ; a[i+6] also takes the carry word left by the previous step.
-        adcx	r11, r15
-        mov	QWORD PTR [rcx+48], r11
-        ; r15 = OF + CF: the carry out of this step (r13 is zero).
-        mov	r15, r13
-        adox	r15, r13
-        adcx	r15, r13
-        ; Step 1.
-        ; mu = a[i] * mp
-        mov	rdx, r14
-        mov	r11, r14
-        imul	rdx, r8
-        xor	r13, r13
-        ; a[i+0] += m[0] * mu
-        mulx	r10, r9, QWORD PTR [rax]
-        mov	r14, QWORD PTR [rcx+16]
-        adcx	r11, r9
-        adox	r14, r10
-        ; a[i+1] += m[1] * mu
-        mulx	r10, r9, QWORD PTR [rax+8]
-        mov	r11, QWORD PTR [rcx+24]
-        adcx	r14, r9
-        adox	r11, r10
-        ; a[i+2] += m[2] * mu
-        mulx	r10, r9, QWORD PTR [rax+16]
-        mov	r12, QWORD PTR [rcx+32]
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+24], r11
-        ; a[i+3] += m[3] * mu
-        mulx	r10, r9, QWORD PTR [rax+24]
-        mov	r11, QWORD PTR [rcx+40]
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+32], r12
-        ; a[i+4] += m[4] * mu
-        mulx	r10, r9, QWORD PTR [rax+32]
-        mov	r12, QWORD PTR [rcx+48]
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+40], r11
-        ; a[i+5] += m[5] * mu
-        mulx	r10, r9, QWORD PTR [rax+40]
-        mov	r11, QWORD PTR [rcx+56]
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+48], r12
-        adcx	r11, r15
-        mov	QWORD PTR [rcx+56], r11
-        mov	r15, r13
-        adox	r15, r13
-        adcx	r15, r13
-        ; Step 2.
-        ; mu = a[i] * mp
-        mov	rdx, r14
-        mov	r11, r14
-        imul	rdx, r8
-        xor	r13, r13
-        ; a[i+0] += m[0] * mu
-        mulx	r10, r9, QWORD PTR [rax]
-        mov	r14, QWORD PTR [rcx+24]
-        adcx	r11, r9
-        adox	r14, r10
-        ; a[i+1] += m[1] * mu
-        mulx	r10, r9, QWORD PTR [rax+8]
-        mov	r11, QWORD PTR [rcx+32]
-        adcx	r14, r9
-        adox	r11, r10
-        ; a[i+2] += m[2] * mu
-        mulx	r10, r9, QWORD PTR [rax+16]
-        mov	r12, QWORD PTR [rcx+40]
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+32], r11
-        ; a[i+3] += m[3] * mu
-        mulx	r10, r9, QWORD PTR [rax+24]
-        mov	r11, QWORD PTR [rcx+48]
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+40], r12
-        ; a[i+4] += m[4] * mu
-        mulx	r10, r9, QWORD PTR [rax+32]
-        mov	r12, QWORD PTR [rcx+56]
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+48], r11
-        ; a[i+5] += m[5] * mu
-        mulx	r10, r9, QWORD PTR [rax+40]
-        mov	r11, QWORD PTR [rcx+64]
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+56], r12
-        adcx	r11, r15
-        mov	QWORD PTR [rcx+64], r11
-        mov	r15, r13
-        adox	r15, r13
-        adcx	r15, r13
-        ; Step 3.
-        ; mu = a[i] * mp
-        mov	rdx, r14
-        mov	r11, r14
-        imul	rdx, r8
-        xor	r13, r13
-        ; a[i+0] += m[0] * mu
-        mulx	r10, r9, QWORD PTR [rax]
-        mov	r14, QWORD PTR [rcx+32]
-        adcx	r11, r9
-        adox	r14, r10
-        ; a[i+1] += m[1] * mu
-        mulx	r10, r9, QWORD PTR [rax+8]
-        mov	r11, QWORD PTR [rcx+40]
-        adcx	r14, r9
-        adox	r11, r10
-        ; a[i+2] += m[2] * mu
-        mulx	r10, r9, QWORD PTR [rax+16]
-        mov	r12, QWORD PTR [rcx+48]
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+40], r11
-        ; a[i+3] += m[3] * mu
-        mulx	r10, r9, QWORD PTR [rax+24]
-        mov	r11, QWORD PTR [rcx+56]
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+48], r12
-        ; a[i+4] += m[4] * mu
-        mulx	r10, r9, QWORD PTR [rax+32]
-        mov	r12, QWORD PTR [rcx+64]
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+56], r11
-        ; a[i+5] += m[5] * mu
-        mulx	r10, r9, QWORD PTR [rax+40]
-        mov	r11, QWORD PTR [rcx+72]
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+64], r12
-        adcx	r11, r15
-        mov	QWORD PTR [rcx+72], r11
-        mov	r15, r13
-        adox	r15, r13
-        adcx	r15, r13
-        ; Step 4.
-        ; mu = a[i] * mp
-        mov	rdx, r14
-        mov	r11, r14
-        imul	rdx, r8
-        xor	r13, r13
-        ; a[i+0] += m[0] * mu
-        mulx	r10, r9, QWORD PTR [rax]
-        mov	r14, QWORD PTR [rcx+40]
-        adcx	r11, r9
-        adox	r14, r10
-        ; a[i+1] += m[1] * mu
-        mulx	r10, r9, QWORD PTR [rax+8]
-        mov	r11, QWORD PTR [rcx+48]
-        adcx	r14, r9
-        adox	r11, r10
-        ; a[i+2] += m[2] * mu
-        mulx	r10, r9, QWORD PTR [rax+16]
-        mov	r12, QWORD PTR [rcx+56]
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+48], r11
-        ; a[i+3] += m[3] * mu
-        mulx	r10, r9, QWORD PTR [rax+24]
-        mov	r11, QWORD PTR [rcx+64]
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+56], r12
-        ; a[i+4] += m[4] * mu
-        mulx	r10, r9, QWORD PTR [rax+32]
-        mov	r12, QWORD PTR [rcx+72]
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+64], r11
-        ; a[i+5] += m[5] * mu
-        mulx	r10, r9, QWORD PTR [rax+40]
-        mov	r11, QWORD PTR [rcx+80]
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+72], r12
-        adcx	r11, r15
-        mov	QWORD PTR [rcx+80], r11
-        mov	r15, r13
-        adox	r15, r13
-        adcx	r15, r13
-        ; Step 5.
-        ; mu = a[i] * mp
-        mov	rdx, r14
-        mov	r11, r14
-        imul	rdx, r8
-        xor	r13, r13
-        ; a[i+0] += m[0] * mu
-        mulx	r10, r9, QWORD PTR [rax]
-        mov	r14, QWORD PTR [rcx+48]
-        adcx	r11, r9
-        adox	r14, r10
-        ; a[i+1] += m[1] * mu
-        mulx	r10, r9, QWORD PTR [rax+8]
-        mov	r11, QWORD PTR [rcx+56]
-        adcx	r14, r9
-        adox	r11, r10
-        ; a[i+2] += m[2] * mu
-        mulx	r10, r9, QWORD PTR [rax+16]
-        mov	r12, QWORD PTR [rcx+64]
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+56], r11
-        ; a[i+3] += m[3] * mu
-        mulx	r10, r9, QWORD PTR [rax+24]
-        mov	r11, QWORD PTR [rcx+72]
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+64], r12
-        ; a[i+4] += m[4] * mu
-        mulx	r10, r9, QWORD PTR [rax+32]
-        mov	r12, QWORD PTR [rcx+80]
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+72], r11
-        ; a[i+5] += m[5] * mu
-        mulx	r10, r9, QWORD PTR [rax+40]
-        mov	r11, QWORD PTR [rcx+88]
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+80], r12
-        adcx	r11, r15
-        mov	QWORD PTR [rcx+88], r11
-        mov	r15, r13
-        adox	r15, r13
-        adcx	r15, r13
-        ; Turn the final carry into a mask (a carry of 1 becomes all ones).
-        neg	r15
-        ; r8 = destination (words 0..5); rcx now points at word 6.  Word 6
-        ; itself is still held in r14 and was never stored back.
-        mov	r8, rcx
-        add	rcx, 48
-        ; Masked subtraction of m: pext with an all-ones mask returns the
-        ; source unchanged and with a zero mask returns zero.  mov and pext
-        ; interleaved with the sbb chain leave CF intact.
-        mov	r10, QWORD PTR [rax]
-        mov	rdx, r14
-        pext	r10, r10, r15
-        sub	rdx, r10
-        mov	r10, QWORD PTR [rax+8]
-        mov	r9, QWORD PTR [rcx+8]
-        pext	r10, r10, r15
-        mov	QWORD PTR [r8], rdx
-        sbb	r9, r10
-        mov	rdx, QWORD PTR [rax+16]
-        mov	r10, QWORD PTR [rcx+16]
-        pext	rdx, rdx, r15
-        mov	QWORD PTR [r8+8], r9
-        sbb	r10, rdx
-        mov	r9, QWORD PTR [rax+24]
-        mov	rdx, QWORD PTR [rcx+24]
-        pext	r9, r9, r15
-        mov	QWORD PTR [r8+16], r10
-        sbb	rdx, r9
-        mov	r10, QWORD PTR [rax+32]
-        mov	r9, QWORD PTR [rcx+32]
-        pext	r10, r10, r15
-        mov	QWORD PTR [r8+24], rdx
-        sbb	r9, r10
-        mov	rdx, QWORD PTR [rax+40]
-        mov	r10, QWORD PTR [rcx+40]
-        pext	rdx, rdx, r15
-        mov	QWORD PTR [r8+32], r9
-        sbb	r10, rdx
-        mov	QWORD PTR [r8+40], r10
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_384_mont_reduce_order_avx2_6 ENDP
-_text ENDS
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-; /* Conditionally subtract b from a using the mask m.
-;  * m is -1 to subtract and 0 when not copying.
-;  *
-;  * r is accessed through rcx, a through rdx and b through r8; the mask is
-;  * taken from r9.  rax is set to 0 minus the final borrow (0 or -1).
-;  *
-;  * r  A single precision number representing condition subtract result.
-;  * a  A single precision number to subtract from.
-;  * b  A single precision number to subtract.
-;  * m  Mask value to apply.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_cond_sub_avx2_6 PROC
-        push	r12
-        ; Each word of b is masked with pext (an all-ones mask returns the word
-        ; unchanged, a zero mask returns zero) and subtracted from a with
-        ; borrow.  The interleaved mov and pext instructions leave CF intact.
-        mov	r12, QWORD PTR [r8]
-        mov	r10, QWORD PTR [rdx]
-        pext	r12, r12, r9
-        sub	r10, r12
-        mov	r12, QWORD PTR [r8+8]
-        mov	r11, QWORD PTR [rdx+8]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+16]
-        mov	r12, QWORD PTR [rdx+16]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+8], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [rdx+24]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+16], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [rdx+32]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+24], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+40]
-        mov	r12, QWORD PTR [rdx+40]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+32], r11
-        sbb	r12, r10
-        mov	QWORD PTR [rcx+40], r12
-        ; rax = 0 - borrow
-        sbb	rax, rax
-        pop	r12
-        ret
-sp_384_cond_sub_avx2_6 ENDP
-_text ENDS
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-; /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m)
-;  *
-;  * When the low bit of a is set, m is added to a first; the 385-bit sum
-;  * (six words plus the carry) is then shifted right by one bit.  The sum is
-;  * staged in r before being shifted.
-;  * r is accessed through rcx, a through rdx and m through r8.
-;  *
-;  * r  Result of division by 2.
-;  * a  Number to divide.
-;  * m  Modulus (prime).
-;  */
-_text SEGMENT READONLY PARA
-sp_384_mont_div2_avx2_6 PROC
-        push	r12
-        push	r13
-        mov	r13, QWORD PTR [rdx]
-        ; r12 collects the carry out of the addition below.
-        xor	r12, r12
-        ; r13 = 0 - (a[0] & 1): all ones when a is odd, else zero.
-        and	r13, 1
-        neg	r13
-        ; r[i] = a[i] + (m[i] & mask), two words at a time.  pext with an
-        ; all-ones mask returns the word unchanged, with a zero mask zero.
-        ; mov and pext between the adc instructions leave CF intact.
-        mov	rax, QWORD PTR [r8]
-        mov	r9, QWORD PTR [r8+8]
-        mov	r10, QWORD PTR [rdx]
-        mov	r11, QWORD PTR [rdx+8]
-        pext	rax, rax, r13
-        pext	r9, r9, r13
-        add	r10, rax
-        adc	r11, r9
-        mov	QWORD PTR [rcx], r10
-        mov	QWORD PTR [rcx+8], r11
-        mov	rax, QWORD PTR [r8+16]
-        mov	r9, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [rdx+16]
-        mov	r11, QWORD PTR [rdx+24]
-        pext	rax, rax, r13
-        pext	r9, r9, r13
-        adc	r10, rax
-        adc	r11, r9
-        mov	QWORD PTR [rcx+16], r10
-        mov	QWORD PTR [rcx+24], r11
-        mov	rax, QWORD PTR [r8+32]
-        mov	r9, QWORD PTR [r8+40]
-        mov	r10, QWORD PTR [rdx+32]
-        mov	r11, QWORD PTR [rdx+40]
-        pext	rax, rax, r13
-        pext	r9, r9, r13
-        adc	r10, rax
-        adc	r11, r9
-        mov	QWORD PTR [rcx+32], r10
-        mov	QWORD PTR [rcx+40], r11
-        ; r12 = carry out of the top word.
-        adc	r12, 0
-        ; Shift the sum right by one: each shrd pulls the low bit of the next
-        ; higher word into the top bit of the word being written.
-        mov	r10, QWORD PTR [rcx]
-        mov	r11, QWORD PTR [rcx+8]
-        shrd	r10, r11, 1
-        mov	QWORD PTR [rcx], r10
-        mov	r10, QWORD PTR [rcx+16]
-        shrd	r11, r10, 1
-        mov	QWORD PTR [rcx+8], r11
-        mov	r11, QWORD PTR [rcx+24]
-        shrd	r10, r11, 1
-        mov	QWORD PTR [rcx+16], r10
-        mov	r10, QWORD PTR [rcx+32]
-        shrd	r11, r10, 1
-        mov	QWORD PTR [rcx+24], r11
-        mov	r11, QWORD PTR [rcx+40]
-        shrd	r10, r11, 1
-        mov	QWORD PTR [rcx+32], r10
-        ; The top word takes the carry bit as its new most significant bit.
-        shrd	r11, r12, 1
-        mov	QWORD PTR [rcx+40], r11
-        pop	r13
-        pop	r12
-        ret
-sp_384_mont_div2_avx2_6 ENDP
-_text ENDS
-ENDIF
-IFNDEF WC_NO_CACHE_RESISTANT
-; /* Touch each possible entry that could be being copied.
-;  *
-;  * Table entries 1..63 (96 bytes apart) are all read and masked, and only
-;  * the one whose position equals idx is kept, so the addresses accessed do
-;  * not depend on idx.  Bytes 0..47 of the entry are copied to r at offset 0
-;  * and bytes 48..95 to r at offset 96.  An idx outside 1..63 leaves those
-;  * fields of r zero.
-;  *
-;  * r is accessed through rcx, table through rdx and idx is taken from r8d.
-;  * xmm6-xmm15 are saved and restored with movdqu so that this non-AVX
-;  * variant contains SSE2 instructions only and can run on CPUs without AVX.
-;  *
-;  * r      Point to copy into.
-;  * table  Table - start of the entries to access
-;  * idx    Index of entry to retrieve.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_get_entry_64_6 PROC
-        ; Save xmm6-xmm15, which are used as scratch below.  rsp is not
-        ; adjusted to a 16-byte boundary here, hence the unaligned moves.
-        sub	rsp, 160
-        movdqu	OWORD PTR [rsp], xmm6
-        movdqu	OWORD PTR [rsp+16], xmm7
-        movdqu	OWORD PTR [rsp+32], xmm8
-        movdqu	OWORD PTR [rsp+48], xmm9
-        movdqu	OWORD PTR [rsp+64], xmm10
-        movdqu	OWORD PTR [rsp+80], xmm11
-        movdqu	OWORD PTR [rsp+96], xmm12
-        movdqu	OWORD PTR [rsp+112], xmm13
-        movdqu	OWORD PTR [rsp+128], xmm14
-        movdqu	OWORD PTR [rsp+144], xmm15
-        ; From entry 1
-        ; xmm13 = idx in every 32-bit lane, xmm15 = 1 in every lane,
-        ; xmm14 = running entry number (starts at 1), rax = 63 iterations.
-        mov	rax, 1
-        movd	xmm13, r8d
-        add	rdx, 96
-        movd	xmm15, eax
-        mov	rax, 63
-        pshufd	xmm15, xmm15, 0
-        pshufd	xmm13, xmm13, 0
-        ; xmm0-xmm5 accumulate the selected entry.
-        pxor	xmm0, xmm0
-        pxor	xmm1, xmm1
-        pxor	xmm2, xmm2
-        pxor	xmm3, xmm3
-        pxor	xmm4, xmm4
-        pxor	xmm5, xmm5
-        movdqa	xmm14, xmm15
-L_384_get_entry_64_6_start_0:
-        ; xmm12 = all ones when the entry number equals idx, else zero.
-        movdqa	xmm12, xmm14
-        paddd	xmm14, xmm15
-        pcmpeqd	xmm12, xmm13
-        movdqu	xmm6, [rdx]
-        movdqu	xmm7, [rdx+16]
-        movdqu	xmm8, [rdx+32]
-        movdqu	xmm9, [rdx+48]
-        movdqu	xmm10, [rdx+64]
-        movdqu	xmm11, [rdx+80]
-        add	rdx, 96
-        pand	xmm6, xmm12
-        pand	xmm7, xmm12
-        pand	xmm8, xmm12
-        pand	xmm9, xmm12
-        pand	xmm10, xmm12
-        pand	xmm11, xmm12
-        por	xmm0, xmm6
-        por	xmm1, xmm7
-        por	xmm2, xmm8
-        por	xmm3, xmm9
-        por	xmm4, xmm10
-        por	xmm5, xmm11
-        dec	rax
-        jnz	L_384_get_entry_64_6_start_0
-        movdqu	[rcx], xmm0
-        movdqu	[rcx+16], xmm1
-        movdqu	[rcx+32], xmm2
-        movdqu	[rcx+96], xmm3
-        movdqu	[rcx+112], xmm4
-        movdqu	[rcx+128], xmm5
-        ; Restore xmm6-xmm15.
-        movdqu	xmm6, OWORD PTR [rsp]
-        movdqu	xmm7, OWORD PTR [rsp+16]
-        movdqu	xmm8, OWORD PTR [rsp+32]
-        movdqu	xmm9, OWORD PTR [rsp+48]
-        movdqu	xmm10, OWORD PTR [rsp+64]
-        movdqu	xmm11, OWORD PTR [rsp+80]
-        movdqu	xmm12, OWORD PTR [rsp+96]
-        movdqu	xmm13, OWORD PTR [rsp+112]
-        movdqu	xmm14, OWORD PTR [rsp+128]
-        movdqu	xmm15, OWORD PTR [rsp+144]
-        add	rsp, 160
-        ret
-sp_384_get_entry_64_6 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Touch each possible entry that could be being copied.
-;  *
-;  * AVX2 variant.  Table entries 1..63 (96 bytes apart) are all read and
-;  * masked, and only the one whose position equals idx is kept, so the
-;  * addresses accessed do not depend on idx.  Bytes 0..47 of the entry are
-;  * copied to r at offset 0 and bytes 48..95 to r at offset 96.  An idx
-;  * outside 1..63 leaves those fields of r zero.
-;  *
-;  * r is accessed through rcx, table through rdx and idx is taken from r8d.
-;  * vzeroupper is issued before returning so that callers executing legacy
-;  * SSE code do not see dirty upper YMM halves.
-;  *
-;  * r      Point to copy into.
-;  * table  Table - start of the entries to access
-;  * idx    Index of entry to retrieve.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_get_entry_64_avx2_6 PROC
-        ; Save xmm6-xmm11, which are used as scratch below.
-        sub	rsp, 96
-        vmovdqu	OWORD PTR [rsp], xmm6
-        vmovdqu	OWORD PTR [rsp+16], xmm7
-        vmovdqu	OWORD PTR [rsp+32], xmm8
-        vmovdqu	OWORD PTR [rsp+48], xmm9
-        vmovdqu	OWORD PTR [rsp+64], xmm10
-        vmovdqu	OWORD PTR [rsp+80], xmm11
-        mov	rax, 1
-        movd	xmm9, r8d
-        ; Skip entry 0.
-        add	rdx, 96
-        movd	xmm11, eax
-        ; 63 iterations: entries 1..63, the same range that
-        ; sp_384_get_entry_64_6 scans.  One more iteration would read 96
-        ; bytes beyond entry 63.
-        mov	rax, 63
-        ; With an all-zero index vector, vpermd copies lane 0 to every lane:
-        ; ymm9 = idx in all lanes, ymm11 = 1 in all lanes.
-        vpxor	ymm10, ymm10, ymm10
-        vpermd	ymm9, ymm10, ymm9
-        vpermd	ymm11, ymm10, ymm11
-        ; ymm0/xmm1 and ymm2/xmm3 accumulate the two 48-byte fields.
-        vpxor	ymm0, ymm0, ymm0
-        vpxor	xmm1, xmm1, xmm1
-        vpxor	ymm2, ymm2, ymm2
-        vpxor	xmm3, xmm3, xmm3
-        ; ymm10 = running entry number, starting at 1.
-        vmovdqa	ymm10, ymm11
-L_384_get_entry_64_avx2_6_start:
-        ; ymm8 = all ones when the entry number equals idx, else zero.
-        vpcmpeqd	ymm8, ymm10, ymm9
-        vpaddd	ymm10, ymm10, ymm11
-        vmovupd	ymm4, YMMWORD PTR [rdx]
-        vmovdqu	xmm5, OWORD PTR [rdx+32]
-        vmovupd	ymm6, YMMWORD PTR [rdx+48]
-        vmovdqu	xmm7, OWORD PTR [rdx+80]
-        add	rdx, 96
-        vpand	ymm4, ymm4, ymm8
-        vpand	xmm5, xmm5, xmm8
-        vpand	ymm6, ymm6, ymm8
-        vpand	xmm7, xmm7, xmm8
-        vpor	ymm0, ymm0, ymm4
-        vpor	xmm1, xmm1, xmm5
-        vpor	ymm2, ymm2, ymm6
-        vpor	xmm3, xmm3, xmm7
-        dec	rax
-        jnz	L_384_get_entry_64_avx2_6_start
-        vmovupd	YMMWORD PTR [rcx], ymm0
-        vmovdqu	OWORD PTR [rcx+32], xmm1
-        vmovupd	YMMWORD PTR [rcx+96], ymm2
-        vmovdqu	OWORD PTR [rcx+128], xmm3
-        ; Restore xmm6-xmm11.
-        vmovdqu	xmm6, OWORD PTR [rsp]
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vmovdqu	xmm8, OWORD PTR [rsp+32]
-        vmovdqu	xmm9, OWORD PTR [rsp+48]
-        vmovdqu	xmm10, OWORD PTR [rsp+64]
-        vmovdqu	xmm11, OWORD PTR [rsp+80]
-        ; Clear the upper YMM halves (the low 128 bits are left untouched).
-        vzeroupper
-        add	rsp, 96
-        ret
-sp_384_get_entry_64_avx2_6 ENDP
-_text ENDS
-ENDIF
-ENDIF
-IFNDEF WC_NO_CACHE_RESISTANT
-; /* Touch each possible entry that could be being copied.
-;  *
-;  * Table entries 1..64 (96 bytes apart) are all read and masked, and only
-;  * the one whose position equals idx is kept, so the addresses accessed do
-;  * not depend on idx.  Bytes 0..47 of the entry are copied to r at offset 0
-;  * and bytes 48..95 to r at offset 96.  An idx outside 1..64 leaves those
-;  * fields of r zero.
-;  *
-;  * r is accessed through rcx, table through rdx and idx is taken from r8d.
-;  * xmm6-xmm15 are saved and restored with movdqu so that this non-AVX
-;  * variant contains SSE2 instructions only and can run on CPUs without AVX.
-;  *
-;  * r      Point to copy into.
-;  * table  Table - start of the entries to access
-;  * idx    Index of entry to retrieve.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_get_entry_65_6 PROC
-        ; Save xmm6-xmm15, which are used as scratch below.  rsp is not
-        ; adjusted to a 16-byte boundary here, hence the unaligned moves.
-        sub	rsp, 160
-        movdqu	OWORD PTR [rsp], xmm6
-        movdqu	OWORD PTR [rsp+16], xmm7
-        movdqu	OWORD PTR [rsp+32], xmm8
-        movdqu	OWORD PTR [rsp+48], xmm9
-        movdqu	OWORD PTR [rsp+64], xmm10
-        movdqu	OWORD PTR [rsp+80], xmm11
-        movdqu	OWORD PTR [rsp+96], xmm12
-        movdqu	OWORD PTR [rsp+112], xmm13
-        movdqu	OWORD PTR [rsp+128], xmm14
-        movdqu	OWORD PTR [rsp+144], xmm15
-        ; From entry 1
-        ; xmm13 = idx in every 32-bit lane, xmm15 = 1 in every lane,
-        ; xmm14 = running entry number (starts at 1), rax = 64 iterations.
-        mov	rax, 1
-        movd	xmm13, r8d
-        add	rdx, 96
-        movd	xmm15, eax
-        mov	rax, 64
-        pshufd	xmm15, xmm15, 0
-        pshufd	xmm13, xmm13, 0
-        ; xmm0-xmm5 accumulate the selected entry.
-        pxor	xmm0, xmm0
-        pxor	xmm1, xmm1
-        pxor	xmm2, xmm2
-        pxor	xmm3, xmm3
-        pxor	xmm4, xmm4
-        pxor	xmm5, xmm5
-        movdqa	xmm14, xmm15
-L_384_get_entry_65_6_start_0:
-        ; xmm12 = all ones when the entry number equals idx, else zero.
-        movdqa	xmm12, xmm14
-        paddd	xmm14, xmm15
-        pcmpeqd	xmm12, xmm13
-        movdqu	xmm6, [rdx]
-        movdqu	xmm7, [rdx+16]
-        movdqu	xmm8, [rdx+32]
-        movdqu	xmm9, [rdx+48]
-        movdqu	xmm10, [rdx+64]
-        movdqu	xmm11, [rdx+80]
-        add	rdx, 96
-        pand	xmm6, xmm12
-        pand	xmm7, xmm12
-        pand	xmm8, xmm12
-        pand	xmm9, xmm12
-        pand	xmm10, xmm12
-        pand	xmm11, xmm12
-        por	xmm0, xmm6
-        por	xmm1, xmm7
-        por	xmm2, xmm8
-        por	xmm3, xmm9
-        por	xmm4, xmm10
-        por	xmm5, xmm11
-        dec	rax
-        jnz	L_384_get_entry_65_6_start_0
-        movdqu	[rcx], xmm0
-        movdqu	[rcx+16], xmm1
-        movdqu	[rcx+32], xmm2
-        movdqu	[rcx+96], xmm3
-        movdqu	[rcx+112], xmm4
-        movdqu	[rcx+128], xmm5
-        ; Restore xmm6-xmm15.
-        movdqu	xmm6, OWORD PTR [rsp]
-        movdqu	xmm7, OWORD PTR [rsp+16]
-        movdqu	xmm8, OWORD PTR [rsp+32]
-        movdqu	xmm9, OWORD PTR [rsp+48]
-        movdqu	xmm10, OWORD PTR [rsp+64]
-        movdqu	xmm11, OWORD PTR [rsp+80]
-        movdqu	xmm12, OWORD PTR [rsp+96]
-        movdqu	xmm13, OWORD PTR [rsp+112]
-        movdqu	xmm14, OWORD PTR [rsp+128]
-        movdqu	xmm15, OWORD PTR [rsp+144]
-        add	rsp, 160
-        ret
-sp_384_get_entry_65_6 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Touch each possible entry that could be being copied.
-;  *
-;  * AVX2 variant.  Table entries 1..64 (96 bytes apart) are all read and
-;  * masked, and only the one whose position equals idx is kept, so the
-;  * addresses accessed do not depend on idx.  Bytes 0..47 of the entry are
-;  * copied to r at offset 0 and bytes 48..95 to r at offset 96.  An idx
-;  * outside 1..64 leaves those fields of r zero.
-;  *
-;  * r is accessed through rcx, table through rdx and idx is taken from r8d.
-;  * vzeroupper is issued before returning so that callers executing legacy
-;  * SSE code do not see dirty upper YMM halves.
-;  *
-;  * r      Point to copy into.
-;  * table  Table - start of the entries to access
-;  * idx    Index of entry to retrieve.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_get_entry_65_avx2_6 PROC
-        ; Save xmm6-xmm11, which are used as scratch below.
-        sub	rsp, 96
-        vmovdqu	OWORD PTR [rsp], xmm6
-        vmovdqu	OWORD PTR [rsp+16], xmm7
-        vmovdqu	OWORD PTR [rsp+32], xmm8
-        vmovdqu	OWORD PTR [rsp+48], xmm9
-        vmovdqu	OWORD PTR [rsp+64], xmm10
-        vmovdqu	OWORD PTR [rsp+80], xmm11
-        mov	rax, 1
-        movd	xmm9, r8d
-        ; Skip entry 0.
-        add	rdx, 96
-        movd	xmm11, eax
-        ; 64 iterations: entries 1..64, the same range that
-        ; sp_384_get_entry_65_6 scans.  One more iteration would read 96
-        ; bytes beyond entry 64.
-        mov	rax, 64
-        ; With an all-zero index vector, vpermd copies lane 0 to every lane:
-        ; ymm9 = idx in all lanes, ymm11 = 1 in all lanes.
-        vpxor	ymm10, ymm10, ymm10
-        vpermd	ymm9, ymm10, ymm9
-        vpermd	ymm11, ymm10, ymm11
-        ; ymm0/xmm1 and ymm2/xmm3 accumulate the two 48-byte fields.
-        vpxor	ymm0, ymm0, ymm0
-        vpxor	xmm1, xmm1, xmm1
-        vpxor	ymm2, ymm2, ymm2
-        vpxor	xmm3, xmm3, xmm3
-        ; ymm10 = running entry number, starting at 1.
-        vmovdqa	ymm10, ymm11
-L_384_get_entry_65_avx2_6_start:
-        ; ymm8 = all ones when the entry number equals idx, else zero.
-        vpcmpeqd	ymm8, ymm10, ymm9
-        vpaddd	ymm10, ymm10, ymm11
-        vmovupd	ymm4, YMMWORD PTR [rdx]
-        vmovdqu	xmm5, OWORD PTR [rdx+32]
-        vmovupd	ymm6, YMMWORD PTR [rdx+48]
-        vmovdqu	xmm7, OWORD PTR [rdx+80]
-        add	rdx, 96
-        vpand	ymm4, ymm4, ymm8
-        vpand	xmm5, xmm5, xmm8
-        vpand	ymm6, ymm6, ymm8
-        vpand	xmm7, xmm7, xmm8
-        vpor	ymm0, ymm0, ymm4
-        vpor	xmm1, xmm1, xmm5
-        vpor	ymm2, ymm2, ymm6
-        vpor	xmm3, xmm3, xmm7
-        dec	rax
-        jnz	L_384_get_entry_65_avx2_6_start
-        vmovupd	YMMWORD PTR [rcx], ymm0
-        vmovdqu	OWORD PTR [rcx+32], xmm1
-        vmovupd	YMMWORD PTR [rcx+96], ymm2
-        vmovdqu	OWORD PTR [rcx+128], xmm3
-        ; Restore xmm6-xmm11.
-        vmovdqu	xmm6, OWORD PTR [rsp]
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vmovdqu	xmm8, OWORD PTR [rsp+32]
-        vmovdqu	xmm9, OWORD PTR [rsp+48]
-        vmovdqu	xmm10, OWORD PTR [rsp+64]
-        vmovdqu	xmm11, OWORD PTR [rsp+80]
-        ; Clear the upper YMM halves (the low 128 bits are left untouched).
-        vzeroupper
-        add	rsp, 96
-        ret
-sp_384_get_entry_65_avx2_6 ENDP
-_text ENDS
-ENDIF
-ENDIF
-; /* Add 1 to a. (a = a + 1)
-;  *
-;  * a is accessed through rcx and updated in place, six 64-bit words.
-;  * A carry out of the top word is not recorded anywhere.
-;  *
-;  * a  A single precision integer.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_add_one_6 PROC
-        ; Add 1 to the lowest word, then ripple the carry through the rest.
-        add	QWORD PTR [rcx], 1
-        adc	QWORD PTR [rcx+8], 0
-        adc	QWORD PTR [rcx+16], 0
-        adc	QWORD PTR [rcx+24], 0
-        adc	QWORD PTR [rcx+32], 0
-        adc	QWORD PTR [rcx+40], 0
-        ret
-sp_384_add_one_6 ENDP
-_text ENDS
-; /* Read big endian unsigned byte array into r.
-;  * Uses the bswap instruction.
-;  *
-;  * r is accessed through rcx, a through r8 and n is taken from r9 (compared
-;  * as a signed 64-bit value).  The register that would carry size (rdx) is
-;  * not read; words of r not filled from a are zeroed up to r + 48 bytes.
-;  * NOTE(review): nothing here checks that n bytes fit in those 48 bytes -
-;  * presumably the caller guarantees it; confirm against the C caller.
-;  *
-;  * r  A single precision integer.
-;  * size  Maximum number of bytes to convert
-;  * a  Byte array.
-;  * n  Number of bytes in array to read.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_from_bin_bswap PROC
-        push	r12
-        push	r13
-        ; r11 = a + n: one past the last (least significant) byte.
-        mov	r11, r8
-        ; r12 = r + 48: end of the area that is zero filled.
-        mov	r12, rcx
-        add	r11, r9
-        add	r12, 48
-        ; r13 = 0, used as the zero constant below.
-        xor	r13, r13
-        jmp	L_384_from_bin_bswap_64_end
-        ; While more than 63 bytes remain: take 64 bytes from the end of the
-        ; array and store them as eight byte-swapped words, lowest word first.
-L_384_from_bin_bswap_64_start:
-        sub	r11, 64
-        mov	rax, QWORD PTR [r11+56]
-        mov	r10, QWORD PTR [r11+48]
-        bswap	rax
-        bswap	r10
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r10
-        mov	rax, QWORD PTR [r11+40]
-        mov	r10, QWORD PTR [r11+32]
-        bswap	rax
-        bswap	r10
-        mov	QWORD PTR [rcx+16], rax
-        mov	QWORD PTR [rcx+24], r10
-        mov	rax, QWORD PTR [r11+24]
-        mov	r10, QWORD PTR [r11+16]
-        bswap	rax
-        bswap	r10
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r10
-        mov	rax, QWORD PTR [r11+8]
-        mov	r10, QWORD PTR [r11]
-        bswap	rax
-        bswap	r10
-        mov	QWORD PTR [rcx+48], rax
-        mov	QWORD PTR [rcx+56], r10
-        add	rcx, 64
-        sub	r9, 64
-L_384_from_bin_bswap_64_end:
-        cmp	r9, 63
-        jg	L_384_from_bin_bswap_64_start
-        jmp	L_384_from_bin_bswap_8_end
-        ; While more than 7 bytes remain: one byte-swapped word at a time,
-        ; still working backwards from the end of the array.
-L_384_from_bin_bswap_8_start:
-        sub	r11, 8
-        mov	rax, QWORD PTR [r11]
-        bswap	rax
-        mov	QWORD PTR [rcx], rax
-        add	rcx, 8
-        sub	r9, 8
-L_384_from_bin_bswap_8_end:
-        cmp	r9, 7
-        jg	L_384_from_bin_bswap_8_start
-        ; Skip the partial word when no bytes are left.
-        cmp	r9, r13
-        je	L_384_from_bin_bswap_hi_end
-        ; The leftover bytes are the first ones of the array (read through
-        ; r8).  They are accumulated into r10, most significant byte first;
-        ; rax is zeroed so that writing al yields the zero-extended byte.
-        mov	r10, r13
-        mov	rax, r13
-L_384_from_bin_bswap_hi_start:
-        mov	al, BYTE PTR [r8]
-        shl	r10, 8
-        inc	r8
-        add	r10, rax
-        dec	r9
-        jg	L_384_from_bin_bswap_hi_start
-        mov	QWORD PTR [rcx], r10
-        add	rcx, 8
-L_384_from_bin_bswap_hi_end:
-        ; Zero any words between the write position and r + 48.
-        cmp	rcx, r12
-        jge	L_384_from_bin_bswap_zero_end
-L_384_from_bin_bswap_zero_start:
-        mov	QWORD PTR [rcx], r13
-        add	rcx, 8
-        cmp	rcx, r12
-        jl	L_384_from_bin_bswap_zero_start
-L_384_from_bin_bswap_zero_end:
-        pop	r13
-        pop	r12
-        ret
-sp_384_from_bin_bswap ENDP
-_text ENDS
-IFNDEF NO_MOVBE_SUPPORT
-; /* Convert a big endian byte array into 64-bit words, least significant
-;  * word first, using the optional movbe instruction for the byte swap.
-;  *
-;  * r     (rcx) Output words. Words not supplied by the input are zeroed
-;  *             up to r + 48.
-;  * size  (rdx) Maximum number of bytes to convert. Not read by this
-;  *             routine; the zero fill always stops at r + 48.
-;  * a     (r8)  Byte array, most significant byte first.
-;  * n     (r9)  Number of bytes in array to read (compared as signed).
-;  *
-;  * Register use: r11 = read cursor (starts at a + n and walks down),
-;  *               r12 = r + 48.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_from_bin_movbe PROC
-        push    r12
-        mov     r11, r8
-        mov     r12, rcx
-        add     r11, r9                         ; r11 = one past the last input byte
-        add     r12, 48                         ; r12 = end of the output words
-        jmp     L_384_from_bin_movbe_64_end
-        ; 64 input bytes per pass: the last bytes of the input become the
-        ; lowest output words.
-L_384_from_bin_movbe_64_start:
-        sub     r11, 64
-        movbe   rax, qword ptr [r11+56]
-        movbe   r10, qword ptr [r11+48]
-        mov     qword ptr [rcx], rax
-        mov     qword ptr [rcx+8], r10
-        movbe   rax, qword ptr [r11+40]
-        movbe   r10, qword ptr [r11+32]
-        mov     qword ptr [rcx+16], rax
-        mov     qword ptr [rcx+24], r10
-        movbe   rax, qword ptr [r11+24]
-        movbe   r10, qword ptr [r11+16]
-        mov     qword ptr [rcx+32], rax
-        mov     qword ptr [rcx+40], r10
-        movbe   rax, qword ptr [r11+8]
-        movbe   r10, qword ptr [r11]
-        mov     qword ptr [rcx+48], rax
-        mov     qword ptr [rcx+56], r10
-        add     rcx, 64
-        sub     r9, 64
-L_384_from_bin_movbe_64_end:
-        cmp     r9, 63
-        jg      L_384_from_bin_movbe_64_start
-        jmp     L_384_from_bin_movbe_8_end
-        ; One 8-byte word per pass while at least 8 bytes remain.
-L_384_from_bin_movbe_8_start:
-        sub     r11, 8
-        movbe   rax, qword ptr [r11]
-        mov     qword ptr [rcx], rax
-        add     rcx, 8
-        sub     r9, 8
-L_384_from_bin_movbe_8_end:
-        cmp     r9, 7
-        jg      L_384_from_bin_movbe_8_start
-        test    r9, r9                          ; any bytes left over?
-        jz      L_384_from_bin_movbe_hi_end
-        xor     r10d, r10d                      ; r10 = partial top word
-        xor     eax, eax                        ; clear rax so al zero-extends
-        ; The leftover bytes are the leading ones at a[0..]; r8 has not moved.
-L_384_from_bin_movbe_hi_start:
-        mov     al, byte ptr [r8]
-        shl     r10, 8
-        inc     r8
-        add     r10, rax
-        dec     r9
-        jg      L_384_from_bin_movbe_hi_start
-        mov     qword ptr [rcx], r10
-        add     rcx, 8
-L_384_from_bin_movbe_hi_end:
-        cmp     rcx, r12
-        jge     L_384_from_bin_movbe_zero_end
-        ; Zero every remaining word below r + 48.
-L_384_from_bin_movbe_zero_start:
-        mov     qword ptr [rcx], 0
-        add     rcx, 8
-        cmp     rcx, r12
-        jl      L_384_from_bin_movbe_zero_start
-L_384_from_bin_movbe_zero_end:
-        pop     r12
-        ret
-sp_384_from_bin_movbe ENDP
-_text ENDS
-ENDIF
-; /* Serialize six 64-bit words as 48 big endian bytes.
-;  * Byte order is reversed with the bswap instruction.
-;  *
-;  * r  (rcx) Six 64-bit words, least significant word first.
-;  * a  (rdx) Output byte array; exactly 48 bytes are written.
-;  *
-;  * Words are handled in pairs, highest first: r[5] lands at a[0..7] and
-;  * r[0] at a[40..47].
-;  */
-_text SEGMENT READONLY PARA
-sp_384_to_bin_bswap_6 PROC
-        ; r[5], r[4] -> a[0..15]
-        mov     rax, qword ptr [rcx+40]
-        mov     r8,  qword ptr [rcx+32]
-        bswap   rax
-        bswap   r8
-        mov     qword ptr [rdx],    rax
-        mov     qword ptr [rdx+8],  r8
-        ; r[3], r[2] -> a[16..31]
-        mov     rax, qword ptr [rcx+24]
-        mov     r8,  qword ptr [rcx+16]
-        bswap   rax
-        bswap   r8
-        mov     qword ptr [rdx+16], rax
-        mov     qword ptr [rdx+24], r8
-        ; r[1], r[0] -> a[32..47]
-        mov     rax, qword ptr [rcx+8]
-        mov     r8,  qword ptr [rcx]
-        bswap   rax
-        bswap   r8
-        mov     qword ptr [rdx+32], rax
-        mov     qword ptr [rdx+40], r8
-        ret
-sp_384_to_bin_bswap_6 ENDP
-_text ENDS
-IFNDEF NO_MOVBE_SUPPORT
-; /* Serialize six 64-bit words as 48 big endian bytes.
-;  * Byte order is reversed on load by the optional movbe instruction.
-;  *
-;  * r  (rcx) Six 64-bit words, least significant word first.
-;  * a  (rdx) Output byte array; exactly 48 bytes are written.
-;  *
-;  * Words are handled in pairs, highest first: r[5] lands at a[0..7] and
-;  * r[0] at a[40..47].
-;  */
-_text SEGMENT READONLY PARA
-sp_384_to_bin_movbe_6 PROC
-        ; r[5], r[4] -> a[0..15]
-        movbe   rax, qword ptr [rcx+40]
-        movbe   r8,  qword ptr [rcx+32]
-        mov     qword ptr [rdx],    rax
-        mov     qword ptr [rdx+8],  r8
-        ; r[3], r[2] -> a[16..31]
-        movbe   rax, qword ptr [rcx+24]
-        movbe   r8,  qword ptr [rcx+16]
-        mov     qword ptr [rdx+16], rax
-        mov     qword ptr [rdx+24], r8
-        ; r[1], r[0] -> a[32..47]
-        movbe   rax, qword ptr [rcx+8]
-        movbe   r8,  qword ptr [rcx]
-        mov     qword ptr [rdx+32], rax
-        mov     qword ptr [rdx+40], r8
-        ret
-sp_384_to_bin_movbe_6 ENDP
-_text ENDS
-ENDIF
-; /* Sub b from a into a. (a -= b)
-;  *
-;  * a  (rcx) A single precision integer and result (six 64-bit words).
-;  * b  (rdx) A single precision integer (six 64-bit words).
-;  * returns (rax) 0 when no borrow left the top word, all ones otherwise.
-;  *
-;  * Only volatile registers are used, so nothing is saved on the stack.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_sub_in_place_6 PROC
-        ; Read all of b before a is written.
-        mov	r8, QWORD PTR [rdx]
-        mov	r9, QWORD PTR [rdx+8]
-        mov	r10, QWORD PTR [rdx+16]
-        mov	r11, QWORD PTR [rdx+24]
-        mov	rax, QWORD PTR [rdx+32]
-        ; Last use of the b pointer: rdx now holds b[5].
-        mov	rdx, QWORD PTR [rdx+40]
-        sub	QWORD PTR [rcx], r8
-        sbb	QWORD PTR [rcx+8], r9
-        sbb	QWORD PTR [rcx+16], r10
-        sbb	QWORD PTR [rcx+24], r11
-        sbb	QWORD PTR [rcx+32], rax
-        sbb	QWORD PTR [rcx+40], rdx
-        ; Turn the final borrow into a 0 / -1 mask.
-        sbb	rax, rax
-        ret
-sp_384_sub_in_place_6 ENDP
-_text ENDS
-; /* Multiply a six-word integer by one 64-bit digit. (r = a * b)
-;  *
-;  * r  (rcx) Result; seven 64-bit words are written (r[0]..r[6]).
-;  * a  (rdx) A single precision integer (six 64-bit words).
-;  * b  (r8)  A single precision digit.
-;  *
-;  * rdx is overwritten by every mul, so the pointer to a is kept in r9.
-;  * r10, r11 and r12 rotate as the running three-word accumulator.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_mul_d_6 PROC
-        push    r12
-        mov     r9, rdx                         ; r9 = a
-        ; A[0] * B
-        mov     rax, r8
-        xor     r12, r12
-        mul     qword ptr [r9]
-        mov     r10, rax
-        mov     r11, rdx
-        mov     qword ptr [rcx], r10
-        ; A[1] * B
-        mov     rax, r8
-        xor     r10, r10
-        mul     qword ptr [r9+8]
-        add     r11, rax
-        mov     qword ptr [rcx+8], r11          ; mov leaves the carry intact
-        adc     r12, rdx
-        adc     r10, 0
-        ; A[2] * B
-        mov     rax, r8
-        xor     r11, r11
-        mul     qword ptr [r9+16]
-        add     r12, rax
-        mov     qword ptr [rcx+16], r12
-        adc     r10, rdx
-        adc     r11, 0
-        ; A[3] * B
-        mov     rax, r8
-        xor     r12, r12
-        mul     qword ptr [r9+24]
-        add     r10, rax
-        mov     qword ptr [rcx+24], r10
-        adc     r11, rdx
-        adc     r12, 0
-        ; A[4] * B
-        mov     rax, r8
-        xor     r10, r10
-        mul     qword ptr [r9+32]
-        add     r11, rax
-        mov     qword ptr [rcx+32], r11
-        adc     r12, rdx
-        adc     r10, 0
-        ; A[5] * B
-        mov     rax, r8
-        mul     qword ptr [r9+40]
-        add     r12, rax
-        adc     r10, rdx
-        mov     qword ptr [rcx+40], r12
-        mov     qword ptr [rcx+48], r10         ; top (seventh) word
-        pop     r12
-        ret
-sp_384_mul_d_6 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Multiply a six-word integer by one 64-bit digit. (r = a * b)
-;  * Uses the mulx, adcx and adox instructions.
-;  *
-;  * r  (rcx) Result; seven 64-bit words are written (r[0]..r[6]).
-;  * a  (rdx) A single precision integer (six 64-bit words).
-;  * b  (r8)  A single precision digit.
-;  *
-;  * mulx takes one factor from rdx, so b is moved there and the pointer
-;  * to a is kept in rax. r13 stays zero for the whole routine. The low
-;  * product halves are added with adcx and the high halves with adox.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_mul_d_avx2_6 PROC
-        push    r12
-        push    r13
-        mov     rax, rdx                        ; rax = a
-        ; A[0] * B
-        mov     rdx, r8                         ; rdx = b, the implicit mulx factor
-        xor     r13, r13                        ; r13 = 0, carry and overflow flags cleared
-        mulx    r12, r11, qword ptr [rax]
-        mov     qword ptr [rcx], r11
-        ; A[1] * B
-        mulx    r10, r9, qword ptr [rax+8]
-        mov     r11, r13
-        adcx    r12, r9
-        adox    r11, r10
-        mov     qword ptr [rcx+8], r12
-        ; A[2] * B
-        mulx    r10, r9, qword ptr [rax+16]
-        mov     r12, r13
-        adcx    r11, r9
-        adox    r12, r10
-        mov     qword ptr [rcx+16], r11
-        ; A[3] * B
-        mulx    r10, r9, qword ptr [rax+24]
-        mov     r11, r13
-        adcx    r12, r9
-        adox    r11, r10
-        mov     qword ptr [rcx+24], r12
-        ; A[4] * B
-        mulx    r10, r9, qword ptr [rax+32]
-        mov     r12, r13
-        adcx    r11, r9
-        adox    r12, r10
-        mov     qword ptr [rcx+32], r11
-        ; A[5] * B
-        mulx    r10, r9, qword ptr [rax+40]
-        mov     r11, r13
-        adcx    r12, r9
-        adox    r11, r10
-        adcx    r11, r13                        ; fold the last carry into the top word
-        mov     qword ptr [rcx+40], r12
-        mov     qword ptr [rcx+48], r11
-        pop     r13
-        pop     r12
-        ret
-sp_384_mul_d_avx2_6 ENDP
-_text ENDS
-ENDIF
-IFDEF _WIN64
-; /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
-;  *
-;  * d1   (rcx) The high order half of the number to divide.
-;  * d0   (rdx) The low order half of the number to divide.
-;  * div  (r8)  The divisor.
-;  * returns (rax) the quotient. The remainder is left in rdx.
-;  *
-;  * The div instruction raises a divide error when div is zero or when
-;  * the quotient does not fit in 64 bits, that is when d1 >= div.
-;  */
-_text SEGMENT READONLY PARA
-div_384_word_asm_6 PROC
-        ; div expects the number to divide in rdx:rax.
-        mov	rax, rdx
-        mov	rdx, rcx
-        div	r8
-        ret
-div_384_word_asm_6 ENDP
-_text ENDS
-ENDIF
-; /* Shift number right by 1 bit. (r = a >> 1)
-;  *
-;  * r  (rcx) Result of right shift by 1 (six 64-bit words).
-;  * a  (rdx) Number to shift (six 64-bit words).
-;  *
-;  * All of a is read before r is written. Only volatile registers are
-;  * used, so nothing is saved on the stack.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_rshift1_6 PROC
-        mov	rax, QWORD PTR [rdx]
-        mov	r8, QWORD PTR [rdx+8]
-        mov	r9, QWORD PTR [rdx+16]
-        mov	r10, QWORD PTR [rdx+24]
-        mov	r11, QWORD PTR [rdx+32]
-        ; Last use of the a pointer: rdx now holds a[5].
-        mov	rdx, QWORD PTR [rdx+40]
-        ; Each word takes its new top bit from the next higher word.
-        shrd	rax, r8, 1
-        shrd	r8, r9, 1
-        shrd	r9, r10, 1
-        shrd	r10, r11, 1
-        shrd	r11, rdx, 1
-        ; The top word shifts in a zero.
-        shr	rdx, 1
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r8
-        mov	QWORD PTR [rcx+16], r9
-        mov	QWORD PTR [rcx+24], r10
-        mov	QWORD PTR [rcx+32], r11
-        mov	QWORD PTR [rcx+40], rdx
-        ret
-sp_384_rshift1_6 ENDP
-_text ENDS
-; /* Divide the number by 2 mod the prime. (r = a / 2 % m)
-;  *
-;  * r  (rcx) Result of division by 2 (six 64-bit words).
-;  * a  (rdx) Number to divide (six 64-bit words).
-;  * m  (r8)  Modulus (six 64-bit words).
-;  *
-;  * When a is odd, m is added first so the sum is even; the carry out of
-;  * that addition becomes the top bit of the result. When a is even, m is
-;  * not read. The routine branches on the low bit of a.
-;  *
-;  * The modulus is added straight from memory, so r12 is the only
-;  * callee-saved register that has to be preserved.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_div2_mod_6 PROC
-        push	r12
-        mov	rax, QWORD PTR [rdx]
-        mov	r9, QWORD PTR [rdx+8]
-        mov	r10, QWORD PTR [rdx+16]
-        mov	r11, QWORD PTR [rdx+24]
-        mov	r12, QWORD PTR [rdx+32]
-        ; Last use of the a pointer: rdx now holds a[5].
-        mov	rdx, QWORD PTR [rdx+40]
-        test	al, 1
-        jz	L_384_mod_inv_6_div2_mod_no_add
-        add	rax, QWORD PTR [r8]
-        adc	r9, QWORD PTR [r8+8]
-        adc	r10, QWORD PTR [r8+16]
-        adc	r11, QWORD PTR [r8+24]
-        adc	r12, QWORD PTR [r8+32]
-        adc	rdx, QWORD PTR [r8+40]
-        ; The m pointer is finished with; reuse r8 for the carry out.
-        ; mov is used because it leaves the carry flag intact.
-        mov	r8d, 0
-        adc	r8d, 0
-        jmp	L_384_mod_inv_6_div2_mod_shift
-L_384_mod_inv_6_div2_mod_no_add:
-        ; Nothing was added, so a zero bit is shifted into the top word.
-        xor	r8d, r8d
-L_384_mod_inv_6_div2_mod_shift:
-        shrd	rax, r9, 1
-        shrd	r9, r10, 1
-        shrd	r10, r11, 1
-        shrd	r11, r12, 1
-        shrd	r12, rdx, 1
-        shrd	rdx, r8, 1
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r9
-        mov	QWORD PTR [rcx+16], r10
-        mov	QWORD PTR [rcx+24], r11
-        mov	QWORD PTR [rcx+32], r12
-        mov	QWORD PTR [rcx+40], rdx
-        pop	r12
-        ret
-sp_384_div2_mod_6 ENDP
-_text ENDS
-; /* Count the bits needed to represent a six-word integer.
-;  *
-;  * a  (rcx) Six 64-bit words, least significant word first.
-;  * returns (rax) 1 + the index of the highest set bit, or 0 when every
-;  *         word is zero.
-;  *
-;  * Words are scanned from the top down. eax is preloaded with 64 * i + 1
-;  * for the word being tested, and the bit index found by bsr in the
-;  * first non-zero word is added to it.
-;  */
-_text SEGMENT READONLY PARA
-sp_384_num_bits_6 PROC
-        mov	eax, 321
-        mov	rdx, QWORD PTR [rcx+40]
-        test	rdx, rdx
-        jnz	L_384_num_bits_6_found
-        mov	eax, 257
-        mov	rdx, QWORD PTR [rcx+32]
-        test	rdx, rdx
-        jnz	L_384_num_bits_6_found
-        mov	eax, 193
-        mov	rdx, QWORD PTR [rcx+24]
-        test	rdx, rdx
-        jnz	L_384_num_bits_6_found
-        mov	eax, 129
-        mov	rdx, QWORD PTR [rcx+16]
-        test	rdx, rdx
-        jnz	L_384_num_bits_6_found
-        mov	eax, 65
-        mov	rdx, QWORD PTR [rcx+8]
-        test	rdx, rdx
-        jnz	L_384_num_bits_6_found
-        mov	eax, 1
-        mov	rdx, QWORD PTR [rcx]
-        test	rdx, rdx
-        jnz	L_384_num_bits_6_found
-        ; Every word is zero.
-        xor	eax, eax
-        ret
-L_384_num_bits_6_found:
-        ; rdx is non-zero here, so bsr always produces a defined result.
-        bsr	rdx, rdx
-        add	rax, rdx
-        ret
-sp_384_num_bits_6 ENDP
-_text ENDS
-ENDIF
-IFDEF WOLFSSL_SP_521
-; /* Multiply two nine-word integers. (r = a * b)
-;  *
-;  * r  (rcx) Result; eighteen 64-bit words are written (r[0]..r[17]).
-;  * a  (rdx) A single precision integer (nine 64-bit words).
-;  * b  (r8)  A single precision integer (nine 64-bit words).
-;  *
-;  * The product is built one column at a time: column k adds up every
-;  * A[i] * B[j] with i + j = k into a three-word accumulator that rotates
-;  * through r10, r11 and r12. Result words 0..8 are collected in a 72 byte
-;  * stack buffer and copied to r at the end; words 9..17 are written to r
-;  * directly. NOTE(review): presumably this lets r share storage with a or
-;  * b - confirm against the callers.
-;  *
-;  * rdx is overwritten by every mul, so the pointer to a is kept in r9.
-;  */
-
-; First product of a column: acc2 is cleared here, after the mul, because
-; it still held the previous column's output word until that was stored.
-; ai and bi are word indices into a (r9) and b (r8).
-sp_521_mul_9_first MACRO ai, bi, acc0, acc1, acc2
-        mov     rax, QWORD PTR [r8+8*bi]
-        mul     QWORD PTR [r9+8*ai]
-        xor     acc2, acc2
-        add     acc0, rax
-        adc     acc1, rdx
-        adc     acc2, 0
-        ENDM
-
-; Every further product of a column: acc2:acc1:acc0 += A[ai] * B[bi].
-sp_521_mul_9_next MACRO ai, bi, acc0, acc1, acc2
-        mov     rax, QWORD PTR [r8+8*bi]
-        mul     QWORD PTR [r9+8*ai]
-        add     acc0, rax
-        adc     acc1, rdx
-        adc     acc2, 0
-        ENDM
-
-_text SEGMENT READONLY PARA
-sp_521_mul_9 PROC
-        push    r12
-        mov     r9, rdx                         ; r9 = a
-        sub     rsp, 72                         ; buffer for result words 0..8
-        ; Column 0: A[0] * B[0]
-        mov     rax, QWORD PTR [r8]
-        mul     QWORD PTR [r9]
-        xor     r12, r12
-        mov     QWORD PTR [rsp], rax
-        mov     r11, rdx
-        ; Column 1
-        sp_521_mul_9_first 0, 1, r11, r12, r10
-        sp_521_mul_9_next  1, 0, r11, r12, r10
-        mov     QWORD PTR [rsp+8], r11
-        ; Column 2
-        sp_521_mul_9_first 0, 2, r12, r10, r11
-        sp_521_mul_9_next  1, 1, r12, r10, r11
-        sp_521_mul_9_next  2, 0, r12, r10, r11
-        mov     QWORD PTR [rsp+16], r12
-        ; Column 3
-        sp_521_mul_9_first 0, 3, r10, r11, r12
-        sp_521_mul_9_next  1, 2, r10, r11, r12
-        sp_521_mul_9_next  2, 1, r10, r11, r12
-        sp_521_mul_9_next  3, 0, r10, r11, r12
-        mov     QWORD PTR [rsp+24], r10
-        ; Column 4
-        sp_521_mul_9_first 0, 4, r11, r12, r10
-        sp_521_mul_9_next  1, 3, r11, r12, r10
-        sp_521_mul_9_next  2, 2, r11, r12, r10
-        sp_521_mul_9_next  3, 1, r11, r12, r10
-        sp_521_mul_9_next  4, 0, r11, r12, r10
-        mov     QWORD PTR [rsp+32], r11
-        ; Column 5
-        sp_521_mul_9_first 0, 5, r12, r10, r11
-        sp_521_mul_9_next  1, 4, r12, r10, r11
-        sp_521_mul_9_next  2, 3, r12, r10, r11
-        sp_521_mul_9_next  3, 2, r12, r10, r11
-        sp_521_mul_9_next  4, 1, r12, r10, r11
-        sp_521_mul_9_next  5, 0, r12, r10, r11
-        mov     QWORD PTR [rsp+40], r12
-        ; Column 6
-        sp_521_mul_9_first 0, 6, r10, r11, r12
-        sp_521_mul_9_next  1, 5, r10, r11, r12
-        sp_521_mul_9_next  2, 4, r10, r11, r12
-        sp_521_mul_9_next  3, 3, r10, r11, r12
-        sp_521_mul_9_next  4, 2, r10, r11, r12
-        sp_521_mul_9_next  5, 1, r10, r11, r12
-        sp_521_mul_9_next  6, 0, r10, r11, r12
-        mov     QWORD PTR [rsp+48], r10
-        ; Column 7
-        sp_521_mul_9_first 0, 7, r11, r12, r10
-        sp_521_mul_9_next  1, 6, r11, r12, r10
-        sp_521_mul_9_next  2, 5, r11, r12, r10
-        sp_521_mul_9_next  3, 4, r11, r12, r10
-        sp_521_mul_9_next  4, 3, r11, r12, r10
-        sp_521_mul_9_next  5, 2, r11, r12, r10
-        sp_521_mul_9_next  6, 1, r11, r12, r10
-        sp_521_mul_9_next  7, 0, r11, r12, r10
-        mov     QWORD PTR [rsp+56], r11
-        ; Column 8
-        sp_521_mul_9_first 0, 8, r12, r10, r11
-        sp_521_mul_9_next  1, 7, r12, r10, r11
-        sp_521_mul_9_next  2, 6, r12, r10, r11
-        sp_521_mul_9_next  3, 5, r12, r10, r11
-        sp_521_mul_9_next  4, 4, r12, r10, r11
-        sp_521_mul_9_next  5, 3, r12, r10, r11
-        sp_521_mul_9_next  6, 2, r12, r10, r11
-        sp_521_mul_9_next  7, 1, r12, r10, r11
-        sp_521_mul_9_next  8, 0, r12, r10, r11
-        mov     QWORD PTR [rsp+64], r12
-        ; Column 9 (from here on the words go straight to r)
-        sp_521_mul_9_first 1, 8, r10, r11, r12
-        sp_521_mul_9_next  2, 7, r10, r11, r12
-        sp_521_mul_9_next  3, 6, r10, r11, r12
-        sp_521_mul_9_next  4, 5, r10, r11, r12
-        sp_521_mul_9_next  5, 4, r10, r11, r12
-        sp_521_mul_9_next  6, 3, r10, r11, r12
-        sp_521_mul_9_next  7, 2, r10, r11, r12
-        sp_521_mul_9_next  8, 1, r10, r11, r12
-        mov     QWORD PTR [rcx+72], r10
-        ; Column 10
-        sp_521_mul_9_first 2, 8, r11, r12, r10
-        sp_521_mul_9_next  3, 7, r11, r12, r10
-        sp_521_mul_9_next  4, 6, r11, r12, r10
-        sp_521_mul_9_next  5, 5, r11, r12, r10
-        sp_521_mul_9_next  6, 4, r11, r12, r10
-        sp_521_mul_9_next  7, 3, r11, r12, r10
-        sp_521_mul_9_next  8, 2, r11, r12, r10
-        mov     QWORD PTR [rcx+80], r11
-        ; Column 11
-        sp_521_mul_9_first 3, 8, r12, r10, r11
-        sp_521_mul_9_next  4, 7, r12, r10, r11
-        sp_521_mul_9_next  5, 6, r12, r10, r11
-        sp_521_mul_9_next  6, 5, r12, r10, r11
-        sp_521_mul_9_next  7, 4, r12, r10, r11
-        sp_521_mul_9_next  8, 3, r12, r10, r11
-        mov     QWORD PTR [rcx+88], r12
-        ; Column 12
-        sp_521_mul_9_first 4, 8, r10, r11, r12
-        sp_521_mul_9_next  5, 7, r10, r11, r12
-        sp_521_mul_9_next  6, 6, r10, r11, r12
-        sp_521_mul_9_next  7, 5, r10, r11, r12
-        sp_521_mul_9_next  8, 4, r10, r11, r12
-        mov     QWORD PTR [rcx+96], r10
-        ; Column 13
-        sp_521_mul_9_first 5, 8, r11, r12, r10
-        sp_521_mul_9_next  6, 7, r11, r12, r10
-        sp_521_mul_9_next  7, 6, r11, r12, r10
-        sp_521_mul_9_next  8, 5, r11, r12, r10
-        mov     QWORD PTR [rcx+104], r11
-        ; Column 14
-        sp_521_mul_9_first 6, 8, r12, r10, r11
-        sp_521_mul_9_next  7, 7, r12, r10, r11
-        sp_521_mul_9_next  8, 6, r12, r10, r11
-        mov     QWORD PTR [rcx+112], r12
-        ; Column 15
-        sp_521_mul_9_first 7, 8, r10, r11, r12
-        sp_521_mul_9_next  8, 7, r10, r11, r12
-        mov     QWORD PTR [rcx+120], r10
-        ; Column 16: A[8] * B[8] completes the two top words
-        mov     rax, QWORD PTR [r8+64]
-        mul     QWORD PTR [r9+64]
-        add     r11, rax
-        adc     r12, rdx
-        mov     QWORD PTR [rcx+128], r11
-        mov     QWORD PTR [rcx+136], r12
-        ; Copy result words 0..8 from the stack buffer into r.
-        mov     rax, QWORD PTR [rsp]
-        mov     rdx, QWORD PTR [rsp+8]
-        mov     r10, QWORD PTR [rsp+16]
-        mov     r11, QWORD PTR [rsp+24]
-        mov     QWORD PTR [rcx], rax
-        mov     QWORD PTR [rcx+8], rdx
-        mov     QWORD PTR [rcx+16], r10
-        mov     QWORD PTR [rcx+24], r11
-        mov     rax, QWORD PTR [rsp+32]
-        mov     rdx, QWORD PTR [rsp+40]
-        mov     r10, QWORD PTR [rsp+48]
-        mov     r11, QWORD PTR [rsp+56]
-        mov     QWORD PTR [rcx+32], rax
-        mov     QWORD PTR [rcx+40], rdx
-        mov     QWORD PTR [rcx+48], r10
-        mov     QWORD PTR [rcx+56], r11
-        mov     rax, QWORD PTR [rsp+64]
-        mov     QWORD PTR [rcx+64], rax
-        add     rsp, 72
-        pop     r12
-        ret
-sp_521_mul_9 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Multiply a and b into r. (r = a * b)
-;  *
-;  * r   Result of multiplication.
-;  * a   First number to multiply.
-;  * b   Second number to multiply.
-;  */
-_text SEGMENT READONLY PARA
-sp_521_mul_avx2_9 PROC
-        push	rbx
-        push	rbp
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        mov	rbp, r8
-        mov	r8, rcx
-        mov	r9, rdx
-        sub	rsp, 72
-        cmp	r9, r8
-        mov	rbx, rsp
-        cmovne	rbx, r8
-        cmp	rbp, r8
-        cmove	rbx, rsp
-        add	r8, 72
-        xor	r15, r15
-        mov	rdx, QWORD PTR [r9]
-        ; A[0] * B[0]
-        mulx	r11, r10, QWORD PTR [rbp]
-        ; A[0] * B[1]
-        mulx	r12, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx], r10
-        adcx	r11, rax
-        ; A[0] * B[2]
-        mulx	r13, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+8], r11
-        adcx	r12, rax
-        mov	QWORD PTR [rbx+16], r12
-        ; A[0] * B[3]
-        mulx	r10, rax, QWORD PTR [rbp+24]
-        adcx	r13, rax
-        ; A[0] * B[4]
-        mulx	r11, rax, QWORD PTR [rbp+32]
-        mov	QWORD PTR [rbx+24], r13
-        adcx	r10, rax
-        ; A[0] * B[5]
-        mulx	r12, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+32], r10
-        adcx	r11, rax
-        mov	QWORD PTR [rbx+40], r11
-        ; A[0] * B[6]
-        mulx	r13, rax, QWORD PTR [rbp+48]
-        adcx	r12, rax
-        ; A[0] * B[7]
-        mulx	r10, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+48], r12
-        adcx	r13, rax
-        ; A[0] * B[8]
-        mulx	r11, rax, QWORD PTR [rbp+64]
-        mov	QWORD PTR [rbx+56], r13
-        adcx	r10, rax
-        adcx	r11, r15
-        mov	r14, r15
-        adcx	r14, r15
-        mov	QWORD PTR [rbx+64], r10
-        mov	QWORD PTR [r8], r11
-        mov	rdx, QWORD PTR [r9+8]
-        mov	r11, QWORD PTR [rbx+8]
-        mov	r12, QWORD PTR [rbx+16]
-        mov	r13, QWORD PTR [rbx+24]
-        mov	r10, QWORD PTR [rbx+32]
-        ; A[1] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[1] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+8], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[1] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+16], r12
-        adcx	r13, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+24], r13
-        mov	r11, QWORD PTR [rbx+40]
-        mov	r12, QWORD PTR [rbx+48]
-        mov	r13, QWORD PTR [rbx+56]
-        ; A[1] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[1] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        mov	QWORD PTR [rbx+32], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[1] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+40], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [rbx+48], r12
-        mov	r10, QWORD PTR [rbx+64]
-        mov	r11, QWORD PTR [r8]
-        ; A[1] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        adcx	r13, rax
-        adox	r10, rcx
-        ; A[1] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+56], r13
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[1] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        mov	QWORD PTR [rbx+64], r10
-        mov	r12, r15
-        adcx	r11, rax
-        adox	r12, rcx
-        adcx	r12, r14
-        mov	r14, r15
-        adox	r14, r15
-        adcx	r14, r15
-        mov	QWORD PTR [r8], r11
-        mov	QWORD PTR [r8+8], r12
-        mov	rdx, QWORD PTR [r9+16]
-        mov	r12, QWORD PTR [rbx+16]
-        mov	r13, QWORD PTR [rbx+24]
-        mov	r10, QWORD PTR [rbx+32]
-        mov	r11, QWORD PTR [rbx+40]
-        ; A[2] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[2] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+16], r12
-        adcx	r13, rax
-        adox	r10, rcx
-        ; A[2] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+24], r13
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbx+32], r10
-        mov	r12, QWORD PTR [rbx+48]
-        mov	r13, QWORD PTR [rbx+56]
-        mov	r10, QWORD PTR [rbx+64]
-        ; A[2] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[2] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        mov	QWORD PTR [rbx+40], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[2] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+48], r12
-        adcx	r13, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+56], r13
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        ; A[2] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[2] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+64], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[2] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        mov	QWORD PTR [r8], r11
-        mov	r13, r15
-        adcx	r12, rax
-        adox	r13, rcx
-        adcx	r13, r14
-        mov	r14, r15
-        adox	r14, r15
-        adcx	r14, r15
-        mov	QWORD PTR [r8+8], r12
-        mov	QWORD PTR [r8+16], r13
-        mov	rdx, QWORD PTR [r9+24]
-        mov	r13, QWORD PTR [rbx+24]
-        mov	r10, QWORD PTR [rbx+32]
-        mov	r11, QWORD PTR [rbx+40]
-        mov	r12, QWORD PTR [rbx+48]
-        ; A[3] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r13, rax
-        adox	r10, rcx
-        ; A[3] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+24], r13
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[3] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+32], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbx+40], r11
-        mov	r13, QWORD PTR [rbx+56]
-        mov	r10, QWORD PTR [rbx+64]
-        mov	r11, QWORD PTR [r8]
-        ; A[3] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[3] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        mov	QWORD PTR [rbx+48], r12
-        adcx	r13, rax
-        adox	r10, rcx
-        ; A[3] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+56], r13
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbx+64], r10
-        mov	r12, QWORD PTR [r8+8]
-        mov	r13, QWORD PTR [r8+16]
-        ; A[3] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[3] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [r8], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[3] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        mov	QWORD PTR [r8+8], r12
-        mov	r10, r15
-        adcx	r13, rax
-        adox	r10, rcx
-        adcx	r10, r14
-        mov	r14, r15
-        adox	r14, r15
-        adcx	r14, r15
-        mov	QWORD PTR [r8+16], r13
-        mov	QWORD PTR [r8+24], r10
-        mov	rdx, QWORD PTR [r9+32]
-        mov	r10, QWORD PTR [rbx+32]
-        mov	r11, QWORD PTR [rbx+40]
-        mov	r12, QWORD PTR [rbx+48]
-        mov	r13, QWORD PTR [rbx+56]
-        ; A[4] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[4] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+32], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[4] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+40], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [rbx+48], r12
-        mov	r10, QWORD PTR [rbx+64]
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        ; A[4] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        adcx	r13, rax
-        adox	r10, rcx
-        ; A[4] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        mov	QWORD PTR [rbx+56], r13
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[4] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+64], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r8], r11
-        mov	r13, QWORD PTR [r8+16]
-        mov	r10, QWORD PTR [r8+24]
-        ; A[4] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[4] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [r8+8], r12
-        adcx	r13, rax
-        adox	r10, rcx
-        ; A[4] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        mov	QWORD PTR [r8+16], r13
-        mov	r11, r15
-        adcx	r10, rax
-        adox	r11, rcx
-        adcx	r11, r14
-        mov	r14, r15
-        adox	r14, r15
-        adcx	r14, r15
-        mov	QWORD PTR [r8+24], r10
-        mov	QWORD PTR [r8+32], r11
-        mov	rdx, QWORD PTR [r9+40]
-        mov	r11, QWORD PTR [rbx+40]
-        mov	r12, QWORD PTR [rbx+48]
-        mov	r13, QWORD PTR [rbx+56]
-        mov	r10, QWORD PTR [rbx+64]
-        ; A[5] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[5] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+40], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[5] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+48], r12
-        adcx	r13, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+56], r13
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        mov	r13, QWORD PTR [r8+16]
-        ; A[5] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[5] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        mov	QWORD PTR [rbx+64], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[5] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [r8], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r8+8], r12
-        mov	r10, QWORD PTR [r8+24]
-        mov	r11, QWORD PTR [r8+32]
-        ; A[5] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        adcx	r13, rax
-        adox	r10, rcx
-        ; A[5] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [r8+16], r13
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[5] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        mov	QWORD PTR [r8+24], r10
-        mov	r12, r15
-        adcx	r11, rax
-        adox	r12, rcx
-        adcx	r12, r14
-        mov	r14, r15
-        adox	r14, r15
-        adcx	r14, r15
-        mov	QWORD PTR [r8+32], r11
-        mov	QWORD PTR [r8+40], r12
-        mov	rdx, QWORD PTR [r9+48]
-        mov	r12, QWORD PTR [rbx+48]
-        mov	r13, QWORD PTR [rbx+56]
-        mov	r10, QWORD PTR [rbx+64]
-        mov	r11, QWORD PTR [r8]
-        ; A[6] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[6] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+48], r12
-        adcx	r13, rax
-        adox	r10, rcx
-        ; A[6] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+56], r13
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbx+64], r10
-        mov	r12, QWORD PTR [r8+8]
-        mov	r13, QWORD PTR [r8+16]
-        mov	r10, QWORD PTR [r8+24]
-        ; A[6] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[6] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        mov	QWORD PTR [r8], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[6] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [r8+8], r12
-        adcx	r13, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+16], r13
-        mov	r11, QWORD PTR [r8+32]
-        mov	r12, QWORD PTR [r8+40]
-        ; A[6] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[6] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [r8+24], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[6] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        mov	QWORD PTR [r8+32], r11
-        mov	r13, r15
-        adcx	r12, rax
-        adox	r13, rcx
-        adcx	r13, r14
-        mov	r14, r15
-        adox	r14, r15
-        adcx	r14, r15
-        mov	QWORD PTR [r8+40], r12
-        mov	QWORD PTR [r8+48], r13
-        mov	rdx, QWORD PTR [r9+56]
-        mov	r13, QWORD PTR [rbx+56]
-        mov	r10, QWORD PTR [rbx+64]
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        ; A[7] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r13, rax
-        adox	r10, rcx
-        ; A[7] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+56], r13
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[7] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+64], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r8], r11
-        mov	r13, QWORD PTR [r8+16]
-        mov	r10, QWORD PTR [r8+24]
-        mov	r11, QWORD PTR [r8+32]
-        ; A[7] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[7] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        mov	QWORD PTR [r8+8], r12
-        adcx	r13, rax
-        adox	r10, rcx
-        ; A[7] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [r8+16], r13
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+24], r10
-        mov	r12, QWORD PTR [r8+40]
-        mov	r13, QWORD PTR [r8+48]
-        ; A[7] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[7] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [r8+32], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[7] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        mov	QWORD PTR [r8+40], r12
-        mov	r10, r15
-        adcx	r13, rax
-        adox	r10, rcx
-        adcx	r10, r14
-        mov	r14, r15
-        adox	r14, r15
-        adcx	r14, r15
-        mov	QWORD PTR [r8+48], r13
-        mov	QWORD PTR [r8+56], r10
-        mov	rdx, QWORD PTR [r9+64]
-        mov	r10, QWORD PTR [rbx+64]
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        mov	r13, QWORD PTR [r8+16]
-        ; A[8] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[8] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+64], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[8] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [r8], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r8+8], r12
-        mov	r10, QWORD PTR [r8+24]
-        mov	r11, QWORD PTR [r8+32]
-        mov	r12, QWORD PTR [r8+40]
-        ; A[8] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        adcx	r13, rax
-        adox	r10, rcx
-        ; A[8] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        mov	QWORD PTR [r8+16], r13
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[8] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [r8+24], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r8+32], r11
-        mov	r13, QWORD PTR [r8+48]
-        mov	r10, QWORD PTR [r8+56]
-        ; A[8] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[8] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [r8+40], r12
-        adcx	r13, rax
-        adox	r10, rcx
-        ; A[8] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        mov	QWORD PTR [r8+48], r13
-        mov	r11, r15
-        adcx	r10, rax
-        adox	r11, rcx
-        adcx	r11, r14
-        mov	QWORD PTR [r8+56], r10
-        mov	QWORD PTR [r8+64], r11
-        sub	r8, 72
-        cmp	r9, r8
-        je	L_start_521_mul_avx2_9
-        cmp	rbp, r8
-        jne	L_end_521_mul_avx2_9
-L_start_521_mul_avx2_9:
-        vmovdqu	xmm0, OWORD PTR [rbx]
-        vmovups	OWORD PTR [r8], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbx+16]
-        vmovups	OWORD PTR [r8+16], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbx+32]
-        vmovups	OWORD PTR [r8+32], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbx+48]
-        vmovups	OWORD PTR [r8+48], xmm0
-        mov	rax, QWORD PTR [rbx+64]
-        mov	QWORD PTR [r8+64], rax
-L_end_521_mul_avx2_9:
-        add	rsp, 72
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        pop	rbp
-        pop	rbx
-        ret
-sp_521_mul_avx2_9 ENDP
-_text ENDS
-ENDIF
-; /* Square a and put result in r. (r = a * a)
-;  *
-;  * Column-by-column squaring built from MUL/ADD/ADC: for each result word k
-;  * the products A[i] * A[j] with i + j = k are summed, counting every
-;  * off-diagonal product (i != j) twice and every square (i == j) once.
-;  *
-;  * r  A single precision integer. Pointer is taken from rcx; the 18 words
-;  *    at byte offsets 0..136 are written.
-;  * a  A single precision integer. Pointer is taken from rdx; the 9 words
-;  *    at byte offsets 0..64 are read.
-;  *
-;  * Register use:
-;  *   r8           copy of a (rdx is overwritten by every MUL)
-;  *   r9/r10/r11   rotating three-word column accumulator
-;  *   r12/r13/r14  sum of a column's cross products (columns 5..11 only);
-;  *                pushed on entry and popped before returning
-;  *   rax/rdx      MUL operand and 128-bit product
-;  * Stack: 72 bytes buffer result words 0..8 until all reads of a are done;
-;  * result words 9..17 are stored straight to r.
-;  * NOTE(review): the buffering presumably exists so that r may alias a -
-;  * confirm against callers.
-;  */
-_text SEGMENT READONLY PARA
-sp_521_sqr_9 PROC
-        push	r12
-        push	r13
-        push	r14
-        ; Keep a in r8: MUL leaves the high half of each product in rdx
-        mov	r8, rdx
-        ; Scratch buffer for result words 0..8
-        sub	rsp, 72
-        ; Column 0: low half -> tmp[0], high half seeds r10
-        ; A[0] * A[0]
-        mov	rax, QWORD PTR [r8]
-        mul	rax
-        xor	r11, r11
-        mov	QWORD PTR [rsp], rax
-        mov	r10, rdx
-        ; Column 1: accumulator is r10 (low), r11, r9 (carry word)
-        ; A[0] * A[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r8]
-        ; Clear the register that becomes the accumulator's top word
-        xor	r9, r9
-        ; Off-diagonal product is added twice
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        mov	QWORD PTR [rsp+8], r10
-        ; Column 2: accumulator is r11 (low), r9, r10
-        ; A[0] * A[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r8]
-        xor	r10, r10
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        ; Square term is added once
-        ; A[1] * A[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	rax
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rsp+16], r11
-        ; Column 3: accumulator is r9 (low), r10, r11
-        ; A[0] * A[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r8]
-        xor	r11, r11
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[1] * A[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r8+8]
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rsp+24], r9
-        ; Column 4: accumulator is r10 (low), r11, r9
-        ; A[0] * A[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r8]
-        xor	r9, r9
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        ; A[1] * A[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r8+8]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        ; A[2] * A[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	rax
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        mov	QWORD PTR [rsp+32], r10
-        ; Column 5: accumulator is r11 (low), r9, r10.
-        ; From here to column 11 the cross products are first summed in
-        ; r12 (low), r13, r14, doubled once, then added to the accumulator.
-        ; A[0] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8]
-        xor	r10, r10
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; Double the cross-product sum
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        ; Fold it into the column accumulator
-        add	r11, r12
-        adc	r9, r13
-        adc	r10, r14
-        mov	QWORD PTR [rsp+40], r11
-        ; Column 6: accumulator is r9 (low), r10, r11
-        ; A[0] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8]
-        xor	r11, r11
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	rax
-        ; Double the cross products, then add the square (in rdx:rax) once
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r9, r12
-        adc	r10, r13
-        adc	r11, r14
-        mov	QWORD PTR [rsp+48], r9
-        ; Column 7: accumulator is r10 (low), r11, r9
-        ; A[0] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8]
-        xor	r9, r9
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; Double the cross-product sum
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r10, r12
-        adc	r11, r13
-        adc	r9, r14
-        mov	QWORD PTR [rsp+56], r10
-        ; Column 8: accumulator is r11 (low), r9, r10
-        ; A[0] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8]
-        xor	r10, r10
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * A[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	rax
-        ; Double the cross products, then add the square (in rdx:rax) once
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r11, r12
-        adc	r9, r13
-        adc	r10, r14
-        mov	QWORD PTR [rsp+64], r11
-        ; Column 9: accumulator is r9 (low), r10, r11.
-        ; From this column on the result word is stored directly to r.
-        ; A[1] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+8]
-        xor	r11, r11
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[2] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8+32]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; Double the cross-product sum
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r9, r12
-        adc	r10, r13
-        adc	r11, r14
-        mov	QWORD PTR [rcx+72], r9
-        ; Column 10: accumulator is r10 (low), r11, r9
-        ; A[2] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+16]
-        xor	r9, r9
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[3] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8+32]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[5] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	rax
-        ; Double the cross products, then add the square (in rdx:rax) once
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r10, r12
-        adc	r11, r13
-        adc	r9, r14
-        mov	QWORD PTR [rcx+80], r10
-        ; Column 11: accumulator is r11 (low), r9, r10
-        ; A[3] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+24]
-        xor	r10, r10
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[4] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+32]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[5] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8+40]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; Double the cross-product sum
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r11, r12
-        adc	r9, r13
-        adc	r10, r14
-        mov	QWORD PTR [rcx+88], r11
-        ; Column 12: accumulator is r9 (low), r10, r11.
-        ; The remaining columns add each cross product twice directly.
-        ; A[4] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+32]
-        xor	r11, r11
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[5] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+40]
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[6] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	rax
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rcx+96], r9
-        ; Column 13: accumulator is r10 (low), r11, r9
-        ; A[5] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+40]
-        xor	r9, r9
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        ; A[6] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+48]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        mov	QWORD PTR [rcx+104], r10
-        ; Column 14: accumulator is r11 (low), r9, r10
-        ; A[6] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+48]
-        xor	r10, r10
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        ; A[7] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	rax
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rcx+112], r11
-        ; Column 15: accumulator is r9 (low), r10, r11
-        ; A[7] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+56]
-        xor	r11, r11
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rcx+120], r9
-        ; Columns 16 and 17: r10 and r11 become the two top result words;
-        ; no further carry word is kept
-        ; A[8] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	rax
-        add	r10, rax
-        adc	r11, rdx
-        mov	QWORD PTR [rcx+128], r10
-        mov	QWORD PTR [rcx+136], r11
-        ; All reads of a are finished: copy buffered words 0..8 from the
-        ; stack into r, four at a time through rax, rdx, r12, r13
-        mov	rax, QWORD PTR [rsp]
-        mov	rdx, QWORD PTR [rsp+8]
-        mov	r12, QWORD PTR [rsp+16]
-        mov	r13, QWORD PTR [rsp+24]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], rdx
-        mov	QWORD PTR [rcx+16], r12
-        mov	QWORD PTR [rcx+24], r13
-        mov	rax, QWORD PTR [rsp+32]
-        mov	rdx, QWORD PTR [rsp+40]
-        mov	r12, QWORD PTR [rsp+48]
-        mov	r13, QWORD PTR [rsp+56]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], rdx
-        mov	QWORD PTR [rcx+48], r12
-        mov	QWORD PTR [rcx+56], r13
-        mov	rax, QWORD PTR [rsp+64]
-        mov	QWORD PTR [rcx+64], rax
-        ; Release the scratch buffer and restore the saved registers
-        add	rsp, 72
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_521_sqr_9 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Square a and put result in r. (r = a * a)
-;  *
-;  * r  A single precision integer.
-;  * a  A single precision integer.
-;  */
-_text SEGMENT READONLY PARA
-sp_521_sqr_avx2_9 PROC
-        push	rbp
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        push	rbx
-        mov	r8, rcx
-        mov	r9, rdx
-        sub	rsp, 72
-        cmp	r9, r8
-        mov	rbp, rsp
-        cmovne	rbp, r8
-        add	r8, 72
-        xor	r12, r12
-        ; Diagonal 1
-        ; Zero into %r9
-        ; A[1] x A[0]
-        mov	rdx, QWORD PTR [r9]
-        mulx	r11, r10, QWORD PTR [r9+8]
-        mov	QWORD PTR [rbp+8], r10
-        ; Zero into %r8
-        ; A[2] x A[0]
-        mulx	r10, rax, QWORD PTR [r9+16]
-        adcx	r11, rax
-        adox	r10, r12
-        mov	QWORD PTR [rbp+16], r11
-        ; No load %r12 - %r9
-        ; A[3] x A[0]
-        mulx	r14, rax, QWORD PTR [r9+24]
-        adcx	r10, rax
-        adox	r14, r12
-        mov	QWORD PTR [rbp+24], r10
-        ; No load %r13 - %r8
-        ; A[4] x A[0]
-        mulx	r15, rax, QWORD PTR [r9+32]
-        adcx	r14, rax
-        adox	r15, r12
-        ; No store %r12 - %r9
-        ; No load %r14 - %r9
-        ; A[5] x A[0]
-        mulx	rdi, rax, QWORD PTR [r9+40]
-        adcx	r15, rax
-        adox	rdi, r12
-        ; No store %r13 - %r8
-        ; No load %r15 - %r8
-        ; A[6] x A[0]
-        mulx	rsi, rax, QWORD PTR [r9+48]
-        adcx	rdi, rax
-        adox	rsi, r12
-        ; No store %r14 - %r9
-        ; No load %rbx - %r9
-        ; A[7] x A[0]
-        mulx	rbx, rax, QWORD PTR [r9+56]
-        adcx	rsi, rax
-        adox	rbx, r12
-        ; No store %r15 - %r8
-        ; Zero into %r8
-        ; A[8] x A[0]
-        mulx	r10, rax, QWORD PTR [r9+64]
-        adcx	rbx, rax
-        adox	r10, r12
-        ; No store %rbx - %r9
-        ; Zero into %r9
-        ; A[8] x A[1]
-        mov	rdx, QWORD PTR [r9+8]
-        mulx	r11, rax, QWORD PTR [r9+64]
-        adcx	r10, rax
-        adox	r11, r12
-        mov	QWORD PTR [r8], r10
-        ;  Carry
-        adcx	r11, r12
-        mov	r13, r12
-        adcx	r13, r12
-        adox	r13, r12
-        mov	QWORD PTR [r8+8], r11
-        ; Diagonal 2
-        mov	r11, QWORD PTR [rbp+24]
-        ; No load %r12 - %r8
-        ; A[2] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+16]
-        adcx	r11, rax
-        adox	r14, rcx
-        mov	QWORD PTR [rbp+24], r11
-        ; No load %r13 - %r9
-        ; A[3] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+24]
-        adcx	r14, rax
-        adox	r15, rcx
-        ; No store %r12 - %r8
-        ; No load %r14 - %r8
-        ; A[4] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+32]
-        adcx	r15, rax
-        adox	rdi, rcx
-        ; No store %r13 - %r9
-        ; No load %r15 - %r9
-        ; A[5] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+40]
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; No store %r14 - %r8
-        ; No load %rbx - %r8
-        ; A[6] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+48]
-        adcx	rsi, rax
-        adox	rbx, rcx
-        ; No store %r15 - %r9
-        mov	r11, QWORD PTR [r8]
-        ; A[7] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+56]
-        adcx	rbx, rax
-        adox	r11, rcx
-        ; No store %rbx - %r8
-        mov	r10, QWORD PTR [r8+8]
-        ; A[7] x A[2]
-        mov	rdx, QWORD PTR [r9+16]
-        mulx	rcx, rax, QWORD PTR [r9+56]
-        adcx	r11, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8], r11
-        ; Zero into %r9
-        ; A[7] x A[3]
-        mov	rdx, QWORD PTR [r9+24]
-        mulx	r11, rax, QWORD PTR [r9+56]
-        adcx	r10, rax
-        adox	r11, r12
-        mov	QWORD PTR [r8+8], r10
-        ; Zero into %r8
-        ; A[7] x A[4]
-        mov	rdx, QWORD PTR [r9+32]
-        mulx	r10, rax, QWORD PTR [r9+56]
-        adcx	r11, rax
-        adox	r10, r12
-        mov	QWORD PTR [r8+16], r11
-        ;  Carry
-        adcx	r10, r13
-        mov	r13, r12
-        adcx	r13, r12
-        adox	r13, r12
-        mov	QWORD PTR [r8+24], r10
-        ; Diagonal 3
-        ; No load %r14 - %r9
-        ; A[3] x A[2]
-        mov	rdx, QWORD PTR [r9+16]
-        mulx	rcx, rax, QWORD PTR [r9+24]
-        adcx	r15, rax
-        adox	rdi, rcx
-        ; No store %r13 - %r8
-        ; No load %r15 - %r8
-        ; A[4] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+32]
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; No store %r14 - %r9
-        ; No load %rbx - %r9
-        ; A[5] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+40]
-        adcx	rsi, rax
-        adox	rbx, rcx
-        ; No store %r15 - %r8
-        mov	r10, QWORD PTR [r8]
-        ; A[6] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+48]
-        adcx	rbx, rax
-        adox	r10, rcx
-        ; No store %rbx - %r9
-        mov	r11, QWORD PTR [r8+8]
-        ; A[6] x A[3]
-        mov	rdx, QWORD PTR [r9+24]
-        mulx	rcx, rax, QWORD PTR [r9+48]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8], r10
-        mov	r10, QWORD PTR [r8+16]
-        ; A[6] x A[4]
-        mov	rdx, QWORD PTR [r9+32]
-        mulx	rcx, rax, QWORD PTR [r9+48]
-        adcx	r11, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+8], r11
-        mov	r11, QWORD PTR [r8+24]
-        ; A[6] x A[5]
-        mov	rdx, QWORD PTR [r9+40]
-        mulx	rcx, rax, QWORD PTR [r9+48]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+16], r10
-        ; Zero into %r8
-        ; A[8] x A[4]
-        mov	rdx, QWORD PTR [r9+32]
-        mulx	r10, rax, QWORD PTR [r9+64]
-        adcx	r11, rax
-        adox	r10, r12
-        mov	QWORD PTR [r8+24], r11
-        ; Zero into %r9
-        ; A[8] x A[5]
-        mov	rdx, QWORD PTR [r9+40]
-        mulx	r11, rax, QWORD PTR [r9+64]
-        adcx	r10, rax
-        adox	r11, r12
-        mov	QWORD PTR [r8+32], r10
-        ;  Carry
-        adcx	r11, r13
-        mov	r13, r12
-        adcx	r13, r12
-        adox	r13, r12
-        mov	QWORD PTR [r8+40], r11
-        ; Diagonal 4
-        ; No load %rbx - %r8
-        ; A[4] x A[3]
-        mov	rdx, QWORD PTR [r9+24]
-        mulx	rcx, rax, QWORD PTR [r9+32]
-        adcx	rsi, rax
-        adox	rbx, rcx
-        ; No store %r15 - %r9
-        mov	r11, QWORD PTR [r8]
-        ; A[5] x A[3]
-        mulx	rcx, rax, QWORD PTR [r9+40]
-        adcx	rbx, rax
-        adox	r11, rcx
-        ; No store %rbx - %r8
-        mov	r10, QWORD PTR [r8+8]
-        ; A[5] x A[4]
-        mov	rdx, QWORD PTR [r9+32]
-        mulx	rcx, rax, QWORD PTR [r9+40]
-        adcx	r11, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8], r11
-        mov	r11, QWORD PTR [r8+16]
-        ; A[8] x A[2]
-        mov	rdx, QWORD PTR [r9+16]
-        mulx	rcx, rax, QWORD PTR [r9+64]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+8], r10
-        mov	r10, QWORD PTR [r8+24]
-        ; A[8] x A[3]
-        mov	rdx, QWORD PTR [r9+24]
-        mulx	rcx, rax, QWORD PTR [r9+64]
-        adcx	r11, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+16], r11
-        mov	r11, QWORD PTR [r8+32]
-        ; A[7] x A[5]
-        mov	rdx, QWORD PTR [r9+40]
-        mulx	rcx, rax, QWORD PTR [r9+56]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+24], r10
-        mov	r10, QWORD PTR [r8+40]
-        ; A[7] x A[6]
-        mov	rdx, QWORD PTR [r9+48]
-        mulx	rcx, rax, QWORD PTR [r9+56]
-        adcx	r11, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+32], r11
-        ; Zero into %r9
-        ; A[8] x A[6]
-        mulx	r11, rax, QWORD PTR [r9+64]
-        adcx	r10, rax
-        adox	r11, r12
-        mov	QWORD PTR [r8+40], r10
-        ; Zero into %r8
-        ; A[8] x A[7]
-        mov	rdx, QWORD PTR [r9+56]
-        mulx	r10, rax, QWORD PTR [r9+64]
-        adcx	r11, rax
-        adox	r10, r12
-        mov	QWORD PTR [r8+48], r11
-        ;  Carry
-        adcx	r10, r13
-        mov	r13, r12
-        adcx	r13, r12
-        adox	r13, r12
-        mov	QWORD PTR [r8+56], r10
-        mov	QWORD PTR [r8+64], r13
-        ; Double and Add in A[i] x A[i]
-        mov	r11, QWORD PTR [rbp+8]
-        ; A[0] x A[0]
-        mov	rdx, QWORD PTR [r9]
-        mulx	rcx, rax, rdx
-        mov	QWORD PTR [rbp], rax
-        adox	r11, r11
-        adcx	r11, rcx
-        mov	QWORD PTR [rbp+8], r11
-        mov	r10, QWORD PTR [rbp+16]
-        mov	r11, QWORD PTR [rbp+24]
-        ; A[1] x A[1]
-        mov	rdx, QWORD PTR [r9+8]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [rbp+16], r10
-        mov	QWORD PTR [rbp+24], r11
-        ; A[2] x A[2]
-        mov	rdx, QWORD PTR [r9+16]
-        mulx	rcx, rax, rdx
-        adox	r14, r14
-        adox	r15, r15
-        adcx	r14, rax
-        adcx	r15, rcx
-        ; A[3] x A[3]
-        mov	rdx, QWORD PTR [r9+24]
-        mulx	rcx, rax, rdx
-        adox	rdi, rdi
-        adox	rsi, rsi
-        adcx	rdi, rax
-        adcx	rsi, rcx
-        mov	r11, QWORD PTR [r8]
-        ; A[4] x A[4]
-        mov	rdx, QWORD PTR [r9+32]
-        mulx	rcx, rax, rdx
-        adox	rbx, rbx
-        adox	r11, r11
-        adcx	rbx, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [r8], r11
-        mov	r10, QWORD PTR [r8+8]
-        mov	r11, QWORD PTR [r8+16]
-        ; A[5] x A[5]
-        mov	rdx, QWORD PTR [r9+40]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [r8+8], r10
-        mov	QWORD PTR [r8+16], r11
-        mov	r10, QWORD PTR [r8+24]
-        mov	r11, QWORD PTR [r8+32]
-        ; A[6] x A[6]
-        mov	rdx, QWORD PTR [r9+48]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [r8+24], r10
-        mov	QWORD PTR [r8+32], r11
-        mov	r10, QWORD PTR [r8+40]
-        mov	r11, QWORD PTR [r8+48]
-        ; A[7] x A[7]
-        mov	rdx, QWORD PTR [r9+56]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [r8+40], r10
-        mov	QWORD PTR [r8+48], r11
-        mov	r10, QWORD PTR [r8+56]
-        mov	r11, QWORD PTR [r8+64]
-        ; A[8] x A[8]
-        mov	rdx, QWORD PTR [r9+64]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [r8+56], r10
-        mov	QWORD PTR [r8+64], r11
-        mov	QWORD PTR [r8+-40], r14
-        mov	QWORD PTR [r8+-32], r15
-        mov	QWORD PTR [r8+-24], rdi
-        mov	QWORD PTR [r8+-16], rsi
-        mov	QWORD PTR [r8+-8], rbx
-        sub	r8, 72
-        cmp	r9, r8
-        jne	L_end_521_sqr_avx2_9
-        vmovdqu	xmm0, OWORD PTR [rbp]
-        vmovups	OWORD PTR [r8], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbp+16]
-        vmovups	OWORD PTR [r8+16], xmm0
-L_end_521_sqr_avx2_9:
-        add	rsp, 72
-        pop	rbx
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        pop	rbp
-        ret
-sp_521_sqr_avx2_9 ENDP
-_text ENDS
-ENDIF
-; /* Nine-limb addition: r = a + b.
-;  *
-;  * Limbs are 64 bits wide and stored least-significant first, so the carry
-;  * travels from offset 0 up to offset 64.
-;  *
-;  * r  Destination, 9 limbs (pointer arrives in rcx).
-;  * a  First addend, 9 limbs (pointer arrives in rdx).
-;  * b  Second addend, 9 limbs (pointer arrives in r8).
-;  *
-;  * Returns the carry out of the top limb (0 or 1) in rax.
-;  * Scratch: r10, r11, flags.
-;  */
-_text SEGMENT READONLY PARA
-sp_521_add_9 PROC
-        ; Zero the return value up front: XOR rewrites the flags, so it must
-        ; not sit inside the carry chain.
-        xor	eax, eax
-        ; Limb 0 opens the chain with ADD; limbs 1..8 continue it with ADC.
-        ; Only MOV is placed between the arithmetic steps, so CF survives.
-        ; Each a[i+1] is fetched before r[i] is written.
-        mov	r10, QWORD PTR [rdx]
-        add	r10, QWORD PTR [r8]
-        mov	r11, QWORD PTR [rdx+8]
-        mov	QWORD PTR [rcx], r10
-        adc	r11, QWORD PTR [r8+8]
-        mov	r10, QWORD PTR [rdx+16]
-        mov	QWORD PTR [rcx+8], r11
-        adc	r10, QWORD PTR [r8+16]
-        mov	r11, QWORD PTR [rdx+24]
-        mov	QWORD PTR [rcx+16], r10
-        adc	r11, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [rdx+32]
-        mov	QWORD PTR [rcx+24], r11
-        adc	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [rdx+40]
-        mov	QWORD PTR [rcx+32], r10
-        adc	r11, QWORD PTR [r8+40]
-        mov	r10, QWORD PTR [rdx+48]
-        mov	QWORD PTR [rcx+40], r11
-        adc	r10, QWORD PTR [r8+48]
-        mov	r11, QWORD PTR [rdx+56]
-        mov	QWORD PTR [rcx+48], r10
-        adc	r11, QWORD PTR [r8+56]
-        mov	r10, QWORD PTR [rdx+64]
-        mov	QWORD PTR [rcx+56], r11
-        adc	r10, QWORD PTR [r8+64]
-        mov	QWORD PTR [rcx+64], r10
-        ; rax is still zero, so dropping CF into al yields 0 or 1 in rax.
-        setc	al
-        ret
-sp_521_add_9 ENDP
-_text ENDS
-; /* Nine-limb subtraction: r = a - b.
-;  *
-;  * Limbs are 64 bits wide and stored least-significant first, so the borrow
-;  * travels from offset 0 up to offset 64.
-;  *
-;  * r  Destination, 9 limbs (pointer arrives in rcx).
-;  * a  Minuend, 9 limbs (pointer arrives in rdx).
-;  * b  Subtrahend, 9 limbs (pointer arrives in r8).
-;  *
-;  * Returns in rax 0 when there is no borrow out of the top limb and -1
-;  * (all bits set) when there is.
-;  * Scratch: r10, r11, flags.
-;  */
-_text SEGMENT READONLY PARA
-sp_521_sub_9 PROC
-        ; Limb 0 opens the chain with SUB; limbs 1..8 continue it with SBB.
-        ; Only MOV is placed between the arithmetic steps, so CF survives.
-        ; Each a[i+1] is fetched before r[i] is written.
-        mov	r10, QWORD PTR [rdx]
-        sub	r10, QWORD PTR [r8]
-        mov	r11, QWORD PTR [rdx+8]
-        mov	QWORD PTR [rcx], r10
-        sbb	r11, QWORD PTR [r8+8]
-        mov	r10, QWORD PTR [rdx+16]
-        mov	QWORD PTR [rcx+8], r11
-        sbb	r10, QWORD PTR [r8+16]
-        mov	r11, QWORD PTR [rdx+24]
-        mov	QWORD PTR [rcx+16], r10
-        sbb	r11, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [rdx+32]
-        mov	QWORD PTR [rcx+24], r11
-        sbb	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [rdx+40]
-        mov	QWORD PTR [rcx+32], r10
-        sbb	r11, QWORD PTR [r8+40]
-        mov	r10, QWORD PTR [rdx+48]
-        mov	QWORD PTR [rcx+40], r11
-        sbb	r10, QWORD PTR [r8+48]
-        mov	r11, QWORD PTR [rdx+56]
-        mov	QWORD PTR [rcx+48], r10
-        sbb	r11, QWORD PTR [r8+56]
-        mov	r10, QWORD PTR [rdx+64]
-        mov	QWORD PTR [rcx+56], r11
-        sbb	r10, QWORD PTR [r8+64]
-        mov	QWORD PTR [rcx+64], r10
-        ; rax - rax - CF: the previous content of rax cancels out, leaving
-        ; 0 or -1 depending only on the final borrow.
-        sbb	rax, rax
-        ret
-sp_521_sub_9 ENDP
-_text ENDS
-; /* Masked copy of nine 64-bit limbs: r[i] ^= (a[i] ^ r[i]) & m.
-;  *
-;  * With m = -1 (all bits set) every limb of r ends up equal to the matching
-;  * limb of a; with m = 0 r is left as it was.  The same instructions run in
-;  * both cases - there is no branch on m.
-;  *
-;  * r  Number to copy over, 9 limbs (pointer arrives in rcx).
-;  * a  Number to copy, 9 limbs (pointer arrives in rdx).
-;  * m  Mask value to apply (arrives in r8).
-;  *
-;  * Scratch: rax, r9, r10, r11, flags.  r12 is saved and restored.
-;  */
-_text SEGMENT READONLY PARA
-sp_521_cond_copy_9 PROC
-        push	r12
-        ; Limbs 0..4: difference a ^ r, gated by the mask, folded into r.
-        mov	r9, QWORD PTR [rdx]
-        mov	r10, QWORD PTR [rdx+8]
-        mov	r11, QWORD PTR [rdx+16]
-        mov	r12, QWORD PTR [rdx+24]
-        mov	rax, QWORD PTR [rdx+32]
-        xor	r9, QWORD PTR [rcx]
-        xor	r10, QWORD PTR [rcx+8]
-        xor	r11, QWORD PTR [rcx+16]
-        xor	r12, QWORD PTR [rcx+24]
-        xor	rax, QWORD PTR [rcx+32]
-        and	r9, r8
-        and	r10, r8
-        and	r11, r8
-        and	r12, r8
-        and	rax, r8
-        xor	QWORD PTR [rcx], r9
-        xor	QWORD PTR [rcx+8], r10
-        xor	QWORD PTR [rcx+16], r11
-        xor	QWORD PTR [rcx+24], r12
-        xor	QWORD PTR [rcx+32], rax
-        ; Limbs 5..8: same three steps on the remaining four limbs.
-        mov	r9, QWORD PTR [rdx+40]
-        mov	r10, QWORD PTR [rdx+48]
-        mov	r11, QWORD PTR [rdx+56]
-        mov	rax, QWORD PTR [rdx+64]
-        xor	r9, QWORD PTR [rcx+40]
-        xor	r10, QWORD PTR [rcx+48]
-        xor	r11, QWORD PTR [rcx+56]
-        xor	rax, QWORD PTR [rcx+64]
-        and	r9, r8
-        and	r10, r8
-        and	r11, r8
-        and	rax, r8
-        xor	QWORD PTR [rcx+40], r9
-        xor	QWORD PTR [rcx+48], r10
-        xor	QWORD PTR [rcx+56], r11
-        xor	QWORD PTR [rcx+64], rax
-        pop	r12
-        ret
-sp_521_cond_copy_9 ENDP
-_text ENDS
-; /* Multiply two Montgomery form numbers mod the modulus (prime).
-;  * (r = a * b mod m)
-;  *
-;  * r   Result of multiplication.
-;  * a   First number to multiply in Montgomery form.
-;  * b   Second number to multiply in Montgomery form.
-;  * m   Modulus (prime).
-;  * mp  Montgomery multiplier.
-;  *
-;  * Registers as used by the code: rcx = r, rdx = a (copied to r9), r8 = b.
-;  * The body never reads m or mp: r9 is overwritten on entry and no stack
-;  * slot outside the local 144-byte frame is accessed.  Instead the product
-;  * is folded at bit 521 (high part added onto low part), which is a
-;  * reduction modulo 2^521 - 1.
-;  * NOTE(review): presumably the Microsoft x64 convention (rcx, rdx, r8, r9,
-;  * fifth argument on the stack) - confirm against the C prototype.
-;  *
-;  * No conditional subtraction of the modulus is done here; the value stored
-;  * to r is whatever the two folds below leave.
-;  *
-;  * Saves/restores r12-r15.  Scratch: rax, rdx, r8, r9, r10, r11, flags.
-;  */
-_text SEGMENT READONLY PARA
-sp_521_mont_mul_9 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        ; MUL writes its result to rdx:rax, so the pointer that arrived in
-        ; rdx is moved to r9 before the first multiply.
-        mov	r9, rdx
-        ; 144 bytes = 18 qwords: [rsp]..[rsp+136] hold the product words.
-        sub	rsp, 144
-        ; Column-by-column (product scanning) multiply.  In the labels below
-        ; A[i] is the limb at [r9+8*i] and B[j] the limb at [r8+8*j].
-        ; r13/r14/r15 rotate as a three-word accumulator: each column's
-        ; partial products are summed, the low word is stored to the stack
-        ; and the remaining two words carry into the next column.
-        ; A[0] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9]
-        xor	r15, r15
-        mov	QWORD PTR [rsp], rax
-        mov	r14, rdx
-        ; A[0] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9]
-        xor	r13, r13
-        add	r14, rax
-        adc	r15, rdx
-        adc	r13, 0
-        ; A[1] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+8]
-        add	r14, rax
-        adc	r15, rdx
-        adc	r13, 0
-        mov	QWORD PTR [rsp+8], r14
-        ; A[0] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9]
-        xor	r14, r14
-        add	r15, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[1] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+8]
-        add	r15, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+16]
-        add	r15, rax
-        adc	r13, rdx
-        adc	r14, 0
-        mov	QWORD PTR [rsp+16], r15
-        ; A[0] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9]
-        xor	r15, r15
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[1] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+8]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[2] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+16]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[3] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+24]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        mov	QWORD PTR [rsp+24], r13
-        ; A[0] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9]
-        xor	r13, r13
-        add	r14, rax
-        adc	r15, rdx
-        adc	r13, 0
-        ; A[1] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+8]
-        add	r14, rax
-        adc	r15, rdx
-        adc	r13, 0
-        ; A[2] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+16]
-        add	r14, rax
-        adc	r15, rdx
-        adc	r13, 0
-        ; A[3] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+24]
-        add	r14, rax
-        adc	r15, rdx
-        adc	r13, 0
-        ; A[4] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+32]
-        add	r14, rax
-        adc	r15, rdx
-        adc	r13, 0
-        mov	QWORD PTR [rsp+32], r14
-        ; A[0] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9]
-        xor	r14, r14
-        add	r15, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[1] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+8]
-        add	r15, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+16]
-        add	r15, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+24]
-        add	r15, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+32]
-        add	r15, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[5] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+40]
-        add	r15, rax
-        adc	r13, rdx
-        adc	r14, 0
-        mov	QWORD PTR [rsp+40], r15
-        ; A[0] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9]
-        xor	r15, r15
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[1] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+8]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[2] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+16]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[3] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+24]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[4] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+32]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[5] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+40]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[6] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+48]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        mov	QWORD PTR [rsp+48], r13
-        ; A[0] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9]
-        xor	r13, r13
-        add	r14, rax
-        adc	r15, rdx
-        adc	r13, 0
-        ; A[1] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+8]
-        add	r14, rax
-        adc	r15, rdx
-        adc	r13, 0
-        ; A[2] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+16]
-        add	r14, rax
-        adc	r15, rdx
-        adc	r13, 0
-        ; A[3] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+24]
-        add	r14, rax
-        adc	r15, rdx
-        adc	r13, 0
-        ; A[4] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+32]
-        add	r14, rax
-        adc	r15, rdx
-        adc	r13, 0
-        ; A[5] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+40]
-        add	r14, rax
-        adc	r15, rdx
-        adc	r13, 0
-        ; A[6] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+48]
-        add	r14, rax
-        adc	r15, rdx
-        adc	r13, 0
-        ; A[7] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+56]
-        add	r14, rax
-        adc	r15, rdx
-        adc	r13, 0
-        mov	QWORD PTR [rsp+56], r14
-        ; A[0] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9]
-        xor	r14, r14
-        add	r15, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[1] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+8]
-        add	r15, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+16]
-        add	r15, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+24]
-        add	r15, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+32]
-        add	r15, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[5] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+40]
-        add	r15, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[6] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+48]
-        add	r15, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[7] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+56]
-        add	r15, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[8] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+64]
-        add	r15, rax
-        adc	r13, rdx
-        adc	r14, 0
-        mov	QWORD PTR [rsp+64], r15
-        ; A[1] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+8]
-        xor	r15, r15
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[2] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+16]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[3] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+24]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[4] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+32]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[5] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+40]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[6] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+48]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[7] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+56]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[8] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+64]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        mov	QWORD PTR [rsp+72], r13
-        ; A[2] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+16]
-        xor	r13, r13
-        add	r14, rax
-        adc	r15, rdx
-        adc	r13, 0
-        ; A[3] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+24]
-        add	r14, rax
-        adc	r15, rdx
-        adc	r13, 0
-        ; A[4] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+32]
-        add	r14, rax
-        adc	r15, rdx
-        adc	r13, 0
-        ; A[5] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+40]
-        add	r14, rax
-        adc	r15, rdx
-        adc	r13, 0
-        ; A[6] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+48]
-        add	r14, rax
-        adc	r15, rdx
-        adc	r13, 0
-        ; A[7] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+56]
-        add	r14, rax
-        adc	r15, rdx
-        adc	r13, 0
-        ; A[8] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+64]
-        add	r14, rax
-        adc	r15, rdx
-        adc	r13, 0
-        mov	QWORD PTR [rsp+80], r14
-        ; A[3] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+24]
-        xor	r14, r14
-        add	r15, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+32]
-        add	r15, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[5] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+40]
-        add	r15, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[6] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+48]
-        add	r15, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[7] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+56]
-        add	r15, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[8] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+64]
-        add	r15, rax
-        adc	r13, rdx
-        adc	r14, 0
-        mov	QWORD PTR [rsp+88], r15
-        ; A[4] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+32]
-        xor	r15, r15
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[5] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+40]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[6] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+48]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[7] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+56]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[8] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+64]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        mov	QWORD PTR [rsp+96], r13
-        ; A[5] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+40]
-        xor	r13, r13
-        add	r14, rax
-        adc	r15, rdx
-        adc	r13, 0
-        ; A[6] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+48]
-        add	r14, rax
-        adc	r15, rdx
-        adc	r13, 0
-        ; A[7] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+56]
-        add	r14, rax
-        adc	r15, rdx
-        adc	r13, 0
-        ; A[8] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+64]
-        add	r14, rax
-        adc	r15, rdx
-        adc	r13, 0
-        mov	QWORD PTR [rsp+104], r14
-        ; A[6] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+48]
-        xor	r14, r14
-        add	r15, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[7] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+56]
-        add	r15, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[8] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+64]
-        add	r15, rax
-        adc	r13, rdx
-        adc	r14, 0
-        mov	QWORD PTR [rsp+112], r15
-        ; A[7] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+56]
-        xor	r15, r15
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[8] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+64]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        mov	QWORD PTR [rsp+120], r13
-        ; A[8] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+64]
-        add	r14, rax
-        adc	r15, rdx
-        mov	QWORD PTR [rsp+128], r14
-        mov	QWORD PTR [rsp+136], r15
-        ; Reduction.  521 = 8*64 + 9, so bit 521 of the product is bit 9 of
-        ; the word at [rsp+64].  Product words 8..16 are reloaded; the input
-        ; pointers in r8/r9 are no longer needed and get reused as data.
-        ; NOTE(review): the word at [rsp+136] is stored above but never read
-        ; back - presumably always zero for inputs below 2^521; confirm.
-        mov	rax, QWORD PTR [rsp+64]
-        mov	rdx, QWORD PTR [rsp+72]
-        mov	r13, QWORD PTR [rsp+80]
-        ; r12 = low 9 bits of word 8 (511 = 2^9 - 1): top limb of the low part.
-        mov	r12, rax
-        and	r12, 511
-        mov	r14, QWORD PTR [rsp+88]
-        mov	r15, QWORD PTR [rsp+96]
-        mov	r8, QWORD PTR [rsp+104]
-        mov	r9, QWORD PTR [rsp+112]
-        mov	r10, QWORD PTR [rsp+120]
-        mov	r11, QWORD PTR [rsp+128]
-        ; Shift words 8..16 right by 9 bits as one multi-word value; each
-        ; SHRD pulls its vacated top bits from the next higher word.  The
-        ; result in rax,rdx,r13,r14,r15,r8,r9,r10,r11 is the high part.
-        shrd	rax, rdx, 9
-        shrd	rdx, r13, 9
-        shrd	r13, r14, 9
-        shrd	r14, r15, 9
-        shrd	r15, r8, 9
-        shrd	r8, r9, 9
-        shrd	r9, r10, 9
-        shrd	r10, r11, 9
-        shr	r11, 9
-        ; First fold: high part + low part (words 0..7 from the stack, plus
-        ; the 9-bit top limb kept in r12).
-        add	rax, QWORD PTR [rsp]
-        adc	rdx, QWORD PTR [rsp+8]
-        adc	r13, QWORD PTR [rsp+16]
-        adc	r14, QWORD PTR [rsp+24]
-        adc	r15, QWORD PTR [rsp+32]
-        adc	r8, QWORD PTR [rsp+40]
-        adc	r9, QWORD PTR [rsp+48]
-        adc	r10, QWORD PTR [rsp+56]
-        adc	r12, r11
-        ; Second fold: whatever the sum put above bit 9 of the top limb is
-        ; split off (r12 >> 9) and added back in at limb 0; r11 keeps the
-        ; low 9 bits as the new top limb and absorbs the final carry.
-        mov	r11, r12
-        shr	r12, 9
-        and	r11, 511
-        add	rax, r12
-        adc	rdx, 0
-        adc	r13, 0
-        adc	r14, 0
-        adc	r15, 0
-        adc	r8, 0
-        adc	r9, 0
-        adc	r10, 0
-        adc	r11, 0
-        ; Write the nine result limbs through the pointer in rcx.
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], rdx
-        mov	QWORD PTR [rcx+16], r13
-        mov	QWORD PTR [rcx+24], r14
-        mov	QWORD PTR [rcx+32], r15
-        mov	QWORD PTR [rcx+40], r8
-        mov	QWORD PTR [rcx+48], r9
-        mov	QWORD PTR [rcx+56], r10
-        mov	QWORD PTR [rcx+64], r11
-        add	rsp, 144
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_521_mont_mul_9 ENDP
-_text ENDS
-; /* Square the Montgomery form number mod the modulus (prime). (r = a * a mod m)
-;  *
-;  * r   Result of squaring.
-;  * a   Number to square in Montgomery form.
-;  * m   Modulus (prime).
-;  * mp  Montgomery multiplier.
-;  *
-;  * Registers as used by the code: rcx = r, rdx = a (copied to r8).
-;  * The body never reads m or mp: r8 is overwritten on entry and r9 is
-;  * written before it is ever read.  Instead the square is folded at bit 521
-;  * (high part added onto low part), which is a reduction modulo 2^521 - 1.
-;  * NOTE(review): presumably the Microsoft x64 convention (rcx, rdx, r8, r9)
-;  * - confirm against the C prototype.
-;  *
-;  * No conditional subtraction of the modulus is done here; the value stored
-;  * to r is whatever the two folds below leave.
-;  *
-;  * Saves/restores r12-r15.  Scratch: rax, rdx, r8, r9, r10, r11, flags.
-;  */
-_text SEGMENT READONLY PARA
-sp_521_mont_sqr_9 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        ; MUL writes its result to rdx:rax, so the pointer that arrived in
-        ; rdx is moved to r8 before the first multiply.
-        mov	r8, rdx
-        ; 144 bytes = 18 qwords: [rsp]..[rsp+136] hold the product words.
-        sub	rsp, 144
-        ; Column-by-column squaring; A[i] is the limb at [r8+8*i].
-        ; r10/r11/r12 rotate as a three-word column accumulator.  Each cross
-        ; product A[i]*A[j] (i != j) is computed once and counted twice;
-        ; the diagonal terms A[i]*A[i] are counted once.
-        ; A[0] * A[0]
-        mov	rax, QWORD PTR [r8]
-        mul	rax
-        xor	r12, r12
-        mov	QWORD PTR [rsp], rax
-        mov	r11, rdx
-        ; In the short columns the cross product is simply added twice.
-        ; A[0] * A[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r8]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rsp+8], r11
-        ; A[0] * A[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r8]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[1] * A[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	rax
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rsp+16], r12
-        ; A[0] * A[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r8]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[1] * A[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r8+8]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rsp+24], r10
-        ; A[0] * A[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r8]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[1] * A[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r8+8]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[2] * A[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	rax
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rsp+32], r11
-        ; From here through the A[3]*A[8] column the cross products of a
-        ; column are first summed in r13:r14:r15, that sum is doubled once,
-        ; the diagonal term (if the column has one) is added, and the total
-        ; is then added into the rotating accumulator.
-        ; A[0] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8]
-        xor	r11, r11
-        xor	r15, r15
-        mov	r13, rax
-        mov	r14, rdx
-        ; A[1] * A[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r8+8]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[2] * A[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r8+16]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        add	r13, r13
-        adc	r14, r14
-        adc	r15, r15
-        add	r12, r13
-        adc	r10, r14
-        adc	r11, r15
-        mov	QWORD PTR [rsp+40], r12
-        ; A[0] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8]
-        xor	r12, r12
-        xor	r15, r15
-        mov	r13, rax
-        mov	r14, rdx
-        ; A[1] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8+8]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[2] * A[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r8+16]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[3] * A[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	rax
-        add	r13, r13
-        adc	r14, r14
-        adc	r15, r15
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        add	r10, r13
-        adc	r11, r14
-        adc	r12, r15
-        mov	QWORD PTR [rsp+48], r10
-        ; A[0] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8]
-        xor	r10, r10
-        xor	r15, r15
-        mov	r13, rax
-        mov	r14, rdx
-        ; A[1] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8+8]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[2] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8+16]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[3] * A[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r8+24]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        add	r13, r13
-        adc	r14, r14
-        adc	r15, r15
-        add	r11, r13
-        adc	r12, r14
-        adc	r10, r15
-        mov	QWORD PTR [rsp+56], r11
-        ; A[0] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8]
-        xor	r11, r11
-        xor	r15, r15
-        mov	r13, rax
-        mov	r14, rdx
-        ; A[1] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+8]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[2] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8+16]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[3] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8+24]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[4] * A[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	rax
-        add	r13, r13
-        adc	r14, r14
-        adc	r15, r15
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        add	r12, r13
-        adc	r10, r14
-        adc	r11, r15
-        mov	QWORD PTR [rsp+64], r12
-        ; A[1] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+8]
-        xor	r12, r12
-        xor	r15, r15
-        mov	r13, rax
-        mov	r14, rdx
-        ; A[2] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+16]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[3] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8+24]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[4] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8+32]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        add	r13, r13
-        adc	r14, r14
-        adc	r15, r15
-        add	r10, r13
-        adc	r11, r14
-        adc	r12, r15
-        mov	QWORD PTR [rsp+72], r10
-        ; A[2] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+16]
-        xor	r10, r10
-        xor	r15, r15
-        mov	r13, rax
-        mov	r14, rdx
-        ; A[3] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+24]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[4] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8+32]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[5] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	rax
-        add	r13, r13
-        adc	r14, r14
-        adc	r15, r15
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        add	r11, r13
-        adc	r12, r14
-        adc	r10, r15
-        mov	QWORD PTR [rsp+80], r11
-        ; A[3] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+24]
-        xor	r11, r11
-        xor	r15, r15
-        mov	r13, rax
-        mov	r14, rdx
-        ; A[4] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+32]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        ; A[5] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8+40]
-        add	r13, rax
-        adc	r14, rdx
-        adc	r15, 0
-        add	r13, r13
-        adc	r14, r14
-        adc	r15, r15
-        add	r12, r13
-        adc	r10, r14
-        adc	r11, r15
-        mov	QWORD PTR [rsp+88], r12
-        ; The remaining short columns go back to adding each cross product
-        ; twice, directly into the rotating accumulator.
-        ; A[4] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+32]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[5] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+40]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[6] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	rax
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rsp+96], r10
-        ; A[5] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+40]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[6] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+48]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rsp+104], r11
-        ; A[6] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+48]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[7] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	rax
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rsp+112], r12
-        ; A[7] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+56]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rsp+120], r10
-        ; A[8] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	rax
-        add	r11, rax
-        adc	r12, rdx
-        mov	QWORD PTR [rsp+128], r11
-        mov	QWORD PTR [rsp+136], r12
-        ; Reduction.  521 = 8*64 + 9, so bit 521 of the square is bit 9 of
-        ; the word at [rsp+64].  Product words 8..16 are reloaded; the input
-        ; pointer in r8 is no longer needed and gets reused as data.
-        ; NOTE(review): the word at [rsp+136] is stored above but never read
-        ; back - presumably always zero for inputs below 2^521; confirm.
-        mov	r10, QWORD PTR [rsp+64]
-        mov	r11, QWORD PTR [rsp+72]
-        mov	r12, QWORD PTR [rsp+80]
-        ; r9 = low 9 bits of word 8 (511 = 2^9 - 1): top limb of the low part.
-        mov	r9, r10
-        and	r9, 511
-        mov	rax, QWORD PTR [rsp+88]
-        mov	rdx, QWORD PTR [rsp+96]
-        mov	r13, QWORD PTR [rsp+104]
-        mov	r14, QWORD PTR [rsp+112]
-        mov	r15, QWORD PTR [rsp+120]
-        mov	r8, QWORD PTR [rsp+128]
-        ; Shift words 8..16 right by 9 bits as one multi-word value; each
-        ; SHRD pulls its vacated top bits from the next higher word.  The
-        ; result in r10,r11,r12,rax,rdx,r13,r14,r15,r8 is the high part.
-        shrd	r10, r11, 9
-        shrd	r11, r12, 9
-        shrd	r12, rax, 9
-        shrd	rax, rdx, 9
-        shrd	rdx, r13, 9
-        shrd	r13, r14, 9
-        shrd	r14, r15, 9
-        shrd	r15, r8, 9
-        shr	r8, 9
-        ; First fold: high part + low part (words 0..7 from the stack, plus
-        ; the 9-bit top limb kept in r9).
-        add	r10, QWORD PTR [rsp]
-        adc	r11, QWORD PTR [rsp+8]
-        adc	r12, QWORD PTR [rsp+16]
-        adc	rax, QWORD PTR [rsp+24]
-        adc	rdx, QWORD PTR [rsp+32]
-        adc	r13, QWORD PTR [rsp+40]
-        adc	r14, QWORD PTR [rsp+48]
-        adc	r15, QWORD PTR [rsp+56]
-        adc	r9, r8
-        ; Second fold: whatever the sum put above bit 9 of the top limb is
-        ; split off (r9 >> 9) and added back in at limb 0; r8 keeps the low
-        ; 9 bits as the new top limb and absorbs the final carry.
-        mov	r8, r9
-        shr	r9, 9
-        and	r8, 511
-        add	r10, r9
-        adc	r11, 0
-        adc	r12, 0
-        adc	rax, 0
-        adc	rdx, 0
-        adc	r13, 0
-        adc	r14, 0
-        adc	r15, 0
-        adc	r8, 0
-        ; Write the nine result limbs through the pointer in rcx.
-        mov	QWORD PTR [rcx], r10
-        mov	QWORD PTR [rcx+8], r11
-        mov	QWORD PTR [rcx+16], r12
-        mov	QWORD PTR [rcx+24], rax
-        mov	QWORD PTR [rcx+32], rdx
-        mov	QWORD PTR [rcx+40], r13
-        mov	QWORD PTR [rcx+48], r14
-        mov	QWORD PTR [rcx+56], r15
-        mov	QWORD PTR [rcx+64], r8
-        add	rsp, 144
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_521_mont_sqr_9 ENDP
-_text ENDS
-; /* Compare a with b in constant time.
-;  *
-;  * a  A single precision integer (passed in rcx); the nine 64-bit limbs at offsets 0..64 are read.
-;  * b  A single precision integer (passed in rdx); read at the same offsets as a.
-;  * return -ve, 0 or +ve (rax = -1, 0 or 1) if a is less than, equal to or greater than b
-;  * respectively. Limbs are compared as unsigned values from offset 64 down to 0, with no branches.
-;  */
-_text SEGMENT READONLY PARA
-sp_521_cmp_9 PROC
-        push	r12 ; r12 is used as scratch below; saved here and restored before ret
-        xor	r9, r9 ; r9 = 0, the value that clears the mask once a difference is found
-        mov	r8, -1 ; r8 = mask: all ones while every limb compared so far was equal
-        mov	rax, -1 ; rax = result accumulator; becomes 0 in the final xor if nothing differed
-        mov	r10, 1 ; r10 = +1, the result for a > b
-        mov	r11, QWORD PTR [rcx+64] ; limb 8 of a (most significant limb read)
-        mov	r12, QWORD PTR [rdx+64] ; limb 8 of b
-        and	r11, r8 ; both operands become 0 once an earlier limb differed
-        and	r12, r8
-        sub	r11, r12 ; only the flags (CF/ZF) of the unsigned difference are used
-        cmova	rax, r10 ; a limb > b limb: result = +1
-        cmovc	rax, r8 ; a limb < b limb: result = -1 (CF can only be set while r8 is still -1)
-        cmovnz	r8, r9 ; limbs differ: clear the mask so lower limbs cannot change rax
-        mov	r11, QWORD PTR [rcx+56] ; limb 7: same masked compare step
-        mov	r12, QWORD PTR [rdx+56]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+48] ; limb 6
-        mov	r12, QWORD PTR [rdx+48]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+40] ; limb 5
-        mov	r12, QWORD PTR [rdx+40]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+32] ; limb 4
-        mov	r12, QWORD PTR [rdx+32]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+24] ; limb 3
-        mov	r12, QWORD PTR [rdx+24]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+16] ; limb 2
-        mov	r12, QWORD PTR [rdx+16]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+8] ; limb 1
-        mov	r12, QWORD PTR [rdx+8]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx] ; limb 0 (least significant)
-        mov	r12, QWORD PTR [rdx]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        xor	rax, r8 ; all limbs equal: -1 xor -1 = 0; otherwise r8 is 0 and rax keeps +1 or -1
-        pop	r12
-        ret
-sp_521_cmp_9 ENDP
-_text ENDS
-; /* Conditionally subtract b from a using the mask m.
-;  * m is -1 to subtract and 0 when not subtracting. On return rax is 0, or -1 if the subtraction borrowed.
-;  *
-;  * r  A single precision number representing condition subtract result (rcx; 9 limbs written).
-;  * a  A single precision number to subtract from (rdx; 9 limbs read).
-;  * b  A single precision number to subtract (r8; 9 limbs read).
-;  * m  Mask value to apply (r9; ANDed with every limb of b).
-;  */
-_text SEGMENT READONLY PARA
-sp_521_cond_sub_9 PROC
-        sub	rsp, 72 ; 9 * 8 bytes of stack for the masked copy of b
-        mov	r10, QWORD PTR [r8] ; first pass: store (b[i] & m) at [rsp + 8*i]
-        mov	r11, QWORD PTR [r8+8]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp], r10
-        mov	QWORD PTR [rsp+8], r11
-        mov	r10, QWORD PTR [r8+16]
-        mov	r11, QWORD PTR [r8+24]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+16], r10
-        mov	QWORD PTR [rsp+24], r11
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+32], r10
-        mov	QWORD PTR [rsp+40], r11
-        mov	r10, QWORD PTR [r8+48]
-        mov	r11, QWORD PTR [r8+56]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+48], r10
-        mov	QWORD PTR [rsp+56], r11
-        mov	r10, QWORD PTR [r8+64]
-        and	r10, r9
-        mov	QWORD PTR [rsp+64], r10
-        mov	r10, QWORD PTR [rdx] ; second pass: r = a - masked b; r8 is reused as scratch from here on
-        mov	r8, QWORD PTR [rsp]
-        sub	r10, r8 ; limb 0 starts the borrow chain; only mov and sbb follow, so CF is carried through
-        mov	r11, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [rsp+8]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx], r10 ; each result limb is stored one step after it is computed
-        mov	r10, QWORD PTR [rdx+16]
-        mov	r8, QWORD PTR [rsp+16]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+8], r11
-        mov	r11, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [rsp+24]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+16], r10
-        mov	r10, QWORD PTR [rdx+32]
-        mov	r8, QWORD PTR [rsp+32]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+24], r11
-        mov	r11, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [rsp+40]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+32], r10
-        mov	r10, QWORD PTR [rdx+48]
-        mov	r8, QWORD PTR [rsp+48]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+40], r11
-        mov	r11, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [rsp+56]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+48], r10
-        mov	r10, QWORD PTR [rdx+64]
-        mov	r8, QWORD PTR [rsp+64]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+56], r11
-        mov	QWORD PTR [rcx+64], r10
-        sbb	rax, rax ; rax = 0 - CF: -1 if the final limb borrowed, else 0
-        add	rsp, 72
-        ret
-sp_521_cond_sub_9 ENDP
-_text ENDS
-; /* Reduce the number back to 521 bits using Montgomery reduction.
-;  * Bits 521 and above of a are added onto its low 521 bits, twice; this keeps the value congruent modulo 2^521 - 1.
-;  * a   A single precision number to reduce in place (rcx); limbs at offsets 0..128 are read, offsets 0..64 are written.
-;  * m   The single precision number representing the modulus. Not used: rdx is overwritten before it is read.
-;  * mp  The digit representing the negative inverse of m mod 2^n. Not used: r8 is overwritten before it is read.
-;  */
-_text SEGMENT READONLY PARA
-sp_521_mont_reduce_9 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        mov	rdx, QWORD PTR [rcx+64] ; rdx, rax, r8..r14 = limbs 8..16 of a
-        mov	rax, QWORD PTR [rcx+72]
-        mov	r8, QWORD PTR [rcx+80]
-        mov	r15, rdx
-        and	r15, 511 ; r15 = low 9 bits of limb 8, i.e. bits 512..520 of a
-        mov	r9, QWORD PTR [rcx+88]
-        mov	r10, QWORD PTR [rcx+96]
-        mov	r11, QWORD PTR [rcx+104]
-        mov	r12, QWORD PTR [rcx+112]
-        mov	r13, QWORD PTR [rcx+120]
-        mov	r14, QWORD PTR [rcx+128]
-        shrd	rdx, rax, 9 ; shift limbs 8..16 right by 9 bits: the registers now hold bits 521 and up of a
-        shrd	rax, r8, 9
-        shrd	r8, r9, 9
-        shrd	r9, r10, 9
-        shrd	r10, r11, 9
-        shrd	r11, r12, 9
-        shrd	r12, r13, 9
-        shrd	r13, r14, 9
-        shr	r14, 9
-        add	rdx, QWORD PTR [rcx] ; add the low part: limbs 0..7 of a ...
-        adc	rax, QWORD PTR [rcx+8]
-        adc	r8, QWORD PTR [rcx+16]
-        adc	r9, QWORD PTR [rcx+24]
-        adc	r10, QWORD PTR [rcx+32]
-        adc	r11, QWORD PTR [rcx+40]
-        adc	r12, QWORD PTR [rcx+48]
-        adc	r13, QWORD PTR [rcx+56]
-        adc	r15, r14 ; ... and the masked limb 8 plus the top shifted limb plus carry
-        mov	r14, r15
-        shr	r15, 9 ; r15 = overflow of the sum above bit 520
-        and	r14, 511 ; r14 = top limb reduced to 9 bits
-        add	rdx, r15 ; fold the overflow back into limb 0 and propagate the carry
-        adc	rax, 0
-        adc	r8, 0
-        adc	r9, 0
-        adc	r10, 0
-        adc	r11, 0
-        adc	r12, 0
-        adc	r13, 0
-        adc	r14, 0
-        mov	QWORD PTR [rcx], rdx ; store the 9-limb result over the start of a
-        mov	QWORD PTR [rcx+8], rax
-        mov	QWORD PTR [rcx+16], r8
-        mov	QWORD PTR [rcx+24], r9
-        mov	QWORD PTR [rcx+32], r10
-        mov	QWORD PTR [rcx+40], r11
-        mov	QWORD PTR [rcx+48], r12
-        mov	QWORD PTR [rcx+56], r13
-        mov	QWORD PTR [rcx+64], r14
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_521_mont_reduce_9 ENDP
-_text ENDS
-; /* Reduce the number back to 521 bits using Montgomery reduction.
-;  *
-;  * Word-by-word reduction: for each of 9 rounds, mu = a[i] * mp is computed
-;  * (masked to 9 bits in the last round) and mu * m is added to a[i..i+9].
-;  * The result is then shifted right by 9 bits into a[0..9] and m is
-;  * conditionally subtracted from it through sp_521_cond_sub_9.
-;  *
-;  * a   A single precision number to reduce in place (rcx). Limbs at
-;  *     offsets 0..136 are read; offsets 0..72 hold the output.
-;  * m   The single precision number representing the modulus (rdx).
-;  * mp  The digit representing the negative inverse of m mod 2^n (r8).
-;  *
-;  * Register use inside the loop:
-;  *   r9  = m             r8  = mp            r10 = rounds remaining
-;  *   r13 = mu            r15 = cached a[i]   rdi = cached a[i+1]
-;  *   r11/r12 = alternating carry words       rsi = carry into next round
-;  */
-_text SEGMENT READONLY PARA
-sp_521_mont_reduce_order_9 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        ; rdx is clobbered by mul, so keep the modulus pointer in r9
-        mov	r9, rdx
-        xor	rsi, rsi
-        ; i = 9
-        mov	r10, 9
-        mov	r15, QWORD PTR [rcx]
-        mov	rdi, QWORD PTR [rcx+8]
-L_521_mont_reduce_order_9_loop:
-        ; mu = a[i] * mp
-        mov	r13, r15
-        imul	r13, r8
-        ; last round only: keep just the low 9 bits of mu
-        cmp	r10, 1
-        jne	L_521_mont_reduce_order_9_nomask
-        and	r13, 511
-L_521_mont_reduce_order_9_nomask:
-        ; a[i+0] += m[0] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9]
-        add	r15, rax
-        mov	QWORD PTR [rcx], r15
-        adc	r12, rdx
-        ; a[i+1] += m[1] * mu
-        ; (result stays in r15: it is a[i] of the next round)
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+8]
-        mov	r15, rdi
-        add	r15, rax
-        adc	r11, rdx
-        add	r15, r12
-        adc	r11, 0
-        ; a[i+2] += m[2] * mu
-        ; (result stays in rdi: it is a[i+1] of the next round)
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+16]
-        mov	rdi, QWORD PTR [rcx+16]
-        add	rdi, rax
-        adc	r12, rdx
-        add	rdi, r11
-        adc	r12, 0
-        ; a[i+3] += m[3] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+24]
-        mov	r14, QWORD PTR [rcx+24]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+24], r14
-        adc	r11, 0
-        ; a[i+4] += m[4] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+32]
-        mov	r14, QWORD PTR [rcx+32]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+32], r14
-        adc	r12, 0
-        ; a[i+5] += m[5] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+40]
-        mov	r14, QWORD PTR [rcx+40]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+40], r14
-        adc	r11, 0
-        ; a[i+6] += m[6] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+48]
-        mov	r14, QWORD PTR [rcx+48]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+48], r14
-        adc	r12, 0
-        ; a[i+7] += m[7] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+56]
-        mov	r14, QWORD PTR [rcx+56]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+56], r14
-        adc	r11, 0
-        ; a[i+8] += m[8] * mu
-        ; rsi carries the overflow of the previous round into a[i+9] and
-        ; collects this round's overflow; mov leaves the flags untouched.
-        mov	rax, r13
-        mul	QWORD PTR [r9+64]
-        mov	r14, QWORD PTR [rcx+64]
-        add	r11, rax
-        adc	rdx, rsi
-        mov	rsi, 0
-        adc	rsi, 0
-        add	r14, r11
-        mov	QWORD PTR [rcx+64], r14
-        adc	QWORD PTR [rcx+72], rdx
-        adc	rsi, 0
-        ; i -= 1
-        add	rcx, 8
-        dec	r10
-        jnz	L_521_mont_reduce_order_9_loop
-        ; write back the two limbs that were only held in registers
-        mov	QWORD PTR [rcx], r15
-        mov	QWORD PTR [rcx+8], rdi
-        ; rcx = a again, r8 = a + 64 (limb 8, where bit 512 starts)
-        mov	r8, rcx
-        sub	rcx, 72
-        sub	r8, 8
-        ; a[0..9] = a[8..17] >> 9
-        mov	rax, QWORD PTR [r8]
-        mov	rdx, QWORD PTR [r8+8]
-        mov	r10, QWORD PTR [r8+16]
-        mov	r11, QWORD PTR [r8+24]
-        mov	r13, QWORD PTR [r8+32]
-        shrd	rax, rdx, 9
-        shrd	rdx, r10, 9
-        shrd	r10, r11, 9
-        shrd	r11, r13, 9
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], rdx
-        mov	QWORD PTR [rcx+16], r10
-        mov	QWORD PTR [rcx+24], r11
-        mov	rdx, QWORD PTR [r8+40]
-        mov	r10, QWORD PTR [r8+48]
-        mov	r11, QWORD PTR [r8+56]
-        mov	rax, QWORD PTR [r8+64]
-        shrd	r13, rdx, 9
-        shrd	rdx, r10, 9
-        shrd	r10, r11, 9
-        shrd	r11, rax, 9
-        mov	QWORD PTR [rcx+32], r13
-        mov	QWORD PTR [rcx+40], rdx
-        mov	QWORD PTR [rcx+48], r10
-        mov	QWORD PTR [rcx+56], r11
-        mov	rdx, QWORD PTR [r8+72]
-        shrd	rax, rdx, 9
-        shr	rdx, 9
-        mov	QWORD PTR [rcx+64], rax
-        mov	QWORD PTR [rcx+72], rdx
-        ; mask = -(a[8] >> 9): non-zero when the result overflowed 521 bits
-        mov	rsi, QWORD PTR [rcx+64]
-        shr	rsi, 9
-        neg	rsi
-        ; sp_521_cond_sub_9(r = a, a = a, b = m, m = mask)
-        ; The modulus pointer must be copied out of r9 BEFORE r9 is loaded
-        ; with the mask. The former non-_WIN64 branch did it the other way
-        ; round and passed the mask as the b pointer; every procedure in this
-        ; file takes its arguments in rcx, rdx, r8, r9, so a single sequence
-        ; serves all builds.
-        mov	r8, r9
-        mov	r9, rsi
-        mov	rdx, rcx
-        ; No shadow space is reserved here; the callee is the leaf routine
-        ; defined in this file, which only uses its own 72-byte frame.
-        call	sp_521_cond_sub_9
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_521_mont_reduce_order_9 ENDP
-_text ENDS
-; /* Add two Montgomery form numbers (r = a + b % m).
-;  * Bits of the sum above bit 520 are added back into limb 0, which keeps the value congruent modulo 2^521 - 1.
-;  * r   Result of addition (rcx; 9 limbs written).
-;  * a   First number to add in Montgomery form (rdx; 9 limbs read).
-;  * b   Second number to add in Montgomery form (r8; 9 limbs read).
-;  * m   Modulus (prime). Not used: r9 is overwritten before it is read.
-;  */
-_text SEGMENT READONLY PARA
-sp_521_mont_add_9 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        mov	rax, QWORD PTR [rdx] ; rax, r9..r15, rdi = limbs 0..8 of a
-        mov	r9, QWORD PTR [rdx+8]
-        mov	r10, QWORD PTR [rdx+16]
-        mov	r11, QWORD PTR [rdx+24]
-        mov	r12, QWORD PTR [rdx+32]
-        mov	r13, QWORD PTR [rdx+40]
-        mov	r14, QWORD PTR [rdx+48]
-        mov	r15, QWORD PTR [rdx+56]
-        mov	rdi, QWORD PTR [rdx+64]
-        add	rax, QWORD PTR [r8] ; 9-limb add of b with carry propagation
-        adc	r9, QWORD PTR [r8+8]
-        adc	r10, QWORD PTR [r8+16]
-        adc	r11, QWORD PTR [r8+24]
-        adc	r12, QWORD PTR [r8+32]
-        adc	r13, QWORD PTR [r8+40]
-        adc	r14, QWORD PTR [r8+48]
-        adc	r15, QWORD PTR [r8+56]
-        adc	rdi, QWORD PTR [r8+64]
-        mov	rsi, rdi
-        and	rdi, 511 ; keep only the low 9 bits of the top limb (bits 512..520)
-        shr	rsi, 9 ; rsi = the part of the sum above bit 520
-        add	rax, rsi ; fold it back into limb 0 and propagate the carry
-        adc	r9, 0
-        adc	r10, 0
-        adc	r11, 0
-        adc	r12, 0
-        adc	r13, 0
-        adc	r14, 0
-        adc	r15, 0
-        adc	rdi, 0
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r9
-        mov	QWORD PTR [rcx+16], r10
-        mov	QWORD PTR [rcx+24], r11
-        mov	QWORD PTR [rcx+32], r12
-        mov	QWORD PTR [rcx+40], r13
-        mov	QWORD PTR [rcx+48], r14
-        mov	QWORD PTR [rcx+56], r15
-        mov	QWORD PTR [rcx+64], rdi
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_521_mont_add_9 ENDP
-_text ENDS
-; /* Double a Montgomery form number (r = a + a % m).
-;  * Bits of the sum above bit 520 are added back into limb 0, which keeps the value congruent modulo 2^521 - 1.
-;  * r   Result of doubling (rcx; 9 limbs written).
-;  * a   Number to double in Montgomery form (rdx; 9 limbs read).
-;  * m   Modulus (prime). Not used: r8 is overwritten before it is read.
-;  */
-_text SEGMENT READONLY PARA
-sp_521_mont_dbl_9 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        mov	rax, QWORD PTR [rdx] ; rax, r8..r15 = limbs 0..8 of a
-        mov	r8, QWORD PTR [rdx+8]
-        mov	r9, QWORD PTR [rdx+16]
-        mov	r10, QWORD PTR [rdx+24]
-        mov	r11, QWORD PTR [rdx+32]
-        mov	r12, QWORD PTR [rdx+40]
-        mov	r13, QWORD PTR [rdx+48]
-        mov	r14, QWORD PTR [rdx+56]
-        mov	r15, QWORD PTR [rdx+64]
-        add	rax, rax ; a + a as a 9-limb add with carry propagation
-        adc	r8, r8
-        adc	r9, r9
-        adc	r10, r10
-        adc	r11, r11
-        adc	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        adc	r15, r15
-        mov	rdi, r15
-        and	r15, 511 ; keep only the low 9 bits of the top limb (bits 512..520)
-        shr	rdi, 9 ; rdi = the part of the sum above bit 520
-        add	rax, rdi ; fold it back into limb 0 and propagate the carry
-        adc	r8, 0
-        adc	r9, 0
-        adc	r10, 0
-        adc	r11, 0
-        adc	r12, 0
-        adc	r13, 0
-        adc	r14, 0
-        adc	r15, 0
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r8
-        mov	QWORD PTR [rcx+16], r9
-        mov	QWORD PTR [rcx+24], r10
-        mov	QWORD PTR [rcx+32], r11
-        mov	QWORD PTR [rcx+40], r12
-        mov	QWORD PTR [rcx+48], r13
-        mov	QWORD PTR [rcx+56], r14
-        mov	QWORD PTR [rcx+64], r15
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_521_mont_dbl_9 ENDP
-_text ENDS
-; /* Triple a Montgomery form number (r = a + a + a % m).
-;  * Bits of the sum above bit 520 are added back into limb 0, which keeps the value congruent modulo 2^521 - 1.
-;  * r   Result of Tripling (rcx; 9 limbs written).
-;  * a   Number to triple in Montgomery form (rdx; 9 limbs read, twice).
-;  * m   Modulus (prime). Not used: r8 is overwritten before it is read.
-;  */
-_text SEGMENT READONLY PARA
-sp_521_mont_tpl_9 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        mov	rax, QWORD PTR [rdx] ; rax, r8..r15 = limbs 0..8 of a
-        mov	r8, QWORD PTR [rdx+8]
-        mov	r9, QWORD PTR [rdx+16]
-        mov	r10, QWORD PTR [rdx+24]
-        mov	r11, QWORD PTR [rdx+32]
-        mov	r12, QWORD PTR [rdx+40]
-        mov	r13, QWORD PTR [rdx+48]
-        mov	r14, QWORD PTR [rdx+56]
-        mov	r15, QWORD PTR [rdx+64]
-        add	rax, rax ; first a + a ...
-        adc	r8, r8
-        adc	r9, r9
-        adc	r10, r10
-        adc	r11, r11
-        adc	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        adc	r15, r15
-        add	rax, QWORD PTR [rdx] ; ... then + a again, re-read from memory; no reduction happens in between
-        adc	r8, QWORD PTR [rdx+8]
-        adc	r9, QWORD PTR [rdx+16]
-        adc	r10, QWORD PTR [rdx+24]
-        adc	r11, QWORD PTR [rdx+32]
-        adc	r12, QWORD PTR [rdx+40]
-        adc	r13, QWORD PTR [rdx+48]
-        adc	r14, QWORD PTR [rdx+56]
-        adc	r15, QWORD PTR [rdx+64]
-        mov	rdi, r15
-        and	r15, 511 ; keep only the low 9 bits of the top limb (bits 512..520)
-        shr	rdi, 9 ; rdi = the part of the sum above bit 520
-        add	rax, rdi ; fold it back into limb 0 and propagate the carry
-        adc	r8, 0
-        adc	r9, 0
-        adc	r10, 0
-        adc	r11, 0
-        adc	r12, 0
-        adc	r13, 0
-        adc	r14, 0
-        adc	r15, 0
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r8
-        mov	QWORD PTR [rcx+16], r9
-        mov	QWORD PTR [rcx+24], r10
-        mov	QWORD PTR [rcx+32], r11
-        mov	QWORD PTR [rcx+40], r12
-        mov	QWORD PTR [rcx+48], r13
-        mov	QWORD PTR [rcx+56], r14
-        mov	QWORD PTR [rcx+64], r15
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_521_mont_tpl_9 ENDP
-_text ENDS
-; /* Subtract two Montgomery form numbers (r = a - b % m).
-;  * The top limb's bits above bit 8, taken as a signed value, are added back into limb 0 (congruent modulo 2^521 - 1).
-;  * r   Result of subtraction (rcx; 9 limbs written).
-;  * a   Number to subtract from in Montgomery form (rdx; 9 limbs read).
-;  * b   Number to subtract in Montgomery form (r8; 9 limbs read).
-;  * m   Modulus (prime). Not used: r9 is overwritten before it is read.
-;  */
-_text SEGMENT READONLY PARA
-sp_521_mont_sub_9 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        mov	rax, QWORD PTR [rdx] ; rax, r9..r15, rdi = limbs 0..8 of a
-        mov	r9, QWORD PTR [rdx+8]
-        mov	r10, QWORD PTR [rdx+16]
-        mov	r11, QWORD PTR [rdx+24]
-        mov	r12, QWORD PTR [rdx+32]
-        mov	r13, QWORD PTR [rdx+40]
-        mov	r14, QWORD PTR [rdx+48]
-        mov	r15, QWORD PTR [rdx+56]
-        mov	rdi, QWORD PTR [rdx+64]
-        sub	rax, QWORD PTR [r8] ; 9-limb subtract of b with borrow propagation
-        sbb	r9, QWORD PTR [r8+8]
-        sbb	r10, QWORD PTR [r8+16]
-        sbb	r11, QWORD PTR [r8+24]
-        sbb	r12, QWORD PTR [r8+32]
-        sbb	r13, QWORD PTR [r8+40]
-        sbb	r14, QWORD PTR [r8+48]
-        sbb	r15, QWORD PTR [r8+56]
-        sbb	rdi, QWORD PTR [r8+64]
-        mov	rsi, rdi
-        and	rdi, 511 ; keep only the low 9 bits of the top limb (bits 512..520)
-        sar	rsi, 9 ; arithmetic shift: rsi = signed value of the top limb above bit 8 (negative if a < b)
-        neg	rsi ; rsi = amount to take off the low end (1 when the shifted value was -1)
-        sub	rax, rsi ; apply the correction to limb 0 and propagate the borrow
-        sbb	r9, 0
-        sbb	r10, 0
-        sbb	r11, 0
-        sbb	r12, 0
-        sbb	r13, 0
-        sbb	r14, 0
-        sbb	r15, 0
-        sbb	rdi, 0
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r9
-        mov	QWORD PTR [rcx+16], r10
-        mov	QWORD PTR [rcx+24], r11
-        mov	QWORD PTR [rcx+32], r12
-        mov	QWORD PTR [rcx+40], r13
-        mov	QWORD PTR [rcx+48], r14
-        mov	QWORD PTR [rcx+56], r15
-        mov	QWORD PTR [rcx+64], rdi
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_521_mont_sub_9 ENDP
-_text ENDS
-; /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m)
-;  * When a is odd, 2^521 - 1 is added first (subtract 1, then set bit 521) so that the value halved is even.
-;  * r  Result of division by 2 (rcx; 9 limbs written).
-;  * a  Number to divide (rdx; 9 limbs read).
-;  * m  Modulus (prime). Not used: r8 is overwritten before it is read.
-;  */
-_text SEGMENT READONLY PARA
-sp_521_mont_div2_9 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        mov	rax, QWORD PTR [rdx] ; rax, r8..r15 = limbs 0..8 of a
-        mov	r8, QWORD PTR [rdx+8]
-        mov	r9, QWORD PTR [rdx+16]
-        mov	r10, QWORD PTR [rdx+24]
-        mov	r11, QWORD PTR [rdx+32]
-        mov	r12, QWORD PTR [rdx+40]
-        mov	r13, QWORD PTR [rdx+48]
-        mov	r14, QWORD PTR [rdx+56]
-        mov	r15, QWORD PTR [rdx+64]
-        mov	rdi, rax
-        and	rdi, 1 ; rdi = 1 if a is odd, else 0
-        sub	rax, rdi ; a - (a & 1), applied without a branch
-        sbb	r8, 0
-        sbb	r9, 0
-        sbb	r10, 0
-        sbb	r11, 0
-        sbb	r12, 0
-        sbb	r13, 0
-        sbb	r14, 0
-        sbb	r15, 0
-        shl	rdi, 9 ; bit 9 of limb 8 is bit 521 of the number
-        add	r15, rdi ; + 2^521 when a was odd
-        shrd	rax, r8, 1 ; shift the whole 9-limb value right by one bit
-        shrd	r8, r9, 1
-        shrd	r9, r10, 1
-        shrd	r10, r11, 1
-        shrd	r11, r12, 1
-        shrd	r12, r13, 1
-        shrd	r13, r14, 1
-        shrd	r14, r15, 1
-        shr	r15, 1
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r8
-        mov	QWORD PTR [rcx+16], r9
-        mov	QWORD PTR [rcx+24], r10
-        mov	QWORD PTR [rcx+32], r11
-        mov	QWORD PTR [rcx+40], r12
-        mov	QWORD PTR [rcx+48], r13
-        mov	QWORD PTR [rcx+56], r14
-        mov	QWORD PTR [rcx+64], r15
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_521_mont_div2_9 ENDP
-_text ENDS
-IFNDEF WC_NO_CACHE_RESISTANT
-; /* Touch each possible point that could be being copied.
-;  *
-;  * Every one of the 32 table entries numbered 1..32 is read; each is ANDed
-;  * with a mask that is all ones only when its number equals idx, and the
-;  * masked values are ORed together. The memory access pattern therefore
-;  * does not depend on idx. No entry matches idx == 0, so the copied fields
-;  * are written as zero in that case.
-;  *
-;  * Entries are 440 bytes apart. Three 72-byte fields are copied, at entry
-;  * offsets 0, 144 and 288 (presumably the x, y and z coordinates - confirm
-;  * against the point structure). The work is split into two passes over
-;  * the table because of the number of accumulator registers needed.
-;  *
-;  * This routine is the non-AVX2 variant, so only SSE2 instructions are
-;  * used, including for the save/restore of the callee-saved xmm6-xmm15.
-;  *
-;  * r      Point to copy into (rcx).
-;  * table  Table - start of the entries to access (rdx).
-;  * idx    Index of point to retrieve (r8; the low 32 bits feed the vector
-;  *        mask, the full register feeds the scalar mask).
-;  */
-_text SEGMENT READONLY PARA
-sp_521_get_point_33_9 PROC
-        push	r12
-        push	r13
-        push	r14
-        sub	rsp, 160
-        ; Save xmm6-xmm15 with SSE2 movdqu. The VEX-encoded vmovdqu used
-        ; previously raises #UD on processors or operating systems without
-        ; AVX support, which this variant is meant to run on.
-        movdqu	OWORD PTR [rsp], xmm6
-        movdqu	OWORD PTR [rsp+16], xmm7
-        movdqu	OWORD PTR [rsp+32], xmm8
-        movdqu	OWORD PTR [rsp+48], xmm9
-        movdqu	OWORD PTR [rsp+64], xmm10
-        movdqu	OWORD PTR [rsp+80], xmm11
-        movdqu	OWORD PTR [rsp+96], xmm12
-        movdqu	OWORD PTR [rsp+112], xmm13
-        movdqu	OWORD PTR [rsp+128], xmm14
-        movdqu	OWORD PTR [rsp+144], xmm15
-        ; Pass 1: bytes 0..71 and 144..175 of the selected entry.
-        ; r14 = scalar entry counter, xmm14 = the same counter in 4 lanes,
-        ; xmm15 = 1 in each lane, xmm13 = idx in each lane, rax = loop count.
-        mov	r14, 1
-        mov	rax, 1
-        movd	xmm13, r8d
-        ; skip entry 0
-        add	rdx, 440
-        movd	xmm15, eax
-        mov	rax, 32
-        pshufd	xmm15, xmm15, 0
-        pshufd	xmm13, xmm13, 0
-        pxor	xmm14, xmm14
-        pxor	xmm0, xmm0
-        pxor	xmm1, xmm1
-        pxor	xmm2, xmm2
-        pxor	xmm3, xmm3
-        pxor	xmm4, xmm4
-        pxor	xmm5, xmm5
-        xor	r12, r12
-        xor	r13, r13
-        movdqa	xmm14, xmm15
-L_521_get_point_33_9_start_1:
-        ; xmm12 = vector mask: all ones when the entry number equals idx
-        movdqa	xmm12, xmm14
-        paddd	xmm14, xmm15
-        pcmpeqd	xmm12, xmm13
-        ; r9 = scalar mask: -1 when the entry number equals idx, else 0
-        xor	r9, r9
-        cmp	r8, r14
-        sete	r9b
-        neg	r9
-        inc	r14
-        movdqu	xmm6, [rdx]
-        movdqu	xmm7, [rdx+16]
-        movdqu	xmm8, [rdx+32]
-        movdqu	xmm9, [rdx+48]
-        mov	r10, QWORD PTR [rdx+64]
-        movdqu	xmm10, [rdx+144]
-        movdqu	xmm11, [rdx+160]
-        add	rdx, 440
-        pand	xmm6, xmm12
-        pand	xmm7, xmm12
-        pand	xmm8, xmm12
-        pand	xmm9, xmm12
-        pand	xmm10, xmm12
-        pand	xmm11, xmm12
-        and	r10, r9
-        por	xmm0, xmm6
-        por	xmm1, xmm7
-        por	xmm2, xmm8
-        por	xmm3, xmm9
-        por	xmm4, xmm10
-        por	xmm5, xmm11
-        or	r12, r10
-        dec	rax
-        jnz	L_521_get_point_33_9_start_1
-        movdqu	[rcx], xmm0
-        movdqu	[rcx+16], xmm1
-        movdqu	[rcx+32], xmm2
-        movdqu	[rcx+48], xmm3
-        mov	QWORD PTR [rcx+64], r12
-        movdqu	[rcx+144], xmm4
-        movdqu	[rcx+160], xmm5
-        ; Pass 2: bytes 176..215 and 288..359 of the selected entry.
-        mov	r14, 1
-        mov	rax, 1
-        movd	xmm13, r8d
-        ; 14080 = 32 * 440: rewind rdx to entry 1
-        sub	rdx, 14080
-        movd	xmm15, eax
-        mov	rax, 32
-        pshufd	xmm15, xmm15, 0
-        pshufd	xmm13, xmm13, 0
-        pxor	xmm14, xmm14
-        pxor	xmm0, xmm0
-        pxor	xmm1, xmm1
-        pxor	xmm2, xmm2
-        pxor	xmm3, xmm3
-        pxor	xmm4, xmm4
-        pxor	xmm5, xmm5
-        xor	r12, r12
-        xor	r13, r13
-        movdqa	xmm14, xmm15
-L_521_get_point_33_9_start_2:
-        movdqa	xmm12, xmm14
-        paddd	xmm14, xmm15
-        pcmpeqd	xmm12, xmm13
-        xor	r9, r9
-        cmp	r8, r14
-        sete	r9b
-        neg	r9
-        inc	r14
-        movdqu	xmm6, [rdx+176]
-        movdqu	xmm7, [rdx+192]
-        mov	r10, QWORD PTR [rdx+208]
-        movdqu	xmm8, [rdx+288]
-        movdqu	xmm9, [rdx+304]
-        movdqu	xmm10, [rdx+320]
-        movdqu	xmm11, [rdx+336]
-        mov	r11, QWORD PTR [rdx+352]
-        add	rdx, 440
-        pand	xmm6, xmm12
-        pand	xmm7, xmm12
-        pand	xmm8, xmm12
-        pand	xmm9, xmm12
-        pand	xmm10, xmm12
-        pand	xmm11, xmm12
-        and	r10, r9
-        and	r11, r9
-        por	xmm0, xmm6
-        por	xmm1, xmm7
-        por	xmm2, xmm8
-        por	xmm3, xmm9
-        por	xmm4, xmm10
-        por	xmm5, xmm11
-        or	r12, r10
-        or	r13, r11
-        dec	rax
-        jnz	L_521_get_point_33_9_start_2
-        movdqu	[rcx+176], xmm0
-        movdqu	[rcx+192], xmm1
-        mov	QWORD PTR [rcx+208], r12
-        movdqu	[rcx+288], xmm2
-        movdqu	[rcx+304], xmm3
-        movdqu	[rcx+320], xmm4
-        movdqu	[rcx+336], xmm5
-        mov	QWORD PTR [rcx+352], r13
-        ; Restore the callee-saved vector registers (SSE2 encoding, as above).
-        movdqu	xmm6, OWORD PTR [rsp]
-        movdqu	xmm7, OWORD PTR [rsp+16]
-        movdqu	xmm8, OWORD PTR [rsp+32]
-        movdqu	xmm9, OWORD PTR [rsp+48]
-        movdqu	xmm10, OWORD PTR [rsp+64]
-        movdqu	xmm11, OWORD PTR [rsp+80]
-        movdqu	xmm12, OWORD PTR [rsp+96]
-        movdqu	xmm13, OWORD PTR [rsp+112]
-        movdqu	xmm14, OWORD PTR [rsp+128]
-        movdqu	xmm15, OWORD PTR [rsp+144]
-        add	rsp, 160
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_521_get_point_33_9 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Touch each possible point that could be being copied.
-;  * All 32 entries numbered 1..32 (440 bytes apart) are read, masked by (entry number == idx) and ORed together.
-;  * r      Point to copy into (rcx); bytes 0..71, 144..215 and 288..359 are written.
-;  * table  Table - start of the entries to access (rdx); entry 0 is skipped.
-;  * idx    Index of point to retrieve (r8; low 32 bits feed the vector mask, the full register the scalar mask).
-;  */
-_text SEGMENT READONLY PARA
-sp_521_get_point_33_avx2_9 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        sub	rsp, 160
-        vmovdqu	OWORD PTR [rsp], xmm6 ; xmm6-xmm15 are all used below; save them in the 160-byte frame
-        vmovdqu	OWORD PTR [rsp+16], xmm7
-        vmovdqu	OWORD PTR [rsp+32], xmm8
-        vmovdqu	OWORD PTR [rsp+48], xmm9
-        vmovdqu	OWORD PTR [rsp+64], xmm10
-        vmovdqu	OWORD PTR [rsp+80], xmm11
-        vmovdqu	OWORD PTR [rsp+96], xmm12
-        vmovdqu	OWORD PTR [rsp+112], xmm13
-        vmovdqu	OWORD PTR [rsp+128], xmm14
-        vmovdqu	OWORD PTR [rsp+144], xmm15
-        mov	rdi, 1 ; rdi = scalar entry counter, starts at 1
-        mov	rax, 1
-        movd	xmm13, r8d ; lane 0 of ymm13 = low 32 bits of idx
-        add	rdx, 440 ; skip entry 0; entries are 440 bytes apart
-        movd	xmm15, eax ; lane 0 of ymm15 = 1
-        mov	rax, 32 ; rax = loop count: 32 entries are scanned
-        vpxor	ymm14, ymm14, ymm14 ; all-zero lane selector for the two vpermd below
-        vpermd	ymm13, ymm14, ymm13 ; broadcast lane 0 (idx) to all 8 lanes
-        vpermd	ymm15, ymm14, ymm15 ; broadcast lane 0 (1) to all 8 lanes
-        vpxor	ymm0, ymm0, ymm0 ; ymm0-ymm5 and r10-r12 accumulate the selected entry
-        vpxor	ymm1, ymm1, ymm1
-        vpxor	ymm2, ymm2, ymm2
-        vpxor	ymm3, ymm3, ymm3
-        vpxor	ymm4, ymm4, ymm4
-        vpxor	ymm5, ymm5, ymm5
-        xor	r10, r10
-        xor	r11, r11
-        xor	r12, r12
-        vmovdqa	ymm14, ymm15 ; ymm14 = vector entry counter, starts at 1 in every lane
-L_521_get_point_33_avx2_9_start:
-        vpcmpeqd	ymm12, ymm14, ymm13 ; ymm12 = vector mask: all ones when entry number == idx
-        vpaddd	ymm14, ymm14, ymm15 ; advance the vector entry counter
-        xor	r9, r9
-        cmp	r8, rdi
-        sete	r9b
-        neg	r9 ; r9 = scalar mask: -1 when entry number == idx, else 0
-        inc	rdi
-        vmovupd	ymm6, YMMWORD PTR [rdx] ; field at offset 0: 64 vector bytes here + qword at 64
-        vmovupd	ymm7, YMMWORD PTR [rdx+32]
-        vmovupd	ymm8, YMMWORD PTR [rdx+144] ; field at offset 144: 64 vector bytes + qword at 208
-        vmovupd	ymm9, YMMWORD PTR [rdx+176]
-        vmovupd	ymm10, YMMWORD PTR [rdx+288] ; field at offset 288: 64 vector bytes + qword at 352
-        vmovupd	ymm11, YMMWORD PTR [rdx+320]
-        mov	r13, QWORD PTR [rdx+64]
-        mov	r14, QWORD PTR [rdx+208]
-        mov	r15, QWORD PTR [rdx+352]
-        add	rdx, 440 ; next table entry
-        vpand	ymm6, ymm6, ymm12 ; zero everything unless this is the requested entry
-        vpand	ymm7, ymm7, ymm12
-        vpand	ymm8, ymm8, ymm12
-        vpand	ymm9, ymm9, ymm12
-        vpand	ymm10, ymm10, ymm12
-        vpand	ymm11, ymm11, ymm12
-        and	r13, r9
-        and	r14, r9
-        and	r15, r9
-        vpor	ymm0, ymm0, ymm6 ; merge into the accumulators
-        vpor	ymm1, ymm1, ymm7
-        vpor	ymm2, ymm2, ymm8
-        vpor	ymm3, ymm3, ymm9
-        vpor	ymm4, ymm4, ymm10
-        vpor	ymm5, ymm5, ymm11
-        or	r10, r13
-        or	r11, r14
-        or	r12, r15
-        dec	rax
-        jnz	L_521_get_point_33_avx2_9_start
-        vmovupd	YMMWORD PTR [rcx], ymm0 ; write the three 72-byte fields of r
-        vmovupd	YMMWORD PTR [rcx+32], ymm1
-        vmovupd	YMMWORD PTR [rcx+144], ymm2
-        vmovupd	YMMWORD PTR [rcx+176], ymm3
-        vmovupd	YMMWORD PTR [rcx+288], ymm4
-        vmovupd	YMMWORD PTR [rcx+320], ymm5
-        mov	QWORD PTR [rcx+64], r10
-        mov	QWORD PTR [rcx+208], r11
-        mov	QWORD PTR [rcx+352], r12
-        vmovdqu	xmm6, OWORD PTR [rsp] ; restore the saved xmm6-xmm15
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vmovdqu	xmm8, OWORD PTR [rsp+32]
-        vmovdqu	xmm9, OWORD PTR [rsp+48]
-        vmovdqu	xmm10, OWORD PTR [rsp+64]
-        vmovdqu	xmm11, OWORD PTR [rsp+80]
-        vmovdqu	xmm12, OWORD PTR [rsp+96]
-        vmovdqu	xmm13, OWORD PTR [rsp+112]
-        vmovdqu	xmm14, OWORD PTR [rsp+128]
-        vmovdqu	xmm15, OWORD PTR [rsp+144]
-        add	rsp, 160
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret ; NOTE(review): ymm0-ymm5 were written and no vzeroupper is issued before returning - confirm callers do not need one
-sp_521_get_point_33_avx2_9 ENDP
-_text ENDS
-ENDIF
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-; /* Multiply two Montgomery form numbers mod the modulus (prime).
-;  * (r = a * b mod m)
-;  *
-;  * r   Result of multiplication.
-;  * a   First number to multiply in Montgomery form.
-;  * b   Second number to multiply in Montgomery form.
-;  * m   Modulus (prime).
-;  * mp  Montgomery multiplier.
-;  *
-;  * Arguments arrive in rcx (r), rdx (a) and r8 (b).  The m and mp arguments
-;  * are never read: r9 is overwritten before use and no stack argument is
-;  * loaded.  The reduction is hard-wired to fold everything from bit 521
-;  * upwards back onto the low 521 bits.
-;  *
-;  * Register roles after the prologue:
-;  *   r8  = r, r9 = a, rbp = b
-;  *   rbx = rsp = T, an 18-word scratch product (word w lives at [rbx+8*w])
-;  *   r15 = constant zero, r14 = carry word handed from one row to the next
-;  *   r10-r13 = rotating accumulators, rax/rcx = mulx low/high halves
-;  *
-;  * rsp is kept at the base of T for the whole function so that no live data
-;  * is ever stored below the stack pointer (there is no red zone in the
-;  * Microsoft x64 calling convention).
-;  */
-_text SEGMENT READONLY PARA
-sp_521_mont_mul_avx2_9 PROC
-        push	rbx
-        push	rbp
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        mov	rbp, r8
-        mov	r8, rcx
-        mov	r9, rdx
-        ; Reserve T (18 words); rbx and rsp both point at T[0].
-        sub	rsp, 144
-        mov	rbx, rsp
-        ; r15 = 0; also clears CF and OF for the adcx/adox chains.
-        xor	r15, r15
-        mov	rdx, QWORD PTR [r9]
-        ; A[0] * B[0]
-        mulx	r11, r10, QWORD PTR [rbp]
-        ; A[0] * B[1]
-        mulx	r12, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx], r10
-        adcx	r11, rax
-        ; A[0] * B[2]
-        mulx	r13, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+8], r11
-        adcx	r12, rax
-        mov	QWORD PTR [rbx+16], r12
-        ; A[0] * B[3]
-        mulx	r10, rax, QWORD PTR [rbp+24]
-        adcx	r13, rax
-        ; A[0] * B[4]
-        mulx	r11, rax, QWORD PTR [rbp+32]
-        mov	QWORD PTR [rbx+24], r13
-        adcx	r10, rax
-        ; A[0] * B[5]
-        mulx	r12, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+32], r10
-        adcx	r11, rax
-        mov	QWORD PTR [rbx+40], r11
-        ; A[0] * B[6]
-        mulx	r13, rax, QWORD PTR [rbp+48]
-        adcx	r12, rax
-        ; A[0] * B[7]
-        mulx	r10, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+48], r12
-        adcx	r13, rax
-        ; A[0] * B[8]
-        mulx	r11, rax, QWORD PTR [rbp+64]
-        mov	QWORD PTR [rbx+56], r13
-        adcx	r10, rax
-        adcx	r11, r15
-        mov	r14, r15
-        adcx	r14, r15
-        mov	QWORD PTR [rbx+64], r10
-        mov	QWORD PTR [rbx+72], r11
-        mov	rdx, QWORD PTR [r9+8]
-        mov	r11, QWORD PTR [rbx+8]
-        mov	r12, QWORD PTR [rbx+16]
-        mov	r13, QWORD PTR [rbx+24]
-        mov	r10, QWORD PTR [rbx+32]
-        ; A[1] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[1] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+8], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[1] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+16], r12
-        adcx	r13, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+24], r13
-        mov	r11, QWORD PTR [rbx+40]
-        mov	r12, QWORD PTR [rbx+48]
-        mov	r13, QWORD PTR [rbx+56]
-        ; A[1] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[1] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        mov	QWORD PTR [rbx+32], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[1] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+40], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [rbx+48], r12
-        mov	r10, QWORD PTR [rbx+64]
-        mov	r11, QWORD PTR [rbx+72]
-        ; A[1] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        adcx	r13, rax
-        adox	r10, rcx
-        ; A[1] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+56], r13
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[1] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        mov	QWORD PTR [rbx+64], r10
-        mov	r12, r15
-        adcx	r11, rax
-        adox	r12, rcx
-        adcx	r12, r14
-        mov	r14, r15
-        adox	r14, r15
-        adcx	r14, r15
-        mov	QWORD PTR [rbx+72], r11
-        mov	QWORD PTR [rbx+80], r12
-        mov	rdx, QWORD PTR [r9+16]
-        mov	r12, QWORD PTR [rbx+16]
-        mov	r13, QWORD PTR [rbx+24]
-        mov	r10, QWORD PTR [rbx+32]
-        mov	r11, QWORD PTR [rbx+40]
-        ; A[2] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[2] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+16], r12
-        adcx	r13, rax
-        adox	r10, rcx
-        ; A[2] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+24], r13
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbx+32], r10
-        mov	r12, QWORD PTR [rbx+48]
-        mov	r13, QWORD PTR [rbx+56]
-        mov	r10, QWORD PTR [rbx+64]
-        ; A[2] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[2] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        mov	QWORD PTR [rbx+40], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[2] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+48], r12
-        adcx	r13, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+56], r13
-        mov	r11, QWORD PTR [rbx+72]
-        mov	r12, QWORD PTR [rbx+80]
-        ; A[2] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[2] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+64], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[2] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        mov	QWORD PTR [rbx+72], r11
-        mov	r13, r15
-        adcx	r12, rax
-        adox	r13, rcx
-        adcx	r13, r14
-        mov	r14, r15
-        adox	r14, r15
-        adcx	r14, r15
-        mov	QWORD PTR [rbx+80], r12
-        mov	QWORD PTR [rbx+88], r13
-        mov	rdx, QWORD PTR [r9+24]
-        mov	r13, QWORD PTR [rbx+24]
-        mov	r10, QWORD PTR [rbx+32]
-        mov	r11, QWORD PTR [rbx+40]
-        mov	r12, QWORD PTR [rbx+48]
-        ; A[3] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r13, rax
-        adox	r10, rcx
-        ; A[3] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+24], r13
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[3] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+32], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbx+40], r11
-        mov	r13, QWORD PTR [rbx+56]
-        mov	r10, QWORD PTR [rbx+64]
-        mov	r11, QWORD PTR [rbx+72]
-        ; A[3] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[3] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        mov	QWORD PTR [rbx+48], r12
-        adcx	r13, rax
-        adox	r10, rcx
-        ; A[3] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+56], r13
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbx+64], r10
-        mov	r12, QWORD PTR [rbx+80]
-        mov	r13, QWORD PTR [rbx+88]
-        ; A[3] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[3] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+72], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[3] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        mov	QWORD PTR [rbx+80], r12
-        mov	r10, r15
-        adcx	r13, rax
-        adox	r10, rcx
-        adcx	r10, r14
-        mov	r14, r15
-        adox	r14, r15
-        adcx	r14, r15
-        mov	QWORD PTR [rbx+88], r13
-        mov	QWORD PTR [rbx+96], r10
-        mov	rdx, QWORD PTR [r9+32]
-        mov	r10, QWORD PTR [rbx+32]
-        mov	r11, QWORD PTR [rbx+40]
-        mov	r12, QWORD PTR [rbx+48]
-        mov	r13, QWORD PTR [rbx+56]
-        ; A[4] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[4] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+32], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[4] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+40], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [rbx+48], r12
-        mov	r10, QWORD PTR [rbx+64]
-        mov	r11, QWORD PTR [rbx+72]
-        mov	r12, QWORD PTR [rbx+80]
-        ; A[4] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        adcx	r13, rax
-        adox	r10, rcx
-        ; A[4] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        mov	QWORD PTR [rbx+56], r13
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[4] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+64], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbx+72], r11
-        mov	r13, QWORD PTR [rbx+88]
-        mov	r10, QWORD PTR [rbx+96]
-        ; A[4] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[4] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+80], r12
-        adcx	r13, rax
-        adox	r10, rcx
-        ; A[4] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        mov	QWORD PTR [rbx+88], r13
-        mov	r11, r15
-        adcx	r10, rax
-        adox	r11, rcx
-        adcx	r11, r14
-        mov	r14, r15
-        adox	r14, r15
-        adcx	r14, r15
-        mov	QWORD PTR [rbx+96], r10
-        mov	QWORD PTR [rbx+104], r11
-        mov	rdx, QWORD PTR [r9+40]
-        mov	r11, QWORD PTR [rbx+40]
-        mov	r12, QWORD PTR [rbx+48]
-        mov	r13, QWORD PTR [rbx+56]
-        mov	r10, QWORD PTR [rbx+64]
-        ; A[5] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[5] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+40], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[5] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+48], r12
-        adcx	r13, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+56], r13
-        mov	r11, QWORD PTR [rbx+72]
-        mov	r12, QWORD PTR [rbx+80]
-        mov	r13, QWORD PTR [rbx+88]
-        ; A[5] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[5] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        mov	QWORD PTR [rbx+64], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[5] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+72], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [rbx+80], r12
-        mov	r10, QWORD PTR [rbx+96]
-        mov	r11, QWORD PTR [rbx+104]
-        ; A[5] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        adcx	r13, rax
-        adox	r10, rcx
-        ; A[5] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+88], r13
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[5] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        mov	QWORD PTR [rbx+96], r10
-        mov	r12, r15
-        adcx	r11, rax
-        adox	r12, rcx
-        adcx	r12, r14
-        mov	r14, r15
-        adox	r14, r15
-        adcx	r14, r15
-        mov	QWORD PTR [rbx+104], r11
-        mov	QWORD PTR [rbx+112], r12
-        mov	rdx, QWORD PTR [r9+48]
-        mov	r12, QWORD PTR [rbx+48]
-        mov	r13, QWORD PTR [rbx+56]
-        mov	r10, QWORD PTR [rbx+64]
-        mov	r11, QWORD PTR [rbx+72]
-        ; A[6] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[6] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+48], r12
-        adcx	r13, rax
-        adox	r10, rcx
-        ; A[6] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+56], r13
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbx+64], r10
-        mov	r12, QWORD PTR [rbx+80]
-        mov	r13, QWORD PTR [rbx+88]
-        mov	r10, QWORD PTR [rbx+96]
-        ; A[6] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[6] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        mov	QWORD PTR [rbx+72], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[6] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+80], r12
-        adcx	r13, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+88], r13
-        mov	r11, QWORD PTR [rbx+104]
-        mov	r12, QWORD PTR [rbx+112]
-        ; A[6] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[6] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+96], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[6] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        mov	QWORD PTR [rbx+104], r11
-        mov	r13, r15
-        adcx	r12, rax
-        adox	r13, rcx
-        adcx	r13, r14
-        mov	r14, r15
-        adox	r14, r15
-        adcx	r14, r15
-        mov	QWORD PTR [rbx+112], r12
-        mov	QWORD PTR [rbx+120], r13
-        mov	rdx, QWORD PTR [r9+56]
-        mov	r13, QWORD PTR [rbx+56]
-        mov	r10, QWORD PTR [rbx+64]
-        mov	r11, QWORD PTR [rbx+72]
-        mov	r12, QWORD PTR [rbx+80]
-        ; A[7] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r13, rax
-        adox	r10, rcx
-        ; A[7] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+56], r13
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[7] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+64], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbx+72], r11
-        mov	r13, QWORD PTR [rbx+88]
-        mov	r10, QWORD PTR [rbx+96]
-        mov	r11, QWORD PTR [rbx+104]
-        ; A[7] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[7] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        mov	QWORD PTR [rbx+80], r12
-        adcx	r13, rax
-        adox	r10, rcx
-        ; A[7] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+88], r13
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbx+96], r10
-        mov	r12, QWORD PTR [rbx+112]
-        mov	r13, QWORD PTR [rbx+120]
-        ; A[7] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[7] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+104], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[7] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        mov	QWORD PTR [rbx+112], r12
-        mov	r10, r15
-        adcx	r13, rax
-        adox	r10, rcx
-        adcx	r10, r14
-        mov	r14, r15
-        adox	r14, r15
-        adcx	r14, r15
-        mov	QWORD PTR [rbx+120], r13
-        mov	QWORD PTR [rbx+128], r10
-        mov	rdx, QWORD PTR [r9+64]
-        mov	r10, QWORD PTR [rbx+64]
-        mov	r11, QWORD PTR [rbx+72]
-        mov	r12, QWORD PTR [rbx+80]
-        mov	r13, QWORD PTR [rbx+88]
-        ; A[8] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[8] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+64], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[8] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+72], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [rbx+80], r12
-        mov	r10, QWORD PTR [rbx+96]
-        mov	r11, QWORD PTR [rbx+104]
-        mov	r12, QWORD PTR [rbx+112]
-        ; A[8] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        adcx	r13, rax
-        adox	r10, rcx
-        ; A[8] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        mov	QWORD PTR [rbx+88], r13
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[8] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+96], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbx+104], r11
-        mov	r13, QWORD PTR [rbx+120]
-        mov	r10, QWORD PTR [rbx+128]
-        ; A[8] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[8] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+112], r12
-        adcx	r13, rax
-        adox	r10, rcx
-        ; A[8] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        mov	QWORD PTR [rbx+120], r13
-        mov	r11, r15
-        adcx	r10, rax
-        adox	r11, rcx
-        adcx	r11, r14
-        mov	QWORD PTR [rbx+128], r10
-        mov	QWORD PTR [rbx+136], r11
-        ; Reduce: load T[8..16]; r15 keeps the low 9 bits of T[8]
-        ; (bits 512..520 of the product).  rbx is reloaded below, so the
-        ; buffer is addressed through rsp from here on.
-        mov	rax, QWORD PTR [rsp+64]
-        mov	rcx, QWORD PTR [rsp+72]
-        mov	r10, QWORD PTR [rsp+80]
-        mov	r15, rax
-        and	r15, 511
-        mov	r11, QWORD PTR [rsp+88]
-        mov	r12, QWORD PTR [rsp+96]
-        mov	r13, QWORD PTR [rsp+104]
-        mov	r14, QWORD PTR [rsp+112]
-        mov	rbx, QWORD PTR [rsp+120]
-        mov	rdx, QWORD PTR [rsp+128]
-        ; Shift T[8..16] right by 9 bits: the part of the product at and
-        ; above bit 521, aligned to bit 0.
-        shrd	rax, rcx, 9
-        shrd	rcx, r10, 9
-        shrd	r10, r11, 9
-        shrd	r11, r12, 9
-        shrd	r12, r13, 9
-        shrd	r13, r14, 9
-        shrd	r14, rbx, 9
-        shrd	rbx, rdx, 9
-        shr	rdx, 9
-        ; Add the shifted high part onto T[0..7] and the masked T[8].
-        add	rax, QWORD PTR [rsp]
-        adc	rcx, QWORD PTR [rsp+8]
-        adc	r10, QWORD PTR [rsp+16]
-        adc	r11, QWORD PTR [rsp+24]
-        adc	r12, QWORD PTR [rsp+32]
-        adc	r13, QWORD PTR [rsp+40]
-        adc	r14, QWORD PTR [rsp+48]
-        adc	rbx, QWORD PTR [rsp+56]
-        adc	r15, rdx
-        ; Fold once more: anything that spilled past bit 8 of the top word
-        ; is added back in at bit 0.
-        mov	rdx, r15
-        shr	r15, 9
-        and	rdx, 511
-        add	rax, r15
-        adc	rcx, 0
-        adc	r10, 0
-        adc	r11, 0
-        adc	r12, 0
-        adc	r13, 0
-        adc	r14, 0
-        adc	rbx, 0
-        adc	rdx, 0
-        mov	QWORD PTR [r8], rax
-        mov	QWORD PTR [r8+8], rcx
-        mov	QWORD PTR [r8+16], r10
-        mov	QWORD PTR [r8+24], r11
-        mov	QWORD PTR [r8+32], r12
-        mov	QWORD PTR [r8+40], r13
-        mov	QWORD PTR [r8+48], r14
-        mov	QWORD PTR [r8+56], rbx
-        mov	QWORD PTR [r8+64], rdx
-        add	rsp, 144
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        pop	rbp
-        pop	rbx
-        ret
-sp_521_mont_mul_avx2_9 ENDP
-_text ENDS
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-; /* Square the Montgomery form number mod the modulus (prime). (r = a * a mod m)
-;  *
-;  * r   Result of squaring.
-;  * a   Number to square in Montgomery form.
-;  * m   Modulus (prime).
-;  * mp  Montgomery multiplier.
-;  *
-;  * Arguments arrive in rcx (r) and rdx (a).  The m and mp arguments are never
-;  * read: the incoming r8 is overwritten, r9 is overwritten before use, and
-;  * no stack argument is loaded.  The reduction is hard-wired to fold
-;  * everything from bit 521 upwards back onto the low 521 bits.
-;  *
-;  * Register roles after the prologue:
-;  *   r8  = r, r9 = a
-;  *   rbp = rsp = T, an 18-word scratch product (word w lives at [rbp+8*w])
-;  *   r12 = constant zero, r13 = carry word handed between diagonals
-;  *   r14, r15, rdi, rsi, rbx = product words 4..8, kept in registers until
-;  *                             the doubling step has finished
-;  *   r10, r11 = rotating accumulators, rax/rcx = mulx low/high halves
-;  *
-;  * The off-diagonal products A[i] x A[j] (i > j) are summed first, then the
-;  * sum is doubled (adox chain) while the squares A[i] x A[i] are added
-;  * (adcx chain).
-;  *
-;  * rsp is kept at the base of T for the whole function so that no live data
-;  * is ever stored below the stack pointer (there is no red zone in the
-;  * Microsoft x64 calling convention).
-;  */
-_text SEGMENT READONLY PARA
-sp_521_mont_sqr_avx2_9 PROC
-        push	rbp
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        push	rbx
-        mov	r8, rcx
-        mov	r9, rdx
-        ; Reserve T (18 words); rbp and rsp both point at T[0].
-        sub	rsp, 144
-        mov	rbp, rsp
-        ; r12 = 0; also clears CF and OF for the adcx/adox chains.
-        xor	r12, r12
-        ; Diagonal 1
-        ; A[1] x A[0]
-        mov	rdx, QWORD PTR [r9]
-        mulx	r11, r10, QWORD PTR [r9+8]
-        mov	QWORD PTR [rbp+8], r10
-        ; A[2] x A[0]
-        mulx	r10, rax, QWORD PTR [r9+16]
-        adcx	r11, rax
-        adox	r10, r12
-        mov	QWORD PTR [rbp+16], r11
-        ; A[3] x A[0]
-        mulx	r14, rax, QWORD PTR [r9+24]
-        adcx	r10, rax
-        adox	r14, r12
-        mov	QWORD PTR [rbp+24], r10
-        ; A[4] x A[0]
-        mulx	r15, rax, QWORD PTR [r9+32]
-        adcx	r14, rax
-        adox	r15, r12
-        ; A[5] x A[0]
-        mulx	rdi, rax, QWORD PTR [r9+40]
-        adcx	r15, rax
-        adox	rdi, r12
-        ; A[6] x A[0]
-        mulx	rsi, rax, QWORD PTR [r9+48]
-        adcx	rdi, rax
-        adox	rsi, r12
-        ; A[7] x A[0]
-        mulx	rbx, rax, QWORD PTR [r9+56]
-        adcx	rsi, rax
-        adox	rbx, r12
-        ; A[8] x A[0]
-        mulx	r10, rax, QWORD PTR [r9+64]
-        adcx	rbx, rax
-        adox	r10, r12
-        ; A[8] x A[1]
-        mov	rdx, QWORD PTR [r9+8]
-        mulx	r11, rax, QWORD PTR [r9+64]
-        adcx	r10, rax
-        adox	r11, r12
-        mov	QWORD PTR [rbp+72], r10
-        ;  Carry
-        adcx	r11, r12
-        mov	r13, r12
-        adcx	r13, r12
-        adox	r13, r12
-        mov	QWORD PTR [rbp+80], r11
-        ; Diagonal 2
-        mov	r11, QWORD PTR [rbp+24]
-        ; A[2] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+16]
-        adcx	r11, rax
-        adox	r14, rcx
-        mov	QWORD PTR [rbp+24], r11
-        ; A[3] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+24]
-        adcx	r14, rax
-        adox	r15, rcx
-        ; A[4] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+32]
-        adcx	r15, rax
-        adox	rdi, rcx
-        ; A[5] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+40]
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; A[6] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+48]
-        adcx	rsi, rax
-        adox	rbx, rcx
-        mov	r11, QWORD PTR [rbp+72]
-        ; A[7] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+56]
-        adcx	rbx, rax
-        adox	r11, rcx
-        mov	r10, QWORD PTR [rbp+80]
-        ; A[7] x A[2]
-        mov	rdx, QWORD PTR [r9+16]
-        mulx	rcx, rax, QWORD PTR [r9+56]
-        adcx	r11, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbp+72], r11
-        ; A[7] x A[3]
-        mov	rdx, QWORD PTR [r9+24]
-        mulx	r11, rax, QWORD PTR [r9+56]
-        adcx	r10, rax
-        adox	r11, r12
-        mov	QWORD PTR [rbp+80], r10
-        ; A[7] x A[4]
-        mov	rdx, QWORD PTR [r9+32]
-        mulx	r10, rax, QWORD PTR [r9+56]
-        adcx	r11, rax
-        adox	r10, r12
-        mov	QWORD PTR [rbp+88], r11
-        ;  Carry
-        adcx	r10, r13
-        mov	r13, r12
-        adcx	r13, r12
-        adox	r13, r12
-        mov	QWORD PTR [rbp+96], r10
-        ; Diagonal 3
-        ; A[3] x A[2]
-        mov	rdx, QWORD PTR [r9+16]
-        mulx	rcx, rax, QWORD PTR [r9+24]
-        adcx	r15, rax
-        adox	rdi, rcx
-        ; A[4] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+32]
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; A[5] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+40]
-        adcx	rsi, rax
-        adox	rbx, rcx
-        mov	r10, QWORD PTR [rbp+72]
-        ; A[6] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+48]
-        adcx	rbx, rax
-        adox	r10, rcx
-        mov	r11, QWORD PTR [rbp+80]
-        ; A[6] x A[3]
-        mov	rdx, QWORD PTR [r9+24]
-        mulx	rcx, rax, QWORD PTR [r9+48]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbp+72], r10
-        mov	r10, QWORD PTR [rbp+88]
-        ; A[6] x A[4]
-        mov	rdx, QWORD PTR [r9+32]
-        mulx	rcx, rax, QWORD PTR [r9+48]
-        adcx	r11, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbp+80], r11
-        mov	r11, QWORD PTR [rbp+96]
-        ; A[6] x A[5]
-        mov	rdx, QWORD PTR [r9+40]
-        mulx	rcx, rax, QWORD PTR [r9+48]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbp+88], r10
-        ; A[8] x A[4]
-        mov	rdx, QWORD PTR [r9+32]
-        mulx	r10, rax, QWORD PTR [r9+64]
-        adcx	r11, rax
-        adox	r10, r12
-        mov	QWORD PTR [rbp+96], r11
-        ; A[8] x A[5]
-        mov	rdx, QWORD PTR [r9+40]
-        mulx	r11, rax, QWORD PTR [r9+64]
-        adcx	r10, rax
-        adox	r11, r12
-        mov	QWORD PTR [rbp+104], r10
-        ;  Carry
-        adcx	r11, r13
-        mov	r13, r12
-        adcx	r13, r12
-        adox	r13, r12
-        mov	QWORD PTR [rbp+112], r11
-        ; Diagonal 4
-        ; A[4] x A[3]
-        mov	rdx, QWORD PTR [r9+24]
-        mulx	rcx, rax, QWORD PTR [r9+32]
-        adcx	rsi, rax
-        adox	rbx, rcx
-        mov	r11, QWORD PTR [rbp+72]
-        ; A[5] x A[3]
-        mulx	rcx, rax, QWORD PTR [r9+40]
-        adcx	rbx, rax
-        adox	r11, rcx
-        mov	r10, QWORD PTR [rbp+80]
-        ; A[5] x A[4]
-        mov	rdx, QWORD PTR [r9+32]
-        mulx	rcx, rax, QWORD PTR [r9+40]
-        adcx	r11, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbp+72], r11
-        mov	r11, QWORD PTR [rbp+88]
-        ; A[8] x A[2]
-        mov	rdx, QWORD PTR [r9+16]
-        mulx	rcx, rax, QWORD PTR [r9+64]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbp+80], r10
-        mov	r10, QWORD PTR [rbp+96]
-        ; A[8] x A[3]
-        mov	rdx, QWORD PTR [r9+24]
-        mulx	rcx, rax, QWORD PTR [r9+64]
-        adcx	r11, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbp+88], r11
-        mov	r11, QWORD PTR [rbp+104]
-        ; A[7] x A[5]
-        mov	rdx, QWORD PTR [r9+40]
-        mulx	rcx, rax, QWORD PTR [r9+56]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbp+96], r10
-        mov	r10, QWORD PTR [rbp+112]
-        ; A[7] x A[6]
-        mov	rdx, QWORD PTR [r9+48]
-        mulx	rcx, rax, QWORD PTR [r9+56]
-        adcx	r11, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbp+104], r11
-        ; A[8] x A[6]
-        mulx	r11, rax, QWORD PTR [r9+64]
-        adcx	r10, rax
-        adox	r11, r12
-        mov	QWORD PTR [rbp+112], r10
-        ; A[8] x A[7]
-        mov	rdx, QWORD PTR [r9+56]
-        mulx	r10, rax, QWORD PTR [r9+64]
-        adcx	r11, rax
-        adox	r10, r12
-        mov	QWORD PTR [rbp+120], r11
-        ;  Carry
-        adcx	r10, r13
-        mov	r13, r12
-        adcx	r13, r12
-        adox	r13, r12
-        mov	QWORD PTR [rbp+128], r10
-        mov	QWORD PTR [rbp+136], r13
-        ; Double and Add in A[i] x A[i]
-        mov	r11, QWORD PTR [rbp+8]
-        ; A[0] x A[0]
-        mov	rdx, QWORD PTR [r9]
-        mulx	rcx, rax, rdx
-        mov	QWORD PTR [rbp], rax
-        adox	r11, r11
-        adcx	r11, rcx
-        mov	QWORD PTR [rbp+8], r11
-        mov	r10, QWORD PTR [rbp+16]
-        mov	r11, QWORD PTR [rbp+24]
-        ; A[1] x A[1]
-        mov	rdx, QWORD PTR [r9+8]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [rbp+16], r10
-        mov	QWORD PTR [rbp+24], r11
-        ; A[2] x A[2]
-        mov	rdx, QWORD PTR [r9+16]
-        mulx	rcx, rax, rdx
-        adox	r14, r14
-        adox	r15, r15
-        adcx	r14, rax
-        adcx	r15, rcx
-        ; A[3] x A[3]
-        mov	rdx, QWORD PTR [r9+24]
-        mulx	rcx, rax, rdx
-        adox	rdi, rdi
-        adox	rsi, rsi
-        adcx	rdi, rax
-        adcx	rsi, rcx
-        mov	r11, QWORD PTR [rbp+72]
-        ; A[4] x A[4]
-        mov	rdx, QWORD PTR [r9+32]
-        mulx	rcx, rax, rdx
-        adox	rbx, rbx
-        adox	r11, r11
-        adcx	rbx, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [rbp+72], r11
-        mov	r10, QWORD PTR [rbp+80]
-        mov	r11, QWORD PTR [rbp+88]
-        ; A[5] x A[5]
-        mov	rdx, QWORD PTR [r9+40]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [rbp+80], r10
-        mov	QWORD PTR [rbp+88], r11
-        mov	r10, QWORD PTR [rbp+96]
-        mov	r11, QWORD PTR [rbp+104]
-        ; A[6] x A[6]
-        mov	rdx, QWORD PTR [r9+48]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [rbp+96], r10
-        mov	QWORD PTR [rbp+104], r11
-        mov	r10, QWORD PTR [rbp+112]
-        mov	r11, QWORD PTR [rbp+120]
-        ; A[7] x A[7]
-        mov	rdx, QWORD PTR [r9+56]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [rbp+112], r10
-        mov	QWORD PTR [rbp+120], r11
-        mov	r10, QWORD PTR [rbp+128]
-        mov	r11, QWORD PTR [rbp+136]
-        ; A[8] x A[8]
-        mov	rdx, QWORD PTR [r9+64]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [rbp+128], r10
-        mov	QWORD PTR [rbp+136], r11
-        ; Spill product words 4..8 from registers into T.
-        mov	QWORD PTR [rbp+32], r14
-        mov	QWORD PTR [rbp+40], r15
-        mov	QWORD PTR [rbp+48], rdi
-        mov	QWORD PTR [rbp+56], rsi
-        mov	QWORD PTR [rbp+64], rbx
-        ; Reduce: load T[8..16]; rcx keeps the low 9 bits of T[8]
-        ; (bits 512..520 of the product).
-        mov	r10, QWORD PTR [rbp+64]
-        mov	r11, QWORD PTR [rbp+72]
-        mov	r14, QWORD PTR [rbp+80]
-        mov	rcx, r10
-        and	rcx, 511
-        mov	r15, QWORD PTR [rbp+88]
-        mov	rdi, QWORD PTR [rbp+96]
-        mov	rsi, QWORD PTR [rbp+104]
-        mov	rbx, QWORD PTR [rbp+112]
-        mov	rdx, QWORD PTR [rbp+120]
-        mov	rax, QWORD PTR [rbp+128]
-        ; Shift T[8..16] right by 9 bits: the part of the product at and
-        ; above bit 521, aligned to bit 0.
-        shrd	r10, r11, 9
-        shrd	r11, r14, 9
-        shrd	r14, r15, 9
-        shrd	r15, rdi, 9
-        shrd	rdi, rsi, 9
-        shrd	rsi, rbx, 9
-        shrd	rbx, rdx, 9
-        shrd	rdx, rax, 9
-        shr	rax, 9
-        ; Add the shifted high part onto T[0..7] and the masked T[8].
-        add	r10, QWORD PTR [rsp]
-        adc	r11, QWORD PTR [rsp+8]
-        adc	r14, QWORD PTR [rsp+16]
-        adc	r15, QWORD PTR [rsp+24]
-        adc	rdi, QWORD PTR [rsp+32]
-        adc	rsi, QWORD PTR [rsp+40]
-        adc	rbx, QWORD PTR [rsp+48]
-        adc	rdx, QWORD PTR [rsp+56]
-        adc	rcx, rax
-        ; Fold once more: anything that spilled past bit 8 of the top word
-        ; is added back in at bit 0.
-        mov	rax, rcx
-        shr	rcx, 9
-        and	rax, 511
-        add	r10, rcx
-        adc	r11, 0
-        adc	r14, 0
-        adc	r15, 0
-        adc	rdi, 0
-        adc	rsi, 0
-        adc	rbx, 0
-        adc	rdx, 0
-        adc	rax, 0
-        mov	QWORD PTR [r8], r10
-        mov	QWORD PTR [r8+8], r11
-        mov	QWORD PTR [r8+16], r14
-        mov	QWORD PTR [r8+24], r15
-        mov	QWORD PTR [r8+32], rdi
-        mov	QWORD PTR [r8+40], rsi
-        mov	QWORD PTR [r8+48], rbx
-        mov	QWORD PTR [r8+56], rdx
-        mov	QWORD PTR [r8+64], rax
-        add	rsp, 144
-        pop	rbx
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        pop	rbp
-        ret
-sp_521_mont_sqr_avx2_9 ENDP
-_text ENDS
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-; /* Conditionally subtract b from a using the mask m.
-;  * m is -1 to subtract and 0 when not subtracting.
-;  *
-;  * r  A single precision number representing condition subtract result.
-;  * a  A single precision number to subtract from.
-;  * b  A single precision number to subtract.
-;  * m  Mask value to apply.
-;  *
-;  * Arguments arrive in rcx (r), rdx (a), r8 (b) and r9 (m).
-;  * Returns in rax 0 when the 9-word subtraction produced no borrow and -1
-;  * (all bits set) when it did.
-;  *
-;  * Only volatile registers (rax, r10, r11) are used as scratch, so nothing
-;  * is pushed and rsp is never modified.
-;  */
-_text SEGMENT READONLY PARA
-sp_521_cond_sub_avx2_9 PROC
-        ; Each word of b is filtered through pext with the mask: an all-ones
-        ; mask passes the word unchanged, a zero mask yields 0.  The borrow
-        ; is carried from word to word in CF; the mov and pext instructions
-        ; between the sub/sbb steps leave it untouched.
-        mov	rax, QWORD PTR [r8]
-        mov	r10, QWORD PTR [rdx]
-        pext	rax, rax, r9
-        sub	r10, rax
-        mov	rax, QWORD PTR [r8+8]
-        mov	r11, QWORD PTR [rdx+8]
-        pext	rax, rax, r9
-        mov	QWORD PTR [rcx], r10
-        sbb	r11, rax
-        mov	r10, QWORD PTR [r8+16]
-        mov	rax, QWORD PTR [rdx+16]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+8], r11
-        sbb	rax, r10
-        mov	r11, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [rdx+24]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+16], rax
-        sbb	r10, r11
-        mov	rax, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [rdx+32]
-        pext	rax, rax, r9
-        mov	QWORD PTR [rcx+24], r10
-        sbb	r11, rax
-        mov	r10, QWORD PTR [r8+40]
-        mov	rax, QWORD PTR [rdx+40]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+32], r11
-        sbb	rax, r10
-        mov	r11, QWORD PTR [r8+48]
-        mov	r10, QWORD PTR [rdx+48]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+40], rax
-        sbb	r10, r11
-        mov	rax, QWORD PTR [r8+56]
-        mov	r11, QWORD PTR [rdx+56]
-        pext	rax, rax, r9
-        mov	QWORD PTR [rcx+48], r10
-        sbb	r11, rax
-        mov	r10, QWORD PTR [r8+64]
-        mov	rax, QWORD PTR [rdx+64]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+56], r11
-        sbb	rax, r10
-        mov	QWORD PTR [rcx+64], rax
-        ; rax = 0 - CF: turn the final borrow into a 0 / -1 mask.
-        sbb	rax, rax
-        ret
-sp_521_cond_sub_avx2_9 ENDP
-_text ENDS
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-; /* Reduce the number back to 521 bits using Montgomery reduction (uses mulx, adcx, adox and pext).
-;  *
-;  * a   A single precision number to reduce in place (rcx); words a[0..17] are read.
-;  * m   The single precision number representing the modulus (rdx), 9 words are read.
-;  * mp  The digit representing the negative inverse of m mod 2^n (r8).
-;  */
-_text SEGMENT READONLY PARA
-sp_521_mont_reduce_order_avx2_9 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        push	rbx
-        push	rbp
-        mov	r9, rcx                 ; r9 = a (rcx is reused as mulx high-word output)
-        mov	r10, rdx                ; r10 = m (rdx is the implicit mulx multiplicand)
-        xor	rbp, rbp                ; rbp = carry passed from one round to the next
-        ; i = 8 rounds are done in the loop, two per pass; the ninth round follows the loop
-        mov	r11, 8
-        mov	r14, QWORD PTR [r9]     ; r14, r15, rdi, rsi cache four consecutive words of a
-        mov	r15, QWORD PTR [r9+8]
-        mov	rdi, QWORD PTR [r9+16]
-        mov	rsi, QWORD PTR [r9+24]
-        add	r9, 32                  ; r9 now points four words past the current a[i]
-        xor	rbp, rbp                ; rbp is already zero here
-L_521_mont_reduce_order_avx2_9_loop:
-        ; mu = a[i] * mp
-        mov	rdx, r14
-        mov	r12, r14
-        imul	rdx, r8
-        xor	rbx, rbx                ; rbx = 0; also clears CF and OF for the adcx/adox chains
-        ; a[i+0] += m[0] * mu
-        mulx	rcx, rax, QWORD PTR [r10]
-        mov	r14, r15
-        adcx	r12, rax
-        adox	r14, rcx
-        mov	QWORD PTR [r9+-32], r12
-        ; a[i+1] += m[1] * mu
-        mulx	rcx, rax, QWORD PTR [r10+8]
-        mov	r15, rdi
-        adcx	r14, rax
-        adox	r15, rcx
-        ; a[i+2] += m[2] * mu
-        mulx	rcx, rax, QWORD PTR [r10+16]
-        mov	rdi, rsi
-        adcx	r15, rax
-        adox	rdi, rcx
-        ; a[i+3] += m[3] * mu
-        mulx	rcx, rax, QWORD PTR [r10+24]
-        mov	rsi, QWORD PTR [r9]
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; a[i+4] += m[4] * mu
-        mulx	rcx, rax, QWORD PTR [r10+32]
-        mov	r13, QWORD PTR [r9+8]
-        adcx	rsi, rax
-        adox	r13, rcx
-        ; a[i+5] += m[5] * mu
-        mulx	rcx, rax, QWORD PTR [r10+40]
-        mov	r12, QWORD PTR [r9+16]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+8], r13
-        ; a[i+6] += m[6] * mu
-        mulx	rcx, rax, QWORD PTR [r10+48]
-        mov	r13, QWORD PTR [r9+24]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+16], r12
-        ; a[i+7] += m[7] * mu
-        mulx	rcx, rax, QWORD PTR [r10+56]
-        mov	r12, QWORD PTR [r9+32]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+24], r13
-        ; a[i+8] += m[8] * mu
-        mulx	rcx, rax, QWORD PTR [r10+64]
-        mov	r13, QWORD PTR [r9+40]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+32], r12
-        adcx	r13, rbp                ; add the carry left over from the previous round
-        mov	rbp, rbx                ; rbp = 0 (mov leaves CF/OF untouched)
-        mov	QWORD PTR [r9+40], r13
-        adox	rbp, rbx                ; rbp += OF
-        adcx	rbp, rbx                ; rbp += CF -> carry for the next round
-        ; mu = a[i] * mp (second round of this pass, one word further along)
-        mov	rdx, r14
-        mov	r13, r14
-        imul	rdx, r8
-        xor	rbx, rbx                ; rbx = 0; clears CF and OF again
-        ; a[i+0] += m[0] * mu
-        mulx	rcx, rax, QWORD PTR [r10]
-        mov	r14, r15
-        adcx	r13, rax
-        adox	r14, rcx
-        mov	QWORD PTR [r9+-24], r13
-        ; a[i+1] += m[1] * mu
-        mulx	rcx, rax, QWORD PTR [r10+8]
-        mov	r15, rdi
-        adcx	r14, rax
-        adox	r15, rcx
-        ; a[i+2] += m[2] * mu
-        mulx	rcx, rax, QWORD PTR [r10+16]
-        mov	rdi, rsi
-        adcx	r15, rax
-        adox	rdi, rcx
-        ; a[i+3] += m[3] * mu
-        mulx	rcx, rax, QWORD PTR [r10+24]
-        mov	rsi, QWORD PTR [r9+8]
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; a[i+4] += m[4] * mu
-        mulx	rcx, rax, QWORD PTR [r10+32]
-        mov	r12, QWORD PTR [r9+16]
-        adcx	rsi, rax
-        adox	r12, rcx
-        ; a[i+5] += m[5] * mu
-        mulx	rcx, rax, QWORD PTR [r10+40]
-        mov	r13, QWORD PTR [r9+24]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+16], r12
-        ; a[i+6] += m[6] * mu
-        mulx	rcx, rax, QWORD PTR [r10+48]
-        mov	r12, QWORD PTR [r9+32]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+24], r13
-        ; a[i+7] += m[7] * mu
-        mulx	rcx, rax, QWORD PTR [r10+56]
-        mov	r13, QWORD PTR [r9+40]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+32], r12
-        ; a[i+8] += m[8] * mu
-        mulx	rcx, rax, QWORD PTR [r10+64]
-        mov	r12, QWORD PTR [r9+48]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+40], r13
-        adcx	r12, rbp                ; add the carry from the first round of this pass
-        mov	rbp, rbx
-        mov	QWORD PTR [r9+48], r12
-        adox	rbp, rbx
-        adcx	rbp, rbx                ; rbp = OF + CF -> carry for the next round
-        ; a += 2
-        add	r9, 16
-        ; i -= 2
-        sub	r11, 2
-        jnz	L_521_mont_reduce_order_avx2_9_loop
-        ; mu = a[i] * mp (ninth and final round)
-        mov	rdx, r14
-        mov	r12, r14
-        imul	rdx, r8
-        and	rdx, 511                ; keep only 9 bits of mu: 521 = 8*64 + 9
-        xor	rbx, rbx
-        ; a[i+0] += m[0] * mu
-        mulx	rcx, rax, QWORD PTR [r10]
-        mov	r14, r15
-        adcx	r12, rax
-        adox	r14, rcx
-        mov	QWORD PTR [r9+-32], r12
-        ; a[i+1] += m[1] * mu
-        mulx	rcx, rax, QWORD PTR [r10+8]
-        mov	r15, rdi
-        adcx	r14, rax
-        adox	r15, rcx
-        ; a[i+2] += m[2] * mu
-        mulx	rcx, rax, QWORD PTR [r10+16]
-        mov	rdi, rsi
-        adcx	r15, rax
-        adox	rdi, rcx
-        ; a[i+3] += m[3] * mu
-        mulx	rcx, rax, QWORD PTR [r10+24]
-        mov	rsi, QWORD PTR [r9]
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; a[i+4] += m[4] * mu
-        mulx	rcx, rax, QWORD PTR [r10+32]
-        mov	r13, QWORD PTR [r9+8]
-        adcx	rsi, rax
-        adox	r13, rcx
-        ; a[i+5] += m[5] * mu
-        mulx	rcx, rax, QWORD PTR [r10+40]
-        mov	r12, QWORD PTR [r9+16]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+8], r13
-        ; a[i+6] += m[6] * mu
-        mulx	rcx, rax, QWORD PTR [r10+48]
-        mov	r13, QWORD PTR [r9+24]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+16], r12
-        ; a[i+7] += m[7] * mu
-        mulx	rcx, rax, QWORD PTR [r10+56]
-        mov	r12, QWORD PTR [r9+32]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+24], r13
-        ; a[i+8] += m[8] * mu
-        mulx	rcx, rax, QWORD PTR [r10+64]
-        mov	r13, QWORD PTR [r9+40]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+32], r12
-        adcx	r13, rbp
-        mov	rbp, rbx
-        mov	QWORD PTR [r9+40], r13
-        adox	rbp, rbx
-        ; a += 1, then write the four register-cached words back to memory
-        add	r9, 8
-        mov	QWORD PTR [r9+-32], r14
-        mov	QWORD PTR [r9+-24], r15
-        mov	QWORD PTR [r9+-16], rdi
-        mov	QWORD PTR [r9+-8], rsi
-        sub	r9, 32
-        lea	r8, QWORD PTR [r9+-8]   ; r8 = source of the 9-bit right shift (one word below r9)
-        sub	r9, 72                  ; r9 = destination of the shift, eight words below r8
-        mov	r12, QWORD PTR [r8]
-        mov	r14, QWORD PTR [r8+8]
-        mov	r15, QWORD PTR [r8+16]
-        mov	rdi, QWORD PTR [r8+24]
-        mov	r13, QWORD PTR [r8+32]
-        shrd	r12, r14, 9             ; shift the ten words at r8 right by 9 bits into r9[0..9]
-        shrd	r14, r15, 9
-        shrd	r15, rdi, 9
-        shrd	rdi, r13, 9
-        mov	QWORD PTR [r9], r12
-        mov	QWORD PTR [r9+8], r14
-        mov	QWORD PTR [r9+16], r15
-        mov	QWORD PTR [r9+24], rdi
-        mov	r14, QWORD PTR [r8+40]
-        mov	r15, QWORD PTR [r8+48]
-        mov	rdi, QWORD PTR [r8+56]
-        mov	r12, QWORD PTR [r8+64]
-        shrd	r13, r14, 9
-        shrd	r14, r15, 9
-        shrd	r15, rdi, 9
-        shrd	rdi, r12, 9
-        mov	QWORD PTR [r9+32], r13
-        mov	QWORD PTR [r9+40], r14
-        mov	QWORD PTR [r9+48], r15
-        mov	QWORD PTR [r9+56], rdi
-        mov	r14, QWORD PTR [r8+72]
-        shrd	r12, r14, 9
-        shr	r14, 9
-        mov	QWORD PTR [r9+64], r12
-        mov	QWORD PTR [r9+72], r14
-        mov	rbp, QWORD PTR [r9+64]  ; top result word
-        shr	rbp, 9                  ; bits above bit 520 of the result
-        neg	rbp                     ; pext mask: 0 keeps nothing; all ones if the value was 1 (presumably it is 0 or 1 - confirm)
-        mov	rcx, QWORD PTR [r10]
-        mov	rdx, QWORD PTR [r9]
-        pext	rcx, rcx, rbp           ; m[0] under the mask (0 when mask is 0, unchanged when all ones)
-        sub	rdx, rcx                ; start of the masked subtraction r9[0..8] -= m[0..8]
-        mov	rcx, QWORD PTR [r10+8]
-        mov	rax, QWORD PTR [r9+8]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+16]
-        mov	rcx, QWORD PTR [r9+16]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+8], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+24]
-        mov	rdx, QWORD PTR [r9+24]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+16], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+32]
-        mov	rax, QWORD PTR [r9+32]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+24], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+40]
-        mov	rcx, QWORD PTR [r9+40]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+32], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+48]
-        mov	rdx, QWORD PTR [r9+48]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+40], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+56]
-        mov	rax, QWORD PTR [r9+56]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+48], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+64]
-        mov	rcx, QWORD PTR [r9+64]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+56], rax
-        sbb	rcx, rdx
-        mov	QWORD PTR [r9+64], rcx
-        pop	rbp
-        pop	rbx
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_521_mont_reduce_order_avx2_9 ENDP
-_text ENDS
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-; /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m)
-;  *
-;  * r  Result of division by 2 (rcx), 9 words written.
-;  * a  Number to divide (rdx), 9 words read.
-;  * m  Modulus (prime) (r8). Not read here: the code adds 2^521 - 1 directly when a is odd.
-;  */
-_text SEGMENT READONLY PARA
-sp_521_mont_div2_avx2_9 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        mov	rax, QWORD PTR [rdx]
-        mov	r8, QWORD PTR [rdx+8]   ; r8 is reused as a[1]; the m argument is discarded
-        mov	r9, QWORD PTR [rdx+16]
-        mov	r10, QWORD PTR [rdx+24]
-        mov	r11, QWORD PTR [rdx+32]
-        mov	r12, QWORD PTR [rdx+40]
-        mov	r13, QWORD PTR [rdx+48]
-        mov	r14, QWORD PTR [rdx+56]
-        mov	r15, QWORD PTR [rdx+64]
-        mov	rdi, rax
-        and	rdi, 1                  ; rdi = 1 when a is odd, else 0
-        sub	rax, rdi                ; a -= rdi, borrow propagated through all nine words
-        sbb	r8, 0
-        sbb	r9, 0
-        sbb	r10, 0
-        sbb	r11, 0
-        sbb	r12, 0
-        sbb	r13, 0
-        sbb	r14, 0
-        sbb	r15, 0
-        shl	rdi, 9                  ; bit 9 of word 8 is bit 521 overall
-        add	r15, rdi                ; together with the subtraction: a += 2^521 - 1 when a is odd
-        shrd	rax, r8, 1              ; shift the nine-word value right by one bit
-        shrd	r8, r9, 1
-        shrd	r9, r10, 1
-        shrd	r10, r11, 1
-        shrd	r11, r12, 1
-        shrd	r12, r13, 1
-        shrd	r13, r14, 1
-        shrd	r14, r15, 1
-        shr	r15, 1
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r8
-        mov	QWORD PTR [rcx+16], r9
-        mov	QWORD PTR [rcx+24], r10
-        mov	QWORD PTR [rcx+32], r11
-        mov	QWORD PTR [rcx+40], r12
-        mov	QWORD PTR [rcx+48], r13
-        mov	QWORD PTR [rcx+56], r14
-        mov	QWORD PTR [rcx+64], r15
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_521_mont_div2_avx2_9 ENDP
-_text ENDS
-ENDIF
-IFNDEF WC_NO_CACHE_RESISTANT
-; /* Touch each possible entry that could be being copied.
-;  *
-;  * Every entry from 1 to 63 is read and masked so that the memory access
-;  * pattern does not depend on idx. An idx that matches no entry yields zeros.
-;  *
-;  * r      Point to copy into (rcx). Nine words are written at r+0 and nine
-;  *        at r+144.
-;  * table  Table - start of the entries to access (rdx). Entries are 144
-;  *        bytes: nine words at +0 and nine words at +72.
-;  * idx    Index of entry to retrieve (r8).
-;  *
-;  * This is the SSE2 path, so xmm6-xmm15 are saved and restored with the
-;  * legacy movdqu encoding; the VEX form (vmovdqu) would fault on a CPU or
-;  * OS without AVX support.
-;  */
-_text SEGMENT READONLY PARA
-sp_521_get_entry_64_9 PROC
-        push	r12
-        sub	rsp, 160
-        ; xmm6-xmm15 are callee-saved in the Windows x64 ABI
-        movdqu	OWORD PTR [rsp], xmm6
-        movdqu	OWORD PTR [rsp+16], xmm7
-        movdqu	OWORD PTR [rsp+32], xmm8
-        movdqu	OWORD PTR [rsp+48], xmm9
-        movdqu	OWORD PTR [rsp+64], xmm10
-        movdqu	OWORD PTR [rsp+80], xmm11
-        movdqu	OWORD PTR [rsp+96], xmm12
-        movdqu	OWORD PTR [rsp+112], xmm13
-        movdqu	OWORD PTR [rsp+128], xmm14
-        movdqu	OWORD PTR [rsp+144], xmm15
-        ; First pass: words at +0 of each entry, from entry 1
-        mov	r12, 1                  ; r12 = scalar entry counter
-        mov	rax, 1
-        movd	xmm13, r8d              ; xmm13 = idx in every dword lane (after pshufd)
-        add	rdx, 144                ; skip entry 0
-        movd	xmm15, eax              ; xmm15 = 1 in every dword lane (after pshufd)
-        mov	rax, 63                 ; rax = number of entries to scan
-        pshufd	xmm15, xmm15, 0
-        pshufd	xmm13, xmm13, 0
-        pxor	xmm14, xmm14
-        pxor	xmm0, xmm0              ; xmm0-xmm3 and r11 accumulate the selected words
-        pxor	xmm1, xmm1
-        pxor	xmm2, xmm2
-        pxor	xmm3, xmm3
-        xor	r11, r11
-        movdqa	xmm14, xmm15            ; xmm14 = vector entry counter, starts at 1
-L_521_get_entry_64_9_start_0:
-        movdqa	xmm12, xmm14
-        paddd	xmm14, xmm15
-        pcmpeqd	xmm12, xmm13            ; xmm12 = all ones when counter == idx
-        xor	r9, r9
-        cmp	r8, r12
-        sete	r9b
-        neg	r9                      ; r9 = scalar mask for the ninth word
-        inc	r12
-        movdqu	xmm4, [rdx]
-        movdqu	xmm5, [rdx+16]
-        movdqu	xmm6, [rdx+32]
-        movdqu	xmm7, [rdx+48]
-        mov	r10, QWORD PTR [rdx+64]
-        add	rdx, 144
-        pand	xmm4, xmm12
-        pand	xmm5, xmm12
-        pand	xmm6, xmm12
-        pand	xmm7, xmm12
-        and	r10, r9
-        por	xmm0, xmm4
-        por	xmm1, xmm5
-        por	xmm2, xmm6
-        por	xmm3, xmm7
-        or	r11, r10
-        dec	rax
-        jnz	L_521_get_entry_64_9_start_0
-        movdqu	[rcx], xmm0
-        movdqu	[rcx+16], xmm1
-        movdqu	[rcx+32], xmm2
-        movdqu	[rcx+48], xmm3
-        mov	QWORD PTR [rcx+64], r11
-        ; Second pass: words at +72 of each entry, from entry 1
-        mov	r12, 1
-        mov	rax, 1
-        movd	xmm13, r8d
-        sub	rdx, 9000               ; 63 * 144 - 72: back to entry 1, offset +72
-        movd	xmm15, eax
-        mov	rax, 63
-        pshufd	xmm15, xmm15, 0
-        pshufd	xmm13, xmm13, 0
-        pxor	xmm14, xmm14
-        pxor	xmm0, xmm0
-        pxor	xmm1, xmm1
-        pxor	xmm2, xmm2
-        pxor	xmm3, xmm3
-        xor	r11, r11
-        movdqa	xmm14, xmm15
-L_521_get_entry_64_9_start_1:
-        movdqa	xmm12, xmm14
-        paddd	xmm14, xmm15
-        pcmpeqd	xmm12, xmm13
-        xor	r9, r9
-        cmp	r8, r12
-        sete	r9b
-        neg	r9
-        inc	r12
-        movdqu	xmm4, [rdx]
-        movdqu	xmm5, [rdx+16]
-        movdqu	xmm6, [rdx+32]
-        movdqu	xmm7, [rdx+48]
-        mov	r10, QWORD PTR [rdx+64]
-        add	rdx, 144
-        pand	xmm4, xmm12
-        pand	xmm5, xmm12
-        pand	xmm6, xmm12
-        pand	xmm7, xmm12
-        and	r10, r9
-        por	xmm0, xmm4
-        por	xmm1, xmm5
-        por	xmm2, xmm6
-        por	xmm3, xmm7
-        or	r11, r10
-        dec	rax
-        jnz	L_521_get_entry_64_9_start_1
-        movdqu	[rcx+144], xmm0
-        movdqu	[rcx+160], xmm1
-        movdqu	[rcx+176], xmm2
-        movdqu	[rcx+192], xmm3
-        mov	QWORD PTR [rcx+208], r11
-        movdqu	xmm6, OWORD PTR [rsp]
-        movdqu	xmm7, OWORD PTR [rsp+16]
-        movdqu	xmm8, OWORD PTR [rsp+32]
-        movdqu	xmm9, OWORD PTR [rsp+48]
-        movdqu	xmm10, OWORD PTR [rsp+64]
-        movdqu	xmm11, OWORD PTR [rsp+80]
-        movdqu	xmm12, OWORD PTR [rsp+96]
-        movdqu	xmm13, OWORD PTR [rsp+112]
-        movdqu	xmm14, OWORD PTR [rsp+128]
-        movdqu	xmm15, OWORD PTR [rsp+144]
-        add	rsp, 160
-        pop	r12
-        ret
-sp_521_get_entry_64_9 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Touch each possible entry that could be being copied (AVX2 version, both halves in one pass).
-;  *
-;  * r      Point to copy into (rcx); nine words are written at r+0 and nine at r+144.
-;  * table  Table - start of the entries to access (rdx); entries are 144 bytes, second half at +72.
-;  * idx    Index of entry to retrieve (r8).
-;  */
-_text SEGMENT READONLY PARA
-sp_521_get_entry_64_avx2_9 PROC
-        push	r12
-        push	r13
-        push	r14
-        sub	rsp, 96
-        vmovdqu	OWORD PTR [rsp], xmm6   ; save the xmm6-xmm11 registers used below
-        vmovdqu	OWORD PTR [rsp+16], xmm7
-        vmovdqu	OWORD PTR [rsp+32], xmm8
-        vmovdqu	OWORD PTR [rsp+48], xmm9
-        vmovdqu	OWORD PTR [rsp+64], xmm10
-        vmovdqu	OWORD PTR [rsp+80], xmm11
-        mov	r14, 1                  ; r14 = scalar entry counter, starts at entry 1
-        mov	rax, 1
-        movd	xmm9, r8d
-        add	rdx, 144                ; skip entry 0
-        movd	xmm11, eax
-        mov	rax, 64                 ; NOTE(review): scans entries 1..64, while sp_521_get_entry_64_9 scans 63 - confirm the table has an entry 64
-        vpxor	ymm10, ymm10, ymm10
-        vpermd	ymm9, ymm10, ymm9       ; all-zero indices: broadcast idx to every dword lane
-        vpermd	ymm11, ymm10, ymm11     ; broadcast 1 to every dword lane
-        vpxor	ymm0, ymm0, ymm0        ; ymm0-ymm3, r10 and r11 accumulate the selected words
-        vpxor	ymm1, ymm1, ymm1
-        vpxor	ymm2, ymm2, ymm2
-        vpxor	ymm3, ymm3, ymm3
-        xor	r10, r10
-        xor	r11, r11
-        vmovdqa	ymm10, ymm11            ; ymm10 = vector entry counter, starts at 1
-L_521_get_entry_64_avx2_9_start:
-        vpcmpeqd	ymm8, ymm10, ymm9       ; ymm8 = all ones when counter == idx (32-bit compare)
-        vpaddd	ymm10, ymm10, ymm11
-        xor	r9, r9
-        cmp	r8, r14                 ; full 64-bit compare; assumes idx arrives zero-extended - TODO confirm
-        sete	r9b
-        neg	r9                      ; r9 = scalar mask for the two ninth words
-        inc	r14
-        vmovupd	ymm4, YMMWORD PTR [rdx]
-        vmovupd	ymm5, YMMWORD PTR [rdx+32]
-        vmovupd	ymm6, YMMWORD PTR [rdx+72]
-        vmovupd	ymm7, YMMWORD PTR [rdx+104]
-        mov	r12, QWORD PTR [rdx+64]
-        mov	r13, QWORD PTR [rdx+136]
-        add	rdx, 144
-        vpand	ymm4, ymm4, ymm8
-        vpand	ymm5, ymm5, ymm8
-        vpand	ymm6, ymm6, ymm8
-        vpand	ymm7, ymm7, ymm8
-        and	r12, r9
-        and	r13, r9
-        vpor	ymm0, ymm0, ymm4
-        vpor	ymm1, ymm1, ymm5
-        vpor	ymm2, ymm2, ymm6
-        vpor	ymm3, ymm3, ymm7
-        or	r10, r12
-        or	r11, r13
-        dec	rax
-        jnz	L_521_get_entry_64_avx2_9_start
-        vmovupd	YMMWORD PTR [rcx], ymm0
-        vmovupd	YMMWORD PTR [rcx+32], ymm1
-        vmovupd	YMMWORD PTR [rcx+144], ymm2
-        vmovupd	YMMWORD PTR [rcx+176], ymm3
-        mov	QWORD PTR [rcx+64], r10
-        mov	QWORD PTR [rcx+208], r11
-        vmovdqu	xmm6, OWORD PTR [rsp]
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vmovdqu	xmm8, OWORD PTR [rsp+32]
-        vmovdqu	xmm9, OWORD PTR [rsp+48]
-        vmovdqu	xmm10, OWORD PTR [rsp+64]
-        vmovdqu	xmm11, OWORD PTR [rsp+80]
-        add	rsp, 96
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_521_get_entry_64_avx2_9 ENDP
-_text ENDS
-ENDIF
-ENDIF
-IFNDEF WC_NO_CACHE_RESISTANT
-; /* Touch each possible entry that could be being copied.
-;  *
-;  * Every entry from 1 to 64 is read and masked so that the memory access
-;  * pattern does not depend on idx. An idx that matches no entry yields zeros.
-;  *
-;  * r      Point to copy into (rcx). Nine words are written at r+0 and nine
-;  *        at r+144.
-;  * table  Table - start of the entries to access (rdx). Entries are 144
-;  *        bytes: nine words at +0 and nine words at +72.
-;  * idx    Index of entry to retrieve (r8).
-;  *
-;  * This is the SSE2 path, so xmm6-xmm15 are saved and restored with the
-;  * legacy movdqu encoding; the VEX form (vmovdqu) would fault on a CPU or
-;  * OS without AVX support.
-;  */
-_text SEGMENT READONLY PARA
-sp_521_get_entry_65_9 PROC
-        push	r12
-        sub	rsp, 160
-        ; xmm6-xmm15 are callee-saved in the Windows x64 ABI
-        movdqu	OWORD PTR [rsp], xmm6
-        movdqu	OWORD PTR [rsp+16], xmm7
-        movdqu	OWORD PTR [rsp+32], xmm8
-        movdqu	OWORD PTR [rsp+48], xmm9
-        movdqu	OWORD PTR [rsp+64], xmm10
-        movdqu	OWORD PTR [rsp+80], xmm11
-        movdqu	OWORD PTR [rsp+96], xmm12
-        movdqu	OWORD PTR [rsp+112], xmm13
-        movdqu	OWORD PTR [rsp+128], xmm14
-        movdqu	OWORD PTR [rsp+144], xmm15
-        ; First pass: words at +0 of each entry, from entry 1
-        mov	r12, 1                  ; r12 = scalar entry counter
-        mov	rax, 1
-        movd	xmm13, r8d              ; xmm13 = idx in every dword lane (after pshufd)
-        add	rdx, 144                ; skip entry 0
-        movd	xmm15, eax              ; xmm15 = 1 in every dword lane (after pshufd)
-        mov	rax, 64                 ; rax = number of entries to scan
-        pshufd	xmm15, xmm15, 0
-        pshufd	xmm13, xmm13, 0
-        pxor	xmm14, xmm14
-        pxor	xmm0, xmm0              ; xmm0-xmm3 and r11 accumulate the selected words
-        pxor	xmm1, xmm1
-        pxor	xmm2, xmm2
-        pxor	xmm3, xmm3
-        xor	r11, r11
-        movdqa	xmm14, xmm15            ; xmm14 = vector entry counter, starts at 1
-L_521_get_entry_65_9_start_0:
-        movdqa	xmm12, xmm14
-        paddd	xmm14, xmm15
-        pcmpeqd	xmm12, xmm13            ; xmm12 = all ones when counter == idx
-        xor	r9, r9
-        cmp	r8, r12
-        sete	r9b
-        neg	r9                      ; r9 = scalar mask for the ninth word
-        inc	r12
-        movdqu	xmm4, [rdx]
-        movdqu	xmm5, [rdx+16]
-        movdqu	xmm6, [rdx+32]
-        movdqu	xmm7, [rdx+48]
-        mov	r10, QWORD PTR [rdx+64]
-        add	rdx, 144
-        pand	xmm4, xmm12
-        pand	xmm5, xmm12
-        pand	xmm6, xmm12
-        pand	xmm7, xmm12
-        and	r10, r9
-        por	xmm0, xmm4
-        por	xmm1, xmm5
-        por	xmm2, xmm6
-        por	xmm3, xmm7
-        or	r11, r10
-        dec	rax
-        jnz	L_521_get_entry_65_9_start_0
-        movdqu	[rcx], xmm0
-        movdqu	[rcx+16], xmm1
-        movdqu	[rcx+32], xmm2
-        movdqu	[rcx+48], xmm3
-        mov	QWORD PTR [rcx+64], r11
-        ; Second pass: words at +72 of each entry, from entry 1
-        mov	r12, 1
-        mov	rax, 1
-        movd	xmm13, r8d
-        sub	rdx, 9144               ; 64 * 144 - 72: back to entry 1, offset +72
-        movd	xmm15, eax
-        mov	rax, 64
-        pshufd	xmm15, xmm15, 0
-        pshufd	xmm13, xmm13, 0
-        pxor	xmm14, xmm14
-        pxor	xmm0, xmm0
-        pxor	xmm1, xmm1
-        pxor	xmm2, xmm2
-        pxor	xmm3, xmm3
-        xor	r11, r11
-        movdqa	xmm14, xmm15
-L_521_get_entry_65_9_start_1:
-        movdqa	xmm12, xmm14
-        paddd	xmm14, xmm15
-        pcmpeqd	xmm12, xmm13
-        xor	r9, r9
-        cmp	r8, r12
-        sete	r9b
-        neg	r9
-        inc	r12
-        movdqu	xmm4, [rdx]
-        movdqu	xmm5, [rdx+16]
-        movdqu	xmm6, [rdx+32]
-        movdqu	xmm7, [rdx+48]
-        mov	r10, QWORD PTR [rdx+64]
-        add	rdx, 144
-        pand	xmm4, xmm12
-        pand	xmm5, xmm12
-        pand	xmm6, xmm12
-        pand	xmm7, xmm12
-        and	r10, r9
-        por	xmm0, xmm4
-        por	xmm1, xmm5
-        por	xmm2, xmm6
-        por	xmm3, xmm7
-        or	r11, r10
-        dec	rax
-        jnz	L_521_get_entry_65_9_start_1
-        movdqu	[rcx+144], xmm0
-        movdqu	[rcx+160], xmm1
-        movdqu	[rcx+176], xmm2
-        movdqu	[rcx+192], xmm3
-        mov	QWORD PTR [rcx+208], r11
-        movdqu	xmm6, OWORD PTR [rsp]
-        movdqu	xmm7, OWORD PTR [rsp+16]
-        movdqu	xmm8, OWORD PTR [rsp+32]
-        movdqu	xmm9, OWORD PTR [rsp+48]
-        movdqu	xmm10, OWORD PTR [rsp+64]
-        movdqu	xmm11, OWORD PTR [rsp+80]
-        movdqu	xmm12, OWORD PTR [rsp+96]
-        movdqu	xmm13, OWORD PTR [rsp+112]
-        movdqu	xmm14, OWORD PTR [rsp+128]
-        movdqu	xmm15, OWORD PTR [rsp+144]
-        add	rsp, 160
-        pop	r12
-        ret
-sp_521_get_entry_65_9 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Touch each possible entry that could be being copied (AVX2 version, both halves in one pass).
-;  *
-;  * r      Point to copy into (rcx); nine words are written at r+0 and nine at r+144.
-;  * table  Table - start of the entries to access (rdx); entries are 144 bytes, second half at +72.
-;  * idx    Index of entry to retrieve (r8).
-;  */
-_text SEGMENT READONLY PARA
-sp_521_get_entry_65_avx2_9 PROC
-        push	r12
-        push	r13
-        push	r14
-        sub	rsp, 96
-        vmovdqu	OWORD PTR [rsp], xmm6   ; save the xmm6-xmm11 registers used below
-        vmovdqu	OWORD PTR [rsp+16], xmm7
-        vmovdqu	OWORD PTR [rsp+32], xmm8
-        vmovdqu	OWORD PTR [rsp+48], xmm9
-        vmovdqu	OWORD PTR [rsp+64], xmm10
-        vmovdqu	OWORD PTR [rsp+80], xmm11
-        mov	r14, 1                  ; r14 = scalar entry counter, starts at entry 1
-        mov	rax, 1
-        movd	xmm9, r8d
-        add	rdx, 144                ; skip entry 0
-        movd	xmm11, eax
-        mov	rax, 65                 ; NOTE(review): scans entries 1..65, while sp_521_get_entry_65_9 scans 64 - confirm the table has an entry 65
-        vpxor	ymm10, ymm10, ymm10
-        vpermd	ymm9, ymm10, ymm9       ; all-zero indices: broadcast idx to every dword lane
-        vpermd	ymm11, ymm10, ymm11     ; broadcast 1 to every dword lane
-        vpxor	ymm0, ymm0, ymm0        ; ymm0-ymm3, r10 and r11 accumulate the selected words
-        vpxor	ymm1, ymm1, ymm1
-        vpxor	ymm2, ymm2, ymm2
-        vpxor	ymm3, ymm3, ymm3
-        xor	r10, r10
-        xor	r11, r11
-        vmovdqa	ymm10, ymm11            ; ymm10 = vector entry counter, starts at 1
-L_521_get_entry_65_avx2_9_start:
-        vpcmpeqd	ymm8, ymm10, ymm9       ; ymm8 = all ones when counter == idx (32-bit compare)
-        vpaddd	ymm10, ymm10, ymm11
-        xor	r9, r9
-        cmp	r8, r14                 ; full 64-bit compare; assumes idx arrives zero-extended - TODO confirm
-        sete	r9b
-        neg	r9                      ; r9 = scalar mask for the two ninth words
-        inc	r14
-        vmovupd	ymm4, YMMWORD PTR [rdx]
-        vmovupd	ymm5, YMMWORD PTR [rdx+32]
-        vmovupd	ymm6, YMMWORD PTR [rdx+72]
-        vmovupd	ymm7, YMMWORD PTR [rdx+104]
-        mov	r12, QWORD PTR [rdx+64]
-        mov	r13, QWORD PTR [rdx+136]
-        add	rdx, 144
-        vpand	ymm4, ymm4, ymm8
-        vpand	ymm5, ymm5, ymm8
-        vpand	ymm6, ymm6, ymm8
-        vpand	ymm7, ymm7, ymm8
-        and	r12, r9
-        and	r13, r9
-        vpor	ymm0, ymm0, ymm4
-        vpor	ymm1, ymm1, ymm5
-        vpor	ymm2, ymm2, ymm6
-        vpor	ymm3, ymm3, ymm7
-        or	r10, r12
-        or	r11, r13
-        dec	rax
-        jnz	L_521_get_entry_65_avx2_9_start
-        vmovupd	YMMWORD PTR [rcx], ymm0
-        vmovupd	YMMWORD PTR [rcx+32], ymm1
-        vmovupd	YMMWORD PTR [rcx+144], ymm2
-        vmovupd	YMMWORD PTR [rcx+176], ymm3
-        mov	QWORD PTR [rcx+64], r10
-        mov	QWORD PTR [rcx+208], r11
-        vmovdqu	xmm6, OWORD PTR [rsp]
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vmovdqu	xmm8, OWORD PTR [rsp+32]
-        vmovdqu	xmm9, OWORD PTR [rsp+48]
-        vmovdqu	xmm10, OWORD PTR [rsp+64]
-        vmovdqu	xmm11, OWORD PTR [rsp+80]
-        add	rsp, 96
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_521_get_entry_65_avx2_9 ENDP
-_text ENDS
-ENDIF
-ENDIF
-; /* Add 1 to a, in place. (a = a + 1)
-;  *
-;  * a  A single precision integer (rcx), nine 64-bit words. A carry out of the top word is discarded.
-;  */
-_text SEGMENT READONLY PARA
-sp_521_add_one_9 PROC
-        add	QWORD PTR [rcx], 1      ; add (not inc) so that CF is set for the adc chain
-        adc	QWORD PTR [rcx+8], 0
-        adc	QWORD PTR [rcx+16], 0
-        adc	QWORD PTR [rcx+24], 0
-        adc	QWORD PTR [rcx+32], 0
-        adc	QWORD PTR [rcx+40], 0
-        adc	QWORD PTR [rcx+48], 0
-        adc	QWORD PTR [rcx+56], 0
-        adc	QWORD PTR [rcx+64], 0
-        ret
-sp_521_add_one_9 ENDP
-_text ENDS
-; /* Read big endian unsigned byte array into r (least significant word first).
-;  * Uses the bswap instruction.
-;  *
-;  * r  A single precision integer (rcx); words not filled from a are zeroed, up to nine words.
-;  * size  Maximum number of bytes to convert (rdx); not read by this code, the limit is the constant below.
-;  * a  Byte array (r8).
-;  * n  Number of bytes in array to read (r9).
-;  */
-_text SEGMENT READONLY PARA
-sp_521_from_bin_bswap PROC
-        push	r12
-        push	r13
-        mov	r11, r8
-        mov	r12, rcx
-        add	r11, r9                 ; r11 = a + n, one past the last (least significant) byte
-        add	r12, 65                 ; r12 = zero-fill limit; with 8-byte steps this covers words 0..8
-        xor	r13, r13                ; r13 = constant zero
-        jmp	L_521_from_bin_bswap_64_end
-L_521_from_bin_bswap_64_start:
-        sub	r11, 64                 ; consume 64 bytes from the tail of a per pass
-        mov	rax, QWORD PTR [r11+56]
-        mov	r10, QWORD PTR [r11+48]
-        bswap	rax
-        bswap	r10
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r10
-        mov	rax, QWORD PTR [r11+40]
-        mov	r10, QWORD PTR [r11+32]
-        bswap	rax
-        bswap	r10
-        mov	QWORD PTR [rcx+16], rax
-        mov	QWORD PTR [rcx+24], r10
-        mov	rax, QWORD PTR [r11+24]
-        mov	r10, QWORD PTR [r11+16]
-        bswap	rax
-        bswap	r10
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r10
-        mov	rax, QWORD PTR [r11+8]
-        mov	r10, QWORD PTR [r11]
-        bswap	rax
-        bswap	r10
-        mov	QWORD PTR [rcx+48], rax
-        mov	QWORD PTR [rcx+56], r10
-        add	rcx, 64
-        sub	r9, 64
-L_521_from_bin_bswap_64_end:
-        cmp	r9, 63
-        jg	L_521_from_bin_bswap_64_start
-        jmp	L_521_from_bin_bswap_8_end
-L_521_from_bin_bswap_8_start:
-        sub	r11, 8                  ; consume 8 bytes from the tail of a per pass
-        mov	rax, QWORD PTR [r11]
-        bswap	rax
-        mov	QWORD PTR [rcx], rax
-        add	rcx, 8
-        sub	r9, 8
-L_521_from_bin_bswap_8_end:
-        cmp	r9, 7
-        jg	L_521_from_bin_bswap_8_start
-        cmp	r9, r13                 ; any bytes left (fewer than 8)?
-        je	L_521_from_bin_bswap_hi_end
-        mov	r10, r13                ; r10 = accumulator for the partial top word
-        mov	rax, r13                ; clear rax so the byte load into al is zero-extended
-L_521_from_bin_bswap_hi_start:
-        mov	al, BYTE PTR [r8]       ; remaining bytes are the leading bytes of a
-        shl	r10, 8
-        inc	r8
-        add	r10, rax
-        dec	r9
-        jg	L_521_from_bin_bswap_hi_start
-        mov	QWORD PTR [rcx], r10
-        add	rcx, 8
-L_521_from_bin_bswap_hi_end:
-        cmp	rcx, r12
-        jge	L_521_from_bin_bswap_zero_end
-L_521_from_bin_bswap_zero_start:
-        mov	QWORD PTR [rcx], r13    ; zero the words of r that were not filled
-        add	rcx, 8
-        cmp	rcx, r12
-        jl	L_521_from_bin_bswap_zero_start
-L_521_from_bin_bswap_zero_end:
-        pop	r13
-        pop	r12
-        ret
-sp_521_from_bin_bswap ENDP
-_text ENDS
-IFNDEF NO_MOVBE_SUPPORT
-; /* Read big endian unsigned byte array into r (least significant word first).
-;  * Uses the movbe instruction which is an optional instruction.
-;  *
-;  * r  A single precision integer (rcx); words not filled from a are zeroed, up to nine words.
-;  * size  Maximum number of bytes to convert (rdx); not read by this code, the limit is the constant below.
-;  * a  Byte array (r8).
-;  * n  Number of bytes in array to read (r9).
-;  */
-_text SEGMENT READONLY PARA
-sp_521_from_bin_movbe PROC
-        push	r12
-        mov	r11, r8
-        mov	r12, rcx
-        add	r11, r9                 ; r11 = a + n, one past the last (least significant) byte
-        add	r12, 66                 ; r12 = zero-fill limit; with 8-byte steps this covers words 0..8
-        jmp	L_521_from_bin_movbe_64_end
-L_521_from_bin_movbe_64_start:
-        sub	r11, 64                 ; consume 64 bytes from the tail of a per pass
-        movbe	rax, QWORD PTR [r11+56]
-        movbe	r10, QWORD PTR [r11+48]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r10
-        movbe	rax, QWORD PTR [r11+40]
-        movbe	r10, QWORD PTR [r11+32]
-        mov	QWORD PTR [rcx+16], rax
-        mov	QWORD PTR [rcx+24], r10
-        movbe	rax, QWORD PTR [r11+24]
-        movbe	r10, QWORD PTR [r11+16]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r10
-        movbe	rax, QWORD PTR [r11+8]
-        movbe	r10, QWORD PTR [r11]
-        mov	QWORD PTR [rcx+48], rax
-        mov	QWORD PTR [rcx+56], r10
-        add	rcx, 64
-        sub	r9, 64
-L_521_from_bin_movbe_64_end:
-        cmp	r9, 63
-        jg	L_521_from_bin_movbe_64_start
-        jmp	L_521_from_bin_movbe_8_end
-L_521_from_bin_movbe_8_start:
-        sub	r11, 8                  ; consume 8 bytes from the tail of a per pass
-        movbe	rax, QWORD PTR [r11]
-        mov	QWORD PTR [rcx], rax
-        add	rcx, 8
-        sub	r9, 8
-L_521_from_bin_movbe_8_end:
-        cmp	r9, 7
-        jg	L_521_from_bin_movbe_8_start
-        cmp	r9, 0                   ; any bytes left (fewer than 8)?
-        je	L_521_from_bin_movbe_hi_end
-        mov	r10, 0                  ; r10 = accumulator for the partial top word
-        mov	rax, 0                  ; clear rax so the byte load into al is zero-extended
-L_521_from_bin_movbe_hi_start:
-        mov	al, BYTE PTR [r8]       ; remaining bytes are the leading bytes of a
-        shl	r10, 8
-        inc	r8
-        add	r10, rax
-        dec	r9
-        jg	L_521_from_bin_movbe_hi_start
-        mov	QWORD PTR [rcx], r10
-        add	rcx, 8
-L_521_from_bin_movbe_hi_end:
-        cmp	rcx, r12
-        jge	L_521_from_bin_movbe_zero_end
-L_521_from_bin_movbe_zero_start:
-        mov	QWORD PTR [rcx], 0      ; zero the words of r that were not filled
-        add	rcx, 8
-        cmp	rcx, r12
-        jl	L_521_from_bin_movbe_zero_start
-L_521_from_bin_movbe_zero_end:
-        pop	r12
-        ret
-sp_521_from_bin_movbe ENDP
-_text ENDS
-ENDIF
-; /* Write r as big endian to byte array.
-;  * Fixed length number of bytes written: 66 (a[0] to a[65])
-;  * Uses the bswap instruction.
-;  *
-;  * r  A single precision integer (rcx); bytes 0 to 65 are read.
-;  * a  Byte array (rdx).
-;  */
-_text SEGMENT READONLY PARA
-sp_521_to_bin_bswap_9 PROC
-        mov	r8b, BYTE PTR [rcx+64]  ; low byte of the top word
-        mov	al, BYTE PTR [rcx+65]   ; second byte of the top word
-        mov	BYTE PTR [rdx], al      ; most significant output byte
-        mov	BYTE PTR [rdx+1], r8b
-        mov	rax, QWORD PTR [rcx+56] ; remaining eight words, most significant first
-        mov	r8, QWORD PTR [rcx+48]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+2], rax
-        mov	QWORD PTR [rdx+10], r8
-        mov	rax, QWORD PTR [rcx+40]
-        mov	r8, QWORD PTR [rcx+32]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+18], rax
-        mov	QWORD PTR [rdx+26], r8
-        mov	rax, QWORD PTR [rcx+24]
-        mov	r8, QWORD PTR [rcx+16]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+34], rax
-        mov	QWORD PTR [rdx+42], r8
-        mov	rax, QWORD PTR [rcx+8]
-        mov	r8, QWORD PTR [rcx]
-        bswap	rax
-        bswap	r8
-        mov	QWORD PTR [rdx+50], rax
-        mov	QWORD PTR [rdx+58], r8
-        ret
-sp_521_to_bin_bswap_9 ENDP
-_text ENDS
-IFNDEF NO_MOVBE_SUPPORT
-; /* Write r as big endian to byte array.
-;  * Fixed length number of bytes written: 66 (a[0] to a[65])
-;  * Uses the movbe instruction which is optional.
-;  *
-;  * r  A single precision integer (rcx); bytes 0 to 65 are read.
-;  * a  Byte array (rdx).
-;  */
-_text SEGMENT READONLY PARA
-sp_521_to_bin_movbe_9 PROC
-        mov	r8b, BYTE PTR [rcx+64]  ; low byte of the top word
-        mov	al, BYTE PTR [rcx+65]   ; second byte of the top word
-        mov	BYTE PTR [rdx], al      ; most significant output byte
-        mov	BYTE PTR [rdx+1], r8b
-        movbe	rax, QWORD PTR [rcx+56] ; remaining eight words, most significant first
-        movbe	r8, QWORD PTR [rcx+48]
-        mov	QWORD PTR [rdx+2], rax
-        mov	QWORD PTR [rdx+10], r8
-        movbe	rax, QWORD PTR [rcx+40]
-        movbe	r8, QWORD PTR [rcx+32]
-        mov	QWORD PTR [rdx+18], rax
-        mov	QWORD PTR [rdx+26], r8
-        movbe	rax, QWORD PTR [rcx+24]
-        movbe	r8, QWORD PTR [rcx+16]
-        mov	QWORD PTR [rdx+34], rax
-        mov	QWORD PTR [rdx+42], r8
-        movbe	rax, QWORD PTR [rcx+8]
-        movbe	r8, QWORD PTR [rcx]
-        mov	QWORD PTR [rdx+50], rax
-        mov	QWORD PTR [rdx+58], r8
-        ret
-sp_521_to_bin_movbe_9 ENDP
-_text ENDS
-ENDIF
-; /* Shift number right by n bits. (r = a >> n)
-;  *
-;  * r  Result of right shift by n (rcx), nine words written.
-;  * a  Number to shift (rdx), nine words read.
-;  * n  Amount to shift (r8); shrd/shr use the count in cl modulo 64.
-;  */
-_text SEGMENT READONLY PARA
-sp_521_rshift_9 PROC
-        push	r12
-        ; Save the result pointer before rcx is taken over by the shift
-        ; count; doing this in the other order would leave n in rax and
-        ; every store below would go to address n.
-        mov	rax, rcx
-        mov	rcx, r8
-        mov	r8, QWORD PTR [rdx]
-        mov	r9, QWORD PTR [rdx+8]
-        mov	r10, QWORD PTR [rdx+16]
-        mov	r11, QWORD PTR [rdx+24]
-        mov	r12, QWORD PTR [rdx+32]
-        ; Each word takes its new high bits from the next higher word
-        shrd	r8, r9, cl
-        shrd	r9, r10, cl
-        shrd	r10, r11, cl
-        shrd	r11, r12, cl
-        mov	QWORD PTR [rax], r8
-        mov	QWORD PTR [rax+8], r9
-        mov	QWORD PTR [rax+16], r10
-        mov	QWORD PTR [rax+24], r11
-        mov	r9, QWORD PTR [rdx+40]
-        mov	r10, QWORD PTR [rdx+48]
-        mov	r11, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [rdx+64]
-        shrd	r12, r9, cl
-        shrd	r9, r10, cl
-        shrd	r10, r11, cl
-        shrd	r11, r8, cl
-        mov	QWORD PTR [rax+32], r12
-        mov	QWORD PTR [rax+40], r9
-        mov	QWORD PTR [rax+48], r10
-        mov	QWORD PTR [rax+56], r11
-        ; Top word: zeros are shifted in
-        shr	r8, cl
-        mov	QWORD PTR [rax+64], r8
-        pop	r12
-        ret
-sp_521_rshift_9 ENDP
-_text ENDS
-; /* Shift number left by n bits. (r = a << n)
-;  *
-;  * r  Result of left shift by n (rcx), ten words written (r[9] receives
-;  *    the bits shifted out of a[8]).
-;  * a  Number to shift (rdx), nine words read.
-;  * n  Amount to shift (r8); shld/shl use the count in cl modulo 64.
-;  */
-_text SEGMENT READONLY PARA
-sp_521_lshift_9 PROC
-        push	r12
-        push	r13
-        ; Save the result pointer before cl is loaded with the shift
-        ; count; doing this in the other order would replace the low byte
-        ; of the pointer with n.
-        mov	rax, rcx
-        mov	cl, r8b
-        mov	r12, 0
-        mov	r13, QWORD PTR [rdx+32]
-        mov	r8, QWORD PTR [rdx+40]
-        mov	r9, QWORD PTR [rdx+48]
-        mov	r10, QWORD PTR [rdx+56]
-        mov	r11, QWORD PTR [rdx+64]
-        ; Work from the top word down; each word takes its new low bits
-        ; from the next lower word
-        shld	r12, r11, cl
-        shld	r11, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r13, cl
-        mov	QWORD PTR [rax+40], r8
-        mov	QWORD PTR [rax+48], r9
-        mov	QWORD PTR [rax+56], r10
-        mov	QWORD PTR [rax+64], r11
-        mov	QWORD PTR [rax+72], r12
-        mov	r11, QWORD PTR [rdx]
-        mov	r8, QWORD PTR [rdx+8]
-        mov	r9, QWORD PTR [rdx+16]
-        mov	r10, QWORD PTR [rdx+24]
-        shld	r13, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r11, cl
-        mov	QWORD PTR [rax+8], r8
-        mov	QWORD PTR [rax+16], r9
-        mov	QWORD PTR [rax+24], r10
-        mov	QWORD PTR [rax+32], r13
-        ; Bottom word: zeros are shifted in
-        shl	r11, cl
-        mov	QWORD PTR [rax], r11
-        pop	r13
-        pop	r12
-        ret
-sp_521_lshift_9 ENDP
-_text ENDS
-; /* Shift number left by n bits. (r = a << n)
-;  *
-;  * Register use on entry: rcx = r, rdx = a, r8b = n.
-;  * Reads the 18 words a[0..17] and writes the 19 words r[0..18]; the bits
-;  * shifted out of a[17] land in r[18]. Words are processed from the most
-;  * significant end downwards.
-;  *
-;  * r  Result of left shift by n.
-;  * a  Number to shift.
-;  * n  Amount to shift.
-;  */
-_text SEGMENT READONLY PARA
-sp_521_lshift_18 PROC
-        push	r12
-        push	r13
-        ; Copy the result pointer out of rcx BEFORE loading the count:
-        ; cl is the low byte of rcx, so writing cl first would corrupt
-        ; the pointer that ends up in rax.
-        mov	rax, rcx
-        mov	cl, r8b
-        ; r12 collects the bits shifted out of the top word.
-        xor	r12, r12
-        ; a[13..17] -> r[14..18]
-        mov	r13, QWORD PTR [rdx+104]
-        mov	r8, QWORD PTR [rdx+112]
-        mov	r9, QWORD PTR [rdx+120]
-        mov	r10, QWORD PTR [rdx+128]
-        mov	r11, QWORD PTR [rdx+136]
-        shld	r12, r11, cl
-        shld	r11, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r13, cl
-        mov	QWORD PTR [rax+112], r8
-        mov	QWORD PTR [rax+120], r9
-        mov	QWORD PTR [rax+128], r10
-        mov	QWORD PTR [rax+136], r11
-        mov	QWORD PTR [rax+144], r12
-        ; a[9..12] -> r[10..13]; r13 still holds the original a[13].
-        mov	r11, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [rdx+80]
-        mov	r9, QWORD PTR [rdx+88]
-        mov	r10, QWORD PTR [rdx+96]
-        shld	r13, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r11, cl
-        mov	QWORD PTR [rax+80], r8
-        mov	QWORD PTR [rax+88], r9
-        mov	QWORD PTR [rax+96], r10
-        mov	QWORD PTR [rax+104], r13
-        ; a[5..8] -> r[6..9]; r11 still holds the original a[9].
-        mov	r13, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [rdx+48]
-        mov	r9, QWORD PTR [rdx+56]
-        mov	r10, QWORD PTR [rdx+64]
-        shld	r11, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r13, cl
-        mov	QWORD PTR [rax+48], r8
-        mov	QWORD PTR [rax+56], r9
-        mov	QWORD PTR [rax+64], r10
-        mov	QWORD PTR [rax+72], r11
-        ; a[1..4] -> r[2..5]; r13 still holds the original a[5].
-        mov	r11, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [rdx+16]
-        mov	r9, QWORD PTR [rdx+24]
-        mov	r10, QWORD PTR [rdx+32]
-        shld	r13, r10, cl
-        shld	r10, r9, cl
-        shld	r9, r8, cl
-        shld	r8, r11, cl
-        mov	QWORD PTR [rax+16], r8
-        mov	QWORD PTR [rax+24], r9
-        mov	QWORD PTR [rax+32], r10
-        mov	QWORD PTR [rax+40], r13
-        ; a[0] -> r[0..1]; r11 still holds the original a[1].
-        mov	r10, QWORD PTR [rdx]
-        shld	r11, r10, cl
-        shl	r10, cl
-        mov	QWORD PTR [rax], r10
-        mov	QWORD PTR [rax+8], r11
-        pop	r13
-        pop	r12
-        ret
-sp_521_lshift_18 ENDP
-_text ENDS
-; /* Sub b from a into a. (a -= b)
-;  *
-;  * Register use on entry: rcx = a, rdx = b. Nine 64-bit words are
-;  * processed, least significant first, as one sub/sbb borrow chain.
-;  * On return rax is 0 when there was no borrow out of the top word and
-;  * all ones (-1) when there was.
-;  *
-;  * a  A single precision integer and result.
-;  * b  A single precision integer.
-;  */
-_text SEGMENT READONLY PARA
-sp_521_sub_in_place_9 PROC
-        ; r8 and r9 alternate as the working word. The mov loads/stores
-        ; between the sub/sbb instructions leave the carry flag untouched,
-        ; so the borrow propagates through the whole chain.
-        mov	r8, QWORD PTR [rcx]
-        sub	r8, QWORD PTR [rdx]
-        mov	r9, QWORD PTR [rcx+8]
-        mov	QWORD PTR [rcx], r8
-        sbb	r9, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [rcx+16]
-        mov	QWORD PTR [rcx+8], r9
-        sbb	r8, QWORD PTR [rdx+16]
-        mov	r9, QWORD PTR [rcx+24]
-        mov	QWORD PTR [rcx+16], r8
-        sbb	r9, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [rcx+32]
-        mov	QWORD PTR [rcx+24], r9
-        sbb	r8, QWORD PTR [rdx+32]
-        mov	r9, QWORD PTR [rcx+40]
-        mov	QWORD PTR [rcx+32], r8
-        sbb	r9, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [rcx+48]
-        mov	QWORD PTR [rcx+40], r9
-        sbb	r8, QWORD PTR [rdx+48]
-        mov	r9, QWORD PTR [rcx+56]
-        mov	QWORD PTR [rcx+48], r8
-        sbb	r9, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [rcx+64]
-        mov	QWORD PTR [rcx+56], r9
-        sbb	r8, QWORD PTR [rdx+64]
-        mov	QWORD PTR [rcx+64], r8
-        ; rax = 0 - CF: turns the final borrow into a 0 / -1 mask.
-        sbb	rax, rax
-        ret
-sp_521_sub_in_place_9 ENDP
-_text ENDS
-; /* Mul a by digit b into r. (r = a * b)
-;  *
-;  * Register use on entry: rcx = r, rdx = a, r8 = b.
-;  * Reads the 9 words a[0..8] and writes the 10 words r[0..9].
-;  *
-;  * r  A single precision integer.
-;  * a  A single precision integer.
-;  * b  A single precision digit.
-;  */
-_text SEGMENT READONLY PARA
-sp_521_mul_d_9 PROC
-        push	r12
-        ; mul writes its high result to rdx, so keep the pointer to a in r9.
-        mov	r9, rdx
-        ; r10, r11 and r12 rotate as a three-word accumulator: the lowest
-        ; word is stored to r, then cleared and reused as the new top word.
-        ; A[0] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9]
-        mov	r10, rax
-        mov	r11, rdx
-        mov	QWORD PTR [rcx], r10
-        ; A[1] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+8]
-        add	r11, rax
-        ; The store below is a mov, so the carry from the add survives
-        ; until the adc that follows it (same pattern in every step).
-        mov	QWORD PTR [rcx+8], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[2] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+16]
-        add	r12, rax
-        mov	QWORD PTR [rcx+16], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[3] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+24]
-        add	r10, rax
-        mov	QWORD PTR [rcx+24], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[4] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+32]
-        add	r11, rax
-        mov	QWORD PTR [rcx+32], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[5] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+40]
-        add	r12, rax
-        mov	QWORD PTR [rcx+40], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[6] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+48]
-        add	r10, rax
-        mov	QWORD PTR [rcx+48], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[7] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+56]
-        add	r11, rax
-        mov	QWORD PTR [rcx+56], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[8] * B
-        mov	rax, r8
-        mul	QWORD PTR [r9+64]
-        add	r12, rax
-        adc	r10, rdx
-        ; Final step: low word goes to r[8], high word plus carry to r[9].
-        mov	QWORD PTR [rcx+64], r12
-        mov	QWORD PTR [rcx+72], r10
-        pop	r12
-        ret
-sp_521_mul_d_9 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Mul a by digit b into r. (r = a * b)
-;  *
-;  * Variant built on mulx/adcx/adox; only assembled when
-;  * HAVE_INTEL_AVX2 is defined.
-;  * Register use on entry: rcx = r, rdx = a, r8 = b.
-;  * Reads the 9 words a[0..8] and writes the 10 words r[0..9].
-;  *
-;  * r  A single precision integer.
-;  * a  A single precision integer.
-;  * b  A single precision digit.
-;  */
-_text SEGMENT READONLY PARA
-sp_521_mul_d_avx2_9 PROC
-        push	r12
-        push	r13
-        ; mulx takes one factor implicitly from rdx, so move the pointer
-        ; to a into rax and load the digit b into rdx.
-        mov	rax, rdx
-        ; A[0] * B
-        mov	rdx, r8
-        ; r13 stays zero for the whole routine; this xor also clears CF and
-        ; OF, the two independent carry flags used by adcx and adox below.
-        xor	r13, r13
-        mulx	r12, r11, QWORD PTR [rax]
-        mov	QWORD PTR [rcx], r11
-        ; A[1] * B
-        ; Each step: r9 = low product, r10 = high product. The low half is
-        ; added into the current word with adcx (CF chain); the high half
-        ; is added into a freshly zeroed next word with adox (OF chain).
-        ; r11 and r12 alternate as current/next word.
-        mulx	r10, r9, QWORD PTR [rax+8]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+8], r12
-        ; A[2] * B
-        mulx	r10, r9, QWORD PTR [rax+16]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+16], r11
-        ; A[3] * B
-        mulx	r10, r9, QWORD PTR [rax+24]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+24], r12
-        ; A[4] * B
-        mulx	r10, r9, QWORD PTR [rax+32]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+32], r11
-        ; A[5] * B
-        mulx	r10, r9, QWORD PTR [rax+40]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+40], r12
-        ; A[6] * B
-        mulx	r10, r9, QWORD PTR [rax+48]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+48], r11
-        ; A[7] * B
-        mulx	r10, r9, QWORD PTR [rax+56]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+56], r12
-        ; A[8] * B
-        mulx	r10, r9, QWORD PTR [rax+64]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        ; Fold the last CF carry into the top word (r13 is zero).
-        adcx	r12, r13
-        mov	QWORD PTR [rcx+64], r11
-        mov	QWORD PTR [rcx+72], r12
-        pop	r13
-        pop	r12
-        ret
-sp_521_mul_d_avx2_9 ENDP
-_text ENDS
-ENDIF
-IFDEF _WIN64
-; /* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
-;  *
-;  * Register use on entry: rcx = d1, rdx = d0, r8 = div.
-;  * The quotient is returned in rax; div leaves the remainder in rdx.
-;  *
-;  * d1   The high order half of the number to divide.
-;  * d0   The low order half of the number to divide.
-;  * div  The divisor.
-;  * returns the result of the division.
-;  */
-_text SEGMENT READONLY PARA
-div_521_word_asm_9 PROC
-        ; div expects the numerator as rdx:rax (high:low).
-        mov	rax, rdx
-        mov	rdx, rcx
-        div	r8
-        ret
-div_521_word_asm_9 ENDP
-_text ENDS
-ENDIF
-; /* Shift number right by 1 bit. (r = a >> 1)
-;  *
-;  * Register use on entry: rcx = r, rdx = a. Nine 64-bit words are read
-;  * from a and nine are written to r. Only volatile registers are used,
-;  * so nothing has to be saved on the stack.
-;  *
-;  * r  Result of right shift by 1.
-;  * a  Number to shift.
-;  */
-_text SEGMENT READONLY PARA
-sp_521_rshift1_9 PROC
-        ; Lower half: load a[0..4], produce r[0..3].
-        mov	rax, QWORD PTR [rdx]
-        mov	r8, QWORD PTR [rdx+8]
-        mov	r9, QWORD PTR [rdx+16]
-        mov	r10, QWORD PTR [rdx+24]
-        mov	r11, QWORD PTR [rdx+32]
-        ; Each shrd pulls the low bit of the next higher word into the top
-        ; of the current word.
-        shrd	rax, r8, 1
-        mov	QWORD PTR [rcx], rax
-        shrd	r8, r9, 1
-        mov	QWORD PTR [rcx+8], r8
-        shrd	r9, r10, 1
-        mov	QWORD PTR [rcx+16], r9
-        shrd	r10, r11, 1
-        mov	QWORD PTR [rcx+24], r10
-        ; Upper half: load a[5..8]; r11 still holds the original a[4].
-        mov	r8, QWORD PTR [rdx+40]
-        mov	r9, QWORD PTR [rdx+48]
-        mov	r10, QWORD PTR [rdx+56]
-        mov	rax, QWORD PTR [rdx+64]
-        shrd	r11, r8, 1
-        mov	QWORD PTR [rcx+32], r11
-        shrd	r8, r9, 1
-        mov	QWORD PTR [rcx+40], r8
-        shrd	r9, r10, 1
-        mov	QWORD PTR [rcx+48], r9
-        shrd	r10, rax, 1
-        mov	QWORD PTR [rcx+56], r10
-        ; Top word: zero is shifted in from above.
-        shr	rax, 1
-        mov	QWORD PTR [rcx+64], rax
-        ret
-sp_521_rshift1_9 ENDP
-_text ENDS
-; /* Divide the number by 2 mod the prime. (r = a / 2 % m)
-;  *
-;  * Register use on entry: rcx = r, rdx = a, r8 = m. Nine 64-bit words each.
-;  * When a is odd, m is first added to it (result written to r) so that the
-;  * value becomes even; the value is then shifted right by one bit into r.
-;  * Any carry out of the ninth word of the addition is discarded.
-;  *
-;  * r  Result of division by 2.
-;  * a  Number to divide.
-;  * m  Modulus
-;  */
-_text SEGMENT READONLY PARA
-sp_521_div2_mod_9 PROC
-        push	r12
-        ; Test the low bit of a: even values go straight to the shift.
-        mov	rax, QWORD PTR [rdx]
-        and	rax, 1
-        je	L_521_mod_inv_9_div2_mod_no_add
-        ; a is odd: r = a + m, two words at a time on one add/adc chain
-        ; (the mov loads/stores in between do not change the carry flag).
-        mov	rax, QWORD PTR [rdx]
-        mov	r9, QWORD PTR [rdx+8]
-        mov	r10, QWORD PTR [r8]
-        mov	r11, QWORD PTR [r8+8]
-        add	rax, r10
-        adc	r9, r11
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r9
-        mov	rax, QWORD PTR [rdx+16]
-        mov	r9, QWORD PTR [rdx+24]
-        mov	r10, QWORD PTR [r8+16]
-        mov	r11, QWORD PTR [r8+24]
-        adc	rax, r10
-        adc	r9, r11
-        mov	QWORD PTR [rcx+16], rax
-        mov	QWORD PTR [rcx+24], r9
-        mov	rax, QWORD PTR [rdx+32]
-        mov	r9, QWORD PTR [rdx+40]
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        adc	rax, r10
-        adc	r9, r11
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r9
-        mov	rax, QWORD PTR [rdx+48]
-        mov	r9, QWORD PTR [rdx+56]
-        mov	r10, QWORD PTR [r8+48]
-        mov	r11, QWORD PTR [r8+56]
-        adc	rax, r10
-        adc	r9, r11
-        mov	QWORD PTR [rcx+48], rax
-        mov	QWORD PTR [rcx+56], r9
-        mov	rax, QWORD PTR [rdx+64]
-        mov	r10, QWORD PTR [r8+64]
-        adc	rax, r10
-        mov	QWORD PTR [rcx+64], rax
-        ; The sum now lives in r, so the shift below must read from r.
-        ; Without this the shift would re-read the unmodified a whenever
-        ; r and a are different buffers. No effect when r == a.
-        mov	rdx, rcx
-L_521_mod_inv_9_div2_mod_no_add:
-        ; Shift the nine words at rdx right by one bit into r.
-        mov	rax, QWORD PTR [rdx]
-        mov	r9, QWORD PTR [rdx+8]
-        mov	r10, QWORD PTR [rdx+16]
-        mov	r11, QWORD PTR [rdx+24]
-        mov	r12, QWORD PTR [rdx+32]
-        shrd	rax, r9, 1
-        shrd	r9, r10, 1
-        shrd	r10, r11, 1
-        shrd	r11, r12, 1
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r9
-        mov	QWORD PTR [rcx+16], r10
-        mov	QWORD PTR [rcx+24], r11
-        mov	r9, QWORD PTR [rdx+40]
-        mov	r10, QWORD PTR [rdx+48]
-        mov	r11, QWORD PTR [rdx+56]
-        mov	rax, QWORD PTR [rdx+64]
-        shrd	r12, r9, 1
-        shrd	r9, r10, 1
-        shrd	r10, r11, 1
-        shrd	r11, rax, 1
-        mov	QWORD PTR [rcx+32], r12
-        mov	QWORD PTR [rcx+40], r9
-        mov	QWORD PTR [rcx+48], r10
-        mov	QWORD PTR [rcx+56], r11
-        shr	rax, 1
-        mov	QWORD PTR [rcx+64], rax
-        pop	r12
-        ret
-sp_521_div2_mod_9 ENDP
-_text ENDS
-; /* Count the number of significant bits in a nine word number.
-;  *
-;  * Register use on entry: rcx = pointer to the nine 64-bit words.
-;  * Words are examined from the most significant ([rcx+64]) downwards.
-;  * For the first non-zero word i the result is 64*i + 1 plus the index
-;  * of its highest set bit; when every word is zero the result is 0.
-;  * The result is returned in rax.
-;  */
-_text SEGMENT READONLY PARA
-sp_521_num_bits_9 PROC
-        ; For each word: rdx = word, eax = 64*i + 1 (bit count contributed
-        ; by the words below it, plus one). mov leaves the flags alone, so
-        ; the order of the mov eax and the test does not matter.
-        mov	rdx, QWORD PTR [rcx+64]
-        mov	eax, 513
-        test	rdx, rdx
-        jnz	L_521_num_bits_9_found
-        mov	rdx, QWORD PTR [rcx+56]
-        mov	eax, 449
-        test	rdx, rdx
-        jnz	L_521_num_bits_9_found
-        mov	rdx, QWORD PTR [rcx+48]
-        mov	eax, 385
-        test	rdx, rdx
-        jnz	L_521_num_bits_9_found
-        mov	rdx, QWORD PTR [rcx+40]
-        mov	eax, 321
-        test	rdx, rdx
-        jnz	L_521_num_bits_9_found
-        mov	rdx, QWORD PTR [rcx+32]
-        mov	eax, 257
-        test	rdx, rdx
-        jnz	L_521_num_bits_9_found
-        mov	rdx, QWORD PTR [rcx+24]
-        mov	eax, 193
-        test	rdx, rdx
-        jnz	L_521_num_bits_9_found
-        mov	rdx, QWORD PTR [rcx+16]
-        mov	eax, 129
-        test	rdx, rdx
-        jnz	L_521_num_bits_9_found
-        mov	rdx, QWORD PTR [rcx+8]
-        mov	eax, 65
-        test	rdx, rdx
-        jnz	L_521_num_bits_9_found
-        mov	rdx, QWORD PTR [rcx]
-        mov	eax, 1
-        test	rdx, rdx
-        jnz	L_521_num_bits_9_found
-        ; Every word is zero.
-        xor	eax, eax
-        ret
-L_521_num_bits_9_found:
-        ; rdx is non-zero here, so bsr yields a defined bit index.
-        bsr	rdx, rdx
-        add	rax, rdx
-        ret
-sp_521_num_bits_9 ENDP
-_text ENDS
-ENDIF
-IFDEF WOLFSSL_SP_1024
-; /* Multiply a and b into r. (r = a * b)
-;  *
-;  * r  A single precision integer.
-;  * a  A single precision integer.
-;  * b  A single precision integer.
-;  */
-_text SEGMENT READONLY PARA
-sp_1024_mul_16 PROC
-        push	r12
-        mov	r9, rdx
-        sub	rsp, 128
-        ; A[0] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9]
-        xor	r12, r12
-        mov	QWORD PTR [rsp], rax
-        mov	r11, rdx
-        ; A[0] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[1] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+8]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rsp+8], r11
-        ; A[0] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[1] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+8]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[2] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+16]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rsp+16], r12
-        ; A[0] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[1] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+8]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[2] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+16]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[3] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+24]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rsp+24], r10
-        ; A[0] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[1] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+8]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[2] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+16]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[3] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+24]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[4] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+32]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rsp+32], r11
-        ; A[0] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[1] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+8]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[2] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+16]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[3] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+24]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[4] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+32]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[5] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+40]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rsp+40], r12
-        ; A[0] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[1] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+8]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[2] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+16]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[3] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+24]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[4] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+32]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[5] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+40]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[6] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+48]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rsp+48], r10
-        ; A[0] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[1] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+8]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[2] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+16]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[3] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+24]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[4] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+32]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[5] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+40]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[6] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+48]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[7] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+56]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rsp+56], r11
-        ; A[0] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[1] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+8]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[2] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+16]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[3] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+24]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[4] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+32]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[5] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+40]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[6] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+48]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[7] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+56]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[8] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+64]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rsp+64], r12
-        ; A[0] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[1] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+8]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[2] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+16]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[3] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+24]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[4] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+32]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[5] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+40]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[6] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+48]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[7] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+56]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[8] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+64]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[9] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+72]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rsp+72], r10
-        ; A[0] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[1] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+8]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[2] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+16]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[3] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+24]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[4] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+32]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[5] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+40]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[6] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+48]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[7] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+56]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[8] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+64]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[9] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+72]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[10] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+80]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rsp+80], r11
-        ; A[0] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[1] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+8]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[2] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+16]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[3] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+24]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[4] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+32]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[5] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+40]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[6] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+48]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[7] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+56]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[8] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+64]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[9] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+72]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[10] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+80]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[11] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+88]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rsp+88], r12
-        ; A[0] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[1] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+8]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[2] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+16]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[3] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+24]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[4] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+32]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[5] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+40]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[6] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+48]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[7] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+56]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[8] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+64]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[9] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+72]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[10] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+80]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[11] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+88]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[12] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+96]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rsp+96], r10
-        ; A[0] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[1] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+8]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[2] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+16]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[3] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+24]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[4] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+32]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[5] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+40]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[6] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+48]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[7] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+56]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[8] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+64]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[9] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+72]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[10] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+80]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[11] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+88]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[12] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+96]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[13] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+104]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rsp+104], r11
-        ; A[0] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[1] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+8]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[2] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+16]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[3] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+24]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[4] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+32]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[5] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+40]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[6] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+48]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[7] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+56]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[8] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+64]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[9] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+72]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[10] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+80]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[11] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+88]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[12] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+96]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[13] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+104]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[14] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+112]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rsp+112], r12
-        ; A[0] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[1] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+8]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[2] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+16]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[3] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+24]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[4] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+32]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[5] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+40]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[6] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+48]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[7] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+56]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[8] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+64]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[9] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+72]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[10] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+80]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[11] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+88]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[12] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+96]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[13] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+104]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[14] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+112]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[15] * B[0]
-        mov	rax, QWORD PTR [r8]
-        mul	QWORD PTR [r9+120]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rsp+120], r10
-        ; A[1] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+8]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[2] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+16]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[3] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+24]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[4] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+32]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[5] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+40]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[6] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+48]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[7] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+56]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[8] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+64]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[9] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+72]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[10] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+80]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[11] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+88]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[12] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+96]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[13] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+104]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[14] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+112]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[15] * B[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r9+120]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rcx+128], r11
-        ; A[2] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+16]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[3] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+24]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[4] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+32]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[5] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+40]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[6] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+48]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[7] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+56]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[8] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+64]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[9] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+72]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[10] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+80]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[11] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+88]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[12] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+96]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[13] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+104]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[14] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+112]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[15] * B[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r9+120]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rcx+136], r12
-        ; A[3] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+24]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[4] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+32]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[5] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+40]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[6] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+48]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[7] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+56]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[8] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+64]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[9] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+72]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[10] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+80]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[11] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+88]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[12] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+96]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[13] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+104]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[14] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+112]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[15] * B[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r9+120]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rcx+144], r10
-        ; A[4] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+32]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[5] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+40]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[6] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+48]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[7] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+56]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[8] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+64]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[9] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+72]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[10] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+80]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[11] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+88]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[12] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+96]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[13] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+104]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[14] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+112]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[15] * B[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r9+120]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rcx+152], r11
-        ; A[5] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+40]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[6] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+48]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[7] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+56]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[8] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+64]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[9] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+72]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[10] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+80]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[11] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+88]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[12] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+96]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[13] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+104]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[14] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+112]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[15] * B[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r9+120]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rcx+160], r12
-        ; A[6] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+48]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[7] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+56]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[8] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+64]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[9] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+72]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[10] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+80]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[11] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+88]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[12] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+96]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[13] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+104]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[14] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+112]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[15] * B[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r9+120]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rcx+168], r10
-        ; A[7] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+56]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[8] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+64]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[9] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+72]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[10] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+80]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[11] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+88]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[12] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+96]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[13] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+104]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[14] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+112]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[15] * B[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r9+120]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rcx+176], r11
-        ; A[8] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+64]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[9] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+72]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[10] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+80]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[11] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+88]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[12] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+96]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[13] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+104]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[14] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+112]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[15] * B[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r9+120]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rcx+184], r12
-        ; A[9] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+72]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[10] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+80]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[11] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+88]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[12] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+96]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[13] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+104]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[14] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+112]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[15] * B[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r9+120]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rcx+192], r10
-        ; A[10] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+80]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[11] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+88]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[12] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+96]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[13] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+104]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[14] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+112]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[15] * B[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r9+120]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rcx+200], r11
-        ; A[11] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+88]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[12] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+96]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[13] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+104]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[14] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+112]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[15] * B[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r9+120]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rcx+208], r12
-        ; A[12] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+96]
-        xor	r12, r12
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[13] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+104]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[14] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+112]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[15] * B[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r9+120]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r12, 0
-        mov	QWORD PTR [rcx+216], r10
-        ; A[13] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+104]
-        xor	r10, r10
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[14] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+112]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[15] * B[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r9+120]
-        add	r11, rax
-        adc	r12, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rcx+224], r11
-        ; A[14] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+112]
-        xor	r11, r11
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[15] * B[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r9+120]
-        add	r12, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rcx+232], r12
-        ; A[15] * B[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r9+120]
-        add	r10, rax
-        adc	r11, rdx
-        mov	QWORD PTR [rcx+240], r10
-        mov	QWORD PTR [rcx+248], r11
-        mov	rax, QWORD PTR [rsp]
-        mov	rdx, QWORD PTR [rsp+8]
-        mov	r10, QWORD PTR [rsp+16]
-        mov	r11, QWORD PTR [rsp+24]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], rdx
-        mov	QWORD PTR [rcx+16], r10
-        mov	QWORD PTR [rcx+24], r11
-        mov	rax, QWORD PTR [rsp+32]
-        mov	rdx, QWORD PTR [rsp+40]
-        mov	r10, QWORD PTR [rsp+48]
-        mov	r11, QWORD PTR [rsp+56]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], rdx
-        mov	QWORD PTR [rcx+48], r10
-        mov	QWORD PTR [rcx+56], r11
-        mov	rax, QWORD PTR [rsp+64]
-        mov	rdx, QWORD PTR [rsp+72]
-        mov	r10, QWORD PTR [rsp+80]
-        mov	r11, QWORD PTR [rsp+88]
-        mov	QWORD PTR [rcx+64], rax
-        mov	QWORD PTR [rcx+72], rdx
-        mov	QWORD PTR [rcx+80], r10
-        mov	QWORD PTR [rcx+88], r11
-        mov	rax, QWORD PTR [rsp+96]
-        mov	rdx, QWORD PTR [rsp+104]
-        mov	r10, QWORD PTR [rsp+112]
-        mov	r11, QWORD PTR [rsp+120]
-        mov	QWORD PTR [rcx+96], rax
-        mov	QWORD PTR [rcx+104], rdx
-        mov	QWORD PTR [rcx+112], r10
-        mov	QWORD PTR [rcx+120], r11
-        add	rsp, 128
-        pop	r12
-        ret
-sp_1024_mul_16 ENDP
-_text ENDS
-; /* Square a and put the result in r. (r = a * a)
-;  *
-;  * r  Result of the squaring (a single precision integer).
-;  * a  Single precision integer to square.
-;  */
-_text SEGMENT READONLY PARA
-sp_1024_sqr_16 PROC
-        push	r12
-        push	r13
-        push	r14
-        mov	r8, rdx
-        sub	rsp, 128
-        ; A[0] * A[0]
-        mov	rax, QWORD PTR [r8]
-        mul	rax
-        xor	r11, r11
-        mov	QWORD PTR [rsp], rax
-        mov	r10, rdx
-        ; A[0] * A[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	QWORD PTR [r8]
-        xor	r9, r9
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        mov	QWORD PTR [rsp+8], r10
-        ; A[0] * A[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r8]
-        xor	r10, r10
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        ; A[1] * A[1]
-        mov	rax, QWORD PTR [r8+8]
-        mul	rax
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rsp+16], r11
-        ; A[0] * A[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r8]
-        xor	r11, r11
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[1] * A[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	QWORD PTR [r8+8]
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rsp+24], r9
-        ; A[0] * A[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r8]
-        xor	r9, r9
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        ; A[1] * A[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r8+8]
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        ; A[2] * A[2]
-        mov	rax, QWORD PTR [r8+16]
-        mul	rax
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        mov	QWORD PTR [rsp+32], r10
-        ; A[0] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8]
-        xor	r10, r10
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r11, r12
-        adc	r9, r13
-        adc	r10, r14
-        mov	QWORD PTR [rsp+40], r11
-        ; A[0] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8]
-        xor	r11, r11
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[3]
-        mov	rax, QWORD PTR [r8+24]
-        mul	rax
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r9, r12
-        adc	r10, r13
-        adc	r11, r14
-        mov	QWORD PTR [rsp+48], r9
-        ; A[0] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8]
-        xor	r9, r9
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r10, r12
-        adc	r11, r13
-        adc	r9, r14
-        mov	QWORD PTR [rsp+56], r10
-        ; A[0] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8]
-        xor	r10, r10
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * A[4]
-        mov	rax, QWORD PTR [r8+32]
-        mul	rax
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r11, r12
-        adc	r9, r13
-        adc	r10, r14
-        mov	QWORD PTR [rsp+64], r11
-        ; A[0] * A[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r8]
-        xor	r11, r11
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	QWORD PTR [r8+32]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r9, r12
-        adc	r10, r13
-        adc	r11, r14
-        mov	QWORD PTR [rsp+72], r9
-        ; A[0] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8]
-        xor	r9, r9
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8+32]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[5] * A[5]
-        mov	rax, QWORD PTR [r8+40]
-        mul	rax
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r10, r12
-        adc	r11, r13
-        adc	r9, r14
-        mov	QWORD PTR [rsp+80], r10
-        ; A[0] * A[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8]
-        xor	r10, r10
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+32]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[5] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	QWORD PTR [r8+40]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r11, r12
-        adc	r9, r13
-        adc	r10, r14
-        mov	QWORD PTR [rsp+88], r11
-        ; A[0] * A[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r8]
-        xor	r11, r11
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+32]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[5] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+40]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[6] * A[6]
-        mov	rax, QWORD PTR [r8+48]
-        mul	rax
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r9, r12
-        adc	r10, r13
-        adc	r11, r14
-        mov	QWORD PTR [rsp+96], r9
-        ; A[0] * A[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r8]
-        xor	r9, r9
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * A[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r8+32]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[5] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+40]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[6] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	QWORD PTR [r8+48]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r10, r12
-        adc	r11, r13
-        adc	r9, r14
-        mov	QWORD PTR [rsp+104], r10
-        ; A[0] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r8]
-        xor	r10, r10
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8+32]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[5] * A[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r8+40]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[6] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+48]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[7] * A[7]
-        mov	rax, QWORD PTR [r8+56]
-        mul	rax
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r11, r12
-        adc	r9, r13
-        adc	r10, r14
-        mov	QWORD PTR [rsp+112], r11
-        ; A[0] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8]
-        xor	r11, r11
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[1] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r8+8]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[2] * A[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * A[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+32]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[5] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8+40]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[6] * A[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r8+48]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[7] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	QWORD PTR [r8+56]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r9, r12
-        adc	r10, r13
-        adc	r11, r14
-        mov	QWORD PTR [rsp+120], r9
-        ; A[1] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8+8]
-        xor	r9, r9
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[2] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r8+16]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[3] * A[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * A[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r8+32]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[5] * A[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+40]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[6] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8+48]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[7] * A[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r8+56]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[8] * A[8]
-        mov	rax, QWORD PTR [r8+64]
-        mul	rax
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r10, r12
-        adc	r11, r13
-        adc	r9, r14
-        mov	QWORD PTR [rcx+128], r10
-        ; A[2] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8+16]
-        xor	r10, r10
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[3] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r8+24]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[4] * A[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r8+32]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[5] * A[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r8+40]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[6] * A[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+48]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[7] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8+56]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[8] * A[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	QWORD PTR [r8+64]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r11, r12
-        adc	r9, r13
-        adc	r10, r14
-        mov	QWORD PTR [rcx+136], r11
-        ; A[3] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8+24]
-        xor	r11, r11
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[4] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r8+32]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[5] * A[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r8+40]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[6] * A[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r8+48]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[7] * A[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+56]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[8] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8+64]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[9] * A[9]
-        mov	rax, QWORD PTR [r8+72]
-        mul	rax
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r9, r12
-        adc	r10, r13
-        adc	r11, r14
-        mov	QWORD PTR [rcx+144], r9
-        ; A[4] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8+32]
-        xor	r9, r9
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[5] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r8+40]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[6] * A[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r8+48]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[7] * A[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r8+56]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[8] * A[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+64]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[9] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	QWORD PTR [r8+72]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r10, r12
-        adc	r11, r13
-        adc	r9, r14
-        mov	QWORD PTR [rcx+152], r10
-        ; A[5] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8+40]
-        xor	r10, r10
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[6] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r8+48]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[7] * A[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r8+56]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[8] * A[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r8+64]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[9] * A[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+72]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[10] * A[10]
-        mov	rax, QWORD PTR [r8+80]
-        mul	rax
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r11, r12
-        adc	r9, r13
-        adc	r10, r14
-        mov	QWORD PTR [rcx+160], r11
-        ; A[6] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8+48]
-        xor	r11, r11
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[7] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r8+56]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[8] * A[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r8+64]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[9] * A[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r8+72]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[10] * A[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	QWORD PTR [r8+80]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r9, r12
-        adc	r10, r13
-        adc	r11, r14
-        mov	QWORD PTR [rcx+168], r9
-        ; A[7] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8+56]
-        xor	r9, r9
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[8] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r8+64]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[9] * A[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r8+72]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[10] * A[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r8+80]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[11] * A[11]
-        mov	rax, QWORD PTR [r8+88]
-        mul	rax
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r10, r12
-        adc	r11, r13
-        adc	r9, r14
-        mov	QWORD PTR [rcx+176], r10
-        ; A[8] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8+64]
-        xor	r10, r10
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[9] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r8+72]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[10] * A[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r8+80]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[11] * A[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	QWORD PTR [r8+88]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r11, r12
-        adc	r9, r13
-        adc	r10, r14
-        mov	QWORD PTR [rcx+184], r11
-        ; A[9] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8+72]
-        xor	r11, r11
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[10] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r8+80]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[11] * A[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r8+88]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[12] * A[12]
-        mov	rax, QWORD PTR [r8+96]
-        mul	rax
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r9, r12
-        adc	r10, r13
-        adc	r11, r14
-        mov	QWORD PTR [rcx+192], r9
-        ; A[10] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8+80]
-        xor	r9, r9
-        xor	r14, r14
-        mov	r12, rax
-        mov	r13, rdx
-        ; A[11] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r8+88]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        ; A[12] * A[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	QWORD PTR [r8+96]
-        add	r12, rax
-        adc	r13, rdx
-        adc	r14, 0
-        add	r12, r12
-        adc	r13, r13
-        adc	r14, r14
-        add	r10, r12
-        adc	r11, r13
-        adc	r9, r14
-        mov	QWORD PTR [rcx+200], r10
-        ; A[11] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8+88]
-        xor	r10, r10
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        ; A[12] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r8+96]
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        ; A[13] * A[13]
-        mov	rax, QWORD PTR [r8+104]
-        mul	rax
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rcx+208], r11
-        ; A[12] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8+96]
-        xor	r11, r11
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[13] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	QWORD PTR [r8+104]
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        add	r9, rax
-        adc	r10, rdx
-        adc	r11, 0
-        mov	QWORD PTR [rcx+216], r9
-        ; A[13] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8+104]
-        xor	r9, r9
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        ; A[14] * A[14]
-        mov	rax, QWORD PTR [r8+112]
-        mul	rax
-        add	r10, rax
-        adc	r11, rdx
-        adc	r9, 0
-        mov	QWORD PTR [rcx+224], r10
-        ; A[14] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	QWORD PTR [r8+112]
-        xor	r10, r10
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        add	r11, rax
-        adc	r9, rdx
-        adc	r10, 0
-        mov	QWORD PTR [rcx+232], r11
-        ; A[15] * A[15]
-        mov	rax, QWORD PTR [r8+120]
-        mul	rax
-        add	r9, rax
-        adc	r10, rdx
-        mov	QWORD PTR [rcx+240], r9
-        mov	QWORD PTR [rcx+248], r10
-        mov	rax, QWORD PTR [rsp]
-        mov	rdx, QWORD PTR [rsp+8]
-        mov	r12, QWORD PTR [rsp+16]
-        mov	r13, QWORD PTR [rsp+24]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], rdx
-        mov	QWORD PTR [rcx+16], r12
-        mov	QWORD PTR [rcx+24], r13
-        mov	rax, QWORD PTR [rsp+32]
-        mov	rdx, QWORD PTR [rsp+40]
-        mov	r12, QWORD PTR [rsp+48]
-        mov	r13, QWORD PTR [rsp+56]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], rdx
-        mov	QWORD PTR [rcx+48], r12
-        mov	QWORD PTR [rcx+56], r13
-        mov	rax, QWORD PTR [rsp+64]
-        mov	rdx, QWORD PTR [rsp+72]
-        mov	r12, QWORD PTR [rsp+80]
-        mov	r13, QWORD PTR [rsp+88]
-        mov	QWORD PTR [rcx+64], rax
-        mov	QWORD PTR [rcx+72], rdx
-        mov	QWORD PTR [rcx+80], r12
-        mov	QWORD PTR [rcx+88], r13
-        mov	rax, QWORD PTR [rsp+96]
-        mov	rdx, QWORD PTR [rsp+104]
-        mov	r12, QWORD PTR [rsp+112]
-        mov	r13, QWORD PTR [rsp+120]
-        mov	QWORD PTR [rcx+96], rax
-        mov	QWORD PTR [rcx+104], rdx
-        mov	QWORD PTR [rcx+112], r12
-        mov	QWORD PTR [rcx+120], r13
-        add	rsp, 128
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_1024_sqr_16 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Multiply a and b into r. (r = a * b)
-;  *
-;  * r   Result of multiplication (passed in rcx).
-;  * a   First number to multiply (passed in rdx).
-;  * b   Second number to multiply (passed in r8).
-;  */
-_text SEGMENT READONLY PARA
-sp_1024_mul_avx2_16 PROC
-        push	rbx
-        push	rbp
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        mov	rbp, r8
-        mov	r8, rcx
-        mov	r9, rdx
-        sub	rsp, 128
-        cmp	r9, r8
-        mov	rbx, rsp
-        cmovne	rbx, r8
-        cmp	rbp, r8
-        cmove	rbx, rsp
-        add	r8, 128
-        xor	rdi, rdi
-        mov	rdx, QWORD PTR [r9]
-        ; A[0] * B[0]
-        mulx	r11, r10, QWORD PTR [rbp]
-        ; A[0] * B[1]
-        mulx	r12, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx], r10
-        adcx	r11, rax
-        ; A[0] * B[2]
-        mulx	r13, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+8], r11
-        adcx	r12, rax
-        ; A[0] * B[3]
-        mulx	r14, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+16], r12
-        adcx	r13, rax
-        mov	QWORD PTR [rbx+24], r13
-        ; A[0] * B[4]
-        mulx	r10, rax, QWORD PTR [rbp+32]
-        adcx	r14, rax
-        ; A[0] * B[5]
-        mulx	r11, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+32], r14
-        adcx	r10, rax
-        ; A[0] * B[6]
-        mulx	r12, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [rbx+40], r10
-        adcx	r11, rax
-        ; A[0] * B[7]
-        mulx	r13, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+48], r11
-        adcx	r12, rax
-        mov	QWORD PTR [rbx+56], r12
-        ; A[0] * B[8]
-        mulx	r14, rax, QWORD PTR [rbp+64]
-        adcx	r13, rax
-        ; A[0] * B[9]
-        mulx	r10, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [rbx+64], r13
-        adcx	r14, rax
-        ; A[0] * B[10]
-        mulx	r11, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [rbx+72], r14
-        adcx	r10, rax
-        ; A[0] * B[11]
-        mulx	r12, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [rbx+80], r10
-        adcx	r11, rax
-        mov	QWORD PTR [rbx+88], r11
-        ; A[0] * B[12]
-        mulx	r13, rax, QWORD PTR [rbp+96]
-        adcx	r12, rax
-        ; A[0] * B[13]
-        mulx	r14, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [rbx+96], r12
-        adcx	r13, rax
-        ; A[0] * B[14]
-        mulx	r10, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [rbx+104], r13
-        adcx	r14, rax
-        ; A[0] * B[15]
-        mulx	r11, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [rbx+112], r14
-        adcx	r10, rax
-        adcx	r11, rdi
-        mov	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [rbx+120], r10
-        mov	QWORD PTR [r8], r11
-        mov	rdx, QWORD PTR [r9+8]
-        mov	r11, QWORD PTR [rbx+8]
-        mov	r12, QWORD PTR [rbx+16]
-        mov	r13, QWORD PTR [rbx+24]
-        mov	r14, QWORD PTR [rbx+32]
-        mov	r10, QWORD PTR [rbx+40]
-        ; A[1] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[1] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+8], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[1] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+16], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[1] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+24], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+32], r14
-        mov	r11, QWORD PTR [rbx+48]
-        mov	r12, QWORD PTR [rbx+56]
-        mov	r13, QWORD PTR [rbx+64]
-        mov	r14, QWORD PTR [rbx+72]
-        ; A[1] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[1] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+40], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[1] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [rbx+48], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[1] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+56], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        mov	QWORD PTR [rbx+64], r13
-        mov	r10, QWORD PTR [rbx+80]
-        mov	r11, QWORD PTR [rbx+88]
-        mov	r12, QWORD PTR [rbx+96]
-        mov	r13, QWORD PTR [rbx+104]
-        ; A[1] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[1] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [rbx+72], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[1] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [rbx+80], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[1] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [rbx+88], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [rbx+96], r12
-        mov	r14, QWORD PTR [rbx+112]
-        mov	r10, QWORD PTR [rbx+120]
-        mov	r11, QWORD PTR [r8]
-        ; A[1] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[1] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [rbx+104], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[1] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [rbx+112], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[1] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [rbx+120], r10
-        mov	r12, rdi
-        adcx	r11, rax
-        adox	r12, rcx
-        adcx	r12, r15
-        mov	r15, rdi
-        adox	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [r8], r11
-        mov	QWORD PTR [r8+8], r12
-        mov	rdx, QWORD PTR [r9+16]
-        mov	r12, QWORD PTR [rbx+16]
-        mov	r13, QWORD PTR [rbx+24]
-        mov	r14, QWORD PTR [rbx+32]
-        mov	r10, QWORD PTR [rbx+40]
-        mov	r11, QWORD PTR [rbx+48]
-        ; A[2] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[2] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+16], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[2] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+24], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[2] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+32], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbx+40], r10
-        mov	r12, QWORD PTR [rbx+56]
-        mov	r13, QWORD PTR [rbx+64]
-        mov	r14, QWORD PTR [rbx+72]
-        mov	r10, QWORD PTR [rbx+80]
-        ; A[2] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[2] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+48], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[2] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [rbx+56], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[2] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+64], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+72], r14
-        mov	r11, QWORD PTR [rbx+88]
-        mov	r12, QWORD PTR [rbx+96]
-        mov	r13, QWORD PTR [rbx+104]
-        mov	r14, QWORD PTR [rbx+112]
-        ; A[2] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[2] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [rbx+80], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[2] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [rbx+88], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[2] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [rbx+96], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        mov	QWORD PTR [rbx+104], r13
-        mov	r10, QWORD PTR [rbx+120]
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        ; A[2] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[2] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [rbx+112], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[2] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [rbx+120], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[2] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [r8], r11
-        mov	r13, rdi
-        adcx	r12, rax
-        adox	r13, rcx
-        adcx	r13, r15
-        mov	r15, rdi
-        adox	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [r8+8], r12
-        mov	QWORD PTR [r8+16], r13
-        mov	rdx, QWORD PTR [r9+24]
-        mov	r13, QWORD PTR [rbx+24]
-        mov	r14, QWORD PTR [rbx+32]
-        mov	r10, QWORD PTR [rbx+40]
-        mov	r11, QWORD PTR [rbx+48]
-        mov	r12, QWORD PTR [rbx+56]
-        ; A[3] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[3] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+24], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[3] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+32], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[3] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+40], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbx+48], r11
-        mov	r13, QWORD PTR [rbx+64]
-        mov	r14, QWORD PTR [rbx+72]
-        mov	r10, QWORD PTR [rbx+80]
-        mov	r11, QWORD PTR [rbx+88]
-        ; A[3] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[3] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+56], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[3] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [rbx+64], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[3] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+72], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbx+80], r10
-        mov	r12, QWORD PTR [rbx+96]
-        mov	r13, QWORD PTR [rbx+104]
-        mov	r14, QWORD PTR [rbx+112]
-        mov	r10, QWORD PTR [rbx+120]
-        ; A[3] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[3] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [rbx+88], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[3] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [rbx+96], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[3] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [rbx+104], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+112], r14
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        mov	r13, QWORD PTR [r8+16]
-        ; A[3] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[3] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [rbx+120], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[3] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [r8], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[3] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [r8+8], r12
-        mov	r14, rdi
-        adcx	r13, rax
-        adox	r14, rcx
-        adcx	r14, r15
-        mov	r15, rdi
-        adox	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [r8+16], r13
-        mov	QWORD PTR [r8+24], r14
-        mov	rdx, QWORD PTR [r9+32]
-        mov	r14, QWORD PTR [rbx+32]
-        mov	r10, QWORD PTR [rbx+40]
-        mov	r11, QWORD PTR [rbx+48]
-        mov	r12, QWORD PTR [rbx+56]
-        mov	r13, QWORD PTR [rbx+64]
-        ; A[4] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[4] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+32], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[4] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+40], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[4] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+48], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [rbx+56], r12
-        mov	r14, QWORD PTR [rbx+72]
-        mov	r10, QWORD PTR [rbx+80]
-        mov	r11, QWORD PTR [rbx+88]
-        mov	r12, QWORD PTR [rbx+96]
-        ; A[4] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[4] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+64], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[4] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [rbx+72], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[4] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+80], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbx+88], r11
-        mov	r13, QWORD PTR [rbx+104]
-        mov	r14, QWORD PTR [rbx+112]
-        mov	r10, QWORD PTR [rbx+120]
-        mov	r11, QWORD PTR [r8]
-        ; A[4] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[4] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [rbx+96], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[4] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [rbx+104], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[4] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [rbx+112], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbx+120], r10
-        mov	r12, QWORD PTR [r8+8]
-        mov	r13, QWORD PTR [r8+16]
-        mov	r14, QWORD PTR [r8+24]
-        ; A[4] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[4] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [r8], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[4] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [r8+8], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[4] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [r8+16], r13
-        mov	r10, rdi
-        adcx	r14, rax
-        adox	r10, rcx
-        adcx	r10, r15
-        mov	r15, rdi
-        adox	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [r8+24], r14
-        mov	QWORD PTR [r8+32], r10
-        mov	rdx, QWORD PTR [r9+40]
-        mov	r10, QWORD PTR [rbx+40]
-        mov	r11, QWORD PTR [rbx+48]
-        mov	r12, QWORD PTR [rbx+56]
-        mov	r13, QWORD PTR [rbx+64]
-        mov	r14, QWORD PTR [rbx+72]
-        ; A[5] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[5] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+40], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[5] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+48], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[5] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+56], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        mov	QWORD PTR [rbx+64], r13
-        mov	r10, QWORD PTR [rbx+80]
-        mov	r11, QWORD PTR [rbx+88]
-        mov	r12, QWORD PTR [rbx+96]
-        mov	r13, QWORD PTR [rbx+104]
-        ; A[5] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[5] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+72], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[5] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [rbx+80], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[5] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+88], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [rbx+96], r12
-        mov	r14, QWORD PTR [rbx+112]
-        mov	r10, QWORD PTR [rbx+120]
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        ; A[5] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[5] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [rbx+104], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[5] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [rbx+112], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[5] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [rbx+120], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r8], r11
-        mov	r13, QWORD PTR [r8+16]
-        mov	r14, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [r8+32]
-        ; A[5] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[5] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [r8+8], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[5] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [r8+16], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[5] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [r8+24], r14
-        mov	r11, rdi
-        adcx	r10, rax
-        adox	r11, rcx
-        adcx	r11, r15
-        mov	r15, rdi
-        adox	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [r8+32], r10
-        mov	QWORD PTR [r8+40], r11
-        mov	rdx, QWORD PTR [r9+48]
-        mov	r11, QWORD PTR [rbx+48]
-        mov	r12, QWORD PTR [rbx+56]
-        mov	r13, QWORD PTR [rbx+64]
-        mov	r14, QWORD PTR [rbx+72]
-        mov	r10, QWORD PTR [rbx+80]
-        ; A[6] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[6] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+48], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[6] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+56], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[6] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+64], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+72], r14
-        mov	r11, QWORD PTR [rbx+88]
-        mov	r12, QWORD PTR [rbx+96]
-        mov	r13, QWORD PTR [rbx+104]
-        mov	r14, QWORD PTR [rbx+112]
-        ; A[6] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[6] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+80], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[6] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [rbx+88], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[6] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+96], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        mov	QWORD PTR [rbx+104], r13
-        mov	r10, QWORD PTR [rbx+120]
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        mov	r13, QWORD PTR [r8+16]
-        ; A[6] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[6] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [rbx+112], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[6] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [rbx+120], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[6] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r8+8], r12
-        mov	r14, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        ; A[6] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[6] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [r8+16], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[6] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [r8+24], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[6] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [r8+32], r10
-        mov	r12, rdi
-        adcx	r11, rax
-        adox	r12, rcx
-        adcx	r12, r15
-        mov	r15, rdi
-        adox	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [r8+40], r11
-        mov	QWORD PTR [r8+48], r12
-        mov	rdx, QWORD PTR [r9+56]
-        mov	r12, QWORD PTR [rbx+56]
-        mov	r13, QWORD PTR [rbx+64]
-        mov	r14, QWORD PTR [rbx+72]
-        mov	r10, QWORD PTR [rbx+80]
-        mov	r11, QWORD PTR [rbx+88]
-        ; A[7] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[7] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+56], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[7] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+64], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[7] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+72], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbx+80], r10
-        mov	r12, QWORD PTR [rbx+96]
-        mov	r13, QWORD PTR [rbx+104]
-        mov	r14, QWORD PTR [rbx+112]
-        mov	r10, QWORD PTR [rbx+120]
-        ; A[7] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[7] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+88], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[7] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [rbx+96], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[7] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+104], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+112], r14
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        mov	r13, QWORD PTR [r8+16]
-        mov	r14, QWORD PTR [r8+24]
-        ; A[7] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[7] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [rbx+120], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[7] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [r8], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[7] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8+8], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        mov	QWORD PTR [r8+16], r13
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        mov	r12, QWORD PTR [r8+48]
-        ; A[7] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[7] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [r8+24], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[7] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [r8+32], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[7] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [r8+40], r11
-        mov	r13, rdi
-        adcx	r12, rax
-        adox	r13, rcx
-        adcx	r13, r15
-        mov	r15, rdi
-        adox	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [r8+48], r12
-        mov	QWORD PTR [r8+56], r13
-        mov	rdx, QWORD PTR [r9+64]
-        mov	r13, QWORD PTR [rbx+64]
-        mov	r14, QWORD PTR [rbx+72]
-        mov	r10, QWORD PTR [rbx+80]
-        mov	r11, QWORD PTR [rbx+88]
-        mov	r12, QWORD PTR [rbx+96]
-        ; A[8] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[8] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+64], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[8] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+72], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[8] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+80], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbx+88], r11
-        mov	r13, QWORD PTR [rbx+104]
-        mov	r14, QWORD PTR [rbx+112]
-        mov	r10, QWORD PTR [rbx+120]
-        mov	r11, QWORD PTR [r8]
-        ; A[8] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[8] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+96], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[8] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [rbx+104], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[8] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+112], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbx+120], r10
-        mov	r12, QWORD PTR [r8+8]
-        mov	r13, QWORD PTR [r8+16]
-        mov	r14, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [r8+32]
-        ; A[8] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[8] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [r8], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[8] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [r8+8], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[8] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8+16], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+24], r14
-        mov	r11, QWORD PTR [r8+40]
-        mov	r12, QWORD PTR [r8+48]
-        mov	r13, QWORD PTR [r8+56]
-        ; A[8] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[8] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [r8+32], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[8] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [r8+40], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[8] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [r8+48], r12
-        mov	r14, rdi
-        adcx	r13, rax
-        adox	r14, rcx
-        adcx	r14, r15
-        mov	r15, rdi
-        adox	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [r8+56], r13
-        mov	QWORD PTR [r8+64], r14
-        mov	rdx, QWORD PTR [r9+72]
-        mov	r14, QWORD PTR [rbx+72]
-        mov	r10, QWORD PTR [rbx+80]
-        mov	r11, QWORD PTR [rbx+88]
-        mov	r12, QWORD PTR [rbx+96]
-        mov	r13, QWORD PTR [rbx+104]
-        ; A[9] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[9] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+72], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[9] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+80], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[9] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+88], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [rbx+96], r12
-        mov	r14, QWORD PTR [rbx+112]
-        mov	r10, QWORD PTR [rbx+120]
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        ; A[9] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[9] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+104], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[9] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [rbx+112], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[9] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [rbx+120], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r8], r11
-        mov	r13, QWORD PTR [r8+16]
-        mov	r14, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        ; A[9] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[9] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [r8+8], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[9] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [r8+16], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[9] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8+24], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+32], r10
-        mov	r12, QWORD PTR [r8+48]
-        mov	r13, QWORD PTR [r8+56]
-        mov	r14, QWORD PTR [r8+64]
-        ; A[9] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[9] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [r8+40], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[9] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [r8+48], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[9] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [r8+56], r13
-        mov	r10, rdi
-        adcx	r14, rax
-        adox	r10, rcx
-        adcx	r10, r15
-        mov	r15, rdi
-        adox	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [r8+64], r14
-        mov	QWORD PTR [r8+72], r10
-        mov	rdx, QWORD PTR [r9+80]
-        mov	r10, QWORD PTR [rbx+80]
-        mov	r11, QWORD PTR [rbx+88]
-        mov	r12, QWORD PTR [rbx+96]
-        mov	r13, QWORD PTR [rbx+104]
-        mov	r14, QWORD PTR [rbx+112]
-        ; A[10] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[10] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+80], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[10] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+88], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[10] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+96], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        mov	QWORD PTR [rbx+104], r13
-        mov	r10, QWORD PTR [rbx+120]
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        mov	r13, QWORD PTR [r8+16]
-        ; A[10] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[10] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+112], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[10] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [rbx+120], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[10] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [r8], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r8+8], r12
-        mov	r14, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        mov	r12, QWORD PTR [r8+48]
-        ; A[10] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[10] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [r8+16], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[10] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [r8+24], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[10] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8+32], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r8+40], r11
-        mov	r13, QWORD PTR [r8+56]
-        mov	r14, QWORD PTR [r8+64]
-        mov	r10, QWORD PTR [r8+72]
-        ; A[10] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[10] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [r8+48], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[10] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [r8+56], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[10] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [r8+64], r14
-        mov	r11, rdi
-        adcx	r10, rax
-        adox	r11, rcx
-        adcx	r11, r15
-        mov	r15, rdi
-        adox	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [r8+72], r10
-        mov	QWORD PTR [r8+80], r11
-        mov	rdx, QWORD PTR [r9+88]
-        mov	r11, QWORD PTR [rbx+88]
-        mov	r12, QWORD PTR [rbx+96]
-        mov	r13, QWORD PTR [rbx+104]
-        mov	r14, QWORD PTR [rbx+112]
-        mov	r10, QWORD PTR [rbx+120]
-        ; A[11] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[11] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+88], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[11] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+96], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[11] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+104], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbx+112], r14
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        mov	r13, QWORD PTR [r8+16]
-        mov	r14, QWORD PTR [r8+24]
-        ; A[11] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[11] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [rbx+120], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[11] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [r8], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[11] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [r8+8], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        mov	QWORD PTR [r8+16], r13
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        mov	r12, QWORD PTR [r8+48]
-        mov	r13, QWORD PTR [r8+56]
-        ; A[11] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[11] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [r8+24], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[11] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [r8+32], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[11] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8+40], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r8+48], r12
-        mov	r14, QWORD PTR [r8+64]
-        mov	r10, QWORD PTR [r8+72]
-        mov	r11, QWORD PTR [r8+80]
-        ; A[11] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[11] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [r8+56], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[11] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [r8+64], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[11] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [r8+72], r10
-        mov	r12, rdi
-        adcx	r11, rax
-        adox	r12, rcx
-        adcx	r12, r15
-        mov	r15, rdi
-        adox	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [r8+80], r11
-        mov	QWORD PTR [r8+88], r12
-        mov	rdx, QWORD PTR [r9+96]
-        mov	r12, QWORD PTR [rbx+96]
-        mov	r13, QWORD PTR [rbx+104]
-        mov	r14, QWORD PTR [rbx+112]
-        mov	r10, QWORD PTR [rbx+120]
-        mov	r11, QWORD PTR [r8]
-        ; A[12] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[12] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+96], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[12] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+104], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[12] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+112], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbx+120], r10
-        mov	r12, QWORD PTR [r8+8]
-        mov	r13, QWORD PTR [r8+16]
-        mov	r14, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [r8+32]
-        ; A[12] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[12] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [r8], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[12] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [r8+8], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[12] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [r8+16], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+24], r14
-        mov	r11, QWORD PTR [r8+40]
-        mov	r12, QWORD PTR [r8+48]
-        mov	r13, QWORD PTR [r8+56]
-        mov	r14, QWORD PTR [r8+64]
-        ; A[12] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[12] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [r8+32], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[12] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [r8+40], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[12] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8+48], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        mov	QWORD PTR [r8+56], r13
-        mov	r10, QWORD PTR [r8+72]
-        mov	r11, QWORD PTR [r8+80]
-        mov	r12, QWORD PTR [r8+88]
-        ; A[12] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[12] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [r8+64], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[12] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [r8+72], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[12] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [r8+80], r11
-        mov	r13, rdi
-        adcx	r12, rax
-        adox	r13, rcx
-        adcx	r13, r15
-        mov	r15, rdi
-        adox	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [r8+88], r12
-        mov	QWORD PTR [r8+96], r13
-        mov	rdx, QWORD PTR [r9+104]
-        mov	r13, QWORD PTR [rbx+104]
-        mov	r14, QWORD PTR [rbx+112]
-        mov	r10, QWORD PTR [rbx+120]
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        ; A[13] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[13] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+104], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[13] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+112], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[13] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [rbx+120], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r8], r11
-        mov	r13, QWORD PTR [r8+16]
-        mov	r14, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        ; A[13] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[13] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [r8+8], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[13] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [r8+16], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[13] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [r8+24], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+32], r10
-        mov	r12, QWORD PTR [r8+48]
-        mov	r13, QWORD PTR [r8+56]
-        mov	r14, QWORD PTR [r8+64]
-        mov	r10, QWORD PTR [r8+72]
-        ; A[13] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[13] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [r8+40], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[13] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [r8+48], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[13] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8+56], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+64], r14
-        mov	r11, QWORD PTR [r8+80]
-        mov	r12, QWORD PTR [r8+88]
-        mov	r13, QWORD PTR [r8+96]
-        ; A[13] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[13] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [r8+72], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[13] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [r8+80], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[13] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [r8+88], r12
-        mov	r14, rdi
-        adcx	r13, rax
-        adox	r14, rcx
-        adcx	r14, r15
-        mov	r15, rdi
-        adox	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [r8+96], r13
-        mov	QWORD PTR [r8+104], r14
-        mov	rdx, QWORD PTR [r9+112]
-        mov	r14, QWORD PTR [rbx+112]
-        mov	r10, QWORD PTR [rbx+120]
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        mov	r13, QWORD PTR [r8+16]
-        ; A[14] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[14] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+112], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[14] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [rbx+120], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[14] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [r8], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r8+8], r12
-        mov	r14, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        mov	r12, QWORD PTR [r8+48]
-        ; A[14] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[14] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [r8+16], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[14] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [r8+24], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[14] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [r8+32], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r8+40], r11
-        mov	r13, QWORD PTR [r8+56]
-        mov	r14, QWORD PTR [r8+64]
-        mov	r10, QWORD PTR [r8+72]
-        mov	r11, QWORD PTR [r8+80]
-        ; A[14] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[14] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [r8+48], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[14] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [r8+56], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[14] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8+64], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+72], r10
-        mov	r12, QWORD PTR [r8+88]
-        mov	r13, QWORD PTR [r8+96]
-        mov	r14, QWORD PTR [r8+104]
-        ; A[14] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[14] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [r8+80], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[14] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [r8+88], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[14] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [r8+96], r13
-        mov	r10, rdi
-        adcx	r14, rax
-        adox	r10, rcx
-        adcx	r10, r15
-        mov	r15, rdi
-        adox	r15, rdi
-        adcx	r15, rdi
-        mov	QWORD PTR [r8+104], r14
-        mov	QWORD PTR [r8+112], r10
-        mov	rdx, QWORD PTR [r9+120]
-        mov	r10, QWORD PTR [rbx+120]
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        mov	r13, QWORD PTR [r8+16]
-        mov	r14, QWORD PTR [r8+24]
-        ; A[15] * B[0]
-        mulx	rcx, rax, QWORD PTR [rbp]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[15] * B[1]
-        mulx	rcx, rax, QWORD PTR [rbp+8]
-        mov	QWORD PTR [rbx+120], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[15] * B[2]
-        mulx	rcx, rax, QWORD PTR [rbp+16]
-        mov	QWORD PTR [r8], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[15] * B[3]
-        mulx	rcx, rax, QWORD PTR [rbp+24]
-        mov	QWORD PTR [r8+8], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        mov	QWORD PTR [r8+16], r13
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        mov	r12, QWORD PTR [r8+48]
-        mov	r13, QWORD PTR [r8+56]
-        ; A[15] * B[4]
-        mulx	rcx, rax, QWORD PTR [rbp+32]
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[15] * B[5]
-        mulx	rcx, rax, QWORD PTR [rbp+40]
-        mov	QWORD PTR [r8+24], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[15] * B[6]
-        mulx	rcx, rax, QWORD PTR [rbp+48]
-        mov	QWORD PTR [r8+32], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[15] * B[7]
-        mulx	rcx, rax, QWORD PTR [rbp+56]
-        mov	QWORD PTR [r8+40], r11
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r8+48], r12
-        mov	r14, QWORD PTR [r8+64]
-        mov	r10, QWORD PTR [r8+72]
-        mov	r11, QWORD PTR [r8+80]
-        mov	r12, QWORD PTR [r8+88]
-        ; A[15] * B[8]
-        mulx	rcx, rax, QWORD PTR [rbp+64]
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[15] * B[9]
-        mulx	rcx, rax, QWORD PTR [rbp+72]
-        mov	QWORD PTR [r8+56], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[15] * B[10]
-        mulx	rcx, rax, QWORD PTR [rbp+80]
-        mov	QWORD PTR [r8+64], r14
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[15] * B[11]
-        mulx	rcx, rax, QWORD PTR [rbp+88]
-        mov	QWORD PTR [r8+72], r10
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r8+80], r11
-        mov	r13, QWORD PTR [r8+96]
-        mov	r14, QWORD PTR [r8+104]
-        mov	r10, QWORD PTR [r8+112]
-        ; A[15] * B[12]
-        mulx	rcx, rax, QWORD PTR [rbp+96]
-        adcx	r12, rax
-        adox	r13, rcx
-        ; A[15] * B[13]
-        mulx	rcx, rax, QWORD PTR [rbp+104]
-        mov	QWORD PTR [r8+88], r12
-        adcx	r13, rax
-        adox	r14, rcx
-        ; A[15] * B[14]
-        mulx	rcx, rax, QWORD PTR [rbp+112]
-        mov	QWORD PTR [r8+96], r13
-        adcx	r14, rax
-        adox	r10, rcx
-        ; A[15] * B[15]
-        mulx	rcx, rax, QWORD PTR [rbp+120]
-        mov	QWORD PTR [r8+104], r14
-        mov	r11, rdi
-        adcx	r10, rax
-        adox	r11, rcx
-        adcx	r11, r15
-        mov	QWORD PTR [r8+112], r10
-        mov	QWORD PTR [r8+120], r11
-        sub	r8, 128
-        cmp	r9, r8
-        je	L_start_1024_mul_avx2_16
-        cmp	rbp, r8
-        jne	L_end_1024_mul_avx2_16
-L_start_1024_mul_avx2_16:
-        vmovdqu	xmm0, OWORD PTR [rbx]
-        vmovups	OWORD PTR [r8], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbx+16]
-        vmovups	OWORD PTR [r8+16], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbx+32]
-        vmovups	OWORD PTR [r8+32], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbx+48]
-        vmovups	OWORD PTR [r8+48], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbx+64]
-        vmovups	OWORD PTR [r8+64], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbx+80]
-        vmovups	OWORD PTR [r8+80], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbx+96]
-        vmovups	OWORD PTR [r8+96], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbx+112]
-        vmovups	OWORD PTR [r8+112], xmm0
-L_end_1024_mul_avx2_16:
-        add	rsp, 128
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        pop	rbp
-        pop	rbx
-        ret
-sp_1024_mul_avx2_16 ENDP
-_text ENDS
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-; /* Square a and put result in r. (r = a * a)
-;  *
-;  * r  A single precision integer.
-;  * a  A single precision integer.
-;  */
-_text SEGMENT READONLY PARA
-sp_1024_sqr_avx2_16 PROC
-        push	rbp
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        push	rbx
-        mov	r8, rcx
-        mov	r9, rdx
-        sub	rsp, 128
-        cmp	r9, r8
-        mov	rbp, rsp
-        cmovne	rbp, r8
-        add	r8, 128
-        xor	r13, r13
-        ; Diagonal 1
-        ; Zero into %r9
-        ; Zero into %r10
-        ; A[1] x A[0]
-        mov	rdx, QWORD PTR [r9]
-        mulx	r11, r10, QWORD PTR [r9+8]
-        ; A[2] x A[0]
-        mulx	r12, rax, QWORD PTR [r9+16]
-        adcx	r11, rax
-        adox	r12, r13
-        mov	QWORD PTR [rbp+8], r10
-        mov	QWORD PTR [rbp+16], r11
-        ; Zero into %r8
-        ; Zero into %r9
-        ; A[3] x A[0]
-        mulx	r10, rax, QWORD PTR [r9+24]
-        adcx	r12, rax
-        adox	r10, r13
-        ; A[4] x A[0]
-        mulx	r11, rax, QWORD PTR [r9+32]
-        adcx	r10, rax
-        adox	r11, r13
-        mov	QWORD PTR [rbp+24], r12
-        mov	QWORD PTR [rbp+32], r10
-        ; Zero into %r10
-        ; Zero into %r8
-        ; A[5] x A[0]
-        mulx	r12, rax, QWORD PTR [r9+40]
-        adcx	r11, rax
-        adox	r12, r13
-        ; A[6] x A[0]
-        mulx	r10, rax, QWORD PTR [r9+48]
-        adcx	r12, rax
-        adox	r10, r13
-        mov	QWORD PTR [rbp+40], r11
-        mov	QWORD PTR [rbp+48], r12
-        ; Zero into %r9
-        ; Zero into %r10
-        ; A[7] x A[0]
-        mulx	r11, rax, QWORD PTR [r9+56]
-        adcx	r10, rax
-        adox	r11, r13
-        ; A[8] x A[0]
-        mulx	r12, rax, QWORD PTR [r9+64]
-        adcx	r11, rax
-        adox	r12, r13
-        mov	QWORD PTR [rbp+56], r10
-        mov	QWORD PTR [rbp+64], r11
-        ; Zero into %r8
-        ; Zero into %r9
-        ; A[9] x A[0]
-        mulx	r10, rax, QWORD PTR [r9+72]
-        adcx	r12, rax
-        adox	r10, r13
-        ; A[10] x A[0]
-        mulx	r11, rax, QWORD PTR [r9+80]
-        adcx	r10, rax
-        adox	r11, r13
-        mov	QWORD PTR [rbp+72], r12
-        mov	QWORD PTR [rbp+80], r10
-        ; No load %r13 - %r10
-        ; A[11] x A[0]
-        mulx	r15, rax, QWORD PTR [r9+88]
-        adcx	r11, rax
-        adox	r15, r13
-        ; A[12] x A[0]
-        mulx	rdi, rax, QWORD PTR [r9+96]
-        adcx	r15, rax
-        adox	rdi, r13
-        mov	QWORD PTR [rbp+88], r11
-        ; No store %r13 - %r10
-        ; No load %r15 - %r9
-        ; A[13] x A[0]
-        mulx	rsi, rax, QWORD PTR [r9+104]
-        adcx	rdi, rax
-        adox	rsi, r13
-        ; A[14] x A[0]
-        mulx	rbx, rax, QWORD PTR [r9+112]
-        adcx	rsi, rax
-        adox	rbx, r13
-        ; No store %r14 - %r8
-        ; No store %r15 - %r9
-        ; Zero into %r8
-        ; Zero into %r9
-        ; A[15] x A[0]
-        mulx	r10, rax, QWORD PTR [r9+120]
-        adcx	rbx, rax
-        adox	r10, r13
-        ; No store %rbx - %r10
-        ;  Carry
-        adcx	r10, r13
-        mov	r14, r13
-        adcx	r14, r13
-        adox	r14, r13
-        mov	QWORD PTR [r8], r10
-        ; Diagonal 2
-        mov	r10, QWORD PTR [rbp+24]
-        mov	r11, QWORD PTR [rbp+32]
-        mov	r12, QWORD PTR [rbp+40]
-        ; A[2] x A[1]
-        mov	rdx, QWORD PTR [r9+8]
-        mulx	rcx, rax, QWORD PTR [r9+16]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[3] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+24]
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbp+24], r10
-        mov	QWORD PTR [rbp+32], r11
-        mov	r10, QWORD PTR [rbp+48]
-        mov	r11, QWORD PTR [rbp+56]
-        ; A[4] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+32]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[5] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+40]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbp+40], r12
-        mov	QWORD PTR [rbp+48], r10
-        mov	r12, QWORD PTR [rbp+64]
-        mov	r10, QWORD PTR [rbp+72]
-        ; A[6] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+48]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[7] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+56]
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbp+56], r11
-        mov	QWORD PTR [rbp+64], r12
-        mov	r11, QWORD PTR [rbp+80]
-        mov	r12, QWORD PTR [rbp+88]
-        ; A[8] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+64]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[9] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+72]
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbp+72], r10
-        mov	QWORD PTR [rbp+80], r11
-        ; No load %r13 - %r8
-        ; A[10] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+80]
-        adcx	r12, rax
-        adox	r15, rcx
-        ; A[11] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+88]
-        adcx	r15, rax
-        adox	rdi, rcx
-        mov	QWORD PTR [rbp+88], r12
-        ; No store %r13 - %r8
-        ; No load %r15 - %r10
-        ; A[12] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+96]
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; A[13] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+104]
-        adcx	rsi, rax
-        adox	rbx, rcx
-        ; No store %r14 - %r9
-        ; No store %r15 - %r10
-        mov	r11, QWORD PTR [r8]
-        ; Zero into %r10
-        ; A[14] x A[1]
-        mulx	rcx, rax, QWORD PTR [r9+112]
-        adcx	rbx, rax
-        adox	r11, rcx
-        ; A[15] x A[1]
-        mulx	r12, rax, QWORD PTR [r9+120]
-        adcx	r11, rax
-        adox	r12, r13
-        ; No store %rbx - %r8
-        mov	QWORD PTR [r8], r11
-        ; Zero into %r8
-        ; Zero into %r9
-        ; A[15] x A[2]
-        mov	rdx, QWORD PTR [r9+16]
-        mulx	r10, rax, QWORD PTR [r9+120]
-        adcx	r12, rax
-        adox	r10, r13
-        mov	QWORD PTR [r8+8], r12
-        ;  Carry
-        adcx	r10, r14
-        mov	r14, r13
-        adcx	r14, r13
-        adox	r14, r13
-        mov	QWORD PTR [r8+16], r10
-        ; Diagonal 3
-        mov	r10, QWORD PTR [rbp+40]
-        mov	r11, QWORD PTR [rbp+48]
-        mov	r12, QWORD PTR [rbp+56]
-        ; A[3] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+24]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[4] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+32]
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbp+40], r10
-        mov	QWORD PTR [rbp+48], r11
-        mov	r10, QWORD PTR [rbp+64]
-        mov	r11, QWORD PTR [rbp+72]
-        ; A[5] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+40]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[6] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+48]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbp+56], r12
-        mov	QWORD PTR [rbp+64], r10
-        mov	r12, QWORD PTR [rbp+80]
-        mov	r10, QWORD PTR [rbp+88]
-        ; A[7] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+56]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[8] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+64]
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [rbp+72], r11
-        mov	QWORD PTR [rbp+80], r12
-        ; No load %r13 - %r9
-        ; A[9] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+72]
-        adcx	r10, rax
-        adox	r15, rcx
-        ; A[10] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+80]
-        adcx	r15, rax
-        adox	rdi, rcx
-        mov	QWORD PTR [rbp+88], r10
-        ; No store %r13 - %r9
-        ; No load %r15 - %r8
-        ; A[11] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+88]
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; A[12] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+96]
-        adcx	rsi, rax
-        adox	rbx, rcx
-        ; No store %r14 - %r10
-        ; No store %r15 - %r8
-        mov	r12, QWORD PTR [r8]
-        mov	r10, QWORD PTR [r8+8]
-        ; A[13] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+104]
-        adcx	rbx, rax
-        adox	r12, rcx
-        ; A[14] x A[2]
-        mulx	rcx, rax, QWORD PTR [r9+112]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; No store %rbx - %r9
-        mov	QWORD PTR [r8], r12
-        mov	r11, QWORD PTR [r8+16]
-        ; Zero into %r10
-        ; A[14] x A[3]
-        mov	rdx, QWORD PTR [r9+24]
-        mulx	rcx, rax, QWORD PTR [r9+112]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[14] x A[4]
-        mov	rdx, QWORD PTR [r9+32]
-        mulx	r12, rax, QWORD PTR [r9+112]
-        adcx	r11, rax
-        adox	r12, r13
-        mov	QWORD PTR [r8+8], r10
-        mov	QWORD PTR [r8+16], r11
-        ; Zero into %r8
-        ; Zero into %r9
-        ; A[14] x A[5]
-        mov	rdx, QWORD PTR [r9+40]
-        mulx	r10, rax, QWORD PTR [r9+112]
-        adcx	r12, rax
-        adox	r10, r13
-        mov	QWORD PTR [r8+24], r12
-        ;  Carry
-        adcx	r10, r14
-        mov	r14, r13
-        adcx	r14, r13
-        adox	r14, r13
-        mov	QWORD PTR [r8+32], r10
-        ; Diagonal 4
-        mov	r10, QWORD PTR [rbp+56]
-        mov	r11, QWORD PTR [rbp+64]
-        mov	r12, QWORD PTR [rbp+72]
-        ; A[4] x A[3]
-        mov	rdx, QWORD PTR [r9+24]
-        mulx	rcx, rax, QWORD PTR [r9+32]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[5] x A[3]
-        mulx	rcx, rax, QWORD PTR [r9+40]
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbp+56], r10
-        mov	QWORD PTR [rbp+64], r11
-        mov	r10, QWORD PTR [rbp+80]
-        mov	r11, QWORD PTR [rbp+88]
-        ; A[6] x A[3]
-        mulx	rcx, rax, QWORD PTR [r9+48]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[7] x A[3]
-        mulx	rcx, rax, QWORD PTR [r9+56]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [rbp+72], r12
-        mov	QWORD PTR [rbp+80], r10
-        ; No load %r13 - %r10
-        ; A[8] x A[3]
-        mulx	rcx, rax, QWORD PTR [r9+64]
-        adcx	r11, rax
-        adox	r15, rcx
-        ; A[9] x A[3]
-        mulx	rcx, rax, QWORD PTR [r9+72]
-        adcx	r15, rax
-        adox	rdi, rcx
-        mov	QWORD PTR [rbp+88], r11
-        ; No store %r13 - %r10
-        ; No load %r15 - %r9
-        ; A[10] x A[3]
-        mulx	rcx, rax, QWORD PTR [r9+80]
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; A[11] x A[3]
-        mulx	rcx, rax, QWORD PTR [r9+88]
-        adcx	rsi, rax
-        adox	rbx, rcx
-        ; No store %r14 - %r8
-        ; No store %r15 - %r9
-        mov	r10, QWORD PTR [r8]
-        mov	r11, QWORD PTR [r8+8]
-        ; A[12] x A[3]
-        mulx	rcx, rax, QWORD PTR [r9+96]
-        adcx	rbx, rax
-        adox	r10, rcx
-        ; A[13] x A[3]
-        mulx	rcx, rax, QWORD PTR [r9+104]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; No store %rbx - %r10
-        mov	QWORD PTR [r8], r10
-        mov	r12, QWORD PTR [r8+16]
-        mov	r10, QWORD PTR [r8+24]
-        ; A[13] x A[4]
-        mov	rdx, QWORD PTR [r9+32]
-        mulx	rcx, rax, QWORD PTR [r9+104]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[13] x A[5]
-        mov	rdx, QWORD PTR [r9+40]
-        mulx	rcx, rax, QWORD PTR [r9+104]
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+8], r11
-        mov	QWORD PTR [r8+16], r12
-        mov	r11, QWORD PTR [r8+32]
-        ; Zero into %r10
-        ; A[13] x A[6]
-        mov	rdx, QWORD PTR [r9+48]
-        mulx	rcx, rax, QWORD PTR [r9+104]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[13] x A[7]
-        mov	rdx, QWORD PTR [r9+56]
-        mulx	r12, rax, QWORD PTR [r9+104]
-        adcx	r11, rax
-        adox	r12, r13
-        mov	QWORD PTR [r8+24], r10
-        mov	QWORD PTR [r8+32], r11
-        ; Zero into %r8
-        ; Zero into %r9
-        ; A[13] x A[8]
-        mov	rdx, QWORD PTR [r9+64]
-        mulx	r10, rax, QWORD PTR [r9+104]
-        adcx	r12, rax
-        adox	r10, r13
-        mov	QWORD PTR [r8+40], r12
-        ;  Carry
-        adcx	r10, r14
-        mov	r14, r13
-        adcx	r14, r13
-        adox	r14, r13
-        mov	QWORD PTR [r8+48], r10
-        ; Diagonal 5
-        mov	r10, QWORD PTR [rbp+72]
-        mov	r11, QWORD PTR [rbp+80]
-        mov	r12, QWORD PTR [rbp+88]
-        ; A[5] x A[4]
-        mov	rdx, QWORD PTR [r9+32]
-        mulx	rcx, rax, QWORD PTR [r9+40]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[6] x A[4]
-        mulx	rcx, rax, QWORD PTR [r9+48]
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [rbp+72], r10
-        mov	QWORD PTR [rbp+80], r11
-        ; No load %r13 - %r8
-        ; A[7] x A[4]
-        mulx	rcx, rax, QWORD PTR [r9+56]
-        adcx	r12, rax
-        adox	r15, rcx
-        ; A[8] x A[4]
-        mulx	rcx, rax, QWORD PTR [r9+64]
-        adcx	r15, rax
-        adox	rdi, rcx
-        mov	QWORD PTR [rbp+88], r12
-        ; No store %r13 - %r8
-        ; No load %r15 - %r10
-        ; A[9] x A[4]
-        mulx	rcx, rax, QWORD PTR [r9+72]
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; A[10] x A[4]
-        mulx	rcx, rax, QWORD PTR [r9+80]
-        adcx	rsi, rax
-        adox	rbx, rcx
-        ; No store %r14 - %r9
-        ; No store %r15 - %r10
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        ; A[11] x A[4]
-        mulx	rcx, rax, QWORD PTR [r9+88]
-        adcx	rbx, rax
-        adox	r11, rcx
-        ; A[12] x A[4]
-        mulx	rcx, rax, QWORD PTR [r9+96]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; No store %rbx - %r8
-        mov	QWORD PTR [r8], r11
-        mov	r10, QWORD PTR [r8+16]
-        mov	r11, QWORD PTR [r8+24]
-        ; A[12] x A[5]
-        mov	rdx, QWORD PTR [r9+40]
-        mulx	rcx, rax, QWORD PTR [r9+96]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[12] x A[6]
-        mov	rdx, QWORD PTR [r9+48]
-        mulx	rcx, rax, QWORD PTR [r9+96]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+8], r12
-        mov	QWORD PTR [r8+16], r10
-        mov	r12, QWORD PTR [r8+32]
-        mov	r10, QWORD PTR [r8+40]
-        ; A[12] x A[7]
-        mov	rdx, QWORD PTR [r9+56]
-        mulx	rcx, rax, QWORD PTR [r9+96]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[12] x A[8]
-        mov	rdx, QWORD PTR [r9+64]
-        mulx	rcx, rax, QWORD PTR [r9+96]
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+24], r11
-        mov	QWORD PTR [r8+32], r12
-        mov	r11, QWORD PTR [r8+48]
-        ; Zero into %r10
-        ; A[12] x A[9]
-        mov	rdx, QWORD PTR [r9+72]
-        mulx	rcx, rax, QWORD PTR [r9+96]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[12] x A[10]
-        mov	rdx, QWORD PTR [r9+80]
-        mulx	r12, rax, QWORD PTR [r9+96]
-        adcx	r11, rax
-        adox	r12, r13
-        mov	QWORD PTR [r8+40], r10
-        mov	QWORD PTR [r8+48], r11
-        ; Zero into %r8
-        ; Zero into %r9
-        ; A[12] x A[11]
-        mov	rdx, QWORD PTR [r9+88]
-        mulx	r10, rax, QWORD PTR [r9+96]
-        adcx	r12, rax
-        adox	r10, r13
-        mov	QWORD PTR [r8+56], r12
-        ;  Carry
-        adcx	r10, r14
-        mov	r14, r13
-        adcx	r14, r13
-        adox	r14, r13
-        mov	QWORD PTR [r8+64], r10
-        ; Diagonal 6
-        mov	r10, QWORD PTR [rbp+88]
-        ; No load %r13 - %r9
-        ; A[6] x A[5]
-        mov	rdx, QWORD PTR [r9+40]
-        mulx	rcx, rax, QWORD PTR [r9+48]
-        adcx	r10, rax
-        adox	r15, rcx
-        ; A[7] x A[5]
-        mulx	rcx, rax, QWORD PTR [r9+56]
-        adcx	r15, rax
-        adox	rdi, rcx
-        mov	QWORD PTR [rbp+88], r10
-        ; No store %r13 - %r9
-        ; No load %r15 - %r8
-        ; A[8] x A[5]
-        mulx	rcx, rax, QWORD PTR [r9+64]
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; A[9] x A[5]
-        mulx	rcx, rax, QWORD PTR [r9+72]
-        adcx	rsi, rax
-        adox	rbx, rcx
-        ; No store %r14 - %r10
-        ; No store %r15 - %r8
-        mov	r12, QWORD PTR [r8]
-        mov	r10, QWORD PTR [r8+8]
-        ; A[10] x A[5]
-        mulx	rcx, rax, QWORD PTR [r9+80]
-        adcx	rbx, rax
-        adox	r12, rcx
-        ; A[11] x A[5]
-        mulx	rcx, rax, QWORD PTR [r9+88]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; No store %rbx - %r9
-        mov	QWORD PTR [r8], r12
-        mov	r11, QWORD PTR [r8+16]
-        mov	r12, QWORD PTR [r8+24]
-        ; A[11] x A[6]
-        mov	rdx, QWORD PTR [r9+48]
-        mulx	rcx, rax, QWORD PTR [r9+88]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[11] x A[7]
-        mov	rdx, QWORD PTR [r9+56]
-        mulx	rcx, rax, QWORD PTR [r9+88]
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r8+8], r10
-        mov	QWORD PTR [r8+16], r11
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        ; A[11] x A[8]
-        mov	rdx, QWORD PTR [r9+64]
-        mulx	rcx, rax, QWORD PTR [r9+88]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[11] x A[9]
-        mov	rdx, QWORD PTR [r9+72]
-        mulx	rcx, rax, QWORD PTR [r9+88]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+24], r12
-        mov	QWORD PTR [r8+32], r10
-        mov	r12, QWORD PTR [r8+48]
-        mov	r10, QWORD PTR [r8+56]
-        ; A[11] x A[10]
-        mov	rdx, QWORD PTR [r9+80]
-        mulx	rcx, rax, QWORD PTR [r9+88]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[13] x A[9]
-        mov	rdx, QWORD PTR [r9+72]
-        mulx	rcx, rax, QWORD PTR [r9+104]
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+40], r11
-        mov	QWORD PTR [r8+48], r12
-        mov	r11, QWORD PTR [r8+64]
-        ; Zero into %r10
-        ; A[13] x A[10]
-        mov	rdx, QWORD PTR [r9+80]
-        mulx	rcx, rax, QWORD PTR [r9+104]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[13] x A[11]
-        mov	rdx, QWORD PTR [r9+88]
-        mulx	r12, rax, QWORD PTR [r9+104]
-        adcx	r11, rax
-        adox	r12, r13
-        mov	QWORD PTR [r8+56], r10
-        mov	QWORD PTR [r8+64], r11
-        ; Zero into %r8
-        ; Zero into %r9
-        ; A[13] x A[12]
-        mov	rdx, QWORD PTR [r9+96]
-        mulx	r10, rax, QWORD PTR [r9+104]
-        adcx	r12, rax
-        adox	r10, r13
-        mov	QWORD PTR [r8+72], r12
-        ;  Carry
-        adcx	r10, r14
-        mov	r14, r13
-        adcx	r14, r13
-        adox	r14, r13
-        mov	QWORD PTR [r8+80], r10
-        ; Diagonal 7
-        ; No load %r15 - %r9
-        ; A[7] x A[6]
-        mov	rdx, QWORD PTR [r9+48]
-        mulx	rcx, rax, QWORD PTR [r9+56]
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; A[8] x A[6]
-        mulx	rcx, rax, QWORD PTR [r9+64]
-        adcx	rsi, rax
-        adox	rbx, rcx
-        ; No store %r14 - %r8
-        ; No store %r15 - %r9
-        mov	r10, QWORD PTR [r8]
-        mov	r11, QWORD PTR [r8+8]
-        ; A[9] x A[6]
-        mulx	rcx, rax, QWORD PTR [r9+72]
-        adcx	rbx, rax
-        adox	r10, rcx
-        ; A[10] x A[6]
-        mulx	rcx, rax, QWORD PTR [r9+80]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; No store %rbx - %r10
-        mov	QWORD PTR [r8], r10
-        mov	r12, QWORD PTR [r8+16]
-        mov	r10, QWORD PTR [r8+24]
-        ; A[10] x A[7]
-        mov	rdx, QWORD PTR [r9+56]
-        mulx	rcx, rax, QWORD PTR [r9+80]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[10] x A[8]
-        mov	rdx, QWORD PTR [r9+64]
-        mulx	rcx, rax, QWORD PTR [r9+80]
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+8], r11
-        mov	QWORD PTR [r8+16], r12
-        mov	r11, QWORD PTR [r8+32]
-        mov	r12, QWORD PTR [r8+40]
-        ; A[10] x A[9]
-        mov	rdx, QWORD PTR [r9+72]
-        mulx	rcx, rax, QWORD PTR [r9+80]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[14] x A[6]
-        mov	rdx, QWORD PTR [r9+48]
-        mulx	rcx, rax, QWORD PTR [r9+112]
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r8+24], r10
-        mov	QWORD PTR [r8+32], r11
-        mov	r10, QWORD PTR [r8+48]
-        mov	r11, QWORD PTR [r8+56]
-        ; A[14] x A[7]
-        mov	rdx, QWORD PTR [r9+56]
-        mulx	rcx, rax, QWORD PTR [r9+112]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[14] x A[8]
-        mov	rdx, QWORD PTR [r9+64]
-        mulx	rcx, rax, QWORD PTR [r9+112]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+40], r12
-        mov	QWORD PTR [r8+48], r10
-        mov	r12, QWORD PTR [r8+64]
-        mov	r10, QWORD PTR [r8+72]
-        ; A[14] x A[9]
-        mov	rdx, QWORD PTR [r9+72]
-        mulx	rcx, rax, QWORD PTR [r9+112]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[14] x A[10]
-        mov	rdx, QWORD PTR [r9+80]
-        mulx	rcx, rax, QWORD PTR [r9+112]
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+56], r11
-        mov	QWORD PTR [r8+64], r12
-        mov	r11, QWORD PTR [r8+80]
-        ; Zero into %r10
-        ; A[14] x A[11]
-        mov	rdx, QWORD PTR [r9+88]
-        mulx	rcx, rax, QWORD PTR [r9+112]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[14] x A[12]
-        mov	rdx, QWORD PTR [r9+96]
-        mulx	r12, rax, QWORD PTR [r9+112]
-        adcx	r11, rax
-        adox	r12, r13
-        mov	QWORD PTR [r8+72], r10
-        mov	QWORD PTR [r8+80], r11
-        ; Zero into %r8
-        ; Zero into %r9
-        ; A[14] x A[13]
-        mov	rdx, QWORD PTR [r9+104]
-        mulx	r10, rax, QWORD PTR [r9+112]
-        adcx	r12, rax
-        adox	r10, r13
-        mov	QWORD PTR [r8+88], r12
-        ;  Carry
-        adcx	r10, r14
-        mov	r14, r13
-        adcx	r14, r13
-        adox	r14, r13
-        mov	QWORD PTR [r8+96], r10
-        ; Diagonal 8
-        mov	r11, QWORD PTR [r8]
-        mov	r12, QWORD PTR [r8+8]
-        ; A[8] x A[7]
-        mov	rdx, QWORD PTR [r9+56]
-        mulx	rcx, rax, QWORD PTR [r9+64]
-        adcx	rbx, rax
-        adox	r11, rcx
-        ; A[9] x A[7]
-        mulx	rcx, rax, QWORD PTR [r9+72]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; No store %rbx - %r8
-        mov	QWORD PTR [r8], r11
-        mov	r10, QWORD PTR [r8+16]
-        mov	r11, QWORD PTR [r8+24]
-        ; A[9] x A[8]
-        mov	rdx, QWORD PTR [r9+64]
-        mulx	rcx, rax, QWORD PTR [r9+72]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[15] x A[3]
-        mov	rdx, QWORD PTR [r9+24]
-        mulx	rcx, rax, QWORD PTR [r9+120]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+8], r12
-        mov	QWORD PTR [r8+16], r10
-        mov	r12, QWORD PTR [r8+32]
-        mov	r10, QWORD PTR [r8+40]
-        ; A[15] x A[4]
-        mov	rdx, QWORD PTR [r9+32]
-        mulx	rcx, rax, QWORD PTR [r9+120]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[15] x A[5]
-        mov	rdx, QWORD PTR [r9+40]
-        mulx	rcx, rax, QWORD PTR [r9+120]
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+24], r11
-        mov	QWORD PTR [r8+32], r12
-        mov	r11, QWORD PTR [r8+48]
-        mov	r12, QWORD PTR [r8+56]
-        ; A[15] x A[6]
-        mov	rdx, QWORD PTR [r9+48]
-        mulx	rcx, rax, QWORD PTR [r9+120]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[15] x A[7]
-        mov	rdx, QWORD PTR [r9+56]
-        mulx	rcx, rax, QWORD PTR [r9+120]
-        adcx	r11, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r8+40], r10
-        mov	QWORD PTR [r8+48], r11
-        mov	r10, QWORD PTR [r8+64]
-        mov	r11, QWORD PTR [r8+72]
-        ; A[15] x A[8]
-        mov	rdx, QWORD PTR [r9+64]
-        mulx	rcx, rax, QWORD PTR [r9+120]
-        adcx	r12, rax
-        adox	r10, rcx
-        ; A[15] x A[9]
-        mov	rdx, QWORD PTR [r9+72]
-        mulx	rcx, rax, QWORD PTR [r9+120]
-        adcx	r10, rax
-        adox	r11, rcx
-        mov	QWORD PTR [r8+56], r12
-        mov	QWORD PTR [r8+64], r10
-        mov	r12, QWORD PTR [r8+80]
-        mov	r10, QWORD PTR [r8+88]
-        ; A[15] x A[10]
-        mov	rdx, QWORD PTR [r9+80]
-        mulx	rcx, rax, QWORD PTR [r9+120]
-        adcx	r11, rax
-        adox	r12, rcx
-        ; A[15] x A[11]
-        mov	rdx, QWORD PTR [r9+88]
-        mulx	rcx, rax, QWORD PTR [r9+120]
-        adcx	r12, rax
-        adox	r10, rcx
-        mov	QWORD PTR [r8+72], r11
-        mov	QWORD PTR [r8+80], r12
-        mov	r11, QWORD PTR [r8+96]
-        ; Zero into %r10
-        ; A[15] x A[12]
-        mov	rdx, QWORD PTR [r9+96]
-        mulx	rcx, rax, QWORD PTR [r9+120]
-        adcx	r10, rax
-        adox	r11, rcx
-        ; A[15] x A[13]
-        mov	rdx, QWORD PTR [r9+104]
-        mulx	r12, rax, QWORD PTR [r9+120]
-        adcx	r11, rax
-        adox	r12, r13
-        mov	QWORD PTR [r8+88], r10
-        mov	QWORD PTR [r8+96], r11
-        ; Zero into %r8
-        ; Zero into %r9
-        ; A[15] x A[14]
-        mov	rdx, QWORD PTR [r9+112]
-        mulx	r10, rax, QWORD PTR [r9+120]
-        adcx	r12, rax
-        adox	r10, r13
-        mov	QWORD PTR [r8+104], r12
-        ;  Carry
-        adcx	r10, r14
-        mov	r14, r13
-        adcx	r14, r13
-        adox	r14, r13
-        mov	QWORD PTR [r8+112], r10
-        mov	QWORD PTR [r8+120], r14
-        ; Double and Add in A[i] x A[i]
-        mov	r11, QWORD PTR [rbp+8]
-        ; A[0] x A[0]
-        mov	rdx, QWORD PTR [r9]
-        mulx	rcx, rax, rdx
-        mov	QWORD PTR [rbp], rax
-        adox	r11, r11
-        adcx	r11, rcx
-        mov	QWORD PTR [rbp+8], r11
-        mov	r10, QWORD PTR [rbp+16]
-        mov	r11, QWORD PTR [rbp+24]
-        ; A[1] x A[1]
-        mov	rdx, QWORD PTR [r9+8]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [rbp+16], r10
-        mov	QWORD PTR [rbp+24], r11
-        mov	r10, QWORD PTR [rbp+32]
-        mov	r11, QWORD PTR [rbp+40]
-        ; A[2] x A[2]
-        mov	rdx, QWORD PTR [r9+16]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [rbp+32], r10
-        mov	QWORD PTR [rbp+40], r11
-        mov	r10, QWORD PTR [rbp+48]
-        mov	r11, QWORD PTR [rbp+56]
-        ; A[3] x A[3]
-        mov	rdx, QWORD PTR [r9+24]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [rbp+48], r10
-        mov	QWORD PTR [rbp+56], r11
-        mov	r10, QWORD PTR [rbp+64]
-        mov	r11, QWORD PTR [rbp+72]
-        ; A[4] x A[4]
-        mov	rdx, QWORD PTR [r9+32]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [rbp+64], r10
-        mov	QWORD PTR [rbp+72], r11
-        mov	r10, QWORD PTR [rbp+80]
-        mov	r11, QWORD PTR [rbp+88]
-        ; A[5] x A[5]
-        mov	rdx, QWORD PTR [r9+40]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [rbp+80], r10
-        mov	QWORD PTR [rbp+88], r11
-        ; A[6] x A[6]
-        mov	rdx, QWORD PTR [r9+48]
-        mulx	rcx, rax, rdx
-        adox	r15, r15
-        adox	rdi, rdi
-        adcx	r15, rax
-        adcx	rdi, rcx
-        ; A[7] x A[7]
-        mov	rdx, QWORD PTR [r9+56]
-        mulx	rcx, rax, rdx
-        adox	rsi, rsi
-        adox	rbx, rbx
-        adcx	rsi, rax
-        adcx	rbx, rcx
-        mov	r10, QWORD PTR [r8]
-        mov	r11, QWORD PTR [r8+8]
-        ; A[8] x A[8]
-        mov	rdx, QWORD PTR [r9+64]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [r8], r10
-        mov	QWORD PTR [r8+8], r11
-        mov	r10, QWORD PTR [r8+16]
-        mov	r11, QWORD PTR [r8+24]
-        ; A[9] x A[9]
-        mov	rdx, QWORD PTR [r9+72]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [r8+16], r10
-        mov	QWORD PTR [r8+24], r11
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        ; A[10] x A[10]
-        mov	rdx, QWORD PTR [r9+80]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [r8+32], r10
-        mov	QWORD PTR [r8+40], r11
-        mov	r10, QWORD PTR [r8+48]
-        mov	r11, QWORD PTR [r8+56]
-        ; A[11] x A[11]
-        mov	rdx, QWORD PTR [r9+88]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [r8+48], r10
-        mov	QWORD PTR [r8+56], r11
-        mov	r10, QWORD PTR [r8+64]
-        mov	r11, QWORD PTR [r8+72]
-        ; A[12] x A[12]
-        mov	rdx, QWORD PTR [r9+96]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [r8+64], r10
-        mov	QWORD PTR [r8+72], r11
-        mov	r10, QWORD PTR [r8+80]
-        mov	r11, QWORD PTR [r8+88]
-        ; A[13] x A[13]
-        mov	rdx, QWORD PTR [r9+104]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [r8+80], r10
-        mov	QWORD PTR [r8+88], r11
-        mov	r10, QWORD PTR [r8+96]
-        mov	r11, QWORD PTR [r8+104]
-        ; A[14] x A[14]
-        mov	rdx, QWORD PTR [r9+112]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [r8+96], r10
-        mov	QWORD PTR [r8+104], r11
-        mov	r10, QWORD PTR [r8+112]
-        mov	r11, QWORD PTR [r8+120]
-        ; A[15] x A[15]
-        mov	rdx, QWORD PTR [r9+120]
-        mulx	rcx, rax, rdx
-        adox	r10, r10
-        adox	r11, r11
-        adcx	r10, rax
-        adcx	r11, rcx
-        mov	QWORD PTR [r8+112], r10
-        mov	QWORD PTR [r8+120], r11
-        mov	QWORD PTR [r8+-32], r15
-        mov	QWORD PTR [r8+-24], rdi
-        mov	QWORD PTR [r8+-16], rsi
-        mov	QWORD PTR [r8+-8], rbx
-        sub	r8, 128
-        cmp	r9, r8
-        jne	L_end_1024_sqr_avx2_16
-        vmovdqu	xmm0, OWORD PTR [rbp]
-        vmovups	OWORD PTR [r8], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbp+16]
-        vmovups	OWORD PTR [r8+16], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbp+32]
-        vmovups	OWORD PTR [r8+32], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbp+48]
-        vmovups	OWORD PTR [r8+48], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbp+64]
-        vmovups	OWORD PTR [r8+64], xmm0
-        vmovdqu	xmm0, OWORD PTR [rbp+80]
-        vmovups	OWORD PTR [r8+80], xmm0
-L_end_1024_sqr_avx2_16:
-        add	rsp, 128
-        pop	rbx
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        pop	rbp
-        ret
-sp_1024_sqr_avx2_16 ENDP
-_text ENDS
-ENDIF
-; /* Add b to a into r (r = a + b) over 16 64-bit words; the carry out is returned in rax.
-;  *
-;  * r  Result, written through rcx (16 words).
-;  * a  First operand, read through rdx (16 words).
-;  * b  Second operand, read through r8 (16 words).
-;  */
-_text SEGMENT READONLY PARA
-sp_1024_add_16 PROC
-        ; Add word by word; r9 and r10 alternate as the working register
-        mov	r9, QWORD PTR [rdx]
-        xor	rax, rax  ; rax = 0 so the final adc leaves only the carry in it
-        add	r9, QWORD PTR [r8]  ; word 0: plain add starts the carry chain
-        mov	r10, QWORD PTR [rdx+8]  ; mov does not touch flags, so CF survives the loads/stores
-        mov	QWORD PTR [rcx], r9
-        adc	r10, QWORD PTR [r8+8]  ; words 1..15: adc folds in the carry of the previous word
-        mov	r9, QWORD PTR [rdx+16]
-        mov	QWORD PTR [rcx+8], r10
-        adc	r9, QWORD PTR [r8+16]
-        mov	r10, QWORD PTR [rdx+24]
-        mov	QWORD PTR [rcx+16], r9
-        adc	r10, QWORD PTR [r8+24]
-        mov	r9, QWORD PTR [rdx+32]
-        mov	QWORD PTR [rcx+24], r10
-        adc	r9, QWORD PTR [r8+32]
-        mov	r10, QWORD PTR [rdx+40]
-        mov	QWORD PTR [rcx+32], r9
-        adc	r10, QWORD PTR [r8+40]
-        mov	r9, QWORD PTR [rdx+48]
-        mov	QWORD PTR [rcx+40], r10
-        adc	r9, QWORD PTR [r8+48]
-        mov	r10, QWORD PTR [rdx+56]
-        mov	QWORD PTR [rcx+48], r9
-        adc	r10, QWORD PTR [r8+56]
-        mov	r9, QWORD PTR [rdx+64]
-        mov	QWORD PTR [rcx+56], r10
-        adc	r9, QWORD PTR [r8+64]
-        mov	r10, QWORD PTR [rdx+72]
-        mov	QWORD PTR [rcx+64], r9
-        adc	r10, QWORD PTR [r8+72]
-        mov	r9, QWORD PTR [rdx+80]
-        mov	QWORD PTR [rcx+72], r10
-        adc	r9, QWORD PTR [r8+80]
-        mov	r10, QWORD PTR [rdx+88]
-        mov	QWORD PTR [rcx+80], r9
-        adc	r10, QWORD PTR [r8+88]
-        mov	r9, QWORD PTR [rdx+96]
-        mov	QWORD PTR [rcx+88], r10
-        adc	r9, QWORD PTR [r8+96]
-        mov	r10, QWORD PTR [rdx+104]
-        mov	QWORD PTR [rcx+96], r9
-        adc	r10, QWORD PTR [r8+104]
-        mov	r9, QWORD PTR [rdx+112]
-        mov	QWORD PTR [rcx+104], r10
-        adc	r9, QWORD PTR [r8+112]
-        mov	r10, QWORD PTR [rdx+120]
-        mov	QWORD PTR [rcx+112], r9
-        adc	r10, QWORD PTR [r8+120]
-        mov	QWORD PTR [rcx+120], r10
-        adc	rax, 0  ; rax = carry out of the top word (0 or 1)
-        ret
-sp_1024_add_16 ENDP
-_text ENDS
-; /* Sub b from a into a (a -= b) over 16 64-bit words; rax returns 0, or -1 on borrow out.
-;  *
-;  * a  Number to subtract from and destination of the result (rcx).
-;  * b  Number to subtract, read through rdx.
-;  */
-_text SEGMENT READONLY PARA
-sp_1024_sub_in_place_16 PROC
-        mov	r8, QWORD PTR [rcx]
-        sub	r8, QWORD PTR [rdx]  ; word 0: plain sub starts the borrow chain
-        mov	r9, QWORD PTR [rcx+8]  ; mov does not touch flags, so CF survives the loads/stores
-        mov	QWORD PTR [rcx], r8
-        sbb	r9, QWORD PTR [rdx+8]  ; words 1..15: sbb folds in the borrow of the previous word
-        mov	r8, QWORD PTR [rcx+16]
-        mov	QWORD PTR [rcx+8], r9
-        sbb	r8, QWORD PTR [rdx+16]
-        mov	r9, QWORD PTR [rcx+24]
-        mov	QWORD PTR [rcx+16], r8
-        sbb	r9, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [rcx+32]
-        mov	QWORD PTR [rcx+24], r9
-        sbb	r8, QWORD PTR [rdx+32]
-        mov	r9, QWORD PTR [rcx+40]
-        mov	QWORD PTR [rcx+32], r8
-        sbb	r9, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [rcx+48]
-        mov	QWORD PTR [rcx+40], r9
-        sbb	r8, QWORD PTR [rdx+48]
-        mov	r9, QWORD PTR [rcx+56]
-        mov	QWORD PTR [rcx+48], r8
-        sbb	r9, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [rcx+64]
-        mov	QWORD PTR [rcx+56], r9
-        sbb	r8, QWORD PTR [rdx+64]
-        mov	r9, QWORD PTR [rcx+72]
-        mov	QWORD PTR [rcx+64], r8
-        sbb	r9, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [rcx+80]
-        mov	QWORD PTR [rcx+72], r9
-        sbb	r8, QWORD PTR [rdx+80]
-        mov	r9, QWORD PTR [rcx+88]
-        mov	QWORD PTR [rcx+80], r8
-        sbb	r9, QWORD PTR [rdx+88]
-        mov	r8, QWORD PTR [rcx+96]
-        mov	QWORD PTR [rcx+88], r9
-        sbb	r8, QWORD PTR [rdx+96]
-        mov	r9, QWORD PTR [rcx+104]
-        mov	QWORD PTR [rcx+96], r8
-        sbb	r9, QWORD PTR [rdx+104]
-        mov	r8, QWORD PTR [rcx+112]
-        mov	QWORD PTR [rcx+104], r9
-        sbb	r8, QWORD PTR [rdx+112]
-        mov	r9, QWORD PTR [rcx+120]
-        mov	QWORD PTR [rcx+112], r8
-        sbb	r9, QWORD PTR [rdx+120]
-        mov	QWORD PTR [rcx+120], r9
-        sbb	rax, rax  ; rax = rax - rax - CF: 0 when no borrow, -1 on borrow out
-        ret
-sp_1024_sub_in_place_16 ENDP
-_text ENDS
-; /* Conditionally subtract b from a using the mask m: r = a - (b & m), 16 64-bit words.
-;  * m is -1 to subtract and 0 to copy a into r unchanged; rax returns 0, or -1 on borrow out.
-;  *
-;  * r  Result of the conditional subtract, written through rcx.
-;  * a  Number to subtract from, read through rdx.
-;  * b  Number to subtract, read through r8.
-;  * m  Mask value ANDed with every word of b (r9).
-;  */
-_text SEGMENT READONLY PARA
-sp_1024_cond_sub_16 PROC
-        sub	rsp, 128  ; 128-byte stack buffer that receives the 16 masked words of b
-        mov	r10, QWORD PTR [r8]  ; phase 1: build (b & m) on the stack, two words per step
-        mov	r11, QWORD PTR [r8+8]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp], r10
-        mov	QWORD PTR [rsp+8], r11
-        mov	r10, QWORD PTR [r8+16]
-        mov	r11, QWORD PTR [r8+24]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+16], r10
-        mov	QWORD PTR [rsp+24], r11
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+32], r10
-        mov	QWORD PTR [rsp+40], r11
-        mov	r10, QWORD PTR [r8+48]
-        mov	r11, QWORD PTR [r8+56]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+48], r10
-        mov	QWORD PTR [rsp+56], r11
-        mov	r10, QWORD PTR [r8+64]
-        mov	r11, QWORD PTR [r8+72]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+64], r10
-        mov	QWORD PTR [rsp+72], r11
-        mov	r10, QWORD PTR [r8+80]
-        mov	r11, QWORD PTR [r8+88]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+80], r10
-        mov	QWORD PTR [rsp+88], r11
-        mov	r10, QWORD PTR [r8+96]
-        mov	r11, QWORD PTR [r8+104]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+96], r10
-        mov	QWORD PTR [rsp+104], r11
-        mov	r10, QWORD PTR [r8+112]
-        mov	r11, QWORD PTR [r8+120]
-        and	r10, r9
-        and	r11, r9
-        mov	QWORD PTR [rsp+112], r10
-        mov	QWORD PTR [rsp+120], r11
-        mov	r10, QWORD PTR [rdx]  ; phase 2: r = a - masked b; only mov sits between sub/sbb so CF is kept
-        mov	r8, QWORD PTR [rsp]  ; r8 (b pointer) is no longer needed and becomes scratch
-        sub	r10, r8  ; word 0: plain sub starts the borrow chain
-        mov	r11, QWORD PTR [rdx+8]
-        mov	r8, QWORD PTR [rsp+8]
-        sbb	r11, r8  ; words 1..15: sbb folds in the previous borrow
-        mov	QWORD PTR [rcx], r10
-        mov	r10, QWORD PTR [rdx+16]
-        mov	r8, QWORD PTR [rsp+16]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+8], r11
-        mov	r11, QWORD PTR [rdx+24]
-        mov	r8, QWORD PTR [rsp+24]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+16], r10
-        mov	r10, QWORD PTR [rdx+32]
-        mov	r8, QWORD PTR [rsp+32]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+24], r11
-        mov	r11, QWORD PTR [rdx+40]
-        mov	r8, QWORD PTR [rsp+40]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+32], r10
-        mov	r10, QWORD PTR [rdx+48]
-        mov	r8, QWORD PTR [rsp+48]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+40], r11
-        mov	r11, QWORD PTR [rdx+56]
-        mov	r8, QWORD PTR [rsp+56]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+48], r10
-        mov	r10, QWORD PTR [rdx+64]
-        mov	r8, QWORD PTR [rsp+64]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+56], r11
-        mov	r11, QWORD PTR [rdx+72]
-        mov	r8, QWORD PTR [rsp+72]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+64], r10
-        mov	r10, QWORD PTR [rdx+80]
-        mov	r8, QWORD PTR [rsp+80]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+72], r11
-        mov	r11, QWORD PTR [rdx+88]
-        mov	r8, QWORD PTR [rsp+88]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+80], r10
-        mov	r10, QWORD PTR [rdx+96]
-        mov	r8, QWORD PTR [rsp+96]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+88], r11
-        mov	r11, QWORD PTR [rdx+104]
-        mov	r8, QWORD PTR [rsp+104]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+96], r10
-        mov	r10, QWORD PTR [rdx+112]
-        mov	r8, QWORD PTR [rsp+112]
-        sbb	r10, r8
-        mov	QWORD PTR [rcx+104], r11
-        mov	r11, QWORD PTR [rdx+120]
-        mov	r8, QWORD PTR [rsp+120]
-        sbb	r11, r8
-        mov	QWORD PTR [rcx+112], r10
-        mov	QWORD PTR [rcx+120], r11
-        sbb	rax, rax  ; rax = 0 when no borrow, -1 on borrow out; must precede add rsp, which rewrites flags
-        add	rsp, 128  ; release the stack buffer
-        ret
-sp_1024_cond_sub_16 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Conditionally subtract b from a using the mask m (variant using pext, no stack buffer).
-;  * m is -1 to subtract and 0 to copy a into r unchanged; rax returns 0, or -1 on borrow out.
-;  *
-;  * r  Result of the conditional subtract, written through rcx (16 64-bit words).
-;  * a  Number to subtract from, read through rdx.
-;  * b  Number to subtract, read through r8.
-;  * m  Mask value applied to every word of b with pext (r9); see the note on the first pext.
-;  */
-_text SEGMENT READONLY PARA
-sp_1024_cond_sub_avx2_16 PROC
-        push	r12  ; r12 is callee-saved in the Microsoft x64 convention
-        mov	r12, QWORD PTR [r8]
-        mov	r10, QWORD PTR [rdx]
-        pext	r12, r12, r9  ; word unchanged for m = -1, zero for m = 0; any other mask would compact bits (not an AND)
-        sub	r10, r12  ; word 0: plain sub starts the borrow chain
-        mov	r12, QWORD PTR [r8+8]
-        mov	r11, QWORD PTR [rdx+8]
-        pext	r12, r12, r9  ; pext and mov leave the flags alone, so the borrow in CF is preserved
-        mov	QWORD PTR [rcx], r10
-        sbb	r11, r12  ; words 1..15: sbb folds in the previous borrow; r10/r11/r12 rotate roles
-        mov	r10, QWORD PTR [r8+16]
-        mov	r12, QWORD PTR [rdx+16]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+8], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [rdx+24]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+16], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [rdx+32]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+24], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+40]
-        mov	r12, QWORD PTR [rdx+40]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+32], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+48]
-        mov	r10, QWORD PTR [rdx+48]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+40], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+56]
-        mov	r11, QWORD PTR [rdx+56]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+48], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+64]
-        mov	r12, QWORD PTR [rdx+64]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+56], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+72]
-        mov	r10, QWORD PTR [rdx+72]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+64], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+80]
-        mov	r11, QWORD PTR [rdx+80]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+72], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+88]
-        mov	r12, QWORD PTR [rdx+88]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+80], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+96]
-        mov	r10, QWORD PTR [rdx+96]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+88], r12
-        sbb	r10, r11
-        mov	r12, QWORD PTR [r8+104]
-        mov	r11, QWORD PTR [rdx+104]
-        pext	r12, r12, r9
-        mov	QWORD PTR [rcx+96], r10
-        sbb	r11, r12
-        mov	r10, QWORD PTR [r8+112]
-        mov	r12, QWORD PTR [rdx+112]
-        pext	r10, r10, r9
-        mov	QWORD PTR [rcx+104], r11
-        sbb	r12, r10
-        mov	r11, QWORD PTR [r8+120]
-        mov	r10, QWORD PTR [rdx+120]
-        pext	r11, r11, r9
-        mov	QWORD PTR [rcx+112], r12
-        sbb	r10, r11
-        mov	QWORD PTR [rcx+120], r10
-        sbb	rax, rax  ; rax = 0 when no borrow, -1 on borrow out
-        pop	r12
-        ret
-sp_1024_cond_sub_avx2_16 ENDP
-_text ENDS
-ENDIF
-; /* Multiply the 16-word number a by the single 64-bit digit b into r. (r = a * b)
-;  *
-;  * r  Result, written through rcx; 17 words are stored (offsets 0..128).
-;  * a  Multiplicand, passed in rdx (16 64-bit words).
-;  * b  Multiplier digit, passed in r8.
-;  */
-_text SEGMENT READONLY PARA
-sp_1024_mul_d_16 PROC
-        push	r12  ; r12 is callee-saved in the Microsoft x64 convention
-        mov	r9, rdx  ; keep the a pointer in r9: mul overwrites rdx with the high half
-        ; A[0] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9]  ; rdx:rax = a[0] * b
-        mov	r10, rax
-        mov	r11, rdx
-        mov	QWORD PTR [rcx], r10
-        ; A[1] * B  (r10/r11/r12 rotate as: current word, next word, carry into the word after)
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+8]
-        add	r11, rax  ; low half completes the current result word
-        mov	QWORD PTR [rcx+8], r11  ; mov keeps CF intact for the adc pair below
-        adc	r12, rdx  ; high half plus carry starts the next result word
-        adc	r10, 0
-        ; A[2] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+16]
-        add	r12, rax
-        mov	QWORD PTR [rcx+16], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[3] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+24]
-        add	r10, rax
-        mov	QWORD PTR [rcx+24], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[4] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+32]
-        add	r11, rax
-        mov	QWORD PTR [rcx+32], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[5] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+40]
-        add	r12, rax
-        mov	QWORD PTR [rcx+40], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[6] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+48]
-        add	r10, rax
-        mov	QWORD PTR [rcx+48], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[7] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+56]
-        add	r11, rax
-        mov	QWORD PTR [rcx+56], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[8] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+64]
-        add	r12, rax
-        mov	QWORD PTR [rcx+64], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[9] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+72]
-        add	r10, rax
-        mov	QWORD PTR [rcx+72], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[10] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+80]
-        add	r11, rax
-        mov	QWORD PTR [rcx+80], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[11] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+88]
-        add	r12, rax
-        mov	QWORD PTR [rcx+88], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[12] * B
-        mov	rax, r8
-        xor	r12, r12
-        mul	QWORD PTR [r9+96]
-        add	r10, rax
-        mov	QWORD PTR [rcx+96], r10
-        adc	r11, rdx
-        adc	r12, 0
-        ; A[13] * B
-        mov	rax, r8
-        xor	r10, r10
-        mul	QWORD PTR [r9+104]
-        add	r11, rax
-        mov	QWORD PTR [rcx+104], r11
-        adc	r12, rdx
-        adc	r10, 0
-        ; A[14] * B
-        mov	rax, r8
-        xor	r11, r11
-        mul	QWORD PTR [r9+112]
-        add	r12, rax
-        mov	QWORD PTR [rcx+112], r12
-        adc	r10, rdx
-        adc	r11, 0
-        ; A[15] * B  (last product: both remaining words are stored)
-        mov	rax, r8
-        mul	QWORD PTR [r9+120]
-        add	r10, rax
-        adc	r11, rdx
-        mov	QWORD PTR [rcx+120], r10
-        mov	QWORD PTR [rcx+128], r11  ; 17th result word: r must have room for it
-        pop	r12
-        ret
-sp_1024_mul_d_16 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Multiply the 16-word number a by the single 64-bit digit b into r using mulx/adcx/adox.
-;  *
-;  * r  Result, written through rcx; 17 words are stored (offsets 0..128).
-;  * a  Multiplicand, passed in rdx (16 64-bit words).
-;  * b  Multiplier digit, passed in r8.
-;  */
-_text SEGMENT READONLY PARA
-sp_1024_mul_d_avx2_16 PROC
-        push	r12  ; r12 and r13 are callee-saved in the Microsoft x64 convention
-        push	r13
-        mov	rax, rdx  ; a pointer moves to rax because mulx takes its implicit operand from rdx
-        ; A[0] * B
-        mov	rdx, r8  ; rdx = b for every mulx below
-        xor	r13, r13  ; r13 = 0 constant; xor also clears CF and OF before the adcx/adox chains
-        mulx	r12, r11, QWORD PTR [rax]  ; r12 = high half, r11 = low half; mulx leaves flags untouched
-        mov	QWORD PTR [rcx], r11
-        ; A[1] * B  (r11/r12 alternate as current result word and next result word)
-        mulx	r10, r9, QWORD PTR [rax+8]
-        mov	r11, r13  ; zero the register that will hold the next word
-        adcx	r12, r9  ; low half completes the current word; carry travels in CF only
-        adox	r11, r10  ; high half seeds the next word; this addition uses OF only
-        mov	QWORD PTR [rcx+8], r12
-        ; A[2] * B
-        mulx	r10, r9, QWORD PTR [rax+16]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+16], r11
-        ; A[3] * B
-        mulx	r10, r9, QWORD PTR [rax+24]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+24], r12
-        ; A[4] * B
-        mulx	r10, r9, QWORD PTR [rax+32]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+32], r11
-        ; A[5] * B
-        mulx	r10, r9, QWORD PTR [rax+40]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+40], r12
-        ; A[6] * B
-        mulx	r10, r9, QWORD PTR [rax+48]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+48], r11
-        ; A[7] * B
-        mulx	r10, r9, QWORD PTR [rax+56]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+56], r12
-        ; A[8] * B
-        mulx	r10, r9, QWORD PTR [rax+64]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+64], r11
-        ; A[9] * B
-        mulx	r10, r9, QWORD PTR [rax+72]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+72], r12
-        ; A[10] * B
-        mulx	r10, r9, QWORD PTR [rax+80]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+80], r11
-        ; A[11] * B
-        mulx	r10, r9, QWORD PTR [rax+88]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+88], r12
-        ; A[12] * B
-        mulx	r10, r9, QWORD PTR [rax+96]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+96], r11
-        ; A[13] * B
-        mulx	r10, r9, QWORD PTR [rax+104]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        mov	QWORD PTR [rcx+104], r12
-        ; A[14] * B
-        mulx	r10, r9, QWORD PTR [rax+112]
-        mov	r12, r13
-        adcx	r11, r9
-        adox	r12, r10
-        mov	QWORD PTR [rcx+112], r11
-        ; A[15] * B  (last product: both remaining words are stored)
-        mulx	r10, r9, QWORD PTR [rax+120]
-        mov	r11, r13
-        adcx	r12, r9
-        adox	r11, r10
-        adcx	r11, r13  ; add zero to fold the last CF carry into the top word
-        mov	QWORD PTR [rcx+120], r12
-        mov	QWORD PTR [rcx+128], r11  ; 17th result word: r must have room for it
-        pop	r13
-        pop	r12
-        ret
-sp_1024_mul_d_avx2_16 ENDP
-_text ENDS
-ENDIF
-IFDEF _WIN64
-; /* Divide the double width number (d1|d0) by the divisor div. (d1|d0 / div)
-;  *
-;  * d1   The high order half of the number to divide (rcx).
-;  * d0   The low order half of the number to divide (rdx).
-;  * div  The divisor (r8).
-;  * returns the quotient in rax.
-;  */
-_text SEGMENT READONLY PARA
-div_1024_word_asm_16 PROC
-        mov	r9, rdx  ; stash d0: rdx is about to be loaded with the high half
-        mov	rax, r9  ; rax = d0 (low half of the dividend)
-        mov	rdx, rcx  ; rdx = d1 (high half of the dividend)
-        div	r8  ; rax = quotient, rdx = remainder; NOTE(review): div faults if the quotient needs more than 64 bits - presumably callers keep d1 < div, confirm
-        ret
-div_1024_word_asm_16 ENDP
-_text ENDS
-ENDIF
-; /* Compare a with b, most significant word first, using cmov instead of branches.
-;  *
-;  * a  First operand, read through rcx (16 64-bit words).
-;  * b  Second operand, read through rdx (16 64-bit words).
-;  * return (rax) -1, 0 or 1 if a is less than, equal to or greater than b
-;  * respectively.
-;  */
-_text SEGMENT READONLY PARA
-sp_1024_cmp_16 PROC
-        push	r12  ; r12 is callee-saved in the Microsoft x64 convention
-        xor	r9, r9  ; r9 = 0, the value the mask collapses to
-        mov	r8, -1  ; r8 = mask: all ones while every word seen so far was equal
-        mov	rax, -1  ; rax = provisional result; the final xor with r8 turns "all equal" into 0
-        mov	r10, 1  ; r10 = constant 1, the "a greater" result
-        mov	r11, QWORD PTR [rcx+120]
-        mov	r12, QWORD PTR [rdx+120]
-        and	r11, r8  ; once the mask is 0 both words become 0 and compare equal
-        and	r12, r8
-        sub	r11, r12  ; sets the flags for the unsigned comparison a[i] vs b[i]
-        cmova	rax, r10  ; a[i] > b[i]: result becomes 1
-        cmovc	rax, r8  ; a[i] < b[i]: result becomes the mask, still -1 at the first difference
-        cmovnz	r8, r9  ; words differ: clear the mask so lower words cannot change the result
-        mov	r11, QWORD PTR [rcx+112]  ; the same 8-instruction step repeats for words 14 down to 0
-        mov	r12, QWORD PTR [rdx+112]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+104]
-        mov	r12, QWORD PTR [rdx+104]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+96]
-        mov	r12, QWORD PTR [rdx+96]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+88]
-        mov	r12, QWORD PTR [rdx+88]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+80]
-        mov	r12, QWORD PTR [rdx+80]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+72]
-        mov	r12, QWORD PTR [rdx+72]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+64]
-        mov	r12, QWORD PTR [rdx+64]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+56]
-        mov	r12, QWORD PTR [rdx+56]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+48]
-        mov	r12, QWORD PTR [rdx+48]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+40]
-        mov	r12, QWORD PTR [rdx+40]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+32]
-        mov	r12, QWORD PTR [rdx+32]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+24]
-        mov	r12, QWORD PTR [rdx+24]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+16]
-        mov	r12, QWORD PTR [rdx+16]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx+8]
-        mov	r12, QWORD PTR [rdx+8]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        mov	r11, QWORD PTR [rcx]
-        mov	r12, QWORD PTR [rdx]
-        and	r11, r8
-        and	r12, r8
-        sub	r11, r12
-        cmova	rax, r10
-        cmovc	rax, r8
-        cmovnz	r8, r9
-        xor	rax, r8  ; mask still -1 (all equal): -1 xor -1 = 0; mask 0: rax keeps -1 or 1
-        pop	r12
-        ret
-sp_1024_cmp_16 ENDP
-_text ENDS
-; /* Conditionally copy a into r using the mask m: r ^= (r ^ a) & m, 16 64-bit words.
-;  * m is -1 to copy and 0 to leave r unchanged.
-;  *
-;  * r  Destination, read and updated in place through rcx.
-;  * a  Number to copy, read through rdx.
-;  * m  Mask value ANDed with every word difference (r8).
-;  */
-_text SEGMENT READONLY PARA
-sp_1024_cond_copy_16 PROC
-        mov	rax, QWORD PTR [rcx]  ; words 0..3: load r
-        mov	r9, QWORD PTR [rcx+8]
-        mov	r10, QWORD PTR [rcx+16]
-        mov	r11, QWORD PTR [rcx+24]
-        xor	rax, QWORD PTR [rdx]  ; difference r ^ a
-        xor	r9, QWORD PTR [rdx+8]
-        xor	r10, QWORD PTR [rdx+16]
-        xor	r11, QWORD PTR [rdx+24]
-        and	rax, r8  ; keep the difference only where the mask has bits set
-        and	r9, r8
-        and	r10, r8
-        and	r11, r8
-        xor	QWORD PTR [rcx], rax  ; r ^ (r ^ a) = a under the mask, r elsewhere
-        xor	QWORD PTR [rcx+8], r9
-        xor	QWORD PTR [rcx+16], r10
-        xor	QWORD PTR [rcx+24], r11
-        mov	rax, QWORD PTR [rcx+32]  ; words 4..7: same load / xor / and / xor sequence
-        mov	r9, QWORD PTR [rcx+40]
-        mov	r10, QWORD PTR [rcx+48]
-        mov	r11, QWORD PTR [rcx+56]
-        xor	rax, QWORD PTR [rdx+32]
-        xor	r9, QWORD PTR [rdx+40]
-        xor	r10, QWORD PTR [rdx+48]
-        xor	r11, QWORD PTR [rdx+56]
-        and	rax, r8
-        and	r9, r8
-        and	r10, r8
-        and	r11, r8
-        xor	QWORD PTR [rcx+32], rax
-        xor	QWORD PTR [rcx+40], r9
-        xor	QWORD PTR [rcx+48], r10
-        xor	QWORD PTR [rcx+56], r11
-        mov	rax, QWORD PTR [rcx+64]  ; words 8..11
-        mov	r9, QWORD PTR [rcx+72]
-        mov	r10, QWORD PTR [rcx+80]
-        mov	r11, QWORD PTR [rcx+88]
-        xor	rax, QWORD PTR [rdx+64]
-        xor	r9, QWORD PTR [rdx+72]
-        xor	r10, QWORD PTR [rdx+80]
-        xor	r11, QWORD PTR [rdx+88]
-        and	rax, r8
-        and	r9, r8
-        and	r10, r8
-        and	r11, r8
-        xor	QWORD PTR [rcx+64], rax
-        xor	QWORD PTR [rcx+72], r9
-        xor	QWORD PTR [rcx+80], r10
-        xor	QWORD PTR [rcx+88], r11
-        mov	rax, QWORD PTR [rcx+96]  ; words 12..15
-        mov	r9, QWORD PTR [rcx+104]
-        mov	r10, QWORD PTR [rcx+112]
-        mov	r11, QWORD PTR [rcx+120]
-        xor	rax, QWORD PTR [rdx+96]
-        xor	r9, QWORD PTR [rdx+104]
-        xor	r10, QWORD PTR [rdx+112]
-        xor	r11, QWORD PTR [rdx+120]
-        and	rax, r8
-        and	r9, r8
-        and	r10, r8
-        and	r11, r8
-        xor	QWORD PTR [rcx+96], rax
-        xor	QWORD PTR [rcx+104], r9
-        xor	QWORD PTR [rcx+112], r10
-        xor	QWORD PTR [rcx+120], r11
-        ret
-sp_1024_cond_copy_16 ENDP
-_text ENDS
-; /* Reduce the number back to 1024 bits using Montgomery reduction.
-;  *
-;  * a   A single precision number to reduce in place (pointer in rcx).
-;  * m   The single precision number representing the modulus (pointer in rdx).
-;  * mp  The digit representing the negative inverse of m mod 2^n (value in r8).
-;  *
-;  * Register use:
-;  *   r9        m (moved out of rdx, which every mul overwrites)
-;  *   r10       remaining loop iterations (16)
-;  *   r13       mu for the current iteration
-;  *   r15, rdi  a[i] and a[i+1], kept in registers between iterations
-;  *   r11, r12  alternating carry limbs
-;  *   rsi       carry out of the top limb, fed into the next iteration
-;  *
-;  * Finishes by calling sp_1024_cond_sub_16 with rcx = a, rdx = a + 128 bytes,
-;  * r8 = m and r9 = mask.
-;  */
-_text SEGMENT READONLY PARA
-sp_1024_mont_reduce_16 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        ; mul writes rdx, so keep the modulus pointer in r9
-        mov	r9, rdx
-        ; rsi = running top-limb carry, starts at 0
-        xor	rsi, rsi
-        ; i = 16
-        mov	r10, 16
-        mov	r15, QWORD PTR [rcx]
-        mov	rdi, QWORD PTR [rcx+8]
-L_1024_mont_reduce_16_loop:
-        ; mu = a[i] * mp
-        mov	r13, r15
-        imul	r13, r8
-        ; a[i+0] += m[0] * mu
-        ; (the sum itself is not stored; r15 is reloaded from rdi below,
-        ;  only the carry in r12 is used)
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9]
-        add	r15, rax
-        adc	r12, rdx
-        ; a[i+1] += m[1] * mu
-        ; (result stays in r15 and becomes a[i] of the next iteration)
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+8]
-        mov	r15, rdi
-        add	r15, rax
-        adc	r11, rdx
-        add	r15, r12
-        adc	r11, 0
-        ; a[i+2] += m[2] * mu
-        ; (result stays in rdi and becomes a[i+1] of the next iteration)
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+16]
-        mov	rdi, QWORD PTR [rcx+16]
-        add	rdi, rax
-        adc	r12, rdx
-        add	rdi, r11
-        adc	r12, 0
-        ; a[i+3] += m[3] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+24]
-        mov	r14, QWORD PTR [rcx+24]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+24], r14
-        adc	r11, 0
-        ; a[i+4] += m[4] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+32]
-        mov	r14, QWORD PTR [rcx+32]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+32], r14
-        adc	r12, 0
-        ; a[i+5] += m[5] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+40]
-        mov	r14, QWORD PTR [rcx+40]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+40], r14
-        adc	r11, 0
-        ; a[i+6] += m[6] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+48]
-        mov	r14, QWORD PTR [rcx+48]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+48], r14
-        adc	r12, 0
-        ; a[i+7] += m[7] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+56]
-        mov	r14, QWORD PTR [rcx+56]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+56], r14
-        adc	r11, 0
-        ; a[i+8] += m[8] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+64]
-        mov	r14, QWORD PTR [rcx+64]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+64], r14
-        adc	r12, 0
-        ; a[i+9] += m[9] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+72]
-        mov	r14, QWORD PTR [rcx+72]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+72], r14
-        adc	r11, 0
-        ; a[i+10] += m[10] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+80]
-        mov	r14, QWORD PTR [rcx+80]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+80], r14
-        adc	r12, 0
-        ; a[i+11] += m[11] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+88]
-        mov	r14, QWORD PTR [rcx+88]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+88], r14
-        adc	r11, 0
-        ; a[i+12] += m[12] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+96]
-        mov	r14, QWORD PTR [rcx+96]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+96], r14
-        adc	r12, 0
-        ; a[i+13] += m[13] * mu
-        mov	rax, r13
-        xor	r11, r11
-        mul	QWORD PTR [r9+104]
-        mov	r14, QWORD PTR [rcx+104]
-        add	r14, rax
-        adc	r11, rdx
-        add	r14, r12
-        mov	QWORD PTR [rcx+104], r14
-        adc	r11, 0
-        ; a[i+14] += m[14] * mu
-        mov	rax, r13
-        xor	r12, r12
-        mul	QWORD PTR [r9+112]
-        mov	r14, QWORD PTR [rcx+112]
-        add	r14, rax
-        adc	r12, rdx
-        add	r14, r11
-        mov	QWORD PTR [rcx+112], r14
-        adc	r12, 0
-        ; a[i+15] += m[15] * mu
-        ; The high product limb plus the carry saved in rsi is added into
-        ; a[i+16]; the carry out of that addition is collected in rsi.
-        mov	rax, r13
-        mul	QWORD PTR [r9+120]
-        mov	r14, QWORD PTR [rcx+120]
-        add	r12, rax
-        adc	rdx, rsi
-        ; mov (not xor) so that CF from the adc above survives
-        mov	rsi, 0
-        adc	rsi, 0
-        add	r14, r12
-        mov	QWORD PTR [rcx+120], r14
-        adc	QWORD PTR [rcx+128], rdx
-        adc	rsi, 0
-        ; i -= 1
-        add	rcx, 8
-        dec	r10
-        jnz	L_1024_mont_reduce_16_loop
-        ; rcx now points 128 bytes past the original a.
-        ; Write back the two limbs that were kept in registers and build the
-        ; subtraction mask; the interleaved mov does not disturb the flags.
-        mov	r14, QWORD PTR [rcx+120]
-        mov	QWORD PTR [rcx], r15
-        sub	r14, QWORD PTR [r9+120]
-        mov	QWORD PTR [rcx+8], rdi
-        ; r14 = all ones if top limb of the result < top limb of m
-        sbb	r14, r14
-        ; rsi = 0 - (final carry)
-        neg	rsi
-        ; r14 = all ones if top limb of the result >= top limb of m
-        not	r14
-        or	rsi, r14
-        ; Set up the call: r8 = m, r9 = mask.  m has to leave r9 BEFORE r9 is
-        ; overwritten with the mask.  This routine always takes its own
-        ; arguments in rcx/rdx/r8, so this order is the only valid one; the
-        ; previous IFDEF _WIN64 / ELSE split did the moves in the opposite
-        ; order in the ELSE branch, which lost m and passed the mask in r8.
-        mov	r8, r9
-        mov	r9, rsi
-        ; rdx = upper half of the buffer, rcx = start of a
-        mov	rdx, rcx
-        sub	rcx, 128
-        ; NOTE(review): no shadow space is reserved and rsp is not re-aligned
-        ; before this call - fine only if the callee is a local routine that
-        ; does not rely on either; confirm against sp_1024_cond_sub_16.
-        call	sp_1024_cond_sub_16
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_1024_mont_reduce_16 ENDP
-_text ENDS
-; /* Add two Montgomery form numbers (r = a + b % m), 16 limbs of 64 bits each.
-;  * Adds limb by limb into r, then subtracts (m AND mask) from r; no branches.
-;  * r   Result of addition (pointer in rcx).
-;  * a   First number to add in Montgomery form (pointer in rdx).
-;  * b   Second number to add in Montgomery form (pointer in r8).
-;  * m   Modulus (prime) (pointer in r9).
-;  */
-_text SEGMENT READONLY PARA
-sp_1024_mont_add_16 PROC
-        push	r12
-        push	r13
-        sub	rsp, 128  ; 128-byte scratch area on the stack for the masked copy of m
-        mov	rax, QWORD PTR [rdx]
-        mov	r10, QWORD PTR [rdx+8]
-        mov	r11, QWORD PTR [rdx+16]
-        mov	r12, QWORD PTR [rdx+24]
-        add	rax, QWORD PTR [r8]
-        mov	r13, 0  ; zero the mask register with mov so CF from the add is kept
-        adc	r10, QWORD PTR [r8+8]
-        adc	r11, QWORD PTR [r8+16]
-        adc	r12, QWORD PTR [r8+24]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r10
-        mov	QWORD PTR [rcx+16], r11
-        mov	QWORD PTR [rcx+24], r12
-        mov	rax, QWORD PTR [rdx+32]
-        mov	r10, QWORD PTR [rdx+40]
-        mov	r11, QWORD PTR [rdx+48]
-        mov	r12, QWORD PTR [rdx+56]
-        adc	rax, QWORD PTR [r8+32]
-        adc	r10, QWORD PTR [r8+40]
-        adc	r11, QWORD PTR [r8+48]
-        adc	r12, QWORD PTR [r8+56]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r10
-        mov	QWORD PTR [rcx+48], r11
-        mov	QWORD PTR [rcx+56], r12
-        mov	rax, QWORD PTR [rdx+64]
-        mov	r10, QWORD PTR [rdx+72]
-        mov	r11, QWORD PTR [rdx+80]
-        mov	r12, QWORD PTR [rdx+88]
-        adc	rax, QWORD PTR [r8+64]
-        adc	r10, QWORD PTR [r8+72]
-        adc	r11, QWORD PTR [r8+80]
-        adc	r12, QWORD PTR [r8+88]
-        mov	QWORD PTR [rcx+64], rax
-        mov	QWORD PTR [rcx+72], r10
-        mov	QWORD PTR [rcx+80], r11
-        mov	QWORD PTR [rcx+88], r12
-        mov	rax, QWORD PTR [rdx+96]
-        mov	r10, QWORD PTR [rdx+104]
-        mov	r11, QWORD PTR [rdx+112]
-        mov	r12, QWORD PTR [rdx+120]
-        adc	rax, QWORD PTR [r8+96]
-        adc	r10, QWORD PTR [r8+104]
-        adc	r11, QWORD PTR [r8+112]
-        adc	r12, QWORD PTR [r8+120]
-        mov	QWORD PTR [rcx+96], rax
-        mov	QWORD PTR [rcx+104], r10
-        mov	QWORD PTR [rcx+112], r11
-        mov	QWORD PTR [rcx+120], r12
-        sbb	r13, 0  ; r13 = 0 - CF: all ones if the 16-limb addition carried out
-        sub	r12, QWORD PTR [r9+120]  ; compare top limb of the sum with top limb of m
-        sbb	r12, r12  ; all ones if sum top limb < m top limb
-        not	r12  ; all ones if sum top limb >= m top limb
-        or	r13, r12  ; mask = carry-out OR top-limb >= ; NOTE(review): only the top limb is compared - confirm that is sufficient for the modulus in use
-        mov	r11, QWORD PTR [r9]  ; from here: scratch[i] = m[i] AND mask, two limbs at a time
-        mov	r12, QWORD PTR [r9+8]
-        and	r11, r13
-        and	r12, r13
-        mov	QWORD PTR [rsp], r11
-        mov	QWORD PTR [rsp+8], r12
-        mov	r11, QWORD PTR [r9+16]
-        mov	r12, QWORD PTR [r9+24]
-        and	r11, r13
-        and	r12, r13
-        mov	QWORD PTR [rsp+16], r11
-        mov	QWORD PTR [rsp+24], r12
-        mov	r11, QWORD PTR [r9+32]
-        mov	r12, QWORD PTR [r9+40]
-        and	r11, r13
-        and	r12, r13
-        mov	QWORD PTR [rsp+32], r11
-        mov	QWORD PTR [rsp+40], r12
-        mov	r11, QWORD PTR [r9+48]
-        mov	r12, QWORD PTR [r9+56]
-        and	r11, r13
-        and	r12, r13
-        mov	QWORD PTR [rsp+48], r11
-        mov	QWORD PTR [rsp+56], r12
-        mov	r11, QWORD PTR [r9+64]
-        mov	r12, QWORD PTR [r9+72]
-        and	r11, r13
-        and	r12, r13
-        mov	QWORD PTR [rsp+64], r11
-        mov	QWORD PTR [rsp+72], r12
-        mov	r11, QWORD PTR [r9+80]
-        mov	r12, QWORD PTR [r9+88]
-        and	r11, r13
-        and	r12, r13
-        mov	QWORD PTR [rsp+80], r11
-        mov	QWORD PTR [rsp+88], r12
-        mov	r11, QWORD PTR [r9+96]
-        mov	r12, QWORD PTR [r9+104]
-        and	r11, r13
-        and	r12, r13
-        mov	QWORD PTR [rsp+96], r11
-        mov	QWORD PTR [rsp+104], r12
-        mov	r11, QWORD PTR [r9+112]
-        mov	r12, QWORD PTR [r9+120]
-        and	r11, r13
-        and	r12, r13
-        mov	QWORD PTR [rsp+112], r11
-        mov	QWORD PTR [rsp+120], r12
-        mov	rax, QWORD PTR [rcx]  ; from here: r -= scratch, borrow chained through sbb
-        mov	r10, QWORD PTR [rcx+8]
-        sub	rax, QWORD PTR [rsp]
-        sbb	r10, QWORD PTR [rsp+8]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r10
-        mov	rax, QWORD PTR [rcx+16]
-        mov	r10, QWORD PTR [rcx+24]
-        sbb	rax, QWORD PTR [rsp+16]
-        sbb	r10, QWORD PTR [rsp+24]
-        mov	QWORD PTR [rcx+16], rax
-        mov	QWORD PTR [rcx+24], r10
-        mov	rax, QWORD PTR [rcx+32]
-        mov	r10, QWORD PTR [rcx+40]
-        sbb	rax, QWORD PTR [rsp+32]
-        sbb	r10, QWORD PTR [rsp+40]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r10
-        mov	rax, QWORD PTR [rcx+48]
-        mov	r10, QWORD PTR [rcx+56]
-        sbb	rax, QWORD PTR [rsp+48]
-        sbb	r10, QWORD PTR [rsp+56]
-        mov	QWORD PTR [rcx+48], rax
-        mov	QWORD PTR [rcx+56], r10
-        mov	rax, QWORD PTR [rcx+64]
-        mov	r10, QWORD PTR [rcx+72]
-        sbb	rax, QWORD PTR [rsp+64]
-        sbb	r10, QWORD PTR [rsp+72]
-        mov	QWORD PTR [rcx+64], rax
-        mov	QWORD PTR [rcx+72], r10
-        mov	rax, QWORD PTR [rcx+80]
-        mov	r10, QWORD PTR [rcx+88]
-        sbb	rax, QWORD PTR [rsp+80]
-        sbb	r10, QWORD PTR [rsp+88]
-        mov	QWORD PTR [rcx+80], rax
-        mov	QWORD PTR [rcx+88], r10
-        mov	rax, QWORD PTR [rcx+96]
-        mov	r10, QWORD PTR [rcx+104]
-        sbb	rax, QWORD PTR [rsp+96]
-        sbb	r10, QWORD PTR [rsp+104]
-        mov	QWORD PTR [rcx+96], rax
-        mov	QWORD PTR [rcx+104], r10
-        mov	rax, QWORD PTR [rcx+112]
-        mov	r10, QWORD PTR [rcx+120]
-        sbb	rax, QWORD PTR [rsp+112]
-        sbb	r10, QWORD PTR [rsp+120]
-        mov	QWORD PTR [rcx+112], rax
-        mov	QWORD PTR [rcx+120], r10
-        add	rsp, 128  ; release the scratch area
-        pop	r13
-        pop	r12
-        ret
-sp_1024_mont_add_16 ENDP
-_text ENDS
-; /* Double a Montgomery form number (r = a + a % m), 16 limbs of 64 bits each.
-;  * Adds a to itself into r, then subtracts (m AND mask) from r; no branches.
-;  * r   Result of doubling (pointer in rcx).
-;  * a   Number to double in Montgomery form (pointer in rdx).
-;  * m   Modulus (prime) (pointer in r8).
-;  */
-_text SEGMENT READONLY PARA
-sp_1024_mont_dbl_16 PROC
-        push	r12
-        sub	rsp, 128  ; 128-byte scratch area on the stack for the masked copy of m
-        mov	rax, QWORD PTR [rdx]
-        mov	r9, QWORD PTR [rdx+8]
-        mov	r10, QWORD PTR [rdx+16]
-        mov	r11, QWORD PTR [rdx+24]
-        add	rax, QWORD PTR [rdx]
-        mov	r12, 0  ; zero the mask register with mov so CF from the add is kept
-        adc	r9, QWORD PTR [rdx+8]
-        adc	r10, QWORD PTR [rdx+16]
-        adc	r11, QWORD PTR [rdx+24]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r9
-        mov	QWORD PTR [rcx+16], r10
-        mov	QWORD PTR [rcx+24], r11
-        mov	rax, QWORD PTR [rdx+32]
-        mov	r9, QWORD PTR [rdx+40]
-        mov	r10, QWORD PTR [rdx+48]
-        mov	r11, QWORD PTR [rdx+56]
-        adc	rax, QWORD PTR [rdx+32]
-        adc	r9, QWORD PTR [rdx+40]
-        adc	r10, QWORD PTR [rdx+48]
-        adc	r11, QWORD PTR [rdx+56]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r9
-        mov	QWORD PTR [rcx+48], r10
-        mov	QWORD PTR [rcx+56], r11
-        mov	rax, QWORD PTR [rdx+64]
-        mov	r9, QWORD PTR [rdx+72]
-        mov	r10, QWORD PTR [rdx+80]
-        mov	r11, QWORD PTR [rdx+88]
-        adc	rax, QWORD PTR [rdx+64]
-        adc	r9, QWORD PTR [rdx+72]
-        adc	r10, QWORD PTR [rdx+80]
-        adc	r11, QWORD PTR [rdx+88]
-        mov	QWORD PTR [rcx+64], rax
-        mov	QWORD PTR [rcx+72], r9
-        mov	QWORD PTR [rcx+80], r10
-        mov	QWORD PTR [rcx+88], r11
-        mov	rax, QWORD PTR [rdx+96]
-        mov	r9, QWORD PTR [rdx+104]
-        mov	r10, QWORD PTR [rdx+112]
-        mov	r11, QWORD PTR [rdx+120]
-        adc	rax, QWORD PTR [rdx+96]
-        adc	r9, QWORD PTR [rdx+104]
-        adc	r10, QWORD PTR [rdx+112]
-        adc	r11, QWORD PTR [rdx+120]
-        mov	QWORD PTR [rcx+96], rax
-        mov	QWORD PTR [rcx+104], r9
-        mov	QWORD PTR [rcx+112], r10
-        mov	QWORD PTR [rcx+120], r11
-        sbb	r12, 0  ; r12 = 0 - CF: all ones if the 16-limb addition carried out
-        sub	r11, QWORD PTR [r8+120]  ; compare top limb of the sum with top limb of m
-        sbb	r11, r11  ; all ones if sum top limb < m top limb
-        not	r11  ; all ones if sum top limb >= m top limb
-        or	r12, r11  ; mask = carry-out OR top-limb >= ; NOTE(review): only the top limb is compared - confirm that is sufficient for the modulus in use
-        mov	r10, QWORD PTR [r8]  ; from here: scratch[i] = m[i] AND mask, two limbs at a time
-        mov	r11, QWORD PTR [r8+8]
-        and	r10, r12
-        and	r11, r12
-        mov	QWORD PTR [rsp], r10
-        mov	QWORD PTR [rsp+8], r11
-        mov	r10, QWORD PTR [r8+16]
-        mov	r11, QWORD PTR [r8+24]
-        and	r10, r12
-        and	r11, r12
-        mov	QWORD PTR [rsp+16], r10
-        mov	QWORD PTR [rsp+24], r11
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        and	r10, r12
-        and	r11, r12
-        mov	QWORD PTR [rsp+32], r10
-        mov	QWORD PTR [rsp+40], r11
-        mov	r10, QWORD PTR [r8+48]
-        mov	r11, QWORD PTR [r8+56]
-        and	r10, r12
-        and	r11, r12
-        mov	QWORD PTR [rsp+48], r10
-        mov	QWORD PTR [rsp+56], r11
-        mov	r10, QWORD PTR [r8+64]
-        mov	r11, QWORD PTR [r8+72]
-        and	r10, r12
-        and	r11, r12
-        mov	QWORD PTR [rsp+64], r10
-        mov	QWORD PTR [rsp+72], r11
-        mov	r10, QWORD PTR [r8+80]
-        mov	r11, QWORD PTR [r8+88]
-        and	r10, r12
-        and	r11, r12
-        mov	QWORD PTR [rsp+80], r10
-        mov	QWORD PTR [rsp+88], r11
-        mov	r10, QWORD PTR [r8+96]
-        mov	r11, QWORD PTR [r8+104]
-        and	r10, r12
-        and	r11, r12
-        mov	QWORD PTR [rsp+96], r10
-        mov	QWORD PTR [rsp+104], r11
-        mov	r10, QWORD PTR [r8+112]
-        mov	r11, QWORD PTR [r8+120]
-        and	r10, r12
-        and	r11, r12
-        mov	QWORD PTR [rsp+112], r10
-        mov	QWORD PTR [rsp+120], r11
-        mov	rax, QWORD PTR [rcx]  ; from here: r -= scratch, borrow chained through sbb
-        mov	r9, QWORD PTR [rcx+8]
-        sub	rax, QWORD PTR [rsp]
-        sbb	r9, QWORD PTR [rsp+8]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r9
-        mov	rax, QWORD PTR [rcx+16]
-        mov	r9, QWORD PTR [rcx+24]
-        sbb	rax, QWORD PTR [rsp+16]
-        sbb	r9, QWORD PTR [rsp+24]
-        mov	QWORD PTR [rcx+16], rax
-        mov	QWORD PTR [rcx+24], r9
-        mov	rax, QWORD PTR [rcx+32]
-        mov	r9, QWORD PTR [rcx+40]
-        sbb	rax, QWORD PTR [rsp+32]
-        sbb	r9, QWORD PTR [rsp+40]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r9
-        mov	rax, QWORD PTR [rcx+48]
-        mov	r9, QWORD PTR [rcx+56]
-        sbb	rax, QWORD PTR [rsp+48]
-        sbb	r9, QWORD PTR [rsp+56]
-        mov	QWORD PTR [rcx+48], rax
-        mov	QWORD PTR [rcx+56], r9
-        mov	rax, QWORD PTR [rcx+64]
-        mov	r9, QWORD PTR [rcx+72]
-        sbb	rax, QWORD PTR [rsp+64]
-        sbb	r9, QWORD PTR [rsp+72]
-        mov	QWORD PTR [rcx+64], rax
-        mov	QWORD PTR [rcx+72], r9
-        mov	rax, QWORD PTR [rcx+80]
-        mov	r9, QWORD PTR [rcx+88]
-        sbb	rax, QWORD PTR [rsp+80]
-        sbb	r9, QWORD PTR [rsp+88]
-        mov	QWORD PTR [rcx+80], rax
-        mov	QWORD PTR [rcx+88], r9
-        mov	rax, QWORD PTR [rcx+96]
-        mov	r9, QWORD PTR [rcx+104]
-        sbb	rax, QWORD PTR [rsp+96]
-        sbb	r9, QWORD PTR [rsp+104]
-        mov	QWORD PTR [rcx+96], rax
-        mov	QWORD PTR [rcx+104], r9
-        mov	rax, QWORD PTR [rcx+112]
-        mov	r9, QWORD PTR [rcx+120]
-        sbb	rax, QWORD PTR [rsp+112]
-        sbb	r9, QWORD PTR [rsp+120]
-        mov	QWORD PTR [rcx+112], rax
-        mov	QWORD PTR [rcx+120], r9
-        add	rsp, 128  ; release the scratch area
-        pop	r12
-        ret
-sp_1024_mont_dbl_16 ENDP
-_text ENDS
-; /* Triple a Montgomery form number (r = a + a + a % m), 16 limbs of 64 bits each.
-;  * Two passes of add + masked subtract of m. NOTE(review): a is re-read from [rdx] after r was written - looks unsafe if r == a; confirm callers.
-;  * r   Result of tripling (pointer in rcx).
-;  * a   Number to triple in Montgomery form (pointer in rdx).
-;  * m   Modulus (prime) (pointer in r8).
-;  */
-_text SEGMENT READONLY PARA
-sp_1024_mont_tpl_16 PROC
-        push	r12
-        sub	rsp, 128  ; 128-byte scratch area on the stack for the masked copy of m
-        mov	rax, QWORD PTR [rdx]  ; pass 1: r = a + a
-        mov	r9, QWORD PTR [rdx+8]
-        mov	r10, QWORD PTR [rdx+16]
-        mov	r11, QWORD PTR [rdx+24]
-        add	rax, QWORD PTR [rdx]
-        mov	r12, 0  ; zero the mask register with mov so CF from the add is kept
-        adc	r9, QWORD PTR [rdx+8]
-        adc	r10, QWORD PTR [rdx+16]
-        adc	r11, QWORD PTR [rdx+24]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r9
-        mov	QWORD PTR [rcx+16], r10
-        mov	QWORD PTR [rcx+24], r11
-        mov	rax, QWORD PTR [rdx+32]
-        mov	r9, QWORD PTR [rdx+40]
-        mov	r10, QWORD PTR [rdx+48]
-        mov	r11, QWORD PTR [rdx+56]
-        adc	rax, QWORD PTR [rdx+32]
-        adc	r9, QWORD PTR [rdx+40]
-        adc	r10, QWORD PTR [rdx+48]
-        adc	r11, QWORD PTR [rdx+56]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r9
-        mov	QWORD PTR [rcx+48], r10
-        mov	QWORD PTR [rcx+56], r11
-        mov	rax, QWORD PTR [rdx+64]
-        mov	r9, QWORD PTR [rdx+72]
-        mov	r10, QWORD PTR [rdx+80]
-        mov	r11, QWORD PTR [rdx+88]
-        adc	rax, QWORD PTR [rdx+64]
-        adc	r9, QWORD PTR [rdx+72]
-        adc	r10, QWORD PTR [rdx+80]
-        adc	r11, QWORD PTR [rdx+88]
-        mov	QWORD PTR [rcx+64], rax
-        mov	QWORD PTR [rcx+72], r9
-        mov	QWORD PTR [rcx+80], r10
-        mov	QWORD PTR [rcx+88], r11
-        mov	rax, QWORD PTR [rdx+96]
-        mov	r9, QWORD PTR [rdx+104]
-        mov	r10, QWORD PTR [rdx+112]
-        mov	r11, QWORD PTR [rdx+120]
-        adc	rax, QWORD PTR [rdx+96]
-        adc	r9, QWORD PTR [rdx+104]
-        adc	r10, QWORD PTR [rdx+112]
-        adc	r11, QWORD PTR [rdx+120]
-        mov	QWORD PTR [rcx+96], rax
-        mov	QWORD PTR [rcx+104], r9
-        mov	QWORD PTR [rcx+112], r10
-        mov	QWORD PTR [rcx+120], r11
-        sbb	r12, 0  ; r12 = 0 - CF: all ones if the 16-limb addition carried out
-        sub	r11, QWORD PTR [r8+120]  ; compare top limb of the sum with top limb of m
-        sbb	r11, r11  ; all ones if sum top limb < m top limb
-        not	r11  ; all ones if sum top limb >= m top limb
-        or	r12, r11  ; mask = carry-out OR top-limb >= ; NOTE(review): only the top limb is compared - confirm that is sufficient for the modulus in use
-        mov	r10, QWORD PTR [r8]  ; from here: scratch[i] = m[i] AND mask, two limbs at a time
-        mov	r11, QWORD PTR [r8+8]
-        and	r10, r12
-        and	r11, r12
-        mov	QWORD PTR [rsp], r10
-        mov	QWORD PTR [rsp+8], r11
-        mov	r10, QWORD PTR [r8+16]
-        mov	r11, QWORD PTR [r8+24]
-        and	r10, r12
-        and	r11, r12
-        mov	QWORD PTR [rsp+16], r10
-        mov	QWORD PTR [rsp+24], r11
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        and	r10, r12
-        and	r11, r12
-        mov	QWORD PTR [rsp+32], r10
-        mov	QWORD PTR [rsp+40], r11
-        mov	r10, QWORD PTR [r8+48]
-        mov	r11, QWORD PTR [r8+56]
-        and	r10, r12
-        and	r11, r12
-        mov	QWORD PTR [rsp+48], r10
-        mov	QWORD PTR [rsp+56], r11
-        mov	r10, QWORD PTR [r8+64]
-        mov	r11, QWORD PTR [r8+72]
-        and	r10, r12
-        and	r11, r12
-        mov	QWORD PTR [rsp+64], r10
-        mov	QWORD PTR [rsp+72], r11
-        mov	r10, QWORD PTR [r8+80]
-        mov	r11, QWORD PTR [r8+88]
-        and	r10, r12
-        and	r11, r12
-        mov	QWORD PTR [rsp+80], r10
-        mov	QWORD PTR [rsp+88], r11
-        mov	r10, QWORD PTR [r8+96]
-        mov	r11, QWORD PTR [r8+104]
-        and	r10, r12
-        and	r11, r12
-        mov	QWORD PTR [rsp+96], r10
-        mov	QWORD PTR [rsp+104], r11
-        mov	r10, QWORD PTR [r8+112]
-        mov	r11, QWORD PTR [r8+120]
-        and	r10, r12
-        and	r11, r12
-        mov	QWORD PTR [rsp+112], r10
-        mov	QWORD PTR [rsp+120], r11
-        mov	rax, QWORD PTR [rcx]  ; from here: r -= scratch, borrow chained through sbb
-        mov	r9, QWORD PTR [rcx+8]
-        sub	rax, QWORD PTR [rsp]
-        sbb	r9, QWORD PTR [rsp+8]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r9
-        mov	rax, QWORD PTR [rcx+16]
-        mov	r9, QWORD PTR [rcx+24]
-        sbb	rax, QWORD PTR [rsp+16]
-        sbb	r9, QWORD PTR [rsp+24]
-        mov	QWORD PTR [rcx+16], rax
-        mov	QWORD PTR [rcx+24], r9
-        mov	rax, QWORD PTR [rcx+32]
-        mov	r9, QWORD PTR [rcx+40]
-        sbb	rax, QWORD PTR [rsp+32]
-        sbb	r9, QWORD PTR [rsp+40]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r9
-        mov	rax, QWORD PTR [rcx+48]
-        mov	r9, QWORD PTR [rcx+56]
-        sbb	rax, QWORD PTR [rsp+48]
-        sbb	r9, QWORD PTR [rsp+56]
-        mov	QWORD PTR [rcx+48], rax
-        mov	QWORD PTR [rcx+56], r9
-        mov	rax, QWORD PTR [rcx+64]
-        mov	r9, QWORD PTR [rcx+72]
-        sbb	rax, QWORD PTR [rsp+64]
-        sbb	r9, QWORD PTR [rsp+72]
-        mov	QWORD PTR [rcx+64], rax
-        mov	QWORD PTR [rcx+72], r9
-        mov	rax, QWORD PTR [rcx+80]
-        mov	r9, QWORD PTR [rcx+88]
-        sbb	rax, QWORD PTR [rsp+80]
-        sbb	r9, QWORD PTR [rsp+88]
-        mov	QWORD PTR [rcx+80], rax
-        mov	QWORD PTR [rcx+88], r9
-        mov	rax, QWORD PTR [rcx+96]
-        mov	r9, QWORD PTR [rcx+104]
-        sbb	rax, QWORD PTR [rsp+96]
-        sbb	r9, QWORD PTR [rsp+104]
-        mov	QWORD PTR [rcx+96], rax
-        mov	QWORD PTR [rcx+104], r9
-        mov	rax, QWORD PTR [rcx+112]
-        mov	r9, QWORD PTR [rcx+120]
-        sbb	rax, QWORD PTR [rsp+112]
-        sbb	r9, QWORD PTR [rsp+120]
-        mov	QWORD PTR [rcx+112], rax
-        mov	QWORD PTR [rcx+120], r9
-        mov	rax, QWORD PTR [rcx]  ; pass 2: r = r + a (r read back from [rcx], a from [rdx])
-        mov	r9, QWORD PTR [rcx+8]
-        mov	r10, QWORD PTR [rcx+16]
-        mov	r11, QWORD PTR [rcx+24]
-        add	rax, QWORD PTR [rdx]
-        mov	r12, 0  ; reset the mask register; mov keeps CF from the add
-        adc	r9, QWORD PTR [rdx+8]
-        adc	r10, QWORD PTR [rdx+16]
-        adc	r11, QWORD PTR [rdx+24]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r9
-        mov	QWORD PTR [rcx+16], r10
-        mov	QWORD PTR [rcx+24], r11
-        mov	rax, QWORD PTR [rcx+32]
-        mov	r9, QWORD PTR [rcx+40]
-        mov	r10, QWORD PTR [rcx+48]
-        mov	r11, QWORD PTR [rcx+56]
-        adc	rax, QWORD PTR [rdx+32]
-        adc	r9, QWORD PTR [rdx+40]
-        adc	r10, QWORD PTR [rdx+48]
-        adc	r11, QWORD PTR [rdx+56]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r9
-        mov	QWORD PTR [rcx+48], r10
-        mov	QWORD PTR [rcx+56], r11
-        mov	rax, QWORD PTR [rcx+64]
-        mov	r9, QWORD PTR [rcx+72]
-        mov	r10, QWORD PTR [rcx+80]
-        mov	r11, QWORD PTR [rcx+88]
-        adc	rax, QWORD PTR [rdx+64]
-        adc	r9, QWORD PTR [rdx+72]
-        adc	r10, QWORD PTR [rdx+80]
-        adc	r11, QWORD PTR [rdx+88]
-        mov	QWORD PTR [rcx+64], rax
-        mov	QWORD PTR [rcx+72], r9
-        mov	QWORD PTR [rcx+80], r10
-        mov	QWORD PTR [rcx+88], r11
-        mov	rax, QWORD PTR [rcx+96]
-        mov	r9, QWORD PTR [rcx+104]
-        mov	r10, QWORD PTR [rcx+112]
-        mov	r11, QWORD PTR [rcx+120]
-        adc	rax, QWORD PTR [rdx+96]
-        adc	r9, QWORD PTR [rdx+104]
-        adc	r10, QWORD PTR [rdx+112]
-        adc	r11, QWORD PTR [rdx+120]
-        mov	QWORD PTR [rcx+96], rax
-        mov	QWORD PTR [rcx+104], r9
-        mov	QWORD PTR [rcx+112], r10
-        mov	QWORD PTR [rcx+120], r11
-        sbb	r12, 0  ; r12 = 0 - CF: all ones if the second addition carried out
-        sub	r11, QWORD PTR [r8+120]  ; compare top limb of the sum with top limb of m
-        sbb	r11, r11  ; all ones if sum top limb < m top limb
-        not	r11  ; all ones if sum top limb >= m top limb
-        or	r12, r11  ; mask for the second conditional subtraction
-        mov	r10, QWORD PTR [r8]  ; from here: scratch[i] = m[i] AND mask again
-        mov	r11, QWORD PTR [r8+8]
-        and	r10, r12
-        and	r11, r12
-        mov	QWORD PTR [rsp], r10
-        mov	QWORD PTR [rsp+8], r11
-        mov	r10, QWORD PTR [r8+16]
-        mov	r11, QWORD PTR [r8+24]
-        and	r10, r12
-        and	r11, r12
-        mov	QWORD PTR [rsp+16], r10
-        mov	QWORD PTR [rsp+24], r11
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        and	r10, r12
-        and	r11, r12
-        mov	QWORD PTR [rsp+32], r10
-        mov	QWORD PTR [rsp+40], r11
-        mov	r10, QWORD PTR [r8+48]
-        mov	r11, QWORD PTR [r8+56]
-        and	r10, r12
-        and	r11, r12
-        mov	QWORD PTR [rsp+48], r10
-        mov	QWORD PTR [rsp+56], r11
-        mov	r10, QWORD PTR [r8+64]
-        mov	r11, QWORD PTR [r8+72]
-        and	r10, r12
-        and	r11, r12
-        mov	QWORD PTR [rsp+64], r10
-        mov	QWORD PTR [rsp+72], r11
-        mov	r10, QWORD PTR [r8+80]
-        mov	r11, QWORD PTR [r8+88]
-        and	r10, r12
-        and	r11, r12
-        mov	QWORD PTR [rsp+80], r10
-        mov	QWORD PTR [rsp+88], r11
-        mov	r10, QWORD PTR [r8+96]
-        mov	r11, QWORD PTR [r8+104]
-        and	r10, r12
-        and	r11, r12
-        mov	QWORD PTR [rsp+96], r10
-        mov	QWORD PTR [rsp+104], r11
-        mov	r10, QWORD PTR [r8+112]
-        mov	r11, QWORD PTR [r8+120]
-        and	r10, r12
-        and	r11, r12
-        mov	QWORD PTR [rsp+112], r10
-        mov	QWORD PTR [rsp+120], r11
-        mov	rax, QWORD PTR [rcx]  ; from here: r -= scratch, borrow chained through sbb
-        mov	r9, QWORD PTR [rcx+8]
-        sub	rax, QWORD PTR [rsp]
-        sbb	r9, QWORD PTR [rsp+8]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r9
-        mov	rax, QWORD PTR [rcx+16]
-        mov	r9, QWORD PTR [rcx+24]
-        sbb	rax, QWORD PTR [rsp+16]
-        sbb	r9, QWORD PTR [rsp+24]
-        mov	QWORD PTR [rcx+16], rax
-        mov	QWORD PTR [rcx+24], r9
-        mov	rax, QWORD PTR [rcx+32]
-        mov	r9, QWORD PTR [rcx+40]
-        sbb	rax, QWORD PTR [rsp+32]
-        sbb	r9, QWORD PTR [rsp+40]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r9
-        mov	rax, QWORD PTR [rcx+48]
-        mov	r9, QWORD PTR [rcx+56]
-        sbb	rax, QWORD PTR [rsp+48]
-        sbb	r9, QWORD PTR [rsp+56]
-        mov	QWORD PTR [rcx+48], rax
-        mov	QWORD PTR [rcx+56], r9
-        mov	rax, QWORD PTR [rcx+64]
-        mov	r9, QWORD PTR [rcx+72]
-        sbb	rax, QWORD PTR [rsp+64]
-        sbb	r9, QWORD PTR [rsp+72]
-        mov	QWORD PTR [rcx+64], rax
-        mov	QWORD PTR [rcx+72], r9
-        mov	rax, QWORD PTR [rcx+80]
-        mov	r9, QWORD PTR [rcx+88]
-        sbb	rax, QWORD PTR [rsp+80]
-        sbb	r9, QWORD PTR [rsp+88]
-        mov	QWORD PTR [rcx+80], rax
-        mov	QWORD PTR [rcx+88], r9
-        mov	rax, QWORD PTR [rcx+96]
-        mov	r9, QWORD PTR [rcx+104]
-        sbb	rax, QWORD PTR [rsp+96]
-        sbb	r9, QWORD PTR [rsp+104]
-        mov	QWORD PTR [rcx+96], rax
-        mov	QWORD PTR [rcx+104], r9
-        mov	rax, QWORD PTR [rcx+112]
-        mov	r9, QWORD PTR [rcx+120]
-        sbb	rax, QWORD PTR [rsp+112]
-        sbb	r9, QWORD PTR [rsp+120]
-        mov	QWORD PTR [rcx+112], rax
-        mov	QWORD PTR [rcx+120], r9
-        add	rsp, 128  ; release the scratch area
-        pop	r12
-        ret
-sp_1024_mont_tpl_16 ENDP
-_text ENDS
-; /* Subtract two Montgomery form numbers (r = a - b % m), 16 limbs of 64 bits each.
-;  * Subtracts limb by limb into r, then adds (m AND mask) back when the subtraction borrowed; no branches.
-;  * r   Result of subtraction (pointer in rcx).
-;  * a   Number to subtract from, in Montgomery form (pointer in rdx).
-;  * b   Number to subtract, in Montgomery form (pointer in r8).
-;  * m   Modulus (prime) (pointer in r9).
-;  */
-_text SEGMENT READONLY PARA
-sp_1024_mont_sub_16 PROC
-        push	r12
-        push	r13
-        sub	rsp, 128  ; 128-byte scratch area on the stack for the masked copy of m
-        mov	rax, QWORD PTR [rdx]
-        mov	r10, QWORD PTR [rdx+8]
-        mov	r11, QWORD PTR [rdx+16]
-        mov	r12, QWORD PTR [rdx+24]
-        sub	rax, QWORD PTR [r8]
-        mov	r13, 0  ; zero the mask register with mov so CF (borrow) from the sub is kept
-        sbb	r10, QWORD PTR [r8+8]
-        sbb	r11, QWORD PTR [r8+16]
-        sbb	r12, QWORD PTR [r8+24]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r10
-        mov	QWORD PTR [rcx+16], r11
-        mov	QWORD PTR [rcx+24], r12
-        mov	rax, QWORD PTR [rdx+32]
-        mov	r10, QWORD PTR [rdx+40]
-        mov	r11, QWORD PTR [rdx+48]
-        mov	r12, QWORD PTR [rdx+56]
-        sbb	rax, QWORD PTR [r8+32]
-        sbb	r10, QWORD PTR [r8+40]
-        sbb	r11, QWORD PTR [r8+48]
-        sbb	r12, QWORD PTR [r8+56]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r10
-        mov	QWORD PTR [rcx+48], r11
-        mov	QWORD PTR [rcx+56], r12
-        mov	rax, QWORD PTR [rdx+64]
-        mov	r10, QWORD PTR [rdx+72]
-        mov	r11, QWORD PTR [rdx+80]
-        mov	r12, QWORD PTR [rdx+88]
-        sbb	rax, QWORD PTR [r8+64]
-        sbb	r10, QWORD PTR [r8+72]
-        sbb	r11, QWORD PTR [r8+80]
-        sbb	r12, QWORD PTR [r8+88]
-        mov	QWORD PTR [rcx+64], rax
-        mov	QWORD PTR [rcx+72], r10
-        mov	QWORD PTR [rcx+80], r11
-        mov	QWORD PTR [rcx+88], r12
-        mov	rax, QWORD PTR [rdx+96]
-        mov	r10, QWORD PTR [rdx+104]
-        mov	r11, QWORD PTR [rdx+112]
-        mov	r12, QWORD PTR [rdx+120]
-        sbb	rax, QWORD PTR [r8+96]
-        sbb	r10, QWORD PTR [r8+104]
-        sbb	r11, QWORD PTR [r8+112]
-        sbb	r12, QWORD PTR [r8+120]
-        mov	QWORD PTR [rcx+96], rax
-        mov	QWORD PTR [rcx+104], r10
-        mov	QWORD PTR [rcx+112], r11
-        mov	QWORD PTR [rcx+120], r12
-        sbb	r13, 0  ; r13 = 0 - CF: all ones if the 16-limb subtraction borrowed (a < b)
-        mov	r11, QWORD PTR [r9]  ; from here: scratch[i] = m[i] AND mask, two limbs at a time
-        mov	r12, QWORD PTR [r9+8]
-        and	r11, r13
-        and	r12, r13
-        mov	QWORD PTR [rsp], r11
-        mov	QWORD PTR [rsp+8], r12
-        mov	r11, QWORD PTR [r9+16]
-        mov	r12, QWORD PTR [r9+24]
-        and	r11, r13
-        and	r12, r13
-        mov	QWORD PTR [rsp+16], r11
-        mov	QWORD PTR [rsp+24], r12
-        mov	r11, QWORD PTR [r9+32]
-        mov	r12, QWORD PTR [r9+40]
-        and	r11, r13
-        and	r12, r13
-        mov	QWORD PTR [rsp+32], r11
-        mov	QWORD PTR [rsp+40], r12
-        mov	r11, QWORD PTR [r9+48]
-        mov	r12, QWORD PTR [r9+56]
-        and	r11, r13
-        and	r12, r13
-        mov	QWORD PTR [rsp+48], r11
-        mov	QWORD PTR [rsp+56], r12
-        mov	r11, QWORD PTR [r9+64]
-        mov	r12, QWORD PTR [r9+72]
-        and	r11, r13
-        and	r12, r13
-        mov	QWORD PTR [rsp+64], r11
-        mov	QWORD PTR [rsp+72], r12
-        mov	r11, QWORD PTR [r9+80]
-        mov	r12, QWORD PTR [r9+88]
-        and	r11, r13
-        and	r12, r13
-        mov	QWORD PTR [rsp+80], r11
-        mov	QWORD PTR [rsp+88], r12
-        mov	r11, QWORD PTR [r9+96]
-        mov	r12, QWORD PTR [r9+104]
-        and	r11, r13
-        and	r12, r13
-        mov	QWORD PTR [rsp+96], r11
-        mov	QWORD PTR [rsp+104], r12
-        mov	r11, QWORD PTR [r9+112]
-        mov	r12, QWORD PTR [r9+120]
-        and	r11, r13
-        and	r12, r13
-        mov	QWORD PTR [rsp+112], r11
-        mov	QWORD PTR [rsp+120], r12
-        mov	rax, QWORD PTR [rcx]  ; from here: r += scratch, carry chained through adc
-        mov	r10, QWORD PTR [rcx+8]
-        add	rax, QWORD PTR [rsp]
-        adc	r10, QWORD PTR [rsp+8]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r10
-        mov	rax, QWORD PTR [rcx+16]
-        mov	r10, QWORD PTR [rcx+24]
-        adc	rax, QWORD PTR [rsp+16]
-        adc	r10, QWORD PTR [rsp+24]
-        mov	QWORD PTR [rcx+16], rax
-        mov	QWORD PTR [rcx+24], r10
-        mov	rax, QWORD PTR [rcx+32]
-        mov	r10, QWORD PTR [rcx+40]
-        adc	rax, QWORD PTR [rsp+32]
-        adc	r10, QWORD PTR [rsp+40]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r10
-        mov	rax, QWORD PTR [rcx+48]
-        mov	r10, QWORD PTR [rcx+56]
-        adc	rax, QWORD PTR [rsp+48]
-        adc	r10, QWORD PTR [rsp+56]
-        mov	QWORD PTR [rcx+48], rax
-        mov	QWORD PTR [rcx+56], r10
-        mov	rax, QWORD PTR [rcx+64]
-        mov	r10, QWORD PTR [rcx+72]
-        adc	rax, QWORD PTR [rsp+64]
-        adc	r10, QWORD PTR [rsp+72]
-        mov	QWORD PTR [rcx+64], rax
-        mov	QWORD PTR [rcx+72], r10
-        mov	rax, QWORD PTR [rcx+80]
-        mov	r10, QWORD PTR [rcx+88]
-        adc	rax, QWORD PTR [rsp+80]
-        adc	r10, QWORD PTR [rsp+88]
-        mov	QWORD PTR [rcx+80], rax
-        mov	QWORD PTR [rcx+88], r10
-        mov	rax, QWORD PTR [rcx+96]
-        mov	r10, QWORD PTR [rcx+104]
-        adc	rax, QWORD PTR [rsp+96]
-        adc	r10, QWORD PTR [rsp+104]
-        mov	QWORD PTR [rcx+96], rax
-        mov	QWORD PTR [rcx+104], r10
-        mov	rax, QWORD PTR [rcx+112]
-        mov	r10, QWORD PTR [rcx+120]
-        adc	rax, QWORD PTR [rsp+112]
-        adc	r10, QWORD PTR [rsp+120]
-        mov	QWORD PTR [rcx+112], rax
-        mov	QWORD PTR [rcx+120], r10
-        add	rsp, 128  ; release the scratch area
-        pop	r13
-        pop	r12
-        ret
-sp_1024_mont_sub_16 ENDP
-_text ENDS
-; /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m)
-;  *
-;  * r  Result of division by 2.
-;  * a  Number to divide.
-;  * m  Modulus (prime).
-;  */
-_text SEGMENT READONLY PARA
-sp_1024_mont_div2_16 PROC    ; as used below: rcx = r, rdx = a, r8 = m (16 64-bit words each)
-        push	r12
-        push	r13
-        sub	rsp, 128    ; 128-byte stack scratch holding the 16-word intermediate sum
-        mov	r13, QWORD PTR [rdx]    ; r13 = a[0]
-        xor	r12, r12    ; r12 will receive the carry out of the addition below
-        mov	rax, r13    ; keep a[0] for the first add
-        and	r13, 1    ; isolate the low bit of a
-        neg	r13    ; mask = all ones when a is odd, 0 when a is even
-        mov	r10, QWORD PTR [r8]
-        and	r10, r13    ; scratch[i] = m[i] & mask, repeated for all 16 words
-        mov	QWORD PTR [rsp], r10
-        mov	r10, QWORD PTR [r8+8]
-        and	r10, r13
-        mov	QWORD PTR [rsp+8], r10
-        mov	r10, QWORD PTR [r8+16]
-        and	r10, r13
-        mov	QWORD PTR [rsp+16], r10
-        mov	r10, QWORD PTR [r8+24]
-        and	r10, r13
-        mov	QWORD PTR [rsp+24], r10
-        mov	r10, QWORD PTR [r8+32]
-        and	r10, r13
-        mov	QWORD PTR [rsp+32], r10
-        mov	r10, QWORD PTR [r8+40]
-        and	r10, r13
-        mov	QWORD PTR [rsp+40], r10
-        mov	r10, QWORD PTR [r8+48]
-        and	r10, r13
-        mov	QWORD PTR [rsp+48], r10
-        mov	r10, QWORD PTR [r8+56]
-        and	r10, r13
-        mov	QWORD PTR [rsp+56], r10
-        mov	r10, QWORD PTR [r8+64]
-        and	r10, r13
-        mov	QWORD PTR [rsp+64], r10
-        mov	r10, QWORD PTR [r8+72]
-        and	r10, r13
-        mov	QWORD PTR [rsp+72], r10
-        mov	r10, QWORD PTR [r8+80]
-        and	r10, r13
-        mov	QWORD PTR [rsp+80], r10
-        mov	r10, QWORD PTR [r8+88]
-        and	r10, r13
-        mov	QWORD PTR [rsp+88], r10
-        mov	r10, QWORD PTR [r8+96]
-        and	r10, r13
-        mov	QWORD PTR [rsp+96], r10
-        mov	r10, QWORD PTR [r8+104]
-        and	r10, r13
-        mov	QWORD PTR [rsp+104], r10
-        mov	r10, QWORD PTR [r8+112]
-        and	r10, r13
-        mov	QWORD PTR [rsp+112], r10
-        mov	r10, QWORD PTR [r8+120]
-        and	r10, r13
-        mov	QWORD PTR [rsp+120], r10
-        add	QWORD PTR [rsp], rax    ; scratch += a, starting the carry chain (mov between adc's keeps CF)
-        mov	rax, QWORD PTR [rdx+8]
-        adc	QWORD PTR [rsp+8], rax
-        mov	rax, QWORD PTR [rdx+16]
-        adc	QWORD PTR [rsp+16], rax
-        mov	rax, QWORD PTR [rdx+24]
-        adc	QWORD PTR [rsp+24], rax
-        mov	rax, QWORD PTR [rdx+32]
-        adc	QWORD PTR [rsp+32], rax
-        mov	rax, QWORD PTR [rdx+40]
-        adc	QWORD PTR [rsp+40], rax
-        mov	rax, QWORD PTR [rdx+48]
-        adc	QWORD PTR [rsp+48], rax
-        mov	rax, QWORD PTR [rdx+56]
-        adc	QWORD PTR [rsp+56], rax
-        mov	rax, QWORD PTR [rdx+64]
-        adc	QWORD PTR [rsp+64], rax
-        mov	rax, QWORD PTR [rdx+72]
-        adc	QWORD PTR [rsp+72], rax
-        mov	rax, QWORD PTR [rdx+80]
-        adc	QWORD PTR [rsp+80], rax
-        mov	rax, QWORD PTR [rdx+88]
-        adc	QWORD PTR [rsp+88], rax
-        mov	rax, QWORD PTR [rdx+96]
-        adc	QWORD PTR [rsp+96], rax
-        mov	rax, QWORD PTR [rdx+104]
-        adc	QWORD PTR [rsp+104], rax
-        mov	rax, QWORD PTR [rdx+112]
-        adc	QWORD PTR [rsp+112], rax
-        mov	rax, QWORD PTR [rdx+120]
-        adc	QWORD PTR [rsp+120], rax
-        adc	r12, 0    ; r12 = carry out of the 16-word sum
-        mov	rax, QWORD PTR [rsp]
-        mov	r9, QWORD PTR [rsp+8]
-        shrd	rax, r9, 1    ; word >> 1 with bit 0 of the next word shifted into bit 63
-        mov	QWORD PTR [rcx], rax
-        mov	rax, QWORD PTR [rsp+16]
-        shrd	r9, rax, 1    ; rax and r9 alternate as current/next word down the chain
-        mov	QWORD PTR [rcx+8], r9
-        mov	r9, QWORD PTR [rsp+24]
-        shrd	rax, r9, 1
-        mov	QWORD PTR [rcx+16], rax
-        mov	rax, QWORD PTR [rsp+32]
-        shrd	r9, rax, 1
-        mov	QWORD PTR [rcx+24], r9
-        mov	r9, QWORD PTR [rsp+40]
-        shrd	rax, r9, 1
-        mov	QWORD PTR [rcx+32], rax
-        mov	rax, QWORD PTR [rsp+48]
-        shrd	r9, rax, 1
-        mov	QWORD PTR [rcx+40], r9
-        mov	r9, QWORD PTR [rsp+56]
-        shrd	rax, r9, 1
-        mov	QWORD PTR [rcx+48], rax
-        mov	rax, QWORD PTR [rsp+64]
-        shrd	r9, rax, 1
-        mov	QWORD PTR [rcx+56], r9
-        mov	r9, QWORD PTR [rsp+72]
-        shrd	rax, r9, 1
-        mov	QWORD PTR [rcx+64], rax
-        mov	rax, QWORD PTR [rsp+80]
-        shrd	r9, rax, 1
-        mov	QWORD PTR [rcx+72], r9
-        mov	r9, QWORD PTR [rsp+88]
-        shrd	rax, r9, 1
-        mov	QWORD PTR [rcx+80], rax
-        mov	rax, QWORD PTR [rsp+96]
-        shrd	r9, rax, 1
-        mov	QWORD PTR [rcx+88], r9
-        mov	r9, QWORD PTR [rsp+104]
-        shrd	rax, r9, 1
-        mov	QWORD PTR [rcx+96], rax
-        mov	rax, QWORD PTR [rsp+112]
-        shrd	r9, rax, 1
-        mov	QWORD PTR [rcx+104], r9
-        mov	r9, QWORD PTR [rsp+120]
-        shrd	rax, r9, 1
-        mov	QWORD PTR [rcx+112], rax
-        shrd	r9, r12, 1    ; top word takes the saved carry as its new bit 63
-        mov	QWORD PTR [rcx+120], r9
-        add	rsp, 128    ; release the scratch buffer
-        pop	r13
-        pop	r12
-        ret
-sp_1024_mont_div2_16 ENDP
-_text ENDS
-IFDEF HAVE_INTEL_AVX2
-; /* Reduce the number back to 1024 bits using Montgomery reduction.
-;  *
-;  * a   A single precision number to reduce in place.
-;  * m   The single precision number representing the modulus.
-;  * mp  The digit representing the negative inverse of m mod 2^n.
-;  */
-_text SEGMENT READONLY PARA
-sp_1024_mont_reduce_avx2_16 PROC    ; as used below: rcx = a, rdx = m, r8 = mp
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi    ; rdi and rsi are used as scratch below, so they are saved too
-        push	rsi
-        push	rbx
-        push	rbp
-        mov	r9, rcx    ; r9 = a (rcx is reused as the mulx high-half destination)
-        mov	r10, rdx    ; r10 = m (rdx is the implicit mulx multiplicand)
-        xor	rbp, rbp    ; rbp = carry passed from one row to the next
-        ; i = 16
-        mov	r11, 16
-        mov	r14, QWORD PTR [r9]    ; a[0..3] are cached in r14, r15, rdi, rsi
-        mov	r15, QWORD PTR [r9+8]
-        mov	rdi, QWORD PTR [r9+16]
-        mov	rsi, QWORD PTR [r9+24]
-        add	r9, 64    ; bias the pointer by 8 words, so [r9-32] addresses a[4]
-        xor	rbp, rbp    ; repeated clear; rbp is already zero here
-L_1024_mont_reduce_avx2_16_loop:
-        ; mu = a[i] * mp
-        mov	rdx, r14
-        mov	r12, r14
-        imul	rdx, r8    ; rdx = low 64 bits of a[i] * mp
-        xor	rbx, rbx    ; rbx = 0, and CF = OF = 0 to start the adcx/adox chains
-        ; a[i+0] += m[0] * mu
-        mulx	rcx, rax, QWORD PTR [r10]    ; rcx:rax = m[0] * mu
-        mov	r14, r15    ; slide the 4-register window down by one word
-        adcx	r12, rax    ; low word sum is never stored; only its carry is used
-        adox	r14, rcx
-        ; a[i+1] += m[1] * mu
-        mulx	rcx, rax, QWORD PTR [r10+8]
-        mov	r15, rdi
-        adcx	r14, rax
-        adox	r15, rcx
-        ; a[i+2] += m[2] * mu
-        mulx	rcx, rax, QWORD PTR [r10+16]
-        mov	rdi, rsi
-        adcx	r15, rax
-        adox	rdi, rcx
-        ; a[i+3] += m[3] * mu
-        mulx	rcx, rax, QWORD PTR [r10+24]
-        mov	rsi, QWORD PTR [r9+-32]    ; next word enters the register window from memory
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; a[i+4] += m[4] * mu
-        mulx	rcx, rax, QWORD PTR [r10+32]
-        mov	r13, QWORD PTR [r9+-24]    ; from here on r12/r13 alternate as load/accumulate/store temporaries
-        adcx	rsi, rax
-        adox	r13, rcx
-        ; a[i+5] += m[5] * mu
-        mulx	rcx, rax, QWORD PTR [r10+40]
-        mov	r12, QWORD PTR [r9+-16]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-24], r13
-        ; a[i+6] += m[6] * mu
-        mulx	rcx, rax, QWORD PTR [r10+48]
-        mov	r13, QWORD PTR [r9+-8]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-16], r12
-        ; a[i+7] += m[7] * mu
-        mulx	rcx, rax, QWORD PTR [r10+56]
-        mov	r12, QWORD PTR [r9]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-8], r13
-        ; a[i+8] += m[8] * mu
-        mulx	rcx, rax, QWORD PTR [r10+64]
-        mov	r13, QWORD PTR [r9+8]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9], r12
-        ; a[i+9] += m[9] * mu
-        mulx	rcx, rax, QWORD PTR [r10+72]
-        mov	r12, QWORD PTR [r9+16]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+8], r13
-        ; a[i+10] += m[10] * mu
-        mulx	rcx, rax, QWORD PTR [r10+80]
-        mov	r13, QWORD PTR [r9+24]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+16], r12
-        ; a[i+11] += m[11] * mu
-        mulx	rcx, rax, QWORD PTR [r10+88]
-        mov	r12, QWORD PTR [r9+32]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+24], r13
-        ; a[i+12] += m[12] * mu
-        mulx	rcx, rax, QWORD PTR [r10+96]
-        mov	r13, QWORD PTR [r9+40]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+32], r12
-        ; a[i+13] += m[13] * mu
-        mulx	rcx, rax, QWORD PTR [r10+104]
-        mov	r12, QWORD PTR [r9+48]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+40], r13
-        ; a[i+14] += m[14] * mu
-        mulx	rcx, rax, QWORD PTR [r10+112]
-        mov	r13, QWORD PTR [r9+56]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+48], r12
-        ; a[i+15] += m[15] * mu
-        mulx	rcx, rax, QWORD PTR [r10+120]
-        mov	r12, QWORD PTR [r9+64]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+56], r13
-        adcx	r12, rbp    ; fold in the carry left by the previous row
-        mov	rbp, rbx    ; rbp = 0 (mov leaves CF and OF untouched)
-        mov	QWORD PTR [r9+64], r12
-        adox	rbp, rbx    ; rbp += OF
-        adcx	rbp, rbx    ; rbp += CF; rbp is now this row's carry out
-        ; mu = a[i] * mp
-        mov	rdx, r14    ; second unrolled row: same steps with every a[] offset one word higher
-        mov	r12, r14
-        imul	rdx, r8
-        xor	rbx, rbx
-        ; a[i+0] += m[0] * mu
-        mulx	rcx, rax, QWORD PTR [r10]
-        mov	r14, r15
-        adcx	r12, rax
-        adox	r14, rcx
-        ; a[i+1] += m[1] * mu
-        mulx	rcx, rax, QWORD PTR [r10+8]
-        mov	r15, rdi
-        adcx	r14, rax
-        adox	r15, rcx
-        ; a[i+2] += m[2] * mu
-        mulx	rcx, rax, QWORD PTR [r10+16]
-        mov	rdi, rsi
-        adcx	r15, rax
-        adox	rdi, rcx
-        ; a[i+3] += m[3] * mu
-        mulx	rcx, rax, QWORD PTR [r10+24]
-        mov	rsi, QWORD PTR [r9+-24]
-        adcx	rdi, rax
-        adox	rsi, rcx
-        ; a[i+4] += m[4] * mu
-        mulx	rcx, rax, QWORD PTR [r10+32]
-        mov	r13, QWORD PTR [r9+-16]
-        adcx	rsi, rax
-        adox	r13, rcx
-        ; a[i+5] += m[5] * mu
-        mulx	rcx, rax, QWORD PTR [r10+40]
-        mov	r12, QWORD PTR [r9+-8]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+-16], r13
-        ; a[i+6] += m[6] * mu
-        mulx	rcx, rax, QWORD PTR [r10+48]
-        mov	r13, QWORD PTR [r9]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+-8], r12
-        ; a[i+7] += m[7] * mu
-        mulx	rcx, rax, QWORD PTR [r10+56]
-        mov	r12, QWORD PTR [r9+8]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9], r13
-        ; a[i+8] += m[8] * mu
-        mulx	rcx, rax, QWORD PTR [r10+64]
-        mov	r13, QWORD PTR [r9+16]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+8], r12
-        ; a[i+9] += m[9] * mu
-        mulx	rcx, rax, QWORD PTR [r10+72]
-        mov	r12, QWORD PTR [r9+24]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+16], r13
-        ; a[i+10] += m[10] * mu
-        mulx	rcx, rax, QWORD PTR [r10+80]
-        mov	r13, QWORD PTR [r9+32]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+24], r12
-        ; a[i+11] += m[11] * mu
-        mulx	rcx, rax, QWORD PTR [r10+88]
-        mov	r12, QWORD PTR [r9+40]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+32], r13
-        ; a[i+12] += m[12] * mu
-        mulx	rcx, rax, QWORD PTR [r10+96]
-        mov	r13, QWORD PTR [r9+48]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+40], r12
-        ; a[i+13] += m[13] * mu
-        mulx	rcx, rax, QWORD PTR [r10+104]
-        mov	r12, QWORD PTR [r9+56]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+48], r13
-        ; a[i+14] += m[14] * mu
-        mulx	rcx, rax, QWORD PTR [r10+112]
-        mov	r13, QWORD PTR [r9+64]
-        adcx	r12, rax
-        adox	r13, rcx
-        mov	QWORD PTR [r9+56], r12
-        ; a[i+15] += m[15] * mu
-        mulx	rcx, rax, QWORD PTR [r10+120]
-        mov	r12, QWORD PTR [r9+72]
-        adcx	r13, rax
-        adox	r12, rcx
-        mov	QWORD PTR [r9+64], r13
-        adcx	r12, rbp
-        mov	rbp, rbx
-        mov	QWORD PTR [r9+72], r12
-        adox	rbp, rbx
-        adcx	rbp, rbx
-        ; a += 2
-        add	r9, 16
-        ; i -= 2
-        sub	r11, 2
-        jnz	L_1024_mont_reduce_avx2_16_loop
-        sub	r9, 64    ; undo the bias: r9 = a + 16 words (the upper half)
-        sub	r12, QWORD PTR [r10+120]    ; compare the top word (still in r12) with m[15]
-        mov	r8, r9    ; r8 = source pointer to the upper half (mov keeps the borrow flag)
-        sbb	r12, r12    ; r12 = all ones if top word < m[15], else 0
-        neg	rbp    ; rbp = 0 - final carry
-        not	r12    ; r12 = all ones if top word >= m[15]
-        or	rbp, r12    ; mask: subtract m when there was a carry or top word >= m[15]
-        sub	r9, 128    ; r9 = a, the destination for the reduced result
-        mov	rcx, QWORD PTR [r10]
-        mov	rdx, r14    ; the first four upper-half words live only in r14, r15, rdi, rsi
-        pext	rcx, rcx, rbp    ; m[i] when the mask is all ones, 0 when the mask is 0
-        sub	rdx, rcx    ; start of the borrow chain; loads, pext and stores below leave CF intact
-        mov	rcx, QWORD PTR [r10+8]
-        mov	rax, r15
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+16]
-        mov	rcx, rdi
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+8], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+24]
-        mov	rdx, rsi
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+16], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+32]
-        mov	rax, QWORD PTR [r8+32]    ; remaining upper-half words are read back from memory
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+24], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+40]
-        mov	rcx, QWORD PTR [r8+40]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+32], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+48]
-        mov	rdx, QWORD PTR [r8+48]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+40], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+56]
-        mov	rax, QWORD PTR [r8+56]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+48], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+64]
-        mov	rcx, QWORD PTR [r8+64]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+56], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+72]
-        mov	rdx, QWORD PTR [r8+72]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+64], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+80]
-        mov	rax, QWORD PTR [r8+80]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+72], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+88]
-        mov	rcx, QWORD PTR [r8+88]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+80], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+96]
-        mov	rdx, QWORD PTR [r8+96]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+88], rcx
-        sbb	rdx, rax
-        mov	rcx, QWORD PTR [r10+104]
-        mov	rax, QWORD PTR [r8+104]
-        pext	rcx, rcx, rbp
-        mov	QWORD PTR [r9+96], rdx
-        sbb	rax, rcx
-        mov	rdx, QWORD PTR [r10+112]
-        mov	rcx, QWORD PTR [r8+112]
-        pext	rdx, rdx, rbp
-        mov	QWORD PTR [r9+104], rax
-        sbb	rcx, rdx
-        mov	rax, QWORD PTR [r10+120]
-        mov	rdx, QWORD PTR [r8+120]
-        pext	rax, rax, rbp
-        mov	QWORD PTR [r9+112], rcx
-        sbb	rdx, rax
-        mov	QWORD PTR [r9+120], rdx
-        pop	rbp
-        pop	rbx
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-sp_1024_mont_reduce_avx2_16 ENDP
-_text ENDS
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-; /* Add two Montgomery form numbers (r = a + b % m).
-;  *
-;  * r   Result of addition.
-;  * a   First number to add in Montgomery form.
-;  * b   Second number to add in Montgomery form.
-;  * m   Modulus (prime).
-;  */
-_text SEGMENT READONLY PARA
-sp_1024_mont_add_avx2_16 PROC    ; as used below: rcx = r, rdx = a, r8 = b, r9 = m
-        push	r12
-        push	r13
-        mov	rax, QWORD PTR [rdx]
-        mov	r10, QWORD PTR [rdx+8]
-        mov	r11, QWORD PTR [rdx+16]
-        mov	r12, QWORD PTR [rdx+24]
-        add	rax, QWORD PTR [r8]    ; start of the 16-word carry chain, four words per group
-        mov	r13, 0    ; mov rather than xor, so the carry flag survives
-        adc	r10, QWORD PTR [r8+8]
-        adc	r11, QWORD PTR [r8+16]
-        adc	r12, QWORD PTR [r8+24]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r10
-        mov	QWORD PTR [rcx+16], r11
-        mov	QWORD PTR [rcx+24], r12
-        mov	rax, QWORD PTR [rdx+32]
-        mov	r10, QWORD PTR [rdx+40]
-        mov	r11, QWORD PTR [rdx+48]
-        mov	r12, QWORD PTR [rdx+56]
-        adc	rax, QWORD PTR [r8+32]
-        adc	r10, QWORD PTR [r8+40]
-        adc	r11, QWORD PTR [r8+48]
-        adc	r12, QWORD PTR [r8+56]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r10
-        mov	QWORD PTR [rcx+48], r11
-        mov	QWORD PTR [rcx+56], r12
-        mov	rax, QWORD PTR [rdx+64]
-        mov	r10, QWORD PTR [rdx+72]
-        mov	r11, QWORD PTR [rdx+80]
-        mov	r12, QWORD PTR [rdx+88]
-        adc	rax, QWORD PTR [r8+64]
-        adc	r10, QWORD PTR [r8+72]
-        adc	r11, QWORD PTR [r8+80]
-        adc	r12, QWORD PTR [r8+88]
-        mov	QWORD PTR [rcx+64], rax
-        mov	QWORD PTR [rcx+72], r10
-        mov	QWORD PTR [rcx+80], r11
-        mov	QWORD PTR [rcx+88], r12
-        mov	rax, QWORD PTR [rdx+96]
-        mov	r10, QWORD PTR [rdx+104]
-        mov	r11, QWORD PTR [rdx+112]
-        mov	r12, QWORD PTR [rdx+120]
-        adc	rax, QWORD PTR [r8+96]
-        adc	r10, QWORD PTR [r8+104]
-        adc	r11, QWORD PTR [r8+112]
-        adc	r12, QWORD PTR [r8+120]
-        mov	QWORD PTR [rcx+96], rax
-        mov	QWORD PTR [rcx+104], r10
-        mov	QWORD PTR [rcx+112], r11
-        mov	QWORD PTR [rcx+120], r12
-        sbb	r13, 0    ; r13 = all ones if the addition carried out, else 0
-        sub	r12, QWORD PTR [r9+120]    ; compare the top word of the sum with m[15]
-        sbb	r12, r12    ; r12 = all ones if top word < m[15], else 0
-        not	r12    ; r12 = all ones if top word >= m[15]
-        or	r13, r12    ; mask: subtract m on carry or when top word >= m[15]
-        mov	r11, QWORD PTR [r9]
-        mov	r12, QWORD PTR [r9+8]
-        mov	rax, QWORD PTR [rcx]
-        mov	r10, QWORD PTR [rcx+8]
-        pext	r11, r11, r13    ; m[i] when the mask is all ones, 0 when the mask is 0
-        pext	r12, r12, r13
-        sub	rax, r11    ; r -= (m & mask); the borrow chain runs through the remaining words
-        sbb	r10, r12
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r10
-        mov	r11, QWORD PTR [r9+16]
-        mov	r12, QWORD PTR [r9+24]
-        mov	rax, QWORD PTR [rcx+16]
-        mov	r10, QWORD PTR [rcx+24]
-        pext	r11, r11, r13
-        pext	r12, r12, r13
-        sbb	rax, r11
-        sbb	r10, r12
-        mov	QWORD PTR [rcx+16], rax
-        mov	QWORD PTR [rcx+24], r10
-        mov	r11, QWORD PTR [r9+32]
-        mov	r12, QWORD PTR [r9+40]
-        mov	rax, QWORD PTR [rcx+32]
-        mov	r10, QWORD PTR [rcx+40]
-        pext	r11, r11, r13
-        pext	r12, r12, r13
-        sbb	rax, r11
-        sbb	r10, r12
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r10
-        mov	r11, QWORD PTR [r9+48]
-        mov	r12, QWORD PTR [r9+56]
-        mov	rax, QWORD PTR [rcx+48]
-        mov	r10, QWORD PTR [rcx+56]
-        pext	r11, r11, r13
-        pext	r12, r12, r13
-        sbb	rax, r11
-        sbb	r10, r12
-        mov	QWORD PTR [rcx+48], rax
-        mov	QWORD PTR [rcx+56], r10
-        mov	r11, QWORD PTR [r9+64]
-        mov	r12, QWORD PTR [r9+72]
-        mov	rax, QWORD PTR [rcx+64]
-        mov	r10, QWORD PTR [rcx+72]
-        pext	r11, r11, r13
-        pext	r12, r12, r13
-        sbb	rax, r11
-        sbb	r10, r12
-        mov	QWORD PTR [rcx+64], rax
-        mov	QWORD PTR [rcx+72], r10
-        mov	r11, QWORD PTR [r9+80]
-        mov	r12, QWORD PTR [r9+88]
-        mov	rax, QWORD PTR [rcx+80]
-        mov	r10, QWORD PTR [rcx+88]
-        pext	r11, r11, r13
-        pext	r12, r12, r13
-        sbb	rax, r11
-        sbb	r10, r12
-        mov	QWORD PTR [rcx+80], rax
-        mov	QWORD PTR [rcx+88], r10
-        mov	r11, QWORD PTR [r9+96]
-        mov	r12, QWORD PTR [r9+104]
-        mov	rax, QWORD PTR [rcx+96]
-        mov	r10, QWORD PTR [rcx+104]
-        pext	r11, r11, r13
-        pext	r12, r12, r13
-        sbb	rax, r11
-        sbb	r10, r12
-        mov	QWORD PTR [rcx+96], rax
-        mov	QWORD PTR [rcx+104], r10
-        mov	r11, QWORD PTR [r9+112]
-        mov	r12, QWORD PTR [r9+120]
-        mov	rax, QWORD PTR [rcx+112]
-        mov	r10, QWORD PTR [rcx+120]
-        pext	r11, r11, r13
-        pext	r12, r12, r13
-        sbb	rax, r11
-        sbb	r10, r12
-        mov	QWORD PTR [rcx+112], rax
-        mov	QWORD PTR [rcx+120], r10
-        pop	r13
-        pop	r12
-        ret
-sp_1024_mont_add_avx2_16 ENDP
-_text ENDS
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-; /* Double a Montgomery form number (r = a + a % m).
-;  *
-;  * r   Result of addition.
-;  * a   Number to double in Montgomery form.
-;  * m   Modulus (prime).
-;  */
-_text SEGMENT READONLY PARA
-sp_1024_mont_dbl_avx2_16 PROC    ; as used below: rcx = r, rdx = a, r8 = m
-        push	r12
-        mov	rax, QWORD PTR [rdx]
-        mov	r9, QWORD PTR [rdx+8]
-        mov	r10, QWORD PTR [rdx+16]
-        mov	r11, QWORD PTR [rdx+24]
-        add	rax, QWORD PTR [rdx]    ; a[i] + a[i]; start of the 16-word carry chain
-        mov	r12, 0    ; mov rather than xor, so the carry flag survives
-        adc	r9, QWORD PTR [rdx+8]
-        adc	r10, QWORD PTR [rdx+16]
-        adc	r11, QWORD PTR [rdx+24]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r9
-        mov	QWORD PTR [rcx+16], r10
-        mov	QWORD PTR [rcx+24], r11
-        mov	rax, QWORD PTR [rdx+32]
-        mov	r9, QWORD PTR [rdx+40]
-        mov	r10, QWORD PTR [rdx+48]
-        mov	r11, QWORD PTR [rdx+56]
-        adc	rax, QWORD PTR [rdx+32]
-        adc	r9, QWORD PTR [rdx+40]
-        adc	r10, QWORD PTR [rdx+48]
-        adc	r11, QWORD PTR [rdx+56]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r9
-        mov	QWORD PTR [rcx+48], r10
-        mov	QWORD PTR [rcx+56], r11
-        mov	rax, QWORD PTR [rdx+64]
-        mov	r9, QWORD PTR [rdx+72]
-        mov	r10, QWORD PTR [rdx+80]
-        mov	r11, QWORD PTR [rdx+88]
-        adc	rax, QWORD PTR [rdx+64]
-        adc	r9, QWORD PTR [rdx+72]
-        adc	r10, QWORD PTR [rdx+80]
-        adc	r11, QWORD PTR [rdx+88]
-        mov	QWORD PTR [rcx+64], rax
-        mov	QWORD PTR [rcx+72], r9
-        mov	QWORD PTR [rcx+80], r10
-        mov	QWORD PTR [rcx+88], r11
-        mov	rax, QWORD PTR [rdx+96]
-        mov	r9, QWORD PTR [rdx+104]
-        mov	r10, QWORD PTR [rdx+112]
-        mov	r11, QWORD PTR [rdx+120]
-        adc	rax, QWORD PTR [rdx+96]
-        adc	r9, QWORD PTR [rdx+104]
-        adc	r10, QWORD PTR [rdx+112]
-        adc	r11, QWORD PTR [rdx+120]
-        mov	QWORD PTR [rcx+96], rax
-        mov	QWORD PTR [rcx+104], r9
-        mov	QWORD PTR [rcx+112], r10
-        mov	QWORD PTR [rcx+120], r11
-        sbb	r12, 0    ; r12 = all ones if the doubling carried out, else 0
-        sub	r11, QWORD PTR [r8+120]    ; compare the top word of the sum with m[15]
-        sbb	r11, r11    ; r11 = all ones if top word < m[15], else 0
-        not	r11    ; r11 = all ones if top word >= m[15]
-        or	r12, r11    ; mask: subtract m on carry or when top word >= m[15]
-        mov	r10, QWORD PTR [r8]
-        mov	r11, QWORD PTR [r8+8]
-        mov	rax, QWORD PTR [rcx]
-        mov	r9, QWORD PTR [rcx+8]
-        pext	r10, r10, r12    ; m[i] when the mask is all ones, 0 when the mask is 0
-        pext	r11, r11, r12
-        sub	rax, r10    ; r -= (m & mask); the borrow chain runs through the remaining words
-        sbb	r9, r11
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r9
-        mov	r10, QWORD PTR [r8+16]
-        mov	r11, QWORD PTR [r8+24]
-        mov	rax, QWORD PTR [rcx+16]
-        mov	r9, QWORD PTR [rcx+24]
-        pext	r10, r10, r12
-        pext	r11, r11, r12
-        sbb	rax, r10
-        sbb	r9, r11
-        mov	QWORD PTR [rcx+16], rax
-        mov	QWORD PTR [rcx+24], r9
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        mov	rax, QWORD PTR [rcx+32]
-        mov	r9, QWORD PTR [rcx+40]
-        pext	r10, r10, r12
-        pext	r11, r11, r12
-        sbb	rax, r10
-        sbb	r9, r11
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r9
-        mov	r10, QWORD PTR [r8+48]
-        mov	r11, QWORD PTR [r8+56]
-        mov	rax, QWORD PTR [rcx+48]
-        mov	r9, QWORD PTR [rcx+56]
-        pext	r10, r10, r12
-        pext	r11, r11, r12
-        sbb	rax, r10
-        sbb	r9, r11
-        mov	QWORD PTR [rcx+48], rax
-        mov	QWORD PTR [rcx+56], r9
-        mov	r10, QWORD PTR [r8+64]
-        mov	r11, QWORD PTR [r8+72]
-        mov	rax, QWORD PTR [rcx+64]
-        mov	r9, QWORD PTR [rcx+72]
-        pext	r10, r10, r12
-        pext	r11, r11, r12
-        sbb	rax, r10
-        sbb	r9, r11
-        mov	QWORD PTR [rcx+64], rax
-        mov	QWORD PTR [rcx+72], r9
-        mov	r10, QWORD PTR [r8+80]
-        mov	r11, QWORD PTR [r8+88]
-        mov	rax, QWORD PTR [rcx+80]
-        mov	r9, QWORD PTR [rcx+88]
-        pext	r10, r10, r12
-        pext	r11, r11, r12
-        sbb	rax, r10
-        sbb	r9, r11
-        mov	QWORD PTR [rcx+80], rax
-        mov	QWORD PTR [rcx+88], r9
-        mov	r10, QWORD PTR [r8+96]
-        mov	r11, QWORD PTR [r8+104]
-        mov	rax, QWORD PTR [rcx+96]
-        mov	r9, QWORD PTR [rcx+104]
-        pext	r10, r10, r12
-        pext	r11, r11, r12
-        sbb	rax, r10
-        sbb	r9, r11
-        mov	QWORD PTR [rcx+96], rax
-        mov	QWORD PTR [rcx+104], r9
-        mov	r10, QWORD PTR [r8+112]
-        mov	r11, QWORD PTR [r8+120]
-        mov	rax, QWORD PTR [rcx+112]
-        mov	r9, QWORD PTR [rcx+120]
-        pext	r10, r10, r12
-        pext	r11, r11, r12
-        sbb	rax, r10
-        sbb	r9, r11
-        mov	QWORD PTR [rcx+112], rax
-        mov	QWORD PTR [rcx+120], r9
-        pop	r12
-        ret
-sp_1024_mont_dbl_avx2_16 ENDP
-_text ENDS
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-; /* Triple a Montgomery form number (r = a + a + a % m).
-;  *
-;  * r   Result of addition.
-;  * a   Number to triple in Montgomery form.
-;  * m   Modulus (prime).
-;  */
-_text SEGMENT READONLY PARA
-sp_1024_mont_tpl_avx2_16 PROC    ; as used below: rcx = r, rdx = a, r8 = m
-        push	r12
-        mov	rax, QWORD PTR [rdx]    ; pass 1: r = a + a, followed by a masked subtract of m
-        mov	r9, QWORD PTR [rdx+8]
-        mov	r10, QWORD PTR [rdx+16]
-        mov	r11, QWORD PTR [rdx+24]
-        add	rax, QWORD PTR [rdx]
-        mov	r12, 0    ; mov rather than xor, so the carry flag survives
-        adc	r9, QWORD PTR [rdx+8]
-        adc	r10, QWORD PTR [rdx+16]
-        adc	r11, QWORD PTR [rdx+24]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r9
-        mov	QWORD PTR [rcx+16], r10
-        mov	QWORD PTR [rcx+24], r11
-        mov	rax, QWORD PTR [rdx+32]
-        mov	r9, QWORD PTR [rdx+40]
-        mov	r10, QWORD PTR [rdx+48]
-        mov	r11, QWORD PTR [rdx+56]
-        adc	rax, QWORD PTR [rdx+32]
-        adc	r9, QWORD PTR [rdx+40]
-        adc	r10, QWORD PTR [rdx+48]
-        adc	r11, QWORD PTR [rdx+56]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r9
-        mov	QWORD PTR [rcx+48], r10
-        mov	QWORD PTR [rcx+56], r11
-        mov	rax, QWORD PTR [rdx+64]
-        mov	r9, QWORD PTR [rdx+72]
-        mov	r10, QWORD PTR [rdx+80]
-        mov	r11, QWORD PTR [rdx+88]
-        adc	rax, QWORD PTR [rdx+64]
-        adc	r9, QWORD PTR [rdx+72]
-        adc	r10, QWORD PTR [rdx+80]
-        adc	r11, QWORD PTR [rdx+88]
-        mov	QWORD PTR [rcx+64], rax
-        mov	QWORD PTR [rcx+72], r9
-        mov	QWORD PTR [rcx+80], r10
-        mov	QWORD PTR [rcx+88], r11
-        mov	rax, QWORD PTR [rdx+96]
-        mov	r9, QWORD PTR [rdx+104]
-        mov	r10, QWORD PTR [rdx+112]
-        mov	r11, QWORD PTR [rdx+120]
-        adc	rax, QWORD PTR [rdx+96]
-        adc	r9, QWORD PTR [rdx+104]
-        adc	r10, QWORD PTR [rdx+112]
-        adc	r11, QWORD PTR [rdx+120]
-        mov	QWORD PTR [rcx+96], rax
-        mov	QWORD PTR [rcx+104], r9
-        mov	QWORD PTR [rcx+112], r10
-        mov	QWORD PTR [rcx+120], r11
-        sbb	r12, 0    ; r12 = all ones if the addition carried out, else 0
-        sub	r11, QWORD PTR [r8+120]    ; compare the top word of the sum with m[15]
-        sbb	r11, r11    ; r11 = all ones if top word < m[15], else 0
-        not	r11    ; r11 = all ones if top word >= m[15]
-        or	r12, r11    ; mask: subtract m on carry or when top word >= m[15]
-        mov	r10, QWORD PTR [r8]
-        mov	r11, QWORD PTR [r8+8]
-        mov	rax, QWORD PTR [rcx]
-        mov	r9, QWORD PTR [rcx+8]
-        pext	r10, r10, r12    ; m[i] when the mask is all ones, 0 when the mask is 0
-        pext	r11, r11, r12
-        sub	rax, r10    ; r -= (m & mask); the borrow chain runs through the remaining words
-        sbb	r9, r11
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r9
-        mov	r10, QWORD PTR [r8+16]
-        mov	r11, QWORD PTR [r8+24]
-        mov	rax, QWORD PTR [rcx+16]
-        mov	r9, QWORD PTR [rcx+24]
-        pext	r10, r10, r12
-        pext	r11, r11, r12
-        sbb	rax, r10
-        sbb	r9, r11
-        mov	QWORD PTR [rcx+16], rax
-        mov	QWORD PTR [rcx+24], r9
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        mov	rax, QWORD PTR [rcx+32]
-        mov	r9, QWORD PTR [rcx+40]
-        pext	r10, r10, r12
-        pext	r11, r11, r12
-        sbb	rax, r10
-        sbb	r9, r11
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r9
-        mov	r10, QWORD PTR [r8+48]
-        mov	r11, QWORD PTR [r8+56]
-        mov	rax, QWORD PTR [rcx+48]
-        mov	r9, QWORD PTR [rcx+56]
-        pext	r10, r10, r12
-        pext	r11, r11, r12
-        sbb	rax, r10
-        sbb	r9, r11
-        mov	QWORD PTR [rcx+48], rax
-        mov	QWORD PTR [rcx+56], r9
-        mov	r10, QWORD PTR [r8+64]
-        mov	r11, QWORD PTR [r8+72]
-        mov	rax, QWORD PTR [rcx+64]
-        mov	r9, QWORD PTR [rcx+72]
-        pext	r10, r10, r12
-        pext	r11, r11, r12
-        sbb	rax, r10
-        sbb	r9, r11
-        mov	QWORD PTR [rcx+64], rax
-        mov	QWORD PTR [rcx+72], r9
-        mov	r10, QWORD PTR [r8+80]
-        mov	r11, QWORD PTR [r8+88]
-        mov	rax, QWORD PTR [rcx+80]
-        mov	r9, QWORD PTR [rcx+88]
-        pext	r10, r10, r12
-        pext	r11, r11, r12
-        sbb	rax, r10
-        sbb	r9, r11
-        mov	QWORD PTR [rcx+80], rax
-        mov	QWORD PTR [rcx+88], r9
-        mov	r10, QWORD PTR [r8+96]
-        mov	r11, QWORD PTR [r8+104]
-        mov	rax, QWORD PTR [rcx+96]
-        mov	r9, QWORD PTR [rcx+104]
-        pext	r10, r10, r12
-        pext	r11, r11, r12
-        sbb	rax, r10
-        sbb	r9, r11
-        mov	QWORD PTR [rcx+96], rax
-        mov	QWORD PTR [rcx+104], r9
-        mov	r10, QWORD PTR [r8+112]
-        mov	r11, QWORD PTR [r8+120]
-        mov	rax, QWORD PTR [rcx+112]
-        mov	r9, QWORD PTR [rcx+120]
-        pext	r10, r10, r12
-        pext	r11, r11, r12
-        sbb	rax, r10
-        sbb	r9, r11
-        mov	QWORD PTR [rcx+112], rax
-        mov	QWORD PTR [rcx+120], r9
-        mov	rax, QWORD PTR [rcx]    ; pass 2: r = r + a, followed by the same masked subtract of m
-        mov	r9, QWORD PTR [rcx+8]
-        mov	r10, QWORD PTR [rcx+16]
-        mov	r11, QWORD PTR [rcx+24]
-        add	rax, QWORD PTR [rdx]
-        mov	r12, 0    ; reset the carry mask without touching the carry flag
-        adc	r9, QWORD PTR [rdx+8]
-        adc	r10, QWORD PTR [rdx+16]
-        adc	r11, QWORD PTR [rdx+24]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r9
-        mov	QWORD PTR [rcx+16], r10
-        mov	QWORD PTR [rcx+24], r11
-        mov	rax, QWORD PTR [rcx+32]
-        mov	r9, QWORD PTR [rcx+40]
-        mov	r10, QWORD PTR [rcx+48]
-        mov	r11, QWORD PTR [rcx+56]
-        adc	rax, QWORD PTR [rdx+32]
-        adc	r9, QWORD PTR [rdx+40]
-        adc	r10, QWORD PTR [rdx+48]
-        adc	r11, QWORD PTR [rdx+56]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r9
-        mov	QWORD PTR [rcx+48], r10
-        mov	QWORD PTR [rcx+56], r11
-        mov	rax, QWORD PTR [rcx+64]
-        mov	r9, QWORD PTR [rcx+72]
-        mov	r10, QWORD PTR [rcx+80]
-        mov	r11, QWORD PTR [rcx+88]
-        adc	rax, QWORD PTR [rdx+64]
-        adc	r9, QWORD PTR [rdx+72]
-        adc	r10, QWORD PTR [rdx+80]
-        adc	r11, QWORD PTR [rdx+88]
-        mov	QWORD PTR [rcx+64], rax
-        mov	QWORD PTR [rcx+72], r9
-        mov	QWORD PTR [rcx+80], r10
-        mov	QWORD PTR [rcx+88], r11
-        mov	rax, QWORD PTR [rcx+96]
-        mov	r9, QWORD PTR [rcx+104]
-        mov	r10, QWORD PTR [rcx+112]
-        mov	r11, QWORD PTR [rcx+120]
-        adc	rax, QWORD PTR [rdx+96]
-        adc	r9, QWORD PTR [rdx+104]
-        adc	r10, QWORD PTR [rdx+112]
-        adc	r11, QWORD PTR [rdx+120]
-        mov	QWORD PTR [rcx+96], rax
-        mov	QWORD PTR [rcx+104], r9
-        mov	QWORD PTR [rcx+112], r10
-        mov	QWORD PTR [rcx+120], r11
-        sbb	r12, 0
-        sub	r11, QWORD PTR [r8+120]
-        sbb	r11, r11
-        not	r11
-        or	r12, r11
-        mov	r10, QWORD PTR [r8]
-        mov	r11, QWORD PTR [r8+8]
-        mov	rax, QWORD PTR [rcx]
-        mov	r9, QWORD PTR [rcx+8]
-        pext	r10, r10, r12
-        pext	r11, r11, r12
-        sub	rax, r10
-        sbb	r9, r11
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r9
-        mov	r10, QWORD PTR [r8+16]
-        mov	r11, QWORD PTR [r8+24]
-        mov	rax, QWORD PTR [rcx+16]
-        mov	r9, QWORD PTR [rcx+24]
-        pext	r10, r10, r12
-        pext	r11, r11, r12
-        sbb	rax, r10
-        sbb	r9, r11
-        mov	QWORD PTR [rcx+16], rax
-        mov	QWORD PTR [rcx+24], r9
-        mov	r10, QWORD PTR [r8+32]
-        mov	r11, QWORD PTR [r8+40]
-        mov	rax, QWORD PTR [rcx+32]
-        mov	r9, QWORD PTR [rcx+40]
-        pext	r10, r10, r12
-        pext	r11, r11, r12
-        sbb	rax, r10
-        sbb	r9, r11
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r9
-        mov	r10, QWORD PTR [r8+48]
-        mov	r11, QWORD PTR [r8+56]
-        mov	rax, QWORD PTR [rcx+48]
-        mov	r9, QWORD PTR [rcx+56]
-        pext	r10, r10, r12
-        pext	r11, r11, r12
-        sbb	rax, r10
-        sbb	r9, r11
-        mov	QWORD PTR [rcx+48], rax
-        mov	QWORD PTR [rcx+56], r9
-        mov	r10, QWORD PTR [r8+64]
-        mov	r11, QWORD PTR [r8+72]
-        mov	rax, QWORD PTR [rcx+64]
-        mov	r9, QWORD PTR [rcx+72]
-        pext	r10, r10, r12
-        pext	r11, r11, r12
-        sbb	rax, r10
-        sbb	r9, r11
-        mov	QWORD PTR [rcx+64], rax
-        mov	QWORD PTR [rcx+72], r9
-        mov	r10, QWORD PTR [r8+80]
-        mov	r11, QWORD PTR [r8+88]
-        mov	rax, QWORD PTR [rcx+80]
-        mov	r9, QWORD PTR [rcx+88]
-        pext	r10, r10, r12
-        pext	r11, r11, r12
-        sbb	rax, r10
-        sbb	r9, r11
-        mov	QWORD PTR [rcx+80], rax
-        mov	QWORD PTR [rcx+88], r9
-        mov	r10, QWORD PTR [r8+96]
-        mov	r11, QWORD PTR [r8+104]
-        mov	rax, QWORD PTR [rcx+96]
-        mov	r9, QWORD PTR [rcx+104]
-        pext	r10, r10, r12
-        pext	r11, r11, r12
-        sbb	rax, r10
-        sbb	r9, r11
-        mov	QWORD PTR [rcx+96], rax
-        mov	QWORD PTR [rcx+104], r9
-        mov	r10, QWORD PTR [r8+112]
-        mov	r11, QWORD PTR [r8+120]
-        mov	rax, QWORD PTR [rcx+112]
-        mov	r9, QWORD PTR [rcx+120]
-        pext	r10, r10, r12
-        pext	r11, r11, r12
-        sbb	rax, r10
-        sbb	r9, r11
-        mov	QWORD PTR [rcx+112], rax
-        mov	QWORD PTR [rcx+120], r9
-        pop	r12
-        ret
-sp_1024_mont_tpl_avx2_16 ENDP
-_text ENDS
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-; /* Subtract two Montgomery form numbers (r = a - b % m).
-;  * 16 64-bit limbs; if a - b borrows, m is added back using a mask (no branch).
-;  * r   Result of subtraction (passed in rcx).
-;  * a   Number to subtract from, in Montgomery form (passed in rdx).
-;  * b   Number to subtract, in Montgomery form (passed in r8).
-;  * m   Modulus (prime) (passed in r9).
-;  */
-_text SEGMENT READONLY PARA
-sp_1024_mont_sub_avx2_16 PROC
-        push	r12  ; r12/r13 are used as scratch below and restored before ret
-        push	r13
-        mov	rax, QWORD PTR [rdx]
-        mov	r10, QWORD PTR [rdx+8]
-        mov	r11, QWORD PTR [rdx+16]
-        mov	r12, QWORD PTR [rdx+24]
-        sub	rax, QWORD PTR [r8]  ; limb 0 starts the borrow chain for r = a - b
-        mov	r13, 0  ; mov leaves flags untouched, so the borrow chain is not broken
-        sbb	r10, QWORD PTR [r8+8]
-        sbb	r11, QWORD PTR [r8+16]
-        sbb	r12, QWORD PTR [r8+24]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r10
-        mov	QWORD PTR [rcx+16], r11
-        mov	QWORD PTR [rcx+24], r12
-        mov	rax, QWORD PTR [rdx+32]
-        mov	r10, QWORD PTR [rdx+40]
-        mov	r11, QWORD PTR [rdx+48]
-        mov	r12, QWORD PTR [rdx+56]
-        sbb	rax, QWORD PTR [r8+32]
-        sbb	r10, QWORD PTR [r8+40]
-        sbb	r11, QWORD PTR [r8+48]
-        sbb	r12, QWORD PTR [r8+56]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r10
-        mov	QWORD PTR [rcx+48], r11
-        mov	QWORD PTR [rcx+56], r12
-        mov	rax, QWORD PTR [rdx+64]
-        mov	r10, QWORD PTR [rdx+72]
-        mov	r11, QWORD PTR [rdx+80]
-        mov	r12, QWORD PTR [rdx+88]
-        sbb	rax, QWORD PTR [r8+64]
-        sbb	r10, QWORD PTR [r8+72]
-        sbb	r11, QWORD PTR [r8+80]
-        sbb	r12, QWORD PTR [r8+88]
-        mov	QWORD PTR [rcx+64], rax
-        mov	QWORD PTR [rcx+72], r10
-        mov	QWORD PTR [rcx+80], r11
-        mov	QWORD PTR [rcx+88], r12
-        mov	rax, QWORD PTR [rdx+96]
-        mov	r10, QWORD PTR [rdx+104]
-        mov	r11, QWORD PTR [rdx+112]
-        mov	r12, QWORD PTR [rdx+120]
-        sbb	rax, QWORD PTR [r8+96]
-        sbb	r10, QWORD PTR [r8+104]
-        sbb	r11, QWORD PTR [r8+112]
-        sbb	r12, QWORD PTR [r8+120]
-        mov	QWORD PTR [rcx+96], rax
-        mov	QWORD PTR [rcx+104], r10
-        mov	QWORD PTR [rcx+112], r11
-        mov	QWORD PTR [rcx+120], r12
-        sbb	r13, 0  ; r13 = 0 - borrow: all ones if a - b borrowed, otherwise 0
-        mov	r11, QWORD PTR [r9]
-        mov	r12, QWORD PTR [r9+8]
-        mov	rax, QWORD PTR [rcx]
-        mov	r10, QWORD PTR [rcx+8]
-        pext	r11, r11, r13  ; mask is 0 or all ones, so this yields 0 or the limb of m unchanged
-        pext	r12, r12, r13
-        add	rax, r11  ; limb 0 starts the carry chain for r += (m under mask)
-        adc	r10, r12
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r10
-        mov	r11, QWORD PTR [r9+16]
-        mov	r12, QWORD PTR [r9+24]
-        mov	rax, QWORD PTR [rcx+16]
-        mov	r10, QWORD PTR [rcx+24]
-        pext	r11, r11, r13  ; mov and pext do not modify flags, so the carry from the previous adc survives
-        pext	r12, r12, r13
-        adc	rax, r11
-        adc	r10, r12
-        mov	QWORD PTR [rcx+16], rax
-        mov	QWORD PTR [rcx+24], r10
-        mov	r11, QWORD PTR [r9+32]
-        mov	r12, QWORD PTR [r9+40]
-        mov	rax, QWORD PTR [rcx+32]
-        mov	r10, QWORD PTR [rcx+40]
-        pext	r11, r11, r13
-        pext	r12, r12, r13
-        adc	rax, r11
-        adc	r10, r12
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r10
-        mov	r11, QWORD PTR [r9+48]
-        mov	r12, QWORD PTR [r9+56]
-        mov	rax, QWORD PTR [rcx+48]
-        mov	r10, QWORD PTR [rcx+56]
-        pext	r11, r11, r13
-        pext	r12, r12, r13
-        adc	rax, r11
-        adc	r10, r12
-        mov	QWORD PTR [rcx+48], rax
-        mov	QWORD PTR [rcx+56], r10
-        mov	r11, QWORD PTR [r9+64]
-        mov	r12, QWORD PTR [r9+72]
-        mov	rax, QWORD PTR [rcx+64]
-        mov	r10, QWORD PTR [rcx+72]
-        pext	r11, r11, r13
-        pext	r12, r12, r13
-        adc	rax, r11
-        adc	r10, r12
-        mov	QWORD PTR [rcx+64], rax
-        mov	QWORD PTR [rcx+72], r10
-        mov	r11, QWORD PTR [r9+80]
-        mov	r12, QWORD PTR [r9+88]
-        mov	rax, QWORD PTR [rcx+80]
-        mov	r10, QWORD PTR [rcx+88]
-        pext	r11, r11, r13
-        pext	r12, r12, r13
-        adc	rax, r11
-        adc	r10, r12
-        mov	QWORD PTR [rcx+80], rax
-        mov	QWORD PTR [rcx+88], r10
-        mov	r11, QWORD PTR [r9+96]
-        mov	r12, QWORD PTR [r9+104]
-        mov	rax, QWORD PTR [rcx+96]
-        mov	r10, QWORD PTR [rcx+104]
-        pext	r11, r11, r13
-        pext	r12, r12, r13
-        adc	rax, r11
-        adc	r10, r12
-        mov	QWORD PTR [rcx+96], rax
-        mov	QWORD PTR [rcx+104], r10
-        mov	r11, QWORD PTR [r9+112]
-        mov	r12, QWORD PTR [r9+120]
-        mov	rax, QWORD PTR [rcx+112]
-        mov	r10, QWORD PTR [rcx+120]
-        pext	r11, r11, r13
-        pext	r12, r12, r13
-        adc	rax, r11
-        adc	r10, r12  ; the carry out of the top limb is not used
-        mov	QWORD PTR [rcx+112], rax
-        mov	QWORD PTR [rcx+120], r10
-        pop	r13
-        pop	r12
-        ret
-sp_1024_mont_sub_avx2_16 ENDP
-_text ENDS
-ENDIF
-IFDEF HAVE_INTEL_AVX2
-; /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m)
-;  * If a is odd, m is added first (selected by mask); the 16-limb sum plus carry is then shifted right one bit.
-;  * r  Result of division by 2 (passed in rcx).
-;  * a  Number to divide (passed in rdx).
-;  * m  Modulus (prime) (passed in r8).
-;  */
-_text SEGMENT READONLY PARA
-sp_1024_mont_div2_avx2_16 PROC
-        push	r12  ; r12/r13 are used as scratch below and restored before ret
-        push	r13
-        mov	r13, QWORD PTR [rdx]
-        xor	r12, r12  ; r12 will receive the carry out of the addition
-        mov	r10, r13  ; this copy is never read: r10 is reloaded from [rdx] before its first use
-        and	r13, 1
-        neg	r13  ; r13 = all ones if a is odd, otherwise 0
-        mov	rax, QWORD PTR [r8]
-        mov	r9, QWORD PTR [r8+8]
-        mov	r10, QWORD PTR [rdx]
-        mov	r11, QWORD PTR [rdx+8]
-        pext	rax, rax, r13  ; mask is 0 or all ones, so this yields 0 or the limb of m unchanged
-        pext	r9, r9, r13
-        add	r10, rax  ; limb 0 starts the carry chain for r = a + (m under mask)
-        adc	r11, r9
-        mov	QWORD PTR [rcx], r10
-        mov	QWORD PTR [rcx+8], r11
-        mov	rax, QWORD PTR [r8+16]
-        mov	r9, QWORD PTR [r8+24]
-        mov	r10, QWORD PTR [rdx+16]
-        mov	r11, QWORD PTR [rdx+24]
-        pext	rax, rax, r13  ; mov and pext do not modify flags, so the carry from the previous adc survives
-        pext	r9, r9, r13
-        adc	r10, rax
-        adc	r11, r9
-        mov	QWORD PTR [rcx+16], r10
-        mov	QWORD PTR [rcx+24], r11
-        mov	rax, QWORD PTR [r8+32]
-        mov	r9, QWORD PTR [r8+40]
-        mov	r10, QWORD PTR [rdx+32]
-        mov	r11, QWORD PTR [rdx+40]
-        pext	rax, rax, r13
-        pext	r9, r9, r13
-        adc	r10, rax
-        adc	r11, r9
-        mov	QWORD PTR [rcx+32], r10
-        mov	QWORD PTR [rcx+40], r11
-        mov	rax, QWORD PTR [r8+48]
-        mov	r9, QWORD PTR [r8+56]
-        mov	r10, QWORD PTR [rdx+48]
-        mov	r11, QWORD PTR [rdx+56]
-        pext	rax, rax, r13
-        pext	r9, r9, r13
-        adc	r10, rax
-        adc	r11, r9
-        mov	QWORD PTR [rcx+48], r10
-        mov	QWORD PTR [rcx+56], r11
-        mov	rax, QWORD PTR [r8+64]
-        mov	r9, QWORD PTR [r8+72]
-        mov	r10, QWORD PTR [rdx+64]
-        mov	r11, QWORD PTR [rdx+72]
-        pext	rax, rax, r13
-        pext	r9, r9, r13
-        adc	r10, rax
-        adc	r11, r9
-        mov	QWORD PTR [rcx+64], r10
-        mov	QWORD PTR [rcx+72], r11
-        mov	rax, QWORD PTR [r8+80]
-        mov	r9, QWORD PTR [r8+88]
-        mov	r10, QWORD PTR [rdx+80]
-        mov	r11, QWORD PTR [rdx+88]
-        pext	rax, rax, r13
-        pext	r9, r9, r13
-        adc	r10, rax
-        adc	r11, r9
-        mov	QWORD PTR [rcx+80], r10
-        mov	QWORD PTR [rcx+88], r11
-        mov	rax, QWORD PTR [r8+96]
-        mov	r9, QWORD PTR [r8+104]
-        mov	r10, QWORD PTR [rdx+96]
-        mov	r11, QWORD PTR [rdx+104]
-        pext	rax, rax, r13
-        pext	r9, r9, r13
-        adc	r10, rax
-        adc	r11, r9
-        mov	QWORD PTR [rcx+96], r10
-        mov	QWORD PTR [rcx+104], r11
-        mov	rax, QWORD PTR [r8+112]
-        mov	r9, QWORD PTR [r8+120]
-        mov	r10, QWORD PTR [rdx+112]
-        mov	r11, QWORD PTR [rdx+120]
-        pext	rax, rax, r13
-        pext	r9, r9, r13
-        adc	r10, rax
-        adc	r11, r9
-        mov	QWORD PTR [rcx+112], r10
-        mov	QWORD PTR [rcx+120], r11
-        adc	r12, 0  ; r12 = carry out of the top limb (0 or 1)
-        mov	r10, QWORD PTR [rcx]
-        mov	r11, QWORD PTR [rcx+8]
-        shrd	r10, r11, 1  ; limb >>= 1, with bit 0 of the next higher limb shifted into bit 63
-        mov	QWORD PTR [rcx], r10
-        mov	r10, QWORD PTR [rcx+16]  ; r10 and r11 alternate as current/next limb up the ladder
-        shrd	r11, r10, 1
-        mov	QWORD PTR [rcx+8], r11
-        mov	r11, QWORD PTR [rcx+24]
-        shrd	r10, r11, 1
-        mov	QWORD PTR [rcx+16], r10
-        mov	r10, QWORD PTR [rcx+32]
-        shrd	r11, r10, 1
-        mov	QWORD PTR [rcx+24], r11
-        mov	r11, QWORD PTR [rcx+40]
-        shrd	r10, r11, 1
-        mov	QWORD PTR [rcx+32], r10
-        mov	r10, QWORD PTR [rcx+48]
-        shrd	r11, r10, 1
-        mov	QWORD PTR [rcx+40], r11
-        mov	r11, QWORD PTR [rcx+56]
-        shrd	r10, r11, 1
-        mov	QWORD PTR [rcx+48], r10
-        mov	r10, QWORD PTR [rcx+64]
-        shrd	r11, r10, 1
-        mov	QWORD PTR [rcx+56], r11
-        mov	r11, QWORD PTR [rcx+72]
-        shrd	r10, r11, 1
-        mov	QWORD PTR [rcx+64], r10
-        mov	r10, QWORD PTR [rcx+80]
-        shrd	r11, r10, 1
-        mov	QWORD PTR [rcx+72], r11
-        mov	r11, QWORD PTR [rcx+88]
-        shrd	r10, r11, 1
-        mov	QWORD PTR [rcx+80], r10
-        mov	r10, QWORD PTR [rcx+96]
-        shrd	r11, r10, 1
-        mov	QWORD PTR [rcx+88], r11
-        mov	r11, QWORD PTR [rcx+104]
-        shrd	r10, r11, 1
-        mov	QWORD PTR [rcx+96], r10
-        mov	r10, QWORD PTR [rcx+112]
-        shrd	r11, r10, 1
-        mov	QWORD PTR [rcx+104], r11
-        mov	r11, QWORD PTR [rcx+120]
-        shrd	r10, r11, 1
-        mov	QWORD PTR [rcx+112], r10
-        shrd	r11, r12, 1  ; the saved carry becomes bit 63 of the top limb
-        mov	QWORD PTR [rcx+120], r11
-        pop	r13
-        pop	r12
-        ret
-sp_1024_mont_div2_avx2_16 ENDP
-_text ENDS
-ENDIF
-; /* Read big endian unsigned byte array into r.
-;  * Uses the bswap instruction.
-;  * Limbs are written least significant first; the output is zero-filled up to r + 128 bytes.
-;  * r  A single precision integer (passed in rcx).
-;  * size  Maximum number of bytes to convert (rdx is not read by this routine).
-;  * a  Byte array (passed in r8).
-;  * n  Number of bytes in array to read (passed in r9).
-;  */
-_text SEGMENT READONLY PARA
-sp_1024_from_bin_bswap PROC
-        push	r12  ; r12/r13 are used as scratch below and restored before ret
-        push	r13
-        mov	r11, r8
-        mov	r12, rcx
-        add	r11, r9  ; r11 = a + n, one past the last (least significant) input byte
-        add	r12, 128  ; r12 = r + 128, the end of the output buffer
-        xor	r13, r13  ; r13 = 0, reused as the zero constant below
-        jmp	L_1024_from_bin_bswap_64_end
-L_1024_from_bin_bswap_64_start:
-        sub	r11, 64  ; step back over the next 64 input bytes
-        mov	rax, QWORD PTR [r11+56]  ; the last 8 bytes of the chunk become the lowest limb
-        mov	r10, QWORD PTR [r11+48]
-        bswap	rax
-        bswap	r10
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r10
-        mov	rax, QWORD PTR [r11+40]
-        mov	r10, QWORD PTR [r11+32]
-        bswap	rax
-        bswap	r10
-        mov	QWORD PTR [rcx+16], rax
-        mov	QWORD PTR [rcx+24], r10
-        mov	rax, QWORD PTR [r11+24]
-        mov	r10, QWORD PTR [r11+16]
-        bswap	rax
-        bswap	r10
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r10
-        mov	rax, QWORD PTR [r11+8]
-        mov	r10, QWORD PTR [r11]
-        bswap	rax
-        bswap	r10
-        mov	QWORD PTR [rcx+48], rax
-        mov	QWORD PTR [rcx+56], r10
-        add	rcx, 64
-        sub	r9, 64
-L_1024_from_bin_bswap_64_end:
-        cmp	r9, 63
-        jg	L_1024_from_bin_bswap_64_start  ; signed compare: loop while at least 64 bytes remain
-        jmp	L_1024_from_bin_bswap_8_end
-L_1024_from_bin_bswap_8_start:
-        sub	r11, 8
-        mov	rax, QWORD PTR [r11]
-        bswap	rax
-        mov	QWORD PTR [rcx], rax
-        add	rcx, 8
-        sub	r9, 8
-L_1024_from_bin_bswap_8_end:
-        cmp	r9, 7
-        jg	L_1024_from_bin_bswap_8_start  ; loop while at least 8 bytes remain
-        cmp	r9, r13
-        je	L_1024_from_bin_bswap_hi_end  ; no partial limb when the remaining count is 0
-        mov	r10, r13  ; r10 accumulates the partial top limb
-        mov	rax, r13  ; clear rax so that writing al below leaves rax = the byte value
-L_1024_from_bin_bswap_hi_start:
-        mov	al, BYTE PTR [r8]  ; remaining bytes are read forward from the start of a
-        shl	r10, 8
-        inc	r8
-        add	r10, rax
-        dec	r9
-        jg	L_1024_from_bin_bswap_hi_start  ; loop while the decremented count is still > 0
-        mov	QWORD PTR [rcx], r10
-        add	rcx, 8
-L_1024_from_bin_bswap_hi_end:
-        cmp	rcx, r12
-        jge	L_1024_from_bin_bswap_zero_end  ; skip the fill when the output pointer already reached r + 128
-L_1024_from_bin_bswap_zero_start:
-        mov	QWORD PTR [rcx], r13  ; zero-fill the remaining limbs
-        add	rcx, 8
-        cmp	rcx, r12
-        jl	L_1024_from_bin_bswap_zero_start
-L_1024_from_bin_bswap_zero_end:
-        pop	r13
-        pop	r12
-        ret
-sp_1024_from_bin_bswap ENDP
-_text ENDS
-IFNDEF NO_MOVBE_SUPPORT
-; /* Read big endian unsigned byte array into r.
-;  * Uses the movbe instruction which is an optional instruction.
-;  * Limbs are written least significant first; the output is zero-filled up to r + 128 bytes.
-;  * r  A single precision integer (passed in rcx).
-;  * size  Maximum number of bytes to convert (rdx is not read by this routine).
-;  * a  Byte array (passed in r8).
-;  * n  Number of bytes in array to read (passed in r9).
-;  */
-_text SEGMENT READONLY PARA
-sp_1024_from_bin_movbe PROC
-        push	r12  ; r12 is used as scratch below and restored before ret
-        mov	r11, r8
-        mov	r12, rcx
-        add	r11, r9  ; r11 = a + n, one past the last (least significant) input byte
-        add	r12, 128  ; r12 = r + 128, the end of the output buffer
-        jmp	L_1024_from_bin_movbe_64_end
-L_1024_from_bin_movbe_64_start:
-        sub	r11, 64  ; step back over the next 64 input bytes
-        movbe	rax, QWORD PTR [r11+56]  ; byte-swapping load; the last 8 bytes of the chunk become the lowest limb
-        movbe	r10, QWORD PTR [r11+48]
-        mov	QWORD PTR [rcx], rax
-        mov	QWORD PTR [rcx+8], r10
-        movbe	rax, QWORD PTR [r11+40]
-        movbe	r10, QWORD PTR [r11+32]
-        mov	QWORD PTR [rcx+16], rax
-        mov	QWORD PTR [rcx+24], r10
-        movbe	rax, QWORD PTR [r11+24]
-        movbe	r10, QWORD PTR [r11+16]
-        mov	QWORD PTR [rcx+32], rax
-        mov	QWORD PTR [rcx+40], r10
-        movbe	rax, QWORD PTR [r11+8]
-        movbe	r10, QWORD PTR [r11]
-        mov	QWORD PTR [rcx+48], rax
-        mov	QWORD PTR [rcx+56], r10
-        add	rcx, 64
-        sub	r9, 64
-L_1024_from_bin_movbe_64_end:
-        cmp	r9, 63
-        jg	L_1024_from_bin_movbe_64_start  ; signed compare: loop while at least 64 bytes remain
-        jmp	L_1024_from_bin_movbe_8_end
-L_1024_from_bin_movbe_8_start:
-        sub	r11, 8
-        movbe	rax, QWORD PTR [r11]
-        mov	QWORD PTR [rcx], rax
-        add	rcx, 8
-        sub	r9, 8
-L_1024_from_bin_movbe_8_end:
-        cmp	r9, 7
-        jg	L_1024_from_bin_movbe_8_start  ; loop while at least 8 bytes remain
-        cmp	r9, 0
-        je	L_1024_from_bin_movbe_hi_end  ; no partial limb when the remaining count is 0
-        mov	r10, 0  ; r10 accumulates the partial top limb
-        mov	rax, 0  ; clear rax so that writing al below leaves rax = the byte value
-L_1024_from_bin_movbe_hi_start:
-        mov	al, BYTE PTR [r8]  ; remaining bytes are read forward from the start of a
-        shl	r10, 8
-        inc	r8
-        add	r10, rax
-        dec	r9
-        jg	L_1024_from_bin_movbe_hi_start  ; loop while the decremented count is still > 0
-        mov	QWORD PTR [rcx], r10
-        add	rcx, 8
-L_1024_from_bin_movbe_hi_end:
-        cmp	rcx, r12
-        jge	L_1024_from_bin_movbe_zero_end  ; skip the fill when the output pointer already reached r + 128
-L_1024_from_bin_movbe_zero_start:
-        mov	QWORD PTR [rcx], 0  ; zero-fill the remaining limbs
-        add	rcx, 8
-        cmp	rcx, r12
-        jl	L_1024_from_bin_movbe_zero_start
-L_1024_from_bin_movbe_zero_end:
-        pop	r12
-        ret
-sp_1024_from_bin_movbe ENDP
-_text ENDS
-ENDIF
-ENDIF
-END

+ 0 - 33
lib/wolfssl/wolfssl/include.am

@@ -1,33 +0,0 @@
-# vim:ft=automake
-# All paths should be given relative to the root
-#
-
-include wolfssl/wolfcrypt/include.am
-include wolfssl/openssl/include.am
-
-EXTRA_DIST+= wolfssl/sniffer_error.rc
-
-nobase_include_HEADERS+= \
-                         wolfssl/error-ssl.h \
-                         wolfssl/ssl.h \
-                         wolfssl/sniffer_error.h \
-                         wolfssl/sniffer.h \
-                         wolfssl/callbacks.h \
-                         wolfssl/certs_test.h \
-                         wolfssl/test.h \
-                         wolfssl/version.h \
-                         wolfssl/ocsp.h \
-                         wolfssl/quic.h \
-                         wolfssl/crl.h \
-                         wolfssl/wolfio.h
-
-noinst_HEADERS+= \
-                         wolfssl/internal.h
-
-# For distro build don't install options.h.
-# It depends on the architecture and conflicts with Multi-Arch.
-if BUILD_DISTRO
-noinst_HEADERS+=         wolfssl/options.h
-else
-nobase_include_HEADERS+= wolfssl/options.h
-endif

+ 0 - 123
lib/wolfssl/wolfssl/sniffer_error.rc

@@ -1,123 +0,0 @@
-
-STRINGTABLE
-{
-    1, "Out of Memory"
-    2, "New SSL Sniffer Server Registered"
-    3, "Checking IP Header"
-    4, "SSL Sniffer Server Not Registered"
-    5, "Checking TCP Header"
-
-    6, "SSL Sniffer Server Port Not Registered"
-    7, "RSA Private Decrypt Error"
-    8, "RSA Private Decode Error"
-    9, "Set Cipher Spec Error"
-    10, "Server Hello Input Malformed"
-
-    11, "Couldn't Resume Session Error"
-    12, "Server Did Resumption"
-    13, "Client Hello Input Malformed"
-    14, "Client Trying to Resume"
-    15, "Handshake Input Malformed"
-
-    16, "Got Hello Verify msg"
-    17, "Got Server Hello msg"
-    18, "Got Cert Request msg"
-    19, "Got Server Key Exchange msg"
-    20, "Got Cert msg"
-
-    21, "Got Server Hello Done msg"
-    22, "Got Finished msg"
-    23, "Got Client Hello msg"
-    24, "Got Client Key Exchange msg"
-    25, "Got Cert Verify msg"
-
-    26, "Got Unknown Handshake msg"
-    27, "New SSL Sniffer Session created"
-    28, "Couldn't create new SSL"
-    29, "Got a Packet to decode"
-    30, "No data present"
-
-    31, "Session Not Found"
-    32, "Got an Old Client Hello msg"
-    33, "Old Client Hello Input Malformed"
-    34, "Old Client Hello OK"
-    35, "Bad Old Client Hello"
-
-    36, "Bad Record Header"
-    37, "Record Header Input Malformed"
-    38, "Got a HandShake msg"
-    39, "Bad HandShake msg"
-    40, "Got a Change Cipher Spec msg"
-
-    41, "Got Application Data msg"
-    42, "Bad Application Data"
-    43, "Got an Alert msg"
-    44, "Another msg to Process"
-    45, "Removing Session From Table"
-
-    46, "Bad Key File"
-    47, "Wrong IP Version"
-    48, "Wrong Protocol type"
-    49, "Packet Short for header processing"
-    50, "Got Unknown Record Type"
-
-    51, "Can't Open Trace File"
-    52, "Session in Fatal Error State"
-    53, "Partial SSL record received"
-    54, "Buffer Error, malformed input"
-    55, "Added to Partial Input"
-
-    56, "Received a Duplicate Packet"
-    57, "Received an Out of Order Packet"
-    58, "Received an Overlap Duplicate Packet"
-    59, "Received an Overlap Reassembly Begin Duplicate Packet"
-    60, "Received an Overlap Reassembly End Duplicate Packet"
-
-    61, "Missed the Client Hello Entirely"
-    62, "Got Hello Request msg"
-    63, "Got Session Ticket msg"
-    64, "Bad Input"
-    65, "Bad Decrypt Type"
-
-    66, "Bad Finished Message Processing"
-    67, "Bad Compression Type"
-    68, "Bad DeriveKeys Error"
-    69, "Saw ACK for Missing Packet Error"
-    70, "Bad Decrypt Operation"
-
-    71, "Decrypt Keys Not Set Up"
-    72, "Late Key Load Error"
-    73, "Got Certificate Status msg"
-    74, "RSA Key Missing Error"
-    75, "Secure Renegotiation Not Supported"
-
-    76, "Get Session Stats Failure"
-    77, "Reassembly Buffer Size Exceeded"
-    78, "Dropping Lost Fragment"
-    79, "Dropping Partial Record"
-    80, "Clear ACK Fault"
-
-    81, "Bad Decrypt Size"
-    82, "Extended Master Secret Hash Error"
-    83, "Handshake Message Split Across TLS Records"
-    84, "ECC Private Decode Error"
-    85, "ECC Public Decode Error"
-
-    86, "Watch callback not set"
-    87, "Watch hash failed"
-    88, "Watch callback failed"
-    89, "Bad Certificate Message"
-    90, "Store data callback not set"
-
-    91, "No data destination Error"
-    92, "Store Data callback failed"
-    93, "Loading chain input"
-    94, "Got encrypted extension"
-    95, "Got Hello Retry Request"
-
-    96, "Setting up keys"
-    97, "Unsupported TLS Version"
-    98, "Server Client Key Mismatch"
-
-    99, "Invalid or missing keylog file"
-}

+ 0 - 220
lib/wolfssl/wolfssl/wolfcrypt/include.am

@@ -1,220 +0,0 @@
-# vim:ft=automake
-# All paths should be given relative to the root
-
-nobase_include_HEADERS+= \
-                         wolfssl/wolfcrypt/aes.h \
-                         wolfssl/wolfcrypt/arc4.h \
-                         wolfssl/wolfcrypt/asn.h \
-                         wolfssl/wolfcrypt/asn_public.h \
-                         wolfssl/wolfcrypt/poly1305.h \
-                         wolfssl/wolfcrypt/camellia.h \
-                         wolfssl/wolfcrypt/cmac.h \
-                         wolfssl/wolfcrypt/coding.h \
-                         wolfssl/wolfcrypt/compress.h \
-                         wolfssl/wolfcrypt/des3.h \
-                         wolfssl/wolfcrypt/dh.h \
-                         wolfssl/wolfcrypt/dsa.h \
-                         wolfssl/wolfcrypt/ecc.h \
-                         wolfssl/wolfcrypt/curve25519.h \
-                         wolfssl/wolfcrypt/ed25519.h \
-                         wolfssl/wolfcrypt/fe_operations.h \
-                         wolfssl/wolfcrypt/ge_operations.h \
-                         wolfssl/wolfcrypt/curve448.h \
-                         wolfssl/wolfcrypt/ed448.h \
-                         wolfssl/wolfcrypt/falcon.h \
-                         wolfssl/wolfcrypt/dilithium.h \
-                         wolfssl/wolfcrypt/sphincs.h \
-                         wolfssl/wolfcrypt/fe_448.h \
-                         wolfssl/wolfcrypt/ge_448.h \
-                         wolfssl/wolfcrypt/eccsi.h \
-                         wolfssl/wolfcrypt/sakke.h \
-                         wolfssl/wolfcrypt/error-crypt.h \
-                         wolfssl/wolfcrypt/fips_test.h \
-                         wolfssl/wolfcrypt/hash.h \
-                         wolfssl/wolfcrypt/hmac.h \
-                         wolfssl/wolfcrypt/hpke.h \
-                         wolfssl/wolfcrypt/kdf.h \
-                         wolfssl/wolfcrypt/integer.h \
-                         wolfssl/wolfcrypt/md2.h \
-                         wolfssl/wolfcrypt/md4.h \
-                         wolfssl/wolfcrypt/md5.h \
-                         wolfssl/wolfcrypt/misc.h \
-                         wolfssl/wolfcrypt/pkcs7.h \
-                         wolfssl/wolfcrypt/wc_encrypt.h \
-                         wolfssl/wolfcrypt/wc_port.h \
-                         wolfssl/wolfcrypt/pwdbased.h \
-                         wolfssl/wolfcrypt/chacha.h \
-                         wolfssl/wolfcrypt/chacha20_poly1305.h \
-                         wolfssl/wolfcrypt/random.h \
-                         wolfssl/wolfcrypt/ripemd.h \
-                         wolfssl/wolfcrypt/rsa.h \
-                         wolfssl/wolfcrypt/rc2.h \
-                         wolfssl/wolfcrypt/settings.h \
-                         wolfssl/wolfcrypt/sha256.h \
-                         wolfssl/wolfcrypt/sha512.h \
-                         wolfssl/wolfcrypt/sha.h \
-                         wolfssl/wolfcrypt/signature.h \
-                         wolfssl/wolfcrypt/blake2.h \
-                         wolfssl/wolfcrypt/blake2-int.h \
-                         wolfssl/wolfcrypt/blake2-impl.h \
-                         wolfssl/wolfcrypt/tfm.h \
-                         wolfssl/wolfcrypt/srp.h \
-                         wolfssl/wolfcrypt/types.h \
-                         wolfssl/wolfcrypt/visibility.h \
-                         wolfssl/wolfcrypt/logging.h \
-                         wolfssl/wolfcrypt/memory.h \
-                         wolfssl/wolfcrypt/mpi_class.h \
-                         wolfssl/wolfcrypt/mpi_superclass.h \
-                         wolfssl/wolfcrypt/mem_track.h \
-                         wolfssl/wolfcrypt/wolfevent.h \
-                         wolfssl/wolfcrypt/pkcs12.h \
-                         wolfssl/wolfcrypt/wolfmath.h \
-                         wolfssl/wolfcrypt/sha3.h \
-                         wolfssl/wolfcrypt/siphash.h \
-                         wolfssl/wolfcrypt/cpuid.h \
-                         wolfssl/wolfcrypt/cryptocb.h \
-                         wolfssl/wolfcrypt/kyber.h \
-                         wolfssl/wolfcrypt/wc_kyber.h \
-                         wolfssl/wolfcrypt/ext_kyber.h \
-                         wolfssl/wolfcrypt/sm2.h \
-                         wolfssl/wolfcrypt/sm3.h \
-                         wolfssl/wolfcrypt/sm4.h \
-                         wolfssl/wolfcrypt/lms.h \
-                         wolfssl/wolfcrypt/wc_lms.h \
-                         wolfssl/wolfcrypt/ext_lms.h \
-                         wolfssl/wolfcrypt/xmss.h \
-                         wolfssl/wolfcrypt/wc_xmss.h \
-                         wolfssl/wolfcrypt/ext_xmss.h
-
-noinst_HEADERS+= \
-                         wolfssl/wolfcrypt/port/aria/aria-crypt.h \
-                         wolfssl/wolfcrypt/port/aria/aria-cryptocb.h \
-                         wolfssl/wolfcrypt/port/pic32/pic32mz-crypt.h \
-                         wolfssl/wolfcrypt/port/ti/ti-hash.h \
-                         wolfssl/wolfcrypt/port/ti/ti-ccm.h \
-                         wolfssl/wolfcrypt/port/nrf51.h \
-                         wolfssl/wolfcrypt/port/nxp/ksdk_port.h \
-                         wolfssl/wolfcrypt/port/nxp/dcp_port.h \
-                         wolfssl/wolfcrypt/port/xilinx/xil-sha3.h \
-                         wolfssl/wolfcrypt/port/xilinx/xil-versal-glue.h \
-                         wolfssl/wolfcrypt/port/xilinx/xil-versal-trng.h \
-                         wolfssl/wolfcrypt/port/caam/caam_driver.h \
-                         wolfssl/wolfcrypt/port/caam/caam_error.h \
-                         wolfssl/wolfcrypt/port/caam/caam_qnx.h \
-                         wolfssl/wolfcrypt/port/silabs/silabs_aes.h \
-                         wolfssl/wolfcrypt/port/silabs/silabs_ecc.h \
-                         wolfssl/wolfcrypt/port/silabs/silabs_hash.h \
-                         wolfssl/wolfcrypt/port/silabs/silabs_random.h \
-                         wolfssl/wolfcrypt/port/st/stm32.h \
-                         wolfssl/wolfcrypt/port/st/stsafe.h \
-                         wolfssl/wolfcrypt/port/Espressif/esp32-crypt.h \
-                         wolfssl/wolfcrypt/port/arm/cryptoCell.h \
-                         wolfssl/wolfcrypt/port/Renesas/renesas-tsip-crypt.h \
-                         wolfssl/wolfcrypt/port/Renesas/renesas-fspsm-crypt.h \
-                         wolfssl/wolfcrypt/port/Renesas/renesas-fspsm-types.h \
-                         wolfssl/wolfcrypt/port/Renesas/renesas_sync.h \
-                         wolfssl/wolfcrypt/port/Renesas/renesas_cmn.h \
-                         wolfssl/wolfcrypt/port/Renesas/renesas_tsip_types.h \
-                         wolfssl/wolfcrypt/port/cypress/psoc6_crypto.h
-
-if BUILD_CRYPTOAUTHLIB
-nobase_include_HEADERS+= wolfssl/wolfcrypt/port/atmel/atmel.h
-endif
-
-if BUILD_AFALG
-nobase_include_HEADERS+= wolfssl/wolfcrypt/port/af_alg/afalg_hash.h
-nobase_include_HEADERS+= wolfssl/wolfcrypt/port/af_alg/wc_afalg.h
-endif
-
-if BUILD_KCAPI
-nobase_include_HEADERS+= wolfssl/wolfcrypt/port/kcapi/wc_kcapi.h
-nobase_include_HEADERS+= wolfssl/wolfcrypt/port/kcapi/kcapi_hash.h
-nobase_include_HEADERS+= wolfssl/wolfcrypt/port/kcapi/kcapi_hmac.h
-nobase_include_HEADERS+= wolfssl/wolfcrypt/port/kcapi/kcapi_ecc.h
-nobase_include_HEADERS+= wolfssl/wolfcrypt/port/kcapi/kcapi_rsa.h
-nobase_include_HEADERS+= wolfssl/wolfcrypt/port/kcapi/kcapi_dh.h
-endif
-
-if BUILD_DEVCRYPTO
-nobase_include_HEADERS+= wolfssl/wolfcrypt/port/devcrypto/wc_devcrypto.h
-endif
-
-if BUILD_ARIA
-nobase_include_HEADERS+= wolfssl/wolfcrypt/port/aria/aria-crypt.h
-nobase_include_HEADERS+= wolfssl/wolfcrypt/port/aria/aria-cryptocb.h
-endif
-
-if BUILD_ASYNCCRYPT
-nobase_include_HEADERS+= wolfssl/wolfcrypt/async.h
-endif
-
-if BUILD_PKCS11
-nobase_include_HEADERS+= wolfssl/wolfcrypt/wc_pkcs11.h
-nobase_include_HEADERS+= wolfssl/wolfcrypt/pkcs11.h
-endif
-
-if BUILD_CAVIUM
-nobase_include_HEADERS+= wolfssl/wolfcrypt/port/cavium/cavium_nitrox.h
-endif
-
-if BUILD_OCTEON_SYNC
-nobase_include_HEADERS+= wolfssl/wolfcrypt/port/cavium/cavium_octeon_sync.h
-endif
-
-if BUILD_INTEL_QA
-nobase_include_HEADERS+= wolfssl/wolfcrypt/port/intel/quickassist.h
-nobase_include_HEADERS+= wolfssl/wolfcrypt/port/intel/quickassist_mem.h
-endif
-
-if BUILD_INTEL_QA_SYNC
-nobase_include_HEADERS+= wolfssl/wolfcrypt/port/intel/quickassist_sync.h
-endif
-
-if BUILD_SP
-nobase_include_HEADERS+= wolfssl/wolfcrypt/sp.h
-nobase_include_HEADERS+= wolfssl/wolfcrypt/sp_int.h
-else
-if BUILD_SP_INT
-nobase_include_HEADERS+= wolfssl/wolfcrypt/sp_int.h
-endif
-endif
-
-if BUILD_SELFTEST
-nobase_include_HEADERS+= wolfssl/wolfcrypt/selftest.h
-endif
-
-if BUILD_FIPS
-if !BUILD_FIPS_V1
-nobase_include_HEADERS+= wolfssl/wolfcrypt/fips.h
-endif
-endif
-
-if BUILD_CAAM
-nobase_include_HEADERS+= wolfssl/wolfcrypt/port/caam/wolfcaam.h \
-                         wolfssl/wolfcrypt/port/caam/wolfcaam_sha.h \
-                         wolfssl/wolfcrypt/port/caam/wolfcaam_hash.h \
-                         wolfssl/wolfcrypt/port/caam/wolfcaam_rsa.h \
-                         wolfssl/wolfcrypt/port/caam/wolfcaam_x25519.h \
-                         wolfssl/wolfcrypt/port/caam/wolfcaam_ecdsa.h \
-                         wolfssl/wolfcrypt/port/caam/wolfcaam_cmac.h \
-                         wolfssl/wolfcrypt/port/caam/wolfcaam_aes.h \
-                         wolfssl/wolfcrypt/port/caam/wolfcaam_qnx.h \
-                         wolfssl/wolfcrypt/port/caam/wolfcaam_seco.h \
-                         wolfssl/wolfcrypt/port/caam/wolfcaam_fsl_nxp.h
-endif
-
-if BUILD_IOTSAFE
-nobase_include_HEADERS+= wolfssl/wolfcrypt/port/iotsafe/iotsafe.h
-endif
-
-if BUILD_PSA
-nobase_include_HEADERS+= wolfssl/wolfcrypt/port/psa/psa.h
-endif
-
-if BUILD_SE050
-nobase_include_HEADERS+= wolfssl/wolfcrypt/port/nxp/se050_port.h
-endif
-
-if BUILD_MAXQ10XX
-nobase_include_HEADERS+= wolfssl/wolfcrypt/port/maxim/maxq10xx.h
-endif