MX 2 years ago
parent
commit
ea310d7b4b
100 changed files with 5587 additions and 92066 deletions
  1. LICENSE (+1 -1)
  2. application.fam (+2 -4)
  3. assets/cli/cli_help.txt (+56 -56)
  4. config/wolfssl/config.h (+2 -0)
  5. lib/base64/base64.c (+0 -73)
  6. lib/base64/base64.h (+0 -14)
  7. lib/wolfssl/.gitignore (+20 -1)
  8. lib/wolfssl/README (+53 -135)
  9. lib/wolfssl/wolfcrypt/src/aes.c (+303 -302)
  10. lib/wolfssl/wolfcrypt/src/aes_asm.S (+0 -2226)
  11. lib/wolfssl/wolfcrypt/src/aes_asm.asm (+0 -1531)
  12. lib/wolfssl/wolfcrypt/src/aes_gcm_asm.S (+0 -15854)
  13. lib/wolfssl/wolfcrypt/src/aes_gcm_asm.asm (+0 -15423)
  14. lib/wolfssl/wolfcrypt/src/aes_gcm_x86_asm.S (+0 -12962)
  15. lib/wolfssl/wolfcrypt/src/asm.c (+1 -1)
  16. lib/wolfssl/wolfcrypt/src/asn.c (+619 -40)
  17. lib/wolfssl/wolfcrypt/src/camellia.c (+2 -2)
  18. lib/wolfssl/wolfcrypt/src/chacha.c (+7 -6)
  19. lib/wolfssl/wolfcrypt/src/chacha20_poly1305.c (+4 -4)
  20. lib/wolfssl/wolfcrypt/src/chacha_asm.S (+0 -1453)
  21. lib/wolfssl/wolfcrypt/src/cmac.c (+2 -1)
  22. lib/wolfssl/wolfcrypt/src/compress.c (+1 -0)
  23. lib/wolfssl/wolfcrypt/src/cryptocb.c (+116 -10)
  24. lib/wolfssl/wolfcrypt/src/curve25519.c (+15 -12)
  25. lib/wolfssl/wolfcrypt/src/des3.c (+3 -83)
  26. lib/wolfssl/wolfcrypt/src/dh.c (+24 -6)
  27. lib/wolfssl/wolfcrypt/src/dsa.c (+4 -4)
  28. lib/wolfssl/wolfcrypt/src/ecc.c (+412 -126)
  29. lib/wolfssl/wolfcrypt/src/eccsi.c (+8 -9)
  30. lib/wolfssl/wolfcrypt/src/ed25519.c (+14 -8)
  31. lib/wolfssl/wolfcrypt/src/ed448.c (+1 -1)
  32. lib/wolfssl/wolfcrypt/src/error.c (+13 -1)
  33. lib/wolfssl/wolfcrypt/src/evp.c (+595 -187)
  34. lib/wolfssl/wolfcrypt/src/ext_kyber.c (+1 -1)
  35. lib/wolfssl/wolfcrypt/src/ext_lms.c (+972 -0)
  36. lib/wolfssl/wolfcrypt/src/ext_xmss.c (+981 -0)
  37. lib/wolfssl/wolfcrypt/src/fe_448.c (+126 -126)
  38. lib/wolfssl/wolfcrypt/src/fe_x25519_128.i (+0 -630)
  39. lib/wolfssl/wolfcrypt/src/fe_x25519_asm.S (+0 -16596)
  40. lib/wolfssl/wolfcrypt/src/fp_mont_small.i (+0 -3874)
  41. lib/wolfssl/wolfcrypt/src/fp_mul_comba_12.i (+0 -147)
  42. lib/wolfssl/wolfcrypt/src/fp_mul_comba_17.i (+0 -187)
  43. lib/wolfssl/wolfcrypt/src/fp_mul_comba_20.i (+0 -210)
  44. lib/wolfssl/wolfcrypt/src/fp_mul_comba_24.i (+0 -243)
  45. lib/wolfssl/wolfcrypt/src/fp_mul_comba_28.i (+0 -275)
  46. lib/wolfssl/wolfcrypt/src/fp_mul_comba_3.i (+0 -61)
  47. lib/wolfssl/wolfcrypt/src/fp_mul_comba_32.i (+0 -321)
  48. lib/wolfssl/wolfcrypt/src/fp_mul_comba_4.i (+0 -83)
  49. lib/wolfssl/wolfcrypt/src/fp_mul_comba_48.i (+0 -435)
  50. lib/wolfssl/wolfcrypt/src/fp_mul_comba_6.i (+0 -99)
  51. lib/wolfssl/wolfcrypt/src/fp_mul_comba_64.i (+0 -563)
  52. lib/wolfssl/wolfcrypt/src/fp_mul_comba_7.i (+0 -107)
  53. lib/wolfssl/wolfcrypt/src/fp_mul_comba_8.i (+0 -115)
  54. lib/wolfssl/wolfcrypt/src/fp_mul_comba_9.i (+0 -123)
  55. lib/wolfssl/wolfcrypt/src/fp_mul_comba_small_set.i (+0 -1268)
  56. lib/wolfssl/wolfcrypt/src/fp_sqr_comba_12.i (+0 -177)
  57. lib/wolfssl/wolfcrypt/src/fp_sqr_comba_17.i (+0 -227)
  58. lib/wolfssl/wolfcrypt/src/fp_sqr_comba_20.i (+0 -257)
  59. lib/wolfssl/wolfcrypt/src/fp_sqr_comba_24.i (+0 -297)
  60. lib/wolfssl/wolfcrypt/src/fp_sqr_comba_28.i (+0 -337)
  61. lib/wolfssl/wolfcrypt/src/fp_sqr_comba_3.i (+0 -73)
  62. lib/wolfssl/wolfcrypt/src/fp_sqr_comba_32.i (+0 -377)
  63. lib/wolfssl/wolfcrypt/src/fp_sqr_comba_4.i (+0 -97)
  64. lib/wolfssl/wolfcrypt/src/fp_sqr_comba_48.i (+0 -537)
  65. lib/wolfssl/wolfcrypt/src/fp_sqr_comba_6.i (+0 -117)
  66. lib/wolfssl/wolfcrypt/src/fp_sqr_comba_64.i (+0 -697)
  67. lib/wolfssl/wolfcrypt/src/fp_sqr_comba_7.i (+0 -127)
  68. lib/wolfssl/wolfcrypt/src/fp_sqr_comba_8.i (+0 -137)
  69. lib/wolfssl/wolfcrypt/src/fp_sqr_comba_9.i (+0 -147)
  70. lib/wolfssl/wolfcrypt/src/fp_sqr_comba_small_set.i (+0 -1558)
  71. lib/wolfssl/wolfcrypt/src/ge_448.c (+488 -488)
  72. lib/wolfssl/wolfcrypt/src/ge_low_mem.c (+0 -22)
  73. lib/wolfssl/wolfcrypt/src/ge_operations.c (+115 -148)
  74. lib/wolfssl/wolfcrypt/src/hash.c (+109 -1)
  75. lib/wolfssl/wolfcrypt/src/hmac.c (+76 -72)
  76. lib/wolfssl/wolfcrypt/src/hpke.c (+21 -19)
  77. lib/wolfssl/wolfcrypt/src/include.am (+0 -212)
  78. lib/wolfssl/wolfcrypt/src/integer.c (+16 -5)
  79. lib/wolfssl/wolfcrypt/src/kdf.c (+15 -1)
  80. lib/wolfssl/wolfcrypt/src/logging.c (+24 -11)
  81. lib/wolfssl/wolfcrypt/src/md2.c (+1 -1)
  82. lib/wolfssl/wolfcrypt/src/memory.c (+63 -56)
  83. lib/wolfssl/wolfcrypt/src/misc.c (+45 -10)
  84. lib/wolfssl/wolfcrypt/src/pkcs12.c (+5 -4)
  85. lib/wolfssl/wolfcrypt/src/pkcs7.c (+251 -106)
  86. lib/wolfssl/wolfcrypt/src/poly1305_asm.S (+0 -1130)
  87. lib/wolfssl/wolfcrypt/src/port/Espressif/README.md (+0 -109)
  88. lib/wolfssl/wolfcrypt/src/port/Espressif/esp32_aes.c (+0 -432)
  89. lib/wolfssl/wolfcrypt/src/port/Espressif/esp32_mp.c (+0 -914)
  90. lib/wolfssl/wolfcrypt/src/port/Espressif/esp32_sha.c (+0 -1368)
  91. lib/wolfssl/wolfcrypt/src/port/Espressif/esp32_util.c (+0 -335)
  92. lib/wolfssl/wolfcrypt/src/port/Renesas/README.md (+0 -195)
  93. lib/wolfssl/wolfcrypt/src/port/Renesas/renesas_common.c (+0 -1300)
  94. lib/wolfssl/wolfcrypt/src/port/Renesas/renesas_rx64_hw_sha.c (+0 -443)
  95. lib/wolfssl/wolfcrypt/src/port/Renesas/renesas_rx64_hw_util.c (+0 -106)
  96. lib/wolfssl/wolfcrypt/src/port/Renesas/renesas_sce_aes.c (+0 -589)
  97. lib/wolfssl/wolfcrypt/src/port/Renesas/renesas_sce_rsa.c (+0 -437)
  98. lib/wolfssl/wolfcrypt/src/port/Renesas/renesas_sce_sha.c (+0 -267)
  99. lib/wolfssl/wolfcrypt/src/port/Renesas/renesas_sce_util.c (+0 -1160)
  100. lib/wolfssl/wolfcrypt/src/port/Renesas/renesas_tsip_aes.c (+0 -963)

+ 1 - 1
LICENSE

@@ -671,4 +671,4 @@ into proprietary programs.  If your program is a subroutine library, you
 may consider it more useful to permit linking proprietary applications with
 the library.  If this is what you want to do, use the GNU Lesser General
 Public License instead of this License.  But first, please read
-<https://www.gnu.org/licenses/why-not-lgpl.html>.
+<https://www.gnu.org/licenses/why-not-lgpl.html>.

+ 2 - 4
application.fam

@@ -7,7 +7,7 @@ App(
     requires=["gui", "cli", "dialogs", "storage", "input", "notification", "bt"],
     stack_size=2 * 1024,
     order=20,
-    fap_version="5.50",
+    fap_version="5.70",
     fap_author="Alexander Kopachov (@akopachov)",
     fap_description="Software-based TOTP/HOTP authenticator for Flipper Zero device",
     fap_weburl="https://github.com/akopachov/flipper-zero_authenticator",
@@ -19,9 +19,6 @@ App(
         Lib(
             name="base32",
         ),
-        Lib(
-            name="base64",
-        ),
         Lib(
             name="timezone_utils",
         ),
@@ -40,6 +37,7 @@ App(
                 "wolfcrypt/src/sha.c",
                 "wolfcrypt/src/sha256.c",
                 "wolfcrypt/src/sha512.c",
+                "wolfcrypt/src/coding.c",
             ],
             cflags=["-Wno-error"],
             cdefines=["HAVE_CONFIG_H"],
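
The two manifest changes above swap the app's bundled base64 library for wolfSSL's own decoder (wolfcrypt/src/coding.c). A minimal sketch of the call-site migration, assuming a caller that previously used the removed base64_decode() helper (decode_token_secret and its parameter names are illustrative, not from this commit):

    /* Sketch only: Base64_Decode() is declared in <wolfssl/wolfcrypt/coding.h>.
     * Unlike the removed base64_decode(), it writes into a caller-supplied
     * buffer instead of allocating one. */
    #include <wolfssl/wolfcrypt/coding.h>

    static int decode_token_secret(const byte* b64, word32 b64_len,
                                   byte* out, word32* out_len) {
        /* On entry *out_len is the capacity of out; on success it is updated
         * to the decoded length. Returns 0 on success, otherwise a negative
         * wolfCrypt error code. */
        return Base64_Decode(b64, b64_len, out, out_len);
    }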

+ 56 - 56
assets/cli/cli_help.txt

@@ -1,57 +1,57 @@
-Usage:
-  totp (help | h | ?)
-  totp version
-  totp (list | ls)
-  totp (lsattr | cat) <index>
-  totp (add | mk | new) <name> [-t <type>] [-i <counter>] [-a <algo>] [-e <encoding>] [-d <digits>] [-l <duration>] [-u] [-b <feature>]...
-  totp (update) <index> [-t <type>] [-i <counter>] [-a <algo>] [-e <encoding>] [-n <name>] [-d <digits>] [-l <duration>] [-u] [-s] [-b <feature>]...
-  totp (delete | rm) <index> [-f]
-  totp (move | mv) <index> <new_index>
-  totp pin (set | remove) [-c <slot>]
-  totp notify [<notification>...]
-  totp (timezone | tz) [<timezone>]
-  totp reset
-  totp automation [-k <layout>] [<automation>...]
-
-Commands:
-  help, h, ?       Show command usage help
-  version          Get application version
-  list, ls         List all available tokens
-  lsattr, cat      Displays token details
-  add, mk, new     Add new token
-  update           Update existing token
-  delete, rm       Delete existing token
-  move, mv         Move token
-  pin              Set\change\remove PIN
-  notify           Get or set notification method
-  timezone, tz     Get or set current timezone
-  reset            Reset application to default settings
-  automation       Get or set automation settings
-
-Arguments:
-  name          Token name
-  index         Token index in the list
-  new_index     New token index in the list
-  notification  Notification method to be set. Must be one of: none, sound, vibro
-  timezone      Timezone offset in hours to be set
-  automation    Automation method to be set. Must be one of: none, usb, bt
-
-Options:
-  -t <type>      Token type. Must be one of: totp, hotp [default: totp]
-  -i <counter>   Token initial counter. Applicable for HOTP tokens only. Must be positive integer number [default: 0]
-  -a <algo>      Token hashing algorithm. Must be one of: sha1, sha256, sha512, steam [default: sha1]
-  -d <digits>    Number of digits to generate, one of: 5, 6, 8 [default: 6]
-  -e <encoding>  Token secret encoding, one of base32, base64 [default: base32]
-  -l <duration>  Token lifetime duration in seconds. Applicable for TOTP tokens only.Must be between: 15 and 255 [default: 30]
-  -u             Show console user input as-is without masking
-  -b <feature>   Token automation features to be enabled. Must be one of: none, enter, tab [default: none]
-                 # none - No features
-                 # enter - Type <Enter> key at the end of token input automation
-                 # tab - Type <Tab> key at the end of token input automation
-                 # slower - Type slower
-  -n <name>      Token name
-  -s             Update token secret
-  -f             Force command to do not ask user for interactive confirmation
-  -c <slot>      New crypto key slot. Must be between 12 and 100
-  -k <layout>    Automation keyboard layout. Must be one of: QWERTY, AZERTY, QWERTZ
+Usage:
+  totp (help | h | ?)
+  totp version
+  totp (list | ls)
+  totp (lsattr | cat) <index>
+  totp (add | mk | new) <name> [-t <type>] [-i <counter>] [-a <algo>] [-e <encoding>] [-d <digits>] [-l <duration>] [-u] [-b <feature>]...
+  totp (update) <index> [-t <type>] [-i <counter>] [-a <algo>] [-e <encoding>] [-n <name>] [-d <digits>] [-l <duration>] [-u] [-s] [-b <feature>]...
+  totp (delete | rm) <index> [-f]
+  totp (move | mv) <index> <new_index>
+  totp pin (set | remove) [-c <slot>]
+  totp notify [<notification>...]
+  totp (timezone | tz) [<timezone>]
+  totp reset
+  totp automation [-k <layout>] [<automation>...]
+
+Commands:
+  help, h, ?       Show command usage help
+  version          Get application version
+  list, ls         List all available tokens
+  lsattr, cat      Displays token details
+  add, mk, new     Add new token
+  update           Update existing token
+  delete, rm       Delete existing token
+  move, mv         Move token
+  pin              Set/change/remove PIN
+  notify           Get or set notification method
+  timezone, tz     Get or set current timezone
+  reset            Reset application to default settings
+  automation       Get or set automation settings
+
+Arguments:
+  name          Token name
+  index         Token index in the list
+  new_index     New token index in the list
+  notification  Notification method to be set. Must be one of: none, sound, vibro
+  timezone      Timezone offset in hours to be set
+  automation    Automation method to be set. Must be one of: none, usb, bt
+
+Options:
+  -t <type>      Token type. Must be one of: totp, hotp [default: totp]
+  -i <counter>   Token initial counter. Applicable for HOTP tokens only. Must be positive integer number [default: 0]
+  -a <algo>      Token hashing algorithm. Must be one of: sha1, sha256, sha512, steam [default: sha1]
+  -d <digits>    Token digits count. Must be one of: 5, 6, 8 [default: 6]
+  -e <encoding>  Token secret encoding, one of base32, base64 [default: base32]
+  -l <duration>  Token lifetime duration in seconds. Applicable for TOTP tokens only. Must be between: 15 and 255 [default: 30]
+  -u             Show console user input as-is without masking
+  -b <feature>   Token automation features to be enabled. Must be one of: none, enter, tab [default: none]
+                 # none - No features
+                 # enter - Type <Enter> key at the end of token input automation
+                 # tab - Type <Tab> key at the end of token input automation
+                 # slower - Type slower
+  -n <name>      Token name
+  -s             Update token secret
+  -f             Force command to not ask for interactive confirmation
+  -c <slot>      New crypto key slot. Must be between 12 and 100
+  -k <layout>    Automation keyboard layout. Must be one of: QWERTY, AZERTY, QWERTZ
   
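
An illustrative invocation built from the grammar above (the token name and option values are made up): add a base64-encoded SHA-256 TOTP token whose input automation types <Enter> at the end, then list all tokens:

    totp add "Example token" -a sha256 -e base64 -d 6 -b enter
    totp ls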

+ 2 - 0
config/wolfssl/config.h

@@ -32,3 +32,5 @@
 #define NO_ERROR_STRINGS
 #define NO_OLD_TLS
 #define SINGLE_THREADED
+#define WORD64_AVAILABLE
+#define WOLF_ALLOW_BUILTIN
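
For context on the first new define: WORD64_AVAILABLE tells wolfCrypt that the toolchain provides a native 64-bit integer type. A hedged sketch of the kind of gate it feeds in wolfssl/wolfcrypt/types.h (not copied from this commit; the exact block varies by wolfSSL version):

    /* Sketch: with WORD64_AVAILABLE set, wolfCrypt can typedef and use a
     * 64-bit word (needed by e.g. SHA-512) instead of emulating it. */
    #ifdef WORD64_AVAILABLE
        typedef unsigned long long word64;
    #endif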

+ 0 - 73
lib/base64/base64.c

@@ -1,73 +0,0 @@
-/*
- * Base64 encoding/decoding (RFC1341)
- * Copyright (c) 2005, Jouni Malinen <j@w1.fi>
- * Modified and optimized for Flipper Zero device purposes by Alex Kopachov (@akopachov)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Alternatively, this software may be distributed under the terms of BSD
- * license.
- *
- */
-
-#include "base64.h"
-#include <string.h>
-
-static const uint8_t dtable[] = {0x3e, 0x80, 0x80, 0x80, 0x3f, 0x34, 0x35, 0x36, 0x37, 0x38,
-                                 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x80, 0x80, 0x80, 0x0,  0x80,
-                                 0x80, 0x80, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
-                                 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0x10, 0x11,
-                                 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x80, 0x80,
-                                 0x80, 0x80, 0x80, 0x80, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
-                                 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29,
-                                 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33};
-
-static uint8_t get_dtable_value(uint8_t index) {
-    return (index < 43 || index > 122) ? 0x80 : dtable[index - 43];
-}
-
-uint8_t* base64_decode(const uint8_t* src, size_t len, size_t* out_len, size_t* out_size) {
-    uint8_t* out;
-    uint8_t* pos;
-    uint8_t in[4];
-    uint8_t block[4];
-    uint8_t tmp;
-    size_t i;
-    size_t count;
-    size_t olen;
-
-    count = 0;
-    for(i = 0; i < len; i++) {
-        if(get_dtable_value(src[i]) != 0x80) count++;
-    }
-
-    if(count == 0 || count % 4) return NULL;
-    olen = count / 4 * 3;
-    pos = out = malloc(olen);
-    *out_size = olen;
-    if(out == NULL) return NULL;
-    count = 0;
-    for(i = 0; i < len; i++) {
-        tmp = get_dtable_value(src[i]);
-        if(tmp == 0x80) continue;
-        in[count] = src[i];
-        block[count] = tmp;
-        count++;
-        if(count == 4) {
-            *pos++ = (block[0] << 2) | (block[1] >> 4);
-            *pos++ = (block[1] << 4) | (block[2] >> 2);
-            *pos++ = (block[2] << 6) | block[3];
-            count = 0;
-        }
-    }
-    if(pos > out) {
-        if(in[2] == '=')
-            pos -= 2;
-        else if(in[3] == '=')
-            pos--;
-    }
-    *out_len = pos - out;
-    return out;
-}

+ 0 - 14
lib/base64/base64.h

@@ -1,14 +0,0 @@
-#pragma once
-
-#include <stdlib.h>
-#include <stdint.h>
-
-/**
- * @brief Decodes Base-64 encoded bytes into plain bytes.
- * @param src Base-64 encoded bytes
- * @param len Base-64 encoded bytes count
- * @param[out] out_len decoded buffer length
- * @param[out] out_size decoded buffer allocated size
- * @return Decoded result buffer if successfully decoded; \c NULL otherwise
- */
-uint8_t* base64_decode(const uint8_t* src, size_t len, size_t* out_len, size_t* out_size);

+ 20 - 1
lib/wolfssl/.gitignore

@@ -64,6 +64,8 @@ ctaocrypt/benchmark/benchmark
 ctaocrypt/test/testctaocrypt
 wolfcrypt/benchmark/benchmark
 wolfcrypt/test/testwolfcrypt
+examples/async/async_client
+examples/async/async_server
 examples/benchmark/tls_bench
 examples/client/client
 examples/echoclient/echoclient
@@ -74,6 +76,7 @@ examples/sctp/sctp-server-dtls
 examples/sctp/sctp-client
 examples/sctp/sctp-client-dtls
 examples/asn1/asn1
+examples/pem/pem
 server_ready
 snifftest
 output
@@ -84,6 +87,7 @@ testsuite/testsuite.test
 tests/unit.test
 tests/bio_write_test.txt
 tests/test-log-dump-to-file.txt
+tests/cert_cache.tmp
 test-write-dhparams.pem
 testsuite/*.der
 testsuite/*.pem
@@ -343,6 +347,8 @@ doc/pdf
 
 # XCODE Index
 IDE/XCODE/Index
+IDE/**/xcshareddata
+IDE/**/DerivedData
 
 # ARM DS-5 && Eclipse
 \.settings/
@@ -403,7 +409,8 @@ libFuzzer
 XXX-fips-test
 
 # ASYNC
-async
+/wolfAsyncCrypt
+/async
 
 # Generated user_settings_asm.h.
 user_settings_asm.h
@@ -417,3 +424,15 @@ user_settings_asm.h
 
 # auto-created CMake backups
 **/CMakeLists.txt.old
+
+# MagicCrypto (ARIA Cipher)
+MagicCrypto
+
+# CMake build directory
+/out
+/out_temp
+
+# debian packaging
+debian/changelog
+debian/control
+*.deb

+ 53 - 135
lib/wolfssl/README

@@ -5,7 +5,7 @@ library written in ANSI C and targeted for embedded, RTOS, and
 resource-constrained environments - primarily because of its small size, speed,
 and feature set.  It is commonly used in standard operating environments as well
 because of its royalty-free pricing and excellent cross platform support.
-wolfSSL supports industry standards up to the current TLS 1.3 and DTLS 1.2
+wolfSSL supports industry standards up to the current TLS 1.3 and DTLS 1.3
 levels, is up to 20 times smaller than OpenSSL, and offers progressive ciphers
 such as ChaCha20, Curve25519, and Blake2b. User benchmarking and feedback
 reports dramatically better performance when using wolfSSL over OpenSSL.
@@ -20,11 +20,11 @@ certificate #3389). For additional information, visit the wolfCrypt FIPS FAQ
 There are many reasons to choose wolfSSL as your embedded SSL solution. Some of
 the top reasons include size (typical footprint sizes range from 20-100 kB),
 support for the newest standards (SSL 3.0, TLS 1.0, TLS 1.1, TLS 1.2, TLS 1.3,
-DTLS 1.0, and DTLS 1.2), current and progressive cipher support (including
-stream ciphers), multi-platform, royalty free, and an OpenSSL compatibility API
-to ease porting into existing applications which have previously used the
-OpenSSL package. For a complete feature list, see chapter 4 of the wolfSSL
-manual. (https://www.wolfssl.com/docs/wolfssl-manual/ch4/)
+DTLS 1.0, DTLS 1.2, and DTLS 1.3), current and progressive cipher support
+(including stream ciphers), multi-platform, royalty free, and an OpenSSL
+compatibility API to ease porting into existing applications which have
+previously used the OpenSSL package. For a complete feature list, see chapter 4
+of the wolfSSL manual. (https://www.wolfssl.com/docs/wolfssl-manual/ch4/)
 
 *** Notes, Please read ***
 
@@ -70,150 +70,68 @@ should be used for the enum name.
 
 *** end Notes ***
 
-# wolfSSL Release 5.6.3 (Jun 20, 2023)
+# wolfSSL Release 5.6.4 (Oct 30, 2023)
 
-Release 5.6.3 has been developed according to wolfSSL's development and QA process (see link below) and successfully passed the quality criteria.
-
-Release 5.6.3 of wolfSSL embedded TLS has 4 bug fixes:
-
-* Fix for setting the atomic macro options introduced in release 5.6.2. This issue affects GNU gcc autoconf builds. The fix resolves a potential mismatch of the generated macros defined in options.h file and the macros used when the wolfSSL library is compiled. In version 5.6.2 this mismatch could result in unstable runtime behavior.
-* Fix for invalid suffix error with Windows build using the macro GCM_TABLE_4BIT.
-* Improvements to Encrypted Memory support (WC_PROTECT_ENCRYPTED_MEM) implementations for modular exponentiation in SP math-all (sp_int.c) and TFM (tfm.c).
-* Improvements to SendAlert for getting output buffer.
-
-
-# wolfSSL Release 5.6.2 (Jun 09, 2023)
-
-Release 5.6.2 has been developed according to wolfSSL's development and QA process (see link below) and successfully passed the quality criteria.
+Release 5.6.4 has been developed according to wolfSSL's development and QA process (see link below) and successfully passed the quality criteria.
 https://www.wolfssl.com/about/wolfssl-software-development-process-quality-assurance
 
+
 NOTE: * --enable-heapmath is being deprecated and will be removed by 2024
+      * Old CyaSSL/CtaoCrypt shim layer was removed in this release (5.6.4)
 
-Release 5.6.2 of wolfSSL embedded TLS has bug fixes and new features including:
 
 ## Vulnerabilities
-* [Low] In cases where a malicious agent could analyze cache timing at a very detailed level, information about the AES key used could be leaked during T/S Box lookups. One such case was shown on RISC-V hardware using the MicroWalk tool (https://github.com/microwalk-project/Microwalk). A hardened version of T/S Box lookups was added in wolfSSL to help mitigate this potential attack and is now on by default with RISC-V builds and can be enabled on other builds if desired by compiling wolfSSL with the macro WOLFSSL_AES_TOUCH_LINES. Thanks to Jan Wichelmann, Christopher Peredy, Florian Sieck, Anna Pätschke, Thomas Eisenbarth (University of Lübeck): MAMBO-V: Dynamic Side-Channel Leakage Analysis on RISC-V. Fixed in the following GitHub pull request https://github.com/wolfSSL/wolfssl/pull/6309
-* [High] In previous versions of wolfSSL if a TLS 1.3 client gets neither a PSK (pre shared key) extension nor a KSE (key share extension) when connecting to a malicious server, a default predictable buffer gets used for the IKM value when generating the session master secret. Using a potentially known IKM value when generating the session master secret key compromises the key generated, allowing an eavesdropper to reconstruct it and potentially allowing surreptitious access to or meddling with message contents in the session. This issue does not affect client validation of connected servers, nor expose private key information, but could result in an insecure TLS 1.3 session when not controlling both sides of the connection. We recommend that TLS 1.3 client side users update the version of wolfSSL used. Thanks to Johannes from Sectra Communications and Linköping University for the report. Fixed in the following GitHub pull request https://github.com/wolfSSL/wolfssl/pull/6412
+
+* [Medium] A fix was added, but still under review for completeness, for a Bleichenbacher style attack, leading to being able to decrypt a saved TLS connection and potentially forge a signature after probing with a large number of trial connections. This issue is around RSA decryption and affects static RSA cipher suites on the server side, which are not recommended to be used and are off by default. Static RSA cipher suites were also removed from the TLS 1.3 protocol and only present in TLS 1.2 and lower. All padding versions of RSA decrypt are affected since the code under review is outside of the padding processing. Information about the private keys is NOT compromised in affected code. It's recommended to disable static RSA cipher suites and update the version of wolfSSL used if using RSA private decryption alone outside of TLS. The fix is located in this pull request (https://github.com/wolfSSL/wolfssl/pull/6896)
 
 ## New Feature Additions
 
-### New Ports and Expansions
-* Add support for STM32H5
-* Add support for Renesas TSIP v1.17
-* Add Renesas SCE RSA crypto-only support
-* STARCORE DSP port and example builds added
-* Add the function wc_PKCS7_SetDefaultSignedAttribs for setting PKCS7 signed attributes to use with PKCS7 bundle creation
-* NXP IMX6Q CAAM port with QNX and performance optimizations for AES-CTR
-
-### New Build Options
-* ASN.1 print utility to decode ASN.1 syntax and print out human readable text --enable-asn-print. Utility app is located in the directory ./examples/asn1/
-* Add introspection for math build, wc_GetMathInfo() to get information about the math library compiled into the linked wolfSSL library
-* Implement TLS recommendations from RFC 9325 for hardening TLS/DTLS security. Enabled with the autoconf flag --enable-harden-tls.
-* Add option to support disabling thread local storage, --disable-threadlocal
-* Added wc_DsaSign_ex() and wc_DsaVerify_ex() for handling alternative digest algorithms with DSA Sign/Verify
-* Implement atomic operations interface. Macros auto-detect if atomic operations are expected to be available, can be turned off with the macro WOLFSSL_NO_ATOMICS
-* Added support for DTLS 1.3 Authentication and Integrity-Only Cipher Suites
-* Expand crypto callback to have a device ID find callback function with wc_CryptoCb_SetDeviceFindCb. Enabled with the macro WOLF_CRYPTO_CB_FIND
+* DTLS 1.3 PQC: support fragmenting the second ClientHello message. This allows arbitrarily long keys to be used, opening up support for all PQC ciphersuites in DTLS 1.3.
+* SM2/SM3/SM4: Chinese cipher support including TLS 1.3 and 1.2 cipher suites. SM2 SP implementation available.
+* Ability to parse ASN1 only with SMIME_read_PKCS7
+* Added support for MemUse Entropy on Windows
+* Added Ada Bindings for wolfSSL
+* Added a PEM example that converts to and from DER/PEM.
+* Added LMS/HSS and XMSS/XMSS^MT wolfcrypt hooks, both normal and verify-only options.
+* Added support for the AES EAX mode of operation
+* Port for use with Hitch (https://github.com/varnish/hitch) added
+* Add XTS API's to handle multiple sectors in new port to VeraCrypt
 
 ## Enhancements and Optimizations
 
-### Optimizations
-* Increased performance with ChaCha20 C implementation and general XOR operations
-* Added integer type to the ASN.1 sequencing with ASN.1 Integer sequence
-* With wolfSSL_get_x509_next_altname reset alt name list to head once cycled through if compiling with the macro WOLFSSL_MULTICIRCULATE_ALTNAMELIST
-* Additional key validity sanity checks on input to wolfSSL_EC_KEY_set_private_key
-* adds support for TLSv1.3 stateful session tickets when using SSL_OP_NO_TICKET
-
-### Memory Optimizations
-* Improvements to stack usage and management with SP int math library
-* Optimization to TLS 1.3 server to remove caching messages for Ed25519/Ed448
-* Added a HAVE_CURL macro build for building a subset of the wolfSSL library when linking with cURL
-* Memory usage improvement with reducing the size of alignment needed with AES
-* Reduce run time memory used with ECC operations and ALT_ECC_SIZE
-* Fixes and improvements for building edge cases such as crypto callback without hash-drbg with low footprint options
-* Support HAVE_SESSION_TICKET build option without depending on realloc
-
-### Documentation
-* Instructions for GPDMA on STM32 configuration added
-* Add in instructions for compiling with zephyr on STM32
-* Documentation fixup for wolfSSL_get_chain_cert()
-* Fix the file pointed to in the TI RTOS documentation that we maintain
-* Documentation for wolfSSL_CertManagerFreeCRL
-* Updates made to AES and Chacha documentation
-* Update Japanese comments for Ed25519, AES, and other miscellaneous items
-
-### Tests
-* Add in an option for easily testing malloc failures when building with WOLFSSL_MEM_FAIL_COUNT macro
-* Updated in process for using Expect vs Assert to facilitate more malloc failure tests
-* Enhance wolfCrypt test for builds that do not have ECC SECP curves enabled
-* ESP32 platform-specific VisualGDB test & benchmark projects
-* Update to dependencies in docker container file used for tests
-* Fix up for base 10 output with bundled benchmark application
-
-### Port Updates
-* Zephyr port update, compile time warning fixes, misc. fixes when used with TLS and update of includes
-* Update RIOT-OS to not compile out use of writev by default
-* Update Micrium port to enable use of STM32_RNG
-* Micrium updates for XMEMOVE and XSTRTOK use
-* Various Espressif HW crypto, SHA2, AES, MP updates
-* Added in ASIO build option with CMake builds
-
-### General Enhancements
-* Global codebase cleanup for C89 compliance and wolfCrypt -Wconversion hygiene
-* PKCS#11 enhancement adding a callback for RSA key size when using a hardware key, by default 2048 bit key is used
-* Allow for unknown OIDs in extensions in wolfSSL_X509_set_ext()
-* Allow user to override XSTAT by defining the macro XSTAT when compiling
-* Support UPN and SID with x509 certificate extensions and custom OID build
-* Write next IV in wolfSSL_DES_ede3_cbc_encrypt for better handling of inline encryption
-* Adding NO_ASN_TIME_CHECK build option for compiling out certificate before/after checks
-* Improve different peer recvfrom handling and error reporting with ipv4 vs ipv6
+* Turned on SNI by default on hosts with resources
+* Improved support for Silicon Labs Simplicity Studio and the EFR32 Gecko SDK
+* Thumb-2 and ARM32 Curve25519 and Ed25519 assembly have significantly improved performance.
+* Thumb-2 AES assembly code added.
+* Thumb-2 and ARM32 SP implementations of RSA, DH and ECC have significantly improved performance.
+* Minor performance improvements to SP ECC for Intel x64.
+* AES-XTS assembly code added for Intel x64, Aarch64 and ARM32.
+* Added support for X963 KDFs to ECIES.
+* Added 32-bit type only implementation of AES GMULT using tables.
+* Add support for nginx version 1.25.0
+* Add support for Kerberos version 5 1.21.1
+* Check all CRL entries in case a single issuer has multiple CRL's loaded
+* CRL verify the entire chain including loaded CA's
+* Added example for building wolfSSL as an Apple universal binary framework using configure
+* Sniffer tool now supports decrypting TLS sessions using secrets obtained from a SSLKEYLOGFILE
+* Updates made for EBSNET port
+* Update "--enable-jni" to include additional defines for expanded JNI support. Also includes JCE and JSSE builds under the single enable option now.
 
 ## Fixes
-* Fix for STM32 ECC sign and verify out of bounds buffer write when the hash length passed in is larger than the key size. Thanks to Maximilian for the report.
-* Fix to skip Async_DevCtxInit when using init rsa/ecc label/id api's
-* Revert WOLFSSL_NO_ASN_STRICT macro guard around alternate names directory list
-* In async mode, don't retry decrypting if a valid error is encountered on a packet parse attempt
-* Add additional sanity check on PKCS7 index value in wc_PKCS7_DecryptKekri
-* Fix for padding when using an AuthEnvelope PKCS7 type with GCM/CCM stream ciphers
-* Fix siphash assembly so that no register is left behind
-* Fix to not send a TLS 1.3 session ID resume response when resuming and downgrading to a protocol less than TLS 1.3
-* Fix overwriting serialNumber by favouriteDrink when generating a certificate using Cert struct
-* Fix for the default realloc used with EspressIf builds
-* Track SetDigest usage to avoid invalid free under error conditions
-* DTLS v1.3 fix for epoch 0 check on plaintext message
-* Fix for session ticket memory leak in wolfSSL_Cleanup
-* Fixes for propagating SendAlert errors when the peer disconnects
-* Replace XMEMCPY with XMEMMOVE to fix valgrind-3.15.0 reports "Source and destination overlap in memcpy" when using --enable-aesgcm-stream
-* Fix for potential out-of-bounds write edge case in fp_mod_2d with --enable-fastmath math library
-* Fix getting ECC key size in stm32_ecc_sign_hash_ex
-* Fix for case where wc_PeekErrorNodeLineData was not unlocking error queue on error
-* Fix for async ECC shared secret state
-* Fix for better error checking with sp_gcd with SP int math library
-* Fix memory leak in TLSX_KeyShare_Setup when handling an error case
-* Fix for double free edge case in InitOCSPRequest when handling a memory allocation failure
-* X509 NAME Entry fix for leaking memory on error case
-* Fix wolfssl_asn1_time_to_tm setting unexpected fields in tm struct
-* Fix for FIPS ECC integrity check with crypto callback set
-* BN_to_ASN1_INTEGER fix for handling leading zero byte padding when needed
-* Fix a typo in PP macro and add a ceiling to guard against implementation bugs
-* DTLS 1.3 fix for using the correct label when deriving the resumption key
-* OCSP fix for GetDateInfo edge case with non ASN template builds
-* Allow a user set certificate callback function to override the skipAddCA flag when parsing a certificate
-* SP int: sp_radix_size when radix 10 fix temp size for handling edge case
-* Fixes and improvements for handling failures with memory allocations
-* Fix for DecodeECC_DSA_Sig to handle r and s being initialized
-* Fix for wc_ecc_is_point to ensure that the x and y are in range [0, p-1] and z is one (affine ordinates)
-
-### Build Fixes
-* Fix for building on Windows with CMake and using USER_SETTINGS and fix for options.h creation with CMake when using USER_SETTINGS
-* CMake fixes and improvements for use with mingw32
-* Fix for building with wpas and x509 small options
-* Check if colrm is available for options.h creation when using autoconf
-* Clean up NO_BIG_INT build, removing WOLFSSL_SP_MATH macro and heapmath compile
-* Fix PKCS#7 build with NO_PKCS7_STREAM
-* Fix compilation error in CC-RX and remove unnecessary public key import
-* SP Build fixes for ARM assembly with ARMv6 clz and ARM thumb debug build
-* For to not advertise support for RSA in TLS extensions when compiled with NO_RSA
+
+* Fixed error handling when decrypted pre-master secret is too long when using static RSA.
+* Added a fix for keymod use with i.MX RT1170 CAAM blobs
+* Added a fix for AES-GCM use with Petalinux Xilinx
+* Fixed `wc_SignatureGenerate_ex` to not call verify twice
+* Fixed wolfCrypt FIPS DLL on Win32
+* Fixed TFM math library big-endian reading implementation when a zero length buffer is passed in.
+* Fixed NO_CERT configurations to build correctly.
+* Fixed ARM AES-GCM streaming assembly when --enable-opensslextra defined.
+* Added modulus checks to heap math implementation of mp_exptmod().
+* Fixed Windows assembly code to handle that certain XMM registers are non-volatile.
+* Aarch64 SP ECC implementation of sp_256_mont_dbl_4 has the register list for the assembly code fixed to include all used registers.
+* mp_sqrt_mod_prime fixed to limit the number of iterations of a loop to handle malicious non-prime values being passed in.
+* Ignore session ID's shorter than 32 bytes instead of erroring out
 
 For additional vulnerability information visit the vulnerability page at:
 https://www.wolfssl.com/docs/security-vulnerabilities/

File diff not shown because of its large size
+ 303 - 302
lib/wolfssl/wolfcrypt/src/aes.c


+ 0 - 2226
lib/wolfssl/wolfcrypt/src/aes_asm.S

@@ -1,2226 +0,0 @@
-/* aes_asm.S
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-/* This file is in at&t asm syntax, see .asm for intel syntax */
-
-/* See Intel Advanced Encryption Standard (AES) Instructions Set White Paper
- * by Intel Mobility Group, Israel Development Center, Israel Shay Gueron
- */
-
-#ifdef WOLFSSL_X86_64_BUILD
-
-/*
-AES_CBC_encrypt (const unsigned char *in,
-	unsigned char *out,
-	unsigned char ivec[16],
-	unsigned long length,
-	const unsigned char *KS,
-	int nr)
-*/
-#ifndef __APPLE__
-.globl AES_CBC_encrypt
-AES_CBC_encrypt:
-#else
-.globl _AES_CBC_encrypt
-_AES_CBC_encrypt:
-#endif
-# parameter 1: %rdi
-# parameter 2: %rsi
-# parameter 3: %rdx
-# parameter 4: %rcx
-# parameter 5: %r8
-# parameter 6: %r9d
-movq	%rcx, %r10
-shrq	$4, %rcx
-shlq	$60, %r10
-je	NO_PARTS
-addq	$1, %rcx
-NO_PARTS:
-subq	$16, %rsi
-movdqa	(%rdx), %xmm1
-LOOP:
-pxor	(%rdi), %xmm1
-pxor	(%r8), %xmm1
-addq	$16,%rsi
-addq	$16,%rdi
-cmpl	$12, %r9d
-aesenc	16(%r8),%xmm1
-aesenc	32(%r8),%xmm1
-aesenc	48(%r8),%xmm1
-aesenc	64(%r8),%xmm1
-aesenc	80(%r8),%xmm1
-aesenc	96(%r8),%xmm1
-aesenc	112(%r8),%xmm1
-aesenc	128(%r8),%xmm1
-aesenc	144(%r8),%xmm1
-movdqa	160(%r8),%xmm2
-jb	LAST
-cmpl	$14, %r9d
-
-aesenc	160(%r8),%xmm1
-aesenc	176(%r8),%xmm1
-movdqa	192(%r8),%xmm2
-jb	LAST
-aesenc	192(%r8),%xmm1
-aesenc	208(%r8),%xmm1
-movdqa	224(%r8),%xmm2
-LAST:
-decq	%rcx
-aesenclast %xmm2,%xmm1
-movdqu	%xmm1,(%rsi)
-jne	LOOP
-ret
-
-
-#if defined(WOLFSSL_AESNI_BY4)
-
-/*
-AES_CBC_decrypt_by4 (const unsigned char *in,
-  unsigned char *out,
-  unsigned char ivec[16],
-  unsigned long length,
-  const unsigned char *KS,
-  int nr)
-*/
-#ifndef __APPLE__
-.globl AES_CBC_decrypt_by4
-AES_CBC_decrypt_by4:
-#else
-.globl _AES_CBC_decrypt_by4
-_AES_CBC_decrypt_by4:
-#endif
-# parameter 1: %rdi
-# parameter 2: %rsi
-# parameter 3: %rdx
-# parameter 4: %rcx
-# parameter 5: %r8
-# parameter 6: %r9d
-
-        movq        %rcx, %r10
-        shrq        $4, %rcx
-        shlq        $60, %r10
-        je          DNO_PARTS_4
-        addq        $1, %rcx
-DNO_PARTS_4:
-        movq        %rcx, %r10
-        shlq        $62, %r10
-        shrq        $62, %r10
-        shrq        $2, %rcx
-        movdqu      (%rdx),%xmm5
-        je          DREMAINDER_4
-        subq        $64, %rsi
-DLOOP_4:
-        movdqu      (%rdi), %xmm1
-        movdqu      16(%rdi), %xmm2
-        movdqu      32(%rdi), %xmm3
-        movdqu      48(%rdi), %xmm4
-        movdqa      %xmm1, %xmm6
-        movdqa      %xmm2, %xmm7
-        movdqa      %xmm3, %xmm8
-        movdqa      %xmm4, %xmm15
-        movdqa      (%r8), %xmm9
-        movdqa      16(%r8), %xmm10
-        movdqa      32(%r8), %xmm11
-        movdqa      48(%r8), %xmm12
-        pxor        %xmm9, %xmm1
-        pxor        %xmm9, %xmm2
-        pxor        %xmm9, %xmm3
-        pxor        %xmm9, %xmm4
-        aesdec      %xmm10, %xmm1
-        aesdec      %xmm10, %xmm2
-        aesdec      %xmm10, %xmm3
-        aesdec      %xmm10, %xmm4
-        aesdec      %xmm11, %xmm1
-        aesdec      %xmm11, %xmm2
-        aesdec      %xmm11, %xmm3
-        aesdec      %xmm11, %xmm4
-        aesdec      %xmm12, %xmm1
-        aesdec      %xmm12, %xmm2
-        aesdec      %xmm12, %xmm3
-        aesdec      %xmm12, %xmm4
-        movdqa      64(%r8), %xmm9
-        movdqa      80(%r8), %xmm10
-        movdqa      96(%r8), %xmm11
-        movdqa      112(%r8), %xmm12
-        aesdec      %xmm9, %xmm1
-        aesdec      %xmm9, %xmm2
-        aesdec      %xmm9, %xmm3
-        aesdec      %xmm9, %xmm4
-        aesdec      %xmm10, %xmm1
-        aesdec      %xmm10, %xmm2
-        aesdec      %xmm10, %xmm3
-        aesdec      %xmm10, %xmm4
-        aesdec      %xmm11, %xmm1
-        aesdec      %xmm11, %xmm2
-        aesdec      %xmm11, %xmm3
-        aesdec      %xmm11, %xmm4
-        aesdec      %xmm12, %xmm1
-        aesdec      %xmm12, %xmm2
-        aesdec      %xmm12, %xmm3
-        aesdec      %xmm12, %xmm4
-        movdqa      128(%r8), %xmm9
-        movdqa      144(%r8), %xmm10
-        movdqa      160(%r8), %xmm11
-        cmpl        $12, %r9d
-        aesdec      %xmm9, %xmm1
-        aesdec      %xmm9, %xmm2
-        aesdec      %xmm9, %xmm3
-        aesdec      %xmm9, %xmm4
-        aesdec      %xmm10, %xmm1
-        aesdec      %xmm10, %xmm2
-        aesdec      %xmm10, %xmm3
-        aesdec      %xmm10, %xmm4
-        jb          DLAST_4
-        movdqa      160(%r8), %xmm9
-        movdqa      176(%r8), %xmm10
-        movdqa      192(%r8), %xmm11
-        cmpl        $14, %r9d
-        aesdec      %xmm9, %xmm1
-        aesdec      %xmm9, %xmm2
-        aesdec      %xmm9, %xmm3
-        aesdec      %xmm9, %xmm4
-        aesdec      %xmm10, %xmm1
-        aesdec      %xmm10, %xmm2
-        aesdec      %xmm10, %xmm3
-        aesdec      %xmm10, %xmm4
-        jb          DLAST_4
-        movdqa      192(%r8), %xmm9
-        movdqa      208(%r8), %xmm10
-        movdqa      224(%r8), %xmm11
-        aesdec      %xmm9, %xmm1
-        aesdec      %xmm9, %xmm2
-        aesdec      %xmm9, %xmm3
-        aesdec      %xmm9, %xmm4
-        aesdec      %xmm10, %xmm1
-        aesdec      %xmm10, %xmm2
-        aesdec      %xmm10, %xmm3
-        aesdec      %xmm10, %xmm4
-DLAST_4:
-        addq        $64, %rdi
-        addq        $64, %rsi
-        decq        %rcx
-        aesdeclast  %xmm11, %xmm1
-        aesdeclast  %xmm11, %xmm2
-        aesdeclast  %xmm11, %xmm3
-        aesdeclast  %xmm11, %xmm4
-        pxor        %xmm5, %xmm1
-        pxor        %xmm6, %xmm2
-        pxor        %xmm7, %xmm3
-        pxor        %xmm8, %xmm4
-        movdqu      %xmm1, (%rsi)
-        movdqu      %xmm2, 16(%rsi)
-        movdqu      %xmm3, 32(%rsi)
-        movdqu      %xmm4, 48(%rsi)
-        movdqa      %xmm15,%xmm5
-        jne         DLOOP_4
-        addq        $64, %rsi
-DREMAINDER_4:
-        cmpq        $0, %r10
-        je          DEND_4
-DLOOP_4_2:
-        movdqu      (%rdi), %xmm1
-        movdqa      %xmm1, %xmm15
-        addq        $16, %rdi
-        pxor        (%r8), %xmm1
-        movdqu      160(%r8), %xmm2
-        cmpl        $12, %r9d
-        aesdec      16(%r8), %xmm1
-        aesdec      32(%r8), %xmm1
-        aesdec      48(%r8), %xmm1
-        aesdec      64(%r8), %xmm1
-        aesdec      80(%r8), %xmm1
-        aesdec      96(%r8), %xmm1
-        aesdec      112(%r8), %xmm1
-        aesdec      128(%r8), %xmm1
-        aesdec      144(%r8), %xmm1
-        jb          DLAST_4_2
-        movdqu      192(%r8), %xmm2
-        cmpl        $14, %r9d
-        aesdec      160(%r8), %xmm1
-        aesdec      176(%r8), %xmm1
-        jb          DLAST_4_2
-        movdqu      224(%r8), %xmm2
-        aesdec      192(%r8), %xmm1
-        aesdec      208(%r8), %xmm1
-DLAST_4_2:
-        aesdeclast  %xmm2, %xmm1
-        pxor        %xmm5, %xmm1
-        movdqa      %xmm15, %xmm5
-        movdqu      %xmm1, (%rsi)
-        addq        $16, %rsi
-        decq        %r10
-        jne         DLOOP_4_2
-DEND_4:
-        ret
-
-#elif defined(WOLFSSL_AESNI_BY6)
-
-/*
-AES_CBC_decrypt_by6 (const unsigned char *in,
-  unsigned char *out,
-  unsigned char ivec[16],
-  unsigned long length,
-  const unsigned char *KS,
-  int nr)
-*/
-#ifndef __APPLE__
-.globl AES_CBC_decrypt_by6
-AES_CBC_decrypt_by6:
-#else
-.globl _AES_CBC_decrypt_by6
-_AES_CBC_decrypt_by6:
-#endif
-# parameter 1: %rdi - in
-# parameter 2: %rsi - out
-# parameter 3: %rdx - ivec
-# parameter 4: %rcx - length
-# parameter 5: %r8  - KS
-# parameter 6: %r9d - nr
-
-        movq        %rcx, %r10
-        shrq        $4, %rcx
-        shlq        $60, %r10
-        je          DNO_PARTS_6
-        addq        $1, %rcx
-DNO_PARTS_6:
-        movq        %rax, %r12
-        movq        %rdx, %r13
-        movq        %rbx, %r14
-        movq        $0, %rdx
-        movq        %rcx, %rax
-        movq        $6, %rbx
-        div         %rbx
-        movq        %rax, %rcx
-        movq        %rdx, %r10
-        movq        %r12, %rax
-        movq        %r13, %rdx
-        movq        %r14, %rbx
-        cmpq        $0, %rcx
-        movdqu      (%rdx), %xmm7
-        je          DREMAINDER_6
-        subq        $96, %rsi
-DLOOP_6:
-        movdqu      (%rdi), %xmm1
-        movdqu      16(%rdi), %xmm2
-        movdqu      32(%rdi), %xmm3
-        movdqu      48(%rdi), %xmm4
-        movdqu      64(%rdi), %xmm5
-        movdqu      80(%rdi), %xmm6
-        movdqa      (%r8), %xmm8
-        movdqa      16(%r8), %xmm9
-        movdqa      32(%r8), %xmm10
-        movdqa      48(%r8), %xmm11
-        pxor        %xmm8, %xmm1
-        pxor        %xmm8, %xmm2
-        pxor        %xmm8, %xmm3
-        pxor        %xmm8, %xmm4
-        pxor        %xmm8, %xmm5
-        pxor        %xmm8, %xmm6
-        aesdec      %xmm9, %xmm1
-        aesdec      %xmm9, %xmm2
-        aesdec      %xmm9, %xmm3
-        aesdec      %xmm9, %xmm4
-        aesdec      %xmm9, %xmm5
-        aesdec      %xmm9, %xmm6
-        aesdec      %xmm10, %xmm1
-        aesdec      %xmm10, %xmm2
-        aesdec      %xmm10, %xmm3
-        aesdec      %xmm10, %xmm4
-        aesdec      %xmm10, %xmm5
-        aesdec      %xmm10, %xmm6
-        aesdec      %xmm11, %xmm1
-        aesdec      %xmm11, %xmm2
-        aesdec      %xmm11, %xmm3
-        aesdec      %xmm11, %xmm4
-        aesdec      %xmm11, %xmm5
-        aesdec      %xmm11, %xmm6
-        movdqa      64(%r8), %xmm8
-        movdqa      80(%r8), %xmm9
-        movdqa      96(%r8), %xmm10
-        movdqa      112(%r8), %xmm11
-        aesdec      %xmm8, %xmm1
-        aesdec      %xmm8, %xmm2
-        aesdec      %xmm8, %xmm3
-        aesdec      %xmm8, %xmm4
-        aesdec      %xmm8, %xmm5
-        aesdec      %xmm8, %xmm6
-        aesdec      %xmm9, %xmm1
-        aesdec      %xmm9, %xmm2
-        aesdec      %xmm9, %xmm3
-        aesdec      %xmm9, %xmm4
-        aesdec      %xmm9, %xmm5
-        aesdec      %xmm9, %xmm6
-        aesdec      %xmm10, %xmm1
-        aesdec      %xmm10, %xmm2
-        aesdec      %xmm10, %xmm3
-        aesdec      %xmm10, %xmm4
-        aesdec      %xmm10, %xmm5
-        aesdec      %xmm10, %xmm6
-        aesdec      %xmm11, %xmm1
-        aesdec      %xmm11, %xmm2
-        aesdec      %xmm11, %xmm3
-        aesdec      %xmm11, %xmm4
-        aesdec      %xmm11, %xmm5
-        aesdec      %xmm11, %xmm6
-        movdqa      128(%r8), %xmm8
-        movdqa      144(%r8), %xmm9
-        movdqa      160(%r8), %xmm10
-        cmpl        $12, %r9d
-        aesdec      %xmm8, %xmm1
-        aesdec      %xmm8, %xmm2
-        aesdec      %xmm8, %xmm3
-        aesdec      %xmm8, %xmm4
-        aesdec      %xmm8, %xmm5
-        aesdec      %xmm8, %xmm6
-        aesdec      %xmm9, %xmm1
-        aesdec      %xmm9, %xmm2
-        aesdec      %xmm9, %xmm3
-        aesdec      %xmm9, %xmm4
-        aesdec      %xmm9, %xmm5
-        aesdec      %xmm9, %xmm6
-        jb          DLAST_6
-        movdqa      160(%r8), %xmm8
-        movdqa      176(%r8), %xmm9
-        movdqa      192(%r8), %xmm10
-        cmpl        $14, %r9d
-        aesdec      %xmm8, %xmm1
-        aesdec      %xmm8, %xmm2
-        aesdec      %xmm8, %xmm3
-        aesdec      %xmm8, %xmm4
-        aesdec      %xmm8, %xmm5
-        aesdec      %xmm8, %xmm6
-        aesdec      %xmm9, %xmm1
-        aesdec      %xmm9, %xmm2
-        aesdec      %xmm9, %xmm3
-        aesdec      %xmm9, %xmm4
-        aesdec      %xmm9, %xmm5
-        aesdec      %xmm9, %xmm6
-        jb          DLAST_6
-        movdqa      192(%r8), %xmm8
-        movdqa      208(%r8), %xmm9
-        movdqa      224(%r8), %xmm10
-        aesdec      %xmm8, %xmm1
-        aesdec      %xmm8, %xmm2
-        aesdec      %xmm8, %xmm3
-        aesdec      %xmm8, %xmm4
-        aesdec      %xmm8, %xmm5
-        aesdec      %xmm8, %xmm6
-        aesdec      %xmm9, %xmm1
-        aesdec      %xmm9, %xmm2
-        aesdec      %xmm9, %xmm3
-        aesdec      %xmm9, %xmm4
-        aesdec      %xmm9, %xmm5
-        aesdec      %xmm9, %xmm6
-DLAST_6:
-        addq        $96, %rsi
-        aesdeclast  %xmm10, %xmm1
-        aesdeclast  %xmm10, %xmm2
-        aesdeclast  %xmm10, %xmm3
-        aesdeclast  %xmm10, %xmm4
-        aesdeclast  %xmm10, %xmm5
-        aesdeclast  %xmm10, %xmm6
-        movdqu      (%rdi), %xmm8
-        movdqu      16(%rdi), %xmm9
-        movdqu      32(%rdi), %xmm10
-        movdqu      48(%rdi), %xmm11
-        movdqu      64(%rdi), %xmm12
-        movdqu      80(%rdi), %xmm13
-        pxor        %xmm7, %xmm1
-        pxor        %xmm8, %xmm2
-        pxor        %xmm9, %xmm3
-        pxor        %xmm10, %xmm4
-        pxor        %xmm11, %xmm5
-        pxor        %xmm12, %xmm6
-        movdqu      %xmm13, %xmm7
-        movdqu      %xmm1, (%rsi)
-        movdqu      %xmm2, 16(%rsi)
-        movdqu      %xmm3, 32(%rsi)
-        movdqu      %xmm4, 48(%rsi)
-        movdqu      %xmm5, 64(%rsi)
-        movdqu      %xmm6, 80(%rsi)
-        addq        $96, %rdi
-        decq        %rcx
-        jne         DLOOP_6
-        addq        $96, %rsi
-DREMAINDER_6:
-        cmpq        $0, %r10
-        je          DEND_6
-DLOOP_6_2:
-        movdqu      (%rdi), %xmm1
-        movdqa      %xmm1, %xmm10
-        addq        $16, %rdi
-        pxor        (%r8), %xmm1
-        movdqu      160(%r8), %xmm2
-        cmpl        $12, %r9d
-        aesdec      16(%r8), %xmm1
-        aesdec      32(%r8), %xmm1
-        aesdec      48(%r8), %xmm1
-        aesdec      64(%r8), %xmm1
-        aesdec      80(%r8), %xmm1
-        aesdec      96(%r8), %xmm1
-        aesdec      112(%r8), %xmm1
-        aesdec      128(%r8), %xmm1
-        aesdec      144(%r8), %xmm1
-        jb          DLAST_6_2
-        movdqu      192(%r8), %xmm2
-        cmpl        $14, %r9d
-        aesdec      160(%r8), %xmm1
-        aesdec      176(%r8), %xmm1
-        jb          DLAST_6_2
-        movdqu      224(%r8), %xmm2
-        aesdec      192(%r8), %xmm1
-        aesdec      208(%r8), %xmm1
-DLAST_6_2:
-        aesdeclast  %xmm2, %xmm1
-        pxor        %xmm7, %xmm1
-        movdqa      %xmm10, %xmm7
-        movdqu      %xmm1, (%rsi)
-        addq        $16, %rsi
-        decq        %r10
-        jne         DLOOP_6_2
-DEND_6:
-        ret
-
-#else /* WOLFSSL_AESNI_BYx */
-
-/*
-AES_CBC_decrypt_by8 (const unsigned char *in,
-  unsigned char *out,
-  unsigned char ivec[16],
-  unsigned long length,
-  const unsigned char *KS,
-  int nr)
-*/
-#ifndef __APPLE__
-.globl AES_CBC_decrypt_by8
-AES_CBC_decrypt_by8:
-#else
-.globl _AES_CBC_decrypt_by8
-_AES_CBC_decrypt_by8:
-#endif
-# parameter 1: %rdi - in
-# parameter 2: %rsi - out
-# parameter 3: %rdx - ivec
-# parameter 4: %rcx - length
-# parameter 5: %r8  - KS
-# parameter 6: %r9d - nr
-
-        movq        %rcx, %r10
-        shrq        $4, %rcx
-        shlq        $60, %r10
-        je          DNO_PARTS_8
-        addq        $1, %rcx
-DNO_PARTS_8:
-        movq        %rcx, %r10
-        shlq        $61, %r10
-        shrq        $61, %r10
-        shrq        $3, %rcx
-        movdqu      (%rdx), %xmm9
-        je          DREMAINDER_8
-        subq        $128, %rsi
-DLOOP_8:
-        movdqu      (%rdi), %xmm1
-        movdqu      16(%rdi), %xmm2
-        movdqu      32(%rdi), %xmm3
-        movdqu      48(%rdi), %xmm4
-        movdqu      64(%rdi), %xmm5
-        movdqu      80(%rdi), %xmm6
-        movdqu      96(%rdi), %xmm7
-        movdqu      112(%rdi), %xmm8
-        movdqa      (%r8), %xmm10
-        movdqa      16(%r8), %xmm11
-        movdqa      32(%r8), %xmm12
-        movdqa      48(%r8), %xmm13
-        pxor        %xmm10, %xmm1
-        pxor        %xmm10, %xmm2
-        pxor        %xmm10, %xmm3
-        pxor        %xmm10, %xmm4
-        pxor        %xmm10, %xmm5
-        pxor        %xmm10, %xmm6
-        pxor        %xmm10, %xmm7
-        pxor        %xmm10, %xmm8
-        aesdec      %xmm11, %xmm1
-        aesdec      %xmm11, %xmm2
-        aesdec      %xmm11, %xmm3
-        aesdec      %xmm11, %xmm4
-        aesdec      %xmm11, %xmm5
-        aesdec      %xmm11, %xmm6
-        aesdec      %xmm11, %xmm7
-        aesdec      %xmm11, %xmm8
-        aesdec      %xmm12, %xmm1
-        aesdec      %xmm12, %xmm2
-        aesdec      %xmm12, %xmm3
-        aesdec      %xmm12, %xmm4
-        aesdec      %xmm12, %xmm5
-        aesdec      %xmm12, %xmm6
-        aesdec      %xmm12, %xmm7
-        aesdec      %xmm12, %xmm8
-        aesdec      %xmm13, %xmm1
-        aesdec      %xmm13, %xmm2
-        aesdec      %xmm13, %xmm3
-        aesdec      %xmm13, %xmm4
-        aesdec      %xmm13, %xmm5
-        aesdec      %xmm13, %xmm6
-        aesdec      %xmm13, %xmm7
-        aesdec      %xmm13, %xmm8
-        movdqa      64(%r8), %xmm10
-        movdqa      80(%r8), %xmm11
-        movdqa      96(%r8), %xmm12
-        movdqa      112(%r8), %xmm13
-        aesdec      %xmm10, %xmm1
-        aesdec      %xmm10, %xmm2
-        aesdec      %xmm10, %xmm3
-        aesdec      %xmm10, %xmm4
-        aesdec      %xmm10, %xmm5
-        aesdec      %xmm10, %xmm6
-        aesdec      %xmm10, %xmm7
-        aesdec      %xmm10, %xmm8
-        aesdec      %xmm11, %xmm1
-        aesdec      %xmm11, %xmm2
-        aesdec      %xmm11, %xmm3
-        aesdec      %xmm11, %xmm4
-        aesdec      %xmm11, %xmm5
-        aesdec      %xmm11, %xmm6
-        aesdec      %xmm11, %xmm7
-        aesdec      %xmm11, %xmm8
-        aesdec      %xmm12, %xmm1
-        aesdec      %xmm12, %xmm2
-        aesdec      %xmm12, %xmm3
-        aesdec      %xmm12, %xmm4
-        aesdec      %xmm12, %xmm5
-        aesdec      %xmm12, %xmm6
-        aesdec      %xmm12, %xmm7
-        aesdec      %xmm12, %xmm8
-        aesdec      %xmm13, %xmm1
-        aesdec      %xmm13, %xmm2
-        aesdec      %xmm13, %xmm3
-        aesdec      %xmm13, %xmm4
-        aesdec      %xmm13, %xmm5
-        aesdec      %xmm13, %xmm6
-        aesdec      %xmm13, %xmm7
-        aesdec      %xmm13, %xmm8
-        movdqa      128(%r8), %xmm10
-        movdqa      144(%r8), %xmm11
-        movdqa      160(%r8), %xmm12
-        cmpl        $12, %r9d
-        aesdec      %xmm10, %xmm1
-        aesdec      %xmm10, %xmm2
-        aesdec      %xmm10, %xmm3
-        aesdec      %xmm10, %xmm4
-        aesdec      %xmm10, %xmm5
-        aesdec      %xmm10, %xmm6
-        aesdec      %xmm10, %xmm7
-        aesdec      %xmm10, %xmm8
-        aesdec      %xmm11, %xmm1
-        aesdec      %xmm11, %xmm2
-        aesdec      %xmm11, %xmm3
-        aesdec      %xmm11, %xmm4
-        aesdec      %xmm11, %xmm5
-        aesdec      %xmm11, %xmm6
-        aesdec      %xmm11, %xmm7
-        aesdec      %xmm11, %xmm8
-        jb          DLAST_8
-        movdqa      160(%r8), %xmm10
-        movdqa      176(%r8), %xmm11
-        movdqa      192(%r8), %xmm12
-        cmpl        $14, %r9d
-        aesdec      %xmm10, %xmm1
-        aesdec      %xmm10, %xmm2
-        aesdec      %xmm10, %xmm3
-        aesdec      %xmm10, %xmm4
-        aesdec      %xmm10, %xmm5
-        aesdec      %xmm10, %xmm6
-        aesdec      %xmm10, %xmm7
-        aesdec      %xmm10, %xmm8
-        aesdec      %xmm11, %xmm1
-        aesdec      %xmm11, %xmm2
-        aesdec      %xmm11, %xmm3
-        aesdec      %xmm11, %xmm4
-        aesdec      %xmm11, %xmm5
-        aesdec      %xmm11, %xmm6
-        aesdec      %xmm11, %xmm7
-        aesdec      %xmm11, %xmm8
-        jb          DLAST_8
-        movdqa      192(%r8), %xmm10
-        movdqa      208(%r8), %xmm11
-        movdqa      224(%r8), %xmm12
-        aesdec      %xmm10, %xmm1
-        aesdec      %xmm10, %xmm2
-        aesdec      %xmm10, %xmm3
-        aesdec      %xmm10, %xmm4
-        aesdec      %xmm10, %xmm5
-        aesdec      %xmm10, %xmm6
-        aesdec      %xmm10, %xmm7
-        aesdec      %xmm10, %xmm8
-        aesdec      %xmm11, %xmm1
-        aesdec      %xmm11, %xmm2
-        aesdec      %xmm11, %xmm3
-        aesdec      %xmm11, %xmm4
-        aesdec      %xmm11, %xmm5
-        aesdec      %xmm11, %xmm6
-        aesdec      %xmm11, %xmm7
-        aesdec      %xmm11, %xmm8
-DLAST_8:
-        addq        $128, %rsi
-        aesdeclast  %xmm12, %xmm1
-        aesdeclast  %xmm12, %xmm2
-        aesdeclast  %xmm12, %xmm3
-        aesdeclast  %xmm12, %xmm4
-        aesdeclast  %xmm12, %xmm5
-        aesdeclast  %xmm12, %xmm6
-        aesdeclast  %xmm12, %xmm7
-        aesdeclast  %xmm12, %xmm8
-        movdqu      (%rdi), %xmm10
-        movdqu      16(%rdi), %xmm11
-        movdqu      32(%rdi), %xmm12
-        movdqu      48(%rdi), %xmm13
-        pxor        %xmm9, %xmm1
-        pxor        %xmm10, %xmm2
-        pxor        %xmm11, %xmm3
-        pxor        %xmm12, %xmm4
-        pxor        %xmm13, %xmm5
-        movdqu      64(%rdi), %xmm10
-        movdqu      80(%rdi), %xmm11
-        movdqu      96(%rdi), %xmm12
-        movdqu      112(%rdi), %xmm9
-        pxor        %xmm10, %xmm6
-        pxor        %xmm11, %xmm7
-        pxor        %xmm12, %xmm8
-        movdqu      %xmm1, (%rsi)
-        movdqu      %xmm2, 16(%rsi)
-        movdqu      %xmm3, 32(%rsi)
-        movdqu      %xmm4, 48(%rsi)
-        movdqu      %xmm5, 64(%rsi)
-        movdqu      %xmm6, 80(%rsi)
-        movdqu      %xmm7, 96(%rsi)
-        movdqu      %xmm8, 112(%rsi)
-        addq        $128, %rdi
-        decq        %rcx
-        jne         DLOOP_8
-        addq        $128, %rsi
-DREMAINDER_8:
-        cmpq        $0, %r10
-        je          DEND_8
-DLOOP_8_2:
-        movdqu      (%rdi), %xmm1
-        movdqa      %xmm1, %xmm10
-        addq        $16, %rdi
-        pxor        (%r8), %xmm1
-        movdqu      160(%r8), %xmm2
-        cmpl        $12, %r9d
-        aesdec      16(%r8), %xmm1
-        aesdec      32(%r8), %xmm1
-        aesdec      48(%r8), %xmm1
-        aesdec      64(%r8), %xmm1
-        aesdec      80(%r8), %xmm1
-        aesdec      96(%r8), %xmm1
-        aesdec      112(%r8), %xmm1
-        aesdec      128(%r8), %xmm1
-        aesdec      144(%r8), %xmm1
-        jb          DLAST_8_2
-        movdqu      192(%r8), %xmm2
-        cmpl        $14, %r9d
-        aesdec      160(%r8), %xmm1
-        aesdec      176(%r8), %xmm1
-        jb          DLAST_8_2
-        movdqu      224(%r8), %xmm2
-        aesdec      192(%r8), %xmm1
-        aesdec      208(%r8), %xmm1
-DLAST_8_2:
-        aesdeclast  %xmm2, %xmm1
-        pxor        %xmm9, %xmm1
-        movdqa      %xmm10, %xmm9
-        movdqu      %xmm1, (%rsi)
-        addq        $16, %rsi
-        decq        %r10
-        jne         DLOOP_8_2
-DEND_8:
-        ret
-
-#endif /* WOLFSSL_AESNI_BYx */
-
-
-/*
-AES_ECB_encrypt (const unsigned char *in,
-	unsigned char *out,
-	unsigned long length,
-	const unsigned char *KS,
-	int nr)
-*/
-#ifndef __APPLE__
-.globl AES_ECB_encrypt
-AES_ECB_encrypt:
-#else
-.globl _AES_ECB_encrypt
-_AES_ECB_encrypt:
-#endif
-# parameter 1: %rdi
-# parameter 2: %rsi
-# parameter 3: %rdx
-# parameter 4: %rcx
-# parameter 5: %r8d
-        movq    %rdx, %r10
-        shrq    $4, %rdx
-        shlq    $60, %r10
-        je      EECB_NO_PARTS_4
-        addq    $1, %rdx
-EECB_NO_PARTS_4:
-        movq    %rdx, %r10
-        shlq    $62, %r10
-        shrq    $62, %r10
-        shrq    $2, %rdx
-        je      EECB_REMAINDER_4
-        subq    $64, %rsi
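-        # main loop: four 16-byte blocks per iteration; %rsi starts
-        # 64 bytes low because the loop advances it before storing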
-EECB_LOOP_4:
-        movdqu  (%rdi), %xmm1
-        movdqu  16(%rdi), %xmm2
-        movdqu  32(%rdi), %xmm3
-        movdqu  48(%rdi), %xmm4
-        movdqa  (%rcx), %xmm9
-        movdqa  16(%rcx), %xmm10
-        movdqa  32(%rcx), %xmm11
-        movdqa  48(%rcx), %xmm12
-        pxor    %xmm9, %xmm1
-        pxor    %xmm9, %xmm2
-        pxor    %xmm9, %xmm3
-        pxor    %xmm9, %xmm4
-        aesenc  %xmm10, %xmm1
-        aesenc  %xmm10, %xmm2
-        aesenc  %xmm10, %xmm3
-        aesenc  %xmm10, %xmm4
-        aesenc  %xmm11, %xmm1
-        aesenc  %xmm11, %xmm2
-        aesenc  %xmm11, %xmm3
-        aesenc  %xmm11, %xmm4
-        aesenc  %xmm12, %xmm1
-        aesenc  %xmm12, %xmm2
-        aesenc  %xmm12, %xmm3
-        aesenc  %xmm12, %xmm4
-        movdqa  64(%rcx), %xmm9
-        movdqa  80(%rcx), %xmm10
-        movdqa  96(%rcx), %xmm11
-        movdqa  112(%rcx), %xmm12
-        aesenc  %xmm9, %xmm1
-        aesenc  %xmm9, %xmm2
-        aesenc  %xmm9, %xmm3
-        aesenc  %xmm9, %xmm4
-        aesenc  %xmm10, %xmm1
-        aesenc  %xmm10, %xmm2
-        aesenc  %xmm10, %xmm3
-        aesenc  %xmm10, %xmm4
-        aesenc  %xmm11, %xmm1
-        aesenc  %xmm11, %xmm2
-        aesenc  %xmm11, %xmm3
-        aesenc  %xmm11, %xmm4
-        aesenc  %xmm12, %xmm1
-        aesenc  %xmm12, %xmm2
-        aesenc  %xmm12, %xmm3
-        aesenc  %xmm12, %xmm4
-        movdqa  128(%rcx), %xmm9
-        movdqa  144(%rcx), %xmm10
-        movdqa  160(%rcx), %xmm11
-        cmpl    $12, %r8d
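-        # %r8d holds nr: 10/12/14 rounds for AES-128/192/256.  The jb
-        # below consumes these flags, so shorter keys branch straight
-        # to the aesenclast at EECB_LAST_4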
-        aesenc  %xmm9, %xmm1
-        aesenc  %xmm9, %xmm2
-        aesenc  %xmm9, %xmm3
-        aesenc  %xmm9, %xmm4
-        aesenc  %xmm10, %xmm1
-        aesenc  %xmm10, %xmm2
-        aesenc  %xmm10, %xmm3
-        aesenc  %xmm10, %xmm4
-        jb      EECB_LAST_4
-        movdqa  160(%rcx), %xmm9
-        movdqa  176(%rcx), %xmm10
-        movdqa  192(%rcx), %xmm11
-        cmpl    $14, %r8d
-        aesenc  %xmm9, %xmm1
-        aesenc  %xmm9, %xmm2
-        aesenc  %xmm9, %xmm3
-        aesenc  %xmm9, %xmm4
-        aesenc  %xmm10, %xmm1
-        aesenc  %xmm10, %xmm2
-        aesenc  %xmm10, %xmm3
-        aesenc  %xmm10, %xmm4
-        jb      EECB_LAST_4
-        movdqa  192(%rcx), %xmm9
-        movdqa  208(%rcx), %xmm10
-        movdqa  224(%rcx), %xmm11
-        aesenc  %xmm9, %xmm1
-        aesenc  %xmm9, %xmm2
-        aesenc  %xmm9, %xmm3
-        aesenc  %xmm9, %xmm4
-        aesenc  %xmm10, %xmm1
-        aesenc  %xmm10, %xmm2
-        aesenc  %xmm10, %xmm3
-        aesenc  %xmm10, %xmm4
-EECB_LAST_4:
-        addq    $64, %rdi
-        addq    $64, %rsi
-        decq    %rdx
-        aesenclast %xmm11, %xmm1
-        aesenclast %xmm11, %xmm2
-        aesenclast %xmm11, %xmm3
-        aesenclast %xmm11, %xmm4
-        movdqu  %xmm1, (%rsi)
-        movdqu  %xmm2, 16(%rsi)
-        movdqu  %xmm3, 32(%rsi)
-        movdqu  %xmm4, 48(%rsi)
-        jne     EECB_LOOP_4
-        addq    $64, %rsi
-EECB_REMAINDER_4:
-        cmpq    $0, %r10
-        je      EECB_END_4
-EECB_LOOP_4_2:
-        movdqu  (%rdi), %xmm1
-        addq    $16, %rdi
-        pxor    (%rcx), %xmm1
-        movdqu  160(%rcx), %xmm2
-        aesenc  16(%rcx), %xmm1
-        aesenc  32(%rcx), %xmm1
-        aesenc  48(%rcx), %xmm1
-        aesenc  64(%rcx), %xmm1
-        aesenc  80(%rcx), %xmm1
-        aesenc  96(%rcx), %xmm1
-        aesenc  112(%rcx), %xmm1
-        aesenc  128(%rcx), %xmm1
-        aesenc  144(%rcx), %xmm1
-        cmpl    $12, %r8d
-        jb      EECB_LAST_4_2
-        movdqu  192(%rcx), %xmm2
-        aesenc  160(%rcx), %xmm1
-        aesenc  176(%rcx), %xmm1
-        cmpl    $14, %r8d
-        jb      EECB_LAST_4_2
-        movdqu  224(%rcx), %xmm2
-        aesenc  192(%rcx), %xmm1
-        aesenc  208(%rcx), %xmm1
-EECB_LAST_4_2:
-        aesenclast %xmm2, %xmm1
-        movdqu  %xmm1, (%rsi)
-        addq    $16, %rsi
-        decq    %r10
-        jne     EECB_LOOP_4_2
-EECB_END_4:
-        ret
-
-
-/*
-AES_ECB_decrypt (const unsigned char *in,
-  unsigned char *out,
-  unsigned long length,
-  const unsigned char *KS,
-  int nr)
-*/
-#ifndef __APPLE__
-.globl AES_ECB_decrypt
-AES_ECB_decrypt:
-#else
-.globl _AES_ECB_decrypt
-_AES_ECB_decrypt:
-#endif
-# parameter 1: %rdi
-# parameter 2: %rsi
-# parameter 3: %rdx
-# parameter 4: %rcx
-# parameter 5: %r8d
-
-        movq    %rdx, %r10
-        shrq    $4, %rdx
-        shlq    $60, %r10
-        je      DECB_NO_PARTS_4
-        addq    $1, %rdx
-DECB_NO_PARTS_4:
-        movq    %rdx, %r10
-        shlq    $62, %r10
-        shrq    $62, %r10
-        shrq    $2, %rdx
-        je      DECB_REMAINDER_4
-        subq    $64, %rsi
-DECB_LOOP_4:
-        movdqu  (%rdi), %xmm1
-        movdqu  16(%rdi), %xmm2
-        movdqu  32(%rdi), %xmm3
-        movdqu  48(%rdi), %xmm4
-        movdqa  (%rcx), %xmm9
-        movdqa  16(%rcx), %xmm10
-        movdqa  32(%rcx), %xmm11
-        movdqa  48(%rcx), %xmm12
-        pxor    %xmm9, %xmm1
-        pxor    %xmm9, %xmm2
-        pxor    %xmm9, %xmm3
-        pxor    %xmm9, %xmm4
-        aesdec  %xmm10, %xmm1
-        aesdec  %xmm10, %xmm2
-        aesdec  %xmm10, %xmm3
-        aesdec  %xmm10, %xmm4
-        aesdec  %xmm11, %xmm1
-        aesdec  %xmm11, %xmm2
-        aesdec  %xmm11, %xmm3
-        aesdec  %xmm11, %xmm4
-        aesdec  %xmm12, %xmm1
-        aesdec  %xmm12, %xmm2
-        aesdec  %xmm12, %xmm3
-        aesdec  %xmm12, %xmm4
-        movdqa  64(%rcx), %xmm9
-        movdqa  80(%rcx), %xmm10
-        movdqa  96(%rcx), %xmm11
-        movdqa  112(%rcx), %xmm12
-        aesdec  %xmm9, %xmm1
-        aesdec  %xmm9, %xmm2
-        aesdec  %xmm9, %xmm3
-        aesdec  %xmm9, %xmm4
-        aesdec  %xmm10, %xmm1
-        aesdec  %xmm10, %xmm2
-        aesdec  %xmm10, %xmm3
-        aesdec  %xmm10, %xmm4
-        aesdec  %xmm11, %xmm1
-        aesdec  %xmm11, %xmm2
-        aesdec  %xmm11, %xmm3
-        aesdec  %xmm11, %xmm4
-        aesdec  %xmm12, %xmm1
-        aesdec  %xmm12, %xmm2
-        aesdec  %xmm12, %xmm3
-        aesdec  %xmm12, %xmm4
-        movdqa  128(%rcx), %xmm9
-        movdqa  144(%rcx), %xmm10
-        movdqa  160(%rcx), %xmm11
-        cmpl    $12, %r8d
-        aesdec  %xmm9, %xmm1
-        aesdec  %xmm9, %xmm2
-        aesdec  %xmm9, %xmm3
-        aesdec  %xmm9, %xmm4
-        aesdec  %xmm10, %xmm1
-        aesdec  %xmm10, %xmm2
-        aesdec  %xmm10, %xmm3
-        aesdec  %xmm10, %xmm4
-        jb      DECB_LAST_4
-        movdqa  160(%rcx), %xmm9
-        movdqa  176(%rcx), %xmm10
-        movdqa  192(%rcx), %xmm11
-        cmpl    $14, %r8d
-        aesdec  %xmm9, %xmm1
-        aesdec  %xmm9, %xmm2
-        aesdec  %xmm9, %xmm3
-        aesdec  %xmm9, %xmm4
-        aesdec  %xmm10, %xmm1
-        aesdec  %xmm10, %xmm2
-        aesdec  %xmm10, %xmm3
-        aesdec  %xmm10, %xmm4
-        jb      DECB_LAST_4
-        movdqa  192(%rcx), %xmm9
-        movdqa  208(%rcx), %xmm10
-        movdqa  224(%rcx), %xmm11
-        aesdec  %xmm9, %xmm1
-        aesdec  %xmm9, %xmm2
-        aesdec  %xmm9, %xmm3
-        aesdec  %xmm9, %xmm4
-        aesdec  %xmm10, %xmm1
-        aesdec  %xmm10, %xmm2
-        aesdec  %xmm10, %xmm3
-        aesdec  %xmm10, %xmm4
-DECB_LAST_4:
-        addq    $64, %rdi
-        addq    $64, %rsi
-        decq    %rdx
-        aesdeclast %xmm11, %xmm1
-        aesdeclast %xmm11, %xmm2
-        aesdeclast %xmm11, %xmm3
-        aesdeclast %xmm11, %xmm4
-        movdqu  %xmm1, (%rsi)
-        movdqu  %xmm2, 16(%rsi)
-        movdqu  %xmm3, 32(%rsi)
-        movdqu  %xmm4, 48(%rsi)
-        jne     DECB_LOOP_4
-        addq    $64, %rsi
-DECB_REMAINDER_4:
-        cmpq    $0, %r10
-        je      DECB_END_4
-DECB_LOOP_4_2:
-        movdqu  (%rdi), %xmm1
-        addq    $16, %rdi
-        pxor    (%rcx), %xmm1
-        movdqu  160(%rcx), %xmm2
-        cmpl    $12, %r8d
-        aesdec  16(%rcx), %xmm1
-        aesdec  32(%rcx), %xmm1
-        aesdec  48(%rcx), %xmm1
-        aesdec  64(%rcx), %xmm1
-        aesdec  80(%rcx), %xmm1
-        aesdec  96(%rcx), %xmm1
-        aesdec  112(%rcx), %xmm1
-        aesdec  128(%rcx), %xmm1
-        aesdec  144(%rcx), %xmm1
-        jb      DECB_LAST_4_2
-        cmpl    $14, %r8d
-        movdqu  192(%rcx), %xmm2
-        aesdec  160(%rcx), %xmm1
-        aesdec  176(%rcx), %xmm1
-        jb      DECB_LAST_4_2
-        movdqu  224(%rcx), %xmm2
-        aesdec  192(%rcx), %xmm1
-        aesdec  208(%rcx), %xmm1
-DECB_LAST_4_2:
-        aesdeclast %xmm2, %xmm1
-        movdqu  %xmm1, (%rsi)
-        addq    $16, %rsi
-        decq    %r10
-        jne     DECB_LOOP_4_2
-DECB_END_4:
-        ret
-
-
-
-
-/*
-void AES_128_Key_Expansion(const unsigned char* userkey,
-   unsigned char* key_schedule);
-*/
-.align  16,0x90
-#ifndef __APPLE__
-.globl AES_128_Key_Expansion
-AES_128_Key_Expansion:
-#else
-.globl _AES_128_Key_Expansion
-_AES_128_Key_Expansion:
-#endif
-# parameter 1: %rdi
-# parameter 2: %rsi
-movl    $10, 240(%rsi)
-
-movdqu  (%rdi), %xmm1
-movdqa    %xmm1, (%rsi)
-
-
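-# Ten aeskeygenassist/PREPARE_ROUNDKEY_128 pairs derive round keys 1-10;
-# the immediates 1, 2, 4, ..., 0x1b, 0x36 are the AES-128 Rcon values.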
-ASSISTS:
-aeskeygenassist $1, %xmm1, %xmm2
-call PREPARE_ROUNDKEY_128
-movdqa %xmm1, 16(%rsi)
-aeskeygenassist $2, %xmm1, %xmm2
-call PREPARE_ROUNDKEY_128
-movdqa %xmm1, 32(%rsi)
-aeskeygenassist $4, %xmm1, %xmm2
-call PREPARE_ROUNDKEY_128
-movdqa %xmm1, 48(%rsi)
-aeskeygenassist $8, %xmm1, %xmm2
-call PREPARE_ROUNDKEY_128
-movdqa %xmm1, 64(%rsi)
-aeskeygenassist $16, %xmm1, %xmm2
-call PREPARE_ROUNDKEY_128
-movdqa %xmm1, 80(%rsi)
-aeskeygenassist $32, %xmm1, %xmm2
-call PREPARE_ROUNDKEY_128
-movdqa %xmm1, 96(%rsi)
-aeskeygenassist $64, %xmm1, %xmm2
-call PREPARE_ROUNDKEY_128
-movdqa %xmm1, 112(%rsi)
-aeskeygenassist $0x80, %xmm1, %xmm2
-call PREPARE_ROUNDKEY_128
-movdqa %xmm1, 128(%rsi)
-aeskeygenassist $0x1b, %xmm1, %xmm2
-call PREPARE_ROUNDKEY_128
-movdqa %xmm1, 144(%rsi)
-aeskeygenassist $0x36, %xmm1, %xmm2
-call PREPARE_ROUNDKEY_128
-movdqa %xmm1, 160(%rsi)
-ret
-
-PREPARE_ROUNDKEY_128:
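-# broadcast the RotWord/SubWord(+Rcon) lane of the assist value, then
-# the pslldq/pxor chain builds the running XOR of the previous round
-# key's four 32-bit words before folding the broadcast word in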
-pshufd $255, %xmm2, %xmm2
-movdqa %xmm1, %xmm3
-pslldq $4, %xmm3
-pxor %xmm3, %xmm1
-pslldq $4, %xmm3
-pxor %xmm3, %xmm1
-pslldq $4, %xmm3
-pxor %xmm3, %xmm1
-pxor %xmm2, %xmm1
-ret
-
-
-/*
-void AES_192_Key_Expansion (const unsigned char *userkey,
-  unsigned char *key)
-*/
-#ifndef __APPLE__
-.globl AES_192_Key_Expansion
-AES_192_Key_Expansion:
-#else
-.globl _AES_192_Key_Expansion
-_AES_192_Key_Expansion:
-#endif
-# parameter 1: %rdi
-# parameter 2: %rsi
-
-movdqu (%rdi), %xmm1
-movq 16(%rdi), %xmm3
-movdqa %xmm1, (%rsi)
-movdqa %xmm3, %xmm5
-
-aeskeygenassist $0x1, %xmm3, %xmm2
-call PREPARE_ROUNDKEY_192
-shufpd $0, %xmm1, %xmm5
-movdqa %xmm5, 16(%rsi)
-movdqa %xmm1, %xmm6
-shufpd $1, %xmm3, %xmm6
-movdqa %xmm6, 32(%rsi)
-
-aeskeygenassist $0x2, %xmm3, %xmm2
-call PREPARE_ROUNDKEY_192
-movdqa %xmm1, 48(%rsi)
-movdqa %xmm3, %xmm5
-
-aeskeygenassist $0x4, %xmm3, %xmm2
-call PREPARE_ROUNDKEY_192
-shufpd $0, %xmm1, %xmm5
-movdqa %xmm5, 64(%rsi)
-movdqa %xmm1, %xmm6
-shufpd $1, %xmm3, %xmm6
-movdqa %xmm6, 80(%rsi)
-
-aeskeygenassist $0x8, %xmm3, %xmm2
-call PREPARE_ROUNDKEY_192
-movdqa %xmm1, 96(%rsi)
-movdqa %xmm3, %xmm5
-
-aeskeygenassist $0x10, %xmm3, %xmm2
-call PREPARE_ROUNDKEY_192
-shufpd $0, %xmm1, %xmm5
-movdqa %xmm5, 112(%rsi)
-movdqa %xmm1, %xmm6
-shufpd $1, %xmm3, %xmm6
-movdqa %xmm6, 128(%rsi)
-
-aeskeygenassist $0x20, %xmm3, %xmm2
-call PREPARE_ROUNDKEY_192
-movdqa %xmm1, 144(%rsi)
-movdqa %xmm3, %xmm5
-
-aeskeygenassist $0x40, %xmm3, %xmm2
-call PREPARE_ROUNDKEY_192
-shufpd $0, %xmm1, %xmm5
-movdqa %xmm5, 160(%rsi)
-movdqa %xmm1, %xmm6
-shufpd $1, %xmm3, %xmm6
-movdqa %xmm6, 176(%rsi)
-
-aeskeygenassist $0x80, %xmm3, %xmm2
-call PREPARE_ROUNDKEY_192
-movdqa %xmm1, 192(%rsi)
-movdqa %xmm3, 208(%rsi)
-ret
-
-PREPARE_ROUNDKEY_192:
-pshufd $0x55, %xmm2, %xmm2
-movdqu %xmm1, %xmm4
-pslldq $4, %xmm4
-pxor   %xmm4, %xmm1
-
-pslldq $4, %xmm4
-pxor   %xmm4, %xmm1
-pslldq $4, %xmm4
-pxor  %xmm4, %xmm1
-pxor   %xmm2, %xmm1
-pshufd $0xff, %xmm1, %xmm2
-movdqu %xmm3, %xmm4
-pslldq $4, %xmm4
-pxor   %xmm4, %xmm3
-pxor   %xmm2, %xmm3
-ret
-
-
-/*
-void AES_256_Key_Expansion (const unsigned char *userkey,
-  unsigned char *key)
-*/
-#ifndef __APPLE__
-.globl AES_256_Key_Expansion
-AES_256_Key_Expansion:
-#else
-.globl _AES_256_Key_Expansion
-_AES_256_Key_Expansion:
-#endif
-# parameter 1: %rdi
-# parameter 2: %rsi
-
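-# Round keys alternate between the two 128-bit key halves:
-# MAKE_RK256_a derives the even-numbered keys (RotWord/SubWord+Rcon
-# lane of aeskeygenassist), MAKE_RK256_b the odd ones (Rcon 0,
-# SubWord-only lane).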
-movdqu (%rdi), %xmm1
-movdqu 16(%rdi), %xmm3
-movdqa %xmm1, (%rsi)
-movdqa %xmm3, 16(%rsi)
-
-aeskeygenassist $0x1, %xmm3, %xmm2
-call MAKE_RK256_a
-movdqa %xmm1, 32(%rsi)
-aeskeygenassist $0x0, %xmm1, %xmm2
-call MAKE_RK256_b
-movdqa %xmm3, 48(%rsi)
-aeskeygenassist $0x2, %xmm3, %xmm2
-call MAKE_RK256_a
-movdqa %xmm1, 64(%rsi)
-aeskeygenassist $0x0, %xmm1, %xmm2
-call MAKE_RK256_b
-movdqa %xmm3, 80(%rsi)
-aeskeygenassist $0x4, %xmm3, %xmm2
-call MAKE_RK256_a
-movdqa %xmm1, 96(%rsi)
-aeskeygenassist $0x0, %xmm1, %xmm2
-call MAKE_RK256_b
-movdqa %xmm3, 112(%rsi)
-aeskeygenassist $0x8, %xmm3, %xmm2
-call MAKE_RK256_a
-movdqa %xmm1, 128(%rsi)
-aeskeygenassist $0x0, %xmm1, %xmm2
-call MAKE_RK256_b
-movdqa %xmm3, 144(%rsi)
-aeskeygenassist $0x10, %xmm3, %xmm2
-call MAKE_RK256_a
-movdqa %xmm1, 160(%rsi)
-aeskeygenassist $0x0, %xmm1, %xmm2
-call MAKE_RK256_b
-movdqa %xmm3, 176(%rsi)
-aeskeygenassist $0x20, %xmm3, %xmm2
-call MAKE_RK256_a
-movdqa %xmm1, 192(%rsi)
-
-aeskeygenassist $0x0, %xmm1, %xmm2
-call MAKE_RK256_b
-movdqa %xmm3, 208(%rsi)
-aeskeygenassist $0x40, %xmm3, %xmm2
-call MAKE_RK256_a
-movdqa %xmm1, 224(%rsi)
-
-ret
-
-MAKE_RK256_a:
-pshufd $0xff, %xmm2, %xmm2
-movdqa %xmm1, %xmm4
-pslldq $4, %xmm4
-pxor   %xmm4, %xmm1
-pslldq $4, %xmm4
-pxor  %xmm4, %xmm1
-pslldq $4, %xmm4
-pxor  %xmm4, %xmm1
-pxor   %xmm2, %xmm1
-ret
-
-MAKE_RK256_b:
-pshufd $0xaa, %xmm2, %xmm2
-movdqa %xmm3, %xmm4
-pslldq $4, %xmm4
-pxor   %xmm4, %xmm3
-pslldq $4, %xmm4
-pxor  %xmm4, %xmm3
-pslldq $4, %xmm4
-pxor  %xmm4, %xmm3
-pxor   %xmm2, %xmm3
-ret
-
-#elif defined WOLFSSL_X86_BUILD
-
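-/* 32-bit build: same entry points as above, but arguments arrive on
-   the stack (cdecl) and only xmm0-xmm7 are available. */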
-/*
-AES_CBC_encrypt (const unsigned char *in,
-	unsigned char *out,
-	unsigned char ivec[16],
-	unsigned long length,
-	const unsigned char *KS,
-	int nr)
-*/
-#ifndef __APPLE__
-.globl AES_CBC_encrypt
-AES_CBC_encrypt:
-#else
-.globl _AES_CBC_encrypt
-_AES_CBC_encrypt:
-#endif
-        # parameter 1: stack[4] => %edi
-        # parameter 2: stack[8] => %esi
-        # parameter 3: stack[12] => %edx
-        # parameter 4: stack[16] => %ecx
-        # parameter 5: stack[20] => %eax
-        # parameter 6: stack[24] => %ebx
-        push	%edi
-        push	%esi
-        push	%ebx
-        push	%ebp
-        movl	20(%esp), %edi
-        movl	24(%esp), %esi
-        movl	28(%esp), %edx
-        movl	32(%esp), %ecx
-        movl	36(%esp), %eax
-        movl	40(%esp), %ebx
-
-        movl	%ecx, %ebp
-        shrl	$4, %ecx
-        shll	$60, %ebp
-        je	NO_PARTS
-        addl	$1, %ecx
-        NO_PARTS:
-        subl	$16, %esi
-        movdqa	(%edx), %xmm1
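-        # %xmm1 carries the CBC chaining value: the IV on entry, then
-        # each ciphertext block feeds the next iteration's XOR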
-        LOOP:
-        pxor	(%edi), %xmm1
-        pxor	(%eax), %xmm1
-        addl	$16,%esi
-        addl	$16,%edi
-        cmpl	$12, %ebx
-        aesenc	16(%eax),%xmm1
-        aesenc	32(%eax),%xmm1
-        aesenc	48(%eax),%xmm1
-        aesenc	64(%eax),%xmm1
-        aesenc	80(%eax),%xmm1
-        aesenc	96(%eax),%xmm1
-        aesenc	112(%eax),%xmm1
-        aesenc	128(%eax),%xmm1
-        aesenc	144(%eax),%xmm1
-        movdqa	160(%eax),%xmm2
-        jb	LAST
-        cmpl	$14, %ebx
-
-        aesenc	160(%eax),%xmm1
-        aesenc	176(%eax),%xmm1
-        movdqa	192(%eax),%xmm2
-        jb	LAST
-        aesenc	192(%eax),%xmm1
-        aesenc	208(%eax),%xmm1
-        movdqa	224(%eax),%xmm2
-        LAST:
-        decl	%ecx
-        aesenclast %xmm2,%xmm1
-        movdqu	%xmm1,(%esi)
-        jne	LOOP
-
-        pop	%ebp
-        pop	%ebx
-        pop	%esi
-        pop	%edi
-        ret
-
-
-/*
-AES_CBC_decrypt_by4 (const unsigned char *in,
-  unsigned char *out,
-  unsigned char ivec[16],
-  unsigned long length,
-  const unsigned char *KS,
-  int nr)
-*/
-#ifndef __APPLE__
-.globl AES_CBC_decrypt_by4
-AES_CBC_decrypt_by4:
-#else
-.globl _AES_CBC_decrypt_by4
-_AES_CBC_decrypt_by4:
-#endif
-# parameter 1: stack[4] => %edi
-# parameter 2: stack[8] => %esi
-# parameter 3: stack[12] => %edx
-# parameter 4: stack[16] => %ecx
-# parameter 5: stack[20] => %eax
-# parameter 6: stack[24] => %ebx
-        push	%edi
-        push	%esi
-        push	%ebx
-        push	%ebp
-        movl	20(%esp), %edi
-        movl	24(%esp), %esi
-        movl	28(%esp), %edx
-        movl	32(%esp), %ecx
-        movl	36(%esp), %eax
-        movl	40(%esp), %ebx
-        subl	$16, %esp
-
-        movdqu      (%edx), %xmm0
-        movl        %ecx, %ebp
-        shrl        $4, %ecx
-        shll        $60, %ebp
-        movdqu      %xmm0, (%esp)
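-        # only xmm0-xmm7 exist in 32-bit mode, so the IV / previous
-        # ciphertext block is kept spilled at (%esp)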
-        je          DNO_PARTS_4
-        addl        $1, %ecx
-DNO_PARTS_4:
-        movl        %ecx, %ebp
-        shll        $62, %ebp
-        shrl        $62, %ebp
-        shrl        $2, %ecx
-        je          DREMAINDER_4
-        subl        $64, %esi
-DLOOP_4:
-        movdqu      (%edi), %xmm1
-        movdqu      16(%edi), %xmm2
-        movdqu      32(%edi), %xmm3
-        movdqu      48(%edi), %xmm4
-        movdqa      (%eax), %xmm5
-        movdqa      16(%eax), %xmm6
-        movdqa      32(%eax), %xmm7
-        movdqa      48(%eax), %xmm0
-        pxor        %xmm5, %xmm1
-        pxor        %xmm5, %xmm2
-        pxor        %xmm5, %xmm3
-        pxor        %xmm5, %xmm4
-        aesdec      %xmm6, %xmm1
-        aesdec      %xmm6, %xmm2
-        aesdec      %xmm6, %xmm3
-        aesdec      %xmm6, %xmm4
-        aesdec      %xmm7, %xmm1
-        aesdec      %xmm7, %xmm2
-        aesdec      %xmm7, %xmm3
-        aesdec      %xmm7, %xmm4
-        aesdec      %xmm0, %xmm1
-        aesdec      %xmm0, %xmm2
-        aesdec      %xmm0, %xmm3
-        aesdec      %xmm0, %xmm4
-        movdqa      64(%eax), %xmm5
-        movdqa      80(%eax), %xmm6
-        movdqa      96(%eax), %xmm7
-        movdqa      112(%eax), %xmm0
-        aesdec      %xmm5, %xmm1
-        aesdec      %xmm5, %xmm2
-        aesdec      %xmm5, %xmm3
-        aesdec      %xmm5, %xmm4
-        aesdec      %xmm6, %xmm1
-        aesdec      %xmm6, %xmm2
-        aesdec      %xmm6, %xmm3
-        aesdec      %xmm6, %xmm4
-        aesdec      %xmm7, %xmm1
-        aesdec      %xmm7, %xmm2
-        aesdec      %xmm7, %xmm3
-        aesdec      %xmm7, %xmm4
-        aesdec      %xmm0, %xmm1
-        aesdec      %xmm0, %xmm2
-        aesdec      %xmm0, %xmm3
-        aesdec      %xmm0, %xmm4
-        movdqa      128(%eax), %xmm5
-        movdqa      144(%eax), %xmm6
-        movdqa      160(%eax), %xmm7
-        cmpl        $12, %ebx
-        aesdec      %xmm5, %xmm1
-        aesdec      %xmm5, %xmm2
-        aesdec      %xmm5, %xmm3
-        aesdec      %xmm5, %xmm4
-        aesdec      %xmm6, %xmm1
-        aesdec      %xmm6, %xmm2
-        aesdec      %xmm6, %xmm3
-        aesdec      %xmm6, %xmm4
-        jb          DLAST_4
-        movdqa      160(%eax), %xmm5
-        movdqa      176(%eax), %xmm6
-        movdqa      192(%eax), %xmm7
-        cmpl        $14, %ebx
-        aesdec      %xmm5, %xmm1
-        aesdec      %xmm5, %xmm2
-        aesdec      %xmm5, %xmm3
-        aesdec      %xmm5, %xmm4
-        aesdec      %xmm6, %xmm1
-        aesdec      %xmm6, %xmm2
-        aesdec      %xmm6, %xmm3
-        aesdec      %xmm6, %xmm4
-        jb          DLAST_4
-        movdqa      192(%eax), %xmm5
-        movdqa      208(%eax), %xmm6
-        movdqa      224(%eax), %xmm7
-        aesdec      %xmm5, %xmm1
-        aesdec      %xmm5, %xmm2
-        aesdec      %xmm5, %xmm3
-        aesdec      %xmm5, %xmm4
-        aesdec      %xmm6, %xmm1
-        aesdec      %xmm6, %xmm2
-        aesdec      %xmm6, %xmm3
-        aesdec      %xmm6, %xmm4
-DLAST_4:
-        addl        $64, %esi
-        aesdeclast  %xmm7, %xmm1
-        aesdeclast  %xmm7, %xmm2
-        aesdeclast  %xmm7, %xmm3
-        aesdeclast  %xmm7, %xmm4
-        movdqu      (%esp), %xmm0
-        movdqu      (%edi), %xmm5
-        movdqu      16(%edi), %xmm6
-        movdqu      32(%edi), %xmm7
-        pxor        %xmm0, %xmm1
-        pxor        %xmm5, %xmm2
-        pxor        %xmm6, %xmm3
-        pxor        %xmm7, %xmm4
-        movdqu      48(%edi), %xmm0
-        movdqu      %xmm1, (%esi)
-        movdqu      %xmm2, 16(%esi)
-        movdqu      %xmm3, 32(%esi)
-        movdqu      %xmm4, 48(%esi)
-        movdqu      %xmm0, (%esp)
-        addl        $64, %edi
-        decl        %ecx
-        jne         DLOOP_4
-        addl        $64, %esi
-DREMAINDER_4:
-        cmpl        $0, %ebp
-        je          DEND_4
-DLOOP_4_2:
-        movdqu      (%edi), %xmm1
-        movdqa      %xmm1, %xmm5
-        addl        $16, %edi
-        pxor        (%eax), %xmm1
-        movdqu      160(%eax), %xmm2
-        cmpl        $12, %ebx
-        aesdec      16(%eax), %xmm1
-        aesdec      32(%eax), %xmm1
-        aesdec      48(%eax), %xmm1
-        aesdec      64(%eax), %xmm1
-        aesdec      80(%eax), %xmm1
-        aesdec      96(%eax), %xmm1
-        aesdec      112(%eax), %xmm1
-        aesdec      128(%eax), %xmm1
-        aesdec      144(%eax), %xmm1
-        jb          DLAST_4_2
-        movdqu      192(%eax), %xmm2
-        cmpl        $14, %ebx
-        aesdec      160(%eax), %xmm1
-        aesdec      176(%eax), %xmm1
-        jb          DLAST_4_2
-        movdqu      224(%eax), %xmm2
-        aesdec      192(%eax), %xmm1
-        aesdec      208(%eax), %xmm1
-DLAST_4_2:
-        aesdeclast  %xmm2, %xmm1
-        pxor        %xmm0, %xmm1
-        movdqa      %xmm5, %xmm0
-        movdqu      %xmm1, (%esi)
-        addl        $16, %esi
-        decl        %ebp
-        jne         DLOOP_4_2
-DEND_4:
-
-        addl	$16, %esp
-        pop	%ebp
-        pop	%ebx
-        pop	%esi
-        pop	%edi
-        ret
-
-/*
-AES_ECB_encrypt (const unsigned char *in,
-	unsigned char *out,
-	unsigned long length,
-	const unsigned char *KS,
-	int nr)
-*/
-#ifndef __APPLE__
-.globl AES_ECB_encrypt
-AES_ECB_encrypt:
-#else
-.globl _AES_ECB_encrypt
-_AES_ECB_encrypt:
-#endif
-# parameter 1: stack[4] => %edi
-# parameter 2: stack[8] => %esi
-# parameter 3: stack[12] => %edx
-# parameter 4: stack[16] => %ecx
-# parameter 5: stack[20] => %eax
-        push	%edi
-        push	%esi
-        push	%ebx
-        movl	16(%esp), %edi
-        movl	20(%esp), %esi
-        movl	24(%esp), %edx
-        movl	28(%esp), %ecx
-        movl	32(%esp), %eax
-
-        movl    %edx, %ebx
-        shrl    $4, %edx
-        shll    $60, %ebx
-        je      EECB_NO_PARTS_4
-        addl    $1, %edx
-EECB_NO_PARTS_4:
-        movl    %edx, %ebx
-        shll    $62, %ebx
-        shrl    $62, %ebx
-        shrl    $2, %edx
-        je      EECB_REMAINDER_4
-        subl    $64, %esi
-EECB_LOOP_4:
-        movdqu  (%edi), %xmm1
-        movdqu  16(%edi), %xmm2
-        movdqu  32(%edi), %xmm3
-        movdqu  48(%edi), %xmm4
-        movdqa  (%ecx), %xmm5
-        movdqa  16(%ecx), %xmm6
-        movdqa  32(%ecx), %xmm7
-        movdqa  48(%ecx), %xmm0
-        pxor    %xmm5, %xmm1
-        pxor    %xmm5, %xmm2
-        pxor    %xmm5, %xmm3
-        pxor    %xmm5, %xmm4
-        aesenc  %xmm6, %xmm1
-        aesenc  %xmm6, %xmm2
-        aesenc  %xmm6, %xmm3
-        aesenc  %xmm6, %xmm4
-        aesenc  %xmm7, %xmm1
-        aesenc  %xmm7, %xmm2
-        aesenc  %xmm7, %xmm3
-        aesenc  %xmm7, %xmm4
-        aesenc  %xmm0, %xmm1
-        aesenc  %xmm0, %xmm2
-        aesenc  %xmm0, %xmm3
-        aesenc  %xmm0, %xmm4
-        movdqa  64(%ecx), %xmm5
-        movdqa  80(%ecx), %xmm6
-        movdqa  96(%ecx), %xmm7
-        movdqa  112(%ecx), %xmm0
-        aesenc  %xmm5, %xmm1
-        aesenc  %xmm5, %xmm2
-        aesenc  %xmm5, %xmm3
-        aesenc  %xmm5, %xmm4
-        aesenc  %xmm6, %xmm1
-        aesenc  %xmm6, %xmm2
-        aesenc  %xmm6, %xmm3
-        aesenc  %xmm6, %xmm4
-        aesenc  %xmm7, %xmm1
-        aesenc  %xmm7, %xmm2
-        aesenc  %xmm7, %xmm3
-        aesenc  %xmm7, %xmm4
-        aesenc  %xmm0, %xmm1
-        aesenc  %xmm0, %xmm2
-        aesenc  %xmm0, %xmm3
-        aesenc  %xmm0, %xmm4
-        movdqa  128(%ecx), %xmm5
-        movdqa  144(%ecx), %xmm6
-        movdqa  160(%ecx), %xmm7
-        cmpl    $12, %eax
-        aesenc  %xmm5, %xmm1
-        aesenc  %xmm5, %xmm2
-        aesenc  %xmm5, %xmm3
-        aesenc  %xmm5, %xmm4
-        aesenc  %xmm6, %xmm1
-        aesenc  %xmm6, %xmm2
-        aesenc  %xmm6, %xmm3
-        aesenc  %xmm6, %xmm4
-        jb      EECB_LAST_4
-        movdqa  160(%ecx), %xmm5
-        movdqa  176(%ecx), %xmm6
-        movdqa  192(%ecx), %xmm7
-        cmpl    $14, %eax
-        aesenc  %xmm5, %xmm1
-        aesenc  %xmm5, %xmm2
-        aesenc  %xmm5, %xmm3
-        aesenc  %xmm5, %xmm4
-        aesenc  %xmm6, %xmm1
-        aesenc  %xmm6, %xmm2
-        aesenc  %xmm6, %xmm3
-        aesenc  %xmm6, %xmm4
-        jb      EECB_LAST_4
-        movdqa  192(%ecx), %xmm5
-        movdqa  208(%ecx), %xmm6
-        movdqa  224(%ecx), %xmm7
-        aesenc  %xmm5, %xmm1
-        aesenc  %xmm5, %xmm2
-        aesenc  %xmm5, %xmm3
-        aesenc  %xmm5, %xmm4
-        aesenc  %xmm6, %xmm1
-        aesenc  %xmm6, %xmm2
-        aesenc  %xmm6, %xmm3
-        aesenc  %xmm6, %xmm4
-EECB_LAST_4:
-        addl    $64, %edi
-        addl    $64, %esi
-        decl    %edx
-        aesenclast %xmm7, %xmm1
-        aesenclast %xmm7, %xmm2
-        aesenclast %xmm7, %xmm3
-        aesenclast %xmm7, %xmm4
-        movdqu  %xmm1, (%esi)
-        movdqu  %xmm2, 16(%esi)
-        movdqu  %xmm3, 32(%esi)
-        movdqu  %xmm4, 48(%esi)
-        jne     EECB_LOOP_4
-        addl    $64, %esi
-EECB_REMAINDER_4:
-        cmpl    $0, %ebx
-        je      EECB_END_4
-EECB_LOOP_4_2:
-        movdqu  (%edi), %xmm1
-        addl    $16, %edi
-        pxor    (%ecx), %xmm1
-        movdqu  160(%ecx), %xmm2
-        aesenc  16(%ecx), %xmm1
-        aesenc  32(%ecx), %xmm1
-        aesenc  48(%ecx), %xmm1
-        aesenc  64(%ecx), %xmm1
-        aesenc  80(%ecx), %xmm1
-        aesenc  96(%ecx), %xmm1
-        aesenc  112(%ecx), %xmm1
-        aesenc  128(%ecx), %xmm1
-        aesenc  144(%ecx), %xmm1
-        cmpl    $12, %eax
-        jb      EECB_LAST_4_2
-        movdqu  192(%ecx), %xmm2
-        aesenc  160(%ecx), %xmm1
-        aesenc  176(%ecx), %xmm1
-        cmpl    $14, %eax
-        jb      EECB_LAST_4_2
-        movdqu  224(%ecx), %xmm2
-        aesenc  192(%ecx), %xmm1
-        aesenc  208(%ecx), %xmm1
-EECB_LAST_4_2:
-        aesenclast %xmm2, %xmm1
-        movdqu  %xmm1, (%esi)
-        addl    $16, %esi
-        decl    %ebx
-        jne     EECB_LOOP_4_2
-EECB_END_4:
-
-        pop	%ebx
-        pop	%esi
-        pop	%edi
-        ret
-
-
-/*
-AES_ECB_decrypt (const unsigned char *in,
-  unsigned char *out,
-  unsigned long length,
-  const unsigned char *KS,
-  int nr)
-*/
-#ifndef __APPLE__
-.globl AES_ECB_decrypt
-AES_ECB_decrypt:
-#else
-.globl _AES_ECB_decrypt
-_AES_ECB_decrypt:
-#endif
-# parameter 1: stack[4] => %edi
-# parameter 2: stack[8] => %esi
-# parameter 3: stack[12] => %edx
-# parameter 4: stack[16] => %ecx
-# parameter 5: stack[20] => %eax
-        push	%edi
-        push	%esi
-        push	%ebx
-        movl	20(%esp), %edi
-        movl	24(%esp), %esi
-        movl	28(%esp), %edx
-        movl	32(%esp), %ecx
-        movl	36(%esp), %eax
-
-
-        movl    %edx, %ebx
-        shrl    $4, %edx
-        shll    $60, %ebx
-        je      DECB_NO_PARTS_4
-        addl    $1, %edx
-DECB_NO_PARTS_4:
-        movl    %edx, %ebx
-        shll    $62, %ebx
-        shrl    $62, %ebx
-        shrl    $2, %edx
-        je      DECB_REMAINDER_4
-        subl    $64, %esi
-DECB_LOOP_4:
-        movdqu  (%edi), %xmm1
-        movdqu  16(%edi), %xmm2
-        movdqu  32(%edi), %xmm3
-        movdqu  48(%edi), %xmm4
-        movdqa  (%ecx), %xmm5
-        movdqa  16(%ecx), %xmm6
-        movdqa  32(%ecx), %xmm7
-        movdqa  48(%ecx), %xmm0
-        pxor    %xmm5, %xmm1
-        pxor    %xmm5, %xmm2
-        pxor    %xmm5, %xmm3
-        pxor    %xmm5, %xmm4
-        aesdec  %xmm6, %xmm1
-        aesdec  %xmm6, %xmm2
-        aesdec  %xmm6, %xmm3
-        aesdec  %xmm6, %xmm4
-        aesdec  %xmm7, %xmm1
-        aesdec  %xmm7, %xmm2
-        aesdec  %xmm7, %xmm3
-        aesdec  %xmm7, %xmm4
-        aesdec  %xmm0, %xmm1
-        aesdec  %xmm0, %xmm2
-        aesdec  %xmm0, %xmm3
-        aesdec  %xmm0, %xmm4
-        movdqa  64(%ecx), %xmm5
-        movdqa  80(%ecx), %xmm6
-        movdqa  96(%ecx), %xmm7
-        movdqa  112(%ecx), %xmm0
-        aesdec  %xmm5, %xmm1
-        aesdec  %xmm5, %xmm2
-        aesdec  %xmm5, %xmm3
-        aesdec  %xmm5, %xmm4
-        aesdec  %xmm6, %xmm1
-        aesdec  %xmm6, %xmm2
-        aesdec  %xmm6, %xmm3
-        aesdec  %xmm6, %xmm4
-        aesdec  %xmm7, %xmm1
-        aesdec  %xmm7, %xmm2
-        aesdec  %xmm7, %xmm3
-        aesdec  %xmm7, %xmm4
-        aesdec  %xmm0, %xmm1
-        aesdec  %xmm0, %xmm2
-        aesdec  %xmm0, %xmm3
-        aesdec  %xmm0, %xmm4
-        movdqa  128(%ecx), %xmm5
-        movdqa  144(%ecx), %xmm6
-        movdqa  160(%ecx), %xmm7
-        cmpl    $12, %eax
-        aesdec  %xmm5, %xmm1
-        aesdec  %xmm5, %xmm2
-        aesdec  %xmm5, %xmm3
-        aesdec  %xmm5, %xmm4
-        aesdec  %xmm6, %xmm1
-        aesdec  %xmm6, %xmm2
-        aesdec  %xmm6, %xmm3
-        aesdec  %xmm6, %xmm4
-        jb      DECB_LAST_4
-        movdqa  160(%ecx), %xmm5
-        movdqa  176(%ecx), %xmm6
-        movdqa  192(%ecx), %xmm7
-        cmpl    $14, %eax
-        aesdec  %xmm5, %xmm1
-        aesdec  %xmm5, %xmm2
-        aesdec  %xmm5, %xmm3
-        aesdec  %xmm5, %xmm4
-        aesdec  %xmm6, %xmm1
-        aesdec  %xmm6, %xmm2
-        aesdec  %xmm6, %xmm3
-        aesdec  %xmm6, %xmm4
-        jb      DECB_LAST_4
-        movdqa  192(%ecx), %xmm5
-        movdqa  208(%ecx), %xmm6
-        movdqa  224(%ecx), %xmm7
-        aesdec  %xmm5, %xmm1
-        aesdec  %xmm5, %xmm2
-        aesdec  %xmm5, %xmm3
-        aesdec  %xmm5, %xmm4
-        aesdec  %xmm6, %xmm1
-        aesdec  %xmm6, %xmm2
-        aesdec  %xmm6, %xmm3
-        aesdec  %xmm6, %xmm4
-DECB_LAST_4:
-        addl    $64, %edi
-        addl    $64, %esi
-        decl    %edx
-        aesdeclast %xmm7, %xmm1
-        aesdeclast %xmm7, %xmm2
-        aesdeclast %xmm7, %xmm3
-        aesdeclast %xmm7, %xmm4
-        movdqu  %xmm1, (%esi)
-        movdqu  %xmm2, 16(%esi)
-        movdqu  %xmm3, 32(%esi)
-        movdqu  %xmm4, 48(%esi)
-        jne     DECB_LOOP_4
-        addl    $64, %esi
-DECB_REMAINDER_4:
-        cmpl    $0, %ebx
-        je      DECB_END_4
-DECB_LOOP_4_2:
-        movdqu  (%edi), %xmm1
-        addl    $16, %edi
-        pxor    (%ecx), %xmm1
-        movdqu  160(%ecx), %xmm2
-        cmpl    $12, %eax
-        aesdec  16(%ecx), %xmm1
-        aesdec  32(%ecx), %xmm1
-        aesdec  48(%ecx), %xmm1
-        aesdec  64(%ecx), %xmm1
-        aesdec  80(%ecx), %xmm1
-        aesdec  96(%ecx), %xmm1
-        aesdec  112(%ecx), %xmm1
-        aesdec  128(%ecx), %xmm1
-        aesdec  144(%ecx), %xmm1
-        jb      DECB_LAST_4_2
-        cmpl    $14, %eax
-        movdqu  192(%ecx), %xmm2
-        aesdec  160(%ecx), %xmm1
-        aesdec  176(%ecx), %xmm1
-        jb      DECB_LAST_4_2
-        movdqu  224(%ecx), %xmm2
-        aesdec  192(%ecx), %xmm1
-        aesdec  208(%ecx), %xmm1
-DECB_LAST_4_2:
-        aesdeclast %xmm2, %xmm1
-        movdqu  %xmm1, (%esi)
-        addl    $16, %esi
-        decl    %ebx
-        jne     DECB_LOOP_4_2
-DECB_END_4:
-        pop	%ebx
-        pop	%esi
-        pop	%edi
-        ret
-
-
-
-/*
-void AES_128_Key_Expansion(const unsigned char* userkey,
-   unsigned char* key_schedule);
-*/
-.align  16,0x90
-#ifndef __APPLE__
-.globl AES_128_Key_Expansion
-AES_128_Key_Expansion:
-#else
-.globl _AES_128_Key_Expansion
-_AES_128_Key_Expansion:
-#endif
-        # parameter 1: stack[4] => %eax
-        # parameter 2: stack[8] => %edx
-        movl	4(%esp), %eax
-        movl	8(%esp), %edx
-
-        movl    $10, 240(%edx)
-
-        movdqu  (%eax), %xmm1
-        movdqa    %xmm1, (%edx)
-
-
-ASSISTS:
-        aeskeygenassist $1, %xmm1, %xmm2
-        call PREPARE_ROUNDKEY_128
-        movdqa %xmm1, 16(%edx)
-        aeskeygenassist $2, %xmm1, %xmm2
-        call PREPARE_ROUNDKEY_128
-        movdqa %xmm1, 32(%edx)
-        aeskeygenassist $4, %xmm1, %xmm2
-        call PREPARE_ROUNDKEY_128
-        movdqa %xmm1, 48(%edx)
-        aeskeygenassist $8, %xmm1, %xmm2
-        call PREPARE_ROUNDKEY_128
-        movdqa %xmm1, 64(%edx)
-        aeskeygenassist $16, %xmm1, %xmm2
-        call PREPARE_ROUNDKEY_128
-        movdqa %xmm1, 80(%edx)
-        aeskeygenassist $32, %xmm1, %xmm2
-        call PREPARE_ROUNDKEY_128
-        movdqa %xmm1, 96(%edx)
-        aeskeygenassist $64, %xmm1, %xmm2
-        call PREPARE_ROUNDKEY_128
-        movdqa %xmm1, 112(%edx)
-        aeskeygenassist $0x80, %xmm1, %xmm2
-        call PREPARE_ROUNDKEY_128
-        movdqa %xmm1, 128(%edx)
-        aeskeygenassist $0x1b, %xmm1, %xmm2
-        call PREPARE_ROUNDKEY_128
-        movdqa %xmm1, 144(%edx)
-        aeskeygenassist $0x36, %xmm1, %xmm2
-        call PREPARE_ROUNDKEY_128
-        movdqa %xmm1, 160(%edx)
-        ret
-
-PREPARE_ROUNDKEY_128:
-        pshufd $255, %xmm2, %xmm2
-        movdqa %xmm1, %xmm3
-        pslldq $4, %xmm3
-        pxor %xmm3, %xmm1
-        pslldq $4, %xmm3
-        pxor %xmm3, %xmm1
-        pslldq $4, %xmm3
-        pxor %xmm3, %xmm1
-        pxor %xmm2, %xmm1
-        ret
-
-
-/*
-void AES_192_Key_Expansion (const unsigned char *userkey,
-  unsigned char *key)
-*/
-#ifndef __APPLE__
-.globl AES_192_Key_Expansion
-AES_192_Key_Expansion:
-#else
-.globl _AES_192_Key_Expansion
-_AES_192_Key_Expansion:
-#endif
-        # parameter 1: stack[4] => %eax
-        # parameter 2: stack[8] => %edx
-        movl	4(%esp), %eax
-        movl	8(%esp), %edx
-
-        movdqu (%eax), %xmm1
-        movq 16(%eax), %xmm3
-        movdqa %xmm1, (%edx)
-        movdqa %xmm3, %xmm5
-
-        aeskeygenassist $0x1, %xmm3, %xmm2
-        call PREPARE_ROUNDKEY_192
-        shufpd $0, %xmm1, %xmm5
-        movdqa %xmm5, 16(%edx)
-        movdqa %xmm1, %xmm6
-        shufpd $1, %xmm3, %xmm6
-        movdqa %xmm6, 32(%edx)
-
-        aeskeygenassist $0x2, %xmm3, %xmm2
-        call PREPARE_ROUNDKEY_192
-        movdqa %xmm1, 48(%edx)
-        movdqa %xmm3, %xmm5
-
-        aeskeygenassist $0x4, %xmm3, %xmm2
-        call PREPARE_ROUNDKEY_192
-        shufpd $0, %xmm1, %xmm5
-        movdqa %xmm5, 64(%edx)
-        movdqa %xmm1, %xmm6
-        shufpd $1, %xmm3, %xmm6
-        movdqa %xmm6, 80(%edx)
-
-        aeskeygenassist $0x8, %xmm3, %xmm2
-        call PREPARE_ROUNDKEY_192
-        movdqa %xmm1, 96(%edx)
-        movdqa %xmm3, %xmm5
-
-        aeskeygenassist $0x10, %xmm3, %xmm2
-        call PREPARE_ROUNDKEY_192
-        shufpd $0, %xmm1, %xmm5
-        movdqa %xmm5, 112(%edx)
-        movdqa %xmm1, %xmm6
-        shufpd $1, %xmm3, %xmm6
-        movdqa %xmm6, 128(%edx)
-
-        aeskeygenassist $0x20, %xmm3, %xmm2
-        call PREPARE_ROUNDKEY_192
-        movdqa %xmm1, 144(%edx)
-        movdqa %xmm3, %xmm5
-
-        aeskeygenassist $0x40, %xmm3, %xmm2
-        call PREPARE_ROUNDKEY_192
-        shufpd $0, %xmm1, %xmm5
-        movdqa %xmm5, 160(%edx)
-        movdqa %xmm1, %xmm6
-        shufpd $1, %xmm3, %xmm6
-        movdqa %xmm6, 176(%edx)
-
-        aeskeygenassist $0x80, %xmm3, %xmm2
-        call PREPARE_ROUNDKEY_192
-        movdqa %xmm1, 192(%edx)
-        movdqa %xmm3, 208(%edx)
-        ret
-
-PREPARE_ROUNDKEY_192:
-        pshufd $0x55, %xmm2, %xmm2
-        movdqu %xmm1, %xmm4
-        pslldq $4, %xmm4
-        pxor   %xmm4, %xmm1
-
-        pslldq $4, %xmm4
-        pxor   %xmm4, %xmm1
-        pslldq $4, %xmm4
-        pxor  %xmm4, %xmm1
-        pxor   %xmm2, %xmm1
-        pshufd $0xff, %xmm1, %xmm2
-        movdqu %xmm3, %xmm4
-        pslldq $4, %xmm4
-        pxor   %xmm4, %xmm3
-        pxor   %xmm2, %xmm3
-        ret
-
-
-/*
-void AES_256_Key_Expansion (const unsigned char *userkey,
-  unsigned char *key)
-*/
-#ifndef __APPLE__
-.globl AES_256_Key_Expansion
-AES_256_Key_Expansion:
-#else
-.globl _AES_256_Key_Expansion
-_AES_256_Key_Expansion:
-#endif
-        # parameter 1: stack[4] => %eax
-        # parameter 2: stack[8] => %edx
-        movl	4(%esp), %eax
-        movl	8(%esp), %edx
-
-        movdqu (%eax), %xmm1
-        movdqu 16(%eax), %xmm3
-        movdqa %xmm1, (%edx)
-        movdqa %xmm3, 16(%edx)
-
-        aeskeygenassist $0x1, %xmm3, %xmm2
-        call MAKE_RK256_a
-        movdqa %xmm1, 32(%edx)
-        aeskeygenassist $0x0, %xmm1, %xmm2
-        call MAKE_RK256_b
-        movdqa %xmm3, 48(%edx)
-        aeskeygenassist $0x2, %xmm3, %xmm2
-        call MAKE_RK256_a
-        movdqa %xmm1, 64(%edx)
-        aeskeygenassist $0x0, %xmm1, %xmm2
-        call MAKE_RK256_b
-        movdqa %xmm3, 80(%edx)
-        aeskeygenassist $0x4, %xmm3, %xmm2
-        call MAKE_RK256_a
-        movdqa %xmm1, 96(%edx)
-        aeskeygenassist $0x0, %xmm1, %xmm2
-        call MAKE_RK256_b
-        movdqa %xmm3, 112(%edx)
-        aeskeygenassist $0x8, %xmm3, %xmm2
-        call MAKE_RK256_a
-        movdqa %xmm1, 128(%edx)
-        aeskeygenassist $0x0, %xmm1, %xmm2
-        call MAKE_RK256_b
-        movdqa %xmm3, 144(%edx)
-        aeskeygenassist $0x10, %xmm3, %xmm2
-        call MAKE_RK256_a
-        movdqa %xmm1, 160(%edx)
-        aeskeygenassist $0x0, %xmm1, %xmm2
-        call MAKE_RK256_b
-        movdqa %xmm3, 176(%edx)
-        aeskeygenassist $0x20, %xmm3, %xmm2
-        call MAKE_RK256_a
-        movdqa %xmm1, 192(%edx)
-
-        aeskeygenassist $0x0, %xmm1, %xmm2
-        call MAKE_RK256_b
-        movdqa %xmm3, 208(%edx)
-        aeskeygenassist $0x40, %xmm3, %xmm2
-        call MAKE_RK256_a
-        movdqa %xmm1, 224(%edx)
-
-        ret
-
-MAKE_RK256_a:
-        pshufd $0xff, %xmm2, %xmm2
-        movdqa %xmm1, %xmm4
-        pslldq $4, %xmm4
-        pxor   %xmm4, %xmm1
-        pslldq $4, %xmm4
-        pxor  %xmm4, %xmm1
-        pslldq $4, %xmm4
-        pxor  %xmm4, %xmm1
-        pxor   %xmm2, %xmm1
-        ret
-
-MAKE_RK256_b:
-        pshufd $0xaa, %xmm2, %xmm2
-        movdqa %xmm3, %xmm4
-        pslldq $4, %xmm4
-        pxor   %xmm4, %xmm3
-        pslldq $4, %xmm4
-        pxor  %xmm4, %xmm3
-        pslldq $4, %xmm4
-        pxor  %xmm4, %xmm3
-        pxor   %xmm2, %xmm3
-        ret
-
-#endif /* WOLFSSL_X86_64_BUILD */
-
-#if defined(__linux__) && defined(__ELF__)
-    .section .note.GNU-stack,"",%progbits
-#endif
-

+ 0 - 1531
lib/wolfssl/wolfcrypt/src/aes_asm.asm

@@ -1,1531 +0,0 @@
-; /* aes_asm.asm
-;  *
-;  * Copyright (C) 2006-2023 wolfSSL Inc.
-;  *
-;  * This file is part of wolfSSL.
-;  *
-;  * wolfSSL is free software; you can redistribute it and/or modify
-;  * it under the terms of the GNU General Public License as published by
-;  * the Free Software Foundation; either version 2 of the License, or
-;  * (at your option) any later version.
-;  *
-;  * wolfSSL is distributed in the hope that it will be useful,
-;  * but WITHOUT ANY WARRANTY; without even the implied warranty of
-;  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;  * GNU General Public License for more details.
-;  *
-;  * You should have received a copy of the GNU General Public License
-;  * along with this program; if not, write to the Free Software
-;  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
-;  */
-
-
-
-;
-;
-;  /* See Intel Advanced Encryption Standard (AES) Instructions Set White Paper
-;  * by Shay Gueron, Intel Mobility Group, Israel Development Center, Israel
-;   */
-;
-;   /* This file is in Intel asm syntax; see the .S file for AT&T syntax */
-;
-
-
-fips_version = 0
-IFDEF HAVE_FIPS
-  fips_version = 1
-  IFDEF HAVE_FIPS_VERSION
-    fips_version = HAVE_FIPS_VERSION
-  ENDIF
-ENDIF
-
-IF fips_version GE 2
-  fipsAh SEGMENT ALIAS(".fipsA$h") 'CODE'
-ELSE
-  _text SEGMENT
-ENDIF
-
-
-;	/*
-;	AES_CBC_encrypt (const unsigned char *in,
-;	  unsigned char *out,
-;	  unsigned char ivec[16],
-;	  unsigned long length,
-;	  const unsigned char *KS,
-;	  int nr)
-;	*/
-AES_CBC_encrypt PROC
-;#	parameter	1:	rdi
-;#	parameter	2:	rsi
-;#	parameter	3:	rdx
-;#	parameter	4:	rcx
-;#	parameter	5:	r8
-;#	parameter	6:	r9d
-
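-; Win64 passes the first four integer args in rcx, rdx, r8, r9; args
-; 5 and 6 arrive on the stack above the 32-byte shadow space, at
-; [rsp+40] and [rsp+48].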
-; save rdi and rsi to rax and r11, restore before ret
-	mov rax,rdi
-	mov r11,rsi
-
-; convert to what we had for the AT&T convention
-	mov rdi,rcx
-	mov rsi,rdx
-	mov rdx,r8
-	mov rcx,r9
-	mov r8,[rsp+40]
-	mov r9d,[rsp+48]
-
-	mov	r10,rcx
-	shr	rcx,4
-	shl	r10,60
-	je	NO_PARTS
-	add	rcx,1
-NO_PARTS:
-	sub	rsi,16
-	movdqa	xmm1,[rdx]
-LOOP_1:
-	pxor	xmm1,[rdi]
-	pxor	xmm1,[r8]
-	add	rsi,16
-	add	rdi,16
-	cmp	r9d,12
-	aesenc	xmm1,16[r8]
-	aesenc	xmm1,32[r8]
-	aesenc	xmm1,48[r8]
-	aesenc	xmm1,64[r8]
-	aesenc	xmm1,80[r8]
-	aesenc	xmm1,96[r8]
-	aesenc	xmm1,112[r8]
-	aesenc	xmm1,128[r8]
-	aesenc	xmm1,144[r8]
-	movdqa	xmm2,160[r8]
-	jb	LAST
-	cmp	r9d,14
-
-	aesenc	xmm1,160[r8]
-	aesenc	xmm1,176[r8]
-	movdqa	xmm2,192[r8]
-	jb	LAST
-	aesenc	xmm1,192[r8]
-	aesenc	xmm1,208[r8]
-	movdqa	xmm2,224[r8]
-LAST:
-	dec	rcx
-	aesenclast	xmm1,xmm2
-	movdqu	[rsi],xmm1
-	jne	LOOP_1
-	; restore non-volatile rdi and rsi
-	mov rdi,rax
-	mov rsi,r11
-	ret
-AES_CBC_encrypt ENDP
-
-
-; void AES_CBC_decrypt_by4(const unsigned char* in,
-;                          unsigned char* out,
-;                          unsigned char ivec[16],
-;                          unsigned long length,
-;                          const unsigned char* KS,
-;                          int nr)
-AES_CBC_decrypt_by4 PROC
-; parameter 1: rdi
-; parameter 2: rsi
-; parameter 3: rdx
-; parameter 4: rcx
-; parameter 5: r8
-; parameter 6: r9d
-
-        ; save rdi and rsi to rax and r11, restore before ret
-        mov         rax, rdi
-        mov         r11, rsi
-        ; convert to what we had for the AT&T convention
-        mov         rdi, rcx
-        mov         rsi, rdx
-        mov         rdx, r8
-        mov         rcx,r9
-        mov         r8, [rsp+40]
-        mov         r9d, [rsp+48]
-        ; on Microsoft x64, xmm6-xmm15 are non-volatile,
-        ; so save them on the stack and restore them at the end
-        sub         rsp, 8+8*16  ; 8 bytes to align the stack, 8 regs (xmm6-xmm12, xmm15) at 16 bytes each
-        movdqa      [rsp+0], xmm6
-        movdqa      [rsp+16], xmm7
-        movdqa      [rsp+32], xmm8
-        movdqa      [rsp+48], xmm9
-        movdqa      [rsp+64], xmm10
-        movdqa      [rsp+80], xmm11
-        movdqa      [rsp+96], xmm12
-        movdqa      [rsp+112], xmm15
-        ; back to our original code, more or less
-        mov         r10, rcx
-        shr         rcx, 4
-        shl         r10, 60
-        je          DNO_PARTS_4
-        add         rcx, 1
-DNO_PARTS_4:
-        mov         r10, rcx
-        shl         r10, 62
-        shr         r10, 62
-        shr         rcx, 2
-        movdqu      xmm5, [rdx]
-        je          DREMAINDER_4
-        sub         rsi, 64
-DLOOP_4:
-        movdqu      xmm1, [rdi]
-        movdqu      xmm2, 16[rdi]
-        movdqu      xmm3, 32[rdi]
-        movdqu      xmm4, 48[rdi]
-        movdqa      xmm6, xmm1
-        movdqa      xmm7, xmm2
-        movdqa      xmm8, xmm3
-        movdqa      xmm15, xmm4
-        movdqa      xmm9, [r8]
-        movdqa      xmm10, 16[r8]
-        movdqa      xmm11, 32[r8]
-        movdqa      xmm12, 48[r8]
-        pxor        xmm1, xmm9
-        pxor        xmm2, xmm9
-        pxor        xmm3, xmm9
-        pxor        xmm4, xmm9
-        aesdec      xmm1, xmm10
-        aesdec      xmm2, xmm10
-        aesdec      xmm3, xmm10
-        aesdec      xmm4, xmm10
-        aesdec      xmm1, xmm11
-        aesdec      xmm2, xmm11
-        aesdec      xmm3, xmm11
-        aesdec      xmm4, xmm11
-        aesdec      xmm1, xmm12
-        aesdec      xmm2, xmm12
-        aesdec      xmm3, xmm12
-        aesdec      xmm4, xmm12
-        movdqa      xmm9, 64[r8]
-        movdqa      xmm10, 80[r8]
-        movdqa      xmm11, 96[r8]
-        movdqa      xmm12, 112[r8]
-        aesdec      xmm1, xmm9
-        aesdec      xmm2, xmm9
-        aesdec      xmm3, xmm9
-        aesdec      xmm4, xmm9
-        aesdec      xmm1, xmm10
-        aesdec      xmm2, xmm10
-        aesdec      xmm3, xmm10
-        aesdec      xmm4, xmm10
-        aesdec      xmm1, xmm11
-        aesdec      xmm2, xmm11
-        aesdec      xmm3, xmm11
-        aesdec      xmm4, xmm11
-        aesdec      xmm1, xmm12
-        aesdec      xmm2, xmm12
-        aesdec      xmm3, xmm12
-        aesdec      xmm4, xmm12
-        movdqa      xmm9, 128[r8]
-        movdqa      xmm10, 144[r8]
-        movdqa      xmm11, 160[r8]
-        cmp         r9d, 12
-        aesdec      xmm1, xmm9
-        aesdec      xmm2, xmm9
-        aesdec      xmm3, xmm9
-        aesdec      xmm4, xmm9
-        aesdec      xmm1, xmm10
-        aesdec      xmm2, xmm10
-        aesdec      xmm3, xmm10
-        aesdec      xmm4, xmm10
-        jb          DLAST_4
-        movdqa      xmm9, 160[r8]
-        movdqa      xmm10, 176[r8]
-        movdqa      xmm11, 192[r8]
-        cmp         r9d, 14
-        aesdec      xmm1, xmm9
-        aesdec      xmm2, xmm9
-        aesdec      xmm3, xmm9
-        aesdec      xmm4, xmm9
-        aesdec      xmm1, xmm10
-        aesdec      xmm2, xmm10
-        aesdec      xmm3, xmm10
-        aesdec      xmm4, xmm10
-        jb          DLAST_4
-        movdqa      xmm9, 192[r8]
-        movdqa      xmm10, 208[r8]
-        movdqa      xmm11, 224[r8]
-        aesdec      xmm1, xmm9
-        aesdec      xmm2, xmm9
-        aesdec      xmm3, xmm9
-        aesdec      xmm4, xmm9
-        aesdec      xmm1, xmm10
-        aesdec      xmm2, xmm10
-        aesdec      xmm3, xmm10
-        aesdec      xmm4, xmm10
-DLAST_4:
-        add         rdi, 64
-        add         rsi, 64
-        dec         rcx
-        aesdeclast  xmm1, xmm11
-        aesdeclast  xmm2, xmm11
-        aesdeclast  xmm3, xmm11
-        aesdeclast  xmm4, xmm11
-        pxor        xmm1, xmm5
-        pxor        xmm2, xmm6
-        pxor        xmm3, xmm7
-        pxor        xmm4, xmm8
-        movdqu      [rsi], xmm1
-        movdqu      16[rsi], xmm2
-        movdqu      32[rsi], xmm3
-        movdqu      48[rsi], xmm4
-        movdqa      xmm5, xmm15
-        jne         DLOOP_4
-        add         rsi, 64
-DREMAINDER_4:
-        cmp         r10, 0
-        je          DEND_4
-DLOOP_4_2:
-        movdqu      xmm1, [rdi]
-        movdqa      xmm15, xmm1
-        add         rdi, 16
-        pxor        xmm1, [r8]
-        movdqu      xmm2, 160[r8]
-        cmp         r9d, 12
-        aesdec      xmm1, 16[r8]
-        aesdec      xmm1, 32[r8]
-        aesdec      xmm1, 48[r8]
-        aesdec      xmm1, 64[r8]
-        aesdec      xmm1, 80[r8]
-        aesdec      xmm1, 96[r8]
-        aesdec      xmm1, 112[r8]
-        aesdec      xmm1, 128[r8]
-        aesdec      xmm1, 144[r8]
-        jb          DLAST_4_2
-        movdqu      xmm2, 192[r8]
-        cmp         r9d, 14
-        aesdec      xmm1, 160[r8]
-        aesdec      xmm1, 176[r8]
-        jb          DLAST_4_2
-        movdqu      xmm2, 224[r8]
-        aesdec      xmm1, 192[r8]
-        aesdec      xmm1, 208[r8]
-DLAST_4_2:
-        aesdeclast  xmm1, xmm2
-        pxor        xmm1, xmm5
-        movdqa      xmm5, xmm15
-        movdqu      [rsi], xmm1
-        add         rsi, 16
-        dec         r10
-        jne         DLOOP_4_2
-DEND_4:
-        ; restore non-volatile rdi and rsi
-        mov         rdi, rax
-        mov         rsi, r11
-        ; restore non-volatile xmms from the stack
-        movdqa      xmm6, [rsp+0]
-        movdqa      xmm7, [rsp+16]
-        movdqa      xmm8, [rsp+32]
-        movdqa      xmm9, [rsp+48]
-        movdqa      xmm10, [rsp+64]
-        movdqa      xmm11, [rsp+80]
-        movdqa      xmm12, [rsp+96]
-        movdqa      xmm15, [rsp+112]
-        add         rsp, 8+8*16 ; 8 bytes of alignment plus 8 regs (xmm6-xmm12, xmm15) at 16 bytes each
-        ret
-AES_CBC_decrypt_by4 ENDP
-
-
-; void AES_CBC_decrypt_by6(const unsigned char *in,
-;                          unsigned char *out,
-;                          unsigned char ivec[16],
-;                          unsigned long length,
-;                          const unsigned char *KS,
-;                          int nr)
-AES_CBC_decrypt_by6 PROC
-; parameter 1: rdi - in
-; parameter 2: rsi - out
-; parameter 3: rdx - ivec
-; parameter 4: rcx - length
-; parameter 5: r8  - KS
-; parameter 6: r9d - nr
-
-        ; save rdi and rsi to rax and r11, restore before ret
-        mov         rax, rdi
-        mov         r11, rsi
-        ; convert to what we had for the AT&T convention
-        mov         rdi, rcx
-        mov         rsi, rdx
-        mov         rdx, r8
-        mov         rcx, r9
-        mov         r8, [rsp+40]
-        mov         r9d, [rsp+48]
-        ; on Microsoft x64, xmm6-xmm15 are non-volatile,
-        ; so save them on the stack and restore them at the end
-        sub         rsp, 8+9*16  ; 8 bytes to align the stack, 9 regs (xmm6-xmm14) at 16 bytes each
-        movdqa      [rsp+0], xmm6
-        movdqa      [rsp+16], xmm7
-        movdqa      [rsp+32], xmm8
-        movdqa      [rsp+48], xmm9
-        movdqa      [rsp+64], xmm10
-        movdqa      [rsp+80], xmm11
-        movdqa      [rsp+96], xmm12
-        movdqa      [rsp+112], xmm13
-        movdqa      [rsp+128], xmm14
-        ; back to our original code, more or less
-        mov         r10, rcx
-        shr         rcx, 4
-        shl         r10, 60
-        je          DNO_PARTS_6
-        add         rcx, 1
-DNO_PARTS_6:
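-        ; six blocks per iteration is not a power of two, so use div:
-        ; rcx = number of whole 6-block groups, r10 = leftover blocks.
-        ; rax, rdx and rbx are staged in r12-r14 around the divide.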
-        mov         r12, rax
-        mov         r13, rdx
-        mov         r14, rbx
-        mov         rdx, 0
-        mov         rax, rcx
-        mov         rbx, 6
-        div         rbx
-        mov         rcx, rax
-        mov         r10, rdx
-        mov         rax, r12
-        mov         rdx, r13
-        mov         rbx, r14
-        cmp         rcx, 0
-        movdqu      xmm7, [rdx]
-        je          DREMAINDER_6
-        sub         rsi, 96
-DLOOP_6:
-        movdqu      xmm1, [rdi]
-        movdqu      xmm2, 16[rdi]
-        movdqu      xmm3, 32[rdi]
-        movdqu      xmm4, 48[rdi]
-        movdqu      xmm5, 64[rdi]
-        movdqu      xmm6, 80[rdi]
-        movdqa      xmm8, [r8]
-        movdqa      xmm9, 16[r8]
-        movdqa      xmm10, 32[r8]
-        movdqa      xmm11, 48[r8]
-        pxor        xmm1, xmm8
-        pxor        xmm2, xmm8
-        pxor        xmm3, xmm8
-        pxor        xmm4, xmm8
-        pxor        xmm5, xmm8
-        pxor        xmm6, xmm8
-        aesdec      xmm1, xmm9
-        aesdec      xmm2, xmm9
-        aesdec      xmm3, xmm9
-        aesdec      xmm4, xmm9
-        aesdec      xmm5, xmm9
-        aesdec      xmm6, xmm9
-        aesdec      xmm1, xmm10
-        aesdec      xmm2, xmm10
-        aesdec      xmm3, xmm10
-        aesdec      xmm4, xmm10
-        aesdec      xmm5, xmm10
-        aesdec      xmm6, xmm10
-        aesdec      xmm1, xmm11
-        aesdec      xmm2, xmm11
-        aesdec      xmm3, xmm11
-        aesdec      xmm4, xmm11
-        aesdec      xmm5, xmm11
-        aesdec      xmm6, xmm11
-        movdqa      xmm8, 64[r8]
-        movdqa      xmm9, 80[r8]
-        movdqa      xmm10, 96[r8]
-        movdqa      xmm11, 112[r8]
-        aesdec      xmm1, xmm8
-        aesdec      xmm2, xmm8
-        aesdec      xmm3, xmm8
-        aesdec      xmm4, xmm8
-        aesdec      xmm5, xmm8
-        aesdec      xmm6, xmm8
-        aesdec      xmm1, xmm9
-        aesdec      xmm2, xmm9
-        aesdec      xmm3, xmm9
-        aesdec      xmm4, xmm9
-        aesdec      xmm5, xmm9
-        aesdec      xmm6, xmm9
-        aesdec      xmm1, xmm10
-        aesdec      xmm2, xmm10
-        aesdec      xmm3, xmm10
-        aesdec      xmm4, xmm10
-        aesdec      xmm5, xmm10
-        aesdec      xmm6, xmm10
-        aesdec      xmm1, xmm11
-        aesdec      xmm2, xmm11
-        aesdec      xmm3, xmm11
-        aesdec      xmm4, xmm11
-        aesdec      xmm5, xmm11
-        aesdec      xmm6, xmm11
-        movdqa      xmm8, 128[r8]
-        movdqa      xmm9, 144[r8]
-        movdqa      xmm10, 160[r8]
-        cmp         r9d, 12
-        aesdec      xmm1, xmm8
-        aesdec      xmm2, xmm8
-        aesdec      xmm3, xmm8
-        aesdec      xmm4, xmm8
-        aesdec      xmm5, xmm8
-        aesdec      xmm6, xmm8
-        aesdec      xmm1, xmm9
-        aesdec      xmm2, xmm9
-        aesdec      xmm3, xmm9
-        aesdec      xmm4, xmm9
-        aesdec      xmm5, xmm9
-        aesdec      xmm6, xmm9
-        jb          DLAST_6
-        movdqa      xmm8, 160[r8]
-        movdqa      xmm9, 176[r8]
-        movdqa      xmm10, 192[r8]
-        cmp         r9d, 14
-        aesdec      xmm1, xmm8
-        aesdec      xmm2, xmm8
-        aesdec      xmm3, xmm8
-        aesdec      xmm4, xmm8
-        aesdec      xmm5, xmm8
-        aesdec      xmm6, xmm8
-        aesdec      xmm1, xmm9
-        aesdec      xmm2, xmm9
-        aesdec      xmm3, xmm9
-        aesdec      xmm4, xmm9
-        aesdec      xmm5, xmm9
-        aesdec      xmm6, xmm9
-        jb          DLAST_6
-        movdqa      xmm8, 192[r8]
-        movdqa      xmm9, 208[r8]
-        movdqa      xmm10, 224[r8]
-        aesdec      xmm1, xmm8
-        aesdec      xmm2, xmm8
-        aesdec      xmm3, xmm8
-        aesdec      xmm4, xmm8
-        aesdec      xmm5, xmm8
-        aesdec      xmm6, xmm8
-        aesdec      xmm1, xmm9
-        aesdec      xmm2, xmm9
-        aesdec      xmm3, xmm9
-        aesdec      xmm4, xmm9
-        aesdec      xmm5, xmm9
-        aesdec      xmm6, xmm9
-DLAST_6:
-        add         rsi, 96
-        aesdeclast  xmm1, xmm10
-        aesdeclast  xmm2, xmm10
-        aesdeclast  xmm3, xmm10
-        aesdeclast  xmm4, xmm10
-        aesdeclast  xmm5, xmm10
-        aesdeclast  xmm6, xmm10
-        movdqu      xmm8, [rdi]
-        movdqu      xmm9, 16[rdi]
-        movdqu      xmm10, 32[rdi]
-        movdqu      xmm11, 48[rdi]
-        movdqu      xmm12, 64[rdi]
-        movdqu      xmm13, 80[rdi]
-        pxor        xmm1, xmm7
-        pxor        xmm2, xmm8
-        pxor        xmm3, xmm9
-        pxor        xmm4, xmm10
-        pxor        xmm5, xmm11
-        pxor        xmm6, xmm12
-        movdqu      xmm7, xmm13
-        movdqu      [rsi], xmm1
-        movdqu      16[rsi], xmm2
-        movdqu      32[rsi], xmm3
-        movdqu      48[rsi], xmm4
-        movdqu      64[rsi], xmm5
-        movdqu      80[rsi], xmm6
-        add         rdi, 96
-        dec         rcx
-        jne         DLOOP_6
-        add         rsi, 96
-DREMAINDER_6:
-        cmp         r10, 0
-        je          DEND_6
-DLOOP_6_2:
-        movdqu      xmm1, [rdi]
-        movdqa      xmm10, xmm1
-        add         rdi, 16
-        pxor        xmm1, [r8]
-        movdqu      xmm2, 160[r8]
-        cmp         r9d, 12
-        aesdec      xmm1, 16[r8]
-        aesdec      xmm1, 32[r8]
-        aesdec      xmm1, 48[r8]
-        aesdec      xmm1, 64[r8]
-        aesdec      xmm1, 80[r8]
-        aesdec      xmm1, 96[r8]
-        aesdec      xmm1, 112[r8]
-        aesdec      xmm1, 128[r8]
-        aesdec      xmm1, 144[r8]
-        jb          DLAST_6_2
-        movdqu      xmm2, 192[r8]
-        cmp         r9d, 14
-        aesdec      xmm1, 160[r8]
-        aesdec      xmm1, 176[r8]
-        jb          DLAST_6_2
-        movdqu      xmm2, 224[r8]
-        aesdec      xmm1, 192[r8]
-        aesdec      xmm1, 208[r8]
-DLAST_6_2:
-        aesdeclast  xmm1, xmm2
-        pxor        xmm1, xmm7
-        movdqa      xmm7, xmm10
-        movdqu      [rsi], xmm1
-        add         rsi, 16
-        dec         r10
-        jne         DLOOP_6_2
-DEND_6:
-        ; restore non volatile rdi,rsi
-        mov         rdi, rax
-        mov         rsi, r11
-        ; restore non volatile xmms from stack
-        movdqa      xmm6, [rsp+0]
-        movdqa      xmm7, [rsp+16]
-        movdqa      xmm8, [rsp+32]
-        movdqa      xmm9, [rsp+48]
-        movdqa      xmm10, [rsp+64]
-        movdqa      xmm11, [rsp+80]
-        movdqa      xmm12, [rsp+96]
-        movdqa      xmm13, [rsp+112]
-        movdqa      xmm14, [rsp+128]
-        add         rsp, 8+9*16 ; 8 = align stack , 9 xmm6-14 16 bytes each
-        ret
-AES_CBC_decrypt_by6 ENDP
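The by6/by8 split exists because CBC decryption, unlike CBC encryption, has no serial dependency through the cipher: every ciphertext block can go through the AESDEC rounds independently, and the chaining XOR only needs the previous ciphertext block, which is already in hand. A minimal C sketch of that structure, assuming a hypothetical `aes_decrypt_block` helper in place of the AESDEC round sequence:

```c
#include <stddef.h>
#include <string.h>

/* Hypothetical one-block decryptor standing in for the AESDEC rounds. */
void aes_decrypt_block(const unsigned char *KS, int nr,
                       const unsigned char in[16], unsigned char out[16]);

/* CBC decrypt: P[i] = D_K(C[i]) XOR C[i-1], with C[-1] = ivec.
 * Every D_K() call is independent, which is what lets the assembly
 * keep 6 (or 8) blocks in flight in xmm1..xmm6 (xmm1..xmm8). */
void cbc_decrypt(const unsigned char *in, unsigned char *out,
                 unsigned char ivec[16], size_t blocks,
                 const unsigned char *KS, int nr)
{
    unsigned char prev[16], c[16];
    memcpy(prev, ivec, 16);
    for (size_t i = 0; i < blocks; i++) {
        memcpy(c, in + 16 * i, 16);       /* keep C[i]: in may alias out */
        aes_decrypt_block(KS, nr, c, out + 16 * i);
        for (int j = 0; j < 16; j++)
            out[16 * i + j] ^= prev[j];   /* chain with C[i-1] / IV */
        memcpy(prev, c, 16);
    }
    memcpy(ivec, prev, 16);               /* hand back the updated IV */
}
```

The `c` copy keeps C[i] alive for the next block's XOR, the same job the xmm7 (by6) and xmm9 (by8) carry registers do above, and makes in-place decryption (`in == out`) safe.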
-
-
-; void AES_CBC_decrypt_by8(const unsigned char *in,
-;                          unsigned char *out,
-;                          unsigned char ivec[16],
-;                          unsigned long length,
-;                          const unsigned char *KS,
-;                          int nr)
-AES_CBC_decrypt_by8 PROC
-; parameter 1: rdi - in
-; parameter 2: rsi - out
-; parameter 3: rdx - ivec
-; parameter 4: rcx - length
-; parameter 5: r8  - KS
-; parameter 6: r9d - nr
-
-        ; save rdi and rsi to rax and r11, restore before ret
-        mov         rax, rdi
-        mov         r11, rsi
-        ; convert to what we had for the AT&T convention
-        mov         rdi, rcx
-        mov         rsi, rdx
-        mov         rdx, r8
-        mov         rcx,r9
-        mov         r8, [rsp+40]
-        mov         r9d, [rsp+48]
-        ; on microsoft xmm6-xmm15 are non volatile,
-        ; let's save on stack and restore at end
-        sub         rsp, 8+8*16  ; 8 = align stack , 8 xmm6-13 16 bytes each
-        movdqa      [rsp+0], xmm6
-        movdqa      [rsp+16], xmm7
-        movdqa      [rsp+32], xmm8
-        movdqa      [rsp+48], xmm9
-        movdqa      [rsp+64], xmm10
-        movdqa      [rsp+80], xmm11
-        movdqa      [rsp+96], xmm12
-        movdqa      [rsp+112], xmm13
-        ; back to our original code, more or less
-        mov         r10, rcx
-        shr         rcx, 4
-        shl         r10, 60
-        je          DNO_PARTS_8
-        add         rcx, 1
-DNO_PARTS_8:
-        mov         r10, rcx
-        shl         r10, 61
-        shr         r10, 61
-        shr         rcx, 3
-        movdqu      xmm9, [rdx]
-        je          DREMAINDER_8
-        sub         rsi, 128
-DLOOP_8:
-        movdqu      xmm1, [rdi]
-        movdqu      xmm2, 16[rdi]
-        movdqu      xmm3, 32[rdi]
-        movdqu      xmm4, 48[rdi]
-        movdqu      xmm5, 64[rdi]
-        movdqu      xmm6, 80[rdi]
-        movdqu      xmm7, 96[rdi]
-        movdqu      xmm8, 112[rdi]
-        movdqa      xmm10, [r8]
-        movdqa      xmm11, 16[r8]
-        movdqa      xmm12, 32[r8]
-        movdqa      xmm13, 48[r8]
-        pxor        xmm1, xmm10
-        pxor        xmm2, xmm10
-        pxor        xmm3, xmm10
-        pxor        xmm4, xmm10
-        pxor        xmm5, xmm10
-        pxor        xmm6, xmm10
-        pxor        xmm7, xmm10
-        pxor        xmm8, xmm10
-        aesdec      xmm1, xmm11
-        aesdec      xmm2, xmm11
-        aesdec      xmm3, xmm11
-        aesdec      xmm4, xmm11
-        aesdec      xmm5, xmm11
-        aesdec      xmm6, xmm11
-        aesdec      xmm7, xmm11
-        aesdec      xmm8, xmm11
-        aesdec      xmm1, xmm12
-        aesdec      xmm2, xmm12
-        aesdec      xmm3, xmm12
-        aesdec      xmm4, xmm12
-        aesdec      xmm5, xmm12
-        aesdec      xmm6, xmm12
-        aesdec      xmm7, xmm12
-        aesdec      xmm8, xmm12
-        aesdec      xmm1, xmm13
-        aesdec      xmm2, xmm13
-        aesdec      xmm3, xmm13
-        aesdec      xmm4, xmm13
-        aesdec      xmm5, xmm13
-        aesdec      xmm6, xmm13
-        aesdec      xmm7, xmm13
-        aesdec      xmm8, xmm13
-        movdqa      xmm10, 64[r8]
-        movdqa      xmm11, 80[r8]
-        movdqa      xmm12, 96[r8]
-        movdqa      xmm13, 112[r8]
-        aesdec      xmm1, xmm10
-        aesdec      xmm2, xmm10
-        aesdec      xmm3, xmm10
-        aesdec      xmm4, xmm10
-        aesdec      xmm5, xmm10
-        aesdec      xmm6, xmm10
-        aesdec      xmm7, xmm10
-        aesdec      xmm8, xmm10
-        aesdec      xmm1, xmm11
-        aesdec      xmm2, xmm11
-        aesdec      xmm3, xmm11
-        aesdec      xmm4, xmm11
-        aesdec      xmm5, xmm11
-        aesdec      xmm6, xmm11
-        aesdec      xmm7, xmm11
-        aesdec      xmm8, xmm11
-        aesdec      xmm1, xmm12
-        aesdec      xmm2, xmm12
-        aesdec      xmm3, xmm12
-        aesdec      xmm4, xmm12
-        aesdec      xmm5, xmm12
-        aesdec      xmm6, xmm12
-        aesdec      xmm7, xmm12
-        aesdec      xmm8, xmm12
-        aesdec      xmm1, xmm13
-        aesdec      xmm2, xmm13
-        aesdec      xmm3, xmm13
-        aesdec      xmm4, xmm13
-        aesdec      xmm5, xmm13
-        aesdec      xmm6, xmm13
-        aesdec      xmm7, xmm13
-        aesdec      xmm8, xmm13
-        movdqa      xmm10, 128[r8]
-        movdqa      xmm11, 144[r8]
-        movdqa      xmm12, 160[r8]
-        cmp         r9d, 12
-        aesdec      xmm1, xmm10
-        aesdec      xmm2, xmm10
-        aesdec      xmm3, xmm10
-        aesdec      xmm4, xmm10
-        aesdec      xmm5, xmm10
-        aesdec      xmm6, xmm10
-        aesdec      xmm7, xmm10
-        aesdec      xmm8, xmm10
-        aesdec      xmm1, xmm11
-        aesdec      xmm2, xmm11
-        aesdec      xmm3, xmm11
-        aesdec      xmm4, xmm11
-        aesdec      xmm5, xmm11
-        aesdec      xmm6, xmm11
-        aesdec      xmm7, xmm11
-        aesdec      xmm8, xmm11
-        jb          DLAST_8
-        movdqa      xmm10, 160[r8]
-        movdqa      xmm11, 176[r8]
-        movdqa      xmm12, 192[r8]
-        cmp         r9d, 14
-        aesdec      xmm1, xmm10
-        aesdec      xmm2, xmm10
-        aesdec      xmm3, xmm10
-        aesdec      xmm4, xmm10
-        aesdec      xmm5, xmm10
-        aesdec      xmm6, xmm10
-        aesdec      xmm7, xmm10
-        aesdec      xmm8, xmm10
-        aesdec      xmm1, xmm11
-        aesdec      xmm2, xmm11
-        aesdec      xmm3, xmm11
-        aesdec      xmm4, xmm11
-        aesdec      xmm5, xmm11
-        aesdec      xmm6, xmm11
-        aesdec      xmm7, xmm11
-        aesdec      xmm8, xmm11
-        jb          DLAST_8
-        movdqa      xmm10, 192[r8]
-        movdqa      xmm11, 208[r8]
-        movdqa      xmm12, 224[r8]
-        aesdec      xmm1, xmm10
-        aesdec      xmm2, xmm10
-        aesdec      xmm3, xmm10
-        aesdec      xmm4, xmm10
-        aesdec      xmm5, xmm10
-        aesdec      xmm6, xmm10
-        aesdec      xmm7, xmm10
-        aesdec      xmm8, xmm10
-        aesdec      xmm1, xmm11
-        aesdec      xmm2, xmm11
-        aesdec      xmm3, xmm11
-        aesdec      xmm4, xmm11
-        aesdec      xmm5, xmm11
-        aesdec      xmm6, xmm11
-        aesdec      xmm7, xmm11
-        aesdec      xmm8, xmm11
-DLAST_8:
-        add         rsi, 128
-        aesdeclast  xmm1, xmm12
-        aesdeclast  xmm2, xmm12
-        aesdeclast  xmm3, xmm12
-        aesdeclast  xmm4, xmm12
-        aesdeclast  xmm5, xmm12
-        aesdeclast  xmm6, xmm12
-        aesdeclast  xmm7, xmm12
-        aesdeclast  xmm8, xmm12
-        movdqu      xmm10, [rdi]
-        movdqu      xmm11, 16[rdi]
-        movdqu      xmm12, 32[rdi]
-        movdqu      xmm13, 48[rdi]
-        pxor        xmm1, xmm9
-        pxor        xmm2, xmm10
-        pxor        xmm3, xmm11
-        pxor        xmm4, xmm12
-        pxor        xmm5, xmm13
-        movdqu      xmm10, 64[rdi]
-        movdqu      xmm11, 80[rdi]
-        movdqu      xmm12, 96[rdi]
-        movdqu      xmm9, 112[rdi]
-        pxor        xmm6, xmm10
-        pxor        xmm7, xmm11
-        pxor        xmm8, xmm12
-        movdqu      [rsi], xmm1
-        movdqu      16[rsi], xmm2
-        movdqu      32[rsi], xmm3
-        movdqu      48[rsi], xmm4
-        movdqu      64[rsi], xmm5
-        movdqu      80[rsi], xmm6
-        movdqu      96[rsi], xmm7
-        movdqu      112[rsi], xmm8
-        add         rdi, 128
-        dec         rcx
-        jne         DLOOP_8
-        add         rsi, 128
-DREMAINDER_8:
-        cmp         r10, 0 
-        je          DEND_8
-DLOOP_8_2:
-        movdqu      xmm1, [rdi]
-        movdqa      xmm10, xmm1
-        add         rdi, 16
-        pxor        xmm1, [r8]
-        movdqu      xmm2, 160[r8]
-        cmp         r9d, 12
-        aesdec      xmm1, 16[r8]
-        aesdec      xmm1, 32[r8]
-        aesdec      xmm1, 48[r8]
-        aesdec      xmm1, 64[r8]
-        aesdec      xmm1, 80[r8]
-        aesdec      xmm1, 96[r8]
-        aesdec      xmm1, 112[r8]
-        aesdec      xmm1, 128[r8]
-        aesdec      xmm1, 144[r8]
-        jb          DLAST_8_2
-        movdqu      xmm2, 192[r8]
-        cmp         r9d, 14
-        aesdec      xmm1, 160[r8]
-        aesdec      xmm1, 176[r8]
-        jb          DLAST_8_2
-        movdqu      xmm2, 224[r8]
-        aesdec      xmm1, 192[r8]
-        aesdec      xmm1, 208[r8]
-DLAST_8_2:
-        aesdeclast  xmm1, xmm2
-        pxor        xmm1, xmm9
-        movdqa      xmm9, xmm10
-        movdqu      [rsi], xmm1
-        add         rsi, 16
-        dec         r10
-        jne         DLOOP_8_2
-DEND_8:
-        ; restore non volatile rdi,rsi
-        mov         rdi, rax
-        mov         rsi, r11
-        ; restore non volatile xmms from stack
-        movdqa      xmm6, [rsp+0]
-        movdqa      xmm7, [rsp+16]
-        movdqa      xmm8, [rsp+32]
-        movdqa      xmm9, [rsp+48]
-        movdqa      xmm10, [rsp+64]
-        movdqa      xmm11, [rsp+80]
-        movdqa      xmm12, [rsp+96]
-        movdqa      xmm13, [rsp+112]
-        add         rsp, 8+8*16 ; 8 = align stack , 8 xmm6-13 16 bytes each
-        ret
-AES_CBC_decrypt_by8 ENDP
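Both CBC PROCs open with the same Win64-to-System-V shim: rdi/rsi are stashed because Windows treats them as callee-saved, the Microsoft-convention arguments (rcx, rdx, r8, r9, then the stack slots at [rsp+40] and [rsp+48]) are shuffled into the System V argument registers the ported body expects, and xmm6 and up are spilled since Windows treats those as non-volatile too. GCC and Clang can name both conventions directly; a declarations-only sketch of the two ABIs being bridged:

```c
/* Minimal sketch (GCC/Clang attributes): the same prototype under the
 * two x86-64 calling conventions this prologue bridges. */

/* Microsoft x64: args in rcx, rdx, r8, r9, then [rsp+40], [rsp+48];
 * rdi, rsi and xmm6-xmm15 are callee-saved. */
void __attribute__((ms_abi)) cbc_decrypt_ms(const unsigned char *in,
        unsigned char *out, unsigned char *ivec, unsigned long length,
        const unsigned char *KS, int nr);

/* System V AMD64: args in rdi, rsi, rdx, rcx, r8, r9d; every xmm
 * register is caller-saved, so no spill area is needed. */
void __attribute__((sysv_abi)) cbc_decrypt_sysv(const unsigned char *in,
        unsigned char *out, unsigned char *ivec, unsigned long length,
        const unsigned char *KS, int nr);
```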
-
-
-;	/*
-;	AES_ECB_encrypt (const unsigned char *in,
-;	                 unsigned char *out,
-;	                 unsigned long length,
-;	                 const unsigned char *KS,
-;	                 int nr)
-;	*/
-;	.globl	AES_ECB_encrypt
-AES_ECB_encrypt PROC
-;#	parameter	1:	rdi
-;#	parameter	2:	rsi
-;#	parameter	3:	rdx
-;#	parameter	4:	rcx
-;#	parameter	5:	r8d
-
-; save rdi and rsi to rax and r11, restore before ret
-	mov rax,rdi
-	mov r11,rsi
-
-; convert to what we had for the AT&T convention
-    mov rdi,rcx
-	mov rsi,rdx
-	mov rdx,r8
-	mov rcx,r9
-	mov r8d,[rsp+40]
-
-; on microsoft xmm6-xmm15 are non volatile, let's save on stack and restore at end
-	sub rsp,8+4*16  ; 8 = align stack , 4 xmm9-12, 16 bytes each
-	movdqa [rsp+0], xmm9
-	movdqa [rsp+16], xmm10
-	movdqa [rsp+32], xmm11
-	movdqa [rsp+48], xmm12
-
-
-	mov	r10,rdx
-	shr	rdx,4
-	shl	r10,60
-	je	EECB_NO_PARTS_4
-	add	rdx,1
-EECB_NO_PARTS_4:
-	mov	r10,rdx
-	shl	r10,62
-	shr	r10,62
-	shr	rdx,2
-	je	EECB_REMAINDER_4
-	sub	rsi,64
-EECB_LOOP_4:
-	movdqu  xmm1,[rdi]
-	movdqu	xmm2,16[rdi]
-	movdqu	xmm3,32[rdi]
-	movdqu	xmm4,48[rdi]
-	movdqa  xmm9,[rcx]
-	movdqa	xmm10,16[rcx]
-	movdqa	xmm11,32[rcx]
-	movdqa	xmm12,48[rcx]
-	pxor	xmm1,xmm9
-	pxor	xmm2,xmm9
-	pxor	xmm3,xmm9
-	pxor	xmm4,xmm9
-	aesenc	xmm1,xmm10
-	aesenc	xmm2,xmm10
-	aesenc	xmm3,xmm10
-	aesenc	xmm4,xmm10
-	aesenc	xmm1,xmm11
-	aesenc	xmm2,xmm11
-	aesenc	xmm3,xmm11
-	aesenc	xmm4,xmm11
-	aesenc	xmm1,xmm12
-	aesenc	xmm2,xmm12
-	aesenc	xmm3,xmm12
-	aesenc	xmm4,xmm12
-	movdqa	xmm9,64[rcx]
-	movdqa	xmm10,80[rcx]
-	movdqa	xmm11,96[rcx]
-	movdqa	xmm12,112[rcx]
-	aesenc	xmm1,xmm9
-	aesenc	xmm2,xmm9
-	aesenc	xmm3,xmm9
-	aesenc	xmm4,xmm9
-	aesenc	xmm1,xmm10
-	aesenc	xmm2,xmm10
-	aesenc	xmm3,xmm10
-	aesenc	xmm4,xmm10
-	aesenc	xmm1,xmm11
-	aesenc	xmm2,xmm11
-	aesenc	xmm3,xmm11
-	aesenc	xmm4,xmm11
-	aesenc	xmm1,xmm12
-	aesenc	xmm2,xmm12
-	aesenc	xmm3,xmm12
-	aesenc	xmm4,xmm12
-	movdqa	xmm9,128[rcx]
-	movdqa	xmm10,144[rcx]
-	movdqa	xmm11,160[rcx]
-	cmp	r8d,12
-	aesenc	xmm1,xmm9
-	aesenc	xmm2,xmm9
-	aesenc	xmm3,xmm9
-	aesenc	xmm4,xmm9
-	aesenc	xmm1,xmm10
-	aesenc	xmm2,xmm10
-	aesenc	xmm3,xmm10
-	aesenc	xmm4,xmm10
-	jb	EECB_LAST_4
-	movdqa	xmm9,160[rcx]
-	movdqa	xmm10,176[rcx]
-	movdqa	xmm11,192[rcx]
-	cmp	r8d,14
-	aesenc	xmm1,xmm9
-	aesenc	xmm2,xmm9
-	aesenc	xmm3,xmm9
-	aesenc	xmm4,xmm9
-	aesenc	xmm1,xmm10
-	aesenc	xmm2,xmm10
-	aesenc	xmm3,xmm10
-	aesenc	xmm4,xmm10
-	jb	EECB_LAST_4
-	movdqa	xmm9,192[rcx]
-	movdqa	xmm10,208[rcx]
-	movdqa	xmm11,224[rcx]
-	aesenc	xmm1,xmm9
-	aesenc	xmm2,xmm9
-	aesenc	xmm3,xmm9
-	aesenc	xmm4,xmm9
-	aesenc	xmm1,xmm10
-	aesenc	xmm2,xmm10
-	aesenc	xmm3,xmm10
-	aesenc	xmm4,xmm10
-EECB_LAST_4:
-	add	rdi,64
-	add	rsi,64
-	dec	rdx
-	aesenclast	xmm1,xmm11
-	aesenclast	xmm2,xmm11
-	aesenclast	xmm3,xmm11
-	aesenclast	xmm4,xmm11
-	movdqu	[rsi],xmm1
-	movdqu	16[rsi],xmm2
-	movdqu	32[rsi],xmm3
-	movdqu	48[rsi],xmm4
-	jne	EECB_LOOP_4
-	add	rsi,64
-EECB_REMAINDER_4:
-	cmp	r10,0
-	je	EECB_END_4
-EECB_LOOP_4_2:
-	movdqu  xmm1,[rdi]
-	add	rdi,16
-	pxor	xmm1,[rcx]
-	movdqu	xmm2,160[rcx]
-	aesenc	xmm1,16[rcx]
-	aesenc	xmm1,32[rcx]
-	aesenc	xmm1,48[rcx]
-	aesenc	xmm1,64[rcx]
-	aesenc	xmm1,80[rcx]
-	aesenc	xmm1,96[rcx]
-	aesenc	xmm1,112[rcx]
-	aesenc	xmm1,128[rcx]
-	aesenc	xmm1,144[rcx]
-	cmp	r8d,12
-	jb	EECB_LAST_4_2
-	movdqu	xmm2,192[rcx]
-	aesenc	xmm1,160[rcx]
-	aesenc	xmm1,176[rcx]
-	cmp	r8d,14
-	jb	EECB_LAST_4_2
-	movdqu	xmm2,224[rcx]
-	aesenc	xmm1,192[rcx]
-	aesenc	xmm1,208[rcx]
-EECB_LAST_4_2:
-	aesenclast	xmm1,xmm2
-	movdqu	[rsi],xmm1
-	add	rsi,16
-	dec	r10
-	jne	EECB_LOOP_4_2
-EECB_END_4:
-	; restore non volatile rdi,rsi
-	mov rdi,rax
-	mov rsi,r11
-	; restore non volatile xmms from stack
-	movdqa xmm9, [rsp+0]
-	movdqa xmm10, [rsp+16]
-	movdqa xmm11, [rsp+32]
-	movdqa xmm12, [rsp+48]
-	add rsp,8+4*16 ; 8 = align stack , 4 xmm9-12 16 bytes each
-	ret
-AES_ECB_encrypt ENDP
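The `cmp r8d,12` / `cmp r8d,14` pairs threaded between the rounds are how one routine serves every key size: nr is 10, 12, or 14 for AES-128/-192/-256, the first nine AESENC rounds run unconditionally, and each conditional tail adds two more rounds before AESENCLAST consumes the final round key. The same dispatch as a C sketch with AES-NI intrinsics (not wolfSSL's C API; compile with -maes):

```c
#include <wmmintrin.h>  /* AES-NI intrinsics; compile with -maes */

/* One-block ECB encrypt following the assembly's dispatch: nine
 * unconditional rounds, then two optional pairs selected by nr
 * (10 = AES-128, 12 = AES-192, 14 = AES-256). */
__m128i ecb_encrypt_block(__m128i block, const __m128i *KS, int nr)
{
    block = _mm_xor_si128(block, KS[0]);         /* round-0 whitening */
    for (int r = 1; r <= 9; r++)
        block = _mm_aesenc_si128(block, KS[r]);  /* rounds 1..9 */
    if (nr > 10) {                               /* AES-192 and AES-256 */
        block = _mm_aesenc_si128(block, KS[10]);
        block = _mm_aesenc_si128(block, KS[11]);
    }
    if (nr > 12) {                               /* AES-256 only */
        block = _mm_aesenc_si128(block, KS[12]);
        block = _mm_aesenc_si128(block, KS[13]);
    }
    return _mm_aesenclast_si128(block, KS[nr]);  /* final round key */
}
```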
-
-;	/*
-;	AES_ECB_decrypt (const unsigned char *in,
-;	                 unsigned char *out,
-;	                 unsigned long length,
-;	                 const unsigned char *KS,
-;	                 int nr)
-;	*/
-;	.globl	AES_ECB_decrypt
-AES_ECB_decrypt PROC
-;#	parameter	1:	rdi
-;#	parameter	2:	rsi
-;#	parameter	3:	rdx
-;#	parameter	4:	rcx
-;#	parameter	5:	r8d
-
-; save rdi and rsi to rax and r11, restore before ret
-	mov rax,rdi
-	mov r11,rsi
-
-; convert to what we had for the AT&T convention
-	mov rdi,rcx
-	mov rsi,rdx
-	mov rdx,r8
-	mov rcx,r9
-	mov r8d,[rsp+40]
-
-; on microsoft xmm6-xmm15 are non volatile, let's save on stack and restore at end
-	sub rsp,8+4*16  ; 8 = align stack , 4 xmm9-12, 16 bytes each
-	movdqa [rsp+0], xmm9
-	movdqa [rsp+16], xmm10
-	movdqa [rsp+32], xmm11
-	movdqa [rsp+48], xmm12
-
-	mov	r10,rdx
-	shr	rdx,4
-	shl	r10,60
-	je	DECB_NO_PARTS_4
-	add	rdx,1
-DECB_NO_PARTS_4:
-	mov	r10,rdx
-	shl	r10,62
-	shr	r10,62
-	shr	rdx,2
-	je	DECB_REMAINDER_4
-	sub	rsi,64
-DECB_LOOP_4:
-	movdqu  xmm1,[rdi]
-	movdqu	xmm2,16[rdi]
-	movdqu	xmm3,32[rdi]
-	movdqu	xmm4,48[rdi]
-	movdqa  xmm9,[rcx]
-	movdqa	xmm10,16[rcx]
-	movdqa	xmm11,32[rcx]
-	movdqa	xmm12,48[rcx]
-	pxor	xmm1,xmm9
-	pxor	xmm2,xmm9
-	pxor	xmm3,xmm9
-	pxor	xmm4,xmm9
-	aesdec	xmm1,xmm10
-	aesdec	xmm2,xmm10
-	aesdec	xmm3,xmm10
-	aesdec	xmm4,xmm10
-	aesdec	xmm1,xmm11
-	aesdec	xmm2,xmm11
-	aesdec	xmm3,xmm11
-	aesdec	xmm4,xmm11
-	aesdec	xmm1,xmm12
-	aesdec	xmm2,xmm12
-	aesdec	xmm3,xmm12
-	aesdec	xmm4,xmm12
-	movdqa	xmm9,64[rcx]
-	movdqa	xmm10,80[rcx]
-	movdqa	xmm11,96[rcx]
-	movdqa	xmm12,112[rcx]
-	aesdec	xmm1,xmm9
-	aesdec	xmm2,xmm9
-	aesdec	xmm3,xmm9
-	aesdec	xmm4,xmm9
-	aesdec	xmm1,xmm10
-	aesdec	xmm2,xmm10
-	aesdec	xmm3,xmm10
-	aesdec	xmm4,xmm10
-	aesdec	xmm1,xmm11
-	aesdec	xmm2,xmm11
-	aesdec	xmm3,xmm11
-	aesdec	xmm4,xmm11
-	aesdec	xmm1,xmm12
-	aesdec	xmm2,xmm12
-	aesdec	xmm3,xmm12
-	aesdec	xmm4,xmm12
-	movdqa	xmm9,128[rcx]
-	movdqa	xmm10,144[rcx]
-	movdqa	xmm11,160[rcx]
-	cmp	r8d,12
-	aesdec	xmm1,xmm9
-	aesdec	xmm2,xmm9
-	aesdec	xmm3,xmm9
-	aesdec	xmm4,xmm9
-	aesdec	xmm1,xmm10
-	aesdec	xmm2,xmm10
-	aesdec	xmm3,xmm10
-	aesdec	xmm4,xmm10
-	jb	DECB_LAST_4
-	movdqa	xmm9,160[rcx]
-	movdqa	xmm10,176[rcx]
-	movdqa	xmm11,192[rcx]
-	cmp	r8d,14
-	aesdec	xmm1,xmm9
-	aesdec	xmm2,xmm9
-	aesdec	xmm3,xmm9
-	aesdec	xmm4,xmm9
-	aesdec	xmm1,xmm10
-	aesdec	xmm2,xmm10
-	aesdec	xmm3,xmm10
-	aesdec	xmm4,xmm10
-	jb	DECB_LAST_4
-	movdqa	xmm9,192[rcx]
-	movdqa	xmm10,208[rcx]
-	movdqa	xmm11,224[rcx]
-	aesdec	xmm1,xmm9
-	aesdec	xmm2,xmm9
-	aesdec	xmm3,xmm9
-	aesdec	xmm4,xmm9
-	aesdec	xmm1,xmm10
-	aesdec	xmm2,xmm10
-	aesdec	xmm3,xmm10
-	aesdec	xmm4,xmm10
-DECB_LAST_4:
-	add	rdi,64
-	add	rsi,64
-	dec	rdx
-	aesdeclast	xmm1,xmm11
-	aesdeclast	xmm2,xmm11
-	aesdeclast	xmm3,xmm11
-	aesdeclast	xmm4,xmm11
-	movdqu	[rsi],xmm1
-	movdqu	16[rsi],xmm2
-	movdqu	32[rsi],xmm3
-	movdqu	48[rsi],xmm4
-	jne	DECB_LOOP_4
-	add	rsi,64
-DECB_REMAINDER_4:
-	cmp	r10,0
-	je	DECB_END_4
-DECB_LOOP_4_2:
-	movdqu  xmm1,[rdi]
-	add	rdi,16
-	pxor	xmm1,[rcx]
-	movdqu	xmm2,160[rcx]
-	cmp	r8d,12
-	aesdec	xmm1,16[rcx]
-	aesdec	xmm1,32[rcx]
-	aesdec	xmm1,48[rcx]
-	aesdec	xmm1,64[rcx]
-	aesdec	xmm1,80[rcx]
-	aesdec	xmm1,96[rcx]
-	aesdec	xmm1,112[rcx]
-	aesdec	xmm1,128[rcx]
-	aesdec	xmm1,144[rcx]
-	jb	DECB_LAST_4_2
-	cmp	r8d,14
-	movdqu	xmm2,192[rcx]
-	aesdec	xmm1,160[rcx]
-	aesdec	xmm1,176[rcx]
-	jb	DECB_LAST_4_2
-	movdqu	xmm2,224[rcx]
-	aesdec	xmm1,192[rcx]
-	aesdec	xmm1,208[rcx]
-DECB_LAST_4_2:
-	aesdeclast	xmm1,xmm2
-	movdqu	[rsi],xmm1
-	add	rsi,16
-	dec	r10
-	jne	DECB_LOOP_4_2
-DECB_END_4:
-	; restore non volatile rdi,rsi
-	mov rdi,rax
-	mov rsi,r11
-	; restore non volatile xmms from stack
-	movdqa xmm9, [rsp+0]
-	movdqa xmm10, [rsp+16]
-	movdqa xmm11, [rsp+32]
-	movdqa xmm12, [rsp+48]
-	add rsp,8+4*16 ; 8 = align stack , 4 xmm9-12 16 bytes each
-	ret
-AES_ECB_decrypt ENDP
-
-
-
-;	/*
-;	void AES_128_Key_Expansion (const unsigned char *userkey,
-;	                            unsigned char *key_schedule)
-;	*/
-;	.align	16,0x90
-;	.globl	AES_128_Key_Expansion
-AES_128_Key_Expansion PROC
-;#	parameter	1:	rdi
-;#	parameter	2:	rsi
-
-; save rdi and rsi to rax and r11, restore before ret
-	mov rax,rdi
-	mov r11,rsi
-
-; convert to what we had for the AT&T convention
-	mov rdi,rcx
-	mov rsi,rdx
-
-	mov	dword ptr 240[rsi],10
-
-	movdqu	xmm1,[rdi]
-	movdqa	[rsi],xmm1
-
-
-ASSISTS:
-	aeskeygenassist	xmm2,xmm1,1
-	call	PREPARE_ROUNDKEY_128
-	movdqa	16[rsi],xmm1
-
-	aeskeygenassist	xmm2,xmm1,2
-	call	PREPARE_ROUNDKEY_128
-	movdqa	32[rsi],xmm1
-
-	aeskeygenassist	xmm2,xmm1,4
-	call	PREPARE_ROUNDKEY_128
-	movdqa	48[rsi],xmm1
-
-	aeskeygenassist	xmm2,xmm1,8
-	call	PREPARE_ROUNDKEY_128
-	movdqa	64[rsi],xmm1
-
-	aeskeygenassist	xmm2,xmm1,16
-	call	PREPARE_ROUNDKEY_128
-	movdqa	80[rsi],xmm1
-
-	aeskeygenassist	xmm2,xmm1,32
-	call	PREPARE_ROUNDKEY_128
-	movdqa	96[rsi],xmm1
-
-	aeskeygenassist	xmm2,xmm1,64
-	call	PREPARE_ROUNDKEY_128
-	movdqa	112[rsi],xmm1
-	aeskeygenassist	xmm2,xmm1,80h
-	call	PREPARE_ROUNDKEY_128
-	movdqa	128[rsi],xmm1
-	aeskeygenassist	xmm2,xmm1,1bh
-	call	PREPARE_ROUNDKEY_128
-	movdqa	144[rsi],xmm1
-	aeskeygenassist	xmm2,xmm1,36h
-	call	PREPARE_ROUNDKEY_128
-	movdqa	160[rsi],xmm1
-	; restore non volatile rdi,rsi
-	mov rdi,rax
-	mov rsi,r11
-	ret
-
-PREPARE_ROUNDKEY_128:
-	pshufd	xmm2,xmm2,255
-	movdqa	xmm3,xmm1
-	pslldq	xmm3,4
-	pxor	xmm1,xmm3
-	pslldq	xmm3,4
-	pxor	xmm1,xmm3
-	pslldq	xmm3,4
-	pxor	xmm1,xmm3
-	pxor	xmm1,xmm2
-	ret
-AES_128_Key_Expansion ENDP
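PREPARE_ROUNDKEY_128 is the classic AESKEYGENASSIST fold: `pshufd xmm2,xmm2,255` broadcasts the SubWord/RotWord/rcon result from the top dword, and the three pslldq/pxor steps leave each 32-bit word holding the XOR of itself and all preceding words, which together is exactly one AES-128 schedule step. A sketch with intrinsics (helper names are illustrative; compile with -maes):

```c
#include <wmmintrin.h>  /* compile with -maes */

/* One AES-128 schedule step, mirroring PREPARE_ROUNDKEY_128: broadcast
 * the assist result's top dword, then fold the previous round key into
 * itself so word i becomes w0 ^ ... ^ wi before the assist is XORed in. */
static __m128i expand128_step(__m128i key, __m128i assist)
{
    assist = _mm_shuffle_epi32(assist, 0xff);          /* pshufd xmm2,xmm2,255 */
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));  /* the three */
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));  /* pslldq/pxor */
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));  /* folds */
    return _mm_xor_si128(key, assist);
}

/* The assist chain above uses rcon = 1,2,4,...,64,0x80,0x1b,0x36. */
void aes128_expand_first(const unsigned char *userkey, __m128i KS[11])
{
    KS[0] = _mm_loadu_si128((const __m128i *)userkey);
    KS[1] = expand128_step(KS[0], _mm_aeskeygenassist_si128(KS[0], 0x01));
    KS[2] = expand128_step(KS[1], _mm_aeskeygenassist_si128(KS[1], 0x02));
    /* ... and so on through rcon 0x36 for KS[10] */
}
```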
-
-;	/*
-;	void AES_192_Key_Expansion (const unsigned char *userkey,
-;	                            unsigned char *key)
-;	*/
-;	.globl	AES_192_Key_Expansion
-AES_192_Key_Expansion PROC
-;#	parameter	1:	rdi
-;#	parameter	2:	rsi
-
-; save rdi and rsi to rax and r11, restore before ret
-	mov rax,rdi
-	mov r11,rsi
-
-; convert to what we had for the AT&T convention
-    mov rdi,rcx
-	mov rsi,rdx
-
-; on microsoft xmm6-xmm15 are non volatile, let's save on stack and restore at end
-	sub rsp,8+1*16  ; 8 = align stack , 1 xmm6, 16 bytes each
-	movdqa [rsp+0], xmm6
-
-	movdqu  xmm1,[rdi]
-	movq	xmm3,qword ptr 16[rdi]
-	movdqa	[rsi],xmm1
-	movdqa	xmm5,xmm3
-
-	aeskeygenassist	xmm2,xmm3,1h
-	call	PREPARE_ROUNDKEY_192
-	shufpd	xmm5,xmm1,0
-	movdqa	16[rsi],xmm5
-	movdqa	xmm6,xmm1
-	shufpd	xmm6,xmm3,1
-	movdqa	32[rsi],xmm6
-
-	aeskeygenassist	xmm2,xmm3,2h
-	call	PREPARE_ROUNDKEY_192
-	movdqa	48[rsi],xmm1
-	movdqa	xmm5,xmm3
-
-	aeskeygenassist	xmm2,xmm3,4h
-	call	PREPARE_ROUNDKEY_192
-	shufpd	xmm5,xmm1,0
-	movdqa	64[rsi],xmm5
-	movdqa	xmm6,xmm1
-	shufpd	xmm6,xmm3,1
-	movdqa	80[rsi],xmm6
-
-	aeskeygenassist	xmm2,xmm3,8h
-	call	PREPARE_ROUNDKEY_192
-	movdqa	96[rsi],xmm1
-	movdqa	xmm5,xmm3
-
-	aeskeygenassist	xmm2,xmm3,10h
-	call	PREPARE_ROUNDKEY_192
-	shufpd	xmm5,xmm1,0
-	movdqa	112[rsi],xmm5
-	movdqa	xmm6,xmm1
-	shufpd	xmm6,xmm3,1
-	movdqa	128[rsi],xmm6
-
-	aeskeygenassist	xmm2,xmm3,20h
-	call	PREPARE_ROUNDKEY_192
-	movdqa	144[rsi],xmm1
-	movdqa	xmm5,xmm3
-
-	aeskeygenassist	xmm2,xmm3,40h
-	call	PREPARE_ROUNDKEY_192
-	shufpd	xmm5,xmm1,0
-	movdqa	160[rsi],xmm5
-	movdqa	xmm6,xmm1
-	shufpd	xmm6,xmm3,1
-	movdqa	176[rsi],xmm6
-
-	aeskeygenassist	xmm2,xmm3,80h
-	call	PREPARE_ROUNDKEY_192
-	movdqa	192[rsi],xmm1
-	movdqa	208[rsi],xmm3
-	; restore non volatile rdi,rsi
-	mov rdi,rax
-	mov rsi,r11
-; restore non volatile xmms from stack
-	movdqa xmm6, [rsp+0]
-	add rsp,8+1*16 ; 8 = align stack , 1 xmm6 16 bytes each
-	ret
-
-PREPARE_ROUNDKEY_192:
-	pshufd	xmm2,xmm2,55h
-	movdqu	xmm4,xmm1
-	pslldq	xmm4,4
-	pxor	xmm1,xmm4
-
-	pslldq	xmm4,4
-	pxor	xmm1,xmm4
-	pslldq	xmm4,4
-	pxor	xmm1,xmm4
-	pxor	xmm1,xmm2
-	pshufd	xmm2,xmm1,0ffh
-	movdqu	xmm4,xmm3
-	pslldq	xmm4,4
-	pxor	xmm3,xmm4
-	pxor	xmm3,xmm2
-	ret
-AES_192_Key_Expansion ENDP
-
-;	/*
-;	void AES_256_Key_Expansion (const unsigned char *userkey,
-;	                            unsigned char *key)
-;	*/
-;	.globl	AES_256_Key_Expansion
-AES_256_Key_Expansion PROC
-;#	parameter	1:	rdi
-;#	parameter	2:	rsi
-
-; save rdi and rsi to rax and r11, restore before ret
-	mov rax,rdi
-	mov r11,rsi
-
-; convert to what we had for the AT&T convention
-    mov rdi,rcx
-	mov rsi,rdx
-
-	movdqu  xmm1,[rdi]
-	movdqu	xmm3,16[rdi]
-	movdqa	[rsi],xmm1
-	movdqa	16[rsi],xmm3
-
-	aeskeygenassist	xmm2,xmm3,1h
-	call	MAKE_RK256_a
-	movdqa	32[rsi],xmm1
-	aeskeygenassist	xmm2,xmm1,0h
-	call	MAKE_RK256_b
-	movdqa	48[rsi],xmm3
-	aeskeygenassist	xmm2,xmm3,2h
-	call	MAKE_RK256_a
-	movdqa	64[rsi],xmm1
-	aeskeygenassist	xmm2,xmm1,0h
-	call	MAKE_RK256_b
-	movdqa	80[rsi],xmm3
-	aeskeygenassist	xmm2,xmm3,4h
-	call	MAKE_RK256_a
-	movdqa	96[rsi],xmm1
-	aeskeygenassist	xmm2,xmm1,0h
-	call	MAKE_RK256_b
-	movdqa	112[rsi],xmm3
-	aeskeygenassist	xmm2,xmm3,8h
-	call	MAKE_RK256_a
-	movdqa	128[rsi],xmm1
-	aeskeygenassist	xmm2,xmm1,0h
-	call	MAKE_RK256_b
-	movdqa	144[rsi],xmm3
-	aeskeygenassist	xmm2,xmm3,10h
-	call	MAKE_RK256_a
-	movdqa	160[rsi],xmm1
-	aeskeygenassist	xmm2,xmm1,0h
-	call	MAKE_RK256_b
-	movdqa	176[rsi],xmm3
-	aeskeygenassist	xmm2,xmm3,20h
-	call	MAKE_RK256_a
-	movdqa	192[rsi],xmm1
-
-	aeskeygenassist	xmm2,xmm1,0h
-	call	MAKE_RK256_b
-	movdqa	208[rsi],xmm3
-	aeskeygenassist	xmm2,xmm3,40h
-	call	MAKE_RK256_a
-	movdqa	224[rsi],xmm1
-
-	; restore non volatile rdi,rsi
-	mov rdi,rax
-	mov rsi,r11
-	ret
-AES_256_Key_Expansion ENDP
-
-MAKE_RK256_a:
-	pshufd	xmm2,xmm2,0ffh
-	movdqa	xmm4,xmm1
-	pslldq	xmm4,4
-	pxor	xmm1,xmm4
-	pslldq	xmm4,4
-	pxor	xmm1,xmm4
-	pslldq	xmm4,4
-	pxor	xmm1,xmm4
-	pxor	xmm1,xmm2
-	ret
-
-MAKE_RK256_b:
-	pshufd	xmm2,xmm2,0aah
-	movdqa	xmm4,xmm3
-	pslldq	xmm4,4
-	pxor	xmm3,xmm4
-	pslldq	xmm4,4
-	pxor	xmm3,xmm4
-	pslldq	xmm4,4
-	pxor	xmm3,xmm4
-	pxor	xmm3,xmm2
-	ret
-
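The two helpers split the AES-256 schedule into its alternating halves: MAKE_RK256_a is the rcon step (broadcast lane 3 via 0ffh), while the odd round keys need SubWord() with no rotation and no rcon, so the caller issues `aeskeygenassist xmm2,xmm1,0h` and MAKE_RK256_b broadcasts dword lane 2 (0aah), where the un-rotated SubWord result lands. A sketch of the b step (illustrative helper name; compile with -maes):

```c
#include <wmmintrin.h>  /* compile with -maes */

/* The AES-256 "b" half: the assist is called with rcon 0 and dword
 * lane 2 -- the un-rotated SubWord of the top word -- is broadcast,
 * matching pshufd xmm2,xmm2,0aah above. */
static __m128i expand256_b(__m128i key_odd, __m128i key_even)
{
    __m128i assist = _mm_aeskeygenassist_si128(key_even, 0x00);
    assist = _mm_shuffle_epi32(assist, 0xaa);
    key_odd = _mm_xor_si128(key_odd, _mm_slli_si128(key_odd, 4));
    key_odd = _mm_xor_si128(key_odd, _mm_slli_si128(key_odd, 4));
    key_odd = _mm_xor_si128(key_odd, _mm_slli_si128(key_odd, 4));
    return _mm_xor_si128(key_odd, assist);
}
```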
-
-IF fips_version GE 2
-  fipsAh ENDS
-ELSE
-  _text ENDS
-ENDIF
-
-END

+ 0 - 15854
lib/wolfssl/wolfcrypt/src/aes_gcm_asm.S

@@ -1,15854 +0,0 @@
-/* aes_gcm_asm
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-#ifdef WOLFSSL_USER_SETTINGS
-#ifdef WOLFSSL_USER_SETTINGS_ASM
-/*
- * user_settings_asm.h is a file generated by the script user_settings_asm.sh.
- * The script takes in a user_settings.h and produces user_settings_asm.h, which
- * is a stripped down version of user_settings.h containing only preprocessor
- * directives. This makes the header safe to include in assembly (.S) files.
- */
-#include "user_settings_asm.h"
-#else
-/*
- * Note: if user_settings.h contains any C code (e.g. a typedef or function
- * prototype), including it here in an assembly (.S) file will cause an
- * assembler failure. See user_settings_asm.h above.
- */
-#include "user_settings.h"
-#endif /* WOLFSSL_USER_SETTINGS_ASM */
-#endif /* WOLFSSL_USER_SETTINGS */
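A hypothetical before/after pair makes that comment concrete; HAVE_AESGCM and word32 are real wolfSSL names, but the file contents here are purely illustrative:

```c
/* user_settings.h -- fine in C, fatal when included from a .S file: */
#define HAVE_AESGCM
typedef unsigned int word32;    /* C code: the assembler chokes here */

/* user_settings_asm.h as generated by user_settings_asm.sh -- the
 * directive survives, the typedef is stripped: */
#define HAVE_AESGCM
```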
-
-#ifndef HAVE_INTEL_AVX1
-#define HAVE_INTEL_AVX1
-#endif /* HAVE_INTEL_AVX1 */
-#ifndef NO_AVX2_SUPPORT
-#define HAVE_INTEL_AVX2
-#endif /* NO_AVX2_SUPPORT */
-
-#ifdef WOLFSSL_X86_64_BUILD
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_aes_gcm_one:
-.quad	0x0, 0x1
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_aes_gcm_two:
-.quad	0x0, 0x2
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_aes_gcm_three:
-.quad	0x0, 0x3
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_aes_gcm_four:
-.quad	0x0, 0x4
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_aes_gcm_five:
-.quad	0x0, 0x5
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_aes_gcm_six:
-.quad	0x0, 0x6
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_aes_gcm_seven:
-.quad	0x0, 0x7
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_aes_gcm_eight:
-.quad	0x0, 0x8
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_aes_gcm_bswap_epi64:
-.quad	0x1020304050607, 0x8090a0b0c0d0e0f
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_aes_gcm_bswap_mask:
-.quad	0x8090a0b0c0d0e0f, 0x1020304050607
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_aes_gcm_mod2_128:
-.quad	0x1, 0xc200000000000000
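These constants come in two families: L_aes_gcm_one .. L_aes_gcm_eight carry a counter increment in the high quadword, the two bswap masks reverse bytes within each 64-bit half or across the whole block, and L_aes_gcm_mod2_128 packs the reduction constants for GHASH's polynomial x^128 + x^7 + x^2 + x + 1. The counter block is kept byte-reflected precisely so a plain PADDD can bump its 32-bit counter field; a sketch of the paddd/pshufb pairing used throughout the function (SSSE3 intrinsics; compile with -mssse3):

```c
#include <tmmintrin.h>  /* SSSE3 _mm_shuffle_epi8; compile with -mssse3 */

/* L_aes_gcm_bswap_epi64 as a byte mask: reverse bytes in each qword. */
static const unsigned char bswap_epi64[16] = {
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
};

/* Derive counter+n from the byte-reflected counter block: after the
 * per-qword reversal, GCM's big-endian 32-bit counter field sits in
 * dword lane 2 as an ordinary little-endian integer, so PADDD with
 * .quad 0x0, n (L_aes_gcm_one .. L_aes_gcm_eight) increments it. */
__m128i ctr_add(__m128i reflected_ctr, int n)
{
    __m128i inc  = _mm_set_epi64x(n, 0);    /* n in the high quadword */
    __m128i mask = _mm_loadu_si128((const __m128i *)bswap_epi64);
    __m128i out  = _mm_add_epi32(reflected_ctr, inc);
    return _mm_shuffle_epi8(out, mask);     /* back to memory byte order */
}
```

PADDD's per-lane wraparound also matches GCM's 32-bit counter semantics, so no carry handling is needed.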
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_encrypt
-.type	AES_GCM_encrypt,@function
-.align	16
-AES_GCM_encrypt:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_encrypt
-.p2align	4
-_AES_GCM_encrypt:
-#endif /* __APPLE__ */
-        pushq	%r13
-        pushq	%r12
-        pushq	%rbx
-        pushq	%r14
-        pushq	%r15
-        movq	%rdx, %r12
-        movq	%rcx, %rax
-        movl	48(%rsp), %r11d
-        movl	56(%rsp), %ebx
-        movl	64(%rsp), %r14d
-        movq	72(%rsp), %r15
-        movl	80(%rsp), %r10d
-        subq	$0xa0, %rsp
-        pxor	%xmm4, %xmm4
-        pxor	%xmm6, %xmm6
-        cmpl	$12, %ebx
-        movl	%ebx, %edx
-        jne	L_AES_GCM_encrypt_iv_not_12
-        # Calculate values when IV is 12 bytes
-        # Set counter based on IV
-        movl	$0x1000000, %ecx
-        pinsrq	$0x00, (%rax), %xmm4
-        pinsrd	$2, 8(%rax), %xmm4
-        pinsrd	$3, %ecx, %xmm4
-        # H = Encrypt X(=0) and T = Encrypt counter
-        movdqa	%xmm4, %xmm1
-        movdqa	(%r15), %xmm5
-        pxor	%xmm5, %xmm1
-        movdqa	16(%r15), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	32(%r15), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	48(%r15), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	64(%r15), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	80(%r15), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	96(%r15), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	112(%r15), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	128(%r15), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	144(%r15), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        cmpl	$11, %r10d
-        movdqa	160(%r15), %xmm7
-        jl	L_AES_GCM_encrypt_calc_iv_12_last
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	176(%r15), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        cmpl	$13, %r10d
-        movdqa	192(%r15), %xmm7
-        jl	L_AES_GCM_encrypt_calc_iv_12_last
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	208(%r15), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	224(%r15), %xmm7
-L_AES_GCM_encrypt_calc_iv_12_last:
-        aesenclast	%xmm7, %xmm5
-        aesenclast	%xmm7, %xmm1
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm5
-        movdqu	%xmm1, 144(%rsp)
-        jmp	L_AES_GCM_encrypt_iv_done
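Everything above is SP 800-38D's 96-bit-IV shortcut: when ibytes == 12 the pre-counter block is simply J0 = IV || 0x00000001 (the 0x1000000 in ecx is that trailing big-endian 1 read as a little-endian dword), and one pass over the round keys encrypts both the zero block, giving the hash key H (byte-reflected into xmm5), and J0, giving the tag mask stashed at 144(%rsp). The construction in C:

```c
#include <string.h>

/* GCM's initial counter block for the 96-bit-IV fast path:
 * J0 = IV || 0x00000001 (SP 800-38D). The 0x1000000 pinsrd'd above
 * is these four bytes read as a little-endian dword. */
void gcm_j0_from_iv12(const unsigned char iv[12], unsigned char j0[16])
{
    memcpy(j0, iv, 12);
    j0[12] = 0x00;
    j0[13] = 0x00;
    j0[14] = 0x00;
    j0[15] = 0x01;
}
```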
-L_AES_GCM_encrypt_iv_not_12:
-        # Calculate values when IV is not 12 bytes
-        # H = Encrypt X(=0)
-        movdqa	(%r15), %xmm5
-        aesenc	16(%r15), %xmm5
-        aesenc	32(%r15), %xmm5
-        aesenc	48(%r15), %xmm5
-        aesenc	64(%r15), %xmm5
-        aesenc	80(%r15), %xmm5
-        aesenc	96(%r15), %xmm5
-        aesenc	112(%r15), %xmm5
-        aesenc	128(%r15), %xmm5
-        aesenc	144(%r15), %xmm5
-        cmpl	$11, %r10d
-        movdqa	160(%r15), %xmm9
-        jl	L_AES_GCM_encrypt_calc_iv_1_aesenc_avx_last
-        aesenc	%xmm9, %xmm5
-        aesenc	176(%r15), %xmm5
-        cmpl	$13, %r10d
-        movdqa	192(%r15), %xmm9
-        jl	L_AES_GCM_encrypt_calc_iv_1_aesenc_avx_last
-        aesenc	%xmm9, %xmm5
-        aesenc	208(%r15), %xmm5
-        movdqa	224(%r15), %xmm9
-L_AES_GCM_encrypt_calc_iv_1_aesenc_avx_last:
-        aesenclast	%xmm9, %xmm5
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm5
-        # Calc counter
-        # Initialization vector
-        cmpl	$0x00, %edx
-        movq	$0x00, %rcx
-        je	L_AES_GCM_encrypt_calc_iv_done
-        cmpl	$16, %edx
-        jl	L_AES_GCM_encrypt_calc_iv_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_encrypt_calc_iv_16_loop:
-        movdqu	(%rax,%rcx,1), %xmm8
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm8
-        pxor	%xmm8, %xmm4
-        pshufd	$0x4e, %xmm4, %xmm1
-        pshufd	$0x4e, %xmm5, %xmm2
-        movdqa	%xmm5, %xmm3
-        movdqa	%xmm5, %xmm0
-        pclmulqdq	$0x11, %xmm4, %xmm3
-        pclmulqdq	$0x00, %xmm4, %xmm0
-        pxor	%xmm4, %xmm1
-        pxor	%xmm5, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm0, %xmm7
-        movdqa	%xmm3, %xmm4
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm7
-        pxor	%xmm1, %xmm4
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm4, %xmm1
-        psrld	$31, %xmm0
-        psrld	$31, %xmm1
-        pslld	$0x01, %xmm7
-        pslld	$0x01, %xmm4
-        movdqa	%xmm0, %xmm2
-        pslldq	$4, %xmm0
-        psrldq	$12, %xmm2
-        pslldq	$4, %xmm1
-        por	%xmm2, %xmm4
-        por	%xmm0, %xmm7
-        por	%xmm1, %xmm4
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm7, %xmm1
-        movdqa	%xmm7, %xmm2
-        pslld	$31, %xmm0
-        pslld	$30, %xmm1
-        pslld	$25, %xmm2
-        pxor	%xmm1, %xmm0
-        pxor	%xmm2, %xmm0
-        movdqa	%xmm0, %xmm1
-        psrldq	$4, %xmm1
-        pslldq	$12, %xmm0
-        pxor	%xmm0, %xmm7
-        movdqa	%xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        psrld	$0x01, %xmm2
-        psrld	$2, %xmm3
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm2
-        pxor	%xmm0, %xmm2
-        pxor	%xmm1, %xmm2
-        pxor	%xmm7, %xmm2
-        pxor	%xmm2, %xmm4
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_encrypt_calc_iv_16_loop
-        movl	%ebx, %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_encrypt_calc_iv_done
-L_AES_GCM_encrypt_calc_iv_lt16:
-        subq	$16, %rsp
-        pxor	%xmm8, %xmm8
-        xorl	%ebx, %ebx
-        movdqu	%xmm8, (%rsp)
-L_AES_GCM_encrypt_calc_iv_loop:
-        movzbl	(%rax,%rcx,1), %r13d
-        movb	%r13b, (%rsp,%rbx,1)
-        incl	%ecx
-        incl	%ebx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_encrypt_calc_iv_loop
-        movdqu	(%rsp), %xmm8
-        addq	$16, %rsp
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm8
-        pxor	%xmm8, %xmm4
-        pshufd	$0x4e, %xmm4, %xmm1
-        pshufd	$0x4e, %xmm5, %xmm2
-        movdqa	%xmm5, %xmm3
-        movdqa	%xmm5, %xmm0
-        pclmulqdq	$0x11, %xmm4, %xmm3
-        pclmulqdq	$0x00, %xmm4, %xmm0
-        pxor	%xmm4, %xmm1
-        pxor	%xmm5, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm0, %xmm7
-        movdqa	%xmm3, %xmm4
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm7
-        pxor	%xmm1, %xmm4
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm4, %xmm1
-        psrld	$31, %xmm0
-        psrld	$31, %xmm1
-        pslld	$0x01, %xmm7
-        pslld	$0x01, %xmm4
-        movdqa	%xmm0, %xmm2
-        pslldq	$4, %xmm0
-        psrldq	$12, %xmm2
-        pslldq	$4, %xmm1
-        por	%xmm2, %xmm4
-        por	%xmm0, %xmm7
-        por	%xmm1, %xmm4
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm7, %xmm1
-        movdqa	%xmm7, %xmm2
-        pslld	$31, %xmm0
-        pslld	$30, %xmm1
-        pslld	$25, %xmm2
-        pxor	%xmm1, %xmm0
-        pxor	%xmm2, %xmm0
-        movdqa	%xmm0, %xmm1
-        psrldq	$4, %xmm1
-        pslldq	$12, %xmm0
-        pxor	%xmm0, %xmm7
-        movdqa	%xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        psrld	$0x01, %xmm2
-        psrld	$2, %xmm3
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm2
-        pxor	%xmm0, %xmm2
-        pxor	%xmm1, %xmm2
-        pxor	%xmm7, %xmm2
-        pxor	%xmm2, %xmm4
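Each long pshufd/pclmulqdq/pxor run above, repeated throughout this file, is one GHASH multiply: a Karatsuba carry-less 128x128-bit multiply (three PCLMULQDQs instead of four), a one-bit left shift of the 256-bit product (the psrld $31 / pslld $0x01 group, owed to GHASH's reflected bit order), and a shift-based reduction modulo x^128 + x^7 + x^2 + x + 1 (the pslld 31/30/25 and psrld 1/2/7 group). The multiply half as a sketch (compile with -mpclmul):

```c
#include <wmmintrin.h>  /* PCLMULQDQ; compile with -mpclmul */

/* Karatsuba 128x128 -> 256-bit carry-less multiply: the pattern behind
 * each pshufd/pclmulqdq/pxor run. hi:lo is the raw product; GHASH then
 * shifts it left one bit and reduces mod x^128 + x^7 + x^2 + x + 1. */
void clmul_karatsuba(__m128i x, __m128i h, __m128i *lo, __m128i *hi)
{
    __m128i xs = _mm_xor_si128(x, _mm_shuffle_epi32(x, 0x4e)); /* X_hi^X_lo */
    __m128i hs = _mm_xor_si128(h, _mm_shuffle_epi32(h, 0x4e)); /* H_hi^H_lo */
    __m128i t0 = _mm_clmulepi64_si128(x, h, 0x00);   /* X_lo * H_lo */
    __m128i t2 = _mm_clmulepi64_si128(x, h, 0x11);   /* X_hi * H_hi */
    __m128i t1 = _mm_clmulepi64_si128(xs, hs, 0x00); /* (X_lo^X_hi)(H_lo^H_hi) */
    t1  = _mm_xor_si128(t1, _mm_xor_si128(t0, t2));  /* middle 128 bits */
    *lo = _mm_xor_si128(t0, _mm_slli_si128(t1, 8));
    *hi = _mm_xor_si128(t2, _mm_srli_si128(t1, 8));
}
```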
-L_AES_GCM_encrypt_calc_iv_done:
-        # T = Encrypt counter
-        pxor	%xmm0, %xmm0
-        shll	$3, %edx
-        pinsrq	$0x00, %rdx, %xmm0
-        pxor	%xmm0, %xmm4
-        pshufd	$0x4e, %xmm4, %xmm1
-        pshufd	$0x4e, %xmm5, %xmm2
-        movdqa	%xmm5, %xmm3
-        movdqa	%xmm5, %xmm0
-        pclmulqdq	$0x11, %xmm4, %xmm3
-        pclmulqdq	$0x00, %xmm4, %xmm0
-        pxor	%xmm4, %xmm1
-        pxor	%xmm5, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm0, %xmm7
-        movdqa	%xmm3, %xmm4
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm7
-        pxor	%xmm1, %xmm4
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm4, %xmm1
-        psrld	$31, %xmm0
-        psrld	$31, %xmm1
-        pslld	$0x01, %xmm7
-        pslld	$0x01, %xmm4
-        movdqa	%xmm0, %xmm2
-        pslldq	$4, %xmm0
-        psrldq	$12, %xmm2
-        pslldq	$4, %xmm1
-        por	%xmm2, %xmm4
-        por	%xmm0, %xmm7
-        por	%xmm1, %xmm4
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm7, %xmm1
-        movdqa	%xmm7, %xmm2
-        pslld	$31, %xmm0
-        pslld	$30, %xmm1
-        pslld	$25, %xmm2
-        pxor	%xmm1, %xmm0
-        pxor	%xmm2, %xmm0
-        movdqa	%xmm0, %xmm1
-        psrldq	$4, %xmm1
-        pslldq	$12, %xmm0
-        pxor	%xmm0, %xmm7
-        movdqa	%xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        psrld	$0x01, %xmm2
-        psrld	$2, %xmm3
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm2
-        pxor	%xmm0, %xmm2
-        pxor	%xmm1, %xmm2
-        pxor	%xmm7, %xmm2
-        pxor	%xmm2, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm4
-        #   Encrypt counter
-        movdqa	(%r15), %xmm8
-        pxor	%xmm4, %xmm8
-        aesenc	16(%r15), %xmm8
-        aesenc	32(%r15), %xmm8
-        aesenc	48(%r15), %xmm8
-        aesenc	64(%r15), %xmm8
-        aesenc	80(%r15), %xmm8
-        aesenc	96(%r15), %xmm8
-        aesenc	112(%r15), %xmm8
-        aesenc	128(%r15), %xmm8
-        aesenc	144(%r15), %xmm8
-        cmpl	$11, %r10d
-        movdqa	160(%r15), %xmm9
-        jl	L_AES_GCM_encrypt_calc_iv_2_aesenc_avx_last
-        aesenc	%xmm9, %xmm8
-        aesenc	176(%r15), %xmm8
-        cmpl	$13, %r10d
-        movdqa	192(%r15), %xmm9
-        jl	L_AES_GCM_encrypt_calc_iv_2_aesenc_avx_last
-        aesenc	%xmm9, %xmm8
-        aesenc	208(%r15), %xmm8
-        movdqa	224(%r15), %xmm9
-L_AES_GCM_encrypt_calc_iv_2_aesenc_avx_last:
-        aesenclast	%xmm9, %xmm8
-        movdqu	%xmm8, 144(%rsp)
-L_AES_GCM_encrypt_iv_done:
-        # Additional authentication data
-        movl	%r11d, %edx
-        cmpl	$0x00, %edx
-        je	L_AES_GCM_encrypt_calc_aad_done
-        xorl	%ecx, %ecx
-        cmpl	$16, %edx
-        jl	L_AES_GCM_encrypt_calc_aad_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_encrypt_calc_aad_16_loop:
-        movdqu	(%r12,%rcx,1), %xmm8
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm8
-        pxor	%xmm8, %xmm6
-        pshufd	$0x4e, %xmm6, %xmm1
-        pshufd	$0x4e, %xmm5, %xmm2
-        movdqa	%xmm5, %xmm3
-        movdqa	%xmm5, %xmm0
-        pclmulqdq	$0x11, %xmm6, %xmm3
-        pclmulqdq	$0x00, %xmm6, %xmm0
-        pxor	%xmm6, %xmm1
-        pxor	%xmm5, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm0, %xmm7
-        movdqa	%xmm3, %xmm6
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm7
-        pxor	%xmm1, %xmm6
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm6, %xmm1
-        psrld	$31, %xmm0
-        psrld	$31, %xmm1
-        pslld	$0x01, %xmm7
-        pslld	$0x01, %xmm6
-        movdqa	%xmm0, %xmm2
-        pslldq	$4, %xmm0
-        psrldq	$12, %xmm2
-        pslldq	$4, %xmm1
-        por	%xmm2, %xmm6
-        por	%xmm0, %xmm7
-        por	%xmm1, %xmm6
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm7, %xmm1
-        movdqa	%xmm7, %xmm2
-        pslld	$31, %xmm0
-        pslld	$30, %xmm1
-        pslld	$25, %xmm2
-        pxor	%xmm1, %xmm0
-        pxor	%xmm2, %xmm0
-        movdqa	%xmm0, %xmm1
-        psrldq	$4, %xmm1
-        pslldq	$12, %xmm0
-        pxor	%xmm0, %xmm7
-        movdqa	%xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        psrld	$0x01, %xmm2
-        psrld	$2, %xmm3
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm2
-        pxor	%xmm0, %xmm2
-        pxor	%xmm1, %xmm2
-        pxor	%xmm7, %xmm2
-        pxor	%xmm2, %xmm6
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_encrypt_calc_aad_16_loop
-        movl	%r11d, %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_encrypt_calc_aad_done
-L_AES_GCM_encrypt_calc_aad_lt16:
-        subq	$16, %rsp
-        pxor	%xmm8, %xmm8
-        xorl	%ebx, %ebx
-        movdqu	%xmm8, (%rsp)
-L_AES_GCM_encrypt_calc_aad_loop:
-        movzbl	(%r12,%rcx,1), %r13d
-        movb	%r13b, (%rsp,%rbx,1)
-        incl	%ecx
-        incl	%ebx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_encrypt_calc_aad_loop
-        movdqu	(%rsp), %xmm8
-        addq	$16, %rsp
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm8
-        pxor	%xmm8, %xmm6
-        pshufd	$0x4e, %xmm6, %xmm1
-        pshufd	$0x4e, %xmm5, %xmm2
-        movdqa	%xmm5, %xmm3
-        movdqa	%xmm5, %xmm0
-        pclmulqdq	$0x11, %xmm6, %xmm3
-        pclmulqdq	$0x00, %xmm6, %xmm0
-        pxor	%xmm6, %xmm1
-        pxor	%xmm5, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm0, %xmm7
-        movdqa	%xmm3, %xmm6
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm7
-        pxor	%xmm1, %xmm6
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm6, %xmm1
-        psrld	$31, %xmm0
-        psrld	$31, %xmm1
-        pslld	$0x01, %xmm7
-        pslld	$0x01, %xmm6
-        movdqa	%xmm0, %xmm2
-        pslldq	$4, %xmm0
-        psrldq	$12, %xmm2
-        pslldq	$4, %xmm1
-        por	%xmm2, %xmm6
-        por	%xmm0, %xmm7
-        por	%xmm1, %xmm6
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm7, %xmm1
-        movdqa	%xmm7, %xmm2
-        pslld	$31, %xmm0
-        pslld	$30, %xmm1
-        pslld	$25, %xmm2
-        pxor	%xmm1, %xmm0
-        pxor	%xmm2, %xmm0
-        movdqa	%xmm0, %xmm1
-        psrldq	$4, %xmm1
-        pslldq	$12, %xmm0
-        pxor	%xmm0, %xmm7
-        movdqa	%xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        psrld	$0x01, %xmm2
-        psrld	$2, %xmm3
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm2
-        pxor	%xmm0, %xmm2
-        pxor	%xmm1, %xmm2
-        pxor	%xmm7, %xmm2
-        pxor	%xmm2, %xmm6
-L_AES_GCM_encrypt_calc_aad_done:
-        # Calculate counter and H
-        pshufb	L_aes_gcm_bswap_epi64(%rip), %xmm4
-        movdqa	%xmm5, %xmm9
-        paddd	L_aes_gcm_one(%rip), %xmm4
-        movdqa	%xmm5, %xmm8
-        movdqu	%xmm4, 128(%rsp)
-        psrlq	$63, %xmm9
-        psllq	$0x01, %xmm8
-        pslldq	$8, %xmm9
-        por	%xmm9, %xmm8
-        pshufd	$0xff, %xmm5, %xmm5
-        psrad	$31, %xmm5
-        pand	L_aes_gcm_mod2_128(%rip), %xmm5
-        pxor	%xmm8, %xmm5
-        xorq	%rbx, %rbx
-        cmpl	$0x80, %r9d
-        movl	%r9d, %r13d
-        jl	L_AES_GCM_encrypt_done_128
-        andl	$0xffffff80, %r13d
-        movdqa	%xmm6, %xmm2
-        # H ^ 1
-        movdqu	%xmm5, (%rsp)
-        # H ^ 2
-        pshufd	$0x4e, %xmm5, %xmm9
-        pshufd	$0x4e, %xmm5, %xmm10
-        movdqa	%xmm5, %xmm11
-        movdqa	%xmm5, %xmm8
-        pclmulqdq	$0x11, %xmm5, %xmm11
-        pclmulqdq	$0x00, %xmm5, %xmm8
-        pxor	%xmm5, %xmm9
-        pxor	%xmm5, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm0
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm0
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm0
-        movdqu	%xmm0, 16(%rsp)
-        # H ^ 3
-        pshufd	$0x4e, %xmm5, %xmm9
-        pshufd	$0x4e, %xmm0, %xmm10
-        movdqa	%xmm0, %xmm11
-        movdqa	%xmm0, %xmm8
-        pclmulqdq	$0x11, %xmm5, %xmm11
-        pclmulqdq	$0x00, %xmm5, %xmm8
-        pxor	%xmm5, %xmm9
-        pxor	%xmm0, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm1
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm1
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm1
-        movdqu	%xmm1, 32(%rsp)
-        # H ^ 4
-        pshufd	$0x4e, %xmm0, %xmm9
-        pshufd	$0x4e, %xmm0, %xmm10
-        movdqa	%xmm0, %xmm11
-        movdqa	%xmm0, %xmm8
-        pclmulqdq	$0x11, %xmm0, %xmm11
-        pclmulqdq	$0x00, %xmm0, %xmm8
-        pxor	%xmm0, %xmm9
-        pxor	%xmm0, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm3
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm3
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm3
-        movdqu	%xmm3, 48(%rsp)
-        # H ^ 5
-        pshufd	$0x4e, %xmm0, %xmm9
-        pshufd	$0x4e, %xmm1, %xmm10
-        movdqa	%xmm1, %xmm11
-        movdqa	%xmm1, %xmm8
-        pclmulqdq	$0x11, %xmm0, %xmm11
-        pclmulqdq	$0x00, %xmm0, %xmm8
-        pxor	%xmm0, %xmm9
-        pxor	%xmm1, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm7
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm7
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm7
-        movdqu	%xmm7, 64(%rsp)
-        # H ^ 6
-        pshufd	$0x4e, %xmm1, %xmm9
-        pshufd	$0x4e, %xmm1, %xmm10
-        movdqa	%xmm1, %xmm11
-        movdqa	%xmm1, %xmm8
-        pclmulqdq	$0x11, %xmm1, %xmm11
-        pclmulqdq	$0x00, %xmm1, %xmm8
-        pxor	%xmm1, %xmm9
-        pxor	%xmm1, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm7
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm7
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm7
-        movdqu	%xmm7, 80(%rsp)
-        # H ^ 7
-        pshufd	$0x4e, %xmm1, %xmm9
-        pshufd	$0x4e, %xmm3, %xmm10
-        movdqa	%xmm3, %xmm11
-        movdqa	%xmm3, %xmm8
-        pclmulqdq	$0x11, %xmm1, %xmm11
-        pclmulqdq	$0x00, %xmm1, %xmm8
-        pxor	%xmm1, %xmm9
-        pxor	%xmm3, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm7
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm7
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm7
-        movdqu	%xmm7, 96(%rsp)
-        # H ^ 8
-        pshufd	$0x4e, %xmm3, %xmm9
-        pshufd	$0x4e, %xmm3, %xmm10
-        movdqa	%xmm3, %xmm11
-        movdqa	%xmm3, %xmm8
-        pclmulqdq	$0x11, %xmm3, %xmm11
-        pclmulqdq	$0x00, %xmm3, %xmm8
-        pxor	%xmm3, %xmm9
-        pxor	%xmm3, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm7
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm7
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm7
-        movdqu	%xmm7, 112(%rsp)
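The table just written to (%rsp) through 112(%rsp) holds H^1 through H^8. With it, the 128-byte main loop below folds eight ciphertext blocks into the GHASH state X with eight independent multiplies, XOR-summed and reduced once, instead of eight serial multiply-reduce steps:

$$X' = (X \oplus C_1)\,H^{8} \oplus C_2\,H^{7} \oplus \cdots \oplus C_7\,H^{2} \oplus C_8\,H$$

with all products carry-less in GF(2^128); that is why the first of the eight blocks is multiplied by H^8, fetched from 112(%rsp) in the loop below.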
-        # First 128 bytes of input
-        movdqu	128(%rsp), %xmm8
-        movdqa	L_aes_gcm_bswap_epi64(%rip), %xmm1
-        movdqa	%xmm8, %xmm0
-        pshufb	%xmm1, %xmm8
-        movdqa	%xmm0, %xmm9
-        paddd	L_aes_gcm_one(%rip), %xmm9
-        pshufb	%xmm1, %xmm9
-        movdqa	%xmm0, %xmm10
-        paddd	L_aes_gcm_two(%rip), %xmm10
-        pshufb	%xmm1, %xmm10
-        movdqa	%xmm0, %xmm11
-        paddd	L_aes_gcm_three(%rip), %xmm11
-        pshufb	%xmm1, %xmm11
-        movdqa	%xmm0, %xmm12
-        paddd	L_aes_gcm_four(%rip), %xmm12
-        pshufb	%xmm1, %xmm12
-        movdqa	%xmm0, %xmm13
-        paddd	L_aes_gcm_five(%rip), %xmm13
-        pshufb	%xmm1, %xmm13
-        movdqa	%xmm0, %xmm14
-        paddd	L_aes_gcm_six(%rip), %xmm14
-        pshufb	%xmm1, %xmm14
-        movdqa	%xmm0, %xmm15
-        paddd	L_aes_gcm_seven(%rip), %xmm15
-        pshufb	%xmm1, %xmm15
-        paddd	L_aes_gcm_eight(%rip), %xmm0
-        movdqa	(%r15), %xmm7
-        movdqu	%xmm0, 128(%rsp)
-        pxor	%xmm7, %xmm8
-        pxor	%xmm7, %xmm9
-        pxor	%xmm7, %xmm10
-        pxor	%xmm7, %xmm11
-        pxor	%xmm7, %xmm12
-        pxor	%xmm7, %xmm13
-        pxor	%xmm7, %xmm14
-        pxor	%xmm7, %xmm15
-        movdqa	16(%r15), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	32(%r15), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	48(%r15), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	64(%r15), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	80(%r15), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	96(%r15), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	112(%r15), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	128(%r15), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	144(%r15), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        cmpl	$11, %r10d
-        movdqa	160(%r15), %xmm7
-        jl	L_AES_GCM_encrypt_enc_done
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	176(%r15), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        cmpl	$13, %r10d
-        movdqa	192(%r15), %xmm7
-        jl	L_AES_GCM_encrypt_enc_done
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	208(%r15), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	224(%r15), %xmm7
-L_AES_GCM_encrypt_enc_done:
-        aesenclast	%xmm7, %xmm8
-        aesenclast	%xmm7, %xmm9
-        movdqu	(%rdi), %xmm0
-        movdqu	16(%rdi), %xmm1
-        pxor	%xmm0, %xmm8
-        pxor	%xmm1, %xmm9
-        movdqu	%xmm8, (%rsi)
-        movdqu	%xmm9, 16(%rsi)
-        aesenclast	%xmm7, %xmm10
-        aesenclast	%xmm7, %xmm11
-        movdqu	32(%rdi), %xmm0
-        movdqu	48(%rdi), %xmm1
-        pxor	%xmm0, %xmm10
-        pxor	%xmm1, %xmm11
-        movdqu	%xmm10, 32(%rsi)
-        movdqu	%xmm11, 48(%rsi)
-        aesenclast	%xmm7, %xmm12
-        aesenclast	%xmm7, %xmm13
-        movdqu	64(%rdi), %xmm0
-        movdqu	80(%rdi), %xmm1
-        pxor	%xmm0, %xmm12
-        pxor	%xmm1, %xmm13
-        movdqu	%xmm12, 64(%rsi)
-        movdqu	%xmm13, 80(%rsi)
-        aesenclast	%xmm7, %xmm14
-        aesenclast	%xmm7, %xmm15
-        movdqu	96(%rdi), %xmm0
-        movdqu	112(%rdi), %xmm1
-        pxor	%xmm0, %xmm14
-        pxor	%xmm1, %xmm15
-        movdqu	%xmm14, 96(%rsi)
-        movdqu	%xmm15, 112(%rsi)
-        cmpl	$0x80, %r13d
-        movl	$0x80, %ebx
-        jle	L_AES_GCM_encrypt_end_128
-        # More than 128 bytes of input; process further 128-byte chunks
-L_AES_GCM_encrypt_ghash_128:
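-        # Interleave AES-CTR keystream generation for eight counter blocks
-        # with GHASH of the previous eight ciphertext blocks (read back from
-        # the output buffer at negative offsets from %rdx), using the key
-        # powers H^8..H^1 stored at 112(%rsp) down to (%rsp).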
-        leaq	(%rdi,%rbx,1), %rcx
-        leaq	(%rsi,%rbx,1), %rdx
-        movdqu	128(%rsp), %xmm8
-        movdqa	L_aes_gcm_bswap_epi64(%rip), %xmm1
-        movdqa	%xmm8, %xmm0
-        pshufb	%xmm1, %xmm8
-        movdqa	%xmm0, %xmm9
-        paddd	L_aes_gcm_one(%rip), %xmm9
-        pshufb	%xmm1, %xmm9
-        movdqa	%xmm0, %xmm10
-        paddd	L_aes_gcm_two(%rip), %xmm10
-        pshufb	%xmm1, %xmm10
-        movdqa	%xmm0, %xmm11
-        paddd	L_aes_gcm_three(%rip), %xmm11
-        pshufb	%xmm1, %xmm11
-        movdqa	%xmm0, %xmm12
-        paddd	L_aes_gcm_four(%rip), %xmm12
-        pshufb	%xmm1, %xmm12
-        movdqa	%xmm0, %xmm13
-        paddd	L_aes_gcm_five(%rip), %xmm13
-        pshufb	%xmm1, %xmm13
-        movdqa	%xmm0, %xmm14
-        paddd	L_aes_gcm_six(%rip), %xmm14
-        pshufb	%xmm1, %xmm14
-        movdqa	%xmm0, %xmm15
-        paddd	L_aes_gcm_seven(%rip), %xmm15
-        pshufb	%xmm1, %xmm15
-        paddd	L_aes_gcm_eight(%rip), %xmm0
-        movdqa	(%r15), %xmm7
-        movdqu	%xmm0, 128(%rsp)
-        pxor	%xmm7, %xmm8
-        pxor	%xmm7, %xmm9
-        pxor	%xmm7, %xmm10
-        pxor	%xmm7, %xmm11
-        pxor	%xmm7, %xmm12
-        pxor	%xmm7, %xmm13
-        pxor	%xmm7, %xmm14
-        pxor	%xmm7, %xmm15
-        movdqu	112(%rsp), %xmm7
-        movdqu	-128(%rdx), %xmm0
-        aesenc	16(%r15), %xmm8
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        pxor	%xmm2, %xmm0
-        pshufd	$0x4e, %xmm7, %xmm1
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm7, %xmm1
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm3
-        pclmulqdq	$0x11, %xmm7, %xmm3
-        aesenc	16(%r15), %xmm9
-        aesenc	16(%r15), %xmm10
-        movdqa	%xmm0, %xmm2
-        pclmulqdq	$0x00, %xmm7, %xmm2
-        aesenc	16(%r15), %xmm11
-        aesenc	16(%r15), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm1
-        aesenc	16(%r15), %xmm13
-        aesenc	16(%r15), %xmm14
-        aesenc	16(%r15), %xmm15
-        pxor	%xmm2, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqu	96(%rsp), %xmm7
-        movdqu	-112(%rdx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        aesenc	32(%r15), %xmm8
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        aesenc	32(%r15), %xmm9
-        aesenc	32(%r15), %xmm10
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        aesenc	32(%r15), %xmm11
-        aesenc	32(%r15), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        aesenc	32(%r15), %xmm13
-        aesenc	32(%r15), %xmm14
-        aesenc	32(%r15), %xmm15
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	80(%rsp), %xmm7
-        movdqu	-96(%rdx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        aesenc	48(%r15), %xmm8
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        aesenc	48(%r15), %xmm9
-        aesenc	48(%r15), %xmm10
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        aesenc	48(%r15), %xmm11
-        aesenc	48(%r15), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        aesenc	48(%r15), %xmm13
-        aesenc	48(%r15), %xmm14
-        aesenc	48(%r15), %xmm15
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	64(%rsp), %xmm7
-        movdqu	-80(%rdx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        aesenc	64(%r15), %xmm8
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        aesenc	64(%r15), %xmm9
-        aesenc	64(%r15), %xmm10
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        aesenc	64(%r15), %xmm11
-        aesenc	64(%r15), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        aesenc	64(%r15), %xmm13
-        aesenc	64(%r15), %xmm14
-        aesenc	64(%r15), %xmm15
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	48(%rsp), %xmm7
-        movdqu	-64(%rdx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        aesenc	80(%r15), %xmm8
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        aesenc	80(%r15), %xmm9
-        aesenc	80(%r15), %xmm10
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        aesenc	80(%r15), %xmm11
-        aesenc	80(%r15), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        aesenc	80(%r15), %xmm13
-        aesenc	80(%r15), %xmm14
-        aesenc	80(%r15), %xmm15
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	32(%rsp), %xmm7
-        movdqu	-48(%rdx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        aesenc	96(%r15), %xmm8
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        aesenc	96(%r15), %xmm9
-        aesenc	96(%r15), %xmm10
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        aesenc	96(%r15), %xmm11
-        aesenc	96(%r15), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        aesenc	96(%r15), %xmm13
-        aesenc	96(%r15), %xmm14
-        aesenc	96(%r15), %xmm15
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	16(%rsp), %xmm7
-        movdqu	-32(%rdx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        aesenc	112(%r15), %xmm8
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        aesenc	112(%r15), %xmm9
-        aesenc	112(%r15), %xmm10
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        aesenc	112(%r15), %xmm11
-        aesenc	112(%r15), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        aesenc	112(%r15), %xmm13
-        aesenc	112(%r15), %xmm14
-        aesenc	112(%r15), %xmm15
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	(%rsp), %xmm7
-        movdqu	-16(%rdx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        aesenc	128(%r15), %xmm8
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        aesenc	128(%r15), %xmm9
-        aesenc	128(%r15), %xmm10
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        aesenc	128(%r15), %xmm11
-        aesenc	128(%r15), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        aesenc	128(%r15), %xmm13
-        aesenc	128(%r15), %xmm14
-        aesenc	128(%r15), %xmm15
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
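-        # Fold the Karatsuba middle term into the 256-bit GHASH product and
-        # reduce it modulo x^128 + x^7 + x^2 + x + 1; the shifts by 31/30/25
-        # and 1/2/7 implement the bit-reflected reduction.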
-        movdqa	%xmm1, %xmm5
-        psrldq	$8, %xmm1
-        pslldq	$8, %xmm5
-        aesenc	144(%r15), %xmm8
-        pxor	%xmm5, %xmm2
-        pxor	%xmm1, %xmm3
-        movdqa	%xmm2, %xmm7
-        movdqa	%xmm2, %xmm4
-        movdqa	%xmm2, %xmm5
-        aesenc	144(%r15), %xmm9
-        pslld	$31, %xmm7
-        pslld	$30, %xmm4
-        pslld	$25, %xmm5
-        aesenc	144(%r15), %xmm10
-        pxor	%xmm4, %xmm7
-        pxor	%xmm5, %xmm7
-        aesenc	144(%r15), %xmm11
-        movdqa	%xmm7, %xmm4
-        pslldq	$12, %xmm7
-        psrldq	$4, %xmm4
-        aesenc	144(%r15), %xmm12
-        pxor	%xmm7, %xmm2
-        movdqa	%xmm2, %xmm5
-        movdqa	%xmm2, %xmm1
-        movdqa	%xmm2, %xmm0
-        aesenc	144(%r15), %xmm13
-        psrld	$0x01, %xmm5
-        psrld	$2, %xmm1
-        psrld	$7, %xmm0
-        aesenc	144(%r15), %xmm14
-        pxor	%xmm1, %xmm5
-        pxor	%xmm0, %xmm5
-        aesenc	144(%r15), %xmm15
-        pxor	%xmm4, %xmm5
-        pxor	%xmm5, %xmm2
-        pxor	%xmm3, %xmm2
-        cmpl	$11, %r10d
-        movdqa	160(%r15), %xmm7
-        jl	L_AES_GCM_encrypt_aesenc_128_ghash_avx_done
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	176(%r15), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        cmpl	$13, %r10d
-        movdqa	192(%r15), %xmm7
-        jl	L_AES_GCM_encrypt_aesenc_128_ghash_avx_done
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	208(%r15), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	224(%r15), %xmm7
-L_AES_GCM_encrypt_aesenc_128_ghash_avx_done:
-        aesenclast	%xmm7, %xmm8
-        aesenclast	%xmm7, %xmm9
-        movdqu	(%rcx), %xmm0
-        movdqu	16(%rcx), %xmm1
-        pxor	%xmm0, %xmm8
-        pxor	%xmm1, %xmm9
-        movdqu	%xmm8, (%rdx)
-        movdqu	%xmm9, 16(%rdx)
-        aesenclast	%xmm7, %xmm10
-        aesenclast	%xmm7, %xmm11
-        movdqu	32(%rcx), %xmm0
-        movdqu	48(%rcx), %xmm1
-        pxor	%xmm0, %xmm10
-        pxor	%xmm1, %xmm11
-        movdqu	%xmm10, 32(%rdx)
-        movdqu	%xmm11, 48(%rdx)
-        aesenclast	%xmm7, %xmm12
-        aesenclast	%xmm7, %xmm13
-        movdqu	64(%rcx), %xmm0
-        movdqu	80(%rcx), %xmm1
-        pxor	%xmm0, %xmm12
-        pxor	%xmm1, %xmm13
-        movdqu	%xmm12, 64(%rdx)
-        movdqu	%xmm13, 80(%rdx)
-        aesenclast	%xmm7, %xmm14
-        aesenclast	%xmm7, %xmm15
-        movdqu	96(%rcx), %xmm0
-        movdqu	112(%rcx), %xmm1
-        pxor	%xmm0, %xmm14
-        pxor	%xmm1, %xmm15
-        movdqu	%xmm14, 96(%rdx)
-        movdqu	%xmm15, 112(%rdx)
-        addl	$0x80, %ebx
-        cmpl	%r13d, %ebx
-        jl	L_AES_GCM_encrypt_ghash_128
-L_AES_GCM_encrypt_end_128:
-        movdqa	L_aes_gcm_bswap_mask(%rip), %xmm4
-        pshufb	%xmm4, %xmm8
-        pshufb	%xmm4, %xmm9
-        pshufb	%xmm4, %xmm10
-        pshufb	%xmm4, %xmm11
-        pxor	%xmm2, %xmm8
-        pshufb	%xmm4, %xmm12
-        pshufb	%xmm4, %xmm13
-        pshufb	%xmm4, %xmm14
-        pshufb	%xmm4, %xmm15
-        movdqu	112(%rsp), %xmm7
-        pshufd	$0x4e, %xmm8, %xmm1
-        pshufd	$0x4e, %xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        pclmulqdq	$0x11, %xmm8, %xmm3
-        pclmulqdq	$0x00, %xmm8, %xmm0
-        pxor	%xmm8, %xmm1
-        pxor	%xmm7, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm0, %xmm4
-        movdqa	%xmm3, %xmm6
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm4
-        pxor	%xmm1, %xmm6
-        movdqu	96(%rsp), %xmm7
-        pshufd	$0x4e, %xmm9, %xmm1
-        pshufd	$0x4e, %xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        pclmulqdq	$0x11, %xmm9, %xmm3
-        pclmulqdq	$0x00, %xmm9, %xmm0
-        pxor	%xmm9, %xmm1
-        pxor	%xmm7, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        pxor	%xmm0, %xmm4
-        pxor	%xmm3, %xmm6
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm4
-        pxor	%xmm1, %xmm6
-        movdqu	80(%rsp), %xmm7
-        pshufd	$0x4e, %xmm10, %xmm1
-        pshufd	$0x4e, %xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        pclmulqdq	$0x11, %xmm10, %xmm3
-        pclmulqdq	$0x00, %xmm10, %xmm0
-        pxor	%xmm10, %xmm1
-        pxor	%xmm7, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        pxor	%xmm0, %xmm4
-        pxor	%xmm3, %xmm6
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm4
-        pxor	%xmm1, %xmm6
-        movdqu	64(%rsp), %xmm7
-        pshufd	$0x4e, %xmm11, %xmm1
-        pshufd	$0x4e, %xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        pclmulqdq	$0x11, %xmm11, %xmm3
-        pclmulqdq	$0x00, %xmm11, %xmm0
-        pxor	%xmm11, %xmm1
-        pxor	%xmm7, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        pxor	%xmm0, %xmm4
-        pxor	%xmm3, %xmm6
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm4
-        pxor	%xmm1, %xmm6
-        movdqu	48(%rsp), %xmm7
-        pshufd	$0x4e, %xmm12, %xmm1
-        pshufd	$0x4e, %xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        pclmulqdq	$0x11, %xmm12, %xmm3
-        pclmulqdq	$0x00, %xmm12, %xmm0
-        pxor	%xmm12, %xmm1
-        pxor	%xmm7, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        pxor	%xmm0, %xmm4
-        pxor	%xmm3, %xmm6
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm4
-        pxor	%xmm1, %xmm6
-        movdqu	32(%rsp), %xmm7
-        pshufd	$0x4e, %xmm13, %xmm1
-        pshufd	$0x4e, %xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        pclmulqdq	$0x11, %xmm13, %xmm3
-        pclmulqdq	$0x00, %xmm13, %xmm0
-        pxor	%xmm13, %xmm1
-        pxor	%xmm7, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        pxor	%xmm0, %xmm4
-        pxor	%xmm3, %xmm6
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm4
-        pxor	%xmm1, %xmm6
-        movdqu	16(%rsp), %xmm7
-        pshufd	$0x4e, %xmm14, %xmm1
-        pshufd	$0x4e, %xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        pclmulqdq	$0x11, %xmm14, %xmm3
-        pclmulqdq	$0x00, %xmm14, %xmm0
-        pxor	%xmm14, %xmm1
-        pxor	%xmm7, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        pxor	%xmm0, %xmm4
-        pxor	%xmm3, %xmm6
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm4
-        pxor	%xmm1, %xmm6
-        movdqu	(%rsp), %xmm7
-        pshufd	$0x4e, %xmm15, %xmm1
-        pshufd	$0x4e, %xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        pclmulqdq	$0x11, %xmm15, %xmm3
-        pclmulqdq	$0x00, %xmm15, %xmm0
-        pxor	%xmm15, %xmm1
-        pxor	%xmm7, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        pxor	%xmm0, %xmm4
-        pxor	%xmm3, %xmm6
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm4
-        pxor	%xmm1, %xmm6
-        movdqa	%xmm4, %xmm0
-        movdqa	%xmm4, %xmm1
-        movdqa	%xmm4, %xmm2
-        pslld	$31, %xmm0
-        pslld	$30, %xmm1
-        pslld	$25, %xmm2
-        pxor	%xmm1, %xmm0
-        pxor	%xmm2, %xmm0
-        movdqa	%xmm0, %xmm1
-        psrldq	$4, %xmm1
-        pslldq	$12, %xmm0
-        pxor	%xmm0, %xmm4
-        movdqa	%xmm4, %xmm2
-        movdqa	%xmm4, %xmm3
-        movdqa	%xmm4, %xmm0
-        psrld	$0x01, %xmm2
-        psrld	$2, %xmm3
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm2
-        pxor	%xmm0, %xmm2
-        pxor	%xmm1, %xmm2
-        pxor	%xmm4, %xmm2
-        pxor	%xmm2, %xmm6
-        movdqu	(%rsp), %xmm5
-L_AES_GCM_encrypt_done_128:
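-        # Fewer than 128 bytes remain: handle full 16-byte blocks one at a
-        # time, encrypting a counter block, XORing it with the plaintext,
-        # and folding the byte-swapped ciphertext into the GHASH state in
-        # xmm6.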
-        movl	%r9d, %edx
-        cmpl	%edx, %ebx
-        jge	L_AES_GCM_encrypt_done_enc
-        movl	%r9d, %r13d
-        andl	$0xfffffff0, %r13d
-        cmpl	%r13d, %ebx
-        jge	L_AES_GCM_encrypt_last_block_done
-        leaq	(%rdi,%rbx,1), %rcx
-        leaq	(%rsi,%rbx,1), %rdx
-        movdqu	128(%rsp), %xmm8
-        movdqa	%xmm8, %xmm9
-        pshufb	L_aes_gcm_bswap_epi64(%rip), %xmm8
-        paddd	L_aes_gcm_one(%rip), %xmm9
-        pxor	(%r15), %xmm8
-        movdqu	%xmm9, 128(%rsp)
-        aesenc	16(%r15), %xmm8
-        aesenc	32(%r15), %xmm8
-        aesenc	48(%r15), %xmm8
-        aesenc	64(%r15), %xmm8
-        aesenc	80(%r15), %xmm8
-        aesenc	96(%r15), %xmm8
-        aesenc	112(%r15), %xmm8
-        aesenc	128(%r15), %xmm8
-        aesenc	144(%r15), %xmm8
-        cmpl	$11, %r10d
-        movdqa	160(%r15), %xmm9
-        jl	L_AES_GCM_encrypt_aesenc_block_aesenc_avx_last
-        aesenc	%xmm9, %xmm8
-        aesenc	176(%r15), %xmm8
-        cmpl	$13, %r10d
-        movdqa	192(%r15), %xmm9
-        jl	L_AES_GCM_encrypt_aesenc_block_aesenc_avx_last
-        aesenc	%xmm9, %xmm8
-        aesenc	208(%r15), %xmm8
-        movdqa	224(%r15), %xmm9
-L_AES_GCM_encrypt_aesenc_block_aesenc_avx_last:
-        aesenclast	%xmm9, %xmm8
-        movdqu	(%rcx), %xmm9
-        pxor	%xmm9, %xmm8
-        movdqu	%xmm8, (%rdx)
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm8
-        pxor	%xmm8, %xmm6
-        addl	$16, %ebx
-        cmpl	%r13d, %ebx
-        jge	L_AES_GCM_encrypt_last_block_ghash
-L_AES_GCM_encrypt_last_block_start:
-        leaq	(%rdi,%rbx,1), %rcx
-        leaq	(%rsi,%rbx,1), %rdx
-        movdqu	128(%rsp), %xmm8
-        movdqa	%xmm8, %xmm9
-        pshufb	L_aes_gcm_bswap_epi64(%rip), %xmm8
-        paddd	L_aes_gcm_one(%rip), %xmm9
-        pxor	(%r15), %xmm8
-        movdqu	%xmm9, 128(%rsp)
-        movdqa	%xmm6, %xmm10
-        pclmulqdq	$16, %xmm5, %xmm10
-        aesenc	16(%r15), %xmm8
-        aesenc	32(%r15), %xmm8
-        movdqa	%xmm6, %xmm11
-        pclmulqdq	$0x01, %xmm5, %xmm11
-        aesenc	48(%r15), %xmm8
-        aesenc	64(%r15), %xmm8
-        movdqa	%xmm6, %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm12
-        aesenc	80(%r15), %xmm8
-        movdqa	%xmm6, %xmm1
-        pclmulqdq	$0x11, %xmm5, %xmm1
-        aesenc	96(%r15), %xmm8
-        pxor	%xmm11, %xmm10
-        movdqa	%xmm10, %xmm2
-        psrldq	$8, %xmm10
-        pslldq	$8, %xmm2
-        aesenc	112(%r15), %xmm8
-        movdqa	%xmm1, %xmm3
-        pxor	%xmm12, %xmm2
-        pxor	%xmm10, %xmm3
-        movdqa	L_aes_gcm_mod2_128(%rip), %xmm0
-        movdqa	%xmm2, %xmm11
-        pclmulqdq	$16, %xmm0, %xmm11
-        aesenc	128(%r15), %xmm8
-        pshufd	$0x4e, %xmm2, %xmm10
-        pxor	%xmm11, %xmm10
-        movdqa	%xmm10, %xmm11
-        pclmulqdq	$16, %xmm0, %xmm11
-        aesenc	144(%r15), %xmm8
-        pshufd	$0x4e, %xmm10, %xmm6
-        pxor	%xmm11, %xmm6
-        pxor	%xmm3, %xmm6
-        cmpl	$11, %r10d
-        movdqa	160(%r15), %xmm9
-        jl	L_AES_GCM_encrypt_aesenc_gfmul_last
-        aesenc	%xmm9, %xmm8
-        aesenc	176(%r15), %xmm8
-        cmpl	$13, %r10d
-        movdqa	192(%r15), %xmm9
-        jl	L_AES_GCM_encrypt_aesenc_gfmul_last
-        aesenc	%xmm9, %xmm8
-        aesenc	208(%r15), %xmm8
-        movdqa	224(%r15), %xmm9
-L_AES_GCM_encrypt_aesenc_gfmul_last:
-        aesenclast	%xmm9, %xmm8
-        movdqu	(%rcx), %xmm9
-        pxor	%xmm9, %xmm8
-        movdqu	%xmm8, (%rdx)
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm8
-        pxor	%xmm8, %xmm6
-        addl	$16, %ebx
-        cmpl	%r13d, %ebx
-        jl	L_AES_GCM_encrypt_last_block_start
-L_AES_GCM_encrypt_last_block_ghash:
-        pshufd	$0x4e, %xmm5, %xmm9
-        pshufd	$0x4e, %xmm6, %xmm10
-        movdqa	%xmm6, %xmm11
-        movdqa	%xmm6, %xmm8
-        pclmulqdq	$0x11, %xmm5, %xmm11
-        pclmulqdq	$0x00, %xmm5, %xmm8
-        pxor	%xmm5, %xmm9
-        pxor	%xmm6, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm6
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm6
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm6
-L_AES_GCM_encrypt_last_block_done:
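-        # Handle a final partial block (length % 16 != 0): encrypt one more
-        # counter block, XOR it with the input byte-by-byte through a
-        # 16-byte stack buffer, zero-pad the buffer, and fold the result
-        # into the GHASH state.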
-        movl	%r9d, %ecx
-        movl	%ecx, %edx
-        andl	$15, %ecx
-        jz	L_AES_GCM_encrypt_aesenc_last15_enc_avx_done
-        movdqu	128(%rsp), %xmm4
-        pshufb	L_aes_gcm_bswap_epi64(%rip), %xmm4
-        pxor	(%r15), %xmm4
-        aesenc	16(%r15), %xmm4
-        aesenc	32(%r15), %xmm4
-        aesenc	48(%r15), %xmm4
-        aesenc	64(%r15), %xmm4
-        aesenc	80(%r15), %xmm4
-        aesenc	96(%r15), %xmm4
-        aesenc	112(%r15), %xmm4
-        aesenc	128(%r15), %xmm4
-        aesenc	144(%r15), %xmm4
-        cmpl	$11, %r10d
-        movdqa	160(%r15), %xmm9
-        jl	L_AES_GCM_encrypt_aesenc_last15_enc_avx_aesenc_avx_last
-        aesenc	%xmm9, %xmm4
-        aesenc	176(%r15), %xmm4
-        cmpl	$13, %r10d
-        movdqa	192(%r15), %xmm9
-        jl	L_AES_GCM_encrypt_aesenc_last15_enc_avx_aesenc_avx_last
-        aesenc	%xmm9, %xmm4
-        aesenc	208(%r15), %xmm4
-        movdqa	224(%r15), %xmm9
-L_AES_GCM_encrypt_aesenc_last15_enc_avx_aesenc_avx_last:
-        aesenclast	%xmm9, %xmm4
-        subq	$16, %rsp
-        xorl	%ecx, %ecx
-        movdqu	%xmm4, (%rsp)
-L_AES_GCM_encrypt_aesenc_last15_enc_avx_loop:
-        movzbl	(%rdi,%rbx,1), %r13d
-        xorb	(%rsp,%rcx,1), %r13b
-        movb	%r13b, (%rsi,%rbx,1)
-        movb	%r13b, (%rsp,%rcx,1)
-        incl	%ebx
-        incl	%ecx
-        cmpl	%edx, %ebx
-        jl	L_AES_GCM_encrypt_aesenc_last15_enc_avx_loop
-        xorq	%r13, %r13
-        cmpl	$16, %ecx
-        je	L_AES_GCM_encrypt_aesenc_last15_enc_avx_finish_enc
-L_AES_GCM_encrypt_aesenc_last15_enc_avx_byte_loop:
-        movb	%r13b, (%rsp,%rcx,1)
-        incl	%ecx
-        cmpl	$16, %ecx
-        jl	L_AES_GCM_encrypt_aesenc_last15_enc_avx_byte_loop
-L_AES_GCM_encrypt_aesenc_last15_enc_avx_finish_enc:
-        movdqu	(%rsp), %xmm4
-        addq	$16, %rsp
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm4
-        pxor	%xmm4, %xmm6
-        pshufd	$0x4e, %xmm5, %xmm9
-        pshufd	$0x4e, %xmm6, %xmm10
-        movdqa	%xmm6, %xmm11
-        movdqa	%xmm6, %xmm8
-        pclmulqdq	$0x11, %xmm5, %xmm11
-        pclmulqdq	$0x00, %xmm5, %xmm8
-        pxor	%xmm5, %xmm9
-        pxor	%xmm6, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm6
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm6
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm6
-L_AES_GCM_encrypt_aesenc_last15_enc_avx_done:
-L_AES_GCM_encrypt_done_enc:
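-        # Finalize: GHASH the length block (AAD and ciphertext bit lengths),
-        # byte-swap, and XOR with the encrypted initial counter saved at
-        # 144(%rsp) to form the tag; store 16 bytes directly or the
-        # requested tag length byte-by-byte.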
-        movl	%r9d, %edx
-        movl	%r11d, %ecx
-        shlq	$3, %rdx
-        shlq	$3, %rcx
-        pinsrq	$0x00, %rdx, %xmm0
-        pinsrq	$0x01, %rcx, %xmm0
-        pxor	%xmm0, %xmm6
-        pshufd	$0x4e, %xmm5, %xmm9
-        pshufd	$0x4e, %xmm6, %xmm10
-        movdqa	%xmm6, %xmm11
-        movdqa	%xmm6, %xmm8
-        pclmulqdq	$0x11, %xmm5, %xmm11
-        pclmulqdq	$0x00, %xmm5, %xmm8
-        pxor	%xmm5, %xmm9
-        pxor	%xmm6, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm6
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm6
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm6
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm6
-        movdqu	144(%rsp), %xmm0
-        pxor	%xmm6, %xmm0
-        cmpl	$16, %r14d
-        je	L_AES_GCM_encrypt_store_tag_16
-        xorq	%rcx, %rcx
-        movdqu	%xmm0, (%rsp)
-L_AES_GCM_encrypt_store_tag_loop:
-        movzbl	(%rsp,%rcx,1), %r13d
-        movb	%r13b, (%r8,%rcx,1)
-        incl	%ecx
-        cmpl	%r14d, %ecx
-        jne	L_AES_GCM_encrypt_store_tag_loop
-        jmp	L_AES_GCM_encrypt_store_tag_done
-L_AES_GCM_encrypt_store_tag_16:
-        movdqu	%xmm0, (%r8)
-L_AES_GCM_encrypt_store_tag_done:
-        addq	$0xa0, %rsp
-        popq	%r15
-        popq	%r14
-        popq	%rbx
-        popq	%r12
-        popq	%r13
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_encrypt,.-AES_GCM_encrypt
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_decrypt
-.type	AES_GCM_decrypt,@function
-.align	16
-AES_GCM_decrypt:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_decrypt
-.p2align	4
-_AES_GCM_decrypt:
-#endif /* __APPLE__ */
-        pushq	%r13
-        pushq	%r12
-        pushq	%rbx
-        pushq	%r14
-        pushq	%r15
-        pushq	%rbp
-        movq	%rdx, %r12
-        movq	%rcx, %rax
-        movl	56(%rsp), %r11d
-        movl	64(%rsp), %ebx
-        movl	72(%rsp), %r14d
-        movq	80(%rsp), %r15
-        movl	88(%rsp), %r10d
-        movq	96(%rsp), %rbp
-        subq	$0xa8, %rsp
-        pxor	%xmm4, %xmm4
-        pxor	%xmm6, %xmm6
-        cmpl	$12, %ebx
-        movl	%ebx, %edx
-        jne	L_AES_GCM_decrypt_iv_not_12
-        # Calculate values when IV is 12 bytes
-        # Set counter based on IV
-        movl	$0x1000000, %ecx
-        pinsrq	$0x00, (%rax), %xmm4
-        pinsrd	$2, 8(%rax), %xmm4
-        pinsrd	$3, %ecx, %xmm4
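-        # 96-bit IV: counter block Y0 = IV || 0x00000001 (big-endian)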
-        # H = Encrypt X(=0) and T = Encrypt counter
-        movdqa	%xmm4, %xmm1
-        movdqa	(%r15), %xmm5
-        pxor	%xmm5, %xmm1
-        movdqa	16(%r15), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	32(%r15), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	48(%r15), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	64(%r15), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	80(%r15), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	96(%r15), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	112(%r15), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	128(%r15), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	144(%r15), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        cmpl	$11, %r10d
-        movdqa	160(%r15), %xmm7
-        jl	L_AES_GCM_decrypt_calc_iv_12_last
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	176(%r15), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        cmpl	$13, %r10d
-        movdqa	192(%r15), %xmm7
-        jl	L_AES_GCM_decrypt_calc_iv_12_last
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	208(%r15), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	224(%r15), %xmm7
-L_AES_GCM_decrypt_calc_iv_12_last:
-        aesenclast	%xmm7, %xmm5
-        aesenclast	%xmm7, %xmm1
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm5
-        movdqu	%xmm1, 144(%rsp)
-        jmp	L_AES_GCM_decrypt_iv_done
-L_AES_GCM_decrypt_iv_not_12:
-        # Calculate values when IV is not 12 bytes
-        # H = Encrypt X(=0)
-        movdqa	(%r15), %xmm5
-        aesenc	16(%r15), %xmm5
-        aesenc	32(%r15), %xmm5
-        aesenc	48(%r15), %xmm5
-        aesenc	64(%r15), %xmm5
-        aesenc	80(%r15), %xmm5
-        aesenc	96(%r15), %xmm5
-        aesenc	112(%r15), %xmm5
-        aesenc	128(%r15), %xmm5
-        aesenc	144(%r15), %xmm5
-        cmpl	$11, %r10d
-        movdqa	160(%r15), %xmm9
-        jl	L_AES_GCM_decrypt_calc_iv_1_aesenc_avx_last
-        aesenc	%xmm9, %xmm5
-        aesenc	176(%r15), %xmm5
-        cmpl	$13, %r10d
-        movdqa	192(%r15), %xmm9
-        jl	L_AES_GCM_decrypt_calc_iv_1_aesenc_avx_last
-        aesenc	%xmm9, %xmm5
-        aesenc	208(%r15), %xmm5
-        movdqa	224(%r15), %xmm9
-L_AES_GCM_decrypt_calc_iv_1_aesenc_avx_last:
-        aesenclast	%xmm9, %xmm5
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm5
-        # Calc counter
-        # Initialization vector
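-        # GHASH the IV in 16-byte chunks (zero-padding any tail), then
-        # append the IV bit length below to form the initial counter Y0.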
-        cmpl	$0x00, %edx
-        movq	$0x00, %rcx
-        je	L_AES_GCM_decrypt_calc_iv_done
-        cmpl	$16, %edx
-        jl	L_AES_GCM_decrypt_calc_iv_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_decrypt_calc_iv_16_loop:
-        movdqu	(%rax,%rcx,1), %xmm8
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm8
-        pxor	%xmm8, %xmm4
-        pshufd	$0x4e, %xmm4, %xmm1
-        pshufd	$0x4e, %xmm5, %xmm2
-        movdqa	%xmm5, %xmm3
-        movdqa	%xmm5, %xmm0
-        pclmulqdq	$0x11, %xmm4, %xmm3
-        pclmulqdq	$0x00, %xmm4, %xmm0
-        pxor	%xmm4, %xmm1
-        pxor	%xmm5, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm0, %xmm7
-        movdqa	%xmm3, %xmm4
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm7
-        pxor	%xmm1, %xmm4
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm4, %xmm1
-        psrld	$31, %xmm0
-        psrld	$31, %xmm1
-        pslld	$0x01, %xmm7
-        pslld	$0x01, %xmm4
-        movdqa	%xmm0, %xmm2
-        pslldq	$4, %xmm0
-        psrldq	$12, %xmm2
-        pslldq	$4, %xmm1
-        por	%xmm2, %xmm4
-        por	%xmm0, %xmm7
-        por	%xmm1, %xmm4
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm7, %xmm1
-        movdqa	%xmm7, %xmm2
-        pslld	$31, %xmm0
-        pslld	$30, %xmm1
-        pslld	$25, %xmm2
-        pxor	%xmm1, %xmm0
-        pxor	%xmm2, %xmm0
-        movdqa	%xmm0, %xmm1
-        psrldq	$4, %xmm1
-        pslldq	$12, %xmm0
-        pxor	%xmm0, %xmm7
-        movdqa	%xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        psrld	$0x01, %xmm2
-        psrld	$2, %xmm3
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm2
-        pxor	%xmm0, %xmm2
-        pxor	%xmm1, %xmm2
-        pxor	%xmm7, %xmm2
-        pxor	%xmm2, %xmm4
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_decrypt_calc_iv_16_loop
-        movl	%ebx, %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_decrypt_calc_iv_done
-L_AES_GCM_decrypt_calc_iv_lt16:
-        subq	$16, %rsp
-        pxor	%xmm8, %xmm8
-        xorl	%ebx, %ebx
-        movdqu	%xmm8, (%rsp)
-L_AES_GCM_decrypt_calc_iv_loop:
-        movzbl	(%rax,%rcx,1), %r13d
-        movb	%r13b, (%rsp,%rbx,1)
-        incl	%ecx
-        incl	%ebx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_decrypt_calc_iv_loop
-        movdqu	(%rsp), %xmm8
-        addq	$16, %rsp
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm8
-        pxor	%xmm8, %xmm4
-        pshufd	$0x4e, %xmm4, %xmm1
-        pshufd	$0x4e, %xmm5, %xmm2
-        movdqa	%xmm5, %xmm3
-        movdqa	%xmm5, %xmm0
-        pclmulqdq	$0x11, %xmm4, %xmm3
-        pclmulqdq	$0x00, %xmm4, %xmm0
-        pxor	%xmm4, %xmm1
-        pxor	%xmm5, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm0, %xmm7
-        movdqa	%xmm3, %xmm4
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm7
-        pxor	%xmm1, %xmm4
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm4, %xmm1
-        psrld	$31, %xmm0
-        psrld	$31, %xmm1
-        pslld	$0x01, %xmm7
-        pslld	$0x01, %xmm4
-        movdqa	%xmm0, %xmm2
-        pslldq	$4, %xmm0
-        psrldq	$12, %xmm2
-        pslldq	$4, %xmm1
-        por	%xmm2, %xmm4
-        por	%xmm0, %xmm7
-        por	%xmm1, %xmm4
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm7, %xmm1
-        movdqa	%xmm7, %xmm2
-        pslld	$31, %xmm0
-        pslld	$30, %xmm1
-        pslld	$25, %xmm2
-        pxor	%xmm1, %xmm0
-        pxor	%xmm2, %xmm0
-        movdqa	%xmm0, %xmm1
-        psrldq	$4, %xmm1
-        pslldq	$12, %xmm0
-        pxor	%xmm0, %xmm7
-        movdqa	%xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        psrld	$0x01, %xmm2
-        psrld	$2, %xmm3
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm2
-        pxor	%xmm0, %xmm2
-        pxor	%xmm1, %xmm2
-        pxor	%xmm7, %xmm2
-        pxor	%xmm2, %xmm4
-L_AES_GCM_decrypt_calc_iv_done:
-        # T = Encrypt counter
-        pxor	%xmm0, %xmm0
-        shll	$3, %edx
-        pinsrq	$0x00, %rdx, %xmm0
-        pxor	%xmm0, %xmm4
-        pshufd	$0x4e, %xmm4, %xmm1
-        pshufd	$0x4e, %xmm5, %xmm2
-        movdqa	%xmm5, %xmm3
-        movdqa	%xmm5, %xmm0
-        pclmulqdq	$0x11, %xmm4, %xmm3
-        pclmulqdq	$0x00, %xmm4, %xmm0
-        pxor	%xmm4, %xmm1
-        pxor	%xmm5, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm0, %xmm7
-        movdqa	%xmm3, %xmm4
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm7
-        pxor	%xmm1, %xmm4
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm4, %xmm1
-        psrld	$31, %xmm0
-        psrld	$31, %xmm1
-        pslld	$0x01, %xmm7
-        pslld	$0x01, %xmm4
-        movdqa	%xmm0, %xmm2
-        pslldq	$4, %xmm0
-        psrldq	$12, %xmm2
-        pslldq	$4, %xmm1
-        por	%xmm2, %xmm4
-        por	%xmm0, %xmm7
-        por	%xmm1, %xmm4
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm7, %xmm1
-        movdqa	%xmm7, %xmm2
-        pslld	$31, %xmm0
-        pslld	$30, %xmm1
-        pslld	$25, %xmm2
-        pxor	%xmm1, %xmm0
-        pxor	%xmm2, %xmm0
-        movdqa	%xmm0, %xmm1
-        psrldq	$4, %xmm1
-        pslldq	$12, %xmm0
-        pxor	%xmm0, %xmm7
-        movdqa	%xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        psrld	$0x01, %xmm2
-        psrld	$2, %xmm3
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm2
-        pxor	%xmm0, %xmm2
-        pxor	%xmm1, %xmm2
-        pxor	%xmm7, %xmm2
-        pxor	%xmm2, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm4
-        #   Encrypt counter
-        movdqa	(%r15), %xmm8
-        pxor	%xmm4, %xmm8
-        aesenc	16(%r15), %xmm8
-        aesenc	32(%r15), %xmm8
-        aesenc	48(%r15), %xmm8
-        aesenc	64(%r15), %xmm8
-        aesenc	80(%r15), %xmm8
-        aesenc	96(%r15), %xmm8
-        aesenc	112(%r15), %xmm8
-        aesenc	128(%r15), %xmm8
-        aesenc	144(%r15), %xmm8
-        cmpl	$11, %r10d
-        movdqa	160(%r15), %xmm9
-        jl	L_AES_GCM_decrypt_calc_iv_2_aesenc_avx_last
-        aesenc	%xmm9, %xmm8
-        aesenc	176(%r15), %xmm8
-        cmpl	$13, %r10d
-        movdqa	192(%r15), %xmm9
-        jl	L_AES_GCM_decrypt_calc_iv_2_aesenc_avx_last
-        aesenc	%xmm9, %xmm8
-        aesenc	208(%r15), %xmm8
-        movdqa	224(%r15), %xmm9
-L_AES_GCM_decrypt_calc_iv_2_aesenc_avx_last:
-        aesenclast	%xmm9, %xmm8
-        movdqu	%xmm8, 144(%rsp)
-L_AES_GCM_decrypt_iv_done:
-        # Additional authentication data
-        movl	%r11d, %edx
-        cmpl	$0x00, %edx
-        je	L_AES_GCM_decrypt_calc_aad_done
-        xorl	%ecx, %ecx
-        cmpl	$16, %edx
-        jl	L_AES_GCM_decrypt_calc_aad_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_decrypt_calc_aad_16_loop:
-        movdqu	(%r12,%rcx,1), %xmm8
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm8
-        pxor	%xmm8, %xmm6
-        pshufd	$0x4e, %xmm6, %xmm1
-        pshufd	$0x4e, %xmm5, %xmm2
-        movdqa	%xmm5, %xmm3
-        movdqa	%xmm5, %xmm0
-        pclmulqdq	$0x11, %xmm6, %xmm3
-        pclmulqdq	$0x00, %xmm6, %xmm0
-        pxor	%xmm6, %xmm1
-        pxor	%xmm5, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm0, %xmm7
-        movdqa	%xmm3, %xmm6
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm7
-        pxor	%xmm1, %xmm6
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm6, %xmm1
-        psrld	$31, %xmm0
-        psrld	$31, %xmm1
-        pslld	$0x01, %xmm7
-        pslld	$0x01, %xmm6
-        movdqa	%xmm0, %xmm2
-        pslldq	$4, %xmm0
-        psrldq	$12, %xmm2
-        pslldq	$4, %xmm1
-        por	%xmm2, %xmm6
-        por	%xmm0, %xmm7
-        por	%xmm1, %xmm6
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm7, %xmm1
-        movdqa	%xmm7, %xmm2
-        pslld	$31, %xmm0
-        pslld	$30, %xmm1
-        pslld	$25, %xmm2
-        pxor	%xmm1, %xmm0
-        pxor	%xmm2, %xmm0
-        movdqa	%xmm0, %xmm1
-        psrldq	$4, %xmm1
-        pslldq	$12, %xmm0
-        pxor	%xmm0, %xmm7
-        movdqa	%xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        psrld	$0x01, %xmm2
-        psrld	$2, %xmm3
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm2
-        pxor	%xmm0, %xmm2
-        pxor	%xmm1, %xmm2
-        pxor	%xmm7, %xmm2
-        pxor	%xmm2, %xmm6
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_decrypt_calc_aad_16_loop
-        movl	%r11d, %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_decrypt_calc_aad_done
-L_AES_GCM_decrypt_calc_aad_lt16:
-        subq	$16, %rsp
-        pxor	%xmm8, %xmm8
-        xorl	%ebx, %ebx
-        movdqu	%xmm8, (%rsp)
-L_AES_GCM_decrypt_calc_aad_loop:
-        movzbl	(%r12,%rcx,1), %r13d
-        movb	%r13b, (%rsp,%rbx,1)
-        incl	%ecx
-        incl	%ebx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_decrypt_calc_aad_loop
-        movdqu	(%rsp), %xmm8
-        addq	$16, %rsp
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm8
-        pxor	%xmm8, %xmm6
-        pshufd	$0x4e, %xmm6, %xmm1
-        pshufd	$0x4e, %xmm5, %xmm2
-        movdqa	%xmm5, %xmm3
-        movdqa	%xmm5, %xmm0
-        pclmulqdq	$0x11, %xmm6, %xmm3
-        pclmulqdq	$0x00, %xmm6, %xmm0
-        pxor	%xmm6, %xmm1
-        pxor	%xmm5, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm0, %xmm7
-        movdqa	%xmm3, %xmm6
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm7
-        pxor	%xmm1, %xmm6
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm6, %xmm1
-        psrld	$31, %xmm0
-        psrld	$31, %xmm1
-        pslld	$0x01, %xmm7
-        pslld	$0x01, %xmm6
-        movdqa	%xmm0, %xmm2
-        pslldq	$4, %xmm0
-        psrldq	$12, %xmm2
-        pslldq	$4, %xmm1
-        por	%xmm2, %xmm6
-        por	%xmm0, %xmm7
-        por	%xmm1, %xmm6
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm7, %xmm1
-        movdqa	%xmm7, %xmm2
-        pslld	$31, %xmm0
-        pslld	$30, %xmm1
-        pslld	$25, %xmm2
-        pxor	%xmm1, %xmm0
-        pxor	%xmm2, %xmm0
-        movdqa	%xmm0, %xmm1
-        psrldq	$4, %xmm1
-        pslldq	$12, %xmm0
-        pxor	%xmm0, %xmm7
-        movdqa	%xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        psrld	$0x01, %xmm2
-        psrld	$2, %xmm3
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm2
-        pxor	%xmm0, %xmm2
-        pxor	%xmm1, %xmm2
-        pxor	%xmm7, %xmm2
-        pxor	%xmm2, %xmm6
-L_AES_GCM_decrypt_calc_aad_done:
-        # Calculate counter and H
-        pshufb	L_aes_gcm_bswap_epi64(%rip), %xmm4
-        movdqa	%xmm5, %xmm9
-        paddd	L_aes_gcm_one(%rip), %xmm4
-        movdqa	%xmm5, %xmm8
-        movdqu	%xmm4, 128(%rsp)
-        psrlq	$63, %xmm9
-        psllq	$0x01, %xmm8
-        pslldq	$8, %xmm9
-        por	%xmm9, %xmm8
-        pshufd	$0xff, %xmm5, %xmm5
-        psrad	$31, %xmm5
-        pand	L_aes_gcm_mod2_128(%rip), %xmm5
-        pxor	%xmm8, %xmm5
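-        # Shift H left one bit, XORing in the polynomial from
-        # L_aes_gcm_mod2_128 when the top bit carries out; this prepares H
-        # for the pclmulqdq-based multiplies below.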
-        xorl	%ebx, %ebx
-        cmpl	$0x80, %r9d
-        movl	%r9d, %r13d
-        jl	L_AES_GCM_decrypt_done_128
-        andl	$0xffffff80, %r13d
-        movdqa	%xmm6, %xmm2
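-        # At least 128 bytes of ciphertext: precompute the key powers
-        # H^1..H^8 and store them at (%rsp)..112(%rsp) for the 8-way
-        # aggregated GHASH loop.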
-        # H ^ 1
-        movdqu	%xmm5, (%rsp)
-        # H ^ 2
-        pshufd	$0x4e, %xmm5, %xmm9
-        pshufd	$0x4e, %xmm5, %xmm10
-        movdqa	%xmm5, %xmm11
-        movdqa	%xmm5, %xmm8
-        pclmulqdq	$0x11, %xmm5, %xmm11
-        pclmulqdq	$0x00, %xmm5, %xmm8
-        pxor	%xmm5, %xmm9
-        pxor	%xmm5, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm0
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm0
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm0
-        movdqu	%xmm0, 16(%rsp)
-        # H ^ 3
-        pshufd	$0x4e, %xmm5, %xmm9
-        pshufd	$0x4e, %xmm0, %xmm10
-        movdqa	%xmm0, %xmm11
-        movdqa	%xmm0, %xmm8
-        pclmulqdq	$0x11, %xmm5, %xmm11
-        pclmulqdq	$0x00, %xmm5, %xmm8
-        pxor	%xmm5, %xmm9
-        pxor	%xmm0, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm1
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm1
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm1
-        movdqu	%xmm1, 32(%rsp)
-        # H ^ 4
-        pshufd	$0x4e, %xmm0, %xmm9
-        pshufd	$0x4e, %xmm0, %xmm10
-        movdqa	%xmm0, %xmm11
-        movdqa	%xmm0, %xmm8
-        pclmulqdq	$0x11, %xmm0, %xmm11
-        pclmulqdq	$0x00, %xmm0, %xmm8
-        pxor	%xmm0, %xmm9
-        pxor	%xmm0, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm3
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm3
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm3
-        movdqu	%xmm3, 48(%rsp)
-        # H ^ 5
-        pshufd	$0x4e, %xmm0, %xmm9
-        pshufd	$0x4e, %xmm1, %xmm10
-        movdqa	%xmm1, %xmm11
-        movdqa	%xmm1, %xmm8
-        pclmulqdq	$0x11, %xmm0, %xmm11
-        pclmulqdq	$0x00, %xmm0, %xmm8
-        pxor	%xmm0, %xmm9
-        pxor	%xmm1, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm7
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm7
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm7
-        movdqu	%xmm7, 64(%rsp)
-        # H ^ 6
-        pshufd	$0x4e, %xmm1, %xmm9
-        pshufd	$0x4e, %xmm1, %xmm10
-        movdqa	%xmm1, %xmm11
-        movdqa	%xmm1, %xmm8
-        pclmulqdq	$0x11, %xmm1, %xmm11
-        pclmulqdq	$0x00, %xmm1, %xmm8
-        pxor	%xmm1, %xmm9
-        pxor	%xmm1, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm7
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm7
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm7
-        movdqu	%xmm7, 80(%rsp)
-        # H ^ 7
-        pshufd	$0x4e, %xmm1, %xmm9
-        pshufd	$0x4e, %xmm3, %xmm10
-        movdqa	%xmm3, %xmm11
-        movdqa	%xmm3, %xmm8
-        pclmulqdq	$0x11, %xmm1, %xmm11
-        pclmulqdq	$0x00, %xmm1, %xmm8
-        pxor	%xmm1, %xmm9
-        pxor	%xmm3, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm7
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm7
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm7
-        movdqu	%xmm7, 96(%rsp)
-        # H ^ 8
-        pshufd	$0x4e, %xmm3, %xmm9
-        pshufd	$0x4e, %xmm3, %xmm10
-        movdqa	%xmm3, %xmm11
-        movdqa	%xmm3, %xmm8
-        pclmulqdq	$0x11, %xmm3, %xmm11
-        pclmulqdq	$0x00, %xmm3, %xmm8
-        pxor	%xmm3, %xmm9
-        pxor	%xmm3, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm7
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm7
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm7
-        movdqu	%xmm7, 112(%rsp)
-L_AES_GCM_decrypt_ghash_128:
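-        # Decrypt loop: as in encrypt, but GHASH takes the ciphertext read
-        # directly from the input (%rcx), so the multiplies and the CTR
-        # decryption of the same blocks proceed in parallel.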
-        leaq	(%rdi,%rbx,1), %rcx
-        leaq	(%rsi,%rbx,1), %rdx
-        movdqu	128(%rsp), %xmm8
-        movdqa	L_aes_gcm_bswap_epi64(%rip), %xmm1
-        movdqa	%xmm8, %xmm0
-        pshufb	%xmm1, %xmm8
-        movdqa	%xmm0, %xmm9
-        paddd	L_aes_gcm_one(%rip), %xmm9
-        pshufb	%xmm1, %xmm9
-        movdqa	%xmm0, %xmm10
-        paddd	L_aes_gcm_two(%rip), %xmm10
-        pshufb	%xmm1, %xmm10
-        movdqa	%xmm0, %xmm11
-        paddd	L_aes_gcm_three(%rip), %xmm11
-        pshufb	%xmm1, %xmm11
-        movdqa	%xmm0, %xmm12
-        paddd	L_aes_gcm_four(%rip), %xmm12
-        pshufb	%xmm1, %xmm12
-        movdqa	%xmm0, %xmm13
-        paddd	L_aes_gcm_five(%rip), %xmm13
-        pshufb	%xmm1, %xmm13
-        movdqa	%xmm0, %xmm14
-        paddd	L_aes_gcm_six(%rip), %xmm14
-        pshufb	%xmm1, %xmm14
-        movdqa	%xmm0, %xmm15
-        paddd	L_aes_gcm_seven(%rip), %xmm15
-        pshufb	%xmm1, %xmm15
-        paddd	L_aes_gcm_eight(%rip), %xmm0
-        movdqa	(%r15), %xmm7
-        movdqu	%xmm0, 128(%rsp)
-        pxor	%xmm7, %xmm8
-        pxor	%xmm7, %xmm9
-        pxor	%xmm7, %xmm10
-        pxor	%xmm7, %xmm11
-        pxor	%xmm7, %xmm12
-        pxor	%xmm7, %xmm13
-        pxor	%xmm7, %xmm14
-        pxor	%xmm7, %xmm15
-        movdqu	112(%rsp), %xmm7
-        movdqu	(%rcx), %xmm0
-        aesenc	16(%r15), %xmm8
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        pxor	%xmm2, %xmm0
-        pshufd	$0x4e, %xmm7, %xmm1
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm7, %xmm1
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm3
-        pclmulqdq	$0x11, %xmm7, %xmm3
-        aesenc	16(%r15), %xmm9
-        aesenc	16(%r15), %xmm10
-        movdqa	%xmm0, %xmm2
-        pclmulqdq	$0x00, %xmm7, %xmm2
-        aesenc	16(%r15), %xmm11
-        aesenc	16(%r15), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm1
-        aesenc	16(%r15), %xmm13
-        aesenc	16(%r15), %xmm14
-        aesenc	16(%r15), %xmm15
-        pxor	%xmm2, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqu	96(%rsp), %xmm7
-        movdqu	16(%rcx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        aesenc	32(%r15), %xmm8
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        aesenc	32(%r15), %xmm9
-        aesenc	32(%r15), %xmm10
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        aesenc	32(%r15), %xmm11
-        aesenc	32(%r15), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        aesenc	32(%r15), %xmm13
-        aesenc	32(%r15), %xmm14
-        aesenc	32(%r15), %xmm15
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	80(%rsp), %xmm7
-        movdqu	32(%rcx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        aesenc	48(%r15), %xmm8
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        aesenc	48(%r15), %xmm9
-        aesenc	48(%r15), %xmm10
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        aesenc	48(%r15), %xmm11
-        aesenc	48(%r15), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        aesenc	48(%r15), %xmm13
-        aesenc	48(%r15), %xmm14
-        aesenc	48(%r15), %xmm15
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	64(%rsp), %xmm7
-        movdqu	48(%rcx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        aesenc	64(%r15), %xmm8
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        aesenc	64(%r15), %xmm9
-        aesenc	64(%r15), %xmm10
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        aesenc	64(%r15), %xmm11
-        aesenc	64(%r15), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        aesenc	64(%r15), %xmm13
-        aesenc	64(%r15), %xmm14
-        aesenc	64(%r15), %xmm15
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	48(%rsp), %xmm7
-        movdqu	64(%rcx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        aesenc	80(%r15), %xmm8
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        aesenc	80(%r15), %xmm9
-        aesenc	80(%r15), %xmm10
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        aesenc	80(%r15), %xmm11
-        aesenc	80(%r15), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        aesenc	80(%r15), %xmm13
-        aesenc	80(%r15), %xmm14
-        aesenc	80(%r15), %xmm15
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	32(%rsp), %xmm7
-        movdqu	80(%rcx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        aesenc	96(%r15), %xmm8
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        aesenc	96(%r15), %xmm9
-        aesenc	96(%r15), %xmm10
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        aesenc	96(%r15), %xmm11
-        aesenc	96(%r15), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        aesenc	96(%r15), %xmm13
-        aesenc	96(%r15), %xmm14
-        aesenc	96(%r15), %xmm15
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	16(%rsp), %xmm7
-        movdqu	96(%rcx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        aesenc	112(%r15), %xmm8
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        aesenc	112(%r15), %xmm9
-        aesenc	112(%r15), %xmm10
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        aesenc	112(%r15), %xmm11
-        aesenc	112(%r15), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        aesenc	112(%r15), %xmm13
-        aesenc	112(%r15), %xmm14
-        aesenc	112(%r15), %xmm15
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	(%rsp), %xmm7
-        movdqu	112(%rcx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        aesenc	128(%r15), %xmm8
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        aesenc	128(%r15), %xmm9
-        aesenc	128(%r15), %xmm10
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        aesenc	128(%r15), %xmm11
-        aesenc	128(%r15), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        aesenc	128(%r15), %xmm13
-        aesenc	128(%r15), %xmm14
-        aesenc	128(%r15), %xmm15
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqa	%xmm1, %xmm5
-        psrldq	$8, %xmm1
-        pslldq	$8, %xmm5
-        aesenc	144(%r15), %xmm8
-        pxor	%xmm5, %xmm2
-        pxor	%xmm1, %xmm3
-        movdqa	%xmm2, %xmm7
-        movdqa	%xmm2, %xmm4
-        movdqa	%xmm2, %xmm5
-        aesenc	144(%r15), %xmm9
-        pslld	$31, %xmm7
-        pslld	$30, %xmm4
-        pslld	$25, %xmm5
-        aesenc	144(%r15), %xmm10
-        pxor	%xmm4, %xmm7
-        pxor	%xmm5, %xmm7
-        aesenc	144(%r15), %xmm11
-        movdqa	%xmm7, %xmm4
-        pslldq	$12, %xmm7
-        psrldq	$4, %xmm4
-        aesenc	144(%r15), %xmm12
-        pxor	%xmm7, %xmm2
-        movdqa	%xmm2, %xmm5
-        movdqa	%xmm2, %xmm1
-        movdqa	%xmm2, %xmm0
-        aesenc	144(%r15), %xmm13
-        psrld	$0x01, %xmm5
-        psrld	$2, %xmm1
-        psrld	$7, %xmm0
-        aesenc	144(%r15), %xmm14
-        pxor	%xmm1, %xmm5
-        pxor	%xmm0, %xmm5
-        aesenc	144(%r15), %xmm15
-        pxor	%xmm4, %xmm5
-        pxor	%xmm5, %xmm2
-        pxor	%xmm3, %xmm2
-        cmpl	$11, %r10d
-        movdqa	160(%r15), %xmm7
-        jl	L_AES_GCM_decrypt_aesenc_128_ghash_avx_done
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	176(%r15), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        cmpl	$13, %r10d
-        movdqa	192(%r15), %xmm7
-        jl	L_AES_GCM_decrypt_aesenc_128_ghash_avx_done
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	208(%r15), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	224(%r15), %xmm7
-L_AES_GCM_decrypt_aesenc_128_ghash_avx_done:
-        aesenclast	%xmm7, %xmm8
-        aesenclast	%xmm7, %xmm9
-        movdqu	(%rcx), %xmm0
-        movdqu	16(%rcx), %xmm1
-        pxor	%xmm0, %xmm8
-        pxor	%xmm1, %xmm9
-        movdqu	%xmm8, (%rdx)
-        movdqu	%xmm9, 16(%rdx)
-        aesenclast	%xmm7, %xmm10
-        aesenclast	%xmm7, %xmm11
-        movdqu	32(%rcx), %xmm0
-        movdqu	48(%rcx), %xmm1
-        pxor	%xmm0, %xmm10
-        pxor	%xmm1, %xmm11
-        movdqu	%xmm10, 32(%rdx)
-        movdqu	%xmm11, 48(%rdx)
-        aesenclast	%xmm7, %xmm12
-        aesenclast	%xmm7, %xmm13
-        movdqu	64(%rcx), %xmm0
-        movdqu	80(%rcx), %xmm1
-        pxor	%xmm0, %xmm12
-        pxor	%xmm1, %xmm13
-        movdqu	%xmm12, 64(%rdx)
-        movdqu	%xmm13, 80(%rdx)
-        aesenclast	%xmm7, %xmm14
-        aesenclast	%xmm7, %xmm15
-        movdqu	96(%rcx), %xmm0
-        movdqu	112(%rcx), %xmm1
-        pxor	%xmm0, %xmm14
-        pxor	%xmm1, %xmm15
-        movdqu	%xmm14, 96(%rdx)
-        movdqu	%xmm15, 112(%rdx)
-        addl	$0x80, %ebx
-        cmpl	%r13d, %ebx
-        jl	L_AES_GCM_decrypt_ghash_128
-        movdqa	%xmm2, %xmm6
-        movdqu	(%rsp), %xmm5
-L_AES_GCM_decrypt_done_128:
-        movl	%r9d, %edx
-        cmpl	%edx, %ebx
-        jge	L_AES_GCM_decrypt_done_dec
-        movl	%r9d, %r13d
-        andl	$0xfffffff0, %r13d
-        cmpl	%r13d, %ebx
-        jge	L_AES_GCM_decrypt_last_block_done
-L_AES_GCM_decrypt_last_block_start:
-        leaq	(%rdi,%rbx,1), %rcx
-        leaq	(%rsi,%rbx,1), %rdx
-        movdqu	(%rcx), %xmm1
-        movdqa	%xmm5, %xmm0
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm1
-        pxor	%xmm6, %xmm1
-        movdqu	128(%rsp), %xmm8
-        movdqa	%xmm8, %xmm9
-        pshufb	L_aes_gcm_bswap_epi64(%rip), %xmm8
-        paddd	L_aes_gcm_one(%rip), %xmm9
-        pxor	(%r15), %xmm8
-        movdqu	%xmm9, 128(%rsp)
-        movdqa	%xmm1, %xmm10
-        pclmulqdq	$16, %xmm0, %xmm10
-        aesenc	16(%r15), %xmm8
-        aesenc	32(%r15), %xmm8
-        movdqa	%xmm1, %xmm11
-        pclmulqdq	$0x01, %xmm0, %xmm11
-        aesenc	48(%r15), %xmm8
-        aesenc	64(%r15), %xmm8
-        movdqa	%xmm1, %xmm12
-        pclmulqdq	$0x00, %xmm0, %xmm12
-        aesenc	80(%r15), %xmm8
-        movdqa	%xmm1, %xmm1
-        pclmulqdq	$0x11, %xmm0, %xmm1
-        aesenc	96(%r15), %xmm8
-        pxor	%xmm11, %xmm10
-        movdqa	%xmm10, %xmm2
-        psrldq	$8, %xmm10
-        pslldq	$8, %xmm2
-        aesenc	112(%r15), %xmm8
-        movdqa	%xmm1, %xmm3
-        pxor	%xmm12, %xmm2
-        pxor	%xmm10, %xmm3
-        movdqa	L_aes_gcm_mod2_128(%rip), %xmm0
-        movdqa	%xmm2, %xmm11
-        pclmulqdq	$16, %xmm0, %xmm11
-        aesenc	128(%r15), %xmm8
-        pshufd	$0x4e, %xmm2, %xmm10
-        pxor	%xmm11, %xmm10
-        movdqa	%xmm10, %xmm11
-        pclmulqdq	$16, %xmm0, %xmm11
-        aesenc	144(%r15), %xmm8
-        pshufd	$0x4e, %xmm10, %xmm6
-        pxor	%xmm11, %xmm6
-        pxor	%xmm3, %xmm6
-        cmpl	$11, %r10d
-        movdqa	160(%r15), %xmm9
-        jl	L_AES_GCM_decrypt_aesenc_gfmul_last
-        aesenc	%xmm9, %xmm8
-        aesenc	176(%r15), %xmm8
-        cmpl	$13, %r10d
-        movdqa	192(%r15), %xmm9
-        jl	L_AES_GCM_decrypt_aesenc_gfmul_last
-        aesenc	%xmm9, %xmm8
-        aesenc	208(%r15), %xmm8
-        movdqa	224(%r15), %xmm9
-L_AES_GCM_decrypt_aesenc_gfmul_last:
-        aesenclast	%xmm9, %xmm8
-        movdqu	(%rcx), %xmm9
-        pxor	%xmm9, %xmm8
-        movdqu	%xmm8, (%rdx)
-        addl	$16, %ebx
-        cmpl	%r13d, %ebx
-        jl	L_AES_GCM_decrypt_last_block_start
-L_AES_GCM_decrypt_last_block_done:
-        movl	%r9d, %ecx
-        movl	%ecx, %edx
-        andl	$15, %ecx
-        jz	L_AES_GCM_decrypt_aesenc_last15_dec_avx_done
-        movdqu	128(%rsp), %xmm4
-        pshufb	L_aes_gcm_bswap_epi64(%rip), %xmm4
-        pxor	(%r15), %xmm4
-        aesenc	16(%r15), %xmm4
-        aesenc	32(%r15), %xmm4
-        aesenc	48(%r15), %xmm4
-        aesenc	64(%r15), %xmm4
-        aesenc	80(%r15), %xmm4
-        aesenc	96(%r15), %xmm4
-        aesenc	112(%r15), %xmm4
-        aesenc	128(%r15), %xmm4
-        aesenc	144(%r15), %xmm4
-        cmpl	$11, %r10d
-        movdqa	160(%r15), %xmm9
-        jl	L_AES_GCM_decrypt_aesenc_last15_dec_avx_aesenc_avx_last
-        aesenc	%xmm9, %xmm4
-        aesenc	176(%r15), %xmm4
-        cmpl	$13, %r10d
-        movdqa	192(%r15), %xmm9
-        jl	L_AES_GCM_decrypt_aesenc_last15_dec_avx_aesenc_avx_last
-        aesenc	%xmm9, %xmm4
-        aesenc	208(%r15), %xmm4
-        movdqa	224(%r15), %xmm9
-L_AES_GCM_decrypt_aesenc_last15_dec_avx_aesenc_avx_last:
-        aesenclast	%xmm9, %xmm4
-        subq	$32, %rsp
-        xorl	%ecx, %ecx
-        movdqu	%xmm4, (%rsp)
-        pxor	%xmm0, %xmm0
-        movdqu	%xmm0, 16(%rsp)
-L_AES_GCM_decrypt_aesenc_last15_dec_avx_loop:
-        movzbl	(%rdi,%rbx,1), %r13d
-        movb	%r13b, 16(%rsp,%rcx,1)
-        xorb	(%rsp,%rcx,1), %r13b
-        movb	%r13b, (%rsi,%rbx,1)
-        incl	%ebx
-        incl	%ecx
-        cmpl	%edx, %ebx
-        jl	L_AES_GCM_decrypt_aesenc_last15_dec_avx_loop
-        movdqu	16(%rsp), %xmm4
-        addq	$32, %rsp
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm4
-        pxor	%xmm4, %xmm6
-        pshufd	$0x4e, %xmm5, %xmm9
-        pshufd	$0x4e, %xmm6, %xmm10
-        movdqa	%xmm6, %xmm11
-        movdqa	%xmm6, %xmm8
-        pclmulqdq	$0x11, %xmm5, %xmm11
-        pclmulqdq	$0x00, %xmm5, %xmm8
-        pxor	%xmm5, %xmm9
-        pxor	%xmm6, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm6
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm6
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm6
-L_AES_GCM_decrypt_aesenc_last15_dec_avx_done:
-L_AES_GCM_decrypt_done_dec:
-        movl	%r9d, %edx
-        movl	%r11d, %ecx
-        shlq	$3, %rdx
-        shlq	$3, %rcx
-        pinsrq	$0x00, %rdx, %xmm0
-        pinsrq	$0x01, %rcx, %xmm0
-        pxor	%xmm0, %xmm6
-        pshufd	$0x4e, %xmm5, %xmm9
-        pshufd	$0x4e, %xmm6, %xmm10
-        movdqa	%xmm6, %xmm11
-        movdqa	%xmm6, %xmm8
-        pclmulqdq	$0x11, %xmm5, %xmm11
-        pclmulqdq	$0x00, %xmm5, %xmm8
-        pxor	%xmm5, %xmm9
-        pxor	%xmm6, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm6
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm6
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm6
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm6
-        movdqu	144(%rsp), %xmm0
-        pxor	%xmm6, %xmm0
-        cmpl	$16, %r14d
-        je	L_AES_GCM_decrypt_cmp_tag_16
-        subq	$16, %rsp
-        xorq	%rcx, %rcx
-        xorq	%rbx, %rbx
-        movdqu	%xmm0, (%rsp)
-L_AES_GCM_decrypt_cmp_tag_loop:
-        movzbl	(%rsp,%rcx,1), %r13d
-        xorb	(%r8,%rcx,1), %r13b
-        orb	%r13b, %bl
-        incl	%ecx
-        cmpl	%r14d, %ecx
-        jne	L_AES_GCM_decrypt_cmp_tag_loop
-        cmpb	$0x00, %bl
-        sete	%bl
-        addq	$16, %rsp
-        xorq	%rcx, %rcx
-        jmp	L_AES_GCM_decrypt_cmp_tag_done
-L_AES_GCM_decrypt_cmp_tag_16:
-        movdqu	(%r8), %xmm1
-        pcmpeqb	%xmm1, %xmm0
-        pmovmskb	%xmm0, %rdx
-        # if %edx == 0xFFFF then return 1, else return 0
-        xorl	%ebx, %ebx
-        cmpl	$0xffff, %edx
-        sete	%bl
-L_AES_GCM_decrypt_cmp_tag_done:
-        movl	%ebx, (%rbp)
-        addq	$0xa8, %rsp
-        popq	%rbp
-        popq	%r15
-        popq	%r14
-        popq	%rbx
-        popq	%r12
-        popq	%r13
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_decrypt,.-AES_GCM_decrypt
-#endif /* __APPLE__ */
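Annotation: the tail of AES_GCM_decrypt above compares the computed tag against the caller's tag in constant time — the partial-tag path OR-accumulates XOR differences byte by byte, and the 16-byte fast path uses pcmpeqb/pmovmskb and checks the mask against 0xFFFF. A minimal C sketch of the same check; the function name is illustrative, not wolfSSL's API:

    #include <stddef.h>

    /* Constant-time tag comparison: accumulate every byte difference
       into one value so timing does not leak the first mismatch. */
    static int gcm_tag_equal(const unsigned char *calc,
                             const unsigned char *tag, size_t tag_len)
    {
        unsigned char diff = 0;
        size_t i;

        for (i = 0; i < tag_len; i++)
            diff |= (unsigned char)(calc[i] ^ tag[i]);

        return diff == 0;   /* 1 only if every byte matched */
    }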
-#ifdef WOLFSSL_AESGCM_STREAM
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_init_aesni
-.type	AES_GCM_init_aesni,@function
-.align	16
-AES_GCM_init_aesni:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_init_aesni
-.p2align	4
-_AES_GCM_init_aesni:
-#endif /* __APPLE__ */
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        movq	%rdx, %r10
-        movl	%ecx, %r11d
-        movq	32(%rsp), %rax
-        subq	$16, %rsp
-        pxor	%xmm4, %xmm4
-        movl	%r11d, %edx
-        cmpl	$12, %edx
-        jne	L_AES_GCM_init_aesni_iv_not_12
-        # Calculate values when IV is 12 bytes
-        # Set counter based on IV
-        movl	$0x1000000, %ecx
-        pinsrq	$0x00, (%r10), %xmm4
-        pinsrd	$2, 8(%r10), %xmm4
-        pinsrd	$3, %ecx, %xmm4
-        # H = Encrypt X(=0) and T = Encrypt counter
-        movdqa	%xmm4, %xmm1
-        movdqa	(%rdi), %xmm5
-        pxor	%xmm5, %xmm1
-        movdqa	16(%rdi), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	32(%rdi), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	48(%rdi), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	64(%rdi), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	80(%rdi), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	96(%rdi), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	112(%rdi), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	128(%rdi), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	144(%rdi), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        cmpl	$11, %esi
-        movdqa	160(%rdi), %xmm7
-        jl	L_AES_GCM_init_aesni_calc_iv_12_last
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	176(%rdi), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        cmpl	$13, %esi
-        movdqa	192(%rdi), %xmm7
-        jl	L_AES_GCM_init_aesni_calc_iv_12_last
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	208(%rdi), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	224(%rdi), %xmm7
-L_AES_GCM_init_aesni_calc_iv_12_last:
-        aesenclast	%xmm7, %xmm5
-        aesenclast	%xmm7, %xmm1
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm5
-        movdqu	%xmm1, %xmm15
-        jmp	L_AES_GCM_init_aesni_iv_done
-L_AES_GCM_init_aesni_iv_not_12:
-        # Calculate values when IV is not 12 bytes
-        # H = Encrypt X(=0)
-        movdqa	(%rdi), %xmm5
-        aesenc	16(%rdi), %xmm5
-        aesenc	32(%rdi), %xmm5
-        aesenc	48(%rdi), %xmm5
-        aesenc	64(%rdi), %xmm5
-        aesenc	80(%rdi), %xmm5
-        aesenc	96(%rdi), %xmm5
-        aesenc	112(%rdi), %xmm5
-        aesenc	128(%rdi), %xmm5
-        aesenc	144(%rdi), %xmm5
-        cmpl	$11, %esi
-        movdqa	160(%rdi), %xmm9
-        jl	L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last
-        aesenc	%xmm9, %xmm5
-        aesenc	176(%rdi), %xmm5
-        cmpl	$13, %esi
-        movdqa	192(%rdi), %xmm9
-        jl	L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last
-        aesenc	%xmm9, %xmm5
-        aesenc	208(%rdi), %xmm5
-        movdqa	224(%rdi), %xmm9
-L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last:
-        aesenclast	%xmm9, %xmm5
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm5
-        # Calc counter
-        # Initialization vector
-        cmpl	$0x00, %edx
-        movq	$0x00, %rcx
-        je	L_AES_GCM_init_aesni_calc_iv_done
-        cmpl	$16, %edx
-        jl	L_AES_GCM_init_aesni_calc_iv_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_init_aesni_calc_iv_16_loop:
-        movdqu	(%r10,%rcx,1), %xmm8
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm8
-        pxor	%xmm8, %xmm4
-        pshufd	$0x4e, %xmm4, %xmm1
-        pshufd	$0x4e, %xmm5, %xmm2
-        movdqa	%xmm5, %xmm3
-        movdqa	%xmm5, %xmm0
-        pclmulqdq	$0x11, %xmm4, %xmm3
-        pclmulqdq	$0x00, %xmm4, %xmm0
-        pxor	%xmm4, %xmm1
-        pxor	%xmm5, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm0, %xmm7
-        movdqa	%xmm3, %xmm4
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm7
-        pxor	%xmm1, %xmm4
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm4, %xmm1
-        psrld	$31, %xmm0
-        psrld	$31, %xmm1
-        pslld	$0x01, %xmm7
-        pslld	$0x01, %xmm4
-        movdqa	%xmm0, %xmm2
-        pslldq	$4, %xmm0
-        psrldq	$12, %xmm2
-        pslldq	$4, %xmm1
-        por	%xmm2, %xmm4
-        por	%xmm0, %xmm7
-        por	%xmm1, %xmm4
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm7, %xmm1
-        movdqa	%xmm7, %xmm2
-        pslld	$31, %xmm0
-        pslld	$30, %xmm1
-        pslld	$25, %xmm2
-        pxor	%xmm1, %xmm0
-        pxor	%xmm2, %xmm0
-        movdqa	%xmm0, %xmm1
-        psrldq	$4, %xmm1
-        pslldq	$12, %xmm0
-        pxor	%xmm0, %xmm7
-        movdqa	%xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        psrld	$0x01, %xmm2
-        psrld	$2, %xmm3
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm2
-        pxor	%xmm0, %xmm2
-        pxor	%xmm1, %xmm2
-        pxor	%xmm7, %xmm2
-        pxor	%xmm2, %xmm4
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_init_aesni_calc_iv_16_loop
-        movl	%r11d, %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_init_aesni_calc_iv_done
-L_AES_GCM_init_aesni_calc_iv_lt16:
-        subq	$16, %rsp
-        pxor	%xmm8, %xmm8
-        xorl	%r13d, %r13d
-        movdqu	%xmm8, (%rsp)
-L_AES_GCM_init_aesni_calc_iv_loop:
-        movzbl	(%r10,%rcx,1), %r12d
-        movb	%r12b, (%rsp,%r13,1)
-        incl	%ecx
-        incl	%r13d
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_init_aesni_calc_iv_loop
-        movdqu	(%rsp), %xmm8
-        addq	$16, %rsp
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm8
-        pxor	%xmm8, %xmm4
-        pshufd	$0x4e, %xmm4, %xmm1
-        pshufd	$0x4e, %xmm5, %xmm2
-        movdqa	%xmm5, %xmm3
-        movdqa	%xmm5, %xmm0
-        pclmulqdq	$0x11, %xmm4, %xmm3
-        pclmulqdq	$0x00, %xmm4, %xmm0
-        pxor	%xmm4, %xmm1
-        pxor	%xmm5, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm0, %xmm7
-        movdqa	%xmm3, %xmm4
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm7
-        pxor	%xmm1, %xmm4
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm4, %xmm1
-        psrld	$31, %xmm0
-        psrld	$31, %xmm1
-        pslld	$0x01, %xmm7
-        pslld	$0x01, %xmm4
-        movdqa	%xmm0, %xmm2
-        pslldq	$4, %xmm0
-        psrldq	$12, %xmm2
-        pslldq	$4, %xmm1
-        por	%xmm2, %xmm4
-        por	%xmm0, %xmm7
-        por	%xmm1, %xmm4
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm7, %xmm1
-        movdqa	%xmm7, %xmm2
-        pslld	$31, %xmm0
-        pslld	$30, %xmm1
-        pslld	$25, %xmm2
-        pxor	%xmm1, %xmm0
-        pxor	%xmm2, %xmm0
-        movdqa	%xmm0, %xmm1
-        psrldq	$4, %xmm1
-        pslldq	$12, %xmm0
-        pxor	%xmm0, %xmm7
-        movdqa	%xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        psrld	$0x01, %xmm2
-        psrld	$2, %xmm3
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm2
-        pxor	%xmm0, %xmm2
-        pxor	%xmm1, %xmm2
-        pxor	%xmm7, %xmm2
-        pxor	%xmm2, %xmm4
-L_AES_GCM_init_aesni_calc_iv_done:
-        # T = Encrypt counter
-        pxor	%xmm0, %xmm0
-        shll	$3, %edx
-        pinsrq	$0x00, %rdx, %xmm0
-        pxor	%xmm0, %xmm4
-        pshufd	$0x4e, %xmm4, %xmm1
-        pshufd	$0x4e, %xmm5, %xmm2
-        movdqa	%xmm5, %xmm3
-        movdqa	%xmm5, %xmm0
-        pclmulqdq	$0x11, %xmm4, %xmm3
-        pclmulqdq	$0x00, %xmm4, %xmm0
-        pxor	%xmm4, %xmm1
-        pxor	%xmm5, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm0, %xmm7
-        movdqa	%xmm3, %xmm4
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm7
-        pxor	%xmm1, %xmm4
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm4, %xmm1
-        psrld	$31, %xmm0
-        psrld	$31, %xmm1
-        pslld	$0x01, %xmm7
-        pslld	$0x01, %xmm4
-        movdqa	%xmm0, %xmm2
-        pslldq	$4, %xmm0
-        psrldq	$12, %xmm2
-        pslldq	$4, %xmm1
-        por	%xmm2, %xmm4
-        por	%xmm0, %xmm7
-        por	%xmm1, %xmm4
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm7, %xmm1
-        movdqa	%xmm7, %xmm2
-        pslld	$31, %xmm0
-        pslld	$30, %xmm1
-        pslld	$25, %xmm2
-        pxor	%xmm1, %xmm0
-        pxor	%xmm2, %xmm0
-        movdqa	%xmm0, %xmm1
-        psrldq	$4, %xmm1
-        pslldq	$12, %xmm0
-        pxor	%xmm0, %xmm7
-        movdqa	%xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        psrld	$0x01, %xmm2
-        psrld	$2, %xmm3
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm2
-        pxor	%xmm0, %xmm2
-        pxor	%xmm1, %xmm2
-        pxor	%xmm7, %xmm2
-        pxor	%xmm2, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm4
-        #   Encrypt counter
-        movdqa	(%rdi), %xmm8
-        pxor	%xmm4, %xmm8
-        aesenc	16(%rdi), %xmm8
-        aesenc	32(%rdi), %xmm8
-        aesenc	48(%rdi), %xmm8
-        aesenc	64(%rdi), %xmm8
-        aesenc	80(%rdi), %xmm8
-        aesenc	96(%rdi), %xmm8
-        aesenc	112(%rdi), %xmm8
-        aesenc	128(%rdi), %xmm8
-        aesenc	144(%rdi), %xmm8
-        cmpl	$11, %esi
-        movdqa	160(%rdi), %xmm9
-        jl	L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last
-        aesenc	%xmm9, %xmm8
-        aesenc	176(%rdi), %xmm8
-        cmpl	$13, %esi
-        movdqa	192(%rdi), %xmm9
-        jl	L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last
-        aesenc	%xmm9, %xmm8
-        aesenc	208(%rdi), %xmm8
-        movdqa	224(%rdi), %xmm9
-L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last:
-        aesenclast	%xmm9, %xmm8
-        movdqu	%xmm8, %xmm15
-L_AES_GCM_init_aesni_iv_done:
-        movdqa	%xmm15, (%rax)
-        pshufb	L_aes_gcm_bswap_epi64(%rip), %xmm4
-        paddd	L_aes_gcm_one(%rip), %xmm4
-        movdqa	%xmm5, (%r8)
-        movdqa	%xmm4, (%r9)
-        addq	$16, %rsp
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_init_aesni,.-AES_GCM_init_aesni
-#endif /* __APPLE__ */
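Annotation: AES_GCM_init_aesni above derives H by encrypting the all-zero block and builds the pre-counter block J0. For the common 12-byte IV it appends a big-endian 32-bit counter of 1 (the `movl $0x1000000` / `pinsrd $3` pair); for any other IV length it GHASHes the IV followed by its bit length (the `shll $3, %edx` / `pinsrq` block). A sketch of the 12-byte case per NIST SP 800-38D, with an illustrative helper name:

    #include <string.h>

    /* J0 = IV || 0^31 || 1 when the IV is exactly 96 bits. */
    static void gcm_make_j0_iv12(unsigned char j0[16],
                                 const unsigned char iv[12])
    {
        memcpy(j0, iv, 12);   /* first 96 bits are the IV */
        j0[12] = 0x00;        /* 32-bit big-endian counter = 1 */
        j0[13] = 0x00;
        j0[14] = 0x00;
        j0[15] = 0x01;
    }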
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_aad_update_aesni
-.type	AES_GCM_aad_update_aesni,@function
-.align	16
-AES_GCM_aad_update_aesni:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_aad_update_aesni
-.p2align	4
-_AES_GCM_aad_update_aesni:
-#endif /* __APPLE__ */
-        movq	%rcx, %rax
-        movdqa	(%rdx), %xmm5
-        movdqa	(%rax), %xmm6
-        xorl	%ecx, %ecx
-L_AES_GCM_aad_update_aesni_16_loop:
-        movdqu	(%rdi,%rcx,1), %xmm8
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm8
-        pxor	%xmm8, %xmm5
-        pshufd	$0x4e, %xmm5, %xmm1
-        pshufd	$0x4e, %xmm6, %xmm2
-        movdqa	%xmm6, %xmm3
-        movdqa	%xmm6, %xmm0
-        pclmulqdq	$0x11, %xmm5, %xmm3
-        pclmulqdq	$0x00, %xmm5, %xmm0
-        pxor	%xmm5, %xmm1
-        pxor	%xmm6, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm0, %xmm4
-        movdqa	%xmm3, %xmm5
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm4
-        pxor	%xmm1, %xmm5
-        movdqa	%xmm4, %xmm0
-        movdqa	%xmm5, %xmm1
-        psrld	$31, %xmm0
-        psrld	$31, %xmm1
-        pslld	$0x01, %xmm4
-        pslld	$0x01, %xmm5
-        movdqa	%xmm0, %xmm2
-        pslldq	$4, %xmm0
-        psrldq	$12, %xmm2
-        pslldq	$4, %xmm1
-        por	%xmm2, %xmm5
-        por	%xmm0, %xmm4
-        por	%xmm1, %xmm5
-        movdqa	%xmm4, %xmm0
-        movdqa	%xmm4, %xmm1
-        movdqa	%xmm4, %xmm2
-        pslld	$31, %xmm0
-        pslld	$30, %xmm1
-        pslld	$25, %xmm2
-        pxor	%xmm1, %xmm0
-        pxor	%xmm2, %xmm0
-        movdqa	%xmm0, %xmm1
-        psrldq	$4, %xmm1
-        pslldq	$12, %xmm0
-        pxor	%xmm0, %xmm4
-        movdqa	%xmm4, %xmm2
-        movdqa	%xmm4, %xmm3
-        movdqa	%xmm4, %xmm0
-        psrld	$0x01, %xmm2
-        psrld	$2, %xmm3
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm2
-        pxor	%xmm0, %xmm2
-        pxor	%xmm1, %xmm2
-        pxor	%xmm4, %xmm2
-        pxor	%xmm2, %xmm5
-        addl	$16, %ecx
-        cmpl	%esi, %ecx
-        jl	L_AES_GCM_aad_update_aesni_16_loop
-        movdqa	%xmm5, (%rdx)
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_aad_update_aesni,.-AES_GCM_aad_update_aesni
-#endif /* __APPLE__ */
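Annotation: AES_GCM_aad_update_aesni above is one GHASH step per 16-byte AAD block: X = (X XOR block) * H in GF(2^128) with the GCM polynomial x^128 + x^7 + x^2 + x + 1. The assembly computes the product with pclmulqdq Karatsuba plus the shift-based reduction (the pslld 31/30/25 and psrld 1/2/7 runs). A straightforward bitwise C reference of the same math (illustrative, not constant time, not wolfSSL's API):

    #include <string.h>

    /* x = x * y in GF(2^128), GCM bit order (SP 800-38D, Alg. 1). */
    static void gf128_mul(unsigned char x[16], const unsigned char y[16])
    {
        unsigned char z[16] = {0};
        unsigned char v[16];
        int i, j;

        memcpy(v, x, 16);
        for (i = 0; i < 128; i++) {
            if ((y[i / 8] >> (7 - (i % 8))) & 1) {
                for (j = 0; j < 16; j++)
                    z[j] ^= v[j];       /* Z ^= V where bit i of Y is set */
            }
            /* V = V * x: shift right across bytes, reduce with
               R = 11100001 || 0^120 when a bit falls off the end. */
            {
                int carry = v[15] & 1;
                for (j = 15; j > 0; j--)
                    v[j] = (unsigned char)((v[j] >> 1) | (v[j - 1] << 7));
                v[0] >>= 1;
                if (carry)
                    v[0] ^= 0xe1;
            }
        }
        memcpy(x, z, 16);
    }

    /* One GHASH step, as in the loop above: X = (X ^ block) * H. */
    static void ghash_block(unsigned char x[16], const unsigned char h[16],
                            const unsigned char block[16])
    {
        int i;
        for (i = 0; i < 16; i++)
            x[i] ^= block[i];
        gf128_mul(x, h);
    }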
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_encrypt_block_aesni
-.type	AES_GCM_encrypt_block_aesni,@function
-.align	16
-AES_GCM_encrypt_block_aesni:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_encrypt_block_aesni
-.p2align	4
-_AES_GCM_encrypt_block_aesni:
-#endif /* __APPLE__ */
-        movq	%rdx, %r10
-        movq	%rcx, %r11
-        movdqu	(%r8), %xmm8
-        movdqa	%xmm8, %xmm9
-        pshufb	L_aes_gcm_bswap_epi64(%rip), %xmm8
-        paddd	L_aes_gcm_one(%rip), %xmm9
-        pxor	(%rdi), %xmm8
-        movdqu	%xmm9, (%r8)
-        aesenc	16(%rdi), %xmm8
-        aesenc	32(%rdi), %xmm8
-        aesenc	48(%rdi), %xmm8
-        aesenc	64(%rdi), %xmm8
-        aesenc	80(%rdi), %xmm8
-        aesenc	96(%rdi), %xmm8
-        aesenc	112(%rdi), %xmm8
-        aesenc	128(%rdi), %xmm8
-        aesenc	144(%rdi), %xmm8
-        cmpl	$11, %esi
-        movdqa	160(%rdi), %xmm9
-        jl	L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last
-        aesenc	%xmm9, %xmm8
-        aesenc	176(%rdi), %xmm8
-        cmpl	$13, %esi
-        movdqa	192(%rdi), %xmm9
-        jl	L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last
-        aesenc	%xmm9, %xmm8
-        aesenc	208(%rdi), %xmm8
-        movdqa	224(%rdi), %xmm9
-L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last:
-        aesenclast	%xmm9, %xmm8
-        movdqu	(%r11), %xmm9
-        pxor	%xmm9, %xmm8
-        movdqu	%xmm8, (%r10)
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm8
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_encrypt_block_aesni,.-AES_GCM_encrypt_block_aesni
-#endif /* __APPLE__ */
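Annotation: AES_GCM_encrypt_block_aesni above is one CTR step: encrypt the current counter with the AES-NI round sequence, XOR it into a 16-byte plaintext block, and advance the counter. GCM's inc32 increments only the rightmost 32 bits, big-endian; the assembly gets the same effect by byte-swapping with L_aes_gcm_bswap_epi64 and adding L_aes_gcm_one with paddd. A minimal C sketch of inc32 itself:

    /* inc32: bump the low 32 bits of the counter block, big-endian. */
    static void gcm_inc32(unsigned char ctr[16])
    {
        int i;
        for (i = 15; i >= 12; i--)
            if (++ctr[i] != 0)   /* stop once a byte does not wrap */
                break;
    }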
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_ghash_block_aesni
-.type	AES_GCM_ghash_block_aesni,@function
-.align	16
-AES_GCM_ghash_block_aesni:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_ghash_block_aesni
-.p2align	4
-_AES_GCM_ghash_block_aesni:
-#endif /* __APPLE__ */
-        movdqa	(%rsi), %xmm4
-        movdqa	(%rdx), %xmm5
-        movdqu	(%rdi), %xmm8
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm8
-        pxor	%xmm8, %xmm4
-        pshufd	$0x4e, %xmm4, %xmm1
-        pshufd	$0x4e, %xmm5, %xmm2
-        movdqa	%xmm5, %xmm3
-        movdqa	%xmm5, %xmm0
-        pclmulqdq	$0x11, %xmm4, %xmm3
-        pclmulqdq	$0x00, %xmm4, %xmm0
-        pxor	%xmm4, %xmm1
-        pxor	%xmm5, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm0, %xmm6
-        movdqa	%xmm3, %xmm4
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm6
-        pxor	%xmm1, %xmm4
-        movdqa	%xmm6, %xmm0
-        movdqa	%xmm4, %xmm1
-        psrld	$31, %xmm0
-        psrld	$31, %xmm1
-        pslld	$0x01, %xmm6
-        pslld	$0x01, %xmm4
-        movdqa	%xmm0, %xmm2
-        pslldq	$4, %xmm0
-        psrldq	$12, %xmm2
-        pslldq	$4, %xmm1
-        por	%xmm2, %xmm4
-        por	%xmm0, %xmm6
-        por	%xmm1, %xmm4
-        movdqa	%xmm6, %xmm0
-        movdqa	%xmm6, %xmm1
-        movdqa	%xmm6, %xmm2
-        pslld	$31, %xmm0
-        pslld	$30, %xmm1
-        pslld	$25, %xmm2
-        pxor	%xmm1, %xmm0
-        pxor	%xmm2, %xmm0
-        movdqa	%xmm0, %xmm1
-        psrldq	$4, %xmm1
-        pslldq	$12, %xmm0
-        pxor	%xmm0, %xmm6
-        movdqa	%xmm6, %xmm2
-        movdqa	%xmm6, %xmm3
-        movdqa	%xmm6, %xmm0
-        psrld	$0x01, %xmm2
-        psrld	$2, %xmm3
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm2
-        pxor	%xmm0, %xmm2
-        pxor	%xmm1, %xmm2
-        pxor	%xmm6, %xmm2
-        pxor	%xmm2, %xmm4
-        movdqa	%xmm4, (%rsi)
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_ghash_block_aesni,.-AES_GCM_ghash_block_aesni
-#endif /* __APPLE__ */
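Annotation: the streaming update function below precomputes H^1..H^8 (the "# H ^ n" comments) so eight GHASH steps fold into one pass that interleaves with eight parallel AES pipelines: for ciphertext blocks C1..C8, X' = (X ^ C1)*H^8 ^ C2*H^7 ^ ... ^ C8*H^1, which equals applying the single-block step eight times but with independent multiplies. An illustrative sketch, assuming the gf128_mul helper from the GHASH sketch above:

    #include <string.h>

    /* Eight-block GHASH fold; hpow[i] holds H^(i+1). */
    static void ghash_8blocks(unsigned char x[16],
                              const unsigned char hpow[8][16],
                              const unsigned char c[8][16])
    {
        unsigned char acc[16] = {0};
        unsigned char t[16];
        int i, j;

        for (i = 0; i < 8; i++) {
            memcpy(t, c[i], 16);
            if (i == 0)
                for (j = 0; j < 16; j++)
                    t[j] ^= x[j];        /* fold the running hash into C1 */
            gf128_mul(t, hpow[7 - i]);   /* C_{i+1} * H^(8-i) */
            for (j = 0; j < 16; j++)
                acc[j] ^= t[j];
        }
        memcpy(x, acc, 16);
    }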
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_encrypt_update_aesni
-.type	AES_GCM_encrypt_update_aesni,@function
-.align	16
-AES_GCM_encrypt_update_aesni:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_encrypt_update_aesni
-.p2align	4
-_AES_GCM_encrypt_update_aesni:
-#endif /* __APPLE__ */
-        pushq	%r13
-        pushq	%r12
-        pushq	%r14
-        movq	%rdx, %r10
-        movq	%rcx, %r11
-        movq	32(%rsp), %rax
-        movq	40(%rsp), %r12
-        subq	$0xa0, %rsp
-        movdqa	(%r9), %xmm6
-        movdqa	(%rax), %xmm5
-        movdqa	%xmm5, %xmm9
-        movdqa	%xmm5, %xmm8
-        psrlq	$63, %xmm9
-        psllq	$0x01, %xmm8
-        pslldq	$8, %xmm9
-        por	%xmm9, %xmm8
-        pshufd	$0xff, %xmm5, %xmm5
-        psrad	$31, %xmm5
-        pand	L_aes_gcm_mod2_128(%rip), %xmm5
-        pxor	%xmm8, %xmm5
-        xorq	%r14, %r14
-        cmpl	$0x80, %r8d
-        movl	%r8d, %r13d
-        jl	L_AES_GCM_encrypt_update_aesni_done_128
-        andl	$0xffffff80, %r13d
-        movdqa	%xmm6, %xmm2
-        # H ^ 1
-        movdqu	%xmm5, (%rsp)
-        # H ^ 2
-        pshufd	$0x4e, %xmm5, %xmm9
-        pshufd	$0x4e, %xmm5, %xmm10
-        movdqa	%xmm5, %xmm11
-        movdqa	%xmm5, %xmm8
-        pclmulqdq	$0x11, %xmm5, %xmm11
-        pclmulqdq	$0x00, %xmm5, %xmm8
-        pxor	%xmm5, %xmm9
-        pxor	%xmm5, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm0
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm0
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm0
-        movdqu	%xmm0, 16(%rsp)
-        # H ^ 3
-        pshufd	$0x4e, %xmm5, %xmm9
-        pshufd	$0x4e, %xmm0, %xmm10
-        movdqa	%xmm0, %xmm11
-        movdqa	%xmm0, %xmm8
-        pclmulqdq	$0x11, %xmm5, %xmm11
-        pclmulqdq	$0x00, %xmm5, %xmm8
-        pxor	%xmm5, %xmm9
-        pxor	%xmm0, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm1
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm1
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm1
-        movdqu	%xmm1, 32(%rsp)
-        # H ^ 4
-        pshufd	$0x4e, %xmm0, %xmm9
-        pshufd	$0x4e, %xmm0, %xmm10
-        movdqa	%xmm0, %xmm11
-        movdqa	%xmm0, %xmm8
-        pclmulqdq	$0x11, %xmm0, %xmm11
-        pclmulqdq	$0x00, %xmm0, %xmm8
-        pxor	%xmm0, %xmm9
-        pxor	%xmm0, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm3
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm3
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm3
-        movdqu	%xmm3, 48(%rsp)
-        # H ^ 5
-        pshufd	$0x4e, %xmm0, %xmm9
-        pshufd	$0x4e, %xmm1, %xmm10
-        movdqa	%xmm1, %xmm11
-        movdqa	%xmm1, %xmm8
-        pclmulqdq	$0x11, %xmm0, %xmm11
-        pclmulqdq	$0x00, %xmm0, %xmm8
-        pxor	%xmm0, %xmm9
-        pxor	%xmm1, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm7
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm7
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm7
-        movdqu	%xmm7, 64(%rsp)
-        # H ^ 6
-        pshufd	$0x4e, %xmm1, %xmm9
-        pshufd	$0x4e, %xmm1, %xmm10
-        movdqa	%xmm1, %xmm11
-        movdqa	%xmm1, %xmm8
-        pclmulqdq	$0x11, %xmm1, %xmm11
-        pclmulqdq	$0x00, %xmm1, %xmm8
-        pxor	%xmm1, %xmm9
-        pxor	%xmm1, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm7
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm7
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm7
-        movdqu	%xmm7, 80(%rsp)
-        # H ^ 7
-        pshufd	$0x4e, %xmm1, %xmm9
-        pshufd	$0x4e, %xmm3, %xmm10
-        movdqa	%xmm3, %xmm11
-        movdqa	%xmm3, %xmm8
-        pclmulqdq	$0x11, %xmm1, %xmm11
-        pclmulqdq	$0x00, %xmm1, %xmm8
-        pxor	%xmm1, %xmm9
-        pxor	%xmm3, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm7
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm7
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm7
-        movdqu	%xmm7, 96(%rsp)
-        # H ^ 8
-        pshufd	$0x4e, %xmm3, %xmm9
-        pshufd	$0x4e, %xmm3, %xmm10
-        movdqa	%xmm3, %xmm11
-        movdqa	%xmm3, %xmm8
-        pclmulqdq	$0x11, %xmm3, %xmm11
-        pclmulqdq	$0x00, %xmm3, %xmm8
-        pxor	%xmm3, %xmm9
-        pxor	%xmm3, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm7
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm7
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm7
-        movdqu	%xmm7, 112(%rsp)
-        # First 128 bytes of input
-        movdqu	(%r12), %xmm8
-        movdqa	L_aes_gcm_bswap_epi64(%rip), %xmm1
-        movdqa	%xmm8, %xmm0
-        pshufb	%xmm1, %xmm8
-        movdqa	%xmm0, %xmm9
-        paddd	L_aes_gcm_one(%rip), %xmm9
-        pshufb	%xmm1, %xmm9
-        movdqa	%xmm0, %xmm10
-        paddd	L_aes_gcm_two(%rip), %xmm10
-        pshufb	%xmm1, %xmm10
-        movdqa	%xmm0, %xmm11
-        paddd	L_aes_gcm_three(%rip), %xmm11
-        pshufb	%xmm1, %xmm11
-        movdqa	%xmm0, %xmm12
-        paddd	L_aes_gcm_four(%rip), %xmm12
-        pshufb	%xmm1, %xmm12
-        movdqa	%xmm0, %xmm13
-        paddd	L_aes_gcm_five(%rip), %xmm13
-        pshufb	%xmm1, %xmm13
-        movdqa	%xmm0, %xmm14
-        paddd	L_aes_gcm_six(%rip), %xmm14
-        pshufb	%xmm1, %xmm14
-        movdqa	%xmm0, %xmm15
-        paddd	L_aes_gcm_seven(%rip), %xmm15
-        pshufb	%xmm1, %xmm15
-        paddd	L_aes_gcm_eight(%rip), %xmm0
-        movdqa	(%rdi), %xmm7
-        movdqu	%xmm0, (%r12)
-        pxor	%xmm7, %xmm8
-        pxor	%xmm7, %xmm9
-        pxor	%xmm7, %xmm10
-        pxor	%xmm7, %xmm11
-        pxor	%xmm7, %xmm12
-        pxor	%xmm7, %xmm13
-        pxor	%xmm7, %xmm14
-        pxor	%xmm7, %xmm15
-        movdqa	16(%rdi), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	32(%rdi), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	48(%rdi), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	64(%rdi), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	80(%rdi), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	96(%rdi), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	112(%rdi), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	128(%rdi), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	144(%rdi), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        cmpl	$11, %esi
-        movdqa	160(%rdi), %xmm7
-        jl	L_AES_GCM_encrypt_update_aesni_enc_done
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	176(%rdi), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        cmpl	$13, %esi
-        movdqa	192(%rdi), %xmm7
-        jl	L_AES_GCM_encrypt_update_aesni_enc_done
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	208(%rdi), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	224(%rdi), %xmm7
-L_AES_GCM_encrypt_update_aesni_enc_done:
-        aesenclast	%xmm7, %xmm8
-        aesenclast	%xmm7, %xmm9
-        movdqu	(%r11), %xmm0
-        movdqu	16(%r11), %xmm1
-        pxor	%xmm0, %xmm8
-        pxor	%xmm1, %xmm9
-        movdqu	%xmm8, (%r10)
-        movdqu	%xmm9, 16(%r10)
-        aesenclast	%xmm7, %xmm10
-        aesenclast	%xmm7, %xmm11
-        movdqu	32(%r11), %xmm0
-        movdqu	48(%r11), %xmm1
-        pxor	%xmm0, %xmm10
-        pxor	%xmm1, %xmm11
-        movdqu	%xmm10, 32(%r10)
-        movdqu	%xmm11, 48(%r10)
-        aesenclast	%xmm7, %xmm12
-        aesenclast	%xmm7, %xmm13
-        movdqu	64(%r11), %xmm0
-        movdqu	80(%r11), %xmm1
-        pxor	%xmm0, %xmm12
-        pxor	%xmm1, %xmm13
-        movdqu	%xmm12, 64(%r10)
-        movdqu	%xmm13, 80(%r10)
-        aesenclast	%xmm7, %xmm14
-        aesenclast	%xmm7, %xmm15
-        movdqu	96(%r11), %xmm0
-        movdqu	112(%r11), %xmm1
-        pxor	%xmm0, %xmm14
-        pxor	%xmm1, %xmm15
-        movdqu	%xmm14, 96(%r10)
-        movdqu	%xmm15, 112(%r10)
-        cmpl	$0x80, %r13d
-        movl	$0x80, %r14d
-        jle	L_AES_GCM_encrypt_update_aesni_end_128
-        # Next 128 bytes of input
-L_AES_GCM_encrypt_update_aesni_ghash_128:
-        leaq	(%r11,%r14,1), %rcx
-        leaq	(%r10,%r14,1), %rdx
-        movdqu	(%r12), %xmm8
-        movdqa	L_aes_gcm_bswap_epi64(%rip), %xmm1
-        movdqa	%xmm8, %xmm0
-        pshufb	%xmm1, %xmm8
-        movdqa	%xmm0, %xmm9
-        paddd	L_aes_gcm_one(%rip), %xmm9
-        pshufb	%xmm1, %xmm9
-        movdqa	%xmm0, %xmm10
-        paddd	L_aes_gcm_two(%rip), %xmm10
-        pshufb	%xmm1, %xmm10
-        movdqa	%xmm0, %xmm11
-        paddd	L_aes_gcm_three(%rip), %xmm11
-        pshufb	%xmm1, %xmm11
-        movdqa	%xmm0, %xmm12
-        paddd	L_aes_gcm_four(%rip), %xmm12
-        pshufb	%xmm1, %xmm12
-        movdqa	%xmm0, %xmm13
-        paddd	L_aes_gcm_five(%rip), %xmm13
-        pshufb	%xmm1, %xmm13
-        movdqa	%xmm0, %xmm14
-        paddd	L_aes_gcm_six(%rip), %xmm14
-        pshufb	%xmm1, %xmm14
-        movdqa	%xmm0, %xmm15
-        paddd	L_aes_gcm_seven(%rip), %xmm15
-        pshufb	%xmm1, %xmm15
-        paddd	L_aes_gcm_eight(%rip), %xmm0
-        movdqa	(%rdi), %xmm7
-        movdqu	%xmm0, (%r12)
-        pxor	%xmm7, %xmm8
-        pxor	%xmm7, %xmm9
-        pxor	%xmm7, %xmm10
-        pxor	%xmm7, %xmm11
-        pxor	%xmm7, %xmm12
-        pxor	%xmm7, %xmm13
-        pxor	%xmm7, %xmm14
-        pxor	%xmm7, %xmm15
-        movdqu	112(%rsp), %xmm7
-        movdqu	-128(%rdx), %xmm0
-        aesenc	16(%rdi), %xmm8
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        pxor	%xmm2, %xmm0
-        pshufd	$0x4e, %xmm7, %xmm1
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm7, %xmm1
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm3
-        pclmulqdq	$0x11, %xmm7, %xmm3
-        aesenc	16(%rdi), %xmm9
-        aesenc	16(%rdi), %xmm10
-        movdqa	%xmm0, %xmm2
-        pclmulqdq	$0x00, %xmm7, %xmm2
-        aesenc	16(%rdi), %xmm11
-        aesenc	16(%rdi), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm1
-        aesenc	16(%rdi), %xmm13
-        aesenc	16(%rdi), %xmm14
-        aesenc	16(%rdi), %xmm15
-        pxor	%xmm2, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqu	96(%rsp), %xmm7
-        movdqu	-112(%rdx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        aesenc	32(%rdi), %xmm8
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        aesenc	32(%rdi), %xmm9
-        aesenc	32(%rdi), %xmm10
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        aesenc	32(%rdi), %xmm11
-        aesenc	32(%rdi), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        aesenc	32(%rdi), %xmm13
-        aesenc	32(%rdi), %xmm14
-        aesenc	32(%rdi), %xmm15
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	80(%rsp), %xmm7
-        movdqu	-96(%rdx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        aesenc	48(%rdi), %xmm8
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        aesenc	48(%rdi), %xmm9
-        aesenc	48(%rdi), %xmm10
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        aesenc	48(%rdi), %xmm11
-        aesenc	48(%rdi), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        aesenc	48(%rdi), %xmm13
-        aesenc	48(%rdi), %xmm14
-        aesenc	48(%rdi), %xmm15
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	64(%rsp), %xmm7
-        movdqu	-80(%rdx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        aesenc	64(%rdi), %xmm8
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        aesenc	64(%rdi), %xmm9
-        aesenc	64(%rdi), %xmm10
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        aesenc	64(%rdi), %xmm11
-        aesenc	64(%rdi), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        aesenc	64(%rdi), %xmm13
-        aesenc	64(%rdi), %xmm14
-        aesenc	64(%rdi), %xmm15
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	48(%rsp), %xmm7
-        movdqu	-64(%rdx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        aesenc	80(%rdi), %xmm8
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        aesenc	80(%rdi), %xmm9
-        aesenc	80(%rdi), %xmm10
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        aesenc	80(%rdi), %xmm11
-        aesenc	80(%rdi), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        aesenc	80(%rdi), %xmm13
-        aesenc	80(%rdi), %xmm14
-        aesenc	80(%rdi), %xmm15
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	32(%rsp), %xmm7
-        movdqu	-48(%rdx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        aesenc	96(%rdi), %xmm8
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        aesenc	96(%rdi), %xmm9
-        aesenc	96(%rdi), %xmm10
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        aesenc	96(%rdi), %xmm11
-        aesenc	96(%rdi), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        aesenc	96(%rdi), %xmm13
-        aesenc	96(%rdi), %xmm14
-        aesenc	96(%rdi), %xmm15
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	16(%rsp), %xmm7
-        movdqu	-32(%rdx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        aesenc	112(%rdi), %xmm8
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        aesenc	112(%rdi), %xmm9
-        aesenc	112(%rdi), %xmm10
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        aesenc	112(%rdi), %xmm11
-        aesenc	112(%rdi), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        aesenc	112(%rdi), %xmm13
-        aesenc	112(%rdi), %xmm14
-        aesenc	112(%rdi), %xmm15
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	(%rsp), %xmm7
-        movdqu	-16(%rdx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        aesenc	128(%rdi), %xmm8
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        aesenc	128(%rdi), %xmm9
-        aesenc	128(%rdi), %xmm10
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        aesenc	128(%rdi), %xmm11
-        aesenc	128(%rdi), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        aesenc	128(%rdi), %xmm13
-        aesenc	128(%rdi), %xmm14
-        aesenc	128(%rdi), %xmm15
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqa	%xmm1, %xmm5
-        psrldq	$8, %xmm1
-        pslldq	$8, %xmm5
-        aesenc	144(%rdi), %xmm8
-        pxor	%xmm5, %xmm2
-        pxor	%xmm1, %xmm3
-        movdqa	%xmm2, %xmm7
-        movdqa	%xmm2, %xmm4
-        movdqa	%xmm2, %xmm5
-        aesenc	144(%rdi), %xmm9
-        pslld	$31, %xmm7
-        pslld	$30, %xmm4
-        pslld	$25, %xmm5
-        aesenc	144(%rdi), %xmm10
-        pxor	%xmm4, %xmm7
-        pxor	%xmm5, %xmm7
-        aesenc	144(%rdi), %xmm11
-        movdqa	%xmm7, %xmm4
-        pslldq	$12, %xmm7
-        psrldq	$4, %xmm4
-        aesenc	144(%rdi), %xmm12
-        pxor	%xmm7, %xmm2
-        movdqa	%xmm2, %xmm5
-        movdqa	%xmm2, %xmm1
-        movdqa	%xmm2, %xmm0
-        aesenc	144(%rdi), %xmm13
-        psrld	$0x01, %xmm5
-        psrld	$2, %xmm1
-        psrld	$7, %xmm0
-        aesenc	144(%rdi), %xmm14
-        pxor	%xmm1, %xmm5
-        pxor	%xmm0, %xmm5
-        aesenc	144(%rdi), %xmm15
-        pxor	%xmm4, %xmm5
-        pxor	%xmm5, %xmm2
-        pxor	%xmm3, %xmm2
-        cmpl	$11, %esi
-        movdqa	160(%rdi), %xmm7
-        jl	L_AES_GCM_encrypt_update_aesni_aesenc_128_ghash_avx_done
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	176(%rdi), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        cmpl	$13, %esi
-        movdqa	192(%rdi), %xmm7
-        jl	L_AES_GCM_encrypt_update_aesni_aesenc_128_ghash_avx_done
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	208(%rdi), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	224(%rdi), %xmm7
-L_AES_GCM_encrypt_update_aesni_aesenc_128_ghash_avx_done:
-        aesenclast	%xmm7, %xmm8
-        aesenclast	%xmm7, %xmm9
-        movdqu	(%rcx), %xmm0
-        movdqu	16(%rcx), %xmm1
-        pxor	%xmm0, %xmm8
-        pxor	%xmm1, %xmm9
-        movdqu	%xmm8, (%rdx)
-        movdqu	%xmm9, 16(%rdx)
-        aesenclast	%xmm7, %xmm10
-        aesenclast	%xmm7, %xmm11
-        movdqu	32(%rcx), %xmm0
-        movdqu	48(%rcx), %xmm1
-        pxor	%xmm0, %xmm10
-        pxor	%xmm1, %xmm11
-        movdqu	%xmm10, 32(%rdx)
-        movdqu	%xmm11, 48(%rdx)
-        aesenclast	%xmm7, %xmm12
-        aesenclast	%xmm7, %xmm13
-        movdqu	64(%rcx), %xmm0
-        movdqu	80(%rcx), %xmm1
-        pxor	%xmm0, %xmm12
-        pxor	%xmm1, %xmm13
-        movdqu	%xmm12, 64(%rdx)
-        movdqu	%xmm13, 80(%rdx)
-        aesenclast	%xmm7, %xmm14
-        aesenclast	%xmm7, %xmm15
-        movdqu	96(%rcx), %xmm0
-        movdqu	112(%rcx), %xmm1
-        pxor	%xmm0, %xmm14
-        pxor	%xmm1, %xmm15
-        movdqu	%xmm14, 96(%rdx)
-        movdqu	%xmm15, 112(%rdx)
-        addl	$0x80, %r14d
-        cmpl	%r13d, %r14d
-        jl	L_AES_GCM_encrypt_update_aesni_ghash_128
-L_AES_GCM_encrypt_update_aesni_end_128:
-        movdqa	L_aes_gcm_bswap_mask(%rip), %xmm4
-        pshufb	%xmm4, %xmm8
-        pshufb	%xmm4, %xmm9
-        pshufb	%xmm4, %xmm10
-        pshufb	%xmm4, %xmm11
-        pxor	%xmm2, %xmm8
-        pshufb	%xmm4, %xmm12
-        pshufb	%xmm4, %xmm13
-        pshufb	%xmm4, %xmm14
-        pshufb	%xmm4, %xmm15
-        movdqu	112(%rsp), %xmm7
-        pshufd	$0x4e, %xmm8, %xmm1
-        pshufd	$0x4e, %xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        pclmulqdq	$0x11, %xmm8, %xmm3
-        pclmulqdq	$0x00, %xmm8, %xmm0
-        pxor	%xmm8, %xmm1
-        pxor	%xmm7, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm0, %xmm4
-        movdqa	%xmm3, %xmm6
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm4
-        pxor	%xmm1, %xmm6
-        movdqu	96(%rsp), %xmm7
-        pshufd	$0x4e, %xmm9, %xmm1
-        pshufd	$0x4e, %xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        pclmulqdq	$0x11, %xmm9, %xmm3
-        pclmulqdq	$0x00, %xmm9, %xmm0
-        pxor	%xmm9, %xmm1
-        pxor	%xmm7, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        pxor	%xmm0, %xmm4
-        pxor	%xmm3, %xmm6
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm4
-        pxor	%xmm1, %xmm6
-        movdqu	80(%rsp), %xmm7
-        pshufd	$0x4e, %xmm10, %xmm1
-        pshufd	$0x4e, %xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        pclmulqdq	$0x11, %xmm10, %xmm3
-        pclmulqdq	$0x00, %xmm10, %xmm0
-        pxor	%xmm10, %xmm1
-        pxor	%xmm7, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        pxor	%xmm0, %xmm4
-        pxor	%xmm3, %xmm6
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm4
-        pxor	%xmm1, %xmm6
-        movdqu	64(%rsp), %xmm7
-        pshufd	$0x4e, %xmm11, %xmm1
-        pshufd	$0x4e, %xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        pclmulqdq	$0x11, %xmm11, %xmm3
-        pclmulqdq	$0x00, %xmm11, %xmm0
-        pxor	%xmm11, %xmm1
-        pxor	%xmm7, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        pxor	%xmm0, %xmm4
-        pxor	%xmm3, %xmm6
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm4
-        pxor	%xmm1, %xmm6
-        movdqu	48(%rsp), %xmm7
-        pshufd	$0x4e, %xmm12, %xmm1
-        pshufd	$0x4e, %xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        pclmulqdq	$0x11, %xmm12, %xmm3
-        pclmulqdq	$0x00, %xmm12, %xmm0
-        pxor	%xmm12, %xmm1
-        pxor	%xmm7, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        pxor	%xmm0, %xmm4
-        pxor	%xmm3, %xmm6
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm4
-        pxor	%xmm1, %xmm6
-        movdqu	32(%rsp), %xmm7
-        pshufd	$0x4e, %xmm13, %xmm1
-        pshufd	$0x4e, %xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        pclmulqdq	$0x11, %xmm13, %xmm3
-        pclmulqdq	$0x00, %xmm13, %xmm0
-        pxor	%xmm13, %xmm1
-        pxor	%xmm7, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        pxor	%xmm0, %xmm4
-        pxor	%xmm3, %xmm6
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm4
-        pxor	%xmm1, %xmm6
-        movdqu	16(%rsp), %xmm7
-        pshufd	$0x4e, %xmm14, %xmm1
-        pshufd	$0x4e, %xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        pclmulqdq	$0x11, %xmm14, %xmm3
-        pclmulqdq	$0x00, %xmm14, %xmm0
-        pxor	%xmm14, %xmm1
-        pxor	%xmm7, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        pxor	%xmm0, %xmm4
-        pxor	%xmm3, %xmm6
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm4
-        pxor	%xmm1, %xmm6
-        movdqu	(%rsp), %xmm7
-        pshufd	$0x4e, %xmm15, %xmm1
-        pshufd	$0x4e, %xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        pclmulqdq	$0x11, %xmm15, %xmm3
-        pclmulqdq	$0x00, %xmm15, %xmm0
-        pxor	%xmm15, %xmm1
-        pxor	%xmm7, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        pxor	%xmm0, %xmm4
-        pxor	%xmm3, %xmm6
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm4
-        pxor	%xmm1, %xmm6
-        movdqa	%xmm4, %xmm0
-        movdqa	%xmm4, %xmm1
-        movdqa	%xmm4, %xmm2
-        pslld	$31, %xmm0
-        pslld	$30, %xmm1
-        pslld	$25, %xmm2
-        pxor	%xmm1, %xmm0
-        pxor	%xmm2, %xmm0
-        movdqa	%xmm0, %xmm1
-        psrldq	$4, %xmm1
-        pslldq	$12, %xmm0
-        pxor	%xmm0, %xmm4
-        movdqa	%xmm4, %xmm2
-        movdqa	%xmm4, %xmm3
-        movdqa	%xmm4, %xmm0
-        psrld	$0x01, %xmm2
-        psrld	$2, %xmm3
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm2
-        pxor	%xmm0, %xmm2
-        pxor	%xmm1, %xmm2
-        pxor	%xmm4, %xmm2
-        pxor	%xmm2, %xmm6
-        movdqu	(%rsp), %xmm5
-L_AES_GCM_encrypt_update_aesni_done_128:
-        movl	%r8d, %edx
-        cmpl	%edx, %r14d
-        jge	L_AES_GCM_encrypt_update_aesni_done_enc
-        movl	%r8d, %r13d
-        andl	$0xfffffff0, %r13d
-        cmpl	%r13d, %r14d
-        jge	L_AES_GCM_encrypt_update_aesni_last_block_done
-        leaq	(%r11,%r14,1), %rcx
-        leaq	(%r10,%r14,1), %rdx
-        movdqu	(%r12), %xmm8
-        movdqa	%xmm8, %xmm9
-        pshufb	L_aes_gcm_bswap_epi64(%rip), %xmm8
-        paddd	L_aes_gcm_one(%rip), %xmm9
-        pxor	(%rdi), %xmm8
-        movdqu	%xmm9, (%r12)
-        aesenc	16(%rdi), %xmm8
-        aesenc	32(%rdi), %xmm8
-        aesenc	48(%rdi), %xmm8
-        aesenc	64(%rdi), %xmm8
-        aesenc	80(%rdi), %xmm8
-        aesenc	96(%rdi), %xmm8
-        aesenc	112(%rdi), %xmm8
-        aesenc	128(%rdi), %xmm8
-        aesenc	144(%rdi), %xmm8
-        cmpl	$11, %esi
-        movdqa	160(%rdi), %xmm9
-        jl	L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last
-        aesenc	%xmm9, %xmm8
-        aesenc	176(%rdi), %xmm8
-        cmpl	$13, %esi
-        movdqa	192(%rdi), %xmm9
-        jl	L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last
-        aesenc	%xmm9, %xmm8
-        aesenc	208(%rdi), %xmm8
-        movdqa	224(%rdi), %xmm9
-L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last:
-        aesenclast	%xmm9, %xmm8
-        movdqu	(%rcx), %xmm9
-        pxor	%xmm9, %xmm8
-        movdqu	%xmm8, (%rdx)
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm8
-        pxor	%xmm8, %xmm6
-        addl	$16, %r14d
-        cmpl	%r13d, %r14d
-        jge	L_AES_GCM_encrypt_update_aesni_last_block_ghash
-L_AES_GCM_encrypt_update_aesni_last_block_start:
-        leaq	(%r11,%r14,1), %rcx
-        leaq	(%r10,%r14,1), %rdx
-        movdqu	(%r12), %xmm8
-        movdqa	%xmm8, %xmm9
-        pshufb	L_aes_gcm_bswap_epi64(%rip), %xmm8
-        paddd	L_aes_gcm_one(%rip), %xmm9
-        pxor	(%rdi), %xmm8
-        movdqu	%xmm9, (%r12)
-        movdqa	%xmm6, %xmm10
-        pclmulqdq	$16, %xmm5, %xmm10
-        aesenc	16(%rdi), %xmm8
-        aesenc	32(%rdi), %xmm8
-        movdqa	%xmm6, %xmm11
-        pclmulqdq	$0x01, %xmm5, %xmm11
-        aesenc	48(%rdi), %xmm8
-        aesenc	64(%rdi), %xmm8
-        movdqa	%xmm6, %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm12
-        aesenc	80(%rdi), %xmm8
-        movdqa	%xmm6, %xmm1
-        pclmulqdq	$0x11, %xmm5, %xmm1
-        aesenc	96(%rdi), %xmm8
-        pxor	%xmm11, %xmm10
-        movdqa	%xmm10, %xmm2
-        psrldq	$8, %xmm10
-        pslldq	$8, %xmm2
-        aesenc	112(%rdi), %xmm8
-        movdqa	%xmm1, %xmm3
-        pxor	%xmm12, %xmm2
-        pxor	%xmm10, %xmm3
-        movdqa	L_aes_gcm_mod2_128(%rip), %xmm0
-        movdqa	%xmm2, %xmm11
-        pclmulqdq	$16, %xmm0, %xmm11
-        aesenc	128(%rdi), %xmm8
-        pshufd	$0x4e, %xmm2, %xmm10
-        pxor	%xmm11, %xmm10
-        movdqa	%xmm10, %xmm11
-        pclmulqdq	$16, %xmm0, %xmm11
-        aesenc	144(%rdi), %xmm8
-        pshufd	$0x4e, %xmm10, %xmm6
-        pxor	%xmm11, %xmm6
-        pxor	%xmm3, %xmm6
-        cmpl	$11, %esi
-        movdqa	160(%rdi), %xmm9
-        jl	L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last
-        aesenc	%xmm9, %xmm8
-        aesenc	176(%rdi), %xmm8
-        cmpl	$13, %esi
-        movdqa	192(%rdi), %xmm9
-        jl	L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last
-        aesenc	%xmm9, %xmm8
-        aesenc	208(%rdi), %xmm8
-        movdqa	224(%rdi), %xmm9
-L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last:
-        aesenclast	%xmm9, %xmm8
-        movdqu	(%rcx), %xmm9
-        pxor	%xmm9, %xmm8
-        movdqu	%xmm8, (%rdx)
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm8
-        pxor	%xmm8, %xmm6
-        addl	$16, %r14d
-        cmpl	%r13d, %r14d
-        jl	L_AES_GCM_encrypt_update_aesni_last_block_start
-L_AES_GCM_encrypt_update_aesni_last_block_ghash:
-        pshufd	$0x4e, %xmm5, %xmm9
-        pshufd	$0x4e, %xmm6, %xmm10
-        movdqa	%xmm6, %xmm11
-        movdqa	%xmm6, %xmm8
-        pclmulqdq	$0x11, %xmm5, %xmm11
-        pclmulqdq	$0x00, %xmm5, %xmm8
-        pxor	%xmm5, %xmm9
-        pxor	%xmm6, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm6
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm6
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm6
-L_AES_GCM_encrypt_update_aesni_last_block_done:
-L_AES_GCM_encrypt_update_aesni_done_enc:
-        movdqa	%xmm6, (%r9)
-        addq	$0xa0, %rsp
-        popq	%r14
-        popq	%r12
-        popq	%r13
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_encrypt_update_aesni,.-AES_GCM_encrypt_update_aesni
-#endif /* __APPLE__ */
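The function above, AES_GCM_encrypt_update_aesni, is the streaming encrypt step: each pass of the ghash_128 loop encrypts eight counter blocks while folding the previous iteration's eight ciphertext blocks (loaded at negative offsets from the output pointer) into GHASH against the H^1..H^8 table kept on the stack, and the tail code then handles one 16-byte block at a time. Applications reach it through the streaming API built under WOLFSSL_AESGCM_STREAM; a minimal caller sketch (assuming the wc_AesGcmEncryptInit/Update/Final prototypes from wolfssl/wolfcrypt/aes.h; the chunking and buffer names are illustrative):

    #include <wolfssl/wolfcrypt/aes.h>

    /* Sketch: stream plaintext through AES-GCM so the 8-block
     * AES-NI path above is exercised for large inputs. */
    int encrypt_stream(const byte* key, word32 keySz,
                       const byte* iv, word32 ivSz,
                       const byte* aad, word32 aadSz,
                       const byte* in, byte* out, word32 sz,
                       byte* tag, word32 tagSz)
    {
        Aes aes;
        int ret = wc_AesInit(&aes, NULL, INVALID_DEVID);
        if (ret == 0)
            ret = wc_AesGcmEncryptInit(&aes, key, keySz, iv, ivSz);
        if (ret == 0)   /* AAD may be fed separately from data */
            ret = wc_AesGcmEncryptUpdate(&aes, NULL, NULL, 0, aad, aadSz);
        while (ret == 0 && sz > 0) {
            word32 chunk = sz > 4096 ? 4096 : sz;  /* any split works */
            ret = wc_AesGcmEncryptUpdate(&aes, out, in, chunk, NULL, 0);
            in += chunk; out += chunk; sz -= chunk;
        }
        if (ret == 0)
            ret = wc_AesGcmEncryptFinal(&aes, tag, tagSz);
        wc_AesFree(&aes);
        return ret;
    }

The chunk size does not change the result; the 128-byte fast path simply engages whenever at least eight blocks remain in a call.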
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_encrypt_final_aesni
-.type	AES_GCM_encrypt_final_aesni,@function
-.align	16
-AES_GCM_encrypt_final_aesni:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_encrypt_final_aesni
-.p2align	4
-_AES_GCM_encrypt_final_aesni:
-#endif /* __APPLE__ */
-        pushq	%r13
-        movl	%edx, %eax
-        movl	%ecx, %r10d
-        movl	%r8d, %r11d
-        movq	16(%rsp), %r8
-        subq	$16, %rsp
-        movdqa	(%rdi), %xmm4
-        movdqa	(%r9), %xmm5
-        movdqa	(%r8), %xmm6
-        movdqa	%xmm5, %xmm9
-        movdqa	%xmm5, %xmm8
-        psrlq	$63, %xmm9
-        psllq	$0x01, %xmm8
-        pslldq	$8, %xmm9
-        por	%xmm9, %xmm8
-        pshufd	$0xff, %xmm5, %xmm5
-        psrad	$31, %xmm5
-        pand	L_aes_gcm_mod2_128(%rip), %xmm5
-        pxor	%xmm8, %xmm5
-        movl	%r10d, %edx
-        movl	%r11d, %ecx
-        shlq	$3, %rdx
-        shlq	$3, %rcx
-        pinsrq	$0x00, %rdx, %xmm0
-        pinsrq	$0x01, %rcx, %xmm0
-        pxor	%xmm0, %xmm4
-        pshufd	$0x4e, %xmm5, %xmm9
-        pshufd	$0x4e, %xmm4, %xmm10
-        movdqa	%xmm4, %xmm11
-        movdqa	%xmm4, %xmm8
-        pclmulqdq	$0x11, %xmm5, %xmm11
-        pclmulqdq	$0x00, %xmm5, %xmm8
-        pxor	%xmm5, %xmm9
-        pxor	%xmm4, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm4
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm4
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm4
-        movdqu	%xmm6, %xmm0
-        pxor	%xmm4, %xmm0
-        cmpl	$16, %eax
-        je	L_AES_GCM_encrypt_final_aesni_store_tag_16
-        xorq	%rcx, %rcx
-        movdqu	%xmm0, (%rsp)
-L_AES_GCM_encrypt_final_aesni_store_tag_loop:
-        movzbl	(%rsp,%rcx,1), %r13d
-        movb	%r13b, (%rsi,%rcx,1)
-        incl	%ecx
-        cmpl	%eax, %ecx
-        jne	L_AES_GCM_encrypt_final_aesni_store_tag_loop
-        jmp	L_AES_GCM_encrypt_final_aesni_store_tag_done
-L_AES_GCM_encrypt_final_aesni_store_tag_16:
-        movdqu	%xmm0, (%rsi)
-L_AES_GCM_encrypt_final_aesni_store_tag_done:
-        addq	$16, %rsp
-        popq	%r13
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_encrypt_final_aesni,.-AES_GCM_encrypt_final_aesni
-#endif /* __APPLE__ */
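AES_GCM_encrypt_final_aesni produces the tag: it derives the doubled hash key, folds the 128-bit length block into GHASH (the two shlq $3 instructions convert the byte counts into bit counts before the pinsrq packing), byte-swaps the result and XORs it with the encrypted initial counter held at (%rdi); tags shorter than 16 bytes are copied out by the store_tag byte loop. A sketch of the length-block packing (gcm_len_block is a hypothetical helper; the counts are packed in the order the function receives them, and the little-endian packing matches the byte-reflected GHASH representation this code keeps, not the big-endian wire form in the GCM spec):

    #include <stdint.h>
    #include <string.h>

    /* Sketch: build the final GHASH input from the two byte counts,
     * mirroring the shlq $3 / pinsrq sequence above. */
    static void gcm_len_block(uint8_t block[16],
                              uint64_t lenA, uint64_t lenC)
    {
        uint64_t aBits = lenA << 3;      /* shlq $3: bytes -> bits */
        uint64_t cBits = lenC << 3;
        memcpy(block,     &aBits, 8);    /* pinsrq $0x00 */
        memcpy(block + 8, &cBits, 8);    /* pinsrq $0x01 */
    }

The block is XORed into the GHASH state, multiplied by H once more, and the byte-swapped product XORed with E_K(counter_0) is the tag.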
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_decrypt_update_aesni
-.type	AES_GCM_decrypt_update_aesni,@function
-.align	16
-AES_GCM_decrypt_update_aesni:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_decrypt_update_aesni
-.p2align	4
-_AES_GCM_decrypt_update_aesni:
-#endif /* __APPLE__ */
-        pushq	%r13
-        pushq	%r12
-        pushq	%r14
-        pushq	%r15
-        movq	%rdx, %r10
-        movq	%rcx, %r11
-        movq	40(%rsp), %rax
-        movq	48(%rsp), %r12
-        subq	$0xa8, %rsp
-        movdqa	(%r9), %xmm6
-        movdqa	(%rax), %xmm5
-        movdqa	%xmm5, %xmm9
-        movdqa	%xmm5, %xmm8
-        psrlq	$63, %xmm9
-        psllq	$0x01, %xmm8
-        pslldq	$8, %xmm9
-        por	%xmm9, %xmm8
-        pshufd	$0xff, %xmm5, %xmm5
-        psrad	$31, %xmm5
-        pand	L_aes_gcm_mod2_128(%rip), %xmm5
-        pxor	%xmm8, %xmm5
-        xorl	%r14d, %r14d
-        cmpl	$0x80, %r8d
-        movl	%r8d, %r13d
-        jl	L_AES_GCM_decrypt_update_aesni_done_128
-        andl	$0xffffff80, %r13d
-        movdqa	%xmm6, %xmm2
-        # H ^ 1
-        movdqu	%xmm5, (%rsp)
-        # H ^ 2
-        pshufd	$0x4e, %xmm5, %xmm9
-        pshufd	$0x4e, %xmm5, %xmm10
-        movdqa	%xmm5, %xmm11
-        movdqa	%xmm5, %xmm8
-        pclmulqdq	$0x11, %xmm5, %xmm11
-        pclmulqdq	$0x00, %xmm5, %xmm8
-        pxor	%xmm5, %xmm9
-        pxor	%xmm5, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm0
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm0
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm0
-        movdqu	%xmm0, 16(%rsp)
-        # H ^ 3
-        pshufd	$0x4e, %xmm5, %xmm9
-        pshufd	$0x4e, %xmm0, %xmm10
-        movdqa	%xmm0, %xmm11
-        movdqa	%xmm0, %xmm8
-        pclmulqdq	$0x11, %xmm5, %xmm11
-        pclmulqdq	$0x00, %xmm5, %xmm8
-        pxor	%xmm5, %xmm9
-        pxor	%xmm0, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm1
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm1
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm1
-        movdqu	%xmm1, 32(%rsp)
-        # H ^ 4
-        pshufd	$0x4e, %xmm0, %xmm9
-        pshufd	$0x4e, %xmm0, %xmm10
-        movdqa	%xmm0, %xmm11
-        movdqa	%xmm0, %xmm8
-        pclmulqdq	$0x11, %xmm0, %xmm11
-        pclmulqdq	$0x00, %xmm0, %xmm8
-        pxor	%xmm0, %xmm9
-        pxor	%xmm0, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm3
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm3
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm3
-        movdqu	%xmm3, 48(%rsp)
-        # H ^ 5
-        pshufd	$0x4e, %xmm0, %xmm9
-        pshufd	$0x4e, %xmm1, %xmm10
-        movdqa	%xmm1, %xmm11
-        movdqa	%xmm1, %xmm8
-        pclmulqdq	$0x11, %xmm0, %xmm11
-        pclmulqdq	$0x00, %xmm0, %xmm8
-        pxor	%xmm0, %xmm9
-        pxor	%xmm1, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm7
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm7
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm7
-        movdqu	%xmm7, 64(%rsp)
-        # H ^ 6
-        pshufd	$0x4e, %xmm1, %xmm9
-        pshufd	$0x4e, %xmm1, %xmm10
-        movdqa	%xmm1, %xmm11
-        movdqa	%xmm1, %xmm8
-        pclmulqdq	$0x11, %xmm1, %xmm11
-        pclmulqdq	$0x00, %xmm1, %xmm8
-        pxor	%xmm1, %xmm9
-        pxor	%xmm1, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm7
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm7
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm7
-        movdqu	%xmm7, 80(%rsp)
-        # H ^ 7
-        pshufd	$0x4e, %xmm1, %xmm9
-        pshufd	$0x4e, %xmm3, %xmm10
-        movdqa	%xmm3, %xmm11
-        movdqa	%xmm3, %xmm8
-        pclmulqdq	$0x11, %xmm1, %xmm11
-        pclmulqdq	$0x00, %xmm1, %xmm8
-        pxor	%xmm1, %xmm9
-        pxor	%xmm3, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm7
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm7
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm7
-        movdqu	%xmm7, 96(%rsp)
-        # H ^ 8
-        pshufd	$0x4e, %xmm3, %xmm9
-        pshufd	$0x4e, %xmm3, %xmm10
-        movdqa	%xmm3, %xmm11
-        movdqa	%xmm3, %xmm8
-        pclmulqdq	$0x11, %xmm3, %xmm11
-        pclmulqdq	$0x00, %xmm3, %xmm8
-        pxor	%xmm3, %xmm9
-        pxor	%xmm3, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm7
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm7
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm7
-        movdqu	%xmm7, 112(%rsp)
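The # H ^ 1 through # H ^ 8 sections above fill the 128-byte stack table with successive powers of the hash key, each a pclmulqdq Karatsuba multiply followed by the shift-based reduction. The table lets the ghash_128 loop below aggregate eight GHASH updates into a single reduction:

    X' = (X ⊕ C1)·H^8 ⊕ C2·H^7 ⊕ ... ⊕ C8·H^1   (operations in GF(2^128))

where C1..C8 are the iteration's eight byte-reflected ciphertext blocks; the block multiplied by H^8 from 112(%rsp) is the one that absorbs the running state X, which is why only the first multiply in the loop XORs in %xmm2.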
-L_AES_GCM_decrypt_update_aesni_ghash_128:
-        leaq	(%r11,%r14,1), %rcx
-        leaq	(%r10,%r14,1), %rdx
-        movdqu	(%r12), %xmm8
-        movdqa	L_aes_gcm_bswap_epi64(%rip), %xmm1
-        movdqa	%xmm8, %xmm0
-        pshufb	%xmm1, %xmm8
-        movdqa	%xmm0, %xmm9
-        paddd	L_aes_gcm_one(%rip), %xmm9
-        pshufb	%xmm1, %xmm9
-        movdqa	%xmm0, %xmm10
-        paddd	L_aes_gcm_two(%rip), %xmm10
-        pshufb	%xmm1, %xmm10
-        movdqa	%xmm0, %xmm11
-        paddd	L_aes_gcm_three(%rip), %xmm11
-        pshufb	%xmm1, %xmm11
-        movdqa	%xmm0, %xmm12
-        paddd	L_aes_gcm_four(%rip), %xmm12
-        pshufb	%xmm1, %xmm12
-        movdqa	%xmm0, %xmm13
-        paddd	L_aes_gcm_five(%rip), %xmm13
-        pshufb	%xmm1, %xmm13
-        movdqa	%xmm0, %xmm14
-        paddd	L_aes_gcm_six(%rip), %xmm14
-        pshufb	%xmm1, %xmm14
-        movdqa	%xmm0, %xmm15
-        paddd	L_aes_gcm_seven(%rip), %xmm15
-        pshufb	%xmm1, %xmm15
-        paddd	L_aes_gcm_eight(%rip), %xmm0
-        movdqa	(%rdi), %xmm7
-        movdqu	%xmm0, (%r12)
-        pxor	%xmm7, %xmm8
-        pxor	%xmm7, %xmm9
-        pxor	%xmm7, %xmm10
-        pxor	%xmm7, %xmm11
-        pxor	%xmm7, %xmm12
-        pxor	%xmm7, %xmm13
-        pxor	%xmm7, %xmm14
-        pxor	%xmm7, %xmm15
-        movdqu	112(%rsp), %xmm7
-        movdqu	(%rcx), %xmm0
-        aesenc	16(%rdi), %xmm8
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        pxor	%xmm2, %xmm0
-        pshufd	$0x4e, %xmm7, %xmm1
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm7, %xmm1
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm3
-        pclmulqdq	$0x11, %xmm7, %xmm3
-        aesenc	16(%rdi), %xmm9
-        aesenc	16(%rdi), %xmm10
-        movdqa	%xmm0, %xmm2
-        pclmulqdq	$0x00, %xmm7, %xmm2
-        aesenc	16(%rdi), %xmm11
-        aesenc	16(%rdi), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm1
-        aesenc	16(%rdi), %xmm13
-        aesenc	16(%rdi), %xmm14
-        aesenc	16(%rdi), %xmm15
-        pxor	%xmm2, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqu	96(%rsp), %xmm7
-        movdqu	16(%rcx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        aesenc	32(%rdi), %xmm8
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        aesenc	32(%rdi), %xmm9
-        aesenc	32(%rdi), %xmm10
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        aesenc	32(%rdi), %xmm11
-        aesenc	32(%rdi), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        aesenc	32(%rdi), %xmm13
-        aesenc	32(%rdi), %xmm14
-        aesenc	32(%rdi), %xmm15
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	80(%rsp), %xmm7
-        movdqu	32(%rcx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        aesenc	48(%rdi), %xmm8
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        aesenc	48(%rdi), %xmm9
-        aesenc	48(%rdi), %xmm10
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        aesenc	48(%rdi), %xmm11
-        aesenc	48(%rdi), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        aesenc	48(%rdi), %xmm13
-        aesenc	48(%rdi), %xmm14
-        aesenc	48(%rdi), %xmm15
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	64(%rsp), %xmm7
-        movdqu	48(%rcx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        aesenc	64(%rdi), %xmm8
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        aesenc	64(%rdi), %xmm9
-        aesenc	64(%rdi), %xmm10
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        aesenc	64(%rdi), %xmm11
-        aesenc	64(%rdi), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        aesenc	64(%rdi), %xmm13
-        aesenc	64(%rdi), %xmm14
-        aesenc	64(%rdi), %xmm15
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	48(%rsp), %xmm7
-        movdqu	64(%rcx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        aesenc	80(%rdi), %xmm8
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        aesenc	80(%rdi), %xmm9
-        aesenc	80(%rdi), %xmm10
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        aesenc	80(%rdi), %xmm11
-        aesenc	80(%rdi), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        aesenc	80(%rdi), %xmm13
-        aesenc	80(%rdi), %xmm14
-        aesenc	80(%rdi), %xmm15
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	32(%rsp), %xmm7
-        movdqu	80(%rcx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        aesenc	96(%rdi), %xmm8
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        aesenc	96(%rdi), %xmm9
-        aesenc	96(%rdi), %xmm10
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        aesenc	96(%rdi), %xmm11
-        aesenc	96(%rdi), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        aesenc	96(%rdi), %xmm13
-        aesenc	96(%rdi), %xmm14
-        aesenc	96(%rdi), %xmm15
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	16(%rsp), %xmm7
-        movdqu	96(%rcx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        aesenc	112(%rdi), %xmm8
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        aesenc	112(%rdi), %xmm9
-        aesenc	112(%rdi), %xmm10
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        aesenc	112(%rdi), %xmm11
-        aesenc	112(%rdi), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        aesenc	112(%rdi), %xmm13
-        aesenc	112(%rdi), %xmm14
-        aesenc	112(%rdi), %xmm15
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	(%rsp), %xmm7
-        movdqu	112(%rcx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm0
-        aesenc	128(%rdi), %xmm8
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        aesenc	128(%rdi), %xmm9
-        aesenc	128(%rdi), %xmm10
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        aesenc	128(%rdi), %xmm11
-        aesenc	128(%rdi), %xmm12
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        aesenc	128(%rdi), %xmm13
-        aesenc	128(%rdi), %xmm14
-        aesenc	128(%rdi), %xmm15
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqa	%xmm1, %xmm5
-        psrldq	$8, %xmm1
-        pslldq	$8, %xmm5
-        aesenc	144(%rdi), %xmm8
-        pxor	%xmm5, %xmm2
-        pxor	%xmm1, %xmm3
-        movdqa	%xmm2, %xmm7
-        movdqa	%xmm2, %xmm4
-        movdqa	%xmm2, %xmm5
-        aesenc	144(%rdi), %xmm9
-        pslld	$31, %xmm7
-        pslld	$30, %xmm4
-        pslld	$25, %xmm5
-        aesenc	144(%rdi), %xmm10
-        pxor	%xmm4, %xmm7
-        pxor	%xmm5, %xmm7
-        aesenc	144(%rdi), %xmm11
-        movdqa	%xmm7, %xmm4
-        pslldq	$12, %xmm7
-        psrldq	$4, %xmm4
-        aesenc	144(%rdi), %xmm12
-        pxor	%xmm7, %xmm2
-        movdqa	%xmm2, %xmm5
-        movdqa	%xmm2, %xmm1
-        movdqa	%xmm2, %xmm0
-        aesenc	144(%rdi), %xmm13
-        psrld	$0x01, %xmm5
-        psrld	$2, %xmm1
-        psrld	$7, %xmm0
-        aesenc	144(%rdi), %xmm14
-        pxor	%xmm1, %xmm5
-        pxor	%xmm0, %xmm5
-        aesenc	144(%rdi), %xmm15
-        pxor	%xmm4, %xmm5
-        pxor	%xmm5, %xmm2
-        pxor	%xmm3, %xmm2
-        cmpl	$11, %esi
-        movdqa	160(%rdi), %xmm7
-        jl	L_AES_GCM_decrypt_update_aesni_aesenc_128_ghash_avx_done
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	176(%rdi), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        cmpl	$13, %esi
-        movdqa	192(%rdi), %xmm7
-        jl	L_AES_GCM_decrypt_update_aesni_aesenc_128_ghash_avx_done
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	208(%rdi), %xmm7
-        aesenc	%xmm7, %xmm8
-        aesenc	%xmm7, %xmm9
-        aesenc	%xmm7, %xmm10
-        aesenc	%xmm7, %xmm11
-        aesenc	%xmm7, %xmm12
-        aesenc	%xmm7, %xmm13
-        aesenc	%xmm7, %xmm14
-        aesenc	%xmm7, %xmm15
-        movdqa	224(%rdi), %xmm7
-L_AES_GCM_decrypt_update_aesni_aesenc_128_ghash_avx_done:
-        aesenclast	%xmm7, %xmm8
-        aesenclast	%xmm7, %xmm9
-        movdqu	(%rcx), %xmm0
-        movdqu	16(%rcx), %xmm1
-        pxor	%xmm0, %xmm8
-        pxor	%xmm1, %xmm9
-        movdqu	%xmm8, (%rdx)
-        movdqu	%xmm9, 16(%rdx)
-        aesenclast	%xmm7, %xmm10
-        aesenclast	%xmm7, %xmm11
-        movdqu	32(%rcx), %xmm0
-        movdqu	48(%rcx), %xmm1
-        pxor	%xmm0, %xmm10
-        pxor	%xmm1, %xmm11
-        movdqu	%xmm10, 32(%rdx)
-        movdqu	%xmm11, 48(%rdx)
-        aesenclast	%xmm7, %xmm12
-        aesenclast	%xmm7, %xmm13
-        movdqu	64(%rcx), %xmm0
-        movdqu	80(%rcx), %xmm1
-        pxor	%xmm0, %xmm12
-        pxor	%xmm1, %xmm13
-        movdqu	%xmm12, 64(%rdx)
-        movdqu	%xmm13, 80(%rdx)
-        aesenclast	%xmm7, %xmm14
-        aesenclast	%xmm7, %xmm15
-        movdqu	96(%rcx), %xmm0
-        movdqu	112(%rcx), %xmm1
-        pxor	%xmm0, %xmm14
-        pxor	%xmm1, %xmm15
-        movdqu	%xmm14, 96(%rdx)
-        movdqu	%xmm15, 112(%rdx)
-        addl	$0x80, %r14d
-        cmpl	%r13d, %r14d
-        jl	L_AES_GCM_decrypt_update_aesni_ghash_128
-        movdqa	%xmm2, %xmm6
-        movdqu	(%rsp), %xmm5
-L_AES_GCM_decrypt_update_aesni_done_128:
-        movl	%r8d, %edx
-        cmpl	%edx, %r14d
-        jge	L_AES_GCM_decrypt_update_aesni_done_dec
-        movl	%r8d, %r13d
-        andl	$0xfffffff0, %r13d
-        cmpl	%r13d, %r14d
-        jge	L_AES_GCM_decrypt_update_aesni_last_block_done
-L_AES_GCM_decrypt_update_aesni_last_block_start:
-        leaq	(%r11,%r14,1), %rcx
-        leaq	(%r10,%r14,1), %rdx
-        movdqu	(%rcx), %xmm1
-        movdqa	%xmm5, %xmm0
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm1
-        pxor	%xmm6, %xmm1
-        movdqu	(%r12), %xmm8
-        movdqa	%xmm8, %xmm9
-        pshufb	L_aes_gcm_bswap_epi64(%rip), %xmm8
-        paddd	L_aes_gcm_one(%rip), %xmm9
-        pxor	(%rdi), %xmm8
-        movdqu	%xmm9, (%r12)
-        movdqa	%xmm1, %xmm10
-        pclmulqdq	$16, %xmm0, %xmm10
-        aesenc	16(%rdi), %xmm8
-        aesenc	32(%rdi), %xmm8
-        movdqa	%xmm1, %xmm11
-        pclmulqdq	$0x01, %xmm0, %xmm11
-        aesenc	48(%rdi), %xmm8
-        aesenc	64(%rdi), %xmm8
-        movdqa	%xmm1, %xmm12
-        pclmulqdq	$0x00, %xmm0, %xmm12
-        aesenc	80(%rdi), %xmm8
-        movdqa	%xmm1, %xmm1
-        pclmulqdq	$0x11, %xmm0, %xmm1
-        aesenc	96(%rdi), %xmm8
-        pxor	%xmm11, %xmm10
-        movdqa	%xmm10, %xmm2
-        psrldq	$8, %xmm10
-        pslldq	$8, %xmm2
-        aesenc	112(%rdi), %xmm8
-        movdqa	%xmm1, %xmm3
-        pxor	%xmm12, %xmm2
-        pxor	%xmm10, %xmm3
-        movdqa	L_aes_gcm_mod2_128(%rip), %xmm0
-        movdqa	%xmm2, %xmm11
-        pclmulqdq	$16, %xmm0, %xmm11
-        aesenc	128(%rdi), %xmm8
-        pshufd	$0x4e, %xmm2, %xmm10
-        pxor	%xmm11, %xmm10
-        movdqa	%xmm10, %xmm11
-        pclmulqdq	$16, %xmm0, %xmm11
-        aesenc	144(%rdi), %xmm8
-        pshufd	$0x4e, %xmm10, %xmm6
-        pxor	%xmm11, %xmm6
-        pxor	%xmm3, %xmm6
-        cmpl	$11, %esi
-        movdqa	160(%rdi), %xmm9
-        jl	L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last
-        aesenc	%xmm9, %xmm8
-        aesenc	176(%rdi), %xmm8
-        cmpl	$13, %esi
-        movdqa	192(%rdi), %xmm9
-        jl	L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last
-        aesenc	%xmm9, %xmm8
-        aesenc	208(%rdi), %xmm8
-        movdqa	224(%rdi), %xmm9
-L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last:
-        aesenclast	%xmm9, %xmm8
-        movdqu	(%rcx), %xmm9
-        pxor	%xmm9, %xmm8
-        movdqu	%xmm8, (%rdx)
-        addl	$16, %r14d
-        cmpl	%r13d, %r14d
-        jl	L_AES_GCM_decrypt_update_aesni_last_block_start
-L_AES_GCM_decrypt_update_aesni_last_block_done:
-L_AES_GCM_decrypt_update_aesni_done_dec:
-        movdqa	%xmm6, (%r9)
-        addq	$0xa8, %rsp
-        popq	%r15
-        popq	%r14
-        popq	%r12
-        popq	%r13
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_decrypt_update_aesni,.-AES_GCM_decrypt_update_aesni
-#endif /* __APPLE__ */
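AES_GCM_decrypt_update_aesni mirrors the encrypt version; the structural difference is that GHASH is taken over the incoming ciphertext (the movdqu (%rcx) loads feed the pclmulqdq multiplies directly), so the multiplies for an iteration's blocks overlap with the AES rounds that decrypt those same blocks, instead of trailing one iteration behind as on the encrypt side. A sketch of the decrypt half of the streaming API, under the same assumptions as the encrypt example above:

    /* Sketch: streaming decrypt; the tag is verified in Final,
     * which returns AES_GCM_AUTH_E on a mismatch. */
    int decrypt_stream(const byte* key, word32 keySz,
                       const byte* iv, word32 ivSz,
                       const byte* aad, word32 aadSz,
                       const byte* in, byte* out, word32 sz,
                       const byte* tag, word32 tagSz)
    {
        Aes aes;
        int ret = wc_AesInit(&aes, NULL, INVALID_DEVID);
        if (ret == 0)
            ret = wc_AesGcmDecryptInit(&aes, key, keySz, iv, ivSz);
        if (ret == 0)
            ret = wc_AesGcmDecryptUpdate(&aes, out, in, sz, aad, aadSz);
        if (ret == 0)
            ret = wc_AesGcmDecryptFinal(&aes, tag, tagSz);
        wc_AesFree(&aes);
        return ret;
    }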
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_decrypt_final_aesni
-.type	AES_GCM_decrypt_final_aesni,@function
-.align	16
-AES_GCM_decrypt_final_aesni:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_decrypt_final_aesni
-.p2align	4
-_AES_GCM_decrypt_final_aesni:
-#endif /* __APPLE__ */
-        pushq	%r13
-        pushq	%rbp
-        pushq	%r12
-        movl	%edx, %eax
-        movl	%ecx, %r10d
-        movl	%r8d, %r11d
-        movq	32(%rsp), %r8
-        movq	40(%rsp), %rbp
-        subq	$16, %rsp
-        movdqa	(%rdi), %xmm6
-        movdqa	(%r9), %xmm5
-        movdqa	(%r8), %xmm15
-        movdqa	%xmm5, %xmm9
-        movdqa	%xmm5, %xmm8
-        psrlq	$63, %xmm9
-        psllq	$0x01, %xmm8
-        pslldq	$8, %xmm9
-        por	%xmm9, %xmm8
-        pshufd	$0xff, %xmm5, %xmm5
-        psrad	$31, %xmm5
-        pand	L_aes_gcm_mod2_128(%rip), %xmm5
-        pxor	%xmm8, %xmm5
-        movl	%r10d, %edx
-        movl	%r11d, %ecx
-        shlq	$3, %rdx
-        shlq	$3, %rcx
-        pinsrq	$0x00, %rdx, %xmm0
-        pinsrq	$0x01, %rcx, %xmm0
-        pxor	%xmm0, %xmm6
-        pshufd	$0x4e, %xmm5, %xmm9
-        pshufd	$0x4e, %xmm6, %xmm10
-        movdqa	%xmm6, %xmm11
-        movdqa	%xmm6, %xmm8
-        pclmulqdq	$0x11, %xmm5, %xmm11
-        pclmulqdq	$0x00, %xmm5, %xmm8
-        pxor	%xmm5, %xmm9
-        pxor	%xmm6, %xmm10
-        pclmulqdq	$0x00, %xmm10, %xmm9
-        pxor	%xmm8, %xmm9
-        pxor	%xmm11, %xmm9
-        movdqa	%xmm9, %xmm10
-        movdqa	%xmm11, %xmm6
-        pslldq	$8, %xmm10
-        psrldq	$8, %xmm9
-        pxor	%xmm10, %xmm8
-        pxor	%xmm9, %xmm6
-        movdqa	%xmm8, %xmm12
-        movdqa	%xmm8, %xmm13
-        movdqa	%xmm8, %xmm14
-        pslld	$31, %xmm12
-        pslld	$30, %xmm13
-        pslld	$25, %xmm14
-        pxor	%xmm13, %xmm12
-        pxor	%xmm14, %xmm12
-        movdqa	%xmm12, %xmm13
-        psrldq	$4, %xmm13
-        pslldq	$12, %xmm12
-        pxor	%xmm12, %xmm8
-        movdqa	%xmm8, %xmm14
-        movdqa	%xmm8, %xmm10
-        movdqa	%xmm8, %xmm9
-        psrld	$0x01, %xmm14
-        psrld	$2, %xmm10
-        psrld	$7, %xmm9
-        pxor	%xmm10, %xmm14
-        pxor	%xmm9, %xmm14
-        pxor	%xmm13, %xmm14
-        pxor	%xmm8, %xmm14
-        pxor	%xmm14, %xmm6
-        pshufb	L_aes_gcm_bswap_mask(%rip), %xmm6
-        movdqu	%xmm15, %xmm0
-        pxor	%xmm6, %xmm0
-        cmpl	$16, %eax
-        je	L_AES_GCM_decrypt_final_aesni_cmp_tag_16
-        subq	$16, %rsp
-        xorq	%rcx, %rcx
-        xorq	%r12, %r12
-        movdqu	%xmm0, (%rsp)
-L_AES_GCM_decrypt_final_aesni_cmp_tag_loop:
-        movzbl	(%rsp,%rcx,1), %r13d
-        xorb	(%rsi,%rcx,1), %r13b
-        orb	%r13b, %r12b
-        incl	%ecx
-        cmpl	%eax, %ecx
-        jne	L_AES_GCM_decrypt_final_aesni_cmp_tag_loop
-        cmpb	$0x00, %r12b
-        sete	%r12b
-        addq	$16, %rsp
-        xorq	%rcx, %rcx
-        jmp	L_AES_GCM_decrypt_final_aesni_cmp_tag_done
-L_AES_GCM_decrypt_final_aesni_cmp_tag_16:
-        movdqu	(%rsi), %xmm1
-        pcmpeqb	%xmm1, %xmm0
-        pmovmskb	%xmm0, %rdx
-        # if %edx == 0xFFFF then return 1, else return 0
-        xorl	%r12d, %r12d
-        cmpl	$0xffff, %edx
-        sete	%r12b
-L_AES_GCM_decrypt_final_aesni_cmp_tag_done:
-        movl	%r12d, (%rbp)
-        addq	$16, %rsp
-        popq	%r12
-        popq	%rbp
-        popq	%r13
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_decrypt_final_aesni,.-AES_GCM_decrypt_final_aesni
-#endif /* __APPLE__ */
-#endif /* WOLFSSL_AESGCM_STREAM */
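AES_GCM_decrypt_final_aesni performs the tag check in constant time: the 16-byte path compares with pcmpeqb/pmovmskb and a single test against 0xFFFF, and the short-tag loop XORs each byte pair and ORs the differences into %r12b so that only the final accumulator is branched on. The same pattern in C (a sketch of the technique, not wolfSSL's exact code):

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch: accumulate all byte differences with OR and decide
     * once at the end, as the cmp_tag_loop above does. */
    static int ct_tag_equal(const uint8_t* a, const uint8_t* b, size_t n)
    {
        uint8_t diff = 0;
        for (size_t i = 0; i < n; i++)
            diff |= (uint8_t)(a[i] ^ b[i]);  /* xorb + orb */
        return diff == 0;  /* single data-dependent test */
    }

The result (1 on match, 0 otherwise) is written through the result pointer rather than returned, matching the movl %r12d, (%rbp) above.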
-#ifdef HAVE_INTEL_AVX1
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_avx1_aes_gcm_one:
-.quad	0x0, 0x1
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_avx1_aes_gcm_two:
-.quad	0x0, 0x2
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_avx1_aes_gcm_three:
-.quad	0x0, 0x3
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_avx1_aes_gcm_four:
-.quad	0x0, 0x4
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_avx1_aes_gcm_five:
-.quad	0x0, 0x5
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_avx1_aes_gcm_six:
-.quad	0x0, 0x6
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_avx1_aes_gcm_seven:
-.quad	0x0, 0x7
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_avx1_aes_gcm_eight:
-.quad	0x0, 0x8
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_avx1_aes_gcm_bswap_epi64:
-.quad	0x1020304050607, 0x8090a0b0c0d0e0f
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_avx1_aes_gcm_bswap_mask:
-.quad	0x8090a0b0c0d0e0f, 0x1020304050607
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_avx1_aes_gcm_mod2_128:
-.quad	0x1, 0xc200000000000000
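This data block repeats, under L_avx1_ names, the constants used by the AES-NI code above: the one..eight quad pairs are the counter increments for the interleaved blocks, the two bswap quads are byte-reversal masks for the counter (epi64) and GHASH (full 16-byte) lanes, and mod2_128 holds GCM's reduction constant in the bit-reflected representation this code works in. The psrlq $63 / psllq $0x01 / pslldq $8 / por / pshufd $0xff / psrad $31 / pand / pxor prologue in each function doubles the hash key modulo that polynomial without branching; the same operation written plainly in C (a sketch; the assembly replaces the if with a sign-extended mask):

    #include <stdint.h>

    /* Sketch: double a 128-bit GHASH value (h[1] is the high
     * quadword) modulo the reflected GCM polynomial. */
    static void gf128_double(uint64_t h[2])
    {
        uint64_t carry = h[1] >> 63;           /* bit shifted out */
        h[1] = (h[1] << 1) | (h[0] >> 63);     /* 128-bit shift left */
        h[0] <<= 1;
        if (carry) {                           /* conditional reduction */
            h[0] ^= 0x0000000000000001ULL;     /* low quad of mod2_128 */
            h[1] ^= 0xc200000000000000ULL;     /* high quad of mod2_128 */
        }
    }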
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_encrypt_avx1
-.type	AES_GCM_encrypt_avx1,@function
-.align	16
-AES_GCM_encrypt_avx1:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_encrypt_avx1
-.p2align	4
-_AES_GCM_encrypt_avx1:
-#endif /* __APPLE__ */
-        pushq	%r13
-        pushq	%r12
-        pushq	%rbx
-        pushq	%r14
-        pushq	%r15
-        movq	%rdx, %r12
-        movq	%rcx, %rax
-        movl	48(%rsp), %r11d
-        movl	56(%rsp), %ebx
-        movl	64(%rsp), %r14d
-        movq	72(%rsp), %r15
-        movl	80(%rsp), %r10d
-        subq	$0xa0, %rsp
-        vpxor	%xmm4, %xmm4, %xmm4
-        vpxor	%xmm6, %xmm6, %xmm6
-        movl	%ebx, %edx
-        cmpl	$12, %edx
-        jne	L_AES_GCM_encrypt_avx1_iv_not_12
-        # Calculate values when IV is 12 bytes
-        # Set counter based on IV
-        movl	$0x1000000, %ecx
-        vmovq	(%rax), %xmm4
-        vpinsrd	$2, 8(%rax), %xmm4, %xmm4
-        vpinsrd	$3, %ecx, %xmm4, %xmm4
-        # H = Encrypt X(=0) and T = Encrypt counter
-        vmovdqa	(%r15), %xmm5
-        vpxor	%xmm5, %xmm4, %xmm1
-        vmovdqa	16(%r15), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	32(%r15), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	48(%r15), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	64(%r15), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	80(%r15), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	96(%r15), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	112(%r15), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	128(%r15), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	144(%r15), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        cmpl	$11, %r10d
-        vmovdqa	160(%r15), %xmm7
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_12_last
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	176(%r15), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        cmpl	$13, %r10d
-        vmovdqa	192(%r15), %xmm7
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_12_last
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	208(%r15), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	224(%r15), %xmm7
-L_AES_GCM_encrypt_avx1_calc_iv_12_last:
-        vaesenclast	%xmm7, %xmm5, %xmm5
-        vaesenclast	%xmm7, %xmm1, %xmm1
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5
-        vmovdqu	%xmm1, 144(%rsp)
-        jmp	L_AES_GCM_encrypt_avx1_iv_done
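In the 12-byte-IV branch above, the counter is built directly rather than by hashing: the vmovq/vpinsrd loads place the IV in bytes 0-11 of %xmm4 and the 0x1000000 constant puts a big-endian 1 in the last four bytes, giving J0 = IV || 0x00000001 as GCM specifies; the zero block (for H) and J0 (for the tag mask) are then encrypted in one interleaved pass. In C (gcm_j0_12bytes is a hypothetical helper):

    #include <stdint.h>
    #include <string.h>

    /* Sketch: initial counter block for the 12-byte-IV case. */
    static void gcm_j0_12bytes(uint8_t j0[16], const uint8_t iv[12])
    {
        memcpy(j0, iv, 12);
        j0[12] = 0x00;  /* the movl $0x1000000, %ecx constant:   */
        j0[13] = 0x00;  /* stored little-endian, it reads as the */
        j0[14] = 0x00;  /* big-endian 32-bit value 1 in memory   */
        j0[15] = 0x01;
    }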
-L_AES_GCM_encrypt_avx1_iv_not_12:
-        # Calculate values when IV is not 12 bytes
-        # H = Encrypt X(=0)
-        vmovdqa	(%r15), %xmm5
-        vaesenc	16(%r15), %xmm5, %xmm5
-        vaesenc	32(%r15), %xmm5, %xmm5
-        vaesenc	48(%r15), %xmm5, %xmm5
-        vaesenc	64(%r15), %xmm5, %xmm5
-        vaesenc	80(%r15), %xmm5, %xmm5
-        vaesenc	96(%r15), %xmm5, %xmm5
-        vaesenc	112(%r15), %xmm5, %xmm5
-        vaesenc	128(%r15), %xmm5, %xmm5
-        vaesenc	144(%r15), %xmm5, %xmm5
-        cmpl	$11, %r10d
-        vmovdqa	160(%r15), %xmm9
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last
-        vaesenc	%xmm9, %xmm5, %xmm5
-        vaesenc	176(%r15), %xmm5, %xmm5
-        cmpl	$13, %r10d
-        vmovdqa	192(%r15), %xmm9
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last
-        vaesenc	%xmm9, %xmm5, %xmm5
-        vaesenc	208(%r15), %xmm5, %xmm5
-        vmovdqa	224(%r15), %xmm9
-L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last:
-        vaesenclast	%xmm9, %xmm5, %xmm5
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5
-        # Calc counter
-        # Initialization vector
-        cmpl	$0x00, %edx
-        movq	$0x00, %rcx
-        je	L_AES_GCM_encrypt_avx1_calc_iv_done
-        cmpl	$16, %edx
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_encrypt_avx1_calc_iv_16_loop:
-        vmovdqu	(%rax,%rcx,1), %xmm8
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8
-        vpxor	%xmm8, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpshufd	$0x4e, %xmm5, %xmm2
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqa	%xmm0, %xmm7
-        vmovdqa	%xmm3, %xmm4
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm7, %xmm7
-        vpxor	%xmm1, %xmm4, %xmm4
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm4, %xmm4
-        vpslld	$31, %xmm7, %xmm0
-        vpslld	$30, %xmm7, %xmm1
-        vpslld	$25, %xmm7, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vmovdqa	%xmm0, %xmm1
-        vpsrldq	$4, %xmm1, %xmm1
-        vpslldq	$12, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm7, %xmm7
-        vpsrld	$0x01, %xmm7, %xmm2
-        vpsrld	$2, %xmm7, %xmm3
-        vpsrld	$7, %xmm7, %xmm0
-        vpxor	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm0, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm4, %xmm4
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_16_loop
-        movl	%ebx, %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_encrypt_avx1_calc_iv_done
-L_AES_GCM_encrypt_avx1_calc_iv_lt16:
-        subq	$16, %rsp
-        vpxor	%xmm8, %xmm8, %xmm8
-        xorl	%ebx, %ebx
-        vmovdqu	%xmm8, (%rsp)
-L_AES_GCM_encrypt_avx1_calc_iv_loop:
-        movzbl	(%rax,%rcx,1), %r13d
-        movb	%r13b, (%rsp,%rbx,1)
-        incl	%ecx
-        incl	%ebx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_loop
-        vmovdqu	(%rsp), %xmm8
-        addq	$16, %rsp
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8
-        vpxor	%xmm8, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpshufd	$0x4e, %xmm5, %xmm2
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqa	%xmm0, %xmm7
-        vmovdqa	%xmm3, %xmm4
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm7, %xmm7
-        vpxor	%xmm1, %xmm4, %xmm4
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm4, %xmm4
-        vpslld	$31, %xmm7, %xmm0
-        vpslld	$30, %xmm7, %xmm1
-        vpslld	$25, %xmm7, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vmovdqa	%xmm0, %xmm1
-        vpsrldq	$4, %xmm1, %xmm1
-        vpslldq	$12, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm7, %xmm7
-        vpsrld	$0x01, %xmm7, %xmm2
-        vpsrld	$2, %xmm7, %xmm3
-        vpsrld	$7, %xmm7, %xmm0
-        vpxor	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm0, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm4, %xmm4
-L_AES_GCM_encrypt_avx1_calc_iv_done:
-        # T = Encrypt counter
-        vpxor	%xmm0, %xmm0, %xmm0
-        shll	$3, %edx
-        vmovq	%rdx, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpshufd	$0x4e, %xmm5, %xmm2
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqa	%xmm0, %xmm7
-        vmovdqa	%xmm3, %xmm4
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm7, %xmm7
-        vpxor	%xmm1, %xmm4, %xmm4
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm4, %xmm4
-        vpslld	$31, %xmm7, %xmm0
-        vpslld	$30, %xmm7, %xmm1
-        vpslld	$25, %xmm7, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vmovdqa	%xmm0, %xmm1
-        vpsrldq	$4, %xmm1, %xmm1
-        vpslldq	$12, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm7, %xmm7
-        vpsrld	$0x01, %xmm7, %xmm2
-        vpsrld	$2, %xmm7, %xmm3
-        vpsrld	$7, %xmm7, %xmm0
-        vpxor	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm0, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm4, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4
-        #   Encrypt counter
-        vmovdqa	(%r15), %xmm8
-        vpxor	%xmm4, %xmm8, %xmm8
-        vaesenc	16(%r15), %xmm8, %xmm8
-        vaesenc	32(%r15), %xmm8, %xmm8
-        vaesenc	48(%r15), %xmm8, %xmm8
-        vaesenc	64(%r15), %xmm8, %xmm8
-        vaesenc	80(%r15), %xmm8, %xmm8
-        vaesenc	96(%r15), %xmm8, %xmm8
-        vaesenc	112(%r15), %xmm8, %xmm8
-        vaesenc	128(%r15), %xmm8, %xmm8
-        vaesenc	144(%r15), %xmm8, %xmm8
-        cmpl	$11, %r10d
-        vmovdqa	160(%r15), %xmm9
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last
-        vaesenc	%xmm9, %xmm8, %xmm8
-        vaesenc	176(%r15), %xmm8, %xmm8
-        cmpl	$13, %r10d
-        vmovdqa	192(%r15), %xmm9
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last
-        vaesenc	%xmm9, %xmm8, %xmm8
-        vaesenc	208(%r15), %xmm8, %xmm8
-        vmovdqa	224(%r15), %xmm9
-L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last:
-        vaesenclast	%xmm9, %xmm8, %xmm8
-        vmovdqu	%xmm8, 144(%rsp)
-L_AES_GCM_encrypt_avx1_iv_done:
-        # Additional authentication data
-        movl	%r11d, %edx
-        cmpl	$0x00, %edx
-        je	L_AES_GCM_encrypt_avx1_calc_aad_done
-        xorl	%ecx, %ecx
-        cmpl	$16, %edx
-        jl	L_AES_GCM_encrypt_avx1_calc_aad_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_encrypt_avx1_calc_aad_16_loop:
-        vmovdqu	(%r12,%rcx,1), %xmm8
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8
-        vpxor	%xmm8, %xmm6, %xmm6
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm6, %xmm1
-        vpshufd	$0x4e, %xmm5, %xmm2
-        vpclmulqdq	$0x11, %xmm6, %xmm5, %xmm3
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm0
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqa	%xmm0, %xmm7
-        vmovdqa	%xmm3, %xmm6
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm7, %xmm7
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm6, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm6, %xmm6
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm6, %xmm6
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm6, %xmm6
-        vpslld	$31, %xmm7, %xmm0
-        vpslld	$30, %xmm7, %xmm1
-        vpslld	$25, %xmm7, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vmovdqa	%xmm0, %xmm1
-        vpsrldq	$4, %xmm1, %xmm1
-        vpslldq	$12, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm7, %xmm7
-        vpsrld	$0x01, %xmm7, %xmm2
-        vpsrld	$2, %xmm7, %xmm3
-        vpsrld	$7, %xmm7, %xmm0
-        vpxor	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm0, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm6, %xmm6
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_encrypt_avx1_calc_aad_16_loop
-        movl	%r11d, %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_encrypt_avx1_calc_aad_done
-L_AES_GCM_encrypt_avx1_calc_aad_lt16:
-        subq	$16, %rsp
-        vpxor	%xmm8, %xmm8, %xmm8
-        xorl	%ebx, %ebx
-        vmovdqu	%xmm8, (%rsp)
-L_AES_GCM_encrypt_avx1_calc_aad_loop:
-        movzbl	(%r12,%rcx,1), %r13d
-        movb	%r13b, (%rsp,%rbx,1)
-        incl	%ecx
-        incl	%ebx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_encrypt_avx1_calc_aad_loop
-        vmovdqu	(%rsp), %xmm8
-        addq	$16, %rsp
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8
-        vpxor	%xmm8, %xmm6, %xmm6
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm6, %xmm1
-        vpshufd	$0x4e, %xmm5, %xmm2
-        vpclmulqdq	$0x11, %xmm6, %xmm5, %xmm3
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm0
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqa	%xmm0, %xmm7
-        vmovdqa	%xmm3, %xmm6
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm7, %xmm7
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm6, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm6, %xmm6
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm6, %xmm6
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm6, %xmm6
-        vpslld	$31, %xmm7, %xmm0
-        vpslld	$30, %xmm7, %xmm1
-        vpslld	$25, %xmm7, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vmovdqa	%xmm0, %xmm1
-        vpsrldq	$4, %xmm1, %xmm1
-        vpslldq	$12, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm7, %xmm7
-        vpsrld	$0x01, %xmm7, %xmm2
-        vpsrld	$2, %xmm7, %xmm3
-        vpsrld	$7, %xmm7, %xmm0
-        vpxor	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm0, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm6, %xmm6
-L_AES_GCM_encrypt_avx1_calc_aad_done:
-        # Calculate counter and H
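-        # Note: H (xmm5) is multiplied by x in GF(2^128): a 1-bit left
-        # shift across the 128-bit value with a conditional XOR of the
-        # reduction constant when the top bit was set. The counter (xmm4)
-        # is byte-reversed and incremented for the first data block.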
-        vpsrlq	$63, %xmm5, %xmm9
-        vpsllq	$0x01, %xmm5, %xmm8
-        vpslldq	$8, %xmm9, %xmm9
-        vpor	%xmm9, %xmm8, %xmm8
-        vpshufd	$0xff, %xmm5, %xmm5
-        vpsrad	$31, %xmm5, %xmm5
-        vpshufb	L_avx1_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4
-        vpand	L_avx1_aes_gcm_mod2_128(%rip), %xmm5, %xmm5
-        vpaddd	L_avx1_aes_gcm_one(%rip), %xmm4, %xmm4
-        vpxor	%xmm8, %xmm5, %xmm5
-        vmovdqu	%xmm4, 128(%rsp)
-        xorl	%ebx, %ebx
-        cmpl	$0x80, %r9d
-        movl	%r9d, %r13d
-        jl	L_AES_GCM_encrypt_avx1_done_128
-        andl	$0xffffff80, %r13d
-        vmovdqa	%xmm6, %xmm2
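-        # Note: precompute H^1..H^8 and spill them to the stack so the
-        # 128-byte loop can GHASH eight blocks per reduction. Even powers
-        # come from squaring (only two VPCLMULQDQs, since carry-less cross
-        # terms cancel); odd powers need a full gfmul with reduction.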
-        # H ^ 1
-        vmovdqu	%xmm5, (%rsp)
-        # H ^ 2
-        vpclmulqdq	$0x00, %xmm5, %xmm5, %xmm8
-        vpclmulqdq	$0x11, %xmm5, %xmm5, %xmm0
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm0, %xmm0
-        vmovdqu	%xmm0, 16(%rsp)
-        # H ^ 3
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm5, %xmm9
-        vpshufd	$0x4e, %xmm0, %xmm10
-        vpclmulqdq	$0x11, %xmm5, %xmm0, %xmm11
-        vpclmulqdq	$0x00, %xmm5, %xmm0, %xmm8
-        vpxor	%xmm5, %xmm9, %xmm9
-        vpxor	%xmm0, %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm10, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpxor	%xmm11, %xmm9, %xmm9
-        vpslldq	$8, %xmm9, %xmm10
-        vpsrldq	$8, %xmm9, %xmm9
-        vpxor	%xmm10, %xmm8, %xmm8
-        vpxor	%xmm9, %xmm11, %xmm1
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm1, %xmm1
-        vmovdqu	%xmm1, 32(%rsp)
-        # H ^ 4
-        vpclmulqdq	$0x00, %xmm0, %xmm0, %xmm8
-        vpclmulqdq	$0x11, %xmm0, %xmm0, %xmm3
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm3, %xmm3
-        vmovdqu	%xmm3, 48(%rsp)
-        # H ^ 5
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm0, %xmm9
-        vpshufd	$0x4e, %xmm1, %xmm10
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm11
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm8
-        vpxor	%xmm0, %xmm9, %xmm9
-        vpxor	%xmm1, %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm10, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpxor	%xmm11, %xmm9, %xmm9
-        vpslldq	$8, %xmm9, %xmm10
-        vpsrldq	$8, %xmm9, %xmm9
-        vpxor	%xmm10, %xmm8, %xmm8
-        vpxor	%xmm9, %xmm11, %xmm7
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm7, %xmm7
-        vmovdqu	%xmm7, 64(%rsp)
-        # H ^ 6
-        vpclmulqdq	$0x00, %xmm1, %xmm1, %xmm8
-        vpclmulqdq	$0x11, %xmm1, %xmm1, %xmm7
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm7, %xmm7
-        vmovdqu	%xmm7, 80(%rsp)
-        # H ^ 7
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm1, %xmm9
-        vpshufd	$0x4e, %xmm3, %xmm10
-        vpclmulqdq	$0x11, %xmm1, %xmm3, %xmm11
-        vpclmulqdq	$0x00, %xmm1, %xmm3, %xmm8
-        vpxor	%xmm1, %xmm9, %xmm9
-        vpxor	%xmm3, %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm10, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpxor	%xmm11, %xmm9, %xmm9
-        vpslldq	$8, %xmm9, %xmm10
-        vpsrldq	$8, %xmm9, %xmm9
-        vpxor	%xmm10, %xmm8, %xmm8
-        vpxor	%xmm9, %xmm11, %xmm7
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm7, %xmm7
-        vmovdqu	%xmm7, 96(%rsp)
-        # H ^ 8
-        vpclmulqdq	$0x00, %xmm3, %xmm3, %xmm8
-        vpclmulqdq	$0x11, %xmm3, %xmm3, %xmm7
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm7, %xmm7
-        vmovdqu	%xmm7, 112(%rsp)
-        # First 128 bytes of input
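-        # Note: eight counter blocks are built by adding the constants
-        # one..eight to the saved counter and byte-swapping into AES order;
-        # the eight AESENC chains share each round-key load and overlap in
-        # the pipeline.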
-        vmovdqu	128(%rsp), %xmm0
-        vmovdqa	L_avx1_aes_gcm_bswap_epi64(%rip), %xmm1
-        vpshufb	%xmm1, %xmm0, %xmm8
-        vpaddd	L_avx1_aes_gcm_one(%rip), %xmm0, %xmm9
-        vpshufb	%xmm1, %xmm9, %xmm9
-        vpaddd	L_avx1_aes_gcm_two(%rip), %xmm0, %xmm10
-        vpshufb	%xmm1, %xmm10, %xmm10
-        vpaddd	L_avx1_aes_gcm_three(%rip), %xmm0, %xmm11
-        vpshufb	%xmm1, %xmm11, %xmm11
-        vpaddd	L_avx1_aes_gcm_four(%rip), %xmm0, %xmm12
-        vpshufb	%xmm1, %xmm12, %xmm12
-        vpaddd	L_avx1_aes_gcm_five(%rip), %xmm0, %xmm13
-        vpshufb	%xmm1, %xmm13, %xmm13
-        vpaddd	L_avx1_aes_gcm_six(%rip), %xmm0, %xmm14
-        vpshufb	%xmm1, %xmm14, %xmm14
-        vpaddd	L_avx1_aes_gcm_seven(%rip), %xmm0, %xmm15
-        vpshufb	%xmm1, %xmm15, %xmm15
-        vpaddd	L_avx1_aes_gcm_eight(%rip), %xmm0, %xmm0
-        vmovdqa	(%r15), %xmm7
-        vmovdqu	%xmm0, 128(%rsp)
-        vpxor	%xmm7, %xmm8, %xmm8
-        vpxor	%xmm7, %xmm9, %xmm9
-        vpxor	%xmm7, %xmm10, %xmm10
-        vpxor	%xmm7, %xmm11, %xmm11
-        vpxor	%xmm7, %xmm12, %xmm12
-        vpxor	%xmm7, %xmm13, %xmm13
-        vpxor	%xmm7, %xmm14, %xmm14
-        vpxor	%xmm7, %xmm15, %xmm15
-        vmovdqa	16(%r15), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	32(%r15), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	48(%r15), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	64(%r15), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	80(%r15), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	96(%r15), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	112(%r15), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	128(%r15), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	144(%r15), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        cmpl	$11, %r10d
-        vmovdqa	160(%r15), %xmm7
-        jl	L_AES_GCM_encrypt_avx1_aesenc_128_enc_done
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	176(%r15), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        cmpl	$13, %r10d
-        vmovdqa	192(%r15), %xmm7
-        jl	L_AES_GCM_encrypt_avx1_aesenc_128_enc_done
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	208(%r15), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	224(%r15), %xmm7
-L_AES_GCM_encrypt_avx1_aesenc_128_enc_done:
-        vaesenclast	%xmm7, %xmm8, %xmm8
-        vaesenclast	%xmm7, %xmm9, %xmm9
-        vmovdqu	(%rdi), %xmm0
-        vmovdqu	16(%rdi), %xmm1
-        vpxor	%xmm0, %xmm8, %xmm8
-        vpxor	%xmm1, %xmm9, %xmm9
-        vmovdqu	%xmm8, (%rsi)
-        vmovdqu	%xmm9, 16(%rsi)
-        vaesenclast	%xmm7, %xmm10, %xmm10
-        vaesenclast	%xmm7, %xmm11, %xmm11
-        vmovdqu	32(%rdi), %xmm0
-        vmovdqu	48(%rdi), %xmm1
-        vpxor	%xmm0, %xmm10, %xmm10
-        vpxor	%xmm1, %xmm11, %xmm11
-        vmovdqu	%xmm10, 32(%rsi)
-        vmovdqu	%xmm11, 48(%rsi)
-        vaesenclast	%xmm7, %xmm12, %xmm12
-        vaesenclast	%xmm7, %xmm13, %xmm13
-        vmovdqu	64(%rdi), %xmm0
-        vmovdqu	80(%rdi), %xmm1
-        vpxor	%xmm0, %xmm12, %xmm12
-        vpxor	%xmm1, %xmm13, %xmm13
-        vmovdqu	%xmm12, 64(%rsi)
-        vmovdqu	%xmm13, 80(%rsi)
-        vaesenclast	%xmm7, %xmm14, %xmm14
-        vaesenclast	%xmm7, %xmm15, %xmm15
-        vmovdqu	96(%rdi), %xmm0
-        vmovdqu	112(%rdi), %xmm1
-        vpxor	%xmm0, %xmm14, %xmm14
-        vpxor	%xmm1, %xmm15, %xmm15
-        vmovdqu	%xmm14, 96(%rsi)
-        vmovdqu	%xmm15, 112(%rsi)
-        cmpl	$0x80, %r13d
-        movl	$0x80, %ebx
-        jle	L_AES_GCM_encrypt_avx1_end_128
-        # Process the remaining full 128-byte chunks of input
-L_AES_GCM_encrypt_avx1_ghash_128:
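-        # Note: steady state of the loop. While the next eight counter
-        # blocks are encrypted, the previous eight ciphertext blocks (read
-        # back from -128(%rdx)..-16(%rdx)) are GHASHed against H^8..H^1,
-        # with a single polynomial reduction per iteration.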
-        leaq	(%rdi,%rbx,1), %rcx
-        leaq	(%rsi,%rbx,1), %rdx
-        vmovdqu	128(%rsp), %xmm0
-        vmovdqa	L_avx1_aes_gcm_bswap_epi64(%rip), %xmm1
-        vpshufb	%xmm1, %xmm0, %xmm8
-        vpaddd	L_avx1_aes_gcm_one(%rip), %xmm0, %xmm9
-        vpshufb	%xmm1, %xmm9, %xmm9
-        vpaddd	L_avx1_aes_gcm_two(%rip), %xmm0, %xmm10
-        vpshufb	%xmm1, %xmm10, %xmm10
-        vpaddd	L_avx1_aes_gcm_three(%rip), %xmm0, %xmm11
-        vpshufb	%xmm1, %xmm11, %xmm11
-        vpaddd	L_avx1_aes_gcm_four(%rip), %xmm0, %xmm12
-        vpshufb	%xmm1, %xmm12, %xmm12
-        vpaddd	L_avx1_aes_gcm_five(%rip), %xmm0, %xmm13
-        vpshufb	%xmm1, %xmm13, %xmm13
-        vpaddd	L_avx1_aes_gcm_six(%rip), %xmm0, %xmm14
-        vpshufb	%xmm1, %xmm14, %xmm14
-        vpaddd	L_avx1_aes_gcm_seven(%rip), %xmm0, %xmm15
-        vpshufb	%xmm1, %xmm15, %xmm15
-        vpaddd	L_avx1_aes_gcm_eight(%rip), %xmm0, %xmm0
-        vmovdqa	(%r15), %xmm7
-        vmovdqu	%xmm0, 128(%rsp)
-        vpxor	%xmm7, %xmm8, %xmm8
-        vpxor	%xmm7, %xmm9, %xmm9
-        vpxor	%xmm7, %xmm10, %xmm10
-        vpxor	%xmm7, %xmm11, %xmm11
-        vpxor	%xmm7, %xmm12, %xmm12
-        vpxor	%xmm7, %xmm13, %xmm13
-        vpxor	%xmm7, %xmm14, %xmm14
-        vpxor	%xmm7, %xmm15, %xmm15
-        vmovdqu	112(%rsp), %xmm7
-        vmovdqu	-128(%rdx), %xmm0
-        vaesenc	16(%r15), %xmm8, %xmm8
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm1
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm3
-        vaesenc	16(%r15), %xmm9, %xmm9
-        vaesenc	16(%r15), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm2
-        vaesenc	16(%r15), %xmm11, %xmm11
-        vaesenc	16(%r15), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm1, %xmm1
-        vaesenc	16(%r15), %xmm13, %xmm13
-        vaesenc	16(%r15), %xmm14, %xmm14
-        vaesenc	16(%r15), %xmm15, %xmm15
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqu	96(%rsp), %xmm7
-        vmovdqu	-112(%rdx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vaesenc	32(%r15), %xmm8, %xmm8
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vaesenc	32(%r15), %xmm9, %xmm9
-        vaesenc	32(%r15), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vaesenc	32(%r15), %xmm11, %xmm11
-        vaesenc	32(%r15), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vaesenc	32(%r15), %xmm13, %xmm13
-        vaesenc	32(%r15), %xmm14, %xmm14
-        vaesenc	32(%r15), %xmm15, %xmm15
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	80(%rsp), %xmm7
-        vmovdqu	-96(%rdx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vaesenc	48(%r15), %xmm8, %xmm8
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vaesenc	48(%r15), %xmm9, %xmm9
-        vaesenc	48(%r15), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vaesenc	48(%r15), %xmm11, %xmm11
-        vaesenc	48(%r15), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vaesenc	48(%r15), %xmm13, %xmm13
-        vaesenc	48(%r15), %xmm14, %xmm14
-        vaesenc	48(%r15), %xmm15, %xmm15
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	64(%rsp), %xmm7
-        vmovdqu	-80(%rdx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vaesenc	64(%r15), %xmm8, %xmm8
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vaesenc	64(%r15), %xmm9, %xmm9
-        vaesenc	64(%r15), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vaesenc	64(%r15), %xmm11, %xmm11
-        vaesenc	64(%r15), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vaesenc	64(%r15), %xmm13, %xmm13
-        vaesenc	64(%r15), %xmm14, %xmm14
-        vaesenc	64(%r15), %xmm15, %xmm15
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	48(%rsp), %xmm7
-        vmovdqu	-64(%rdx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vaesenc	80(%r15), %xmm8, %xmm8
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vaesenc	80(%r15), %xmm9, %xmm9
-        vaesenc	80(%r15), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vaesenc	80(%r15), %xmm11, %xmm11
-        vaesenc	80(%r15), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vaesenc	80(%r15), %xmm13, %xmm13
-        vaesenc	80(%r15), %xmm14, %xmm14
-        vaesenc	80(%r15), %xmm15, %xmm15
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	32(%rsp), %xmm7
-        vmovdqu	-48(%rdx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vaesenc	96(%r15), %xmm8, %xmm8
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vaesenc	96(%r15), %xmm9, %xmm9
-        vaesenc	96(%r15), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vaesenc	96(%r15), %xmm11, %xmm11
-        vaesenc	96(%r15), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vaesenc	96(%r15), %xmm13, %xmm13
-        vaesenc	96(%r15), %xmm14, %xmm14
-        vaesenc	96(%r15), %xmm15, %xmm15
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	16(%rsp), %xmm7
-        vmovdqu	-32(%rdx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vaesenc	112(%r15), %xmm8, %xmm8
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vaesenc	112(%r15), %xmm9, %xmm9
-        vaesenc	112(%r15), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vaesenc	112(%r15), %xmm11, %xmm11
-        vaesenc	112(%r15), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vaesenc	112(%r15), %xmm13, %xmm13
-        vaesenc	112(%r15), %xmm14, %xmm14
-        vaesenc	112(%r15), %xmm15, %xmm15
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	(%rsp), %xmm7
-        vmovdqu	-16(%rdx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vaesenc	128(%r15), %xmm8, %xmm8
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vaesenc	128(%r15), %xmm9, %xmm9
-        vaesenc	128(%r15), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vaesenc	128(%r15), %xmm11, %xmm11
-        vaesenc	128(%r15), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vaesenc	128(%r15), %xmm13, %xmm13
-        vaesenc	128(%r15), %xmm14, %xmm14
-        vaesenc	128(%r15), %xmm15, %xmm15
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpslldq	$8, %xmm1, %xmm5
-        vpsrldq	$8, %xmm1, %xmm1
-        vaesenc	144(%r15), %xmm8, %xmm8
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm3, %xmm3
-        vaesenc	144(%r15), %xmm9, %xmm9
-        vpslld	$31, %xmm2, %xmm7
-        vpslld	$30, %xmm2, %xmm4
-        vpslld	$25, %xmm2, %xmm5
-        vaesenc	144(%r15), %xmm10, %xmm10
-        vpxor	%xmm4, %xmm7, %xmm7
-        vpxor	%xmm5, %xmm7, %xmm7
-        vaesenc	144(%r15), %xmm11, %xmm11
-        vpsrldq	$4, %xmm7, %xmm4
-        vpslldq	$12, %xmm7, %xmm7
-        vaesenc	144(%r15), %xmm12, %xmm12
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpsrld	$0x01, %xmm2, %xmm5
-        vaesenc	144(%r15), %xmm13, %xmm13
-        vpsrld	$2, %xmm2, %xmm1
-        vpsrld	$7, %xmm2, %xmm0
-        vaesenc	144(%r15), %xmm14, %xmm14
-        vpxor	%xmm1, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vaesenc	144(%r15), %xmm15, %xmm15
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpxor	%xmm3, %xmm2, %xmm2
-        cmpl	$11, %r10d
-        vmovdqa	160(%r15), %xmm7
-        jl	L_AES_GCM_encrypt_avx1_aesenc_128_ghash_avx_done
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	176(%r15), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        cmpl	$13, %r10d
-        vmovdqa	192(%r15), %xmm7
-        jl	L_AES_GCM_encrypt_avx1_aesenc_128_ghash_avx_done
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	208(%r15), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	224(%r15), %xmm7
-L_AES_GCM_encrypt_avx1_aesenc_128_ghash_avx_done:
-        vaesenclast	%xmm7, %xmm8, %xmm8
-        vaesenclast	%xmm7, %xmm9, %xmm9
-        vmovdqu	(%rcx), %xmm0
-        vmovdqu	16(%rcx), %xmm1
-        vpxor	%xmm0, %xmm8, %xmm8
-        vpxor	%xmm1, %xmm9, %xmm9
-        vmovdqu	%xmm8, (%rdx)
-        vmovdqu	%xmm9, 16(%rdx)
-        vaesenclast	%xmm7, %xmm10, %xmm10
-        vaesenclast	%xmm7, %xmm11, %xmm11
-        vmovdqu	32(%rcx), %xmm0
-        vmovdqu	48(%rcx), %xmm1
-        vpxor	%xmm0, %xmm10, %xmm10
-        vpxor	%xmm1, %xmm11, %xmm11
-        vmovdqu	%xmm10, 32(%rdx)
-        vmovdqu	%xmm11, 48(%rdx)
-        vaesenclast	%xmm7, %xmm12, %xmm12
-        vaesenclast	%xmm7, %xmm13, %xmm13
-        vmovdqu	64(%rcx), %xmm0
-        vmovdqu	80(%rcx), %xmm1
-        vpxor	%xmm0, %xmm12, %xmm12
-        vpxor	%xmm1, %xmm13, %xmm13
-        vmovdqu	%xmm12, 64(%rdx)
-        vmovdqu	%xmm13, 80(%rdx)
-        vaesenclast	%xmm7, %xmm14, %xmm14
-        vaesenclast	%xmm7, %xmm15, %xmm15
-        vmovdqu	96(%rcx), %xmm0
-        vmovdqu	112(%rcx), %xmm1
-        vpxor	%xmm0, %xmm14, %xmm14
-        vpxor	%xmm1, %xmm15, %xmm15
-        vmovdqu	%xmm14, 96(%rdx)
-        vmovdqu	%xmm15, 112(%rdx)
-        addl	$0x80, %ebx
-        cmpl	%r13d, %ebx
-        jl	L_AES_GCM_encrypt_avx1_ghash_128
-L_AES_GCM_encrypt_avx1_end_128:
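-        # Note: the last eight ciphertext blocks are still in xmm8..xmm15;
-        # byte-swap them and fold them into the GHASH accumulator against
-        # H^8..H^1 before handling any remaining input.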
-        vmovdqa	L_avx1_aes_gcm_bswap_mask(%rip), %xmm4
-        vpshufb	%xmm4, %xmm8, %xmm8
-        vpshufb	%xmm4, %xmm9, %xmm9
-        vpshufb	%xmm4, %xmm10, %xmm10
-        vpshufb	%xmm4, %xmm11, %xmm11
-        vpxor	%xmm2, %xmm8, %xmm8
-        vpshufb	%xmm4, %xmm12, %xmm12
-        vpshufb	%xmm4, %xmm13, %xmm13
-        vpshufb	%xmm4, %xmm14, %xmm14
-        vpshufb	%xmm4, %xmm15, %xmm15
-        vmovdqu	(%rsp), %xmm7
-        vmovdqu	16(%rsp), %xmm5
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm15, %xmm1
-        vpshufd	$0x4e, %xmm7, %xmm2
-        vpclmulqdq	$0x11, %xmm15, %xmm7, %xmm3
-        vpclmulqdq	$0x00, %xmm15, %xmm7, %xmm0
-        vpxor	%xmm15, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqa	%xmm0, %xmm4
-        vmovdqa	%xmm3, %xmm6
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm4, %xmm4
-        vpxor	%xmm1, %xmm6, %xmm6
-        # ghash_gfmul_xor_avx
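-        # Note: ghash_gfmul_xor_avx is the same Karatsuba product as above,
-        # but the 256-bit partial result is XOR-accumulated into xmm4/xmm6
-        # and the modular reduction is deferred until all eight products
-        # have been summed.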
-        vpshufd	$0x4e, %xmm14, %xmm1
-        vpshufd	$0x4e, %xmm5, %xmm2
-        vpclmulqdq	$0x11, %xmm14, %xmm5, %xmm3
-        vpclmulqdq	$0x00, %xmm14, %xmm5, %xmm0
-        vpxor	%xmm14, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm4, %xmm4
-        vpxor	%xmm1, %xmm6, %xmm6
-        vmovdqu	32(%rsp), %xmm7
-        vmovdqu	48(%rsp), %xmm5
-        # ghash_gfmul_xor_avx
-        vpshufd	$0x4e, %xmm13, %xmm1
-        vpshufd	$0x4e, %xmm7, %xmm2
-        vpclmulqdq	$0x11, %xmm13, %xmm7, %xmm3
-        vpclmulqdq	$0x00, %xmm13, %xmm7, %xmm0
-        vpxor	%xmm13, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm4, %xmm4
-        vpxor	%xmm1, %xmm6, %xmm6
-        # ghash_gfmul_xor_avx
-        vpshufd	$0x4e, %xmm12, %xmm1
-        vpshufd	$0x4e, %xmm5, %xmm2
-        vpclmulqdq	$0x11, %xmm12, %xmm5, %xmm3
-        vpclmulqdq	$0x00, %xmm12, %xmm5, %xmm0
-        vpxor	%xmm12, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm4, %xmm4
-        vpxor	%xmm1, %xmm6, %xmm6
-        vmovdqu	64(%rsp), %xmm7
-        vmovdqu	80(%rsp), %xmm5
-        # ghash_gfmul_xor_avx
-        vpshufd	$0x4e, %xmm11, %xmm1
-        vpshufd	$0x4e, %xmm7, %xmm2
-        vpclmulqdq	$0x11, %xmm11, %xmm7, %xmm3
-        vpclmulqdq	$0x00, %xmm11, %xmm7, %xmm0
-        vpxor	%xmm11, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm4, %xmm4
-        vpxor	%xmm1, %xmm6, %xmm6
-        # ghash_gfmul_xor_avx
-        vpshufd	$0x4e, %xmm10, %xmm1
-        vpshufd	$0x4e, %xmm5, %xmm2
-        vpclmulqdq	$0x11, %xmm10, %xmm5, %xmm3
-        vpclmulqdq	$0x00, %xmm10, %xmm5, %xmm0
-        vpxor	%xmm10, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm4, %xmm4
-        vpxor	%xmm1, %xmm6, %xmm6
-        vmovdqu	96(%rsp), %xmm7
-        vmovdqu	112(%rsp), %xmm5
-        # ghash_gfmul_xor_avx
-        vpshufd	$0x4e, %xmm9, %xmm1
-        vpshufd	$0x4e, %xmm7, %xmm2
-        vpclmulqdq	$0x11, %xmm9, %xmm7, %xmm3
-        vpclmulqdq	$0x00, %xmm9, %xmm7, %xmm0
-        vpxor	%xmm9, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm4, %xmm4
-        vpxor	%xmm1, %xmm6, %xmm6
-        # ghash_gfmul_xor_avx
-        vpshufd	$0x4e, %xmm8, %xmm1
-        vpshufd	$0x4e, %xmm5, %xmm2
-        vpclmulqdq	$0x11, %xmm8, %xmm5, %xmm3
-        vpclmulqdq	$0x00, %xmm8, %xmm5, %xmm0
-        vpxor	%xmm8, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm4, %xmm4
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpslld	$31, %xmm4, %xmm0
-        vpslld	$30, %xmm4, %xmm1
-        vpslld	$25, %xmm4, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vmovdqa	%xmm0, %xmm1
-        vpsrldq	$4, %xmm1, %xmm1
-        vpslldq	$12, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpsrld	$0x01, %xmm4, %xmm2
-        vpsrld	$2, %xmm4, %xmm3
-        vpsrld	$7, %xmm4, %xmm0
-        vpxor	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm0, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm4, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm6, %xmm6
-        vmovdqu	(%rsp), %xmm5
-L_AES_GCM_encrypt_avx1_done_128:
-        movl	%r9d, %edx
-        cmpl	%edx, %ebx
-        jge	L_AES_GCM_encrypt_avx1_done_enc
-        movl	%r9d, %r13d
-        andl	$0xfffffff0, %r13d
-        cmpl	%r13d, %ebx
-        jge	L_AES_GCM_encrypt_avx1_last_block_done
-        vmovdqu	128(%rsp), %xmm9
-        vpshufb	L_avx1_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8
-        vpaddd	L_avx1_aes_gcm_one(%rip), %xmm9, %xmm9
-        vmovdqu	%xmm9, 128(%rsp)
-        vpxor	(%r15), %xmm8, %xmm8
-        vaesenc	16(%r15), %xmm8, %xmm8
-        vaesenc	32(%r15), %xmm8, %xmm8
-        vaesenc	48(%r15), %xmm8, %xmm8
-        vaesenc	64(%r15), %xmm8, %xmm8
-        vaesenc	80(%r15), %xmm8, %xmm8
-        vaesenc	96(%r15), %xmm8, %xmm8
-        vaesenc	112(%r15), %xmm8, %xmm8
-        vaesenc	128(%r15), %xmm8, %xmm8
-        vaesenc	144(%r15), %xmm8, %xmm8
-        cmpl	$11, %r10d
-        vmovdqa	160(%r15), %xmm9
-        jl	L_AES_GCM_encrypt_avx1_aesenc_block_last
-        vaesenc	%xmm9, %xmm8, %xmm8
-        vaesenc	176(%r15), %xmm8, %xmm8
-        cmpl	$13, %r10d
-        vmovdqa	192(%r15), %xmm9
-        jl	L_AES_GCM_encrypt_avx1_aesenc_block_last
-        vaesenc	%xmm9, %xmm8, %xmm8
-        vaesenc	208(%r15), %xmm8, %xmm8
-        vmovdqa	224(%r15), %xmm9
-L_AES_GCM_encrypt_avx1_aesenc_block_last:
-        vaesenclast	%xmm9, %xmm8, %xmm8
-        vmovdqu	(%rdi,%rbx,1), %xmm9
-        vpxor	%xmm9, %xmm8, %xmm8
-        vmovdqu	%xmm8, (%rsi,%rbx,1)
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8
-        vpxor	%xmm8, %xmm6, %xmm6
-        addl	$16, %ebx
-        cmpl	%r13d, %ebx
-        jge	L_AES_GCM_encrypt_avx1_last_block_ghash
-L_AES_GCM_encrypt_avx1_last_block_start:
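-        # Note: remaining full 16-byte blocks. Each iteration encrypts one
-        # counter block while folding the previous ciphertext into GHASH;
-        # here the reduction uses two VPCLMULQDQs against the
-        # L_avx1_aes_gcm_mod2_128 constant instead of shift/XOR chains.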
-        vmovdqu	(%rdi,%rbx,1), %xmm13
-        vmovdqu	128(%rsp), %xmm9
-        vpshufb	L_avx1_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8
-        vpaddd	L_avx1_aes_gcm_one(%rip), %xmm9, %xmm9
-        vmovdqu	%xmm9, 128(%rsp)
-        vpxor	(%r15), %xmm8, %xmm8
-        vpclmulqdq	$16, %xmm5, %xmm6, %xmm10
-        vaesenc	16(%r15), %xmm8, %xmm8
-        vaesenc	32(%r15), %xmm8, %xmm8
-        vpclmulqdq	$0x01, %xmm5, %xmm6, %xmm11
-        vaesenc	48(%r15), %xmm8, %xmm8
-        vaesenc	64(%r15), %xmm8, %xmm8
-        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm12
-        vaesenc	80(%r15), %xmm8, %xmm8
-        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm1
-        vaesenc	96(%r15), %xmm8, %xmm8
-        vpxor	%xmm11, %xmm10, %xmm10
-        vpslldq	$8, %xmm10, %xmm2
-        vpsrldq	$8, %xmm10, %xmm10
-        vaesenc	112(%r15), %xmm8, %xmm8
-        vpxor	%xmm12, %xmm2, %xmm2
-        vpxor	%xmm10, %xmm1, %xmm3
-        vmovdqa	L_avx1_aes_gcm_mod2_128(%rip), %xmm0
-        vpclmulqdq	$16, %xmm0, %xmm2, %xmm11
-        vaesenc	128(%r15), %xmm8, %xmm8
-        vpshufd	$0x4e, %xmm2, %xmm10
-        vpxor	%xmm11, %xmm10, %xmm10
-        vpclmulqdq	$16, %xmm0, %xmm10, %xmm11
-        vaesenc	144(%r15), %xmm8, %xmm8
-        vpshufd	$0x4e, %xmm10, %xmm10
-        vpxor	%xmm11, %xmm10, %xmm10
-        vpxor	%xmm3, %xmm10, %xmm6
-        cmpl	$11, %r10d
-        vmovdqa	160(%r15), %xmm9
-        jl	L_AES_GCM_encrypt_avx1_aesenc_gfmul_last
-        vaesenc	%xmm9, %xmm8, %xmm8
-        vaesenc	176(%r15), %xmm8, %xmm8
-        cmpl	$13, %r10d
-        vmovdqa	192(%r15), %xmm9
-        jl	L_AES_GCM_encrypt_avx1_aesenc_gfmul_last
-        vaesenc	%xmm9, %xmm8, %xmm8
-        vaesenc	208(%r15), %xmm8, %xmm8
-        vmovdqa	224(%r15), %xmm9
-L_AES_GCM_encrypt_avx1_aesenc_gfmul_last:
-        vaesenclast	%xmm9, %xmm8, %xmm8
-        vmovdqa	%xmm13, %xmm0
-        vpxor	%xmm0, %xmm8, %xmm8
-        vmovdqu	%xmm8, (%rsi,%rbx,1)
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8
-        addl	$16, %ebx
-        vpxor	%xmm8, %xmm6, %xmm6
-        cmpl	%r13d, %ebx
-        jl	L_AES_GCM_encrypt_avx1_last_block_start
-L_AES_GCM_encrypt_avx1_last_block_ghash:
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm5, %xmm9
-        vpshufd	$0x4e, %xmm6, %xmm10
-        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm11
-        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm8
-        vpxor	%xmm5, %xmm9, %xmm9
-        vpxor	%xmm6, %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm10, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpxor	%xmm11, %xmm9, %xmm9
-        vpslldq	$8, %xmm9, %xmm10
-        vpsrldq	$8, %xmm9, %xmm9
-        vpxor	%xmm10, %xmm8, %xmm8
-        vpxor	%xmm9, %xmm11, %xmm6
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm6, %xmm6
-L_AES_GCM_encrypt_avx1_last_block_done:
-        movl	%r9d, %ecx
-        movl	%ecx, %edx
-        andl	$15, %ecx
-        jz	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_done
-        vmovdqu	128(%rsp), %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4
-        vpxor	(%r15), %xmm4, %xmm4
-        vaesenc	16(%r15), %xmm4, %xmm4
-        vaesenc	32(%r15), %xmm4, %xmm4
-        vaesenc	48(%r15), %xmm4, %xmm4
-        vaesenc	64(%r15), %xmm4, %xmm4
-        vaesenc	80(%r15), %xmm4, %xmm4
-        vaesenc	96(%r15), %xmm4, %xmm4
-        vaesenc	112(%r15), %xmm4, %xmm4
-        vaesenc	128(%r15), %xmm4, %xmm4
-        vaesenc	144(%r15), %xmm4, %xmm4
-        cmpl	$11, %r10d
-        vmovdqa	160(%r15), %xmm9
-        jl	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last
-        vaesenc	%xmm9, %xmm4, %xmm4
-        vaesenc	176(%r15), %xmm4, %xmm4
-        cmpl	$13, %r10d
-        vmovdqa	192(%r15), %xmm9
-        jl	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last
-        vaesenc	%xmm9, %xmm4, %xmm4
-        vaesenc	208(%r15), %xmm4, %xmm4
-        vmovdqa	224(%r15), %xmm9
-L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last:
-        vaesenclast	%xmm9, %xmm4, %xmm4
-        subq	$16, %rsp
-        xorl	%ecx, %ecx
-        vmovdqu	%xmm4, (%rsp)
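-        # Note: the keystream block is staged in a 16-byte stack buffer;
-        # plaintext bytes are XORed in one at a time with the ciphertext
-        # written back into the buffer, then the unused tail is zero-filled
-        # so the buffer can be GHASHed as a padded final block.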
-L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_loop:
-        movzbl	(%rdi,%rbx,1), %r13d
-        xorb	(%rsp,%rcx,1), %r13b
-        movb	%r13b, (%rsi,%rbx,1)
-        movb	%r13b, (%rsp,%rcx,1)
-        incl	%ebx
-        incl	%ecx
-        cmpl	%edx, %ebx
-        jl	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_loop
-        xorq	%r13, %r13
-        cmpl	$16, %ecx
-        je	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_finish_enc
-L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_byte_loop:
-        movb	%r13b, (%rsp,%rcx,1)
-        incl	%ecx
-        cmpl	$16, %ecx
-        jl	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_byte_loop
-L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_finish_enc:
-        vmovdqu	(%rsp), %xmm4
-        addq	$16, %rsp
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4
-        vpxor	%xmm4, %xmm6, %xmm6
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm5, %xmm9
-        vpshufd	$0x4e, %xmm6, %xmm10
-        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm11
-        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm8
-        vpxor	%xmm5, %xmm9, %xmm9
-        vpxor	%xmm6, %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm10, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpxor	%xmm11, %xmm9, %xmm9
-        vpslldq	$8, %xmm9, %xmm10
-        vpsrldq	$8, %xmm9, %xmm9
-        vpxor	%xmm10, %xmm8, %xmm8
-        vpxor	%xmm9, %xmm11, %xmm6
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm6, %xmm6
-L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_done:
-L_AES_GCM_encrypt_avx1_done_enc:
-        movl	%r9d, %edx
-        movl	%r11d, %ecx
-        shlq	$3, %rdx
-        shlq	$3, %rcx
-        vmovq	%rdx, %xmm0
-        vmovq	%rcx, %xmm1
-        vpunpcklqdq	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm6, %xmm6
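-        # Note: the bit lengths of the AAD and of the ciphertext (len << 3)
-        # are packed into one block and XORed into the accumulator; one
-        # more multiply by H, a byte swap, and an XOR with the saved
-        # E_K(J0) yield the tag, stored whole or byte-wise if under 16.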
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm5, %xmm9
-        vpshufd	$0x4e, %xmm6, %xmm10
-        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm11
-        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm8
-        vpxor	%xmm5, %xmm9, %xmm9
-        vpxor	%xmm6, %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm10, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpxor	%xmm11, %xmm9, %xmm9
-        vpslldq	$8, %xmm9, %xmm10
-        vpsrldq	$8, %xmm9, %xmm9
-        vpxor	%xmm10, %xmm8, %xmm8
-        vpxor	%xmm9, %xmm11, %xmm6
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm6, %xmm6
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6
-        vmovdqu	144(%rsp), %xmm0
-        vpxor	%xmm6, %xmm0, %xmm0
-        cmpl	$16, %r14d
-        je	L_AES_GCM_encrypt_avx1_store_tag_16
-        xorq	%rcx, %rcx
-        vmovdqu	%xmm0, (%rsp)
-L_AES_GCM_encrypt_avx1_store_tag_loop:
-        movzbl	(%rsp,%rcx,1), %r13d
-        movb	%r13b, (%r8,%rcx,1)
-        incl	%ecx
-        cmpl	%r14d, %ecx
-        jne	L_AES_GCM_encrypt_avx1_store_tag_loop
-        jmp	L_AES_GCM_encrypt_avx1_store_tag_done
-L_AES_GCM_encrypt_avx1_store_tag_16:
-        vmovdqu	%xmm0, (%r8)
-L_AES_GCM_encrypt_avx1_store_tag_done:
-        vzeroupper
-        addq	$0xa0, %rsp
-        popq	%r15
-        popq	%r14
-        popq	%rbx
-        popq	%r12
-        popq	%r13
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_encrypt_avx1,.-AES_GCM_encrypt_avx1
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_decrypt_avx1
-.type	AES_GCM_decrypt_avx1,@function
-.align	16
-AES_GCM_decrypt_avx1:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_decrypt_avx1
-.p2align	4
-_AES_GCM_decrypt_avx1:
-#endif /* __APPLE__ */
-        pushq	%r13
-        pushq	%r12
-        pushq	%rbx
-        pushq	%r14
-        pushq	%r15
-        pushq	%rbp
-        movq	%rdx, %r12
-        movq	%rcx, %rax
-        movl	56(%rsp), %r11d
-        movl	64(%rsp), %ebx
-        movl	72(%rsp), %r14d
-        movq	80(%rsp), %r15
-        movl	88(%rsp), %r10d
-        movq	96(%rsp), %rbp
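-        # Note (inferred from the uses below): the stack-passed parameters
-        # appear to be AAD length (56 -> r11d), IV length (64 -> ebx), tag
-        # length (72 -> r14d), expanded key (80 -> r15), round count
-        # (88 -> r10d) and, for decrypt only, a result pointer (96 -> rbp).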
-        subq	$0xa8, %rsp
-        vpxor	%xmm4, %xmm4, %xmm4
-        vpxor	%xmm6, %xmm6, %xmm6
-        cmpl	$12, %ebx
-        movl	%ebx, %edx
-        jne	L_AES_GCM_decrypt_avx1_iv_not_12
-        # Calculate values when IV is 12 bytes
-        # Set counter based on IV
-        movl	$0x1000000, %ecx
-        vmovq	(%rax), %xmm4
-        vpinsrd	$2, 8(%rax), %xmm4, %xmm4
-        vpinsrd	$3, %ecx, %xmm4, %xmm4
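-        # Note: $0x1000000 stores as the bytes 00 00 00 01, so the block
-        # built here is IV || 0x00000001 -- GCM's J0 for 12-byte IVs.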
-        # H = Encrypt X(=0) and T = Encrypt counter
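-        # Note: H = E_K(0^128) and the pre-tag block E_K(J0) are computed
-        # side by side, sharing each round-key load between the two AESENC
-        # chains; cmpl $11/$13 against the round count selects the
-        # AES-128/192/256 tails (10, 12 or 14 rounds).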
-        vmovdqa	(%r15), %xmm5
-        vpxor	%xmm5, %xmm4, %xmm1
-        vmovdqa	16(%r15), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	32(%r15), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	48(%r15), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	64(%r15), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	80(%r15), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	96(%r15), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	112(%r15), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	128(%r15), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	144(%r15), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        cmpl	$11, %r10d
-        vmovdqa	160(%r15), %xmm7
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_12_last
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	176(%r15), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        cmpl	$13, %r10d
-        vmovdqa	192(%r15), %xmm7
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_12_last
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	208(%r15), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	224(%r15), %xmm7
-L_AES_GCM_decrypt_avx1_calc_iv_12_last:
-        vaesenclast	%xmm7, %xmm5, %xmm5
-        vaesenclast	%xmm7, %xmm1, %xmm1
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5
-        vmovdqu	%xmm1, 144(%rsp)
-        jmp	L_AES_GCM_decrypt_avx1_iv_done
-L_AES_GCM_decrypt_avx1_iv_not_12:
-        # Calculate values when IV is not 12 bytes
-        # H = Encrypt X(=0)
-        vmovdqa	(%r15), %xmm5
-        vaesenc	16(%r15), %xmm5, %xmm5
-        vaesenc	32(%r15), %xmm5, %xmm5
-        vaesenc	48(%r15), %xmm5, %xmm5
-        vaesenc	64(%r15), %xmm5, %xmm5
-        vaesenc	80(%r15), %xmm5, %xmm5
-        vaesenc	96(%r15), %xmm5, %xmm5
-        vaesenc	112(%r15), %xmm5, %xmm5
-        vaesenc	128(%r15), %xmm5, %xmm5
-        vaesenc	144(%r15), %xmm5, %xmm5
-        cmpl	$11, %r10d
-        vmovdqa	160(%r15), %xmm9
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last
-        vaesenc	%xmm9, %xmm5, %xmm5
-        vaesenc	176(%r15), %xmm5, %xmm5
-        cmpl	$13, %r10d
-        vmovdqa	192(%r15), %xmm9
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last
-        vaesenc	%xmm9, %xmm5, %xmm5
-        vaesenc	208(%r15), %xmm5, %xmm5
-        vmovdqa	224(%r15), %xmm9
-L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last:
-        vaesenclast	%xmm9, %xmm5, %xmm5
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5
-        # Calculate the counter
-        # GHASH the initialization vector
-        cmpl	$0x00, %edx
-        movq	$0x00, %rcx
-        je	L_AES_GCM_decrypt_avx1_calc_iv_done
-        cmpl	$16, %edx
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_decrypt_avx1_calc_iv_16_loop:
-        vmovdqu	(%rax,%rcx,1), %xmm8
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8
-        vpxor	%xmm8, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpshufd	$0x4e, %xmm5, %xmm2
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqa	%xmm0, %xmm7
-        vmovdqa	%xmm3, %xmm4
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm7, %xmm7
-        vpxor	%xmm1, %xmm4, %xmm4
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm4, %xmm4
-        vpslld	$31, %xmm7, %xmm0
-        vpslld	$30, %xmm7, %xmm1
-        vpslld	$25, %xmm7, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vmovdqa	%xmm0, %xmm1
-        vpsrldq	$4, %xmm1, %xmm1
-        vpslldq	$12, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm7, %xmm7
-        vpsrld	$0x01, %xmm7, %xmm2
-        vpsrld	$2, %xmm7, %xmm3
-        vpsrld	$7, %xmm7, %xmm0
-        vpxor	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm0, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm4, %xmm4
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_16_loop
-        movl	%ebx, %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_decrypt_avx1_calc_iv_done
-L_AES_GCM_decrypt_avx1_calc_iv_lt16:
-        subq	$16, %rsp
-        vpxor	%xmm8, %xmm8, %xmm8
-        xorl	%ebx, %ebx
-        vmovdqu	%xmm8, (%rsp)
-L_AES_GCM_decrypt_avx1_calc_iv_loop:
-        movzbl	(%rax,%rcx,1), %r13d
-        movb	%r13b, (%rsp,%rbx,1)
-        incl	%ecx
-        incl	%ebx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_loop
-        vmovdqu	(%rsp), %xmm8
-        addq	$16, %rsp
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8
-        vpxor	%xmm8, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpshufd	$0x4e, %xmm5, %xmm2
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqa	%xmm0, %xmm7
-        vmovdqa	%xmm3, %xmm4
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm7, %xmm7
-        vpxor	%xmm1, %xmm4, %xmm4
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm4, %xmm4
-        vpslld	$31, %xmm7, %xmm0
-        vpslld	$30, %xmm7, %xmm1
-        vpslld	$25, %xmm7, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vmovdqa	%xmm0, %xmm1
-        vpsrldq	$4, %xmm1, %xmm1
-        vpslldq	$12, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm7, %xmm7
-        vpsrld	$0x01, %xmm7, %xmm2
-        vpsrld	$2, %xmm7, %xmm3
-        vpsrld	$7, %xmm7, %xmm0
-        vpxor	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm0, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm4, %xmm4
-L_AES_GCM_decrypt_avx1_calc_iv_done:
-        # T = Encrypt counter
-        vpxor	%xmm0, %xmm0, %xmm0
-        shll	$3, %edx
-        vmovq	%rdx, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpshufd	$0x4e, %xmm5, %xmm2
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqa	%xmm0, %xmm7
-        vmovdqa	%xmm3, %xmm4
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm7, %xmm7
-        vpxor	%xmm1, %xmm4, %xmm4
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm4, %xmm4
-        vpslld	$31, %xmm7, %xmm0
-        vpslld	$30, %xmm7, %xmm1
-        vpslld	$25, %xmm7, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vmovdqa	%xmm0, %xmm1
-        vpsrldq	$4, %xmm1, %xmm1
-        vpslldq	$12, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm7, %xmm7
-        vpsrld	$0x01, %xmm7, %xmm2
-        vpsrld	$2, %xmm7, %xmm3
-        vpsrld	$7, %xmm7, %xmm0
-        vpxor	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm0, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm4, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4
-        #   Encrypt counter
-        vmovdqa	(%r15), %xmm8
-        vpxor	%xmm4, %xmm8, %xmm8
-        vaesenc	16(%r15), %xmm8, %xmm8
-        vaesenc	32(%r15), %xmm8, %xmm8
-        vaesenc	48(%r15), %xmm8, %xmm8
-        vaesenc	64(%r15), %xmm8, %xmm8
-        vaesenc	80(%r15), %xmm8, %xmm8
-        vaesenc	96(%r15), %xmm8, %xmm8
-        vaesenc	112(%r15), %xmm8, %xmm8
-        vaesenc	128(%r15), %xmm8, %xmm8
-        vaesenc	144(%r15), %xmm8, %xmm8
-        cmpl	$11, %r10d
-        vmovdqa	160(%r15), %xmm9
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last
-        vaesenc	%xmm9, %xmm8, %xmm8
-        vaesenc	176(%r15), %xmm8, %xmm8
-        cmpl	$13, %r10d
-        vmovdqa	192(%r15), %xmm9
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last
-        vaesenc	%xmm9, %xmm8, %xmm8
-        vaesenc	208(%r15), %xmm8, %xmm8
-        vmovdqa	224(%r15), %xmm9
-L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last:
-        vaesenclast	%xmm9, %xmm8, %xmm8
-        vmovdqu	%xmm8, 144(%rsp)
-L_AES_GCM_decrypt_avx1_iv_done:
-        # Additional authentication data
-        movl	%r11d, %edx
-        cmpl	$0x00, %edx
-        je	L_AES_GCM_decrypt_avx1_calc_aad_done
-        xorl	%ecx, %ecx
-        cmpl	$16, %edx
-        jl	L_AES_GCM_decrypt_avx1_calc_aad_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_decrypt_avx1_calc_aad_16_loop:
-        vmovdqu	(%r12,%rcx,1), %xmm8
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8
-        vpxor	%xmm8, %xmm6, %xmm6
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm6, %xmm1
-        vpshufd	$0x4e, %xmm5, %xmm2
-        vpclmulqdq	$0x11, %xmm6, %xmm5, %xmm3
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm0
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqa	%xmm0, %xmm7
-        vmovdqa	%xmm3, %xmm6
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm7, %xmm7
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm6, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm6, %xmm6
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm6, %xmm6
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm6, %xmm6
-        vpslld	$31, %xmm7, %xmm0
-        vpslld	$30, %xmm7, %xmm1
-        vpslld	$25, %xmm7, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vmovdqa	%xmm0, %xmm1
-        vpsrldq	$4, %xmm1, %xmm1
-        vpslldq	$12, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm7, %xmm7
-        vpsrld	$0x01, %xmm7, %xmm2
-        vpsrld	$2, %xmm7, %xmm3
-        vpsrld	$7, %xmm7, %xmm0
-        vpxor	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm0, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm6, %xmm6
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_decrypt_avx1_calc_aad_16_loop
-        movl	%r11d, %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_decrypt_avx1_calc_aad_done
-L_AES_GCM_decrypt_avx1_calc_aad_lt16:
-        subq	$16, %rsp
-        vpxor	%xmm8, %xmm8, %xmm8
-        xorl	%ebx, %ebx
-        vmovdqu	%xmm8, (%rsp)
-L_AES_GCM_decrypt_avx1_calc_aad_loop:
-        movzbl	(%r12,%rcx,1), %r13d
-        movb	%r13b, (%rsp,%rbx,1)
-        incl	%ecx
-        incl	%ebx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_decrypt_avx1_calc_aad_loop
-        vmovdqu	(%rsp), %xmm8
-        addq	$16, %rsp
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8
-        vpxor	%xmm8, %xmm6, %xmm6
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm6, %xmm1
-        vpshufd	$0x4e, %xmm5, %xmm2
-        vpclmulqdq	$0x11, %xmm6, %xmm5, %xmm3
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm0
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqa	%xmm0, %xmm7
-        vmovdqa	%xmm3, %xmm6
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm7, %xmm7
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm6, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm6, %xmm6
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm6, %xmm6
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm6, %xmm6
-        vpslld	$31, %xmm7, %xmm0
-        vpslld	$30, %xmm7, %xmm1
-        vpslld	$25, %xmm7, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vmovdqa	%xmm0, %xmm1
-        vpsrldq	$4, %xmm1, %xmm1
-        vpslldq	$12, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm7, %xmm7
-        vpsrld	$0x01, %xmm7, %xmm2
-        vpsrld	$2, %xmm7, %xmm3
-        vpsrld	$7, %xmm7, %xmm0
-        vpxor	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm0, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm6, %xmm6
-L_AES_GCM_decrypt_avx1_calc_aad_done:
-        # Calculate counter and H
-        vpsrlq	$63, %xmm5, %xmm9
-        vpsllq	$0x01, %xmm5, %xmm8
-        vpslldq	$8, %xmm9, %xmm9
-        vpor	%xmm9, %xmm8, %xmm8
-        vpshufd	$0xff, %xmm5, %xmm5
-        vpsrad	$31, %xmm5, %xmm5
-        vpshufb	L_avx1_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4
-        vpand	L_avx1_aes_gcm_mod2_128(%rip), %xmm5, %xmm5
-        vpaddd	L_avx1_aes_gcm_one(%rip), %xmm4, %xmm4
-        vpxor	%xmm8, %xmm5, %xmm5
-        vmovdqu	%xmm4, 128(%rsp)
-        xorl	%ebx, %ebx
-        cmpl	$0x80, %r9d
-        movl	%r9d, %r13d
-        jl	L_AES_GCM_decrypt_avx1_done_128
-        andl	$0xffffff80, %r13d
-        vmovdqa	%xmm6, %xmm2
-        # H ^ 1
-        vmovdqu	%xmm5, (%rsp)
-        # H ^ 2
-        vpclmulqdq	$0x00, %xmm5, %xmm5, %xmm8
-        vpclmulqdq	$0x11, %xmm5, %xmm5, %xmm0
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm0, %xmm0
-        vmovdqu	%xmm0, 16(%rsp)
-        # H ^ 3
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm5, %xmm9
-        vpshufd	$0x4e, %xmm0, %xmm10
-        vpclmulqdq	$0x11, %xmm5, %xmm0, %xmm11
-        vpclmulqdq	$0x00, %xmm5, %xmm0, %xmm8
-        vpxor	%xmm5, %xmm9, %xmm9
-        vpxor	%xmm0, %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm10, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpxor	%xmm11, %xmm9, %xmm9
-        vpslldq	$8, %xmm9, %xmm10
-        vpsrldq	$8, %xmm9, %xmm9
-        vpxor	%xmm10, %xmm8, %xmm8
-        vpxor	%xmm9, %xmm11, %xmm1
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm1, %xmm1
-        vmovdqu	%xmm1, 32(%rsp)
-        # H ^ 4
-        vpclmulqdq	$0x00, %xmm0, %xmm0, %xmm8
-        vpclmulqdq	$0x11, %xmm0, %xmm0, %xmm3
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm3, %xmm3
-        vmovdqu	%xmm3, 48(%rsp)
-        # H ^ 5
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm0, %xmm9
-        vpshufd	$0x4e, %xmm1, %xmm10
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm11
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm8
-        vpxor	%xmm0, %xmm9, %xmm9
-        vpxor	%xmm1, %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm10, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpxor	%xmm11, %xmm9, %xmm9
-        vpslldq	$8, %xmm9, %xmm10
-        vpsrldq	$8, %xmm9, %xmm9
-        vpxor	%xmm10, %xmm8, %xmm8
-        vpxor	%xmm9, %xmm11, %xmm7
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm7, %xmm7
-        vmovdqu	%xmm7, 64(%rsp)
-        # H ^ 6
-        vpclmulqdq	$0x00, %xmm1, %xmm1, %xmm8
-        vpclmulqdq	$0x11, %xmm1, %xmm1, %xmm7
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm7, %xmm7
-        vmovdqu	%xmm7, 80(%rsp)
-        # H ^ 7
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm1, %xmm9
-        vpshufd	$0x4e, %xmm3, %xmm10
-        vpclmulqdq	$0x11, %xmm1, %xmm3, %xmm11
-        vpclmulqdq	$0x00, %xmm1, %xmm3, %xmm8
-        vpxor	%xmm1, %xmm9, %xmm9
-        vpxor	%xmm3, %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm10, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpxor	%xmm11, %xmm9, %xmm9
-        vpslldq	$8, %xmm9, %xmm10
-        vpsrldq	$8, %xmm9, %xmm9
-        vpxor	%xmm10, %xmm8, %xmm8
-        vpxor	%xmm9, %xmm11, %xmm7
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm7, %xmm7
-        vmovdqu	%xmm7, 96(%rsp)
-        # H ^ 8
-        vpclmulqdq	$0x00, %xmm3, %xmm3, %xmm8
-        vpclmulqdq	$0x11, %xmm3, %xmm3, %xmm7
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm7, %xmm7
-        vmovdqu	%xmm7, 112(%rsp)
-L_AES_GCM_decrypt_avx1_ghash_128:
-        leaq	(%rdi,%rbx,1), %rcx
-        leaq	(%rsi,%rbx,1), %rdx
-        vmovdqu	128(%rsp), %xmm0
-        vmovdqa	L_avx1_aes_gcm_bswap_epi64(%rip), %xmm1
-        vpshufb	%xmm1, %xmm0, %xmm8
-        vpaddd	L_avx1_aes_gcm_one(%rip), %xmm0, %xmm9
-        vpshufb	%xmm1, %xmm9, %xmm9
-        vpaddd	L_avx1_aes_gcm_two(%rip), %xmm0, %xmm10
-        vpshufb	%xmm1, %xmm10, %xmm10
-        vpaddd	L_avx1_aes_gcm_three(%rip), %xmm0, %xmm11
-        vpshufb	%xmm1, %xmm11, %xmm11
-        vpaddd	L_avx1_aes_gcm_four(%rip), %xmm0, %xmm12
-        vpshufb	%xmm1, %xmm12, %xmm12
-        vpaddd	L_avx1_aes_gcm_five(%rip), %xmm0, %xmm13
-        vpshufb	%xmm1, %xmm13, %xmm13
-        vpaddd	L_avx1_aes_gcm_six(%rip), %xmm0, %xmm14
-        vpshufb	%xmm1, %xmm14, %xmm14
-        vpaddd	L_avx1_aes_gcm_seven(%rip), %xmm0, %xmm15
-        vpshufb	%xmm1, %xmm15, %xmm15
-        vpaddd	L_avx1_aes_gcm_eight(%rip), %xmm0, %xmm0
-        vmovdqa	(%r15), %xmm7
-        vmovdqu	%xmm0, 128(%rsp)
-        vpxor	%xmm7, %xmm8, %xmm8
-        vpxor	%xmm7, %xmm9, %xmm9
-        vpxor	%xmm7, %xmm10, %xmm10
-        vpxor	%xmm7, %xmm11, %xmm11
-        vpxor	%xmm7, %xmm12, %xmm12
-        vpxor	%xmm7, %xmm13, %xmm13
-        vpxor	%xmm7, %xmm14, %xmm14
-        vpxor	%xmm7, %xmm15, %xmm15
-        vmovdqu	112(%rsp), %xmm7
-        vmovdqu	(%rcx), %xmm0
-        vaesenc	16(%r15), %xmm8, %xmm8
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm1
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm3
-        vaesenc	16(%r15), %xmm9, %xmm9
-        vaesenc	16(%r15), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm2
-        vaesenc	16(%r15), %xmm11, %xmm11
-        vaesenc	16(%r15), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm1, %xmm1
-        vaesenc	16(%r15), %xmm13, %xmm13
-        vaesenc	16(%r15), %xmm14, %xmm14
-        vaesenc	16(%r15), %xmm15, %xmm15
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqu	96(%rsp), %xmm7
-        vmovdqu	16(%rcx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vaesenc	32(%r15), %xmm8, %xmm8
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vaesenc	32(%r15), %xmm9, %xmm9
-        vaesenc	32(%r15), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vaesenc	32(%r15), %xmm11, %xmm11
-        vaesenc	32(%r15), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vaesenc	32(%r15), %xmm13, %xmm13
-        vaesenc	32(%r15), %xmm14, %xmm14
-        vaesenc	32(%r15), %xmm15, %xmm15
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	80(%rsp), %xmm7
-        vmovdqu	32(%rcx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vaesenc	48(%r15), %xmm8, %xmm8
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vaesenc	48(%r15), %xmm9, %xmm9
-        vaesenc	48(%r15), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vaesenc	48(%r15), %xmm11, %xmm11
-        vaesenc	48(%r15), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vaesenc	48(%r15), %xmm13, %xmm13
-        vaesenc	48(%r15), %xmm14, %xmm14
-        vaesenc	48(%r15), %xmm15, %xmm15
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	64(%rsp), %xmm7
-        vmovdqu	48(%rcx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vaesenc	64(%r15), %xmm8, %xmm8
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vaesenc	64(%r15), %xmm9, %xmm9
-        vaesenc	64(%r15), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vaesenc	64(%r15), %xmm11, %xmm11
-        vaesenc	64(%r15), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vaesenc	64(%r15), %xmm13, %xmm13
-        vaesenc	64(%r15), %xmm14, %xmm14
-        vaesenc	64(%r15), %xmm15, %xmm15
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	48(%rsp), %xmm7
-        vmovdqu	64(%rcx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vaesenc	80(%r15), %xmm8, %xmm8
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vaesenc	80(%r15), %xmm9, %xmm9
-        vaesenc	80(%r15), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vaesenc	80(%r15), %xmm11, %xmm11
-        vaesenc	80(%r15), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vaesenc	80(%r15), %xmm13, %xmm13
-        vaesenc	80(%r15), %xmm14, %xmm14
-        vaesenc	80(%r15), %xmm15, %xmm15
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	32(%rsp), %xmm7
-        vmovdqu	80(%rcx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vaesenc	96(%r15), %xmm8, %xmm8
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vaesenc	96(%r15), %xmm9, %xmm9
-        vaesenc	96(%r15), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vaesenc	96(%r15), %xmm11, %xmm11
-        vaesenc	96(%r15), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vaesenc	96(%r15), %xmm13, %xmm13
-        vaesenc	96(%r15), %xmm14, %xmm14
-        vaesenc	96(%r15), %xmm15, %xmm15
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	16(%rsp), %xmm7
-        vmovdqu	96(%rcx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vaesenc	112(%r15), %xmm8, %xmm8
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vaesenc	112(%r15), %xmm9, %xmm9
-        vaesenc	112(%r15), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vaesenc	112(%r15), %xmm11, %xmm11
-        vaesenc	112(%r15), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vaesenc	112(%r15), %xmm13, %xmm13
-        vaesenc	112(%r15), %xmm14, %xmm14
-        vaesenc	112(%r15), %xmm15, %xmm15
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	(%rsp), %xmm7
-        vmovdqu	112(%rcx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vaesenc	128(%r15), %xmm8, %xmm8
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vaesenc	128(%r15), %xmm9, %xmm9
-        vaesenc	128(%r15), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vaesenc	128(%r15), %xmm11, %xmm11
-        vaesenc	128(%r15), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vaesenc	128(%r15), %xmm13, %xmm13
-        vaesenc	128(%r15), %xmm14, %xmm14
-        vaesenc	128(%r15), %xmm15, %xmm15
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpslldq	$8, %xmm1, %xmm5
-        vpsrldq	$8, %xmm1, %xmm1
-        vaesenc	144(%r15), %xmm8, %xmm8
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm3, %xmm3
-        vaesenc	144(%r15), %xmm9, %xmm9
-        vpslld	$31, %xmm2, %xmm7
-        vpslld	$30, %xmm2, %xmm4
-        vpslld	$25, %xmm2, %xmm5
-        vaesenc	144(%r15), %xmm10, %xmm10
-        vpxor	%xmm4, %xmm7, %xmm7
-        vpxor	%xmm5, %xmm7, %xmm7
-        vaesenc	144(%r15), %xmm11, %xmm11
-        vpsrldq	$4, %xmm7, %xmm4
-        vpslldq	$12, %xmm7, %xmm7
-        vaesenc	144(%r15), %xmm12, %xmm12
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpsrld	$0x01, %xmm2, %xmm5
-        vaesenc	144(%r15), %xmm13, %xmm13
-        vpsrld	$2, %xmm2, %xmm1
-        vpsrld	$7, %xmm2, %xmm0
-        vaesenc	144(%r15), %xmm14, %xmm14
-        vpxor	%xmm1, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vaesenc	144(%r15), %xmm15, %xmm15
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpxor	%xmm3, %xmm2, %xmm2
-        cmpl	$11, %r10d
-        vmovdqa	160(%r15), %xmm7
-        jl	L_AES_GCM_decrypt_avx1_aesenc_128_ghash_avx_done
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	176(%r15), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        cmpl	$13, %r10d
-        vmovdqa	192(%r15), %xmm7
-        jl	L_AES_GCM_decrypt_avx1_aesenc_128_ghash_avx_done
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	208(%r15), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	224(%r15), %xmm7
-L_AES_GCM_decrypt_avx1_aesenc_128_ghash_avx_done:
-        vaesenclast	%xmm7, %xmm8, %xmm8
-        vaesenclast	%xmm7, %xmm9, %xmm9
-        vmovdqu	(%rcx), %xmm0
-        vmovdqu	16(%rcx), %xmm1
-        vpxor	%xmm0, %xmm8, %xmm8
-        vpxor	%xmm1, %xmm9, %xmm9
-        vmovdqu	%xmm8, (%rdx)
-        vmovdqu	%xmm9, 16(%rdx)
-        vaesenclast	%xmm7, %xmm10, %xmm10
-        vaesenclast	%xmm7, %xmm11, %xmm11
-        vmovdqu	32(%rcx), %xmm0
-        vmovdqu	48(%rcx), %xmm1
-        vpxor	%xmm0, %xmm10, %xmm10
-        vpxor	%xmm1, %xmm11, %xmm11
-        vmovdqu	%xmm10, 32(%rdx)
-        vmovdqu	%xmm11, 48(%rdx)
-        vaesenclast	%xmm7, %xmm12, %xmm12
-        vaesenclast	%xmm7, %xmm13, %xmm13
-        vmovdqu	64(%rcx), %xmm0
-        vmovdqu	80(%rcx), %xmm1
-        vpxor	%xmm0, %xmm12, %xmm12
-        vpxor	%xmm1, %xmm13, %xmm13
-        vmovdqu	%xmm12, 64(%rdx)
-        vmovdqu	%xmm13, 80(%rdx)
-        vaesenclast	%xmm7, %xmm14, %xmm14
-        vaesenclast	%xmm7, %xmm15, %xmm15
-        vmovdqu	96(%rcx), %xmm0
-        vmovdqu	112(%rcx), %xmm1
-        vpxor	%xmm0, %xmm14, %xmm14
-        vpxor	%xmm1, %xmm15, %xmm15
-        vmovdqu	%xmm14, 96(%rdx)
-        vmovdqu	%xmm15, 112(%rdx)
-        addl	$0x80, %ebx
-        cmpl	%r13d, %ebx
-        jl	L_AES_GCM_decrypt_avx1_ghash_128
-        vmovdqa	%xmm2, %xmm6
-        vmovdqu	(%rsp), %xmm5
-L_AES_GCM_decrypt_avx1_done_128:
-        movl	%r9d, %edx
-        cmpl	%edx, %ebx
-        jge	L_AES_GCM_decrypt_avx1_done_dec
-        movl	%r9d, %r13d
-        andl	$0xfffffff0, %r13d
-        cmpl	%r13d, %ebx
-        jge	L_AES_GCM_decrypt_avx1_last_block_done
-L_AES_GCM_decrypt_avx1_last_block_start:
-        vmovdqu	(%rdi,%rbx,1), %xmm13
-        vmovdqa	%xmm5, %xmm0
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm13, %xmm1
-        vpxor	%xmm6, %xmm1, %xmm1
-        vmovdqu	128(%rsp), %xmm9
-        vpshufb	L_avx1_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8
-        vpaddd	L_avx1_aes_gcm_one(%rip), %xmm9, %xmm9
-        vmovdqu	%xmm9, 128(%rsp)
-        vpxor	(%r15), %xmm8, %xmm8
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm10
-        vaesenc	16(%r15), %xmm8, %xmm8
-        vaesenc	32(%r15), %xmm8, %xmm8
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm11
-        vaesenc	48(%r15), %xmm8, %xmm8
-        vaesenc	64(%r15), %xmm8, %xmm8
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm12
-        vaesenc	80(%r15), %xmm8, %xmm8
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vaesenc	96(%r15), %xmm8, %xmm8
-        vpxor	%xmm11, %xmm10, %xmm10
-        vpslldq	$8, %xmm10, %xmm2
-        vpsrldq	$8, %xmm10, %xmm10
-        vaesenc	112(%r15), %xmm8, %xmm8
-        vpxor	%xmm12, %xmm2, %xmm2
-        vpxor	%xmm10, %xmm1, %xmm3
-        vmovdqa	L_avx1_aes_gcm_mod2_128(%rip), %xmm0
-        vpclmulqdq	$16, %xmm0, %xmm2, %xmm11
-        vaesenc	128(%r15), %xmm8, %xmm8
-        vpshufd	$0x4e, %xmm2, %xmm10
-        vpxor	%xmm11, %xmm10, %xmm10
-        vpclmulqdq	$16, %xmm0, %xmm10, %xmm11
-        vaesenc	144(%r15), %xmm8, %xmm8
-        vpshufd	$0x4e, %xmm10, %xmm10
-        vpxor	%xmm11, %xmm10, %xmm10
-        vpxor	%xmm3, %xmm10, %xmm6
-        cmpl	$11, %r10d
-        vmovdqa	160(%r15), %xmm9
-        jl	L_AES_GCM_decrypt_avx1_aesenc_gfmul_last
-        vaesenc	%xmm9, %xmm8, %xmm8
-        vaesenc	176(%r15), %xmm8, %xmm8
-        cmpl	$13, %r10d
-        vmovdqa	192(%r15), %xmm9
-        jl	L_AES_GCM_decrypt_avx1_aesenc_gfmul_last
-        vaesenc	%xmm9, %xmm8, %xmm8
-        vaesenc	208(%r15), %xmm8, %xmm8
-        vmovdqa	224(%r15), %xmm9
-L_AES_GCM_decrypt_avx1_aesenc_gfmul_last:
-        vaesenclast	%xmm9, %xmm8, %xmm8
-        vmovdqa	%xmm13, %xmm0
-        vpxor	%xmm0, %xmm8, %xmm8
-        vmovdqu	%xmm8, (%rsi,%rbx,1)
-        addl	$16, %ebx
-        cmpl	%r13d, %ebx
-        jl	L_AES_GCM_decrypt_avx1_last_block_start
-L_AES_GCM_decrypt_avx1_last_block_done:
-        movl	%r9d, %ecx
-        movl	%ecx, %edx
-        andl	$15, %ecx
-        jz	L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_done
-        vmovdqu	128(%rsp), %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4
-        vpxor	(%r15), %xmm4, %xmm4
-        vaesenc	16(%r15), %xmm4, %xmm4
-        vaesenc	32(%r15), %xmm4, %xmm4
-        vaesenc	48(%r15), %xmm4, %xmm4
-        vaesenc	64(%r15), %xmm4, %xmm4
-        vaesenc	80(%r15), %xmm4, %xmm4
-        vaesenc	96(%r15), %xmm4, %xmm4
-        vaesenc	112(%r15), %xmm4, %xmm4
-        vaesenc	128(%r15), %xmm4, %xmm4
-        vaesenc	144(%r15), %xmm4, %xmm4
-        cmpl	$11, %r10d
-        vmovdqa	160(%r15), %xmm9
-        jl	L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last
-        vaesenc	%xmm9, %xmm4, %xmm4
-        vaesenc	176(%r15), %xmm4, %xmm4
-        cmpl	$13, %r10d
-        vmovdqa	192(%r15), %xmm9
-        jl	L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last
-        vaesenc	%xmm9, %xmm4, %xmm4
-        vaesenc	208(%r15), %xmm4, %xmm4
-        vmovdqa	224(%r15), %xmm9
-L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last:
-        vaesenclast	%xmm9, %xmm4, %xmm4
-        subq	$32, %rsp
-        xorl	%ecx, %ecx
-        vmovdqu	%xmm4, (%rsp)
-        vpxor	%xmm0, %xmm0, %xmm0
-        vmovdqu	%xmm0, 16(%rsp)
-L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_loop:
-        movzbl	(%rdi,%rbx,1), %r13d
-        movb	%r13b, 16(%rsp,%rcx,1)
-        xorb	(%rsp,%rcx,1), %r13b
-        movb	%r13b, (%rsi,%rbx,1)
-        incl	%ebx
-        incl	%ecx
-        cmpl	%edx, %ebx
-        jl	L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_loop
-        vmovdqu	16(%rsp), %xmm4
-        addq	$32, %rsp
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4
-        vpxor	%xmm4, %xmm6, %xmm6
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm5, %xmm9
-        vpshufd	$0x4e, %xmm6, %xmm10
-        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm11
-        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm8
-        vpxor	%xmm5, %xmm9, %xmm9
-        vpxor	%xmm6, %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm10, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpxor	%xmm11, %xmm9, %xmm9
-        vpslldq	$8, %xmm9, %xmm10
-        vpsrldq	$8, %xmm9, %xmm9
-        vpxor	%xmm10, %xmm8, %xmm8
-        vpxor	%xmm9, %xmm11, %xmm6
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm6, %xmm6
-L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_done:
-L_AES_GCM_decrypt_avx1_done_dec:
-        movl	%r9d, %edx
-        movl	%r11d, %ecx
-        shlq	$3, %rdx
-        shlq	$3, %rcx
-        vmovq	%rdx, %xmm0
-        vmovq	%rcx, %xmm1
-        vpunpcklqdq	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm6, %xmm6
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm5, %xmm9
-        vpshufd	$0x4e, %xmm6, %xmm10
-        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm11
-        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm8
-        vpxor	%xmm5, %xmm9, %xmm9
-        vpxor	%xmm6, %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm10, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpxor	%xmm11, %xmm9, %xmm9
-        vpslldq	$8, %xmm9, %xmm10
-        vpsrldq	$8, %xmm9, %xmm9
-        vpxor	%xmm10, %xmm8, %xmm8
-        vpxor	%xmm9, %xmm11, %xmm6
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm6, %xmm6
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6
-        vmovdqu	144(%rsp), %xmm0
-        vpxor	%xmm6, %xmm0, %xmm0
-        cmpl	$16, %r14d
-        je	L_AES_GCM_decrypt_avx1_cmp_tag_16
-        subq	$16, %rsp
-        xorq	%rcx, %rcx
-        xorq	%rbx, %rbx
-        vmovdqu	%xmm0, (%rsp)
-L_AES_GCM_decrypt_avx1_cmp_tag_loop:
-        movzbl	(%rsp,%rcx,1), %r13d
-        xorb	(%r8,%rcx,1), %r13b
-        orb	%r13b, %bl
-        incl	%ecx
-        cmpl	%r14d, %ecx
-        jne	L_AES_GCM_decrypt_avx1_cmp_tag_loop
-        cmpb	$0x00, %bl
-        sete	%bl
-        addq	$16, %rsp
-        xorq	%rcx, %rcx
-        jmp	L_AES_GCM_decrypt_avx1_cmp_tag_done
-L_AES_GCM_decrypt_avx1_cmp_tag_16:
-        vmovdqu	(%r8), %xmm1
-        vpcmpeqb	%xmm1, %xmm0, %xmm0
-        vpmovmskb	%xmm0, %rdx
-        # if %edx == 0xFFFF then return 1 else return 0
-        xorl	%ebx, %ebx
-        cmpl	$0xffff, %edx
-        sete	%bl
-L_AES_GCM_decrypt_avx1_cmp_tag_done:
-        movl	%ebx, (%rbp)
-        vzeroupper
-        addq	$0xa8, %rsp
-        popq	%rbp
-        popq	%r15
-        popq	%r14
-        popq	%rbx
-        popq	%r12
-        popq	%r13
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_decrypt_avx1,.-AES_GCM_decrypt_avx1
-#endif /* __APPLE__ */
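
A note on the tag check at the end of AES_GCM_decrypt_avx1 above: the sub-16-byte path XORs each computed tag byte against the expected byte and ORs the results together, so every byte is examined and no early exit reveals where a mismatch occurs; the 16-byte path gets the same effect in one step with vpcmpeqb/vpmovmskb against the 0xFFFF mask. A minimal portable C sketch of the same comparison (illustrative name, not a wolfSSL function):

    #include <stddef.h>

    /* Returns 1 if the computed tag equals the expected tag, else 0.
     * Every byte is examined regardless of earlier mismatches, so the
     * running time does not depend on where the tags first differ. */
    static int gcm_tag_equal(const unsigned char *calc,
                             const unsigned char *expected, size_t len)
    {
        unsigned char diff = 0;
        size_t i;
        for (i = 0; i < len; i++)
            diff |= (unsigned char)(calc[i] ^ expected[i]);
        return diff == 0;
    }
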
-#ifdef WOLFSSL_AESGCM_STREAM
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_init_avx1
-.type	AES_GCM_init_avx1,@function
-.align	16
-AES_GCM_init_avx1:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_init_avx1
-.p2align	4
-_AES_GCM_init_avx1:
-#endif /* __APPLE__ */
-        pushq	%r12
-        pushq	%r13
-        movq	%rdx, %r10
-        movl	%ecx, %r11d
-        movq	24(%rsp), %rax
-        subq	$16, %rsp
-        vpxor	%xmm4, %xmm4, %xmm4
-        movl	%r11d, %edx
-        cmpl	$12, %edx
-        jne	L_AES_GCM_init_avx1_iv_not_12
-        # Calculate values when IV is 12 bytes
-        # Set counter based on IV
-        movl	$0x1000000, %ecx
-        vmovq	(%r10), %xmm4
-        vpinsrd	$2, 8(%r10), %xmm4, %xmm4
-        vpinsrd	$3, %ecx, %xmm4, %xmm4
-        # H = Encrypt X(=0) and T = Encrypt counter
-        vmovdqa	(%rdi), %xmm5
-        vpxor	%xmm5, %xmm4, %xmm1
-        vmovdqa	16(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	32(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	48(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	64(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	80(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	96(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	112(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	128(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	144(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        cmpl	$11, %esi
-        vmovdqa	160(%rdi), %xmm7
-        jl	L_AES_GCM_init_avx1_calc_iv_12_last
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	176(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        cmpl	$13, %esi
-        vmovdqa	192(%rdi), %xmm7
-        jl	L_AES_GCM_init_avx1_calc_iv_12_last
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	208(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	224(%rdi), %xmm7
-L_AES_GCM_init_avx1_calc_iv_12_last:
-        vaesenclast	%xmm7, %xmm5, %xmm5
-        vaesenclast	%xmm7, %xmm1, %xmm1
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5
-        vmovdqu	%xmm1, %xmm15
-        jmp	L_AES_GCM_init_avx1_iv_done
-L_AES_GCM_init_avx1_iv_not_12:
-        # Calculate values when IV is not 12 bytes
-        # H = Encrypt X(=0)
-        vmovdqa	(%rdi), %xmm5
-        vaesenc	16(%rdi), %xmm5, %xmm5
-        vaesenc	32(%rdi), %xmm5, %xmm5
-        vaesenc	48(%rdi), %xmm5, %xmm5
-        vaesenc	64(%rdi), %xmm5, %xmm5
-        vaesenc	80(%rdi), %xmm5, %xmm5
-        vaesenc	96(%rdi), %xmm5, %xmm5
-        vaesenc	112(%rdi), %xmm5, %xmm5
-        vaesenc	128(%rdi), %xmm5, %xmm5
-        vaesenc	144(%rdi), %xmm5, %xmm5
-        cmpl	$11, %esi
-        vmovdqa	160(%rdi), %xmm9
-        jl	L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last
-        vaesenc	%xmm9, %xmm5, %xmm5
-        vaesenc	176(%rdi), %xmm5, %xmm5
-        cmpl	$13, %esi
-        vmovdqa	192(%rdi), %xmm9
-        jl	L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last
-        vaesenc	%xmm9, %xmm5, %xmm5
-        vaesenc	208(%rdi), %xmm5, %xmm5
-        vmovdqa	224(%rdi), %xmm9
-L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last:
-        vaesenclast	%xmm9, %xmm5, %xmm5
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5
-        # Calc counter
-        # Initialization vector
-        cmpl	$0x00, %edx
-        movq	$0x00, %rcx
-        je	L_AES_GCM_init_avx1_calc_iv_done
-        cmpl	$16, %edx
-        jl	L_AES_GCM_init_avx1_calc_iv_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_init_avx1_calc_iv_16_loop:
-        vmovdqu	(%r10,%rcx,1), %xmm8
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8
-        vpxor	%xmm8, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpshufd	$0x4e, %xmm5, %xmm2
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqa	%xmm0, %xmm7
-        vmovdqa	%xmm3, %xmm4
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm7, %xmm7
-        vpxor	%xmm1, %xmm4, %xmm4
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm4, %xmm4
-        vpslld	$31, %xmm7, %xmm0
-        vpslld	$30, %xmm7, %xmm1
-        vpslld	$25, %xmm7, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vmovdqa	%xmm0, %xmm1
-        vpsrldq	$4, %xmm1, %xmm1
-        vpslldq	$12, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm7, %xmm7
-        vpsrld	$0x01, %xmm7, %xmm2
-        vpsrld	$2, %xmm7, %xmm3
-        vpsrld	$7, %xmm7, %xmm0
-        vpxor	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm0, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm4, %xmm4
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_init_avx1_calc_iv_16_loop
-        movl	%r11d, %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_init_avx1_calc_iv_done
-L_AES_GCM_init_avx1_calc_iv_lt16:
-        subq	$16, %rsp
-        vpxor	%xmm8, %xmm8, %xmm8
-        xorl	%r13d, %r13d
-        vmovdqu	%xmm8, (%rsp)
-L_AES_GCM_init_avx1_calc_iv_loop:
-        movzbl	(%r10,%rcx,1), %r12d
-        movb	%r12b, (%rsp,%r13,1)
-        incl	%ecx
-        incl	%r13d
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_init_avx1_calc_iv_loop
-        vmovdqu	(%rsp), %xmm8
-        addq	$16, %rsp
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8
-        vpxor	%xmm8, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpshufd	$0x4e, %xmm5, %xmm2
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqa	%xmm0, %xmm7
-        vmovdqa	%xmm3, %xmm4
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm7, %xmm7
-        vpxor	%xmm1, %xmm4, %xmm4
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm4, %xmm4
-        vpslld	$31, %xmm7, %xmm0
-        vpslld	$30, %xmm7, %xmm1
-        vpslld	$25, %xmm7, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vmovdqa	%xmm0, %xmm1
-        vpsrldq	$4, %xmm1, %xmm1
-        vpslldq	$12, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm7, %xmm7
-        vpsrld	$0x01, %xmm7, %xmm2
-        vpsrld	$2, %xmm7, %xmm3
-        vpsrld	$7, %xmm7, %xmm0
-        vpxor	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm0, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm4, %xmm4
-L_AES_GCM_init_avx1_calc_iv_done:
-        # T = Encrypt counter
-        vpxor	%xmm0, %xmm0, %xmm0
-        shll	$3, %edx
-        vmovq	%rdx, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpshufd	$0x4e, %xmm5, %xmm2
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqa	%xmm0, %xmm7
-        vmovdqa	%xmm3, %xmm4
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm7, %xmm7
-        vpxor	%xmm1, %xmm4, %xmm4
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm4, %xmm4
-        vpslld	$31, %xmm7, %xmm0
-        vpslld	$30, %xmm7, %xmm1
-        vpslld	$25, %xmm7, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vmovdqa	%xmm0, %xmm1
-        vpsrldq	$4, %xmm1, %xmm1
-        vpslldq	$12, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm7, %xmm7
-        vpsrld	$0x01, %xmm7, %xmm2
-        vpsrld	$2, %xmm7, %xmm3
-        vpsrld	$7, %xmm7, %xmm0
-        vpxor	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm0, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm4, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4
-        #   Encrypt counter
-        vmovdqa	(%rdi), %xmm8
-        vpxor	%xmm4, %xmm8, %xmm8
-        vaesenc	16(%rdi), %xmm8, %xmm8
-        vaesenc	32(%rdi), %xmm8, %xmm8
-        vaesenc	48(%rdi), %xmm8, %xmm8
-        vaesenc	64(%rdi), %xmm8, %xmm8
-        vaesenc	80(%rdi), %xmm8, %xmm8
-        vaesenc	96(%rdi), %xmm8, %xmm8
-        vaesenc	112(%rdi), %xmm8, %xmm8
-        vaesenc	128(%rdi), %xmm8, %xmm8
-        vaesenc	144(%rdi), %xmm8, %xmm8
-        cmpl	$11, %esi
-        vmovdqa	160(%rdi), %xmm9
-        jl	L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last
-        vaesenc	%xmm9, %xmm8, %xmm8
-        vaesenc	176(%rdi), %xmm8, %xmm8
-        cmpl	$13, %esi
-        vmovdqa	192(%rdi), %xmm9
-        jl	L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last
-        vaesenc	%xmm9, %xmm8, %xmm8
-        vaesenc	208(%rdi), %xmm8, %xmm8
-        vmovdqa	224(%rdi), %xmm9
-L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last:
-        vaesenclast	%xmm9, %xmm8, %xmm8
-        vmovdqu	%xmm8, %xmm15
-L_AES_GCM_init_avx1_iv_done:
-        vmovdqa	%xmm15, (%rax)
-        vpshufb	L_avx1_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4
-        vpaddd	L_avx1_aes_gcm_one(%rip), %xmm4, %xmm4
-        vmovdqa	%xmm5, (%r8)
-        vmovdqa	%xmm4, (%r9)
-        vzeroupper
-        addq	$16, %rsp
-        popq	%r13
-        popq	%r12
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_init_avx1,.-AES_GCM_init_avx1
-#endif /* __APPLE__ */
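
AES_GCM_init_avx1 above follows NIST SP 800-38D for the initial counter block: a 12-byte IV is used directly with a big-endian 32-bit block counter of 1 appended (the movl $0x1000000 / vpinsrd $3 pair builds exactly those four bytes), while any other IV length is folded through GHASH, including a final 128-bit length block (the shll $3, %edx / vmovq %rdx sequence supplies the IV bit length). A sketch of the 12-byte path and of the inc32 counter step (hypothetical helper names, assuming a big-endian counter block):

    #include <stdint.h>
    #include <string.h>

    /* J0 = IV || 0x00000001 for the common 96-bit IV case. */
    static void gcm_counter_from_iv12(const uint8_t iv[12], uint8_t ctr[16])
    {
        memcpy(ctr, iv, 12);
        ctr[12] = 0x00; ctr[13] = 0x00; ctr[14] = 0x00; ctr[15] = 0x01;
    }

    /* inc32: bump the low 32 bits big-endian, wrapping mod 2^32; this is
     * what the vpshufb/vpaddd L_avx1_aes_gcm_one sequence computes. */
    static void gcm_inc32(uint8_t ctr[16])
    {
        int i;
        for (i = 15; i >= 12; i--)
            if (++ctr[i] != 0)
                break;
    }
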
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_aad_update_avx1
-.type	AES_GCM_aad_update_avx1,@function
-.align	16
-AES_GCM_aad_update_avx1:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_aad_update_avx1
-.p2align	4
-_AES_GCM_aad_update_avx1:
-#endif /* __APPLE__ */
-        movq	%rcx, %rax
-        vmovdqa	(%rdx), %xmm5
-        vmovdqa	(%rax), %xmm6
-        xorl	%ecx, %ecx
-L_AES_GCM_aad_update_avx1_16_loop:
-        vmovdqu	(%rdi,%rcx,1), %xmm8
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8
-        vpxor	%xmm8, %xmm5, %xmm5
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm5, %xmm1
-        vpshufd	$0x4e, %xmm6, %xmm2
-        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm3
-        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm0
-        vpxor	%xmm5, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqa	%xmm0, %xmm4
-        vmovdqa	%xmm3, %xmm5
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm4, %xmm4
-        vpxor	%xmm1, %xmm5, %xmm5
-        vpsrld	$31, %xmm4, %xmm0
-        vpsrld	$31, %xmm5, %xmm1
-        vpslld	$0x01, %xmm4, %xmm4
-        vpslld	$0x01, %xmm5, %xmm5
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm5, %xmm5
-        vpor	%xmm0, %xmm4, %xmm4
-        vpor	%xmm1, %xmm5, %xmm5
-        vpslld	$31, %xmm4, %xmm0
-        vpslld	$30, %xmm4, %xmm1
-        vpslld	$25, %xmm4, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vmovdqa	%xmm0, %xmm1
-        vpsrldq	$4, %xmm1, %xmm1
-        vpslldq	$12, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpsrld	$0x01, %xmm4, %xmm2
-        vpsrld	$2, %xmm4, %xmm3
-        vpsrld	$7, %xmm4, %xmm0
-        vpxor	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm0, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm4, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm5, %xmm5
-        addl	$16, %ecx
-        cmpl	%esi, %ecx
-        jl	L_AES_GCM_aad_update_avx1_16_loop
-        vmovdqa	%xmm5, (%rdx)
-        vzeroupper
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_aad_update_avx1,.-AES_GCM_aad_update_avx1
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_encrypt_block_avx1
-.type	AES_GCM_encrypt_block_avx1,@function
-.align	16
-AES_GCM_encrypt_block_avx1:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_encrypt_block_avx1
-.p2align	4
-_AES_GCM_encrypt_block_avx1:
-#endif /* __APPLE__ */
-        movq	%rdx, %r10
-        movq	%rcx, %r11
-        vmovdqu	(%r8), %xmm9
-        vpshufb	L_avx1_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8
-        vpaddd	L_avx1_aes_gcm_one(%rip), %xmm9, %xmm9
-        vmovdqu	%xmm9, (%r8)
-        vpxor	(%rdi), %xmm8, %xmm8
-        vaesenc	16(%rdi), %xmm8, %xmm8
-        vaesenc	32(%rdi), %xmm8, %xmm8
-        vaesenc	48(%rdi), %xmm8, %xmm8
-        vaesenc	64(%rdi), %xmm8, %xmm8
-        vaesenc	80(%rdi), %xmm8, %xmm8
-        vaesenc	96(%rdi), %xmm8, %xmm8
-        vaesenc	112(%rdi), %xmm8, %xmm8
-        vaesenc	128(%rdi), %xmm8, %xmm8
-        vaesenc	144(%rdi), %xmm8, %xmm8
-        cmpl	$11, %esi
-        vmovdqa	160(%rdi), %xmm9
-        jl	L_AES_GCM_encrypt_block_avx1_aesenc_block_last
-        vaesenc	%xmm9, %xmm8, %xmm8
-        vaesenc	176(%rdi), %xmm8, %xmm8
-        cmpl	$13, %esi
-        vmovdqa	192(%rdi), %xmm9
-        jl	L_AES_GCM_encrypt_block_avx1_aesenc_block_last
-        vaesenc	%xmm9, %xmm8, %xmm8
-        vaesenc	208(%rdi), %xmm8, %xmm8
-        vmovdqa	224(%rdi), %xmm9
-L_AES_GCM_encrypt_block_avx1_aesenc_block_last:
-        vaesenclast	%xmm9, %xmm8, %xmm8
-        vmovdqu	(%r11), %xmm9
-        vpxor	%xmm9, %xmm8, %xmm8
-        vmovdqu	%xmm8, (%r10)
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8
-        vzeroupper
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_encrypt_block_avx1,.-AES_GCM_encrypt_block_avx1
-#endif /* __APPLE__ */
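
AES_GCM_encrypt_block_avx1 above performs one CTR step: encrypt the current counter block, store the incremented counter back, and XOR the keystream into a single 16-byte block. The same structure in portable C, with the AES rounds behind a caller-supplied callback since the assembly inlines them (a sketch, not the wolfSSL API):

    #include <stdint.h>

    typedef void (*block_cipher_fn)(const void *key, const uint8_t in[16],
                                    uint8_t out[16]);

    static void gcm_ctr_block(const void *key, block_cipher_fn enc,
                              uint8_t counter[16],  /* big-endian block counter */
                              const uint8_t in[16], uint8_t out[16])
    {
        uint8_t keystream[16];
        int i;

        enc(key, counter, keystream);    /* keystream = E_K(counter) */
        for (i = 15; i >= 12; i--)       /* inc32(counter), big-endian */
            if (++counter[i] != 0)
                break;
        for (i = 0; i < 16; i++)
            out[i] = in[i] ^ keystream[i];
    }

A caller drives this once per full block; the bulk loops above do the same work eight blocks at a time, interleaving the vaesenc rounds of all eight counters.
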
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_ghash_block_avx1
-.type	AES_GCM_ghash_block_avx1,@function
-.align	16
-AES_GCM_ghash_block_avx1:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_ghash_block_avx1
-.p2align	4
-_AES_GCM_ghash_block_avx1:
-#endif /* __APPLE__ */
-        vmovdqa	(%rsi), %xmm4
-        vmovdqa	(%rdx), %xmm5
-        vmovdqu	(%rdi), %xmm8
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8
-        vpxor	%xmm8, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpshufd	$0x4e, %xmm5, %xmm2
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqa	%xmm0, %xmm6
-        vmovdqa	%xmm3, %xmm4
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm6, %xmm6
-        vpxor	%xmm1, %xmm4, %xmm4
-        vpsrld	$31, %xmm6, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm6, %xmm6
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm6, %xmm6
-        vpor	%xmm1, %xmm4, %xmm4
-        vpslld	$31, %xmm6, %xmm0
-        vpslld	$30, %xmm6, %xmm1
-        vpslld	$25, %xmm6, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vmovdqa	%xmm0, %xmm1
-        vpsrldq	$4, %xmm1, %xmm1
-        vpslldq	$12, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm6, %xmm6
-        vpsrld	$0x01, %xmm6, %xmm2
-        vpsrld	$2, %xmm6, %xmm3
-        vpsrld	$7, %xmm6, %xmm0
-        vpxor	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm0, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm4, %xmm4
-        vmovdqa	%xmm4, (%rsi)
-        vzeroupper
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_ghash_block_avx1,.-AES_GCM_ghash_block_avx1
-#endif /* __APPLE__ */
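
The ghash_gfmul_avx sequences that recur throughout this file, and AES_GCM_ghash_block_avx1 in particular, compute one GHASH update X = (X xor block) * H in GF(2^128). The assembly uses vpclmulqdq carry-less multiplies followed by a shift-and-XOR reduction by x^128 + x^7 + x^2 + x + 1, with the vpshufb byte swaps putting blocks into the representation that code expects. For reference, the bit-serial version from NIST SP 800-38D Algorithm 1, operating directly in the spec's byte order (a portable sketch, far slower than the code above):

    #include <stdint.h>
    #include <string.h>

    /* Multiply x and y in GF(2^128) with the GCM reduction polynomial. */
    static void gf128_mul(const uint8_t x[16], const uint8_t y[16],
                          uint8_t z[16])
    {
        uint8_t acc[16] = {0};
        uint8_t v[16];
        int i, j, lsb;

        memcpy(v, y, 16);
        for (i = 0; i < 128; i++) {
            if (x[i / 8] & (0x80u >> (i % 8))) {   /* bit i of x, MSB first */
                for (j = 0; j < 16; j++)
                    acc[j] ^= v[j];                /* acc ^= v */
            }
            lsb = v[15] & 1;                       /* bit about to fall off */
            for (j = 15; j > 0; j--)               /* v >>= 1 */
                v[j] = (uint8_t)((v[j] >> 1) | (uint8_t)(v[j - 1] << 7));
            v[0] >>= 1;
            if (lsb)
                v[0] ^= 0xE1;                      /* reduce: x^128+x^7+x^2+x+1 */
        }
        memcpy(z, acc, 16);
    }

    /* One GHASH block update, as AES_GCM_ghash_block_avx1 does above. */
    static void ghash_block(uint8_t x[16], const uint8_t h[16],
                            const uint8_t block[16])
    {
        uint8_t t[16];
        int i;
        for (i = 0; i < 16; i++)
            t[i] = x[i] ^ block[i];
        gf128_mul(t, h, x);
    }

The H ^ 1 through H ^ 8 table built at the top of the bulk loops above exists so that eight of these multiplies can share a single reduction per 128 bytes of data; the bit-serial form just makes the underlying field arithmetic visible.
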
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_encrypt_update_avx1
-.type	AES_GCM_encrypt_update_avx1,@function
-.align	16
-AES_GCM_encrypt_update_avx1:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_encrypt_update_avx1
-.p2align	4
-_AES_GCM_encrypt_update_avx1:
-#endif /* __APPLE__ */
-        pushq	%r13
-        pushq	%r12
-        pushq	%r14
-        movq	%rdx, %r10
-        movq	%rcx, %r11
-        movq	32(%rsp), %rax
-        movq	40(%rsp), %r12
-        subq	$0xa0, %rsp
-        vmovdqa	(%r9), %xmm6
-        vmovdqa	(%rax), %xmm5
-        vpsrlq	$63, %xmm5, %xmm9
-        vpsllq	$0x01, %xmm5, %xmm8
-        vpslldq	$8, %xmm9, %xmm9
-        vpor	%xmm9, %xmm8, %xmm8
-        vpshufd	$0xff, %xmm5, %xmm5
-        vpsrad	$31, %xmm5, %xmm5
-        vpand	L_avx1_aes_gcm_mod2_128(%rip), %xmm5, %xmm5
-        vpxor	%xmm8, %xmm5, %xmm5
-        xorl	%r14d, %r14d
-        cmpl	$0x80, %r8d
-        movl	%r8d, %r13d
-        jl	L_AES_GCM_encrypt_update_avx1_done_128
-        andl	$0xffffff80, %r13d
-        vmovdqa	%xmm6, %xmm2
-        # H ^ 1
-        vmovdqu	%xmm5, (%rsp)
-        # H ^ 2
-        vpclmulqdq	$0x00, %xmm5, %xmm5, %xmm8
-        vpclmulqdq	$0x11, %xmm5, %xmm5, %xmm0
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm0, %xmm0
-        vmovdqu	%xmm0, 16(%rsp)
-        # H ^ 3
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm5, %xmm9
-        vpshufd	$0x4e, %xmm0, %xmm10
-        vpclmulqdq	$0x11, %xmm5, %xmm0, %xmm11
-        vpclmulqdq	$0x00, %xmm5, %xmm0, %xmm8
-        vpxor	%xmm5, %xmm9, %xmm9
-        vpxor	%xmm0, %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm10, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpxor	%xmm11, %xmm9, %xmm9
-        vpslldq	$8, %xmm9, %xmm10
-        vpsrldq	$8, %xmm9, %xmm9
-        vpxor	%xmm10, %xmm8, %xmm8
-        vpxor	%xmm9, %xmm11, %xmm1
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm1, %xmm1
-        vmovdqu	%xmm1, 32(%rsp)
-        # H ^ 4
-        vpclmulqdq	$0x00, %xmm0, %xmm0, %xmm8
-        vpclmulqdq	$0x11, %xmm0, %xmm0, %xmm3
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm3, %xmm3
-        vmovdqu	%xmm3, 48(%rsp)
-        # H ^ 5
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm0, %xmm9
-        vpshufd	$0x4e, %xmm1, %xmm10
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm11
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm8
-        vpxor	%xmm0, %xmm9, %xmm9
-        vpxor	%xmm1, %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm10, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpxor	%xmm11, %xmm9, %xmm9
-        vpslldq	$8, %xmm9, %xmm10
-        vpsrldq	$8, %xmm9, %xmm9
-        vpxor	%xmm10, %xmm8, %xmm8
-        vpxor	%xmm9, %xmm11, %xmm7
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm7, %xmm7
-        vmovdqu	%xmm7, 64(%rsp)
-        # H ^ 6
-        vpclmulqdq	$0x00, %xmm1, %xmm1, %xmm8
-        vpclmulqdq	$0x11, %xmm1, %xmm1, %xmm7
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm7, %xmm7
-        vmovdqu	%xmm7, 80(%rsp)
-        # H ^ 7
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm1, %xmm9
-        vpshufd	$0x4e, %xmm3, %xmm10
-        vpclmulqdq	$0x11, %xmm1, %xmm3, %xmm11
-        vpclmulqdq	$0x00, %xmm1, %xmm3, %xmm8
-        vpxor	%xmm1, %xmm9, %xmm9
-        vpxor	%xmm3, %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm10, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpxor	%xmm11, %xmm9, %xmm9
-        vpslldq	$8, %xmm9, %xmm10
-        vpsrldq	$8, %xmm9, %xmm9
-        vpxor	%xmm10, %xmm8, %xmm8
-        vpxor	%xmm9, %xmm11, %xmm7
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm7, %xmm7
-        vmovdqu	%xmm7, 96(%rsp)
-        # H ^ 8
-        vpclmulqdq	$0x00, %xmm3, %xmm3, %xmm8
-        vpclmulqdq	$0x11, %xmm3, %xmm3, %xmm7
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm7, %xmm7
-        vmovdqu	%xmm7, 112(%rsp)
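-        # Powers H^1..H^8 of the hash key are now cached at (%rsp)..112(%rsp)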
-        # First 128 bytes of input
-        vmovdqu	(%r12), %xmm0
-        vmovdqa	L_avx1_aes_gcm_bswap_epi64(%rip), %xmm1
-        vpshufb	%xmm1, %xmm0, %xmm8
-        vpaddd	L_avx1_aes_gcm_one(%rip), %xmm0, %xmm9
-        vpshufb	%xmm1, %xmm9, %xmm9
-        vpaddd	L_avx1_aes_gcm_two(%rip), %xmm0, %xmm10
-        vpshufb	%xmm1, %xmm10, %xmm10
-        vpaddd	L_avx1_aes_gcm_three(%rip), %xmm0, %xmm11
-        vpshufb	%xmm1, %xmm11, %xmm11
-        vpaddd	L_avx1_aes_gcm_four(%rip), %xmm0, %xmm12
-        vpshufb	%xmm1, %xmm12, %xmm12
-        vpaddd	L_avx1_aes_gcm_five(%rip), %xmm0, %xmm13
-        vpshufb	%xmm1, %xmm13, %xmm13
-        vpaddd	L_avx1_aes_gcm_six(%rip), %xmm0, %xmm14
-        vpshufb	%xmm1, %xmm14, %xmm14
-        vpaddd	L_avx1_aes_gcm_seven(%rip), %xmm0, %xmm15
-        vpshufb	%xmm1, %xmm15, %xmm15
-        vpaddd	L_avx1_aes_gcm_eight(%rip), %xmm0, %xmm0
-        vmovdqa	(%rdi), %xmm7
-        vmovdqu	%xmm0, (%r12)
-        vpxor	%xmm7, %xmm8, %xmm8
-        vpxor	%xmm7, %xmm9, %xmm9
-        vpxor	%xmm7, %xmm10, %xmm10
-        vpxor	%xmm7, %xmm11, %xmm11
-        vpxor	%xmm7, %xmm12, %xmm12
-        vpxor	%xmm7, %xmm13, %xmm13
-        vpxor	%xmm7, %xmm14, %xmm14
-        vpxor	%xmm7, %xmm15, %xmm15
-        vmovdqa	16(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	32(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	48(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	64(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	80(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	96(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	112(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	128(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	144(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
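-        # Extra rounds for AES-192 (12 rounds) and AES-256 (14 rounds)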
-        cmpl	$11, %esi
-        vmovdqa	160(%rdi), %xmm7
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_128_enc_done
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	176(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        cmpl	$13, %esi
-        vmovdqa	192(%rdi), %xmm7
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_128_enc_done
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	208(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	224(%rdi), %xmm7
-L_AES_GCM_encrypt_update_avx1_aesenc_128_enc_done:
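-        # Final round, then XOR the keystream with the plaintext and store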
-        vaesenclast	%xmm7, %xmm8, %xmm8
-        vaesenclast	%xmm7, %xmm9, %xmm9
-        vmovdqu	(%r11), %xmm0
-        vmovdqu	16(%r11), %xmm1
-        vpxor	%xmm0, %xmm8, %xmm8
-        vpxor	%xmm1, %xmm9, %xmm9
-        vmovdqu	%xmm8, (%r10)
-        vmovdqu	%xmm9, 16(%r10)
-        vaesenclast	%xmm7, %xmm10, %xmm10
-        vaesenclast	%xmm7, %xmm11, %xmm11
-        vmovdqu	32(%r11), %xmm0
-        vmovdqu	48(%r11), %xmm1
-        vpxor	%xmm0, %xmm10, %xmm10
-        vpxor	%xmm1, %xmm11, %xmm11
-        vmovdqu	%xmm10, 32(%r10)
-        vmovdqu	%xmm11, 48(%r10)
-        vaesenclast	%xmm7, %xmm12, %xmm12
-        vaesenclast	%xmm7, %xmm13, %xmm13
-        vmovdqu	64(%r11), %xmm0
-        vmovdqu	80(%r11), %xmm1
-        vpxor	%xmm0, %xmm12, %xmm12
-        vpxor	%xmm1, %xmm13, %xmm13
-        vmovdqu	%xmm12, 64(%r10)
-        vmovdqu	%xmm13, 80(%r10)
-        vaesenclast	%xmm7, %xmm14, %xmm14
-        vaesenclast	%xmm7, %xmm15, %xmm15
-        vmovdqu	96(%r11), %xmm0
-        vmovdqu	112(%r11), %xmm1
-        vpxor	%xmm0, %xmm14, %xmm14
-        vpxor	%xmm1, %xmm15, %xmm15
-        vmovdqu	%xmm14, 96(%r10)
-        vmovdqu	%xmm15, 112(%r10)
-        cmpl	$0x80, %r13d
-        movl	$0x80, %r14d
-        jle	L_AES_GCM_encrypt_update_avx1_end_128
-        # More 128 bytes of input
-L_AES_GCM_encrypt_update_avx1_ghash_128:
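-        # Interleave AES-CTR rounds with GHASH of the previous 128-byte ciphertext chunk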
-        leaq	(%r11,%r14,1), %rcx
-        leaq	(%r10,%r14,1), %rdx
-        vmovdqu	(%r12), %xmm0
-        vmovdqa	L_avx1_aes_gcm_bswap_epi64(%rip), %xmm1
-        vpshufb	%xmm1, %xmm0, %xmm8
-        vpaddd	L_avx1_aes_gcm_one(%rip), %xmm0, %xmm9
-        vpshufb	%xmm1, %xmm9, %xmm9
-        vpaddd	L_avx1_aes_gcm_two(%rip), %xmm0, %xmm10
-        vpshufb	%xmm1, %xmm10, %xmm10
-        vpaddd	L_avx1_aes_gcm_three(%rip), %xmm0, %xmm11
-        vpshufb	%xmm1, %xmm11, %xmm11
-        vpaddd	L_avx1_aes_gcm_four(%rip), %xmm0, %xmm12
-        vpshufb	%xmm1, %xmm12, %xmm12
-        vpaddd	L_avx1_aes_gcm_five(%rip), %xmm0, %xmm13
-        vpshufb	%xmm1, %xmm13, %xmm13
-        vpaddd	L_avx1_aes_gcm_six(%rip), %xmm0, %xmm14
-        vpshufb	%xmm1, %xmm14, %xmm14
-        vpaddd	L_avx1_aes_gcm_seven(%rip), %xmm0, %xmm15
-        vpshufb	%xmm1, %xmm15, %xmm15
-        vpaddd	L_avx1_aes_gcm_eight(%rip), %xmm0, %xmm0
-        vmovdqa	(%rdi), %xmm7
-        vmovdqu	%xmm0, (%r12)
-        vpxor	%xmm7, %xmm8, %xmm8
-        vpxor	%xmm7, %xmm9, %xmm9
-        vpxor	%xmm7, %xmm10, %xmm10
-        vpxor	%xmm7, %xmm11, %xmm11
-        vpxor	%xmm7, %xmm12, %xmm12
-        vpxor	%xmm7, %xmm13, %xmm13
-        vpxor	%xmm7, %xmm14, %xmm14
-        vpxor	%xmm7, %xmm15, %xmm15
-        vmovdqu	112(%rsp), %xmm7
-        vmovdqu	-128(%rdx), %xmm0
-        vaesenc	16(%rdi), %xmm8, %xmm8
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm1
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm3
-        vaesenc	16(%rdi), %xmm9, %xmm9
-        vaesenc	16(%rdi), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm2
-        vaesenc	16(%rdi), %xmm11, %xmm11
-        vaesenc	16(%rdi), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm1, %xmm1
-        vaesenc	16(%rdi), %xmm13, %xmm13
-        vaesenc	16(%rdi), %xmm14, %xmm14
-        vaesenc	16(%rdi), %xmm15, %xmm15
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqu	96(%rsp), %xmm7
-        vmovdqu	-112(%rdx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vaesenc	32(%rdi), %xmm8, %xmm8
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vaesenc	32(%rdi), %xmm9, %xmm9
-        vaesenc	32(%rdi), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vaesenc	32(%rdi), %xmm11, %xmm11
-        vaesenc	32(%rdi), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vaesenc	32(%rdi), %xmm13, %xmm13
-        vaesenc	32(%rdi), %xmm14, %xmm14
-        vaesenc	32(%rdi), %xmm15, %xmm15
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	80(%rsp), %xmm7
-        vmovdqu	-96(%rdx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vaesenc	48(%rdi), %xmm8, %xmm8
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vaesenc	48(%rdi), %xmm9, %xmm9
-        vaesenc	48(%rdi), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vaesenc	48(%rdi), %xmm11, %xmm11
-        vaesenc	48(%rdi), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vaesenc	48(%rdi), %xmm13, %xmm13
-        vaesenc	48(%rdi), %xmm14, %xmm14
-        vaesenc	48(%rdi), %xmm15, %xmm15
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	64(%rsp), %xmm7
-        vmovdqu	-80(%rdx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vaesenc	64(%rdi), %xmm8, %xmm8
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vaesenc	64(%rdi), %xmm9, %xmm9
-        vaesenc	64(%rdi), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vaesenc	64(%rdi), %xmm11, %xmm11
-        vaesenc	64(%rdi), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vaesenc	64(%rdi), %xmm13, %xmm13
-        vaesenc	64(%rdi), %xmm14, %xmm14
-        vaesenc	64(%rdi), %xmm15, %xmm15
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	48(%rsp), %xmm7
-        vmovdqu	-64(%rdx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vaesenc	80(%rdi), %xmm8, %xmm8
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vaesenc	80(%rdi), %xmm9, %xmm9
-        vaesenc	80(%rdi), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vaesenc	80(%rdi), %xmm11, %xmm11
-        vaesenc	80(%rdi), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vaesenc	80(%rdi), %xmm13, %xmm13
-        vaesenc	80(%rdi), %xmm14, %xmm14
-        vaesenc	80(%rdi), %xmm15, %xmm15
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	32(%rsp), %xmm7
-        vmovdqu	-48(%rdx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vaesenc	96(%rdi), %xmm8, %xmm8
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vaesenc	96(%rdi), %xmm9, %xmm9
-        vaesenc	96(%rdi), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vaesenc	96(%rdi), %xmm11, %xmm11
-        vaesenc	96(%rdi), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vaesenc	96(%rdi), %xmm13, %xmm13
-        vaesenc	96(%rdi), %xmm14, %xmm14
-        vaesenc	96(%rdi), %xmm15, %xmm15
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	16(%rsp), %xmm7
-        vmovdqu	-32(%rdx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vaesenc	112(%rdi), %xmm8, %xmm8
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vaesenc	112(%rdi), %xmm9, %xmm9
-        vaesenc	112(%rdi), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vaesenc	112(%rdi), %xmm11, %xmm11
-        vaesenc	112(%rdi), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vaesenc	112(%rdi), %xmm13, %xmm13
-        vaesenc	112(%rdi), %xmm14, %xmm14
-        vaesenc	112(%rdi), %xmm15, %xmm15
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	(%rsp), %xmm7
-        vmovdqu	-16(%rdx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vaesenc	128(%rdi), %xmm8, %xmm8
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vaesenc	128(%rdi), %xmm9, %xmm9
-        vaesenc	128(%rdi), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vaesenc	128(%rdi), %xmm11, %xmm11
-        vaesenc	128(%rdi), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vaesenc	128(%rdi), %xmm13, %xmm13
-        vaesenc	128(%rdi), %xmm14, %xmm14
-        vaesenc	128(%rdi), %xmm15, %xmm15
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpslldq	$8, %xmm1, %xmm5
-        vpsrldq	$8, %xmm1, %xmm1
-        vaesenc	144(%rdi), %xmm8, %xmm8
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm3, %xmm3
-        vaesenc	144(%rdi), %xmm9, %xmm9
-        vpslld	$31, %xmm2, %xmm7
-        vpslld	$30, %xmm2, %xmm4
-        vpslld	$25, %xmm2, %xmm5
-        vaesenc	144(%rdi), %xmm10, %xmm10
-        vpxor	%xmm4, %xmm7, %xmm7
-        vpxor	%xmm5, %xmm7, %xmm7
-        vaesenc	144(%rdi), %xmm11, %xmm11
-        vpsrldq	$4, %xmm7, %xmm4
-        vpslldq	$12, %xmm7, %xmm7
-        vaesenc	144(%rdi), %xmm12, %xmm12
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpsrld	$0x01, %xmm2, %xmm5
-        vaesenc	144(%rdi), %xmm13, %xmm13
-        vpsrld	$2, %xmm2, %xmm1
-        vpsrld	$7, %xmm2, %xmm0
-        vaesenc	144(%rdi), %xmm14, %xmm14
-        vpxor	%xmm1, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vaesenc	144(%rdi), %xmm15, %xmm15
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpxor	%xmm3, %xmm2, %xmm2
-        cmpl	$11, %esi
-        vmovdqa	160(%rdi), %xmm7
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_128_ghash_avx_done
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	176(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        cmpl	$13, %esi
-        vmovdqa	192(%rdi), %xmm7
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_128_ghash_avx_done
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	208(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	224(%rdi), %xmm7
-L_AES_GCM_encrypt_update_avx1_aesenc_128_ghash_avx_done:
-        vaesenclast	%xmm7, %xmm8, %xmm8
-        vaesenclast	%xmm7, %xmm9, %xmm9
-        vmovdqu	(%rcx), %xmm0
-        vmovdqu	16(%rcx), %xmm1
-        vpxor	%xmm0, %xmm8, %xmm8
-        vpxor	%xmm1, %xmm9, %xmm9
-        vmovdqu	%xmm8, (%rdx)
-        vmovdqu	%xmm9, 16(%rdx)
-        vaesenclast	%xmm7, %xmm10, %xmm10
-        vaesenclast	%xmm7, %xmm11, %xmm11
-        vmovdqu	32(%rcx), %xmm0
-        vmovdqu	48(%rcx), %xmm1
-        vpxor	%xmm0, %xmm10, %xmm10
-        vpxor	%xmm1, %xmm11, %xmm11
-        vmovdqu	%xmm10, 32(%rdx)
-        vmovdqu	%xmm11, 48(%rdx)
-        vaesenclast	%xmm7, %xmm12, %xmm12
-        vaesenclast	%xmm7, %xmm13, %xmm13
-        vmovdqu	64(%rcx), %xmm0
-        vmovdqu	80(%rcx), %xmm1
-        vpxor	%xmm0, %xmm12, %xmm12
-        vpxor	%xmm1, %xmm13, %xmm13
-        vmovdqu	%xmm12, 64(%rdx)
-        vmovdqu	%xmm13, 80(%rdx)
-        vaesenclast	%xmm7, %xmm14, %xmm14
-        vaesenclast	%xmm7, %xmm15, %xmm15
-        vmovdqu	96(%rcx), %xmm0
-        vmovdqu	112(%rcx), %xmm1
-        vpxor	%xmm0, %xmm14, %xmm14
-        vpxor	%xmm1, %xmm15, %xmm15
-        vmovdqu	%xmm14, 96(%rdx)
-        vmovdqu	%xmm15, 112(%rdx)
-        addl	$0x80, %r14d
-        cmpl	%r13d, %r14d
-        jl	L_AES_GCM_encrypt_update_avx1_ghash_128
-L_AES_GCM_encrypt_update_avx1_end_128:
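-        # GHASH the final 128 bytes of ciphertext using the cached powers of H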
-        vmovdqa	L_avx1_aes_gcm_bswap_mask(%rip), %xmm4
-        vpshufb	%xmm4, %xmm8, %xmm8
-        vpshufb	%xmm4, %xmm9, %xmm9
-        vpshufb	%xmm4, %xmm10, %xmm10
-        vpshufb	%xmm4, %xmm11, %xmm11
-        vpxor	%xmm2, %xmm8, %xmm8
-        vpshufb	%xmm4, %xmm12, %xmm12
-        vpshufb	%xmm4, %xmm13, %xmm13
-        vpshufb	%xmm4, %xmm14, %xmm14
-        vpshufb	%xmm4, %xmm15, %xmm15
-        vmovdqu	(%rsp), %xmm7
-        vmovdqu	16(%rsp), %xmm5
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm15, %xmm1
-        vpshufd	$0x4e, %xmm7, %xmm2
-        vpclmulqdq	$0x11, %xmm15, %xmm7, %xmm3
-        vpclmulqdq	$0x00, %xmm15, %xmm7, %xmm0
-        vpxor	%xmm15, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqa	%xmm0, %xmm4
-        vmovdqa	%xmm3, %xmm6
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm4, %xmm4
-        vpxor	%xmm1, %xmm6, %xmm6
-        # ghash_gfmul_xor_avx
-        vpshufd	$0x4e, %xmm14, %xmm1
-        vpshufd	$0x4e, %xmm5, %xmm2
-        vpclmulqdq	$0x11, %xmm14, %xmm5, %xmm3
-        vpclmulqdq	$0x00, %xmm14, %xmm5, %xmm0
-        vpxor	%xmm14, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm4, %xmm4
-        vpxor	%xmm1, %xmm6, %xmm6
-        vmovdqu	32(%rsp), %xmm7
-        vmovdqu	48(%rsp), %xmm5
-        # ghash_gfmul_xor_avx
-        vpshufd	$0x4e, %xmm13, %xmm1
-        vpshufd	$0x4e, %xmm7, %xmm2
-        vpclmulqdq	$0x11, %xmm13, %xmm7, %xmm3
-        vpclmulqdq	$0x00, %xmm13, %xmm7, %xmm0
-        vpxor	%xmm13, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm4, %xmm4
-        vpxor	%xmm1, %xmm6, %xmm6
-        # ghash_gfmul_xor_avx
-        vpshufd	$0x4e, %xmm12, %xmm1
-        vpshufd	$0x4e, %xmm5, %xmm2
-        vpclmulqdq	$0x11, %xmm12, %xmm5, %xmm3
-        vpclmulqdq	$0x00, %xmm12, %xmm5, %xmm0
-        vpxor	%xmm12, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm4, %xmm4
-        vpxor	%xmm1, %xmm6, %xmm6
-        vmovdqu	64(%rsp), %xmm7
-        vmovdqu	80(%rsp), %xmm5
-        # ghash_gfmul_xor_avx
-        vpshufd	$0x4e, %xmm11, %xmm1
-        vpshufd	$0x4e, %xmm7, %xmm2
-        vpclmulqdq	$0x11, %xmm11, %xmm7, %xmm3
-        vpclmulqdq	$0x00, %xmm11, %xmm7, %xmm0
-        vpxor	%xmm11, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm4, %xmm4
-        vpxor	%xmm1, %xmm6, %xmm6
-        # ghash_gfmul_xor_avx
-        vpshufd	$0x4e, %xmm10, %xmm1
-        vpshufd	$0x4e, %xmm5, %xmm2
-        vpclmulqdq	$0x11, %xmm10, %xmm5, %xmm3
-        vpclmulqdq	$0x00, %xmm10, %xmm5, %xmm0
-        vpxor	%xmm10, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm4, %xmm4
-        vpxor	%xmm1, %xmm6, %xmm6
-        vmovdqu	96(%rsp), %xmm7
-        vmovdqu	112(%rsp), %xmm5
-        # ghash_gfmul_xor_avx
-        vpshufd	$0x4e, %xmm9, %xmm1
-        vpshufd	$0x4e, %xmm7, %xmm2
-        vpclmulqdq	$0x11, %xmm9, %xmm7, %xmm3
-        vpclmulqdq	$0x00, %xmm9, %xmm7, %xmm0
-        vpxor	%xmm9, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm4, %xmm4
-        vpxor	%xmm1, %xmm6, %xmm6
-        # ghash_gfmul_xor_avx
-        vpshufd	$0x4e, %xmm8, %xmm1
-        vpshufd	$0x4e, %xmm5, %xmm2
-        vpclmulqdq	$0x11, %xmm8, %xmm5, %xmm3
-        vpclmulqdq	$0x00, %xmm8, %xmm5, %xmm0
-        vpxor	%xmm8, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm4, %xmm4
-        vpxor	%xmm1, %xmm6, %xmm6
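-        # Reduce the 256-bit product modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1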
-        vpslld	$31, %xmm4, %xmm0
-        vpslld	$30, %xmm4, %xmm1
-        vpslld	$25, %xmm4, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vmovdqa	%xmm0, %xmm1
-        vpsrldq	$4, %xmm1, %xmm1
-        vpslldq	$12, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpsrld	$0x01, %xmm4, %xmm2
-        vpsrld	$2, %xmm4, %xmm3
-        vpsrld	$7, %xmm4, %xmm0
-        vpxor	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm0, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm4, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm6, %xmm6
-        vmovdqu	(%rsp), %xmm5
-L_AES_GCM_encrypt_update_avx1_done_128:
-        movl	%r8d, %edx
-        cmpl	%edx, %r14d
-        jge	L_AES_GCM_encrypt_update_avx1_done_enc
-        movl	%r8d, %r13d
-        andl	$0xfffffff0, %r13d
-        cmpl	%r13d, %r14d
-        jge	L_AES_GCM_encrypt_update_avx1_last_block_done
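-        # Encrypt the first remaining block and fold its ciphertext into the GHASH state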
-        vmovdqu	(%r12), %xmm9
-        vpshufb	L_avx1_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8
-        vpaddd	L_avx1_aes_gcm_one(%rip), %xmm9, %xmm9
-        vmovdqu	%xmm9, (%r12)
-        vpxor	(%rdi), %xmm8, %xmm8
-        vaesenc	16(%rdi), %xmm8, %xmm8
-        vaesenc	32(%rdi), %xmm8, %xmm8
-        vaesenc	48(%rdi), %xmm8, %xmm8
-        vaesenc	64(%rdi), %xmm8, %xmm8
-        vaesenc	80(%rdi), %xmm8, %xmm8
-        vaesenc	96(%rdi), %xmm8, %xmm8
-        vaesenc	112(%rdi), %xmm8, %xmm8
-        vaesenc	128(%rdi), %xmm8, %xmm8
-        vaesenc	144(%rdi), %xmm8, %xmm8
-        cmpl	$11, %esi
-        vmovdqa	160(%rdi), %xmm9
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_block_last
-        vaesenc	%xmm9, %xmm8, %xmm8
-        vaesenc	176(%rdi), %xmm8, %xmm8
-        cmpl	$13, %esi
-        vmovdqa	192(%rdi), %xmm9
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_block_last
-        vaesenc	%xmm9, %xmm8, %xmm8
-        vaesenc	208(%rdi), %xmm8, %xmm8
-        vmovdqa	224(%rdi), %xmm9
-L_AES_GCM_encrypt_update_avx1_aesenc_block_last:
-        vaesenclast	%xmm9, %xmm8, %xmm8
-        vmovdqu	(%r11,%r14,1), %xmm9
-        vpxor	%xmm9, %xmm8, %xmm8
-        vmovdqu	%xmm8, (%r10,%r14,1)
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8
-        vpxor	%xmm8, %xmm6, %xmm6
-        addl	$16, %r14d
-        cmpl	%r13d, %r14d
-        jge	L_AES_GCM_encrypt_update_avx1_last_block_ghash
-L_AES_GCM_encrypt_update_avx1_last_block_start:
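-        # Interleave a single-block GHASH multiply and reduction with the AES rounds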
-        vmovdqu	(%r11,%r14,1), %xmm13
-        vmovdqu	(%r12), %xmm9
-        vpshufb	L_avx1_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8
-        vpaddd	L_avx1_aes_gcm_one(%rip), %xmm9, %xmm9
-        vmovdqu	%xmm9, (%r12)
-        vpxor	(%rdi), %xmm8, %xmm8
-        vpclmulqdq	$16, %xmm5, %xmm6, %xmm10
-        vaesenc	16(%rdi), %xmm8, %xmm8
-        vaesenc	32(%rdi), %xmm8, %xmm8
-        vpclmulqdq	$0x01, %xmm5, %xmm6, %xmm11
-        vaesenc	48(%rdi), %xmm8, %xmm8
-        vaesenc	64(%rdi), %xmm8, %xmm8
-        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm12
-        vaesenc	80(%rdi), %xmm8, %xmm8
-        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm1
-        vaesenc	96(%rdi), %xmm8, %xmm8
-        vpxor	%xmm11, %xmm10, %xmm10
-        vpslldq	$8, %xmm10, %xmm2
-        vpsrldq	$8, %xmm10, %xmm10
-        vaesenc	112(%rdi), %xmm8, %xmm8
-        vpxor	%xmm12, %xmm2, %xmm2
-        vpxor	%xmm10, %xmm1, %xmm3
-        vmovdqa	L_avx1_aes_gcm_mod2_128(%rip), %xmm0
-        vpclmulqdq	$16, %xmm0, %xmm2, %xmm11
-        vaesenc	128(%rdi), %xmm8, %xmm8
-        vpshufd	$0x4e, %xmm2, %xmm10
-        vpxor	%xmm11, %xmm10, %xmm10
-        vpclmulqdq	$16, %xmm0, %xmm10, %xmm11
-        vaesenc	144(%rdi), %xmm8, %xmm8
-        vpshufd	$0x4e, %xmm10, %xmm10
-        vpxor	%xmm11, %xmm10, %xmm10
-        vpxor	%xmm3, %xmm10, %xmm6
-        cmpl	$11, %esi
-        vmovdqa	160(%rdi), %xmm9
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last
-        vaesenc	%xmm9, %xmm8, %xmm8
-        vaesenc	176(%rdi), %xmm8, %xmm8
-        cmpl	$13, %esi
-        vmovdqa	192(%rdi), %xmm9
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last
-        vaesenc	%xmm9, %xmm8, %xmm8
-        vaesenc	208(%rdi), %xmm8, %xmm8
-        vmovdqa	224(%rdi), %xmm9
-L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last:
-        vaesenclast	%xmm9, %xmm8, %xmm8
-        vmovdqa	%xmm13, %xmm0
-        vpxor	%xmm0, %xmm8, %xmm8
-        vmovdqu	%xmm8, (%r10,%r14,1)
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm8, %xmm8
-        addl	$16, %r14d
-        vpxor	%xmm8, %xmm6, %xmm6
-        cmpl	%r13d, %r14d
-        jl	L_AES_GCM_encrypt_update_avx1_last_block_start
-L_AES_GCM_encrypt_update_avx1_last_block_ghash:
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm5, %xmm9
-        vpshufd	$0x4e, %xmm6, %xmm10
-        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm11
-        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm8
-        vpxor	%xmm5, %xmm9, %xmm9
-        vpxor	%xmm6, %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm10, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpxor	%xmm11, %xmm9, %xmm9
-        vpslldq	$8, %xmm9, %xmm10
-        vpsrldq	$8, %xmm9, %xmm9
-        vpxor	%xmm10, %xmm8, %xmm8
-        vpxor	%xmm9, %xmm11, %xmm6
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm6, %xmm6
-L_AES_GCM_encrypt_update_avx1_last_block_done:
-L_AES_GCM_encrypt_update_avx1_done_enc:
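-        # Store the updated GHASH state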
-        vmovdqa	%xmm6, (%r9)
-        vzeroupper
-        addq	$0xa0, %rsp
-        popq	%r14
-        popq	%r12
-        popq	%r13
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_encrypt_update_avx1,.-AES_GCM_encrypt_update_avx1
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_encrypt_final_avx1
-.type	AES_GCM_encrypt_final_avx1,@function
-.align	16
-AES_GCM_encrypt_final_avx1:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_encrypt_final_avx1
-.p2align	4
-_AES_GCM_encrypt_final_avx1:
-#endif /* __APPLE__ */
-        pushq	%r13
-        movl	%edx, %eax
-        movl	%ecx, %r10d
-        movl	%r8d, %r11d
-        movq	16(%rsp), %r8
-        subq	$16, %rsp
-        vmovdqa	(%rdi), %xmm4
-        vmovdqa	(%r9), %xmm5
-        vmovdqa	(%r8), %xmm6
-        vpsrlq	$63, %xmm5, %xmm9
-        vpsllq	$0x01, %xmm5, %xmm8
-        vpslldq	$8, %xmm9, %xmm9
-        vpor	%xmm9, %xmm8, %xmm8
-        vpshufd	$0xff, %xmm5, %xmm5
-        vpsrad	$31, %xmm5, %xmm5
-        vpand	L_avx1_aes_gcm_mod2_128(%rip), %xmm5, %xmm5
-        vpxor	%xmm8, %xmm5, %xmm5
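-        # Fold the AAD and ciphertext lengths (in bits) into the GHASH state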
-        movl	%r10d, %edx
-        movl	%r11d, %ecx
-        shlq	$3, %rdx
-        shlq	$3, %rcx
-        vmovq	%rdx, %xmm0
-        vmovq	%rcx, %xmm1
-        vpunpcklqdq	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm5, %xmm9
-        vpshufd	$0x4e, %xmm4, %xmm10
-        vpclmulqdq	$0x11, %xmm5, %xmm4, %xmm11
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm8
-        vpxor	%xmm5, %xmm9, %xmm9
-        vpxor	%xmm4, %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm10, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpxor	%xmm11, %xmm9, %xmm9
-        vpslldq	$8, %xmm9, %xmm10
-        vpsrldq	$8, %xmm9, %xmm9
-        vpxor	%xmm10, %xmm8, %xmm8
-        vpxor	%xmm9, %xmm11, %xmm4
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm4, %xmm4
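-        # Form the tag: byte-swapped GHASH result XORed with the encrypted initial counter block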
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4
-        vpxor	%xmm6, %xmm4, %xmm0
-        cmpl	$16, %eax
-        je	L_AES_GCM_encrypt_final_avx1_store_tag_16
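-        # Tag shorter than 16 bytes: copy it out byte-by-byte via the stack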
-        xorq	%rcx, %rcx
-        vmovdqu	%xmm0, (%rsp)
-L_AES_GCM_encrypt_final_avx1_store_tag_loop:
-        movzbl	(%rsp,%rcx,1), %r13d
-        movb	%r13b, (%rsi,%rcx,1)
-        incl	%ecx
-        cmpl	%eax, %ecx
-        jne	L_AES_GCM_encrypt_final_avx1_store_tag_loop
-        jmp	L_AES_GCM_encrypt_final_avx1_store_tag_done
-L_AES_GCM_encrypt_final_avx1_store_tag_16:
-        vmovdqu	%xmm0, (%rsi)
-L_AES_GCM_encrypt_final_avx1_store_tag_done:
-        vzeroupper
-        addq	$16, %rsp
-        popq	%r13
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_encrypt_final_avx1,.-AES_GCM_encrypt_final_avx1
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_decrypt_update_avx1
-.type	AES_GCM_decrypt_update_avx1,@function
-.align	16
-AES_GCM_decrypt_update_avx1:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_decrypt_update_avx1
-.p2align	4
-_AES_GCM_decrypt_update_avx1:
-#endif /* __APPLE__ */
-        pushq	%r13
-        pushq	%r12
-        pushq	%r14
-        movq	%rdx, %r10
-        movq	%rcx, %r11
-        movq	32(%rsp), %rax
-        movq	40(%rsp), %r12
-        subq	$0xa8, %rsp
-        vmovdqa	(%r9), %xmm6
-        vmovdqa	(%rax), %xmm5
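-        # Pre-shift the hash key left by one bit with conditional polynomial reduction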
-        vpsrlq	$63, %xmm5, %xmm9
-        vpsllq	$0x01, %xmm5, %xmm8
-        vpslldq	$8, %xmm9, %xmm9
-        vpor	%xmm9, %xmm8, %xmm8
-        vpshufd	$0xff, %xmm5, %xmm5
-        vpsrad	$31, %xmm5, %xmm5
-        vpand	L_avx1_aes_gcm_mod2_128(%rip), %xmm5, %xmm5
-        vpxor	%xmm8, %xmm5, %xmm5
-        xorl	%r14d, %r14d
-        cmpl	$0x80, %r8d
-        movl	%r8d, %r13d
-        jl	L_AES_GCM_decrypt_update_avx1_done_128
-        andl	$0xffffff80, %r13d
-        vmovdqa	%xmm6, %xmm2
-        # H ^ 1
-        vmovdqu	%xmm5, (%rsp)
-        # H ^ 2
-        vpclmulqdq	$0x00, %xmm5, %xmm5, %xmm8
-        vpclmulqdq	$0x11, %xmm5, %xmm5, %xmm0
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm0, %xmm0
-        vmovdqu	%xmm0, 16(%rsp)
-        # H ^ 3
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm5, %xmm9
-        vpshufd	$0x4e, %xmm0, %xmm10
-        vpclmulqdq	$0x11, %xmm5, %xmm0, %xmm11
-        vpclmulqdq	$0x00, %xmm5, %xmm0, %xmm8
-        vpxor	%xmm5, %xmm9, %xmm9
-        vpxor	%xmm0, %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm10, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpxor	%xmm11, %xmm9, %xmm9
-        vpslldq	$8, %xmm9, %xmm10
-        vpsrldq	$8, %xmm9, %xmm9
-        vpxor	%xmm10, %xmm8, %xmm8
-        vpxor	%xmm9, %xmm11, %xmm1
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm1, %xmm1
-        vmovdqu	%xmm1, 32(%rsp)
-        # H ^ 4
-        vpclmulqdq	$0x00, %xmm0, %xmm0, %xmm8
-        vpclmulqdq	$0x11, %xmm0, %xmm0, %xmm3
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm3, %xmm3
-        vmovdqu	%xmm3, 48(%rsp)
-        # H ^ 5
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm0, %xmm9
-        vpshufd	$0x4e, %xmm1, %xmm10
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm11
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm8
-        vpxor	%xmm0, %xmm9, %xmm9
-        vpxor	%xmm1, %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm10, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpxor	%xmm11, %xmm9, %xmm9
-        vpslldq	$8, %xmm9, %xmm10
-        vpsrldq	$8, %xmm9, %xmm9
-        vpxor	%xmm10, %xmm8, %xmm8
-        vpxor	%xmm9, %xmm11, %xmm7
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm7, %xmm7
-        vmovdqu	%xmm7, 64(%rsp)
-        # H ^ 6
-        vpclmulqdq	$0x00, %xmm1, %xmm1, %xmm8
-        vpclmulqdq	$0x11, %xmm1, %xmm1, %xmm7
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm7, %xmm7
-        vmovdqu	%xmm7, 80(%rsp)
-        # H ^ 7
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm1, %xmm9
-        vpshufd	$0x4e, %xmm3, %xmm10
-        vpclmulqdq	$0x11, %xmm1, %xmm3, %xmm11
-        vpclmulqdq	$0x00, %xmm1, %xmm3, %xmm8
-        vpxor	%xmm1, %xmm9, %xmm9
-        vpxor	%xmm3, %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm10, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpxor	%xmm11, %xmm9, %xmm9
-        vpslldq	$8, %xmm9, %xmm10
-        vpsrldq	$8, %xmm9, %xmm9
-        vpxor	%xmm10, %xmm8, %xmm8
-        vpxor	%xmm9, %xmm11, %xmm7
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm7, %xmm7
-        vmovdqu	%xmm7, 96(%rsp)
-        # H ^ 8
-        vpclmulqdq	$0x00, %xmm3, %xmm3, %xmm8
-        vpclmulqdq	$0x11, %xmm3, %xmm3, %xmm7
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm7, %xmm7
-        vmovdqu	%xmm7, 112(%rsp)
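-        # Main loop: decrypt 128 bytes and GHASH the ciphertext in the same pass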
-L_AES_GCM_decrypt_update_avx1_ghash_128:
-        leaq	(%r11,%r14,1), %rcx
-        leaq	(%r10,%r14,1), %rdx
-        vmovdqu	(%r12), %xmm0
-        vmovdqa	L_avx1_aes_gcm_bswap_epi64(%rip), %xmm1
-        vpshufb	%xmm1, %xmm0, %xmm8
-        vpaddd	L_avx1_aes_gcm_one(%rip), %xmm0, %xmm9
-        vpshufb	%xmm1, %xmm9, %xmm9
-        vpaddd	L_avx1_aes_gcm_two(%rip), %xmm0, %xmm10
-        vpshufb	%xmm1, %xmm10, %xmm10
-        vpaddd	L_avx1_aes_gcm_three(%rip), %xmm0, %xmm11
-        vpshufb	%xmm1, %xmm11, %xmm11
-        vpaddd	L_avx1_aes_gcm_four(%rip), %xmm0, %xmm12
-        vpshufb	%xmm1, %xmm12, %xmm12
-        vpaddd	L_avx1_aes_gcm_five(%rip), %xmm0, %xmm13
-        vpshufb	%xmm1, %xmm13, %xmm13
-        vpaddd	L_avx1_aes_gcm_six(%rip), %xmm0, %xmm14
-        vpshufb	%xmm1, %xmm14, %xmm14
-        vpaddd	L_avx1_aes_gcm_seven(%rip), %xmm0, %xmm15
-        vpshufb	%xmm1, %xmm15, %xmm15
-        vpaddd	L_avx1_aes_gcm_eight(%rip), %xmm0, %xmm0
-        vmovdqa	(%rdi), %xmm7
-        vmovdqu	%xmm0, (%r12)
-        vpxor	%xmm7, %xmm8, %xmm8
-        vpxor	%xmm7, %xmm9, %xmm9
-        vpxor	%xmm7, %xmm10, %xmm10
-        vpxor	%xmm7, %xmm11, %xmm11
-        vpxor	%xmm7, %xmm12, %xmm12
-        vpxor	%xmm7, %xmm13, %xmm13
-        vpxor	%xmm7, %xmm14, %xmm14
-        vpxor	%xmm7, %xmm15, %xmm15
-        vmovdqu	112(%rsp), %xmm7
-        vmovdqu	(%rcx), %xmm0
-        vaesenc	16(%rdi), %xmm8, %xmm8
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm1
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm3
-        vaesenc	16(%rdi), %xmm9, %xmm9
-        vaesenc	16(%rdi), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm2
-        vaesenc	16(%rdi), %xmm11, %xmm11
-        vaesenc	16(%rdi), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm1, %xmm1
-        vaesenc	16(%rdi), %xmm13, %xmm13
-        vaesenc	16(%rdi), %xmm14, %xmm14
-        vaesenc	16(%rdi), %xmm15, %xmm15
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqu	96(%rsp), %xmm7
-        vmovdqu	16(%rcx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vaesenc	32(%rdi), %xmm8, %xmm8
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vaesenc	32(%rdi), %xmm9, %xmm9
-        vaesenc	32(%rdi), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vaesenc	32(%rdi), %xmm11, %xmm11
-        vaesenc	32(%rdi), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vaesenc	32(%rdi), %xmm13, %xmm13
-        vaesenc	32(%rdi), %xmm14, %xmm14
-        vaesenc	32(%rdi), %xmm15, %xmm15
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	80(%rsp), %xmm7
-        vmovdqu	32(%rcx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vaesenc	48(%rdi), %xmm8, %xmm8
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vaesenc	48(%rdi), %xmm9, %xmm9
-        vaesenc	48(%rdi), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vaesenc	48(%rdi), %xmm11, %xmm11
-        vaesenc	48(%rdi), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vaesenc	48(%rdi), %xmm13, %xmm13
-        vaesenc	48(%rdi), %xmm14, %xmm14
-        vaesenc	48(%rdi), %xmm15, %xmm15
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	64(%rsp), %xmm7
-        vmovdqu	48(%rcx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vaesenc	64(%rdi), %xmm8, %xmm8
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vaesenc	64(%rdi), %xmm9, %xmm9
-        vaesenc	64(%rdi), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vaesenc	64(%rdi), %xmm11, %xmm11
-        vaesenc	64(%rdi), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vaesenc	64(%rdi), %xmm13, %xmm13
-        vaesenc	64(%rdi), %xmm14, %xmm14
-        vaesenc	64(%rdi), %xmm15, %xmm15
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	48(%rsp), %xmm7
-        vmovdqu	64(%rcx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vaesenc	80(%rdi), %xmm8, %xmm8
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vaesenc	80(%rdi), %xmm9, %xmm9
-        vaesenc	80(%rdi), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vaesenc	80(%rdi), %xmm11, %xmm11
-        vaesenc	80(%rdi), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vaesenc	80(%rdi), %xmm13, %xmm13
-        vaesenc	80(%rdi), %xmm14, %xmm14
-        vaesenc	80(%rdi), %xmm15, %xmm15
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	32(%rsp), %xmm7
-        vmovdqu	80(%rcx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vaesenc	96(%rdi), %xmm8, %xmm8
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vaesenc	96(%rdi), %xmm9, %xmm9
-        vaesenc	96(%rdi), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vaesenc	96(%rdi), %xmm11, %xmm11
-        vaesenc	96(%rdi), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vaesenc	96(%rdi), %xmm13, %xmm13
-        vaesenc	96(%rdi), %xmm14, %xmm14
-        vaesenc	96(%rdi), %xmm15, %xmm15
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	16(%rsp), %xmm7
-        vmovdqu	96(%rcx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vaesenc	112(%rdi), %xmm8, %xmm8
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vaesenc	112(%rdi), %xmm9, %xmm9
-        vaesenc	112(%rdi), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vaesenc	112(%rdi), %xmm11, %xmm11
-        vaesenc	112(%rdi), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vaesenc	112(%rdi), %xmm13, %xmm13
-        vaesenc	112(%rdi), %xmm14, %xmm14
-        vaesenc	112(%rdi), %xmm15, %xmm15
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	(%rsp), %xmm7
-        vmovdqu	112(%rcx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vaesenc	128(%rdi), %xmm8, %xmm8
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vaesenc	128(%rdi), %xmm9, %xmm9
-        vaesenc	128(%rdi), %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vaesenc	128(%rdi), %xmm11, %xmm11
-        vaesenc	128(%rdi), %xmm12, %xmm12
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vaesenc	128(%rdi), %xmm13, %xmm13
-        vaesenc	128(%rdi), %xmm14, %xmm14
-        vaesenc	128(%rdi), %xmm15, %xmm15
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpslldq	$8, %xmm1, %xmm5
-        vpsrldq	$8, %xmm1, %xmm1
-        vaesenc	144(%rdi), %xmm8, %xmm8
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm3, %xmm3
-        vaesenc	144(%rdi), %xmm9, %xmm9
-        vpslld	$31, %xmm2, %xmm7
-        vpslld	$30, %xmm2, %xmm4
-        vpslld	$25, %xmm2, %xmm5
-        vaesenc	144(%rdi), %xmm10, %xmm10
-        vpxor	%xmm4, %xmm7, %xmm7
-        vpxor	%xmm5, %xmm7, %xmm7
-        vaesenc	144(%rdi), %xmm11, %xmm11
-        vpsrldq	$4, %xmm7, %xmm4
-        vpslldq	$12, %xmm7, %xmm7
-        vaesenc	144(%rdi), %xmm12, %xmm12
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpsrld	$0x01, %xmm2, %xmm5
-        vaesenc	144(%rdi), %xmm13, %xmm13
-        vpsrld	$2, %xmm2, %xmm1
-        vpsrld	$7, %xmm2, %xmm0
-        vaesenc	144(%rdi), %xmm14, %xmm14
-        vpxor	%xmm1, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vaesenc	144(%rdi), %xmm15, %xmm15
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpxor	%xmm3, %xmm2, %xmm2
-        cmpl	$11, %esi
-        vmovdqa	160(%rdi), %xmm7
-        jl	L_AES_GCM_decrypt_update_avx1_aesenc_128_ghash_avx_done
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	176(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        cmpl	$13, %esi
-        vmovdqa	192(%rdi), %xmm7
-        jl	L_AES_GCM_decrypt_update_avx1_aesenc_128_ghash_avx_done
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	208(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqa	224(%rdi), %xmm7
-L_AES_GCM_decrypt_update_avx1_aesenc_128_ghash_avx_done:
-        vaesenclast	%xmm7, %xmm8, %xmm8
-        vaesenclast	%xmm7, %xmm9, %xmm9
-        vmovdqu	(%rcx), %xmm0
-        vmovdqu	16(%rcx), %xmm1
-        vpxor	%xmm0, %xmm8, %xmm8
-        vpxor	%xmm1, %xmm9, %xmm9
-        vmovdqu	%xmm8, (%rdx)
-        vmovdqu	%xmm9, 16(%rdx)
-        vaesenclast	%xmm7, %xmm10, %xmm10
-        vaesenclast	%xmm7, %xmm11, %xmm11
-        vmovdqu	32(%rcx), %xmm0
-        vmovdqu	48(%rcx), %xmm1
-        vpxor	%xmm0, %xmm10, %xmm10
-        vpxor	%xmm1, %xmm11, %xmm11
-        vmovdqu	%xmm10, 32(%rdx)
-        vmovdqu	%xmm11, 48(%rdx)
-        vaesenclast	%xmm7, %xmm12, %xmm12
-        vaesenclast	%xmm7, %xmm13, %xmm13
-        vmovdqu	64(%rcx), %xmm0
-        vmovdqu	80(%rcx), %xmm1
-        vpxor	%xmm0, %xmm12, %xmm12
-        vpxor	%xmm1, %xmm13, %xmm13
-        vmovdqu	%xmm12, 64(%rdx)
-        vmovdqu	%xmm13, 80(%rdx)
-        vaesenclast	%xmm7, %xmm14, %xmm14
-        vaesenclast	%xmm7, %xmm15, %xmm15
-        vmovdqu	96(%rcx), %xmm0
-        vmovdqu	112(%rcx), %xmm1
-        vpxor	%xmm0, %xmm14, %xmm14
-        vpxor	%xmm1, %xmm15, %xmm15
-        vmovdqu	%xmm14, 96(%rdx)
-        vmovdqu	%xmm15, 112(%rdx)
-        addl	$0x80, %r14d
-        cmpl	%r13d, %r14d
-        jl	L_AES_GCM_decrypt_update_avx1_ghash_128
-        vmovdqa	%xmm2, %xmm6
-        vmovdqu	(%rsp), %xmm5
-L_AES_GCM_decrypt_update_avx1_done_128:
-        movl	%r8d, %edx
-        cmpl	%edx, %r14d
-        jge	L_AES_GCM_decrypt_update_avx1_done_dec
-        movl	%r8d, %r13d
-        andl	$0xfffffff0, %r13d
-        cmpl	%r13d, %r14d
-        jge	L_AES_GCM_decrypt_update_avx1_last_block_done
-L_AES_GCM_decrypt_update_avx1_last_block_start:
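-        # GHASH each remaining ciphertext block while decrypting it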
-        vmovdqu	(%r11,%r14,1), %xmm13
-        vmovdqa	%xmm5, %xmm0
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm13, %xmm1
-        vpxor	%xmm6, %xmm1, %xmm1
-        vmovdqu	(%r12), %xmm9
-        vpshufb	L_avx1_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8
-        vpaddd	L_avx1_aes_gcm_one(%rip), %xmm9, %xmm9
-        vmovdqu	%xmm9, (%r12)
-        vpxor	(%rdi), %xmm8, %xmm8
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm10
-        vaesenc	16(%rdi), %xmm8, %xmm8
-        vaesenc	32(%rdi), %xmm8, %xmm8
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm11
-        vaesenc	48(%rdi), %xmm8, %xmm8
-        vaesenc	64(%rdi), %xmm8, %xmm8
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm12
-        vaesenc	80(%rdi), %xmm8, %xmm8
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vaesenc	96(%rdi), %xmm8, %xmm8
-        vpxor	%xmm11, %xmm10, %xmm10
-        vpslldq	$8, %xmm10, %xmm2
-        vpsrldq	$8, %xmm10, %xmm10
-        vaesenc	112(%rdi), %xmm8, %xmm8
-        vpxor	%xmm12, %xmm2, %xmm2
-        vpxor	%xmm10, %xmm1, %xmm3
-        vmovdqa	L_avx1_aes_gcm_mod2_128(%rip), %xmm0
-        vpclmulqdq	$16, %xmm0, %xmm2, %xmm11
-        vaesenc	128(%rdi), %xmm8, %xmm8
-        vpshufd	$0x4e, %xmm2, %xmm10
-        vpxor	%xmm11, %xmm10, %xmm10
-        vpclmulqdq	$16, %xmm0, %xmm10, %xmm11
-        vaesenc	144(%rdi), %xmm8, %xmm8
-        vpshufd	$0x4e, %xmm10, %xmm10
-        vpxor	%xmm11, %xmm10, %xmm10
-        vpxor	%xmm3, %xmm10, %xmm6
-        cmpl	$11, %esi
-        vmovdqa	160(%rdi), %xmm9
-        jl	L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last
-        vaesenc	%xmm9, %xmm8, %xmm8
-        vaesenc	176(%rdi), %xmm8, %xmm8
-        cmpl	$13, %esi
-        vmovdqa	192(%rdi), %xmm9
-        jl	L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last
-        vaesenc	%xmm9, %xmm8, %xmm8
-        vaesenc	208(%rdi), %xmm8, %xmm8
-        vmovdqa	224(%rdi), %xmm9
-L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last:
-        vaesenclast	%xmm9, %xmm8, %xmm8
-        vmovdqa	%xmm13, %xmm0
-        vpxor	%xmm0, %xmm8, %xmm8
-        vmovdqu	%xmm8, (%r10,%r14,1)
-        addl	$16, %r14d
-        cmpl	%r13d, %r14d
-        jl	L_AES_GCM_decrypt_update_avx1_last_block_start
-L_AES_GCM_decrypt_update_avx1_last_block_done:
-L_AES_GCM_decrypt_update_avx1_done_dec:
-        vmovdqa	%xmm6, (%r9)
-        vzeroupper
-        addq	$0xa8, %rsp
-        popq	%r14
-        popq	%r12
-        popq	%r13
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_decrypt_update_avx1,.-AES_GCM_decrypt_update_avx1
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_decrypt_final_avx1
-.type	AES_GCM_decrypt_final_avx1,@function
-.align	16
-AES_GCM_decrypt_final_avx1:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_decrypt_final_avx1
-.p2align	4
-_AES_GCM_decrypt_final_avx1:
-#endif /* __APPLE__ */
-        pushq	%r13
-        pushq	%rbp
-        pushq	%r12
-        movl	%edx, %eax
-        movl	%ecx, %r10d
-        movl	%r8d, %r11d
-        movq	32(%rsp), %r8
-        movq	40(%rsp), %rbp
-        subq	$16, %rsp
-        vmovdqa	(%rdi), %xmm6
-        vmovdqa	(%r9), %xmm5
-        vmovdqa	(%r8), %xmm15
-        vpsrlq	$63, %xmm5, %xmm9
-        vpsllq	$0x01, %xmm5, %xmm8
-        vpslldq	$8, %xmm9, %xmm9
-        vpor	%xmm9, %xmm8, %xmm8
-        vpshufd	$0xff, %xmm5, %xmm5
-        vpsrad	$31, %xmm5, %xmm5
-        vpand	L_avx1_aes_gcm_mod2_128(%rip), %xmm5, %xmm5
-        vpxor	%xmm8, %xmm5, %xmm5
-        movl	%r10d, %edx
-        movl	%r11d, %ecx
-        shlq	$3, %rdx
-        shlq	$3, %rcx
-        vmovq	%rdx, %xmm0
-        vmovq	%rcx, %xmm1
-        vpunpcklqdq	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm6, %xmm6
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm5, %xmm9
-        vpshufd	$0x4e, %xmm6, %xmm10
-        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm11
-        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm8
-        vpxor	%xmm5, %xmm9, %xmm9
-        vpxor	%xmm6, %xmm10, %xmm10
-        vpclmulqdq	$0x00, %xmm10, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpxor	%xmm11, %xmm9, %xmm9
-        vpslldq	$8, %xmm9, %xmm10
-        vpsrldq	$8, %xmm9, %xmm9
-        vpxor	%xmm10, %xmm8, %xmm8
-        vpxor	%xmm9, %xmm11, %xmm6
-        vpslld	$31, %xmm8, %xmm12
-        vpslld	$30, %xmm8, %xmm13
-        vpslld	$25, %xmm8, %xmm14
-        vpxor	%xmm13, %xmm12, %xmm12
-        vpxor	%xmm14, %xmm12, %xmm12
-        vpsrldq	$4, %xmm12, %xmm13
-        vpslldq	$12, %xmm12, %xmm12
-        vpxor	%xmm12, %xmm8, %xmm8
-        vpsrld	$0x01, %xmm8, %xmm14
-        vpsrld	$2, %xmm8, %xmm10
-        vpsrld	$7, %xmm8, %xmm9
-        vpxor	%xmm10, %xmm14, %xmm14
-        vpxor	%xmm9, %xmm14, %xmm14
-        vpxor	%xmm13, %xmm14, %xmm14
-        vpxor	%xmm8, %xmm14, %xmm14
-        vpxor	%xmm14, %xmm6, %xmm6
-        vpshufb	L_avx1_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6
-        vpxor	%xmm15, %xmm6, %xmm0
-        cmpl	$16, %eax
-        je	L_AES_GCM_decrypt_final_avx1_cmp_tag_16
-        subq	$16, %rsp
-        xorq	%rcx, %rcx
-        xorq	%r12, %r12
-        vmovdqu	%xmm0, (%rsp)
-L_AES_GCM_decrypt_final_avx1_cmp_tag_loop:
-        movzbl	(%rsp,%rcx,1), %r13d
-        xorb	(%rsi,%rcx,1), %r13b
-        orb	%r13b, %r12b
-        incl	%ecx
-        cmpl	%eax, %ecx
-        jne	L_AES_GCM_decrypt_final_avx1_cmp_tag_loop
-        cmpb	$0x00, %r12b
-        sete	%r12b
-        addq	$16, %rsp
-        xorq	%rcx, %rcx
-        jmp	L_AES_GCM_decrypt_final_avx1_cmp_tag_done
-L_AES_GCM_decrypt_final_avx1_cmp_tag_16:
-        vmovdqu	(%rsi), %xmm1
-        vpcmpeqb	%xmm1, %xmm0, %xmm0
-        vpmovmskb	%xmm0, %rdx
-        # if %edx == 0xFFFF then *res = 1 else *res = 0
-        xorl	%r12d, %r12d
-        cmpl	$0xffff, %edx
-        sete	%r12b
-L_AES_GCM_decrypt_final_avx1_cmp_tag_done:
-        movl	%r12d, (%rbp)
-        vzeroupper
-        addq	$16, %rsp
-        popq	%r12
-        popq	%rbp
-        popq	%r13
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_decrypt_final_avx1,.-AES_GCM_decrypt_final_avx1
-#endif /* __APPLE__ */
-#endif /* WOLFSSL_AESGCM_STREAM */
-#endif /* HAVE_INTEL_AVX1 */
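
The tag check removed above (the L_AES_GCM_decrypt_final_avx1_cmp_tag_loop path and the 16-byte vpcmpeqb/vpmovmskb fast path) is a constant-time comparison: each byte of the computed tag is XORed against the expected tag and the differences are OR-accumulated, so the running time does not depend on where a mismatch occurs. A minimal C sketch of the same idea (names are illustrative, not wolfSSL API):

    /* Returns 1 when the computed tag equals the expected tag, 0 otherwise,
     * with a running time independent of where any mismatch occurs. */
    #include <stddef.h>

    static int gcm_tag_equal(const unsigned char *calc,
                             const unsigned char *expected, size_t len)
    {
        unsigned char diff = 0;
        size_t i;
        for (i = 0; i < len; i++)
            diff |= (unsigned char)(calc[i] ^ expected[i]); /* OR-accumulate */
        return diff == 0;
    }

The 16-byte fast path reaches the same verdict in one step: vpcmpeqb yields 0xFF for every matching byte, so vpmovmskb must produce exactly 0xFFFF for the tag to be accepted.
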
-#ifdef HAVE_INTEL_AVX2
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_avx2_aes_gcm_one:
-.quad	0x0, 0x1
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_avx2_aes_gcm_two:
-.quad	0x0, 0x2
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_avx2_aes_gcm_three:
-.quad	0x0, 0x3
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_avx2_aes_gcm_four:
-.quad	0x0, 0x4
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_avx2_aes_gcm_five:
-.quad	0x0, 0x5
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_avx2_aes_gcm_six:
-.quad	0x0, 0x6
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_avx2_aes_gcm_seven:
-.quad	0x0, 0x7
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_avx2_aes_gcm_eight:
-.quad	0x0, 0x8
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_avx2_aes_gcm_bswap_one:
-.quad	0x0, 0x100000000000000
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_avx2_aes_gcm_bswap_epi64:
-.quad	0x1020304050607, 0x8090a0b0c0d0e0f
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_avx2_aes_gcm_bswap_mask:
-.quad	0x8090a0b0c0d0e0f, 0x1020304050607
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_avx2_aes_gcm_mod2_128:
-.quad	0x1, 0xc200000000000000
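
The data tables above supply three kinds of constants for the AVX2 path: counter increments one through eight (added with vpaddd to the counter after a vpshufb through bswap_epi64), byte-reversal shuffle masks, and L_avx2_aes_gcm_mod2_128 = {1, 0xc2 << 56}, which encodes the GHASH reduction polynomial x^128 + x^7 + x^2 + x + 1 in its bit-reflected form. The recurring ghash_red sequence folds a 256-bit carry-less product back to 128 bits with two multiplies by that constant. A hedged intrinsics sketch of the same folding (not the wolfSSL source):

    #include <emmintrin.h>
    #include <wmmintrin.h> /* _mm_clmulepi64_si128; compile with -mpclmul */

    /* Reduce a 256-bit carry-less product hi:lo modulo the GCM polynomial,
     * mirroring the ghash_red blocks in the assembly above. */
    static __m128i ghash_reduce(__m128i lo, __m128i hi)
    {
        const __m128i mod2 = _mm_set_epi64x((long long)0xc200000000000000ULL, 1);
        __m128i t;

        t  = _mm_clmulepi64_si128(lo, mod2, 0x10); /* lo64(lo) x 0xc2.. */
        lo = _mm_shuffle_epi32(lo, 0x4e);          /* swap 64-bit halves */
        lo = _mm_xor_si128(lo, t);
        t  = _mm_clmulepi64_si128(lo, mod2, 0x10); /* second folding step */
        lo = _mm_shuffle_epi32(lo, 0x4e);
        lo = _mm_xor_si128(lo, t);
        return _mm_xor_si128(hi, lo);              /* fold into the high half */
    }

Each vpclmulqdq $16 in the ghash_red blocks corresponds to the 0x10 immediate here: the low quadword of the accumulator multiplied by the high quadword (0xc200000000000000) of the constant.
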
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_encrypt_avx2
-.type	AES_GCM_encrypt_avx2,@function
-.align	16
-AES_GCM_encrypt_avx2:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_encrypt_avx2
-.p2align	4
-_AES_GCM_encrypt_avx2:
-#endif /* __APPLE__ */
-        pushq	%r13
-        pushq	%r12
-        pushq	%r15
-        pushq	%rbx
-        pushq	%r14
-        movq	%rdx, %r12
-        movq	%rcx, %rax
-        movq	%r8, %r15
-        movq	%rsi, %r8
-        movl	%r9d, %r10d
-        movl	48(%rsp), %r11d
-        movl	56(%rsp), %ebx
-        movl	64(%rsp), %r14d
-        movq	72(%rsp), %rsi
-        movl	80(%rsp), %r9d
-        subq	$0xa0, %rsp
-        vpxor	%xmm4, %xmm4, %xmm4
-        vpxor	%xmm6, %xmm6, %xmm6
-        movl	%ebx, %edx
-        cmpl	$12, %edx
-        je	L_AES_GCM_encrypt_avx2_iv_12
-        # Calculate values when IV is not 12 bytes
-        # H = Encrypt X(=0)
-        vmovdqu	(%rsi), %xmm5
-        vaesenc	16(%rsi), %xmm5, %xmm5
-        vaesenc	32(%rsi), %xmm5, %xmm5
-        vaesenc	48(%rsi), %xmm5, %xmm5
-        vaesenc	64(%rsi), %xmm5, %xmm5
-        vaesenc	80(%rsi), %xmm5, %xmm5
-        vaesenc	96(%rsi), %xmm5, %xmm5
-        vaesenc	112(%rsi), %xmm5, %xmm5
-        vaesenc	128(%rsi), %xmm5, %xmm5
-        vaesenc	144(%rsi), %xmm5, %xmm5
-        cmpl	$11, %r9d
-        vmovdqu	160(%rsi), %xmm0
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	176(%rsi), %xmm5, %xmm5
-        cmpl	$13, %r9d
-        vmovdqu	192(%rsi), %xmm0
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	208(%rsi), %xmm5, %xmm5
-        vmovdqu	224(%rsi), %xmm0
-L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last:
-        vaesenclast	%xmm0, %xmm5, %xmm5
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5
-        # Calc counter
-        # Initialization vector
-        cmpl	$0x00, %edx
-        movq	$0x00, %rcx
-        je	L_AES_GCM_encrypt_avx2_calc_iv_done
-        cmpl	$16, %edx
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_encrypt_avx2_calc_iv_16_loop:
-        vmovdqu	(%rax,%rcx,1), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm7
-        vpxor	%xmm2, %xmm3, %xmm4
-        # ghash_mid
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm4, %xmm4
-        # ghash_red
-        vmovdqu	L_avx2_aes_gcm_mod2_128(%rip), %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm4, %xmm4
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_16_loop
-        movl	%ebx, %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_encrypt_avx2_calc_iv_done
-L_AES_GCM_encrypt_avx2_calc_iv_lt16:
-        vpxor	%xmm0, %xmm0, %xmm0
-        xorl	%ebx, %ebx
-        vmovdqu	%xmm0, (%rsp)
-L_AES_GCM_encrypt_avx2_calc_iv_loop:
-        movzbl	(%rax,%rcx,1), %r13d
-        movb	%r13b, (%rsp,%rbx,1)
-        incl	%ecx
-        incl	%ebx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_loop
-        vmovdqu	(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm7
-        vpxor	%xmm2, %xmm3, %xmm4
-        # ghash_mid
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm4, %xmm4
-        # ghash_red
-        vmovdqu	L_avx2_aes_gcm_mod2_128(%rip), %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm4, %xmm4
-L_AES_GCM_encrypt_avx2_calc_iv_done:
-        # T = Encrypt counter
-        vpxor	%xmm0, %xmm0, %xmm0
-        shll	$3, %edx
-        vmovq	%rdx, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm7
-        vpxor	%xmm2, %xmm3, %xmm4
-        # ghash_mid
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm4, %xmm4
-        # ghash_red
-        vmovdqu	L_avx2_aes_gcm_mod2_128(%rip), %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm4, %xmm4
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4
-        #   Encrypt counter
-        vmovdqu	(%rsi), %xmm15
-        vpxor	%xmm4, %xmm15, %xmm15
-        vaesenc	16(%rsi), %xmm15, %xmm15
-        vaesenc	32(%rsi), %xmm15, %xmm15
-        vaesenc	48(%rsi), %xmm15, %xmm15
-        vaesenc	64(%rsi), %xmm15, %xmm15
-        vaesenc	80(%rsi), %xmm15, %xmm15
-        vaesenc	96(%rsi), %xmm15, %xmm15
-        vaesenc	112(%rsi), %xmm15, %xmm15
-        vaesenc	128(%rsi), %xmm15, %xmm15
-        vaesenc	144(%rsi), %xmm15, %xmm15
-        cmpl	$11, %r9d
-        vmovdqu	160(%rsi), %xmm0
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last
-        vaesenc	%xmm0, %xmm15, %xmm15
-        vaesenc	176(%rsi), %xmm15, %xmm15
-        cmpl	$13, %r9d
-        vmovdqu	192(%rsi), %xmm0
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last
-        vaesenc	%xmm0, %xmm15, %xmm15
-        vaesenc	208(%rsi), %xmm15, %xmm15
-        vmovdqu	224(%rsi), %xmm0
-L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last:
-        vaesenclast	%xmm0, %xmm15, %xmm15
-        jmp	L_AES_GCM_encrypt_avx2_iv_done
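
The branch that ends here implements the general-IV rule from NIST SP 800-38D: when the IV is not 12 bytes, the pre-counter block J0 is GHASH_H(IV zero-padded to a 16-byte boundary || 0^64 || 64-bit IV bit length), and that block is encrypted once to produce the tag mask kept in xmm15. The 12-byte fast path that follows instead forms J0 directly; a small C sketch of that layout (illustrative only, not a wolfSSL function):

    #include <string.h>

    /* For a 12-byte IV, J0 is simply IV || 0x00000001. */
    static void gcm_j0_iv12(unsigned char j0[16], const unsigned char iv[12])
    {
        memcpy(j0, iv, 12);
        j0[12] = 0;
        j0[13] = 0;
        j0[14] = 0;
        j0[15] = 1; /* 32-bit big-endian block counter starts at 1 */
    }
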
-L_AES_GCM_encrypt_avx2_iv_12:
-        # Calculate values when IV is 12 bytes
-        # Set counter based on IV
-        vmovdqu	L_avx2_aes_gcm_bswap_one(%rip), %xmm4
-        vmovdqu	(%rsi), %xmm5
-        vpblendd	$7, (%rax), %xmm4, %xmm4
-        # H = Encrypt X(=0) and T = Encrypt counter
-        vmovdqu	16(%rsi), %xmm7
-        vpxor	%xmm5, %xmm4, %xmm15
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	32(%rsi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm15, %xmm15
-        vmovdqu	48(%rsi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm15, %xmm15
-        vmovdqu	64(%rsi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm15, %xmm15
-        vmovdqu	80(%rsi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm15, %xmm15
-        vmovdqu	96(%rsi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm15, %xmm15
-        vmovdqu	112(%rsi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm15, %xmm15
-        vmovdqu	128(%rsi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm15, %xmm15
-        vmovdqu	144(%rsi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm15, %xmm15
-        cmpl	$11, %r9d
-        vmovdqu	160(%rsi), %xmm0
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_12_last
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm15, %xmm15
-        vmovdqu	176(%rsi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm15, %xmm15
-        cmpl	$13, %r9d
-        vmovdqu	192(%rsi), %xmm0
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_12_last
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm15, %xmm15
-        vmovdqu	208(%rsi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm15, %xmm15
-        vmovdqu	224(%rsi), %xmm0
-L_AES_GCM_encrypt_avx2_calc_iv_12_last:
-        vaesenclast	%xmm0, %xmm5, %xmm5
-        vaesenclast	%xmm0, %xmm15, %xmm15
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5
-L_AES_GCM_encrypt_avx2_iv_done:
-        # Additional authentication data
-        movl	%r11d, %edx
-        cmpl	$0x00, %edx
-        je	L_AES_GCM_encrypt_avx2_calc_aad_done
-        xorl	%ecx, %ecx
-        cmpl	$16, %edx
-        jl	L_AES_GCM_encrypt_avx2_calc_aad_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_encrypt_avx2_calc_aad_16_loop:
-        vmovdqu	(%r12,%rcx,1), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vpxor	%xmm0, %xmm6, %xmm6
-        # ghash_gfmul_avx
-        vpclmulqdq	$16, %xmm6, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm6, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm6, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm7
-        vpxor	%xmm2, %xmm3, %xmm6
-        # ghash_mid
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm6, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm6, %xmm6
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm6, %xmm6
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm6, %xmm6
-        # ghash_red
-        vmovdqu	L_avx2_aes_gcm_mod2_128(%rip), %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm6, %xmm6
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_encrypt_avx2_calc_aad_16_loop
-        movl	%r11d, %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_encrypt_avx2_calc_aad_done
-L_AES_GCM_encrypt_avx2_calc_aad_lt16:
-        vpxor	%xmm0, %xmm0, %xmm0
-        xorl	%ebx, %ebx
-        vmovdqu	%xmm0, (%rsp)
-L_AES_GCM_encrypt_avx2_calc_aad_loop:
-        movzbl	(%r12,%rcx,1), %r13d
-        movb	%r13b, (%rsp,%rbx,1)
-        incl	%ecx
-        incl	%ebx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_encrypt_avx2_calc_aad_loop
-        vmovdqu	(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vpxor	%xmm0, %xmm6, %xmm6
-        # ghash_gfmul_avx
-        vpclmulqdq	$16, %xmm6, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm6, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm6, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm7
-        vpxor	%xmm2, %xmm3, %xmm6
-        # ghash_mid
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm6, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm6, %xmm6
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm6, %xmm6
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm6, %xmm6
-        # ghash_red
-        vmovdqu	L_avx2_aes_gcm_mod2_128(%rip), %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm6, %xmm6
-L_AES_GCM_encrypt_avx2_calc_aad_done:
-        # Calculate counter and H
-        vpsrlq	$63, %xmm5, %xmm1
-        vpsllq	$0x01, %xmm5, %xmm0
-        vpslldq	$8, %xmm1, %xmm1
-        vpor	%xmm1, %xmm0, %xmm0
-        vpshufd	$0xff, %xmm5, %xmm5
-        vpsrad	$31, %xmm5, %xmm5
-        vpshufb	L_avx2_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4
-        vpand	L_avx2_aes_gcm_mod2_128(%rip), %xmm5, %xmm5
-        vpaddd	L_avx2_aes_gcm_one(%rip), %xmm4, %xmm4
-        vpxor	%xmm0, %xmm5, %xmm5
-        xorl	%ebx, %ebx
-        cmpl	$0x80, %r10d
-        movl	%r10d, %r13d
-        jl	L_AES_GCM_encrypt_avx2_done_128
-        andl	$0xffffff80, %r13d
-        vmovdqu	%xmm4, 128(%rsp)
-        vmovdqu	%xmm15, 144(%rsp)
-        vmovdqu	L_avx2_aes_gcm_mod2_128(%rip), %xmm3
-        # H ^ 1 and H ^ 2
-        vpclmulqdq	$0x00, %xmm5, %xmm5, %xmm9
-        vpclmulqdq	$0x11, %xmm5, %xmm5, %xmm10
-        vpclmulqdq	$16, %xmm3, %xmm9, %xmm8
-        vpshufd	$0x4e, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpclmulqdq	$16, %xmm3, %xmm9, %xmm8
-        vpshufd	$0x4e, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpxor	%xmm9, %xmm10, %xmm0
-        vmovdqu	%xmm5, (%rsp)
-        vmovdqu	%xmm0, 16(%rsp)
-        # H ^ 3 and H ^ 4
-        vpclmulqdq	$16, %xmm5, %xmm0, %xmm11
-        vpclmulqdq	$0x01, %xmm5, %xmm0, %xmm10
-        vpclmulqdq	$0x00, %xmm5, %xmm0, %xmm9
-        vpclmulqdq	$0x11, %xmm5, %xmm0, %xmm12
-        vpclmulqdq	$0x00, %xmm0, %xmm0, %xmm13
-        vpclmulqdq	$0x11, %xmm0, %xmm0, %xmm14
-        vpxor	%xmm10, %xmm11, %xmm11
-        vpslldq	$8, %xmm11, %xmm10
-        vpsrldq	$8, %xmm11, %xmm11
-        vpxor	%xmm9, %xmm10, %xmm10
-        vpclmulqdq	$16, %xmm3, %xmm13, %xmm8
-        vpclmulqdq	$16, %xmm3, %xmm10, %xmm9
-        vpshufd	$0x4e, %xmm10, %xmm10
-        vpshufd	$0x4e, %xmm13, %xmm13
-        vpxor	%xmm9, %xmm10, %xmm10
-        vpxor	%xmm8, %xmm13, %xmm13
-        vpclmulqdq	$16, %xmm3, %xmm10, %xmm9
-        vpclmulqdq	$16, %xmm3, %xmm13, %xmm8
-        vpshufd	$0x4e, %xmm10, %xmm10
-        vpshufd	$0x4e, %xmm13, %xmm13
-        vpxor	%xmm11, %xmm12, %xmm12
-        vpxor	%xmm8, %xmm13, %xmm13
-        vpxor	%xmm12, %xmm10, %xmm10
-        vpxor	%xmm14, %xmm13, %xmm2
-        vpxor	%xmm9, %xmm10, %xmm1
-        vmovdqu	%xmm1, 32(%rsp)
-        vmovdqu	%xmm2, 48(%rsp)
-        # H ^ 5 and H ^ 6
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm11
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm10
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm9
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm12
-        vpclmulqdq	$0x00, %xmm1, %xmm1, %xmm13
-        vpclmulqdq	$0x11, %xmm1, %xmm1, %xmm14
-        vpxor	%xmm10, %xmm11, %xmm11
-        vpslldq	$8, %xmm11, %xmm10
-        vpsrldq	$8, %xmm11, %xmm11
-        vpxor	%xmm9, %xmm10, %xmm10
-        vpclmulqdq	$16, %xmm3, %xmm13, %xmm8
-        vpclmulqdq	$16, %xmm3, %xmm10, %xmm9
-        vpshufd	$0x4e, %xmm10, %xmm10
-        vpshufd	$0x4e, %xmm13, %xmm13
-        vpxor	%xmm9, %xmm10, %xmm10
-        vpxor	%xmm8, %xmm13, %xmm13
-        vpclmulqdq	$16, %xmm3, %xmm10, %xmm9
-        vpclmulqdq	$16, %xmm3, %xmm13, %xmm8
-        vpshufd	$0x4e, %xmm10, %xmm10
-        vpshufd	$0x4e, %xmm13, %xmm13
-        vpxor	%xmm11, %xmm12, %xmm12
-        vpxor	%xmm8, %xmm13, %xmm13
-        vpxor	%xmm12, %xmm10, %xmm10
-        vpxor	%xmm14, %xmm13, %xmm0
-        vpxor	%xmm9, %xmm10, %xmm7
-        vmovdqu	%xmm7, 64(%rsp)
-        vmovdqu	%xmm0, 80(%rsp)
-        # H ^ 7 and H ^ 8
-        vpclmulqdq	$16, %xmm1, %xmm2, %xmm11
-        vpclmulqdq	$0x01, %xmm1, %xmm2, %xmm10
-        vpclmulqdq	$0x00, %xmm1, %xmm2, %xmm9
-        vpclmulqdq	$0x11, %xmm1, %xmm2, %xmm12
-        vpclmulqdq	$0x00, %xmm2, %xmm2, %xmm13
-        vpclmulqdq	$0x11, %xmm2, %xmm2, %xmm14
-        vpxor	%xmm10, %xmm11, %xmm11
-        vpslldq	$8, %xmm11, %xmm10
-        vpsrldq	$8, %xmm11, %xmm11
-        vpxor	%xmm9, %xmm10, %xmm10
-        vpclmulqdq	$16, %xmm3, %xmm13, %xmm8
-        vpclmulqdq	$16, %xmm3, %xmm10, %xmm9
-        vpshufd	$0x4e, %xmm10, %xmm10
-        vpshufd	$0x4e, %xmm13, %xmm13
-        vpxor	%xmm9, %xmm10, %xmm10
-        vpxor	%xmm8, %xmm13, %xmm13
-        vpclmulqdq	$16, %xmm3, %xmm10, %xmm9
-        vpclmulqdq	$16, %xmm3, %xmm13, %xmm8
-        vpshufd	$0x4e, %xmm10, %xmm10
-        vpshufd	$0x4e, %xmm13, %xmm13
-        vpxor	%xmm11, %xmm12, %xmm12
-        vpxor	%xmm8, %xmm13, %xmm13
-        vpxor	%xmm12, %xmm10, %xmm10
-        vpxor	%xmm14, %xmm13, %xmm0
-        vpxor	%xmm9, %xmm10, %xmm7
-        vmovdqu	%xmm7, 96(%rsp)
-        vmovdqu	%xmm0, 112(%rsp)
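
The block above precomputes H^1 through H^8 into the sixteen-byte stack slots at 0(%rsp) through 112(%rsp). With the powers in hand, the 128-byte main loop can multiply its eight ciphertext blocks by H^8 down to H^1 and perform a single modular reduction per iteration instead of one per block. A hedged sketch of that aggregation (gf128, gf128_mul and gf128_xor are hypothetical helpers, not wolfSSL functions):

    typedef struct { unsigned long long hi, lo; } gf128;

    extern gf128 gf128_mul(gf128 a, gf128 b); /* assumed: (a*b) mod GCM poly */
    extern gf128 gf128_xor(gf128 a, gf128 b);

    /* One reduction per eight blocks:
     * Y <- (Y ^ C0)*H^8 ^ C1*H^7 ^ ... ^ C7*H^1, with hpow[i] = H^(i+1). */
    static gf128 ghash_8blocks(gf128 y, const gf128 c[8], const gf128 hpow[8])
    {
        gf128 acc = gf128_mul(gf128_xor(y, c[0]), hpow[7]);
        for (int i = 1; i < 8; i++)
            acc = gf128_xor(acc, gf128_mul(c[i], hpow[7 - i]));
        return acc;
    }

This is the standard aggregated-GHASH rebalancing: because GHASH is a polynomial evaluation in H, eight dependent Horner steps become eight independent products that the CPU can overlap.
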
-        # First 128 bytes of input
-        # aesenc_128
-        # aesenc_ctr
-        vmovdqu	128(%rsp), %xmm0
-        vmovdqu	L_avx2_aes_gcm_bswap_epi64(%rip), %xmm1
-        vpaddd	L_avx2_aes_gcm_one(%rip), %xmm0, %xmm9
-        vpshufb	%xmm1, %xmm0, %xmm8
-        vpaddd	L_avx2_aes_gcm_two(%rip), %xmm0, %xmm10
-        vpshufb	%xmm1, %xmm9, %xmm9
-        vpaddd	L_avx2_aes_gcm_three(%rip), %xmm0, %xmm11
-        vpshufb	%xmm1, %xmm10, %xmm10
-        vpaddd	L_avx2_aes_gcm_four(%rip), %xmm0, %xmm12
-        vpshufb	%xmm1, %xmm11, %xmm11
-        vpaddd	L_avx2_aes_gcm_five(%rip), %xmm0, %xmm13
-        vpshufb	%xmm1, %xmm12, %xmm12
-        vpaddd	L_avx2_aes_gcm_six(%rip), %xmm0, %xmm14
-        vpshufb	%xmm1, %xmm13, %xmm13
-        vpaddd	L_avx2_aes_gcm_seven(%rip), %xmm0, %xmm15
-        vpshufb	%xmm1, %xmm14, %xmm14
-        vpaddd	L_avx2_aes_gcm_eight(%rip), %xmm0, %xmm0
-        vpshufb	%xmm1, %xmm15, %xmm15
-        # aesenc_xor
-        vmovdqu	(%rsi), %xmm7
-        vmovdqu	%xmm0, 128(%rsp)
-        vpxor	%xmm7, %xmm8, %xmm8
-        vpxor	%xmm7, %xmm9, %xmm9
-        vpxor	%xmm7, %xmm10, %xmm10
-        vpxor	%xmm7, %xmm11, %xmm11
-        vpxor	%xmm7, %xmm12, %xmm12
-        vpxor	%xmm7, %xmm13, %xmm13
-        vpxor	%xmm7, %xmm14, %xmm14
-        vpxor	%xmm7, %xmm15, %xmm15
-        vmovdqu	16(%rsi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	32(%rsi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	48(%rsi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	64(%rsi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	80(%rsi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	96(%rsi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	112(%rsi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	128(%rsi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	144(%rsi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        cmpl	$11, %r9d
-        vmovdqu	160(%rsi), %xmm7
-        jl	L_AES_GCM_encrypt_avx2_aesenc_128_enc_done
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	176(%rsi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        cmpl	$13, %r9d
-        vmovdqu	192(%rsi), %xmm7
-        jl	L_AES_GCM_encrypt_avx2_aesenc_128_enc_done
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	208(%rsi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	224(%rsi), %xmm7
-L_AES_GCM_encrypt_avx2_aesenc_128_enc_done:
-        # aesenc_last
-        vaesenclast	%xmm7, %xmm8, %xmm8
-        vaesenclast	%xmm7, %xmm9, %xmm9
-        vaesenclast	%xmm7, %xmm10, %xmm10
-        vaesenclast	%xmm7, %xmm11, %xmm11
-        vmovdqu	(%rdi), %xmm0
-        vmovdqu	16(%rdi), %xmm1
-        vmovdqu	32(%rdi), %xmm2
-        vmovdqu	48(%rdi), %xmm3
-        vpxor	%xmm0, %xmm8, %xmm8
-        vpxor	%xmm1, %xmm9, %xmm9
-        vpxor	%xmm2, %xmm10, %xmm10
-        vpxor	%xmm3, %xmm11, %xmm11
-        vmovdqu	%xmm8, (%r8)
-        vmovdqu	%xmm9, 16(%r8)
-        vmovdqu	%xmm10, 32(%r8)
-        vmovdqu	%xmm11, 48(%r8)
-        vaesenclast	%xmm7, %xmm12, %xmm12
-        vaesenclast	%xmm7, %xmm13, %xmm13
-        vaesenclast	%xmm7, %xmm14, %xmm14
-        vaesenclast	%xmm7, %xmm15, %xmm15
-        vmovdqu	64(%rdi), %xmm0
-        vmovdqu	80(%rdi), %xmm1
-        vmovdqu	96(%rdi), %xmm2
-        vmovdqu	112(%rdi), %xmm3
-        vpxor	%xmm0, %xmm12, %xmm12
-        vpxor	%xmm1, %xmm13, %xmm13
-        vpxor	%xmm2, %xmm14, %xmm14
-        vpxor	%xmm3, %xmm15, %xmm15
-        vmovdqu	%xmm12, 64(%r8)
-        vmovdqu	%xmm13, 80(%r8)
-        vmovdqu	%xmm14, 96(%r8)
-        vmovdqu	%xmm15, 112(%r8)
-        cmpl	$0x80, %r13d
-        movl	$0x80, %ebx
-        jle	L_AES_GCM_encrypt_avx2_end_128
-        # Process further 128-byte blocks of input
-L_AES_GCM_encrypt_avx2_ghash_128:
-        # aesenc_128_ghash
-        leaq	(%rdi,%rbx,1), %rcx
-        leaq	(%r8,%rbx,1), %rdx
-        # aesenc_ctr
-        vmovdqu	128(%rsp), %xmm0
-        vmovdqu	L_avx2_aes_gcm_bswap_epi64(%rip), %xmm1
-        vpaddd	L_avx2_aes_gcm_one(%rip), %xmm0, %xmm9
-        vpshufb	%xmm1, %xmm0, %xmm8
-        vpaddd	L_avx2_aes_gcm_two(%rip), %xmm0, %xmm10
-        vpshufb	%xmm1, %xmm9, %xmm9
-        vpaddd	L_avx2_aes_gcm_three(%rip), %xmm0, %xmm11
-        vpshufb	%xmm1, %xmm10, %xmm10
-        vpaddd	L_avx2_aes_gcm_four(%rip), %xmm0, %xmm12
-        vpshufb	%xmm1, %xmm11, %xmm11
-        vpaddd	L_avx2_aes_gcm_five(%rip), %xmm0, %xmm13
-        vpshufb	%xmm1, %xmm12, %xmm12
-        vpaddd	L_avx2_aes_gcm_six(%rip), %xmm0, %xmm14
-        vpshufb	%xmm1, %xmm13, %xmm13
-        vpaddd	L_avx2_aes_gcm_seven(%rip), %xmm0, %xmm15
-        vpshufb	%xmm1, %xmm14, %xmm14
-        vpaddd	L_avx2_aes_gcm_eight(%rip), %xmm0, %xmm0
-        vpshufb	%xmm1, %xmm15, %xmm15
-        # aesenc_xor
-        vmovdqu	(%rsi), %xmm7
-        vmovdqu	%xmm0, 128(%rsp)
-        vpxor	%xmm7, %xmm8, %xmm8
-        vpxor	%xmm7, %xmm9, %xmm9
-        vpxor	%xmm7, %xmm10, %xmm10
-        vpxor	%xmm7, %xmm11, %xmm11
-        vpxor	%xmm7, %xmm12, %xmm12
-        vpxor	%xmm7, %xmm13, %xmm13
-        vpxor	%xmm7, %xmm14, %xmm14
-        vpxor	%xmm7, %xmm15, %xmm15
-        # aesenc_pclmul_1
-        vmovdqu	-128(%rdx), %xmm1
-        vmovdqu	16(%rsi), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vmovdqu	112(%rsp), %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm5
-        vpclmulqdq	$0x01, %xmm2, %xmm1, %xmm3
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm6
-        vpclmulqdq	$0x11, %xmm2, %xmm1, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_2
-        vmovdqu	-112(%rdx), %xmm1
-        vmovdqu	96(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vmovdqu	32(%rsi), %xmm0
-        vpxor	%xmm1, %xmm7, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_n
-        vmovdqu	-96(%rdx), %xmm1
-        vmovdqu	80(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vmovdqu	48(%rsi), %xmm0
-        vpxor	%xmm1, %xmm7, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_n
-        vmovdqu	-80(%rdx), %xmm1
-        vmovdqu	64(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vmovdqu	64(%rsi), %xmm0
-        vpxor	%xmm1, %xmm7, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_n
-        vmovdqu	-64(%rdx), %xmm1
-        vmovdqu	48(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vmovdqu	80(%rsi), %xmm0
-        vpxor	%xmm1, %xmm7, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_n
-        vmovdqu	-48(%rdx), %xmm1
-        vmovdqu	32(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vmovdqu	96(%rsi), %xmm0
-        vpxor	%xmm1, %xmm7, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_n
-        vmovdqu	-32(%rdx), %xmm1
-        vmovdqu	16(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vmovdqu	112(%rsi), %xmm0
-        vpxor	%xmm1, %xmm7, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_n
-        vmovdqu	-16(%rdx), %xmm1
-        vmovdqu	(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vmovdqu	128(%rsi), %xmm0
-        vpxor	%xmm1, %xmm7, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_l
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpslldq	$8, %xmm5, %xmm1
-        vpsrldq	$8, %xmm5, %xmm5
-        vmovdqu	144(%rsi), %xmm4
-        vmovdqu	L_avx2_aes_gcm_mod2_128(%rip), %xmm0
-        vaesenc	%xmm4, %xmm8, %xmm8
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpxor	%xmm5, %xmm7, %xmm7
-        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
-        vaesenc	%xmm4, %xmm9, %xmm9
-        vaesenc	%xmm4, %xmm10, %xmm10
-        vaesenc	%xmm4, %xmm11, %xmm11
-        vpshufd	$0x4e, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
-        vaesenc	%xmm4, %xmm12, %xmm12
-        vaesenc	%xmm4, %xmm13, %xmm13
-        vaesenc	%xmm4, %xmm14, %xmm14
-        vpshufd	$0x4e, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm7, %xmm6, %xmm6
-        vaesenc	%xmm4, %xmm15, %xmm15
-        cmpl	$11, %r9d
-        vmovdqu	160(%rsi), %xmm7
-        jl	L_AES_GCM_encrypt_avx2_aesenc_128_ghash_avx_done
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	176(%rsi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        cmpl	$13, %r9d
-        vmovdqu	192(%rsi), %xmm7
-        jl	L_AES_GCM_encrypt_avx2_aesenc_128_ghash_avx_done
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	208(%rsi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	224(%rsi), %xmm7
-L_AES_GCM_encrypt_avx2_aesenc_128_ghash_avx_done:
-        # aesenc_last
-        vaesenclast	%xmm7, %xmm8, %xmm8
-        vaesenclast	%xmm7, %xmm9, %xmm9
-        vaesenclast	%xmm7, %xmm10, %xmm10
-        vaesenclast	%xmm7, %xmm11, %xmm11
-        vmovdqu	(%rcx), %xmm0
-        vmovdqu	16(%rcx), %xmm1
-        vmovdqu	32(%rcx), %xmm2
-        vmovdqu	48(%rcx), %xmm3
-        vpxor	%xmm0, %xmm8, %xmm8
-        vpxor	%xmm1, %xmm9, %xmm9
-        vpxor	%xmm2, %xmm10, %xmm10
-        vpxor	%xmm3, %xmm11, %xmm11
-        vmovdqu	%xmm8, (%rdx)
-        vmovdqu	%xmm9, 16(%rdx)
-        vmovdqu	%xmm10, 32(%rdx)
-        vmovdqu	%xmm11, 48(%rdx)
-        vaesenclast	%xmm7, %xmm12, %xmm12
-        vaesenclast	%xmm7, %xmm13, %xmm13
-        vaesenclast	%xmm7, %xmm14, %xmm14
-        vaesenclast	%xmm7, %xmm15, %xmm15
-        vmovdqu	64(%rcx), %xmm0
-        vmovdqu	80(%rcx), %xmm1
-        vmovdqu	96(%rcx), %xmm2
-        vmovdqu	112(%rcx), %xmm3
-        vpxor	%xmm0, %xmm12, %xmm12
-        vpxor	%xmm1, %xmm13, %xmm13
-        vpxor	%xmm2, %xmm14, %xmm14
-        vpxor	%xmm3, %xmm15, %xmm15
-        vmovdqu	%xmm12, 64(%rdx)
-        vmovdqu	%xmm13, 80(%rdx)
-        vmovdqu	%xmm14, 96(%rdx)
-        vmovdqu	%xmm15, 112(%rdx)
-        # aesenc_128_ghash - end
-        addl	$0x80, %ebx
-        cmpl	%r13d, %ebx
-        jl	L_AES_GCM_encrypt_avx2_ghash_128
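
Each pass of the loop closed by this jl interleaves two independent dependency chains: the aesenc rounds producing keystream for the current 128 bytes, and the vpclmulqdq folds hashing the previous iteration's 128 bytes of ciphertext (read from -128(%rdx) upward), which hides pclmulqdq latency behind AES-NI work. The skeleton in C (aes_ctr8_ghash8 is a hypothetical stand-in for the fused body, not a wolfSSL function):

    extern void aes_ctr8_ghash8(const unsigned char *in, unsigned char *out);

    /* The first 128 bytes were already produced before the loop;
     * len128 is the length rounded down to a multiple of 128. */
    static void encrypt_bulk(const unsigned char *in, unsigned char *out,
                             unsigned int len128)
    {
        unsigned int i;
        for (i = 128; i < len128; i += 128)
            aes_ctr8_ghash8(in + i, out + i); /* leaq (%rdi,%rbx,1) / (%r8,%rbx,1) */
    }
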
-L_AES_GCM_encrypt_avx2_end_128:
-        vmovdqu	L_avx2_aes_gcm_bswap_mask(%rip), %xmm4
-        vpshufb	%xmm4, %xmm8, %xmm8
-        vpshufb	%xmm4, %xmm9, %xmm9
-        vpshufb	%xmm4, %xmm10, %xmm10
-        vpshufb	%xmm4, %xmm11, %xmm11
-        vpshufb	%xmm4, %xmm12, %xmm12
-        vpshufb	%xmm4, %xmm13, %xmm13
-        vpshufb	%xmm4, %xmm14, %xmm14
-        vpshufb	%xmm4, %xmm15, %xmm15
-        vpxor	%xmm6, %xmm8, %xmm8
-        vmovdqu	(%rsp), %xmm7
-        vpclmulqdq	$16, %xmm15, %xmm7, %xmm5
-        vpclmulqdq	$0x01, %xmm15, %xmm7, %xmm1
-        vpclmulqdq	$0x00, %xmm15, %xmm7, %xmm4
-        vpclmulqdq	$0x11, %xmm15, %xmm7, %xmm6
-        vpxor	%xmm1, %xmm5, %xmm5
-        vmovdqu	16(%rsp), %xmm7
-        vpclmulqdq	$16, %xmm14, %xmm7, %xmm2
-        vpclmulqdq	$0x01, %xmm14, %xmm7, %xmm1
-        vpclmulqdq	$0x00, %xmm14, %xmm7, %xmm0
-        vpclmulqdq	$0x11, %xmm14, %xmm7, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm4, %xmm4
-        vmovdqu	32(%rsp), %xmm15
-        vmovdqu	48(%rsp), %xmm7
-        vpclmulqdq	$16, %xmm13, %xmm15, %xmm2
-        vpclmulqdq	$0x01, %xmm13, %xmm15, %xmm1
-        vpclmulqdq	$0x00, %xmm13, %xmm15, %xmm0
-        vpclmulqdq	$0x11, %xmm13, %xmm15, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpclmulqdq	$16, %xmm12, %xmm7, %xmm2
-        vpclmulqdq	$0x01, %xmm12, %xmm7, %xmm1
-        vpclmulqdq	$0x00, %xmm12, %xmm7, %xmm0
-        vpclmulqdq	$0x11, %xmm12, %xmm7, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm4, %xmm4
-        vmovdqu	64(%rsp), %xmm15
-        vmovdqu	80(%rsp), %xmm7
-        vpclmulqdq	$16, %xmm11, %xmm15, %xmm2
-        vpclmulqdq	$0x01, %xmm11, %xmm15, %xmm1
-        vpclmulqdq	$0x00, %xmm11, %xmm15, %xmm0
-        vpclmulqdq	$0x11, %xmm11, %xmm15, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpclmulqdq	$16, %xmm10, %xmm7, %xmm2
-        vpclmulqdq	$0x01, %xmm10, %xmm7, %xmm1
-        vpclmulqdq	$0x00, %xmm10, %xmm7, %xmm0
-        vpclmulqdq	$0x11, %xmm10, %xmm7, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm4, %xmm4
-        vmovdqu	96(%rsp), %xmm15
-        vmovdqu	112(%rsp), %xmm7
-        vpclmulqdq	$16, %xmm9, %xmm15, %xmm2
-        vpclmulqdq	$0x01, %xmm9, %xmm15, %xmm1
-        vpclmulqdq	$0x00, %xmm9, %xmm15, %xmm0
-        vpclmulqdq	$0x11, %xmm9, %xmm15, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpclmulqdq	$16, %xmm8, %xmm7, %xmm2
-        vpclmulqdq	$0x01, %xmm8, %xmm7, %xmm1
-        vpclmulqdq	$0x00, %xmm8, %xmm7, %xmm0
-        vpclmulqdq	$0x11, %xmm8, %xmm7, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpslldq	$8, %xmm5, %xmm7
-        vpsrldq	$8, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpxor	%xmm5, %xmm6, %xmm6
-        # ghash_red
-        vmovdqu	L_avx2_aes_gcm_mod2_128(%rip), %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm4, %xmm0
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm6, %xmm6
-        vmovdqu	(%rsp), %xmm5
-        vmovdqu	128(%rsp), %xmm4
-        vmovdqu	144(%rsp), %xmm15
-L_AES_GCM_encrypt_avx2_done_128:
-        cmpl	%r10d, %ebx
-        je	L_AES_GCM_encrypt_avx2_done_enc
-        movl	%r10d, %r13d
-        andl	$0xfffffff0, %r13d
-        cmpl	%r13d, %ebx
-        jge	L_AES_GCM_encrypt_avx2_last_block_done
-        # aesenc_block
-        vmovdqu	%xmm4, %xmm1
-        vpshufb	L_avx2_aes_gcm_bswap_epi64(%rip), %xmm1, %xmm0
-        vpaddd	L_avx2_aes_gcm_one(%rip), %xmm1, %xmm1
-        vpxor	(%rsi), %xmm0, %xmm0
-        vmovdqu	16(%rsi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	32(%rsi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	48(%rsi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	64(%rsi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	80(%rsi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	96(%rsi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	112(%rsi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	128(%rsi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	144(%rsi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	%xmm1, %xmm4
-        cmpl	$11, %r9d
-        vmovdqu	160(%rsi), %xmm1
-        jl	L_AES_GCM_encrypt_avx2_aesenc_block_last
-        vaesenc	%xmm1, %xmm0, %xmm0
-        vmovdqu	176(%rsi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        cmpl	$13, %r9d
-        vmovdqu	192(%rsi), %xmm1
-        jl	L_AES_GCM_encrypt_avx2_aesenc_block_last
-        vaesenc	%xmm1, %xmm0, %xmm0
-        vmovdqu	208(%rsi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	224(%rsi), %xmm1
-L_AES_GCM_encrypt_avx2_aesenc_block_last:
-        vaesenclast	%xmm1, %xmm0, %xmm0
-        vmovdqu	(%rdi,%rbx,1), %xmm1
-        vpxor	%xmm1, %xmm0, %xmm0
-        vmovdqu	%xmm0, (%r8,%rbx,1)
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vpxor	%xmm0, %xmm6, %xmm6
-        addl	$16, %ebx
-        cmpl	%r13d, %ebx
-        jge	L_AES_GCM_encrypt_avx2_last_block_ghash
-L_AES_GCM_encrypt_avx2_last_block_start:
-        vmovdqu	(%rdi,%rbx,1), %xmm12
-        vpshufb	L_avx2_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm11
-        vpaddd	L_avx2_aes_gcm_one(%rip), %xmm4, %xmm4
-        # aesenc_gfmul_sb
-        vpclmulqdq	$0x01, %xmm5, %xmm6, %xmm2
-        vpclmulqdq	$16, %xmm5, %xmm6, %xmm3
-        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm1
-        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm8
-        vpxor	(%rsi), %xmm11, %xmm11
-        vaesenc	16(%rsi), %xmm11, %xmm11
-        vpxor	%xmm2, %xmm3, %xmm3
-        vpslldq	$8, %xmm3, %xmm2
-        vpsrldq	$8, %xmm3, %xmm3
-        vaesenc	32(%rsi), %xmm11, %xmm11
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpclmulqdq	$16, L_avx2_aes_gcm_mod2_128(%rip), %xmm2, %xmm1
-        vaesenc	48(%rsi), %xmm11, %xmm11
-        vaesenc	64(%rsi), %xmm11, %xmm11
-        vaesenc	80(%rsi), %xmm11, %xmm11
-        vpshufd	$0x4e, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpclmulqdq	$16, L_avx2_aes_gcm_mod2_128(%rip), %xmm2, %xmm1
-        vaesenc	96(%rsi), %xmm11, %xmm11
-        vaesenc	112(%rsi), %xmm11, %xmm11
-        vaesenc	128(%rsi), %xmm11, %xmm11
-        vpshufd	$0x4e, %xmm2, %xmm2
-        vaesenc	144(%rsi), %xmm11, %xmm11
-        vpxor	%xmm3, %xmm8, %xmm8
-        vpxor	%xmm8, %xmm2, %xmm2
-        vmovdqu	160(%rsi), %xmm0
-        cmpl	$11, %r9d
-        jl	L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	176(%rsi), %xmm11, %xmm11
-        vmovdqu	192(%rsi), %xmm0
-        cmpl	$13, %r9d
-        jl	L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	208(%rsi), %xmm11, %xmm11
-        vmovdqu	224(%rsi), %xmm0
-L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last:
-        vaesenclast	%xmm0, %xmm11, %xmm11
-        vpxor	%xmm1, %xmm2, %xmm6
-        vpxor	%xmm12, %xmm11, %xmm11
-        vmovdqu	%xmm11, (%r8,%rbx,1)
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm11, %xmm11
-        vpxor	%xmm11, %xmm6, %xmm6
-        addl	$16, %ebx
-        cmpl	%r13d, %ebx
-        jl	L_AES_GCM_encrypt_avx2_last_block_start
-L_AES_GCM_encrypt_avx2_last_block_ghash:
-        # ghash_gfmul_red
-        vpclmulqdq	$16, %xmm5, %xmm6, %xmm10
-        vpclmulqdq	$0x01, %xmm5, %xmm6, %xmm9
-        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm8
-        vpxor	%xmm9, %xmm10, %xmm10
-        vpslldq	$8, %xmm10, %xmm9
-        vpsrldq	$8, %xmm10, %xmm10
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm6
-        vpclmulqdq	$16, L_avx2_aes_gcm_mod2_128(%rip), %xmm9, %xmm8
-        vpshufd	$0x4e, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpclmulqdq	$16, L_avx2_aes_gcm_mod2_128(%rip), %xmm9, %xmm8
-        vpshufd	$0x4e, %xmm9, %xmm9
-        vpxor	%xmm10, %xmm6, %xmm6
-        vpxor	%xmm9, %xmm6, %xmm6
-        vpxor	%xmm8, %xmm6, %xmm6
-L_AES_GCM_encrypt_avx2_last_block_done:
-        movl	%r10d, %ecx
-        movl	%r10d, %edx
-        andl	$15, %ecx
-        jz	L_AES_GCM_encrypt_avx2_done_enc
-        # aesenc_last15_enc
-        vpshufb	L_avx2_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4
-        vpxor	(%rsi), %xmm4, %xmm4
-        vaesenc	16(%rsi), %xmm4, %xmm4
-        vaesenc	32(%rsi), %xmm4, %xmm4
-        vaesenc	48(%rsi), %xmm4, %xmm4
-        vaesenc	64(%rsi), %xmm4, %xmm4
-        vaesenc	80(%rsi), %xmm4, %xmm4
-        vaesenc	96(%rsi), %xmm4, %xmm4
-        vaesenc	112(%rsi), %xmm4, %xmm4
-        vaesenc	128(%rsi), %xmm4, %xmm4
-        vaesenc	144(%rsi), %xmm4, %xmm4
-        cmpl	$11, %r9d
-        vmovdqu	160(%rsi), %xmm0
-        jl	L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last
-        vaesenc	%xmm0, %xmm4, %xmm4
-        vaesenc	176(%rsi), %xmm4, %xmm4
-        cmpl	$13, %r9d
-        vmovdqu	192(%rsi), %xmm0
-        jl	L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last
-        vaesenc	%xmm0, %xmm4, %xmm4
-        vaesenc	208(%rsi), %xmm4, %xmm4
-        vmovdqu	224(%rsi), %xmm0
-L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last:
-        vaesenclast	%xmm0, %xmm4, %xmm4
-        xorl	%ecx, %ecx
-        vpxor	%xmm0, %xmm0, %xmm0
-        vmovdqu	%xmm4, (%rsp)
-        vmovdqu	%xmm0, 16(%rsp)
-L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_loop:
-        movzbl	(%rdi,%rbx,1), %r13d
-        xorb	(%rsp,%rcx,1), %r13b
-        movb	%r13b, 16(%rsp,%rcx,1)
-        movb	%r13b, (%r8,%rbx,1)
-        incl	%ebx
-        incl	%ecx
-        cmpl	%edx, %ebx
-        jl	L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_loop
-L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_finish_enc:
-        vmovdqu	16(%rsp), %xmm4
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4
-        vpxor	%xmm4, %xmm6, %xmm6
-        # ghash_gfmul_red
-        vpclmulqdq	$16, %xmm5, %xmm6, %xmm2
-        vpclmulqdq	$0x01, %xmm5, %xmm6, %xmm1
-        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm0
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm6
-        vpclmulqdq	$16, L_avx2_aes_gcm_mod2_128(%rip), %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, L_avx2_aes_gcm_mod2_128(%rip), %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm6, %xmm6
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpxor	%xmm0, %xmm6, %xmm6
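
The aesenc_last15_enc tail above handles a trailing partial block: the final counter block is encrypted, the remaining nbytes mod 16 plaintext bytes are XORed with that keystream, and the resulting ciphertext is zero-padded at 16(%rsp) before being folded into GHASH. Roughly, in C (aes_encrypt_block is a hypothetical single-block primitive, not the wolfSSL name):

    #include <string.h>

    extern void aes_encrypt_block(const unsigned char *key_sched, int nr,
                                  const unsigned char in[16],
                                  unsigned char out[16]);

    static void ctr_partial(const unsigned char *ks, int nr,
                            const unsigned char ctr[16],
                            const unsigned char *in, unsigned char *out,
                            unsigned int rem, unsigned char ghash_in[16])
    {
        unsigned char stream[16];
        unsigned int i;
        aes_encrypt_block(ks, nr, ctr, stream);
        memset(ghash_in, 0, 16);          /* zero padding for GHASH */
        for (i = 0; i < rem; i++) {
            out[i] = (unsigned char)(in[i] ^ stream[i]);
            ghash_in[i] = out[i];         /* ciphertext is what gets hashed */
        }
    }
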
-L_AES_GCM_encrypt_avx2_done_enc:
-        # calc_tag
-        shlq	$3, %r10
-        shlq	$3, %r11
-        vmovq	%r10, %xmm0
-        vmovq	%r11, %xmm1
-        vpunpcklqdq	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm6, %xmm0, %xmm0
-        # ghash_gfmul_red
-        vpclmulqdq	$16, %xmm5, %xmm0, %xmm4
-        vpclmulqdq	$0x01, %xmm5, %xmm0, %xmm3
-        vpclmulqdq	$0x00, %xmm5, %xmm0, %xmm2
-        vpxor	%xmm3, %xmm4, %xmm4
-        vpslldq	$8, %xmm4, %xmm3
-        vpsrldq	$8, %xmm4, %xmm4
-        vpxor	%xmm2, %xmm3, %xmm3
-        vpclmulqdq	$0x11, %xmm5, %xmm0, %xmm0
-        vpclmulqdq	$16, L_avx2_aes_gcm_mod2_128(%rip), %xmm3, %xmm2
-        vpshufd	$0x4e, %xmm3, %xmm3
-        vpxor	%xmm2, %xmm3, %xmm3
-        vpclmulqdq	$16, L_avx2_aes_gcm_mod2_128(%rip), %xmm3, %xmm2
-        vpshufd	$0x4e, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm0, %xmm0
-        vpxor	%xmm3, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vpxor	%xmm15, %xmm0, %xmm0
-        # store_tag
-        cmpl	$16, %r14d
-        je	L_AES_GCM_encrypt_avx2_store_tag_16
-        xorq	%rcx, %rcx
-        vmovdqu	%xmm0, (%rsp)
-L_AES_GCM_encrypt_avx2_store_tag_loop:
-        movzbl	(%rsp,%rcx,1), %r13d
-        movb	%r13b, (%r15,%rcx,1)
-        incl	%ecx
-        cmpl	%r14d, %ecx
-        jne	L_AES_GCM_encrypt_avx2_store_tag_loop
-        jmp	L_AES_GCM_encrypt_avx2_store_tag_done
-L_AES_GCM_encrypt_avx2_store_tag_16:
-        vmovdqu	%xmm0, (%r15)
-L_AES_GCM_encrypt_avx2_store_tag_done:
-        vzeroupper
-        addq	$0xa0, %rsp
-        popq	%r14
-        popq	%rbx
-        popq	%r15
-        popq	%r12
-        popq	%r13
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_encrypt_avx2,.-AES_GCM_encrypt_avx2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_decrypt_avx2
-.type	AES_GCM_decrypt_avx2,@function
-.align	16
-AES_GCM_decrypt_avx2:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_decrypt_avx2
-.p2align	4
-_AES_GCM_decrypt_avx2:
-#endif /* __APPLE__ */
-        pushq	%r13
-        pushq	%r12
-        pushq	%r14
-        pushq	%rbx
-        pushq	%r15
-        pushq	%rbp
-        movq	%rdx, %r12
-        movq	%rcx, %rax
-        movq	%r8, %r14
-        movq	%rsi, %r8
-        movl	%r9d, %r10d
-        movl	56(%rsp), %r11d
-        movl	64(%rsp), %ebx
-        movl	72(%rsp), %r15d
-        movq	80(%rsp), %rsi
-        movl	88(%rsp), %r9d
-        movq	96(%rsp), %rbp
-        subq	$0xa8, %rsp
-        vpxor	%xmm4, %xmm4, %xmm4
-        vpxor	%xmm6, %xmm6, %xmm6
-        movl	%ebx, %edx
-        cmpl	$12, %edx
-        je	L_AES_GCM_decrypt_avx2_iv_12
-        # Calculate values when IV is not 12 bytes
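-        # Per SP 800-38D: H = E(K, 0^128) and, for IVs other than 96 bits,
-        # J0 = GHASH_H(IV zero-padded || [len(IV) in bits]_64)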
-        # H = Encrypt X(=0)
-        vmovdqu	(%rsi), %xmm5
-        vaesenc	16(%rsi), %xmm5, %xmm5
-        vaesenc	32(%rsi), %xmm5, %xmm5
-        vaesenc	48(%rsi), %xmm5, %xmm5
-        vaesenc	64(%rsi), %xmm5, %xmm5
-        vaesenc	80(%rsi), %xmm5, %xmm5
-        vaesenc	96(%rsi), %xmm5, %xmm5
-        vaesenc	112(%rsi), %xmm5, %xmm5
-        vaesenc	128(%rsi), %xmm5, %xmm5
-        vaesenc	144(%rsi), %xmm5, %xmm5
-        cmpl	$11, %r9d
-        vmovdqu	160(%rsi), %xmm0
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	176(%rsi), %xmm5, %xmm5
-        cmpl	$13, %r9d
-        vmovdqu	192(%rsi), %xmm0
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	208(%rsi), %xmm5, %xmm5
-        vmovdqu	224(%rsi), %xmm0
-L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last:
-        vaesenclast	%xmm0, %xmm5, %xmm5
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5
-        # Calc counter
-        # Initialization vector
-        cmpl	$0x00, %edx
-        movq	$0x00, %rcx
-        je	L_AES_GCM_decrypt_avx2_calc_iv_done
-        cmpl	$16, %edx
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_decrypt_avx2_calc_iv_16_loop:
-        vmovdqu	(%rax,%rcx,1), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm7
-        vpxor	%xmm2, %xmm3, %xmm4
-        # ghash_mid
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm4, %xmm4
-        # ghash_red
-        vmovdqu	L_avx2_aes_gcm_mod2_128(%rip), %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm4, %xmm4
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_16_loop
-        movl	%ebx, %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_decrypt_avx2_calc_iv_done
-L_AES_GCM_decrypt_avx2_calc_iv_lt16:
-        vpxor	%xmm0, %xmm0, %xmm0
-        xorl	%ebx, %ebx
-        vmovdqu	%xmm0, (%rsp)
-L_AES_GCM_decrypt_avx2_calc_iv_loop:
-        movzbl	(%rax,%rcx,1), %r13d
-        movb	%r13b, (%rsp,%rbx,1)
-        incl	%ecx
-        incl	%ebx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_loop
-        vmovdqu	(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm7
-        vpxor	%xmm2, %xmm3, %xmm4
-        # ghash_mid
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm4, %xmm4
-        # ghash_red
-        vmovdqu	L_avx2_aes_gcm_mod2_128(%rip), %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm4, %xmm4
-L_AES_GCM_decrypt_avx2_calc_iv_done:
-        # T = Encrypt counter
-        vpxor	%xmm0, %xmm0, %xmm0
-        shll	$3, %edx
-        vmovq	%rdx, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm7
-        vpxor	%xmm2, %xmm3, %xmm4
-        # ghash_mid
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm4, %xmm4
-        # ghash_red
-        vmovdqu	L_avx2_aes_gcm_mod2_128(%rip), %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm4, %xmm4
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4
-        #   Encrypt counter
-        vmovdqu	(%rsi), %xmm15
-        vpxor	%xmm4, %xmm15, %xmm15
-        vaesenc	16(%rsi), %xmm15, %xmm15
-        vaesenc	32(%rsi), %xmm15, %xmm15
-        vaesenc	48(%rsi), %xmm15, %xmm15
-        vaesenc	64(%rsi), %xmm15, %xmm15
-        vaesenc	80(%rsi), %xmm15, %xmm15
-        vaesenc	96(%rsi), %xmm15, %xmm15
-        vaesenc	112(%rsi), %xmm15, %xmm15
-        vaesenc	128(%rsi), %xmm15, %xmm15
-        vaesenc	144(%rsi), %xmm15, %xmm15
-        cmpl	$11, %r9d
-        vmovdqu	160(%rsi), %xmm0
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last
-        vaesenc	%xmm0, %xmm15, %xmm15
-        vaesenc	176(%rsi), %xmm15, %xmm15
-        cmpl	$13, %r9d
-        vmovdqu	192(%rsi), %xmm0
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last
-        vaesenc	%xmm0, %xmm15, %xmm15
-        vaesenc	208(%rsi), %xmm15, %xmm15
-        vmovdqu	224(%rsi), %xmm0
-L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last:
-        vaesenclast	%xmm0, %xmm15, %xmm15
-        jmp	L_AES_GCM_decrypt_avx2_iv_done
-L_AES_GCM_decrypt_avx2_iv_12:
-        # Calculate values when IV is 12 bytes
-        # Set counter based on IV
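-        # For a 96-bit IV the counter is simply J0 = IV || 0x00000001, so no
-        # GHASH of the IV is needed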
-        vmovdqu	L_avx2_aes_gcm_bswap_one(%rip), %xmm4
-        vmovdqu	(%rsi), %xmm5
-        vpblendd	$7, (%rax), %xmm4, %xmm4
-        # H = Encrypt X(=0) and T = Encrypt counter
-        vmovdqu	16(%rsi), %xmm7
-        vpxor	%xmm5, %xmm4, %xmm15
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	32(%rsi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm15, %xmm15
-        vmovdqu	48(%rsi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm15, %xmm15
-        vmovdqu	64(%rsi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm15, %xmm15
-        vmovdqu	80(%rsi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm15, %xmm15
-        vmovdqu	96(%rsi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm15, %xmm15
-        vmovdqu	112(%rsi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm15, %xmm15
-        vmovdqu	128(%rsi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm15, %xmm15
-        vmovdqu	144(%rsi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm15, %xmm15
-        cmpl	$11, %r9d
-        vmovdqu	160(%rsi), %xmm0
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_12_last
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm15, %xmm15
-        vmovdqu	176(%rsi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm15, %xmm15
-        cmpl	$13, %r9d
-        vmovdqu	192(%rsi), %xmm0
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_12_last
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm15, %xmm15
-        vmovdqu	208(%rsi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm15, %xmm15
-        vmovdqu	224(%rsi), %xmm0
-L_AES_GCM_decrypt_avx2_calc_iv_12_last:
-        vaesenclast	%xmm0, %xmm5, %xmm5
-        vaesenclast	%xmm0, %xmm15, %xmm15
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5
-L_AES_GCM_decrypt_avx2_iv_done:
-        # Additional authentication data
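-        # Fold the AAD into the GHASH state in 16-byte blocks; a final
-        # partial block is zero-padded on the stack before folding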
-        movl	%r11d, %edx
-        cmpl	$0x00, %edx
-        je	L_AES_GCM_decrypt_avx2_calc_aad_done
-        xorl	%ecx, %ecx
-        cmpl	$16, %edx
-        jl	L_AES_GCM_decrypt_avx2_calc_aad_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_decrypt_avx2_calc_aad_16_loop:
-        vmovdqu	(%r12,%rcx,1), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vpxor	%xmm0, %xmm6, %xmm6
-        # ghash_gfmul_avx
-        vpclmulqdq	$16, %xmm6, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm6, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm6, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm7
-        vpxor	%xmm2, %xmm3, %xmm6
-        # ghash_mid
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm6, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm6, %xmm6
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm6, %xmm6
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm6, %xmm6
-        # ghash_red
-        vmovdqu	L_avx2_aes_gcm_mod2_128(%rip), %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm6, %xmm6
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_decrypt_avx2_calc_aad_16_loop
-        movl	%r11d, %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_decrypt_avx2_calc_aad_done
-L_AES_GCM_decrypt_avx2_calc_aad_lt16:
-        vpxor	%xmm0, %xmm0, %xmm0
-        xorl	%ebx, %ebx
-        vmovdqu	%xmm0, (%rsp)
-L_AES_GCM_decrypt_avx2_calc_aad_loop:
-        movzbl	(%r12,%rcx,1), %r13d
-        movb	%r13b, (%rsp,%rbx,1)
-        incl	%ecx
-        incl	%ebx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_decrypt_avx2_calc_aad_loop
-        vmovdqu	(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vpxor	%xmm0, %xmm6, %xmm6
-        # ghash_gfmul_avx
-        vpclmulqdq	$16, %xmm6, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm6, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm6, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm7
-        vpxor	%xmm2, %xmm3, %xmm6
-        # ghash_mid
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm6, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm6, %xmm6
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm6, %xmm6
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm6, %xmm6
-        # ghash_red
-        vmovdqu	L_avx2_aes_gcm_mod2_128(%rip), %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm6, %xmm6
-L_AES_GCM_decrypt_avx2_calc_aad_done:
-        # Calculate counter and H
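-        # Increment the counter for the first data block and multiply H by x
-        # (left shift by 1 with conditional reduction) to get the shifted
-        # GHASH representation used by the main loop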
-        vpsrlq	$63, %xmm5, %xmm1
-        vpsllq	$0x01, %xmm5, %xmm0
-        vpslldq	$8, %xmm1, %xmm1
-        vpor	%xmm1, %xmm0, %xmm0
-        vpshufd	$0xff, %xmm5, %xmm5
-        vpsrad	$31, %xmm5, %xmm5
-        vpshufb	L_avx2_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4
-        vpand	L_avx2_aes_gcm_mod2_128(%rip), %xmm5, %xmm5
-        vpaddd	L_avx2_aes_gcm_one(%rip), %xmm4, %xmm4
-        vpxor	%xmm0, %xmm5, %xmm5
-        xorl	%ebx, %ebx
-        cmpl	$0x80, %r10d
-        movl	%r10d, %r13d
-        jl	L_AES_GCM_decrypt_avx2_done_128
-        andl	$0xffffff80, %r13d
-        vmovdqu	%xmm4, 128(%rsp)
-        vmovdqu	%xmm15, 144(%rsp)
-        vmovdqu	L_avx2_aes_gcm_mod2_128(%rip), %xmm3
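-        # Precompute H^1..H^8 into a stack table so the 128-byte main loop
-        # can fold eight GHASH blocks per pass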
-        # H ^ 1 and H ^ 2
-        vpclmulqdq	$0x00, %xmm5, %xmm5, %xmm9
-        vpclmulqdq	$0x11, %xmm5, %xmm5, %xmm10
-        vpclmulqdq	$16, %xmm3, %xmm9, %xmm8
-        vpshufd	$0x4e, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpclmulqdq	$16, %xmm3, %xmm9, %xmm8
-        vpshufd	$0x4e, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpxor	%xmm9, %xmm10, %xmm0
-        vmovdqu	%xmm5, (%rsp)
-        vmovdqu	%xmm0, 16(%rsp)
-        # H ^ 3 and H ^ 4
-        vpclmulqdq	$16, %xmm5, %xmm0, %xmm11
-        vpclmulqdq	$0x01, %xmm5, %xmm0, %xmm10
-        vpclmulqdq	$0x00, %xmm5, %xmm0, %xmm9
-        vpclmulqdq	$0x11, %xmm5, %xmm0, %xmm12
-        vpclmulqdq	$0x00, %xmm0, %xmm0, %xmm13
-        vpclmulqdq	$0x11, %xmm0, %xmm0, %xmm14
-        vpxor	%xmm10, %xmm11, %xmm11
-        vpslldq	$8, %xmm11, %xmm10
-        vpsrldq	$8, %xmm11, %xmm11
-        vpxor	%xmm9, %xmm10, %xmm10
-        vpclmulqdq	$16, %xmm3, %xmm13, %xmm8
-        vpclmulqdq	$16, %xmm3, %xmm10, %xmm9
-        vpshufd	$0x4e, %xmm10, %xmm10
-        vpshufd	$0x4e, %xmm13, %xmm13
-        vpxor	%xmm9, %xmm10, %xmm10
-        vpxor	%xmm8, %xmm13, %xmm13
-        vpclmulqdq	$16, %xmm3, %xmm10, %xmm9
-        vpclmulqdq	$16, %xmm3, %xmm13, %xmm8
-        vpshufd	$0x4e, %xmm10, %xmm10
-        vpshufd	$0x4e, %xmm13, %xmm13
-        vpxor	%xmm11, %xmm12, %xmm12
-        vpxor	%xmm8, %xmm13, %xmm13
-        vpxor	%xmm12, %xmm10, %xmm10
-        vpxor	%xmm14, %xmm13, %xmm2
-        vpxor	%xmm9, %xmm10, %xmm1
-        vmovdqu	%xmm1, 32(%rsp)
-        vmovdqu	%xmm2, 48(%rsp)
-        # H ^ 5 and H ^ 6
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm11
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm10
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm9
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm12
-        vpclmulqdq	$0x00, %xmm1, %xmm1, %xmm13
-        vpclmulqdq	$0x11, %xmm1, %xmm1, %xmm14
-        vpxor	%xmm10, %xmm11, %xmm11
-        vpslldq	$8, %xmm11, %xmm10
-        vpsrldq	$8, %xmm11, %xmm11
-        vpxor	%xmm9, %xmm10, %xmm10
-        vpclmulqdq	$16, %xmm3, %xmm13, %xmm8
-        vpclmulqdq	$16, %xmm3, %xmm10, %xmm9
-        vpshufd	$0x4e, %xmm10, %xmm10
-        vpshufd	$0x4e, %xmm13, %xmm13
-        vpxor	%xmm9, %xmm10, %xmm10
-        vpxor	%xmm8, %xmm13, %xmm13
-        vpclmulqdq	$16, %xmm3, %xmm10, %xmm9
-        vpclmulqdq	$16, %xmm3, %xmm13, %xmm8
-        vpshufd	$0x4e, %xmm10, %xmm10
-        vpshufd	$0x4e, %xmm13, %xmm13
-        vpxor	%xmm11, %xmm12, %xmm12
-        vpxor	%xmm8, %xmm13, %xmm13
-        vpxor	%xmm12, %xmm10, %xmm10
-        vpxor	%xmm14, %xmm13, %xmm0
-        vpxor	%xmm9, %xmm10, %xmm7
-        vmovdqu	%xmm7, 64(%rsp)
-        vmovdqu	%xmm0, 80(%rsp)
-        # H ^ 7 and H ^ 8
-        vpclmulqdq	$16, %xmm1, %xmm2, %xmm11
-        vpclmulqdq	$0x01, %xmm1, %xmm2, %xmm10
-        vpclmulqdq	$0x00, %xmm1, %xmm2, %xmm9
-        vpclmulqdq	$0x11, %xmm1, %xmm2, %xmm12
-        vpclmulqdq	$0x00, %xmm2, %xmm2, %xmm13
-        vpclmulqdq	$0x11, %xmm2, %xmm2, %xmm14
-        vpxor	%xmm10, %xmm11, %xmm11
-        vpslldq	$8, %xmm11, %xmm10
-        vpsrldq	$8, %xmm11, %xmm11
-        vpxor	%xmm9, %xmm10, %xmm10
-        vpclmulqdq	$16, %xmm3, %xmm13, %xmm8
-        vpclmulqdq	$16, %xmm3, %xmm10, %xmm9
-        vpshufd	$0x4e, %xmm10, %xmm10
-        vpshufd	$0x4e, %xmm13, %xmm13
-        vpxor	%xmm9, %xmm10, %xmm10
-        vpxor	%xmm8, %xmm13, %xmm13
-        vpclmulqdq	$16, %xmm3, %xmm10, %xmm9
-        vpclmulqdq	$16, %xmm3, %xmm13, %xmm8
-        vpshufd	$0x4e, %xmm10, %xmm10
-        vpshufd	$0x4e, %xmm13, %xmm13
-        vpxor	%xmm11, %xmm12, %xmm12
-        vpxor	%xmm8, %xmm13, %xmm13
-        vpxor	%xmm12, %xmm10, %xmm10
-        vpxor	%xmm14, %xmm13, %xmm0
-        vpxor	%xmm9, %xmm10, %xmm7
-        vmovdqu	%xmm7, 96(%rsp)
-        vmovdqu	%xmm0, 112(%rsp)
-L_AES_GCM_decrypt_avx2_ghash_128:
-        # aesenc_128_ghash
-        leaq	(%rdi,%rbx,1), %rcx
-        leaq	(%r8,%rbx,1), %rdx
-        # aesenc_ctr
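-        # Build eight counter blocks (ctr+1 .. ctr+8) and byte-swap each to
-        # big-endian form before the AES rounds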
-        vmovdqu	128(%rsp), %xmm0
-        vmovdqu	L_avx2_aes_gcm_bswap_epi64(%rip), %xmm1
-        vpaddd	L_avx2_aes_gcm_one(%rip), %xmm0, %xmm9
-        vpshufb	%xmm1, %xmm0, %xmm8
-        vpaddd	L_avx2_aes_gcm_two(%rip), %xmm0, %xmm10
-        vpshufb	%xmm1, %xmm9, %xmm9
-        vpaddd	L_avx2_aes_gcm_three(%rip), %xmm0, %xmm11
-        vpshufb	%xmm1, %xmm10, %xmm10
-        vpaddd	L_avx2_aes_gcm_four(%rip), %xmm0, %xmm12
-        vpshufb	%xmm1, %xmm11, %xmm11
-        vpaddd	L_avx2_aes_gcm_five(%rip), %xmm0, %xmm13
-        vpshufb	%xmm1, %xmm12, %xmm12
-        vpaddd	L_avx2_aes_gcm_six(%rip), %xmm0, %xmm14
-        vpshufb	%xmm1, %xmm13, %xmm13
-        vpaddd	L_avx2_aes_gcm_seven(%rip), %xmm0, %xmm15
-        vpshufb	%xmm1, %xmm14, %xmm14
-        vpaddd	L_avx2_aes_gcm_eight(%rip), %xmm0, %xmm0
-        vpshufb	%xmm1, %xmm15, %xmm15
-        # aesenc_xor
-        vmovdqu	(%rsi), %xmm7
-        vmovdqu	%xmm0, 128(%rsp)
-        vpxor	%xmm7, %xmm8, %xmm8
-        vpxor	%xmm7, %xmm9, %xmm9
-        vpxor	%xmm7, %xmm10, %xmm10
-        vpxor	%xmm7, %xmm11, %xmm11
-        vpxor	%xmm7, %xmm12, %xmm12
-        vpxor	%xmm7, %xmm13, %xmm13
-        vpxor	%xmm7, %xmm14, %xmm14
-        vpxor	%xmm7, %xmm15, %xmm15
-        # aesenc_pclmul_1
-        vmovdqu	(%rcx), %xmm1
-        vmovdqu	16(%rsi), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vmovdqu	112(%rsp), %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm5
-        vpclmulqdq	$0x01, %xmm2, %xmm1, %xmm3
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm6
-        vpclmulqdq	$0x11, %xmm2, %xmm1, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_2
-        vmovdqu	16(%rcx), %xmm1
-        vmovdqu	96(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vmovdqu	32(%rsi), %xmm0
-        vpxor	%xmm1, %xmm7, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_n
-        vmovdqu	32(%rcx), %xmm1
-        vmovdqu	80(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vmovdqu	48(%rsi), %xmm0
-        vpxor	%xmm1, %xmm7, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_n
-        vmovdqu	48(%rcx), %xmm1
-        vmovdqu	64(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vmovdqu	64(%rsi), %xmm0
-        vpxor	%xmm1, %xmm7, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_n
-        vmovdqu	64(%rcx), %xmm1
-        vmovdqu	48(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vmovdqu	80(%rsi), %xmm0
-        vpxor	%xmm1, %xmm7, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_n
-        vmovdqu	80(%rcx), %xmm1
-        vmovdqu	32(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vmovdqu	96(%rsi), %xmm0
-        vpxor	%xmm1, %xmm7, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_n
-        vmovdqu	96(%rcx), %xmm1
-        vmovdqu	16(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vmovdqu	112(%rsi), %xmm0
-        vpxor	%xmm1, %xmm7, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_n
-        vmovdqu	112(%rcx), %xmm1
-        vmovdqu	(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vmovdqu	128(%rsi), %xmm0
-        vpxor	%xmm1, %xmm7, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_l
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpslldq	$8, %xmm5, %xmm1
-        vpsrldq	$8, %xmm5, %xmm5
-        vmovdqu	144(%rsi), %xmm4
-        vmovdqu	L_avx2_aes_gcm_mod2_128(%rip), %xmm0
-        vaesenc	%xmm4, %xmm8, %xmm8
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpxor	%xmm5, %xmm7, %xmm7
-        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
-        vaesenc	%xmm4, %xmm9, %xmm9
-        vaesenc	%xmm4, %xmm10, %xmm10
-        vaesenc	%xmm4, %xmm11, %xmm11
-        vpshufd	$0x4e, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
-        vaesenc	%xmm4, %xmm12, %xmm12
-        vaesenc	%xmm4, %xmm13, %xmm13
-        vaesenc	%xmm4, %xmm14, %xmm14
-        vpshufd	$0x4e, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm7, %xmm6, %xmm6
-        vaesenc	%xmm4, %xmm15, %xmm15
-        cmpl	$11, %r9d
-        vmovdqu	160(%rsi), %xmm7
-        jl	L_AES_GCM_decrypt_avx2_aesenc_128_ghash_avx_done
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	176(%rsi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        cmpl	$13, %r9d
-        vmovdqu	192(%rsi), %xmm7
-        jl	L_AES_GCM_decrypt_avx2_aesenc_128_ghash_avx_done
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	208(%rsi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	224(%rsi), %xmm7
-L_AES_GCM_decrypt_avx2_aesenc_128_ghash_avx_done:
-        # aesenc_last
-        vaesenclast	%xmm7, %xmm8, %xmm8
-        vaesenclast	%xmm7, %xmm9, %xmm9
-        vaesenclast	%xmm7, %xmm10, %xmm10
-        vaesenclast	%xmm7, %xmm11, %xmm11
-        vmovdqu	(%rcx), %xmm0
-        vmovdqu	16(%rcx), %xmm1
-        vmovdqu	32(%rcx), %xmm2
-        vmovdqu	48(%rcx), %xmm3
-        vpxor	%xmm0, %xmm8, %xmm8
-        vpxor	%xmm1, %xmm9, %xmm9
-        vpxor	%xmm2, %xmm10, %xmm10
-        vpxor	%xmm3, %xmm11, %xmm11
-        vmovdqu	%xmm8, (%rdx)
-        vmovdqu	%xmm9, 16(%rdx)
-        vmovdqu	%xmm10, 32(%rdx)
-        vmovdqu	%xmm11, 48(%rdx)
-        vaesenclast	%xmm7, %xmm12, %xmm12
-        vaesenclast	%xmm7, %xmm13, %xmm13
-        vaesenclast	%xmm7, %xmm14, %xmm14
-        vaesenclast	%xmm7, %xmm15, %xmm15
-        vmovdqu	64(%rcx), %xmm0
-        vmovdqu	80(%rcx), %xmm1
-        vmovdqu	96(%rcx), %xmm2
-        vmovdqu	112(%rcx), %xmm3
-        vpxor	%xmm0, %xmm12, %xmm12
-        vpxor	%xmm1, %xmm13, %xmm13
-        vpxor	%xmm2, %xmm14, %xmm14
-        vpxor	%xmm3, %xmm15, %xmm15
-        vmovdqu	%xmm12, 64(%rdx)
-        vmovdqu	%xmm13, 80(%rdx)
-        vmovdqu	%xmm14, 96(%rdx)
-        vmovdqu	%xmm15, 112(%rdx)
-        # aesenc_128_ghash - end
-        addl	$0x80, %ebx
-        cmpl	%r13d, %ebx
-        jl	L_AES_GCM_decrypt_avx2_ghash_128
-        vmovdqu	(%rsp), %xmm5
-        vmovdqu	128(%rsp), %xmm4
-        vmovdqu	144(%rsp), %xmm15
-L_AES_GCM_decrypt_avx2_done_128:
-        cmpl	%r10d, %ebx
-        jge	L_AES_GCM_decrypt_avx2_done_dec
-        movl	%r10d, %r13d
-        andl	$0xfffffff0, %r13d
-        cmpl	%r13d, %ebx
-        jge	L_AES_GCM_decrypt_avx2_last_block_done
-L_AES_GCM_decrypt_avx2_last_block_start:
-        vmovdqu	(%rdi,%rbx,1), %xmm11
-        vpshufb	L_avx2_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm10
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm11, %xmm12
-        vpaddd	L_avx2_aes_gcm_one(%rip), %xmm4, %xmm4
-        vpxor	%xmm6, %xmm12, %xmm12
-        # aesenc_gfmul_sb
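-        # Tail blocks one at a time: the ciphertext block is folded into
-        # GHASH while its counter block runs through the AES rounds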
-        vpclmulqdq	$0x01, %xmm5, %xmm12, %xmm2
-        vpclmulqdq	$16, %xmm5, %xmm12, %xmm3
-        vpclmulqdq	$0x00, %xmm5, %xmm12, %xmm1
-        vpclmulqdq	$0x11, %xmm5, %xmm12, %xmm8
-        vpxor	(%rsi), %xmm10, %xmm10
-        vaesenc	16(%rsi), %xmm10, %xmm10
-        vpxor	%xmm2, %xmm3, %xmm3
-        vpslldq	$8, %xmm3, %xmm2
-        vpsrldq	$8, %xmm3, %xmm3
-        vaesenc	32(%rsi), %xmm10, %xmm10
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpclmulqdq	$16, L_avx2_aes_gcm_mod2_128(%rip), %xmm2, %xmm1
-        vaesenc	48(%rsi), %xmm10, %xmm10
-        vaesenc	64(%rsi), %xmm10, %xmm10
-        vaesenc	80(%rsi), %xmm10, %xmm10
-        vpshufd	$0x4e, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpclmulqdq	$16, L_avx2_aes_gcm_mod2_128(%rip), %xmm2, %xmm1
-        vaesenc	96(%rsi), %xmm10, %xmm10
-        vaesenc	112(%rsi), %xmm10, %xmm10
-        vaesenc	128(%rsi), %xmm10, %xmm10
-        vpshufd	$0x4e, %xmm2, %xmm2
-        vaesenc	144(%rsi), %xmm10, %xmm10
-        vpxor	%xmm3, %xmm8, %xmm8
-        vpxor	%xmm8, %xmm2, %xmm2
-        vmovdqu	160(%rsi), %xmm0
-        cmpl	$11, %r9d
-        jl	L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	176(%rsi), %xmm10, %xmm10
-        vmovdqu	192(%rsi), %xmm0
-        cmpl	$13, %r9d
-        jl	L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	208(%rsi), %xmm10, %xmm10
-        vmovdqu	224(%rsi), %xmm0
-L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last:
-        vaesenclast	%xmm0, %xmm10, %xmm10
-        vpxor	%xmm1, %xmm2, %xmm6
-        vpxor	%xmm11, %xmm10, %xmm10
-        vmovdqu	%xmm10, (%r8,%rbx,1)
-        addl	$16, %ebx
-        cmpl	%r13d, %ebx
-        jl	L_AES_GCM_decrypt_avx2_last_block_start
-L_AES_GCM_decrypt_avx2_last_block_done:
-        movl	%r10d, %ecx
-        movl	%r10d, %edx
-        andl	$15, %ecx
-        jz	L_AES_GCM_decrypt_avx2_done_dec
-        # aesenc_last15_dec
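-        # Partial final block: generate one keystream block on the stack,
-        # XOR the remaining 1..15 bytes, and keep the ciphertext bytes
-        # (zero-padded) for the last GHASH update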
-        vpshufb	L_avx2_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4
-        vpxor	(%rsi), %xmm4, %xmm4
-        vaesenc	16(%rsi), %xmm4, %xmm4
-        vaesenc	32(%rsi), %xmm4, %xmm4
-        vaesenc	48(%rsi), %xmm4, %xmm4
-        vaesenc	64(%rsi), %xmm4, %xmm4
-        vaesenc	80(%rsi), %xmm4, %xmm4
-        vaesenc	96(%rsi), %xmm4, %xmm4
-        vaesenc	112(%rsi), %xmm4, %xmm4
-        vaesenc	128(%rsi), %xmm4, %xmm4
-        vaesenc	144(%rsi), %xmm4, %xmm4
-        cmpl	$11, %r9d
-        vmovdqu	160(%rsi), %xmm1
-        jl	L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last
-        vaesenc	%xmm1, %xmm4, %xmm4
-        vaesenc	176(%rsi), %xmm4, %xmm4
-        cmpl	$13, %r9d
-        vmovdqu	192(%rsi), %xmm1
-        jl	L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last
-        vaesenc	%xmm1, %xmm4, %xmm4
-        vaesenc	208(%rsi), %xmm4, %xmm4
-        vmovdqu	224(%rsi), %xmm1
-L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last:
-        vaesenclast	%xmm1, %xmm4, %xmm4
-        xorl	%ecx, %ecx
-        vpxor	%xmm0, %xmm0, %xmm0
-        vmovdqu	%xmm4, (%rsp)
-        vmovdqu	%xmm0, 16(%rsp)
-L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_loop:
-        movzbl	(%rdi,%rbx,1), %r13d
-        movb	%r13b, 16(%rsp,%rcx,1)
-        xorb	(%rsp,%rcx,1), %r13b
-        movb	%r13b, (%r8,%rbx,1)
-        incl	%ebx
-        incl	%ecx
-        cmpl	%edx, %ebx
-        jl	L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_loop
-        vmovdqu	16(%rsp), %xmm4
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4
-        vpxor	%xmm4, %xmm6, %xmm6
-        # ghash_gfmul_red
-        vpclmulqdq	$16, %xmm5, %xmm6, %xmm2
-        vpclmulqdq	$0x01, %xmm5, %xmm6, %xmm1
-        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm0
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm6
-        vpclmulqdq	$16, L_avx2_aes_gcm_mod2_128(%rip), %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, L_avx2_aes_gcm_mod2_128(%rip), %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm6, %xmm6
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpxor	%xmm0, %xmm6, %xmm6
-L_AES_GCM_decrypt_avx2_done_dec:
-        # calc_tag
-        shlq	$3, %r10
-        shlq	$3, %r11
-        vmovq	%r10, %xmm0
-        vmovq	%r11, %xmm1
-        vpunpcklqdq	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm6, %xmm0, %xmm0
-        # ghash_gfmul_red
-        vpclmulqdq	$16, %xmm5, %xmm0, %xmm4
-        vpclmulqdq	$0x01, %xmm5, %xmm0, %xmm3
-        vpclmulqdq	$0x00, %xmm5, %xmm0, %xmm2
-        vpxor	%xmm3, %xmm4, %xmm4
-        vpslldq	$8, %xmm4, %xmm3
-        vpsrldq	$8, %xmm4, %xmm4
-        vpxor	%xmm2, %xmm3, %xmm3
-        vpclmulqdq	$0x11, %xmm5, %xmm0, %xmm0
-        vpclmulqdq	$16, L_avx2_aes_gcm_mod2_128(%rip), %xmm3, %xmm2
-        vpshufd	$0x4e, %xmm3, %xmm3
-        vpxor	%xmm2, %xmm3, %xmm3
-        vpclmulqdq	$16, L_avx2_aes_gcm_mod2_128(%rip), %xmm3, %xmm2
-        vpshufd	$0x4e, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm0, %xmm0
-        vpxor	%xmm3, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vpxor	%xmm15, %xmm0, %xmm0
-        # cmp_tag
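-        # Constant-time tag check: byte differences are OR-accumulated (or
-        # compared with VPCMPEQB for full 16-byte tags) with no early exit,
-        # collapsing to 1 on match and 0 on mismatch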
-        cmpl	$16, %r15d
-        je	L_AES_GCM_decrypt_avx2_cmp_tag_16
-        xorq	%rdx, %rdx
-        xorq	%rax, %rax
-        vmovdqu	%xmm0, (%rsp)
-L_AES_GCM_decrypt_avx2_cmp_tag_loop:
-        movzbl	(%rsp,%rdx,1), %r13d
-        xorb	(%r14,%rdx,1), %r13b
-        orb	%r13b, %al
-        incl	%edx
-        cmpl	%r15d, %edx
-        jne	L_AES_GCM_decrypt_avx2_cmp_tag_loop
-        cmpb	$0x00, %al
-        sete	%al
-        jmp	L_AES_GCM_decrypt_avx2_cmp_tag_done
-L_AES_GCM_decrypt_avx2_cmp_tag_16:
-        vmovdqu	(%r14), %xmm1
-        vpcmpeqb	%xmm1, %xmm0, %xmm0
-        vpmovmskb	%xmm0, %rdx
-        # if %edx == 0xFFFF then return 1, else return 0
-        xorl	%eax, %eax
-        cmpl	$0xffff, %edx
-        sete	%al
-L_AES_GCM_decrypt_avx2_cmp_tag_done:
-        movl	%eax, (%rbp)
-        vzeroupper
-        addq	$0xa8, %rsp
-        popq	%rbp
-        popq	%r15
-        popq	%rbx
-        popq	%r14
-        popq	%r12
-        popq	%r13
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_decrypt_avx2,.-AES_GCM_decrypt_avx2
-#endif /* __APPLE__ */
-#ifdef WOLFSSL_AESGCM_STREAM
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_init_avx2
-.type	AES_GCM_init_avx2,@function
-.align	16
-AES_GCM_init_avx2:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_init_avx2
-.p2align	4
-_AES_GCM_init_avx2:
-#endif /* __APPLE__ */
-        pushq	%rbx
-        pushq	%r12
-        movq	%rdx, %r10
-        movl	%ecx, %r11d
-        movq	24(%rsp), %rax
-        subq	$16, %rsp
-        vpxor	%xmm4, %xmm4, %xmm4
-        movl	%r11d, %edx
-        cmpl	$12, %edx
-        je	L_AES_GCM_init_avx2_iv_12
-        # Calculate values when IV is not 12 bytes
-        # H = Encrypt X(=0)
-        vmovdqu	(%rdi), %xmm5
-        vaesenc	16(%rdi), %xmm5, %xmm5
-        vaesenc	32(%rdi), %xmm5, %xmm5
-        vaesenc	48(%rdi), %xmm5, %xmm5
-        vaesenc	64(%rdi), %xmm5, %xmm5
-        vaesenc	80(%rdi), %xmm5, %xmm5
-        vaesenc	96(%rdi), %xmm5, %xmm5
-        vaesenc	112(%rdi), %xmm5, %xmm5
-        vaesenc	128(%rdi), %xmm5, %xmm5
-        vaesenc	144(%rdi), %xmm5, %xmm5
-        cmpl	$11, %esi
-        vmovdqu	160(%rdi), %xmm0
-        jl	L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	176(%rdi), %xmm5, %xmm5
-        cmpl	$13, %esi
-        vmovdqu	192(%rdi), %xmm0
-        jl	L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	208(%rdi), %xmm5, %xmm5
-        vmovdqu	224(%rdi), %xmm0
-L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last:
-        vaesenclast	%xmm0, %xmm5, %xmm5
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5
-        # Calc counter
-        # Initialization vector
-        cmpl	$0x00, %edx
-        movq	$0x00, %rcx
-        je	L_AES_GCM_init_avx2_calc_iv_done
-        cmpl	$16, %edx
-        jl	L_AES_GCM_init_avx2_calc_iv_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_init_avx2_calc_iv_16_loop:
-        vmovdqu	(%r10,%rcx,1), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm6
-        vpxor	%xmm2, %xmm3, %xmm4
-        # ghash_mid
-        vpsrld	$31, %xmm6, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm6, %xmm6
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm6, %xmm6
-        vpor	%xmm1, %xmm4, %xmm4
-        # ghash_red
-        vmovdqu	L_avx2_aes_gcm_mod2_128(%rip), %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm6, %xmm0
-        vpshufd	$0x4e, %xmm6, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm4, %xmm4
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_init_avx2_calc_iv_16_loop
-        movl	%r11d, %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_init_avx2_calc_iv_done
-L_AES_GCM_init_avx2_calc_iv_lt16:
-        vpxor	%xmm0, %xmm0, %xmm0
-        xorl	%ebx, %ebx
-        vmovdqu	%xmm0, (%rsp)
-L_AES_GCM_init_avx2_calc_iv_loop:
-        movzbl	(%r10,%rcx,1), %r12d
-        movb	%r12b, (%rsp,%rbx,1)
-        incl	%ecx
-        incl	%ebx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_init_avx2_calc_iv_loop
-        vmovdqu	(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm6
-        vpxor	%xmm2, %xmm3, %xmm4
-        # ghash_mid
-        vpsrld	$31, %xmm6, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm6, %xmm6
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm6, %xmm6
-        vpor	%xmm1, %xmm4, %xmm4
-        # ghash_red
-        vmovdqu	L_avx2_aes_gcm_mod2_128(%rip), %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm6, %xmm0
-        vpshufd	$0x4e, %xmm6, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm4, %xmm4
-L_AES_GCM_init_avx2_calc_iv_done:
-        # T = Encrypt counter
-        vpxor	%xmm0, %xmm0, %xmm0
-        shll	$3, %edx
-        vmovq	%rdx, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm6
-        vpxor	%xmm2, %xmm3, %xmm4
-        # ghash_mid
-        vpsrld	$31, %xmm6, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm6, %xmm6
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm6, %xmm6
-        vpor	%xmm1, %xmm4, %xmm4
-        # ghash_red
-        vmovdqu	L_avx2_aes_gcm_mod2_128(%rip), %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm6, %xmm0
-        vpshufd	$0x4e, %xmm6, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm4, %xmm4
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm4, %xmm4
-        #   Encrypt counter
-        vmovdqu	(%rdi), %xmm7
-        vpxor	%xmm4, %xmm7, %xmm7
-        vaesenc	16(%rdi), %xmm7, %xmm7
-        vaesenc	32(%rdi), %xmm7, %xmm7
-        vaesenc	48(%rdi), %xmm7, %xmm7
-        vaesenc	64(%rdi), %xmm7, %xmm7
-        vaesenc	80(%rdi), %xmm7, %xmm7
-        vaesenc	96(%rdi), %xmm7, %xmm7
-        vaesenc	112(%rdi), %xmm7, %xmm7
-        vaesenc	128(%rdi), %xmm7, %xmm7
-        vaesenc	144(%rdi), %xmm7, %xmm7
-        cmpl	$11, %esi
-        vmovdqu	160(%rdi), %xmm0
-        jl	L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vaesenc	176(%rdi), %xmm7, %xmm7
-        cmpl	$13, %esi
-        vmovdqu	192(%rdi), %xmm0
-        jl	L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vaesenc	208(%rdi), %xmm7, %xmm7
-        vmovdqu	224(%rdi), %xmm0
-L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last:
-        vaesenclast	%xmm0, %xmm7, %xmm7
-        jmp	L_AES_GCM_init_avx2_iv_done
-L_AES_GCM_init_avx2_iv_12:
-        # Calculate values when IV is 12 bytes
-        # Set counter based on IV
-        vmovdqu	L_avx2_aes_gcm_bswap_one(%rip), %xmm4
-        vmovdqu	(%rdi), %xmm5
-        vpblendd	$7, (%r10), %xmm4, %xmm4
-        # H = Encrypt X(=0) and T = Encrypt counter
-        vmovdqu	16(%rdi), %xmm6
-        vpxor	%xmm5, %xmm4, %xmm7
-        vaesenc	%xmm6, %xmm5, %xmm5
-        vaesenc	%xmm6, %xmm7, %xmm7
-        vmovdqu	32(%rdi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vmovdqu	48(%rdi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vmovdqu	64(%rdi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vmovdqu	80(%rdi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vmovdqu	96(%rdi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vmovdqu	112(%rdi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vmovdqu	128(%rdi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vmovdqu	144(%rdi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm7, %xmm7
-        cmpl	$11, %esi
-        vmovdqu	160(%rdi), %xmm0
-        jl	L_AES_GCM_init_avx2_calc_iv_12_last
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vmovdqu	176(%rdi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm7, %xmm7
-        cmpl	$13, %esi
-        vmovdqu	192(%rdi), %xmm0
-        jl	L_AES_GCM_init_avx2_calc_iv_12_last
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vmovdqu	208(%rdi), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vmovdqu	224(%rdi), %xmm0
-L_AES_GCM_init_avx2_calc_iv_12_last:
-        vaesenclast	%xmm0, %xmm5, %xmm5
-        vaesenclast	%xmm0, %xmm7, %xmm7
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5
-L_AES_GCM_init_avx2_iv_done:
-        vmovdqu	%xmm7, (%rax)
-        vpshufb	L_avx2_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4
-        vpaddd	L_avx2_aes_gcm_one(%rip), %xmm4, %xmm4
-        vmovdqu	%xmm5, (%r8)
-        vmovdqu	%xmm4, (%r9)
-        vzeroupper
-        addq	$16, %rsp
-        popq	%r12
-        popq	%rbx
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_init_avx2,.-AES_GCM_init_avx2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_aad_update_avx2
-.type	AES_GCM_aad_update_avx2,@function
-.align	16
-AES_GCM_aad_update_avx2:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_aad_update_avx2
-.p2align	4
-_AES_GCM_aad_update_avx2:
-#endif /* __APPLE__ */
-        movq	%rcx, %rax
-        vmovdqu	(%rdx), %xmm4
-        vmovdqu	(%rax), %xmm5
-        xorl	%ecx, %ecx
-L_AES_GCM_aad_update_avx2_16_loop:
-        vmovdqu	(%rdi,%rcx,1), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm6
-        vpxor	%xmm2, %xmm3, %xmm4
-        # ghash_mid
-        vpsrld	$31, %xmm6, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm6, %xmm6
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm6, %xmm6
-        vpor	%xmm1, %xmm4, %xmm4
-        # ghash_red
-        vmovdqu	L_avx2_aes_gcm_mod2_128(%rip), %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm6, %xmm0
-        vpshufd	$0x4e, %xmm6, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm4, %xmm4
-        addl	$16, %ecx
-        cmpl	%esi, %ecx
-        jl	L_AES_GCM_aad_update_avx2_16_loop
-        vmovdqu	%xmm4, (%rdx)
-        vzeroupper
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_aad_update_avx2,.-AES_GCM_aad_update_avx2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_encrypt_block_avx2
-.type	AES_GCM_encrypt_block_avx2,@function
-.align	16
-AES_GCM_encrypt_block_avx2:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_encrypt_block_avx2
-.p2align	4
-_AES_GCM_encrypt_block_avx2:
-#endif /* __APPLE__ */
-        movq	%rdx, %r10
-        movq	%rcx, %r11
-        subq	$0x98, %rsp
-        vmovdqu	(%r8), %xmm3
-        # aesenc_block
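-        # Encrypt exactly one counter block for the streaming API, XOR it
-        # with the 16-byte input, and write back the incremented counter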
-        vmovdqu	%xmm3, %xmm1
-        vpshufb	L_avx2_aes_gcm_bswap_epi64(%rip), %xmm1, %xmm0
-        vpaddd	L_avx2_aes_gcm_one(%rip), %xmm1, %xmm1
-        vpxor	(%rdi), %xmm0, %xmm0
-        vmovdqu	16(%rdi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	32(%rdi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	48(%rdi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	64(%rdi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	80(%rdi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	96(%rdi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	112(%rdi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	128(%rdi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	144(%rdi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	%xmm1, %xmm3
-        cmpl	$11, %esi
-        vmovdqu	160(%rdi), %xmm1
-        jl	L_AES_GCM_encrypt_block_avx2_aesenc_block_last
-        vaesenc	%xmm1, %xmm0, %xmm0
-        vmovdqu	176(%rdi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        cmpl	$13, %esi
-        vmovdqu	192(%rdi), %xmm1
-        jl	L_AES_GCM_encrypt_block_avx2_aesenc_block_last
-        vaesenc	%xmm1, %xmm0, %xmm0
-        vmovdqu	208(%rdi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	224(%rdi), %xmm1
-L_AES_GCM_encrypt_block_avx2_aesenc_block_last:
-        vaesenclast	%xmm1, %xmm0, %xmm0
-        vmovdqu	(%r11), %xmm1
-        vpxor	%xmm1, %xmm0, %xmm0
-        vmovdqu	%xmm0, (%r10)
-        vmovdqu	%xmm3, (%r8)
-        vzeroupper
-        addq	$0x98, %rsp
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_encrypt_block_avx2,.-AES_GCM_encrypt_block_avx2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_ghash_block_avx2
-.type	AES_GCM_ghash_block_avx2,@function
-.align	16
-AES_GCM_ghash_block_avx2:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_ghash_block_avx2
-.p2align	4
-_AES_GCM_ghash_block_avx2:
-#endif /* __APPLE__ */
-        vmovdqu	(%rsi), %xmm4
-        vmovdqu	(%rdx), %xmm5
-        vmovdqu	(%rdi), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm6
-        vpxor	%xmm2, %xmm3, %xmm4
-        # ghash_mid
-        vpsrld	$31, %xmm6, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm6, %xmm6
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm6, %xmm6
-        vpor	%xmm1, %xmm4, %xmm4
-        # ghash_red
-        vmovdqu	L_avx2_aes_gcm_mod2_128(%rip), %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm6, %xmm0
-        vpshufd	$0x4e, %xmm6, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm4, %xmm4
-        vmovdqu	%xmm4, (%rsi)
-        vzeroupper
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_ghash_block_avx2,.-AES_GCM_ghash_block_avx2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_encrypt_update_avx2
-.type	AES_GCM_encrypt_update_avx2,@function
-.align	16
-AES_GCM_encrypt_update_avx2:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_encrypt_update_avx2
-.p2align	4
-_AES_GCM_encrypt_update_avx2:
-#endif /* __APPLE__ */
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        movq	%rdx, %r10
-        movq	%rcx, %r11
-        movq	32(%rsp), %rax
-        movq	40(%rsp), %r12
-        subq	$0x98, %rsp
-        vmovdqu	(%r9), %xmm6
-        vmovdqu	(%rax), %xmm5
-        vmovdqu	(%r12), %xmm4
-        vpsrlq	$63, %xmm5, %xmm1
-        vpsllq	$0x01, %xmm5, %xmm0
-        vpslldq	$8, %xmm1, %xmm1
-        vpor	%xmm1, %xmm0, %xmm0
-        vpshufd	$0xff, %xmm5, %xmm5
-        vpsrad	$31, %xmm5, %xmm5
-        vpand	L_avx2_aes_gcm_mod2_128(%rip), %xmm5, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        xorl	%r14d, %r14d
-        cmpl	$0x80, %r8d
-        movl	%r8d, %r13d
-        jl	L_AES_GCM_encrypt_update_avx2_done_128
-        andl	$0xffffff80, %r13d
-        vmovdqu	%xmm4, 128(%rsp)
-        vmovdqu	L_avx2_aes_gcm_mod2_128(%rip), %xmm3
-        # H ^ 1 and H ^ 2
-        vpclmulqdq	$0x00, %xmm5, %xmm5, %xmm9
-        vpclmulqdq	$0x11, %xmm5, %xmm5, %xmm10
-        vpclmulqdq	$16, %xmm3, %xmm9, %xmm8
-        vpshufd	$0x4e, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpclmulqdq	$16, %xmm3, %xmm9, %xmm8
-        vpshufd	$0x4e, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpxor	%xmm9, %xmm10, %xmm0
-        vmovdqu	%xmm5, (%rsp)
-        vmovdqu	%xmm0, 16(%rsp)
-        # H ^ 3 and H ^ 4
-        vpclmulqdq	$16, %xmm5, %xmm0, %xmm11
-        vpclmulqdq	$0x01, %xmm5, %xmm0, %xmm10
-        vpclmulqdq	$0x00, %xmm5, %xmm0, %xmm9
-        vpclmulqdq	$0x11, %xmm5, %xmm0, %xmm12
-        vpclmulqdq	$0x00, %xmm0, %xmm0, %xmm13
-        vpclmulqdq	$0x11, %xmm0, %xmm0, %xmm14
-        vpxor	%xmm10, %xmm11, %xmm11
-        vpslldq	$8, %xmm11, %xmm10
-        vpsrldq	$8, %xmm11, %xmm11
-        vpxor	%xmm9, %xmm10, %xmm10
-        vpclmulqdq	$16, %xmm3, %xmm13, %xmm8
-        vpclmulqdq	$16, %xmm3, %xmm10, %xmm9
-        vpshufd	$0x4e, %xmm10, %xmm10
-        vpshufd	$0x4e, %xmm13, %xmm13
-        vpxor	%xmm9, %xmm10, %xmm10
-        vpxor	%xmm8, %xmm13, %xmm13
-        vpclmulqdq	$16, %xmm3, %xmm10, %xmm9
-        vpclmulqdq	$16, %xmm3, %xmm13, %xmm8
-        vpshufd	$0x4e, %xmm10, %xmm10
-        vpshufd	$0x4e, %xmm13, %xmm13
-        vpxor	%xmm11, %xmm12, %xmm12
-        vpxor	%xmm8, %xmm13, %xmm13
-        vpxor	%xmm12, %xmm10, %xmm10
-        vpxor	%xmm14, %xmm13, %xmm2
-        vpxor	%xmm9, %xmm10, %xmm1
-        vmovdqu	%xmm1, 32(%rsp)
-        vmovdqu	%xmm2, 48(%rsp)
-        # H ^ 5 and H ^ 6
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm11
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm10
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm9
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm12
-        vpclmulqdq	$0x00, %xmm1, %xmm1, %xmm13
-        vpclmulqdq	$0x11, %xmm1, %xmm1, %xmm14
-        vpxor	%xmm10, %xmm11, %xmm11
-        vpslldq	$8, %xmm11, %xmm10
-        vpsrldq	$8, %xmm11, %xmm11
-        vpxor	%xmm9, %xmm10, %xmm10
-        vpclmulqdq	$16, %xmm3, %xmm13, %xmm8
-        vpclmulqdq	$16, %xmm3, %xmm10, %xmm9
-        vpshufd	$0x4e, %xmm10, %xmm10
-        vpshufd	$0x4e, %xmm13, %xmm13
-        vpxor	%xmm9, %xmm10, %xmm10
-        vpxor	%xmm8, %xmm13, %xmm13
-        vpclmulqdq	$16, %xmm3, %xmm10, %xmm9
-        vpclmulqdq	$16, %xmm3, %xmm13, %xmm8
-        vpshufd	$0x4e, %xmm10, %xmm10
-        vpshufd	$0x4e, %xmm13, %xmm13
-        vpxor	%xmm11, %xmm12, %xmm12
-        vpxor	%xmm8, %xmm13, %xmm13
-        vpxor	%xmm12, %xmm10, %xmm10
-        vpxor	%xmm14, %xmm13, %xmm0
-        vpxor	%xmm9, %xmm10, %xmm7
-        vmovdqu	%xmm7, 64(%rsp)
-        vmovdqu	%xmm0, 80(%rsp)
-        # H ^ 7 and H ^ 8
-        vpclmulqdq	$16, %xmm1, %xmm2, %xmm11
-        vpclmulqdq	$0x01, %xmm1, %xmm2, %xmm10
-        vpclmulqdq	$0x00, %xmm1, %xmm2, %xmm9
-        vpclmulqdq	$0x11, %xmm1, %xmm2, %xmm12
-        vpclmulqdq	$0x00, %xmm2, %xmm2, %xmm13
-        vpclmulqdq	$0x11, %xmm2, %xmm2, %xmm14
-        vpxor	%xmm10, %xmm11, %xmm11
-        vpslldq	$8, %xmm11, %xmm10
-        vpsrldq	$8, %xmm11, %xmm11
-        vpxor	%xmm9, %xmm10, %xmm10
-        vpclmulqdq	$16, %xmm3, %xmm13, %xmm8
-        vpclmulqdq	$16, %xmm3, %xmm10, %xmm9
-        vpshufd	$0x4e, %xmm10, %xmm10
-        vpshufd	$0x4e, %xmm13, %xmm13
-        vpxor	%xmm9, %xmm10, %xmm10
-        vpxor	%xmm8, %xmm13, %xmm13
-        vpclmulqdq	$16, %xmm3, %xmm10, %xmm9
-        vpclmulqdq	$16, %xmm3, %xmm13, %xmm8
-        vpshufd	$0x4e, %xmm10, %xmm10
-        vpshufd	$0x4e, %xmm13, %xmm13
-        vpxor	%xmm11, %xmm12, %xmm12
-        vpxor	%xmm8, %xmm13, %xmm13
-        vpxor	%xmm12, %xmm10, %xmm10
-        vpxor	%xmm14, %xmm13, %xmm0
-        vpxor	%xmm9, %xmm10, %xmm7
-        vmovdqu	%xmm7, 96(%rsp)
-        vmovdqu	%xmm0, 112(%rsp)
-        # First 128 bytes of input
-        # aesenc_128
-        # aesenc_ctr
-        vmovdqu	128(%rsp), %xmm0
-        vmovdqu	L_avx2_aes_gcm_bswap_epi64(%rip), %xmm1
-        vpaddd	L_avx2_aes_gcm_one(%rip), %xmm0, %xmm9
-        vpshufb	%xmm1, %xmm0, %xmm8
-        vpaddd	L_avx2_aes_gcm_two(%rip), %xmm0, %xmm10
-        vpshufb	%xmm1, %xmm9, %xmm9
-        vpaddd	L_avx2_aes_gcm_three(%rip), %xmm0, %xmm11
-        vpshufb	%xmm1, %xmm10, %xmm10
-        vpaddd	L_avx2_aes_gcm_four(%rip), %xmm0, %xmm12
-        vpshufb	%xmm1, %xmm11, %xmm11
-        vpaddd	L_avx2_aes_gcm_five(%rip), %xmm0, %xmm13
-        vpshufb	%xmm1, %xmm12, %xmm12
-        vpaddd	L_avx2_aes_gcm_six(%rip), %xmm0, %xmm14
-        vpshufb	%xmm1, %xmm13, %xmm13
-        vpaddd	L_avx2_aes_gcm_seven(%rip), %xmm0, %xmm15
-        vpshufb	%xmm1, %xmm14, %xmm14
-        vpaddd	L_avx2_aes_gcm_eight(%rip), %xmm0, %xmm0
-        vpshufb	%xmm1, %xmm15, %xmm15
-        # aesenc_xor
-        vmovdqu	(%rdi), %xmm7
-        vmovdqu	%xmm0, 128(%rsp)
-        vpxor	%xmm7, %xmm8, %xmm8
-        vpxor	%xmm7, %xmm9, %xmm9
-        vpxor	%xmm7, %xmm10, %xmm10
-        vpxor	%xmm7, %xmm11, %xmm11
-        vpxor	%xmm7, %xmm12, %xmm12
-        vpxor	%xmm7, %xmm13, %xmm13
-        vpxor	%xmm7, %xmm14, %xmm14
-        vpxor	%xmm7, %xmm15, %xmm15
-        vmovdqu	16(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	32(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	48(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	64(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	80(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	96(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	112(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	128(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	144(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        cmpl	$11, %esi
-        vmovdqu	160(%rdi), %xmm7
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_128_enc_done
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	176(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        cmpl	$13, %esi
-        vmovdqu	192(%rdi), %xmm7
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_128_enc_done
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	208(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	224(%rdi), %xmm7
-L_AES_GCM_encrypt_update_avx2_aesenc_128_enc_done:
-        # aesenc_last
-        vaesenclast	%xmm7, %xmm8, %xmm8
-        vaesenclast	%xmm7, %xmm9, %xmm9
-        vaesenclast	%xmm7, %xmm10, %xmm10
-        vaesenclast	%xmm7, %xmm11, %xmm11
-        vmovdqu	(%r11), %xmm0
-        vmovdqu	16(%r11), %xmm1
-        vmovdqu	32(%r11), %xmm2
-        vmovdqu	48(%r11), %xmm3
-        vpxor	%xmm0, %xmm8, %xmm8
-        vpxor	%xmm1, %xmm9, %xmm9
-        vpxor	%xmm2, %xmm10, %xmm10
-        vpxor	%xmm3, %xmm11, %xmm11
-        vmovdqu	%xmm8, (%r10)
-        vmovdqu	%xmm9, 16(%r10)
-        vmovdqu	%xmm10, 32(%r10)
-        vmovdqu	%xmm11, 48(%r10)
-        vaesenclast	%xmm7, %xmm12, %xmm12
-        vaesenclast	%xmm7, %xmm13, %xmm13
-        vaesenclast	%xmm7, %xmm14, %xmm14
-        vaesenclast	%xmm7, %xmm15, %xmm15
-        vmovdqu	64(%r11), %xmm0
-        vmovdqu	80(%r11), %xmm1
-        vmovdqu	96(%r11), %xmm2
-        vmovdqu	112(%r11), %xmm3
-        vpxor	%xmm0, %xmm12, %xmm12
-        vpxor	%xmm1, %xmm13, %xmm13
-        vpxor	%xmm2, %xmm14, %xmm14
-        vpxor	%xmm3, %xmm15, %xmm15
-        vmovdqu	%xmm12, 64(%r10)
-        vmovdqu	%xmm13, 80(%r10)
-        vmovdqu	%xmm14, 96(%r10)
-        vmovdqu	%xmm15, 112(%r10)
-        cmpl	$0x80, %r13d
-        movl	$0x80, %r14d
-        jle	L_AES_GCM_encrypt_update_avx2_end_128
-        # More 128 bytes of input
-L_AES_GCM_encrypt_update_avx2_ghash_128:
-        # aesenc_128_ghash
-        leaq	(%r11,%r14,1), %rcx
-        leaq	(%r10,%r14,1), %rdx
-        # aesenc_ctr
-        vmovdqu	128(%rsp), %xmm0
-        vmovdqu	L_avx2_aes_gcm_bswap_epi64(%rip), %xmm1
-        vpaddd	L_avx2_aes_gcm_one(%rip), %xmm0, %xmm9
-        vpshufb	%xmm1, %xmm0, %xmm8
-        vpaddd	L_avx2_aes_gcm_two(%rip), %xmm0, %xmm10
-        vpshufb	%xmm1, %xmm9, %xmm9
-        vpaddd	L_avx2_aes_gcm_three(%rip), %xmm0, %xmm11
-        vpshufb	%xmm1, %xmm10, %xmm10
-        vpaddd	L_avx2_aes_gcm_four(%rip), %xmm0, %xmm12
-        vpshufb	%xmm1, %xmm11, %xmm11
-        vpaddd	L_avx2_aes_gcm_five(%rip), %xmm0, %xmm13
-        vpshufb	%xmm1, %xmm12, %xmm12
-        vpaddd	L_avx2_aes_gcm_six(%rip), %xmm0, %xmm14
-        vpshufb	%xmm1, %xmm13, %xmm13
-        vpaddd	L_avx2_aes_gcm_seven(%rip), %xmm0, %xmm15
-        vpshufb	%xmm1, %xmm14, %xmm14
-        vpaddd	L_avx2_aes_gcm_eight(%rip), %xmm0, %xmm0
-        vpshufb	%xmm1, %xmm15, %xmm15
-        # aesenc_xor
-        vmovdqu	(%rdi), %xmm7
-        vmovdqu	%xmm0, 128(%rsp)
-        vpxor	%xmm7, %xmm8, %xmm8
-        vpxor	%xmm7, %xmm9, %xmm9
-        vpxor	%xmm7, %xmm10, %xmm10
-        vpxor	%xmm7, %xmm11, %xmm11
-        vpxor	%xmm7, %xmm12, %xmm12
-        vpxor	%xmm7, %xmm13, %xmm13
-        vpxor	%xmm7, %xmm14, %xmm14
-        vpxor	%xmm7, %xmm15, %xmm15
-        # aesenc_pclmul_1
-        vmovdqu	-128(%rdx), %xmm1
-        vmovdqu	16(%rdi), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vmovdqu	112(%rsp), %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm5
-        vpclmulqdq	$0x01, %xmm2, %xmm1, %xmm3
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm6
-        vpclmulqdq	$0x11, %xmm2, %xmm1, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_2
-        vmovdqu	-112(%rdx), %xmm1
-        vmovdqu	96(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vmovdqu	32(%rdi), %xmm0
-        vpxor	%xmm1, %xmm7, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_n
-        vmovdqu	-96(%rdx), %xmm1
-        vmovdqu	80(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vmovdqu	48(%rdi), %xmm0
-        vpxor	%xmm1, %xmm7, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_n
-        vmovdqu	-80(%rdx), %xmm1
-        vmovdqu	64(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vmovdqu	64(%rdi), %xmm0
-        vpxor	%xmm1, %xmm7, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_n
-        vmovdqu	-64(%rdx), %xmm1
-        vmovdqu	48(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vmovdqu	80(%rdi), %xmm0
-        vpxor	%xmm1, %xmm7, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_n
-        vmovdqu	-48(%rdx), %xmm1
-        vmovdqu	32(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vmovdqu	96(%rdi), %xmm0
-        vpxor	%xmm1, %xmm7, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_n
-        vmovdqu	-32(%rdx), %xmm1
-        vmovdqu	16(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vmovdqu	112(%rdi), %xmm0
-        vpxor	%xmm1, %xmm7, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_n
-        vmovdqu	-16(%rdx), %xmm1
-        vmovdqu	(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vmovdqu	128(%rdi), %xmm0
-        vpxor	%xmm1, %xmm7, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_l
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpslldq	$8, %xmm5, %xmm1
-        vpsrldq	$8, %xmm5, %xmm5
-        vmovdqu	144(%rdi), %xmm4
-        vmovdqu	L_avx2_aes_gcm_mod2_128(%rip), %xmm0
-        vaesenc	%xmm4, %xmm8, %xmm8
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpxor	%xmm5, %xmm7, %xmm7
-        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
-        vaesenc	%xmm4, %xmm9, %xmm9
-        vaesenc	%xmm4, %xmm10, %xmm10
-        vaesenc	%xmm4, %xmm11, %xmm11
-        vpshufd	$0x4e, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
-        vaesenc	%xmm4, %xmm12, %xmm12
-        vaesenc	%xmm4, %xmm13, %xmm13
-        vaesenc	%xmm4, %xmm14, %xmm14
-        vpshufd	$0x4e, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm7, %xmm6, %xmm6
-        vaesenc	%xmm4, %xmm15, %xmm15
-        cmpl	$11, %esi
-        vmovdqu	160(%rdi), %xmm7
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_128_ghash_avx_done
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	176(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        cmpl	$13, %esi
-        vmovdqu	192(%rdi), %xmm7
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_128_ghash_avx_done
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	208(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	224(%rdi), %xmm7
-L_AES_GCM_encrypt_update_avx2_aesenc_128_ghash_avx_done:
-        # aesenc_last
-        vaesenclast	%xmm7, %xmm8, %xmm8
-        vaesenclast	%xmm7, %xmm9, %xmm9
-        vaesenclast	%xmm7, %xmm10, %xmm10
-        vaesenclast	%xmm7, %xmm11, %xmm11
-        vmovdqu	(%rcx), %xmm0
-        vmovdqu	16(%rcx), %xmm1
-        vmovdqu	32(%rcx), %xmm2
-        vmovdqu	48(%rcx), %xmm3
-        vpxor	%xmm0, %xmm8, %xmm8
-        vpxor	%xmm1, %xmm9, %xmm9
-        vpxor	%xmm2, %xmm10, %xmm10
-        vpxor	%xmm3, %xmm11, %xmm11
-        vmovdqu	%xmm8, (%rdx)
-        vmovdqu	%xmm9, 16(%rdx)
-        vmovdqu	%xmm10, 32(%rdx)
-        vmovdqu	%xmm11, 48(%rdx)
-        vaesenclast	%xmm7, %xmm12, %xmm12
-        vaesenclast	%xmm7, %xmm13, %xmm13
-        vaesenclast	%xmm7, %xmm14, %xmm14
-        vaesenclast	%xmm7, %xmm15, %xmm15
-        vmovdqu	64(%rcx), %xmm0
-        vmovdqu	80(%rcx), %xmm1
-        vmovdqu	96(%rcx), %xmm2
-        vmovdqu	112(%rcx), %xmm3
-        vpxor	%xmm0, %xmm12, %xmm12
-        vpxor	%xmm1, %xmm13, %xmm13
-        vpxor	%xmm2, %xmm14, %xmm14
-        vpxor	%xmm3, %xmm15, %xmm15
-        vmovdqu	%xmm12, 64(%rdx)
-        vmovdqu	%xmm13, 80(%rdx)
-        vmovdqu	%xmm14, 96(%rdx)
-        vmovdqu	%xmm15, 112(%rdx)
-        # aesenc_128_ghash - end
-        addl	$0x80, %r14d
-        cmpl	%r13d, %r14d
-        jl	L_AES_GCM_encrypt_update_avx2_ghash_128
-L_AES_GCM_encrypt_update_avx2_end_128:
-        vmovdqu	L_avx2_aes_gcm_bswap_mask(%rip), %xmm4
-        vpshufb	%xmm4, %xmm8, %xmm8
-        vpshufb	%xmm4, %xmm9, %xmm9
-        vpshufb	%xmm4, %xmm10, %xmm10
-        vpshufb	%xmm4, %xmm11, %xmm11
-        vpshufb	%xmm4, %xmm12, %xmm12
-        vpshufb	%xmm4, %xmm13, %xmm13
-        vpshufb	%xmm4, %xmm14, %xmm14
-        vpshufb	%xmm4, %xmm15, %xmm15
-        vpxor	%xmm6, %xmm8, %xmm8
-        vmovdqu	(%rsp), %xmm7
-        vpclmulqdq	$16, %xmm15, %xmm7, %xmm5
-        vpclmulqdq	$0x01, %xmm15, %xmm7, %xmm1
-        vpclmulqdq	$0x00, %xmm15, %xmm7, %xmm4
-        vpclmulqdq	$0x11, %xmm15, %xmm7, %xmm6
-        vpxor	%xmm1, %xmm5, %xmm5
-        vmovdqu	16(%rsp), %xmm7
-        vpclmulqdq	$16, %xmm14, %xmm7, %xmm2
-        vpclmulqdq	$0x01, %xmm14, %xmm7, %xmm1
-        vpclmulqdq	$0x00, %xmm14, %xmm7, %xmm0
-        vpclmulqdq	$0x11, %xmm14, %xmm7, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm4, %xmm4
-        vmovdqu	32(%rsp), %xmm15
-        vmovdqu	48(%rsp), %xmm7
-        vpclmulqdq	$16, %xmm13, %xmm15, %xmm2
-        vpclmulqdq	$0x01, %xmm13, %xmm15, %xmm1
-        vpclmulqdq	$0x00, %xmm13, %xmm15, %xmm0
-        vpclmulqdq	$0x11, %xmm13, %xmm15, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpclmulqdq	$16, %xmm12, %xmm7, %xmm2
-        vpclmulqdq	$0x01, %xmm12, %xmm7, %xmm1
-        vpclmulqdq	$0x00, %xmm12, %xmm7, %xmm0
-        vpclmulqdq	$0x11, %xmm12, %xmm7, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm4, %xmm4
-        vmovdqu	64(%rsp), %xmm15
-        vmovdqu	80(%rsp), %xmm7
-        vpclmulqdq	$16, %xmm11, %xmm15, %xmm2
-        vpclmulqdq	$0x01, %xmm11, %xmm15, %xmm1
-        vpclmulqdq	$0x00, %xmm11, %xmm15, %xmm0
-        vpclmulqdq	$0x11, %xmm11, %xmm15, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpclmulqdq	$16, %xmm10, %xmm7, %xmm2
-        vpclmulqdq	$0x01, %xmm10, %xmm7, %xmm1
-        vpclmulqdq	$0x00, %xmm10, %xmm7, %xmm0
-        vpclmulqdq	$0x11, %xmm10, %xmm7, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm4, %xmm4
-        vmovdqu	96(%rsp), %xmm15
-        vmovdqu	112(%rsp), %xmm7
-        vpclmulqdq	$16, %xmm9, %xmm15, %xmm2
-        vpclmulqdq	$0x01, %xmm9, %xmm15, %xmm1
-        vpclmulqdq	$0x00, %xmm9, %xmm15, %xmm0
-        vpclmulqdq	$0x11, %xmm9, %xmm15, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpclmulqdq	$16, %xmm8, %xmm7, %xmm2
-        vpclmulqdq	$0x01, %xmm8, %xmm7, %xmm1
-        vpclmulqdq	$0x00, %xmm8, %xmm7, %xmm0
-        vpclmulqdq	$0x11, %xmm8, %xmm7, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpslldq	$8, %xmm5, %xmm7
-        vpsrldq	$8, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpxor	%xmm5, %xmm6, %xmm6
-        # ghash_red
-        vmovdqu	L_avx2_aes_gcm_mod2_128(%rip), %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm4, %xmm0
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm6, %xmm6
-        vmovdqu	(%rsp), %xmm5
-        vmovdqu	128(%rsp), %xmm4
-L_AES_GCM_encrypt_update_avx2_done_128:
-        cmpl	%r8d, %r14d
-        je	L_AES_GCM_encrypt_update_avx2_done_enc
-        movl	%r8d, %r13d
-        andl	$0xfffffff0, %r13d
-        cmpl	%r13d, %r14d
-        jge	L_AES_GCM_encrypt_update_avx2_last_block_done
-        # aesenc_block
-        vmovdqu	%xmm4, %xmm1
-        vpshufb	L_avx2_aes_gcm_bswap_epi64(%rip), %xmm1, %xmm0
-        vpaddd	L_avx2_aes_gcm_one(%rip), %xmm1, %xmm1
-        vpxor	(%rdi), %xmm0, %xmm0
-        vmovdqu	16(%rdi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	32(%rdi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	48(%rdi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	64(%rdi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	80(%rdi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	96(%rdi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	112(%rdi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	128(%rdi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	144(%rdi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	%xmm1, %xmm4
-        cmpl	$11, %esi
-        vmovdqu	160(%rdi), %xmm1
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_block_last
-        vaesenc	%xmm1, %xmm0, %xmm0
-        vmovdqu	176(%rdi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        cmpl	$13, %esi
-        vmovdqu	192(%rdi), %xmm1
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_block_last
-        vaesenc	%xmm1, %xmm0, %xmm0
-        vmovdqu	208(%rdi), %xmm2
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vmovdqu	224(%rdi), %xmm1
-L_AES_GCM_encrypt_update_avx2_aesenc_block_last:
-        vaesenclast	%xmm1, %xmm0, %xmm0
-        vmovdqu	(%r11,%r14,1), %xmm1
-        vpxor	%xmm1, %xmm0, %xmm0
-        vmovdqu	%xmm0, (%r10,%r14,1)
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vpxor	%xmm0, %xmm6, %xmm6
-        addl	$16, %r14d
-        cmpl	%r13d, %r14d
-        jge	L_AES_GCM_encrypt_update_avx2_last_block_ghash
-L_AES_GCM_encrypt_update_avx2_last_block_start:
-        vmovdqu	(%r11,%r14,1), %xmm12
-        vpshufb	L_avx2_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm11
-        vpaddd	L_avx2_aes_gcm_one(%rip), %xmm4, %xmm4
-        # aesenc_gfmul_sb
-        vpclmulqdq	$0x01, %xmm5, %xmm6, %xmm2
-        vpclmulqdq	$16, %xmm5, %xmm6, %xmm3
-        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm1
-        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm8
-        vpxor	(%rdi), %xmm11, %xmm11
-        vaesenc	16(%rdi), %xmm11, %xmm11
-        vpxor	%xmm2, %xmm3, %xmm3
-        vpslldq	$8, %xmm3, %xmm2
-        vpsrldq	$8, %xmm3, %xmm3
-        vaesenc	32(%rdi), %xmm11, %xmm11
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpclmulqdq	$16, L_avx2_aes_gcm_mod2_128(%rip), %xmm2, %xmm1
-        vaesenc	48(%rdi), %xmm11, %xmm11
-        vaesenc	64(%rdi), %xmm11, %xmm11
-        vaesenc	80(%rdi), %xmm11, %xmm11
-        vpshufd	$0x4e, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpclmulqdq	$16, L_avx2_aes_gcm_mod2_128(%rip), %xmm2, %xmm1
-        vaesenc	96(%rdi), %xmm11, %xmm11
-        vaesenc	112(%rdi), %xmm11, %xmm11
-        vaesenc	128(%rdi), %xmm11, %xmm11
-        vpshufd	$0x4e, %xmm2, %xmm2
-        vaesenc	144(%rdi), %xmm11, %xmm11
-        vpxor	%xmm3, %xmm8, %xmm8
-        vpxor	%xmm8, %xmm2, %xmm2
-        vmovdqu	160(%rdi), %xmm0
-        cmpl	$11, %esi
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	176(%rdi), %xmm11, %xmm11
-        vmovdqu	192(%rdi), %xmm0
-        cmpl	$13, %esi
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	208(%rdi), %xmm11, %xmm11
-        vmovdqu	224(%rdi), %xmm0
-L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last:
-        vaesenclast	%xmm0, %xmm11, %xmm11
-        vpxor	%xmm1, %xmm2, %xmm6
-        vpxor	%xmm12, %xmm11, %xmm11
-        vmovdqu	%xmm11, (%r10,%r14,1)
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm11, %xmm11
-        vpxor	%xmm11, %xmm6, %xmm6
-        addl	$16, %r14d
-        cmpl	%r13d, %r14d
-        jl	L_AES_GCM_encrypt_update_avx2_last_block_start
-L_AES_GCM_encrypt_update_avx2_last_block_ghash:
-        # ghash_gfmul_red
-        vpclmulqdq	$16, %xmm5, %xmm6, %xmm10
-        vpclmulqdq	$0x01, %xmm5, %xmm6, %xmm9
-        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm8
-        vpxor	%xmm9, %xmm10, %xmm10
-        vpslldq	$8, %xmm10, %xmm9
-        vpsrldq	$8, %xmm10, %xmm10
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm6
-        vpclmulqdq	$16, L_avx2_aes_gcm_mod2_128(%rip), %xmm9, %xmm8
-        vpshufd	$0x4e, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpclmulqdq	$16, L_avx2_aes_gcm_mod2_128(%rip), %xmm9, %xmm8
-        vpshufd	$0x4e, %xmm9, %xmm9
-        vpxor	%xmm10, %xmm6, %xmm6
-        vpxor	%xmm9, %xmm6, %xmm6
-        vpxor	%xmm8, %xmm6, %xmm6
-L_AES_GCM_encrypt_update_avx2_last_block_done:
-L_AES_GCM_encrypt_update_avx2_done_enc:
-        vmovdqu	%xmm6, (%r9)
-        vmovdqu	%xmm4, (%r12)
-        vzeroupper
-        addq	$0x98, %rsp
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_encrypt_update_avx2,.-AES_GCM_encrypt_update_avx2
-#endif /* __APPLE__ */
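
The AES_GCM_encrypt_update_avx2 routine removed above interleaves eight AES-CTR blocks per loop iteration with GHASH folding: the powers H^1..H^8 cached at (%rsp)..112(%rsp) let each 128-byte window collapse into a single GF(2^128) reduction. A minimal C model of that folding math follows; gf128, gf128_add, gf128_mul and ghash_fold8 are illustrative names standing in for the vpclmulqdq sequences, not wolfSSL APIs, and the multiply is the slow bitwise algorithm from NIST SP 800-38D rather than the carry-less-multiply trick used here.

    #include <stdint.h>

    typedef struct { uint64_t hi, lo; } gf128;  /* hi holds GHASH bits 0..63 */

    static gf128 gf128_add(gf128 a, gf128 b)    /* addition in GF(2^128) is XOR */
    {
        a.hi ^= b.hi; a.lo ^= b.lo; return a;
    }

    /* Bitwise GHASH multiply (NIST SP 800-38D, Algorithm 1): slow but exact. */
    static gf128 gf128_mul(gf128 x, gf128 y)
    {
        gf128 z = { 0, 0 }, v = y;
        for (int i = 0; i < 128; i++) {
            uint64_t xbit = (i < 64) ? (x.hi >> (63 - i)) & 1
                                     : (x.lo >> (127 - i)) & 1;
            if (xbit) { z.hi ^= v.hi; z.lo ^= v.lo; }
            uint64_t lsb = v.lo & 1;                 /* bit about to shift out */
            v.lo = (v.lo >> 1) | (v.hi << 63);
            v.hi >>= 1;
            if (lsb) v.hi ^= 0xe100000000000000ULL;  /* reduce by the polynomial */
        }
        return z;
    }

    /* Eight-block fold: X' = (X+C0)*H^8 + C1*H^7 + ... + C7*H^1.
     * h_pow[i] is H^(i+1), matching the stack layout in the assembly,
     * and c[] holds the byte-swapped ciphertext blocks. */
    gf128 ghash_fold8(gf128 x, const gf128 h_pow[8], const gf128 c[8])
    {
        gf128 acc = gf128_mul(gf128_add(x, c[0]), h_pow[7]);
        for (int i = 1; i < 8; i++)
            acc = gf128_add(acc, gf128_mul(c[i], h_pow[7 - i]));
        return acc;
    }
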
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_encrypt_final_avx2
-.type	AES_GCM_encrypt_final_avx2,@function
-.align	16
-AES_GCM_encrypt_final_avx2:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_encrypt_final_avx2
-.p2align	4
-_AES_GCM_encrypt_final_avx2:
-#endif /* __APPLE__ */
-        movq	8(%rsp), %rax
-        subq	$16, %rsp
-        vmovdqu	(%rdi), %xmm4
-        vmovdqu	(%r9), %xmm5
-        vmovdqu	(%rax), %xmm6
-        vpsrlq	$63, %xmm5, %xmm1
-        vpsllq	$0x01, %xmm5, %xmm0
-        vpslldq	$8, %xmm1, %xmm1
-        vpor	%xmm1, %xmm0, %xmm0
-        vpshufd	$0xff, %xmm5, %xmm5
-        vpsrad	$31, %xmm5, %xmm5
-        vpand	L_avx2_aes_gcm_mod2_128(%rip), %xmm5, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        # calc_tag
-        shlq	$3, %rcx
-        shlq	$3, %r8
-        vmovq	%rcx, %xmm0
-        vmovq	%r8, %xmm1
-        vpunpcklqdq	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm4, %xmm0, %xmm0
-        # ghash_gfmul_red
-        vpclmulqdq	$16, %xmm5, %xmm0, %xmm7
-        vpclmulqdq	$0x01, %xmm5, %xmm0, %xmm3
-        vpclmulqdq	$0x00, %xmm5, %xmm0, %xmm2
-        vpxor	%xmm3, %xmm7, %xmm7
-        vpslldq	$8, %xmm7, %xmm3
-        vpsrldq	$8, %xmm7, %xmm7
-        vpxor	%xmm2, %xmm3, %xmm3
-        vpclmulqdq	$0x11, %xmm5, %xmm0, %xmm0
-        vpclmulqdq	$16, L_avx2_aes_gcm_mod2_128(%rip), %xmm3, %xmm2
-        vpshufd	$0x4e, %xmm3, %xmm3
-        vpxor	%xmm2, %xmm3, %xmm3
-        vpclmulqdq	$16, L_avx2_aes_gcm_mod2_128(%rip), %xmm3, %xmm2
-        vpshufd	$0x4e, %xmm3, %xmm3
-        vpxor	%xmm7, %xmm0, %xmm0
-        vpxor	%xmm3, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vpxor	%xmm6, %xmm0, %xmm0
-        # store_tag
-        cmpl	$16, %edx
-        je	L_AES_GCM_encrypt_final_avx2_store_tag_16
-        xorq	%r10, %r10
-        vmovdqu	%xmm0, (%rsp)
-L_AES_GCM_encrypt_final_avx2_store_tag_loop:
-        movzbl	(%rsp,%r10,1), %r11d
-        movb	%r11b, (%rsi,%r10,1)
-        incl	%r10d
-        cmpl	%edx, %r10d
-        jne	L_AES_GCM_encrypt_final_avx2_store_tag_loop
-        jmp	L_AES_GCM_encrypt_final_avx2_store_tag_done
-L_AES_GCM_encrypt_final_avx2_store_tag_16:
-        vmovdqu	%xmm0, (%rsi)
-L_AES_GCM_encrypt_final_avx2_store_tag_done:
-        vzeroupper
-        addq	$16, %rsp
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_encrypt_final_avx2,.-AES_GCM_encrypt_final_avx2
-#endif /* __APPLE__ */
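
AES_GCM_encrypt_final_avx2 above finishes the tag as T = GHASH_H(X xor (len(A) || len(C))) xor E_K(J0): the two shlq $3 instructions turn byte counts into bit counts, vpunpcklqdq packs them into one block, and store_tag either writes all 16 bytes at once or copies a truncated tag a byte at a time. Below is a sketch of the length-block packing in the standard big-endian layout; gcm_len_block is an illustrative name, not a wolfSSL function, and the assembly stores the raw little-endian qwords instead because its GHASH state already lives in the byte-reversed domain.

    #include <stdint.h>

    /* Pack len(A) || len(C), both in bits and big-endian, as GHASH expects. */
    void gcm_len_block(uint8_t out[16], uint64_t aad_bytes, uint64_t ct_bytes)
    {
        uint64_t abits = aad_bytes << 3;    /* shlq $3: bytes -> bits */
        uint64_t cbits = ct_bytes << 3;
        for (int i = 0; i < 8; i++) {
            out[i]     = (uint8_t)(abits >> (56 - 8 * i));
            out[8 + i] = (uint8_t)(cbits >> (56 - 8 * i));
        }
    }
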
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_decrypt_update_avx2
-.type	AES_GCM_decrypt_update_avx2,@function
-.align	16
-AES_GCM_decrypt_update_avx2:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_decrypt_update_avx2
-.p2align	4
-_AES_GCM_decrypt_update_avx2:
-#endif /* __APPLE__ */
-        pushq	%r13
-        pushq	%r12
-        pushq	%r14
-        movq	%rdx, %r10
-        movq	%rcx, %r11
-        movq	32(%rsp), %rax
-        movq	40(%rsp), %r12
-        subq	$0xa8, %rsp
-        vmovdqu	(%r9), %xmm6
-        vmovdqu	(%rax), %xmm5
-        vmovdqu	(%r12), %xmm4
-        # Calculate H
-        vpsrlq	$63, %xmm5, %xmm1
-        vpsllq	$0x01, %xmm5, %xmm0
-        vpslldq	$8, %xmm1, %xmm1
-        vpor	%xmm1, %xmm0, %xmm0
-        vpshufd	$0xff, %xmm5, %xmm5
-        vpsrad	$31, %xmm5, %xmm5
-        vpand	L_avx2_aes_gcm_mod2_128(%rip), %xmm5, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        xorl	%r14d, %r14d
-        cmpl	$0x80, %r8d
-        movl	%r8d, %r13d
-        jl	L_AES_GCM_decrypt_update_avx2_done_128
-        andl	$0xffffff80, %r13d
-        vmovdqu	%xmm4, 128(%rsp)
-        vmovdqu	%xmm15, 144(%rsp)
-        vmovdqu	L_avx2_aes_gcm_mod2_128(%rip), %xmm3
-        # H ^ 1 and H ^ 2
-        vpclmulqdq	$0x00, %xmm5, %xmm5, %xmm9
-        vpclmulqdq	$0x11, %xmm5, %xmm5, %xmm10
-        vpclmulqdq	$16, %xmm3, %xmm9, %xmm8
-        vpshufd	$0x4e, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpclmulqdq	$16, %xmm3, %xmm9, %xmm8
-        vpshufd	$0x4e, %xmm9, %xmm9
-        vpxor	%xmm8, %xmm9, %xmm9
-        vpxor	%xmm9, %xmm10, %xmm0
-        vmovdqu	%xmm5, (%rsp)
-        vmovdqu	%xmm0, 16(%rsp)
-        # H ^ 3 and H ^ 4
-        vpclmulqdq	$16, %xmm5, %xmm0, %xmm11
-        vpclmulqdq	$0x01, %xmm5, %xmm0, %xmm10
-        vpclmulqdq	$0x00, %xmm5, %xmm0, %xmm9
-        vpclmulqdq	$0x11, %xmm5, %xmm0, %xmm12
-        vpclmulqdq	$0x00, %xmm0, %xmm0, %xmm13
-        vpclmulqdq	$0x11, %xmm0, %xmm0, %xmm14
-        vpxor	%xmm10, %xmm11, %xmm11
-        vpslldq	$8, %xmm11, %xmm10
-        vpsrldq	$8, %xmm11, %xmm11
-        vpxor	%xmm9, %xmm10, %xmm10
-        vpclmulqdq	$16, %xmm3, %xmm13, %xmm8
-        vpclmulqdq	$16, %xmm3, %xmm10, %xmm9
-        vpshufd	$0x4e, %xmm10, %xmm10
-        vpshufd	$0x4e, %xmm13, %xmm13
-        vpxor	%xmm9, %xmm10, %xmm10
-        vpxor	%xmm8, %xmm13, %xmm13
-        vpclmulqdq	$16, %xmm3, %xmm10, %xmm9
-        vpclmulqdq	$16, %xmm3, %xmm13, %xmm8
-        vpshufd	$0x4e, %xmm10, %xmm10
-        vpshufd	$0x4e, %xmm13, %xmm13
-        vpxor	%xmm11, %xmm12, %xmm12
-        vpxor	%xmm8, %xmm13, %xmm13
-        vpxor	%xmm12, %xmm10, %xmm10
-        vpxor	%xmm14, %xmm13, %xmm2
-        vpxor	%xmm9, %xmm10, %xmm1
-        vmovdqu	%xmm1, 32(%rsp)
-        vmovdqu	%xmm2, 48(%rsp)
-        # H ^ 5 and H ^ 6
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm11
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm10
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm9
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm12
-        vpclmulqdq	$0x00, %xmm1, %xmm1, %xmm13
-        vpclmulqdq	$0x11, %xmm1, %xmm1, %xmm14
-        vpxor	%xmm10, %xmm11, %xmm11
-        vpslldq	$8, %xmm11, %xmm10
-        vpsrldq	$8, %xmm11, %xmm11
-        vpxor	%xmm9, %xmm10, %xmm10
-        vpclmulqdq	$16, %xmm3, %xmm13, %xmm8
-        vpclmulqdq	$16, %xmm3, %xmm10, %xmm9
-        vpshufd	$0x4e, %xmm10, %xmm10
-        vpshufd	$0x4e, %xmm13, %xmm13
-        vpxor	%xmm9, %xmm10, %xmm10
-        vpxor	%xmm8, %xmm13, %xmm13
-        vpclmulqdq	$16, %xmm3, %xmm10, %xmm9
-        vpclmulqdq	$16, %xmm3, %xmm13, %xmm8
-        vpshufd	$0x4e, %xmm10, %xmm10
-        vpshufd	$0x4e, %xmm13, %xmm13
-        vpxor	%xmm11, %xmm12, %xmm12
-        vpxor	%xmm8, %xmm13, %xmm13
-        vpxor	%xmm12, %xmm10, %xmm10
-        vpxor	%xmm14, %xmm13, %xmm0
-        vpxor	%xmm9, %xmm10, %xmm7
-        vmovdqu	%xmm7, 64(%rsp)
-        vmovdqu	%xmm0, 80(%rsp)
-        # H ^ 7 and H ^ 8
-        vpclmulqdq	$16, %xmm1, %xmm2, %xmm11
-        vpclmulqdq	$0x01, %xmm1, %xmm2, %xmm10
-        vpclmulqdq	$0x00, %xmm1, %xmm2, %xmm9
-        vpclmulqdq	$0x11, %xmm1, %xmm2, %xmm12
-        vpclmulqdq	$0x00, %xmm2, %xmm2, %xmm13
-        vpclmulqdq	$0x11, %xmm2, %xmm2, %xmm14
-        vpxor	%xmm10, %xmm11, %xmm11
-        vpslldq	$8, %xmm11, %xmm10
-        vpsrldq	$8, %xmm11, %xmm11
-        vpxor	%xmm9, %xmm10, %xmm10
-        vpclmulqdq	$16, %xmm3, %xmm13, %xmm8
-        vpclmulqdq	$16, %xmm3, %xmm10, %xmm9
-        vpshufd	$0x4e, %xmm10, %xmm10
-        vpshufd	$0x4e, %xmm13, %xmm13
-        vpxor	%xmm9, %xmm10, %xmm10
-        vpxor	%xmm8, %xmm13, %xmm13
-        vpclmulqdq	$16, %xmm3, %xmm10, %xmm9
-        vpclmulqdq	$16, %xmm3, %xmm13, %xmm8
-        vpshufd	$0x4e, %xmm10, %xmm10
-        vpshufd	$0x4e, %xmm13, %xmm13
-        vpxor	%xmm11, %xmm12, %xmm12
-        vpxor	%xmm8, %xmm13, %xmm13
-        vpxor	%xmm12, %xmm10, %xmm10
-        vpxor	%xmm14, %xmm13, %xmm0
-        vpxor	%xmm9, %xmm10, %xmm7
-        vmovdqu	%xmm7, 96(%rsp)
-        vmovdqu	%xmm0, 112(%rsp)
-L_AES_GCM_decrypt_update_avx2_ghash_128:
-        # aesenc_128_ghash
-        leaq	(%r11,%r14,1), %rcx
-        leaq	(%r10,%r14,1), %rdx
-        # aesenc_ctr
-        vmovdqu	128(%rsp), %xmm0
-        vmovdqu	L_avx2_aes_gcm_bswap_epi64(%rip), %xmm1
-        vpaddd	L_avx2_aes_gcm_one(%rip), %xmm0, %xmm9
-        vpshufb	%xmm1, %xmm0, %xmm8
-        vpaddd	L_avx2_aes_gcm_two(%rip), %xmm0, %xmm10
-        vpshufb	%xmm1, %xmm9, %xmm9
-        vpaddd	L_avx2_aes_gcm_three(%rip), %xmm0, %xmm11
-        vpshufb	%xmm1, %xmm10, %xmm10
-        vpaddd	L_avx2_aes_gcm_four(%rip), %xmm0, %xmm12
-        vpshufb	%xmm1, %xmm11, %xmm11
-        vpaddd	L_avx2_aes_gcm_five(%rip), %xmm0, %xmm13
-        vpshufb	%xmm1, %xmm12, %xmm12
-        vpaddd	L_avx2_aes_gcm_six(%rip), %xmm0, %xmm14
-        vpshufb	%xmm1, %xmm13, %xmm13
-        vpaddd	L_avx2_aes_gcm_seven(%rip), %xmm0, %xmm15
-        vpshufb	%xmm1, %xmm14, %xmm14
-        vpaddd	L_avx2_aes_gcm_eight(%rip), %xmm0, %xmm0
-        vpshufb	%xmm1, %xmm15, %xmm15
-        # aesenc_xor
-        vmovdqu	(%rdi), %xmm7
-        vmovdqu	%xmm0, 128(%rsp)
-        vpxor	%xmm7, %xmm8, %xmm8
-        vpxor	%xmm7, %xmm9, %xmm9
-        vpxor	%xmm7, %xmm10, %xmm10
-        vpxor	%xmm7, %xmm11, %xmm11
-        vpxor	%xmm7, %xmm12, %xmm12
-        vpxor	%xmm7, %xmm13, %xmm13
-        vpxor	%xmm7, %xmm14, %xmm14
-        vpxor	%xmm7, %xmm15, %xmm15
-        # aesenc_pclmul_1
-        vmovdqu	(%rcx), %xmm1
-        vmovdqu	16(%rdi), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vmovdqu	112(%rsp), %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm5
-        vpclmulqdq	$0x01, %xmm2, %xmm1, %xmm3
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm6
-        vpclmulqdq	$0x11, %xmm2, %xmm1, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_2
-        vmovdqu	16(%rcx), %xmm1
-        vmovdqu	96(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vmovdqu	32(%rdi), %xmm0
-        vpxor	%xmm1, %xmm7, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_n
-        vmovdqu	32(%rcx), %xmm1
-        vmovdqu	80(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vmovdqu	48(%rdi), %xmm0
-        vpxor	%xmm1, %xmm7, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_n
-        vmovdqu	48(%rcx), %xmm1
-        vmovdqu	64(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vmovdqu	64(%rdi), %xmm0
-        vpxor	%xmm1, %xmm7, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_n
-        vmovdqu	64(%rcx), %xmm1
-        vmovdqu	48(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vmovdqu	80(%rdi), %xmm0
-        vpxor	%xmm1, %xmm7, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_n
-        vmovdqu	80(%rcx), %xmm1
-        vmovdqu	32(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vmovdqu	96(%rdi), %xmm0
-        vpxor	%xmm1, %xmm7, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_n
-        vmovdqu	96(%rcx), %xmm1
-        vmovdqu	16(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vmovdqu	112(%rdi), %xmm0
-        vpxor	%xmm1, %xmm7, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_n
-        vmovdqu	112(%rcx), %xmm1
-        vmovdqu	(%rsp), %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vmovdqu	128(%rdi), %xmm0
-        vpxor	%xmm1, %xmm7, %xmm7
-        vaesenc	%xmm0, %xmm8, %xmm8
-        vaesenc	%xmm0, %xmm9, %xmm9
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	%xmm0, %xmm11, %xmm11
-        vaesenc	%xmm0, %xmm12, %xmm12
-        vaesenc	%xmm0, %xmm13, %xmm13
-        vaesenc	%xmm0, %xmm14, %xmm14
-        vaesenc	%xmm0, %xmm15, %xmm15
-        # aesenc_pclmul_l
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpslldq	$8, %xmm5, %xmm1
-        vpsrldq	$8, %xmm5, %xmm5
-        vmovdqu	144(%rdi), %xmm4
-        vmovdqu	L_avx2_aes_gcm_mod2_128(%rip), %xmm0
-        vaesenc	%xmm4, %xmm8, %xmm8
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpxor	%xmm5, %xmm7, %xmm7
-        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
-        vaesenc	%xmm4, %xmm9, %xmm9
-        vaesenc	%xmm4, %xmm10, %xmm10
-        vaesenc	%xmm4, %xmm11, %xmm11
-        vpshufd	$0x4e, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
-        vaesenc	%xmm4, %xmm12, %xmm12
-        vaesenc	%xmm4, %xmm13, %xmm13
-        vaesenc	%xmm4, %xmm14, %xmm14
-        vpshufd	$0x4e, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm7, %xmm6, %xmm6
-        vaesenc	%xmm4, %xmm15, %xmm15
-        cmpl	$11, %esi
-        vmovdqu	160(%rdi), %xmm7
-        jl	L_AES_GCM_decrypt_update_avx2_aesenc_128_ghash_avx_done
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	176(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        cmpl	$13, %esi
-        vmovdqu	192(%rdi), %xmm7
-        jl	L_AES_GCM_decrypt_update_avx2_aesenc_128_ghash_avx_done
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	208(%rdi), %xmm7
-        vaesenc	%xmm7, %xmm8, %xmm8
-        vaesenc	%xmm7, %xmm9, %xmm9
-        vaesenc	%xmm7, %xmm10, %xmm10
-        vaesenc	%xmm7, %xmm11, %xmm11
-        vaesenc	%xmm7, %xmm12, %xmm12
-        vaesenc	%xmm7, %xmm13, %xmm13
-        vaesenc	%xmm7, %xmm14, %xmm14
-        vaesenc	%xmm7, %xmm15, %xmm15
-        vmovdqu	224(%rdi), %xmm7
-L_AES_GCM_decrypt_update_avx2_aesenc_128_ghash_avx_done:
-        # aesenc_last
-        vaesenclast	%xmm7, %xmm8, %xmm8
-        vaesenclast	%xmm7, %xmm9, %xmm9
-        vaesenclast	%xmm7, %xmm10, %xmm10
-        vaesenclast	%xmm7, %xmm11, %xmm11
-        vmovdqu	(%rcx), %xmm0
-        vmovdqu	16(%rcx), %xmm1
-        vmovdqu	32(%rcx), %xmm2
-        vmovdqu	48(%rcx), %xmm3
-        vpxor	%xmm0, %xmm8, %xmm8
-        vpxor	%xmm1, %xmm9, %xmm9
-        vpxor	%xmm2, %xmm10, %xmm10
-        vpxor	%xmm3, %xmm11, %xmm11
-        vmovdqu	%xmm8, (%rdx)
-        vmovdqu	%xmm9, 16(%rdx)
-        vmovdqu	%xmm10, 32(%rdx)
-        vmovdqu	%xmm11, 48(%rdx)
-        vaesenclast	%xmm7, %xmm12, %xmm12
-        vaesenclast	%xmm7, %xmm13, %xmm13
-        vaesenclast	%xmm7, %xmm14, %xmm14
-        vaesenclast	%xmm7, %xmm15, %xmm15
-        vmovdqu	64(%rcx), %xmm0
-        vmovdqu	80(%rcx), %xmm1
-        vmovdqu	96(%rcx), %xmm2
-        vmovdqu	112(%rcx), %xmm3
-        vpxor	%xmm0, %xmm12, %xmm12
-        vpxor	%xmm1, %xmm13, %xmm13
-        vpxor	%xmm2, %xmm14, %xmm14
-        vpxor	%xmm3, %xmm15, %xmm15
-        vmovdqu	%xmm12, 64(%rdx)
-        vmovdqu	%xmm13, 80(%rdx)
-        vmovdqu	%xmm14, 96(%rdx)
-        vmovdqu	%xmm15, 112(%rdx)
-        # aesenc_128_ghash - end
-        addl	$0x80, %r14d
-        cmpl	%r13d, %r14d
-        jl	L_AES_GCM_decrypt_update_avx2_ghash_128
-        vmovdqu	(%rsp), %xmm5
-        vmovdqu	128(%rsp), %xmm4
-        vmovdqu	144(%rsp), %xmm15
-L_AES_GCM_decrypt_update_avx2_done_128:
-        cmpl	%r8d, %r14d
-        jge	L_AES_GCM_decrypt_update_avx2_done_dec
-        movl	%r8d, %r13d
-        andl	$0xfffffff0, %r13d
-        cmpl	%r13d, %r14d
-        jge	L_AES_GCM_decrypt_update_avx2_last_block_done
-L_AES_GCM_decrypt_update_avx2_last_block_start:
-        vmovdqu	(%r11,%r14,1), %xmm11
-        vpshufb	L_avx2_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm10
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm11, %xmm12
-        vpaddd	L_avx2_aes_gcm_one(%rip), %xmm4, %xmm4
-        vpxor	%xmm6, %xmm12, %xmm12
-        # aesenc_gfmul_sb
-        vpclmulqdq	$0x01, %xmm5, %xmm12, %xmm2
-        vpclmulqdq	$16, %xmm5, %xmm12, %xmm3
-        vpclmulqdq	$0x00, %xmm5, %xmm12, %xmm1
-        vpclmulqdq	$0x11, %xmm5, %xmm12, %xmm8
-        vpxor	(%rdi), %xmm10, %xmm10
-        vaesenc	16(%rdi), %xmm10, %xmm10
-        vpxor	%xmm2, %xmm3, %xmm3
-        vpslldq	$8, %xmm3, %xmm2
-        vpsrldq	$8, %xmm3, %xmm3
-        vaesenc	32(%rdi), %xmm10, %xmm10
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpclmulqdq	$16, L_avx2_aes_gcm_mod2_128(%rip), %xmm2, %xmm1
-        vaesenc	48(%rdi), %xmm10, %xmm10
-        vaesenc	64(%rdi), %xmm10, %xmm10
-        vaesenc	80(%rdi), %xmm10, %xmm10
-        vpshufd	$0x4e, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpclmulqdq	$16, L_avx2_aes_gcm_mod2_128(%rip), %xmm2, %xmm1
-        vaesenc	96(%rdi), %xmm10, %xmm10
-        vaesenc	112(%rdi), %xmm10, %xmm10
-        vaesenc	128(%rdi), %xmm10, %xmm10
-        vpshufd	$0x4e, %xmm2, %xmm2
-        vaesenc	144(%rdi), %xmm10, %xmm10
-        vpxor	%xmm3, %xmm8, %xmm8
-        vpxor	%xmm8, %xmm2, %xmm2
-        vmovdqu	160(%rdi), %xmm0
-        cmpl	$11, %esi
-        jl	L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	176(%rdi), %xmm10, %xmm10
-        vmovdqu	192(%rdi), %xmm0
-        cmpl	$13, %esi
-        jl	L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last
-        vaesenc	%xmm0, %xmm10, %xmm10
-        vaesenc	208(%rdi), %xmm10, %xmm10
-        vmovdqu	224(%rdi), %xmm0
-L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last:
-        vaesenclast	%xmm0, %xmm10, %xmm10
-        vpxor	%xmm1, %xmm2, %xmm6
-        vpxor	%xmm11, %xmm10, %xmm10
-        vmovdqu	%xmm10, (%r10,%r14,1)
-        addl	$16, %r14d
-        cmpl	%r13d, %r14d
-        jl	L_AES_GCM_decrypt_update_avx2_last_block_start
-L_AES_GCM_decrypt_update_avx2_last_block_done:
-L_AES_GCM_decrypt_update_avx2_done_dec:
-        vmovdqu	%xmm6, (%r9)
-        vmovdqu	%xmm4, (%r12)
-        vzeroupper
-        addq	$0xa8, %rsp
-        popq	%r14
-        popq	%r12
-        popq	%r13
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_decrypt_update_avx2,.-AES_GCM_decrypt_update_avx2
-#endif /* __APPLE__ */
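
AES_GCM_decrypt_update_avx2 never executes vaesdec: CTR decryption applies the same forward-AES keystream as encryption, and the GHASH folding reads the ciphertext (the loads through %rcx and (%r11,%r14,1)) before it is XORed into plaintext, which is the only structural difference from the encrypt path. The counter bookkeeping is GCM's inc32; a scalar sketch is below (gcm_inc32 is an illustrative name, not a wolfSSL function).

    #include <stdint.h>

    /* Scalar equivalent of the vpaddd L_avx2_aes_gcm_one..eight plus
     * vpshufb bswap_epi64 pairs above: inc32 bumps only the last 32 bits
     * of the counter block, treated as a big-endian integer. */
    void gcm_inc32(uint8_t ctr[16])
    {
        for (int i = 15; i >= 12; i--)
            if (++ctr[i] != 0)
                break;                  /* stop once a byte does not wrap */
    }
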
-#ifndef __APPLE__
-.text
-.globl	AES_GCM_decrypt_final_avx2
-.type	AES_GCM_decrypt_final_avx2,@function
-.align	16
-AES_GCM_decrypt_final_avx2:
-#else
-.section	__TEXT,__text
-.globl	_AES_GCM_decrypt_final_avx2
-.p2align	4
-_AES_GCM_decrypt_final_avx2:
-#endif /* __APPLE__ */
-        pushq	%r12
-        movq	16(%rsp), %rax
-        movq	24(%rsp), %r10
-        subq	$16, %rsp
-        vmovdqu	(%rdi), %xmm4
-        vmovdqu	(%r9), %xmm5
-        vmovdqu	(%rax), %xmm6
-        vpsrlq	$63, %xmm5, %xmm1
-        vpsllq	$0x01, %xmm5, %xmm0
-        vpslldq	$8, %xmm1, %xmm1
-        vpor	%xmm1, %xmm0, %xmm0
-        vpshufd	$0xff, %xmm5, %xmm5
-        vpsrad	$31, %xmm5, %xmm5
-        vpand	L_avx2_aes_gcm_mod2_128(%rip), %xmm5, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        # calc_tag
-        shlq	$3, %rcx
-        shlq	$3, %r8
-        vmovq	%rcx, %xmm0
-        vmovq	%r8, %xmm1
-        vpunpcklqdq	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm4, %xmm0, %xmm0
-        # ghash_gfmul_red
-        vpclmulqdq	$16, %xmm5, %xmm0, %xmm7
-        vpclmulqdq	$0x01, %xmm5, %xmm0, %xmm3
-        vpclmulqdq	$0x00, %xmm5, %xmm0, %xmm2
-        vpxor	%xmm3, %xmm7, %xmm7
-        vpslldq	$8, %xmm7, %xmm3
-        vpsrldq	$8, %xmm7, %xmm7
-        vpxor	%xmm2, %xmm3, %xmm3
-        vpclmulqdq	$0x11, %xmm5, %xmm0, %xmm0
-        vpclmulqdq	$16, L_avx2_aes_gcm_mod2_128(%rip), %xmm3, %xmm2
-        vpshufd	$0x4e, %xmm3, %xmm3
-        vpxor	%xmm2, %xmm3, %xmm3
-        vpclmulqdq	$16, L_avx2_aes_gcm_mod2_128(%rip), %xmm3, %xmm2
-        vpshufd	$0x4e, %xmm3, %xmm3
-        vpxor	%xmm7, %xmm0, %xmm0
-        vpxor	%xmm3, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vpshufb	L_avx2_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0
-        vpxor	%xmm6, %xmm0, %xmm0
-        # cmp_tag
-        cmpl	$16, %edx
-        je	L_AES_GCM_decrypt_final_avx2_cmp_tag_16
-        xorq	%r11, %r11
-        xorq	%r9, %r9
-        vmovdqu	%xmm0, (%rsp)
-L_AES_GCM_decrypt_final_avx2_cmp_tag_loop:
-        movzbl	(%rsp,%r11,1), %r12d
-        xorb	(%rsi,%r11,1), %r12b
-        orb	%r12b, %r9b
-        incl	%r11d
-        cmpl	%edx, %r11d
-        jne	L_AES_GCM_decrypt_final_avx2_cmp_tag_loop
-        cmpb	$0x00, %r9b
-        sete	%r9b
-        jmp	L_AES_GCM_decrypt_final_avx2_cmp_tag_done
-L_AES_GCM_decrypt_final_avx2_cmp_tag_16:
-        vmovdqu	(%rsi), %xmm1
-        vpcmpeqb	%xmm1, %xmm0, %xmm0
-        vpmovmskb	%xmm0, %r11
-        # if %r11d == 0xffff then return 1 else return 0
-        xorl	%r9d, %r9d
-        cmpl	$0xffff, %r11d
-        sete	%r9b
-L_AES_GCM_decrypt_final_avx2_cmp_tag_done:
-        movl	%r9d, (%r10)
-        vzeroupper
-        addq	$16, %rsp
-        popq	%r12
-        repz retq
-#ifndef __APPLE__
-.size	AES_GCM_decrypt_final_avx2,.-AES_GCM_decrypt_final_avx2
-#endif /* __APPLE__ */
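
AES_GCM_decrypt_final_avx2 verifies the tag in constant time: partial tags use the xorb/orb accumulate loop, and full 16-byte tags use vpcmpeqb plus vpmovmskb, testing the mask against 0xffff. An equivalent C sketch of the byte-loop path (gcm_tag_equal is an illustrative name, not the wolfSSL API):

    #include <stddef.h>
    #include <stdint.h>

    /* Accumulate-OR comparison: no data-dependent early exit, so timing
     * does not reveal which byte differed. Returns 1 on match, 0 otherwise. */
    int gcm_tag_equal(const uint8_t *calc, const uint8_t *given, size_t len)
    {
        uint8_t diff = 0;
        for (size_t i = 0; i < len; i++)
            diff |= (uint8_t)(calc[i] ^ given[i]);  /* xorb / orb */
        return diff == 0;                           /* cmpb / sete */
    }
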
-#endif /* WOLFSSL_AESGCM_STREAM */
-#endif /* HAVE_INTEL_AVX2 */
-#endif /* WOLFSSL_X86_64_BUILD */
-
-#if defined(__linux__) && defined(__ELF__)
-.section	.note.GNU-stack,"",%progbits
-#endif

+ 0 - 15423
lib/wolfssl/wolfcrypt/src/aes_gcm_asm.asm

@@ -1,15423 +0,0 @@
-; /* aes_gcm_asm
-;  *
-;  * Copyright (C) 2006-2023 wolfSSL Inc.
-;  *
-;  * This file is part of wolfSSL.
-;  *
-;  * wolfSSL is free software; you can redistribute it and/or modify
-;  * it under the terms of the GNU General Public License as published by
-;  * the Free Software Foundation; either version 2 of the License, or
-;  * (at your option) any later version.
-;  *
-;  * wolfSSL is distributed in the hope that it will be useful,
-;  * but WITHOUT ANY WARRANTY; without even the implied warranty of
-;  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;  * GNU General Public License for more details.
-;  *
-;  * You should have received a copy of the GNU General Public License
-;  * along with this program; if not, write to the Free Software
-;  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
-;  */
-IF @Version LT 1200
-; AVX2 instructions not recognized by old versions of MASM
-IFNDEF NO_AVX2_SUPPORT
-NO_AVX2_SUPPORT = 1
-ENDIF
-; MOVBE instruction not recognized by old versions of MASM
-IFNDEF NO_MOVBE_SUPPORT
-NO_MOVBE_SUPPORT = 1
-ENDIF
-ENDIF
-
-IFNDEF HAVE_INTEL_AVX1
-HAVE_INTEL_AVX1 = 1
-ENDIF
-IFNDEF NO_AVX2_SUPPORT
-HAVE_INTEL_AVX2 = 1
-ENDIF
-
-IFNDEF _WIN64
-_WIN64 = 1
-ENDIF
-
-_DATA SEGMENT
-ALIGN 16
-L_aes_gcm_one QWORD 0, 1
-ptr_L_aes_gcm_one QWORD L_aes_gcm_one
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_aes_gcm_two QWORD 0, 2
-ptr_L_aes_gcm_two QWORD L_aes_gcm_two
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_aes_gcm_three QWORD 0, 3
-ptr_L_aes_gcm_three QWORD L_aes_gcm_three
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_aes_gcm_four QWORD 0, 4
-ptr_L_aes_gcm_four QWORD L_aes_gcm_four
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_aes_gcm_five QWORD 0, 5
-ptr_L_aes_gcm_five QWORD L_aes_gcm_five
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_aes_gcm_six QWORD 0, 6
-ptr_L_aes_gcm_six QWORD L_aes_gcm_six
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_aes_gcm_seven QWORD 0, 7
-ptr_L_aes_gcm_seven QWORD L_aes_gcm_seven
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_aes_gcm_eight QWORD 0, 8
-ptr_L_aes_gcm_eight QWORD L_aes_gcm_eight
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_aes_gcm_bswap_epi64 QWORD 283686952306183, 579005069656919567
-ptr_L_aes_gcm_bswap_epi64 QWORD L_aes_gcm_bswap_epi64
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_aes_gcm_bswap_mask QWORD 579005069656919567, 283686952306183
-ptr_L_aes_gcm_bswap_mask QWORD L_aes_gcm_bswap_mask
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_aes_gcm_mod2_128 QWORD 1, 13979173243358019584
-ptr_L_aes_gcm_mod2_128 QWORD L_aes_gcm_mod2_128
-_DATA ENDS
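
The _DATA constants above are easier to read in hex: the counter increments place 1..8 in the upper qword, L_aes_gcm_bswap_epi64 is the pshufb mask with memory bytes 07..00 0f..08 (reversing the bytes inside each 64-bit half), L_aes_gcm_bswap_mask is the full 16-byte reversal, and L_aes_gcm_mod2_128 is { 1, 0xC200000000000000 }, the constant driving the two-step PCLMULQDQ reduction modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1. A standalone check of the shuffle-mask decimals:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t lo = 0, hi = 0;
        for (int i = 0; i < 8; i++) {
            lo = (lo << 8) | (uint64_t)i;        /* bytes 00..07 */
            hi = (hi << 8) | (uint64_t)(8 + i);  /* bytes 08..0f */
        }
        /* Prints 283686952306183 579005069656919567, the two QWORDs of
         * L_aes_gcm_bswap_epi64; swapping them gives L_aes_gcm_bswap_mask. */
        printf("%llu %llu\n", (unsigned long long)lo, (unsigned long long)hi);
        return 0;
    }
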
-_text SEGMENT READONLY PARA
-AES_GCM_encrypt PROC
-        push	r13
-        push	rdi
-        push	rsi
-        push	r12
-        push	rbx
-        push	r14
-        push	r15
-        mov	rdi, rcx
-        mov	rsi, rdx
-        mov	r12, r8
-        mov	rax, r9
-        mov	r8, QWORD PTR [rsp+96]
-        mov	r9d, DWORD PTR [rsp+104]
-        mov	r11d, DWORD PTR [rsp+112]
-        mov	ebx, DWORD PTR [rsp+120]
-        mov	r14d, DWORD PTR [rsp+128]
-        mov	r15, QWORD PTR [rsp+136]
-        mov	r10d, DWORD PTR [rsp+144]
-        sub	rsp, 160
-        pxor	xmm4, xmm4
-        pxor	xmm6, xmm6
-        cmp	ebx, 12
-        mov	edx, ebx
-        jne	L_AES_GCM_encrypt_iv_not_12
-        ; Calculate values when IV is 12 bytes
-        ; Set counter based on IV
-        mov	ecx, 16777216
-        pinsrq	xmm4, QWORD PTR [rax], 0
-        pinsrd	xmm4, DWORD PTR [rax+8], 2
-        pinsrd	xmm4, ecx, 3
-        ; H = Encrypt X(=0) and T = Encrypt counter
-        movdqa	xmm1, xmm4
-        movdqa	xmm5, OWORD PTR [r15]
-        pxor	xmm1, xmm5
-        movdqa	xmm7, OWORD PTR [r15+16]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+32]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+48]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+64]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+80]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+96]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+112]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+128]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+144]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        cmp	r10d, 11
-        movdqa	xmm7, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_calc_iv_12_last
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+176]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        cmp	r10d, 13
-        movdqa	xmm7, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_calc_iv_12_last
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+208]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_calc_iv_12_last:
-        aesenclast	xmm5, xmm7
-        aesenclast	xmm1, xmm7
-        pshufb	xmm5, OWORD PTR L_aes_gcm_bswap_mask
-        movdqu	[rsp+144], xmm1
-        jmp	L_AES_GCM_encrypt_iv_done
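
The branch above is the 12-byte-IV fast path: 16777216 is 0x01000000, which placed little-endian in the top dword reads as the big-endian counter value 1, so xmm4 holds J0 = IV || 0x00000001. H = E(K, 0^128) (xmm5) and the tag mask T = E(K, J0) (xmm1, spilled to [rsp+144]) are produced in one interleaved pass over the same round keys, with the cmp r10d, 11/13 ladder skipping the extra rounds on AES-128/192. In terms of the hypothetical helper above:

    uint8_t zero[16] = {0}, j0[16], h[16], t[16];
    memcpy(j0, iv, 12);
    memset(j0 + 12, 0, 3);
    j0[15] = 1;                          /* 0x01000000 LE == 1 BE  */
    aes_encrypt_block(ks, nr, zero, h);  /* H: the GHASH key       */
    aes_encrypt_block(ks, nr, j0, t);    /* T: masks the final tag */
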
-L_AES_GCM_encrypt_iv_not_12:
-        ; Calculate values when IV is not 12 bytes
-        ; H = Encrypt X(=0)
-        movdqa	xmm5, OWORD PTR [r15]
-        aesenc	xmm5, [r15+16]
-        aesenc	xmm5, [r15+32]
-        aesenc	xmm5, [r15+48]
-        aesenc	xmm5, [r15+64]
-        aesenc	xmm5, [r15+80]
-        aesenc	xmm5, [r15+96]
-        aesenc	xmm5, [r15+112]
-        aesenc	xmm5, [r15+128]
-        aesenc	xmm5, [r15+144]
-        cmp	r10d, 11
-        movdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_calc_iv_1_aesenc_avx_last
-        aesenc	xmm5, xmm9
-        aesenc	xmm5, [r15+176]
-        cmp	r10d, 13
-        movdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_calc_iv_1_aesenc_avx_last
-        aesenc	xmm5, xmm9
-        aesenc	xmm5, [r15+208]
-        movdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_calc_iv_1_aesenc_avx_last:
-        aesenclast	xmm5, xmm9
-        pshufb	xmm5, OWORD PTR L_aes_gcm_bswap_mask
-        ; Calc counter
-        ; Initialization vector
-        cmp	edx, 0
-        mov	rcx, 0
-        je	L_AES_GCM_encrypt_calc_iv_done
-        cmp	edx, 16
-        jl	L_AES_GCM_encrypt_calc_iv_lt16
-        and	edx, 4294967280
-L_AES_GCM_encrypt_calc_iv_16_loop:
-        movdqu	xmm8, [rax+rcx]
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm4, xmm8
-        pshufd	xmm1, xmm4, 78
-        pshufd	xmm2, xmm5, 78
-        movdqa	xmm3, xmm5
-        movdqa	xmm0, xmm5
-        pclmulqdq	xmm3, xmm4, 17
-        pclmulqdq	xmm0, xmm4, 0
-        pxor	xmm1, xmm4
-        pxor	xmm2, xmm5
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm7, xmm0
-        movdqa	xmm4, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm7, xmm2
-        pxor	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm4
-        psrld	xmm0, 31
-        psrld	xmm1, 31
-        pslld	xmm7, 1
-        pslld	xmm4, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12
-        pslldq	xmm1, 4
-        por	xmm4, xmm2
-        por	xmm7, xmm0
-        por	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm7
-        movdqa	xmm2, xmm7
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm7, xmm0
-        movdqa	xmm2, xmm7
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm7
-        pxor	xmm4, xmm2
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_encrypt_calc_iv_16_loop
-        mov	edx, ebx
-        cmp	ecx, edx
-        je	L_AES_GCM_encrypt_calc_iv_done
-L_AES_GCM_encrypt_calc_iv_lt16:
-        sub	rsp, 16
-        pxor	xmm8, xmm8
-        xor	ebx, ebx
-        movdqu	[rsp], xmm8
-L_AES_GCM_encrypt_calc_iv_loop:
-        movzx	r13d, BYTE PTR [rax+rcx]
-        mov	BYTE PTR [rsp+rbx], r13b
-        inc	ecx
-        inc	ebx
-        cmp	ecx, edx
-        jl	L_AES_GCM_encrypt_calc_iv_loop
-        movdqu	xmm8, [rsp]
-        add	rsp, 16
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm4, xmm8
-        pshufd	xmm1, xmm4, 78
-        pshufd	xmm2, xmm5, 78
-        movdqa	xmm3, xmm5
-        movdqa	xmm0, xmm5
-        pclmulqdq	xmm3, xmm4, 17
-        pclmulqdq	xmm0, xmm4, 0
-        pxor	xmm1, xmm4
-        pxor	xmm2, xmm5
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm7, xmm0
-        movdqa	xmm4, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm7, xmm2
-        pxor	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm4
-        psrld	xmm0, 31
-        psrld	xmm1, 31
-        pslld	xmm7, 1
-        pslld	xmm4, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12
-        pslldq	xmm1, 4
-        por	xmm4, xmm2
-        por	xmm7, xmm0
-        por	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm7
-        movdqa	xmm2, xmm7
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm7, xmm0
-        movdqa	xmm2, xmm7
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm7
-        pxor	xmm4, xmm2
-L_AES_GCM_encrypt_calc_iv_done:
-        ; T = Encrypt counter
-        pxor	xmm0, xmm0
-        shl	edx, 3
-        pinsrq	xmm0, rdx, 0
-        pxor	xmm4, xmm0
-        pshufd	xmm1, xmm4, 78
-        pshufd	xmm2, xmm5, 78
-        movdqa	xmm3, xmm5
-        movdqa	xmm0, xmm5
-        pclmulqdq	xmm3, xmm4, 17
-        pclmulqdq	xmm0, xmm4, 0
-        pxor	xmm1, xmm4
-        pxor	xmm2, xmm5
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm7, xmm0
-        movdqa	xmm4, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm7, xmm2
-        pxor	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm4
-        psrld	xmm0, 31
-        psrld	xmm1, 31
-        pslld	xmm7, 1
-        pslld	xmm4, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12
-        pslldq	xmm1, 4
-        por	xmm4, xmm2
-        por	xmm7, xmm0
-        por	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm7
-        movdqa	xmm2, xmm7
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm7, xmm0
-        movdqa	xmm2, xmm7
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm7
-        pxor	xmm4, xmm2
-        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_mask
-        ;   Encrypt counter
-        movdqa	xmm8, OWORD PTR [r15]
-        pxor	xmm8, xmm4
-        aesenc	xmm8, [r15+16]
-        aesenc	xmm8, [r15+32]
-        aesenc	xmm8, [r15+48]
-        aesenc	xmm8, [r15+64]
-        aesenc	xmm8, [r15+80]
-        aesenc	xmm8, [r15+96]
-        aesenc	xmm8, [r15+112]
-        aesenc	xmm8, [r15+128]
-        aesenc	xmm8, [r15+144]
-        cmp	r10d, 11
-        movdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_calc_iv_2_aesenc_avx_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [r15+176]
-        cmp	r10d, 13
-        movdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_calc_iv_2_aesenc_avx_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [r15+208]
-        movdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_calc_iv_2_aesenc_avx_last:
-        aesenclast	xmm8, xmm9
-        movdqu	[rsp+144], xmm8
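
When the IV is not 12 bytes, the code above instead derives J0 = GHASH_H(IV || 0-pad || [bitlen(IV)]_64): each cluster of pclmulqdq plus the pslld 31/30/25 and psrld 1/2/7 shifts is one Karatsuba carry-less multiply followed by reduction modulo x^128 + x^7 + x^2 + x + 1, GHASH's bit-reflected field. A minimal bitwise reference for that multiply, per NIST SP 800-38D section 6.3 (a sketch, not the wolfSSL code path):

    #include <stdint.h>
    #include <string.h>

    /* Z = X * Y in GF(2^128) as used by GHASH.  Bit 0 of byte 0 is the
       most significant bit of the field element. */
    static void gf128_mul(uint8_t z[16], const uint8_t x[16],
                          const uint8_t y[16])
    {
        uint8_t v[16], r[16] = {0};
        memcpy(v, y, 16);
        for (int i = 0; i < 128; i++) {
            if (x[i / 8] & (0x80 >> (i % 8)))    /* bit i of X set?    */
                for (int j = 0; j < 16; j++)
                    r[j] ^= v[j];
            int lsb = v[15] & 1;                 /* V >>= 1, then fold */
            for (int j = 15; j > 0; j--)
                v[j] = (uint8_t)((v[j] >> 1) | (v[j - 1] << 7));
            v[0] >>= 1;
            if (lsb)
                v[0] ^= 0xe1;                    /* R = 0xe1 || 0^120  */
        }
        memcpy(z, r, 16);
    }

One GHASH update is then y = gf128_mul(y ^ block, h), which is what the 16-byte loops above compute after byte-reversing each block with L_aes_gcm_bswap_mask.
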
-L_AES_GCM_encrypt_iv_done:
-        ; Additional authentication data
-        mov	edx, r11d
-        cmp	edx, 0
-        je	L_AES_GCM_encrypt_calc_aad_done
-        xor	ecx, ecx
-        cmp	edx, 16
-        jl	L_AES_GCM_encrypt_calc_aad_lt16
-        and	edx, 4294967280
-L_AES_GCM_encrypt_calc_aad_16_loop:
-        movdqu	xmm8, [r12+rcx]
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm6, xmm8
-        pshufd	xmm1, xmm6, 78
-        pshufd	xmm2, xmm5, 78
-        movdqa	xmm3, xmm5
-        movdqa	xmm0, xmm5
-        pclmulqdq	xmm3, xmm6, 17
-        pclmulqdq	xmm0, xmm6, 0
-        pxor	xmm1, xmm6
-        pxor	xmm2, xmm5
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm7, xmm0
-        movdqa	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm7, xmm2
-        pxor	xmm6, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm6
-        psrld	xmm0, 31
-        psrld	xmm1, 31
-        pslld	xmm7, 1
-        pslld	xmm6, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12
-        pslldq	xmm1, 4
-        por	xmm6, xmm2
-        por	xmm7, xmm0
-        por	xmm6, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm7
-        movdqa	xmm2, xmm7
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm7, xmm0
-        movdqa	xmm2, xmm7
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm7
-        pxor	xmm6, xmm2
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_encrypt_calc_aad_16_loop
-        mov	edx, r11d
-        cmp	ecx, edx
-        je	L_AES_GCM_encrypt_calc_aad_done
-L_AES_GCM_encrypt_calc_aad_lt16:
-        sub	rsp, 16
-        pxor	xmm8, xmm8
-        xor	ebx, ebx
-        movdqu	[rsp], xmm8
-L_AES_GCM_encrypt_calc_aad_loop:
-        movzx	r13d, BYTE PTR [r12+rcx]
-        mov	BYTE PTR [rsp+rbx], r13b
-        inc	ecx
-        inc	ebx
-        cmp	ecx, edx
-        jl	L_AES_GCM_encrypt_calc_aad_loop
-        movdqu	xmm8, [rsp]
-        add	rsp, 16
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm6, xmm8
-        pshufd	xmm1, xmm6, 78
-        pshufd	xmm2, xmm5, 78
-        movdqa	xmm3, xmm5
-        movdqa	xmm0, xmm5
-        pclmulqdq	xmm3, xmm6, 17
-        pclmulqdq	xmm0, xmm6, 0
-        pxor	xmm1, xmm6
-        pxor	xmm2, xmm5
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm7, xmm0
-        movdqa	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm7, xmm2
-        pxor	xmm6, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm6
-        psrld	xmm0, 31
-        psrld	xmm1, 31
-        pslld	xmm7, 1
-        pslld	xmm6, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12
-        pslldq	xmm1, 4
-        por	xmm6, xmm2
-        por	xmm7, xmm0
-        por	xmm6, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm7
-        movdqa	xmm2, xmm7
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm7, xmm0
-        movdqa	xmm2, xmm7
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm7
-        pxor	xmm6, xmm2
-L_AES_GCM_encrypt_calc_aad_done:
-        ; Calculate counter and H
-        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_epi64
-        movdqa	xmm9, xmm5
-        paddd	xmm4, OWORD PTR L_aes_gcm_one
-        movdqa	xmm8, xmm5
-        movdqu	[rsp+128], xmm4
-        psrlq	xmm9, 63
-        psllq	xmm8, 1
-        pslldq	xmm9, 8
-        por	xmm8, xmm9
-        pshufd	xmm5, xmm5, 255
-        psrad	xmm5, 31
-        pand	xmm5, OWORD PTR L_aes_gcm_mod2_128
-        pxor	xmm5, xmm8
-        xor	rbx, rbx
-        cmp	r9d, 128
-        mov	r13d, r9d
-        jl	L_AES_GCM_encrypt_done_128
-        and	r13d, 4294967168
-        movdqa	xmm2, xmm6
-        ; H ^ 1
-        movdqu	[rsp], xmm5
-        ; H ^ 2
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm5, 78
-        movdqa	xmm11, xmm5
-        movdqa	xmm8, xmm5
-        pclmulqdq	xmm11, xmm5, 17
-        pclmulqdq	xmm8, xmm5, 0
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm5
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm0, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm0, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm0, xmm14
-        movdqu	[rsp+16], xmm0
-        ; H ^ 3
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm0, 78
-        movdqa	xmm11, xmm0
-        movdqa	xmm8, xmm0
-        pclmulqdq	xmm11, xmm5, 17
-        pclmulqdq	xmm8, xmm5, 0
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm0
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm1, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm1, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm1, xmm14
-        movdqu	[rsp+32], xmm1
-        ; H ^ 4
-        pshufd	xmm9, xmm0, 78
-        pshufd	xmm10, xmm0, 78
-        movdqa	xmm11, xmm0
-        movdqa	xmm8, xmm0
-        pclmulqdq	xmm11, xmm0, 17
-        pclmulqdq	xmm8, xmm0, 0
-        pxor	xmm9, xmm0
-        pxor	xmm10, xmm0
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm3, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm3, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm3, xmm14
-        movdqu	[rsp+48], xmm3
-        ; H ^ 5
-        pshufd	xmm9, xmm0, 78
-        pshufd	xmm10, xmm1, 78
-        movdqa	xmm11, xmm1
-        movdqa	xmm8, xmm1
-        pclmulqdq	xmm11, xmm0, 17
-        pclmulqdq	xmm8, xmm0, 0
-        pxor	xmm9, xmm0
-        pxor	xmm10, xmm1
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+64], xmm7
-        ; H ^ 6
-        pshufd	xmm9, xmm1, 78
-        pshufd	xmm10, xmm1, 78
-        movdqa	xmm11, xmm1
-        movdqa	xmm8, xmm1
-        pclmulqdq	xmm11, xmm1, 17
-        pclmulqdq	xmm8, xmm1, 0
-        pxor	xmm9, xmm1
-        pxor	xmm10, xmm1
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+80], xmm7
-        ; H ^ 7
-        pshufd	xmm9, xmm1, 78
-        pshufd	xmm10, xmm3, 78
-        movdqa	xmm11, xmm3
-        movdqa	xmm8, xmm3
-        pclmulqdq	xmm11, xmm1, 17
-        pclmulqdq	xmm8, xmm1, 0
-        pxor	xmm9, xmm1
-        pxor	xmm10, xmm3
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+96], xmm7
-        ; H ^ 8
-        pshufd	xmm9, xmm3, 78
-        pshufd	xmm10, xmm3, 78
-        movdqa	xmm11, xmm3
-        movdqa	xmm8, xmm3
-        pclmulqdq	xmm11, xmm3, 17
-        pclmulqdq	xmm8, xmm3, 0
-        pxor	xmm9, xmm3
-        pxor	xmm10, xmm3
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+112], xmm7
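
The table stored above ([rsp] through [rsp+112], holding H^1 through H^8) enables 8-way aggregated GHASH: rather than eight dependent multiply-reduce steps per 128 bytes, the main loop computes Y' = (Y ^ C1)*H^8 ^ C2*H^7 ^ ... ^ C8*H^1 with independent multiplies and a single reduction, interleaved into the aesenc rounds of the next eight counter blocks (it hashes the previous iteration's ciphertext at [rdx-128] while encrypting the current one, a one-iteration software pipeline). A sketch using gf128_mul() from above, with hpow[i] = H^(8-i):

    /* One aggregated GHASH step over eight 16-byte ciphertext blocks. */
    static void ghash8(const uint8_t hpow[8][16],   /* H^8 ... H^1 */
                       const uint8_t c[8][16], uint8_t y[16])
    {
        uint8_t acc[16] = {0}, t[16], x[16];
        for (int i = 0; i < 8; i++) {
            memcpy(x, c[i], 16);
            if (i == 0)                   /* fold running hash into C1 */
                for (int k = 0; k < 16; k++)
                    x[k] ^= y[k];
            gf128_mul(t, x, hpow[i]);     /* independent multiplies    */
            for (int k = 0; k < 16; k++)
                acc[k] ^= t[k];
        }
        memcpy(y, acc, 16);
    }
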
-        ; First 128 bytes of input
-        movdqu	xmm8, [rsp+128]
-        movdqa	xmm1, OWORD PTR L_aes_gcm_bswap_epi64
-        movdqa	xmm0, xmm8
-        pshufb	xmm8, xmm1
-        movdqa	xmm9, xmm0
-        paddd	xmm9, OWORD PTR L_aes_gcm_one
-        pshufb	xmm9, xmm1
-        movdqa	xmm10, xmm0
-        paddd	xmm10, OWORD PTR L_aes_gcm_two
-        pshufb	xmm10, xmm1
-        movdqa	xmm11, xmm0
-        paddd	xmm11, OWORD PTR L_aes_gcm_three
-        pshufb	xmm11, xmm1
-        movdqa	xmm12, xmm0
-        paddd	xmm12, OWORD PTR L_aes_gcm_four
-        pshufb	xmm12, xmm1
-        movdqa	xmm13, xmm0
-        paddd	xmm13, OWORD PTR L_aes_gcm_five
-        pshufb	xmm13, xmm1
-        movdqa	xmm14, xmm0
-        paddd	xmm14, OWORD PTR L_aes_gcm_six
-        pshufb	xmm14, xmm1
-        movdqa	xmm15, xmm0
-        paddd	xmm15, OWORD PTR L_aes_gcm_seven
-        pshufb	xmm15, xmm1
-        paddd	xmm0, OWORD PTR L_aes_gcm_eight
-        movdqa	xmm7, OWORD PTR [r15]
-        movdqu	[rsp+128], xmm0
-        pxor	xmm8, xmm7
-        pxor	xmm9, xmm7
-        pxor	xmm10, xmm7
-        pxor	xmm11, xmm7
-        pxor	xmm12, xmm7
-        pxor	xmm13, xmm7
-        pxor	xmm14, xmm7
-        pxor	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+16]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+32]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+48]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+64]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+80]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+96]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+112]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+128]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+144]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        cmp	r10d, 11
-        movdqa	xmm7, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_enc_done
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+176]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        cmp	r10d, 13
-        movdqa	xmm7, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_enc_done
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+208]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_enc_done:
-        aesenclast	xmm8, xmm7
-        aesenclast	xmm9, xmm7
-        movdqu	xmm0, [rdi]
-        movdqu	xmm1, [rdi+16]
-        pxor	xmm8, xmm0
-        pxor	xmm9, xmm1
-        movdqu	[rsi], xmm8
-        movdqu	[rsi+16], xmm9
-        aesenclast	xmm10, xmm7
-        aesenclast	xmm11, xmm7
-        movdqu	xmm0, [rdi+32]
-        movdqu	xmm1, [rdi+48]
-        pxor	xmm10, xmm0
-        pxor	xmm11, xmm1
-        movdqu	[rsi+32], xmm10
-        movdqu	[rsi+48], xmm11
-        aesenclast	xmm12, xmm7
-        aesenclast	xmm13, xmm7
-        movdqu	xmm0, [rdi+64]
-        movdqu	xmm1, [rdi+80]
-        pxor	xmm12, xmm0
-        pxor	xmm13, xmm1
-        movdqu	[rsi+64], xmm12
-        movdqu	[rsi+80], xmm13
-        aesenclast	xmm14, xmm7
-        aesenclast	xmm15, xmm7
-        movdqu	xmm0, [rdi+96]
-        movdqu	xmm1, [rdi+112]
-        pxor	xmm14, xmm0
-        pxor	xmm15, xmm1
-        movdqu	[rsi+96], xmm14
-        movdqu	[rsi+112], xmm15
-        cmp	r13d, 128
-        mov	ebx, 128
-        jle	L_AES_GCM_encrypt_end_128
-        ; More 128 bytes of input
-L_AES_GCM_encrypt_ghash_128:
-        lea	rcx, QWORD PTR [rdi+rbx]
-        lea	rdx, QWORD PTR [rsi+rbx]
-        movdqu	xmm8, [rsp+128]
-        movdqa	xmm1, OWORD PTR L_aes_gcm_bswap_epi64
-        movdqa	xmm0, xmm8
-        pshufb	xmm8, xmm1
-        movdqa	xmm9, xmm0
-        paddd	xmm9, OWORD PTR L_aes_gcm_one
-        pshufb	xmm9, xmm1
-        movdqa	xmm10, xmm0
-        paddd	xmm10, OWORD PTR L_aes_gcm_two
-        pshufb	xmm10, xmm1
-        movdqa	xmm11, xmm0
-        paddd	xmm11, OWORD PTR L_aes_gcm_three
-        pshufb	xmm11, xmm1
-        movdqa	xmm12, xmm0
-        paddd	xmm12, OWORD PTR L_aes_gcm_four
-        pshufb	xmm12, xmm1
-        movdqa	xmm13, xmm0
-        paddd	xmm13, OWORD PTR L_aes_gcm_five
-        pshufb	xmm13, xmm1
-        movdqa	xmm14, xmm0
-        paddd	xmm14, OWORD PTR L_aes_gcm_six
-        pshufb	xmm14, xmm1
-        movdqa	xmm15, xmm0
-        paddd	xmm15, OWORD PTR L_aes_gcm_seven
-        pshufb	xmm15, xmm1
-        paddd	xmm0, OWORD PTR L_aes_gcm_eight
-        movdqa	xmm7, OWORD PTR [r15]
-        movdqu	[rsp+128], xmm0
-        pxor	xmm8, xmm7
-        pxor	xmm9, xmm7
-        pxor	xmm10, xmm7
-        pxor	xmm11, xmm7
-        pxor	xmm12, xmm7
-        pxor	xmm13, xmm7
-        pxor	xmm14, xmm7
-        pxor	xmm15, xmm7
-        movdqu	xmm7, [rsp+112]
-        movdqu	xmm0, [rdx+-128]
-        aesenc	xmm8, [r15+16]
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm0, xmm2
-        pshufd	xmm1, xmm7, 78
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm1, xmm7
-        pxor	xmm5, xmm0
-        movdqa	xmm3, xmm0
-        pclmulqdq	xmm3, xmm7, 17
-        aesenc	xmm9, [r15+16]
-        aesenc	xmm10, [r15+16]
-        movdqa	xmm2, xmm0
-        pclmulqdq	xmm2, xmm7, 0
-        aesenc	xmm11, [r15+16]
-        aesenc	xmm12, [r15+16]
-        pclmulqdq	xmm1, xmm5, 0
-        aesenc	xmm13, [r15+16]
-        aesenc	xmm14, [r15+16]
-        aesenc	xmm15, [r15+16]
-        pxor	xmm1, xmm2
-        pxor	xmm1, xmm3
-        movdqu	xmm7, [rsp+96]
-        movdqu	xmm0, [rdx+-112]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [r15+32]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [r15+32]
-        aesenc	xmm10, [r15+32]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [r15+32]
-        aesenc	xmm12, [r15+32]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [r15+32]
-        aesenc	xmm14, [r15+32]
-        aesenc	xmm15, [r15+32]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+80]
-        movdqu	xmm0, [rdx+-96]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [r15+48]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [r15+48]
-        aesenc	xmm10, [r15+48]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [r15+48]
-        aesenc	xmm12, [r15+48]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [r15+48]
-        aesenc	xmm14, [r15+48]
-        aesenc	xmm15, [r15+48]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+64]
-        movdqu	xmm0, [rdx+-80]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [r15+64]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [r15+64]
-        aesenc	xmm10, [r15+64]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [r15+64]
-        aesenc	xmm12, [r15+64]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [r15+64]
-        aesenc	xmm14, [r15+64]
-        aesenc	xmm15, [r15+64]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+48]
-        movdqu	xmm0, [rdx+-64]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [r15+80]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [r15+80]
-        aesenc	xmm10, [r15+80]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [r15+80]
-        aesenc	xmm12, [r15+80]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [r15+80]
-        aesenc	xmm14, [r15+80]
-        aesenc	xmm15, [r15+80]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+32]
-        movdqu	xmm0, [rdx+-48]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [r15+96]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [r15+96]
-        aesenc	xmm10, [r15+96]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [r15+96]
-        aesenc	xmm12, [r15+96]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [r15+96]
-        aesenc	xmm14, [r15+96]
-        aesenc	xmm15, [r15+96]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+16]
-        movdqu	xmm0, [rdx+-32]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [r15+112]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [r15+112]
-        aesenc	xmm10, [r15+112]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [r15+112]
-        aesenc	xmm12, [r15+112]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [r15+112]
-        aesenc	xmm14, [r15+112]
-        aesenc	xmm15, [r15+112]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp]
-        movdqu	xmm0, [rdx+-16]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [r15+128]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [r15+128]
-        aesenc	xmm10, [r15+128]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [r15+128]
-        aesenc	xmm12, [r15+128]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [r15+128]
-        aesenc	xmm14, [r15+128]
-        aesenc	xmm15, [r15+128]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqa	xmm5, xmm1
-        psrldq	xmm1, 8
-        pslldq	xmm5, 8
-        aesenc	xmm8, [r15+144]
-        pxor	xmm2, xmm5
-        pxor	xmm3, xmm1
-        movdqa	xmm7, xmm2
-        movdqa	xmm4, xmm2
-        movdqa	xmm5, xmm2
-        aesenc	xmm9, [r15+144]
-        pslld	xmm7, 31
-        pslld	xmm4, 30
-        pslld	xmm5, 25
-        aesenc	xmm10, [r15+144]
-        pxor	xmm7, xmm4
-        pxor	xmm7, xmm5
-        aesenc	xmm11, [r15+144]
-        movdqa	xmm4, xmm7
-        pslldq	xmm7, 12
-        psrldq	xmm4, 4
-        aesenc	xmm12, [r15+144]
-        pxor	xmm2, xmm7
-        movdqa	xmm5, xmm2
-        movdqa	xmm1, xmm2
-        movdqa	xmm0, xmm2
-        aesenc	xmm13, [r15+144]
-        psrld	xmm5, 1
-        psrld	xmm1, 2
-        psrld	xmm0, 7
-        aesenc	xmm14, [r15+144]
-        pxor	xmm5, xmm1
-        pxor	xmm5, xmm0
-        aesenc	xmm15, [r15+144]
-        pxor	xmm5, xmm4
-        pxor	xmm2, xmm5
-        pxor	xmm2, xmm3
-        cmp	r10d, 11
-        movdqa	xmm7, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_aesenc_128_ghash_avx_done
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+176]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        cmp	r10d, 13
-        movdqa	xmm7, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_aesenc_128_ghash_avx_done
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+208]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_aesenc_128_ghash_avx_done:
-        aesenclast	xmm8, xmm7
-        aesenclast	xmm9, xmm7
-        movdqu	xmm0, [rcx]
-        movdqu	xmm1, [rcx+16]
-        pxor	xmm8, xmm0
-        pxor	xmm9, xmm1
-        movdqu	[rdx], xmm8
-        movdqu	[rdx+16], xmm9
-        aesenclast	xmm10, xmm7
-        aesenclast	xmm11, xmm7
-        movdqu	xmm0, [rcx+32]
-        movdqu	xmm1, [rcx+48]
-        pxor	xmm10, xmm0
-        pxor	xmm11, xmm1
-        movdqu	[rdx+32], xmm10
-        movdqu	[rdx+48], xmm11
-        aesenclast	xmm12, xmm7
-        aesenclast	xmm13, xmm7
-        movdqu	xmm0, [rcx+64]
-        movdqu	xmm1, [rcx+80]
-        pxor	xmm12, xmm0
-        pxor	xmm13, xmm1
-        movdqu	[rdx+64], xmm12
-        movdqu	[rdx+80], xmm13
-        aesenclast	xmm14, xmm7
-        aesenclast	xmm15, xmm7
-        movdqu	xmm0, [rcx+96]
-        movdqu	xmm1, [rcx+112]
-        pxor	xmm14, xmm0
-        pxor	xmm15, xmm1
-        movdqu	[rdx+96], xmm14
-        movdqu	[rdx+112], xmm15
-        add	ebx, 128
-        cmp	ebx, r13d
-        jl	L_AES_GCM_encrypt_ghash_128
-L_AES_GCM_encrypt_end_128:
-        movdqa	xmm4, OWORD PTR L_aes_gcm_bswap_mask
-        pshufb	xmm8, xmm4
-        pshufb	xmm9, xmm4
-        pshufb	xmm10, xmm4
-        pshufb	xmm11, xmm4
-        pxor	xmm8, xmm2
-        pshufb	xmm12, xmm4
-        pshufb	xmm13, xmm4
-        pshufb	xmm14, xmm4
-        pshufb	xmm15, xmm4
-        movdqu	xmm7, [rsp+112]
-        pshufd	xmm1, xmm8, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm8, 17
-        pclmulqdq	xmm0, xmm8, 0
-        pxor	xmm1, xmm8
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm4, xmm0
-        movdqa	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqu	xmm7, [rsp+96]
-        pshufd	xmm1, xmm9, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm9, 17
-        pclmulqdq	xmm0, xmm9, 0
-        pxor	xmm1, xmm9
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        pxor	xmm4, xmm0
-        pxor	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqu	xmm7, [rsp+80]
-        pshufd	xmm1, xmm10, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm10, 17
-        pclmulqdq	xmm0, xmm10, 0
-        pxor	xmm1, xmm10
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        pxor	xmm4, xmm0
-        pxor	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqu	xmm7, [rsp+64]
-        pshufd	xmm1, xmm11, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm11, 17
-        pclmulqdq	xmm0, xmm11, 0
-        pxor	xmm1, xmm11
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        pxor	xmm4, xmm0
-        pxor	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqu	xmm7, [rsp+48]
-        pshufd	xmm1, xmm12, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm12, 17
-        pclmulqdq	xmm0, xmm12, 0
-        pxor	xmm1, xmm12
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        pxor	xmm4, xmm0
-        pxor	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqu	xmm7, [rsp+32]
-        pshufd	xmm1, xmm13, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm13, 17
-        pclmulqdq	xmm0, xmm13, 0
-        pxor	xmm1, xmm13
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        pxor	xmm4, xmm0
-        pxor	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqu	xmm7, [rsp+16]
-        pshufd	xmm1, xmm14, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm14, 17
-        pclmulqdq	xmm0, xmm14, 0
-        pxor	xmm1, xmm14
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        pxor	xmm4, xmm0
-        pxor	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqu	xmm7, [rsp]
-        pshufd	xmm1, xmm15, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm15, 17
-        pclmulqdq	xmm0, xmm15, 0
-        pxor	xmm1, xmm15
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        pxor	xmm4, xmm0
-        pxor	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqa	xmm0, xmm4
-        movdqa	xmm1, xmm4
-        movdqa	xmm2, xmm4
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm4, xmm0
-        movdqa	xmm2, xmm4
-        movdqa	xmm3, xmm4
-        movdqa	xmm0, xmm4
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm4
-        pxor	xmm6, xmm2
-        movdqu	xmm5, [rsp]
-L_AES_GCM_encrypt_done_128:
-        mov	edx, r9d
-        cmp	ebx, edx
-        jge	L_AES_GCM_encrypt_done_enc
-        mov	r13d, r9d
-        and	r13d, 4294967280
-        cmp	ebx, r13d
-        jge	L_AES_GCM_encrypt_last_block_done
-        lea	rcx, QWORD PTR [rdi+rbx]
-        lea	rdx, QWORD PTR [rsi+rbx]
-        movdqu	xmm8, [rsp+128]
-        movdqa	xmm9, xmm8
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_epi64
-        paddd	xmm9, OWORD PTR L_aes_gcm_one
-        pxor	xmm8, [r15]
-        movdqu	[rsp+128], xmm9
-        aesenc	xmm8, [r15+16]
-        aesenc	xmm8, [r15+32]
-        aesenc	xmm8, [r15+48]
-        aesenc	xmm8, [r15+64]
-        aesenc	xmm8, [r15+80]
-        aesenc	xmm8, [r15+96]
-        aesenc	xmm8, [r15+112]
-        aesenc	xmm8, [r15+128]
-        aesenc	xmm8, [r15+144]
-        cmp	r10d, 11
-        movdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_aesenc_block_aesenc_avx_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [r15+176]
-        cmp	r10d, 13
-        movdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_aesenc_block_aesenc_avx_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [r15+208]
-        movdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_aesenc_block_aesenc_avx_last:
-        aesenclast	xmm8, xmm9
-        movdqu	xmm9, [rcx]
-        pxor	xmm8, xmm9
-        movdqu	[rdx], xmm8
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm6, xmm8
-        add	ebx, 16
-        cmp	ebx, r13d
-        jge	L_AES_GCM_encrypt_last_block_ghash
-L_AES_GCM_encrypt_last_block_start:
-        lea	rcx, QWORD PTR [rdi+rbx]
-        lea	rdx, QWORD PTR [rsi+rbx]
-        movdqu	xmm8, [rsp+128]
-        movdqa	xmm9, xmm8
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_epi64
-        paddd	xmm9, OWORD PTR L_aes_gcm_one
-        pxor	xmm8, [r15]
-        movdqu	[rsp+128], xmm9
-        movdqa	xmm10, xmm6
-        pclmulqdq	xmm10, xmm5, 16
-        aesenc	xmm8, [r15+16]
-        aesenc	xmm8, [r15+32]
-        movdqa	xmm11, xmm6
-        pclmulqdq	xmm11, xmm5, 1
-        aesenc	xmm8, [r15+48]
-        aesenc	xmm8, [r15+64]
-        movdqa	xmm12, xmm6
-        pclmulqdq	xmm12, xmm5, 0
-        aesenc	xmm8, [r15+80]
-        movdqa	xmm1, xmm6
-        pclmulqdq	xmm1, xmm5, 17
-        aesenc	xmm8, [r15+96]
-        pxor	xmm10, xmm11
-        movdqa	xmm2, xmm10
-        psrldq	xmm10, 8
-        pslldq	xmm2, 8
-        aesenc	xmm8, [r15+112]
-        movdqa	xmm3, xmm1
-        pxor	xmm2, xmm12
-        pxor	xmm3, xmm10
-        movdqa	xmm0, OWORD PTR L_aes_gcm_mod2_128
-        movdqa	xmm11, xmm2
-        pclmulqdq	xmm11, xmm0, 16
-        aesenc	xmm8, [r15+128]
-        pshufd	xmm10, xmm2, 78
-        pxor	xmm10, xmm11
-        movdqa	xmm11, xmm10
-        pclmulqdq	xmm11, xmm0, 16
-        aesenc	xmm8, [r15+144]
-        pshufd	xmm6, xmm10, 78
-        pxor	xmm6, xmm11
-        pxor	xmm6, xmm3
-        cmp	r10d, 11
-        movdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_aesenc_gfmul_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [r15+176]
-        cmp	r10d, 13
-        movdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_aesenc_gfmul_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [r15+208]
-        movdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_aesenc_gfmul_last:
-        aesenclast	xmm8, xmm9
-        movdqu	xmm9, [rcx]
-        pxor	xmm8, xmm9
-        movdqu	[rdx], xmm8
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm6, xmm8
-        add	ebx, 16
-        cmp	ebx, r13d
-        jl	L_AES_GCM_encrypt_last_block_start
-L_AES_GCM_encrypt_last_block_ghash:
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm6, 78
-        movdqa	xmm11, xmm6
-        movdqa	xmm8, xmm6
-        pclmulqdq	xmm11, xmm5, 17
-        pclmulqdq	xmm8, xmm5, 0
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm6
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm6, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm6, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm6, xmm14
-L_AES_GCM_encrypt_last_block_done:
-        mov	ecx, r9d
-        mov	edx, ecx
-        and	ecx, 15
-        jz	L_AES_GCM_encrypt_aesenc_last15_enc_avx_done
-        movdqu	xmm4, [rsp+128]
-        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_epi64
-        pxor	xmm4, [r15]
-        aesenc	xmm4, [r15+16]
-        aesenc	xmm4, [r15+32]
-        aesenc	xmm4, [r15+48]
-        aesenc	xmm4, [r15+64]
-        aesenc	xmm4, [r15+80]
-        aesenc	xmm4, [r15+96]
-        aesenc	xmm4, [r15+112]
-        aesenc	xmm4, [r15+128]
-        aesenc	xmm4, [r15+144]
-        cmp	r10d, 11
-        movdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_aesenc_last15_enc_avx_aesenc_avx_last
-        aesenc	xmm4, xmm9
-        aesenc	xmm4, [r15+176]
-        cmp	r10d, 13
-        movdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_aesenc_last15_enc_avx_aesenc_avx_last
-        aesenc	xmm4, xmm9
-        aesenc	xmm4, [r15+208]
-        movdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_aesenc_last15_enc_avx_aesenc_avx_last:
-        aesenclast	xmm4, xmm9
-        sub	rsp, 16
-        xor	ecx, ecx
-        movdqu	[rsp], xmm4
-L_AES_GCM_encrypt_aesenc_last15_enc_avx_loop:
-        movzx	r13d, BYTE PTR [rdi+rbx]
-        xor	r13b, BYTE PTR [rsp+rcx]
-        mov	BYTE PTR [rsi+rbx], r13b
-        mov	BYTE PTR [rsp+rcx], r13b
-        inc	ebx
-        inc	ecx
-        cmp	ebx, edx
-        jl	L_AES_GCM_encrypt_aesenc_last15_enc_avx_loop
-        xor	r13, r13
-        cmp	ecx, 16
-        je	L_AES_GCM_encrypt_aesenc_last15_enc_avx_finish_enc
-L_AES_GCM_encrypt_aesenc_last15_enc_avx_byte_loop:
-        mov	BYTE PTR [rsp+rcx], r13b
-        inc	ecx
-        cmp	ecx, 16
-        jl	L_AES_GCM_encrypt_aesenc_last15_enc_avx_byte_loop
-L_AES_GCM_encrypt_aesenc_last15_enc_avx_finish_enc:
-        movdqu	xmm4, [rsp]
-        add	rsp, 16
-        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm6, xmm4
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm6, 78
-        movdqa	xmm11, xmm6
-        movdqa	xmm8, xmm6
-        pclmulqdq	xmm11, xmm5, 17
-        pclmulqdq	xmm8, xmm5, 0
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm6
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm6, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm6, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm6, xmm14
-L_AES_GCM_encrypt_aesenc_last15_enc_avx_done:
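
The last15 code above spells out the trailing partial block that the earlier sketch handles implicitly: one more keystream block is generated, XORed byte-by-byte with the remaining plaintext, and the ciphertext tail is kept zero-padded in a stack block so the final GHASH update sees a full 16 bytes. Continuing the sketch's variables:

    size_t rem = plen % 16, off = plen - rem;  /* trailing bytes    */
    uint8_t pad[16] = {0};
    for (int k = 15; k >= 12; k--)             /* inc32(counter)    */
        if (++ctr[k] != 0) break;
    aes_encrypt_block(ks, nr, ctr, ek);
    for (size_t k = 0; k < rem; k++)
        pad[k] = ct[off + k] = pt[off + k] ^ ek[k];
    ghash(h, pad, 16, y);                      /* zero-padded block */
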
-L_AES_GCM_encrypt_done_enc:
-        mov	edx, r9d
-        mov	ecx, r11d
-        shl	rdx, 3
-        shl	rcx, 3
-        pinsrq	xmm0, rdx, 0
-        pinsrq	xmm0, rcx, 1
-        pxor	xmm6, xmm0
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm6, 78
-        movdqa	xmm11, xmm6
-        movdqa	xmm8, xmm6
-        pclmulqdq	xmm11, xmm5, 17
-        pclmulqdq	xmm8, xmm5, 0
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm6
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm6, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm6, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm6, xmm14
-        pshufb	xmm6, OWORD PTR L_aes_gcm_bswap_mask
-        movdqu	xmm0, [rsp+144]
-        pxor	xmm0, xmm6
-        cmp	r14d, 16
-        je	L_AES_GCM_encrypt_store_tag_16
-        xor	rcx, rcx
-        movdqu	[rsp], xmm0
-L_AES_GCM_encrypt_store_tag_loop:
-        movzx	r13d, BYTE PTR [rsp+rcx]
-        mov	BYTE PTR [r8+rcx], r13b
-        inc	ecx
-        cmp	ecx, r14d
-        jne	L_AES_GCM_encrypt_store_tag_loop
-        jmp	L_AES_GCM_encrypt_store_tag_done
-L_AES_GCM_encrypt_store_tag_16:
-        movdqu	[r8], xmm0
-L_AES_GCM_encrypt_store_tag_done:
-        add	rsp, 160
-        pop	r15
-        pop	r14
-        pop	rbx
-        pop	r12
-        pop	rsi
-        pop	rdi
-        pop	r13
-        ret
-AES_GCM_encrypt ENDP
-_text ENDS
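
AES_GCM_encrypt's prologue translates the Microsoft x64 calling convention (rcx, rdx, r8, r9, then stack) into the registers the generated body expects, which is why it saves the non-volatile rdi/rsi and loads seven further parameters from [rsp+96] onward. The traffic is consistent with a prototype along these lines (inferred from the code above, not quoted from a wolfSSL header):

    void AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
                         const unsigned char *addt,    /* AAD */
                         const unsigned char *ivec, unsigned char *tag,
                         unsigned int nbytes, unsigned int abytes,
                         unsigned int ibytes, unsigned int tbytes,
                         const unsigned char *key, int nr);

AES_GCM_decrypt, which follows, reads one extra stack slot into rbp; this is presumably an output parameter for the tag-comparison result, though its use falls outside the portion quoted here.
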
-_text SEGMENT READONLY PARA
-AES_GCM_decrypt PROC
-        push	r13
-        push	rdi
-        push	rsi
-        push	r12
-        push	rbx
-        push	r14
-        push	r15
-        push	rbp
-        mov	rdi, rcx
-        mov	rsi, rdx
-        mov	r12, r8
-        mov	rax, r9
-        mov	r8, QWORD PTR [rsp+104]
-        mov	r9d, DWORD PTR [rsp+112]
-        mov	r11d, DWORD PTR [rsp+120]
-        mov	ebx, DWORD PTR [rsp+128]
-        mov	r14d, DWORD PTR [rsp+136]
-        mov	r15, QWORD PTR [rsp+144]
-        mov	r10d, DWORD PTR [rsp+152]
-        mov	rbp, QWORD PTR [rsp+160]
-        sub	rsp, 168
-        pxor	xmm4, xmm4
-        pxor	xmm6, xmm6
-        cmp	ebx, 12
-        mov	edx, ebx
-        jne	L_AES_GCM_decrypt_iv_not_12
-        ; # Calculate values when IV is 12 bytes
-        ; Set counter based on IV
-        mov	ecx, 16777216
-        pinsrq	xmm4, QWORD PTR [rax], 0
-        pinsrd	xmm4, DWORD PTR [rax+8], 2
-        pinsrd	xmm4, ecx, 3
-        ; H = Encrypt X(=0) and T = Encrypt counter
-        movdqa	xmm1, xmm4
-        movdqa	xmm5, OWORD PTR [r15]
-        pxor	xmm1, xmm5
-        movdqa	xmm7, OWORD PTR [r15+16]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+32]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+48]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+64]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+80]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+96]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+112]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+128]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+144]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        cmp	r10d, 11
-        movdqa	xmm7, OWORD PTR [r15+160]
-        jl	L_AES_GCM_decrypt_calc_iv_12_last
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+176]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        cmp	r10d, 13
-        movdqa	xmm7, OWORD PTR [r15+192]
-        jl	L_AES_GCM_decrypt_calc_iv_12_last
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+208]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [r15+224]
-L_AES_GCM_decrypt_calc_iv_12_last:
-        aesenclast	xmm5, xmm7
-        aesenclast	xmm1, xmm7
-        pshufb	xmm5, OWORD PTR L_aes_gcm_bswap_mask
-        movdqu	[rsp+144], xmm1
-        jmp	L_AES_GCM_decrypt_iv_done
-L_AES_GCM_decrypt_iv_not_12:
-        ; Calculate values when IV is not 12 bytes
-        ; H = Encrypt X(=0)
-        movdqa	xmm5, OWORD PTR [r15]
-        aesenc	xmm5, [r15+16]
-        aesenc	xmm5, [r15+32]
-        aesenc	xmm5, [r15+48]
-        aesenc	xmm5, [r15+64]
-        aesenc	xmm5, [r15+80]
-        aesenc	xmm5, [r15+96]
-        aesenc	xmm5, [r15+112]
-        aesenc	xmm5, [r15+128]
-        aesenc	xmm5, [r15+144]
-        cmp	r10d, 11
-        movdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_decrypt_calc_iv_1_aesenc_avx_last
-        aesenc	xmm5, xmm9
-        aesenc	xmm5, [r15+176]
-        cmp	r10d, 13
-        movdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_decrypt_calc_iv_1_aesenc_avx_last
-        aesenc	xmm5, xmm9
-        aesenc	xmm5, [r15+208]
-        movdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_decrypt_calc_iv_1_aesenc_avx_last:
-        aesenclast	xmm5, xmm9
-        pshufb	xmm5, OWORD PTR L_aes_gcm_bswap_mask
-        ; Calc counter
-        ; Initialization vector
-        cmp	edx, 0
-        mov	rcx, 0
-        je	L_AES_GCM_decrypt_calc_iv_done
-        cmp	edx, 16
-        jl	L_AES_GCM_decrypt_calc_iv_lt16
-        and	edx, 4294967280
-L_AES_GCM_decrypt_calc_iv_16_loop:
-        movdqu	xmm8, [rax+rcx]
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm4, xmm8
-        pshufd	xmm1, xmm4, 78
-        pshufd	xmm2, xmm5, 78
-        movdqa	xmm3, xmm5
-        movdqa	xmm0, xmm5
-        pclmulqdq	xmm3, xmm4, 17
-        pclmulqdq	xmm0, xmm4, 0
-        pxor	xmm1, xmm4
-        pxor	xmm2, xmm5
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm7, xmm0
-        movdqa	xmm4, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm7, xmm2
-        pxor	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm4
-        psrld	xmm0, 31
-        psrld	xmm1, 31
-        pslld	xmm7, 1
-        pslld	xmm4, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12
-        pslldq	xmm1, 4
-        por	xmm4, xmm2
-        por	xmm7, xmm0
-        por	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm7
-        movdqa	xmm2, xmm7
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm7, xmm0
-        movdqa	xmm2, xmm7
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm7
-        pxor	xmm4, xmm2
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_decrypt_calc_iv_16_loop
-        mov	edx, ebx
-        cmp	ecx, edx
-        je	L_AES_GCM_decrypt_calc_iv_done
-L_AES_GCM_decrypt_calc_iv_lt16:
-        sub	rsp, 16
-        pxor	xmm8, xmm8
-        xor	ebx, ebx
-        movdqu	[rsp], xmm8
-L_AES_GCM_decrypt_calc_iv_loop:
-        movzx	r13d, BYTE PTR [rax+rcx]
-        mov	BYTE PTR [rsp+rbx], r13b
-        inc	ecx
-        inc	ebx
-        cmp	ecx, edx
-        jl	L_AES_GCM_decrypt_calc_iv_loop
-        movdqu	xmm8, [rsp]
-        add	rsp, 16
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm4, xmm8
-        pshufd	xmm1, xmm4, 78
-        pshufd	xmm2, xmm5, 78
-        movdqa	xmm3, xmm5
-        movdqa	xmm0, xmm5
-        pclmulqdq	xmm3, xmm4, 17
-        pclmulqdq	xmm0, xmm4, 0
-        pxor	xmm1, xmm4
-        pxor	xmm2, xmm5
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm7, xmm0
-        movdqa	xmm4, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm7, xmm2
-        pxor	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm4
-        psrld	xmm0, 31
-        psrld	xmm1, 31
-        pslld	xmm7, 1
-        pslld	xmm4, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12
-        pslldq	xmm1, 4
-        por	xmm4, xmm2
-        por	xmm7, xmm0
-        por	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm7
-        movdqa	xmm2, xmm7
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm7, xmm0
-        movdqa	xmm2, xmm7
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm7
-        pxor	xmm4, xmm2
-L_AES_GCM_decrypt_calc_iv_done:
-        ; T = Encrypt counter
-        pxor	xmm0, xmm0
-        shl	edx, 3
-        pinsrq	xmm0, rdx, 0
-        pxor	xmm4, xmm0
-        pshufd	xmm1, xmm4, 78
-        pshufd	xmm2, xmm5, 78
-        movdqa	xmm3, xmm5
-        movdqa	xmm0, xmm5
-        pclmulqdq	xmm3, xmm4, 17
-        pclmulqdq	xmm0, xmm4, 0
-        pxor	xmm1, xmm4
-        pxor	xmm2, xmm5
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm7, xmm0
-        movdqa	xmm4, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm7, xmm2
-        pxor	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm4
-        psrld	xmm0, 31
-        psrld	xmm1, 31
-        pslld	xmm7, 1
-        pslld	xmm4, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12
-        pslldq	xmm1, 4
-        por	xmm4, xmm2
-        por	xmm7, xmm0
-        por	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm7
-        movdqa	xmm2, xmm7
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm7, xmm0
-        movdqa	xmm2, xmm7
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm7
-        pxor	xmm4, xmm2
-        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_mask
-        ;   Encrypt counter
-        movdqa	xmm8, OWORD PTR [r15]
-        pxor	xmm8, xmm4
-        aesenc	xmm8, [r15+16]
-        aesenc	xmm8, [r15+32]
-        aesenc	xmm8, [r15+48]
-        aesenc	xmm8, [r15+64]
-        aesenc	xmm8, [r15+80]
-        aesenc	xmm8, [r15+96]
-        aesenc	xmm8, [r15+112]
-        aesenc	xmm8, [r15+128]
-        aesenc	xmm8, [r15+144]
-        cmp	r10d, 11
-        movdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_decrypt_calc_iv_2_aesenc_avx_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [r15+176]
-        cmp	r10d, 13
-        movdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_decrypt_calc_iv_2_aesenc_avx_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [r15+208]
-        movdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_decrypt_calc_iv_2_aesenc_avx_last:
-        aesenclast	xmm8, xmm9
-        movdqu	[rsp+144], xmm8
-L_AES_GCM_decrypt_iv_done:
-        ; Additional authentication data
-        mov	edx, r11d
-        cmp	edx, 0
-        je	L_AES_GCM_decrypt_calc_aad_done
-        xor	ecx, ecx
-        cmp	edx, 16
-        jl	L_AES_GCM_decrypt_calc_aad_lt16
-        and	edx, 4294967280
-L_AES_GCM_decrypt_calc_aad_16_loop:
-        movdqu	xmm8, [r12+rcx]
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm6, xmm8
-        pshufd	xmm1, xmm6, 78
-        pshufd	xmm2, xmm5, 78
-        movdqa	xmm3, xmm5
-        movdqa	xmm0, xmm5
-        pclmulqdq	xmm3, xmm6, 17
-        pclmulqdq	xmm0, xmm6, 0
-        pxor	xmm1, xmm6
-        pxor	xmm2, xmm5
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm7, xmm0
-        movdqa	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm7, xmm2
-        pxor	xmm6, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm6
-        psrld	xmm0, 31
-        psrld	xmm1, 31
-        pslld	xmm7, 1
-        pslld	xmm6, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12
-        pslldq	xmm1, 4
-        por	xmm6, xmm2
-        por	xmm7, xmm0
-        por	xmm6, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm7
-        movdqa	xmm2, xmm7
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm7, xmm0
-        movdqa	xmm2, xmm7
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm7
-        pxor	xmm6, xmm2
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_decrypt_calc_aad_16_loop
-        mov	edx, r11d
-        cmp	ecx, edx
-        je	L_AES_GCM_decrypt_calc_aad_done
-L_AES_GCM_decrypt_calc_aad_lt16:
-        sub	rsp, 16
-        pxor	xmm8, xmm8
-        xor	ebx, ebx
-        movdqu	[rsp], xmm8
-L_AES_GCM_decrypt_calc_aad_loop:
-        movzx	r13d, BYTE PTR [r12+rcx]
-        mov	BYTE PTR [rsp+rbx], r13b
-        inc	ecx
-        inc	ebx
-        cmp	ecx, edx
-        jl	L_AES_GCM_decrypt_calc_aad_loop
-        movdqu	xmm8, [rsp]
-        add	rsp, 16
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm6, xmm8
-        pshufd	xmm1, xmm6, 78
-        pshufd	xmm2, xmm5, 78
-        movdqa	xmm3, xmm5
-        movdqa	xmm0, xmm5
-        pclmulqdq	xmm3, xmm6, 17
-        pclmulqdq	xmm0, xmm6, 0
-        pxor	xmm1, xmm6
-        pxor	xmm2, xmm5
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm7, xmm0
-        movdqa	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm7, xmm2
-        pxor	xmm6, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm6
-        psrld	xmm0, 31
-        psrld	xmm1, 31
-        pslld	xmm7, 1
-        pslld	xmm6, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12
-        pslldq	xmm1, 4
-        por	xmm6, xmm2
-        por	xmm7, xmm0
-        por	xmm6, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm7
-        movdqa	xmm2, xmm7
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm7, xmm0
-        movdqa	xmm2, xmm7
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm7
-        pxor	xmm6, xmm2
-L_AES_GCM_decrypt_calc_aad_done:
-        ; Calculate counter and H
-        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_epi64
-        movdqa	xmm9, xmm5
-        paddd	xmm4, OWORD PTR L_aes_gcm_one
-        movdqa	xmm8, xmm5
-        movdqu	[rsp+128], xmm4
-        psrlq	xmm9, 63
-        psllq	xmm8, 1
-        pslldq	xmm9, 8
-        por	xmm8, xmm9
-        pshufd	xmm5, xmm5, 255
-        psrad	xmm5, 31
-        pand	xmm5, OWORD PTR L_aes_gcm_mod2_128
-        pxor	xmm5, xmm8
-        xor	ebx, ebx
-        cmp	r9d, 128
-        mov	r13d, r9d
-        jl	L_AES_GCM_decrypt_done_128
-        and	r13d, 4294967168
-        movdqa	xmm2, xmm6
-        ; H ^ 1
-        movdqu	[rsp], xmm5
-        ; H ^ 2
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm5, 78
-        movdqa	xmm11, xmm5
-        movdqa	xmm8, xmm5
-        pclmulqdq	xmm11, xmm5, 17
-        pclmulqdq	xmm8, xmm5, 0
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm5
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm0, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm0, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm0, xmm14
-        movdqu	[rsp+16], xmm0
-        ; H ^ 3
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm0, 78
-        movdqa	xmm11, xmm0
-        movdqa	xmm8, xmm0
-        pclmulqdq	xmm11, xmm5, 17
-        pclmulqdq	xmm8, xmm5, 0
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm0
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm1, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm1, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm1, xmm14
-        movdqu	[rsp+32], xmm1
-        ; H ^ 4
-        pshufd	xmm9, xmm0, 78
-        pshufd	xmm10, xmm0, 78
-        movdqa	xmm11, xmm0
-        movdqa	xmm8, xmm0
-        pclmulqdq	xmm11, xmm0, 17
-        pclmulqdq	xmm8, xmm0, 0
-        pxor	xmm9, xmm0
-        pxor	xmm10, xmm0
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm3, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm3, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm3, xmm14
-        movdqu	[rsp+48], xmm3
-        ; H ^ 5
-        pshufd	xmm9, xmm0, 78
-        pshufd	xmm10, xmm1, 78
-        movdqa	xmm11, xmm1
-        movdqa	xmm8, xmm1
-        pclmulqdq	xmm11, xmm0, 17
-        pclmulqdq	xmm8, xmm0, 0
-        pxor	xmm9, xmm0
-        pxor	xmm10, xmm1
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+64], xmm7
-        ; H ^ 6
-        pshufd	xmm9, xmm1, 78
-        pshufd	xmm10, xmm1, 78
-        movdqa	xmm11, xmm1
-        movdqa	xmm8, xmm1
-        pclmulqdq	xmm11, xmm1, 17
-        pclmulqdq	xmm8, xmm1, 0
-        pxor	xmm9, xmm1
-        pxor	xmm10, xmm1
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+80], xmm7
-        ; H ^ 7
-        pshufd	xmm9, xmm1, 78
-        pshufd	xmm10, xmm3, 78
-        movdqa	xmm11, xmm3
-        movdqa	xmm8, xmm3
-        pclmulqdq	xmm11, xmm1, 17
-        pclmulqdq	xmm8, xmm1, 0
-        pxor	xmm9, xmm1
-        pxor	xmm10, xmm3
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+96], xmm7
-        ; H ^ 8
-        pshufd	xmm9, xmm3, 78
-        pshufd	xmm10, xmm3, 78
-        movdqa	xmm11, xmm3
-        movdqa	xmm8, xmm3
-        pclmulqdq	xmm11, xmm3, 17
-        pclmulqdq	xmm8, xmm3, 0
-        pxor	xmm9, xmm3
-        pxor	xmm10, xmm3
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+112], xmm7
-L_AES_GCM_decrypt_ghash_128:
-        lea	rcx, QWORD PTR [rdi+rbx]
-        lea	rdx, QWORD PTR [rsi+rbx]
-        movdqu	xmm8, [rsp+128]
-        movdqa	xmm1, OWORD PTR L_aes_gcm_bswap_epi64
-        movdqa	xmm0, xmm8
-        pshufb	xmm8, xmm1
-        movdqa	xmm9, xmm0
-        paddd	xmm9, OWORD PTR L_aes_gcm_one
-        pshufb	xmm9, xmm1
-        movdqa	xmm10, xmm0
-        paddd	xmm10, OWORD PTR L_aes_gcm_two
-        pshufb	xmm10, xmm1
-        movdqa	xmm11, xmm0
-        paddd	xmm11, OWORD PTR L_aes_gcm_three
-        pshufb	xmm11, xmm1
-        movdqa	xmm12, xmm0
-        paddd	xmm12, OWORD PTR L_aes_gcm_four
-        pshufb	xmm12, xmm1
-        movdqa	xmm13, xmm0
-        paddd	xmm13, OWORD PTR L_aes_gcm_five
-        pshufb	xmm13, xmm1
-        movdqa	xmm14, xmm0
-        paddd	xmm14, OWORD PTR L_aes_gcm_six
-        pshufb	xmm14, xmm1
-        movdqa	xmm15, xmm0
-        paddd	xmm15, OWORD PTR L_aes_gcm_seven
-        pshufb	xmm15, xmm1
-        paddd	xmm0, OWORD PTR L_aes_gcm_eight
-        movdqa	xmm7, OWORD PTR [r15]
-        movdqu	[rsp+128], xmm0
-        pxor	xmm8, xmm7
-        pxor	xmm9, xmm7
-        pxor	xmm10, xmm7
-        pxor	xmm11, xmm7
-        pxor	xmm12, xmm7
-        pxor	xmm13, xmm7
-        pxor	xmm14, xmm7
-        pxor	xmm15, xmm7
-        movdqu	xmm7, [rsp+112]
-        movdqu	xmm0, [rcx]
-        aesenc	xmm8, [r15+16]
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm0, xmm2
-        pshufd	xmm1, xmm7, 78
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm1, xmm7
-        pxor	xmm5, xmm0
-        movdqa	xmm3, xmm0
-        pclmulqdq	xmm3, xmm7, 17
-        aesenc	xmm9, [r15+16]
-        aesenc	xmm10, [r15+16]
-        movdqa	xmm2, xmm0
-        pclmulqdq	xmm2, xmm7, 0
-        aesenc	xmm11, [r15+16]
-        aesenc	xmm12, [r15+16]
-        pclmulqdq	xmm1, xmm5, 0
-        aesenc	xmm13, [r15+16]
-        aesenc	xmm14, [r15+16]
-        aesenc	xmm15, [r15+16]
-        pxor	xmm1, xmm2
-        pxor	xmm1, xmm3
-        movdqu	xmm7, [rsp+96]
-        movdqu	xmm0, [rcx+16]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [r15+32]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [r15+32]
-        aesenc	xmm10, [r15+32]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [r15+32]
-        aesenc	xmm12, [r15+32]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [r15+32]
-        aesenc	xmm14, [r15+32]
-        aesenc	xmm15, [r15+32]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+80]
-        movdqu	xmm0, [rcx+32]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [r15+48]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [r15+48]
-        aesenc	xmm10, [r15+48]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [r15+48]
-        aesenc	xmm12, [r15+48]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [r15+48]
-        aesenc	xmm14, [r15+48]
-        aesenc	xmm15, [r15+48]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+64]
-        movdqu	xmm0, [rcx+48]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [r15+64]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [r15+64]
-        aesenc	xmm10, [r15+64]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [r15+64]
-        aesenc	xmm12, [r15+64]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [r15+64]
-        aesenc	xmm14, [r15+64]
-        aesenc	xmm15, [r15+64]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+48]
-        movdqu	xmm0, [rcx+64]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [r15+80]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [r15+80]
-        aesenc	xmm10, [r15+80]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [r15+80]
-        aesenc	xmm12, [r15+80]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [r15+80]
-        aesenc	xmm14, [r15+80]
-        aesenc	xmm15, [r15+80]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+32]
-        movdqu	xmm0, [rcx+80]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [r15+96]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [r15+96]
-        aesenc	xmm10, [r15+96]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [r15+96]
-        aesenc	xmm12, [r15+96]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [r15+96]
-        aesenc	xmm14, [r15+96]
-        aesenc	xmm15, [r15+96]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+16]
-        movdqu	xmm0, [rcx+96]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [r15+112]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [r15+112]
-        aesenc	xmm10, [r15+112]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [r15+112]
-        aesenc	xmm12, [r15+112]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [r15+112]
-        aesenc	xmm14, [r15+112]
-        aesenc	xmm15, [r15+112]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp]
-        movdqu	xmm0, [rcx+112]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [r15+128]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [r15+128]
-        aesenc	xmm10, [r15+128]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [r15+128]
-        aesenc	xmm12, [r15+128]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [r15+128]
-        aesenc	xmm14, [r15+128]
-        aesenc	xmm15, [r15+128]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqa	xmm5, xmm1
-        psrldq	xmm1, 8
-        pslldq	xmm5, 8
-        aesenc	xmm8, [r15+144]
-        pxor	xmm2, xmm5
-        pxor	xmm3, xmm1
-        movdqa	xmm7, xmm2
-        movdqa	xmm4, xmm2
-        movdqa	xmm5, xmm2
-        aesenc	xmm9, [r15+144]
-        pslld	xmm7, 31
-        pslld	xmm4, 30
-        pslld	xmm5, 25
-        aesenc	xmm10, [r15+144]
-        pxor	xmm7, xmm4
-        pxor	xmm7, xmm5
-        aesenc	xmm11, [r15+144]
-        movdqa	xmm4, xmm7
-        pslldq	xmm7, 12
-        psrldq	xmm4, 4
-        aesenc	xmm12, [r15+144]
-        pxor	xmm2, xmm7
-        movdqa	xmm5, xmm2
-        movdqa	xmm1, xmm2
-        movdqa	xmm0, xmm2
-        aesenc	xmm13, [r15+144]
-        psrld	xmm5, 1
-        psrld	xmm1, 2
-        psrld	xmm0, 7
-        aesenc	xmm14, [r15+144]
-        pxor	xmm5, xmm1
-        pxor	xmm5, xmm0
-        aesenc	xmm15, [r15+144]
-        pxor	xmm5, xmm4
-        pxor	xmm2, xmm5
-        pxor	xmm2, xmm3
-        cmp	r10d, 11
-        movdqa	xmm7, OWORD PTR [r15+160]
-        jl	L_AES_GCM_decrypt_aesenc_128_ghash_avx_done
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+176]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        cmp	r10d, 13
-        movdqa	xmm7, OWORD PTR [r15+192]
-        jl	L_AES_GCM_decrypt_aesenc_128_ghash_avx_done
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+208]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [r15+224]
-L_AES_GCM_decrypt_aesenc_128_ghash_avx_done:
-        aesenclast	xmm8, xmm7
-        aesenclast	xmm9, xmm7
-        movdqu	xmm0, [rcx]
-        movdqu	xmm1, [rcx+16]
-        pxor	xmm8, xmm0
-        pxor	xmm9, xmm1
-        movdqu	[rdx], xmm8
-        movdqu	[rdx+16], xmm9
-        aesenclast	xmm10, xmm7
-        aesenclast	xmm11, xmm7
-        movdqu	xmm0, [rcx+32]
-        movdqu	xmm1, [rcx+48]
-        pxor	xmm10, xmm0
-        pxor	xmm11, xmm1
-        movdqu	[rdx+32], xmm10
-        movdqu	[rdx+48], xmm11
-        aesenclast	xmm12, xmm7
-        aesenclast	xmm13, xmm7
-        movdqu	xmm0, [rcx+64]
-        movdqu	xmm1, [rcx+80]
-        pxor	xmm12, xmm0
-        pxor	xmm13, xmm1
-        movdqu	[rdx+64], xmm12
-        movdqu	[rdx+80], xmm13
-        aesenclast	xmm14, xmm7
-        aesenclast	xmm15, xmm7
-        movdqu	xmm0, [rcx+96]
-        movdqu	xmm1, [rcx+112]
-        pxor	xmm14, xmm0
-        pxor	xmm15, xmm1
-        movdqu	[rdx+96], xmm14
-        movdqu	[rdx+112], xmm15
-        add	ebx, 128
-        cmp	ebx, r13d
-        jl	L_AES_GCM_decrypt_ghash_128
-        movdqa	xmm6, xmm2
-        movdqu	xmm5, [rsp]
-L_AES_GCM_decrypt_done_128:
-        mov	edx, r9d
-        cmp	ebx, edx
-        jge	L_AES_GCM_decrypt_done_dec
-        mov	r13d, r9d
-        and	r13d, 4294967280
-        cmp	ebx, r13d
-        jge	L_AES_GCM_decrypt_last_block_done
-L_AES_GCM_decrypt_last_block_start:
-        lea	rcx, QWORD PTR [rdi+rbx]
-        lea	rdx, QWORD PTR [rsi+rbx]
-        movdqu	xmm1, [rcx]
-        movdqa	xmm0, xmm5
-        pshufb	xmm1, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm1, xmm6
-        movdqu	xmm8, [rsp+128]
-        movdqa	xmm9, xmm8
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_epi64
-        paddd	xmm9, OWORD PTR L_aes_gcm_one
-        pxor	xmm8, [r15]
-        movdqu	[rsp+128], xmm9
-        movdqa	xmm10, xmm1
-        pclmulqdq	xmm10, xmm0, 16
-        aesenc	xmm8, [r15+16]
-        aesenc	xmm8, [r15+32]
-        movdqa	xmm11, xmm1
-        pclmulqdq	xmm11, xmm0, 1
-        aesenc	xmm8, [r15+48]
-        aesenc	xmm8, [r15+64]
-        movdqa	xmm12, xmm1
-        pclmulqdq	xmm12, xmm0, 0
-        aesenc	xmm8, [r15+80]
-        movdqa	xmm1, xmm1
-        pclmulqdq	xmm1, xmm0, 17
-        aesenc	xmm8, [r15+96]
-        pxor	xmm10, xmm11
-        movdqa	xmm2, xmm10
-        psrldq	xmm10, 8
-        pslldq	xmm2, 8
-        aesenc	xmm8, [r15+112]
-        movdqa	xmm3, xmm1
-        pxor	xmm2, xmm12
-        pxor	xmm3, xmm10
-        movdqa	xmm0, OWORD PTR L_aes_gcm_mod2_128
-        movdqa	xmm11, xmm2
-        pclmulqdq	xmm11, xmm0, 16
-        aesenc	xmm8, [r15+128]
-        pshufd	xmm10, xmm2, 78
-        pxor	xmm10, xmm11
-        movdqa	xmm11, xmm10
-        pclmulqdq	xmm11, xmm0, 16
-        aesenc	xmm8, [r15+144]
-        pshufd	xmm6, xmm10, 78
-        pxor	xmm6, xmm11
-        pxor	xmm6, xmm3
-        cmp	r10d, 11
-        movdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_decrypt_aesenc_gfmul_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [r15+176]
-        cmp	r10d, 13
-        movdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_decrypt_aesenc_gfmul_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [r15+208]
-        movdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_decrypt_aesenc_gfmul_last:
-        aesenclast	xmm8, xmm9
-        movdqu	xmm9, [rcx]
-        pxor	xmm8, xmm9
-        movdqu	[rdx], xmm8
-        add	ebx, 16
-        cmp	ebx, r13d
-        jl	L_AES_GCM_decrypt_last_block_start
-L_AES_GCM_decrypt_last_block_done:
-        mov	ecx, r9d
-        mov	edx, ecx
-        and	ecx, 15
-        jz	L_AES_GCM_decrypt_aesenc_last15_dec_avx_done
-        movdqu	xmm4, [rsp+128]
-        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_epi64
-        pxor	xmm4, [r15]
-        aesenc	xmm4, [r15+16]
-        aesenc	xmm4, [r15+32]
-        aesenc	xmm4, [r15+48]
-        aesenc	xmm4, [r15+64]
-        aesenc	xmm4, [r15+80]
-        aesenc	xmm4, [r15+96]
-        aesenc	xmm4, [r15+112]
-        aesenc	xmm4, [r15+128]
-        aesenc	xmm4, [r15+144]
-        cmp	r10d, 11
-        movdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_decrypt_aesenc_last15_dec_avx_aesenc_avx_last
-        aesenc	xmm4, xmm9
-        aesenc	xmm4, [r15+176]
-        cmp	r10d, 13
-        movdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_decrypt_aesenc_last15_dec_avx_aesenc_avx_last
-        aesenc	xmm4, xmm9
-        aesenc	xmm4, [r15+208]
-        movdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_decrypt_aesenc_last15_dec_avx_aesenc_avx_last:
-        aesenclast	xmm4, xmm9
-        sub	rsp, 32
-        xor	ecx, ecx
-        movdqu	[rsp], xmm4
-        pxor	xmm0, xmm0
-        movdqu	[rsp+16], xmm0
-L_AES_GCM_decrypt_aesenc_last15_dec_avx_loop:
-        movzx	r13d, BYTE PTR [rdi+rbx]
-        mov	BYTE PTR [rsp+rcx+16], r13b
-        xor	r13b, BYTE PTR [rsp+rcx]
-        mov	BYTE PTR [rsi+rbx], r13b
-        inc	ebx
-        inc	ecx
-        cmp	ebx, edx
-        jl	L_AES_GCM_decrypt_aesenc_last15_dec_avx_loop
-        movdqu	xmm4, [rsp+16]
-        add	rsp, 32
-        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm6, xmm4
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm6, 78
-        movdqa	xmm11, xmm6
-        movdqa	xmm8, xmm6
-        pclmulqdq	xmm11, xmm5, 17
-        pclmulqdq	xmm8, xmm5, 0
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm6
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm6, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm6, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm6, xmm14
-L_AES_GCM_decrypt_aesenc_last15_dec_avx_done:
-L_AES_GCM_decrypt_done_dec:
-        mov	edx, r9d
-        mov	ecx, r11d
-        shl	rdx, 3
-        shl	rcx, 3
-        pinsrq	xmm0, rdx, 0
-        pinsrq	xmm0, rcx, 1
-        pxor	xmm6, xmm0
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm6, 78
-        movdqa	xmm11, xmm6
-        movdqa	xmm8, xmm6
-        pclmulqdq	xmm11, xmm5, 17
-        pclmulqdq	xmm8, xmm5, 0
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm6
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm6, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm6, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm6, xmm14
-        pshufb	xmm6, OWORD PTR L_aes_gcm_bswap_mask
-        movdqu	xmm0, [rsp+144]
-        pxor	xmm0, xmm6
-        cmp	r14d, 16
-        je	L_AES_GCM_decrypt_cmp_tag_16
-        sub	rsp, 16
-        xor	rcx, rcx
-        xor	rbx, rbx
-        movdqu	[rsp], xmm0
-L_AES_GCM_decrypt_cmp_tag_loop:
-        movzx	r13d, BYTE PTR [rsp+rcx]
-        xor	r13b, BYTE PTR [r8+rcx]
-        or	bl, r13b
-        inc	ecx
-        cmp	ecx, r14d
-        jne	L_AES_GCM_decrypt_cmp_tag_loop
-        cmp	rbx, 0
-        sete	bl
-        add	rsp, 16
-        xor	rcx, rcx
-        jmp	L_AES_GCM_decrypt_cmp_tag_done
-L_AES_GCM_decrypt_cmp_tag_16:
-        movdqu	xmm1, [r8]
-        pcmpeqb	xmm0, xmm1
-        pmovmskb	rdx, xmm0
-        ; if edx == 0xFFFF then return 1, else return 0
-        xor	ebx, ebx
-        cmp	edx, 65535
-        sete	bl
-L_AES_GCM_decrypt_cmp_tag_done:
-        mov	DWORD PTR [rbp], ebx
-        add	rsp, 168
-        pop	rbp
-        pop	r15
-        pop	r14
-        pop	rbx
-        pop	r12
-        pop	rsi
-        pop	rdi
-        pop	r13
-        ret
-AES_GCM_decrypt ENDP
-_text ENDS
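
For reference, the two L_AES_GCM_decrypt_cmp_tag paths at the end of AES_GCM_decrypt above implement a constant-time tag check: the byte loop ORs together the XOR of every byte pair, and the 16-byte fast path uses pcmpeqb/pmovmskb and compares the mask against 0xFFFF. A minimal C sketch of the same check (gcm_tag_eq is a hypothetical name, not a wolfSSL API):

    #include <stddef.h>
    #include <stdint.h>

    /* Accumulate all byte differences so timing does not depend on where
       the first mismatch occurs, like the cmp_tag byte loop above. */
    static int gcm_tag_eq(const uint8_t *calc, const uint8_t *given, size_t len)
    {
        uint8_t diff = 0;
        for (size_t i = 0; i < len; i++)
            diff |= (uint8_t)(calc[i] ^ given[i]);
        return diff == 0;   /* 1 on match, 0 otherwise */
    }
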
-_text SEGMENT READONLY PARA
-AES_GCM_init_aesni PROC
-        push	rdi
-        push	rsi
-        push	r12
-        push	r13
-        push	r14
-        mov	rdi, rcx
-        mov	rsi, rdx
-        mov	r10, r8
-        mov	r11d, r9d
-        mov	rax, QWORD PTR [rsp+80]
-        mov	r8, QWORD PTR [rsp+88]
-        mov	r9, QWORD PTR [rsp+96]
-        sub	rsp, 16
-        pxor	xmm4, xmm4
-        mov	edx, r11d
-        cmp	edx, 12
-        jne	L_AES_GCM_init_aesni_iv_not_12
-        ; Calculate values when IV is 12 bytes
-        ; Set counter based on IV
-        mov	ecx, 16777216
-        pinsrq	xmm4, QWORD PTR [r10], 0
-        pinsrd	xmm4, DWORD PTR [r10+8], 2
-        pinsrd	xmm4, ecx, 3
-        ; H = Encrypt X(=0) and T = Encrypt counter
-        movdqa	xmm1, xmm4
-        movdqa	xmm5, OWORD PTR [rdi]
-        pxor	xmm1, xmm5
-        movdqa	xmm7, OWORD PTR [rdi+16]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [rdi+32]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [rdi+48]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [rdi+64]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [rdi+80]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [rdi+96]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [rdi+112]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [rdi+128]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [rdi+144]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        cmp	esi, 11
-        movdqa	xmm7, OWORD PTR [rdi+160]
-        jl	L_AES_GCM_init_aesni_calc_iv_12_last
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [rdi+176]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        cmp	esi, 13
-        movdqa	xmm7, OWORD PTR [rdi+192]
-        jl	L_AES_GCM_init_aesni_calc_iv_12_last
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [rdi+208]
-        aesenc	xmm5, xmm7
-        aesenc	xmm1, xmm7
-        movdqa	xmm7, OWORD PTR [rdi+224]
-L_AES_GCM_init_aesni_calc_iv_12_last:
-        aesenclast	xmm5, xmm7
-        aesenclast	xmm1, xmm7
-        pshufb	xmm5, OWORD PTR L_aes_gcm_bswap_mask
-        movdqu	xmm15, xmm1
-        jmp	L_AES_GCM_init_aesni_iv_done
-L_AES_GCM_init_aesni_iv_not_12:
-        ; Calculate values when IV is not 12 bytes
-        ; H = Encrypt X(=0)
-        movdqa	xmm5, OWORD PTR [rdi]
-        aesenc	xmm5, [rdi+16]
-        aesenc	xmm5, [rdi+32]
-        aesenc	xmm5, [rdi+48]
-        aesenc	xmm5, [rdi+64]
-        aesenc	xmm5, [rdi+80]
-        aesenc	xmm5, [rdi+96]
-        aesenc	xmm5, [rdi+112]
-        aesenc	xmm5, [rdi+128]
-        aesenc	xmm5, [rdi+144]
-        cmp	esi, 11
-        movdqa	xmm9, OWORD PTR [rdi+160]
-        jl	L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last
-        aesenc	xmm5, xmm9
-        aesenc	xmm5, [rdi+176]
-        cmp	esi, 13
-        movdqa	xmm9, OWORD PTR [rdi+192]
-        jl	L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last
-        aesenc	xmm5, xmm9
-        aesenc	xmm5, [rdi+208]
-        movdqa	xmm9, OWORD PTR [rdi+224]
-L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last:
-        aesenclast	xmm5, xmm9
-        pshufb	xmm5, OWORD PTR L_aes_gcm_bswap_mask
-        ; Calc counter
-        ; Initialization vector
-        cmp	edx, 0
-        mov	rcx, 0
-        je	L_AES_GCM_init_aesni_calc_iv_done
-        cmp	edx, 16
-        jl	L_AES_GCM_init_aesni_calc_iv_lt16
-        and	edx, 4294967280
-L_AES_GCM_init_aesni_calc_iv_16_loop:
-        movdqu	xmm8, [r10+rcx]
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm4, xmm8
-        pshufd	xmm1, xmm4, 78
-        pshufd	xmm2, xmm5, 78
-        movdqa	xmm3, xmm5
-        movdqa	xmm0, xmm5
-        pclmulqdq	xmm3, xmm4, 17
-        pclmulqdq	xmm0, xmm4, 0
-        pxor	xmm1, xmm4
-        pxor	xmm2, xmm5
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm7, xmm0
-        movdqa	xmm4, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm7, xmm2
-        pxor	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm4
-        psrld	xmm0, 31
-        psrld	xmm1, 31
-        pslld	xmm7, 1
-        pslld	xmm4, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12
-        pslldq	xmm1, 4
-        por	xmm4, xmm2
-        por	xmm7, xmm0
-        por	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm7
-        movdqa	xmm2, xmm7
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm7, xmm0
-        movdqa	xmm2, xmm7
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm7
-        pxor	xmm4, xmm2
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_init_aesni_calc_iv_16_loop
-        mov	edx, r11d
-        cmp	ecx, edx
-        je	L_AES_GCM_init_aesni_calc_iv_done
-L_AES_GCM_init_aesni_calc_iv_lt16:
-        sub	rsp, 16
-        pxor	xmm8, xmm8
-        xor	r13d, r13d
-        movdqu	[rsp], xmm8
-L_AES_GCM_init_aesni_calc_iv_loop:
-        movzx	r12d, BYTE PTR [r10+rcx]
-        mov	BYTE PTR [rsp+r13], r12b
-        inc	ecx
-        inc	r13d
-        cmp	ecx, edx
-        jl	L_AES_GCM_init_aesni_calc_iv_loop
-        movdqu	xmm8, [rsp]
-        add	rsp, 16
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm4, xmm8
-        pshufd	xmm1, xmm4, 78
-        pshufd	xmm2, xmm5, 78
-        movdqa	xmm3, xmm5
-        movdqa	xmm0, xmm5
-        pclmulqdq	xmm3, xmm4, 17
-        pclmulqdq	xmm0, xmm4, 0
-        pxor	xmm1, xmm4
-        pxor	xmm2, xmm5
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm7, xmm0
-        movdqa	xmm4, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm7, xmm2
-        pxor	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm4
-        psrld	xmm0, 31
-        psrld	xmm1, 31
-        pslld	xmm7, 1
-        pslld	xmm4, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12
-        pslldq	xmm1, 4
-        por	xmm4, xmm2
-        por	xmm7, xmm0
-        por	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm7
-        movdqa	xmm2, xmm7
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm7, xmm0
-        movdqa	xmm2, xmm7
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm7
-        pxor	xmm4, xmm2
-L_AES_GCM_init_aesni_calc_iv_done:
-        ; T = Encrypt counter
-        pxor	xmm0, xmm0
-        shl	edx, 3
-        pinsrq	xmm0, rdx, 0
-        pxor	xmm4, xmm0
-        pshufd	xmm1, xmm4, 78
-        pshufd	xmm2, xmm5, 78
-        movdqa	xmm3, xmm5
-        movdqa	xmm0, xmm5
-        pclmulqdq	xmm3, xmm4, 17
-        pclmulqdq	xmm0, xmm4, 0
-        pxor	xmm1, xmm4
-        pxor	xmm2, xmm5
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm7, xmm0
-        movdqa	xmm4, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm7, xmm2
-        pxor	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm4
-        psrld	xmm0, 31
-        psrld	xmm1, 31
-        pslld	xmm7, 1
-        pslld	xmm4, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12
-        pslldq	xmm1, 4
-        por	xmm4, xmm2
-        por	xmm7, xmm0
-        por	xmm4, xmm1
-        movdqa	xmm0, xmm7
-        movdqa	xmm1, xmm7
-        movdqa	xmm2, xmm7
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm7, xmm0
-        movdqa	xmm2, xmm7
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm7
-        pxor	xmm4, xmm2
-        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_mask
-        ;   Encrypt counter
-        movdqa	xmm8, OWORD PTR [rdi]
-        pxor	xmm8, xmm4
-        aesenc	xmm8, [rdi+16]
-        aesenc	xmm8, [rdi+32]
-        aesenc	xmm8, [rdi+48]
-        aesenc	xmm8, [rdi+64]
-        aesenc	xmm8, [rdi+80]
-        aesenc	xmm8, [rdi+96]
-        aesenc	xmm8, [rdi+112]
-        aesenc	xmm8, [rdi+128]
-        aesenc	xmm8, [rdi+144]
-        cmp	esi, 11
-        movdqa	xmm9, OWORD PTR [rdi+160]
-        jl	L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [rdi+176]
-        cmp	esi, 13
-        movdqa	xmm9, OWORD PTR [rdi+192]
-        jl	L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [rdi+208]
-        movdqa	xmm9, OWORD PTR [rdi+224]
-L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last:
-        aesenclast	xmm8, xmm9
-        movdqu	xmm15, xmm8
-L_AES_GCM_init_aesni_iv_done:
-        movdqa	OWORD PTR [r9], xmm15
-        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_epi64
-        paddd	xmm4, OWORD PTR L_aes_gcm_one
-        movdqa	OWORD PTR [rax], xmm5
-        movdqa	OWORD PTR [r8], xmm4
-        add	rsp, 16
-        pop	r14
-        pop	r13
-        pop	r12
-        pop	rsi
-        pop	rdi
-        ret
-AES_GCM_init_aesni ENDP
-_text ENDS
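
AES_GCM_init_aesni above computes the three GCM start-up values: H = E_K(0^128) (the GHASH key, stored through rax), the initial data counter (stored through r8), and T = E_K(J0) (stored through r9 and later XORed into the final tag). On the 12-byte-IV fast path, J0 is IV || 0x00000001; the mov ecx, 16777216 constant is that big-endian 1 (0x01000000) placed in the top dword before the byte swap. A hedged C sketch of the same derivation, assuming a hypothetical single-block AES callback (not a wolfSSL API):

    #include <stdint.h>
    #include <string.h>

    typedef void (*aes_block_fn)(const void *key, const uint8_t in[16],
                                 uint8_t out[16]);

    /* 12-byte IV: J0 = IV || 0x00000001, H = E_K(0), T = E_K(J0),
       and the first data counter is J0 + 1. */
    static void gcm_init_iv12(const void *key, aes_block_fn enc,
                              const uint8_t iv[12], uint8_t h[16],
                              uint8_t t[16], uint8_t ctr[16])
    {
        uint8_t zero[16] = {0};
        uint8_t j0[16]   = {0};
        memcpy(j0, iv, 12);
        j0[15] = 0x01;           /* big-endian 1 in the low 32 bits */
        enc(key, zero, h);       /* H: GHASH key */
        enc(key, j0, t);         /* T: folded into the tag at the end */
        memcpy(ctr, j0, 16);
        ctr[15] = 0x02;          /* J0 + 1, used for the first data block */
    }
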
-_text SEGMENT READONLY PARA
-AES_GCM_aad_update_aesni PROC
-        mov	rax, rcx
-        movdqa	xmm5, OWORD PTR [r8]
-        movdqa	xmm6, OWORD PTR [r9]
-        xor	ecx, ecx
-L_AES_GCM_aad_update_aesni_16_loop:
-        movdqu	xmm8, [rax+rcx]
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm5, xmm8
-        pshufd	xmm1, xmm5, 78
-        pshufd	xmm2, xmm6, 78
-        movdqa	xmm3, xmm6
-        movdqa	xmm0, xmm6
-        pclmulqdq	xmm3, xmm5, 17
-        pclmulqdq	xmm0, xmm5, 0
-        pxor	xmm1, xmm5
-        pxor	xmm2, xmm6
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm4, xmm0
-        movdqa	xmm5, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm5, xmm1
-        movdqa	xmm0, xmm4
-        movdqa	xmm1, xmm5
-        psrld	xmm0, 31
-        psrld	xmm1, 31
-        pslld	xmm4, 1
-        pslld	xmm5, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12
-        pslldq	xmm1, 4
-        por	xmm5, xmm2
-        por	xmm4, xmm0
-        por	xmm5, xmm1
-        movdqa	xmm0, xmm4
-        movdqa	xmm1, xmm4
-        movdqa	xmm2, xmm4
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm4, xmm0
-        movdqa	xmm2, xmm4
-        movdqa	xmm3, xmm4
-        movdqa	xmm0, xmm4
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm4
-        pxor	xmm5, xmm2
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_aad_update_aesni_16_loop
-        movdqa	OWORD PTR [r8], xmm5
-        ret
-AES_GCM_aad_update_aesni ENDP
-_text ENDS
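
AES_GCM_aad_update_aesni folds each 16-byte AAD block into the running GHASH state as X = (X ^ A_i) * H in GF(2^128); the pclmulqdq/pshufd sequence above is a Karatsuba-style carry-less multiply followed by reduction modulo x^128 + x^7 + x^2 + x + 1. A slow but readable reference for the same field multiply (the right-shift method from NIST SP 800-38D; function names are illustrative only):

    #include <stdint.h>
    #include <string.h>

    /* Bitwise GF(2^128) multiply, z = x * h, under GHASH's reflected-bit
       convention. The SIMD code above computes the same product. */
    static void gf128_mul(uint8_t z[16], const uint8_t x[16], const uint8_t h[16])
    {
        uint8_t v[16], r[16] = {0};
        memcpy(v, h, 16);
        for (int i = 0; i < 128; i++) {
            if (x[i / 8] & (0x80u >> (i % 8)))   /* bit i of x, MSB first */
                for (int j = 0; j < 16; j++)
                    r[j] ^= v[j];
            int carry = v[15] & 1;               /* v = v * x (shift right) */
            for (int j = 15; j > 0; j--)
                v[j] = (uint8_t)((v[j] >> 1) | (v[j - 1] << 7));
            v[0] >>= 1;
            if (carry)
                v[0] ^= 0xe1;                    /* x^128 + x^7 + x^2 + x + 1 */
        }
        memcpy(z, r, 16);
    }

    /* One GHASH block: x = (x ^ blk) * h, as in the 16-byte loop above. */
    static void ghash_update(uint8_t x[16], const uint8_t h[16],
                             const uint8_t blk[16])
    {
        uint8_t t[16];
        for (int i = 0; i < 16; i++)
            t[i] = (uint8_t)(x[i] ^ blk[i]);
        gf128_mul(x, t, h);
    }
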
-_text SEGMENT READONLY PARA
-AES_GCM_encrypt_block_aesni PROC
-        mov	r10, r8
-        mov	r11, r9
-        mov	rax, QWORD PTR [rsp+40]
-        movdqu	xmm8, [rax]
-        movdqa	xmm9, xmm8
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_epi64
-        paddd	xmm9, OWORD PTR L_aes_gcm_one
-        pxor	xmm8, [rcx]
-        movdqu	[rax], xmm9
-        aesenc	xmm8, [rcx+16]
-        aesenc	xmm8, [rcx+32]
-        aesenc	xmm8, [rcx+48]
-        aesenc	xmm8, [rcx+64]
-        aesenc	xmm8, [rcx+80]
-        aesenc	xmm8, [rcx+96]
-        aesenc	xmm8, [rcx+112]
-        aesenc	xmm8, [rcx+128]
-        aesenc	xmm8, [rcx+144]
-        cmp	edx, 11
-        movdqa	xmm9, OWORD PTR [rcx+160]
-        jl	L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [rcx+176]
-        cmp	edx, 13
-        movdqa	xmm9, OWORD PTR [rcx+192]
-        jl	L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [rcx+208]
-        movdqa	xmm9, OWORD PTR [rcx+224]
-L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last:
-        aesenclast	xmm8, xmm9
-        movdqu	xmm9, [r11]
-        pxor	xmm8, xmm9
-        movdqu	[r10], xmm8
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
-        ret
-AES_GCM_encrypt_block_aesni ENDP
-_text ENDS
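
AES_GCM_encrypt_block_aesni is one CTR step: it byte-swaps and encrypts the current counter, XORs the keystream into one 16-byte input block, stores the result, and advances the counter with a 32-bit paddd (L_aes_gcm_one). The same step in C, again with a hypothetical single-block AES callback:

    #include <stdint.h>

    typedef void (*aes_block_fn)(const void *key, const uint8_t in[16],
                                 uint8_t out[16]);

    /* out = in ^ E_K(ctr); then a 32-bit big-endian increment of ctr,
       matching the paddd on the byte-swapped counter above. */
    static void ctr_encrypt_block(const void *key, aes_block_fn enc,
                                  uint8_t ctr[16], const uint8_t in[16],
                                  uint8_t out[16])
    {
        uint8_t ks[16];
        enc(key, ctr, ks);
        for (int i = 0; i < 16; i++)
            out[i] = (uint8_t)(in[i] ^ ks[i]);
        for (int i = 15; i >= 12; i--)   /* ctr32 increment */
            if (++ctr[i] != 0)
                break;
    }
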
-_text SEGMENT READONLY PARA
-AES_GCM_ghash_block_aesni PROC
-        movdqa	xmm4, OWORD PTR [rdx]
-        movdqa	xmm5, OWORD PTR [r8]
-        movdqu	xmm8, [rcx]
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm4, xmm8
-        pshufd	xmm1, xmm4, 78
-        pshufd	xmm2, xmm5, 78
-        movdqa	xmm3, xmm5
-        movdqa	xmm0, xmm5
-        pclmulqdq	xmm3, xmm4, 17
-        pclmulqdq	xmm0, xmm4, 0
-        pxor	xmm1, xmm4
-        pxor	xmm2, xmm5
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm6, xmm0
-        movdqa	xmm4, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm6, xmm2
-        pxor	xmm4, xmm1
-        movdqa	xmm0, xmm6
-        movdqa	xmm1, xmm4
-        psrld	xmm0, 31
-        psrld	xmm1, 31
-        pslld	xmm6, 1
-        pslld	xmm4, 1
-        movdqa	xmm2, xmm0
-        pslldq	xmm0, 4
-        psrldq	xmm2, 12
-        pslldq	xmm1, 4
-        por	xmm4, xmm2
-        por	xmm6, xmm0
-        por	xmm4, xmm1
-        movdqa	xmm0, xmm6
-        movdqa	xmm1, xmm6
-        movdqa	xmm2, xmm6
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm6, xmm0
-        movdqa	xmm2, xmm6
-        movdqa	xmm3, xmm6
-        movdqa	xmm0, xmm6
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm6
-        pxor	xmm4, xmm2
-        movdqa	OWORD PTR [rdx], xmm4
-        ret
-AES_GCM_ghash_block_aesni ENDP
-_text ENDS
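
AES_GCM_encrypt_update_aesni below (like the decrypt routine earlier) precomputes H^1..H^8 on the stack (the "H ^ n" sections) so the main loop can fold eight blocks per pass: feeding eight ciphertext blocks C0..C7 into GHASH one at a time is algebraically (X ^ C0)*H^8 ^ C1*H^7 ^ ... ^ C7*H, which is what the interleaved pclmulqdq work in the 128-byte loop evaluates. A sketch of the table build, reusing the gf128_mul reference shown after AES_GCM_aad_update_aesni (illustrative, not wolfSSL's API):

    #include <stdint.h>
    #include <string.h>

    /* Requires the gf128_mul() sketch shown earlier in this file. */
    static void ghash_power_table(uint8_t ht[8][16], const uint8_t h[16])
    {
        memcpy(ht[0], h, 16);                 /* H^1 */
        for (int i = 1; i < 8; i++)
            gf128_mul(ht[i], ht[i - 1], h);   /* H^(i+1) = H^i * H */
    }
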
-_text SEGMENT READONLY PARA
-AES_GCM_encrypt_update_aesni PROC
-        push	r13
-        push	r12
-        push	r14
-        push	r15
-        push	rdi
-        mov	rax, rcx
-        mov	r10, r8
-        mov	r8d, edx
-        mov	r11, r9
-        mov	r9d, DWORD PTR [rsp+80]
-        mov	r12, QWORD PTR [rsp+88]
-        mov	r14, QWORD PTR [rsp+96]
-        mov	r15, QWORD PTR [rsp+104]
-        sub	rsp, 160
-        movdqa	xmm6, OWORD PTR [r12]
-        movdqa	xmm5, OWORD PTR [r14]
-        movdqa	xmm9, xmm5
-        movdqa	xmm8, xmm5
-        psrlq	xmm9, 63
-        psllq	xmm8, 1
-        pslldq	xmm9, 8
-        por	xmm8, xmm9
-        pshufd	xmm5, xmm5, 255
-        psrad	xmm5, 31
-        pand	xmm5, OWORD PTR L_aes_gcm_mod2_128
-        pxor	xmm5, xmm8
-        xor	rdi, rdi
-        cmp	r9d, 128
-        mov	r13d, r9d
-        jl	L_AES_GCM_encrypt_update_aesni_done_128
-        and	r13d, 4294967168
-        movdqa	xmm2, xmm6
-        ; H ^ 1
-        movdqu	[rsp], xmm5
-        ; H ^ 2
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm5, 78
-        movdqa	xmm11, xmm5
-        movdqa	xmm8, xmm5
-        pclmulqdq	xmm11, xmm5, 17
-        pclmulqdq	xmm8, xmm5, 0
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm5
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm0, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm0, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm0, xmm14
-        movdqu	[rsp+16], xmm0
-        ; H ^ 3
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm0, 78
-        movdqa	xmm11, xmm0
-        movdqa	xmm8, xmm0
-        pclmulqdq	xmm11, xmm5, 17
-        pclmulqdq	xmm8, xmm5, 0
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm0
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm1, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm1, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm1, xmm14
-        movdqu	[rsp+32], xmm1
-        ; H ^ 4
-        pshufd	xmm9, xmm0, 78
-        pshufd	xmm10, xmm0, 78
-        movdqa	xmm11, xmm0
-        movdqa	xmm8, xmm0
-        pclmulqdq	xmm11, xmm0, 17
-        pclmulqdq	xmm8, xmm0, 0
-        pxor	xmm9, xmm0
-        pxor	xmm10, xmm0
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm3, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm3, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm3, xmm14
-        movdqu	[rsp+48], xmm3
-        ; H ^ 5
-        pshufd	xmm9, xmm0, 78
-        pshufd	xmm10, xmm1, 78
-        movdqa	xmm11, xmm1
-        movdqa	xmm8, xmm1
-        pclmulqdq	xmm11, xmm0, 17
-        pclmulqdq	xmm8, xmm0, 0
-        pxor	xmm9, xmm0
-        pxor	xmm10, xmm1
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+64], xmm7
-        ; H ^ 6
-        pshufd	xmm9, xmm1, 78
-        pshufd	xmm10, xmm1, 78
-        movdqa	xmm11, xmm1
-        movdqa	xmm8, xmm1
-        pclmulqdq	xmm11, xmm1, 17
-        pclmulqdq	xmm8, xmm1, 0
-        pxor	xmm9, xmm1
-        pxor	xmm10, xmm1
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+80], xmm7
-        ; H ^ 7
-        pshufd	xmm9, xmm1, 78
-        pshufd	xmm10, xmm3, 78
-        movdqa	xmm11, xmm3
-        movdqa	xmm8, xmm3
-        pclmulqdq	xmm11, xmm1, 17
-        pclmulqdq	xmm8, xmm1, 0
-        pxor	xmm9, xmm1
-        pxor	xmm10, xmm3
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+96], xmm7
-        ; H ^ 8
-        pshufd	xmm9, xmm3, 78
-        pshufd	xmm10, xmm3, 78
-        movdqa	xmm11, xmm3
-        movdqa	xmm8, xmm3
-        pclmulqdq	xmm11, xmm3, 17
-        pclmulqdq	xmm8, xmm3, 0
-        pxor	xmm9, xmm3
-        pxor	xmm10, xmm3
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+112], xmm7
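-        ; [rsp] .. [rsp+112] now hold H^1 .. H^8; the 128-byte main loop
-        ; multiplies eight ciphertext blocks by these powers and performs
-        ; one combined reduction per iteration.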
-        ; First 128 bytes of input
-        movdqu	xmm8, [r15]
-        movdqa	xmm1, OWORD PTR L_aes_gcm_bswap_epi64
-        movdqa	xmm0, xmm8
-        pshufb	xmm8, xmm1
-        movdqa	xmm9, xmm0
-        paddd	xmm9, OWORD PTR L_aes_gcm_one
-        pshufb	xmm9, xmm1
-        movdqa	xmm10, xmm0
-        paddd	xmm10, OWORD PTR L_aes_gcm_two
-        pshufb	xmm10, xmm1
-        movdqa	xmm11, xmm0
-        paddd	xmm11, OWORD PTR L_aes_gcm_three
-        pshufb	xmm11, xmm1
-        movdqa	xmm12, xmm0
-        paddd	xmm12, OWORD PTR L_aes_gcm_four
-        pshufb	xmm12, xmm1
-        movdqa	xmm13, xmm0
-        paddd	xmm13, OWORD PTR L_aes_gcm_five
-        pshufb	xmm13, xmm1
-        movdqa	xmm14, xmm0
-        paddd	xmm14, OWORD PTR L_aes_gcm_six
-        pshufb	xmm14, xmm1
-        movdqa	xmm15, xmm0
-        paddd	xmm15, OWORD PTR L_aes_gcm_seven
-        pshufb	xmm15, xmm1
-        paddd	xmm0, OWORD PTR L_aes_gcm_eight
-        movdqa	xmm7, OWORD PTR [rax]
-        movdqu	[r15], xmm0
-        pxor	xmm8, xmm7
-        pxor	xmm9, xmm7
-        pxor	xmm10, xmm7
-        pxor	xmm11, xmm7
-        pxor	xmm12, xmm7
-        pxor	xmm13, xmm7
-        pxor	xmm14, xmm7
-        pxor	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+16]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+32]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+48]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+64]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+80]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+96]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+112]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+128]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+144]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
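-        ; r8d holds the AES round count (10/12/14); the rounds below run
-        ; only for 192- and 256-bit keys.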
-        cmp	r8d, 11
-        movdqa	xmm7, OWORD PTR [rax+160]
-        jl	L_AES_GCM_encrypt_update_aesni_enc_done
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+176]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        cmp	r8d, 13
-        movdqa	xmm7, OWORD PTR [rax+192]
-        jl	L_AES_GCM_encrypt_update_aesni_enc_done
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+208]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+224]
-L_AES_GCM_encrypt_update_aesni_enc_done:
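-        ; Final AES round, then XOR the keystream with the eight plaintext
-        ; blocks at [r11] and store the ciphertext to [r10].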
-        aesenclast	xmm8, xmm7
-        aesenclast	xmm9, xmm7
-        movdqu	xmm0, [r11]
-        movdqu	xmm1, [r11+16]
-        pxor	xmm8, xmm0
-        pxor	xmm9, xmm1
-        movdqu	[r10], xmm8
-        movdqu	[r10+16], xmm9
-        aesenclast	xmm10, xmm7
-        aesenclast	xmm11, xmm7
-        movdqu	xmm0, [r11+32]
-        movdqu	xmm1, [r11+48]
-        pxor	xmm10, xmm0
-        pxor	xmm11, xmm1
-        movdqu	[r10+32], xmm10
-        movdqu	[r10+48], xmm11
-        aesenclast	xmm12, xmm7
-        aesenclast	xmm13, xmm7
-        movdqu	xmm0, [r11+64]
-        movdqu	xmm1, [r11+80]
-        pxor	xmm12, xmm0
-        pxor	xmm13, xmm1
-        movdqu	[r10+64], xmm12
-        movdqu	[r10+80], xmm13
-        aesenclast	xmm14, xmm7
-        aesenclast	xmm15, xmm7
-        movdqu	xmm0, [r11+96]
-        movdqu	xmm1, [r11+112]
-        pxor	xmm14, xmm0
-        pxor	xmm15, xmm1
-        movdqu	[r10+96], xmm14
-        movdqu	[r10+112], xmm15
-        cmp	r13d, 128
-        mov	edi, 128
-        jle	L_AES_GCM_encrypt_update_aesni_end_128
-        ; Process further 128-byte chunks of input
-L_AES_GCM_encrypt_update_aesni_ghash_128:
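-        ; Main loop: the AES-CTR rounds for the next eight blocks are
-        ; interleaved with the GHASH of the previous iteration's eight
-        ; ciphertext blocks, read back from [rdx-128] .. [rdx-16].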
-        lea	rcx, QWORD PTR [r11+rdi]
-        lea	rdx, QWORD PTR [r10+rdi]
-        movdqu	xmm8, [r15]
-        movdqa	xmm1, OWORD PTR L_aes_gcm_bswap_epi64
-        movdqa	xmm0, xmm8
-        pshufb	xmm8, xmm1
-        movdqa	xmm9, xmm0
-        paddd	xmm9, OWORD PTR L_aes_gcm_one
-        pshufb	xmm9, xmm1
-        movdqa	xmm10, xmm0
-        paddd	xmm10, OWORD PTR L_aes_gcm_two
-        pshufb	xmm10, xmm1
-        movdqa	xmm11, xmm0
-        paddd	xmm11, OWORD PTR L_aes_gcm_three
-        pshufb	xmm11, xmm1
-        movdqa	xmm12, xmm0
-        paddd	xmm12, OWORD PTR L_aes_gcm_four
-        pshufb	xmm12, xmm1
-        movdqa	xmm13, xmm0
-        paddd	xmm13, OWORD PTR L_aes_gcm_five
-        pshufb	xmm13, xmm1
-        movdqa	xmm14, xmm0
-        paddd	xmm14, OWORD PTR L_aes_gcm_six
-        pshufb	xmm14, xmm1
-        movdqa	xmm15, xmm0
-        paddd	xmm15, OWORD PTR L_aes_gcm_seven
-        pshufb	xmm15, xmm1
-        paddd	xmm0, OWORD PTR L_aes_gcm_eight
-        movdqa	xmm7, OWORD PTR [rax]
-        movdqu	[r15], xmm0
-        pxor	xmm8, xmm7
-        pxor	xmm9, xmm7
-        pxor	xmm10, xmm7
-        pxor	xmm11, xmm7
-        pxor	xmm12, xmm7
-        pxor	xmm13, xmm7
-        pxor	xmm14, xmm7
-        pxor	xmm15, xmm7
-        movdqu	xmm7, [rsp+112]
-        movdqu	xmm0, [rdx+-128]
-        aesenc	xmm8, [rax+16]
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm0, xmm2
-        pshufd	xmm1, xmm7, 78
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm1, xmm7
-        pxor	xmm5, xmm0
-        movdqa	xmm3, xmm0
-        pclmulqdq	xmm3, xmm7, 17
-        aesenc	xmm9, [rax+16]
-        aesenc	xmm10, [rax+16]
-        movdqa	xmm2, xmm0
-        pclmulqdq	xmm2, xmm7, 0
-        aesenc	xmm11, [rax+16]
-        aesenc	xmm12, [rax+16]
-        pclmulqdq	xmm1, xmm5, 0
-        aesenc	xmm13, [rax+16]
-        aesenc	xmm14, [rax+16]
-        aesenc	xmm15, [rax+16]
-        pxor	xmm1, xmm2
-        pxor	xmm1, xmm3
-        movdqu	xmm7, [rsp+96]
-        movdqu	xmm0, [rdx+-112]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [rax+32]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [rax+32]
-        aesenc	xmm10, [rax+32]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [rax+32]
-        aesenc	xmm12, [rax+32]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [rax+32]
-        aesenc	xmm14, [rax+32]
-        aesenc	xmm15, [rax+32]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+80]
-        movdqu	xmm0, [rdx+-96]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [rax+48]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [rax+48]
-        aesenc	xmm10, [rax+48]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [rax+48]
-        aesenc	xmm12, [rax+48]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [rax+48]
-        aesenc	xmm14, [rax+48]
-        aesenc	xmm15, [rax+48]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+64]
-        movdqu	xmm0, [rdx+-80]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [rax+64]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [rax+64]
-        aesenc	xmm10, [rax+64]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [rax+64]
-        aesenc	xmm12, [rax+64]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [rax+64]
-        aesenc	xmm14, [rax+64]
-        aesenc	xmm15, [rax+64]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+48]
-        movdqu	xmm0, [rdx+-64]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [rax+80]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [rax+80]
-        aesenc	xmm10, [rax+80]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [rax+80]
-        aesenc	xmm12, [rax+80]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [rax+80]
-        aesenc	xmm14, [rax+80]
-        aesenc	xmm15, [rax+80]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+32]
-        movdqu	xmm0, [rdx+-48]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [rax+96]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [rax+96]
-        aesenc	xmm10, [rax+96]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [rax+96]
-        aesenc	xmm12, [rax+96]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [rax+96]
-        aesenc	xmm14, [rax+96]
-        aesenc	xmm15, [rax+96]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+16]
-        movdqu	xmm0, [rdx+-32]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [rax+112]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [rax+112]
-        aesenc	xmm10, [rax+112]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [rax+112]
-        aesenc	xmm12, [rax+112]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [rax+112]
-        aesenc	xmm14, [rax+112]
-        aesenc	xmm15, [rax+112]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp]
-        movdqu	xmm0, [rdx+-16]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [rax+128]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [rax+128]
-        aesenc	xmm10, [rax+128]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [rax+128]
-        aesenc	xmm12, [rax+128]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [rax+128]
-        aesenc	xmm14, [rax+128]
-        aesenc	xmm15, [rax+128]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqa	xmm5, xmm1
-        psrldq	xmm1, 8
-        pslldq	xmm5, 8
-        aesenc	xmm8, [rax+144]
-        pxor	xmm2, xmm5
-        pxor	xmm3, xmm1
-        movdqa	xmm7, xmm2
-        movdqa	xmm4, xmm2
-        movdqa	xmm5, xmm2
-        aesenc	xmm9, [rax+144]
-        pslld	xmm7, 31
-        pslld	xmm4, 30
-        pslld	xmm5, 25
-        aesenc	xmm10, [rax+144]
-        pxor	xmm7, xmm4
-        pxor	xmm7, xmm5
-        aesenc	xmm11, [rax+144]
-        movdqa	xmm4, xmm7
-        pslldq	xmm7, 12
-        psrldq	xmm4, 4
-        aesenc	xmm12, [rax+144]
-        pxor	xmm2, xmm7
-        movdqa	xmm5, xmm2
-        movdqa	xmm1, xmm2
-        movdqa	xmm0, xmm2
-        aesenc	xmm13, [rax+144]
-        psrld	xmm5, 1
-        psrld	xmm1, 2
-        psrld	xmm0, 7
-        aesenc	xmm14, [rax+144]
-        pxor	xmm5, xmm1
-        pxor	xmm5, xmm0
-        aesenc	xmm15, [rax+144]
-        pxor	xmm5, xmm4
-        pxor	xmm2, xmm5
-        pxor	xmm2, xmm3
-        cmp	r8d, 11
-        movdqa	xmm7, OWORD PTR [rax+160]
-        jl	L_AES_GCM_encrypt_update_aesni_aesenc_128_ghash_avx_done
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+176]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        cmp	r8d, 13
-        movdqa	xmm7, OWORD PTR [rax+192]
-        jl	L_AES_GCM_encrypt_update_aesni_aesenc_128_ghash_avx_done
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+208]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+224]
-L_AES_GCM_encrypt_update_aesni_aesenc_128_ghash_avx_done:
-        aesenclast	xmm8, xmm7
-        aesenclast	xmm9, xmm7
-        movdqu	xmm0, [rcx]
-        movdqu	xmm1, [rcx+16]
-        pxor	xmm8, xmm0
-        pxor	xmm9, xmm1
-        movdqu	[rdx], xmm8
-        movdqu	[rdx+16], xmm9
-        aesenclast	xmm10, xmm7
-        aesenclast	xmm11, xmm7
-        movdqu	xmm0, [rcx+32]
-        movdqu	xmm1, [rcx+48]
-        pxor	xmm10, xmm0
-        pxor	xmm11, xmm1
-        movdqu	[rdx+32], xmm10
-        movdqu	[rdx+48], xmm11
-        aesenclast	xmm12, xmm7
-        aesenclast	xmm13, xmm7
-        movdqu	xmm0, [rcx+64]
-        movdqu	xmm1, [rcx+80]
-        pxor	xmm12, xmm0
-        pxor	xmm13, xmm1
-        movdqu	[rdx+64], xmm12
-        movdqu	[rdx+80], xmm13
-        aesenclast	xmm14, xmm7
-        aesenclast	xmm15, xmm7
-        movdqu	xmm0, [rcx+96]
-        movdqu	xmm1, [rcx+112]
-        pxor	xmm14, xmm0
-        pxor	xmm15, xmm1
-        movdqu	[rdx+96], xmm14
-        movdqu	[rdx+112], xmm15
-        add	edi, 128
-        cmp	edi, r13d
-        jl	L_AES_GCM_encrypt_update_aesni_ghash_128
-L_AES_GCM_encrypt_update_aesni_end_128:
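-        ; xmm8..xmm15 still hold the final eight ciphertext blocks;
-        ; byte-reflect them and fold all eight into the GHASH state with
-        ; one combined reduction using the stored powers of H.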
-        movdqa	xmm4, OWORD PTR L_aes_gcm_bswap_mask
-        pshufb	xmm8, xmm4
-        pshufb	xmm9, xmm4
-        pshufb	xmm10, xmm4
-        pshufb	xmm11, xmm4
-        pxor	xmm8, xmm2
-        pshufb	xmm12, xmm4
-        pshufb	xmm13, xmm4
-        pshufb	xmm14, xmm4
-        pshufb	xmm15, xmm4
-        movdqu	xmm7, [rsp+112]
-        pshufd	xmm1, xmm8, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm8, 17
-        pclmulqdq	xmm0, xmm8, 0
-        pxor	xmm1, xmm8
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        movdqa	xmm4, xmm0
-        movdqa	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqu	xmm7, [rsp+96]
-        pshufd	xmm1, xmm9, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm9, 17
-        pclmulqdq	xmm0, xmm9, 0
-        pxor	xmm1, xmm9
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        pxor	xmm4, xmm0
-        pxor	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqu	xmm7, [rsp+80]
-        pshufd	xmm1, xmm10, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm10, 17
-        pclmulqdq	xmm0, xmm10, 0
-        pxor	xmm1, xmm10
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        pxor	xmm4, xmm0
-        pxor	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqu	xmm7, [rsp+64]
-        pshufd	xmm1, xmm11, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm11, 17
-        pclmulqdq	xmm0, xmm11, 0
-        pxor	xmm1, xmm11
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        pxor	xmm4, xmm0
-        pxor	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqu	xmm7, [rsp+48]
-        pshufd	xmm1, xmm12, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm12, 17
-        pclmulqdq	xmm0, xmm12, 0
-        pxor	xmm1, xmm12
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        pxor	xmm4, xmm0
-        pxor	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqu	xmm7, [rsp+32]
-        pshufd	xmm1, xmm13, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm13, 17
-        pclmulqdq	xmm0, xmm13, 0
-        pxor	xmm1, xmm13
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        pxor	xmm4, xmm0
-        pxor	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqu	xmm7, [rsp+16]
-        pshufd	xmm1, xmm14, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm14, 17
-        pclmulqdq	xmm0, xmm14, 0
-        pxor	xmm1, xmm14
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        pxor	xmm4, xmm0
-        pxor	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqu	xmm7, [rsp]
-        pshufd	xmm1, xmm15, 78
-        pshufd	xmm2, xmm7, 78
-        movdqa	xmm3, xmm7
-        movdqa	xmm0, xmm7
-        pclmulqdq	xmm3, xmm15, 17
-        pclmulqdq	xmm0, xmm15, 0
-        pxor	xmm1, xmm15
-        pxor	xmm2, xmm7
-        pclmulqdq	xmm1, xmm2, 0
-        pxor	xmm1, xmm0
-        pxor	xmm1, xmm3
-        movdqa	xmm2, xmm1
-        pxor	xmm4, xmm0
-        pxor	xmm6, xmm3
-        pslldq	xmm2, 8
-        psrldq	xmm1, 8
-        pxor	xmm4, xmm2
-        pxor	xmm6, xmm1
-        movdqa	xmm0, xmm4
-        movdqa	xmm1, xmm4
-        movdqa	xmm2, xmm4
-        pslld	xmm0, 31
-        pslld	xmm1, 30
-        pslld	xmm2, 25
-        pxor	xmm0, xmm1
-        pxor	xmm0, xmm2
-        movdqa	xmm1, xmm0
-        psrldq	xmm1, 4
-        pslldq	xmm0, 12
-        pxor	xmm4, xmm0
-        movdqa	xmm2, xmm4
-        movdqa	xmm3, xmm4
-        movdqa	xmm0, xmm4
-        psrld	xmm2, 1
-        psrld	xmm3, 2
-        psrld	xmm0, 7
-        pxor	xmm2, xmm3
-        pxor	xmm2, xmm0
-        pxor	xmm2, xmm1
-        pxor	xmm2, xmm4
-        pxor	xmm6, xmm2
-        movdqu	xmm5, [rsp]
-L_AES_GCM_encrypt_update_aesni_done_128:
-        mov	edx, r9d
-        cmp	edi, edx
-        jge	L_AES_GCM_encrypt_update_aesni_done_enc
-        mov	r13d, r9d
-        and	r13d, 4294967280
-        cmp	edi, r13d
-        jge	L_AES_GCM_encrypt_update_aesni_last_block_done
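-        ; Handle the remaining full 16-byte blocks one at a time: encrypt a
-        ; single counter block, XOR with the input, and fold the resulting
-        ; ciphertext into the GHASH state (first block unrolled below).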
-        lea	rcx, QWORD PTR [r11+rdi]
-        lea	rdx, QWORD PTR [r10+rdi]
-        movdqu	xmm8, [r15]
-        movdqa	xmm9, xmm8
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_epi64
-        paddd	xmm9, OWORD PTR L_aes_gcm_one
-        pxor	xmm8, [rax]
-        movdqu	[r15], xmm9
-        aesenc	xmm8, [rax+16]
-        aesenc	xmm8, [rax+32]
-        aesenc	xmm8, [rax+48]
-        aesenc	xmm8, [rax+64]
-        aesenc	xmm8, [rax+80]
-        aesenc	xmm8, [rax+96]
-        aesenc	xmm8, [rax+112]
-        aesenc	xmm8, [rax+128]
-        aesenc	xmm8, [rax+144]
-        cmp	r8d, 11
-        movdqa	xmm9, OWORD PTR [rax+160]
-        jl	L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [rax+176]
-        cmp	r8d, 13
-        movdqa	xmm9, OWORD PTR [rax+192]
-        jl	L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [rax+208]
-        movdqa	xmm9, OWORD PTR [rax+224]
-L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last:
-        aesenclast	xmm8, xmm9
-        movdqu	xmm9, [rcx]
-        pxor	xmm8, xmm9
-        movdqu	[rdx], xmm8
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm6, xmm8
-        add	edi, 16
-        cmp	edi, r13d
-        jge	L_AES_GCM_encrypt_update_aesni_last_block_ghash
-L_AES_GCM_encrypt_update_aesni_last_block_start:
-        lea	rcx, QWORD PTR [r11+rdi]
-        lea	rdx, QWORD PTR [r10+rdi]
-        movdqu	xmm8, [r15]
-        movdqa	xmm9, xmm8
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_epi64
-        paddd	xmm9, OWORD PTR L_aes_gcm_one
-        pxor	xmm8, [rax]
-        movdqu	[r15], xmm9
-        movdqa	xmm10, xmm6
-        pclmulqdq	xmm10, xmm5, 16
-        aesenc	xmm8, [rax+16]
-        aesenc	xmm8, [rax+32]
-        movdqa	xmm11, xmm6
-        pclmulqdq	xmm11, xmm5, 1
-        aesenc	xmm8, [rax+48]
-        aesenc	xmm8, [rax+64]
-        movdqa	xmm12, xmm6
-        pclmulqdq	xmm12, xmm5, 0
-        aesenc	xmm8, [rax+80]
-        movdqa	xmm1, xmm6
-        pclmulqdq	xmm1, xmm5, 17
-        aesenc	xmm8, [rax+96]
-        pxor	xmm10, xmm11
-        movdqa	xmm2, xmm10
-        psrldq	xmm10, 8
-        pslldq	xmm2, 8
-        aesenc	xmm8, [rax+112]
-        movdqa	xmm3, xmm1
-        pxor	xmm2, xmm12
-        pxor	xmm3, xmm10
-        movdqa	xmm0, OWORD PTR L_aes_gcm_mod2_128
-        movdqa	xmm11, xmm2
-        pclmulqdq	xmm11, xmm0, 16
-        aesenc	xmm8, [rax+128]
-        pshufd	xmm10, xmm2, 78
-        pxor	xmm10, xmm11
-        movdqa	xmm11, xmm10
-        pclmulqdq	xmm11, xmm0, 16
-        aesenc	xmm8, [rax+144]
-        pshufd	xmm6, xmm10, 78
-        pxor	xmm6, xmm11
-        pxor	xmm6, xmm3
-        cmp	r8d, 11
-        movdqa	xmm9, OWORD PTR [rax+160]
-        jl	L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [rax+176]
-        cmp	r8d, 13
-        movdqa	xmm9, OWORD PTR [rax+192]
-        jl	L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [rax+208]
-        movdqa	xmm9, OWORD PTR [rax+224]
-L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last:
-        aesenclast	xmm8, xmm9
-        movdqu	xmm9, [rcx]
-        pxor	xmm8, xmm9
-        movdqu	[rdx], xmm8
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm6, xmm8
-        add	edi, 16
-        cmp	edi, r13d
-        jl	L_AES_GCM_encrypt_update_aesni_last_block_start
-L_AES_GCM_encrypt_update_aesni_last_block_ghash:
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm6, 78
-        movdqa	xmm11, xmm6
-        movdqa	xmm8, xmm6
-        pclmulqdq	xmm11, xmm5, 17
-        pclmulqdq	xmm8, xmm5, 0
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm6
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm6, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm6, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm6, xmm14
-L_AES_GCM_encrypt_update_aesni_last_block_done:
-L_AES_GCM_encrypt_update_aesni_done_enc:
-        movdqa	OWORD PTR [r12], xmm6
-        add	rsp, 160
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r12
-        pop	r13
-        ret
-AES_GCM_encrypt_update_aesni ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_encrypt_final_aesni PROC
-        push	r13
-        push	r12
-        push	r14
-        mov	rax, rcx
-        mov	r10d, r9d
-        mov	r9, rdx
-        mov	r11d, DWORD PTR [rsp+64]
-        mov	r12, QWORD PTR [rsp+72]
-        mov	r14, QWORD PTR [rsp+80]
-        sub	rsp, 16
-        movdqa	xmm4, OWORD PTR [rax]
-        movdqa	xmm5, OWORD PTR [r12]
-        movdqa	xmm6, OWORD PTR [r14]
-        movdqa	xmm9, xmm5
-        movdqa	xmm8, xmm5
-        psrlq	xmm9, 63
-        psllq	xmm8, 1
-        pslldq	xmm9, 8
-        por	xmm8, xmm9
-        pshufd	xmm5, xmm5, 255
-        psrad	xmm5, 31
-        pand	xmm5, OWORD PTR L_aes_gcm_mod2_128
-        pxor	xmm5, xmm8
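-        ; xmm5 now holds the hash key doubled in GF(2^128): a 128-bit left
-        ; shift with a conditional XOR of the reduction constant when the
-        ; top bit was set.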
-        mov	edx, r10d
-        mov	ecx, r11d
-        shl	rdx, 3
-        shl	rcx, 3
-        pinsrq	xmm0, rdx, 0
-        pinsrq	xmm0, rcx, 1
-        pxor	xmm4, xmm0
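-        ; Append the lengths block: both byte counts are converted to bit
-        ; counts (shl 3), packed into xmm0 with pinsrq, and XORed into the
-        ; GHASH state before the final multiply.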
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm4, 78
-        movdqa	xmm11, xmm4
-        movdqa	xmm8, xmm4
-        pclmulqdq	xmm11, xmm5, 17
-        pclmulqdq	xmm8, xmm5, 0
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm4
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm4, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm4, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm4, xmm14
-        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_mask
-        movdqu	xmm0, xmm6
-        pxor	xmm0, xmm4
-        cmp	r8d, 16
-        je	L_AES_GCM_encrypt_final_aesni_store_tag_16
-        xor	rcx, rcx
-        movdqu	[rsp], xmm0
-L_AES_GCM_encrypt_final_aesni_store_tag_loop:
-        movzx	r13d, BYTE PTR [rsp+rcx]
-        mov	BYTE PTR [r9+rcx], r13b
-        inc	ecx
-        cmp	ecx, r8d
-        jne	L_AES_GCM_encrypt_final_aesni_store_tag_loop
-        jmp	L_AES_GCM_encrypt_final_aesni_store_tag_done
-L_AES_GCM_encrypt_final_aesni_store_tag_16:
-        movdqu	[r9], xmm0
-L_AES_GCM_encrypt_final_aesni_store_tag_done:
-        add	rsp, 16
-        pop	r14
-        pop	r12
-        pop	r13
-        ret
-AES_GCM_encrypt_final_aesni ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_decrypt_update_aesni PROC
-        push	r13
-        push	r12
-        push	r14
-        push	r15
-        push	rdi
-        push	rsi
-        mov	rax, rcx
-        mov	r10, r8
-        mov	r8d, edx
-        mov	r11, r9
-        mov	r9d, DWORD PTR [rsp+88]
-        mov	r12, QWORD PTR [rsp+96]
-        mov	r14, QWORD PTR [rsp+104]
-        mov	r15, QWORD PTR [rsp+112]
-        sub	rsp, 168
-        movdqa	xmm6, OWORD PTR [r12]
-        movdqa	xmm5, OWORD PTR [r14]
-        movdqa	xmm9, xmm5
-        movdqa	xmm8, xmm5
-        psrlq	xmm9, 63
-        psllq	xmm8, 1
-        pslldq	xmm9, 8
-        por	xmm8, xmm9
-        pshufd	xmm5, xmm5, 255
-        psrad	xmm5, 31
-        pand	xmm5, OWORD PTR L_aes_gcm_mod2_128
-        pxor	xmm5, xmm8
-        xor	edi, edi
-        cmp	r9d, 128
-        mov	r13d, r9d
-        jl	L_AES_GCM_decrypt_update_aesni_done_128
-        and	r13d, 4294967168
-        movdqa	xmm2, xmm6
-        ; H ^ 1
-        movdqu	[rsp], xmm5
-        ; H ^ 2
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm5, 78
-        movdqa	xmm11, xmm5
-        movdqa	xmm8, xmm5
-        pclmulqdq	xmm11, xmm5, 17
-        pclmulqdq	xmm8, xmm5, 0
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm5
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm0, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm0, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm0, xmm14
-        movdqu	[rsp+16], xmm0
-        ; H ^ 3
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm0, 78
-        movdqa	xmm11, xmm0
-        movdqa	xmm8, xmm0
-        pclmulqdq	xmm11, xmm5, 17
-        pclmulqdq	xmm8, xmm5, 0
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm0
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm1, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm1, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm1, xmm14
-        movdqu	[rsp+32], xmm1
-        ; H ^ 4
-        pshufd	xmm9, xmm0, 78
-        pshufd	xmm10, xmm0, 78
-        movdqa	xmm11, xmm0
-        movdqa	xmm8, xmm0
-        pclmulqdq	xmm11, xmm0, 17
-        pclmulqdq	xmm8, xmm0, 0
-        pxor	xmm9, xmm0
-        pxor	xmm10, xmm0
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm3, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm3, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm3, xmm14
-        movdqu	[rsp+48], xmm3
-        ; H ^ 5
-        pshufd	xmm9, xmm0, 78
-        pshufd	xmm10, xmm1, 78
-        movdqa	xmm11, xmm1
-        movdqa	xmm8, xmm1
-        pclmulqdq	xmm11, xmm0, 17
-        pclmulqdq	xmm8, xmm0, 0
-        pxor	xmm9, xmm0
-        pxor	xmm10, xmm1
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+64], xmm7
-        ; H ^ 6
-        pshufd	xmm9, xmm1, 78
-        pshufd	xmm10, xmm1, 78
-        movdqa	xmm11, xmm1
-        movdqa	xmm8, xmm1
-        pclmulqdq	xmm11, xmm1, 17
-        pclmulqdq	xmm8, xmm1, 0
-        pxor	xmm9, xmm1
-        pxor	xmm10, xmm1
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+80], xmm7
-        ; H ^ 7
-        pshufd	xmm9, xmm1, 78
-        pshufd	xmm10, xmm3, 78
-        movdqa	xmm11, xmm3
-        movdqa	xmm8, xmm3
-        pclmulqdq	xmm11, xmm1, 17
-        pclmulqdq	xmm8, xmm1, 0
-        pxor	xmm9, xmm1
-        pxor	xmm10, xmm3
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+96], xmm7
-        ; H ^ 8
-        pshufd	xmm9, xmm3, 78
-        pshufd	xmm10, xmm3, 78
-        movdqa	xmm11, xmm3
-        movdqa	xmm8, xmm3
-        pclmulqdq	xmm11, xmm3, 17
-        pclmulqdq	xmm8, xmm3, 0
-        pxor	xmm9, xmm3
-        pxor	xmm10, xmm3
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm7, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm7, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm7, xmm14
-        movdqu	[rsp+112], xmm7
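-        ; Same H^1..H^8 table as the encrypt path; in the decrypt loop the
-        ; GHASH input is the incoming ciphertext at [rcx], so hashing and
-        ; decryption of the same eight blocks proceed in a single pass.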
-L_AES_GCM_decrypt_update_aesni_ghash_128:
-        lea	rcx, QWORD PTR [r11+rdi]
-        lea	rdx, QWORD PTR [r10+rdi]
-        movdqu	xmm8, [r15]
-        movdqa	xmm1, OWORD PTR L_aes_gcm_bswap_epi64
-        movdqa	xmm0, xmm8
-        pshufb	xmm8, xmm1
-        movdqa	xmm9, xmm0
-        paddd	xmm9, OWORD PTR L_aes_gcm_one
-        pshufb	xmm9, xmm1
-        movdqa	xmm10, xmm0
-        paddd	xmm10, OWORD PTR L_aes_gcm_two
-        pshufb	xmm10, xmm1
-        movdqa	xmm11, xmm0
-        paddd	xmm11, OWORD PTR L_aes_gcm_three
-        pshufb	xmm11, xmm1
-        movdqa	xmm12, xmm0
-        paddd	xmm12, OWORD PTR L_aes_gcm_four
-        pshufb	xmm12, xmm1
-        movdqa	xmm13, xmm0
-        paddd	xmm13, OWORD PTR L_aes_gcm_five
-        pshufb	xmm13, xmm1
-        movdqa	xmm14, xmm0
-        paddd	xmm14, OWORD PTR L_aes_gcm_six
-        pshufb	xmm14, xmm1
-        movdqa	xmm15, xmm0
-        paddd	xmm15, OWORD PTR L_aes_gcm_seven
-        pshufb	xmm15, xmm1
-        paddd	xmm0, OWORD PTR L_aes_gcm_eight
-        movdqa	xmm7, OWORD PTR [rax]
-        movdqu	[r15], xmm0
-        pxor	xmm8, xmm7
-        pxor	xmm9, xmm7
-        pxor	xmm10, xmm7
-        pxor	xmm11, xmm7
-        pxor	xmm12, xmm7
-        pxor	xmm13, xmm7
-        pxor	xmm14, xmm7
-        pxor	xmm15, xmm7
-        movdqu	xmm7, [rsp+112]
-        movdqu	xmm0, [rcx]
-        aesenc	xmm8, [rax+16]
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm0, xmm2
-        pshufd	xmm1, xmm7, 78
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm1, xmm7
-        pxor	xmm5, xmm0
-        movdqa	xmm3, xmm0
-        pclmulqdq	xmm3, xmm7, 17
-        aesenc	xmm9, [rax+16]
-        aesenc	xmm10, [rax+16]
-        movdqa	xmm2, xmm0
-        pclmulqdq	xmm2, xmm7, 0
-        aesenc	xmm11, [rax+16]
-        aesenc	xmm12, [rax+16]
-        pclmulqdq	xmm1, xmm5, 0
-        aesenc	xmm13, [rax+16]
-        aesenc	xmm14, [rax+16]
-        aesenc	xmm15, [rax+16]
-        pxor	xmm1, xmm2
-        pxor	xmm1, xmm3
-        movdqu	xmm7, [rsp+96]
-        movdqu	xmm0, [rcx+16]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [rax+32]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [rax+32]
-        aesenc	xmm10, [rax+32]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [rax+32]
-        aesenc	xmm12, [rax+32]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [rax+32]
-        aesenc	xmm14, [rax+32]
-        aesenc	xmm15, [rax+32]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+80]
-        movdqu	xmm0, [rcx+32]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [rax+48]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [rax+48]
-        aesenc	xmm10, [rax+48]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [rax+48]
-        aesenc	xmm12, [rax+48]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [rax+48]
-        aesenc	xmm14, [rax+48]
-        aesenc	xmm15, [rax+48]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+64]
-        movdqu	xmm0, [rcx+48]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [rax+64]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [rax+64]
-        aesenc	xmm10, [rax+64]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [rax+64]
-        aesenc	xmm12, [rax+64]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [rax+64]
-        aesenc	xmm14, [rax+64]
-        aesenc	xmm15, [rax+64]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+48]
-        movdqu	xmm0, [rcx+64]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [rax+80]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [rax+80]
-        aesenc	xmm10, [rax+80]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [rax+80]
-        aesenc	xmm12, [rax+80]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [rax+80]
-        aesenc	xmm14, [rax+80]
-        aesenc	xmm15, [rax+80]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+32]
-        movdqu	xmm0, [rcx+80]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [rax+96]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [rax+96]
-        aesenc	xmm10, [rax+96]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [rax+96]
-        aesenc	xmm12, [rax+96]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [rax+96]
-        aesenc	xmm14, [rax+96]
-        aesenc	xmm15, [rax+96]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp+16]
-        movdqu	xmm0, [rcx+96]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [rax+112]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [rax+112]
-        aesenc	xmm10, [rax+112]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [rax+112]
-        aesenc	xmm12, [rax+112]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [rax+112]
-        aesenc	xmm14, [rax+112]
-        aesenc	xmm15, [rax+112]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqu	xmm7, [rsp]
-        movdqu	xmm0, [rcx+112]
-        pshufd	xmm4, xmm7, 78
-        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
-        aesenc	xmm8, [rax+128]
-        pxor	xmm4, xmm7
-        pshufd	xmm5, xmm0, 78
-        pxor	xmm5, xmm0
-        movdqa	xmm6, xmm0
-        pclmulqdq	xmm6, xmm7, 17
-        aesenc	xmm9, [rax+128]
-        aesenc	xmm10, [rax+128]
-        pclmulqdq	xmm7, xmm0, 0
-        aesenc	xmm11, [rax+128]
-        aesenc	xmm12, [rax+128]
-        pclmulqdq	xmm4, xmm5, 0
-        aesenc	xmm13, [rax+128]
-        aesenc	xmm14, [rax+128]
-        aesenc	xmm15, [rax+128]
-        pxor	xmm1, xmm7
-        pxor	xmm2, xmm7
-        pxor	xmm1, xmm6
-        pxor	xmm3, xmm6
-        pxor	xmm1, xmm4
-        movdqa	xmm5, xmm1
-        psrldq	xmm1, 8
-        pslldq	xmm5, 8
-        aesenc	xmm8, [rax+144]
-        pxor	xmm2, xmm5
-        pxor	xmm3, xmm1
-        movdqa	xmm7, xmm2
-        movdqa	xmm4, xmm2
-        movdqa	xmm5, xmm2
-        aesenc	xmm9, [rax+144]
-        pslld	xmm7, 31
-        pslld	xmm4, 30
-        pslld	xmm5, 25
-        aesenc	xmm10, [rax+144]
-        pxor	xmm7, xmm4
-        pxor	xmm7, xmm5
-        aesenc	xmm11, [rax+144]
-        movdqa	xmm4, xmm7
-        pslldq	xmm7, 12
-        psrldq	xmm4, 4
-        aesenc	xmm12, [rax+144]
-        pxor	xmm2, xmm7
-        movdqa	xmm5, xmm2
-        movdqa	xmm1, xmm2
-        movdqa	xmm0, xmm2
-        aesenc	xmm13, [rax+144]
-        psrld	xmm5, 1
-        psrld	xmm1, 2
-        psrld	xmm0, 7
-        aesenc	xmm14, [rax+144]
-        pxor	xmm5, xmm1
-        pxor	xmm5, xmm0
-        aesenc	xmm15, [rax+144]
-        pxor	xmm5, xmm4
-        pxor	xmm2, xmm5
-        pxor	xmm2, xmm3
-        cmp	r8d, 11
-        movdqa	xmm7, OWORD PTR [rax+160]
-        jl	L_AES_GCM_decrypt_update_aesni_aesenc_128_ghash_avx_done
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+176]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        cmp	r8d, 13
-        movdqa	xmm7, OWORD PTR [rax+192]
-        jl	L_AES_GCM_decrypt_update_aesni_aesenc_128_ghash_avx_done
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+208]
-        aesenc	xmm8, xmm7
-        aesenc	xmm9, xmm7
-        aesenc	xmm10, xmm7
-        aesenc	xmm11, xmm7
-        aesenc	xmm12, xmm7
-        aesenc	xmm13, xmm7
-        aesenc	xmm14, xmm7
-        aesenc	xmm15, xmm7
-        movdqa	xmm7, OWORD PTR [rax+224]
-L_AES_GCM_decrypt_update_aesni_aesenc_128_ghash_avx_done:
-        aesenclast	xmm8, xmm7
-        aesenclast	xmm9, xmm7
-        movdqu	xmm0, [rcx]
-        movdqu	xmm1, [rcx+16]
-        pxor	xmm8, xmm0
-        pxor	xmm9, xmm1
-        movdqu	[rdx], xmm8
-        movdqu	[rdx+16], xmm9
-        aesenclast	xmm10, xmm7
-        aesenclast	xmm11, xmm7
-        movdqu	xmm0, [rcx+32]
-        movdqu	xmm1, [rcx+48]
-        pxor	xmm10, xmm0
-        pxor	xmm11, xmm1
-        movdqu	[rdx+32], xmm10
-        movdqu	[rdx+48], xmm11
-        aesenclast	xmm12, xmm7
-        aesenclast	xmm13, xmm7
-        movdqu	xmm0, [rcx+64]
-        movdqu	xmm1, [rcx+80]
-        pxor	xmm12, xmm0
-        pxor	xmm13, xmm1
-        movdqu	[rdx+64], xmm12
-        movdqu	[rdx+80], xmm13
-        aesenclast	xmm14, xmm7
-        aesenclast	xmm15, xmm7
-        movdqu	xmm0, [rcx+96]
-        movdqu	xmm1, [rcx+112]
-        pxor	xmm14, xmm0
-        pxor	xmm15, xmm1
-        movdqu	[rdx+96], xmm14
-        movdqu	[rdx+112], xmm15
-        add	edi, 128
-        cmp	edi, r13d
-        jl	L_AES_GCM_decrypt_update_aesni_ghash_128
-        movdqa	xmm6, xmm2
-        movdqu	xmm5, [rsp]
-L_AES_GCM_decrypt_update_aesni_done_128:
-        mov	edx, r9d
-        cmp	edi, edx
-        jge	L_AES_GCM_decrypt_update_aesni_done_dec
-        mov	r13d, r9d
-        and	r13d, 4294967280
-        cmp	edi, r13d
-        jge	L_AES_GCM_decrypt_update_aesni_last_block_done
-L_AES_GCM_decrypt_update_aesni_last_block_start:
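-        ; Decrypt-side tail: fold the ciphertext block into the GHASH state
-        ; first, then encrypt the counter block and XOR to recover the
-        ; plaintext.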
-        lea	rcx, QWORD PTR [r11+rdi]
-        lea	rdx, QWORD PTR [r10+rdi]
-        movdqu	xmm1, [rcx]
-        movdqa	xmm0, xmm5
-        pshufb	xmm1, OWORD PTR L_aes_gcm_bswap_mask
-        pxor	xmm1, xmm6
-        movdqu	xmm8, [r15]
-        movdqa	xmm9, xmm8
-        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_epi64
-        paddd	xmm9, OWORD PTR L_aes_gcm_one
-        pxor	xmm8, [rax]
-        movdqu	[r15], xmm9
-        movdqa	xmm10, xmm1
-        pclmulqdq	xmm10, xmm0, 16
-        aesenc	xmm8, [rax+16]
-        aesenc	xmm8, [rax+32]
-        movdqa	xmm11, xmm1
-        pclmulqdq	xmm11, xmm0, 1
-        aesenc	xmm8, [rax+48]
-        aesenc	xmm8, [rax+64]
-        movdqa	xmm12, xmm1
-        pclmulqdq	xmm12, xmm0, 0
-        aesenc	xmm8, [rax+80]
-        movdqa	xmm1, xmm1
-        pclmulqdq	xmm1, xmm0, 17
-        aesenc	xmm8, [rax+96]
-        pxor	xmm10, xmm11
-        movdqa	xmm2, xmm10
-        psrldq	xmm10, 8
-        pslldq	xmm2, 8
-        aesenc	xmm8, [rax+112]
-        movdqa	xmm3, xmm1
-        pxor	xmm2, xmm12
-        pxor	xmm3, xmm10
-        movdqa	xmm0, OWORD PTR L_aes_gcm_mod2_128
-        movdqa	xmm11, xmm2
-        pclmulqdq	xmm11, xmm0, 16
-        aesenc	xmm8, [rax+128]
-        pshufd	xmm10, xmm2, 78
-        pxor	xmm10, xmm11
-        movdqa	xmm11, xmm10
-        pclmulqdq	xmm11, xmm0, 16
-        aesenc	xmm8, [rax+144]
-        pshufd	xmm6, xmm10, 78
-        pxor	xmm6, xmm11
-        pxor	xmm6, xmm3
-        cmp	r8d, 11
-        movdqa	xmm9, OWORD PTR [rax+160]
-        jl	L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [rax+176]
-        cmp	r8d, 13
-        movdqa	xmm9, OWORD PTR [rax+192]
-        jl	L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last
-        aesenc	xmm8, xmm9
-        aesenc	xmm8, [rax+208]
-        movdqa	xmm9, OWORD PTR [rax+224]
-L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last:
-        aesenclast	xmm8, xmm9
-        movdqu	xmm9, [rcx]
-        pxor	xmm8, xmm9
-        movdqu	[rdx], xmm8
-        add	edi, 16
-        cmp	edi, r13d
-        jl	L_AES_GCM_decrypt_update_aesni_last_block_start
-L_AES_GCM_decrypt_update_aesni_last_block_done:
-L_AES_GCM_decrypt_update_aesni_done_dec:
-        movdqa	OWORD PTR [r12], xmm6
-        add	rsp, 168
-        pop	rsi
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r12
-        pop	r13
-        ret
-AES_GCM_decrypt_update_aesni ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_decrypt_final_aesni PROC
-        push	r13
-        push	r12
-        push	r14
-        push	rbp
-        push	r15
-        mov	rax, rcx
-        mov	r10d, r9d
-        mov	r9, rdx
-        mov	r11d, DWORD PTR [rsp+80]
-        mov	r12, QWORD PTR [rsp+88]
-        mov	r14, QWORD PTR [rsp+96]
-        mov	rbp, QWORD PTR [rsp+104]
-        sub	rsp, 16
-        movdqa	xmm6, OWORD PTR [rax]
-        movdqa	xmm5, OWORD PTR [r12]
-        movdqa	xmm15, OWORD PTR [r14]
-        movdqa	xmm9, xmm5
-        movdqa	xmm8, xmm5
-        psrlq	xmm9, 63
-        psllq	xmm8, 1
-        pslldq	xmm9, 8
-        por	xmm8, xmm9
-        pshufd	xmm5, xmm5, 255
-        psrad	xmm5, 31
-        pand	xmm5, OWORD PTR L_aes_gcm_mod2_128
-        pxor	xmm5, xmm8
-        mov	edx, r10d
-        mov	ecx, r11d
-        shl	rdx, 3
-        shl	rcx, 3
-        pinsrq	xmm0, rdx, 0
-        pinsrq	xmm0, rcx, 1
-        pxor	xmm6, xmm0
-        pshufd	xmm9, xmm5, 78
-        pshufd	xmm10, xmm6, 78
-        movdqa	xmm11, xmm6
-        movdqa	xmm8, xmm6
-        pclmulqdq	xmm11, xmm5, 17
-        pclmulqdq	xmm8, xmm5, 0
-        pxor	xmm9, xmm5
-        pxor	xmm10, xmm6
-        pclmulqdq	xmm9, xmm10, 0
-        pxor	xmm9, xmm8
-        pxor	xmm9, xmm11
-        movdqa	xmm10, xmm9
-        movdqa	xmm6, xmm11
-        pslldq	xmm10, 8
-        psrldq	xmm9, 8
-        pxor	xmm8, xmm10
-        pxor	xmm6, xmm9
-        movdqa	xmm12, xmm8
-        movdqa	xmm13, xmm8
-        movdqa	xmm14, xmm8
-        pslld	xmm12, 31
-        pslld	xmm13, 30
-        pslld	xmm14, 25
-        pxor	xmm12, xmm13
-        pxor	xmm12, xmm14
-        movdqa	xmm13, xmm12
-        psrldq	xmm13, 4
-        pslldq	xmm12, 12
-        pxor	xmm8, xmm12
-        movdqa	xmm14, xmm8
-        movdqa	xmm10, xmm8
-        movdqa	xmm9, xmm8
-        psrld	xmm14, 1
-        psrld	xmm10, 2
-        psrld	xmm9, 7
-        pxor	xmm14, xmm10
-        pxor	xmm14, xmm9
-        pxor	xmm14, xmm13
-        pxor	xmm14, xmm8
-        pxor	xmm6, xmm14
-        pshufb	xmm6, OWORD PTR L_aes_gcm_bswap_mask
-        movdqu	xmm0, xmm15
-        pxor	xmm0, xmm6
-        cmp	r8d, 16
-        je	L_AES_GCM_decrypt_final_aesni_cmp_tag_16
-        sub	rsp, 16
-        xor	rcx, rcx
-        xor	r15, r15
-        movdqu	[rsp], xmm0
-L_AES_GCM_decrypt_final_aesni_cmp_tag_loop:
-        movzx	r13d, BYTE PTR [rsp+rcx]
-        xor	r13b, BYTE PTR [r9+rcx]
-        or	r15b, r13b
-        inc	ecx
-        cmp	ecx, r8d
-        jne	L_AES_GCM_decrypt_final_aesni_cmp_tag_loop
-        cmp	r15, 0
-        sete	r15b
-        add	rsp, 16
-        xor	rcx, rcx
-        jmp	L_AES_GCM_decrypt_final_aesni_cmp_tag_done
-L_AES_GCM_decrypt_final_aesni_cmp_tag_16:
-        movdqu	xmm1, [r9]
-        pcmpeqb	xmm0, xmm1
-        pmovmskb	rdx, xmm0
-        ; return 1 if edx == 0xFFFF (all 16 tag bytes matched), else return 0
-        xor	r15d, r15d
-        cmp	edx, 65535
-        sete	r15b
-L_AES_GCM_decrypt_final_aesni_cmp_tag_done:
-        mov	DWORD PTR [rbp], r15d
-        add	rsp, 16
-        pop	r15
-        pop	rbp
-        pop	r14
-        pop	r12
-        pop	r13
-        ret
-AES_GCM_decrypt_final_aesni ENDP
-_text ENDS
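
AES_GCM_decrypt_final_aesni verifies the tag two ways: a vectorized pcmpeqb/pmovmskb compare for exactly-16-byte tags (the mask in edx must come back 0xFFFF), and the byte loop above it that OR-accumulates the XOR of every byte pair for shorter tags. Both run to completion regardless of where a mismatch occurs, so timing does not reveal which tag byte was wrong. A sketch of the scalar variant (hypothetical helper, not the wolfCrypt API):

    #include <stddef.h>
    #include <stdint.h>

    /* Constant-time tag compare: returns 1 when equal, 0 otherwise. */
    static int tag_equal(const uint8_t* a, const uint8_t* b, size_t n)
    {
        uint8_t diff = 0;
        for (size_t i = 0; i < n; i++)
            diff |= (uint8_t)(a[i] ^ b[i]);   /* same or-reduce as r15b above */
        return diff == 0;
    }
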
-IFDEF HAVE_INTEL_AVX1
-_DATA SEGMENT
-ALIGN 16
-L_avx1_aes_gcm_one QWORD 0, 1
-ptr_L_avx1_aes_gcm_one QWORD L_avx1_aes_gcm_one
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx1_aes_gcm_two QWORD 0, 2
-ptr_L_avx1_aes_gcm_two QWORD L_avx1_aes_gcm_two
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx1_aes_gcm_three QWORD 0, 3
-ptr_L_avx1_aes_gcm_three QWORD L_avx1_aes_gcm_three
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx1_aes_gcm_four QWORD 0, 4
-ptr_L_avx1_aes_gcm_four QWORD L_avx1_aes_gcm_four
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx1_aes_gcm_five QWORD 0, 5
-ptr_L_avx1_aes_gcm_five QWORD L_avx1_aes_gcm_five
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx1_aes_gcm_six QWORD 0, 6
-ptr_L_avx1_aes_gcm_six QWORD L_avx1_aes_gcm_six
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx1_aes_gcm_seven QWORD 0, 7
-ptr_L_avx1_aes_gcm_seven QWORD L_avx1_aes_gcm_seven
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx1_aes_gcm_eight QWORD 0, 8
-ptr_L_avx1_aes_gcm_eight QWORD L_avx1_aes_gcm_eight
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx1_aes_gcm_bswap_epi64 QWORD 283686952306183, 579005069656919567
-ptr_L_avx1_aes_gcm_bswap_epi64 QWORD L_avx1_aes_gcm_bswap_epi64
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx1_aes_gcm_bswap_mask QWORD 579005069656919567, 283686952306183
-ptr_L_avx1_aes_gcm_bswap_mask QWORD L_avx1_aes_gcm_bswap_mask
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx1_aes_gcm_mod2_128 QWORD 1, 13979173243358019584
-ptr_L_avx1_aes_gcm_mod2_128 QWORD L_avx1_aes_gcm_mod2_128
-_DATA ENDS
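
Decoded, the _DATA constants above read as follows (a hedged interpretation of the decimal QWORD pairs, bytes listed in memory order): one..eight are the per-lane counter increments fed to vpaddd, the two shuffle masks drive vpshufb, and mod2_128 is the reduction constant used by the pclmulqdq folding steps:

    #include <stdint.h>

    static const uint64_t one[2] = { 0, 1 };             /* ... up to eight */
    static const uint8_t bswap_epi64[16] =               /* 283686952306183, 579005069656919567 */
        { 7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9,8 };      /* byte-swap each 64-bit half */
    static const uint8_t bswap_mask[16] =                /* 579005069656919567, 283686952306183 */
        { 15,14,13,12,11,10,9,8, 7,6,5,4,3,2,1,0 };      /* reverse all 16 bytes */
    static const uint64_t mod2_128[2] = { 1, 0xC200000000000000ULL };
    /* 0xC2...01 is the GCM polynomial x^128 + x^7 + x^2 + x + 1 in the
     * bit-reflected form the two-step pclmulqdq reduction expects. */
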
-_text SEGMENT READONLY PARA
-AES_GCM_encrypt_avx1 PROC
-        push	r13
-        push	rdi
-        push	rsi
-        push	r12
-        push	rbx
-        push	r14
-        push	r15
-        mov	rdi, rcx
-        mov	rsi, rdx
-        mov	r12, r8
-        mov	rax, r9
-        mov	r8, QWORD PTR [rsp+96]
-        mov	r9d, DWORD PTR [rsp+104]
-        mov	r11d, DWORD PTR [rsp+112]
-        mov	ebx, DWORD PTR [rsp+120]
-        mov	r14d, DWORD PTR [rsp+128]
-        mov	r15, QWORD PTR [rsp+136]
-        mov	r10d, DWORD PTR [rsp+144]
-        sub	rsp, 160
-        vpxor	xmm4, xmm4, xmm4
-        vpxor	xmm6, xmm6, xmm6
-        mov	edx, ebx
-        cmp	edx, 12
-        jne	L_AES_GCM_encrypt_avx1_iv_not_12
-        ; Calculate values when IV is 12 bytes
-        ; Set counter based on IV
-        mov	ecx, 16777216
-        vmovq	xmm4, QWORD PTR [rax]
-        vpinsrd	xmm4, xmm4, DWORD PTR [rax+8], 2
-        vpinsrd	xmm4, xmm4, ecx, 3
-        ; H = Encrypt X(=0) and T = Encrypt counter
-        vmovdqa	xmm5, OWORD PTR [r15]
-        vpxor	xmm1, xmm4, xmm5
-        vmovdqa	xmm7, OWORD PTR [r15+16]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+32]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+48]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+64]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+80]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+96]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+112]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+128]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+144]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        cmp	r10d, 11
-        vmovdqa	xmm7, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_12_last
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+176]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        cmp	r10d, 13
-        vmovdqa	xmm7, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_12_last
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+208]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_avx1_calc_iv_12_last:
-        vaesenclast	xmm5, xmm5, xmm7
-        vaesenclast	xmm1, xmm1, xmm7
-        vpshufb	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vmovdqu	OWORD PTR [rsp+144], xmm1
-        jmp	L_AES_GCM_encrypt_avx1_iv_done
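
With a 12-byte IV, GCM fixes the initial counter block as J0 = IV || 0x00000001; the `mov ecx, 16777216` above writes that big-endian 1 (0x01000000 as a little-endian dword) into the top lane, and the hash key H = E_K(0^128) and tag mask E_K(J0) are then computed in one interleaved pass over the round keys. A sketch of the counter-block layout per SP 800-38D (hypothetical helper):

    #include <stdint.h>
    #include <string.h>

    /* Build counter block number ctr for a 12-byte IV (ctr = 1 gives J0). */
    static void ctr_block(uint8_t out[16], const uint8_t iv[12], uint32_t ctr)
    {
        memcpy(out, iv, 12);
        out[12] = (uint8_t)(ctr >> 24);   /* 32-bit big-endian counter */
        out[13] = (uint8_t)(ctr >> 16);
        out[14] = (uint8_t)(ctr >> 8);
        out[15] = (uint8_t)ctr;
    }
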
-L_AES_GCM_encrypt_avx1_iv_not_12:
-        ; Calculate values when IV is not 12 bytes
-        ; H = Encrypt X(=0)
-        vmovdqa	xmm5, OWORD PTR [r15]
-        vaesenc	xmm5, xmm5, [r15+16]
-        vaesenc	xmm5, xmm5, [r15+32]
-        vaesenc	xmm5, xmm5, [r15+48]
-        vaesenc	xmm5, xmm5, [r15+64]
-        vaesenc	xmm5, xmm5, [r15+80]
-        vaesenc	xmm5, xmm5, [r15+96]
-        vaesenc	xmm5, xmm5, [r15+112]
-        vaesenc	xmm5, xmm5, [r15+128]
-        vaesenc	xmm5, xmm5, [r15+144]
-        cmp	r10d, 11
-        vmovdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last
-        vaesenc	xmm5, xmm5, xmm9
-        vaesenc	xmm5, xmm5, [r15+176]
-        cmp	r10d, 13
-        vmovdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last
-        vaesenc	xmm5, xmm5, xmm9
-        vaesenc	xmm5, xmm5, [r15+208]
-        vmovdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last:
-        vaesenclast	xmm5, xmm5, xmm9
-        vpshufb	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        ; Calc counter
-        ; Initialization vector
-        cmp	edx, 0
-        mov	rcx, 0
-        je	L_AES_GCM_encrypt_avx1_calc_iv_done
-        cmp	edx, 16
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_lt16
-        and	edx, 4294967280
-L_AES_GCM_encrypt_avx1_calc_iv_16_loop:
-        vmovdqu	xmm8, OWORD PTR [rax+rcx]
-        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm4, xmm4, xmm8
-        ; ghash_gfmul_avx
-        vpshufd	xmm1, xmm4, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpxor	xmm1, xmm1, xmm4
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm7, xmm0
-        vmovdqa	xmm4, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm7, xmm7, xmm2
-        vpxor	xmm4, xmm4, xmm1
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm4, xmm4, xmm1
-        vpslld	xmm0, xmm7, 31
-        vpslld	xmm1, xmm7, 30
-        vpslld	xmm2, xmm7, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm7, xmm7, xmm0
-        vpsrld	xmm2, xmm7, 1
-        vpsrld	xmm3, xmm7, 2
-        vpsrld	xmm0, xmm7, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm4, xmm4, xmm2
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_16_loop
-        mov	edx, ebx
-        cmp	ecx, edx
-        je	L_AES_GCM_encrypt_avx1_calc_iv_done
-L_AES_GCM_encrypt_avx1_calc_iv_lt16:
-        sub	rsp, 16
-        vpxor	xmm8, xmm8, xmm8
-        xor	ebx, ebx
-        vmovdqu	OWORD PTR [rsp], xmm8
-L_AES_GCM_encrypt_avx1_calc_iv_loop:
-        movzx	r13d, BYTE PTR [rax+rcx]
-        mov	BYTE PTR [rsp+rbx], r13b
-        inc	ecx
-        inc	ebx
-        cmp	ecx, edx
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_loop
-        vmovdqu	xmm8, OWORD PTR [rsp]
-        add	rsp, 16
-        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm4, xmm4, xmm8
-        ; ghash_gfmul_avx
-        vpshufd	xmm1, xmm4, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpxor	xmm1, xmm1, xmm4
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm7, xmm0
-        vmovdqa	xmm4, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm7, xmm7, xmm2
-        vpxor	xmm4, xmm4, xmm1
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm4, xmm4, xmm1
-        vpslld	xmm0, xmm7, 31
-        vpslld	xmm1, xmm7, 30
-        vpslld	xmm2, xmm7, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm7, xmm7, xmm0
-        vpsrld	xmm2, xmm7, 1
-        vpsrld	xmm3, xmm7, 2
-        vpsrld	xmm0, xmm7, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm4, xmm4, xmm2
-L_AES_GCM_encrypt_avx1_calc_iv_done:
-        ; T = Encrypt counter
-        vpxor	xmm0, xmm0, xmm0
-        shl	edx, 3
-        vmovq	xmm0, rdx
-        vpxor	xmm4, xmm4, xmm0
-        ; ghash_gfmul_avx
-        vpshufd	xmm1, xmm4, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpxor	xmm1, xmm1, xmm4
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm7, xmm0
-        vmovdqa	xmm4, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm7, xmm7, xmm2
-        vpxor	xmm4, xmm4, xmm1
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm4, xmm4, xmm1
-        vpslld	xmm0, xmm7, 31
-        vpslld	xmm1, xmm7, 30
-        vpslld	xmm2, xmm7, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm7, xmm7, xmm0
-        vpsrld	xmm2, xmm7, 1
-        vpsrld	xmm3, xmm7, 2
-        vpsrld	xmm0, xmm7, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm4, xmm4, xmm2
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        ;   Encrypt counter
-        vmovdqa	xmm8, OWORD PTR [r15]
-        vpxor	xmm8, xmm8, xmm4
-        vaesenc	xmm8, xmm8, [r15+16]
-        vaesenc	xmm8, xmm8, [r15+32]
-        vaesenc	xmm8, xmm8, [r15+48]
-        vaesenc	xmm8, xmm8, [r15+64]
-        vaesenc	xmm8, xmm8, [r15+80]
-        vaesenc	xmm8, xmm8, [r15+96]
-        vaesenc	xmm8, xmm8, [r15+112]
-        vaesenc	xmm8, xmm8, [r15+128]
-        vaesenc	xmm8, xmm8, [r15+144]
-        cmp	r10d, 11
-        vmovdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [r15+176]
-        cmp	r10d, 13
-        vmovdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [r15+208]
-        vmovdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last:
-        vaesenclast	xmm8, xmm8, xmm9
-        vmovdqu	OWORD PTR [rsp+144], xmm8
-L_AES_GCM_encrypt_avx1_iv_done:
-        ; Additional authentication data
-        mov	edx, r11d
-        cmp	edx, 0
-        je	L_AES_GCM_encrypt_avx1_calc_aad_done
-        xor	ecx, ecx
-        cmp	edx, 16
-        jl	L_AES_GCM_encrypt_avx1_calc_aad_lt16
-        and	edx, 4294967280
-L_AES_GCM_encrypt_avx1_calc_aad_16_loop:
-        vmovdqu	xmm8, OWORD PTR [r12+rcx]
-        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm8
-        ; ghash_gfmul_avx
-        vpshufd	xmm1, xmm6, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm6, 17
-        vpclmulqdq	xmm0, xmm5, xmm6, 0
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm7, xmm0
-        vmovdqa	xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm7, xmm7, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm6, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm6, xmm6, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm6, xmm6, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm6, xmm6, xmm1
-        vpslld	xmm0, xmm7, 31
-        vpslld	xmm1, xmm7, 30
-        vpslld	xmm2, xmm7, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm7, xmm7, xmm0
-        vpsrld	xmm2, xmm7, 1
-        vpsrld	xmm3, xmm7, 2
-        vpsrld	xmm0, xmm7, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm6, xmm6, xmm2
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_encrypt_avx1_calc_aad_16_loop
-        mov	edx, r11d
-        cmp	ecx, edx
-        je	L_AES_GCM_encrypt_avx1_calc_aad_done
-L_AES_GCM_encrypt_avx1_calc_aad_lt16:
-        sub	rsp, 16
-        vpxor	xmm8, xmm8, xmm8
-        xor	ebx, ebx
-        vmovdqu	OWORD PTR [rsp], xmm8
-L_AES_GCM_encrypt_avx1_calc_aad_loop:
-        movzx	r13d, BYTE PTR [r12+rcx]
-        mov	BYTE PTR [rsp+rbx], r13b
-        inc	ecx
-        inc	ebx
-        cmp	ecx, edx
-        jl	L_AES_GCM_encrypt_avx1_calc_aad_loop
-        vmovdqu	xmm8, OWORD PTR [rsp]
-        add	rsp, 16
-        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm8
-        ; ghash_gfmul_avx
-        vpshufd	xmm1, xmm6, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm6, 17
-        vpclmulqdq	xmm0, xmm5, xmm6, 0
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm7, xmm0
-        vmovdqa	xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm7, xmm7, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm6, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm6, xmm6, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm6, xmm6, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm6, xmm6, xmm1
-        vpslld	xmm0, xmm7, 31
-        vpslld	xmm1, xmm7, 30
-        vpslld	xmm2, xmm7, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm7, xmm7, xmm0
-        vpsrld	xmm2, xmm7, 1
-        vpsrld	xmm3, xmm7, 2
-        vpsrld	xmm0, xmm7, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm6, xmm6, xmm2
-L_AES_GCM_encrypt_avx1_calc_aad_done:
-        ; Calculate counter and H
-        vpsrlq	xmm9, xmm5, 63
-        vpsllq	xmm8, xmm5, 1
-        vpslldq	xmm9, xmm9, 8
-        vpor	xmm8, xmm8, xmm9
-        vpshufd	xmm5, xmm5, 255
-        vpsrad	xmm5, xmm5, 31
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpand	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_mod2_128
-        vpaddd	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_one
-        vpxor	xmm5, xmm5, xmm8
-        vmovdqu	OWORD PTR [rsp+128], xmm4
-        xor	ebx, ebx
-        cmp	r9d, 128
-        mov	r13d, r9d
-        jl	L_AES_GCM_encrypt_avx1_done_128
-        and	r13d, 4294967168
-        vmovdqa	xmm2, xmm6
-        ; H ^ 1
-        vmovdqu	OWORD PTR [rsp], xmm5
-        ; H ^ 2
-        vpclmulqdq	xmm8, xmm5, xmm5, 0
-        vpclmulqdq	xmm0, xmm5, xmm5, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm0, xmm0, xmm14
-        vmovdqu	OWORD PTR [rsp+16], xmm0
-        ; H ^ 3
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm5, 78
-        vpshufd	xmm10, xmm0, 78
-        vpclmulqdq	xmm11, xmm0, xmm5, 17
-        vpclmulqdq	xmm8, xmm0, xmm5, 0
-        vpxor	xmm9, xmm9, xmm5
-        vpxor	xmm10, xmm10, xmm0
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm1, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm1, xmm1, xmm14
-        vmovdqu	OWORD PTR [rsp+32], xmm1
-        ; H ^ 4
-        vpclmulqdq	xmm8, xmm0, xmm0, 0
-        vpclmulqdq	xmm3, xmm0, xmm0, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm3, xmm3, xmm14
-        vmovdqu	OWORD PTR [rsp+48], xmm3
-        ; H ^ 5
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm0, 78
-        vpshufd	xmm10, xmm1, 78
-        vpclmulqdq	xmm11, xmm1, xmm0, 17
-        vpclmulqdq	xmm8, xmm1, xmm0, 0
-        vpxor	xmm9, xmm9, xmm0
-        vpxor	xmm10, xmm10, xmm1
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm7, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+64], xmm7
-        ; H ^ 6
-        vpclmulqdq	xmm8, xmm1, xmm1, 0
-        vpclmulqdq	xmm7, xmm1, xmm1, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+80], xmm7
-        ; H ^ 7
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm1, 78
-        vpshufd	xmm10, xmm3, 78
-        vpclmulqdq	xmm11, xmm3, xmm1, 17
-        vpclmulqdq	xmm8, xmm3, xmm1, 0
-        vpxor	xmm9, xmm9, xmm1
-        vpxor	xmm10, xmm10, xmm3
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm7, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+96], xmm7
-        ; H ^ 8
-        vpclmulqdq	xmm8, xmm3, xmm3, 0
-        vpclmulqdq	xmm7, xmm3, xmm3, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+112], xmm7
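
The eight vmovdqu stores above leave H^1..H^8 at [rsp]..[rsp+112]. That table is what lets the 128-byte main loop fold eight blocks into the digest with a single reduction, via X' = (X xor C0)·H^8 xor C1·H^7 xor ... xor C7·H. A sketch of the precomputation, reusing gf128_mul from the earlier sketch:

    /* Hpow[i] holds H^(i+1); hypothetical names, be128/gf128_mul as above. */
    be128 Hpow[8];
    Hpow[0] = H;
    for (int i = 1; i < 8; i++)
        Hpow[i] = gf128_mul(Hpow[i - 1], H);
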
-        ; First 128 bytes of input
-        vmovdqu	xmm0, OWORD PTR [rsp+128]
-        vmovdqa	xmm1, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpshufb	xmm8, xmm0, xmm1
-        vpaddd	xmm9, xmm0, OWORD PTR L_avx1_aes_gcm_one
-        vpshufb	xmm9, xmm9, xmm1
-        vpaddd	xmm10, xmm0, OWORD PTR L_avx1_aes_gcm_two
-        vpshufb	xmm10, xmm10, xmm1
-        vpaddd	xmm11, xmm0, OWORD PTR L_avx1_aes_gcm_three
-        vpshufb	xmm11, xmm11, xmm1
-        vpaddd	xmm12, xmm0, OWORD PTR L_avx1_aes_gcm_four
-        vpshufb	xmm12, xmm12, xmm1
-        vpaddd	xmm13, xmm0, OWORD PTR L_avx1_aes_gcm_five
-        vpshufb	xmm13, xmm13, xmm1
-        vpaddd	xmm14, xmm0, OWORD PTR L_avx1_aes_gcm_six
-        vpshufb	xmm14, xmm14, xmm1
-        vpaddd	xmm15, xmm0, OWORD PTR L_avx1_aes_gcm_seven
-        vpshufb	xmm15, xmm15, xmm1
-        vpaddd	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_eight
-        vmovdqa	xmm7, OWORD PTR [r15]
-        vmovdqu	OWORD PTR [rsp+128], xmm0
-        vpxor	xmm8, xmm8, xmm7
-        vpxor	xmm9, xmm9, xmm7
-        vpxor	xmm10, xmm10, xmm7
-        vpxor	xmm11, xmm11, xmm7
-        vpxor	xmm12, xmm12, xmm7
-        vpxor	xmm13, xmm13, xmm7
-        vpxor	xmm14, xmm14, xmm7
-        vpxor	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+16]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+32]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+48]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+64]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+80]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+96]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+112]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+128]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+144]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r10d, 11
-        vmovdqa	xmm7, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_avx1_aesenc_128_enc_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+176]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r10d, 13
-        vmovdqa	xmm7, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_avx1_aesenc_128_enc_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+208]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_avx1_aesenc_128_enc_done:
-        vaesenclast	xmm8, xmm8, xmm7
-        vaesenclast	xmm9, xmm9, xmm7
-        vmovdqu	xmm0, OWORD PTR [rdi]
-        vmovdqu	xmm1, OWORD PTR [rdi+16]
-        vpxor	xmm8, xmm8, xmm0
-        vpxor	xmm9, xmm9, xmm1
-        vmovdqu	OWORD PTR [rsi], xmm8
-        vmovdqu	OWORD PTR [rsi+16], xmm9
-        vaesenclast	xmm10, xmm10, xmm7
-        vaesenclast	xmm11, xmm11, xmm7
-        vmovdqu	xmm0, OWORD PTR [rdi+32]
-        vmovdqu	xmm1, OWORD PTR [rdi+48]
-        vpxor	xmm10, xmm10, xmm0
-        vpxor	xmm11, xmm11, xmm1
-        vmovdqu	OWORD PTR [rsi+32], xmm10
-        vmovdqu	OWORD PTR [rsi+48], xmm11
-        vaesenclast	xmm12, xmm12, xmm7
-        vaesenclast	xmm13, xmm13, xmm7
-        vmovdqu	xmm0, OWORD PTR [rdi+64]
-        vmovdqu	xmm1, OWORD PTR [rdi+80]
-        vpxor	xmm12, xmm12, xmm0
-        vpxor	xmm13, xmm13, xmm1
-        vmovdqu	OWORD PTR [rsi+64], xmm12
-        vmovdqu	OWORD PTR [rsi+80], xmm13
-        vaesenclast	xmm14, xmm14, xmm7
-        vaesenclast	xmm15, xmm15, xmm7
-        vmovdqu	xmm0, OWORD PTR [rdi+96]
-        vmovdqu	xmm1, OWORD PTR [rdi+112]
-        vpxor	xmm14, xmm14, xmm0
-        vpxor	xmm15, xmm15, xmm1
-        vmovdqu	OWORD PTR [rsi+96], xmm14
-        vmovdqu	OWORD PTR [rsi+112], xmm15
-        cmp	r13d, 128
-        mov	ebx, 128
-        jle	L_AES_GCM_encrypt_avx1_end_128
-        ; Process the remaining input in 128-byte chunks
-L_AES_GCM_encrypt_avx1_ghash_128:
-        lea	rcx, QWORD PTR [rdi+rbx]
-        lea	rdx, QWORD PTR [rsi+rbx]
-        vmovdqu	xmm0, OWORD PTR [rsp+128]
-        vmovdqa	xmm1, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpshufb	xmm8, xmm0, xmm1
-        vpaddd	xmm9, xmm0, OWORD PTR L_avx1_aes_gcm_one
-        vpshufb	xmm9, xmm9, xmm1
-        vpaddd	xmm10, xmm0, OWORD PTR L_avx1_aes_gcm_two
-        vpshufb	xmm10, xmm10, xmm1
-        vpaddd	xmm11, xmm0, OWORD PTR L_avx1_aes_gcm_three
-        vpshufb	xmm11, xmm11, xmm1
-        vpaddd	xmm12, xmm0, OWORD PTR L_avx1_aes_gcm_four
-        vpshufb	xmm12, xmm12, xmm1
-        vpaddd	xmm13, xmm0, OWORD PTR L_avx1_aes_gcm_five
-        vpshufb	xmm13, xmm13, xmm1
-        vpaddd	xmm14, xmm0, OWORD PTR L_avx1_aes_gcm_six
-        vpshufb	xmm14, xmm14, xmm1
-        vpaddd	xmm15, xmm0, OWORD PTR L_avx1_aes_gcm_seven
-        vpshufb	xmm15, xmm15, xmm1
-        vpaddd	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_eight
-        vmovdqa	xmm7, OWORD PTR [r15]
-        vmovdqu	OWORD PTR [rsp+128], xmm0
-        vpxor	xmm8, xmm8, xmm7
-        vpxor	xmm9, xmm9, xmm7
-        vpxor	xmm10, xmm10, xmm7
-        vpxor	xmm11, xmm11, xmm7
-        vpxor	xmm12, xmm12, xmm7
-        vpxor	xmm13, xmm13, xmm7
-        vpxor	xmm14, xmm14, xmm7
-        vpxor	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsp+112]
-        vmovdqu	xmm0, OWORD PTR [rdx+-128]
-        vaesenc	xmm8, xmm8, [r15+16]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm0, xmm0, xmm2
-        vpshufd	xmm1, xmm7, 78
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm3, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+16]
-        vaesenc	xmm10, xmm10, [r15+16]
-        vpclmulqdq	xmm2, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+16]
-        vaesenc	xmm12, xmm12, [r15+16]
-        vpclmulqdq	xmm1, xmm1, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+16]
-        vaesenc	xmm14, xmm14, [r15+16]
-        vaesenc	xmm15, xmm15, [r15+16]
-        vpxor	xmm1, xmm1, xmm2
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqu	xmm7, OWORD PTR [rsp+96]
-        vmovdqu	xmm0, OWORD PTR [rdx+-112]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [r15+32]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+32]
-        vaesenc	xmm10, xmm10, [r15+32]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+32]
-        vaesenc	xmm12, xmm12, [r15+32]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+32]
-        vaesenc	xmm14, xmm14, [r15+32]
-        vaesenc	xmm15, xmm15, [r15+32]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+80]
-        vmovdqu	xmm0, OWORD PTR [rdx+-96]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [r15+48]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+48]
-        vaesenc	xmm10, xmm10, [r15+48]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+48]
-        vaesenc	xmm12, xmm12, [r15+48]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+48]
-        vaesenc	xmm14, xmm14, [r15+48]
-        vaesenc	xmm15, xmm15, [r15+48]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+64]
-        vmovdqu	xmm0, OWORD PTR [rdx+-80]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [r15+64]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+64]
-        vaesenc	xmm10, xmm10, [r15+64]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+64]
-        vaesenc	xmm12, xmm12, [r15+64]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+64]
-        vaesenc	xmm14, xmm14, [r15+64]
-        vaesenc	xmm15, xmm15, [r15+64]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+48]
-        vmovdqu	xmm0, OWORD PTR [rdx+-64]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [r15+80]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+80]
-        vaesenc	xmm10, xmm10, [r15+80]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+80]
-        vaesenc	xmm12, xmm12, [r15+80]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+80]
-        vaesenc	xmm14, xmm14, [r15+80]
-        vaesenc	xmm15, xmm15, [r15+80]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+32]
-        vmovdqu	xmm0, OWORD PTR [rdx+-48]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [r15+96]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+96]
-        vaesenc	xmm10, xmm10, [r15+96]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+96]
-        vaesenc	xmm12, xmm12, [r15+96]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+96]
-        vaesenc	xmm14, xmm14, [r15+96]
-        vaesenc	xmm15, xmm15, [r15+96]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vmovdqu	xmm0, OWORD PTR [rdx+-32]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [r15+112]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+112]
-        vaesenc	xmm10, xmm10, [r15+112]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+112]
-        vaesenc	xmm12, xmm12, [r15+112]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+112]
-        vaesenc	xmm14, xmm14, [r15+112]
-        vaesenc	xmm15, xmm15, [r15+112]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp]
-        vmovdqu	xmm0, OWORD PTR [rdx+-16]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [r15+128]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+128]
-        vaesenc	xmm10, xmm10, [r15+128]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+128]
-        vaesenc	xmm12, xmm12, [r15+128]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+128]
-        vaesenc	xmm14, xmm14, [r15+128]
-        vaesenc	xmm15, xmm15, [r15+128]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vpslldq	xmm5, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vaesenc	xmm8, xmm8, [r15+144]
-        vpxor	xmm2, xmm2, xmm5
-        vpxor	xmm3, xmm3, xmm1
-        vaesenc	xmm9, xmm9, [r15+144]
-        vpslld	xmm7, xmm2, 31
-        vpslld	xmm4, xmm2, 30
-        vpslld	xmm5, xmm2, 25
-        vaesenc	xmm10, xmm10, [r15+144]
-        vpxor	xmm7, xmm7, xmm4
-        vpxor	xmm7, xmm7, xmm5
-        vaesenc	xmm11, xmm11, [r15+144]
-        vpsrldq	xmm4, xmm7, 4
-        vpslldq	xmm7, xmm7, 12
-        vaesenc	xmm12, xmm12, [r15+144]
-        vpxor	xmm2, xmm2, xmm7
-        vpsrld	xmm5, xmm2, 1
-        vaesenc	xmm13, xmm13, [r15+144]
-        vpsrld	xmm1, xmm2, 2
-        vpsrld	xmm0, xmm2, 7
-        vaesenc	xmm14, xmm14, [r15+144]
-        vpxor	xmm5, xmm5, xmm1
-        vpxor	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, [r15+144]
-        vpxor	xmm5, xmm5, xmm4
-        vpxor	xmm2, xmm2, xmm5
-        vpxor	xmm2, xmm2, xmm3
-        cmp	r10d, 11
-        vmovdqa	xmm7, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_avx1_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+176]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r10d, 13
-        vmovdqa	xmm7, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_avx1_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+208]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_avx1_aesenc_128_ghash_avx_done:
-        vaesenclast	xmm8, xmm8, xmm7
-        vaesenclast	xmm9, xmm9, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx]
-        vmovdqu	xmm1, OWORD PTR [rcx+16]
-        vpxor	xmm8, xmm8, xmm0
-        vpxor	xmm9, xmm9, xmm1
-        vmovdqu	OWORD PTR [rdx], xmm8
-        vmovdqu	OWORD PTR [rdx+16], xmm9
-        vaesenclast	xmm10, xmm10, xmm7
-        vaesenclast	xmm11, xmm11, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+32]
-        vmovdqu	xmm1, OWORD PTR [rcx+48]
-        vpxor	xmm10, xmm10, xmm0
-        vpxor	xmm11, xmm11, xmm1
-        vmovdqu	OWORD PTR [rdx+32], xmm10
-        vmovdqu	OWORD PTR [rdx+48], xmm11
-        vaesenclast	xmm12, xmm12, xmm7
-        vaesenclast	xmm13, xmm13, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+64]
-        vmovdqu	xmm1, OWORD PTR [rcx+80]
-        vpxor	xmm12, xmm12, xmm0
-        vpxor	xmm13, xmm13, xmm1
-        vmovdqu	OWORD PTR [rdx+64], xmm12
-        vmovdqu	OWORD PTR [rdx+80], xmm13
-        vaesenclast	xmm14, xmm14, xmm7
-        vaesenclast	xmm15, xmm15, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+96]
-        vmovdqu	xmm1, OWORD PTR [rcx+112]
-        vpxor	xmm14, xmm14, xmm0
-        vpxor	xmm15, xmm15, xmm1
-        vmovdqu	OWORD PTR [rdx+96], xmm14
-        vmovdqu	OWORD PTR [rdx+112], xmm15
-        add	ebx, 128
-        cmp	ebx, r13d
-        jl	L_AES_GCM_encrypt_avx1_ghash_128
-L_AES_GCM_encrypt_avx1_end_128:
-        vmovdqa	xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpshufb	xmm8, xmm8, xmm4
-        vpshufb	xmm9, xmm9, xmm4
-        vpshufb	xmm10, xmm10, xmm4
-        vpshufb	xmm11, xmm11, xmm4
-        vpxor	xmm8, xmm8, xmm2
-        vpshufb	xmm12, xmm12, xmm4
-        vpshufb	xmm13, xmm13, xmm4
-        vpshufb	xmm14, xmm14, xmm4
-        vpshufb	xmm15, xmm15, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp]
-        vmovdqu	xmm5, OWORD PTR [rsp+16]
-        ; ghash_gfmul_avx
-        vpshufd	xmm1, xmm15, 78
-        vpshufd	xmm2, xmm7, 78
-        vpclmulqdq	xmm3, xmm7, xmm15, 17
-        vpclmulqdq	xmm0, xmm7, xmm15, 0
-        vpxor	xmm1, xmm1, xmm15
-        vpxor	xmm2, xmm2, xmm7
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm4, xmm0
-        vmovdqa	xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        ; ghash_gfmul_xor_avx
-        vpshufd	xmm1, xmm14, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm14, 17
-        vpclmulqdq	xmm0, xmm5, xmm14, 0
-        vpxor	xmm1, xmm1, xmm14
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vpxor	xmm4, xmm4, xmm0
-        vpxor	xmm6, xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        vmovdqu	xmm7, OWORD PTR [rsp+32]
-        vmovdqu	xmm5, OWORD PTR [rsp+48]
-        ; ghash_gfmul_xor_avx
-        vpshufd	xmm1, xmm13, 78
-        vpshufd	xmm2, xmm7, 78
-        vpclmulqdq	xmm3, xmm7, xmm13, 17
-        vpclmulqdq	xmm0, xmm7, xmm13, 0
-        vpxor	xmm1, xmm1, xmm13
-        vpxor	xmm2, xmm2, xmm7
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vpxor	xmm4, xmm4, xmm0
-        vpxor	xmm6, xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        ; ghash_gfmul_xor_avx
-        vpshufd	xmm1, xmm12, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm12, 17
-        vpclmulqdq	xmm0, xmm5, xmm12, 0
-        vpxor	xmm1, xmm1, xmm12
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vpxor	xmm4, xmm4, xmm0
-        vpxor	xmm6, xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        vmovdqu	xmm7, OWORD PTR [rsp+64]
-        vmovdqu	xmm5, OWORD PTR [rsp+80]
-        ; ghash_gfmul_xor_avx
-        vpshufd	xmm1, xmm11, 78
-        vpshufd	xmm2, xmm7, 78
-        vpclmulqdq	xmm3, xmm7, xmm11, 17
-        vpclmulqdq	xmm0, xmm7, xmm11, 0
-        vpxor	xmm1, xmm1, xmm11
-        vpxor	xmm2, xmm2, xmm7
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vpxor	xmm4, xmm4, xmm0
-        vpxor	xmm6, xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        ; ghash_gfmul_xor_avx
-        vpshufd	xmm1, xmm10, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm10, 17
-        vpclmulqdq	xmm0, xmm5, xmm10, 0
-        vpxor	xmm1, xmm1, xmm10
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vpxor	xmm4, xmm4, xmm0
-        vpxor	xmm6, xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        vmovdqu	xmm7, OWORD PTR [rsp+96]
-        vmovdqu	xmm5, OWORD PTR [rsp+112]
-        ; ghash_gfmul_xor_avx
-        vpshufd	xmm1, xmm9, 78
-        vpshufd	xmm2, xmm7, 78
-        vpclmulqdq	xmm3, xmm7, xmm9, 17
-        vpclmulqdq	xmm0, xmm7, xmm9, 0
-        vpxor	xmm1, xmm1, xmm9
-        vpxor	xmm2, xmm2, xmm7
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vpxor	xmm4, xmm4, xmm0
-        vpxor	xmm6, xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        ; ghash_gfmul_xor_avx
-        vpshufd	xmm1, xmm8, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm8, 17
-        vpclmulqdq	xmm0, xmm5, xmm8, 0
-        vpxor	xmm1, xmm1, xmm8
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vpxor	xmm4, xmm4, xmm0
-        vpxor	xmm6, xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        vpslld	xmm0, xmm4, 31
-        vpslld	xmm1, xmm4, 30
-        vpslld	xmm2, xmm4, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm4, xmm4, xmm0
-        vpsrld	xmm2, xmm4, 1
-        vpsrld	xmm3, xmm4, 2
-        vpsrld	xmm0, xmm4, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm4
-        vpxor	xmm6, xmm6, xmm2
-        vmovdqu	xmm5, OWORD PTR [rsp]
-L_AES_GCM_encrypt_avx1_done_128:
-        mov	edx, r9d
-        cmp	ebx, edx
-        jge	L_AES_GCM_encrypt_avx1_done_enc
-        mov	r13d, r9d
-        and	r13d, 4294967280
-        cmp	ebx, r13d
-        jge	L_AES_GCM_encrypt_avx1_last_block_done
-        vmovdqu	xmm9, OWORD PTR [rsp+128]
-        vpshufb	xmm8, xmm9, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpaddd	xmm9, xmm9, OWORD PTR L_avx1_aes_gcm_one
-        vmovdqu	OWORD PTR [rsp+128], xmm9
-        vpxor	xmm8, xmm8, [r15]
-        vaesenc	xmm8, xmm8, [r15+16]
-        vaesenc	xmm8, xmm8, [r15+32]
-        vaesenc	xmm8, xmm8, [r15+48]
-        vaesenc	xmm8, xmm8, [r15+64]
-        vaesenc	xmm8, xmm8, [r15+80]
-        vaesenc	xmm8, xmm8, [r15+96]
-        vaesenc	xmm8, xmm8, [r15+112]
-        vaesenc	xmm8, xmm8, [r15+128]
-        vaesenc	xmm8, xmm8, [r15+144]
-        cmp	r10d, 11
-        vmovdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_avx1_aesenc_block_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [r15+176]
-        cmp	r10d, 13
-        vmovdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_avx1_aesenc_block_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [r15+208]
-        vmovdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_avx1_aesenc_block_last:
-        vaesenclast	xmm8, xmm8, xmm9
-        vmovdqu	xmm9, OWORD PTR [rdi+rbx]
-        vpxor	xmm8, xmm8, xmm9
-        vmovdqu	OWORD PTR [rsi+rbx], xmm8
-        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm8
-        add	ebx, 16
-        cmp	ebx, r13d
-        jge	L_AES_GCM_encrypt_avx1_last_block_ghash
-L_AES_GCM_encrypt_avx1_last_block_start:
-        vmovdqu	xmm13, OWORD PTR [rdi+rbx]
-        vmovdqu	xmm9, OWORD PTR [rsp+128]
-        vpshufb	xmm8, xmm9, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpaddd	xmm9, xmm9, OWORD PTR L_avx1_aes_gcm_one
-        vmovdqu	OWORD PTR [rsp+128], xmm9
-        vpxor	xmm8, xmm8, [r15]
-        vpclmulqdq	xmm10, xmm6, xmm5, 16
-        vaesenc	xmm8, xmm8, [r15+16]
-        vaesenc	xmm8, xmm8, [r15+32]
-        vpclmulqdq	xmm11, xmm6, xmm5, 1
-        vaesenc	xmm8, xmm8, [r15+48]
-        vaesenc	xmm8, xmm8, [r15+64]
-        vpclmulqdq	xmm12, xmm6, xmm5, 0
-        vaesenc	xmm8, xmm8, [r15+80]
-        vpclmulqdq	xmm1, xmm6, xmm5, 17
-        vaesenc	xmm8, xmm8, [r15+96]
-        vpxor	xmm10, xmm10, xmm11
-        vpslldq	xmm2, xmm10, 8
-        vpsrldq	xmm10, xmm10, 8
-        vaesenc	xmm8, xmm8, [r15+112]
-        vpxor	xmm2, xmm2, xmm12
-        vpxor	xmm3, xmm1, xmm10
-        vmovdqa	xmm0, OWORD PTR L_avx1_aes_gcm_mod2_128
-        vpclmulqdq	xmm11, xmm2, xmm0, 16
-        vaesenc	xmm8, xmm8, [r15+128]
-        vpshufd	xmm10, xmm2, 78
-        vpxor	xmm10, xmm10, xmm11
-        vpclmulqdq	xmm11, xmm10, xmm0, 16
-        vaesenc	xmm8, xmm8, [r15+144]
-        vpshufd	xmm10, xmm10, 78
-        vpxor	xmm10, xmm10, xmm11
-        vpxor	xmm6, xmm10, xmm3
-        cmp	r10d, 11
-        vmovdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_avx1_aesenc_gfmul_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [r15+176]
-        cmp	r10d, 13
-        vmovdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_avx1_aesenc_gfmul_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [r15+208]
-        vmovdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_avx1_aesenc_gfmul_last:
-        vaesenclast	xmm8, xmm8, xmm9
-        vmovdqa	xmm0, xmm13
-        vpxor	xmm8, xmm8, xmm0
-        vmovdqu	OWORD PTR [rsi+rbx], xmm8
-        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        add	ebx, 16
-        vpxor	xmm6, xmm6, xmm8
-        cmp	ebx, r13d
-        jl	L_AES_GCM_encrypt_avx1_last_block_start
-L_AES_GCM_encrypt_avx1_last_block_ghash:
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm5, 78
-        vpshufd	xmm10, xmm6, 78
-        vpclmulqdq	xmm11, xmm6, xmm5, 17
-        vpclmulqdq	xmm8, xmm6, xmm5, 0
-        vpxor	xmm9, xmm9, xmm5
-        vpxor	xmm10, xmm10, xmm6
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm6, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm6, xmm6, xmm14
-L_AES_GCM_encrypt_avx1_last_block_done:
-        mov	ecx, r9d
-        mov	edx, ecx
-        and	ecx, 15
-        jz	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_done
-        vmovdqu	xmm4, OWORD PTR [rsp+128]
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpxor	xmm4, xmm4, [r15]
-        vaesenc	xmm4, xmm4, [r15+16]
-        vaesenc	xmm4, xmm4, [r15+32]
-        vaesenc	xmm4, xmm4, [r15+48]
-        vaesenc	xmm4, xmm4, [r15+64]
-        vaesenc	xmm4, xmm4, [r15+80]
-        vaesenc	xmm4, xmm4, [r15+96]
-        vaesenc	xmm4, xmm4, [r15+112]
-        vaesenc	xmm4, xmm4, [r15+128]
-        vaesenc	xmm4, xmm4, [r15+144]
-        cmp	r10d, 11
-        vmovdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last
-        vaesenc	xmm4, xmm4, xmm9
-        vaesenc	xmm4, xmm4, [r15+176]
-        cmp	r10d, 13
-        vmovdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last
-        vaesenc	xmm4, xmm4, xmm9
-        vaesenc	xmm4, xmm4, [r15+208]
-        vmovdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last:
-        vaesenclast	xmm4, xmm4, xmm9
-        sub	rsp, 16
-        xor	ecx, ecx
-        vmovdqu	OWORD PTR [rsp], xmm4
-L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_loop:
-        movzx	r13d, BYTE PTR [rdi+rbx]
-        xor	r13b, BYTE PTR [rsp+rcx]
-        mov	BYTE PTR [rsi+rbx], r13b
-        mov	BYTE PTR [rsp+rcx], r13b
-        inc	ebx
-        inc	ecx
-        cmp	ebx, edx
-        jl	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_loop
-        xor	r13, r13
-        cmp	ecx, 16
-        je	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_finish_enc
-L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_byte_loop:
-        mov	BYTE PTR [rsp+rcx], r13b
-        inc	ecx
-        cmp	ecx, 16
-        jl	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_byte_loop
-L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_finish_enc:
-        vmovdqu	xmm4, OWORD PTR [rsp]
-        add	rsp, 16
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm4
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm5, 78
-        vpshufd	xmm10, xmm6, 78
-        vpclmulqdq	xmm11, xmm6, xmm5, 17
-        vpclmulqdq	xmm8, xmm6, xmm5, 0
-        vpxor	xmm9, xmm9, xmm5
-        vpxor	xmm10, xmm10, xmm6
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm6, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm6, xmm6, xmm14
-L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_done:
-L_AES_GCM_encrypt_avx1_done_enc:
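-        ; Fold in the final length block (AAD and ciphertext lengths in bits)
-        ; with one more GF(2^128) multiply to complete GHASH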
-        mov	edx, r9d
-        mov	ecx, r11d
-        shl	rdx, 3
-        shl	rcx, 3
-        vmovq	xmm0, rdx
-        vmovq	xmm1, rcx
-        vpunpcklqdq	xmm0, xmm0, xmm1
-        vpxor	xmm6, xmm6, xmm0
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm5, 78
-        vpshufd	xmm10, xmm6, 78
-        vpclmulqdq	xmm11, xmm6, xmm5, 17
-        vpclmulqdq	xmm8, xmm6, xmm5, 0
-        vpxor	xmm9, xmm9, xmm5
-        vpxor	xmm10, xmm10, xmm6
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm6, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm6, xmm6, xmm14
-        vpshufb	xmm6, xmm6, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vmovdqu	xmm0, OWORD PTR [rsp+144]
-        vpxor	xmm0, xmm0, xmm6
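-        ; xmm0 now holds the tag T = E_K(J0) xor GHASH; store r14d bytes of it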
-        cmp	r14d, 16
-        je	L_AES_GCM_encrypt_avx1_store_tag_16
-        xor	rcx, rcx
-        vmovdqu	OWORD PTR [rsp], xmm0
-L_AES_GCM_encrypt_avx1_store_tag_loop:
-        movzx	r13d, BYTE PTR [rsp+rcx]
-        mov	BYTE PTR [r8+rcx], r13b
-        inc	ecx
-        cmp	ecx, r14d
-        jne	L_AES_GCM_encrypt_avx1_store_tag_loop
-        jmp	L_AES_GCM_encrypt_avx1_store_tag_done
-L_AES_GCM_encrypt_avx1_store_tag_16:
-        vmovdqu	OWORD PTR [r8], xmm0
-L_AES_GCM_encrypt_avx1_store_tag_done:
-        vzeroupper
-        add	rsp, 160
-        pop	r15
-        pop	r14
-        pop	rbx
-        pop	r12
-        pop	rsi
-        pop	rdi
-        pop	r13
-        ret
-AES_GCM_encrypt_avx1 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_decrypt_avx1 PROC
-        push	r13
-        push	rdi
-        push	rsi
-        push	r12
-        push	rbx
-        push	r14
-        push	r15
-        push	rbp
-        mov	rdi, rcx
-        mov	rsi, rdx
-        mov	r12, r8
-        mov	rax, r9
-        mov	r8, QWORD PTR [rsp+104]
-        mov	r9d, DWORD PTR [rsp+112]
-        mov	r11d, DWORD PTR [rsp+120]
-        mov	ebx, DWORD PTR [rsp+128]
-        mov	r14d, DWORD PTR [rsp+136]
-        mov	r15, QWORD PTR [rsp+144]
-        mov	r10d, DWORD PTR [rsp+152]
-        mov	rbp, QWORD PTR [rsp+160]
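-        ; Win64 ABI: args 1-4 arrive in rcx/rdx/r8/r9; args 5+ are read from
-        ; the stack past the return address, 32-byte shadow space and the
-        ; eight saved registers (hence the first stack arg at [rsp+104])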
-        sub	rsp, 168
-        vpxor	xmm4, xmm4, xmm4
-        vpxor	xmm6, xmm6, xmm6
-        cmp	ebx, 12
-        mov	edx, ebx
-        jne	L_AES_GCM_decrypt_avx1_iv_not_12
-        ; Calculate values when IV is 12 bytes
-        ; Set counter based on IV
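-        ; Per SP 800-38D, a 12-byte IV gives J0 = IV || 0^31 || 1;
-        ; 16777216 = 01000000h places big-endian 1 in the last four bytes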
-        mov	ecx, 16777216
-        vmovq	xmm4, QWORD PTR [rax]
-        vpinsrd	xmm4, xmm4, DWORD PTR [rax+8], 2
-        vpinsrd	xmm4, xmm4, ecx, 3
-        ; H = Encrypt X(=0) and T = Encrypt counter
-        vmovdqa	xmm5, OWORD PTR [r15]
-        vpxor	xmm1, xmm4, xmm5
-        vmovdqa	xmm7, OWORD PTR [r15+16]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+32]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+48]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+64]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+80]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+96]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+112]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+128]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+144]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        cmp	r10d, 11
-        vmovdqa	xmm7, OWORD PTR [r15+160]
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_12_last
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+176]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        cmp	r10d, 13
-        vmovdqa	xmm7, OWORD PTR [r15+192]
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_12_last
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+208]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+224]
-L_AES_GCM_decrypt_avx1_calc_iv_12_last:
-        vaesenclast	xmm5, xmm5, xmm7
-        vaesenclast	xmm1, xmm1, xmm7
-        vpshufb	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vmovdqu	OWORD PTR [rsp+144], xmm1
-        jmp	L_AES_GCM_decrypt_avx1_iv_done
-L_AES_GCM_decrypt_avx1_iv_not_12:
-        ; Calculate values when IV is not 12 bytes
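-        ; Otherwise J0 = GHASH(IV zero-padded to a 16-byte boundary, followed
-        ; by a block holding the 64-bit IV length in bits)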
-        ; H = Encrypt X(=0)
-        vmovdqa	xmm5, OWORD PTR [r15]
-        vaesenc	xmm5, xmm5, [r15+16]
-        vaesenc	xmm5, xmm5, [r15+32]
-        vaesenc	xmm5, xmm5, [r15+48]
-        vaesenc	xmm5, xmm5, [r15+64]
-        vaesenc	xmm5, xmm5, [r15+80]
-        vaesenc	xmm5, xmm5, [r15+96]
-        vaesenc	xmm5, xmm5, [r15+112]
-        vaesenc	xmm5, xmm5, [r15+128]
-        vaesenc	xmm5, xmm5, [r15+144]
-        cmp	r10d, 11
-        vmovdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last
-        vaesenc	xmm5, xmm5, xmm9
-        vaesenc	xmm5, xmm5, [r15+176]
-        cmp	r10d, 13
-        vmovdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last
-        vaesenc	xmm5, xmm5, xmm9
-        vaesenc	xmm5, xmm5, [r15+208]
-        vmovdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last:
-        vaesenclast	xmm5, xmm5, xmm9
-        vpshufb	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        ; Calc counter
-        ; Initialization vector
-        cmp	edx, 0
-        mov	rcx, 0
-        je	L_AES_GCM_decrypt_avx1_calc_iv_done
-        cmp	edx, 16
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_lt16
-        and	edx, 4294967280
-L_AES_GCM_decrypt_avx1_calc_iv_16_loop:
-        vmovdqu	xmm8, OWORD PTR [rax+rcx]
-        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm4, xmm4, xmm8
-        ; ghash_gfmul_avx
-        vpshufd	xmm1, xmm4, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpxor	xmm1, xmm1, xmm4
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm7, xmm0
-        vmovdqa	xmm4, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm7, xmm7, xmm2
-        vpxor	xmm4, xmm4, xmm1
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm4, xmm4, xmm1
-        vpslld	xmm0, xmm7, 31
-        vpslld	xmm1, xmm7, 30
-        vpslld	xmm2, xmm7, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm7, xmm7, xmm0
-        vpsrld	xmm2, xmm7, 1
-        vpsrld	xmm3, xmm7, 2
-        vpsrld	xmm0, xmm7, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm4, xmm4, xmm2
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_16_loop
-        mov	edx, ebx
-        cmp	ecx, edx
-        je	L_AES_GCM_decrypt_avx1_calc_iv_done
-L_AES_GCM_decrypt_avx1_calc_iv_lt16:
-        sub	rsp, 16
-        vpxor	xmm8, xmm8, xmm8
-        xor	ebx, ebx
-        vmovdqu	OWORD PTR [rsp], xmm8
-L_AES_GCM_decrypt_avx1_calc_iv_loop:
-        movzx	r13d, BYTE PTR [rax+rcx]
-        mov	BYTE PTR [rsp+rbx], r13b
-        inc	ecx
-        inc	ebx
-        cmp	ecx, edx
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_loop
-        vmovdqu	xmm8, OWORD PTR [rsp]
-        add	rsp, 16
-        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm4, xmm4, xmm8
-        ; ghash_gfmul_avx
-        vpshufd	xmm1, xmm4, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpxor	xmm1, xmm1, xmm4
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm7, xmm0
-        vmovdqa	xmm4, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm7, xmm7, xmm2
-        vpxor	xmm4, xmm4, xmm1
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm4, xmm4, xmm1
-        vpslld	xmm0, xmm7, 31
-        vpslld	xmm1, xmm7, 30
-        vpslld	xmm2, xmm7, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm7, xmm7, xmm0
-        vpsrld	xmm2, xmm7, 1
-        vpsrld	xmm3, xmm7, 2
-        vpsrld	xmm0, xmm7, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm4, xmm4, xmm2
-L_AES_GCM_decrypt_avx1_calc_iv_done:
-        ; T = Encrypt counter
-        vpxor	xmm0, xmm0, xmm0
-        shl	edx, 3
-        vmovq	xmm0, rdx
-        vpxor	xmm4, xmm4, xmm0
-        ; ghash_gfmul_avx
-        vpshufd	xmm1, xmm4, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpxor	xmm1, xmm1, xmm4
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm7, xmm0
-        vmovdqa	xmm4, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm7, xmm7, xmm2
-        vpxor	xmm4, xmm4, xmm1
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm4, xmm4, xmm1
-        vpslld	xmm0, xmm7, 31
-        vpslld	xmm1, xmm7, 30
-        vpslld	xmm2, xmm7, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm7, xmm7, xmm0
-        vpsrld	xmm2, xmm7, 1
-        vpsrld	xmm3, xmm7, 2
-        vpsrld	xmm0, xmm7, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm4, xmm4, xmm2
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        ;   Encrypt counter
-        vmovdqa	xmm8, OWORD PTR [r15]
-        vpxor	xmm8, xmm8, xmm4
-        vaesenc	xmm8, xmm8, [r15+16]
-        vaesenc	xmm8, xmm8, [r15+32]
-        vaesenc	xmm8, xmm8, [r15+48]
-        vaesenc	xmm8, xmm8, [r15+64]
-        vaesenc	xmm8, xmm8, [r15+80]
-        vaesenc	xmm8, xmm8, [r15+96]
-        vaesenc	xmm8, xmm8, [r15+112]
-        vaesenc	xmm8, xmm8, [r15+128]
-        vaesenc	xmm8, xmm8, [r15+144]
-        cmp	r10d, 11
-        vmovdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [r15+176]
-        cmp	r10d, 13
-        vmovdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [r15+208]
-        vmovdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last:
-        vaesenclast	xmm8, xmm8, xmm9
-        vmovdqu	OWORD PTR [rsp+144], xmm8
-L_AES_GCM_decrypt_avx1_iv_done:
-        ; Additional authentication data
-        mov	edx, r11d
-        cmp	edx, 0
-        je	L_AES_GCM_decrypt_avx1_calc_aad_done
-        xor	ecx, ecx
-        cmp	edx, 16
-        jl	L_AES_GCM_decrypt_avx1_calc_aad_lt16
-        and	edx, 4294967280
-L_AES_GCM_decrypt_avx1_calc_aad_16_loop:
-        vmovdqu	xmm8, OWORD PTR [r12+rcx]
-        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm8
-        ; ghash_gfmul_avx
-        vpshufd	xmm1, xmm6, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm6, 17
-        vpclmulqdq	xmm0, xmm5, xmm6, 0
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm7, xmm0
-        vmovdqa	xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm7, xmm7, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm6, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm6, xmm6, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm6, xmm6, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm6, xmm6, xmm1
-        vpslld	xmm0, xmm7, 31
-        vpslld	xmm1, xmm7, 30
-        vpslld	xmm2, xmm7, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm7, xmm7, xmm0
-        vpsrld	xmm2, xmm7, 1
-        vpsrld	xmm3, xmm7, 2
-        vpsrld	xmm0, xmm7, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm6, xmm6, xmm2
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_decrypt_avx1_calc_aad_16_loop
-        mov	edx, r11d
-        cmp	ecx, edx
-        je	L_AES_GCM_decrypt_avx1_calc_aad_done
-L_AES_GCM_decrypt_avx1_calc_aad_lt16:
-        sub	rsp, 16
-        vpxor	xmm8, xmm8, xmm8
-        xor	ebx, ebx
-        vmovdqu	OWORD PTR [rsp], xmm8
-L_AES_GCM_decrypt_avx1_calc_aad_loop:
-        movzx	r13d, BYTE PTR [r12+rcx]
-        mov	BYTE PTR [rsp+rbx], r13b
-        inc	ecx
-        inc	ebx
-        cmp	ecx, edx
-        jl	L_AES_GCM_decrypt_avx1_calc_aad_loop
-        vmovdqu	xmm8, OWORD PTR [rsp]
-        add	rsp, 16
-        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm8
-        ; ghash_gfmul_avx
-        vpshufd	xmm1, xmm6, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm6, 17
-        vpclmulqdq	xmm0, xmm5, xmm6, 0
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm7, xmm0
-        vmovdqa	xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm7, xmm7, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm6, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm6, xmm6, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm6, xmm6, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm6, xmm6, xmm1
-        vpslld	xmm0, xmm7, 31
-        vpslld	xmm1, xmm7, 30
-        vpslld	xmm2, xmm7, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm7, xmm7, xmm0
-        vpsrld	xmm2, xmm7, 1
-        vpsrld	xmm3, xmm7, 2
-        vpsrld	xmm0, xmm7, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm6, xmm6, xmm2
-L_AES_GCM_decrypt_avx1_calc_aad_done:
-        ; Calculate counter and H
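-        ; Double H in GF(2^128): 128-bit left shift by one, xoring the
-        ; reduction constant when the top bit was set; byte-swap and
-        ; increment the counter block to address the first data block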
-        vpsrlq	xmm9, xmm5, 63
-        vpsllq	xmm8, xmm5, 1
-        vpslldq	xmm9, xmm9, 8
-        vpor	xmm8, xmm8, xmm9
-        vpshufd	xmm5, xmm5, 255
-        vpsrad	xmm5, xmm5, 31
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpand	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_mod2_128
-        vpaddd	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_one
-        vpxor	xmm5, xmm5, xmm8
-        vmovdqu	OWORD PTR [rsp+128], xmm4
-        xor	ebx, ebx
-        cmp	r9d, 128
-        mov	r13d, r9d
-        jl	L_AES_GCM_decrypt_avx1_done_128
-        and	r13d, 4294967168
-        vmovdqa	xmm2, xmm6
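-        ; Precompute H^1..H^8 at [rsp]..[rsp+112] so the 128-byte loop can
-        ; defer reduction across eight blocks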
-        ; H ^ 1
-        vmovdqu	OWORD PTR [rsp], xmm5
-        ; H ^ 2
-        vpclmulqdq	xmm8, xmm5, xmm5, 0
-        vpclmulqdq	xmm0, xmm5, xmm5, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm0, xmm0, xmm14
-        vmovdqu	OWORD PTR [rsp+16], xmm0
-        ; H ^ 3
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm5, 78
-        vpshufd	xmm10, xmm0, 78
-        vpclmulqdq	xmm11, xmm0, xmm5, 17
-        vpclmulqdq	xmm8, xmm0, xmm5, 0
-        vpxor	xmm9, xmm9, xmm5
-        vpxor	xmm10, xmm10, xmm0
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm1, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm1, xmm1, xmm14
-        vmovdqu	OWORD PTR [rsp+32], xmm1
-        ; H ^ 4
-        vpclmulqdq	xmm8, xmm0, xmm0, 0
-        vpclmulqdq	xmm3, xmm0, xmm0, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm3, xmm3, xmm14
-        vmovdqu	OWORD PTR [rsp+48], xmm3
-        ; H ^ 5
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm0, 78
-        vpshufd	xmm10, xmm1, 78
-        vpclmulqdq	xmm11, xmm1, xmm0, 17
-        vpclmulqdq	xmm8, xmm1, xmm0, 0
-        vpxor	xmm9, xmm9, xmm0
-        vpxor	xmm10, xmm10, xmm1
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm7, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+64], xmm7
-        ; H ^ 6
-        vpclmulqdq	xmm8, xmm1, xmm1, 0
-        vpclmulqdq	xmm7, xmm1, xmm1, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+80], xmm7
-        ; H ^ 7
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm1, 78
-        vpshufd	xmm10, xmm3, 78
-        vpclmulqdq	xmm11, xmm3, xmm1, 17
-        vpclmulqdq	xmm8, xmm3, xmm1, 0
-        vpxor	xmm9, xmm9, xmm1
-        vpxor	xmm10, xmm10, xmm3
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm7, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+96], xmm7
-        ; H ^ 8
-        vpclmulqdq	xmm8, xmm3, xmm3, 0
-        vpclmulqdq	xmm7, xmm3, xmm3, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+112], xmm7
-L_AES_GCM_decrypt_avx1_ghash_128:
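-        ; Main loop: 128 bytes per iteration; eight counter blocks run the
-        ; AES rounds while the eight ciphertext blocks are multiplied by
-        ; H^8..H^1, with a single GHASH reduction per iteration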
-        lea	rcx, QWORD PTR [rdi+rbx]
-        lea	rdx, QWORD PTR [rsi+rbx]
-        vmovdqu	xmm0, OWORD PTR [rsp+128]
-        vmovdqa	xmm1, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpshufb	xmm8, xmm0, xmm1
-        vpaddd	xmm9, xmm0, OWORD PTR L_avx1_aes_gcm_one
-        vpshufb	xmm9, xmm9, xmm1
-        vpaddd	xmm10, xmm0, OWORD PTR L_avx1_aes_gcm_two
-        vpshufb	xmm10, xmm10, xmm1
-        vpaddd	xmm11, xmm0, OWORD PTR L_avx1_aes_gcm_three
-        vpshufb	xmm11, xmm11, xmm1
-        vpaddd	xmm12, xmm0, OWORD PTR L_avx1_aes_gcm_four
-        vpshufb	xmm12, xmm12, xmm1
-        vpaddd	xmm13, xmm0, OWORD PTR L_avx1_aes_gcm_five
-        vpshufb	xmm13, xmm13, xmm1
-        vpaddd	xmm14, xmm0, OWORD PTR L_avx1_aes_gcm_six
-        vpshufb	xmm14, xmm14, xmm1
-        vpaddd	xmm15, xmm0, OWORD PTR L_avx1_aes_gcm_seven
-        vpshufb	xmm15, xmm15, xmm1
-        vpaddd	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_eight
-        vmovdqa	xmm7, OWORD PTR [r15]
-        vmovdqu	OWORD PTR [rsp+128], xmm0
-        vpxor	xmm8, xmm8, xmm7
-        vpxor	xmm9, xmm9, xmm7
-        vpxor	xmm10, xmm10, xmm7
-        vpxor	xmm11, xmm11, xmm7
-        vpxor	xmm12, xmm12, xmm7
-        vpxor	xmm13, xmm13, xmm7
-        vpxor	xmm14, xmm14, xmm7
-        vpxor	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsp+112]
-        vmovdqu	xmm0, OWORD PTR [rcx]
-        vaesenc	xmm8, xmm8, [r15+16]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm0, xmm0, xmm2
-        vpshufd	xmm1, xmm7, 78
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm3, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+16]
-        vaesenc	xmm10, xmm10, [r15+16]
-        vpclmulqdq	xmm2, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+16]
-        vaesenc	xmm12, xmm12, [r15+16]
-        vpclmulqdq	xmm1, xmm1, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+16]
-        vaesenc	xmm14, xmm14, [r15+16]
-        vaesenc	xmm15, xmm15, [r15+16]
-        vpxor	xmm1, xmm1, xmm2
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqu	xmm7, OWORD PTR [rsp+96]
-        vmovdqu	xmm0, OWORD PTR [rcx+16]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [r15+32]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+32]
-        vaesenc	xmm10, xmm10, [r15+32]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+32]
-        vaesenc	xmm12, xmm12, [r15+32]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+32]
-        vaesenc	xmm14, xmm14, [r15+32]
-        vaesenc	xmm15, xmm15, [r15+32]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+80]
-        vmovdqu	xmm0, OWORD PTR [rcx+32]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [r15+48]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+48]
-        vaesenc	xmm10, xmm10, [r15+48]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+48]
-        vaesenc	xmm12, xmm12, [r15+48]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+48]
-        vaesenc	xmm14, xmm14, [r15+48]
-        vaesenc	xmm15, xmm15, [r15+48]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+64]
-        vmovdqu	xmm0, OWORD PTR [rcx+48]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [r15+64]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+64]
-        vaesenc	xmm10, xmm10, [r15+64]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+64]
-        vaesenc	xmm12, xmm12, [r15+64]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+64]
-        vaesenc	xmm14, xmm14, [r15+64]
-        vaesenc	xmm15, xmm15, [r15+64]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+48]
-        vmovdqu	xmm0, OWORD PTR [rcx+64]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [r15+80]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+80]
-        vaesenc	xmm10, xmm10, [r15+80]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+80]
-        vaesenc	xmm12, xmm12, [r15+80]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+80]
-        vaesenc	xmm14, xmm14, [r15+80]
-        vaesenc	xmm15, xmm15, [r15+80]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+32]
-        vmovdqu	xmm0, OWORD PTR [rcx+80]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [r15+96]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+96]
-        vaesenc	xmm10, xmm10, [r15+96]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+96]
-        vaesenc	xmm12, xmm12, [r15+96]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+96]
-        vaesenc	xmm14, xmm14, [r15+96]
-        vaesenc	xmm15, xmm15, [r15+96]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vmovdqu	xmm0, OWORD PTR [rcx+96]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [r15+112]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+112]
-        vaesenc	xmm10, xmm10, [r15+112]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+112]
-        vaesenc	xmm12, xmm12, [r15+112]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+112]
-        vaesenc	xmm14, xmm14, [r15+112]
-        vaesenc	xmm15, xmm15, [r15+112]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp]
-        vmovdqu	xmm0, OWORD PTR [rcx+112]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [r15+128]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [r15+128]
-        vaesenc	xmm10, xmm10, [r15+128]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [r15+128]
-        vaesenc	xmm12, xmm12, [r15+128]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [r15+128]
-        vaesenc	xmm14, xmm14, [r15+128]
-        vaesenc	xmm15, xmm15, [r15+128]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vpslldq	xmm5, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vaesenc	xmm8, xmm8, [r15+144]
-        vpxor	xmm2, xmm2, xmm5
-        vpxor	xmm3, xmm3, xmm1
-        vaesenc	xmm9, xmm9, [r15+144]
-        vpslld	xmm7, xmm2, 31
-        vpslld	xmm4, xmm2, 30
-        vpslld	xmm5, xmm2, 25
-        vaesenc	xmm10, xmm10, [r15+144]
-        vpxor	xmm7, xmm7, xmm4
-        vpxor	xmm7, xmm7, xmm5
-        vaesenc	xmm11, xmm11, [r15+144]
-        vpsrldq	xmm4, xmm7, 4
-        vpslldq	xmm7, xmm7, 12
-        vaesenc	xmm12, xmm12, [r15+144]
-        vpxor	xmm2, xmm2, xmm7
-        vpsrld	xmm5, xmm2, 1
-        vaesenc	xmm13, xmm13, [r15+144]
-        vpsrld	xmm1, xmm2, 2
-        vpsrld	xmm0, xmm2, 7
-        vaesenc	xmm14, xmm14, [r15+144]
-        vpxor	xmm5, xmm5, xmm1
-        vpxor	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, [r15+144]
-        vpxor	xmm5, xmm5, xmm4
-        vpxor	xmm2, xmm2, xmm5
-        vpxor	xmm2, xmm2, xmm3
-        cmp	r10d, 11
-        vmovdqa	xmm7, OWORD PTR [r15+160]
-        jl	L_AES_GCM_decrypt_avx1_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+176]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r10d, 13
-        vmovdqa	xmm7, OWORD PTR [r15+192]
-        jl	L_AES_GCM_decrypt_avx1_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+208]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [r15+224]
-L_AES_GCM_decrypt_avx1_aesenc_128_ghash_avx_done:
-        vaesenclast	xmm8, xmm8, xmm7
-        vaesenclast	xmm9, xmm9, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx]
-        vmovdqu	xmm1, OWORD PTR [rcx+16]
-        vpxor	xmm8, xmm8, xmm0
-        vpxor	xmm9, xmm9, xmm1
-        vmovdqu	OWORD PTR [rdx], xmm8
-        vmovdqu	OWORD PTR [rdx+16], xmm9
-        vaesenclast	xmm10, xmm10, xmm7
-        vaesenclast	xmm11, xmm11, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+32]
-        vmovdqu	xmm1, OWORD PTR [rcx+48]
-        vpxor	xmm10, xmm10, xmm0
-        vpxor	xmm11, xmm11, xmm1
-        vmovdqu	OWORD PTR [rdx+32], xmm10
-        vmovdqu	OWORD PTR [rdx+48], xmm11
-        vaesenclast	xmm12, xmm12, xmm7
-        vaesenclast	xmm13, xmm13, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+64]
-        vmovdqu	xmm1, OWORD PTR [rcx+80]
-        vpxor	xmm12, xmm12, xmm0
-        vpxor	xmm13, xmm13, xmm1
-        vmovdqu	OWORD PTR [rdx+64], xmm12
-        vmovdqu	OWORD PTR [rdx+80], xmm13
-        vaesenclast	xmm14, xmm14, xmm7
-        vaesenclast	xmm15, xmm15, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+96]
-        vmovdqu	xmm1, OWORD PTR [rcx+112]
-        vpxor	xmm14, xmm14, xmm0
-        vpxor	xmm15, xmm15, xmm1
-        vmovdqu	OWORD PTR [rdx+96], xmm14
-        vmovdqu	OWORD PTR [rdx+112], xmm15
-        add	ebx, 128
-        cmp	ebx, r13d
-        jl	L_AES_GCM_decrypt_avx1_ghash_128
-        vmovdqa	xmm6, xmm2
-        vmovdqu	xmm5, OWORD PTR [rsp]
-L_AES_GCM_decrypt_avx1_done_128:
-        mov	edx, r9d
-        cmp	ebx, edx
-        jge	L_AES_GCM_decrypt_avx1_done_dec
-        mov	r13d, r9d
-        and	r13d, 4294967280
-        cmp	ebx, r13d
-        jge	L_AES_GCM_decrypt_avx1_last_block_done
-L_AES_GCM_decrypt_avx1_last_block_start:
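-        ; Remaining full blocks: fold the ciphertext block into GHASH while
-        ; E_K(counter) is computed, then xor to recover the plaintext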
-        vmovdqu	xmm13, OWORD PTR [rdi+rbx]
-        vmovdqa	xmm0, xmm5
-        vpshufb	xmm1, xmm13, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm1, xmm1, xmm6
-        vmovdqu	xmm9, OWORD PTR [rsp+128]
-        vpshufb	xmm8, xmm9, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpaddd	xmm9, xmm9, OWORD PTR L_avx1_aes_gcm_one
-        vmovdqu	OWORD PTR [rsp+128], xmm9
-        vpxor	xmm8, xmm8, [r15]
-        vpclmulqdq	xmm10, xmm1, xmm0, 16
-        vaesenc	xmm8, xmm8, [r15+16]
-        vaesenc	xmm8, xmm8, [r15+32]
-        vpclmulqdq	xmm11, xmm1, xmm0, 1
-        vaesenc	xmm8, xmm8, [r15+48]
-        vaesenc	xmm8, xmm8, [r15+64]
-        vpclmulqdq	xmm12, xmm1, xmm0, 0
-        vaesenc	xmm8, xmm8, [r15+80]
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vaesenc	xmm8, xmm8, [r15+96]
-        vpxor	xmm10, xmm10, xmm11
-        vpslldq	xmm2, xmm10, 8
-        vpsrldq	xmm10, xmm10, 8
-        vaesenc	xmm8, xmm8, [r15+112]
-        vpxor	xmm2, xmm2, xmm12
-        vpxor	xmm3, xmm1, xmm10
-        vmovdqa	xmm0, OWORD PTR L_avx1_aes_gcm_mod2_128
-        vpclmulqdq	xmm11, xmm2, xmm0, 16
-        vaesenc	xmm8, xmm8, [r15+128]
-        vpshufd	xmm10, xmm2, 78
-        vpxor	xmm10, xmm10, xmm11
-        vpclmulqdq	xmm11, xmm10, xmm0, 16
-        vaesenc	xmm8, xmm8, [r15+144]
-        vpshufd	xmm10, xmm10, 78
-        vpxor	xmm10, xmm10, xmm11
-        vpxor	xmm6, xmm10, xmm3
-        cmp	r10d, 11
-        vmovdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_decrypt_avx1_aesenc_gfmul_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [r15+176]
-        cmp	r10d, 13
-        vmovdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_decrypt_avx1_aesenc_gfmul_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [r15+208]
-        vmovdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_decrypt_avx1_aesenc_gfmul_last:
-        vaesenclast	xmm8, xmm8, xmm9
-        vmovdqa	xmm0, xmm13
-        vpxor	xmm8, xmm8, xmm0
-        vmovdqu	OWORD PTR [rsi+rbx], xmm8
-        add	ebx, 16
-        cmp	ebx, r13d
-        jl	L_AES_GCM_decrypt_avx1_last_block_start
-L_AES_GCM_decrypt_avx1_last_block_done:
-        mov	ecx, r9d
-        mov	edx, ecx
-        and	ecx, 15
-        jz	L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_done
-        vmovdqu	xmm4, OWORD PTR [rsp+128]
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpxor	xmm4, xmm4, [r15]
-        vaesenc	xmm4, xmm4, [r15+16]
-        vaesenc	xmm4, xmm4, [r15+32]
-        vaesenc	xmm4, xmm4, [r15+48]
-        vaesenc	xmm4, xmm4, [r15+64]
-        vaesenc	xmm4, xmm4, [r15+80]
-        vaesenc	xmm4, xmm4, [r15+96]
-        vaesenc	xmm4, xmm4, [r15+112]
-        vaesenc	xmm4, xmm4, [r15+128]
-        vaesenc	xmm4, xmm4, [r15+144]
-        cmp	r10d, 11
-        vmovdqa	xmm9, OWORD PTR [r15+160]
-        jl	L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last
-        vaesenc	xmm4, xmm4, xmm9
-        vaesenc	xmm4, xmm4, [r15+176]
-        cmp	r10d, 13
-        vmovdqa	xmm9, OWORD PTR [r15+192]
-        jl	L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last
-        vaesenc	xmm4, xmm4, xmm9
-        vaesenc	xmm4, xmm4, [r15+208]
-        vmovdqa	xmm9, OWORD PTR [r15+224]
-L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last:
-        vaesenclast	xmm4, xmm4, xmm9
-        sub	rsp, 32
-        xor	ecx, ecx
-        vmovdqu	OWORD PTR [rsp], xmm4
-        vpxor	xmm0, xmm0, xmm0
-        vmovdqu	OWORD PTR [rsp+16], xmm0
-L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_loop:
-        movzx	r13d, BYTE PTR [rdi+rbx]
-        mov	BYTE PTR [rsp+rcx+16], r13b
-        xor	r13b, BYTE PTR [rsp+rcx]
-        mov	BYTE PTR [rsi+rbx], r13b
-        inc	ebx
-        inc	ecx
-        cmp	ebx, edx
-        jl	L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_loop
-        vmovdqu	xmm4, OWORD PTR [rsp+16]
-        add	rsp, 32
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm4
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm5, 78
-        vpshufd	xmm10, xmm6, 78
-        vpclmulqdq	xmm11, xmm6, xmm5, 17
-        vpclmulqdq	xmm8, xmm6, xmm5, 0
-        vpxor	xmm9, xmm9, xmm5
-        vpxor	xmm10, xmm10, xmm6
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm6, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm6, xmm6, xmm14
-L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_done:
-L_AES_GCM_decrypt_avx1_done_dec:
-        mov	edx, r9d
-        mov	ecx, r11d
-        shl	rdx, 3
-        shl	rcx, 3
-        vmovq	xmm0, rdx
-        vmovq	xmm1, rcx
-        vpunpcklqdq	xmm0, xmm0, xmm1
-        vpxor	xmm6, xmm6, xmm0
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm5, 78
-        vpshufd	xmm10, xmm6, 78
-        vpclmulqdq	xmm11, xmm6, xmm5, 17
-        vpclmulqdq	xmm8, xmm6, xmm5, 0
-        vpxor	xmm9, xmm9, xmm5
-        vpxor	xmm10, xmm10, xmm6
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm6, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm6, xmm6, xmm14
-        vpshufb	xmm6, xmm6, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vmovdqu	xmm0, OWORD PTR [rsp+144]
-        vpxor	xmm0, xmm0, xmm6
-        cmp	r14d, 16
-        je	L_AES_GCM_decrypt_avx1_cmp_tag_16
-        sub	rsp, 16
-        xor	rcx, rcx
-        xor	rbx, rbx
-        vmovdqu	OWORD PTR [rsp], xmm0
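-        ; Constant-time compare for tags shorter than 16 bytes: OR together
-        ; the xor of every byte pair so timing is independent of where a
-        ; mismatch occurs; bl becomes 1 only if all bytes match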
-L_AES_GCM_decrypt_avx1_cmp_tag_loop:
-        movzx	r13d, BYTE PTR [rsp+rcx]
-        xor	r13b, BYTE PTR [r8+rcx]
-        or	bl, r13b
-        inc	ecx
-        cmp	ecx, r14d
-        jne	L_AES_GCM_decrypt_avx1_cmp_tag_loop
-        cmp	rbx, 0
-        sete	bl
-        add	rsp, 16
-        xor	rcx, rcx
-        jmp	L_AES_GCM_decrypt_avx1_cmp_tag_done
-L_AES_GCM_decrypt_avx1_cmp_tag_16:
-        vmovdqu	xmm1, OWORD PTR [r8]
-        vpcmpeqb	xmm0, xmm0, xmm1
-        vpmovmskb	rdx, xmm0
-        ; if edx == 65535 (all 16 tag bytes equal) return 1, else return 0
-        xor	ebx, ebx
-        cmp	edx, 65535
-        sete	bl
-L_AES_GCM_decrypt_avx1_cmp_tag_done:
-        mov	DWORD PTR [rbp], ebx
-        vzeroupper
-        add	rsp, 168
-        pop	rbp
-        pop	r15
-        pop	r14
-        pop	rbx
-        pop	r12
-        pop	rsi
-        pop	rdi
-        pop	r13
-        ret
-AES_GCM_decrypt_avx1 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_init_avx1 PROC
-        push	rdi
-        push	rsi
-        push	r12
-        push	r13
-        mov	rdi, rcx
-        mov	rsi, rdx
-        mov	r10, r8
-        mov	r11d, r9d
-        mov	rax, QWORD PTR [rsp+72]
-        mov	r8, QWORD PTR [rsp+80]
-        mov	r9, QWORD PTR [rsp+88]
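-        ; Win64 ABI: args 5-7 read from the stack past the return address,
-        ; shadow space and the four saved registers ([rsp+72] onwards)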
-        sub	rsp, 16
-        vpxor	xmm4, xmm4, xmm4
-        mov	edx, r11d
-        cmp	edx, 12
-        jne	L_AES_GCM_init_avx1_iv_not_12
-        ; Calculate values when IV is 12 bytes
-        ; Set counter based on IV
-        mov	ecx, 16777216
-        vmovq	xmm4, QWORD PTR [r10]
-        vpinsrd	xmm4, xmm4, DWORD PTR [r10+8], 2
-        vpinsrd	xmm4, xmm4, ecx, 3
-        ; H = Encrypt X(=0) and T = Encrypt counter
-        vmovdqa	xmm5, OWORD PTR [rdi]
-        vpxor	xmm1, xmm4, xmm5
-        vmovdqa	xmm7, OWORD PTR [rdi+16]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [rdi+32]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [rdi+48]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [rdi+64]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [rdi+80]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [rdi+96]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [rdi+112]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [rdi+128]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [rdi+144]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        cmp	esi, 11
-        vmovdqa	xmm7, OWORD PTR [rdi+160]
-        jl	L_AES_GCM_init_avx1_calc_iv_12_last
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [rdi+176]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        cmp	esi, 13
-        vmovdqa	xmm7, OWORD PTR [rdi+192]
-        jl	L_AES_GCM_init_avx1_calc_iv_12_last
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [rdi+208]
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm1, xmm1, xmm7
-        vmovdqa	xmm7, OWORD PTR [rdi+224]
-L_AES_GCM_init_avx1_calc_iv_12_last:
-        vaesenclast	xmm5, xmm5, xmm7
-        vaesenclast	xmm1, xmm1, xmm7
-        vpshufb	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vmovdqu	xmm15, xmm1
-        jmp	L_AES_GCM_init_avx1_iv_done
-L_AES_GCM_init_avx1_iv_not_12:
-        ; Calculate values when IV is not 12 bytes
-        ; H = Encrypt X(=0)
-        vmovdqa	xmm5, OWORD PTR [rdi]
-        vaesenc	xmm5, xmm5, [rdi+16]
-        vaesenc	xmm5, xmm5, [rdi+32]
-        vaesenc	xmm5, xmm5, [rdi+48]
-        vaesenc	xmm5, xmm5, [rdi+64]
-        vaesenc	xmm5, xmm5, [rdi+80]
-        vaesenc	xmm5, xmm5, [rdi+96]
-        vaesenc	xmm5, xmm5, [rdi+112]
-        vaesenc	xmm5, xmm5, [rdi+128]
-        vaesenc	xmm5, xmm5, [rdi+144]
-        cmp	esi, 11
-        vmovdqa	xmm9, OWORD PTR [rdi+160]
-        jl	L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last
-        vaesenc	xmm5, xmm5, xmm9
-        vaesenc	xmm5, xmm5, [rdi+176]
-        cmp	esi, 13
-        vmovdqa	xmm9, OWORD PTR [rdi+192]
-        jl	L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last
-        vaesenc	xmm5, xmm5, xmm9
-        vaesenc	xmm5, xmm5, [rdi+208]
-        vmovdqa	xmm9, OWORD PTR [rdi+224]
-L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last:
-        vaesenclast	xmm5, xmm5, xmm9
-        vpshufb	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        ; Calc counter
-        ; Initialization vector
-        cmp	edx, 0
-        mov	rcx, 0
-        je	L_AES_GCM_init_avx1_calc_iv_done
-        cmp	edx, 16
-        jl	L_AES_GCM_init_avx1_calc_iv_lt16
-        and	edx, 4294967280
-L_AES_GCM_init_avx1_calc_iv_16_loop:
-        vmovdqu	xmm8, OWORD PTR [r10+rcx]
-        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm4, xmm4, xmm8
-        ; ghash_gfmul_avx
-        vpshufd	xmm1, xmm4, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpxor	xmm1, xmm1, xmm4
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm7, xmm0
-        vmovdqa	xmm4, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm7, xmm7, xmm2
-        vpxor	xmm4, xmm4, xmm1
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm4, xmm4, xmm1
-        vpslld	xmm0, xmm7, 31
-        vpslld	xmm1, xmm7, 30
-        vpslld	xmm2, xmm7, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm7, xmm7, xmm0
-        vpsrld	xmm2, xmm7, 1
-        vpsrld	xmm3, xmm7, 2
-        vpsrld	xmm0, xmm7, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm4, xmm4, xmm2
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_init_avx1_calc_iv_16_loop
-        mov	edx, r11d
-        cmp	ecx, edx
-        je	L_AES_GCM_init_avx1_calc_iv_done
-L_AES_GCM_init_avx1_calc_iv_lt16:
-        sub	rsp, 16
-        vpxor	xmm8, xmm8, xmm8
-        xor	r13d, r13d
-        vmovdqu	OWORD PTR [rsp], xmm8
-L_AES_GCM_init_avx1_calc_iv_loop:
-        movzx	r12d, BYTE PTR [r10+rcx]
-        mov	BYTE PTR [rsp+r13], r12b
-        inc	ecx
-        inc	r13d
-        cmp	ecx, edx
-        jl	L_AES_GCM_init_avx1_calc_iv_loop
-        vmovdqu	xmm8, OWORD PTR [rsp]
-        add	rsp, 16
-        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm4, xmm4, xmm8
-        ; ghash_gfmul_avx
-        vpshufd	xmm1, xmm4, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpxor	xmm1, xmm1, xmm4
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm7, xmm0
-        vmovdqa	xmm4, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm7, xmm7, xmm2
-        vpxor	xmm4, xmm4, xmm1
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm4, xmm4, xmm1
-        vpslld	xmm0, xmm7, 31
-        vpslld	xmm1, xmm7, 30
-        vpslld	xmm2, xmm7, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm7, xmm7, xmm0
-        vpsrld	xmm2, xmm7, 1
-        vpsrld	xmm3, xmm7, 2
-        vpsrld	xmm0, xmm7, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm4, xmm4, xmm2
-L_AES_GCM_init_avx1_calc_iv_done:
-        ; T = Encrypt counter
-        vpxor	xmm0, xmm0, xmm0
-        shl	edx, 3
-        vmovq	xmm0, rdx
-        vpxor	xmm4, xmm4, xmm0
-        ; ghash_gfmul_avx
-        vpshufd	xmm1, xmm4, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpxor	xmm1, xmm1, xmm4
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm7, xmm0
-        vmovdqa	xmm4, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm7, xmm7, xmm2
-        vpxor	xmm4, xmm4, xmm1
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm4, xmm4, xmm1
-        vpslld	xmm0, xmm7, 31
-        vpslld	xmm1, xmm7, 30
-        vpslld	xmm2, xmm7, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm7, xmm7, xmm0
-        vpsrld	xmm2, xmm7, 1
-        vpsrld	xmm3, xmm7, 2
-        vpsrld	xmm0, xmm7, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm4, xmm4, xmm2
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        ;   Encrypt counter
-        vmovdqa	xmm8, OWORD PTR [rdi]
-        vpxor	xmm8, xmm8, xmm4
-        vaesenc	xmm8, xmm8, [rdi+16]
-        vaesenc	xmm8, xmm8, [rdi+32]
-        vaesenc	xmm8, xmm8, [rdi+48]
-        vaesenc	xmm8, xmm8, [rdi+64]
-        vaesenc	xmm8, xmm8, [rdi+80]
-        vaesenc	xmm8, xmm8, [rdi+96]
-        vaesenc	xmm8, xmm8, [rdi+112]
-        vaesenc	xmm8, xmm8, [rdi+128]
-        vaesenc	xmm8, xmm8, [rdi+144]
-        cmp	esi, 11
-        vmovdqa	xmm9, OWORD PTR [rdi+160]
-        jl	L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [rdi+176]
-        cmp	esi, 13
-        vmovdqa	xmm9, OWORD PTR [rdi+192]
-        jl	L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [rdi+208]
-        vmovdqa	xmm9, OWORD PTR [rdi+224]
-L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last:
-        vaesenclast	xmm8, xmm8, xmm9
-        vmovdqu	xmm15, xmm8
-L_AES_GCM_init_avx1_iv_done:
-        vmovdqa	OWORD PTR [r9], xmm15
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpaddd	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_one
-        vmovdqa	OWORD PTR [rax], xmm5
-        vmovdqa	OWORD PTR [r8], xmm4
-        vzeroupper
-        add	rsp, 16
-        pop	r13
-        pop	r12
-        pop	rsi
-        pop	rdi
-        ret
-AES_GCM_init_avx1 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_aad_update_avx1 PROC
-        mov	rax, rcx
-        vmovdqa	xmm5, OWORD PTR [r8]
-        vmovdqa	xmm6, OWORD PTR [r9]
-        xor	ecx, ecx
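-        ; GHASH whole 16-byte AAD blocks: X = (X xor bswap(block)) * H,
-        ; repeated over edx bytes (the caller supplies full blocks)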
-L_AES_GCM_aad_update_avx1_16_loop:
-        vmovdqu	xmm8, OWORD PTR [rax+rcx]
-        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm8
-        ; ghash_gfmul_avx
-        vpshufd	xmm1, xmm5, 78
-        vpshufd	xmm2, xmm6, 78
-        vpclmulqdq	xmm3, xmm6, xmm5, 17
-        vpclmulqdq	xmm0, xmm6, xmm5, 0
-        vpxor	xmm1, xmm1, xmm5
-        vpxor	xmm2, xmm2, xmm6
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm4, xmm0
-        vmovdqa	xmm5, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm5, xmm5, xmm1
-        vpsrld	xmm0, xmm4, 31
-        vpsrld	xmm1, xmm5, 31
-        vpslld	xmm4, xmm4, 1
-        vpslld	xmm5, xmm5, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm5, xmm5, xmm2
-        vpor	xmm4, xmm4, xmm0
-        vpor	xmm5, xmm5, xmm1
-        vpslld	xmm0, xmm4, 31
-        vpslld	xmm1, xmm4, 30
-        vpslld	xmm2, xmm4, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm4, xmm4, xmm0
-        vpsrld	xmm2, xmm4, 1
-        vpsrld	xmm3, xmm4, 2
-        vpsrld	xmm0, xmm4, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm4
-        vpxor	xmm5, xmm5, xmm2
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_aad_update_avx1_16_loop
-        vmovdqa	OWORD PTR [r8], xmm5
-        vzeroupper
-        ret
-AES_GCM_aad_update_avx1 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_encrypt_block_avx1 PROC
-        mov	r10, r8
-        mov	r11, r9
-        mov	rax, QWORD PTR [rsp+40]
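-        ; Single-block CTR encryption: rcx = key schedule, edx = rounds,
-        ; r8 = out, r9 = in, fifth arg ([rsp+40], no registers pushed) =
-        ; counter; the counter is incremented in memory for the next call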
-        vmovdqu	xmm9, OWORD PTR [rax]
-        vpshufb	xmm8, xmm9, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpaddd	xmm9, xmm9, OWORD PTR L_avx1_aes_gcm_one
-        vmovdqu	OWORD PTR [rax], xmm9
-        vpxor	xmm8, xmm8, [rcx]
-        vaesenc	xmm8, xmm8, [rcx+16]
-        vaesenc	xmm8, xmm8, [rcx+32]
-        vaesenc	xmm8, xmm8, [rcx+48]
-        vaesenc	xmm8, xmm8, [rcx+64]
-        vaesenc	xmm8, xmm8, [rcx+80]
-        vaesenc	xmm8, xmm8, [rcx+96]
-        vaesenc	xmm8, xmm8, [rcx+112]
-        vaesenc	xmm8, xmm8, [rcx+128]
-        vaesenc	xmm8, xmm8, [rcx+144]
-        cmp	edx, 11
-        vmovdqa	xmm9, OWORD PTR [rcx+160]
-        jl	L_AES_GCM_encrypt_block_avx1_aesenc_block_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [rcx+176]
-        cmp	edx, 13
-        vmovdqa	xmm9, OWORD PTR [rcx+192]
-        jl	L_AES_GCM_encrypt_block_avx1_aesenc_block_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [rcx+208]
-        vmovdqa	xmm9, OWORD PTR [rcx+224]
-L_AES_GCM_encrypt_block_avx1_aesenc_block_last:
-        vaesenclast	xmm8, xmm8, xmm9
-        vmovdqu	xmm9, OWORD PTR [r11]
-        vpxor	xmm8, xmm8, xmm9
-        vmovdqu	OWORD PTR [r10], xmm8
-        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vzeroupper
-        ret
-AES_GCM_encrypt_block_avx1 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_ghash_block_avx1 PROC
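-        ; Folds one 16-byte block into the GHASH state: rcx = data,
-        ; rdx = GHASH accumulator X, r8 = subkey H; computes
-        ; X = (X xor bswap(block)) * H in GF(2^128) and stores X back.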
-        vmovdqa	xmm4, OWORD PTR [rdx]
-        vmovdqa	xmm5, OWORD PTR [r8]
-        vmovdqu	xmm8, OWORD PTR [rcx]
-        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm4, xmm4, xmm8
-        ; ghash_gfmul_avx
-        vpshufd	xmm1, xmm4, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpxor	xmm1, xmm1, xmm4
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm6, xmm0
-        vmovdqa	xmm4, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm6, xmm6, xmm2
-        vpxor	xmm4, xmm4, xmm1
-        vpsrld	xmm0, xmm6, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm6, xmm6, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm6, xmm6, xmm0
-        vpor	xmm4, xmm4, xmm1
-        vpslld	xmm0, xmm6, 31
-        vpslld	xmm1, xmm6, 30
-        vpslld	xmm2, xmm6, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm6, xmm6, xmm0
-        vpsrld	xmm2, xmm6, 1
-        vpsrld	xmm3, xmm6, 2
-        vpsrld	xmm0, xmm6, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm6
-        vpxor	xmm4, xmm4, xmm2
-        vmovdqa	OWORD PTR [rdx], xmm4
-        vzeroupper
-        ret
-AES_GCM_ghash_block_avx1 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_encrypt_update_avx1 PROC
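-        ; Bulk CTR encryption with interleaved GHASH. MS x64 arguments:
-        ; rcx = expanded key, edx = round count, r8 = out, r9 = in; the
-        ; four stack arguments (read at [rsp+80..104] after the five
-        ; pushes) are the byte count, GHASH accumulator, subkey H and
-        ; counter block. The vpsrlq/vpsllq block below appears to
-        ; pre-multiply H by x so the faster reduction paths can be used.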
-        push	r13
-        push	r12
-        push	r14
-        push	r15
-        push	rdi
-        mov	rax, rcx
-        mov	r10, r8
-        mov	r8d, edx
-        mov	r11, r9
-        mov	r9d, DWORD PTR [rsp+80]
-        mov	r12, QWORD PTR [rsp+88]
-        mov	r14, QWORD PTR [rsp+96]
-        mov	r15, QWORD PTR [rsp+104]
-        sub	rsp, 160
-        vmovdqa	xmm6, OWORD PTR [r12]
-        vmovdqa	xmm5, OWORD PTR [r14]
-        vpsrlq	xmm9, xmm5, 63
-        vpsllq	xmm8, xmm5, 1
-        vpslldq	xmm9, xmm9, 8
-        vpor	xmm8, xmm8, xmm9
-        vpshufd	xmm5, xmm5, 255
-        vpsrad	xmm5, xmm5, 31
-        vpand	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_mod2_128
-        vpxor	xmm5, xmm5, xmm8
-        xor	edi, edi
-        cmp	r9d, 128
-        mov	r13d, r9d
-        jl	L_AES_GCM_encrypt_update_avx1_done_128
-        and	r13d, 4294967168
-        vmovdqa	xmm2, xmm6
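-        ; Precompute H^1..H^8 at [rsp]..[rsp+112]; with eight powers of
-        ; the subkey, the GHASH of eight blocks accumulates through the
-        ; multiplies below and needs only one reduction per 128 bytes.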
-        ; H ^ 1
-        vmovdqu	OWORD PTR [rsp], xmm5
-        ; H ^ 2
-        vpclmulqdq	xmm8, xmm5, xmm5, 0
-        vpclmulqdq	xmm0, xmm5, xmm5, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm0, xmm0, xmm14
-        vmovdqu	OWORD PTR [rsp+16], xmm0
-        ; H ^ 3
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm5, 78
-        vpshufd	xmm10, xmm0, 78
-        vpclmulqdq	xmm11, xmm0, xmm5, 17
-        vpclmulqdq	xmm8, xmm0, xmm5, 0
-        vpxor	xmm9, xmm9, xmm5
-        vpxor	xmm10, xmm10, xmm0
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm1, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm1, xmm1, xmm14
-        vmovdqu	OWORD PTR [rsp+32], xmm1
-        ; H ^ 4
-        vpclmulqdq	xmm8, xmm0, xmm0, 0
-        vpclmulqdq	xmm3, xmm0, xmm0, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm3, xmm3, xmm14
-        vmovdqu	OWORD PTR [rsp+48], xmm3
-        ; H ^ 5
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm0, 78
-        vpshufd	xmm10, xmm1, 78
-        vpclmulqdq	xmm11, xmm1, xmm0, 17
-        vpclmulqdq	xmm8, xmm1, xmm0, 0
-        vpxor	xmm9, xmm9, xmm0
-        vpxor	xmm10, xmm10, xmm1
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm7, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+64], xmm7
-        ; H ^ 6
-        vpclmulqdq	xmm8, xmm1, xmm1, 0
-        vpclmulqdq	xmm7, xmm1, xmm1, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+80], xmm7
-        ; H ^ 7
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm1, 78
-        vpshufd	xmm10, xmm3, 78
-        vpclmulqdq	xmm11, xmm3, xmm1, 17
-        vpclmulqdq	xmm8, xmm3, xmm1, 0
-        vpxor	xmm9, xmm9, xmm1
-        vpxor	xmm10, xmm10, xmm3
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm7, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+96], xmm7
-        ; H ^ 8
-        vpclmulqdq	xmm8, xmm3, xmm3, 0
-        vpclmulqdq	xmm7, xmm3, xmm3, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+112], xmm7
-        ; First 128 bytes of input
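-        ; (Eight counter blocks are generated at once via vpaddd with
-        ; the one..eight constants, byte-swapped, and pushed through
-        ; all AES rounds in parallel across xmm8-xmm15.)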
-        vmovdqu	xmm0, OWORD PTR [r15]
-        vmovdqa	xmm1, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpshufb	xmm8, xmm0, xmm1
-        vpaddd	xmm9, xmm0, OWORD PTR L_avx1_aes_gcm_one
-        vpshufb	xmm9, xmm9, xmm1
-        vpaddd	xmm10, xmm0, OWORD PTR L_avx1_aes_gcm_two
-        vpshufb	xmm10, xmm10, xmm1
-        vpaddd	xmm11, xmm0, OWORD PTR L_avx1_aes_gcm_three
-        vpshufb	xmm11, xmm11, xmm1
-        vpaddd	xmm12, xmm0, OWORD PTR L_avx1_aes_gcm_four
-        vpshufb	xmm12, xmm12, xmm1
-        vpaddd	xmm13, xmm0, OWORD PTR L_avx1_aes_gcm_five
-        vpshufb	xmm13, xmm13, xmm1
-        vpaddd	xmm14, xmm0, OWORD PTR L_avx1_aes_gcm_six
-        vpshufb	xmm14, xmm14, xmm1
-        vpaddd	xmm15, xmm0, OWORD PTR L_avx1_aes_gcm_seven
-        vpshufb	xmm15, xmm15, xmm1
-        vpaddd	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_eight
-        vmovdqa	xmm7, OWORD PTR [rax]
-        vmovdqu	OWORD PTR [r15], xmm0
-        vpxor	xmm8, xmm8, xmm7
-        vpxor	xmm9, xmm9, xmm7
-        vpxor	xmm10, xmm10, xmm7
-        vpxor	xmm11, xmm11, xmm7
-        vpxor	xmm12, xmm12, xmm7
-        vpxor	xmm13, xmm13, xmm7
-        vpxor	xmm14, xmm14, xmm7
-        vpxor	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+16]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+32]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+48]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+64]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+80]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+96]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+112]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+128]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+144]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r8d, 11
-        vmovdqa	xmm7, OWORD PTR [rax+160]
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_128_enc_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+176]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r8d, 13
-        vmovdqa	xmm7, OWORD PTR [rax+192]
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_128_enc_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+208]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+224]
-L_AES_GCM_encrypt_update_avx1_aesenc_128_enc_done:
-        vaesenclast	xmm8, xmm8, xmm7
-        vaesenclast	xmm9, xmm9, xmm7
-        vmovdqu	xmm0, OWORD PTR [r11]
-        vmovdqu	xmm1, OWORD PTR [r11+16]
-        vpxor	xmm8, xmm8, xmm0
-        vpxor	xmm9, xmm9, xmm1
-        vmovdqu	OWORD PTR [r10], xmm8
-        vmovdqu	OWORD PTR [r10+16], xmm9
-        vaesenclast	xmm10, xmm10, xmm7
-        vaesenclast	xmm11, xmm11, xmm7
-        vmovdqu	xmm0, OWORD PTR [r11+32]
-        vmovdqu	xmm1, OWORD PTR [r11+48]
-        vpxor	xmm10, xmm10, xmm0
-        vpxor	xmm11, xmm11, xmm1
-        vmovdqu	OWORD PTR [r10+32], xmm10
-        vmovdqu	OWORD PTR [r10+48], xmm11
-        vaesenclast	xmm12, xmm12, xmm7
-        vaesenclast	xmm13, xmm13, xmm7
-        vmovdqu	xmm0, OWORD PTR [r11+64]
-        vmovdqu	xmm1, OWORD PTR [r11+80]
-        vpxor	xmm12, xmm12, xmm0
-        vpxor	xmm13, xmm13, xmm1
-        vmovdqu	OWORD PTR [r10+64], xmm12
-        vmovdqu	OWORD PTR [r10+80], xmm13
-        vaesenclast	xmm14, xmm14, xmm7
-        vaesenclast	xmm15, xmm15, xmm7
-        vmovdqu	xmm0, OWORD PTR [r11+96]
-        vmovdqu	xmm1, OWORD PTR [r11+112]
-        vpxor	xmm14, xmm14, xmm0
-        vpxor	xmm15, xmm15, xmm1
-        vmovdqu	OWORD PTR [r10+96], xmm14
-        vmovdqu	OWORD PTR [r10+112], xmm15
-        cmp	r13d, 128
-        mov	edi, 128
-        jle	L_AES_GCM_encrypt_update_avx1_end_128
-        ; Further 128-byte chunks of input
-L_AES_GCM_encrypt_update_avx1_ghash_128:
-        lea	rcx, QWORD PTR [r11+rdi]
-        lea	rdx, QWORD PTR [r10+rdi]
-        vmovdqu	xmm0, OWORD PTR [r15]
-        vmovdqa	xmm1, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpshufb	xmm8, xmm0, xmm1
-        vpaddd	xmm9, xmm0, OWORD PTR L_avx1_aes_gcm_one
-        vpshufb	xmm9, xmm9, xmm1
-        vpaddd	xmm10, xmm0, OWORD PTR L_avx1_aes_gcm_two
-        vpshufb	xmm10, xmm10, xmm1
-        vpaddd	xmm11, xmm0, OWORD PTR L_avx1_aes_gcm_three
-        vpshufb	xmm11, xmm11, xmm1
-        vpaddd	xmm12, xmm0, OWORD PTR L_avx1_aes_gcm_four
-        vpshufb	xmm12, xmm12, xmm1
-        vpaddd	xmm13, xmm0, OWORD PTR L_avx1_aes_gcm_five
-        vpshufb	xmm13, xmm13, xmm1
-        vpaddd	xmm14, xmm0, OWORD PTR L_avx1_aes_gcm_six
-        vpshufb	xmm14, xmm14, xmm1
-        vpaddd	xmm15, xmm0, OWORD PTR L_avx1_aes_gcm_seven
-        vpshufb	xmm15, xmm15, xmm1
-        vpaddd	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_eight
-        vmovdqa	xmm7, OWORD PTR [rax]
-        vmovdqu	OWORD PTR [r15], xmm0
-        vpxor	xmm8, xmm8, xmm7
-        vpxor	xmm9, xmm9, xmm7
-        vpxor	xmm10, xmm10, xmm7
-        vpxor	xmm11, xmm11, xmm7
-        vpxor	xmm12, xmm12, xmm7
-        vpxor	xmm13, xmm13, xmm7
-        vpxor	xmm14, xmm14, xmm7
-        vpxor	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsp+112]
-        vmovdqu	xmm0, OWORD PTR [rdx-128]
-        vaesenc	xmm8, xmm8, [rax+16]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm0, xmm0, xmm2
-        vpshufd	xmm1, xmm7, 78
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm3, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+16]
-        vaesenc	xmm10, xmm10, [rax+16]
-        vpclmulqdq	xmm2, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+16]
-        vaesenc	xmm12, xmm12, [rax+16]
-        vpclmulqdq	xmm1, xmm1, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+16]
-        vaesenc	xmm14, xmm14, [rax+16]
-        vaesenc	xmm15, xmm15, [rax+16]
-        vpxor	xmm1, xmm1, xmm2
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqu	xmm7, OWORD PTR [rsp+96]
-        vmovdqu	xmm0, OWORD PTR [rdx-112]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [rax+32]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+32]
-        vaesenc	xmm10, xmm10, [rax+32]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+32]
-        vaesenc	xmm12, xmm12, [rax+32]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+32]
-        vaesenc	xmm14, xmm14, [rax+32]
-        vaesenc	xmm15, xmm15, [rax+32]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+80]
-        vmovdqu	xmm0, OWORD PTR [rdx-96]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [rax+48]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+48]
-        vaesenc	xmm10, xmm10, [rax+48]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+48]
-        vaesenc	xmm12, xmm12, [rax+48]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+48]
-        vaesenc	xmm14, xmm14, [rax+48]
-        vaesenc	xmm15, xmm15, [rax+48]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+64]
-        vmovdqu	xmm0, OWORD PTR [rdx-80]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [rax+64]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+64]
-        vaesenc	xmm10, xmm10, [rax+64]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+64]
-        vaesenc	xmm12, xmm12, [rax+64]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+64]
-        vaesenc	xmm14, xmm14, [rax+64]
-        vaesenc	xmm15, xmm15, [rax+64]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+48]
-        vmovdqu	xmm0, OWORD PTR [rdx-64]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [rax+80]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+80]
-        vaesenc	xmm10, xmm10, [rax+80]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+80]
-        vaesenc	xmm12, xmm12, [rax+80]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+80]
-        vaesenc	xmm14, xmm14, [rax+80]
-        vaesenc	xmm15, xmm15, [rax+80]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+32]
-        vmovdqu	xmm0, OWORD PTR [rdx-48]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [rax+96]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+96]
-        vaesenc	xmm10, xmm10, [rax+96]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+96]
-        vaesenc	xmm12, xmm12, [rax+96]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+96]
-        vaesenc	xmm14, xmm14, [rax+96]
-        vaesenc	xmm15, xmm15, [rax+96]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vmovdqu	xmm0, OWORD PTR [rdx-32]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [rax+112]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+112]
-        vaesenc	xmm10, xmm10, [rax+112]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+112]
-        vaesenc	xmm12, xmm12, [rax+112]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+112]
-        vaesenc	xmm14, xmm14, [rax+112]
-        vaesenc	xmm15, xmm15, [rax+112]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp]
-        vmovdqu	xmm0, OWORD PTR [rdx-16]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [rax+128]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+128]
-        vaesenc	xmm10, xmm10, [rax+128]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+128]
-        vaesenc	xmm12, xmm12, [rax+128]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+128]
-        vaesenc	xmm14, xmm14, [rax+128]
-        vaesenc	xmm15, xmm15, [rax+128]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vpslldq	xmm5, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vaesenc	xmm8, xmm8, [rax+144]
-        vpxor	xmm2, xmm2, xmm5
-        vpxor	xmm3, xmm3, xmm1
-        vaesenc	xmm9, xmm9, [rax+144]
-        vpslld	xmm7, xmm2, 31
-        vpslld	xmm4, xmm2, 30
-        vpslld	xmm5, xmm2, 25
-        vaesenc	xmm10, xmm10, [rax+144]
-        vpxor	xmm7, xmm7, xmm4
-        vpxor	xmm7, xmm7, xmm5
-        vaesenc	xmm11, xmm11, [rax+144]
-        vpsrldq	xmm4, xmm7, 4
-        vpslldq	xmm7, xmm7, 12
-        vaesenc	xmm12, xmm12, [rax+144]
-        vpxor	xmm2, xmm2, xmm7
-        vpsrld	xmm5, xmm2, 1
-        vaesenc	xmm13, xmm13, [rax+144]
-        vpsrld	xmm1, xmm2, 2
-        vpsrld	xmm0, xmm2, 7
-        vaesenc	xmm14, xmm14, [rax+144]
-        vpxor	xmm5, xmm5, xmm1
-        vpxor	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, [rax+144]
-        vpxor	xmm5, xmm5, xmm4
-        vpxor	xmm2, xmm2, xmm5
-        vpxor	xmm2, xmm2, xmm3
-        cmp	r8d, 11
-        vmovdqa	xmm7, OWORD PTR [rax+160]
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+176]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r8d, 13
-        vmovdqa	xmm7, OWORD PTR [rax+192]
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+208]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+224]
-L_AES_GCM_encrypt_update_avx1_aesenc_128_ghash_avx_done:
-        vaesenclast	xmm8, xmm8, xmm7
-        vaesenclast	xmm9, xmm9, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx]
-        vmovdqu	xmm1, OWORD PTR [rcx+16]
-        vpxor	xmm8, xmm8, xmm0
-        vpxor	xmm9, xmm9, xmm1
-        vmovdqu	OWORD PTR [rdx], xmm8
-        vmovdqu	OWORD PTR [rdx+16], xmm9
-        vaesenclast	xmm10, xmm10, xmm7
-        vaesenclast	xmm11, xmm11, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+32]
-        vmovdqu	xmm1, OWORD PTR [rcx+48]
-        vpxor	xmm10, xmm10, xmm0
-        vpxor	xmm11, xmm11, xmm1
-        vmovdqu	OWORD PTR [rdx+32], xmm10
-        vmovdqu	OWORD PTR [rdx+48], xmm11
-        vaesenclast	xmm12, xmm12, xmm7
-        vaesenclast	xmm13, xmm13, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+64]
-        vmovdqu	xmm1, OWORD PTR [rcx+80]
-        vpxor	xmm12, xmm12, xmm0
-        vpxor	xmm13, xmm13, xmm1
-        vmovdqu	OWORD PTR [rdx+64], xmm12
-        vmovdqu	OWORD PTR [rdx+80], xmm13
-        vaesenclast	xmm14, xmm14, xmm7
-        vaesenclast	xmm15, xmm15, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+96]
-        vmovdqu	xmm1, OWORD PTR [rcx+112]
-        vpxor	xmm14, xmm14, xmm0
-        vpxor	xmm15, xmm15, xmm1
-        vmovdqu	OWORD PTR [rdx+96], xmm14
-        vmovdqu	OWORD PTR [rdx+112], xmm15
-        add	edi, 128
-        cmp	edi, r13d
-        jl	L_AES_GCM_encrypt_update_avx1_ghash_128
-L_AES_GCM_encrypt_update_avx1_end_128:
-        vmovdqa	xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpshufb	xmm8, xmm8, xmm4
-        vpshufb	xmm9, xmm9, xmm4
-        vpshufb	xmm10, xmm10, xmm4
-        vpshufb	xmm11, xmm11, xmm4
-        vpxor	xmm8, xmm8, xmm2
-        vpshufb	xmm12, xmm12, xmm4
-        vpshufb	xmm13, xmm13, xmm4
-        vpshufb	xmm14, xmm14, xmm4
-        vpshufb	xmm15, xmm15, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp]
-        vmovdqu	xmm5, OWORD PTR [rsp+16]
-        ; ghash_gfmul_avx
-        vpshufd	xmm1, xmm15, 78
-        vpshufd	xmm2, xmm7, 78
-        vpclmulqdq	xmm3, xmm7, xmm15, 17
-        vpclmulqdq	xmm0, xmm7, xmm15, 0
-        vpxor	xmm1, xmm1, xmm15
-        vpxor	xmm2, xmm2, xmm7
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqa	xmm4, xmm0
-        vmovdqa	xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        ; ghash_gfmul_xor_avx
-        vpshufd	xmm1, xmm14, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm14, 17
-        vpclmulqdq	xmm0, xmm5, xmm14, 0
-        vpxor	xmm1, xmm1, xmm14
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vpxor	xmm4, xmm4, xmm0
-        vpxor	xmm6, xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        vmovdqu	xmm7, OWORD PTR [rsp+32]
-        vmovdqu	xmm5, OWORD PTR [rsp+48]
-        ; ghash_gfmul_xor_avx
-        vpshufd	xmm1, xmm13, 78
-        vpshufd	xmm2, xmm7, 78
-        vpclmulqdq	xmm3, xmm7, xmm13, 17
-        vpclmulqdq	xmm0, xmm7, xmm13, 0
-        vpxor	xmm1, xmm1, xmm13
-        vpxor	xmm2, xmm2, xmm7
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vpxor	xmm4, xmm4, xmm0
-        vpxor	xmm6, xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        ; ghash_gfmul_xor_avx
-        vpshufd	xmm1, xmm12, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm12, 17
-        vpclmulqdq	xmm0, xmm5, xmm12, 0
-        vpxor	xmm1, xmm1, xmm12
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vpxor	xmm4, xmm4, xmm0
-        vpxor	xmm6, xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        vmovdqu	xmm7, OWORD PTR [rsp+64]
-        vmovdqu	xmm5, OWORD PTR [rsp+80]
-        ; ghash_gfmul_xor_avx
-        vpshufd	xmm1, xmm11, 78
-        vpshufd	xmm2, xmm7, 78
-        vpclmulqdq	xmm3, xmm7, xmm11, 17
-        vpclmulqdq	xmm0, xmm7, xmm11, 0
-        vpxor	xmm1, xmm1, xmm11
-        vpxor	xmm2, xmm2, xmm7
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vpxor	xmm4, xmm4, xmm0
-        vpxor	xmm6, xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        ; ghash_gfmul_xor_avx
-        vpshufd	xmm1, xmm10, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm10, 17
-        vpclmulqdq	xmm0, xmm5, xmm10, 0
-        vpxor	xmm1, xmm1, xmm10
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vpxor	xmm4, xmm4, xmm0
-        vpxor	xmm6, xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        vmovdqu	xmm7, OWORD PTR [rsp+96]
-        vmovdqu	xmm5, OWORD PTR [rsp+112]
-        ; ghash_gfmul_xor_avx
-        vpshufd	xmm1, xmm9, 78
-        vpshufd	xmm2, xmm7, 78
-        vpclmulqdq	xmm3, xmm7, xmm9, 17
-        vpclmulqdq	xmm0, xmm7, xmm9, 0
-        vpxor	xmm1, xmm1, xmm9
-        vpxor	xmm2, xmm2, xmm7
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vpxor	xmm4, xmm4, xmm0
-        vpxor	xmm6, xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        ; ghash_gfmul_xor_avx
-        vpshufd	xmm1, xmm8, 78
-        vpshufd	xmm2, xmm5, 78
-        vpclmulqdq	xmm3, xmm5, xmm8, 17
-        vpclmulqdq	xmm0, xmm5, xmm8, 0
-        vpxor	xmm1, xmm1, xmm8
-        vpxor	xmm2, xmm2, xmm5
-        vpclmulqdq	xmm1, xmm1, xmm2, 0
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm1, xmm1, xmm3
-        vpxor	xmm4, xmm4, xmm0
-        vpxor	xmm6, xmm6, xmm3
-        vpslldq	xmm2, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vpxor	xmm4, xmm4, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        vpslld	xmm0, xmm4, 31
-        vpslld	xmm1, xmm4, 30
-        vpslld	xmm2, xmm4, 25
-        vpxor	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm2
-        vmovdqa	xmm1, xmm0
-        vpsrldq	xmm1, xmm1, 4
-        vpslldq	xmm0, xmm0, 12
-        vpxor	xmm4, xmm4, xmm0
-        vpsrld	xmm2, xmm4, 1
-        vpsrld	xmm3, xmm4, 2
-        vpsrld	xmm0, xmm4, 7
-        vpxor	xmm2, xmm2, xmm3
-        vpxor	xmm2, xmm2, xmm0
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm2, xmm2, xmm4
-        vpxor	xmm6, xmm6, xmm2
-        vmovdqu	xmm5, OWORD PTR [rsp]
-L_AES_GCM_encrypt_update_avx1_done_128:
-        mov	edx, r9d
-        cmp	edi, edx
-        jge	L_AES_GCM_encrypt_update_avx1_done_enc
-        mov	r13d, r9d
-        and	r13d, 4294967280
-        cmp	edi, r13d
-        jge	L_AES_GCM_encrypt_update_avx1_last_block_done
-        vmovdqu	xmm9, OWORD PTR [r15]
-        vpshufb	xmm8, xmm9, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpaddd	xmm9, xmm9, OWORD PTR L_avx1_aes_gcm_one
-        vmovdqu	OWORD PTR [r15], xmm9
-        vpxor	xmm8, xmm8, [rax]
-        vaesenc	xmm8, xmm8, [rax+16]
-        vaesenc	xmm8, xmm8, [rax+32]
-        vaesenc	xmm8, xmm8, [rax+48]
-        vaesenc	xmm8, xmm8, [rax+64]
-        vaesenc	xmm8, xmm8, [rax+80]
-        vaesenc	xmm8, xmm8, [rax+96]
-        vaesenc	xmm8, xmm8, [rax+112]
-        vaesenc	xmm8, xmm8, [rax+128]
-        vaesenc	xmm8, xmm8, [rax+144]
-        cmp	r8d, 11
-        vmovdqa	xmm9, OWORD PTR [rax+160]
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_block_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [rax+176]
-        cmp	r8d, 13
-        vmovdqa	xmm9, OWORD PTR [rax+192]
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_block_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [rax+208]
-        vmovdqa	xmm9, OWORD PTR [rax+224]
-L_AES_GCM_encrypt_update_avx1_aesenc_block_last:
-        vaesenclast	xmm8, xmm8, xmm9
-        vmovdqu	xmm9, OWORD PTR [r11+rdi]
-        vpxor	xmm8, xmm8, xmm9
-        vmovdqu	OWORD PTR [r10+rdi], xmm8
-        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm8
-        add	edi, 16
-        cmp	edi, r13d
-        jge	L_AES_GCM_encrypt_update_avx1_last_block_ghash
-L_AES_GCM_encrypt_update_avx1_last_block_start:
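-        ; Remaining full blocks: one CTR encryption interleaved with a
-        ; GHASH multiply whose reduction uses two VPCLMULQDQs against
-        ; L_avx1_aes_gcm_mod2_128 rather than the shift/XOR chain.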
-        vmovdqu	xmm13, OWORD PTR [r11+rdi]
-        vmovdqu	xmm9, OWORD PTR [r15]
-        vpshufb	xmm8, xmm9, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpaddd	xmm9, xmm9, OWORD PTR L_avx1_aes_gcm_one
-        vmovdqu	OWORD PTR [r15], xmm9
-        vpxor	xmm8, xmm8, [rax]
-        vpclmulqdq	xmm10, xmm6, xmm5, 16
-        vaesenc	xmm8, xmm8, [rax+16]
-        vaesenc	xmm8, xmm8, [rax+32]
-        vpclmulqdq	xmm11, xmm6, xmm5, 1
-        vaesenc	xmm8, xmm8, [rax+48]
-        vaesenc	xmm8, xmm8, [rax+64]
-        vpclmulqdq	xmm12, xmm6, xmm5, 0
-        vaesenc	xmm8, xmm8, [rax+80]
-        vpclmulqdq	xmm1, xmm6, xmm5, 17
-        vaesenc	xmm8, xmm8, [rax+96]
-        vpxor	xmm10, xmm10, xmm11
-        vpslldq	xmm2, xmm10, 8
-        vpsrldq	xmm10, xmm10, 8
-        vaesenc	xmm8, xmm8, [rax+112]
-        vpxor	xmm2, xmm2, xmm12
-        vpxor	xmm3, xmm1, xmm10
-        vmovdqa	xmm0, OWORD PTR L_avx1_aes_gcm_mod2_128
-        vpclmulqdq	xmm11, xmm2, xmm0, 16
-        vaesenc	xmm8, xmm8, [rax+128]
-        vpshufd	xmm10, xmm2, 78
-        vpxor	xmm10, xmm10, xmm11
-        vpclmulqdq	xmm11, xmm10, xmm0, 16
-        vaesenc	xmm8, xmm8, [rax+144]
-        vpshufd	xmm10, xmm10, 78
-        vpxor	xmm10, xmm10, xmm11
-        vpxor	xmm6, xmm10, xmm3
-        cmp	r8d, 11
-        vmovdqa	xmm9, OWORD PTR [rax+160]
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [rax+176]
-        cmp	r8d, 13
-        vmovdqa	xmm9, OWORD PTR [rax+192]
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [rax+208]
-        vmovdqa	xmm9, OWORD PTR [rax+224]
-L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last:
-        vaesenclast	xmm8, xmm8, xmm9
-        vmovdqa	xmm0, xmm13
-        vpxor	xmm8, xmm8, xmm0
-        vmovdqu	OWORD PTR [r10+rdi], xmm8
-        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        add	edi, 16
-        vpxor	xmm6, xmm6, xmm8
-        cmp	edi, r13d
-        jl	L_AES_GCM_encrypt_update_avx1_last_block_start
-L_AES_GCM_encrypt_update_avx1_last_block_ghash:
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm5, 78
-        vpshufd	xmm10, xmm6, 78
-        vpclmulqdq	xmm11, xmm6, xmm5, 17
-        vpclmulqdq	xmm8, xmm6, xmm5, 0
-        vpxor	xmm9, xmm9, xmm5
-        vpxor	xmm10, xmm10, xmm6
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm6, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm6, xmm6, xmm14
-L_AES_GCM_encrypt_update_avx1_last_block_done:
-L_AES_GCM_encrypt_update_avx1_done_enc:
-        vmovdqa	OWORD PTR [r12], xmm6
-        vzeroupper
-        add	rsp, 160
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r12
-        pop	r13
-        ret
-AES_GCM_encrypt_update_avx1 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_encrypt_final_avx1 PROC
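-        ; Produces the authentication tag. MS x64 arguments: rcx = GHASH
-        ; accumulator, rdx = tag output, r8d = tag length, r9d =
-        ; ciphertext length; the stack arguments are the AAD length,
-        ; subkey H and the encrypted initial counter block from
-        ; AES_GCM_init_avx1. The bit lengths (shl ...,3) form the final
-        ; GHASH block, and tags shorter than 16 bytes are copied out
-        ; byte by byte.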
-        push	r13
-        push	r12
-        push	r14
-        mov	rax, rcx
-        mov	r10d, r9d
-        mov	r9, rdx
-        mov	r11d, DWORD PTR [rsp+64]
-        mov	r12, QWORD PTR [rsp+72]
-        mov	r14, QWORD PTR [rsp+80]
-        sub	rsp, 16
-        vmovdqa	xmm4, OWORD PTR [rax]
-        vmovdqa	xmm5, OWORD PTR [r12]
-        vmovdqa	xmm6, OWORD PTR [r14]
-        vpsrlq	xmm9, xmm5, 63
-        vpsllq	xmm8, xmm5, 1
-        vpslldq	xmm9, xmm9, 8
-        vpor	xmm8, xmm8, xmm9
-        vpshufd	xmm5, xmm5, 255
-        vpsrad	xmm5, xmm5, 31
-        vpand	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_mod2_128
-        vpxor	xmm5, xmm5, xmm8
-        mov	edx, r10d
-        mov	ecx, r11d
-        shl	rdx, 3
-        shl	rcx, 3
-        vmovq	xmm0, rdx
-        vmovq	xmm1, rcx
-        vpunpcklqdq	xmm0, xmm0, xmm1
-        vpxor	xmm4, xmm4, xmm0
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm5, 78
-        vpshufd	xmm10, xmm4, 78
-        vpclmulqdq	xmm11, xmm4, xmm5, 17
-        vpclmulqdq	xmm8, xmm4, xmm5, 0
-        vpxor	xmm9, xmm9, xmm5
-        vpxor	xmm10, xmm10, xmm4
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm4, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm4, xmm4, xmm14
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm0, xmm4, xmm6
-        cmp	r8d, 16
-        je	L_AES_GCM_encrypt_final_avx1_store_tag_16
-        xor	rcx, rcx
-        vmovdqu	OWORD PTR [rsp], xmm0
-L_AES_GCM_encrypt_final_avx1_store_tag_loop:
-        movzx	r13d, BYTE PTR [rsp+rcx]
-        mov	BYTE PTR [r9+rcx], r13b
-        inc	ecx
-        cmp	ecx, r8d
-        jne	L_AES_GCM_encrypt_final_avx1_store_tag_loop
-        jmp	L_AES_GCM_encrypt_final_avx1_store_tag_done
-L_AES_GCM_encrypt_final_avx1_store_tag_16:
-        vmovdqu	OWORD PTR [r9], xmm0
-L_AES_GCM_encrypt_final_avx1_store_tag_done:
-        vzeroupper
-        add	rsp, 16
-        pop	r14
-        pop	r12
-        pop	r13
-        ret
-AES_GCM_encrypt_final_avx1 ENDP
-_text ENDS
-_text SEGMENT READONLY PARA
-AES_GCM_decrypt_update_avx1 PROC
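-        ; Bulk CTR decryption; mirrors AES_GCM_encrypt_update_avx1
-        ; except that GHASH consumes the ciphertext input ([rcx+...])
-        ; before it is decrypted, so the 128-byte loop needs no
-        ; warm-up pass and starts hashing immediately.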
-        push	r13
-        push	r12
-        push	r14
-        push	r15
-        push	rdi
-        mov	rax, rcx
-        mov	r10, r8
-        mov	r8d, edx
-        mov	r11, r9
-        mov	r9d, DWORD PTR [rsp+80]
-        mov	r12, QWORD PTR [rsp+88]
-        mov	r14, QWORD PTR [rsp+96]
-        mov	r15, QWORD PTR [rsp+104]
-        sub	rsp, 168
-        vmovdqa	xmm6, OWORD PTR [r12]
-        vmovdqa	xmm5, OWORD PTR [r14]
-        vpsrlq	xmm9, xmm5, 63
-        vpsllq	xmm8, xmm5, 1
-        vpslldq	xmm9, xmm9, 8
-        vpor	xmm8, xmm8, xmm9
-        vpshufd	xmm5, xmm5, 255
-        vpsrad	xmm5, xmm5, 31
-        vpand	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_mod2_128
-        vpxor	xmm5, xmm5, xmm8
-        xor	edi, edi
-        cmp	r9d, 128
-        mov	r13d, r9d
-        jl	L_AES_GCM_decrypt_update_avx1_done_128
-        and	r13d, 4294967168
-        vmovdqa	xmm2, xmm6
-        ; H ^ 1
-        vmovdqu	OWORD PTR [rsp], xmm5
-        ; H ^ 2
-        vpclmulqdq	xmm8, xmm5, xmm5, 0
-        vpclmulqdq	xmm0, xmm5, xmm5, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm0, xmm0, xmm14
-        vmovdqu	OWORD PTR [rsp+16], xmm0
-        ; H ^ 3
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm5, 78
-        vpshufd	xmm10, xmm0, 78
-        vpclmulqdq	xmm11, xmm0, xmm5, 17
-        vpclmulqdq	xmm8, xmm0, xmm5, 0
-        vpxor	xmm9, xmm9, xmm5
-        vpxor	xmm10, xmm10, xmm0
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm1, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm1, xmm1, xmm14
-        vmovdqu	OWORD PTR [rsp+32], xmm1
-        ; H ^ 4
-        vpclmulqdq	xmm8, xmm0, xmm0, 0
-        vpclmulqdq	xmm3, xmm0, xmm0, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm3, xmm3, xmm14
-        vmovdqu	OWORD PTR [rsp+48], xmm3
-        ; H ^ 5
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm0, 78
-        vpshufd	xmm10, xmm1, 78
-        vpclmulqdq	xmm11, xmm1, xmm0, 17
-        vpclmulqdq	xmm8, xmm1, xmm0, 0
-        vpxor	xmm9, xmm9, xmm0
-        vpxor	xmm10, xmm10, xmm1
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm7, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+64], xmm7
-        ; H ^ 6
-        vpclmulqdq	xmm8, xmm1, xmm1, 0
-        vpclmulqdq	xmm7, xmm1, xmm1, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+80], xmm7
-        ; H ^ 7
-        ; ghash_gfmul_red_avx
-        vpshufd	xmm9, xmm1, 78
-        vpshufd	xmm10, xmm3, 78
-        vpclmulqdq	xmm11, xmm3, xmm1, 17
-        vpclmulqdq	xmm8, xmm3, xmm1, 0
-        vpxor	xmm9, xmm9, xmm1
-        vpxor	xmm10, xmm10, xmm3
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm7, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+96], xmm7
-        ; H ^ 8
-        vpclmulqdq	xmm8, xmm3, xmm3, 0
-        vpclmulqdq	xmm7, xmm3, xmm3, 17
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm7, xmm7, xmm14
-        vmovdqu	OWORD PTR [rsp+112], xmm7
-L_AES_GCM_decrypt_update_avx1_ghash_128:
-        lea	rcx, QWORD PTR [r11+rdi]
-        lea	rdx, QWORD PTR [r10+rdi]
-        vmovdqu	xmm0, OWORD PTR [r15]
-        vmovdqa	xmm1, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpshufb	xmm8, xmm0, xmm1
-        vpaddd	xmm9, xmm0, OWORD PTR L_avx1_aes_gcm_one
-        vpshufb	xmm9, xmm9, xmm1
-        vpaddd	xmm10, xmm0, OWORD PTR L_avx1_aes_gcm_two
-        vpshufb	xmm10, xmm10, xmm1
-        vpaddd	xmm11, xmm0, OWORD PTR L_avx1_aes_gcm_three
-        vpshufb	xmm11, xmm11, xmm1
-        vpaddd	xmm12, xmm0, OWORD PTR L_avx1_aes_gcm_four
-        vpshufb	xmm12, xmm12, xmm1
-        vpaddd	xmm13, xmm0, OWORD PTR L_avx1_aes_gcm_five
-        vpshufb	xmm13, xmm13, xmm1
-        vpaddd	xmm14, xmm0, OWORD PTR L_avx1_aes_gcm_six
-        vpshufb	xmm14, xmm14, xmm1
-        vpaddd	xmm15, xmm0, OWORD PTR L_avx1_aes_gcm_seven
-        vpshufb	xmm15, xmm15, xmm1
-        vpaddd	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_eight
-        vmovdqa	xmm7, OWORD PTR [rax]
-        vmovdqu	OWORD PTR [r15], xmm0
-        vpxor	xmm8, xmm8, xmm7
-        vpxor	xmm9, xmm9, xmm7
-        vpxor	xmm10, xmm10, xmm7
-        vpxor	xmm11, xmm11, xmm7
-        vpxor	xmm12, xmm12, xmm7
-        vpxor	xmm13, xmm13, xmm7
-        vpxor	xmm14, xmm14, xmm7
-        vpxor	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsp+112]
-        vmovdqu	xmm0, OWORD PTR [rcx]
-        vaesenc	xmm8, xmm8, [rax+16]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm0, xmm0, xmm2
-        vpshufd	xmm1, xmm7, 78
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm3, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+16]
-        vaesenc	xmm10, xmm10, [rax+16]
-        vpclmulqdq	xmm2, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+16]
-        vaesenc	xmm12, xmm12, [rax+16]
-        vpclmulqdq	xmm1, xmm1, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+16]
-        vaesenc	xmm14, xmm14, [rax+16]
-        vaesenc	xmm15, xmm15, [rax+16]
-        vpxor	xmm1, xmm1, xmm2
-        vpxor	xmm1, xmm1, xmm3
-        vmovdqu	xmm7, OWORD PTR [rsp+96]
-        vmovdqu	xmm0, OWORD PTR [rcx+16]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [rax+32]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+32]
-        vaesenc	xmm10, xmm10, [rax+32]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+32]
-        vaesenc	xmm12, xmm12, [rax+32]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+32]
-        vaesenc	xmm14, xmm14, [rax+32]
-        vaesenc	xmm15, xmm15, [rax+32]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+80]
-        vmovdqu	xmm0, OWORD PTR [rcx+32]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [rax+48]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+48]
-        vaesenc	xmm10, xmm10, [rax+48]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+48]
-        vaesenc	xmm12, xmm12, [rax+48]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+48]
-        vaesenc	xmm14, xmm14, [rax+48]
-        vaesenc	xmm15, xmm15, [rax+48]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+64]
-        vmovdqu	xmm0, OWORD PTR [rcx+48]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [rax+64]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+64]
-        vaesenc	xmm10, xmm10, [rax+64]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+64]
-        vaesenc	xmm12, xmm12, [rax+64]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+64]
-        vaesenc	xmm14, xmm14, [rax+64]
-        vaesenc	xmm15, xmm15, [rax+64]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+48]
-        vmovdqu	xmm0, OWORD PTR [rcx+64]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [rax+80]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+80]
-        vaesenc	xmm10, xmm10, [rax+80]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+80]
-        vaesenc	xmm12, xmm12, [rax+80]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+80]
-        vaesenc	xmm14, xmm14, [rax+80]
-        vaesenc	xmm15, xmm15, [rax+80]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+32]
-        vmovdqu	xmm0, OWORD PTR [rcx+80]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [rax+96]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+96]
-        vaesenc	xmm10, xmm10, [rax+96]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+96]
-        vaesenc	xmm12, xmm12, [rax+96]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+96]
-        vaesenc	xmm14, xmm14, [rax+96]
-        vaesenc	xmm15, xmm15, [rax+96]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vmovdqu	xmm0, OWORD PTR [rcx+96]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [rax+112]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+112]
-        vaesenc	xmm10, xmm10, [rax+112]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+112]
-        vaesenc	xmm12, xmm12, [rax+112]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+112]
-        vaesenc	xmm14, xmm14, [rax+112]
-        vaesenc	xmm15, xmm15, [rax+112]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vmovdqu	xmm7, OWORD PTR [rsp]
-        vmovdqu	xmm0, OWORD PTR [rcx+112]
-        vpshufd	xmm4, xmm7, 78
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vaesenc	xmm8, xmm8, [rax+128]
-        vpxor	xmm4, xmm4, xmm7
-        vpshufd	xmm5, xmm0, 78
-        vpxor	xmm5, xmm5, xmm0
-        vpclmulqdq	xmm6, xmm0, xmm7, 17
-        vaesenc	xmm9, xmm9, [rax+128]
-        vaesenc	xmm10, xmm10, [rax+128]
-        vpclmulqdq	xmm7, xmm0, xmm7, 0
-        vaesenc	xmm11, xmm11, [rax+128]
-        vaesenc	xmm12, xmm12, [rax+128]
-        vpclmulqdq	xmm4, xmm4, xmm5, 0
-        vaesenc	xmm13, xmm13, [rax+128]
-        vaesenc	xmm14, xmm14, [rax+128]
-        vaesenc	xmm15, xmm15, [rax+128]
-        vpxor	xmm1, xmm1, xmm7
-        vpxor	xmm2, xmm2, xmm7
-        vpxor	xmm1, xmm1, xmm6
-        vpxor	xmm3, xmm3, xmm6
-        vpxor	xmm1, xmm1, xmm4
-        vpslldq	xmm5, xmm1, 8
-        vpsrldq	xmm1, xmm1, 8
-        vaesenc	xmm8, xmm8, [rax+144]
-        vpxor	xmm2, xmm2, xmm5
-        vpxor	xmm3, xmm3, xmm1
-        vaesenc	xmm9, xmm9, [rax+144]
-        vpslld	xmm7, xmm2, 31
-        vpslld	xmm4, xmm2, 30
-        vpslld	xmm5, xmm2, 25
-        vaesenc	xmm10, xmm10, [rax+144]
-        vpxor	xmm7, xmm7, xmm4
-        vpxor	xmm7, xmm7, xmm5
-        vaesenc	xmm11, xmm11, [rax+144]
-        vpsrldq	xmm4, xmm7, 4
-        vpslldq	xmm7, xmm7, 12
-        vaesenc	xmm12, xmm12, [rax+144]
-        vpxor	xmm2, xmm2, xmm7
-        vpsrld	xmm5, xmm2, 1
-        vaesenc	xmm13, xmm13, [rax+144]
-        vpsrld	xmm1, xmm2, 2
-        vpsrld	xmm0, xmm2, 7
-        vaesenc	xmm14, xmm14, [rax+144]
-        vpxor	xmm5, xmm5, xmm1
-        vpxor	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, [rax+144]
-        vpxor	xmm5, xmm5, xmm4
-        vpxor	xmm2, xmm2, xmm5
-        vpxor	xmm2, xmm2, xmm3
-        cmp	r8d, 11
-        vmovdqa	xmm7, OWORD PTR [rax+160]
-        jl	L_AES_GCM_decrypt_update_avx1_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+176]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r8d, 13
-        vmovdqa	xmm7, OWORD PTR [rax+192]
-        jl	L_AES_GCM_decrypt_update_avx1_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+208]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqa	xmm7, OWORD PTR [rax+224]
-L_AES_GCM_decrypt_update_avx1_aesenc_128_ghash_avx_done:
-        vaesenclast	xmm8, xmm8, xmm7
-        vaesenclast	xmm9, xmm9, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx]
-        vmovdqu	xmm1, OWORD PTR [rcx+16]
-        vpxor	xmm8, xmm8, xmm0
-        vpxor	xmm9, xmm9, xmm1
-        vmovdqu	OWORD PTR [rdx], xmm8
-        vmovdqu	OWORD PTR [rdx+16], xmm9
-        vaesenclast	xmm10, xmm10, xmm7
-        vaesenclast	xmm11, xmm11, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+32]
-        vmovdqu	xmm1, OWORD PTR [rcx+48]
-        vpxor	xmm10, xmm10, xmm0
-        vpxor	xmm11, xmm11, xmm1
-        vmovdqu	OWORD PTR [rdx+32], xmm10
-        vmovdqu	OWORD PTR [rdx+48], xmm11
-        vaesenclast	xmm12, xmm12, xmm7
-        vaesenclast	xmm13, xmm13, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+64]
-        vmovdqu	xmm1, OWORD PTR [rcx+80]
-        vpxor	xmm12, xmm12, xmm0
-        vpxor	xmm13, xmm13, xmm1
-        vmovdqu	OWORD PTR [rdx+64], xmm12
-        vmovdqu	OWORD PTR [rdx+80], xmm13
-        vaesenclast	xmm14, xmm14, xmm7
-        vaesenclast	xmm15, xmm15, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+96]
-        vmovdqu	xmm1, OWORD PTR [rcx+112]
-        vpxor	xmm14, xmm14, xmm0
-        vpxor	xmm15, xmm15, xmm1
-        vmovdqu	OWORD PTR [rdx+96], xmm14
-        vmovdqu	OWORD PTR [rdx+112], xmm15
-        add	edi, 128
-        cmp	edi, r13d
-        jl	L_AES_GCM_decrypt_update_avx1_ghash_128
-        vmovdqa	xmm6, xmm2
-        vmovdqu	xmm5, OWORD PTR [rsp]
-L_AES_GCM_decrypt_update_avx1_done_128:
-        mov	edx, r9d
-        cmp	edi, edx
-        jge	L_AES_GCM_decrypt_update_avx1_done_dec
-        mov	r13d, r9d
-        and	r13d, 4294967280
-        cmp	edi, r13d
-        jge	L_AES_GCM_decrypt_update_avx1_last_block_done
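-        ; Decrypt the remaining full 16-byte blocks one at a time,
-        ; interleaving the AES rounds with the GHASH of each ciphertext block.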
-L_AES_GCM_decrypt_update_avx1_last_block_start:
-        vmovdqu	xmm13, OWORD PTR [r11+rdi]
-        vmovdqa	xmm0, xmm5
-        vpshufb	xmm1, xmm13, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm1, xmm1, xmm6
-        vmovdqu	xmm9, OWORD PTR [r15]
-        vpshufb	xmm8, xmm9, OWORD PTR L_avx1_aes_gcm_bswap_epi64
-        vpaddd	xmm9, xmm9, OWORD PTR L_avx1_aes_gcm_one
-        vmovdqu	OWORD PTR [r15], xmm9
-        vpxor	xmm8, xmm8, [rax]
-        vpclmulqdq	xmm10, xmm1, xmm0, 16
-        vaesenc	xmm8, xmm8, [rax+16]
-        vaesenc	xmm8, xmm8, [rax+32]
-        vpclmulqdq	xmm11, xmm1, xmm0, 1
-        vaesenc	xmm8, xmm8, [rax+48]
-        vaesenc	xmm8, xmm8, [rax+64]
-        vpclmulqdq	xmm12, xmm1, xmm0, 0
-        vaesenc	xmm8, xmm8, [rax+80]
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vaesenc	xmm8, xmm8, [rax+96]
-        vpxor	xmm10, xmm10, xmm11
-        vpslldq	xmm2, xmm10, 8
-        vpsrldq	xmm10, xmm10, 8
-        vaesenc	xmm8, xmm8, [rax+112]
-        vpxor	xmm2, xmm2, xmm12
-        vpxor	xmm3, xmm1, xmm10
-        vmovdqa	xmm0, OWORD PTR L_avx1_aes_gcm_mod2_128
-        vpclmulqdq	xmm11, xmm2, xmm0, 16
-        vaesenc	xmm8, xmm8, [rax+128]
-        vpshufd	xmm10, xmm2, 78
-        vpxor	xmm10, xmm10, xmm11
-        vpclmulqdq	xmm11, xmm10, xmm0, 16
-        vaesenc	xmm8, xmm8, [rax+144]
-        vpshufd	xmm10, xmm10, 78
-        vpxor	xmm10, xmm10, xmm11
-        vpxor	xmm6, xmm10, xmm3
-        cmp	r8d, 11
-        vmovdqa	xmm9, OWORD PTR [rax+160]
-        jl	L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [rax+176]
-        cmp	r8d, 13
-        vmovdqa	xmm9, OWORD PTR [rax+192]
-        jl	L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last
-        vaesenc	xmm8, xmm8, xmm9
-        vaesenc	xmm8, xmm8, [rax+208]
-        vmovdqa	xmm9, OWORD PTR [rax+224]
-L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last:
-        vaesenclast	xmm8, xmm8, xmm9
-        vmovdqa	xmm0, xmm13
-        vpxor	xmm8, xmm8, xmm0
-        vmovdqu	OWORD PTR [r10+rdi], xmm8
-        add	edi, 16
-        cmp	edi, r13d
-        jl	L_AES_GCM_decrypt_update_avx1_last_block_start
-L_AES_GCM_decrypt_update_avx1_last_block_done:
-L_AES_GCM_decrypt_update_avx1_done_dec:
-        vmovdqa	OWORD PTR [r12], xmm6
-        vzeroupper
-        add	rsp, 168
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r12
-        pop	r13
-        ret
-AES_GCM_decrypt_update_avx1 ENDP
-_text ENDS
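-; AES_GCM_decrypt_final_avx1: finish GHASH with the bit lengths of the AAD
-; and ciphertext, XOR with the encrypted counter value passed in, and
-; compare the result against the caller's tag.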
-_text SEGMENT READONLY PARA
-AES_GCM_decrypt_final_avx1 PROC
-        push	r13
-        push	r12
-        push	r14
-        push	rbp
-        push	r15
-        mov	rax, rcx
-        mov	r10d, r9d
-        mov	r9, rdx
-        mov	r11d, DWORD PTR [rsp+80]
-        mov	r12, QWORD PTR [rsp+88]
-        mov	r14, QWORD PTR [rsp+96]
-        mov	rbp, QWORD PTR [rsp+104]
-        sub	rsp, 16
-        vmovdqa	xmm6, OWORD PTR [rax]
-        vmovdqa	xmm5, OWORD PTR [r12]
-        vmovdqa	xmm15, OWORD PTR [r14]
-        vpsrlq	xmm9, xmm5, 63
-        vpsllq	xmm8, xmm5, 1
-        vpslldq	xmm9, xmm9, 8
-        vpor	xmm8, xmm8, xmm9
-        vpshufd	xmm5, xmm5, 255
-        vpsrad	xmm5, xmm5, 31
-        vpand	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_mod2_128
-        vpxor	xmm5, xmm5, xmm8
-        mov	edx, r10d
-        mov	ecx, r11d
-        shl	rdx, 3
-        shl	rcx, 3
-        vmovq	xmm0, rdx
-        vmovq	xmm1, rcx
-        vpunpcklqdq	xmm0, xmm0, xmm1
-        vpxor	xmm6, xmm6, xmm0
-        ; ghash_gfmul_red_avx
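-        ; Karatsuba multiply in GF(2^128): three carryless multiplies build
-        ; the 256-bit product, then a shift-based reduction modulo
-        ; x^128 + x^7 + x^2 + x + 1 folds it back to 128 bits.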
-        vpshufd	xmm9, xmm5, 78
-        vpshufd	xmm10, xmm6, 78
-        vpclmulqdq	xmm11, xmm6, xmm5, 17
-        vpclmulqdq	xmm8, xmm6, xmm5, 0
-        vpxor	xmm9, xmm9, xmm5
-        vpxor	xmm10, xmm10, xmm6
-        vpclmulqdq	xmm9, xmm9, xmm10, 0
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm9, xmm9, xmm11
-        vpslldq	xmm10, xmm9, 8
-        vpsrldq	xmm9, xmm9, 8
-        vpxor	xmm8, xmm8, xmm10
-        vpxor	xmm6, xmm11, xmm9
-        vpslld	xmm12, xmm8, 31
-        vpslld	xmm13, xmm8, 30
-        vpslld	xmm14, xmm8, 25
-        vpxor	xmm12, xmm12, xmm13
-        vpxor	xmm12, xmm12, xmm14
-        vpsrldq	xmm13, xmm12, 4
-        vpslldq	xmm12, xmm12, 12
-        vpxor	xmm8, xmm8, xmm12
-        vpsrld	xmm14, xmm8, 1
-        vpsrld	xmm10, xmm8, 2
-        vpsrld	xmm9, xmm8, 7
-        vpxor	xmm14, xmm14, xmm10
-        vpxor	xmm14, xmm14, xmm9
-        vpxor	xmm14, xmm14, xmm13
-        vpxor	xmm14, xmm14, xmm8
-        vpxor	xmm6, xmm6, xmm14
-        vpshufb	xmm6, xmm6, OWORD PTR L_avx1_aes_gcm_bswap_mask
-        vpxor	xmm0, xmm6, xmm15
-        cmp	r8d, 16
-        je	L_AES_GCM_decrypt_final_avx1_cmp_tag_16
-        sub	rsp, 16
-        xor	rcx, rcx
-        xor	r15, r15
-        vmovdqu	OWORD PTR [rsp], xmm0
-L_AES_GCM_decrypt_final_avx1_cmp_tag_loop:
-        movzx	r13d, BYTE PTR [rsp+rcx]
-        xor	r13b, BYTE PTR [r9+rcx]
-        or	r15b, r13b
-        inc	ecx
-        cmp	ecx, r8d
-        jne	L_AES_GCM_decrypt_final_avx1_cmp_tag_loop
-        cmp	r15, 0
-        sete	r15b
-        add	rsp, 16
-        xor	rcx, rcx
-        jmp	L_AES_GCM_decrypt_final_avx1_cmp_tag_done
-L_AES_GCM_decrypt_final_avx1_cmp_tag_16:
-        vmovdqu	xmm1, OWORD PTR [r9]
-        vpcmpeqb	xmm0, xmm0, xmm1
-        vpmovmskb	rdx, xmm0
-        ; if edx == 0xFFFF then return 1, else return 0
-        xor	r15d, r15d
-        cmp	edx, 65535
-        sete	r15b
-L_AES_GCM_decrypt_final_avx1_cmp_tag_done:
-        mov	DWORD PTR [rbp], r15d
-        vzeroupper
-        add	rsp, 16
-        pop	r15
-        pop	rbp
-        pop	r14
-        pop	r12
-        pop	r13
-        ret
-AES_GCM_decrypt_final_avx1 ENDP
-_text ENDS
-ENDIF
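-; AVX2 one-shot AES-GCM routines follow: each performs IV setup, AAD
-; hashing, bulk processing and tag handling in a single call.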
-IFDEF HAVE_INTEL_AVX2
-_DATA SEGMENT
-ALIGN 16
-L_avx2_aes_gcm_one QWORD 0, 1
-ptr_L_avx2_aes_gcm_one QWORD L_avx2_aes_gcm_one
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx2_aes_gcm_two QWORD 0, 2
-ptr_L_avx2_aes_gcm_two QWORD L_avx2_aes_gcm_two
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx2_aes_gcm_three QWORD 0, 3
-ptr_L_avx2_aes_gcm_three QWORD L_avx2_aes_gcm_three
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx2_aes_gcm_four QWORD 0, 4
-ptr_L_avx2_aes_gcm_four QWORD L_avx2_aes_gcm_four
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx2_aes_gcm_five QWORD 0, 5
-ptr_L_avx2_aes_gcm_five QWORD L_avx2_aes_gcm_five
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx2_aes_gcm_six QWORD 0, 6
-ptr_L_avx2_aes_gcm_six QWORD L_avx2_aes_gcm_six
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx2_aes_gcm_seven QWORD 0, 7
-ptr_L_avx2_aes_gcm_seven QWORD L_avx2_aes_gcm_seven
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx2_aes_gcm_eight QWORD 0, 8
-ptr_L_avx2_aes_gcm_eight QWORD L_avx2_aes_gcm_eight
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx2_aes_gcm_bswap_one QWORD 0, 72057594037927936
-ptr_L_avx2_aes_gcm_bswap_one QWORD L_avx2_aes_gcm_bswap_one
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx2_aes_gcm_bswap_epi64 QWORD 283686952306183, 579005069656919567
-ptr_L_avx2_aes_gcm_bswap_epi64 QWORD L_avx2_aes_gcm_bswap_epi64
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx2_aes_gcm_bswap_mask QWORD 579005069656919567, 283686952306183
-ptr_L_avx2_aes_gcm_bswap_mask QWORD L_avx2_aes_gcm_bswap_mask
-_DATA ENDS
-_DATA SEGMENT
-ALIGN 16
-L_avx2_aes_gcm_mod2_128 QWORD 1, 13979173243358019584
-ptr_L_avx2_aes_gcm_mod2_128 QWORD L_avx2_aes_gcm_mod2_128
-_DATA ENDS
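-; Constant tables above: L_avx2_aes_gcm_one..eight step the counter block
-; by 1..8 between batches; L_avx2_aes_gcm_bswap_epi64 reverses bytes within
-; each 64-bit lane, while L_avx2_aes_gcm_bswap_mask reverses all 16 bytes;
-; L_avx2_aes_gcm_mod2_128 is the GHASH reduction constant for the field
-; polynomial x^128 + x^7 + x^2 + x + 1.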
-_text SEGMENT READONLY PARA
-AES_GCM_encrypt_avx2 PROC
-        push	r13
-        push	rdi
-        push	r12
-        push	r15
-        push	rbx
-        push	r14
-        push	rsi
-        mov	rdi, rcx
-        mov	r12, r8
-        mov	rax, r9
-        mov	r15, QWORD PTR [rsp+96]
-        mov	r8, rdx
-        mov	r10d, DWORD PTR [rsp+104]
-        mov	r11d, DWORD PTR [rsp+112]
-        mov	ebx, DWORD PTR [rsp+120]
-        mov	r14d, DWORD PTR [rsp+128]
-        mov	rsi, QWORD PTR [rsp+136]
-        mov	r9d, DWORD PTR [rsp+144]
-        sub	rsp, 160
-        vpxor	xmm4, xmm4, xmm4
-        vpxor	xmm6, xmm6, xmm6
-        mov	edx, ebx
-        cmp	edx, 12
-        je	L_AES_GCM_encrypt_avx2_iv_12
-        ; Calculate values when IV is not 12 bytes
-        ; H = Encrypt X(=0)
-        vmovdqu	xmm5, OWORD PTR [rsi]
-        vaesenc	xmm5, xmm5, [rsi+16]
-        vaesenc	xmm5, xmm5, [rsi+32]
-        vaesenc	xmm5, xmm5, [rsi+48]
-        vaesenc	xmm5, xmm5, [rsi+64]
-        vaesenc	xmm5, xmm5, [rsi+80]
-        vaesenc	xmm5, xmm5, [rsi+96]
-        vaesenc	xmm5, xmm5, [rsi+112]
-        vaesenc	xmm5, xmm5, [rsi+128]
-        vaesenc	xmm5, xmm5, [rsi+144]
-        cmp	r9d, 11
-        vmovdqu	xmm0, OWORD PTR [rsi+160]
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm5, xmm5, [rsi+176]
-        cmp	r9d, 13
-        vmovdqu	xmm0, OWORD PTR [rsi+192]
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm5, xmm5, [rsi+208]
-        vmovdqu	xmm0, OWORD PTR [rsi+224]
-L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last:
-        vaesenclast	xmm5, xmm5, xmm0
-        vpshufb	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        ; Calc counter
-        ; Initialization vector
-        cmp	edx, 0
-        mov	rcx, 0
-        je	L_AES_GCM_encrypt_avx2_calc_iv_done
-        cmp	edx, 16
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_lt16
-        and	edx, 4294967280
-L_AES_GCM_encrypt_avx2_calc_iv_16_loop:
-        vmovdqu	xmm0, OWORD PTR [rax+rcx]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm4, xmm4, xmm0
-        ; ghash_gfmul_avx
-        vpclmulqdq	xmm2, xmm5, xmm4, 16
-        vpclmulqdq	xmm1, xmm5, xmm4, 1
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm7, xmm0, xmm1
-        vpxor	xmm4, xmm3, xmm2
-        ; ghash_mid
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm4, xmm4, xmm1
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm7, xmm2, 16
-        vpshufd	xmm1, xmm7, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm4, xmm4, xmm1
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_16_loop
-        mov	edx, ebx
-        cmp	ecx, edx
-        je	L_AES_GCM_encrypt_avx2_calc_iv_done
-L_AES_GCM_encrypt_avx2_calc_iv_lt16:
-        vpxor	xmm0, xmm0, xmm0
-        xor	ebx, ebx
-        vmovdqu	OWORD PTR [rsp], xmm0
-L_AES_GCM_encrypt_avx2_calc_iv_loop:
-        movzx	r13d, BYTE PTR [rax+rcx]
-        mov	BYTE PTR [rsp+rbx], r13b
-        inc	ecx
-        inc	ebx
-        cmp	ecx, edx
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_loop
-        vmovdqu	xmm0, OWORD PTR [rsp]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm4, xmm4, xmm0
-        ; ghash_gfmul_avx
-        vpclmulqdq	xmm2, xmm5, xmm4, 16
-        vpclmulqdq	xmm1, xmm5, xmm4, 1
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm7, xmm0, xmm1
-        vpxor	xmm4, xmm3, xmm2
-        ; ghash_mid
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm4, xmm4, xmm1
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm7, xmm2, 16
-        vpshufd	xmm1, xmm7, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm4, xmm4, xmm1
-L_AES_GCM_encrypt_avx2_calc_iv_done:
-        ; T = Encrypt counter
-        vpxor	xmm0, xmm0, xmm0
-        shl	edx, 3
-        vmovq	xmm0, rdx
-        vpxor	xmm4, xmm4, xmm0
-        ; ghash_gfmul_avx
-        vpclmulqdq	xmm2, xmm5, xmm4, 16
-        vpclmulqdq	xmm1, xmm5, xmm4, 1
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm7, xmm0, xmm1
-        vpxor	xmm4, xmm3, xmm2
-        ; ghash_mid
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm4, xmm4, xmm1
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm7, xmm2, 16
-        vpshufd	xmm1, xmm7, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm4, xmm4, xmm1
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        ;   Encrypt counter
-        vmovdqu	xmm15, OWORD PTR [rsi]
-        vpxor	xmm15, xmm15, xmm4
-        vaesenc	xmm15, xmm15, [rsi+16]
-        vaesenc	xmm15, xmm15, [rsi+32]
-        vaesenc	xmm15, xmm15, [rsi+48]
-        vaesenc	xmm15, xmm15, [rsi+64]
-        vaesenc	xmm15, xmm15, [rsi+80]
-        vaesenc	xmm15, xmm15, [rsi+96]
-        vaesenc	xmm15, xmm15, [rsi+112]
-        vaesenc	xmm15, xmm15, [rsi+128]
-        vaesenc	xmm15, xmm15, [rsi+144]
-        cmp	r9d, 11
-        vmovdqu	xmm0, OWORD PTR [rsi+160]
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last
-        vaesenc	xmm15, xmm15, xmm0
-        vaesenc	xmm15, xmm15, [rsi+176]
-        cmp	r9d, 13
-        vmovdqu	xmm0, OWORD PTR [rsi+192]
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last
-        vaesenc	xmm15, xmm15, xmm0
-        vaesenc	xmm15, xmm15, [rsi+208]
-        vmovdqu	xmm0, OWORD PTR [rsi+224]
-L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last:
-        vaesenclast	xmm15, xmm15, xmm0
-        jmp	L_AES_GCM_encrypt_avx2_iv_done
-L_AES_GCM_encrypt_avx2_iv_12:
-        ; Calculate values when IV is 12 bytes
-        ; Set counter based on IV
-        vmovdqu	xmm4, OWORD PTR L_avx2_aes_gcm_bswap_one
-        vmovdqu	xmm5, OWORD PTR [rsi]
-        vpblendd	xmm4, xmm4, [rax], 7
-        ; H = Encrypt X(=0) and T = Encrypt counter
-        vmovdqu	xmm7, OWORD PTR [rsi+16]
-        vpxor	xmm15, xmm4, xmm5
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm0, OWORD PTR [rsi+32]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+48]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+64]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+80]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+96]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+112]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+128]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+144]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        cmp	r9d, 11
-        vmovdqu	xmm0, OWORD PTR [rsi+160]
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_12_last
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+176]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        cmp	r9d, 13
-        vmovdqu	xmm0, OWORD PTR [rsi+192]
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_12_last
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+208]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+224]
-L_AES_GCM_encrypt_avx2_calc_iv_12_last:
-        vaesenclast	xmm5, xmm5, xmm0
-        vaesenclast	xmm15, xmm15, xmm0
-        vpshufb	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_bswap_mask
-L_AES_GCM_encrypt_avx2_iv_done:
-        ; Additional authentication data
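-        ; GHASH the AAD 16 bytes at a time; a short tail is zero-padded
-        ; in the stack buffer before it is hashed.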
-        mov	edx, r11d
-        cmp	edx, 0
-        je	L_AES_GCM_encrypt_avx2_calc_aad_done
-        xor	ecx, ecx
-        cmp	edx, 16
-        jl	L_AES_GCM_encrypt_avx2_calc_aad_lt16
-        and	edx, 4294967280
-L_AES_GCM_encrypt_avx2_calc_aad_16_loop:
-        vmovdqu	xmm0, OWORD PTR [r12+rcx]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm0
-        ; ghash_gfmul_avx
-        vpclmulqdq	xmm2, xmm5, xmm6, 16
-        vpclmulqdq	xmm1, xmm5, xmm6, 1
-        vpclmulqdq	xmm0, xmm5, xmm6, 0
-        vpclmulqdq	xmm3, xmm5, xmm6, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm7, xmm0, xmm1
-        vpxor	xmm6, xmm3, xmm2
-        ; ghash_mid
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm6, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm6, xmm6, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm6, xmm6, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm6, xmm6, xmm1
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm7, xmm2, 16
-        vpshufd	xmm1, xmm7, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm6, xmm6, xmm1
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_encrypt_avx2_calc_aad_16_loop
-        mov	edx, r11d
-        cmp	ecx, edx
-        je	L_AES_GCM_encrypt_avx2_calc_aad_done
-L_AES_GCM_encrypt_avx2_calc_aad_lt16:
-        vpxor	xmm0, xmm0, xmm0
-        xor	ebx, ebx
-        vmovdqu	OWORD PTR [rsp], xmm0
-L_AES_GCM_encrypt_avx2_calc_aad_loop:
-        movzx	r13d, BYTE PTR [r12+rcx]
-        mov	BYTE PTR [rsp+rbx], r13b
-        inc	ecx
-        inc	ebx
-        cmp	ecx, edx
-        jl	L_AES_GCM_encrypt_avx2_calc_aad_loop
-        vmovdqu	xmm0, OWORD PTR [rsp]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm0
-        ; ghash_gfmul_avx
-        vpclmulqdq	xmm2, xmm5, xmm6, 16
-        vpclmulqdq	xmm1, xmm5, xmm6, 1
-        vpclmulqdq	xmm0, xmm5, xmm6, 0
-        vpclmulqdq	xmm3, xmm5, xmm6, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm7, xmm0, xmm1
-        vpxor	xmm6, xmm3, xmm2
-        ; ghash_mid
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm6, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm6, xmm6, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm6, xmm6, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm6, xmm6, xmm1
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm7, xmm2, 16
-        vpshufd	xmm1, xmm7, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm6, xmm6, xmm1
-L_AES_GCM_encrypt_avx2_calc_aad_done:
-        ; Calculate counter and H
-        vpsrlq	xmm1, xmm5, 63
-        vpsllq	xmm0, xmm5, 1
-        vpslldq	xmm1, xmm1, 8
-        vpor	xmm0, xmm0, xmm1
-        vpshufd	xmm5, xmm5, 255
-        vpsrad	xmm5, xmm5, 31
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpand	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpaddd	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_one
-        vpxor	xmm5, xmm5, xmm0
-        xor	ebx, ebx
-        cmp	r10d, 128
-        mov	r13d, r10d
-        jl	L_AES_GCM_encrypt_avx2_done_128
-        and	r13d, 4294967168
-        vmovdqu	OWORD PTR [rsp+128], xmm4
-        vmovdqu	OWORD PTR [rsp+144], xmm15
-        vmovdqu	xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128
-        ; H ^ 1 and H ^ 2
-        vpclmulqdq	xmm9, xmm5, xmm5, 0
-        vpclmulqdq	xmm10, xmm5, xmm5, 17
-        vpclmulqdq	xmm8, xmm9, xmm3, 16
-        vpshufd	xmm9, xmm9, 78
-        vpxor	xmm9, xmm9, xmm8
-        vpclmulqdq	xmm8, xmm9, xmm3, 16
-        vpshufd	xmm9, xmm9, 78
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm0, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp], xmm5
-        vmovdqu	OWORD PTR [rsp+16], xmm0
-        ; H ^ 3 and H ^ 4
-        vpclmulqdq	xmm11, xmm0, xmm5, 16
-        vpclmulqdq	xmm10, xmm0, xmm5, 1
-        vpclmulqdq	xmm9, xmm0, xmm5, 0
-        vpclmulqdq	xmm12, xmm0, xmm5, 17
-        vpclmulqdq	xmm13, xmm0, xmm0, 0
-        vpclmulqdq	xmm14, xmm0, xmm0, 17
-        vpxor	xmm11, xmm11, xmm10
-        vpslldq	xmm10, xmm11, 8
-        vpsrldq	xmm11, xmm11, 8
-        vpxor	xmm10, xmm10, xmm9
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm10, xmm10, xmm9
-        vpxor	xmm13, xmm13, xmm8
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm12, xmm12, xmm11
-        vpxor	xmm13, xmm13, xmm8
-        vpxor	xmm10, xmm10, xmm12
-        vpxor	xmm2, xmm13, xmm14
-        vpxor	xmm1, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp+32], xmm1
-        vmovdqu	OWORD PTR [rsp+48], xmm2
-        ; H ^ 5 and H ^ 6
-        vpclmulqdq	xmm11, xmm1, xmm0, 16
-        vpclmulqdq	xmm10, xmm1, xmm0, 1
-        vpclmulqdq	xmm9, xmm1, xmm0, 0
-        vpclmulqdq	xmm12, xmm1, xmm0, 17
-        vpclmulqdq	xmm13, xmm1, xmm1, 0
-        vpclmulqdq	xmm14, xmm1, xmm1, 17
-        vpxor	xmm11, xmm11, xmm10
-        vpslldq	xmm10, xmm11, 8
-        vpsrldq	xmm11, xmm11, 8
-        vpxor	xmm10, xmm10, xmm9
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm10, xmm10, xmm9
-        vpxor	xmm13, xmm13, xmm8
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm12, xmm12, xmm11
-        vpxor	xmm13, xmm13, xmm8
-        vpxor	xmm10, xmm10, xmm12
-        vpxor	xmm0, xmm13, xmm14
-        vpxor	xmm7, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp+64], xmm7
-        vmovdqu	OWORD PTR [rsp+80], xmm0
-        ; H ^ 7 and H ^ 8
-        vpclmulqdq	xmm11, xmm2, xmm1, 16
-        vpclmulqdq	xmm10, xmm2, xmm1, 1
-        vpclmulqdq	xmm9, xmm2, xmm1, 0
-        vpclmulqdq	xmm12, xmm2, xmm1, 17
-        vpclmulqdq	xmm13, xmm2, xmm2, 0
-        vpclmulqdq	xmm14, xmm2, xmm2, 17
-        vpxor	xmm11, xmm11, xmm10
-        vpslldq	xmm10, xmm11, 8
-        vpsrldq	xmm11, xmm11, 8
-        vpxor	xmm10, xmm10, xmm9
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm10, xmm10, xmm9
-        vpxor	xmm13, xmm13, xmm8
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm12, xmm12, xmm11
-        vpxor	xmm13, xmm13, xmm8
-        vpxor	xmm10, xmm10, xmm12
-        vpxor	xmm0, xmm13, xmm14
-        vpxor	xmm7, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp+96], xmm7
-        vmovdqu	OWORD PTR [rsp+112], xmm0
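-        ; Powers H^1..H^8 are now cached at [rsp]..[rsp+112], so eight
-        ; blocks can be folded into GHASH per loop iteration.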
-        ; First 128 bytes of input
-        ; aesenc_128
-        ; aesenc_ctr
-        vmovdqu	xmm0, OWORD PTR [rsp+128]
-        vmovdqu	xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpaddd	xmm9, xmm0, OWORD PTR L_avx2_aes_gcm_one
-        vpshufb	xmm8, xmm0, xmm1
-        vpaddd	xmm10, xmm0, OWORD PTR L_avx2_aes_gcm_two
-        vpshufb	xmm9, xmm9, xmm1
-        vpaddd	xmm11, xmm0, OWORD PTR L_avx2_aes_gcm_three
-        vpshufb	xmm10, xmm10, xmm1
-        vpaddd	xmm12, xmm0, OWORD PTR L_avx2_aes_gcm_four
-        vpshufb	xmm11, xmm11, xmm1
-        vpaddd	xmm13, xmm0, OWORD PTR L_avx2_aes_gcm_five
-        vpshufb	xmm12, xmm12, xmm1
-        vpaddd	xmm14, xmm0, OWORD PTR L_avx2_aes_gcm_six
-        vpshufb	xmm13, xmm13, xmm1
-        vpaddd	xmm15, xmm0, OWORD PTR L_avx2_aes_gcm_seven
-        vpshufb	xmm14, xmm14, xmm1
-        vpaddd	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_eight
-        vpshufb	xmm15, xmm15, xmm1
-        ; aesenc_xor
-        vmovdqu	xmm7, OWORD PTR [rsi]
-        vmovdqu	OWORD PTR [rsp+128], xmm0
-        vpxor	xmm8, xmm8, xmm7
-        vpxor	xmm9, xmm9, xmm7
-        vpxor	xmm10, xmm10, xmm7
-        vpxor	xmm11, xmm11, xmm7
-        vpxor	xmm12, xmm12, xmm7
-        vpxor	xmm13, xmm13, xmm7
-        vpxor	xmm14, xmm14, xmm7
-        vpxor	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+16]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+32]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+48]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+64]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+80]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+96]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+112]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+128]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+144]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r9d, 11
-        vmovdqu	xmm7, OWORD PTR [rsi+160]
-        jl	L_AES_GCM_encrypt_avx2_aesenc_128_enc_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+176]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r9d, 13
-        vmovdqu	xmm7, OWORD PTR [rsi+192]
-        jl	L_AES_GCM_encrypt_avx2_aesenc_128_enc_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+208]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+224]
-L_AES_GCM_encrypt_avx2_aesenc_128_enc_done:
-        ; aesenc_last
-        vaesenclast	xmm8, xmm8, xmm7
-        vaesenclast	xmm9, xmm9, xmm7
-        vaesenclast	xmm10, xmm10, xmm7
-        vaesenclast	xmm11, xmm11, xmm7
-        vmovdqu	xmm0, OWORD PTR [rdi]
-        vmovdqu	xmm1, OWORD PTR [rdi+16]
-        vmovdqu	xmm2, OWORD PTR [rdi+32]
-        vmovdqu	xmm3, OWORD PTR [rdi+48]
-        vpxor	xmm8, xmm8, xmm0
-        vpxor	xmm9, xmm9, xmm1
-        vpxor	xmm10, xmm10, xmm2
-        vpxor	xmm11, xmm11, xmm3
-        vmovdqu	OWORD PTR [r8], xmm8
-        vmovdqu	OWORD PTR [r8+16], xmm9
-        vmovdqu	OWORD PTR [r8+32], xmm10
-        vmovdqu	OWORD PTR [r8+48], xmm11
-        vaesenclast	xmm12, xmm12, xmm7
-        vaesenclast	xmm13, xmm13, xmm7
-        vaesenclast	xmm14, xmm14, xmm7
-        vaesenclast	xmm15, xmm15, xmm7
-        vmovdqu	xmm0, OWORD PTR [rdi+64]
-        vmovdqu	xmm1, OWORD PTR [rdi+80]
-        vmovdqu	xmm2, OWORD PTR [rdi+96]
-        vmovdqu	xmm3, OWORD PTR [rdi+112]
-        vpxor	xmm12, xmm12, xmm0
-        vpxor	xmm13, xmm13, xmm1
-        vpxor	xmm14, xmm14, xmm2
-        vpxor	xmm15, xmm15, xmm3
-        vmovdqu	OWORD PTR [r8+64], xmm12
-        vmovdqu	OWORD PTR [r8+80], xmm13
-        vmovdqu	OWORD PTR [r8+96], xmm14
-        vmovdqu	OWORD PTR [r8+112], xmm15
-        cmp	r13d, 128
-        mov	ebx, 128
-        jle	L_AES_GCM_encrypt_avx2_end_128
-        ; Process the remaining 128-byte chunks of input
-L_AES_GCM_encrypt_avx2_ghash_128:
-        ; aesenc_128_ghash
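-        ; Each pass encrypts eight counter blocks while folding the previous
-        ; eight ciphertext blocks into GHASH using the cached powers of H.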
-        lea	rcx, QWORD PTR [rdi+rbx]
-        lea	rdx, QWORD PTR [r8+rbx]
-        ; aesenc_ctr
-        vmovdqu	xmm0, OWORD PTR [rsp+128]
-        vmovdqu	xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpaddd	xmm9, xmm0, OWORD PTR L_avx2_aes_gcm_one
-        vpshufb	xmm8, xmm0, xmm1
-        vpaddd	xmm10, xmm0, OWORD PTR L_avx2_aes_gcm_two
-        vpshufb	xmm9, xmm9, xmm1
-        vpaddd	xmm11, xmm0, OWORD PTR L_avx2_aes_gcm_three
-        vpshufb	xmm10, xmm10, xmm1
-        vpaddd	xmm12, xmm0, OWORD PTR L_avx2_aes_gcm_four
-        vpshufb	xmm11, xmm11, xmm1
-        vpaddd	xmm13, xmm0, OWORD PTR L_avx2_aes_gcm_five
-        vpshufb	xmm12, xmm12, xmm1
-        vpaddd	xmm14, xmm0, OWORD PTR L_avx2_aes_gcm_six
-        vpshufb	xmm13, xmm13, xmm1
-        vpaddd	xmm15, xmm0, OWORD PTR L_avx2_aes_gcm_seven
-        vpshufb	xmm14, xmm14, xmm1
-        vpaddd	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_eight
-        vpshufb	xmm15, xmm15, xmm1
-        ; aesenc_xor
-        vmovdqu	xmm7, OWORD PTR [rsi]
-        vmovdqu	OWORD PTR [rsp+128], xmm0
-        vpxor	xmm8, xmm8, xmm7
-        vpxor	xmm9, xmm9, xmm7
-        vpxor	xmm10, xmm10, xmm7
-        vpxor	xmm11, xmm11, xmm7
-        vpxor	xmm12, xmm12, xmm7
-        vpxor	xmm13, xmm13, xmm7
-        vpxor	xmm14, xmm14, xmm7
-        vpxor	xmm15, xmm15, xmm7
-        ; aesenc_pclmul_1
-        vmovdqu	xmm1, OWORD PTR [rdx+-128]
-        vmovdqu	xmm0, OWORD PTR [rsi+16]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vmovdqu	xmm2, OWORD PTR [rsp+112]
-        vpxor	xmm1, xmm1, xmm6
-        vpclmulqdq	xmm5, xmm1, xmm2, 16
-        vpclmulqdq	xmm3, xmm1, xmm2, 1
-        vpclmulqdq	xmm6, xmm1, xmm2, 0
-        vpclmulqdq	xmm7, xmm1, xmm2, 17
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_2
-        vmovdqu	xmm1, OWORD PTR [rdx+-112]
-        vmovdqu	xmm0, OWORD PTR [rsp+96]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rsi+32]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rdx+-96]
-        vmovdqu	xmm0, OWORD PTR [rsp+80]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rsi+48]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rdx+-80]
-        vmovdqu	xmm0, OWORD PTR [rsp+64]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rsi+64]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rdx+-64]
-        vmovdqu	xmm0, OWORD PTR [rsp+48]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rsi+80]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rdx+-48]
-        vmovdqu	xmm0, OWORD PTR [rsp+32]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rsi+96]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rdx+-32]
-        vmovdqu	xmm0, OWORD PTR [rsp+16]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rsi+112]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rdx+-16]
-        vmovdqu	xmm0, OWORD PTR [rsp]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rsi+128]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_l
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm6, xmm6, xmm4
-        vpxor	xmm5, xmm5, xmm3
-        vpslldq	xmm1, xmm5, 8
-        vpsrldq	xmm5, xmm5, 8
-        vmovdqu	xmm4, OWORD PTR [rsi+144]
-        vmovdqu	xmm0, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vaesenc	xmm8, xmm8, xmm4
-        vpxor	xmm6, xmm6, xmm1
-        vpxor	xmm7, xmm7, xmm5
-        vpclmulqdq	xmm3, xmm6, xmm0, 16
-        vaesenc	xmm9, xmm9, xmm4
-        vaesenc	xmm10, xmm10, xmm4
-        vaesenc	xmm11, xmm11, xmm4
-        vpshufd	xmm6, xmm6, 78
-        vpxor	xmm6, xmm6, xmm3
-        vpclmulqdq	xmm3, xmm6, xmm0, 16
-        vaesenc	xmm12, xmm12, xmm4
-        vaesenc	xmm13, xmm13, xmm4
-        vaesenc	xmm14, xmm14, xmm4
-        vpshufd	xmm6, xmm6, 78
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm6, xmm6, xmm7
-        vaesenc	xmm15, xmm15, xmm4
-        cmp	r9d, 11
-        vmovdqu	xmm7, OWORD PTR [rsi+160]
-        jl	L_AES_GCM_encrypt_avx2_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+176]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r9d, 13
-        vmovdqu	xmm7, OWORD PTR [rsi+192]
-        jl	L_AES_GCM_encrypt_avx2_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+208]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+224]
-L_AES_GCM_encrypt_avx2_aesenc_128_ghash_avx_done:
-        ; aesenc_last
-        vaesenclast	xmm8, xmm8, xmm7
-        vaesenclast	xmm9, xmm9, xmm7
-        vaesenclast	xmm10, xmm10, xmm7
-        vaesenclast	xmm11, xmm11, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx]
-        vmovdqu	xmm1, OWORD PTR [rcx+16]
-        vmovdqu	xmm2, OWORD PTR [rcx+32]
-        vmovdqu	xmm3, OWORD PTR [rcx+48]
-        vpxor	xmm8, xmm8, xmm0
-        vpxor	xmm9, xmm9, xmm1
-        vpxor	xmm10, xmm10, xmm2
-        vpxor	xmm11, xmm11, xmm3
-        vmovdqu	OWORD PTR [rdx], xmm8
-        vmovdqu	OWORD PTR [rdx+16], xmm9
-        vmovdqu	OWORD PTR [rdx+32], xmm10
-        vmovdqu	OWORD PTR [rdx+48], xmm11
-        vaesenclast	xmm12, xmm12, xmm7
-        vaesenclast	xmm13, xmm13, xmm7
-        vaesenclast	xmm14, xmm14, xmm7
-        vaesenclast	xmm15, xmm15, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+64]
-        vmovdqu	xmm1, OWORD PTR [rcx+80]
-        vmovdqu	xmm2, OWORD PTR [rcx+96]
-        vmovdqu	xmm3, OWORD PTR [rcx+112]
-        vpxor	xmm12, xmm12, xmm0
-        vpxor	xmm13, xmm13, xmm1
-        vpxor	xmm14, xmm14, xmm2
-        vpxor	xmm15, xmm15, xmm3
-        vmovdqu	OWORD PTR [rdx+64], xmm12
-        vmovdqu	OWORD PTR [rdx+80], xmm13
-        vmovdqu	OWORD PTR [rdx+96], xmm14
-        vmovdqu	OWORD PTR [rdx+112], xmm15
-        ; aesenc_128_ghash - end
-        add	ebx, 128
-        cmp	ebx, r13d
-        jl	L_AES_GCM_encrypt_avx2_ghash_128
-L_AES_GCM_encrypt_avx2_end_128:
-        vmovdqu	xmm4, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpshufb	xmm8, xmm8, xmm4
-        vpshufb	xmm9, xmm9, xmm4
-        vpshufb	xmm10, xmm10, xmm4
-        vpshufb	xmm11, xmm11, xmm4
-        vpshufb	xmm12, xmm12, xmm4
-        vpshufb	xmm13, xmm13, xmm4
-        vpshufb	xmm14, xmm14, xmm4
-        vpshufb	xmm15, xmm15, xmm4
-        vpxor	xmm8, xmm8, xmm6
-        vmovdqu	xmm7, OWORD PTR [rsp]
-        vpclmulqdq	xmm5, xmm7, xmm15, 16
-        vpclmulqdq	xmm1, xmm7, xmm15, 1
-        vpclmulqdq	xmm4, xmm7, xmm15, 0
-        vpclmulqdq	xmm6, xmm7, xmm15, 17
-        vpxor	xmm5, xmm5, xmm1
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vpclmulqdq	xmm2, xmm7, xmm14, 16
-        vpclmulqdq	xmm1, xmm7, xmm14, 1
-        vpclmulqdq	xmm0, xmm7, xmm14, 0
-        vpclmulqdq	xmm3, xmm7, xmm14, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm4, xmm4, xmm0
-        vmovdqu	xmm15, OWORD PTR [rsp+32]
-        vmovdqu	xmm7, OWORD PTR [rsp+48]
-        vpclmulqdq	xmm2, xmm15, xmm13, 16
-        vpclmulqdq	xmm1, xmm15, xmm13, 1
-        vpclmulqdq	xmm0, xmm15, xmm13, 0
-        vpclmulqdq	xmm3, xmm15, xmm13, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm4, xmm4, xmm0
-        vpclmulqdq	xmm2, xmm7, xmm12, 16
-        vpclmulqdq	xmm1, xmm7, xmm12, 1
-        vpclmulqdq	xmm0, xmm7, xmm12, 0
-        vpclmulqdq	xmm3, xmm7, xmm12, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm4, xmm4, xmm0
-        vmovdqu	xmm15, OWORD PTR [rsp+64]
-        vmovdqu	xmm7, OWORD PTR [rsp+80]
-        vpclmulqdq	xmm2, xmm15, xmm11, 16
-        vpclmulqdq	xmm1, xmm15, xmm11, 1
-        vpclmulqdq	xmm0, xmm15, xmm11, 0
-        vpclmulqdq	xmm3, xmm15, xmm11, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm4, xmm4, xmm0
-        vpclmulqdq	xmm2, xmm7, xmm10, 16
-        vpclmulqdq	xmm1, xmm7, xmm10, 1
-        vpclmulqdq	xmm0, xmm7, xmm10, 0
-        vpclmulqdq	xmm3, xmm7, xmm10, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm4, xmm4, xmm0
-        vmovdqu	xmm15, OWORD PTR [rsp+96]
-        vmovdqu	xmm7, OWORD PTR [rsp+112]
-        vpclmulqdq	xmm2, xmm15, xmm9, 16
-        vpclmulqdq	xmm1, xmm15, xmm9, 1
-        vpclmulqdq	xmm0, xmm15, xmm9, 0
-        vpclmulqdq	xmm3, xmm15, xmm9, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm4, xmm4, xmm0
-        vpclmulqdq	xmm2, xmm7, xmm8, 16
-        vpclmulqdq	xmm1, xmm7, xmm8, 1
-        vpclmulqdq	xmm0, xmm7, xmm8, 0
-        vpclmulqdq	xmm3, xmm7, xmm8, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm4, xmm4, xmm0
-        vpslldq	xmm7, xmm5, 8
-        vpsrldq	xmm5, xmm5, 8
-        vpxor	xmm4, xmm4, xmm7
-        vpxor	xmm6, xmm6, xmm5
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm4, xmm2, 16
-        vpshufd	xmm1, xmm4, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm6, xmm6, xmm1
-        vmovdqu	xmm5, OWORD PTR [rsp]
-        vmovdqu	xmm4, OWORD PTR [rsp+128]
-        vmovdqu	xmm15, OWORD PTR [rsp+144]
-L_AES_GCM_encrypt_avx2_done_128:
-        cmp	ebx, r10d
-        je	L_AES_GCM_encrypt_avx2_done_enc
-        mov	r13d, r10d
-        and	r13d, 4294967280
-        cmp	ebx, r13d
-        jge	L_AES_GCM_encrypt_avx2_last_block_done
-        ; aesenc_block
-        vmovdqu	xmm1, xmm4
-        vpshufb	xmm0, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpaddd	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_one
-        vpxor	xmm0, xmm0, [rsi]
-        vmovdqu	xmm2, OWORD PTR [rsi+16]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rsi+32]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rsi+48]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rsi+64]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rsi+80]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rsi+96]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rsi+112]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rsi+128]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rsi+144]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm4, xmm1
-        cmp	r9d, 11
-        vmovdqu	xmm1, OWORD PTR [rsi+160]
-        jl	L_AES_GCM_encrypt_avx2_aesenc_block_last
-        vaesenc	xmm0, xmm0, xmm1
-        vmovdqu	xmm2, OWORD PTR [rsi+176]
-        vaesenc	xmm0, xmm0, xmm2
-        cmp	r9d, 13
-        vmovdqu	xmm1, OWORD PTR [rsi+192]
-        jl	L_AES_GCM_encrypt_avx2_aesenc_block_last
-        vaesenc	xmm0, xmm0, xmm1
-        vmovdqu	xmm2, OWORD PTR [rsi+208]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm1, OWORD PTR [rsi+224]
-L_AES_GCM_encrypt_avx2_aesenc_block_last:
-        vaesenclast	xmm0, xmm0, xmm1
-        vmovdqu	xmm1, OWORD PTR [rdi+rbx]
-        vpxor	xmm0, xmm0, xmm1
-        vmovdqu	OWORD PTR [r8+rbx], xmm0
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm0
-        add	ebx, 16
-        cmp	ebx, r13d
-        jge	L_AES_GCM_encrypt_avx2_last_block_ghash
-L_AES_GCM_encrypt_avx2_last_block_start:
-        vmovdqu	xmm12, OWORD PTR [rdi+rbx]
-        vpshufb	xmm11, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpaddd	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_one
-        ; aesenc_gfmul_sb
-        vpclmulqdq	xmm2, xmm6, xmm5, 1
-        vpclmulqdq	xmm3, xmm6, xmm5, 16
-        vpclmulqdq	xmm1, xmm6, xmm5, 0
-        vpclmulqdq	xmm8, xmm6, xmm5, 17
-        vpxor	xmm11, xmm11, [rsi]
-        vaesenc	xmm11, xmm11, [rsi+16]
-        vpxor	xmm3, xmm3, xmm2
-        vpslldq	xmm2, xmm3, 8
-        vpsrldq	xmm3, xmm3, 8
-        vaesenc	xmm11, xmm11, [rsi+32]
-        vpxor	xmm2, xmm2, xmm1
-        vpclmulqdq	xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vaesenc	xmm11, xmm11, [rsi+48]
-        vaesenc	xmm11, xmm11, [rsi+64]
-        vaesenc	xmm11, xmm11, [rsi+80]
-        vpshufd	xmm2, xmm2, 78
-        vpxor	xmm2, xmm2, xmm1
-        vpclmulqdq	xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vaesenc	xmm11, xmm11, [rsi+96]
-        vaesenc	xmm11, xmm11, [rsi+112]
-        vaesenc	xmm11, xmm11, [rsi+128]
-        vpshufd	xmm2, xmm2, 78
-        vaesenc	xmm11, xmm11, [rsi+144]
-        vpxor	xmm8, xmm8, xmm3
-        vpxor	xmm2, xmm2, xmm8
-        vmovdqu	xmm0, OWORD PTR [rsi+160]
-        cmp	r9d, 11
-        jl	L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm11, xmm11, [rsi+176]
-        vmovdqu	xmm0, OWORD PTR [rsi+192]
-        cmp	r9d, 13
-        jl	L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm11, xmm11, [rsi+208]
-        vmovdqu	xmm0, OWORD PTR [rsi+224]
-L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last:
-        vaesenclast	xmm11, xmm11, xmm0
-        vpxor	xmm6, xmm2, xmm1
-        vpxor	xmm11, xmm11, xmm12
-        vmovdqu	OWORD PTR [r8+rbx], xmm11
-        vpshufb	xmm11, xmm11, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm11
-        add	ebx, 16
-        cmp	ebx, r13d
-        jl	L_AES_GCM_encrypt_avx2_last_block_start
-L_AES_GCM_encrypt_avx2_last_block_ghash:
-        ; ghash_gfmul_red
-        vpclmulqdq	xmm10, xmm6, xmm5, 16
-        vpclmulqdq	xmm9, xmm6, xmm5, 1
-        vpclmulqdq	xmm8, xmm6, xmm5, 0
-        vpxor	xmm10, xmm10, xmm9
-        vpslldq	xmm9, xmm10, 8
-        vpsrldq	xmm10, xmm10, 8
-        vpxor	xmm9, xmm9, xmm8
-        vpclmulqdq	xmm6, xmm6, xmm5, 17
-        vpclmulqdq	xmm8, xmm9, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm9, xmm9, 78
-        vpxor	xmm9, xmm9, xmm8
-        vpclmulqdq	xmm8, xmm9, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm9, xmm9, 78
-        vpxor	xmm6, xmm6, xmm10
-        vpxor	xmm6, xmm6, xmm9
-        vpxor	xmm6, xmm6, xmm8
-L_AES_GCM_encrypt_avx2_last_block_done:
-        mov	ecx, r10d
-        mov	edx, r10d
-        and	ecx, 15
-        jz	L_AES_GCM_encrypt_avx2_done_enc
-        ; aesenc_last15_enc
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpxor	xmm4, xmm4, [rsi]
-        vaesenc	xmm4, xmm4, [rsi+16]
-        vaesenc	xmm4, xmm4, [rsi+32]
-        vaesenc	xmm4, xmm4, [rsi+48]
-        vaesenc	xmm4, xmm4, [rsi+64]
-        vaesenc	xmm4, xmm4, [rsi+80]
-        vaesenc	xmm4, xmm4, [rsi+96]
-        vaesenc	xmm4, xmm4, [rsi+112]
-        vaesenc	xmm4, xmm4, [rsi+128]
-        vaesenc	xmm4, xmm4, [rsi+144]
-        cmp	r9d, 11
-        vmovdqu	xmm0, OWORD PTR [rsi+160]
-        jl	L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last
-        vaesenc	xmm4, xmm4, xmm0
-        vaesenc	xmm4, xmm4, [rsi+176]
-        cmp	r9d, 13
-        vmovdqu	xmm0, OWORD PTR [rsi+192]
-        jl	L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last
-        vaesenc	xmm4, xmm4, xmm0
-        vaesenc	xmm4, xmm4, [rsi+208]
-        vmovdqu	xmm0, OWORD PTR [rsi+224]
-L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last:
-        vaesenclast	xmm4, xmm4, xmm0
-        xor	ecx, ecx
-        vpxor	xmm0, xmm0, xmm0
-        vmovdqu	OWORD PTR [rsp], xmm4
-        vmovdqu	OWORD PTR [rsp+16], xmm0
-L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_loop:
-        movzx	r13d, BYTE PTR [rdi+rbx]
-        xor	r13b, BYTE PTR [rsp+rcx]
-        mov	BYTE PTR [rsp+rcx+16], r13b
-        mov	BYTE PTR [r8+rbx], r13b
-        inc	ebx
-        inc	ecx
-        cmp	ebx, edx
-        jl	L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_loop
-L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_finish_enc:
-        vmovdqu	xmm4, OWORD PTR [rsp+16]
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm4
-        ; ghash_gfmul_red
-        vpclmulqdq	xmm2, xmm6, xmm5, 16
-        vpclmulqdq	xmm1, xmm6, xmm5, 1
-        vpclmulqdq	xmm0, xmm6, xmm5, 0
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm6, xmm6, xmm5, 17
-        vpclmulqdq	xmm0, xmm1, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm6, xmm6, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        vpxor	xmm6, xmm6, xmm0
-L_AES_GCM_encrypt_avx2_done_enc:
-        ; calc_tag
-        shl	r10, 3
-        shl	r11, 3
-        vmovq	xmm0, r10
-        vmovq	xmm1, r11
-        vpunpcklqdq	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm6
-        ; ghash_gfmul_red
-        vpclmulqdq	xmm4, xmm0, xmm5, 16
-        vpclmulqdq	xmm3, xmm0, xmm5, 1
-        vpclmulqdq	xmm2, xmm0, xmm5, 0
-        vpxor	xmm4, xmm4, xmm3
-        vpslldq	xmm3, xmm4, 8
-        vpsrldq	xmm4, xmm4, 8
-        vpxor	xmm3, xmm3, xmm2
-        vpclmulqdq	xmm0, xmm0, xmm5, 17
-        vpclmulqdq	xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm3, xmm3, 78
-        vpxor	xmm3, xmm3, xmm2
-        vpclmulqdq	xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm3, xmm3, 78
-        vpxor	xmm0, xmm0, xmm4
-        vpxor	xmm0, xmm0, xmm3
-        vpxor	xmm0, xmm0, xmm2
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm0, xmm0, xmm15
-        ; store_tag
-        cmp	r14d, 16
-        je	L_AES_GCM_encrypt_avx2_store_tag_16
-        xor	rcx, rcx
-        vmovdqu	OWORD PTR [rsp], xmm0
-L_AES_GCM_encrypt_avx2_store_tag_loop:
-        movzx	r13d, BYTE PTR [rsp+rcx]
-        mov	BYTE PTR [r15+rcx], r13b
-        inc	ecx
-        cmp	ecx, r14d
-        jne	L_AES_GCM_encrypt_avx2_store_tag_loop
-        jmp	L_AES_GCM_encrypt_avx2_store_tag_done
-L_AES_GCM_encrypt_avx2_store_tag_16:
-        vmovdqu	OWORD PTR [r15], xmm0
-L_AES_GCM_encrypt_avx2_store_tag_done:
-        vzeroupper
-        add	rsp, 160
-        pop	rsi
-        pop	r14
-        pop	rbx
-        pop	r15
-        pop	r12
-        pop	rdi
-        pop	r13
-        ret
-AES_GCM_encrypt_avx2 ENDP
-_text ENDS
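
Note on the calc_tag epilogue above: both byte counts are shifted left by 3 because GCM authenticates the AAD and ciphertext lengths in bits, packed into one final 128-bit GHASH block. In SP 800-38D terms:

    S = GHASH_H( A || 0-pad || C || 0-pad || len64(A) || len64(C) )
    T = MSB_t( S XOR E_K(J0) )

E_K(J0) is carried in xmm15 for the whole call, so producing the tag costs one more GHASH multiply and an XOR rather than another AES pass; the byte-at-a-time store_tag loop covers tag lengths below 16 bytes.
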
-_text SEGMENT READONLY PARA
-AES_GCM_decrypt_avx2 PROC
-        push	r13
-        push	rdi
-        push	r12
-        push	r14
-        push	rbx
-        push	r15
-        push	rsi
-        push	rbp
-        mov	rdi, rcx
-        mov	r12, r8
-        mov	rax, r9
-        mov	r14, QWORD PTR [rsp+104]
-        mov	r8, rdx
-        mov	r10d, DWORD PTR [rsp+112]
-        mov	r11d, DWORD PTR [rsp+120]
-        mov	ebx, DWORD PTR [rsp+128]
-        mov	r15d, DWORD PTR [rsp+136]
-        mov	rsi, QWORD PTR [rsp+144]
-        mov	r9d, DWORD PTR [rsp+152]
-        mov	rbp, QWORD PTR [rsp+160]
-        sub	rsp, 168
-        vpxor	xmm4, xmm4, xmm4
-        vpxor	xmm6, xmm6, xmm6
-        mov	edx, ebx
-        cmp	edx, 12
-        je	L_AES_GCM_decrypt_avx2_iv_12
-        ; Calculate values when IV is not 12 bytes
-        ; H = Encrypt X(=0)
-        vmovdqu	xmm5, OWORD PTR [rsi]
-        vaesenc	xmm5, xmm5, [rsi+16]
-        vaesenc	xmm5, xmm5, [rsi+32]
-        vaesenc	xmm5, xmm5, [rsi+48]
-        vaesenc	xmm5, xmm5, [rsi+64]
-        vaesenc	xmm5, xmm5, [rsi+80]
-        vaesenc	xmm5, xmm5, [rsi+96]
-        vaesenc	xmm5, xmm5, [rsi+112]
-        vaesenc	xmm5, xmm5, [rsi+128]
-        vaesenc	xmm5, xmm5, [rsi+144]
-        cmp	r9d, 11
-        vmovdqu	xmm0, OWORD PTR [rsi+160]
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm5, xmm5, [rsi+176]
-        cmp	r9d, 13
-        vmovdqu	xmm0, OWORD PTR [rsi+192]
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm5, xmm5, [rsi+208]
-        vmovdqu	xmm0, OWORD PTR [rsi+224]
-L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last:
-        vaesenclast	xmm5, xmm5, xmm0
-        vpshufb	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        ; Calc counter
-        ; Initialization vector
-        cmp	edx, 0
-        mov	rcx, 0
-        je	L_AES_GCM_decrypt_avx2_calc_iv_done
-        cmp	edx, 16
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_lt16
-        and	edx, 4294967280
-L_AES_GCM_decrypt_avx2_calc_iv_16_loop:
-        vmovdqu	xmm0, OWORD PTR [rax+rcx]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm4, xmm4, xmm0
-        ; ghash_gfmul_avx
-        vpclmulqdq	xmm2, xmm5, xmm4, 16
-        vpclmulqdq	xmm1, xmm5, xmm4, 1
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm7, xmm0, xmm1
-        vpxor	xmm4, xmm3, xmm2
-        ; ghash_mid
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm4, xmm4, xmm1
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm7, xmm2, 16
-        vpshufd	xmm1, xmm7, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm4, xmm4, xmm1
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_16_loop
-        mov	edx, ebx
-        cmp	ecx, edx
-        je	L_AES_GCM_decrypt_avx2_calc_iv_done
-L_AES_GCM_decrypt_avx2_calc_iv_lt16:
-        vpxor	xmm0, xmm0, xmm0
-        xor	ebx, ebx
-        vmovdqu	OWORD PTR [rsp], xmm0
-L_AES_GCM_decrypt_avx2_calc_iv_loop:
-        movzx	r13d, BYTE PTR [rax+rcx]
-        mov	BYTE PTR [rsp+rbx], r13b
-        inc	ecx
-        inc	ebx
-        cmp	ecx, edx
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_loop
-        vmovdqu	xmm0, OWORD PTR [rsp]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm4, xmm4, xmm0
-        ; ghash_gfmul_avx
-        vpclmulqdq	xmm2, xmm5, xmm4, 16
-        vpclmulqdq	xmm1, xmm5, xmm4, 1
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm7, xmm0, xmm1
-        vpxor	xmm4, xmm3, xmm2
-        ; ghash_mid
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm4, xmm4, xmm1
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm7, xmm2, 16
-        vpshufd	xmm1, xmm7, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm4, xmm4, xmm1
-L_AES_GCM_decrypt_avx2_calc_iv_done:
-        ; T = Encrypt counter
-        vpxor	xmm0, xmm0, xmm0
-        shl	edx, 3
-        vmovq	xmm0, rdx
-        vpxor	xmm4, xmm4, xmm0
-        ; ghash_gfmul_avx
-        vpclmulqdq	xmm2, xmm5, xmm4, 16
-        vpclmulqdq	xmm1, xmm5, xmm4, 1
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm7, xmm0, xmm1
-        vpxor	xmm4, xmm3, xmm2
-        ; ghash_mid
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm4, xmm4, xmm1
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm7, xmm2, 16
-        vpshufd	xmm1, xmm7, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm4, xmm4, xmm1
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        ;   Encrypt counter
-        vmovdqu	xmm15, OWORD PTR [rsi]
-        vpxor	xmm15, xmm15, xmm4
-        vaesenc	xmm15, xmm15, [rsi+16]
-        vaesenc	xmm15, xmm15, [rsi+32]
-        vaesenc	xmm15, xmm15, [rsi+48]
-        vaesenc	xmm15, xmm15, [rsi+64]
-        vaesenc	xmm15, xmm15, [rsi+80]
-        vaesenc	xmm15, xmm15, [rsi+96]
-        vaesenc	xmm15, xmm15, [rsi+112]
-        vaesenc	xmm15, xmm15, [rsi+128]
-        vaesenc	xmm15, xmm15, [rsi+144]
-        cmp	r9d, 11
-        vmovdqu	xmm0, OWORD PTR [rsi+160]
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last
-        vaesenc	xmm15, xmm15, xmm0
-        vaesenc	xmm15, xmm15, [rsi+176]
-        cmp	r9d, 13
-        vmovdqu	xmm0, OWORD PTR [rsi+192]
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last
-        vaesenc	xmm15, xmm15, xmm0
-        vaesenc	xmm15, xmm15, [rsi+208]
-        vmovdqu	xmm0, OWORD PTR [rsi+224]
-L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last:
-        vaesenclast	xmm15, xmm15, xmm0
-        jmp	L_AES_GCM_decrypt_avx2_iv_done
-L_AES_GCM_decrypt_avx2_iv_12:
-        ; Calculate values when IV is 12 bytes
-        ; Set counter based on IV
-        vmovdqu	xmm4, OWORD PTR L_avx2_aes_gcm_bswap_one
-        vmovdqu	xmm5, OWORD PTR [rsi]
-        vpblendd	xmm4, xmm4, [rax], 7
-        ; H = Encrypt X(=0) and T = Encrypt counter
-        vmovdqu	xmm7, OWORD PTR [rsi+16]
-        vpxor	xmm15, xmm4, xmm5
-        vaesenc	xmm5, xmm5, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm0, OWORD PTR [rsi+32]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+48]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+64]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+80]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+96]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+112]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+128]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+144]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        cmp	r9d, 11
-        vmovdqu	xmm0, OWORD PTR [rsi+160]
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_12_last
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+176]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        cmp	r9d, 13
-        vmovdqu	xmm0, OWORD PTR [rsi+192]
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_12_last
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+208]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        vmovdqu	xmm0, OWORD PTR [rsi+224]
-L_AES_GCM_decrypt_avx2_calc_iv_12_last:
-        vaesenclast	xmm5, xmm5, xmm0
-        vaesenclast	xmm15, xmm15, xmm0
-        vpshufb	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_bswap_mask
-L_AES_GCM_decrypt_avx2_iv_done:
-        ; Additional authentication data
-        mov	edx, r11d
-        cmp	edx, 0
-        je	L_AES_GCM_decrypt_avx2_calc_aad_done
-        xor	ecx, ecx
-        cmp	edx, 16
-        jl	L_AES_GCM_decrypt_avx2_calc_aad_lt16
-        and	edx, 4294967280
-L_AES_GCM_decrypt_avx2_calc_aad_16_loop:
-        vmovdqu	xmm0, OWORD PTR [r12+rcx]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm0
-        ; ghash_gfmul_avx
-        vpclmulqdq	xmm2, xmm5, xmm6, 16
-        vpclmulqdq	xmm1, xmm5, xmm6, 1
-        vpclmulqdq	xmm0, xmm5, xmm6, 0
-        vpclmulqdq	xmm3, xmm5, xmm6, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm7, xmm0, xmm1
-        vpxor	xmm6, xmm3, xmm2
-        ; ghash_mid
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm6, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm6, xmm6, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm6, xmm6, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm6, xmm6, xmm1
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm7, xmm2, 16
-        vpshufd	xmm1, xmm7, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm6, xmm6, xmm1
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_decrypt_avx2_calc_aad_16_loop
-        mov	edx, r11d
-        cmp	ecx, edx
-        je	L_AES_GCM_decrypt_avx2_calc_aad_done
-L_AES_GCM_decrypt_avx2_calc_aad_lt16:
-        vpxor	xmm0, xmm0, xmm0
-        xor	ebx, ebx
-        vmovdqu	OWORD PTR [rsp], xmm0
-L_AES_GCM_decrypt_avx2_calc_aad_loop:
-        movzx	r13d, BYTE PTR [r12+rcx]
-        mov	BYTE PTR [rsp+rbx], r13b
-        inc	ecx
-        inc	ebx
-        cmp	ecx, edx
-        jl	L_AES_GCM_decrypt_avx2_calc_aad_loop
-        vmovdqu	xmm0, OWORD PTR [rsp]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm0
-        ; ghash_gfmul_avx
-        vpclmulqdq	xmm2, xmm5, xmm6, 16
-        vpclmulqdq	xmm1, xmm5, xmm6, 1
-        vpclmulqdq	xmm0, xmm5, xmm6, 0
-        vpclmulqdq	xmm3, xmm5, xmm6, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm7, xmm0, xmm1
-        vpxor	xmm6, xmm3, xmm2
-        ; ghash_mid
-        vpsrld	xmm0, xmm7, 31
-        vpsrld	xmm1, xmm6, 31
-        vpslld	xmm7, xmm7, 1
-        vpslld	xmm6, xmm6, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm6, xmm6, xmm2
-        vpor	xmm7, xmm7, xmm0
-        vpor	xmm6, xmm6, xmm1
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm7, xmm2, 16
-        vpshufd	xmm1, xmm7, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm6, xmm6, xmm1
-L_AES_GCM_decrypt_avx2_calc_aad_done:
-        ; Calculate counter and H
-        vpsrlq	xmm1, xmm5, 63
-        vpsllq	xmm0, xmm5, 1
-        vpslldq	xmm1, xmm1, 8
-        vpor	xmm0, xmm0, xmm1
-        vpshufd	xmm5, xmm5, 255
-        vpsrad	xmm5, xmm5, 31
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpand	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpaddd	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_one
-        vpxor	xmm5, xmm5, xmm0
-        xor	ebx, ebx
-        cmp	r10d, 128
-        mov	r13d, r10d
-        jl	L_AES_GCM_decrypt_avx2_done_128
-        and	r13d, 4294967168
-        vmovdqu	OWORD PTR [rsp+128], xmm4
-        vmovdqu	OWORD PTR [rsp+144], xmm15
-        vmovdqu	xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128
-        ; H ^ 1 and H ^ 2
-        vpclmulqdq	xmm9, xmm5, xmm5, 0
-        vpclmulqdq	xmm10, xmm5, xmm5, 17
-        vpclmulqdq	xmm8, xmm9, xmm3, 16
-        vpshufd	xmm9, xmm9, 78
-        vpxor	xmm9, xmm9, xmm8
-        vpclmulqdq	xmm8, xmm9, xmm3, 16
-        vpshufd	xmm9, xmm9, 78
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm0, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp], xmm5
-        vmovdqu	OWORD PTR [rsp+16], xmm0
-        ; H ^ 3 and H ^ 4
-        vpclmulqdq	xmm11, xmm0, xmm5, 16
-        vpclmulqdq	xmm10, xmm0, xmm5, 1
-        vpclmulqdq	xmm9, xmm0, xmm5, 0
-        vpclmulqdq	xmm12, xmm0, xmm5, 17
-        vpclmulqdq	xmm13, xmm0, xmm0, 0
-        vpclmulqdq	xmm14, xmm0, xmm0, 17
-        vpxor	xmm11, xmm11, xmm10
-        vpslldq	xmm10, xmm11, 8
-        vpsrldq	xmm11, xmm11, 8
-        vpxor	xmm10, xmm10, xmm9
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm10, xmm10, xmm9
-        vpxor	xmm13, xmm13, xmm8
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm12, xmm12, xmm11
-        vpxor	xmm13, xmm13, xmm8
-        vpxor	xmm10, xmm10, xmm12
-        vpxor	xmm2, xmm13, xmm14
-        vpxor	xmm1, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp+32], xmm1
-        vmovdqu	OWORD PTR [rsp+48], xmm2
-        ; H ^ 5 and H ^ 6
-        vpclmulqdq	xmm11, xmm1, xmm0, 16
-        vpclmulqdq	xmm10, xmm1, xmm0, 1
-        vpclmulqdq	xmm9, xmm1, xmm0, 0
-        vpclmulqdq	xmm12, xmm1, xmm0, 17
-        vpclmulqdq	xmm13, xmm1, xmm1, 0
-        vpclmulqdq	xmm14, xmm1, xmm1, 17
-        vpxor	xmm11, xmm11, xmm10
-        vpslldq	xmm10, xmm11, 8
-        vpsrldq	xmm11, xmm11, 8
-        vpxor	xmm10, xmm10, xmm9
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm10, xmm10, xmm9
-        vpxor	xmm13, xmm13, xmm8
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm12, xmm12, xmm11
-        vpxor	xmm13, xmm13, xmm8
-        vpxor	xmm10, xmm10, xmm12
-        vpxor	xmm0, xmm13, xmm14
-        vpxor	xmm7, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp+64], xmm7
-        vmovdqu	OWORD PTR [rsp+80], xmm0
-        ; H ^ 7 and H ^ 8
-        vpclmulqdq	xmm11, xmm2, xmm1, 16
-        vpclmulqdq	xmm10, xmm2, xmm1, 1
-        vpclmulqdq	xmm9, xmm2, xmm1, 0
-        vpclmulqdq	xmm12, xmm2, xmm1, 17
-        vpclmulqdq	xmm13, xmm2, xmm2, 0
-        vpclmulqdq	xmm14, xmm2, xmm2, 17
-        vpxor	xmm11, xmm11, xmm10
-        vpslldq	xmm10, xmm11, 8
-        vpsrldq	xmm11, xmm11, 8
-        vpxor	xmm10, xmm10, xmm9
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm10, xmm10, xmm9
-        vpxor	xmm13, xmm13, xmm8
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm12, xmm12, xmm11
-        vpxor	xmm13, xmm13, xmm8
-        vpxor	xmm10, xmm10, xmm12
-        vpxor	xmm0, xmm13, xmm14
-        vpxor	xmm7, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp+96], xmm7
-        vmovdqu	OWORD PTR [rsp+112], xmm0
-L_AES_GCM_decrypt_avx2_ghash_128:
-        ; aesenc_128_ghash
-        lea	rcx, QWORD PTR [rdi+rbx]
-        lea	rdx, QWORD PTR [r8+rbx]
-        ; aesenc_ctr
-        vmovdqu	xmm0, OWORD PTR [rsp+128]
-        vmovdqu	xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpaddd	xmm9, xmm0, OWORD PTR L_avx2_aes_gcm_one
-        vpshufb	xmm8, xmm0, xmm1
-        vpaddd	xmm10, xmm0, OWORD PTR L_avx2_aes_gcm_two
-        vpshufb	xmm9, xmm9, xmm1
-        vpaddd	xmm11, xmm0, OWORD PTR L_avx2_aes_gcm_three
-        vpshufb	xmm10, xmm10, xmm1
-        vpaddd	xmm12, xmm0, OWORD PTR L_avx2_aes_gcm_four
-        vpshufb	xmm11, xmm11, xmm1
-        vpaddd	xmm13, xmm0, OWORD PTR L_avx2_aes_gcm_five
-        vpshufb	xmm12, xmm12, xmm1
-        vpaddd	xmm14, xmm0, OWORD PTR L_avx2_aes_gcm_six
-        vpshufb	xmm13, xmm13, xmm1
-        vpaddd	xmm15, xmm0, OWORD PTR L_avx2_aes_gcm_seven
-        vpshufb	xmm14, xmm14, xmm1
-        vpaddd	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_eight
-        vpshufb	xmm15, xmm15, xmm1
-        ; aesenc_xor
-        vmovdqu	xmm7, OWORD PTR [rsi]
-        vmovdqu	OWORD PTR [rsp+128], xmm0
-        vpxor	xmm8, xmm8, xmm7
-        vpxor	xmm9, xmm9, xmm7
-        vpxor	xmm10, xmm10, xmm7
-        vpxor	xmm11, xmm11, xmm7
-        vpxor	xmm12, xmm12, xmm7
-        vpxor	xmm13, xmm13, xmm7
-        vpxor	xmm14, xmm14, xmm7
-        vpxor	xmm15, xmm15, xmm7
-        ; aesenc_pclmul_1
-        vmovdqu	xmm1, OWORD PTR [rcx]
-        vmovdqu	xmm0, OWORD PTR [rsi+16]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vmovdqu	xmm2, OWORD PTR [rsp+112]
-        vpxor	xmm1, xmm1, xmm6
-        vpclmulqdq	xmm5, xmm1, xmm2, 16
-        vpclmulqdq	xmm3, xmm1, xmm2, 1
-        vpclmulqdq	xmm6, xmm1, xmm2, 0
-        vpclmulqdq	xmm7, xmm1, xmm2, 17
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_2
-        vmovdqu	xmm1, OWORD PTR [rcx+16]
-        vmovdqu	xmm0, OWORD PTR [rsp+96]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rsi+32]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rcx+32]
-        vmovdqu	xmm0, OWORD PTR [rsp+80]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rsi+48]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rcx+48]
-        vmovdqu	xmm0, OWORD PTR [rsp+64]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rsi+64]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rcx+64]
-        vmovdqu	xmm0, OWORD PTR [rsp+48]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rsi+80]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rcx+80]
-        vmovdqu	xmm0, OWORD PTR [rsp+32]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rsi+96]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rcx+96]
-        vmovdqu	xmm0, OWORD PTR [rsp+16]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rsi+112]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rcx+112]
-        vmovdqu	xmm0, OWORD PTR [rsp]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rsi+128]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_l
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm6, xmm6, xmm4
-        vpxor	xmm5, xmm5, xmm3
-        vpslldq	xmm1, xmm5, 8
-        vpsrldq	xmm5, xmm5, 8
-        vmovdqu	xmm4, OWORD PTR [rsi+144]
-        vmovdqu	xmm0, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vaesenc	xmm8, xmm8, xmm4
-        vpxor	xmm6, xmm6, xmm1
-        vpxor	xmm7, xmm7, xmm5
-        vpclmulqdq	xmm3, xmm6, xmm0, 16
-        vaesenc	xmm9, xmm9, xmm4
-        vaesenc	xmm10, xmm10, xmm4
-        vaesenc	xmm11, xmm11, xmm4
-        vpshufd	xmm6, xmm6, 78
-        vpxor	xmm6, xmm6, xmm3
-        vpclmulqdq	xmm3, xmm6, xmm0, 16
-        vaesenc	xmm12, xmm12, xmm4
-        vaesenc	xmm13, xmm13, xmm4
-        vaesenc	xmm14, xmm14, xmm4
-        vpshufd	xmm6, xmm6, 78
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm6, xmm6, xmm7
-        vaesenc	xmm15, xmm15, xmm4
-        cmp	r9d, 11
-        vmovdqu	xmm7, OWORD PTR [rsi+160]
-        jl	L_AES_GCM_decrypt_avx2_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+176]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r9d, 13
-        vmovdqu	xmm7, OWORD PTR [rsi+192]
-        jl	L_AES_GCM_decrypt_avx2_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+208]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rsi+224]
-L_AES_GCM_decrypt_avx2_aesenc_128_ghash_avx_done:
-        ; aesenc_last
-        vaesenclast	xmm8, xmm8, xmm7
-        vaesenclast	xmm9, xmm9, xmm7
-        vaesenclast	xmm10, xmm10, xmm7
-        vaesenclast	xmm11, xmm11, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx]
-        vmovdqu	xmm1, OWORD PTR [rcx+16]
-        vmovdqu	xmm2, OWORD PTR [rcx+32]
-        vmovdqu	xmm3, OWORD PTR [rcx+48]
-        vpxor	xmm8, xmm8, xmm0
-        vpxor	xmm9, xmm9, xmm1
-        vpxor	xmm10, xmm10, xmm2
-        vpxor	xmm11, xmm11, xmm3
-        vmovdqu	OWORD PTR [rdx], xmm8
-        vmovdqu	OWORD PTR [rdx+16], xmm9
-        vmovdqu	OWORD PTR [rdx+32], xmm10
-        vmovdqu	OWORD PTR [rdx+48], xmm11
-        vaesenclast	xmm12, xmm12, xmm7
-        vaesenclast	xmm13, xmm13, xmm7
-        vaesenclast	xmm14, xmm14, xmm7
-        vaesenclast	xmm15, xmm15, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+64]
-        vmovdqu	xmm1, OWORD PTR [rcx+80]
-        vmovdqu	xmm2, OWORD PTR [rcx+96]
-        vmovdqu	xmm3, OWORD PTR [rcx+112]
-        vpxor	xmm12, xmm12, xmm0
-        vpxor	xmm13, xmm13, xmm1
-        vpxor	xmm14, xmm14, xmm2
-        vpxor	xmm15, xmm15, xmm3
-        vmovdqu	OWORD PTR [rdx+64], xmm12
-        vmovdqu	OWORD PTR [rdx+80], xmm13
-        vmovdqu	OWORD PTR [rdx+96], xmm14
-        vmovdqu	OWORD PTR [rdx+112], xmm15
-        ; aesenc_128_ghash - end
-        add	ebx, 128
-        cmp	ebx, r13d
-        jl	L_AES_GCM_decrypt_avx2_ghash_128
-        vmovdqu	xmm5, OWORD PTR [rsp]
-        vmovdqu	xmm4, OWORD PTR [rsp+128]
-        vmovdqu	xmm15, OWORD PTR [rsp+144]
-L_AES_GCM_decrypt_avx2_done_128:
-        cmp	ebx, r10d
-        jge	L_AES_GCM_decrypt_avx2_done_dec
-        mov	r13d, r10d
-        and	r13d, 4294967280
-        cmp	ebx, r13d
-        jge	L_AES_GCM_decrypt_avx2_last_block_done
-L_AES_GCM_decrypt_avx2_last_block_start:
-        vmovdqu	xmm11, OWORD PTR [rdi+rbx]
-        vpshufb	xmm10, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpshufb	xmm12, xmm11, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpaddd	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_one
-        vpxor	xmm12, xmm12, xmm6
-        ; aesenc_gfmul_sb
-        vpclmulqdq	xmm2, xmm12, xmm5, 1
-        vpclmulqdq	xmm3, xmm12, xmm5, 16
-        vpclmulqdq	xmm1, xmm12, xmm5, 0
-        vpclmulqdq	xmm8, xmm12, xmm5, 17
-        vpxor	xmm10, xmm10, [rsi]
-        vaesenc	xmm10, xmm10, [rsi+16]
-        vpxor	xmm3, xmm3, xmm2
-        vpslldq	xmm2, xmm3, 8
-        vpsrldq	xmm3, xmm3, 8
-        vaesenc	xmm10, xmm10, [rsi+32]
-        vpxor	xmm2, xmm2, xmm1
-        vpclmulqdq	xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vaesenc	xmm10, xmm10, [rsi+48]
-        vaesenc	xmm10, xmm10, [rsi+64]
-        vaesenc	xmm10, xmm10, [rsi+80]
-        vpshufd	xmm2, xmm2, 78
-        vpxor	xmm2, xmm2, xmm1
-        vpclmulqdq	xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vaesenc	xmm10, xmm10, [rsi+96]
-        vaesenc	xmm10, xmm10, [rsi+112]
-        vaesenc	xmm10, xmm10, [rsi+128]
-        vpshufd	xmm2, xmm2, 78
-        vaesenc	xmm10, xmm10, [rsi+144]
-        vpxor	xmm8, xmm8, xmm3
-        vpxor	xmm2, xmm2, xmm8
-        vmovdqu	xmm0, OWORD PTR [rsi+160]
-        cmp	r9d, 11
-        jl	L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm10, xmm10, [rsi+176]
-        vmovdqu	xmm0, OWORD PTR [rsi+192]
-        cmp	r9d, 13
-        jl	L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm10, xmm10, [rsi+208]
-        vmovdqu	xmm0, OWORD PTR [rsi+224]
-L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last:
-        vaesenclast	xmm10, xmm10, xmm0
-        vpxor	xmm6, xmm2, xmm1
-        vpxor	xmm10, xmm10, xmm11
-        vmovdqu	OWORD PTR [r8+rbx], xmm10
-        add	ebx, 16
-        cmp	ebx, r13d
-        jl	L_AES_GCM_decrypt_avx2_last_block_start
-L_AES_GCM_decrypt_avx2_last_block_done:
-        mov	ecx, r10d
-        mov	edx, r10d
-        and	ecx, 15
-        jz	L_AES_GCM_decrypt_avx2_done_dec
-        ; aesenc_last15_dec
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpxor	xmm4, xmm4, [rsi]
-        vaesenc	xmm4, xmm4, [rsi+16]
-        vaesenc	xmm4, xmm4, [rsi+32]
-        vaesenc	xmm4, xmm4, [rsi+48]
-        vaesenc	xmm4, xmm4, [rsi+64]
-        vaesenc	xmm4, xmm4, [rsi+80]
-        vaesenc	xmm4, xmm4, [rsi+96]
-        vaesenc	xmm4, xmm4, [rsi+112]
-        vaesenc	xmm4, xmm4, [rsi+128]
-        vaesenc	xmm4, xmm4, [rsi+144]
-        cmp	r9d, 11
-        vmovdqu	xmm1, OWORD PTR [rsi+160]
-        jl	L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last
-        vaesenc	xmm4, xmm4, xmm1
-        vaesenc	xmm4, xmm4, [rsi+176]
-        cmp	r9d, 13
-        vmovdqu	xmm1, OWORD PTR [rsi+192]
-        jl	L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last
-        vaesenc	xmm4, xmm4, xmm1
-        vaesenc	xmm4, xmm4, [rsi+208]
-        vmovdqu	xmm1, OWORD PTR [rsi+224]
-L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last:
-        vaesenclast	xmm4, xmm4, xmm1
-        xor	ecx, ecx
-        vpxor	xmm0, xmm0, xmm0
-        vmovdqu	OWORD PTR [rsp], xmm4
-        vmovdqu	OWORD PTR [rsp+16], xmm0
-L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_loop:
-        movzx	r13d, BYTE PTR [rdi+rbx]
-        mov	BYTE PTR [rsp+rcx+16], r13b
-        xor	r13b, BYTE PTR [rsp+rcx]
-        mov	BYTE PTR [r8+rbx], r13b
-        inc	ebx
-        inc	ecx
-        cmp	ebx, edx
-        jl	L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_loop
-        vmovdqu	xmm4, OWORD PTR [rsp+16]
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm4
-        ; ghash_gfmul_red
-        vpclmulqdq	xmm2, xmm6, xmm5, 16
-        vpclmulqdq	xmm1, xmm6, xmm5, 1
-        vpclmulqdq	xmm0, xmm6, xmm5, 0
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm6, xmm6, xmm5, 17
-        vpclmulqdq	xmm0, xmm1, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm6, xmm6, xmm2
-        vpxor	xmm6, xmm6, xmm1
-        vpxor	xmm6, xmm6, xmm0
-L_AES_GCM_decrypt_avx2_done_dec:
-        ; calc_tag
-        shl	r10, 3
-        shl	r11, 3
-        vmovq	xmm0, r10
-        vmovq	xmm1, r11
-        vpunpcklqdq	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm6
-        ; ghash_gfmul_red
-        vpclmulqdq	xmm4, xmm0, xmm5, 16
-        vpclmulqdq	xmm3, xmm0, xmm5, 1
-        vpclmulqdq	xmm2, xmm0, xmm5, 0
-        vpxor	xmm4, xmm4, xmm3
-        vpslldq	xmm3, xmm4, 8
-        vpsrldq	xmm4, xmm4, 8
-        vpxor	xmm3, xmm3, xmm2
-        vpclmulqdq	xmm0, xmm0, xmm5, 17
-        vpclmulqdq	xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm3, xmm3, 78
-        vpxor	xmm3, xmm3, xmm2
-        vpclmulqdq	xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm3, xmm3, 78
-        vpxor	xmm0, xmm0, xmm4
-        vpxor	xmm0, xmm0, xmm3
-        vpxor	xmm0, xmm0, xmm2
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm0, xmm0, xmm15
-        ; cmp_tag
-        cmp	r15d, 16
-        je	L_AES_GCM_decrypt_avx2_cmp_tag_16
-        xor	rdx, rdx
-        xor	rax, rax
-        vmovdqu	OWORD PTR [rsp], xmm0
-L_AES_GCM_decrypt_avx2_cmp_tag_loop:
-        movzx	r13d, BYTE PTR [rsp+rdx]
-        xor	r13b, BYTE PTR [r14+rdx]
-        or	al, r13b
-        inc	edx
-        cmp	edx, r15d
-        jne	L_AES_GCM_decrypt_avx2_cmp_tag_loop
-        cmp	rax, 0
-        sete	al
-        jmp	L_AES_GCM_decrypt_avx2_cmp_tag_done
-L_AES_GCM_decrypt_avx2_cmp_tag_16:
-        vmovdqu	xmm1, OWORD PTR [r14]
-        vpcmpeqb	xmm0, xmm0, xmm1
-        vpmovmskb	rdx, xmm0
-        ; if edx == 0xFFFF then return 1, else return 0
-        xor	eax, eax
-        cmp	edx, 65535
-        sete	al
-L_AES_GCM_decrypt_avx2_cmp_tag_done:
-        mov	DWORD PTR [rbp], eax
-        vzeroupper
-        add	rsp, 168
-        pop	rbp
-        pop	rsi
-        pop	r15
-        pop	rbx
-        pop	r14
-        pop	r12
-        pop	rdi
-        pop	r13
-        ret
-AES_GCM_decrypt_avx2 ENDP
-_text ENDS
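
The cmp_tag epilogue above is branch-free with respect to the tag contents: the short-tag path ORs the XOR of every byte pair into al and tests the accumulator once, and the 16-byte path compares the vpmovmskb mask against 0xFFFF, with a single sete either way. A C sketch of the same constant-time shape (hypothetical helper name, not wolfSSL's API):

    #include <stddef.h>

    /* Accumulate all byte differences, then test once, so timing does not
       depend on where the first mismatch occurs (mirrors the cmp_tag loop). */
    static int gcm_tag_equal(const unsigned char* a, const unsigned char* b,
                             size_t n)
    {
        unsigned char acc = 0;
        size_t i;
        for (i = 0; i < n; i++)
            acc |= (unsigned char)(a[i] ^ b[i]);
        return acc == 0; /* 1 on match, 0 on mismatch, like the sete al above */
    }
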
-_text SEGMENT READONLY PARA
-AES_GCM_init_avx2 PROC
-        push	rbx
-        push	rdi
-        push	rsi
-        push	r12
-        mov	rdi, rcx
-        mov	rsi, rdx
-        mov	r10, r8
-        mov	r11d, r9d
-        mov	rax, QWORD PTR [rsp+72]
-        mov	r8, QWORD PTR [rsp+80]
-        mov	r9, QWORD PTR [rsp+88]
-        sub	rsp, 16
-        vpxor	xmm4, xmm4, xmm4
-        mov	edx, r11d
-        cmp	edx, 12
-        je	L_AES_GCM_init_avx2_iv_12
-        ; Calculate values when IV is not 12 bytes
-        ; H = Encrypt X(=0)
-        vmovdqu	xmm5, OWORD PTR [rdi]
-        vaesenc	xmm5, xmm5, [rdi+16]
-        vaesenc	xmm5, xmm5, [rdi+32]
-        vaesenc	xmm5, xmm5, [rdi+48]
-        vaesenc	xmm5, xmm5, [rdi+64]
-        vaesenc	xmm5, xmm5, [rdi+80]
-        vaesenc	xmm5, xmm5, [rdi+96]
-        vaesenc	xmm5, xmm5, [rdi+112]
-        vaesenc	xmm5, xmm5, [rdi+128]
-        vaesenc	xmm5, xmm5, [rdi+144]
-        cmp	esi, 11
-        vmovdqu	xmm0, OWORD PTR [rdi+160]
-        jl	L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm5, xmm5, [rdi+176]
-        cmp	esi, 13
-        vmovdqu	xmm0, OWORD PTR [rdi+192]
-        jl	L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm5, xmm5, [rdi+208]
-        vmovdqu	xmm0, OWORD PTR [rdi+224]
-L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last:
-        vaesenclast	xmm5, xmm5, xmm0
-        vpshufb	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        ; Calc counter
-        ; Initialization vector
-        cmp	edx, 0
-        mov	rcx, 0
-        je	L_AES_GCM_init_avx2_calc_iv_done
-        cmp	edx, 16
-        jl	L_AES_GCM_init_avx2_calc_iv_lt16
-        and	edx, 4294967280
-L_AES_GCM_init_avx2_calc_iv_16_loop:
-        vmovdqu	xmm0, OWORD PTR [r10+rcx]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm4, xmm4, xmm0
-        ; ghash_gfmul_avx
-        vpclmulqdq	xmm2, xmm5, xmm4, 16
-        vpclmulqdq	xmm1, xmm5, xmm4, 1
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm6, xmm0, xmm1
-        vpxor	xmm4, xmm3, xmm2
-        ; ghash_mid
-        vpsrld	xmm0, xmm6, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm6, xmm6, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm6, xmm6, xmm0
-        vpor	xmm4, xmm4, xmm1
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm6, xmm2, 16
-        vpshufd	xmm1, xmm6, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm4, xmm4, xmm1
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_init_avx2_calc_iv_16_loop
-        mov	edx, r11d
-        cmp	ecx, edx
-        je	L_AES_GCM_init_avx2_calc_iv_done
-L_AES_GCM_init_avx2_calc_iv_lt16:
-        vpxor	xmm0, xmm0, xmm0
-        xor	ebx, ebx
-        vmovdqu	OWORD PTR [rsp], xmm0
-L_AES_GCM_init_avx2_calc_iv_loop:
-        movzx	r12d, BYTE PTR [r10+rcx]
-        mov	BYTE PTR [rsp+rbx], r12b
-        inc	ecx
-        inc	ebx
-        cmp	ecx, edx
-        jl	L_AES_GCM_init_avx2_calc_iv_loop
-        vmovdqu	xmm0, OWORD PTR [rsp]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm4, xmm4, xmm0
-        ; ghash_gfmul_avx
-        vpclmulqdq	xmm2, xmm5, xmm4, 16
-        vpclmulqdq	xmm1, xmm5, xmm4, 1
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm6, xmm0, xmm1
-        vpxor	xmm4, xmm3, xmm2
-        ; ghash_mid
-        vpsrld	xmm0, xmm6, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm6, xmm6, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm6, xmm6, xmm0
-        vpor	xmm4, xmm4, xmm1
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm6, xmm2, 16
-        vpshufd	xmm1, xmm6, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm4, xmm4, xmm1
-L_AES_GCM_init_avx2_calc_iv_done:
-        ; T = Encrypt counter
-        vpxor	xmm0, xmm0, xmm0
-        shl	edx, 3
-        vmovq	xmm0, rdx
-        vpxor	xmm4, xmm4, xmm0
-        ; ghash_gfmul_avx
-        vpclmulqdq	xmm2, xmm5, xmm4, 16
-        vpclmulqdq	xmm1, xmm5, xmm4, 1
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm6, xmm0, xmm1
-        vpxor	xmm4, xmm3, xmm2
-        ; ghash_mid
-        vpsrld	xmm0, xmm6, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm6, xmm6, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm6, xmm6, xmm0
-        vpor	xmm4, xmm4, xmm1
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm6, xmm2, 16
-        vpshufd	xmm1, xmm6, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm4, xmm4, xmm1
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        ;   Encrypt counter
-        vmovdqu	xmm7, OWORD PTR [rdi]
-        vpxor	xmm7, xmm7, xmm4
-        vaesenc	xmm7, xmm7, [rdi+16]
-        vaesenc	xmm7, xmm7, [rdi+32]
-        vaesenc	xmm7, xmm7, [rdi+48]
-        vaesenc	xmm7, xmm7, [rdi+64]
-        vaesenc	xmm7, xmm7, [rdi+80]
-        vaesenc	xmm7, xmm7, [rdi+96]
-        vaesenc	xmm7, xmm7, [rdi+112]
-        vaesenc	xmm7, xmm7, [rdi+128]
-        vaesenc	xmm7, xmm7, [rdi+144]
-        cmp	esi, 11
-        vmovdqu	xmm0, OWORD PTR [rdi+160]
-        jl	L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last
-        vaesenc	xmm7, xmm7, xmm0
-        vaesenc	xmm7, xmm7, [rdi+176]
-        cmp	esi, 13
-        vmovdqu	xmm0, OWORD PTR [rdi+192]
-        jl	L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last
-        vaesenc	xmm7, xmm7, xmm0
-        vaesenc	xmm7, xmm7, [rdi+208]
-        vmovdqu	xmm0, OWORD PTR [rdi+224]
-L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last:
-        vaesenclast	xmm7, xmm7, xmm0
-        jmp	L_AES_GCM_init_avx2_iv_done
-L_AES_GCM_init_avx2_iv_12:
-        ; Calculate values when IV is 12 bytes
-        ; Set counter based on IV
-        vmovdqu	xmm4, OWORD PTR L_avx2_aes_gcm_bswap_one
-        vmovdqu	xmm5, OWORD PTR [rdi]
-        vpblendd	xmm4, xmm4, [r10], 7
-        ; H = Encrypt X(=0) and T = Encrypt counter
-        vmovdqu	xmm6, OWORD PTR [rdi+16]
-        vpxor	xmm7, xmm4, xmm5
-        vaesenc	xmm5, xmm5, xmm6
-        vaesenc	xmm7, xmm7, xmm6
-        vmovdqu	xmm0, OWORD PTR [rdi+32]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm7, xmm7, xmm0
-        vmovdqu	xmm0, OWORD PTR [rdi+48]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm7, xmm7, xmm0
-        vmovdqu	xmm0, OWORD PTR [rdi+64]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm7, xmm7, xmm0
-        vmovdqu	xmm0, OWORD PTR [rdi+80]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm7, xmm7, xmm0
-        vmovdqu	xmm0, OWORD PTR [rdi+96]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm7, xmm7, xmm0
-        vmovdqu	xmm0, OWORD PTR [rdi+112]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm7, xmm7, xmm0
-        vmovdqu	xmm0, OWORD PTR [rdi+128]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm7, xmm7, xmm0
-        vmovdqu	xmm0, OWORD PTR [rdi+144]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm7, xmm7, xmm0
-        cmp	esi, 11
-        vmovdqu	xmm0, OWORD PTR [rdi+160]
-        jl	L_AES_GCM_init_avx2_calc_iv_12_last
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm7, xmm7, xmm0
-        vmovdqu	xmm0, OWORD PTR [rdi+176]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm7, xmm7, xmm0
-        cmp	esi, 13
-        vmovdqu	xmm0, OWORD PTR [rdi+192]
-        jl	L_AES_GCM_init_avx2_calc_iv_12_last
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm7, xmm7, xmm0
-        vmovdqu	xmm0, OWORD PTR [rdi+208]
-        vaesenc	xmm5, xmm5, xmm0
-        vaesenc	xmm7, xmm7, xmm0
-        vmovdqu	xmm0, OWORD PTR [rdi+224]
-L_AES_GCM_init_avx2_calc_iv_12_last:
-        vaesenclast	xmm5, xmm5, xmm0
-        vaesenclast	xmm7, xmm7, xmm0
-        vpshufb	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_bswap_mask
-L_AES_GCM_init_avx2_iv_done:
-        vmovdqu	OWORD PTR [r9], xmm7
-        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpaddd	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_one
-        vmovdqu	OWORD PTR [rax], xmm5
-        vmovdqu	OWORD PTR [r8], xmm4
-        vzeroupper
-        add	rsp, 16
-        pop	r12
-        pop	rsi
-        pop	rdi
-        pop	rbx
-        ret
-AES_GCM_init_avx2 ENDP
-_text ENDS
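
AES_GCM_init_avx2 above produces the three values the streaming entry points consume: the hash key H = E_K(0^128) (stored byte-reflected for the PCLMULQDQ path), the first message counter, and E_K(J0) for the final tag. Its two IV paths are the two cases of SP 800-38D:

    J0 = IV || 0x00000001                              when len(IV) = 96 bits (the iv_12 path)
    J0 = GHASH_H( IV || 0-pad || 0^64 || len64(IV) )   otherwise

The counter written back through r8 is J0 already incremented once, matching the vpaddd by L_avx2_aes_gcm_one just before the stores.
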
-_text SEGMENT READONLY PARA
-AES_GCM_aad_update_avx2 PROC
-        mov	rax, rcx
-        vmovdqu	xmm4, OWORD PTR [r8]
-        vmovdqu	xmm5, OWORD PTR [r9]
-        xor	ecx, ecx
-L_AES_GCM_aad_update_avx2_16_loop:
-        vmovdqu	xmm0, OWORD PTR [rax+rcx]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm4, xmm4, xmm0
-        ; ghash_gfmul_avx
-        vpclmulqdq	xmm2, xmm5, xmm4, 16
-        vpclmulqdq	xmm1, xmm5, xmm4, 1
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm6, xmm0, xmm1
-        vpxor	xmm4, xmm3, xmm2
-        ; ghash_mid
-        vpsrld	xmm0, xmm6, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm6, xmm6, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm6, xmm6, xmm0
-        vpor	xmm4, xmm4, xmm1
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm6, xmm2, 16
-        vpshufd	xmm1, xmm6, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm4, xmm4, xmm1
-        add	ecx, 16
-        cmp	ecx, edx
-        jl	L_AES_GCM_aad_update_avx2_16_loop
-        vmovdqu	OWORD PTR [r8], xmm4
-        vzeroupper
-        ret
-AES_GCM_aad_update_avx2 ENDP
-_text ENDS
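
AES_GCM_aad_update_avx2 is the bare GHASH chain over whole 16-byte AAD blocks:

    Y_i = (Y_{i-1} XOR A_i) * H        in GF(2^128), reduced mod x^128 + x^7 + x^2 + x + 1

Each iteration above is one such step: the vpxor into xmm4 is the XOR, the four vpclmulqdq results plus the one-bit shift in ghash_mid form the 256-bit product, and ghash_red folds it back to 128 bits with two multiplies by the reduction constant.
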
-_text SEGMENT READONLY PARA
-AES_GCM_encrypt_block_avx2 PROC
-        mov	r10, r8
-        mov	r11, r9
-        mov	rax, QWORD PTR [rsp+40]
-        sub	rsp, 152
-        vmovdqu	xmm3, OWORD PTR [rax]
-        ; aesenc_block
-        vmovdqu	xmm1, xmm3
-        vpshufb	xmm0, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpaddd	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_one
-        vpxor	xmm0, xmm0, [rcx]
-        vmovdqu	xmm2, OWORD PTR [rcx+16]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rcx+32]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rcx+48]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rcx+64]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rcx+80]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rcx+96]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rcx+112]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rcx+128]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rcx+144]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm3, xmm1
-        cmp	edx, 11
-        vmovdqu	xmm1, OWORD PTR [rcx+160]
-        jl	L_AES_GCM_encrypt_block_avx2_aesenc_block_last
-        vaesenc	xmm0, xmm0, xmm1
-        vmovdqu	xmm2, OWORD PTR [rcx+176]
-        vaesenc	xmm0, xmm0, xmm2
-        cmp	edx, 13
-        vmovdqu	xmm1, OWORD PTR [rcx+192]
-        jl	L_AES_GCM_encrypt_block_avx2_aesenc_block_last
-        vaesenc	xmm0, xmm0, xmm1
-        vmovdqu	xmm2, OWORD PTR [rcx+208]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm1, OWORD PTR [rcx+224]
-L_AES_GCM_encrypt_block_avx2_aesenc_block_last:
-        vaesenclast	xmm0, xmm0, xmm1
-        vmovdqu	xmm1, OWORD PTR [r11]
-        vpxor	xmm0, xmm0, xmm1
-        vmovdqu	OWORD PTR [r10], xmm0
-        vmovdqu	OWORD PTR [rax], xmm3
-        vzeroupper
-        add	rsp, 152
-        ret
-AES_GCM_encrypt_block_avx2 ENDP
-_text ENDS
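
AES_GCM_encrypt_block_avx2 is a single CTR step: swap the saved counter into big-endian block order, run it through the 10, 12, or 14 rounds selected by the round-count argument, XOR the keystream into the input block, and store the counter incremented by one. At spec level the increment is inc32, a big-endian add on the block's last 32 bits; a C sketch (hypothetical name; the assembly instead keeps the counter pre-swapped and uses vpaddd):

    /* inc32 from SP 800-38D: increment the last 32 bits of the counter block,
       big-endian, wrapping modulo 2^32. */
    static void inc32(unsigned char ctr[16])
    {
        int i;
        for (i = 15; i >= 12; i--)
            if (++ctr[i] != 0) /* stop as soon as a byte does not wrap */
                break;
    }
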
-_text SEGMENT READONLY PARA
-AES_GCM_ghash_block_avx2 PROC
-        vmovdqu	xmm4, OWORD PTR [rdx]
-        vmovdqu	xmm5, OWORD PTR [r8]
-        vmovdqu	xmm0, OWORD PTR [rcx]
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm4, xmm4, xmm0
-        ; ghash_gfmul_avx
-        vpclmulqdq	xmm2, xmm5, xmm4, 16
-        vpclmulqdq	xmm1, xmm5, xmm4, 1
-        vpclmulqdq	xmm0, xmm5, xmm4, 0
-        vpclmulqdq	xmm3, xmm5, xmm4, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpslldq	xmm1, xmm2, 8
-        vpsrldq	xmm2, xmm2, 8
-        vpxor	xmm6, xmm0, xmm1
-        vpxor	xmm4, xmm3, xmm2
-        ; ghash_mid
-        vpsrld	xmm0, xmm6, 31
-        vpsrld	xmm1, xmm4, 31
-        vpslld	xmm6, xmm6, 1
-        vpslld	xmm4, xmm4, 1
-        vpsrldq	xmm2, xmm0, 12
-        vpslldq	xmm0, xmm0, 4
-        vpslldq	xmm1, xmm1, 4
-        vpor	xmm4, xmm4, xmm2
-        vpor	xmm6, xmm6, xmm0
-        vpor	xmm4, xmm4, xmm1
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm6, xmm2, 16
-        vpshufd	xmm1, xmm6, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm4, xmm4, xmm1
-        vmovdqu	OWORD PTR [rdx], xmm4
-        vzeroupper
-        ret
-AES_GCM_ghash_block_avx2 ENDP
-_text ENDS
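
AES_GCM_ghash_block_avx2 folds one block into the hash state with the same multiply-and-reduce sequence used throughout this file; the vpshufb byte reversals exist only because the PCLMULQDQ code keeps the state byte-reflected. For reference, the textbook MSB-first multiply it accelerates (spec-level sketch per SP 800-38D, hypothetical name; the block step is then Y = gf128_mul(Y XOR X, H)):

    #include <string.h>

    /* GF(2^128) multiply in GCM's bit order: z = x * y reduced by
       x^128 + x^7 + x^2 + x + 1 (constant 0xE1 in the top byte). */
    static void gf128_mul(unsigned char z[16], const unsigned char x[16],
                          const unsigned char y[16])
    {
        unsigned char v[16], acc[16] = {0};
        int i, j;
        memcpy(v, y, sizeof(v));
        for (i = 0; i < 128; i++) {
            if (x[i / 8] & (0x80 >> (i % 8)))       /* bit i of x, MSB first */
                for (j = 0; j < 16; j++)
                    acc[j] ^= v[j];
            /* v = v * x: shift the 128-bit value right by one bit ... */
            int lsb = v[15] & 1;
            for (j = 15; j > 0; j--)
                v[j] = (unsigned char)((v[j] >> 1) | (v[j - 1] << 7));
            v[0] >>= 1;
            if (lsb)                                /* ... and reduce if a bit fell off */
                v[0] ^= 0xE1;
        }
        memcpy(z, acc, sizeof(acc));
    }
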
-_text SEGMENT READONLY PARA
-AES_GCM_encrypt_update_avx2 PROC
-        push	r12
-        push	r13
-        push	r14
-        push	r15
-        push	rdi
-        mov	rax, rcx
-        mov	r10, r8
-        mov	r8d, edx
-        mov	r11, r9
-        mov	r9d, DWORD PTR [rsp+80]
-        mov	r12, QWORD PTR [rsp+88]
-        mov	r13, QWORD PTR [rsp+96]
-        mov	r14, QWORD PTR [rsp+104]
-        sub	rsp, 152
-        vmovdqu	xmm6, OWORD PTR [r12]
-        vmovdqu	xmm5, OWORD PTR [r13]
-        vmovdqu	xmm4, OWORD PTR [r14]
-        vpsrlq	xmm1, xmm5, 63
-        vpsllq	xmm0, xmm5, 1
-        vpslldq	xmm1, xmm1, 8
-        vpor	xmm0, xmm0, xmm1
-        vpshufd	xmm5, xmm5, 255
-        vpsrad	xmm5, xmm5, 31
-        vpand	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpxor	xmm5, xmm5, xmm0
-        xor	edi, edi
-        cmp	r9d, 128
-        mov	r15d, r9d
-        jl	L_AES_GCM_encrypt_update_avx2_done_128
-        and	r15d, 4294967168
-        vmovdqu	OWORD PTR [rsp+128], xmm4
-        vmovdqu	xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128
-        ; H ^ 1 and H ^ 2
-        vpclmulqdq	xmm9, xmm5, xmm5, 0
-        vpclmulqdq	xmm10, xmm5, xmm5, 17
-        vpclmulqdq	xmm8, xmm9, xmm3, 16
-        vpshufd	xmm9, xmm9, 78
-        vpxor	xmm9, xmm9, xmm8
-        vpclmulqdq	xmm8, xmm9, xmm3, 16
-        vpshufd	xmm9, xmm9, 78
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm0, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp], xmm5
-        vmovdqu	OWORD PTR [rsp+16], xmm0
-        ; H ^ 3 and H ^ 4
-        vpclmulqdq	xmm11, xmm0, xmm5, 16
-        vpclmulqdq	xmm10, xmm0, xmm5, 1
-        vpclmulqdq	xmm9, xmm0, xmm5, 0
-        vpclmulqdq	xmm12, xmm0, xmm5, 17
-        vpclmulqdq	xmm13, xmm0, xmm0, 0
-        vpclmulqdq	xmm14, xmm0, xmm0, 17
-        vpxor	xmm11, xmm11, xmm10
-        vpslldq	xmm10, xmm11, 8
-        vpsrldq	xmm11, xmm11, 8
-        vpxor	xmm10, xmm10, xmm9
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm10, xmm10, xmm9
-        vpxor	xmm13, xmm13, xmm8
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm12, xmm12, xmm11
-        vpxor	xmm13, xmm13, xmm8
-        vpxor	xmm10, xmm10, xmm12
-        vpxor	xmm2, xmm13, xmm14
-        vpxor	xmm1, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp+32], xmm1
-        vmovdqu	OWORD PTR [rsp+48], xmm2
-        ; H ^ 5 and H ^ 6
-        vpclmulqdq	xmm11, xmm1, xmm0, 16
-        vpclmulqdq	xmm10, xmm1, xmm0, 1
-        vpclmulqdq	xmm9, xmm1, xmm0, 0
-        vpclmulqdq	xmm12, xmm1, xmm0, 17
-        vpclmulqdq	xmm13, xmm1, xmm1, 0
-        vpclmulqdq	xmm14, xmm1, xmm1, 17
-        vpxor	xmm11, xmm11, xmm10
-        vpslldq	xmm10, xmm11, 8
-        vpsrldq	xmm11, xmm11, 8
-        vpxor	xmm10, xmm10, xmm9
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm10, xmm10, xmm9
-        vpxor	xmm13, xmm13, xmm8
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm12, xmm12, xmm11
-        vpxor	xmm13, xmm13, xmm8
-        vpxor	xmm10, xmm10, xmm12
-        vpxor	xmm0, xmm13, xmm14
-        vpxor	xmm7, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp+64], xmm7
-        vmovdqu	OWORD PTR [rsp+80], xmm0
-        ; H ^ 7 and H ^ 8
-        vpclmulqdq	xmm11, xmm2, xmm1, 16
-        vpclmulqdq	xmm10, xmm2, xmm1, 1
-        vpclmulqdq	xmm9, xmm2, xmm1, 0
-        vpclmulqdq	xmm12, xmm2, xmm1, 17
-        vpclmulqdq	xmm13, xmm2, xmm2, 0
-        vpclmulqdq	xmm14, xmm2, xmm2, 17
-        vpxor	xmm11, xmm11, xmm10
-        vpslldq	xmm10, xmm11, 8
-        vpsrldq	xmm11, xmm11, 8
-        vpxor	xmm10, xmm10, xmm9
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm10, xmm10, xmm9
-        vpxor	xmm13, xmm13, xmm8
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm12, xmm12, xmm11
-        vpxor	xmm13, xmm13, xmm8
-        vpxor	xmm10, xmm10, xmm12
-        vpxor	xmm0, xmm13, xmm14
-        vpxor	xmm7, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp+96], xmm7
-        vmovdqu	OWORD PTR [rsp+112], xmm0
-        ; First 128 bytes of input
-        ; aesenc_128
-        ; aesenc_ctr
-        vmovdqu	xmm0, OWORD PTR [rsp+128]
-        vmovdqu	xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpaddd	xmm9, xmm0, OWORD PTR L_avx2_aes_gcm_one
-        vpshufb	xmm8, xmm0, xmm1
-        vpaddd	xmm10, xmm0, OWORD PTR L_avx2_aes_gcm_two
-        vpshufb	xmm9, xmm9, xmm1
-        vpaddd	xmm11, xmm0, OWORD PTR L_avx2_aes_gcm_three
-        vpshufb	xmm10, xmm10, xmm1
-        vpaddd	xmm12, xmm0, OWORD PTR L_avx2_aes_gcm_four
-        vpshufb	xmm11, xmm11, xmm1
-        vpaddd	xmm13, xmm0, OWORD PTR L_avx2_aes_gcm_five
-        vpshufb	xmm12, xmm12, xmm1
-        vpaddd	xmm14, xmm0, OWORD PTR L_avx2_aes_gcm_six
-        vpshufb	xmm13, xmm13, xmm1
-        vpaddd	xmm15, xmm0, OWORD PTR L_avx2_aes_gcm_seven
-        vpshufb	xmm14, xmm14, xmm1
-        vpaddd	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_eight
-        vpshufb	xmm15, xmm15, xmm1
-        ; aesenc_xor
-        vmovdqu	xmm7, OWORD PTR [rax]
-        vmovdqu	OWORD PTR [rsp+128], xmm0
-        vpxor	xmm8, xmm8, xmm7
-        vpxor	xmm9, xmm9, xmm7
-        vpxor	xmm10, xmm10, xmm7
-        vpxor	xmm11, xmm11, xmm7
-        vpxor	xmm12, xmm12, xmm7
-        vpxor	xmm13, xmm13, xmm7
-        vpxor	xmm14, xmm14, xmm7
-        vpxor	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+16]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+32]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+48]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+64]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+80]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+96]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+112]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+128]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+144]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r8d, 11
-        vmovdqu	xmm7, OWORD PTR [rax+160]
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_128_enc_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+176]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r8d, 13
-        vmovdqu	xmm7, OWORD PTR [rax+192]
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_128_enc_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+208]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+224]
-L_AES_GCM_encrypt_update_avx2_aesenc_128_enc_done:
-        ; aesenc_last
-        vaesenclast	xmm8, xmm8, xmm7
-        vaesenclast	xmm9, xmm9, xmm7
-        vaesenclast	xmm10, xmm10, xmm7
-        vaesenclast	xmm11, xmm11, xmm7
-        vmovdqu	xmm0, OWORD PTR [r11]
-        vmovdqu	xmm1, OWORD PTR [r11+16]
-        vmovdqu	xmm2, OWORD PTR [r11+32]
-        vmovdqu	xmm3, OWORD PTR [r11+48]
-        vpxor	xmm8, xmm8, xmm0
-        vpxor	xmm9, xmm9, xmm1
-        vpxor	xmm10, xmm10, xmm2
-        vpxor	xmm11, xmm11, xmm3
-        vmovdqu	OWORD PTR [r10], xmm8
-        vmovdqu	OWORD PTR [r10+16], xmm9
-        vmovdqu	OWORD PTR [r10+32], xmm10
-        vmovdqu	OWORD PTR [r10+48], xmm11
-        vaesenclast	xmm12, xmm12, xmm7
-        vaesenclast	xmm13, xmm13, xmm7
-        vaesenclast	xmm14, xmm14, xmm7
-        vaesenclast	xmm15, xmm15, xmm7
-        vmovdqu	xmm0, OWORD PTR [r11+64]
-        vmovdqu	xmm1, OWORD PTR [r11+80]
-        vmovdqu	xmm2, OWORD PTR [r11+96]
-        vmovdqu	xmm3, OWORD PTR [r11+112]
-        vpxor	xmm12, xmm12, xmm0
-        vpxor	xmm13, xmm13, xmm1
-        vpxor	xmm14, xmm14, xmm2
-        vpxor	xmm15, xmm15, xmm3
-        vmovdqu	OWORD PTR [r10+64], xmm12
-        vmovdqu	OWORD PTR [r10+80], xmm13
-        vmovdqu	OWORD PTR [r10+96], xmm14
-        vmovdqu	OWORD PTR [r10+112], xmm15
-        cmp	r15d, 128
-        mov	edi, 128
-        jle	L_AES_GCM_encrypt_update_avx2_end_128
-        ; Process more 128-byte chunks of input
-L_AES_GCM_encrypt_update_avx2_ghash_128:
-        ; aesenc_128_ghash
-        lea	rcx, QWORD PTR [r11+rdi]
-        lea	rdx, QWORD PTR [r10+rdi]
-        ; aesenc_ctr
-        vmovdqu	xmm0, OWORD PTR [rsp+128]
-        vmovdqu	xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpaddd	xmm9, xmm0, OWORD PTR L_avx2_aes_gcm_one
-        vpshufb	xmm8, xmm0, xmm1
-        vpaddd	xmm10, xmm0, OWORD PTR L_avx2_aes_gcm_two
-        vpshufb	xmm9, xmm9, xmm1
-        vpaddd	xmm11, xmm0, OWORD PTR L_avx2_aes_gcm_three
-        vpshufb	xmm10, xmm10, xmm1
-        vpaddd	xmm12, xmm0, OWORD PTR L_avx2_aes_gcm_four
-        vpshufb	xmm11, xmm11, xmm1
-        vpaddd	xmm13, xmm0, OWORD PTR L_avx2_aes_gcm_five
-        vpshufb	xmm12, xmm12, xmm1
-        vpaddd	xmm14, xmm0, OWORD PTR L_avx2_aes_gcm_six
-        vpshufb	xmm13, xmm13, xmm1
-        vpaddd	xmm15, xmm0, OWORD PTR L_avx2_aes_gcm_seven
-        vpshufb	xmm14, xmm14, xmm1
-        vpaddd	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_eight
-        vpshufb	xmm15, xmm15, xmm1
-        ; aesenc_xor
-        vmovdqu	xmm7, OWORD PTR [rax]
-        vmovdqu	OWORD PTR [rsp+128], xmm0
-        vpxor	xmm8, xmm8, xmm7
-        vpxor	xmm9, xmm9, xmm7
-        vpxor	xmm10, xmm10, xmm7
-        vpxor	xmm11, xmm11, xmm7
-        vpxor	xmm12, xmm12, xmm7
-        vpxor	xmm13, xmm13, xmm7
-        vpxor	xmm14, xmm14, xmm7
-        vpxor	xmm15, xmm15, xmm7
-        ; aesenc_pclmul_1
-        vmovdqu	xmm1, OWORD PTR [rdx+-128]
-        vmovdqu	xmm0, OWORD PTR [rax+16]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vmovdqu	xmm2, OWORD PTR [rsp+112]
-        vpxor	xmm1, xmm1, xmm6
-        vpclmulqdq	xmm5, xmm1, xmm2, 16
-        vpclmulqdq	xmm3, xmm1, xmm2, 1
-        vpclmulqdq	xmm6, xmm1, xmm2, 0
-        vpclmulqdq	xmm7, xmm1, xmm2, 17
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_2
-        vmovdqu	xmm1, OWORD PTR [rdx+-112]
-        vmovdqu	xmm0, OWORD PTR [rsp+96]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rax+32]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rdx+-96]
-        vmovdqu	xmm0, OWORD PTR [rsp+80]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rax+48]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rdx+-80]
-        vmovdqu	xmm0, OWORD PTR [rsp+64]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rax+64]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rdx+-64]
-        vmovdqu	xmm0, OWORD PTR [rsp+48]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rax+80]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rdx+-48]
-        vmovdqu	xmm0, OWORD PTR [rsp+32]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rax+96]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rdx+-32]
-        vmovdqu	xmm0, OWORD PTR [rsp+16]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rax+112]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rdx+-16]
-        vmovdqu	xmm0, OWORD PTR [rsp]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rax+128]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_l
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm6, xmm6, xmm4
-        vpxor	xmm5, xmm5, xmm3
-        vpslldq	xmm1, xmm5, 8
-        vpsrldq	xmm5, xmm5, 8
-        vmovdqu	xmm4, OWORD PTR [rax+144]
-        vmovdqu	xmm0, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vaesenc	xmm8, xmm8, xmm4
-        vpxor	xmm6, xmm6, xmm1
-        vpxor	xmm7, xmm7, xmm5
-        vpclmulqdq	xmm3, xmm6, xmm0, 16
-        vaesenc	xmm9, xmm9, xmm4
-        vaesenc	xmm10, xmm10, xmm4
-        vaesenc	xmm11, xmm11, xmm4
-        vpshufd	xmm6, xmm6, 78
-        vpxor	xmm6, xmm6, xmm3
-        vpclmulqdq	xmm3, xmm6, xmm0, 16
-        vaesenc	xmm12, xmm12, xmm4
-        vaesenc	xmm13, xmm13, xmm4
-        vaesenc	xmm14, xmm14, xmm4
-        vpshufd	xmm6, xmm6, 78
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm6, xmm6, xmm7
-        vaesenc	xmm15, xmm15, xmm4
-        cmp	r8d, 11
-        vmovdqu	xmm7, OWORD PTR [rax+160]
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+176]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r8d, 13
-        vmovdqu	xmm7, OWORD PTR [rax+192]
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+208]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+224]
-L_AES_GCM_encrypt_update_avx2_aesenc_128_ghash_avx_done:
-        ; aesenc_last
-        vaesenclast	xmm8, xmm8, xmm7
-        vaesenclast	xmm9, xmm9, xmm7
-        vaesenclast	xmm10, xmm10, xmm7
-        vaesenclast	xmm11, xmm11, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx]
-        vmovdqu	xmm1, OWORD PTR [rcx+16]
-        vmovdqu	xmm2, OWORD PTR [rcx+32]
-        vmovdqu	xmm3, OWORD PTR [rcx+48]
-        vpxor	xmm8, xmm8, xmm0
-        vpxor	xmm9, xmm9, xmm1
-        vpxor	xmm10, xmm10, xmm2
-        vpxor	xmm11, xmm11, xmm3
-        vmovdqu	OWORD PTR [rdx], xmm8
-        vmovdqu	OWORD PTR [rdx+16], xmm9
-        vmovdqu	OWORD PTR [rdx+32], xmm10
-        vmovdqu	OWORD PTR [rdx+48], xmm11
-        vaesenclast	xmm12, xmm12, xmm7
-        vaesenclast	xmm13, xmm13, xmm7
-        vaesenclast	xmm14, xmm14, xmm7
-        vaesenclast	xmm15, xmm15, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+64]
-        vmovdqu	xmm1, OWORD PTR [rcx+80]
-        vmovdqu	xmm2, OWORD PTR [rcx+96]
-        vmovdqu	xmm3, OWORD PTR [rcx+112]
-        vpxor	xmm12, xmm12, xmm0
-        vpxor	xmm13, xmm13, xmm1
-        vpxor	xmm14, xmm14, xmm2
-        vpxor	xmm15, xmm15, xmm3
-        vmovdqu	OWORD PTR [rdx+64], xmm12
-        vmovdqu	OWORD PTR [rdx+80], xmm13
-        vmovdqu	OWORD PTR [rdx+96], xmm14
-        vmovdqu	OWORD PTR [rdx+112], xmm15
-        ; aesenc_128_ghash - end
-        add	edi, 128
-        cmp	edi, r15d
-        jl	L_AES_GCM_encrypt_update_avx2_ghash_128
-L_AES_GCM_encrypt_update_avx2_end_128:
-        vmovdqu	xmm4, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpshufb	xmm8, xmm8, xmm4
-        vpshufb	xmm9, xmm9, xmm4
-        vpshufb	xmm10, xmm10, xmm4
-        vpshufb	xmm11, xmm11, xmm4
-        vpshufb	xmm12, xmm12, xmm4
-        vpshufb	xmm13, xmm13, xmm4
-        vpshufb	xmm14, xmm14, xmm4
-        vpshufb	xmm15, xmm15, xmm4
-        vpxor	xmm8, xmm8, xmm6
-        vmovdqu	xmm7, OWORD PTR [rsp]
-        vpclmulqdq	xmm5, xmm7, xmm15, 16
-        vpclmulqdq	xmm1, xmm7, xmm15, 1
-        vpclmulqdq	xmm4, xmm7, xmm15, 0
-        vpclmulqdq	xmm6, xmm7, xmm15, 17
-        vpxor	xmm5, xmm5, xmm1
-        vmovdqu	xmm7, OWORD PTR [rsp+16]
-        vpclmulqdq	xmm2, xmm7, xmm14, 16
-        vpclmulqdq	xmm1, xmm7, xmm14, 1
-        vpclmulqdq	xmm0, xmm7, xmm14, 0
-        vpclmulqdq	xmm3, xmm7, xmm14, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm4, xmm4, xmm0
-        vmovdqu	xmm15, OWORD PTR [rsp+32]
-        vmovdqu	xmm7, OWORD PTR [rsp+48]
-        vpclmulqdq	xmm2, xmm15, xmm13, 16
-        vpclmulqdq	xmm1, xmm15, xmm13, 1
-        vpclmulqdq	xmm0, xmm15, xmm13, 0
-        vpclmulqdq	xmm3, xmm15, xmm13, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm4, xmm4, xmm0
-        vpclmulqdq	xmm2, xmm7, xmm12, 16
-        vpclmulqdq	xmm1, xmm7, xmm12, 1
-        vpclmulqdq	xmm0, xmm7, xmm12, 0
-        vpclmulqdq	xmm3, xmm7, xmm12, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm4, xmm4, xmm0
-        vmovdqu	xmm15, OWORD PTR [rsp+64]
-        vmovdqu	xmm7, OWORD PTR [rsp+80]
-        vpclmulqdq	xmm2, xmm15, xmm11, 16
-        vpclmulqdq	xmm1, xmm15, xmm11, 1
-        vpclmulqdq	xmm0, xmm15, xmm11, 0
-        vpclmulqdq	xmm3, xmm15, xmm11, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm4, xmm4, xmm0
-        vpclmulqdq	xmm2, xmm7, xmm10, 16
-        vpclmulqdq	xmm1, xmm7, xmm10, 1
-        vpclmulqdq	xmm0, xmm7, xmm10, 0
-        vpclmulqdq	xmm3, xmm7, xmm10, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm4, xmm4, xmm0
-        vmovdqu	xmm15, OWORD PTR [rsp+96]
-        vmovdqu	xmm7, OWORD PTR [rsp+112]
-        vpclmulqdq	xmm2, xmm15, xmm9, 16
-        vpclmulqdq	xmm1, xmm15, xmm9, 1
-        vpclmulqdq	xmm0, xmm15, xmm9, 0
-        vpclmulqdq	xmm3, xmm15, xmm9, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm4, xmm4, xmm0
-        vpclmulqdq	xmm2, xmm7, xmm8, 16
-        vpclmulqdq	xmm1, xmm7, xmm8, 1
-        vpclmulqdq	xmm0, xmm7, xmm8, 0
-        vpclmulqdq	xmm3, xmm7, xmm8, 17
-        vpxor	xmm2, xmm2, xmm1
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm4, xmm4, xmm0
-        vpslldq	xmm7, xmm5, 8
-        vpsrldq	xmm5, xmm5, 8
-        vpxor	xmm4, xmm4, xmm7
-        vpxor	xmm6, xmm6, xmm5
-        ; ghash_red
-        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpclmulqdq	xmm0, xmm4, xmm2, 16
-        vpshufd	xmm1, xmm4, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpclmulqdq	xmm0, xmm1, xmm2, 16
-        vpshufd	xmm1, xmm1, 78
-        vpxor	xmm1, xmm1, xmm0
-        vpxor	xmm6, xmm6, xmm1
-        vmovdqu	xmm5, OWORD PTR [rsp]
-        vmovdqu	xmm4, OWORD PTR [rsp+128]
-L_AES_GCM_encrypt_update_avx2_done_128:
-        cmp	edi, r9d
-        je	L_AES_GCM_encrypt_update_avx2_done_enc
-        mov	r15d, r9d
-        and	r15d, 4294967280
-        cmp	edi, r15d
-        jge	L_AES_GCM_encrypt_update_avx2_last_block_done
-        ; aesenc_block
-        vmovdqu	xmm1, xmm4
-        vpshufb	xmm0, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpaddd	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_one
-        vpxor	xmm0, xmm0, [rax]
-        vmovdqu	xmm2, OWORD PTR [rax+16]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rax+32]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rax+48]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rax+64]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rax+80]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rax+96]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rax+112]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rax+128]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm2, OWORD PTR [rax+144]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm4, xmm1
-        cmp	r8d, 11
-        vmovdqu	xmm1, OWORD PTR [rax+160]
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_block_last
-        vaesenc	xmm0, xmm0, xmm1
-        vmovdqu	xmm2, OWORD PTR [rax+176]
-        vaesenc	xmm0, xmm0, xmm2
-        cmp	r8d, 13
-        vmovdqu	xmm1, OWORD PTR [rax+192]
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_block_last
-        vaesenc	xmm0, xmm0, xmm1
-        vmovdqu	xmm2, OWORD PTR [rax+208]
-        vaesenc	xmm0, xmm0, xmm2
-        vmovdqu	xmm1, OWORD PTR [rax+224]
-L_AES_GCM_encrypt_update_avx2_aesenc_block_last:
-        vaesenclast	xmm0, xmm0, xmm1
-        vmovdqu	xmm1, OWORD PTR [r11+rdi]
-        vpxor	xmm0, xmm0, xmm1
-        vmovdqu	OWORD PTR [r10+rdi], xmm0
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm0
-        add	edi, 16
-        cmp	edi, r15d
-        jge	L_AES_GCM_encrypt_update_avx2_last_block_ghash
-L_AES_GCM_encrypt_update_avx2_last_block_start:
-        vmovdqu	xmm12, OWORD PTR [r11+rdi]
-        vpshufb	xmm11, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpaddd	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_one
-        ; aesenc_gfmul_sb
-        vpclmulqdq	xmm2, xmm6, xmm5, 1
-        vpclmulqdq	xmm3, xmm6, xmm5, 16
-        vpclmulqdq	xmm1, xmm6, xmm5, 0
-        vpclmulqdq	xmm8, xmm6, xmm5, 17
-        vpxor	xmm11, xmm11, [rax]
-        vaesenc	xmm11, xmm11, [rax+16]
-        vpxor	xmm3, xmm3, xmm2
-        vpslldq	xmm2, xmm3, 8
-        vpsrldq	xmm3, xmm3, 8
-        vaesenc	xmm11, xmm11, [rax+32]
-        vpxor	xmm2, xmm2, xmm1
-        vpclmulqdq	xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vaesenc	xmm11, xmm11, [rax+48]
-        vaesenc	xmm11, xmm11, [rax+64]
-        vaesenc	xmm11, xmm11, [rax+80]
-        vpshufd	xmm2, xmm2, 78
-        vpxor	xmm2, xmm2, xmm1
-        vpclmulqdq	xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vaesenc	xmm11, xmm11, [rax+96]
-        vaesenc	xmm11, xmm11, [rax+112]
-        vaesenc	xmm11, xmm11, [rax+128]
-        vpshufd	xmm2, xmm2, 78
-        vaesenc	xmm11, xmm11, [rax+144]
-        vpxor	xmm8, xmm8, xmm3
-        vpxor	xmm2, xmm2, xmm8
-        vmovdqu	xmm0, OWORD PTR [rax+160]
-        cmp	r8d, 11
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm11, xmm11, [rax+176]
-        vmovdqu	xmm0, OWORD PTR [rax+192]
-        cmp	r8d, 13
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm11, xmm11, [rax+208]
-        vmovdqu	xmm0, OWORD PTR [rax+224]
-L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last:
-        vaesenclast	xmm11, xmm11, xmm0
-        vpxor	xmm6, xmm2, xmm1
-        vpxor	xmm11, xmm11, xmm12
-        vmovdqu	OWORD PTR [r10+rdi], xmm11
-        vpshufb	xmm11, xmm11, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm6, xmm6, xmm11
-        add	edi, 16
-        cmp	edi, r15d
-        jl	L_AES_GCM_encrypt_update_avx2_last_block_start
-L_AES_GCM_encrypt_update_avx2_last_block_ghash:
-        ; ghash_gfmul_red
-        vpclmulqdq	xmm10, xmm6, xmm5, 16
-        vpclmulqdq	xmm9, xmm6, xmm5, 1
-        vpclmulqdq	xmm8, xmm6, xmm5, 0
-        vpxor	xmm10, xmm10, xmm9
-        vpslldq	xmm9, xmm10, 8
-        vpsrldq	xmm10, xmm10, 8
-        vpxor	xmm9, xmm9, xmm8
-        vpclmulqdq	xmm6, xmm6, xmm5, 17
-        vpclmulqdq	xmm8, xmm9, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm9, xmm9, 78
-        vpxor	xmm9, xmm9, xmm8
-        vpclmulqdq	xmm8, xmm9, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm9, xmm9, 78
-        vpxor	xmm6, xmm6, xmm10
-        vpxor	xmm6, xmm6, xmm9
-        vpxor	xmm6, xmm6, xmm8
-L_AES_GCM_encrypt_update_avx2_last_block_done:
-L_AES_GCM_encrypt_update_avx2_done_enc:
-        vmovdqu	OWORD PTR [r12], xmm6
-        vmovdqu	OWORD PTR [r14], xmm4
-        vzeroupper
-        add	rsp, 152
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-AES_GCM_encrypt_update_avx2 ENDP
-_text ENDS
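
The procedure removed above is the 8-way interleaved encrypt path: it precomputes H^1..H^8 on the stack ("H ^ 1 and H ^ 2" through "H ^ 7 and H ^ 8"), then each aesenc_128_ghash iteration folds eight ciphertext blocks into the GHASH state with a single reduction per 128 bytes. A minimal scalar sketch of that aggregation follows; the be128 struct, gf128_mul, and ghash8 names are ours, not wolfSSL API, and the bit-by-bit multiply reduces eagerly where the asm defers reduction to the end (the results agree).

    #include <stdint.h>

    typedef struct { uint64_t hi, lo; } be128;  /* big-endian halves of a block */

    /* GF(2^128) multiply, MSB-first bit order (NIST SP 800-38D, sec. 6.3). */
    static be128 gf128_mul(be128 x, be128 y)
    {
        be128 z = {0, 0}, v = y;
        int i;
        for (i = 0; i < 128; i++) {
            uint64_t bit = (i < 64) ? (x.hi >> (63 - i)) & 1
                                    : (x.lo >> (127 - i)) & 1;
            if (bit) { z.hi ^= v.hi; z.lo ^= v.lo; }
            /* multiply v by x: shift right, reduce with R = 0xE1 || 0^120 */
            uint64_t lsb = v.lo & 1;
            v.lo = (v.lo >> 1) | (v.hi << 63);
            v.hi >>= 1;
            if (lsb) v.hi ^= 0xe100000000000000ULL;
        }
        return z;
    }

    /* One aggregated step over eight blocks, mirroring the unrolled loop:
     * S' = (S ^ C0)*H^8 ^ C1*H^7 ^ ... ^ C7*H^1, one fold per 128 bytes. */
    static be128 ghash8(be128 s, const be128 c[8], const be128 hpow[8])
    {
        be128 acc = {0, 0};
        int i;
        for (i = 0; i < 8; i++) {
            be128 t = c[i];
            if (i == 0) { t.hi ^= s.hi; t.lo ^= s.lo; }
            be128 p = gf128_mul(t, hpow[7 - i]);  /* hpow[k] holds H^(k+1) */
            acc.hi ^= p.hi;
            acc.lo ^= p.lo;
        }
        return acc;
    }

Repeating ghash8 over 128-byte chunks reproduces the sequential S = (S ^ C_i)*H fold, because (S^C0)H^8 ^ C1 H^7 ^ ... ^ C7 H^1 is exactly that recurrence expanded eight steps.
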
-_text SEGMENT READONLY PARA
-AES_GCM_encrypt_final_avx2 PROC
-        push	r12
-        push	r13
-        mov	eax, DWORD PTR [rsp+56]
-        mov	r10, QWORD PTR [rsp+64]
-        mov	r11, QWORD PTR [rsp+72]
-        sub	rsp, 16
-        vmovdqu	xmm4, OWORD PTR [rcx]
-        vmovdqu	xmm5, OWORD PTR [r10]
-        vmovdqu	xmm6, OWORD PTR [r11]
-        vpsrlq	xmm1, xmm5, 63
-        vpsllq	xmm0, xmm5, 1
-        vpslldq	xmm1, xmm1, 8
-        vpor	xmm0, xmm0, xmm1
-        vpshufd	xmm5, xmm5, 255
-        vpsrad	xmm5, xmm5, 31
-        vpand	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpxor	xmm5, xmm5, xmm0
-        ; calc_tag
-        shl	r9, 3
-        shl	rax, 3
-        vmovq	xmm0, r9
-        vmovq	xmm1, rax
-        vpunpcklqdq	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm4
-        ; ghash_gfmul_red
-        vpclmulqdq	xmm7, xmm0, xmm5, 16
-        vpclmulqdq	xmm3, xmm0, xmm5, 1
-        vpclmulqdq	xmm2, xmm0, xmm5, 0
-        vpxor	xmm7, xmm7, xmm3
-        vpslldq	xmm3, xmm7, 8
-        vpsrldq	xmm7, xmm7, 8
-        vpxor	xmm3, xmm3, xmm2
-        vpclmulqdq	xmm0, xmm0, xmm5, 17
-        vpclmulqdq	xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm3, xmm3, 78
-        vpxor	xmm3, xmm3, xmm2
-        vpclmulqdq	xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm3, xmm3, 78
-        vpxor	xmm0, xmm0, xmm7
-        vpxor	xmm0, xmm0, xmm3
-        vpxor	xmm0, xmm0, xmm2
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm0, xmm0, xmm6
-        ; store_tag
-        cmp	r8d, 16
-        je	L_AES_GCM_encrypt_final_avx2_store_tag_16
-        xor	r12, r12
-        vmovdqu	OWORD PTR [rsp], xmm0
-L_AES_GCM_encrypt_final_avx2_store_tag_loop:
-        movzx	r13d, BYTE PTR [rsp+r12]
-        mov	BYTE PTR [rdx+r12], r13b
-        inc	r12d
-        cmp	r12d, r8d
-        jne	L_AES_GCM_encrypt_final_avx2_store_tag_loop
-        jmp	L_AES_GCM_encrypt_final_avx2_store_tag_done
-L_AES_GCM_encrypt_final_avx2_store_tag_16:
-        vmovdqu	OWORD PTR [rdx], xmm0
-L_AES_GCM_encrypt_final_avx2_store_tag_done:
-        vzeroupper
-        add	rsp, 16
-        pop	r13
-        pop	r12
-        ret
-AES_GCM_encrypt_final_avx2 ENDP
-_text ENDS
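
In AES_GCM_encrypt_final_avx2 above, the calc_tag section builds the final GHASH input from the two lengths: the `shl ..., 3` pair converts byte counts to bit counts and `vpunpcklqdq` packs them into one block before the last multiply. A sketch under our own naming (gcm_len_block is made up, and which register carries the AAD versus the message length depends on the calling convention, so the argument order here is illustrative):

    #include <stdint.h>

    /* Pack len(AAD) || len(C), both in bits and big-endian, into the final
     * GHASH block, as the shl-by-3 and vpunpcklqdq above do. */
    static void gcm_len_block(uint8_t out[16], uint64_t aad_bytes,
                              uint64_t msg_bytes)
    {
        uint64_t abits = aad_bytes << 3;   /* shl reg, 3: bytes -> bits */
        uint64_t mbits = msg_bytes << 3;
        int i;
        for (i = 0; i < 8; i++) {
            out[i]     = (uint8_t)(abits >> (56 - 8 * i));
            out[8 + i] = (uint8_t)(mbits >> (56 - 8 * i));
        }
    }
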
-_text SEGMENT READONLY PARA
-AES_GCM_decrypt_update_avx2 PROC
-        push	r13
-        push	r12
-        push	r14
-        push	r15
-        push	rdi
-        mov	rax, rcx
-        mov	r10, r8
-        mov	r8d, edx
-        mov	r11, r9
-        mov	r9d, DWORD PTR [rsp+80]
-        mov	r12, QWORD PTR [rsp+88]
-        mov	r14, QWORD PTR [rsp+96]
-        mov	r15, QWORD PTR [rsp+104]
-        sub	rsp, 168
-        vmovdqu	xmm6, OWORD PTR [r12]
-        vmovdqu	xmm5, OWORD PTR [r14]
-        vmovdqu	xmm4, OWORD PTR [r15]
-        ; Calculate H
-        vpsrlq	xmm1, xmm5, 63
-        vpsllq	xmm0, xmm5, 1
-        vpslldq	xmm1, xmm1, 8
-        vpor	xmm0, xmm0, xmm1
-        vpshufd	xmm5, xmm5, 255
-        vpsrad	xmm5, xmm5, 31
-        vpand	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpxor	xmm5, xmm5, xmm0
-        xor	edi, edi
-        cmp	r9d, 128
-        mov	r13d, r9d
-        jl	L_AES_GCM_decrypt_update_avx2_done_128
-        and	r13d, 4294967168
-        vmovdqu	OWORD PTR [rsp+128], xmm4
-        vmovdqu	OWORD PTR [rsp+144], xmm15
-        vmovdqu	xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128
-        ; H ^ 1 and H ^ 2
-        vpclmulqdq	xmm9, xmm5, xmm5, 0
-        vpclmulqdq	xmm10, xmm5, xmm5, 17
-        vpclmulqdq	xmm8, xmm9, xmm3, 16
-        vpshufd	xmm9, xmm9, 78
-        vpxor	xmm9, xmm9, xmm8
-        vpclmulqdq	xmm8, xmm9, xmm3, 16
-        vpshufd	xmm9, xmm9, 78
-        vpxor	xmm9, xmm9, xmm8
-        vpxor	xmm0, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp], xmm5
-        vmovdqu	OWORD PTR [rsp+16], xmm0
-        ; H ^ 3 and H ^ 4
-        vpclmulqdq	xmm11, xmm0, xmm5, 16
-        vpclmulqdq	xmm10, xmm0, xmm5, 1
-        vpclmulqdq	xmm9, xmm0, xmm5, 0
-        vpclmulqdq	xmm12, xmm0, xmm5, 17
-        vpclmulqdq	xmm13, xmm0, xmm0, 0
-        vpclmulqdq	xmm14, xmm0, xmm0, 17
-        vpxor	xmm11, xmm11, xmm10
-        vpslldq	xmm10, xmm11, 8
-        vpsrldq	xmm11, xmm11, 8
-        vpxor	xmm10, xmm10, xmm9
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm10, xmm10, xmm9
-        vpxor	xmm13, xmm13, xmm8
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm12, xmm12, xmm11
-        vpxor	xmm13, xmm13, xmm8
-        vpxor	xmm10, xmm10, xmm12
-        vpxor	xmm2, xmm13, xmm14
-        vpxor	xmm1, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp+32], xmm1
-        vmovdqu	OWORD PTR [rsp+48], xmm2
-        ; H ^ 5 and H ^ 6
-        vpclmulqdq	xmm11, xmm1, xmm0, 16
-        vpclmulqdq	xmm10, xmm1, xmm0, 1
-        vpclmulqdq	xmm9, xmm1, xmm0, 0
-        vpclmulqdq	xmm12, xmm1, xmm0, 17
-        vpclmulqdq	xmm13, xmm1, xmm1, 0
-        vpclmulqdq	xmm14, xmm1, xmm1, 17
-        vpxor	xmm11, xmm11, xmm10
-        vpslldq	xmm10, xmm11, 8
-        vpsrldq	xmm11, xmm11, 8
-        vpxor	xmm10, xmm10, xmm9
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm10, xmm10, xmm9
-        vpxor	xmm13, xmm13, xmm8
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm12, xmm12, xmm11
-        vpxor	xmm13, xmm13, xmm8
-        vpxor	xmm10, xmm10, xmm12
-        vpxor	xmm0, xmm13, xmm14
-        vpxor	xmm7, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp+64], xmm7
-        vmovdqu	OWORD PTR [rsp+80], xmm0
-        ; H ^ 7 and H ^ 8
-        vpclmulqdq	xmm11, xmm2, xmm1, 16
-        vpclmulqdq	xmm10, xmm2, xmm1, 1
-        vpclmulqdq	xmm9, xmm2, xmm1, 0
-        vpclmulqdq	xmm12, xmm2, xmm1, 17
-        vpclmulqdq	xmm13, xmm2, xmm2, 0
-        vpclmulqdq	xmm14, xmm2, xmm2, 17
-        vpxor	xmm11, xmm11, xmm10
-        vpslldq	xmm10, xmm11, 8
-        vpsrldq	xmm11, xmm11, 8
-        vpxor	xmm10, xmm10, xmm9
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm10, xmm10, xmm9
-        vpxor	xmm13, xmm13, xmm8
-        vpclmulqdq	xmm9, xmm10, xmm3, 16
-        vpclmulqdq	xmm8, xmm13, xmm3, 16
-        vpshufd	xmm10, xmm10, 78
-        vpshufd	xmm13, xmm13, 78
-        vpxor	xmm12, xmm12, xmm11
-        vpxor	xmm13, xmm13, xmm8
-        vpxor	xmm10, xmm10, xmm12
-        vpxor	xmm0, xmm13, xmm14
-        vpxor	xmm7, xmm10, xmm9
-        vmovdqu	OWORD PTR [rsp+96], xmm7
-        vmovdqu	OWORD PTR [rsp+112], xmm0
-L_AES_GCM_decrypt_update_avx2_ghash_128:
-        ; aesenc_128_ghash
-        lea	rcx, QWORD PTR [r11+rdi]
-        lea	rdx, QWORD PTR [r10+rdi]
-        ; aesenc_ctr
-        vmovdqu	xmm0, OWORD PTR [rsp+128]
-        vmovdqu	xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpaddd	xmm9, xmm0, OWORD PTR L_avx2_aes_gcm_one
-        vpshufb	xmm8, xmm0, xmm1
-        vpaddd	xmm10, xmm0, OWORD PTR L_avx2_aes_gcm_two
-        vpshufb	xmm9, xmm9, xmm1
-        vpaddd	xmm11, xmm0, OWORD PTR L_avx2_aes_gcm_three
-        vpshufb	xmm10, xmm10, xmm1
-        vpaddd	xmm12, xmm0, OWORD PTR L_avx2_aes_gcm_four
-        vpshufb	xmm11, xmm11, xmm1
-        vpaddd	xmm13, xmm0, OWORD PTR L_avx2_aes_gcm_five
-        vpshufb	xmm12, xmm12, xmm1
-        vpaddd	xmm14, xmm0, OWORD PTR L_avx2_aes_gcm_six
-        vpshufb	xmm13, xmm13, xmm1
-        vpaddd	xmm15, xmm0, OWORD PTR L_avx2_aes_gcm_seven
-        vpshufb	xmm14, xmm14, xmm1
-        vpaddd	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_eight
-        vpshufb	xmm15, xmm15, xmm1
-        ; aesenc_xor
-        vmovdqu	xmm7, OWORD PTR [rax]
-        vmovdqu	OWORD PTR [rsp+128], xmm0
-        vpxor	xmm8, xmm8, xmm7
-        vpxor	xmm9, xmm9, xmm7
-        vpxor	xmm10, xmm10, xmm7
-        vpxor	xmm11, xmm11, xmm7
-        vpxor	xmm12, xmm12, xmm7
-        vpxor	xmm13, xmm13, xmm7
-        vpxor	xmm14, xmm14, xmm7
-        vpxor	xmm15, xmm15, xmm7
-        ; aesenc_pclmul_1
-        vmovdqu	xmm1, OWORD PTR [rcx]
-        vmovdqu	xmm0, OWORD PTR [rax+16]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vmovdqu	xmm2, OWORD PTR [rsp+112]
-        vpxor	xmm1, xmm1, xmm6
-        vpclmulqdq	xmm5, xmm1, xmm2, 16
-        vpclmulqdq	xmm3, xmm1, xmm2, 1
-        vpclmulqdq	xmm6, xmm1, xmm2, 0
-        vpclmulqdq	xmm7, xmm1, xmm2, 17
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_2
-        vmovdqu	xmm1, OWORD PTR [rcx+16]
-        vmovdqu	xmm0, OWORD PTR [rsp+96]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rax+32]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rcx+32]
-        vmovdqu	xmm0, OWORD PTR [rsp+80]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rax+48]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rcx+48]
-        vmovdqu	xmm0, OWORD PTR [rsp+64]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rax+64]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rcx+64]
-        vmovdqu	xmm0, OWORD PTR [rsp+48]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rax+80]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rcx+80]
-        vmovdqu	xmm0, OWORD PTR [rsp+32]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rax+96]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rcx+96]
-        vmovdqu	xmm0, OWORD PTR [rsp+16]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rax+112]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_n
-        vmovdqu	xmm1, OWORD PTR [rcx+112]
-        vmovdqu	xmm0, OWORD PTR [rsp]
-        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm5, xmm5, xmm2
-        vpclmulqdq	xmm2, xmm1, xmm0, 16
-        vpxor	xmm5, xmm5, xmm3
-        vpclmulqdq	xmm3, xmm1, xmm0, 1
-        vpxor	xmm6, xmm6, xmm4
-        vpclmulqdq	xmm4, xmm1, xmm0, 0
-        vpclmulqdq	xmm1, xmm1, xmm0, 17
-        vmovdqu	xmm0, OWORD PTR [rax+128]
-        vpxor	xmm7, xmm7, xmm1
-        vaesenc	xmm8, xmm8, xmm0
-        vaesenc	xmm9, xmm9, xmm0
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm11, xmm11, xmm0
-        vaesenc	xmm12, xmm12, xmm0
-        vaesenc	xmm13, xmm13, xmm0
-        vaesenc	xmm14, xmm14, xmm0
-        vaesenc	xmm15, xmm15, xmm0
-        ; aesenc_pclmul_l
-        vpxor	xmm5, xmm5, xmm2
-        vpxor	xmm6, xmm6, xmm4
-        vpxor	xmm5, xmm5, xmm3
-        vpslldq	xmm1, xmm5, 8
-        vpsrldq	xmm5, xmm5, 8
-        vmovdqu	xmm4, OWORD PTR [rax+144]
-        vmovdqu	xmm0, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vaesenc	xmm8, xmm8, xmm4
-        vpxor	xmm6, xmm6, xmm1
-        vpxor	xmm7, xmm7, xmm5
-        vpclmulqdq	xmm3, xmm6, xmm0, 16
-        vaesenc	xmm9, xmm9, xmm4
-        vaesenc	xmm10, xmm10, xmm4
-        vaesenc	xmm11, xmm11, xmm4
-        vpshufd	xmm6, xmm6, 78
-        vpxor	xmm6, xmm6, xmm3
-        vpclmulqdq	xmm3, xmm6, xmm0, 16
-        vaesenc	xmm12, xmm12, xmm4
-        vaesenc	xmm13, xmm13, xmm4
-        vaesenc	xmm14, xmm14, xmm4
-        vpshufd	xmm6, xmm6, 78
-        vpxor	xmm6, xmm6, xmm3
-        vpxor	xmm6, xmm6, xmm7
-        vaesenc	xmm15, xmm15, xmm4
-        cmp	r8d, 11
-        vmovdqu	xmm7, OWORD PTR [rax+160]
-        jl	L_AES_GCM_decrypt_update_avx2_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+176]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        cmp	r8d, 13
-        vmovdqu	xmm7, OWORD PTR [rax+192]
-        jl	L_AES_GCM_decrypt_update_avx2_aesenc_128_ghash_avx_done
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+208]
-        vaesenc	xmm8, xmm8, xmm7
-        vaesenc	xmm9, xmm9, xmm7
-        vaesenc	xmm10, xmm10, xmm7
-        vaesenc	xmm11, xmm11, xmm7
-        vaesenc	xmm12, xmm12, xmm7
-        vaesenc	xmm13, xmm13, xmm7
-        vaesenc	xmm14, xmm14, xmm7
-        vaesenc	xmm15, xmm15, xmm7
-        vmovdqu	xmm7, OWORD PTR [rax+224]
-L_AES_GCM_decrypt_update_avx2_aesenc_128_ghash_avx_done:
-        ; aesenc_last
-        vaesenclast	xmm8, xmm8, xmm7
-        vaesenclast	xmm9, xmm9, xmm7
-        vaesenclast	xmm10, xmm10, xmm7
-        vaesenclast	xmm11, xmm11, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx]
-        vmovdqu	xmm1, OWORD PTR [rcx+16]
-        vmovdqu	xmm2, OWORD PTR [rcx+32]
-        vmovdqu	xmm3, OWORD PTR [rcx+48]
-        vpxor	xmm8, xmm8, xmm0
-        vpxor	xmm9, xmm9, xmm1
-        vpxor	xmm10, xmm10, xmm2
-        vpxor	xmm11, xmm11, xmm3
-        vmovdqu	OWORD PTR [rdx], xmm8
-        vmovdqu	OWORD PTR [rdx+16], xmm9
-        vmovdqu	OWORD PTR [rdx+32], xmm10
-        vmovdqu	OWORD PTR [rdx+48], xmm11
-        vaesenclast	xmm12, xmm12, xmm7
-        vaesenclast	xmm13, xmm13, xmm7
-        vaesenclast	xmm14, xmm14, xmm7
-        vaesenclast	xmm15, xmm15, xmm7
-        vmovdqu	xmm0, OWORD PTR [rcx+64]
-        vmovdqu	xmm1, OWORD PTR [rcx+80]
-        vmovdqu	xmm2, OWORD PTR [rcx+96]
-        vmovdqu	xmm3, OWORD PTR [rcx+112]
-        vpxor	xmm12, xmm12, xmm0
-        vpxor	xmm13, xmm13, xmm1
-        vpxor	xmm14, xmm14, xmm2
-        vpxor	xmm15, xmm15, xmm3
-        vmovdqu	OWORD PTR [rdx+64], xmm12
-        vmovdqu	OWORD PTR [rdx+80], xmm13
-        vmovdqu	OWORD PTR [rdx+96], xmm14
-        vmovdqu	OWORD PTR [rdx+112], xmm15
-        ; aesenc_128_ghash - end
-        add	edi, 128
-        cmp	edi, r13d
-        jl	L_AES_GCM_decrypt_update_avx2_ghash_128
-        vmovdqu	xmm5, OWORD PTR [rsp]
-        vmovdqu	xmm4, OWORD PTR [rsp+128]
-        vmovdqu	xmm15, OWORD PTR [rsp+144]
-L_AES_GCM_decrypt_update_avx2_done_128:
-        cmp	edi, r9d
-        jge	L_AES_GCM_decrypt_update_avx2_done_dec
-        mov	r13d, r9d
-        and	r13d, 4294967280
-        cmp	edi, r13d
-        jge	L_AES_GCM_decrypt_update_avx2_last_block_done
-L_AES_GCM_decrypt_update_avx2_last_block_start:
-        vmovdqu	xmm11, OWORD PTR [r11+rdi]
-        vpshufb	xmm10, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64
-        vpshufb	xmm12, xmm11, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpaddd	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_one
-        vpxor	xmm12, xmm12, xmm6
-        ; aesenc_gfmul_sb
-        vpclmulqdq	xmm2, xmm12, xmm5, 1
-        vpclmulqdq	xmm3, xmm12, xmm5, 16
-        vpclmulqdq	xmm1, xmm12, xmm5, 0
-        vpclmulqdq	xmm8, xmm12, xmm5, 17
-        vpxor	xmm10, xmm10, [rax]
-        vaesenc	xmm10, xmm10, [rax+16]
-        vpxor	xmm3, xmm3, xmm2
-        vpslldq	xmm2, xmm3, 8
-        vpsrldq	xmm3, xmm3, 8
-        vaesenc	xmm10, xmm10, [rax+32]
-        vpxor	xmm2, xmm2, xmm1
-        vpclmulqdq	xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vaesenc	xmm10, xmm10, [rax+48]
-        vaesenc	xmm10, xmm10, [rax+64]
-        vaesenc	xmm10, xmm10, [rax+80]
-        vpshufd	xmm2, xmm2, 78
-        vpxor	xmm2, xmm2, xmm1
-        vpclmulqdq	xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vaesenc	xmm10, xmm10, [rax+96]
-        vaesenc	xmm10, xmm10, [rax+112]
-        vaesenc	xmm10, xmm10, [rax+128]
-        vpshufd	xmm2, xmm2, 78
-        vaesenc	xmm10, xmm10, [rax+144]
-        vpxor	xmm8, xmm8, xmm3
-        vpxor	xmm2, xmm2, xmm8
-        vmovdqu	xmm0, OWORD PTR [rax+160]
-        cmp	r8d, 11
-        jl	L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm10, xmm10, [rax+176]
-        vmovdqu	xmm0, OWORD PTR [rax+192]
-        cmp	r8d, 13
-        jl	L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last
-        vaesenc	xmm10, xmm10, xmm0
-        vaesenc	xmm10, xmm10, [rax+208]
-        vmovdqu	xmm0, OWORD PTR [rax+224]
-L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last:
-        vaesenclast	xmm10, xmm10, xmm0
-        vpxor	xmm6, xmm2, xmm1
-        vpxor	xmm10, xmm10, xmm11
-        vmovdqu	OWORD PTR [r10+rdi], xmm10
-        add	edi, 16
-        cmp	edi, r13d
-        jl	L_AES_GCM_decrypt_update_avx2_last_block_start
-L_AES_GCM_decrypt_update_avx2_last_block_done:
-L_AES_GCM_decrypt_update_avx2_done_dec:
-        vmovdqu	OWORD PTR [r12], xmm6
-        vmovdqu	OWORD PTR [r15], xmm4
-        vzeroupper
-        add	rsp, 168
-        pop	rdi
-        pop	r15
-        pop	r14
-        pop	r12
-        pop	r13
-        ret
-AES_GCM_decrypt_update_avx2 ENDP
-_text ENDS
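
The "Calculate H" prologue near the top of the update/final procedures (the vpsrlq/vpsllq/vpslldq/vpor run followed by vpshufd/vpsrad/vpand) shifts the hash key left one bit across the full 128 bits and XORs in L_avx2_aes_gcm_mod2_128 only when the dropped top bit was set. A scalar restatement, assuming our own lanes128 struct with hi/lo as the high and low quadwords of the xmm register (the function name and lane naming are ours):

    #include <stdint.h>

    typedef struct { uint64_t hi, lo; } lanes128;  /* high/low qword of an xmm */

    /* 128-bit left shift by one with conditional reduction, as in
     * "Calculate H": vpshufd 255 / vpsrad 31 broadcast the sign of bit 127
     * into a mask, then vpand/vpxor apply { 1, 0, 0, 0xc2000000 } under it. */
    static lanes128 gcm_h_shift(lanes128 h)
    {
        uint64_t dropped = h.hi >> 63;        /* bit 127 before the shift */
        lanes128 r;
        r.hi = (h.hi << 1) | (h.lo >> 63);    /* vpsllq + vpsrlq/vpslldq/vpor */
        r.lo = h.lo << 1;
        if (dropped) {
            r.lo ^= 0x0000000000000001ULL;    /* low  lane of mod2_128 */
            r.hi ^= 0xc200000000000000ULL;    /* high lane of mod2_128 */
        }
        return r;
    }
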
-_text SEGMENT READONLY PARA
-AES_GCM_decrypt_final_avx2 PROC
-        push	r12
-        push	r13
-        push	r14
-        mov	eax, DWORD PTR [rsp+64]
-        mov	r10, QWORD PTR [rsp+72]
-        mov	r11, QWORD PTR [rsp+80]
-        mov	r12, QWORD PTR [rsp+88]
-        sub	rsp, 16
-        vmovdqu	xmm4, OWORD PTR [rcx]
-        vmovdqu	xmm5, OWORD PTR [r10]
-        vmovdqu	xmm6, OWORD PTR [r11]
-        vpsrlq	xmm1, xmm5, 63
-        vpsllq	xmm0, xmm5, 1
-        vpslldq	xmm1, xmm1, 8
-        vpor	xmm0, xmm0, xmm1
-        vpshufd	xmm5, xmm5, 255
-        vpsrad	xmm5, xmm5, 31
-        vpand	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_mod2_128
-        vpxor	xmm5, xmm5, xmm0
-        ; calc_tag
-        shl	r9, 3
-        shl	rax, 3
-        vmovq	xmm0, r9
-        vmovq	xmm1, rax
-        vpunpcklqdq	xmm0, xmm0, xmm1
-        vpxor	xmm0, xmm0, xmm4
-        ; ghash_gfmul_red
-        vpclmulqdq	xmm7, xmm0, xmm5, 16
-        vpclmulqdq	xmm3, xmm0, xmm5, 1
-        vpclmulqdq	xmm2, xmm0, xmm5, 0
-        vpxor	xmm7, xmm7, xmm3
-        vpslldq	xmm3, xmm7, 8
-        vpsrldq	xmm7, xmm7, 8
-        vpxor	xmm3, xmm3, xmm2
-        vpclmulqdq	xmm0, xmm0, xmm5, 17
-        vpclmulqdq	xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm3, xmm3, 78
-        vpxor	xmm3, xmm3, xmm2
-        vpclmulqdq	xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
-        vpshufd	xmm3, xmm3, 78
-        vpxor	xmm0, xmm0, xmm7
-        vpxor	xmm0, xmm0, xmm3
-        vpxor	xmm0, xmm0, xmm2
-        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
-        vpxor	xmm0, xmm0, xmm6
-        ; cmp_tag
-        cmp	r8d, 16
-        je	L_AES_GCM_decrypt_final_avx2_cmp_tag_16
-        xor	r13, r13
-        xor	r10, r10
-        vmovdqu	OWORD PTR [rsp], xmm0
-L_AES_GCM_decrypt_final_avx2_cmp_tag_loop:
-        movzx	r14d, BYTE PTR [rsp+r13]
-        xor	r14b, BYTE PTR [rdx+r13]
-        or	r10b, r14b
-        inc	r13d
-        cmp	r13d, r8d
-        jne	L_AES_GCM_decrypt_final_avx2_cmp_tag_loop
-        cmp	r10, 0
-        sete	r10b
-        jmp	L_AES_GCM_decrypt_final_avx2_cmp_tag_done
-L_AES_GCM_decrypt_final_avx2_cmp_tag_16:
-        vmovdqu	xmm1, OWORD PTR [rdx]
-        vpcmpeqb	xmm0, xmm0, xmm1
-        vpmovmskb	r13, xmm0
-        ; if the compare mask (r13d) == 0xFFFF then return 1, else return 0
-        xor	r10d, r10d
-        cmp	r13d, 65535
-        sete	r10b
-L_AES_GCM_decrypt_final_avx2_cmp_tag_done:
-        mov	DWORD PTR [r12], r10d
-        vzeroupper
-        add	rsp, 16
-        pop	r14
-        pop	r13
-        pop	r12
-        ret
-AES_GCM_decrypt_final_avx2 ENDP
-_text ENDS
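
The cmp_tag path in AES_GCM_decrypt_final_avx2 never branches on an individual byte comparison: the partial-tag loop ORs together the XOR of every byte pair and tests the accumulator once (sete), and the 16-byte path checks vpcmpeqb/vpmovmskb against 0xFFFF. The C equivalent of the loop, with our own helper name:

    #include <stddef.h>
    #include <stdint.h>

    /* Constant-time tag check: accumulate differences, test once at the end,
     * so timing does not reveal the index of the first mismatching byte. */
    static int gcm_tag_equal(const uint8_t *calc, const uint8_t *given,
                             size_t len)
    {
        uint8_t diff = 0;
        size_t i;
        for (i = 0; i < len; i++)
            diff |= (uint8_t)(calc[i] ^ given[i]);  /* xor + or, as above */
        return diff == 0;                           /* sete r10b */
    }

The single final test is the point of the design: any early-exit comparison would leak, through timing, how much of a forged tag was correct.
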
-ENDIF
-END
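
Both the x64 procedures above and the 32-bit file that follows drive CTR mode the same way: the counter block is kept in raw lane order on the stack, vpaddd adds the one..eight constants to produce eight successive counters at once (the 32-bit counter's four bytes sit in a single dword lane, so the add cannot carry outside it), and vpshufb with the bswap_epi64 table converts each result to the big-endian block AES encrypts. GCM's inc32 touches only the last 32 bits of the block; a scalar version under our own name:

    #include <stdint.h>

    /* inc32(): add n to the last (big-endian) 32 bits of the counter block,
     * wrapping mod 2^32, which is what the vpaddd-on-lanes trick computes. */
    static void gcm_inc32(uint8_t ctr[16], uint32_t n)
    {
        uint32_t c = ((uint32_t)ctr[12] << 24) | ((uint32_t)ctr[13] << 16)
                   | ((uint32_t)ctr[14] << 8)  |  (uint32_t)ctr[15];
        c += n;
        ctr[12] = (uint8_t)(c >> 24);
        ctr[13] = (uint8_t)(c >> 16);
        ctr[14] = (uint8_t)(c >> 8);
        ctr[15] = (uint8_t)c;
    }
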

+ 0 - 12962
lib/wolfssl/wolfcrypt/src/aes_gcm_x86_asm.S

@@ -1,12962 +0,0 @@
-/* aes_gcm_x86_asm
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-#ifdef WOLFSSL_USER_SETTINGS
-#include "wolfssl/wolfcrypt/settings.h"
-#endif
-
-#ifndef HAVE_INTEL_AVX1
-#define HAVE_INTEL_AVX1
-#endif /* HAVE_INTEL_AVX1 */
-#ifndef NO_AVX2_SUPPORT
-#define HAVE_INTEL_AVX2
-#endif /* NO_AVX2_SUPPORT */
-
-.type	data, @object
-L_aes_gcm_one:
-.long	0x0,0x0,0x1,0x0
-.type	data, @object
-L_aes_gcm_two:
-.long	0x0,0x0,0x2,0x0
-.type	data, @object
-L_aes_gcm_three:
-.long	0x0,0x0,0x3,0x0
-.type	data, @object
-L_aes_gcm_four:
-.long	0x0,0x0,0x4,0x0
-.type	data, @object
-L_aes_gcm_bswap_epi64:
-.long	0x4050607,0x10203,0xc0d0e0f,0x8090a0b
-.type	data, @object
-L_aes_gcm_bswap_mask:
-.long	0xc0d0e0f,0x8090a0b,0x4050607,0x10203
-.type	data, @object
-L_aes_gcm_mod2_128:
-.long	0x1,0x0,0x0,0xc2000000
-.type	data, @object
-L_aes_gcm_avx1_one:
-.long	0x0,0x0,0x1,0x0
-.type	data, @object
-L_aes_gcm_avx1_two:
-.long	0x0,0x0,0x2,0x0
-.type	data, @object
-L_aes_gcm_avx1_three:
-.long	0x0,0x0,0x3,0x0
-.type	data, @object
-L_aes_gcm_avx1_four:
-.long	0x0,0x0,0x4,0x0
-.type	data, @object
-L_aes_gcm_avx1_bswap_epi64:
-.long	0x4050607,0x10203,0xc0d0e0f,0x8090a0b
-.type	data, @object
-L_aes_gcm_avx1_bswap_mask:
-.long	0xc0d0e0f,0x8090a0b,0x4050607,0x10203
-.type	data, @object
-L_aes_gcm_avx1_mod2_128:
-.long	0x1,0x0,0x0,0xc2000000
-.type	data, @object
-L_aes_gcm_avx2_one:
-.long	0x0,0x0,0x1,0x0
-.type	data, @object
-L_aes_gcm_avx2_two:
-.long	0x0,0x0,0x2,0x0
-.type	data, @object
-L_aes_gcm_avx2_three:
-.long	0x0,0x0,0x3,0x0
-.type	data, @object
-L_aes_gcm_avx2_four:
-.long	0x0,0x0,0x4,0x0
-.type	data, @object
-L_avx2_aes_gcm_bswap_one:
-.long	0x0,0x0,0x0,0x1000000
-.type	data, @object
-L_aes_gcm_avx2_bswap_epi64:
-.long	0x4050607,0x10203,0xc0d0e0f,0x8090a0b
-.type	data, @object
-L_aes_gcm_avx2_bswap_mask:
-.long	0xc0d0e0f,0x8090a0b,0x4050607,0x10203
-.type	data, @object
-L_aes_gcm_avx2_mod2_128:
-.long	0x1,0x0,0x0,0xc2000000
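
Restated for readability (our own declarations, not lines from the file): pshufb treats its operand as a table of source-byte indices, so the two bswap constants above are plain permutations, and mod2_128 packs the reflected GHASH reduction term used by the pclmulqdq folds.

    /* bswap_epi64 reverses the bytes inside each 64-bit half of the block;
     * bswap_mask reverses the whole 16-byte block. */
    static const unsigned char bswap_epi64_idx[16] = {
        7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
    };
    static const unsigned char bswap_mask_idx[16] = {
        15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
    };
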
-.text
-.globl	AES_GCM_encrypt
-.type	AES_GCM_encrypt,@function
-.align	16
-AES_GCM_encrypt:
-        pushl	%ebx
-        pushl	%esi
-        pushl	%edi
-        pushl	%ebp
-        subl	$0x70, %esp
-        movl	144(%esp), %esi
-        movl	168(%esp), %ebp
-        movl	160(%esp), %edx
-        pxor	%xmm0, %xmm0
-        pxor	%xmm2, %xmm2
-        cmpl	$12, %edx
-        jne	L_AES_GCM_encrypt_iv_not_12
-        # Calculate values when IV is 12 bytes
-        # Set counter based on IV
-        movl	$0x1000000, %ecx
-        pinsrd	$0x00, (%esi), %xmm0
-        pinsrd	$0x01, 4(%esi), %xmm0
-        pinsrd	$2, 8(%esi), %xmm0
-        pinsrd	$3, %ecx, %xmm0
-        # H = Encrypt X(=0) and T = Encrypt counter
-        movdqa	%xmm0, %xmm5
-        movdqa	(%ebp), %xmm1
-        pxor	%xmm1, %xmm5
-        movdqa	16(%ebp), %xmm3
-        aesenc	%xmm3, %xmm1
-        aesenc	%xmm3, %xmm5
-        movdqa	32(%ebp), %xmm3
-        aesenc	%xmm3, %xmm1
-        aesenc	%xmm3, %xmm5
-        movdqa	48(%ebp), %xmm3
-        aesenc	%xmm3, %xmm1
-        aesenc	%xmm3, %xmm5
-        movdqa	64(%ebp), %xmm3
-        aesenc	%xmm3, %xmm1
-        aesenc	%xmm3, %xmm5
-        movdqa	80(%ebp), %xmm3
-        aesenc	%xmm3, %xmm1
-        aesenc	%xmm3, %xmm5
-        movdqa	96(%ebp), %xmm3
-        aesenc	%xmm3, %xmm1
-        aesenc	%xmm3, %xmm5
-        movdqa	112(%ebp), %xmm3
-        aesenc	%xmm3, %xmm1
-        aesenc	%xmm3, %xmm5
-        movdqa	128(%ebp), %xmm3
-        aesenc	%xmm3, %xmm1
-        aesenc	%xmm3, %xmm5
-        movdqa	144(%ebp), %xmm3
-        aesenc	%xmm3, %xmm1
-        aesenc	%xmm3, %xmm5
-        cmpl	$11, 172(%esp)
-        movdqa	160(%ebp), %xmm3
-        jl	L_AES_GCM_encrypt_calc_iv_12_last
-        aesenc	%xmm3, %xmm1
-        aesenc	%xmm3, %xmm5
-        movdqa	176(%ebp), %xmm3
-        aesenc	%xmm3, %xmm1
-        aesenc	%xmm3, %xmm5
-        cmpl	$13, 172(%esp)
-        movdqa	192(%ebp), %xmm3
-        jl	L_AES_GCM_encrypt_calc_iv_12_last
-        aesenc	%xmm3, %xmm1
-        aesenc	%xmm3, %xmm5
-        movdqa	208(%ebp), %xmm3
-        aesenc	%xmm3, %xmm1
-        aesenc	%xmm3, %xmm5
-        movdqa	224(%ebp), %xmm3
-L_AES_GCM_encrypt_calc_iv_12_last:
-        aesenclast	%xmm3, %xmm1
-        aesenclast	%xmm3, %xmm5
-        pshufb	L_aes_gcm_bswap_mask, %xmm1
-        movdqu	%xmm5, 80(%esp)
-        jmp	L_AES_GCM_encrypt_iv_done
-L_AES_GCM_encrypt_iv_not_12:
-        # Calculate values when IV is not 12 bytes
-        # H = Encrypt X(=0)
-        movdqa	(%ebp), %xmm1
-        aesenc	16(%ebp), %xmm1
-        aesenc	32(%ebp), %xmm1
-        aesenc	48(%ebp), %xmm1
-        aesenc	64(%ebp), %xmm1
-        aesenc	80(%ebp), %xmm1
-        aesenc	96(%ebp), %xmm1
-        aesenc	112(%ebp), %xmm1
-        aesenc	128(%ebp), %xmm1
-        aesenc	144(%ebp), %xmm1
-        cmpl	$11, 172(%esp)
-        movdqa	160(%ebp), %xmm5
-        jl	L_AES_GCM_encrypt_calc_iv_1_aesenc_avx_last
-        aesenc	%xmm5, %xmm1
-        aesenc	176(%ebp), %xmm1
-        cmpl	$13, 172(%esp)
-        movdqa	192(%ebp), %xmm5
-        jl	L_AES_GCM_encrypt_calc_iv_1_aesenc_avx_last
-        aesenc	%xmm5, %xmm1
-        aesenc	208(%ebp), %xmm1
-        movdqa	224(%ebp), %xmm5
-L_AES_GCM_encrypt_calc_iv_1_aesenc_avx_last:
-        aesenclast	%xmm5, %xmm1
-        pshufb	L_aes_gcm_bswap_mask, %xmm1
-        # Calc counter
-        # Initialization vector
-        cmpl	$0x00, %edx
-        movl	$0x00, %ecx
-        je	L_AES_GCM_encrypt_calc_iv_done
-        cmpl	$16, %edx
-        jl	L_AES_GCM_encrypt_calc_iv_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_encrypt_calc_iv_16_loop:
-        movdqu	(%esi,%ecx,1), %xmm4
-        pshufb	L_aes_gcm_bswap_mask, %xmm4
-        pxor	%xmm4, %xmm0
-        pshufd	$0x4e, %xmm0, %xmm5
-        pshufd	$0x4e, %xmm1, %xmm6
-        movdqa	%xmm1, %xmm7
-        movdqa	%xmm1, %xmm4
-        pclmulqdq	$0x11, %xmm0, %xmm7
-        pclmulqdq	$0x00, %xmm0, %xmm4
-        pxor	%xmm0, %xmm5
-        pxor	%xmm1, %xmm6
-        pclmulqdq	$0x00, %xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm6
-        movdqa	%xmm4, %xmm3
-        movdqa	%xmm7, %xmm0
-        pslldq	$8, %xmm6
-        psrldq	$8, %xmm5
-        pxor	%xmm6, %xmm3
-        pxor	%xmm5, %xmm0
-        movdqa	%xmm3, %xmm4
-        movdqa	%xmm0, %xmm5
-        psrld	$31, %xmm4
-        psrld	$31, %xmm5
-        pslld	$0x01, %xmm3
-        pslld	$0x01, %xmm0
-        movdqa	%xmm4, %xmm6
-        pslldq	$4, %xmm4
-        psrldq	$12, %xmm6
-        pslldq	$4, %xmm5
-        por	%xmm6, %xmm0
-        por	%xmm4, %xmm3
-        por	%xmm5, %xmm0
-        movdqa	%xmm3, %xmm4
-        movdqa	%xmm3, %xmm5
-        movdqa	%xmm3, %xmm6
-        pslld	$31, %xmm4
-        pslld	$30, %xmm5
-        pslld	$25, %xmm6
-        pxor	%xmm5, %xmm4
-        pxor	%xmm6, %xmm4
-        movdqa	%xmm4, %xmm5
-        psrldq	$4, %xmm5
-        pslldq	$12, %xmm4
-        pxor	%xmm4, %xmm3
-        movdqa	%xmm3, %xmm6
-        movdqa	%xmm3, %xmm7
-        movdqa	%xmm3, %xmm4
-        psrld	$0x01, %xmm6
-        psrld	$2, %xmm7
-        psrld	$7, %xmm4
-        pxor	%xmm7, %xmm6
-        pxor	%xmm4, %xmm6
-        pxor	%xmm5, %xmm6
-        pxor	%xmm3, %xmm6
-        pxor	%xmm6, %xmm0
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_encrypt_calc_iv_16_loop
-        movl	160(%esp), %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_encrypt_calc_iv_done
-L_AES_GCM_encrypt_calc_iv_lt16:
-        subl	$16, %esp
-        pxor	%xmm4, %xmm4
-        xorl	%ebx, %ebx
-        movdqu	%xmm4, (%esp)
-L_AES_GCM_encrypt_calc_iv_loop:
-        movzbl	(%esi,%ecx,1), %eax
-        movb	%al, (%esp,%ebx,1)
-        incl	%ecx
-        incl	%ebx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_encrypt_calc_iv_loop
-        movdqu	(%esp), %xmm4
-        addl	$16, %esp
-        pshufb	L_aes_gcm_bswap_mask, %xmm4
-        pxor	%xmm4, %xmm0
-        pshufd	$0x4e, %xmm0, %xmm5
-        pshufd	$0x4e, %xmm1, %xmm6
-        movdqa	%xmm1, %xmm7
-        movdqa	%xmm1, %xmm4
-        pclmulqdq	$0x11, %xmm0, %xmm7
-        pclmulqdq	$0x00, %xmm0, %xmm4
-        pxor	%xmm0, %xmm5
-        pxor	%xmm1, %xmm6
-        pclmulqdq	$0x00, %xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm6
-        movdqa	%xmm4, %xmm3
-        movdqa	%xmm7, %xmm0
-        pslldq	$8, %xmm6
-        psrldq	$8, %xmm5
-        pxor	%xmm6, %xmm3
-        pxor	%xmm5, %xmm0
-        movdqa	%xmm3, %xmm4
-        movdqa	%xmm0, %xmm5
-        psrld	$31, %xmm4
-        psrld	$31, %xmm5
-        pslld	$0x01, %xmm3
-        pslld	$0x01, %xmm0
-        movdqa	%xmm4, %xmm6
-        pslldq	$4, %xmm4
-        psrldq	$12, %xmm6
-        pslldq	$4, %xmm5
-        por	%xmm6, %xmm0
-        por	%xmm4, %xmm3
-        por	%xmm5, %xmm0
-        movdqa	%xmm3, %xmm4
-        movdqa	%xmm3, %xmm5
-        movdqa	%xmm3, %xmm6
-        pslld	$31, %xmm4
-        pslld	$30, %xmm5
-        pslld	$25, %xmm6
-        pxor	%xmm5, %xmm4
-        pxor	%xmm6, %xmm4
-        movdqa	%xmm4, %xmm5
-        psrldq	$4, %xmm5
-        pslldq	$12, %xmm4
-        pxor	%xmm4, %xmm3
-        movdqa	%xmm3, %xmm6
-        movdqa	%xmm3, %xmm7
-        movdqa	%xmm3, %xmm4
-        psrld	$0x01, %xmm6
-        psrld	$2, %xmm7
-        psrld	$7, %xmm4
-        pxor	%xmm7, %xmm6
-        pxor	%xmm4, %xmm6
-        pxor	%xmm5, %xmm6
-        pxor	%xmm3, %xmm6
-        pxor	%xmm6, %xmm0
-L_AES_GCM_encrypt_calc_iv_done:
-        # T = Encrypt counter
-        pxor	%xmm4, %xmm4
-        shll	$3, %edx
-        pinsrd	$0x00, %edx, %xmm4
-        pxor	%xmm4, %xmm0
-        pshufd	$0x4e, %xmm0, %xmm5
-        pshufd	$0x4e, %xmm1, %xmm6
-        movdqa	%xmm1, %xmm7
-        movdqa	%xmm1, %xmm4
-        pclmulqdq	$0x11, %xmm0, %xmm7
-        pclmulqdq	$0x00, %xmm0, %xmm4
-        pxor	%xmm0, %xmm5
-        pxor	%xmm1, %xmm6
-        pclmulqdq	$0x00, %xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm6
-        movdqa	%xmm4, %xmm3
-        movdqa	%xmm7, %xmm0
-        pslldq	$8, %xmm6
-        psrldq	$8, %xmm5
-        pxor	%xmm6, %xmm3
-        pxor	%xmm5, %xmm0
-        movdqa	%xmm3, %xmm4
-        movdqa	%xmm0, %xmm5
-        psrld	$31, %xmm4
-        psrld	$31, %xmm5
-        pslld	$0x01, %xmm3
-        pslld	$0x01, %xmm0
-        movdqa	%xmm4, %xmm6
-        pslldq	$4, %xmm4
-        psrldq	$12, %xmm6
-        pslldq	$4, %xmm5
-        por	%xmm6, %xmm0
-        por	%xmm4, %xmm3
-        por	%xmm5, %xmm0
-        movdqa	%xmm3, %xmm4
-        movdqa	%xmm3, %xmm5
-        movdqa	%xmm3, %xmm6
-        pslld	$31, %xmm4
-        pslld	$30, %xmm5
-        pslld	$25, %xmm6
-        pxor	%xmm5, %xmm4
-        pxor	%xmm6, %xmm4
-        movdqa	%xmm4, %xmm5
-        psrldq	$4, %xmm5
-        pslldq	$12, %xmm4
-        pxor	%xmm4, %xmm3
-        movdqa	%xmm3, %xmm6
-        movdqa	%xmm3, %xmm7
-        movdqa	%xmm3, %xmm4
-        psrld	$0x01, %xmm6
-        psrld	$2, %xmm7
-        psrld	$7, %xmm4
-        pxor	%xmm7, %xmm6
-        pxor	%xmm4, %xmm6
-        pxor	%xmm5, %xmm6
-        pxor	%xmm3, %xmm6
-        pxor	%xmm6, %xmm0
-        pshufb	L_aes_gcm_bswap_mask, %xmm0
-        #   Encrypt counter
-        movdqa	(%ebp), %xmm4
-        pxor	%xmm0, %xmm4
-        aesenc	16(%ebp), %xmm4
-        aesenc	32(%ebp), %xmm4
-        aesenc	48(%ebp), %xmm4
-        aesenc	64(%ebp), %xmm4
-        aesenc	80(%ebp), %xmm4
-        aesenc	96(%ebp), %xmm4
-        aesenc	112(%ebp), %xmm4
-        aesenc	128(%ebp), %xmm4
-        aesenc	144(%ebp), %xmm4
-        cmpl	$11, 172(%esp)
-        movdqa	160(%ebp), %xmm5
-        jl	L_AES_GCM_encrypt_calc_iv_2_aesenc_avx_last
-        aesenc	%xmm5, %xmm4
-        aesenc	176(%ebp), %xmm4
-        cmpl	$13, 172(%esp)
-        movdqa	192(%ebp), %xmm5
-        jl	L_AES_GCM_encrypt_calc_iv_2_aesenc_avx_last
-        aesenc	%xmm5, %xmm4
-        aesenc	208(%ebp), %xmm4
-        movdqa	224(%ebp), %xmm5
-L_AES_GCM_encrypt_calc_iv_2_aesenc_avx_last:
-        aesenclast	%xmm5, %xmm4
-        movdqu	%xmm4, 80(%esp)
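
For any other IV length the counter block is derived through GHASH instead: J0 = GHASH_H(IV || 0^s || [0]_64 || [len(IV)]_64), which is what the 16-byte loop, the stack byte-copy tail and the shll $3 bit-length insertion above compute before J0 itself is encrypted into the tag mask. A sketch in spec byte order (the SIMD code keeps its state byte-reflected and shuffles at the boundaries); ghash_block() is a hypothetical helper doing one S = (S ^ blk) * H step:

    #include <stdint.h>
    #include <string.h>

    void ghash_block(uint8_t S[16], const uint8_t blk[16], const uint8_t H[16]);

    void gcm_j0_long_iv(const uint8_t *iv, size_t ivlen,
                        const uint8_t H[16], uint8_t j0[16])
    {
        uint8_t blk[16];
        uint64_t bits = (uint64_t)ivlen * 8;

        memset(j0, 0, 16);
        for (; ivlen >= 16; iv += 16, ivlen -= 16)    /* full IV blocks   */
            ghash_block(j0, iv, H);
        if (ivlen) {                                  /* zero-padded tail */
            memset(blk, 0, 16);
            memcpy(blk, iv, ivlen);
            ghash_block(j0, blk, H);
        }
        memset(blk, 0, 16);                     /* [0]_64 || [bits]_64    */
        for (int i = 0; i < 8; i++)
            blk[15 - i] = (uint8_t)(bits >> (8 * i));
        ghash_block(j0, blk, H);
    }
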
-L_AES_GCM_encrypt_iv_done:
-        movl	140(%esp), %esi
-        # Additional authentication data
-        movl	156(%esp), %edx
-        cmpl	$0x00, %edx
-        je	L_AES_GCM_encrypt_calc_aad_done
-        xorl	%ecx, %ecx
-        cmpl	$16, %edx
-        jl	L_AES_GCM_encrypt_calc_aad_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_encrypt_calc_aad_16_loop:
-        movdqu	(%esi,%ecx,1), %xmm4
-        pshufb	L_aes_gcm_bswap_mask, %xmm4
-        pxor	%xmm4, %xmm2
-        pshufd	$0x4e, %xmm2, %xmm5
-        pshufd	$0x4e, %xmm1, %xmm6
-        movdqa	%xmm1, %xmm7
-        movdqa	%xmm1, %xmm4
-        pclmulqdq	$0x11, %xmm2, %xmm7
-        pclmulqdq	$0x00, %xmm2, %xmm4
-        pxor	%xmm2, %xmm5
-        pxor	%xmm1, %xmm6
-        pclmulqdq	$0x00, %xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm6
-        movdqa	%xmm4, %xmm3
-        movdqa	%xmm7, %xmm2
-        pslldq	$8, %xmm6
-        psrldq	$8, %xmm5
-        pxor	%xmm6, %xmm3
-        pxor	%xmm5, %xmm2
-        movdqa	%xmm3, %xmm4
-        movdqa	%xmm2, %xmm5
-        psrld	$31, %xmm4
-        psrld	$31, %xmm5
-        pslld	$0x01, %xmm3
-        pslld	$0x01, %xmm2
-        movdqa	%xmm4, %xmm6
-        pslldq	$4, %xmm4
-        psrldq	$12, %xmm6
-        pslldq	$4, %xmm5
-        por	%xmm6, %xmm2
-        por	%xmm4, %xmm3
-        por	%xmm5, %xmm2
-        movdqa	%xmm3, %xmm4
-        movdqa	%xmm3, %xmm5
-        movdqa	%xmm3, %xmm6
-        pslld	$31, %xmm4
-        pslld	$30, %xmm5
-        pslld	$25, %xmm6
-        pxor	%xmm5, %xmm4
-        pxor	%xmm6, %xmm4
-        movdqa	%xmm4, %xmm5
-        psrldq	$4, %xmm5
-        pslldq	$12, %xmm4
-        pxor	%xmm4, %xmm3
-        movdqa	%xmm3, %xmm6
-        movdqa	%xmm3, %xmm7
-        movdqa	%xmm3, %xmm4
-        psrld	$0x01, %xmm6
-        psrld	$2, %xmm7
-        psrld	$7, %xmm4
-        pxor	%xmm7, %xmm6
-        pxor	%xmm4, %xmm6
-        pxor	%xmm5, %xmm6
-        pxor	%xmm3, %xmm6
-        pxor	%xmm6, %xmm2
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_encrypt_calc_aad_16_loop
-        movl	156(%esp), %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_encrypt_calc_aad_done
-L_AES_GCM_encrypt_calc_aad_lt16:
-        subl	$16, %esp
-        pxor	%xmm4, %xmm4
-        xorl	%ebx, %ebx
-        movdqu	%xmm4, (%esp)
-L_AES_GCM_encrypt_calc_aad_loop:
-        movzbl	(%esi,%ecx,1), %eax
-        movb	%al, (%esp,%ebx,1)
-        incl	%ecx
-        incl	%ebx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_encrypt_calc_aad_loop
-        movdqu	(%esp), %xmm4
-        addl	$16, %esp
-        pshufb	L_aes_gcm_bswap_mask, %xmm4
-        pxor	%xmm4, %xmm2
-        pshufd	$0x4e, %xmm2, %xmm5
-        pshufd	$0x4e, %xmm1, %xmm6
-        movdqa	%xmm1, %xmm7
-        movdqa	%xmm1, %xmm4
-        pclmulqdq	$0x11, %xmm2, %xmm7
-        pclmulqdq	$0x00, %xmm2, %xmm4
-        pxor	%xmm2, %xmm5
-        pxor	%xmm1, %xmm6
-        pclmulqdq	$0x00, %xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm6
-        movdqa	%xmm4, %xmm3
-        movdqa	%xmm7, %xmm2
-        pslldq	$8, %xmm6
-        psrldq	$8, %xmm5
-        pxor	%xmm6, %xmm3
-        pxor	%xmm5, %xmm2
-        movdqa	%xmm3, %xmm4
-        movdqa	%xmm2, %xmm5
-        psrld	$31, %xmm4
-        psrld	$31, %xmm5
-        pslld	$0x01, %xmm3
-        pslld	$0x01, %xmm2
-        movdqa	%xmm4, %xmm6
-        pslldq	$4, %xmm4
-        psrldq	$12, %xmm6
-        pslldq	$4, %xmm5
-        por	%xmm6, %xmm2
-        por	%xmm4, %xmm3
-        por	%xmm5, %xmm2
-        movdqa	%xmm3, %xmm4
-        movdqa	%xmm3, %xmm5
-        movdqa	%xmm3, %xmm6
-        pslld	$31, %xmm4
-        pslld	$30, %xmm5
-        pslld	$25, %xmm6
-        pxor	%xmm5, %xmm4
-        pxor	%xmm6, %xmm4
-        movdqa	%xmm4, %xmm5
-        psrldq	$4, %xmm5
-        pslldq	$12, %xmm4
-        pxor	%xmm4, %xmm3
-        movdqa	%xmm3, %xmm6
-        movdqa	%xmm3, %xmm7
-        movdqa	%xmm3, %xmm4
-        psrld	$0x01, %xmm6
-        psrld	$2, %xmm7
-        psrld	$7, %xmm4
-        pxor	%xmm7, %xmm6
-        pxor	%xmm4, %xmm6
-        pxor	%xmm5, %xmm6
-        pxor	%xmm3, %xmm6
-        pxor	%xmm6, %xmm2
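
The additional authenticated data is absorbed the same way: whole 16-byte blocks in the first loop, then a zero-padded copy of the tail assembled on the stack. The same logic in C, reusing the reference gf128_mul() sketched near the constant tables:

    #include <stdint.h>
    #include <string.h>

    void gf128_mul(uint8_t Z[16], const uint8_t X[16], const uint8_t Y[16]);

    void ghash_aad(uint8_t S[16], const uint8_t *aad, size_t len,
                   const uint8_t H[16])
    {
        uint8_t blk[16], t[16];
        while (len) {
            size_t n = len < 16 ? len : 16;
            memset(blk, 0, 16);              /* zero-pads the last block */
            memcpy(blk, aad, n);
            for (int i = 0; i < 16; i++) blk[i] ^= S[i];
            gf128_mul(t, blk, H);            /* S = (S ^ block) * H      */
            memcpy(S, t, 16);
            aad += n; len -= n;
        }
    }
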
-L_AES_GCM_encrypt_calc_aad_done:
-        movdqu	%xmm2, 96(%esp)
-        movl	132(%esp), %esi
-        movl	136(%esp), %edi
-        # Calculate counter and H
-        pshufb	L_aes_gcm_bswap_epi64, %xmm0
-        movdqa	%xmm1, %xmm5
-        paddd	L_aes_gcm_one, %xmm0
-        movdqa	%xmm1, %xmm4
-        movdqu	%xmm0, 64(%esp)
-        psrlq	$63, %xmm5
-        psllq	$0x01, %xmm4
-        pslldq	$8, %xmm5
-        por	%xmm5, %xmm4
-        pshufd	$0xff, %xmm1, %xmm1
-        psrad	$31, %xmm1
-        pand	L_aes_gcm_mod2_128, %xmm1
-        pxor	%xmm4, %xmm1
-        xorl	%ebx, %ebx
-        movl	152(%esp), %eax
-        cmpl	$0x40, %eax
-        jl	L_AES_GCM_encrypt_done_64
-        andl	$0xffffffc0, %eax
-        movdqa	%xmm2, %xmm6
-        # H ^ 1
-        movdqu	%xmm1, (%esp)
-        # H ^ 2
-        pshufd	$0x4e, %xmm1, %xmm5
-        pshufd	$0x4e, %xmm1, %xmm6
-        movdqa	%xmm1, %xmm7
-        movdqa	%xmm1, %xmm4
-        pclmulqdq	$0x11, %xmm1, %xmm7
-        pclmulqdq	$0x00, %xmm1, %xmm4
-        pxor	%xmm1, %xmm5
-        pxor	%xmm1, %xmm6
-        pclmulqdq	$0x00, %xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm6
-        movdqa	%xmm7, %xmm0
-        pslldq	$8, %xmm6
-        psrldq	$8, %xmm5
-        pxor	%xmm6, %xmm4
-        pxor	%xmm5, %xmm0
-        movdqa	%xmm4, %xmm5
-        movdqa	%xmm4, %xmm6
-        movdqa	%xmm4, %xmm7
-        pslld	$31, %xmm5
-        pslld	$30, %xmm6
-        pslld	$25, %xmm7
-        pxor	%xmm6, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm7
-        psrldq	$4, %xmm7
-        pslldq	$12, %xmm5
-        pxor	%xmm5, %xmm4
-        movdqa	%xmm4, %xmm5
-        movdqa	%xmm4, %xmm6
-        psrld	$0x01, %xmm5
-        psrld	$2, %xmm6
-        pxor	%xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        psrld	$7, %xmm4
-        pxor	%xmm7, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm5, %xmm0
-        movdqu	%xmm0, 16(%esp)
-        # H ^ 3
-        pshufd	$0x4e, %xmm1, %xmm5
-        pshufd	$0x4e, %xmm0, %xmm6
-        movdqa	%xmm0, %xmm7
-        movdqa	%xmm0, %xmm4
-        pclmulqdq	$0x11, %xmm1, %xmm7
-        pclmulqdq	$0x00, %xmm1, %xmm4
-        pxor	%xmm1, %xmm5
-        pxor	%xmm0, %xmm6
-        pclmulqdq	$0x00, %xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm6
-        movdqa	%xmm7, %xmm3
-        pslldq	$8, %xmm6
-        psrldq	$8, %xmm5
-        pxor	%xmm6, %xmm4
-        pxor	%xmm5, %xmm3
-        movdqa	%xmm4, %xmm5
-        movdqa	%xmm4, %xmm6
-        movdqa	%xmm4, %xmm7
-        pslld	$31, %xmm5
-        pslld	$30, %xmm6
-        pslld	$25, %xmm7
-        pxor	%xmm6, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm7
-        psrldq	$4, %xmm7
-        pslldq	$12, %xmm5
-        pxor	%xmm5, %xmm4
-        movdqa	%xmm4, %xmm5
-        movdqa	%xmm4, %xmm6
-        psrld	$0x01, %xmm5
-        psrld	$2, %xmm6
-        pxor	%xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        psrld	$7, %xmm4
-        pxor	%xmm7, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm5, %xmm3
-        movdqu	%xmm3, 32(%esp)
-        # H ^ 4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pshufd	$0x4e, %xmm0, %xmm6
-        movdqa	%xmm0, %xmm7
-        movdqa	%xmm0, %xmm4
-        pclmulqdq	$0x11, %xmm0, %xmm7
-        pclmulqdq	$0x00, %xmm0, %xmm4
-        pxor	%xmm0, %xmm5
-        pxor	%xmm0, %xmm6
-        pclmulqdq	$0x00, %xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm6
-        movdqa	%xmm7, %xmm3
-        pslldq	$8, %xmm6
-        psrldq	$8, %xmm5
-        pxor	%xmm6, %xmm4
-        pxor	%xmm5, %xmm3
-        movdqa	%xmm4, %xmm5
-        movdqa	%xmm4, %xmm6
-        movdqa	%xmm4, %xmm7
-        pslld	$31, %xmm5
-        pslld	$30, %xmm6
-        pslld	$25, %xmm7
-        pxor	%xmm6, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm7
-        psrldq	$4, %xmm7
-        pslldq	$12, %xmm5
-        pxor	%xmm5, %xmm4
-        movdqa	%xmm4, %xmm5
-        movdqa	%xmm4, %xmm6
-        psrld	$0x01, %xmm5
-        psrld	$2, %xmm6
-        pxor	%xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        psrld	$7, %xmm4
-        pxor	%xmm7, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm5, %xmm3
-        movdqu	%xmm3, 48(%esp)
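
H^1 through H^4 are cached at (%esp) through 48(%esp) so the main loop can hash four blocks per reduction, using the aggregation identity ((((X^C1)H ^ C2)H ^ C3)H ^ C4)H = (X^C1)H^4 ^ C2·H^3 ^ C3·H^2 ^ C4·H. A sketch of that four-block step, with gf128_mul() as before:

    #include <stdint.h>
    #include <string.h>

    void gf128_mul(uint8_t Z[16], const uint8_t X[16], const uint8_t Y[16]);

    /* Hpow[0] = H^1 ... Hpow[3] = H^4 */
    void ghash_4blocks(uint8_t X[16], const uint8_t C[4][16],
                       const uint8_t Hpow[4][16])
    {
        uint8_t acc[16] = {0}, in[16], t[16];
        for (int b = 0; b < 4; b++) {
            memcpy(in, C[b], 16);
            if (b == 0)                        /* digest joins block 1 */
                for (int i = 0; i < 16; i++) in[i] ^= X[i];
            gf128_mul(t, in, Hpow[3 - b]);     /* block b gets H^(4-b) */
            for (int i = 0; i < 16; i++) acc[i] ^= t[i];
        }
        memcpy(X, acc, 16);
    }
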
-        # First 64 bytes of input
-        # Encrypt 64 bytes of counter
-        movdqu	64(%esp), %xmm4
-        movdqa	L_aes_gcm_bswap_epi64, %xmm3
-        movdqa	%xmm4, %xmm5
-        movdqa	%xmm4, %xmm6
-        movdqa	%xmm4, %xmm7
-        pshufb	%xmm3, %xmm4
-        paddd	L_aes_gcm_one, %xmm5
-        pshufb	%xmm3, %xmm5
-        paddd	L_aes_gcm_two, %xmm6
-        pshufb	%xmm3, %xmm6
-        paddd	L_aes_gcm_three, %xmm7
-        pshufb	%xmm3, %xmm7
-        movdqu	64(%esp), %xmm3
-        paddd	L_aes_gcm_four, %xmm3
-        movdqu	%xmm3, 64(%esp)
-        movdqa	(%ebp), %xmm3
-        pxor	%xmm3, %xmm4
-        pxor	%xmm3, %xmm5
-        pxor	%xmm3, %xmm6
-        pxor	%xmm3, %xmm7
-        movdqa	16(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	32(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	48(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	64(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	80(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	96(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	112(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	128(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	144(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        cmpl	$11, 172(%esp)
-        movdqa	160(%ebp), %xmm3
-        jl	L_AES_GCM_encrypt_enc_done
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	176(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        cmpl	$13, 172(%esp)
-        movdqa	192(%ebp), %xmm3
-        jl	L_AES_GCM_encrypt_enc_done
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	208(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	224(%ebp), %xmm3
-L_AES_GCM_encrypt_enc_done:
-        aesenclast	%xmm3, %xmm4
-        aesenclast	%xmm3, %xmm5
-        movdqu	(%esi), %xmm0
-        movdqu	16(%esi), %xmm1
-        pxor	%xmm0, %xmm4
-        pxor	%xmm1, %xmm5
-        movdqu	%xmm4, (%edi)
-        movdqu	%xmm5, 16(%edi)
-        aesenclast	%xmm3, %xmm6
-        aesenclast	%xmm3, %xmm7
-        movdqu	32(%esi), %xmm0
-        movdqu	48(%esi), %xmm1
-        pxor	%xmm0, %xmm6
-        pxor	%xmm1, %xmm7
-        movdqu	%xmm6, 32(%edi)
-        movdqu	%xmm7, 48(%edi)
-        cmpl	$0x40, %eax
-        movl	$0x40, %ebx
-        movl	%esi, %ecx
-        movl	%edi, %edx
-        jle	L_AES_GCM_encrypt_end_64
-        # More 64 bytes of input
-L_AES_GCM_encrypt_ghash_64:
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        # Encrypt 64 bytes of counter
-        movdqu	64(%esp), %xmm4
-        movdqa	L_aes_gcm_bswap_epi64, %xmm3
-        movdqa	%xmm4, %xmm5
-        movdqa	%xmm4, %xmm6
-        movdqa	%xmm4, %xmm7
-        pshufb	%xmm3, %xmm4
-        paddd	L_aes_gcm_one, %xmm5
-        pshufb	%xmm3, %xmm5
-        paddd	L_aes_gcm_two, %xmm6
-        pshufb	%xmm3, %xmm6
-        paddd	L_aes_gcm_three, %xmm7
-        pshufb	%xmm3, %xmm7
-        movdqu	64(%esp), %xmm3
-        paddd	L_aes_gcm_four, %xmm3
-        movdqu	%xmm3, 64(%esp)
-        movdqa	(%ebp), %xmm3
-        pxor	%xmm3, %xmm4
-        pxor	%xmm3, %xmm5
-        pxor	%xmm3, %xmm6
-        pxor	%xmm3, %xmm7
-        movdqa	16(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	32(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	48(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	64(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	80(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	96(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	112(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	128(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	144(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        cmpl	$11, 172(%esp)
-        movdqa	160(%ebp), %xmm3
-        jl	L_AES_GCM_encrypt_aesenc_64_ghash_avx_done
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	176(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        cmpl	$13, 172(%esp)
-        movdqa	192(%ebp), %xmm3
-        jl	L_AES_GCM_encrypt_aesenc_64_ghash_avx_done
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	208(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	224(%ebp), %xmm3
-L_AES_GCM_encrypt_aesenc_64_ghash_avx_done:
-        aesenclast	%xmm3, %xmm4
-        aesenclast	%xmm3, %xmm5
-        movdqu	(%ecx), %xmm0
-        movdqu	16(%ecx), %xmm1
-        pxor	%xmm0, %xmm4
-        pxor	%xmm1, %xmm5
-        movdqu	%xmm4, (%edx)
-        movdqu	%xmm5, 16(%edx)
-        aesenclast	%xmm3, %xmm6
-        aesenclast	%xmm3, %xmm7
-        movdqu	32(%ecx), %xmm0
-        movdqu	48(%ecx), %xmm1
-        pxor	%xmm0, %xmm6
-        pxor	%xmm1, %xmm7
-        movdqu	%xmm6, 32(%edx)
-        movdqu	%xmm7, 48(%edx)
-        # GHASH the 64 ciphertext bytes written in the previous iteration
-        movdqu	96(%esp), %xmm6
-        movdqu	48(%esp), %xmm3
-        movdqu	-64(%edx), %xmm4
-        pshufb	L_aes_gcm_bswap_mask, %xmm4
-        pxor	%xmm6, %xmm4
-        pshufd	$0x4e, %xmm3, %xmm5
-        pshufd	$0x4e, %xmm4, %xmm1
-        pxor	%xmm3, %xmm5
-        pxor	%xmm4, %xmm1
-        movdqa	%xmm4, %xmm7
-        pclmulqdq	$0x11, %xmm3, %xmm7
-        movdqa	%xmm4, %xmm6
-        pclmulqdq	$0x00, %xmm3, %xmm6
-        pclmulqdq	$0x00, %xmm1, %xmm5
-        pxor	%xmm6, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqu	32(%esp), %xmm3
-        movdqu	-48(%edx), %xmm4
-        pshufd	$0x4e, %xmm3, %xmm0
-        pshufb	L_aes_gcm_bswap_mask, %xmm4
-        pxor	%xmm3, %xmm0
-        pshufd	$0x4e, %xmm4, %xmm1
-        pxor	%xmm4, %xmm1
-        movdqa	%xmm4, %xmm2
-        pclmulqdq	$0x11, %xmm3, %xmm2
-        pclmulqdq	$0x00, %xmm4, %xmm3
-        pclmulqdq	$0x00, %xmm1, %xmm0
-        pxor	%xmm3, %xmm5
-        pxor	%xmm3, %xmm6
-        pxor	%xmm2, %xmm5
-        pxor	%xmm2, %xmm7
-        pxor	%xmm0, %xmm5
-        movdqu	16(%esp), %xmm3
-        movdqu	-32(%edx), %xmm4
-        pshufd	$0x4e, %xmm3, %xmm0
-        pshufb	L_aes_gcm_bswap_mask, %xmm4
-        pxor	%xmm3, %xmm0
-        pshufd	$0x4e, %xmm4, %xmm1
-        pxor	%xmm4, %xmm1
-        movdqa	%xmm4, %xmm2
-        pclmulqdq	$0x11, %xmm3, %xmm2
-        pclmulqdq	$0x00, %xmm4, %xmm3
-        pclmulqdq	$0x00, %xmm1, %xmm0
-        pxor	%xmm3, %xmm5
-        pxor	%xmm3, %xmm6
-        pxor	%xmm2, %xmm5
-        pxor	%xmm2, %xmm7
-        pxor	%xmm0, %xmm5
-        movdqu	(%esp), %xmm3
-        movdqu	-16(%edx), %xmm4
-        pshufd	$0x4e, %xmm3, %xmm0
-        pshufb	L_aes_gcm_bswap_mask, %xmm4
-        pxor	%xmm3, %xmm0
-        pshufd	$0x4e, %xmm4, %xmm1
-        pxor	%xmm4, %xmm1
-        movdqa	%xmm4, %xmm2
-        pclmulqdq	$0x11, %xmm3, %xmm2
-        pclmulqdq	$0x00, %xmm4, %xmm3
-        pclmulqdq	$0x00, %xmm1, %xmm0
-        pxor	%xmm3, %xmm5
-        pxor	%xmm3, %xmm6
-        pxor	%xmm2, %xmm5
-        pxor	%xmm2, %xmm7
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm5, %xmm1
-        psrldq	$8, %xmm5
-        pslldq	$8, %xmm1
-        pxor	%xmm1, %xmm6
-        pxor	%xmm5, %xmm7
-        movdqa	%xmm6, %xmm3
-        movdqa	%xmm6, %xmm0
-        movdqa	%xmm6, %xmm1
-        pslld	$31, %xmm3
-        pslld	$30, %xmm0
-        pslld	$25, %xmm1
-        pxor	%xmm0, %xmm3
-        pxor	%xmm1, %xmm3
-        movdqa	%xmm3, %xmm0
-        pslldq	$12, %xmm3
-        psrldq	$4, %xmm0
-        pxor	%xmm3, %xmm6
-        movdqa	%xmm6, %xmm1
-        movdqa	%xmm6, %xmm5
-        movdqa	%xmm6, %xmm4
-        psrld	$0x01, %xmm1
-        psrld	$2, %xmm5
-        psrld	$7, %xmm4
-        pxor	%xmm5, %xmm1
-        pxor	%xmm4, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm1, %xmm6
-        pxor	%xmm7, %xmm6
-        movdqu	%xmm6, 96(%esp)
-        addl	$0x40, %ebx
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_encrypt_ghash_64
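
The steady-state loop encrypts four counter blocks while folding the 64 ciphertext bytes produced one iteration earlier (read back from -64(%edx) onward) into the hash, so the AES and pclmulqdq work overlaps. The counter side of one iteration in spec-order C (the paddd L_aes_gcm_one..four / pshufb pairs achieve the same thing on reflected data); aes_enc() is a hypothetical single-block helper, and note that GCM increments only the low 32 bits of the counter, big-endian:

    #include <stdint.h>

    void aes_enc(const uint8_t in[16], uint8_t out[16]);  /* hypothetical */

    void gcm_ctr_x4(uint8_t ctr[16], const uint8_t *src, uint8_t *dst)
    {
        uint8_t ks[16];
        for (int b = 0; b < 4; b++) {
            aes_enc(ctr, ks);
            for (int i = 0; i < 16; i++)
                dst[16 * b + i] = src[16 * b + i] ^ ks[i];
            for (int i = 15; i >= 12 && ++ctr[i] == 0; i--)
                ;                     /* 32-bit big-endian increment */
        }
    }
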
-L_AES_GCM_encrypt_end_64:
-        movdqu	96(%esp), %xmm2
-        # Block 1
-        movdqa	L_aes_gcm_bswap_mask, %xmm4
-        movdqu	(%edx), %xmm1
-        pshufb	%xmm4, %xmm1
-        movdqu	48(%esp), %xmm3
-        pxor	%xmm2, %xmm1
-        pshufd	$0x4e, %xmm1, %xmm5
-        pshufd	$0x4e, %xmm3, %xmm6
-        movdqa	%xmm3, %xmm7
-        movdqa	%xmm3, %xmm4
-        pclmulqdq	$0x11, %xmm1, %xmm7
-        pclmulqdq	$0x00, %xmm1, %xmm4
-        pxor	%xmm1, %xmm5
-        pxor	%xmm3, %xmm6
-        pclmulqdq	$0x00, %xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm6
-        movdqa	%xmm4, %xmm0
-        movdqa	%xmm7, %xmm2
-        pslldq	$8, %xmm6
-        psrldq	$8, %xmm5
-        pxor	%xmm6, %xmm0
-        pxor	%xmm5, %xmm2
-        # Block 2
-        movdqa	L_aes_gcm_bswap_mask, %xmm4
-        movdqu	16(%edx), %xmm1
-        pshufb	%xmm4, %xmm1
-        movdqu	32(%esp), %xmm3
-        pshufd	$0x4e, %xmm1, %xmm5
-        pshufd	$0x4e, %xmm3, %xmm6
-        movdqa	%xmm3, %xmm7
-        movdqa	%xmm3, %xmm4
-        pclmulqdq	$0x11, %xmm1, %xmm7
-        pclmulqdq	$0x00, %xmm1, %xmm4
-        pxor	%xmm1, %xmm5
-        pxor	%xmm3, %xmm6
-        pclmulqdq	$0x00, %xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm6
-        pxor	%xmm4, %xmm0
-        pxor	%xmm7, %xmm2
-        pslldq	$8, %xmm6
-        psrldq	$8, %xmm5
-        pxor	%xmm6, %xmm0
-        pxor	%xmm5, %xmm2
-        # Block 3
-        movdqa	L_aes_gcm_bswap_mask, %xmm4
-        movdqu	32(%edx), %xmm1
-        pshufb	%xmm4, %xmm1
-        movdqu	16(%esp), %xmm3
-        pshufd	$0x4e, %xmm1, %xmm5
-        pshufd	$0x4e, %xmm3, %xmm6
-        movdqa	%xmm3, %xmm7
-        movdqa	%xmm3, %xmm4
-        pclmulqdq	$0x11, %xmm1, %xmm7
-        pclmulqdq	$0x00, %xmm1, %xmm4
-        pxor	%xmm1, %xmm5
-        pxor	%xmm3, %xmm6
-        pclmulqdq	$0x00, %xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm6
-        pxor	%xmm4, %xmm0
-        pxor	%xmm7, %xmm2
-        pslldq	$8, %xmm6
-        psrldq	$8, %xmm5
-        pxor	%xmm6, %xmm0
-        pxor	%xmm5, %xmm2
-        # Block 4
-        movdqa	L_aes_gcm_bswap_mask, %xmm4
-        movdqu	48(%edx), %xmm1
-        pshufb	%xmm4, %xmm1
-        movdqu	(%esp), %xmm3
-        pshufd	$0x4e, %xmm1, %xmm5
-        pshufd	$0x4e, %xmm3, %xmm6
-        movdqa	%xmm3, %xmm7
-        movdqa	%xmm3, %xmm4
-        pclmulqdq	$0x11, %xmm1, %xmm7
-        pclmulqdq	$0x00, %xmm1, %xmm4
-        pxor	%xmm1, %xmm5
-        pxor	%xmm3, %xmm6
-        pclmulqdq	$0x00, %xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm6
-        pxor	%xmm4, %xmm0
-        pxor	%xmm7, %xmm2
-        pslldq	$8, %xmm6
-        psrldq	$8, %xmm5
-        pxor	%xmm6, %xmm0
-        pxor	%xmm5, %xmm2
-        movdqa	%xmm0, %xmm4
-        movdqa	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pslld	$31, %xmm4
-        pslld	$30, %xmm5
-        pslld	$25, %xmm6
-        pxor	%xmm5, %xmm4
-        pxor	%xmm6, %xmm4
-        movdqa	%xmm4, %xmm5
-        psrldq	$4, %xmm5
-        pslldq	$12, %xmm4
-        pxor	%xmm4, %xmm0
-        movdqa	%xmm0, %xmm6
-        movdqa	%xmm0, %xmm7
-        movdqa	%xmm0, %xmm4
-        psrld	$0x01, %xmm6
-        psrld	$2, %xmm7
-        psrld	$7, %xmm4
-        pxor	%xmm7, %xmm6
-        pxor	%xmm4, %xmm6
-        pxor	%xmm5, %xmm6
-        pxor	%xmm0, %xmm6
-        pxor	%xmm6, %xmm2
-        movdqu	(%esp), %xmm1
-L_AES_GCM_encrypt_done_64:
-        movl	152(%esp), %edx
-        cmpl	%edx, %ebx
-        jge	L_AES_GCM_encrypt_done_enc
-        movl	152(%esp), %eax
-        andl	$0xfffffff0, %eax
-        cmpl	%eax, %ebx
-        jge	L_AES_GCM_encrypt_last_block_done
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        movdqu	64(%esp), %xmm4
-        movdqa	%xmm4, %xmm5
-        pshufb	L_aes_gcm_bswap_epi64, %xmm4
-        paddd	L_aes_gcm_one, %xmm5
-        pxor	(%ebp), %xmm4
-        movdqu	%xmm5, 64(%esp)
-        aesenc	16(%ebp), %xmm4
-        aesenc	32(%ebp), %xmm4
-        aesenc	48(%ebp), %xmm4
-        aesenc	64(%ebp), %xmm4
-        aesenc	80(%ebp), %xmm4
-        aesenc	96(%ebp), %xmm4
-        aesenc	112(%ebp), %xmm4
-        aesenc	128(%ebp), %xmm4
-        aesenc	144(%ebp), %xmm4
-        cmpl	$11, 172(%esp)
-        movdqa	160(%ebp), %xmm5
-        jl	L_AES_GCM_encrypt_aesenc_block_aesenc_avx_last
-        aesenc	%xmm5, %xmm4
-        aesenc	176(%ebp), %xmm4
-        cmpl	$13, 172(%esp)
-        movdqa	192(%ebp), %xmm5
-        jl	L_AES_GCM_encrypt_aesenc_block_aesenc_avx_last
-        aesenc	%xmm5, %xmm4
-        aesenc	208(%ebp), %xmm4
-        movdqa	224(%ebp), %xmm5
-L_AES_GCM_encrypt_aesenc_block_aesenc_avx_last:
-        aesenclast	%xmm5, %xmm4
-        movdqu	(%ecx), %xmm5
-        pxor	%xmm5, %xmm4
-        movdqu	%xmm4, (%edx)
-        pshufb	L_aes_gcm_bswap_mask, %xmm4
-        pxor	%xmm4, %xmm2
-        addl	$16, %ebx
-        cmpl	%eax, %ebx
-        jge	L_AES_GCM_encrypt_last_block_ghash
-L_AES_GCM_encrypt_last_block_start:
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        movdqu	64(%esp), %xmm4
-        movdqa	%xmm4, %xmm5
-        pshufb	L_aes_gcm_bswap_epi64, %xmm4
-        paddd	L_aes_gcm_one, %xmm5
-        pxor	(%ebp), %xmm4
-        movdqu	%xmm5, 64(%esp)
-        movdqu	%xmm2, %xmm0
-        pclmulqdq	$16, %xmm1, %xmm0
-        aesenc	16(%ebp), %xmm4
-        aesenc	32(%ebp), %xmm4
-        movdqu	%xmm2, %xmm3
-        pclmulqdq	$0x01, %xmm1, %xmm3
-        aesenc	48(%ebp), %xmm4
-        aesenc	64(%ebp), %xmm4
-        aesenc	80(%ebp), %xmm4
-        movdqu	%xmm2, %xmm5
-        pclmulqdq	$0x11, %xmm1, %xmm5
-        aesenc	96(%ebp), %xmm4
-        pxor	%xmm3, %xmm0
-        movdqa	%xmm0, %xmm6
-        psrldq	$8, %xmm0
-        pslldq	$8, %xmm6
-        aesenc	112(%ebp), %xmm4
-        movdqu	%xmm2, %xmm3
-        pclmulqdq	$0x00, %xmm1, %xmm3
-        pxor	%xmm3, %xmm6
-        pxor	%xmm0, %xmm5
-        movdqa	L_aes_gcm_mod2_128, %xmm7
-        movdqa	%xmm6, %xmm3
-        pclmulqdq	$16, %xmm7, %xmm3
-        aesenc	128(%ebp), %xmm4
-        pshufd	$0x4e, %xmm6, %xmm0
-        pxor	%xmm3, %xmm0
-        movdqa	%xmm0, %xmm3
-        pclmulqdq	$16, %xmm7, %xmm3
-        aesenc	144(%ebp), %xmm4
-        pshufd	$0x4e, %xmm0, %xmm2
-        pxor	%xmm3, %xmm2
-        pxor	%xmm5, %xmm2
-        cmpl	$11, 172(%esp)
-        movdqa	160(%ebp), %xmm5
-        jl	L_AES_GCM_encrypt_aesenc_gfmul_last
-        aesenc	%xmm5, %xmm4
-        aesenc	176(%ebp), %xmm4
-        cmpl	$13, 172(%esp)
-        movdqa	192(%ebp), %xmm5
-        jl	L_AES_GCM_encrypt_aesenc_gfmul_last
-        aesenc	%xmm5, %xmm4
-        aesenc	208(%ebp), %xmm4
-        movdqa	224(%ebp), %xmm5
-L_AES_GCM_encrypt_aesenc_gfmul_last:
-        aesenclast	%xmm5, %xmm4
-        movdqu	(%ecx), %xmm5
-        pxor	%xmm5, %xmm4
-        movdqu	%xmm4, (%edx)
-        pshufb	L_aes_gcm_bswap_mask, %xmm4
-        pxor	%xmm4, %xmm2
-        addl	$16, %ebx
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_encrypt_last_block_start
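
In this tail loop a single block's GHASH multiply is threaded between the AES rounds: four pclmulqdq build the 256-bit product, and two further pclmulqdq $16 against L_aes_gcm_mod2_128 fold it back to 128 bits (the pclmulqdq / pshufd $0x4e / pxor pattern). A plain-C sketch of that reduction under the usual reflected-domain convention; clmul64() models one pclmulqdq:

    #include <stdint.h>

    /* carry-less 64x64 -> 128 multiply, one pclmulqdq's worth */
    static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
    {
        uint64_t h = 0, l = 0;
        for (int i = 0; i < 64; i++)
            if ((b >> i) & 1) {
                l ^= a << i;
                if (i) h ^= a >> (64 - i);
            }
        *hi = h; *lo = l;
    }

    /* reduce a 256-bit product x[0..3] (x[0] least significant) to 128
     * bits in the byte-reflected GCM domain: fold the low half through
     * 0xC2...00 twice, then XOR in the high half */
    static void gcm_reduce(const uint64_t x[4], uint64_t z[2])
    {
        const uint64_t P = 0xC200000000000000ULL;
        uint64_t a1, a0, b1, b0, m1, m0;
        clmul64(x[0], P, &a1, &a0);     /* first fold       */
        m0 = x[1] ^ a0;                 /* qword swap + xor */
        m1 = x[0] ^ a1;
        clmul64(m0, P, &b1, &b0);       /* second fold      */
        z[0] = x[2] ^ m1 ^ b0;
        z[1] = x[3] ^ m0 ^ b1;
    }
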
-L_AES_GCM_encrypt_last_block_ghash:
-        pshufd	$0x4e, %xmm1, %xmm5
-        pshufd	$0x4e, %xmm2, %xmm6
-        movdqa	%xmm2, %xmm7
-        movdqa	%xmm2, %xmm4
-        pclmulqdq	$0x11, %xmm1, %xmm7
-        pclmulqdq	$0x00, %xmm1, %xmm4
-        pxor	%xmm1, %xmm5
-        pxor	%xmm2, %xmm6
-        pclmulqdq	$0x00, %xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm6
-        movdqa	%xmm7, %xmm2
-        pslldq	$8, %xmm6
-        psrldq	$8, %xmm5
-        pxor	%xmm6, %xmm4
-        pxor	%xmm5, %xmm2
-        movdqa	%xmm4, %xmm5
-        movdqa	%xmm4, %xmm6
-        movdqa	%xmm4, %xmm7
-        pslld	$31, %xmm5
-        pslld	$30, %xmm6
-        pslld	$25, %xmm7
-        pxor	%xmm6, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm7
-        psrldq	$4, %xmm7
-        pslldq	$12, %xmm5
-        pxor	%xmm5, %xmm4
-        movdqa	%xmm4, %xmm5
-        movdqa	%xmm4, %xmm6
-        psrld	$0x01, %xmm5
-        psrld	$2, %xmm6
-        pxor	%xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        psrld	$7, %xmm4
-        pxor	%xmm7, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm5, %xmm2
-L_AES_GCM_encrypt_last_block_done:
-        movl	152(%esp), %ecx
-        movl	%ecx, %edx
-        andl	$15, %ecx
-        jz	L_AES_GCM_encrypt_aesenc_last15_enc_avx_done
-        movdqu	64(%esp), %xmm0
-        pshufb	L_aes_gcm_bswap_epi64, %xmm0
-        pxor	(%ebp), %xmm0
-        aesenc	16(%ebp), %xmm0
-        aesenc	32(%ebp), %xmm0
-        aesenc	48(%ebp), %xmm0
-        aesenc	64(%ebp), %xmm0
-        aesenc	80(%ebp), %xmm0
-        aesenc	96(%ebp), %xmm0
-        aesenc	112(%ebp), %xmm0
-        aesenc	128(%ebp), %xmm0
-        aesenc	144(%ebp), %xmm0
-        cmpl	$11, 172(%esp)
-        movdqa	160(%ebp), %xmm5
-        jl	L_AES_GCM_encrypt_aesenc_last15_enc_avx_aesenc_avx_last
-        aesenc	%xmm5, %xmm0
-        aesenc	176(%ebp), %xmm0
-        cmpl	$13, 172(%esp)
-        movdqa	192(%ebp), %xmm5
-        jl	L_AES_GCM_encrypt_aesenc_last15_enc_avx_aesenc_avx_last
-        aesenc	%xmm5, %xmm0
-        aesenc	208(%ebp), %xmm0
-        movdqa	224(%ebp), %xmm5
-L_AES_GCM_encrypt_aesenc_last15_enc_avx_aesenc_avx_last:
-        aesenclast	%xmm5, %xmm0
-        subl	$16, %esp
-        xorl	%ecx, %ecx
-        movdqu	%xmm0, (%esp)
-L_AES_GCM_encrypt_aesenc_last15_enc_avx_loop:
-        movzbl	(%esi,%ebx,1), %eax
-        xorb	(%esp,%ecx,1), %al
-        movb	%al, (%edi,%ebx,1)
-        movb	%al, (%esp,%ecx,1)
-        incl	%ebx
-        incl	%ecx
-        cmpl	%edx, %ebx
-        jl	L_AES_GCM_encrypt_aesenc_last15_enc_avx_loop
-        xorl	%eax, %eax
-        cmpl	$16, %ecx
-        je	L_AES_GCM_encrypt_aesenc_last15_enc_avx_finish_enc
-L_AES_GCM_encrypt_aesenc_last15_enc_avx_byte_loop:
-        movb	%al, (%esp,%ecx,1)
-        incl	%ecx
-        cmpl	$16, %ecx
-        jl	L_AES_GCM_encrypt_aesenc_last15_enc_avx_byte_loop
-L_AES_GCM_encrypt_aesenc_last15_enc_avx_finish_enc:
-        movdqu	(%esp), %xmm0
-        addl	$16, %esp
-        pshufb	L_aes_gcm_bswap_mask, %xmm0
-        pxor	%xmm0, %xmm2
-        pshufd	$0x4e, %xmm1, %xmm5
-        pshufd	$0x4e, %xmm2, %xmm6
-        movdqa	%xmm2, %xmm7
-        movdqa	%xmm2, %xmm4
-        pclmulqdq	$0x11, %xmm1, %xmm7
-        pclmulqdq	$0x00, %xmm1, %xmm4
-        pxor	%xmm1, %xmm5
-        pxor	%xmm2, %xmm6
-        pclmulqdq	$0x00, %xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm6
-        movdqa	%xmm7, %xmm2
-        pslldq	$8, %xmm6
-        psrldq	$8, %xmm5
-        pxor	%xmm6, %xmm4
-        pxor	%xmm5, %xmm2
-        movdqa	%xmm4, %xmm5
-        movdqa	%xmm4, %xmm6
-        movdqa	%xmm4, %xmm7
-        pslld	$31, %xmm5
-        pslld	$30, %xmm6
-        pslld	$25, %xmm7
-        pxor	%xmm6, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm7
-        psrldq	$4, %xmm7
-        pslldq	$12, %xmm5
-        pxor	%xmm5, %xmm4
-        movdqa	%xmm4, %xmm5
-        movdqa	%xmm4, %xmm6
-        psrld	$0x01, %xmm5
-        psrld	$2, %xmm6
-        pxor	%xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        psrld	$7, %xmm4
-        pxor	%xmm7, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm5, %xmm2
-L_AES_GCM_encrypt_aesenc_last15_enc_avx_done:
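
Any remaining len & 15 bytes get one final keystream block: the first byte loop XORs plaintext against the keystream copy on the stack and stores the result both to the output and back into the stack slot, and the second loop zeroes the rest so the padded ciphertext block can still be GHASHed. A sketch:

    #include <stdint.h>
    #include <string.h>

    /* encrypt rem (< 16) trailing bytes; 'padded' receives the
     * zero-padded ciphertext block still to be folded into GHASH */
    void gcm_encrypt_tail(const uint8_t ks[16], const uint8_t *src,
                          uint8_t *dst, size_t rem, uint8_t padded[16])
    {
        memset(padded, 0, 16);
        for (size_t i = 0; i < rem; i++) {
            uint8_t c = (uint8_t)(src[i] ^ ks[i]);
            dst[i] = c;
            padded[i] = c;
        }
    }
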
-L_AES_GCM_encrypt_done_enc:
-        movl	148(%esp), %edi
-        movl	164(%esp), %ebx
-        movl	152(%esp), %edx
-        movl	156(%esp), %ecx
-        shll	$3, %edx
-        shll	$3, %ecx
-        pinsrd	$0x00, %edx, %xmm4
-        pinsrd	$2, %ecx, %xmm4
-        movl	152(%esp), %edx
-        movl	156(%esp), %ecx
-        shrl	$29, %edx
-        shrl	$29, %ecx
-        pinsrd	$0x01, %edx, %xmm4
-        pinsrd	$3, %ecx, %xmm4
-        pxor	%xmm4, %xmm2
-        pshufd	$0x4e, %xmm1, %xmm5
-        pshufd	$0x4e, %xmm2, %xmm6
-        movdqa	%xmm2, %xmm7
-        movdqa	%xmm2, %xmm4
-        pclmulqdq	$0x11, %xmm1, %xmm7
-        pclmulqdq	$0x00, %xmm1, %xmm4
-        pxor	%xmm1, %xmm5
-        pxor	%xmm2, %xmm6
-        pclmulqdq	$0x00, %xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm6
-        movdqa	%xmm7, %xmm2
-        pslldq	$8, %xmm6
-        psrldq	$8, %xmm5
-        pxor	%xmm6, %xmm4
-        pxor	%xmm5, %xmm2
-        movdqa	%xmm4, %xmm5
-        movdqa	%xmm4, %xmm6
-        movdqa	%xmm4, %xmm7
-        pslld	$31, %xmm5
-        pslld	$30, %xmm6
-        pslld	$25, %xmm7
-        pxor	%xmm6, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm7
-        psrldq	$4, %xmm7
-        pslldq	$12, %xmm5
-        pxor	%xmm5, %xmm4
-        movdqa	%xmm4, %xmm5
-        movdqa	%xmm4, %xmm6
-        psrld	$0x01, %xmm5
-        psrld	$2, %xmm6
-        pxor	%xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        psrld	$7, %xmm4
-        pxor	%xmm7, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm5, %xmm2
-        pshufb	L_aes_gcm_bswap_mask, %xmm2
-        movdqu	80(%esp), %xmm4
-        pxor	%xmm2, %xmm4
-        cmpl	$16, %ebx
-        je	L_AES_GCM_encrypt_store_tag_16
-        xorl	%ecx, %ecx
-        movdqu	%xmm4, (%esp)
-L_AES_GCM_encrypt_store_tag_loop:
-        movzbl	(%esp,%ecx,1), %eax
-        movb	%al, (%edi,%ecx,1)
-        incl	%ecx
-        cmpl	%ebx, %ecx
-        jne	L_AES_GCM_encrypt_store_tag_loop
-        jmp	L_AES_GCM_encrypt_store_tag_done
-L_AES_GCM_encrypt_store_tag_16:
-        movdqu	%xmm4, (%edi)
-L_AES_GCM_encrypt_store_tag_done:
-        addl	$0x70, %esp
-        popl	%ebp
-        popl	%edi
-        popl	%esi
-        popl	%ebx
-        ret
-.size	AES_GCM_encrypt,.-AES_GCM_encrypt
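
The epilogue assembles the length block [len(AAD)]_64 || [len(C)]_64 in bits (each shll $3 / shrl $29 pair splits a 32-bit byte count into the two dwords of its 64-bit bit count), performs one last GHASH multiply, un-reflects the digest with a final pshufb, XORs in the saved E_K(J0) from 80(%esp), and stores either a full 16-byte tag or a byte-loop-truncated one. In spec-order C, with ghash_block() hypothetical as before:

    #include <stdint.h>
    #include <string.h>

    void ghash_block(uint8_t S[16], const uint8_t blk[16], const uint8_t H[16]);

    void gcm_final_tag(uint8_t S[16], uint64_t aadlen, uint64_t clen,
                       const uint8_t H[16], const uint8_t ekj0[16],
                       uint8_t *tag, size_t taglen)   /* taglen <= 16 */
    {
        uint8_t lenblk[16];
        uint64_t ab = aadlen * 8, cb = clen * 8;
        for (int i = 0; i < 8; i++) {
            lenblk[7 - i]  = (uint8_t)(ab >> (8 * i));  /* [len(AAD)]_64 */
            lenblk[15 - i] = (uint8_t)(cb >> (8 * i));  /* [len(C)]_64   */
        }
        ghash_block(S, lenblk, H);               /* S = (S ^ len) * H    */
        for (size_t i = 0; i < taglen; i++)
            tag[i] = (uint8_t)(S[i] ^ ekj0[i]);  /* T = MSB(S ^ E_K(J0)) */
    }
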
-.text
-.globl	AES_GCM_decrypt
-.type	AES_GCM_decrypt,@function
-.align	16
-AES_GCM_decrypt:
-        pushl	%ebx
-        pushl	%esi
-        pushl	%edi
-        pushl	%ebp
-        subl	$0xb0, %esp
-        movl	208(%esp), %esi
-        movl	232(%esp), %ebp
-        movl	224(%esp), %edx
-        pxor	%xmm0, %xmm0
-        pxor	%xmm2, %xmm2
-        cmpl	$12, %edx
-        jne	L_AES_GCM_decrypt_iv_not_12
-        # Calculate values when IV is 12 bytes
-        # Set counter based on IV
-        movl	$0x1000000, %ecx
-        pinsrd	$0x00, (%esi), %xmm0
-        pinsrd	$0x01, 4(%esi), %xmm0
-        pinsrd	$2, 8(%esi), %xmm0
-        pinsrd	$3, %ecx, %xmm0
-        # H = Encrypt X(=0) and T = Encrypt counter
-        movdqa	%xmm0, %xmm5
-        movdqa	(%ebp), %xmm1
-        pxor	%xmm1, %xmm5
-        movdqa	16(%ebp), %xmm3
-        aesenc	%xmm3, %xmm1
-        aesenc	%xmm3, %xmm5
-        movdqa	32(%ebp), %xmm3
-        aesenc	%xmm3, %xmm1
-        aesenc	%xmm3, %xmm5
-        movdqa	48(%ebp), %xmm3
-        aesenc	%xmm3, %xmm1
-        aesenc	%xmm3, %xmm5
-        movdqa	64(%ebp), %xmm3
-        aesenc	%xmm3, %xmm1
-        aesenc	%xmm3, %xmm5
-        movdqa	80(%ebp), %xmm3
-        aesenc	%xmm3, %xmm1
-        aesenc	%xmm3, %xmm5
-        movdqa	96(%ebp), %xmm3
-        aesenc	%xmm3, %xmm1
-        aesenc	%xmm3, %xmm5
-        movdqa	112(%ebp), %xmm3
-        aesenc	%xmm3, %xmm1
-        aesenc	%xmm3, %xmm5
-        movdqa	128(%ebp), %xmm3
-        aesenc	%xmm3, %xmm1
-        aesenc	%xmm3, %xmm5
-        movdqa	144(%ebp), %xmm3
-        aesenc	%xmm3, %xmm1
-        aesenc	%xmm3, %xmm5
-        cmpl	$11, 236(%esp)
-        movdqa	160(%ebp), %xmm3
-        jl	L_AES_GCM_decrypt_calc_iv_12_last
-        aesenc	%xmm3, %xmm1
-        aesenc	%xmm3, %xmm5
-        movdqa	176(%ebp), %xmm3
-        aesenc	%xmm3, %xmm1
-        aesenc	%xmm3, %xmm5
-        cmpl	$13, 236(%esp)
-        movdqa	192(%ebp), %xmm3
-        jl	L_AES_GCM_decrypt_calc_iv_12_last
-        aesenc	%xmm3, %xmm1
-        aesenc	%xmm3, %xmm5
-        movdqa	208(%ebp), %xmm3
-        aesenc	%xmm3, %xmm1
-        aesenc	%xmm3, %xmm5
-        movdqa	224(%ebp), %xmm3
-L_AES_GCM_decrypt_calc_iv_12_last:
-        aesenclast	%xmm3, %xmm1
-        aesenclast	%xmm3, %xmm5
-        pshufb	L_aes_gcm_bswap_mask, %xmm1
-        movdqu	%xmm5, 80(%esp)
-        jmp	L_AES_GCM_decrypt_iv_done
-L_AES_GCM_decrypt_iv_not_12:
-        # Calculate values when IV is not 12 bytes
-        # H = Encrypt X(=0)
-        movdqa	(%ebp), %xmm1
-        aesenc	16(%ebp), %xmm1
-        aesenc	32(%ebp), %xmm1
-        aesenc	48(%ebp), %xmm1
-        aesenc	64(%ebp), %xmm1
-        aesenc	80(%ebp), %xmm1
-        aesenc	96(%ebp), %xmm1
-        aesenc	112(%ebp), %xmm1
-        aesenc	128(%ebp), %xmm1
-        aesenc	144(%ebp), %xmm1
-        cmpl	$11, 236(%esp)
-        movdqa	160(%ebp), %xmm5
-        jl	L_AES_GCM_decrypt_calc_iv_1_aesenc_avx_last
-        aesenc	%xmm5, %xmm1
-        aesenc	176(%ebp), %xmm1
-        cmpl	$13, 236(%esp)
-        movdqa	192(%ebp), %xmm5
-        jl	L_AES_GCM_decrypt_calc_iv_1_aesenc_avx_last
-        aesenc	%xmm5, %xmm1
-        aesenc	208(%ebp), %xmm1
-        movdqa	224(%ebp), %xmm5
-L_AES_GCM_decrypt_calc_iv_1_aesenc_avx_last:
-        aesenclast	%xmm5, %xmm1
-        pshufb	L_aes_gcm_bswap_mask, %xmm1
-        # Calc counter
-        # Initialization vector
-        cmpl	$0x00, %edx
-        movl	$0x00, %ecx
-        je	L_AES_GCM_decrypt_calc_iv_done
-        cmpl	$16, %edx
-        jl	L_AES_GCM_decrypt_calc_iv_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_decrypt_calc_iv_16_loop:
-        movdqu	(%esi,%ecx,1), %xmm4
-        pshufb	L_aes_gcm_bswap_mask, %xmm4
-        pxor	%xmm4, %xmm0
-        pshufd	$0x4e, %xmm0, %xmm5
-        pshufd	$0x4e, %xmm1, %xmm6
-        movdqa	%xmm1, %xmm7
-        movdqa	%xmm1, %xmm4
-        pclmulqdq	$0x11, %xmm0, %xmm7
-        pclmulqdq	$0x00, %xmm0, %xmm4
-        pxor	%xmm0, %xmm5
-        pxor	%xmm1, %xmm6
-        pclmulqdq	$0x00, %xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm6
-        movdqa	%xmm4, %xmm3
-        movdqa	%xmm7, %xmm0
-        pslldq	$8, %xmm6
-        psrldq	$8, %xmm5
-        pxor	%xmm6, %xmm3
-        pxor	%xmm5, %xmm0
-        movdqa	%xmm3, %xmm4
-        movdqa	%xmm0, %xmm5
-        psrld	$31, %xmm4
-        psrld	$31, %xmm5
-        pslld	$0x01, %xmm3
-        pslld	$0x01, %xmm0
-        movdqa	%xmm4, %xmm6
-        pslldq	$4, %xmm4
-        psrldq	$12, %xmm6
-        pslldq	$4, %xmm5
-        por	%xmm6, %xmm0
-        por	%xmm4, %xmm3
-        por	%xmm5, %xmm0
-        movdqa	%xmm3, %xmm4
-        movdqa	%xmm3, %xmm5
-        movdqa	%xmm3, %xmm6
-        pslld	$31, %xmm4
-        pslld	$30, %xmm5
-        pslld	$25, %xmm6
-        pxor	%xmm5, %xmm4
-        pxor	%xmm6, %xmm4
-        movdqa	%xmm4, %xmm5
-        psrldq	$4, %xmm5
-        pslldq	$12, %xmm4
-        pxor	%xmm4, %xmm3
-        movdqa	%xmm3, %xmm6
-        movdqa	%xmm3, %xmm7
-        movdqa	%xmm3, %xmm4
-        psrld	$0x01, %xmm6
-        psrld	$2, %xmm7
-        psrld	$7, %xmm4
-        pxor	%xmm7, %xmm6
-        pxor	%xmm4, %xmm6
-        pxor	%xmm5, %xmm6
-        pxor	%xmm3, %xmm6
-        pxor	%xmm6, %xmm0
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_decrypt_calc_iv_16_loop
-        movl	224(%esp), %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_decrypt_calc_iv_done
-L_AES_GCM_decrypt_calc_iv_lt16:
-        subl	$16, %esp
-        pxor	%xmm4, %xmm4
-        xorl	%ebx, %ebx
-        movdqu	%xmm4, (%esp)
-L_AES_GCM_decrypt_calc_iv_loop:
-        movzbl	(%esi,%ecx,1), %eax
-        movb	%al, (%esp,%ebx,1)
-        incl	%ecx
-        incl	%ebx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_decrypt_calc_iv_loop
-        movdqu	(%esp), %xmm4
-        addl	$16, %esp
-        pshufb	L_aes_gcm_bswap_mask, %xmm4
-        pxor	%xmm4, %xmm0
-        pshufd	$0x4e, %xmm0, %xmm5
-        pshufd	$0x4e, %xmm1, %xmm6
-        movdqa	%xmm1, %xmm7
-        movdqa	%xmm1, %xmm4
-        pclmulqdq	$0x11, %xmm0, %xmm7
-        pclmulqdq	$0x00, %xmm0, %xmm4
-        pxor	%xmm0, %xmm5
-        pxor	%xmm1, %xmm6
-        pclmulqdq	$0x00, %xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm6
-        movdqa	%xmm4, %xmm3
-        movdqa	%xmm7, %xmm0
-        pslldq	$8, %xmm6
-        psrldq	$8, %xmm5
-        pxor	%xmm6, %xmm3
-        pxor	%xmm5, %xmm0
-        movdqa	%xmm3, %xmm4
-        movdqa	%xmm0, %xmm5
-        psrld	$31, %xmm4
-        psrld	$31, %xmm5
-        pslld	$0x01, %xmm3
-        pslld	$0x01, %xmm0
-        movdqa	%xmm4, %xmm6
-        pslldq	$4, %xmm4
-        psrldq	$12, %xmm6
-        pslldq	$4, %xmm5
-        por	%xmm6, %xmm0
-        por	%xmm4, %xmm3
-        por	%xmm5, %xmm0
-        movdqa	%xmm3, %xmm4
-        movdqa	%xmm3, %xmm5
-        movdqa	%xmm3, %xmm6
-        pslld	$31, %xmm4
-        pslld	$30, %xmm5
-        pslld	$25, %xmm6
-        pxor	%xmm5, %xmm4
-        pxor	%xmm6, %xmm4
-        movdqa	%xmm4, %xmm5
-        psrldq	$4, %xmm5
-        pslldq	$12, %xmm4
-        pxor	%xmm4, %xmm3
-        movdqa	%xmm3, %xmm6
-        movdqa	%xmm3, %xmm7
-        movdqa	%xmm3, %xmm4
-        psrld	$0x01, %xmm6
-        psrld	$2, %xmm7
-        psrld	$7, %xmm4
-        pxor	%xmm7, %xmm6
-        pxor	%xmm4, %xmm6
-        pxor	%xmm5, %xmm6
-        pxor	%xmm3, %xmm6
-        pxor	%xmm6, %xmm0
-L_AES_GCM_decrypt_calc_iv_done:
-        # T = Encrypt counter
-        pxor	%xmm4, %xmm4
-        shll	$3, %edx
-        pinsrd	$0x00, %edx, %xmm4
-        pxor	%xmm4, %xmm0
-        pshufd	$0x4e, %xmm0, %xmm5
-        pshufd	$0x4e, %xmm1, %xmm6
-        movdqa	%xmm1, %xmm7
-        movdqa	%xmm1, %xmm4
-        pclmulqdq	$0x11, %xmm0, %xmm7
-        pclmulqdq	$0x00, %xmm0, %xmm4
-        pxor	%xmm0, %xmm5
-        pxor	%xmm1, %xmm6
-        pclmulqdq	$0x00, %xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm6
-        movdqa	%xmm4, %xmm3
-        movdqa	%xmm7, %xmm0
-        pslldq	$8, %xmm6
-        psrldq	$8, %xmm5
-        pxor	%xmm6, %xmm3
-        pxor	%xmm5, %xmm0
-        movdqa	%xmm3, %xmm4
-        movdqa	%xmm0, %xmm5
-        psrld	$31, %xmm4
-        psrld	$31, %xmm5
-        pslld	$0x01, %xmm3
-        pslld	$0x01, %xmm0
-        movdqa	%xmm4, %xmm6
-        pslldq	$4, %xmm4
-        psrldq	$12, %xmm6
-        pslldq	$4, %xmm5
-        por	%xmm6, %xmm0
-        por	%xmm4, %xmm3
-        por	%xmm5, %xmm0
-        movdqa	%xmm3, %xmm4
-        movdqa	%xmm3, %xmm5
-        movdqa	%xmm3, %xmm6
-        pslld	$31, %xmm4
-        pslld	$30, %xmm5
-        pslld	$25, %xmm6
-        pxor	%xmm5, %xmm4
-        pxor	%xmm6, %xmm4
-        movdqa	%xmm4, %xmm5
-        psrldq	$4, %xmm5
-        pslldq	$12, %xmm4
-        pxor	%xmm4, %xmm3
-        movdqa	%xmm3, %xmm6
-        movdqa	%xmm3, %xmm7
-        movdqa	%xmm3, %xmm4
-        psrld	$0x01, %xmm6
-        psrld	$2, %xmm7
-        psrld	$7, %xmm4
-        pxor	%xmm7, %xmm6
-        pxor	%xmm4, %xmm6
-        pxor	%xmm5, %xmm6
-        pxor	%xmm3, %xmm6
-        pxor	%xmm6, %xmm0
-        pshufb	L_aes_gcm_bswap_mask, %xmm0
-        #   Encrypt counter
-        movdqa	(%ebp), %xmm4
-        pxor	%xmm0, %xmm4
-        aesenc	16(%ebp), %xmm4
-        aesenc	32(%ebp), %xmm4
-        aesenc	48(%ebp), %xmm4
-        aesenc	64(%ebp), %xmm4
-        aesenc	80(%ebp), %xmm4
-        aesenc	96(%ebp), %xmm4
-        aesenc	112(%ebp), %xmm4
-        aesenc	128(%ebp), %xmm4
-        aesenc	144(%ebp), %xmm4
-        cmpl	$11, 236(%esp)
-        movdqa	160(%ebp), %xmm5
-        jl	L_AES_GCM_decrypt_calc_iv_2_aesenc_avx_last
-        aesenc	%xmm5, %xmm4
-        aesenc	176(%ebp), %xmm4
-        cmpl	$13, 236(%esp)
-        movdqa	192(%ebp), %xmm5
-        jl	L_AES_GCM_decrypt_calc_iv_2_aesenc_avx_last
-        aesenc	%xmm5, %xmm4
-        aesenc	208(%ebp), %xmm4
-        movdqa	224(%ebp), %xmm5
-L_AES_GCM_decrypt_calc_iv_2_aesenc_avx_last:
-        aesenclast	%xmm5, %xmm4
-        movdqu	%xmm4, 80(%esp)
-L_AES_GCM_decrypt_iv_done:
-        movl	204(%esp), %esi
-        # Additional authentication data
-        movl	220(%esp), %edx
-        cmpl	$0x00, %edx
-        je	L_AES_GCM_decrypt_calc_aad_done
-        xorl	%ecx, %ecx
-        cmpl	$16, %edx
-        jl	L_AES_GCM_decrypt_calc_aad_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_decrypt_calc_aad_16_loop:
-        movdqu	(%esi,%ecx,1), %xmm4
-        pshufb	L_aes_gcm_bswap_mask, %xmm4
-        pxor	%xmm4, %xmm2
-        pshufd	$0x4e, %xmm2, %xmm5
-        pshufd	$0x4e, %xmm1, %xmm6
-        movdqa	%xmm1, %xmm7
-        movdqa	%xmm1, %xmm4
-        pclmulqdq	$0x11, %xmm2, %xmm7
-        pclmulqdq	$0x00, %xmm2, %xmm4
-        pxor	%xmm2, %xmm5
-        pxor	%xmm1, %xmm6
-        pclmulqdq	$0x00, %xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm6
-        movdqa	%xmm4, %xmm3
-        movdqa	%xmm7, %xmm2
-        pslldq	$8, %xmm6
-        psrldq	$8, %xmm5
-        pxor	%xmm6, %xmm3
-        pxor	%xmm5, %xmm2
-        movdqa	%xmm3, %xmm4
-        movdqa	%xmm2, %xmm5
-        psrld	$31, %xmm4
-        psrld	$31, %xmm5
-        pslld	$0x01, %xmm3
-        pslld	$0x01, %xmm2
-        movdqa	%xmm4, %xmm6
-        pslldq	$4, %xmm4
-        psrldq	$12, %xmm6
-        pslldq	$4, %xmm5
-        por	%xmm6, %xmm2
-        por	%xmm4, %xmm3
-        por	%xmm5, %xmm2
-        movdqa	%xmm3, %xmm4
-        movdqa	%xmm3, %xmm5
-        movdqa	%xmm3, %xmm6
-        pslld	$31, %xmm4
-        pslld	$30, %xmm5
-        pslld	$25, %xmm6
-        pxor	%xmm5, %xmm4
-        pxor	%xmm6, %xmm4
-        movdqa	%xmm4, %xmm5
-        psrldq	$4, %xmm5
-        pslldq	$12, %xmm4
-        pxor	%xmm4, %xmm3
-        movdqa	%xmm3, %xmm6
-        movdqa	%xmm3, %xmm7
-        movdqa	%xmm3, %xmm4
-        psrld	$0x01, %xmm6
-        psrld	$2, %xmm7
-        psrld	$7, %xmm4
-        pxor	%xmm7, %xmm6
-        pxor	%xmm4, %xmm6
-        pxor	%xmm5, %xmm6
-        pxor	%xmm3, %xmm6
-        pxor	%xmm6, %xmm2
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_decrypt_calc_aad_16_loop
-        movl	220(%esp), %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_decrypt_calc_aad_done
-L_AES_GCM_decrypt_calc_aad_lt16:
-        subl	$16, %esp
-        pxor	%xmm4, %xmm4
-        xorl	%ebx, %ebx
-        movdqu	%xmm4, (%esp)
-L_AES_GCM_decrypt_calc_aad_loop:
-        movzbl	(%esi,%ecx,1), %eax
-        movb	%al, (%esp,%ebx,1)
-        incl	%ecx
-        incl	%ebx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_decrypt_calc_aad_loop
-        movdqu	(%esp), %xmm4
-        addl	$16, %esp
-        pshufb	L_aes_gcm_bswap_mask, %xmm4
-        pxor	%xmm4, %xmm2
-        pshufd	$0x4e, %xmm2, %xmm5
-        pshufd	$0x4e, %xmm1, %xmm6
-        movdqa	%xmm1, %xmm7
-        movdqa	%xmm1, %xmm4
-        pclmulqdq	$0x11, %xmm2, %xmm7
-        pclmulqdq	$0x00, %xmm2, %xmm4
-        pxor	%xmm2, %xmm5
-        pxor	%xmm1, %xmm6
-        pclmulqdq	$0x00, %xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm6
-        movdqa	%xmm4, %xmm3
-        movdqa	%xmm7, %xmm2
-        pslldq	$8, %xmm6
-        psrldq	$8, %xmm5
-        pxor	%xmm6, %xmm3
-        pxor	%xmm5, %xmm2
-        movdqa	%xmm3, %xmm4
-        movdqa	%xmm2, %xmm5
-        psrld	$31, %xmm4
-        psrld	$31, %xmm5
-        pslld	$0x01, %xmm3
-        pslld	$0x01, %xmm2
-        movdqa	%xmm4, %xmm6
-        pslldq	$4, %xmm4
-        psrldq	$12, %xmm6
-        pslldq	$4, %xmm5
-        por	%xmm6, %xmm2
-        por	%xmm4, %xmm3
-        por	%xmm5, %xmm2
-        movdqa	%xmm3, %xmm4
-        movdqa	%xmm3, %xmm5
-        movdqa	%xmm3, %xmm6
-        pslld	$31, %xmm4
-        pslld	$30, %xmm5
-        pslld	$25, %xmm6
-        pxor	%xmm5, %xmm4
-        pxor	%xmm6, %xmm4
-        movdqa	%xmm4, %xmm5
-        psrldq	$4, %xmm5
-        pslldq	$12, %xmm4
-        pxor	%xmm4, %xmm3
-        movdqa	%xmm3, %xmm6
-        movdqa	%xmm3, %xmm7
-        movdqa	%xmm3, %xmm4
-        psrld	$0x01, %xmm6
-        psrld	$2, %xmm7
-        psrld	$7, %xmm4
-        pxor	%xmm7, %xmm6
-        pxor	%xmm4, %xmm6
-        pxor	%xmm5, %xmm6
-        pxor	%xmm3, %xmm6
-        pxor	%xmm6, %xmm2
-L_AES_GCM_decrypt_calc_aad_done:
-        movdqu	%xmm2, 96(%esp)
-        movl	196(%esp), %esi
-        movl	200(%esp), %edi
-        # Calculate counter and H
-        pshufb	L_aes_gcm_bswap_epi64, %xmm0
-        movdqa	%xmm1, %xmm5
-        paddd	L_aes_gcm_one, %xmm0
-        movdqa	%xmm1, %xmm4
-        movdqu	%xmm0, 64(%esp)
-        psrlq	$63, %xmm5
-        psllq	$0x01, %xmm4
-        pslldq	$8, %xmm5
-        por	%xmm5, %xmm4
-        pshufd	$0xff, %xmm1, %xmm1
-        psrad	$31, %xmm1
-        pand	L_aes_gcm_mod2_128, %xmm1
-        pxor	%xmm4, %xmm1
-        xorl	%ebx, %ebx
-        cmpl	$0x40, 216(%esp)
-        movl	216(%esp), %eax
-        jl	L_AES_GCM_decrypt_done_64
-        andl	$0xffffffc0, %eax
-        movdqa	%xmm2, %xmm6
-        # H ^ 1
-        movdqu	%xmm1, (%esp)
-        # H ^ 2
-        pshufd	$0x4e, %xmm1, %xmm5
-        pshufd	$0x4e, %xmm1, %xmm6
-        movdqa	%xmm1, %xmm7
-        movdqa	%xmm1, %xmm4
-        pclmulqdq	$0x11, %xmm1, %xmm7
-        pclmulqdq	$0x00, %xmm1, %xmm4
-        pxor	%xmm1, %xmm5
-        pxor	%xmm1, %xmm6
-        pclmulqdq	$0x00, %xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm6
-        movdqa	%xmm7, %xmm0
-        pslldq	$8, %xmm6
-        psrldq	$8, %xmm5
-        pxor	%xmm6, %xmm4
-        pxor	%xmm5, %xmm0
-        movdqa	%xmm4, %xmm5
-        movdqa	%xmm4, %xmm6
-        movdqa	%xmm4, %xmm7
-        pslld	$31, %xmm5
-        pslld	$30, %xmm6
-        pslld	$25, %xmm7
-        pxor	%xmm6, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm7
-        psrldq	$4, %xmm7
-        pslldq	$12, %xmm5
-        pxor	%xmm5, %xmm4
-        movdqa	%xmm4, %xmm5
-        movdqa	%xmm4, %xmm6
-        psrld	$0x01, %xmm5
-        psrld	$2, %xmm6
-        pxor	%xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        psrld	$7, %xmm4
-        pxor	%xmm7, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm5, %xmm0
-        movdqu	%xmm0, 16(%esp)
-        # H ^ 3
-        pshufd	$0x4e, %xmm1, %xmm5
-        pshufd	$0x4e, %xmm0, %xmm6
-        movdqa	%xmm0, %xmm7
-        movdqa	%xmm0, %xmm4
-        pclmulqdq	$0x11, %xmm1, %xmm7
-        pclmulqdq	$0x00, %xmm1, %xmm4
-        pxor	%xmm1, %xmm5
-        pxor	%xmm0, %xmm6
-        pclmulqdq	$0x00, %xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm6
-        movdqa	%xmm7, %xmm3
-        pslldq	$8, %xmm6
-        psrldq	$8, %xmm5
-        pxor	%xmm6, %xmm4
-        pxor	%xmm5, %xmm3
-        movdqa	%xmm4, %xmm5
-        movdqa	%xmm4, %xmm6
-        movdqa	%xmm4, %xmm7
-        pslld	$31, %xmm5
-        pslld	$30, %xmm6
-        pslld	$25, %xmm7
-        pxor	%xmm6, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm7
-        psrldq	$4, %xmm7
-        pslldq	$12, %xmm5
-        pxor	%xmm5, %xmm4
-        movdqa	%xmm4, %xmm5
-        movdqa	%xmm4, %xmm6
-        psrld	$0x01, %xmm5
-        psrld	$2, %xmm6
-        pxor	%xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        psrld	$7, %xmm4
-        pxor	%xmm7, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm5, %xmm3
-        movdqu	%xmm3, 32(%esp)
-        # H ^ 4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pshufd	$0x4e, %xmm0, %xmm6
-        movdqa	%xmm0, %xmm7
-        movdqa	%xmm0, %xmm4
-        pclmulqdq	$0x11, %xmm0, %xmm7
-        pclmulqdq	$0x00, %xmm0, %xmm4
-        pxor	%xmm0, %xmm5
-        pxor	%xmm0, %xmm6
-        pclmulqdq	$0x00, %xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm6
-        movdqa	%xmm7, %xmm3
-        pslldq	$8, %xmm6
-        psrldq	$8, %xmm5
-        pxor	%xmm6, %xmm4
-        pxor	%xmm5, %xmm3
-        movdqa	%xmm4, %xmm5
-        movdqa	%xmm4, %xmm6
-        movdqa	%xmm4, %xmm7
-        pslld	$31, %xmm5
-        pslld	$30, %xmm6
-        pslld	$25, %xmm7
-        pxor	%xmm6, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm7
-        psrldq	$4, %xmm7
-        pslldq	$12, %xmm5
-        pxor	%xmm5, %xmm4
-        movdqa	%xmm4, %xmm5
-        movdqa	%xmm4, %xmm6
-        psrld	$0x01, %xmm5
-        psrld	$2, %xmm6
-        pxor	%xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        psrld	$7, %xmm4
-        pxor	%xmm7, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm5, %xmm3
-        movdqu	%xmm3, 48(%esp)
-        cmpl	%esi, %edi
-        jne	L_AES_GCM_decrypt_ghash_64
-L_AES_GCM_decrypt_ghash_64_inplace:
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        # Encrypt 64 bytes of counter
-        movdqu	64(%esp), %xmm4
-        movdqa	L_aes_gcm_bswap_epi64, %xmm3
-        movdqa	%xmm4, %xmm5
-        movdqa	%xmm4, %xmm6
-        movdqa	%xmm4, %xmm7
-        pshufb	%xmm3, %xmm4
-        paddd	L_aes_gcm_one, %xmm5
-        pshufb	%xmm3, %xmm5
-        paddd	L_aes_gcm_two, %xmm6
-        pshufb	%xmm3, %xmm6
-        paddd	L_aes_gcm_three, %xmm7
-        pshufb	%xmm3, %xmm7
-        movdqu	64(%esp), %xmm3
-        paddd	L_aes_gcm_four, %xmm3
-        movdqu	%xmm3, 64(%esp)
-        movdqa	(%ebp), %xmm3
-        pxor	%xmm3, %xmm4
-        pxor	%xmm3, %xmm5
-        pxor	%xmm3, %xmm6
-        pxor	%xmm3, %xmm7
-        movdqa	16(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	32(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	48(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	64(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	80(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	96(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	112(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	128(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	144(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        cmpl	$11, 236(%esp)
-        movdqa	160(%ebp), %xmm3
-        jl	L_AES_GCM_decryptinplace_aesenc_64_ghash_avx_done
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	176(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        cmpl	$13, 236(%esp)
-        movdqa	192(%ebp), %xmm3
-        jl	L_AES_GCM_decryptinplace_aesenc_64_ghash_avx_done
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	208(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	224(%ebp), %xmm3
-L_AES_GCM_decryptinplace_aesenc_64_ghash_avx_done:
-        aesenclast	%xmm3, %xmm4
-        aesenclast	%xmm3, %xmm5
-        movdqu	(%ecx), %xmm0
-        movdqu	16(%ecx), %xmm1
-        pxor	%xmm0, %xmm4
-        pxor	%xmm1, %xmm5
-        movdqu	%xmm0, 112(%esp)
-        movdqu	%xmm1, 128(%esp)
-        movdqu	%xmm4, (%edx)
-        movdqu	%xmm5, 16(%edx)
-        aesenclast	%xmm3, %xmm6
-        aesenclast	%xmm3, %xmm7
-        movdqu	32(%ecx), %xmm0
-        movdqu	48(%ecx), %xmm1
-        pxor	%xmm0, %xmm6
-        pxor	%xmm1, %xmm7
-        movdqu	%xmm0, 144(%esp)
-        movdqu	%xmm1, 160(%esp)
-        movdqu	%xmm6, 32(%edx)
-        movdqu	%xmm7, 48(%edx)
-        # ghash encrypted counter
-        movdqu	96(%esp), %xmm6
-        movdqu	48(%esp), %xmm3
-        movdqu	112(%esp), %xmm4
-        pshufb	L_aes_gcm_bswap_mask, %xmm4
-        pxor	%xmm6, %xmm4
-        pshufd	$0x4e, %xmm3, %xmm5
-        pshufd	$0x4e, %xmm4, %xmm1
-        pxor	%xmm3, %xmm5
-        pxor	%xmm4, %xmm1
-        movdqa	%xmm4, %xmm7
-        pclmulqdq	$0x11, %xmm3, %xmm7
-        movdqa	%xmm4, %xmm6
-        pclmulqdq	$0x00, %xmm3, %xmm6
-        pclmulqdq	$0x00, %xmm1, %xmm5
-        pxor	%xmm6, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqu	32(%esp), %xmm3
-        movdqu	128(%esp), %xmm4
-        pshufd	$0x4e, %xmm3, %xmm0
-        pshufb	L_aes_gcm_bswap_mask, %xmm4
-        pxor	%xmm3, %xmm0
-        pshufd	$0x4e, %xmm4, %xmm1
-        pxor	%xmm4, %xmm1
-        movdqa	%xmm4, %xmm2
-        pclmulqdq	$0x11, %xmm3, %xmm2
-        pclmulqdq	$0x00, %xmm4, %xmm3
-        pclmulqdq	$0x00, %xmm1, %xmm0
-        pxor	%xmm3, %xmm5
-        pxor	%xmm3, %xmm6
-        pxor	%xmm2, %xmm5
-        pxor	%xmm2, %xmm7
-        pxor	%xmm0, %xmm5
-        movdqu	16(%esp), %xmm3
-        movdqu	144(%esp), %xmm4
-        pshufd	$0x4e, %xmm3, %xmm0
-        pshufb	L_aes_gcm_bswap_mask, %xmm4
-        pxor	%xmm3, %xmm0
-        pshufd	$0x4e, %xmm4, %xmm1
-        pxor	%xmm4, %xmm1
-        movdqa	%xmm4, %xmm2
-        pclmulqdq	$0x11, %xmm3, %xmm2
-        pclmulqdq	$0x00, %xmm4, %xmm3
-        pclmulqdq	$0x00, %xmm1, %xmm0
-        pxor	%xmm3, %xmm5
-        pxor	%xmm3, %xmm6
-        pxor	%xmm2, %xmm5
-        pxor	%xmm2, %xmm7
-        pxor	%xmm0, %xmm5
-        movdqu	(%esp), %xmm3
-        movdqu	160(%esp), %xmm4
-        pshufd	$0x4e, %xmm3, %xmm0
-        pshufb	L_aes_gcm_bswap_mask, %xmm4
-        pxor	%xmm3, %xmm0
-        pshufd	$0x4e, %xmm4, %xmm1
-        pxor	%xmm4, %xmm1
-        movdqa	%xmm4, %xmm2
-        pclmulqdq	$0x11, %xmm3, %xmm2
-        pclmulqdq	$0x00, %xmm4, %xmm3
-        pclmulqdq	$0x00, %xmm1, %xmm0
-        pxor	%xmm3, %xmm5
-        pxor	%xmm3, %xmm6
-        pxor	%xmm2, %xmm5
-        pxor	%xmm2, %xmm7
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm5, %xmm1
-        psrldq	$8, %xmm5
-        pslldq	$8, %xmm1
-        pxor	%xmm1, %xmm6
-        pxor	%xmm5, %xmm7
-        movdqa	%xmm6, %xmm3
-        movdqa	%xmm6, %xmm0
-        movdqa	%xmm6, %xmm1
-        pslld	$31, %xmm3
-        pslld	$30, %xmm0
-        pslld	$25, %xmm1
-        pxor	%xmm0, %xmm3
-        pxor	%xmm1, %xmm3
-        movdqa	%xmm3, %xmm0
-        pslldq	$12, %xmm3
-        psrldq	$4, %xmm0
-        pxor	%xmm3, %xmm6
-        movdqa	%xmm6, %xmm1
-        movdqa	%xmm6, %xmm5
-        movdqa	%xmm6, %xmm4
-        psrld	$0x01, %xmm1
-        psrld	$2, %xmm5
-        psrld	$7, %xmm4
-        pxor	%xmm5, %xmm1
-        pxor	%xmm4, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm1, %xmm6
-        pxor	%xmm7, %xmm6
-        movdqu	%xmm6, 96(%esp)
-        addl	$0x40, %ebx
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_decrypt_ghash_64_inplace
-        jmp	L_AES_GCM_decrypt_ghash_64_done
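The in-place loop that ends here folds four ciphertext blocks into the GHASH state per iteration against the precomputed powers H^1..H^4 kept at (%esp) through 48(%esp), so only one modular reduction is paid per 64 bytes; because this path decrypts in place, the ciphertext is first saved to 112(%esp)..160(%esp) and hashed back from the stack after the output overwrites it. In LaTeX notation, with X the running state and C_1..C_4 the four byte-reversed ciphertext blocks, each iteration computes (a sketch of the standard aggregation identity, not wolfSSL's own notation):

    X' = \bigl((X \oplus C_1)\,H^4 \oplus C_2 H^3 \oplus C_3 H^2 \oplus C_4 H\bigr) \bmod \bigl(x^{128} + x^7 + x^2 + x + 1\bigr)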
-L_AES_GCM_decrypt_ghash_64:
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        # Encrypt 64 bytes of counter
-        movdqu	64(%esp), %xmm4
-        movdqa	L_aes_gcm_bswap_epi64, %xmm3
-        movdqa	%xmm4, %xmm5
-        movdqa	%xmm4, %xmm6
-        movdqa	%xmm4, %xmm7
-        pshufb	%xmm3, %xmm4
-        paddd	L_aes_gcm_one, %xmm5
-        pshufb	%xmm3, %xmm5
-        paddd	L_aes_gcm_two, %xmm6
-        pshufb	%xmm3, %xmm6
-        paddd	L_aes_gcm_three, %xmm7
-        pshufb	%xmm3, %xmm7
-        movdqu	64(%esp), %xmm3
-        paddd	L_aes_gcm_four, %xmm3
-        movdqu	%xmm3, 64(%esp)
-        movdqa	(%ebp), %xmm3
-        pxor	%xmm3, %xmm4
-        pxor	%xmm3, %xmm5
-        pxor	%xmm3, %xmm6
-        pxor	%xmm3, %xmm7
-        movdqa	16(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	32(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	48(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	64(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	80(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	96(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	112(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	128(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	144(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        cmpl	$11, 236(%esp)
-        movdqa	160(%ebp), %xmm3
-        jl	L_AES_GCM_decrypt_aesenc_64_ghash_avx_done
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	176(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        cmpl	$13, 236(%esp)
-        movdqa	192(%ebp), %xmm3
-        jl	L_AES_GCM_decrypt_aesenc_64_ghash_avx_done
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	208(%ebp), %xmm3
-        aesenc	%xmm3, %xmm4
-        aesenc	%xmm3, %xmm5
-        aesenc	%xmm3, %xmm6
-        aesenc	%xmm3, %xmm7
-        movdqa	224(%ebp), %xmm3
-L_AES_GCM_decrypt_aesenc_64_ghash_avx_done:
-        aesenclast	%xmm3, %xmm4
-        aesenclast	%xmm3, %xmm5
-        movdqu	(%ecx), %xmm0
-        movdqu	16(%ecx), %xmm1
-        pxor	%xmm0, %xmm4
-        pxor	%xmm1, %xmm5
-        movdqu	%xmm0, (%ecx)
-        movdqu	%xmm1, 16(%ecx)
-        movdqu	%xmm4, (%edx)
-        movdqu	%xmm5, 16(%edx)
-        aesenclast	%xmm3, %xmm6
-        aesenclast	%xmm3, %xmm7
-        movdqu	32(%ecx), %xmm0
-        movdqu	48(%ecx), %xmm1
-        pxor	%xmm0, %xmm6
-        pxor	%xmm1, %xmm7
-        movdqu	%xmm0, 32(%ecx)
-        movdqu	%xmm1, 48(%ecx)
-        movdqu	%xmm6, 32(%edx)
-        movdqu	%xmm7, 48(%edx)
-        # ghash encrypted counter
-        movdqu	96(%esp), %xmm6
-        movdqu	48(%esp), %xmm3
-        movdqu	(%ecx), %xmm4
-        pshufb	L_aes_gcm_bswap_mask, %xmm4
-        pxor	%xmm6, %xmm4
-        pshufd	$0x4e, %xmm3, %xmm5
-        pshufd	$0x4e, %xmm4, %xmm1
-        pxor	%xmm3, %xmm5
-        pxor	%xmm4, %xmm1
-        movdqa	%xmm4, %xmm7
-        pclmulqdq	$0x11, %xmm3, %xmm7
-        movdqa	%xmm4, %xmm6
-        pclmulqdq	$0x00, %xmm3, %xmm6
-        pclmulqdq	$0x00, %xmm1, %xmm5
-        pxor	%xmm6, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqu	32(%esp), %xmm3
-        movdqu	16(%ecx), %xmm4
-        pshufd	$0x4e, %xmm3, %xmm0
-        pshufb	L_aes_gcm_bswap_mask, %xmm4
-        pxor	%xmm3, %xmm0
-        pshufd	$0x4e, %xmm4, %xmm1
-        pxor	%xmm4, %xmm1
-        movdqa	%xmm4, %xmm2
-        pclmulqdq	$0x11, %xmm3, %xmm2
-        pclmulqdq	$0x00, %xmm4, %xmm3
-        pclmulqdq	$0x00, %xmm1, %xmm0
-        pxor	%xmm3, %xmm5
-        pxor	%xmm3, %xmm6
-        pxor	%xmm2, %xmm5
-        pxor	%xmm2, %xmm7
-        pxor	%xmm0, %xmm5
-        movdqu	16(%esp), %xmm3
-        movdqu	32(%ecx), %xmm4
-        pshufd	$0x4e, %xmm3, %xmm0
-        pshufb	L_aes_gcm_bswap_mask, %xmm4
-        pxor	%xmm3, %xmm0
-        pshufd	$0x4e, %xmm4, %xmm1
-        pxor	%xmm4, %xmm1
-        movdqa	%xmm4, %xmm2
-        pclmulqdq	$0x11, %xmm3, %xmm2
-        pclmulqdq	$0x00, %xmm4, %xmm3
-        pclmulqdq	$0x00, %xmm1, %xmm0
-        pxor	%xmm3, %xmm5
-        pxor	%xmm3, %xmm6
-        pxor	%xmm2, %xmm5
-        pxor	%xmm2, %xmm7
-        pxor	%xmm0, %xmm5
-        movdqu	(%esp), %xmm3
-        movdqu	48(%ecx), %xmm4
-        pshufd	$0x4e, %xmm3, %xmm0
-        pshufb	L_aes_gcm_bswap_mask, %xmm4
-        pxor	%xmm3, %xmm0
-        pshufd	$0x4e, %xmm4, %xmm1
-        pxor	%xmm4, %xmm1
-        movdqa	%xmm4, %xmm2
-        pclmulqdq	$0x11, %xmm3, %xmm2
-        pclmulqdq	$0x00, %xmm4, %xmm3
-        pclmulqdq	$0x00, %xmm1, %xmm0
-        pxor	%xmm3, %xmm5
-        pxor	%xmm3, %xmm6
-        pxor	%xmm2, %xmm5
-        pxor	%xmm2, %xmm7
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm5, %xmm1
-        psrldq	$8, %xmm5
-        pslldq	$8, %xmm1
-        pxor	%xmm1, %xmm6
-        pxor	%xmm5, %xmm7
-        movdqa	%xmm6, %xmm3
-        movdqa	%xmm6, %xmm0
-        movdqa	%xmm6, %xmm1
-        pslld	$31, %xmm3
-        pslld	$30, %xmm0
-        pslld	$25, %xmm1
-        pxor	%xmm0, %xmm3
-        pxor	%xmm1, %xmm3
-        movdqa	%xmm3, %xmm0
-        pslldq	$12, %xmm3
-        psrldq	$4, %xmm0
-        pxor	%xmm3, %xmm6
-        movdqa	%xmm6, %xmm1
-        movdqa	%xmm6, %xmm5
-        movdqa	%xmm6, %xmm4
-        psrld	$0x01, %xmm1
-        psrld	$2, %xmm5
-        psrld	$7, %xmm4
-        pxor	%xmm5, %xmm1
-        pxor	%xmm4, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm1, %xmm6
-        pxor	%xmm7, %xmm6
-        movdqu	%xmm6, 96(%esp)
-        addl	$0x40, %ebx
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_decrypt_ghash_64
-L_AES_GCM_decrypt_ghash_64_done:
-        movdqa	%xmm6, %xmm2
-        movdqu	(%esp), %xmm1
-L_AES_GCM_decrypt_done_64:
-        movl	216(%esp), %edx
-        cmpl	%edx, %ebx
-        jge	L_AES_GCM_decrypt_done_dec
-        movl	216(%esp), %eax
-        andl	$0xfffffff0, %eax
-        cmpl	%eax, %ebx
-        jge	L_AES_GCM_decrypt_last_block_done
-L_AES_GCM_decrypt_last_block_start:
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        movdqu	(%ecx), %xmm5
-        pshufb	L_aes_gcm_bswap_mask, %xmm5
-        pxor	%xmm2, %xmm5
-        movdqu	%xmm5, (%esp)
-        movdqu	64(%esp), %xmm4
-        movdqa	%xmm4, %xmm5
-        pshufb	L_aes_gcm_bswap_epi64, %xmm4
-        paddd	L_aes_gcm_one, %xmm5
-        pxor	(%ebp), %xmm4
-        movdqu	%xmm5, 64(%esp)
-        movdqu	(%esp), %xmm0
-        pclmulqdq	$16, %xmm1, %xmm0
-        aesenc	16(%ebp), %xmm4
-        aesenc	32(%ebp), %xmm4
-        movdqu	(%esp), %xmm3
-        pclmulqdq	$0x01, %xmm1, %xmm3
-        aesenc	48(%ebp), %xmm4
-        aesenc	64(%ebp), %xmm4
-        aesenc	80(%ebp), %xmm4
-        movdqu	(%esp), %xmm5
-        pclmulqdq	$0x11, %xmm1, %xmm5
-        aesenc	96(%ebp), %xmm4
-        pxor	%xmm3, %xmm0
-        movdqa	%xmm0, %xmm6
-        psrldq	$8, %xmm0
-        pslldq	$8, %xmm6
-        aesenc	112(%ebp), %xmm4
-        movdqu	(%esp), %xmm3
-        pclmulqdq	$0x00, %xmm1, %xmm3
-        pxor	%xmm3, %xmm6
-        pxor	%xmm0, %xmm5
-        movdqa	L_aes_gcm_mod2_128, %xmm7
-        movdqa	%xmm6, %xmm3
-        pclmulqdq	$16, %xmm7, %xmm3
-        aesenc	128(%ebp), %xmm4
-        pshufd	$0x4e, %xmm6, %xmm0
-        pxor	%xmm3, %xmm0
-        movdqa	%xmm0, %xmm3
-        pclmulqdq	$16, %xmm7, %xmm3
-        aesenc	144(%ebp), %xmm4
-        pshufd	$0x4e, %xmm0, %xmm2
-        pxor	%xmm3, %xmm2
-        pxor	%xmm5, %xmm2
-        cmpl	$11, 236(%esp)
-        movdqa	160(%ebp), %xmm5
-        jl	L_AES_GCM_decrypt_aesenc_gfmul_last
-        aesenc	%xmm5, %xmm4
-        aesenc	176(%ebp), %xmm4
-        cmpl	$13, 236(%esp)
-        movdqa	192(%ebp), %xmm5
-        jl	L_AES_GCM_decrypt_aesenc_gfmul_last
-        aesenc	%xmm5, %xmm4
-        aesenc	208(%ebp), %xmm4
-        movdqa	224(%ebp), %xmm5
-L_AES_GCM_decrypt_aesenc_gfmul_last:
-        aesenclast	%xmm5, %xmm4
-        movdqu	(%ecx), %xmm5
-        pxor	%xmm5, %xmm4
-        movdqu	%xmm4, (%edx)
-        addl	$16, %ebx
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_decrypt_last_block_start
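The 16-byte tail loop above makes the decrypt-side ordering visible: each ciphertext block is folded into GHASH before the keystream is applied, because GCM authenticates ciphertext. A minimal C sketch of one iteration; ghash_update() is the portable reference given in full after AES_GCM_aad_update_aesni below, and aes_ctr_block() stands in for the interleaved aesenc chain (both are illustrative names, not wolfSSL APIs):

    #include <stdint.h>

    /* Assumed helpers: ghash_update() is sketched later in this file;
     * aes_ctr_block() must write E_K(ctr) to ks and then apply inc32()
     * to ctr. */
    void ghash_update(uint8_t X[16], const uint8_t H[16],
                      const uint8_t blk[16]);
    void aes_ctr_block(uint8_t ks[16], uint8_t ctr[16]);

    /* One 16-byte block of GCM decryption: hash the ciphertext first,
     * then XOR with the keystream -- the same order as the loop above,
     * where the pclmulqdq work on (%ecx) completes before the
     * aesenclast/pxor pair produces plaintext. */
    static void gcm_decrypt_block(uint8_t* p, const uint8_t* c,
                                  uint8_t X[16], const uint8_t H[16],
                                  uint8_t ctr[16])
    {
        uint8_t ks[16];
        int i;
        ghash_update(X, H, c);   /* authenticate ciphertext first */
        aes_ctr_block(ks, ctr);  /* keystream for this block      */
        for (i = 0; i < 16; i++)
            p[i] = c[i] ^ ks[i]; /* P = C ^ E_K(ctr)              */
    }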
-L_AES_GCM_decrypt_last_block_done:
-        movl	216(%esp), %ecx
-        movl	%ecx, %edx
-        andl	$15, %ecx
-        jz	L_AES_GCM_decrypt_aesenc_last15_dec_avx_done
-        movdqu	64(%esp), %xmm0
-        pshufb	L_aes_gcm_bswap_epi64, %xmm0
-        pxor	(%ebp), %xmm0
-        aesenc	16(%ebp), %xmm0
-        aesenc	32(%ebp), %xmm0
-        aesenc	48(%ebp), %xmm0
-        aesenc	64(%ebp), %xmm0
-        aesenc	80(%ebp), %xmm0
-        aesenc	96(%ebp), %xmm0
-        aesenc	112(%ebp), %xmm0
-        aesenc	128(%ebp), %xmm0
-        aesenc	144(%ebp), %xmm0
-        cmpl	$11, 236(%esp)
-        movdqa	160(%ebp), %xmm5
-        jl	L_AES_GCM_decrypt_aesenc_last15_dec_avx_aesenc_avx_last
-        aesenc	%xmm5, %xmm0
-        aesenc	176(%ebp), %xmm0
-        cmpl	$13, 236(%esp)
-        movdqa	192(%ebp), %xmm5
-        jl	L_AES_GCM_decrypt_aesenc_last15_dec_avx_aesenc_avx_last
-        aesenc	%xmm5, %xmm0
-        aesenc	208(%ebp), %xmm0
-        movdqa	224(%ebp), %xmm5
-L_AES_GCM_decrypt_aesenc_last15_dec_avx_aesenc_avx_last:
-        aesenclast	%xmm5, %xmm0
-        subl	$32, %esp
-        xorl	%ecx, %ecx
-        movdqu	%xmm0, (%esp)
-        pxor	%xmm4, %xmm4
-        movdqu	%xmm4, 16(%esp)
-L_AES_GCM_decrypt_aesenc_last15_dec_avx_loop:
-        movzbl	(%esi,%ebx,1), %eax
-        movb	%al, 16(%esp,%ecx,1)
-        xorb	(%esp,%ecx,1), %al
-        movb	%al, (%edi,%ebx,1)
-        incl	%ebx
-        incl	%ecx
-        cmpl	%edx, %ebx
-        jl	L_AES_GCM_decrypt_aesenc_last15_dec_avx_loop
-        movdqu	16(%esp), %xmm0
-        addl	$32, %esp
-        pshufb	L_aes_gcm_bswap_mask, %xmm0
-        pxor	%xmm0, %xmm2
-        pshufd	$0x4e, %xmm1, %xmm5
-        pshufd	$0x4e, %xmm2, %xmm6
-        movdqa	%xmm2, %xmm7
-        movdqa	%xmm2, %xmm4
-        pclmulqdq	$0x11, %xmm1, %xmm7
-        pclmulqdq	$0x00, %xmm1, %xmm4
-        pxor	%xmm1, %xmm5
-        pxor	%xmm2, %xmm6
-        pclmulqdq	$0x00, %xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm6
-        movdqa	%xmm7, %xmm2
-        pslldq	$8, %xmm6
-        psrldq	$8, %xmm5
-        pxor	%xmm6, %xmm4
-        pxor	%xmm5, %xmm2
-        movdqa	%xmm4, %xmm5
-        movdqa	%xmm4, %xmm6
-        movdqa	%xmm4, %xmm7
-        pslld	$31, %xmm5
-        pslld	$30, %xmm6
-        pslld	$25, %xmm7
-        pxor	%xmm6, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm7
-        psrldq	$4, %xmm7
-        pslldq	$12, %xmm5
-        pxor	%xmm5, %xmm4
-        movdqa	%xmm4, %xmm5
-        movdqa	%xmm4, %xmm6
-        psrld	$0x01, %xmm5
-        psrld	$2, %xmm6
-        pxor	%xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        psrld	$7, %xmm4
-        pxor	%xmm7, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm5, %xmm2
-L_AES_GCM_decrypt_aesenc_last15_dec_avx_done:
-L_AES_GCM_decrypt_done_dec:
-        movl	212(%esp), %esi
-        movl	228(%esp), %ebp
-        movl	216(%esp), %edx
-        movl	220(%esp), %ecx
-        shll	$3, %edx
-        shll	$3, %ecx
-        pinsrd	$0x00, %edx, %xmm4
-        pinsrd	$2, %ecx, %xmm4
-        movl	216(%esp), %edx
-        movl	220(%esp), %ecx
-        shrl	$29, %edx
-        shrl	$29, %ecx
-        pinsrd	$0x01, %edx, %xmm4
-        pinsrd	$3, %ecx, %xmm4
-        pxor	%xmm4, %xmm2
-        pshufd	$0x4e, %xmm1, %xmm5
-        pshufd	$0x4e, %xmm2, %xmm6
-        movdqa	%xmm2, %xmm7
-        movdqa	%xmm2, %xmm4
-        pclmulqdq	$0x11, %xmm1, %xmm7
-        pclmulqdq	$0x00, %xmm1, %xmm4
-        pxor	%xmm1, %xmm5
-        pxor	%xmm2, %xmm6
-        pclmulqdq	$0x00, %xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm6
-        movdqa	%xmm7, %xmm2
-        pslldq	$8, %xmm6
-        psrldq	$8, %xmm5
-        pxor	%xmm6, %xmm4
-        pxor	%xmm5, %xmm2
-        movdqa	%xmm4, %xmm5
-        movdqa	%xmm4, %xmm6
-        movdqa	%xmm4, %xmm7
-        pslld	$31, %xmm5
-        pslld	$30, %xmm6
-        pslld	$25, %xmm7
-        pxor	%xmm6, %xmm5
-        pxor	%xmm7, %xmm5
-        movdqa	%xmm5, %xmm7
-        psrldq	$4, %xmm7
-        pslldq	$12, %xmm5
-        pxor	%xmm5, %xmm4
-        movdqa	%xmm4, %xmm5
-        movdqa	%xmm4, %xmm6
-        psrld	$0x01, %xmm5
-        psrld	$2, %xmm6
-        pxor	%xmm6, %xmm5
-        pxor	%xmm4, %xmm5
-        psrld	$7, %xmm4
-        pxor	%xmm7, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm5, %xmm2
-        pshufb	L_aes_gcm_bswap_mask, %xmm2
-        movdqu	80(%esp), %xmm4
-        pxor	%xmm2, %xmm4
-        movl	240(%esp), %edi
-        cmpl	$16, %ebp
-        je	L_AES_GCM_decrypt_cmp_tag_16
-        subl	$16, %esp
-        xorl	%ecx, %ecx
-        xorl	%ebx, %ebx
-        movdqu	%xmm4, (%esp)
-L_AES_GCM_decrypt_cmp_tag_loop:
-        movzbl	(%esp,%ecx,1), %eax
-        xorb	(%esi,%ecx,1), %al
-        orb	%al, %bl
-        incl	%ecx
-        cmpl	%ebp, %ecx
-        jne	L_AES_GCM_decrypt_cmp_tag_loop
-        cmpb	$0x00, %bl
-        sete	%bl
-        addl	$16, %esp
-        xorl	%ecx, %ecx
-        jmp	L_AES_GCM_decrypt_cmp_tag_done
-L_AES_GCM_decrypt_cmp_tag_16:
-        movdqu	(%esi), %xmm5
-        pcmpeqb	%xmm5, %xmm4
-        pmovmskb	%xmm4, %edx
-        # if %edx == 0xFFFF then return 1 else return 0
-        xorl	%ebx, %ebx
-        cmpl	$0xffff, %edx
-        sete	%bl
-L_AES_GCM_decrypt_cmp_tag_done:
-        movl	%ebx, (%edi)
-        addl	$0xb0, %esp
-        popl	%ebp
-        popl	%edi
-        popl	%esi
-        popl	%ebx
-        ret
-.size	AES_GCM_decrypt,.-AES_GCM_decrypt
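The tag check that ends AES_GCM_decrypt is constant-time in both paths: the byte loop OR-accumulates every difference into %bl with no early exit, and the 16-byte path compares all lanes at once with pcmpeqb/pmovmskb and tests the mask against 0xFFFF. A portable C sketch of the same idea (gcm_tag_equal is an illustrative name, not a wolfSSL API):

    #include <stddef.h>
    #include <stdint.h>

    /* Constant-time tag comparison: OR-accumulate all byte differences,
     * decide only at the end. Returns 1 on match, 0 otherwise. */
    static int gcm_tag_equal(const uint8_t* a, const uint8_t* b, size_t len)
    {
        uint8_t diff = 0;
        size_t i;
        for (i = 0; i < len; i++)
            diff |= (uint8_t)(a[i] ^ b[i]);
        return diff == 0;
    }

Branching out on the first mismatching byte instead would leak the mismatch position through timing, which is why the assembly materializes the result with sete rather than a data-dependent branch.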
-#ifdef WOLFSSL_AESGCM_STREAM
-.text
-.globl	AES_GCM_init_aesni
-.type	AES_GCM_init_aesni,@function
-.align	16
-AES_GCM_init_aesni:
-        pushl	%ebx
-        pushl	%esi
-        pushl	%edi
-        pushl	%ebp
-        subl	$16, %esp
-        movl	36(%esp), %ebp
-        movl	44(%esp), %esi
-        movl	60(%esp), %edi
-        pxor	%xmm4, %xmm4
-        movl	48(%esp), %edx
-        cmpl	$12, %edx
-        jne	L_AES_GCM_init_aesni_iv_not_12
-        # Calculate values when IV is 12 bytes
-        # Set counter based on IV
-        movl	$0x1000000, %ecx
-        pinsrd	$0x00, (%esi), %xmm4
-        pinsrd	$0x01, 4(%esi), %xmm4
-        pinsrd	$2, 8(%esi), %xmm4
-        pinsrd	$3, %ecx, %xmm4
-        # H = Encrypt X(=0) and T = Encrypt counter
-        movdqa	%xmm4, %xmm1
-        movdqa	(%ebp), %xmm5
-        pxor	%xmm5, %xmm1
-        movdqa	16(%ebp), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	32(%ebp), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	48(%ebp), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	64(%ebp), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	80(%ebp), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	96(%ebp), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	112(%ebp), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	128(%ebp), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	144(%ebp), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        cmpl	$11, 40(%esp)
-        movdqa	160(%ebp), %xmm7
-        jl	L_AES_GCM_init_aesni_calc_iv_12_last
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	176(%ebp), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        cmpl	$13, 40(%esp)
-        movdqa	192(%ebp), %xmm7
-        jl	L_AES_GCM_init_aesni_calc_iv_12_last
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	208(%ebp), %xmm7
-        aesenc	%xmm7, %xmm5
-        aesenc	%xmm7, %xmm1
-        movdqa	224(%ebp), %xmm7
-L_AES_GCM_init_aesni_calc_iv_12_last:
-        aesenclast	%xmm7, %xmm5
-        aesenclast	%xmm7, %xmm1
-        pshufb	L_aes_gcm_bswap_mask, %xmm5
-        movdqu	%xmm1, (%edi)
-        jmp	L_AES_GCM_init_aesni_iv_done
-L_AES_GCM_init_aesni_iv_not_12:
-        # Calculate values when IV is not 12 bytes
-        # H = Encrypt X(=0)
-        movdqa	(%ebp), %xmm5
-        aesenc	16(%ebp), %xmm5
-        aesenc	32(%ebp), %xmm5
-        aesenc	48(%ebp), %xmm5
-        aesenc	64(%ebp), %xmm5
-        aesenc	80(%ebp), %xmm5
-        aesenc	96(%ebp), %xmm5
-        aesenc	112(%ebp), %xmm5
-        aesenc	128(%ebp), %xmm5
-        aesenc	144(%ebp), %xmm5
-        cmpl	$11, 40(%esp)
-        movdqa	160(%ebp), %xmm1
-        jl	L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last
-        aesenc	%xmm1, %xmm5
-        aesenc	176(%ebp), %xmm5
-        cmpl	$13, 40(%esp)
-        movdqa	192(%ebp), %xmm1
-        jl	L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last
-        aesenc	%xmm1, %xmm5
-        aesenc	208(%ebp), %xmm5
-        movdqa	224(%ebp), %xmm1
-L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last:
-        aesenclast	%xmm1, %xmm5
-        pshufb	L_aes_gcm_bswap_mask, %xmm5
-        # Calc counter
-        # Initialization vector
-        cmpl	$0x00, %edx
-        movl	$0x00, %ecx
-        je	L_AES_GCM_init_aesni_calc_iv_done
-        cmpl	$16, %edx
-        jl	L_AES_GCM_init_aesni_calc_iv_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_init_aesni_calc_iv_16_loop:
-        movdqu	(%esi,%ecx,1), %xmm0
-        pshufb	L_aes_gcm_bswap_mask, %xmm0
-        pxor	%xmm0, %xmm4
-        pshufd	$0x4e, %xmm4, %xmm1
-        pshufd	$0x4e, %xmm5, %xmm2
-        movdqa	%xmm5, %xmm3
-        movdqa	%xmm5, %xmm0
-        pclmulqdq	$0x11, %xmm4, %xmm3
-        pclmulqdq	$0x00, %xmm4, %xmm0
-        pxor	%xmm4, %xmm1
-        pxor	%xmm5, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm0, %xmm7
-        movdqa	%xmm3, %xmm4
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm7
-        pxor	%xmm1, %xmm4
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm4, %xmm1
-        psrld	$31, %xmm0
-        psrld	$31, %xmm1
-        pslld	$0x01, %xmm7
-        pslld	$0x01, %xmm4
-        movdqa	%xmm0, %xmm2
-        pslldq	$4, %xmm0
-        psrldq	$12, %xmm2
-        pslldq	$4, %xmm1
-        por	%xmm2, %xmm4
-        por	%xmm0, %xmm7
-        por	%xmm1, %xmm4
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm7, %xmm1
-        movdqa	%xmm7, %xmm2
-        pslld	$31, %xmm0
-        pslld	$30, %xmm1
-        pslld	$25, %xmm2
-        pxor	%xmm1, %xmm0
-        pxor	%xmm2, %xmm0
-        movdqa	%xmm0, %xmm1
-        psrldq	$4, %xmm1
-        pslldq	$12, %xmm0
-        pxor	%xmm0, %xmm7
-        movdqa	%xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        psrld	$0x01, %xmm2
-        psrld	$2, %xmm3
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm2
-        pxor	%xmm0, %xmm2
-        pxor	%xmm1, %xmm2
-        pxor	%xmm7, %xmm2
-        pxor	%xmm2, %xmm4
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_init_aesni_calc_iv_16_loop
-        movl	48(%esp), %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_init_aesni_calc_iv_done
-L_AES_GCM_init_aesni_calc_iv_lt16:
-        subl	$16, %esp
-        pxor	%xmm0, %xmm0
-        xorl	%ebx, %ebx
-        movdqu	%xmm0, (%esp)
-L_AES_GCM_init_aesni_calc_iv_loop:
-        movzbl	(%esi,%ecx,1), %eax
-        movb	%al, (%esp,%ebx,1)
-        incl	%ecx
-        incl	%ebx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_init_aesni_calc_iv_loop
-        movdqu	(%esp), %xmm0
-        addl	$16, %esp
-        pshufb	L_aes_gcm_bswap_mask, %xmm0
-        pxor	%xmm0, %xmm4
-        pshufd	$0x4e, %xmm4, %xmm1
-        pshufd	$0x4e, %xmm5, %xmm2
-        movdqa	%xmm5, %xmm3
-        movdqa	%xmm5, %xmm0
-        pclmulqdq	$0x11, %xmm4, %xmm3
-        pclmulqdq	$0x00, %xmm4, %xmm0
-        pxor	%xmm4, %xmm1
-        pxor	%xmm5, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm0, %xmm7
-        movdqa	%xmm3, %xmm4
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm7
-        pxor	%xmm1, %xmm4
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm4, %xmm1
-        psrld	$31, %xmm0
-        psrld	$31, %xmm1
-        pslld	$0x01, %xmm7
-        pslld	$0x01, %xmm4
-        movdqa	%xmm0, %xmm2
-        pslldq	$4, %xmm0
-        psrldq	$12, %xmm2
-        pslldq	$4, %xmm1
-        por	%xmm2, %xmm4
-        por	%xmm0, %xmm7
-        por	%xmm1, %xmm4
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm7, %xmm1
-        movdqa	%xmm7, %xmm2
-        pslld	$31, %xmm0
-        pslld	$30, %xmm1
-        pslld	$25, %xmm2
-        pxor	%xmm1, %xmm0
-        pxor	%xmm2, %xmm0
-        movdqa	%xmm0, %xmm1
-        psrldq	$4, %xmm1
-        pslldq	$12, %xmm0
-        pxor	%xmm0, %xmm7
-        movdqa	%xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        psrld	$0x01, %xmm2
-        psrld	$2, %xmm3
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm2
-        pxor	%xmm0, %xmm2
-        pxor	%xmm1, %xmm2
-        pxor	%xmm7, %xmm2
-        pxor	%xmm2, %xmm4
-L_AES_GCM_init_aesni_calc_iv_done:
-        # T = Encrypt counter
-        pxor	%xmm0, %xmm0
-        shll	$3, %edx
-        pinsrd	$0x00, %edx, %xmm0
-        pxor	%xmm0, %xmm4
-        pshufd	$0x4e, %xmm4, %xmm1
-        pshufd	$0x4e, %xmm5, %xmm2
-        movdqa	%xmm5, %xmm3
-        movdqa	%xmm5, %xmm0
-        pclmulqdq	$0x11, %xmm4, %xmm3
-        pclmulqdq	$0x00, %xmm4, %xmm0
-        pxor	%xmm4, %xmm1
-        pxor	%xmm5, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm0, %xmm7
-        movdqa	%xmm3, %xmm4
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm7
-        pxor	%xmm1, %xmm4
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm4, %xmm1
-        psrld	$31, %xmm0
-        psrld	$31, %xmm1
-        pslld	$0x01, %xmm7
-        pslld	$0x01, %xmm4
-        movdqa	%xmm0, %xmm2
-        pslldq	$4, %xmm0
-        psrldq	$12, %xmm2
-        pslldq	$4, %xmm1
-        por	%xmm2, %xmm4
-        por	%xmm0, %xmm7
-        por	%xmm1, %xmm4
-        movdqa	%xmm7, %xmm0
-        movdqa	%xmm7, %xmm1
-        movdqa	%xmm7, %xmm2
-        pslld	$31, %xmm0
-        pslld	$30, %xmm1
-        pslld	$25, %xmm2
-        pxor	%xmm1, %xmm0
-        pxor	%xmm2, %xmm0
-        movdqa	%xmm0, %xmm1
-        psrldq	$4, %xmm1
-        pslldq	$12, %xmm0
-        pxor	%xmm0, %xmm7
-        movdqa	%xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        psrld	$0x01, %xmm2
-        psrld	$2, %xmm3
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm2
-        pxor	%xmm0, %xmm2
-        pxor	%xmm1, %xmm2
-        pxor	%xmm7, %xmm2
-        pxor	%xmm2, %xmm4
-        pshufb	L_aes_gcm_bswap_mask, %xmm4
-        #   Encrypt counter
-        movdqa	(%ebp), %xmm0
-        pxor	%xmm4, %xmm0
-        aesenc	16(%ebp), %xmm0
-        aesenc	32(%ebp), %xmm0
-        aesenc	48(%ebp), %xmm0
-        aesenc	64(%ebp), %xmm0
-        aesenc	80(%ebp), %xmm0
-        aesenc	96(%ebp), %xmm0
-        aesenc	112(%ebp), %xmm0
-        aesenc	128(%ebp), %xmm0
-        aesenc	144(%ebp), %xmm0
-        cmpl	$11, 40(%esp)
-        movdqa	160(%ebp), %xmm1
-        jl	L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last
-        aesenc	%xmm1, %xmm0
-        aesenc	176(%ebp), %xmm0
-        cmpl	$13, 40(%esp)
-        movdqa	192(%ebp), %xmm1
-        jl	L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last
-        aesenc	%xmm1, %xmm0
-        aesenc	208(%ebp), %xmm0
-        movdqa	224(%ebp), %xmm1
-L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last:
-        aesenclast	%xmm1, %xmm0
-        movdqu	%xmm0, (%edi)
-L_AES_GCM_init_aesni_iv_done:
-        movl	52(%esp), %ebp
-        movl	56(%esp), %edi
-        pshufb	L_aes_gcm_bswap_epi64, %xmm4
-        paddd	L_aes_gcm_one, %xmm4
-        movdqa	%xmm5, (%ebp)
-        movdqa	%xmm4, (%edi)
-        addl	$16, %esp
-        popl	%ebp
-        popl	%edi
-        popl	%esi
-        popl	%ebx
-        ret
-.size	AES_GCM_init_aesni,.-AES_GCM_init_aesni
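AES_GCM_init_aesni derives H = E_K(0^128) and the pre-counter block J0 per SP 800-38D: for the common 12-byte IV, J0 = IV || 0x00000001 (the $0x1000000 loaded above is that trailing big-endian 1 viewed as a little-endian dword in lane 3); for any other length, J0 = GHASH_H(IV zero-padded || 0^64 || [len(IV)]_64). A C sketch of the selection logic, assuming the portable ghash_update() reference sketched after AES_GCM_aad_update_aesni below (it operates directly on big-endian byte blocks, which is what the pshufb byte swaps arrange in the xmm registers):

    #include <stdint.h>
    #include <string.h>

    void ghash_update(uint8_t X[16], const uint8_t H[16],
                      const uint8_t blk[16]);   /* sketched below */

    /* Derive the pre-counter block J0 from an IV of any length. */
    static void gcm_compute_j0(uint8_t j0[16], const uint8_t* iv,
                               size_t ivlen, const uint8_t H[16])
    {
        if (ivlen == 12) {                 /* fast path: IV || 0x00000001 */
            memcpy(j0, iv, 12);
            memset(j0 + 12, 0, 3);
            j0[15] = 1;
        }
        else {                             /* GHASH the IV, then its length */
            uint8_t blk[16];
            uint64_t bits = (uint64_t)ivlen * 8;
            int i;
            memset(j0, 0, 16);
            for (; ivlen >= 16; iv += 16, ivlen -= 16)
                ghash_update(j0, H, iv);
            if (ivlen > 0) {               /* zero-padded partial block */
                memset(blk, 0, 16);
                memcpy(blk, iv, ivlen);
                ghash_update(j0, H, blk);
            }
            memset(blk, 0, 16);
            for (i = 0; i < 8; i++)        /* 64-bit big-endian bit count */
                blk[15 - i] = (uint8_t)(bits >> (8 * i));
            ghash_update(j0, H, blk);
        }
    }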
-.text
-.globl	AES_GCM_aad_update_aesni
-.type	AES_GCM_aad_update_aesni,@function
-.align	16
-AES_GCM_aad_update_aesni:
-        pushl	%esi
-        pushl	%edi
-        movl	12(%esp), %esi
-        movl	16(%esp), %edx
-        movl	20(%esp), %edi
-        movl	24(%esp), %eax
-        movdqa	(%edi), %xmm5
-        movdqa	(%eax), %xmm6
-        xorl	%ecx, %ecx
-L_AES_GCM_aad_update_aesni_16_loop:
-        movdqu	(%esi,%ecx,1), %xmm0
-        pshufb	L_aes_gcm_bswap_mask, %xmm0
-        pxor	%xmm0, %xmm5
-        pshufd	$0x4e, %xmm5, %xmm1
-        pshufd	$0x4e, %xmm6, %xmm2
-        movdqa	%xmm6, %xmm3
-        movdqa	%xmm6, %xmm0
-        pclmulqdq	$0x11, %xmm5, %xmm3
-        pclmulqdq	$0x00, %xmm5, %xmm0
-        pxor	%xmm5, %xmm1
-        pxor	%xmm6, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm0, %xmm4
-        movdqa	%xmm3, %xmm5
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm4
-        pxor	%xmm1, %xmm5
-        movdqa	%xmm4, %xmm0
-        movdqa	%xmm5, %xmm1
-        psrld	$31, %xmm0
-        psrld	$31, %xmm1
-        pslld	$0x01, %xmm4
-        pslld	$0x01, %xmm5
-        movdqa	%xmm0, %xmm2
-        pslldq	$4, %xmm0
-        psrldq	$12, %xmm2
-        pslldq	$4, %xmm1
-        por	%xmm2, %xmm5
-        por	%xmm0, %xmm4
-        por	%xmm1, %xmm5
-        movdqa	%xmm4, %xmm0
-        movdqa	%xmm4, %xmm1
-        movdqa	%xmm4, %xmm2
-        pslld	$31, %xmm0
-        pslld	$30, %xmm1
-        pslld	$25, %xmm2
-        pxor	%xmm1, %xmm0
-        pxor	%xmm2, %xmm0
-        movdqa	%xmm0, %xmm1
-        psrldq	$4, %xmm1
-        pslldq	$12, %xmm0
-        pxor	%xmm0, %xmm4
-        movdqa	%xmm4, %xmm2
-        movdqa	%xmm4, %xmm3
-        movdqa	%xmm4, %xmm0
-        psrld	$0x01, %xmm2
-        psrld	$2, %xmm3
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm2
-        pxor	%xmm0, %xmm2
-        pxor	%xmm1, %xmm2
-        pxor	%xmm4, %xmm2
-        pxor	%xmm2, %xmm5
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_aad_update_aesni_16_loop
-        movdqa	%xmm5, (%edi)
-        popl	%edi
-        popl	%esi
-        ret
-.size	AES_GCM_aad_update_aesni,.-AES_GCM_aad_update_aesni
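AES_GCM_aad_update_aesni is the GHASH compression loop on its own: each 16-byte AAD block is XORed into the state, which is then multiplied by H in GF(2^128) via three pclmulqdq (a Karatsuba split) plus the shift/XOR reduction. The same operation as a portable bitwise reference, useful for checking the vectorized path (a slow sketch, not the wolfSSL implementation; it works on big-endian byte blocks, so the pshufb byte reversal above is already folded in):

    #include <stdint.h>
    #include <string.h>

    /* GHASH block update: X = (X ^ blk) * H in GF(2^128) under the GCM
     * conventions of SP 800-38D (MSB-first bits, reduction byte 0xE1). */
    void ghash_update(uint8_t X[16], const uint8_t H[16],
                      const uint8_t blk[16])
    {
        uint8_t Z[16] = {0}, V[16];
        int i, j, carry;
        for (i = 0; i < 16; i++)
            X[i] ^= blk[i];
        memcpy(V, X, 16);                       /* V runs through X * x^i */
        for (i = 0; i < 128; i++) {
            if (H[i / 8] & (0x80 >> (i % 8))) { /* bit i of H, MSB first  */
                for (j = 0; j < 16; j++)
                    Z[j] ^= V[j];
            }
            /* V *= x: shift right one bit; on carry-out, fold in 0xE1. */
            carry = V[15] & 1;
            for (j = 15; j > 0; j--)
                V[j] = (uint8_t)((V[j] >> 1) | (V[j - 1] << 7));
            V[0] >>= 1;
            if (carry)
                V[0] ^= 0xE1;
        }
        memcpy(X, Z, 16);
    }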
-.text
-.globl	AES_GCM_encrypt_block_aesni
-.type	AES_GCM_encrypt_block_aesni,@function
-.align	16
-AES_GCM_encrypt_block_aesni:
-        pushl	%esi
-        pushl	%edi
-        movl	12(%esp), %ecx
-        movl	16(%esp), %eax
-        movl	20(%esp), %edi
-        movl	24(%esp), %esi
-        movl	28(%esp), %edx
-        movdqu	(%edx), %xmm0
-        movdqa	%xmm0, %xmm1
-        pshufb	L_aes_gcm_bswap_epi64, %xmm0
-        paddd	L_aes_gcm_one, %xmm1
-        pxor	(%ecx), %xmm0
-        movdqu	%xmm1, (%edx)
-        aesenc	16(%ecx), %xmm0
-        aesenc	32(%ecx), %xmm0
-        aesenc	48(%ecx), %xmm0
-        aesenc	64(%ecx), %xmm0
-        aesenc	80(%ecx), %xmm0
-        aesenc	96(%ecx), %xmm0
-        aesenc	112(%ecx), %xmm0
-        aesenc	128(%ecx), %xmm0
-        aesenc	144(%ecx), %xmm0
-        cmpl	$11, %eax
-        movdqa	160(%ecx), %xmm1
-        jl	L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last
-        aesenc	%xmm1, %xmm0
-        aesenc	176(%ecx), %xmm0
-        cmpl	$13, %eax
-        movdqa	192(%ecx), %xmm1
-        jl	L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last
-        aesenc	%xmm1, %xmm0
-        aesenc	208(%ecx), %xmm0
-        movdqa	224(%ecx), %xmm1
-L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last:
-        aesenclast	%xmm1, %xmm0
-        movdqu	(%esi), %xmm1
-        pxor	%xmm1, %xmm0
-        movdqu	%xmm0, (%edi)
-        pshufb	L_aes_gcm_bswap_mask, %xmm0
-        popl	%edi
-        popl	%esi
-        ret
-.size	AES_GCM_encrypt_block_aesni,.-AES_GCM_encrypt_block_aesni
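AES_GCM_encrypt_block_aesni is a single CTR step: byte-swap the saved counter into encryption order, run the aesenc chain, XOR the result with one plaintext block, and store the counter bumped by one for the next call. GCM's increment is inc32: big-endian, on the last four bytes only, wrapping modulo 2^32. A self-contained C sketch of that detail (the paddd L_aes_gcm_one on the stored register image corresponds to this, since paddd carries within a 32-bit lane only):

    #include <stdint.h>

    /* inc32: add 1 (mod 2^32), big-endian, to the last 32 bits of the
     * counter block; the upper 96 bits never change within one message. */
    static void gcm_inc32(uint8_t ctr[16])
    {
        int i;
        for (i = 15; i >= 12; i--) {
            if (++ctr[i] != 0)   /* stop once a byte does not wrap */
                break;
        }
    }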
-.text
-.globl	AES_GCM_ghash_block_aesni
-.type	AES_GCM_ghash_block_aesni,@function
-.align	16
-AES_GCM_ghash_block_aesni:
-        movl	4(%esp), %edx
-        movl	8(%esp), %eax
-        movl	12(%esp), %ecx
-        movdqa	(%eax), %xmm4
-        movdqa	(%ecx), %xmm5
-        movdqu	(%edx), %xmm0
-        pshufb	L_aes_gcm_bswap_mask, %xmm0
-        pxor	%xmm0, %xmm4
-        pshufd	$0x4e, %xmm4, %xmm1
-        pshufd	$0x4e, %xmm5, %xmm2
-        movdqa	%xmm5, %xmm3
-        movdqa	%xmm5, %xmm0
-        pclmulqdq	$0x11, %xmm4, %xmm3
-        pclmulqdq	$0x00, %xmm4, %xmm0
-        pxor	%xmm4, %xmm1
-        pxor	%xmm5, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm0, %xmm6
-        movdqa	%xmm3, %xmm4
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm6
-        pxor	%xmm1, %xmm4
-        movdqa	%xmm6, %xmm0
-        movdqa	%xmm4, %xmm1
-        psrld	$31, %xmm0
-        psrld	$31, %xmm1
-        pslld	$0x01, %xmm6
-        pslld	$0x01, %xmm4
-        movdqa	%xmm0, %xmm2
-        pslldq	$4, %xmm0
-        psrldq	$12, %xmm2
-        pslldq	$4, %xmm1
-        por	%xmm2, %xmm4
-        por	%xmm0, %xmm6
-        por	%xmm1, %xmm4
-        movdqa	%xmm6, %xmm0
-        movdqa	%xmm6, %xmm1
-        movdqa	%xmm6, %xmm2
-        pslld	$31, %xmm0
-        pslld	$30, %xmm1
-        pslld	$25, %xmm2
-        pxor	%xmm1, %xmm0
-        pxor	%xmm2, %xmm0
-        movdqa	%xmm0, %xmm1
-        psrldq	$4, %xmm1
-        pslldq	$12, %xmm0
-        pxor	%xmm0, %xmm6
-        movdqa	%xmm6, %xmm2
-        movdqa	%xmm6, %xmm3
-        movdqa	%xmm6, %xmm0
-        psrld	$0x01, %xmm2
-        psrld	$2, %xmm3
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm2
-        pxor	%xmm0, %xmm2
-        pxor	%xmm1, %xmm2
-        pxor	%xmm6, %xmm2
-        pxor	%xmm2, %xmm4
-        movdqa	%xmm4, (%eax)
-        ret
-.size	AES_GCM_ghash_block_aesni,.-AES_GCM_ghash_block_aesni
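The shift pattern that closes every GHASH fragment in this file (pslld by 31, 30, 25, then psrld by 1, 2, 7, all XORed together with the value itself) is the reduction modulo the GCM polynomial: in the bit-reflected form those shifted copies are the terms of x^128's residue. In LaTeX notation, the identity being applied is

    x^{128} \equiv x^7 + x^2 + x + 1 \pmod{x^{128} + x^7 + x^2 + x + 1}

so the 128 high-order bits of each carry-less product are folded back into the low half by XORing in their product with x^7 + x^2 + x + 1.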
-.text
-.globl	AES_GCM_encrypt_update_aesni
-.type	AES_GCM_encrypt_update_aesni,@function
-.align	16
-AES_GCM_encrypt_update_aesni:
-        pushl	%ebx
-        pushl	%esi
-        pushl	%edi
-        pushl	%ebp
-        subl	$0x60, %esp
-        movl	144(%esp), %esi
-        movdqa	(%esi), %xmm4
-        movdqu	%xmm4, 64(%esp)
-        movl	136(%esp), %esi
-        movl	140(%esp), %ebp
-        movdqa	(%esi), %xmm6
-        movdqa	(%ebp), %xmm5
-        movdqu	%xmm6, 80(%esp)
-        movl	116(%esp), %ebp
-        movl	124(%esp), %edi
-        movl	128(%esp), %esi
-        movdqa	%xmm5, %xmm1
-        movdqa	%xmm5, %xmm0
-        psrlq	$63, %xmm1
-        psllq	$0x01, %xmm0
-        pslldq	$8, %xmm1
-        por	%xmm1, %xmm0
-        pshufd	$0xff, %xmm5, %xmm5
-        psrad	$31, %xmm5
-        pand	L_aes_gcm_mod2_128, %xmm5
-        pxor	%xmm0, %xmm5
-        xorl	%ebx, %ebx
-        cmpl	$0x40, 132(%esp)
-        movl	132(%esp), %eax
-        jl	L_AES_GCM_encrypt_update_aesni_done_64
-        andl	$0xffffffc0, %eax
-        movdqa	%xmm6, %xmm2
-        # H ^ 1
-        movdqu	%xmm5, (%esp)
-        # H ^ 2
-        pshufd	$0x4e, %xmm5, %xmm1
-        pshufd	$0x4e, %xmm5, %xmm2
-        movdqa	%xmm5, %xmm3
-        movdqa	%xmm5, %xmm0
-        pclmulqdq	$0x11, %xmm5, %xmm3
-        pclmulqdq	$0x00, %xmm5, %xmm0
-        pxor	%xmm5, %xmm1
-        pxor	%xmm5, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm3, %xmm4
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm0
-        pxor	%xmm1, %xmm4
-        movdqa	%xmm0, %xmm1
-        movdqa	%xmm0, %xmm2
-        movdqa	%xmm0, %xmm3
-        pslld	$31, %xmm1
-        pslld	$30, %xmm2
-        pslld	$25, %xmm3
-        pxor	%xmm2, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm3
-        psrldq	$4, %xmm3
-        pslldq	$12, %xmm1
-        pxor	%xmm1, %xmm0
-        movdqa	%xmm0, %xmm1
-        movdqa	%xmm0, %xmm2
-        psrld	$0x01, %xmm1
-        psrld	$2, %xmm2
-        pxor	%xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm1, %xmm4
-        movdqu	%xmm4, 16(%esp)
-        # H ^ 3
-        pshufd	$0x4e, %xmm5, %xmm1
-        pshufd	$0x4e, %xmm4, %xmm2
-        movdqa	%xmm4, %xmm3
-        movdqa	%xmm4, %xmm0
-        pclmulqdq	$0x11, %xmm5, %xmm3
-        pclmulqdq	$0x00, %xmm5, %xmm0
-        pxor	%xmm5, %xmm1
-        pxor	%xmm4, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm3, %xmm7
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm0
-        pxor	%xmm1, %xmm7
-        movdqa	%xmm0, %xmm1
-        movdqa	%xmm0, %xmm2
-        movdqa	%xmm0, %xmm3
-        pslld	$31, %xmm1
-        pslld	$30, %xmm2
-        pslld	$25, %xmm3
-        pxor	%xmm2, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm3
-        psrldq	$4, %xmm3
-        pslldq	$12, %xmm1
-        pxor	%xmm1, %xmm0
-        movdqa	%xmm0, %xmm1
-        movdqa	%xmm0, %xmm2
-        psrld	$0x01, %xmm1
-        psrld	$2, %xmm2
-        pxor	%xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm1, %xmm7
-        movdqu	%xmm7, 32(%esp)
-        # H ^ 4
-        pshufd	$0x4e, %xmm4, %xmm1
-        pshufd	$0x4e, %xmm4, %xmm2
-        movdqa	%xmm4, %xmm3
-        movdqa	%xmm4, %xmm0
-        pclmulqdq	$0x11, %xmm4, %xmm3
-        pclmulqdq	$0x00, %xmm4, %xmm0
-        pxor	%xmm4, %xmm1
-        pxor	%xmm4, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm3, %xmm7
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm0
-        pxor	%xmm1, %xmm7
-        movdqa	%xmm0, %xmm1
-        movdqa	%xmm0, %xmm2
-        movdqa	%xmm0, %xmm3
-        pslld	$31, %xmm1
-        pslld	$30, %xmm2
-        pslld	$25, %xmm3
-        pxor	%xmm2, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm3
-        psrldq	$4, %xmm3
-        pslldq	$12, %xmm1
-        pxor	%xmm1, %xmm0
-        movdqa	%xmm0, %xmm1
-        movdqa	%xmm0, %xmm2
-        psrld	$0x01, %xmm1
-        psrld	$2, %xmm2
-        pxor	%xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm1, %xmm7
-        movdqu	%xmm7, 48(%esp)
-        # First 64 bytes of input
-        # Encrypt 64 bytes of counter
-        movdqu	64(%esp), %xmm0
-        movdqa	L_aes_gcm_bswap_epi64, %xmm7
-        movdqa	%xmm0, %xmm1
-        movdqa	%xmm0, %xmm2
-        movdqa	%xmm0, %xmm3
-        pshufb	%xmm7, %xmm0
-        paddd	L_aes_gcm_one, %xmm1
-        pshufb	%xmm7, %xmm1
-        paddd	L_aes_gcm_two, %xmm2
-        pshufb	%xmm7, %xmm2
-        paddd	L_aes_gcm_three, %xmm3
-        pshufb	%xmm7, %xmm3
-        movdqu	64(%esp), %xmm7
-        paddd	L_aes_gcm_four, %xmm7
-        movdqu	%xmm7, 64(%esp)
-        movdqa	(%ebp), %xmm7
-        pxor	%xmm7, %xmm0
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm7, %xmm3
-        movdqa	16(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	32(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	48(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	64(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	80(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	96(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	112(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	128(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	144(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        cmpl	$11, 120(%esp)
-        movdqa	160(%ebp), %xmm7
-        jl	L_AES_GCM_encrypt_update_aesni_enc_done
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	176(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        cmpl	$13, 120(%esp)
-        movdqa	192(%ebp), %xmm7
-        jl	L_AES_GCM_encrypt_update_aesni_enc_done
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	208(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	224(%ebp), %xmm7
-L_AES_GCM_encrypt_update_aesni_enc_done:
-        aesenclast	%xmm7, %xmm0
-        aesenclast	%xmm7, %xmm1
-        movdqu	(%esi), %xmm4
-        movdqu	16(%esi), %xmm5
-        pxor	%xmm4, %xmm0
-        pxor	%xmm5, %xmm1
-        movdqu	%xmm0, (%edi)
-        movdqu	%xmm1, 16(%edi)
-        aesenclast	%xmm7, %xmm2
-        aesenclast	%xmm7, %xmm3
-        movdqu	32(%esi), %xmm4
-        movdqu	48(%esi), %xmm5
-        pxor	%xmm4, %xmm2
-        pxor	%xmm5, %xmm3
-        movdqu	%xmm2, 32(%edi)
-        movdqu	%xmm3, 48(%edi)
-        cmpl	$0x40, %eax
-        movl	$0x40, %ebx
-        jle	L_AES_GCM_encrypt_update_aesni_end_64
-        # More 64 bytes of input
-L_AES_GCM_encrypt_update_aesni_ghash_64:
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        # Encrypt 64 bytes of counter
-        movdqu	64(%esp), %xmm0
-        movdqa	L_aes_gcm_bswap_epi64, %xmm7
-        movdqa	%xmm0, %xmm1
-        movdqa	%xmm0, %xmm2
-        movdqa	%xmm0, %xmm3
-        pshufb	%xmm7, %xmm0
-        paddd	L_aes_gcm_one, %xmm1
-        pshufb	%xmm7, %xmm1
-        paddd	L_aes_gcm_two, %xmm2
-        pshufb	%xmm7, %xmm2
-        paddd	L_aes_gcm_three, %xmm3
-        pshufb	%xmm7, %xmm3
-        movdqu	64(%esp), %xmm7
-        paddd	L_aes_gcm_four, %xmm7
-        movdqu	%xmm7, 64(%esp)
-        movdqa	(%ebp), %xmm7
-        pxor	%xmm7, %xmm0
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm7, %xmm3
-        movdqa	16(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	32(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	48(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	64(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	80(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	96(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	112(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	128(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	144(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        cmpl	$11, 120(%esp)
-        movdqa	160(%ebp), %xmm7
-        jl	L_AES_GCM_encrypt_update_aesni_aesenc_64_ghash_avx_done
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	176(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        cmpl	$13, 120(%esp)
-        movdqa	192(%ebp), %xmm7
-        jl	L_AES_GCM_encrypt_update_aesni_aesenc_64_ghash_avx_done
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	208(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	224(%ebp), %xmm7
-L_AES_GCM_encrypt_update_aesni_aesenc_64_ghash_avx_done:
-        aesenclast	%xmm7, %xmm0
-        aesenclast	%xmm7, %xmm1
-        movdqu	(%ecx), %xmm4
-        movdqu	16(%ecx), %xmm5
-        pxor	%xmm4, %xmm0
-        pxor	%xmm5, %xmm1
-        movdqu	%xmm0, (%edx)
-        movdqu	%xmm1, 16(%edx)
-        aesenclast	%xmm7, %xmm2
-        aesenclast	%xmm7, %xmm3
-        movdqu	32(%ecx), %xmm4
-        movdqu	48(%ecx), %xmm5
-        pxor	%xmm4, %xmm2
-        pxor	%xmm5, %xmm3
-        movdqu	%xmm2, 32(%edx)
-        movdqu	%xmm3, 48(%edx)
-        # ghash encrypted counter
-        movdqu	80(%esp), %xmm2
-        movdqu	48(%esp), %xmm7
-        movdqu	-64(%edx), %xmm0
-        pshufb	L_aes_gcm_bswap_mask, %xmm0
-        pxor	%xmm2, %xmm0
-        pshufd	$0x4e, %xmm7, %xmm1
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm7, %xmm1
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm3
-        pclmulqdq	$0x11, %xmm7, %xmm3
-        movdqa	%xmm0, %xmm2
-        pclmulqdq	$0x00, %xmm7, %xmm2
-        pclmulqdq	$0x00, %xmm5, %xmm1
-        pxor	%xmm2, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqu	32(%esp), %xmm7
-        movdqu	-48(%edx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask, %xmm0
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	16(%esp), %xmm7
-        movdqu	-32(%edx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask, %xmm0
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	(%esp), %xmm7
-        movdqu	-16(%edx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask, %xmm0
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqa	%xmm1, %xmm5
-        psrldq	$8, %xmm1
-        pslldq	$8, %xmm5
-        pxor	%xmm5, %xmm2
-        pxor	%xmm1, %xmm3
-        movdqa	%xmm2, %xmm7
-        movdqa	%xmm2, %xmm4
-        movdqa	%xmm2, %xmm5
-        pslld	$31, %xmm7
-        pslld	$30, %xmm4
-        pslld	$25, %xmm5
-        pxor	%xmm4, %xmm7
-        pxor	%xmm5, %xmm7
-        movdqa	%xmm7, %xmm4
-        pslldq	$12, %xmm7
-        psrldq	$4, %xmm4
-        pxor	%xmm7, %xmm2
-        movdqa	%xmm2, %xmm5
-        movdqa	%xmm2, %xmm1
-        movdqa	%xmm2, %xmm0
-        psrld	$0x01, %xmm5
-        psrld	$2, %xmm1
-        psrld	$7, %xmm0
-        pxor	%xmm1, %xmm5
-        pxor	%xmm0, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm5, %xmm2
-        pxor	%xmm3, %xmm2
-        movdqu	%xmm2, 80(%esp)
-        addl	$0x40, %ebx
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_encrypt_update_aesni_ghash_64
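The encrypt main loop is software-pipelined: each pass encrypts the current 64 bytes and GHASHes the previous 64 (hence the -64(%edx)..-16(%edx) loads of already-written ciphertext), overlapping aesenc latency with pclmulqdq work; the first chunk is encrypted before the loop, and the final chunk is hashed after it at L_AES_GCM_encrypt_update_aesni_end_64. The structure, sketched in C with hypothetical 4-block helpers standing in for the vector kernels:

    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical stand-ins for the AESNI / PCLMULQDQ kernels above. */
    void encrypt_4_blocks(uint8_t* out, const uint8_t* in, uint8_t ctr[16]);
    void ghash_4_blocks(uint8_t X[16], const uint8_t Hpow[4][16],
                        const uint8_t* blocks);

    /* Pipelined GCM encrypt core for a multiple of 64 bytes: GHASH always
     * trails the counter-mode encryption by one chunk. */
    void gcm_encrypt_pipelined(uint8_t* out, const uint8_t* in, size_t len64,
                               uint8_t ctr[16], uint8_t X[16],
                               const uint8_t Hpow[4][16])
    {
        size_t pos;
        encrypt_4_blocks(out, in, ctr);                 /* prologue       */
        for (pos = 64; pos < len64; pos += 64) {
            encrypt_4_blocks(out + pos, in + pos, ctr); /* current chunk  */
            ghash_4_blocks(X, Hpow, out + pos - 64);    /* previous chunk */
        }
        ghash_4_blocks(X, Hpow, out + len64 - 64);      /* drain pipeline */
    }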
-L_AES_GCM_encrypt_update_aesni_end_64:
-        movdqu	80(%esp), %xmm6
-        # Block 1
-        movdqa	L_aes_gcm_bswap_mask, %xmm0
-        movdqu	(%edx), %xmm5
-        pshufb	%xmm0, %xmm5
-        movdqu	48(%esp), %xmm7
-        pxor	%xmm6, %xmm5
-        pshufd	$0x4e, %xmm5, %xmm1
-        pshufd	$0x4e, %xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        pclmulqdq	$0x11, %xmm5, %xmm3
-        pclmulqdq	$0x00, %xmm5, %xmm0
-        pxor	%xmm5, %xmm1
-        pxor	%xmm7, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm0, %xmm4
-        movdqa	%xmm3, %xmm6
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm4
-        pxor	%xmm1, %xmm6
-        # Block 2
-        movdqa	L_aes_gcm_bswap_mask, %xmm0
-        movdqu	16(%edx), %xmm5
-        pshufb	%xmm0, %xmm5
-        movdqu	32(%esp), %xmm7
-        pshufd	$0x4e, %xmm5, %xmm1
-        pshufd	$0x4e, %xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        pclmulqdq	$0x11, %xmm5, %xmm3
-        pclmulqdq	$0x00, %xmm5, %xmm0
-        pxor	%xmm5, %xmm1
-        pxor	%xmm7, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        pxor	%xmm0, %xmm4
-        pxor	%xmm3, %xmm6
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm4
-        pxor	%xmm1, %xmm6
-        # Block 3
-        movdqa	L_aes_gcm_bswap_mask, %xmm0
-        movdqu	32(%edx), %xmm5
-        pshufb	%xmm0, %xmm5
-        movdqu	16(%esp), %xmm7
-        pshufd	$0x4e, %xmm5, %xmm1
-        pshufd	$0x4e, %xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        pclmulqdq	$0x11, %xmm5, %xmm3
-        pclmulqdq	$0x00, %xmm5, %xmm0
-        pxor	%xmm5, %xmm1
-        pxor	%xmm7, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        pxor	%xmm0, %xmm4
-        pxor	%xmm3, %xmm6
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm4
-        pxor	%xmm1, %xmm6
-        # Block 4
-        movdqa	L_aes_gcm_bswap_mask, %xmm0
-        movdqu	48(%edx), %xmm5
-        pshufb	%xmm0, %xmm5
-        movdqu	(%esp), %xmm7
-        pshufd	$0x4e, %xmm5, %xmm1
-        pshufd	$0x4e, %xmm7, %xmm2
-        movdqa	%xmm7, %xmm3
-        movdqa	%xmm7, %xmm0
-        pclmulqdq	$0x11, %xmm5, %xmm3
-        pclmulqdq	$0x00, %xmm5, %xmm0
-        pxor	%xmm5, %xmm1
-        pxor	%xmm7, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        pxor	%xmm0, %xmm4
-        pxor	%xmm3, %xmm6
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm4
-        pxor	%xmm1, %xmm6
-        movdqa	%xmm4, %xmm0
-        movdqa	%xmm4, %xmm1
-        movdqa	%xmm4, %xmm2
-        pslld	$31, %xmm0
-        pslld	$30, %xmm1
-        pslld	$25, %xmm2
-        pxor	%xmm1, %xmm0
-        pxor	%xmm2, %xmm0
-        movdqa	%xmm0, %xmm1
-        psrldq	$4, %xmm1
-        pslldq	$12, %xmm0
-        pxor	%xmm0, %xmm4
-        movdqa	%xmm4, %xmm2
-        movdqa	%xmm4, %xmm3
-        movdqa	%xmm4, %xmm0
-        psrld	$0x01, %xmm2
-        psrld	$2, %xmm3
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm2
-        pxor	%xmm0, %xmm2
-        pxor	%xmm1, %xmm2
-        pxor	%xmm4, %xmm2
-        pxor	%xmm2, %xmm6
-        movdqu	(%esp), %xmm5
-L_AES_GCM_encrypt_update_aesni_done_64:
-        movl	132(%esp), %edx
-        cmpl	%edx, %ebx
-        jge	L_AES_GCM_encrypt_update_aesni_done_enc
-        movl	132(%esp), %eax
-        andl	$0xfffffff0, %eax
-        cmpl	%eax, %ebx
-        jge	L_AES_GCM_encrypt_update_aesni_last_block_done
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        movdqu	64(%esp), %xmm0
-        movdqa	%xmm0, %xmm1
-        pshufb	L_aes_gcm_bswap_epi64, %xmm0
-        paddd	L_aes_gcm_one, %xmm1
-        pxor	(%ebp), %xmm0
-        movdqu	%xmm1, 64(%esp)
-        aesenc	16(%ebp), %xmm0
-        aesenc	32(%ebp), %xmm0
-        aesenc	48(%ebp), %xmm0
-        aesenc	64(%ebp), %xmm0
-        aesenc	80(%ebp), %xmm0
-        aesenc	96(%ebp), %xmm0
-        aesenc	112(%ebp), %xmm0
-        aesenc	128(%ebp), %xmm0
-        aesenc	144(%ebp), %xmm0
-        cmpl	$11, 120(%esp)
-        movdqa	160(%ebp), %xmm1
-        jl	L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last
-        aesenc	%xmm1, %xmm0
-        aesenc	176(%ebp), %xmm0
-        cmpl	$13, 120(%esp)
-        movdqa	192(%ebp), %xmm1
-        jl	L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last
-        aesenc	%xmm1, %xmm0
-        aesenc	208(%ebp), %xmm0
-        movdqa	224(%ebp), %xmm1
-L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last:
-        aesenclast	%xmm1, %xmm0
-        movdqu	(%ecx), %xmm1
-        pxor	%xmm1, %xmm0
-        movdqu	%xmm0, (%edx)
-        pshufb	L_aes_gcm_bswap_mask, %xmm0
-        pxor	%xmm0, %xmm6
-        addl	$16, %ebx
-        cmpl	%eax, %ebx
-        jge	L_AES_GCM_encrypt_update_aesni_last_block_ghash
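-        # Per-block loop: the GHASH multiply of the running state by H is
-        # interleaved with the AES rounds of the next counter block to hide
-        # pclmulqdq latency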
-L_AES_GCM_encrypt_update_aesni_last_block_start:
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        movdqu	64(%esp), %xmm0
-        movdqa	%xmm0, %xmm1
-        pshufb	L_aes_gcm_bswap_epi64, %xmm0
-        paddd	L_aes_gcm_one, %xmm1
-        pxor	(%ebp), %xmm0
-        movdqu	%xmm1, 64(%esp)
-        movdqu	%xmm6, %xmm4
-        pclmulqdq	$16, %xmm5, %xmm4
-        aesenc	16(%ebp), %xmm0
-        aesenc	32(%ebp), %xmm0
-        movdqu	%xmm6, %xmm7
-        pclmulqdq	$0x01, %xmm5, %xmm7
-        aesenc	48(%ebp), %xmm0
-        aesenc	64(%ebp), %xmm0
-        aesenc	80(%ebp), %xmm0
-        movdqu	%xmm6, %xmm1
-        pclmulqdq	$0x11, %xmm5, %xmm1
-        aesenc	96(%ebp), %xmm0
-        pxor	%xmm7, %xmm4
-        movdqa	%xmm4, %xmm2
-        psrldq	$8, %xmm4
-        pslldq	$8, %xmm2
-        aesenc	112(%ebp), %xmm0
-        movdqu	%xmm6, %xmm7
-        pclmulqdq	$0x00, %xmm5, %xmm7
-        pxor	%xmm7, %xmm2
-        pxor	%xmm4, %xmm1
-        movdqa	L_aes_gcm_mod2_128, %xmm3
-        movdqa	%xmm2, %xmm7
-        pclmulqdq	$16, %xmm3, %xmm7
-        aesenc	128(%ebp), %xmm0
-        pshufd	$0x4e, %xmm2, %xmm4
-        pxor	%xmm7, %xmm4
-        movdqa	%xmm4, %xmm7
-        pclmulqdq	$16, %xmm3, %xmm7
-        aesenc	144(%ebp), %xmm0
-        pshufd	$0x4e, %xmm4, %xmm6
-        pxor	%xmm7, %xmm6
-        pxor	%xmm1, %xmm6
-        cmpl	$11, 120(%esp)
-        movdqa	160(%ebp), %xmm1
-        jl	L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last
-        aesenc	%xmm1, %xmm0
-        aesenc	176(%ebp), %xmm0
-        cmpl	$13, 120(%esp)
-        movdqa	192(%ebp), %xmm1
-        jl	L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last
-        aesenc	%xmm1, %xmm0
-        aesenc	208(%ebp), %xmm0
-        movdqa	224(%ebp), %xmm1
-L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last:
-        aesenclast	%xmm1, %xmm0
-        movdqu	(%ecx), %xmm1
-        pxor	%xmm1, %xmm0
-        movdqu	%xmm0, (%edx)
-        pshufb	L_aes_gcm_bswap_mask, %xmm0
-        pxor	%xmm0, %xmm6
-        addl	$16, %ebx
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_encrypt_update_aesni_last_block_start
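-        # Final GHASH multiply of the accumulated state by H, then reduce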
-L_AES_GCM_encrypt_update_aesni_last_block_ghash:
-        pshufd	$0x4e, %xmm5, %xmm1
-        pshufd	$0x4e, %xmm6, %xmm2
-        movdqa	%xmm6, %xmm3
-        movdqa	%xmm6, %xmm0
-        pclmulqdq	$0x11, %xmm5, %xmm3
-        pclmulqdq	$0x00, %xmm5, %xmm0
-        pxor	%xmm5, %xmm1
-        pxor	%xmm6, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm3, %xmm6
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm0
-        pxor	%xmm1, %xmm6
-        movdqa	%xmm0, %xmm1
-        movdqa	%xmm0, %xmm2
-        movdqa	%xmm0, %xmm3
-        pslld	$31, %xmm1
-        pslld	$30, %xmm2
-        pslld	$25, %xmm3
-        pxor	%xmm2, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm3
-        psrldq	$4, %xmm3
-        pslldq	$12, %xmm1
-        pxor	%xmm1, %xmm0
-        movdqa	%xmm0, %xmm1
-        movdqa	%xmm0, %xmm2
-        psrld	$0x01, %xmm1
-        psrld	$2, %xmm2
-        pxor	%xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm1, %xmm6
-L_AES_GCM_encrypt_update_aesni_last_block_done:
-L_AES_GCM_encrypt_update_aesni_done_enc:
-        movl	136(%esp), %esi
-        movl	144(%esp), %edi
-        movdqu	64(%esp), %xmm4
-        movdqa	%xmm6, (%esi)
-        movdqu	%xmm4, (%edi)
-        addl	$0x60, %esp
-        popl	%ebp
-        popl	%edi
-        popl	%esi
-        popl	%ebx
-        ret
-.size	AES_GCM_encrypt_update_aesni,.-AES_GCM_encrypt_update_aesni
-.text
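-# Finalize encryption: fold the length block into GHASH, byte-swap, XOR with
-# the encrypted initial counter block, and write out the requested tag bytes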
-.globl	AES_GCM_encrypt_final_aesni
-.type	AES_GCM_encrypt_final_aesni,@function
-.align	16
-AES_GCM_encrypt_final_aesni:
-        pushl	%esi
-        pushl	%edi
-        pushl	%ebp
-        subl	$16, %esp
-        movl	32(%esp), %ebp
-        movl	52(%esp), %esi
-        movl	56(%esp), %edi
-        movdqa	(%ebp), %xmm4
-        movdqa	(%esi), %xmm5
-        movdqa	(%edi), %xmm6
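-        # Double H in GF(2^128): 128-bit left shift by one, conditionally
-        # XORing the reduction constant when the top bit was set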
-        movdqa	%xmm5, %xmm1
-        movdqa	%xmm5, %xmm0
-        psrlq	$63, %xmm1
-        psllq	$0x01, %xmm0
-        pslldq	$8, %xmm1
-        por	%xmm1, %xmm0
-        pshufd	$0xff, %xmm5, %xmm5
-        psrad	$31, %xmm5
-        pand	L_aes_gcm_mod2_128, %xmm5
-        pxor	%xmm0, %xmm5
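-        # Build the length block: AAD and ciphertext byte counts scaled to
-        # bit counts (shll $3 = *8; shrl $29 recovers the high bits)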
-        movl	44(%esp), %edx
-        movl	48(%esp), %ecx
-        shll	$3, %edx
-        shll	$3, %ecx
-        pinsrd	$0x00, %edx, %xmm0
-        pinsrd	$2, %ecx, %xmm0
-        movl	44(%esp), %edx
-        movl	48(%esp), %ecx
-        shrl	$29, %edx
-        shrl	$29, %ecx
-        pinsrd	$0x01, %edx, %xmm0
-        pinsrd	$3, %ecx, %xmm0
-        pxor	%xmm0, %xmm4
-        pshufd	$0x4e, %xmm5, %xmm1
-        pshufd	$0x4e, %xmm4, %xmm2
-        movdqa	%xmm4, %xmm3
-        movdqa	%xmm4, %xmm0
-        pclmulqdq	$0x11, %xmm5, %xmm3
-        pclmulqdq	$0x00, %xmm5, %xmm0
-        pxor	%xmm5, %xmm1
-        pxor	%xmm4, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm3, %xmm4
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm0
-        pxor	%xmm1, %xmm4
-        movdqa	%xmm0, %xmm1
-        movdqa	%xmm0, %xmm2
-        movdqa	%xmm0, %xmm3
-        pslld	$31, %xmm1
-        pslld	$30, %xmm2
-        pslld	$25, %xmm3
-        pxor	%xmm2, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm3
-        psrldq	$4, %xmm3
-        pslldq	$12, %xmm1
-        pxor	%xmm1, %xmm0
-        movdqa	%xmm0, %xmm1
-        movdqa	%xmm0, %xmm2
-        psrld	$0x01, %xmm1
-        psrld	$2, %xmm2
-        pxor	%xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm1, %xmm4
-        pshufb	L_aes_gcm_bswap_mask, %xmm4
-        movdqu	%xmm6, %xmm0
-        pxor	%xmm4, %xmm0
-        movl	36(%esp), %edi
-        cmpl	$16, 40(%esp)
-        je	L_AES_GCM_encrypt_final_aesni_store_tag_16
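-        # Tag shorter than 16 bytes: spill the full tag to the stack and
-        # copy only the requested number of bytes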
-        xorl	%ecx, %ecx
-        movdqu	%xmm0, (%esp)
-L_AES_GCM_encrypt_final_aesni_store_tag_loop:
-        movzbl	(%esp,%ecx,1), %eax
-        movb	%al, (%edi,%ecx,1)
-        incl	%ecx
-        cmpl	40(%esp), %ecx
-        jne	L_AES_GCM_encrypt_final_aesni_store_tag_loop
-        jmp	L_AES_GCM_encrypt_final_aesni_store_tag_done
-L_AES_GCM_encrypt_final_aesni_store_tag_16:
-        movdqu	%xmm0, (%edi)
-L_AES_GCM_encrypt_final_aesni_store_tag_done:
-        addl	$16, %esp
-        popl	%ebp
-        popl	%edi
-        popl	%esi
-        ret
-.size	AES_GCM_encrypt_final_aesni,.-AES_GCM_encrypt_final_aesni
-.text
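-# CTR-decrypt the ciphertext while folding it into GHASH; 64-byte chunks use
-# the precomputed powers H^1..H^4 so four blocks are hashed per pass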
-.globl	AES_GCM_decrypt_update_aesni
-.type	AES_GCM_decrypt_update_aesni,@function
-.align	16
-AES_GCM_decrypt_update_aesni:
-        pushl	%ebx
-        pushl	%esi
-        pushl	%edi
-        pushl	%ebp
-        subl	$0xa0, %esp
-        movl	208(%esp), %esi
-        movdqa	(%esi), %xmm4
-        movdqu	%xmm4, 64(%esp)
-        movl	200(%esp), %esi
-        movl	204(%esp), %ebp
-        movdqa	(%esi), %xmm6
-        movdqa	(%ebp), %xmm5
-        movdqu	%xmm6, 80(%esp)
-        movl	180(%esp), %ebp
-        movl	188(%esp), %edi
-        movl	192(%esp), %esi
-        movdqa	%xmm5, %xmm1
-        movdqa	%xmm5, %xmm0
-        psrlq	$63, %xmm1
-        psllq	$0x01, %xmm0
-        pslldq	$8, %xmm1
-        por	%xmm1, %xmm0
-        pshufd	$0xff, %xmm5, %xmm5
-        psrad	$31, %xmm5
-        pand	L_aes_gcm_mod2_128, %xmm5
-        pxor	%xmm0, %xmm5
-        xorl	%ebx, %ebx
-        cmpl	$0x40, 196(%esp)
-        movl	196(%esp), %eax
-        jl	L_AES_GCM_decrypt_update_aesni_done_64
-        andl	$0xffffffc0, %eax
-        movdqa	%xmm6, %xmm2
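-        # Precompute H^1..H^4 (Karatsuba multiply + reduction) and spill them
-        # to the stack for the 4-blocks-at-a-time GHASH below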
-        # H ^ 1
-        movdqu	%xmm5, (%esp)
-        # H ^ 2
-        pshufd	$0x4e, %xmm5, %xmm1
-        pshufd	$0x4e, %xmm5, %xmm2
-        movdqa	%xmm5, %xmm3
-        movdqa	%xmm5, %xmm0
-        pclmulqdq	$0x11, %xmm5, %xmm3
-        pclmulqdq	$0x00, %xmm5, %xmm0
-        pxor	%xmm5, %xmm1
-        pxor	%xmm5, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm3, %xmm4
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm0
-        pxor	%xmm1, %xmm4
-        movdqa	%xmm0, %xmm1
-        movdqa	%xmm0, %xmm2
-        movdqa	%xmm0, %xmm3
-        pslld	$31, %xmm1
-        pslld	$30, %xmm2
-        pslld	$25, %xmm3
-        pxor	%xmm2, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm3
-        psrldq	$4, %xmm3
-        pslldq	$12, %xmm1
-        pxor	%xmm1, %xmm0
-        movdqa	%xmm0, %xmm1
-        movdqa	%xmm0, %xmm2
-        psrld	$0x01, %xmm1
-        psrld	$2, %xmm2
-        pxor	%xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm1, %xmm4
-        movdqu	%xmm4, 16(%esp)
-        # H ^ 3
-        pshufd	$0x4e, %xmm5, %xmm1
-        pshufd	$0x4e, %xmm4, %xmm2
-        movdqa	%xmm4, %xmm3
-        movdqa	%xmm4, %xmm0
-        pclmulqdq	$0x11, %xmm5, %xmm3
-        pclmulqdq	$0x00, %xmm5, %xmm0
-        pxor	%xmm5, %xmm1
-        pxor	%xmm4, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm3, %xmm7
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm0
-        pxor	%xmm1, %xmm7
-        movdqa	%xmm0, %xmm1
-        movdqa	%xmm0, %xmm2
-        movdqa	%xmm0, %xmm3
-        pslld	$31, %xmm1
-        pslld	$30, %xmm2
-        pslld	$25, %xmm3
-        pxor	%xmm2, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm3
-        psrldq	$4, %xmm3
-        pslldq	$12, %xmm1
-        pxor	%xmm1, %xmm0
-        movdqa	%xmm0, %xmm1
-        movdqa	%xmm0, %xmm2
-        psrld	$0x01, %xmm1
-        psrld	$2, %xmm2
-        pxor	%xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm1, %xmm7
-        movdqu	%xmm7, 32(%esp)
-        # H ^ 4
-        pshufd	$0x4e, %xmm4, %xmm1
-        pshufd	$0x4e, %xmm4, %xmm2
-        movdqa	%xmm4, %xmm3
-        movdqa	%xmm4, %xmm0
-        pclmulqdq	$0x11, %xmm4, %xmm3
-        pclmulqdq	$0x00, %xmm4, %xmm0
-        pxor	%xmm4, %xmm1
-        pxor	%xmm4, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm3, %xmm7
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm0
-        pxor	%xmm1, %xmm7
-        movdqa	%xmm0, %xmm1
-        movdqa	%xmm0, %xmm2
-        movdqa	%xmm0, %xmm3
-        pslld	$31, %xmm1
-        pslld	$30, %xmm2
-        pslld	$25, %xmm3
-        pxor	%xmm2, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm3
-        psrldq	$4, %xmm3
-        pslldq	$12, %xmm1
-        pxor	%xmm1, %xmm0
-        movdqa	%xmm0, %xmm1
-        movdqa	%xmm0, %xmm2
-        psrld	$0x01, %xmm1
-        psrld	$2, %xmm2
-        pxor	%xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm1, %xmm7
-        movdqu	%xmm7, 48(%esp)
-        cmpl	%esi, %edi
-        jne	L_AES_GCM_decrypt_update_aesni_ghash_64
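-        # In-place (src == dst): stash the ciphertext on the stack before it
-        # is overwritten, since GHASH must still read the original ciphertext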
-L_AES_GCM_decrypt_update_aesni_ghash_64_inplace:
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        # Encrypt 64 bytes of counter
-        movdqu	64(%esp), %xmm0
-        movdqa	L_aes_gcm_bswap_epi64, %xmm7
-        movdqa	%xmm0, %xmm1
-        movdqa	%xmm0, %xmm2
-        movdqa	%xmm0, %xmm3
-        pshufb	%xmm7, %xmm0
-        paddd	L_aes_gcm_one, %xmm1
-        pshufb	%xmm7, %xmm1
-        paddd	L_aes_gcm_two, %xmm2
-        pshufb	%xmm7, %xmm2
-        paddd	L_aes_gcm_three, %xmm3
-        pshufb	%xmm7, %xmm3
-        movdqu	64(%esp), %xmm7
-        paddd	L_aes_gcm_four, %xmm7
-        movdqu	%xmm7, 64(%esp)
-        movdqa	(%ebp), %xmm7
-        pxor	%xmm7, %xmm0
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm7, %xmm3
-        movdqa	16(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	32(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	48(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	64(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	80(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	96(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	112(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	128(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	144(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        cmpl	$11, 184(%esp)
-        movdqa	160(%ebp), %xmm7
-        jl	L_AES_GCM_decrypt_update_aesniinplace_aesenc_64_ghash_avx_done
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	176(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        cmpl	$13, 184(%esp)
-        movdqa	192(%ebp), %xmm7
-        jl	L_AES_GCM_decrypt_update_aesniinplace_aesenc_64_ghash_avx_done
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	208(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	224(%ebp), %xmm7
-L_AES_GCM_decrypt_update_aesniinplace_aesenc_64_ghash_avx_done:
-        aesenclast	%xmm7, %xmm0
-        aesenclast	%xmm7, %xmm1
-        movdqu	(%ecx), %xmm4
-        movdqu	16(%ecx), %xmm5
-        pxor	%xmm4, %xmm0
-        pxor	%xmm5, %xmm1
-        movdqu	%xmm4, 96(%esp)
-        movdqu	%xmm5, 112(%esp)
-        movdqu	%xmm0, (%edx)
-        movdqu	%xmm1, 16(%edx)
-        aesenclast	%xmm7, %xmm2
-        aesenclast	%xmm7, %xmm3
-        movdqu	32(%ecx), %xmm4
-        movdqu	48(%ecx), %xmm5
-        pxor	%xmm4, %xmm2
-        pxor	%xmm5, %xmm3
-        movdqu	%xmm4, 128(%esp)
-        movdqu	%xmm5, 144(%esp)
-        movdqu	%xmm2, 32(%edx)
-        movdqu	%xmm3, 48(%edx)
-        # GHASH the saved ciphertext blocks
-        movdqu	80(%esp), %xmm2
-        movdqu	48(%esp), %xmm7
-        movdqu	96(%esp), %xmm0
-        pshufb	L_aes_gcm_bswap_mask, %xmm0
-        pxor	%xmm2, %xmm0
-        pshufd	$0x4e, %xmm7, %xmm1
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm7, %xmm1
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm3
-        pclmulqdq	$0x11, %xmm7, %xmm3
-        movdqa	%xmm0, %xmm2
-        pclmulqdq	$0x00, %xmm7, %xmm2
-        pclmulqdq	$0x00, %xmm5, %xmm1
-        pxor	%xmm2, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqu	32(%esp), %xmm7
-        movdqu	112(%esp), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask, %xmm0
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	16(%esp), %xmm7
-        movdqu	128(%esp), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask, %xmm0
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	(%esp), %xmm7
-        movdqu	144(%esp), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask, %xmm0
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqa	%xmm1, %xmm5
-        psrldq	$8, %xmm1
-        pslldq	$8, %xmm5
-        pxor	%xmm5, %xmm2
-        pxor	%xmm1, %xmm3
-        movdqa	%xmm2, %xmm7
-        movdqa	%xmm2, %xmm4
-        movdqa	%xmm2, %xmm5
-        pslld	$31, %xmm7
-        pslld	$30, %xmm4
-        pslld	$25, %xmm5
-        pxor	%xmm4, %xmm7
-        pxor	%xmm5, %xmm7
-        movdqa	%xmm7, %xmm4
-        pslldq	$12, %xmm7
-        psrldq	$4, %xmm4
-        pxor	%xmm7, %xmm2
-        movdqa	%xmm2, %xmm5
-        movdqa	%xmm2, %xmm1
-        movdqa	%xmm2, %xmm0
-        psrld	$0x01, %xmm5
-        psrld	$2, %xmm1
-        psrld	$7, %xmm0
-        pxor	%xmm1, %xmm5
-        pxor	%xmm0, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm5, %xmm2
-        pxor	%xmm3, %xmm2
-        movdqu	%xmm2, 80(%esp)
-        addl	$0x40, %ebx
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_decrypt_update_aesni_ghash_64_inplace
-        jmp	L_AES_GCM_decrypt_update_aesni_ghash_64_done
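-        # Out-of-place: the ciphertext at the source stays intact, so GHASH
-        # reads it straight from memory after the CTR decryption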
-L_AES_GCM_decrypt_update_aesni_ghash_64:
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        # Encrypt 64 bytes of counter
-        movdqu	64(%esp), %xmm0
-        movdqa	L_aes_gcm_bswap_epi64, %xmm7
-        movdqa	%xmm0, %xmm1
-        movdqa	%xmm0, %xmm2
-        movdqa	%xmm0, %xmm3
-        pshufb	%xmm7, %xmm0
-        paddd	L_aes_gcm_one, %xmm1
-        pshufb	%xmm7, %xmm1
-        paddd	L_aes_gcm_two, %xmm2
-        pshufb	%xmm7, %xmm2
-        paddd	L_aes_gcm_three, %xmm3
-        pshufb	%xmm7, %xmm3
-        movdqu	64(%esp), %xmm7
-        paddd	L_aes_gcm_four, %xmm7
-        movdqu	%xmm7, 64(%esp)
-        movdqa	(%ebp), %xmm7
-        pxor	%xmm7, %xmm0
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm7, %xmm3
-        movdqa	16(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	32(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	48(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	64(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	80(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	96(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	112(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	128(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	144(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        cmpl	$11, 184(%esp)
-        movdqa	160(%ebp), %xmm7
-        jl	L_AES_GCM_decrypt_update_aesni_aesenc_64_ghash_avx_done
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	176(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        cmpl	$13, 184(%esp)
-        movdqa	192(%ebp), %xmm7
-        jl	L_AES_GCM_decrypt_update_aesni_aesenc_64_ghash_avx_done
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	208(%ebp), %xmm7
-        aesenc	%xmm7, %xmm0
-        aesenc	%xmm7, %xmm1
-        aesenc	%xmm7, %xmm2
-        aesenc	%xmm7, %xmm3
-        movdqa	224(%ebp), %xmm7
-L_AES_GCM_decrypt_update_aesni_aesenc_64_ghash_avx_done:
-        aesenclast	%xmm7, %xmm0
-        aesenclast	%xmm7, %xmm1
-        movdqu	(%ecx), %xmm4
-        movdqu	16(%ecx), %xmm5
-        pxor	%xmm4, %xmm0
-        pxor	%xmm5, %xmm1
-        movdqu	%xmm4, (%ecx)
-        movdqu	%xmm5, 16(%ecx)
-        movdqu	%xmm0, (%edx)
-        movdqu	%xmm1, 16(%edx)
-        aesenclast	%xmm7, %xmm2
-        aesenclast	%xmm7, %xmm3
-        movdqu	32(%ecx), %xmm4
-        movdqu	48(%ecx), %xmm5
-        pxor	%xmm4, %xmm2
-        pxor	%xmm5, %xmm3
-        movdqu	%xmm4, 32(%ecx)
-        movdqu	%xmm5, 48(%ecx)
-        movdqu	%xmm2, 32(%edx)
-        movdqu	%xmm3, 48(%edx)
-        # GHASH the ciphertext blocks
-        movdqu	80(%esp), %xmm2
-        movdqu	48(%esp), %xmm7
-        movdqu	(%ecx), %xmm0
-        pshufb	L_aes_gcm_bswap_mask, %xmm0
-        pxor	%xmm2, %xmm0
-        pshufd	$0x4e, %xmm7, %xmm1
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm7, %xmm1
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm3
-        pclmulqdq	$0x11, %xmm7, %xmm3
-        movdqa	%xmm0, %xmm2
-        pclmulqdq	$0x00, %xmm7, %xmm2
-        pclmulqdq	$0x00, %xmm5, %xmm1
-        pxor	%xmm2, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqu	32(%esp), %xmm7
-        movdqu	16(%ecx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask, %xmm0
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	16(%esp), %xmm7
-        movdqu	32(%ecx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask, %xmm0
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqu	(%esp), %xmm7
-        movdqu	48(%ecx), %xmm0
-        pshufd	$0x4e, %xmm7, %xmm4
-        pshufb	L_aes_gcm_bswap_mask, %xmm0
-        pxor	%xmm7, %xmm4
-        pshufd	$0x4e, %xmm0, %xmm5
-        pxor	%xmm0, %xmm5
-        movdqa	%xmm0, %xmm6
-        pclmulqdq	$0x11, %xmm7, %xmm6
-        pclmulqdq	$0x00, %xmm0, %xmm7
-        pclmulqdq	$0x00, %xmm5, %xmm4
-        pxor	%xmm7, %xmm1
-        pxor	%xmm7, %xmm2
-        pxor	%xmm6, %xmm1
-        pxor	%xmm6, %xmm3
-        pxor	%xmm4, %xmm1
-        movdqa	%xmm1, %xmm5
-        psrldq	$8, %xmm1
-        pslldq	$8, %xmm5
-        pxor	%xmm5, %xmm2
-        pxor	%xmm1, %xmm3
-        movdqa	%xmm2, %xmm7
-        movdqa	%xmm2, %xmm4
-        movdqa	%xmm2, %xmm5
-        pslld	$31, %xmm7
-        pslld	$30, %xmm4
-        pslld	$25, %xmm5
-        pxor	%xmm4, %xmm7
-        pxor	%xmm5, %xmm7
-        movdqa	%xmm7, %xmm4
-        pslldq	$12, %xmm7
-        psrldq	$4, %xmm4
-        pxor	%xmm7, %xmm2
-        movdqa	%xmm2, %xmm5
-        movdqa	%xmm2, %xmm1
-        movdqa	%xmm2, %xmm0
-        psrld	$0x01, %xmm5
-        psrld	$2, %xmm1
-        psrld	$7, %xmm0
-        pxor	%xmm1, %xmm5
-        pxor	%xmm0, %xmm5
-        pxor	%xmm4, %xmm5
-        pxor	%xmm5, %xmm2
-        pxor	%xmm3, %xmm2
-        movdqu	%xmm2, 80(%esp)
-        addl	$0x40, %ebx
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_decrypt_update_aesni_ghash_64
-L_AES_GCM_decrypt_update_aesni_ghash_64_done:
-        movdqa	%xmm2, %xmm6
-        movdqu	(%esp), %xmm5
-L_AES_GCM_decrypt_update_aesni_done_64:
-        movl	196(%esp), %edx
-        cmpl	%edx, %ebx
-        jge	L_AES_GCM_decrypt_update_aesni_done_dec
-        movl	196(%esp), %eax
-        andl	$0xfffffff0, %eax
-        cmpl	%eax, %ebx
-        jge	L_AES_GCM_decrypt_update_aesni_last_block_done
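-        # Remaining full blocks: GHASH the ciphertext while running the AES
-        # rounds for the counter block, then XOR to produce the plaintext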
-L_AES_GCM_decrypt_update_aesni_last_block_start:
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        movdqu	(%ecx), %xmm1
-        pshufb	L_aes_gcm_bswap_mask, %xmm1
-        pxor	%xmm6, %xmm1
-        movdqu	%xmm1, (%esp)
-        movdqu	64(%esp), %xmm0
-        movdqa	%xmm0, %xmm1
-        pshufb	L_aes_gcm_bswap_epi64, %xmm0
-        paddd	L_aes_gcm_one, %xmm1
-        pxor	(%ebp), %xmm0
-        movdqu	%xmm1, 64(%esp)
-        movdqu	(%esp), %xmm4
-        pclmulqdq	$16, %xmm5, %xmm4
-        aesenc	16(%ebp), %xmm0
-        aesenc	32(%ebp), %xmm0
-        movdqu	(%esp), %xmm7
-        pclmulqdq	$0x01, %xmm5, %xmm7
-        aesenc	48(%ebp), %xmm0
-        aesenc	64(%ebp), %xmm0
-        aesenc	80(%ebp), %xmm0
-        movdqu	(%esp), %xmm1
-        pclmulqdq	$0x11, %xmm5, %xmm1
-        aesenc	96(%ebp), %xmm0
-        pxor	%xmm7, %xmm4
-        movdqa	%xmm4, %xmm2
-        psrldq	$8, %xmm4
-        pslldq	$8, %xmm2
-        aesenc	112(%ebp), %xmm0
-        movdqu	(%esp), %xmm7
-        pclmulqdq	$0x00, %xmm5, %xmm7
-        pxor	%xmm7, %xmm2
-        pxor	%xmm4, %xmm1
-        movdqa	L_aes_gcm_mod2_128, %xmm3
-        movdqa	%xmm2, %xmm7
-        pclmulqdq	$16, %xmm3, %xmm7
-        aesenc	128(%ebp), %xmm0
-        pshufd	$0x4e, %xmm2, %xmm4
-        pxor	%xmm7, %xmm4
-        movdqa	%xmm4, %xmm7
-        pclmulqdq	$16, %xmm3, %xmm7
-        aesenc	144(%ebp), %xmm0
-        pshufd	$0x4e, %xmm4, %xmm6
-        pxor	%xmm7, %xmm6
-        pxor	%xmm1, %xmm6
-        cmpl	$11, 184(%esp)
-        movdqa	160(%ebp), %xmm1
-        jl	L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last
-        aesenc	%xmm1, %xmm0
-        aesenc	176(%ebp), %xmm0
-        cmpl	$13, 184(%esp)
-        movdqa	192(%ebp), %xmm1
-        jl	L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last
-        aesenc	%xmm1, %xmm0
-        aesenc	208(%ebp), %xmm0
-        movdqa	224(%ebp), %xmm1
-L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last:
-        aesenclast	%xmm1, %xmm0
-        movdqu	(%ecx), %xmm1
-        pxor	%xmm1, %xmm0
-        movdqu	%xmm0, (%edx)
-        addl	$16, %ebx
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_decrypt_update_aesni_last_block_start
-L_AES_GCM_decrypt_update_aesni_last_block_done:
-L_AES_GCM_decrypt_update_aesni_done_dec:
-        movl	200(%esp), %esi
-        movl	208(%esp), %edi
-        movdqu	64(%esp), %xmm4
-        movdqa	%xmm6, (%esi)
-        movdqu	%xmm4, (%edi)
-        addl	$0xa0, %esp
-        popl	%ebp
-        popl	%edi
-        popl	%esi
-        popl	%ebx
-        ret
-.size	AES_GCM_decrypt_update_aesni,.-AES_GCM_decrypt_update_aesni
-.text
-.globl	AES_GCM_decrypt_final_aesni
-.type	AES_GCM_decrypt_final_aesni,@function
-.align	16
-AES_GCM_decrypt_final_aesni:
-        pushl	%ebx
-        pushl	%esi
-        pushl	%edi
-        pushl	%ebp
-        subl	$16, %esp
-        movl	36(%esp), %ebp
-        movl	56(%esp), %esi
-        movl	60(%esp), %edi
-        movdqa	(%ebp), %xmm6
-        movdqa	(%esi), %xmm5
-        movdqa	(%edi), %xmm7
-        movdqa	%xmm5, %xmm1
-        movdqa	%xmm5, %xmm0
-        psrlq	$63, %xmm1
-        psllq	$0x01, %xmm0
-        pslldq	$8, %xmm1
-        por	%xmm1, %xmm0
-        pshufd	$0xff, %xmm5, %xmm5
-        psrad	$31, %xmm5
-        pand	L_aes_gcm_mod2_128, %xmm5
-        pxor	%xmm0, %xmm5
-        movl	48(%esp), %edx
-        movl	52(%esp), %ecx
-        shll	$3, %edx
-        shll	$3, %ecx
-        pinsrd	$0x00, %edx, %xmm0
-        pinsrd	$2, %ecx, %xmm0
-        movl	48(%esp), %edx
-        movl	52(%esp), %ecx
-        shrl	$29, %edx
-        shrl	$29, %ecx
-        pinsrd	$0x01, %edx, %xmm0
-        pinsrd	$3, %ecx, %xmm0
-        pxor	%xmm0, %xmm6
-        pshufd	$0x4e, %xmm5, %xmm1
-        pshufd	$0x4e, %xmm6, %xmm2
-        movdqa	%xmm6, %xmm3
-        movdqa	%xmm6, %xmm0
-        pclmulqdq	$0x11, %xmm5, %xmm3
-        pclmulqdq	$0x00, %xmm5, %xmm0
-        pxor	%xmm5, %xmm1
-        pxor	%xmm6, %xmm2
-        pclmulqdq	$0x00, %xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm2
-        movdqa	%xmm3, %xmm6
-        pslldq	$8, %xmm2
-        psrldq	$8, %xmm1
-        pxor	%xmm2, %xmm0
-        pxor	%xmm1, %xmm6
-        movdqa	%xmm0, %xmm1
-        movdqa	%xmm0, %xmm2
-        movdqa	%xmm0, %xmm3
-        pslld	$31, %xmm1
-        pslld	$30, %xmm2
-        pslld	$25, %xmm3
-        pxor	%xmm2, %xmm1
-        pxor	%xmm3, %xmm1
-        movdqa	%xmm1, %xmm3
-        psrldq	$4, %xmm3
-        pslldq	$12, %xmm1
-        pxor	%xmm1, %xmm0
-        movdqa	%xmm0, %xmm1
-        movdqa	%xmm0, %xmm2
-        psrld	$0x01, %xmm1
-        psrld	$2, %xmm2
-        pxor	%xmm2, %xmm1
-        pxor	%xmm0, %xmm1
-        psrld	$7, %xmm0
-        pxor	%xmm3, %xmm1
-        pxor	%xmm0, %xmm1
-        pxor	%xmm1, %xmm6
-        pshufb	L_aes_gcm_bswap_mask, %xmm6
-        movdqu	%xmm7, %xmm0
-        pxor	%xmm6, %xmm0
-        movl	40(%esp), %esi
-        movl	64(%esp), %edi
-        cmpl	$16, 44(%esp)
-        je	L_AES_GCM_decrypt_final_aesni_cmp_tag_16
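-        # Partial tag: constant-time compare, ORing the XOR of each byte
-        # pair into %bl; %bl stays zero only if every byte matches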
-        subl	$16, %esp
-        xorl	%ecx, %ecx
-        xorl	%ebx, %ebx
-        movdqu	%xmm0, (%esp)
-L_AES_GCM_decrypt_final_aesni_cmp_tag_loop:
-        movzbl	(%esp,%ecx,1), %eax
-        xorb	(%esi,%ecx,1), %al
-        orb	%al, %bl
-        incl	%ecx
-        cmpl	44(%esp), %ecx
-        jne	L_AES_GCM_decrypt_final_aesni_cmp_tag_loop
-        cmpb	$0x00, %bl
-        sete	%bl
-        addl	$16, %esp
-        xorl	%ecx, %ecx
-        jmp	L_AES_GCM_decrypt_final_aesni_cmp_tag_done
-L_AES_GCM_decrypt_final_aesni_cmp_tag_16:
-        movdqu	(%esi), %xmm1
-        pcmpeqb	%xmm1, %xmm0
-        pmovmskb	%xmm0, %edx
-        # if %edx == 0xFFFF then return 1, else return 0
-        xorl	%ebx, %ebx
-        cmpl	$0xffff, %edx
-        sete	%bl
-L_AES_GCM_decrypt_final_aesni_cmp_tag_done:
-        movl	%ebx, (%edi)
-        addl	$16, %esp
-        popl	%ebp
-        popl	%edi
-        popl	%esi
-        popl	%ebx
-        ret
-.size	AES_GCM_decrypt_final_aesni,.-AES_GCM_decrypt_final_aesni
-#endif /* WOLFSSL_AESGCM_STREAM */
-#ifdef HAVE_INTEL_AVX1
-.text
-.globl	AES_GCM_encrypt_avx1
-.type	AES_GCM_encrypt_avx1,@function
-.align	16
-AES_GCM_encrypt_avx1:
-        pushl	%ebx
-        pushl	%esi
-        pushl	%edi
-        pushl	%ebp
-        subl	$0x70, %esp
-        movl	144(%esp), %esi
-        movl	168(%esp), %ebp
-        movl	160(%esp), %edx
-        vpxor	%xmm0, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm2, %xmm2
-        cmpl	$12, %edx
-        jne	L_AES_GCM_encrypt_avx1_iv_not_12
-        # Calculate values when IV is 12 bytes
-        # Set counter based on IV
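-        # 0x01000000 is 1 in big-endian: for a 96-bit IV, J0 = IV || 0x00000001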
-        movl	$0x1000000, %ecx
-        vpinsrd	$0x00, (%esi), %xmm0, %xmm0
-        vpinsrd	$0x01, 4(%esi), %xmm0, %xmm0
-        vpinsrd	$2, 8(%esi), %xmm0, %xmm0
-        vpinsrd	$3, %ecx, %xmm0, %xmm0
-        # H = Encrypt X(=0) and T = Encrypt counter
-        vmovdqa	(%ebp), %xmm1
-        vpxor	%xmm1, %xmm0, %xmm5
-        vmovdqa	16(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm1, %xmm1
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vmovdqa	32(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm1, %xmm1
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vmovdqa	48(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm1, %xmm1
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vmovdqa	64(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm1, %xmm1
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vmovdqa	80(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm1, %xmm1
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vmovdqa	96(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm1, %xmm1
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vmovdqa	112(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm1, %xmm1
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vmovdqa	128(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm1, %xmm1
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vmovdqa	144(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm1, %xmm1
-        vaesenc	%xmm3, %xmm5, %xmm5
-        cmpl	$11, 172(%esp)
-        vmovdqa	160(%ebp), %xmm3
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_12_last
-        vaesenc	%xmm3, %xmm1, %xmm1
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vmovdqa	176(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm1, %xmm1
-        vaesenc	%xmm3, %xmm5, %xmm5
-        cmpl	$13, 172(%esp)
-        vmovdqa	192(%ebp), %xmm3
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_12_last
-        vaesenc	%xmm3, %xmm1, %xmm1
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vmovdqa	208(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm1, %xmm1
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vmovdqa	224(%ebp), %xmm3
-L_AES_GCM_encrypt_avx1_calc_iv_12_last:
-        vaesenclast	%xmm3, %xmm1, %xmm1
-        vaesenclast	%xmm3, %xmm5, %xmm5
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm1, %xmm1
-        vmovdqu	%xmm5, 80(%esp)
-        jmp	L_AES_GCM_encrypt_avx1_iv_done
-L_AES_GCM_encrypt_avx1_iv_not_12:
-        # Calculate values when IV is not 12 bytes
-        # H = Encrypt X(=0)
-        vmovdqa	(%ebp), %xmm1
-        vaesenc	16(%ebp), %xmm1, %xmm1
-        vaesenc	32(%ebp), %xmm1, %xmm1
-        vaesenc	48(%ebp), %xmm1, %xmm1
-        vaesenc	64(%ebp), %xmm1, %xmm1
-        vaesenc	80(%ebp), %xmm1, %xmm1
-        vaesenc	96(%ebp), %xmm1, %xmm1
-        vaesenc	112(%ebp), %xmm1, %xmm1
-        vaesenc	128(%ebp), %xmm1, %xmm1
-        vaesenc	144(%ebp), %xmm1, %xmm1
-        cmpl	$11, 172(%esp)
-        vmovdqa	160(%ebp), %xmm5
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last
-        vaesenc	%xmm5, %xmm1, %xmm1
-        vaesenc	176(%ebp), %xmm1, %xmm1
-        cmpl	$13, 172(%esp)
-        vmovdqa	192(%ebp), %xmm5
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last
-        vaesenc	%xmm5, %xmm1, %xmm1
-        vaesenc	208(%ebp), %xmm1, %xmm1
-        vmovdqa	224(%ebp), %xmm5
-L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last:
-        vaesenclast	%xmm5, %xmm1, %xmm1
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm1, %xmm1
-        # Calc counter
-        # Initialization vector
-        cmpl	$0x00, %edx
-        movl	$0x00, %ecx
-        je	L_AES_GCM_encrypt_avx1_calc_iv_done
-        cmpl	$16, %edx
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_encrypt_avx1_calc_iv_16_loop:
-        vmovdqu	(%esi,%ecx,1), %xmm4
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm0, %xmm0
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpshufd	$0x4e, %xmm1, %xmm6
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm7
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vmovdqa	%xmm4, %xmm3
-        vmovdqa	%xmm7, %xmm0
-        vpslldq	$8, %xmm5, %xmm6
-        vpsrldq	$8, %xmm5, %xmm5
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm5, %xmm0, %xmm0
-        vpsrld	$31, %xmm3, %xmm4
-        vpsrld	$31, %xmm0, %xmm5
-        vpslld	$0x01, %xmm3, %xmm3
-        vpslld	$0x01, %xmm0, %xmm0
-        vpsrldq	$12, %xmm4, %xmm6
-        vpslldq	$4, %xmm4, %xmm4
-        vpslldq	$4, %xmm5, %xmm5
-        vpor	%xmm6, %xmm0, %xmm0
-        vpor	%xmm4, %xmm3, %xmm3
-        vpor	%xmm5, %xmm0, %xmm0
-        vpslld	$31, %xmm3, %xmm4
-        vpslld	$30, %xmm3, %xmm5
-        vpslld	$25, %xmm3, %xmm6
-        vpxor	%xmm5, %xmm4, %xmm4
-        vpxor	%xmm6, %xmm4, %xmm4
-        vmovdqa	%xmm4, %xmm5
-        vpsrldq	$4, %xmm5, %xmm5
-        vpslldq	$12, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm3, %xmm3
-        vpsrld	$0x01, %xmm3, %xmm6
-        vpsrld	$2, %xmm3, %xmm7
-        vpsrld	$7, %xmm3, %xmm4
-        vpxor	%xmm7, %xmm6, %xmm6
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpxor	%xmm5, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm6, %xmm0, %xmm0
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_16_loop
-        movl	160(%esp), %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_encrypt_avx1_calc_iv_done
-L_AES_GCM_encrypt_avx1_calc_iv_lt16:
-        subl	$16, %esp
-        vpxor	%xmm4, %xmm4, %xmm4
-        xorl	%ebx, %ebx
-        vmovdqu	%xmm4, (%esp)
-L_AES_GCM_encrypt_avx1_calc_iv_loop:
-        movzbl	(%esi,%ecx,1), %eax
-        movb	%al, (%esp,%ebx,1)
-        incl	%ecx
-        incl	%ebx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_loop
-        vmovdqu	(%esp), %xmm4
-        addl	$16, %esp
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm0, %xmm0
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpshufd	$0x4e, %xmm1, %xmm6
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm7
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vmovdqa	%xmm4, %xmm3
-        vmovdqa	%xmm7, %xmm0
-        vpslldq	$8, %xmm5, %xmm6
-        vpsrldq	$8, %xmm5, %xmm5
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm5, %xmm0, %xmm0
-        vpsrld	$31, %xmm3, %xmm4
-        vpsrld	$31, %xmm0, %xmm5
-        vpslld	$0x01, %xmm3, %xmm3
-        vpslld	$0x01, %xmm0, %xmm0
-        vpsrldq	$12, %xmm4, %xmm6
-        vpslldq	$4, %xmm4, %xmm4
-        vpslldq	$4, %xmm5, %xmm5
-        vpor	%xmm6, %xmm0, %xmm0
-        vpor	%xmm4, %xmm3, %xmm3
-        vpor	%xmm5, %xmm0, %xmm0
-        vpslld	$31, %xmm3, %xmm4
-        vpslld	$30, %xmm3, %xmm5
-        vpslld	$25, %xmm3, %xmm6
-        vpxor	%xmm5, %xmm4, %xmm4
-        vpxor	%xmm6, %xmm4, %xmm4
-        vmovdqa	%xmm4, %xmm5
-        vpsrldq	$4, %xmm5, %xmm5
-        vpslldq	$12, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm3, %xmm3
-        vpsrld	$0x01, %xmm3, %xmm6
-        vpsrld	$2, %xmm3, %xmm7
-        vpsrld	$7, %xmm3, %xmm4
-        vpxor	%xmm7, %xmm6, %xmm6
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpxor	%xmm5, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm6, %xmm0, %xmm0
-L_AES_GCM_encrypt_avx1_calc_iv_done:
-        # T = Encrypt counter
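-        # Non-12-byte IV: fold the IV bit length into GHASH to finish J0,
-        # then AES-encrypt J0 to get the value later XORed with the tag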
-        vpxor	%xmm4, %xmm4, %xmm4
-        shll	$3, %edx
-        vpinsrd	$0x00, %edx, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm0, %xmm0
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpshufd	$0x4e, %xmm1, %xmm6
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm7
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vmovdqa	%xmm4, %xmm3
-        vmovdqa	%xmm7, %xmm0
-        vpslldq	$8, %xmm5, %xmm6
-        vpsrldq	$8, %xmm5, %xmm5
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm5, %xmm0, %xmm0
-        vpsrld	$31, %xmm3, %xmm4
-        vpsrld	$31, %xmm0, %xmm5
-        vpslld	$0x01, %xmm3, %xmm3
-        vpslld	$0x01, %xmm0, %xmm0
-        vpsrldq	$12, %xmm4, %xmm6
-        vpslldq	$4, %xmm4, %xmm4
-        vpslldq	$4, %xmm5, %xmm5
-        vpor	%xmm6, %xmm0, %xmm0
-        vpor	%xmm4, %xmm3, %xmm3
-        vpor	%xmm5, %xmm0, %xmm0
-        vpslld	$31, %xmm3, %xmm4
-        vpslld	$30, %xmm3, %xmm5
-        vpslld	$25, %xmm3, %xmm6
-        vpxor	%xmm5, %xmm4, %xmm4
-        vpxor	%xmm6, %xmm4, %xmm4
-        vmovdqa	%xmm4, %xmm5
-        vpsrldq	$4, %xmm5, %xmm5
-        vpslldq	$12, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm3, %xmm3
-        vpsrld	$0x01, %xmm3, %xmm6
-        vpsrld	$2, %xmm3, %xmm7
-        vpsrld	$7, %xmm3, %xmm4
-        vpxor	%xmm7, %xmm6, %xmm6
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpxor	%xmm5, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm6, %xmm0, %xmm0
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
-        #   Encrypt counter
-        vmovdqa	(%ebp), %xmm4
-        vpxor	%xmm0, %xmm4, %xmm4
-        vaesenc	16(%ebp), %xmm4, %xmm4
-        vaesenc	32(%ebp), %xmm4, %xmm4
-        vaesenc	48(%ebp), %xmm4, %xmm4
-        vaesenc	64(%ebp), %xmm4, %xmm4
-        vaesenc	80(%ebp), %xmm4, %xmm4
-        vaesenc	96(%ebp), %xmm4, %xmm4
-        vaesenc	112(%ebp), %xmm4, %xmm4
-        vaesenc	128(%ebp), %xmm4, %xmm4
-        vaesenc	144(%ebp), %xmm4, %xmm4
-        cmpl	$11, 172(%esp)
-        vmovdqa	160(%ebp), %xmm5
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last
-        vaesenc	%xmm5, %xmm4, %xmm4
-        vaesenc	176(%ebp), %xmm4, %xmm4
-        cmpl	$13, 172(%esp)
-        vmovdqa	192(%ebp), %xmm5
-        jl	L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last
-        vaesenc	%xmm5, %xmm4, %xmm4
-        vaesenc	208(%ebp), %xmm4, %xmm4
-        vmovdqa	224(%ebp), %xmm5
-L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last:
-        vaesenclast	%xmm5, %xmm4, %xmm4
-        vmovdqu	%xmm4, 80(%esp)
-L_AES_GCM_encrypt_avx1_iv_done:
-        movl	140(%esp), %esi
-        # Additional authentication data
-        movl	156(%esp), %edx
-        cmpl	$0x00, %edx
-        je	L_AES_GCM_encrypt_avx1_calc_aad_done
-        xorl	%ecx, %ecx
-        cmpl	$16, %edx
-        jl	L_AES_GCM_encrypt_avx1_calc_aad_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_encrypt_avx1_calc_aad_16_loop:
-        vmovdqu	(%esi,%ecx,1), %xmm4
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm2, %xmm2
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm2, %xmm5
-        vpshufd	$0x4e, %xmm1, %xmm6
-        vpclmulqdq	$0x11, %xmm2, %xmm1, %xmm7
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm4
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vmovdqa	%xmm4, %xmm3
-        vmovdqa	%xmm7, %xmm2
-        vpslldq	$8, %xmm5, %xmm6
-        vpsrldq	$8, %xmm5, %xmm5
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpsrld	$31, %xmm3, %xmm4
-        vpsrld	$31, %xmm2, %xmm5
-        vpslld	$0x01, %xmm3, %xmm3
-        vpslld	$0x01, %xmm2, %xmm2
-        vpsrldq	$12, %xmm4, %xmm6
-        vpslldq	$4, %xmm4, %xmm4
-        vpslldq	$4, %xmm5, %xmm5
-        vpor	%xmm6, %xmm2, %xmm2
-        vpor	%xmm4, %xmm3, %xmm3
-        vpor	%xmm5, %xmm2, %xmm2
-        vpslld	$31, %xmm3, %xmm4
-        vpslld	$30, %xmm3, %xmm5
-        vpslld	$25, %xmm3, %xmm6
-        vpxor	%xmm5, %xmm4, %xmm4
-        vpxor	%xmm6, %xmm4, %xmm4
-        vmovdqa	%xmm4, %xmm5
-        vpsrldq	$4, %xmm5, %xmm5
-        vpslldq	$12, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm3, %xmm3
-        vpsrld	$0x01, %xmm3, %xmm6
-        vpsrld	$2, %xmm3, %xmm7
-        vpsrld	$7, %xmm3, %xmm4
-        vpxor	%xmm7, %xmm6, %xmm6
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpxor	%xmm5, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm6, %xmm2, %xmm2
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_encrypt_avx1_calc_aad_16_loop
-        movl	156(%esp), %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_encrypt_avx1_calc_aad_done
-L_AES_GCM_encrypt_avx1_calc_aad_lt16:
-        subl	$16, %esp
-        vpxor	%xmm4, %xmm4, %xmm4
-        xorl	%ebx, %ebx
-        vmovdqu	%xmm4, (%esp)
-L_AES_GCM_encrypt_avx1_calc_aad_loop:
-        movzbl	(%esi,%ecx,1), %eax
-        movb	%al, (%esp,%ebx,1)
-        incl	%ecx
-        incl	%ebx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_encrypt_avx1_calc_aad_loop
-        vmovdqu	(%esp), %xmm4
-        addl	$16, %esp
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm2, %xmm2
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm2, %xmm5
-        vpshufd	$0x4e, %xmm1, %xmm6
-        vpclmulqdq	$0x11, %xmm2, %xmm1, %xmm7
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm4
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vmovdqa	%xmm4, %xmm3
-        vmovdqa	%xmm7, %xmm2
-        vpslldq	$8, %xmm5, %xmm6
-        vpsrldq	$8, %xmm5, %xmm5
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpsrld	$31, %xmm3, %xmm4
-        vpsrld	$31, %xmm2, %xmm5
-        vpslld	$0x01, %xmm3, %xmm3
-        vpslld	$0x01, %xmm2, %xmm2
-        vpsrldq	$12, %xmm4, %xmm6
-        vpslldq	$4, %xmm4, %xmm4
-        vpslldq	$4, %xmm5, %xmm5
-        vpor	%xmm6, %xmm2, %xmm2
-        vpor	%xmm4, %xmm3, %xmm3
-        vpor	%xmm5, %xmm2, %xmm2
-        vpslld	$31, %xmm3, %xmm4
-        vpslld	$30, %xmm3, %xmm5
-        vpslld	$25, %xmm3, %xmm6
-        vpxor	%xmm5, %xmm4, %xmm4
-        vpxor	%xmm6, %xmm4, %xmm4
-        vmovdqa	%xmm4, %xmm5
-        vpsrldq	$4, %xmm5, %xmm5
-        vpslldq	$12, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm3, %xmm3
-        vpsrld	$0x01, %xmm3, %xmm6
-        vpsrld	$2, %xmm3, %xmm7
-        vpsrld	$7, %xmm3, %xmm4
-        vpxor	%xmm7, %xmm6, %xmm6
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpxor	%xmm5, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm6, %xmm2, %xmm2
-L_AES_GCM_encrypt_avx1_calc_aad_done:
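-        # Save the AAD hash; it seeds the running GHASH state for the ciphertext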
-        vmovdqu	%xmm2, 96(%esp)
-        movl	132(%esp), %esi
-        movl	136(%esp), %edi
-        # Calculate counter and H
-        vpsrlq	$63, %xmm1, %xmm5
-        vpsllq	$0x01, %xmm1, %xmm4
-        vpslldq	$8, %xmm5, %xmm5
-        vpor	%xmm5, %xmm4, %xmm4
-        vpshufd	$0xff, %xmm1, %xmm1
-        vpsrad	$31, %xmm1, %xmm1
-        vpshufb	L_aes_gcm_avx1_bswap_epi64, %xmm0, %xmm0
-        vpand	L_aes_gcm_avx1_mod2_128, %xmm1, %xmm1
-        vpaddd	L_aes_gcm_avx1_one, %xmm0, %xmm0
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	%xmm0, 64(%esp)
-        xorl	%ebx, %ebx
-        cmpl	$0x40, 152(%esp)
-        movl	152(%esp), %eax
-        jl	L_AES_GCM_encrypt_avx1_done_64
-        andl	$0xffffffc0, %eax
-        vmovdqa	%xmm2, %xmm6
-        # H ^ 1
-        vmovdqu	%xmm1, (%esp)
-        # H ^ 2
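-        # Squaring in GF(2^128): cross products cancel, so two vpclmulqdqs
-        # plus a reduction suffice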
-        vpclmulqdq	$0x00, %xmm1, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm1, %xmm1, %xmm0
-        vpslld	$31, %xmm4, %xmm5
-        vpslld	$30, %xmm4, %xmm6
-        vpslld	$25, %xmm4, %xmm7
-        vpxor	%xmm6, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpsrldq	$4, %xmm5, %xmm7
-        vpslldq	$12, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm4, %xmm4
-        vpsrld	$0x01, %xmm4, %xmm5
-        vpsrld	$2, %xmm4, %xmm6
-        vpxor	%xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpsrld	$7, %xmm4, %xmm4
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm0, %xmm0
-        vmovdqu	%xmm0, 16(%esp)
-        # H ^ 3
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm1, %xmm5
-        vpshufd	$0x4e, %xmm0, %xmm6
-        vpclmulqdq	$0x11, %xmm1, %xmm0, %xmm7
-        vpclmulqdq	$0x00, %xmm1, %xmm0, %xmm4
-        vpxor	%xmm1, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpslldq	$8, %xmm5, %xmm6
-        vpsrldq	$8, %xmm5, %xmm5
-        vpxor	%xmm6, %xmm4, %xmm4
-        vpxor	%xmm5, %xmm7, %xmm3
-        vpslld	$31, %xmm4, %xmm5
-        vpslld	$30, %xmm4, %xmm6
-        vpslld	$25, %xmm4, %xmm7
-        vpxor	%xmm6, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpsrldq	$4, %xmm5, %xmm7
-        vpslldq	$12, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm4, %xmm4
-        vpsrld	$0x01, %xmm4, %xmm5
-        vpsrld	$2, %xmm4, %xmm6
-        vpxor	%xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpsrld	$7, %xmm4, %xmm4
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm3, %xmm3
-        vmovdqu	%xmm3, 32(%esp)
-        # H ^ 4
-        vpclmulqdq	$0x00, %xmm0, %xmm0, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm0, %xmm3
-        vpslld	$31, %xmm4, %xmm5
-        vpslld	$30, %xmm4, %xmm6
-        vpslld	$25, %xmm4, %xmm7
-        vpxor	%xmm6, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpsrldq	$4, %xmm5, %xmm7
-        vpslldq	$12, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm4, %xmm4
-        vpsrld	$0x01, %xmm4, %xmm5
-        vpsrld	$2, %xmm4, %xmm6
-        vpxor	%xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpsrld	$7, %xmm4, %xmm4
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm3, %xmm3
-        vmovdqu	%xmm3, 48(%esp)
-        # First 64 bytes of input
-        vmovdqu	64(%esp), %xmm4
-        vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm3
-        vpaddd	L_aes_gcm_avx1_one, %xmm4, %xmm5
-        vpshufb	%xmm3, %xmm5, %xmm5
-        vpaddd	L_aes_gcm_avx1_two, %xmm4, %xmm6
-        vpshufb	%xmm3, %xmm6, %xmm6
-        vpaddd	L_aes_gcm_avx1_three, %xmm4, %xmm7
-        vpshufb	%xmm3, %xmm7, %xmm7
-        vpshufb	%xmm3, %xmm4, %xmm4
-        vmovdqu	64(%esp), %xmm3
-        vpaddd	L_aes_gcm_avx1_four, %xmm3, %xmm3
-        vmovdqu	%xmm3, 64(%esp)
-        vmovdqa	(%ebp), %xmm3
-        vpxor	%xmm3, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm7, %xmm7
-        vmovdqa	16(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	32(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	48(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	64(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	80(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	96(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	112(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	128(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	144(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        cmpl	$11, 172(%esp)
-        vmovdqa	160(%ebp), %xmm3
-        jl	L_AES_GCM_encrypt_avx1_aesenc_64_enc_done
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	176(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        cmpl	$13, 172(%esp)
-        vmovdqa	192(%ebp), %xmm3
-        jl	L_AES_GCM_encrypt_avx1_aesenc_64_enc_done
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	208(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	224(%ebp), %xmm3
-L_AES_GCM_encrypt_avx1_aesenc_64_enc_done:
-        vaesenclast	%xmm3, %xmm4, %xmm4
-        vaesenclast	%xmm3, %xmm5, %xmm5
-        vmovdqu	(%esi), %xmm0
-        vmovdqu	16(%esi), %xmm1
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpxor	%xmm1, %xmm5, %xmm5
-        vmovdqu	%xmm0, (%esi)
-        vmovdqu	%xmm1, 16(%esi)
-        vmovdqu	%xmm4, (%edi)
-        vmovdqu	%xmm5, 16(%edi)
-        vaesenclast	%xmm3, %xmm6, %xmm6
-        vaesenclast	%xmm3, %xmm7, %xmm7
-        vmovdqu	32(%esi), %xmm0
-        vmovdqu	48(%esi), %xmm1
-        vpxor	%xmm0, %xmm6, %xmm6
-        vpxor	%xmm1, %xmm7, %xmm7
-        vmovdqu	%xmm0, 32(%esi)
-        vmovdqu	%xmm1, 48(%esi)
-        vmovdqu	%xmm6, 32(%edi)
-        vmovdqu	%xmm7, 48(%edi)
-        cmpl	$0x40, %eax
-        movl	$0x40, %ebx
-        movl	%esi, %ecx
-        movl	%edi, %edx
-        jle	L_AES_GCM_encrypt_avx1_end_64
-        # Process the remaining 64-byte blocks of input
-L_AES_GCM_encrypt_avx1_ghash_64:
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        vmovdqu	64(%esp), %xmm4
-        vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm3
-        vpaddd	L_aes_gcm_avx1_one, %xmm4, %xmm5
-        vpshufb	%xmm3, %xmm5, %xmm5
-        vpaddd	L_aes_gcm_avx1_two, %xmm4, %xmm6
-        vpshufb	%xmm3, %xmm6, %xmm6
-        vpaddd	L_aes_gcm_avx1_three, %xmm4, %xmm7
-        vpshufb	%xmm3, %xmm7, %xmm7
-        vpshufb	%xmm3, %xmm4, %xmm4
-        vmovdqu	64(%esp), %xmm3
-        vpaddd	L_aes_gcm_avx1_four, %xmm3, %xmm3
-        vmovdqu	%xmm3, 64(%esp)
-        vmovdqa	(%ebp), %xmm3
-        vpxor	%xmm3, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm7, %xmm7
-        vmovdqa	16(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	32(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	48(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	64(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	80(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	96(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	112(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	128(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	144(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        cmpl	$11, 172(%esp)
-        vmovdqa	160(%ebp), %xmm3
-        jl	L_AES_GCM_encrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	176(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        cmpl	$13, 172(%esp)
-        vmovdqa	192(%ebp), %xmm3
-        jl	L_AES_GCM_encrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	208(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	224(%ebp), %xmm3
-L_AES_GCM_encrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done:
-        vaesenclast	%xmm3, %xmm4, %xmm4
-        vaesenclast	%xmm3, %xmm5, %xmm5
-        vmovdqu	(%ecx), %xmm0
-        vmovdqu	16(%ecx), %xmm1
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpxor	%xmm1, %xmm5, %xmm5
-        vmovdqu	%xmm4, (%edx)
-        vmovdqu	%xmm5, 16(%edx)
-        vaesenclast	%xmm3, %xmm6, %xmm6
-        vaesenclast	%xmm3, %xmm7, %xmm7
-        vmovdqu	32(%ecx), %xmm0
-        vmovdqu	48(%ecx), %xmm1
-        vpxor	%xmm0, %xmm6, %xmm6
-        vpxor	%xmm1, %xmm7, %xmm7
-        vmovdqu	%xmm6, 32(%edx)
-        vmovdqu	%xmm7, 48(%edx)
-        # ghash encrypted counter
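-        # Multiply the four prior ciphertext blocks by H^4..H^1 (stack
-        # slots 48,32,16,0(%esp)), one Karatsuba multiply each, then do a
-        # single reduction for all four.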
-        vmovdqu	96(%esp), %xmm6
-        vmovdqu	48(%esp), %xmm3
-        vmovdqu	-64(%edx), %xmm4
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
-        vpxor	%xmm6, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm3, %xmm5
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpclmulqdq	$0x11, %xmm3, %xmm4, %xmm7
-        vpclmulqdq	$0x00, %xmm3, %xmm4, %xmm6
-        vpclmulqdq	$0x00, %xmm1, %xmm5, %xmm5
-        vpxor	%xmm6, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vmovdqu	32(%esp), %xmm3
-        vmovdqu	-48(%edx), %xmm4
-        vpshufd	$0x4e, %xmm3, %xmm0
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm0, %xmm0
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpclmulqdq	$0x11, %xmm3, %xmm4, %xmm2
-        vpclmulqdq	$0x00, %xmm3, %xmm4, %xmm3
-        vpclmulqdq	$0x00, %xmm1, %xmm0, %xmm0
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm2, %xmm7, %xmm7
-        vpxor	%xmm0, %xmm5, %xmm5
-        vmovdqu	16(%esp), %xmm3
-        vmovdqu	-32(%edx), %xmm4
-        vpshufd	$0x4e, %xmm3, %xmm0
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm0, %xmm0
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpclmulqdq	$0x11, %xmm3, %xmm4, %xmm2
-        vpclmulqdq	$0x00, %xmm3, %xmm4, %xmm3
-        vpclmulqdq	$0x00, %xmm1, %xmm0, %xmm0
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm2, %xmm7, %xmm7
-        vpxor	%xmm0, %xmm5, %xmm5
-        vmovdqu	(%esp), %xmm3
-        vmovdqu	-16(%edx), %xmm4
-        vpshufd	$0x4e, %xmm3, %xmm0
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm0, %xmm0
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpclmulqdq	$0x11, %xmm3, %xmm4, %xmm2
-        vpclmulqdq	$0x00, %xmm3, %xmm4, %xmm3
-        vpclmulqdq	$0x00, %xmm1, %xmm0, %xmm0
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm2, %xmm7, %xmm7
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpslldq	$8, %xmm5, %xmm1
-        vpsrldq	$8, %xmm5, %xmm5
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpxor	%xmm5, %xmm7, %xmm7
-        vpslld	$31, %xmm6, %xmm3
-        vpslld	$30, %xmm6, %xmm0
-        vpslld	$25, %xmm6, %xmm1
-        vpxor	%xmm0, %xmm3, %xmm3
-        vpxor	%xmm1, %xmm3, %xmm3
-        vpsrldq	$4, %xmm3, %xmm0
-        vpslldq	$12, %xmm3, %xmm3
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpsrld	$0x01, %xmm6, %xmm1
-        vpsrld	$2, %xmm6, %xmm5
-        vpsrld	$7, %xmm6, %xmm4
-        vpxor	%xmm5, %xmm1, %xmm1
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpxor	%xmm7, %xmm6, %xmm6
-        vmovdqu	%xmm6, 96(%esp)
-        addl	$0x40, %ebx
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_encrypt_avx1_ghash_64
-L_AES_GCM_encrypt_avx1_end_64:
-        vmovdqu	96(%esp), %xmm2
-        # Block 1
-        vmovdqa	L_aes_gcm_avx1_bswap_mask, %xmm4
-        vmovdqa	(%edx), %xmm1
-        vpshufb	%xmm4, %xmm1, %xmm1
-        vmovdqu	48(%esp), %xmm3
-        vpxor	%xmm2, %xmm1, %xmm1
-        # ghash_gfmul_avx
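-        # Karatsuba carry-less multiply: hi = a1*b1, lo = a0*b0,
-        # mid = (a0^a1)*(b0^b1) ^ hi ^ lo, with mid split across the two
-        # halves of the 256-bit product.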
-        vpshufd	$0x4e, %xmm1, %xmm5
-        vpshufd	$0x4e, %xmm3, %xmm6
-        vpclmulqdq	$0x11, %xmm1, %xmm3, %xmm7
-        vpclmulqdq	$0x00, %xmm1, %xmm3, %xmm4
-        vpxor	%xmm1, %xmm5, %xmm5
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vmovdqa	%xmm4, %xmm0
-        vmovdqa	%xmm7, %xmm2
-        vpslldq	$8, %xmm5, %xmm6
-        vpsrldq	$8, %xmm5, %xmm5
-        vpxor	%xmm6, %xmm0, %xmm0
-        vpxor	%xmm5, %xmm2, %xmm2
-        # Block 2
-        vmovdqa	L_aes_gcm_avx1_bswap_mask, %xmm4
-        vmovdqa	16(%edx), %xmm1
-        vpshufb	%xmm4, %xmm1, %xmm1
-        vmovdqu	32(%esp), %xmm3
-        # ghash_gfmul_xor_avx
-        vpshufd	$0x4e, %xmm1, %xmm5
-        vpshufd	$0x4e, %xmm3, %xmm6
-        vpclmulqdq	$0x11, %xmm1, %xmm3, %xmm7
-        vpclmulqdq	$0x00, %xmm1, %xmm3, %xmm4
-        vpxor	%xmm1, %xmm5, %xmm5
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm0, %xmm0
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpslldq	$8, %xmm5, %xmm6
-        vpsrldq	$8, %xmm5, %xmm5
-        vpxor	%xmm6, %xmm0, %xmm0
-        vpxor	%xmm5, %xmm2, %xmm2
-        # Block 3
-        vmovdqa	L_aes_gcm_avx1_bswap_mask, %xmm4
-        vmovdqa	32(%edx), %xmm1
-        vpshufb	%xmm4, %xmm1, %xmm1
-        vmovdqu	16(%esp), %xmm3
-        # ghash_gfmul_xor_avx
-        vpshufd	$0x4e, %xmm1, %xmm5
-        vpshufd	$0x4e, %xmm3, %xmm6
-        vpclmulqdq	$0x11, %xmm1, %xmm3, %xmm7
-        vpclmulqdq	$0x00, %xmm1, %xmm3, %xmm4
-        vpxor	%xmm1, %xmm5, %xmm5
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm0, %xmm0
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpslldq	$8, %xmm5, %xmm6
-        vpsrldq	$8, %xmm5, %xmm5
-        vpxor	%xmm6, %xmm0, %xmm0
-        vpxor	%xmm5, %xmm2, %xmm2
-        # Block 4
-        vmovdqa	L_aes_gcm_avx1_bswap_mask, %xmm4
-        vmovdqa	48(%edx), %xmm1
-        vpshufb	%xmm4, %xmm1, %xmm1
-        vmovdqu	(%esp), %xmm3
-        # ghash_gfmul_xor_avx
-        vpshufd	$0x4e, %xmm1, %xmm5
-        vpshufd	$0x4e, %xmm3, %xmm6
-        vpclmulqdq	$0x11, %xmm1, %xmm3, %xmm7
-        vpclmulqdq	$0x00, %xmm1, %xmm3, %xmm4
-        vpxor	%xmm1, %xmm5, %xmm5
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm0, %xmm0
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpslldq	$8, %xmm5, %xmm6
-        vpsrldq	$8, %xmm5, %xmm5
-        vpxor	%xmm6, %xmm0, %xmm0
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpslld	$31, %xmm0, %xmm4
-        vpslld	$30, %xmm0, %xmm5
-        vpslld	$25, %xmm0, %xmm6
-        vpxor	%xmm5, %xmm4, %xmm4
-        vpxor	%xmm6, %xmm4, %xmm4
-        vmovdqa	%xmm4, %xmm5
-        vpsrldq	$4, %xmm5, %xmm5
-        vpslldq	$12, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm0, %xmm0
-        vpsrld	$0x01, %xmm0, %xmm6
-        vpsrld	$2, %xmm0, %xmm7
-        vpsrld	$7, %xmm0, %xmm4
-        vpxor	%xmm7, %xmm6, %xmm6
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpxor	%xmm5, %xmm6, %xmm6
-        vpxor	%xmm0, %xmm6, %xmm6
-        vpxor	%xmm6, %xmm2, %xmm2
-        vmovdqu	(%esp), %xmm1
-L_AES_GCM_encrypt_avx1_done_64:
-        movl	152(%esp), %edx
-        cmpl	%edx, %ebx
-        jge	L_AES_GCM_encrypt_avx1_done_enc
-        movl	152(%esp), %eax
-        andl	$0xfffffff0, %eax
-        cmpl	%eax, %ebx
-        jge	L_AES_GCM_encrypt_avx1_last_block_done
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        vmovdqu	64(%esp), %xmm5
-        vpshufb	L_aes_gcm_avx1_bswap_epi64, %xmm5, %xmm4
-        vpaddd	L_aes_gcm_avx1_one, %xmm5, %xmm5
-        vmovdqu	%xmm5, 64(%esp)
-        vpxor	(%ebp), %xmm4, %xmm4
-        vaesenc	16(%ebp), %xmm4, %xmm4
-        vaesenc	32(%ebp), %xmm4, %xmm4
-        vaesenc	48(%ebp), %xmm4, %xmm4
-        vaesenc	64(%ebp), %xmm4, %xmm4
-        vaesenc	80(%ebp), %xmm4, %xmm4
-        vaesenc	96(%ebp), %xmm4, %xmm4
-        vaesenc	112(%ebp), %xmm4, %xmm4
-        vaesenc	128(%ebp), %xmm4, %xmm4
-        vaesenc	144(%ebp), %xmm4, %xmm4
-        cmpl	$11, 172(%esp)
-        vmovdqa	160(%ebp), %xmm5
-        jl	L_AES_GCM_encrypt_avx1_aesenc_block_aesenc_avx_last
-        vaesenc	%xmm5, %xmm4, %xmm4
-        vaesenc	176(%ebp), %xmm4, %xmm4
-        cmpl	$13, 172(%esp)
-        vmovdqa	192(%ebp), %xmm5
-        jl	L_AES_GCM_encrypt_avx1_aesenc_block_aesenc_avx_last
-        vaesenc	%xmm5, %xmm4, %xmm4
-        vaesenc	208(%ebp), %xmm4, %xmm4
-        vmovdqa	224(%ebp), %xmm5
-L_AES_GCM_encrypt_avx1_aesenc_block_aesenc_avx_last:
-        vaesenclast	%xmm5, %xmm4, %xmm4
-        vmovdqu	(%ecx), %xmm5
-        vpxor	%xmm5, %xmm4, %xmm4
-        vmovdqu	%xmm4, (%edx)
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm2, %xmm2
-        addl	$16, %ebx
-        cmpl	%eax, %ebx
-        jge	L_AES_GCM_encrypt_avx1_last_block_ghash
-L_AES_GCM_encrypt_avx1_last_block_start:
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        vmovdqu	64(%esp), %xmm5
-        vmovdqu	%xmm2, %xmm7
-        vpshufb	L_aes_gcm_avx1_bswap_epi64, %xmm5, %xmm4
-        vpaddd	L_aes_gcm_avx1_one, %xmm5, %xmm5
-        vmovdqu	%xmm5, 64(%esp)
-        vpxor	(%ebp), %xmm4, %xmm4
-        vpclmulqdq	$16, %xmm1, %xmm7, %xmm0
-        vaesenc	16(%ebp), %xmm4, %xmm4
-        vaesenc	32(%ebp), %xmm4, %xmm4
-        vpclmulqdq	$0x01, %xmm1, %xmm7, %xmm3
-        vaesenc	48(%ebp), %xmm4, %xmm4
-        vaesenc	64(%ebp), %xmm4, %xmm4
-        vaesenc	80(%ebp), %xmm4, %xmm4
-        vpclmulqdq	$0x11, %xmm1, %xmm7, %xmm5
-        vaesenc	96(%ebp), %xmm4, %xmm4
-        vpxor	%xmm3, %xmm0, %xmm0
-        vpslldq	$8, %xmm0, %xmm6
-        vpsrldq	$8, %xmm0, %xmm0
-        vaesenc	112(%ebp), %xmm4, %xmm4
-        vpclmulqdq	$0x00, %xmm1, %xmm7, %xmm3
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm0, %xmm5, %xmm5
-        vmovdqa	L_aes_gcm_avx1_mod2_128, %xmm7
-        vpclmulqdq	$16, %xmm7, %xmm6, %xmm3
-        vaesenc	128(%ebp), %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm6, %xmm0
-        vpxor	%xmm3, %xmm0, %xmm0
-        vpclmulqdq	$16, %xmm7, %xmm0, %xmm3
-        vaesenc	144(%ebp), %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm2
-        vpxor	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm5, %xmm2, %xmm2
-        cmpl	$11, 172(%esp)
-        vmovdqa	160(%ebp), %xmm5
-        jl	L_AES_GCM_encrypt_avx1_aesenc_gfmul_last
-        vaesenc	%xmm5, %xmm4, %xmm4
-        vaesenc	176(%ebp), %xmm4, %xmm4
-        cmpl	$13, 172(%esp)
-        vmovdqa	192(%ebp), %xmm5
-        jl	L_AES_GCM_encrypt_avx1_aesenc_gfmul_last
-        vaesenc	%xmm5, %xmm4, %xmm4
-        vaesenc	208(%ebp), %xmm4, %xmm4
-        vmovdqa	224(%ebp), %xmm5
-L_AES_GCM_encrypt_avx1_aesenc_gfmul_last:
-        vaesenclast	%xmm5, %xmm4, %xmm4
-        vmovdqu	(%ecx), %xmm5
-        vpxor	%xmm5, %xmm4, %xmm4
-        vmovdqu	%xmm4, (%edx)
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
-        addl	$16, %ebx
-        vpxor	%xmm4, %xmm2, %xmm2
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_encrypt_avx1_last_block_start
-L_AES_GCM_encrypt_avx1_last_block_ghash:
-        # ghash_gfmul_red_avx
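-        # Multiply by H and reduce modulo x^128 + x^7 + x^2 + x + 1: the
-        # 31/30/25 left shifts and 1/2/7 right shifts are that polynomial
-        # in the bit-reflected representation GHASH uses.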
-        vpshufd	$0x4e, %xmm1, %xmm5
-        vpshufd	$0x4e, %xmm2, %xmm6
-        vpclmulqdq	$0x11, %xmm1, %xmm2, %xmm7
-        vpclmulqdq	$0x00, %xmm1, %xmm2, %xmm4
-        vpxor	%xmm1, %xmm5, %xmm5
-        vpxor	%xmm2, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpslldq	$8, %xmm5, %xmm6
-        vpsrldq	$8, %xmm5, %xmm5
-        vpxor	%xmm6, %xmm4, %xmm4
-        vpxor	%xmm5, %xmm7, %xmm2
-        vpslld	$31, %xmm4, %xmm5
-        vpslld	$30, %xmm4, %xmm6
-        vpslld	$25, %xmm4, %xmm7
-        vpxor	%xmm6, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpsrldq	$4, %xmm5, %xmm7
-        vpslldq	$12, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm4, %xmm4
-        vpsrld	$0x01, %xmm4, %xmm5
-        vpsrld	$2, %xmm4, %xmm6
-        vpxor	%xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpsrld	$7, %xmm4, %xmm4
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm2, %xmm2
-L_AES_GCM_encrypt_avx1_last_block_done:
-        movl	152(%esp), %ecx
-        movl	%ecx, %edx
-        andl	$15, %ecx
-        jz	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_done
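-        # 1-15 trailing bytes: encrypt one more counter block, XOR byte by
-        # byte, and keep the ciphertext bytes in a zero-padded stack block
-        # so the final GHASH sees the padded last block.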
-        vmovdqu	64(%esp), %xmm0
-        vpshufb	L_aes_gcm_avx1_bswap_epi64, %xmm0, %xmm0
-        vpxor	(%ebp), %xmm0, %xmm0
-        vaesenc	16(%ebp), %xmm0, %xmm0
-        vaesenc	32(%ebp), %xmm0, %xmm0
-        vaesenc	48(%ebp), %xmm0, %xmm0
-        vaesenc	64(%ebp), %xmm0, %xmm0
-        vaesenc	80(%ebp), %xmm0, %xmm0
-        vaesenc	96(%ebp), %xmm0, %xmm0
-        vaesenc	112(%ebp), %xmm0, %xmm0
-        vaesenc	128(%ebp), %xmm0, %xmm0
-        vaesenc	144(%ebp), %xmm0, %xmm0
-        cmpl	$11, 172(%esp)
-        vmovdqa	160(%ebp), %xmm5
-        jl	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last
-        vaesenc	%xmm5, %xmm0, %xmm0
-        vaesenc	176(%ebp), %xmm0, %xmm0
-        cmpl	$13, 172(%esp)
-        vmovdqa	192(%ebp), %xmm5
-        jl	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last
-        vaesenc	%xmm5, %xmm0, %xmm0
-        vaesenc	208(%ebp), %xmm0, %xmm0
-        vmovdqa	224(%ebp), %xmm5
-L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last:
-        vaesenclast	%xmm5, %xmm0, %xmm0
-        subl	$16, %esp
-        xorl	%ecx, %ecx
-        vmovdqu	%xmm0, (%esp)
-L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_loop:
-        movzbl	(%esi,%ebx,1), %eax
-        xorb	(%esp,%ecx,1), %al
-        movb	%al, (%edi,%ebx,1)
-        movb	%al, (%esp,%ecx,1)
-        incl	%ebx
-        incl	%ecx
-        cmpl	%edx, %ebx
-        jl	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_loop
-        xorl	%eax, %eax
-        cmpl	$16, %ecx
-        je	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_finish_enc
-L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_byte_loop:
-        movb	%al, (%esp,%ecx,1)
-        incl	%ecx
-        cmpl	$16, %ecx
-        jl	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_byte_loop
-L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_finish_enc:
-        vmovdqu	(%esp), %xmm0
-        addl	$16, %esp
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm2, %xmm2
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm1, %xmm5
-        vpshufd	$0x4e, %xmm2, %xmm6
-        vpclmulqdq	$0x11, %xmm1, %xmm2, %xmm7
-        vpclmulqdq	$0x00, %xmm1, %xmm2, %xmm4
-        vpxor	%xmm1, %xmm5, %xmm5
-        vpxor	%xmm2, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpslldq	$8, %xmm5, %xmm6
-        vpsrldq	$8, %xmm5, %xmm5
-        vpxor	%xmm6, %xmm4, %xmm4
-        vpxor	%xmm5, %xmm7, %xmm2
-        vpslld	$31, %xmm4, %xmm5
-        vpslld	$30, %xmm4, %xmm6
-        vpslld	$25, %xmm4, %xmm7
-        vpxor	%xmm6, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpsrldq	$4, %xmm5, %xmm7
-        vpslldq	$12, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm4, %xmm4
-        vpsrld	$0x01, %xmm4, %xmm5
-        vpsrld	$2, %xmm4, %xmm6
-        vpxor	%xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpsrld	$7, %xmm4, %xmm4
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm2, %xmm2
-L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_done:
-L_AES_GCM_encrypt_avx1_done_enc:
-        movl	148(%esp), %edi
-        movl	164(%esp), %ebx
-        movl	152(%esp), %edx
-        movl	156(%esp), %ecx
-        shll	$3, %edx
-        shll	$3, %ecx
-        vpinsrd	$0x00, %edx, %xmm4, %xmm4
-        vpinsrd	$2, %ecx, %xmm4, %xmm4
-        movl	152(%esp), %edx
-        movl	156(%esp), %ecx
-        shrl	$29, %edx
-        shrl	$29, %ecx
-        vpinsrd	$0x01, %edx, %xmm4, %xmm4
-        vpinsrd	$3, %ecx, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm2, %xmm2
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm1, %xmm5
-        vpshufd	$0x4e, %xmm2, %xmm6
-        vpclmulqdq	$0x11, %xmm1, %xmm2, %xmm7
-        vpclmulqdq	$0x00, %xmm1, %xmm2, %xmm4
-        vpxor	%xmm1, %xmm5, %xmm5
-        vpxor	%xmm2, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpslldq	$8, %xmm5, %xmm6
-        vpsrldq	$8, %xmm5, %xmm5
-        vpxor	%xmm6, %xmm4, %xmm4
-        vpxor	%xmm5, %xmm7, %xmm2
-        vpslld	$31, %xmm4, %xmm5
-        vpslld	$30, %xmm4, %xmm6
-        vpslld	$25, %xmm4, %xmm7
-        vpxor	%xmm6, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpsrldq	$4, %xmm5, %xmm7
-        vpslldq	$12, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm4, %xmm4
-        vpsrld	$0x01, %xmm4, %xmm5
-        vpsrld	$2, %xmm4, %xmm6
-        vpxor	%xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpsrld	$7, %xmm4, %xmm4
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm2, %xmm2
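-        # Form the tag: swap the GHASH result back to byte order, mask it
-        # with E_K(J0) saved at 80(%esp), and emit the requested tag length.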
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm2, %xmm2
-        vpxor	80(%esp), %xmm2, %xmm4
-        cmpl	$16, %ebx
-        je	L_AES_GCM_encrypt_avx1_store_tag_16
-        xorl	%ecx, %ecx
-        vmovdqu	%xmm4, (%esp)
-L_AES_GCM_encrypt_avx1_store_tag_loop:
-        movzbl	(%esp,%ecx,1), %eax
-        movb	%al, (%edi,%ecx,1)
-        incl	%ecx
-        cmpl	%ebx, %ecx
-        jne	L_AES_GCM_encrypt_avx1_store_tag_loop
-        jmp	L_AES_GCM_encrypt_avx1_store_tag_done
-L_AES_GCM_encrypt_avx1_store_tag_16:
-        vmovdqu	%xmm4, (%edi)
-L_AES_GCM_encrypt_avx1_store_tag_done:
-        addl	$0x70, %esp
-        popl	%ebp
-        popl	%edi
-        popl	%esi
-        popl	%ebx
-        ret
-.size	AES_GCM_encrypt_avx1,.-AES_GCM_encrypt_avx1
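For orientation: AES_GCM_encrypt_avx1 above and AES_GCM_decrypt_avx1 below are the 32-bit AESNI/AVX1 kernels wolfSSL selects internally when the CPU supports them; applications reach them through the public wolfCrypt API. A minimal sketch of the equivalent one-shot call, assuming a build with AES-GCM enabled (the all-zero key/IV, buffer sizes, and the function name gcm_encrypt_sketch are illustrative only):

    #include <wolfssl/wolfcrypt/settings.h>
    #include <wolfssl/wolfcrypt/aes.h>

    /* Illustrative sketch: one-shot AES-128-GCM encrypt, i.e. the CTR
     * keystream plus GHASH tag computation the assembly above performs. */
    static int gcm_encrypt_sketch(void)
    {
        Aes  aes;
        byte key[16]   = {0};  /* illustrative all-zero key          */
        byte iv[12]    = {0};  /* 12-byte IV: takes the fast J0 path */
        byte aad[20]   = {0};  /* authenticated but not encrypted    */
        byte plain[64] = {0};
        byte cipher[64];
        byte tag[16];          /* GHASH result masked with E_K(J0)   */
        int  ret;

        ret = wc_AesInit(&aes, NULL, INVALID_DEVID);
        if (ret != 0)
            return ret;
        ret = wc_AesGcmSetKey(&aes, key, sizeof(key));
        if (ret == 0)
            ret = wc_AesGcmEncrypt(&aes, cipher, plain, sizeof(plain),
                                   iv, sizeof(iv), tag, sizeof(tag),
                                   aad, sizeof(aad));
        wc_AesFree(&aes);
        return ret;
    }

Decryption mirrors this through wc_AesGcmDecrypt(), which additionally fails with an error code when the recomputed tag does not match the one supplied.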
-.text
-.globl	AES_GCM_decrypt_avx1
-.type	AES_GCM_decrypt_avx1,@function
-.align	16
-AES_GCM_decrypt_avx1:
-        pushl	%ebx
-        pushl	%esi
-        pushl	%edi
-        pushl	%ebp
-        subl	$0xb0, %esp
-        movl	208(%esp), %esi
-        movl	232(%esp), %ebp
-        movl	224(%esp), %edx
-        vpxor	%xmm0, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm2, %xmm2
-        cmpl	$12, %edx
-        jne	L_AES_GCM_decrypt_avx1_iv_not_12
-        # Calculate values when IV is 12 bytes
-        # Set counter based on IV
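-        # J0 = IV || 0x00000001; 0x1000000 below is that trailing
-        # big-endian one written as a little-endian dword into the top lane.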
-        movl	$0x1000000, %ecx
-        vpinsrd	$0x00, (%esi), %xmm0, %xmm0
-        vpinsrd	$0x01, 4(%esi), %xmm0, %xmm0
-        vpinsrd	$2, 8(%esi), %xmm0, %xmm0
-        vpinsrd	$3, %ecx, %xmm0, %xmm0
-        # H = Encrypt X(=0) and T = Encrypt counter
-        vmovdqa	(%ebp), %xmm1
-        vpxor	%xmm1, %xmm0, %xmm5
-        vmovdqa	16(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm1, %xmm1
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vmovdqa	32(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm1, %xmm1
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vmovdqa	48(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm1, %xmm1
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vmovdqa	64(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm1, %xmm1
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vmovdqa	80(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm1, %xmm1
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vmovdqa	96(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm1, %xmm1
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vmovdqa	112(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm1, %xmm1
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vmovdqa	128(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm1, %xmm1
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vmovdqa	144(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm1, %xmm1
-        vaesenc	%xmm3, %xmm5, %xmm5
-        cmpl	$11, 236(%esp)
-        vmovdqa	160(%ebp), %xmm3
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_12_last
-        vaesenc	%xmm3, %xmm1, %xmm1
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vmovdqa	176(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm1, %xmm1
-        vaesenc	%xmm3, %xmm5, %xmm5
-        cmpl	$13, 236(%esp)
-        vmovdqa	192(%ebp), %xmm3
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_12_last
-        vaesenc	%xmm3, %xmm1, %xmm1
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vmovdqa	208(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm1, %xmm1
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vmovdqa	224(%ebp), %xmm3
-L_AES_GCM_decrypt_avx1_calc_iv_12_last:
-        vaesenclast	%xmm3, %xmm1, %xmm1
-        vaesenclast	%xmm3, %xmm5, %xmm5
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm1, %xmm1
-        vmovdqu	%xmm5, 80(%esp)
-        jmp	L_AES_GCM_decrypt_avx1_iv_done
-L_AES_GCM_decrypt_avx1_iv_not_12:
-        # Calculate values when IV is not 12 bytes
-        # H = Encrypt X(=0)
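-        # The hash key H is the AES encryption of the all-zero block,
-        # byte-swapped after the final round into GHASH bit order.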
-        vmovdqa	(%ebp), %xmm1
-        vaesenc	16(%ebp), %xmm1, %xmm1
-        vaesenc	32(%ebp), %xmm1, %xmm1
-        vaesenc	48(%ebp), %xmm1, %xmm1
-        vaesenc	64(%ebp), %xmm1, %xmm1
-        vaesenc	80(%ebp), %xmm1, %xmm1
-        vaesenc	96(%ebp), %xmm1, %xmm1
-        vaesenc	112(%ebp), %xmm1, %xmm1
-        vaesenc	128(%ebp), %xmm1, %xmm1
-        vaesenc	144(%ebp), %xmm1, %xmm1
-        cmpl	$11, 236(%esp)
-        vmovdqa	160(%ebp), %xmm5
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last
-        vaesenc	%xmm5, %xmm1, %xmm1
-        vaesenc	176(%ebp), %xmm1, %xmm1
-        cmpl	$13, 236(%esp)
-        vmovdqa	192(%ebp), %xmm5
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last
-        vaesenc	%xmm5, %xmm1, %xmm1
-        vaesenc	208(%ebp), %xmm1, %xmm1
-        vmovdqa	224(%ebp), %xmm5
-L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last:
-        vaesenclast	%xmm5, %xmm1, %xmm1
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm1, %xmm1
-        # Calc counter
-        # Initialization vector
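-        # IV of any other length: J0 = GHASH_H(IV zero-padded to a 16-byte
-        # boundary, with the 64-bit IV bit length appended), per SP 800-38D.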
-        cmpl	$0x00, %edx
-        movl	$0x00, %ecx
-        je	L_AES_GCM_decrypt_avx1_calc_iv_done
-        cmpl	$16, %edx
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_decrypt_avx1_calc_iv_16_loop:
-        vmovdqu	(%esi,%ecx,1), %xmm4
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm0, %xmm0
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpshufd	$0x4e, %xmm1, %xmm6
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm7
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vmovdqa	%xmm4, %xmm3
-        vmovdqa	%xmm7, %xmm0
-        vpslldq	$8, %xmm5, %xmm6
-        vpsrldq	$8, %xmm5, %xmm5
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm5, %xmm0, %xmm0
-        vpsrld	$31, %xmm3, %xmm4
-        vpsrld	$31, %xmm0, %xmm5
-        vpslld	$0x01, %xmm3, %xmm3
-        vpslld	$0x01, %xmm0, %xmm0
-        vpsrldq	$12, %xmm4, %xmm6
-        vpslldq	$4, %xmm4, %xmm4
-        vpslldq	$4, %xmm5, %xmm5
-        vpor	%xmm6, %xmm0, %xmm0
-        vpor	%xmm4, %xmm3, %xmm3
-        vpor	%xmm5, %xmm0, %xmm0
-        vpslld	$31, %xmm3, %xmm4
-        vpslld	$30, %xmm3, %xmm5
-        vpslld	$25, %xmm3, %xmm6
-        vpxor	%xmm5, %xmm4, %xmm4
-        vpxor	%xmm6, %xmm4, %xmm4
-        vmovdqa	%xmm4, %xmm5
-        vpsrldq	$4, %xmm5, %xmm5
-        vpslldq	$12, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm3, %xmm3
-        vpsrld	$0x01, %xmm3, %xmm6
-        vpsrld	$2, %xmm3, %xmm7
-        vpsrld	$7, %xmm3, %xmm4
-        vpxor	%xmm7, %xmm6, %xmm6
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpxor	%xmm5, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm6, %xmm0, %xmm0
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_16_loop
-        movl	224(%esp), %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_decrypt_avx1_calc_iv_done
-L_AES_GCM_decrypt_avx1_calc_iv_lt16:
-        subl	$16, %esp
-        vpxor	%xmm4, %xmm4, %xmm4
-        xorl	%ebx, %ebx
-        vmovdqu	%xmm4, (%esp)
-L_AES_GCM_decrypt_avx1_calc_iv_loop:
-        movzbl	(%esi,%ecx,1), %eax
-        movb	%al, (%esp,%ebx,1)
-        incl	%ecx
-        incl	%ebx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_loop
-        vmovdqu	(%esp), %xmm4
-        addl	$16, %esp
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm0, %xmm0
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpshufd	$0x4e, %xmm1, %xmm6
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm7
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vmovdqa	%xmm4, %xmm3
-        vmovdqa	%xmm7, %xmm0
-        vpslldq	$8, %xmm5, %xmm6
-        vpsrldq	$8, %xmm5, %xmm5
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm5, %xmm0, %xmm0
-        vpsrld	$31, %xmm3, %xmm4
-        vpsrld	$31, %xmm0, %xmm5
-        vpslld	$0x01, %xmm3, %xmm3
-        vpslld	$0x01, %xmm0, %xmm0
-        vpsrldq	$12, %xmm4, %xmm6
-        vpslldq	$4, %xmm4, %xmm4
-        vpslldq	$4, %xmm5, %xmm5
-        vpor	%xmm6, %xmm0, %xmm0
-        vpor	%xmm4, %xmm3, %xmm3
-        vpor	%xmm5, %xmm0, %xmm0
-        vpslld	$31, %xmm3, %xmm4
-        vpslld	$30, %xmm3, %xmm5
-        vpslld	$25, %xmm3, %xmm6
-        vpxor	%xmm5, %xmm4, %xmm4
-        vpxor	%xmm6, %xmm4, %xmm4
-        vmovdqa	%xmm4, %xmm5
-        vpsrldq	$4, %xmm5, %xmm5
-        vpslldq	$12, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm3, %xmm3
-        vpsrld	$0x01, %xmm3, %xmm6
-        vpsrld	$2, %xmm3, %xmm7
-        vpsrld	$7, %xmm3, %xmm4
-        vpxor	%xmm7, %xmm6, %xmm6
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpxor	%xmm5, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm6, %xmm0, %xmm0
-L_AES_GCM_decrypt_avx1_calc_iv_done:
-        # T = Encrypt counter
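-        # Finish J0 by folding in the IV bit length, swap to byte order,
-        # and keep E_K(J0) at 80(%esp) to mask the final GHASH before the
-        # tag comparison.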
-        vpxor	%xmm4, %xmm4, %xmm4
-        shll	$3, %edx
-        vpinsrd	$0x00, %edx, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm0, %xmm0
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpshufd	$0x4e, %xmm1, %xmm6
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm7
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vmovdqa	%xmm4, %xmm3
-        vmovdqa	%xmm7, %xmm0
-        vpslldq	$8, %xmm5, %xmm6
-        vpsrldq	$8, %xmm5, %xmm5
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm5, %xmm0, %xmm0
-        vpsrld	$31, %xmm3, %xmm4
-        vpsrld	$31, %xmm0, %xmm5
-        vpslld	$0x01, %xmm3, %xmm3
-        vpslld	$0x01, %xmm0, %xmm0
-        vpsrldq	$12, %xmm4, %xmm6
-        vpslldq	$4, %xmm4, %xmm4
-        vpslldq	$4, %xmm5, %xmm5
-        vpor	%xmm6, %xmm0, %xmm0
-        vpor	%xmm4, %xmm3, %xmm3
-        vpor	%xmm5, %xmm0, %xmm0
-        vpslld	$31, %xmm3, %xmm4
-        vpslld	$30, %xmm3, %xmm5
-        vpslld	$25, %xmm3, %xmm6
-        vpxor	%xmm5, %xmm4, %xmm4
-        vpxor	%xmm6, %xmm4, %xmm4
-        vmovdqa	%xmm4, %xmm5
-        vpsrldq	$4, %xmm5, %xmm5
-        vpslldq	$12, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm3, %xmm3
-        vpsrld	$0x01, %xmm3, %xmm6
-        vpsrld	$2, %xmm3, %xmm7
-        vpsrld	$7, %xmm3, %xmm4
-        vpxor	%xmm7, %xmm6, %xmm6
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpxor	%xmm5, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm6, %xmm0, %xmm0
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
-        #   Encrypt counter
-        vmovdqa	(%ebp), %xmm4
-        vpxor	%xmm0, %xmm4, %xmm4
-        vaesenc	16(%ebp), %xmm4, %xmm4
-        vaesenc	32(%ebp), %xmm4, %xmm4
-        vaesenc	48(%ebp), %xmm4, %xmm4
-        vaesenc	64(%ebp), %xmm4, %xmm4
-        vaesenc	80(%ebp), %xmm4, %xmm4
-        vaesenc	96(%ebp), %xmm4, %xmm4
-        vaesenc	112(%ebp), %xmm4, %xmm4
-        vaesenc	128(%ebp), %xmm4, %xmm4
-        vaesenc	144(%ebp), %xmm4, %xmm4
-        cmpl	$11, 236(%esp)
-        vmovdqa	160(%ebp), %xmm5
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last
-        vaesenc	%xmm5, %xmm4, %xmm4
-        vaesenc	176(%ebp), %xmm4, %xmm4
-        cmpl	$13, 236(%esp)
-        vmovdqa	192(%ebp), %xmm5
-        jl	L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last
-        vaesenc	%xmm5, %xmm4, %xmm4
-        vaesenc	208(%ebp), %xmm4, %xmm4
-        vmovdqa	224(%ebp), %xmm5
-L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last:
-        vaesenclast	%xmm5, %xmm4, %xmm4
-        vmovdqu	%xmm4, 80(%esp)
-L_AES_GCM_decrypt_avx1_iv_done:
-        movl	204(%esp), %esi
-        # Additional authentication data
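-        # AAD is folded into the GHASH state (xmm2) 16 bytes at a time; a
-        # short tail is first copied into a zeroed stack block.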
-        movl	220(%esp), %edx
-        cmpl	$0x00, %edx
-        je	L_AES_GCM_decrypt_avx1_calc_aad_done
-        xorl	%ecx, %ecx
-        cmpl	$16, %edx
-        jl	L_AES_GCM_decrypt_avx1_calc_aad_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_decrypt_avx1_calc_aad_16_loop:
-        vmovdqu	(%esi,%ecx,1), %xmm4
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm2, %xmm2
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm2, %xmm5
-        vpshufd	$0x4e, %xmm1, %xmm6
-        vpclmulqdq	$0x11, %xmm2, %xmm1, %xmm7
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm4
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vmovdqa	%xmm4, %xmm3
-        vmovdqa	%xmm7, %xmm2
-        vpslldq	$8, %xmm5, %xmm6
-        vpsrldq	$8, %xmm5, %xmm5
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpsrld	$31, %xmm3, %xmm4
-        vpsrld	$31, %xmm2, %xmm5
-        vpslld	$0x01, %xmm3, %xmm3
-        vpslld	$0x01, %xmm2, %xmm2
-        vpsrldq	$12, %xmm4, %xmm6
-        vpslldq	$4, %xmm4, %xmm4
-        vpslldq	$4, %xmm5, %xmm5
-        vpor	%xmm6, %xmm2, %xmm2
-        vpor	%xmm4, %xmm3, %xmm3
-        vpor	%xmm5, %xmm2, %xmm2
-        vpslld	$31, %xmm3, %xmm4
-        vpslld	$30, %xmm3, %xmm5
-        vpslld	$25, %xmm3, %xmm6
-        vpxor	%xmm5, %xmm4, %xmm4
-        vpxor	%xmm6, %xmm4, %xmm4
-        vmovdqa	%xmm4, %xmm5
-        vpsrldq	$4, %xmm5, %xmm5
-        vpslldq	$12, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm3, %xmm3
-        vpsrld	$0x01, %xmm3, %xmm6
-        vpsrld	$2, %xmm3, %xmm7
-        vpsrld	$7, %xmm3, %xmm4
-        vpxor	%xmm7, %xmm6, %xmm6
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpxor	%xmm5, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm6, %xmm2, %xmm2
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_decrypt_avx1_calc_aad_16_loop
-        movl	220(%esp), %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_decrypt_avx1_calc_aad_done
-L_AES_GCM_decrypt_avx1_calc_aad_lt16:
-        subl	$16, %esp
-        vpxor	%xmm4, %xmm4, %xmm4
-        xorl	%ebx, %ebx
-        vmovdqu	%xmm4, (%esp)
-L_AES_GCM_decrypt_avx1_calc_aad_loop:
-        movzbl	(%esi,%ecx,1), %eax
-        movb	%al, (%esp,%ebx,1)
-        incl	%ecx
-        incl	%ebx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_decrypt_avx1_calc_aad_loop
-        vmovdqu	(%esp), %xmm4
-        addl	$16, %esp
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm2, %xmm2
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm2, %xmm5
-        vpshufd	$0x4e, %xmm1, %xmm6
-        vpclmulqdq	$0x11, %xmm2, %xmm1, %xmm7
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm4
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vmovdqa	%xmm4, %xmm3
-        vmovdqa	%xmm7, %xmm2
-        vpslldq	$8, %xmm5, %xmm6
-        vpsrldq	$8, %xmm5, %xmm5
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpsrld	$31, %xmm3, %xmm4
-        vpsrld	$31, %xmm2, %xmm5
-        vpslld	$0x01, %xmm3, %xmm3
-        vpslld	$0x01, %xmm2, %xmm2
-        vpsrldq	$12, %xmm4, %xmm6
-        vpslldq	$4, %xmm4, %xmm4
-        vpslldq	$4, %xmm5, %xmm5
-        vpor	%xmm6, %xmm2, %xmm2
-        vpor	%xmm4, %xmm3, %xmm3
-        vpor	%xmm5, %xmm2, %xmm2
-        vpslld	$31, %xmm3, %xmm4
-        vpslld	$30, %xmm3, %xmm5
-        vpslld	$25, %xmm3, %xmm6
-        vpxor	%xmm5, %xmm4, %xmm4
-        vpxor	%xmm6, %xmm4, %xmm4
-        vmovdqa	%xmm4, %xmm5
-        vpsrldq	$4, %xmm5, %xmm5
-        vpslldq	$12, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm3, %xmm3
-        vpsrld	$0x01, %xmm3, %xmm6
-        vpsrld	$2, %xmm3, %xmm7
-        vpsrld	$7, %xmm3, %xmm4
-        vpxor	%xmm7, %xmm6, %xmm6
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpxor	%xmm5, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm6, %xmm2, %xmm2
-L_AES_GCM_decrypt_avx1_calc_aad_done:
-        vmovdqu	%xmm2, 96(%esp)
-        movl	196(%esp), %esi
-        movl	200(%esp), %edi
-        # Calculate counter and H
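-        # Multiply H by x in GF(2^128) (shift left one bit, XOR the
-        # reduction constant when the top bit was set) and step the counter
-        # to the first data block.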
-        vpsrlq	$63, %xmm1, %xmm5
-        vpsllq	$0x01, %xmm1, %xmm4
-        vpslldq	$8, %xmm5, %xmm5
-        vpor	%xmm5, %xmm4, %xmm4
-        vpshufd	$0xff, %xmm1, %xmm1
-        vpsrad	$31, %xmm1, %xmm1
-        vpshufb	L_aes_gcm_avx1_bswap_epi64, %xmm0, %xmm0
-        vpand	L_aes_gcm_avx1_mod2_128, %xmm1, %xmm1
-        vpaddd	L_aes_gcm_avx1_one, %xmm0, %xmm0
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	%xmm0, 64(%esp)
-        xorl	%ebx, %ebx
-        cmpl	$0x40, 216(%esp)
-        movl	216(%esp), %eax
-        jl	L_AES_GCM_decrypt_avx1_done_64
-        andl	$0xffffffc0, %eax
-        vmovdqa	%xmm2, %xmm6
-        # H ^ 1
-        vmovdqu	%xmm1, (%esp)
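-        # Precompute H^2..H^4 so the 64-byte loop folds four blocks per
-        # reduction; the powers live at 0,16,32,48(%esp).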
-        # H ^ 2
-        vpclmulqdq	$0x00, %xmm1, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm1, %xmm1, %xmm0
-        vpslld	$31, %xmm4, %xmm5
-        vpslld	$30, %xmm4, %xmm6
-        vpslld	$25, %xmm4, %xmm7
-        vpxor	%xmm6, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpsrldq	$4, %xmm5, %xmm7
-        vpslldq	$12, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm4, %xmm4
-        vpsrld	$0x01, %xmm4, %xmm5
-        vpsrld	$2, %xmm4, %xmm6
-        vpxor	%xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpsrld	$7, %xmm4, %xmm4
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm0, %xmm0
-        vmovdqu	%xmm0, 16(%esp)
-        # H ^ 3
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm1, %xmm5
-        vpshufd	$0x4e, %xmm0, %xmm6
-        vpclmulqdq	$0x11, %xmm1, %xmm0, %xmm7
-        vpclmulqdq	$0x00, %xmm1, %xmm0, %xmm4
-        vpxor	%xmm1, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpslldq	$8, %xmm5, %xmm6
-        vpsrldq	$8, %xmm5, %xmm5
-        vpxor	%xmm6, %xmm4, %xmm4
-        vpxor	%xmm5, %xmm7, %xmm3
-        vpslld	$31, %xmm4, %xmm5
-        vpslld	$30, %xmm4, %xmm6
-        vpslld	$25, %xmm4, %xmm7
-        vpxor	%xmm6, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpsrldq	$4, %xmm5, %xmm7
-        vpslldq	$12, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm4, %xmm4
-        vpsrld	$0x01, %xmm4, %xmm5
-        vpsrld	$2, %xmm4, %xmm6
-        vpxor	%xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpsrld	$7, %xmm4, %xmm4
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm3, %xmm3
-        vmovdqu	%xmm3, 32(%esp)
-        # H ^ 4
-        vpclmulqdq	$0x00, %xmm0, %xmm0, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm0, %xmm3
-        vpslld	$31, %xmm4, %xmm5
-        vpslld	$30, %xmm4, %xmm6
-        vpslld	$25, %xmm4, %xmm7
-        vpxor	%xmm6, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpsrldq	$4, %xmm5, %xmm7
-        vpslldq	$12, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm4, %xmm4
-        vpsrld	$0x01, %xmm4, %xmm5
-        vpsrld	$2, %xmm4, %xmm6
-        vpxor	%xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpsrld	$7, %xmm4, %xmm4
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm3, %xmm3
-        vmovdqu	%xmm3, 48(%esp)
-        cmpl	%esi, %edi
-        jne	L_AES_GCM_decrypt_avx1_ghash_64
-L_AES_GCM_decrypt_avx1_ghash_64_inplace:
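-        # In-place path (in == out): stash the four ciphertext blocks at
-        # 112..160(%esp) before overwriting them, so the GHASH below can
-        # still read them.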
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        vmovdqu	64(%esp), %xmm4
-        vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm3
-        vpaddd	L_aes_gcm_avx1_one, %xmm4, %xmm5
-        vpshufb	%xmm3, %xmm5, %xmm5
-        vpaddd	L_aes_gcm_avx1_two, %xmm4, %xmm6
-        vpshufb	%xmm3, %xmm6, %xmm6
-        vpaddd	L_aes_gcm_avx1_three, %xmm4, %xmm7
-        vpshufb	%xmm3, %xmm7, %xmm7
-        vpshufb	%xmm3, %xmm4, %xmm4
-        vmovdqu	64(%esp), %xmm3
-        vpaddd	L_aes_gcm_avx1_four, %xmm3, %xmm3
-        vmovdqu	%xmm3, 64(%esp)
-        vmovdqa	(%ebp), %xmm3
-        vpxor	%xmm3, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm7, %xmm7
-        vmovdqa	16(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	32(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	48(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	64(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	80(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	96(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	112(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	128(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	144(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        cmpl	$11, 236(%esp)
-        vmovdqa	160(%ebp), %xmm3
-        jl	L_AES_GCM_decrypt_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	176(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        cmpl	$13, 236(%esp)
-        vmovdqa	192(%ebp), %xmm3
-        jl	L_AES_GCM_decrypt_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	208(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	224(%ebp), %xmm3
-L_AES_GCM_decrypt_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done:
-        vaesenclast	%xmm3, %xmm4, %xmm4
-        vaesenclast	%xmm3, %xmm5, %xmm5
-        vmovdqu	(%ecx), %xmm0
-        vmovdqu	16(%ecx), %xmm1
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpxor	%xmm1, %xmm5, %xmm5
-        vmovdqu	%xmm0, 112(%esp)
-        vmovdqu	%xmm1, 128(%esp)
-        vmovdqu	%xmm4, (%edx)
-        vmovdqu	%xmm5, 16(%edx)
-        vaesenclast	%xmm3, %xmm6, %xmm6
-        vaesenclast	%xmm3, %xmm7, %xmm7
-        vmovdqu	32(%ecx), %xmm0
-        vmovdqu	48(%ecx), %xmm1
-        vpxor	%xmm0, %xmm6, %xmm6
-        vpxor	%xmm1, %xmm7, %xmm7
-        vmovdqu	%xmm0, 144(%esp)
-        vmovdqu	%xmm1, 160(%esp)
-        vmovdqu	%xmm6, 32(%edx)
-        vmovdqu	%xmm7, 48(%edx)
-        # ghash encrypted counter
-        vmovdqu	96(%esp), %xmm6
-        vmovdqu	48(%esp), %xmm3
-        vmovdqu	112(%esp), %xmm4
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
-        vpxor	%xmm6, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm3, %xmm5
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpclmulqdq	$0x11, %xmm3, %xmm4, %xmm7
-        vpclmulqdq	$0x00, %xmm3, %xmm4, %xmm6
-        vpclmulqdq	$0x00, %xmm1, %xmm5, %xmm5
-        vpxor	%xmm6, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vmovdqu	32(%esp), %xmm3
-        vmovdqu	128(%esp), %xmm4
-        vpshufd	$0x4e, %xmm3, %xmm0
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm0, %xmm0
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpclmulqdq	$0x11, %xmm3, %xmm4, %xmm2
-        vpclmulqdq	$0x00, %xmm3, %xmm4, %xmm3
-        vpclmulqdq	$0x00, %xmm1, %xmm0, %xmm0
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm2, %xmm7, %xmm7
-        vpxor	%xmm0, %xmm5, %xmm5
-        vmovdqu	16(%esp), %xmm3
-        vmovdqu	144(%esp), %xmm4
-        vpshufd	$0x4e, %xmm3, %xmm0
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm0, %xmm0
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpclmulqdq	$0x11, %xmm3, %xmm4, %xmm2
-        vpclmulqdq	$0x00, %xmm3, %xmm4, %xmm3
-        vpclmulqdq	$0x00, %xmm1, %xmm0, %xmm0
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm2, %xmm7, %xmm7
-        vpxor	%xmm0, %xmm5, %xmm5
-        vmovdqu	(%esp), %xmm3
-        vmovdqu	160(%esp), %xmm4
-        vpshufd	$0x4e, %xmm3, %xmm0
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm0, %xmm0
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpclmulqdq	$0x11, %xmm3, %xmm4, %xmm2
-        vpclmulqdq	$0x00, %xmm3, %xmm4, %xmm3
-        vpclmulqdq	$0x00, %xmm1, %xmm0, %xmm0
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm2, %xmm7, %xmm7
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpslldq	$8, %xmm5, %xmm1
-        vpsrldq	$8, %xmm5, %xmm5
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpxor	%xmm5, %xmm7, %xmm7
-        vpslld	$31, %xmm6, %xmm3
-        vpslld	$30, %xmm6, %xmm0
-        vpslld	$25, %xmm6, %xmm1
-        vpxor	%xmm0, %xmm3, %xmm3
-        vpxor	%xmm1, %xmm3, %xmm3
-        vpsrldq	$4, %xmm3, %xmm0
-        vpslldq	$12, %xmm3, %xmm3
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpsrld	$0x01, %xmm6, %xmm1
-        vpsrld	$2, %xmm6, %xmm5
-        vpsrld	$7, %xmm6, %xmm4
-        vpxor	%xmm5, %xmm1, %xmm1
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpxor	%xmm7, %xmm6, %xmm6
-        vmovdqu	%xmm6, 96(%esp)
-        addl	$0x40, %ebx
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_decrypt_avx1_ghash_64_inplace
-        jmp	L_AES_GCM_decrypt_avx1_ghash_64_done
-L_AES_GCM_decrypt_avx1_ghash_64:
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        vmovdqu	64(%esp), %xmm4
-        vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm3
-        vpaddd	L_aes_gcm_avx1_one, %xmm4, %xmm5
-        vpshufb	%xmm3, %xmm5, %xmm5
-        vpaddd	L_aes_gcm_avx1_two, %xmm4, %xmm6
-        vpshufb	%xmm3, %xmm6, %xmm6
-        vpaddd	L_aes_gcm_avx1_three, %xmm4, %xmm7
-        vpshufb	%xmm3, %xmm7, %xmm7
-        vpshufb	%xmm3, %xmm4, %xmm4
-        vmovdqu	64(%esp), %xmm3
-        vpaddd	L_aes_gcm_avx1_four, %xmm3, %xmm3
-        vmovdqu	%xmm3, 64(%esp)
-        vmovdqa	(%ebp), %xmm3
-        vpxor	%xmm3, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm7, %xmm7
-        vmovdqa	16(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	32(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	48(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	64(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	80(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	96(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	112(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	128(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	144(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        cmpl	$11, 236(%esp)
-        vmovdqa	160(%ebp), %xmm3
-        jl	L_AES_GCM_decrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	176(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        cmpl	$13, 236(%esp)
-        vmovdqa	192(%ebp), %xmm3
-        jl	L_AES_GCM_decrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	208(%ebp), %xmm3
-        vaesenc	%xmm3, %xmm4, %xmm4
-        vaesenc	%xmm3, %xmm5, %xmm5
-        vaesenc	%xmm3, %xmm6, %xmm6
-        vaesenc	%xmm3, %xmm7, %xmm7
-        vmovdqa	224(%ebp), %xmm3
-L_AES_GCM_decrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done:
-        vaesenclast	%xmm3, %xmm4, %xmm4
-        vaesenclast	%xmm3, %xmm5, %xmm5
-        vmovdqu	(%ecx), %xmm0
-        vmovdqu	16(%ecx), %xmm1
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpxor	%xmm1, %xmm5, %xmm5
-        vmovdqu	%xmm0, (%ecx)
-        vmovdqu	%xmm1, 16(%ecx)
-        vmovdqu	%xmm4, (%edx)
-        vmovdqu	%xmm5, 16(%edx)
-        vaesenclast	%xmm3, %xmm6, %xmm6
-        vaesenclast	%xmm3, %xmm7, %xmm7
-        vmovdqu	32(%ecx), %xmm0
-        vmovdqu	48(%ecx), %xmm1
-        vpxor	%xmm0, %xmm6, %xmm6
-        vpxor	%xmm1, %xmm7, %xmm7
-        vmovdqu	%xmm0, 32(%ecx)
-        vmovdqu	%xmm1, 48(%ecx)
-        vmovdqu	%xmm6, 32(%edx)
-        vmovdqu	%xmm7, 48(%edx)
-        # ghash encrypted counter
-        vmovdqu	96(%esp), %xmm6
-        vmovdqu	48(%esp), %xmm3
-        vmovdqu	(%ecx), %xmm4
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
-        vpxor	%xmm6, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm3, %xmm5
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpclmulqdq	$0x11, %xmm3, %xmm4, %xmm7
-        vpclmulqdq	$0x00, %xmm3, %xmm4, %xmm6
-        vpclmulqdq	$0x00, %xmm1, %xmm5, %xmm5
-        vpxor	%xmm6, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vmovdqu	32(%esp), %xmm3
-        vmovdqu	16(%ecx), %xmm4
-        vpshufd	$0x4e, %xmm3, %xmm0
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm0, %xmm0
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpclmulqdq	$0x11, %xmm3, %xmm4, %xmm2
-        vpclmulqdq	$0x00, %xmm3, %xmm4, %xmm3
-        vpclmulqdq	$0x00, %xmm1, %xmm0, %xmm0
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm2, %xmm7, %xmm7
-        vpxor	%xmm0, %xmm5, %xmm5
-        vmovdqu	16(%esp), %xmm3
-        vmovdqu	32(%ecx), %xmm4
-        vpshufd	$0x4e, %xmm3, %xmm0
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm0, %xmm0
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpclmulqdq	$0x11, %xmm3, %xmm4, %xmm2
-        vpclmulqdq	$0x00, %xmm3, %xmm4, %xmm3
-        vpclmulqdq	$0x00, %xmm1, %xmm0, %xmm0
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm2, %xmm7, %xmm7
-        vpxor	%xmm0, %xmm5, %xmm5
-        vmovdqu	(%esp), %xmm3
-        vmovdqu	48(%ecx), %xmm4
-        vpshufd	$0x4e, %xmm3, %xmm0
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm0, %xmm0
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpclmulqdq	$0x11, %xmm3, %xmm4, %xmm2
-        vpclmulqdq	$0x00, %xmm3, %xmm4, %xmm3
-        vpclmulqdq	$0x00, %xmm1, %xmm0, %xmm0
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm2, %xmm7, %xmm7
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpslldq	$8, %xmm5, %xmm1
-        vpsrldq	$8, %xmm5, %xmm5
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpxor	%xmm5, %xmm7, %xmm7
-        vpslld	$31, %xmm6, %xmm3
-        vpslld	$30, %xmm6, %xmm0
-        vpslld	$25, %xmm6, %xmm1
-        vpxor	%xmm0, %xmm3, %xmm3
-        vpxor	%xmm1, %xmm3, %xmm3
-        vpsrldq	$4, %xmm3, %xmm0
-        vpslldq	$12, %xmm3, %xmm3
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpsrld	$0x01, %xmm6, %xmm1
-        vpsrld	$2, %xmm6, %xmm5
-        vpsrld	$7, %xmm6, %xmm4
-        vpxor	%xmm5, %xmm1, %xmm1
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpxor	%xmm7, %xmm6, %xmm6
-        vmovdqu	%xmm6, 96(%esp)
-        addl	$0x40, %ebx
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_decrypt_avx1_ghash_64
-L_AES_GCM_decrypt_avx1_ghash_64_done:
-        vmovdqa	%xmm6, %xmm2
-        vmovdqu	(%esp), %xmm1
-L_AES_GCM_decrypt_avx1_done_64:
-        movl	216(%esp), %edx
-        cmpl	%edx, %ebx
-        jge	L_AES_GCM_decrypt_avx1_done_dec
-        movl	216(%esp), %eax
-        andl	$0xfffffff0, %eax
-        cmpl	%eax, %ebx
-        jge	L_AES_GCM_decrypt_avx1_last_block_done
-L_AES_GCM_decrypt_avx1_last_block_start:
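-        # Whole trailing blocks: GHASH works on the ciphertext, which is
-        # known before decryption, so the multiply interleaves with the AES
-        # rounds.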
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        vmovdqu	(%ecx), %xmm7
-        pshufb	L_aes_gcm_avx1_bswap_mask, %xmm7
-        pxor	%xmm2, %xmm7
-        vmovdqu	64(%esp), %xmm5
-        vmovdqu	%xmm7, %xmm7
-        vpshufb	L_aes_gcm_avx1_bswap_epi64, %xmm5, %xmm4
-        vpaddd	L_aes_gcm_avx1_one, %xmm5, %xmm5
-        vmovdqu	%xmm5, 64(%esp)
-        vpxor	(%ebp), %xmm4, %xmm4
-        vpclmulqdq	$16, %xmm1, %xmm7, %xmm0
-        vaesenc	16(%ebp), %xmm4, %xmm4
-        vaesenc	32(%ebp), %xmm4, %xmm4
-        vpclmulqdq	$0x01, %xmm1, %xmm7, %xmm3
-        vaesenc	48(%ebp), %xmm4, %xmm4
-        vaesenc	64(%ebp), %xmm4, %xmm4
-        vaesenc	80(%ebp), %xmm4, %xmm4
-        vpclmulqdq	$0x11, %xmm1, %xmm7, %xmm5
-        vaesenc	96(%ebp), %xmm4, %xmm4
-        vpxor	%xmm3, %xmm0, %xmm0
-        vpslldq	$8, %xmm0, %xmm6
-        vpsrldq	$8, %xmm0, %xmm0
-        vaesenc	112(%ebp), %xmm4, %xmm4
-        vpclmulqdq	$0x00, %xmm1, %xmm7, %xmm3
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm0, %xmm5, %xmm5
-        vmovdqa	L_aes_gcm_avx1_mod2_128, %xmm7
-        vpclmulqdq	$16, %xmm7, %xmm6, %xmm3
-        vaesenc	128(%ebp), %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm6, %xmm0
-        vpxor	%xmm3, %xmm0, %xmm0
-        vpclmulqdq	$16, %xmm7, %xmm0, %xmm3
-        vaesenc	144(%ebp), %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm2
-        vpxor	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm5, %xmm2, %xmm2
-        cmpl	$11, 236(%esp)
-        vmovdqa	160(%ebp), %xmm5
-        jl	L_AES_GCM_decrypt_avx1_aesenc_gfmul_last
-        vaesenc	%xmm5, %xmm4, %xmm4
-        vaesenc	176(%ebp), %xmm4, %xmm4
-        cmpl	$13, 236(%esp)
-        vmovdqa	192(%ebp), %xmm5
-        jl	L_AES_GCM_decrypt_avx1_aesenc_gfmul_last
-        vaesenc	%xmm5, %xmm4, %xmm4
-        vaesenc	208(%ebp), %xmm4, %xmm4
-        vmovdqa	224(%ebp), %xmm5
-L_AES_GCM_decrypt_avx1_aesenc_gfmul_last:
-        vaesenclast	%xmm5, %xmm4, %xmm4
-        vmovdqu	(%ecx), %xmm5
-        vpxor	%xmm5, %xmm4, %xmm4
-        vmovdqu	%xmm4, (%edx)
-        addl	$16, %ebx
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_decrypt_avx1_last_block_start
-L_AES_GCM_decrypt_avx1_last_block_done:
-        movl	216(%esp), %ecx
-        movl	%ecx, %edx
-        andl	$15, %ecx
-        jz	L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_done
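-        # 1-15 trailing bytes: copy them into a zeroed stack block for the
-        # final GHASH, then XOR with one encrypted counter block to decrypt.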
-        vmovdqu	64(%esp), %xmm0
-        vpshufb	L_aes_gcm_avx1_bswap_epi64, %xmm0, %xmm0
-        vpxor	(%ebp), %xmm0, %xmm0
-        vaesenc	16(%ebp), %xmm0, %xmm0
-        vaesenc	32(%ebp), %xmm0, %xmm0
-        vaesenc	48(%ebp), %xmm0, %xmm0
-        vaesenc	64(%ebp), %xmm0, %xmm0
-        vaesenc	80(%ebp), %xmm0, %xmm0
-        vaesenc	96(%ebp), %xmm0, %xmm0
-        vaesenc	112(%ebp), %xmm0, %xmm0
-        vaesenc	128(%ebp), %xmm0, %xmm0
-        vaesenc	144(%ebp), %xmm0, %xmm0
-        cmpl	$11, 236(%esp)
-        vmovdqa	160(%ebp), %xmm5
-        jl	L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last
-        vaesenc	%xmm5, %xmm0, %xmm0
-        vaesenc	176(%ebp), %xmm0, %xmm0
-        cmpl	$13, 236(%esp)
-        vmovdqa	192(%ebp), %xmm5
-        jl	L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last
-        vaesenc	%xmm5, %xmm0, %xmm0
-        vaesenc	208(%ebp), %xmm0, %xmm0
-        vmovdqa	224(%ebp), %xmm5
-L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last:
-        vaesenclast	%xmm5, %xmm0, %xmm0
-        subl	$32, %esp
-        xorl	%ecx, %ecx
-        vmovdqu	%xmm0, (%esp)
-        vpxor	%xmm4, %xmm4, %xmm4
-        vmovdqu	%xmm4, 16(%esp)
-L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_loop:
-        movzbl	(%esi,%ebx,1), %eax
-        movb	%al, 16(%esp,%ecx,1)
-        xorb	(%esp,%ecx,1), %al
-        movb	%al, (%edi,%ebx,1)
-        incl	%ebx
-        incl	%ecx
-        cmpl	%edx, %ebx
-        jl	L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_loop
-        vmovdqu	16(%esp), %xmm0
-        addl	$32, %esp
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm2, %xmm2
-        # ghash_gfmul_red_avx
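-        # Karatsuba carry-less multiply (three vpclmulqdq) then reduce mod x^128 + x^7 + x^2 + x + 1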
-        vpshufd	$0x4e, %xmm1, %xmm5
-        vpshufd	$0x4e, %xmm2, %xmm6
-        vpclmulqdq	$0x11, %xmm1, %xmm2, %xmm7
-        vpclmulqdq	$0x00, %xmm1, %xmm2, %xmm4
-        vpxor	%xmm1, %xmm5, %xmm5
-        vpxor	%xmm2, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpslldq	$8, %xmm5, %xmm6
-        vpsrldq	$8, %xmm5, %xmm5
-        vpxor	%xmm6, %xmm4, %xmm4
-        vpxor	%xmm5, %xmm7, %xmm2
-        vpslld	$31, %xmm4, %xmm5
-        vpslld	$30, %xmm4, %xmm6
-        vpslld	$25, %xmm4, %xmm7
-        vpxor	%xmm6, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpsrldq	$4, %xmm5, %xmm7
-        vpslldq	$12, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm4, %xmm4
-        vpsrld	$0x01, %xmm4, %xmm5
-        vpsrld	$2, %xmm4, %xmm6
-        vpxor	%xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpsrld	$7, %xmm4, %xmm4
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm2, %xmm2
-L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_done:
-L_AES_GCM_decrypt_avx1_done_dec:
-        movl	212(%esp), %esi
-        movl	228(%esp), %ebp
-        movl	216(%esp), %edx
-        movl	220(%esp), %ecx
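-        # Build the GHASH lengths block: byte counts become 64-bit bit counts (shl 3 for the low dword, shr 29 for the high dword)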
-        shll	$3, %edx
-        shll	$3, %ecx
-        vpinsrd	$0x00, %edx, %xmm4, %xmm4
-        vpinsrd	$2, %ecx, %xmm4, %xmm4
-        movl	216(%esp), %edx
-        movl	220(%esp), %ecx
-        shrl	$29, %edx
-        shrl	$29, %ecx
-        vpinsrd	$0x01, %edx, %xmm4, %xmm4
-        vpinsrd	$3, %ecx, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm2, %xmm2
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm1, %xmm5
-        vpshufd	$0x4e, %xmm2, %xmm6
-        vpclmulqdq	$0x11, %xmm1, %xmm2, %xmm7
-        vpclmulqdq	$0x00, %xmm1, %xmm2, %xmm4
-        vpxor	%xmm1, %xmm5, %xmm5
-        vpxor	%xmm2, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpslldq	$8, %xmm5, %xmm6
-        vpsrldq	$8, %xmm5, %xmm5
-        vpxor	%xmm6, %xmm4, %xmm4
-        vpxor	%xmm5, %xmm7, %xmm2
-        vpslld	$31, %xmm4, %xmm5
-        vpslld	$30, %xmm4, %xmm6
-        vpslld	$25, %xmm4, %xmm7
-        vpxor	%xmm6, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpsrldq	$4, %xmm5, %xmm7
-        vpslldq	$12, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm4, %xmm4
-        vpsrld	$0x01, %xmm4, %xmm5
-        vpsrld	$2, %xmm4, %xmm6
-        vpxor	%xmm6, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpsrld	$7, %xmm4, %xmm4
-        vpxor	%xmm7, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm2, %xmm2
-        vpxor	80(%esp), %xmm2, %xmm4
-        movl	240(%esp), %edi
-        cmpl	$16, %ebp
-        je	L_AES_GCM_decrypt_avx1_cmp_tag_16
-        subl	$16, %esp
-        xorl	%ecx, %ecx
-        xorl	%ebx, %ebx
-        vmovdqu	%xmm4, (%esp)
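-        # Constant-time tag compare: OR together the XOR of every byte pair so timing does not leak the mismatch position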
-L_AES_GCM_decrypt_avx1_cmp_tag_loop:
-        movzbl	(%esp,%ecx,1), %eax
-        xorb	(%esi,%ecx,1), %al
-        orb	%al, %bl
-        incl	%ecx
-        cmpl	%ebp, %ecx
-        jne	L_AES_GCM_decrypt_avx1_cmp_tag_loop
-        cmpb	$0x00, %bl
-        sete	%bl
-        addl	$16, %esp
-        xorl	%ecx, %ecx
-        jmp	L_AES_GCM_decrypt_avx1_cmp_tag_done
-L_AES_GCM_decrypt_avx1_cmp_tag_16:
-        vmovdqu	(%esi), %xmm5
-        vpcmpeqb	%xmm5, %xmm4, %xmm4
-        vpmovmskb	%xmm4, %edx
-        # if %edx == 0xFFFF then return 1, else return 0
-        xorl	%ebx, %ebx
-        cmpl	$0xffff, %edx
-        sete	%bl
-L_AES_GCM_decrypt_avx1_cmp_tag_done:
-        movl	%ebx, (%edi)
-        addl	$0xb0, %esp
-        popl	%ebp
-        popl	%edi
-        popl	%esi
-        popl	%ebx
-        ret
-.size	AES_GCM_decrypt_avx1,.-AES_GCM_decrypt_avx1
-#ifdef WOLFSSL_AESGCM_STREAM
-.text
-.globl	AES_GCM_init_avx1
-.type	AES_GCM_init_avx1,@function
-.align	16
-AES_GCM_init_avx1:
-        pushl	%ebx
-        pushl	%esi
-        pushl	%edi
-        pushl	%ebp
-        subl	$16, %esp
-        movl	36(%esp), %ebp
-        movl	44(%esp), %esi
-        movl	60(%esp), %edi
-        vpxor	%xmm4, %xmm4, %xmm4
-        movl	48(%esp), %edx
-        cmpl	$12, %edx
-        jne	L_AES_GCM_init_avx1_iv_not_12
-        # Calculate values when IV is 12 bytes
-        # Set counter based on IV
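-        # J0 = IV || 0x00000001 (0x1000000 stored little-endian yields the trailing big-endian 1)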
-        movl	$0x1000000, %ecx
-        vpinsrd	$0x00, (%esi), %xmm4, %xmm4
-        vpinsrd	$0x01, 4(%esi), %xmm4, %xmm4
-        vpinsrd	$2, 8(%esi), %xmm4, %xmm4
-        vpinsrd	$3, %ecx, %xmm4, %xmm4
-        # H = Encrypt X(=0) and T = Encrypt counter
-        vmovdqa	(%ebp), %xmm5
-        vpxor	%xmm5, %xmm4, %xmm1
-        vmovdqa	16(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	32(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	48(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	64(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	80(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	96(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	112(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	128(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	144(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        cmpl	$11, 40(%esp)
-        vmovdqa	160(%ebp), %xmm7
-        jl	L_AES_GCM_init_avx1_calc_iv_12_last
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	176(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        cmpl	$13, 40(%esp)
-        vmovdqa	192(%ebp), %xmm7
-        jl	L_AES_GCM_init_avx1_calc_iv_12_last
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	208(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vmovdqa	224(%ebp), %xmm7
-L_AES_GCM_init_avx1_calc_iv_12_last:
-        vaesenclast	%xmm7, %xmm5, %xmm5
-        vaesenclast	%xmm7, %xmm1, %xmm1
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm5, %xmm5
-        vmovdqu	%xmm1, (%edi)
-        jmp	L_AES_GCM_init_avx1_iv_done
-L_AES_GCM_init_avx1_iv_not_12:
-        # Calculate values when IV is not 12 bytes
-        # H = Encrypt X(=0)
-        vmovdqa	(%ebp), %xmm5
-        vaesenc	16(%ebp), %xmm5, %xmm5
-        vaesenc	32(%ebp), %xmm5, %xmm5
-        vaesenc	48(%ebp), %xmm5, %xmm5
-        vaesenc	64(%ebp), %xmm5, %xmm5
-        vaesenc	80(%ebp), %xmm5, %xmm5
-        vaesenc	96(%ebp), %xmm5, %xmm5
-        vaesenc	112(%ebp), %xmm5, %xmm5
-        vaesenc	128(%ebp), %xmm5, %xmm5
-        vaesenc	144(%ebp), %xmm5, %xmm5
-        cmpl	$11, 40(%esp)
-        vmovdqa	160(%ebp), %xmm1
-        jl	L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last
-        vaesenc	%xmm1, %xmm5, %xmm5
-        vaesenc	176(%ebp), %xmm5, %xmm5
-        cmpl	$13, 40(%esp)
-        vmovdqa	192(%ebp), %xmm1
-        jl	L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last
-        vaesenc	%xmm1, %xmm5, %xmm5
-        vaesenc	208(%ebp), %xmm5, %xmm5
-        vmovdqa	224(%ebp), %xmm1
-L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last:
-        vaesenclast	%xmm1, %xmm5, %xmm5
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm5, %xmm5
-        # Calc counter
-        # Initialization vector
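-        # IV != 12 bytes: J0 = GHASH(IV zero-padded to a 16-byte multiple || 64-bit zero || 64-bit IV bit-length)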
-        cmpl	$0x00, %edx
-        movl	$0x00, %ecx
-        je	L_AES_GCM_init_avx1_calc_iv_done
-        cmpl	$16, %edx
-        jl	L_AES_GCM_init_avx1_calc_iv_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_init_avx1_calc_iv_16_loop:
-        vmovdqu	(%esi,%ecx,1), %xmm0
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpshufd	$0x4e, %xmm5, %xmm2
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqa	%xmm0, %xmm7
-        vmovdqa	%xmm3, %xmm4
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm7, %xmm7
-        vpxor	%xmm1, %xmm4, %xmm4
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm4, %xmm4
-        vpslld	$31, %xmm7, %xmm0
-        vpslld	$30, %xmm7, %xmm1
-        vpslld	$25, %xmm7, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vmovdqa	%xmm0, %xmm1
-        vpsrldq	$4, %xmm1, %xmm1
-        vpslldq	$12, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm7, %xmm7
-        vpsrld	$0x01, %xmm7, %xmm2
-        vpsrld	$2, %xmm7, %xmm3
-        vpsrld	$7, %xmm7, %xmm0
-        vpxor	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm0, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm4, %xmm4
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_init_avx1_calc_iv_16_loop
-        movl	48(%esp), %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_init_avx1_calc_iv_done
-L_AES_GCM_init_avx1_calc_iv_lt16:
-        subl	$16, %esp
-        vpxor	%xmm0, %xmm0, %xmm0
-        xorl	%ebx, %ebx
-        vmovdqu	%xmm0, (%esp)
-L_AES_GCM_init_avx1_calc_iv_loop:
-        movzbl	(%esi,%ecx,1), %eax
-        movb	%al, (%esp,%ebx,1)
-        incl	%ecx
-        incl	%ebx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_init_avx1_calc_iv_loop
-        vmovdqu	(%esp), %xmm0
-        addl	$16, %esp
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpshufd	$0x4e, %xmm5, %xmm2
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqa	%xmm0, %xmm7
-        vmovdqa	%xmm3, %xmm4
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm7, %xmm7
-        vpxor	%xmm1, %xmm4, %xmm4
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm4, %xmm4
-        vpslld	$31, %xmm7, %xmm0
-        vpslld	$30, %xmm7, %xmm1
-        vpslld	$25, %xmm7, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vmovdqa	%xmm0, %xmm1
-        vpsrldq	$4, %xmm1, %xmm1
-        vpslldq	$12, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm7, %xmm7
-        vpsrld	$0x01, %xmm7, %xmm2
-        vpsrld	$2, %xmm7, %xmm3
-        vpsrld	$7, %xmm7, %xmm0
-        vpxor	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm0, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm4, %xmm4
-L_AES_GCM_init_avx1_calc_iv_done:
-        # T = Encrypt counter
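-        # Fold the 64-bit IV bit-length into the hash and finish the GHASH before encrypting J0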
-        vpxor	%xmm0, %xmm0, %xmm0
-        shll	$3, %edx
-        vpinsrd	$0x00, %edx, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpshufd	$0x4e, %xmm5, %xmm2
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqa	%xmm0, %xmm7
-        vmovdqa	%xmm3, %xmm4
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm7, %xmm7
-        vpxor	%xmm1, %xmm4, %xmm4
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm4, %xmm4
-        vpslld	$31, %xmm7, %xmm0
-        vpslld	$30, %xmm7, %xmm1
-        vpslld	$25, %xmm7, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vmovdqa	%xmm0, %xmm1
-        vpsrldq	$4, %xmm1, %xmm1
-        vpslldq	$12, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm7, %xmm7
-        vpsrld	$0x01, %xmm7, %xmm2
-        vpsrld	$2, %xmm7, %xmm3
-        vpsrld	$7, %xmm7, %xmm0
-        vpxor	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm0, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm4, %xmm4
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
-        #   Encrypt counter
-        vmovdqa	(%ebp), %xmm0
-        vpxor	%xmm4, %xmm0, %xmm0
-        vaesenc	16(%ebp), %xmm0, %xmm0
-        vaesenc	32(%ebp), %xmm0, %xmm0
-        vaesenc	48(%ebp), %xmm0, %xmm0
-        vaesenc	64(%ebp), %xmm0, %xmm0
-        vaesenc	80(%ebp), %xmm0, %xmm0
-        vaesenc	96(%ebp), %xmm0, %xmm0
-        vaesenc	112(%ebp), %xmm0, %xmm0
-        vaesenc	128(%ebp), %xmm0, %xmm0
-        vaesenc	144(%ebp), %xmm0, %xmm0
-        cmpl	$11, 40(%esp)
-        vmovdqa	160(%ebp), %xmm1
-        jl	L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last
-        vaesenc	%xmm1, %xmm0, %xmm0
-        vaesenc	176(%ebp), %xmm0, %xmm0
-        cmpl	$13, 40(%esp)
-        vmovdqa	192(%ebp), %xmm1
-        jl	L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last
-        vaesenc	%xmm1, %xmm0, %xmm0
-        vaesenc	208(%ebp), %xmm0, %xmm0
-        vmovdqa	224(%ebp), %xmm1
-L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last:
-        vaesenclast	%xmm1, %xmm0, %xmm0
-        vmovdqu	%xmm0, (%edi)
-L_AES_GCM_init_avx1_iv_done:
-        movl	52(%esp), %ebp
-        movl	56(%esp), %edi
-        vpshufb	L_aes_gcm_avx1_bswap_epi64, %xmm4, %xmm4
-        vpaddd	L_aes_gcm_avx1_one, %xmm4, %xmm4
-        vmovdqa	%xmm5, (%ebp)
-        vmovdqa	%xmm4, (%edi)
-        addl	$16, %esp
-        popl	%ebp
-        popl	%edi
-        popl	%esi
-        popl	%ebx
-        ret
-.size	AES_GCM_init_avx1,.-AES_GCM_init_avx1
-.text
-.globl	AES_GCM_aad_update_avx1
-.type	AES_GCM_aad_update_avx1,@function
-.align	16
-AES_GCM_aad_update_avx1:
-        pushl	%esi
-        pushl	%edi
-        movl	12(%esp), %esi
-        movl	16(%esp), %edx
-        movl	20(%esp), %edi
-        movl	24(%esp), %eax
-        vmovdqa	(%edi), %xmm5
-        vmovdqa	(%eax), %xmm6
-        xorl	%ecx, %ecx
-L_AES_GCM_aad_update_avx1_16_loop:
-        vmovdqu	(%esi,%ecx,1), %xmm0
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm5, %xmm5
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm5, %xmm1
-        vpshufd	$0x4e, %xmm6, %xmm2
-        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm3
-        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm0
-        vpxor	%xmm5, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqa	%xmm0, %xmm4
-        vmovdqa	%xmm3, %xmm5
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm4, %xmm4
-        vpxor	%xmm1, %xmm5, %xmm5
-        vpsrld	$31, %xmm4, %xmm0
-        vpsrld	$31, %xmm5, %xmm1
-        vpslld	$0x01, %xmm4, %xmm4
-        vpslld	$0x01, %xmm5, %xmm5
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm5, %xmm5
-        vpor	%xmm0, %xmm4, %xmm4
-        vpor	%xmm1, %xmm5, %xmm5
-        vpslld	$31, %xmm4, %xmm0
-        vpslld	$30, %xmm4, %xmm1
-        vpslld	$25, %xmm4, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vmovdqa	%xmm0, %xmm1
-        vpsrldq	$4, %xmm1, %xmm1
-        vpslldq	$12, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpsrld	$0x01, %xmm4, %xmm2
-        vpsrld	$2, %xmm4, %xmm3
-        vpsrld	$7, %xmm4, %xmm0
-        vpxor	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm0, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm4, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm5, %xmm5
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_aad_update_avx1_16_loop
-        vmovdqa	%xmm5, (%edi)
-        popl	%edi
-        popl	%esi
-        ret
-.size	AES_GCM_aad_update_avx1,.-AES_GCM_aad_update_avx1
-.text
-.globl	AES_GCM_encrypt_block_avx1
-.type	AES_GCM_encrypt_block_avx1,@function
-.align	16
-AES_GCM_encrypt_block_avx1:
-        pushl	%esi
-        pushl	%edi
-        movl	12(%esp), %ecx
-        movl	16(%esp), %eax
-        movl	20(%esp), %edi
-        movl	24(%esp), %esi
-        movl	28(%esp), %edx
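-        # Load the counter block, store counter+1 back, then encrypt it into one keystream block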
-        vmovdqu	(%edx), %xmm1
-        vpshufb	L_aes_gcm_avx1_bswap_epi64, %xmm1, %xmm0
-        vpaddd	L_aes_gcm_avx1_one, %xmm1, %xmm1
-        vmovdqu	%xmm1, (%edx)
-        vpxor	(%ecx), %xmm0, %xmm0
-        vaesenc	16(%ecx), %xmm0, %xmm0
-        vaesenc	32(%ecx), %xmm0, %xmm0
-        vaesenc	48(%ecx), %xmm0, %xmm0
-        vaesenc	64(%ecx), %xmm0, %xmm0
-        vaesenc	80(%ecx), %xmm0, %xmm0
-        vaesenc	96(%ecx), %xmm0, %xmm0
-        vaesenc	112(%ecx), %xmm0, %xmm0
-        vaesenc	128(%ecx), %xmm0, %xmm0
-        vaesenc	144(%ecx), %xmm0, %xmm0
-        cmpl	$11, %eax
-        vmovdqa	160(%ecx), %xmm1
-        jl	L_AES_GCM_encrypt_block_avx1_aesenc_block_aesenc_avx_last
-        vaesenc	%xmm1, %xmm0, %xmm0
-        vaesenc	176(%ecx), %xmm0, %xmm0
-        cmpl	$13, %eax
-        vmovdqa	192(%ecx), %xmm1
-        jl	L_AES_GCM_encrypt_block_avx1_aesenc_block_aesenc_avx_last
-        vaesenc	%xmm1, %xmm0, %xmm0
-        vaesenc	208(%ecx), %xmm0, %xmm0
-        vmovdqa	224(%ecx), %xmm1
-L_AES_GCM_encrypt_block_avx1_aesenc_block_aesenc_avx_last:
-        vaesenclast	%xmm1, %xmm0, %xmm0
-        vmovdqu	(%esi), %xmm1
-        vpxor	%xmm1, %xmm0, %xmm0
-        vmovdqu	%xmm0, (%edi)
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
-        popl	%edi
-        popl	%esi
-        ret
-.size	AES_GCM_encrypt_block_avx1,.-AES_GCM_encrypt_block_avx1
-.text
-.globl	AES_GCM_ghash_block_avx1
-.type	AES_GCM_ghash_block_avx1,@function
-.align	16
-AES_GCM_ghash_block_avx1:
-        movl	4(%esp), %edx
-        movl	8(%esp), %eax
-        movl	12(%esp), %ecx
-        vmovdqa	(%eax), %xmm4
-        vmovdqa	(%ecx), %xmm5
-        vmovdqu	(%edx), %xmm0
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpshufd	$0x4e, %xmm5, %xmm2
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqa	%xmm0, %xmm6
-        vmovdqa	%xmm3, %xmm4
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm6, %xmm6
-        vpxor	%xmm1, %xmm4, %xmm4
-        vpsrld	$31, %xmm6, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm6, %xmm6
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm6, %xmm6
-        vpor	%xmm1, %xmm4, %xmm4
-        vpslld	$31, %xmm6, %xmm0
-        vpslld	$30, %xmm6, %xmm1
-        vpslld	$25, %xmm6, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vmovdqa	%xmm0, %xmm1
-        vpsrldq	$4, %xmm1, %xmm1
-        vpslldq	$12, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm6, %xmm6
-        vpsrld	$0x01, %xmm6, %xmm2
-        vpsrld	$2, %xmm6, %xmm3
-        vpsrld	$7, %xmm6, %xmm0
-        vpxor	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm0, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm4, %xmm4
-        vmovdqa	%xmm4, (%eax)
-        ret
-.size	AES_GCM_ghash_block_avx1,.-AES_GCM_ghash_block_avx1
-.text
-.globl	AES_GCM_encrypt_update_avx1
-.type	AES_GCM_encrypt_update_avx1,@function
-.align	16
-AES_GCM_encrypt_update_avx1:
-        pushl	%ebx
-        pushl	%esi
-        pushl	%edi
-        pushl	%ebp
-        subl	$0x60, %esp
-        movl	144(%esp), %esi
-        vmovdqa	(%esi), %xmm4
-        vmovdqu	%xmm4, 64(%esp)
-        movl	136(%esp), %esi
-        movl	140(%esp), %ebp
-        vmovdqa	(%esi), %xmm6
-        vmovdqa	(%ebp), %xmm5
-        vmovdqu	%xmm6, 80(%esp)
-        movl	116(%esp), %ebp
-        movl	124(%esp), %edi
-        movl	128(%esp), %esi
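-        # Derive the working hash key: shift H left one bit and conditionally xor in the reduction constant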
-        vpsrlq	$63, %xmm5, %xmm1
-        vpsllq	$0x01, %xmm5, %xmm0
-        vpslldq	$8, %xmm1, %xmm1
-        vpor	%xmm1, %xmm0, %xmm0
-        vpshufd	$0xff, %xmm5, %xmm5
-        vpsrad	$31, %xmm5, %xmm5
-        vpand	L_aes_gcm_avx1_mod2_128, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        xorl	%ebx, %ebx
-        cmpl	$0x40, 132(%esp)
-        movl	132(%esp), %eax
-        jl	L_AES_GCM_encrypt_update_avx1_done_64
-        andl	$0xffffffc0, %eax
-        vmovdqa	%xmm6, %xmm2
-        # H ^ 1
-        vmovdqu	%xmm5, (%esp)
-        # H ^ 2
-        vpclmulqdq	$0x00, %xmm5, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm5, %xmm5, %xmm4
-        vpslld	$31, %xmm0, %xmm1
-        vpslld	$30, %xmm0, %xmm2
-        vpslld	$25, %xmm0, %xmm3
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpsrldq	$4, %xmm1, %xmm3
-        vpslldq	$12, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpsrld	$0x01, %xmm0, %xmm1
-        vpsrld	$2, %xmm0, %xmm2
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpsrld	$7, %xmm0, %xmm0
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm4, %xmm4
-        vmovdqu	%xmm4, 16(%esp)
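-        # H^2 stored; H^3 and H^4 follow so four GHASH multiplies can share a single reduction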
-        # H ^ 3
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm5, %xmm1
-        vpshufd	$0x4e, %xmm4, %xmm2
-        vpclmulqdq	$0x11, %xmm5, %xmm4, %xmm3
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm0
-        vpxor	%xmm5, %xmm1, %xmm1
-        vpxor	%xmm4, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm0, %xmm0
-        vpxor	%xmm1, %xmm3, %xmm7
-        vpslld	$31, %xmm0, %xmm1
-        vpslld	$30, %xmm0, %xmm2
-        vpslld	$25, %xmm0, %xmm3
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpsrldq	$4, %xmm1, %xmm3
-        vpslldq	$12, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpsrld	$0x01, %xmm0, %xmm1
-        vpsrld	$2, %xmm0, %xmm2
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpsrld	$7, %xmm0, %xmm0
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm7, %xmm7
-        vmovdqu	%xmm7, 32(%esp)
-        # H ^ 4
-        vpclmulqdq	$0x00, %xmm4, %xmm4, %xmm0
-        vpclmulqdq	$0x11, %xmm4, %xmm4, %xmm7
-        vpslld	$31, %xmm0, %xmm1
-        vpslld	$30, %xmm0, %xmm2
-        vpslld	$25, %xmm0, %xmm3
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpsrldq	$4, %xmm1, %xmm3
-        vpslldq	$12, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpsrld	$0x01, %xmm0, %xmm1
-        vpsrld	$2, %xmm0, %xmm2
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpsrld	$7, %xmm0, %xmm0
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm7, %xmm7
-        vmovdqu	%xmm7, 48(%esp)
-        # First 64 bytes of input
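-        # Four counter blocks are encrypted in parallel to keep the AES-NI pipeline busy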
-        vmovdqu	64(%esp), %xmm0
-        vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm7
-        vpaddd	L_aes_gcm_avx1_one, %xmm0, %xmm1
-        vpshufb	%xmm7, %xmm1, %xmm1
-        vpaddd	L_aes_gcm_avx1_two, %xmm0, %xmm2
-        vpshufb	%xmm7, %xmm2, %xmm2
-        vpaddd	L_aes_gcm_avx1_three, %xmm0, %xmm3
-        vpshufb	%xmm7, %xmm3, %xmm3
-        vpshufb	%xmm7, %xmm0, %xmm0
-        vmovdqu	64(%esp), %xmm7
-        vpaddd	L_aes_gcm_avx1_four, %xmm7, %xmm7
-        vmovdqu	%xmm7, 64(%esp)
-        vmovdqa	(%ebp), %xmm7
-        vpxor	%xmm7, %xmm0, %xmm0
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm7, %xmm3, %xmm3
-        vmovdqa	16(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	32(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	48(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	64(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	80(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	96(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	112(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	128(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	144(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        cmpl	$11, 120(%esp)
-        vmovdqa	160(%ebp), %xmm7
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_64_enc_done
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	176(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        cmpl	$13, 120(%esp)
-        vmovdqa	192(%ebp), %xmm7
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_64_enc_done
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	208(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	224(%ebp), %xmm7
-L_AES_GCM_encrypt_update_avx1_aesenc_64_enc_done:
-        vaesenclast	%xmm7, %xmm0, %xmm0
-        vaesenclast	%xmm7, %xmm1, %xmm1
-        vmovdqu	(%esi), %xmm4
-        vmovdqu	16(%esi), %xmm5
-        vpxor	%xmm4, %xmm0, %xmm0
-        vpxor	%xmm5, %xmm1, %xmm1
-        vmovdqu	%xmm4, (%esi)
-        vmovdqu	%xmm5, 16(%esi)
-        vmovdqu	%xmm0, (%edi)
-        vmovdqu	%xmm1, 16(%edi)
-        vaesenclast	%xmm7, %xmm2, %xmm2
-        vaesenclast	%xmm7, %xmm3, %xmm3
-        vmovdqu	32(%esi), %xmm4
-        vmovdqu	48(%esi), %xmm5
-        vpxor	%xmm4, %xmm2, %xmm2
-        vpxor	%xmm5, %xmm3, %xmm3
-        vmovdqu	%xmm4, 32(%esi)
-        vmovdqu	%xmm5, 48(%esi)
-        vmovdqu	%xmm2, 32(%edi)
-        vmovdqu	%xmm3, 48(%edi)
-        cmpl	$0x40, %eax
-        movl	$0x40, %ebx
-        movl	%esi, %ecx
-        movl	%edi, %edx
-        jle	L_AES_GCM_encrypt_update_avx1_end_64
-        # More 64 bytes of input
-L_AES_GCM_encrypt_update_avx1_ghash_64:
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        vmovdqu	64(%esp), %xmm0
-        vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm7
-        vpaddd	L_aes_gcm_avx1_one, %xmm0, %xmm1
-        vpshufb	%xmm7, %xmm1, %xmm1
-        vpaddd	L_aes_gcm_avx1_two, %xmm0, %xmm2
-        vpshufb	%xmm7, %xmm2, %xmm2
-        vpaddd	L_aes_gcm_avx1_three, %xmm0, %xmm3
-        vpshufb	%xmm7, %xmm3, %xmm3
-        vpshufb	%xmm7, %xmm0, %xmm0
-        vmovdqu	64(%esp), %xmm7
-        vpaddd	L_aes_gcm_avx1_four, %xmm7, %xmm7
-        vmovdqu	%xmm7, 64(%esp)
-        vmovdqa	(%ebp), %xmm7
-        vpxor	%xmm7, %xmm0, %xmm0
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm7, %xmm3, %xmm3
-        vmovdqa	16(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	32(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	48(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	64(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	80(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	96(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	112(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	128(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	144(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        cmpl	$11, 120(%esp)
-        vmovdqa	160(%ebp), %xmm7
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	176(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        cmpl	$13, 120(%esp)
-        vmovdqa	192(%ebp), %xmm7
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	208(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	224(%ebp), %xmm7
-L_AES_GCM_encrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done:
-        vaesenclast	%xmm7, %xmm0, %xmm0
-        vaesenclast	%xmm7, %xmm1, %xmm1
-        vmovdqu	(%ecx), %xmm4
-        vmovdqu	16(%ecx), %xmm5
-        vpxor	%xmm4, %xmm0, %xmm0
-        vpxor	%xmm5, %xmm1, %xmm1
-        vmovdqu	%xmm0, (%edx)
-        vmovdqu	%xmm1, 16(%edx)
-        vaesenclast	%xmm7, %xmm2, %xmm2
-        vaesenclast	%xmm7, %xmm3, %xmm3
-        vmovdqu	32(%ecx), %xmm4
-        vmovdqu	48(%ecx), %xmm5
-        vpxor	%xmm4, %xmm2, %xmm2
-        vpxor	%xmm5, %xmm3, %xmm3
-        vmovdqu	%xmm2, 32(%edx)
-        vmovdqu	%xmm3, 48(%edx)
-        # ghash encrypted counter
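-        # Aggregated GHASH: multiply the four ciphertext blocks by H^4..H^1 and reduce once at the end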
-        vmovdqu	80(%esp), %xmm2
-        vmovdqu	48(%esp), %xmm7
-        vmovdqu	-64(%edx), %xmm0
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm1
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm3
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm2
-        vpclmulqdq	$0x00, %xmm5, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqu	32(%esp), %xmm7
-        vmovdqu	-48(%edx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	16(%esp), %xmm7
-        vmovdqu	-32(%edx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	(%esp), %xmm7
-        vmovdqu	-16(%edx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpslldq	$8, %xmm1, %xmm5
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm3, %xmm3
-        vpslld	$31, %xmm2, %xmm7
-        vpslld	$30, %xmm2, %xmm4
-        vpslld	$25, %xmm2, %xmm5
-        vpxor	%xmm4, %xmm7, %xmm7
-        vpxor	%xmm5, %xmm7, %xmm7
-        vpsrldq	$4, %xmm7, %xmm4
-        vpslldq	$12, %xmm7, %xmm7
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpsrld	$0x01, %xmm2, %xmm5
-        vpsrld	$2, %xmm2, %xmm1
-        vpsrld	$7, %xmm2, %xmm0
-        vpxor	%xmm1, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpxor	%xmm3, %xmm2, %xmm2
-        vmovdqu	%xmm2, 80(%esp)
-        addl	$0x40, %ebx
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_encrypt_update_avx1_ghash_64
-L_AES_GCM_encrypt_update_avx1_end_64:
-        movdqu	80(%esp), %xmm6
-        # Block 1
-        vmovdqa	L_aes_gcm_avx1_bswap_mask, %xmm0
-        vmovdqu	(%edx), %xmm5
-        pshufb	%xmm0, %xmm5
-        vmovdqu	48(%esp), %xmm7
-        pxor	%xmm6, %xmm5
-        # ghash_gfmul_avx
-        vpshufd	$0x4e, %xmm5, %xmm1
-        vpshufd	$0x4e, %xmm7, %xmm2
-        vpclmulqdq	$0x11, %xmm5, %xmm7, %xmm3
-        vpclmulqdq	$0x00, %xmm5, %xmm7, %xmm0
-        vpxor	%xmm5, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqa	%xmm0, %xmm4
-        vmovdqa	%xmm3, %xmm6
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm4, %xmm4
-        vpxor	%xmm1, %xmm6, %xmm6
-        # Block 2
-        vmovdqa	L_aes_gcm_avx1_bswap_mask, %xmm0
-        vmovdqu	16(%edx), %xmm5
-        pshufb	%xmm0, %xmm5
-        vmovdqu	32(%esp), %xmm7
-        # ghash_gfmul_xor_avx
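-        # Same Karatsuba multiply as above, xor-accumulated into the running low/high halves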
-        vpshufd	$0x4e, %xmm5, %xmm1
-        vpshufd	$0x4e, %xmm7, %xmm2
-        vpclmulqdq	$0x11, %xmm5, %xmm7, %xmm3
-        vpclmulqdq	$0x00, %xmm5, %xmm7, %xmm0
-        vpxor	%xmm5, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm4, %xmm4
-        vpxor	%xmm1, %xmm6, %xmm6
-        # Block 3
-        vmovdqa	L_aes_gcm_avx1_bswap_mask, %xmm0
-        vmovdqu	32(%edx), %xmm5
-        pshufb	%xmm0, %xmm5
-        vmovdqu	16(%esp), %xmm7
-        # ghash_gfmul_xor_avx
-        vpshufd	$0x4e, %xmm5, %xmm1
-        vpshufd	$0x4e, %xmm7, %xmm2
-        vpclmulqdq	$0x11, %xmm5, %xmm7, %xmm3
-        vpclmulqdq	$0x00, %xmm5, %xmm7, %xmm0
-        vpxor	%xmm5, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm4, %xmm4
-        vpxor	%xmm1, %xmm6, %xmm6
-        # Block 4
-        vmovdqa	L_aes_gcm_avx1_bswap_mask, %xmm0
-        vmovdqu	48(%edx), %xmm5
-        pshufb	%xmm0, %xmm5
-        vmovdqu	(%esp), %xmm7
-        # ghash_gfmul_xor_avx
-        vpshufd	$0x4e, %xmm5, %xmm1
-        vpshufd	$0x4e, %xmm7, %xmm2
-        vpclmulqdq	$0x11, %xmm5, %xmm7, %xmm3
-        vpclmulqdq	$0x00, %xmm5, %xmm7, %xmm0
-        vpxor	%xmm5, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm4, %xmm4
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpslld	$31, %xmm4, %xmm0
-        vpslld	$30, %xmm4, %xmm1
-        vpslld	$25, %xmm4, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vmovdqa	%xmm0, %xmm1
-        vpsrldq	$4, %xmm1, %xmm1
-        vpslldq	$12, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpsrld	$0x01, %xmm4, %xmm2
-        vpsrld	$2, %xmm4, %xmm3
-        vpsrld	$7, %xmm4, %xmm0
-        vpxor	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm0, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm4, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm6, %xmm6
-        vmovdqu	(%esp), %xmm5
-L_AES_GCM_encrypt_update_avx1_done_64:
-        movl	132(%esp), %edx
-        cmpl	%edx, %ebx
-        jge	L_AES_GCM_encrypt_update_avx1_done_enc
-        movl	132(%esp), %eax
-        andl	$0xfffffff0, %eax
-        cmpl	%eax, %ebx
-        jge	L_AES_GCM_encrypt_update_avx1_last_block_done
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        vmovdqu	64(%esp), %xmm1
-        vpshufb	L_aes_gcm_avx1_bswap_epi64, %xmm1, %xmm0
-        vpaddd	L_aes_gcm_avx1_one, %xmm1, %xmm1
-        vmovdqu	%xmm1, 64(%esp)
-        vpxor	(%ebp), %xmm0, %xmm0
-        vaesenc	16(%ebp), %xmm0, %xmm0
-        vaesenc	32(%ebp), %xmm0, %xmm0
-        vaesenc	48(%ebp), %xmm0, %xmm0
-        vaesenc	64(%ebp), %xmm0, %xmm0
-        vaesenc	80(%ebp), %xmm0, %xmm0
-        vaesenc	96(%ebp), %xmm0, %xmm0
-        vaesenc	112(%ebp), %xmm0, %xmm0
-        vaesenc	128(%ebp), %xmm0, %xmm0
-        vaesenc	144(%ebp), %xmm0, %xmm0
-        cmpl	$11, 120(%esp)
-        vmovdqa	160(%ebp), %xmm1
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_block_aesenc_avx_last
-        vaesenc	%xmm1, %xmm0, %xmm0
-        vaesenc	176(%ebp), %xmm0, %xmm0
-        cmpl	$13, 120(%esp)
-        vmovdqa	192(%ebp), %xmm1
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_block_aesenc_avx_last
-        vaesenc	%xmm1, %xmm0, %xmm0
-        vaesenc	208(%ebp), %xmm0, %xmm0
-        vmovdqa	224(%ebp), %xmm1
-L_AES_GCM_encrypt_update_avx1_aesenc_block_aesenc_avx_last:
-        vaesenclast	%xmm1, %xmm0, %xmm0
-        vmovdqu	(%ecx), %xmm1
-        vpxor	%xmm1, %xmm0, %xmm0
-        vmovdqu	%xmm0, (%edx)
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm6, %xmm6
-        addl	$16, %ebx
-        cmpl	%eax, %ebx
-        jge	L_AES_GCM_encrypt_update_avx1_last_block_ghash
-L_AES_GCM_encrypt_update_avx1_last_block_start:
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        vmovdqu	64(%esp), %xmm1
-        vmovdqu	%xmm6, %xmm3
-        vpshufb	L_aes_gcm_avx1_bswap_epi64, %xmm1, %xmm0
-        vpaddd	L_aes_gcm_avx1_one, %xmm1, %xmm1
-        vmovdqu	%xmm1, 64(%esp)
-        vpxor	(%ebp), %xmm0, %xmm0
-        vpclmulqdq	$16, %xmm5, %xmm3, %xmm4
-        vaesenc	16(%ebp), %xmm0, %xmm0
-        vaesenc	32(%ebp), %xmm0, %xmm0
-        vpclmulqdq	$0x01, %xmm5, %xmm3, %xmm7
-        vaesenc	48(%ebp), %xmm0, %xmm0
-        vaesenc	64(%ebp), %xmm0, %xmm0
-        vaesenc	80(%ebp), %xmm0, %xmm0
-        vpclmulqdq	$0x11, %xmm5, %xmm3, %xmm1
-        vaesenc	96(%ebp), %xmm0, %xmm0
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpslldq	$8, %xmm4, %xmm2
-        vpsrldq	$8, %xmm4, %xmm4
-        vaesenc	112(%ebp), %xmm0, %xmm0
-        vpclmulqdq	$0x00, %xmm5, %xmm3, %xmm7
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqa	L_aes_gcm_avx1_mod2_128, %xmm3
-        vpclmulqdq	$16, %xmm3, %xmm2, %xmm7
-        vaesenc	128(%ebp), %xmm0, %xmm0
-        vpshufd	$0x4e, %xmm2, %xmm4
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpclmulqdq	$16, %xmm3, %xmm4, %xmm7
-        vaesenc	144(%ebp), %xmm0, %xmm0
-        vpshufd	$0x4e, %xmm4, %xmm6
-        vpxor	%xmm7, %xmm6, %xmm6
-        vpxor	%xmm1, %xmm6, %xmm6
-        cmpl	$11, 120(%esp)
-        vmovdqa	160(%ebp), %xmm1
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last
-        vaesenc	%xmm1, %xmm0, %xmm0
-        vaesenc	176(%ebp), %xmm0, %xmm0
-        cmpl	$13, 120(%esp)
-        vmovdqa	192(%ebp), %xmm1
-        jl	L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last
-        vaesenc	%xmm1, %xmm0, %xmm0
-        vaesenc	208(%ebp), %xmm0, %xmm0
-        vmovdqa	224(%ebp), %xmm1
-L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last:
-        vaesenclast	%xmm1, %xmm0, %xmm0
-        vmovdqu	(%ecx), %xmm1
-        vpxor	%xmm1, %xmm0, %xmm0
-        vmovdqu	%xmm0, (%edx)
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
-        addl	$16, %ebx
-        vpxor	%xmm0, %xmm6, %xmm6
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_encrypt_update_avx1_last_block_start
-L_AES_GCM_encrypt_update_avx1_last_block_ghash:
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm5, %xmm1
-        vpshufd	$0x4e, %xmm6, %xmm2
-        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm3
-        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm0
-        vpxor	%xmm5, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm0, %xmm0
-        vpxor	%xmm1, %xmm3, %xmm6
-        vpslld	$31, %xmm0, %xmm1
-        vpslld	$30, %xmm0, %xmm2
-        vpslld	$25, %xmm0, %xmm3
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpsrldq	$4, %xmm1, %xmm3
-        vpslldq	$12, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpsrld	$0x01, %xmm0, %xmm1
-        vpsrld	$2, %xmm0, %xmm2
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpsrld	$7, %xmm0, %xmm0
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm6, %xmm6
-L_AES_GCM_encrypt_update_avx1_last_block_done:
-L_AES_GCM_encrypt_update_avx1_done_enc:
-        movl	136(%esp), %esi
-        movl	144(%esp), %edi
-        vmovdqu	64(%esp), %xmm4
-        vmovdqa	%xmm6, (%esi)
-        vmovdqu	%xmm4, (%edi)
-        addl	$0x60, %esp
-        popl	%ebp
-        popl	%edi
-        popl	%esi
-        popl	%ebx
-        ret
-.size	AES_GCM_encrypt_update_avx1,.-AES_GCM_encrypt_update_avx1
-.text
-.globl	AES_GCM_encrypt_final_avx1
-.type	AES_GCM_encrypt_final_avx1,@function
-.align	16
-AES_GCM_encrypt_final_avx1:
-        pushl	%esi
-        pushl	%edi
-        pushl	%ebp
-        subl	$16, %esp
-        movl	32(%esp), %ebp
-        movl	52(%esp), %esi
-        movl	56(%esp), %edi
-        vmovdqa	(%ebp), %xmm4
-        vmovdqa	(%esi), %xmm5
-        vmovdqa	(%edi), %xmm6
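-        # Tag = GHASH(lengths block) xor the encrypted initial counter loaded here into xmm6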
-        vpsrlq	$63, %xmm5, %xmm1
-        vpsllq	$0x01, %xmm5, %xmm0
-        vpslldq	$8, %xmm1, %xmm1
-        vpor	%xmm1, %xmm0, %xmm0
-        vpshufd	$0xff, %xmm5, %xmm5
-        vpsrad	$31, %xmm5, %xmm5
-        vpand	L_aes_gcm_avx1_mod2_128, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        movl	44(%esp), %edx
-        movl	48(%esp), %ecx
-        shll	$3, %edx
-        shll	$3, %ecx
-        vpinsrd	$0x00, %edx, %xmm0, %xmm0
-        vpinsrd	$2, %ecx, %xmm0, %xmm0
-        movl	44(%esp), %edx
-        movl	48(%esp), %ecx
-        shrl	$29, %edx
-        shrl	$29, %ecx
-        vpinsrd	$0x01, %edx, %xmm0, %xmm0
-        vpinsrd	$3, %ecx, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm5, %xmm1
-        vpshufd	$0x4e, %xmm4, %xmm2
-        vpclmulqdq	$0x11, %xmm5, %xmm4, %xmm3
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm0
-        vpxor	%xmm5, %xmm1, %xmm1
-        vpxor	%xmm4, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm0, %xmm0
-        vpxor	%xmm1, %xmm3, %xmm4
-        vpslld	$31, %xmm0, %xmm1
-        vpslld	$30, %xmm0, %xmm2
-        vpslld	$25, %xmm0, %xmm3
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpsrldq	$4, %xmm1, %xmm3
-        vpslldq	$12, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpsrld	$0x01, %xmm0, %xmm1
-        vpsrld	$2, %xmm0, %xmm2
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpsrld	$7, %xmm0, %xmm0
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm4, %xmm4
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
-        vpxor	%xmm6, %xmm4, %xmm0
-        movl	36(%esp), %edi
-        cmpl	$16, 40(%esp)
-        je	L_AES_GCM_encrypt_final_avx1_store_tag_16
-        xorl	%ecx, %ecx
-        vmovdqu	%xmm0, (%esp)
-L_AES_GCM_encrypt_final_avx1_store_tag_loop:
-        movzbl	(%esp,%ecx,1), %eax
-        movb	%al, (%edi,%ecx,1)
-        incl	%ecx
-        cmpl	40(%esp), %ecx
-        jne	L_AES_GCM_encrypt_final_avx1_store_tag_loop
-        jmp	L_AES_GCM_encrypt_final_avx1_store_tag_done
-L_AES_GCM_encrypt_final_avx1_store_tag_16:
-        vmovdqu	%xmm0, (%edi)
-L_AES_GCM_encrypt_final_avx1_store_tag_done:
-        addl	$16, %esp
-        popl	%ebp
-        popl	%edi
-        popl	%esi
-        ret
-.size	AES_GCM_encrypt_final_avx1,.-AES_GCM_encrypt_final_avx1
-.text
-.globl	AES_GCM_decrypt_update_avx1
-.type	AES_GCM_decrypt_update_avx1,@function
-.align	16
-AES_GCM_decrypt_update_avx1:
-        pushl	%ebx
-        pushl	%esi
-        pushl	%edi
-        pushl	%ebp
-        subl	$0xa0, %esp
-        movl	208(%esp), %esi
-        vmovdqa	(%esi), %xmm4
-        vmovdqu	%xmm4, 64(%esp)
-        movl	200(%esp), %esi
-        movl	204(%esp), %ebp
-        vmovdqa	(%esi), %xmm6
-        vmovdqa	(%ebp), %xmm5
-        vmovdqu	%xmm6, 80(%esp)
-        movl	180(%esp), %ebp
-        movl	188(%esp), %edi
-        movl	192(%esp), %esi
-        vpsrlq	$63, %xmm5, %xmm1
-        vpsllq	$0x01, %xmm5, %xmm0
-        vpslldq	$8, %xmm1, %xmm1
-        vpor	%xmm1, %xmm0, %xmm0
-        vpshufd	$0xff, %xmm5, %xmm5
-        vpsrad	$31, %xmm5, %xmm5
-        vpand	L_aes_gcm_avx1_mod2_128, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        xorl	%ebx, %ebx
-        cmpl	$0x40, 196(%esp)
-        movl	196(%esp), %eax
-        jl	L_AES_GCM_decrypt_update_avx1_done_64
-        andl	$0xffffffc0, %eax
-        vmovdqa	%xmm6, %xmm2
-        # H ^ 1
-        vmovdqu	%xmm5, (%esp)
-        # H ^ 2
-        vpclmulqdq	$0x00, %xmm5, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm5, %xmm5, %xmm4
-        vpslld	$31, %xmm0, %xmm1
-        vpslld	$30, %xmm0, %xmm2
-        vpslld	$25, %xmm0, %xmm3
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpsrldq	$4, %xmm1, %xmm3
-        vpslldq	$12, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpsrld	$0x01, %xmm0, %xmm1
-        vpsrld	$2, %xmm0, %xmm2
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpsrld	$7, %xmm0, %xmm0
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm4, %xmm4
-        vmovdqu	%xmm4, 16(%esp)
-        # H ^ 3
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm5, %xmm1
-        vpshufd	$0x4e, %xmm4, %xmm2
-        vpclmulqdq	$0x11, %xmm5, %xmm4, %xmm3
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm0
-        vpxor	%xmm5, %xmm1, %xmm1
-        vpxor	%xmm4, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm0, %xmm0
-        vpxor	%xmm1, %xmm3, %xmm7
-        vpslld	$31, %xmm0, %xmm1
-        vpslld	$30, %xmm0, %xmm2
-        vpslld	$25, %xmm0, %xmm3
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpsrldq	$4, %xmm1, %xmm3
-        vpslldq	$12, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpsrld	$0x01, %xmm0, %xmm1
-        vpsrld	$2, %xmm0, %xmm2
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpsrld	$7, %xmm0, %xmm0
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm7, %xmm7
-        vmovdqu	%xmm7, 32(%esp)
-        # H ^ 4
-        vpclmulqdq	$0x00, %xmm4, %xmm4, %xmm0
-        vpclmulqdq	$0x11, %xmm4, %xmm4, %xmm7
-        vpslld	$31, %xmm0, %xmm1
-        vpslld	$30, %xmm0, %xmm2
-        vpslld	$25, %xmm0, %xmm3
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpsrldq	$4, %xmm1, %xmm3
-        vpslldq	$12, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpsrld	$0x01, %xmm0, %xmm1
-        vpsrld	$2, %xmm0, %xmm2
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpsrld	$7, %xmm0, %xmm0
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm7, %xmm7
-        vmovdqu	%xmm7, 48(%esp)
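-        # H^1..H^4 precomputed as in the encrypt path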
-        cmpl	%esi, %edi
-        jne	L_AES_GCM_decrypt_update_avx1_ghash_64
-L_AES_GCM_decrypt_update_avx1_ghash_64_inplace:
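-        # In-place decrypt: each ciphertext block is saved to the stack before being overwritten with plaintext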
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        vmovdqu	64(%esp), %xmm0
-        vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm7
-        vpaddd	L_aes_gcm_avx1_one, %xmm0, %xmm1
-        vpshufb	%xmm7, %xmm1, %xmm1
-        vpaddd	L_aes_gcm_avx1_two, %xmm0, %xmm2
-        vpshufb	%xmm7, %xmm2, %xmm2
-        vpaddd	L_aes_gcm_avx1_three, %xmm0, %xmm3
-        vpshufb	%xmm7, %xmm3, %xmm3
-        vpshufb	%xmm7, %xmm0, %xmm0
-        vmovdqu	64(%esp), %xmm7
-        vpaddd	L_aes_gcm_avx1_four, %xmm7, %xmm7
-        vmovdqu	%xmm7, 64(%esp)
-        vmovdqa	(%ebp), %xmm7
-        vpxor	%xmm7, %xmm0, %xmm0
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm7, %xmm3, %xmm3
-        vmovdqa	16(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	32(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	48(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	64(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	80(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	96(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	112(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	128(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	144(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        cmpl	$11, 184(%esp)
-        vmovdqa	160(%ebp), %xmm7
-        jl	L_AES_GCM_decrypt_update_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	176(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        cmpl	$13, 184(%esp)
-        vmovdqa	192(%ebp), %xmm7
-        jl	L_AES_GCM_decrypt_update_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	208(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	224(%ebp), %xmm7
-L_AES_GCM_decrypt_update_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done:
-        vaesenclast	%xmm7, %xmm0, %xmm0
-        vaesenclast	%xmm7, %xmm1, %xmm1
-        vmovdqu	(%ecx), %xmm4
-        vmovdqu	16(%ecx), %xmm5
-        vpxor	%xmm4, %xmm0, %xmm0
-        vpxor	%xmm5, %xmm1, %xmm1
-        vmovdqu	%xmm4, 96(%esp)
-        vmovdqu	%xmm5, 112(%esp)
-        vmovdqu	%xmm0, (%edx)
-        vmovdqu	%xmm1, 16(%edx)
-        vaesenclast	%xmm7, %xmm2, %xmm2
-        vaesenclast	%xmm7, %xmm3, %xmm3
-        vmovdqu	32(%ecx), %xmm4
-        vmovdqu	48(%ecx), %xmm5
-        vpxor	%xmm4, %xmm2, %xmm2
-        vpxor	%xmm5, %xmm3, %xmm3
-        vmovdqu	%xmm4, 128(%esp)
-        vmovdqu	%xmm5, 144(%esp)
-        vmovdqu	%xmm2, 32(%edx)
-        vmovdqu	%xmm3, 48(%edx)
-        # ghash encrypted counter
-        vmovdqu	80(%esp), %xmm2
-        vmovdqu	48(%esp), %xmm7
-        vmovdqu	96(%esp), %xmm0
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm1
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm3
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm2
-        vpclmulqdq	$0x00, %xmm5, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqu	32(%esp), %xmm7
-        vmovdqu	112(%esp), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	16(%esp), %xmm7
-        vmovdqu	128(%esp), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	(%esp), %xmm7
-        vmovdqu	144(%esp), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpslldq	$8, %xmm1, %xmm5
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm3, %xmm3
-        vpslld	$31, %xmm2, %xmm7
-        vpslld	$30, %xmm2, %xmm4
-        vpslld	$25, %xmm2, %xmm5
-        vpxor	%xmm4, %xmm7, %xmm7
-        vpxor	%xmm5, %xmm7, %xmm7
-        vpsrldq	$4, %xmm7, %xmm4
-        vpslldq	$12, %xmm7, %xmm7
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpsrld	$0x01, %xmm2, %xmm5
-        vpsrld	$2, %xmm2, %xmm1
-        vpsrld	$7, %xmm2, %xmm0
-        vpxor	%xmm1, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpxor	%xmm3, %xmm2, %xmm2
-        vmovdqu	%xmm2, 80(%esp)
-        addl	$0x40, %ebx
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_decrypt_update_avx1_ghash_64_inplace
-        jmp	L_AES_GCM_decrypt_update_avx1_ghash_64_done
-L_AES_GCM_decrypt_update_avx1_ghash_64:
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        vmovdqu	64(%esp), %xmm0
-        vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm7
-        vpaddd	L_aes_gcm_avx1_one, %xmm0, %xmm1
-        vpshufb	%xmm7, %xmm1, %xmm1
-        vpaddd	L_aes_gcm_avx1_two, %xmm0, %xmm2
-        vpshufb	%xmm7, %xmm2, %xmm2
-        vpaddd	L_aes_gcm_avx1_three, %xmm0, %xmm3
-        vpshufb	%xmm7, %xmm3, %xmm3
-        vpshufb	%xmm7, %xmm0, %xmm0
-        vmovdqu	64(%esp), %xmm7
-        vpaddd	L_aes_gcm_avx1_four, %xmm7, %xmm7
-        vmovdqu	%xmm7, 64(%esp)
-        vmovdqa	(%ebp), %xmm7
-        vpxor	%xmm7, %xmm0, %xmm0
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm7, %xmm3, %xmm3
-        vmovdqa	16(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	32(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	48(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	64(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	80(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	96(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	112(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	128(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	144(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        cmpl	$11, 184(%esp)
-        vmovdqa	160(%ebp), %xmm7
-        jl	L_AES_GCM_decrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	176(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        cmpl	$13, 184(%esp)
-        vmovdqa	192(%ebp), %xmm7
-        jl	L_AES_GCM_decrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	208(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqa	224(%ebp), %xmm7
-L_AES_GCM_decrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done:
-        vaesenclast	%xmm7, %xmm0, %xmm0
-        vaesenclast	%xmm7, %xmm1, %xmm1
-        vmovdqu	(%ecx), %xmm4
-        vmovdqu	16(%ecx), %xmm5
-        vpxor	%xmm4, %xmm0, %xmm0
-        vpxor	%xmm5, %xmm1, %xmm1
-        vmovdqu	%xmm4, (%ecx)
-        vmovdqu	%xmm5, 16(%ecx)
-        vmovdqu	%xmm0, (%edx)
-        vmovdqu	%xmm1, 16(%edx)
-        vaesenclast	%xmm7, %xmm2, %xmm2
-        vaesenclast	%xmm7, %xmm3, %xmm3
-        vmovdqu	32(%ecx), %xmm4
-        vmovdqu	48(%ecx), %xmm5
-        vpxor	%xmm4, %xmm2, %xmm2
-        vpxor	%xmm5, %xmm3, %xmm3
-        vmovdqu	%xmm4, 32(%ecx)
-        vmovdqu	%xmm5, 48(%ecx)
-        vmovdqu	%xmm2, 32(%edx)
-        vmovdqu	%xmm3, 48(%edx)
-        # ghash encrypted counter
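-        # Same aggregated GHASH as the in-place path, but the ciphertext is
-        # re-read straight from the source at (%ecx)..48(%ecx); the stores
-        # back to (%ecx) above appear redundant, since they rewrite the
-        # unchanged ciphertext.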
-        vmovdqu	80(%esp), %xmm2
-        vmovdqu	48(%esp), %xmm7
-        vmovdqu	(%ecx), %xmm0
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm1
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm3
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm2
-        vpclmulqdq	$0x00, %xmm5, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vmovdqu	32(%esp), %xmm7
-        vmovdqu	16(%ecx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	16(%esp), %xmm7
-        vmovdqu	32(%ecx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	(%esp), %xmm7
-        vmovdqu	48(%ecx), %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm4
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpshufd	$0x4e, %xmm0, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
-        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpslldq	$8, %xmm1, %xmm5
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm3, %xmm3
-        vpslld	$31, %xmm2, %xmm7
-        vpslld	$30, %xmm2, %xmm4
-        vpslld	$25, %xmm2, %xmm5
-        vpxor	%xmm4, %xmm7, %xmm7
-        vpxor	%xmm5, %xmm7, %xmm7
-        vpsrldq	$4, %xmm7, %xmm4
-        vpslldq	$12, %xmm7, %xmm7
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpsrld	$0x01, %xmm2, %xmm5
-        vpsrld	$2, %xmm2, %xmm1
-        vpsrld	$7, %xmm2, %xmm0
-        vpxor	%xmm1, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm2, %xmm2
-        vpxor	%xmm3, %xmm2, %xmm2
-        vmovdqu	%xmm2, 80(%esp)
-        addl	$0x40, %ebx
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_decrypt_update_avx1_ghash_64
-L_AES_GCM_decrypt_update_avx1_ghash_64_done:
-        vmovdqa	%xmm2, %xmm6
-        vmovdqu	(%esp), %xmm5
-L_AES_GCM_decrypt_update_avx1_done_64:
-        movl	196(%esp), %edx
-        cmpl	%edx, %ebx
-        jge	L_AES_GCM_decrypt_update_avx1_done_dec
-        movl	196(%esp), %eax
-        andl	$0xfffffff0, %eax
-        cmpl	%eax, %ebx
-        jge	L_AES_GCM_decrypt_update_avx1_last_block_done
-L_AES_GCM_decrypt_update_avx1_last_block_start:
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        vmovdqu	(%ecx), %xmm1
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm1, %xmm1
-        vmovdqu	%xmm1, (%esp)
-        vmovdqu	64(%esp), %xmm1
-        vmovdqu	(%esp), %xmm3
-        vpshufb	L_aes_gcm_avx1_bswap_epi64, %xmm1, %xmm0
-        vpaddd	L_aes_gcm_avx1_one, %xmm1, %xmm1
-        vmovdqu	%xmm1, 64(%esp)
-        vpxor	(%ebp), %xmm0, %xmm0
-        vpclmulqdq	$16, %xmm5, %xmm3, %xmm4
-        vaesenc	16(%ebp), %xmm0, %xmm0
-        vaesenc	32(%ebp), %xmm0, %xmm0
-        vpclmulqdq	$0x01, %xmm5, %xmm3, %xmm7
-        vaesenc	48(%ebp), %xmm0, %xmm0
-        vaesenc	64(%ebp), %xmm0, %xmm0
-        vaesenc	80(%ebp), %xmm0, %xmm0
-        vpclmulqdq	$0x11, %xmm5, %xmm3, %xmm1
-        vaesenc	96(%ebp), %xmm0, %xmm0
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpslldq	$8, %xmm4, %xmm2
-        vpsrldq	$8, %xmm4, %xmm4
-        vaesenc	112(%ebp), %xmm0, %xmm0
-        vpclmulqdq	$0x00, %xmm5, %xmm3, %xmm7
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqa	L_aes_gcm_avx1_mod2_128, %xmm3
-        vpclmulqdq	$16, %xmm3, %xmm2, %xmm7
-        vaesenc	128(%ebp), %xmm0, %xmm0
-        vpshufd	$0x4e, %xmm2, %xmm4
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpclmulqdq	$16, %xmm3, %xmm4, %xmm7
-        vaesenc	144(%ebp), %xmm0, %xmm0
-        vpshufd	$0x4e, %xmm4, %xmm6
-        vpxor	%xmm7, %xmm6, %xmm6
-        vpxor	%xmm1, %xmm6, %xmm6
-        cmpl	$11, 184(%esp)
-        vmovdqa	160(%ebp), %xmm1
-        jl	L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last
-        vaesenc	%xmm1, %xmm0, %xmm0
-        vaesenc	176(%ebp), %xmm0, %xmm0
-        cmpl	$13, 184(%esp)
-        vmovdqa	192(%ebp), %xmm1
-        jl	L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last
-        vaesenc	%xmm1, %xmm0, %xmm0
-        vaesenc	208(%ebp), %xmm0, %xmm0
-        vmovdqa	224(%ebp), %xmm1
-L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last:
-        vaesenclast	%xmm1, %xmm0, %xmm0
-        vmovdqu	(%ecx), %xmm1
-        vpxor	%xmm1, %xmm0, %xmm0
-        vmovdqu	%xmm0, (%edx)
-        addl	$16, %ebx
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_decrypt_update_avx1_last_block_start
-L_AES_GCM_decrypt_update_avx1_last_block_done:
-L_AES_GCM_decrypt_update_avx1_done_dec:
-        movl	200(%esp), %esi
-        movl	208(%esp), %edi
-        vmovdqu	64(%esp), %xmm4
-        vmovdqa	%xmm6, (%esi)
-        vmovdqu	%xmm4, (%edi)
-        addl	$0xa0, %esp
-        popl	%ebp
-        popl	%edi
-        popl	%esi
-        popl	%ebx
-        ret
-.size	AES_GCM_decrypt_update_avx1,.-AES_GCM_decrypt_update_avx1
-.text
-.globl	AES_GCM_decrypt_final_avx1
-.type	AES_GCM_decrypt_final_avx1,@function
-.align	16
-AES_GCM_decrypt_final_avx1:
-        pushl	%ebx
-        pushl	%esi
-        pushl	%edi
-        pushl	%ebp
-        subl	$16, %esp
-        movl	36(%esp), %ebp
-        movl	56(%esp), %esi
-        movl	60(%esp), %edi
-        vmovdqa	(%ebp), %xmm6
-        vmovdqa	(%esi), %xmm5
-        vmovdqa	(%edi), %xmm7
-        vpsrlq	$63, %xmm5, %xmm1
-        vpsllq	$0x01, %xmm5, %xmm0
-        vpslldq	$8, %xmm1, %xmm1
-        vpor	%xmm1, %xmm0, %xmm0
-        vpshufd	$0xff, %xmm5, %xmm5
-        vpsrad	$31, %xmm5, %xmm5
-        vpand	L_aes_gcm_avx1_mod2_128, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        movl	48(%esp), %edx
-        movl	52(%esp), %ecx
-        shll	$3, %edx
-        shll	$3, %ecx
-        vpinsrd	$0x00, %edx, %xmm0, %xmm0
-        vpinsrd	$2, %ecx, %xmm0, %xmm0
-        movl	48(%esp), %edx
-        movl	52(%esp), %ecx
-        shrl	$29, %edx
-        shrl	$29, %ecx
-        vpinsrd	$0x01, %edx, %xmm0, %xmm0
-        vpinsrd	$3, %ecx, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm6, %xmm6
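-        # Build the GHASH length block: both byte counts at 48(%esp) and
-        # 52(%esp) become 64-bit bit counts (shll $3 for the low dword,
-        # shrl $29 for the high dword) and are xored into the hash before
-        # the final multiply by H below.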
-        # ghash_gfmul_red_avx
-        vpshufd	$0x4e, %xmm5, %xmm1
-        vpshufd	$0x4e, %xmm6, %xmm2
-        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm3
-        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm0
-        vpxor	%xmm5, %xmm1, %xmm1
-        vpxor	%xmm6, %xmm2, %xmm2
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpslldq	$8, %xmm1, %xmm2
-        vpsrldq	$8, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm0, %xmm0
-        vpxor	%xmm1, %xmm3, %xmm6
-        vpslld	$31, %xmm0, %xmm1
-        vpslld	$30, %xmm0, %xmm2
-        vpslld	$25, %xmm0, %xmm3
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpsrldq	$4, %xmm1, %xmm3
-        vpslldq	$12, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm0, %xmm0
-        vpsrld	$0x01, %xmm0, %xmm1
-        vpsrld	$2, %xmm0, %xmm2
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpsrld	$7, %xmm0, %xmm0
-        vpxor	%xmm3, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm6, %xmm6
-        vpxor	%xmm7, %xmm6, %xmm0
-        movl	40(%esp), %esi
-        movl	64(%esp), %edi
-        cmpl	$16, 44(%esp)
-        je	L_AES_GCM_decrypt_final_avx1_cmp_tag_16
-        subl	$16, %esp
-        xorl	%ecx, %ecx
-        xorl	%ebx, %ebx
-        vmovdqu	%xmm0, (%esp)
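-        # Partial-tag path: compare in constant time by xoring each
-        # computed byte with the expected byte and OR-ing the differences
-        # into %bl; sete below turns "no difference" into the return value.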
-L_AES_GCM_decrypt_final_avx1_cmp_tag_loop:
-        movzbl	(%esp,%ecx,1), %eax
-        xorb	(%esi,%ecx,1), %al
-        orb	%al, %bl
-        incl	%ecx
-        # the subl above moved the tag-size argument from 44(%esp) to 60(%esp)
-        cmpl	60(%esp), %ecx
-        jne	L_AES_GCM_decrypt_final_avx1_cmp_tag_loop
-        cmpb	$0x00, %bl
-        sete	%bl
-        addl	$16, %esp
-        xorl	%ecx, %ecx
-        jmp	L_AES_GCM_decrypt_final_avx1_cmp_tag_done
-L_AES_GCM_decrypt_final_avx1_cmp_tag_16:
-        vmovdqu	(%esi), %xmm1
-        vpcmpeqb	%xmm1, %xmm0, %xmm0
-        vpmovmskb	%xmm0, %edx
-        # if %edx == 0xFFFF (all 16 tag bytes equal) return 1, else return 0
-        xorl	%ebx, %ebx
-        cmpl	$0xffff, %edx
-        sete	%bl
-L_AES_GCM_decrypt_final_avx1_cmp_tag_done:
-        movl	%ebx, (%edi)
-        addl	$16, %esp
-        popl	%ebp
-        popl	%edi
-        popl	%esi
-        popl	%ebx
-        ret
-.size	AES_GCM_decrypt_final_avx1,.-AES_GCM_decrypt_final_avx1
-#endif /* WOLFSSL_AESGCM_STREAM */
-#endif /* HAVE_INTEL_AVX1 */
-#ifdef HAVE_INTEL_AVX2
-.text
-.globl	AES_GCM_encrypt_avx2
-.type	AES_GCM_encrypt_avx2,@function
-.align	16
-AES_GCM_encrypt_avx2:
-        pushl	%ebx
-        pushl	%esi
-        pushl	%edi
-        pushl	%ebp
-        subl	$0x70, %esp
-        movl	144(%esp), %esi
-        movl	168(%esp), %ebp
-        movl	160(%esp), %edx
-        vpxor	%xmm4, %xmm4, %xmm4
-        cmpl	$12, %edx
-        je	L_AES_GCM_encrypt_avx2_iv_12
-        # Calculate values when IV is not 12 bytes
-        # H = Encrypt X(=0)
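-        # Non-12-byte IV: H = E_K(0^128) is computed here; the counter J0
-        # is then derived as GHASH_H(IV || 0-pad || len64(IV)) per the GCM
-        # spec.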
-        vmovdqu	(%ebp), %xmm5
-        vaesenc	16(%ebp), %xmm5, %xmm5
-        vaesenc	32(%ebp), %xmm5, %xmm5
-        vaesenc	48(%ebp), %xmm5, %xmm5
-        vaesenc	64(%ebp), %xmm5, %xmm5
-        vaesenc	80(%ebp), %xmm5, %xmm5
-        vaesenc	96(%ebp), %xmm5, %xmm5
-        vaesenc	112(%ebp), %xmm5, %xmm5
-        vaesenc	128(%ebp), %xmm5, %xmm5
-        vaesenc	144(%ebp), %xmm5, %xmm5
-        cmpl	$11, 172(%esp)
-        vmovdqu	160(%ebp), %xmm0
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	176(%ebp), %xmm5, %xmm5
-        cmpl	$13, 172(%esp)
-        vmovdqu	192(%ebp), %xmm0
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	208(%ebp), %xmm5, %xmm5
-        vmovdqu	224(%ebp), %xmm0
-L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last:
-        vaesenclast	%xmm0, %xmm5, %xmm5
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm5, %xmm5
-        # Calc counter
-        # Initialization vector
-        cmpl	$0x00, %edx
-        movl	$0x00, %ecx
-        je	L_AES_GCM_encrypt_avx2_calc_iv_done
-        cmpl	$16, %edx
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_encrypt_avx2_calc_iv_16_loop:
-        vmovdqu	(%esi,%ecx,1), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_avx
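-        # Schoolbook 128x128 carry-less multiply: immediates 0x00, 0x01,
-        # 0x10, 0x11 pick all four qword cross-products, yielding a 256-bit
-        # product in %xmm7 (low) / %xmm4 (high).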
-        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm7
-        vpxor	%xmm2, %xmm3, %xmm4
-        # ghash_mid
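-        # Shift the 256-bit product left one bit (GHASH works on
-        # bit-reflected values): vpsrld $31 grabs each dword's carry,
-        # vpslldq $4 moves the carries up one dword, and vpsrldq $12
-        # feeds the low half's top carry into the high half.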
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm4, %xmm4
-        # ghash_red
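-        # Reduce 256 -> 128 bits: two rounds of "clmul the low qword by
-        # L_aes_gcm_avx2_mod2_128, swap qwords, xor", then fold the result
-        # into the high 128 bits.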
-        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm4, %xmm4
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_16_loop
-        movl	160(%esp), %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_encrypt_avx2_calc_iv_done
-L_AES_GCM_encrypt_avx2_calc_iv_lt16:
-        vpxor	%xmm0, %xmm0, %xmm0
-        xorl	%ebx, %ebx
-        vmovdqu	%xmm0, (%esp)
-L_AES_GCM_encrypt_avx2_calc_iv_loop:
-        movzbl	(%esi,%ecx,1), %eax
-        movb	%al, (%esp,%ebx,1)
-        incl	%ecx
-        incl	%ebx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_loop
-        vmovdqu	(%esp), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm7
-        vpxor	%xmm2, %xmm3, %xmm4
-        # ghash_mid
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm4, %xmm4
-        # ghash_red
-        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm4, %xmm4
-L_AES_GCM_encrypt_avx2_calc_iv_done:
-        # T = Encrypt counter
-        vpxor	%xmm0, %xmm0, %xmm0
-        shll	$3, %edx
-        vpinsrd	$0x00, %edx, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm7
-        vpxor	%xmm2, %xmm3, %xmm4
-        # ghash_mid
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm4, %xmm4
-        # ghash_red
-        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm4, %xmm4
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm4, %xmm4
-        #   Encrypt counter
-        vmovdqu	(%ebp), %xmm6
-        vpxor	%xmm4, %xmm6, %xmm6
-        vaesenc	16(%ebp), %xmm6, %xmm6
-        vaesenc	32(%ebp), %xmm6, %xmm6
-        vaesenc	48(%ebp), %xmm6, %xmm6
-        vaesenc	64(%ebp), %xmm6, %xmm6
-        vaesenc	80(%ebp), %xmm6, %xmm6
-        vaesenc	96(%ebp), %xmm6, %xmm6
-        vaesenc	112(%ebp), %xmm6, %xmm6
-        vaesenc	128(%ebp), %xmm6, %xmm6
-        vaesenc	144(%ebp), %xmm6, %xmm6
-        cmpl	$11, 172(%esp)
-        vmovdqu	160(%ebp), %xmm0
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last
-        vaesenc	%xmm0, %xmm6, %xmm6
-        vaesenc	176(%ebp), %xmm6, %xmm6
-        cmpl	$13, 172(%esp)
-        vmovdqu	192(%ebp), %xmm0
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last
-        vaesenc	%xmm0, %xmm6, %xmm6
-        vaesenc	208(%ebp), %xmm6, %xmm6
-        vmovdqu	224(%ebp), %xmm0
-L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last:
-        vaesenclast	%xmm0, %xmm6, %xmm6
-        jmp	L_AES_GCM_encrypt_avx2_iv_done
-L_AES_GCM_encrypt_avx2_iv_12:
-        # Calculate values when IV is 12 bytes
-        # Set counter based on IV
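-        # 12-byte IV fast path: the counter block is IV || 0x00000001,
-        # formed by blending the IV's three dwords over
-        # L_avx2_aes_gcm_bswap_one; H = E_K(0) and T = E_K(counter) then
-        # share one interleaved run of AES rounds.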
-        vmovdqu	L_avx2_aes_gcm_bswap_one, %xmm4
-        vmovdqu	(%ebp), %xmm5
-        vpblendd	$7, (%esi), %xmm4, %xmm4
-        # H = Encrypt X(=0) and T = Encrypt counter
-        vmovdqu	16(%ebp), %xmm7
-        vpxor	%xmm5, %xmm4, %xmm6
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm6, %xmm6
-        vmovdqu	32(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm6, %xmm6
-        vmovdqu	48(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm6, %xmm6
-        vmovdqu	64(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm6, %xmm6
-        vmovdqu	80(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm6, %xmm6
-        vmovdqu	96(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm6, %xmm6
-        vmovdqu	112(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm6, %xmm6
-        vmovdqu	128(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm6, %xmm6
-        vmovdqu	144(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm6, %xmm6
-        cmpl	$11, 172(%esp)
-        vmovdqu	160(%ebp), %xmm0
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_12_last
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm6, %xmm6
-        vmovdqu	176(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm6, %xmm6
-        cmpl	$13, 172(%esp)
-        vmovdqu	192(%ebp), %xmm0
-        jl	L_AES_GCM_encrypt_avx2_calc_iv_12_last
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm6, %xmm6
-        vmovdqu	208(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm6, %xmm6
-        vmovdqu	224(%ebp), %xmm0
-L_AES_GCM_encrypt_avx2_calc_iv_12_last:
-        vaesenclast	%xmm0, %xmm5, %xmm5
-        vaesenclast	%xmm0, %xmm6, %xmm6
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm5, %xmm5
-L_AES_GCM_encrypt_avx2_iv_done:
-        vmovdqu	%xmm6, 80(%esp)
-        vpxor	%xmm6, %xmm6, %xmm6
-        movl	140(%esp), %esi
-        # Additional authentication data
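-        # Hash the AAD into %xmm6 16 bytes at a time; a zero-padded stack
-        # buffer handles any trailing partial block.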
-        movl	156(%esp), %edx
-        cmpl	$0x00, %edx
-        je	L_AES_GCM_encrypt_avx2_calc_aad_done
-        xorl	%ecx, %ecx
-        cmpl	$16, %edx
-        jl	L_AES_GCM_encrypt_avx2_calc_aad_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_encrypt_avx2_calc_aad_16_loop:
-        vmovdqu	(%esi,%ecx,1), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm6, %xmm6
-        # ghash_gfmul_avx
-        vpclmulqdq	$16, %xmm6, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm6, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm6, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm7
-        vpxor	%xmm2, %xmm3, %xmm6
-        # ghash_mid
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm6, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm6, %xmm6
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm6, %xmm6
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm6, %xmm6
-        # ghash_red
-        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm6, %xmm6
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_encrypt_avx2_calc_aad_16_loop
-        movl	156(%esp), %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_encrypt_avx2_calc_aad_done
-L_AES_GCM_encrypt_avx2_calc_aad_lt16:
-        vpxor	%xmm0, %xmm0, %xmm0
-        xorl	%ebx, %ebx
-        vmovdqu	%xmm0, (%esp)
-L_AES_GCM_encrypt_avx2_calc_aad_loop:
-        movzbl	(%esi,%ecx,1), %eax
-        movb	%al, (%esp,%ebx,1)
-        incl	%ecx
-        incl	%ebx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_encrypt_avx2_calc_aad_loop
-        vmovdqu	(%esp), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm6, %xmm6
-        # ghash_gfmul_avx
-        vpclmulqdq	$16, %xmm6, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm6, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm6, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm7
-        vpxor	%xmm2, %xmm3, %xmm6
-        # ghash_mid
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm6, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm6, %xmm6
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm6, %xmm6
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm6, %xmm6
-        # ghash_red
-        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm6, %xmm6
-L_AES_GCM_encrypt_avx2_calc_aad_done:
-        movl	132(%esp), %esi
-        movl	136(%esp), %edi
-        # Calculate counter and H
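-        # Double H in GF(2^128): shift left one bit (vpsllq/vpsrlq/vpor)
-        # and, if the top bit fell off (sign-extended by vpshufd+vpsrad),
-        # xor in the reduction constant. Pre-shifting H lets the main-loop
-        # multiplies skip the one-bit ghash_mid shift. The counter is
-        # byte-swapped and bumped for the first block at the same time.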
-        vpsrlq	$63, %xmm5, %xmm1
-        vpsllq	$0x01, %xmm5, %xmm0
-        vpslldq	$8, %xmm1, %xmm1
-        vpor	%xmm1, %xmm0, %xmm0
-        vpshufd	$0xff, %xmm5, %xmm5
-        vpsrad	$31, %xmm5, %xmm5
-        vpshufb	L_aes_gcm_avx2_bswap_epi64, %xmm4, %xmm4
-        vpand	L_aes_gcm_avx2_mod2_128, %xmm5, %xmm5
-        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm4
-        vpxor	%xmm0, %xmm5, %xmm5
-        xorl	%ebx, %ebx
-        cmpl	$0x40, 152(%esp)
-        movl	152(%esp), %eax
-        jl	L_AES_GCM_encrypt_avx2_done_64
-        andl	$0xffffffc0, %eax
-        vmovdqu	%xmm4, 64(%esp)
-        vmovdqu	%xmm6, 96(%esp)
-        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm3
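-        # Precompute the powers of H (H^1 at (%esp), H^2 at 16(%esp),
-        # H^3 at 32(%esp), H^4 at 48(%esp)) so each 64-byte chunk needs
-        # only one reduction for four block multiplies.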
-        # H ^ 1
-        vmovdqu	%xmm5, (%esp)
-        vmovdqu	%xmm5, %xmm2
-        # H ^ 2
-        vpclmulqdq	$0x00, %xmm2, %xmm2, %xmm5
-        vpclmulqdq	$0x11, %xmm2, %xmm2, %xmm6
-        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
-        vpshufd	$0x4e, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
-        vpshufd	$0x4e, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm6, %xmm0
-        vmovdqu	%xmm0, 16(%esp)
-        # H ^ 3
-        # ghash_gfmul_red
-        vpclmulqdq	$16, %xmm0, %xmm2, %xmm6
-        vpclmulqdq	$0x01, %xmm0, %xmm2, %xmm5
-        vpclmulqdq	$0x00, %xmm0, %xmm2, %xmm4
-        vpxor	%xmm5, %xmm6, %xmm6
-        vpslldq	$8, %xmm6, %xmm5
-        vpsrldq	$8, %xmm6, %xmm6
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm0, %xmm2, %xmm1
-        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
-        vpshufd	$0x4e, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
-        vpshufd	$0x4e, %xmm5, %xmm5
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm1, %xmm1
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	%xmm1, 32(%esp)
-        # H ^ 4
-        vpclmulqdq	$0x00, %xmm0, %xmm0, %xmm5
-        vpclmulqdq	$0x11, %xmm0, %xmm0, %xmm6
-        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
-        vpshufd	$0x4e, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
-        vpshufd	$0x4e, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm6, %xmm2
-        vmovdqu	%xmm2, 48(%esp)
-        vmovdqu	96(%esp), %xmm6
-        # First 64 bytes of input
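-        # Encrypt the first four counter blocks (ctr..ctr+3) in parallel
-        # across %xmm0..%xmm3; the counter saved at 64(%esp) advances by 4.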
-        # aesenc_64
-        # aesenc_ctr
-        vmovdqu	64(%esp), %xmm4
-        vmovdqu	L_aes_gcm_avx2_bswap_epi64, %xmm7
-        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm1
-        vpshufb	%xmm7, %xmm4, %xmm0
-        vpaddd	L_aes_gcm_avx2_two, %xmm4, %xmm2
-        vpshufb	%xmm7, %xmm1, %xmm1
-        vpaddd	L_aes_gcm_avx2_three, %xmm4, %xmm3
-        vpshufb	%xmm7, %xmm2, %xmm2
-        vpaddd	L_aes_gcm_avx2_four, %xmm4, %xmm4
-        vpshufb	%xmm7, %xmm3, %xmm3
-        # aesenc_xor
-        vmovdqu	(%ebp), %xmm7
-        vmovdqu	%xmm4, 64(%esp)
-        vpxor	%xmm7, %xmm0, %xmm0
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm7, %xmm3, %xmm3
-        vmovdqu	16(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	32(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	48(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	64(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	80(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	96(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	112(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	128(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	144(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        cmpl	$11, 172(%esp)
-        vmovdqu	160(%ebp), %xmm7
-        jl	L_AES_GCM_encrypt_avx2_aesenc_64_enc_done
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	176(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        cmpl	$13, 172(%esp)
-        vmovdqu	192(%ebp), %xmm7
-        jl	L_AES_GCM_encrypt_avx2_aesenc_64_enc_done
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	208(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	224(%ebp), %xmm7
-L_AES_GCM_encrypt_avx2_aesenc_64_enc_done:
-        # aesenc_last
-        vaesenclast	%xmm7, %xmm0, %xmm0
-        vaesenclast	%xmm7, %xmm1, %xmm1
-        vaesenclast	%xmm7, %xmm2, %xmm2
-        vaesenclast	%xmm7, %xmm3, %xmm3
-        vmovdqu	(%esi), %xmm7
-        vmovdqu	16(%esi), %xmm4
-        vpxor	%xmm7, %xmm0, %xmm0
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	%xmm0, (%edi)
-        vmovdqu	%xmm1, 16(%edi)
-        vmovdqu	32(%esi), %xmm7
-        vmovdqu	48(%esi), %xmm4
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm4, %xmm3, %xmm3
-        vmovdqu	%xmm2, 32(%edi)
-        vmovdqu	%xmm3, 48(%edi)
-        cmpl	$0x40, %eax
-        movl	$0x40, %ebx
-        movl	%esi, %ecx
-        movl	%edi, %edx
-        jle	L_AES_GCM_encrypt_avx2_end_64
-        # More 64 bytes of input
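-        # Each iteration encrypts the next 64 bytes, then folds the
-        # previous iteration's ciphertext (read back from -64(%edx) up)
-        # into the hash via the aggregated H^4..H^1 multiplies with a
-        # single deferred reduction.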
-L_AES_GCM_encrypt_avx2_ghash_64:
-        # aesenc_64_ghash
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        # aesenc_64
-        # aesenc_ctr
-        vmovdqu	64(%esp), %xmm4
-        vmovdqu	L_aes_gcm_avx2_bswap_epi64, %xmm7
-        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm1
-        vpshufb	%xmm7, %xmm4, %xmm0
-        vpaddd	L_aes_gcm_avx2_two, %xmm4, %xmm2
-        vpshufb	%xmm7, %xmm1, %xmm1
-        vpaddd	L_aes_gcm_avx2_three, %xmm4, %xmm3
-        vpshufb	%xmm7, %xmm2, %xmm2
-        vpaddd	L_aes_gcm_avx2_four, %xmm4, %xmm4
-        vpshufb	%xmm7, %xmm3, %xmm3
-        # aesenc_xor
-        vmovdqu	(%ebp), %xmm7
-        vmovdqu	%xmm4, 64(%esp)
-        vpxor	%xmm7, %xmm0, %xmm0
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm7, %xmm3, %xmm3
-        vmovdqu	16(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	32(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	48(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	64(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	80(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	96(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	112(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	128(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	144(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        cmpl	$11, 172(%esp)
-        vmovdqu	160(%ebp), %xmm7
-        jl	L_AES_GCM_encrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	176(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        cmpl	$13, 172(%esp)
-        vmovdqu	192(%ebp), %xmm7
-        jl	L_AES_GCM_encrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	208(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	224(%ebp), %xmm7
-L_AES_GCM_encrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done:
-        # aesenc_last
-        vaesenclast	%xmm7, %xmm0, %xmm0
-        vaesenclast	%xmm7, %xmm1, %xmm1
-        vaesenclast	%xmm7, %xmm2, %xmm2
-        vaesenclast	%xmm7, %xmm3, %xmm3
-        vmovdqu	(%ecx), %xmm7
-        vmovdqu	16(%ecx), %xmm4
-        vpxor	%xmm7, %xmm0, %xmm0
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	%xmm0, (%edx)
-        vmovdqu	%xmm1, 16(%edx)
-        vmovdqu	32(%ecx), %xmm7
-        vmovdqu	48(%ecx), %xmm4
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm4, %xmm3, %xmm3
-        vmovdqu	%xmm2, 32(%edx)
-        vmovdqu	%xmm3, 48(%edx)
-        # pclmul_1
-        vmovdqu	-64(%edx), %xmm1
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
-        vmovdqu	48(%esp), %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm5
-        vpclmulqdq	$0x01, %xmm2, %xmm1, %xmm3
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm6
-        vpclmulqdq	$0x11, %xmm2, %xmm1, %xmm7
-        # pclmul_2
-        vmovdqu	-48(%edx), %xmm1
-        vmovdqu	32(%esp), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm7, %xmm7
-        # pclmul_n
-        vmovdqu	-32(%edx), %xmm1
-        vmovdqu	16(%esp), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm7, %xmm7
-        # pclmul_n
-        vmovdqu	-16(%edx), %xmm1
-        vmovdqu	(%esp), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm7, %xmm7
-        # aesenc_pclmul_l
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpslldq	$8, %xmm5, %xmm1
-        vpsrldq	$8, %xmm5, %xmm5
-        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm0
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpxor	%xmm5, %xmm7, %xmm7
-        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
-        vpshufd	$0x4e, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
-        vpshufd	$0x4e, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm7, %xmm6, %xmm6
-        # aesenc_64_ghash - end
-        addl	$0x40, %ebx
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_encrypt_avx2_ghash_64
-L_AES_GCM_encrypt_avx2_end_64:
-        vmovdqu	%xmm6, 96(%esp)
-        vmovdqu	48(%edx), %xmm3
-        vmovdqu	(%esp), %xmm7
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm3, %xmm3
-        vpclmulqdq	$16, %xmm3, %xmm7, %xmm5
-        vpclmulqdq	$0x01, %xmm3, %xmm7, %xmm1
-        vpclmulqdq	$0x00, %xmm3, %xmm7, %xmm4
-        vpclmulqdq	$0x11, %xmm3, %xmm7, %xmm6
-        vpxor	%xmm1, %xmm5, %xmm5
-        vmovdqu	32(%edx), %xmm3
-        vmovdqu	16(%esp), %xmm7
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm3, %xmm3
-        vpclmulqdq	$16, %xmm3, %xmm7, %xmm2
-        vpclmulqdq	$0x01, %xmm3, %xmm7, %xmm1
-        vpclmulqdq	$0x00, %xmm3, %xmm7, %xmm0
-        vpclmulqdq	$0x11, %xmm3, %xmm7, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm4, %xmm4
-        vmovdqu	16(%edx), %xmm3
-        vmovdqu	32(%esp), %xmm7
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm3, %xmm3
-        vpclmulqdq	$16, %xmm3, %xmm7, %xmm2
-        vpclmulqdq	$0x01, %xmm3, %xmm7, %xmm1
-        vpclmulqdq	$0x00, %xmm3, %xmm7, %xmm0
-        vpclmulqdq	$0x11, %xmm3, %xmm7, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm4, %xmm4
-        vmovdqu	96(%esp), %xmm0
-        vmovdqu	(%edx), %xmm3
-        vmovdqu	48(%esp), %xmm7
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm3, %xmm3
-        vpxor	%xmm0, %xmm3, %xmm3
-        vpclmulqdq	$16, %xmm3, %xmm7, %xmm2
-        vpclmulqdq	$0x01, %xmm3, %xmm7, %xmm1
-        vpclmulqdq	$0x00, %xmm3, %xmm7, %xmm0
-        vpclmulqdq	$0x11, %xmm3, %xmm7, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpslldq	$8, %xmm5, %xmm7
-        vpsrldq	$8, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpxor	%xmm5, %xmm6, %xmm6
-        # ghash_red
-        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm4, %xmm0
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm6, %xmm6
-        vmovdqu	(%esp), %xmm5
-        vmovdqu	64(%esp), %xmm4
-L_AES_GCM_encrypt_avx2_done_64:
-        cmpl	152(%esp), %ebx
-        je	L_AES_GCM_encrypt_avx2_done_enc
-        movl	152(%esp), %eax
-        andl	$0xfffffff0, %eax
-        cmpl	%eax, %ebx
-        jge	L_AES_GCM_encrypt_avx2_last_block_done
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        # aesenc_block
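-        # Remaining full blocks: encrypt one counter block and xor with the
-        # plaintext; the ciphertext is byte-swapped into the hash state.
-        # Further blocks use aesenc_gfmul_sb below, which interleaves the
-        # GHASH multiply of the previous ciphertext with the AES rounds.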
-        vmovdqu	%xmm4, %xmm1
-        vpshufb	L_aes_gcm_avx2_bswap_epi64, %xmm1, %xmm0
-        vpaddd	L_aes_gcm_avx2_one, %xmm1, %xmm1
-        vpxor	(%ebp), %xmm0, %xmm0
-        vaesenc	16(%ebp), %xmm0, %xmm0
-        vaesenc	32(%ebp), %xmm0, %xmm0
-        vaesenc	48(%ebp), %xmm0, %xmm0
-        vaesenc	64(%ebp), %xmm0, %xmm0
-        vaesenc	80(%ebp), %xmm0, %xmm0
-        vaesenc	96(%ebp), %xmm0, %xmm0
-        vaesenc	112(%ebp), %xmm0, %xmm0
-        vaesenc	128(%ebp), %xmm0, %xmm0
-        vaesenc	144(%ebp), %xmm0, %xmm0
-        cmpl	$11, 172(%esp)
-        vmovdqu	160(%ebp), %xmm2
-        jl	L_AES_GCM_encrypt_avx2_aesenc_block_aesenc_avx_last
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vaesenc	176(%ebp), %xmm0, %xmm0
-        cmpl	$13, 172(%esp)
-        vmovdqu	192(%ebp), %xmm2
-        jl	L_AES_GCM_encrypt_avx2_aesenc_block_aesenc_avx_last
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vaesenc	208(%ebp), %xmm0, %xmm0
-        vmovdqu	224(%ebp), %xmm2
-L_AES_GCM_encrypt_avx2_aesenc_block_aesenc_avx_last:
-        vaesenclast	%xmm2, %xmm0, %xmm0
-        vmovdqu	%xmm1, %xmm4
-        vmovdqu	(%ecx), %xmm1
-        vpxor	%xmm1, %xmm0, %xmm0
-        vmovdqu	%xmm0, (%edx)
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm6, %xmm6
-        addl	$16, %ebx
-        cmpl	%eax, %ebx
-        jge	L_AES_GCM_encrypt_avx2_last_block_ghash
-L_AES_GCM_encrypt_avx2_last_block_start:
-        vpshufb	L_aes_gcm_avx2_bswap_epi64, %xmm4, %xmm7
-        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm4
-        vmovdqu	%xmm4, 64(%esp)
-        # aesenc_gfmul_sb
-        vpclmulqdq	$0x01, %xmm5, %xmm6, %xmm2
-        vpclmulqdq	$16, %xmm5, %xmm6, %xmm3
-        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm1
-        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm4
-        vpxor	(%ebp), %xmm7, %xmm7
-        vaesenc	16(%ebp), %xmm7, %xmm7
-        vpxor	%xmm2, %xmm3, %xmm3
-        vpslldq	$8, %xmm3, %xmm2
-        vpsrldq	$8, %xmm3, %xmm3
-        vaesenc	32(%ebp), %xmm7, %xmm7
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm2, %xmm1
-        vaesenc	48(%ebp), %xmm7, %xmm7
-        vaesenc	64(%ebp), %xmm7, %xmm7
-        vaesenc	80(%ebp), %xmm7, %xmm7
-        vpshufd	$0x4e, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm2, %xmm1
-        vaesenc	96(%ebp), %xmm7, %xmm7
-        vaesenc	112(%ebp), %xmm7, %xmm7
-        vaesenc	128(%ebp), %xmm7, %xmm7
-        vpshufd	$0x4e, %xmm2, %xmm2
-        vaesenc	144(%ebp), %xmm7, %xmm7
-        vpxor	%xmm3, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm2, %xmm2
-        vmovdqu	160(%ebp), %xmm0
-        cmpl	$11, 172(%esp)
-        jl	L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vaesenc	176(%ebp), %xmm7, %xmm7
-        vmovdqu	192(%ebp), %xmm0
-        cmpl	$13, 172(%esp)
-        jl	L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vaesenc	208(%ebp), %xmm7, %xmm7
-        vmovdqu	224(%ebp), %xmm0
-L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last:
-        vaesenclast	%xmm0, %xmm7, %xmm7
-        vmovdqu	(%esi,%ebx,1), %xmm3
-        vpxor	%xmm1, %xmm2, %xmm6
-        vpxor	%xmm3, %xmm7, %xmm7
-        vmovdqu	%xmm7, (%edi,%ebx,1)
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm7, %xmm7
-        vpxor	%xmm7, %xmm6, %xmm6
-        vmovdqu	64(%esp), %xmm4
-        addl	$16, %ebx
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_encrypt_avx2_last_block_start
-L_AES_GCM_encrypt_avx2_last_block_ghash:
-        # ghash_gfmul_red
-        vpclmulqdq	$16, %xmm5, %xmm6, %xmm2
-        vpclmulqdq	$0x01, %xmm5, %xmm6, %xmm1
-        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm0
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm6
-        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm6, %xmm6
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpxor	%xmm0, %xmm6, %xmm6
-L_AES_GCM_encrypt_avx2_last_block_done:
-        movl	152(%esp), %ecx
-        movl	152(%esp), %edx
-        andl	$15, %ecx
-        jz	L_AES_GCM_encrypt_avx2_done_enc
-        # aesenc_last15_enc
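-        # Trailing partial block: encrypt the last counter, xor it in
-        # byte-by-byte, and keep a zero-padded copy of the ciphertext at
-        # 16(%esp) for the closing GHASH update.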
-        vpshufb	L_aes_gcm_avx2_bswap_epi64, %xmm4, %xmm4
-        vpxor	(%ebp), %xmm4, %xmm4
-        vaesenc	16(%ebp), %xmm4, %xmm4
-        vaesenc	32(%ebp), %xmm4, %xmm4
-        vaesenc	48(%ebp), %xmm4, %xmm4
-        vaesenc	64(%ebp), %xmm4, %xmm4
-        vaesenc	80(%ebp), %xmm4, %xmm4
-        vaesenc	96(%ebp), %xmm4, %xmm4
-        vaesenc	112(%ebp), %xmm4, %xmm4
-        vaesenc	128(%ebp), %xmm4, %xmm4
-        vaesenc	144(%ebp), %xmm4, %xmm4
-        cmpl	$11, 172(%esp)
-        vmovdqu	160(%ebp), %xmm0
-        jl	L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last
-        vaesenc	%xmm0, %xmm4, %xmm4
-        vaesenc	176(%ebp), %xmm4, %xmm4
-        cmpl	$13, 172(%esp)
-        vmovdqu	192(%ebp), %xmm0
-        jl	L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last
-        vaesenc	%xmm0, %xmm4, %xmm4
-        vaesenc	208(%ebp), %xmm4, %xmm4
-        vmovdqu	224(%ebp), %xmm0
-L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last:
-        vaesenclast	%xmm0, %xmm4, %xmm4
-        xorl	%ecx, %ecx
-        vpxor	%xmm0, %xmm0, %xmm0
-        vmovdqu	%xmm4, (%esp)
-        vmovdqu	%xmm0, 16(%esp)
-L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_loop:
-        movzbl	(%esi,%ebx,1), %eax
-        xorb	(%esp,%ecx,1), %al
-        movb	%al, 16(%esp,%ecx,1)
-        movb	%al, (%edi,%ebx,1)
-        incl	%ebx
-        incl	%ecx
-        cmpl	%edx, %ebx
-        jl	L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_loop
-L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_finish_enc:
-        vmovdqu	16(%esp), %xmm4
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm6, %xmm6
-        # ghash_gfmul_red
-        vpclmulqdq	$16, %xmm5, %xmm6, %xmm2
-        vpclmulqdq	$0x01, %xmm5, %xmm6, %xmm1
-        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm0
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm6
-        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm6, %xmm6
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpxor	%xmm0, %xmm6, %xmm6
-L_AES_GCM_encrypt_avx2_done_enc:
-        vmovdqu	80(%esp), %xmm7
-        # calc_tag
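-        # Tag: fold the bit lengths of plaintext and AAD into the hash,
-        # run one last multiply-and-reduce by H, byte-swap, and xor with
-        # E_K(J0) saved at 80(%esp).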
-        movl	152(%esp), %ecx
-        shll	$3, %ecx
-        vpinsrd	$0x00, %ecx, %xmm0, %xmm0
-        movl	156(%esp), %ecx
-        shll	$3, %ecx
-        vpinsrd	$2, %ecx, %xmm0, %xmm0
-        movl	152(%esp), %ecx
-        shrl	$29, %ecx
-        vpinsrd	$0x01, %ecx, %xmm0, %xmm0
-        movl	156(%esp), %ecx
-        shrl	$29, %ecx
-        vpinsrd	$3, %ecx, %xmm0, %xmm0
-        vpxor	%xmm6, %xmm0, %xmm0
-        # ghash_gfmul_red
-        vpclmulqdq	$16, %xmm5, %xmm0, %xmm4
-        vpclmulqdq	$0x01, %xmm5, %xmm0, %xmm3
-        vpclmulqdq	$0x00, %xmm5, %xmm0, %xmm2
-        vpxor	%xmm3, %xmm4, %xmm4
-        vpslldq	$8, %xmm4, %xmm3
-        vpsrldq	$8, %xmm4, %xmm4
-        vpxor	%xmm2, %xmm3, %xmm3
-        vpclmulqdq	$0x11, %xmm5, %xmm0, %xmm0
-        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm3, %xmm2
-        vpshufd	$0x4e, %xmm3, %xmm3
-        vpxor	%xmm2, %xmm3, %xmm3
-        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm3, %xmm2
-        vpshufd	$0x4e, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm0, %xmm0
-        vpxor	%xmm3, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm7, %xmm0, %xmm0
-        movl	148(%esp), %edi
-        movl	164(%esp), %ebx
-        # store_tag
-        cmpl	$16, %ebx
-        je	L_AES_GCM_encrypt_avx2_store_tag_16
-        xorl	%ecx, %ecx
-        vmovdqu	%xmm0, (%esp)
-L_AES_GCM_encrypt_avx2_store_tag_loop:
-        movzbl	(%esp,%ecx,1), %eax
-        movb	%al, (%edi,%ecx,1)
-        incl	%ecx
-        cmpl	%ebx, %ecx
-        jne	L_AES_GCM_encrypt_avx2_store_tag_loop
-        jmp	L_AES_GCM_encrypt_avx2_store_tag_done
-L_AES_GCM_encrypt_avx2_store_tag_16:
-        vmovdqu	%xmm0, (%edi)
-L_AES_GCM_encrypt_avx2_store_tag_done:
-        addl	$0x70, %esp
-        popl	%ebp
-        popl	%edi
-        popl	%esi
-        popl	%ebx
-        ret
-.size	AES_GCM_encrypt_avx2,.-AES_GCM_encrypt_avx2
-.text
-.globl	AES_GCM_decrypt_avx2
-.type	AES_GCM_decrypt_avx2,@function
-.align	16
-AES_GCM_decrypt_avx2:
-        pushl	%ebx
-        pushl	%esi
-        pushl	%edi
-        pushl	%ebp
-        subl	$0xb0, %esp
-        movl	208(%esp), %esi
-        movl	232(%esp), %ebp
-        vpxor	%xmm4, %xmm4, %xmm4
-        movl	224(%esp), %edx
-        cmpl	$12, %edx
-        je	L_AES_GCM_decrypt_avx2_iv_12
-        # Calculate values when IV is not 12 bytes
-        # H = Encrypt X(=0)
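-        # IV handling below mirrors AES_GCM_encrypt_avx2: H = E_K(0) and
-        # the counter is derived by ghashing the non-12-byte IV.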
-        vmovdqu	(%ebp), %xmm5
-        vaesenc	16(%ebp), %xmm5, %xmm5
-        vaesenc	32(%ebp), %xmm5, %xmm5
-        vaesenc	48(%ebp), %xmm5, %xmm5
-        vaesenc	64(%ebp), %xmm5, %xmm5
-        vaesenc	80(%ebp), %xmm5, %xmm5
-        vaesenc	96(%ebp), %xmm5, %xmm5
-        vaesenc	112(%ebp), %xmm5, %xmm5
-        vaesenc	128(%ebp), %xmm5, %xmm5
-        vaesenc	144(%ebp), %xmm5, %xmm5
-        cmpl	$11, 236(%esp)
-        vmovdqu	160(%ebp), %xmm0
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	176(%ebp), %xmm5, %xmm5
-        cmpl	$13, 236(%esp)
-        vmovdqu	192(%ebp), %xmm0
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	208(%ebp), %xmm5, %xmm5
-        vmovdqu	224(%ebp), %xmm0
-L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last:
-        vaesenclast	%xmm0, %xmm5, %xmm5
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm5, %xmm5
-        # Calc counter
-        # Initialization vector
-        cmpl	$0x00, %edx
-        movl	$0x00, %ecx
-        je	L_AES_GCM_decrypt_avx2_calc_iv_done
-        cmpl	$16, %edx
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_decrypt_avx2_calc_iv_16_loop:
-        vmovdqu	(%esi,%ecx,1), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm7
-        vpxor	%xmm2, %xmm3, %xmm4
-        # ghash_mid
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm4, %xmm4
-        # ghash_red
-        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm4, %xmm4
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_16_loop
-        movl	224(%esp), %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_decrypt_avx2_calc_iv_done
-L_AES_GCM_decrypt_avx2_calc_iv_lt16:
-        vpxor	%xmm0, %xmm0, %xmm0
-        xorl	%ebx, %ebx
-        vmovdqu	%xmm0, (%esp)
-L_AES_GCM_decrypt_avx2_calc_iv_loop:
-        movzbl	(%esi,%ecx,1), %eax
-        movb	%al, (%esp,%ebx,1)
-        incl	%ecx
-        incl	%ebx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_loop
-        vmovdqu	(%esp), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm7
-        vpxor	%xmm2, %xmm3, %xmm4
-        # ghash_mid
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm4, %xmm4
-        # ghash_red
-        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm4, %xmm4
-L_AES_GCM_decrypt_avx2_calc_iv_done:
-        # T = Encrypt counter
-        vpxor	%xmm0, %xmm0, %xmm0
-        shll	$3, %edx
-        vpinsrd	$0x00, %edx, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm7
-        vpxor	%xmm2, %xmm3, %xmm4
-        # ghash_mid
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm4, %xmm4
-        # ghash_red
-        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm4, %xmm4
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm4, %xmm4
-        #   Encrypt counter
-        vmovdqu	(%ebp), %xmm6
-        vpxor	%xmm4, %xmm6, %xmm6
-        vaesenc	16(%ebp), %xmm6, %xmm6
-        vaesenc	32(%ebp), %xmm6, %xmm6
-        vaesenc	48(%ebp), %xmm6, %xmm6
-        vaesenc	64(%ebp), %xmm6, %xmm6
-        vaesenc	80(%ebp), %xmm6, %xmm6
-        vaesenc	96(%ebp), %xmm6, %xmm6
-        vaesenc	112(%ebp), %xmm6, %xmm6
-        vaesenc	128(%ebp), %xmm6, %xmm6
-        vaesenc	144(%ebp), %xmm6, %xmm6
-        cmpl	$11, 236(%esp)
-        vmovdqu	160(%ebp), %xmm0
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last
-        vaesenc	%xmm0, %xmm6, %xmm6
-        vaesenc	176(%ebp), %xmm6, %xmm6
-        cmpl	$13, 236(%esp)
-        vmovdqu	192(%ebp), %xmm0
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last
-        vaesenc	%xmm0, %xmm6, %xmm6
-        vaesenc	208(%ebp), %xmm6, %xmm6
-        vmovdqu	224(%ebp), %xmm0
-L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last:
-        vaesenclast	%xmm0, %xmm6, %xmm6
-        jmp	L_AES_GCM_decrypt_avx2_iv_done
-L_AES_GCM_decrypt_avx2_iv_12:
-        # Calculate values when IV is 12 bytes
-        # Set counter based on IV
-        vmovdqu	L_avx2_aes_gcm_bswap_one, %xmm4
-        vmovdqu	(%ebp), %xmm5
-        vpblendd	$7, (%esi), %xmm4, %xmm4
-        # H = Encrypt X(=0) and T = Encrypt counter
-        vmovdqu	16(%ebp), %xmm7
-        vpxor	%xmm5, %xmm4, %xmm6
-        vaesenc	%xmm7, %xmm5, %xmm5
-        vaesenc	%xmm7, %xmm6, %xmm6
-        vmovdqu	32(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm6, %xmm6
-        vmovdqu	48(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm6, %xmm6
-        vmovdqu	64(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm6, %xmm6
-        vmovdqu	80(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm6, %xmm6
-        vmovdqu	96(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm6, %xmm6
-        vmovdqu	112(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm6, %xmm6
-        vmovdqu	128(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm6, %xmm6
-        vmovdqu	144(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm6, %xmm6
-        cmpl	$11, 236(%esp)
-        vmovdqu	160(%ebp), %xmm0
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_12_last
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm6, %xmm6
-        vmovdqu	176(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm6, %xmm6
-        cmpl	$13, 236(%esp)
-        vmovdqu	192(%ebp), %xmm0
-        jl	L_AES_GCM_decrypt_avx2_calc_iv_12_last
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm6, %xmm6
-        vmovdqu	208(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm6, %xmm6
-        vmovdqu	224(%ebp), %xmm0
-L_AES_GCM_decrypt_avx2_calc_iv_12_last:
-        vaesenclast	%xmm0, %xmm5, %xmm5
-        vaesenclast	%xmm0, %xmm6, %xmm6
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm5, %xmm5
-L_AES_GCM_decrypt_avx2_iv_done:
-        vmovdqu	%xmm6, 80(%esp)
-        vpxor	%xmm6, %xmm6, %xmm6
-        movl	204(%esp), %esi
-        # Additional authentication data
-        movl	220(%esp), %edx
-        cmpl	$0x00, %edx
-        je	L_AES_GCM_decrypt_avx2_calc_aad_done
-        xorl	%ecx, %ecx
-        cmpl	$16, %edx
-        jl	L_AES_GCM_decrypt_avx2_calc_aad_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_decrypt_avx2_calc_aad_16_loop:
-        vmovdqu	(%esi,%ecx,1), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm6, %xmm6
-        # ghash_gfmul_avx
-        vpclmulqdq	$16, %xmm6, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm6, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm6, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm7
-        vpxor	%xmm2, %xmm3, %xmm6
-        # ghash_mid
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm6, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm6, %xmm6
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm6, %xmm6
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm6, %xmm6
-        # ghash_red
-        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm6, %xmm6
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_decrypt_avx2_calc_aad_16_loop
-        movl	220(%esp), %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_decrypt_avx2_calc_aad_done
-L_AES_GCM_decrypt_avx2_calc_aad_lt16:
-        vpxor	%xmm0, %xmm0, %xmm0
-        xorl	%ebx, %ebx
-        vmovdqu	%xmm0, (%esp)
-L_AES_GCM_decrypt_avx2_calc_aad_loop:
-        movzbl	(%esi,%ecx,1), %eax
-        movb	%al, (%esp,%ebx,1)
-        incl	%ecx
-        incl	%ebx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_decrypt_avx2_calc_aad_loop
-        vmovdqu	(%esp), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm6, %xmm6
-        # ghash_gfmul_avx
-        vpclmulqdq	$16, %xmm6, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm6, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm6, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm7
-        vpxor	%xmm2, %xmm3, %xmm6
-        # ghash_mid
-        vpsrld	$31, %xmm7, %xmm0
-        vpsrld	$31, %xmm6, %xmm1
-        vpslld	$0x01, %xmm7, %xmm7
-        vpslld	$0x01, %xmm6, %xmm6
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm6, %xmm6
-        vpor	%xmm0, %xmm7, %xmm7
-        vpor	%xmm1, %xmm6, %xmm6
-        # ghash_red
-        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
-        vpshufd	$0x4e, %xmm7, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm6, %xmm6
-L_AES_GCM_decrypt_avx2_calc_aad_done:
-        movl	196(%esp), %esi
-        movl	200(%esp), %edi
-        # Calculate counter and H
-        vpsrlq	$63, %xmm5, %xmm1
-        vpsllq	$0x01, %xmm5, %xmm0
-        vpslldq	$8, %xmm1, %xmm1
-        vpor	%xmm1, %xmm0, %xmm0
-        vpshufd	$0xff, %xmm5, %xmm5
-        vpsrad	$31, %xmm5, %xmm5
-        vpshufb	L_aes_gcm_avx2_bswap_epi64, %xmm4, %xmm4
-        vpand	L_aes_gcm_avx2_mod2_128, %xmm5, %xmm5
-        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm4
-        vpxor	%xmm0, %xmm5, %xmm5
-        xorl	%ebx, %ebx
-        cmpl	$0x40, 216(%esp)
-        movl	216(%esp), %eax
-        jl	L_AES_GCM_decrypt_avx2_done_64
-        andl	$0xffffffc0, %eax
-        vmovdqu	%xmm4, 64(%esp)
-        vmovdqu	%xmm6, 96(%esp)
-        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm3
-        # H ^ 1
-        vmovdqu	%xmm5, (%esp)
-        vmovdqu	%xmm5, %xmm2
-        # H ^ 2
-        vpclmulqdq	$0x00, %xmm2, %xmm2, %xmm5
-        vpclmulqdq	$0x11, %xmm2, %xmm2, %xmm6
-        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
-        vpshufd	$0x4e, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
-        vpshufd	$0x4e, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm6, %xmm0
-        vmovdqu	%xmm0, 16(%esp)
-        # H ^ 3
-        # ghash_gfmul_red
-        vpclmulqdq	$16, %xmm0, %xmm2, %xmm6
-        vpclmulqdq	$0x01, %xmm0, %xmm2, %xmm5
-        vpclmulqdq	$0x00, %xmm0, %xmm2, %xmm4
-        vpxor	%xmm5, %xmm6, %xmm6
-        vpslldq	$8, %xmm6, %xmm5
-        vpsrldq	$8, %xmm6, %xmm6
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm0, %xmm2, %xmm1
-        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
-        vpshufd	$0x4e, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
-        vpshufd	$0x4e, %xmm5, %xmm5
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm1, %xmm1
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	%xmm1, 32(%esp)
-        # H ^ 4
-        vpclmulqdq	$0x00, %xmm0, %xmm0, %xmm5
-        vpclmulqdq	$0x11, %xmm0, %xmm0, %xmm6
-        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
-        vpshufd	$0x4e, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
-        vpshufd	$0x4e, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm6, %xmm2
-        vmovdqu	%xmm2, 48(%esp)
-        vmovdqu	96(%esp), %xmm6
-        cmpl	%esi, %edi
-        jne	L_AES_GCM_decrypt_avx2_ghash_64
-L_AES_GCM_decrypt_avx2_ghash_64_inplace:
-        # aesenc_64_ghash
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        # aesenc_64
-        # aesenc_ctr
-        vmovdqu	64(%esp), %xmm4
-        vmovdqu	L_aes_gcm_avx2_bswap_epi64, %xmm7
-        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm1
-        vpshufb	%xmm7, %xmm4, %xmm0
-        vpaddd	L_aes_gcm_avx2_two, %xmm4, %xmm2
-        vpshufb	%xmm7, %xmm1, %xmm1
-        vpaddd	L_aes_gcm_avx2_three, %xmm4, %xmm3
-        vpshufb	%xmm7, %xmm2, %xmm2
-        vpaddd	L_aes_gcm_avx2_four, %xmm4, %xmm4
-        vpshufb	%xmm7, %xmm3, %xmm3
-        # aesenc_xor
-        vmovdqu	(%ebp), %xmm7
-        vmovdqu	%xmm4, 64(%esp)
-        vpxor	%xmm7, %xmm0, %xmm0
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm7, %xmm3, %xmm3
-        vmovdqu	16(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	32(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	48(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	64(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	80(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	96(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	112(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	128(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	144(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        cmpl	$11, 236(%esp)
-        vmovdqu	160(%ebp), %xmm7
-        jl	L_AES_GCM_decrypt_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	176(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        cmpl	$13, 236(%esp)
-        vmovdqu	192(%ebp), %xmm7
-        jl	L_AES_GCM_decrypt_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	208(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	224(%ebp), %xmm7
-L_AES_GCM_decrypt_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done:
-        # aesenc_last
-        vaesenclast	%xmm7, %xmm0, %xmm0
-        vaesenclast	%xmm7, %xmm1, %xmm1
-        vaesenclast	%xmm7, %xmm2, %xmm2
-        vaesenclast	%xmm7, %xmm3, %xmm3
-        vmovdqu	(%ecx), %xmm7
-        vmovdqu	16(%ecx), %xmm4
-        vpxor	%xmm7, %xmm0, %xmm0
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	%xmm7, 112(%esp)
-        vmovdqu	%xmm4, 128(%esp)
-        vmovdqu	%xmm0, (%edx)
-        vmovdqu	%xmm1, 16(%edx)
-        vmovdqu	32(%ecx), %xmm7
-        vmovdqu	48(%ecx), %xmm4
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm4, %xmm3, %xmm3
-        vmovdqu	%xmm7, 144(%esp)
-        vmovdqu	%xmm4, 160(%esp)
-        vmovdqu	%xmm2, 32(%edx)
-        vmovdqu	%xmm3, 48(%edx)
-        # pclmul_1
-        vmovdqu	112(%esp), %xmm1
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
-        vmovdqu	48(%esp), %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm5
-        vpclmulqdq	$0x01, %xmm2, %xmm1, %xmm3
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm6
-        vpclmulqdq	$0x11, %xmm2, %xmm1, %xmm7
-        # pclmul_2
-        vmovdqu	128(%esp), %xmm1
-        vmovdqu	32(%esp), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm7, %xmm7
-        # pclmul_n
-        vmovdqu	144(%esp), %xmm1
-        vmovdqu	16(%esp), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm7, %xmm7
-        # pclmul_n
-        vmovdqu	160(%esp), %xmm1
-        vmovdqu	(%esp), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm7, %xmm7
-        # aesenc_pclmul_l
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpslldq	$8, %xmm5, %xmm1
-        vpsrldq	$8, %xmm5, %xmm5
-        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm0
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpxor	%xmm5, %xmm7, %xmm7
-        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
-        vpshufd	$0x4e, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
-        vpshufd	$0x4e, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm7, %xmm6, %xmm6
-        # aesenc_64_ghash - end
-        addl	$0x40, %ebx
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_decrypt_avx2_ghash_64_inplace
-        jmp	L_AES_GCM_decrypt_avx2_ghash_64_done
-L_AES_GCM_decrypt_avx2_ghash_64:
-        # aesenc_64_ghash
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        # aesenc_64
-        # aesenc_ctr
-        vmovdqu	64(%esp), %xmm4
-        vmovdqu	L_aes_gcm_avx2_bswap_epi64, %xmm7
-        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm1
-        vpshufb	%xmm7, %xmm4, %xmm0
-        vpaddd	L_aes_gcm_avx2_two, %xmm4, %xmm2
-        vpshufb	%xmm7, %xmm1, %xmm1
-        vpaddd	L_aes_gcm_avx2_three, %xmm4, %xmm3
-        vpshufb	%xmm7, %xmm2, %xmm2
-        vpaddd	L_aes_gcm_avx2_four, %xmm4, %xmm4
-        vpshufb	%xmm7, %xmm3, %xmm3
-        # aesenc_xor
-        vmovdqu	(%ebp), %xmm7
-        vmovdqu	%xmm4, 64(%esp)
-        vpxor	%xmm7, %xmm0, %xmm0
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm7, %xmm3, %xmm3
-        vmovdqu	16(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	32(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	48(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	64(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	80(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	96(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	112(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	128(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	144(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        cmpl	$11, 236(%esp)
-        vmovdqu	160(%ebp), %xmm7
-        jl	L_AES_GCM_decrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	176(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        cmpl	$13, 236(%esp)
-        vmovdqu	192(%ebp), %xmm7
-        jl	L_AES_GCM_decrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	208(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	224(%ebp), %xmm7
-L_AES_GCM_decrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done:
-        # aesenc_last
-        vaesenclast	%xmm7, %xmm0, %xmm0
-        vaesenclast	%xmm7, %xmm1, %xmm1
-        vaesenclast	%xmm7, %xmm2, %xmm2
-        vaesenclast	%xmm7, %xmm3, %xmm3
-        vmovdqu	(%ecx), %xmm7
-        vmovdqu	16(%ecx), %xmm4
-        vpxor	%xmm7, %xmm0, %xmm0
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	%xmm7, (%ecx)
-        vmovdqu	%xmm4, 16(%ecx)
-        vmovdqu	%xmm0, (%edx)
-        vmovdqu	%xmm1, 16(%edx)
-        vmovdqu	32(%ecx), %xmm7
-        vmovdqu	48(%ecx), %xmm4
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm4, %xmm3, %xmm3
-        vmovdqu	%xmm7, 32(%ecx)
-        vmovdqu	%xmm4, 48(%ecx)
-        vmovdqu	%xmm2, 32(%edx)
-        vmovdqu	%xmm3, 48(%edx)
-        # pclmul_1
-        vmovdqu	(%ecx), %xmm1
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
-        vmovdqu	48(%esp), %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm5
-        vpclmulqdq	$0x01, %xmm2, %xmm1, %xmm3
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm6
-        vpclmulqdq	$0x11, %xmm2, %xmm1, %xmm7
-        # pclmul_2
-        vmovdqu	16(%ecx), %xmm1
-        vmovdqu	32(%esp), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm7, %xmm7
-        # pclmul_n
-        vmovdqu	32(%ecx), %xmm1
-        vmovdqu	16(%esp), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm7, %xmm7
-        # pclmul_n
-        vmovdqu	48(%ecx), %xmm1
-        vmovdqu	(%esp), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm7, %xmm7
-        # aesenc_pclmul_l
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpslldq	$8, %xmm5, %xmm1
-        vpsrldq	$8, %xmm5, %xmm5
-        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm0
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpxor	%xmm5, %xmm7, %xmm7
-        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
-        vpshufd	$0x4e, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
-        vpshufd	$0x4e, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm7, %xmm6, %xmm6
-        # aesenc_64_ghash - end
-        addl	$0x40, %ebx
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_decrypt_avx2_ghash_64
-L_AES_GCM_decrypt_avx2_ghash_64_done:
-        vmovdqu	(%esp), %xmm5
-        vmovdqu	64(%esp), %xmm4
-L_AES_GCM_decrypt_avx2_done_64:
-        cmpl	216(%esp), %ebx
-        jge	L_AES_GCM_decrypt_avx2_done_dec
-        movl	216(%esp), %eax
-        andl	$0xfffffff0, %eax
-        cmpl	%eax, %ebx
-        jge	L_AES_GCM_decrypt_avx2_last_block_done
-L_AES_GCM_decrypt_avx2_last_block_start:
-        vmovdqu	(%esi,%ebx,1), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_epi64, %xmm4, %xmm7
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
-        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm4
-        vmovdqu	%xmm4, 64(%esp)
-        vpxor	%xmm6, %xmm0, %xmm4
-        # aesenc_gfmul_sb
-        vpclmulqdq	$0x01, %xmm5, %xmm4, %xmm2
-        vpclmulqdq	$16, %xmm5, %xmm4, %xmm3
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm1
-        vpclmulqdq	$0x11, %xmm5, %xmm4, %xmm4
-        vpxor	(%ebp), %xmm7, %xmm7
-        vaesenc	16(%ebp), %xmm7, %xmm7
-        vpxor	%xmm2, %xmm3, %xmm3
-        vpslldq	$8, %xmm3, %xmm2
-        vpsrldq	$8, %xmm3, %xmm3
-        vaesenc	32(%ebp), %xmm7, %xmm7
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm2, %xmm1
-        vaesenc	48(%ebp), %xmm7, %xmm7
-        vaesenc	64(%ebp), %xmm7, %xmm7
-        vaesenc	80(%ebp), %xmm7, %xmm7
-        vpshufd	$0x4e, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm2, %xmm1
-        vaesenc	96(%ebp), %xmm7, %xmm7
-        vaesenc	112(%ebp), %xmm7, %xmm7
-        vaesenc	128(%ebp), %xmm7, %xmm7
-        vpshufd	$0x4e, %xmm2, %xmm2
-        vaesenc	144(%ebp), %xmm7, %xmm7
-        vpxor	%xmm3, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm2, %xmm2
-        vmovdqu	160(%ebp), %xmm0
-        cmpl	$11, 236(%esp)
-        jl	L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vaesenc	176(%ebp), %xmm7, %xmm7
-        vmovdqu	192(%ebp), %xmm0
-        cmpl	$13, 236(%esp)
-        jl	L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vaesenc	208(%ebp), %xmm7, %xmm7
-        vmovdqu	224(%ebp), %xmm0
-L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last:
-        vaesenclast	%xmm0, %xmm7, %xmm7
-        vmovdqu	(%esi,%ebx,1), %xmm3
-        vpxor	%xmm1, %xmm2, %xmm6
-        vpxor	%xmm3, %xmm7, %xmm7
-        vmovdqu	%xmm7, (%edi,%ebx,1)
-        vmovdqu	64(%esp), %xmm4
-        addl	$16, %ebx
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_decrypt_avx2_last_block_start
-L_AES_GCM_decrypt_avx2_last_block_done:
-        movl	216(%esp), %ecx
-        movl	216(%esp), %edx
-        andl	$15, %ecx
-        jz	L_AES_GCM_decrypt_avx2_done_dec
-        # aesenc_last15_dec
-        vpshufb	L_aes_gcm_avx2_bswap_epi64, %xmm4, %xmm4
-        vpxor	(%ebp), %xmm4, %xmm4
-        vaesenc	16(%ebp), %xmm4, %xmm4
-        vaesenc	32(%ebp), %xmm4, %xmm4
-        vaesenc	48(%ebp), %xmm4, %xmm4
-        vaesenc	64(%ebp), %xmm4, %xmm4
-        vaesenc	80(%ebp), %xmm4, %xmm4
-        vaesenc	96(%ebp), %xmm4, %xmm4
-        vaesenc	112(%ebp), %xmm4, %xmm4
-        vaesenc	128(%ebp), %xmm4, %xmm4
-        vaesenc	144(%ebp), %xmm4, %xmm4
-        cmpl	$11, 236(%esp)
-        vmovdqu	160(%ebp), %xmm1
-        jl	L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last
-        vaesenc	%xmm1, %xmm4, %xmm4
-        vaesenc	176(%ebp), %xmm4, %xmm4
-        cmpl	$13, 236(%esp)
-        vmovdqu	192(%ebp), %xmm1
-        jl	L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last
-        vaesenc	%xmm1, %xmm4, %xmm4
-        vaesenc	208(%ebp), %xmm4, %xmm4
-        vmovdqu	224(%ebp), %xmm1
-L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last:
-        vaesenclast	%xmm1, %xmm4, %xmm4
-        xorl	%ecx, %ecx
-        vpxor	%xmm0, %xmm0, %xmm0
-        vmovdqu	%xmm4, (%esp)
-        vmovdqu	%xmm0, 16(%esp)
-L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_loop:
-        movzbl	(%esi,%ebx,1), %eax
-        movb	%al, 16(%esp,%ecx,1)
-        xorb	(%esp,%ecx,1), %al
-        movb	%al, (%edi,%ebx,1)
-        incl	%ebx
-        incl	%ecx
-        cmpl	%edx, %ebx
-        jl	L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_loop
-        vmovdqu	16(%esp), %xmm4
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm6, %xmm6
-        # ghash_gfmul_red
-        vpclmulqdq	$16, %xmm5, %xmm6, %xmm2
-        vpclmulqdq	$0x01, %xmm5, %xmm6, %xmm1
-        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm0
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm6
-        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm6, %xmm6
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpxor	%xmm0, %xmm6, %xmm6
-L_AES_GCM_decrypt_avx2_done_dec:
-        vmovdqu	80(%esp), %xmm7
-        # calc_tag
-        movl	216(%esp), %ecx
-        shll	$3, %ecx
-        vpinsrd	$0x00, %ecx, %xmm0, %xmm0
-        movl	220(%esp), %ecx
-        shll	$3, %ecx
-        vpinsrd	$2, %ecx, %xmm0, %xmm0
-        movl	216(%esp), %ecx
-        shrl	$29, %ecx
-        vpinsrd	$0x01, %ecx, %xmm0, %xmm0
-        movl	220(%esp), %ecx
-        shrl	$29, %ecx
-        vpinsrd	$3, %ecx, %xmm0, %xmm0
-        vpxor	%xmm6, %xmm0, %xmm0
-        # ghash_gfmul_red
-        vpclmulqdq	$16, %xmm5, %xmm0, %xmm4
-        vpclmulqdq	$0x01, %xmm5, %xmm0, %xmm3
-        vpclmulqdq	$0x00, %xmm5, %xmm0, %xmm2
-        vpxor	%xmm3, %xmm4, %xmm4
-        vpslldq	$8, %xmm4, %xmm3
-        vpsrldq	$8, %xmm4, %xmm4
-        vpxor	%xmm2, %xmm3, %xmm3
-        vpclmulqdq	$0x11, %xmm5, %xmm0, %xmm0
-        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm3, %xmm2
-        vpshufd	$0x4e, %xmm3, %xmm3
-        vpxor	%xmm2, %xmm3, %xmm3
-        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm3, %xmm2
-        vpshufd	$0x4e, %xmm3, %xmm3
-        vpxor	%xmm4, %xmm0, %xmm0
-        vpxor	%xmm3, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm7, %xmm0, %xmm0
-        movl	212(%esp), %edi
-        movl	228(%esp), %ebx
-        movl	240(%esp), %ebp
-        # cmp_tag
-        cmpl	$16, %ebx
-        je	L_AES_GCM_decrypt_avx2_cmp_tag_16
-        xorl	%edx, %edx
-        xorl	%ecx, %ecx
-        vmovdqu	%xmm0, (%esp)
-L_AES_GCM_decrypt_avx2_cmp_tag_loop:
-        movzbl	(%esp,%edx,1), %eax
-        xorb	(%edi,%edx,1), %al
-        orb	%al, %cl
-        incl	%edx
-        cmpl	%ebx, %edx
-        jne	L_AES_GCM_decrypt_avx2_cmp_tag_loop
-        cmpb	$0x00, %cl
-        sete	%cl
-        jmp	L_AES_GCM_decrypt_avx2_cmp_tag_done
-L_AES_GCM_decrypt_avx2_cmp_tag_16:
-        vmovdqu	(%edi), %xmm1
-        vpcmpeqb	%xmm1, %xmm0, %xmm0
-        vpmovmskb	%xmm0, %edx
-        # if %edx == 0xFFFF then return 1 else return 0
-        xorl	%ecx, %ecx
-        cmpl	$0xffff, %edx
-        sete	%cl
-L_AES_GCM_decrypt_avx2_cmp_tag_done:
-        movl	%ecx, (%ebp)
-        addl	$0xb0, %esp
-        popl	%ebp
-        popl	%edi
-        popl	%esi
-        popl	%ebx
-        ret
-.size	AES_GCM_decrypt_avx2,.-AES_GCM_decrypt_avx2
-#ifdef WOLFSSL_AESGCM_STREAM
-.text
-.globl	AES_GCM_init_avx2
-.type	AES_GCM_init_avx2,@function
-.align	16
-AES_GCM_init_avx2:
-        pushl	%ebx
-        pushl	%esi
-        pushl	%edi
-        pushl	%ebp
-        subl	$32, %esp
-        movl	52(%esp), %ebp
-        movl	60(%esp), %esi
-        movl	76(%esp), %edi
-        vpxor	%xmm4, %xmm4, %xmm4
-        movl	64(%esp), %edx
-        cmpl	$12, %edx
-        je	L_AES_GCM_init_avx2_iv_12
-        # Calculate values when IV is not 12 bytes
-        # H = Encrypt X(=0)
-        vmovdqu	(%ebp), %xmm5
-        vaesenc	16(%ebp), %xmm5, %xmm5
-        vaesenc	32(%ebp), %xmm5, %xmm5
-        vaesenc	48(%ebp), %xmm5, %xmm5
-        vaesenc	64(%ebp), %xmm5, %xmm5
-        vaesenc	80(%ebp), %xmm5, %xmm5
-        vaesenc	96(%ebp), %xmm5, %xmm5
-        vaesenc	112(%ebp), %xmm5, %xmm5
-        vaesenc	128(%ebp), %xmm5, %xmm5
-        vaesenc	144(%ebp), %xmm5, %xmm5
-        cmpl	$11, 56(%esp)
-        vmovdqu	160(%ebp), %xmm0
-        jl	L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	176(%ebp), %xmm5, %xmm5
-        cmpl	$13, 56(%esp)
-        vmovdqu	192(%ebp), %xmm0
-        jl	L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	208(%ebp), %xmm5, %xmm5
-        vmovdqu	224(%ebp), %xmm0
-L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last:
-        vaesenclast	%xmm0, %xmm5, %xmm5
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm5, %xmm5
-        # Calc counter
-        # Initialization vector
-        cmpl	$0x00, %edx
-        movl	$0x00, %ecx
-        je	L_AES_GCM_init_avx2_calc_iv_done
-        cmpl	$16, %edx
-        jl	L_AES_GCM_init_avx2_calc_iv_lt16
-        andl	$0xfffffff0, %edx
-L_AES_GCM_init_avx2_calc_iv_16_loop:
-        vmovdqu	(%esi,%ecx,1), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm6
-        vpxor	%xmm2, %xmm3, %xmm4
-        # ghash_mid
-        vpsrld	$31, %xmm6, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm6, %xmm6
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm6, %xmm6
-        vpor	%xmm1, %xmm4, %xmm4
-        # ghash_red
-        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm6, %xmm0
-        vpshufd	$0x4e, %xmm6, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm4, %xmm4
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_init_avx2_calc_iv_16_loop
-        movl	64(%esp), %edx
-        cmpl	%edx, %ecx
-        je	L_AES_GCM_init_avx2_calc_iv_done
-L_AES_GCM_init_avx2_calc_iv_lt16:
-        vpxor	%xmm0, %xmm0, %xmm0
-        xorl	%ebx, %ebx
-        vmovdqu	%xmm0, (%esp)
-L_AES_GCM_init_avx2_calc_iv_loop:
-        movzbl	(%esi,%ecx,1), %eax
-        movb	%al, (%esp,%ebx,1)
-        incl	%ecx
-        incl	%ebx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_init_avx2_calc_iv_loop
-        vmovdqu	(%esp), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm6
-        vpxor	%xmm2, %xmm3, %xmm4
-        # ghash_mid
-        vpsrld	$31, %xmm6, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm6, %xmm6
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm6, %xmm6
-        vpor	%xmm1, %xmm4, %xmm4
-        # ghash_red
-        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm6, %xmm0
-        vpshufd	$0x4e, %xmm6, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm4, %xmm4
-L_AES_GCM_init_avx2_calc_iv_done:
-        # T = Encrypt counter
-        vpxor	%xmm0, %xmm0, %xmm0
-        shll	$3, %edx
-        vpinsrd	$0x00, %edx, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm6
-        vpxor	%xmm2, %xmm3, %xmm4
-        # ghash_mid
-        vpsrld	$31, %xmm6, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm6, %xmm6
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm6, %xmm6
-        vpor	%xmm1, %xmm4, %xmm4
-        # ghash_red
-        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm6, %xmm0
-        vpshufd	$0x4e, %xmm6, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm4, %xmm4
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm4, %xmm4
-        #   Encrypt counter
-        vmovdqu	(%ebp), %xmm7
-        vpxor	%xmm4, %xmm7, %xmm7
-        vaesenc	16(%ebp), %xmm7, %xmm7
-        vaesenc	32(%ebp), %xmm7, %xmm7
-        vaesenc	48(%ebp), %xmm7, %xmm7
-        vaesenc	64(%ebp), %xmm7, %xmm7
-        vaesenc	80(%ebp), %xmm7, %xmm7
-        vaesenc	96(%ebp), %xmm7, %xmm7
-        vaesenc	112(%ebp), %xmm7, %xmm7
-        vaesenc	128(%ebp), %xmm7, %xmm7
-        vaesenc	144(%ebp), %xmm7, %xmm7
-        cmpl	$11, 56(%esp)
-        vmovdqu	160(%ebp), %xmm0
-        jl	L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vaesenc	176(%ebp), %xmm7, %xmm7
-        cmpl	$13, 56(%esp)
-        vmovdqu	192(%ebp), %xmm0
-        jl	L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vaesenc	208(%ebp), %xmm7, %xmm7
-        vmovdqu	224(%ebp), %xmm0
-L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last:
-        vaesenclast	%xmm0, %xmm7, %xmm7
-        jmp	L_AES_GCM_init_avx2_iv_done
-L_AES_GCM_init_avx2_iv_12:
-        # Calculate values when IV is 12 bytes
-        # Set counter based on IV
-        vmovdqu	L_avx2_aes_gcm_bswap_one, %xmm4
-        vmovdqu	(%ebp), %xmm5
-        vpblendd	$7, (%esi), %xmm4, %xmm4
-        # H = Encrypt X(=0) and T = Encrypt counter
-        vmovdqu	16(%ebp), %xmm6
-        vpxor	%xmm5, %xmm4, %xmm7
-        vaesenc	%xmm6, %xmm5, %xmm5
-        vaesenc	%xmm6, %xmm7, %xmm7
-        vmovdqu	32(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vmovdqu	48(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vmovdqu	64(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vmovdqu	80(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vmovdqu	96(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vmovdqu	112(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vmovdqu	128(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vmovdqu	144(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm7, %xmm7
-        cmpl	$11, 56(%esp)
-        vmovdqu	160(%ebp), %xmm0
-        jl	L_AES_GCM_init_avx2_calc_iv_12_last
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vmovdqu	176(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm7, %xmm7
-        cmpl	$13, 56(%esp)
-        vmovdqu	192(%ebp), %xmm0
-        jl	L_AES_GCM_init_avx2_calc_iv_12_last
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vmovdqu	208(%ebp), %xmm0
-        vaesenc	%xmm0, %xmm5, %xmm5
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vmovdqu	224(%ebp), %xmm0
-L_AES_GCM_init_avx2_calc_iv_12_last:
-        vaesenclast	%xmm0, %xmm5, %xmm5
-        vaesenclast	%xmm0, %xmm7, %xmm7
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm5, %xmm5
-L_AES_GCM_init_avx2_iv_done:
-        vmovdqu	%xmm7, (%edi)
-        movl	68(%esp), %ebp
-        movl	72(%esp), %edi
-        vpshufb	L_aes_gcm_avx2_bswap_epi64, %xmm4, %xmm4
-        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm4
-        vmovdqu	%xmm5, (%ebp)
-        vmovdqu	%xmm4, (%edi)
-        addl	$32, %esp
-        popl	%ebp
-        popl	%edi
-        popl	%esi
-        popl	%ebx
-        ret
-.size	AES_GCM_init_avx2,.-AES_GCM_init_avx2
-.text
-.globl	AES_GCM_aad_update_avx2
-.type	AES_GCM_aad_update_avx2,@function
-.align	16
-AES_GCM_aad_update_avx2:
-        pushl	%esi
-        pushl	%edi
-        movl	12(%esp), %esi
-        movl	16(%esp), %edx
-        movl	20(%esp), %edi
-        movl	24(%esp), %eax
-        vmovdqu	(%edi), %xmm4
-        vmovdqu	(%eax), %xmm5
-        xorl	%ecx, %ecx
-L_AES_GCM_aad_update_avx2_16_loop:
-        vmovdqu	(%esi,%ecx,1), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm6
-        vpxor	%xmm2, %xmm3, %xmm4
-        # ghash_mid
-        vpsrld	$31, %xmm6, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm6, %xmm6
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm6, %xmm6
-        vpor	%xmm1, %xmm4, %xmm4
-        # ghash_red
-        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm6, %xmm0
-        vpshufd	$0x4e, %xmm6, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm4, %xmm4
-        addl	$16, %ecx
-        cmpl	%edx, %ecx
-        jl	L_AES_GCM_aad_update_avx2_16_loop
-        vmovdqu	%xmm4, (%edi)
-        popl	%edi
-        popl	%esi
-        ret
-.size	AES_GCM_aad_update_avx2,.-AES_GCM_aad_update_avx2
-.text
-.globl	AES_GCM_encrypt_block_avx2
-.type	AES_GCM_encrypt_block_avx2,@function
-.align	16
-AES_GCM_encrypt_block_avx2:
-        pushl	%esi
-        pushl	%edi
-        movl	12(%esp), %ecx
-        movl	16(%esp), %eax
-        movl	20(%esp), %edi
-        movl	24(%esp), %esi
-        movl	28(%esp), %edx
-        vmovdqu	(%edx), %xmm3
-        # aesenc_block
-        vmovdqu	%xmm3, %xmm1
-        vpshufb	L_aes_gcm_avx2_bswap_epi64, %xmm1, %xmm0
-        vpaddd	L_aes_gcm_avx2_one, %xmm1, %xmm1
-        vpxor	(%ecx), %xmm0, %xmm0
-        vaesenc	16(%ecx), %xmm0, %xmm0
-        vaesenc	32(%ecx), %xmm0, %xmm0
-        vaesenc	48(%ecx), %xmm0, %xmm0
-        vaesenc	64(%ecx), %xmm0, %xmm0
-        vaesenc	80(%ecx), %xmm0, %xmm0
-        vaesenc	96(%ecx), %xmm0, %xmm0
-        vaesenc	112(%ecx), %xmm0, %xmm0
-        vaesenc	128(%ecx), %xmm0, %xmm0
-        vaesenc	144(%ecx), %xmm0, %xmm0
-        cmpl	$11, %eax
-        vmovdqu	160(%ecx), %xmm2
-        jl	L_AES_GCM_encrypt_block_avx2_aesenc_block_aesenc_avx_last
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vaesenc	176(%ecx), %xmm0, %xmm0
-        cmpl	$13, %eax
-        vmovdqu	192(%ecx), %xmm2
-        jl	L_AES_GCM_encrypt_block_avx2_aesenc_block_aesenc_avx_last
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vaesenc	208(%ecx), %xmm0, %xmm0
-        vmovdqu	224(%ecx), %xmm2
-L_AES_GCM_encrypt_block_avx2_aesenc_block_aesenc_avx_last:
-        vaesenclast	%xmm2, %xmm0, %xmm0
-        vmovdqu	%xmm1, %xmm3
-        vmovdqu	(%esi), %xmm1
-        vpxor	%xmm1, %xmm0, %xmm0
-        vmovdqu	%xmm0, (%edi)
-        vmovdqu	%xmm3, (%edx)
-        popl	%edi
-        popl	%esi
-        ret
-.size	AES_GCM_encrypt_block_avx2,.-AES_GCM_encrypt_block_avx2
-.text
-.globl	AES_GCM_ghash_block_avx2
-.type	AES_GCM_ghash_block_avx2,@function
-.align	16
-AES_GCM_ghash_block_avx2:
-        movl	4(%esp), %edx
-        movl	8(%esp), %eax
-        movl	12(%esp), %ecx
-        vmovdqu	(%eax), %xmm4
-        vmovdqu	(%ecx), %xmm5
-        vmovdqu	(%edx), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm4, %xmm4
-        # ghash_gfmul_avx
-        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
-        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
-        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
-        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm0, %xmm6
-        vpxor	%xmm2, %xmm3, %xmm4
-        # ghash_mid
-        vpsrld	$31, %xmm6, %xmm0
-        vpsrld	$31, %xmm4, %xmm1
-        vpslld	$0x01, %xmm6, %xmm6
-        vpslld	$0x01, %xmm4, %xmm4
-        vpsrldq	$12, %xmm0, %xmm2
-        vpslldq	$4, %xmm0, %xmm0
-        vpslldq	$4, %xmm1, %xmm1
-        vpor	%xmm2, %xmm4, %xmm4
-        vpor	%xmm0, %xmm6, %xmm6
-        vpor	%xmm1, %xmm4, %xmm4
-        # ghash_red
-        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm6, %xmm0
-        vpshufd	$0x4e, %xmm6, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm4, %xmm4
-        vmovdqu	%xmm4, (%eax)
-        ret
-.size	AES_GCM_ghash_block_avx2,.-AES_GCM_ghash_block_avx2
-.text
-.globl	AES_GCM_encrypt_update_avx2
-.type	AES_GCM_encrypt_update_avx2,@function
-.align	16
-AES_GCM_encrypt_update_avx2:
-        pushl	%ebx
-        pushl	%esi
-        pushl	%edi
-        pushl	%ebp
-        subl	$0x60, %esp
-        movl	144(%esp), %esi
-        vmovdqu	(%esi), %xmm4
-        vmovdqu	%xmm4, 64(%esp)
-        movl	136(%esp), %esi
-        movl	140(%esp), %ebp
-        vmovdqu	(%esi), %xmm6
-        vmovdqu	(%ebp), %xmm5
-        vmovdqu	%xmm6, 80(%esp)
-        movl	116(%esp), %ebp
-        movl	124(%esp), %edi
-        movl	128(%esp), %esi
-        # Calculate H
-        vpsrlq	$63, %xmm5, %xmm1
-        vpsllq	$0x01, %xmm5, %xmm0
-        vpslldq	$8, %xmm1, %xmm1
-        vpor	%xmm1, %xmm0, %xmm0
-        vpshufd	$0xff, %xmm5, %xmm5
-        vpsrad	$31, %xmm5, %xmm5
-        vpand	L_aes_gcm_avx2_mod2_128, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        xorl	%ebx, %ebx
-        cmpl	$0x40, 132(%esp)
-        movl	132(%esp), %eax
-        jl	L_AES_GCM_encrypt_update_avx2_done_64
-        andl	$0xffffffc0, %eax
-        vmovdqu	%xmm4, 64(%esp)
-        vmovdqu	%xmm6, 80(%esp)
-        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm3
-        # H ^ 1
-        vmovdqu	%xmm5, (%esp)
-        vmovdqu	%xmm5, %xmm2
-        # H ^ 2
-        vpclmulqdq	$0x00, %xmm2, %xmm2, %xmm5
-        vpclmulqdq	$0x11, %xmm2, %xmm2, %xmm6
-        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
-        vpshufd	$0x4e, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
-        vpshufd	$0x4e, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm6, %xmm0
-        vmovdqu	%xmm0, 16(%esp)
-        # H ^ 3
-        # ghash_gfmul_red
-        vpclmulqdq	$16, %xmm0, %xmm2, %xmm6
-        vpclmulqdq	$0x01, %xmm0, %xmm2, %xmm5
-        vpclmulqdq	$0x00, %xmm0, %xmm2, %xmm4
-        vpxor	%xmm5, %xmm6, %xmm6
-        vpslldq	$8, %xmm6, %xmm5
-        vpsrldq	$8, %xmm6, %xmm6
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm0, %xmm2, %xmm1
-        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
-        vpshufd	$0x4e, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
-        vpshufd	$0x4e, %xmm5, %xmm5
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm1, %xmm1
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	%xmm1, 32(%esp)
-        # H ^ 4
-        vpclmulqdq	$0x00, %xmm0, %xmm0, %xmm5
-        vpclmulqdq	$0x11, %xmm0, %xmm0, %xmm6
-        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
-        vpshufd	$0x4e, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
-        vpshufd	$0x4e, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm6, %xmm2
-        vmovdqu	%xmm2, 48(%esp)
-        vmovdqu	80(%esp), %xmm6
-        # First 64 bytes of input
-        # aesenc_64
-        # aesenc_ctr
-        vmovdqu	64(%esp), %xmm4
-        vmovdqu	L_aes_gcm_avx2_bswap_epi64, %xmm7
-        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm1
-        vpshufb	%xmm7, %xmm4, %xmm0
-        vpaddd	L_aes_gcm_avx2_two, %xmm4, %xmm2
-        vpshufb	%xmm7, %xmm1, %xmm1
-        vpaddd	L_aes_gcm_avx2_three, %xmm4, %xmm3
-        vpshufb	%xmm7, %xmm2, %xmm2
-        vpaddd	L_aes_gcm_avx2_four, %xmm4, %xmm4
-        vpshufb	%xmm7, %xmm3, %xmm3
-        # aesenc_xor
-        vmovdqu	(%ebp), %xmm7
-        vmovdqu	%xmm4, 64(%esp)
-        vpxor	%xmm7, %xmm0, %xmm0
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm7, %xmm3, %xmm3
-        vmovdqu	16(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	32(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	48(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	64(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	80(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	96(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	112(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	128(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	144(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        cmpl	$11, 120(%esp)
-        vmovdqu	160(%ebp), %xmm7
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_64_enc_done
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	176(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        cmpl	$13, 120(%esp)
-        vmovdqu	192(%ebp), %xmm7
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_64_enc_done
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	208(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	224(%ebp), %xmm7
-L_AES_GCM_encrypt_update_avx2_aesenc_64_enc_done:
-        # aesenc_last
-        vaesenclast	%xmm7, %xmm0, %xmm0
-        vaesenclast	%xmm7, %xmm1, %xmm1
-        vaesenclast	%xmm7, %xmm2, %xmm2
-        vaesenclast	%xmm7, %xmm3, %xmm3
-        vmovdqu	(%esi), %xmm7
-        vmovdqu	16(%esi), %xmm4
-        vpxor	%xmm7, %xmm0, %xmm0
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	%xmm0, (%edi)
-        vmovdqu	%xmm1, 16(%edi)
-        vmovdqu	32(%esi), %xmm7
-        vmovdqu	48(%esi), %xmm4
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm4, %xmm3, %xmm3
-        vmovdqu	%xmm2, 32(%edi)
-        vmovdqu	%xmm3, 48(%edi)
-        cmpl	$0x40, %eax
-        movl	$0x40, %ebx
-        movl	%esi, %ecx
-        movl	%edi, %edx
-        jle	L_AES_GCM_encrypt_update_avx2_end_64
-        # More 64 bytes of input
-L_AES_GCM_encrypt_update_avx2_ghash_64:
-        # aesenc_64_ghash
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        # aesenc_64
-        # aesenc_ctr
-        vmovdqu	64(%esp), %xmm4
-        vmovdqu	L_aes_gcm_avx2_bswap_epi64, %xmm7
-        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm1
-        vpshufb	%xmm7, %xmm4, %xmm0
-        vpaddd	L_aes_gcm_avx2_two, %xmm4, %xmm2
-        vpshufb	%xmm7, %xmm1, %xmm1
-        vpaddd	L_aes_gcm_avx2_three, %xmm4, %xmm3
-        vpshufb	%xmm7, %xmm2, %xmm2
-        vpaddd	L_aes_gcm_avx2_four, %xmm4, %xmm4
-        vpshufb	%xmm7, %xmm3, %xmm3
-        # aesenc_xor
-        vmovdqu	(%ebp), %xmm7
-        vmovdqu	%xmm4, 64(%esp)
-        vpxor	%xmm7, %xmm0, %xmm0
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm7, %xmm3, %xmm3
-        vmovdqu	16(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	32(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	48(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	64(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	80(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	96(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	112(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	128(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	144(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        cmpl	$11, 120(%esp)
-        vmovdqu	160(%ebp), %xmm7
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	176(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        cmpl	$13, 120(%esp)
-        vmovdqu	192(%ebp), %xmm7
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	208(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	224(%ebp), %xmm7
-L_AES_GCM_encrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done:
-        # aesenc_last
-        vaesenclast	%xmm7, %xmm0, %xmm0
-        vaesenclast	%xmm7, %xmm1, %xmm1
-        vaesenclast	%xmm7, %xmm2, %xmm2
-        vaesenclast	%xmm7, %xmm3, %xmm3
-        vmovdqu	(%ecx), %xmm7
-        vmovdqu	16(%ecx), %xmm4
-        vpxor	%xmm7, %xmm0, %xmm0
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	%xmm0, (%edx)
-        vmovdqu	%xmm1, 16(%edx)
-        vmovdqu	32(%ecx), %xmm7
-        vmovdqu	48(%ecx), %xmm4
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm4, %xmm3, %xmm3
-        vmovdqu	%xmm2, 32(%edx)
-        vmovdqu	%xmm3, 48(%edx)
-        # pclmul_1
-        vmovdqu	-64(%edx), %xmm1
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
-        vmovdqu	48(%esp), %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm5
-        vpclmulqdq	$0x01, %xmm2, %xmm1, %xmm3
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm6
-        vpclmulqdq	$0x11, %xmm2, %xmm1, %xmm7
-        # pclmul_2
-        vmovdqu	-48(%edx), %xmm1
-        vmovdqu	32(%esp), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm7, %xmm7
-        # pclmul_n
-        vmovdqu	-32(%edx), %xmm1
-        vmovdqu	16(%esp), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm7, %xmm7
-        # pclmul_n
-        vmovdqu	-16(%edx), %xmm1
-        vmovdqu	(%esp), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm7, %xmm7
-        # aesenc_pclmul_l
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpslldq	$8, %xmm5, %xmm1
-        vpsrldq	$8, %xmm5, %xmm5
-        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm0
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpxor	%xmm5, %xmm7, %xmm7
-        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
-        vpshufd	$0x4e, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
-        vpshufd	$0x4e, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm7, %xmm6, %xmm6
-        # aesenc_64_ghash - end
-        addl	$0x40, %ebx
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_encrypt_update_avx2_ghash_64
-L_AES_GCM_encrypt_update_avx2_end_64:
-        vmovdqu	%xmm6, 80(%esp)
-        vmovdqu	48(%edx), %xmm3
-        vmovdqu	(%esp), %xmm7
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm3, %xmm3
-        vpclmulqdq	$16, %xmm3, %xmm7, %xmm5
-        vpclmulqdq	$0x01, %xmm3, %xmm7, %xmm1
-        vpclmulqdq	$0x00, %xmm3, %xmm7, %xmm4
-        vpclmulqdq	$0x11, %xmm3, %xmm7, %xmm6
-        vpxor	%xmm1, %xmm5, %xmm5
-        vmovdqu	32(%edx), %xmm3
-        vmovdqu	16(%esp), %xmm7
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm3, %xmm3
-        vpclmulqdq	$16, %xmm3, %xmm7, %xmm2
-        vpclmulqdq	$0x01, %xmm3, %xmm7, %xmm1
-        vpclmulqdq	$0x00, %xmm3, %xmm7, %xmm0
-        vpclmulqdq	$0x11, %xmm3, %xmm7, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm4, %xmm4
-        vmovdqu	16(%edx), %xmm3
-        vmovdqu	32(%esp), %xmm7
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm3, %xmm3
-        vpclmulqdq	$16, %xmm3, %xmm7, %xmm2
-        vpclmulqdq	$0x01, %xmm3, %xmm7, %xmm1
-        vpclmulqdq	$0x00, %xmm3, %xmm7, %xmm0
-        vpclmulqdq	$0x11, %xmm3, %xmm7, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm4, %xmm4
-        vmovdqu	80(%esp), %xmm0
-        vmovdqu	(%edx), %xmm3
-        vmovdqu	48(%esp), %xmm7
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm3, %xmm3
-        vpxor	%xmm0, %xmm3, %xmm3
-        vpclmulqdq	$16, %xmm3, %xmm7, %xmm2
-        vpclmulqdq	$0x01, %xmm3, %xmm7, %xmm1
-        vpclmulqdq	$0x00, %xmm3, %xmm7, %xmm0
-        vpclmulqdq	$0x11, %xmm3, %xmm7, %xmm3
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm4, %xmm4
-        vpslldq	$8, %xmm5, %xmm7
-        vpsrldq	$8, %xmm5, %xmm5
-        vpxor	%xmm7, %xmm4, %xmm4
-        vpxor	%xmm5, %xmm6, %xmm6
-        # ghash_red
-        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
-        vpclmulqdq	$16, %xmm2, %xmm4, %xmm0
-        vpshufd	$0x4e, %xmm4, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm6, %xmm6
-        vmovdqu	(%esp), %xmm5
-        vmovdqu	64(%esp), %xmm4
-L_AES_GCM_encrypt_update_avx2_done_64:
-        cmpl	132(%esp), %ebx
-        je	L_AES_GCM_encrypt_update_avx2_done_enc
-        movl	132(%esp), %eax
-        andl	$0xfffffff0, %eax
-        cmpl	%eax, %ebx
-        jge	L_AES_GCM_encrypt_update_avx2_last_block_done
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        # aesenc_block
-        vmovdqu	%xmm4, %xmm1
-        vpshufb	L_aes_gcm_avx2_bswap_epi64, %xmm1, %xmm0
-        vpaddd	L_aes_gcm_avx2_one, %xmm1, %xmm1
-        vpxor	(%ebp), %xmm0, %xmm0
-        vaesenc	16(%ebp), %xmm0, %xmm0
-        vaesenc	32(%ebp), %xmm0, %xmm0
-        vaesenc	48(%ebp), %xmm0, %xmm0
-        vaesenc	64(%ebp), %xmm0, %xmm0
-        vaesenc	80(%ebp), %xmm0, %xmm0
-        vaesenc	96(%ebp), %xmm0, %xmm0
-        vaesenc	112(%ebp), %xmm0, %xmm0
-        vaesenc	128(%ebp), %xmm0, %xmm0
-        vaesenc	144(%ebp), %xmm0, %xmm0
-        cmpl	$11, 120(%esp)
-        vmovdqu	160(%ebp), %xmm2
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_block_aesenc_avx_last
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vaesenc	176(%ebp), %xmm0, %xmm0
-        cmpl	$13, 120(%esp)
-        vmovdqu	192(%ebp), %xmm2
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_block_aesenc_avx_last
-        vaesenc	%xmm2, %xmm0, %xmm0
-        vaesenc	208(%ebp), %xmm0, %xmm0
-        vmovdqu	224(%ebp), %xmm2
-L_AES_GCM_encrypt_update_avx2_aesenc_block_aesenc_avx_last:
-        vaesenclast	%xmm2, %xmm0, %xmm0
-        vmovdqu	%xmm1, %xmm4
-        vmovdqu	(%ecx), %xmm1
-        vpxor	%xmm1, %xmm0, %xmm0
-        vmovdqu	%xmm0, (%edx)
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm6, %xmm6
-        addl	$16, %ebx
-        cmpl	%eax, %ebx
-        jge	L_AES_GCM_encrypt_update_avx2_last_block_ghash
-L_AES_GCM_encrypt_update_avx2_last_block_start:
-        vpshufb	L_aes_gcm_avx2_bswap_epi64, %xmm4, %xmm7
-        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm4
-        vmovdqu	%xmm4, 64(%esp)
-        # aesenc_gfmul_sb
-        vpclmulqdq	$0x01, %xmm5, %xmm6, %xmm2
-        vpclmulqdq	$16, %xmm5, %xmm6, %xmm3
-        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm1
-        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm4
-        vpxor	(%ebp), %xmm7, %xmm7
-        vaesenc	16(%ebp), %xmm7, %xmm7
-        vpxor	%xmm2, %xmm3, %xmm3
-        vpslldq	$8, %xmm3, %xmm2
-        vpsrldq	$8, %xmm3, %xmm3
-        vaesenc	32(%ebp), %xmm7, %xmm7
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm2, %xmm1
-        vaesenc	48(%ebp), %xmm7, %xmm7
-        vaesenc	64(%ebp), %xmm7, %xmm7
-        vaesenc	80(%ebp), %xmm7, %xmm7
-        vpshufd	$0x4e, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm2, %xmm1
-        vaesenc	96(%ebp), %xmm7, %xmm7
-        vaesenc	112(%ebp), %xmm7, %xmm7
-        vaesenc	128(%ebp), %xmm7, %xmm7
-        vpshufd	$0x4e, %xmm2, %xmm2
-        vaesenc	144(%ebp), %xmm7, %xmm7
-        vpxor	%xmm3, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm2, %xmm2
-        vmovdqu	160(%ebp), %xmm0
-        cmpl	$11, 120(%esp)
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vaesenc	176(%ebp), %xmm7, %xmm7
-        vmovdqu	192(%ebp), %xmm0
-        cmpl	$13, 120(%esp)
-        jl	L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vaesenc	208(%ebp), %xmm7, %xmm7
-        vmovdqu	224(%ebp), %xmm0
-L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last:
-        vaesenclast	%xmm0, %xmm7, %xmm7
-        vmovdqu	(%esi,%ebx,1), %xmm3
-        vpxor	%xmm1, %xmm2, %xmm6
-        vpxor	%xmm3, %xmm7, %xmm7
-        vmovdqu	%xmm7, (%edi,%ebx,1)
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm7, %xmm7
-        vpxor	%xmm7, %xmm6, %xmm6
-        vmovdqu	64(%esp), %xmm4
-        addl	$16, %ebx
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_encrypt_update_avx2_last_block_start
-L_AES_GCM_encrypt_update_avx2_last_block_ghash:
-        # ghash_gfmul_red
-        vpclmulqdq	$16, %xmm5, %xmm6, %xmm2
-        vpclmulqdq	$0x01, %xmm5, %xmm6, %xmm1
-        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm0
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpslldq	$8, %xmm2, %xmm1
-        vpsrldq	$8, %xmm2, %xmm2
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm6
-        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm0, %xmm1, %xmm1
-        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm1, %xmm0
-        vpshufd	$0x4e, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm6, %xmm6
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpxor	%xmm0, %xmm6, %xmm6
-L_AES_GCM_encrypt_update_avx2_last_block_done:
-L_AES_GCM_encrypt_update_avx2_done_enc:
-        movl	136(%esp), %esi
-        movl	144(%esp), %edi
-        vmovdqu	%xmm6, (%esi)
-        vmovdqu	%xmm4, (%edi)
-        addl	$0x60, %esp
-        popl	%ebp
-        popl	%edi
-        popl	%esi
-        popl	%ebx
-        ret
-.size	AES_GCM_encrypt_update_avx2,.-AES_GCM_encrypt_update_avx2
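The AES_GCM_encrypt_update_avx2 routine deleted above is the usual four-blocks-at-a-time CTR/GHASH interleave: four counter blocks are keystreamed with AES-NI, XORed over the plaintext, and the resulting ciphertext is folded into the GHASH accumulator with PCLMULQDQ against the precomputed powers H^1..H^4. A minimal C sketch of the same data flow, assuming hypothetical helpers aes_encrypt_block() and ghash_mul() that are not part of wolfSSL's API:

#include <stdint.h>
#include <stddef.h>

/* Hypothetical stand-ins for the AES-NI and PCLMULQDQ primitives. */
void aes_encrypt_block(const uint8_t *rk, int rounds,
                       const uint8_t in[16], uint8_t out[16]);
void ghash_mul(uint8_t x[16], const uint8_t h[16]); /* x *= h in GF(2^128) */

static void inc32(uint8_t ctr[16])  /* bump the big-endian 32-bit counter */
{
    for (int i = 15; i >= 12; i--)
        if (++ctr[i] != 0)
            break;
}

void gcm_encrypt_update_sketch(const uint8_t *rk, int rounds,
                               uint8_t ctr[16], uint8_t ghash[16],
                               const uint8_t h[16],
                               const uint8_t *in, uint8_t *out, size_t len)
{
    uint8_t ks[16];
    while (len >= 64) {                 /* one 64-byte chunk per iteration */
        for (int b = 0; b < 4; b++) {
            aes_encrypt_block(rk, rounds, ctr, ks);
            inc32(ctr);
            for (int i = 0; i < 16; i++)
                out[16 * b + i] = (uint8_t)(in[16 * b + i] ^ ks[i]);
            for (int i = 0; i < 16; i++)
                ghash[i] ^= out[16 * b + i];
            ghash_mul(ghash, h);        /* serial; the asm batches 4 blocks
                                           against H^1..H^4 instead */
        }
        in += 64; out += 64; len -= 64;
    }
}

The assembly earns its speed by hashing four ciphertext blocks per reduction (the pclmul_1/pclmul_2/pclmul_n stanzas) and by hiding the GHASH work inside the AES round sequence; the sketch keeps only the ordering of operations.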
-.text
-.globl	AES_GCM_encrypt_final_avx2
-.type	AES_GCM_encrypt_final_avx2,@function
-.align	16
-AES_GCM_encrypt_final_avx2:
-        pushl	%esi
-        pushl	%edi
-        pushl	%ebp
-        subl	$16, %esp
-        movl	32(%esp), %ebp
-        movl	52(%esp), %esi
-        movl	56(%esp), %edi
-        vmovdqu	(%ebp), %xmm4
-        vmovdqu	(%esi), %xmm5
-        vmovdqu	(%edi), %xmm6
-        vpsrlq	$63, %xmm5, %xmm1
-        vpsllq	$0x01, %xmm5, %xmm0
-        vpslldq	$8, %xmm1, %xmm1
-        vpor	%xmm1, %xmm0, %xmm0
-        vpshufd	$0xff, %xmm5, %xmm5
-        vpsrad	$31, %xmm5, %xmm5
-        vpand	L_aes_gcm_avx2_mod2_128, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        # calc_tag
-        movl	44(%esp), %ecx
-        shll	$3, %ecx
-        vpinsrd	$0x00, %ecx, %xmm0, %xmm0
-        movl	48(%esp), %ecx
-        shll	$3, %ecx
-        vpinsrd	$2, %ecx, %xmm0, %xmm0
-        movl	44(%esp), %ecx
-        shrl	$29, %ecx
-        vpinsrd	$0x01, %ecx, %xmm0, %xmm0
-        movl	48(%esp), %ecx
-        shrl	$29, %ecx
-        vpinsrd	$3, %ecx, %xmm0, %xmm0
-        vpxor	%xmm4, %xmm0, %xmm0
-        # ghash_gfmul_red
-        vpclmulqdq	$16, %xmm5, %xmm0, %xmm7
-        vpclmulqdq	$0x01, %xmm5, %xmm0, %xmm3
-        vpclmulqdq	$0x00, %xmm5, %xmm0, %xmm2
-        vpxor	%xmm3, %xmm7, %xmm7
-        vpslldq	$8, %xmm7, %xmm3
-        vpsrldq	$8, %xmm7, %xmm7
-        vpxor	%xmm2, %xmm3, %xmm3
-        vpclmulqdq	$0x11, %xmm5, %xmm0, %xmm0
-        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm3, %xmm2
-        vpshufd	$0x4e, %xmm3, %xmm3
-        vpxor	%xmm2, %xmm3, %xmm3
-        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm3, %xmm2
-        vpshufd	$0x4e, %xmm3, %xmm3
-        vpxor	%xmm7, %xmm0, %xmm0
-        vpxor	%xmm3, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm6, %xmm0, %xmm0
-        movl	36(%esp), %edi
-        # store_tag
-        cmpl	$16, 40(%esp)
-        je	L_AES_GCM_encrypt_final_avx2_store_tag_16
-        xorl	%ecx, %ecx
-        vmovdqu	%xmm0, (%esp)
-L_AES_GCM_encrypt_final_avx2_store_tag_loop:
-        movzbl	(%esp,%ecx,1), %eax
-        movb	%al, (%edi,%ecx,1)
-        incl	%ecx
-        cmpl	40(%esp), %ecx
-        jne	L_AES_GCM_encrypt_final_avx2_store_tag_loop
-        jmp	L_AES_GCM_encrypt_final_avx2_store_tag_done
-L_AES_GCM_encrypt_final_avx2_store_tag_16:
-        vmovdqu	%xmm0, (%edi)
-L_AES_GCM_encrypt_final_avx2_store_tag_done:
-        addl	$16, %esp
-        popl	%ebp
-        popl	%edi
-        popl	%esi
-        ret
-.size	AES_GCM_encrypt_final_avx2,.-AES_GCM_encrypt_final_avx2
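In the calc_tag block above, each shll $3 / shrl $29 pair assembles the low and high halves of a 64-bit bit count from a 32-bit byte count, building GCM's final length block. A hedged C equivalent of the tag computation, reusing the hypothetical ghash_mul() from the earlier sketch; ek_y0 stands for the encrypted initial counter block that the routine loads from its context:

#include <stdint.h>

void ghash_mul(uint8_t x[16], const uint8_t h[16]); /* hypothetical, as above */

void gcm_make_tag_sketch(uint8_t ghash[16], const uint8_t h[16],
                         uint64_t aadBytes, uint64_t ctBytes,
                         const uint8_t ek_y0[16], uint8_t tag[16])
{
    uint64_t aadBits = aadBytes << 3;   /* the shll $3 (+ shrl $29) pairs */
    uint64_t ctBits  = ctBytes << 3;
    uint8_t lenBlock[16];

    for (int i = 0; i < 8; i++) {       /* len(A) || len(C), big-endian bits */
        lenBlock[7 - i]  = (uint8_t)(aadBits >> (8 * i));
        lenBlock[15 - i] = (uint8_t)(ctBits >> (8 * i));
    }
    for (int i = 0; i < 16; i++)        /* fold into the GHASH state */
        ghash[i] ^= lenBlock[i];
    ghash_mul(ghash, h);                /* the ghash_gfmul_red stanza */
    for (int i = 0; i < 16; i++)
        tag[i] = (uint8_t)(ghash[i] ^ ek_y0[i]);
}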
-.text
-.globl	AES_GCM_decrypt_update_avx2
-.type	AES_GCM_decrypt_update_avx2,@function
-.align	16
-AES_GCM_decrypt_update_avx2:
-        pushl	%ebx
-        pushl	%esi
-        pushl	%edi
-        pushl	%ebp
-        subl	$0xa0, %esp
-        movl	208(%esp), %esi
-        vmovdqu	(%esi), %xmm4
-        movl	200(%esp), %esi
-        movl	204(%esp), %ebp
-        vmovdqu	(%esi), %xmm6
-        vmovdqu	(%ebp), %xmm5
-        movl	180(%esp), %ebp
-        movl	188(%esp), %edi
-        movl	192(%esp), %esi
-        # Calculate H
-        vpsrlq	$63, %xmm5, %xmm1
-        vpsllq	$0x01, %xmm5, %xmm0
-        vpslldq	$8, %xmm1, %xmm1
-        vpor	%xmm1, %xmm0, %xmm0
-        vpshufd	$0xff, %xmm5, %xmm5
-        vpsrad	$31, %xmm5, %xmm5
-        vpand	L_aes_gcm_avx2_mod2_128, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        xorl	%ebx, %ebx
-        cmpl	$0x40, 196(%esp)
-        movl	196(%esp), %eax
-        jl	L_AES_GCM_decrypt_update_avx2_done_64
-        andl	$0xffffffc0, %eax
-        vmovdqu	%xmm4, 64(%esp)
-        vmovdqu	%xmm6, 80(%esp)
-        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm3
-        # H ^ 1
-        vmovdqu	%xmm5, (%esp)
-        vmovdqu	%xmm5, %xmm2
-        # H ^ 2
-        vpclmulqdq	$0x00, %xmm2, %xmm2, %xmm5
-        vpclmulqdq	$0x11, %xmm2, %xmm2, %xmm6
-        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
-        vpshufd	$0x4e, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
-        vpshufd	$0x4e, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm6, %xmm0
-        vmovdqu	%xmm0, 16(%esp)
-        # H ^ 3
-        # ghash_gfmul_red
-        vpclmulqdq	$16, %xmm0, %xmm2, %xmm6
-        vpclmulqdq	$0x01, %xmm0, %xmm2, %xmm5
-        vpclmulqdq	$0x00, %xmm0, %xmm2, %xmm4
-        vpxor	%xmm5, %xmm6, %xmm6
-        vpslldq	$8, %xmm6, %xmm5
-        vpsrldq	$8, %xmm6, %xmm6
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpclmulqdq	$0x11, %xmm0, %xmm2, %xmm1
-        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
-        vpshufd	$0x4e, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
-        vpshufd	$0x4e, %xmm5, %xmm5
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm5, %xmm1, %xmm1
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	%xmm1, 32(%esp)
-        # H ^ 4
-        vpclmulqdq	$0x00, %xmm0, %xmm0, %xmm5
-        vpclmulqdq	$0x11, %xmm0, %xmm0, %xmm6
-        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
-        vpshufd	$0x4e, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
-        vpshufd	$0x4e, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm5, %xmm5
-        vpxor	%xmm5, %xmm6, %xmm2
-        vmovdqu	%xmm2, 48(%esp)
-        vmovdqu	80(%esp), %xmm6
-        cmpl	%esi, %edi
-        jne	L_AES_GCM_decrypt_update_avx2_ghash_64
-L_AES_GCM_decrypt_update_avx2_ghash_64_inplace:
-        # aesenc_64_ghash
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        # aesenc_64
-        # aesenc_ctr
-        vmovdqu	64(%esp), %xmm4
-        vmovdqu	L_aes_gcm_avx2_bswap_epi64, %xmm7
-        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm1
-        vpshufb	%xmm7, %xmm4, %xmm0
-        vpaddd	L_aes_gcm_avx2_two, %xmm4, %xmm2
-        vpshufb	%xmm7, %xmm1, %xmm1
-        vpaddd	L_aes_gcm_avx2_three, %xmm4, %xmm3
-        vpshufb	%xmm7, %xmm2, %xmm2
-        vpaddd	L_aes_gcm_avx2_four, %xmm4, %xmm4
-        vpshufb	%xmm7, %xmm3, %xmm3
-        # aesenc_xor
-        vmovdqu	(%ebp), %xmm7
-        vmovdqu	%xmm4, 64(%esp)
-        vpxor	%xmm7, %xmm0, %xmm0
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm7, %xmm3, %xmm3
-        vmovdqu	16(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	32(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	48(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	64(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	80(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	96(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	112(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	128(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	144(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        cmpl	$11, 184(%esp)
-        vmovdqu	160(%ebp), %xmm7
-        jl	L_AES_GCM_decrypt_update_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	176(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        cmpl	$13, 184(%esp)
-        vmovdqu	192(%ebp), %xmm7
-        jl	L_AES_GCM_decrypt_update_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	208(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	224(%ebp), %xmm7
-L_AES_GCM_decrypt_update_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done:
-        # aesenc_last
-        vaesenclast	%xmm7, %xmm0, %xmm0
-        vaesenclast	%xmm7, %xmm1, %xmm1
-        vaesenclast	%xmm7, %xmm2, %xmm2
-        vaesenclast	%xmm7, %xmm3, %xmm3
-        vmovdqu	(%ecx), %xmm7
-        vmovdqu	16(%ecx), %xmm4
-        vpxor	%xmm7, %xmm0, %xmm0
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	%xmm7, 96(%esp)
-        vmovdqu	%xmm4, 112(%esp)
-        vmovdqu	%xmm0, (%edx)
-        vmovdqu	%xmm1, 16(%edx)
-        vmovdqu	32(%ecx), %xmm7
-        vmovdqu	48(%ecx), %xmm4
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm4, %xmm3, %xmm3
-        vmovdqu	%xmm7, 128(%esp)
-        vmovdqu	%xmm4, 144(%esp)
-        vmovdqu	%xmm2, 32(%edx)
-        vmovdqu	%xmm3, 48(%edx)
-        # pclmul_1
-        vmovdqu	96(%esp), %xmm1
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
-        vmovdqu	48(%esp), %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm5
-        vpclmulqdq	$0x01, %xmm2, %xmm1, %xmm3
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm6
-        vpclmulqdq	$0x11, %xmm2, %xmm1, %xmm7
-        # pclmul_2
-        vmovdqu	112(%esp), %xmm1
-        vmovdqu	32(%esp), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm7, %xmm7
-        # pclmul_n
-        vmovdqu	128(%esp), %xmm1
-        vmovdqu	16(%esp), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm7, %xmm7
-        # pclmul_n
-        vmovdqu	144(%esp), %xmm1
-        vmovdqu	(%esp), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm7, %xmm7
-        # aesenc_pclmul_l
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpslldq	$8, %xmm5, %xmm1
-        vpsrldq	$8, %xmm5, %xmm5
-        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm0
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpxor	%xmm5, %xmm7, %xmm7
-        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
-        vpshufd	$0x4e, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
-        vpshufd	$0x4e, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm7, %xmm6, %xmm6
-        # aesenc_64_ghash - end
-        addl	$0x40, %ebx
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_decrypt_update_avx2_ghash_64_inplace
-        jmp	L_AES_GCM_decrypt_update_avx2_ghash_64_done
-L_AES_GCM_decrypt_update_avx2_ghash_64:
-        # aesenc_64_ghash
-        leal	(%esi,%ebx,1), %ecx
-        leal	(%edi,%ebx,1), %edx
-        # aesenc_64
-        # aesenc_ctr
-        vmovdqu	64(%esp), %xmm4
-        vmovdqu	L_aes_gcm_avx2_bswap_epi64, %xmm7
-        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm1
-        vpshufb	%xmm7, %xmm4, %xmm0
-        vpaddd	L_aes_gcm_avx2_two, %xmm4, %xmm2
-        vpshufb	%xmm7, %xmm1, %xmm1
-        vpaddd	L_aes_gcm_avx2_three, %xmm4, %xmm3
-        vpshufb	%xmm7, %xmm2, %xmm2
-        vpaddd	L_aes_gcm_avx2_four, %xmm4, %xmm4
-        vpshufb	%xmm7, %xmm3, %xmm3
-        # aesenc_xor
-        vmovdqu	(%ebp), %xmm7
-        vmovdqu	%xmm4, 64(%esp)
-        vpxor	%xmm7, %xmm0, %xmm0
-        vpxor	%xmm7, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm7, %xmm3, %xmm3
-        vmovdqu	16(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	32(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	48(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	64(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	80(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	96(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	112(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	128(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	144(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        cmpl	$11, 184(%esp)
-        vmovdqu	160(%ebp), %xmm7
-        jl	L_AES_GCM_decrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	176(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        cmpl	$13, 184(%esp)
-        vmovdqu	192(%ebp), %xmm7
-        jl	L_AES_GCM_decrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	208(%ebp), %xmm7
-        vaesenc	%xmm7, %xmm0, %xmm0
-        vaesenc	%xmm7, %xmm1, %xmm1
-        vaesenc	%xmm7, %xmm2, %xmm2
-        vaesenc	%xmm7, %xmm3, %xmm3
-        vmovdqu	224(%ebp), %xmm7
-L_AES_GCM_decrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done:
-        # aesenc_last
-        vaesenclast	%xmm7, %xmm0, %xmm0
-        vaesenclast	%xmm7, %xmm1, %xmm1
-        vaesenclast	%xmm7, %xmm2, %xmm2
-        vaesenclast	%xmm7, %xmm3, %xmm3
-        vmovdqu	(%ecx), %xmm7
-        vmovdqu	16(%ecx), %xmm4
-        vpxor	%xmm7, %xmm0, %xmm0
-        vpxor	%xmm4, %xmm1, %xmm1
-        vmovdqu	%xmm7, (%ecx)
-        vmovdqu	%xmm4, 16(%ecx)
-        vmovdqu	%xmm0, (%edx)
-        vmovdqu	%xmm1, 16(%edx)
-        vmovdqu	32(%ecx), %xmm7
-        vmovdqu	48(%ecx), %xmm4
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm4, %xmm3, %xmm3
-        vmovdqu	%xmm7, 32(%ecx)
-        vmovdqu	%xmm4, 48(%ecx)
-        vmovdqu	%xmm2, 32(%edx)
-        vmovdqu	%xmm3, 48(%edx)
-        # pclmul_1
-        vmovdqu	(%ecx), %xmm1
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
-        vmovdqu	48(%esp), %xmm2
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpclmulqdq	$16, %xmm2, %xmm1, %xmm5
-        vpclmulqdq	$0x01, %xmm2, %xmm1, %xmm3
-        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm6
-        vpclmulqdq	$0x11, %xmm2, %xmm1, %xmm7
-        # pclmul_2
-        vmovdqu	16(%ecx), %xmm1
-        vmovdqu	32(%esp), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm7, %xmm7
-        # pclmul_n
-        vmovdqu	32(%ecx), %xmm1
-        vmovdqu	16(%esp), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm7, %xmm7
-        # pclmul_n
-        vmovdqu	48(%ecx), %xmm1
-        vmovdqu	(%esp), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
-        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm7, %xmm7
-        # aesenc_pclmul_l
-        vpxor	%xmm2, %xmm5, %xmm5
-        vpxor	%xmm4, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm5, %xmm5
-        vpslldq	$8, %xmm5, %xmm1
-        vpsrldq	$8, %xmm5, %xmm5
-        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm0
-        vpxor	%xmm1, %xmm6, %xmm6
-        vpxor	%xmm5, %xmm7, %xmm7
-        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
-        vpshufd	$0x4e, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
-        vpshufd	$0x4e, %xmm6, %xmm6
-        vpxor	%xmm3, %xmm6, %xmm6
-        vpxor	%xmm7, %xmm6, %xmm6
-        # aesenc_64_ghash - end
-        addl	$0x40, %ebx
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_decrypt_update_avx2_ghash_64
-L_AES_GCM_decrypt_update_avx2_ghash_64_done:
-        vmovdqu	(%esp), %xmm5
-        vmovdqu	64(%esp), %xmm4
-L_AES_GCM_decrypt_update_avx2_done_64:
-        cmpl	196(%esp), %ebx
-        jge	L_AES_GCM_decrypt_update_avx2_done_dec
-        movl	196(%esp), %eax
-        andl	$0xfffffff0, %eax
-        cmpl	%eax, %ebx
-        jge	L_AES_GCM_decrypt_update_avx2_last_block_done
-L_AES_GCM_decrypt_update_avx2_last_block_start:
-        vmovdqu	(%esi,%ebx,1), %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_epi64, %xmm4, %xmm7
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
-        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm4
-        vmovdqu	%xmm4, 64(%esp)
-        vpxor	%xmm6, %xmm0, %xmm4
-        # aesenc_gfmul_sb
-        vpclmulqdq	$0x01, %xmm5, %xmm4, %xmm2
-        vpclmulqdq	$16, %xmm5, %xmm4, %xmm3
-        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm1
-        vpclmulqdq	$0x11, %xmm5, %xmm4, %xmm4
-        vpxor	(%ebp), %xmm7, %xmm7
-        vaesenc	16(%ebp), %xmm7, %xmm7
-        vpxor	%xmm2, %xmm3, %xmm3
-        vpslldq	$8, %xmm3, %xmm2
-        vpsrldq	$8, %xmm3, %xmm3
-        vaesenc	32(%ebp), %xmm7, %xmm7
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm2, %xmm1
-        vaesenc	48(%ebp), %xmm7, %xmm7
-        vaesenc	64(%ebp), %xmm7, %xmm7
-        vaesenc	80(%ebp), %xmm7, %xmm7
-        vpshufd	$0x4e, %xmm2, %xmm2
-        vpxor	%xmm1, %xmm2, %xmm2
-        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm2, %xmm1
-        vaesenc	96(%ebp), %xmm7, %xmm7
-        vaesenc	112(%ebp), %xmm7, %xmm7
-        vaesenc	128(%ebp), %xmm7, %xmm7
-        vpshufd	$0x4e, %xmm2, %xmm2
-        vaesenc	144(%ebp), %xmm7, %xmm7
-        vpxor	%xmm3, %xmm4, %xmm4
-        vpxor	%xmm4, %xmm2, %xmm2
-        vmovdqu	160(%ebp), %xmm0
-        cmpl	$11, 184(%esp)
-        jl	L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vaesenc	176(%ebp), %xmm7, %xmm7
-        vmovdqu	192(%ebp), %xmm0
-        cmpl	$13, 184(%esp)
-        jl	L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last
-        vaesenc	%xmm0, %xmm7, %xmm7
-        vaesenc	208(%ebp), %xmm7, %xmm7
-        vmovdqu	224(%ebp), %xmm0
-L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last:
-        vaesenclast	%xmm0, %xmm7, %xmm7
-        vmovdqu	(%esi,%ebx,1), %xmm3
-        vpxor	%xmm1, %xmm2, %xmm6
-        vpxor	%xmm3, %xmm7, %xmm7
-        vmovdqu	%xmm7, (%edi,%ebx,1)
-        vmovdqu	64(%esp), %xmm4
-        addl	$16, %ebx
-        cmpl	%eax, %ebx
-        jl	L_AES_GCM_decrypt_update_avx2_last_block_start
-L_AES_GCM_decrypt_update_avx2_last_block_done:
-L_AES_GCM_decrypt_update_avx2_done_dec:
-        movl	200(%esp), %esi
-        movl	208(%esp), %edi
-        vmovdqu	64(%esp), %xmm4
-        vmovdqu	%xmm6, (%esi)
-        vmovdqu	%xmm4, (%edi)
-        addl	$0xa0, %esp
-        popl	%ebp
-        popl	%edi
-        popl	%esi
-        popl	%ebx
-        ret
-.size	AES_GCM_decrypt_update_avx2,.-AES_GCM_decrypt_update_avx2
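The decrypt update mirrors the encrypt loop with one ordering difference: GHASH consumes the ciphertext input, not the output, which is why the in-place variant above copies each 64-byte chunk to the stack (96(%esp)..144(%esp)) before the keystream overwrites it. A one-block C sketch of that order, with the same hypothetical ghash_mul():

#include <stdint.h>
#include <string.h>

void ghash_mul(uint8_t x[16], const uint8_t h[16]); /* hypothetical, as above */

/* Hash the ciphertext first, then XOR the keystream over it; the local
 * copy makes this safe even when in and out alias. */
void gcm_decrypt_block_sketch(const uint8_t ks[16], uint8_t ghash[16],
                              const uint8_t h[16],
                              const uint8_t *in, uint8_t *out)
{
    uint8_t c[16];
    memcpy(c, in, 16);                  /* the stack copy in the asm */
    for (int i = 0; i < 16; i++)
        ghash[i] ^= c[i];
    ghash_mul(ghash, h);
    for (int i = 0; i < 16; i++)
        out[i] = (uint8_t)(c[i] ^ ks[i]);
}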
-.text
-.globl	AES_GCM_decrypt_final_avx2
-.type	AES_GCM_decrypt_final_avx2,@function
-.align	16
-AES_GCM_decrypt_final_avx2:
-        pushl	%ebx
-        pushl	%esi
-        pushl	%edi
-        pushl	%ebp
-        subl	$16, %esp
-        movl	36(%esp), %ebp
-        movl	56(%esp), %esi
-        movl	60(%esp), %edi
-        vmovdqu	(%ebp), %xmm4
-        vmovdqu	(%esi), %xmm5
-        vmovdqu	(%edi), %xmm6
-        vpsrlq	$63, %xmm5, %xmm1
-        vpsllq	$0x01, %xmm5, %xmm0
-        vpslldq	$8, %xmm1, %xmm1
-        vpor	%xmm1, %xmm0, %xmm0
-        vpshufd	$0xff, %xmm5, %xmm5
-        vpsrad	$31, %xmm5, %xmm5
-        vpand	L_aes_gcm_avx2_mod2_128, %xmm5, %xmm5
-        vpxor	%xmm0, %xmm5, %xmm5
-        # calc_tag
-        movl	48(%esp), %ecx
-        shll	$3, %ecx
-        vpinsrd	$0x00, %ecx, %xmm0, %xmm0
-        movl	52(%esp), %ecx
-        shll	$3, %ecx
-        vpinsrd	$2, %ecx, %xmm0, %xmm0
-        movl	48(%esp), %ecx
-        shrl	$29, %ecx
-        vpinsrd	$0x01, %ecx, %xmm0, %xmm0
-        movl	52(%esp), %ecx
-        shrl	$29, %ecx
-        vpinsrd	$3, %ecx, %xmm0, %xmm0
-        vpxor	%xmm4, %xmm0, %xmm0
-        # ghash_gfmul_red
-        vpclmulqdq	$16, %xmm5, %xmm0, %xmm7
-        vpclmulqdq	$0x01, %xmm5, %xmm0, %xmm3
-        vpclmulqdq	$0x00, %xmm5, %xmm0, %xmm2
-        vpxor	%xmm3, %xmm7, %xmm7
-        vpslldq	$8, %xmm7, %xmm3
-        vpsrldq	$8, %xmm7, %xmm7
-        vpxor	%xmm2, %xmm3, %xmm3
-        vpclmulqdq	$0x11, %xmm5, %xmm0, %xmm0
-        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm3, %xmm2
-        vpshufd	$0x4e, %xmm3, %xmm3
-        vpxor	%xmm2, %xmm3, %xmm3
-        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm3, %xmm2
-        vpshufd	$0x4e, %xmm3, %xmm3
-        vpxor	%xmm7, %xmm0, %xmm0
-        vpxor	%xmm3, %xmm0, %xmm0
-        vpxor	%xmm2, %xmm0, %xmm0
-        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
-        vpxor	%xmm6, %xmm0, %xmm0
-        movl	40(%esp), %esi
-        movl	64(%esp), %edi
-        # cmp_tag
-        cmpl	$16, 44(%esp)
-        je	L_AES_GCM_decrypt_final_avx2_cmp_tag_16
-        xorl	%ecx, %ecx
-        xorl	%edx, %edx
-        vmovdqu	%xmm0, (%esp)
-L_AES_GCM_decrypt_final_avx2_cmp_tag_loop:
-        movzbl	(%esp,%ecx,1), %eax
-        xorb	(%esi,%ecx,1), %al
-        orb	%al, %dl
-        incl	%ecx
-        cmpl	44(%esp), %ecx
-        jne	L_AES_GCM_decrypt_final_avx2_cmp_tag_loop
-        cmpb	$0x00, %dl
-        sete	%dl
-        jmp	L_AES_GCM_decrypt_final_avx2_cmp_tag_done
-L_AES_GCM_decrypt_final_avx2_cmp_tag_16:
-        vmovdqu	(%esi), %xmm1
-        vpcmpeqb	%xmm1, %xmm0, %xmm0
-        vpmovmskb	%xmm0, %ecx
-        # if %ecx == 0xFFFF then %edx = 1 else %edx = 0
-        xorl	%edx, %edx
-        cmpl	$0xffff, %ecx
-        sete	%dl
-L_AES_GCM_decrypt_final_avx2_cmp_tag_done:
-        movl	%edx, (%edi)
-        addl	$16, %esp
-        popl	%ebp
-        popl	%edi
-        popl	%esi
-        popl	%ebx
-        ret
-.size	AES_GCM_decrypt_final_avx2,.-AES_GCM_decrypt_final_avx2
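The cmp_tag code above compares tags without an early exit: the 16-byte path reduces vpcmpeqb/vpmovmskb to a single mask test, and the partial path XORs each byte pair and ORs the differences together, so timing does not reveal the position of the first mismatch. The byte loop in C, a sketch only:

#include <stdint.h>
#include <stddef.h>

/* Returns 1 when the tags match, 0 otherwise, touching every byte
 * regardless of where a mismatch occurs. */
int gcm_tag_equal_sketch(const uint8_t *calc, const uint8_t *given, size_t n)
{
    uint8_t diff = 0;
    for (size_t i = 0; i < n; i++)
        diff |= (uint8_t)(calc[i] ^ given[i]);
    return diff == 0;
}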
-#endif /* WOLFSSL_AESGCM_STREAM */
-#endif /* HAVE_INTEL_AVX2 */
-
-#if defined(__linux__) && defined(__ELF__)
-.section	.note.GNU-stack,"",%progbits
-#endif

+ 1 - 1
lib/wolfssl/wolfcrypt/src/asm.c

@@ -118,7 +118,7 @@ WC_INLINE static int set_cpuid_flags(void) {
    if(IS_INTEL_BMI2 && IS_INTEL_ADX){  func;  ret ;  }
 
 #else
-    #define IF_HAVE_INTEL_MULX(func, ret)
+    #define IF_HAVE_INTEL_MULX(func, ret) WC_DO_NOTHING
 #endif
 
 #if defined(TFM_X86) && !defined(TFM_SSE2)
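The point of the asm.c hunk above is that a macro which expands to nothing leaves a bare ";" behind, which newer compilers warn about (for example -Wextra-semi-stmt) and which is fragile in unbraced if/else chains; WC_DO_NOTHING keeps the expansion a single well-formed statement. An illustrative sketch, assuming WC_DO_NOTHING has the usual do { } while (0) shape:

#include <stdio.h>

#define EMPTY_MACRO(func, ret)                    /* old: expands to nothing */
#define SAFE_MACRO(func, ret)  do { } while (0)   /* shape of WC_DO_NOTHING  */

int main(void)
{
    int have_mulx = 0;
    if (have_mulx)
        EMPTY_MACRO(mulx_path, ret);  /* leaves only ";" after expansion */
    else
        SAFE_MACRO(plain_path, ret);  /* always one complete statement   */
    puts("ok");
    return 0;
}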

File diff is not shown because of its large size
+ 619 - 40
lib/wolfssl/wolfcrypt/src/asn.c


+ 2 - 2
lib/wolfssl/wolfcrypt/src/camellia.c

@@ -1464,7 +1464,7 @@ static void camellia_decrypt256(const u32 *subkey, u32 *io)
  * API for compatibility
  */
 
-static void Camellia_EncryptBlock(const int keyBitLength,
+static void Camellia_EncryptBlock(const word32 keyBitLength,
                            const unsigned char *plaintext,
                            const KEY_TABLE_TYPE keyTable,
                            unsigned char *ciphertext)
@@ -1495,7 +1495,7 @@ static void Camellia_EncryptBlock(const int keyBitLength,
     PUTU32(ciphertext + 12, tmp[3]);
 }
 
-static void Camellia_DecryptBlock(const int keyBitLength,
+static void Camellia_DecryptBlock(const word32 keyBitLength,
                            const unsigned char *ciphertext,
                            const KEY_TABLE_TYPE keyTable,
                            unsigned char *plaintext)

+ 7 - 6
lib/wolfssl/wolfcrypt/src/chacha.c

@@ -28,17 +28,18 @@ D. J. Bernstein
 Public domain.
 
 */
-#ifdef WOLFSSL_ARMASM
-    /* implementation is located in wolfcrypt/src/port/arm/armv8-chacha.c */
 
-#else
 #ifdef HAVE_CONFIG_H
     #include <config.h>
 #endif
 
 #include <wolfssl/wolfcrypt/settings.h>
 
-#if defined(HAVE_CHACHA) && !defined(WOLFSSL_ARMASM)
+#if defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_ARMASM_NO_NEON)
+    /* implementation is located in wolfcrypt/src/port/arm/armv8-chacha.c */
+
+#else
+#if defined(HAVE_CHACHA)
 
 #include <wolfssl/wolfcrypt/chacha.h>
 #include <wolfssl/wolfcrypt/error-crypt.h>
@@ -436,6 +437,6 @@ void wc_Chacha_purge_current_block(ChaCha* ctx) {
     }
 }
 
-#endif /* HAVE_CHACHA*/
+#endif /* HAVE_CHACHA */
 
-#endif /* WOLFSSL_ARMASM */
+#endif /* WOLFSSL_ARMASM && !WOLFSSL_ARMASM_NO_NEON */

+ 4 - 4
lib/wolfssl/wolfcrypt/src/chacha20_poly1305.c

@@ -355,9 +355,9 @@ static WC_INLINE int wc_XChaCha20Poly1305_crypt_oneshot(
     int isEncrypt)
 {
     int ret;
-    ssize_t dst_len = isEncrypt ?
-        (ssize_t)src_len + POLY1305_DIGEST_SIZE :
-        (ssize_t)src_len - POLY1305_DIGEST_SIZE;
+    long int dst_len = isEncrypt ?
+        (long int)src_len + POLY1305_DIGEST_SIZE :
+        (long int)src_len - POLY1305_DIGEST_SIZE;
     const byte *src_i;
     byte *dst_i;
     size_t src_len_rem;
@@ -375,7 +375,7 @@ static WC_INLINE int wc_XChaCha20Poly1305_crypt_oneshot(
         goto out;
     }
 
-    if ((ssize_t)dst_space < dst_len) {
+    if ((long int)dst_space < dst_len) {
         ret = BUFFER_E;
         goto out;
     }
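The chacha20_poly1305.c hunk above swaps POSIX-only ssize_t for long int while keeping the arithmetic signed: on decrypt the Poly1305 tag is subtracted from the source length, and a negative result must survive to be rejected. A small sketch of that computation (oneshot_dst_len is an illustrative name, not a wolfSSL function):

#include <stddef.h>

#define POLY1305_DIGEST_SIZE 16

/* Signed on purpose: src_len < 16 on decrypt yields a negative length
 * that the caller can reject before touching any buffers. */
long int oneshot_dst_len(size_t src_len, int isEncrypt)
{
    return isEncrypt ? (long int)src_len + POLY1305_DIGEST_SIZE
                     : (long int)src_len - POLY1305_DIGEST_SIZE;
}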

+ 0 - 1453
lib/wolfssl/wolfcrypt/src/chacha_asm.S

@@ -1,1453 +0,0 @@
-/* chacha_asm
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-#ifdef WOLFSSL_USER_SETTINGS
-#ifdef WOLFSSL_USER_SETTINGS_ASM
-/*
- * user_settings_asm.h is a file generated by the script user_settings_asm.sh.
- * The script takes in a user_settings.h and produces user_settings_asm.h, which
- * is a stripped down version of user_settings.h containing only preprocessor
- * directives. This makes the header safe to include in assembly (.S) files.
- */
-#include "user_settings_asm.h"
-#else
-/*
- * Note: if user_settings.h contains any C code (e.g. a typedef or function
- * prototype), including it here in an assembly (.S) file will cause an
- * assembler failure. See user_settings_asm.h above.
- */
-#include "user_settings.h"
-#endif /* WOLFSSL_USER_SETTINGS_ASM */
-#endif /* WOLFSSL_USER_SETTINGS */
-
-#ifndef HAVE_INTEL_AVX1
-#define HAVE_INTEL_AVX1
-#endif /* HAVE_INTEL_AVX1 */
-#ifndef NO_AVX2_SUPPORT
-#define HAVE_INTEL_AVX2
-#endif /* NO_AVX2_SUPPORT */
-
-#ifdef WOLFSSL_X86_64_BUILD
-#ifndef __APPLE__
-.text
-.globl	chacha_encrypt_x64
-.type	chacha_encrypt_x64,@function
-.align	16
-chacha_encrypt_x64:
-#else
-.section	__TEXT,__text
-.globl	_chacha_encrypt_x64
-.p2align	4
-_chacha_encrypt_x64:
-#endif /* __APPLE__ */
-        pushq	%rbx
-        pushq	%rbp
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        pushq	%r15
-        subq	$0x40, %rsp
-        cmpl	$0x40, %ecx
-        jl	L_chacha_x64_small
-L_chacha_x64_start:
-        subq	$48, %rsp
-        movq	%rdx, 24(%rsp)
-        movq	%rsi, 32(%rsp)
-        movq	%rcx, 40(%rsp)
-        movq	32(%rdi), %rax
-        movq	40(%rdi), %rbx
-        movq	%rax, 8(%rsp)
-        movq	%rbx, 16(%rsp)
-        movl	(%rdi), %eax
-        movl	4(%rdi), %ebx
-        movl	8(%rdi), %ecx
-        movl	12(%rdi), %edx
-        movl	16(%rdi), %r8d
-        movl	20(%rdi), %r9d
-        movl	24(%rdi), %r10d
-        movl	28(%rdi), %r11d
-        movl	48(%rdi), %r12d
-        movl	52(%rdi), %r13d
-        movl	56(%rdi), %r14d
-        movl	60(%rdi), %r15d
-        movb	$10, (%rsp)
-        movl	8(%rsp), %esi
-        movl	12(%rsp), %ebp
-L_chacha_x64_block_crypt_start:
-        addl	%r8d, %eax
-        addl	%r9d, %ebx
-        xorl	%eax, %r12d
-        xorl	%ebx, %r13d
-        roll	$16, %r12d
-        roll	$16, %r13d
-        addl	%r12d, %esi
-        addl	%r13d, %ebp
-        xorl	%esi, %r8d
-        xorl	%ebp, %r9d
-        roll	$12, %r8d
-        roll	$12, %r9d
-        addl	%r8d, %eax
-        addl	%r9d, %ebx
-        xorl	%eax, %r12d
-        xorl	%ebx, %r13d
-        roll	$8, %r12d
-        roll	$8, %r13d
-        addl	%r12d, %esi
-        addl	%r13d, %ebp
-        xorl	%esi, %r8d
-        xorl	%ebp, %r9d
-        roll	$7, %r8d
-        roll	$7, %r9d
-        movl	%esi, 8(%rsp)
-        movl	%ebp, 12(%rsp)
-        movl	16(%rsp), %esi
-        movl	20(%rsp), %ebp
-        addl	%r10d, %ecx
-        addl	%r11d, %edx
-        xorl	%ecx, %r14d
-        xorl	%edx, %r15d
-        roll	$16, %r14d
-        roll	$16, %r15d
-        addl	%r14d, %esi
-        addl	%r15d, %ebp
-        xorl	%esi, %r10d
-        xorl	%ebp, %r11d
-        roll	$12, %r10d
-        roll	$12, %r11d
-        addl	%r10d, %ecx
-        addl	%r11d, %edx
-        xorl	%ecx, %r14d
-        xorl	%edx, %r15d
-        roll	$8, %r14d
-        roll	$8, %r15d
-        addl	%r14d, %esi
-        addl	%r15d, %ebp
-        xorl	%esi, %r10d
-        xorl	%ebp, %r11d
-        roll	$7, %r10d
-        roll	$7, %r11d
-        addl	%r9d, %eax
-        addl	%r10d, %ebx
-        xorl	%eax, %r15d
-        xorl	%ebx, %r12d
-        roll	$16, %r15d
-        roll	$16, %r12d
-        addl	%r15d, %esi
-        addl	%r12d, %ebp
-        xorl	%esi, %r9d
-        xorl	%ebp, %r10d
-        roll	$12, %r9d
-        roll	$12, %r10d
-        addl	%r9d, %eax
-        addl	%r10d, %ebx
-        xorl	%eax, %r15d
-        xorl	%ebx, %r12d
-        roll	$8, %r15d
-        roll	$8, %r12d
-        addl	%r15d, %esi
-        addl	%r12d, %ebp
-        xorl	%esi, %r9d
-        xorl	%ebp, %r10d
-        roll	$7, %r9d
-        roll	$7, %r10d
-        movl	%esi, 16(%rsp)
-        movl	%ebp, 20(%rsp)
-        movl	8(%rsp), %esi
-        movl	12(%rsp), %ebp
-        addl	%r11d, %ecx
-        addl	%r8d, %edx
-        xorl	%ecx, %r13d
-        xorl	%edx, %r14d
-        roll	$16, %r13d
-        roll	$16, %r14d
-        addl	%r13d, %esi
-        addl	%r14d, %ebp
-        xorl	%esi, %r11d
-        xorl	%ebp, %r8d
-        roll	$12, %r11d
-        roll	$12, %r8d
-        addl	%r11d, %ecx
-        addl	%r8d, %edx
-        xorl	%ecx, %r13d
-        xorl	%edx, %r14d
-        roll	$8, %r13d
-        roll	$8, %r14d
-        addl	%r13d, %esi
-        addl	%r14d, %ebp
-        xorl	%esi, %r11d
-        xorl	%ebp, %r8d
-        roll	$7, %r11d
-        roll	$7, %r8d
-        decb	(%rsp)
-        jnz	L_chacha_x64_block_crypt_start
-        movl	%esi, 8(%rsp)
-        movl	%ebp, 12(%rsp)
-        movq	32(%rsp), %rsi
-        movq	24(%rsp), %rbp
-        addl	(%rdi), %eax
-        addl	4(%rdi), %ebx
-        addl	8(%rdi), %ecx
-        addl	12(%rdi), %edx
-        addl	16(%rdi), %r8d
-        addl	20(%rdi), %r9d
-        addl	24(%rdi), %r10d
-        addl	28(%rdi), %r11d
-        addl	48(%rdi), %r12d
-        addl	52(%rdi), %r13d
-        addl	56(%rdi), %r14d
-        addl	60(%rdi), %r15d
-        xorl	(%rsi), %eax
-        xorl	4(%rsi), %ebx
-        xorl	8(%rsi), %ecx
-        xorl	12(%rsi), %edx
-        xorl	16(%rsi), %r8d
-        xorl	20(%rsi), %r9d
-        xorl	24(%rsi), %r10d
-        xorl	28(%rsi), %r11d
-        xorl	48(%rsi), %r12d
-        xorl	52(%rsi), %r13d
-        xorl	56(%rsi), %r14d
-        xorl	60(%rsi), %r15d
-        movl	%eax, (%rbp)
-        movl	%ebx, 4(%rbp)
-        movl	%ecx, 8(%rbp)
-        movl	%edx, 12(%rbp)
-        movl	%r8d, 16(%rbp)
-        movl	%r9d, 20(%rbp)
-        movl	%r10d, 24(%rbp)
-        movl	%r11d, 28(%rbp)
-        movl	%r12d, 48(%rbp)
-        movl	%r13d, 52(%rbp)
-        movl	%r14d, 56(%rbp)
-        movl	%r15d, 60(%rbp)
-        movl	8(%rsp), %eax
-        movl	12(%rsp), %ebx
-        movl	16(%rsp), %ecx
-        movl	20(%rsp), %edx
-        addl	32(%rdi), %eax
-        addl	36(%rdi), %ebx
-        addl	40(%rdi), %ecx
-        addl	44(%rdi), %edx
-        xorl	32(%rsi), %eax
-        xorl	36(%rsi), %ebx
-        xorl	40(%rsi), %ecx
-        xorl	44(%rsi), %edx
-        movl	%eax, 32(%rbp)
-        movl	%ebx, 36(%rbp)
-        movl	%ecx, 40(%rbp)
-        movl	%edx, 44(%rbp)
-        movq	24(%rsp), %rdx
-        movq	40(%rsp), %rcx
-        addl	$0x01, 48(%rdi)
-        addq	$48, %rsp
-        subl	$0x40, %ecx
-        addq	$0x40, %rsi
-        addq	$0x40, %rdx
-        cmpl	$0x40, %ecx
-        jge	L_chacha_x64_start
-L_chacha_x64_small:
-        cmpl	$0x00, %ecx
-        je	L_chacha_x64_done
-        subq	$48, %rsp
-        movq	%rdx, 24(%rsp)
-        movq	%rsi, 32(%rsp)
-        movq	%rcx, 40(%rsp)
-        movq	32(%rdi), %rax
-        movq	40(%rdi), %rbx
-        movq	%rax, 8(%rsp)
-        movq	%rbx, 16(%rsp)
-        movl	(%rdi), %eax
-        movl	4(%rdi), %ebx
-        movl	8(%rdi), %ecx
-        movl	12(%rdi), %edx
-        movl	16(%rdi), %r8d
-        movl	20(%rdi), %r9d
-        movl	24(%rdi), %r10d
-        movl	28(%rdi), %r11d
-        movl	48(%rdi), %r12d
-        movl	52(%rdi), %r13d
-        movl	56(%rdi), %r14d
-        movl	60(%rdi), %r15d
-        movb	$10, (%rsp)
-        movl	8(%rsp), %esi
-        movl	12(%rsp), %ebp
-L_chacha_x64_partial_crypt_start:
-        addl	%r8d, %eax
-        addl	%r9d, %ebx
-        xorl	%eax, %r12d
-        xorl	%ebx, %r13d
-        roll	$16, %r12d
-        roll	$16, %r13d
-        addl	%r12d, %esi
-        addl	%r13d, %ebp
-        xorl	%esi, %r8d
-        xorl	%ebp, %r9d
-        roll	$12, %r8d
-        roll	$12, %r9d
-        addl	%r8d, %eax
-        addl	%r9d, %ebx
-        xorl	%eax, %r12d
-        xorl	%ebx, %r13d
-        roll	$8, %r12d
-        roll	$8, %r13d
-        addl	%r12d, %esi
-        addl	%r13d, %ebp
-        xorl	%esi, %r8d
-        xorl	%ebp, %r9d
-        roll	$7, %r8d
-        roll	$7, %r9d
-        movl	%esi, 8(%rsp)
-        movl	%ebp, 12(%rsp)
-        movl	16(%rsp), %esi
-        movl	20(%rsp), %ebp
-        addl	%r10d, %ecx
-        addl	%r11d, %edx
-        xorl	%ecx, %r14d
-        xorl	%edx, %r15d
-        roll	$16, %r14d
-        roll	$16, %r15d
-        addl	%r14d, %esi
-        addl	%r15d, %ebp
-        xorl	%esi, %r10d
-        xorl	%ebp, %r11d
-        roll	$12, %r10d
-        roll	$12, %r11d
-        addl	%r10d, %ecx
-        addl	%r11d, %edx
-        xorl	%ecx, %r14d
-        xorl	%edx, %r15d
-        roll	$8, %r14d
-        roll	$8, %r15d
-        addl	%r14d, %esi
-        addl	%r15d, %ebp
-        xorl	%esi, %r10d
-        xorl	%ebp, %r11d
-        roll	$7, %r10d
-        roll	$7, %r11d
-        addl	%r9d, %eax
-        addl	%r10d, %ebx
-        xorl	%eax, %r15d
-        xorl	%ebx, %r12d
-        roll	$16, %r15d
-        roll	$16, %r12d
-        addl	%r15d, %esi
-        addl	%r12d, %ebp
-        xorl	%esi, %r9d
-        xorl	%ebp, %r10d
-        roll	$12, %r9d
-        roll	$12, %r10d
-        addl	%r9d, %eax
-        addl	%r10d, %ebx
-        xorl	%eax, %r15d
-        xorl	%ebx, %r12d
-        roll	$8, %r15d
-        roll	$8, %r12d
-        addl	%r15d, %esi
-        addl	%r12d, %ebp
-        xorl	%esi, %r9d
-        xorl	%ebp, %r10d
-        roll	$7, %r9d
-        roll	$7, %r10d
-        movl	%esi, 16(%rsp)
-        movl	%ebp, 20(%rsp)
-        movl	8(%rsp), %esi
-        movl	12(%rsp), %ebp
-        addl	%r11d, %ecx
-        addl	%r8d, %edx
-        xorl	%ecx, %r13d
-        xorl	%edx, %r14d
-        roll	$16, %r13d
-        roll	$16, %r14d
-        addl	%r13d, %esi
-        addl	%r14d, %ebp
-        xorl	%esi, %r11d
-        xorl	%ebp, %r8d
-        roll	$12, %r11d
-        roll	$12, %r8d
-        addl	%r11d, %ecx
-        addl	%r8d, %edx
-        xorl	%ecx, %r13d
-        xorl	%edx, %r14d
-        roll	$8, %r13d
-        roll	$8, %r14d
-        addl	%r13d, %esi
-        addl	%r14d, %ebp
-        xorl	%esi, %r11d
-        xorl	%ebp, %r8d
-        roll	$7, %r11d
-        roll	$7, %r8d
-        decb	(%rsp)
-        jnz	L_chacha_x64_partial_crypt_start
-        movl	%esi, 8(%rsp)
-        movl	%ebp, 12(%rsp)
-        movq	32(%rsp), %rsi
-        addl	(%rdi), %eax
-        addl	4(%rdi), %ebx
-        addl	8(%rdi), %ecx
-        addl	12(%rdi), %edx
-        addl	16(%rdi), %r8d
-        addl	20(%rdi), %r9d
-        addl	24(%rdi), %r10d
-        addl	28(%rdi), %r11d
-        addl	48(%rdi), %r12d
-        addl	52(%rdi), %r13d
-        addl	56(%rdi), %r14d
-        addl	60(%rdi), %r15d
-        leaq	80(%rdi), %rbp
-        movl	%eax, (%rbp)
-        movl	%ebx, 4(%rbp)
-        movl	%ecx, 8(%rbp)
-        movl	%edx, 12(%rbp)
-        movl	%r8d, 16(%rbp)
-        movl	%r9d, 20(%rbp)
-        movl	%r10d, 24(%rbp)
-        movl	%r11d, 28(%rbp)
-        movl	%r12d, 48(%rbp)
-        movl	%r13d, 52(%rbp)
-        movl	%r14d, 56(%rbp)
-        movl	%r15d, 60(%rbp)
-        movl	8(%rsp), %eax
-        movl	12(%rsp), %ebx
-        movl	16(%rsp), %ecx
-        movl	20(%rsp), %edx
-        addl	32(%rdi), %eax
-        addl	36(%rdi), %ebx
-        addl	40(%rdi), %ecx
-        addl	44(%rdi), %edx
-        movl	%eax, 32(%rbp)
-        movl	%ebx, 36(%rbp)
-        movl	%ecx, 40(%rbp)
-        movl	%edx, 44(%rbp)
-        movq	24(%rsp), %rdx
-        movq	40(%rsp), %rcx
-        addl	$0x01, 48(%rdi)
-        addq	$48, %rsp
-        movl	%ecx, %r8d
-        xorq	%rbx, %rbx
-        andl	$7, %r8d
-        jz	L_chacha_x64_partial_start64
-L_chacha_x64_partial_start8:
-        movzbl	(%rbp,%rbx,1), %eax
-        xorb	(%rsi,%rbx,1), %al
-        movb	%al, (%rdx,%rbx,1)
-        incl	%ebx
-        cmpl	%r8d, %ebx
-        jne	L_chacha_x64_partial_start8
-        je	L_chacha_x64_partial_end64
-L_chacha_x64_partial_start64:
-        movq	(%rbp,%rbx,1), %rax
-        xorq	(%rsi,%rbx,1), %rax
-        movq	%rax, (%rdx,%rbx,1)
-        addl	$8, %ebx
-L_chacha_x64_partial_end64:
-        cmpl	%ecx, %ebx
-        jne	L_chacha_x64_partial_start64
-        movl	$0x40, %ecx
-        subl	%ebx, %ecx
-        movl	%ecx, 76(%rdi)
-L_chacha_x64_done:
-        addq	$0x40, %rsp
-        popq	%r15
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        popq	%rbp
-        popq	%rbx
-        repz retq
-#ifndef __APPLE__
-.size	chacha_encrypt_x64,.-chacha_encrypt_x64
-#endif /* __APPLE__ */
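Each addl/xorl/roll run in the deleted x64 kernel is one ChaCha quarter round with the standard 16/12/8/7 rotation schedule, applied to two column pairs at a time with stack spills to stay within the register file. The round in C:

#include <stdint.h>

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

/* One ChaCha quarter round, as in the roll-16/12/8/7 sequences above. */
static void chacha_quarter_round(uint32_t *a, uint32_t *b,
                                 uint32_t *c, uint32_t *d)
{
    *a += *b; *d ^= *a; *d = ROTL32(*d, 16);
    *c += *d; *b ^= *c; *b = ROTL32(*b, 12);
    *a += *b; *d ^= *a; *d = ROTL32(*d, 8);
    *c += *d; *b ^= *c; *b = ROTL32(*b, 7);
}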
-#ifdef HAVE_INTEL_AVX1
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_chacha20_avx1_rotl8:
-.quad	0x605040702010003, 0xe0d0c0f0a09080b
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_chacha20_avx1_rotl16:
-.quad	0x504070601000302, 0xd0c0f0e09080b0a
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_chacha20_avx1_add:
-.quad	0x100000000, 0x300000002
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	16
-#else
-.p2align	4
-#endif /* __APPLE__ */
-L_chacha20_avx1_four:
-.quad	0x400000004, 0x400000004
-#ifndef __APPLE__
-.text
-.globl	chacha_encrypt_avx1
-.type	chacha_encrypt_avx1,@function
-.align	16
-chacha_encrypt_avx1:
-#else
-.section	__TEXT,__text
-.globl	_chacha_encrypt_avx1
-.p2align	4
-_chacha_encrypt_avx1:
-#endif /* __APPLE__ */
-        subq	$0x190, %rsp
-        movq	%rsp, %r9
-        leaq	256(%rsp), %r10
-        andq	$-16, %r9
-        andq	$-16, %r10
-        movl	%ecx, %eax
-        shrl	$8, %eax
-        jz	L_chacha20_avx1_end128
-        vpshufd	$0x00, (%rdi), %xmm0
-        vpshufd	$0x00, 4(%rdi), %xmm1
-        vpshufd	$0x00, 8(%rdi), %xmm2
-        vpshufd	$0x00, 12(%rdi), %xmm3
-        vpshufd	$0x00, 16(%rdi), %xmm4
-        vpshufd	$0x00, 20(%rdi), %xmm5
-        vpshufd	$0x00, 24(%rdi), %xmm6
-        vpshufd	$0x00, 28(%rdi), %xmm7
-        vpshufd	$0x00, 32(%rdi), %xmm8
-        vpshufd	$0x00, 36(%rdi), %xmm9
-        vpshufd	$0x00, 40(%rdi), %xmm10
-        vpshufd	$0x00, 44(%rdi), %xmm11
-        vpshufd	$0x00, 48(%rdi), %xmm12
-        vpshufd	$0x00, 52(%rdi), %xmm13
-        vpshufd	$0x00, 56(%rdi), %xmm14
-        vpshufd	$0x00, 60(%rdi), %xmm15
-        vpaddd	L_chacha20_avx1_add(%rip), %xmm12, %xmm12
-        vmovdqa	%xmm0, (%r9)
-        vmovdqa	%xmm1, 16(%r9)
-        vmovdqa	%xmm2, 32(%r9)
-        vmovdqa	%xmm3, 48(%r9)
-        vmovdqa	%xmm4, 64(%r9)
-        vmovdqa	%xmm5, 80(%r9)
-        vmovdqa	%xmm6, 96(%r9)
-        vmovdqa	%xmm7, 112(%r9)
-        vmovdqa	%xmm8, 128(%r9)
-        vmovdqa	%xmm9, 144(%r9)
-        vmovdqa	%xmm10, 160(%r9)
-        vmovdqa	%xmm11, 176(%r9)
-        vmovdqa	%xmm12, 192(%r9)
-        vmovdqa	%xmm13, 208(%r9)
-        vmovdqa	%xmm14, 224(%r9)
-        vmovdqa	%xmm15, 240(%r9)
-L_chacha20_avx1_start128:
-        vmovdqa	%xmm11, 48(%r10)
-        movb	$10, %r8b
-L_chacha20_avx1_loop128:
-        vpaddd	%xmm4, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm12, %xmm12
-        vmovdqa	48(%r10), %xmm11
-        vpshufb	L_chacha20_avx1_rotl16(%rip), %xmm12, %xmm12
-        vpaddd	%xmm12, %xmm8, %xmm8
-        vpxor	%xmm8, %xmm4, %xmm4
-        vpaddd	%xmm5, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm13, %xmm13
-        vpshufb	L_chacha20_avx1_rotl16(%rip), %xmm13, %xmm13
-        vpaddd	%xmm13, %xmm9, %xmm9
-        vpxor	%xmm9, %xmm5, %xmm5
-        vpaddd	%xmm6, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm14, %xmm14
-        vpshufb	L_chacha20_avx1_rotl16(%rip), %xmm14, %xmm14
-        vpaddd	%xmm14, %xmm10, %xmm10
-        vpxor	%xmm10, %xmm6, %xmm6
-        vpaddd	%xmm7, %xmm3, %xmm3
-        vpxor	%xmm3, %xmm15, %xmm15
-        vpshufb	L_chacha20_avx1_rotl16(%rip), %xmm15, %xmm15
-        vpaddd	%xmm15, %xmm11, %xmm11
-        vpxor	%xmm11, %xmm7, %xmm7
-        vmovdqa	%xmm11, 48(%r10)
-        vpsrld	$20, %xmm4, %xmm11
-        vpslld	$12, %xmm4, %xmm4
-        vpxor	%xmm11, %xmm4, %xmm4
-        vpsrld	$20, %xmm5, %xmm11
-        vpslld	$12, %xmm5, %xmm5
-        vpxor	%xmm11, %xmm5, %xmm5
-        vpsrld	$20, %xmm6, %xmm11
-        vpslld	$12, %xmm6, %xmm6
-        vpxor	%xmm11, %xmm6, %xmm6
-        vpsrld	$20, %xmm7, %xmm11
-        vpslld	$12, %xmm7, %xmm7
-        vpxor	%xmm11, %xmm7, %xmm7
-        vpaddd	%xmm4, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm12, %xmm12
-        vmovdqa	48(%r10), %xmm11
-        vpshufb	L_chacha20_avx1_rotl8(%rip), %xmm12, %xmm12
-        vpaddd	%xmm12, %xmm8, %xmm8
-        vpxor	%xmm8, %xmm4, %xmm4
-        vpaddd	%xmm5, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm13, %xmm13
-        vpshufb	L_chacha20_avx1_rotl8(%rip), %xmm13, %xmm13
-        vpaddd	%xmm13, %xmm9, %xmm9
-        vpxor	%xmm9, %xmm5, %xmm5
-        vpaddd	%xmm6, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm14, %xmm14
-        vpshufb	L_chacha20_avx1_rotl8(%rip), %xmm14, %xmm14
-        vpaddd	%xmm14, %xmm10, %xmm10
-        vpxor	%xmm10, %xmm6, %xmm6
-        vpaddd	%xmm7, %xmm3, %xmm3
-        vpxor	%xmm3, %xmm15, %xmm15
-        vpshufb	L_chacha20_avx1_rotl8(%rip), %xmm15, %xmm15
-        vpaddd	%xmm15, %xmm11, %xmm11
-        vpxor	%xmm11, %xmm7, %xmm7
-        vmovdqa	%xmm11, 48(%r10)
-        vpsrld	$25, %xmm4, %xmm11
-        vpslld	$7, %xmm4, %xmm4
-        vpxor	%xmm11, %xmm4, %xmm4
-        vpsrld	$25, %xmm5, %xmm11
-        vpslld	$7, %xmm5, %xmm5
-        vpxor	%xmm11, %xmm5, %xmm5
-        vpsrld	$25, %xmm6, %xmm11
-        vpslld	$7, %xmm6, %xmm6
-        vpxor	%xmm11, %xmm6, %xmm6
-        vpsrld	$25, %xmm7, %xmm11
-        vpslld	$7, %xmm7, %xmm7
-        vpxor	%xmm11, %xmm7, %xmm7
-        vpaddd	%xmm5, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm15, %xmm15
-        vmovdqa	48(%r10), %xmm11
-        vpshufb	L_chacha20_avx1_rotl16(%rip), %xmm15, %xmm15
-        vpaddd	%xmm15, %xmm10, %xmm10
-        vpxor	%xmm10, %xmm5, %xmm5
-        vpaddd	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm12, %xmm12
-        vpshufb	L_chacha20_avx1_rotl16(%rip), %xmm12, %xmm12
-        vpaddd	%xmm12, %xmm11, %xmm11
-        vpxor	%xmm11, %xmm6, %xmm6
-        vpaddd	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm13, %xmm13
-        vpshufb	L_chacha20_avx1_rotl16(%rip), %xmm13, %xmm13
-        vpaddd	%xmm13, %xmm8, %xmm8
-        vpxor	%xmm8, %xmm7, %xmm7
-        vpaddd	%xmm4, %xmm3, %xmm3
-        vpxor	%xmm3, %xmm14, %xmm14
-        vpshufb	L_chacha20_avx1_rotl16(%rip), %xmm14, %xmm14
-        vpaddd	%xmm14, %xmm9, %xmm9
-        vpxor	%xmm9, %xmm4, %xmm4
-        vmovdqa	%xmm11, 48(%r10)
-        vpsrld	$20, %xmm5, %xmm11
-        vpslld	$12, %xmm5, %xmm5
-        vpxor	%xmm11, %xmm5, %xmm5
-        vpsrld	$20, %xmm6, %xmm11
-        vpslld	$12, %xmm6, %xmm6
-        vpxor	%xmm11, %xmm6, %xmm6
-        vpsrld	$20, %xmm7, %xmm11
-        vpslld	$12, %xmm7, %xmm7
-        vpxor	%xmm11, %xmm7, %xmm7
-        vpsrld	$20, %xmm4, %xmm11
-        vpslld	$12, %xmm4, %xmm4
-        vpxor	%xmm11, %xmm4, %xmm4
-        vpaddd	%xmm5, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm15, %xmm15
-        vmovdqa	48(%r10), %xmm11
-        vpshufb	L_chacha20_avx1_rotl8(%rip), %xmm15, %xmm15
-        vpaddd	%xmm15, %xmm10, %xmm10
-        vpxor	%xmm10, %xmm5, %xmm5
-        vpaddd	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm1, %xmm12, %xmm12
-        vpshufb	L_chacha20_avx1_rotl8(%rip), %xmm12, %xmm12
-        vpaddd	%xmm12, %xmm11, %xmm11
-        vpxor	%xmm11, %xmm6, %xmm6
-        vpaddd	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm13, %xmm13
-        vpshufb	L_chacha20_avx1_rotl8(%rip), %xmm13, %xmm13
-        vpaddd	%xmm13, %xmm8, %xmm8
-        vpxor	%xmm8, %xmm7, %xmm7
-        vpaddd	%xmm4, %xmm3, %xmm3
-        vpxor	%xmm3, %xmm14, %xmm14
-        vpshufb	L_chacha20_avx1_rotl8(%rip), %xmm14, %xmm14
-        vpaddd	%xmm14, %xmm9, %xmm9
-        vpxor	%xmm9, %xmm4, %xmm4
-        vmovdqa	%xmm11, 48(%r10)
-        vpsrld	$25, %xmm5, %xmm11
-        vpslld	$7, %xmm5, %xmm5
-        vpxor	%xmm11, %xmm5, %xmm5
-        vpsrld	$25, %xmm6, %xmm11
-        vpslld	$7, %xmm6, %xmm6
-        vpxor	%xmm11, %xmm6, %xmm6
-        vpsrld	$25, %xmm7, %xmm11
-        vpslld	$7, %xmm7, %xmm7
-        vpxor	%xmm11, %xmm7, %xmm7
-        vpsrld	$25, %xmm4, %xmm11
-        vpslld	$7, %xmm4, %xmm4
-        vpxor	%xmm11, %xmm4, %xmm4
-        decb	%r8b
-        jnz	L_chacha20_avx1_loop128
-        vmovdqa	48(%r10), %xmm11
-        vpaddd	(%r9), %xmm0, %xmm0
-        vpaddd	16(%r9), %xmm1, %xmm1
-        vpaddd	32(%r9), %xmm2, %xmm2
-        vpaddd	48(%r9), %xmm3, %xmm3
-        vpaddd	64(%r9), %xmm4, %xmm4
-        vpaddd	80(%r9), %xmm5, %xmm5
-        vpaddd	96(%r9), %xmm6, %xmm6
-        vpaddd	112(%r9), %xmm7, %xmm7
-        vpaddd	128(%r9), %xmm8, %xmm8
-        vpaddd	144(%r9), %xmm9, %xmm9
-        vpaddd	160(%r9), %xmm10, %xmm10
-        vpaddd	176(%r9), %xmm11, %xmm11
-        vpaddd	192(%r9), %xmm12, %xmm12
-        vpaddd	208(%r9), %xmm13, %xmm13
-        vpaddd	224(%r9), %xmm14, %xmm14
-        vpaddd	240(%r9), %xmm15, %xmm15
-        vmovdqa	%xmm8, (%r10)
-        vmovdqa	%xmm9, 16(%r10)
-        vmovdqa	%xmm10, 32(%r10)
-        vmovdqa	%xmm11, 48(%r10)
-        vmovdqa	%xmm12, 64(%r10)
-        vmovdqa	%xmm13, 80(%r10)
-        vmovdqa	%xmm14, 96(%r10)
-        vmovdqa	%xmm15, 112(%r10)
-        vpunpckldq	%xmm1, %xmm0, %xmm8
-        vpunpckldq	%xmm3, %xmm2, %xmm9
-        vpunpckhdq	%xmm1, %xmm0, %xmm12
-        vpunpckhdq	%xmm3, %xmm2, %xmm13
-        vpunpckldq	%xmm5, %xmm4, %xmm10
-        vpunpckldq	%xmm7, %xmm6, %xmm11
-        vpunpckhdq	%xmm5, %xmm4, %xmm14
-        vpunpckhdq	%xmm7, %xmm6, %xmm15
-        vpunpcklqdq	%xmm9, %xmm8, %xmm0
-        vpunpcklqdq	%xmm11, %xmm10, %xmm1
-        vpunpckhqdq	%xmm9, %xmm8, %xmm2
-        vpunpckhqdq	%xmm11, %xmm10, %xmm3
-        vpunpcklqdq	%xmm13, %xmm12, %xmm4
-        vpunpcklqdq	%xmm15, %xmm14, %xmm5
-        vpunpckhqdq	%xmm13, %xmm12, %xmm6
-        vpunpckhqdq	%xmm15, %xmm14, %xmm7
-        vmovdqu	(%rsi), %xmm8
-        vmovdqu	16(%rsi), %xmm9
-        vmovdqu	64(%rsi), %xmm10
-        vmovdqu	80(%rsi), %xmm11
-        vmovdqu	128(%rsi), %xmm12
-        vmovdqu	144(%rsi), %xmm13
-        vmovdqu	192(%rsi), %xmm14
-        vmovdqu	208(%rsi), %xmm15
-        vpxor	%xmm8, %xmm0, %xmm0
-        vpxor	%xmm9, %xmm1, %xmm1
-        vpxor	%xmm10, %xmm2, %xmm2
-        vpxor	%xmm11, %xmm3, %xmm3
-        vpxor	%xmm12, %xmm4, %xmm4
-        vpxor	%xmm13, %xmm5, %xmm5
-        vpxor	%xmm14, %xmm6, %xmm6
-        vpxor	%xmm15, %xmm7, %xmm7
-        vmovdqu	%xmm0, (%rdx)
-        vmovdqu	%xmm1, 16(%rdx)
-        vmovdqu	%xmm2, 64(%rdx)
-        vmovdqu	%xmm3, 80(%rdx)
-        vmovdqu	%xmm4, 128(%rdx)
-        vmovdqu	%xmm5, 144(%rdx)
-        vmovdqu	%xmm6, 192(%rdx)
-        vmovdqu	%xmm7, 208(%rdx)
-        vmovdqa	(%r10), %xmm0
-        vmovdqa	16(%r10), %xmm1
-        vmovdqa	32(%r10), %xmm2
-        vmovdqa	48(%r10), %xmm3
-        vmovdqa	64(%r10), %xmm4
-        vmovdqa	80(%r10), %xmm5
-        vmovdqa	96(%r10), %xmm6
-        vmovdqa	112(%r10), %xmm7
-        vpunpckldq	%xmm1, %xmm0, %xmm8
-        vpunpckldq	%xmm3, %xmm2, %xmm9
-        vpunpckhdq	%xmm1, %xmm0, %xmm12
-        vpunpckhdq	%xmm3, %xmm2, %xmm13
-        vpunpckldq	%xmm5, %xmm4, %xmm10
-        vpunpckldq	%xmm7, %xmm6, %xmm11
-        vpunpckhdq	%xmm5, %xmm4, %xmm14
-        vpunpckhdq	%xmm7, %xmm6, %xmm15
-        vpunpcklqdq	%xmm9, %xmm8, %xmm0
-        vpunpcklqdq	%xmm11, %xmm10, %xmm1
-        vpunpckhqdq	%xmm9, %xmm8, %xmm2
-        vpunpckhqdq	%xmm11, %xmm10, %xmm3
-        vpunpcklqdq	%xmm13, %xmm12, %xmm4
-        vpunpcklqdq	%xmm15, %xmm14, %xmm5
-        vpunpckhqdq	%xmm13, %xmm12, %xmm6
-        vpunpckhqdq	%xmm15, %xmm14, %xmm7
-        vmovdqu	32(%rsi), %xmm8
-        vmovdqu	48(%rsi), %xmm9
-        vmovdqu	96(%rsi), %xmm10
-        vmovdqu	112(%rsi), %xmm11
-        vmovdqu	160(%rsi), %xmm12
-        vmovdqu	176(%rsi), %xmm13
-        vmovdqu	224(%rsi), %xmm14
-        vmovdqu	240(%rsi), %xmm15
-        vpxor	%xmm8, %xmm0, %xmm0
-        vpxor	%xmm9, %xmm1, %xmm1
-        vpxor	%xmm10, %xmm2, %xmm2
-        vpxor	%xmm11, %xmm3, %xmm3
-        vpxor	%xmm12, %xmm4, %xmm4
-        vpxor	%xmm13, %xmm5, %xmm5
-        vpxor	%xmm14, %xmm6, %xmm6
-        vpxor	%xmm15, %xmm7, %xmm7
-        vmovdqu	%xmm0, 32(%rdx)
-        vmovdqu	%xmm1, 48(%rdx)
-        vmovdqu	%xmm2, 96(%rdx)
-        vmovdqu	%xmm3, 112(%rdx)
-        vmovdqu	%xmm4, 160(%rdx)
-        vmovdqu	%xmm5, 176(%rdx)
-        vmovdqu	%xmm6, 224(%rdx)
-        vmovdqu	%xmm7, 240(%rdx)
-        vmovdqa	192(%r9), %xmm12
-        addq	$0x100, %rsi
-        addq	$0x100, %rdx
-        vpaddd	L_chacha20_avx1_four(%rip), %xmm12, %xmm12
-        subl	$0x100, %ecx
-        vmovdqa	%xmm12, 192(%r9)
-        cmpl	$0x100, %ecx
-        jl	L_chacha20_avx1_done128
-        vmovdqa	(%r9), %xmm0
-        vmovdqa	16(%r9), %xmm1
-        vmovdqa	32(%r9), %xmm2
-        vmovdqa	48(%r9), %xmm3
-        vmovdqa	64(%r9), %xmm4
-        vmovdqa	80(%r9), %xmm5
-        vmovdqa	96(%r9), %xmm6
-        vmovdqa	112(%r9), %xmm7
-        vmovdqa	128(%r9), %xmm8
-        vmovdqa	144(%r9), %xmm9
-        vmovdqa	160(%r9), %xmm10
-        vmovdqa	176(%r9), %xmm11
-        vmovdqa	192(%r9), %xmm12
-        vmovdqa	208(%r9), %xmm13
-        vmovdqa	224(%r9), %xmm14
-        vmovdqa	240(%r9), %xmm15
-        jmp	L_chacha20_avx1_start128
-L_chacha20_avx1_done128:
-        shl	$2, %eax
-        addl	%eax, 48(%rdi)
-L_chacha20_avx1_end128:
-        cmpl	$0x40, %ecx
-        jl	L_chacha20_avx1_block_done
-L_chacha20_avx1_block_start:
-        vmovdqu	(%rdi), %xmm0
-        vmovdqu	16(%rdi), %xmm1
-        vmovdqu	32(%rdi), %xmm2
-        vmovdqu	48(%rdi), %xmm3
-        vmovdqa	%xmm0, %xmm5
-        vmovdqa	%xmm1, %xmm6
-        vmovdqa	%xmm2, %xmm7
-        vmovdqa	%xmm3, %xmm8
-        movb	$10, %al
-L_chacha20_avx1_block_crypt_start:
-        vpaddd	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm3, %xmm3
-        vpshufb	L_chacha20_avx1_rotl16(%rip), %xmm3, %xmm3
-        vpaddd	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpsrld	$20, %xmm1, %xmm4
-        vpslld	$12, %xmm1, %xmm1
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpaddd	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm3, %xmm3
-        vpshufb	L_chacha20_avx1_rotl8(%rip), %xmm3, %xmm3
-        vpaddd	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpsrld	$25, %xmm1, %xmm4
-        vpslld	$7, %xmm1, %xmm1
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpshufd	$57, %xmm1, %xmm1
-        vpshufd	$0x4e, %xmm2, %xmm2
-        vpshufd	$0x93, %xmm3, %xmm3
-        vpaddd	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm3, %xmm3
-        vpshufb	L_chacha20_avx1_rotl16(%rip), %xmm3, %xmm3
-        vpaddd	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpsrld	$20, %xmm1, %xmm4
-        vpslld	$12, %xmm1, %xmm1
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpaddd	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm3, %xmm3
-        vpshufb	L_chacha20_avx1_rotl8(%rip), %xmm3, %xmm3
-        vpaddd	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpsrld	$25, %xmm1, %xmm4
-        vpslld	$7, %xmm1, %xmm1
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpshufd	$0x93, %xmm1, %xmm1
-        vpshufd	$0x4e, %xmm2, %xmm2
-        vpshufd	$57, %xmm3, %xmm3
-        decb	%al
-        jnz	L_chacha20_avx1_block_crypt_start
-        vpaddd	%xmm5, %xmm0, %xmm0
-        vpaddd	%xmm6, %xmm1, %xmm1
-        vpaddd	%xmm7, %xmm2, %xmm2
-        vpaddd	%xmm8, %xmm3, %xmm3
-        vmovdqu	(%rsi), %xmm5
-        vmovdqu	16(%rsi), %xmm6
-        vmovdqu	32(%rsi), %xmm7
-        vmovdqu	48(%rsi), %xmm8
-        vpxor	%xmm5, %xmm0, %xmm0
-        vpxor	%xmm6, %xmm1, %xmm1
-        vpxor	%xmm7, %xmm2, %xmm2
-        vpxor	%xmm8, %xmm3, %xmm3
-        vmovdqu	%xmm0, (%rdx)
-        vmovdqu	%xmm1, 16(%rdx)
-        vmovdqu	%xmm2, 32(%rdx)
-        vmovdqu	%xmm3, 48(%rdx)
-        addl	$0x01, 48(%rdi)
-        subl	$0x40, %ecx
-        addq	$0x40, %rsi
-        addq	$0x40, %rdx
-        cmpl	$0x40, %ecx
-        jge	L_chacha20_avx1_block_start
-L_chacha20_avx1_block_done:
-        cmpl	$0x00, %ecx
-        je	L_chacha20_avx1_partial_done
-        leaq	80(%rdi), %r10
-        vmovdqu	(%rdi), %xmm0
-        vmovdqu	16(%rdi), %xmm1
-        vmovdqu	32(%rdi), %xmm2
-        vmovdqu	48(%rdi), %xmm3
-        vmovdqa	%xmm0, %xmm5
-        vmovdqa	%xmm1, %xmm6
-        vmovdqa	%xmm2, %xmm7
-        vmovdqa	%xmm3, %xmm8
-        movb	$10, %al
-L_chacha20_avx1_partial_crypt_start:
-        vpaddd	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm3, %xmm3
-        vpshufb	L_chacha20_avx1_rotl16(%rip), %xmm3, %xmm3
-        vpaddd	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpsrld	$20, %xmm1, %xmm4
-        vpslld	$12, %xmm1, %xmm1
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpaddd	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm3, %xmm3
-        vpshufb	L_chacha20_avx1_rotl8(%rip), %xmm3, %xmm3
-        vpaddd	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpsrld	$25, %xmm1, %xmm4
-        vpslld	$7, %xmm1, %xmm1
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpshufd	$57, %xmm1, %xmm1
-        vpshufd	$0x4e, %xmm2, %xmm2
-        vpshufd	$0x93, %xmm3, %xmm3
-        vpaddd	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm3, %xmm3
-        vpshufb	L_chacha20_avx1_rotl16(%rip), %xmm3, %xmm3
-        vpaddd	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpsrld	$20, %xmm1, %xmm4
-        vpslld	$12, %xmm1, %xmm1
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpaddd	%xmm1, %xmm0, %xmm0
-        vpxor	%xmm0, %xmm3, %xmm3
-        vpshufb	L_chacha20_avx1_rotl8(%rip), %xmm3, %xmm3
-        vpaddd	%xmm3, %xmm2, %xmm2
-        vpxor	%xmm2, %xmm1, %xmm1
-        vpsrld	$25, %xmm1, %xmm4
-        vpslld	$7, %xmm1, %xmm1
-        vpxor	%xmm4, %xmm1, %xmm1
-        vpshufd	$0x93, %xmm1, %xmm1
-        vpshufd	$0x4e, %xmm2, %xmm2
-        vpshufd	$57, %xmm3, %xmm3
-        decb	%al
-        jnz	L_chacha20_avx1_partial_crypt_start
-        vpaddd	%xmm5, %xmm0, %xmm0
-        vpaddd	%xmm6, %xmm1, %xmm1
-        vpaddd	%xmm7, %xmm2, %xmm2
-        vpaddd	%xmm8, %xmm3, %xmm3
-        vmovdqu	%xmm0, (%r10)
-        vmovdqu	%xmm1, 16(%r10)
-        vmovdqu	%xmm2, 32(%r10)
-        vmovdqu	%xmm3, 48(%r10)
-        addl	$0x01, 48(%rdi)
-        movl	%ecx, %r8d
-        xorq	%r11, %r11
-        andl	$7, %r8d
-        jz	L_chacha20_avx1_partial_start64
-L_chacha20_avx1_partial_start8:
-        movzbl	(%r10,%r11,1), %eax
-        xorb	(%rsi,%r11,1), %al
-        movb	%al, (%rdx,%r11,1)
-        incl	%r11d
-        cmpl	%r8d, %r11d
-        jne	L_chacha20_avx1_partial_start8
-        je	L_chacha20_avx1_partial_end64
-L_chacha20_avx1_partial_start64:
-        movq	(%r10,%r11,1), %rax
-        xorq	(%rsi,%r11,1), %rax
-        movq	%rax, (%rdx,%r11,1)
-        addl	$8, %r11d
-L_chacha20_avx1_partial_end64:
-        cmpl	%ecx, %r11d
-        jne	L_chacha20_avx1_partial_start64
-        movl	$0x40, %r8d
-        subl	%r11d, %r8d
-        movl	%r8d, 76(%rdi)
-L_chacha20_avx1_partial_done:
-        addq	$0x190, %rsp
-        repz retq
-#ifndef __APPLE__
-.size	chacha_encrypt_avx1,.-chacha_encrypt_avx1
-#endif /* __APPLE__ */
-#endif /* HAVE_INTEL_AVX1 */
-#ifdef HAVE_INTEL_AVX2
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	32
-#else
-.p2align	5
-#endif /* __APPLE__ */
-L_chacha20_avx2_rotl8:
-.quad	0x605040702010003, 0xe0d0c0f0a09080b
-.quad	0x605040702010003, 0xe0d0c0f0a09080b
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	32
-#else
-.p2align	5
-#endif /* __APPLE__ */
-L_chacha20_avx2_rotl16:
-.quad	0x504070601000302, 0xd0c0f0e09080b0a
-.quad	0x504070601000302, 0xd0c0f0e09080b0a
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	32
-#else
-.p2align	5
-#endif /* __APPLE__ */
-L_chacha20_avx2_add:
-.quad	0x100000000, 0x300000002
-.quad	0x500000004, 0x700000006
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	32
-#else
-.p2align	5
-#endif /* __APPLE__ */
-L_chacha20_avx2_eight:
-.quad	0x800000008, 0x800000008
-.quad	0x800000008, 0x800000008
-#ifndef __APPLE__
-.text
-.globl	chacha_encrypt_avx2
-.type	chacha_encrypt_avx2,@function
-.align	16
-chacha_encrypt_avx2:
-#else
-.section	__TEXT,__text
-.globl	_chacha_encrypt_avx2
-.p2align	4
-_chacha_encrypt_avx2:
-#endif /* __APPLE__ */
-        subq	$0x310, %rsp
-        movq	%rsp, %r9
-        leaq	512(%rsp), %r10
-        andq	$-32, %r9
-        andq	$-32, %r10
-        movl	%ecx, %eax
-        shrl	$9, %eax
-        jz	L_chacha20_avx2_end256
-        vpbroadcastd	(%rdi), %ymm0
-        vpbroadcastd	4(%rdi), %ymm1
-        vpbroadcastd	8(%rdi), %ymm2
-        vpbroadcastd	12(%rdi), %ymm3
-        vpbroadcastd	16(%rdi), %ymm4
-        vpbroadcastd	20(%rdi), %ymm5
-        vpbroadcastd	24(%rdi), %ymm6
-        vpbroadcastd	28(%rdi), %ymm7
-        vpbroadcastd	32(%rdi), %ymm8
-        vpbroadcastd	36(%rdi), %ymm9
-        vpbroadcastd	40(%rdi), %ymm10
-        vpbroadcastd	44(%rdi), %ymm11
-        vpbroadcastd	48(%rdi), %ymm12
-        vpbroadcastd	52(%rdi), %ymm13
-        vpbroadcastd	56(%rdi), %ymm14
-        vpbroadcastd	60(%rdi), %ymm15
-        vpaddd	L_chacha20_avx2_add(%rip), %ymm12, %ymm12
-        vmovdqa	%ymm0, (%r9)
-        vmovdqa	%ymm1, 32(%r9)
-        vmovdqa	%ymm2, 64(%r9)
-        vmovdqa	%ymm3, 96(%r9)
-        vmovdqa	%ymm4, 128(%r9)
-        vmovdqa	%ymm5, 160(%r9)
-        vmovdqa	%ymm6, 192(%r9)
-        vmovdqa	%ymm7, 224(%r9)
-        vmovdqa	%ymm8, 256(%r9)
-        vmovdqa	%ymm9, 288(%r9)
-        vmovdqa	%ymm10, 320(%r9)
-        vmovdqa	%ymm11, 352(%r9)
-        vmovdqa	%ymm12, 384(%r9)
-        vmovdqa	%ymm13, 416(%r9)
-        vmovdqa	%ymm14, 448(%r9)
-        vmovdqa	%ymm15, 480(%r9)
-L_chacha20_avx2_start256:
-        movb	$10, %r8b
-        vmovdqa	%ymm11, 96(%r10)
-L_chacha20_avx2_loop256:
-        vpaddd	%ymm4, %ymm0, %ymm0
-        vpxor	%ymm0, %ymm12, %ymm12
-        vmovdqa	96(%r10), %ymm11
-        vpshufb	L_chacha20_avx2_rotl16(%rip), %ymm12, %ymm12
-        vpaddd	%ymm12, %ymm8, %ymm8
-        vpxor	%ymm8, %ymm4, %ymm4
-        vpaddd	%ymm5, %ymm1, %ymm1
-        vpxor	%ymm1, %ymm13, %ymm13
-        vpshufb	L_chacha20_avx2_rotl16(%rip), %ymm13, %ymm13
-        vpaddd	%ymm13, %ymm9, %ymm9
-        vpxor	%ymm9, %ymm5, %ymm5
-        vpaddd	%ymm6, %ymm2, %ymm2
-        vpxor	%ymm2, %ymm14, %ymm14
-        vpshufb	L_chacha20_avx2_rotl16(%rip), %ymm14, %ymm14
-        vpaddd	%ymm14, %ymm10, %ymm10
-        vpxor	%ymm10, %ymm6, %ymm6
-        vpaddd	%ymm7, %ymm3, %ymm3
-        vpxor	%ymm3, %ymm15, %ymm15
-        vpshufb	L_chacha20_avx2_rotl16(%rip), %ymm15, %ymm15
-        vpaddd	%ymm15, %ymm11, %ymm11
-        vpxor	%ymm11, %ymm7, %ymm7
-        vmovdqa	%ymm11, 96(%r10)
-        vpsrld	$20, %ymm4, %ymm11
-        vpslld	$12, %ymm4, %ymm4
-        vpxor	%ymm11, %ymm4, %ymm4
-        vpsrld	$20, %ymm5, %ymm11
-        vpslld	$12, %ymm5, %ymm5
-        vpxor	%ymm11, %ymm5, %ymm5
-        vpsrld	$20, %ymm6, %ymm11
-        vpslld	$12, %ymm6, %ymm6
-        vpxor	%ymm11, %ymm6, %ymm6
-        vpsrld	$20, %ymm7, %ymm11
-        vpslld	$12, %ymm7, %ymm7
-        vpxor	%ymm11, %ymm7, %ymm7
-        vpaddd	%ymm4, %ymm0, %ymm0
-        vpxor	%ymm0, %ymm12, %ymm12
-        vmovdqa	96(%r10), %ymm11
-        vpshufb	L_chacha20_avx2_rotl8(%rip), %ymm12, %ymm12
-        vpaddd	%ymm12, %ymm8, %ymm8
-        vpxor	%ymm8, %ymm4, %ymm4
-        vpaddd	%ymm5, %ymm1, %ymm1
-        vpxor	%ymm1, %ymm13, %ymm13
-        vpshufb	L_chacha20_avx2_rotl8(%rip), %ymm13, %ymm13
-        vpaddd	%ymm13, %ymm9, %ymm9
-        vpxor	%ymm9, %ymm5, %ymm5
-        vpaddd	%ymm6, %ymm2, %ymm2
-        vpxor	%ymm2, %ymm14, %ymm14
-        vpshufb	L_chacha20_avx2_rotl8(%rip), %ymm14, %ymm14
-        vpaddd	%ymm14, %ymm10, %ymm10
-        vpxor	%ymm10, %ymm6, %ymm6
-        vpaddd	%ymm7, %ymm3, %ymm3
-        vpxor	%ymm3, %ymm15, %ymm15
-        vpshufb	L_chacha20_avx2_rotl8(%rip), %ymm15, %ymm15
-        vpaddd	%ymm15, %ymm11, %ymm11
-        vpxor	%ymm11, %ymm7, %ymm7
-        vmovdqa	%ymm11, 96(%r10)
-        vpsrld	$25, %ymm4, %ymm11
-        vpslld	$7, %ymm4, %ymm4
-        vpxor	%ymm11, %ymm4, %ymm4
-        vpsrld	$25, %ymm5, %ymm11
-        vpslld	$7, %ymm5, %ymm5
-        vpxor	%ymm11, %ymm5, %ymm5
-        vpsrld	$25, %ymm6, %ymm11
-        vpslld	$7, %ymm6, %ymm6
-        vpxor	%ymm11, %ymm6, %ymm6
-        vpsrld	$25, %ymm7, %ymm11
-        vpslld	$7, %ymm7, %ymm7
-        vpxor	%ymm11, %ymm7, %ymm7
-        vpaddd	%ymm5, %ymm0, %ymm0
-        vpxor	%ymm0, %ymm15, %ymm15
-        vmovdqa	96(%r10), %ymm11
-        vpshufb	L_chacha20_avx2_rotl16(%rip), %ymm15, %ymm15
-        vpaddd	%ymm15, %ymm10, %ymm10
-        vpxor	%ymm10, %ymm5, %ymm5
-        vpaddd	%ymm6, %ymm1, %ymm1
-        vpxor	%ymm1, %ymm12, %ymm12
-        vpshufb	L_chacha20_avx2_rotl16(%rip), %ymm12, %ymm12
-        vpaddd	%ymm12, %ymm11, %ymm11
-        vpxor	%ymm11, %ymm6, %ymm6
-        vpaddd	%ymm7, %ymm2, %ymm2
-        vpxor	%ymm2, %ymm13, %ymm13
-        vpshufb	L_chacha20_avx2_rotl16(%rip), %ymm13, %ymm13
-        vpaddd	%ymm13, %ymm8, %ymm8
-        vpxor	%ymm8, %ymm7, %ymm7
-        vpaddd	%ymm4, %ymm3, %ymm3
-        vpxor	%ymm3, %ymm14, %ymm14
-        vpshufb	L_chacha20_avx2_rotl16(%rip), %ymm14, %ymm14
-        vpaddd	%ymm14, %ymm9, %ymm9
-        vpxor	%ymm9, %ymm4, %ymm4
-        vmovdqa	%ymm11, 96(%r10)
-        vpsrld	$20, %ymm5, %ymm11
-        vpslld	$12, %ymm5, %ymm5
-        vpxor	%ymm11, %ymm5, %ymm5
-        vpsrld	$20, %ymm6, %ymm11
-        vpslld	$12, %ymm6, %ymm6
-        vpxor	%ymm11, %ymm6, %ymm6
-        vpsrld	$20, %ymm7, %ymm11
-        vpslld	$12, %ymm7, %ymm7
-        vpxor	%ymm11, %ymm7, %ymm7
-        vpsrld	$20, %ymm4, %ymm11
-        vpslld	$12, %ymm4, %ymm4
-        vpxor	%ymm11, %ymm4, %ymm4
-        vpaddd	%ymm5, %ymm0, %ymm0
-        vpxor	%ymm0, %ymm15, %ymm15
-        vmovdqa	96(%r10), %ymm11
-        vpshufb	L_chacha20_avx2_rotl8(%rip), %ymm15, %ymm15
-        vpaddd	%ymm15, %ymm10, %ymm10
-        vpxor	%ymm10, %ymm5, %ymm5
-        vpaddd	%ymm6, %ymm1, %ymm1
-        vpxor	%ymm1, %ymm12, %ymm12
-        vpshufb	L_chacha20_avx2_rotl8(%rip), %ymm12, %ymm12
-        vpaddd	%ymm12, %ymm11, %ymm11
-        vpxor	%ymm11, %ymm6, %ymm6
-        vpaddd	%ymm7, %ymm2, %ymm2
-        vpxor	%ymm2, %ymm13, %ymm13
-        vpshufb	L_chacha20_avx2_rotl8(%rip), %ymm13, %ymm13
-        vpaddd	%ymm13, %ymm8, %ymm8
-        vpxor	%ymm8, %ymm7, %ymm7
-        vpaddd	%ymm4, %ymm3, %ymm3
-        vpxor	%ymm3, %ymm14, %ymm14
-        vpshufb	L_chacha20_avx2_rotl8(%rip), %ymm14, %ymm14
-        vpaddd	%ymm14, %ymm9, %ymm9
-        vpxor	%ymm9, %ymm4, %ymm4
-        vmovdqa	%ymm11, 96(%r10)
-        vpsrld	$25, %ymm5, %ymm11
-        vpslld	$7, %ymm5, %ymm5
-        vpxor	%ymm11, %ymm5, %ymm5
-        vpsrld	$25, %ymm6, %ymm11
-        vpslld	$7, %ymm6, %ymm6
-        vpxor	%ymm11, %ymm6, %ymm6
-        vpsrld	$25, %ymm7, %ymm11
-        vpslld	$7, %ymm7, %ymm7
-        vpxor	%ymm11, %ymm7, %ymm7
-        vpsrld	$25, %ymm4, %ymm11
-        vpslld	$7, %ymm4, %ymm4
-        vpxor	%ymm11, %ymm4, %ymm4
-        decb	%r8b
-        jnz	L_chacha20_avx2_loop256
-        vmovdqa	96(%r10), %ymm11
-        vpaddd	(%r9), %ymm0, %ymm0
-        vpaddd	32(%r9), %ymm1, %ymm1
-        vpaddd	64(%r9), %ymm2, %ymm2
-        vpaddd	96(%r9), %ymm3, %ymm3
-        vpaddd	128(%r9), %ymm4, %ymm4
-        vpaddd	160(%r9), %ymm5, %ymm5
-        vpaddd	192(%r9), %ymm6, %ymm6
-        vpaddd	224(%r9), %ymm7, %ymm7
-        vpaddd	256(%r9), %ymm8, %ymm8
-        vpaddd	288(%r9), %ymm9, %ymm9
-        vpaddd	320(%r9), %ymm10, %ymm10
-        vpaddd	352(%r9), %ymm11, %ymm11
-        vpaddd	384(%r9), %ymm12, %ymm12
-        vpaddd	416(%r9), %ymm13, %ymm13
-        vpaddd	448(%r9), %ymm14, %ymm14
-        vpaddd	480(%r9), %ymm15, %ymm15
-        vmovdqa	%ymm8, (%r10)
-        vmovdqa	%ymm9, 32(%r10)
-        vmovdqa	%ymm10, 64(%r10)
-        vmovdqa	%ymm11, 96(%r10)
-        vmovdqa	%ymm12, 128(%r10)
-        vmovdqa	%ymm13, 160(%r10)
-        vmovdqa	%ymm14, 192(%r10)
-        vmovdqa	%ymm15, 224(%r10)
-        vpunpckldq	%ymm1, %ymm0, %ymm8
-        vpunpckldq	%ymm3, %ymm2, %ymm9
-        vpunpckhdq	%ymm1, %ymm0, %ymm12
-        vpunpckhdq	%ymm3, %ymm2, %ymm13
-        vpunpckldq	%ymm5, %ymm4, %ymm10
-        vpunpckldq	%ymm7, %ymm6, %ymm11
-        vpunpckhdq	%ymm5, %ymm4, %ymm14
-        vpunpckhdq	%ymm7, %ymm6, %ymm15
-        vpunpcklqdq	%ymm9, %ymm8, %ymm0
-        vpunpcklqdq	%ymm11, %ymm10, %ymm1
-        vpunpckhqdq	%ymm9, %ymm8, %ymm2
-        vpunpckhqdq	%ymm11, %ymm10, %ymm3
-        vpunpcklqdq	%ymm13, %ymm12, %ymm4
-        vpunpcklqdq	%ymm15, %ymm14, %ymm5
-        vpunpckhqdq	%ymm13, %ymm12, %ymm6
-        vpunpckhqdq	%ymm15, %ymm14, %ymm7
-        vperm2i128	$32, %ymm1, %ymm0, %ymm8
-        vperm2i128	$32, %ymm3, %ymm2, %ymm9
-        vperm2i128	$49, %ymm1, %ymm0, %ymm12
-        vperm2i128	$49, %ymm3, %ymm2, %ymm13
-        vperm2i128	$32, %ymm5, %ymm4, %ymm10
-        vperm2i128	$32, %ymm7, %ymm6, %ymm11
-        vperm2i128	$49, %ymm5, %ymm4, %ymm14
-        vperm2i128	$49, %ymm7, %ymm6, %ymm15
-        vmovdqu	(%rsi), %ymm0
-        vmovdqu	64(%rsi), %ymm1
-        vmovdqu	128(%rsi), %ymm2
-        vmovdqu	192(%rsi), %ymm3
-        vmovdqu	256(%rsi), %ymm4
-        vmovdqu	320(%rsi), %ymm5
-        vmovdqu	384(%rsi), %ymm6
-        vmovdqu	448(%rsi), %ymm7
-        vpxor	%ymm0, %ymm8, %ymm8
-        vpxor	%ymm1, %ymm9, %ymm9
-        vpxor	%ymm2, %ymm10, %ymm10
-        vpxor	%ymm3, %ymm11, %ymm11
-        vpxor	%ymm4, %ymm12, %ymm12
-        vpxor	%ymm5, %ymm13, %ymm13
-        vpxor	%ymm6, %ymm14, %ymm14
-        vpxor	%ymm7, %ymm15, %ymm15
-        vmovdqu	%ymm8, (%rdx)
-        vmovdqu	%ymm9, 64(%rdx)
-        vmovdqu	%ymm10, 128(%rdx)
-        vmovdqu	%ymm11, 192(%rdx)
-        vmovdqu	%ymm12, 256(%rdx)
-        vmovdqu	%ymm13, 320(%rdx)
-        vmovdqu	%ymm14, 384(%rdx)
-        vmovdqu	%ymm15, 448(%rdx)
-        vmovdqa	(%r10), %ymm0
-        vmovdqa	32(%r10), %ymm1
-        vmovdqa	64(%r10), %ymm2
-        vmovdqa	96(%r10), %ymm3
-        vmovdqa	128(%r10), %ymm4
-        vmovdqa	160(%r10), %ymm5
-        vmovdqa	192(%r10), %ymm6
-        vmovdqa	224(%r10), %ymm7
-        vpunpckldq	%ymm1, %ymm0, %ymm8
-        vpunpckldq	%ymm3, %ymm2, %ymm9
-        vpunpckhdq	%ymm1, %ymm0, %ymm12
-        vpunpckhdq	%ymm3, %ymm2, %ymm13
-        vpunpckldq	%ymm5, %ymm4, %ymm10
-        vpunpckldq	%ymm7, %ymm6, %ymm11
-        vpunpckhdq	%ymm5, %ymm4, %ymm14
-        vpunpckhdq	%ymm7, %ymm6, %ymm15
-        vpunpcklqdq	%ymm9, %ymm8, %ymm0
-        vpunpcklqdq	%ymm11, %ymm10, %ymm1
-        vpunpckhqdq	%ymm9, %ymm8, %ymm2
-        vpunpckhqdq	%ymm11, %ymm10, %ymm3
-        vpunpcklqdq	%ymm13, %ymm12, %ymm4
-        vpunpcklqdq	%ymm15, %ymm14, %ymm5
-        vpunpckhqdq	%ymm13, %ymm12, %ymm6
-        vpunpckhqdq	%ymm15, %ymm14, %ymm7
-        vperm2i128	$32, %ymm1, %ymm0, %ymm8
-        vperm2i128	$32, %ymm3, %ymm2, %ymm9
-        vperm2i128	$49, %ymm1, %ymm0, %ymm12
-        vperm2i128	$49, %ymm3, %ymm2, %ymm13
-        vperm2i128	$32, %ymm5, %ymm4, %ymm10
-        vperm2i128	$32, %ymm7, %ymm6, %ymm11
-        vperm2i128	$49, %ymm5, %ymm4, %ymm14
-        vperm2i128	$49, %ymm7, %ymm6, %ymm15
-        vmovdqu	32(%rsi), %ymm0
-        vmovdqu	96(%rsi), %ymm1
-        vmovdqu	160(%rsi), %ymm2
-        vmovdqu	224(%rsi), %ymm3
-        vmovdqu	288(%rsi), %ymm4
-        vmovdqu	352(%rsi), %ymm5
-        vmovdqu	416(%rsi), %ymm6
-        vmovdqu	480(%rsi), %ymm7
-        vpxor	%ymm0, %ymm8, %ymm8
-        vpxor	%ymm1, %ymm9, %ymm9
-        vpxor	%ymm2, %ymm10, %ymm10
-        vpxor	%ymm3, %ymm11, %ymm11
-        vpxor	%ymm4, %ymm12, %ymm12
-        vpxor	%ymm5, %ymm13, %ymm13
-        vpxor	%ymm6, %ymm14, %ymm14
-        vpxor	%ymm7, %ymm15, %ymm15
-        vmovdqu	%ymm8, 32(%rdx)
-        vmovdqu	%ymm9, 96(%rdx)
-        vmovdqu	%ymm10, 160(%rdx)
-        vmovdqu	%ymm11, 224(%rdx)
-        vmovdqu	%ymm12, 288(%rdx)
-        vmovdqu	%ymm13, 352(%rdx)
-        vmovdqu	%ymm14, 416(%rdx)
-        vmovdqu	%ymm15, 480(%rdx)
-        vmovdqa	384(%r9), %ymm12
-        addq	$0x200, %rsi
-        addq	$0x200, %rdx
-        vpaddd	L_chacha20_avx2_eight(%rip), %ymm12, %ymm12
-        subl	$0x200, %ecx
-        vmovdqa	%ymm12, 384(%r9)
-        cmpl	$0x200, %ecx
-        jl	L_chacha20_avx2_done256
-        vmovdqa	(%r9), %ymm0
-        vmovdqa	32(%r9), %ymm1
-        vmovdqa	64(%r9), %ymm2
-        vmovdqa	96(%r9), %ymm3
-        vmovdqa	128(%r9), %ymm4
-        vmovdqa	160(%r9), %ymm5
-        vmovdqa	192(%r9), %ymm6
-        vmovdqa	224(%r9), %ymm7
-        vmovdqa	256(%r9), %ymm8
-        vmovdqa	288(%r9), %ymm9
-        vmovdqa	320(%r9), %ymm10
-        vmovdqa	352(%r9), %ymm11
-        vmovdqa	384(%r9), %ymm12
-        vmovdqa	416(%r9), %ymm13
-        vmovdqa	448(%r9), %ymm14
-        vmovdqa	480(%r9), %ymm15
-        jmp	L_chacha20_avx2_start256
-L_chacha20_avx2_done256:
-        shl	$3, %eax
-        addl	%eax, 48(%rdi)
-L_chacha20_avx2_end256:
-#ifndef __APPLE__
-        callq	chacha_encrypt_avx1@plt
-#else
-        callq	_chacha_encrypt_avx1
-#endif /* __APPLE__ */
-        addq	$0x310, %rsp
-        repz retq
-#ifndef __APPLE__
-.size	chacha_encrypt_avx2,.-chacha_encrypt_avx2
-#endif /* __APPLE__ */
-#endif /* HAVE_INTEL_AVX2 */
-#endif /* WOLFSSL_X86_64_BUILD */
-
-#if defined(__linux__) && defined(__ELF__)
-.section	.note.GNU-stack,"",%progbits
-#endif

+ 2 - 1
lib/wolfssl/wolfcrypt/src/cmac.c

@@ -115,11 +115,12 @@ int wc_InitCmac_ex(Cmac* cmac, const byte* key, word32 keySz,
     XMEMSET(cmac, 0, sizeof(Cmac));
 
 #ifdef WOLF_CRYPTO_CB
+    /* Set devId regardless of value (invalid or not) */
+    cmac->devId = devId;
     #ifndef WOLF_CRYPTO_CB_FIND
     if (devId != INVALID_DEVID)
     #endif
     {
-        cmac->devId = devId;
         cmac->devCtx = NULL;
 
         ret = wc_CryptoCb_Cmac(cmac, key, keySz, NULL, 0, NULL, NULL,
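
The hunk moves the devId assignment above the WOLF_CRYPTO_CB_FIND guard, so the id is recorded on the Cmac even when the caller passes INVALID_DEVID and a find callback can still route the operation to a device later. A minimal caller sketch, assuming only the public wolfCrypt CMAC API (the demo function name and zeroed key are placeholders):

    #include <wolfssl/wolfcrypt/cmac.h>

    /* devId is now stored on the Cmac unconditionally, so a
     * WOLF_CRYPTO_CB_FIND callback can resolve a device after init. */
    static int cmac_init_demo(void)
    {
        Cmac cmac;
        byte key[16] = {0}; /* placeholder key material */
        return wc_InitCmac_ex(&cmac, key, (word32)sizeof(key), WC_CMAC_AES,
                              NULL, NULL, INVALID_DEVID);
    }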

+ 1 - 0
lib/wolfssl/wolfcrypt/src/compress.c

@@ -245,6 +245,7 @@ int wc_DeCompressDynamic(byte** out, int maxSz, int memoryType,
     stream.opaque = (voidpf)0;
 
     if (inflateInit2(&stream, DEFLATE_DEFAULT_WINDOWBITS | windowBits) != Z_OK) {
+        XFREE(tmp, heap, memoryType);
         return DECOMPRESS_INIT_E;
     }
 

+ 116 - 10
lib/wolfssl/wolfcrypt/src/cryptocb.c

@@ -22,6 +22,20 @@
 /* This framework provides a central place for crypto hardware integration
    using the devId scheme. If not supported return `CRYPTOCB_UNAVAILABLE`. */
 
+/* Some common, optional build settings:
+ * these can also be set in wolfssl/options.h or user_settings.h
+ * -------------------------------------------------------------
+ * enable the find device callback functions
+ * WOLF_CRYPTO_CB_FIND
+ *
+ * enable the command callback functions to invoke the callback during
+ * register and unregister
+ * WOLF_CRYPTO_CB_CMD
+ *
+ * enable debug InfoString functions
+ * DEBUG_CRYPTO_CB
+ */
+
 #ifdef HAVE_CONFIG_H
     #include <config.h>
 #endif
@@ -34,6 +48,10 @@
 #include <wolfssl/wolfcrypt/error-crypt.h>
 #include <wolfssl/wolfcrypt/logging.h>
 
+#ifdef HAVE_ARIA
+    #include <wolfssl/wolfcrypt/port/aria/aria-cryptocb.h>
+#endif
+
 #ifdef WOLFSSL_CAAM
     #include <wolfssl/wolfcrypt/port/caam/wolfcaam.h>
 #endif
@@ -58,6 +76,9 @@ static CryptoDevCallbackFind CryptoCb_FindCb = NULL;
 static const char* GetAlgoTypeStr(int algo)
 {
     switch (algo) { /* enum wc_AlgoType */
+#ifdef WOLF_CRYPTO_CB_CMD
+        case WC_ALGO_TYPE_NONE:   return "None-Command";
+#endif
         case WC_ALGO_TYPE_HASH:   return "Hash";
         case WC_ALGO_TYPE_CIPHER: return "Cipher";
         case WC_ALGO_TYPE_PK:     return "PK";
@@ -133,6 +154,17 @@ static const char* GetRsaType(int type)
 }
 #endif
 
+#ifdef WOLF_CRYPTO_CB_CMD
+static const char* GetCryptoCbCmdTypeStr(int type)
+{
+    switch (type) {
+        case WC_CRYPTOCB_CMD_TYPE_REGISTER:   return "Register";
+        case WC_CRYPTOCB_CMD_TYPE_UNREGISTER: return "UnRegister";
+    }
+    return NULL;
+}
+#endif
+
 WOLFSSL_API void wc_CryptoCb_InfoString(wc_CryptoInfo* info)
 {
     if (info == NULL)
@@ -165,6 +197,12 @@ WOLFSSL_API void wc_CryptoCb_InfoString(wc_CryptoInfo* info)
         printf("Crypto CB: %s %s (%d)\n", GetAlgoTypeStr(info->algo_type),
             GetHashTypeStr(info->hmac.macType), info->hmac.macType);
     }
+#ifdef WOLF_CRYPTO_CB_CMD
+    else if (info->algo_type == WC_ALGO_TYPE_NONE) {
+        printf("Crypto CB: %s %s (%d)\n", GetAlgoTypeStr(info->algo_type),
+            GetCryptoCbCmdTypeStr(info->cmd.type), info->cmd.type);
+    }
+#endif
     else {
         printf("CryptoCb: %s \n", GetAlgoTypeStr(info->algo_type));
     }
@@ -219,11 +257,28 @@ static WC_INLINE int wc_CryptoCb_TranslateErrorCode(int ret)
     return ret;
 }
 
+/* Helper function to reset a device entry to invalid */
+static WC_INLINE void wc_CryptoCb_ClearDev(CryptoCb *dev)
+{
+    XMEMSET(dev, 0, sizeof(*dev));
+    dev->devId = INVALID_DEVID;
+}
+
 void wc_CryptoCb_Init(void)
 {
     int i;
-    for (i=0; i<MAX_CRYPTO_DEVID_CALLBACKS; i++) {
-        gCryptoDev[i].devId = INVALID_DEVID;
+    for (i = 0; i < MAX_CRYPTO_DEVID_CALLBACKS; i++) {
+        wc_CryptoCb_ClearDev(&gCryptoDev[i]);
+    }
+}
+
+void wc_CryptoCb_Cleanup(void)
+{
+    int i;
+    for (i = 0; i < MAX_CRYPTO_DEVID_CALLBACKS; i++) {
+        if (gCryptoDev[i].devId != INVALID_DEVID) {
+            wc_CryptoCb_UnRegisterDevice(gCryptoDev[i].devId);
+        }
     }
 }
 
@@ -251,6 +306,8 @@ void wc_CryptoCb_SetDeviceFindCb(CryptoDevCallbackFind cb)
 
 int wc_CryptoCb_RegisterDevice(int devId, CryptoDevCallbackFunc cb, void* ctx)
 {
+    int rc = 0;
+
     /* find existing or new */
     CryptoCb* dev = wc_CryptoCb_GetDevice(devId);
     if (dev == NULL)
@@ -260,19 +317,64 @@ int wc_CryptoCb_RegisterDevice(int devId, CryptoDevCallbackFunc cb, void* ctx)
         return BUFFER_E; /* out of devices */
 
     dev->devId = devId;
-    dev->cb = cb;
-    dev->ctx = ctx;
-
-    return 0;
+    dev->cb    = cb;
+    dev->ctx   = ctx;
+
+#ifdef WOLF_CRYPTO_CB_CMD
+    if (cb != NULL) {
+        /* Invoke callback with register command */
+        wc_CryptoInfo info;
+        XMEMSET(&info, 0, sizeof(info));
+        info.algo_type = WC_ALGO_TYPE_NONE;
+        info.cmd.type  = WC_CRYPTOCB_CMD_TYPE_REGISTER;
+        info.cmd.ctx   = ctx;  /* cb may update on success */
+
+        rc = cb(devId, &info, ctx);
+        if (rc == 0) {
+            /* Success.  Update dev->ctx */
+            dev->ctx = info.cmd.ctx;
+        }
+        else if ((rc == CRYPTOCB_UNAVAILABLE) ||
+                 (rc == NOT_COMPILED_IN)) {
+            /* Not implemented. Return success */
+            rc = 0;
+        }
+        else {
+            /* Error in callback register cmd. Don't register */
+            wc_CryptoCb_ClearDev(dev);
+        }
+    }
+#endif
+    return rc;
 }
 
 void wc_CryptoCb_UnRegisterDevice(int devId)
 {
-    CryptoCb* dev = wc_CryptoCb_GetDevice(devId);
-    if (dev) {
-        XMEMSET(dev, 0, sizeof(*dev));
-        dev->devId = INVALID_DEVID;
+    CryptoCb* dev = NULL;
+
+    /* Can't unregister the invalid device */
+    if (devId == INVALID_DEVID)
+        return;
+
+    /* Find the matching dev */
+    dev = wc_CryptoCb_GetDevice(devId);
+    if (dev == NULL)
+        return;
+
+#ifdef WOLF_CRYPTO_CB_CMD
+    if (dev->cb != NULL) {
+        /* Invoke callback with unregister command. */
+        wc_CryptoInfo info;
+        XMEMSET(&info, 0, sizeof(info));
+        info.algo_type = WC_ALGO_TYPE_NONE;
+        info.cmd.type  = WC_CRYPTOCB_CMD_TYPE_UNREGISTER;
+        info.cmd.ctx   = NULL;  /* Not used */
+
+        /* Ignore errors here */
+        dev->cb(devId, &info, dev->ctx);
     }
+#endif
+    wc_CryptoCb_ClearDev(dev);
 }
 
 #ifndef NO_RSA
@@ -1337,6 +1439,10 @@ int wc_CryptoCb_DefaultDevID(void)
     /* conditional macro selection based on build */
 #ifdef WOLFSSL_CAAM_DEVID
     ret = WOLFSSL_CAAM_DEVID;
+#elif defined(HAVE_ARIA)
+    ret = WOLFSSL_ARIA_DEVID;
+#elif defined(WC_USE_DEVID)
+    ret = WC_USE_DEVID;
 #else
     ret = INVALID_DEVID;
 #endif
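
With WOLF_CRYPTO_CB_CMD, a registered callback is now invoked once at register time and once at unregister time, carried as algo_type WC_ALGO_TYPE_NONE. A minimal callback sketch built only from the types used in this hunk (myCryptoCb is an illustrative name, not a wolfSSL API):

    #include <wolfssl/wolfcrypt/cryptocb.h>

    /* Handles the register/unregister commands; anything unhandled must
     * return CRYPTOCB_UNAVAILABLE so wolfCrypt falls back to software. */
    static int myCryptoCb(int devId, wc_CryptoInfo* info, void* ctx)
    {
        (void)devId; (void)ctx;
        if (info->algo_type == WC_ALGO_TYPE_NONE) {
            switch (info->cmd.type) {
            case WC_CRYPTOCB_CMD_TYPE_REGISTER:
                /* open the device; may replace info->cmd.ctx on success */
                return 0;
            case WC_CRYPTOCB_CMD_TYPE_UNREGISTER:
                /* close the device; the caller ignores errors here */
                return 0;
            }
        }
        return CRYPTOCB_UNAVAILABLE;
    }

Registration is unchanged, e.g. wc_CryptoCb_RegisterDevice(1, myCryptoCb, NULL); per the logic above, returning CRYPTOCB_UNAVAILABLE or NOT_COMPILED_IN from the register command still counts as success.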

+ 15 - 12
lib/wolfssl/wolfcrypt/src/curve25519.c

@@ -58,7 +58,13 @@ const curve25519_set_type curve25519_sets[] = {
     }
 };
 
-static const unsigned char kCurve25519BasePoint[CURVE25519_KEYSIZE] = {9};
+static const word32 kCurve25519BasePoint[CURVE25519_KEYSIZE/sizeof(word32)] = {
+#ifdef BIG_ENDIAN_ORDER
+    0x09000000
+#else
+    9
+#endif
+};
 
 /* Curve25519 private key must be less than order */
 /* These functions clamp private k and check it */
@@ -133,7 +139,7 @@ int wc_curve25519_make_pub(int public_size, byte* pub, int private_size,
 
     SAVE_VECTOR_REGISTERS(return _svr_ret;);
 
-    ret = curve25519(pub, priv, kCurve25519BasePoint);
+    ret = curve25519(pub, priv, (byte*)kCurve25519BasePoint);
 
     RESTORE_VECTOR_REGISTERS();
 #endif
@@ -325,14 +331,11 @@ int wc_curve25519_shared_secret_ex(curve25519_key* private_key,
         }
     }
 #endif
-    if (ret != 0) {
-        ForceZero(&o, sizeof(o));
-        return ret;
+    if (ret == 0) {
+        curve25519_copy_point(out, o.point, endian);
+        *outlen = CURVE25519_KEYSIZE;
     }
 
-    curve25519_copy_point(out, o.point, endian);
-    *outlen = CURVE25519_KEYSIZE;
-
     ForceZero(&o, sizeof(o));
 
     return ret;
@@ -372,7 +375,7 @@ int wc_curve25519_export_public_ex(curve25519_key* key, byte* out,
                                      (int)sizeof(key->k), key->k);
         key->pubSet = (ret == 0);
     }
-    /* export public point with endianess */
+    /* export public point with endianness */
     curve25519_copy_point(out, key->p.point, endian);
     *outLen = CURVE25519_KEYSIZE;
 
@@ -410,7 +413,7 @@ int wc_curve25519_import_public_ex(const byte* in, word32 inLen,
        return ECC_BAD_ARG_E;
     }
 
-    /* import public point with endianess */
+    /* import public point with endianness */
     curve25519_copy_point(key->p.point, in, endian);
     key->pubSet = 1;
 
@@ -535,7 +538,7 @@ int wc_curve25519_export_private_raw_ex(curve25519_key* key, byte* out,
         return ECC_BAD_ARG_E;
     }
 
-    /* export private scalar with endianess */
+    /* export private scalar with endianness */
     curve25519_copy_point(out, key->k, endian);
     *outLen = CURVE25519_KEYSIZE;
 
@@ -632,7 +635,7 @@ int wc_curve25519_import_private_ex(const byte* priv, word32 privSz,
     se050_curve25519_free_key(key);
 #endif
 
-    /* import private scalar with endianess */
+    /* import private scalar with endianness */
     curve25519_copy_point(key->k, priv, endian);
     key->privSet = 1;
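
The base point is now a word32 array so it is word-aligned for the assembly paths, and the big-endian initializer 0x09000000 keeps the serialized bytes identical to the old {9, 0, ...} byte array. A standalone check of that equivalence (plain C, not part of the patch):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        uint32_t probe = 1;
        /* Pick the word value the way the patch does for each host. */
        uint32_t first_word = (*(unsigned char*)&probe == 0)
                            ? 0x09000000u  /* big-endian host */
                            : 9u;          /* little-endian host */
        unsigned char bytes[4];
        memcpy(bytes, &first_word, sizeof(bytes));
        printf("first base-point byte: %u\n", bytes[0]); /* always 9 */
        return 0;
    }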
 

+ 3 - 83
lib/wolfssl/wolfcrypt/src/des3.c

@@ -49,85 +49,6 @@
     #include <wolfssl/wolfcrypt/cryptocb.h>
 #endif
 
-/* fips wrapper calls, user can call direct */
-#if defined(HAVE_FIPS) && \
-    (!defined(HAVE_FIPS_VERSION) || (HAVE_FIPS_VERSION < 2))
-
-    int wc_Des_SetKey(Des* des, const byte* key, const byte* iv, int dir)
-    {
-        return Des_SetKey(des, key, iv, dir);
-    }
-    int wc_Des3_SetKey(Des3* des, const byte* key, const byte* iv, int dir)
-    {
-        if (des == NULL || key == NULL || dir < 0) {
-            return BAD_FUNC_ARG;
-        }
-
-        return Des3_SetKey_fips(des, key, iv, dir);
-    }
-    int wc_Des_CbcEncrypt(Des* des, byte* out, const byte* in, word32 sz)
-    {
-        return Des_CbcEncrypt(des, out, in, sz);
-    }
-    int wc_Des_CbcDecrypt(Des* des, byte* out, const byte* in, word32 sz)
-    {
-        return Des_CbcDecrypt(des, out, in, sz);
-    }
-    int wc_Des3_CbcEncrypt(Des3* des, byte* out, const byte* in, word32 sz)
-    {
-        if (des == NULL || out == NULL || in == NULL) {
-            return BAD_FUNC_ARG;
-        }
-        return Des3_CbcEncrypt_fips(des, out, in, sz);
-    }
-    int wc_Des3_CbcDecrypt(Des3* des, byte* out, const byte* in, word32 sz)
-    {
-        if (des == NULL || out == NULL || in == NULL) {
-            return BAD_FUNC_ARG;
-        }
-        return Des3_CbcDecrypt_fips(des, out, in, sz);
-    }
-
-    #ifdef WOLFSSL_DES_ECB
-        /* One block, compatibility only */
-        int wc_Des_EcbEncrypt(Des* des, byte* out, const byte* in, word32 sz)
-        {
-            return Des_EcbEncrypt(des, out, in, sz);
-        }
-        int wc_Des3_EcbEncrypt(Des3* des, byte* out, const byte* in, word32 sz)
-        {
-            return Des3_EcbEncrypt(des, out, in, sz);
-        }
-    #endif /* WOLFSSL_DES_ECB */
-
-    void wc_Des_SetIV(Des* des, const byte* iv)
-    {
-        Des_SetIV(des, iv);
-    }
-    int wc_Des3_SetIV(Des3* des, const byte* iv)
-    {
-        return Des3_SetIV_fips(des, iv);
-    }
-
-    int wc_Des3Init(Des3* des3, void* heap, int devId)
-    {
-        (void)des3;
-        (void)heap;
-        (void)devId;
-        /* FIPS doesn't support:
-            return Des3Init(des3, heap, devId); */
-        return 0;
-    }
-    void wc_Des3Free(Des3* des3)
-    {
-        (void)des3;
-        /* FIPS doesn't support:
-            Des3Free(des3); */
-    }
-
-#else /* else build without fips, or for FIPS v2 */
-
-
 #if defined(WOLFSSL_TI_CRYPT)
     #include <wolfcrypt/src/port/ti/ti-des3.c>
 #else
@@ -517,7 +438,7 @@
 
 #elif defined(HAVE_COLDFIRE_SEC)
 
-    #include <wolfssl/ctaocrypt/types.h>
+    #include <wolfssl/wolfcrypt/types.h>
 
     #include "sec.h"
     #include "mcf5475_sec.h"
@@ -1695,7 +1616,7 @@
         #elif defined(HAVE_INTEL_QA)
             return IntelQaSymDes3CbcEncrypt(&des->asyncDev, out, in, sz,
                 (const byte*)des->devKey, DES3_KEYLEN, (byte*)des->reg, DES3_IVLEN);
-        #else /* WOLFSSL_ASYNC_CRYPT_SW */
+        #elif defined(WOLFSSL_ASYNC_CRYPT_SW)
             if (wc_AsyncSwInit(&des->asyncDev, ASYNC_SW_DES3_CBC_ENCRYPT)) {
                 WC_ASYNC_SW* sw = &des->asyncDev.sw;
                 sw->des.des = des;
@@ -1746,7 +1667,7 @@
         #elif defined(HAVE_INTEL_QA)
             return IntelQaSymDes3CbcDecrypt(&des->asyncDev, out, in, sz,
                 (const byte*)des->devKey, DES3_KEYLEN, (byte*)des->reg, DES3_IVLEN);
-        #else /* WOLFSSL_ASYNC_CRYPT_SW */
+        #elif defined(WOLFSSL_ASYNC_CRYPT_SW)
             if (wc_AsyncSwInit(&des->asyncDev, ASYNC_SW_DES3_CBC_DECRYPT)) {
                 WC_ASYNC_SW* sw = &des->asyncDev.sw;
                 sw->des.des = des;
@@ -1889,5 +1810,4 @@ void wc_Des3Free(Des3* des3)
 }
 
 #endif /* WOLFSSL_TI_CRYPT */
-#endif /* HAVE_FIPS */
 #endif /* NO_DES3 */
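
The change from `#else /* WOLFSSL_ASYNC_CRYPT_SW */` to `#elif defined(WOLFSSL_ASYNC_CRYPT_SW)` fixes a classic preprocessor trap: a comment on #else documents an assumption but does not enforce it. A reduced sketch of the pitfall:

    #if defined(HAVE_CAVIUM)
        /* hardware path */
    #elif defined(WOLFSSL_ASYNC_CRYPT_SW)  /* was: #else */
        /* software-async path, now compiled only when actually enabled;
         * the old #else also swallowed builds with neither option set */
    #endif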

+ 24 - 6
lib/wolfssl/wolfcrypt/src/dh.c

@@ -1021,7 +1021,7 @@ static int _ffc_pairwise_consistency_test(DhKey* key,
 /* if not using fixed points use DiscreteLogWorkFactor function for unusual size
    otherwise round up on size needed */
 #ifndef WOLFSSL_DH_CONST
-    #define WOLFSSL_DH_ROUND(x)
+    #define WOLFSSL_DH_ROUND(x) WC_DO_NOTHING
 #else
     #define WOLFSSL_DH_ROUND(x) \
         do {                    \
@@ -1352,7 +1352,7 @@ static int GeneratePublicDh(DhKey* key, byte* priv, word32 privSz,
         *pubSz = (word32)mp_unsigned_bin_size(y);
 
     mp_clear(y);
-    mp_clear(x);
+    mp_forcezero(x);
 #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_NO_MALLOC)
     XFREE(y, key->heap, DYNAMIC_TYPE_DH);
     XFREE(x, key->heap, DYNAMIC_TYPE_DH);
@@ -1433,7 +1433,7 @@ static int wc_DhGenerateKeyPair_Async(DhKey* key, WC_RNG* rng,
 #elif defined(HAVE_CAVIUM)
     /* TODO: Not implemented - use software for now */
 
-#else /* WOLFSSL_ASYNC_CRYPT_SW */
+#elif defined(WOLFSSL_ASYNC_CRYPT_SW)
     if (wc_AsyncSwInit(&key->asyncDev, ASYNC_SW_DH_GEN)) {
         WC_ASYNC_SW* sw = &key->asyncDev.sw;
         sw->dhGen.key = key;
@@ -2207,7 +2207,7 @@ static int wc_DhAgree_Async(DhKey* key, byte* agree, word32* agreeSz,
 #elif defined(HAVE_CAVIUM)
     /* TODO: Not implemented - use software for now */
 
-#else /* WOLFSSL_ASYNC_CRYPT_SW */
+#elif defined(WOLFSSL_ASYNC_CRYPT_SW)
     if (wc_AsyncSwInit(&key->asyncDev, ASYNC_SW_DH_AGREE)) {
         WC_ASYNC_SW* sw = &key->asyncDev.sw;
         sw->dhAgree.key = key;
@@ -2886,6 +2886,11 @@ int wc_DhGenerateParams(WC_RNG *rng, int modSz, DhKey *dh)
             ret = 0;
     unsigned char *buf = NULL;
 
+#if !defined(WOLFSSL_SMALL_STACK) || defined(WOLFSSL_NO_MALLOC)
+    XMEMSET(tmp, 0, sizeof(tmp));
+    XMEMSET(tmp2, 0, sizeof(tmp2));
+#endif
+
     if (rng == NULL || dh == NULL)
         ret = BAD_FUNC_ARG;
 
@@ -2934,9 +2939,22 @@ int wc_DhGenerateParams(WC_RNG *rng, int modSz, DhKey *dh)
 
 #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_NO_MALLOC)
     if (ret == 0) {
-        if (((tmp = (mp_int *)XMALLOC(sizeof(*tmp), NULL, DYNAMIC_TYPE_WOLF_BIGINT)) == NULL) ||
-            ((tmp2 = (mp_int *)XMALLOC(sizeof(*tmp2), NULL, DYNAMIC_TYPE_WOLF_BIGINT)) == NULL))
+        if ((tmp = (mp_int *)XMALLOC(sizeof(*tmp), NULL,
+                DYNAMIC_TYPE_WOLF_BIGINT)) == NULL) {
+            ret = MEMORY_E;
+        }
+        else {
+            XMEMSET(tmp, 0, sizeof(*tmp));
+        }
+    }
+    if (ret == 0) {
+        if ((tmp2 = (mp_int *)XMALLOC(sizeof(*tmp2), NULL,
+                DYNAMIC_TYPE_WOLF_BIGINT)) == NULL) {
             ret = MEMORY_E;
+        }
+        else {
+            XMEMSET(tmp2, 0, sizeof(*tmp2));
+        }
     }
 #endif
 

+ 4 - 4
lib/wolfssl/wolfcrypt/src/dsa.c

@@ -260,7 +260,7 @@ int wc_MakeDsaParameters(WC_RNG *rng, int modulus_size, DsaKey *dsa)
      */
     switch (modulus_size) {
 #ifdef WOLFSSL_DSA_768_MODULUS
-    /* This key length is unsecure and only included for bind 9 testing */
+    /* This key length is insecure and only included for bind 9 testing */
         case 768:
 #endif
         case 1024:
@@ -641,7 +641,7 @@ int wc_DsaExportKeyRaw(DsaKey* dsa, byte* x, word32* xSz, byte* y, word32* ySz)
 
 int wc_DsaSign(const byte* digest, byte* out, DsaKey* key, WC_RNG* rng)
 {
-    /* use sha1 by default for backwards compatability */
+    /* use sha1 by default for backwards compatibility */
     return wc_DsaSign_ex(digest, WC_SHA_DIGEST_SIZE, out, key, rng);
 }
 
@@ -756,7 +756,7 @@ int wc_DsaSign_ex(const byte* digest, word32 digestSz, byte* out, DsaKey* key,
                 break;
             }
 
-            /* k is a random numnber and it should be less than q-1
+            /* k is a random number and it should be less than q-1
              * if k greater than repeat
              */
             /* Step 6 */
@@ -976,7 +976,7 @@ int wc_DsaSign_ex(const byte* digest, word32 digestSz, byte* out, DsaKey* key,
 
 int wc_DsaVerify(const byte* digest, const byte* sig, DsaKey* key, int* answer)
 {
-    /* use sha1 by default for backwards compatability */
+    /* use sha1 by default for backwards compatibility */
     return wc_DsaVerify_ex(digest, WC_SHA_DIGEST_SIZE, sig, key, answer);
 }
 

The diff for this file is not shown because it is too large
+ 412 - 126
lib/wolfssl/wolfcrypt/src/ecc.c


+ 8 - 9
lib/wolfssl/wolfcrypt/src/eccsi.c

@@ -1350,15 +1350,13 @@ static int eccsi_mulmod_base_add(EccsiKey* key, const mp_int* n,
 {
     int err = 0;
 
-#ifdef WOLFSSL_HAVE_SP_ECC
-#ifndef WOLFSSL_SP_NO_256
+#if defined(WOLFSSL_HAVE_SP_ECC) && !defined(WOLFSSL_SP_NO_256)
     if ((key->ecc.idx != ECC_CUSTOM_IDX) &&
             (ecc_sets[key->ecc.idx].id == ECC_SECP256R1)) {
         err = sp_ecc_mulmod_base_add_256(n, a, 1, res, map, key->heap);
     }
     else
 #endif
-#endif
 #ifndef WOLFSSL_SP_MATH
     {
         EccsiKeyParams* params = &key->params;
@@ -1377,7 +1375,12 @@ static int eccsi_mulmod_base_add(EccsiKey* key, const mp_int* n,
     {
         err = NOT_COMPILED_IN;
     }
+    (void)key;
+    (void)n;
+    (void)a;
+    (void)res;
     (void)mp;
+    (void)map;
 #endif
 
     return err;
@@ -1401,14 +1404,12 @@ static int eccsi_mulmod_point(EccsiKey* key, const mp_int* n, ecc_point* point,
 {
     int err;
 
-#ifdef WOLFSSL_HAVE_SP_ECC
-#ifndef WOLFSSL_SP_NO_256
+#if defined(WOLFSSL_HAVE_SP_ECC) && !defined(WOLFSSL_SP_NO_256)
     if ((key->ecc.idx != ECC_CUSTOM_IDX) &&
             (ecc_sets[key->ecc.idx].id == ECC_SECP256R1)) {
         err = sp_ecc_mulmod_256(n, point, res, map, key->heap);
     }
     else
-#endif
 #endif
     {
         EccsiKeyParams* params = &key->params;
@@ -1437,8 +1438,7 @@ static int eccsi_mulmod_point(EccsiKey* key, const mp_int* n, ecc_point* point,
 static int eccsi_mulmod_point_add(EccsiKey* key, const mp_int* n,
         ecc_point* point, ecc_point* a, ecc_point* res, mp_digit mp, int map)
 {
-#ifdef WOLFSSL_HAVE_SP_ECC
-#ifndef WOLFSSL_SP_NO_256
+#if defined(WOLFSSL_HAVE_SP_ECC) && !defined(WOLFSSL_SP_NO_256)
     int err = NOT_COMPILED_IN;
 
     if ((key->ecc.idx != ECC_CUSTOM_IDX) &&
@@ -1449,7 +1449,6 @@ static int eccsi_mulmod_point_add(EccsiKey* key, const mp_int* n,
     (void)mp;
 
     return err;
-#endif
 #else
     int err;
     EccsiKeyParams* params = &key->params;

+ 14 - 8
lib/wolfssl/wolfcrypt/src/ed25519.c

@@ -182,11 +182,12 @@ static int ed25519_hash(ed25519_key* key, const byte* in, word32 inLen,
     return ret;
 }
 
+#ifdef HAVE_ED25519_MAKE_KEY
 int wc_ed25519_make_public(ed25519_key* key, unsigned char* pubKey,
                            word32 pubKeySz)
 {
     int   ret = 0;
-    byte  az[ED25519_PRV_KEY_SIZE];
+    ALIGN16 byte az[ED25519_PRV_KEY_SIZE];
 #if !defined(FREESCALE_LTC_ECC)
     ge_p3 A;
 #endif
@@ -267,6 +268,7 @@ int wc_ed25519_make_key(WC_RNG* rng, int keySz, ed25519_key* key)
 
     return ret;
 }
+#endif /* HAVE_ED25519_MAKE_KEY */
 
 
 #ifdef HAVE_ED25519_SIGN
@@ -294,14 +296,14 @@ int wc_ed25519_sign_msg_ex(const byte* in, word32 inLen, byte* out,
     ret = se050_ed25519_sign_msg(in, inLen, out, outLen, key);
 #else
 #ifdef FREESCALE_LTC_ECC
-    byte   tempBuf[ED25519_PRV_KEY_SIZE];
+    ALIGN16 byte tempBuf[ED25519_PRV_KEY_SIZE];
     ltc_pkha_ecc_point_t ltcPoint = {0};
 #else
     ge_p3  R;
 #endif
-    byte   nonce[WC_SHA512_DIGEST_SIZE];
-    byte   hram[WC_SHA512_DIGEST_SIZE];
-    byte   az[ED25519_PRV_KEY_SIZE];
+    ALIGN16 byte nonce[WC_SHA512_DIGEST_SIZE];
+    ALIGN16 byte hram[WC_SHA512_DIGEST_SIZE];
+    ALIGN16 byte az[ED25519_PRV_KEY_SIZE];
 
     /* sanity check on arguments */
     if (in == NULL || out == NULL || outLen == NULL || key == NULL ||
@@ -615,8 +617,8 @@ static int ed25519_verify_msg_final_with_sha(const byte* sig, word32 sigLen,
                                              int* res, ed25519_key* key,
                                              wc_Sha512 *sha)
 {
-    byte   rcheck[ED25519_KEY_SIZE];
-    byte   h[WC_SHA512_DIGEST_SIZE];
+    ALIGN16 byte rcheck[ED25519_KEY_SIZE];
+    ALIGN16 byte h[WC_SHA512_DIGEST_SIZE];
 #ifndef FREESCALE_LTC_ECC
     ge_p3  A;
     ge_p2  R;
@@ -1236,7 +1238,8 @@ int wc_ed25519_export_key(ed25519_key* key,
 int wc_ed25519_check_key(ed25519_key* key)
 {
     int ret = 0;
-    unsigned char pubKey[ED25519_PUB_KEY_SIZE];
+#ifdef HAVE_ED25519_MAKE_KEY
+    ALIGN16 unsigned char pubKey[ED25519_PUB_KEY_SIZE];
 
     if (!key->pubKeySet)
         ret = PUBLIC_KEY_E;
@@ -1244,6 +1247,9 @@ int wc_ed25519_check_key(ed25519_key* key)
         ret = wc_ed25519_make_public(key, pubKey, sizeof(pubKey));
     if (ret == 0 && XMEMCMP(pubKey, key->p, ED25519_PUB_KEY_SIZE) != 0)
         ret = PUBLIC_KEY_E;
+#else
+     (void)key;
+#endif /* HAVE_ED25519_MAKE_KEY */
 
     return ret;
 }

+ 1 - 1
lib/wolfssl/wolfcrypt/src/ed448.c

@@ -168,7 +168,7 @@ static int ed448_hash(ed448_key* key, const byte* in, word32 inLen,
 /* Derive the public key for the private key.
  *
  * key       [in]  Ed448 key object.
- * pubKey    [in]  Byte array to hold te public key.
+ * pubKey    [in]  Byte array to hold the public key.
  * pubKeySz  [in]  Size of the array in bytes.
  * returns BAD_FUNC_ARG when key is NULL or pubKeySz is not equal to
  *         ED448_PUB_KEY_SIZE,

+ 13 - 1
lib/wolfssl/wolfcrypt/src/error.c

@@ -63,7 +63,7 @@ const char* wc_GetErrorString(int error)
     case WC_PENDING_E:
         return "wolfCrypt Operation Pending (would block / eagain) error";
 
-    case WC_NOT_PENDING_E:
+    case WC_NO_PENDING_E:
         return "wolfCrypt operation not pending error";
 
     case MP_INIT_E :
@@ -589,6 +589,18 @@ const char* wc_GetErrorString(int error)
     case ASN_LEN_E:
         return "ASN.1 length invalid";
 
+    case SM4_GCM_AUTH_E:
+        return "SM4-GCM Authentication check fail";
+
+    case SM4_CCM_AUTH_E:
+        return "SM4-CCM Authentication check fail";
+
+    case FIPS_DEGRADED_E:
+        return "FIPS module in DEGRADED mode";
+
+    case AES_EAX_AUTH_E:
+        return "AES-EAX Authentication check fail";
+
     default:
         return "unknown error number";
 

The diff for this file is not shown because it is too large
+ 595 - 187
lib/wolfssl/wolfcrypt/src/evp.c


+ 1 - 1
lib/wolfssl/wolfcrypt/src/ext_kyber.c

@@ -548,7 +548,7 @@ int wc_KyberKey_Decapsulate(KyberKey* key, unsigned char* ss,
  * @param  [in]       in   Buffer holding encoded key.
  * @param  [in]       len  Length of data in buffer.
  * @return  0 on success.
- * @return  BAD_FUNC_ARG when key ot in is NULL.
+ * @return  BAD_FUNC_ARG when key or in is NULL.
  * @return  NOT_COMPILED_IN when key type is not supported.
  * @return  BUFFER_E when len is not the correct size.
  */

+ 972 - 0
lib/wolfssl/wolfcrypt/src/ext_lms.c

@@ -0,0 +1,972 @@
+/* ext_lms.c
+ *
+ * Copyright (C) 2006-2023 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#ifdef HAVE_CONFIG_H
+    #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#include <wolfssl/wolfcrypt/logging.h>
+
+#ifdef WOLFSSL_HAVE_LMS
+#include <wolfssl/wolfcrypt/ext_lms.h>
+
+#ifdef NO_INLINE
+    #include <wolfssl/wolfcrypt/misc.h>
+#else
+    #define WOLFSSL_MISC_INCLUDED
+    #include <wolfcrypt/src/misc.c>
+#endif
+
+#ifndef WOLFSSL_LMS_VERIFY_ONLY
+/* If built against hss_lib_thread.a, the hash-sigs lib will spawn
+ * worker threads to parallelize CPU-intensive tasks. This mainly
+ * speeds up key generation and signing and, to a lesser extent,
+ * verification when the levels value is larger.
+ *
+ * Their default max is 16 worker threads, but can be capped with
+ * hss_extra_info_set_threads(). To be safe we are capping at 4 here.
+ * */
+#define EXT_LMS_MAX_THREADS (4)
+
+/* The hash-sigs hss_generate_private_key API requires a generate_random
+ * callback that only has output and length args. The RNG struct must be global
+ * to the function. Maybe there should be a wc_LmsKey_SetRngCb. */
+static THREAD_LS_T WC_RNG * LmsRng = NULL;
+
+static bool LmsGenerateRand(void * output, size_t length)
+{
+    int ret = 0;
+
+    if (output == NULL || LmsRng == NULL) {
+        return false;
+    }
+
+    if (length == 0) {
+        return true;
+    }
+
+    ret = wc_RNG_GenerateBlock(LmsRng, output, (word32) length);
+
+    if (ret) {
+        WOLFSSL_MSG("error: LmsGenerateRand failed");
+        return false;
+    }
+
+    return true;
+}
+
+/* Write callback passed into hash-sigs hss lib.
+ *
+ * Returns true on success. */
+static bool LmsWritePrivKey(unsigned char *private_key,
+                            size_t len_private_key, void *lmsKey)
+{
+    LmsKey *      key = (LmsKey *) lmsKey;
+    enum wc_LmsRc ret = WC_LMS_RC_NONE;
+
+    if (private_key == NULL || key == NULL || len_private_key <= 0) {
+        WOLFSSL_MSG("error: LmsWritePrivKey: invalid args");
+        return false;
+    }
+
+    if (key->state != WC_LMS_STATE_PARMSET && key->state != WC_LMS_STATE_OK) {
+       /* The LmsKey is not ready for writing. */
+        WOLFSSL_MSG("error: LmsWritePrivKey: LMS key not in writeable state");
+        return false;
+    }
+
+    if (key->write_private_key == NULL) {
+        WOLFSSL_MSG("error: LmsWritePrivKey: LMS key write callback not set");
+        key->state = WC_LMS_STATE_BAD;
+        return false;
+    }
+
+    /* Use write callback that saves private key to non-volatile storage. */
+    ret = key->write_private_key(private_key, (word32)len_private_key,
+                                 key->context);
+
+    if (ret != WC_LMS_RC_SAVED_TO_NV_MEMORY) {
+        WOLFSSL_MSG("error: LmsKey write_private_key failed");
+        WOLFSSL_MSG(wc_LmsKey_RcToStr(ret));
+        key->state = WC_LMS_STATE_BAD;
+        return false;
+    }
+
+    return true;
+}
+
+/* Read callback passed into hash-sigs hss lib.
+ *
+ * Returns true on success. */
+static bool LmsReadPrivKey(unsigned char *private_key,
+                           size_t len_private_key, void *lmsKey)
+{
+    LmsKey *      key = (LmsKey *) lmsKey;
+    enum wc_LmsRc ret = WC_LMS_RC_NONE;
+
+    if (private_key == NULL || key == NULL || len_private_key == 0) {
+        WOLFSSL_MSG("error: LmsReadPrivKey: invalid args");
+        return false;
+    }
+
+    if (key->state != WC_LMS_STATE_PARMSET && key->state != WC_LMS_STATE_OK) {
+       /* The LmsKey is not ready for reading. */
+        WOLFSSL_MSG("error: LmsReadPrivKey: LMS key not in readable state");
+        return false;
+    }
+
+    if (key->read_private_key == NULL) {
+        WOLFSSL_MSG("error: LmsReadPrivKey: LMS key read callback not set");
+        key->state = WC_LMS_STATE_BAD;
+        return false;
+    }
+
+    /* Use read callback that reads private key from non-volatile storage. */
+    ret = key->read_private_key(private_key, (word32)len_private_key,
+                                key->context);
+
+    if (ret != WC_LMS_RC_READ_TO_MEMORY) {
+        WOLFSSL_MSG("error: LmsKey read_private_key failed");
+        WOLFSSL_MSG(wc_LmsKey_RcToStr(ret));
+        key->state = WC_LMS_STATE_BAD;
+        return false;
+    }
+
+    return true;
+}
+#endif /* ifndef WOLFSSL_LMS_VERIFY_ONLY */
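
As a reference for implementers, here is a minimal sketch of file-backed write/read callbacks of the kind the wrappers above invoke. It assumes the callback typedefs take a key buffer, a word32 length, and the opaque context pointer, and return the wc_LmsRc codes checked above; the stdio storage and file-name handling are purely illustrative. Real deployments need power-fail-safe storage, since re-signing from a stale LMS private key state is catastrophic for security.

    #include <stdio.h>

    /* Illustrative file-backed callbacks; context carries the file name
     * (set via wc_LmsKey_SetContext). Not power-fail safe. */
    static enum wc_LmsRc my_write_cb(byte * priv, word32 privSz, void * context)
    {
        FILE * f = fopen((const char *)context, "wb");
        if (f == NULL) {
            return WC_LMS_RC_WRITE_FAIL;
        }
        if (fwrite(priv, 1, privSz, f) != privSz) {
            fclose(f);
            return WC_LMS_RC_WRITE_FAIL;
        }
        fclose(f);
        return WC_LMS_RC_SAVED_TO_NV_MEMORY; /* code LmsWritePrivKey expects */
    }

    static enum wc_LmsRc my_read_cb(byte * priv, word32 privSz, void * context)
    {
        FILE * f = fopen((const char *)context, "rb");
        if (f == NULL) {
            return WC_LMS_RC_READ_FAIL;
        }
        if (fread(priv, 1, privSz, f) != privSz) {
            fclose(f);
            return WC_LMS_RC_READ_FAIL;
        }
        fclose(f);
        return WC_LMS_RC_READ_TO_MEMORY; /* code LmsReadPrivKey expects */
    }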
+
+const char * wc_LmsKey_ParmToStr(enum wc_LmsParm lmsParm)
+{
+    switch (lmsParm) {
+    case WC_LMS_PARM_NONE:
+        return "LMS_NONE";
+
+    case WC_LMS_PARM_L1_H15_W2:
+        return "LMS/HSS L1_H15_W2";
+
+    case WC_LMS_PARM_L1_H15_W4:
+        return "LMS/HSS L1_H15_W4";
+
+    case WC_LMS_PARM_L2_H10_W2:
+        return "LMS/HSS L2_H10_W2";
+
+    case WC_LMS_PARM_L2_H10_W4:
+        return "LMS/HSS L2_H10_W4";
+
+    case WC_LMS_PARM_L2_H10_W8:
+        return "LMS/HSS L2_H10_W8";
+
+    case WC_LMS_PARM_L3_H5_W2:
+        return "LMS/HSS L3_H5_W2";
+
+    case WC_LMS_PARM_L3_H5_W4:
+        return "LMS/HSS L3_H5_W4";
+
+    case WC_LMS_PARM_L3_H5_W8:
+        return "LMS/HSS L3_H5_W8";
+
+    case WC_LMS_PARM_L3_H10_W4:
+        return "LMS/HSS L3_H10_W4";
+
+    case WC_LMS_PARM_L4_H5_W8:
+        return "LMS/HSS L4_H5_W8";
+
+    default:
+        WOLFSSL_MSG("error: invalid LMS parameter");
+        break;
+    }
+
+    return "LMS_INVALID";
+}
+
+const char * wc_LmsKey_RcToStr(enum wc_LmsRc lmsEc)
+{
+    switch (lmsEc) {
+    case WC_LMS_RC_NONE:
+        return "LMS_RC_NONE";
+
+    case WC_LMS_RC_BAD_ARG:
+        return "LMS_RC_BAD_ARG";
+
+    case WC_LMS_RC_WRITE_FAIL:
+        return "LMS_RC_WRITE_FAIL";
+
+    case WC_LMS_RC_READ_FAIL:
+        return "LMS_RC_READ_FAIL";
+
+    case WC_LMS_RC_SAVED_TO_NV_MEMORY:
+        return "LMS_RC_SAVED_TO_NV_MEMORY";
+
+    case WC_LMS_RC_READ_TO_MEMORY:
+        return "LMS_RC_READ_TO_MEMORY";
+
+    default:
+        WOLFSSL_MSG("error: invalid LMS error code");
+        break;
+    }
+
+    return "LMS_RC_INVALID";
+}
+
+/* Init an LMS key.
+ *
+ * Call this before setting the parms of an LMS key.
+ *
+ * Returns 0 on success.
+ * */
+int wc_LmsKey_Init(LmsKey * key, void * heap, int devId)
+{
+    if (key == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    (void) heap;
+    (void) devId;
+
+    ForceZero(key, sizeof(LmsKey));
+
+#ifndef WOLFSSL_LMS_VERIFY_ONLY
+    hss_init_extra_info(&key->info);
+    /* Set the max number of worker threads that hash-sigs can spawn. */
+    hss_extra_info_set_threads(&key->info, EXT_LMS_MAX_THREADS);
+
+    key->working_key = NULL;
+    key->write_private_key = NULL;
+    key->read_private_key = NULL;
+    key->context = NULL;
+#endif /* ifndef WOLFSSL_LMS_VERIFY_ONLY */
+    key->state = WC_LMS_STATE_INITED;
+
+    return 0;
+}
+
+/* Set the wc_LmsParm of an LMS key.
+ *
+ * Use this if you wish to set a key with a predefined parameter set,
+ * such as WC_LMS_PARM_L2_H10_W8.
+ *
+ * Key must be inited before calling this.
+ *
+ * Returns 0 on success.
+ * */
+int wc_LmsKey_SetLmsParm(LmsKey * key, enum wc_LmsParm lmsParm)
+{
+    if (key == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    /* If NONE is passed, default to the lowest predefined set. */
+    switch (lmsParm) {
+    case WC_LMS_PARM_NONE:
+    case WC_LMS_PARM_L1_H15_W2:
+        return wc_LmsKey_SetParameters(key, 1, 15, 2);
+
+    case WC_LMS_PARM_L1_H15_W4:
+        return wc_LmsKey_SetParameters(key, 1, 15, 4);
+
+    case WC_LMS_PARM_L2_H10_W2:
+        return wc_LmsKey_SetParameters(key, 2, 10, 2);
+
+    case WC_LMS_PARM_L2_H10_W4:
+        return wc_LmsKey_SetParameters(key, 2, 10, 4);
+
+    case WC_LMS_PARM_L2_H10_W8:
+        return wc_LmsKey_SetParameters(key, 2, 10, 8);
+
+    case WC_LMS_PARM_L3_H5_W2:
+        return wc_LmsKey_SetParameters(key, 3, 5, 2);
+
+    case WC_LMS_PARM_L3_H5_W4:
+        return wc_LmsKey_SetParameters(key, 3, 5, 4);
+
+    case WC_LMS_PARM_L3_H5_W8:
+        return wc_LmsKey_SetParameters(key, 3, 5, 8);
+
+    case WC_LMS_PARM_L3_H10_W4:
+        return wc_LmsKey_SetParameters(key, 3, 10, 4);
+
+    case WC_LMS_PARM_L4_H5_W8:
+        return wc_LmsKey_SetParameters(key, 4, 5, 8);
+
+    default:
+        WOLFSSL_MSG("error: invalid LMS parameter set");
+        break;
+    }
+
+    return BAD_FUNC_ARG;
+}
+
+/* Set the parameters of an LMS key.
+ *
+ * Use this if you wish to set specific parameters not found in the
+ * wc_LmsParm predefined sets. See comments in lms.h for allowed
+ * parameters.
+ *
+ * Key must be inited before calling this.
+ *
+ * Returns 0 on success.
+ * */
+int wc_LmsKey_SetParameters(LmsKey * key, int levels, int height,
+    int winternitz)
+{
+    int         i = 0;
+    param_set_t lm = LMS_SHA256_N32_H5;
+    param_set_t ots = LMOTS_SHA256_N32_W1;
+
+    if (key == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    if (key->state != WC_LMS_STATE_INITED) {
+        WOLFSSL_MSG("error: LmsKey needs init");
+        return -1;
+    }
+
+    /* Verify inputs make sense.
+     *
+     * Note: there does not seem to be a define for min or
+     * max Winternitz integer in hash-sigs lib or RFC8554. */
+
+    if (levels < MIN_HSS_LEVELS || levels > MAX_HSS_LEVELS) {
+        WOLFSSL_MSG("error: invalid level parameter");
+        return BAD_FUNC_ARG;
+    }
+
+    if (height < MIN_MERKLE_HEIGHT || height > MAX_MERKLE_HEIGHT) {
+        WOLFSSL_MSG("error: invalid height parameter");
+        return BAD_FUNC_ARG;
+    }
+
+    switch (height) {
+    case 5:
+        lm = LMS_SHA256_N32_H5;
+        break;
+    case 10:
+        lm = LMS_SHA256_N32_H10;
+        break;
+    case 15:
+        lm = LMS_SHA256_N32_H15;
+        break;
+    case 20:
+        lm = LMS_SHA256_N32_H20;
+        break;
+    case 25:
+        lm = LMS_SHA256_N32_H25;
+        break;
+    default:
+        WOLFSSL_MSG("error: invalid height parameter");
+        return BAD_FUNC_ARG;
+    }
+
+    switch (winternitz) {
+    case 1:
+        ots = LMOTS_SHA256_N32_W1;
+        break;
+    case 2:
+        ots = LMOTS_SHA256_N32_W2;
+        break;
+    case 4:
+        ots = LMOTS_SHA256_N32_W4;
+        break;
+    case 8:
+        ots = LMOTS_SHA256_N32_W8;
+        break;
+    default:
+        WOLFSSL_MSG("error: invalid winternitz parameter");
+        return BAD_FUNC_ARG;
+    }
+
+    key->levels = levels;
+
+    for (i = 0; i < levels; ++i) {
+        key->lm_type[i] = lm;
+        key->lm_ots_type[i] = ots;
+    }
+
+    /* Move the state to parms set.
+     * Key is ready for MakeKey or Reload. */
+    key->state = WC_LMS_STATE_PARMSET;
+
+    return 0;
+}
+
+/* Get the parameters of an LMS key.
+ *
+ * Key must be inited and parameters set before calling this.
+ *
+ * Returns 0 on success.
+ * */
+int wc_LmsKey_GetParameters(const LmsKey * key, int * levels, int * height,
+    int * winternitz)
+{
+    if (key == NULL || levels == NULL || height == NULL || winternitz == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    /* This shouldn't happen, but check that the LmsKey parameters are valid. */
+
+    if (key->levels < MIN_HSS_LEVELS || key->levels > MAX_HSS_LEVELS) {
+        WOLFSSL_MSG("error: LmsKey invalid level parameter");
+        return -1;
+    }
+
+    *levels = key->levels;
+
+    switch (key->lm_type[0]) {
+    case LMS_SHA256_N32_H5:
+        *height = 5;
+        break;
+    case LMS_SHA256_N32_H10:
+        *height = 10;
+        break;
+    case LMS_SHA256_N32_H15:
+        *height = 15;
+        break;
+    case LMS_SHA256_N32_H20:
+        *height = 20;
+        break;
+    case LMS_SHA256_N32_H25:
+        *height = 25;
+        break;
+    default:
+        WOLFSSL_MSG("error: LmsKey invalid height parameter");
+        return -1;
+    }
+
+    switch (key->lm_ots_type[0]) {
+    case LMOTS_SHA256_N32_W1:
+        *winternitz = 1;
+        break;
+    case LMOTS_SHA256_N32_W2:
+        *winternitz = 2;
+        break;
+    case LMOTS_SHA256_N32_W4:
+        *winternitz = 4;
+        break;
+    case LMOTS_SHA256_N32_W8:
+        *winternitz = 8;
+        break;
+    default:
+        WOLFSSL_MSG("error: LmsKey invalid winternitz parameter");
+        return -1;
+    }
+
+    return 0;
+}
+
+/* Frees the LMS key from memory.
+ *
+ * This does not affect the private key saved to non-volatile storage.
+ * */
+void wc_LmsKey_Free(LmsKey* key)
+{
+    if (key == NULL) {
+        return;
+    }
+
+#ifndef WOLFSSL_LMS_VERIFY_ONLY
+    if (key->working_key != NULL) {
+        hss_free_working_key(key->working_key);
+        key->working_key = NULL;
+    }
+#endif /* ifndef WOLFSSL_LMS_VERIFY_ONLY */
+
+    ForceZero(key, sizeof(LmsKey));
+
+    key->state = WC_LMS_STATE_FREED;
+
+    return;
+}
+
+#ifndef WOLFSSL_LMS_VERIFY_ONLY
+/* Set the write private key callback to the LMS key structure.
+ *
+ * The callback must be able to write/update the private key to
+ * non-volatile storage.
+ *
+ * Returns 0 on success.
+ * */
+int wc_LmsKey_SetWriteCb(LmsKey * key, write_private_key_cb write_cb)
+{
+    if (key == NULL || write_cb == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    /* Changing the write callback of an already working key is forbidden. */
+    if (key->state == WC_LMS_STATE_OK) {
+        WOLFSSL_MSG("error: wc_LmsKey_SetWriteCb: key in use");
+        return -1;
+    }
+
+    key->write_private_key = write_cb;
+
+    return 0;
+}
+
+/* Set the read private key callback to the LMS key structure.
+ *
+ * The callback must be able to read the private key from
+ * non-volatile storage.
+ *
+ * Returns 0 on success.
+ * */
+int wc_LmsKey_SetReadCb(LmsKey * key, read_private_key_cb read_cb)
+{
+    if (key == NULL || read_cb == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    /* Changing the read callback of an already working key is forbidden. */
+    if (key->state == WC_LMS_STATE_OK) {
+        WOLFSSL_MSG("error: wc_LmsKey_SetReadCb: key in use");
+        return -1;
+    }
+
+    key->read_private_key = read_cb;
+
+    return 0;
+}
+
+/* Sets the context to be used by write and read callbacks.
+ *
+ * E.g. this could be a filename if the callbacks write/read to file.
+ *
+ * Returns 0 on success.
+ * */
+int wc_LmsKey_SetContext(LmsKey * key, void * context)
+{
+    if (key == NULL || context == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    /* Setting context of an already working key is forbidden. */
+    if (key->state == WC_LMS_STATE_OK) {
+        WOLFSSL_MSG("error: wc_LmsKey_SetContext: key in use");
+        return -1;
+    }
+
+    key->context = context;
+
+    return 0;
+}
+
+/* Make the LMS private/public key pair. The key must have its parameters
+ * set before calling this.
+ *
+ * Write/read callbacks, and context data, must be set prior.
+ * Key must have parameters set.
+ *
+ * Returns 0 on success.
+ * */
+int wc_LmsKey_MakeKey(LmsKey* key, WC_RNG * rng)
+{
+    bool result = true;
+
+    if (key == NULL || rng == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    if (key->state != WC_LMS_STATE_PARMSET) {
+        WOLFSSL_MSG("error: LmsKey not ready for generation");
+        return -1;
+    }
+
+    if (key->write_private_key == NULL || key->read_private_key == NULL) {
+        WOLFSSL_MSG("error: LmsKey write/read callbacks are not set");
+        return -1;
+    }
+
+    if (key->context == NULL) {
+        WOLFSSL_MSG("error: LmsKey context is not set");
+        return -1;
+    }
+
+    LmsRng = rng;
+
+   /* TODO: The hash-sigs lib allows you to save variable length auxiliary
+    * data, which can be used to speed up key reloading when signing. The
+    * aux data can be 300B - 1KB in size.
+    *
+    * Not implemented at the moment.
+    *
+    * key->aux_data_len = hss_get_aux_data_len(AUX_DATA_MAX_LEN, key->levels,
+    *                                          key->lm_type,
+    *                                          key->lm_ots_type);
+    *
+    * key->aux_data = XMALLOC(key->aux_data_len, NULL,
+    *                         DYNAMIC_TYPE_TMP_BUFFER);
+    */
+
+    /* First generate the private key using the parameters and callbacks.
+     * If successful, private key will be saved to non-volatile storage,
+     * and the public key will be in memory. */
+    result = hss_generate_private_key(LmsGenerateRand, key->levels,
+                                      key->lm_type, key->lm_ots_type,
+                                      LmsWritePrivKey, key,
+                                      key->pub, sizeof(key->pub),
+                                      NULL, 0, &key->info);
+
+    if (!result) {
+        WOLFSSL_MSG("error: hss_generate_private_key failed");
+        key->state = WC_LMS_STATE_BAD;
+        return -1;
+    }
+
+    /* Once generated, now we must load the private key so we have
+     * an hss working key for signing operations. */
+    key->working_key = hss_load_private_key(LmsReadPrivKey, key,
+                                            0, NULL, 0, &key->info);
+
+    if (key->working_key == NULL) {
+        WOLFSSL_MSG("error: hss_load_private_key failed");
+        key->state = WC_LMS_STATE_BAD;
+        return -1;
+    }
+
+    /* This should not happen, but check just in case. */
+    if (wc_LmsKey_SigsLeft(key) == 0) {
+        WOLFSSL_MSG("error: generated LMS key signatures exhausted");
+        key->state = WC_LMS_STATE_NOSIGS;
+        return -1;
+    }
+
+    key->state = WC_LMS_STATE_OK;
+
+    return 0;
+}
+
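A hedged usage sketch of the full generation flow, tying together Init, parameter selection, the NV callbacks, and MakeKey. my_write_cb/my_read_cb are the hypothetical file-backed callbacks sketched earlier; the file name and parameter set are arbitrary choices for illustration.

    #include <wolfssl/wolfcrypt/random.h>

    int lms_keygen_example(LmsKey * key)
    {
        WC_RNG rng;
        int    ret;

        ret = wc_InitRng(&rng);
        if (ret != 0)
            return ret;

        ret = wc_LmsKey_Init(key, NULL, INVALID_DEVID);
        if (ret == 0)
            ret = wc_LmsKey_SetLmsParm(key, WC_LMS_PARM_L2_H10_W8);
        if (ret == 0)
            ret = wc_LmsKey_SetWriteCb(key, my_write_cb);
        if (ret == 0)
            ret = wc_LmsKey_SetReadCb(key, my_read_cb);
        if (ret == 0)
            ret = wc_LmsKey_SetContext(key, (void *)"lms_key.bin");
        if (ret == 0)
            ret = wc_LmsKey_MakeKey(key, &rng); /* priv key lands in NV */

        wc_FreeRng(&rng);
        return ret;
    }
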
+/* Reload a key that has been prepared with the appropriate parms and
+ * data. Use this if you wish to resume signing with an existing key.
+ *
+ * Write/read callbacks, and context data, must be set prior.
+ * Key must have parameters set.
+ *
+ * Returns 0 on success. */
+int wc_LmsKey_Reload(LmsKey * key)
+{
+    bool result = true;
+
+    if (key == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    if (key->state != WC_LMS_STATE_PARMSET) {
+        WOLFSSL_MSG("error: LmsKey not ready for reload");
+        return -1;
+    }
+
+    if (key->write_private_key == NULL || key->read_private_key == NULL) {
+        WOLFSSL_MSG("error: LmsKey write/read callbacks are not set");
+        return -1;
+    }
+
+    if (key->context == NULL) {
+        WOLFSSL_MSG("error: LmsKey context is not set");
+        return -1;
+    }
+
+    key->working_key = hss_load_private_key(LmsReadPrivKey, key,
+                                            0, NULL, 0, &key->info);
+
+    if (key->working_key == NULL) {
+        WOLFSSL_MSG("error: hss_load_private_key failed");
+        key->state = WC_LMS_STATE_BAD;
+        return -1;
+    }
+
+    result = hss_get_parameter_set(&key->levels, key->lm_type,
+                                   key->lm_ots_type, LmsReadPrivKey, key);
+
+    if (!result) {
+        WOLFSSL_MSG("error: hss_get_parameter_set failed");
+        key->state = WC_LMS_STATE_BAD;
+        hss_free_working_key(key->working_key);
+        key->working_key = NULL;
+        return -1;
+    }
+
+    /* Double check the key actually has signatures left. */
+    if (wc_LmsKey_SigsLeft(key) == 0) {
+        WOLFSSL_MSG("error: reloaded LMS key signatures exhausted");
+        key->state = WC_LMS_STATE_NOSIGS;
+        return -1;
+    }
+
+    key->state = WC_LMS_STATE_OK;
+
+    return 0;
+}
+
+/* Given a parameter set (levels, height, Winternitz), determine
+ * the private key length. */
+int wc_LmsKey_GetPrivLen(const LmsKey * key, word32 * len)
+{
+    if (key == NULL || len == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    *len = (word32) hss_get_private_key_len(key->levels, key->lm_type,
+                                            key->lm_ots_type);
+
+    return 0;
+}
+
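+/* Sign the message msg using the LMS private key, writing the signature
+ * to sig. The private key state is advanced and committed back to NV
+ * storage via the write callback as part of signing.
+ *
+ * Returns 0 on success, and sets sigSz to the signature length.
+ * Returns BAD_FUNC_ARG when a parameter is NULL or msgSz <= 0.
+ * Returns -1 on sign failure. */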
+int wc_LmsKey_Sign(LmsKey* key, byte * sig, word32 * sigSz, const byte * msg,
+    int msgSz)
+{
+    bool   result = true;
+    size_t len = 0;
+
+    if (key == NULL || sig == NULL || sigSz == NULL || msg == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    if (msgSz <= 0) {
+        return BAD_FUNC_ARG;
+    }
+
+    if (key->state == WC_LMS_STATE_NOSIGS) {
+        WOLFSSL_MSG("error: LMS signatures exhausted");
+        return -1;
+    }
+    else if (key->state != WC_LMS_STATE_OK) {
+       /* The key had an error the last time it was used, and we
+        * can't guarantee its state. */
+        WOLFSSL_MSG("error: can't sign, LMS key not in good state");
+        return -1;
+    }
+
+    len = hss_get_signature_len(key->levels, key->lm_type, key->lm_ots_type);
+
+    if (len == 0) {
+        /* Key parameters are invalid. */
+        WOLFSSL_MSG("error: hss_get_signature_len failed");
+        key->state = WC_LMS_STATE_BAD;
+        return -1;
+    }
+
+    result = hss_generate_signature(key->working_key, LmsWritePrivKey,
+                                    key, (const void *) msg, msgSz,
+                                    sig, len, &key->info);
+
+    if (!result) {
+        if (wc_LmsKey_SigsLeft(key) == 0) {
+            WOLFSSL_MSG("error: LMS signatures exhausted");
+            key->state = WC_LMS_STATE_NOSIGS;
+            return -1;
+        }
+
+        WOLFSSL_MSG("error: hss_generate_signature failed");
+        key->state = WC_LMS_STATE_BAD;
+        return -1;
+    }
+
+    *sigSz = (word32) len;
+
+    return 0;
+}
+
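A short usage sketch for the signing path: query the signature length for the configured parameter set, allocate, then sign. Error handling is condensed for illustration; the key is assumed to have reached WC_LMS_STATE_OK via MakeKey or Reload.

    int lms_sign_example(LmsKey * key, const byte * msg, int msgSz)
    {
        byte * sig = NULL;
        word32 sigSz = 0;
        int    ret;

        ret = wc_LmsKey_GetSigLen(key, &sigSz);
        if (ret != 0)
            return ret;

        sig = (byte *)XMALLOC(sigSz, NULL, DYNAMIC_TYPE_TMP_BUFFER);
        if (sig == NULL)
            return MEMORY_E;

        ret = wc_LmsKey_Sign(key, sig, &sigSz, msg, msgSz);
        /* ... on success, use sig/sigSz, then free ... */
        XFREE(sig, NULL, DYNAMIC_TYPE_TMP_BUFFER);
        return ret;
    }
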
+/* Returns 1 if there are signatures remaining.
+ * Returns 0 if available signatures are exhausted.
+ *
+ * Note: the number of remaining signatures is hidden behind an opaque
+ * pointer in the hash-sigs lib. We could add a counter here that is
+ * decremented on every signature, but the number of available signatures
+ * grows as
+ *   N = 2 ** (levels * height)
+ * (e.g. an L2/H10 key yields 2^20 = 1048576 signatures), so the counter
+ * would need to be a big integer. */
+int wc_LmsKey_SigsLeft(LmsKey * key)
+{
+    if (key == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    if (hss_extra_info_test_last_signature(&key->info)) {
+        return 0;
+    }
+
+    return 1;
+}
+
+#endif /* ifndef WOLFSSL_LMS_VERIFY_ONLY */
+
+/* Given a parameter set (levels, height, Winternitz), determine
+ * the public key length. */
+int wc_LmsKey_GetPubLen(const LmsKey * key, word32 * len)
+{
+    if (key == NULL || len == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    *len = (word32) hss_get_public_key_len(key->levels, key->lm_type,
+                                           key->lm_ots_type);
+
+    return 0;
+}
+
+/* Export a generated public key and parameter set from one LmsKey
+ * to another. Use this to prepare a signature verification LmsKey
+ * that is pub only.
+ *
+ * Though the public key is all that is used to verify signatures,
+ * the parameter set is needed to calculate the signature length
+ * beforehand. */
+int wc_LmsKey_ExportPub(LmsKey * keyDst, const LmsKey * keySrc)
+{
+    if (keyDst == NULL || keySrc == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    ForceZero(keyDst, sizeof(LmsKey));
+
+    XMEMCPY(keyDst->pub, keySrc->pub, sizeof(keySrc->pub));
+    XMEMCPY(keyDst->lm_type, keySrc->lm_type, sizeof(keySrc->lm_type));
+    XMEMCPY(keyDst->lm_ots_type, keySrc->lm_ots_type,
+            sizeof(keySrc->lm_ots_type));
+
+    keyDst->levels = keySrc->levels;
+
+    /* Mark this key as verify only, to prevent misuse. */
+    keyDst->state = WC_LMS_STATE_VERIFYONLY;
+
+    return 0;
+}
+
+/* Exports the raw LMS public key buffer from key to out buffer.
+ * The out buffer should be large enough to hold the public key, and
+ * outLen should indicate the size of the buffer.
+ *
+ * - Returns 0 on success, and sets outLen to LMS pubLen.
+ * - Returns BUFFER_E if outLen < LMS pubLen.
+ *
+ * Call wc_LmsKey_GetPubLen beforehand to determine pubLen.
+ * */
+int wc_LmsKey_ExportPubRaw(const LmsKey * key, byte * out, word32 * outLen)
+{
+    int    ret = 0;
+    word32 pubLen = 0;
+
+    if (key == NULL || out == NULL || outLen == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    ret = wc_LmsKey_GetPubLen(key, &pubLen);
+
+    if (ret != 0) {
+        WOLFSSL_MSG("error: wc_LmsKey_GetPubLen failed");
+        return -1;
+    }
+
+    if (*outLen < pubLen) {
+        return BUFFER_E;
+    }
+
+    XMEMCPY(out, key->pub, pubLen);
+    *outLen = pubLen;
+
+    return 0;
+}
+
+/* Imports a raw public key buffer from in array to LmsKey key.
+ *
+ * The LMS parameters must be set first with wc_LmsKey_SetLmsParm or
+ * wc_LmsKey_SetParameters, and inLen must match the length returned
+ * by wc_LmsKey_GetPubLen.
+ *
+ * - Returns 0 on success.
+ * - Returns BUFFER_E if inlen != LMS pubLen.
+ *
+ * Call wc_LmsKey_GetPubLen beforehand to determine pubLen.
+ * */
+int wc_LmsKey_ImportPubRaw(LmsKey * key, const byte * in, word32 inLen)
+{
+    int    ret = 0;
+    word32 pubLen = 0;
+
+    if (key == NULL || in == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    ret = wc_LmsKey_GetPubLen(key, &pubLen);
+
+    if (ret != 0) {
+        WOLFSSL_MSG("error: wc_LmsKey_GetPubLen failed");
+        return -1;
+    }
+
+    if (inLen != pubLen) {
+        /* Something inconsistent. Parameters weren't set, or input
+         * pub key is wrong. */
+        return BUFFER_E;
+    }
+
+    XMEMCPY(key->pub, in, pubLen);
+
+    return 0;
+}
+
+/* Given a parameter set (levels, height, Winternitz), determine
+ * the signature length.
+ *
+ * Call this before wc_LmsKey_Sign so you know the length of
+ * the required signature buffer. */
+int wc_LmsKey_GetSigLen(const LmsKey * key, word32 * len)
+{
+    if (key == NULL || len == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    *len = (word32) hss_get_signature_len(key->levels, key->lm_type,
+                                          key->lm_ots_type);
+
+    return 0;
+}
+
+int wc_LmsKey_Verify(LmsKey * key, const byte * sig, word32 sigSz,
+    const byte * msg, int msgSz)
+{
+    bool result = true;
+
+    if (key == NULL || sig == NULL || msg == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+#ifdef WOLFSSL_LMS_VERIFY_ONLY
+    result = hss_validate_signature(key->pub, (const void *) msg, msgSz, sig,
+                                    sigSz, NULL);
+#else
+    result = hss_validate_signature(key->pub, (const void *) msg, msgSz, sig,
+                                    sigSz, &key->info);
+#endif
+
+
+    if (!result) {
+        WOLFSSL_MSG("error: hss_validate_signature failed");
+        return -1;
+    }
+
+    return 0;
+}
+
+#endif /* WOLFSSL_HAVE_LMS */
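
To round out the LMS API, a verification sketch using a pub-only key prepared with wc_LmsKey_ExportPub, as might run on a device that never holds private material. Purely illustrative.

    int lms_verify_example(const LmsKey * signer, const byte * sig,
                           word32 sigSz, const byte * msg, int msgSz)
    {
        LmsKey pubOnly;
        int    ret;

        ret = wc_LmsKey_ExportPub(&pubOnly, signer);
        if (ret != 0)
            return ret;

        /* Returns 0 when the signature is valid. */
        return wc_LmsKey_Verify(&pubOnly, sig, sigSz, msg, msgSz);
    }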

+ 981 - 0
lib/wolfssl/wolfcrypt/src/ext_xmss.c

@@ -0,0 +1,981 @@
+/* ext_xmss.c
+ *
+ * Copyright (C) 2006-2023 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#ifdef HAVE_CONFIG_H
+    #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#include <wolfssl/wolfcrypt/logging.h>
+#include <wolfssl/wolfcrypt/sha256.h>
+
+#ifdef WOLFSSL_HAVE_XMSS
+#include <wolfssl/wolfcrypt/ext_xmss.h>
+
+#ifdef NO_INLINE
+    #include <wolfssl/wolfcrypt/misc.h>
+#else
+    #define WOLFSSL_MISC_INCLUDED
+    #include <wolfcrypt/src/misc.c>
+#endif
+
+#include <xmss_callbacks.h>
+
+#ifndef WOLFSSL_XMSS_VERIFY_ONLY
+static THREAD_LS_T WC_RNG * xmssRng = NULL;
+
+/* RNG callback used by xmss.
+ * */
+static int rng_cb(void * output, size_t length)
+{
+    int ret = 0;
+
+    if (output == NULL || xmssRng == NULL) {
+        return -1;
+    }
+
+    if (length == 0) {
+        return 0;
+    }
+
+    ret = wc_RNG_GenerateBlock(xmssRng, (byte *)output, (word32)length);
+
+    if (ret) {
+        WOLFSSL_MSG("error: XMSS rng_cb failed");
+        return -1;
+    }
+
+    return 0;
+}
+#endif /* ifndef WOLFSSL_XMSS_VERIFY_ONLY */
+
+/* SHA256 callback used by XMSS.
+ * */
+static int sha256_cb(const unsigned char *in, unsigned long long inlen,
+                     unsigned char *out)
+{
+    wc_Sha256 sha;
+
+    if (wc_InitSha256_ex(&sha, NULL, INVALID_DEVID) != 0) {
+        WOLFSSL_MSG("SHA256 Init failed");
+        return -1;
+    }
+
+    if (wc_Sha256Update(&sha, in, (word32) inlen) != 0) {
+        WOLFSSL_MSG("SHA256 Update failed");
+        wc_Sha256Free(&sha);
+        return -1;
+    }
+
+    if (wc_Sha256Final(&sha, out) != 0) {
+        WOLFSSL_MSG("SHA256 Final failed");
+        wc_Sha256Free(&sha);
+        return -1;
+    }
+    wc_Sha256Free(&sha);
+
+    return 0;
+}
+
+/* Init an XMSS key.
+ *
+ * Call this before setting the parms of an XMSS key.
+ *
+ *  key         [in]  The XMSS key to init.
+ *  heap        [in]  Unused.
+ *  devId       [in]  Unused.
+ *
+ *  returns     0 on success.
+ *  returns     BAD_FUNC_ARG when a parameter is NULL.
+ * */
+int wc_XmssKey_Init(XmssKey * key, void * heap, int devId)
+{
+    if (key == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    (void) heap;
+    (void) devId;
+
+    ForceZero(key, sizeof(XmssKey));
+
+#ifndef WOLFSSL_XMSS_VERIFY_ONLY
+    key->sk = NULL;
+    key->sk_len = 0;
+    key->write_private_key = NULL;
+    key->read_private_key = NULL;
+    key->context = NULL;
+#endif /* ifndef WOLFSSL_XMSS_VERIFY_ONLY */
+    key->state = WC_XMSS_STATE_INITED;
+
+    return 0;
+}
+
+/* Sets the XMSS key parameters, given an OID.
+ *
+ * Note: XMSS and XMSS^MT parameter sets have overlapping
+ * OIDs, so the is_xmssmt flag is needed to disambiguate them.
+ *
+ *  key         [in]  The XMSS key to set.
+ *  OID         [in]  The XMSS parameter set OID.
+ *  is_xmssmt   [in]  1 The OID is assumed to be XMSS^MT.
+ *                    0 The OID is assumed to be XMSS.
+ *
+ *  returns     0 on success.
+ *  returns     BAD_FUNC_ARG when a parameter is NULL.
+ *  returns     -1 on parse failure.
+ * */
+static int wc_XmssKey_SetOid(XmssKey * key, uint32_t oid, int is_xmssmt)
+{
+    int ret = 0;
+
+    if (key == NULL || oid == 0) {
+        return BAD_FUNC_ARG;
+    }
+
+    /* Parse the OID and load the XMSS params structure. */
+    if (is_xmssmt) {
+        ret = xmssmt_parse_oid(&key->params, oid);
+    }
+    else {
+        ret = xmss_parse_oid(&key->params, oid);
+    }
+
+    if (ret != 0) {
+        WOLFSSL_MSG("error: XMSS parse oid failed");
+        return -1;
+    }
+
+    /* Finally, sanity check that this is a supported parameter set.
+     *
+     * We are only supporting XMSS/XMSS^MT with SHA256 parameter sets
+     * that NIST SP 800-208 has standardized. See patched xmss-reference
+     * params.h for the defines. */
+    if (key->params.func != XMSS_SHA2 ||
+        key->params.n != XMSS_SHA256_N ||
+        key->params.padding_len != XMSS_SHA256_PADDING_LEN ||
+        key->params.wots_w != 16 ||
+        key->params.wots_len != XMSS_SHA256_WOTS_LEN) {
+        WOLFSSL_MSG("error: unsupported XMSS/XMSS^MT parameter set");
+        return -1;
+    }
+
+    ret = xmss_set_sha_cb(sha256_cb);
+    if (ret != 0) {
+        WOLFSSL_MSG("error: xmss_set_sha_cb failed");
+        return -1;
+    }
+
+#ifndef WOLFSSL_XMSS_VERIFY_ONLY
+    ret = xmss_set_rng_cb(rng_cb);
+    if (ret != 0) {
+        WOLFSSL_MSG("error: xmss_set_rng_cb failed");
+        return -1;
+    }
+#endif
+
+    key->oid = oid;
+    key->is_xmssmt = is_xmssmt;
+    key->state = WC_XMSS_STATE_PARMSET;
+
+    return 0;
+}
+
+/* Set the XMSS key parameter string.
+ *
+ * The input string must be one of the supported parm set names in
+ * the "Name" section from the table in wolfssl/wolfcrypt/xmss.h,
+ * e.g. "XMSS-SHA2_10_256" or "XMSSMT-SHA2_20/4_256".
+ *
+ *  key         [in]  The XMSS key to set.
+ *  str         [in]  The XMSS/XMSS^MT parameter string.
+ *
+ *  returns     0 on success.
+ *  returns     BAD_FUNC_ARG when a parameter is NULL.
+ *  returns     -1 on failure.
+ * */
+int wc_XmssKey_SetParamStr(XmssKey * key, const char * str)
+{
+    int      ret = 0;
+    uint32_t oid = 0;
+    int      is_xmssmt = 0;
+
+    if (key == NULL || str == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    if (key->state != WC_XMSS_STATE_INITED) {
+        WOLFSSL_MSG("error: XMSS key needs init");
+        return BAD_FUNC_ARG;
+    }
+
+    switch(XSTRLEN(str)) {
+    case XMSS_NAME_LEN:
+        is_xmssmt = 0;
+        break;
+    case XMSSMT_NAME_MIN_LEN:
+    case XMSSMT_NAME_MAX_LEN:
+        is_xmssmt = 1;
+        break;
+    default:
+        WOLFSSL_MSG("error: XMSS param str invalid length");
+        return BAD_FUNC_ARG;
+    }
+
+    /* Convert XMSS param string to OID. */
+    if (is_xmssmt) {
+        ret = xmssmt_str_to_oid(&oid, str);
+    }
+    else {
+        ret = xmss_str_to_oid(&oid, str);
+    }
+
+    if (ret != 0) {
+        WOLFSSL_MSG("error: xmssmt_str_to_oid failed");
+        return -1;
+    }
+
+    return wc_XmssKey_SetOid(key, oid, is_xmssmt);
+}
+
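For reference, a couple of hypothetical calls showing the two name formats this function accepts (both strings are from the comment above; the full list is in the table in wolfssl/wolfcrypt/xmss.h):

    wc_XmssKey_SetParamStr(&key, "XMSS-SHA2_10_256");     /* single-tree XMSS */
    wc_XmssKey_SetParamStr(&key, "XMSSMT-SHA2_20/4_256"); /* multi-tree XMSS^MT */
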
+/* Force zeros and frees the XMSS key from memory.
+ *
+ * This does not touch the private key saved to non-volatile storage.
+ *
+ * This is the only function that frees the key->sk array.
+ *
+ *  key         [in]  The XMSS key.
+ *
+ *  returns     void
+ * */
+void wc_XmssKey_Free(XmssKey* key)
+{
+    if (key == NULL) {
+        return;
+    }
+
+#ifndef WOLFSSL_XMSS_VERIFY_ONLY
+    if (key->sk != NULL) {
+        ForceZero(key->sk, key->sk_len);
+        XFREE(key->sk, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+        key->sk = NULL;
+        key->sk_len = 0;
+    }
+#endif /* ifndef WOLFSSL_XMSS_VERIFY_ONLY */
+
+    ForceZero(key, sizeof(XmssKey));
+
+    key->state = WC_XMSS_STATE_FREED;
+
+    return;
+}
+
+#ifndef WOLFSSL_XMSS_VERIFY_ONLY
+/* Sets the XMSS write private key callback.
+ *
+ * The callback must be able to write/update the private key to
+ * non-volatile storage.
+ *
+ *  key         [in]  The XMSS key.
+ *  write_cb    [in]  The write private key callback.
+ *
+ *  returns     0 on success.
+ *  returns     BAD_FUNC_ARG when a parameter is NULL.
+ *  returns     -1 on failure.
+ * */
+int wc_XmssKey_SetWriteCb(XmssKey * key, write_private_key_cb write_cb)
+{
+    if (key == NULL || write_cb == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    /* Changing the write callback of an already working key is forbidden. */
+    if (key->state == WC_XMSS_STATE_OK) {
+        WOLFSSL_MSG("error: wc_XmssKey_SetWriteCb: key in use");
+        return -1;
+    }
+
+    key->write_private_key = write_cb;
+
+    return 0;
+}
+
+/* Sets the XMSS read private key callback.
+ *
+ * The callback must be able to read the private key from
+ * non-volatile storage.
+ *
+ *  key         [in]  The XMSS key.
+ *  read_cb     [in]  The read private key callback.
+ *
+ *  returns     0 on success.
+ *  returns     BAD_FUNC_ARG when a parameter is NULL.
+ *  returns     -1 on failure.
+ * */
+int wc_XmssKey_SetReadCb(XmssKey * key, read_private_key_cb read_cb)
+{
+    if (key == NULL || read_cb == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    /* Changing the read callback of an already working key is forbidden. */
+    if (key->state == WC_XMSS_STATE_OK) {
+        WOLFSSL_MSG("error: wc_XmssKey_SetReadCb: key in use");
+        return -1;
+    }
+
+    key->read_private_key = read_cb;
+
+    return 0;
+}
+
+/* Sets the XMSS context to be used by write and read callbacks.
+ *
+ * E.g. this could be a filename if the callbacks write/read to file.
+ *
+ *  key         [in]  The XMSS key.
+ *  context     [in]  The context pointer.
+ *
+ *  returns     0 on success.
+ *  returns     BAD_FUNC_ARG when a parameter is NULL.
+ *  returns     -1 on failure.
+ * */
+int wc_XmssKey_SetContext(XmssKey * key, void * context)
+{
+    if (key == NULL || context == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    /* Setting context of an already working key is forbidden. */
+    if (key->state == WC_XMSS_STATE_OK) {
+        WOLFSSL_MSG("error: wc_XmssKey_SetContext: key in use");
+        return -1;
+    }
+
+    key->context = context;
+
+    return 0;
+}
+
+
+/* Allocates the XMSS secret key (sk) array.
+ *
+ * The XMSS/XMSS^MT secret key length is a function of the
+ * parameters, and can't be allocated until the param string
+ * has been set with SetParamStr.
+ *
+ * This is only called by MakeKey() and Reload().
+ *
+ * Note: the XMSS sk array is force zeroed after every use.
+ *
+ *  key         [in]  The XMSS key.
+ *
+ *  returns     0 on success.
+ *  returns     BAD_FUNC_ARG when a parameter is NULL.
+ *  returns     -1 on failure.
+ * */
+static int wc_XmssKey_AllocSk(XmssKey* key)
+{
+    int ret = 0;
+
+    if (key == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    if (key->sk != NULL) {
+        WOLFSSL_MSG("error: XMSS secret key already exists");
+        return -1;
+    }
+
+    /* The XMSS/XMSS^MT secret key length is a function of the
+     * parameters. Therefore can't allocate this until param
+     * string has been set. */
+    ret = wc_XmssKey_GetPrivLen(key, &key->sk_len);
+
+    if (ret != 0 || key->sk_len == 0) {
+        WOLFSSL_MSG("error: wc_XmssKey_GetPrivLen failed");
+        return -1;
+    }
+
+    key->sk = (unsigned char *)XMALLOC(key->sk_len, NULL,
+                                       DYNAMIC_TYPE_TMP_BUFFER);
+
+    if (key->sk == NULL) {
+        WOLFSSL_MSG("error: malloc XMSS key->sk failed");
+        return -1;
+    }
+
+    ForceZero(key->sk, key->sk_len);
+
+    return 0;
+}
+
+/* Make the XMSS/XMSS^MT private/public key pair. The key must have
+ * its parameters set before calling this.
+ *
+ * Write/read callbacks, and context data, must be set prior.
+ * Key must have parameters set.
+ *
+ * This function and Reload() are the only functions that allocate
+ * the key->sk array. wc_XmssKey_Free is the only function that
+ * deallocates key->sk.
+ *
+ *  key         [in]  The XMSS key to make.
+ *  rng         [in]  Initialized WC_RNG pointer.
+ *
+ *  returns     0 on success.
+ *  returns     BAD_FUNC_ARG when a parameter is NULL.
+ *  returns     -1 on failure.
+ * */
+int wc_XmssKey_MakeKey(XmssKey* key, WC_RNG * rng)
+{
+    int            ret = 0;
+    enum wc_XmssRc cb_rc = WC_XMSS_RC_NONE;
+
+    if (key == NULL || rng == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    if (key->state != WC_XMSS_STATE_PARMSET) {
+        WOLFSSL_MSG("error: XmssKey not ready for generation");
+        return -1;
+    }
+
+    if (key->write_private_key == NULL || key->read_private_key == NULL) {
+        WOLFSSL_MSG("error: XmssKey write/read callbacks are not set");
+        return -1;
+    }
+
+    if (key->context == NULL) {
+        WOLFSSL_MSG("error: XmssKey context is not set");
+        return -1;
+    }
+
+    /* Allocate sk array. */
+    ret = wc_XmssKey_AllocSk(key);
+
+    if (ret != 0) {
+        return ret;
+    }
+
+    xmssRng = rng;
+
+    /* Finally make the secret public key pair. Immediately write it to NV
+     * storage and then clear from memory. */
+    if (key->is_xmssmt) {
+        ret = xmssmt_keypair(key->pk, key->sk, key->oid);
+    }
+    else {
+        ret = xmss_keypair(key->pk, key->sk, key->oid);
+    }
+
+    if (ret == 0) {
+        cb_rc = key->write_private_key(key->sk, key->sk_len, key->context);
+    }
+
+    ForceZero(key->sk, key->sk_len);
+
+    if (ret != 0) {
+        WOLFSSL_MSG("error: XMSS keypair failed");
+        key->state = WC_XMSS_STATE_BAD;
+        return -1;
+    }
+
+    if (cb_rc != WC_XMSS_RC_SAVED_TO_NV_MEMORY) {
+        WOLFSSL_MSG("error: XMSS write to NV storage failed");
+        key->state = WC_XMSS_STATE_BAD;
+        return -1;
+    }
+
+    key->state = WC_XMSS_STATE_OK;
+
+    return 0;
+}
+
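A usage sketch of the XMSS generation flow, parallel to the LMS one. my_xmss_write_cb/my_xmss_read_cb are hypothetical NV callbacks returning the enum wc_XmssRc codes this function checks; the file name and parameter string are illustrative.

    #include <wolfssl/wolfcrypt/random.h>

    int xmss_keygen_example(XmssKey * key)
    {
        WC_RNG rng;
        int    ret;

        ret = wc_InitRng(&rng);
        if (ret != 0)
            return ret;

        ret = wc_XmssKey_Init(key, NULL, INVALID_DEVID);
        if (ret == 0)
            ret = wc_XmssKey_SetParamStr(key, "XMSS-SHA2_10_256");
        if (ret == 0)
            ret = wc_XmssKey_SetWriteCb(key, my_xmss_write_cb);
        if (ret == 0)
            ret = wc_XmssKey_SetReadCb(key, my_xmss_read_cb);
        if (ret == 0)
            ret = wc_XmssKey_SetContext(key, (void *)"xmss_key.bin");
        if (ret == 0)
            ret = wc_XmssKey_MakeKey(key, &rng);

        wc_FreeRng(&rng);
        return ret;
    }
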
+/* This function allocates the secret key buffer, does a quick
+ * sanity check to verify the secret key is readable from NV
+ * storage, and then force-zeros the key from memory.
+ *
+ * On success it sets the key state to OK.
+ *
+ * Use this function to resume signing with an already existing
+ * XMSS key pair.
+ *
+ * Write/read callbacks, and context data, must be set prior.
+ * Key must have parameters set.
+ *
+ * Returns 0 on success.
+ *
+ * This function and MakeKey are the only functions that allocate
+ * the key->sk array. wc_XmssKey_Free is the only function that
+ * deallocates key->sk.
+ *
+ *  key         [in]      XMSS key to load.
+ *
+ *  returns     0 on success.
+ *  returns     BAD_FUNC_ARG when a parameter is NULL.
+ *  returns     -1 on load fail.
+ * */
+int wc_XmssKey_Reload(XmssKey * key)
+{
+    int            ret = 0;
+    enum wc_XmssRc cb_rc = WC_XMSS_RC_NONE;
+
+    if (key == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    if (key->state != WC_XMSS_STATE_PARMSET) {
+        WOLFSSL_MSG("error: XmssKey not ready for reload");
+        return -1;
+    }
+
+    if (key->write_private_key == NULL || key->read_private_key == NULL) {
+        WOLFSSL_MSG("error: XmssKey write/read callbacks are not set");
+        return -1;
+    }
+
+    if (key->context == NULL) {
+        WOLFSSL_MSG("error: XmssKey context is not set");
+        return -1;
+    }
+
+    /* Allocate sk array. */
+    ret = wc_XmssKey_AllocSk(key);
+
+    if (ret != 0) {
+        return ret;
+    }
+
+    /* Read the current secret key from NV storage. Force clear it
+     * immediately. This is just to sanity check the secret key
+     * is readable from permanent storage. */
+    cb_rc = key->read_private_key(key->sk, key->sk_len, key->context);
+    ForceZero(key->sk, key->sk_len);
+
+    if (cb_rc != WC_XMSS_RC_READ_TO_MEMORY) {
+        WOLFSSL_MSG("error: XMSS read from NV storage failed");
+        key->state = WC_XMSS_STATE_BAD;
+        return -1;
+    }
+
+    key->state = WC_XMSS_STATE_OK;
+
+    return 0;
+}
+
+/* Gets the XMSS/XMSS^MT private key length.
+ *
+ * Parameters must be set before calling this, as the key size (sk_bytes)
+ * is a function of the parameters.
+ *
+ * Note: the XMSS/XMSS^MT private key format is implementation specific,
+ * and not standardized. Interoperability of XMSS private keys should
+ * not be expected.
+ *
+ *  key         [in]      The XMSS key.
+ *  len         [out]     The length of the private key in bytes.
+ *
+ *  returns     0 on success.
+ *  returns     BAD_FUNC_ARG when a parameter is NULL.
+ *  returns     -1 when the parameters have not been set.
+ * */
+int wc_XmssKey_GetPrivLen(const XmssKey * key, word32 * len)
+{
+    if (key == NULL || len == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    if (key->state != WC_XMSS_STATE_OK && key->state != WC_XMSS_STATE_PARMSET) {
+        /* params.sk_bytes not set yet. */
+        return -1;
+    }
+
+    *len = XMSS_OID_LEN + (word32) key->params.sk_bytes;
+
+    return 0;
+}
+
+/* Signs the message using the XMSS secret key, and
+ * updates the secret key on NV storage.
+ *
+ * Both operations must succeed to be considered
+ * successful.
+ *
+ * On success:  sets key state to WC_XMSS_STATE_OK.
+ * On failure:  sets key state to WC_XMSS_STATE_BAD
+ *
+ * If no signatures are left, sets state to WC_XMSS_STATE_NOSIGS.
+ */
+static void wc_XmssKey_SignUpdate(XmssKey* key, byte * sig, word32 * sigLen,
+    const byte * msg, int msgLen)
+{
+    int                ret = -1;
+    unsigned long long len = *sigLen;
+    enum wc_XmssRc     cb_rc = WC_XMSS_RC_NONE;
+
+    /* Set the key state to bad by default. State is presumed bad
+     * unless a correct sign and update operation happen together. */
+    key->state = WC_XMSS_STATE_BAD;
+    *sigLen = 0;
+
+    /* Read the current secret key from NV storage.*/
+    cb_rc = key->read_private_key(key->sk, key->sk_len, key->context);
+
+    if (cb_rc == WC_XMSS_RC_READ_TO_MEMORY) {
+        /* Read was good. Now sign and update the secret key in memory. */
+        if (key->is_xmssmt) {
+            ret = xmssmt_sign(key->sk, sig, &len, msg, msgLen);
+        }
+        else {
+            ret = xmss_sign(key->sk, sig, &len, msg, msgLen);
+        }
+
+        if (ret == 0 && len == key->params.sig_bytes) {
+            /* The signature succeeded. key->sk is now updated and must be
+             * committed to NV storage. */
+            cb_rc = key->write_private_key(key->sk, key->sk_len, key->context);
+
+            if (cb_rc == WC_XMSS_RC_SAVED_TO_NV_MEMORY) {
+                /* key->sk was successfully committed to NV storage. Set the
+                 * key state to OK, and set the sigLen. */
+                key->state = WC_XMSS_STATE_OK;
+                *sigLen = (word32) len;
+            }
+            else {
+                /* Write to NV storage failed. Erase the signature from
+                 * memory. */
+                ForceZero(sig, key->params.sig_bytes);
+                WOLFSSL_MSG("error: XMSS write_private_key failed");
+            }
+        }
+        else if (ret == -2) {
+            /* Signature space exhausted. */
+            key->state = WC_XMSS_STATE_NOSIGS;
+            WOLFSSL_MSG("error: no XMSS signatures remaining");
+        }
+        else {
+            /* Something failed or inconsistent in signature. Erase the
+             * signature just to be safe. */
+            ForceZero(sig, key->params.sig_bytes);
+            WOLFSSL_MSG("error: XMSS sign failed");
+        }
+    }
+    else {
+        /* Read from NV storage failed. */
+        WOLFSSL_MSG("error: XMSS read_private_key failed");
+    }
+
+    /* Force zero the secret key from memory always. */
+    ForceZero(key->sk, key->sk_len);
+
+    return;
+}
+
+/* Sign the message using the XMSS secret key.
+ *
+ *  key         [in]      XMSS key to use to sign.
+ *  sig         [in]      Buffer to write signature into.
+ *  sigLen      [in/out]  On in, size of buffer.
+ *                        On out, the length of the signature in bytes.
+ *  msg         [in]      Message to sign.
+ *  msgLen      [in]      Length of the message in bytes.
+ *
+ *  returns     0 on success.
+ *  returns     -1 on sign fail.
+ *  returns     BAD_FUNC_ARG when a parameter is NULL.
+ *  returns     BUFFER_E when sigLen is too small.
+ */
+int wc_XmssKey_Sign(XmssKey* key, byte * sig, word32 * sigLen, const byte * msg,
+    int msgLen)
+{
+    if (key == NULL || sig == NULL || sigLen == NULL || msg == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    if (msgLen <= 0) {
+        return BAD_FUNC_ARG;
+    }
+
+    if (*sigLen < key->params.sig_bytes) {
+        /* Signature buffer too small. */
+        WOLFSSL_MSG("error: XMSS sig buffer too small");
+        return BUFFER_E;
+    }
+
+    if (key->state == WC_XMSS_STATE_NOSIGS) {
+        WOLFSSL_MSG("error: XMSS signatures exhausted");
+        return -1;
+    }
+    else if (key->state != WC_XMSS_STATE_OK) {
+       /* The key had an error the last time it was used, and we
+        * can't guarantee its state. */
+        WOLFSSL_MSG("error: can't sign, XMSS key not in good state");
+        return -1;
+    }
+
+    if (key->write_private_key == NULL || key->read_private_key == NULL) {
+        WOLFSSL_MSG("error: XmssKey write/read callbacks are not set");
+        return -1;
+    }
+
+    if (key->context == NULL) {
+        WOLFSSL_MSG("error: XmssKey context is not set");
+        return -1;
+    }
+
+    /* Finally, sign and update the secret key. */
+    wc_XmssKey_SignUpdate(key, sig, sigLen, msg, msgLen);
+
+    return (key->state == WC_XMSS_STATE_OK) ? 0 : -1;
+}
+#endif /* ifndef WOLFSSL_XMSS_VERIFY_ONLY */
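
A signing sketch, analogous to the LMS one: size the buffer from the parameter set, sign, and remember that every signature rewrites the private key in NV storage. Condensed error handling, illustrative only.

    int xmss_sign_example(XmssKey * key, const byte * msg, int msgLen)
    {
        byte * sig = NULL;
        word32 sigLen = 0;
        int    ret;

        ret = wc_XmssKey_GetSigLen(key, &sigLen);
        if (ret != 0)
            return ret;

        sig = (byte *)XMALLOC(sigLen, NULL, DYNAMIC_TYPE_TMP_BUFFER);
        if (sig == NULL)
            return MEMORY_E;

        ret = wc_XmssKey_Sign(key, sig, &sigLen, msg, msgLen);
        /* ... on success, use sig/sigLen, then free ... */
        XFREE(sig, NULL, DYNAMIC_TYPE_TMP_BUFFER);
        return ret;
    }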
+
+/* Get the XMSS/XMSS^MT public key length. The public key
+ * is static in size and does not depend on parameters,
+ * other than the choice of SHA256 as hashing function.
+ *
+ *  key         [in]      The XMSS key.
+ *  len         [out]     The length of the public key.
+ *
+ *  returns     0 on success.
+ *  returns     BAD_FUNC_ARG when a parameter is NULL.
+ */
+int wc_XmssKey_GetPubLen(const XmssKey * key, word32 * len)
+{
+    if (key == NULL || len == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    *len = XMSS_SHA256_PUBLEN;
+
+    return 0;
+}
+
+/* Export a generated public key and parameter set from one XmssKey
+ * to another. Use this to prepare a signature verification XmssKey
+ * that is pub only.
+ *
+ *  keyDst      [out]    Destination key for copy.
+ *  keySrc      [in]     Source key for copy.
+ *
+ *  returns     0 on success.
+ *  returns     BAD_FUNC_ARG when a parameter is NULL.
+ * */
+int wc_XmssKey_ExportPub(XmssKey * keyDst, const XmssKey * keySrc)
+{
+    if (keyDst == NULL || keySrc == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    ForceZero(keyDst, sizeof(XmssKey));
+
+    XMEMCPY(keyDst->pk, keySrc->pk, sizeof(keySrc->pk));
+
+    keyDst->oid = keySrc->oid;
+    keyDst->is_xmssmt = keySrc->is_xmssmt;
+
+    /* Mark keyDst as verify only, to prevent misuse. */
+    keyDst->state = WC_XMSS_STATE_VERIFYONLY;
+
+    return 0;
+}
+
+/* Exports the raw XMSS public key buffer from key to out buffer.
+ * The out buffer should be large enough to hold the public key, and
+ * outLen should indicate the size of the buffer.
+ *
+ *  key         [in]      XMSS key.
+ *  out         [out]     Array holding public key.
+ *  outLen      [in/out]  On in, size of buffer.
+ *                        On out, the length of the public key.
+ *
+ *  returns     0 on success.
+ *  returns     -1 on failure.
+ *  returns     BAD_FUNC_ARG when a parameter is NULL.
+ *  returns     BUFFER_E if array is too small.
+ * */
+int wc_XmssKey_ExportPubRaw(const XmssKey * key, byte * out, word32 * outLen)
+{
+    int    ret = 0;
+    word32 pubLen = 0;
+
+    if (key == NULL || out == NULL || outLen == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    ret = wc_XmssKey_GetPubLen(key, &pubLen);
+
+    if (ret != 0) {
+        WOLFSSL_MSG("error: wc_XmssKey_GetPubLen failed");
+        return -1;
+    }
+
+    if (*outLen < pubLen) {
+        return BUFFER_E;
+    }
+
+    XMEMCPY(out, key->pk, pubLen);
+    *outLen = pubLen;
+
+    return 0;
+}
+
+/* Imports a raw public key buffer from in array to XmssKey key.
+ *
+ * The XMSS parameters must be set first with wc_XmssKey_SetParamStr,
+ * and inLen must match the length returned by wc_XmssKey_GetPubLen.
+ *
+ *  key         [in]      XMSS key.
+ *  in          [in]      Array holding public key.
+ *  inLen       [in]      Length of array in bytes.
+ *
+ *  returns     0 on success.
+ *  returns     BAD_FUNC_ARG when a parameter is NULL.
+ *  returns     BUFFER_E if array is incorrect size.
+ *  returns     -1 on failure.
+ * */
+int wc_XmssKey_ImportPubRaw(XmssKey * key, const byte * in, word32 inLen)
+{
+    int    ret = 0;
+    word32 pubLen = 0;
+
+    if (key == NULL || in == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    if (key->state != WC_XMSS_STATE_PARMSET) {
+        /* XMSS key not ready for import. Param str must be set first. */
+        WOLFSSL_MSG("error: XMSS key not ready for import");
+        return -1;
+    }
+
+    ret = wc_XmssKey_GetPubLen(key, &pubLen);
+
+    if (ret != 0) {
+        WOLFSSL_MSG("error: wc_XmssKey_GetPubLen failed");
+        return -1;
+    }
+
+    if (inLen != pubLen) {
+        /* Something inconsistent. Parameters weren't set, or input
+         * pub key is wrong. */
+        return BUFFER_E;
+    }
+
+    XMEMCPY(key->pk, in, pubLen);
+
+    key->state = WC_XMSS_STATE_VERIFYONLY;
+
+    return 0;
+}
+
+/* Gets the XMSS/XMSS^MT signature length.
+ *
+ * Parameters must be set before calling this, as the signature size
+ * is a function of the parameters.
+ *
+ * Note: call this before wc_XmssKey_Sign or Verify so you know the
+ * length of the required signature buffer.
+ *
+ *  key         [in]      XMSS key to use to sign.
+ *  len         [out]     The length of the signature in bytes.
+ *
+ *  returns     0 on success.
+ *  returns     BAD_FUNC_ARG when a parameter is NULL.
+ *  returns     -1 when the parameters have not been set.
+ * */
+int wc_XmssKey_GetSigLen(const XmssKey * key, word32 * len)
+{
+    if (key == NULL || len == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    if (key->state != WC_XMSS_STATE_OK && key->state != WC_XMSS_STATE_PARMSET) {
+        return -1;
+    }
+
+    *len = key->params.sig_bytes;
+
+    return 0;
+}
+
+/* Verify the signature using the XMSS public key.
+ *
+ * Requires that XMSS parameters have been set with
+ * wc_XmssKey_SetParamStr, and that a public key is available
+ * from importing or MakeKey().
+ *
+ * Call wc_XmssKey_GetSigLen() before this function to determine
+ * length of the signature buffer.
+ *
+ *  key         [in]  XMSS key to use to verify.
+ *  sig         [in]  Signature to verify.
+ *  sigLen      [in]  Size of signature in bytes.
+ *  msg         [in]  Message to verify.
+ *  msgLen      [in]  Length of the message in bytes.
+ *
+ *  returns     0 on success.
+ *  returns     -1 on verify fail.
+ *  returns     BAD_FUNC_ARG when a parameter is NULL.
+ *  returns     BUFFER_E when sigLen is too small.
+ */
+int wc_XmssKey_Verify(XmssKey * key, const byte * sig, word32 sigLen,
+    const byte * msg, int msgLen)
+{
+    int                ret = 0;
+    unsigned long long msg_len = 0;
+
+    if (key == NULL || sig == NULL || msg == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    if (msgLen <= 0) {
+        return BAD_FUNC_ARG;
+    }
+
+    msg_len = (unsigned long long)msgLen;
+
+    if (sigLen < key->params.sig_bytes) {
+        /* Signature buffer too small. */
+        return BUFFER_E;
+    }
+
+    if (key->state != WC_XMSS_STATE_OK &&
+        key->state != WC_XMSS_STATE_VERIFYONLY) {
+        /* XMSS key not ready for verification. Param str must be
+         * set first, and Reload() called. */
+        WOLFSSL_MSG("error: XMSS key not ready for verification");
+        return -1;
+    }
+
+    if (key->is_xmssmt) {
+        ret = xmssmt_sign_open(msg, &msg_len, sig, sigLen, key->pk);
+    }
+    else {
+        ret = xmss_sign_open(msg, &msg_len, sig, sigLen, key->pk);
+    }
+
+    if (ret != 0 || (int) msg_len != msgLen) {
+        WOLFSSL_MSG("error: XMSS verify failed");
+        return -1;
+    }
+
+    return ret;
+}
+
+#endif /* WOLFSSL_HAVE_XMSS */
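
Finally, a verify-only sketch: set the parameter string, import the raw public key, verify. This mirrors the state transitions the functions above enforce (PARMSET before import, VERIFYONLY for verify); buffers are assumed supplied by the caller, and the parameter string must match the signer's.

    int xmss_verify_example(const byte * pub, word32 pubLen,
                            const byte * sig, word32 sigLen,
                            const byte * msg, int msgLen)
    {
        XmssKey key;
        int     ret;

        ret = wc_XmssKey_Init(&key, NULL, INVALID_DEVID);
        if (ret == 0)
            ret = wc_XmssKey_SetParamStr(&key, "XMSS-SHA2_10_256");
        if (ret == 0)
            ret = wc_XmssKey_ImportPubRaw(&key, pub, pubLen);
        if (ret == 0)
            ret = wc_XmssKey_Verify(&key, sig, sigLen, msg, msgLen);

        wc_XmssKey_Free(&key);
        return ret;
    }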

+ 126 - 126
lib/wolfssl/wolfcrypt/src/fe_448.c

@@ -167,7 +167,7 @@ void fe448_sub(word8* r, const word8* a, const word8* b)
     }
 }
 
-/* Mulitply a field element by 39081. r = (39081 * a) mod (2^448 - 2^224 - 1)
+/* Multiply a field element by 39081. r = (39081 * a) mod (2^448 - 2^224 - 1)
  *
  * r  [in]  Field element to hold result.
  * a  [in]  Field element to multiply.
@@ -192,7 +192,7 @@ void fe448_mul39081(word8* r, const word8* a)
     }
 }
 
-/* Mulitply two field elements. r = (a * b) mod (2^448 - 2^224 - 1)
+/* Multiply two field elements. r = (a * b) mod (2^448 - 2^224 - 1)
  *
  * r  [in]  Field element to hold result.
  * a  [in]  Field element to multiply.
@@ -448,7 +448,7 @@ void fe448_neg(word8* r, const word8* a)
 }
 
 /* Raise field element to (p-3) / 4: 2^446 - 2^222 - 1
- * Used for calcualting y-ordinate from x-ordinate for Ed448.
+ * Used for calculating y-ordinate from x-ordinate for Ed448.
  *
  * r  [in]  Field element to hold result.
  * a  [in]  Field element to exponentiate.
@@ -609,62 +609,62 @@ void fe448_to_bytes(unsigned char* b, const sword64* a)
                       in4  += o; t = o << 56; in7  -= (sword64)t;
 
     /* Output as bytes */
-    b[ 0] = (in0  >>  0);
-    b[ 1] = (in0  >>  8);
-    b[ 2] = (in0  >> 16);
-    b[ 3] = (in0  >> 24);
-    b[ 4] = (in0  >> 32);
-    b[ 5] = (in0  >> 40);
-    b[ 6] = (in0  >> 48);
-    b[ 7] = (in1  >>  0);
-    b[ 8] = (in1  >>  8);
-    b[ 9] = (in1  >> 16);
-    b[10] = (in1  >> 24);
-    b[11] = (in1  >> 32);
-    b[12] = (in1  >> 40);
-    b[13] = (in1  >> 48);
-    b[14] = (in2  >>  0);
-    b[15] = (in2  >>  8);
-    b[16] = (in2  >> 16);
-    b[17] = (in2  >> 24);
-    b[18] = (in2  >> 32);
-    b[19] = (in2  >> 40);
-    b[20] = (in2  >> 48);
-    b[21] = (in3  >>  0);
-    b[22] = (in3  >>  8);
-    b[23] = (in3  >> 16);
-    b[24] = (in3  >> 24);
-    b[25] = (in3  >> 32);
-    b[26] = (in3  >> 40);
-    b[27] = (in3  >> 48);
-    b[28] = (in4  >>  0);
-    b[29] = (in4  >>  8);
-    b[30] = (in4  >> 16);
-    b[31] = (in4  >> 24);
-    b[32] = (in4  >> 32);
-    b[33] = (in4  >> 40);
-    b[34] = (in4  >> 48);
-    b[35] = (in5  >>  0);
-    b[36] = (in5  >>  8);
-    b[37] = (in5  >> 16);
-    b[38] = (in5  >> 24);
-    b[39] = (in5  >> 32);
-    b[40] = (in5  >> 40);
-    b[41] = (in5  >> 48);
-    b[42] = (in6  >>  0);
-    b[43] = (in6  >>  8);
-    b[44] = (in6  >> 16);
-    b[45] = (in6  >> 24);
-    b[46] = (in6  >> 32);
-    b[47] = (in6  >> 40);
-    b[48] = (in6  >> 48);
-    b[49] = (in7  >>  0);
-    b[50] = (in7  >>  8);
-    b[51] = (in7  >> 16);
-    b[52] = (in7  >> 24);
-    b[53] = (in7  >> 32);
-    b[54] = (in7  >> 40);
-    b[55] = (in7  >> 48);
+    b[ 0] = (byte)(in0  >>  0);
+    b[ 1] = (byte)(in0  >>  8);
+    b[ 2] = (byte)(in0  >> 16);
+    b[ 3] = (byte)(in0  >> 24);
+    b[ 4] = (byte)(in0  >> 32);
+    b[ 5] = (byte)(in0  >> 40);
+    b[ 6] = (byte)(in0  >> 48);
+    b[ 7] = (byte)(in1  >>  0);
+    b[ 8] = (byte)(in1  >>  8);
+    b[ 9] = (byte)(in1  >> 16);
+    b[10] = (byte)(in1  >> 24);
+    b[11] = (byte)(in1  >> 32);
+    b[12] = (byte)(in1  >> 40);
+    b[13] = (byte)(in1  >> 48);
+    b[14] = (byte)(in2  >>  0);
+    b[15] = (byte)(in2  >>  8);
+    b[16] = (byte)(in2  >> 16);
+    b[17] = (byte)(in2  >> 24);
+    b[18] = (byte)(in2  >> 32);
+    b[19] = (byte)(in2  >> 40);
+    b[20] = (byte)(in2  >> 48);
+    b[21] = (byte)(in3  >>  0);
+    b[22] = (byte)(in3  >>  8);
+    b[23] = (byte)(in3  >> 16);
+    b[24] = (byte)(in3  >> 24);
+    b[25] = (byte)(in3  >> 32);
+    b[26] = (byte)(in3  >> 40);
+    b[27] = (byte)(in3  >> 48);
+    b[28] = (byte)(in4  >>  0);
+    b[29] = (byte)(in4  >>  8);
+    b[30] = (byte)(in4  >> 16);
+    b[31] = (byte)(in4  >> 24);
+    b[32] = (byte)(in4  >> 32);
+    b[33] = (byte)(in4  >> 40);
+    b[34] = (byte)(in4  >> 48);
+    b[35] = (byte)(in5  >>  0);
+    b[36] = (byte)(in5  >>  8);
+    b[37] = (byte)(in5  >> 16);
+    b[38] = (byte)(in5  >> 24);
+    b[39] = (byte)(in5  >> 32);
+    b[40] = (byte)(in5  >> 40);
+    b[41] = (byte)(in5  >> 48);
+    b[42] = (byte)(in6  >>  0);
+    b[43] = (byte)(in6  >>  8);
+    b[44] = (byte)(in6  >> 16);
+    b[45] = (byte)(in6  >> 24);
+    b[46] = (byte)(in6  >> 32);
+    b[47] = (byte)(in6  >> 40);
+    b[48] = (byte)(in6  >> 48);
+    b[49] = (byte)(in7  >>  0);
+    b[50] = (byte)(in7  >>  8);
+    b[51] = (byte)(in7  >> 16);
+    b[52] = (byte)(in7  >> 24);
+    b[53] = (byte)(in7  >> 32);
+    b[54] = (byte)(in7  >> 40);
+    b[55] = (byte)(in7  >> 48);
 }
 
 /* Set the field element to 0.
@@ -788,7 +788,7 @@ void fe448_sub(sword64* r, const sword64* a, const sword64* b)
     r[7] = a[7] - b[7];
 }
 
-/* Mulitply a field element by 39081. r = (39081 * a) mod (2^448 - 2^224 - 1)
+/* Multiply a field element by 39081. r = (39081 * a) mod (2^448 - 2^224 - 1)
  *
  * r  [in]  Field element to hold result.
  * a  [in]  Field element to multiply.
@@ -826,7 +826,7 @@ void fe448_mul39081(sword64* r, const sword64* a)
     r[7] = (sword64)t7;
 }
 
-/* Mulitply two field elements. r = (a * b) mod (2^448 - 2^224 - 1)
+/* Multiply two field elements. r = (a * b) mod (2^448 - 2^224 - 1)
  *
  * r  [in]  Field element to hold result.
  * a  [in]  Field element to multiply.
@@ -1087,8 +1087,8 @@ int curve448(byte* r, const byte* n, const byte* a)
     for (i = 447; i >= 0; --i) {
         unsigned int b = (n[i >> 3] >> (i & 7)) & 1;
         swap ^= b;
-        fe448_cswap(x2, x3, swap);
-        fe448_cswap(z2, z3, swap);
+        fe448_cswap(x2, x3, (int)swap);
+        fe448_cswap(z2, z3, (int)swap);
         swap = b;
 
         /* Montgomery Ladder - double and add */
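
A note on the swap bookkeeping visible in this hunk (the (int) casts are the only change): `swap ^= b` followed by `swap = b` makes the cswap argument equal to previous-bit XOR current-bit, so the (x2, z2)/(x3, z3) pairs are exchanged exactly when the scalar bit changes, while fe448_cswap itself still runs on every iteration. A worked trace for scalar bits 1, 1, 0, 1 (with the previous bit starting at 0):

    bit b:      1  1  0  1
    cswap arg:  1  0  1  1   (= previous bit XOR current bit)

Because the swap is a constant-time conditional swap rather than a branch, the sequence of memory accesses is independent of the secret scalar.
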
@@ -1172,7 +1172,7 @@ void fe448_neg(sword64* r, const sword64* a)
 }
 
 /* Raise field element to (p-3) / 4: 2^446 - 2^222 - 1
- * Used for calcualting y-ordinate from x-ordinate for Ed448.
+ * Used for calculating y-ordinate from x-ordinate for Ed448.
  *
  * r  [in]  Field element to hold result.
  * a  [in]  Field element to exponentiate.
@@ -1434,62 +1434,62 @@ void fe448_to_bytes(unsigned char* b, const sword32* a)
                       in8  += o; t = o << 28; in15 -= (sword32)t;
 
     /* Output as bytes */
-    b[ 0] = (in0  >>  0);
-    b[ 1] = (in0  >>  8);
-    b[ 2] = (in0  >> 16);
-    b[ 3] = (in0  >> 24) + ((in1  >>  0) <<  4);
-    b[ 4] = (in1  >>  4);
-    b[ 5] = (in1  >> 12);
-    b[ 6] = (in1  >> 20);
-    b[ 7] = (in2  >>  0);
-    b[ 8] = (in2  >>  8);
-    b[ 9] = (in2  >> 16);
-    b[10] = (in2  >> 24) + ((in3  >>  0) <<  4);
-    b[11] = (in3  >>  4);
-    b[12] = (in3  >> 12);
-    b[13] = (in3  >> 20);
-    b[14] = (in4  >>  0);
-    b[15] = (in4  >>  8);
-    b[16] = (in4  >> 16);
-    b[17] = (in4  >> 24) + ((in5  >>  0) <<  4);
-    b[18] = (in5  >>  4);
-    b[19] = (in5  >> 12);
-    b[20] = (in5  >> 20);
-    b[21] = (in6  >>  0);
-    b[22] = (in6  >>  8);
-    b[23] = (in6  >> 16);
-    b[24] = (in6  >> 24) + ((in7  >>  0) <<  4);
-    b[25] = (in7  >>  4);
-    b[26] = (in7  >> 12);
-    b[27] = (in7  >> 20);
-    b[28] = (in8  >>  0);
-    b[29] = (in8  >>  8);
-    b[30] = (in8  >> 16);
-    b[31] = (in8  >> 24) + ((in9  >>  0) <<  4);
-    b[32] = (in9  >>  4);
-    b[33] = (in9  >> 12);
-    b[34] = (in9  >> 20);
-    b[35] = (in10 >>  0);
-    b[36] = (in10 >>  8);
-    b[37] = (in10 >> 16);
-    b[38] = (in10 >> 24) + ((in11 >>  0) <<  4);
-    b[39] = (in11 >>  4);
-    b[40] = (in11 >> 12);
-    b[41] = (in11 >> 20);
-    b[42] = (in12 >>  0);
-    b[43] = (in12 >>  8);
-    b[44] = (in12 >> 16);
-    b[45] = (in12 >> 24) + ((in13 >>  0) <<  4);
-    b[46] = (in13 >>  4);
-    b[47] = (in13 >> 12);
-    b[48] = (in13 >> 20);
-    b[49] = (in14 >>  0);
-    b[50] = (in14 >>  8);
-    b[51] = (in14 >> 16);
-    b[52] = (in14 >> 24) + ((in15 >>  0) <<  4);
-    b[53] = (in15 >>  4);
-    b[54] = (in15 >> 12);
-    b[55] = (in15 >> 20);
+    b[ 0] = (byte)(in0  >>  0);
+    b[ 1] = (byte)(in0  >>  8);
+    b[ 2] = (byte)(in0  >> 16);
+    b[ 3] = (byte)(in0  >> 24) + ((in1  >>  0) <<  4);
+    b[ 4] = (byte)(in1  >>  4);
+    b[ 5] = (byte)(in1  >> 12);
+    b[ 6] = (byte)(in1  >> 20);
+    b[ 7] = (byte)(in2  >>  0);
+    b[ 8] = (byte)(in2  >>  8);
+    b[ 9] = (byte)(in2  >> 16);
+    b[10] = (byte)(in2  >> 24) + ((in3  >>  0) <<  4);
+    b[11] = (byte)(in3  >>  4);
+    b[12] = (byte)(in3  >> 12);
+    b[13] = (byte)(in3  >> 20);
+    b[14] = (byte)(in4  >>  0);
+    b[15] = (byte)(in4  >>  8);
+    b[16] = (byte)(in4  >> 16);
+    b[17] = (byte)(in4  >> 24) + ((in5  >>  0) <<  4);
+    b[18] = (byte)(in5  >>  4);
+    b[19] = (byte)(in5  >> 12);
+    b[20] = (byte)(in5  >> 20);
+    b[21] = (byte)(in6  >>  0);
+    b[22] = (byte)(in6  >>  8);
+    b[23] = (byte)(in6  >> 16);
+    b[24] = (byte)(in6  >> 24) + ((in7  >>  0) <<  4);
+    b[25] = (byte)(in7  >>  4);
+    b[26] = (byte)(in7  >> 12);
+    b[27] = (byte)(in7  >> 20);
+    b[28] = (byte)(in8  >>  0);
+    b[29] = (byte)(in8  >>  8);
+    b[30] = (byte)(in8  >> 16);
+    b[31] = (byte)(in8  >> 24) + ((in9  >>  0) <<  4);
+    b[32] = (byte)(in9  >>  4);
+    b[33] = (byte)(in9  >> 12);
+    b[34] = (byte)(in9  >> 20);
+    b[35] = (byte)(in10 >>  0);
+    b[36] = (byte)(in10 >>  8);
+    b[37] = (byte)(in10 >> 16);
+    b[38] = (byte)(in10 >> 24) + ((in11 >>  0) <<  4);
+    b[39] = (byte)(in11 >>  4);
+    b[40] = (byte)(in11 >> 12);
+    b[41] = (byte)(in11 >> 20);
+    b[42] = (byte)(in12 >>  0);
+    b[43] = (byte)(in12 >>  8);
+    b[44] = (byte)(in12 >> 16);
+    b[45] = (byte)(in12 >> 24) + ((in13 >>  0) <<  4);
+    b[46] = (byte)(in13 >>  4);
+    b[47] = (byte)(in13 >> 12);
+    b[48] = (byte)(in13 >> 20);
+    b[49] = (byte)(in14 >>  0);
+    b[50] = (byte)(in14 >>  8);
+    b[51] = (byte)(in14 >> 16);
+    b[52] = (byte)(in14 >> 24) + ((in15 >>  0) <<  4);
+    b[53] = (byte)(in15 >>  4);
+    b[54] = (byte)(in15 >> 12);
+    b[55] = (byte)(in15 >> 20);
 }
 
 /* Set the field element to 0.
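
In this 32-bit build the field element is sixteen 28-bit limbs, so limbs do not align to byte boundaries: two limbs span exactly 7 bytes, and every fourth output byte stitches two limbs together. For example, with 0 <= in0, in1 < 2^28 after reduction,

    b[3] = (in0 >> 24) + ((in1 & 0xf) << 4)

carries bits 24..27 of in0 in its low nibble and bits 0..3 of in1 in its high nibble; the code can omit the & 0xf mask because the store truncates to 8 bits anyway. The same 7-byte pattern then repeats for each limb pair.
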
@@ -1699,7 +1699,7 @@ void fe448_reduce(sword32* a)
     o = a[15] >> 28; a[0]  += (sword32)o;
                      a[8]  += (sword32)o; a[15] -= (sword32)(o << 28);
 }
-/* Mulitply a field element by 39081. r = (39081 * a) mod (2^448 - 2^224 - 1)
+/* Multiply a field element by 39081. r = (39081 * a) mod (2^448 - 2^224 - 1)
  *
  * r  [in]  Field element to hold result.
  * a  [in]  Field element to multiply.
@@ -1761,7 +1761,7 @@ void fe448_mul39081(sword32* r, const sword32* a)
     r[15] = (sword32)t15;
 }
 
-/* Mulitply two field elements. r = a * b
+/* Multiply two field elements. r = a * b
  *
  * r  [in]  Field element to hold result.
  * a  [in]  Field element to multiply.
@@ -1891,7 +1891,7 @@ static WC_INLINE void fe448_mul_8(sword32* r, const sword32* a, const sword32* b
     r[15] = (sword32)t15;
 }
 
-/* Mulitply two field elements. r = (a * b) mod (2^448 - 2^224 - 1)
+/* Multiply two field elements. r = (a * b) mod (2^448 - 2^224 - 1)
  *
  * r  [in]  Field element to hold result.
  * a  [in]  Field element to multiply.
@@ -2178,8 +2178,8 @@ int curve448(byte* r, const byte* n, const byte* a)
     for (i = 447; i >= 0; --i) {
         unsigned int b = (n[i >> 3] >> (i & 7)) & 1;
         swap ^= b;
-        fe448_cswap(x2, x3, swap);
-        fe448_cswap(z2, z3, swap);
+        fe448_cswap(x2, x3, (int)swap);
+        fe448_cswap(z2, z3, (int)swap);
         swap = b;
 
         /* Montgomery Ladder - double and add */
@@ -2271,7 +2271,7 @@ void fe448_neg(sword32* r, const sword32* a)
 }
 
 /* Raise field element to (p-3) / 4: 2^446 - 2^222 - 1
- * Used for calcualting y-ordinate from x-ordinate for Ed448.
+ * Used for calculating y-ordinate from x-ordinate for Ed448.
  *
  * r  [in]  Field element to hold result.
  * a  [in]  Field element to exponentiate.

+ 0 - 630
lib/wolfssl/wolfcrypt/src/fe_x25519_128.i

@@ -1,630 +0,0 @@
-/* fe_x25519_128.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-/* Generated using (from wolfssl):
- *   cd ../scripts
- *   ruby ./x25519/fe_x25519_128_gen.rb > ../wolfssl/wolfcrypt/src/fe_x25519_128.i
- */
-
-void fe_init(void)
-{
-}
-
-/* Convert a number represented as an array of bytes to an array of words with
- * 51-bits of data in each word.
- *
- * in   An array of bytes.
- * out  An array of words.
- */
-void fe_frombytes(fe out, const unsigned char *in)
-{
-    out[0] = (((sword64)((in[ 0]      )       ))      )
-           | (((sword64)((in[ 1]      )       )) <<  8)
-           | (((sword64)((in[ 2]      )       )) << 16)
-           | (((sword64)((in[ 3]      )       )) << 24)
-           | (((sword64)((in[ 4]      )       )) << 32)
-           | (((sword64)((in[ 5]      )       )) << 40)
-           | (((sword64)((in[ 6]      ) & 0x07)) << 48);
-    out[1] = (((sword64)((in[ 6] >>  3) & 0x1f))      )
-           | (((sword64)((in[ 7]      )       )) <<  5)
-           | (((sword64)((in[ 8]      )       )) << 13)
-           | (((sword64)((in[ 9]      )       )) << 21)
-           | (((sword64)((in[10]      )       )) << 29)
-           | (((sword64)((in[11]      )       )) << 37)
-           | (((sword64)((in[12]      ) & 0x3f)) << 45);
-    out[2] = (((sword64)((in[12] >>  6) & 0x03))      )
-           | (((sword64)((in[13]      )       )) <<  2)
-           | (((sword64)((in[14]      )       )) << 10)
-           | (((sword64)((in[15]      )       )) << 18)
-           | (((sword64)((in[16]      )       )) << 26)
-           | (((sword64)((in[17]      )       )) << 34)
-           | (((sword64)((in[18]      )       )) << 42)
-           | (((sword64)((in[19]      ) & 0x01)) << 50);
-    out[3] = (((sword64)((in[19] >>  1) & 0x7f))      )
-           | (((sword64)((in[20]      )       )) <<  7)
-           | (((sword64)((in[21]      )       )) << 15)
-           | (((sword64)((in[22]      )       )) << 23)
-           | (((sword64)((in[23]      )       )) << 31)
-           | (((sword64)((in[24]      )       )) << 39)
-           | (((sword64)((in[25]      ) & 0x0f)) << 47);
-    out[4] = (((sword64)((in[25] >>  4) & 0x0f))      )
-           | (((sword64)((in[26]      )       )) <<  4)
-           | (((sword64)((in[27]      )       )) << 12)
-           | (((sword64)((in[28]      )       )) << 20)
-           | (((sword64)((in[29]      )       )) << 28)
-           | (((sword64)((in[30]      )       )) << 36)
-           | (((sword64)((in[31]      ) & 0x7f)) << 44);
-}
-
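
The unpacking above follows from the 5x51-bit representation: a field element a satisfies

    a = out[0] + out[1]*2^51 + out[2]*2^102 + out[3]*2^153 + out[4]*2^204,

with 5*51 = 255 bits in total, which is why the top bit of in[31] is masked off (only 255 of the 256 input bits are used).
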
-/* Convert a number represented as an array of words to an array of bytes.
- * The array of words is normalized to an array of 51-bit data words and, if
- * greater than or equal to the modulus, reduced modulo the prime 2^255 - 19.
- *
- * n    An array of words.
- * out  An array of bytes.
- */
-void fe_tobytes(unsigned char *out, const fe n)
-{
-    fe      in;
-    sword64 c;
-
-    in[0] = n[0];
-    in[1] = n[1];
-    in[2] = n[2];
-    in[3] = n[3];
-    in[4] = n[4];
-
-    /* Normalize to 51-bits of data per word. */
-    in[0] += (in[4] >> 51) * 19; in[4] &= 0x7ffffffffffff;
-
-    in[1] += in[0] >> 51; in[0] &= 0x7ffffffffffff;
-    in[2] += in[1] >> 51; in[1] &= 0x7ffffffffffff;
-    in[3] += in[2] >> 51; in[2] &= 0x7ffffffffffff;
-    in[4] += in[3] >> 51; in[3] &= 0x7ffffffffffff;
-    in[0] += (in[4] >> 51) * 19;
-    in[4] &= 0x7ffffffffffff;
-
-    c = (in[0] + 19) >> 51;
-    c = (in[1] + c) >> 51;
-    c = (in[2] + c) >> 51;
-    c = (in[3] + c) >> 51;
-    c = (in[4] + c) >> 51;
-    in[0] += c * 19;
-    in[1] += in[0] >> 51; in[0] &= 0x7ffffffffffff;
-    in[2] += in[1] >> 51; in[1] &= 0x7ffffffffffff;
-    in[3] += in[2] >> 51; in[2] &= 0x7ffffffffffff;
-    in[4] += in[3] >> 51; in[3] &= 0x7ffffffffffff;
-    in[4] &= 0x7ffffffffffff;
-
-    out[ 0] = (((byte)((in[0]      )       ))      );
-    out[ 1] = (((byte)((in[0] >>  8)       ))      );
-    out[ 2] = (((byte)((in[0] >> 16)       ))      );
-    out[ 3] = (((byte)((in[0] >> 24)       ))      );
-    out[ 4] = (((byte)((in[0] >> 32)       ))      );
-    out[ 5] = (((byte)((in[0] >> 40)       ))      );
-    out[ 6] = (((byte)((in[0] >> 48) & 0x07))      )
-            | (((byte)((in[1]      ) & 0x1f)) <<  3);
-    out[ 7] = (((byte)((in[1] >>  5)       ))      );
-    out[ 8] = (((byte)((in[1] >> 13)       ))      );
-    out[ 9] = (((byte)((in[1] >> 21)       ))      );
-    out[10] = (((byte)((in[1] >> 29)       ))      );
-    out[11] = (((byte)((in[1] >> 37)       ))      );
-    out[12] = (((byte)((in[1] >> 45) & 0x3f))      )
-            | (((byte)((in[2]      ) & 0x03)) <<  6);
-    out[13] = (((byte)((in[2] >>  2)       ))      );
-    out[14] = (((byte)((in[2] >> 10)       ))      );
-    out[15] = (((byte)((in[2] >> 18)       ))      );
-    out[16] = (((byte)((in[2] >> 26)       ))      );
-    out[17] = (((byte)((in[2] >> 34)       ))      );
-    out[18] = (((byte)((in[2] >> 42)       ))      );
-    out[19] = (((byte)((in[2] >> 50) & 0x01))      )
-            | (((byte)((in[3]      ) & 0x7f)) <<  1);
-    out[20] = (((byte)((in[3] >>  7)       ))      );
-    out[21] = (((byte)((in[3] >> 15)       ))      );
-    out[22] = (((byte)((in[3] >> 23)       ))      );
-    out[23] = (((byte)((in[3] >> 31)       ))      );
-    out[24] = (((byte)((in[3] >> 39)       ))      );
-    out[25] = (((byte)((in[3] >> 47) & 0x0f))      )
-            | (((byte)((in[4]      ) & 0x0f)) <<  4);
-    out[26] = (((byte)((in[4] >>  4)       ))      );
-    out[27] = (((byte)((in[4] >> 12)       ))      );
-    out[28] = (((byte)((in[4] >> 20)       ))      );
-    out[29] = (((byte)((in[4] >> 28)       ))      );
-    out[30] = (((byte)((in[4] >> 36)       ))      );
-    out[31] = (((byte)((in[4] >> 44) & 0x7f))      );
-}
-
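
The second carry pass above is the standard trick for a full reduction modulo p = 2^255 - 19. The chained carries `c = (in[i] + c) >> 51` compute the carry out of bit 255 of in + 19, and c = 1 exactly when in >= p, since

    in + 19 >= 2^255  <=>  in >= 2^255 - 19 = p.

Adding 19*c and masking the top limb back to 51 bits then computes in + 19 - 2^255 = in - p, so the output bytes always encode the fully reduced representative in [0, p).
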
-/* Set the field element to 1.
- *
- * n  The field element number.
- */
-void fe_1(fe n)
-{
-    n[0] = 0x0000000000001;
-    n[1] = 0x0000000000000;
-    n[2] = 0x0000000000000;
-    n[3] = 0x0000000000000;
-    n[4] = 0x0000000000000;
-}
-
-/* Set the field element to 0.
- *
- * n  The field element number.
- */
-void fe_0(fe n)
-{
-    n[0] = 0x0000000000000;
-    n[1] = 0x0000000000000;
-    n[2] = 0x0000000000000;
-    n[3] = 0x0000000000000;
-    n[4] = 0x0000000000000;
-}
-
-/* Copy field element a into field element r.
- *
- * r  Field element to copy into.
- * a  Field element to copy.
- */
-void fe_copy(fe r, const fe a)
-{
-    r[0] = a[0];
-    r[1] = a[1];
-    r[2] = a[2];
-    r[3] = a[3];
-    r[4] = a[4];
-}
-
-/* Constant time, conditional swap of field elements f and g.
- *
- * f  A field element.
- * g  A field element.
- * b  If 1 then swap and if 0 then don't swap.
- */
-void fe_cswap(fe f, fe g, int b)
-{
-    sword64 m = b;
-    sword64 t0, t1, t2, t3, t4;
-
-    /* Convert conditional into mask. */
-    m = -m;
-    t0 = m & (f[0] ^ g[0]);
-    t1 = m & (f[1] ^ g[1]);
-    t2 = m & (f[2] ^ g[2]);
-    t3 = m & (f[3] ^ g[3]);
-    t4 = m & (f[4] ^ g[4]);
-
-    f[0] ^= t0;
-    f[1] ^= t1;
-    f[2] ^= t2;
-    f[3] ^= t3;
-    f[4] ^= t4;
-
-    g[0] ^= t0;
-    g[1] ^= t1;
-    g[2] ^= t2;
-    g[3] ^= t3;
-    g[4] ^= t4;
-}
-
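
The mask construction above is the usual branch-free conditional: for b in {0, 1}, m = -b is either all-zero or all-one bits, so t = m & (f[i] ^ g[i]) is either 0 or f[i] ^ g[i]. With b = 1 the two XOR assignments give

    f[i]' = f[i] ^ t = g[i],    g[i]' = g[i] ^ t = f[i],

i.e. a swap, while b = 0 leaves both unchanged; no secret-dependent branch or load is executed either way.
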
-/* Subtract b from a into r. (r = a - b)
- *
- * r  A field element.
- * a  A field element.
- * b  A field element.
- */
-void fe_sub(fe r, const fe a, const fe b)
-{
-    r[0] = a[0] - b[0];
-    r[1] = a[1] - b[1];
-    r[2] = a[2] - b[2];
-    r[3] = a[3] - b[3];
-    r[4] = a[4] - b[4];
-}
-
-/* Add b to a into r. (r = a + b)
- *
- * r  A field element.
- * a  A field element.
- * b  A field element.
- */
-void fe_add(fe r, const fe a, const fe b)
-{
-    r[0] = a[0] + b[0];
-    r[1] = a[1] + b[1];
-    r[2] = a[2] + b[2];
-    r[3] = a[3] + b[3];
-    r[4] = a[4] + b[4];
-}
-
-/* Multiply a and b into r. (r = a * b)
- *
- * r  A field element.
- * a  A field element.
- * b  A field element.
- */
-void fe_mul(fe r, const fe a, const fe b)
-{
-    const __int128_t k19 = 19;
-    __int128_t t0 = ((__int128_t)a[0]) * b[0];
-    __int128_t t1 = ((__int128_t)a[0]) * b[1]
-                  + ((__int128_t)a[1]) * b[0];
-    __int128_t t2 = ((__int128_t)a[0]) * b[2]
-                  + ((__int128_t)a[1]) * b[1]
-                  + ((__int128_t)a[2]) * b[0];
-    __int128_t t3 = ((__int128_t)a[0]) * b[3]
-                  + ((__int128_t)a[1]) * b[2]
-                  + ((__int128_t)a[2]) * b[1]
-                  + ((__int128_t)a[3]) * b[0];
-    __int128_t t4 = ((__int128_t)a[0]) * b[4]
-                  + ((__int128_t)a[1]) * b[3]
-                  + ((__int128_t)a[2]) * b[2]
-                  + ((__int128_t)a[3]) * b[1]
-                  + ((__int128_t)a[4]) * b[0];
-    __int128_t t5 = ((__int128_t)a[1]) * b[4]
-                  + ((__int128_t)a[2]) * b[3]
-                  + ((__int128_t)a[3]) * b[2]
-                  + ((__int128_t)a[4]) * b[1];
-    __int128_t t6 = ((__int128_t)a[2]) * b[4]
-                  + ((__int128_t)a[3]) * b[3]
-                  + ((__int128_t)a[4]) * b[2];
-    __int128_t t7 = ((__int128_t)a[3]) * b[4]
-                  + ((__int128_t)a[4]) * b[3];
-    __int128_t t8 = ((__int128_t)a[4]) * b[4];
-
-    /* Modulo reduce double long word. */
-    t0 += t5 * k19;
-    t1 += t6 * k19;
-    t2 += t7 * k19;
-    t3 += t8 * k19;
-
-    /* Normalize to 51-bits of data per word. */
-    t0 += (t4 >> 51) * k19; t4 &= 0x7ffffffffffff;
-
-    t1 += t0 >> 51; r[0] = t0 & 0x7ffffffffffff;
-    t2 += t1 >> 51; r[1] = t1 & 0x7ffffffffffff;
-    t3 += t2 >> 51; r[2] = t2 & 0x7ffffffffffff;
-    t4 += t3 >> 51; r[3] = t3 & 0x7ffffffffffff;
-    r[0] += (sword64)((t4 >> 51) * k19);
-    r[4] = t4 & 0x7ffffffffffff;
-}
-
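
The `t0 += t5 * k19` block implements the reduction identity for p = 2^255 - 19: since 2^255 is congruent to 19 (mod p), any partial product at limb position i >= 5 satisfies

    t_i * 2^(51*i) = t_i * 2^255 * 2^(51*(i-5)) == 19 * t_i * 2^(51*(i-5)) (mod p),

so the top four double-word sums fold back onto t0..t3 with a single multiply by 19, and only a carry chain is needed afterwards.
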
-/* Square a and put result in r. (r = a * a)
- *
- * r  A field element.
- * a  A field element.
- */
-void fe_sq(fe r, const fe a)
-{
-    const __int128_t k19 = 19;
-    const __int128_t k2 = 2;
-    __int128_t t0 = ((__int128_t)a[0]) * a[0];
-    __int128_t t1 = ((__int128_t)a[0]) * a[1] * k2;
-    __int128_t t2 = ((__int128_t)a[0]) * a[2] * k2
-                  + ((__int128_t)a[1]) * a[1];
-    __int128_t t3 = ((__int128_t)a[0]) * a[3] * k2
-                  + ((__int128_t)a[1]) * a[2] * k2;
-    __int128_t t4 = ((__int128_t)a[0]) * a[4] * k2
-                  + ((__int128_t)a[1]) * a[3] * k2
-                  + ((__int128_t)a[2]) * a[2];
-    __int128_t t5 = ((__int128_t)a[1]) * a[4] * k2
-                  + ((__int128_t)a[2]) * a[3] * k2;
-    __int128_t t6 = ((__int128_t)a[2]) * a[4] * k2
-                  + ((__int128_t)a[3]) * a[3];
-    __int128_t t7 = ((__int128_t)a[3]) * a[4] * k2;
-    __int128_t t8 = ((__int128_t)a[4]) * a[4];
-
-    /* Modulo reduce double long word. */
-    t0 += t5 * k19;
-    t1 += t6 * k19;
-    t2 += t7 * k19;
-    t3 += t8 * k19;
-
-    /* Normalize to 51-bits of data per word. */
-    t0 += (t4 >> 51) * k19; t4 &= 0x7ffffffffffff;
-
-    t1 += t0 >> 51; r[0] = t0 & 0x7ffffffffffff;
-    t2 += t1 >> 51; r[1] = t1 & 0x7ffffffffffff;
-    t3 += t2 >> 51; r[2] = t2 & 0x7ffffffffffff;
-    t4 += t3 >> 51; r[3] = t3 & 0x7ffffffffffff;
-    r[0] += (sword64)((t4 >> 51) * k19);
-    r[4] = t4 & 0x7ffffffffffff;
-}
-
-/* Multiply a by 121666 and put result in r. (r = 121666 * a)
- *
- * r  A field element.
- * a  A field element.
- */
-void fe_mul121666(fe r, fe a)
-{
-    const __int128_t k19 = 19;
-    const __int128_t k121666 = 121666;
-    __int128_t t0 = ((__int128_t)a[0]) * k121666;
-    __int128_t t1 = ((__int128_t)a[1]) * k121666;
-    __int128_t t2 = ((__int128_t)a[2]) * k121666;
-    __int128_t t3 = ((__int128_t)a[3]) * k121666;
-    __int128_t t4 = ((__int128_t)a[4]) * k121666;
-
-    /* Normalize to 51-bits of data per word. */
-    t0 += (t4 >> 51) * k19; t4 &= 0x7ffffffffffff;
-
-    t1 += t0 >> 51; r[0] = t0 & 0x7ffffffffffff;
-    t2 += t1 >> 51; r[1] = t1 & 0x7ffffffffffff;
-    t3 += t2 >> 51; r[2] = t2 & 0x7ffffffffffff;
-    t4 += t3 >> 51; r[3] = t3 & 0x7ffffffffffff;
-    r[0] += (sword64)((t4 >> 51) * k19);
-    r[4] = t4 & 0x7ffffffffffff;
-}
-
-/* Find the inverse of a modulo 2^255 - 19 and put result in r.
- * (r * a) mod (2^255 - 19) = 1
- * Implementation is constant time.
- *
- * r  A field element.
- * a  A field element.
- */
-void fe_invert(fe r, const fe a)
-{
-    fe  t0, t1, t2, t3;
-    int i;
-
-    /* a ^ (2^255 - 21) */
-    fe_sq(t0,  a); for (i = 1; i <   1; ++i) fe_sq(t0, t0);
-    fe_sq(t1, t0); for (i = 1; i <   2; ++i) fe_sq(t1, t1); fe_mul(t1,  a, t1);
-    fe_mul(t0, t0, t1);
-    fe_sq(t2, t0); for (i = 1; i <   1; ++i) fe_sq(t2, t2); fe_mul(t1, t1, t2);
-    fe_sq(t2, t1); for (i = 1; i <   5; ++i) fe_sq(t2, t2); fe_mul(t1, t2, t1);
-    fe_sq(t2, t1); for (i = 1; i <  10; ++i) fe_sq(t2, t2); fe_mul(t2, t2, t1);
-    fe_sq(t3, t2); for (i = 1; i <  20; ++i) fe_sq(t3, t3); fe_mul(t2, t3, t2);
-    fe_sq(t2, t2); for (i = 1; i <  10; ++i) fe_sq(t2, t2); fe_mul(t1, t2, t1);
-    fe_sq(t2, t1); for (i = 1; i <  50; ++i) fe_sq(t2, t2); fe_mul(t2, t2, t1);
-    fe_sq(t3, t2); for (i = 1; i < 100; ++i) fe_sq(t3, t3); fe_mul(t2, t3, t2);
-    fe_sq(t2, t2); for (i = 1; i <  50; ++i) fe_sq(t2, t2); fe_mul(t1, t2, t1);
-    fe_sq(t1, t1); for (i = 1; i <   5; ++i) fe_sq(t1, t1); fe_mul( r, t1, t0);
-}
-
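
Inversion here is Fermat's little theorem, a^(p-2) mod p with p - 2 = 2^255 - 21: the chain first builds a^(2^k - 1) for k = 5, 10, 20, 40, 50, 100, 200, 250 by square-and-multiply on all-ones exponents, then finishes with five squarings (reaching exponent 2^255 - 32) and a multiply by t0 = a^11 to land on 2^255 - 21. The sequence of operations is fixed regardless of a, which keeps the inversion constant time.
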
-#ifndef CURVE25519_SMALL
-/* Scalar multiply the field element a by n using the Montgomery ladder and
- * place the result in r.
- *
- * r  A field element as an array of bytes.
- * n  The scalar as an array of bytes.
- * a  A field element as an array of bytes.
- */
-int curve25519(byte* r, const byte* n, const byte* a)
-{
-    fe           x1, x2, z2, x3, z3;
-    fe           t0, t1;
-    int          pos;
-    unsigned int swap;
-    unsigned int b;
-
-    fe_frombytes(x1, a);
-    fe_1(x2);
-    fe_0(z2);
-    fe_copy(x3, x1);
-    fe_1(z3);
-
-    swap = 0;
-    for (pos = 254; pos >= 0; --pos) {
-        b = n[pos / 8] >> (pos & 7);
-        b &= 1;
-        swap ^= b;
-        fe_cswap(x2, x3, (int)swap);
-        fe_cswap(z2, z3, (int)swap);
-        swap = b;
-
-        fe_sub(t0, x3, z3);
-        fe_sub(t1, x2, z2);
-        fe_add(x2, x2, z2);
-        fe_add(z2, x3, z3);
-        fe_mul(z3, t0, x2);
-        fe_mul(z2, z2, t1);
-        fe_sq(t0, t1);
-        fe_sq(t1, x2);
-        fe_add(x3, z3, z2);
-        fe_sub(z2, z3, z2);
-        fe_mul(x2, t1, t0);
-        fe_sub(t1, t1, t0);
-        fe_sq(z2, z2);
-        fe_mul121666(z3, t1);
-        fe_sq(x3, x3);
-        fe_add(t0, t0, z3);
-        fe_mul(z3, x1, z2);
-        fe_mul(z2, t1, t0);
-    }
-    fe_cswap(x2, x3, (int)swap);
-    fe_cswap(z2, z3, (int)swap);
-
-    fe_invert(z2, z2);
-    fe_mul(x2, x2, z2);
-    fe_tobytes(r, x2);
-
-    return 0;
-}
-#endif /* !CURVE25519_SMALL */
-
-/* The field element value 0 as an array of bytes. */
-static const unsigned char zero[32] = {0};
-
-/* Constant-time check of whether a is non-zero.
- *
- * a  A field element.
- */
-int fe_isnonzero(const fe a)
-{
-    unsigned char s[32];
-    fe_tobytes(s, a);
-    return ConstantCompare(s, zero, 32);
-}
-
-/* Checks whether a is negative.
- *
- * a  A field element.
- */
-int fe_isnegative(const fe a)
-{
-    unsigned char s[32];
-    fe_tobytes(s, a);
-    return s[0] & 1;
-}
-
-/* Negates field element a and stores the result in r.
- *
- * r  A field element.
- * a  A field element.
- */
-void fe_neg(fe r, const fe a)
-{
-    r[0] = -a[0];
-    r[1] = -a[1];
-    r[2] = -a[2];
-    r[3] = -a[3];
-    r[4] = -a[4];
-}
-
-/* Constant time, conditional move of g into f.
- * f is not changed if the condition is 0.
- *
- * f  A field element.
- * g  A field element.
- * b  If 1 then copy and if 0 then don't copy.
- */
-void fe_cmov(fe f, const fe g, int b)
-{
-    sword64 m = b;
-    sword64 t0, t1, t2, t3, t4;
-
-    /* Convert conditional into mask. */
-    m = -m;
-    t0 = m & (f[0] ^ g[0]);
-    t1 = m & (f[1] ^ g[1]);
-    t2 = m & (f[2] ^ g[2]);
-    t3 = m & (f[3] ^ g[3]);
-    t4 = m & (f[4] ^ g[4]);
-
-    f[0] ^= t0;
-    f[1] ^= t1;
-    f[2] ^= t2;
-    f[3] ^= t3;
-    f[4] ^= t4;
-}
-
-void fe_pow22523(fe r, const fe a)
-{
-    fe t0, t1, t2;
-    int i;
-
-    /* a ^ (2^252 - 3) */
-    fe_sq(t0,  a); for (i = 1; i <   1; ++i) fe_sq(t0, t0);
-    fe_sq(t1, t0); for (i = 1; i <   2; ++i) fe_sq(t1, t1); fe_mul(t1,  a, t1);
-    fe_mul(t0, t0, t1);
-    fe_sq(t0, t0); for (i = 1; i <   1; ++i) fe_sq(t0, t0); fe_mul(t0, t1, t0);
-    fe_sq(t1, t0); for (i = 1; i <   5; ++i) fe_sq(t1, t1); fe_mul(t0, t1, t0);
-    fe_sq(t1, t0); for (i = 1; i <  10; ++i) fe_sq(t1, t1); fe_mul(t1, t1, t0);
-    fe_sq(t2, t1); for (i = 1; i <  20; ++i) fe_sq(t2, t2); fe_mul(t1, t2, t1);
-    fe_sq(t1, t1); for (i = 1; i <  10; ++i) fe_sq(t1, t1); fe_mul(t0, t1, t0);
-    fe_sq(t1, t0); for (i = 1; i <  50; ++i) fe_sq(t1, t1); fe_mul(t1, t1, t0);
-    fe_sq(t2, t1); for (i = 1; i < 100; ++i) fe_sq(t2, t2); fe_mul(t1, t2, t1);
-    fe_sq(t1, t1); for (i = 1; i <  50; ++i) fe_sq(t1, t1); fe_mul(t0, t1, t0);
-    fe_sq(t0, t0); for (i = 1; i <   2; ++i) fe_sq(t0, t0); fe_mul( r, t0, a);
-
-    return;
-}
-
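
The exponent computed here is (p-5)/8 = 2^252 - 3: the chain reaches t0 = a^(2^250 - 1), the two final squarings give a^(2^252 - 4), and the closing multiply by a lands on a^(2^252 - 3). This power is the candidate square-root / inverse-square-root step used when decompressing Ed25519 points.
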
-/* Double the square of a and put result in r. (r = 2 * a * a)
- *
- * r  A field element.
- * a  A field element.
- */
-void fe_sq2(fe r, const fe a)
-{
-    const __int128_t k2 = 2;
-    const __int128_t k19 = 19;
-    __int128_t t0 = k2 * (((__int128_t)a[0]) * a[0]);
-    __int128_t t1 = k2 * (((__int128_t)a[0]) * a[1] * k2);
-    __int128_t t2 = k2 * (((__int128_t)a[0]) * a[2] * k2
-                  + ((__int128_t)a[1]) * a[1]);
-    __int128_t t3 = k2 * (((__int128_t)a[0]) * a[3] * k2
-                  + ((__int128_t)a[1]) * a[2] * k2);
-    __int128_t t4 = k2 * (((__int128_t)a[0]) * a[4] * k2
-                  + ((__int128_t)a[1]) * a[3] * k2
-                  + ((__int128_t)a[2]) * a[2]);
-    __int128_t t5 = k2 * (((__int128_t)a[1]) * a[4] * k2
-                  + ((__int128_t)a[2]) * a[3] * k2);
-    __int128_t t6 = k2 * (((__int128_t)a[2]) * a[4] * k2
-                  + ((__int128_t)a[3]) * a[3]);
-    __int128_t t7 = k2 * (((__int128_t)a[3]) * a[4] * k2);
-    __int128_t t8 = k2 * (((__int128_t)a[4]) * a[4]);
-
-    /* Modulo reduce double long word. */
-    t0 += t5 * k19;
-    t1 += t6 * k19;
-    t2 += t7 * k19;
-    t3 += t8 * k19;
-
-    /* Normalize to 51-bits of data per word. */
-    t0 += (t4 >> 51) * k19; t4 &= 0x7ffffffffffff;
-
-    t1 += t0 >> 51; r[0] = t0 & 0x7ffffffffffff;
-    t2 += t1 >> 51; r[1] = t1 & 0x7ffffffffffff;
-    t3 += t2 >> 51; r[2] = t2 & 0x7ffffffffffff;
-    t4 += t3 >> 51; r[3] = t3 & 0x7ffffffffffff;
-    r[0] += (sword64)((t4 >> 51) * k19);
-    r[4] = t4 & 0x7ffffffffffff;
-}
-
-/* Load 3 little endian bytes into a 64-bit word.
- *
- * in  An array of bytes.
- * returns a 64-bit word.
- */
-word64 load_3(const unsigned char *in)
-{
-    word64 result;
-
-    result = ((((word64)in[0])      ) |
-              (((word64)in[1]) <<  8) |
-              (((word64)in[2]) << 16));
-
-    return result;
-}
-
-/* Load 4 little endian bytes into a 64-bit word.
- *
- * in  An array of bytes.
- * returns a 64-bit word.
- */
-word64 load_4(const unsigned char *in)
-{
-    word64 result;
-
-    result = ((((word64)in[0])      ) |
-              (((word64)in[1]) <<  8) |
-              (((word64)in[2]) << 16) |
-              (((word64)in[3]) << 24));
-
-    return result;
-}
-

+ 0 - 16596
lib/wolfssl/wolfcrypt/src/fe_x25519_asm.S

@@ -1,16596 +0,0 @@
-/* fe_x25519_asm
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-#ifdef WOLFSSL_USER_SETTINGS
-#ifdef WOLFSSL_USER_SETTINGS_ASM
-/*
- * user_settings_asm.h is a file generated by the script user_settings_asm.sh.
- * The script takes in a user_settings.h and produces user_settings_asm.h, which
- * is a stripped down version of user_settings.h containing only preprocessor
- * directives. This makes the header safe to include in assembly (.S) files.
- */
-#include "user_settings_asm.h"
-#else
-/*
- * Note: if user_settings.h contains any C code (e.g. a typedef or function
- * prototype), including it here in an assembly (.S) file will cause an
- * assembler failure. See user_settings_asm.h above.
- */
-#include "user_settings.h"
-#endif /* WOLFSSL_USER_SETTINGS_ASM */
-#endif /* WOLFSSL_USER_SETTINGS */
-
-#ifndef HAVE_INTEL_AVX1
-#define HAVE_INTEL_AVX1
-#endif /* HAVE_INTEL_AVX1 */
-#ifndef NO_AVX2_SUPPORT
-#define HAVE_INTEL_AVX2
-#endif /* NO_AVX2_SUPPORT */
-
-#ifndef __APPLE__
-.text
-.globl	fe_init
-.type	fe_init,@function
-.align	16
-fe_init:
-#else
-.section	__TEXT,__text
-.globl	_fe_init
-.p2align	4
-_fe_init:
-#endif /* __APPLE__ */
-#ifdef HAVE_INTEL_AVX2
-#ifndef __APPLE__
-        movq	cpuFlagsSet@GOTPCREL(%rip), %rax
-        movl	(%rax), %eax
-#else
-        movl	_cpuFlagsSet(%rip), %eax
-#endif /* __APPLE__ */
-        testl	%eax, %eax
-        je	L_fe_init_get_flags
-        repz retq
-L_fe_init_get_flags:
-#ifndef __APPLE__
-        callq	cpuid_get_flags@plt
-#else
-        callq	_cpuid_get_flags
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        movq	intelFlags@GOTPCREL(%rip), %rdx
-        movl	%eax, (%rdx)
-#else
-        movl	%eax, _intelFlags(%rip)
-#endif /* __APPLE__ */
-        andl	$0x50, %eax
-        cmpl	$0x50, %eax
-        jne	L_fe_init_flags_done
-#ifndef __APPLE__
-        movq	fe_mul_avx2@GOTPCREL(%rip), %rax
-#else
-        leaq	_fe_mul_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        movq	fe_mul_p@GOTPCREL(%rip), %rdx
-        movq	%rax, (%rdx)
-#else
-        movq	%rax, _fe_mul_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        movq	fe_sq_avx2@GOTPCREL(%rip), %rax
-#else
-        leaq	_fe_sq_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        movq	fe_sq_p@GOTPCREL(%rip), %rdx
-        movq	%rax, (%rdx)
-#else
-        movq	%rax, _fe_sq_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        movq	fe_mul121666_avx2@GOTPCREL(%rip), %rax
-#else
-        leaq	_fe_mul121666_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        movq	fe_mul121666_p@GOTPCREL(%rip), %rdx
-        movq	%rax, (%rdx)
-#else
-        movq	%rax, _fe_mul121666_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        movq	fe_sq2_avx2@GOTPCREL(%rip), %rax
-#else
-        leaq	_fe_sq2_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        movq	fe_sq2_p@GOTPCREL(%rip), %rdx
-        movq	%rax, (%rdx)
-#else
-        movq	%rax, _fe_sq2_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        movq	fe_invert_avx2@GOTPCREL(%rip), %rax
-#else
-        leaq	_fe_invert_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        movq	fe_invert_p@GOTPCREL(%rip), %rdx
-        movq	%rax, (%rdx)
-#else
-        movq	%rax, _fe_invert_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        movq	curve25519_avx2@GOTPCREL(%rip), %rax
-#else
-        leaq	_curve25519_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        movq	curve25519_p@GOTPCREL(%rip), %rdx
-        movq	%rax, (%rdx)
-#else
-        movq	%rax, _curve25519_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        movq	fe_pow22523_avx2@GOTPCREL(%rip), %rax
-#else
-        leaq	_fe_pow22523_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        movq	fe_pow22523_p@GOTPCREL(%rip), %rdx
-        movq	%rax, (%rdx)
-#else
-        movq	%rax, _fe_pow22523_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        movq	fe_ge_to_p2_avx2@GOTPCREL(%rip), %rax
-#else
-        leaq	_fe_ge_to_p2_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        movq	fe_ge_to_p2_p@GOTPCREL(%rip), %rdx
-        movq	%rax, (%rdx)
-#else
-        movq	%rax, _fe_ge_to_p2_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        movq	fe_ge_to_p3_avx2@GOTPCREL(%rip), %rax
-#else
-        leaq	_fe_ge_to_p3_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        movq	fe_ge_to_p3_p@GOTPCREL(%rip), %rdx
-        movq	%rax, (%rdx)
-#else
-        movq	%rax, _fe_ge_to_p3_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        movq	fe_ge_dbl_avx2@GOTPCREL(%rip), %rax
-#else
-        leaq	_fe_ge_dbl_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        movq	fe_ge_dbl_p@GOTPCREL(%rip), %rdx
-        movq	%rax, (%rdx)
-#else
-        movq	%rax, _fe_ge_dbl_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        movq	fe_ge_madd_avx2@GOTPCREL(%rip), %rax
-#else
-        leaq	_fe_ge_madd_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        movq	fe_ge_madd_p@GOTPCREL(%rip), %rdx
-        movq	%rax, (%rdx)
-#else
-        movq	%rax, _fe_ge_madd_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        movq	fe_ge_msub_avx2@GOTPCREL(%rip), %rax
-#else
-        leaq	_fe_ge_msub_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        movq	fe_ge_msub_p@GOTPCREL(%rip), %rdx
-        movq	%rax, (%rdx)
-#else
-        movq	%rax, _fe_ge_msub_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        movq	fe_ge_add_avx2@GOTPCREL(%rip), %rax
-#else
-        leaq	_fe_ge_add_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        movq	fe_ge_add_p@GOTPCREL(%rip), %rdx
-        movq	%rax, (%rdx)
-#else
-        movq	%rax, _fe_ge_add_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        movq	fe_ge_sub_avx2@GOTPCREL(%rip), %rax
-#else
-        leaq	_fe_ge_sub_avx2(%rip), %rax
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        movq	fe_ge_sub_p@GOTPCREL(%rip), %rdx
-        movq	%rax, (%rdx)
-#else
-        movq	%rax, _fe_ge_sub_p(%rip)
-#endif /* __APPLE__ */
-L_fe_init_flags_done:
-#ifndef __APPLE__
-        movq	cpuFlagsSet@GOTPCREL(%rip), %rdx
-        movl	$0x1, (%rdx)
-#else
-        movl	$0x1, _cpuFlagsSet(%rip)
-#endif /* __APPLE__ */
-#endif /* HAVE_INTEL_AVX2 */
-        repz retq
-#ifndef __APPLE__
-.size	fe_init,.-fe_init
-#endif /* __APPLE__ */
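
fe_init probes CPUID once and, when the AVX1+AVX2 flag bits (0x50) are set, rewrites the fe_*_p function pointers, which the .data section further down initializes to the plain x64 variants, so that the public entry points dispatch to the vectorized code. A C sketch of the same pattern, with hypothetical stub names where the asm details are elided:

    /* Sketch only: names mirror the asm above, bodies are stand-ins. */
    typedef void (*fe_mul_fn)(void* r, const void* a, const void* b);

    static void fe_mul_x64(void* r, const void* a, const void* b)  { (void)r; (void)a; (void)b; }
    static void fe_mul_avx2(void* r, const void* a, const void* b) { (void)r; (void)a; (void)b; }

    /* Hypothetical probe; the asm calls cpuid_get_flags and tests 0x50. */
    static int cpu_has_avx1_avx2(void) { return 0; }

    static fe_mul_fn fe_mul_p = fe_mul_x64;  /* mirrors ".quad fe_mul_x64" */
    static int cpuFlagsSet = 0;

    void fe_init(void)
    {
        if (cpuFlagsSet)                     /* probe only once */
            return;
        if (cpu_has_avx1_avx2())
            fe_mul_p = fe_mul_avx2;          /* patch the dispatch pointer */
        cpuFlagsSet = 1;
    }

    void fe_mul(void* r, const void* a, const void* b)
    {
        fe_mul_p(r, a, b);   /* the asm entry point is a tail jump through fe_mul_p */
    }
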
-#ifndef __APPLE__
-.text
-.globl	fe_frombytes
-.type	fe_frombytes,@function
-.align	16
-fe_frombytes:
-#else
-.section	__TEXT,__text
-.globl	_fe_frombytes
-.p2align	4
-_fe_frombytes:
-#endif /* __APPLE__ */
-        movq	$0x7fffffffffffffff, %r9
-        movq	(%rsi), %rdx
-        movq	8(%rsi), %rax
-        movq	16(%rsi), %rcx
-        movq	24(%rsi), %r8
-        andq	%r9, %r8
-        movq	%rdx, (%rdi)
-        movq	%rax, 8(%rdi)
-        movq	%rcx, 16(%rdi)
-        movq	%r8, 24(%rdi)
-        repz retq
-#ifndef __APPLE__
-.size	fe_frombytes,.-fe_frombytes
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_tobytes
-.type	fe_tobytes,@function
-.align	16
-fe_tobytes:
-#else
-.section	__TEXT,__text
-.globl	_fe_tobytes
-.p2align	4
-_fe_tobytes:
-#endif /* __APPLE__ */
-        movq	$0x7fffffffffffffff, %r10
-        movq	(%rsi), %rdx
-        movq	8(%rsi), %rax
-        movq	16(%rsi), %rcx
-        movq	24(%rsi), %r8
-        addq	$19, %rdx
-        adcq	$0x00, %rax
-        adcq	$0x00, %rcx
-        adcq	$0x00, %r8
-        shrq	$63, %r8
-        imulq	$19, %r8, %r9
-        movq	(%rsi), %rdx
-        movq	8(%rsi), %rax
-        movq	16(%rsi), %rcx
-        movq	24(%rsi), %r8
-        addq	%r9, %rdx
-        adcq	$0x00, %rax
-        adcq	$0x00, %rcx
-        adcq	$0x00, %r8
-        andq	%r10, %r8
-        movq	%rdx, (%rdi)
-        movq	%rax, 8(%rdi)
-        movq	%rcx, 16(%rdi)
-        movq	%r8, 24(%rdi)
-        repz retq
-#ifndef __APPLE__
-.size	fe_tobytes,.-fe_tobytes
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_1
-.type	fe_1,@function
-.align	16
-fe_1:
-#else
-.section	__TEXT,__text
-.globl	_fe_1
-.p2align	4
-_fe_1:
-#endif /* __APPLE__ */
-        # Set one
-        movq	$0x01, (%rdi)
-        movq	$0x00, 8(%rdi)
-        movq	$0x00, 16(%rdi)
-        movq	$0x00, 24(%rdi)
-        repz retq
-#ifndef __APPLE__
-.size	fe_1,.-fe_1
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_0
-.type	fe_0,@function
-.align	16
-fe_0:
-#else
-.section	__TEXT,__text
-.globl	_fe_0
-.p2align	4
-_fe_0:
-#endif /* __APPLE__ */
-        # Set zero
-        movq	$0x00, (%rdi)
-        movq	$0x00, 8(%rdi)
-        movq	$0x00, 16(%rdi)
-        movq	$0x00, 24(%rdi)
-        repz retq
-#ifndef __APPLE__
-.size	fe_0,.-fe_0
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_copy
-.type	fe_copy,@function
-.align	16
-fe_copy:
-#else
-.section	__TEXT,__text
-.globl	_fe_copy
-.p2align	4
-_fe_copy:
-#endif /* __APPLE__ */
-        # Copy
-        movq	(%rsi), %rdx
-        movq	8(%rsi), %rax
-        movq	16(%rsi), %rcx
-        movq	24(%rsi), %r8
-        movq	%rdx, (%rdi)
-        movq	%rax, 8(%rdi)
-        movq	%rcx, 16(%rdi)
-        movq	%r8, 24(%rdi)
-        repz retq
-#ifndef __APPLE__
-.size	fe_copy,.-fe_copy
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_sub
-.type	fe_sub,@function
-.align	16
-fe_sub:
-#else
-.section	__TEXT,__text
-.globl	_fe_sub
-.p2align	4
-_fe_sub:
-#endif /* __APPLE__ */
-        pushq	%r12
-        # Sub
-        movq	(%rsi), %rax
-        movq	8(%rsi), %rcx
-        movq	16(%rsi), %r8
-        movq	24(%rsi), %r9
-        subq	(%rdx), %rax
-        movq	$0x00, %r10
-        sbbq	8(%rdx), %rcx
-        movq	$-19, %r11
-        sbbq	16(%rdx), %r8
-        movq	$0x7fffffffffffffff, %r12
-        sbbq	24(%rdx), %r9
-        sbbq	$0x00, %r10
-        #   Mask the modulus
-        andq	%r10, %r11
-        andq	%r10, %r12
-        #   Add modulus (if underflow)
-        addq	%r11, %rax
-        adcq	%r10, %rcx
-        adcq	%r10, %r8
-        adcq	%r12, %r9
-        movq	%rax, (%rdi)
-        movq	%rcx, 8(%rdi)
-        movq	%r8, 16(%rdi)
-        movq	%r9, 24(%rdi)
-        popq	%r12
-        repz retq
-#ifndef __APPLE__
-.size	fe_sub,.-fe_sub
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_add
-.type	fe_add,@function
-.align	16
-fe_add:
-#else
-.section	__TEXT,__text
-.globl	_fe_add
-.p2align	4
-_fe_add:
-#endif /* __APPLE__ */
-        pushq	%r12
-        # Add
-        movq	(%rsi), %rax
-        movq	8(%rsi), %rcx
-        addq	(%rdx), %rax
-        movq	16(%rsi), %r8
-        adcq	8(%rdx), %rcx
-        movq	24(%rsi), %r10
-        adcq	16(%rdx), %r8
-        movq	$-19, %r11
-        adcq	24(%rdx), %r10
-        movq	$0x7fffffffffffffff, %r12
-        movq	%r10, %r9
-        sarq	$63, %r10
-        #   Mask the modulus
-        andq	%r10, %r11
-        andq	%r10, %r12
-        #   Sub modulus (if overflow)
-        subq	%r11, %rax
-        sbbq	%r10, %rcx
-        sbbq	%r10, %r8
-        sbbq	%r12, %r9
-        movq	%rax, (%rdi)
-        movq	%rcx, 8(%rdi)
-        movq	%r8, 16(%rdi)
-        movq	%r9, 24(%rdi)
-        popq	%r12
-        repz retq
-#ifndef __APPLE__
-.size	fe_add,.-fe_add
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_neg
-.type	fe_neg,@function
-.align	16
-fe_neg:
-#else
-.section	__TEXT,__text
-.globl	_fe_neg
-.p2align	4
-_fe_neg:
-#endif /* __APPLE__ */
-        movq	$-19, %rdx
-        movq	$-1, %rax
-        movq	$-1, %rcx
-        movq	$0x7fffffffffffffff, %r8
-        subq	(%rsi), %rdx
-        sbbq	8(%rsi), %rax
-        sbbq	16(%rsi), %rcx
-        sbbq	24(%rsi), %r8
-        movq	%rdx, (%rdi)
-        movq	%rax, 8(%rdi)
-        movq	%rcx, 16(%rdi)
-        movq	%r8, 24(%rdi)
-        repz retq
-#ifndef __APPLE__
-.size	fe_neg,.-fe_neg
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_cmov
-.type	fe_cmov,@function
-.align	16
-fe_cmov:
-#else
-.section	__TEXT,__text
-.globl	_fe_cmov
-.p2align	4
-_fe_cmov:
-#endif /* __APPLE__ */
-        cmpl	$0x01, %edx
-        movq	(%rdi), %rcx
-        movq	8(%rdi), %r8
-        movq	16(%rdi), %r9
-        movq	24(%rdi), %r10
-        cmoveq	(%rsi), %rcx
-        cmoveq	8(%rsi), %r8
-        cmoveq	16(%rsi), %r9
-        cmoveq	24(%rsi), %r10
-        movq	%rcx, (%rdi)
-        movq	%r8, 8(%rdi)
-        movq	%r9, 16(%rdi)
-        movq	%r10, 24(%rdi)
-        repz retq
-#ifndef __APPLE__
-.size	fe_cmov,.-fe_cmov
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_isnonzero
-.type	fe_isnonzero,@function
-.align	16
-fe_isnonzero:
-#else
-.section	__TEXT,__text
-.globl	_fe_isnonzero
-.p2align	4
-_fe_isnonzero:
-#endif /* __APPLE__ */
-        movq	$0x7fffffffffffffff, %r10
-        movq	(%rdi), %rax
-        movq	8(%rdi), %rdx
-        movq	16(%rdi), %rcx
-        movq	24(%rdi), %r8
-        addq	$19, %rax
-        adcq	$0x00, %rdx
-        adcq	$0x00, %rcx
-        adcq	$0x00, %r8
-        shrq	$63, %r8
-        imulq	$19, %r8, %r9
-        movq	(%rdi), %rax
-        movq	8(%rdi), %rdx
-        movq	16(%rdi), %rcx
-        movq	24(%rdi), %r8
-        addq	%r9, %rax
-        adcq	$0x00, %rdx
-        adcq	$0x00, %rcx
-        adcq	$0x00, %r8
-        andq	%r10, %r8
-        orq	%rdx, %rax
-        orq	%rcx, %rax
-        orq	%r8, %rax
-        repz retq
-#ifndef __APPLE__
-.size	fe_isnonzero,.-fe_isnonzero
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_isnegative
-.type	fe_isnegative,@function
-.align	16
-fe_isnegative:
-#else
-.section	__TEXT,__text
-.globl	_fe_isnegative
-.p2align	4
-_fe_isnegative:
-#endif /* __APPLE__ */
-        movq	$0x7fffffffffffffff, %r11
-        movq	(%rdi), %rdx
-        movq	8(%rdi), %rcx
-        movq	16(%rdi), %r8
-        movq	24(%rdi), %r9
-        movq	%rdx, %rax
-        addq	$19, %rdx
-        adcq	$0x00, %rcx
-        adcq	$0x00, %r8
-        adcq	$0x00, %r9
-        shrq	$63, %r9
-        imulq	$19, %r9, %r10
-        addq	%r10, %rax
-        andq	$0x01, %rax
-        repz retq
-#ifndef __APPLE__
-.size	fe_isnegative,.-fe_isnegative
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_cmov_table
-.type	fe_cmov_table,@function
-.align	16
-fe_cmov_table:
-#else
-.section	__TEXT,__text
-.globl	_fe_cmov_table
-.p2align	4
-_fe_cmov_table:
-#endif /* __APPLE__ */
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        pushq	%r15
-        movq	%rdx, %rcx
-        movsbq	%cl, %rax
-        cdq
-        xorb	%dl, %al
-        subb	%dl, %al
-        movb	%al, %r15b
-        movq	$0x01, %rax
-        xorq	%rdx, %rdx
-        xorq	%r8, %r8
-        xorq	%r9, %r9
-        movq	$0x01, %r10
-        xorq	%r11, %r11
-        xorq	%r12, %r12
-        xorq	%r13, %r13
-        cmpb	$0x01, %r15b
-        movq	(%rsi), %r14
-        cmoveq	%r14, %rax
-        movq	8(%rsi), %r14
-        cmoveq	%r14, %rdx
-        movq	16(%rsi), %r14
-        cmoveq	%r14, %r8
-        movq	24(%rsi), %r14
-        cmoveq	%r14, %r9
-        movq	32(%rsi), %r14
-        cmoveq	%r14, %r10
-        movq	40(%rsi), %r14
-        cmoveq	%r14, %r11
-        movq	48(%rsi), %r14
-        cmoveq	%r14, %r12
-        movq	56(%rsi), %r14
-        cmoveq	%r14, %r13
-        cmpb	$2, %r15b
-        movq	96(%rsi), %r14
-        cmoveq	%r14, %rax
-        movq	104(%rsi), %r14
-        cmoveq	%r14, %rdx
-        movq	112(%rsi), %r14
-        cmoveq	%r14, %r8
-        movq	120(%rsi), %r14
-        cmoveq	%r14, %r9
-        movq	128(%rsi), %r14
-        cmoveq	%r14, %r10
-        movq	136(%rsi), %r14
-        cmoveq	%r14, %r11
-        movq	144(%rsi), %r14
-        cmoveq	%r14, %r12
-        movq	152(%rsi), %r14
-        cmoveq	%r14, %r13
-        cmpb	$3, %r15b
-        movq	192(%rsi), %r14
-        cmoveq	%r14, %rax
-        movq	200(%rsi), %r14
-        cmoveq	%r14, %rdx
-        movq	208(%rsi), %r14
-        cmoveq	%r14, %r8
-        movq	216(%rsi), %r14
-        cmoveq	%r14, %r9
-        movq	224(%rsi), %r14
-        cmoveq	%r14, %r10
-        movq	232(%rsi), %r14
-        cmoveq	%r14, %r11
-        movq	240(%rsi), %r14
-        cmoveq	%r14, %r12
-        movq	248(%rsi), %r14
-        cmoveq	%r14, %r13
-        cmpb	$4, %r15b
-        movq	288(%rsi), %r14
-        cmoveq	%r14, %rax
-        movq	296(%rsi), %r14
-        cmoveq	%r14, %rdx
-        movq	304(%rsi), %r14
-        cmoveq	%r14, %r8
-        movq	312(%rsi), %r14
-        cmoveq	%r14, %r9
-        movq	320(%rsi), %r14
-        cmoveq	%r14, %r10
-        movq	328(%rsi), %r14
-        cmoveq	%r14, %r11
-        movq	336(%rsi), %r14
-        cmoveq	%r14, %r12
-        movq	344(%rsi), %r14
-        cmoveq	%r14, %r13
-        cmpb	$5, %r15b
-        movq	384(%rsi), %r14
-        cmoveq	%r14, %rax
-        movq	392(%rsi), %r14
-        cmoveq	%r14, %rdx
-        movq	400(%rsi), %r14
-        cmoveq	%r14, %r8
-        movq	408(%rsi), %r14
-        cmoveq	%r14, %r9
-        movq	416(%rsi), %r14
-        cmoveq	%r14, %r10
-        movq	424(%rsi), %r14
-        cmoveq	%r14, %r11
-        movq	432(%rsi), %r14
-        cmoveq	%r14, %r12
-        movq	440(%rsi), %r14
-        cmoveq	%r14, %r13
-        cmpb	$6, %r15b
-        movq	480(%rsi), %r14
-        cmoveq	%r14, %rax
-        movq	488(%rsi), %r14
-        cmoveq	%r14, %rdx
-        movq	496(%rsi), %r14
-        cmoveq	%r14, %r8
-        movq	504(%rsi), %r14
-        cmoveq	%r14, %r9
-        movq	512(%rsi), %r14
-        cmoveq	%r14, %r10
-        movq	520(%rsi), %r14
-        cmoveq	%r14, %r11
-        movq	528(%rsi), %r14
-        cmoveq	%r14, %r12
-        movq	536(%rsi), %r14
-        cmoveq	%r14, %r13
-        cmpb	$7, %r15b
-        movq	576(%rsi), %r14
-        cmoveq	%r14, %rax
-        movq	584(%rsi), %r14
-        cmoveq	%r14, %rdx
-        movq	592(%rsi), %r14
-        cmoveq	%r14, %r8
-        movq	600(%rsi), %r14
-        cmoveq	%r14, %r9
-        movq	608(%rsi), %r14
-        cmoveq	%r14, %r10
-        movq	616(%rsi), %r14
-        cmoveq	%r14, %r11
-        movq	624(%rsi), %r14
-        cmoveq	%r14, %r12
-        movq	632(%rsi), %r14
-        cmoveq	%r14, %r13
-        cmpb	$8, %r15b
-        movq	672(%rsi), %r14
-        cmoveq	%r14, %rax
-        movq	680(%rsi), %r14
-        cmoveq	%r14, %rdx
-        movq	688(%rsi), %r14
-        cmoveq	%r14, %r8
-        movq	696(%rsi), %r14
-        cmoveq	%r14, %r9
-        movq	704(%rsi), %r14
-        cmoveq	%r14, %r10
-        movq	712(%rsi), %r14
-        cmoveq	%r14, %r11
-        movq	720(%rsi), %r14
-        cmoveq	%r14, %r12
-        movq	728(%rsi), %r14
-        cmoveq	%r14, %r13
-        cmpb	$0x00, %cl
-        movq	%rax, %r14
-        cmovlq	%r10, %rax
-        cmovlq	%r14, %r10
-        movq	%rdx, %r14
-        cmovlq	%r11, %rdx
-        cmovlq	%r14, %r11
-        movq	%r8, %r14
-        cmovlq	%r12, %r8
-        cmovlq	%r14, %r12
-        movq	%r9, %r14
-        cmovlq	%r13, %r9
-        cmovlq	%r14, %r13
-        movq	%rax, (%rdi)
-        movq	%rdx, 8(%rdi)
-        movq	%r8, 16(%rdi)
-        movq	%r9, 24(%rdi)
-        movq	%r10, 32(%rdi)
-        movq	%r11, 40(%rdi)
-        movq	%r12, 48(%rdi)
-        movq	%r13, 56(%rdi)
-        xorq	%rax, %rax
-        xorq	%rdx, %rdx
-        xorq	%r8, %r8
-        xorq	%r9, %r9
-        cmpb	$0x01, %r15b
-        movq	64(%rsi), %r14
-        cmoveq	%r14, %rax
-        movq	72(%rsi), %r14
-        cmoveq	%r14, %rdx
-        movq	80(%rsi), %r14
-        cmoveq	%r14, %r8
-        movq	88(%rsi), %r14
-        cmoveq	%r14, %r9
-        cmpb	$2, %r15b
-        movq	160(%rsi), %r14
-        cmoveq	%r14, %rax
-        movq	168(%rsi), %r14
-        cmoveq	%r14, %rdx
-        movq	176(%rsi), %r14
-        cmoveq	%r14, %r8
-        movq	184(%rsi), %r14
-        cmoveq	%r14, %r9
-        cmpb	$3, %r15b
-        movq	256(%rsi), %r14
-        cmoveq	%r14, %rax
-        movq	264(%rsi), %r14
-        cmoveq	%r14, %rdx
-        movq	272(%rsi), %r14
-        cmoveq	%r14, %r8
-        movq	280(%rsi), %r14
-        cmoveq	%r14, %r9
-        cmpb	$4, %r15b
-        movq	352(%rsi), %r14
-        cmoveq	%r14, %rax
-        movq	360(%rsi), %r14
-        cmoveq	%r14, %rdx
-        movq	368(%rsi), %r14
-        cmoveq	%r14, %r8
-        movq	376(%rsi), %r14
-        cmoveq	%r14, %r9
-        cmpb	$5, %r15b
-        movq	448(%rsi), %r14
-        cmoveq	%r14, %rax
-        movq	456(%rsi), %r14
-        cmoveq	%r14, %rdx
-        movq	464(%rsi), %r14
-        cmoveq	%r14, %r8
-        movq	472(%rsi), %r14
-        cmoveq	%r14, %r9
-        cmpb	$6, %r15b
-        movq	544(%rsi), %r14
-        cmoveq	%r14, %rax
-        movq	552(%rsi), %r14
-        cmoveq	%r14, %rdx
-        movq	560(%rsi), %r14
-        cmoveq	%r14, %r8
-        movq	568(%rsi), %r14
-        cmoveq	%r14, %r9
-        cmpb	$7, %r15b
-        movq	640(%rsi), %r14
-        cmoveq	%r14, %rax
-        movq	648(%rsi), %r14
-        cmoveq	%r14, %rdx
-        movq	656(%rsi), %r14
-        cmoveq	%r14, %r8
-        movq	664(%rsi), %r14
-        cmoveq	%r14, %r9
-        cmpb	$8, %r15b
-        movq	736(%rsi), %r14
-        cmoveq	%r14, %rax
-        movq	744(%rsi), %r14
-        cmoveq	%r14, %rdx
-        movq	752(%rsi), %r14
-        cmoveq	%r14, %r8
-        movq	760(%rsi), %r14
-        cmoveq	%r14, %r9
-        movq	$-19, %r10
-        movq	$-1, %r11
-        movq	$-1, %r12
-        movq	$0x7fffffffffffffff, %r13
-        subq	%rax, %r10
-        sbbq	%rdx, %r11
-        sbbq	%r8, %r12
-        sbbq	%r9, %r13
-        cmpb	$0x00, %cl
-        cmovlq	%r10, %rax
-        cmovlq	%r11, %rdx
-        cmovlq	%r12, %r8
-        cmovlq	%r13, %r9
-        movq	%rax, 64(%rdi)
-        movq	%rdx, 72(%rdi)
-        movq	%r8, 80(%rdi)
-        movq	%r9, 88(%rdi)
-        popq	%r15
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        repz retq
-#ifndef __APPLE__
-.size	fe_cmov_table,.-fe_cmov_table
-#endif /* __APPLE__ */
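
fe_cmov_table reads every one of the eight table entries and uses cmov to keep only the one whose index matches |b|, then conditionally negates for the sign, so no memory address or branch depends on the secret index. A simplified one-lane C sketch of the select idiom (assumed helper, not wolfSSL source):

    #include <stdint.h>

    /* Return table[idx-1] for idx in 1..8, or the identity value 1 for
     * idx == 0. Every entry is read and combined through a mask, so the
     * access pattern and timing are independent of the secret idx. */
    static uint64_t ct_table_select(const uint64_t table[8], unsigned int idx)
    {
        uint64_t r = 1;  /* like the asm seeding its registers with the identity */
        for (unsigned int i = 1; i <= 8; i++) {
            uint64_t mask = (uint64_t)0 - (uint64_t)(i == idx);  /* all-ones iff i == idx */
            r = (r & ~mask) | (table[i - 1] & mask);
        }
        return r;
    }
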
-#ifndef __APPLE__
-.text
-.globl	fe_mul
-.type	fe_mul,@function
-.align	16
-fe_mul:
-#else
-.section	__TEXT,__text
-.globl	_fe_mul
-.p2align	4
-_fe_mul:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        jmpq	*fe_mul_p(%rip)
-#else
-        jmpq	*_fe_mul_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size	fe_mul,.-fe_mul
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_sq
-.type	fe_sq,@function
-.align	16
-fe_sq:
-#else
-.section	__TEXT,__text
-.globl	_fe_sq
-.p2align	4
-_fe_sq:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        jmpq	*fe_sq_p(%rip)
-#else
-        jmpq	*_fe_sq_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size	fe_sq,.-fe_sq
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_mul121666
-.type	fe_mul121666,@function
-.align	16
-fe_mul121666:
-#else
-.section	__TEXT,__text
-.globl	_fe_mul121666
-.p2align	4
-_fe_mul121666:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        jmpq	*fe_mul121666_p(%rip)
-#else
-        jmpq	*_fe_mul121666_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size	fe_mul121666,.-fe_mul121666
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_sq2
-.type	fe_sq2,@function
-.align	16
-fe_sq2:
-#else
-.section	__TEXT,__text
-.globl	_fe_sq2
-.p2align	4
-_fe_sq2:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        jmpq	*fe_sq2_p(%rip)
-#else
-        jmpq	*_fe_sq2_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size	fe_sq2,.-fe_sq2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_invert
-.type	fe_invert,@function
-.align	16
-fe_invert:
-#else
-.section	__TEXT,__text
-.globl	_fe_invert
-.p2align	4
-_fe_invert:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        jmpq	*fe_invert_p(%rip)
-#else
-        jmpq	*_fe_invert_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size	fe_invert,.-fe_invert
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	curve25519
-.type	curve25519,@function
-.align	16
-curve25519:
-#else
-.section	__TEXT,__text
-.globl	_curve25519
-.p2align	4
-_curve25519:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        jmpq	*curve25519_p(%rip)
-#else
-        jmpq	*_curve25519_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size	curve25519,.-curve25519
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_pow22523
-.type	fe_pow22523,@function
-.align	16
-fe_pow22523:
-#else
-.section	__TEXT,__text
-.globl	_fe_pow22523
-.p2align	4
-_fe_pow22523:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        jmpq	*fe_pow22523_p(%rip)
-#else
-        jmpq	*_fe_pow22523_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size	fe_pow22523,.-fe_pow22523
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_ge_to_p2
-.type	fe_ge_to_p2,@function
-.align	16
-fe_ge_to_p2:
-#else
-.section	__TEXT,__text
-.globl	_fe_ge_to_p2
-.p2align	4
-_fe_ge_to_p2:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        jmpq	*fe_ge_to_p2_p(%rip)
-#else
-        jmpq	*_fe_ge_to_p2_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size	fe_ge_to_p2,.-fe_ge_to_p2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_ge_to_p3
-.type	fe_ge_to_p3,@function
-.align	16
-fe_ge_to_p3:
-#else
-.section	__TEXT,__text
-.globl	_fe_ge_to_p3
-.p2align	4
-_fe_ge_to_p3:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        jmpq	*fe_ge_to_p3_p(%rip)
-#else
-        jmpq	*_fe_ge_to_p3_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size	fe_ge_to_p3,.-fe_ge_to_p3
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_ge_dbl
-.type	fe_ge_dbl,@function
-.align	16
-fe_ge_dbl:
-#else
-.section	__TEXT,__text
-.globl	_fe_ge_dbl
-.p2align	4
-_fe_ge_dbl:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        jmpq	*fe_ge_dbl_p(%rip)
-#else
-        jmpq	*_fe_ge_dbl_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size	fe_ge_dbl,.-fe_ge_dbl
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_ge_madd
-.type	fe_ge_madd,@function
-.align	16
-fe_ge_madd:
-#else
-.section	__TEXT,__text
-.globl	_fe_ge_madd
-.p2align	4
-_fe_ge_madd:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        jmpq	*fe_ge_madd_p(%rip)
-#else
-        jmpq	*_fe_ge_madd_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size	fe_ge_madd,.-fe_ge_madd
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_ge_msub
-.type	fe_ge_msub,@function
-.align	16
-fe_ge_msub:
-#else
-.section	__TEXT,__text
-.globl	_fe_ge_msub
-.p2align	4
-_fe_ge_msub:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        jmpq	*fe_ge_msub_p(%rip)
-#else
-        jmpq	*_fe_ge_msub_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size	fe_ge_msub,.-fe_ge_msub
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_ge_add
-.type	fe_ge_add,@function
-.align	16
-fe_ge_add:
-#else
-.section	__TEXT,__text
-.globl	_fe_ge_add
-.p2align	4
-_fe_ge_add:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        jmpq	*fe_ge_add_p(%rip)
-#else
-        jmpq	*_fe_ge_add_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size	fe_ge_add,.-fe_ge_add
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_ge_sub
-.type	fe_ge_sub,@function
-.align	16
-fe_ge_sub:
-#else
-.section	__TEXT,__text
-.globl	_fe_ge_sub
-.p2align	4
-_fe_ge_sub:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        jmpq	*fe_ge_sub_p(%rip)
-#else
-        jmpq	*_fe_ge_sub_p(%rip)
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.size	fe_ge_sub,.-fe_ge_sub
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type	cpuFlagsSet, @object
-.size	cpuFlagsSet,4
-cpuFlagsSet:
-	.long	0
-#else
-.section	__DATA,__data
-.p2align	2
-_cpuFlagsSet:
-	.long	0
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type	intelFlags, @object
-.size	intelFlags,4
-intelFlags:
-	.long	0
-#else
-.section	__DATA,__data
-.p2align	2
-_intelFlags:
-	.long	0
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type	fe_mul_p, @object
-.size	fe_mul_p,8
-fe_mul_p:
-	.quad	fe_mul_x64
-#else
-.section	__DATA,__data
-.p2align	2
-_fe_mul_p:
-	.quad	_fe_mul_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type	fe_sq_p, @object
-.size	fe_sq_p,8
-fe_sq_p:
-	.quad	fe_sq_x64
-#else
-.section	__DATA,__data
-.p2align	2
-_fe_sq_p:
-	.quad	_fe_sq_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type	fe_mul121666_p, @object
-.size	fe_mul121666_p,8
-fe_mul121666_p:
-	.quad	fe_mul121666_x64
-#else
-.section	__DATA,__data
-.p2align	2
-_fe_mul121666_p:
-	.quad	_fe_mul121666_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type	fe_sq2_p, @object
-.size	fe_sq2_p,8
-fe_sq2_p:
-	.quad	fe_sq2_x64
-#else
-.section	__DATA,__data
-.p2align	2
-_fe_sq2_p:
-	.quad	_fe_sq2_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type	fe_invert_p, @object
-.size	fe_invert_p,8
-fe_invert_p:
-	.quad	fe_invert_x64
-#else
-.section	__DATA,__data
-.p2align	2
-_fe_invert_p:
-	.quad	_fe_invert_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type	curve25519_p, @object
-.size	curve25519_p,8
-curve25519_p:
-	.quad	curve25519_x64
-#else
-.section	__DATA,__data
-.p2align	2
-_curve25519_p:
-	.quad	_curve25519_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type	fe_pow22523_p, @object
-.size	fe_pow22523_p,8
-fe_pow22523_p:
-	.quad	fe_pow22523_x64
-#else
-.section	__DATA,__data
-.p2align	2
-_fe_pow22523_p:
-	.quad	_fe_pow22523_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type	fe_ge_to_p2_p, @object
-.size	fe_ge_to_p2_p,8
-fe_ge_to_p2_p:
-	.quad	fe_ge_to_p2_x64
-#else
-.section	__DATA,__data
-.p2align	2
-_fe_ge_to_p2_p:
-	.quad	_fe_ge_to_p2_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type	fe_ge_to_p3_p, @object
-.size	fe_ge_to_p3_p,8
-fe_ge_to_p3_p:
-	.quad	fe_ge_to_p3_x64
-#else
-.section	__DATA,__data
-.p2align	2
-_fe_ge_to_p3_p:
-	.quad	_fe_ge_to_p3_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type	fe_ge_dbl_p, @object
-.size	fe_ge_dbl_p,8
-fe_ge_dbl_p:
-	.quad	fe_ge_dbl_x64
-#else
-.section	__DATA,__data
-.p2align	2
-_fe_ge_dbl_p:
-	.quad	_fe_ge_dbl_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type	fe_ge_madd_p, @object
-.size	fe_ge_madd_p,8
-fe_ge_madd_p:
-	.quad	fe_ge_madd_x64
-#else
-.section	__DATA,__data
-.p2align	2
-_fe_ge_madd_p:
-	.quad	_fe_ge_madd_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type	fe_ge_msub_p, @object
-.size	fe_ge_msub_p,8
-fe_ge_msub_p:
-	.quad	fe_ge_msub_x64
-#else
-.section	__DATA,__data
-.p2align	2
-_fe_ge_msub_p:
-	.quad	_fe_ge_msub_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type	fe_ge_add_p, @object
-.size	fe_ge_add_p,8
-fe_ge_add_p:
-	.quad	fe_ge_add_x64
-#else
-.section	__DATA,__data
-.p2align	2
-_fe_ge_add_p:
-	.quad	_fe_ge_add_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-.type	fe_ge_sub_p, @object
-.size	fe_ge_sub_p,8
-fe_ge_sub_p:
-	.quad	fe_ge_sub_x64
-#else
-.section	__DATA,__data
-.p2align	2
-_fe_ge_sub_p:
-	.quad	_fe_ge_sub_x64
-#endif /* __APPLE__ */
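
Note: the .data objects above form the dispatch table behind those stubs. Every fe_*_p pointer starts out aimed at the plain *_x64 routine, so the code is correct even before any CPU probe runs, and cpuFlagsSet/intelFlags record whether CPUID has been queried and what it reported. The selection code itself is not part of this hunk; a hedged C sketch of the usual one-time pattern (have_avx2() and fe_mul_avx2 are illustrative stand-ins, not confirmed wolfSSL APIs):

    #include <stdint.h>

    typedef uint64_t fe[4];

    /* Placeholder implementations for the sketch. */
    static void fe_mul_x64 (fe r, const fe a, const fe b) { (void)r; (void)a; (void)b; }
    static void fe_mul_avx2(fe r, const fe a, const fe b) { (void)r; (void)a; (void)b; }
    static void (*fe_mul_p)(fe, const fe, const fe) = fe_mul_x64;

    static int cpuFlagsSet = 0;                /* mirrors the .data object above */
    static int have_avx2(void) { return 0; }   /* stand-in for the CPUID probe */

    void fe_init(void)
    {
        if (cpuFlagsSet)                /* probe once, then stay put */
            return;
        if (have_avx2())
            fe_mul_p = fe_mul_avx2;     /* swap in the wider implementation */
        cpuFlagsSet = 1;
    }
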
-#ifndef __APPLE__
-.text
-.globl	fe_mul_x64
-.type	fe_mul_x64,@function
-.align	16
-fe_mul_x64:
-#else
-.section	__TEXT,__text
-.globl	_fe_mul_x64
-.p2align	4
-_fe_mul_x64:
-#endif /* __APPLE__ */
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        pushq	%r15
-        pushq	%rbx
-        movq	%rdx, %rcx
-        # Multiply
-        #  A[0] * B[0]
-        movq	(%rcx), %rax
-        mulq	(%rsi)
-        movq	%rax, %r8
-        movq	%rdx, %r9
-        #  A[0] * B[1]
-        movq	8(%rcx), %rax
-        mulq	(%rsi)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[1] * B[0]
-        movq	(%rcx), %rax
-        mulq	8(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        adcq	$0x00, %r11
-        #  A[0] * B[2]
-        movq	16(%rcx), %rax
-        mulq	(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * B[1]
-        movq	8(%rcx), %rax
-        mulq	8(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[2] * B[0]
-        movq	(%rcx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[0] * B[3]
-        movq	24(%rcx), %rax
-        mulq	(%rsi)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[2]
-        movq	16(%rcx), %rax
-        mulq	8(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[2] * B[1]
-        movq	8(%rcx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[3] * B[0]
-        movq	(%rcx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[3]
-        movq	24(%rcx), %rax
-        mulq	8(%rsi)
-        xorq	%r14, %r14
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[2]
-        movq	16(%rcx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[3] * B[1]
-        movq	8(%rcx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[3]
-        movq	24(%rcx), %rax
-        mulq	16(%rsi)
-        xorq	%r15, %r15
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[2]
-        movq	16(%rcx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[3]
-        movq	24(%rcx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rbx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rbx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %r8
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add remaining product results in
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rbx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rbx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        popq	%rbx
-        popq	%r15
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        repz retq
-#ifndef __APPLE__
-.size	fe_mul_x64,.-fe_mul_x64
-#endif /* __APPLE__ */
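
Note: fe_mul_x64 is a textbook 4x4 schoolbook multiply (16 mulq, partial products folded together with adcq) followed by reduction modulo p = 2^255 - 19: the top half of the 512-bit product is shifted left one bit (absorbing bit 255 of the low half), multiplied by 19, and added back, because 2^255 = 19 (mod p). A reference C version of the same algorithm, a sketch assuming GCC/Clang's unsigned __int128 and inputs already reduced below 2^255 (this is not the wolfSSL C fallback):

    #include <stdint.h>

    typedef unsigned __int128 u128;

    static void fe_mul_ref(uint64_t r[4], const uint64_t a[4],
                           const uint64_t b[4])
    {
        uint64_t t[8] = {0};
        /* Schoolbook multiply: full 512-bit product in t[0..7]. */
        for (int i = 0; i < 4; i++) {
            uint64_t c = 0;
            for (int j = 0; j < 4; j++) {
                u128 m = (u128)a[i] * b[j] + t[i + j] + c;
                t[i + j] = (uint64_t)m;
                c = (uint64_t)(m >> 64);
            }
            t[i + 4] = c;
        }
        /* Fold the top half back in as (2*top | bit 255) * 19. */
        uint64_t h[4];
        h[0] = (t[4] << 1) | (t[3] >> 63);
        h[1] = (t[5] << 1) | (t[4] >> 63);
        h[2] = (t[6] << 1) | (t[5] >> 63);
        h[3] = (t[7] << 1) | (t[6] >> 63);
        t[3] &= 0x7fffffffffffffffULL;
        uint64_t c = 0, d = 0;
        u128 acc;
        acc = (u128)t[0] + (u128)h[0] * 19;     t[0] = (uint64_t)acc; c = (uint64_t)(acc >> 64);
        acc = (u128)t[1] + (u128)h[1] * 19 + c; t[1] = (uint64_t)acc; c = (uint64_t)(acc >> 64);
        acc = (u128)t[2] + (u128)h[2] * 19 + c; t[2] = (uint64_t)acc; c = (uint64_t)(acc >> 64);
        acc = (u128)t[3] + (u128)h[3] * 19 + c; t[3] = (uint64_t)acc; d = (uint64_t)(acc >> 64);
        /* Overflow above bit 255 (a few bits at most) folds the same way. */
        uint64_t over = (d << 1) | (t[3] >> 63);
        t[3] &= 0x7fffffffffffffffULL;
        acc = (u128)t[0] + (u128)over * 19;        t[0] = (uint64_t)acc;
        acc = (u128)t[1] + (uint64_t)(acc >> 64);  t[1] = (uint64_t)acc;
        acc = (u128)t[2] + (uint64_t)(acc >> 64);  t[2] = (uint64_t)acc;
        t[3] += (uint64_t)(acc >> 64);
        /* Like the "Reduce if top bit set" step: one last conditional
         * fold keeps the result below 2^255. */
        uint64_t fold = (t[3] >> 63) * 19;
        t[3] &= 0x7fffffffffffffffULL;
        acc = (u128)t[0] + fold;                   t[0] = (uint64_t)acc;
        acc = (u128)t[1] + (uint64_t)(acc >> 64);  t[1] = (uint64_t)acc;
        acc = (u128)t[2] + (uint64_t)(acc >> 64);  t[2] = (uint64_t)acc;
        t[3] += (uint64_t)(acc >> 64);
        r[0] = t[0]; r[1] = t[1]; r[2] = t[2]; r[3] = t[3];
    }
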
-#ifndef __APPLE__
-.text
-.globl	fe_sq_x64
-.type	fe_sq_x64,@function
-.align	16
-fe_sq_x64:
-#else
-.section	__TEXT,__text
-.globl	_fe_sq_x64
-.p2align	4
-_fe_sq_x64:
-#endif /* __APPLE__ */
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        pushq	%r15
-        # Square
-        #  A[0] * A[1]
-        movq	(%rsi), %rax
-        mulq	8(%rsi)
-        movq	%rax, %r8
-        movq	%rdx, %r9
-        #  A[0] * A[2]
-        movq	(%rsi), %rax
-        mulq	16(%rsi)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[0] * A[3]
-        movq	(%rsi), %rax
-        mulq	24(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * A[2]
-        movq	8(%rsi), %rax
-        mulq	16(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[1] * A[3]
-        movq	8(%rsi), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        #  A[2] * A[3]
-        movq	16(%rsi), %rax
-        mulq	24(%rsi)
-        xorq	%r13, %r13
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        # Double
-        xorq	%r14, %r14
-        addq	%r8, %r8
-        adcq	%r9, %r9
-        adcq	%r10, %r10
-        adcq	%r11, %r11
-        adcq	%r12, %r12
-        adcq	%r13, %r13
-        adcq	$0x00, %r14
-        #  A[0] * A[0]
-        movq	(%rsi), %rax
-        mulq	%rax
-        movq	%rax, %rcx
-        movq	%rdx, %r15
-        #  A[1] * A[1]
-        movq	8(%rsi), %rax
-        mulq	%rax
-        addq	%r15, %r8
-        adcq	%rax, %r9
-        adcq	$0x00, %rdx
-        movq	%rdx, %r15
-        #  A[2] * A[2]
-        movq	16(%rsi), %rax
-        mulq	%rax
-        addq	%r15, %r10
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        movq	%rdx, %r15
-        #  A[3] * A[3]
-        movq	24(%rsi), %rax
-        mulq	%rax
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        addq	%r15, %r12
-        adcq	$0x00, %r13
-        adcq	$0x00, %r14
-        # Reduce
-        movq	$0x7fffffffffffffff, %r15
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        shldq	$0x01, %r10, %r11
-        andq	%r15, %r10
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r11
-        xorq	%r11, %r11
-        addq	%rax, %rcx
-        movq	$19, %rax
-        adcq	%rdx, %r11
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %r8
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        #  Add remaining product results in
-        addq	%r11, %r8
-        adcq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%rax, %r10
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r10, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%r15, %r10
-        addq	%rax, %rcx
-        adcq	$0x00, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        # Reduce if top bit set
-        movq	%r10, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%r15, %r10
-        addq	%rdx, %rcx
-        adcq	$0x00, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        # Store
-        movq	%rcx, (%rdi)
-        movq	%r8, 8(%rdi)
-        movq	%r9, 16(%rdi)
-        movq	%r10, 24(%rdi)
-        popq	%r15
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        repz retq
-#ifndef __APPLE__
-.size	fe_sq_x64,.-fe_sq_x64
-#endif /* __APPLE__ */
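
Note: fe_sq_x64 exploits the symmetry of squaring: only the six distinct cross products A[i]*A[j] (i < j) are computed, doubled in one carry chain, and the four squares A[i]^2 are then added, so 10 mulq do the work of the 16 a general multiply needs; the reduction is identical to fe_mul_x64. Behaviourally it is just fe_mul(a, a), which gives a one-line reference check against fe_mul_ref from the sketch above:

    /* Reference squaring: same result as a full multiply; fe_sq_x64
     * computes exactly this with 10 mulq instead of 16. */
    static void fe_sq_ref(uint64_t r[4], const uint64_t a[4])
    {
        fe_mul_ref(r, a, a);
    }
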
-#ifndef __APPLE__
-.text
-.globl	fe_sq_n_x64
-.type	fe_sq_n_x64,@function
-.align	16
-fe_sq_n_x64:
-#else
-.section	__TEXT,__text
-.globl	_fe_sq_n_x64
-.p2align	4
-_fe_sq_n_x64:
-#endif /* __APPLE__ */
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        pushq	%r15
-        pushq	%rbx
-        movq	%rdx, %rcx
-L_fe_sq_n_x64:
-        # Square
-        #  A[0] * A[1]
-        movq	(%rsi), %rax
-        mulq	8(%rsi)
-        movq	%rax, %r9
-        movq	%rdx, %r10
-        #  A[0] * A[2]
-        movq	(%rsi), %rax
-        mulq	16(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[0] * A[3]
-        movq	(%rsi), %rax
-        mulq	24(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        #  A[1] * A[2]
-        movq	8(%rsi), %rax
-        mulq	16(%rsi)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * A[3]
-        movq	8(%rsi), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        #  A[2] * A[3]
-        movq	16(%rsi), %rax
-        mulq	24(%rsi)
-        xorq	%r14, %r14
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        # Double
-        xorq	%r15, %r15
-        addq	%r9, %r9
-        adcq	%r10, %r10
-        adcq	%r11, %r11
-        adcq	%r12, %r12
-        adcq	%r13, %r13
-        adcq	%r14, %r14
-        adcq	$0x00, %r15
-        #  A[0] * A[0]
-        movq	(%rsi), %rax
-        mulq	%rax
-        movq	%rax, %r8
-        movq	%rdx, %rbx
-        #  A[1] * A[1]
-        movq	8(%rsi), %rax
-        mulq	%rax
-        addq	%rbx, %r9
-        adcq	%rax, %r10
-        adcq	$0x00, %rdx
-        movq	%rdx, %rbx
-        #  A[2] * A[2]
-        movq	16(%rsi), %rax
-        mulq	%rax
-        addq	%rbx, %r11
-        adcq	%rax, %r12
-        adcq	$0x00, %rdx
-        movq	%rdx, %rbx
-        #  A[3] * A[3]
-        movq	24(%rsi), %rax
-        mulq	%rax
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        addq	%rbx, %r13
-        adcq	$0x00, %r14
-        adcq	$0x00, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rbx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rbx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %r8
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add remaining product results in
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rbx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rbx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        decb	%cl
-        jnz	L_fe_sq_n_x64
-        popq	%rbx
-        popq	%r15
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        repz retq
-#ifndef __APPLE__
-.size	fe_sq_n_x64,.-fe_sq_n_x64
-#endif /* __APPLE__ */
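
Note: two details of fe_sq_n_x64 are easy to miss. The count arrives in %rdx, is moved to %rcx, and is decremented with decb %cl, so only its low byte participates (callers keep n in 1..255; the inversion chain below never needs more than 99). And the loop body re-reads (%rsi) on every pass while writing (%rdi), so it only computes a^(2^n) because every caller passes the same buffer for both; with distinct buffers it would merely square the input n times. A C rendering of the intended behaviour, reusing fe_sq_ref from the sketch above (n must be >= 1):

    /* n-fold squaring: a^(2^n). The first pass moves a into r; the
     * remaining n-1 squarings run in place, as the assembly's aliased
     * callers do. */
    static void fe_sq_n_ref(uint64_t r[4], const uint64_t a[4], unsigned n)
    {
        fe_sq_ref(r, a);
        while (--n)
            fe_sq_ref(r, r);
    }
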
-#ifndef __APPLE__
-.text
-.globl	fe_mul121666_x64
-.type	fe_mul121666_x64,@function
-.align	16
-fe_mul121666_x64:
-#else
-.section	__TEXT,__text
-.globl	_fe_mul121666_x64
-.p2align	4
-_fe_mul121666_x64:
-#endif /* __APPLE__ */
-        pushq	%r12
-        # Multiply by 121666
-        movq	$0x1db42, %rax
-        mulq	(%rsi)
-        xorq	%r10, %r10
-        movq	%rax, %r8
-        movq	%rdx, %r9
-        movq	$0x1db42, %rax
-        mulq	8(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        movq	$0x1db42, %rax
-        mulq	16(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        movq	$0x1db42, %rax
-        mulq	24(%rsi)
-        movq	$0x7fffffffffffffff, %rcx
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        movq	$19, %rax
-        mulq	%r12
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        popq	%r12
-        repz retq
-#ifndef __APPLE__
-.size	fe_mul121666_x64,.-fe_mul121666_x64
-#endif /* __APPLE__ */
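
Note: 0x1db42 is 121666 = (486662 + 2)/4, where 486662 is the Montgomery coefficient A of curve25519; this is the ladder constant in the ref10-style arrangement of the step formulas (equivalent to RFC 7748's a24 = 121665 applied to the other squared term). Because the multiplier is only 17 bits, the whole product fits in five limbs and a single *19 fold completes the reduction. A reference version under the same unsigned __int128 assumption as the earlier sketches:

    #include <stdint.h>

    typedef unsigned __int128 u128;

    static void fe_mul121666_ref(uint64_t r[4], const uint64_t a[4])
    {
        uint64_t t[4];
        u128 acc = 0;
        for (int i = 0; i < 4; i++) {      /* a * 121666: five-limb result */
            acc += (u128)a[i] * 121666;
            t[i] = (uint64_t)acc;
            acc >>= 64;
        }
        /* Fold bits 255 and up back in as *19 (2^255 = 19 mod p). */
        uint64_t over = ((uint64_t)acc << 1) | (t[3] >> 63);
        t[3] &= 0x7fffffffffffffffULL;
        u128 s = (u128)t[0] + (u128)over * 19;
        r[0] = (uint64_t)s;
        s = (u128)t[1] + (uint64_t)(s >> 64);  r[1] = (uint64_t)s;
        s = (u128)t[2] + (uint64_t)(s >> 64);  r[2] = (uint64_t)s;
        r[3] = t[3] + (uint64_t)(s >> 64);
    }
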
-#ifndef __APPLE__
-.text
-.globl	fe_sq2_x64
-.type	fe_sq2_x64,@function
-.align	16
-fe_sq2_x64:
-#else
-.section	__TEXT,__text
-.globl	_fe_sq2_x64
-.p2align	4
-_fe_sq2_x64:
-#endif /* __APPLE__ */
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        pushq	%r15
-        pushq	%rbx
-        # Square * 2
-        #  A[0] * A[1]
-        movq	(%rsi), %rax
-        mulq	8(%rsi)
-        movq	%rax, %r8
-        movq	%rdx, %r9
-        #  A[0] * A[2]
-        movq	(%rsi), %rax
-        mulq	16(%rsi)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[0] * A[3]
-        movq	(%rsi), %rax
-        mulq	24(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * A[2]
-        movq	8(%rsi), %rax
-        mulq	16(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[1] * A[3]
-        movq	8(%rsi), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        #  A[2] * A[3]
-        movq	16(%rsi), %rax
-        mulq	24(%rsi)
-        xorq	%r13, %r13
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        # Double
-        xorq	%r14, %r14
-        addq	%r8, %r8
-        adcq	%r9, %r9
-        adcq	%r10, %r10
-        adcq	%r11, %r11
-        adcq	%r12, %r12
-        adcq	%r13, %r13
-        adcq	$0x00, %r14
-        #  A[0] * A[0]
-        movq	(%rsi), %rax
-        mulq	%rax
-        movq	%rax, %rcx
-        movq	%rdx, %r15
-        #  A[1] * A[1]
-        movq	8(%rsi), %rax
-        mulq	%rax
-        addq	%r15, %r8
-        adcq	%rax, %r9
-        adcq	$0x00, %rdx
-        movq	%rdx, %r15
-        #  A[2] * A[2]
-        movq	16(%rsi), %rax
-        mulq	%rax
-        addq	%r15, %r10
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        movq	%rdx, %r15
-        #  A[3] * A[3]
-        movq	24(%rsi), %rax
-        mulq	%rax
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        addq	%r15, %r12
-        adcq	$0x00, %r13
-        adcq	$0x00, %r14
-        # Reduce
-        movq	$0x7fffffffffffffff, %rbx
-        xorq	%rax, %rax
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$3, %r14, %rax
-        shldq	$2, %r13, %r14
-        shldq	$2, %r12, %r13
-        shldq	$2, %r11, %r12
-        shldq	$2, %r10, %r11
-        shldq	$0x01, %r9, %r10
-        shldq	$0x01, %r8, %r9
-        shldq	$0x01, %rcx, %r8
-        shlq	$0x01, %rcx
-        andq	%rbx, %r10
-        #  Two out left, one in right
-        andq	%rbx, %r14
-        #  Multiply top bits by 19*19
-        imulq	$0x169, %rax, %r15
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r11
-        xorq	%r11, %r11
-        addq	%rax, %rcx
-        movq	$19, %rax
-        adcq	%rdx, %r11
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %r8
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        #  Add remaining product results in
-        addq	%r15, %rcx
-        adcq	%r11, %r8
-        adcq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%rax, %r10
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r10, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rbx, %r10
-        addq	%rax, %rcx
-        adcq	$0x00, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        # Reduce if top bit set
-        movq	%r10, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rbx, %r10
-        addq	%rdx, %rcx
-        adcq	$0x00, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        # Store
-        movq	%rcx, (%rdi)
-        movq	%r8, 8(%rdi)
-        movq	%r9, 16(%rdi)
-        movq	%r10, 24(%rdi)
-        popq	%rbx
-        popq	%r15
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        repz retq
-#ifndef __APPLE__
-.size	fe_sq2_x64,.-fe_sq2_x64
-#endif /* __APPLE__ */
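
Note: fe_sq2_x64 returns 2*a^2 mod p. Instead of squaring and then doubling, the doubling is folded into the reduction itself: the shift-down of the top half uses shldq $2 rather than $1, and the few bits shifted out at the very top are scaled by 0x169 = 19*19 before being added back. Behaviourally it is fe_sq followed by a modular doubling; a reference form reusing fe_sq_ref and u128 from the sketches above:

    static void fe_sq2_ref(uint64_t r[4], const uint64_t a[4])
    {
        fe_sq_ref(r, a);                 /* a^2, already below 2^255 */
        uint64_t t[4], c = 0;
        for (int i = 0; i < 4; i++) {    /* t = 2 * r, a 257-bit value */
            uint64_t hi = r[i] >> 63;
            t[i] = (r[i] << 1) | c;
            c = hi;
        }
        uint64_t over = (c << 1) | (t[3] >> 63);   /* bits 255 and up */
        t[3] &= 0x7fffffffffffffffULL;
        u128 s = (u128)t[0] + (u128)over * 19;     /* fold back as *19 */
        r[0] = (uint64_t)s;
        s = (u128)t[1] + (uint64_t)(s >> 64);  r[1] = (uint64_t)s;
        s = (u128)t[2] + (uint64_t)(s >> 64);  r[2] = (uint64_t)s;
        r[3] = t[3] + (uint64_t)(s >> 64);
    }
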
-#ifndef __APPLE__
-.text
-.globl	fe_invert_x64
-.type	fe_invert_x64,@function
-.align	16
-fe_invert_x64:
-#else
-.section	__TEXT,__text
-.globl	_fe_invert_x64
-.p2align	4
-_fe_invert_x64:
-#endif /* __APPLE__ */
-        subq	$0x90, %rsp
-        # Invert
-        movq	%rdi, 128(%rsp)
-        movq	%rsi, 136(%rsp)
-        movq	%rsp, %rdi
-        movq	136(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        movq	%rsp, %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        movq	136(%rsp), %rsi
-        leaq	32(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        movq	%rsp, %rdi
-        movq	%rsp, %rsi
-        leaq	32(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        movq	%rsp, %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-        leaq	64(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        movq	$4, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_x64@plt
-#else
-        callq	_fe_sq_n_x64
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        leaq	32(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        movq	$9, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_x64@plt
-#else
-        callq	_fe_sq_n_x64
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        leaq	32(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-        movq	$19, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_x64@plt
-#else
-        callq	_fe_sq_n_x64
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-        leaq	64(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        movq	$9, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_x64@plt
-#else
-        callq	_fe_sq_n_x64
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        leaq	32(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        movq	$49, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_x64@plt
-#else
-        callq	_fe_sq_n_x64
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        leaq	32(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-        movq	$0x63, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_x64@plt
-#else
-        callq	_fe_sq_n_x64
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-        leaq	64(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        movq	$49, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_x64@plt
-#else
-        callq	_fe_sq_n_x64
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        leaq	32(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-        movq	$4, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_x64@plt
-#else
-        callq	_fe_sq_n_x64
-#endif /* __APPLE__ */
-        movq	128(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-        movq	%rsp, %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        movq	136(%rsp), %rsi
-        movq	128(%rsp), %rdi
-        addq	$0x90, %rsp
-        repz retq
-#ifndef __APPLE__
-.size	fe_invert_x64,.-fe_invert_x64
-#endif /* __APPLE__ */
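
Note: fe_invert_x64 is Fermat inversion, z^(p-2) with p - 2 = 2^255 - 21, evaluated with the standard curve25519 addition chain: 254 squarings and 11 multiplications over four 32-byte stack temporaries (at frame offsets 0, 32, 64 and 96). The same chain in C, reusing the reference helpers from the sketches above:

    /* 1/z = z^(2^255 - 21) mod p. t0..t3 mirror the stack temporaries. */
    static void fe_invert_ref(uint64_t r[4], const uint64_t z[4])
    {
        uint64_t t0[4], t1[4], t2[4], t3[4];
        fe_sq_ref(t0, z);                       /* z^2 */
        fe_sq_ref(t1, t0);
        fe_sq_ref(t1, t1);                      /* z^8 */
        fe_mul_ref(t1, z, t1);                  /* z^9 */
        fe_mul_ref(t0, t0, t1);                 /* z^11 */
        fe_sq_ref(t2, t0);                      /* z^22 */
        fe_mul_ref(t1, t1, t2);                 /* z^31 = z^(2^5 - 1) */
        fe_sq_ref(t2, t1);  fe_sq_n_ref(t2, t2, 4);
        fe_mul_ref(t1, t2, t1);                 /* z^(2^10 - 1) */
        fe_sq_ref(t2, t1);  fe_sq_n_ref(t2, t2, 9);
        fe_mul_ref(t2, t2, t1);                 /* z^(2^20 - 1) */
        fe_sq_ref(t3, t2);  fe_sq_n_ref(t3, t3, 19);
        fe_mul_ref(t2, t3, t2);                 /* z^(2^40 - 1) */
        fe_sq_ref(t2, t2);  fe_sq_n_ref(t2, t2, 9);
        fe_mul_ref(t1, t2, t1);                 /* z^(2^50 - 1) */
        fe_sq_ref(t2, t1);  fe_sq_n_ref(t2, t2, 49);
        fe_mul_ref(t2, t2, t1);                 /* z^(2^100 - 1) */
        fe_sq_ref(t3, t2);  fe_sq_n_ref(t3, t3, 99);
        fe_mul_ref(t2, t3, t2);                 /* z^(2^200 - 1) */
        fe_sq_ref(t2, t2);  fe_sq_n_ref(t2, t2, 49);
        fe_mul_ref(t1, t2, t1);                 /* z^(2^250 - 1) */
        fe_sq_ref(t1, t1);  fe_sq_n_ref(t1, t1, 4);
        fe_mul_ref(r, t1, t0);   /* (2^250 - 1)*2^5 + 11 = 2^255 - 21 */
    }
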
-#ifndef __APPLE__
-.text
-.globl	curve25519_x64
-.type	curve25519_x64,@function
-.align	16
-curve25519_x64:
-#else
-.section	__TEXT,__text
-.globl	_curve25519_x64
-.p2align	4
-_curve25519_x64:
-#endif /* __APPLE__ */
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        pushq	%r15
-        pushq	%rbx
-        pushq	%rbp
-        movq	%rdx, %r8
-        subq	$0xb8, %rsp
-        xorq	%rbx, %rbx
-        movq	%rdi, 176(%rsp)
-        # Set one
-        movq	$0x01, (%rdi)
-        movq	$0x00, 8(%rdi)
-        movq	$0x00, 16(%rdi)
-        movq	$0x00, 24(%rdi)
-        # Set zero
-        movq	$0x00, (%rsp)
-        movq	$0x00, 8(%rsp)
-        movq	$0x00, 16(%rsp)
-        movq	$0x00, 24(%rsp)
-        # Set one
-        movq	$0x01, 32(%rsp)
-        movq	$0x00, 40(%rsp)
-        movq	$0x00, 48(%rsp)
-        movq	$0x00, 56(%rsp)
-        # Copy
-        movq	(%r8), %rcx
-        movq	8(%r8), %r9
-        movq	16(%r8), %r10
-        movq	24(%r8), %r11
-        movq	%rcx, 64(%rsp)
-        movq	%r9, 72(%rsp)
-        movq	%r10, 80(%rsp)
-        movq	%r11, 88(%rsp)
-        movb	$62, 168(%rsp)
-        movq	$3, 160(%rsp)
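
Note: the two counters initialised just above (word index 3 at 160(%rsp), bit index 62 at 168(%rsp)) start the Montgomery ladder at scalar bit 64*3 + 62 = 254 and walk down to bit 0. X25519 clamping clears bit 255 and sets bit 254, so starting at 254 skips nothing and the loop runs a fixed number of iterations regardless of the scalar value. The same traversal in C:

    #include <stdint.h>

    /* Same walk as the word/bit counters: bit 254 first, bit 0 last.
     * k is the clamped scalar as four little-endian 64-bit words. */
    static void ladder_bit_order(const uint64_t k[4])
    {
        for (int i = 254; i >= 0; i--) {
            uint64_t bit = (k[i >> 6] >> (i & 63)) & 1;
            (void)bit;  /* one ladder step per bit; see the cswap sketch below */
        }
    }
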
-L_curve25519_x64_words:
-L_curve25519_x64_bits:
-        movq	160(%rsp), %r9
-        movb	168(%rsp), %cl
-        movq	(%rsi,%r9,8), %rbp
-        shrq	%cl, %rbp
-        andq	$0x01, %rbp
-        xorq	%rbp, %rbx
-        negq	%rbx
-        # Conditional Swap
-        movq	(%rdi), %rcx
-        movq	8(%rdi), %r9
-        movq	16(%rdi), %r10
-        movq	24(%rdi), %r11
-        xorq	64(%rsp), %rcx
-        xorq	72(%rsp), %r9
-        xorq	80(%rsp), %r10
-        xorq	88(%rsp), %r11
-        andq	%rbx, %rcx
-        andq	%rbx, %r9
-        andq	%rbx, %r10
-        andq	%rbx, %r11
-        xorq	%rcx, (%rdi)
-        xorq	%r9, 8(%rdi)
-        xorq	%r10, 16(%rdi)
-        xorq	%r11, 24(%rdi)
-        xorq	%rcx, 64(%rsp)
-        xorq	%r9, 72(%rsp)
-        xorq	%r10, 80(%rsp)
-        xorq	%r11, 88(%rsp)
-        # Conditional Swap
-        movq	(%rsp), %rcx
-        movq	8(%rsp), %r9
-        movq	16(%rsp), %r10
-        movq	24(%rsp), %r11
-        xorq	32(%rsp), %rcx
-        xorq	40(%rsp), %r9
-        xorq	48(%rsp), %r10
-        xorq	56(%rsp), %r11
-        andq	%rbx, %rcx
-        andq	%rbx, %r9
-        andq	%rbx, %r10
-        andq	%rbx, %r11
-        xorq	%rcx, (%rsp)
-        xorq	%r9, 8(%rsp)
-        xorq	%r10, 16(%rsp)
-        xorq	%r11, 24(%rsp)
-        xorq	%rcx, 32(%rsp)
-        xorq	%r9, 40(%rsp)
-        xorq	%r10, 48(%rsp)
-        xorq	%r11, 56(%rsp)
-        movq	%rbp, %rbx
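
Note: the two "Conditional Swap" blocks above are branch-free. The current scalar bit is XORed with the previous one (carried across iterations in %rbx), negated into an all-zero or all-ones mask, and the mask is applied limb by limb with XOR, so (x2, z2) and (x3, z3) are swapped only when consecutive bits differ and no secret-dependent branch or address is ever formed. The C equivalent of one swap:

    #include <stdint.h>

    /* Constant-time conditional swap: exchanges a and b iff bit == 1.
     * mask is all-ones for bit == 1 and zero for bit == 0. */
    static void fe_cswap(uint64_t a[4], uint64_t b[4], uint64_t bit)
    {
        uint64_t mask = (uint64_t)0 - bit;
        for (int i = 0; i < 4; i++) {
            uint64_t t = (a[i] ^ b[i]) & mask;
            a[i] ^= t;
            b[i] ^= t;
        }
    }
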
-        # Add
-        movq	(%rdi), %rcx
-        movq	8(%rdi), %r9
-        movq	16(%rdi), %r10
-        movq	24(%rdi), %rbp
-        movq	%rcx, %r12
-        addq	(%rsp), %rcx
-        movq	%r9, %r13
-        adcq	8(%rsp), %r9
-        movq	%r10, %r14
-        adcq	16(%rsp), %r10
-        movq	%rbp, %r15
-        adcq	24(%rsp), %rbp
-        movq	$-19, %rax
-        movq	%rbp, %r11
-        movq	$0x7fffffffffffffff, %rdx
-        sarq	$63, %rbp
-        #   Mask the modulus
-        andq	%rbp, %rax
-        andq	%rbp, %rdx
-        #   Sub modulus (if overflow)
-        subq	%rax, %rcx
-        sbbq	%rbp, %r9
-        sbbq	%rbp, %r10
-        sbbq	%rdx, %r11
-        # Sub
-        subq	(%rsp), %r12
-        movq	$0x00, %rbp
-        sbbq	8(%rsp), %r13
-        movq	$-19, %rax
-        sbbq	16(%rsp), %r14
-        movq	$0x7fffffffffffffff, %rdx
-        sbbq	24(%rsp), %r15
-        sbbq	$0x00, %rbp
-        #   Mask the modulus
-        andq	%rbp, %rax
-        andq	%rbp, %rdx
-        #   Add modulus (if underflow)
-        addq	%rax, %r12
-        adcq	%rbp, %r13
-        adcq	%rbp, %r14
-        adcq	%rdx, %r15
-        movq	%rcx, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	%r12, 128(%rsp)
-        movq	%r13, 136(%rsp)
-        movq	%r14, 144(%rsp)
-        movq	%r15, 152(%rsp)
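
Note: the fused Add/Sub block above produces x2+z2 and x2-z2 from a single load of the operands, and each result is corrected without branching: after the raw add, the sign bit of the top limb (bit 255 of the sum) is smeared into a mask with sarq $63 and p = 2^255 - 19 is subtracted under that mask; after the raw sub, the borrow selects whether p is added back. A C sketch of the addition half (the subtraction is symmetric), assuming inputs below 2^255:

    #include <stdint.h>

    typedef unsigned __int128 u128;

    static const uint64_t P[4] = {         /* p = 2^255 - 19, little-endian */
        0xffffffffffffffedULL, 0xffffffffffffffffULL,
        0xffffffffffffffffULL, 0x7fffffffffffffffULL
    };

    static void fe_add_ref(uint64_t r[4], const uint64_t a[4],
                           const uint64_t b[4])
    {
        u128 acc = 0;
        for (int i = 0; i < 4; i++) {      /* raw 256-bit addition */
            acc += (u128)a[i] + b[i];
            r[i] = (uint64_t)acc;
            acc >>= 64;
        }
        /* Bit 255 set means the sum reached 2^255: subtract p under an
         * all-ones mask, like the sarq $63 / andq sequence above. */
        uint64_t mask = (uint64_t)0 - (r[3] >> 63);
        u128 borrow = 0;
        for (int i = 0; i < 4; i++) {
            u128 d = (u128)r[i] - (P[i] & mask) - (uint64_t)borrow;
            r[i] = (uint64_t)d;
            borrow = d >> 127;             /* 1 iff this limb borrowed */
        }
    }
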
-        # Add
-        movq	64(%rsp), %rcx
-        movq	72(%rsp), %r9
-        movq	80(%rsp), %r10
-        movq	88(%rsp), %rbp
-        movq	%rcx, %r12
-        addq	32(%rsp), %rcx
-        movq	%r9, %r13
-        adcq	40(%rsp), %r9
-        movq	%r10, %r14
-        adcq	48(%rsp), %r10
-        movq	%rbp, %r15
-        adcq	56(%rsp), %rbp
-        movq	$-19, %rax
-        movq	%rbp, %r11
-        movq	$0x7fffffffffffffff, %rdx
-        sarq	$63, %rbp
-        #   Mask the modulus
-        andq	%rbp, %rax
-        andq	%rbp, %rdx
-        #   Sub modulus (if overflow)
-        subq	%rax, %rcx
-        sbbq	%rbp, %r9
-        sbbq	%rbp, %r10
-        sbbq	%rdx, %r11
-        # Sub
-        subq	32(%rsp), %r12
-        movq	$0x00, %rbp
-        sbbq	40(%rsp), %r13
-        movq	$-19, %rax
-        sbbq	48(%rsp), %r14
-        movq	$0x7fffffffffffffff, %rdx
-        sbbq	56(%rsp), %r15
-        sbbq	$0x00, %rbp
-        #   Mask the modulus
-        andq	%rbp, %rax
-        andq	%rbp, %rdx
-        #   Add modulus (if underflow)
-        addq	%rax, %r12
-        adcq	%rbp, %r13
-        adcq	%rbp, %r14
-        adcq	%rdx, %r15
-        movq	%rcx, (%rsp)
-        movq	%r9, 8(%rsp)
-        movq	%r10, 16(%rsp)
-        movq	%r11, 24(%rsp)
-        movq	%r12, 96(%rsp)
-        movq	%r13, 104(%rsp)
-        movq	%r14, 112(%rsp)
-        movq	%r15, 120(%rsp)
-        # Multiply
-        #  A[0] * B[0]
-        movq	(%rdi), %rax
-        mulq	96(%rsp)
-        movq	%rax, %rcx
-        movq	%rdx, %r9
-        #  A[0] * B[1]
-        movq	8(%rdi), %rax
-        mulq	96(%rsp)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[1] * B[0]
-        movq	(%rdi), %rax
-        mulq	104(%rsp)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        adcq	$0x00, %r11
-        #  A[0] * B[2]
-        movq	16(%rdi), %rax
-        mulq	96(%rsp)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * B[1]
-        movq	8(%rdi), %rax
-        mulq	104(%rsp)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[2] * B[0]
-        movq	(%rdi), %rax
-        mulq	112(%rsp)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[0] * B[3]
-        movq	24(%rdi), %rax
-        mulq	96(%rsp)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[2]
-        movq	16(%rdi), %rax
-        mulq	104(%rsp)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[2] * B[1]
-        movq	8(%rdi), %rax
-        mulq	112(%rsp)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[3] * B[0]
-        movq	(%rdi), %rax
-        mulq	120(%rsp)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[3]
-        movq	24(%rdi), %rax
-        mulq	104(%rsp)
-        xorq	%r14, %r14
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[2]
-        movq	16(%rdi), %rax
-        mulq	112(%rsp)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[3] * B[1]
-        movq	8(%rdi), %rax
-        mulq	120(%rsp)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[3]
-        movq	24(%rdi), %rax
-        mulq	112(%rsp)
-        xorq	%r15, %r15
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[2]
-        movq	16(%rdi), %rax
-        mulq	120(%rsp)
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[3]
-        movq	24(%rdi), %rax
-        mulq	120(%rsp)
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rbp
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rbp, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %rcx
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add remaining product results in
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rbp, %r11
-        addq	%rax, %rcx
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rbp, %r11
-        addq	%rdx, %rcx
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%rcx, 32(%rsp)
-        movq	%r9, 40(%rsp)
-        movq	%r10, 48(%rsp)
-        movq	%r11, 56(%rsp)
-        # Multiply
-        #  A[0] * B[0]
-        movq	128(%rsp), %rax
-        mulq	(%rsp)
-        movq	%rax, %rcx
-        movq	%rdx, %r9
-        #  A[0] * B[1]
-        movq	136(%rsp), %rax
-        mulq	(%rsp)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[1] * B[0]
-        movq	128(%rsp), %rax
-        mulq	8(%rsp)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        adcq	$0x00, %r11
-        #  A[0] * B[2]
-        movq	144(%rsp), %rax
-        mulq	(%rsp)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * B[1]
-        movq	136(%rsp), %rax
-        mulq	8(%rsp)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[2] * B[0]
-        movq	128(%rsp), %rax
-        mulq	16(%rsp)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[0] * B[3]
-        movq	152(%rsp), %rax
-        mulq	(%rsp)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[2]
-        movq	144(%rsp), %rax
-        mulq	8(%rsp)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[2] * B[1]
-        movq	136(%rsp), %rax
-        mulq	16(%rsp)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[3] * B[0]
-        movq	128(%rsp), %rax
-        mulq	24(%rsp)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[3]
-        movq	152(%rsp), %rax
-        mulq	8(%rsp)
-        xorq	%r14, %r14
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[2]
-        movq	144(%rsp), %rax
-        mulq	16(%rsp)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[3] * B[1]
-        movq	136(%rsp), %rax
-        mulq	24(%rsp)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[3]
-        movq	152(%rsp), %rax
-        mulq	16(%rsp)
-        xorq	%r15, %r15
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[2]
-        movq	144(%rsp), %rax
-        mulq	24(%rsp)
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[3]
-        movq	152(%rsp), %rax
-        mulq	24(%rsp)
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rbp
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rbp, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %rcx
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add remaining product results in
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rbp, %r11
-        addq	%rax, %rcx
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rbp, %r11
-        addq	%rdx, %rcx
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%rcx, (%rsp)
-        movq	%r9, 8(%rsp)
-        movq	%r10, 16(%rsp)
-        movq	%r11, 24(%rsp)
-        # Square
-        #  A[0] * A[1]
-        movq	128(%rsp), %rax
-        mulq	136(%rsp)
-        movq	%rax, %r9
-        movq	%rdx, %r10
-        #  A[0] * A[2]
-        movq	128(%rsp), %rax
-        mulq	144(%rsp)
-        xorq	%r11, %r11
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[0] * A[3]
-        movq	128(%rsp), %rax
-        mulq	152(%rsp)
-        xorq	%r12, %r12
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        #  A[1] * A[2]
-        movq	136(%rsp), %rax
-        mulq	144(%rsp)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * A[3]
-        movq	136(%rsp), %rax
-        mulq	152(%rsp)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        #  A[2] * A[3]
-        movq	144(%rsp), %rax
-        mulq	152(%rsp)
-        xorq	%r14, %r14
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        # Double
-        xorq	%r15, %r15
-        addq	%r9, %r9
-        adcq	%r10, %r10
-        adcq	%r11, %r11
-        adcq	%r12, %r12
-        adcq	%r13, %r13
-        adcq	%r14, %r14
-        adcq	$0x00, %r15
-        #  A[0] * A[0]
-        movq	128(%rsp), %rax
-        mulq	%rax
-        movq	%rax, %rcx
-        movq	%rdx, %rbp
-        #  A[1] * A[1]
-        movq	136(%rsp), %rax
-        mulq	%rax
-        addq	%rbp, %r9
-        adcq	%rax, %r10
-        adcq	$0x00, %rdx
-        movq	%rdx, %rbp
-        #  A[2] * A[2]
-        movq	144(%rsp), %rax
-        mulq	%rax
-        addq	%rbp, %r11
-        adcq	%rax, %r12
-        adcq	$0x00, %rdx
-        movq	%rdx, %rbp
-        #  A[3] * A[3]
-        movq	152(%rsp), %rax
-        mulq	%rax
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        addq	%rbp, %r13
-        adcq	$0x00, %r14
-        adcq	$0x00, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rbp
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rbp, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %rcx
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add remaining product results in
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rbp, %r11
-        addq	%rax, %rcx
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rbp, %r11
-        addq	%rdx, %rcx
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%rcx, 96(%rsp)
-        movq	%r9, 104(%rsp)
-        movq	%r10, 112(%rsp)
-        movq	%r11, 120(%rsp)
-        # Square
-        #  A[0] * A[1]
-        movq	(%rdi), %rax
-        mulq	8(%rdi)
-        movq	%rax, %r9
-        movq	%rdx, %r10
-        #  A[0] * A[2]
-        movq	(%rdi), %rax
-        mulq	16(%rdi)
-        xorq	%r11, %r11
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[0] * A[3]
-        movq	(%rdi), %rax
-        mulq	24(%rdi)
-        xorq	%r12, %r12
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        #  A[1] * A[2]
-        movq	8(%rdi), %rax
-        mulq	16(%rdi)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * A[3]
-        movq	8(%rdi), %rax
-        mulq	24(%rdi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        #  A[2] * A[3]
-        movq	16(%rdi), %rax
-        mulq	24(%rdi)
-        xorq	%r14, %r14
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        # Double
-        xorq	%r15, %r15
-        addq	%r9, %r9
-        adcq	%r10, %r10
-        adcq	%r11, %r11
-        adcq	%r12, %r12
-        adcq	%r13, %r13
-        adcq	%r14, %r14
-        adcq	$0x00, %r15
-        #  A[0] * A[0]
-        movq	(%rdi), %rax
-        mulq	%rax
-        movq	%rax, %rcx
-        movq	%rdx, %rbp
-        #  A[1] * A[1]
-        movq	8(%rdi), %rax
-        mulq	%rax
-        addq	%rbp, %r9
-        adcq	%rax, %r10
-        adcq	$0x00, %rdx
-        movq	%rdx, %rbp
-        #  A[2] * A[2]
-        movq	16(%rdi), %rax
-        mulq	%rax
-        addq	%rbp, %r11
-        adcq	%rax, %r12
-        adcq	$0x00, %rdx
-        movq	%rdx, %rbp
-        #  A[3] * A[3]
-        movq	24(%rdi), %rax
-        mulq	%rax
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        addq	%rbp, %r13
-        adcq	$0x00, %r14
-        adcq	$0x00, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rbp
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rbp, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %rcx
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add remaining product results in
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rbp, %r11
-        addq	%rax, %rcx
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rbp, %r11
-        addq	%rdx, %rcx
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%rcx, 128(%rsp)
-        movq	%r9, 136(%rsp)
-        movq	%r10, 144(%rsp)
-        movq	%r11, 152(%rsp)
-        # Add
-        movq	32(%rsp), %rcx
-        movq	40(%rsp), %r9
-        movq	48(%rsp), %r10
-        movq	56(%rsp), %rbp
-        movq	%rcx, %r12
-        addq	(%rsp), %rcx
-        movq	%r9, %r13
-        adcq	8(%rsp), %r9
-        movq	%r10, %r14
-        adcq	16(%rsp), %r10
-        movq	%rbp, %r15
-        adcq	24(%rsp), %rbp
-        movq	$-19, %rax
-        movq	%rbp, %r11
-        movq	$0x7fffffffffffffff, %rdx
-        sarq	$63, %rbp
-        #   Mask the modulus
-        andq	%rbp, %rax
-        andq	%rbp, %rdx
-        #   Sub modulus (if overflow)
-        subq	%rax, %rcx
-        sbbq	%rbp, %r9
-        sbbq	%rbp, %r10
-        sbbq	%rdx, %r11
-        # Sub
-        subq	(%rsp), %r12
-        movq	$0x00, %rbp
-        sbbq	8(%rsp), %r13
-        movq	$-19, %rax
-        sbbq	16(%rsp), %r14
-        movq	$0x7fffffffffffffff, %rdx
-        sbbq	24(%rsp), %r15
-        sbbq	$0x00, %rbp
-        #   Mask the modulus
-        andq	%rbp, %rax
-        andq	%rbp, %rdx
-        #   Add modulus (if underflow)
-        addq	%rax, %r12
-        adcq	%rbp, %r13
-        adcq	%rbp, %r14
-        adcq	%rdx, %r15
-        movq	%rcx, 64(%rsp)
-        movq	%r9, 72(%rsp)
-        movq	%r10, 80(%rsp)
-        movq	%r11, 88(%rsp)
-        movq	%r12, (%rsp)
-        movq	%r13, 8(%rsp)
-        movq	%r14, 16(%rsp)
-        movq	%r15, 24(%rsp)
-        # Multiply
-        #  A[0] * B[0]
-        movq	96(%rsp), %rax
-        mulq	128(%rsp)
-        movq	%rax, %rcx
-        movq	%rdx, %r9
-        #  A[0] * B[1]
-        movq	104(%rsp), %rax
-        mulq	128(%rsp)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[1] * B[0]
-        movq	96(%rsp), %rax
-        mulq	136(%rsp)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        adcq	$0x00, %r11
-        #  A[0] * B[2]
-        movq	112(%rsp), %rax
-        mulq	128(%rsp)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * B[1]
-        movq	104(%rsp), %rax
-        mulq	136(%rsp)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[2] * B[0]
-        movq	96(%rsp), %rax
-        mulq	144(%rsp)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[0] * B[3]
-        movq	120(%rsp), %rax
-        mulq	128(%rsp)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[2]
-        movq	112(%rsp), %rax
-        mulq	136(%rsp)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[2] * B[1]
-        movq	104(%rsp), %rax
-        mulq	144(%rsp)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[3] * B[0]
-        movq	96(%rsp), %rax
-        mulq	152(%rsp)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[3]
-        movq	120(%rsp), %rax
-        mulq	136(%rsp)
-        xorq	%r14, %r14
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[2]
-        movq	112(%rsp), %rax
-        mulq	144(%rsp)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[3] * B[1]
-        movq	104(%rsp), %rax
-        mulq	152(%rsp)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[3]
-        movq	120(%rsp), %rax
-        mulq	144(%rsp)
-        xorq	%r15, %r15
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[2]
-        movq	112(%rsp), %rax
-        mulq	152(%rsp)
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[3]
-        movq	120(%rsp), %rax
-        mulq	152(%rsp)
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rbp
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rbp, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %rcx
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add remaining product results in
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rbp, %r11
-        addq	%rax, %rcx
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rbp, %r11
-        addq	%rdx, %rcx
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%rcx, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        # Sub
-        movq	128(%rsp), %rcx
-        movq	136(%rsp), %r9
-        movq	144(%rsp), %r10
-        movq	152(%rsp), %r11
-        subq	96(%rsp), %rcx
-        movq	$0x00, %rbp
-        sbbq	104(%rsp), %r9
-        movq	$-19, %rax
-        sbbq	112(%rsp), %r10
-        movq	$0x7fffffffffffffff, %rdx
-        sbbq	120(%rsp), %r11
-        sbbq	$0x00, %rbp
-        #   Mask the modulus
-        andq	%rbp, %rax
-        andq	%rbp, %rdx
-        #   Add modulus (if underflow)
-        addq	%rax, %rcx
-        adcq	%rbp, %r9
-        adcq	%rbp, %r10
-        adcq	%rdx, %r11
-        movq	%rcx, 128(%rsp)
-        movq	%r9, 136(%rsp)
-        movq	%r10, 144(%rsp)
-        movq	%r11, 152(%rsp)
-        # Square
-        #  A[0] * A[1]
-        movq	(%rsp), %rax
-        mulq	8(%rsp)
-        movq	%rax, %r9
-        movq	%rdx, %r10
-        #  A[0] * A[2]
-        movq	(%rsp), %rax
-        mulq	16(%rsp)
-        xorq	%r11, %r11
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[0] * A[3]
-        movq	(%rsp), %rax
-        mulq	24(%rsp)
-        xorq	%r12, %r12
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        #  A[1] * A[2]
-        movq	8(%rsp), %rax
-        mulq	16(%rsp)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * A[3]
-        movq	8(%rsp), %rax
-        mulq	24(%rsp)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        #  A[2] * A[3]
-        movq	16(%rsp), %rax
-        mulq	24(%rsp)
-        xorq	%r14, %r14
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        # Double
-        xorq	%r15, %r15
-        addq	%r9, %r9
-        adcq	%r10, %r10
-        adcq	%r11, %r11
-        adcq	%r12, %r12
-        adcq	%r13, %r13
-        adcq	%r14, %r14
-        adcq	$0x00, %r15
-        #  A[0] * A[0]
-        movq	(%rsp), %rax
-        mulq	%rax
-        movq	%rax, %rcx
-        movq	%rdx, %rbp
-        #  A[1] * A[1]
-        movq	8(%rsp), %rax
-        mulq	%rax
-        addq	%rbp, %r9
-        adcq	%rax, %r10
-        adcq	$0x00, %rdx
-        movq	%rdx, %rbp
-        #  A[2] * A[2]
-        movq	16(%rsp), %rax
-        mulq	%rax
-        addq	%rbp, %r11
-        adcq	%rax, %r12
-        adcq	$0x00, %rdx
-        movq	%rdx, %rbp
-        #  A[3] * A[3]
-        movq	24(%rsp), %rax
-        mulq	%rax
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        addq	%rbp, %r13
-        adcq	$0x00, %r14
-        adcq	$0x00, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rbp
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rbp, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %rcx
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add remaining product results in
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rbp, %r11
-        addq	%rax, %rcx
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rbp, %r11
-        addq	%rdx, %rcx
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%rcx, (%rsp)
-        movq	%r9, 8(%rsp)
-        movq	%r10, 16(%rsp)
-        movq	%r11, 24(%rsp)
-        # Multiply by 121666
-        movq	$0x1db42, %rax
-        mulq	128(%rsp)
-        xorq	%r10, %r10
-        movq	%rax, %rcx
-        movq	%rdx, %r9
-        movq	$0x1db42, %rax
-        mulq	136(%rsp)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        movq	$0x1db42, %rax
-        mulq	144(%rsp)
-        xorq	%r13, %r13
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        movq	$0x1db42, %rax
-        mulq	152(%rsp)
-        movq	$0x7fffffffffffffff, %r12
-        addq	%rax, %r11
-        adcq	%rdx, %r13
-        shldq	$0x01, %r11, %r13
-        andq	%r12, %r11
-        movq	$19, %rax
-        mulq	%r13
-        addq	%rax, %rcx
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        movq	%rcx, 32(%rsp)
-        movq	%r9, 40(%rsp)
-        movq	%r10, 48(%rsp)
-        movq	%r11, 56(%rsp)
-        # Square
-        #  A[0] * A[1]
-        movq	64(%rsp), %rax
-        mulq	72(%rsp)
-        movq	%rax, %r9
-        movq	%rdx, %r10
-        #  A[0] * A[2]
-        movq	64(%rsp), %rax
-        mulq	80(%rsp)
-        xorq	%r11, %r11
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[0] * A[3]
-        movq	64(%rsp), %rax
-        mulq	88(%rsp)
-        xorq	%r12, %r12
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        #  A[1] * A[2]
-        movq	72(%rsp), %rax
-        mulq	80(%rsp)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * A[3]
-        movq	72(%rsp), %rax
-        mulq	88(%rsp)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        #  A[2] * A[3]
-        movq	80(%rsp), %rax
-        mulq	88(%rsp)
-        xorq	%r14, %r14
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        # Double
-        xorq	%r15, %r15
-        addq	%r9, %r9
-        adcq	%r10, %r10
-        adcq	%r11, %r11
-        adcq	%r12, %r12
-        adcq	%r13, %r13
-        adcq	%r14, %r14
-        adcq	$0x00, %r15
-        #  A[0] * A[0]
-        movq	64(%rsp), %rax
-        mulq	%rax
-        movq	%rax, %rcx
-        movq	%rdx, %rbp
-        #  A[1] * A[1]
-        movq	72(%rsp), %rax
-        mulq	%rax
-        addq	%rbp, %r9
-        adcq	%rax, %r10
-        adcq	$0x00, %rdx
-        movq	%rdx, %rbp
-        #  A[2] * A[2]
-        movq	80(%rsp), %rax
-        mulq	%rax
-        addq	%rbp, %r11
-        adcq	%rax, %r12
-        adcq	$0x00, %rdx
-        movq	%rdx, %rbp
-        #  A[3] * A[3]
-        movq	88(%rsp), %rax
-        mulq	%rax
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        addq	%rbp, %r13
-        adcq	$0x00, %r14
-        adcq	$0x00, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rbp
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rbp, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %rcx
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add remaining product results in
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rbp, %r11
-        addq	%rax, %rcx
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rbp, %r11
-        addq	%rdx, %rcx
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%rcx, 64(%rsp)
-        movq	%r9, 72(%rsp)
-        movq	%r10, 80(%rsp)
-        movq	%r11, 88(%rsp)
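-        # Note: a square needs only the upper triangle of products: the
-        # A[i]*A[j] (i < j) terms are computed once, doubled by the carry
-        # chain under "Double", and the diagonal A[i]*A[i] terms added in
-        # afterwards, since (sum a_i*2^(64i))^2 doubles every cross term.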
-        # Add
-        movq	96(%rsp), %rcx
-        movq	104(%rsp), %r9
-        addq	32(%rsp), %rcx
-        movq	112(%rsp), %r10
-        adcq	40(%rsp), %r9
-        movq	120(%rsp), %rbp
-        adcq	48(%rsp), %r10
-        movq	$-19, %rax
-        adcq	56(%rsp), %rbp
-        movq	$0x7fffffffffffffff, %rdx
-        movq	%rbp, %r11
-        sarq	$63, %rbp
-        #   Mask the modulus
-        andq	%rbp, %rax
-        andq	%rbp, %rdx
-        #   Sub modulus (if overflow)
-        subq	%rax, %rcx
-        sbbq	%rbp, %r9
-        sbbq	%rbp, %r10
-        sbbq	%rdx, %r11
-        movq	%rcx, 96(%rsp)
-        movq	%r9, 104(%rsp)
-        movq	%r10, 112(%rsp)
-        movq	%r11, 120(%rsp)
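-        # Note: the addition above is kept reduced without a branch:
-        # sarq $63 smears bit 255 of the sum into an all-ones/all-zeros
-        # mask, which gates the subtraction of p = 2^255 - 19 (limbs -19,
-        # -1, -1, 0x7fffffffffffffff), so the fix-up is constant time.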
-        # Multiply
-        #  A[0] * B[0]
-        movq	(%rsp), %rax
-        mulq	(%r8)
-        movq	%rax, %rcx
-        movq	%rdx, %r9
-        #  A[0] * B[1]
-        movq	8(%rsp), %rax
-        mulq	(%r8)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[1] * B[0]
-        movq	(%rsp), %rax
-        mulq	8(%r8)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        adcq	$0x00, %r11
-        #  A[0] * B[2]
-        movq	16(%rsp), %rax
-        mulq	(%r8)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * B[1]
-        movq	8(%rsp), %rax
-        mulq	8(%r8)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[2] * B[0]
-        movq	(%rsp), %rax
-        mulq	16(%r8)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[0] * B[3]
-        movq	24(%rsp), %rax
-        mulq	(%r8)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[2]
-        movq	16(%rsp), %rax
-        mulq	8(%r8)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[2] * B[1]
-        movq	8(%rsp), %rax
-        mulq	16(%r8)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[3] * B[0]
-        movq	(%rsp), %rax
-        mulq	24(%r8)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[3]
-        movq	24(%rsp), %rax
-        mulq	8(%r8)
-        xorq	%r14, %r14
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[2]
-        movq	16(%rsp), %rax
-        mulq	16(%r8)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[3] * B[1]
-        movq	8(%rsp), %rax
-        mulq	24(%r8)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[3]
-        movq	24(%rsp), %rax
-        mulq	16(%r8)
-        xorq	%r15, %r15
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[2]
-        movq	16(%rsp), %rax
-        mulq	24(%r8)
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[3]
-        movq	24(%rsp), %rax
-        mulq	24(%r8)
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rbp
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rbp, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %rcx
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add remaining product results in
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rbp, %r11
-        addq	%rax, %rcx
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rbp, %r11
-        addq	%rdx, %rcx
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%rcx, 32(%rsp)
-        movq	%r9, 40(%rsp)
-        movq	%r10, 48(%rsp)
-        movq	%r11, 56(%rsp)
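-        # Note: each Reduce block relies on 2^255 == 19 (mod 2^255 - 19).
-        # Writing the 512-bit product as hi*2^255 + lo (the shldq chain
-        # peels off hi), the product is congruent to lo + 19*hi; the
-        # small carry that sum can produce is folded the same way once
-        # more.  For example, 2^255 + 5 reduces to 19*1 + 5 = 24.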
-        # Multiply
-        #  A[0] * B[0]
-        movq	96(%rsp), %rax
-        mulq	128(%rsp)
-        movq	%rax, %rcx
-        movq	%rdx, %r9
-        #  A[0] * B[1]
-        movq	104(%rsp), %rax
-        mulq	128(%rsp)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[1] * B[0]
-        movq	96(%rsp), %rax
-        mulq	136(%rsp)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        adcq	$0x00, %r11
-        #  A[0] * B[2]
-        movq	112(%rsp), %rax
-        mulq	128(%rsp)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * B[1]
-        movq	104(%rsp), %rax
-        mulq	136(%rsp)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[2] * B[0]
-        movq	96(%rsp), %rax
-        mulq	144(%rsp)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[0] * B[3]
-        movq	120(%rsp), %rax
-        mulq	128(%rsp)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[2]
-        movq	112(%rsp), %rax
-        mulq	136(%rsp)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[2] * B[1]
-        movq	104(%rsp), %rax
-        mulq	144(%rsp)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[3] * B[0]
-        movq	96(%rsp), %rax
-        mulq	152(%rsp)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[3]
-        movq	120(%rsp), %rax
-        mulq	136(%rsp)
-        xorq	%r14, %r14
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[2]
-        movq	112(%rsp), %rax
-        mulq	144(%rsp)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[3] * B[1]
-        movq	104(%rsp), %rax
-        mulq	152(%rsp)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[3]
-        movq	120(%rsp), %rax
-        mulq	144(%rsp)
-        xorq	%r15, %r15
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[2]
-        movq	112(%rsp), %rax
-        mulq	152(%rsp)
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[3]
-        movq	120(%rsp), %rax
-        mulq	152(%rsp)
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rbp
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rbp, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %rcx
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add remaining product results in
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rbp, %r11
-        addq	%rax, %rcx
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rbp, %r11
-        addq	%rdx, %rcx
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%rcx, (%rsp)
-        movq	%r9, 8(%rsp)
-        movq	%r10, 16(%rsp)
-        movq	%r11, 24(%rsp)
-        decb	168(%rsp)
-        jge	L_curve25519_x64_bits
-        movq	$63, 168(%rsp)
-        decb	160(%rsp)
-        jge	L_curve25519_x64_words
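-        # Note: two counters drive the ladder loop above: the byte at
-        # 168(%rsp) indexes the bit inside the current scalar word
-        # (reloaded with 63 on underflow) and the byte at 160(%rsp)
-        # indexes the word, so the Montgomery ladder consumes the scalar
-        # bits from the top word downward.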
-        # Invert
-        leaq	32(%rsp), %rdi
-        movq	%rsp, %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        movq	%rsp, %rsi
-        leaq	64(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-        leaq	64(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        leaq	96(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-        movq	$4, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_x64@plt
-#else
-        callq	_fe_sq_n_x64
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-        leaq	64(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-        movq	$9, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_x64@plt
-#else
-        callq	_fe_sq_n_x64
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-        leaq	64(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        leaq	128(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	128(%rsp), %rdi
-        leaq	128(%rsp), %rsi
-        movq	$19, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_x64@plt
-#else
-        callq	_fe_sq_n_x64
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	128(%rsp), %rsi
-        leaq	96(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-        movq	$9, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_x64@plt
-#else
-        callq	_fe_sq_n_x64
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-        leaq	64(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-        movq	$49, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_x64@plt
-#else
-        callq	_fe_sq_n_x64
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-        leaq	64(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        leaq	128(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	128(%rsp), %rdi
-        leaq	128(%rsp), %rsi
-        movq	$0x63, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_x64@plt
-#else
-        callq	_fe_sq_n_x64
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	128(%rsp), %rsi
-        leaq	96(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-        movq	$49, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_x64@plt
-#else
-        callq	_fe_sq_n_x64
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-        leaq	64(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        movq	$4, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_x64@plt
-#else
-        callq	_fe_sq_n_x64
-#endif /* __APPLE__ */
-        movq	%rsp, %rdi
-        leaq	64(%rsp), %rsi
-        leaq	32(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
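-        # Note: the call sequence above is Fermat inversion, raising the
-        # ladder's z to p - 2 = 2^255 - 21 with a fixed square-and-multiply
-        # addition chain (fe_sq_n_x64 runs of 4, 9, 19, 9, 49, 99 (0x63),
-        # 49 and 4 squarings); the fixed chain keeps the modular inverse,
-        # and hence the x/z -> affine conversion below, constant time.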
-        movq	176(%rsp), %rdi
-        # Multiply
-        #  A[0] * B[0]
-        movq	(%rsp), %rax
-        mulq	(%rdi)
-        movq	%rax, %rcx
-        movq	%rdx, %r9
-        #  A[0] * B[1]
-        movq	8(%rsp), %rax
-        mulq	(%rdi)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[1] * B[0]
-        movq	(%rsp), %rax
-        mulq	8(%rdi)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        adcq	$0x00, %r11
-        #  A[0] * B[2]
-        movq	16(%rsp), %rax
-        mulq	(%rdi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * B[1]
-        movq	8(%rsp), %rax
-        mulq	8(%rdi)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[2] * B[0]
-        movq	(%rsp), %rax
-        mulq	16(%rdi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[0] * B[3]
-        movq	24(%rsp), %rax
-        mulq	(%rdi)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[2]
-        movq	16(%rsp), %rax
-        mulq	8(%rdi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[2] * B[1]
-        movq	8(%rsp), %rax
-        mulq	16(%rdi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[3] * B[0]
-        movq	(%rsp), %rax
-        mulq	24(%rdi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[3]
-        movq	24(%rsp), %rax
-        mulq	8(%rdi)
-        xorq	%r14, %r14
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[2]
-        movq	16(%rsp), %rax
-        mulq	16(%rdi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[3] * B[1]
-        movq	8(%rsp), %rax
-        mulq	24(%rdi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[3]
-        movq	24(%rsp), %rax
-        mulq	16(%rdi)
-        xorq	%r15, %r15
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[2]
-        movq	16(%rsp), %rax
-        mulq	24(%rdi)
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[3]
-        movq	24(%rsp), %rax
-        mulq	24(%rdi)
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rbp
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rbp, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %rcx
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add remaining product results in
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rbp, %r11
-        addq	%rax, %rcx
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rbp, %r11
-        addq	%rdx, %rcx
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
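-        # Note (final canonical reduction): the trial chain below adds 19
-        # and ripples the carry through the upper limbs; the sign bit of
-        # the last trial limb then says, branch-free, whether the value
-        # is >= p.  If so, 19 is added for real and bit 255 masked off,
-        # which is exactly a subtraction of p = 2^255 - 19.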
-        movq	%rcx, %rax
-        addq	$19, %rax
-        movq	%r9, %rax
-        adcq	$0x00, %rax
-        movq	%r10, %rax
-        adcq	$0x00, %rax
-        movq	%r11, %rax
-        adcq	$0x00, %rax
-        sarq	$63, %rax
-        andq	$19, %rax
-        addq	%rax, %rcx
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        andq	%rbp, %r11
-        # Store
-        movq	%rcx, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        xorq	%rax, %rax
-        addq	$0xb8, %rsp
-        popq	%rbp
-        popq	%rbx
-        popq	%r15
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        repz retq
-#ifndef __APPLE__
-.size	curve25519_x64,.-curve25519_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_pow22523_x64
-.type	fe_pow22523_x64,@function
-.align	16
-fe_pow22523_x64:
-#else
-.section	__TEXT,__text
-.globl	_fe_pow22523_x64
-.p2align	4
-_fe_pow22523_x64:
-#endif /* __APPLE__ */
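-        # Note: fe_pow22523_x64 raises its input to (p - 5) / 8
-        # = 2^252 - 3 with the same style of fixed addition chain as the
-        # inversion in curve25519_x64; ed25519 point decompression uses
-        # this power to form the square-root candidate
-        # x = u*v^3*(u*v^7)^((p-5)/8).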
-        subq	$0x70, %rsp
-        # pow22523
-        movq	%rdi, 96(%rsp)
-        movq	%rsi, 104(%rsp)
-        movq	%rsp, %rdi
-        movq	104(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        movq	%rsp, %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        movq	104(%rsp), %rsi
-        leaq	32(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        movq	%rsp, %rdi
-        movq	%rsp, %rsi
-        leaq	32(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        movq	%rsp, %rdi
-        movq	%rsp, %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        movq	%rsp, %rdi
-        leaq	32(%rsp), %rsi
-        movq	%rsp, %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        movq	%rsp, %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-        movq	$4, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_x64@plt
-#else
-        callq	_fe_sq_n_x64
-#endif /* __APPLE__ */
-        movq	%rsp, %rdi
-        leaq	32(%rsp), %rsi
-        movq	%rsp, %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        movq	%rsp, %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-        movq	$9, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_x64@plt
-#else
-        callq	_fe_sq_n_x64
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-        movq	%rsp, %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        movq	$19, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_x64@plt
-#else
-        callq	_fe_sq_n_x64
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        leaq	32(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-        movq	$9, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_x64@plt
-#else
-        callq	_fe_sq_n_x64
-#endif /* __APPLE__ */
-        movq	%rsp, %rdi
-        leaq	32(%rsp), %rsi
-        movq	%rsp, %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        movq	%rsp, %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-        movq	$49, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_x64@plt
-#else
-        callq	_fe_sq_n_x64
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-        movq	%rsp, %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        movq	$0x63, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_x64@plt
-#else
-        callq	_fe_sq_n_x64
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        leaq	32(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-        movq	$49, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_x64@plt
-#else
-        callq	_fe_sq_n_x64
-#endif /* __APPLE__ */
-        movq	%rsp, %rdi
-        leaq	32(%rsp), %rsi
-        movq	%rsp, %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        movq	%rsp, %rdi
-        movq	%rsp, %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        movq	%rsp, %rdi
-        movq	%rsp, %rsi
-#ifndef __APPLE__
-        callq	fe_sq_x64@plt
-#else
-        callq	_fe_sq_x64
-#endif /* __APPLE__ */
-        movq	96(%rsp), %rdi
-        movq	%rsp, %rsi
-        movq	104(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_x64@plt
-#else
-        callq	_fe_mul_x64
-#endif /* __APPLE__ */
-        movq	104(%rsp), %rsi
-        movq	96(%rsp), %rdi
-        addq	$0x70, %rsp
-        repz retq
-#ifndef __APPLE__
-.size	fe_pow22523_x64,.-fe_pow22523_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_ge_to_p2_x64
-.type	fe_ge_to_p2_x64,@function
-.align	16
-fe_ge_to_p2_x64:
-#else
-.section	__TEXT,__text
-.globl	_fe_ge_to_p2_x64
-.p2align	4
-_fe_ge_to_p2_x64:
-#endif /* __APPLE__ */
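-        # Note: converts a point from the completed "p1p1" representation
-        # to projective "p2" coordinates with three field multiplications;
-        # with the argument order (rx, ry, rz, px, py, pz, pt) this is
-        # rx = px*pt, ry = py*pz, rz = pz*pt.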
-        pushq	%rbx
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        pushq	%r15
-        subq	$40, %rsp
-        movq	%rsi, (%rsp)
-        movq	%rdx, 8(%rsp)
-        movq	%rcx, 16(%rsp)
-        movq	%r8, 24(%rsp)
-        movq	%r9, 32(%rsp)
-        movq	16(%rsp), %rsi
-        movq	88(%rsp), %rbx
-        # Multiply
-        #  A[0] * B[0]
-        movq	(%rbx), %rax
-        mulq	(%rsi)
-        movq	%rax, %r8
-        movq	%rdx, %r9
-        #  A[0] * B[1]
-        movq	8(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[1] * B[0]
-        movq	(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        adcq	$0x00, %r11
-        #  A[0] * B[2]
-        movq	16(%rbx), %rax
-        mulq	(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * B[1]
-        movq	8(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[2] * B[0]
-        movq	(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[0] * B[3]
-        movq	24(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[2]
-        movq	16(%rbx), %rax
-        mulq	8(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[2] * B[1]
-        movq	8(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[3] * B[0]
-        movq	(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[3]
-        movq	24(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r14, %r14
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[2]
-        movq	16(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[3] * B[1]
-        movq	8(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[3]
-        movq	24(%rbx), %rax
-        mulq	16(%rsi)
-        xorq	%r15, %r15
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[2]
-        movq	16(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[3]
-        movq	24(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %r8
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add remaining product results in
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	(%rsp), %rdi
-        movq	24(%rsp), %rsi
-        movq	32(%rsp), %rbx
-        # Multiply
-        #  A[0] * B[0]
-        movq	(%rbx), %rax
-        mulq	(%rsi)
-        movq	%rax, %r8
-        movq	%rdx, %r9
-        #  A[0] * B[1]
-        movq	8(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[1] * B[0]
-        movq	(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        adcq	$0x00, %r11
-        #  A[0] * B[2]
-        movq	16(%rbx), %rax
-        mulq	(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * B[1]
-        movq	8(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[2] * B[0]
-        movq	(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[0] * B[3]
-        movq	24(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[2]
-        movq	16(%rbx), %rax
-        mulq	8(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[2] * B[1]
-        movq	8(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[3] * B[0]
-        movq	(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[3]
-        movq	24(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r14, %r14
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[2]
-        movq	16(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[3] * B[1]
-        movq	8(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[3]
-        movq	24(%rbx), %rax
-        mulq	16(%rsi)
-        xorq	%r15, %r15
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[2]
-        movq	16(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[3]
-        movq	24(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %r8
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add remaining product results in
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	8(%rsp), %rdi
-        movq	32(%rsp), %rsi
-        movq	88(%rsp), %rbx
-        # Multiply
-        #  A[0] * B[0]
-        movq	(%rbx), %rax
-        mulq	(%rsi)
-        movq	%rax, %r8
-        movq	%rdx, %r9
-        #  A[0] * B[1]
-        movq	8(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[1] * B[0]
-        movq	(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        adcq	$0x00, %r11
-        #  A[0] * B[2]
-        movq	16(%rbx), %rax
-        mulq	(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * B[1]
-        movq	8(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[2] * B[0]
-        movq	(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[0] * B[3]
-        movq	24(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[2]
-        movq	16(%rbx), %rax
-        mulq	8(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[2] * B[1]
-        movq	8(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[3] * B[0]
-        movq	(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[3]
-        movq	24(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r14, %r14
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[2]
-        movq	16(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[3] * B[1]
-        movq	8(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[3]
-        movq	24(%rbx), %rax
-        mulq	16(%rsi)
-        xorq	%r15, %r15
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[2]
-        movq	16(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[3]
-        movq	24(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %r8
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add remaining product results in
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        addq	$40, %rsp
-        popq	%r15
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        popq	%rbx
-        repz retq
-#ifndef __APPLE__
-.size	fe_ge_to_p2_x64,.-fe_ge_to_p2_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_ge_to_p3_x64
-.type	fe_ge_to_p3_x64,@function
-.align	16
-fe_ge_to_p3_x64:
-#else
-.section	__TEXT,__text
-.globl	_fe_ge_to_p3_x64
-.p2align	4
-_fe_ge_to_p3_x64:
-#endif /* __APPLE__ */
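-        # Note: like fe_ge_to_p2_x64 but targeting the extended "p3" form
-        # (argument order (rx, ry, rz, rt, px, py, pz, pt)): four field
-        # multiplications give rx = px*pt, ry = py*pz, rz = pz*pt and
-        # rt = px*py.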
-        pushq	%rbx
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        pushq	%r15
-        subq	$40, %rsp
-        movq	%rsi, (%rsp)
-        movq	%rdx, 8(%rsp)
-        movq	%rcx, 16(%rsp)
-        movq	%r8, 24(%rsp)
-        movq	%r9, 32(%rsp)
-        movq	24(%rsp), %rsi
-        movq	96(%rsp), %rbx
-        # Multiply
-        #  A[0] * B[0]
-        movq	(%rbx), %rax
-        mulq	(%rsi)
-        movq	%rax, %r8
-        movq	%rdx, %r9
-        #  A[0] * B[1]
-        movq	8(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[1] * B[0]
-        movq	(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        adcq	$0x00, %r11
-        #  A[0] * B[2]
-        movq	16(%rbx), %rax
-        mulq	(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * B[1]
-        movq	8(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[2] * B[0]
-        movq	(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[0] * B[3]
-        movq	24(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[2]
-        movq	16(%rbx), %rax
-        mulq	8(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[2] * B[1]
-        movq	8(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[3] * B[0]
-        movq	(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[3]
-        movq	24(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r14, %r14
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[2]
-        movq	16(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[3] * B[1]
-        movq	8(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[3]
-        movq	24(%rbx), %rax
-        mulq	16(%rsi)
-        xorq	%r15, %r15
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[2]
-        movq	16(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[3]
-        movq	24(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %r8
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add remaining product results in
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	(%rsp), %rdi
-        movq	32(%rsp), %rsi
-        movq	88(%rsp), %rbx
-        # Multiply
-        #  A[0] * B[0]
-        movq	(%rbx), %rax
-        mulq	(%rsi)
-        movq	%rax, %r8
-        movq	%rdx, %r9
-        #  A[0] * B[1]
-        movq	8(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[1] * B[0]
-        movq	(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        adcq	$0x00, %r11
-        #  A[0] * B[2]
-        movq	16(%rbx), %rax
-        mulq	(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * B[1]
-        movq	8(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[2] * B[0]
-        movq	(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[0] * B[3]
-        movq	24(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[2]
-        movq	16(%rbx), %rax
-        mulq	8(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[2] * B[1]
-        movq	8(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[3] * B[0]
-        movq	(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[3]
-        movq	24(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r14, %r14
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[2]
-        movq	16(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[3] * B[1]
-        movq	8(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[3]
-        movq	24(%rbx), %rax
-        mulq	16(%rsi)
-        xorq	%r15, %r15
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[2]
-        movq	16(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[3]
-        movq	24(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %r8
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add remaining product results in
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	8(%rsp), %rdi
-        movq	88(%rsp), %rsi
-        movq	96(%rsp), %rbx
-        # Multiply
-        #  A[0] * B[0]
-        movq	(%rbx), %rax
-        mulq	(%rsi)
-        movq	%rax, %r8
-        movq	%rdx, %r9
-        #  A[0] * B[1]
-        movq	8(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[1] * B[0]
-        movq	(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        adcq	$0x00, %r11
-        #  A[0] * B[2]
-        movq	16(%rbx), %rax
-        mulq	(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * B[1]
-        movq	8(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[2] * B[0]
-        movq	(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[0] * B[3]
-        movq	24(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[2]
-        movq	16(%rbx), %rax
-        mulq	8(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[2] * B[1]
-        movq	8(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[3] * B[0]
-        movq	(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[3]
-        movq	24(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r14, %r14
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[2]
-        movq	16(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[3] * B[1]
-        movq	8(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[3]
-        movq	24(%rbx), %rax
-        mulq	16(%rsi)
-        xorq	%r15, %r15
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[2]
-        movq	16(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[3]
-        movq	24(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %r8
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add remaining product results in
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	16(%rsp), %rdi
-        movq	24(%rsp), %rsi
-        movq	32(%rsp), %rbx
-        # Multiply
-        #  A[0] * B[0]
-        movq	(%rbx), %rax
-        mulq	(%rsi)
-        movq	%rax, %r8
-        movq	%rdx, %r9
-        #  A[0] * B[1]
-        movq	8(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[1] * B[0]
-        movq	(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        adcq	$0x00, %r11
-        #  A[0] * B[2]
-        movq	16(%rbx), %rax
-        mulq	(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * B[1]
-        movq	8(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[2] * B[0]
-        movq	(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[0] * B[3]
-        movq	24(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[2]
-        movq	16(%rbx), %rax
-        mulq	8(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[2] * B[1]
-        movq	8(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[3] * B[0]
-        movq	(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[3]
-        movq	24(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r14, %r14
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[2]
-        movq	16(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[3] * B[1]
-        movq	8(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[3]
-        movq	24(%rbx), %rax
-        mulq	16(%rsi)
-        xorq	%r15, %r15
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[2]
-        movq	16(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[3]
-        movq	24(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %r8
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add remaining product results in
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        addq	$40, %rsp
-        popq	%r15
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        popq	%rbx
-        repz retq
-#ifndef __APPLE__
-.size	fe_ge_to_p3_x64,.-fe_ge_to_p3_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_ge_dbl_x64
-.type	fe_ge_dbl_x64,@function
-.align	16
-fe_ge_dbl_x64:
-#else
-.section	__TEXT,__text
-.globl	_fe_ge_dbl_x64
-.p2align	4
-_fe_ge_dbl_x64:
-#endif /* __APPLE__ */
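-        # Note: point doubling for the ed25519 group; the first two Square
-        # blocks compute px^2 (into the rx output) and py^2 (into rz),
-        # the opening terms of the standard twisted Edwards doubling
-        # formulas, and the rest of the routine continues with the same
-        # square/multiply/reduce pattern.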
-        pushq	%rbx
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        pushq	%r15
-        subq	$0x50, %rsp
-        movq	%rdi, (%rsp)
-        movq	%rsi, 8(%rsp)
-        movq	%rdx, 16(%rsp)
-        movq	%rcx, 24(%rsp)
-        movq	%r8, 32(%rsp)
-        movq	%r9, 40(%rsp)
-        movq	(%rsp), %rdi
-        movq	32(%rsp), %rsi
-        # Square
-        #  A[0] * A[1]
-        movq	(%rsi), %rax
-        mulq	8(%rsi)
-        movq	%rax, %r9
-        movq	%rdx, %r10
-        #  A[0] * A[2]
-        movq	(%rsi), %rax
-        mulq	16(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[0] * A[3]
-        movq	(%rsi), %rax
-        mulq	24(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        #  A[1] * A[2]
-        movq	8(%rsi), %rax
-        mulq	16(%rsi)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * A[3]
-        movq	8(%rsi), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        #  A[2] * A[3]
-        movq	16(%rsi), %rax
-        mulq	24(%rsi)
-        xorq	%r14, %r14
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        # Double
-        xorq	%r15, %r15
-        addq	%r9, %r9
-        adcq	%r10, %r10
-        adcq	%r11, %r11
-        adcq	%r12, %r12
-        adcq	%r13, %r13
-        adcq	%r14, %r14
-        adcq	$0x00, %r15
-        #  A[0] * A[0]
-        movq	(%rsi), %rax
-        mulq	%rax
-        movq	%rax, %r8
-        movq	%rdx, %rcx
-        #  A[1] * A[1]
-        movq	8(%rsi), %rax
-        mulq	%rax
-        addq	%rcx, %r9
-        adcq	%rax, %r10
-        adcq	$0x00, %rdx
-        movq	%rdx, %rcx
-        #  A[2] * A[2]
-        movq	16(%rsi), %rax
-        mulq	%rax
-        addq	%rcx, %r11
-        adcq	%rax, %r12
-        adcq	$0x00, %rdx
-        movq	%rdx, %rcx
-        #  A[3] * A[3]
-        movq	24(%rsi), %rax
-        mulq	%rax
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        addq	%rcx, %r13
-        adcq	$0x00, %r14
-        adcq	$0x00, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %r8
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add remaining product results in
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
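-        #  Bits at and above bit 255 can remain; extract them with shld
-        #  and fold them in as a further multiple of 19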
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
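-        #  Branch-free final step: %rdx becomes 19 exactly when bit 255 is
-        #  still set, so one masked add clears it without a data-dependent
-        #  branch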
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	16(%rsp), %rdi
-        movq	40(%rsp), %rsi
-        # Square
-        #  A[0] * A[1]
-        movq	(%rsi), %rax
-        mulq	8(%rsi)
-        movq	%rax, %r9
-        movq	%rdx, %r10
-        #  A[0] * A[2]
-        movq	(%rsi), %rax
-        mulq	16(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[0] * A[3]
-        movq	(%rsi), %rax
-        mulq	24(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        #  A[1] * A[2]
-        movq	8(%rsi), %rax
-        mulq	16(%rsi)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * A[3]
-        movq	8(%rsi), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        #  A[2] * A[3]
-        movq	16(%rsi), %rax
-        mulq	24(%rsi)
-        xorq	%r14, %r14
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        # Double
-        xorq	%r15, %r15
-        addq	%r9, %r9
-        adcq	%r10, %r10
-        adcq	%r11, %r11
-        adcq	%r12, %r12
-        adcq	%r13, %r13
-        adcq	%r14, %r14
-        adcq	$0x00, %r15
-        #  A[0] * A[0]
-        movq	(%rsi), %rax
-        mulq	%rax
-        movq	%rax, %r8
-        movq	%rdx, %rcx
-        #  A[1] * A[1]
-        movq	8(%rsi), %rax
-        mulq	%rax
-        addq	%rcx, %r9
-        adcq	%rax, %r10
-        adcq	$0x00, %rdx
-        movq	%rdx, %rcx
-        #  A[2] * A[2]
-        movq	16(%rsi), %rax
-        mulq	%rax
-        addq	%rcx, %r11
-        adcq	%rax, %r12
-        adcq	$0x00, %rdx
-        movq	%rdx, %rcx
-        #  A[3] * A[3]
-        movq	24(%rsi), %rax
-        mulq	%rax
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        addq	%rcx, %r13
-        adcq	$0x00, %r14
-        adcq	$0x00, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %r8
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add remaining product results in
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	24(%rsp), %rdi
-        movq	128(%rsp), %rsi
-        # Square * 2
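-        #  Same squaring, but computing 2*a^2: the doubling is folded into
-        #  the reduce below, where every limb is shifted one bit further
-        #  left than in the plain squares above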
-        #  A[0] * A[1]
-        movq	(%rsi), %rax
-        mulq	8(%rsi)
-        movq	%rax, %r9
-        movq	%rdx, %r10
-        #  A[0] * A[2]
-        movq	(%rsi), %rax
-        mulq	16(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[0] * A[3]
-        movq	(%rsi), %rax
-        mulq	24(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        #  A[1] * A[2]
-        movq	8(%rsi), %rax
-        mulq	16(%rsi)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * A[3]
-        movq	8(%rsi), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        #  A[2] * A[3]
-        movq	16(%rsi), %rax
-        mulq	24(%rsi)
-        xorq	%r14, %r14
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        # Double
-        xorq	%r15, %r15
-        addq	%r9, %r9
-        adcq	%r10, %r10
-        adcq	%r11, %r11
-        adcq	%r12, %r12
-        adcq	%r13, %r13
-        adcq	%r14, %r14
-        adcq	$0x00, %r15
-        #  A[0] * A[0]
-        movq	(%rsi), %rax
-        mulq	%rax
-        movq	%rax, %r8
-        movq	%rdx, %rcx
-        #  A[1] * A[1]
-        movq	8(%rsi), %rax
-        mulq	%rax
-        addq	%rcx, %r9
-        adcq	%rax, %r10
-        adcq	$0x00, %rdx
-        movq	%rdx, %rcx
-        #  A[2] * A[2]
-        movq	16(%rsi), %rax
-        mulq	%rax
-        addq	%rcx, %r11
-        adcq	%rax, %r12
-        adcq	$0x00, %rdx
-        movq	%rdx, %rcx
-        #  A[3] * A[3]
-        movq	24(%rsi), %rax
-        mulq	%rax
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        addq	%rcx, %r13
-        adcq	$0x00, %r14
-        adcq	$0x00, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rbx
-        xorq	%rax, %rax
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$3, %r15, %rax
-        shldq	$2, %r14, %r15
-        shldq	$2, %r13, %r14
-        shldq	$2, %r12, %r13
-        shldq	$2, %r11, %r12
-        shldq	$0x01, %r10, %r11
-        shldq	$0x01, %r9, %r10
-        shldq	$0x01, %r8, %r9
-        shlq	$0x01, %r8
-        andq	%rbx, %r11
-        #  Two out left, one in right
-        andq	%rbx, %r15
-        #  Multiply top bits by 19*19
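-        #  (0x169 = 361 = 19^2: these bits have wrapped past 2^255 twice)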
-        imulq	$0x169, %rax, %rcx
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %r8
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add remaining product results in
-        addq	%rcx, %r8
-        adcq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rbx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rbx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	8(%rsp), %rdi
-        movq	32(%rsp), %rsi
-        movq	40(%rsp), %rbx
-        # Add
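-        #  Field addition: a plain 256-bit add, then a branch-free
-        #  conditional subtraction of p keyed off the sign of the top limb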
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        addq	(%rbx), %r8
-        movq	16(%rsi), %r10
-        adcq	8(%rbx), %r9
-        movq	24(%rsi), %rcx
-        adcq	16(%rbx), %r10
-        movq	$-19, %rax
-        adcq	24(%rbx), %rcx
-        movq	$0x7fffffffffffffff, %rdx
-        movq	%rcx, %r11
-        sarq	$63, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Sub modulus (if overflow)
-        subq	%rax, %r8
-        sbbq	%rcx, %r9
-        sbbq	%rcx, %r10
-        sbbq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        leaq	48(%rsp), %rdi
-        movq	8(%rsp), %rsi
-        # Square
-        #  A[0] * A[1]
-        movq	(%rsi), %rax
-        mulq	8(%rsi)
-        movq	%rax, %r9
-        movq	%rdx, %r10
-        #  A[0] * A[2]
-        movq	(%rsi), %rax
-        mulq	16(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[0] * A[3]
-        movq	(%rsi), %rax
-        mulq	24(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        #  A[1] * A[2]
-        movq	8(%rsi), %rax
-        mulq	16(%rsi)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * A[3]
-        movq	8(%rsi), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        #  A[2] * A[3]
-        movq	16(%rsi), %rax
-        mulq	24(%rsi)
-        xorq	%r14, %r14
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        # Double
-        xorq	%r15, %r15
-        addq	%r9, %r9
-        adcq	%r10, %r10
-        adcq	%r11, %r11
-        adcq	%r12, %r12
-        adcq	%r13, %r13
-        adcq	%r14, %r14
-        adcq	$0x00, %r15
-        #  A[0] * A[0]
-        movq	(%rsi), %rax
-        mulq	%rax
-        movq	%rax, %r8
-        movq	%rdx, %rcx
-        #  A[1] * A[1]
-        movq	8(%rsi), %rax
-        mulq	%rax
-        addq	%rcx, %r9
-        adcq	%rax, %r10
-        adcq	$0x00, %rdx
-        movq	%rdx, %rcx
-        #  A[2] * A[2]
-        movq	16(%rsi), %rax
-        mulq	%rax
-        addq	%rcx, %r11
-        adcq	%rax, %r12
-        adcq	$0x00, %rdx
-        movq	%rdx, %rcx
-        #  A[3] * A[3]
-        movq	24(%rsi), %rax
-        mulq	%rax
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        addq	%rcx, %r13
-        adcq	$0x00, %r14
-        adcq	$0x00, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %r8
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add remaining product results in
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	8(%rsp), %rdi
-        movq	16(%rsp), %rsi
-        movq	(%rsp), %rbx
-        # Add
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        addq	(%rbx), %r8
-        movq	16(%rsi), %r10
-        adcq	8(%rbx), %r9
-        movq	24(%rsi), %rcx
-        adcq	16(%rbx), %r10
-        movq	$-19, %rax
-        adcq	24(%rbx), %rcx
-        movq	$0x7fffffffffffffff, %rdx
-        movq	%rcx, %r11
-        sarq	$63, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Sub modulus (if overflow)
-        subq	%rax, %r8
-        sbbq	%rcx, %r9
-        sbbq	%rcx, %r10
-        sbbq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	16(%rsp), %rdi
-        movq	16(%rsp), %rsi
-        movq	(%rsp), %rbx
-        # Sub
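-        #  Field subtraction: 256-bit subtract, then p is added back
-        #  (again branch-free) whenever the final borrow indicates underflow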
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        movq	16(%rsi), %r10
-        movq	24(%rsi), %r11
-        subq	(%rbx), %r8
-        movq	$0x00, %rcx
-        sbbq	8(%rbx), %r9
-        movq	$-19, %rax
-        sbbq	16(%rbx), %r10
-        movq	$0x7fffffffffffffff, %rdx
-        sbbq	24(%rbx), %r11
-        sbbq	$0x00, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Add modulus (if underflow)
-        addq	%rax, %r8
-        adcq	%rcx, %r9
-        adcq	%rcx, %r10
-        adcq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	(%rsp), %rdi
-        leaq	48(%rsp), %rsi
-        movq	8(%rsp), %rbx
-        # Sub
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        movq	16(%rsi), %r10
-        movq	24(%rsi), %r11
-        subq	(%rbx), %r8
-        movq	$0x00, %rcx
-        sbbq	8(%rbx), %r9
-        movq	$-19, %rax
-        sbbq	16(%rbx), %r10
-        movq	$0x7fffffffffffffff, %rdx
-        sbbq	24(%rbx), %r11
-        sbbq	$0x00, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Add modulus (if underflow)
-        addq	%rax, %r8
-        adcq	%rcx, %r9
-        adcq	%rcx, %r10
-        adcq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	24(%rsp), %rdi
-        movq	24(%rsp), %rsi
-        movq	16(%rsp), %rbx
-        # Sub
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        movq	16(%rsi), %r10
-        movq	24(%rsi), %r11
-        subq	(%rbx), %r8
-        movq	$0x00, %rcx
-        sbbq	8(%rbx), %r9
-        movq	$-19, %rax
-        sbbq	16(%rbx), %r10
-        movq	$0x7fffffffffffffff, %rdx
-        sbbq	24(%rbx), %r11
-        sbbq	$0x00, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Add modulus (if underflow)
-        addq	%rax, %r8
-        adcq	%rcx, %r9
-        adcq	%rcx, %r10
-        adcq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        addq	$0x50, %rsp
-        popq	%r15
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        popq	%rbx
-        repz retq
-#ifndef __APPLE__
-.size	fe_ge_dbl_x64,.-fe_ge_dbl_x64
-#endif /* __APPLE__ */
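-# fe_ge_madd_x64 below is presumably the "mixed add" of a group element
-# and a precomputed point: the same field add/sub/multiply primitives as
-# above, with the precomputed operands fetched from the caller's stack
-# slots at 128-160(%rsp).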
-#ifndef __APPLE__
-.text
-.globl	fe_ge_madd_x64
-.type	fe_ge_madd_x64,@function
-.align	16
-fe_ge_madd_x64:
-#else
-.section	__TEXT,__text
-.globl	_fe_ge_madd_x64
-.p2align	4
-_fe_ge_madd_x64:
-#endif /* __APPLE__ */
-        pushq	%rbx
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        pushq	%r15
-        subq	$0x50, %rsp
-        movq	%rdi, (%rsp)
-        movq	%rsi, 8(%rsp)
-        movq	%rdx, 16(%rsp)
-        movq	%rcx, 24(%rsp)
-        movq	%r8, 32(%rsp)
-        movq	%r9, 40(%rsp)
-        movq	(%rsp), %rdi
-        movq	40(%rsp), %rsi
-        movq	32(%rsp), %rbx
-        # Add
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        addq	(%rbx), %r8
-        movq	16(%rsi), %r10
-        adcq	8(%rbx), %r9
-        movq	24(%rsi), %rcx
-        adcq	16(%rbx), %r10
-        movq	$-19, %rax
-        adcq	24(%rbx), %rcx
-        movq	$0x7fffffffffffffff, %rdx
-        movq	%rcx, %r11
-        sarq	$63, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Sub modulus (if overflow)
-        subq	%rax, %r8
-        sbbq	%rcx, %r9
-        sbbq	%rcx, %r10
-        sbbq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	8(%rsp), %rdi
-        movq	40(%rsp), %rsi
-        movq	32(%rsp), %rbx
-        # Sub
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        movq	16(%rsi), %r10
-        movq	24(%rsi), %r11
-        subq	(%rbx), %r8
-        movq	$0x00, %rcx
-        sbbq	8(%rbx), %r9
-        movq	$-19, %rax
-        sbbq	16(%rbx), %r10
-        movq	$0x7fffffffffffffff, %rdx
-        sbbq	24(%rbx), %r11
-        sbbq	$0x00, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Add modulus (if underflow)
-        addq	%rax, %r8
-        adcq	%rcx, %r9
-        adcq	%rcx, %r10
-        adcq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	16(%rsp), %rdi
-        movq	(%rsp), %rsi
-        movq	152(%rsp), %rbx
-        # Multiply
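-        #  Schoolbook 4x4-limb multiply: all 16 A[i]*B[j] partial products
-        #  are accumulated into %r8-%r15 as a 512-bit result, reduced below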
-        #  A[0] * B[0]
-        movq	(%rbx), %rax
-        mulq	(%rsi)
-        movq	%rax, %r8
-        movq	%rdx, %r9
-        #  A[0] * B[1]
-        movq	8(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[1] * B[0]
-        movq	(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        adcq	$0x00, %r11
-        #  A[0] * B[2]
-        movq	16(%rbx), %rax
-        mulq	(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * B[1]
-        movq	8(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[2] * B[0]
-        movq	(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[0] * B[3]
-        movq	24(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[2]
-        movq	16(%rbx), %rax
-        mulq	8(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[2] * B[1]
-        movq	8(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[3] * B[0]
-        movq	(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[3]
-        movq	24(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r14, %r14
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[2]
-        movq	16(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[3] * B[1]
-        movq	8(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[3]
-        movq	24(%rbx), %rax
-        mulq	16(%rsi)
-        xorq	%r15, %r15
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[2]
-        movq	16(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[3]
-        movq	24(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %r8
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add remaining product results in
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	8(%rsp), %rdi
-        movq	8(%rsp), %rsi
-        movq	160(%rsp), %rbx
-        # Multiply
-        #  A[0] * B[0]
-        movq	(%rbx), %rax
-        mulq	(%rsi)
-        movq	%rax, %r8
-        movq	%rdx, %r9
-        #  A[0] * B[1]
-        movq	8(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[1] * B[0]
-        movq	(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        adcq	$0x00, %r11
-        #  A[0] * B[2]
-        movq	16(%rbx), %rax
-        mulq	(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * B[1]
-        movq	8(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[2] * B[0]
-        movq	(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[0] * B[3]
-        movq	24(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[2]
-        movq	16(%rbx), %rax
-        mulq	8(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[2] * B[1]
-        movq	8(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[3] * B[0]
-        movq	(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[3]
-        movq	24(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r14, %r14
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[2]
-        movq	16(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[3] * B[1]
-        movq	8(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[3]
-        movq	24(%rbx), %rax
-        mulq	16(%rsi)
-        xorq	%r15, %r15
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[2]
-        movq	16(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[3]
-        movq	24(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %r8
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add remaining product results in
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	24(%rsp), %rdi
-        movq	144(%rsp), %rsi
-        movq	136(%rsp), %rbx
-        # Multiply
-        #  A[0] * B[0]
-        movq	(%rbx), %rax
-        mulq	(%rsi)
-        movq	%rax, %r8
-        movq	%rdx, %r9
-        #  A[0] * B[1]
-        movq	8(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[1] * B[0]
-        movq	(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        adcq	$0x00, %r11
-        #  A[0] * B[2]
-        movq	16(%rbx), %rax
-        mulq	(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * B[1]
-        movq	8(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[2] * B[0]
-        movq	(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[0] * B[3]
-        movq	24(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[2]
-        movq	16(%rbx), %rax
-        mulq	8(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[2] * B[1]
-        movq	8(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[3] * B[0]
-        movq	(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[3]
-        movq	24(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r14, %r14
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[2]
-        movq	16(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[3] * B[1]
-        movq	8(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[3]
-        movq	24(%rbx), %rax
-        mulq	16(%rsi)
-        xorq	%r15, %r15
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[2]
-        movq	16(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[3]
-        movq	24(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %r8
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add remaining product results in
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        leaq	48(%rsp), %rdi
-        movq	128(%rsp), %rsi
-        movq	128(%rsp), %rbx
-        # Add
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        addq	(%rbx), %r8
-        movq	16(%rsi), %r10
-        adcq	8(%rbx), %r9
-        movq	24(%rsi), %rcx
-        adcq	16(%rbx), %r10
-        movq	$-19, %rax
-        adcq	24(%rbx), %rcx
-        movq	$0x7fffffffffffffff, %rdx
-        movq	%rcx, %r11
-        sarq	$63, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Sub modulus (if overflow)
-        subq	%rax, %r8
-        sbbq	%rcx, %r9
-        sbbq	%rcx, %r10
-        sbbq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	(%rsp), %rdi
-        movq	16(%rsp), %rsi
-        movq	8(%rsp), %rbx
-        # Sub
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        movq	16(%rsi), %r10
-        movq	24(%rsi), %r11
-        subq	(%rbx), %r8
-        movq	$0x00, %rcx
-        sbbq	8(%rbx), %r9
-        movq	$-19, %rax
-        sbbq	16(%rbx), %r10
-        movq	$0x7fffffffffffffff, %rdx
-        sbbq	24(%rbx), %r11
-        sbbq	$0x00, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Add modulus (if underflow)
-        addq	%rax, %r8
-        adcq	%rcx, %r9
-        adcq	%rcx, %r10
-        adcq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	8(%rsp), %rdi
-        movq	16(%rsp), %rsi
-        movq	8(%rsp), %rbx
-        # Add
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        addq	(%rbx), %r8
-        movq	16(%rsi), %r10
-        adcq	8(%rbx), %r9
-        movq	24(%rsi), %rcx
-        adcq	16(%rbx), %r10
-        movq	$-19, %rax
-        adcq	24(%rbx), %rcx
-        movq	$0x7fffffffffffffff, %rdx
-        movq	%rcx, %r11
-        sarq	$63, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Sub modulus (if overflow)
-        subq	%rax, %r8
-        sbbq	%rcx, %r9
-        sbbq	%rcx, %r10
-        sbbq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	16(%rsp), %rdi
-        leaq	48(%rsp), %rsi
-        movq	24(%rsp), %rbx
-        # Add
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        addq	(%rbx), %r8
-        movq	16(%rsi), %r10
-        adcq	8(%rbx), %r9
-        movq	24(%rsi), %rcx
-        adcq	16(%rbx), %r10
-        movq	$-19, %rax
-        adcq	24(%rbx), %rcx
-        movq	$0x7fffffffffffffff, %rdx
-        movq	%rcx, %r11
-        sarq	$63, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Sub modulus (if overflow)
-        subq	%rax, %r8
-        sbbq	%rcx, %r9
-        sbbq	%rcx, %r10
-        sbbq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	24(%rsp), %rdi
-        leaq	48(%rsp), %rsi
-        movq	24(%rsp), %rbx
-        # Sub
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        movq	16(%rsi), %r10
-        movq	24(%rsi), %r11
-        subq	(%rbx), %r8
-        movq	$0x00, %rcx
-        sbbq	8(%rbx), %r9
-        movq	$-19, %rax
-        sbbq	16(%rbx), %r10
-        movq	$0x7fffffffffffffff, %rdx
-        sbbq	24(%rbx), %r11
-        sbbq	$0x00, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Add modulus (if underflow)
-        addq	%rax, %r8
-        adcq	%rcx, %r9
-        adcq	%rcx, %r10
-        adcq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        addq	$0x50, %rsp
-        popq	%r15
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        popq	%rbx
-        repz retq
-#ifndef __APPLE__
-.size	fe_ge_madd_x64,.-fe_ge_madd_x64
-#endif /* __APPLE__ */
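-# fe_ge_msub_x64 mirrors fe_ge_madd_x64 with the two precomputed
-# multiplicands swapped (160(%rsp) vs 152(%rsp) below) and the trailing
-# add/sub pair exchanged, consistent with subtracting rather than adding
-# the precomputed point.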
-#ifndef __APPLE__
-.text
-.globl	fe_ge_msub_x64
-.type	fe_ge_msub_x64,@function
-.align	16
-fe_ge_msub_x64:
-#else
-.section	__TEXT,__text
-.globl	_fe_ge_msub_x64
-.p2align	4
-_fe_ge_msub_x64:
-#endif /* __APPLE__ */
-        pushq	%rbx
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        pushq	%r15
-        subq	$0x50, %rsp
-        movq	%rdi, (%rsp)
-        movq	%rsi, 8(%rsp)
-        movq	%rdx, 16(%rsp)
-        movq	%rcx, 24(%rsp)
-        movq	%r8, 32(%rsp)
-        movq	%r9, 40(%rsp)
-        movq	(%rsp), %rdi
-        movq	40(%rsp), %rsi
-        movq	32(%rsp), %rbx
-        # Add
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        addq	(%rbx), %r8
-        movq	16(%rsi), %r10
-        adcq	8(%rbx), %r9
-        movq	24(%rsi), %rcx
-        adcq	16(%rbx), %r10
-        movq	$-19, %rax
-        adcq	24(%rbx), %rcx
-        movq	$0x7fffffffffffffff, %rdx
-        movq	%rcx, %r11
-        sarq	$63, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Sub modulus (if overflow)
-        subq	%rax, %r8
-        sbbq	%rcx, %r9
-        sbbq	%rcx, %r10
-        sbbq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	8(%rsp), %rdi
-        movq	40(%rsp), %rsi
-        movq	32(%rsp), %rbx
-        # Sub
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        movq	16(%rsi), %r10
-        movq	24(%rsi), %r11
-        subq	(%rbx), %r8
-        movq	$0x00, %rcx
-        sbbq	8(%rbx), %r9
-        movq	$-19, %rax
-        sbbq	16(%rbx), %r10
-        movq	$0x7fffffffffffffff, %rdx
-        sbbq	24(%rbx), %r11
-        sbbq	$0x00, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Add modulus (if underflow)
-        addq	%rax, %r8
-        adcq	%rcx, %r9
-        adcq	%rcx, %r10
-        adcq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	16(%rsp), %rdi
-        movq	(%rsp), %rsi
-        movq	160(%rsp), %rbx
-        # Multiply
-        #  A[0] * B[0]
-        movq	(%rbx), %rax
-        mulq	(%rsi)
-        movq	%rax, %r8
-        movq	%rdx, %r9
-        #  A[0] * B[1]
-        movq	8(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[1] * B[0]
-        movq	(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        adcq	$0x00, %r11
-        #  A[0] * B[2]
-        movq	16(%rbx), %rax
-        mulq	(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * B[1]
-        movq	8(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[2] * B[0]
-        movq	(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[0] * B[3]
-        movq	24(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[2]
-        movq	16(%rbx), %rax
-        mulq	8(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[2] * B[1]
-        movq	8(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[3] * B[0]
-        movq	(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[3]
-        movq	24(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r14, %r14
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[2]
-        movq	16(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[3] * B[1]
-        movq	8(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[3]
-        movq	24(%rbx), %rax
-        mulq	16(%rsi)
-        xorq	%r15, %r15
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[2]
-        movq	16(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[3]
-        movq	24(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %r8
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add remaining product results in
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	8(%rsp), %rdi
-        movq	8(%rsp), %rsi
-        movq	152(%rsp), %rbx
-        # Multiply
-        #  A[0] * B[0]
-        movq	(%rbx), %rax
-        mulq	(%rsi)
-        movq	%rax, %r8
-        movq	%rdx, %r9
-        #  A[0] * B[1]
-        movq	8(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[1] * B[0]
-        movq	(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        adcq	$0x00, %r11
-        #  A[0] * B[2]
-        movq	16(%rbx), %rax
-        mulq	(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * B[1]
-        movq	8(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[2] * B[0]
-        movq	(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[0] * B[3]
-        movq	24(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[2]
-        movq	16(%rbx), %rax
-        mulq	8(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[2] * B[1]
-        movq	8(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[3] * B[0]
-        movq	(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[3]
-        movq	24(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r14, %r14
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[2]
-        movq	16(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[3] * B[1]
-        movq	8(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[3]
-        movq	24(%rbx), %rax
-        mulq	16(%rsi)
-        xorq	%r15, %r15
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[2]
-        movq	16(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[3]
-        movq	24(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %r8
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add remaining product results in
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	24(%rsp), %rdi
-        movq	144(%rsp), %rsi
-        movq	136(%rsp), %rbx
-        # Multiply
-        #  A[0] * B[0]
-        movq	(%rbx), %rax
-        mulq	(%rsi)
-        movq	%rax, %r8
-        movq	%rdx, %r9
-        #  A[0] * B[1]
-        movq	8(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[1] * B[0]
-        movq	(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        adcq	$0x00, %r11
-        #  A[0] * B[2]
-        movq	16(%rbx), %rax
-        mulq	(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * B[1]
-        movq	8(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[2] * B[0]
-        movq	(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[0] * B[3]
-        movq	24(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[2]
-        movq	16(%rbx), %rax
-        mulq	8(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[2] * B[1]
-        movq	8(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[3] * B[0]
-        movq	(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[3]
-        movq	24(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r14, %r14
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[2]
-        movq	16(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[3] * B[1]
-        movq	8(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[3]
-        movq	24(%rbx), %rax
-        mulq	16(%rsi)
-        xorq	%r15, %r15
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[2]
-        movq	16(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[3]
-        movq	24(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %r8
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add remaining product results in
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        leaq	48(%rsp), %rdi
-        movq	128(%rsp), %rsi
-        movq	128(%rsp), %rbx
-        # Add
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        addq	(%rbx), %r8
-        movq	16(%rsi), %r10
-        adcq	8(%rbx), %r9
-        movq	24(%rsi), %rcx
-        adcq	16(%rbx), %r10
-        movq	$-19, %rax
-        adcq	24(%rbx), %rcx
-        movq	$0x7fffffffffffffff, %rdx
-        movq	%rcx, %r11
-        sarq	$63, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Sub modulus (if overflow)
-        subq	%rax, %r8
-        sbbq	%rcx, %r9
-        sbbq	%rcx, %r10
-        sbbq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	(%rsp), %rdi
-        movq	16(%rsp), %rsi
-        movq	8(%rsp), %rbx
-        # Sub
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        movq	16(%rsi), %r10
-        movq	24(%rsi), %r11
-        subq	(%rbx), %r8
-        movq	$0x00, %rcx
-        sbbq	8(%rbx), %r9
-        movq	$-19, %rax
-        sbbq	16(%rbx), %r10
-        movq	$0x7fffffffffffffff, %rdx
-        sbbq	24(%rbx), %r11
-        sbbq	$0x00, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Add modulus (if underflow)
-        addq	%rax, %r8
-        adcq	%rcx, %r9
-        adcq	%rcx, %r10
-        adcq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	8(%rsp), %rdi
-        movq	16(%rsp), %rsi
-        movq	8(%rsp), %rbx
-        # Add
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        addq	(%rbx), %r8
-        movq	16(%rsi), %r10
-        adcq	8(%rbx), %r9
-        movq	24(%rsi), %rcx
-        adcq	16(%rbx), %r10
-        movq	$-19, %rax
-        adcq	24(%rbx), %rcx
-        movq	$0x7fffffffffffffff, %rdx
-        movq	%rcx, %r11
-        sarq	$63, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Sub modulus (if overflow)
-        subq	%rax, %r8
-        sbbq	%rcx, %r9
-        sbbq	%rcx, %r10
-        sbbq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	16(%rsp), %rdi
-        leaq	48(%rsp), %rsi
-        movq	24(%rsp), %rbx
-        # Sub
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        movq	16(%rsi), %r10
-        movq	24(%rsi), %r11
-        subq	(%rbx), %r8
-        movq	$0x00, %rcx
-        sbbq	8(%rbx), %r9
-        movq	$-19, %rax
-        sbbq	16(%rbx), %r10
-        movq	$0x7fffffffffffffff, %rdx
-        sbbq	24(%rbx), %r11
-        sbbq	$0x00, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Add modulus (if underflow)
-        addq	%rax, %r8
-        adcq	%rcx, %r9
-        adcq	%rcx, %r10
-        adcq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	24(%rsp), %rdi
-        leaq	48(%rsp), %rsi
-        movq	24(%rsp), %rbx
-        # Add
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        addq	(%rbx), %r8
-        movq	16(%rsi), %r10
-        adcq	8(%rbx), %r9
-        movq	24(%rsi), %rcx
-        adcq	16(%rbx), %r10
-        movq	$-19, %rax
-        adcq	24(%rbx), %rcx
-        movq	$0x7fffffffffffffff, %rdx
-        movq	%rcx, %r11
-        sarq	$63, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Sub modulus (if overflow)
-        subq	%rax, %r8
-        sbbq	%rcx, %r9
-        sbbq	%rcx, %r10
-        sbbq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        addq	$0x50, %rsp
-        popq	%r15
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        popq	%rbx
-        repz retq
-#ifndef __APPLE__
-.size	fe_ge_msub_x64,.-fe_ge_msub_x64
-#endif /* __APPLE__ */
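-# fe_ge_add_x64 below appears to handle the full (non-mixed) point
-# addition; it consumes one more stack argument than madd/msub, reaching
-# up to 168(%rsp).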
-#ifndef __APPLE__
-.text
-.globl	fe_ge_add_x64
-.type	fe_ge_add_x64,@function
-.align	16
-fe_ge_add_x64:
-#else
-.section	__TEXT,__text
-.globl	_fe_ge_add_x64
-.p2align	4
-_fe_ge_add_x64:
-#endif /* __APPLE__ */
-        pushq	%rbx
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        pushq	%r15
-        subq	$0x50, %rsp
-        movq	%rdi, (%rsp)
-        movq	%rsi, 8(%rsp)
-        movq	%rdx, 16(%rsp)
-        movq	%rcx, 24(%rsp)
-        movq	%r8, 32(%rsp)
-        movq	%r9, 40(%rsp)
-        movq	(%rsp), %rdi
-        movq	40(%rsp), %rsi
-        movq	32(%rsp), %rbx
-        # Add
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        addq	(%rbx), %r8
-        movq	16(%rsi), %r10
-        adcq	8(%rbx), %r9
-        movq	24(%rsi), %rcx
-        adcq	16(%rbx), %r10
-        movq	$-19, %rax
-        adcq	24(%rbx), %rcx
-        movq	$0x7fffffffffffffff, %rdx
-        movq	%rcx, %r11
-        sarq	$63, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Sub modulus (if overflow)
-        subq	%rax, %r8
-        sbbq	%rcx, %r9
-        sbbq	%rcx, %r10
-        sbbq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	8(%rsp), %rdi
-        movq	40(%rsp), %rsi
-        movq	32(%rsp), %rbx
-        # Sub
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        movq	16(%rsi), %r10
-        movq	24(%rsi), %r11
-        subq	(%rbx), %r8
-        movq	$0x00, %rcx
-        sbbq	8(%rbx), %r9
-        movq	$-19, %rax
-        sbbq	16(%rbx), %r10
-        movq	$0x7fffffffffffffff, %rdx
-        sbbq	24(%rbx), %r11
-        sbbq	$0x00, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Add modulus (if underflow)
-        addq	%rax, %r8
-        adcq	%rcx, %r9
-        adcq	%rcx, %r10
-        adcq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	16(%rsp), %rdi
-        movq	(%rsp), %rsi
-        movq	160(%rsp), %rbx
-        # Multiply
-        #  A[0] * B[0]
-        movq	(%rbx), %rax
-        mulq	(%rsi)
-        movq	%rax, %r8
-        movq	%rdx, %r9
-        #  A[0] * B[1]
-        movq	8(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[1] * B[0]
-        movq	(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        adcq	$0x00, %r11
-        #  A[0] * B[2]
-        movq	16(%rbx), %rax
-        mulq	(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * B[1]
-        movq	8(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[2] * B[0]
-        movq	(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[0] * B[3]
-        movq	24(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[2]
-        movq	16(%rbx), %rax
-        mulq	8(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[2] * B[1]
-        movq	8(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[3] * B[0]
-        movq	(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[3]
-        movq	24(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r14, %r14
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[2]
-        movq	16(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[3] * B[1]
-        movq	8(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[3]
-        movq	24(%rbx), %rax
-        mulq	16(%rsi)
-        xorq	%r15, %r15
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[2]
-        movq	16(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[3]
-        movq	24(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %r8
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add the remaining partial products into the result
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	8(%rsp), %rdi
-        movq	8(%rsp), %rsi
-        movq	168(%rsp), %rbx
-        # Multiply
-        #  A[0] * B[0]
-        movq	(%rbx), %rax
-        mulq	(%rsi)
-        movq	%rax, %r8
-        movq	%rdx, %r9
-        #  A[0] * B[1]
-        movq	8(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[1] * B[0]
-        movq	(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        adcq	$0x00, %r11
-        #  A[0] * B[2]
-        movq	16(%rbx), %rax
-        mulq	(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * B[1]
-        movq	8(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[2] * B[0]
-        movq	(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[0] * B[3]
-        movq	24(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[2]
-        movq	16(%rbx), %rax
-        mulq	8(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[2] * B[1]
-        movq	8(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[3] * B[0]
-        movq	(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[3]
-        movq	24(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r14, %r14
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[2]
-        movq	16(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[3] * B[1]
-        movq	8(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[3]
-        movq	24(%rbx), %rax
-        mulq	16(%rsi)
-        xorq	%r15, %r15
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[2]
-        movq	16(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[3]
-        movq	24(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %r8
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add the remaining partial products into the result
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	24(%rsp), %rdi
-        movq	152(%rsp), %rsi
-        movq	136(%rsp), %rbx
-        # Multiply
-        #  A[0] * B[0]
-        movq	(%rbx), %rax
-        mulq	(%rsi)
-        movq	%rax, %r8
-        movq	%rdx, %r9
-        #  A[0] * B[1]
-        movq	8(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[1] * B[0]
-        movq	(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        adcq	$0x00, %r11
-        #  A[0] * B[2]
-        movq	16(%rbx), %rax
-        mulq	(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * B[1]
-        movq	8(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[2] * B[0]
-        movq	(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[0] * B[3]
-        movq	24(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[2]
-        movq	16(%rbx), %rax
-        mulq	8(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[2] * B[1]
-        movq	8(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[3] * B[0]
-        movq	(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[3]
-        movq	24(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r14, %r14
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[2]
-        movq	16(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[3] * B[1]
-        movq	8(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[3]
-        movq	24(%rbx), %rax
-        mulq	16(%rsi)
-        xorq	%r15, %r15
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[2]
-        movq	16(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[3]
-        movq	24(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %r8
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add the remaining partial products into the result
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	(%rsp), %rdi
-        movq	128(%rsp), %rsi
-        movq	144(%rsp), %rbx
-        # Multiply
-        #  A[0] * B[0]
-        movq	(%rbx), %rax
-        mulq	(%rsi)
-        movq	%rax, %r8
-        movq	%rdx, %r9
-        #  A[0] * B[1]
-        movq	8(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[1] * B[0]
-        movq	(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        adcq	$0x00, %r11
-        #  A[0] * B[2]
-        movq	16(%rbx), %rax
-        mulq	(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * B[1]
-        movq	8(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[2] * B[0]
-        movq	(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[0] * B[3]
-        movq	24(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[2]
-        movq	16(%rbx), %rax
-        mulq	8(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[2] * B[1]
-        movq	8(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[3] * B[0]
-        movq	(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[3]
-        movq	24(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r14, %r14
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[2]
-        movq	16(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[3] * B[1]
-        movq	8(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[3]
-        movq	24(%rbx), %rax
-        mulq	16(%rsi)
-        xorq	%r15, %r15
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[2]
-        movq	16(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[3]
-        movq	24(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %r8
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add the remaining partial products into the result
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        leaq	48(%rsp), %rdi
-        movq	(%rsp), %rsi
-        movq	(%rsp), %rbx
-        # Add
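-        #  Branch-free correction: bit 255 of the raw sum selects, via a
-        #  sign mask, whether p = 2^255 - 19 is subtracted afterwards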
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        addq	(%rbx), %r8
-        movq	16(%rsi), %r10
-        adcq	8(%rbx), %r9
-        movq	24(%rsi), %rcx
-        adcq	16(%rbx), %r10
-        movq	$-19, %rax
-        adcq	24(%rbx), %rcx
-        movq	$0x7fffffffffffffff, %rdx
-        movq	%rcx, %r11
-        sarq	$63, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Sub modulus (if overflow)
-        subq	%rax, %r8
-        sbbq	%rcx, %r9
-        sbbq	%rcx, %r10
-        sbbq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	(%rsp), %rdi
-        movq	16(%rsp), %rsi
-        movq	8(%rsp), %rbx
-        # Sub
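-        #  Branch-free correction: the final borrow (0 or -1) masks p,
-        #  which is added back whenever the subtraction underflowed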
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        movq	16(%rsi), %r10
-        movq	24(%rsi), %r11
-        subq	(%rbx), %r8
-        movq	$0x00, %rcx
-        sbbq	8(%rbx), %r9
-        movq	$-19, %rax
-        sbbq	16(%rbx), %r10
-        movq	$0x7fffffffffffffff, %rdx
-        sbbq	24(%rbx), %r11
-        sbbq	$0x00, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Add modulus (if underflow)
-        addq	%rax, %r8
-        adcq	%rcx, %r9
-        adcq	%rcx, %r10
-        adcq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	8(%rsp), %rdi
-        movq	16(%rsp), %rsi
-        movq	8(%rsp), %rbx
-        # Add
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        addq	(%rbx), %r8
-        movq	16(%rsi), %r10
-        adcq	8(%rbx), %r9
-        movq	24(%rsi), %rcx
-        adcq	16(%rbx), %r10
-        movq	$-19, %rax
-        adcq	24(%rbx), %rcx
-        movq	$0x7fffffffffffffff, %rdx
-        movq	%rcx, %r11
-        sarq	$63, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Sub modulus (if overflow)
-        subq	%rax, %r8
-        sbbq	%rcx, %r9
-        sbbq	%rcx, %r10
-        sbbq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	16(%rsp), %rdi
-        leaq	48(%rsp), %rsi
-        movq	24(%rsp), %rbx
-        # Add
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        addq	(%rbx), %r8
-        movq	16(%rsi), %r10
-        adcq	8(%rbx), %r9
-        movq	24(%rsi), %rcx
-        adcq	16(%rbx), %r10
-        movq	$-19, %rax
-        adcq	24(%rbx), %rcx
-        movq	$0x7fffffffffffffff, %rdx
-        movq	%rcx, %r11
-        sarq	$63, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Sub modulus (if overflow)
-        subq	%rax, %r8
-        sbbq	%rcx, %r9
-        sbbq	%rcx, %r10
-        sbbq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	24(%rsp), %rdi
-        leaq	48(%rsp), %rsi
-        movq	24(%rsp), %rbx
-        # Sub
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        movq	16(%rsi), %r10
-        movq	24(%rsi), %r11
-        subq	(%rbx), %r8
-        movq	$0x00, %rcx
-        sbbq	8(%rbx), %r9
-        movq	$-19, %rax
-        sbbq	16(%rbx), %r10
-        movq	$0x7fffffffffffffff, %rdx
-        sbbq	24(%rbx), %r11
-        sbbq	$0x00, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Add modulus (if underflow)
-        addq	%rax, %r8
-        adcq	%rcx, %r9
-        adcq	%rcx, %r10
-        adcq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        addq	$0x50, %rsp
-        popq	%r15
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        popq	%rbx
-        repz retq
-#ifndef __APPLE__
-.size	fe_ge_add_x64,.-fe_ge_add_x64
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_ge_sub_x64
-.type	fe_ge_sub_x64,@function
-.align	16
-fe_ge_sub_x64:
-#else
-.section	__TEXT,__text
-.globl	_fe_ge_sub_x64
-.p2align	4
-_fe_ge_sub_x64:
-#endif /* __APPLE__ */
-        pushq	%rbx
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        pushq	%r15
-        subq	$0x50, %rsp
-        movq	%rdi, (%rsp)
-        movq	%rsi, 8(%rsp)
-        movq	%rdx, 16(%rsp)
-        movq	%rcx, 24(%rsp)
-        movq	%r8, 32(%rsp)
-        movq	%r9, 40(%rsp)
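-        # Args 1-6 were spilled to 0..40(%rsp) above; args 7-12 sit at
-        # 128..168(%rsp) (0x50 bytes of locals + 5 saved regs + return)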
-        movq	(%rsp), %rdi
-        movq	40(%rsp), %rsi
-        movq	32(%rsp), %rbx
-        # Add
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        addq	(%rbx), %r8
-        movq	16(%rsi), %r10
-        adcq	8(%rbx), %r9
-        movq	24(%rsi), %rcx
-        adcq	16(%rbx), %r10
-        movq	$-19, %rax
-        adcq	24(%rbx), %rcx
-        movq	$0x7fffffffffffffff, %rdx
-        movq	%rcx, %r11
-        sarq	$63, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Sub modulus (if overflow)
-        subq	%rax, %r8
-        sbbq	%rcx, %r9
-        sbbq	%rcx, %r10
-        sbbq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	8(%rsp), %rdi
-        movq	40(%rsp), %rsi
-        movq	32(%rsp), %rbx
-        # Sub
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        movq	16(%rsi), %r10
-        movq	24(%rsi), %r11
-        subq	(%rbx), %r8
-        movq	$0x00, %rcx
-        sbbq	8(%rbx), %r9
-        movq	$-19, %rax
-        sbbq	16(%rbx), %r10
-        movq	$0x7fffffffffffffff, %rdx
-        sbbq	24(%rbx), %r11
-        sbbq	$0x00, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Add modulus (if underflow)
-        addq	%rax, %r8
-        adcq	%rcx, %r9
-        adcq	%rcx, %r10
-        adcq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	16(%rsp), %rdi
-        movq	(%rsp), %rsi
-        movq	168(%rsp), %rbx
-        # Multiply
-        #  A[0] * B[0]
-        movq	(%rbx), %rax
-        mulq	(%rsi)
-        movq	%rax, %r8
-        movq	%rdx, %r9
-        #  A[0] * B[1]
-        movq	8(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[1] * B[0]
-        movq	(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        adcq	$0x00, %r11
-        #  A[0] * B[2]
-        movq	16(%rbx), %rax
-        mulq	(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * B[1]
-        movq	8(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[2] * B[0]
-        movq	(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[0] * B[3]
-        movq	24(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[2]
-        movq	16(%rbx), %rax
-        mulq	8(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[2] * B[1]
-        movq	8(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[3] * B[0]
-        movq	(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[3]
-        movq	24(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r14, %r14
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[2]
-        movq	16(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[3] * B[1]
-        movq	8(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[3]
-        movq	24(%rbx), %rax
-        mulq	16(%rsi)
-        xorq	%r15, %r15
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[2]
-        movq	16(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[3]
-        movq	24(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %r8
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add the remaining partial products into the result
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	8(%rsp), %rdi
-        movq	8(%rsp), %rsi
-        movq	160(%rsp), %rbx
-        # Multiply
-        #  A[0] * B[0]
-        movq	(%rbx), %rax
-        mulq	(%rsi)
-        movq	%rax, %r8
-        movq	%rdx, %r9
-        #  A[0] * B[1]
-        movq	8(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[1] * B[0]
-        movq	(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        adcq	$0x00, %r11
-        #  A[0] * B[2]
-        movq	16(%rbx), %rax
-        mulq	(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * B[1]
-        movq	8(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[2] * B[0]
-        movq	(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[0] * B[3]
-        movq	24(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[2]
-        movq	16(%rbx), %rax
-        mulq	8(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[2] * B[1]
-        movq	8(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[3] * B[0]
-        movq	(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[3]
-        movq	24(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r14, %r14
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[2]
-        movq	16(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[3] * B[1]
-        movq	8(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[3]
-        movq	24(%rbx), %rax
-        mulq	16(%rsi)
-        xorq	%r15, %r15
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[2]
-        movq	16(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[3]
-        movq	24(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %r8
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add the remaining partial products into the result
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	24(%rsp), %rdi
-        movq	152(%rsp), %rsi
-        movq	136(%rsp), %rbx
-        # Multiply
-        #  A[0] * B[0]
-        movq	(%rbx), %rax
-        mulq	(%rsi)
-        movq	%rax, %r8
-        movq	%rdx, %r9
-        #  A[0] * B[1]
-        movq	8(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[1] * B[0]
-        movq	(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        adcq	$0x00, %r11
-        #  A[0] * B[2]
-        movq	16(%rbx), %rax
-        mulq	(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * B[1]
-        movq	8(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[2] * B[0]
-        movq	(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[0] * B[3]
-        movq	24(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[2]
-        movq	16(%rbx), %rax
-        mulq	8(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[2] * B[1]
-        movq	8(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[3] * B[0]
-        movq	(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[3]
-        movq	24(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r14, %r14
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[2]
-        movq	16(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[3] * B[1]
-        movq	8(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[3]
-        movq	24(%rbx), %rax
-        mulq	16(%rsi)
-        xorq	%r15, %r15
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[2]
-        movq	16(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[3]
-        movq	24(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %r8
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add the remaining partial products into the result
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	(%rsp), %rdi
-        movq	128(%rsp), %rsi
-        movq	144(%rsp), %rbx
-        # Multiply
-        #  A[0] * B[0]
-        movq	(%rbx), %rax
-        mulq	(%rsi)
-        movq	%rax, %r8
-        movq	%rdx, %r9
-        #  A[0] * B[1]
-        movq	8(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r10, %r10
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        #  A[1] * B[0]
-        movq	(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r11, %r11
-        addq	%rax, %r9
-        adcq	%rdx, %r10
-        adcq	$0x00, %r11
-        #  A[0] * B[2]
-        movq	16(%rbx), %rax
-        mulq	(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        #  A[1] * B[1]
-        movq	8(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r12, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[2] * B[0]
-        movq	(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        #  A[0] * B[3]
-        movq	24(%rbx), %rax
-        mulq	(%rsi)
-        xorq	%r13, %r13
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[2]
-        movq	16(%rbx), %rax
-        mulq	8(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[2] * B[1]
-        movq	8(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[3] * B[0]
-        movq	(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        #  A[1] * B[3]
-        movq	24(%rbx), %rax
-        mulq	8(%rsi)
-        xorq	%r14, %r14
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[2]
-        movq	16(%rbx), %rax
-        mulq	16(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[3] * B[1]
-        movq	8(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r12
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        #  A[2] * B[3]
-        movq	24(%rbx), %rax
-        mulq	16(%rsi)
-        xorq	%r15, %r15
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[2]
-        movq	16(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	$0x00, %r15
-        #  A[3] * B[3]
-        movq	24(%rbx), %rax
-        mulq	24(%rsi)
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rax
-        mulq	%r12
-        xorq	%r12, %r12
-        addq	%rax, %r8
-        movq	$19, %rax
-        adcq	%rdx, %r12
-        mulq	%r13
-        xorq	%r13, %r13
-        addq	%rax, %r9
-        movq	$19, %rax
-        adcq	%rdx, %r13
-        mulq	%r14
-        xorq	%r14, %r14
-        addq	%rax, %r10
-        movq	$19, %rax
-        adcq	%rdx, %r14
-        mulq	%r15
-        #  Add the remaining partial products into the result
-        addq	%r12, %r9
-        adcq	%r13, %r10
-        adcq	%r14, %r11
-        adcq	%rax, %r11
-        adcq	$0x00, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        leaq	48(%rsp), %rdi
-        movq	(%rsp), %rsi
-        movq	(%rsp), %rbx
-        # Add
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        addq	(%rbx), %r8
-        movq	16(%rsi), %r10
-        adcq	8(%rbx), %r9
-        movq	24(%rsi), %rcx
-        adcq	16(%rbx), %r10
-        movq	$-19, %rax
-        adcq	24(%rbx), %rcx
-        movq	$0x7fffffffffffffff, %rdx
-        movq	%rcx, %r11
-        sarq	$63, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Sub modulus (if overflow)
-        subq	%rax, %r8
-        sbbq	%rcx, %r9
-        sbbq	%rcx, %r10
-        sbbq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	(%rsp), %rdi
-        movq	16(%rsp), %rsi
-        movq	8(%rsp), %rbx
-        # Sub
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        movq	16(%rsi), %r10
-        movq	24(%rsi), %r11
-        subq	(%rbx), %r8
-        movq	$0x00, %rcx
-        sbbq	8(%rbx), %r9
-        movq	$-19, %rax
-        sbbq	16(%rbx), %r10
-        movq	$0x7fffffffffffffff, %rdx
-        sbbq	24(%rbx), %r11
-        sbbq	$0x00, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Add modulus (if underflow)
-        addq	%rax, %r8
-        adcq	%rcx, %r9
-        adcq	%rcx, %r10
-        adcq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	8(%rsp), %rdi
-        movq	16(%rsp), %rsi
-        movq	8(%rsp), %rbx
-        # Add
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        addq	(%rbx), %r8
-        movq	16(%rsi), %r10
-        adcq	8(%rbx), %r9
-        movq	24(%rsi), %rcx
-        adcq	16(%rbx), %r10
-        movq	$-19, %rax
-        adcq	24(%rbx), %rcx
-        movq	$0x7fffffffffffffff, %rdx
-        movq	%rcx, %r11
-        sarq	$63, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Sub modulus (if overflow)
-        subq	%rax, %r8
-        sbbq	%rcx, %r9
-        sbbq	%rcx, %r10
-        sbbq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	16(%rsp), %rdi
-        leaq	48(%rsp), %rsi
-        movq	24(%rsp), %rbx
-        # Sub
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        movq	16(%rsi), %r10
-        movq	24(%rsi), %r11
-        subq	(%rbx), %r8
-        movq	$0x00, %rcx
-        sbbq	8(%rbx), %r9
-        movq	$-19, %rax
-        sbbq	16(%rbx), %r10
-        movq	$0x7fffffffffffffff, %rdx
-        sbbq	24(%rbx), %r11
-        sbbq	$0x00, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Add modulus (if underflow)
-        addq	%rax, %r8
-        adcq	%rcx, %r9
-        adcq	%rcx, %r10
-        adcq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	24(%rsp), %rdi
-        leaq	48(%rsp), %rsi
-        movq	24(%rsp), %rbx
-        # Add
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        addq	(%rbx), %r8
-        movq	16(%rsi), %r10
-        adcq	8(%rbx), %r9
-        movq	24(%rsi), %rcx
-        adcq	16(%rbx), %r10
-        movq	$-19, %rax
-        adcq	24(%rbx), %rcx
-        movq	$0x7fffffffffffffff, %rdx
-        movq	%rcx, %r11
-        sarq	$63, %rcx
-        #   Mask the modulus
-        andq	%rcx, %rax
-        andq	%rcx, %rdx
-        #   Sub modulus (if overflow)
-        subq	%rax, %r8
-        sbbq	%rcx, %r9
-        sbbq	%rcx, %r10
-        sbbq	%rdx, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        addq	$0x50, %rsp
-        popq	%r15
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        popq	%rbx
-        repz retq
-#ifndef __APPLE__
-.size	fe_ge_sub_x64,.-fe_ge_sub_x64
-#endif /* __APPLE__ */
-#ifdef HAVE_INTEL_AVX2
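-# The routines below use BMI2 mulx and ADX adcx/adox so that two
-# independent carry chains (CF and OF) can run interleaved.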
-#ifndef __APPLE__
-.text
-.globl	fe_mul_avx2
-.type	fe_mul_avx2,@function
-.align	16
-fe_mul_avx2:
-#else
-.section	__TEXT,__text
-.globl	_fe_mul_avx2
-.p2align	4
-_fe_mul_avx2:
-#endif /* __APPLE__ */
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        pushq	%r15
-        pushq	%rbx
-        movq	%rdx, %rbx
-        # Multiply
-        # A[0] * B[0]
-        movq	(%rbx), %rdx
-        mulxq	(%rsi), %r8, %r9
-        # A[2] * B[0]
-        mulxq	16(%rsi), %r10, %r11
-        # A[1] * B[0]
-        mulxq	8(%rsi), %rax, %rcx
-        xorq	%r15, %r15
-        adcxq	%rax, %r9
-        # A[1] * B[3]
-        movq	24(%rbx), %rdx
-        mulxq	8(%rsi), %r12, %r13
-        adcxq	%rcx, %r10
-        # A[0] * B[1]
-        movq	8(%rbx), %rdx
-        mulxq	(%rsi), %rax, %rcx
-        adoxq	%rax, %r9
-        # A[2] * B[1]
-        mulxq	16(%rsi), %rax, %r14
-        adoxq	%rcx, %r10
-        adcxq	%rax, %r11
-        # A[1] * B[2]
-        movq	16(%rbx), %rdx
-        mulxq	8(%rsi), %rax, %rcx
-        adcxq	%r14, %r12
-        adoxq	%rax, %r11
-        adcxq	%r15, %r13
-        adoxq	%rcx, %r12
-        # A[0] * B[2]
-        mulxq	(%rsi), %rax, %rcx
-        adoxq	%r15, %r13
-        xorq	%r14, %r14
-        adcxq	%rax, %r10
-        # A[1] * B[1]
-        movq	8(%rbx), %rdx
-        mulxq	8(%rsi), %rdx, %rax
-        adcxq	%rcx, %r11
-        adoxq	%rdx, %r10
-        # A[3] * B[1]
-        movq	8(%rbx), %rdx
-        adoxq	%rax, %r11
-        mulxq	24(%rsi), %rax, %rcx
-        adcxq	%rax, %r12
-        # A[2] * B[2]
-        movq	16(%rbx), %rdx
-        mulxq	16(%rsi), %rdx, %rax
-        adcxq	%rcx, %r13
-        adoxq	%rdx, %r12
-        # A[3] * B[3]
-        movq	24(%rbx), %rdx
-        adoxq	%rax, %r13
-        mulxq	24(%rsi), %rax, %rcx
-        adoxq	%r15, %r14
-        adcxq	%rax, %r14
-        # A[0] * B[3]
-        mulxq	(%rsi), %rdx, %rax
-        adcxq	%rcx, %r15
-        xorq	%rcx, %rcx
-        adcxq	%rdx, %r11
-        # A[3] * B[0]
-        movq	(%rbx), %rdx
-        adcxq	%rax, %r12
-        mulxq	24(%rsi), %rdx, %rax
-        adoxq	%rdx, %r11
-        adoxq	%rax, %r12
-        # A[2] * B[3]
-        movq	24(%rbx), %rdx
-        mulxq	16(%rsi), %rdx, %rax
-        adcxq	%rdx, %r13
-        # A[3] * B[2]
-        movq	16(%rbx), %rdx
-        adcxq	%rax, %r14
-        mulxq	24(%rsi), %rax, %rdx
-        adcxq	%rcx, %r15
-        adoxq	%rax, %r13
-        adoxq	%rdx, %r14
-        adoxq	%rcx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rcx, %rcx
-        mulxq	%r12, %rax, %r12
-        adcxq	%rax, %r8
-        adoxq	%r12, %r9
-        mulxq	%r13, %rax, %r13
-        adcxq	%rax, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rax, %r14
-        adcxq	%rax, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %r15, %rdx
-        adcxq	%r15, %r11
-        adoxq	%rcx, %rdx
-        adcxq	%rcx, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        movq	$0x7fffffffffffffff, %rcx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        popq	%rbx
-        popq	%r15
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        repz retq
-#ifndef __APPLE__
-.size	fe_mul_avx2,.-fe_mul_avx2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_sq_avx2
-.type	fe_sq_avx2,@function
-.align	16
-fe_sq_avx2:
-#else
-.section	__TEXT,__text
-.globl	_fe_sq_avx2
-.p2align	4
-_fe_sq_avx2:
-#endif /* __APPLE__ */
-        pushq	%rbx
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        pushq	%r15
-        # Square
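-        #  Compute the six cross products A[i]*A[j] (i < j) once, double
-        #  them on the CF chain, then add the diagonal squares A[i]*A[i]
-        #  on the OF chain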
-        # A[0] * A[1]
-        movq	(%rsi), %rdx
-        mulxq	8(%rsi), %r9, %r10
-        # A[0] * A[3]
-        mulxq	24(%rsi), %r11, %r12
-        # A[2] * A[1]
-        movq	16(%rsi), %rdx
-        mulxq	8(%rsi), %rcx, %rbx
-        xorq	%r15, %r15
-        adoxq	%rcx, %r11
-        # A[2] * A[3]
-        mulxq	24(%rsi), %r13, %r14
-        adoxq	%rbx, %r12
-        # A[2] * A[0]
-        mulxq	(%rsi), %rcx, %rbx
-        adoxq	%r15, %r13
-        adcxq	%rcx, %r10
-        adoxq	%r15, %r14
-        # A[1] * A[3]
-        movq	8(%rsi), %rdx
-        mulxq	24(%rsi), %rax, %r8
-        adcxq	%rbx, %r11
-        adcxq	%rax, %r12
-        adcxq	%r8, %r13
-        adcxq	%r15, %r14
-        # Double with Carry Flag
-        xorq	%r15, %r15
-        # A[0] * A[0]
-        movq	(%rsi), %rdx
-        mulxq	%rdx, %r8, %rax
-        adcxq	%r9, %r9
-        # A[1] * A[1]
-        movq	8(%rsi), %rdx
-        mulxq	%rdx, %rcx, %rbx
-        adcxq	%r10, %r10
-        adoxq	%rax, %r9
-        adcxq	%r11, %r11
-        adoxq	%rcx, %r10
-        # A[2] * A[2]
-        movq	16(%rsi), %rdx
-        mulxq	%rdx, %rax, %rcx
-        adcxq	%r12, %r12
-        adoxq	%rbx, %r11
-        adcxq	%r13, %r13
-        adoxq	%rax, %r12
-        # A[3] * A[3]
-        movq	24(%rsi), %rdx
-        mulxq	%rdx, %rax, %rbx
-        adcxq	%r14, %r14
-        adoxq	%rcx, %r13
-        adcxq	%r15, %r15
-        adoxq	%rax, %r14
-        adoxq	%rbx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rcx, %rcx
-        mulxq	%r12, %rax, %r12
-        adcxq	%rax, %r8
-        adoxq	%r12, %r9
-        mulxq	%r13, %rax, %r13
-        adcxq	%rax, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rax, %r14
-        adcxq	%rax, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %r15, %rdx
-        adcxq	%r15, %r11
-        adoxq	%rcx, %rdx
-        adcxq	%rcx, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        movq	$0x7fffffffffffffff, %rcx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        popq	%r15
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        popq	%rbx
-        repz retq
-#ifndef __APPLE__
-.size	fe_sq_avx2,.-fe_sq_avx2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_sq_n_avx2
-.type	fe_sq_n_avx2,@function
-.align	16
-fe_sq_n_avx2:
-#else
-.section	__TEXT,__text
-.globl	_fe_sq_n_avx2
-.p2align	4
-_fe_sq_n_avx2:
-#endif /* __APPLE__ */
-        pushq	%rbx
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        pushq	%r15
-        pushq	%rbp
-        movq	%rdx, %rbp
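-        # Arg 3 (n, in rdx) is kept in rbp and decremented as a byte
-        # (decb %bpl), so the loop below squares in place n times, n < 256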
-L_fe_sq_n_avx2:
-        # Square
-        # A[0] * A[1]
-        movq	(%rsi), %rdx
-        mulxq	8(%rsi), %r9, %r10
-        # A[0] * A[3]
-        mulxq	24(%rsi), %r11, %r12
-        # A[2] * A[1]
-        movq	16(%rsi), %rdx
-        mulxq	8(%rsi), %rcx, %rbx
-        xorq	%r15, %r15
-        adoxq	%rcx, %r11
-        # A[2] * A[3]
-        mulxq	24(%rsi), %r13, %r14
-        adoxq	%rbx, %r12
-        # A[2] * A[0]
-        mulxq	(%rsi), %rcx, %rbx
-        adoxq	%r15, %r13
-        adcxq	%rcx, %r10
-        adoxq	%r15, %r14
-        # A[1] * A[3]
-        movq	8(%rsi), %rdx
-        mulxq	24(%rsi), %rax, %r8
-        adcxq	%rbx, %r11
-        adcxq	%rax, %r12
-        adcxq	%r8, %r13
-        adcxq	%r15, %r14
-        # Double with Carry Flag
-        xorq	%r15, %r15
-        # A[0] * A[0]
-        movq	(%rsi), %rdx
-        mulxq	%rdx, %r8, %rax
-        adcxq	%r9, %r9
-        # A[1] * A[1]
-        movq	8(%rsi), %rdx
-        mulxq	%rdx, %rcx, %rbx
-        adcxq	%r10, %r10
-        adoxq	%rax, %r9
-        adcxq	%r11, %r11
-        adoxq	%rcx, %r10
-        # A[2] * A[2]
-        movq	16(%rsi), %rdx
-        mulxq	%rdx, %rax, %rcx
-        adcxq	%r12, %r12
-        adoxq	%rbx, %r11
-        adcxq	%r13, %r13
-        adoxq	%rax, %r12
-        # A[3] * A[3]
-        movq	24(%rsi), %rdx
-        mulxq	%rdx, %rax, %rbx
-        adcxq	%r14, %r14
-        adoxq	%rcx, %r13
-        adcxq	%r15, %r15
-        adoxq	%rax, %r14
-        adoxq	%rbx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rcx, %rcx
-        mulxq	%r12, %rax, %r12
-        adcxq	%rax, %r8
-        adoxq	%r12, %r9
-        mulxq	%r13, %rax, %r13
-        adcxq	%rax, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rax, %r14
-        adcxq	%rax, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %r15, %rdx
-        adcxq	%r15, %r11
-        adoxq	%rcx, %rdx
-        adcxq	%rcx, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        movq	$0x7fffffffffffffff, %rcx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        decb	%bpl
-        jnz	L_fe_sq_n_avx2
-        popq	%rbp
-        popq	%r15
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        popq	%rbx
-        repz retq
-#ifndef __APPLE__
-.size	fe_sq_n_avx2,.-fe_sq_n_avx2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_mul121666_avx2
-.type	fe_mul121666_avx2,@function
-.align	16
-fe_mul121666_avx2:
-#else
-.section	__TEXT,__text
-.globl	_fe_mul121666_avx2
-.p2align	4
-_fe_mul121666_avx2:
-#endif /* __APPLE__ */
-        pushq	%r12
-        pushq	%r13
-        movq	$0x1db42, %rdx
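-        # 0x1db42 = 121666 = (486662 + 2) / 4, the curve25519 constant
-        # used in the Montgomery ladder doubling step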
-        mulxq	(%rsi), %rax, %r13
-        mulxq	8(%rsi), %rcx, %r12
-        mulxq	16(%rsi), %r8, %r11
-        mulxq	24(%rsi), %r9, %r10
-        addq	%r13, %rcx
-        adcq	%r12, %r8
-        adcq	%r11, %r9
-        adcq	$0x00, %r10
-        movq	$0x7fffffffffffffff, %r13
-        shldq	$0x01, %r9, %r10
-        andq	%r13, %r9
-        imulq	$19, %r10, %r10
-        addq	%r10, %rax
-        adcq	$0x00, %rcx
-        adcq	$0x00, %r8
-        adcq	$0x00, %r9
-        movq	%rax, (%rdi)
-        movq	%rcx, 8(%rdi)
-        movq	%r8, 16(%rdi)
-        movq	%r9, 24(%rdi)
-        popq	%r13
-        popq	%r12
-        repz retq
-#ifndef __APPLE__
-.size	fe_mul121666_avx2,.-fe_mul121666_avx2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_sq2_avx2
-.type	fe_sq2_avx2,@function
-.align	16
-fe_sq2_avx2:
-#else
-.section	__TEXT,__text
-.globl	_fe_sq2_avx2
-.p2align	4
-_fe_sq2_avx2:
-#endif /* __APPLE__ */
-        pushq	%rbx
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        pushq	%r15
-        # Square * 2
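-        #  Computes 2*a^2 mod p: the doubling is folded into the reduce
-        #  shifts below, and the stray top bits are scaled by 19*19 = 0x169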
-        # A[0] * A[1]
-        movq	(%rsi), %rdx
-        mulxq	8(%rsi), %r9, %r10
-        # A[0] * A[3]
-        mulxq	24(%rsi), %r11, %r12
-        # A[2] * A[1]
-        movq	16(%rsi), %rdx
-        mulxq	8(%rsi), %rcx, %rbx
-        xorq	%r15, %r15
-        adoxq	%rcx, %r11
-        # A[2] * A[3]
-        mulxq	24(%rsi), %r13, %r14
-        adoxq	%rbx, %r12
-        # A[2] * A[0]
-        mulxq	(%rsi), %rcx, %rbx
-        adoxq	%r15, %r13
-        adcxq	%rcx, %r10
-        adoxq	%r15, %r14
-        # A[1] * A[3]
-        movq	8(%rsi), %rdx
-        mulxq	24(%rsi), %rax, %r8
-        adcxq	%rbx, %r11
-        adcxq	%rax, %r12
-        adcxq	%r8, %r13
-        adcxq	%r15, %r14
-        # Double with Carry Flag
-        xorq	%r15, %r15
-        # A[0] * A[0]
-        movq	(%rsi), %rdx
-        mulxq	%rdx, %r8, %rax
-        adcxq	%r9, %r9
-        # A[1] * A[1]
-        movq	8(%rsi), %rdx
-        mulxq	%rdx, %rcx, %rbx
-        adcxq	%r10, %r10
-        adoxq	%rax, %r9
-        adcxq	%r11, %r11
-        adoxq	%rcx, %r10
-        # A[2] * A[2]
-        movq	16(%rsi), %rdx
-        mulxq	%rdx, %rax, %rcx
-        adcxq	%r12, %r12
-        adoxq	%rbx, %r11
-        adcxq	%r13, %r13
-        adoxq	%rax, %r12
-        # A[3] * A[3]
-        movq	24(%rsi), %rdx
-        mulxq	%rdx, %rax, %rbx
-        adcxq	%r14, %r14
-        adoxq	%rcx, %r13
-        adcxq	%r15, %r15
-        adoxq	%rax, %r14
-        adoxq	%rbx, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rbx
-        xorq	%rax, %rax
-        #  Move top half into t4-t7 and remove top bit from t3 and double
-        shldq	$3, %r15, %rax
-        shldq	$2, %r14, %r15
-        shldq	$2, %r13, %r14
-        shldq	$2, %r12, %r13
-        shldq	$2, %r11, %r12
-        shldq	$0x01, %r10, %r11
-        shldq	$0x01, %r9, %r10
-        shldq	$0x01, %r8, %r9
-        shlq	$0x01, %r8
-        andq	%rbx, %r11
-        #  Two out left, one in right
-        andq	%rbx, %r15
-        #  Multiply top bits by 19*19
-        imulq	$0x169, %rax, %rcx
-        xorq	%rbx, %rbx
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        adoxq	%rcx, %r8
-        mulxq	%r12, %rax, %r12
-        adcxq	%rax, %r8
-        adoxq	%r12, %r9
-        mulxq	%r13, %rax, %r13
-        adcxq	%rax, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rax, %r14
-        adcxq	%rax, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %r15, %rdx
-        adcxq	%r15, %r11
-        adoxq	%rbx, %rdx
-        adcxq	%rbx, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        movq	$0x7fffffffffffffff, %rbx
-        imulq	$19, %rdx, %rax
-        andq	%rbx, %r11
-        addq	%rax, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rbx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        popq	%r15
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        popq	%rbx
-        repz retq
-#ifndef __APPLE__
-.size	fe_sq2_avx2,.-fe_sq2_avx2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_invert_avx2
-.type	fe_invert_avx2,@function
-.align	16
-fe_invert_avx2:
-#else
-.section	__TEXT,__text
-.globl	_fe_invert_avx2
-.p2align	4
-_fe_invert_avx2:
-#endif /* __APPLE__ */
-        subq	$0x90, %rsp
-        # Invert
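-        #  Fermat inversion: compute z^(p-2) = z^(2^255 - 21) mod p with a
-        #  fixed chain of fe_sq_avx2/fe_sq_n_avx2/fe_mul_avx2 calls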
-        movq	%rdi, 128(%rsp)
-        movq	%rsi, 136(%rsp)
-        movq	%rsp, %rdi
-        movq	136(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        movq	%rsp, %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        movq	136(%rsp), %rsi
-        leaq	32(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        movq	%rsp, %rdi
-        movq	%rsp, %rsi
-        leaq	32(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        movq	%rsp, %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-        leaq	64(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        movq	$4, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_avx2@plt
-#else
-        callq	_fe_sq_n_avx2
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        leaq	32(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        movq	$9, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_avx2@plt
-#else
-        callq	_fe_sq_n_avx2
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        leaq	32(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-        movq	$19, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_avx2@plt
-#else
-        callq	_fe_sq_n_avx2
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-        leaq	64(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        movq	$9, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_avx2@plt
-#else
-        callq	_fe_sq_n_avx2
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        leaq	32(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        movq	$49, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_avx2@plt
-#else
-        callq	_fe_sq_n_avx2
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        leaq	32(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-        movq	$0x63, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_avx2@plt
-#else
-        callq	_fe_sq_n_avx2
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-        leaq	64(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        movq	$49, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_avx2@plt
-#else
-        callq	_fe_sq_n_avx2
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        leaq	32(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-        movq	$4, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_avx2@plt
-#else
-        callq	_fe_sq_n_avx2
-#endif /* __APPLE__ */
-        movq	128(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-        movq	%rsp, %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        movq	136(%rsp), %rsi
-        movq	128(%rsp), %rdi
-        addq	$0x90, %rsp
-        repz retq
-#ifndef __APPLE__
-.text
-.globl	curve25519_avx2
-.type	curve25519_avx2,@function
-.align	16
-curve25519_avx2:
-#else
-.section	__TEXT,__text
-.globl	_curve25519_avx2
-.p2align	4
-_curve25519_avx2:
-#endif /* __APPLE__ */
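-        # X25519 scalar multiply: rdi = output u-coordinate, rsi = scalar (four
-        # 64-bit words), rdx = input u-coordinate; returns 0 in rax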
-        pushq	%rbx
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        pushq	%r15
-        pushq	%rbp
-        movq	%rdx, %r8
-        subq	$0xc0, %rsp
-        movq	$0x00, 184(%rsp)
-        movq	%rdi, 176(%rsp)
-        # Set one
-        movq	$0x01, (%rdi)
-        movq	$0x00, 8(%rdi)
-        movq	$0x00, 16(%rdi)
-        movq	$0x00, 24(%rdi)
-        # Set zero
-        movq	$0x00, (%rsp)
-        movq	$0x00, 8(%rsp)
-        movq	$0x00, 16(%rsp)
-        movq	$0x00, 24(%rsp)
-        # Set one
-        movq	$0x01, 32(%rsp)
-        movq	$0x00, 40(%rsp)
-        movq	$0x00, 48(%rsp)
-        movq	$0x00, 56(%rsp)
-        # Copy
-        movq	(%r8), %r9
-        movq	8(%r8), %r10
-        movq	16(%r8), %r11
-        movq	24(%r8), %r12
-        movq	%r9, 64(%rsp)
-        movq	%r10, 72(%rsp)
-        movq	%r11, 80(%rsp)
-        movq	%r12, 88(%rsp)
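-        # Montgomery ladder state: x2 (rdi) = 1, z2 (0(%rsp)) = 0 is the point
-        # at infinity; x3 (64(%rsp)) = u, z3 (32(%rsp)) = 1 is the input point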
-        movb	$62, 168(%rsp)
-        movq	$3, 160(%rsp)
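-        # Scan the 255 scalar bits from 254 down to 0: bit 62 of word 3 first,
-        # then bits 63..0 of words 2, 1 and 0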
-L_curve25519_avx2_words:
-L_curve25519_avx2_bits:
-        movq	184(%rsp), %rbx
-        movq	160(%rsp), %r9
-        movb	168(%rsp), %cl
-        movq	(%rsi,%r9,8), %rax
-        shrq	%cl, %rax
-        andq	$0x01, %rax
-        xorq	%rax, %rbx
-        negq	%rbx
-        # Conditional Swap
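-        # rbx = -(bit ^ previous bit): all-ones when a swap is needed, else zero;
-        # the XOR/AND/XOR pattern below swaps without branching (constant time)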
-        movq	(%rdi), %r9
-        movq	8(%rdi), %r10
-        movq	16(%rdi), %r11
-        movq	24(%rdi), %r12
-        xorq	64(%rsp), %r9
-        xorq	72(%rsp), %r10
-        xorq	80(%rsp), %r11
-        xorq	88(%rsp), %r12
-        andq	%rbx, %r9
-        andq	%rbx, %r10
-        andq	%rbx, %r11
-        andq	%rbx, %r12
-        xorq	%r9, (%rdi)
-        xorq	%r10, 8(%rdi)
-        xorq	%r11, 16(%rdi)
-        xorq	%r12, 24(%rdi)
-        xorq	%r9, 64(%rsp)
-        xorq	%r10, 72(%rsp)
-        xorq	%r11, 80(%rsp)
-        xorq	%r12, 88(%rsp)
-        # Conditional Swap
-        movq	(%rsp), %r9
-        movq	8(%rsp), %r10
-        movq	16(%rsp), %r11
-        movq	24(%rsp), %r12
-        xorq	32(%rsp), %r9
-        xorq	40(%rsp), %r10
-        xorq	48(%rsp), %r11
-        xorq	56(%rsp), %r12
-        andq	%rbx, %r9
-        andq	%rbx, %r10
-        andq	%rbx, %r11
-        andq	%rbx, %r12
-        xorq	%r9, (%rsp)
-        xorq	%r10, 8(%rsp)
-        xorq	%r11, 16(%rsp)
-        xorq	%r12, 24(%rsp)
-        xorq	%r9, 32(%rsp)
-        xorq	%r10, 40(%rsp)
-        xorq	%r11, 48(%rsp)
-        xorq	%r12, 56(%rsp)
-        movq	%rax, 184(%rsp)
-        # Add
-        movq	(%rdi), %r9
-        movq	8(%rdi), %r10
-        movq	16(%rdi), %r11
-        movq	24(%rdi), %rax
-        movq	%r9, %r13
-        addq	(%rsp), %r9
-        movq	%r10, %r14
-        adcq	8(%rsp), %r10
-        movq	%r11, %r15
-        adcq	16(%rsp), %r11
-        movq	%rax, %rbp
-        adcq	24(%rsp), %rax
-        movq	$-19, %rcx
-        movq	%rax, %r12
-        movq	$0x7fffffffffffffff, %rbx
-        sarq	$63, %rax
-        #   Mask the modulus
-        andq	%rax, %rcx
-        andq	%rax, %rbx
-        #   Sub modulus (if overflow)
-        subq	%rcx, %r9
-        sbbq	%rax, %r10
-        sbbq	%rax, %r11
-        sbbq	%rbx, %r12
-        # Sub
-        subq	(%rsp), %r13
-        movq	$0x00, %rax
-        sbbq	8(%rsp), %r14
-        movq	$-19, %rcx
-        sbbq	16(%rsp), %r15
-        movq	$0x7fffffffffffffff, %rbx
-        sbbq	24(%rsp), %rbp
-        sbbq	$0x00, %rax
-        #   Mask the modulus
-        andq	%rax, %rcx
-        andq	%rax, %rbx
-        #   Add modulus (if underflow)
-        addq	%rcx, %r13
-        adcq	%rax, %r14
-        adcq	%rax, %r15
-        adcq	%rbx, %rbp
-        movq	%r9, (%rdi)
-        movq	%r10, 8(%rdi)
-        movq	%r11, 16(%rdi)
-        movq	%r12, 24(%rdi)
-        movq	%r13, 128(%rsp)
-        movq	%r14, 136(%rsp)
-        movq	%r15, 144(%rsp)
-        movq	%rbp, 152(%rsp)
-        # Add
-        movq	64(%rsp), %r9
-        movq	72(%rsp), %r10
-        movq	80(%rsp), %r11
-        movq	88(%rsp), %rax
-        movq	%r9, %r13
-        addq	32(%rsp), %r9
-        movq	%r10, %r14
-        adcq	40(%rsp), %r10
-        movq	%r11, %r15
-        adcq	48(%rsp), %r11
-        movq	%rax, %rbp
-        adcq	56(%rsp), %rax
-        movq	$-19, %rcx
-        movq	%rax, %r12
-        movq	$0x7fffffffffffffff, %rbx
-        sarq	$63, %rax
-        #   Mask the modulus
-        andq	%rax, %rcx
-        andq	%rax, %rbx
-        #   Sub modulus (if overflow)
-        subq	%rcx, %r9
-        sbbq	%rax, %r10
-        sbbq	%rax, %r11
-        sbbq	%rbx, %r12
-        # Sub
-        subq	32(%rsp), %r13
-        movq	$0x00, %rax
-        sbbq	40(%rsp), %r14
-        movq	$-19, %rcx
-        sbbq	48(%rsp), %r15
-        movq	$0x7fffffffffffffff, %rbx
-        sbbq	56(%rsp), %rbp
-        sbbq	$0x00, %rax
-        #   Mask the modulus
-        andq	%rax, %rcx
-        andq	%rax, %rbx
-        #   Add modulus (if underflow)
-        addq	%rcx, %r13
-        adcq	%rax, %r14
-        adcq	%rax, %r15
-        adcq	%rbx, %rbp
-        movq	%r9, (%rsp)
-        movq	%r10, 8(%rsp)
-        movq	%r11, 16(%rsp)
-        movq	%r12, 24(%rsp)
-        movq	%r13, 96(%rsp)
-        movq	%r14, 104(%rsp)
-        movq	%r15, 112(%rsp)
-        movq	%rbp, 120(%rsp)
-        # Multiply
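-        # 4x4-word schoolbook product using mulx with the adcx/adox dual carry
-        # chains (BMI2/ADX)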
-        # A[0] * B[0]
-        movq	(%rdi), %rdx
-        mulxq	96(%rsp), %r9, %r10
-        # A[2] * B[0]
-        mulxq	112(%rsp), %r11, %r12
-        # A[1] * B[0]
-        mulxq	104(%rsp), %rcx, %rbx
-        xorq	%rbp, %rbp
-        adcxq	%rcx, %r10
-        # A[1] * B[3]
-        movq	24(%rdi), %rdx
-        mulxq	104(%rsp), %r13, %r14
-        adcxq	%rbx, %r11
-        # A[0] * B[1]
-        movq	8(%rdi), %rdx
-        mulxq	96(%rsp), %rcx, %rbx
-        adoxq	%rcx, %r10
-        # A[2] * B[1]
-        mulxq	112(%rsp), %rcx, %r15
-        adoxq	%rbx, %r11
-        adcxq	%rcx, %r12
-        # A[1] * B[2]
-        movq	16(%rdi), %rdx
-        mulxq	104(%rsp), %rcx, %rbx
-        adcxq	%r15, %r13
-        adoxq	%rcx, %r12
-        adcxq	%rbp, %r14
-        adoxq	%rbx, %r13
-        # A[0] * B[2]
-        mulxq	96(%rsp), %rcx, %rbx
-        adoxq	%rbp, %r14
-        xorq	%r15, %r15
-        adcxq	%rcx, %r11
-        # A[1] * B[1]
-        movq	8(%rdi), %rdx
-        mulxq	104(%rsp), %rdx, %rcx
-        adcxq	%rbx, %r12
-        adoxq	%rdx, %r11
-        # A[3] * B[1]
-        movq	8(%rdi), %rdx
-        adoxq	%rcx, %r12
-        mulxq	120(%rsp), %rcx, %rbx
-        adcxq	%rcx, %r13
-        # A[2] * B[2]
-        movq	16(%rdi), %rdx
-        mulxq	112(%rsp), %rdx, %rcx
-        adcxq	%rbx, %r14
-        adoxq	%rdx, %r13
-        # A[3] * B[3]
-        movq	24(%rdi), %rdx
-        adoxq	%rcx, %r14
-        mulxq	120(%rsp), %rcx, %rbx
-        adoxq	%rbp, %r15
-        adcxq	%rcx, %r15
-        # A[0] * B[3]
-        mulxq	96(%rsp), %rdx, %rcx
-        adcxq	%rbx, %rbp
-        xorq	%rbx, %rbx
-        adcxq	%rdx, %r12
-        # A[3] * B[0]
-        movq	(%rdi), %rdx
-        adcxq	%rcx, %r13
-        mulxq	120(%rsp), %rdx, %rcx
-        adoxq	%rdx, %r12
-        adoxq	%rcx, %r13
-        # A[2] * B[3]
-        movq	24(%rdi), %rdx
-        mulxq	112(%rsp), %rdx, %rcx
-        adcxq	%rdx, %r14
-        # A[3] * B[2]
-        movq	16(%rdi), %rdx
-        adcxq	%rcx, %r15
-        mulxq	120(%rsp), %rcx, %rdx
-        adcxq	%rbx, %rbp
-        adoxq	%rcx, %r14
-        adoxq	%rdx, %r15
-        adoxq	%rbx, %rbp
-        # Reduce
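-        #  Write the 512-bit product as l + 2^255*h with l the low 255 bits;
-        #  2^255 = 19 (mod p), so the result is congruent to l + 19*h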
-        movq	$0x7fffffffffffffff, %rbx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r15, %rbp
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        andq	%rbx, %r12
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rbx, %rbx
-        mulxq	%r13, %rcx, %r13
-        adcxq	%rcx, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rcx, %r14
-        adcxq	%rcx, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %rcx, %r15
-        adcxq	%rcx, %r11
-        adoxq	%r15, %r12
-        mulxq	%rbp, %rbp, %rdx
-        adcxq	%rbp, %r12
-        adoxq	%rbx, %rdx
-        adcxq	%rbx, %rdx
-        #  Overflow
-        shldq	$0x01, %r12, %rdx
-        movq	$0x7fffffffffffffff, %rbx
-        imulq	$19, %rdx, %rcx
-        andq	%rbx, %r12
-        addq	%rcx, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        adcq	$0x00, %r12
-        # Reduce if top bit set
-        movq	%r12, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rbx, %r12
-        addq	%rdx, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        adcq	$0x00, %r12
-        # Store
-        movq	%r9, 32(%rsp)
-        movq	%r10, 40(%rsp)
-        movq	%r11, 48(%rsp)
-        movq	%r12, 56(%rsp)
-        # Multiply
-        # A[0] * B[0]
-        movq	128(%rsp), %rdx
-        mulxq	(%rsp), %r9, %r10
-        # A[2] * B[0]
-        mulxq	16(%rsp), %r11, %r12
-        # A[1] * B[0]
-        mulxq	8(%rsp), %rcx, %rbx
-        xorq	%rbp, %rbp
-        adcxq	%rcx, %r10
-        # A[1] * B[3]
-        movq	152(%rsp), %rdx
-        mulxq	8(%rsp), %r13, %r14
-        adcxq	%rbx, %r11
-        # A[0] * B[1]
-        movq	136(%rsp), %rdx
-        mulxq	(%rsp), %rcx, %rbx
-        adoxq	%rcx, %r10
-        # A[2] * B[1]
-        mulxq	16(%rsp), %rcx, %r15
-        adoxq	%rbx, %r11
-        adcxq	%rcx, %r12
-        # A[1] * B[2]
-        movq	144(%rsp), %rdx
-        mulxq	8(%rsp), %rcx, %rbx
-        adcxq	%r15, %r13
-        adoxq	%rcx, %r12
-        adcxq	%rbp, %r14
-        adoxq	%rbx, %r13
-        # A[0] * B[2]
-        mulxq	(%rsp), %rcx, %rbx
-        adoxq	%rbp, %r14
-        xorq	%r15, %r15
-        adcxq	%rcx, %r11
-        # A[1] * B[1]
-        movq	136(%rsp), %rdx
-        mulxq	8(%rsp), %rdx, %rcx
-        adcxq	%rbx, %r12
-        adoxq	%rdx, %r11
-        # A[3] * B[1]
-        movq	136(%rsp), %rdx
-        adoxq	%rcx, %r12
-        mulxq	24(%rsp), %rcx, %rbx
-        adcxq	%rcx, %r13
-        # A[2] * B[2]
-        movq	144(%rsp), %rdx
-        mulxq	16(%rsp), %rdx, %rcx
-        adcxq	%rbx, %r14
-        adoxq	%rdx, %r13
-        # A[3] * B[3]
-        movq	152(%rsp), %rdx
-        adoxq	%rcx, %r14
-        mulxq	24(%rsp), %rcx, %rbx
-        adoxq	%rbp, %r15
-        adcxq	%rcx, %r15
-        # A[0] * B[3]
-        mulxq	(%rsp), %rdx, %rcx
-        adcxq	%rbx, %rbp
-        xorq	%rbx, %rbx
-        adcxq	%rdx, %r12
-        # A[3] * B[0]
-        movq	128(%rsp), %rdx
-        adcxq	%rcx, %r13
-        mulxq	24(%rsp), %rdx, %rcx
-        adoxq	%rdx, %r12
-        adoxq	%rcx, %r13
-        # A[2] * B[3]
-        movq	152(%rsp), %rdx
-        mulxq	16(%rsp), %rdx, %rcx
-        adcxq	%rdx, %r14
-        # A[3] * B[2]
-        movq	144(%rsp), %rdx
-        adcxq	%rcx, %r15
-        mulxq	24(%rsp), %rcx, %rdx
-        adcxq	%rbx, %rbp
-        adoxq	%rcx, %r14
-        adoxq	%rdx, %r15
-        adoxq	%rbx, %rbp
-        # Reduce
-        movq	$0x7fffffffffffffff, %rbx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r15, %rbp
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        andq	%rbx, %r12
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rbx, %rbx
-        mulxq	%r13, %rcx, %r13
-        adcxq	%rcx, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rcx, %r14
-        adcxq	%rcx, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %rcx, %r15
-        adcxq	%rcx, %r11
-        adoxq	%r15, %r12
-        mulxq	%rbp, %rbp, %rdx
-        adcxq	%rbp, %r12
-        adoxq	%rbx, %rdx
-        adcxq	%rbx, %rdx
-        #  Overflow
-        shldq	$0x01, %r12, %rdx
-        movq	$0x7fffffffffffffff, %rbx
-        imulq	$19, %rdx, %rcx
-        andq	%rbx, %r12
-        addq	%rcx, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        adcq	$0x00, %r12
-        # Reduce if top bit set
-        movq	%r12, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rbx, %r12
-        addq	%rdx, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        adcq	$0x00, %r12
-        # Store
-        movq	%r9, (%rsp)
-        movq	%r10, 8(%rsp)
-        movq	%r11, 16(%rsp)
-        movq	%r12, 24(%rsp)
-        # Square
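-        # Squaring: form each off-diagonal product once, double via the carry
-        # chain, then add the diagonal terms A[i]*A[i]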
-        # A[0] * A[1]
-        movq	128(%rsp), %rdx
-        mulxq	136(%rsp), %r10, %r11
-        # A[0] * A[3]
-        mulxq	152(%rsp), %r12, %r13
-        # A[2] * A[1]
-        movq	144(%rsp), %rdx
-        mulxq	136(%rsp), %rcx, %rbx
-        xorq	%rbp, %rbp
-        adoxq	%rcx, %r12
-        # A[2] * A[3]
-        mulxq	152(%rsp), %r14, %r15
-        adoxq	%rbx, %r13
-        # A[2] * A[0]
-        mulxq	128(%rsp), %rcx, %rbx
-        adoxq	%rbp, %r14
-        adcxq	%rcx, %r11
-        adoxq	%rbp, %r15
-        # A[1] * A[3]
-        movq	136(%rsp), %rdx
-        mulxq	152(%rsp), %rax, %r9
-        adcxq	%rbx, %r12
-        adcxq	%rax, %r13
-        adcxq	%r9, %r14
-        adcxq	%rbp, %r15
-        # Double with Carry Flag
-        xorq	%rbp, %rbp
-        # A[0] * A[0]
-        movq	128(%rsp), %rdx
-        mulxq	%rdx, %r9, %rax
-        adcxq	%r10, %r10
-        # A[1] * A[1]
-        movq	136(%rsp), %rdx
-        mulxq	%rdx, %rcx, %rbx
-        adcxq	%r11, %r11
-        adoxq	%rax, %r10
-        adcxq	%r12, %r12
-        adoxq	%rcx, %r11
-        # A[2] * A[2]
-        movq	144(%rsp), %rdx
-        mulxq	%rdx, %rax, %rcx
-        adcxq	%r13, %r13
-        adoxq	%rbx, %r12
-        adcxq	%r14, %r14
-        adoxq	%rax, %r13
-        # A[3] * A[3]
-        movq	152(%rsp), %rdx
-        mulxq	%rdx, %rax, %rbx
-        adcxq	%r15, %r15
-        adoxq	%rcx, %r14
-        adcxq	%rbp, %rbp
-        adoxq	%rax, %r15
-        adoxq	%rbx, %rbp
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r15, %rbp
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        andq	%rcx, %r12
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rcx, %rcx
-        mulxq	%r13, %rax, %r13
-        adcxq	%rax, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rax, %r14
-        adcxq	%rax, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %rax, %r15
-        adcxq	%rax, %r11
-        adoxq	%r15, %r12
-        mulxq	%rbp, %rbp, %rdx
-        adcxq	%rbp, %r12
-        adoxq	%rcx, %rdx
-        adcxq	%rcx, %rdx
-        #  Overflow
-        shldq	$0x01, %r12, %rdx
-        movq	$0x7fffffffffffffff, %rcx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r12
-        addq	%rax, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        adcq	$0x00, %r12
-        # Reduce if top bit set
-        movq	%r12, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r12
-        addq	%rdx, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        adcq	$0x00, %r12
-        # Store
-        movq	%r9, 96(%rsp)
-        movq	%r10, 104(%rsp)
-        movq	%r11, 112(%rsp)
-        movq	%r12, 120(%rsp)
-        # Square
-        # A[0] * A[1]
-        movq	(%rdi), %rdx
-        mulxq	8(%rdi), %r10, %r11
-        # A[0] * A[3]
-        mulxq	24(%rdi), %r12, %r13
-        # A[2] * A[1]
-        movq	16(%rdi), %rdx
-        mulxq	8(%rdi), %rcx, %rbx
-        xorq	%rbp, %rbp
-        adoxq	%rcx, %r12
-        # A[2] * A[3]
-        mulxq	24(%rdi), %r14, %r15
-        adoxq	%rbx, %r13
-        # A[2] * A[0]
-        mulxq	(%rdi), %rcx, %rbx
-        adoxq	%rbp, %r14
-        adcxq	%rcx, %r11
-        adoxq	%rbp, %r15
-        # A[1] * A[3]
-        movq	8(%rdi), %rdx
-        mulxq	24(%rdi), %rax, %r9
-        adcxq	%rbx, %r12
-        adcxq	%rax, %r13
-        adcxq	%r9, %r14
-        adcxq	%rbp, %r15
-        # Double with Carry Flag
-        xorq	%rbp, %rbp
-        # A[0] * A[0]
-        movq	(%rdi), %rdx
-        mulxq	%rdx, %r9, %rax
-        adcxq	%r10, %r10
-        # A[1] * A[1]
-        movq	8(%rdi), %rdx
-        mulxq	%rdx, %rcx, %rbx
-        adcxq	%r11, %r11
-        adoxq	%rax, %r10
-        adcxq	%r12, %r12
-        adoxq	%rcx, %r11
-        # A[2] * A[2]
-        movq	16(%rdi), %rdx
-        mulxq	%rdx, %rax, %rcx
-        adcxq	%r13, %r13
-        adoxq	%rbx, %r12
-        adcxq	%r14, %r14
-        adoxq	%rax, %r13
-        # A[3] * A[3]
-        movq	24(%rdi), %rdx
-        mulxq	%rdx, %rax, %rbx
-        adcxq	%r15, %r15
-        adoxq	%rcx, %r14
-        adcxq	%rbp, %rbp
-        adoxq	%rax, %r15
-        adoxq	%rbx, %rbp
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r15, %rbp
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        andq	%rcx, %r12
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rcx, %rcx
-        mulxq	%r13, %rax, %r13
-        adcxq	%rax, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rax, %r14
-        adcxq	%rax, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %rax, %r15
-        adcxq	%rax, %r11
-        adoxq	%r15, %r12
-        mulxq	%rbp, %rbp, %rdx
-        adcxq	%rbp, %r12
-        adoxq	%rcx, %rdx
-        adcxq	%rcx, %rdx
-        #  Overflow
-        shldq	$0x01, %r12, %rdx
-        movq	$0x7fffffffffffffff, %rcx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r12
-        addq	%rax, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        adcq	$0x00, %r12
-        # Reduce if top bit set
-        movq	%r12, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r12
-        addq	%rdx, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        adcq	$0x00, %r12
-        # Store
-        movq	%r9, 128(%rsp)
-        movq	%r10, 136(%rsp)
-        movq	%r11, 144(%rsp)
-        movq	%r12, 152(%rsp)
-        # Add
-        movq	32(%rsp), %r9
-        movq	40(%rsp), %r10
-        movq	48(%rsp), %r11
-        movq	56(%rsp), %rax
-        movq	%r9, %r13
-        addq	(%rsp), %r9
-        movq	%r10, %r14
-        adcq	8(%rsp), %r10
-        movq	%r11, %r15
-        adcq	16(%rsp), %r11
-        movq	%rax, %rbp
-        adcq	24(%rsp), %rax
-        movq	$-19, %rcx
-        movq	%rax, %r12
-        movq	$0x7fffffffffffffff, %rbx
-        sarq	$63, %rax
-        #   Mask the modulus
-        andq	%rax, %rcx
-        andq	%rax, %rbx
-        #   Sub modulus (if overflow)
-        subq	%rcx, %r9
-        sbbq	%rax, %r10
-        sbbq	%rax, %r11
-        sbbq	%rbx, %r12
-        # Sub
-        subq	(%rsp), %r13
-        movq	$0x00, %rax
-        sbbq	8(%rsp), %r14
-        movq	$-19, %rcx
-        sbbq	16(%rsp), %r15
-        movq	$0x7fffffffffffffff, %rbx
-        sbbq	24(%rsp), %rbp
-        sbbq	$0x00, %rax
-        #   Mask the modulus
-        andq	%rax, %rcx
-        andq	%rax, %rbx
-        #   Add modulus (if underflow)
-        addq	%rcx, %r13
-        adcq	%rax, %r14
-        adcq	%rax, %r15
-        adcq	%rbx, %rbp
-        movq	%r9, 64(%rsp)
-        movq	%r10, 72(%rsp)
-        movq	%r11, 80(%rsp)
-        movq	%r12, 88(%rsp)
-        movq	%r13, (%rsp)
-        movq	%r14, 8(%rsp)
-        movq	%r15, 16(%rsp)
-        movq	%rbp, 24(%rsp)
-        # Multiply
-        # A[0] * B[0]
-        movq	96(%rsp), %rdx
-        mulxq	128(%rsp), %r9, %r10
-        # A[2] * B[0]
-        mulxq	144(%rsp), %r11, %r12
-        # A[1] * B[0]
-        mulxq	136(%rsp), %rcx, %rbx
-        xorq	%rbp, %rbp
-        adcxq	%rcx, %r10
-        # A[1] * B[3]
-        movq	120(%rsp), %rdx
-        mulxq	136(%rsp), %r13, %r14
-        adcxq	%rbx, %r11
-        # A[0] * B[1]
-        movq	104(%rsp), %rdx
-        mulxq	128(%rsp), %rcx, %rbx
-        adoxq	%rcx, %r10
-        # A[2] * B[1]
-        mulxq	144(%rsp), %rcx, %r15
-        adoxq	%rbx, %r11
-        adcxq	%rcx, %r12
-        # A[1] * B[2]
-        movq	112(%rsp), %rdx
-        mulxq	136(%rsp), %rcx, %rbx
-        adcxq	%r15, %r13
-        adoxq	%rcx, %r12
-        adcxq	%rbp, %r14
-        adoxq	%rbx, %r13
-        # A[0] * B[2]
-        mulxq	128(%rsp), %rcx, %rbx
-        adoxq	%rbp, %r14
-        xorq	%r15, %r15
-        adcxq	%rcx, %r11
-        # A[1] * B[1]
-        movq	104(%rsp), %rdx
-        mulxq	136(%rsp), %rdx, %rcx
-        adcxq	%rbx, %r12
-        adoxq	%rdx, %r11
-        # A[3] * B[1]
-        movq	104(%rsp), %rdx
-        adoxq	%rcx, %r12
-        mulxq	152(%rsp), %rcx, %rbx
-        adcxq	%rcx, %r13
-        # A[2] * B[2]
-        movq	112(%rsp), %rdx
-        mulxq	144(%rsp), %rdx, %rcx
-        adcxq	%rbx, %r14
-        adoxq	%rdx, %r13
-        # A[3] * B[3]
-        movq	120(%rsp), %rdx
-        adoxq	%rcx, %r14
-        mulxq	152(%rsp), %rcx, %rbx
-        adoxq	%rbp, %r15
-        adcxq	%rcx, %r15
-        # A[0] * B[3]
-        mulxq	128(%rsp), %rdx, %rcx
-        adcxq	%rbx, %rbp
-        xorq	%rbx, %rbx
-        adcxq	%rdx, %r12
-        # A[3] * B[0]
-        movq	96(%rsp), %rdx
-        adcxq	%rcx, %r13
-        mulxq	152(%rsp), %rdx, %rcx
-        adoxq	%rdx, %r12
-        adoxq	%rcx, %r13
-        # A[2] * B[3]
-        movq	120(%rsp), %rdx
-        mulxq	144(%rsp), %rdx, %rcx
-        adcxq	%rdx, %r14
-        # A[3] * B[2]
-        movq	112(%rsp), %rdx
-        adcxq	%rcx, %r15
-        mulxq	152(%rsp), %rcx, %rdx
-        adcxq	%rbx, %rbp
-        adoxq	%rcx, %r14
-        adoxq	%rdx, %r15
-        adoxq	%rbx, %rbp
-        # Reduce
-        movq	$0x7fffffffffffffff, %rbx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r15, %rbp
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        andq	%rbx, %r12
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rbx, %rbx
-        mulxq	%r13, %rcx, %r13
-        adcxq	%rcx, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rcx, %r14
-        adcxq	%rcx, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %rcx, %r15
-        adcxq	%rcx, %r11
-        adoxq	%r15, %r12
-        mulxq	%rbp, %rbp, %rdx
-        adcxq	%rbp, %r12
-        adoxq	%rbx, %rdx
-        adcxq	%rbx, %rdx
-        #  Overflow
-        shldq	$0x01, %r12, %rdx
-        movq	$0x7fffffffffffffff, %rbx
-        imulq	$19, %rdx, %rcx
-        andq	%rbx, %r12
-        addq	%rcx, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        adcq	$0x00, %r12
-        # Reduce if top bit set
-        movq	%r12, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rbx, %r12
-        addq	%rdx, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        adcq	$0x00, %r12
-        # Store
-        movq	%r9, (%rdi)
-        movq	%r10, 8(%rdi)
-        movq	%r11, 16(%rdi)
-        movq	%r12, 24(%rdi)
-        # Sub
-        movq	128(%rsp), %r9
-        movq	136(%rsp), %r10
-        movq	144(%rsp), %r11
-        movq	152(%rsp), %r12
-        subq	96(%rsp), %r9
-        movq	$0x00, %rax
-        sbbq	104(%rsp), %r10
-        movq	$-19, %rcx
-        sbbq	112(%rsp), %r11
-        movq	$0x7fffffffffffffff, %rbx
-        sbbq	120(%rsp), %r12
-        sbbq	$0x00, %rax
-        #   Mask the modulus
-        andq	%rax, %rcx
-        andq	%rax, %rbx
-        #   Add modulus (if underflow)
-        addq	%rcx, %r9
-        adcq	%rax, %r10
-        adcq	%rax, %r11
-        adcq	%rbx, %r12
-        movq	%r9, 128(%rsp)
-        movq	%r10, 136(%rsp)
-        movq	%r11, 144(%rsp)
-        movq	%r12, 152(%rsp)
-        # Square
-        # A[0] * A[1]
-        movq	(%rsp), %rdx
-        mulxq	8(%rsp), %r10, %r11
-        # A[0] * A[3]
-        mulxq	24(%rsp), %r12, %r13
-        # A[2] * A[1]
-        movq	16(%rsp), %rdx
-        mulxq	8(%rsp), %rcx, %rbx
-        xorq	%rbp, %rbp
-        adoxq	%rcx, %r12
-        # A[2] * A[3]
-        mulxq	24(%rsp), %r14, %r15
-        adoxq	%rbx, %r13
-        # A[2] * A[0]
-        mulxq	(%rsp), %rcx, %rbx
-        adoxq	%rbp, %r14
-        adcxq	%rcx, %r11
-        adoxq	%rbp, %r15
-        # A[1] * A[3]
-        movq	8(%rsp), %rdx
-        mulxq	24(%rsp), %rax, %r9
-        adcxq	%rbx, %r12
-        adcxq	%rax, %r13
-        adcxq	%r9, %r14
-        adcxq	%rbp, %r15
-        # Double with Carry Flag
-        xorq	%rbp, %rbp
-        # A[0] * A[0]
-        movq	(%rsp), %rdx
-        mulxq	%rdx, %r9, %rax
-        adcxq	%r10, %r10
-        # A[1] * A[1]
-        movq	8(%rsp), %rdx
-        mulxq	%rdx, %rcx, %rbx
-        adcxq	%r11, %r11
-        adoxq	%rax, %r10
-        adcxq	%r12, %r12
-        adoxq	%rcx, %r11
-        # A[2] * A[2]
-        movq	16(%rsp), %rdx
-        mulxq	%rdx, %rax, %rcx
-        adcxq	%r13, %r13
-        adoxq	%rbx, %r12
-        adcxq	%r14, %r14
-        adoxq	%rax, %r13
-        # A[3] * A[3]
-        movq	24(%rsp), %rdx
-        mulxq	%rdx, %rax, %rbx
-        adcxq	%r15, %r15
-        adoxq	%rcx, %r14
-        adcxq	%rbp, %rbp
-        adoxq	%rax, %r15
-        adoxq	%rbx, %rbp
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r15, %rbp
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        andq	%rcx, %r12
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rcx, %rcx
-        mulxq	%r13, %rax, %r13
-        adcxq	%rax, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rax, %r14
-        adcxq	%rax, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %rax, %r15
-        adcxq	%rax, %r11
-        adoxq	%r15, %r12
-        mulxq	%rbp, %rbp, %rdx
-        adcxq	%rbp, %r12
-        adoxq	%rcx, %rdx
-        adcxq	%rcx, %rdx
-        #  Overflow
-        shldq	$0x01, %r12, %rdx
-        movq	$0x7fffffffffffffff, %rcx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r12
-        addq	%rax, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        adcq	$0x00, %r12
-        # Reduce if top bit set
-        movq	%r12, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r12
-        addq	%rdx, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        adcq	$0x00, %r12
-        # Store
-        movq	%r9, (%rsp)
-        movq	%r10, 8(%rsp)
-        movq	%r11, 16(%rsp)
-        movq	%r12, 24(%rsp)
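-        # Multiply E = AA - BB by 121666 (0x1db42); since AA = BB + E, the later
-        # z2 = E*(BB + 121666*E) equals E*(AA + 121665*E), the RFC 7748 formula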
-        movq	$0x1db42, %rdx
-        mulxq	128(%rsp), %r9, %rbp
-        mulxq	136(%rsp), %r10, %r15
-        mulxq	144(%rsp), %r11, %r14
-        mulxq	152(%rsp), %r12, %r13
-        addq	%rbp, %r10
-        adcq	%r15, %r11
-        adcq	%r14, %r12
-        adcq	$0x00, %r13
-        movq	$0x7fffffffffffffff, %rbp
-        shldq	$0x01, %r12, %r13
-        andq	%rbp, %r12
-        imulq	$19, %r13, %r13
-        addq	%r13, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        adcq	$0x00, %r12
-        movq	%r9, 32(%rsp)
-        movq	%r10, 40(%rsp)
-        movq	%r11, 48(%rsp)
-        movq	%r12, 56(%rsp)
-        # Square
-        # A[0] * A[1]
-        movq	64(%rsp), %rdx
-        mulxq	72(%rsp), %r10, %r11
-        # A[0] * A[3]
-        mulxq	88(%rsp), %r12, %r13
-        # A[2] * A[1]
-        movq	80(%rsp), %rdx
-        mulxq	72(%rsp), %rcx, %rbx
-        xorq	%rbp, %rbp
-        adoxq	%rcx, %r12
-        # A[2] * A[3]
-        mulxq	88(%rsp), %r14, %r15
-        adoxq	%rbx, %r13
-        # A[2] * A[0]
-        mulxq	64(%rsp), %rcx, %rbx
-        adoxq	%rbp, %r14
-        adcxq	%rcx, %r11
-        adoxq	%rbp, %r15
-        # A[1] * A[3]
-        movq	72(%rsp), %rdx
-        mulxq	88(%rsp), %rax, %r9
-        adcxq	%rbx, %r12
-        adcxq	%rax, %r13
-        adcxq	%r9, %r14
-        adcxq	%rbp, %r15
-        # Double with Carry Flag
-        xorq	%rbp, %rbp
-        # A[0] * A[0]
-        movq	64(%rsp), %rdx
-        mulxq	%rdx, %r9, %rax
-        adcxq	%r10, %r10
-        # A[1] * A[1]
-        movq	72(%rsp), %rdx
-        mulxq	%rdx, %rcx, %rbx
-        adcxq	%r11, %r11
-        adoxq	%rax, %r10
-        adcxq	%r12, %r12
-        adoxq	%rcx, %r11
-        # A[2] * A[2]
-        movq	80(%rsp), %rdx
-        mulxq	%rdx, %rax, %rcx
-        adcxq	%r13, %r13
-        adoxq	%rbx, %r12
-        adcxq	%r14, %r14
-        adoxq	%rax, %r13
-        # A[3] * A[3]
-        movq	88(%rsp), %rdx
-        mulxq	%rdx, %rax, %rbx
-        adcxq	%r15, %r15
-        adoxq	%rcx, %r14
-        adcxq	%rbp, %rbp
-        adoxq	%rax, %r15
-        adoxq	%rbx, %rbp
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r15, %rbp
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        andq	%rcx, %r12
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rcx, %rcx
-        mulxq	%r13, %rax, %r13
-        adcxq	%rax, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rax, %r14
-        adcxq	%rax, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %rax, %r15
-        adcxq	%rax, %r11
-        adoxq	%r15, %r12
-        mulxq	%rbp, %rbp, %rdx
-        adcxq	%rbp, %r12
-        adoxq	%rcx, %rdx
-        adcxq	%rcx, %rdx
-        #  Overflow
-        shldq	$0x01, %r12, %rdx
-        movq	$0x7fffffffffffffff, %rcx
-        imulq	$19, %rdx, %rax
-        andq	%rcx, %r12
-        addq	%rax, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        adcq	$0x00, %r12
-        # Reduce if top bit set
-        movq	%r12, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r12
-        addq	%rdx, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        adcq	$0x00, %r12
-        # Store
-        movq	%r9, 64(%rsp)
-        movq	%r10, 72(%rsp)
-        movq	%r11, 80(%rsp)
-        movq	%r12, 88(%rsp)
-        # Add
-        movq	96(%rsp), %r9
-        movq	104(%rsp), %r10
-        addq	32(%rsp), %r9
-        movq	112(%rsp), %r11
-        adcq	40(%rsp), %r10
-        movq	120(%rsp), %rax
-        adcq	48(%rsp), %r11
-        movq	$-19, %rcx
-        adcq	56(%rsp), %rax
-        movq	$0x7fffffffffffffff, %rbx
-        movq	%rax, %r12
-        sarq	$63, %rax
-        #   Mask the modulus
-        andq	%rax, %rcx
-        andq	%rax, %rbx
-        #   Sub modulus (if overflow)
-        subq	%rcx, %r9
-        sbbq	%rax, %r10
-        sbbq	%rax, %r11
-        sbbq	%rbx, %r12
-        movq	%r9, 96(%rsp)
-        movq	%r10, 104(%rsp)
-        movq	%r11, 112(%rsp)
-        movq	%r12, 120(%rsp)
-        # Multiply
-        # A[0] * B[0]
-        movq	(%rsp), %rdx
-        mulxq	(%r8), %r9, %r10
-        # A[2] * B[0]
-        mulxq	16(%r8), %r11, %r12
-        # A[1] * B[0]
-        mulxq	8(%r8), %rcx, %rbx
-        xorq	%rbp, %rbp
-        adcxq	%rcx, %r10
-        # A[1] * B[3]
-        movq	24(%rsp), %rdx
-        mulxq	8(%r8), %r13, %r14
-        adcxq	%rbx, %r11
-        # A[0] * B[1]
-        movq	8(%rsp), %rdx
-        mulxq	(%r8), %rcx, %rbx
-        adoxq	%rcx, %r10
-        # A[2] * B[1]
-        mulxq	16(%r8), %rcx, %r15
-        adoxq	%rbx, %r11
-        adcxq	%rcx, %r12
-        # A[1] * B[2]
-        movq	16(%rsp), %rdx
-        mulxq	8(%r8), %rcx, %rbx
-        adcxq	%r15, %r13
-        adoxq	%rcx, %r12
-        adcxq	%rbp, %r14
-        adoxq	%rbx, %r13
-        # A[0] * B[2]
-        mulxq	(%r8), %rcx, %rbx
-        adoxq	%rbp, %r14
-        xorq	%r15, %r15
-        adcxq	%rcx, %r11
-        # A[1] * B[1]
-        movq	8(%rsp), %rdx
-        mulxq	8(%r8), %rdx, %rcx
-        adcxq	%rbx, %r12
-        adoxq	%rdx, %r11
-        # A[3] * B[1]
-        movq	8(%rsp), %rdx
-        adoxq	%rcx, %r12
-        mulxq	24(%r8), %rcx, %rbx
-        adcxq	%rcx, %r13
-        # A[2] * B[2]
-        movq	16(%rsp), %rdx
-        mulxq	16(%r8), %rdx, %rcx
-        adcxq	%rbx, %r14
-        adoxq	%rdx, %r13
-        # A[3] * B[3]
-        movq	24(%rsp), %rdx
-        adoxq	%rcx, %r14
-        mulxq	24(%r8), %rcx, %rbx
-        adoxq	%rbp, %r15
-        adcxq	%rcx, %r15
-        # A[0] * B[3]
-        mulxq	(%r8), %rdx, %rcx
-        adcxq	%rbx, %rbp
-        xorq	%rbx, %rbx
-        adcxq	%rdx, %r12
-        # A[3] * B[0]
-        movq	(%rsp), %rdx
-        adcxq	%rcx, %r13
-        mulxq	24(%r8), %rdx, %rcx
-        adoxq	%rdx, %r12
-        adoxq	%rcx, %r13
-        # A[2] * B[3]
-        movq	24(%rsp), %rdx
-        mulxq	16(%r8), %rdx, %rcx
-        adcxq	%rdx, %r14
-        # A[3] * B[2]
-        movq	16(%rsp), %rdx
-        adcxq	%rcx, %r15
-        mulxq	24(%r8), %rcx, %rdx
-        adcxq	%rbx, %rbp
-        adoxq	%rcx, %r14
-        adoxq	%rdx, %r15
-        adoxq	%rbx, %rbp
-        # Reduce
-        movq	$0x7fffffffffffffff, %rbx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r15, %rbp
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        andq	%rbx, %r12
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rbx, %rbx
-        mulxq	%r13, %rcx, %r13
-        adcxq	%rcx, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rcx, %r14
-        adcxq	%rcx, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %rcx, %r15
-        adcxq	%rcx, %r11
-        adoxq	%r15, %r12
-        mulxq	%rbp, %rbp, %rdx
-        adcxq	%rbp, %r12
-        adoxq	%rbx, %rdx
-        adcxq	%rbx, %rdx
-        #  Overflow
-        shldq	$0x01, %r12, %rdx
-        movq	$0x7fffffffffffffff, %rbx
-        imulq	$19, %rdx, %rcx
-        andq	%rbx, %r12
-        addq	%rcx, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        adcq	$0x00, %r12
-        # Reduce if top bit set
-        movq	%r12, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rbx, %r12
-        addq	%rdx, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        adcq	$0x00, %r12
-        # Store
-        movq	%r9, 32(%rsp)
-        movq	%r10, 40(%rsp)
-        movq	%r11, 48(%rsp)
-        movq	%r12, 56(%rsp)
-        # Multiply
-        # A[0] * B[0]
-        movq	96(%rsp), %rdx
-        mulxq	128(%rsp), %r9, %r10
-        # A[2] * B[0]
-        mulxq	144(%rsp), %r11, %r12
-        # A[1] * B[0]
-        mulxq	136(%rsp), %rcx, %rbx
-        xorq	%rbp, %rbp
-        adcxq	%rcx, %r10
-        # A[1] * B[3]
-        movq	120(%rsp), %rdx
-        mulxq	136(%rsp), %r13, %r14
-        adcxq	%rbx, %r11
-        # A[0] * B[1]
-        movq	104(%rsp), %rdx
-        mulxq	128(%rsp), %rcx, %rbx
-        adoxq	%rcx, %r10
-        # A[2] * B[1]
-        mulxq	144(%rsp), %rcx, %r15
-        adoxq	%rbx, %r11
-        adcxq	%rcx, %r12
-        # A[1] * B[2]
-        movq	112(%rsp), %rdx
-        mulxq	136(%rsp), %rcx, %rbx
-        adcxq	%r15, %r13
-        adoxq	%rcx, %r12
-        adcxq	%rbp, %r14
-        adoxq	%rbx, %r13
-        # A[0] * B[2]
-        mulxq	128(%rsp), %rcx, %rbx
-        adoxq	%rbp, %r14
-        xorq	%r15, %r15
-        adcxq	%rcx, %r11
-        # A[1] * B[1]
-        movq	104(%rsp), %rdx
-        mulxq	136(%rsp), %rdx, %rcx
-        adcxq	%rbx, %r12
-        adoxq	%rdx, %r11
-        # A[3] * B[1]
-        movq	104(%rsp), %rdx
-        adoxq	%rcx, %r12
-        mulxq	152(%rsp), %rcx, %rbx
-        adcxq	%rcx, %r13
-        # A[2] * B[2]
-        movq	112(%rsp), %rdx
-        mulxq	144(%rsp), %rdx, %rcx
-        adcxq	%rbx, %r14
-        adoxq	%rdx, %r13
-        # A[3] * B[3]
-        movq	120(%rsp), %rdx
-        adoxq	%rcx, %r14
-        mulxq	152(%rsp), %rcx, %rbx
-        adoxq	%rbp, %r15
-        adcxq	%rcx, %r15
-        # A[0] * B[3]
-        mulxq	128(%rsp), %rdx, %rcx
-        adcxq	%rbx, %rbp
-        xorq	%rbx, %rbx
-        adcxq	%rdx, %r12
-        # A[3] * B[0]
-        movq	96(%rsp), %rdx
-        adcxq	%rcx, %r13
-        mulxq	152(%rsp), %rdx, %rcx
-        adoxq	%rdx, %r12
-        adoxq	%rcx, %r13
-        # A[2] * B[3]
-        movq	120(%rsp), %rdx
-        mulxq	144(%rsp), %rdx, %rcx
-        adcxq	%rdx, %r14
-        # A[3] * B[2]
-        movq	112(%rsp), %rdx
-        adcxq	%rcx, %r15
-        mulxq	152(%rsp), %rcx, %rdx
-        adcxq	%rbx, %rbp
-        adoxq	%rcx, %r14
-        adoxq	%rdx, %r15
-        adoxq	%rbx, %rbp
-        # Reduce
-        movq	$0x7fffffffffffffff, %rbx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r15, %rbp
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        andq	%rbx, %r12
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rbx, %rbx
-        mulxq	%r13, %rcx, %r13
-        adcxq	%rcx, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rcx, %r14
-        adcxq	%rcx, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %rcx, %r15
-        adcxq	%rcx, %r11
-        adoxq	%r15, %r12
-        mulxq	%rbp, %rbp, %rdx
-        adcxq	%rbp, %r12
-        adoxq	%rbx, %rdx
-        adcxq	%rbx, %rdx
-        #  Overflow
-        shldq	$0x01, %r12, %rdx
-        movq	$0x7fffffffffffffff, %rbx
-        imulq	$19, %rdx, %rcx
-        andq	%rbx, %r12
-        addq	%rcx, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        adcq	$0x00, %r12
-        # Reduce if top bit set
-        movq	%r12, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rbx, %r12
-        addq	%rdx, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        adcq	$0x00, %r12
-        # Store
-        movq	%r9, (%rsp)
-        movq	%r10, 8(%rsp)
-        movq	%r11, 16(%rsp)
-        movq	%r12, 24(%rsp)
-        decb	168(%rsp)
-        jge	L_curve25519_avx2_bits
-        movq	$63, 168(%rsp)
-        decb	160(%rsp)
-        jge	L_curve25519_avx2_words
-        # Invert
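-        # z2^-1 = z2^(p-2) (Fermat), so the projective result can be made affine
-        # below as u_out = x2 * z2^-1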
-        leaq	32(%rsp), %rdi
-        movq	%rsp, %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        movq	%rsp, %rsi
-        leaq	64(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-        leaq	64(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        leaq	96(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-        movq	$4, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_avx2@plt
-#else
-        callq	_fe_sq_n_avx2
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-        leaq	64(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-        movq	$9, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_avx2@plt
-#else
-        callq	_fe_sq_n_avx2
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-        leaq	64(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        leaq	128(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	128(%rsp), %rdi
-        leaq	128(%rsp), %rsi
-        movq	$19, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_avx2@plt
-#else
-        callq	_fe_sq_n_avx2
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	128(%rsp), %rsi
-        leaq	96(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-        movq	$9, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_avx2@plt
-#else
-        callq	_fe_sq_n_avx2
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-        leaq	64(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-        movq	$49, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_avx2@plt
-#else
-        callq	_fe_sq_n_avx2
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-        leaq	64(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        leaq	128(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	128(%rsp), %rdi
-        leaq	128(%rsp), %rsi
-        movq	$0x63, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_avx2@plt
-#else
-        callq	_fe_sq_n_avx2
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	128(%rsp), %rsi
-        leaq	96(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	96(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-        movq	$49, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_avx2@plt
-#else
-        callq	_fe_sq_n_avx2
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	96(%rsp), %rsi
-        leaq	64(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        movq	$4, %rdx
-#ifndef __APPLE__
-        callq	fe_sq_n_avx2@plt
-#else
-        callq	_fe_sq_n_avx2
-#endif /* __APPLE__ */
-        movq	%rsp, %rdi
-        leaq	64(%rsp), %rsi
-        leaq	32(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        movq	176(%rsp), %rdi
-        # Multiply
-        # A[0] * B[0]
-        movq	(%rsp), %rdx
-        mulxq	(%rdi), %r9, %r10
-        # A[2] * B[0]
-        mulxq	16(%rdi), %r11, %r12
-        # A[1] * B[0]
-        mulxq	8(%rdi), %rcx, %rbx
-        xorq	%rbp, %rbp
-        adcxq	%rcx, %r10
-        # A[1] * B[3]
-        movq	24(%rsp), %rdx
-        mulxq	8(%rdi), %r13, %r14
-        adcxq	%rbx, %r11
-        # A[0] * B[1]
-        movq	8(%rsp), %rdx
-        mulxq	(%rdi), %rcx, %rbx
-        adoxq	%rcx, %r10
-        # A[2] * B[1]
-        mulxq	16(%rdi), %rcx, %r15
-        adoxq	%rbx, %r11
-        adcxq	%rcx, %r12
-        # A[1] * B[2]
-        movq	16(%rsp), %rdx
-        mulxq	8(%rdi), %rcx, %rbx
-        adcxq	%r15, %r13
-        adoxq	%rcx, %r12
-        adcxq	%rbp, %r14
-        adoxq	%rbx, %r13
-        # A[0] * B[2]
-        mulxq	(%rdi), %rcx, %rbx
-        adoxq	%rbp, %r14
-        xorq	%r15, %r15
-        adcxq	%rcx, %r11
-        # A[1] * B[1]
-        movq	8(%rsp), %rdx
-        mulxq	8(%rdi), %rdx, %rcx
-        adcxq	%rbx, %r12
-        adoxq	%rdx, %r11
-        # A[3] * B[1]
-        movq	8(%rsp), %rdx
-        adoxq	%rcx, %r12
-        mulxq	24(%rdi), %rcx, %rbx
-        adcxq	%rcx, %r13
-        # A[2] * B[2]
-        movq	16(%rsp), %rdx
-        mulxq	16(%rdi), %rdx, %rcx
-        adcxq	%rbx, %r14
-        adoxq	%rdx, %r13
-        # A[3] * B[3]
-        movq	24(%rsp), %rdx
-        adoxq	%rcx, %r14
-        mulxq	24(%rdi), %rcx, %rbx
-        adoxq	%rbp, %r15
-        adcxq	%rcx, %r15
-        # A[0] * B[3]
-        mulxq	(%rdi), %rdx, %rcx
-        adcxq	%rbx, %rbp
-        xorq	%rbx, %rbx
-        adcxq	%rdx, %r12
-        # A[3] * B[0]
-        movq	(%rsp), %rdx
-        adcxq	%rcx, %r13
-        mulxq	24(%rdi), %rdx, %rcx
-        adoxq	%rdx, %r12
-        adoxq	%rcx, %r13
-        # A[2] * B[3]
-        movq	24(%rsp), %rdx
-        mulxq	16(%rdi), %rdx, %rcx
-        adcxq	%rdx, %r14
-        # A[3] * B[2]
-        movq	16(%rsp), %rdx
-        adcxq	%rcx, %r15
-        mulxq	24(%rdi), %rcx, %rdx
-        adcxq	%rbx, %rbp
-        adoxq	%rcx, %r14
-        adoxq	%rdx, %r15
-        adoxq	%rbx, %rbp
-        # Reduce
-        movq	$0x7fffffffffffffff, %rbx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r15, %rbp
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        andq	%rbx, %r12
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rbx, %rbx
-        mulxq	%r13, %rcx, %r13
-        adcxq	%rcx, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rcx, %r14
-        adcxq	%rcx, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %rcx, %r15
-        adcxq	%rcx, %r11
-        adoxq	%r15, %r12
-        mulxq	%rbp, %rbp, %rdx
-        adcxq	%rbp, %r12
-        adoxq	%rbx, %rdx
-        adcxq	%rbx, %rdx
-        #  Overflow
-        shldq	$0x01, %r12, %rdx
-        movq	$0x7fffffffffffffff, %rbx
-        imulq	$19, %rdx, %rcx
-        andq	%rbx, %r12
-        addq	%rcx, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        adcq	$0x00, %r12
-        # Reduce if top bit set
-        movq	%r12, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rbx, %r12
-        addq	%rdx, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        adcq	$0x00, %r12
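-        # Canonicalize to [0, p): r + 19 carries into bit 255 exactly when
-        # r >= p, so add 19 under that mask and clear bit 255 (subtracting p)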
-        movq	$0x7fffffffffffffff, %rbx
-        movq	%r9, %rdx
-        addq	$19, %rdx
-        movq	%r10, %rdx
-        adcq	$0x00, %rdx
-        movq	%r11, %rdx
-        adcq	$0x00, %rdx
-        movq	%r12, %rdx
-        adcq	$0x00, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        addq	%rdx, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        adcq	$0x00, %r12
-        andq	%rbx, %r12
-        # Store
-        movq	%r9, (%rdi)
-        movq	%r10, 8(%rdi)
-        movq	%r11, 16(%rdi)
-        movq	%r12, 24(%rdi)
-        xorq	%rax, %rax
-        addq	$0xc0, %rsp
-        popq	%rbp
-        popq	%r15
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        popq	%rbx
-        repz retq
-#ifndef __APPLE__
-.size	curve25519_avx2,.-curve25519_avx2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_pow22523_avx2
-.type	fe_pow22523_avx2,@function
-.align	16
-fe_pow22523_avx2:
-#else
-.section	__TEXT,__text
-.globl	_fe_pow22523_avx2
-.p2align	4
-_fe_pow22523_avx2:
-#endif /* __APPLE__ */
-        subq	$0x70, %rsp
-        # pow22523
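-        # z^((p-5)/8) = z^(2^252 - 3): the exponentiation used for square roots
-        # mod p (e.g. Ed25519 point decompression)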
-        movq	%rdi, 96(%rsp)
-        movq	%rsi, 104(%rsp)
-        movq	%rsp, %rdi
-        movq	104(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        movq	%rsp, %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        movq	104(%rsp), %rsi
-        leaq	32(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        movq	%rsp, %rdi
-        movq	%rsp, %rsi
-        leaq	32(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        movq	%rsp, %rdi
-        movq	%rsp, %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        movq	%rsp, %rdi
-        leaq	32(%rsp), %rsi
-        movq	%rsp, %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        movq	%rsp, %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-        movb	$4, %dl
-#ifndef __APPLE__
-        callq	fe_sq_n_avx2@plt
-#else
-        callq	_fe_sq_n_avx2
-#endif /* __APPLE__ */
-        movq	%rsp, %rdi
-        leaq	32(%rsp), %rsi
-        movq	%rsp, %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        movq	%rsp, %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-        movb	$9, %dl
-#ifndef __APPLE__
-        callq	fe_sq_n_avx2@plt
-#else
-        callq	_fe_sq_n_avx2
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-        movq	%rsp, %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        movb	$19, %dl
-#ifndef __APPLE__
-        callq	fe_sq_n_avx2@plt
-#else
-        callq	_fe_sq_n_avx2
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        leaq	32(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-        movb	$9, %dl
-#ifndef __APPLE__
-        callq	fe_sq_n_avx2@plt
-#else
-        callq	_fe_sq_n_avx2
-#endif /* __APPLE__ */
-        movq	%rsp, %rdi
-        leaq	32(%rsp), %rsi
-        movq	%rsp, %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        movq	%rsp, %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-        movb	$49, %dl
-#ifndef __APPLE__
-        callq	fe_sq_n_avx2@plt
-#else
-        callq	_fe_sq_n_avx2
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-        movq	%rsp, %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	64(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        movb	$0x63, %dl
-#ifndef __APPLE__
-        callq	fe_sq_n_avx2@plt
-#else
-        callq	_fe_sq_n_avx2
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	64(%rsp), %rsi
-        leaq	32(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        leaq	32(%rsp), %rdi
-        leaq	32(%rsp), %rsi
-        movb	$49, %dl
-#ifndef __APPLE__
-        callq	fe_sq_n_avx2@plt
-#else
-        callq	_fe_sq_n_avx2
-#endif /* __APPLE__ */
-        movq	%rsp, %rdi
-        leaq	32(%rsp), %rsi
-        movq	%rsp, %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        movq	%rsp, %rdi
-        movq	%rsp, %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        movq	%rsp, %rdi
-        movq	%rsp, %rsi
-#ifndef __APPLE__
-        callq	fe_sq_avx2@plt
-#else
-        callq	_fe_sq_avx2
-#endif /* __APPLE__ */
-        movq	96(%rsp), %rdi
-        movq	%rsp, %rsi
-        movq	104(%rsp), %rdx
-#ifndef __APPLE__
-        callq	fe_mul_avx2@plt
-#else
-        callq	_fe_mul_avx2
-#endif /* __APPLE__ */
-        movq	104(%rsp), %rsi
-        movq	96(%rsp), %rdi
-        addq	$0x70, %rsp
-        repz retq
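The call sequence ending here is the tail of a fixed-exponent ladder: long runs of squarings (fe_sq_avx2, fe_sq_n_avx2) stitched together by occasional multiplies (fe_mul_avx2) evaluate an addition chain, the standard way Curve25519 code raises a field element to a fixed power such as the one used for inversion. The counts passed in %dl (9, 19, 49, 0x63 = 99) are the run lengths of that chain. A minimal C sketch of what a fe_sq_n call computes, where fe_sq is a stand-in for any modular squaring routine with (output, input) arguments:

#include <stdint.h>

typedef uint64_t fe64[4];       /* assumed 4x64-bit field element */

/* Assumed to exist elsewhere: r = a^2 mod 2^255 - 19. */
void fe_sq(fe64 r, const fe64 a);

/* n back-to-back squarings, i.e. r = a^(2^n) -- the operation the
 * fe_sq_n_avx2 calls above perform with the count passed in %dl. */
static void fe_sq_n(fe64 r, const fe64 a, unsigned int n)
{
    unsigned int i;

    fe_sq(r, a);                /* r = a^2 */
    for (i = 1; i < n; i++)
        fe_sq(r, r);            /* square in place n - 1 more times */
}

One fe_sq followed by fe_sq_n with 49 therefore squares a value 50 times, raising it to the power 2^50, which is why runs of 10, 20, 50 and 100 squarings appear above as the counts 9/19/49/99 after a single leading fe_sq.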
-#ifndef __APPLE__
-.text
-.globl	fe_ge_to_p2_avx2
-.type	fe_ge_to_p2_avx2,@function
-.align	16
-fe_ge_to_p2_avx2:
-#else
-.section	__TEXT,__text
-.globl	_fe_ge_to_p2_avx2
-.p2align	4
-_fe_ge_to_p2_avx2:
-#endif /* __APPLE__ */
-        pushq	%rbx
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        pushq	%r15
-        subq	$40, %rsp
-        movq	%rsi, (%rsp)
-        movq	%rdx, 8(%rsp)
-        movq	%rcx, 16(%rsp)
-        movq	%r8, 24(%rsp)
-        movq	%r9, 32(%rsp)
-        movq	16(%rsp), %rsi
-        movq	88(%rsp), %rbx
-        # Multiply
-        # A[0] * B[0]
-        movq	(%rbx), %rdx
-        mulxq	(%rsi), %r8, %r9
-        # A[2] * B[0]
-        mulxq	16(%rsi), %r10, %r11
-        # A[1] * B[0]
-        mulxq	8(%rsi), %rcx, %rax
-        xorq	%r15, %r15
-        adcxq	%rcx, %r9
-        # A[1] * B[3]
-        movq	24(%rbx), %rdx
-        mulxq	8(%rsi), %r12, %r13
-        adcxq	%rax, %r10
-        # A[0] * B[1]
-        movq	8(%rbx), %rdx
-        mulxq	(%rsi), %rcx, %rax
-        adoxq	%rcx, %r9
-        # A[2] * B[1]
-        mulxq	16(%rsi), %rcx, %r14
-        adoxq	%rax, %r10
-        adcxq	%rcx, %r11
-        # A[1] * B[2]
-        movq	16(%rbx), %rdx
-        mulxq	8(%rsi), %rcx, %rax
-        adcxq	%r14, %r12
-        adoxq	%rcx, %r11
-        adcxq	%r15, %r13
-        adoxq	%rax, %r12
-        # A[0] * B[2]
-        mulxq	(%rsi), %rcx, %rax
-        adoxq	%r15, %r13
-        xorq	%r14, %r14
-        adcxq	%rcx, %r10
-        # A[1] * B[1]
-        movq	8(%rbx), %rdx
-        mulxq	8(%rsi), %rdx, %rcx
-        adcxq	%rax, %r11
-        adoxq	%rdx, %r10
-        # A[3] * B[1]
-        movq	8(%rbx), %rdx
-        adoxq	%rcx, %r11
-        mulxq	24(%rsi), %rcx, %rax
-        adcxq	%rcx, %r12
-        # A[2] * B[2]
-        movq	16(%rbx), %rdx
-        mulxq	16(%rsi), %rdx, %rcx
-        adcxq	%rax, %r13
-        adoxq	%rdx, %r12
-        # A[3] * B[3]
-        movq	24(%rbx), %rdx
-        adoxq	%rcx, %r13
-        mulxq	24(%rsi), %rcx, %rax
-        adoxq	%r15, %r14
-        adcxq	%rcx, %r14
-        # A[0] * B[3]
-        mulxq	(%rsi), %rdx, %rcx
-        adcxq	%rax, %r15
-        xorq	%rax, %rax
-        adcxq	%rdx, %r11
-        # A[3] * B[0]
-        movq	(%rbx), %rdx
-        adcxq	%rcx, %r12
-        mulxq	24(%rsi), %rdx, %rcx
-        adoxq	%rdx, %r11
-        adoxq	%rcx, %r12
-        # A[2] * B[3]
-        movq	24(%rbx), %rdx
-        mulxq	16(%rsi), %rdx, %rcx
-        adcxq	%rdx, %r13
-        # A[3] * B[2]
-        movq	16(%rbx), %rdx
-        adcxq	%rcx, %r14
-        mulxq	24(%rsi), %rcx, %rdx
-        adcxq	%rax, %r15
-        adoxq	%rcx, %r13
-        adoxq	%rdx, %r14
-        adoxq	%rax, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rax
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rax, %r11
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rax, %rax
-        mulxq	%r12, %rcx, %r12
-        adcxq	%rcx, %r8
-        adoxq	%r12, %r9
-        mulxq	%r13, %rcx, %r13
-        adcxq	%rcx, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rcx, %r14
-        adcxq	%rcx, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %r15, %rdx
-        adcxq	%r15, %r11
-        adoxq	%rax, %rdx
-        adcxq	%rax, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        movq	$0x7fffffffffffffff, %rax
-        imulq	$19, %rdx, %rcx
-        andq	%rax, %r11
-        addq	%rcx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rax, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	(%rsp), %rdi
-        movq	24(%rsp), %rsi
-        movq	32(%rsp), %rbx
-        # Multiply
-        # A[0] * B[0]
-        movq	(%rbx), %rdx
-        mulxq	(%rsi), %r8, %r9
-        # A[2] * B[0]
-        mulxq	16(%rsi), %r10, %r11
-        # A[1] * B[0]
-        mulxq	8(%rsi), %rcx, %rax
-        xorq	%r15, %r15
-        adcxq	%rcx, %r9
-        # A[1] * B[3]
-        movq	24(%rbx), %rdx
-        mulxq	8(%rsi), %r12, %r13
-        adcxq	%rax, %r10
-        # A[0] * B[1]
-        movq	8(%rbx), %rdx
-        mulxq	(%rsi), %rcx, %rax
-        adoxq	%rcx, %r9
-        # A[2] * B[1]
-        mulxq	16(%rsi), %rcx, %r14
-        adoxq	%rax, %r10
-        adcxq	%rcx, %r11
-        # A[1] * B[2]
-        movq	16(%rbx), %rdx
-        mulxq	8(%rsi), %rcx, %rax
-        adcxq	%r14, %r12
-        adoxq	%rcx, %r11
-        adcxq	%r15, %r13
-        adoxq	%rax, %r12
-        # A[0] * B[2]
-        mulxq	(%rsi), %rcx, %rax
-        adoxq	%r15, %r13
-        xorq	%r14, %r14
-        adcxq	%rcx, %r10
-        # A[1] * B[1]
-        movq	8(%rbx), %rdx
-        mulxq	8(%rsi), %rdx, %rcx
-        adcxq	%rax, %r11
-        adoxq	%rdx, %r10
-        # A[3] * B[1]
-        movq	8(%rbx), %rdx
-        adoxq	%rcx, %r11
-        mulxq	24(%rsi), %rcx, %rax
-        adcxq	%rcx, %r12
-        # A[2] * B[2]
-        movq	16(%rbx), %rdx
-        mulxq	16(%rsi), %rdx, %rcx
-        adcxq	%rax, %r13
-        adoxq	%rdx, %r12
-        # A[3] * B[3]
-        movq	24(%rbx), %rdx
-        adoxq	%rcx, %r13
-        mulxq	24(%rsi), %rcx, %rax
-        adoxq	%r15, %r14
-        adcxq	%rcx, %r14
-        # A[0] * B[3]
-        mulxq	(%rsi), %rdx, %rcx
-        adcxq	%rax, %r15
-        xorq	%rax, %rax
-        adcxq	%rdx, %r11
-        # A[3] * B[0]
-        movq	(%rbx), %rdx
-        adcxq	%rcx, %r12
-        mulxq	24(%rsi), %rdx, %rcx
-        adoxq	%rdx, %r11
-        adoxq	%rcx, %r12
-        # A[2] * B[3]
-        movq	24(%rbx), %rdx
-        mulxq	16(%rsi), %rdx, %rcx
-        adcxq	%rdx, %r13
-        # A[3] * B[2]
-        movq	16(%rbx), %rdx
-        adcxq	%rcx, %r14
-        mulxq	24(%rsi), %rcx, %rdx
-        adcxq	%rax, %r15
-        adoxq	%rcx, %r13
-        adoxq	%rdx, %r14
-        adoxq	%rax, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rax
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rax, %r11
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rax, %rax
-        mulxq	%r12, %rcx, %r12
-        adcxq	%rcx, %r8
-        adoxq	%r12, %r9
-        mulxq	%r13, %rcx, %r13
-        adcxq	%rcx, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rcx, %r14
-        adcxq	%rcx, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %r15, %rdx
-        adcxq	%r15, %r11
-        adoxq	%rax, %rdx
-        adcxq	%rax, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        movq	$0x7fffffffffffffff, %rax
-        imulq	$19, %rdx, %rcx
-        andq	%rax, %r11
-        addq	%rcx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rax, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	8(%rsp), %rdi
-        movq	88(%rsp), %rsi
-        # Multiply
-        # A[0] * B[0]
-        movq	(%rsi), %rdx
-        mulxq	(%rbx), %r8, %r9
-        # A[2] * B[0]
-        mulxq	16(%rbx), %r10, %r11
-        # A[1] * B[0]
-        mulxq	8(%rbx), %rcx, %rax
-        xorq	%r15, %r15
-        adcxq	%rcx, %r9
-        # A[1] * B[3]
-        movq	24(%rsi), %rdx
-        mulxq	8(%rbx), %r12, %r13
-        adcxq	%rax, %r10
-        # A[0] * B[1]
-        movq	8(%rsi), %rdx
-        mulxq	(%rbx), %rcx, %rax
-        adoxq	%rcx, %r9
-        # A[2] * B[1]
-        mulxq	16(%rbx), %rcx, %r14
-        adoxq	%rax, %r10
-        adcxq	%rcx, %r11
-        # A[1] * B[2]
-        movq	16(%rsi), %rdx
-        mulxq	8(%rbx), %rcx, %rax
-        adcxq	%r14, %r12
-        adoxq	%rcx, %r11
-        adcxq	%r15, %r13
-        adoxq	%rax, %r12
-        # A[0] * B[2]
-        mulxq	(%rbx), %rcx, %rax
-        adoxq	%r15, %r13
-        xorq	%r14, %r14
-        adcxq	%rcx, %r10
-        # A[1] * B[1]
-        movq	8(%rsi), %rdx
-        mulxq	8(%rbx), %rdx, %rcx
-        adcxq	%rax, %r11
-        adoxq	%rdx, %r10
-        # A[3] * B[1]
-        movq	8(%rsi), %rdx
-        adoxq	%rcx, %r11
-        mulxq	24(%rbx), %rcx, %rax
-        adcxq	%rcx, %r12
-        # A[2] * B[2]
-        movq	16(%rsi), %rdx
-        mulxq	16(%rbx), %rdx, %rcx
-        adcxq	%rax, %r13
-        adoxq	%rdx, %r12
-        # A[3] * B[3]
-        movq	24(%rsi), %rdx
-        adoxq	%rcx, %r13
-        mulxq	24(%rbx), %rcx, %rax
-        adoxq	%r15, %r14
-        adcxq	%rcx, %r14
-        # A[0] * B[3]
-        mulxq	(%rbx), %rdx, %rcx
-        adcxq	%rax, %r15
-        xorq	%rax, %rax
-        adcxq	%rdx, %r11
-        # A[3] * B[0]
-        movq	(%rsi), %rdx
-        adcxq	%rcx, %r12
-        mulxq	24(%rbx), %rdx, %rcx
-        adoxq	%rdx, %r11
-        adoxq	%rcx, %r12
-        # A[2] * B[3]
-        movq	24(%rsi), %rdx
-        mulxq	16(%rbx), %rdx, %rcx
-        adcxq	%rdx, %r13
-        # A[3] * B[2]
-        movq	16(%rsi), %rdx
-        adcxq	%rcx, %r14
-        mulxq	24(%rbx), %rcx, %rdx
-        adcxq	%rax, %r15
-        adoxq	%rcx, %r13
-        adoxq	%rdx, %r14
-        adoxq	%rax, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rax
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rax, %r11
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rax, %rax
-        mulxq	%r12, %rcx, %r12
-        adcxq	%rcx, %r8
-        adoxq	%r12, %r9
-        mulxq	%r13, %rcx, %r13
-        adcxq	%rcx, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rcx, %r14
-        adcxq	%rcx, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %r15, %rdx
-        adcxq	%r15, %r11
-        adoxq	%rax, %rdx
-        adcxq	%rax, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        movq	$0x7fffffffffffffff, %rax
-        imulq	$19, %rdx, %rcx
-        andq	%rax, %r11
-        addq	%rcx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rax, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        addq	$40, %rsp
-        popq	%r15
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        popq	%rbx
-        repz retq
-#ifndef __APPLE__
-.size	fe_ge_to_p2_avx2,.-fe_ge_to_p2_avx2
-#endif /* __APPLE__ */
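Each "# Multiply" block in fe_ge_to_p2_avx2 above is a schoolbook 4x4-limb multiply. MULX produces a full 128-bit product without touching the flags, and ADCX/ADOX maintain two independent carry chains (CF and OF), which is why the partial products appear in an odd order: it keeps both chains busy in parallel. A portable sketch of the same 512-bit product, with 128-bit arithmetic standing in for the dual carry chains (an illustration, not the library's code):

#include <stdint.h>

/* t[0..7] = a[0..3] * b[0..3]: the 512-bit schoolbook product the
 * MULX/ADCX/ADOX blocks above compute before reduction. */
static void mul_4x4(uint64_t t[8], const uint64_t a[4], const uint64_t b[4])
{
    int i, j;

    for (i = 0; i < 8; i++)
        t[i] = 0;
    for (i = 0; i < 4; i++) {
        uint64_t carry = 0;
        for (j = 0; j < 4; j++) {
            /* a[i]*b[j] + t[i+j] + carry never overflows 128 bits. */
            unsigned __int128 p = (unsigned __int128)a[i] * b[j]
                                + t[i + j] + carry;
            t[i + j] = (uint64_t)p;
            carry    = (uint64_t)(p >> 64);
        }
        t[i + 4] = carry;
    }
}

The result is identical to this plain row-by-row accumulation; the assembly's interleaving only changes the order in which the same partial products are summed.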
-#ifndef __APPLE__
-.text
-.globl	fe_ge_to_p3_avx2
-.type	fe_ge_to_p3_avx2,@function
-.align	16
-fe_ge_to_p3_avx2:
-#else
-.section	__TEXT,__text
-.globl	_fe_ge_to_p3_avx2
-.p2align	4
-_fe_ge_to_p3_avx2:
-#endif /* __APPLE__ */
-        pushq	%rbx
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        pushq	%r15
-        subq	$40, %rsp
-        movq	%rsi, (%rsp)
-        movq	%rdx, 8(%rsp)
-        movq	%rcx, 16(%rsp)
-        movq	%r8, 24(%rsp)
-        movq	%r9, 32(%rsp)
-        movq	24(%rsp), %rsi
-        movq	96(%rsp), %rbx
-        # Multiply
-        # A[0] * B[0]
-        movq	(%rbx), %rdx
-        mulxq	(%rsi), %r8, %r9
-        # A[2] * B[0]
-        mulxq	16(%rsi), %r10, %r11
-        # A[1] * B[0]
-        mulxq	8(%rsi), %rcx, %rax
-        xorq	%r15, %r15
-        adcxq	%rcx, %r9
-        # A[1] * B[3]
-        movq	24(%rbx), %rdx
-        mulxq	8(%rsi), %r12, %r13
-        adcxq	%rax, %r10
-        # A[0] * B[1]
-        movq	8(%rbx), %rdx
-        mulxq	(%rsi), %rcx, %rax
-        adoxq	%rcx, %r9
-        # A[2] * B[1]
-        mulxq	16(%rsi), %rcx, %r14
-        adoxq	%rax, %r10
-        adcxq	%rcx, %r11
-        # A[1] * B[2]
-        movq	16(%rbx), %rdx
-        mulxq	8(%rsi), %rcx, %rax
-        adcxq	%r14, %r12
-        adoxq	%rcx, %r11
-        adcxq	%r15, %r13
-        adoxq	%rax, %r12
-        # A[0] * B[2]
-        mulxq	(%rsi), %rcx, %rax
-        adoxq	%r15, %r13
-        xorq	%r14, %r14
-        adcxq	%rcx, %r10
-        # A[1] * B[1]
-        movq	8(%rbx), %rdx
-        mulxq	8(%rsi), %rdx, %rcx
-        adcxq	%rax, %r11
-        adoxq	%rdx, %r10
-        # A[3] * B[1]
-        movq	8(%rbx), %rdx
-        adoxq	%rcx, %r11
-        mulxq	24(%rsi), %rcx, %rax
-        adcxq	%rcx, %r12
-        # A[2] * B[2]
-        movq	16(%rbx), %rdx
-        mulxq	16(%rsi), %rdx, %rcx
-        adcxq	%rax, %r13
-        adoxq	%rdx, %r12
-        # A[3] * B[3]
-        movq	24(%rbx), %rdx
-        adoxq	%rcx, %r13
-        mulxq	24(%rsi), %rcx, %rax
-        adoxq	%r15, %r14
-        adcxq	%rcx, %r14
-        # A[0] * B[3]
-        mulxq	(%rsi), %rdx, %rcx
-        adcxq	%rax, %r15
-        xorq	%rax, %rax
-        adcxq	%rdx, %r11
-        # A[3] * B[0]
-        movq	(%rbx), %rdx
-        adcxq	%rcx, %r12
-        mulxq	24(%rsi), %rdx, %rcx
-        adoxq	%rdx, %r11
-        adoxq	%rcx, %r12
-        # A[2] * B[3]
-        movq	24(%rbx), %rdx
-        mulxq	16(%rsi), %rdx, %rcx
-        adcxq	%rdx, %r13
-        # A[3] * B[2]
-        movq	16(%rbx), %rdx
-        adcxq	%rcx, %r14
-        mulxq	24(%rsi), %rcx, %rdx
-        adcxq	%rax, %r15
-        adoxq	%rcx, %r13
-        adoxq	%rdx, %r14
-        adoxq	%rax, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rax
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rax, %r11
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rax, %rax
-        mulxq	%r12, %rcx, %r12
-        adcxq	%rcx, %r8
-        adoxq	%r12, %r9
-        mulxq	%r13, %rcx, %r13
-        adcxq	%rcx, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rcx, %r14
-        adcxq	%rcx, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %r15, %rdx
-        adcxq	%r15, %r11
-        adoxq	%rax, %rdx
-        adcxq	%rax, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        movq	$0x7fffffffffffffff, %rax
-        imulq	$19, %rdx, %rcx
-        andq	%rax, %r11
-        addq	%rcx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rax, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	(%rsp), %rdi
-        movq	32(%rsp), %rsi
-        movq	88(%rsp), %rbx
-        # Multiply
-        # A[0] * B[0]
-        movq	(%rbx), %rdx
-        mulxq	(%rsi), %r8, %r9
-        # A[2] * B[0]
-        mulxq	16(%rsi), %r10, %r11
-        # A[1] * B[0]
-        mulxq	8(%rsi), %rcx, %rax
-        xorq	%r15, %r15
-        adcxq	%rcx, %r9
-        # A[1] * B[3]
-        movq	24(%rbx), %rdx
-        mulxq	8(%rsi), %r12, %r13
-        adcxq	%rax, %r10
-        # A[0] * B[1]
-        movq	8(%rbx), %rdx
-        mulxq	(%rsi), %rcx, %rax
-        adoxq	%rcx, %r9
-        # A[2] * B[1]
-        mulxq	16(%rsi), %rcx, %r14
-        adoxq	%rax, %r10
-        adcxq	%rcx, %r11
-        # A[1] * B[2]
-        movq	16(%rbx), %rdx
-        mulxq	8(%rsi), %rcx, %rax
-        adcxq	%r14, %r12
-        adoxq	%rcx, %r11
-        adcxq	%r15, %r13
-        adoxq	%rax, %r12
-        # A[0] * B[2]
-        mulxq	(%rsi), %rcx, %rax
-        adoxq	%r15, %r13
-        xorq	%r14, %r14
-        adcxq	%rcx, %r10
-        # A[1] * B[1]
-        movq	8(%rbx), %rdx
-        mulxq	8(%rsi), %rdx, %rcx
-        adcxq	%rax, %r11
-        adoxq	%rdx, %r10
-        # A[3] * B[1]
-        movq	8(%rbx), %rdx
-        adoxq	%rcx, %r11
-        mulxq	24(%rsi), %rcx, %rax
-        adcxq	%rcx, %r12
-        # A[2] * B[2]
-        movq	16(%rbx), %rdx
-        mulxq	16(%rsi), %rdx, %rcx
-        adcxq	%rax, %r13
-        adoxq	%rdx, %r12
-        # A[3] * B[3]
-        movq	24(%rbx), %rdx
-        adoxq	%rcx, %r13
-        mulxq	24(%rsi), %rcx, %rax
-        adoxq	%r15, %r14
-        adcxq	%rcx, %r14
-        # A[0] * B[3]
-        mulxq	(%rsi), %rdx, %rcx
-        adcxq	%rax, %r15
-        xorq	%rax, %rax
-        adcxq	%rdx, %r11
-        # A[3] * B[0]
-        movq	(%rbx), %rdx
-        adcxq	%rcx, %r12
-        mulxq	24(%rsi), %rdx, %rcx
-        adoxq	%rdx, %r11
-        adoxq	%rcx, %r12
-        # A[2] * B[3]
-        movq	24(%rbx), %rdx
-        mulxq	16(%rsi), %rdx, %rcx
-        adcxq	%rdx, %r13
-        # A[3] * B[2]
-        movq	16(%rbx), %rdx
-        adcxq	%rcx, %r14
-        mulxq	24(%rsi), %rcx, %rdx
-        adcxq	%rax, %r15
-        adoxq	%rcx, %r13
-        adoxq	%rdx, %r14
-        adoxq	%rax, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rax
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rax, %r11
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rax, %rax
-        mulxq	%r12, %rcx, %r12
-        adcxq	%rcx, %r8
-        adoxq	%r12, %r9
-        mulxq	%r13, %rcx, %r13
-        adcxq	%rcx, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rcx, %r14
-        adcxq	%rcx, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %r15, %rdx
-        adcxq	%r15, %r11
-        adoxq	%rax, %rdx
-        adcxq	%rax, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        movq	$0x7fffffffffffffff, %rax
-        imulq	$19, %rdx, %rcx
-        andq	%rax, %r11
-        addq	%rcx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rax, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	8(%rsp), %rdi
-        movq	96(%rsp), %rsi
-        # Multiply
-        # A[0] * B[0]
-        movq	(%rsi), %rdx
-        mulxq	(%rbx), %r8, %r9
-        # A[2] * B[0]
-        mulxq	16(%rbx), %r10, %r11
-        # A[1] * B[0]
-        mulxq	8(%rbx), %rcx, %rax
-        xorq	%r15, %r15
-        adcxq	%rcx, %r9
-        # A[1] * B[3]
-        movq	24(%rsi), %rdx
-        mulxq	8(%rbx), %r12, %r13
-        adcxq	%rax, %r10
-        # A[0] * B[1]
-        movq	8(%rsi), %rdx
-        mulxq	(%rbx), %rcx, %rax
-        adoxq	%rcx, %r9
-        # A[2] * B[1]
-        mulxq	16(%rbx), %rcx, %r14
-        adoxq	%rax, %r10
-        adcxq	%rcx, %r11
-        # A[1] * B[2]
-        movq	16(%rsi), %rdx
-        mulxq	8(%rbx), %rcx, %rax
-        adcxq	%r14, %r12
-        adoxq	%rcx, %r11
-        adcxq	%r15, %r13
-        adoxq	%rax, %r12
-        # A[0] * B[2]
-        mulxq	(%rbx), %rcx, %rax
-        adoxq	%r15, %r13
-        xorq	%r14, %r14
-        adcxq	%rcx, %r10
-        # A[1] * B[1]
-        movq	8(%rsi), %rdx
-        mulxq	8(%rbx), %rdx, %rcx
-        adcxq	%rax, %r11
-        adoxq	%rdx, %r10
-        # A[3] * B[1]
-        movq	8(%rsi), %rdx
-        adoxq	%rcx, %r11
-        mulxq	24(%rbx), %rcx, %rax
-        adcxq	%rcx, %r12
-        # A[2] * B[2]
-        movq	16(%rsi), %rdx
-        mulxq	16(%rbx), %rdx, %rcx
-        adcxq	%rax, %r13
-        adoxq	%rdx, %r12
-        # A[3] * B[3]
-        movq	24(%rsi), %rdx
-        adoxq	%rcx, %r13
-        mulxq	24(%rbx), %rcx, %rax
-        adoxq	%r15, %r14
-        adcxq	%rcx, %r14
-        # A[0] * B[3]
-        mulxq	(%rbx), %rdx, %rcx
-        adcxq	%rax, %r15
-        xorq	%rax, %rax
-        adcxq	%rdx, %r11
-        # A[3] * B[0]
-        movq	(%rsi), %rdx
-        adcxq	%rcx, %r12
-        mulxq	24(%rbx), %rdx, %rcx
-        adoxq	%rdx, %r11
-        adoxq	%rcx, %r12
-        # A[2] * B[3]
-        movq	24(%rsi), %rdx
-        mulxq	16(%rbx), %rdx, %rcx
-        adcxq	%rdx, %r13
-        # A[3] * B[2]
-        movq	16(%rsi), %rdx
-        adcxq	%rcx, %r14
-        mulxq	24(%rbx), %rcx, %rdx
-        adcxq	%rax, %r15
-        adoxq	%rcx, %r13
-        adoxq	%rdx, %r14
-        adoxq	%rax, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rax
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rax, %r11
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rax, %rax
-        mulxq	%r12, %rcx, %r12
-        adcxq	%rcx, %r8
-        adoxq	%r12, %r9
-        mulxq	%r13, %rcx, %r13
-        adcxq	%rcx, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rcx, %r14
-        adcxq	%rcx, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %r15, %rdx
-        adcxq	%r15, %r11
-        adoxq	%rax, %rdx
-        adcxq	%rax, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        movq	$0x7fffffffffffffff, %rax
-        imulq	$19, %rdx, %rcx
-        andq	%rax, %r11
-        addq	%rcx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rax, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	16(%rsp), %rdi
-        movq	24(%rsp), %rsi
-        movq	32(%rsp), %rbx
-        # Multiply
-        # A[0] * B[0]
-        movq	(%rbx), %rdx
-        mulxq	(%rsi), %r8, %r9
-        # A[2] * B[0]
-        mulxq	16(%rsi), %r10, %r11
-        # A[1] * B[0]
-        mulxq	8(%rsi), %rcx, %rax
-        xorq	%r15, %r15
-        adcxq	%rcx, %r9
-        # A[1] * B[3]
-        movq	24(%rbx), %rdx
-        mulxq	8(%rsi), %r12, %r13
-        adcxq	%rax, %r10
-        # A[0] * B[1]
-        movq	8(%rbx), %rdx
-        mulxq	(%rsi), %rcx, %rax
-        adoxq	%rcx, %r9
-        # A[2] * B[1]
-        mulxq	16(%rsi), %rcx, %r14
-        adoxq	%rax, %r10
-        adcxq	%rcx, %r11
-        # A[1] * B[2]
-        movq	16(%rbx), %rdx
-        mulxq	8(%rsi), %rcx, %rax
-        adcxq	%r14, %r12
-        adoxq	%rcx, %r11
-        adcxq	%r15, %r13
-        adoxq	%rax, %r12
-        # A[0] * B[2]
-        mulxq	(%rsi), %rcx, %rax
-        adoxq	%r15, %r13
-        xorq	%r14, %r14
-        adcxq	%rcx, %r10
-        # A[1] * B[1]
-        movq	8(%rbx), %rdx
-        mulxq	8(%rsi), %rdx, %rcx
-        adcxq	%rax, %r11
-        adoxq	%rdx, %r10
-        # A[3] * B[1]
-        movq	8(%rbx), %rdx
-        adoxq	%rcx, %r11
-        mulxq	24(%rsi), %rcx, %rax
-        adcxq	%rcx, %r12
-        # A[2] * B[2]
-        movq	16(%rbx), %rdx
-        mulxq	16(%rsi), %rdx, %rcx
-        adcxq	%rax, %r13
-        adoxq	%rdx, %r12
-        # A[3] * B[3]
-        movq	24(%rbx), %rdx
-        adoxq	%rcx, %r13
-        mulxq	24(%rsi), %rcx, %rax
-        adoxq	%r15, %r14
-        adcxq	%rcx, %r14
-        # A[0] * B[3]
-        mulxq	(%rsi), %rdx, %rcx
-        adcxq	%rax, %r15
-        xorq	%rax, %rax
-        adcxq	%rdx, %r11
-        # A[3] * B[0]
-        movq	(%rbx), %rdx
-        adcxq	%rcx, %r12
-        mulxq	24(%rsi), %rdx, %rcx
-        adoxq	%rdx, %r11
-        adoxq	%rcx, %r12
-        # A[2] * B[3]
-        movq	24(%rbx), %rdx
-        mulxq	16(%rsi), %rdx, %rcx
-        adcxq	%rdx, %r13
-        # A[3] * B[2]
-        movq	16(%rbx), %rdx
-        adcxq	%rcx, %r14
-        mulxq	24(%rsi), %rcx, %rdx
-        adcxq	%rax, %r15
-        adoxq	%rcx, %r13
-        adoxq	%rdx, %r14
-        adoxq	%rax, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rax
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rax, %r11
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rax, %rax
-        mulxq	%r12, %rcx, %r12
-        adcxq	%rcx, %r8
-        adoxq	%r12, %r9
-        mulxq	%r13, %rcx, %r13
-        adcxq	%rcx, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rcx, %r14
-        adcxq	%rcx, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %r15, %rdx
-        adcxq	%r15, %r11
-        adoxq	%rax, %rdx
-        adcxq	%rax, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        movq	$0x7fffffffffffffff, %rax
-        imulq	$19, %rdx, %rcx
-        andq	%rax, %r11
-        addq	%rcx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rax, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        addq	$40, %rsp
-        popq	%r15
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        popq	%rbx
-        repz retq
-#ifndef __APPLE__
-.size	fe_ge_to_p3_avx2,.-fe_ge_to_p3_avx2
-#endif /* __APPLE__ */
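The "# Reduce" block that follows every product folds the 512-bit result back below 2^255 using 2^255 ≡ 19 (mod p) for p = 2^255 - 19: the top half is shifted so it begins at bit 255, multiplied by 19, and added into the low half, with two small follow-up folds for the leftover carry. A C sketch of the same folding under the 4x64-limb layout assumed above (illustrative only, not audited for constant-time use):

#include <stdint.h>

/* Fold an 8-limb product t of two values < 2^255 down to 4 limbs
 * modulo p = 2^255 - 19. Like the assembly, the result is < 2^255
 * but not necessarily fully reduced below p. */
static void reduce_512(uint64_t r[4], const uint64_t t[8])
{
    const uint64_t M63 = 0x7fffffffffffffffULL;
    unsigned __int128 acc;
    uint64_t hi[4], top;
    int i;

    /* Bits 255..510 of t; t < 2^510, so the shift cannot lose bits. */
    hi[0] = (t[4] << 1) | (t[3] >> 63);
    hi[1] = (t[5] << 1) | (t[4] >> 63);
    hi[2] = (t[6] << 1) | (t[5] >> 63);
    hi[3] = (t[7] << 1) | (t[6] >> 63);

    r[0] = t[0]; r[1] = t[1]; r[2] = t[2]; r[3] = t[3] & M63;

    /* "Multiply top half by 19" and accumulate into the low half. */
    acc = 0;
    for (i = 0; i < 4; i++) {
        acc += (unsigned __int128)hi[i] * 19 + r[i];
        r[i] = (uint64_t)acc;
        acc >>= 64;
    }

    /* "Overflow": fold the few remaining bits above 255 once more. */
    top = ((uint64_t)acc << 1) | (r[3] >> 63);
    r[3] &= M63;
    acc = (unsigned __int128)r[0] + (unsigned __int128)top * 19;
    r[0] = (uint64_t)acc;
    for (i = 1; i < 4; i++) {
        acc = (acc >> 64) + r[i];
        r[i] = (uint64_t)acc;
    }

    /* "Reduce if top bit set": add 19 once more if bit 255 came back. */
    top = (r[3] >> 63) * 19;
    r[3] &= M63;
    acc = (unsigned __int128)r[0] + top;
    r[0] = (uint64_t)acc;
    for (i = 1; i < 4; i++) {
        acc = (acc >> 64) + r[i];
        r[i] = (uint64_t)acc;
    }
}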
-#ifndef __APPLE__
-.text
-.globl	fe_ge_dbl_avx2
-.type	fe_ge_dbl_avx2,@function
-.align	16
-fe_ge_dbl_avx2:
-#else
-.section	__TEXT,__text
-.globl	_fe_ge_dbl_avx2
-.p2align	4
-_fe_ge_dbl_avx2:
-#endif /* __APPLE__ */
-        pushq	%rbp
-        pushq	%rbx
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        pushq	%r15
-        subq	$48, %rsp
-        movq	%rdi, (%rsp)
-        movq	%rsi, 8(%rsp)
-        movq	%rdx, 16(%rsp)
-        movq	%rcx, 24(%rsp)
-        movq	%r8, 32(%rsp)
-        movq	%r9, 40(%rsp)
-        movq	32(%rsp), %rsi
-        # Square
-        # A[0] * A[1]
-        movq	(%rsi), %rdx
-        mulxq	8(%rsi), %r9, %r10
-        # A[0] * A[3]
-        mulxq	24(%rsi), %r11, %r12
-        # A[2] * A[1]
-        movq	16(%rsi), %rdx
-        mulxq	8(%rsi), %rcx, %rax
-        xorq	%r15, %r15
-        adoxq	%rcx, %r11
-        # A[2] * A[3]
-        mulxq	24(%rsi), %r13, %r14
-        adoxq	%rax, %r12
-        # A[2] * A[0]
-        mulxq	(%rsi), %rcx, %rax
-        adoxq	%r15, %r13
-        adcxq	%rcx, %r10
-        adoxq	%r15, %r14
-        # A[1] * A[3]
-        movq	8(%rsi), %rdx
-        mulxq	24(%rsi), %rbp, %r8
-        adcxq	%rax, %r11
-        adcxq	%rbp, %r12
-        adcxq	%r8, %r13
-        adcxq	%r15, %r14
-        # Double with Carry Flag
-        xorq	%r15, %r15
-        # A[0] * A[0]
-        movq	(%rsi), %rdx
-        mulxq	%rdx, %r8, %rbp
-        adcxq	%r9, %r9
-        # A[1] * A[1]
-        movq	8(%rsi), %rdx
-        mulxq	%rdx, %rcx, %rax
-        adcxq	%r10, %r10
-        adoxq	%rbp, %r9
-        adcxq	%r11, %r11
-        adoxq	%rcx, %r10
-        # A[2] * A[2]
-        movq	16(%rsi), %rdx
-        mulxq	%rdx, %rbp, %rcx
-        adcxq	%r12, %r12
-        adoxq	%rax, %r11
-        adcxq	%r13, %r13
-        adoxq	%rbp, %r12
-        # A[3] * A[3]
-        movq	24(%rsi), %rdx
-        mulxq	%rdx, %rbp, %rax
-        adcxq	%r14, %r14
-        adoxq	%rcx, %r13
-        adcxq	%r15, %r15
-        adoxq	%rbp, %r14
-        adoxq	%rax, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rcx, %rcx
-        mulxq	%r12, %rbp, %r12
-        adcxq	%rbp, %r8
-        adoxq	%r12, %r9
-        mulxq	%r13, %rbp, %r13
-        adcxq	%rbp, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rbp, %r14
-        adcxq	%rbp, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %r15, %rdx
-        adcxq	%r15, %r11
-        adoxq	%rcx, %rdx
-        adcxq	%rcx, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        movq	$0x7fffffffffffffff, %rcx
-        imulq	$19, %rdx, %rbp
-        andq	%rcx, %r11
-        addq	%rbp, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	16(%rsp), %rdi
-        movq	40(%rsp), %rbx
-        # Square
-        # A[0] * A[1]
-        movq	(%rbx), %rdx
-        mulxq	8(%rbx), %r9, %r10
-        # A[0] * A[3]
-        mulxq	24(%rbx), %r11, %r12
-        # A[2] * A[1]
-        movq	16(%rbx), %rdx
-        mulxq	8(%rbx), %rcx, %rax
-        xorq	%r15, %r15
-        adoxq	%rcx, %r11
-        # A[2] * A[3]
-        mulxq	24(%rbx), %r13, %r14
-        adoxq	%rax, %r12
-        # A[2] * A[0]
-        mulxq	(%rbx), %rcx, %rax
-        adoxq	%r15, %r13
-        adcxq	%rcx, %r10
-        adoxq	%r15, %r14
-        # A[1] * A[3]
-        movq	8(%rbx), %rdx
-        mulxq	24(%rbx), %rbp, %r8
-        adcxq	%rax, %r11
-        adcxq	%rbp, %r12
-        adcxq	%r8, %r13
-        adcxq	%r15, %r14
-        # Double with Carry Flag
-        xorq	%r15, %r15
-        # A[0] * A[0]
-        movq	(%rbx), %rdx
-        mulxq	%rdx, %r8, %rbp
-        adcxq	%r9, %r9
-        # A[1] * A[1]
-        movq	8(%rbx), %rdx
-        mulxq	%rdx, %rcx, %rax
-        adcxq	%r10, %r10
-        adoxq	%rbp, %r9
-        adcxq	%r11, %r11
-        adoxq	%rcx, %r10
-        # A[2] * A[2]
-        movq	16(%rbx), %rdx
-        mulxq	%rdx, %rbp, %rcx
-        adcxq	%r12, %r12
-        adoxq	%rax, %r11
-        adcxq	%r13, %r13
-        adoxq	%rbp, %r12
-        # A[3] * A[3]
-        movq	24(%rbx), %rdx
-        mulxq	%rdx, %rbp, %rax
-        adcxq	%r14, %r14
-        adoxq	%rcx, %r13
-        adcxq	%r15, %r15
-        adoxq	%rbp, %r14
-        adoxq	%rax, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rcx, %rcx
-        mulxq	%r12, %rbp, %r12
-        adcxq	%rbp, %r8
-        adoxq	%r12, %r9
-        mulxq	%r13, %rbp, %r13
-        adcxq	%rbp, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rbp, %r14
-        adcxq	%rbp, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %r15, %rdx
-        adcxq	%r15, %r11
-        adoxq	%rcx, %rdx
-        adcxq	%rcx, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        movq	$0x7fffffffffffffff, %rcx
-        imulq	$19, %rdx, %rbp
-        andq	%rcx, %r11
-        addq	%rbp, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	8(%rsp), %rdi
-        # Add
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        addq	(%rbx), %r8
-        movq	16(%rsi), %r10
-        adcq	8(%rbx), %r9
-        movq	24(%rsi), %rdx
-        adcq	16(%rbx), %r10
-        movq	$-19, %rcx
-        adcq	24(%rbx), %rdx
-        movq	$0x7fffffffffffffff, %rax
-        movq	%rdx, %r11
-        sarq	$63, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Sub modulus (if overflow)
-        subq	%rcx, %r8
-        sbbq	%rdx, %r9
-        sbbq	%rdx, %r10
-        sbbq	%rax, %r11
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	24(%rsp), %rsi
-        # Square
-        # A[0] * A[1]
-        movq	(%rdi), %rdx
-        mulxq	8(%rdi), %r9, %r10
-        # A[0] * A[3]
-        mulxq	24(%rdi), %r11, %r12
-        # A[2] * A[1]
-        movq	16(%rdi), %rdx
-        mulxq	8(%rdi), %rcx, %rax
-        xorq	%r15, %r15
-        adoxq	%rcx, %r11
-        # A[2] * A[3]
-        mulxq	24(%rdi), %r13, %r14
-        adoxq	%rax, %r12
-        # A[2] * A[0]
-        mulxq	(%rdi), %rcx, %rax
-        adoxq	%r15, %r13
-        adcxq	%rcx, %r10
-        adoxq	%r15, %r14
-        # A[1] * A[3]
-        movq	8(%rdi), %rdx
-        mulxq	24(%rdi), %rbp, %r8
-        adcxq	%rax, %r11
-        adcxq	%rbp, %r12
-        adcxq	%r8, %r13
-        adcxq	%r15, %r14
-        # Double with Carry Flag
-        xorq	%r15, %r15
-        # A[0] * A[0]
-        movq	(%rdi), %rdx
-        mulxq	%rdx, %r8, %rbp
-        adcxq	%r9, %r9
-        # A[1] * A[1]
-        movq	8(%rdi), %rdx
-        mulxq	%rdx, %rcx, %rax
-        adcxq	%r10, %r10
-        adoxq	%rbp, %r9
-        adcxq	%r11, %r11
-        adoxq	%rcx, %r10
-        # A[2] * A[2]
-        movq	16(%rdi), %rdx
-        mulxq	%rdx, %rbp, %rcx
-        adcxq	%r12, %r12
-        adoxq	%rax, %r11
-        adcxq	%r13, %r13
-        adoxq	%rbp, %r12
-        # A[3] * A[3]
-        movq	24(%rdi), %rdx
-        mulxq	%rdx, %rbp, %rax
-        adcxq	%r14, %r14
-        adoxq	%rcx, %r13
-        adcxq	%r15, %r15
-        adoxq	%rbp, %r14
-        adoxq	%rax, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rcx
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rcx, %r11
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rcx, %rcx
-        mulxq	%r12, %rbp, %r12
-        adcxq	%rbp, %r8
-        adoxq	%r12, %r9
-        mulxq	%r13, %rbp, %r13
-        adcxq	%rbp, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rbp, %r14
-        adcxq	%rbp, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %r15, %rdx
-        adcxq	%r15, %r11
-        adoxq	%rcx, %rdx
-        adcxq	%rcx, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        movq	$0x7fffffffffffffff, %rcx
-        imulq	$19, %rdx, %rbp
-        andq	%rcx, %r11
-        addq	%rbp, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rcx, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rsi)
-        movq	%r9, 8(%rsi)
-        movq	%r10, 16(%rsi)
-        movq	%r11, 24(%rsi)
-        movq	16(%rsp), %rsi
-        movq	(%rsp), %rbx
-        # Add
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        movq	16(%rsi), %r10
-        movq	24(%rsi), %rdx
-        movq	%r8, %r12
-        addq	(%rbx), %r8
-        movq	%r9, %r13
-        adcq	8(%rbx), %r9
-        movq	%r10, %r14
-        adcq	16(%rbx), %r10
-        movq	%rdx, %r15
-        adcq	24(%rbx), %rdx
-        movq	$-19, %rcx
-        movq	%rdx, %r11
-        movq	$0x7fffffffffffffff, %rax
-        sarq	$63, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Sub modulus (if overflow)
-        subq	%rcx, %r8
-        sbbq	%rdx, %r9
-        sbbq	%rdx, %r10
-        sbbq	%rax, %r11
-        # Sub
-        subq	(%rbx), %r12
-        movq	$0x00, %rdx
-        sbbq	8(%rbx), %r13
-        movq	$-19, %rcx
-        sbbq	16(%rbx), %r14
-        movq	$0x7fffffffffffffff, %rax
-        sbbq	24(%rbx), %r15
-        sbbq	$0x00, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Add modulus (if underflow)
-        addq	%rcx, %r12
-        adcq	%rdx, %r13
-        adcq	%rdx, %r14
-        adcq	%rax, %r15
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	%r12, (%rsi)
-        movq	%r13, 8(%rsi)
-        movq	%r14, 16(%rsi)
-        movq	%r15, 24(%rsi)
-        movq	24(%rsp), %rsi
-        # Sub
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        movq	16(%rsi), %r10
-        movq	24(%rsi), %r11
-        subq	(%rdi), %r8
-        movq	$0x00, %rdx
-        sbbq	8(%rdi), %r9
-        movq	$-19, %rcx
-        sbbq	16(%rdi), %r10
-        movq	$0x7fffffffffffffff, %rax
-        sbbq	24(%rdi), %r11
-        sbbq	$0x00, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Add modulus (if underflow)
-        addq	%rcx, %r8
-        adcq	%rdx, %r9
-        adcq	%rdx, %r10
-        adcq	%rax, %r11
-        movq	%r8, (%rbx)
-        movq	%r9, 8(%rbx)
-        movq	%r10, 16(%rbx)
-        movq	%r11, 24(%rbx)
-        movq	104(%rsp), %rdi
-        # Square * 2
-        # A[0] * A[1]
-        movq	(%rdi), %rdx
-        mulxq	8(%rdi), %r9, %r10
-        # A[0] * A[3]
-        mulxq	24(%rdi), %r11, %r12
-        # A[2] * A[1]
-        movq	16(%rdi), %rdx
-        mulxq	8(%rdi), %rcx, %rax
-        xorq	%r15, %r15
-        adoxq	%rcx, %r11
-        # A[2] * A[3]
-        mulxq	24(%rdi), %r13, %r14
-        adoxq	%rax, %r12
-        # A[2] * A[0]
-        mulxq	(%rdi), %rcx, %rax
-        adoxq	%r15, %r13
-        adcxq	%rcx, %r10
-        adoxq	%r15, %r14
-        # A[1] * A[3]
-        movq	8(%rdi), %rdx
-        mulxq	24(%rdi), %rbp, %r8
-        adcxq	%rax, %r11
-        adcxq	%rbp, %r12
-        adcxq	%r8, %r13
-        adcxq	%r15, %r14
-        # Double with Carry Flag
-        xorq	%r15, %r15
-        # A[0] * A[0]
-        movq	(%rdi), %rdx
-        mulxq	%rdx, %r8, %rbp
-        adcxq	%r9, %r9
-        # A[1] * A[1]
-        movq	8(%rdi), %rdx
-        mulxq	%rdx, %rcx, %rax
-        adcxq	%r10, %r10
-        adoxq	%rbp, %r9
-        adcxq	%r11, %r11
-        adoxq	%rcx, %r10
-        # A[2] * A[2]
-        movq	16(%rdi), %rdx
-        mulxq	%rdx, %rbp, %rcx
-        adcxq	%r12, %r12
-        adoxq	%rax, %r11
-        adcxq	%r13, %r13
-        adoxq	%rbp, %r12
-        # A[3] * A[3]
-        movq	24(%rdi), %rdx
-        mulxq	%rdx, %rbp, %rax
-        adcxq	%r14, %r14
-        adoxq	%rcx, %r13
-        adcxq	%r15, %r15
-        adoxq	%rbp, %r14
-        adoxq	%rax, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rax
-        xorq	%rbp, %rbp
-        #  Move top half into t4-t7 and remove top bit from t3 and double
-        shldq	$3, %r15, %rbp
-        shldq	$2, %r14, %r15
-        shldq	$2, %r13, %r14
-        shldq	$2, %r12, %r13
-        shldq	$2, %r11, %r12
-        shldq	$0x01, %r10, %r11
-        shldq	$0x01, %r9, %r10
-        shldq	$0x01, %r8, %r9
-        shlq	$0x01, %r8
-        andq	%rax, %r11
-        #  Two out left, one in right
-        andq	%rax, %r15
-        #  Multiply top bits by 19*19
-        imulq	$0x169, %rbp, %rcx
-        xorq	%rax, %rax
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        adoxq	%rcx, %r8
-        mulxq	%r12, %rbp, %r12
-        adcxq	%rbp, %r8
-        adoxq	%r12, %r9
-        mulxq	%r13, %rbp, %r13
-        adcxq	%rbp, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rbp, %r14
-        adcxq	%rbp, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %r15, %rdx
-        adcxq	%r15, %r11
-        adoxq	%rax, %rdx
-        adcxq	%rax, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        movq	$0x7fffffffffffffff, %rax
-        imulq	$19, %rdx, %rbp
-        andq	%rax, %r11
-        addq	%rbp, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rax, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rsi)
-        movq	%r9, 8(%rsi)
-        movq	%r10, 16(%rsi)
-        movq	%r11, 24(%rsi)
-        movq	16(%rsp), %rdi
-        # Sub
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        movq	16(%rsi), %r10
-        movq	24(%rsi), %r11
-        subq	(%rdi), %r8
-        movq	$0x00, %rdx
-        sbbq	8(%rdi), %r9
-        movq	$-19, %rcx
-        sbbq	16(%rdi), %r10
-        movq	$0x7fffffffffffffff, %rax
-        sbbq	24(%rdi), %r11
-        sbbq	$0x00, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Add modulus (if underflow)
-        addq	%rcx, %r8
-        adcq	%rdx, %r9
-        adcq	%rdx, %r10
-        adcq	%rax, %r11
-        movq	%r8, (%rsi)
-        movq	%r9, 8(%rsi)
-        movq	%r10, 16(%rsi)
-        movq	%r11, 24(%rsi)
-        addq	$48, %rsp
-        popq	%r15
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        popq	%rbx
-        popq	%rbp
-        repz retq
-#ifndef __APPLE__
-.size	fe_ge_dbl_avx2,.-fe_ge_dbl_avx2
-#endif /* __APPLE__ */
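fe_ge_dbl_avx2 above leans on the "# Square" blocks, which cost roughly half a general multiply: only the off-diagonal products a[i]*a[j] with i < j are formed, the partial sum is doubled in one shift pass ("Double with Carry Flag"), and the diagonal squares a[i]^2 are added last. A portable sketch of that structure, again with 128-bit arithmetic replacing the carry-flag tricks (hypothetical helper, not the generated code):

#include <stdint.h>

/* t[0..7] = a[0..3]^2, shaped like the "# Square" blocks above. */
static void sqr_4(uint64_t t[8], const uint64_t a[4])
{
    unsigned __int128 p;
    uint64_t carry;
    int i, j;

    for (i = 0; i < 8; i++)
        t[i] = 0;

    /* Off-diagonal products a[i]*a[j], i < j -- half of schoolbook. */
    for (i = 0; i < 3; i++) {
        carry = 0;
        for (j = i + 1; j < 4; j++) {
            p = (unsigned __int128)a[i] * a[j] + t[i + j] + carry;
            t[i + j] = (uint64_t)p;
            carry    = (uint64_t)(p >> 64);
        }
        t[i + 4] = carry;
    }

    /* Double the off-diagonal sum ("Double with Carry Flag"). */
    for (i = 7; i > 0; i--)
        t[i] = (t[i] << 1) | (t[i - 1] >> 63);
    t[0] <<= 1;

    /* Add the diagonal squares a[i]^2 into columns 2i and 2i+1. */
    carry = 0;
    for (i = 0; i < 4; i++) {
        p = (unsigned __int128)a[i] * a[i] + t[2 * i] + carry;
        t[2 * i] = (uint64_t)p;
        p = (p >> 64) + t[2 * i + 1];
        t[2 * i + 1] = (uint64_t)p;
        carry = (uint64_t)(p >> 64);
    }
}

The "Square * 2" variant near the end of the function folds an extra doubling into the reduction's shift counts (hence the shldq by 2 and 3 instead of 1), producing 2*a^2 in a single pass.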
-#ifndef __APPLE__
-.text
-.globl	fe_ge_madd_avx2
-.type	fe_ge_madd_avx2,@function
-.align	16
-fe_ge_madd_avx2:
-#else
-.section	__TEXT,__text
-.globl	_fe_ge_madd_avx2
-.p2align	4
-_fe_ge_madd_avx2:
-#endif /* __APPLE__ */
-        pushq	%rbp
-        pushq	%rbx
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        pushq	%r15
-        subq	$48, %rsp
-        movq	%rdi, (%rsp)
-        movq	%rsi, 8(%rsp)
-        movq	%rdx, 16(%rsp)
-        movq	%rcx, 24(%rsp)
-        movq	%r8, 32(%rsp)
-        movq	%r9, 40(%rsp)
-        movq	8(%rsp), %rsi
-        movq	40(%rsp), %rbx
-        movq	32(%rsp), %rbp
-        # Add
-        movq	(%rbx), %r8
-        movq	8(%rbx), %r9
-        movq	16(%rbx), %r10
-        movq	24(%rbx), %rdx
-        movq	%r8, %r12
-        addq	(%rbp), %r8
-        movq	%r9, %r13
-        adcq	8(%rbp), %r9
-        movq	%r10, %r14
-        adcq	16(%rbp), %r10
-        movq	%rdx, %r15
-        adcq	24(%rbp), %rdx
-        movq	$-19, %rcx
-        movq	%rdx, %r11
-        movq	$0x7fffffffffffffff, %rax
-        sarq	$63, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Sub modulus (if overflow)
-        subq	%rcx, %r8
-        sbbq	%rdx, %r9
-        sbbq	%rdx, %r10
-        sbbq	%rax, %r11
-        # Sub
-        subq	(%rbp), %r12
-        movq	$0x00, %rdx
-        sbbq	8(%rbp), %r13
-        movq	$-19, %rcx
-        sbbq	16(%rbp), %r14
-        movq	$0x7fffffffffffffff, %rax
-        sbbq	24(%rbp), %r15
-        sbbq	$0x00, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Add modulus (if underflow)
-        addq	%rcx, %r12
-        adcq	%rdx, %r13
-        adcq	%rdx, %r14
-        adcq	%rax, %r15
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	%r12, (%rsi)
-        movq	%r13, 8(%rsi)
-        movq	%r14, 16(%rsi)
-        movq	%r15, 24(%rsi)
-        movq	16(%rsp), %rbx
-        movq	128(%rsp), %rbp
-        # Multiply
-        # A[0] * B[0]
-        movq	(%rbp), %rdx
-        mulxq	(%rdi), %r8, %r9
-        # A[2] * B[0]
-        mulxq	16(%rdi), %r10, %r11
-        # A[1] * B[0]
-        mulxq	8(%rdi), %rcx, %rax
-        xorq	%r15, %r15
-        adcxq	%rcx, %r9
-        # A[1] * B[3]
-        movq	24(%rbp), %rdx
-        mulxq	8(%rdi), %r12, %r13
-        adcxq	%rax, %r10
-        # A[0] * B[1]
-        movq	8(%rbp), %rdx
-        mulxq	(%rdi), %rcx, %rax
-        adoxq	%rcx, %r9
-        # A[2] * B[1]
-        mulxq	16(%rdi), %rcx, %r14
-        adoxq	%rax, %r10
-        adcxq	%rcx, %r11
-        # A[1] * B[2]
-        movq	16(%rbp), %rdx
-        mulxq	8(%rdi), %rcx, %rax
-        adcxq	%r14, %r12
-        adoxq	%rcx, %r11
-        adcxq	%r15, %r13
-        adoxq	%rax, %r12
-        # A[0] * B[2]
-        mulxq	(%rdi), %rcx, %rax
-        adoxq	%r15, %r13
-        xorq	%r14, %r14
-        adcxq	%rcx, %r10
-        # A[1] * B[1]
-        movq	8(%rbp), %rdx
-        mulxq	8(%rdi), %rdx, %rcx
-        adcxq	%rax, %r11
-        adoxq	%rdx, %r10
-        # A[3] * B[1]
-        movq	8(%rbp), %rdx
-        adoxq	%rcx, %r11
-        mulxq	24(%rdi), %rcx, %rax
-        adcxq	%rcx, %r12
-        # A[2] * B[2]
-        movq	16(%rbp), %rdx
-        mulxq	16(%rdi), %rdx, %rcx
-        adcxq	%rax, %r13
-        adoxq	%rdx, %r12
-        # A[3] * B[3]
-        movq	24(%rbp), %rdx
-        adoxq	%rcx, %r13
-        mulxq	24(%rdi), %rcx, %rax
-        adoxq	%r15, %r14
-        adcxq	%rcx, %r14
-        # A[0] * B[3]
-        mulxq	(%rdi), %rdx, %rcx
-        adcxq	%rax, %r15
-        xorq	%rax, %rax
-        adcxq	%rdx, %r11
-        # A[3] * B[0]
-        movq	(%rbp), %rdx
-        adcxq	%rcx, %r12
-        mulxq	24(%rdi), %rdx, %rcx
-        adoxq	%rdx, %r11
-        adoxq	%rcx, %r12
-        # A[2] * B[3]
-        movq	24(%rbp), %rdx
-        mulxq	16(%rdi), %rdx, %rcx
-        adcxq	%rdx, %r13
-        # A[3] * B[2]
-        movq	16(%rbp), %rdx
-        adcxq	%rcx, %r14
-        mulxq	24(%rdi), %rcx, %rdx
-        adcxq	%rax, %r15
-        adoxq	%rcx, %r13
-        adoxq	%rdx, %r14
-        adoxq	%rax, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rax
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rax, %r11
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rax, %rax
-        mulxq	%r12, %rcx, %r12
-        adcxq	%rcx, %r8
-        adoxq	%r12, %r9
-        mulxq	%r13, %rcx, %r13
-        adcxq	%rcx, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rcx, %r14
-        adcxq	%rcx, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %r15, %rdx
-        adcxq	%r15, %r11
-        adoxq	%rax, %rdx
-        adcxq	%rax, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        movq	$0x7fffffffffffffff, %rax
-        imulq	$19, %rdx, %rcx
-        andq	%rax, %r11
-        addq	%rcx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rax, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rbx)
-        movq	%r9, 8(%rbx)
-        movq	%r10, 16(%rbx)
-        movq	%r11, 24(%rbx)
-        movq	136(%rsp), %rdi
-        # Multiply
-        # A[0] * B[0]
-        movq	(%rdi), %rdx
-        mulxq	(%rsi), %r8, %r9
-        # A[2] * B[0]
-        mulxq	16(%rsi), %r10, %r11
-        # A[1] * B[0]
-        mulxq	8(%rsi), %rcx, %rax
-        xorq	%r15, %r15
-        adcxq	%rcx, %r9
-        # A[1] * B[3]
-        movq	24(%rdi), %rdx
-        mulxq	8(%rsi), %r12, %r13
-        adcxq	%rax, %r10
-        # A[0] * B[1]
-        movq	8(%rdi), %rdx
-        mulxq	(%rsi), %rcx, %rax
-        adoxq	%rcx, %r9
-        # A[2] * B[1]
-        mulxq	16(%rsi), %rcx, %r14
-        adoxq	%rax, %r10
-        adcxq	%rcx, %r11
-        # A[1] * B[2]
-        movq	16(%rdi), %rdx
-        mulxq	8(%rsi), %rcx, %rax
-        adcxq	%r14, %r12
-        adoxq	%rcx, %r11
-        adcxq	%r15, %r13
-        adoxq	%rax, %r12
-        # A[0] * B[2]
-        mulxq	(%rsi), %rcx, %rax
-        adoxq	%r15, %r13
-        xorq	%r14, %r14
-        adcxq	%rcx, %r10
-        # A[1] * B[1]
-        movq	8(%rdi), %rdx
-        mulxq	8(%rsi), %rdx, %rcx
-        adcxq	%rax, %r11
-        adoxq	%rdx, %r10
-        # A[3] * B[1]
-        movq	8(%rdi), %rdx
-        adoxq	%rcx, %r11
-        mulxq	24(%rsi), %rcx, %rax
-        adcxq	%rcx, %r12
-        # A[2] * B[2]
-        movq	16(%rdi), %rdx
-        mulxq	16(%rsi), %rdx, %rcx
-        adcxq	%rax, %r13
-        adoxq	%rdx, %r12
-        # A[3] * B[3]
-        movq	24(%rdi), %rdx
-        adoxq	%rcx, %r13
-        mulxq	24(%rsi), %rcx, %rax
-        adoxq	%r15, %r14
-        adcxq	%rcx, %r14
-        # A[0] * B[3]
-        mulxq	(%rsi), %rdx, %rcx
-        adcxq	%rax, %r15
-        xorq	%rax, %rax
-        adcxq	%rdx, %r11
-        # A[3] * B[0]
-        movq	(%rdi), %rdx
-        adcxq	%rcx, %r12
-        mulxq	24(%rsi), %rdx, %rcx
-        adoxq	%rdx, %r11
-        adoxq	%rcx, %r12
-        # A[2] * B[3]
-        movq	24(%rdi), %rdx
-        mulxq	16(%rsi), %rdx, %rcx
-        adcxq	%rdx, %r13
-        # A[3] * B[2]
-        movq	16(%rdi), %rdx
-        adcxq	%rcx, %r14
-        mulxq	24(%rsi), %rcx, %rdx
-        adcxq	%rax, %r15
-        adoxq	%rcx, %r13
-        adoxq	%rdx, %r14
-        adoxq	%rax, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rax
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rax, %r11
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rax, %rax
-        mulxq	%r12, %rcx, %r12
-        adcxq	%rcx, %r8
-        adoxq	%r12, %r9
-        mulxq	%r13, %rcx, %r13
-        adcxq	%rcx, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rcx, %r14
-        adcxq	%rcx, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %r15, %rdx
-        adcxq	%r15, %r11
-        adoxq	%rax, %rdx
-        adcxq	%rax, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        movq	$0x7fffffffffffffff, %rax
-        imulq	$19, %rdx, %rcx
-        andq	%rax, %r11
-        addq	%rcx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rax, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rsi)
-        movq	%r9, 8(%rsi)
-        movq	%r10, 16(%rsi)
-        movq	%r11, 24(%rsi)
-        movq	24(%rsp), %rdi
-        movq	120(%rsp), %rsi
-        movq	112(%rsp), %rbp
-        # Multiply
-        # A[0] * B[0]
-        movq	(%rbp), %rdx
-        mulxq	(%rsi), %r8, %r9
-        # A[2] * B[0]
-        mulxq	16(%rsi), %r10, %r11
-        # A[1] * B[0]
-        mulxq	8(%rsi), %rcx, %rax
-        xorq	%r15, %r15
-        adcxq	%rcx, %r9
-        # A[1] * B[3]
-        movq	24(%rbp), %rdx
-        mulxq	8(%rsi), %r12, %r13
-        adcxq	%rax, %r10
-        # A[0] * B[1]
-        movq	8(%rbp), %rdx
-        mulxq	(%rsi), %rcx, %rax
-        adoxq	%rcx, %r9
-        # A[2] * B[1]
-        mulxq	16(%rsi), %rcx, %r14
-        adoxq	%rax, %r10
-        adcxq	%rcx, %r11
-        # A[1] * B[2]
-        movq	16(%rbp), %rdx
-        mulxq	8(%rsi), %rcx, %rax
-        adcxq	%r14, %r12
-        adoxq	%rcx, %r11
-        adcxq	%r15, %r13
-        adoxq	%rax, %r12
-        # A[0] * B[2]
-        mulxq	(%rsi), %rcx, %rax
-        adoxq	%r15, %r13
-        xorq	%r14, %r14
-        adcxq	%rcx, %r10
-        # A[1] * B[1]
-        movq	8(%rbp), %rdx
-        mulxq	8(%rsi), %rdx, %rcx
-        adcxq	%rax, %r11
-        adoxq	%rdx, %r10
-        # A[3] * B[1]
-        movq	8(%rbp), %rdx
-        adoxq	%rcx, %r11
-        mulxq	24(%rsi), %rcx, %rax
-        adcxq	%rcx, %r12
-        # A[2] * B[2]
-        movq	16(%rbp), %rdx
-        mulxq	16(%rsi), %rdx, %rcx
-        adcxq	%rax, %r13
-        adoxq	%rdx, %r12
-        # A[3] * B[3]
-        movq	24(%rbp), %rdx
-        adoxq	%rcx, %r13
-        mulxq	24(%rsi), %rcx, %rax
-        adoxq	%r15, %r14
-        adcxq	%rcx, %r14
-        # A[0] * B[3]
-        mulxq	(%rsi), %rdx, %rcx
-        adcxq	%rax, %r15
-        xorq	%rax, %rax
-        adcxq	%rdx, %r11
-        # A[3] * B[0]
-        movq	(%rbp), %rdx
-        adcxq	%rcx, %r12
-        mulxq	24(%rsi), %rdx, %rcx
-        adoxq	%rdx, %r11
-        adoxq	%rcx, %r12
-        # A[2] * B[3]
-        movq	24(%rbp), %rdx
-        mulxq	16(%rsi), %rdx, %rcx
-        adcxq	%rdx, %r13
-        # A[3] * B[2]
-        movq	16(%rbp), %rdx
-        adcxq	%rcx, %r14
-        mulxq	24(%rsi), %rcx, %rdx
-        adcxq	%rax, %r15
-        adoxq	%rcx, %r13
-        adoxq	%rdx, %r14
-        adoxq	%rax, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rax
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rax, %r11
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rax, %rax
-        mulxq	%r12, %rcx, %r12
-        adcxq	%rcx, %r8
-        adoxq	%r12, %r9
-        mulxq	%r13, %rcx, %r13
-        adcxq	%rcx, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rcx, %r14
-        adcxq	%rcx, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %r15, %rdx
-        adcxq	%r15, %r11
-        adoxq	%rax, %rdx
-        adcxq	%rax, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        movq	$0x7fffffffffffffff, %rax
-        imulq	$19, %rdx, %rcx
-        andq	%rax, %r11
-        addq	%rcx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rax, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	8(%rsp), %rdi
-        movq	(%rsp), %rsi
-        # Add
-        movq	(%rbx), %r8
-        movq	8(%rbx), %r9
-        movq	16(%rbx), %r10
-        movq	24(%rbx), %rdx
-        movq	%r8, %r12
-        addq	(%rdi), %r8
-        movq	%r9, %r13
-        adcq	8(%rdi), %r9
-        movq	%r10, %r14
-        adcq	16(%rdi), %r10
-        movq	%rdx, %r15
-        adcq	24(%rdi), %rdx
-        movq	$-19, %rcx
-        movq	%rdx, %r11
-        movq	$0x7fffffffffffffff, %rax
-        sarq	$63, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Sub modulus (if overflow)
-        subq	%rcx, %r8
-        sbbq	%rdx, %r9
-        sbbq	%rdx, %r10
-        sbbq	%rax, %r11
-        # Sub
-        subq	(%rdi), %r12
-        movq	$0x00, %rdx
-        sbbq	8(%rdi), %r13
-        movq	$-19, %rcx
-        sbbq	16(%rdi), %r14
-        movq	$0x7fffffffffffffff, %rax
-        sbbq	24(%rdi), %r15
-        sbbq	$0x00, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Add modulus (if underflow)
-        addq	%rcx, %r12
-        adcq	%rdx, %r13
-        adcq	%rdx, %r14
-        adcq	%rax, %r15
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	%r12, (%rsi)
-        movq	%r13, 8(%rsi)
-        movq	%r14, 16(%rsi)
-        movq	%r15, 24(%rsi)
-        movq	104(%rsp), %rdi
-        # Double
-        movq	(%rdi), %r8
-        movq	8(%rdi), %r9
-        addq	%r8, %r8
-        movq	16(%rdi), %r10
-        adcq	%r9, %r9
-        movq	24(%rdi), %rdx
-        adcq	%r10, %r10
-        movq	$-19, %rcx
-        adcq	%rdx, %rdx
-        movq	$0x7fffffffffffffff, %rax
-        movq	%rdx, %r11
-        sarq	$63, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Sub modulus (if overflow)
-        subq	%rcx, %r8
-        sbbq	%rdx, %r9
-        sbbq	%rdx, %r10
-        sbbq	%rax, %r11
-        movq	%r8, (%rbx)
-        movq	%r9, 8(%rbx)
-        movq	%r10, 16(%rbx)
-        movq	%r11, 24(%rbx)
-        movq	24(%rsp), %rdi
-        # Add
-        movq	(%rbx), %r8
-        movq	8(%rbx), %r9
-        movq	16(%rbx), %r10
-        movq	24(%rbx), %rdx
-        movq	%r8, %r12
-        addq	(%rdi), %r8
-        movq	%r9, %r13
-        adcq	8(%rdi), %r9
-        movq	%r10, %r14
-        adcq	16(%rdi), %r10
-        movq	%rdx, %r15
-        adcq	24(%rdi), %rdx
-        movq	$-19, %rcx
-        movq	%rdx, %r11
-        movq	$0x7fffffffffffffff, %rax
-        sarq	$63, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Sub modulus (if overflow)
-        subq	%rcx, %r8
-        sbbq	%rdx, %r9
-        sbbq	%rdx, %r10
-        sbbq	%rax, %r11
-        # Sub
-        subq	(%rdi), %r12
-        movq	$0x00, %rdx
-        sbbq	8(%rdi), %r13
-        movq	$-19, %rcx
-        sbbq	16(%rdi), %r14
-        movq	$0x7fffffffffffffff, %rax
-        sbbq	24(%rdi), %r15
-        sbbq	$0x00, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Add modulus (if underflow)
-        addq	%rcx, %r12
-        adcq	%rdx, %r13
-        adcq	%rdx, %r14
-        adcq	%rax, %r15
-        movq	%r8, (%rbx)
-        movq	%r9, 8(%rbx)
-        movq	%r10, 16(%rbx)
-        movq	%r11, 24(%rbx)
-        movq	%r12, (%rdi)
-        movq	%r13, 8(%rdi)
-        movq	%r14, 16(%rdi)
-        movq	%r15, 24(%rdi)
-        addq	$48, %rsp
-        popq	%r15
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        popq	%rbx
-        popq	%rbp
-        repz retq
-#ifndef __APPLE__
-.size	fe_ge_madd_avx2,.-fe_ge_madd_avx2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_ge_msub_avx2
-.type	fe_ge_msub_avx2,@function
-.align	16
-fe_ge_msub_avx2:
-#else
-.section	__TEXT,__text
-.globl	_fe_ge_msub_avx2
-.p2align	4
-_fe_ge_msub_avx2:
-#endif /* __APPLE__ */
-        pushq	%rbp
-        pushq	%rbx
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        pushq	%r15
-        subq	$48, %rsp
-        movq	%rdi, (%rsp)
-        movq	%rsi, 8(%rsp)
-        movq	%rdx, 16(%rsp)
-        movq	%rcx, 24(%rsp)
-        movq	%r8, 32(%rsp)
-        movq	%r9, 40(%rsp)
-        movq	8(%rsp), %rsi
-        movq	40(%rsp), %rbx
-        movq	32(%rsp), %rbp
-        # Add
-        movq	(%rbx), %r8
-        movq	8(%rbx), %r9
-        movq	16(%rbx), %r10
-        movq	24(%rbx), %rdx
-        movq	%r8, %r12
-        addq	(%rbp), %r8
-        movq	%r9, %r13
-        adcq	8(%rbp), %r9
-        movq	%r10, %r14
-        adcq	16(%rbp), %r10
-        movq	%rdx, %r15
-        adcq	24(%rbp), %rdx
-        movq	$-19, %rcx
-        movq	%rdx, %r11
-        movq	$0x7fffffffffffffff, %rax
-        sarq	$63, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Sub modulus (if overflow)
-        subq	%rcx, %r8
-        sbbq	%rdx, %r9
-        sbbq	%rdx, %r10
-        sbbq	%rax, %r11
-        # Sub
-        subq	(%rbp), %r12
-        movq	$0x00, %rdx
-        sbbq	8(%rbp), %r13
-        movq	$-19, %rcx
-        sbbq	16(%rbp), %r14
-        movq	$0x7fffffffffffffff, %rax
-        sbbq	24(%rbp), %r15
-        sbbq	$0x00, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Add modulus (if underflow)
-        addq	%rcx, %r12
-        adcq	%rdx, %r13
-        adcq	%rdx, %r14
-        adcq	%rax, %r15
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	%r12, (%rsi)
-        movq	%r13, 8(%rsi)
-        movq	%r14, 16(%rsi)
-        movq	%r15, 24(%rsi)
-        movq	16(%rsp), %rbx
-        movq	136(%rsp), %rbp
-        # Multiply
-        # A[0] * B[0]
-        movq	(%rbp), %rdx
-        mulxq	(%rdi), %r8, %r9
-        # A[2] * B[0]
-        mulxq	16(%rdi), %r10, %r11
-        # A[1] * B[0]
-        mulxq	8(%rdi), %rcx, %rax
-        xorq	%r15, %r15
-        adcxq	%rcx, %r9
-        # A[1] * B[3]
-        movq	24(%rbp), %rdx
-        mulxq	8(%rdi), %r12, %r13
-        adcxq	%rax, %r10
-        # A[0] * B[1]
-        movq	8(%rbp), %rdx
-        mulxq	(%rdi), %rcx, %rax
-        adoxq	%rcx, %r9
-        # A[2] * B[1]
-        mulxq	16(%rdi), %rcx, %r14
-        adoxq	%rax, %r10
-        adcxq	%rcx, %r11
-        # A[1] * B[2]
-        movq	16(%rbp), %rdx
-        mulxq	8(%rdi), %rcx, %rax
-        adcxq	%r14, %r12
-        adoxq	%rcx, %r11
-        adcxq	%r15, %r13
-        adoxq	%rax, %r12
-        # A[0] * B[2]
-        mulxq	(%rdi), %rcx, %rax
-        adoxq	%r15, %r13
-        xorq	%r14, %r14
-        adcxq	%rcx, %r10
-        # A[1] * B[1]
-        movq	8(%rbp), %rdx
-        mulxq	8(%rdi), %rdx, %rcx
-        adcxq	%rax, %r11
-        adoxq	%rdx, %r10
-        # A[3] * B[1]
-        movq	8(%rbp), %rdx
-        adoxq	%rcx, %r11
-        mulxq	24(%rdi), %rcx, %rax
-        adcxq	%rcx, %r12
-        # A[2] * B[2]
-        movq	16(%rbp), %rdx
-        mulxq	16(%rdi), %rdx, %rcx
-        adcxq	%rax, %r13
-        adoxq	%rdx, %r12
-        # A[3] * B[3]
-        movq	24(%rbp), %rdx
-        adoxq	%rcx, %r13
-        mulxq	24(%rdi), %rcx, %rax
-        adoxq	%r15, %r14
-        adcxq	%rcx, %r14
-        # A[0] * B[3]
-        mulxq	(%rdi), %rdx, %rcx
-        adcxq	%rax, %r15
-        xorq	%rax, %rax
-        adcxq	%rdx, %r11
-        # A[3] * B[0]
-        movq	(%rbp), %rdx
-        adcxq	%rcx, %r12
-        mulxq	24(%rdi), %rdx, %rcx
-        adoxq	%rdx, %r11
-        adoxq	%rcx, %r12
-        # A[2] * B[3]
-        movq	24(%rbp), %rdx
-        mulxq	16(%rdi), %rdx, %rcx
-        adcxq	%rdx, %r13
-        # A[3] * B[2]
-        movq	16(%rbp), %rdx
-        adcxq	%rcx, %r14
-        mulxq	24(%rdi), %rcx, %rdx
-        adcxq	%rax, %r15
-        adoxq	%rcx, %r13
-        adoxq	%rdx, %r14
-        adoxq	%rax, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rax
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rax, %r11
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rax, %rax
-        mulxq	%r12, %rcx, %r12
-        adcxq	%rcx, %r8
-        adoxq	%r12, %r9
-        mulxq	%r13, %rcx, %r13
-        adcxq	%rcx, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rcx, %r14
-        adcxq	%rcx, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %r15, %rdx
-        adcxq	%r15, %r11
-        adoxq	%rax, %rdx
-        adcxq	%rax, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        movq	$0x7fffffffffffffff, %rax
-        imulq	$19, %rdx, %rcx
-        andq	%rax, %r11
-        addq	%rcx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rax, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rbx)
-        movq	%r9, 8(%rbx)
-        movq	%r10, 16(%rbx)
-        movq	%r11, 24(%rbx)
-        movq	128(%rsp), %rdi
-        # Multiply
-        # A[0] * B[0]
-        movq	(%rdi), %rdx
-        mulxq	(%rsi), %r8, %r9
-        # A[2] * B[0]
-        mulxq	16(%rsi), %r10, %r11
-        # A[1] * B[0]
-        mulxq	8(%rsi), %rcx, %rax
-        xorq	%r15, %r15
-        adcxq	%rcx, %r9
-        # A[1] * B[3]
-        movq	24(%rdi), %rdx
-        mulxq	8(%rsi), %r12, %r13
-        adcxq	%rax, %r10
-        # A[0] * B[1]
-        movq	8(%rdi), %rdx
-        mulxq	(%rsi), %rcx, %rax
-        adoxq	%rcx, %r9
-        # A[2] * B[1]
-        mulxq	16(%rsi), %rcx, %r14
-        adoxq	%rax, %r10
-        adcxq	%rcx, %r11
-        # A[1] * B[2]
-        movq	16(%rdi), %rdx
-        mulxq	8(%rsi), %rcx, %rax
-        adcxq	%r14, %r12
-        adoxq	%rcx, %r11
-        adcxq	%r15, %r13
-        adoxq	%rax, %r12
-        # A[0] * B[2]
-        mulxq	(%rsi), %rcx, %rax
-        adoxq	%r15, %r13
-        xorq	%r14, %r14
-        adcxq	%rcx, %r10
-        # A[1] * B[1]
-        movq	8(%rdi), %rdx
-        mulxq	8(%rsi), %rdx, %rcx
-        adcxq	%rax, %r11
-        adoxq	%rdx, %r10
-        # A[3] * B[1]
-        movq	8(%rdi), %rdx
-        adoxq	%rcx, %r11
-        mulxq	24(%rsi), %rcx, %rax
-        adcxq	%rcx, %r12
-        # A[2] * B[2]
-        movq	16(%rdi), %rdx
-        mulxq	16(%rsi), %rdx, %rcx
-        adcxq	%rax, %r13
-        adoxq	%rdx, %r12
-        # A[3] * B[3]
-        movq	24(%rdi), %rdx
-        adoxq	%rcx, %r13
-        mulxq	24(%rsi), %rcx, %rax
-        adoxq	%r15, %r14
-        adcxq	%rcx, %r14
-        # A[0] * B[3]
-        mulxq	(%rsi), %rdx, %rcx
-        adcxq	%rax, %r15
-        xorq	%rax, %rax
-        adcxq	%rdx, %r11
-        # A[3] * B[0]
-        movq	(%rdi), %rdx
-        adcxq	%rcx, %r12
-        mulxq	24(%rsi), %rdx, %rcx
-        adoxq	%rdx, %r11
-        adoxq	%rcx, %r12
-        # A[2] * B[3]
-        movq	24(%rdi), %rdx
-        mulxq	16(%rsi), %rdx, %rcx
-        adcxq	%rdx, %r13
-        # A[3] * B[2]
-        movq	16(%rdi), %rdx
-        adcxq	%rcx, %r14
-        mulxq	24(%rsi), %rcx, %rdx
-        adcxq	%rax, %r15
-        adoxq	%rcx, %r13
-        adoxq	%rdx, %r14
-        adoxq	%rax, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rax
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rax, %r11
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rax, %rax
-        mulxq	%r12, %rcx, %r12
-        adcxq	%rcx, %r8
-        adoxq	%r12, %r9
-        mulxq	%r13, %rcx, %r13
-        adcxq	%rcx, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rcx, %r14
-        adcxq	%rcx, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %r15, %rdx
-        adcxq	%r15, %r11
-        adoxq	%rax, %rdx
-        adcxq	%rax, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        movq	$0x7fffffffffffffff, %rax
-        imulq	$19, %rdx, %rcx
-        andq	%rax, %r11
-        addq	%rcx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rax, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rsi)
-        movq	%r9, 8(%rsi)
-        movq	%r10, 16(%rsi)
-        movq	%r11, 24(%rsi)
-        movq	24(%rsp), %rdi
-        movq	120(%rsp), %rsi
-        movq	112(%rsp), %rbp
-        # Multiply
-        # A[0] * B[0]
-        movq	(%rbp), %rdx
-        mulxq	(%rsi), %r8, %r9
-        # A[2] * B[0]
-        mulxq	16(%rsi), %r10, %r11
-        # A[1] * B[0]
-        mulxq	8(%rsi), %rcx, %rax
-        xorq	%r15, %r15
-        adcxq	%rcx, %r9
-        # A[1] * B[3]
-        movq	24(%rbp), %rdx
-        mulxq	8(%rsi), %r12, %r13
-        adcxq	%rax, %r10
-        # A[0] * B[1]
-        movq	8(%rbp), %rdx
-        mulxq	(%rsi), %rcx, %rax
-        adoxq	%rcx, %r9
-        # A[2] * B[1]
-        mulxq	16(%rsi), %rcx, %r14
-        adoxq	%rax, %r10
-        adcxq	%rcx, %r11
-        # A[1] * B[2]
-        movq	16(%rbp), %rdx
-        mulxq	8(%rsi), %rcx, %rax
-        adcxq	%r14, %r12
-        adoxq	%rcx, %r11
-        adcxq	%r15, %r13
-        adoxq	%rax, %r12
-        # A[0] * B[2]
-        mulxq	(%rsi), %rcx, %rax
-        adoxq	%r15, %r13
-        xorq	%r14, %r14
-        adcxq	%rcx, %r10
-        # A[1] * B[1]
-        movq	8(%rbp), %rdx
-        mulxq	8(%rsi), %rdx, %rcx
-        adcxq	%rax, %r11
-        adoxq	%rdx, %r10
-        # A[3] * B[1]
-        movq	8(%rbp), %rdx
-        adoxq	%rcx, %r11
-        mulxq	24(%rsi), %rcx, %rax
-        adcxq	%rcx, %r12
-        # A[2] * B[2]
-        movq	16(%rbp), %rdx
-        mulxq	16(%rsi), %rdx, %rcx
-        adcxq	%rax, %r13
-        adoxq	%rdx, %r12
-        # A[3] * B[3]
-        movq	24(%rbp), %rdx
-        adoxq	%rcx, %r13
-        mulxq	24(%rsi), %rcx, %rax
-        adoxq	%r15, %r14
-        adcxq	%rcx, %r14
-        # A[0] * B[3]
-        mulxq	(%rsi), %rdx, %rcx
-        adcxq	%rax, %r15
-        xorq	%rax, %rax
-        adcxq	%rdx, %r11
-        # A[3] * B[0]
-        movq	(%rbp), %rdx
-        adcxq	%rcx, %r12
-        mulxq	24(%rsi), %rdx, %rcx
-        adoxq	%rdx, %r11
-        adoxq	%rcx, %r12
-        # A[2] * B[3]
-        movq	24(%rbp), %rdx
-        mulxq	16(%rsi), %rdx, %rcx
-        adcxq	%rdx, %r13
-        # A[3] * B[2]
-        movq	16(%rbp), %rdx
-        adcxq	%rcx, %r14
-        mulxq	24(%rsi), %rcx, %rdx
-        adcxq	%rax, %r15
-        adoxq	%rcx, %r13
-        adoxq	%rdx, %r14
-        adoxq	%rax, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rax
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rax, %r11
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rax, %rax
-        mulxq	%r12, %rcx, %r12
-        adcxq	%rcx, %r8
-        adoxq	%r12, %r9
-        mulxq	%r13, %rcx, %r13
-        adcxq	%rcx, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rcx, %r14
-        adcxq	%rcx, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %r15, %rdx
-        adcxq	%r15, %r11
-        adoxq	%rax, %rdx
-        adcxq	%rax, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        movq	$0x7fffffffffffffff, %rax
-        imulq	$19, %rdx, %rcx
-        andq	%rax, %r11
-        addq	%rcx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rax, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	8(%rsp), %rsi
-        movq	(%rsp), %rbp
-        # Add
-        movq	(%rbx), %r8
-        movq	8(%rbx), %r9
-        movq	16(%rbx), %r10
-        movq	24(%rbx), %rdx
-        movq	%r8, %r12
-        addq	(%rsi), %r8
-        movq	%r9, %r13
-        adcq	8(%rsi), %r9
-        movq	%r10, %r14
-        adcq	16(%rsi), %r10
-        movq	%rdx, %r15
-        adcq	24(%rsi), %rdx
-        movq	$-19, %rcx
-        movq	%rdx, %r11
-        movq	$0x7fffffffffffffff, %rax
-        sarq	$63, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Sub modulus (if overflow)
-        subq	%rcx, %r8
-        sbbq	%rdx, %r9
-        sbbq	%rdx, %r10
-        sbbq	%rax, %r11
-        # Sub
-        subq	(%rsi), %r12
-        movq	$0x00, %rdx
-        sbbq	8(%rsi), %r13
-        movq	$-19, %rcx
-        sbbq	16(%rsi), %r14
-        movq	$0x7fffffffffffffff, %rax
-        sbbq	24(%rsi), %r15
-        sbbq	$0x00, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Add modulus (if underflow)
-        addq	%rcx, %r12
-        adcq	%rdx, %r13
-        adcq	%rdx, %r14
-        adcq	%rax, %r15
-        movq	%r8, (%rsi)
-        movq	%r9, 8(%rsi)
-        movq	%r10, 16(%rsi)
-        movq	%r11, 24(%rsi)
-        movq	%r12, (%rbp)
-        movq	%r13, 8(%rbp)
-        movq	%r14, 16(%rbp)
-        movq	%r15, 24(%rbp)
-        movq	104(%rsp), %rsi
-        # Double
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        addq	%r8, %r8
-        movq	16(%rsi), %r10
-        adcq	%r9, %r9
-        movq	24(%rsi), %rdx
-        adcq	%r10, %r10
-        movq	$-19, %rcx
-        adcq	%rdx, %rdx
-        movq	$0x7fffffffffffffff, %rax
-        movq	%rdx, %r11
-        sarq	$63, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Sub modulus (if overflow)
-        subq	%rcx, %r8
-        sbbq	%rdx, %r9
-        sbbq	%rdx, %r10
-        sbbq	%rax, %r11
-        movq	%r8, (%rbx)
-        movq	%r9, 8(%rbx)
-        movq	%r10, 16(%rbx)
-        movq	%r11, 24(%rbx)
-        # Add
-        movq	(%rbx), %r8
-        movq	8(%rbx), %r9
-        movq	16(%rbx), %r10
-        movq	24(%rbx), %rdx
-        movq	%r8, %r12
-        addq	(%rdi), %r8
-        movq	%r9, %r13
-        adcq	8(%rdi), %r9
-        movq	%r10, %r14
-        adcq	16(%rdi), %r10
-        movq	%rdx, %r15
-        adcq	24(%rdi), %rdx
-        movq	$-19, %rcx
-        movq	%rdx, %r11
-        movq	$0x7fffffffffffffff, %rax
-        sarq	$63, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Sub modulus (if overflow)
-        subq	%rcx, %r8
-        sbbq	%rdx, %r9
-        sbbq	%rdx, %r10
-        sbbq	%rax, %r11
-        # Sub
-        subq	(%rdi), %r12
-        movq	$0x00, %rdx
-        sbbq	8(%rdi), %r13
-        movq	$-19, %rcx
-        sbbq	16(%rdi), %r14
-        movq	$0x7fffffffffffffff, %rax
-        sbbq	24(%rdi), %r15
-        sbbq	$0x00, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Add modulus (if underflow)
-        addq	%rcx, %r12
-        adcq	%rdx, %r13
-        adcq	%rdx, %r14
-        adcq	%rax, %r15
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	%r12, (%rbx)
-        movq	%r13, 8(%rbx)
-        movq	%r14, 16(%rbx)
-        movq	%r15, 24(%rbx)
-        addq	$48, %rsp
-        popq	%r15
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        popq	%rbx
-        popq	%rbp
-        repz retq
-#ifndef __APPLE__
-.size	fe_ge_msub_avx2,.-fe_ge_msub_avx2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_ge_add_avx2
-.type	fe_ge_add_avx2,@function
-.align	16
-fe_ge_add_avx2:
-#else
-.section	__TEXT,__text
-.globl	_fe_ge_add_avx2
-.p2align	4
-_fe_ge_add_avx2:
-#endif /* __APPLE__ */
-        pushq	%rbx
-        pushq	%rbp
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        pushq	%r15
-        subq	$0x50, %rsp
-        movq	%rdi, (%rsp)
-        movq	%rsi, 8(%rsp)
-        movq	%rdx, 16(%rsp)
-        movq	%rcx, 24(%rsp)
-        movq	%r8, 32(%rsp)
-        movq	%r9, 40(%rsp)
-        movq	8(%rsp), %rsi
-        movq	40(%rsp), %rbx
-        movq	32(%rsp), %rbp
-        # Add
-        movq	(%rbx), %r8
-        movq	8(%rbx), %r9
-        movq	16(%rbx), %r10
-        movq	24(%rbx), %rdx
-        movq	%r8, %r12
-        addq	(%rbp), %r8
-        movq	%r9, %r13
-        adcq	8(%rbp), %r9
-        movq	%r10, %r14
-        adcq	16(%rbp), %r10
-        movq	%rdx, %r15
-        adcq	24(%rbp), %rdx
-        movq	$-19, %rcx
-        movq	%rdx, %r11
-        movq	$0x7fffffffffffffff, %rax
-        sarq	$63, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Sub modulus (if overflow)
-        subq	%rcx, %r8
-        sbbq	%rdx, %r9
-        sbbq	%rdx, %r10
-        sbbq	%rax, %r11
-        # Sub
-        subq	(%rbp), %r12
-        movq	$0x00, %rdx
-        sbbq	8(%rbp), %r13
-        movq	$-19, %rcx
-        sbbq	16(%rbp), %r14
-        movq	$0x7fffffffffffffff, %rax
-        sbbq	24(%rbp), %r15
-        sbbq	$0x00, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Add modulus (if underflow)
-        addq	%rcx, %r12
-        adcq	%rdx, %r13
-        adcq	%rdx, %r14
-        adcq	%rax, %r15
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	%r12, (%rsi)
-        movq	%r13, 8(%rsi)
-        movq	%r14, 16(%rsi)
-        movq	%r15, 24(%rsi)
-        movq	16(%rsp), %rbx
-        movq	168(%rsp), %rbp
-        # Multiply
-        # A[0] * B[0]
-        movq	(%rbp), %rdx
-        mulxq	(%rdi), %r8, %r9
-        # A[2] * B[0]
-        mulxq	16(%rdi), %r10, %r11
-        # A[1] * B[0]
-        mulxq	8(%rdi), %rcx, %rax
-        xorq	%r15, %r15
-        adcxq	%rcx, %r9
-        # A[1] * B[3]
-        movq	24(%rbp), %rdx
-        mulxq	8(%rdi), %r12, %r13
-        adcxq	%rax, %r10
-        # A[0] * B[1]
-        movq	8(%rbp), %rdx
-        mulxq	(%rdi), %rcx, %rax
-        adoxq	%rcx, %r9
-        # A[2] * B[1]
-        mulxq	16(%rdi), %rcx, %r14
-        adoxq	%rax, %r10
-        adcxq	%rcx, %r11
-        # A[1] * B[2]
-        movq	16(%rbp), %rdx
-        mulxq	8(%rdi), %rcx, %rax
-        adcxq	%r14, %r12
-        adoxq	%rcx, %r11
-        adcxq	%r15, %r13
-        adoxq	%rax, %r12
-        # A[0] * B[2]
-        mulxq	(%rdi), %rcx, %rax
-        adoxq	%r15, %r13
-        xorq	%r14, %r14
-        adcxq	%rcx, %r10
-        # A[1] * B[1]
-        movq	8(%rbp), %rdx
-        mulxq	8(%rdi), %rdx, %rcx
-        adcxq	%rax, %r11
-        adoxq	%rdx, %r10
-        # A[3] * B[1]
-        movq	8(%rbp), %rdx
-        adoxq	%rcx, %r11
-        mulxq	24(%rdi), %rcx, %rax
-        adcxq	%rcx, %r12
-        # A[2] * B[2]
-        movq	16(%rbp), %rdx
-        mulxq	16(%rdi), %rdx, %rcx
-        adcxq	%rax, %r13
-        adoxq	%rdx, %r12
-        # A[3] * B[3]
-        movq	24(%rbp), %rdx
-        adoxq	%rcx, %r13
-        mulxq	24(%rdi), %rcx, %rax
-        adoxq	%r15, %r14
-        adcxq	%rcx, %r14
-        # A[0] * B[3]
-        mulxq	(%rdi), %rdx, %rcx
-        adcxq	%rax, %r15
-        xorq	%rax, %rax
-        adcxq	%rdx, %r11
-        # A[3] * B[0]
-        movq	(%rbp), %rdx
-        adcxq	%rcx, %r12
-        mulxq	24(%rdi), %rdx, %rcx
-        adoxq	%rdx, %r11
-        adoxq	%rcx, %r12
-        # A[2] * B[3]
-        movq	24(%rbp), %rdx
-        mulxq	16(%rdi), %rdx, %rcx
-        adcxq	%rdx, %r13
-        # A[3] * B[2]
-        movq	16(%rbp), %rdx
-        adcxq	%rcx, %r14
-        mulxq	24(%rdi), %rcx, %rdx
-        adcxq	%rax, %r15
-        adoxq	%rcx, %r13
-        adoxq	%rdx, %r14
-        adoxq	%rax, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rax
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rax, %r11
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rax, %rax
-        mulxq	%r12, %rcx, %r12
-        adcxq	%rcx, %r8
-        adoxq	%r12, %r9
-        mulxq	%r13, %rcx, %r13
-        adcxq	%rcx, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rcx, %r14
-        adcxq	%rcx, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %r15, %rdx
-        adcxq	%r15, %r11
-        adoxq	%rax, %rdx
-        adcxq	%rax, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        movq	$0x7fffffffffffffff, %rax
-        imulq	$19, %rdx, %rcx
-        andq	%rax, %r11
-        addq	%rcx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rax, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rbx)
-        movq	%r9, 8(%rbx)
-        movq	%r10, 16(%rbx)
-        movq	%r11, 24(%rbx)
-        movq	176(%rsp), %rbx
-        # Multiply
-        # A[0] * B[0]
-        movq	(%rbx), %rdx
-        mulxq	(%rsi), %r8, %r9
-        # A[2] * B[0]
-        mulxq	16(%rsi), %r10, %r11
-        # A[1] * B[0]
-        mulxq	8(%rsi), %rcx, %rax
-        xorq	%r15, %r15
-        adcxq	%rcx, %r9
-        # A[1] * B[3]
-        movq	24(%rbx), %rdx
-        mulxq	8(%rsi), %r12, %r13
-        adcxq	%rax, %r10
-        # A[0] * B[1]
-        movq	8(%rbx), %rdx
-        mulxq	(%rsi), %rcx, %rax
-        adoxq	%rcx, %r9
-        # A[2] * B[1]
-        mulxq	16(%rsi), %rcx, %r14
-        adoxq	%rax, %r10
-        adcxq	%rcx, %r11
-        # A[1] * B[2]
-        movq	16(%rbx), %rdx
-        mulxq	8(%rsi), %rcx, %rax
-        adcxq	%r14, %r12
-        adoxq	%rcx, %r11
-        adcxq	%r15, %r13
-        adoxq	%rax, %r12
-        # A[0] * B[2]
-        mulxq	(%rsi), %rcx, %rax
-        adoxq	%r15, %r13
-        xorq	%r14, %r14
-        adcxq	%rcx, %r10
-        # A[1] * B[1]
-        movq	8(%rbx), %rdx
-        mulxq	8(%rsi), %rdx, %rcx
-        adcxq	%rax, %r11
-        adoxq	%rdx, %r10
-        # A[3] * B[1]
-        movq	8(%rbx), %rdx
-        adoxq	%rcx, %r11
-        mulxq	24(%rsi), %rcx, %rax
-        adcxq	%rcx, %r12
-        # A[2] * B[2]
-        movq	16(%rbx), %rdx
-        mulxq	16(%rsi), %rdx, %rcx
-        adcxq	%rax, %r13
-        adoxq	%rdx, %r12
-        # A[3] * B[3]
-        movq	24(%rbx), %rdx
-        adoxq	%rcx, %r13
-        mulxq	24(%rsi), %rcx, %rax
-        adoxq	%r15, %r14
-        adcxq	%rcx, %r14
-        # A[0] * B[3]
-        mulxq	(%rsi), %rdx, %rcx
-        adcxq	%rax, %r15
-        xorq	%rax, %rax
-        adcxq	%rdx, %r11
-        # A[3] * B[0]
-        movq	(%rbx), %rdx
-        adcxq	%rcx, %r12
-        mulxq	24(%rsi), %rdx, %rcx
-        adoxq	%rdx, %r11
-        adoxq	%rcx, %r12
-        # A[2] * B[3]
-        movq	24(%rbx), %rdx
-        mulxq	16(%rsi), %rdx, %rcx
-        adcxq	%rdx, %r13
-        # A[3] * B[2]
-        movq	16(%rbx), %rdx
-        adcxq	%rcx, %r14
-        mulxq	24(%rsi), %rcx, %rdx
-        adcxq	%rax, %r15
-        adoxq	%rcx, %r13
-        adoxq	%rdx, %r14
-        adoxq	%rax, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rax
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rax, %r11
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rax, %rax
-        mulxq	%r12, %rcx, %r12
-        adcxq	%rcx, %r8
-        adoxq	%r12, %r9
-        mulxq	%r13, %rcx, %r13
-        adcxq	%rcx, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rcx, %r14
-        adcxq	%rcx, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %r15, %rdx
-        adcxq	%r15, %r11
-        adoxq	%rax, %rdx
-        adcxq	%rax, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        movq	$0x7fffffffffffffff, %rax
-        imulq	$19, %rdx, %rcx
-        andq	%rax, %r11
-        addq	%rcx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rax, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rsi)
-        movq	%r9, 8(%rsi)
-        movq	%r10, 16(%rsi)
-        movq	%r11, 24(%rsi)
-        movq	24(%rsp), %rsi
-        movq	160(%rsp), %rbx
-        movq	144(%rsp), %rbp
-        # Multiply
-        # A[0] * B[0]
-        movq	(%rbp), %rdx
-        mulxq	(%rbx), %r8, %r9
-        # A[2] * B[0]
-        mulxq	16(%rbx), %r10, %r11
-        # A[1] * B[0]
-        mulxq	8(%rbx), %rcx, %rax
-        xorq	%r15, %r15
-        adcxq	%rcx, %r9
-        # A[1] * B[3]
-        movq	24(%rbp), %rdx
-        mulxq	8(%rbx), %r12, %r13
-        adcxq	%rax, %r10
-        # A[0] * B[1]
-        movq	8(%rbp), %rdx
-        mulxq	(%rbx), %rcx, %rax
-        adoxq	%rcx, %r9
-        # A[2] * B[1]
-        mulxq	16(%rbx), %rcx, %r14
-        adoxq	%rax, %r10
-        adcxq	%rcx, %r11
-        # A[1] * B[2]
-        movq	16(%rbp), %rdx
-        mulxq	8(%rbx), %rcx, %rax
-        adcxq	%r14, %r12
-        adoxq	%rcx, %r11
-        adcxq	%r15, %r13
-        adoxq	%rax, %r12
-        # A[0] * B[2]
-        mulxq	(%rbx), %rcx, %rax
-        adoxq	%r15, %r13
-        xorq	%r14, %r14
-        adcxq	%rcx, %r10
-        # A[1] * B[1]
-        movq	8(%rbp), %rdx
-        mulxq	8(%rbx), %rdx, %rcx
-        adcxq	%rax, %r11
-        adoxq	%rdx, %r10
-        # A[3] * B[1]
-        movq	8(%rbp), %rdx
-        adoxq	%rcx, %r11
-        mulxq	24(%rbx), %rcx, %rax
-        adcxq	%rcx, %r12
-        # A[2] * B[2]
-        movq	16(%rbp), %rdx
-        mulxq	16(%rbx), %rdx, %rcx
-        adcxq	%rax, %r13
-        adoxq	%rdx, %r12
-        # A[3] * B[3]
-        movq	24(%rbp), %rdx
-        adoxq	%rcx, %r13
-        mulxq	24(%rbx), %rcx, %rax
-        adoxq	%r15, %r14
-        adcxq	%rcx, %r14
-        # A[0] * B[3]
-        mulxq	(%rbx), %rdx, %rcx
-        adcxq	%rax, %r15
-        xorq	%rax, %rax
-        adcxq	%rdx, %r11
-        # A[3] * B[0]
-        movq	(%rbp), %rdx
-        adcxq	%rcx, %r12
-        mulxq	24(%rbx), %rdx, %rcx
-        adoxq	%rdx, %r11
-        adoxq	%rcx, %r12
-        # A[2] * B[3]
-        movq	24(%rbp), %rdx
-        mulxq	16(%rbx), %rdx, %rcx
-        adcxq	%rdx, %r13
-        # A[3] * B[2]
-        movq	16(%rbp), %rdx
-        adcxq	%rcx, %r14
-        mulxq	24(%rbx), %rcx, %rdx
-        adcxq	%rax, %r15
-        adoxq	%rcx, %r13
-        adoxq	%rdx, %r14
-        adoxq	%rax, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rax
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rax, %r11
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rax, %rax
-        mulxq	%r12, %rcx, %r12
-        adcxq	%rcx, %r8
-        adoxq	%r12, %r9
-        mulxq	%r13, %rcx, %r13
-        adcxq	%rcx, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rcx, %r14
-        adcxq	%rcx, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %r15, %rdx
-        adcxq	%r15, %r11
-        adoxq	%rax, %rdx
-        adcxq	%rax, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        movq	$0x7fffffffffffffff, %rax
-        imulq	$19, %rdx, %rcx
-        andq	%rax, %r11
-        addq	%rcx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rax, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rsi)
-        movq	%r9, 8(%rsi)
-        movq	%r10, 16(%rsi)
-        movq	%r11, 24(%rsi)
-        movq	136(%rsp), %rsi
-        movq	152(%rsp), %rbx
-        # Multiply
-        # A[0] * B[0]
-        movq	(%rbx), %rdx
-        mulxq	(%rsi), %r8, %r9
-        # A[2] * B[0]
-        mulxq	16(%rsi), %r10, %r11
-        # A[1] * B[0]
-        mulxq	8(%rsi), %rcx, %rax
-        xorq	%r15, %r15
-        adcxq	%rcx, %r9
-        # A[1] * B[3]
-        movq	24(%rbx), %rdx
-        mulxq	8(%rsi), %r12, %r13
-        adcxq	%rax, %r10
-        # A[0] * B[1]
-        movq	8(%rbx), %rdx
-        mulxq	(%rsi), %rcx, %rax
-        adoxq	%rcx, %r9
-        # A[2] * B[1]
-        mulxq	16(%rsi), %rcx, %r14
-        adoxq	%rax, %r10
-        adcxq	%rcx, %r11
-        # A[1] * B[2]
-        movq	16(%rbx), %rdx
-        mulxq	8(%rsi), %rcx, %rax
-        adcxq	%r14, %r12
-        adoxq	%rcx, %r11
-        adcxq	%r15, %r13
-        adoxq	%rax, %r12
-        # A[0] * B[2]
-        mulxq	(%rsi), %rcx, %rax
-        adoxq	%r15, %r13
-        xorq	%r14, %r14
-        adcxq	%rcx, %r10
-        # A[1] * B[1]
-        movq	8(%rbx), %rdx
-        mulxq	8(%rsi), %rdx, %rcx
-        adcxq	%rax, %r11
-        adoxq	%rdx, %r10
-        # A[3] * B[1]
-        movq	8(%rbx), %rdx
-        adoxq	%rcx, %r11
-        mulxq	24(%rsi), %rcx, %rax
-        adcxq	%rcx, %r12
-        # A[2] * B[2]
-        movq	16(%rbx), %rdx
-        mulxq	16(%rsi), %rdx, %rcx
-        adcxq	%rax, %r13
-        adoxq	%rdx, %r12
-        # A[3] * B[3]
-        movq	24(%rbx), %rdx
-        adoxq	%rcx, %r13
-        mulxq	24(%rsi), %rcx, %rax
-        adoxq	%r15, %r14
-        adcxq	%rcx, %r14
-        # A[0] * B[3]
-        mulxq	(%rsi), %rdx, %rcx
-        adcxq	%rax, %r15
-        xorq	%rax, %rax
-        adcxq	%rdx, %r11
-        # A[3] * B[0]
-        movq	(%rbx), %rdx
-        adcxq	%rcx, %r12
-        mulxq	24(%rsi), %rdx, %rcx
-        adoxq	%rdx, %r11
-        adoxq	%rcx, %r12
-        # A[2] * B[3]
-        movq	24(%rbx), %rdx
-        mulxq	16(%rsi), %rdx, %rcx
-        adcxq	%rdx, %r13
-        # A[3] * B[2]
-        movq	16(%rbx), %rdx
-        adcxq	%rcx, %r14
-        mulxq	24(%rsi), %rcx, %rdx
-        adcxq	%rax, %r15
-        adoxq	%rcx, %r13
-        adoxq	%rdx, %r14
-        adoxq	%rax, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rax
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rax, %r11
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rax, %rax
-        mulxq	%r12, %rcx, %r12
-        adcxq	%rcx, %r8
-        adoxq	%r12, %r9
-        mulxq	%r13, %rcx, %r13
-        adcxq	%rcx, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rcx, %r14
-        adcxq	%rcx, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %r15, %rdx
-        adcxq	%r15, %r11
-        adoxq	%rax, %rdx
-        adcxq	%rax, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        movq	$0x7fffffffffffffff, %rax
-        imulq	$19, %rdx, %rcx
-        andq	%rax, %r11
-        addq	%rcx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rax, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        leaq	48(%rsp), %rsi
-        # Double
-        movq	(%rdi), %r8
-        movq	8(%rdi), %r9
-        addq	%r8, %r8
-        movq	16(%rdi), %r10
-        adcq	%r9, %r9
-        movq	24(%rdi), %rdx
-        adcq	%r10, %r10
-        movq	$-19, %rcx
-        adcq	%rdx, %rdx
-        movq	$0x7fffffffffffffff, %rax
-        movq	%rdx, %r11
-        sarq	$63, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Sub modulus (if overflow)
-        subq	%rcx, %r8
-        sbbq	%rdx, %r9
-        sbbq	%rdx, %r10
-        sbbq	%rax, %r11
-        movq	%r8, (%rsi)
-        movq	%r9, 8(%rsi)
-        movq	%r10, 16(%rsi)
-        movq	%r11, 24(%rsi)
-        movq	8(%rsp), %rbx
-        movq	16(%rsp), %rbp
-        # Add
-        movq	(%rbp), %r8
-        movq	8(%rbp), %r9
-        movq	16(%rbp), %r10
-        movq	24(%rbp), %rdx
-        movq	%r8, %r12
-        addq	(%rbx), %r8
-        movq	%r9, %r13
-        adcq	8(%rbx), %r9
-        movq	%r10, %r14
-        adcq	16(%rbx), %r10
-        movq	%rdx, %r15
-        adcq	24(%rbx), %rdx
-        movq	$-19, %rcx
-        movq	%rdx, %r11
-        movq	$0x7fffffffffffffff, %rax
-        sarq	$63, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Sub modulus (if overflow)
-        subq	%rcx, %r8
-        sbbq	%rdx, %r9
-        sbbq	%rdx, %r10
-        sbbq	%rax, %r11
-        # Sub
-        subq	(%rbx), %r12
-        movq	$0x00, %rdx
-        sbbq	8(%rbx), %r13
-        movq	$-19, %rcx
-        sbbq	16(%rbx), %r14
-        movq	$0x7fffffffffffffff, %rax
-        sbbq	24(%rbx), %r15
-        sbbq	$0x00, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Add modulus (if underflow)
-        addq	%rcx, %r12
-        adcq	%rdx, %r13
-        adcq	%rdx, %r14
-        adcq	%rax, %r15
-        movq	%r8, (%rbx)
-        movq	%r9, 8(%rbx)
-        movq	%r10, 16(%rbx)
-        movq	%r11, 24(%rbx)
-        movq	%r12, (%rdi)
-        movq	%r13, 8(%rdi)
-        movq	%r14, 16(%rdi)
-        movq	%r15, 24(%rdi)
-        movq	24(%rsp), %rdi
-        # Add
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        movq	16(%rsi), %r10
-        movq	24(%rsi), %rdx
-        movq	%r8, %r12
-        addq	(%rdi), %r8
-        movq	%r9, %r13
-        adcq	8(%rdi), %r9
-        movq	%r10, %r14
-        adcq	16(%rdi), %r10
-        movq	%rdx, %r15
-        adcq	24(%rdi), %rdx
-        movq	$-19, %rcx
-        movq	%rdx, %r11
-        movq	$0x7fffffffffffffff, %rax
-        sarq	$63, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Sub modulus (if overflow)
-        subq	%rcx, %r8
-        sbbq	%rdx, %r9
-        sbbq	%rdx, %r10
-        sbbq	%rax, %r11
-        # Sub
-        subq	(%rdi), %r12
-        movq	$0x00, %rdx
-        sbbq	8(%rdi), %r13
-        movq	$-19, %rcx
-        sbbq	16(%rdi), %r14
-        movq	$0x7fffffffffffffff, %rax
-        sbbq	24(%rdi), %r15
-        sbbq	$0x00, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Add modulus (if underflow)
-        addq	%rcx, %r12
-        adcq	%rdx, %r13
-        adcq	%rdx, %r14
-        adcq	%rax, %r15
-        movq	%r8, (%rbp)
-        movq	%r9, 8(%rbp)
-        movq	%r10, 16(%rbp)
-        movq	%r11, 24(%rbp)
-        movq	%r12, (%rdi)
-        movq	%r13, 8(%rdi)
-        movq	%r14, 16(%rdi)
-        movq	%r15, 24(%rdi)
-        addq	$0x50, %rsp
-        popq	%r15
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        popq	%rbp
-        popq	%rbx
-        repz retq
-#ifndef __APPLE__
-.size	fe_ge_add_avx2,.-fe_ge_add_avx2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	fe_ge_sub_avx2
-.type	fe_ge_sub_avx2,@function
-.align	16
-fe_ge_sub_avx2:
-#else
-.section	__TEXT,__text
-.globl	_fe_ge_sub_avx2
-.p2align	4
-_fe_ge_sub_avx2:
-#endif /* __APPLE__ */
-        pushq	%rbx
-        pushq	%rbp
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        pushq	%r15
-        subq	$0x50, %rsp
-        movq	%rdi, (%rsp)
-        movq	%rsi, 8(%rsp)
-        movq	%rdx, 16(%rsp)
-        movq	%rcx, 24(%rsp)
-        movq	%r8, 32(%rsp)
-        movq	%r9, 40(%rsp)
-        movq	8(%rsp), %rsi
-        movq	40(%rsp), %rbx
-        movq	32(%rsp), %rbp
-        # Add
-        movq	(%rbx), %r8
-        movq	8(%rbx), %r9
-        movq	16(%rbx), %r10
-        movq	24(%rbx), %rdx
-        movq	%r8, %r12
-        addq	(%rbp), %r8
-        movq	%r9, %r13
-        adcq	8(%rbp), %r9
-        movq	%r10, %r14
-        adcq	16(%rbp), %r10
-        movq	%rdx, %r15
-        adcq	24(%rbp), %rdx
-        movq	$-19, %rcx
-        movq	%rdx, %r11
-        movq	$0x7fffffffffffffff, %rax
-        sarq	$63, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Sub modulus (if overflow)
-        subq	%rcx, %r8
-        sbbq	%rdx, %r9
-        sbbq	%rdx, %r10
-        sbbq	%rax, %r11
-        # Sub
-        subq	(%rbp), %r12
-        movq	$0x00, %rdx
-        sbbq	8(%rbp), %r13
-        movq	$-19, %rcx
-        sbbq	16(%rbp), %r14
-        movq	$0x7fffffffffffffff, %rax
-        sbbq	24(%rbp), %r15
-        sbbq	$0x00, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Add modulus (if underflow)
-        addq	%rcx, %r12
-        adcq	%rdx, %r13
-        adcq	%rdx, %r14
-        adcq	%rax, %r15
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	%r12, (%rsi)
-        movq	%r13, 8(%rsi)
-        movq	%r14, 16(%rsi)
-        movq	%r15, 24(%rsi)
-        movq	16(%rsp), %rbx
-        movq	176(%rsp), %rbp
-        # Multiply
-        # A[0] * B[0]
-        movq	(%rbp), %rdx
-        mulxq	(%rdi), %r8, %r9
-        # A[2] * B[0]
-        mulxq	16(%rdi), %r10, %r11
-        # A[1] * B[0]
-        mulxq	8(%rdi), %rcx, %rax
-        xorq	%r15, %r15
-        adcxq	%rcx, %r9
-        # A[1] * B[3]
-        movq	24(%rbp), %rdx
-        mulxq	8(%rdi), %r12, %r13
-        adcxq	%rax, %r10
-        # A[0] * B[1]
-        movq	8(%rbp), %rdx
-        mulxq	(%rdi), %rcx, %rax
-        adoxq	%rcx, %r9
-        # A[2] * B[1]
-        mulxq	16(%rdi), %rcx, %r14
-        adoxq	%rax, %r10
-        adcxq	%rcx, %r11
-        # A[1] * B[2]
-        movq	16(%rbp), %rdx
-        mulxq	8(%rdi), %rcx, %rax
-        adcxq	%r14, %r12
-        adoxq	%rcx, %r11
-        adcxq	%r15, %r13
-        adoxq	%rax, %r12
-        # A[0] * B[2]
-        mulxq	(%rdi), %rcx, %rax
-        adoxq	%r15, %r13
-        xorq	%r14, %r14
-        adcxq	%rcx, %r10
-        # A[1] * B[1]
-        movq	8(%rbp), %rdx
-        mulxq	8(%rdi), %rdx, %rcx
-        adcxq	%rax, %r11
-        adoxq	%rdx, %r10
-        # A[3] * B[1]
-        movq	8(%rbp), %rdx
-        adoxq	%rcx, %r11
-        mulxq	24(%rdi), %rcx, %rax
-        adcxq	%rcx, %r12
-        # A[2] * B[2]
-        movq	16(%rbp), %rdx
-        mulxq	16(%rdi), %rdx, %rcx
-        adcxq	%rax, %r13
-        adoxq	%rdx, %r12
-        # A[3] * B[3]
-        movq	24(%rbp), %rdx
-        adoxq	%rcx, %r13
-        mulxq	24(%rdi), %rcx, %rax
-        adoxq	%r15, %r14
-        adcxq	%rcx, %r14
-        # A[0] * B[3]
-        mulxq	(%rdi), %rdx, %rcx
-        adcxq	%rax, %r15
-        xorq	%rax, %rax
-        adcxq	%rdx, %r11
-        # A[3] * B[0]
-        movq	(%rbp), %rdx
-        adcxq	%rcx, %r12
-        mulxq	24(%rdi), %rdx, %rcx
-        adoxq	%rdx, %r11
-        adoxq	%rcx, %r12
-        # A[2] * B[3]
-        movq	24(%rbp), %rdx
-        mulxq	16(%rdi), %rdx, %rcx
-        adcxq	%rdx, %r13
-        # A[3] * B[2]
-        movq	16(%rbp), %rdx
-        adcxq	%rcx, %r14
-        mulxq	24(%rdi), %rcx, %rdx
-        adcxq	%rax, %r15
-        adoxq	%rcx, %r13
-        adoxq	%rdx, %r14
-        adoxq	%rax, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rax
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rax, %r11
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rax, %rax
-        mulxq	%r12, %rcx, %r12
-        adcxq	%rcx, %r8
-        adoxq	%r12, %r9
-        mulxq	%r13, %rcx, %r13
-        adcxq	%rcx, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rcx, %r14
-        adcxq	%rcx, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %r15, %rdx
-        adcxq	%r15, %r11
-        adoxq	%rax, %rdx
-        adcxq	%rax, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        movq	$0x7fffffffffffffff, %rax
-        imulq	$19, %rdx, %rcx
-        andq	%rax, %r11
-        addq	%rcx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rax, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rbx)
-        movq	%r9, 8(%rbx)
-        movq	%r10, 16(%rbx)
-        movq	%r11, 24(%rbx)
-        movq	168(%rsp), %rbx
-        # Multiply
-        # A[0] * B[0]
-        movq	(%rbx), %rdx
-        mulxq	(%rsi), %r8, %r9
-        # A[2] * B[0]
-        mulxq	16(%rsi), %r10, %r11
-        # A[1] * B[0]
-        mulxq	8(%rsi), %rcx, %rax
-        xorq	%r15, %r15
-        adcxq	%rcx, %r9
-        # A[1] * B[3]
-        movq	24(%rbx), %rdx
-        mulxq	8(%rsi), %r12, %r13
-        adcxq	%rax, %r10
-        # A[0] * B[1]
-        movq	8(%rbx), %rdx
-        mulxq	(%rsi), %rcx, %rax
-        adoxq	%rcx, %r9
-        # A[2] * B[1]
-        mulxq	16(%rsi), %rcx, %r14
-        adoxq	%rax, %r10
-        adcxq	%rcx, %r11
-        # A[1] * B[2]
-        movq	16(%rbx), %rdx
-        mulxq	8(%rsi), %rcx, %rax
-        adcxq	%r14, %r12
-        adoxq	%rcx, %r11
-        adcxq	%r15, %r13
-        adoxq	%rax, %r12
-        # A[0] * B[2]
-        mulxq	(%rsi), %rcx, %rax
-        adoxq	%r15, %r13
-        xorq	%r14, %r14
-        adcxq	%rcx, %r10
-        # A[1] * B[1]
-        movq	8(%rbx), %rdx
-        mulxq	8(%rsi), %rdx, %rcx
-        adcxq	%rax, %r11
-        adoxq	%rdx, %r10
-        # A[3] * B[1]
-        movq	8(%rbx), %rdx
-        adoxq	%rcx, %r11
-        mulxq	24(%rsi), %rcx, %rax
-        adcxq	%rcx, %r12
-        # A[2] * B[2]
-        movq	16(%rbx), %rdx
-        mulxq	16(%rsi), %rdx, %rcx
-        adcxq	%rax, %r13
-        adoxq	%rdx, %r12
-        # A[3] * B[3]
-        movq	24(%rbx), %rdx
-        adoxq	%rcx, %r13
-        mulxq	24(%rsi), %rcx, %rax
-        adoxq	%r15, %r14
-        adcxq	%rcx, %r14
-        # A[0] * B[3]
-        mulxq	(%rsi), %rdx, %rcx
-        adcxq	%rax, %r15
-        xorq	%rax, %rax
-        adcxq	%rdx, %r11
-        # A[3] * B[0]
-        movq	(%rbx), %rdx
-        adcxq	%rcx, %r12
-        mulxq	24(%rsi), %rdx, %rcx
-        adoxq	%rdx, %r11
-        adoxq	%rcx, %r12
-        # A[2] * B[3]
-        movq	24(%rbx), %rdx
-        mulxq	16(%rsi), %rdx, %rcx
-        adcxq	%rdx, %r13
-        # A[3] * B[2]
-        movq	16(%rbx), %rdx
-        adcxq	%rcx, %r14
-        mulxq	24(%rsi), %rcx, %rdx
-        adcxq	%rax, %r15
-        adoxq	%rcx, %r13
-        adoxq	%rdx, %r14
-        adoxq	%rax, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rax
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rax, %r11
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rax, %rax
-        mulxq	%r12, %rcx, %r12
-        adcxq	%rcx, %r8
-        adoxq	%r12, %r9
-        mulxq	%r13, %rcx, %r13
-        adcxq	%rcx, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rcx, %r14
-        adcxq	%rcx, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %r15, %rdx
-        adcxq	%r15, %r11
-        adoxq	%rax, %rdx
-        adcxq	%rax, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        movq	$0x7fffffffffffffff, %rax
-        imulq	$19, %rdx, %rcx
-        andq	%rax, %r11
-        addq	%rcx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rax, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rsi)
-        movq	%r9, 8(%rsi)
-        movq	%r10, 16(%rsi)
-        movq	%r11, 24(%rsi)
-        movq	24(%rsp), %rsi
-        movq	160(%rsp), %rbx
-        movq	144(%rsp), %rbp
-        # Multiply
-        # A[0] * B[0]
-        movq	(%rbp), %rdx
-        mulxq	(%rbx), %r8, %r9
-        # A[2] * B[0]
-        mulxq	16(%rbx), %r10, %r11
-        # A[1] * B[0]
-        mulxq	8(%rbx), %rcx, %rax
-        xorq	%r15, %r15
-        adcxq	%rcx, %r9
-        # A[1] * B[3]
-        movq	24(%rbp), %rdx
-        mulxq	8(%rbx), %r12, %r13
-        adcxq	%rax, %r10
-        # A[0] * B[1]
-        movq	8(%rbp), %rdx
-        mulxq	(%rbx), %rcx, %rax
-        adoxq	%rcx, %r9
-        # A[2] * B[1]
-        mulxq	16(%rbx), %rcx, %r14
-        adoxq	%rax, %r10
-        adcxq	%rcx, %r11
-        # A[1] * B[2]
-        movq	16(%rbp), %rdx
-        mulxq	8(%rbx), %rcx, %rax
-        adcxq	%r14, %r12
-        adoxq	%rcx, %r11
-        adcxq	%r15, %r13
-        adoxq	%rax, %r12
-        # A[0] * B[2]
-        mulxq	(%rbx), %rcx, %rax
-        adoxq	%r15, %r13
-        xorq	%r14, %r14
-        adcxq	%rcx, %r10
-        # A[1] * B[1]
-        movq	8(%rbp), %rdx
-        mulxq	8(%rbx), %rdx, %rcx
-        adcxq	%rax, %r11
-        adoxq	%rdx, %r10
-        # A[3] * B[1]
-        movq	8(%rbp), %rdx
-        adoxq	%rcx, %r11
-        mulxq	24(%rbx), %rcx, %rax
-        adcxq	%rcx, %r12
-        # A[2] * B[2]
-        movq	16(%rbp), %rdx
-        mulxq	16(%rbx), %rdx, %rcx
-        adcxq	%rax, %r13
-        adoxq	%rdx, %r12
-        # A[3] * B[3]
-        movq	24(%rbp), %rdx
-        adoxq	%rcx, %r13
-        mulxq	24(%rbx), %rcx, %rax
-        adoxq	%r15, %r14
-        adcxq	%rcx, %r14
-        # A[0] * B[3]
-        mulxq	(%rbx), %rdx, %rcx
-        adcxq	%rax, %r15
-        xorq	%rax, %rax
-        adcxq	%rdx, %r11
-        # A[3] * B[0]
-        movq	(%rbp), %rdx
-        adcxq	%rcx, %r12
-        mulxq	24(%rbx), %rdx, %rcx
-        adoxq	%rdx, %r11
-        adoxq	%rcx, %r12
-        # A[2] * B[3]
-        movq	24(%rbp), %rdx
-        mulxq	16(%rbx), %rdx, %rcx
-        adcxq	%rdx, %r13
-        # A[3] * B[2]
-        movq	16(%rbp), %rdx
-        adcxq	%rcx, %r14
-        mulxq	24(%rbx), %rcx, %rdx
-        adcxq	%rax, %r15
-        adoxq	%rcx, %r13
-        adoxq	%rdx, %r14
-        adoxq	%rax, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rax
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rax, %r11
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rax, %rax
-        mulxq	%r12, %rcx, %r12
-        adcxq	%rcx, %r8
-        adoxq	%r12, %r9
-        mulxq	%r13, %rcx, %r13
-        adcxq	%rcx, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rcx, %r14
-        adcxq	%rcx, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %r15, %rdx
-        adcxq	%r15, %r11
-        adoxq	%rax, %rdx
-        adcxq	%rax, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        movq	$0x7fffffffffffffff, %rax
-        imulq	$19, %rdx, %rcx
-        andq	%rax, %r11
-        addq	%rcx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rax, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rsi)
-        movq	%r9, 8(%rsi)
-        movq	%r10, 16(%rsi)
-        movq	%r11, 24(%rsi)
-        movq	136(%rsp), %rsi
-        movq	152(%rsp), %rbx
-        # Multiply
-        # A[0] * B[0]
-        movq	(%rbx), %rdx
-        mulxq	(%rsi), %r8, %r9
-        # A[2] * B[0]
-        mulxq	16(%rsi), %r10, %r11
-        # A[1] * B[0]
-        mulxq	8(%rsi), %rcx, %rax
-        xorq	%r15, %r15
-        adcxq	%rcx, %r9
-        # A[1] * B[3]
-        movq	24(%rbx), %rdx
-        mulxq	8(%rsi), %r12, %r13
-        adcxq	%rax, %r10
-        # A[0] * B[1]
-        movq	8(%rbx), %rdx
-        mulxq	(%rsi), %rcx, %rax
-        adoxq	%rcx, %r9
-        # A[2] * B[1]
-        mulxq	16(%rsi), %rcx, %r14
-        adoxq	%rax, %r10
-        adcxq	%rcx, %r11
-        # A[1] * B[2]
-        movq	16(%rbx), %rdx
-        mulxq	8(%rsi), %rcx, %rax
-        adcxq	%r14, %r12
-        adoxq	%rcx, %r11
-        adcxq	%r15, %r13
-        adoxq	%rax, %r12
-        # A[0] * B[2]
-        mulxq	(%rsi), %rcx, %rax
-        adoxq	%r15, %r13
-        xorq	%r14, %r14
-        adcxq	%rcx, %r10
-        # A[1] * B[1]
-        movq	8(%rbx), %rdx
-        mulxq	8(%rsi), %rdx, %rcx
-        adcxq	%rax, %r11
-        adoxq	%rdx, %r10
-        # A[3] * B[1]
-        movq	8(%rbx), %rdx
-        adoxq	%rcx, %r11
-        mulxq	24(%rsi), %rcx, %rax
-        adcxq	%rcx, %r12
-        # A[2] * B[2]
-        movq	16(%rbx), %rdx
-        mulxq	16(%rsi), %rdx, %rcx
-        adcxq	%rax, %r13
-        adoxq	%rdx, %r12
-        # A[3] * B[3]
-        movq	24(%rbx), %rdx
-        adoxq	%rcx, %r13
-        mulxq	24(%rsi), %rcx, %rax
-        adoxq	%r15, %r14
-        adcxq	%rcx, %r14
-        # A[0] * B[3]
-        mulxq	(%rsi), %rdx, %rcx
-        adcxq	%rax, %r15
-        xorq	%rax, %rax
-        adcxq	%rdx, %r11
-        # A[3] * B[0]
-        movq	(%rbx), %rdx
-        adcxq	%rcx, %r12
-        mulxq	24(%rsi), %rdx, %rcx
-        adoxq	%rdx, %r11
-        adoxq	%rcx, %r12
-        # A[2] * B[3]
-        movq	24(%rbx), %rdx
-        mulxq	16(%rsi), %rdx, %rcx
-        adcxq	%rdx, %r13
-        # A[3] * B[2]
-        movq	16(%rbx), %rdx
-        adcxq	%rcx, %r14
-        mulxq	24(%rsi), %rcx, %rdx
-        adcxq	%rax, %r15
-        adoxq	%rcx, %r13
-        adoxq	%rdx, %r14
-        adoxq	%rax, %r15
-        # Reduce
-        movq	$0x7fffffffffffffff, %rax
-        #  Move top half into t4-t7 and remove top bit from t3
-        shldq	$0x01, %r14, %r15
-        shldq	$0x01, %r13, %r14
-        shldq	$0x01, %r12, %r13
-        shldq	$0x01, %r11, %r12
-        andq	%rax, %r11
-        #  Multiply top half by 19
-        movq	$19, %rdx
-        xorq	%rax, %rax
-        mulxq	%r12, %rcx, %r12
-        adcxq	%rcx, %r8
-        adoxq	%r12, %r9
-        mulxq	%r13, %rcx, %r13
-        adcxq	%rcx, %r9
-        adoxq	%r13, %r10
-        mulxq	%r14, %rcx, %r14
-        adcxq	%rcx, %r10
-        adoxq	%r14, %r11
-        mulxq	%r15, %r15, %rdx
-        adcxq	%r15, %r11
-        adoxq	%rax, %rdx
-        adcxq	%rax, %rdx
-        #  Overflow
-        shldq	$0x01, %r11, %rdx
-        movq	$0x7fffffffffffffff, %rax
-        imulq	$19, %rdx, %rcx
-        andq	%rax, %r11
-        addq	%rcx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Reduce if top bit set
-        movq	%r11, %rdx
-        sarq	$63, %rdx
-        andq	$19, %rdx
-        andq	%rax, %r11
-        addq	%rdx, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        adcq	$0x00, %r11
-        # Store
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        leaq	48(%rsp), %rsi
-        # Double
-        movq	(%rdi), %r8
-        movq	8(%rdi), %r9
-        addq	%r8, %r8
-        movq	16(%rdi), %r10
-        adcq	%r9, %r9
-        movq	24(%rdi), %rdx
-        adcq	%r10, %r10
-        movq	$-19, %rcx
-        adcq	%rdx, %rdx
-        movq	$0x7fffffffffffffff, %rax
-        movq	%rdx, %r11
-        sarq	$63, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Sub modulus (if overflow)
-        subq	%rcx, %r8
-        sbbq	%rdx, %r9
-        sbbq	%rdx, %r10
-        sbbq	%rax, %r11
-        movq	%r8, (%rsi)
-        movq	%r9, 8(%rsi)
-        movq	%r10, 16(%rsi)
-        movq	%r11, 24(%rsi)
-        movq	8(%rsp), %rbx
-        movq	16(%rsp), %rbp
-        # Add
-        movq	(%rbp), %r8
-        movq	8(%rbp), %r9
-        movq	16(%rbp), %r10
-        movq	24(%rbp), %rdx
-        movq	%r8, %r12
-        addq	(%rbx), %r8
-        movq	%r9, %r13
-        adcq	8(%rbx), %r9
-        movq	%r10, %r14
-        adcq	16(%rbx), %r10
-        movq	%rdx, %r15
-        adcq	24(%rbx), %rdx
-        movq	$-19, %rcx
-        movq	%rdx, %r11
-        movq	$0x7fffffffffffffff, %rax
-        sarq	$63, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Sub modulus (if overflow)
-        subq	%rcx, %r8
-        sbbq	%rdx, %r9
-        sbbq	%rdx, %r10
-        sbbq	%rax, %r11
-        # Sub
-        subq	(%rbx), %r12
-        movq	$0x00, %rdx
-        sbbq	8(%rbx), %r13
-        movq	$-19, %rcx
-        sbbq	16(%rbx), %r14
-        movq	$0x7fffffffffffffff, %rax
-        sbbq	24(%rbx), %r15
-        sbbq	$0x00, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Add modulus (if underflow)
-        addq	%rcx, %r12
-        adcq	%rdx, %r13
-        adcq	%rdx, %r14
-        adcq	%rax, %r15
-        movq	%r8, (%rbx)
-        movq	%r9, 8(%rbx)
-        movq	%r10, 16(%rbx)
-        movq	%r11, 24(%rbx)
-        movq	%r12, (%rdi)
-        movq	%r13, 8(%rdi)
-        movq	%r14, 16(%rdi)
-        movq	%r15, 24(%rdi)
-        movq	24(%rsp), %rdi
-        # Add
-        movq	(%rsi), %r8
-        movq	8(%rsi), %r9
-        movq	16(%rsi), %r10
-        movq	24(%rsi), %rdx
-        movq	%r8, %r12
-        addq	(%rdi), %r8
-        movq	%r9, %r13
-        adcq	8(%rdi), %r9
-        movq	%r10, %r14
-        adcq	16(%rdi), %r10
-        movq	%rdx, %r15
-        adcq	24(%rdi), %rdx
-        movq	$-19, %rcx
-        movq	%rdx, %r11
-        movq	$0x7fffffffffffffff, %rax
-        sarq	$63, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Sub modulus (if overflow)
-        subq	%rcx, %r8
-        sbbq	%rdx, %r9
-        sbbq	%rdx, %r10
-        sbbq	%rax, %r11
-        # Sub
-        subq	(%rdi), %r12
-        movq	$0x00, %rdx
-        sbbq	8(%rdi), %r13
-        movq	$-19, %rcx
-        sbbq	16(%rdi), %r14
-        movq	$0x7fffffffffffffff, %rax
-        sbbq	24(%rdi), %r15
-        sbbq	$0x00, %rdx
-        #   Mask the modulus
-        andq	%rdx, %rcx
-        andq	%rdx, %rax
-        #   Add modulus (if underflow)
-        addq	%rcx, %r12
-        adcq	%rdx, %r13
-        adcq	%rdx, %r14
-        adcq	%rax, %r15
-        movq	%r8, (%rdi)
-        movq	%r9, 8(%rdi)
-        movq	%r10, 16(%rdi)
-        movq	%r11, 24(%rdi)
-        movq	%r12, (%rbp)
-        movq	%r13, 8(%rbp)
-        movq	%r14, 16(%rbp)
-        movq	%r15, 24(%rbp)
-        addq	$0x50, %rsp
-        popq	%r15
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        popq	%rbp
-        popq	%rbx
-        repz retq
-#ifndef __APPLE__
-.size	fe_ge_sub_avx2,.-fe_ge_sub_avx2
-#endif /* __APPLE__ */
-#endif /* HAVE_INTEL_AVX2 */
-
-#if defined(__linux__) && defined(__ELF__)
-.section	.note.GNU-stack,"",%progbits
-#endif
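
The AVX2 routine deleted above leans on three field-arithmetic idioms for p = 2^255 - 19: the mulxq/adcxq/adoxq tail keeps two independent carry chains (CF and OF) in flight through the 4x4 limb product; the "Overflow" block folds bits 255 and up back into the low limbs by multiplying them by 19, since 2^255 == 19 (mod p); and the "Reduce if top bit set" / "Mask the modulus" blocks turn the sign bit of the top limb into a branch-free all-ones mask that selects whether the modulus is subtracted (after an add that overflowed) or added back (after a sub that borrowed). A minimal C sketch of the branch-free top-bit fold, assuming four little-endian 64-bit limbs and hypothetical names (not the wolfSSL API):

    #include <stdint.h>

    /* Fold bit 255 back into the low limbs for p = 2^255 - 19.
     * Sketch only: r[0..3] are little-endian 64-bit limbs. */
    static void fe_fold_top_bit(uint64_t r[4])
    {
        uint64_t mask = (uint64_t)((int64_t)r[3] >> 63); /* all-ones iff bit 255 set */
        uint64_t add  = mask & 19;                       /* 2^255 == 19 (mod p)      */
        uint64_t cy;

        r[3] &= 0x7fffffffffffffffULL;                   /* clear bit 255            */
        r[0] += add; cy = (r[0] < add);
        r[1] += cy;  cy = (r[1] < cy);
        r[2] += cy;  cy = (r[2] < cy);
        r[3] += cy;                                      /* cannot carry out again   */
    }

As in the assembly, the result stays below 2^255 but is not forced into canonical [0, p) form; implementations of this kind typically perform the full canonical reduction only once, when the field element is serialized.
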

+ 0 - 3874
lib/wolfssl/wolfcrypt/src/fp_mont_small.i

@@ -1,3874 +0,0 @@
-/* fp_mont_small.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#ifdef TFM_SMALL_MONT_SET
-/* computes x * R^-1 == x/R (mod N) via Montgomery reduction */
-int fp_montgomery_reduce_small(fp_int *a, fp_int *m, fp_digit mp)
-{
-#ifndef WOLFSSL_SMALL_STACK
-   fp_digit c[FP_SIZE];
-#else
-   fp_digit *c;
-#endif
-   fp_digit *_c, *tmpm, mu, cy;
-   int      oldused, x, y, pa;
-
-#ifdef WOLFSSL_SMALL_STACK
-   /* allocate the FP_SIZE-digit reduction buffer on the heap */
-   c = (fp_digit*)XMALLOC(sizeof(fp_digit)*FP_SIZE, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-   if (c == NULL) {
-      return FP_MEM;
-   }
-#endif
-
-   /* now zero the buffer */
-   XMEMSET(c, 0, sizeof(fp_digit)*(FP_SIZE));
-
-   pa = m->used;
-
-   /* copy the input */
-   oldused = a->used;
-   for (x = 0; x < oldused; x++) {
-       c[x] = a->dp[x];
-   }
-
-   MONT_START;
-
-   switch (pa) {
-      case 1:
-            x = 0; cy   = 0;
-            LOOP_START;
-            _c   = c + 0;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-         break;
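
Every `case pa:` in this switch is the same reduction loop, fully unrolled for one fixed digit count. A minimal portable sketch of what a case computes, assuming tfm's conventions (fp_digit, fp_word and DIGIT_BIT as in tfm.h; LOOP_START computes mu = c[x] * mp, INNERMUL multiply-accumulates one modulus digit, PROPCARRY pushes the carry one digit up) — the helper name mac1 is illustrative, not a tfm macro:

    /* mac1: one multiply-accumulate step, i.e. the portable INNERMUL. */
    static fp_digit mac1(fp_digit *d, fp_digit mu, fp_digit m, fp_digit cy)
    {
        fp_word t = (fp_word)*d + (fp_word)mu * m + cy;
        *d = (fp_digit)t;
        return (fp_digit)(t >> DIGIT_BIT);
    }

    /* The loop each `case pa:` unrolls: */
    for (x = 0; x < pa; x++) {
        fp_digit mu = c[x] * mp;         /* mp = -1/m mod 2^DIGIT_BIT     */
        fp_digit cy = 0;
        for (y = 0; y < pa; y++)         /* the INNERMUL; ++_c; steps     */
            cy = mac1(&c[x + y], mu, m->dp[y], cy);
        for (y = x + pa; cy != 0; y++) { /* the while (cy) PROPCARRY loop */
            c[y] += cy;
            cy    = (c[y] < cy);
        }
    }

Unrolling removes all loop control and lets the compiler keep _c, tmpm, mu and cy in registers across a whole row, which is the point of generating one case per modulus size.
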
-      case 2:
-            x = 0; cy   = 0;
-            LOOP_START;
-            _c   = c + 0;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 1; cy   = 0;
-            LOOP_START;
-            _c   = c + 1;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-         break;
-      case 3:
-            x = 0; cy   = 0;
-            LOOP_START;
-            _c   = c + 0;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 1; cy   = 0;
-            LOOP_START;
-            _c   = c + 1;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 2; cy   = 0;
-            LOOP_START;
-            _c   = c + 2;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-         break;
-      case 4:
-            x = 0; cy   = 0;
-            LOOP_START;
-            _c   = c + 0;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 1; cy   = 0;
-            LOOP_START;
-            _c   = c + 1;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 2; cy   = 0;
-            LOOP_START;
-            _c   = c + 2;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 3; cy   = 0;
-            LOOP_START;
-            _c   = c + 3;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-         break;
-      case 5:
-            x = 0; cy   = 0;
-            LOOP_START;
-            _c   = c + 0;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 1; cy   = 0;
-            LOOP_START;
-            _c   = c + 1;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 2; cy   = 0;
-            LOOP_START;
-            _c   = c + 2;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 3; cy   = 0;
-            LOOP_START;
-            _c   = c + 3;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 4; cy   = 0;
-            LOOP_START;
-            _c   = c + 4;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-         break;
-      case 6:
-            x = 0; cy   = 0;
-            LOOP_START;
-            _c   = c + 0;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 1; cy   = 0;
-            LOOP_START;
-            _c   = c + 1;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 2; cy   = 0;
-            LOOP_START;
-            _c   = c + 2;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 3; cy   = 0;
-            LOOP_START;
-            _c   = c + 3;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 4; cy   = 0;
-            LOOP_START;
-            _c   = c + 4;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 5; cy   = 0;
-            LOOP_START;
-            _c   = c + 5;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-         break;
-      case 7:
-            x = 0; cy   = 0;
-            LOOP_START;
-            _c   = c + 0;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 1; cy   = 0;
-            LOOP_START;
-            _c   = c + 1;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 2; cy   = 0;
-            LOOP_START;
-            _c   = c + 2;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 3; cy   = 0;
-            LOOP_START;
-            _c   = c + 3;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 4; cy   = 0;
-            LOOP_START;
-            _c   = c + 4;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 5; cy   = 0;
-            LOOP_START;
-            _c   = c + 5;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 6; cy   = 0;
-            LOOP_START;
-            _c   = c + 6;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-         break;
-      case 8:
-            x = 0; cy   = 0;
-            LOOP_START;
-            _c   = c + 0;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 1; cy   = 0;
-            LOOP_START;
-            _c   = c + 1;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 2; cy   = 0;
-            LOOP_START;
-            _c   = c + 2;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 3; cy   = 0;
-            LOOP_START;
-            _c   = c + 3;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 4; cy   = 0;
-            LOOP_START;
-            _c   = c + 4;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 5; cy   = 0;
-            LOOP_START;
-            _c   = c + 5;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 6; cy   = 0;
-            LOOP_START;
-            _c   = c + 6;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 7; cy   = 0;
-            LOOP_START;
-            _c   = c + 7;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-         break;
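
Cases 1 through 7 above emit identical code in both preprocessor branches, because a row shorter than eight digits has no full block for INNERMUL8 to consume. From pa == 8 onward the branches genuinely differ: the INNERMUL8 build walks each row in blocks of eight (INNERMUL8; _c += 8; tmpm += 8;) and finishes the remaining pa % 8 digits with single INNERMUL steps, as cases 9-13 below show. In terms of the mac1 helper sketched after case 1, the tiling looks like this (hypothetical, for illustration):

    /* One row of the reduction, tiled the way the INNERMUL8 builds do. */
    int i, y = 0;
    for (; y + 8 <= pa; y += 8)          /* INNERMUL8: eight digits per block */
        for (i = 0; i < 8; i++)
            cy = mac1(&c[x + y + i], mu, m->dp[y + i], cy);
    for (; y < pa; y++)                  /* leftover single INNERMUL steps    */
        cy = mac1(&c[x + y], mu, m->dp[y], cy);

The single-digit INNERMUL advances tmpm internally (*tmpm++), while INNERMUL8 leaves the pointer bumps (_c += 8; tmpm += 8;) to the generated code, which is why the two step patterns look asymmetric in the cases above.
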
-      case 9:
-            x = 0; cy   = 0;
-            LOOP_START;
-            _c   = c + 0;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 1; cy   = 0;
-            LOOP_START;
-            _c   = c + 1;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 2; cy   = 0;
-            LOOP_START;
-            _c   = c + 2;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 3; cy   = 0;
-            LOOP_START;
-            _c   = c + 3;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 4; cy   = 0;
-            LOOP_START;
-            _c   = c + 4;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 5; cy   = 0;
-            LOOP_START;
-            _c   = c + 5;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 6; cy   = 0;
-            LOOP_START;
-            _c   = c + 6;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 7; cy   = 0;
-            LOOP_START;
-            _c   = c + 7;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 8; cy   = 0;
-            LOOP_START;
-            _c   = c + 8;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-         break;
-      case 10:
-            x = 0; cy   = 0;
-            LOOP_START;
-            _c   = c + 0;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 1; cy   = 0;
-            LOOP_START;
-            _c   = c + 1;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 2; cy   = 0;
-            LOOP_START;
-            _c   = c + 2;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 3; cy   = 0;
-            LOOP_START;
-            _c   = c + 3;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 4; cy   = 0;
-            LOOP_START;
-            _c   = c + 4;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 5; cy   = 0;
-            LOOP_START;
-            _c   = c + 5;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 6; cy   = 0;
-            LOOP_START;
-            _c   = c + 6;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 7; cy   = 0;
-            LOOP_START;
-            _c   = c + 7;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 8; cy   = 0;
-            LOOP_START;
-            _c   = c + 8;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 9; cy   = 0;
-            LOOP_START;
-            _c   = c + 9;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-         break;
-      case 11:
-            x = 0; cy   = 0;
-            LOOP_START;
-            _c   = c + 0;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 1; cy   = 0;
-            LOOP_START;
-            _c   = c + 1;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 2; cy   = 0;
-            LOOP_START;
-            _c   = c + 2;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 3; cy   = 0;
-            LOOP_START;
-            _c   = c + 3;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 4; cy   = 0;
-            LOOP_START;
-            _c   = c + 4;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 5; cy   = 0;
-            LOOP_START;
-            _c   = c + 5;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 6; cy   = 0;
-            LOOP_START;
-            _c   = c + 6;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 7; cy   = 0;
-            LOOP_START;
-            _c   = c + 7;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 8; cy   = 0;
-            LOOP_START;
-            _c   = c + 8;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 9; cy   = 0;
-            LOOP_START;
-            _c   = c + 9;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 10; cy   = 0;
-            LOOP_START;
-            _c   = c + 10;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-         break;
-      case 12:
-            x = 0; cy   = 0;
-            LOOP_START;
-            _c   = c + 0;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 1; cy   = 0;
-            LOOP_START;
-            _c   = c + 1;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 2; cy   = 0;
-            LOOP_START;
-            _c   = c + 2;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 3; cy   = 0;
-            LOOP_START;
-            _c   = c + 3;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 4; cy   = 0;
-            LOOP_START;
-            _c   = c + 4;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 5; cy   = 0;
-            LOOP_START;
-            _c   = c + 5;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 6; cy   = 0;
-            LOOP_START;
-            _c   = c + 6;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 7; cy   = 0;
-            LOOP_START;
-            _c   = c + 7;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 8; cy   = 0;
-            LOOP_START;
-            _c   = c + 8;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 9; cy   = 0;
-            LOOP_START;
-            _c   = c + 9;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 10; cy   = 0;
-            LOOP_START;
-            _c   = c + 10;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 11; cy   = 0;
-            LOOP_START;
-            _c   = c + 11;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-         break;
-      case 13:
-            x = 0; cy   = 0;
-            LOOP_START;
-            _c   = c + 0;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 1; cy   = 0;
-            LOOP_START;
-            _c   = c + 1;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 2; cy   = 0;
-            LOOP_START;
-            _c   = c + 2;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 3; cy   = 0;
-            LOOP_START;
-            _c   = c + 3;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 4; cy   = 0;
-            LOOP_START;
-            _c   = c + 4;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 5; cy   = 0;
-            LOOP_START;
-            _c   = c + 5;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 6; cy   = 0;
-            LOOP_START;
-            _c   = c + 6;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 7; cy   = 0;
-            LOOP_START;
-            _c   = c + 7;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 8; cy   = 0;
-            LOOP_START;
-            _c   = c + 8;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 9; cy   = 0;
-            LOOP_START;
-            _c   = c + 9;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 10; cy   = 0;
-            LOOP_START;
-            _c   = c + 10;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 11; cy   = 0;
-            LOOP_START;
-            _c   = c + 11;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 12; cy   = 0;
-            LOOP_START;
-            _c   = c + 12;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-         break;
-      case 14:
-            x = 0; cy   = 0;
-            LOOP_START;
-            _c   = c + 0;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 1; cy   = 0;
-            LOOP_START;
-            _c   = c + 1;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 2; cy   = 0;
-            LOOP_START;
-            _c   = c + 2;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 3; cy   = 0;
-            LOOP_START;
-            _c   = c + 3;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 4; cy   = 0;
-            LOOP_START;
-            _c   = c + 4;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 5; cy   = 0;
-            LOOP_START;
-            _c   = c + 5;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 6; cy   = 0;
-            LOOP_START;
-            _c   = c + 6;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 7; cy   = 0;
-            LOOP_START;
-            _c   = c + 7;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 8; cy   = 0;
-            LOOP_START;
-            _c   = c + 8;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 9; cy   = 0;
-            LOOP_START;
-            _c   = c + 9;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 10; cy   = 0;
-            LOOP_START;
-            _c   = c + 10;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 11; cy   = 0;
-            LOOP_START;
-            _c   = c + 11;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 12; cy   = 0;
-            LOOP_START;
-            _c   = c + 12;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 13; cy   = 0;
-            LOOP_START;
-            _c   = c + 13;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-         break;
-      case 15:
-            x = 0; cy   = 0;
-            LOOP_START;
-            _c   = c + 0;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 1; cy   = 0;
-            LOOP_START;
-            _c   = c + 1;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 2; cy   = 0;
-            LOOP_START;
-            _c   = c + 2;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 3; cy   = 0;
-            LOOP_START;
-            _c   = c + 3;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 4; cy   = 0;
-            LOOP_START;
-            _c   = c + 4;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 5; cy   = 0;
-            LOOP_START;
-            _c   = c + 5;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 6; cy   = 0;
-            LOOP_START;
-            _c   = c + 6;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 7; cy   = 0;
-            LOOP_START;
-            _c   = c + 7;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 8; cy   = 0;
-            LOOP_START;
-            _c   = c + 8;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 9; cy   = 0;
-            LOOP_START;
-            _c   = c + 9;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 10; cy   = 0;
-            LOOP_START;
-            _c   = c + 10;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 11; cy   = 0;
-            LOOP_START;
-            _c   = c + 11;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 12; cy   = 0;
-            LOOP_START;
-            _c   = c + 12;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 13; cy   = 0;
-            LOOP_START;
-            _c   = c + 13;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 14; cy   = 0;
-            LOOP_START;
-            _c   = c + 14;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-         break;
-      case 16:
-            x = 0; cy   = 0;
-            LOOP_START;
-            _c   = c + 0;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL8; _c += 8; tmpm += 8;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 1; cy   = 0;
-            LOOP_START;
-            _c   = c + 1;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL8; _c += 8; tmpm += 8;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 2; cy   = 0;
-            LOOP_START;
-            _c   = c + 2;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL8; _c += 8; tmpm += 8;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 3; cy   = 0;
-            LOOP_START;
-            _c   = c + 3;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL8; _c += 8; tmpm += 8;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 4; cy   = 0;
-            LOOP_START;
-            _c   = c + 4;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL8; _c += 8; tmpm += 8;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 5; cy   = 0;
-            LOOP_START;
-            _c   = c + 5;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL8; _c += 8; tmpm += 8;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 6; cy   = 0;
-            LOOP_START;
-            _c   = c + 6;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL8; _c += 8; tmpm += 8;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 7; cy   = 0;
-            LOOP_START;
-            _c   = c + 7;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL8; _c += 8; tmpm += 8;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 8; cy   = 0;
-            LOOP_START;
-            _c   = c + 8;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL8; _c += 8; tmpm += 8;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 9; cy   = 0;
-            LOOP_START;
-            _c   = c + 9;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL8; _c += 8; tmpm += 8;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 10; cy   = 0;
-            LOOP_START;
-            _c   = c + 10;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL8; _c += 8; tmpm += 8;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 11; cy   = 0;
-            LOOP_START;
-            _c   = c + 11;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL8; _c += 8; tmpm += 8;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 12; cy   = 0;
-            LOOP_START;
-            _c   = c + 12;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL8; _c += 8; tmpm += 8;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 13; cy   = 0;
-            LOOP_START;
-            _c   = c + 13;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL8; _c += 8; tmpm += 8;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 14; cy   = 0;
-            LOOP_START;
-            _c   = c + 14;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL8; _c += 8; tmpm += 8;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-            x = 15; cy   = 0;
-            LOOP_START;
-            _c   = c + 15;
-            tmpm = m->dp;
-#ifdef INNERMUL8
-            INNERMUL8; _c += 8; tmpm += 8;
-            INNERMUL8; _c += 8; tmpm += 8;
-#else
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-            INNERMUL; ++_c;
-#endif
-            LOOP_END;
-            while (cy) {
-               PROPCARRY;
-               ++_c;
-            }
-         break;
-  }
-  /* now copy out */
-  _c   = c + pa;
-  tmpm = a->dp;
-  for (x = 0; x < pa+1; x++) {
-     *tmpm++ = *_c++;
-  }
-
-  for (; x < oldused; x++)   {
-     *tmpm++ = 0;
-  }
-
-  MONT_FINI;
-
-  a->used = pa+1;
-  fp_clamp(a);
-
-  /* if A >= m then A = A - m */
-  if (fp_cmp_mag (a, m) != FP_LT) {
-    s_fp_sub (a, m, a);
-  }
-
-#ifdef WOLFSSL_SMALL_STACK
-  XFREE(c, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-#endif
-  return FP_OKAY;
-}
-
-#endif
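The function deleted above is a size-specialized Montgomery reduction: one fully unrolled case per modulus digit count (the 14-, 15- and 16-digit cases are visible here, plus the tail of the preceding one), each computing the same word-by-word reduction with the loop trip counts baked in, followed by a shared copy-out and a conditional final subtract. Below is a minimal sketch of the equivalent generic loop, assuming 32-bit digits and a scratch buffer with room for the carry to ripple (at least 2*n+1 digits); mont_reduce_sketch, fp_digit and fp_word are illustrative names, not wolfSSL API. In the TFM macros, one inner multiply-accumulate step is INNERMUL (INNERMUL8 does eight in a row) and the trailing carry ripple is the PROPCARRY loop:

#include <stdint.h>

typedef uint32_t fp_digit;   /* one machine digit (illustrative)   */
typedef uint64_t fp_word;    /* double-width accumulator           */

/* Reduce c (2*n digits plus a spare) modulo m (n digits), where
 * mp == -m[0]^-1 mod 2^32 (the precomputed Montgomery constant). */
static void mont_reduce_sketch(fp_digit *c, const fp_digit *m, int n,
                               fp_digit mp)
{
    for (int x = 0; x < n; x++) {
        /* mu is chosen so that digit x of c becomes zero mod 2^32. */
        fp_digit mu = (fp_digit)((fp_word)c[x] * mp);
        fp_word cy = 0;
        for (int y = 0; y < n; y++) {          /* the INNERMUL steps */
            fp_word t = (fp_word)mu * m[y] + c[x + y] + cy;
            c[x + y] = (fp_digit)t;
            cy = t >> 32;
        }
        for (int y = n; cy != 0; y++) {        /* the PROPCARRY loop */
            fp_word t = (fp_word)c[x + y] + cy;
            c[x + y] = (fp_digit)t;
            cy = t >> 32;
        }
    }
    /* The caller then copies digits c[n..2n] out and subtracts the
     * modulus once if the result is still >= m, exactly as the
     * deleted tail of the function above does. */
}

The unrolled per-size cases existed purely for speed: with a fixed digit count the compiler sees straight-line code with no loop overhead.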

+ 0 - 147
lib/wolfssl/wolfcrypt/src/fp_mul_comba_12.i

@@ -1,147 +0,0 @@
-/* fp_mul_comba_12.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#ifdef TFM_MUL12
-int fp_mul_comba12(fp_int *A, fp_int *B, fp_int *C)
-{
-   fp_digit c0, c1, c2;
-#ifndef WOLFSSL_SMALL_STACK
-   fp_digit at[24];
-#else
-   fp_digit *at;
-#endif
-
-#ifdef WOLFSSL_SMALL_STACK
-   at = (fp_digit*)XMALLOC(sizeof(fp_digit) * 24, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-   if (at == NULL)
-       return FP_MEM;
-#endif
-
-   XMEMCPY(at, A->dp, 12 * sizeof(fp_digit));
-   XMEMCPY(at+12, B->dp, 12 * sizeof(fp_digit));
-   COMBA_START;
-
-   COMBA_CLEAR;
-   /* 0 */
-   MULADD(at[0], at[12]); 
-   COMBA_STORE(C->dp[0]);
-   /* 1 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[13]);    MULADD(at[1], at[12]); 
-   COMBA_STORE(C->dp[1]);
-   /* 2 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[14]);    MULADD(at[1], at[13]);    MULADD(at[2], at[12]); 
-   COMBA_STORE(C->dp[2]);
-   /* 3 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[15]);    MULADD(at[1], at[14]);    MULADD(at[2], at[13]);    MULADD(at[3], at[12]); 
-   COMBA_STORE(C->dp[3]);
-   /* 4 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[16]);    MULADD(at[1], at[15]);    MULADD(at[2], at[14]);    MULADD(at[3], at[13]);    MULADD(at[4], at[12]); 
-   COMBA_STORE(C->dp[4]);
-   /* 5 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[17]);    MULADD(at[1], at[16]);    MULADD(at[2], at[15]);    MULADD(at[3], at[14]);    MULADD(at[4], at[13]);    MULADD(at[5], at[12]); 
-   COMBA_STORE(C->dp[5]);
-   /* 6 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[18]);    MULADD(at[1], at[17]);    MULADD(at[2], at[16]);    MULADD(at[3], at[15]);    MULADD(at[4], at[14]);    MULADD(at[5], at[13]);    MULADD(at[6], at[12]); 
-   COMBA_STORE(C->dp[6]);
-   /* 7 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[19]);    MULADD(at[1], at[18]);    MULADD(at[2], at[17]);    MULADD(at[3], at[16]);    MULADD(at[4], at[15]);    MULADD(at[5], at[14]);    MULADD(at[6], at[13]);    MULADD(at[7], at[12]); 
-   COMBA_STORE(C->dp[7]);
-   /* 8 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[20]);    MULADD(at[1], at[19]);    MULADD(at[2], at[18]);    MULADD(at[3], at[17]);    MULADD(at[4], at[16]);    MULADD(at[5], at[15]);    MULADD(at[6], at[14]);    MULADD(at[7], at[13]);    MULADD(at[8], at[12]); 
-   COMBA_STORE(C->dp[8]);
-   /* 9 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[21]);    MULADD(at[1], at[20]);    MULADD(at[2], at[19]);    MULADD(at[3], at[18]);    MULADD(at[4], at[17]);    MULADD(at[5], at[16]);    MULADD(at[6], at[15]);    MULADD(at[7], at[14]);    MULADD(at[8], at[13]);    MULADD(at[9], at[12]); 
-   COMBA_STORE(C->dp[9]);
-   /* 10 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[22]);    MULADD(at[1], at[21]);    MULADD(at[2], at[20]);    MULADD(at[3], at[19]);    MULADD(at[4], at[18]);    MULADD(at[5], at[17]);    MULADD(at[6], at[16]);    MULADD(at[7], at[15]);    MULADD(at[8], at[14]);    MULADD(at[9], at[13]);    MULADD(at[10], at[12]); 
-   COMBA_STORE(C->dp[10]);
-   /* 11 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[23]);    MULADD(at[1], at[22]);    MULADD(at[2], at[21]);    MULADD(at[3], at[20]);    MULADD(at[4], at[19]);    MULADD(at[5], at[18]);    MULADD(at[6], at[17]);    MULADD(at[7], at[16]);    MULADD(at[8], at[15]);    MULADD(at[9], at[14]);    MULADD(at[10], at[13]);    MULADD(at[11], at[12]); 
-   COMBA_STORE(C->dp[11]);
-   /* 12 */
-   COMBA_FORWARD;
-   MULADD(at[1], at[23]);    MULADD(at[2], at[22]);    MULADD(at[3], at[21]);    MULADD(at[4], at[20]);    MULADD(at[5], at[19]);    MULADD(at[6], at[18]);    MULADD(at[7], at[17]);    MULADD(at[8], at[16]);    MULADD(at[9], at[15]);    MULADD(at[10], at[14]);    MULADD(at[11], at[13]); 
-   COMBA_STORE(C->dp[12]);
-   /* 13 */
-   COMBA_FORWARD;
-   MULADD(at[2], at[23]);    MULADD(at[3], at[22]);    MULADD(at[4], at[21]);    MULADD(at[5], at[20]);    MULADD(at[6], at[19]);    MULADD(at[7], at[18]);    MULADD(at[8], at[17]);    MULADD(at[9], at[16]);    MULADD(at[10], at[15]);    MULADD(at[11], at[14]); 
-   COMBA_STORE(C->dp[13]);
-   /* 14 */
-   COMBA_FORWARD;
-   MULADD(at[3], at[23]);    MULADD(at[4], at[22]);    MULADD(at[5], at[21]);    MULADD(at[6], at[20]);    MULADD(at[7], at[19]);    MULADD(at[8], at[18]);    MULADD(at[9], at[17]);    MULADD(at[10], at[16]);    MULADD(at[11], at[15]); 
-   COMBA_STORE(C->dp[14]);
-   /* 15 */
-   COMBA_FORWARD;
-   MULADD(at[4], at[23]);    MULADD(at[5], at[22]);    MULADD(at[6], at[21]);    MULADD(at[7], at[20]);    MULADD(at[8], at[19]);    MULADD(at[9], at[18]);    MULADD(at[10], at[17]);    MULADD(at[11], at[16]); 
-   COMBA_STORE(C->dp[15]);
-   /* 16 */
-   COMBA_FORWARD;
-   MULADD(at[5], at[23]);    MULADD(at[6], at[22]);    MULADD(at[7], at[21]);    MULADD(at[8], at[20]);    MULADD(at[9], at[19]);    MULADD(at[10], at[18]);    MULADD(at[11], at[17]); 
-   COMBA_STORE(C->dp[16]);
-   /* 17 */
-   COMBA_FORWARD;
-   MULADD(at[6], at[23]);    MULADD(at[7], at[22]);    MULADD(at[8], at[21]);    MULADD(at[9], at[20]);    MULADD(at[10], at[19]);    MULADD(at[11], at[18]); 
-   COMBA_STORE(C->dp[17]);
-   /* 18 */
-   COMBA_FORWARD;
-   MULADD(at[7], at[23]);    MULADD(at[8], at[22]);    MULADD(at[9], at[21]);    MULADD(at[10], at[20]);    MULADD(at[11], at[19]); 
-   COMBA_STORE(C->dp[18]);
-   /* 19 */
-   COMBA_FORWARD;
-   MULADD(at[8], at[23]);    MULADD(at[9], at[22]);    MULADD(at[10], at[21]);    MULADD(at[11], at[20]); 
-   COMBA_STORE(C->dp[19]);
-   /* 20 */
-   COMBA_FORWARD;
-   MULADD(at[9], at[23]);    MULADD(at[10], at[22]);    MULADD(at[11], at[21]); 
-   COMBA_STORE(C->dp[20]);
-   /* 21 */
-   COMBA_FORWARD;
-   MULADD(at[10], at[23]);    MULADD(at[11], at[22]); 
-   COMBA_STORE(C->dp[21]);
-   /* 22 */
-   COMBA_FORWARD;
-   MULADD(at[11], at[23]); 
-   COMBA_STORE(C->dp[22]);
-   COMBA_STORE2(C->dp[23]);
-   C->used = 24;
-   C->sign = A->sign ^ B->sign;
-   fp_clamp(C);
-   COMBA_FINI;
-
-#ifdef WOLFSSL_SMALL_STACK
-   XFREE(at, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-#endif
-   return FP_OKAY;
-}
-#endif
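The file deleted above (and fp_mul_comba_17.i below) is a hand-unrolled comba multiplier: the product is formed column by column, with each MULADD(at[i], at[j]) adding one digit product into a three-digit accumulator (c0, c1, c2), COMBA_STORE writing out the finished low digit, and COMBA_FORWARD shifting the accumulator down one digit. A hedged generic sketch of the same column scan follows, again assuming 32-bit digits; comba_mul_sketch and the typedefs are illustrative, not wolfSSL API:

#include <stdint.h>

typedef uint32_t fp_digit;   /* same illustrative typedefs as above */
typedef uint64_t fp_word;

/* out must hold 2*n digits; computes out = a * b (magnitudes only --
 * the deleted code also sets the result sign from A->sign ^ B->sign). */
static void comba_mul_sketch(const fp_digit *a, const fp_digit *b, int n,
                             fp_digit *out)
{
    fp_word acc = 0;   /* (c1:c0) -- low two digits of the column sum */
    fp_digit hi = 0;   /* c2 -- overflow beyond 64 bits               */

    for (int col = 0; col < 2 * n - 1; col++) {
        int lo = (col < n) ? 0 : col - n + 1;
        int up = (col < n) ? col : n - 1;
        for (int i = lo; i <= up; i++) {         /* the MULADD chain   */
            fp_word t = (fp_word)a[i] * b[col - i];
            acc += t;
            if (acc < t)                         /* 64-bit wrap: c2++  */
                hi++;
        }
        out[col] = (fp_digit)acc;                /* COMBA_STORE        */
        acc = (acc >> 32) | ((fp_word)hi << 32); /* COMBA_FORWARD      */
        hi = 0;
    }
    out[2 * n - 1] = (fp_digit)acc;              /* COMBA_STORE2       */
}

The generated .i files unroll this double loop for one fixed n (12 here, 17 below), which is why every deleted line is just another MULADD in the chain.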

+ 0 - 187
lib/wolfssl/wolfcrypt/src/fp_mul_comba_17.i

@@ -1,187 +0,0 @@
-/* fp_mul_comba_17.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#ifdef TFM_MUL17
-int fp_mul_comba17(fp_int *A, fp_int *B, fp_int *C)
-{
-   fp_digit c0, c1, c2;
-#ifndef WOLFSSL_SMALL_STACK
-   fp_digit at[34];
-#else
-   fp_digit *at;
-#endif
-
-#ifdef WOLFSSL_SMALL_STACK
-   at = (fp_digit*)XMALLOC(sizeof(fp_digit) * 34, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-   if (at == NULL)
-       return FP_MEM;
-#endif
-
-   XMEMCPY(at, A->dp, 17 * sizeof(fp_digit));
-   XMEMCPY(at+17, B->dp, 17 * sizeof(fp_digit));
-   COMBA_START;
-
-   COMBA_CLEAR;
-   /* 0 */
-   MULADD(at[0], at[17]); 
-   COMBA_STORE(C->dp[0]);
-   /* 1 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[18]);    MULADD(at[1], at[17]); 
-   COMBA_STORE(C->dp[1]);
-   /* 2 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[19]);    MULADD(at[1], at[18]);    MULADD(at[2], at[17]); 
-   COMBA_STORE(C->dp[2]);
-   /* 3 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[20]);    MULADD(at[1], at[19]);    MULADD(at[2], at[18]);    MULADD(at[3], at[17]); 
-   COMBA_STORE(C->dp[3]);
-   /* 4 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[21]);    MULADD(at[1], at[20]);    MULADD(at[2], at[19]);    MULADD(at[3], at[18]);    MULADD(at[4], at[17]); 
-   COMBA_STORE(C->dp[4]);
-   /* 5 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[22]);    MULADD(at[1], at[21]);    MULADD(at[2], at[20]);    MULADD(at[3], at[19]);    MULADD(at[4], at[18]);    MULADD(at[5], at[17]); 
-   COMBA_STORE(C->dp[5]);
-   /* 6 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[23]);    MULADD(at[1], at[22]);    MULADD(at[2], at[21]);    MULADD(at[3], at[20]);    MULADD(at[4], at[19]);    MULADD(at[5], at[18]);    MULADD(at[6], at[17]); 
-   COMBA_STORE(C->dp[6]);
-   /* 7 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[24]);    MULADD(at[1], at[23]);    MULADD(at[2], at[22]);    MULADD(at[3], at[21]);    MULADD(at[4], at[20]);    MULADD(at[5], at[19]);    MULADD(at[6], at[18]);    MULADD(at[7], at[17]); 
-   COMBA_STORE(C->dp[7]);
-   /* 8 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[25]);    MULADD(at[1], at[24]);    MULADD(at[2], at[23]);    MULADD(at[3], at[22]);    MULADD(at[4], at[21]);    MULADD(at[5], at[20]);    MULADD(at[6], at[19]);    MULADD(at[7], at[18]);    MULADD(at[8], at[17]); 
-   COMBA_STORE(C->dp[8]);
-   /* 9 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[26]);    MULADD(at[1], at[25]);    MULADD(at[2], at[24]);    MULADD(at[3], at[23]);    MULADD(at[4], at[22]);    MULADD(at[5], at[21]);    MULADD(at[6], at[20]);    MULADD(at[7], at[19]);    MULADD(at[8], at[18]);    MULADD(at[9], at[17]); 
-   COMBA_STORE(C->dp[9]);
-   /* 10 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[27]);    MULADD(at[1], at[26]);    MULADD(at[2], at[25]);    MULADD(at[3], at[24]);    MULADD(at[4], at[23]);    MULADD(at[5], at[22]);    MULADD(at[6], at[21]);    MULADD(at[7], at[20]);    MULADD(at[8], at[19]);    MULADD(at[9], at[18]);    MULADD(at[10], at[17]); 
-   COMBA_STORE(C->dp[10]);
-   /* 11 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[28]);    MULADD(at[1], at[27]);    MULADD(at[2], at[26]);    MULADD(at[3], at[25]);    MULADD(at[4], at[24]);    MULADD(at[5], at[23]);    MULADD(at[6], at[22]);    MULADD(at[7], at[21]);    MULADD(at[8], at[20]);    MULADD(at[9], at[19]);    MULADD(at[10], at[18]);    MULADD(at[11], at[17]); 
-   COMBA_STORE(C->dp[11]);
-   /* 12 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[29]);    MULADD(at[1], at[28]);    MULADD(at[2], at[27]);    MULADD(at[3], at[26]);    MULADD(at[4], at[25]);    MULADD(at[5], at[24]);    MULADD(at[6], at[23]);    MULADD(at[7], at[22]);    MULADD(at[8], at[21]);    MULADD(at[9], at[20]);    MULADD(at[10], at[19]);    MULADD(at[11], at[18]);    MULADD(at[12], at[17]); 
-   COMBA_STORE(C->dp[12]);
-   /* 13 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[30]);    MULADD(at[1], at[29]);    MULADD(at[2], at[28]);    MULADD(at[3], at[27]);    MULADD(at[4], at[26]);    MULADD(at[5], at[25]);    MULADD(at[6], at[24]);    MULADD(at[7], at[23]);    MULADD(at[8], at[22]);    MULADD(at[9], at[21]);    MULADD(at[10], at[20]);    MULADD(at[11], at[19]);    MULADD(at[12], at[18]);    MULADD(at[13], at[17]); 
-   COMBA_STORE(C->dp[13]);
-   /* 14 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[31]);    MULADD(at[1], at[30]);    MULADD(at[2], at[29]);    MULADD(at[3], at[28]);    MULADD(at[4], at[27]);    MULADD(at[5], at[26]);    MULADD(at[6], at[25]);    MULADD(at[7], at[24]);    MULADD(at[8], at[23]);    MULADD(at[9], at[22]);    MULADD(at[10], at[21]);    MULADD(at[11], at[20]);    MULADD(at[12], at[19]);    MULADD(at[13], at[18]);    MULADD(at[14], at[17]); 
-   COMBA_STORE(C->dp[14]);
-   /* 15 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[32]);    MULADD(at[1], at[31]);    MULADD(at[2], at[30]);    MULADD(at[3], at[29]);    MULADD(at[4], at[28]);    MULADD(at[5], at[27]);    MULADD(at[6], at[26]);    MULADD(at[7], at[25]);    MULADD(at[8], at[24]);    MULADD(at[9], at[23]);    MULADD(at[10], at[22]);    MULADD(at[11], at[21]);    MULADD(at[12], at[20]);    MULADD(at[13], at[19]);    MULADD(at[14], at[18]);    MULADD(at[15], at[17]); 
-   COMBA_STORE(C->dp[15]);
-   /* 16 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[33]);    MULADD(at[1], at[32]);    MULADD(at[2], at[31]);    MULADD(at[3], at[30]);    MULADD(at[4], at[29]);    MULADD(at[5], at[28]);    MULADD(at[6], at[27]);    MULADD(at[7], at[26]);    MULADD(at[8], at[25]);    MULADD(at[9], at[24]);    MULADD(at[10], at[23]);    MULADD(at[11], at[22]);    MULADD(at[12], at[21]);    MULADD(at[13], at[20]);    MULADD(at[14], at[19]);    MULADD(at[15], at[18]);    MULADD(at[16], at[17]); 
-   COMBA_STORE(C->dp[16]);
-   /* 17 */
-   COMBA_FORWARD;
-   MULADD(at[1], at[33]);    MULADD(at[2], at[32]);    MULADD(at[3], at[31]);    MULADD(at[4], at[30]);    MULADD(at[5], at[29]);    MULADD(at[6], at[28]);    MULADD(at[7], at[27]);    MULADD(at[8], at[26]);    MULADD(at[9], at[25]);    MULADD(at[10], at[24]);    MULADD(at[11], at[23]);    MULADD(at[12], at[22]);    MULADD(at[13], at[21]);    MULADD(at[14], at[20]);    MULADD(at[15], at[19]);    MULADD(at[16], at[18]); 
-   COMBA_STORE(C->dp[17]);
-   /* 18 */
-   COMBA_FORWARD;
-   MULADD(at[2], at[33]);    MULADD(at[3], at[32]);    MULADD(at[4], at[31]);    MULADD(at[5], at[30]);    MULADD(at[6], at[29]);    MULADD(at[7], at[28]);    MULADD(at[8], at[27]);    MULADD(at[9], at[26]);    MULADD(at[10], at[25]);    MULADD(at[11], at[24]);    MULADD(at[12], at[23]);    MULADD(at[13], at[22]);    MULADD(at[14], at[21]);    MULADD(at[15], at[20]);    MULADD(at[16], at[19]); 
-   COMBA_STORE(C->dp[18]);
-   /* 19 */
-   COMBA_FORWARD;
-   MULADD(at[3], at[33]);    MULADD(at[4], at[32]);    MULADD(at[5], at[31]);    MULADD(at[6], at[30]);    MULADD(at[7], at[29]);    MULADD(at[8], at[28]);    MULADD(at[9], at[27]);    MULADD(at[10], at[26]);    MULADD(at[11], at[25]);    MULADD(at[12], at[24]);    MULADD(at[13], at[23]);    MULADD(at[14], at[22]);    MULADD(at[15], at[21]);    MULADD(at[16], at[20]); 
-   COMBA_STORE(C->dp[19]);
-   /* 20 */
-   COMBA_FORWARD;
-   MULADD(at[4], at[33]);    MULADD(at[5], at[32]);    MULADD(at[6], at[31]);    MULADD(at[7], at[30]);    MULADD(at[8], at[29]);    MULADD(at[9], at[28]);    MULADD(at[10], at[27]);    MULADD(at[11], at[26]);    MULADD(at[12], at[25]);    MULADD(at[13], at[24]);    MULADD(at[14], at[23]);    MULADD(at[15], at[22]);    MULADD(at[16], at[21]); 
-   COMBA_STORE(C->dp[20]);
-   /* 21 */
-   COMBA_FORWARD;
-   MULADD(at[5], at[33]);    MULADD(at[6], at[32]);    MULADD(at[7], at[31]);    MULADD(at[8], at[30]);    MULADD(at[9], at[29]);    MULADD(at[10], at[28]);    MULADD(at[11], at[27]);    MULADD(at[12], at[26]);    MULADD(at[13], at[25]);    MULADD(at[14], at[24]);    MULADD(at[15], at[23]);    MULADD(at[16], at[22]); 
-   COMBA_STORE(C->dp[21]);
-   /* 22 */
-   COMBA_FORWARD;
-   MULADD(at[6], at[33]);    MULADD(at[7], at[32]);    MULADD(at[8], at[31]);    MULADD(at[9], at[30]);    MULADD(at[10], at[29]);    MULADD(at[11], at[28]);    MULADD(at[12], at[27]);    MULADD(at[13], at[26]);    MULADD(at[14], at[25]);    MULADD(at[15], at[24]);    MULADD(at[16], at[23]); 
-   COMBA_STORE(C->dp[22]);
-   /* 23 */
-   COMBA_FORWARD;
-   MULADD(at[7], at[33]);    MULADD(at[8], at[32]);    MULADD(at[9], at[31]);    MULADD(at[10], at[30]);    MULADD(at[11], at[29]);    MULADD(at[12], at[28]);    MULADD(at[13], at[27]);    MULADD(at[14], at[26]);    MULADD(at[15], at[25]);    MULADD(at[16], at[24]); 
-   COMBA_STORE(C->dp[23]);
-   /* 24 */
-   COMBA_FORWARD;
-   MULADD(at[8], at[33]);    MULADD(at[9], at[32]);    MULADD(at[10], at[31]);    MULADD(at[11], at[30]);    MULADD(at[12], at[29]);    MULADD(at[13], at[28]);    MULADD(at[14], at[27]);    MULADD(at[15], at[26]);    MULADD(at[16], at[25]); 
-   COMBA_STORE(C->dp[24]);
-   /* 25 */
-   COMBA_FORWARD;
-   MULADD(at[9], at[33]);    MULADD(at[10], at[32]);    MULADD(at[11], at[31]);    MULADD(at[12], at[30]);    MULADD(at[13], at[29]);    MULADD(at[14], at[28]);    MULADD(at[15], at[27]);    MULADD(at[16], at[26]); 
-   COMBA_STORE(C->dp[25]);
-   /* 26 */
-   COMBA_FORWARD;
-   MULADD(at[10], at[33]);    MULADD(at[11], at[32]);    MULADD(at[12], at[31]);    MULADD(at[13], at[30]);    MULADD(at[14], at[29]);    MULADD(at[15], at[28]);    MULADD(at[16], at[27]); 
-   COMBA_STORE(C->dp[26]);
-   /* 27 */
-   COMBA_FORWARD;
-   MULADD(at[11], at[33]);    MULADD(at[12], at[32]);    MULADD(at[13], at[31]);    MULADD(at[14], at[30]);    MULADD(at[15], at[29]);    MULADD(at[16], at[28]); 
-   COMBA_STORE(C->dp[27]);
-   /* 28 */
-   COMBA_FORWARD;
-   MULADD(at[12], at[33]);    MULADD(at[13], at[32]);    MULADD(at[14], at[31]);    MULADD(at[15], at[30]);    MULADD(at[16], at[29]); 
-   COMBA_STORE(C->dp[28]);
-   /* 29 */
-   COMBA_FORWARD;
-   MULADD(at[13], at[33]);    MULADD(at[14], at[32]);    MULADD(at[15], at[31]);    MULADD(at[16], at[30]); 
-   COMBA_STORE(C->dp[29]);
-   /* 30 */
-   COMBA_FORWARD;
-   MULADD(at[14], at[33]);    MULADD(at[15], at[32]);    MULADD(at[16], at[31]); 
-   COMBA_STORE(C->dp[30]);
-   /* 31 */
-   COMBA_FORWARD;
-   MULADD(at[15], at[33]);    MULADD(at[16], at[32]); 
-   COMBA_STORE(C->dp[31]);
-   /* 32 */
-   COMBA_FORWARD;
-   MULADD(at[16], at[33]); 
-   COMBA_STORE(C->dp[32]);
-   COMBA_STORE2(C->dp[33]);
-   C->used = 34;
-   C->sign = A->sign ^ B->sign;
-   fp_clamp(C);
-   COMBA_FINI;
-
-#ifdef WOLFSSL_SMALL_STACK
-   XFREE(at, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-#endif
-   return FP_OKAY;
-}
-#endif
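
Note on the block of deletions that follows: each generated fp_mul_comba_N.i file unrolls the same Comba (column-wise) multiplication. Reading the pattern off the deleted code, digit k of C = A * B is the column sum

    C_k = \sum_{i+j=k} A_i \, B_j, \qquad 0 \le k \le 2n - 2,

accumulated in the three carry words c0, c1, c2: MULADD adds one digit product into the current column, COMBA_FORWARD shifts the carry chain down one digit, and COMBA_STORE writes the finished digit. Digits 0 through 2n-2 are stored that way, COMBA_STORE2 writes the final carry digit C_{2n-1}, and C->used is set to 2n.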

+ 0 - 210
lib/wolfssl/wolfcrypt/src/fp_mul_comba_20.i

@@ -1,210 +0,0 @@
-/* fp_mul_comba_20.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-#ifdef TFM_MUL20
-int fp_mul_comba20(fp_int *A, fp_int *B, fp_int *C)
-{
-   fp_digit c0, c1, c2;
-#ifndef WOLFSSL_SMALL_STACK
-   fp_digit at[40];
-#else
-   fp_digit *at;
-#endif
-
-#ifdef WOLFSSL_SMALL_STACK
-   at = (fp_digit*)XMALLOC(sizeof(fp_digit) * 40, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-   if (at == NULL)
-       return FP_MEM;
-#endif
-
-   XMEMCPY(at, A->dp, 20 * sizeof(fp_digit));
-   XMEMCPY(at+20, B->dp, 20 * sizeof(fp_digit));
-   COMBA_START;
-
-   COMBA_CLEAR;
-   /* 0 */
-   MULADD(at[0], at[20]); 
-   COMBA_STORE(C->dp[0]);
-   /* 1 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[21]);    MULADD(at[1], at[20]); 
-   COMBA_STORE(C->dp[1]);
-   /* 2 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[22]);    MULADD(at[1], at[21]);    MULADD(at[2], at[20]); 
-   COMBA_STORE(C->dp[2]);
-   /* 3 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[23]);    MULADD(at[1], at[22]);    MULADD(at[2], at[21]);    MULADD(at[3], at[20]); 
-   COMBA_STORE(C->dp[3]);
-   /* 4 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[24]);    MULADD(at[1], at[23]);    MULADD(at[2], at[22]);    MULADD(at[3], at[21]);    MULADD(at[4], at[20]); 
-   COMBA_STORE(C->dp[4]);
-   /* 5 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[25]);    MULADD(at[1], at[24]);    MULADD(at[2], at[23]);    MULADD(at[3], at[22]);    MULADD(at[4], at[21]);    MULADD(at[5], at[20]); 
-   COMBA_STORE(C->dp[5]);
-   /* 6 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[26]);    MULADD(at[1], at[25]);    MULADD(at[2], at[24]);    MULADD(at[3], at[23]);    MULADD(at[4], at[22]);    MULADD(at[5], at[21]);    MULADD(at[6], at[20]); 
-   COMBA_STORE(C->dp[6]);
-   /* 7 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[27]);    MULADD(at[1], at[26]);    MULADD(at[2], at[25]);    MULADD(at[3], at[24]);    MULADD(at[4], at[23]);    MULADD(at[5], at[22]);    MULADD(at[6], at[21]);    MULADD(at[7], at[20]); 
-   COMBA_STORE(C->dp[7]);
-   /* 8 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[28]);    MULADD(at[1], at[27]);    MULADD(at[2], at[26]);    MULADD(at[3], at[25]);    MULADD(at[4], at[24]);    MULADD(at[5], at[23]);    MULADD(at[6], at[22]);    MULADD(at[7], at[21]);    MULADD(at[8], at[20]); 
-   COMBA_STORE(C->dp[8]);
-   /* 9 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[29]);    MULADD(at[1], at[28]);    MULADD(at[2], at[27]);    MULADD(at[3], at[26]);    MULADD(at[4], at[25]);    MULADD(at[5], at[24]);    MULADD(at[6], at[23]);    MULADD(at[7], at[22]);    MULADD(at[8], at[21]);    MULADD(at[9], at[20]); 
-   COMBA_STORE(C->dp[9]);
-   /* 10 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[30]);    MULADD(at[1], at[29]);    MULADD(at[2], at[28]);    MULADD(at[3], at[27]);    MULADD(at[4], at[26]);    MULADD(at[5], at[25]);    MULADD(at[6], at[24]);    MULADD(at[7], at[23]);    MULADD(at[8], at[22]);    MULADD(at[9], at[21]);    MULADD(at[10], at[20]); 
-   COMBA_STORE(C->dp[10]);
-   /* 11 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[31]);    MULADD(at[1], at[30]);    MULADD(at[2], at[29]);    MULADD(at[3], at[28]);    MULADD(at[4], at[27]);    MULADD(at[5], at[26]);    MULADD(at[6], at[25]);    MULADD(at[7], at[24]);    MULADD(at[8], at[23]);    MULADD(at[9], at[22]);    MULADD(at[10], at[21]);    MULADD(at[11], at[20]); 
-   COMBA_STORE(C->dp[11]);
-   /* 12 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[32]);    MULADD(at[1], at[31]);    MULADD(at[2], at[30]);    MULADD(at[3], at[29]);    MULADD(at[4], at[28]);    MULADD(at[5], at[27]);    MULADD(at[6], at[26]);    MULADD(at[7], at[25]);    MULADD(at[8], at[24]);    MULADD(at[9], at[23]);    MULADD(at[10], at[22]);    MULADD(at[11], at[21]);    MULADD(at[12], at[20]); 
-   COMBA_STORE(C->dp[12]);
-   /* 13 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[33]);    MULADD(at[1], at[32]);    MULADD(at[2], at[31]);    MULADD(at[3], at[30]);    MULADD(at[4], at[29]);    MULADD(at[5], at[28]);    MULADD(at[6], at[27]);    MULADD(at[7], at[26]);    MULADD(at[8], at[25]);    MULADD(at[9], at[24]);    MULADD(at[10], at[23]);    MULADD(at[11], at[22]);    MULADD(at[12], at[21]);    MULADD(at[13], at[20]); 
-   COMBA_STORE(C->dp[13]);
-   /* 14 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[34]);    MULADD(at[1], at[33]);    MULADD(at[2], at[32]);    MULADD(at[3], at[31]);    MULADD(at[4], at[30]);    MULADD(at[5], at[29]);    MULADD(at[6], at[28]);    MULADD(at[7], at[27]);    MULADD(at[8], at[26]);    MULADD(at[9], at[25]);    MULADD(at[10], at[24]);    MULADD(at[11], at[23]);    MULADD(at[12], at[22]);    MULADD(at[13], at[21]);    MULADD(at[14], at[20]); 
-   COMBA_STORE(C->dp[14]);
-   /* 15 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[35]);    MULADD(at[1], at[34]);    MULADD(at[2], at[33]);    MULADD(at[3], at[32]);    MULADD(at[4], at[31]);    MULADD(at[5], at[30]);    MULADD(at[6], at[29]);    MULADD(at[7], at[28]);    MULADD(at[8], at[27]);    MULADD(at[9], at[26]);    MULADD(at[10], at[25]);    MULADD(at[11], at[24]);    MULADD(at[12], at[23]);    MULADD(at[13], at[22]);    MULADD(at[14], at[21]);    MULADD(at[15], at[20]); 
-   COMBA_STORE(C->dp[15]);
-   /* 16 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[36]);    MULADD(at[1], at[35]);    MULADD(at[2], at[34]);    MULADD(at[3], at[33]);    MULADD(at[4], at[32]);    MULADD(at[5], at[31]);    MULADD(at[6], at[30]);    MULADD(at[7], at[29]);    MULADD(at[8], at[28]);    MULADD(at[9], at[27]);    MULADD(at[10], at[26]);    MULADD(at[11], at[25]);    MULADD(at[12], at[24]);    MULADD(at[13], at[23]);    MULADD(at[14], at[22]);    MULADD(at[15], at[21]);    MULADD(at[16], at[20]); 
-   COMBA_STORE(C->dp[16]);
-   /* 17 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[37]);    MULADD(at[1], at[36]);    MULADD(at[2], at[35]);    MULADD(at[3], at[34]);    MULADD(at[4], at[33]);    MULADD(at[5], at[32]);    MULADD(at[6], at[31]);    MULADD(at[7], at[30]);    MULADD(at[8], at[29]);    MULADD(at[9], at[28]);    MULADD(at[10], at[27]);    MULADD(at[11], at[26]);    MULADD(at[12], at[25]);    MULADD(at[13], at[24]);    MULADD(at[14], at[23]);    MULADD(at[15], at[22]);    MULADD(at[16], at[21]);    MULADD(at[17], at[20]); 
-   COMBA_STORE(C->dp[17]);
-   /* 18 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[38]);    MULADD(at[1], at[37]);    MULADD(at[2], at[36]);    MULADD(at[3], at[35]);    MULADD(at[4], at[34]);    MULADD(at[5], at[33]);    MULADD(at[6], at[32]);    MULADD(at[7], at[31]);    MULADD(at[8], at[30]);    MULADD(at[9], at[29]);    MULADD(at[10], at[28]);    MULADD(at[11], at[27]);    MULADD(at[12], at[26]);    MULADD(at[13], at[25]);    MULADD(at[14], at[24]);    MULADD(at[15], at[23]);    MULADD(at[16], at[22]);    MULADD(at[17], at[21]);    MULADD(at[18], at[20]); 
-   COMBA_STORE(C->dp[18]);
-   /* 19 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[39]);    MULADD(at[1], at[38]);    MULADD(at[2], at[37]);    MULADD(at[3], at[36]);    MULADD(at[4], at[35]);    MULADD(at[5], at[34]);    MULADD(at[6], at[33]);    MULADD(at[7], at[32]);    MULADD(at[8], at[31]);    MULADD(at[9], at[30]);    MULADD(at[10], at[29]);    MULADD(at[11], at[28]);    MULADD(at[12], at[27]);    MULADD(at[13], at[26]);    MULADD(at[14], at[25]);    MULADD(at[15], at[24]);    MULADD(at[16], at[23]);    MULADD(at[17], at[22]);    MULADD(at[18], at[21]);    MULADD(at[19], at[20]); 
-   COMBA_STORE(C->dp[19]);
-   /* 20 */
-   COMBA_FORWARD;
-   MULADD(at[1], at[39]);    MULADD(at[2], at[38]);    MULADD(at[3], at[37]);    MULADD(at[4], at[36]);    MULADD(at[5], at[35]);    MULADD(at[6], at[34]);    MULADD(at[7], at[33]);    MULADD(at[8], at[32]);    MULADD(at[9], at[31]);    MULADD(at[10], at[30]);    MULADD(at[11], at[29]);    MULADD(at[12], at[28]);    MULADD(at[13], at[27]);    MULADD(at[14], at[26]);    MULADD(at[15], at[25]);    MULADD(at[16], at[24]);    MULADD(at[17], at[23]);    MULADD(at[18], at[22]);    MULADD(at[19], at[21]); 
-   COMBA_STORE(C->dp[20]);
-   /* 21 */
-   COMBA_FORWARD;
-   MULADD(at[2], at[39]);    MULADD(at[3], at[38]);    MULADD(at[4], at[37]);    MULADD(at[5], at[36]);    MULADD(at[6], at[35]);    MULADD(at[7], at[34]);    MULADD(at[8], at[33]);    MULADD(at[9], at[32]);    MULADD(at[10], at[31]);    MULADD(at[11], at[30]);    MULADD(at[12], at[29]);    MULADD(at[13], at[28]);    MULADD(at[14], at[27]);    MULADD(at[15], at[26]);    MULADD(at[16], at[25]);    MULADD(at[17], at[24]);    MULADD(at[18], at[23]);    MULADD(at[19], at[22]); 
-   COMBA_STORE(C->dp[21]);
-   /* 22 */
-   COMBA_FORWARD;
-   MULADD(at[3], at[39]);    MULADD(at[4], at[38]);    MULADD(at[5], at[37]);    MULADD(at[6], at[36]);    MULADD(at[7], at[35]);    MULADD(at[8], at[34]);    MULADD(at[9], at[33]);    MULADD(at[10], at[32]);    MULADD(at[11], at[31]);    MULADD(at[12], at[30]);    MULADD(at[13], at[29]);    MULADD(at[14], at[28]);    MULADD(at[15], at[27]);    MULADD(at[16], at[26]);    MULADD(at[17], at[25]);    MULADD(at[18], at[24]);    MULADD(at[19], at[23]); 
-   COMBA_STORE(C->dp[22]);
-   /* 23 */
-   COMBA_FORWARD;
-   MULADD(at[4], at[39]);    MULADD(at[5], at[38]);    MULADD(at[6], at[37]);    MULADD(at[7], at[36]);    MULADD(at[8], at[35]);    MULADD(at[9], at[34]);    MULADD(at[10], at[33]);    MULADD(at[11], at[32]);    MULADD(at[12], at[31]);    MULADD(at[13], at[30]);    MULADD(at[14], at[29]);    MULADD(at[15], at[28]);    MULADD(at[16], at[27]);    MULADD(at[17], at[26]);    MULADD(at[18], at[25]);    MULADD(at[19], at[24]); 
-   COMBA_STORE(C->dp[23]);
-   /* 24 */
-   COMBA_FORWARD;
-   MULADD(at[5], at[39]);    MULADD(at[6], at[38]);    MULADD(at[7], at[37]);    MULADD(at[8], at[36]);    MULADD(at[9], at[35]);    MULADD(at[10], at[34]);    MULADD(at[11], at[33]);    MULADD(at[12], at[32]);    MULADD(at[13], at[31]);    MULADD(at[14], at[30]);    MULADD(at[15], at[29]);    MULADD(at[16], at[28]);    MULADD(at[17], at[27]);    MULADD(at[18], at[26]);    MULADD(at[19], at[25]); 
-   COMBA_STORE(C->dp[24]);
-   /* 25 */
-   COMBA_FORWARD;
-   MULADD(at[6], at[39]);    MULADD(at[7], at[38]);    MULADD(at[8], at[37]);    MULADD(at[9], at[36]);    MULADD(at[10], at[35]);    MULADD(at[11], at[34]);    MULADD(at[12], at[33]);    MULADD(at[13], at[32]);    MULADD(at[14], at[31]);    MULADD(at[15], at[30]);    MULADD(at[16], at[29]);    MULADD(at[17], at[28]);    MULADD(at[18], at[27]);    MULADD(at[19], at[26]); 
-   COMBA_STORE(C->dp[25]);
-   /* 26 */
-   COMBA_FORWARD;
-   MULADD(at[7], at[39]);    MULADD(at[8], at[38]);    MULADD(at[9], at[37]);    MULADD(at[10], at[36]);    MULADD(at[11], at[35]);    MULADD(at[12], at[34]);    MULADD(at[13], at[33]);    MULADD(at[14], at[32]);    MULADD(at[15], at[31]);    MULADD(at[16], at[30]);    MULADD(at[17], at[29]);    MULADD(at[18], at[28]);    MULADD(at[19], at[27]); 
-   COMBA_STORE(C->dp[26]);
-   /* 27 */
-   COMBA_FORWARD;
-   MULADD(at[8], at[39]);    MULADD(at[9], at[38]);    MULADD(at[10], at[37]);    MULADD(at[11], at[36]);    MULADD(at[12], at[35]);    MULADD(at[13], at[34]);    MULADD(at[14], at[33]);    MULADD(at[15], at[32]);    MULADD(at[16], at[31]);    MULADD(at[17], at[30]);    MULADD(at[18], at[29]);    MULADD(at[19], at[28]); 
-   COMBA_STORE(C->dp[27]);
-   /* 28 */
-   COMBA_FORWARD;
-   MULADD(at[9], at[39]);    MULADD(at[10], at[38]);    MULADD(at[11], at[37]);    MULADD(at[12], at[36]);    MULADD(at[13], at[35]);    MULADD(at[14], at[34]);    MULADD(at[15], at[33]);    MULADD(at[16], at[32]);    MULADD(at[17], at[31]);    MULADD(at[18], at[30]);    MULADD(at[19], at[29]); 
-   COMBA_STORE(C->dp[28]);
-   /* 29 */
-   COMBA_FORWARD;
-   MULADD(at[10], at[39]);    MULADD(at[11], at[38]);    MULADD(at[12], at[37]);    MULADD(at[13], at[36]);    MULADD(at[14], at[35]);    MULADD(at[15], at[34]);    MULADD(at[16], at[33]);    MULADD(at[17], at[32]);    MULADD(at[18], at[31]);    MULADD(at[19], at[30]); 
-   COMBA_STORE(C->dp[29]);
-   /* 30 */
-   COMBA_FORWARD;
-   MULADD(at[11], at[39]);    MULADD(at[12], at[38]);    MULADD(at[13], at[37]);    MULADD(at[14], at[36]);    MULADD(at[15], at[35]);    MULADD(at[16], at[34]);    MULADD(at[17], at[33]);    MULADD(at[18], at[32]);    MULADD(at[19], at[31]); 
-   COMBA_STORE(C->dp[30]);
-   /* 31 */
-   COMBA_FORWARD;
-   MULADD(at[12], at[39]);    MULADD(at[13], at[38]);    MULADD(at[14], at[37]);    MULADD(at[15], at[36]);    MULADD(at[16], at[35]);    MULADD(at[17], at[34]);    MULADD(at[18], at[33]);    MULADD(at[19], at[32]); 
-   COMBA_STORE(C->dp[31]);
-   /* 32 */
-   COMBA_FORWARD;
-   MULADD(at[13], at[39]);    MULADD(at[14], at[38]);    MULADD(at[15], at[37]);    MULADD(at[16], at[36]);    MULADD(at[17], at[35]);    MULADD(at[18], at[34]);    MULADD(at[19], at[33]); 
-   COMBA_STORE(C->dp[32]);
-   /* 33 */
-   COMBA_FORWARD;
-   MULADD(at[14], at[39]);    MULADD(at[15], at[38]);    MULADD(at[16], at[37]);    MULADD(at[17], at[36]);    MULADD(at[18], at[35]);    MULADD(at[19], at[34]); 
-   COMBA_STORE(C->dp[33]);
-   /* 34 */
-   COMBA_FORWARD;
-   MULADD(at[15], at[39]);    MULADD(at[16], at[38]);    MULADD(at[17], at[37]);    MULADD(at[18], at[36]);    MULADD(at[19], at[35]); 
-   COMBA_STORE(C->dp[34]);
-   /* 35 */
-   COMBA_FORWARD;
-   MULADD(at[16], at[39]);    MULADD(at[17], at[38]);    MULADD(at[18], at[37]);    MULADD(at[19], at[36]); 
-   COMBA_STORE(C->dp[35]);
-   /* 36 */
-   COMBA_FORWARD;
-   MULADD(at[17], at[39]);    MULADD(at[18], at[38]);    MULADD(at[19], at[37]); 
-   COMBA_STORE(C->dp[36]);
-   /* 37 */
-   COMBA_FORWARD;
-   MULADD(at[18], at[39]);    MULADD(at[19], at[38]); 
-   COMBA_STORE(C->dp[37]);
-   /* 38 */
-   COMBA_FORWARD;
-   MULADD(at[19], at[39]); 
-   COMBA_STORE(C->dp[38]);
-   COMBA_STORE2(C->dp[39]);
-   C->used = 40;
-   C->sign = A->sign ^ B->sign;
-   fp_clamp(C);
-   COMBA_FINI;
-
-#ifdef WOLFSSL_SMALL_STACK
-   XFREE(at, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-#endif
-   return FP_OKAY;
-}
-#endif
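
For orientation while skimming these unrolled deletions, here is a compact loop form of the same scheme. It is a minimal sketch assuming 32-bit digits and a 64-bit intermediate product, with a hypothetical name (comba_mul) rather than wolfSSL's fp_ API; the library ships the fully unrolled .i forms precisely so the inner bounds arithmetic below disappears at compile time.

    #include <stdint.h>
    #include <stddef.h>

    /* Generic Comba multiply: c[0..2n-1] = a[0..n-1] * b[0..n-1], n >= 1.
     * Sketch of the scheme the deleted files unroll; not wolfSSL's API. */
    static void comba_mul(const uint32_t *a, const uint32_t *b,
                          uint32_t *c, size_t n)
    {
        uint32_t c0 = 0, c1 = 0, c2 = 0;            /* 3-word column accumulator */
        for (size_t k = 0; k + 1 < 2 * n; k++) {
            size_t i_lo = (k < n) ? 0 : k - n + 1;  /* keep i and k-i in range */
            size_t i_hi = (k < n) ? k : n - 1;
            for (size_t i = i_lo; i <= i_hi; i++) { /* MULADD(a[i], b[k-i]) */
                uint64_t t = (uint64_t)a[i] * b[k - i];
                uint64_t s = (uint64_t)c0 + (uint32_t)t;  /* add low half */
                c0 = (uint32_t)s;
                s = (uint64_t)c1 + (uint32_t)(t >> 32) + (uint32_t)(s >> 32);
                c1 = (uint32_t)s;                         /* add high half */
                c2 += (uint32_t)(s >> 32);                /* top carry */
            }
            c[k] = c0;                              /* COMBA_STORE */
            c0 = c1; c1 = c2; c2 = 0;               /* COMBA_FORWARD */
        }
        c[2 * n - 1] = c0;                          /* COMBA_STORE2 */
    }

Under those assumptions, comba_mul(a, b, c, 20) should match fp_mul_comba20 above digit for digit, leaving aside sign handling (C->sign = A->sign ^ B->sign) and the WOLFSSL_SMALL_STACK copy of both operands into at[].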

+ 0 - 243
lib/wolfssl/wolfcrypt/src/fp_mul_comba_24.i

@@ -1,243 +0,0 @@
-/* fp_mul_comba_24.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#ifdef TFM_MUL24
-int fp_mul_comba24(fp_int *A, fp_int *B, fp_int *C)
-{
-   fp_digit c0, c1, c2;
-#ifndef WOLFSSL_SMALL_STACK
-   fp_digit at[48];
-#else
-   fp_digit *at;
-#endif
-
-#ifdef WOLFSSL_SMALL_STACK
-   at = (fp_digit*)XMALLOC(sizeof(fp_digit) * 48, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-   if (at == NULL)
-       return FP_MEM;
-#endif
-
-   XMEMCPY(at, A->dp, 24 * sizeof(fp_digit));
-   XMEMCPY(at+24, B->dp, 24 * sizeof(fp_digit));
-   COMBA_START;
-
-   COMBA_CLEAR;
-   /* 0 */
-   MULADD(at[0], at[24]); 
-   COMBA_STORE(C->dp[0]);
-   /* 1 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[25]);    MULADD(at[1], at[24]); 
-   COMBA_STORE(C->dp[1]);
-   /* 2 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[26]);    MULADD(at[1], at[25]);    MULADD(at[2], at[24]); 
-   COMBA_STORE(C->dp[2]);
-   /* 3 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[27]);    MULADD(at[1], at[26]);    MULADD(at[2], at[25]);    MULADD(at[3], at[24]); 
-   COMBA_STORE(C->dp[3]);
-   /* 4 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[28]);    MULADD(at[1], at[27]);    MULADD(at[2], at[26]);    MULADD(at[3], at[25]);    MULADD(at[4], at[24]); 
-   COMBA_STORE(C->dp[4]);
-   /* 5 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[29]);    MULADD(at[1], at[28]);    MULADD(at[2], at[27]);    MULADD(at[3], at[26]);    MULADD(at[4], at[25]);    MULADD(at[5], at[24]); 
-   COMBA_STORE(C->dp[5]);
-   /* 6 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[30]);    MULADD(at[1], at[29]);    MULADD(at[2], at[28]);    MULADD(at[3], at[27]);    MULADD(at[4], at[26]);    MULADD(at[5], at[25]);    MULADD(at[6], at[24]); 
-   COMBA_STORE(C->dp[6]);
-   /* 7 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[31]);    MULADD(at[1], at[30]);    MULADD(at[2], at[29]);    MULADD(at[3], at[28]);    MULADD(at[4], at[27]);    MULADD(at[5], at[26]);    MULADD(at[6], at[25]);    MULADD(at[7], at[24]); 
-   COMBA_STORE(C->dp[7]);
-   /* 8 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[32]);    MULADD(at[1], at[31]);    MULADD(at[2], at[30]);    MULADD(at[3], at[29]);    MULADD(at[4], at[28]);    MULADD(at[5], at[27]);    MULADD(at[6], at[26]);    MULADD(at[7], at[25]);    MULADD(at[8], at[24]); 
-   COMBA_STORE(C->dp[8]);
-   /* 9 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[33]);    MULADD(at[1], at[32]);    MULADD(at[2], at[31]);    MULADD(at[3], at[30]);    MULADD(at[4], at[29]);    MULADD(at[5], at[28]);    MULADD(at[6], at[27]);    MULADD(at[7], at[26]);    MULADD(at[8], at[25]);    MULADD(at[9], at[24]); 
-   COMBA_STORE(C->dp[9]);
-   /* 10 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[34]);    MULADD(at[1], at[33]);    MULADD(at[2], at[32]);    MULADD(at[3], at[31]);    MULADD(at[4], at[30]);    MULADD(at[5], at[29]);    MULADD(at[6], at[28]);    MULADD(at[7], at[27]);    MULADD(at[8], at[26]);    MULADD(at[9], at[25]);    MULADD(at[10], at[24]); 
-   COMBA_STORE(C->dp[10]);
-   /* 11 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[35]);    MULADD(at[1], at[34]);    MULADD(at[2], at[33]);    MULADD(at[3], at[32]);    MULADD(at[4], at[31]);    MULADD(at[5], at[30]);    MULADD(at[6], at[29]);    MULADD(at[7], at[28]);    MULADD(at[8], at[27]);    MULADD(at[9], at[26]);    MULADD(at[10], at[25]);    MULADD(at[11], at[24]); 
-   COMBA_STORE(C->dp[11]);
-   /* 12 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[36]);    MULADD(at[1], at[35]);    MULADD(at[2], at[34]);    MULADD(at[3], at[33]);    MULADD(at[4], at[32]);    MULADD(at[5], at[31]);    MULADD(at[6], at[30]);    MULADD(at[7], at[29]);    MULADD(at[8], at[28]);    MULADD(at[9], at[27]);    MULADD(at[10], at[26]);    MULADD(at[11], at[25]);    MULADD(at[12], at[24]); 
-   COMBA_STORE(C->dp[12]);
-   /* 13 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[37]);    MULADD(at[1], at[36]);    MULADD(at[2], at[35]);    MULADD(at[3], at[34]);    MULADD(at[4], at[33]);    MULADD(at[5], at[32]);    MULADD(at[6], at[31]);    MULADD(at[7], at[30]);    MULADD(at[8], at[29]);    MULADD(at[9], at[28]);    MULADD(at[10], at[27]);    MULADD(at[11], at[26]);    MULADD(at[12], at[25]);    MULADD(at[13], at[24]); 
-   COMBA_STORE(C->dp[13]);
-   /* 14 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[38]);    MULADD(at[1], at[37]);    MULADD(at[2], at[36]);    MULADD(at[3], at[35]);    MULADD(at[4], at[34]);    MULADD(at[5], at[33]);    MULADD(at[6], at[32]);    MULADD(at[7], at[31]);    MULADD(at[8], at[30]);    MULADD(at[9], at[29]);    MULADD(at[10], at[28]);    MULADD(at[11], at[27]);    MULADD(at[12], at[26]);    MULADD(at[13], at[25]);    MULADD(at[14], at[24]); 
-   COMBA_STORE(C->dp[14]);
-   /* 15 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[39]);    MULADD(at[1], at[38]);    MULADD(at[2], at[37]);    MULADD(at[3], at[36]);    MULADD(at[4], at[35]);    MULADD(at[5], at[34]);    MULADD(at[6], at[33]);    MULADD(at[7], at[32]);    MULADD(at[8], at[31]);    MULADD(at[9], at[30]);    MULADD(at[10], at[29]);    MULADD(at[11], at[28]);    MULADD(at[12], at[27]);    MULADD(at[13], at[26]);    MULADD(at[14], at[25]);    MULADD(at[15], at[24]); 
-   COMBA_STORE(C->dp[15]);
-   /* 16 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[40]);    MULADD(at[1], at[39]);    MULADD(at[2], at[38]);    MULADD(at[3], at[37]);    MULADD(at[4], at[36]);    MULADD(at[5], at[35]);    MULADD(at[6], at[34]);    MULADD(at[7], at[33]);    MULADD(at[8], at[32]);    MULADD(at[9], at[31]);    MULADD(at[10], at[30]);    MULADD(at[11], at[29]);    MULADD(at[12], at[28]);    MULADD(at[13], at[27]);    MULADD(at[14], at[26]);    MULADD(at[15], at[25]);    MULADD(at[16], at[24]); 
-   COMBA_STORE(C->dp[16]);
-   /* 17 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[41]);    MULADD(at[1], at[40]);    MULADD(at[2], at[39]);    MULADD(at[3], at[38]);    MULADD(at[4], at[37]);    MULADD(at[5], at[36]);    MULADD(at[6], at[35]);    MULADD(at[7], at[34]);    MULADD(at[8], at[33]);    MULADD(at[9], at[32]);    MULADD(at[10], at[31]);    MULADD(at[11], at[30]);    MULADD(at[12], at[29]);    MULADD(at[13], at[28]);    MULADD(at[14], at[27]);    MULADD(at[15], at[26]);    MULADD(at[16], at[25]);    MULADD(at[17], at[24]); 
-   COMBA_STORE(C->dp[17]);
-   /* 18 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[42]);    MULADD(at[1], at[41]);    MULADD(at[2], at[40]);    MULADD(at[3], at[39]);    MULADD(at[4], at[38]);    MULADD(at[5], at[37]);    MULADD(at[6], at[36]);    MULADD(at[7], at[35]);    MULADD(at[8], at[34]);    MULADD(at[9], at[33]);    MULADD(at[10], at[32]);    MULADD(at[11], at[31]);    MULADD(at[12], at[30]);    MULADD(at[13], at[29]);    MULADD(at[14], at[28]);    MULADD(at[15], at[27]);    MULADD(at[16], at[26]);    MULADD(at[17], at[25]);    MULADD(at[18], at[24]); 
-   COMBA_STORE(C->dp[18]);
-   /* 19 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[43]);    MULADD(at[1], at[42]);    MULADD(at[2], at[41]);    MULADD(at[3], at[40]);    MULADD(at[4], at[39]);    MULADD(at[5], at[38]);    MULADD(at[6], at[37]);    MULADD(at[7], at[36]);    MULADD(at[8], at[35]);    MULADD(at[9], at[34]);    MULADD(at[10], at[33]);    MULADD(at[11], at[32]);    MULADD(at[12], at[31]);    MULADD(at[13], at[30]);    MULADD(at[14], at[29]);    MULADD(at[15], at[28]);    MULADD(at[16], at[27]);    MULADD(at[17], at[26]);    MULADD(at[18], at[25]);    MULADD(at[19], at[24]); 
-   COMBA_STORE(C->dp[19]);
-   /* 20 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[44]);    MULADD(at[1], at[43]);    MULADD(at[2], at[42]);    MULADD(at[3], at[41]);    MULADD(at[4], at[40]);    MULADD(at[5], at[39]);    MULADD(at[6], at[38]);    MULADD(at[7], at[37]);    MULADD(at[8], at[36]);    MULADD(at[9], at[35]);    MULADD(at[10], at[34]);    MULADD(at[11], at[33]);    MULADD(at[12], at[32]);    MULADD(at[13], at[31]);    MULADD(at[14], at[30]);    MULADD(at[15], at[29]);    MULADD(at[16], at[28]);    MULADD(at[17], at[27]);    MULADD(at[18], at[26]);    MULADD(at[19], at[25]);    MULADD(at[20], at[24]); 
-   COMBA_STORE(C->dp[20]);
-   /* 21 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[45]);    MULADD(at[1], at[44]);    MULADD(at[2], at[43]);    MULADD(at[3], at[42]);    MULADD(at[4], at[41]);    MULADD(at[5], at[40]);    MULADD(at[6], at[39]);    MULADD(at[7], at[38]);    MULADD(at[8], at[37]);    MULADD(at[9], at[36]);    MULADD(at[10], at[35]);    MULADD(at[11], at[34]);    MULADD(at[12], at[33]);    MULADD(at[13], at[32]);    MULADD(at[14], at[31]);    MULADD(at[15], at[30]);    MULADD(at[16], at[29]);    MULADD(at[17], at[28]);    MULADD(at[18], at[27]);    MULADD(at[19], at[26]);    MULADD(at[20], at[25]);    MULADD(at[21], at[24]); 
-   COMBA_STORE(C->dp[21]);
-   /* 22 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[46]);    MULADD(at[1], at[45]);    MULADD(at[2], at[44]);    MULADD(at[3], at[43]);    MULADD(at[4], at[42]);    MULADD(at[5], at[41]);    MULADD(at[6], at[40]);    MULADD(at[7], at[39]);    MULADD(at[8], at[38]);    MULADD(at[9], at[37]);    MULADD(at[10], at[36]);    MULADD(at[11], at[35]);    MULADD(at[12], at[34]);    MULADD(at[13], at[33]);    MULADD(at[14], at[32]);    MULADD(at[15], at[31]);    MULADD(at[16], at[30]);    MULADD(at[17], at[29]);    MULADD(at[18], at[28]);    MULADD(at[19], at[27]);    MULADD(at[20], at[26]);    MULADD(at[21], at[25]);    MULADD(at[22], at[24]); 
-   COMBA_STORE(C->dp[22]);
-   /* 23 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[47]);    MULADD(at[1], at[46]);    MULADD(at[2], at[45]);    MULADD(at[3], at[44]);    MULADD(at[4], at[43]);    MULADD(at[5], at[42]);    MULADD(at[6], at[41]);    MULADD(at[7], at[40]);    MULADD(at[8], at[39]);    MULADD(at[9], at[38]);    MULADD(at[10], at[37]);    MULADD(at[11], at[36]);    MULADD(at[12], at[35]);    MULADD(at[13], at[34]);    MULADD(at[14], at[33]);    MULADD(at[15], at[32]);    MULADD(at[16], at[31]);    MULADD(at[17], at[30]);    MULADD(at[18], at[29]);    MULADD(at[19], at[28]);    MULADD(at[20], at[27]);    MULADD(at[21], at[26]);    MULADD(at[22], at[25]);    MULADD(at[23], at[24]); 
-   COMBA_STORE(C->dp[23]);
-   /* 24 */
-   COMBA_FORWARD;
-   MULADD(at[1], at[47]);    MULADD(at[2], at[46]);    MULADD(at[3], at[45]);    MULADD(at[4], at[44]);    MULADD(at[5], at[43]);    MULADD(at[6], at[42]);    MULADD(at[7], at[41]);    MULADD(at[8], at[40]);    MULADD(at[9], at[39]);    MULADD(at[10], at[38]);    MULADD(at[11], at[37]);    MULADD(at[12], at[36]);    MULADD(at[13], at[35]);    MULADD(at[14], at[34]);    MULADD(at[15], at[33]);    MULADD(at[16], at[32]);    MULADD(at[17], at[31]);    MULADD(at[18], at[30]);    MULADD(at[19], at[29]);    MULADD(at[20], at[28]);    MULADD(at[21], at[27]);    MULADD(at[22], at[26]);    MULADD(at[23], at[25]); 
-   COMBA_STORE(C->dp[24]);
-   /* 25 */
-   COMBA_FORWARD;
-   MULADD(at[2], at[47]);    MULADD(at[3], at[46]);    MULADD(at[4], at[45]);    MULADD(at[5], at[44]);    MULADD(at[6], at[43]);    MULADD(at[7], at[42]);    MULADD(at[8], at[41]);    MULADD(at[9], at[40]);    MULADD(at[10], at[39]);    MULADD(at[11], at[38]);    MULADD(at[12], at[37]);    MULADD(at[13], at[36]);    MULADD(at[14], at[35]);    MULADD(at[15], at[34]);    MULADD(at[16], at[33]);    MULADD(at[17], at[32]);    MULADD(at[18], at[31]);    MULADD(at[19], at[30]);    MULADD(at[20], at[29]);    MULADD(at[21], at[28]);    MULADD(at[22], at[27]);    MULADD(at[23], at[26]); 
-   COMBA_STORE(C->dp[25]);
-   /* 26 */
-   COMBA_FORWARD;
-   MULADD(at[3], at[47]);    MULADD(at[4], at[46]);    MULADD(at[5], at[45]);    MULADD(at[6], at[44]);    MULADD(at[7], at[43]);    MULADD(at[8], at[42]);    MULADD(at[9], at[41]);    MULADD(at[10], at[40]);    MULADD(at[11], at[39]);    MULADD(at[12], at[38]);    MULADD(at[13], at[37]);    MULADD(at[14], at[36]);    MULADD(at[15], at[35]);    MULADD(at[16], at[34]);    MULADD(at[17], at[33]);    MULADD(at[18], at[32]);    MULADD(at[19], at[31]);    MULADD(at[20], at[30]);    MULADD(at[21], at[29]);    MULADD(at[22], at[28]);    MULADD(at[23], at[27]); 
-   COMBA_STORE(C->dp[26]);
-   /* 27 */
-   COMBA_FORWARD;
-   MULADD(at[4], at[47]);    MULADD(at[5], at[46]);    MULADD(at[6], at[45]);    MULADD(at[7], at[44]);    MULADD(at[8], at[43]);    MULADD(at[9], at[42]);    MULADD(at[10], at[41]);    MULADD(at[11], at[40]);    MULADD(at[12], at[39]);    MULADD(at[13], at[38]);    MULADD(at[14], at[37]);    MULADD(at[15], at[36]);    MULADD(at[16], at[35]);    MULADD(at[17], at[34]);    MULADD(at[18], at[33]);    MULADD(at[19], at[32]);    MULADD(at[20], at[31]);    MULADD(at[21], at[30]);    MULADD(at[22], at[29]);    MULADD(at[23], at[28]); 
-   COMBA_STORE(C->dp[27]);
-   /* 28 */
-   COMBA_FORWARD;
-   MULADD(at[5], at[47]);    MULADD(at[6], at[46]);    MULADD(at[7], at[45]);    MULADD(at[8], at[44]);    MULADD(at[9], at[43]);    MULADD(at[10], at[42]);    MULADD(at[11], at[41]);    MULADD(at[12], at[40]);    MULADD(at[13], at[39]);    MULADD(at[14], at[38]);    MULADD(at[15], at[37]);    MULADD(at[16], at[36]);    MULADD(at[17], at[35]);    MULADD(at[18], at[34]);    MULADD(at[19], at[33]);    MULADD(at[20], at[32]);    MULADD(at[21], at[31]);    MULADD(at[22], at[30]);    MULADD(at[23], at[29]); 
-   COMBA_STORE(C->dp[28]);
-   /* 29 */
-   COMBA_FORWARD;
-   MULADD(at[6], at[47]);    MULADD(at[7], at[46]);    MULADD(at[8], at[45]);    MULADD(at[9], at[44]);    MULADD(at[10], at[43]);    MULADD(at[11], at[42]);    MULADD(at[12], at[41]);    MULADD(at[13], at[40]);    MULADD(at[14], at[39]);    MULADD(at[15], at[38]);    MULADD(at[16], at[37]);    MULADD(at[17], at[36]);    MULADD(at[18], at[35]);    MULADD(at[19], at[34]);    MULADD(at[20], at[33]);    MULADD(at[21], at[32]);    MULADD(at[22], at[31]);    MULADD(at[23], at[30]); 
-   COMBA_STORE(C->dp[29]);
-   /* 30 */
-   COMBA_FORWARD;
-   MULADD(at[7], at[47]);    MULADD(at[8], at[46]);    MULADD(at[9], at[45]);    MULADD(at[10], at[44]);    MULADD(at[11], at[43]);    MULADD(at[12], at[42]);    MULADD(at[13], at[41]);    MULADD(at[14], at[40]);    MULADD(at[15], at[39]);    MULADD(at[16], at[38]);    MULADD(at[17], at[37]);    MULADD(at[18], at[36]);    MULADD(at[19], at[35]);    MULADD(at[20], at[34]);    MULADD(at[21], at[33]);    MULADD(at[22], at[32]);    MULADD(at[23], at[31]); 
-   COMBA_STORE(C->dp[30]);
-   /* 31 */
-   COMBA_FORWARD;
-   MULADD(at[8], at[47]);    MULADD(at[9], at[46]);    MULADD(at[10], at[45]);    MULADD(at[11], at[44]);    MULADD(at[12], at[43]);    MULADD(at[13], at[42]);    MULADD(at[14], at[41]);    MULADD(at[15], at[40]);    MULADD(at[16], at[39]);    MULADD(at[17], at[38]);    MULADD(at[18], at[37]);    MULADD(at[19], at[36]);    MULADD(at[20], at[35]);    MULADD(at[21], at[34]);    MULADD(at[22], at[33]);    MULADD(at[23], at[32]); 
-   COMBA_STORE(C->dp[31]);
-   /* 32 */
-   COMBA_FORWARD;
-   MULADD(at[9], at[47]);    MULADD(at[10], at[46]);    MULADD(at[11], at[45]);    MULADD(at[12], at[44]);    MULADD(at[13], at[43]);    MULADD(at[14], at[42]);    MULADD(at[15], at[41]);    MULADD(at[16], at[40]);    MULADD(at[17], at[39]);    MULADD(at[18], at[38]);    MULADD(at[19], at[37]);    MULADD(at[20], at[36]);    MULADD(at[21], at[35]);    MULADD(at[22], at[34]);    MULADD(at[23], at[33]); 
-   COMBA_STORE(C->dp[32]);
-   /* 33 */
-   COMBA_FORWARD;
-   MULADD(at[10], at[47]);    MULADD(at[11], at[46]);    MULADD(at[12], at[45]);    MULADD(at[13], at[44]);    MULADD(at[14], at[43]);    MULADD(at[15], at[42]);    MULADD(at[16], at[41]);    MULADD(at[17], at[40]);    MULADD(at[18], at[39]);    MULADD(at[19], at[38]);    MULADD(at[20], at[37]);    MULADD(at[21], at[36]);    MULADD(at[22], at[35]);    MULADD(at[23], at[34]); 
-   COMBA_STORE(C->dp[33]);
-   /* 34 */
-   COMBA_FORWARD;
-   MULADD(at[11], at[47]);    MULADD(at[12], at[46]);    MULADD(at[13], at[45]);    MULADD(at[14], at[44]);    MULADD(at[15], at[43]);    MULADD(at[16], at[42]);    MULADD(at[17], at[41]);    MULADD(at[18], at[40]);    MULADD(at[19], at[39]);    MULADD(at[20], at[38]);    MULADD(at[21], at[37]);    MULADD(at[22], at[36]);    MULADD(at[23], at[35]); 
-   COMBA_STORE(C->dp[34]);
-   /* 35 */
-   COMBA_FORWARD;
-   MULADD(at[12], at[47]);    MULADD(at[13], at[46]);    MULADD(at[14], at[45]);    MULADD(at[15], at[44]);    MULADD(at[16], at[43]);    MULADD(at[17], at[42]);    MULADD(at[18], at[41]);    MULADD(at[19], at[40]);    MULADD(at[20], at[39]);    MULADD(at[21], at[38]);    MULADD(at[22], at[37]);    MULADD(at[23], at[36]); 
-   COMBA_STORE(C->dp[35]);
-   /* 36 */
-   COMBA_FORWARD;
-   MULADD(at[13], at[47]);    MULADD(at[14], at[46]);    MULADD(at[15], at[45]);    MULADD(at[16], at[44]);    MULADD(at[17], at[43]);    MULADD(at[18], at[42]);    MULADD(at[19], at[41]);    MULADD(at[20], at[40]);    MULADD(at[21], at[39]);    MULADD(at[22], at[38]);    MULADD(at[23], at[37]); 
-   COMBA_STORE(C->dp[36]);
-   /* 37 */
-   COMBA_FORWARD;
-   MULADD(at[14], at[47]);    MULADD(at[15], at[46]);    MULADD(at[16], at[45]);    MULADD(at[17], at[44]);    MULADD(at[18], at[43]);    MULADD(at[19], at[42]);    MULADD(at[20], at[41]);    MULADD(at[21], at[40]);    MULADD(at[22], at[39]);    MULADD(at[23], at[38]); 
-   COMBA_STORE(C->dp[37]);
-   /* 38 */
-   COMBA_FORWARD;
-   MULADD(at[15], at[47]);    MULADD(at[16], at[46]);    MULADD(at[17], at[45]);    MULADD(at[18], at[44]);    MULADD(at[19], at[43]);    MULADD(at[20], at[42]);    MULADD(at[21], at[41]);    MULADD(at[22], at[40]);    MULADD(at[23], at[39]); 
-   COMBA_STORE(C->dp[38]);
-   /* 39 */
-   COMBA_FORWARD;
-   MULADD(at[16], at[47]);    MULADD(at[17], at[46]);    MULADD(at[18], at[45]);    MULADD(at[19], at[44]);    MULADD(at[20], at[43]);    MULADD(at[21], at[42]);    MULADD(at[22], at[41]);    MULADD(at[23], at[40]); 
-   COMBA_STORE(C->dp[39]);
-   /* 40 */
-   COMBA_FORWARD;
-   MULADD(at[17], at[47]);    MULADD(at[18], at[46]);    MULADD(at[19], at[45]);    MULADD(at[20], at[44]);    MULADD(at[21], at[43]);    MULADD(at[22], at[42]);    MULADD(at[23], at[41]); 
-   COMBA_STORE(C->dp[40]);
-   /* 41 */
-   COMBA_FORWARD;
-   MULADD(at[18], at[47]);    MULADD(at[19], at[46]);    MULADD(at[20], at[45]);    MULADD(at[21], at[44]);    MULADD(at[22], at[43]);    MULADD(at[23], at[42]); 
-   COMBA_STORE(C->dp[41]);
-   /* 42 */
-   COMBA_FORWARD;
-   MULADD(at[19], at[47]);    MULADD(at[20], at[46]);    MULADD(at[21], at[45]);    MULADD(at[22], at[44]);    MULADD(at[23], at[43]); 
-   COMBA_STORE(C->dp[42]);
-   /* 43 */
-   COMBA_FORWARD;
-   MULADD(at[20], at[47]);    MULADD(at[21], at[46]);    MULADD(at[22], at[45]);    MULADD(at[23], at[44]); 
-   COMBA_STORE(C->dp[43]);
-   /* 44 */
-   COMBA_FORWARD;
-   MULADD(at[21], at[47]);    MULADD(at[22], at[46]);    MULADD(at[23], at[45]); 
-   COMBA_STORE(C->dp[44]);
-   /* 45 */
-   COMBA_FORWARD;
-   MULADD(at[22], at[47]);    MULADD(at[23], at[46]); 
-   COMBA_STORE(C->dp[45]);
-   /* 46 */
-   COMBA_FORWARD;
-   MULADD(at[23], at[47]); 
-   COMBA_STORE(C->dp[46]);
-   COMBA_STORE2(C->dp[47]);
-   C->used = 48;
-   C->sign = A->sign ^ B->sign;
-   fp_clamp(C);
-   COMBA_FINI;
-
-#ifdef WOLFSSL_SMALL_STACK
-   XFREE(at, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-#endif
-   return FP_OKAY;
-}
-#endif
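
As a quick sanity check of the column scheme, worked in base 10 with a single carry word for simplicity (the real code carries through c1 and c2): take A = (3, 2), i.e. 23, and B = (5, 4), i.e. 45, least significant digit first.

    column 0:  3*5           = 15  -> store 5, carry 1
    column 1:  3*4 + 2*5 + 1 = 23  -> store 3, carry 2
    column 2:  2*4 + 2       = 10  -> store 0, carry 1
    final carry store              -> store 1

The result digits (5, 3, 0, 1) read as 1035 = 23 * 45. Each /* k */ block in the files above is one such column, and COMBA_STORE2 is the final carry store.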

+ 0 - 275
lib/wolfssl/wolfcrypt/src/fp_mul_comba_28.i

@@ -1,275 +0,0 @@
-/* fp_mul_comba_28.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#ifdef TFM_MUL28
-int fp_mul_comba28(fp_int *A, fp_int *B, fp_int *C)
-{
-   fp_digit c0, c1, c2;
-#ifndef WOLFSSL_SMALL_STACK
-   fp_digit at[56];
-#else
-   fp_digit *at;
-#endif
-
-#ifdef WOLFSSL_SMALL_STACK
-   at = (fp_digit*)XMALLOC(sizeof(fp_digit) * 56, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-   if (at == NULL)
-       return FP_MEM;
-#endif
-
-   XMEMCPY(at, A->dp, 28 * sizeof(fp_digit));
-   XMEMCPY(at+28, B->dp, 28 * sizeof(fp_digit));
-   COMBA_START;
-
-   COMBA_CLEAR;
-   /* 0 */
-   MULADD(at[0], at[28]); 
-   COMBA_STORE(C->dp[0]);
-   /* 1 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[29]);    MULADD(at[1], at[28]); 
-   COMBA_STORE(C->dp[1]);
-   /* 2 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[30]);    MULADD(at[1], at[29]);    MULADD(at[2], at[28]); 
-   COMBA_STORE(C->dp[2]);
-   /* 3 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[31]);    MULADD(at[1], at[30]);    MULADD(at[2], at[29]);    MULADD(at[3], at[28]); 
-   COMBA_STORE(C->dp[3]);
-   /* 4 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[32]);    MULADD(at[1], at[31]);    MULADD(at[2], at[30]);    MULADD(at[3], at[29]);    MULADD(at[4], at[28]); 
-   COMBA_STORE(C->dp[4]);
-   /* 5 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[33]);    MULADD(at[1], at[32]);    MULADD(at[2], at[31]);    MULADD(at[3], at[30]);    MULADD(at[4], at[29]);    MULADD(at[5], at[28]); 
-   COMBA_STORE(C->dp[5]);
-   /* 6 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[34]);    MULADD(at[1], at[33]);    MULADD(at[2], at[32]);    MULADD(at[3], at[31]);    MULADD(at[4], at[30]);    MULADD(at[5], at[29]);    MULADD(at[6], at[28]); 
-   COMBA_STORE(C->dp[6]);
-   /* 7 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[35]);    MULADD(at[1], at[34]);    MULADD(at[2], at[33]);    MULADD(at[3], at[32]);    MULADD(at[4], at[31]);    MULADD(at[5], at[30]);    MULADD(at[6], at[29]);    MULADD(at[7], at[28]); 
-   COMBA_STORE(C->dp[7]);
-   /* 8 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[36]);    MULADD(at[1], at[35]);    MULADD(at[2], at[34]);    MULADD(at[3], at[33]);    MULADD(at[4], at[32]);    MULADD(at[5], at[31]);    MULADD(at[6], at[30]);    MULADD(at[7], at[29]);    MULADD(at[8], at[28]); 
-   COMBA_STORE(C->dp[8]);
-   /* 9 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[37]);    MULADD(at[1], at[36]);    MULADD(at[2], at[35]);    MULADD(at[3], at[34]);    MULADD(at[4], at[33]);    MULADD(at[5], at[32]);    MULADD(at[6], at[31]);    MULADD(at[7], at[30]);    MULADD(at[8], at[29]);    MULADD(at[9], at[28]); 
-   COMBA_STORE(C->dp[9]);
-   /* 10 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[38]);    MULADD(at[1], at[37]);    MULADD(at[2], at[36]);    MULADD(at[3], at[35]);    MULADD(at[4], at[34]);    MULADD(at[5], at[33]);    MULADD(at[6], at[32]);    MULADD(at[7], at[31]);    MULADD(at[8], at[30]);    MULADD(at[9], at[29]);    MULADD(at[10], at[28]); 
-   COMBA_STORE(C->dp[10]);
-   /* 11 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[39]);    MULADD(at[1], at[38]);    MULADD(at[2], at[37]);    MULADD(at[3], at[36]);    MULADD(at[4], at[35]);    MULADD(at[5], at[34]);    MULADD(at[6], at[33]);    MULADD(at[7], at[32]);    MULADD(at[8], at[31]);    MULADD(at[9], at[30]);    MULADD(at[10], at[29]);    MULADD(at[11], at[28]); 
-   COMBA_STORE(C->dp[11]);
-   /* 12 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[40]);    MULADD(at[1], at[39]);    MULADD(at[2], at[38]);    MULADD(at[3], at[37]);    MULADD(at[4], at[36]);    MULADD(at[5], at[35]);    MULADD(at[6], at[34]);    MULADD(at[7], at[33]);    MULADD(at[8], at[32]);    MULADD(at[9], at[31]);    MULADD(at[10], at[30]);    MULADD(at[11], at[29]);    MULADD(at[12], at[28]); 
-   COMBA_STORE(C->dp[12]);
-   /* 13 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[41]);    MULADD(at[1], at[40]);    MULADD(at[2], at[39]);    MULADD(at[3], at[38]);    MULADD(at[4], at[37]);    MULADD(at[5], at[36]);    MULADD(at[6], at[35]);    MULADD(at[7], at[34]);    MULADD(at[8], at[33]);    MULADD(at[9], at[32]);    MULADD(at[10], at[31]);    MULADD(at[11], at[30]);    MULADD(at[12], at[29]);    MULADD(at[13], at[28]); 
-   COMBA_STORE(C->dp[13]);
-   /* 14 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[42]);    MULADD(at[1], at[41]);    MULADD(at[2], at[40]);    MULADD(at[3], at[39]);    MULADD(at[4], at[38]);    MULADD(at[5], at[37]);    MULADD(at[6], at[36]);    MULADD(at[7], at[35]);    MULADD(at[8], at[34]);    MULADD(at[9], at[33]);    MULADD(at[10], at[32]);    MULADD(at[11], at[31]);    MULADD(at[12], at[30]);    MULADD(at[13], at[29]);    MULADD(at[14], at[28]); 
-   COMBA_STORE(C->dp[14]);
-   /* 15 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[43]);    MULADD(at[1], at[42]);    MULADD(at[2], at[41]);    MULADD(at[3], at[40]);    MULADD(at[4], at[39]);    MULADD(at[5], at[38]);    MULADD(at[6], at[37]);    MULADD(at[7], at[36]);    MULADD(at[8], at[35]);    MULADD(at[9], at[34]);    MULADD(at[10], at[33]);    MULADD(at[11], at[32]);    MULADD(at[12], at[31]);    MULADD(at[13], at[30]);    MULADD(at[14], at[29]);    MULADD(at[15], at[28]); 
-   COMBA_STORE(C->dp[15]);
-   /* 16 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[44]);    MULADD(at[1], at[43]);    MULADD(at[2], at[42]);    MULADD(at[3], at[41]);    MULADD(at[4], at[40]);    MULADD(at[5], at[39]);    MULADD(at[6], at[38]);    MULADD(at[7], at[37]);    MULADD(at[8], at[36]);    MULADD(at[9], at[35]);    MULADD(at[10], at[34]);    MULADD(at[11], at[33]);    MULADD(at[12], at[32]);    MULADD(at[13], at[31]);    MULADD(at[14], at[30]);    MULADD(at[15], at[29]);    MULADD(at[16], at[28]); 
-   COMBA_STORE(C->dp[16]);
-   /* 17 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[45]);    MULADD(at[1], at[44]);    MULADD(at[2], at[43]);    MULADD(at[3], at[42]);    MULADD(at[4], at[41]);    MULADD(at[5], at[40]);    MULADD(at[6], at[39]);    MULADD(at[7], at[38]);    MULADD(at[8], at[37]);    MULADD(at[9], at[36]);    MULADD(at[10], at[35]);    MULADD(at[11], at[34]);    MULADD(at[12], at[33]);    MULADD(at[13], at[32]);    MULADD(at[14], at[31]);    MULADD(at[15], at[30]);    MULADD(at[16], at[29]);    MULADD(at[17], at[28]); 
-   COMBA_STORE(C->dp[17]);
-   /* 18 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[46]);    MULADD(at[1], at[45]);    MULADD(at[2], at[44]);    MULADD(at[3], at[43]);    MULADD(at[4], at[42]);    MULADD(at[5], at[41]);    MULADD(at[6], at[40]);    MULADD(at[7], at[39]);    MULADD(at[8], at[38]);    MULADD(at[9], at[37]);    MULADD(at[10], at[36]);    MULADD(at[11], at[35]);    MULADD(at[12], at[34]);    MULADD(at[13], at[33]);    MULADD(at[14], at[32]);    MULADD(at[15], at[31]);    MULADD(at[16], at[30]);    MULADD(at[17], at[29]);    MULADD(at[18], at[28]); 
-   COMBA_STORE(C->dp[18]);
-   /* 19 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[47]);    MULADD(at[1], at[46]);    MULADD(at[2], at[45]);    MULADD(at[3], at[44]);    MULADD(at[4], at[43]);    MULADD(at[5], at[42]);    MULADD(at[6], at[41]);    MULADD(at[7], at[40]);    MULADD(at[8], at[39]);    MULADD(at[9], at[38]);    MULADD(at[10], at[37]);    MULADD(at[11], at[36]);    MULADD(at[12], at[35]);    MULADD(at[13], at[34]);    MULADD(at[14], at[33]);    MULADD(at[15], at[32]);    MULADD(at[16], at[31]);    MULADD(at[17], at[30]);    MULADD(at[18], at[29]);    MULADD(at[19], at[28]); 
-   COMBA_STORE(C->dp[19]);
-   /* 20 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[48]);    MULADD(at[1], at[47]);    MULADD(at[2], at[46]);    MULADD(at[3], at[45]);    MULADD(at[4], at[44]);    MULADD(at[5], at[43]);    MULADD(at[6], at[42]);    MULADD(at[7], at[41]);    MULADD(at[8], at[40]);    MULADD(at[9], at[39]);    MULADD(at[10], at[38]);    MULADD(at[11], at[37]);    MULADD(at[12], at[36]);    MULADD(at[13], at[35]);    MULADD(at[14], at[34]);    MULADD(at[15], at[33]);    MULADD(at[16], at[32]);    MULADD(at[17], at[31]);    MULADD(at[18], at[30]);    MULADD(at[19], at[29]);    MULADD(at[20], at[28]); 
-   COMBA_STORE(C->dp[20]);
-   /* 21 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[49]);    MULADD(at[1], at[48]);    MULADD(at[2], at[47]);    MULADD(at[3], at[46]);    MULADD(at[4], at[45]);    MULADD(at[5], at[44]);    MULADD(at[6], at[43]);    MULADD(at[7], at[42]);    MULADD(at[8], at[41]);    MULADD(at[9], at[40]);    MULADD(at[10], at[39]);    MULADD(at[11], at[38]);    MULADD(at[12], at[37]);    MULADD(at[13], at[36]);    MULADD(at[14], at[35]);    MULADD(at[15], at[34]);    MULADD(at[16], at[33]);    MULADD(at[17], at[32]);    MULADD(at[18], at[31]);    MULADD(at[19], at[30]);    MULADD(at[20], at[29]);    MULADD(at[21], at[28]); 
-   COMBA_STORE(C->dp[21]);
-   /* 22 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[50]);    MULADD(at[1], at[49]);    MULADD(at[2], at[48]);    MULADD(at[3], at[47]);    MULADD(at[4], at[46]);    MULADD(at[5], at[45]);    MULADD(at[6], at[44]);    MULADD(at[7], at[43]);    MULADD(at[8], at[42]);    MULADD(at[9], at[41]);    MULADD(at[10], at[40]);    MULADD(at[11], at[39]);    MULADD(at[12], at[38]);    MULADD(at[13], at[37]);    MULADD(at[14], at[36]);    MULADD(at[15], at[35]);    MULADD(at[16], at[34]);    MULADD(at[17], at[33]);    MULADD(at[18], at[32]);    MULADD(at[19], at[31]);    MULADD(at[20], at[30]);    MULADD(at[21], at[29]);    MULADD(at[22], at[28]); 
-   COMBA_STORE(C->dp[22]);
-   /* 23 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[51]);    MULADD(at[1], at[50]);    MULADD(at[2], at[49]);    MULADD(at[3], at[48]);    MULADD(at[4], at[47]);    MULADD(at[5], at[46]);    MULADD(at[6], at[45]);    MULADD(at[7], at[44]);    MULADD(at[8], at[43]);    MULADD(at[9], at[42]);    MULADD(at[10], at[41]);    MULADD(at[11], at[40]);    MULADD(at[12], at[39]);    MULADD(at[13], at[38]);    MULADD(at[14], at[37]);    MULADD(at[15], at[36]);    MULADD(at[16], at[35]);    MULADD(at[17], at[34]);    MULADD(at[18], at[33]);    MULADD(at[19], at[32]);    MULADD(at[20], at[31]);    MULADD(at[21], at[30]);    MULADD(at[22], at[29]);    MULADD(at[23], at[28]); 
-   COMBA_STORE(C->dp[23]);
-   /* 24 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[52]);    MULADD(at[1], at[51]);    MULADD(at[2], at[50]);    MULADD(at[3], at[49]);    MULADD(at[4], at[48]);    MULADD(at[5], at[47]);    MULADD(at[6], at[46]);    MULADD(at[7], at[45]);    MULADD(at[8], at[44]);    MULADD(at[9], at[43]);    MULADD(at[10], at[42]);    MULADD(at[11], at[41]);    MULADD(at[12], at[40]);    MULADD(at[13], at[39]);    MULADD(at[14], at[38]);    MULADD(at[15], at[37]);    MULADD(at[16], at[36]);    MULADD(at[17], at[35]);    MULADD(at[18], at[34]);    MULADD(at[19], at[33]);    MULADD(at[20], at[32]);    MULADD(at[21], at[31]);    MULADD(at[22], at[30]);    MULADD(at[23], at[29]);    MULADD(at[24], at[28]); 
-   COMBA_STORE(C->dp[24]);
-   /* 25 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[53]);    MULADD(at[1], at[52]);    MULADD(at[2], at[51]);    MULADD(at[3], at[50]);    MULADD(at[4], at[49]);    MULADD(at[5], at[48]);    MULADD(at[6], at[47]);    MULADD(at[7], at[46]);    MULADD(at[8], at[45]);    MULADD(at[9], at[44]);    MULADD(at[10], at[43]);    MULADD(at[11], at[42]);    MULADD(at[12], at[41]);    MULADD(at[13], at[40]);    MULADD(at[14], at[39]);    MULADD(at[15], at[38]);    MULADD(at[16], at[37]);    MULADD(at[17], at[36]);    MULADD(at[18], at[35]);    MULADD(at[19], at[34]);    MULADD(at[20], at[33]);    MULADD(at[21], at[32]);    MULADD(at[22], at[31]);    MULADD(at[23], at[30]);    MULADD(at[24], at[29]);    MULADD(at[25], at[28]); 
-   COMBA_STORE(C->dp[25]);
-   /* 26 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[54]);    MULADD(at[1], at[53]);    MULADD(at[2], at[52]);    MULADD(at[3], at[51]);    MULADD(at[4], at[50]);    MULADD(at[5], at[49]);    MULADD(at[6], at[48]);    MULADD(at[7], at[47]);    MULADD(at[8], at[46]);    MULADD(at[9], at[45]);    MULADD(at[10], at[44]);    MULADD(at[11], at[43]);    MULADD(at[12], at[42]);    MULADD(at[13], at[41]);    MULADD(at[14], at[40]);    MULADD(at[15], at[39]);    MULADD(at[16], at[38]);    MULADD(at[17], at[37]);    MULADD(at[18], at[36]);    MULADD(at[19], at[35]);    MULADD(at[20], at[34]);    MULADD(at[21], at[33]);    MULADD(at[22], at[32]);    MULADD(at[23], at[31]);    MULADD(at[24], at[30]);    MULADD(at[25], at[29]);    MULADD(at[26], at[28]); 
-   COMBA_STORE(C->dp[26]);
-   /* 27 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[55]);    MULADD(at[1], at[54]);    MULADD(at[2], at[53]);    MULADD(at[3], at[52]);    MULADD(at[4], at[51]);    MULADD(at[5], at[50]);    MULADD(at[6], at[49]);    MULADD(at[7], at[48]);    MULADD(at[8], at[47]);    MULADD(at[9], at[46]);    MULADD(at[10], at[45]);    MULADD(at[11], at[44]);    MULADD(at[12], at[43]);    MULADD(at[13], at[42]);    MULADD(at[14], at[41]);    MULADD(at[15], at[40]);    MULADD(at[16], at[39]);    MULADD(at[17], at[38]);    MULADD(at[18], at[37]);    MULADD(at[19], at[36]);    MULADD(at[20], at[35]);    MULADD(at[21], at[34]);    MULADD(at[22], at[33]);    MULADD(at[23], at[32]);    MULADD(at[24], at[31]);    MULADD(at[25], at[30]);    MULADD(at[26], at[29]);    MULADD(at[27], at[28]); 
-   COMBA_STORE(C->dp[27]);
-   /* 28 */
-   COMBA_FORWARD;
-   MULADD(at[1], at[55]);    MULADD(at[2], at[54]);    MULADD(at[3], at[53]);    MULADD(at[4], at[52]);    MULADD(at[5], at[51]);    MULADD(at[6], at[50]);    MULADD(at[7], at[49]);    MULADD(at[8], at[48]);    MULADD(at[9], at[47]);    MULADD(at[10], at[46]);    MULADD(at[11], at[45]);    MULADD(at[12], at[44]);    MULADD(at[13], at[43]);    MULADD(at[14], at[42]);    MULADD(at[15], at[41]);    MULADD(at[16], at[40]);    MULADD(at[17], at[39]);    MULADD(at[18], at[38]);    MULADD(at[19], at[37]);    MULADD(at[20], at[36]);    MULADD(at[21], at[35]);    MULADD(at[22], at[34]);    MULADD(at[23], at[33]);    MULADD(at[24], at[32]);    MULADD(at[25], at[31]);    MULADD(at[26], at[30]);    MULADD(at[27], at[29]); 
-   COMBA_STORE(C->dp[28]);
-   /* 29 */
-   COMBA_FORWARD;
-   MULADD(at[2], at[55]);    MULADD(at[3], at[54]);    MULADD(at[4], at[53]);    MULADD(at[5], at[52]);    MULADD(at[6], at[51]);    MULADD(at[7], at[50]);    MULADD(at[8], at[49]);    MULADD(at[9], at[48]);    MULADD(at[10], at[47]);    MULADD(at[11], at[46]);    MULADD(at[12], at[45]);    MULADD(at[13], at[44]);    MULADD(at[14], at[43]);    MULADD(at[15], at[42]);    MULADD(at[16], at[41]);    MULADD(at[17], at[40]);    MULADD(at[18], at[39]);    MULADD(at[19], at[38]);    MULADD(at[20], at[37]);    MULADD(at[21], at[36]);    MULADD(at[22], at[35]);    MULADD(at[23], at[34]);    MULADD(at[24], at[33]);    MULADD(at[25], at[32]);    MULADD(at[26], at[31]);    MULADD(at[27], at[30]); 
-   COMBA_STORE(C->dp[29]);
-   /* 30 */
-   COMBA_FORWARD;
-   MULADD(at[3], at[55]);    MULADD(at[4], at[54]);    MULADD(at[5], at[53]);    MULADD(at[6], at[52]);    MULADD(at[7], at[51]);    MULADD(at[8], at[50]);    MULADD(at[9], at[49]);    MULADD(at[10], at[48]);    MULADD(at[11], at[47]);    MULADD(at[12], at[46]);    MULADD(at[13], at[45]);    MULADD(at[14], at[44]);    MULADD(at[15], at[43]);    MULADD(at[16], at[42]);    MULADD(at[17], at[41]);    MULADD(at[18], at[40]);    MULADD(at[19], at[39]);    MULADD(at[20], at[38]);    MULADD(at[21], at[37]);    MULADD(at[22], at[36]);    MULADD(at[23], at[35]);    MULADD(at[24], at[34]);    MULADD(at[25], at[33]);    MULADD(at[26], at[32]);    MULADD(at[27], at[31]); 
-   COMBA_STORE(C->dp[30]);
-   /* 31 */
-   COMBA_FORWARD;
-   MULADD(at[4], at[55]);    MULADD(at[5], at[54]);    MULADD(at[6], at[53]);    MULADD(at[7], at[52]);    MULADD(at[8], at[51]);    MULADD(at[9], at[50]);    MULADD(at[10], at[49]);    MULADD(at[11], at[48]);    MULADD(at[12], at[47]);    MULADD(at[13], at[46]);    MULADD(at[14], at[45]);    MULADD(at[15], at[44]);    MULADD(at[16], at[43]);    MULADD(at[17], at[42]);    MULADD(at[18], at[41]);    MULADD(at[19], at[40]);    MULADD(at[20], at[39]);    MULADD(at[21], at[38]);    MULADD(at[22], at[37]);    MULADD(at[23], at[36]);    MULADD(at[24], at[35]);    MULADD(at[25], at[34]);    MULADD(at[26], at[33]);    MULADD(at[27], at[32]); 
-   COMBA_STORE(C->dp[31]);
-   /* 32 */
-   COMBA_FORWARD;
-   MULADD(at[5], at[55]);    MULADD(at[6], at[54]);    MULADD(at[7], at[53]);    MULADD(at[8], at[52]);    MULADD(at[9], at[51]);    MULADD(at[10], at[50]);    MULADD(at[11], at[49]);    MULADD(at[12], at[48]);    MULADD(at[13], at[47]);    MULADD(at[14], at[46]);    MULADD(at[15], at[45]);    MULADD(at[16], at[44]);    MULADD(at[17], at[43]);    MULADD(at[18], at[42]);    MULADD(at[19], at[41]);    MULADD(at[20], at[40]);    MULADD(at[21], at[39]);    MULADD(at[22], at[38]);    MULADD(at[23], at[37]);    MULADD(at[24], at[36]);    MULADD(at[25], at[35]);    MULADD(at[26], at[34]);    MULADD(at[27], at[33]); 
-   COMBA_STORE(C->dp[32]);
-   /* 33 */
-   COMBA_FORWARD;
-   MULADD(at[6], at[55]);    MULADD(at[7], at[54]);    MULADD(at[8], at[53]);    MULADD(at[9], at[52]);    MULADD(at[10], at[51]);    MULADD(at[11], at[50]);    MULADD(at[12], at[49]);    MULADD(at[13], at[48]);    MULADD(at[14], at[47]);    MULADD(at[15], at[46]);    MULADD(at[16], at[45]);    MULADD(at[17], at[44]);    MULADD(at[18], at[43]);    MULADD(at[19], at[42]);    MULADD(at[20], at[41]);    MULADD(at[21], at[40]);    MULADD(at[22], at[39]);    MULADD(at[23], at[38]);    MULADD(at[24], at[37]);    MULADD(at[25], at[36]);    MULADD(at[26], at[35]);    MULADD(at[27], at[34]); 
-   COMBA_STORE(C->dp[33]);
-   /* 34 */
-   COMBA_FORWARD;
-   MULADD(at[7], at[55]);    MULADD(at[8], at[54]);    MULADD(at[9], at[53]);    MULADD(at[10], at[52]);    MULADD(at[11], at[51]);    MULADD(at[12], at[50]);    MULADD(at[13], at[49]);    MULADD(at[14], at[48]);    MULADD(at[15], at[47]);    MULADD(at[16], at[46]);    MULADD(at[17], at[45]);    MULADD(at[18], at[44]);    MULADD(at[19], at[43]);    MULADD(at[20], at[42]);    MULADD(at[21], at[41]);    MULADD(at[22], at[40]);    MULADD(at[23], at[39]);    MULADD(at[24], at[38]);    MULADD(at[25], at[37]);    MULADD(at[26], at[36]);    MULADD(at[27], at[35]); 
-   COMBA_STORE(C->dp[34]);
-   /* 35 */
-   COMBA_FORWARD;
-   MULADD(at[8], at[55]);    MULADD(at[9], at[54]);    MULADD(at[10], at[53]);    MULADD(at[11], at[52]);    MULADD(at[12], at[51]);    MULADD(at[13], at[50]);    MULADD(at[14], at[49]);    MULADD(at[15], at[48]);    MULADD(at[16], at[47]);    MULADD(at[17], at[46]);    MULADD(at[18], at[45]);    MULADD(at[19], at[44]);    MULADD(at[20], at[43]);    MULADD(at[21], at[42]);    MULADD(at[22], at[41]);    MULADD(at[23], at[40]);    MULADD(at[24], at[39]);    MULADD(at[25], at[38]);    MULADD(at[26], at[37]);    MULADD(at[27], at[36]); 
-   COMBA_STORE(C->dp[35]);
-   /* 36 */
-   COMBA_FORWARD;
-   MULADD(at[9], at[55]);    MULADD(at[10], at[54]);    MULADD(at[11], at[53]);    MULADD(at[12], at[52]);    MULADD(at[13], at[51]);    MULADD(at[14], at[50]);    MULADD(at[15], at[49]);    MULADD(at[16], at[48]);    MULADD(at[17], at[47]);    MULADD(at[18], at[46]);    MULADD(at[19], at[45]);    MULADD(at[20], at[44]);    MULADD(at[21], at[43]);    MULADD(at[22], at[42]);    MULADD(at[23], at[41]);    MULADD(at[24], at[40]);    MULADD(at[25], at[39]);    MULADD(at[26], at[38]);    MULADD(at[27], at[37]); 
-   COMBA_STORE(C->dp[36]);
-   /* 37 */
-   COMBA_FORWARD;
-   MULADD(at[10], at[55]);    MULADD(at[11], at[54]);    MULADD(at[12], at[53]);    MULADD(at[13], at[52]);    MULADD(at[14], at[51]);    MULADD(at[15], at[50]);    MULADD(at[16], at[49]);    MULADD(at[17], at[48]);    MULADD(at[18], at[47]);    MULADD(at[19], at[46]);    MULADD(at[20], at[45]);    MULADD(at[21], at[44]);    MULADD(at[22], at[43]);    MULADD(at[23], at[42]);    MULADD(at[24], at[41]);    MULADD(at[25], at[40]);    MULADD(at[26], at[39]);    MULADD(at[27], at[38]); 
-   COMBA_STORE(C->dp[37]);
-   /* 38 */
-   COMBA_FORWARD;
-   MULADD(at[11], at[55]);    MULADD(at[12], at[54]);    MULADD(at[13], at[53]);    MULADD(at[14], at[52]);    MULADD(at[15], at[51]);    MULADD(at[16], at[50]);    MULADD(at[17], at[49]);    MULADD(at[18], at[48]);    MULADD(at[19], at[47]);    MULADD(at[20], at[46]);    MULADD(at[21], at[45]);    MULADD(at[22], at[44]);    MULADD(at[23], at[43]);    MULADD(at[24], at[42]);    MULADD(at[25], at[41]);    MULADD(at[26], at[40]);    MULADD(at[27], at[39]); 
-   COMBA_STORE(C->dp[38]);
-   /* 39 */
-   COMBA_FORWARD;
-   MULADD(at[12], at[55]);    MULADD(at[13], at[54]);    MULADD(at[14], at[53]);    MULADD(at[15], at[52]);    MULADD(at[16], at[51]);    MULADD(at[17], at[50]);    MULADD(at[18], at[49]);    MULADD(at[19], at[48]);    MULADD(at[20], at[47]);    MULADD(at[21], at[46]);    MULADD(at[22], at[45]);    MULADD(at[23], at[44]);    MULADD(at[24], at[43]);    MULADD(at[25], at[42]);    MULADD(at[26], at[41]);    MULADD(at[27], at[40]); 
-   COMBA_STORE(C->dp[39]);
-   /* 40 */
-   COMBA_FORWARD;
-   MULADD(at[13], at[55]);    MULADD(at[14], at[54]);    MULADD(at[15], at[53]);    MULADD(at[16], at[52]);    MULADD(at[17], at[51]);    MULADD(at[18], at[50]);    MULADD(at[19], at[49]);    MULADD(at[20], at[48]);    MULADD(at[21], at[47]);    MULADD(at[22], at[46]);    MULADD(at[23], at[45]);    MULADD(at[24], at[44]);    MULADD(at[25], at[43]);    MULADD(at[26], at[42]);    MULADD(at[27], at[41]); 
-   COMBA_STORE(C->dp[40]);
-   /* 41 */
-   COMBA_FORWARD;
-   MULADD(at[14], at[55]);    MULADD(at[15], at[54]);    MULADD(at[16], at[53]);    MULADD(at[17], at[52]);    MULADD(at[18], at[51]);    MULADD(at[19], at[50]);    MULADD(at[20], at[49]);    MULADD(at[21], at[48]);    MULADD(at[22], at[47]);    MULADD(at[23], at[46]);    MULADD(at[24], at[45]);    MULADD(at[25], at[44]);    MULADD(at[26], at[43]);    MULADD(at[27], at[42]); 
-   COMBA_STORE(C->dp[41]);
-   /* 42 */
-   COMBA_FORWARD;
-   MULADD(at[15], at[55]);    MULADD(at[16], at[54]);    MULADD(at[17], at[53]);    MULADD(at[18], at[52]);    MULADD(at[19], at[51]);    MULADD(at[20], at[50]);    MULADD(at[21], at[49]);    MULADD(at[22], at[48]);    MULADD(at[23], at[47]);    MULADD(at[24], at[46]);    MULADD(at[25], at[45]);    MULADD(at[26], at[44]);    MULADD(at[27], at[43]); 
-   COMBA_STORE(C->dp[42]);
-   /* 43 */
-   COMBA_FORWARD;
-   MULADD(at[16], at[55]);    MULADD(at[17], at[54]);    MULADD(at[18], at[53]);    MULADD(at[19], at[52]);    MULADD(at[20], at[51]);    MULADD(at[21], at[50]);    MULADD(at[22], at[49]);    MULADD(at[23], at[48]);    MULADD(at[24], at[47]);    MULADD(at[25], at[46]);    MULADD(at[26], at[45]);    MULADD(at[27], at[44]); 
-   COMBA_STORE(C->dp[43]);
-   /* 44 */
-   COMBA_FORWARD;
-   MULADD(at[17], at[55]);    MULADD(at[18], at[54]);    MULADD(at[19], at[53]);    MULADD(at[20], at[52]);    MULADD(at[21], at[51]);    MULADD(at[22], at[50]);    MULADD(at[23], at[49]);    MULADD(at[24], at[48]);    MULADD(at[25], at[47]);    MULADD(at[26], at[46]);    MULADD(at[27], at[45]); 
-   COMBA_STORE(C->dp[44]);
-   /* 45 */
-   COMBA_FORWARD;
-   MULADD(at[18], at[55]);    MULADD(at[19], at[54]);    MULADD(at[20], at[53]);    MULADD(at[21], at[52]);    MULADD(at[22], at[51]);    MULADD(at[23], at[50]);    MULADD(at[24], at[49]);    MULADD(at[25], at[48]);    MULADD(at[26], at[47]);    MULADD(at[27], at[46]); 
-   COMBA_STORE(C->dp[45]);
-   /* 46 */
-   COMBA_FORWARD;
-   MULADD(at[19], at[55]);    MULADD(at[20], at[54]);    MULADD(at[21], at[53]);    MULADD(at[22], at[52]);    MULADD(at[23], at[51]);    MULADD(at[24], at[50]);    MULADD(at[25], at[49]);    MULADD(at[26], at[48]);    MULADD(at[27], at[47]); 
-   COMBA_STORE(C->dp[46]);
-   /* 47 */
-   COMBA_FORWARD;
-   MULADD(at[20], at[55]);    MULADD(at[21], at[54]);    MULADD(at[22], at[53]);    MULADD(at[23], at[52]);    MULADD(at[24], at[51]);    MULADD(at[25], at[50]);    MULADD(at[26], at[49]);    MULADD(at[27], at[48]); 
-   COMBA_STORE(C->dp[47]);
-   /* 48 */
-   COMBA_FORWARD;
-   MULADD(at[21], at[55]);    MULADD(at[22], at[54]);    MULADD(at[23], at[53]);    MULADD(at[24], at[52]);    MULADD(at[25], at[51]);    MULADD(at[26], at[50]);    MULADD(at[27], at[49]); 
-   COMBA_STORE(C->dp[48]);
-   /* 49 */
-   COMBA_FORWARD;
-   MULADD(at[22], at[55]);    MULADD(at[23], at[54]);    MULADD(at[24], at[53]);    MULADD(at[25], at[52]);    MULADD(at[26], at[51]);    MULADD(at[27], at[50]); 
-   COMBA_STORE(C->dp[49]);
-   /* 50 */
-   COMBA_FORWARD;
-   MULADD(at[23], at[55]);    MULADD(at[24], at[54]);    MULADD(at[25], at[53]);    MULADD(at[26], at[52]);    MULADD(at[27], at[51]); 
-   COMBA_STORE(C->dp[50]);
-   /* 51 */
-   COMBA_FORWARD;
-   MULADD(at[24], at[55]);    MULADD(at[25], at[54]);    MULADD(at[26], at[53]);    MULADD(at[27], at[52]); 
-   COMBA_STORE(C->dp[51]);
-   /* 52 */
-   COMBA_FORWARD;
-   MULADD(at[25], at[55]);    MULADD(at[26], at[54]);    MULADD(at[27], at[53]); 
-   COMBA_STORE(C->dp[52]);
-   /* 53 */
-   COMBA_FORWARD;
-   MULADD(at[26], at[55]);    MULADD(at[27], at[54]); 
-   COMBA_STORE(C->dp[53]);
-   /* 54 */
-   COMBA_FORWARD;
-   MULADD(at[27], at[55]); 
-   COMBA_STORE(C->dp[54]);
-   COMBA_STORE2(C->dp[55]);
-   C->used = 56;
-   C->sign = A->sign ^ B->sign;
-   fp_clamp(C);
-   COMBA_FINI;
-
-#ifdef WOLFSSL_SMALL_STACK
-   XFREE(at, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-#endif
-   return FP_OKAY;
-}
-#endif
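
The fp_mul_comba_N.i files removed in this commit are machine-generated, fully unrolled comba (column-wise) multipliers: each run of MULADD() calls accumulates one output column of the schoolbook product into a three-digit carry chain (c0, c1, c2), COMBA_FORWARD shifts that chain down one digit between columns, COMBA_STORE writes the finished digit, and the final COMBA_STORE2 flushes the remaining high digit. A minimal, self-contained sketch of the loop these files unroll — an illustration with hypothetical names, not the wolfSSL macros themselves — assuming 32-bit digits and a double-width fp_word:

#include <stdint.h>

typedef uint32_t fp_digit;   /* one digit, matching a 32-bit build */
typedef uint64_t fp_word;    /* wide enough for digit x digit      */
#define DIGIT_BIT 32

/* c must hold 2*n digits; a and b hold n digits each. */
static void comba_mul_sketch(const fp_digit *a, const fp_digit *b,
                             fp_digit *c, int n)
{
    fp_digit c0 = 0, c1 = 0, c2 = 0;  /* COMBA_CLEAR */

    for (int col = 0; col < 2 * n; col++) {
        /* COMBA_FORWARD: shift the accumulator down one digit */
        c0 = c1; c1 = c2; c2 = 0;
        /* MULADD over every in-range (i, col - i) digit pair */
        for (int i = (col < n) ? 0 : col - n + 1; i <= col && i < n; i++) {
            fp_word t = (fp_word)a[i] * (fp_word)b[col - i];
            fp_word s = (fp_word)c0 + (fp_digit)t;
            c0 = (fp_digit)s;
            s = (fp_word)c1 + (fp_digit)(t >> DIGIT_BIT)
                            + (fp_digit)(s >> DIGIT_BIT);
            c1 = (fp_digit)s;
            c2 += (fp_digit)(s >> DIGIT_BIT);
        }
        c[col] = c0;                  /* COMBA_STORE / COMBA_STORE2 */
    }
}

Unrolling this for a fixed n removes the loop and index arithmetic entirely, which is the point of the generated files; the cost is object-code size, which is why each variant sits behind an optional TFM_MULn define.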

+ 0 - 61
lib/wolfssl/wolfcrypt/src/fp_mul_comba_3.i

@@ -1,61 +0,0 @@
-/* fp_mul_comba_3.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#ifdef TFM_MUL3
-int fp_mul_comba3(fp_int *A, fp_int *B, fp_int *C)
-{
-   fp_digit c0, c1, c2, at[6];
-
-   XMEMCPY(at, A->dp, 3 * sizeof(fp_digit));
-   XMEMCPY(at+3, B->dp, 3 * sizeof(fp_digit));
-   COMBA_START;
-
-   COMBA_CLEAR;
-   /* 0 */
-   MULADD(at[0], at[3]); 
-   COMBA_STORE(C->dp[0]);
-   /* 1 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[4]);    MULADD(at[1], at[3]); 
-   COMBA_STORE(C->dp[1]);
-   /* 2 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[5]);    MULADD(at[1], at[4]);    MULADD(at[2], at[3]); 
-   COMBA_STORE(C->dp[2]);
-   /* 3 */
-   COMBA_FORWARD;
-   MULADD(at[1], at[5]);    MULADD(at[2], at[4]); 
-   COMBA_STORE(C->dp[3]);
-   /* 4 */
-   COMBA_FORWARD;
-   MULADD(at[2], at[5]); 
-   COMBA_STORE(C->dp[4]);
-   COMBA_STORE2(C->dp[5]);
-   C->used = 6;
-   C->sign = A->sign ^ B->sign;
-   fp_clamp(C);
-   COMBA_FINI;
-
-   return FP_OKAY;
-}
-#endif
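
Note that fp_mul_comba3() above keeps its at[6] scratch on the stack unconditionally — 24 bytes on a 32-bit build, presumably too small for the WOLFSSL_SMALL_STACK heap path the larger sizes use. A quick way to sanity-check the 3-digit column schedule is to run the generic sketch above for n = 3 against a wide-integer reference. This spot check is hypothetical, assumes the typedefs and comba_mul_sketch() from the earlier sketch plus compiler support for unsigned __int128, and compares only the low four digits, since the full 192-bit product overflows __int128:

#include <assert.h>

int main(void)
{
    fp_digit a[3] = { 0xFFFFFFFFu, 0x89ABCDEFu, 0x01234567u };
    fp_digit b[3] = { 0xFFFFFFFFu, 0x00000001u, 0xDEADBEEFu };
    fp_digit c[6] = { 0 };

    comba_mul_sketch(a, b, c, 3);

    unsigned __int128 av = ((unsigned __int128)a[2] << 64)
                         | ((unsigned __int128)a[1] << 32) | a[0];
    unsigned __int128 bv = ((unsigned __int128)b[2] << 64)
                         | ((unsigned __int128)b[1] << 32) | b[0];
    unsigned __int128 p = av * bv;   /* truncated mod 2^128 */

    for (int i = 0; i < 4; i++)      /* digits 0..3 live in the low 128 bits */
        assert(c[i] == (fp_digit)(p >> (32 * i)));
    return 0;
}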

+ 0 - 321
lib/wolfssl/wolfcrypt/src/fp_mul_comba_32.i

@@ -1,321 +0,0 @@
-/* fp_mul_comba_32.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#ifdef TFM_MUL32
-int fp_mul_comba32(fp_int *A, fp_int *B, fp_int *C)
-{
-   int out_size;
-   fp_digit c0, c1, c2;
-#ifndef WOLFSSL_SMALL_STACK
-   fp_digit at[64];
-#else
-   fp_digit *at;
-#endif
-
-#ifdef WOLFSSL_SMALL_STACK
-   at = (fp_digit*)XMALLOC(sizeof(fp_digit) * 64, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-   if (at == NULL)
-       return FP_MEM;
-#endif
-
-   out_size = A->used + B->used;
-   XMEMCPY(at, A->dp, 32 * sizeof(fp_digit));
-   XMEMCPY(at+32, B->dp, 32 * sizeof(fp_digit));
-   COMBA_START;
-
-   COMBA_CLEAR;
-   /* 0 */
-   MULADD(at[0], at[32]); 
-   COMBA_STORE(C->dp[0]);
-   /* 1 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[33]);    MULADD(at[1], at[32]); 
-   COMBA_STORE(C->dp[1]);
-   /* 2 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[34]);    MULADD(at[1], at[33]);    MULADD(at[2], at[32]); 
-   COMBA_STORE(C->dp[2]);
-   /* 3 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[35]);    MULADD(at[1], at[34]);    MULADD(at[2], at[33]);    MULADD(at[3], at[32]); 
-   COMBA_STORE(C->dp[3]);
-   /* 4 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[36]);    MULADD(at[1], at[35]);    MULADD(at[2], at[34]);    MULADD(at[3], at[33]);    MULADD(at[4], at[32]); 
-   COMBA_STORE(C->dp[4]);
-   /* 5 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[37]);    MULADD(at[1], at[36]);    MULADD(at[2], at[35]);    MULADD(at[3], at[34]);    MULADD(at[4], at[33]);    MULADD(at[5], at[32]); 
-   COMBA_STORE(C->dp[5]);
-   /* 6 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[38]);    MULADD(at[1], at[37]);    MULADD(at[2], at[36]);    MULADD(at[3], at[35]);    MULADD(at[4], at[34]);    MULADD(at[5], at[33]);    MULADD(at[6], at[32]); 
-   COMBA_STORE(C->dp[6]);
-   /* 7 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[39]);    MULADD(at[1], at[38]);    MULADD(at[2], at[37]);    MULADD(at[3], at[36]);    MULADD(at[4], at[35]);    MULADD(at[5], at[34]);    MULADD(at[6], at[33]);    MULADD(at[7], at[32]); 
-   COMBA_STORE(C->dp[7]);
-   /* 8 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[40]);    MULADD(at[1], at[39]);    MULADD(at[2], at[38]);    MULADD(at[3], at[37]);    MULADD(at[4], at[36]);    MULADD(at[5], at[35]);    MULADD(at[6], at[34]);    MULADD(at[7], at[33]);    MULADD(at[8], at[32]); 
-   COMBA_STORE(C->dp[8]);
-   /* 9 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[41]);    MULADD(at[1], at[40]);    MULADD(at[2], at[39]);    MULADD(at[3], at[38]);    MULADD(at[4], at[37]);    MULADD(at[5], at[36]);    MULADD(at[6], at[35]);    MULADD(at[7], at[34]);    MULADD(at[8], at[33]);    MULADD(at[9], at[32]); 
-   COMBA_STORE(C->dp[9]);
-   /* 10 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[42]);    MULADD(at[1], at[41]);    MULADD(at[2], at[40]);    MULADD(at[3], at[39]);    MULADD(at[4], at[38]);    MULADD(at[5], at[37]);    MULADD(at[6], at[36]);    MULADD(at[7], at[35]);    MULADD(at[8], at[34]);    MULADD(at[9], at[33]);    MULADD(at[10], at[32]); 
-   COMBA_STORE(C->dp[10]);
-   /* 11 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[43]);    MULADD(at[1], at[42]);    MULADD(at[2], at[41]);    MULADD(at[3], at[40]);    MULADD(at[4], at[39]);    MULADD(at[5], at[38]);    MULADD(at[6], at[37]);    MULADD(at[7], at[36]);    MULADD(at[8], at[35]);    MULADD(at[9], at[34]);    MULADD(at[10], at[33]);    MULADD(at[11], at[32]); 
-   COMBA_STORE(C->dp[11]);
-   /* 12 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[44]);    MULADD(at[1], at[43]);    MULADD(at[2], at[42]);    MULADD(at[3], at[41]);    MULADD(at[4], at[40]);    MULADD(at[5], at[39]);    MULADD(at[6], at[38]);    MULADD(at[7], at[37]);    MULADD(at[8], at[36]);    MULADD(at[9], at[35]);    MULADD(at[10], at[34]);    MULADD(at[11], at[33]);    MULADD(at[12], at[32]); 
-   COMBA_STORE(C->dp[12]);
-   /* 13 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[45]);    MULADD(at[1], at[44]);    MULADD(at[2], at[43]);    MULADD(at[3], at[42]);    MULADD(at[4], at[41]);    MULADD(at[5], at[40]);    MULADD(at[6], at[39]);    MULADD(at[7], at[38]);    MULADD(at[8], at[37]);    MULADD(at[9], at[36]);    MULADD(at[10], at[35]);    MULADD(at[11], at[34]);    MULADD(at[12], at[33]);    MULADD(at[13], at[32]); 
-   COMBA_STORE(C->dp[13]);
-   /* 14 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[46]);    MULADD(at[1], at[45]);    MULADD(at[2], at[44]);    MULADD(at[3], at[43]);    MULADD(at[4], at[42]);    MULADD(at[5], at[41]);    MULADD(at[6], at[40]);    MULADD(at[7], at[39]);    MULADD(at[8], at[38]);    MULADD(at[9], at[37]);    MULADD(at[10], at[36]);    MULADD(at[11], at[35]);    MULADD(at[12], at[34]);    MULADD(at[13], at[33]);    MULADD(at[14], at[32]); 
-   COMBA_STORE(C->dp[14]);
-   /* 15 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[47]);    MULADD(at[1], at[46]);    MULADD(at[2], at[45]);    MULADD(at[3], at[44]);    MULADD(at[4], at[43]);    MULADD(at[5], at[42]);    MULADD(at[6], at[41]);    MULADD(at[7], at[40]);    MULADD(at[8], at[39]);    MULADD(at[9], at[38]);    MULADD(at[10], at[37]);    MULADD(at[11], at[36]);    MULADD(at[12], at[35]);    MULADD(at[13], at[34]);    MULADD(at[14], at[33]);    MULADD(at[15], at[32]); 
-   COMBA_STORE(C->dp[15]);
-   /* 16 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[48]);    MULADD(at[1], at[47]);    MULADD(at[2], at[46]);    MULADD(at[3], at[45]);    MULADD(at[4], at[44]);    MULADD(at[5], at[43]);    MULADD(at[6], at[42]);    MULADD(at[7], at[41]);    MULADD(at[8], at[40]);    MULADD(at[9], at[39]);    MULADD(at[10], at[38]);    MULADD(at[11], at[37]);    MULADD(at[12], at[36]);    MULADD(at[13], at[35]);    MULADD(at[14], at[34]);    MULADD(at[15], at[33]);    MULADD(at[16], at[32]); 
-   COMBA_STORE(C->dp[16]);
-   /* 17 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[49]);    MULADD(at[1], at[48]);    MULADD(at[2], at[47]);    MULADD(at[3], at[46]);    MULADD(at[4], at[45]);    MULADD(at[5], at[44]);    MULADD(at[6], at[43]);    MULADD(at[7], at[42]);    MULADD(at[8], at[41]);    MULADD(at[9], at[40]);    MULADD(at[10], at[39]);    MULADD(at[11], at[38]);    MULADD(at[12], at[37]);    MULADD(at[13], at[36]);    MULADD(at[14], at[35]);    MULADD(at[15], at[34]);    MULADD(at[16], at[33]);    MULADD(at[17], at[32]); 
-   COMBA_STORE(C->dp[17]);
-   /* 18 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[50]);    MULADD(at[1], at[49]);    MULADD(at[2], at[48]);    MULADD(at[3], at[47]);    MULADD(at[4], at[46]);    MULADD(at[5], at[45]);    MULADD(at[6], at[44]);    MULADD(at[7], at[43]);    MULADD(at[8], at[42]);    MULADD(at[9], at[41]);    MULADD(at[10], at[40]);    MULADD(at[11], at[39]);    MULADD(at[12], at[38]);    MULADD(at[13], at[37]);    MULADD(at[14], at[36]);    MULADD(at[15], at[35]);    MULADD(at[16], at[34]);    MULADD(at[17], at[33]);    MULADD(at[18], at[32]); 
-   COMBA_STORE(C->dp[18]);
-   /* 19 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[51]);    MULADD(at[1], at[50]);    MULADD(at[2], at[49]);    MULADD(at[3], at[48]);    MULADD(at[4], at[47]);    MULADD(at[5], at[46]);    MULADD(at[6], at[45]);    MULADD(at[7], at[44]);    MULADD(at[8], at[43]);    MULADD(at[9], at[42]);    MULADD(at[10], at[41]);    MULADD(at[11], at[40]);    MULADD(at[12], at[39]);    MULADD(at[13], at[38]);    MULADD(at[14], at[37]);    MULADD(at[15], at[36]);    MULADD(at[16], at[35]);    MULADD(at[17], at[34]);    MULADD(at[18], at[33]);    MULADD(at[19], at[32]); 
-   COMBA_STORE(C->dp[19]);
-   /* 20 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[52]);    MULADD(at[1], at[51]);    MULADD(at[2], at[50]);    MULADD(at[3], at[49]);    MULADD(at[4], at[48]);    MULADD(at[5], at[47]);    MULADD(at[6], at[46]);    MULADD(at[7], at[45]);    MULADD(at[8], at[44]);    MULADD(at[9], at[43]);    MULADD(at[10], at[42]);    MULADD(at[11], at[41]);    MULADD(at[12], at[40]);    MULADD(at[13], at[39]);    MULADD(at[14], at[38]);    MULADD(at[15], at[37]);    MULADD(at[16], at[36]);    MULADD(at[17], at[35]);    MULADD(at[18], at[34]);    MULADD(at[19], at[33]);    MULADD(at[20], at[32]); 
-   COMBA_STORE(C->dp[20]);
-   /* 21 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[53]);    MULADD(at[1], at[52]);    MULADD(at[2], at[51]);    MULADD(at[3], at[50]);    MULADD(at[4], at[49]);    MULADD(at[5], at[48]);    MULADD(at[6], at[47]);    MULADD(at[7], at[46]);    MULADD(at[8], at[45]);    MULADD(at[9], at[44]);    MULADD(at[10], at[43]);    MULADD(at[11], at[42]);    MULADD(at[12], at[41]);    MULADD(at[13], at[40]);    MULADD(at[14], at[39]);    MULADD(at[15], at[38]);    MULADD(at[16], at[37]);    MULADD(at[17], at[36]);    MULADD(at[18], at[35]);    MULADD(at[19], at[34]);    MULADD(at[20], at[33]);    MULADD(at[21], at[32]); 
-   COMBA_STORE(C->dp[21]);
-   /* 22 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[54]);    MULADD(at[1], at[53]);    MULADD(at[2], at[52]);    MULADD(at[3], at[51]);    MULADD(at[4], at[50]);    MULADD(at[5], at[49]);    MULADD(at[6], at[48]);    MULADD(at[7], at[47]);    MULADD(at[8], at[46]);    MULADD(at[9], at[45]);    MULADD(at[10], at[44]);    MULADD(at[11], at[43]);    MULADD(at[12], at[42]);    MULADD(at[13], at[41]);    MULADD(at[14], at[40]);    MULADD(at[15], at[39]);    MULADD(at[16], at[38]);    MULADD(at[17], at[37]);    MULADD(at[18], at[36]);    MULADD(at[19], at[35]);    MULADD(at[20], at[34]);    MULADD(at[21], at[33]);    MULADD(at[22], at[32]); 
-   COMBA_STORE(C->dp[22]);
-   /* 23 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[55]);    MULADD(at[1], at[54]);    MULADD(at[2], at[53]);    MULADD(at[3], at[52]);    MULADD(at[4], at[51]);    MULADD(at[5], at[50]);    MULADD(at[6], at[49]);    MULADD(at[7], at[48]);    MULADD(at[8], at[47]);    MULADD(at[9], at[46]);    MULADD(at[10], at[45]);    MULADD(at[11], at[44]);    MULADD(at[12], at[43]);    MULADD(at[13], at[42]);    MULADD(at[14], at[41]);    MULADD(at[15], at[40]);    MULADD(at[16], at[39]);    MULADD(at[17], at[38]);    MULADD(at[18], at[37]);    MULADD(at[19], at[36]);    MULADD(at[20], at[35]);    MULADD(at[21], at[34]);    MULADD(at[22], at[33]);    MULADD(at[23], at[32]); 
-   COMBA_STORE(C->dp[23]);
-   /* 24 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[56]);    MULADD(at[1], at[55]);    MULADD(at[2], at[54]);    MULADD(at[3], at[53]);    MULADD(at[4], at[52]);    MULADD(at[5], at[51]);    MULADD(at[6], at[50]);    MULADD(at[7], at[49]);    MULADD(at[8], at[48]);    MULADD(at[9], at[47]);    MULADD(at[10], at[46]);    MULADD(at[11], at[45]);    MULADD(at[12], at[44]);    MULADD(at[13], at[43]);    MULADD(at[14], at[42]);    MULADD(at[15], at[41]);    MULADD(at[16], at[40]);    MULADD(at[17], at[39]);    MULADD(at[18], at[38]);    MULADD(at[19], at[37]);    MULADD(at[20], at[36]);    MULADD(at[21], at[35]);    MULADD(at[22], at[34]);    MULADD(at[23], at[33]);    MULADD(at[24], at[32]); 
-   COMBA_STORE(C->dp[24]);
-   /* 25 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[57]);    MULADD(at[1], at[56]);    MULADD(at[2], at[55]);    MULADD(at[3], at[54]);    MULADD(at[4], at[53]);    MULADD(at[5], at[52]);    MULADD(at[6], at[51]);    MULADD(at[7], at[50]);    MULADD(at[8], at[49]);    MULADD(at[9], at[48]);    MULADD(at[10], at[47]);    MULADD(at[11], at[46]);    MULADD(at[12], at[45]);    MULADD(at[13], at[44]);    MULADD(at[14], at[43]);    MULADD(at[15], at[42]);    MULADD(at[16], at[41]);    MULADD(at[17], at[40]);    MULADD(at[18], at[39]);    MULADD(at[19], at[38]);    MULADD(at[20], at[37]);    MULADD(at[21], at[36]);    MULADD(at[22], at[35]);    MULADD(at[23], at[34]);    MULADD(at[24], at[33]);    MULADD(at[25], at[32]); 
-   COMBA_STORE(C->dp[25]);
-   /* 26 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[58]);    MULADD(at[1], at[57]);    MULADD(at[2], at[56]);    MULADD(at[3], at[55]);    MULADD(at[4], at[54]);    MULADD(at[5], at[53]);    MULADD(at[6], at[52]);    MULADD(at[7], at[51]);    MULADD(at[8], at[50]);    MULADD(at[9], at[49]);    MULADD(at[10], at[48]);    MULADD(at[11], at[47]);    MULADD(at[12], at[46]);    MULADD(at[13], at[45]);    MULADD(at[14], at[44]);    MULADD(at[15], at[43]);    MULADD(at[16], at[42]);    MULADD(at[17], at[41]);    MULADD(at[18], at[40]);    MULADD(at[19], at[39]);    MULADD(at[20], at[38]);    MULADD(at[21], at[37]);    MULADD(at[22], at[36]);    MULADD(at[23], at[35]);    MULADD(at[24], at[34]);    MULADD(at[25], at[33]);    MULADD(at[26], at[32]); 
-   COMBA_STORE(C->dp[26]);
-   /* 27 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[59]);    MULADD(at[1], at[58]);    MULADD(at[2], at[57]);    MULADD(at[3], at[56]);    MULADD(at[4], at[55]);    MULADD(at[5], at[54]);    MULADD(at[6], at[53]);    MULADD(at[7], at[52]);    MULADD(at[8], at[51]);    MULADD(at[9], at[50]);    MULADD(at[10], at[49]);    MULADD(at[11], at[48]);    MULADD(at[12], at[47]);    MULADD(at[13], at[46]);    MULADD(at[14], at[45]);    MULADD(at[15], at[44]);    MULADD(at[16], at[43]);    MULADD(at[17], at[42]);    MULADD(at[18], at[41]);    MULADD(at[19], at[40]);    MULADD(at[20], at[39]);    MULADD(at[21], at[38]);    MULADD(at[22], at[37]);    MULADD(at[23], at[36]);    MULADD(at[24], at[35]);    MULADD(at[25], at[34]);    MULADD(at[26], at[33]);    MULADD(at[27], at[32]); 
-   COMBA_STORE(C->dp[27]);
-   /* 28 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[60]);    MULADD(at[1], at[59]);    MULADD(at[2], at[58]);    MULADD(at[3], at[57]);    MULADD(at[4], at[56]);    MULADD(at[5], at[55]);    MULADD(at[6], at[54]);    MULADD(at[7], at[53]);    MULADD(at[8], at[52]);    MULADD(at[9], at[51]);    MULADD(at[10], at[50]);    MULADD(at[11], at[49]);    MULADD(at[12], at[48]);    MULADD(at[13], at[47]);    MULADD(at[14], at[46]);    MULADD(at[15], at[45]);    MULADD(at[16], at[44]);    MULADD(at[17], at[43]);    MULADD(at[18], at[42]);    MULADD(at[19], at[41]);    MULADD(at[20], at[40]);    MULADD(at[21], at[39]);    MULADD(at[22], at[38]);    MULADD(at[23], at[37]);    MULADD(at[24], at[36]);    MULADD(at[25], at[35]);    MULADD(at[26], at[34]);    MULADD(at[27], at[33]);    MULADD(at[28], at[32]); 
-   COMBA_STORE(C->dp[28]);
-   /* 29 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[61]);    MULADD(at[1], at[60]);    MULADD(at[2], at[59]);    MULADD(at[3], at[58]);    MULADD(at[4], at[57]);    MULADD(at[5], at[56]);    MULADD(at[6], at[55]);    MULADD(at[7], at[54]);    MULADD(at[8], at[53]);    MULADD(at[9], at[52]);    MULADD(at[10], at[51]);    MULADD(at[11], at[50]);    MULADD(at[12], at[49]);    MULADD(at[13], at[48]);    MULADD(at[14], at[47]);    MULADD(at[15], at[46]);    MULADD(at[16], at[45]);    MULADD(at[17], at[44]);    MULADD(at[18], at[43]);    MULADD(at[19], at[42]);    MULADD(at[20], at[41]);    MULADD(at[21], at[40]);    MULADD(at[22], at[39]);    MULADD(at[23], at[38]);    MULADD(at[24], at[37]);    MULADD(at[25], at[36]);    MULADD(at[26], at[35]);    MULADD(at[27], at[34]);    MULADD(at[28], at[33]);    MULADD(at[29], at[32]); 
-   COMBA_STORE(C->dp[29]);
-   /* 30 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[62]);    MULADD(at[1], at[61]);    MULADD(at[2], at[60]);    MULADD(at[3], at[59]);    MULADD(at[4], at[58]);    MULADD(at[5], at[57]);    MULADD(at[6], at[56]);    MULADD(at[7], at[55]);    MULADD(at[8], at[54]);    MULADD(at[9], at[53]);    MULADD(at[10], at[52]);    MULADD(at[11], at[51]);    MULADD(at[12], at[50]);    MULADD(at[13], at[49]);    MULADD(at[14], at[48]);    MULADD(at[15], at[47]);    MULADD(at[16], at[46]);    MULADD(at[17], at[45]);    MULADD(at[18], at[44]);    MULADD(at[19], at[43]);    MULADD(at[20], at[42]);    MULADD(at[21], at[41]);    MULADD(at[22], at[40]);    MULADD(at[23], at[39]);    MULADD(at[24], at[38]);    MULADD(at[25], at[37]);    MULADD(at[26], at[36]);    MULADD(at[27], at[35]);    MULADD(at[28], at[34]);    MULADD(at[29], at[33]);    MULADD(at[30], at[32]); 
-   COMBA_STORE(C->dp[30]);
-   /* 31 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[63]);    MULADD(at[1], at[62]);    MULADD(at[2], at[61]);    MULADD(at[3], at[60]);    MULADD(at[4], at[59]);    MULADD(at[5], at[58]);    MULADD(at[6], at[57]);    MULADD(at[7], at[56]);    MULADD(at[8], at[55]);    MULADD(at[9], at[54]);    MULADD(at[10], at[53]);    MULADD(at[11], at[52]);    MULADD(at[12], at[51]);    MULADD(at[13], at[50]);    MULADD(at[14], at[49]);    MULADD(at[15], at[48]);    MULADD(at[16], at[47]);    MULADD(at[17], at[46]);    MULADD(at[18], at[45]);    MULADD(at[19], at[44]);    MULADD(at[20], at[43]);    MULADD(at[21], at[42]);    MULADD(at[22], at[41]);    MULADD(at[23], at[40]);    MULADD(at[24], at[39]);    MULADD(at[25], at[38]);    MULADD(at[26], at[37]);    MULADD(at[27], at[36]);    MULADD(at[28], at[35]);    MULADD(at[29], at[34]);    MULADD(at[30], at[33]);    MULADD(at[31], at[32]); 
-   COMBA_STORE(C->dp[31]);
-   /* 32 */
-   COMBA_FORWARD;
-   MULADD(at[1], at[63]);    MULADD(at[2], at[62]);    MULADD(at[3], at[61]);    MULADD(at[4], at[60]);    MULADD(at[5], at[59]);    MULADD(at[6], at[58]);    MULADD(at[7], at[57]);    MULADD(at[8], at[56]);    MULADD(at[9], at[55]);    MULADD(at[10], at[54]);    MULADD(at[11], at[53]);    MULADD(at[12], at[52]);    MULADD(at[13], at[51]);    MULADD(at[14], at[50]);    MULADD(at[15], at[49]);    MULADD(at[16], at[48]);    MULADD(at[17], at[47]);    MULADD(at[18], at[46]);    MULADD(at[19], at[45]);    MULADD(at[20], at[44]);    MULADD(at[21], at[43]);    MULADD(at[22], at[42]);    MULADD(at[23], at[41]);    MULADD(at[24], at[40]);    MULADD(at[25], at[39]);    MULADD(at[26], at[38]);    MULADD(at[27], at[37]);    MULADD(at[28], at[36]);    MULADD(at[29], at[35]);    MULADD(at[30], at[34]);    MULADD(at[31], at[33]); 
-   COMBA_STORE(C->dp[32]);
-   /* 33 */
-   COMBA_FORWARD;
-   MULADD(at[2], at[63]);    MULADD(at[3], at[62]);    MULADD(at[4], at[61]);    MULADD(at[5], at[60]);    MULADD(at[6], at[59]);    MULADD(at[7], at[58]);    MULADD(at[8], at[57]);    MULADD(at[9], at[56]);    MULADD(at[10], at[55]);    MULADD(at[11], at[54]);    MULADD(at[12], at[53]);    MULADD(at[13], at[52]);    MULADD(at[14], at[51]);    MULADD(at[15], at[50]);    MULADD(at[16], at[49]);    MULADD(at[17], at[48]);    MULADD(at[18], at[47]);    MULADD(at[19], at[46]);    MULADD(at[20], at[45]);    MULADD(at[21], at[44]);    MULADD(at[22], at[43]);    MULADD(at[23], at[42]);    MULADD(at[24], at[41]);    MULADD(at[25], at[40]);    MULADD(at[26], at[39]);    MULADD(at[27], at[38]);    MULADD(at[28], at[37]);    MULADD(at[29], at[36]);    MULADD(at[30], at[35]);    MULADD(at[31], at[34]); 
-   COMBA_STORE(C->dp[33]);
-   /* 34 */
-   COMBA_FORWARD;
-   MULADD(at[3], at[63]);    MULADD(at[4], at[62]);    MULADD(at[5], at[61]);    MULADD(at[6], at[60]);    MULADD(at[7], at[59]);    MULADD(at[8], at[58]);    MULADD(at[9], at[57]);    MULADD(at[10], at[56]);    MULADD(at[11], at[55]);    MULADD(at[12], at[54]);    MULADD(at[13], at[53]);    MULADD(at[14], at[52]);    MULADD(at[15], at[51]);    MULADD(at[16], at[50]);    MULADD(at[17], at[49]);    MULADD(at[18], at[48]);    MULADD(at[19], at[47]);    MULADD(at[20], at[46]);    MULADD(at[21], at[45]);    MULADD(at[22], at[44]);    MULADD(at[23], at[43]);    MULADD(at[24], at[42]);    MULADD(at[25], at[41]);    MULADD(at[26], at[40]);    MULADD(at[27], at[39]);    MULADD(at[28], at[38]);    MULADD(at[29], at[37]);    MULADD(at[30], at[36]);    MULADD(at[31], at[35]); 
-   COMBA_STORE(C->dp[34]);
-   /* 35 */
-   COMBA_FORWARD;
-   MULADD(at[4], at[63]);    MULADD(at[5], at[62]);    MULADD(at[6], at[61]);    MULADD(at[7], at[60]);    MULADD(at[8], at[59]);    MULADD(at[9], at[58]);    MULADD(at[10], at[57]);    MULADD(at[11], at[56]);    MULADD(at[12], at[55]);    MULADD(at[13], at[54]);    MULADD(at[14], at[53]);    MULADD(at[15], at[52]);    MULADD(at[16], at[51]);    MULADD(at[17], at[50]);    MULADD(at[18], at[49]);    MULADD(at[19], at[48]);    MULADD(at[20], at[47]);    MULADD(at[21], at[46]);    MULADD(at[22], at[45]);    MULADD(at[23], at[44]);    MULADD(at[24], at[43]);    MULADD(at[25], at[42]);    MULADD(at[26], at[41]);    MULADD(at[27], at[40]);    MULADD(at[28], at[39]);    MULADD(at[29], at[38]);    MULADD(at[30], at[37]);    MULADD(at[31], at[36]); 
-   COMBA_STORE(C->dp[35]);
-   /* 36 */
-   COMBA_FORWARD;
-   MULADD(at[5], at[63]);    MULADD(at[6], at[62]);    MULADD(at[7], at[61]);    MULADD(at[8], at[60]);    MULADD(at[9], at[59]);    MULADD(at[10], at[58]);    MULADD(at[11], at[57]);    MULADD(at[12], at[56]);    MULADD(at[13], at[55]);    MULADD(at[14], at[54]);    MULADD(at[15], at[53]);    MULADD(at[16], at[52]);    MULADD(at[17], at[51]);    MULADD(at[18], at[50]);    MULADD(at[19], at[49]);    MULADD(at[20], at[48]);    MULADD(at[21], at[47]);    MULADD(at[22], at[46]);    MULADD(at[23], at[45]);    MULADD(at[24], at[44]);    MULADD(at[25], at[43]);    MULADD(at[26], at[42]);    MULADD(at[27], at[41]);    MULADD(at[28], at[40]);    MULADD(at[29], at[39]);    MULADD(at[30], at[38]);    MULADD(at[31], at[37]); 
-   COMBA_STORE(C->dp[36]);
-   /* 37 */
-   COMBA_FORWARD;
-   MULADD(at[6], at[63]);    MULADD(at[7], at[62]);    MULADD(at[8], at[61]);    MULADD(at[9], at[60]);    MULADD(at[10], at[59]);    MULADD(at[11], at[58]);    MULADD(at[12], at[57]);    MULADD(at[13], at[56]);    MULADD(at[14], at[55]);    MULADD(at[15], at[54]);    MULADD(at[16], at[53]);    MULADD(at[17], at[52]);    MULADD(at[18], at[51]);    MULADD(at[19], at[50]);    MULADD(at[20], at[49]);    MULADD(at[21], at[48]);    MULADD(at[22], at[47]);    MULADD(at[23], at[46]);    MULADD(at[24], at[45]);    MULADD(at[25], at[44]);    MULADD(at[26], at[43]);    MULADD(at[27], at[42]);    MULADD(at[28], at[41]);    MULADD(at[29], at[40]);    MULADD(at[30], at[39]);    MULADD(at[31], at[38]); 
-   COMBA_STORE(C->dp[37]);
-   /* 38 */
-   COMBA_FORWARD;
-   MULADD(at[7], at[63]);    MULADD(at[8], at[62]);    MULADD(at[9], at[61]);    MULADD(at[10], at[60]);    MULADD(at[11], at[59]);    MULADD(at[12], at[58]);    MULADD(at[13], at[57]);    MULADD(at[14], at[56]);    MULADD(at[15], at[55]);    MULADD(at[16], at[54]);    MULADD(at[17], at[53]);    MULADD(at[18], at[52]);    MULADD(at[19], at[51]);    MULADD(at[20], at[50]);    MULADD(at[21], at[49]);    MULADD(at[22], at[48]);    MULADD(at[23], at[47]);    MULADD(at[24], at[46]);    MULADD(at[25], at[45]);    MULADD(at[26], at[44]);    MULADD(at[27], at[43]);    MULADD(at[28], at[42]);    MULADD(at[29], at[41]);    MULADD(at[30], at[40]);    MULADD(at[31], at[39]); 
-   COMBA_STORE(C->dp[38]);
-
-   /* early out at 40 digits, 40*32==1280, or two 640 bit operands */
-   if (out_size <= 40) { COMBA_STORE2(C->dp[39]); C->used = 40; C->sign = A->sign ^ B->sign; fp_clamp(C); COMBA_FINI; return FP_OKAY; }
-
-   /* 39 */
-   COMBA_FORWARD;
-   MULADD(at[8], at[63]);    MULADD(at[9], at[62]);    MULADD(at[10], at[61]);    MULADD(at[11], at[60]);    MULADD(at[12], at[59]);    MULADD(at[13], at[58]);    MULADD(at[14], at[57]);    MULADD(at[15], at[56]);    MULADD(at[16], at[55]);    MULADD(at[17], at[54]);    MULADD(at[18], at[53]);    MULADD(at[19], at[52]);    MULADD(at[20], at[51]);    MULADD(at[21], at[50]);    MULADD(at[22], at[49]);    MULADD(at[23], at[48]);    MULADD(at[24], at[47]);    MULADD(at[25], at[46]);    MULADD(at[26], at[45]);    MULADD(at[27], at[44]);    MULADD(at[28], at[43]);    MULADD(at[29], at[42]);    MULADD(at[30], at[41]);    MULADD(at[31], at[40]); 
-   COMBA_STORE(C->dp[39]);
-   /* 40 */
-   COMBA_FORWARD;
-   MULADD(at[9], at[63]);    MULADD(at[10], at[62]);    MULADD(at[11], at[61]);    MULADD(at[12], at[60]);    MULADD(at[13], at[59]);    MULADD(at[14], at[58]);    MULADD(at[15], at[57]);    MULADD(at[16], at[56]);    MULADD(at[17], at[55]);    MULADD(at[18], at[54]);    MULADD(at[19], at[53]);    MULADD(at[20], at[52]);    MULADD(at[21], at[51]);    MULADD(at[22], at[50]);    MULADD(at[23], at[49]);    MULADD(at[24], at[48]);    MULADD(at[25], at[47]);    MULADD(at[26], at[46]);    MULADD(at[27], at[45]);    MULADD(at[28], at[44]);    MULADD(at[29], at[43]);    MULADD(at[30], at[42]);    MULADD(at[31], at[41]); 
-   COMBA_STORE(C->dp[40]);
-   /* 41 */
-   COMBA_FORWARD;
-   MULADD(at[10], at[63]);    MULADD(at[11], at[62]);    MULADD(at[12], at[61]);    MULADD(at[13], at[60]);    MULADD(at[14], at[59]);    MULADD(at[15], at[58]);    MULADD(at[16], at[57]);    MULADD(at[17], at[56]);    MULADD(at[18], at[55]);    MULADD(at[19], at[54]);    MULADD(at[20], at[53]);    MULADD(at[21], at[52]);    MULADD(at[22], at[51]);    MULADD(at[23], at[50]);    MULADD(at[24], at[49]);    MULADD(at[25], at[48]);    MULADD(at[26], at[47]);    MULADD(at[27], at[46]);    MULADD(at[28], at[45]);    MULADD(at[29], at[44]);    MULADD(at[30], at[43]);    MULADD(at[31], at[42]); 
-   COMBA_STORE(C->dp[41]);
-   /* 42 */
-   COMBA_FORWARD;
-   MULADD(at[11], at[63]);    MULADD(at[12], at[62]);    MULADD(at[13], at[61]);    MULADD(at[14], at[60]);    MULADD(at[15], at[59]);    MULADD(at[16], at[58]);    MULADD(at[17], at[57]);    MULADD(at[18], at[56]);    MULADD(at[19], at[55]);    MULADD(at[20], at[54]);    MULADD(at[21], at[53]);    MULADD(at[22], at[52]);    MULADD(at[23], at[51]);    MULADD(at[24], at[50]);    MULADD(at[25], at[49]);    MULADD(at[26], at[48]);    MULADD(at[27], at[47]);    MULADD(at[28], at[46]);    MULADD(at[29], at[45]);    MULADD(at[30], at[44]);    MULADD(at[31], at[43]); 
-   COMBA_STORE(C->dp[42]);
-   /* 43 */
-   COMBA_FORWARD;
-   MULADD(at[12], at[63]);    MULADD(at[13], at[62]);    MULADD(at[14], at[61]);    MULADD(at[15], at[60]);    MULADD(at[16], at[59]);    MULADD(at[17], at[58]);    MULADD(at[18], at[57]);    MULADD(at[19], at[56]);    MULADD(at[20], at[55]);    MULADD(at[21], at[54]);    MULADD(at[22], at[53]);    MULADD(at[23], at[52]);    MULADD(at[24], at[51]);    MULADD(at[25], at[50]);    MULADD(at[26], at[49]);    MULADD(at[27], at[48]);    MULADD(at[28], at[47]);    MULADD(at[29], at[46]);    MULADD(at[30], at[45]);    MULADD(at[31], at[44]); 
-   COMBA_STORE(C->dp[43]);
-   /* 44 */
-   COMBA_FORWARD;
-   MULADD(at[13], at[63]);    MULADD(at[14], at[62]);    MULADD(at[15], at[61]);    MULADD(at[16], at[60]);    MULADD(at[17], at[59]);    MULADD(at[18], at[58]);    MULADD(at[19], at[57]);    MULADD(at[20], at[56]);    MULADD(at[21], at[55]);    MULADD(at[22], at[54]);    MULADD(at[23], at[53]);    MULADD(at[24], at[52]);    MULADD(at[25], at[51]);    MULADD(at[26], at[50]);    MULADD(at[27], at[49]);    MULADD(at[28], at[48]);    MULADD(at[29], at[47]);    MULADD(at[30], at[46]);    MULADD(at[31], at[45]); 
-   COMBA_STORE(C->dp[44]);
-   /* 45 */
-   COMBA_FORWARD;
-   MULADD(at[14], at[63]);    MULADD(at[15], at[62]);    MULADD(at[16], at[61]);    MULADD(at[17], at[60]);    MULADD(at[18], at[59]);    MULADD(at[19], at[58]);    MULADD(at[20], at[57]);    MULADD(at[21], at[56]);    MULADD(at[22], at[55]);    MULADD(at[23], at[54]);    MULADD(at[24], at[53]);    MULADD(at[25], at[52]);    MULADD(at[26], at[51]);    MULADD(at[27], at[50]);    MULADD(at[28], at[49]);    MULADD(at[29], at[48]);    MULADD(at[30], at[47]);    MULADD(at[31], at[46]); 
-   COMBA_STORE(C->dp[45]);
-   /* 46 */
-   COMBA_FORWARD;
-   MULADD(at[15], at[63]);    MULADD(at[16], at[62]);    MULADD(at[17], at[61]);    MULADD(at[18], at[60]);    MULADD(at[19], at[59]);    MULADD(at[20], at[58]);    MULADD(at[21], at[57]);    MULADD(at[22], at[56]);    MULADD(at[23], at[55]);    MULADD(at[24], at[54]);    MULADD(at[25], at[53]);    MULADD(at[26], at[52]);    MULADD(at[27], at[51]);    MULADD(at[28], at[50]);    MULADD(at[29], at[49]);    MULADD(at[30], at[48]);    MULADD(at[31], at[47]); 
-   COMBA_STORE(C->dp[46]);
-
-   /* early out at 48 digits, 48*32==1536, or two 768 bit operands */
-   if (out_size <= 48) { COMBA_STORE2(C->dp[47]); C->used = 48; C->sign = A->sign ^ B->sign; fp_clamp(C); COMBA_FINI; return FP_OKAY; }
-
-   /* 47 */
-   COMBA_FORWARD;
-   MULADD(at[16], at[63]);    MULADD(at[17], at[62]);    MULADD(at[18], at[61]);    MULADD(at[19], at[60]);    MULADD(at[20], at[59]);    MULADD(at[21], at[58]);    MULADD(at[22], at[57]);    MULADD(at[23], at[56]);    MULADD(at[24], at[55]);    MULADD(at[25], at[54]);    MULADD(at[26], at[53]);    MULADD(at[27], at[52]);    MULADD(at[28], at[51]);    MULADD(at[29], at[50]);    MULADD(at[30], at[49]);    MULADD(at[31], at[48]); 
-   COMBA_STORE(C->dp[47]);
-   /* 48 */
-   COMBA_FORWARD;
-   MULADD(at[17], at[63]);    MULADD(at[18], at[62]);    MULADD(at[19], at[61]);    MULADD(at[20], at[60]);    MULADD(at[21], at[59]);    MULADD(at[22], at[58]);    MULADD(at[23], at[57]);    MULADD(at[24], at[56]);    MULADD(at[25], at[55]);    MULADD(at[26], at[54]);    MULADD(at[27], at[53]);    MULADD(at[28], at[52]);    MULADD(at[29], at[51]);    MULADD(at[30], at[50]);    MULADD(at[31], at[49]); 
-   COMBA_STORE(C->dp[48]);
-   /* 49 */
-   COMBA_FORWARD;
-   MULADD(at[18], at[63]);    MULADD(at[19], at[62]);    MULADD(at[20], at[61]);    MULADD(at[21], at[60]);    MULADD(at[22], at[59]);    MULADD(at[23], at[58]);    MULADD(at[24], at[57]);    MULADD(at[25], at[56]);    MULADD(at[26], at[55]);    MULADD(at[27], at[54]);    MULADD(at[28], at[53]);    MULADD(at[29], at[52]);    MULADD(at[30], at[51]);    MULADD(at[31], at[50]); 
-   COMBA_STORE(C->dp[49]);
-   /* 50 */
-   COMBA_FORWARD;
-   MULADD(at[19], at[63]);    MULADD(at[20], at[62]);    MULADD(at[21], at[61]);    MULADD(at[22], at[60]);    MULADD(at[23], at[59]);    MULADD(at[24], at[58]);    MULADD(at[25], at[57]);    MULADD(at[26], at[56]);    MULADD(at[27], at[55]);    MULADD(at[28], at[54]);    MULADD(at[29], at[53]);    MULADD(at[30], at[52]);    MULADD(at[31], at[51]); 
-   COMBA_STORE(C->dp[50]);
-   /* 51 */
-   COMBA_FORWARD;
-   MULADD(at[20], at[63]);    MULADD(at[21], at[62]);    MULADD(at[22], at[61]);    MULADD(at[23], at[60]);    MULADD(at[24], at[59]);    MULADD(at[25], at[58]);    MULADD(at[26], at[57]);    MULADD(at[27], at[56]);    MULADD(at[28], at[55]);    MULADD(at[29], at[54]);    MULADD(at[30], at[53]);    MULADD(at[31], at[52]); 
-   COMBA_STORE(C->dp[51]);
-   /* 52 */
-   COMBA_FORWARD;
-   MULADD(at[21], at[63]);    MULADD(at[22], at[62]);    MULADD(at[23], at[61]);    MULADD(at[24], at[60]);    MULADD(at[25], at[59]);    MULADD(at[26], at[58]);    MULADD(at[27], at[57]);    MULADD(at[28], at[56]);    MULADD(at[29], at[55]);    MULADD(at[30], at[54]);    MULADD(at[31], at[53]); 
-   COMBA_STORE(C->dp[52]);
-   /* 53 */
-   COMBA_FORWARD;
-   MULADD(at[22], at[63]);    MULADD(at[23], at[62]);    MULADD(at[24], at[61]);    MULADD(at[25], at[60]);    MULADD(at[26], at[59]);    MULADD(at[27], at[58]);    MULADD(at[28], at[57]);    MULADD(at[29], at[56]);    MULADD(at[30], at[55]);    MULADD(at[31], at[54]); 
-   COMBA_STORE(C->dp[53]);
-   /* 54 */
-   COMBA_FORWARD;
-   MULADD(at[23], at[63]);    MULADD(at[24], at[62]);    MULADD(at[25], at[61]);    MULADD(at[26], at[60]);    MULADD(at[27], at[59]);    MULADD(at[28], at[58]);    MULADD(at[29], at[57]);    MULADD(at[30], at[56]);    MULADD(at[31], at[55]); 
-   COMBA_STORE(C->dp[54]);
-
-   /* early out at 56 digits, 56*32==1792, or two 896 bit operands */
-   if (out_size <= 56) { COMBA_STORE2(C->dp[55]); C->used = 56; C->sign = A->sign ^ B->sign; fp_clamp(C); COMBA_FINI; return FP_OKAY; }
-
-   /* 55 */
-   COMBA_FORWARD;
-   MULADD(at[24], at[63]);    MULADD(at[25], at[62]);    MULADD(at[26], at[61]);    MULADD(at[27], at[60]);    MULADD(at[28], at[59]);    MULADD(at[29], at[58]);    MULADD(at[30], at[57]);    MULADD(at[31], at[56]); 
-   COMBA_STORE(C->dp[55]);
-   /* 56 */
-   COMBA_FORWARD;
-   MULADD(at[25], at[63]);    MULADD(at[26], at[62]);    MULADD(at[27], at[61]);    MULADD(at[28], at[60]);    MULADD(at[29], at[59]);    MULADD(at[30], at[58]);    MULADD(at[31], at[57]); 
-   COMBA_STORE(C->dp[56]);
-   /* 57 */
-   COMBA_FORWARD;
-   MULADD(at[26], at[63]);    MULADD(at[27], at[62]);    MULADD(at[28], at[61]);    MULADD(at[29], at[60]);    MULADD(at[30], at[59]);    MULADD(at[31], at[58]); 
-   COMBA_STORE(C->dp[57]);
-   /* 58 */
-   COMBA_FORWARD;
-   MULADD(at[27], at[63]);    MULADD(at[28], at[62]);    MULADD(at[29], at[61]);    MULADD(at[30], at[60]);    MULADD(at[31], at[59]); 
-   COMBA_STORE(C->dp[58]);
-   /* 59 */
-   COMBA_FORWARD;
-   MULADD(at[28], at[63]);    MULADD(at[29], at[62]);    MULADD(at[30], at[61]);    MULADD(at[31], at[60]); 
-   COMBA_STORE(C->dp[59]);
-   /* 60 */
-   COMBA_FORWARD;
-   MULADD(at[29], at[63]);    MULADD(at[30], at[62]);    MULADD(at[31], at[61]); 
-   COMBA_STORE(C->dp[60]);
-   /* 61 */
-   COMBA_FORWARD;
-   MULADD(at[30], at[63]);    MULADD(at[31], at[62]); 
-   COMBA_STORE(C->dp[61]);
-   /* 62 */
-   COMBA_FORWARD;
-   MULADD(at[31], at[63]); 
-   COMBA_STORE(C->dp[62]);
-   COMBA_STORE2(C->dp[63]);
-   C->used = 64;
-   C->sign = A->sign ^ B->sign;
-   fp_clamp(C);
-   COMBA_FINI;
-
-#ifdef WOLFSSL_SMALL_STACK
-   XFREE(at, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-#endif
-   return FP_OKAY;
-}
-#endif
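
fp_mul_comba32() above is the first of the deleted sizes to take an out_size early out: the operands are zero-padded into at[] and fp_clamp() strips leading zero digits anyway, so the unrolled schedule can stop as soon as every remaining column could only touch padding. For two 640-bit operands (20 digits each), out_size = A->used + B->used = 40, so the code stores column 39 via COMBA_STORE2 and skips columns 40 through 62. A hypothetical version of the same cut applied to the earlier generic sketch (same typedefs assumed; a and b must be zero-padded to n digits):

static void comba_mul_sketch_early(const fp_digit *a, int used_a,
                                   const fp_digit *b, int used_b,
                                   fp_digit *c, int n)
{
    int out_size = used_a + used_b;   /* digits the product can occupy */
    fp_digit c0 = 0, c1 = 0, c2 = 0;

    for (int col = 0; col < 2 * n && col < out_size; col++) {
        c0 = c1; c1 = c2; c2 = 0;
        for (int i = (col < n) ? 0 : col - n + 1; i <= col && i < n; i++) {
            fp_word t = (fp_word)a[i] * (fp_word)b[col - i];
            fp_word s = (fp_word)c0 + (fp_digit)t;
            c0 = (fp_digit)s;
            s = (fp_word)c1 + (fp_digit)(t >> DIGIT_BIT)
                            + (fp_digit)(s >> DIGIT_BIT);
            c1 = (fp_digit)s;
            c2 += (fp_digit)(s >> DIGIT_BIT);
        }
        c[col] = c0;
    }
}

The generated code only tests out_size after columns 38, 46, and 54 rather than on every column, trading a little wasted work for fewer branches in the unrolled stream.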

+ 0 - 83
lib/wolfssl/wolfcrypt/src/fp_mul_comba_4.i

@@ -1,83 +0,0 @@
-/* fp_mul_comba_4.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#ifdef TFM_MUL4
-int fp_mul_comba4(fp_int *A, fp_int *B, fp_int *C)
-{
-   fp_digit c0, c1, c2;
-#ifndef WOLFSSL_SMALL_STACK
-   fp_digit at[8];
-#else
-   fp_digit *at;
-#endif
-
-#ifdef WOLFSSL_SMALL_STACK
-   at = (fp_digit*)XMALLOC(sizeof(fp_digit) * 8, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-   if (at == NULL)
-       return FP_MEM;
-#endif
-
-   XMEMCPY(at, A->dp, 4 * sizeof(fp_digit));
-   XMEMCPY(at+4, B->dp, 4 * sizeof(fp_digit));
-   COMBA_START;
-
-   COMBA_CLEAR;
-   /* 0 */
-   MULADD(at[0], at[4]); 
-   COMBA_STORE(C->dp[0]);
-   /* 1 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[5]);    MULADD(at[1], at[4]); 
-   COMBA_STORE(C->dp[1]);
-   /* 2 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[6]);    MULADD(at[1], at[5]);    MULADD(at[2], at[4]); 
-   COMBA_STORE(C->dp[2]);
-   /* 3 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[7]);    MULADD(at[1], at[6]);    MULADD(at[2], at[5]);    MULADD(at[3], at[4]); 
-   COMBA_STORE(C->dp[3]);
-   /* 4 */
-   COMBA_FORWARD;
-   MULADD(at[1], at[7]);    MULADD(at[2], at[6]);    MULADD(at[3], at[5]); 
-   COMBA_STORE(C->dp[4]);
-   /* 5 */
-   COMBA_FORWARD;
-   MULADD(at[2], at[7]);    MULADD(at[3], at[6]); 
-   COMBA_STORE(C->dp[5]);
-   /* 6 */
-   COMBA_FORWARD;
-   MULADD(at[3], at[7]); 
-   COMBA_STORE(C->dp[6]);
-   COMBA_STORE2(C->dp[7]);
-   C->used = 8;
-   C->sign = A->sign ^ B->sign;
-   fp_clamp(C);
-   COMBA_FINI;
-
-#ifdef WOLFSSL_SMALL_STACK
-   XFREE(at, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-#endif
-   return FP_OKAY;
-}
-#endif
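
Each of these unrolled multipliers is only reachable when its TFM_MULn guard is defined; tfm.c's fp_mul() sizes up the operands and routes to the smallest variant that fits, falling back to the generic fp_mul_comba() loop otherwise. A paraphrase of that dispatch shape — illustrative, not the literal tfm.c source, and showing only the sizes deleted in this section:

int fp_mul_dispatch_sketch(fp_int *A, fp_int *B, fp_int *C)
{
    int y = (A->used > B->used) ? A->used : B->used;

#ifdef TFM_MUL3
    if (y <= 3)  return fp_mul_comba3(A, B, C);
#endif
#ifdef TFM_MUL4
    if (y <= 4)  return fp_mul_comba4(A, B, C);
#endif
    /* ... the other TFM_MULn sizes follow the same pattern ... */
#ifdef TFM_MUL32
    if (y <= 32) return fp_mul_comba32(A, B, C);
#endif
#ifdef TFM_MUL48
    if (y <= 48) return fp_mul_comba48(A, B, C);
#endif
    return fp_mul_comba(A, B, C);    /* generic loop in tfm.c */
}

With the .i files gone from this tree, every multiply has to go through the generic path, which is smaller but gives up the speed of the fixed-size schedules.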

+ 0 - 435
lib/wolfssl/wolfcrypt/src/fp_mul_comba_48.i

@@ -1,435 +0,0 @@
-/* fp_mul_comba_48.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#ifdef TFM_MUL48
-int fp_mul_comba48(fp_int *A, fp_int *B, fp_int *C)
-{
-   fp_digit c0, c1, c2;
-#ifndef WOLFSSL_SMALL_STACK
-   fp_digit at[96];
-#else
-   fp_digit *at;
-#endif
-
-#ifdef WOLFSSL_SMALL_STACK
-   at = (fp_digit*)XMALLOC(sizeof(fp_digit) * 96, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-   if (at == NULL)
-       return FP_MEM;
-#endif
-
-   XMEMCPY(at, A->dp, 48 * sizeof(fp_digit));
-   XMEMCPY(at+48, B->dp, 48 * sizeof(fp_digit));
-   COMBA_START;
-
-   COMBA_CLEAR;
-   /* 0 */
-   MULADD(at[0], at[48]); 
-   COMBA_STORE(C->dp[0]);
-   /* 1 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[49]);    MULADD(at[1], at[48]); 
-   COMBA_STORE(C->dp[1]);
-   /* 2 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[50]);    MULADD(at[1], at[49]);    MULADD(at[2], at[48]); 
-   COMBA_STORE(C->dp[2]);
-   /* 3 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[51]);    MULADD(at[1], at[50]);    MULADD(at[2], at[49]);    MULADD(at[3], at[48]); 
-   COMBA_STORE(C->dp[3]);
-   /* 4 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[52]);    MULADD(at[1], at[51]);    MULADD(at[2], at[50]);    MULADD(at[3], at[49]);    MULADD(at[4], at[48]); 
-   COMBA_STORE(C->dp[4]);
-   /* 5 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[53]);    MULADD(at[1], at[52]);    MULADD(at[2], at[51]);    MULADD(at[3], at[50]);    MULADD(at[4], at[49]);    MULADD(at[5], at[48]); 
-   COMBA_STORE(C->dp[5]);
-   /* 6 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[54]);    MULADD(at[1], at[53]);    MULADD(at[2], at[52]);    MULADD(at[3], at[51]);    MULADD(at[4], at[50]);    MULADD(at[5], at[49]);    MULADD(at[6], at[48]); 
-   COMBA_STORE(C->dp[6]);
-   /* 7 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[55]);    MULADD(at[1], at[54]);    MULADD(at[2], at[53]);    MULADD(at[3], at[52]);    MULADD(at[4], at[51]);    MULADD(at[5], at[50]);    MULADD(at[6], at[49]);    MULADD(at[7], at[48]); 
-   COMBA_STORE(C->dp[7]);
-   /* 8 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[56]);    MULADD(at[1], at[55]);    MULADD(at[2], at[54]);    MULADD(at[3], at[53]);    MULADD(at[4], at[52]);    MULADD(at[5], at[51]);    MULADD(at[6], at[50]);    MULADD(at[7], at[49]);    MULADD(at[8], at[48]); 
-   COMBA_STORE(C->dp[8]);
-   /* 9 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[57]);    MULADD(at[1], at[56]);    MULADD(at[2], at[55]);    MULADD(at[3], at[54]);    MULADD(at[4], at[53]);    MULADD(at[5], at[52]);    MULADD(at[6], at[51]);    MULADD(at[7], at[50]);    MULADD(at[8], at[49]);    MULADD(at[9], at[48]); 
-   COMBA_STORE(C->dp[9]);
-   /* 10 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[58]);    MULADD(at[1], at[57]);    MULADD(at[2], at[56]);    MULADD(at[3], at[55]);    MULADD(at[4], at[54]);    MULADD(at[5], at[53]);    MULADD(at[6], at[52]);    MULADD(at[7], at[51]);    MULADD(at[8], at[50]);    MULADD(at[9], at[49]);    MULADD(at[10], at[48]); 
-   COMBA_STORE(C->dp[10]);
-   /* 11 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[59]);    MULADD(at[1], at[58]);    MULADD(at[2], at[57]);    MULADD(at[3], at[56]);    MULADD(at[4], at[55]);    MULADD(at[5], at[54]);    MULADD(at[6], at[53]);    MULADD(at[7], at[52]);    MULADD(at[8], at[51]);    MULADD(at[9], at[50]);    MULADD(at[10], at[49]);    MULADD(at[11], at[48]); 
-   COMBA_STORE(C->dp[11]);
-   /* 12 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[60]);    MULADD(at[1], at[59]);    MULADD(at[2], at[58]);    MULADD(at[3], at[57]);    MULADD(at[4], at[56]);    MULADD(at[5], at[55]);    MULADD(at[6], at[54]);    MULADD(at[7], at[53]);    MULADD(at[8], at[52]);    MULADD(at[9], at[51]);    MULADD(at[10], at[50]);    MULADD(at[11], at[49]);    MULADD(at[12], at[48]); 
-   COMBA_STORE(C->dp[12]);
-   /* 13 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[61]);    MULADD(at[1], at[60]);    MULADD(at[2], at[59]);    MULADD(at[3], at[58]);    MULADD(at[4], at[57]);    MULADD(at[5], at[56]);    MULADD(at[6], at[55]);    MULADD(at[7], at[54]);    MULADD(at[8], at[53]);    MULADD(at[9], at[52]);    MULADD(at[10], at[51]);    MULADD(at[11], at[50]);    MULADD(at[12], at[49]);    MULADD(at[13], at[48]); 
-   COMBA_STORE(C->dp[13]);
-   /* 14 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[62]);    MULADD(at[1], at[61]);    MULADD(at[2], at[60]);    MULADD(at[3], at[59]);    MULADD(at[4], at[58]);    MULADD(at[5], at[57]);    MULADD(at[6], at[56]);    MULADD(at[7], at[55]);    MULADD(at[8], at[54]);    MULADD(at[9], at[53]);    MULADD(at[10], at[52]);    MULADD(at[11], at[51]);    MULADD(at[12], at[50]);    MULADD(at[13], at[49]);    MULADD(at[14], at[48]); 
-   COMBA_STORE(C->dp[14]);
-   /* 15 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[63]);    MULADD(at[1], at[62]);    MULADD(at[2], at[61]);    MULADD(at[3], at[60]);    MULADD(at[4], at[59]);    MULADD(at[5], at[58]);    MULADD(at[6], at[57]);    MULADD(at[7], at[56]);    MULADD(at[8], at[55]);    MULADD(at[9], at[54]);    MULADD(at[10], at[53]);    MULADD(at[11], at[52]);    MULADD(at[12], at[51]);    MULADD(at[13], at[50]);    MULADD(at[14], at[49]);    MULADD(at[15], at[48]); 
-   COMBA_STORE(C->dp[15]);
-   /* 16 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[64]);    MULADD(at[1], at[63]);    MULADD(at[2], at[62]);    MULADD(at[3], at[61]);    MULADD(at[4], at[60]);    MULADD(at[5], at[59]);    MULADD(at[6], at[58]);    MULADD(at[7], at[57]);    MULADD(at[8], at[56]);    MULADD(at[9], at[55]);    MULADD(at[10], at[54]);    MULADD(at[11], at[53]);    MULADD(at[12], at[52]);    MULADD(at[13], at[51]);    MULADD(at[14], at[50]);    MULADD(at[15], at[49]);    MULADD(at[16], at[48]); 
-   COMBA_STORE(C->dp[16]);
-   /* 17 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[65]);    MULADD(at[1], at[64]);    MULADD(at[2], at[63]);    MULADD(at[3], at[62]);    MULADD(at[4], at[61]);    MULADD(at[5], at[60]);    MULADD(at[6], at[59]);    MULADD(at[7], at[58]);    MULADD(at[8], at[57]);    MULADD(at[9], at[56]);    MULADD(at[10], at[55]);    MULADD(at[11], at[54]);    MULADD(at[12], at[53]);    MULADD(at[13], at[52]);    MULADD(at[14], at[51]);    MULADD(at[15], at[50]);    MULADD(at[16], at[49]);    MULADD(at[17], at[48]); 
-   COMBA_STORE(C->dp[17]);
-   /* 18 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[66]);    MULADD(at[1], at[65]);    MULADD(at[2], at[64]);    MULADD(at[3], at[63]);    MULADD(at[4], at[62]);    MULADD(at[5], at[61]);    MULADD(at[6], at[60]);    MULADD(at[7], at[59]);    MULADD(at[8], at[58]);    MULADD(at[9], at[57]);    MULADD(at[10], at[56]);    MULADD(at[11], at[55]);    MULADD(at[12], at[54]);    MULADD(at[13], at[53]);    MULADD(at[14], at[52]);    MULADD(at[15], at[51]);    MULADD(at[16], at[50]);    MULADD(at[17], at[49]);    MULADD(at[18], at[48]); 
-   COMBA_STORE(C->dp[18]);
-   /* 19 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[67]);    MULADD(at[1], at[66]);    MULADD(at[2], at[65]);    MULADD(at[3], at[64]);    MULADD(at[4], at[63]);    MULADD(at[5], at[62]);    MULADD(at[6], at[61]);    MULADD(at[7], at[60]);    MULADD(at[8], at[59]);    MULADD(at[9], at[58]);    MULADD(at[10], at[57]);    MULADD(at[11], at[56]);    MULADD(at[12], at[55]);    MULADD(at[13], at[54]);    MULADD(at[14], at[53]);    MULADD(at[15], at[52]);    MULADD(at[16], at[51]);    MULADD(at[17], at[50]);    MULADD(at[18], at[49]);    MULADD(at[19], at[48]); 
-   COMBA_STORE(C->dp[19]);
-   /* 20 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[68]);    MULADD(at[1], at[67]);    MULADD(at[2], at[66]);    MULADD(at[3], at[65]);    MULADD(at[4], at[64]);    MULADD(at[5], at[63]);    MULADD(at[6], at[62]);    MULADD(at[7], at[61]);    MULADD(at[8], at[60]);    MULADD(at[9], at[59]);    MULADD(at[10], at[58]);    MULADD(at[11], at[57]);    MULADD(at[12], at[56]);    MULADD(at[13], at[55]);    MULADD(at[14], at[54]);    MULADD(at[15], at[53]);    MULADD(at[16], at[52]);    MULADD(at[17], at[51]);    MULADD(at[18], at[50]);    MULADD(at[19], at[49]);    MULADD(at[20], at[48]); 
-   COMBA_STORE(C->dp[20]);
-   /* 21 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[69]);    MULADD(at[1], at[68]);    MULADD(at[2], at[67]);    MULADD(at[3], at[66]);    MULADD(at[4], at[65]);    MULADD(at[5], at[64]);    MULADD(at[6], at[63]);    MULADD(at[7], at[62]);    MULADD(at[8], at[61]);    MULADD(at[9], at[60]);    MULADD(at[10], at[59]);    MULADD(at[11], at[58]);    MULADD(at[12], at[57]);    MULADD(at[13], at[56]);    MULADD(at[14], at[55]);    MULADD(at[15], at[54]);    MULADD(at[16], at[53]);    MULADD(at[17], at[52]);    MULADD(at[18], at[51]);    MULADD(at[19], at[50]);    MULADD(at[20], at[49]);    MULADD(at[21], at[48]); 
-   COMBA_STORE(C->dp[21]);
-   /* 22 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[70]);    MULADD(at[1], at[69]);    MULADD(at[2], at[68]);    MULADD(at[3], at[67]);    MULADD(at[4], at[66]);    MULADD(at[5], at[65]);    MULADD(at[6], at[64]);    MULADD(at[7], at[63]);    MULADD(at[8], at[62]);    MULADD(at[9], at[61]);    MULADD(at[10], at[60]);    MULADD(at[11], at[59]);    MULADD(at[12], at[58]);    MULADD(at[13], at[57]);    MULADD(at[14], at[56]);    MULADD(at[15], at[55]);    MULADD(at[16], at[54]);    MULADD(at[17], at[53]);    MULADD(at[18], at[52]);    MULADD(at[19], at[51]);    MULADD(at[20], at[50]);    MULADD(at[21], at[49]);    MULADD(at[22], at[48]); 
-   COMBA_STORE(C->dp[22]);
-   /* 23 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[71]);    MULADD(at[1], at[70]);    MULADD(at[2], at[69]);    MULADD(at[3], at[68]);    MULADD(at[4], at[67]);    MULADD(at[5], at[66]);    MULADD(at[6], at[65]);    MULADD(at[7], at[64]);    MULADD(at[8], at[63]);    MULADD(at[9], at[62]);    MULADD(at[10], at[61]);    MULADD(at[11], at[60]);    MULADD(at[12], at[59]);    MULADD(at[13], at[58]);    MULADD(at[14], at[57]);    MULADD(at[15], at[56]);    MULADD(at[16], at[55]);    MULADD(at[17], at[54]);    MULADD(at[18], at[53]);    MULADD(at[19], at[52]);    MULADD(at[20], at[51]);    MULADD(at[21], at[50]);    MULADD(at[22], at[49]);    MULADD(at[23], at[48]); 
-   COMBA_STORE(C->dp[23]);
-   /* 24 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[72]);    MULADD(at[1], at[71]);    MULADD(at[2], at[70]);    MULADD(at[3], at[69]);    MULADD(at[4], at[68]);    MULADD(at[5], at[67]);    MULADD(at[6], at[66]);    MULADD(at[7], at[65]);    MULADD(at[8], at[64]);    MULADD(at[9], at[63]);    MULADD(at[10], at[62]);    MULADD(at[11], at[61]);    MULADD(at[12], at[60]);    MULADD(at[13], at[59]);    MULADD(at[14], at[58]);    MULADD(at[15], at[57]);    MULADD(at[16], at[56]);    MULADD(at[17], at[55]);    MULADD(at[18], at[54]);    MULADD(at[19], at[53]);    MULADD(at[20], at[52]);    MULADD(at[21], at[51]);    MULADD(at[22], at[50]);    MULADD(at[23], at[49]);    MULADD(at[24], at[48]); 
-   COMBA_STORE(C->dp[24]);
-   /* 25 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[73]);    MULADD(at[1], at[72]);    MULADD(at[2], at[71]);    MULADD(at[3], at[70]);    MULADD(at[4], at[69]);    MULADD(at[5], at[68]);    MULADD(at[6], at[67]);    MULADD(at[7], at[66]);    MULADD(at[8], at[65]);    MULADD(at[9], at[64]);    MULADD(at[10], at[63]);    MULADD(at[11], at[62]);    MULADD(at[12], at[61]);    MULADD(at[13], at[60]);    MULADD(at[14], at[59]);    MULADD(at[15], at[58]);    MULADD(at[16], at[57]);    MULADD(at[17], at[56]);    MULADD(at[18], at[55]);    MULADD(at[19], at[54]);    MULADD(at[20], at[53]);    MULADD(at[21], at[52]);    MULADD(at[22], at[51]);    MULADD(at[23], at[50]);    MULADD(at[24], at[49]);    MULADD(at[25], at[48]); 
-   COMBA_STORE(C->dp[25]);
-   /* 26 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[74]);    MULADD(at[1], at[73]);    MULADD(at[2], at[72]);    MULADD(at[3], at[71]);    MULADD(at[4], at[70]);    MULADD(at[5], at[69]);    MULADD(at[6], at[68]);    MULADD(at[7], at[67]);    MULADD(at[8], at[66]);    MULADD(at[9], at[65]);    MULADD(at[10], at[64]);    MULADD(at[11], at[63]);    MULADD(at[12], at[62]);    MULADD(at[13], at[61]);    MULADD(at[14], at[60]);    MULADD(at[15], at[59]);    MULADD(at[16], at[58]);    MULADD(at[17], at[57]);    MULADD(at[18], at[56]);    MULADD(at[19], at[55]);    MULADD(at[20], at[54]);    MULADD(at[21], at[53]);    MULADD(at[22], at[52]);    MULADD(at[23], at[51]);    MULADD(at[24], at[50]);    MULADD(at[25], at[49]);    MULADD(at[26], at[48]); 
-   COMBA_STORE(C->dp[26]);
-   /* 27 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[75]);    MULADD(at[1], at[74]);    MULADD(at[2], at[73]);    MULADD(at[3], at[72]);    MULADD(at[4], at[71]);    MULADD(at[5], at[70]);    MULADD(at[6], at[69]);    MULADD(at[7], at[68]);    MULADD(at[8], at[67]);    MULADD(at[9], at[66]);    MULADD(at[10], at[65]);    MULADD(at[11], at[64]);    MULADD(at[12], at[63]);    MULADD(at[13], at[62]);    MULADD(at[14], at[61]);    MULADD(at[15], at[60]);    MULADD(at[16], at[59]);    MULADD(at[17], at[58]);    MULADD(at[18], at[57]);    MULADD(at[19], at[56]);    MULADD(at[20], at[55]);    MULADD(at[21], at[54]);    MULADD(at[22], at[53]);    MULADD(at[23], at[52]);    MULADD(at[24], at[51]);    MULADD(at[25], at[50]);    MULADD(at[26], at[49]);    MULADD(at[27], at[48]); 
-   COMBA_STORE(C->dp[27]);
-   /* 28 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[76]);    MULADD(at[1], at[75]);    MULADD(at[2], at[74]);    MULADD(at[3], at[73]);    MULADD(at[4], at[72]);    MULADD(at[5], at[71]);    MULADD(at[6], at[70]);    MULADD(at[7], at[69]);    MULADD(at[8], at[68]);    MULADD(at[9], at[67]);    MULADD(at[10], at[66]);    MULADD(at[11], at[65]);    MULADD(at[12], at[64]);    MULADD(at[13], at[63]);    MULADD(at[14], at[62]);    MULADD(at[15], at[61]);    MULADD(at[16], at[60]);    MULADD(at[17], at[59]);    MULADD(at[18], at[58]);    MULADD(at[19], at[57]);    MULADD(at[20], at[56]);    MULADD(at[21], at[55]);    MULADD(at[22], at[54]);    MULADD(at[23], at[53]);    MULADD(at[24], at[52]);    MULADD(at[25], at[51]);    MULADD(at[26], at[50]);    MULADD(at[27], at[49]);    MULADD(at[28], at[48]); 
-   COMBA_STORE(C->dp[28]);
-   /* 29 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[77]);    MULADD(at[1], at[76]);    MULADD(at[2], at[75]);    MULADD(at[3], at[74]);    MULADD(at[4], at[73]);    MULADD(at[5], at[72]);    MULADD(at[6], at[71]);    MULADD(at[7], at[70]);    MULADD(at[8], at[69]);    MULADD(at[9], at[68]);    MULADD(at[10], at[67]);    MULADD(at[11], at[66]);    MULADD(at[12], at[65]);    MULADD(at[13], at[64]);    MULADD(at[14], at[63]);    MULADD(at[15], at[62]);    MULADD(at[16], at[61]);    MULADD(at[17], at[60]);    MULADD(at[18], at[59]);    MULADD(at[19], at[58]);    MULADD(at[20], at[57]);    MULADD(at[21], at[56]);    MULADD(at[22], at[55]);    MULADD(at[23], at[54]);    MULADD(at[24], at[53]);    MULADD(at[25], at[52]);    MULADD(at[26], at[51]);    MULADD(at[27], at[50]);    MULADD(at[28], at[49]);    MULADD(at[29], at[48]); 
-   COMBA_STORE(C->dp[29]);
-   /* 30 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[78]);    MULADD(at[1], at[77]);    MULADD(at[2], at[76]);    MULADD(at[3], at[75]);    MULADD(at[4], at[74]);    MULADD(at[5], at[73]);    MULADD(at[6], at[72]);    MULADD(at[7], at[71]);    MULADD(at[8], at[70]);    MULADD(at[9], at[69]);    MULADD(at[10], at[68]);    MULADD(at[11], at[67]);    MULADD(at[12], at[66]);    MULADD(at[13], at[65]);    MULADD(at[14], at[64]);    MULADD(at[15], at[63]);    MULADD(at[16], at[62]);    MULADD(at[17], at[61]);    MULADD(at[18], at[60]);    MULADD(at[19], at[59]);    MULADD(at[20], at[58]);    MULADD(at[21], at[57]);    MULADD(at[22], at[56]);    MULADD(at[23], at[55]);    MULADD(at[24], at[54]);    MULADD(at[25], at[53]);    MULADD(at[26], at[52]);    MULADD(at[27], at[51]);    MULADD(at[28], at[50]);    MULADD(at[29], at[49]);    MULADD(at[30], at[48]); 
-   COMBA_STORE(C->dp[30]);
-   /* 31 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[79]);    MULADD(at[1], at[78]);    MULADD(at[2], at[77]);    MULADD(at[3], at[76]);    MULADD(at[4], at[75]);    MULADD(at[5], at[74]);    MULADD(at[6], at[73]);    MULADD(at[7], at[72]);    MULADD(at[8], at[71]);    MULADD(at[9], at[70]);    MULADD(at[10], at[69]);    MULADD(at[11], at[68]);    MULADD(at[12], at[67]);    MULADD(at[13], at[66]);    MULADD(at[14], at[65]);    MULADD(at[15], at[64]);    MULADD(at[16], at[63]);    MULADD(at[17], at[62]);    MULADD(at[18], at[61]);    MULADD(at[19], at[60]);    MULADD(at[20], at[59]);    MULADD(at[21], at[58]);    MULADD(at[22], at[57]);    MULADD(at[23], at[56]);    MULADD(at[24], at[55]);    MULADD(at[25], at[54]);    MULADD(at[26], at[53]);    MULADD(at[27], at[52]);    MULADD(at[28], at[51]);    MULADD(at[29], at[50]);    MULADD(at[30], at[49]);    MULADD(at[31], at[48]); 
-   COMBA_STORE(C->dp[31]);
-   /* 32 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[80]);    MULADD(at[1], at[79]);    MULADD(at[2], at[78]);    MULADD(at[3], at[77]);    MULADD(at[4], at[76]);    MULADD(at[5], at[75]);    MULADD(at[6], at[74]);    MULADD(at[7], at[73]);    MULADD(at[8], at[72]);    MULADD(at[9], at[71]);    MULADD(at[10], at[70]);    MULADD(at[11], at[69]);    MULADD(at[12], at[68]);    MULADD(at[13], at[67]);    MULADD(at[14], at[66]);    MULADD(at[15], at[65]);    MULADD(at[16], at[64]);    MULADD(at[17], at[63]);    MULADD(at[18], at[62]);    MULADD(at[19], at[61]);    MULADD(at[20], at[60]);    MULADD(at[21], at[59]);    MULADD(at[22], at[58]);    MULADD(at[23], at[57]);    MULADD(at[24], at[56]);    MULADD(at[25], at[55]);    MULADD(at[26], at[54]);    MULADD(at[27], at[53]);    MULADD(at[28], at[52]);    MULADD(at[29], at[51]);    MULADD(at[30], at[50]);    MULADD(at[31], at[49]);    MULADD(at[32], at[48]); 
-   COMBA_STORE(C->dp[32]);
-   /* 33 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[81]);    MULADD(at[1], at[80]);    MULADD(at[2], at[79]);    MULADD(at[3], at[78]);    MULADD(at[4], at[77]);    MULADD(at[5], at[76]);    MULADD(at[6], at[75]);    MULADD(at[7], at[74]);    MULADD(at[8], at[73]);    MULADD(at[9], at[72]);    MULADD(at[10], at[71]);    MULADD(at[11], at[70]);    MULADD(at[12], at[69]);    MULADD(at[13], at[68]);    MULADD(at[14], at[67]);    MULADD(at[15], at[66]);    MULADD(at[16], at[65]);    MULADD(at[17], at[64]);    MULADD(at[18], at[63]);    MULADD(at[19], at[62]);    MULADD(at[20], at[61]);    MULADD(at[21], at[60]);    MULADD(at[22], at[59]);    MULADD(at[23], at[58]);    MULADD(at[24], at[57]);    MULADD(at[25], at[56]);    MULADD(at[26], at[55]);    MULADD(at[27], at[54]);    MULADD(at[28], at[53]);    MULADD(at[29], at[52]);    MULADD(at[30], at[51]);    MULADD(at[31], at[50]);    MULADD(at[32], at[49]);    MULADD(at[33], at[48]); 
-   COMBA_STORE(C->dp[33]);
-   /* 34 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[82]);    MULADD(at[1], at[81]);    MULADD(at[2], at[80]);    MULADD(at[3], at[79]);    MULADD(at[4], at[78]);    MULADD(at[5], at[77]);    MULADD(at[6], at[76]);    MULADD(at[7], at[75]);    MULADD(at[8], at[74]);    MULADD(at[9], at[73]);    MULADD(at[10], at[72]);    MULADD(at[11], at[71]);    MULADD(at[12], at[70]);    MULADD(at[13], at[69]);    MULADD(at[14], at[68]);    MULADD(at[15], at[67]);    MULADD(at[16], at[66]);    MULADD(at[17], at[65]);    MULADD(at[18], at[64]);    MULADD(at[19], at[63]);    MULADD(at[20], at[62]);    MULADD(at[21], at[61]);    MULADD(at[22], at[60]);    MULADD(at[23], at[59]);    MULADD(at[24], at[58]);    MULADD(at[25], at[57]);    MULADD(at[26], at[56]);    MULADD(at[27], at[55]);    MULADD(at[28], at[54]);    MULADD(at[29], at[53]);    MULADD(at[30], at[52]);    MULADD(at[31], at[51]);    MULADD(at[32], at[50]);    MULADD(at[33], at[49]);    MULADD(at[34], at[48]); 
-   COMBA_STORE(C->dp[34]);
-   /* 35 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[83]);    MULADD(at[1], at[82]);    MULADD(at[2], at[81]);    MULADD(at[3], at[80]);    MULADD(at[4], at[79]);    MULADD(at[5], at[78]);    MULADD(at[6], at[77]);    MULADD(at[7], at[76]);    MULADD(at[8], at[75]);    MULADD(at[9], at[74]);    MULADD(at[10], at[73]);    MULADD(at[11], at[72]);    MULADD(at[12], at[71]);    MULADD(at[13], at[70]);    MULADD(at[14], at[69]);    MULADD(at[15], at[68]);    MULADD(at[16], at[67]);    MULADD(at[17], at[66]);    MULADD(at[18], at[65]);    MULADD(at[19], at[64]);    MULADD(at[20], at[63]);    MULADD(at[21], at[62]);    MULADD(at[22], at[61]);    MULADD(at[23], at[60]);    MULADD(at[24], at[59]);    MULADD(at[25], at[58]);    MULADD(at[26], at[57]);    MULADD(at[27], at[56]);    MULADD(at[28], at[55]);    MULADD(at[29], at[54]);    MULADD(at[30], at[53]);    MULADD(at[31], at[52]);    MULADD(at[32], at[51]);    MULADD(at[33], at[50]);    MULADD(at[34], at[49]);    MULADD(at[35], at[48]); 
-   COMBA_STORE(C->dp[35]);
-   /* 36 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[84]);    MULADD(at[1], at[83]);    MULADD(at[2], at[82]);    MULADD(at[3], at[81]);    MULADD(at[4], at[80]);    MULADD(at[5], at[79]);    MULADD(at[6], at[78]);    MULADD(at[7], at[77]);    MULADD(at[8], at[76]);    MULADD(at[9], at[75]);    MULADD(at[10], at[74]);    MULADD(at[11], at[73]);    MULADD(at[12], at[72]);    MULADD(at[13], at[71]);    MULADD(at[14], at[70]);    MULADD(at[15], at[69]);    MULADD(at[16], at[68]);    MULADD(at[17], at[67]);    MULADD(at[18], at[66]);    MULADD(at[19], at[65]);    MULADD(at[20], at[64]);    MULADD(at[21], at[63]);    MULADD(at[22], at[62]);    MULADD(at[23], at[61]);    MULADD(at[24], at[60]);    MULADD(at[25], at[59]);    MULADD(at[26], at[58]);    MULADD(at[27], at[57]);    MULADD(at[28], at[56]);    MULADD(at[29], at[55]);    MULADD(at[30], at[54]);    MULADD(at[31], at[53]);    MULADD(at[32], at[52]);    MULADD(at[33], at[51]);    MULADD(at[34], at[50]);    MULADD(at[35], at[49]);    MULADD(at[36], at[48]); 
-   COMBA_STORE(C->dp[36]);
-   /* 37 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[85]);    MULADD(at[1], at[84]);    MULADD(at[2], at[83]);    MULADD(at[3], at[82]);    MULADD(at[4], at[81]);    MULADD(at[5], at[80]);    MULADD(at[6], at[79]);    MULADD(at[7], at[78]);    MULADD(at[8], at[77]);    MULADD(at[9], at[76]);    MULADD(at[10], at[75]);    MULADD(at[11], at[74]);    MULADD(at[12], at[73]);    MULADD(at[13], at[72]);    MULADD(at[14], at[71]);    MULADD(at[15], at[70]);    MULADD(at[16], at[69]);    MULADD(at[17], at[68]);    MULADD(at[18], at[67]);    MULADD(at[19], at[66]);    MULADD(at[20], at[65]);    MULADD(at[21], at[64]);    MULADD(at[22], at[63]);    MULADD(at[23], at[62]);    MULADD(at[24], at[61]);    MULADD(at[25], at[60]);    MULADD(at[26], at[59]);    MULADD(at[27], at[58]);    MULADD(at[28], at[57]);    MULADD(at[29], at[56]);    MULADD(at[30], at[55]);    MULADD(at[31], at[54]);    MULADD(at[32], at[53]);    MULADD(at[33], at[52]);    MULADD(at[34], at[51]);    MULADD(at[35], at[50]);    MULADD(at[36], at[49]);    MULADD(at[37], at[48]); 
-   COMBA_STORE(C->dp[37]);
-   /* 38 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[86]);    MULADD(at[1], at[85]);    MULADD(at[2], at[84]);    MULADD(at[3], at[83]);    MULADD(at[4], at[82]);    MULADD(at[5], at[81]);    MULADD(at[6], at[80]);    MULADD(at[7], at[79]);    MULADD(at[8], at[78]);    MULADD(at[9], at[77]);    MULADD(at[10], at[76]);    MULADD(at[11], at[75]);    MULADD(at[12], at[74]);    MULADD(at[13], at[73]);    MULADD(at[14], at[72]);    MULADD(at[15], at[71]);    MULADD(at[16], at[70]);    MULADD(at[17], at[69]);    MULADD(at[18], at[68]);    MULADD(at[19], at[67]);    MULADD(at[20], at[66]);    MULADD(at[21], at[65]);    MULADD(at[22], at[64]);    MULADD(at[23], at[63]);    MULADD(at[24], at[62]);    MULADD(at[25], at[61]);    MULADD(at[26], at[60]);    MULADD(at[27], at[59]);    MULADD(at[28], at[58]);    MULADD(at[29], at[57]);    MULADD(at[30], at[56]);    MULADD(at[31], at[55]);    MULADD(at[32], at[54]);    MULADD(at[33], at[53]);    MULADD(at[34], at[52]);    MULADD(at[35], at[51]);    MULADD(at[36], at[50]);    MULADD(at[37], at[49]);    MULADD(at[38], at[48]); 
-   COMBA_STORE(C->dp[38]);
-   /* 39 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[87]);    MULADD(at[1], at[86]);    MULADD(at[2], at[85]);    MULADD(at[3], at[84]);    MULADD(at[4], at[83]);    MULADD(at[5], at[82]);    MULADD(at[6], at[81]);    MULADD(at[7], at[80]);    MULADD(at[8], at[79]);    MULADD(at[9], at[78]);    MULADD(at[10], at[77]);    MULADD(at[11], at[76]);    MULADD(at[12], at[75]);    MULADD(at[13], at[74]);    MULADD(at[14], at[73]);    MULADD(at[15], at[72]);    MULADD(at[16], at[71]);    MULADD(at[17], at[70]);    MULADD(at[18], at[69]);    MULADD(at[19], at[68]);    MULADD(at[20], at[67]);    MULADD(at[21], at[66]);    MULADD(at[22], at[65]);    MULADD(at[23], at[64]);    MULADD(at[24], at[63]);    MULADD(at[25], at[62]);    MULADD(at[26], at[61]);    MULADD(at[27], at[60]);    MULADD(at[28], at[59]);    MULADD(at[29], at[58]);    MULADD(at[30], at[57]);    MULADD(at[31], at[56]);    MULADD(at[32], at[55]);    MULADD(at[33], at[54]);    MULADD(at[34], at[53]);    MULADD(at[35], at[52]);    MULADD(at[36], at[51]);    MULADD(at[37], at[50]);    MULADD(at[38], at[49]);    MULADD(at[39], at[48]); 
-   COMBA_STORE(C->dp[39]);
-   /* 40 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[88]);    MULADD(at[1], at[87]);    MULADD(at[2], at[86]);    MULADD(at[3], at[85]);    MULADD(at[4], at[84]);    MULADD(at[5], at[83]);    MULADD(at[6], at[82]);    MULADD(at[7], at[81]);    MULADD(at[8], at[80]);    MULADD(at[9], at[79]);    MULADD(at[10], at[78]);    MULADD(at[11], at[77]);    MULADD(at[12], at[76]);    MULADD(at[13], at[75]);    MULADD(at[14], at[74]);    MULADD(at[15], at[73]);    MULADD(at[16], at[72]);    MULADD(at[17], at[71]);    MULADD(at[18], at[70]);    MULADD(at[19], at[69]);    MULADD(at[20], at[68]);    MULADD(at[21], at[67]);    MULADD(at[22], at[66]);    MULADD(at[23], at[65]);    MULADD(at[24], at[64]);    MULADD(at[25], at[63]);    MULADD(at[26], at[62]);    MULADD(at[27], at[61]);    MULADD(at[28], at[60]);    MULADD(at[29], at[59]);    MULADD(at[30], at[58]);    MULADD(at[31], at[57]);    MULADD(at[32], at[56]);    MULADD(at[33], at[55]);    MULADD(at[34], at[54]);    MULADD(at[35], at[53]);    MULADD(at[36], at[52]);    MULADD(at[37], at[51]);    MULADD(at[38], at[50]);    MULADD(at[39], at[49]);    MULADD(at[40], at[48]); 
-   COMBA_STORE(C->dp[40]);
-   /* 41 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[89]);    MULADD(at[1], at[88]);    MULADD(at[2], at[87]);    MULADD(at[3], at[86]);    MULADD(at[4], at[85]);    MULADD(at[5], at[84]);    MULADD(at[6], at[83]);    MULADD(at[7], at[82]);    MULADD(at[8], at[81]);    MULADD(at[9], at[80]);    MULADD(at[10], at[79]);    MULADD(at[11], at[78]);    MULADD(at[12], at[77]);    MULADD(at[13], at[76]);    MULADD(at[14], at[75]);    MULADD(at[15], at[74]);    MULADD(at[16], at[73]);    MULADD(at[17], at[72]);    MULADD(at[18], at[71]);    MULADD(at[19], at[70]);    MULADD(at[20], at[69]);    MULADD(at[21], at[68]);    MULADD(at[22], at[67]);    MULADD(at[23], at[66]);    MULADD(at[24], at[65]);    MULADD(at[25], at[64]);    MULADD(at[26], at[63]);    MULADD(at[27], at[62]);    MULADD(at[28], at[61]);    MULADD(at[29], at[60]);    MULADD(at[30], at[59]);    MULADD(at[31], at[58]);    MULADD(at[32], at[57]);    MULADD(at[33], at[56]);    MULADD(at[34], at[55]);    MULADD(at[35], at[54]);    MULADD(at[36], at[53]);    MULADD(at[37], at[52]);    MULADD(at[38], at[51]);    MULADD(at[39], at[50]);    MULADD(at[40], at[49]);    MULADD(at[41], at[48]); 
-   COMBA_STORE(C->dp[41]);
-   /* 42 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[90]);    MULADD(at[1], at[89]);    MULADD(at[2], at[88]);    MULADD(at[3], at[87]);    MULADD(at[4], at[86]);    MULADD(at[5], at[85]);    MULADD(at[6], at[84]);    MULADD(at[7], at[83]);    MULADD(at[8], at[82]);    MULADD(at[9], at[81]);    MULADD(at[10], at[80]);    MULADD(at[11], at[79]);    MULADD(at[12], at[78]);    MULADD(at[13], at[77]);    MULADD(at[14], at[76]);    MULADD(at[15], at[75]);    MULADD(at[16], at[74]);    MULADD(at[17], at[73]);    MULADD(at[18], at[72]);    MULADD(at[19], at[71]);    MULADD(at[20], at[70]);    MULADD(at[21], at[69]);    MULADD(at[22], at[68]);    MULADD(at[23], at[67]);    MULADD(at[24], at[66]);    MULADD(at[25], at[65]);    MULADD(at[26], at[64]);    MULADD(at[27], at[63]);    MULADD(at[28], at[62]);    MULADD(at[29], at[61]);    MULADD(at[30], at[60]);    MULADD(at[31], at[59]);    MULADD(at[32], at[58]);    MULADD(at[33], at[57]);    MULADD(at[34], at[56]);    MULADD(at[35], at[55]);    MULADD(at[36], at[54]);    MULADD(at[37], at[53]);    MULADD(at[38], at[52]);    MULADD(at[39], at[51]);    MULADD(at[40], at[50]);    MULADD(at[41], at[49]);    MULADD(at[42], at[48]); 
-   COMBA_STORE(C->dp[42]);
-   /* 43 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[91]);    MULADD(at[1], at[90]);    MULADD(at[2], at[89]);    MULADD(at[3], at[88]);    MULADD(at[4], at[87]);    MULADD(at[5], at[86]);    MULADD(at[6], at[85]);    MULADD(at[7], at[84]);    MULADD(at[8], at[83]);    MULADD(at[9], at[82]);    MULADD(at[10], at[81]);    MULADD(at[11], at[80]);    MULADD(at[12], at[79]);    MULADD(at[13], at[78]);    MULADD(at[14], at[77]);    MULADD(at[15], at[76]);    MULADD(at[16], at[75]);    MULADD(at[17], at[74]);    MULADD(at[18], at[73]);    MULADD(at[19], at[72]);    MULADD(at[20], at[71]);    MULADD(at[21], at[70]);    MULADD(at[22], at[69]);    MULADD(at[23], at[68]);    MULADD(at[24], at[67]);    MULADD(at[25], at[66]);    MULADD(at[26], at[65]);    MULADD(at[27], at[64]);    MULADD(at[28], at[63]);    MULADD(at[29], at[62]);    MULADD(at[30], at[61]);    MULADD(at[31], at[60]);    MULADD(at[32], at[59]);    MULADD(at[33], at[58]);    MULADD(at[34], at[57]);    MULADD(at[35], at[56]);    MULADD(at[36], at[55]);    MULADD(at[37], at[54]);    MULADD(at[38], at[53]);    MULADD(at[39], at[52]);    MULADD(at[40], at[51]);    MULADD(at[41], at[50]);    MULADD(at[42], at[49]);    MULADD(at[43], at[48]); 
-   COMBA_STORE(C->dp[43]);
-   /* 44 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[92]);    MULADD(at[1], at[91]);    MULADD(at[2], at[90]);    MULADD(at[3], at[89]);    MULADD(at[4], at[88]);    MULADD(at[5], at[87]);    MULADD(at[6], at[86]);    MULADD(at[7], at[85]);    MULADD(at[8], at[84]);    MULADD(at[9], at[83]);    MULADD(at[10], at[82]);    MULADD(at[11], at[81]);    MULADD(at[12], at[80]);    MULADD(at[13], at[79]);    MULADD(at[14], at[78]);    MULADD(at[15], at[77]);    MULADD(at[16], at[76]);    MULADD(at[17], at[75]);    MULADD(at[18], at[74]);    MULADD(at[19], at[73]);    MULADD(at[20], at[72]);    MULADD(at[21], at[71]);    MULADD(at[22], at[70]);    MULADD(at[23], at[69]);    MULADD(at[24], at[68]);    MULADD(at[25], at[67]);    MULADD(at[26], at[66]);    MULADD(at[27], at[65]);    MULADD(at[28], at[64]);    MULADD(at[29], at[63]);    MULADD(at[30], at[62]);    MULADD(at[31], at[61]);    MULADD(at[32], at[60]);    MULADD(at[33], at[59]);    MULADD(at[34], at[58]);    MULADD(at[35], at[57]);    MULADD(at[36], at[56]);    MULADD(at[37], at[55]);    MULADD(at[38], at[54]);    MULADD(at[39], at[53]);    MULADD(at[40], at[52]);    MULADD(at[41], at[51]);    MULADD(at[42], at[50]);    MULADD(at[43], at[49]);    MULADD(at[44], at[48]); 
-   COMBA_STORE(C->dp[44]);
-   /* 45 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[93]);    MULADD(at[1], at[92]);    MULADD(at[2], at[91]);    MULADD(at[3], at[90]);    MULADD(at[4], at[89]);    MULADD(at[5], at[88]);    MULADD(at[6], at[87]);    MULADD(at[7], at[86]);    MULADD(at[8], at[85]);    MULADD(at[9], at[84]);    MULADD(at[10], at[83]);    MULADD(at[11], at[82]);    MULADD(at[12], at[81]);    MULADD(at[13], at[80]);    MULADD(at[14], at[79]);    MULADD(at[15], at[78]);    MULADD(at[16], at[77]);    MULADD(at[17], at[76]);    MULADD(at[18], at[75]);    MULADD(at[19], at[74]);    MULADD(at[20], at[73]);    MULADD(at[21], at[72]);    MULADD(at[22], at[71]);    MULADD(at[23], at[70]);    MULADD(at[24], at[69]);    MULADD(at[25], at[68]);    MULADD(at[26], at[67]);    MULADD(at[27], at[66]);    MULADD(at[28], at[65]);    MULADD(at[29], at[64]);    MULADD(at[30], at[63]);    MULADD(at[31], at[62]);    MULADD(at[32], at[61]);    MULADD(at[33], at[60]);    MULADD(at[34], at[59]);    MULADD(at[35], at[58]);    MULADD(at[36], at[57]);    MULADD(at[37], at[56]);    MULADD(at[38], at[55]);    MULADD(at[39], at[54]);    MULADD(at[40], at[53]);    MULADD(at[41], at[52]);    MULADD(at[42], at[51]);    MULADD(at[43], at[50]);    MULADD(at[44], at[49]);    MULADD(at[45], at[48]); 
-   COMBA_STORE(C->dp[45]);
-   /* 46 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[94]);    MULADD(at[1], at[93]);    MULADD(at[2], at[92]);    MULADD(at[3], at[91]);    MULADD(at[4], at[90]);    MULADD(at[5], at[89]);    MULADD(at[6], at[88]);    MULADD(at[7], at[87]);    MULADD(at[8], at[86]);    MULADD(at[9], at[85]);    MULADD(at[10], at[84]);    MULADD(at[11], at[83]);    MULADD(at[12], at[82]);    MULADD(at[13], at[81]);    MULADD(at[14], at[80]);    MULADD(at[15], at[79]);    MULADD(at[16], at[78]);    MULADD(at[17], at[77]);    MULADD(at[18], at[76]);    MULADD(at[19], at[75]);    MULADD(at[20], at[74]);    MULADD(at[21], at[73]);    MULADD(at[22], at[72]);    MULADD(at[23], at[71]);    MULADD(at[24], at[70]);    MULADD(at[25], at[69]);    MULADD(at[26], at[68]);    MULADD(at[27], at[67]);    MULADD(at[28], at[66]);    MULADD(at[29], at[65]);    MULADD(at[30], at[64]);    MULADD(at[31], at[63]);    MULADD(at[32], at[62]);    MULADD(at[33], at[61]);    MULADD(at[34], at[60]);    MULADD(at[35], at[59]);    MULADD(at[36], at[58]);    MULADD(at[37], at[57]);    MULADD(at[38], at[56]);    MULADD(at[39], at[55]);    MULADD(at[40], at[54]);    MULADD(at[41], at[53]);    MULADD(at[42], at[52]);    MULADD(at[43], at[51]);    MULADD(at[44], at[50]);    MULADD(at[45], at[49]);    MULADD(at[46], at[48]); 
-   COMBA_STORE(C->dp[46]);
-   /* 47 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[95]);    MULADD(at[1], at[94]);    MULADD(at[2], at[93]);    MULADD(at[3], at[92]);    MULADD(at[4], at[91]);    MULADD(at[5], at[90]);    MULADD(at[6], at[89]);    MULADD(at[7], at[88]);    MULADD(at[8], at[87]);    MULADD(at[9], at[86]);    MULADD(at[10], at[85]);    MULADD(at[11], at[84]);    MULADD(at[12], at[83]);    MULADD(at[13], at[82]);    MULADD(at[14], at[81]);    MULADD(at[15], at[80]);    MULADD(at[16], at[79]);    MULADD(at[17], at[78]);    MULADD(at[18], at[77]);    MULADD(at[19], at[76]);    MULADD(at[20], at[75]);    MULADD(at[21], at[74]);    MULADD(at[22], at[73]);    MULADD(at[23], at[72]);    MULADD(at[24], at[71]);    MULADD(at[25], at[70]);    MULADD(at[26], at[69]);    MULADD(at[27], at[68]);    MULADD(at[28], at[67]);    MULADD(at[29], at[66]);    MULADD(at[30], at[65]);    MULADD(at[31], at[64]);    MULADD(at[32], at[63]);    MULADD(at[33], at[62]);    MULADD(at[34], at[61]);    MULADD(at[35], at[60]);    MULADD(at[36], at[59]);    MULADD(at[37], at[58]);    MULADD(at[38], at[57]);    MULADD(at[39], at[56]);    MULADD(at[40], at[55]);    MULADD(at[41], at[54]);    MULADD(at[42], at[53]);    MULADD(at[43], at[52]);    MULADD(at[44], at[51]);    MULADD(at[45], at[50]);    MULADD(at[46], at[49]);    MULADD(at[47], at[48]); 
-   COMBA_STORE(C->dp[47]);
-   /* 48 */
-   COMBA_FORWARD;
-   MULADD(at[1], at[95]);    MULADD(at[2], at[94]);    MULADD(at[3], at[93]);    MULADD(at[4], at[92]);    MULADD(at[5], at[91]);    MULADD(at[6], at[90]);    MULADD(at[7], at[89]);    MULADD(at[8], at[88]);    MULADD(at[9], at[87]);    MULADD(at[10], at[86]);    MULADD(at[11], at[85]);    MULADD(at[12], at[84]);    MULADD(at[13], at[83]);    MULADD(at[14], at[82]);    MULADD(at[15], at[81]);    MULADD(at[16], at[80]);    MULADD(at[17], at[79]);    MULADD(at[18], at[78]);    MULADD(at[19], at[77]);    MULADD(at[20], at[76]);    MULADD(at[21], at[75]);    MULADD(at[22], at[74]);    MULADD(at[23], at[73]);    MULADD(at[24], at[72]);    MULADD(at[25], at[71]);    MULADD(at[26], at[70]);    MULADD(at[27], at[69]);    MULADD(at[28], at[68]);    MULADD(at[29], at[67]);    MULADD(at[30], at[66]);    MULADD(at[31], at[65]);    MULADD(at[32], at[64]);    MULADD(at[33], at[63]);    MULADD(at[34], at[62]);    MULADD(at[35], at[61]);    MULADD(at[36], at[60]);    MULADD(at[37], at[59]);    MULADD(at[38], at[58]);    MULADD(at[39], at[57]);    MULADD(at[40], at[56]);    MULADD(at[41], at[55]);    MULADD(at[42], at[54]);    MULADD(at[43], at[53]);    MULADD(at[44], at[52]);    MULADD(at[45], at[51]);    MULADD(at[46], at[50]);    MULADD(at[47], at[49]); 
-   COMBA_STORE(C->dp[48]);
-   /* 49 */
-   COMBA_FORWARD;
-   MULADD(at[2], at[95]);    MULADD(at[3], at[94]);    MULADD(at[4], at[93]);    MULADD(at[5], at[92]);    MULADD(at[6], at[91]);    MULADD(at[7], at[90]);    MULADD(at[8], at[89]);    MULADD(at[9], at[88]);    MULADD(at[10], at[87]);    MULADD(at[11], at[86]);    MULADD(at[12], at[85]);    MULADD(at[13], at[84]);    MULADD(at[14], at[83]);    MULADD(at[15], at[82]);    MULADD(at[16], at[81]);    MULADD(at[17], at[80]);    MULADD(at[18], at[79]);    MULADD(at[19], at[78]);    MULADD(at[20], at[77]);    MULADD(at[21], at[76]);    MULADD(at[22], at[75]);    MULADD(at[23], at[74]);    MULADD(at[24], at[73]);    MULADD(at[25], at[72]);    MULADD(at[26], at[71]);    MULADD(at[27], at[70]);    MULADD(at[28], at[69]);    MULADD(at[29], at[68]);    MULADD(at[30], at[67]);    MULADD(at[31], at[66]);    MULADD(at[32], at[65]);    MULADD(at[33], at[64]);    MULADD(at[34], at[63]);    MULADD(at[35], at[62]);    MULADD(at[36], at[61]);    MULADD(at[37], at[60]);    MULADD(at[38], at[59]);    MULADD(at[39], at[58]);    MULADD(at[40], at[57]);    MULADD(at[41], at[56]);    MULADD(at[42], at[55]);    MULADD(at[43], at[54]);    MULADD(at[44], at[53]);    MULADD(at[45], at[52]);    MULADD(at[46], at[51]);    MULADD(at[47], at[50]); 
-   COMBA_STORE(C->dp[49]);
-   /* 50 */
-   COMBA_FORWARD;
-   MULADD(at[3], at[95]);    MULADD(at[4], at[94]);    MULADD(at[5], at[93]);    MULADD(at[6], at[92]);    MULADD(at[7], at[91]);    MULADD(at[8], at[90]);    MULADD(at[9], at[89]);    MULADD(at[10], at[88]);    MULADD(at[11], at[87]);    MULADD(at[12], at[86]);    MULADD(at[13], at[85]);    MULADD(at[14], at[84]);    MULADD(at[15], at[83]);    MULADD(at[16], at[82]);    MULADD(at[17], at[81]);    MULADD(at[18], at[80]);    MULADD(at[19], at[79]);    MULADD(at[20], at[78]);    MULADD(at[21], at[77]);    MULADD(at[22], at[76]);    MULADD(at[23], at[75]);    MULADD(at[24], at[74]);    MULADD(at[25], at[73]);    MULADD(at[26], at[72]);    MULADD(at[27], at[71]);    MULADD(at[28], at[70]);    MULADD(at[29], at[69]);    MULADD(at[30], at[68]);    MULADD(at[31], at[67]);    MULADD(at[32], at[66]);    MULADD(at[33], at[65]);    MULADD(at[34], at[64]);    MULADD(at[35], at[63]);    MULADD(at[36], at[62]);    MULADD(at[37], at[61]);    MULADD(at[38], at[60]);    MULADD(at[39], at[59]);    MULADD(at[40], at[58]);    MULADD(at[41], at[57]);    MULADD(at[42], at[56]);    MULADD(at[43], at[55]);    MULADD(at[44], at[54]);    MULADD(at[45], at[53]);    MULADD(at[46], at[52]);    MULADD(at[47], at[51]); 
-   COMBA_STORE(C->dp[50]);
-   /* 51 */
-   COMBA_FORWARD;
-   MULADD(at[4], at[95]);    MULADD(at[5], at[94]);    MULADD(at[6], at[93]);    MULADD(at[7], at[92]);    MULADD(at[8], at[91]);    MULADD(at[9], at[90]);    MULADD(at[10], at[89]);    MULADD(at[11], at[88]);    MULADD(at[12], at[87]);    MULADD(at[13], at[86]);    MULADD(at[14], at[85]);    MULADD(at[15], at[84]);    MULADD(at[16], at[83]);    MULADD(at[17], at[82]);    MULADD(at[18], at[81]);    MULADD(at[19], at[80]);    MULADD(at[20], at[79]);    MULADD(at[21], at[78]);    MULADD(at[22], at[77]);    MULADD(at[23], at[76]);    MULADD(at[24], at[75]);    MULADD(at[25], at[74]);    MULADD(at[26], at[73]);    MULADD(at[27], at[72]);    MULADD(at[28], at[71]);    MULADD(at[29], at[70]);    MULADD(at[30], at[69]);    MULADD(at[31], at[68]);    MULADD(at[32], at[67]);    MULADD(at[33], at[66]);    MULADD(at[34], at[65]);    MULADD(at[35], at[64]);    MULADD(at[36], at[63]);    MULADD(at[37], at[62]);    MULADD(at[38], at[61]);    MULADD(at[39], at[60]);    MULADD(at[40], at[59]);    MULADD(at[41], at[58]);    MULADD(at[42], at[57]);    MULADD(at[43], at[56]);    MULADD(at[44], at[55]);    MULADD(at[45], at[54]);    MULADD(at[46], at[53]);    MULADD(at[47], at[52]); 
-   COMBA_STORE(C->dp[51]);
-   /* 52 */
-   COMBA_FORWARD;
-   MULADD(at[5], at[95]);    MULADD(at[6], at[94]);    MULADD(at[7], at[93]);    MULADD(at[8], at[92]);    MULADD(at[9], at[91]);    MULADD(at[10], at[90]);    MULADD(at[11], at[89]);    MULADD(at[12], at[88]);    MULADD(at[13], at[87]);    MULADD(at[14], at[86]);    MULADD(at[15], at[85]);    MULADD(at[16], at[84]);    MULADD(at[17], at[83]);    MULADD(at[18], at[82]);    MULADD(at[19], at[81]);    MULADD(at[20], at[80]);    MULADD(at[21], at[79]);    MULADD(at[22], at[78]);    MULADD(at[23], at[77]);    MULADD(at[24], at[76]);    MULADD(at[25], at[75]);    MULADD(at[26], at[74]);    MULADD(at[27], at[73]);    MULADD(at[28], at[72]);    MULADD(at[29], at[71]);    MULADD(at[30], at[70]);    MULADD(at[31], at[69]);    MULADD(at[32], at[68]);    MULADD(at[33], at[67]);    MULADD(at[34], at[66]);    MULADD(at[35], at[65]);    MULADD(at[36], at[64]);    MULADD(at[37], at[63]);    MULADD(at[38], at[62]);    MULADD(at[39], at[61]);    MULADD(at[40], at[60]);    MULADD(at[41], at[59]);    MULADD(at[42], at[58]);    MULADD(at[43], at[57]);    MULADD(at[44], at[56]);    MULADD(at[45], at[55]);    MULADD(at[46], at[54]);    MULADD(at[47], at[53]); 
-   COMBA_STORE(C->dp[52]);
-   /* 53 */
-   COMBA_FORWARD;
-   MULADD(at[6], at[95]);    MULADD(at[7], at[94]);    MULADD(at[8], at[93]);    MULADD(at[9], at[92]);    MULADD(at[10], at[91]);    MULADD(at[11], at[90]);    MULADD(at[12], at[89]);    MULADD(at[13], at[88]);    MULADD(at[14], at[87]);    MULADD(at[15], at[86]);    MULADD(at[16], at[85]);    MULADD(at[17], at[84]);    MULADD(at[18], at[83]);    MULADD(at[19], at[82]);    MULADD(at[20], at[81]);    MULADD(at[21], at[80]);    MULADD(at[22], at[79]);    MULADD(at[23], at[78]);    MULADD(at[24], at[77]);    MULADD(at[25], at[76]);    MULADD(at[26], at[75]);    MULADD(at[27], at[74]);    MULADD(at[28], at[73]);    MULADD(at[29], at[72]);    MULADD(at[30], at[71]);    MULADD(at[31], at[70]);    MULADD(at[32], at[69]);    MULADD(at[33], at[68]);    MULADD(at[34], at[67]);    MULADD(at[35], at[66]);    MULADD(at[36], at[65]);    MULADD(at[37], at[64]);    MULADD(at[38], at[63]);    MULADD(at[39], at[62]);    MULADD(at[40], at[61]);    MULADD(at[41], at[60]);    MULADD(at[42], at[59]);    MULADD(at[43], at[58]);    MULADD(at[44], at[57]);    MULADD(at[45], at[56]);    MULADD(at[46], at[55]);    MULADD(at[47], at[54]); 
-   COMBA_STORE(C->dp[53]);
-   /* 54 */
-   COMBA_FORWARD;
-   MULADD(at[7], at[95]);    MULADD(at[8], at[94]);    MULADD(at[9], at[93]);    MULADD(at[10], at[92]);    MULADD(at[11], at[91]);    MULADD(at[12], at[90]);    MULADD(at[13], at[89]);    MULADD(at[14], at[88]);    MULADD(at[15], at[87]);    MULADD(at[16], at[86]);    MULADD(at[17], at[85]);    MULADD(at[18], at[84]);    MULADD(at[19], at[83]);    MULADD(at[20], at[82]);    MULADD(at[21], at[81]);    MULADD(at[22], at[80]);    MULADD(at[23], at[79]);    MULADD(at[24], at[78]);    MULADD(at[25], at[77]);    MULADD(at[26], at[76]);    MULADD(at[27], at[75]);    MULADD(at[28], at[74]);    MULADD(at[29], at[73]);    MULADD(at[30], at[72]);    MULADD(at[31], at[71]);    MULADD(at[32], at[70]);    MULADD(at[33], at[69]);    MULADD(at[34], at[68]);    MULADD(at[35], at[67]);    MULADD(at[36], at[66]);    MULADD(at[37], at[65]);    MULADD(at[38], at[64]);    MULADD(at[39], at[63]);    MULADD(at[40], at[62]);    MULADD(at[41], at[61]);    MULADD(at[42], at[60]);    MULADD(at[43], at[59]);    MULADD(at[44], at[58]);    MULADD(at[45], at[57]);    MULADD(at[46], at[56]);    MULADD(at[47], at[55]); 
-   COMBA_STORE(C->dp[54]);
-   /* 55 */
-   COMBA_FORWARD;
-   MULADD(at[8], at[95]);    MULADD(at[9], at[94]);    MULADD(at[10], at[93]);    MULADD(at[11], at[92]);    MULADD(at[12], at[91]);    MULADD(at[13], at[90]);    MULADD(at[14], at[89]);    MULADD(at[15], at[88]);    MULADD(at[16], at[87]);    MULADD(at[17], at[86]);    MULADD(at[18], at[85]);    MULADD(at[19], at[84]);    MULADD(at[20], at[83]);    MULADD(at[21], at[82]);    MULADD(at[22], at[81]);    MULADD(at[23], at[80]);    MULADD(at[24], at[79]);    MULADD(at[25], at[78]);    MULADD(at[26], at[77]);    MULADD(at[27], at[76]);    MULADD(at[28], at[75]);    MULADD(at[29], at[74]);    MULADD(at[30], at[73]);    MULADD(at[31], at[72]);    MULADD(at[32], at[71]);    MULADD(at[33], at[70]);    MULADD(at[34], at[69]);    MULADD(at[35], at[68]);    MULADD(at[36], at[67]);    MULADD(at[37], at[66]);    MULADD(at[38], at[65]);    MULADD(at[39], at[64]);    MULADD(at[40], at[63]);    MULADD(at[41], at[62]);    MULADD(at[42], at[61]);    MULADD(at[43], at[60]);    MULADD(at[44], at[59]);    MULADD(at[45], at[58]);    MULADD(at[46], at[57]);    MULADD(at[47], at[56]); 
-   COMBA_STORE(C->dp[55]);
-   /* 56 */
-   COMBA_FORWARD;
-   MULADD(at[9], at[95]);    MULADD(at[10], at[94]);    MULADD(at[11], at[93]);    MULADD(at[12], at[92]);    MULADD(at[13], at[91]);    MULADD(at[14], at[90]);    MULADD(at[15], at[89]);    MULADD(at[16], at[88]);    MULADD(at[17], at[87]);    MULADD(at[18], at[86]);    MULADD(at[19], at[85]);    MULADD(at[20], at[84]);    MULADD(at[21], at[83]);    MULADD(at[22], at[82]);    MULADD(at[23], at[81]);    MULADD(at[24], at[80]);    MULADD(at[25], at[79]);    MULADD(at[26], at[78]);    MULADD(at[27], at[77]);    MULADD(at[28], at[76]);    MULADD(at[29], at[75]);    MULADD(at[30], at[74]);    MULADD(at[31], at[73]);    MULADD(at[32], at[72]);    MULADD(at[33], at[71]);    MULADD(at[34], at[70]);    MULADD(at[35], at[69]);    MULADD(at[36], at[68]);    MULADD(at[37], at[67]);    MULADD(at[38], at[66]);    MULADD(at[39], at[65]);    MULADD(at[40], at[64]);    MULADD(at[41], at[63]);    MULADD(at[42], at[62]);    MULADD(at[43], at[61]);    MULADD(at[44], at[60]);    MULADD(at[45], at[59]);    MULADD(at[46], at[58]);    MULADD(at[47], at[57]); 
-   COMBA_STORE(C->dp[56]);
-   /* 57 */
-   COMBA_FORWARD;
-   MULADD(at[10], at[95]);    MULADD(at[11], at[94]);    MULADD(at[12], at[93]);    MULADD(at[13], at[92]);    MULADD(at[14], at[91]);    MULADD(at[15], at[90]);    MULADD(at[16], at[89]);    MULADD(at[17], at[88]);    MULADD(at[18], at[87]);    MULADD(at[19], at[86]);    MULADD(at[20], at[85]);    MULADD(at[21], at[84]);    MULADD(at[22], at[83]);    MULADD(at[23], at[82]);    MULADD(at[24], at[81]);    MULADD(at[25], at[80]);    MULADD(at[26], at[79]);    MULADD(at[27], at[78]);    MULADD(at[28], at[77]);    MULADD(at[29], at[76]);    MULADD(at[30], at[75]);    MULADD(at[31], at[74]);    MULADD(at[32], at[73]);    MULADD(at[33], at[72]);    MULADD(at[34], at[71]);    MULADD(at[35], at[70]);    MULADD(at[36], at[69]);    MULADD(at[37], at[68]);    MULADD(at[38], at[67]);    MULADD(at[39], at[66]);    MULADD(at[40], at[65]);    MULADD(at[41], at[64]);    MULADD(at[42], at[63]);    MULADD(at[43], at[62]);    MULADD(at[44], at[61]);    MULADD(at[45], at[60]);    MULADD(at[46], at[59]);    MULADD(at[47], at[58]); 
-   COMBA_STORE(C->dp[57]);
-   /* 58 */
-   COMBA_FORWARD;
-   MULADD(at[11], at[95]);    MULADD(at[12], at[94]);    MULADD(at[13], at[93]);    MULADD(at[14], at[92]);    MULADD(at[15], at[91]);    MULADD(at[16], at[90]);    MULADD(at[17], at[89]);    MULADD(at[18], at[88]);    MULADD(at[19], at[87]);    MULADD(at[20], at[86]);    MULADD(at[21], at[85]);    MULADD(at[22], at[84]);    MULADD(at[23], at[83]);    MULADD(at[24], at[82]);    MULADD(at[25], at[81]);    MULADD(at[26], at[80]);    MULADD(at[27], at[79]);    MULADD(at[28], at[78]);    MULADD(at[29], at[77]);    MULADD(at[30], at[76]);    MULADD(at[31], at[75]);    MULADD(at[32], at[74]);    MULADD(at[33], at[73]);    MULADD(at[34], at[72]);    MULADD(at[35], at[71]);    MULADD(at[36], at[70]);    MULADD(at[37], at[69]);    MULADD(at[38], at[68]);    MULADD(at[39], at[67]);    MULADD(at[40], at[66]);    MULADD(at[41], at[65]);    MULADD(at[42], at[64]);    MULADD(at[43], at[63]);    MULADD(at[44], at[62]);    MULADD(at[45], at[61]);    MULADD(at[46], at[60]);    MULADD(at[47], at[59]); 
-   COMBA_STORE(C->dp[58]);
-   /* 59 */
-   COMBA_FORWARD;
-   MULADD(at[12], at[95]);    MULADD(at[13], at[94]);    MULADD(at[14], at[93]);    MULADD(at[15], at[92]);    MULADD(at[16], at[91]);    MULADD(at[17], at[90]);    MULADD(at[18], at[89]);    MULADD(at[19], at[88]);    MULADD(at[20], at[87]);    MULADD(at[21], at[86]);    MULADD(at[22], at[85]);    MULADD(at[23], at[84]);    MULADD(at[24], at[83]);    MULADD(at[25], at[82]);    MULADD(at[26], at[81]);    MULADD(at[27], at[80]);    MULADD(at[28], at[79]);    MULADD(at[29], at[78]);    MULADD(at[30], at[77]);    MULADD(at[31], at[76]);    MULADD(at[32], at[75]);    MULADD(at[33], at[74]);    MULADD(at[34], at[73]);    MULADD(at[35], at[72]);    MULADD(at[36], at[71]);    MULADD(at[37], at[70]);    MULADD(at[38], at[69]);    MULADD(at[39], at[68]);    MULADD(at[40], at[67]);    MULADD(at[41], at[66]);    MULADD(at[42], at[65]);    MULADD(at[43], at[64]);    MULADD(at[44], at[63]);    MULADD(at[45], at[62]);    MULADD(at[46], at[61]);    MULADD(at[47], at[60]); 
-   COMBA_STORE(C->dp[59]);
-   /* 60 */
-   COMBA_FORWARD;
-   MULADD(at[13], at[95]);    MULADD(at[14], at[94]);    MULADD(at[15], at[93]);    MULADD(at[16], at[92]);    MULADD(at[17], at[91]);    MULADD(at[18], at[90]);    MULADD(at[19], at[89]);    MULADD(at[20], at[88]);    MULADD(at[21], at[87]);    MULADD(at[22], at[86]);    MULADD(at[23], at[85]);    MULADD(at[24], at[84]);    MULADD(at[25], at[83]);    MULADD(at[26], at[82]);    MULADD(at[27], at[81]);    MULADD(at[28], at[80]);    MULADD(at[29], at[79]);    MULADD(at[30], at[78]);    MULADD(at[31], at[77]);    MULADD(at[32], at[76]);    MULADD(at[33], at[75]);    MULADD(at[34], at[74]);    MULADD(at[35], at[73]);    MULADD(at[36], at[72]);    MULADD(at[37], at[71]);    MULADD(at[38], at[70]);    MULADD(at[39], at[69]);    MULADD(at[40], at[68]);    MULADD(at[41], at[67]);    MULADD(at[42], at[66]);    MULADD(at[43], at[65]);    MULADD(at[44], at[64]);    MULADD(at[45], at[63]);    MULADD(at[46], at[62]);    MULADD(at[47], at[61]); 
-   COMBA_STORE(C->dp[60]);
-   /* 61 */
-   COMBA_FORWARD;
-   MULADD(at[14], at[95]);    MULADD(at[15], at[94]);    MULADD(at[16], at[93]);    MULADD(at[17], at[92]);    MULADD(at[18], at[91]);    MULADD(at[19], at[90]);    MULADD(at[20], at[89]);    MULADD(at[21], at[88]);    MULADD(at[22], at[87]);    MULADD(at[23], at[86]);    MULADD(at[24], at[85]);    MULADD(at[25], at[84]);    MULADD(at[26], at[83]);    MULADD(at[27], at[82]);    MULADD(at[28], at[81]);    MULADD(at[29], at[80]);    MULADD(at[30], at[79]);    MULADD(at[31], at[78]);    MULADD(at[32], at[77]);    MULADD(at[33], at[76]);    MULADD(at[34], at[75]);    MULADD(at[35], at[74]);    MULADD(at[36], at[73]);    MULADD(at[37], at[72]);    MULADD(at[38], at[71]);    MULADD(at[39], at[70]);    MULADD(at[40], at[69]);    MULADD(at[41], at[68]);    MULADD(at[42], at[67]);    MULADD(at[43], at[66]);    MULADD(at[44], at[65]);    MULADD(at[45], at[64]);    MULADD(at[46], at[63]);    MULADD(at[47], at[62]); 
-   COMBA_STORE(C->dp[61]);
-   /* 62 */
-   COMBA_FORWARD;
-   MULADD(at[15], at[95]);    MULADD(at[16], at[94]);    MULADD(at[17], at[93]);    MULADD(at[18], at[92]);    MULADD(at[19], at[91]);    MULADD(at[20], at[90]);    MULADD(at[21], at[89]);    MULADD(at[22], at[88]);    MULADD(at[23], at[87]);    MULADD(at[24], at[86]);    MULADD(at[25], at[85]);    MULADD(at[26], at[84]);    MULADD(at[27], at[83]);    MULADD(at[28], at[82]);    MULADD(at[29], at[81]);    MULADD(at[30], at[80]);    MULADD(at[31], at[79]);    MULADD(at[32], at[78]);    MULADD(at[33], at[77]);    MULADD(at[34], at[76]);    MULADD(at[35], at[75]);    MULADD(at[36], at[74]);    MULADD(at[37], at[73]);    MULADD(at[38], at[72]);    MULADD(at[39], at[71]);    MULADD(at[40], at[70]);    MULADD(at[41], at[69]);    MULADD(at[42], at[68]);    MULADD(at[43], at[67]);    MULADD(at[44], at[66]);    MULADD(at[45], at[65]);    MULADD(at[46], at[64]);    MULADD(at[47], at[63]); 
-   COMBA_STORE(C->dp[62]);
-   /* 63 */
-   COMBA_FORWARD;
-   MULADD(at[16], at[95]);    MULADD(at[17], at[94]);    MULADD(at[18], at[93]);    MULADD(at[19], at[92]);    MULADD(at[20], at[91]);    MULADD(at[21], at[90]);    MULADD(at[22], at[89]);    MULADD(at[23], at[88]);    MULADD(at[24], at[87]);    MULADD(at[25], at[86]);    MULADD(at[26], at[85]);    MULADD(at[27], at[84]);    MULADD(at[28], at[83]);    MULADD(at[29], at[82]);    MULADD(at[30], at[81]);    MULADD(at[31], at[80]);    MULADD(at[32], at[79]);    MULADD(at[33], at[78]);    MULADD(at[34], at[77]);    MULADD(at[35], at[76]);    MULADD(at[36], at[75]);    MULADD(at[37], at[74]);    MULADD(at[38], at[73]);    MULADD(at[39], at[72]);    MULADD(at[40], at[71]);    MULADD(at[41], at[70]);    MULADD(at[42], at[69]);    MULADD(at[43], at[68]);    MULADD(at[44], at[67]);    MULADD(at[45], at[66]);    MULADD(at[46], at[65]);    MULADD(at[47], at[64]); 
-   COMBA_STORE(C->dp[63]);
-   /* 64 */
-   COMBA_FORWARD;
-   MULADD(at[17], at[95]);    MULADD(at[18], at[94]);    MULADD(at[19], at[93]);    MULADD(at[20], at[92]);    MULADD(at[21], at[91]);    MULADD(at[22], at[90]);    MULADD(at[23], at[89]);    MULADD(at[24], at[88]);    MULADD(at[25], at[87]);    MULADD(at[26], at[86]);    MULADD(at[27], at[85]);    MULADD(at[28], at[84]);    MULADD(at[29], at[83]);    MULADD(at[30], at[82]);    MULADD(at[31], at[81]);    MULADD(at[32], at[80]);    MULADD(at[33], at[79]);    MULADD(at[34], at[78]);    MULADD(at[35], at[77]);    MULADD(at[36], at[76]);    MULADD(at[37], at[75]);    MULADD(at[38], at[74]);    MULADD(at[39], at[73]);    MULADD(at[40], at[72]);    MULADD(at[41], at[71]);    MULADD(at[42], at[70]);    MULADD(at[43], at[69]);    MULADD(at[44], at[68]);    MULADD(at[45], at[67]);    MULADD(at[46], at[66]);    MULADD(at[47], at[65]); 
-   COMBA_STORE(C->dp[64]);
-   /* 65 */
-   COMBA_FORWARD;
-   MULADD(at[18], at[95]);    MULADD(at[19], at[94]);    MULADD(at[20], at[93]);    MULADD(at[21], at[92]);    MULADD(at[22], at[91]);    MULADD(at[23], at[90]);    MULADD(at[24], at[89]);    MULADD(at[25], at[88]);    MULADD(at[26], at[87]);    MULADD(at[27], at[86]);    MULADD(at[28], at[85]);    MULADD(at[29], at[84]);    MULADD(at[30], at[83]);    MULADD(at[31], at[82]);    MULADD(at[32], at[81]);    MULADD(at[33], at[80]);    MULADD(at[34], at[79]);    MULADD(at[35], at[78]);    MULADD(at[36], at[77]);    MULADD(at[37], at[76]);    MULADD(at[38], at[75]);    MULADD(at[39], at[74]);    MULADD(at[40], at[73]);    MULADD(at[41], at[72]);    MULADD(at[42], at[71]);    MULADD(at[43], at[70]);    MULADD(at[44], at[69]);    MULADD(at[45], at[68]);    MULADD(at[46], at[67]);    MULADD(at[47], at[66]); 
-   COMBA_STORE(C->dp[65]);
-   /* 66 */
-   COMBA_FORWARD;
-   MULADD(at[19], at[95]);    MULADD(at[20], at[94]);    MULADD(at[21], at[93]);    MULADD(at[22], at[92]);    MULADD(at[23], at[91]);    MULADD(at[24], at[90]);    MULADD(at[25], at[89]);    MULADD(at[26], at[88]);    MULADD(at[27], at[87]);    MULADD(at[28], at[86]);    MULADD(at[29], at[85]);    MULADD(at[30], at[84]);    MULADD(at[31], at[83]);    MULADD(at[32], at[82]);    MULADD(at[33], at[81]);    MULADD(at[34], at[80]);    MULADD(at[35], at[79]);    MULADD(at[36], at[78]);    MULADD(at[37], at[77]);    MULADD(at[38], at[76]);    MULADD(at[39], at[75]);    MULADD(at[40], at[74]);    MULADD(at[41], at[73]);    MULADD(at[42], at[72]);    MULADD(at[43], at[71]);    MULADD(at[44], at[70]);    MULADD(at[45], at[69]);    MULADD(at[46], at[68]);    MULADD(at[47], at[67]); 
-   COMBA_STORE(C->dp[66]);
-   /* 67 */
-   COMBA_FORWARD;
-   MULADD(at[20], at[95]);    MULADD(at[21], at[94]);    MULADD(at[22], at[93]);    MULADD(at[23], at[92]);    MULADD(at[24], at[91]);    MULADD(at[25], at[90]);    MULADD(at[26], at[89]);    MULADD(at[27], at[88]);    MULADD(at[28], at[87]);    MULADD(at[29], at[86]);    MULADD(at[30], at[85]);    MULADD(at[31], at[84]);    MULADD(at[32], at[83]);    MULADD(at[33], at[82]);    MULADD(at[34], at[81]);    MULADD(at[35], at[80]);    MULADD(at[36], at[79]);    MULADD(at[37], at[78]);    MULADD(at[38], at[77]);    MULADD(at[39], at[76]);    MULADD(at[40], at[75]);    MULADD(at[41], at[74]);    MULADD(at[42], at[73]);    MULADD(at[43], at[72]);    MULADD(at[44], at[71]);    MULADD(at[45], at[70]);    MULADD(at[46], at[69]);    MULADD(at[47], at[68]); 
-   COMBA_STORE(C->dp[67]);
-   /* 68 */
-   COMBA_FORWARD;
-   MULADD(at[21], at[95]);    MULADD(at[22], at[94]);    MULADD(at[23], at[93]);    MULADD(at[24], at[92]);    MULADD(at[25], at[91]);    MULADD(at[26], at[90]);    MULADD(at[27], at[89]);    MULADD(at[28], at[88]);    MULADD(at[29], at[87]);    MULADD(at[30], at[86]);    MULADD(at[31], at[85]);    MULADD(at[32], at[84]);    MULADD(at[33], at[83]);    MULADD(at[34], at[82]);    MULADD(at[35], at[81]);    MULADD(at[36], at[80]);    MULADD(at[37], at[79]);    MULADD(at[38], at[78]);    MULADD(at[39], at[77]);    MULADD(at[40], at[76]);    MULADD(at[41], at[75]);    MULADD(at[42], at[74]);    MULADD(at[43], at[73]);    MULADD(at[44], at[72]);    MULADD(at[45], at[71]);    MULADD(at[46], at[70]);    MULADD(at[47], at[69]); 
-   COMBA_STORE(C->dp[68]);
-   /* 69 */
-   COMBA_FORWARD;
-   MULADD(at[22], at[95]);    MULADD(at[23], at[94]);    MULADD(at[24], at[93]);    MULADD(at[25], at[92]);    MULADD(at[26], at[91]);    MULADD(at[27], at[90]);    MULADD(at[28], at[89]);    MULADD(at[29], at[88]);    MULADD(at[30], at[87]);    MULADD(at[31], at[86]);    MULADD(at[32], at[85]);    MULADD(at[33], at[84]);    MULADD(at[34], at[83]);    MULADD(at[35], at[82]);    MULADD(at[36], at[81]);    MULADD(at[37], at[80]);    MULADD(at[38], at[79]);    MULADD(at[39], at[78]);    MULADD(at[40], at[77]);    MULADD(at[41], at[76]);    MULADD(at[42], at[75]);    MULADD(at[43], at[74]);    MULADD(at[44], at[73]);    MULADD(at[45], at[72]);    MULADD(at[46], at[71]);    MULADD(at[47], at[70]); 
-   COMBA_STORE(C->dp[69]);
-   /* 70 */
-   COMBA_FORWARD;
-   MULADD(at[23], at[95]);    MULADD(at[24], at[94]);    MULADD(at[25], at[93]);    MULADD(at[26], at[92]);    MULADD(at[27], at[91]);    MULADD(at[28], at[90]);    MULADD(at[29], at[89]);    MULADD(at[30], at[88]);    MULADD(at[31], at[87]);    MULADD(at[32], at[86]);    MULADD(at[33], at[85]);    MULADD(at[34], at[84]);    MULADD(at[35], at[83]);    MULADD(at[36], at[82]);    MULADD(at[37], at[81]);    MULADD(at[38], at[80]);    MULADD(at[39], at[79]);    MULADD(at[40], at[78]);    MULADD(at[41], at[77]);    MULADD(at[42], at[76]);    MULADD(at[43], at[75]);    MULADD(at[44], at[74]);    MULADD(at[45], at[73]);    MULADD(at[46], at[72]);    MULADD(at[47], at[71]); 
-   COMBA_STORE(C->dp[70]);
-   /* 71 */
-   COMBA_FORWARD;
-   MULADD(at[24], at[95]);    MULADD(at[25], at[94]);    MULADD(at[26], at[93]);    MULADD(at[27], at[92]);    MULADD(at[28], at[91]);    MULADD(at[29], at[90]);    MULADD(at[30], at[89]);    MULADD(at[31], at[88]);    MULADD(at[32], at[87]);    MULADD(at[33], at[86]);    MULADD(at[34], at[85]);    MULADD(at[35], at[84]);    MULADD(at[36], at[83]);    MULADD(at[37], at[82]);    MULADD(at[38], at[81]);    MULADD(at[39], at[80]);    MULADD(at[40], at[79]);    MULADD(at[41], at[78]);    MULADD(at[42], at[77]);    MULADD(at[43], at[76]);    MULADD(at[44], at[75]);    MULADD(at[45], at[74]);    MULADD(at[46], at[73]);    MULADD(at[47], at[72]); 
-   COMBA_STORE(C->dp[71]);
-   /* 72 */
-   COMBA_FORWARD;
-   MULADD(at[25], at[95]);    MULADD(at[26], at[94]);    MULADD(at[27], at[93]);    MULADD(at[28], at[92]);    MULADD(at[29], at[91]);    MULADD(at[30], at[90]);    MULADD(at[31], at[89]);    MULADD(at[32], at[88]);    MULADD(at[33], at[87]);    MULADD(at[34], at[86]);    MULADD(at[35], at[85]);    MULADD(at[36], at[84]);    MULADD(at[37], at[83]);    MULADD(at[38], at[82]);    MULADD(at[39], at[81]);    MULADD(at[40], at[80]);    MULADD(at[41], at[79]);    MULADD(at[42], at[78]);    MULADD(at[43], at[77]);    MULADD(at[44], at[76]);    MULADD(at[45], at[75]);    MULADD(at[46], at[74]);    MULADD(at[47], at[73]); 
-   COMBA_STORE(C->dp[72]);
-   /* 73 */
-   COMBA_FORWARD;
-   MULADD(at[26], at[95]);    MULADD(at[27], at[94]);    MULADD(at[28], at[93]);    MULADD(at[29], at[92]);    MULADD(at[30], at[91]);    MULADD(at[31], at[90]);    MULADD(at[32], at[89]);    MULADD(at[33], at[88]);    MULADD(at[34], at[87]);    MULADD(at[35], at[86]);    MULADD(at[36], at[85]);    MULADD(at[37], at[84]);    MULADD(at[38], at[83]);    MULADD(at[39], at[82]);    MULADD(at[40], at[81]);    MULADD(at[41], at[80]);    MULADD(at[42], at[79]);    MULADD(at[43], at[78]);    MULADD(at[44], at[77]);    MULADD(at[45], at[76]);    MULADD(at[46], at[75]);    MULADD(at[47], at[74]); 
-   COMBA_STORE(C->dp[73]);
-   /* 74 */
-   COMBA_FORWARD;
-   MULADD(at[27], at[95]);    MULADD(at[28], at[94]);    MULADD(at[29], at[93]);    MULADD(at[30], at[92]);    MULADD(at[31], at[91]);    MULADD(at[32], at[90]);    MULADD(at[33], at[89]);    MULADD(at[34], at[88]);    MULADD(at[35], at[87]);    MULADD(at[36], at[86]);    MULADD(at[37], at[85]);    MULADD(at[38], at[84]);    MULADD(at[39], at[83]);    MULADD(at[40], at[82]);    MULADD(at[41], at[81]);    MULADD(at[42], at[80]);    MULADD(at[43], at[79]);    MULADD(at[44], at[78]);    MULADD(at[45], at[77]);    MULADD(at[46], at[76]);    MULADD(at[47], at[75]); 
-   COMBA_STORE(C->dp[74]);
-   /* 75 */
-   COMBA_FORWARD;
-   MULADD(at[28], at[95]);    MULADD(at[29], at[94]);    MULADD(at[30], at[93]);    MULADD(at[31], at[92]);    MULADD(at[32], at[91]);    MULADD(at[33], at[90]);    MULADD(at[34], at[89]);    MULADD(at[35], at[88]);    MULADD(at[36], at[87]);    MULADD(at[37], at[86]);    MULADD(at[38], at[85]);    MULADD(at[39], at[84]);    MULADD(at[40], at[83]);    MULADD(at[41], at[82]);    MULADD(at[42], at[81]);    MULADD(at[43], at[80]);    MULADD(at[44], at[79]);    MULADD(at[45], at[78]);    MULADD(at[46], at[77]);    MULADD(at[47], at[76]); 
-   COMBA_STORE(C->dp[75]);
-   /* 76 */
-   COMBA_FORWARD;
-   MULADD(at[29], at[95]);    MULADD(at[30], at[94]);    MULADD(at[31], at[93]);    MULADD(at[32], at[92]);    MULADD(at[33], at[91]);    MULADD(at[34], at[90]);    MULADD(at[35], at[89]);    MULADD(at[36], at[88]);    MULADD(at[37], at[87]);    MULADD(at[38], at[86]);    MULADD(at[39], at[85]);    MULADD(at[40], at[84]);    MULADD(at[41], at[83]);    MULADD(at[42], at[82]);    MULADD(at[43], at[81]);    MULADD(at[44], at[80]);    MULADD(at[45], at[79]);    MULADD(at[46], at[78]);    MULADD(at[47], at[77]); 
-   COMBA_STORE(C->dp[76]);
-   /* 77 */
-   COMBA_FORWARD;
-   MULADD(at[30], at[95]);    MULADD(at[31], at[94]);    MULADD(at[32], at[93]);    MULADD(at[33], at[92]);    MULADD(at[34], at[91]);    MULADD(at[35], at[90]);    MULADD(at[36], at[89]);    MULADD(at[37], at[88]);    MULADD(at[38], at[87]);    MULADD(at[39], at[86]);    MULADD(at[40], at[85]);    MULADD(at[41], at[84]);    MULADD(at[42], at[83]);    MULADD(at[43], at[82]);    MULADD(at[44], at[81]);    MULADD(at[45], at[80]);    MULADD(at[46], at[79]);    MULADD(at[47], at[78]); 
-   COMBA_STORE(C->dp[77]);
-   /* 78 */
-   COMBA_FORWARD;
-   MULADD(at[31], at[95]);    MULADD(at[32], at[94]);    MULADD(at[33], at[93]);    MULADD(at[34], at[92]);    MULADD(at[35], at[91]);    MULADD(at[36], at[90]);    MULADD(at[37], at[89]);    MULADD(at[38], at[88]);    MULADD(at[39], at[87]);    MULADD(at[40], at[86]);    MULADD(at[41], at[85]);    MULADD(at[42], at[84]);    MULADD(at[43], at[83]);    MULADD(at[44], at[82]);    MULADD(at[45], at[81]);    MULADD(at[46], at[80]);    MULADD(at[47], at[79]); 
-   COMBA_STORE(C->dp[78]);
-   /* 79 */
-   COMBA_FORWARD;
-   MULADD(at[32], at[95]);    MULADD(at[33], at[94]);    MULADD(at[34], at[93]);    MULADD(at[35], at[92]);    MULADD(at[36], at[91]);    MULADD(at[37], at[90]);    MULADD(at[38], at[89]);    MULADD(at[39], at[88]);    MULADD(at[40], at[87]);    MULADD(at[41], at[86]);    MULADD(at[42], at[85]);    MULADD(at[43], at[84]);    MULADD(at[44], at[83]);    MULADD(at[45], at[82]);    MULADD(at[46], at[81]);    MULADD(at[47], at[80]); 
-   COMBA_STORE(C->dp[79]);
-   /* 80 */
-   COMBA_FORWARD;
-   MULADD(at[33], at[95]);    MULADD(at[34], at[94]);    MULADD(at[35], at[93]);    MULADD(at[36], at[92]);    MULADD(at[37], at[91]);    MULADD(at[38], at[90]);    MULADD(at[39], at[89]);    MULADD(at[40], at[88]);    MULADD(at[41], at[87]);    MULADD(at[42], at[86]);    MULADD(at[43], at[85]);    MULADD(at[44], at[84]);    MULADD(at[45], at[83]);    MULADD(at[46], at[82]);    MULADD(at[47], at[81]); 
-   COMBA_STORE(C->dp[80]);
-   /* 81 */
-   COMBA_FORWARD;
-   MULADD(at[34], at[95]);    MULADD(at[35], at[94]);    MULADD(at[36], at[93]);    MULADD(at[37], at[92]);    MULADD(at[38], at[91]);    MULADD(at[39], at[90]);    MULADD(at[40], at[89]);    MULADD(at[41], at[88]);    MULADD(at[42], at[87]);    MULADD(at[43], at[86]);    MULADD(at[44], at[85]);    MULADD(at[45], at[84]);    MULADD(at[46], at[83]);    MULADD(at[47], at[82]); 
-   COMBA_STORE(C->dp[81]);
-   /* 82 */
-   COMBA_FORWARD;
-   MULADD(at[35], at[95]);    MULADD(at[36], at[94]);    MULADD(at[37], at[93]);    MULADD(at[38], at[92]);    MULADD(at[39], at[91]);    MULADD(at[40], at[90]);    MULADD(at[41], at[89]);    MULADD(at[42], at[88]);    MULADD(at[43], at[87]);    MULADD(at[44], at[86]);    MULADD(at[45], at[85]);    MULADD(at[46], at[84]);    MULADD(at[47], at[83]); 
-   COMBA_STORE(C->dp[82]);
-   /* 83 */
-   COMBA_FORWARD;
-   MULADD(at[36], at[95]);    MULADD(at[37], at[94]);    MULADD(at[38], at[93]);    MULADD(at[39], at[92]);    MULADD(at[40], at[91]);    MULADD(at[41], at[90]);    MULADD(at[42], at[89]);    MULADD(at[43], at[88]);    MULADD(at[44], at[87]);    MULADD(at[45], at[86]);    MULADD(at[46], at[85]);    MULADD(at[47], at[84]); 
-   COMBA_STORE(C->dp[83]);
-   /* 84 */
-   COMBA_FORWARD;
-   MULADD(at[37], at[95]);    MULADD(at[38], at[94]);    MULADD(at[39], at[93]);    MULADD(at[40], at[92]);    MULADD(at[41], at[91]);    MULADD(at[42], at[90]);    MULADD(at[43], at[89]);    MULADD(at[44], at[88]);    MULADD(at[45], at[87]);    MULADD(at[46], at[86]);    MULADD(at[47], at[85]); 
-   COMBA_STORE(C->dp[84]);
-   /* 85 */
-   COMBA_FORWARD;
-   MULADD(at[38], at[95]);    MULADD(at[39], at[94]);    MULADD(at[40], at[93]);    MULADD(at[41], at[92]);    MULADD(at[42], at[91]);    MULADD(at[43], at[90]);    MULADD(at[44], at[89]);    MULADD(at[45], at[88]);    MULADD(at[46], at[87]);    MULADD(at[47], at[86]); 
-   COMBA_STORE(C->dp[85]);
-   /* 86 */
-   COMBA_FORWARD;
-   MULADD(at[39], at[95]);    MULADD(at[40], at[94]);    MULADD(at[41], at[93]);    MULADD(at[42], at[92]);    MULADD(at[43], at[91]);    MULADD(at[44], at[90]);    MULADD(at[45], at[89]);    MULADD(at[46], at[88]);    MULADD(at[47], at[87]); 
-   COMBA_STORE(C->dp[86]);
-   /* 87 */
-   COMBA_FORWARD;
-   MULADD(at[40], at[95]);    MULADD(at[41], at[94]);    MULADD(at[42], at[93]);    MULADD(at[43], at[92]);    MULADD(at[44], at[91]);    MULADD(at[45], at[90]);    MULADD(at[46], at[89]);    MULADD(at[47], at[88]); 
-   COMBA_STORE(C->dp[87]);
-   /* 88 */
-   COMBA_FORWARD;
-   MULADD(at[41], at[95]);    MULADD(at[42], at[94]);    MULADD(at[43], at[93]);    MULADD(at[44], at[92]);    MULADD(at[45], at[91]);    MULADD(at[46], at[90]);    MULADD(at[47], at[89]); 
-   COMBA_STORE(C->dp[88]);
-   /* 89 */
-   COMBA_FORWARD;
-   MULADD(at[42], at[95]);    MULADD(at[43], at[94]);    MULADD(at[44], at[93]);    MULADD(at[45], at[92]);    MULADD(at[46], at[91]);    MULADD(at[47], at[90]); 
-   COMBA_STORE(C->dp[89]);
-   /* 90 */
-   COMBA_FORWARD;
-   MULADD(at[43], at[95]);    MULADD(at[44], at[94]);    MULADD(at[45], at[93]);    MULADD(at[46], at[92]);    MULADD(at[47], at[91]); 
-   COMBA_STORE(C->dp[90]);
-   /* 91 */
-   COMBA_FORWARD;
-   MULADD(at[44], at[95]);    MULADD(at[45], at[94]);    MULADD(at[46], at[93]);    MULADD(at[47], at[92]); 
-   COMBA_STORE(C->dp[91]);
-   /* 92 */
-   COMBA_FORWARD;
-   MULADD(at[45], at[95]);    MULADD(at[46], at[94]);    MULADD(at[47], at[93]); 
-   COMBA_STORE(C->dp[92]);
-   /* 93 */
-   COMBA_FORWARD;
-   MULADD(at[46], at[95]);    MULADD(at[47], at[94]); 
-   COMBA_STORE(C->dp[93]);
-   /* 94 */
-   COMBA_FORWARD;
-   MULADD(at[47], at[95]); 
-   COMBA_STORE(C->dp[94]);
-   COMBA_STORE2(C->dp[95]);
-   C->used = 96;
-   C->sign = A->sign ^ B->sign;
-   fp_clamp(C);
-   COMBA_FINI;
-
-#ifdef WOLFSSL_SMALL_STACK
-   XFREE(at, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-#endif
-   return FP_OKAY;
-}
-#endif
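
The fp_mul_comba_*.i files removed in this commit are machine-generated, fully unrolled Comba multipliers: each `/* n */` block computes one output digit C->dp[n] as the sum of all products at[i]*at[j] with i+j == n, carried in the three-digit accumulator (c0, c1, c2). A minimal sketch of the driving macros, assuming the portable (non-assembly) configuration from wolfcrypt/tfm.h where fp_word is twice as wide as fp_digit, looks like this; the real definitions live in tfm.h and have per-platform assembly variants:

/* Sketch only: portable forms of the Comba macros used by the
 * generated fp_mul_comba_*.i files. (c0, c1, c2) is a running
 * 3-digit column accumulator. */
#define COMBA_START
#define COMBA_CLEAR      c0 = c1 = c2 = 0
#define COMBA_FORWARD    do { c0 = c1; c1 = c2; c2 = 0; } while (0) /* advance to next column */
#define COMBA_STORE(x)   (x) = c0                                   /* emit this column's digit */
#define COMBA_STORE2(x)  (x) = c1                                   /* emit the final carry digit */
#define COMBA_FINI
#define MULADD(i, j)                                          \
    do { fp_word t;                                           \
         t  = (fp_word)c0 + (fp_word)(i) * (fp_word)(j);      \
         c0 = (fp_digit)t;                                    \
         t  = (fp_word)c1 + (t >> DIGIT_BIT);                 \
         c1 = (fp_digit)t;                                    \
         c2 += (fp_digit)(t >> DIGIT_BIT);                    \
    } while (0)

Deleting the .i files therefore removes only pre-expanded instances of this one scheme; the generic loop-based fp_mul_comba() in tfm.c still covers every operand size.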

+ 0 - 99
lib/wolfssl/wolfcrypt/src/fp_mul_comba_6.i

@@ -1,99 +0,0 @@
-/* fp_mul_comba_6.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#ifdef TFM_MUL6
-int fp_mul_comba6(fp_int *A, fp_int *B, fp_int *C)
-{
-   fp_digit c0, c1, c2;
-#ifndef WOLFSSL_SMALL_STACK
-   fp_digit at[12];
-#else
-   fp_digit *at;
-#endif
-
-#ifdef WOLFSSL_SMALL_STACK
-   at = (fp_digit*)XMALLOC(sizeof(fp_digit) * 12, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-   if (at == NULL)
-       return FP_MEM;
-#endif
-
-   XMEMCPY(at, A->dp, 6 * sizeof(fp_digit));
-   XMEMCPY(at+6, B->dp, 6 * sizeof(fp_digit));
-   COMBA_START;
-
-   COMBA_CLEAR;
-   /* 0 */
-   MULADD(at[0], at[6]); 
-   COMBA_STORE(C->dp[0]);
-   /* 1 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[7]);    MULADD(at[1], at[6]); 
-   COMBA_STORE(C->dp[1]);
-   /* 2 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[8]);    MULADD(at[1], at[7]);    MULADD(at[2], at[6]); 
-   COMBA_STORE(C->dp[2]);
-   /* 3 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[9]);    MULADD(at[1], at[8]);    MULADD(at[2], at[7]);    MULADD(at[3], at[6]); 
-   COMBA_STORE(C->dp[3]);
-   /* 4 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[10]);    MULADD(at[1], at[9]);    MULADD(at[2], at[8]);    MULADD(at[3], at[7]);    MULADD(at[4], at[6]); 
-   COMBA_STORE(C->dp[4]);
-   /* 5 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[11]);    MULADD(at[1], at[10]);    MULADD(at[2], at[9]);    MULADD(at[3], at[8]);    MULADD(at[4], at[7]);    MULADD(at[5], at[6]); 
-   COMBA_STORE(C->dp[5]);
-   /* 6 */
-   COMBA_FORWARD;
-   MULADD(at[1], at[11]);    MULADD(at[2], at[10]);    MULADD(at[3], at[9]);    MULADD(at[4], at[8]);    MULADD(at[5], at[7]); 
-   COMBA_STORE(C->dp[6]);
-   /* 7 */
-   COMBA_FORWARD;
-   MULADD(at[2], at[11]);    MULADD(at[3], at[10]);    MULADD(at[4], at[9]);    MULADD(at[5], at[8]); 
-   COMBA_STORE(C->dp[7]);
-   /* 8 */
-   COMBA_FORWARD;
-   MULADD(at[3], at[11]);    MULADD(at[4], at[10]);    MULADD(at[5], at[9]); 
-   COMBA_STORE(C->dp[8]);
-   /* 9 */
-   COMBA_FORWARD;
-   MULADD(at[4], at[11]);    MULADD(at[5], at[10]); 
-   COMBA_STORE(C->dp[9]);
-   /* 10 */
-   COMBA_FORWARD;
-   MULADD(at[5], at[11]); 
-   COMBA_STORE(C->dp[10]);
-   COMBA_STORE2(C->dp[11]);
-   C->used = 12;
-   C->sign = A->sign ^ B->sign;
-   fp_clamp(C);
-   COMBA_FINI;
-
-#ifdef WOLFSSL_SMALL_STACK
-   XFREE(at, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-#endif
-   return FP_OKAY;
-}
-#endif
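
Each unrolled variant is gated by its own TFM_MULxx macro and reached only through the size dispatch in fp_mul(). A simplified sketch of that dispatch, paraphrased from wolfcrypt/src/tfm.c (the real function also weighs TFM_SMALL_SET and Karatsuba cut-offs), shows why each .i file can be dropped independently:

/* Simplified sketch of the fp_mul() size dispatch in tfm.c. */
int fp_mul(fp_int *A, fp_int *B, fp_int *C)
{
    int y = (A->used > B->used) ? A->used : B->used;
#ifdef TFM_MUL6
    if (y <= 6)
        return fp_mul_comba6(A, B, C);   /* expanded in fp_mul_comba_6.i */
#endif
#ifdef TFM_MUL64
    if (y <= 64)
        return fp_mul_comba64(A, B, C);  /* expanded in fp_mul_comba_64.i */
#endif
    return fp_mul_comba(A, B, C);        /* generic loop, always available */
}

With no TFM_MULxx guards defined, which the wholesale removal suggests is the case for this build, only the generic path is compiled and the generated files are dead weight.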

+ 0 - 563
lib/wolfssl/wolfcrypt/src/fp_mul_comba_64.i

@@ -1,563 +0,0 @@
-/* fp_mul_comba_64.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#ifdef TFM_MUL64
-int fp_mul_comba64(fp_int *A, fp_int *B, fp_int *C)
-{
-   fp_digit c0, c1, c2;
-#ifndef WOLFSSL_SMALL_STACK
-   fp_digit at[128];
-#else
-   fp_digit *at;
-#endif
-
-#ifdef WOLFSSL_SMALL_STACK
-   at = (fp_digit*)XMALLOC(sizeof(fp_digit) * 128, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-   if (at == NULL)
-       return FP_MEM;
-#endif
-
-   XMEMCPY(at, A->dp, 64 * sizeof(fp_digit));
-   XMEMCPY(at+64, B->dp, 64 * sizeof(fp_digit));
-   COMBA_START;
-
-   COMBA_CLEAR;
-   /* 0 */
-   MULADD(at[0], at[64]); 
-   COMBA_STORE(C->dp[0]);
-   /* 1 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[65]);    MULADD(at[1], at[64]); 
-   COMBA_STORE(C->dp[1]);
-   /* 2 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[66]);    MULADD(at[1], at[65]);    MULADD(at[2], at[64]); 
-   COMBA_STORE(C->dp[2]);
-   /* 3 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[67]);    MULADD(at[1], at[66]);    MULADD(at[2], at[65]);    MULADD(at[3], at[64]); 
-   COMBA_STORE(C->dp[3]);
-   /* 4 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[68]);    MULADD(at[1], at[67]);    MULADD(at[2], at[66]);    MULADD(at[3], at[65]);    MULADD(at[4], at[64]); 
-   COMBA_STORE(C->dp[4]);
-   /* 5 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[69]);    MULADD(at[1], at[68]);    MULADD(at[2], at[67]);    MULADD(at[3], at[66]);    MULADD(at[4], at[65]);    MULADD(at[5], at[64]); 
-   COMBA_STORE(C->dp[5]);
-   /* 6 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[70]);    MULADD(at[1], at[69]);    MULADD(at[2], at[68]);    MULADD(at[3], at[67]);    MULADD(at[4], at[66]);    MULADD(at[5], at[65]);    MULADD(at[6], at[64]); 
-   COMBA_STORE(C->dp[6]);
-   /* 7 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[71]);    MULADD(at[1], at[70]);    MULADD(at[2], at[69]);    MULADD(at[3], at[68]);    MULADD(at[4], at[67]);    MULADD(at[5], at[66]);    MULADD(at[6], at[65]);    MULADD(at[7], at[64]); 
-   COMBA_STORE(C->dp[7]);
-   /* 8 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[72]);    MULADD(at[1], at[71]);    MULADD(at[2], at[70]);    MULADD(at[3], at[69]);    MULADD(at[4], at[68]);    MULADD(at[5], at[67]);    MULADD(at[6], at[66]);    MULADD(at[7], at[65]);    MULADD(at[8], at[64]); 
-   COMBA_STORE(C->dp[8]);
-   /* 9 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[73]);    MULADD(at[1], at[72]);    MULADD(at[2], at[71]);    MULADD(at[3], at[70]);    MULADD(at[4], at[69]);    MULADD(at[5], at[68]);    MULADD(at[6], at[67]);    MULADD(at[7], at[66]);    MULADD(at[8], at[65]);    MULADD(at[9], at[64]); 
-   COMBA_STORE(C->dp[9]);
-   /* 10 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[74]);    MULADD(at[1], at[73]);    MULADD(at[2], at[72]);    MULADD(at[3], at[71]);    MULADD(at[4], at[70]);    MULADD(at[5], at[69]);    MULADD(at[6], at[68]);    MULADD(at[7], at[67]);    MULADD(at[8], at[66]);    MULADD(at[9], at[65]);    MULADD(at[10], at[64]); 
-   COMBA_STORE(C->dp[10]);
-   /* 11 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[75]);    MULADD(at[1], at[74]);    MULADD(at[2], at[73]);    MULADD(at[3], at[72]);    MULADD(at[4], at[71]);    MULADD(at[5], at[70]);    MULADD(at[6], at[69]);    MULADD(at[7], at[68]);    MULADD(at[8], at[67]);    MULADD(at[9], at[66]);    MULADD(at[10], at[65]);    MULADD(at[11], at[64]); 
-   COMBA_STORE(C->dp[11]);
-   /* 12 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[76]);    MULADD(at[1], at[75]);    MULADD(at[2], at[74]);    MULADD(at[3], at[73]);    MULADD(at[4], at[72]);    MULADD(at[5], at[71]);    MULADD(at[6], at[70]);    MULADD(at[7], at[69]);    MULADD(at[8], at[68]);    MULADD(at[9], at[67]);    MULADD(at[10], at[66]);    MULADD(at[11], at[65]);    MULADD(at[12], at[64]); 
-   COMBA_STORE(C->dp[12]);
-   /* 13 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[77]);    MULADD(at[1], at[76]);    MULADD(at[2], at[75]);    MULADD(at[3], at[74]);    MULADD(at[4], at[73]);    MULADD(at[5], at[72]);    MULADD(at[6], at[71]);    MULADD(at[7], at[70]);    MULADD(at[8], at[69]);    MULADD(at[9], at[68]);    MULADD(at[10], at[67]);    MULADD(at[11], at[66]);    MULADD(at[12], at[65]);    MULADD(at[13], at[64]); 
-   COMBA_STORE(C->dp[13]);
-   /* 14 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[78]);    MULADD(at[1], at[77]);    MULADD(at[2], at[76]);    MULADD(at[3], at[75]);    MULADD(at[4], at[74]);    MULADD(at[5], at[73]);    MULADD(at[6], at[72]);    MULADD(at[7], at[71]);    MULADD(at[8], at[70]);    MULADD(at[9], at[69]);    MULADD(at[10], at[68]);    MULADD(at[11], at[67]);    MULADD(at[12], at[66]);    MULADD(at[13], at[65]);    MULADD(at[14], at[64]); 
-   COMBA_STORE(C->dp[14]);
-   /* 15 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[79]);    MULADD(at[1], at[78]);    MULADD(at[2], at[77]);    MULADD(at[3], at[76]);    MULADD(at[4], at[75]);    MULADD(at[5], at[74]);    MULADD(at[6], at[73]);    MULADD(at[7], at[72]);    MULADD(at[8], at[71]);    MULADD(at[9], at[70]);    MULADD(at[10], at[69]);    MULADD(at[11], at[68]);    MULADD(at[12], at[67]);    MULADD(at[13], at[66]);    MULADD(at[14], at[65]);    MULADD(at[15], at[64]); 
-   COMBA_STORE(C->dp[15]);
-   /* 16 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[80]);    MULADD(at[1], at[79]);    MULADD(at[2], at[78]);    MULADD(at[3], at[77]);    MULADD(at[4], at[76]);    MULADD(at[5], at[75]);    MULADD(at[6], at[74]);    MULADD(at[7], at[73]);    MULADD(at[8], at[72]);    MULADD(at[9], at[71]);    MULADD(at[10], at[70]);    MULADD(at[11], at[69]);    MULADD(at[12], at[68]);    MULADD(at[13], at[67]);    MULADD(at[14], at[66]);    MULADD(at[15], at[65]);    MULADD(at[16], at[64]); 
-   COMBA_STORE(C->dp[16]);
-   /* 17 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[81]);    MULADD(at[1], at[80]);    MULADD(at[2], at[79]);    MULADD(at[3], at[78]);    MULADD(at[4], at[77]);    MULADD(at[5], at[76]);    MULADD(at[6], at[75]);    MULADD(at[7], at[74]);    MULADD(at[8], at[73]);    MULADD(at[9], at[72]);    MULADD(at[10], at[71]);    MULADD(at[11], at[70]);    MULADD(at[12], at[69]);    MULADD(at[13], at[68]);    MULADD(at[14], at[67]);    MULADD(at[15], at[66]);    MULADD(at[16], at[65]);    MULADD(at[17], at[64]); 
-   COMBA_STORE(C->dp[17]);
-   /* 18 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[82]);    MULADD(at[1], at[81]);    MULADD(at[2], at[80]);    MULADD(at[3], at[79]);    MULADD(at[4], at[78]);    MULADD(at[5], at[77]);    MULADD(at[6], at[76]);    MULADD(at[7], at[75]);    MULADD(at[8], at[74]);    MULADD(at[9], at[73]);    MULADD(at[10], at[72]);    MULADD(at[11], at[71]);    MULADD(at[12], at[70]);    MULADD(at[13], at[69]);    MULADD(at[14], at[68]);    MULADD(at[15], at[67]);    MULADD(at[16], at[66]);    MULADD(at[17], at[65]);    MULADD(at[18], at[64]); 
-   COMBA_STORE(C->dp[18]);
-   /* 19 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[83]);    MULADD(at[1], at[82]);    MULADD(at[2], at[81]);    MULADD(at[3], at[80]);    MULADD(at[4], at[79]);    MULADD(at[5], at[78]);    MULADD(at[6], at[77]);    MULADD(at[7], at[76]);    MULADD(at[8], at[75]);    MULADD(at[9], at[74]);    MULADD(at[10], at[73]);    MULADD(at[11], at[72]);    MULADD(at[12], at[71]);    MULADD(at[13], at[70]);    MULADD(at[14], at[69]);    MULADD(at[15], at[68]);    MULADD(at[16], at[67]);    MULADD(at[17], at[66]);    MULADD(at[18], at[65]);    MULADD(at[19], at[64]); 
-   COMBA_STORE(C->dp[19]);
-   /* 20 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[84]);    MULADD(at[1], at[83]);    MULADD(at[2], at[82]);    MULADD(at[3], at[81]);    MULADD(at[4], at[80]);    MULADD(at[5], at[79]);    MULADD(at[6], at[78]);    MULADD(at[7], at[77]);    MULADD(at[8], at[76]);    MULADD(at[9], at[75]);    MULADD(at[10], at[74]);    MULADD(at[11], at[73]);    MULADD(at[12], at[72]);    MULADD(at[13], at[71]);    MULADD(at[14], at[70]);    MULADD(at[15], at[69]);    MULADD(at[16], at[68]);    MULADD(at[17], at[67]);    MULADD(at[18], at[66]);    MULADD(at[19], at[65]);    MULADD(at[20], at[64]); 
-   COMBA_STORE(C->dp[20]);
-   /* 21 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[85]);    MULADD(at[1], at[84]);    MULADD(at[2], at[83]);    MULADD(at[3], at[82]);    MULADD(at[4], at[81]);    MULADD(at[5], at[80]);    MULADD(at[6], at[79]);    MULADD(at[7], at[78]);    MULADD(at[8], at[77]);    MULADD(at[9], at[76]);    MULADD(at[10], at[75]);    MULADD(at[11], at[74]);    MULADD(at[12], at[73]);    MULADD(at[13], at[72]);    MULADD(at[14], at[71]);    MULADD(at[15], at[70]);    MULADD(at[16], at[69]);    MULADD(at[17], at[68]);    MULADD(at[18], at[67]);    MULADD(at[19], at[66]);    MULADD(at[20], at[65]);    MULADD(at[21], at[64]); 
-   COMBA_STORE(C->dp[21]);
-   /* 22 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[86]);    MULADD(at[1], at[85]);    MULADD(at[2], at[84]);    MULADD(at[3], at[83]);    MULADD(at[4], at[82]);    MULADD(at[5], at[81]);    MULADD(at[6], at[80]);    MULADD(at[7], at[79]);    MULADD(at[8], at[78]);    MULADD(at[9], at[77]);    MULADD(at[10], at[76]);    MULADD(at[11], at[75]);    MULADD(at[12], at[74]);    MULADD(at[13], at[73]);    MULADD(at[14], at[72]);    MULADD(at[15], at[71]);    MULADD(at[16], at[70]);    MULADD(at[17], at[69]);    MULADD(at[18], at[68]);    MULADD(at[19], at[67]);    MULADD(at[20], at[66]);    MULADD(at[21], at[65]);    MULADD(at[22], at[64]); 
-   COMBA_STORE(C->dp[22]);
-   /* 23 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[87]);    MULADD(at[1], at[86]);    MULADD(at[2], at[85]);    MULADD(at[3], at[84]);    MULADD(at[4], at[83]);    MULADD(at[5], at[82]);    MULADD(at[6], at[81]);    MULADD(at[7], at[80]);    MULADD(at[8], at[79]);    MULADD(at[9], at[78]);    MULADD(at[10], at[77]);    MULADD(at[11], at[76]);    MULADD(at[12], at[75]);    MULADD(at[13], at[74]);    MULADD(at[14], at[73]);    MULADD(at[15], at[72]);    MULADD(at[16], at[71]);    MULADD(at[17], at[70]);    MULADD(at[18], at[69]);    MULADD(at[19], at[68]);    MULADD(at[20], at[67]);    MULADD(at[21], at[66]);    MULADD(at[22], at[65]);    MULADD(at[23], at[64]); 
-   COMBA_STORE(C->dp[23]);
-   /* 24 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[88]);    MULADD(at[1], at[87]);    MULADD(at[2], at[86]);    MULADD(at[3], at[85]);    MULADD(at[4], at[84]);    MULADD(at[5], at[83]);    MULADD(at[6], at[82]);    MULADD(at[7], at[81]);    MULADD(at[8], at[80]);    MULADD(at[9], at[79]);    MULADD(at[10], at[78]);    MULADD(at[11], at[77]);    MULADD(at[12], at[76]);    MULADD(at[13], at[75]);    MULADD(at[14], at[74]);    MULADD(at[15], at[73]);    MULADD(at[16], at[72]);    MULADD(at[17], at[71]);    MULADD(at[18], at[70]);    MULADD(at[19], at[69]);    MULADD(at[20], at[68]);    MULADD(at[21], at[67]);    MULADD(at[22], at[66]);    MULADD(at[23], at[65]);    MULADD(at[24], at[64]); 
-   COMBA_STORE(C->dp[24]);
-   /* 25 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[89]);    MULADD(at[1], at[88]);    MULADD(at[2], at[87]);    MULADD(at[3], at[86]);    MULADD(at[4], at[85]);    MULADD(at[5], at[84]);    MULADD(at[6], at[83]);    MULADD(at[7], at[82]);    MULADD(at[8], at[81]);    MULADD(at[9], at[80]);    MULADD(at[10], at[79]);    MULADD(at[11], at[78]);    MULADD(at[12], at[77]);    MULADD(at[13], at[76]);    MULADD(at[14], at[75]);    MULADD(at[15], at[74]);    MULADD(at[16], at[73]);    MULADD(at[17], at[72]);    MULADD(at[18], at[71]);    MULADD(at[19], at[70]);    MULADD(at[20], at[69]);    MULADD(at[21], at[68]);    MULADD(at[22], at[67]);    MULADD(at[23], at[66]);    MULADD(at[24], at[65]);    MULADD(at[25], at[64]); 
-   COMBA_STORE(C->dp[25]);
-   /* 26 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[90]);    MULADD(at[1], at[89]);    MULADD(at[2], at[88]);    MULADD(at[3], at[87]);    MULADD(at[4], at[86]);    MULADD(at[5], at[85]);    MULADD(at[6], at[84]);    MULADD(at[7], at[83]);    MULADD(at[8], at[82]);    MULADD(at[9], at[81]);    MULADD(at[10], at[80]);    MULADD(at[11], at[79]);    MULADD(at[12], at[78]);    MULADD(at[13], at[77]);    MULADD(at[14], at[76]);    MULADD(at[15], at[75]);    MULADD(at[16], at[74]);    MULADD(at[17], at[73]);    MULADD(at[18], at[72]);    MULADD(at[19], at[71]);    MULADD(at[20], at[70]);    MULADD(at[21], at[69]);    MULADD(at[22], at[68]);    MULADD(at[23], at[67]);    MULADD(at[24], at[66]);    MULADD(at[25], at[65]);    MULADD(at[26], at[64]); 
-   COMBA_STORE(C->dp[26]);
-   /* 27 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[91]);    MULADD(at[1], at[90]);    MULADD(at[2], at[89]);    MULADD(at[3], at[88]);    MULADD(at[4], at[87]);    MULADD(at[5], at[86]);    MULADD(at[6], at[85]);    MULADD(at[7], at[84]);    MULADD(at[8], at[83]);    MULADD(at[9], at[82]);    MULADD(at[10], at[81]);    MULADD(at[11], at[80]);    MULADD(at[12], at[79]);    MULADD(at[13], at[78]);    MULADD(at[14], at[77]);    MULADD(at[15], at[76]);    MULADD(at[16], at[75]);    MULADD(at[17], at[74]);    MULADD(at[18], at[73]);    MULADD(at[19], at[72]);    MULADD(at[20], at[71]);    MULADD(at[21], at[70]);    MULADD(at[22], at[69]);    MULADD(at[23], at[68]);    MULADD(at[24], at[67]);    MULADD(at[25], at[66]);    MULADD(at[26], at[65]);    MULADD(at[27], at[64]); 
-   COMBA_STORE(C->dp[27]);
-   /* 28 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[92]);    MULADD(at[1], at[91]);    MULADD(at[2], at[90]);    MULADD(at[3], at[89]);    MULADD(at[4], at[88]);    MULADD(at[5], at[87]);    MULADD(at[6], at[86]);    MULADD(at[7], at[85]);    MULADD(at[8], at[84]);    MULADD(at[9], at[83]);    MULADD(at[10], at[82]);    MULADD(at[11], at[81]);    MULADD(at[12], at[80]);    MULADD(at[13], at[79]);    MULADD(at[14], at[78]);    MULADD(at[15], at[77]);    MULADD(at[16], at[76]);    MULADD(at[17], at[75]);    MULADD(at[18], at[74]);    MULADD(at[19], at[73]);    MULADD(at[20], at[72]);    MULADD(at[21], at[71]);    MULADD(at[22], at[70]);    MULADD(at[23], at[69]);    MULADD(at[24], at[68]);    MULADD(at[25], at[67]);    MULADD(at[26], at[66]);    MULADD(at[27], at[65]);    MULADD(at[28], at[64]); 
-   COMBA_STORE(C->dp[28]);
-   /* 29 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[93]);    MULADD(at[1], at[92]);    MULADD(at[2], at[91]);    MULADD(at[3], at[90]);    MULADD(at[4], at[89]);    MULADD(at[5], at[88]);    MULADD(at[6], at[87]);    MULADD(at[7], at[86]);    MULADD(at[8], at[85]);    MULADD(at[9], at[84]);    MULADD(at[10], at[83]);    MULADD(at[11], at[82]);    MULADD(at[12], at[81]);    MULADD(at[13], at[80]);    MULADD(at[14], at[79]);    MULADD(at[15], at[78]);    MULADD(at[16], at[77]);    MULADD(at[17], at[76]);    MULADD(at[18], at[75]);    MULADD(at[19], at[74]);    MULADD(at[20], at[73]);    MULADD(at[21], at[72]);    MULADD(at[22], at[71]);    MULADD(at[23], at[70]);    MULADD(at[24], at[69]);    MULADD(at[25], at[68]);    MULADD(at[26], at[67]);    MULADD(at[27], at[66]);    MULADD(at[28], at[65]);    MULADD(at[29], at[64]); 
-   COMBA_STORE(C->dp[29]);
-   /* 30 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[94]);    MULADD(at[1], at[93]);    MULADD(at[2], at[92]);    MULADD(at[3], at[91]);    MULADD(at[4], at[90]);    MULADD(at[5], at[89]);    MULADD(at[6], at[88]);    MULADD(at[7], at[87]);    MULADD(at[8], at[86]);    MULADD(at[9], at[85]);    MULADD(at[10], at[84]);    MULADD(at[11], at[83]);    MULADD(at[12], at[82]);    MULADD(at[13], at[81]);    MULADD(at[14], at[80]);    MULADD(at[15], at[79]);    MULADD(at[16], at[78]);    MULADD(at[17], at[77]);    MULADD(at[18], at[76]);    MULADD(at[19], at[75]);    MULADD(at[20], at[74]);    MULADD(at[21], at[73]);    MULADD(at[22], at[72]);    MULADD(at[23], at[71]);    MULADD(at[24], at[70]);    MULADD(at[25], at[69]);    MULADD(at[26], at[68]);    MULADD(at[27], at[67]);    MULADD(at[28], at[66]);    MULADD(at[29], at[65]);    MULADD(at[30], at[64]); 
-   COMBA_STORE(C->dp[30]);
-   /* 31 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[95]);    MULADD(at[1], at[94]);    MULADD(at[2], at[93]);    MULADD(at[3], at[92]);    MULADD(at[4], at[91]);    MULADD(at[5], at[90]);    MULADD(at[6], at[89]);    MULADD(at[7], at[88]);    MULADD(at[8], at[87]);    MULADD(at[9], at[86]);    MULADD(at[10], at[85]);    MULADD(at[11], at[84]);    MULADD(at[12], at[83]);    MULADD(at[13], at[82]);    MULADD(at[14], at[81]);    MULADD(at[15], at[80]);    MULADD(at[16], at[79]);    MULADD(at[17], at[78]);    MULADD(at[18], at[77]);    MULADD(at[19], at[76]);    MULADD(at[20], at[75]);    MULADD(at[21], at[74]);    MULADD(at[22], at[73]);    MULADD(at[23], at[72]);    MULADD(at[24], at[71]);    MULADD(at[25], at[70]);    MULADD(at[26], at[69]);    MULADD(at[27], at[68]);    MULADD(at[28], at[67]);    MULADD(at[29], at[66]);    MULADD(at[30], at[65]);    MULADD(at[31], at[64]); 
-   COMBA_STORE(C->dp[31]);
-   /* 32 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[96]);    MULADD(at[1], at[95]);    MULADD(at[2], at[94]);    MULADD(at[3], at[93]);    MULADD(at[4], at[92]);    MULADD(at[5], at[91]);    MULADD(at[6], at[90]);    MULADD(at[7], at[89]);    MULADD(at[8], at[88]);    MULADD(at[9], at[87]);    MULADD(at[10], at[86]);    MULADD(at[11], at[85]);    MULADD(at[12], at[84]);    MULADD(at[13], at[83]);    MULADD(at[14], at[82]);    MULADD(at[15], at[81]);    MULADD(at[16], at[80]);    MULADD(at[17], at[79]);    MULADD(at[18], at[78]);    MULADD(at[19], at[77]);    MULADD(at[20], at[76]);    MULADD(at[21], at[75]);    MULADD(at[22], at[74]);    MULADD(at[23], at[73]);    MULADD(at[24], at[72]);    MULADD(at[25], at[71]);    MULADD(at[26], at[70]);    MULADD(at[27], at[69]);    MULADD(at[28], at[68]);    MULADD(at[29], at[67]);    MULADD(at[30], at[66]);    MULADD(at[31], at[65]);    MULADD(at[32], at[64]); 
-   COMBA_STORE(C->dp[32]);
-   /* 33 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[97]);    MULADD(at[1], at[96]);    MULADD(at[2], at[95]);    MULADD(at[3], at[94]);    MULADD(at[4], at[93]);    MULADD(at[5], at[92]);    MULADD(at[6], at[91]);    MULADD(at[7], at[90]);    MULADD(at[8], at[89]);    MULADD(at[9], at[88]);    MULADD(at[10], at[87]);    MULADD(at[11], at[86]);    MULADD(at[12], at[85]);    MULADD(at[13], at[84]);    MULADD(at[14], at[83]);    MULADD(at[15], at[82]);    MULADD(at[16], at[81]);    MULADD(at[17], at[80]);    MULADD(at[18], at[79]);    MULADD(at[19], at[78]);    MULADD(at[20], at[77]);    MULADD(at[21], at[76]);    MULADD(at[22], at[75]);    MULADD(at[23], at[74]);    MULADD(at[24], at[73]);    MULADD(at[25], at[72]);    MULADD(at[26], at[71]);    MULADD(at[27], at[70]);    MULADD(at[28], at[69]);    MULADD(at[29], at[68]);    MULADD(at[30], at[67]);    MULADD(at[31], at[66]);    MULADD(at[32], at[65]);    MULADD(at[33], at[64]); 
-   COMBA_STORE(C->dp[33]);
-   /* 34 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[98]);    MULADD(at[1], at[97]);    MULADD(at[2], at[96]);    MULADD(at[3], at[95]);    MULADD(at[4], at[94]);    MULADD(at[5], at[93]);    MULADD(at[6], at[92]);    MULADD(at[7], at[91]);    MULADD(at[8], at[90]);    MULADD(at[9], at[89]);    MULADD(at[10], at[88]);    MULADD(at[11], at[87]);    MULADD(at[12], at[86]);    MULADD(at[13], at[85]);    MULADD(at[14], at[84]);    MULADD(at[15], at[83]);    MULADD(at[16], at[82]);    MULADD(at[17], at[81]);    MULADD(at[18], at[80]);    MULADD(at[19], at[79]);    MULADD(at[20], at[78]);    MULADD(at[21], at[77]);    MULADD(at[22], at[76]);    MULADD(at[23], at[75]);    MULADD(at[24], at[74]);    MULADD(at[25], at[73]);    MULADD(at[26], at[72]);    MULADD(at[27], at[71]);    MULADD(at[28], at[70]);    MULADD(at[29], at[69]);    MULADD(at[30], at[68]);    MULADD(at[31], at[67]);    MULADD(at[32], at[66]);    MULADD(at[33], at[65]);    MULADD(at[34], at[64]); 
-   COMBA_STORE(C->dp[34]);
-   /* 35 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[99]);    MULADD(at[1], at[98]);    MULADD(at[2], at[97]);    MULADD(at[3], at[96]);    MULADD(at[4], at[95]);    MULADD(at[5], at[94]);    MULADD(at[6], at[93]);    MULADD(at[7], at[92]);    MULADD(at[8], at[91]);    MULADD(at[9], at[90]);    MULADD(at[10], at[89]);    MULADD(at[11], at[88]);    MULADD(at[12], at[87]);    MULADD(at[13], at[86]);    MULADD(at[14], at[85]);    MULADD(at[15], at[84]);    MULADD(at[16], at[83]);    MULADD(at[17], at[82]);    MULADD(at[18], at[81]);    MULADD(at[19], at[80]);    MULADD(at[20], at[79]);    MULADD(at[21], at[78]);    MULADD(at[22], at[77]);    MULADD(at[23], at[76]);    MULADD(at[24], at[75]);    MULADD(at[25], at[74]);    MULADD(at[26], at[73]);    MULADD(at[27], at[72]);    MULADD(at[28], at[71]);    MULADD(at[29], at[70]);    MULADD(at[30], at[69]);    MULADD(at[31], at[68]);    MULADD(at[32], at[67]);    MULADD(at[33], at[66]);    MULADD(at[34], at[65]);    MULADD(at[35], at[64]); 
-   COMBA_STORE(C->dp[35]);
-   /* 36 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[100]);    MULADD(at[1], at[99]);    MULADD(at[2], at[98]);    MULADD(at[3], at[97]);    MULADD(at[4], at[96]);    MULADD(at[5], at[95]);    MULADD(at[6], at[94]);    MULADD(at[7], at[93]);    MULADD(at[8], at[92]);    MULADD(at[9], at[91]);    MULADD(at[10], at[90]);    MULADD(at[11], at[89]);    MULADD(at[12], at[88]);    MULADD(at[13], at[87]);    MULADD(at[14], at[86]);    MULADD(at[15], at[85]);    MULADD(at[16], at[84]);    MULADD(at[17], at[83]);    MULADD(at[18], at[82]);    MULADD(at[19], at[81]);    MULADD(at[20], at[80]);    MULADD(at[21], at[79]);    MULADD(at[22], at[78]);    MULADD(at[23], at[77]);    MULADD(at[24], at[76]);    MULADD(at[25], at[75]);    MULADD(at[26], at[74]);    MULADD(at[27], at[73]);    MULADD(at[28], at[72]);    MULADD(at[29], at[71]);    MULADD(at[30], at[70]);    MULADD(at[31], at[69]);    MULADD(at[32], at[68]);    MULADD(at[33], at[67]);    MULADD(at[34], at[66]);    MULADD(at[35], at[65]);    MULADD(at[36], at[64]); 
-   COMBA_STORE(C->dp[36]);
-   /* 37 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[101]);    MULADD(at[1], at[100]);    MULADD(at[2], at[99]);    MULADD(at[3], at[98]);    MULADD(at[4], at[97]);    MULADD(at[5], at[96]);    MULADD(at[6], at[95]);    MULADD(at[7], at[94]);    MULADD(at[8], at[93]);    MULADD(at[9], at[92]);    MULADD(at[10], at[91]);    MULADD(at[11], at[90]);    MULADD(at[12], at[89]);    MULADD(at[13], at[88]);    MULADD(at[14], at[87]);    MULADD(at[15], at[86]);    MULADD(at[16], at[85]);    MULADD(at[17], at[84]);    MULADD(at[18], at[83]);    MULADD(at[19], at[82]);    MULADD(at[20], at[81]);    MULADD(at[21], at[80]);    MULADD(at[22], at[79]);    MULADD(at[23], at[78]);    MULADD(at[24], at[77]);    MULADD(at[25], at[76]);    MULADD(at[26], at[75]);    MULADD(at[27], at[74]);    MULADD(at[28], at[73]);    MULADD(at[29], at[72]);    MULADD(at[30], at[71]);    MULADD(at[31], at[70]);    MULADD(at[32], at[69]);    MULADD(at[33], at[68]);    MULADD(at[34], at[67]);    MULADD(at[35], at[66]);    MULADD(at[36], at[65]);    MULADD(at[37], at[64]); 
-   COMBA_STORE(C->dp[37]);
-   /* 38 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[102]);    MULADD(at[1], at[101]);    MULADD(at[2], at[100]);    MULADD(at[3], at[99]);    MULADD(at[4], at[98]);    MULADD(at[5], at[97]);    MULADD(at[6], at[96]);    MULADD(at[7], at[95]);    MULADD(at[8], at[94]);    MULADD(at[9], at[93]);    MULADD(at[10], at[92]);    MULADD(at[11], at[91]);    MULADD(at[12], at[90]);    MULADD(at[13], at[89]);    MULADD(at[14], at[88]);    MULADD(at[15], at[87]);    MULADD(at[16], at[86]);    MULADD(at[17], at[85]);    MULADD(at[18], at[84]);    MULADD(at[19], at[83]);    MULADD(at[20], at[82]);    MULADD(at[21], at[81]);    MULADD(at[22], at[80]);    MULADD(at[23], at[79]);    MULADD(at[24], at[78]);    MULADD(at[25], at[77]);    MULADD(at[26], at[76]);    MULADD(at[27], at[75]);    MULADD(at[28], at[74]);    MULADD(at[29], at[73]);    MULADD(at[30], at[72]);    MULADD(at[31], at[71]);    MULADD(at[32], at[70]);    MULADD(at[33], at[69]);    MULADD(at[34], at[68]);    MULADD(at[35], at[67]);    MULADD(at[36], at[66]);    MULADD(at[37], at[65]);    MULADD(at[38], at[64]); 
-   COMBA_STORE(C->dp[38]);
-   /* 39 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[103]);    MULADD(at[1], at[102]);    MULADD(at[2], at[101]);    MULADD(at[3], at[100]);    MULADD(at[4], at[99]);    MULADD(at[5], at[98]);    MULADD(at[6], at[97]);    MULADD(at[7], at[96]);    MULADD(at[8], at[95]);    MULADD(at[9], at[94]);    MULADD(at[10], at[93]);    MULADD(at[11], at[92]);    MULADD(at[12], at[91]);    MULADD(at[13], at[90]);    MULADD(at[14], at[89]);    MULADD(at[15], at[88]);    MULADD(at[16], at[87]);    MULADD(at[17], at[86]);    MULADD(at[18], at[85]);    MULADD(at[19], at[84]);    MULADD(at[20], at[83]);    MULADD(at[21], at[82]);    MULADD(at[22], at[81]);    MULADD(at[23], at[80]);    MULADD(at[24], at[79]);    MULADD(at[25], at[78]);    MULADD(at[26], at[77]);    MULADD(at[27], at[76]);    MULADD(at[28], at[75]);    MULADD(at[29], at[74]);    MULADD(at[30], at[73]);    MULADD(at[31], at[72]);    MULADD(at[32], at[71]);    MULADD(at[33], at[70]);    MULADD(at[34], at[69]);    MULADD(at[35], at[68]);    MULADD(at[36], at[67]);    MULADD(at[37], at[66]);    MULADD(at[38], at[65]);    MULADD(at[39], at[64]); 
-   COMBA_STORE(C->dp[39]);
-   /* 40 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[104]);    MULADD(at[1], at[103]);    MULADD(at[2], at[102]);    MULADD(at[3], at[101]);    MULADD(at[4], at[100]);    MULADD(at[5], at[99]);    MULADD(at[6], at[98]);    MULADD(at[7], at[97]);    MULADD(at[8], at[96]);    MULADD(at[9], at[95]);    MULADD(at[10], at[94]);    MULADD(at[11], at[93]);    MULADD(at[12], at[92]);    MULADD(at[13], at[91]);    MULADD(at[14], at[90]);    MULADD(at[15], at[89]);    MULADD(at[16], at[88]);    MULADD(at[17], at[87]);    MULADD(at[18], at[86]);    MULADD(at[19], at[85]);    MULADD(at[20], at[84]);    MULADD(at[21], at[83]);    MULADD(at[22], at[82]);    MULADD(at[23], at[81]);    MULADD(at[24], at[80]);    MULADD(at[25], at[79]);    MULADD(at[26], at[78]);    MULADD(at[27], at[77]);    MULADD(at[28], at[76]);    MULADD(at[29], at[75]);    MULADD(at[30], at[74]);    MULADD(at[31], at[73]);    MULADD(at[32], at[72]);    MULADD(at[33], at[71]);    MULADD(at[34], at[70]);    MULADD(at[35], at[69]);    MULADD(at[36], at[68]);    MULADD(at[37], at[67]);    MULADD(at[38], at[66]);    MULADD(at[39], at[65]);    MULADD(at[40], at[64]); 
-   COMBA_STORE(C->dp[40]);
-   /* 41 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[105]);    MULADD(at[1], at[104]);    MULADD(at[2], at[103]);    MULADD(at[3], at[102]);    MULADD(at[4], at[101]);    MULADD(at[5], at[100]);    MULADD(at[6], at[99]);    MULADD(at[7], at[98]);    MULADD(at[8], at[97]);    MULADD(at[9], at[96]);    MULADD(at[10], at[95]);    MULADD(at[11], at[94]);    MULADD(at[12], at[93]);    MULADD(at[13], at[92]);    MULADD(at[14], at[91]);    MULADD(at[15], at[90]);    MULADD(at[16], at[89]);    MULADD(at[17], at[88]);    MULADD(at[18], at[87]);    MULADD(at[19], at[86]);    MULADD(at[20], at[85]);    MULADD(at[21], at[84]);    MULADD(at[22], at[83]);    MULADD(at[23], at[82]);    MULADD(at[24], at[81]);    MULADD(at[25], at[80]);    MULADD(at[26], at[79]);    MULADD(at[27], at[78]);    MULADD(at[28], at[77]);    MULADD(at[29], at[76]);    MULADD(at[30], at[75]);    MULADD(at[31], at[74]);    MULADD(at[32], at[73]);    MULADD(at[33], at[72]);    MULADD(at[34], at[71]);    MULADD(at[35], at[70]);    MULADD(at[36], at[69]);    MULADD(at[37], at[68]);    MULADD(at[38], at[67]);    MULADD(at[39], at[66]);    MULADD(at[40], at[65]);    MULADD(at[41], at[64]); 
-   COMBA_STORE(C->dp[41]);
-   /* 42 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[106]);    MULADD(at[1], at[105]);    MULADD(at[2], at[104]);    MULADD(at[3], at[103]);    MULADD(at[4], at[102]);    MULADD(at[5], at[101]);    MULADD(at[6], at[100]);    MULADD(at[7], at[99]);    MULADD(at[8], at[98]);    MULADD(at[9], at[97]);    MULADD(at[10], at[96]);    MULADD(at[11], at[95]);    MULADD(at[12], at[94]);    MULADD(at[13], at[93]);    MULADD(at[14], at[92]);    MULADD(at[15], at[91]);    MULADD(at[16], at[90]);    MULADD(at[17], at[89]);    MULADD(at[18], at[88]);    MULADD(at[19], at[87]);    MULADD(at[20], at[86]);    MULADD(at[21], at[85]);    MULADD(at[22], at[84]);    MULADD(at[23], at[83]);    MULADD(at[24], at[82]);    MULADD(at[25], at[81]);    MULADD(at[26], at[80]);    MULADD(at[27], at[79]);    MULADD(at[28], at[78]);    MULADD(at[29], at[77]);    MULADD(at[30], at[76]);    MULADD(at[31], at[75]);    MULADD(at[32], at[74]);    MULADD(at[33], at[73]);    MULADD(at[34], at[72]);    MULADD(at[35], at[71]);    MULADD(at[36], at[70]);    MULADD(at[37], at[69]);    MULADD(at[38], at[68]);    MULADD(at[39], at[67]);    MULADD(at[40], at[66]);    MULADD(at[41], at[65]);    MULADD(at[42], at[64]); 
-   COMBA_STORE(C->dp[42]);
-   /* 43 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[107]);    MULADD(at[1], at[106]);    MULADD(at[2], at[105]);    MULADD(at[3], at[104]);    MULADD(at[4], at[103]);    MULADD(at[5], at[102]);    MULADD(at[6], at[101]);    MULADD(at[7], at[100]);    MULADD(at[8], at[99]);    MULADD(at[9], at[98]);    MULADD(at[10], at[97]);    MULADD(at[11], at[96]);    MULADD(at[12], at[95]);    MULADD(at[13], at[94]);    MULADD(at[14], at[93]);    MULADD(at[15], at[92]);    MULADD(at[16], at[91]);    MULADD(at[17], at[90]);    MULADD(at[18], at[89]);    MULADD(at[19], at[88]);    MULADD(at[20], at[87]);    MULADD(at[21], at[86]);    MULADD(at[22], at[85]);    MULADD(at[23], at[84]);    MULADD(at[24], at[83]);    MULADD(at[25], at[82]);    MULADD(at[26], at[81]);    MULADD(at[27], at[80]);    MULADD(at[28], at[79]);    MULADD(at[29], at[78]);    MULADD(at[30], at[77]);    MULADD(at[31], at[76]);    MULADD(at[32], at[75]);    MULADD(at[33], at[74]);    MULADD(at[34], at[73]);    MULADD(at[35], at[72]);    MULADD(at[36], at[71]);    MULADD(at[37], at[70]);    MULADD(at[38], at[69]);    MULADD(at[39], at[68]);    MULADD(at[40], at[67]);    MULADD(at[41], at[66]);    MULADD(at[42], at[65]);    MULADD(at[43], at[64]); 
-   COMBA_STORE(C->dp[43]);
-   /* 44 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[108]);    MULADD(at[1], at[107]);    MULADD(at[2], at[106]);    MULADD(at[3], at[105]);    MULADD(at[4], at[104]);    MULADD(at[5], at[103]);    MULADD(at[6], at[102]);    MULADD(at[7], at[101]);    MULADD(at[8], at[100]);    MULADD(at[9], at[99]);    MULADD(at[10], at[98]);    MULADD(at[11], at[97]);    MULADD(at[12], at[96]);    MULADD(at[13], at[95]);    MULADD(at[14], at[94]);    MULADD(at[15], at[93]);    MULADD(at[16], at[92]);    MULADD(at[17], at[91]);    MULADD(at[18], at[90]);    MULADD(at[19], at[89]);    MULADD(at[20], at[88]);    MULADD(at[21], at[87]);    MULADD(at[22], at[86]);    MULADD(at[23], at[85]);    MULADD(at[24], at[84]);    MULADD(at[25], at[83]);    MULADD(at[26], at[82]);    MULADD(at[27], at[81]);    MULADD(at[28], at[80]);    MULADD(at[29], at[79]);    MULADD(at[30], at[78]);    MULADD(at[31], at[77]);    MULADD(at[32], at[76]);    MULADD(at[33], at[75]);    MULADD(at[34], at[74]);    MULADD(at[35], at[73]);    MULADD(at[36], at[72]);    MULADD(at[37], at[71]);    MULADD(at[38], at[70]);    MULADD(at[39], at[69]);    MULADD(at[40], at[68]);    MULADD(at[41], at[67]);    MULADD(at[42], at[66]);    MULADD(at[43], at[65]);    MULADD(at[44], at[64]); 
-   COMBA_STORE(C->dp[44]);
-   /* 45 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[109]);    MULADD(at[1], at[108]);    MULADD(at[2], at[107]);    MULADD(at[3], at[106]);    MULADD(at[4], at[105]);    MULADD(at[5], at[104]);    MULADD(at[6], at[103]);    MULADD(at[7], at[102]);    MULADD(at[8], at[101]);    MULADD(at[9], at[100]);    MULADD(at[10], at[99]);    MULADD(at[11], at[98]);    MULADD(at[12], at[97]);    MULADD(at[13], at[96]);    MULADD(at[14], at[95]);    MULADD(at[15], at[94]);    MULADD(at[16], at[93]);    MULADD(at[17], at[92]);    MULADD(at[18], at[91]);    MULADD(at[19], at[90]);    MULADD(at[20], at[89]);    MULADD(at[21], at[88]);    MULADD(at[22], at[87]);    MULADD(at[23], at[86]);    MULADD(at[24], at[85]);    MULADD(at[25], at[84]);    MULADD(at[26], at[83]);    MULADD(at[27], at[82]);    MULADD(at[28], at[81]);    MULADD(at[29], at[80]);    MULADD(at[30], at[79]);    MULADD(at[31], at[78]);    MULADD(at[32], at[77]);    MULADD(at[33], at[76]);    MULADD(at[34], at[75]);    MULADD(at[35], at[74]);    MULADD(at[36], at[73]);    MULADD(at[37], at[72]);    MULADD(at[38], at[71]);    MULADD(at[39], at[70]);    MULADD(at[40], at[69]);    MULADD(at[41], at[68]);    MULADD(at[42], at[67]);    MULADD(at[43], at[66]);    MULADD(at[44], at[65]);    MULADD(at[45], at[64]); 
-   COMBA_STORE(C->dp[45]);
-   /* 46 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[110]);    MULADD(at[1], at[109]);    MULADD(at[2], at[108]);    MULADD(at[3], at[107]);    MULADD(at[4], at[106]);    MULADD(at[5], at[105]);    MULADD(at[6], at[104]);    MULADD(at[7], at[103]);    MULADD(at[8], at[102]);    MULADD(at[9], at[101]);    MULADD(at[10], at[100]);    MULADD(at[11], at[99]);    MULADD(at[12], at[98]);    MULADD(at[13], at[97]);    MULADD(at[14], at[96]);    MULADD(at[15], at[95]);    MULADD(at[16], at[94]);    MULADD(at[17], at[93]);    MULADD(at[18], at[92]);    MULADD(at[19], at[91]);    MULADD(at[20], at[90]);    MULADD(at[21], at[89]);    MULADD(at[22], at[88]);    MULADD(at[23], at[87]);    MULADD(at[24], at[86]);    MULADD(at[25], at[85]);    MULADD(at[26], at[84]);    MULADD(at[27], at[83]);    MULADD(at[28], at[82]);    MULADD(at[29], at[81]);    MULADD(at[30], at[80]);    MULADD(at[31], at[79]);    MULADD(at[32], at[78]);    MULADD(at[33], at[77]);    MULADD(at[34], at[76]);    MULADD(at[35], at[75]);    MULADD(at[36], at[74]);    MULADD(at[37], at[73]);    MULADD(at[38], at[72]);    MULADD(at[39], at[71]);    MULADD(at[40], at[70]);    MULADD(at[41], at[69]);    MULADD(at[42], at[68]);    MULADD(at[43], at[67]);    MULADD(at[44], at[66]);    MULADD(at[45], at[65]);    MULADD(at[46], at[64]); 
-   COMBA_STORE(C->dp[46]);
-   /* 47 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[111]);    MULADD(at[1], at[110]);    MULADD(at[2], at[109]);    MULADD(at[3], at[108]);    MULADD(at[4], at[107]);    MULADD(at[5], at[106]);    MULADD(at[6], at[105]);    MULADD(at[7], at[104]);    MULADD(at[8], at[103]);    MULADD(at[9], at[102]);    MULADD(at[10], at[101]);    MULADD(at[11], at[100]);    MULADD(at[12], at[99]);    MULADD(at[13], at[98]);    MULADD(at[14], at[97]);    MULADD(at[15], at[96]);    MULADD(at[16], at[95]);    MULADD(at[17], at[94]);    MULADD(at[18], at[93]);    MULADD(at[19], at[92]);    MULADD(at[20], at[91]);    MULADD(at[21], at[90]);    MULADD(at[22], at[89]);    MULADD(at[23], at[88]);    MULADD(at[24], at[87]);    MULADD(at[25], at[86]);    MULADD(at[26], at[85]);    MULADD(at[27], at[84]);    MULADD(at[28], at[83]);    MULADD(at[29], at[82]);    MULADD(at[30], at[81]);    MULADD(at[31], at[80]);    MULADD(at[32], at[79]);    MULADD(at[33], at[78]);    MULADD(at[34], at[77]);    MULADD(at[35], at[76]);    MULADD(at[36], at[75]);    MULADD(at[37], at[74]);    MULADD(at[38], at[73]);    MULADD(at[39], at[72]);    MULADD(at[40], at[71]);    MULADD(at[41], at[70]);    MULADD(at[42], at[69]);    MULADD(at[43], at[68]);    MULADD(at[44], at[67]);    MULADD(at[45], at[66]);    MULADD(at[46], at[65]);    MULADD(at[47], at[64]); 
-   COMBA_STORE(C->dp[47]);
-   /* 48 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[112]);    MULADD(at[1], at[111]);    MULADD(at[2], at[110]);    MULADD(at[3], at[109]);    MULADD(at[4], at[108]);    MULADD(at[5], at[107]);    MULADD(at[6], at[106]);    MULADD(at[7], at[105]);    MULADD(at[8], at[104]);    MULADD(at[9], at[103]);    MULADD(at[10], at[102]);    MULADD(at[11], at[101]);    MULADD(at[12], at[100]);    MULADD(at[13], at[99]);    MULADD(at[14], at[98]);    MULADD(at[15], at[97]);    MULADD(at[16], at[96]);    MULADD(at[17], at[95]);    MULADD(at[18], at[94]);    MULADD(at[19], at[93]);    MULADD(at[20], at[92]);    MULADD(at[21], at[91]);    MULADD(at[22], at[90]);    MULADD(at[23], at[89]);    MULADD(at[24], at[88]);    MULADD(at[25], at[87]);    MULADD(at[26], at[86]);    MULADD(at[27], at[85]);    MULADD(at[28], at[84]);    MULADD(at[29], at[83]);    MULADD(at[30], at[82]);    MULADD(at[31], at[81]);    MULADD(at[32], at[80]);    MULADD(at[33], at[79]);    MULADD(at[34], at[78]);    MULADD(at[35], at[77]);    MULADD(at[36], at[76]);    MULADD(at[37], at[75]);    MULADD(at[38], at[74]);    MULADD(at[39], at[73]);    MULADD(at[40], at[72]);    MULADD(at[41], at[71]);    MULADD(at[42], at[70]);    MULADD(at[43], at[69]);    MULADD(at[44], at[68]);    MULADD(at[45], at[67]);    MULADD(at[46], at[66]);    MULADD(at[47], at[65]);    MULADD(at[48], at[64]); 
-   COMBA_STORE(C->dp[48]);
-   /* 49 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[113]);    MULADD(at[1], at[112]);    MULADD(at[2], at[111]);    MULADD(at[3], at[110]);    MULADD(at[4], at[109]);    MULADD(at[5], at[108]);    MULADD(at[6], at[107]);    MULADD(at[7], at[106]);    MULADD(at[8], at[105]);    MULADD(at[9], at[104]);    MULADD(at[10], at[103]);    MULADD(at[11], at[102]);    MULADD(at[12], at[101]);    MULADD(at[13], at[100]);    MULADD(at[14], at[99]);    MULADD(at[15], at[98]);    MULADD(at[16], at[97]);    MULADD(at[17], at[96]);    MULADD(at[18], at[95]);    MULADD(at[19], at[94]);    MULADD(at[20], at[93]);    MULADD(at[21], at[92]);    MULADD(at[22], at[91]);    MULADD(at[23], at[90]);    MULADD(at[24], at[89]);    MULADD(at[25], at[88]);    MULADD(at[26], at[87]);    MULADD(at[27], at[86]);    MULADD(at[28], at[85]);    MULADD(at[29], at[84]);    MULADD(at[30], at[83]);    MULADD(at[31], at[82]);    MULADD(at[32], at[81]);    MULADD(at[33], at[80]);    MULADD(at[34], at[79]);    MULADD(at[35], at[78]);    MULADD(at[36], at[77]);    MULADD(at[37], at[76]);    MULADD(at[38], at[75]);    MULADD(at[39], at[74]);    MULADD(at[40], at[73]);    MULADD(at[41], at[72]);    MULADD(at[42], at[71]);    MULADD(at[43], at[70]);    MULADD(at[44], at[69]);    MULADD(at[45], at[68]);    MULADD(at[46], at[67]);    MULADD(at[47], at[66]);    MULADD(at[48], at[65]);    MULADD(at[49], at[64]); 
-   COMBA_STORE(C->dp[49]);
-   /* 50 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[114]);    MULADD(at[1], at[113]);    MULADD(at[2], at[112]);    MULADD(at[3], at[111]);    MULADD(at[4], at[110]);    MULADD(at[5], at[109]);    MULADD(at[6], at[108]);    MULADD(at[7], at[107]);    MULADD(at[8], at[106]);    MULADD(at[9], at[105]);    MULADD(at[10], at[104]);    MULADD(at[11], at[103]);    MULADD(at[12], at[102]);    MULADD(at[13], at[101]);    MULADD(at[14], at[100]);    MULADD(at[15], at[99]);    MULADD(at[16], at[98]);    MULADD(at[17], at[97]);    MULADD(at[18], at[96]);    MULADD(at[19], at[95]);    MULADD(at[20], at[94]);    MULADD(at[21], at[93]);    MULADD(at[22], at[92]);    MULADD(at[23], at[91]);    MULADD(at[24], at[90]);    MULADD(at[25], at[89]);    MULADD(at[26], at[88]);    MULADD(at[27], at[87]);    MULADD(at[28], at[86]);    MULADD(at[29], at[85]);    MULADD(at[30], at[84]);    MULADD(at[31], at[83]);    MULADD(at[32], at[82]);    MULADD(at[33], at[81]);    MULADD(at[34], at[80]);    MULADD(at[35], at[79]);    MULADD(at[36], at[78]);    MULADD(at[37], at[77]);    MULADD(at[38], at[76]);    MULADD(at[39], at[75]);    MULADD(at[40], at[74]);    MULADD(at[41], at[73]);    MULADD(at[42], at[72]);    MULADD(at[43], at[71]);    MULADD(at[44], at[70]);    MULADD(at[45], at[69]);    MULADD(at[46], at[68]);    MULADD(at[47], at[67]);    MULADD(at[48], at[66]);    MULADD(at[49], at[65]);    MULADD(at[50], at[64]); 
-   COMBA_STORE(C->dp[50]);
-   /* 51 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[115]);    MULADD(at[1], at[114]);    MULADD(at[2], at[113]);    MULADD(at[3], at[112]);    MULADD(at[4], at[111]);    MULADD(at[5], at[110]);    MULADD(at[6], at[109]);    MULADD(at[7], at[108]);    MULADD(at[8], at[107]);    MULADD(at[9], at[106]);    MULADD(at[10], at[105]);    MULADD(at[11], at[104]);    MULADD(at[12], at[103]);    MULADD(at[13], at[102]);    MULADD(at[14], at[101]);    MULADD(at[15], at[100]);    MULADD(at[16], at[99]);    MULADD(at[17], at[98]);    MULADD(at[18], at[97]);    MULADD(at[19], at[96]);    MULADD(at[20], at[95]);    MULADD(at[21], at[94]);    MULADD(at[22], at[93]);    MULADD(at[23], at[92]);    MULADD(at[24], at[91]);    MULADD(at[25], at[90]);    MULADD(at[26], at[89]);    MULADD(at[27], at[88]);    MULADD(at[28], at[87]);    MULADD(at[29], at[86]);    MULADD(at[30], at[85]);    MULADD(at[31], at[84]);    MULADD(at[32], at[83]);    MULADD(at[33], at[82]);    MULADD(at[34], at[81]);    MULADD(at[35], at[80]);    MULADD(at[36], at[79]);    MULADD(at[37], at[78]);    MULADD(at[38], at[77]);    MULADD(at[39], at[76]);    MULADD(at[40], at[75]);    MULADD(at[41], at[74]);    MULADD(at[42], at[73]);    MULADD(at[43], at[72]);    MULADD(at[44], at[71]);    MULADD(at[45], at[70]);    MULADD(at[46], at[69]);    MULADD(at[47], at[68]);    MULADD(at[48], at[67]);    MULADD(at[49], at[66]);    MULADD(at[50], at[65]);    MULADD(at[51], at[64]); 
-   COMBA_STORE(C->dp[51]);
-   /* 52 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[116]);    MULADD(at[1], at[115]);    MULADD(at[2], at[114]);    MULADD(at[3], at[113]);    MULADD(at[4], at[112]);    MULADD(at[5], at[111]);    MULADD(at[6], at[110]);    MULADD(at[7], at[109]);    MULADD(at[8], at[108]);    MULADD(at[9], at[107]);    MULADD(at[10], at[106]);    MULADD(at[11], at[105]);    MULADD(at[12], at[104]);    MULADD(at[13], at[103]);    MULADD(at[14], at[102]);    MULADD(at[15], at[101]);    MULADD(at[16], at[100]);    MULADD(at[17], at[99]);    MULADD(at[18], at[98]);    MULADD(at[19], at[97]);    MULADD(at[20], at[96]);    MULADD(at[21], at[95]);    MULADD(at[22], at[94]);    MULADD(at[23], at[93]);    MULADD(at[24], at[92]);    MULADD(at[25], at[91]);    MULADD(at[26], at[90]);    MULADD(at[27], at[89]);    MULADD(at[28], at[88]);    MULADD(at[29], at[87]);    MULADD(at[30], at[86]);    MULADD(at[31], at[85]);    MULADD(at[32], at[84]);    MULADD(at[33], at[83]);    MULADD(at[34], at[82]);    MULADD(at[35], at[81]);    MULADD(at[36], at[80]);    MULADD(at[37], at[79]);    MULADD(at[38], at[78]);    MULADD(at[39], at[77]);    MULADD(at[40], at[76]);    MULADD(at[41], at[75]);    MULADD(at[42], at[74]);    MULADD(at[43], at[73]);    MULADD(at[44], at[72]);    MULADD(at[45], at[71]);    MULADD(at[46], at[70]);    MULADD(at[47], at[69]);    MULADD(at[48], at[68]);    MULADD(at[49], at[67]);    MULADD(at[50], at[66]);    MULADD(at[51], at[65]);    MULADD(at[52], at[64]); 
-   COMBA_STORE(C->dp[52]);
-   /* 53 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[117]);    MULADD(at[1], at[116]);    MULADD(at[2], at[115]);    MULADD(at[3], at[114]);    MULADD(at[4], at[113]);    MULADD(at[5], at[112]);    MULADD(at[6], at[111]);    MULADD(at[7], at[110]);    MULADD(at[8], at[109]);    MULADD(at[9], at[108]);    MULADD(at[10], at[107]);    MULADD(at[11], at[106]);    MULADD(at[12], at[105]);    MULADD(at[13], at[104]);    MULADD(at[14], at[103]);    MULADD(at[15], at[102]);    MULADD(at[16], at[101]);    MULADD(at[17], at[100]);    MULADD(at[18], at[99]);    MULADD(at[19], at[98]);    MULADD(at[20], at[97]);    MULADD(at[21], at[96]);    MULADD(at[22], at[95]);    MULADD(at[23], at[94]);    MULADD(at[24], at[93]);    MULADD(at[25], at[92]);    MULADD(at[26], at[91]);    MULADD(at[27], at[90]);    MULADD(at[28], at[89]);    MULADD(at[29], at[88]);    MULADD(at[30], at[87]);    MULADD(at[31], at[86]);    MULADD(at[32], at[85]);    MULADD(at[33], at[84]);    MULADD(at[34], at[83]);    MULADD(at[35], at[82]);    MULADD(at[36], at[81]);    MULADD(at[37], at[80]);    MULADD(at[38], at[79]);    MULADD(at[39], at[78]);    MULADD(at[40], at[77]);    MULADD(at[41], at[76]);    MULADD(at[42], at[75]);    MULADD(at[43], at[74]);    MULADD(at[44], at[73]);    MULADD(at[45], at[72]);    MULADD(at[46], at[71]);    MULADD(at[47], at[70]);    MULADD(at[48], at[69]);    MULADD(at[49], at[68]);    MULADD(at[50], at[67]);    MULADD(at[51], at[66]);    MULADD(at[52], at[65]);    MULADD(at[53], at[64]); 
-   COMBA_STORE(C->dp[53]);
-   /* 54 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[118]);    MULADD(at[1], at[117]);    MULADD(at[2], at[116]);    MULADD(at[3], at[115]);    MULADD(at[4], at[114]);    MULADD(at[5], at[113]);    MULADD(at[6], at[112]);    MULADD(at[7], at[111]);    MULADD(at[8], at[110]);    MULADD(at[9], at[109]);    MULADD(at[10], at[108]);    MULADD(at[11], at[107]);    MULADD(at[12], at[106]);    MULADD(at[13], at[105]);    MULADD(at[14], at[104]);    MULADD(at[15], at[103]);    MULADD(at[16], at[102]);    MULADD(at[17], at[101]);    MULADD(at[18], at[100]);    MULADD(at[19], at[99]);    MULADD(at[20], at[98]);    MULADD(at[21], at[97]);    MULADD(at[22], at[96]);    MULADD(at[23], at[95]);    MULADD(at[24], at[94]);    MULADD(at[25], at[93]);    MULADD(at[26], at[92]);    MULADD(at[27], at[91]);    MULADD(at[28], at[90]);    MULADD(at[29], at[89]);    MULADD(at[30], at[88]);    MULADD(at[31], at[87]);    MULADD(at[32], at[86]);    MULADD(at[33], at[85]);    MULADD(at[34], at[84]);    MULADD(at[35], at[83]);    MULADD(at[36], at[82]);    MULADD(at[37], at[81]);    MULADD(at[38], at[80]);    MULADD(at[39], at[79]);    MULADD(at[40], at[78]);    MULADD(at[41], at[77]);    MULADD(at[42], at[76]);    MULADD(at[43], at[75]);    MULADD(at[44], at[74]);    MULADD(at[45], at[73]);    MULADD(at[46], at[72]);    MULADD(at[47], at[71]);    MULADD(at[48], at[70]);    MULADD(at[49], at[69]);    MULADD(at[50], at[68]);    MULADD(at[51], at[67]);    MULADD(at[52], at[66]);    MULADD(at[53], at[65]);    MULADD(at[54], at[64]); 
-   COMBA_STORE(C->dp[54]);
-   /* 55 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[119]);    MULADD(at[1], at[118]);    MULADD(at[2], at[117]);    MULADD(at[3], at[116]);    MULADD(at[4], at[115]);    MULADD(at[5], at[114]);    MULADD(at[6], at[113]);    MULADD(at[7], at[112]);    MULADD(at[8], at[111]);    MULADD(at[9], at[110]);    MULADD(at[10], at[109]);    MULADD(at[11], at[108]);    MULADD(at[12], at[107]);    MULADD(at[13], at[106]);    MULADD(at[14], at[105]);    MULADD(at[15], at[104]);    MULADD(at[16], at[103]);    MULADD(at[17], at[102]);    MULADD(at[18], at[101]);    MULADD(at[19], at[100]);    MULADD(at[20], at[99]);    MULADD(at[21], at[98]);    MULADD(at[22], at[97]);    MULADD(at[23], at[96]);    MULADD(at[24], at[95]);    MULADD(at[25], at[94]);    MULADD(at[26], at[93]);    MULADD(at[27], at[92]);    MULADD(at[28], at[91]);    MULADD(at[29], at[90]);    MULADD(at[30], at[89]);    MULADD(at[31], at[88]);    MULADD(at[32], at[87]);    MULADD(at[33], at[86]);    MULADD(at[34], at[85]);    MULADD(at[35], at[84]);    MULADD(at[36], at[83]);    MULADD(at[37], at[82]);    MULADD(at[38], at[81]);    MULADD(at[39], at[80]);    MULADD(at[40], at[79]);    MULADD(at[41], at[78]);    MULADD(at[42], at[77]);    MULADD(at[43], at[76]);    MULADD(at[44], at[75]);    MULADD(at[45], at[74]);    MULADD(at[46], at[73]);    MULADD(at[47], at[72]);    MULADD(at[48], at[71]);    MULADD(at[49], at[70]);    MULADD(at[50], at[69]);    MULADD(at[51], at[68]);    MULADD(at[52], at[67]);    MULADD(at[53], at[66]);    MULADD(at[54], at[65]);    MULADD(at[55], at[64]); 
-   COMBA_STORE(C->dp[55]);
-   /* 56 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[120]);    MULADD(at[1], at[119]);    MULADD(at[2], at[118]);    MULADD(at[3], at[117]);    MULADD(at[4], at[116]);    MULADD(at[5], at[115]);    MULADD(at[6], at[114]);    MULADD(at[7], at[113]);    MULADD(at[8], at[112]);    MULADD(at[9], at[111]);    MULADD(at[10], at[110]);    MULADD(at[11], at[109]);    MULADD(at[12], at[108]);    MULADD(at[13], at[107]);    MULADD(at[14], at[106]);    MULADD(at[15], at[105]);    MULADD(at[16], at[104]);    MULADD(at[17], at[103]);    MULADD(at[18], at[102]);    MULADD(at[19], at[101]);    MULADD(at[20], at[100]);    MULADD(at[21], at[99]);    MULADD(at[22], at[98]);    MULADD(at[23], at[97]);    MULADD(at[24], at[96]);    MULADD(at[25], at[95]);    MULADD(at[26], at[94]);    MULADD(at[27], at[93]);    MULADD(at[28], at[92]);    MULADD(at[29], at[91]);    MULADD(at[30], at[90]);    MULADD(at[31], at[89]);    MULADD(at[32], at[88]);    MULADD(at[33], at[87]);    MULADD(at[34], at[86]);    MULADD(at[35], at[85]);    MULADD(at[36], at[84]);    MULADD(at[37], at[83]);    MULADD(at[38], at[82]);    MULADD(at[39], at[81]);    MULADD(at[40], at[80]);    MULADD(at[41], at[79]);    MULADD(at[42], at[78]);    MULADD(at[43], at[77]);    MULADD(at[44], at[76]);    MULADD(at[45], at[75]);    MULADD(at[46], at[74]);    MULADD(at[47], at[73]);    MULADD(at[48], at[72]);    MULADD(at[49], at[71]);    MULADD(at[50], at[70]);    MULADD(at[51], at[69]);    MULADD(at[52], at[68]);    MULADD(at[53], at[67]);    MULADD(at[54], at[66]);    MULADD(at[55], at[65]);    MULADD(at[56], at[64]); 
-   COMBA_STORE(C->dp[56]);
-   /* 57 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[121]);    MULADD(at[1], at[120]);    MULADD(at[2], at[119]);    MULADD(at[3], at[118]);    MULADD(at[4], at[117]);    MULADD(at[5], at[116]);    MULADD(at[6], at[115]);    MULADD(at[7], at[114]);    MULADD(at[8], at[113]);    MULADD(at[9], at[112]);    MULADD(at[10], at[111]);    MULADD(at[11], at[110]);    MULADD(at[12], at[109]);    MULADD(at[13], at[108]);    MULADD(at[14], at[107]);    MULADD(at[15], at[106]);    MULADD(at[16], at[105]);    MULADD(at[17], at[104]);    MULADD(at[18], at[103]);    MULADD(at[19], at[102]);    MULADD(at[20], at[101]);    MULADD(at[21], at[100]);    MULADD(at[22], at[99]);    MULADD(at[23], at[98]);    MULADD(at[24], at[97]);    MULADD(at[25], at[96]);    MULADD(at[26], at[95]);    MULADD(at[27], at[94]);    MULADD(at[28], at[93]);    MULADD(at[29], at[92]);    MULADD(at[30], at[91]);    MULADD(at[31], at[90]);    MULADD(at[32], at[89]);    MULADD(at[33], at[88]);    MULADD(at[34], at[87]);    MULADD(at[35], at[86]);    MULADD(at[36], at[85]);    MULADD(at[37], at[84]);    MULADD(at[38], at[83]);    MULADD(at[39], at[82]);    MULADD(at[40], at[81]);    MULADD(at[41], at[80]);    MULADD(at[42], at[79]);    MULADD(at[43], at[78]);    MULADD(at[44], at[77]);    MULADD(at[45], at[76]);    MULADD(at[46], at[75]);    MULADD(at[47], at[74]);    MULADD(at[48], at[73]);    MULADD(at[49], at[72]);    MULADD(at[50], at[71]);    MULADD(at[51], at[70]);    MULADD(at[52], at[69]);    MULADD(at[53], at[68]);    MULADD(at[54], at[67]);    MULADD(at[55], at[66]);    MULADD(at[56], at[65]);    MULADD(at[57], at[64]); 
-   COMBA_STORE(C->dp[57]);
-   /* 58 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[122]);    MULADD(at[1], at[121]);    MULADD(at[2], at[120]);    MULADD(at[3], at[119]);    MULADD(at[4], at[118]);    MULADD(at[5], at[117]);    MULADD(at[6], at[116]);    MULADD(at[7], at[115]);    MULADD(at[8], at[114]);    MULADD(at[9], at[113]);    MULADD(at[10], at[112]);    MULADD(at[11], at[111]);    MULADD(at[12], at[110]);    MULADD(at[13], at[109]);    MULADD(at[14], at[108]);    MULADD(at[15], at[107]);    MULADD(at[16], at[106]);    MULADD(at[17], at[105]);    MULADD(at[18], at[104]);    MULADD(at[19], at[103]);    MULADD(at[20], at[102]);    MULADD(at[21], at[101]);    MULADD(at[22], at[100]);    MULADD(at[23], at[99]);    MULADD(at[24], at[98]);    MULADD(at[25], at[97]);    MULADD(at[26], at[96]);    MULADD(at[27], at[95]);    MULADD(at[28], at[94]);    MULADD(at[29], at[93]);    MULADD(at[30], at[92]);    MULADD(at[31], at[91]);    MULADD(at[32], at[90]);    MULADD(at[33], at[89]);    MULADD(at[34], at[88]);    MULADD(at[35], at[87]);    MULADD(at[36], at[86]);    MULADD(at[37], at[85]);    MULADD(at[38], at[84]);    MULADD(at[39], at[83]);    MULADD(at[40], at[82]);    MULADD(at[41], at[81]);    MULADD(at[42], at[80]);    MULADD(at[43], at[79]);    MULADD(at[44], at[78]);    MULADD(at[45], at[77]);    MULADD(at[46], at[76]);    MULADD(at[47], at[75]);    MULADD(at[48], at[74]);    MULADD(at[49], at[73]);    MULADD(at[50], at[72]);    MULADD(at[51], at[71]);    MULADD(at[52], at[70]);    MULADD(at[53], at[69]);    MULADD(at[54], at[68]);    MULADD(at[55], at[67]);    MULADD(at[56], at[66]);    MULADD(at[57], at[65]);    MULADD(at[58], at[64]); 
-   COMBA_STORE(C->dp[58]);
-   /* 59 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[123]);    MULADD(at[1], at[122]);    MULADD(at[2], at[121]);    MULADD(at[3], at[120]);    MULADD(at[4], at[119]);    MULADD(at[5], at[118]);    MULADD(at[6], at[117]);    MULADD(at[7], at[116]);    MULADD(at[8], at[115]);    MULADD(at[9], at[114]);    MULADD(at[10], at[113]);    MULADD(at[11], at[112]);    MULADD(at[12], at[111]);    MULADD(at[13], at[110]);    MULADD(at[14], at[109]);    MULADD(at[15], at[108]);    MULADD(at[16], at[107]);    MULADD(at[17], at[106]);    MULADD(at[18], at[105]);    MULADD(at[19], at[104]);    MULADD(at[20], at[103]);    MULADD(at[21], at[102]);    MULADD(at[22], at[101]);    MULADD(at[23], at[100]);    MULADD(at[24], at[99]);    MULADD(at[25], at[98]);    MULADD(at[26], at[97]);    MULADD(at[27], at[96]);    MULADD(at[28], at[95]);    MULADD(at[29], at[94]);    MULADD(at[30], at[93]);    MULADD(at[31], at[92]);    MULADD(at[32], at[91]);    MULADD(at[33], at[90]);    MULADD(at[34], at[89]);    MULADD(at[35], at[88]);    MULADD(at[36], at[87]);    MULADD(at[37], at[86]);    MULADD(at[38], at[85]);    MULADD(at[39], at[84]);    MULADD(at[40], at[83]);    MULADD(at[41], at[82]);    MULADD(at[42], at[81]);    MULADD(at[43], at[80]);    MULADD(at[44], at[79]);    MULADD(at[45], at[78]);    MULADD(at[46], at[77]);    MULADD(at[47], at[76]);    MULADD(at[48], at[75]);    MULADD(at[49], at[74]);    MULADD(at[50], at[73]);    MULADD(at[51], at[72]);    MULADD(at[52], at[71]);    MULADD(at[53], at[70]);    MULADD(at[54], at[69]);    MULADD(at[55], at[68]);    MULADD(at[56], at[67]);    MULADD(at[57], at[66]);    MULADD(at[58], at[65]);    MULADD(at[59], at[64]); 
-   COMBA_STORE(C->dp[59]);
-   /* 60 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[124]);    MULADD(at[1], at[123]);    MULADD(at[2], at[122]);    MULADD(at[3], at[121]);    MULADD(at[4], at[120]);    MULADD(at[5], at[119]);    MULADD(at[6], at[118]);    MULADD(at[7], at[117]);    MULADD(at[8], at[116]);    MULADD(at[9], at[115]);    MULADD(at[10], at[114]);    MULADD(at[11], at[113]);    MULADD(at[12], at[112]);    MULADD(at[13], at[111]);    MULADD(at[14], at[110]);    MULADD(at[15], at[109]);    MULADD(at[16], at[108]);    MULADD(at[17], at[107]);    MULADD(at[18], at[106]);    MULADD(at[19], at[105]);    MULADD(at[20], at[104]);    MULADD(at[21], at[103]);    MULADD(at[22], at[102]);    MULADD(at[23], at[101]);    MULADD(at[24], at[100]);    MULADD(at[25], at[99]);    MULADD(at[26], at[98]);    MULADD(at[27], at[97]);    MULADD(at[28], at[96]);    MULADD(at[29], at[95]);    MULADD(at[30], at[94]);    MULADD(at[31], at[93]);    MULADD(at[32], at[92]);    MULADD(at[33], at[91]);    MULADD(at[34], at[90]);    MULADD(at[35], at[89]);    MULADD(at[36], at[88]);    MULADD(at[37], at[87]);    MULADD(at[38], at[86]);    MULADD(at[39], at[85]);    MULADD(at[40], at[84]);    MULADD(at[41], at[83]);    MULADD(at[42], at[82]);    MULADD(at[43], at[81]);    MULADD(at[44], at[80]);    MULADD(at[45], at[79]);    MULADD(at[46], at[78]);    MULADD(at[47], at[77]);    MULADD(at[48], at[76]);    MULADD(at[49], at[75]);    MULADD(at[50], at[74]);    MULADD(at[51], at[73]);    MULADD(at[52], at[72]);    MULADD(at[53], at[71]);    MULADD(at[54], at[70]);    MULADD(at[55], at[69]);    MULADD(at[56], at[68]);    MULADD(at[57], at[67]);    MULADD(at[58], at[66]);    MULADD(at[59], at[65]);    MULADD(at[60], at[64]); 
-   COMBA_STORE(C->dp[60]);
-   /* 61 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[125]);    MULADD(at[1], at[124]);    MULADD(at[2], at[123]);    MULADD(at[3], at[122]);    MULADD(at[4], at[121]);    MULADD(at[5], at[120]);    MULADD(at[6], at[119]);    MULADD(at[7], at[118]);    MULADD(at[8], at[117]);    MULADD(at[9], at[116]);    MULADD(at[10], at[115]);    MULADD(at[11], at[114]);    MULADD(at[12], at[113]);    MULADD(at[13], at[112]);    MULADD(at[14], at[111]);    MULADD(at[15], at[110]);    MULADD(at[16], at[109]);    MULADD(at[17], at[108]);    MULADD(at[18], at[107]);    MULADD(at[19], at[106]);    MULADD(at[20], at[105]);    MULADD(at[21], at[104]);    MULADD(at[22], at[103]);    MULADD(at[23], at[102]);    MULADD(at[24], at[101]);    MULADD(at[25], at[100]);    MULADD(at[26], at[99]);    MULADD(at[27], at[98]);    MULADD(at[28], at[97]);    MULADD(at[29], at[96]);    MULADD(at[30], at[95]);    MULADD(at[31], at[94]);    MULADD(at[32], at[93]);    MULADD(at[33], at[92]);    MULADD(at[34], at[91]);    MULADD(at[35], at[90]);    MULADD(at[36], at[89]);    MULADD(at[37], at[88]);    MULADD(at[38], at[87]);    MULADD(at[39], at[86]);    MULADD(at[40], at[85]);    MULADD(at[41], at[84]);    MULADD(at[42], at[83]);    MULADD(at[43], at[82]);    MULADD(at[44], at[81]);    MULADD(at[45], at[80]);    MULADD(at[46], at[79]);    MULADD(at[47], at[78]);    MULADD(at[48], at[77]);    MULADD(at[49], at[76]);    MULADD(at[50], at[75]);    MULADD(at[51], at[74]);    MULADD(at[52], at[73]);    MULADD(at[53], at[72]);    MULADD(at[54], at[71]);    MULADD(at[55], at[70]);    MULADD(at[56], at[69]);    MULADD(at[57], at[68]);    MULADD(at[58], at[67]);    MULADD(at[59], at[66]);    MULADD(at[60], at[65]);    MULADD(at[61], at[64]); 
-   COMBA_STORE(C->dp[61]);
-   /* 62 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[126]);    MULADD(at[1], at[125]);    MULADD(at[2], at[124]);    MULADD(at[3], at[123]);    MULADD(at[4], at[122]);    MULADD(at[5], at[121]);    MULADD(at[6], at[120]);    MULADD(at[7], at[119]);    MULADD(at[8], at[118]);    MULADD(at[9], at[117]);    MULADD(at[10], at[116]);    MULADD(at[11], at[115]);    MULADD(at[12], at[114]);    MULADD(at[13], at[113]);    MULADD(at[14], at[112]);    MULADD(at[15], at[111]);    MULADD(at[16], at[110]);    MULADD(at[17], at[109]);    MULADD(at[18], at[108]);    MULADD(at[19], at[107]);    MULADD(at[20], at[106]);    MULADD(at[21], at[105]);    MULADD(at[22], at[104]);    MULADD(at[23], at[103]);    MULADD(at[24], at[102]);    MULADD(at[25], at[101]);    MULADD(at[26], at[100]);    MULADD(at[27], at[99]);    MULADD(at[28], at[98]);    MULADD(at[29], at[97]);    MULADD(at[30], at[96]);    MULADD(at[31], at[95]);    MULADD(at[32], at[94]);    MULADD(at[33], at[93]);    MULADD(at[34], at[92]);    MULADD(at[35], at[91]);    MULADD(at[36], at[90]);    MULADD(at[37], at[89]);    MULADD(at[38], at[88]);    MULADD(at[39], at[87]);    MULADD(at[40], at[86]);    MULADD(at[41], at[85]);    MULADD(at[42], at[84]);    MULADD(at[43], at[83]);    MULADD(at[44], at[82]);    MULADD(at[45], at[81]);    MULADD(at[46], at[80]);    MULADD(at[47], at[79]);    MULADD(at[48], at[78]);    MULADD(at[49], at[77]);    MULADD(at[50], at[76]);    MULADD(at[51], at[75]);    MULADD(at[52], at[74]);    MULADD(at[53], at[73]);    MULADD(at[54], at[72]);    MULADD(at[55], at[71]);    MULADD(at[56], at[70]);    MULADD(at[57], at[69]);    MULADD(at[58], at[68]);    MULADD(at[59], at[67]);    MULADD(at[60], at[66]);    MULADD(at[61], at[65]);    MULADD(at[62], at[64]); 
-   COMBA_STORE(C->dp[62]);
-   /* 63 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[127]);    MULADD(at[1], at[126]);    MULADD(at[2], at[125]);    MULADD(at[3], at[124]);    MULADD(at[4], at[123]);    MULADD(at[5], at[122]);    MULADD(at[6], at[121]);    MULADD(at[7], at[120]);    MULADD(at[8], at[119]);    MULADD(at[9], at[118]);    MULADD(at[10], at[117]);    MULADD(at[11], at[116]);    MULADD(at[12], at[115]);    MULADD(at[13], at[114]);    MULADD(at[14], at[113]);    MULADD(at[15], at[112]);    MULADD(at[16], at[111]);    MULADD(at[17], at[110]);    MULADD(at[18], at[109]);    MULADD(at[19], at[108]);    MULADD(at[20], at[107]);    MULADD(at[21], at[106]);    MULADD(at[22], at[105]);    MULADD(at[23], at[104]);    MULADD(at[24], at[103]);    MULADD(at[25], at[102]);    MULADD(at[26], at[101]);    MULADD(at[27], at[100]);    MULADD(at[28], at[99]);    MULADD(at[29], at[98]);    MULADD(at[30], at[97]);    MULADD(at[31], at[96]);    MULADD(at[32], at[95]);    MULADD(at[33], at[94]);    MULADD(at[34], at[93]);    MULADD(at[35], at[92]);    MULADD(at[36], at[91]);    MULADD(at[37], at[90]);    MULADD(at[38], at[89]);    MULADD(at[39], at[88]);    MULADD(at[40], at[87]);    MULADD(at[41], at[86]);    MULADD(at[42], at[85]);    MULADD(at[43], at[84]);    MULADD(at[44], at[83]);    MULADD(at[45], at[82]);    MULADD(at[46], at[81]);    MULADD(at[47], at[80]);    MULADD(at[48], at[79]);    MULADD(at[49], at[78]);    MULADD(at[50], at[77]);    MULADD(at[51], at[76]);    MULADD(at[52], at[75]);    MULADD(at[53], at[74]);    MULADD(at[54], at[73]);    MULADD(at[55], at[72]);    MULADD(at[56], at[71]);    MULADD(at[57], at[70]);    MULADD(at[58], at[69]);    MULADD(at[59], at[68]);    MULADD(at[60], at[67]);    MULADD(at[61], at[66]);    MULADD(at[62], at[65]);    MULADD(at[63], at[64]); 
-   COMBA_STORE(C->dp[63]);
-   /* 64 */
-   COMBA_FORWARD;
-   MULADD(at[1], at[127]);    MULADD(at[2], at[126]);    MULADD(at[3], at[125]);    MULADD(at[4], at[124]);    MULADD(at[5], at[123]);    MULADD(at[6], at[122]);    MULADD(at[7], at[121]);    MULADD(at[8], at[120]);    MULADD(at[9], at[119]);    MULADD(at[10], at[118]);    MULADD(at[11], at[117]);    MULADD(at[12], at[116]);    MULADD(at[13], at[115]);    MULADD(at[14], at[114]);    MULADD(at[15], at[113]);    MULADD(at[16], at[112]);    MULADD(at[17], at[111]);    MULADD(at[18], at[110]);    MULADD(at[19], at[109]);    MULADD(at[20], at[108]);    MULADD(at[21], at[107]);    MULADD(at[22], at[106]);    MULADD(at[23], at[105]);    MULADD(at[24], at[104]);    MULADD(at[25], at[103]);    MULADD(at[26], at[102]);    MULADD(at[27], at[101]);    MULADD(at[28], at[100]);    MULADD(at[29], at[99]);    MULADD(at[30], at[98]);    MULADD(at[31], at[97]);    MULADD(at[32], at[96]);    MULADD(at[33], at[95]);    MULADD(at[34], at[94]);    MULADD(at[35], at[93]);    MULADD(at[36], at[92]);    MULADD(at[37], at[91]);    MULADD(at[38], at[90]);    MULADD(at[39], at[89]);    MULADD(at[40], at[88]);    MULADD(at[41], at[87]);    MULADD(at[42], at[86]);    MULADD(at[43], at[85]);    MULADD(at[44], at[84]);    MULADD(at[45], at[83]);    MULADD(at[46], at[82]);    MULADD(at[47], at[81]);    MULADD(at[48], at[80]);    MULADD(at[49], at[79]);    MULADD(at[50], at[78]);    MULADD(at[51], at[77]);    MULADD(at[52], at[76]);    MULADD(at[53], at[75]);    MULADD(at[54], at[74]);    MULADD(at[55], at[73]);    MULADD(at[56], at[72]);    MULADD(at[57], at[71]);    MULADD(at[58], at[70]);    MULADD(at[59], at[69]);    MULADD(at[60], at[68]);    MULADD(at[61], at[67]);    MULADD(at[62], at[66]);    MULADD(at[63], at[65]); 
-   COMBA_STORE(C->dp[64]);
-   /* 65 */
-   COMBA_FORWARD;
-   MULADD(at[2], at[127]);    MULADD(at[3], at[126]);    MULADD(at[4], at[125]);    MULADD(at[5], at[124]);    MULADD(at[6], at[123]);    MULADD(at[7], at[122]);    MULADD(at[8], at[121]);    MULADD(at[9], at[120]);    MULADD(at[10], at[119]);    MULADD(at[11], at[118]);    MULADD(at[12], at[117]);    MULADD(at[13], at[116]);    MULADD(at[14], at[115]);    MULADD(at[15], at[114]);    MULADD(at[16], at[113]);    MULADD(at[17], at[112]);    MULADD(at[18], at[111]);    MULADD(at[19], at[110]);    MULADD(at[20], at[109]);    MULADD(at[21], at[108]);    MULADD(at[22], at[107]);    MULADD(at[23], at[106]);    MULADD(at[24], at[105]);    MULADD(at[25], at[104]);    MULADD(at[26], at[103]);    MULADD(at[27], at[102]);    MULADD(at[28], at[101]);    MULADD(at[29], at[100]);    MULADD(at[30], at[99]);    MULADD(at[31], at[98]);    MULADD(at[32], at[97]);    MULADD(at[33], at[96]);    MULADD(at[34], at[95]);    MULADD(at[35], at[94]);    MULADD(at[36], at[93]);    MULADD(at[37], at[92]);    MULADD(at[38], at[91]);    MULADD(at[39], at[90]);    MULADD(at[40], at[89]);    MULADD(at[41], at[88]);    MULADD(at[42], at[87]);    MULADD(at[43], at[86]);    MULADD(at[44], at[85]);    MULADD(at[45], at[84]);    MULADD(at[46], at[83]);    MULADD(at[47], at[82]);    MULADD(at[48], at[81]);    MULADD(at[49], at[80]);    MULADD(at[50], at[79]);    MULADD(at[51], at[78]);    MULADD(at[52], at[77]);    MULADD(at[53], at[76]);    MULADD(at[54], at[75]);    MULADD(at[55], at[74]);    MULADD(at[56], at[73]);    MULADD(at[57], at[72]);    MULADD(at[58], at[71]);    MULADD(at[59], at[70]);    MULADD(at[60], at[69]);    MULADD(at[61], at[68]);    MULADD(at[62], at[67]);    MULADD(at[63], at[66]); 
-   COMBA_STORE(C->dp[65]);
-   /* 66 */
-   COMBA_FORWARD;
-   MULADD(at[3], at[127]);    MULADD(at[4], at[126]);    MULADD(at[5], at[125]);    MULADD(at[6], at[124]);    MULADD(at[7], at[123]);    MULADD(at[8], at[122]);    MULADD(at[9], at[121]);    MULADD(at[10], at[120]);    MULADD(at[11], at[119]);    MULADD(at[12], at[118]);    MULADD(at[13], at[117]);    MULADD(at[14], at[116]);    MULADD(at[15], at[115]);    MULADD(at[16], at[114]);    MULADD(at[17], at[113]);    MULADD(at[18], at[112]);    MULADD(at[19], at[111]);    MULADD(at[20], at[110]);    MULADD(at[21], at[109]);    MULADD(at[22], at[108]);    MULADD(at[23], at[107]);    MULADD(at[24], at[106]);    MULADD(at[25], at[105]);    MULADD(at[26], at[104]);    MULADD(at[27], at[103]);    MULADD(at[28], at[102]);    MULADD(at[29], at[101]);    MULADD(at[30], at[100]);    MULADD(at[31], at[99]);    MULADD(at[32], at[98]);    MULADD(at[33], at[97]);    MULADD(at[34], at[96]);    MULADD(at[35], at[95]);    MULADD(at[36], at[94]);    MULADD(at[37], at[93]);    MULADD(at[38], at[92]);    MULADD(at[39], at[91]);    MULADD(at[40], at[90]);    MULADD(at[41], at[89]);    MULADD(at[42], at[88]);    MULADD(at[43], at[87]);    MULADD(at[44], at[86]);    MULADD(at[45], at[85]);    MULADD(at[46], at[84]);    MULADD(at[47], at[83]);    MULADD(at[48], at[82]);    MULADD(at[49], at[81]);    MULADD(at[50], at[80]);    MULADD(at[51], at[79]);    MULADD(at[52], at[78]);    MULADD(at[53], at[77]);    MULADD(at[54], at[76]);    MULADD(at[55], at[75]);    MULADD(at[56], at[74]);    MULADD(at[57], at[73]);    MULADD(at[58], at[72]);    MULADD(at[59], at[71]);    MULADD(at[60], at[70]);    MULADD(at[61], at[69]);    MULADD(at[62], at[68]);    MULADD(at[63], at[67]); 
-   COMBA_STORE(C->dp[66]);
-   /* 67 */
-   COMBA_FORWARD;
-   MULADD(at[4], at[127]);    MULADD(at[5], at[126]);    MULADD(at[6], at[125]);    MULADD(at[7], at[124]);    MULADD(at[8], at[123]);    MULADD(at[9], at[122]);    MULADD(at[10], at[121]);    MULADD(at[11], at[120]);    MULADD(at[12], at[119]);    MULADD(at[13], at[118]);    MULADD(at[14], at[117]);    MULADD(at[15], at[116]);    MULADD(at[16], at[115]);    MULADD(at[17], at[114]);    MULADD(at[18], at[113]);    MULADD(at[19], at[112]);    MULADD(at[20], at[111]);    MULADD(at[21], at[110]);    MULADD(at[22], at[109]);    MULADD(at[23], at[108]);    MULADD(at[24], at[107]);    MULADD(at[25], at[106]);    MULADD(at[26], at[105]);    MULADD(at[27], at[104]);    MULADD(at[28], at[103]);    MULADD(at[29], at[102]);    MULADD(at[30], at[101]);    MULADD(at[31], at[100]);    MULADD(at[32], at[99]);    MULADD(at[33], at[98]);    MULADD(at[34], at[97]);    MULADD(at[35], at[96]);    MULADD(at[36], at[95]);    MULADD(at[37], at[94]);    MULADD(at[38], at[93]);    MULADD(at[39], at[92]);    MULADD(at[40], at[91]);    MULADD(at[41], at[90]);    MULADD(at[42], at[89]);    MULADD(at[43], at[88]);    MULADD(at[44], at[87]);    MULADD(at[45], at[86]);    MULADD(at[46], at[85]);    MULADD(at[47], at[84]);    MULADD(at[48], at[83]);    MULADD(at[49], at[82]);    MULADD(at[50], at[81]);    MULADD(at[51], at[80]);    MULADD(at[52], at[79]);    MULADD(at[53], at[78]);    MULADD(at[54], at[77]);    MULADD(at[55], at[76]);    MULADD(at[56], at[75]);    MULADD(at[57], at[74]);    MULADD(at[58], at[73]);    MULADD(at[59], at[72]);    MULADD(at[60], at[71]);    MULADD(at[61], at[70]);    MULADD(at[62], at[69]);    MULADD(at[63], at[68]); 
-   COMBA_STORE(C->dp[67]);
-   /* 68 */
-   COMBA_FORWARD;
-   MULADD(at[5], at[127]);    MULADD(at[6], at[126]);    MULADD(at[7], at[125]);    MULADD(at[8], at[124]);    MULADD(at[9], at[123]);    MULADD(at[10], at[122]);    MULADD(at[11], at[121]);    MULADD(at[12], at[120]);    MULADD(at[13], at[119]);    MULADD(at[14], at[118]);    MULADD(at[15], at[117]);    MULADD(at[16], at[116]);    MULADD(at[17], at[115]);    MULADD(at[18], at[114]);    MULADD(at[19], at[113]);    MULADD(at[20], at[112]);    MULADD(at[21], at[111]);    MULADD(at[22], at[110]);    MULADD(at[23], at[109]);    MULADD(at[24], at[108]);    MULADD(at[25], at[107]);    MULADD(at[26], at[106]);    MULADD(at[27], at[105]);    MULADD(at[28], at[104]);    MULADD(at[29], at[103]);    MULADD(at[30], at[102]);    MULADD(at[31], at[101]);    MULADD(at[32], at[100]);    MULADD(at[33], at[99]);    MULADD(at[34], at[98]);    MULADD(at[35], at[97]);    MULADD(at[36], at[96]);    MULADD(at[37], at[95]);    MULADD(at[38], at[94]);    MULADD(at[39], at[93]);    MULADD(at[40], at[92]);    MULADD(at[41], at[91]);    MULADD(at[42], at[90]);    MULADD(at[43], at[89]);    MULADD(at[44], at[88]);    MULADD(at[45], at[87]);    MULADD(at[46], at[86]);    MULADD(at[47], at[85]);    MULADD(at[48], at[84]);    MULADD(at[49], at[83]);    MULADD(at[50], at[82]);    MULADD(at[51], at[81]);    MULADD(at[52], at[80]);    MULADD(at[53], at[79]);    MULADD(at[54], at[78]);    MULADD(at[55], at[77]);    MULADD(at[56], at[76]);    MULADD(at[57], at[75]);    MULADD(at[58], at[74]);    MULADD(at[59], at[73]);    MULADD(at[60], at[72]);    MULADD(at[61], at[71]);    MULADD(at[62], at[70]);    MULADD(at[63], at[69]); 
-   COMBA_STORE(C->dp[68]);
-   /* 69 */
-   COMBA_FORWARD;
-   MULADD(at[6], at[127]);    MULADD(at[7], at[126]);    MULADD(at[8], at[125]);    MULADD(at[9], at[124]);    MULADD(at[10], at[123]);    MULADD(at[11], at[122]);    MULADD(at[12], at[121]);    MULADD(at[13], at[120]);    MULADD(at[14], at[119]);    MULADD(at[15], at[118]);    MULADD(at[16], at[117]);    MULADD(at[17], at[116]);    MULADD(at[18], at[115]);    MULADD(at[19], at[114]);    MULADD(at[20], at[113]);    MULADD(at[21], at[112]);    MULADD(at[22], at[111]);    MULADD(at[23], at[110]);    MULADD(at[24], at[109]);    MULADD(at[25], at[108]);    MULADD(at[26], at[107]);    MULADD(at[27], at[106]);    MULADD(at[28], at[105]);    MULADD(at[29], at[104]);    MULADD(at[30], at[103]);    MULADD(at[31], at[102]);    MULADD(at[32], at[101]);    MULADD(at[33], at[100]);    MULADD(at[34], at[99]);    MULADD(at[35], at[98]);    MULADD(at[36], at[97]);    MULADD(at[37], at[96]);    MULADD(at[38], at[95]);    MULADD(at[39], at[94]);    MULADD(at[40], at[93]);    MULADD(at[41], at[92]);    MULADD(at[42], at[91]);    MULADD(at[43], at[90]);    MULADD(at[44], at[89]);    MULADD(at[45], at[88]);    MULADD(at[46], at[87]);    MULADD(at[47], at[86]);    MULADD(at[48], at[85]);    MULADD(at[49], at[84]);    MULADD(at[50], at[83]);    MULADD(at[51], at[82]);    MULADD(at[52], at[81]);    MULADD(at[53], at[80]);    MULADD(at[54], at[79]);    MULADD(at[55], at[78]);    MULADD(at[56], at[77]);    MULADD(at[57], at[76]);    MULADD(at[58], at[75]);    MULADD(at[59], at[74]);    MULADD(at[60], at[73]);    MULADD(at[61], at[72]);    MULADD(at[62], at[71]);    MULADD(at[63], at[70]); 
-   COMBA_STORE(C->dp[69]);
-   /* 70 */
-   COMBA_FORWARD;
-   MULADD(at[7], at[127]);    MULADD(at[8], at[126]);    MULADD(at[9], at[125]);    MULADD(at[10], at[124]);    MULADD(at[11], at[123]);    MULADD(at[12], at[122]);    MULADD(at[13], at[121]);    MULADD(at[14], at[120]);    MULADD(at[15], at[119]);    MULADD(at[16], at[118]);    MULADD(at[17], at[117]);    MULADD(at[18], at[116]);    MULADD(at[19], at[115]);    MULADD(at[20], at[114]);    MULADD(at[21], at[113]);    MULADD(at[22], at[112]);    MULADD(at[23], at[111]);    MULADD(at[24], at[110]);    MULADD(at[25], at[109]);    MULADD(at[26], at[108]);    MULADD(at[27], at[107]);    MULADD(at[28], at[106]);    MULADD(at[29], at[105]);    MULADD(at[30], at[104]);    MULADD(at[31], at[103]);    MULADD(at[32], at[102]);    MULADD(at[33], at[101]);    MULADD(at[34], at[100]);    MULADD(at[35], at[99]);    MULADD(at[36], at[98]);    MULADD(at[37], at[97]);    MULADD(at[38], at[96]);    MULADD(at[39], at[95]);    MULADD(at[40], at[94]);    MULADD(at[41], at[93]);    MULADD(at[42], at[92]);    MULADD(at[43], at[91]);    MULADD(at[44], at[90]);    MULADD(at[45], at[89]);    MULADD(at[46], at[88]);    MULADD(at[47], at[87]);    MULADD(at[48], at[86]);    MULADD(at[49], at[85]);    MULADD(at[50], at[84]);    MULADD(at[51], at[83]);    MULADD(at[52], at[82]);    MULADD(at[53], at[81]);    MULADD(at[54], at[80]);    MULADD(at[55], at[79]);    MULADD(at[56], at[78]);    MULADD(at[57], at[77]);    MULADD(at[58], at[76]);    MULADD(at[59], at[75]);    MULADD(at[60], at[74]);    MULADD(at[61], at[73]);    MULADD(at[62], at[72]);    MULADD(at[63], at[71]); 
-   COMBA_STORE(C->dp[70]);
-   /* 71 */
-   COMBA_FORWARD;
-   MULADD(at[8], at[127]);    MULADD(at[9], at[126]);    MULADD(at[10], at[125]);    MULADD(at[11], at[124]);    MULADD(at[12], at[123]);    MULADD(at[13], at[122]);    MULADD(at[14], at[121]);    MULADD(at[15], at[120]);    MULADD(at[16], at[119]);    MULADD(at[17], at[118]);    MULADD(at[18], at[117]);    MULADD(at[19], at[116]);    MULADD(at[20], at[115]);    MULADD(at[21], at[114]);    MULADD(at[22], at[113]);    MULADD(at[23], at[112]);    MULADD(at[24], at[111]);    MULADD(at[25], at[110]);    MULADD(at[26], at[109]);    MULADD(at[27], at[108]);    MULADD(at[28], at[107]);    MULADD(at[29], at[106]);    MULADD(at[30], at[105]);    MULADD(at[31], at[104]);    MULADD(at[32], at[103]);    MULADD(at[33], at[102]);    MULADD(at[34], at[101]);    MULADD(at[35], at[100]);    MULADD(at[36], at[99]);    MULADD(at[37], at[98]);    MULADD(at[38], at[97]);    MULADD(at[39], at[96]);    MULADD(at[40], at[95]);    MULADD(at[41], at[94]);    MULADD(at[42], at[93]);    MULADD(at[43], at[92]);    MULADD(at[44], at[91]);    MULADD(at[45], at[90]);    MULADD(at[46], at[89]);    MULADD(at[47], at[88]);    MULADD(at[48], at[87]);    MULADD(at[49], at[86]);    MULADD(at[50], at[85]);    MULADD(at[51], at[84]);    MULADD(at[52], at[83]);    MULADD(at[53], at[82]);    MULADD(at[54], at[81]);    MULADD(at[55], at[80]);    MULADD(at[56], at[79]);    MULADD(at[57], at[78]);    MULADD(at[58], at[77]);    MULADD(at[59], at[76]);    MULADD(at[60], at[75]);    MULADD(at[61], at[74]);    MULADD(at[62], at[73]);    MULADD(at[63], at[72]); 
-   COMBA_STORE(C->dp[71]);
-   /* 72 */
-   COMBA_FORWARD;
-   MULADD(at[9], at[127]);    MULADD(at[10], at[126]);    MULADD(at[11], at[125]);    MULADD(at[12], at[124]);    MULADD(at[13], at[123]);    MULADD(at[14], at[122]);    MULADD(at[15], at[121]);    MULADD(at[16], at[120]);    MULADD(at[17], at[119]);    MULADD(at[18], at[118]);    MULADD(at[19], at[117]);    MULADD(at[20], at[116]);    MULADD(at[21], at[115]);    MULADD(at[22], at[114]);    MULADD(at[23], at[113]);    MULADD(at[24], at[112]);    MULADD(at[25], at[111]);    MULADD(at[26], at[110]);    MULADD(at[27], at[109]);    MULADD(at[28], at[108]);    MULADD(at[29], at[107]);    MULADD(at[30], at[106]);    MULADD(at[31], at[105]);    MULADD(at[32], at[104]);    MULADD(at[33], at[103]);    MULADD(at[34], at[102]);    MULADD(at[35], at[101]);    MULADD(at[36], at[100]);    MULADD(at[37], at[99]);    MULADD(at[38], at[98]);    MULADD(at[39], at[97]);    MULADD(at[40], at[96]);    MULADD(at[41], at[95]);    MULADD(at[42], at[94]);    MULADD(at[43], at[93]);    MULADD(at[44], at[92]);    MULADD(at[45], at[91]);    MULADD(at[46], at[90]);    MULADD(at[47], at[89]);    MULADD(at[48], at[88]);    MULADD(at[49], at[87]);    MULADD(at[50], at[86]);    MULADD(at[51], at[85]);    MULADD(at[52], at[84]);    MULADD(at[53], at[83]);    MULADD(at[54], at[82]);    MULADD(at[55], at[81]);    MULADD(at[56], at[80]);    MULADD(at[57], at[79]);    MULADD(at[58], at[78]);    MULADD(at[59], at[77]);    MULADD(at[60], at[76]);    MULADD(at[61], at[75]);    MULADD(at[62], at[74]);    MULADD(at[63], at[73]); 
-   COMBA_STORE(C->dp[72]);
-   /* 73 */
-   COMBA_FORWARD;
-   MULADD(at[10], at[127]);    MULADD(at[11], at[126]);    MULADD(at[12], at[125]);    MULADD(at[13], at[124]);    MULADD(at[14], at[123]);    MULADD(at[15], at[122]);    MULADD(at[16], at[121]);    MULADD(at[17], at[120]);    MULADD(at[18], at[119]);    MULADD(at[19], at[118]);    MULADD(at[20], at[117]);    MULADD(at[21], at[116]);    MULADD(at[22], at[115]);    MULADD(at[23], at[114]);    MULADD(at[24], at[113]);    MULADD(at[25], at[112]);    MULADD(at[26], at[111]);    MULADD(at[27], at[110]);    MULADD(at[28], at[109]);    MULADD(at[29], at[108]);    MULADD(at[30], at[107]);    MULADD(at[31], at[106]);    MULADD(at[32], at[105]);    MULADD(at[33], at[104]);    MULADD(at[34], at[103]);    MULADD(at[35], at[102]);    MULADD(at[36], at[101]);    MULADD(at[37], at[100]);    MULADD(at[38], at[99]);    MULADD(at[39], at[98]);    MULADD(at[40], at[97]);    MULADD(at[41], at[96]);    MULADD(at[42], at[95]);    MULADD(at[43], at[94]);    MULADD(at[44], at[93]);    MULADD(at[45], at[92]);    MULADD(at[46], at[91]);    MULADD(at[47], at[90]);    MULADD(at[48], at[89]);    MULADD(at[49], at[88]);    MULADD(at[50], at[87]);    MULADD(at[51], at[86]);    MULADD(at[52], at[85]);    MULADD(at[53], at[84]);    MULADD(at[54], at[83]);    MULADD(at[55], at[82]);    MULADD(at[56], at[81]);    MULADD(at[57], at[80]);    MULADD(at[58], at[79]);    MULADD(at[59], at[78]);    MULADD(at[60], at[77]);    MULADD(at[61], at[76]);    MULADD(at[62], at[75]);    MULADD(at[63], at[74]); 
-   COMBA_STORE(C->dp[73]);
-   /* 74 */
-   COMBA_FORWARD;
-   MULADD(at[11], at[127]);    MULADD(at[12], at[126]);    MULADD(at[13], at[125]);    MULADD(at[14], at[124]);    MULADD(at[15], at[123]);    MULADD(at[16], at[122]);    MULADD(at[17], at[121]);    MULADD(at[18], at[120]);    MULADD(at[19], at[119]);    MULADD(at[20], at[118]);    MULADD(at[21], at[117]);    MULADD(at[22], at[116]);    MULADD(at[23], at[115]);    MULADD(at[24], at[114]);    MULADD(at[25], at[113]);    MULADD(at[26], at[112]);    MULADD(at[27], at[111]);    MULADD(at[28], at[110]);    MULADD(at[29], at[109]);    MULADD(at[30], at[108]);    MULADD(at[31], at[107]);    MULADD(at[32], at[106]);    MULADD(at[33], at[105]);    MULADD(at[34], at[104]);    MULADD(at[35], at[103]);    MULADD(at[36], at[102]);    MULADD(at[37], at[101]);    MULADD(at[38], at[100]);    MULADD(at[39], at[99]);    MULADD(at[40], at[98]);    MULADD(at[41], at[97]);    MULADD(at[42], at[96]);    MULADD(at[43], at[95]);    MULADD(at[44], at[94]);    MULADD(at[45], at[93]);    MULADD(at[46], at[92]);    MULADD(at[47], at[91]);    MULADD(at[48], at[90]);    MULADD(at[49], at[89]);    MULADD(at[50], at[88]);    MULADD(at[51], at[87]);    MULADD(at[52], at[86]);    MULADD(at[53], at[85]);    MULADD(at[54], at[84]);    MULADD(at[55], at[83]);    MULADD(at[56], at[82]);    MULADD(at[57], at[81]);    MULADD(at[58], at[80]);    MULADD(at[59], at[79]);    MULADD(at[60], at[78]);    MULADD(at[61], at[77]);    MULADD(at[62], at[76]);    MULADD(at[63], at[75]); 
-   COMBA_STORE(C->dp[74]);
-   /* 75 */
-   COMBA_FORWARD;
-   MULADD(at[12], at[127]);    MULADD(at[13], at[126]);    MULADD(at[14], at[125]);    MULADD(at[15], at[124]);    MULADD(at[16], at[123]);    MULADD(at[17], at[122]);    MULADD(at[18], at[121]);    MULADD(at[19], at[120]);    MULADD(at[20], at[119]);    MULADD(at[21], at[118]);    MULADD(at[22], at[117]);    MULADD(at[23], at[116]);    MULADD(at[24], at[115]);    MULADD(at[25], at[114]);    MULADD(at[26], at[113]);    MULADD(at[27], at[112]);    MULADD(at[28], at[111]);    MULADD(at[29], at[110]);    MULADD(at[30], at[109]);    MULADD(at[31], at[108]);    MULADD(at[32], at[107]);    MULADD(at[33], at[106]);    MULADD(at[34], at[105]);    MULADD(at[35], at[104]);    MULADD(at[36], at[103]);    MULADD(at[37], at[102]);    MULADD(at[38], at[101]);    MULADD(at[39], at[100]);    MULADD(at[40], at[99]);    MULADD(at[41], at[98]);    MULADD(at[42], at[97]);    MULADD(at[43], at[96]);    MULADD(at[44], at[95]);    MULADD(at[45], at[94]);    MULADD(at[46], at[93]);    MULADD(at[47], at[92]);    MULADD(at[48], at[91]);    MULADD(at[49], at[90]);    MULADD(at[50], at[89]);    MULADD(at[51], at[88]);    MULADD(at[52], at[87]);    MULADD(at[53], at[86]);    MULADD(at[54], at[85]);    MULADD(at[55], at[84]);    MULADD(at[56], at[83]);    MULADD(at[57], at[82]);    MULADD(at[58], at[81]);    MULADD(at[59], at[80]);    MULADD(at[60], at[79]);    MULADD(at[61], at[78]);    MULADD(at[62], at[77]);    MULADD(at[63], at[76]); 
-   COMBA_STORE(C->dp[75]);
-   /* 76 */
-   COMBA_FORWARD;
-   MULADD(at[13], at[127]);    MULADD(at[14], at[126]);    MULADD(at[15], at[125]);    MULADD(at[16], at[124]);    MULADD(at[17], at[123]);    MULADD(at[18], at[122]);    MULADD(at[19], at[121]);    MULADD(at[20], at[120]);    MULADD(at[21], at[119]);    MULADD(at[22], at[118]);    MULADD(at[23], at[117]);    MULADD(at[24], at[116]);    MULADD(at[25], at[115]);    MULADD(at[26], at[114]);    MULADD(at[27], at[113]);    MULADD(at[28], at[112]);    MULADD(at[29], at[111]);    MULADD(at[30], at[110]);    MULADD(at[31], at[109]);    MULADD(at[32], at[108]);    MULADD(at[33], at[107]);    MULADD(at[34], at[106]);    MULADD(at[35], at[105]);    MULADD(at[36], at[104]);    MULADD(at[37], at[103]);    MULADD(at[38], at[102]);    MULADD(at[39], at[101]);    MULADD(at[40], at[100]);    MULADD(at[41], at[99]);    MULADD(at[42], at[98]);    MULADD(at[43], at[97]);    MULADD(at[44], at[96]);    MULADD(at[45], at[95]);    MULADD(at[46], at[94]);    MULADD(at[47], at[93]);    MULADD(at[48], at[92]);    MULADD(at[49], at[91]);    MULADD(at[50], at[90]);    MULADD(at[51], at[89]);    MULADD(at[52], at[88]);    MULADD(at[53], at[87]);    MULADD(at[54], at[86]);    MULADD(at[55], at[85]);    MULADD(at[56], at[84]);    MULADD(at[57], at[83]);    MULADD(at[58], at[82]);    MULADD(at[59], at[81]);    MULADD(at[60], at[80]);    MULADD(at[61], at[79]);    MULADD(at[62], at[78]);    MULADD(at[63], at[77]); 
-   COMBA_STORE(C->dp[76]);
-   /* 77 */
-   COMBA_FORWARD;
-   MULADD(at[14], at[127]);    MULADD(at[15], at[126]);    MULADD(at[16], at[125]);    MULADD(at[17], at[124]);    MULADD(at[18], at[123]);    MULADD(at[19], at[122]);    MULADD(at[20], at[121]);    MULADD(at[21], at[120]);    MULADD(at[22], at[119]);    MULADD(at[23], at[118]);    MULADD(at[24], at[117]);    MULADD(at[25], at[116]);    MULADD(at[26], at[115]);    MULADD(at[27], at[114]);    MULADD(at[28], at[113]);    MULADD(at[29], at[112]);    MULADD(at[30], at[111]);    MULADD(at[31], at[110]);    MULADD(at[32], at[109]);    MULADD(at[33], at[108]);    MULADD(at[34], at[107]);    MULADD(at[35], at[106]);    MULADD(at[36], at[105]);    MULADD(at[37], at[104]);    MULADD(at[38], at[103]);    MULADD(at[39], at[102]);    MULADD(at[40], at[101]);    MULADD(at[41], at[100]);    MULADD(at[42], at[99]);    MULADD(at[43], at[98]);    MULADD(at[44], at[97]);    MULADD(at[45], at[96]);    MULADD(at[46], at[95]);    MULADD(at[47], at[94]);    MULADD(at[48], at[93]);    MULADD(at[49], at[92]);    MULADD(at[50], at[91]);    MULADD(at[51], at[90]);    MULADD(at[52], at[89]);    MULADD(at[53], at[88]);    MULADD(at[54], at[87]);    MULADD(at[55], at[86]);    MULADD(at[56], at[85]);    MULADD(at[57], at[84]);    MULADD(at[58], at[83]);    MULADD(at[59], at[82]);    MULADD(at[60], at[81]);    MULADD(at[61], at[80]);    MULADD(at[62], at[79]);    MULADD(at[63], at[78]); 
-   COMBA_STORE(C->dp[77]);
-   /* 78 */
-   COMBA_FORWARD;
-   MULADD(at[15], at[127]);    MULADD(at[16], at[126]);    MULADD(at[17], at[125]);    MULADD(at[18], at[124]);    MULADD(at[19], at[123]);    MULADD(at[20], at[122]);    MULADD(at[21], at[121]);    MULADD(at[22], at[120]);    MULADD(at[23], at[119]);    MULADD(at[24], at[118]);    MULADD(at[25], at[117]);    MULADD(at[26], at[116]);    MULADD(at[27], at[115]);    MULADD(at[28], at[114]);    MULADD(at[29], at[113]);    MULADD(at[30], at[112]);    MULADD(at[31], at[111]);    MULADD(at[32], at[110]);    MULADD(at[33], at[109]);    MULADD(at[34], at[108]);    MULADD(at[35], at[107]);    MULADD(at[36], at[106]);    MULADD(at[37], at[105]);    MULADD(at[38], at[104]);    MULADD(at[39], at[103]);    MULADD(at[40], at[102]);    MULADD(at[41], at[101]);    MULADD(at[42], at[100]);    MULADD(at[43], at[99]);    MULADD(at[44], at[98]);    MULADD(at[45], at[97]);    MULADD(at[46], at[96]);    MULADD(at[47], at[95]);    MULADD(at[48], at[94]);    MULADD(at[49], at[93]);    MULADD(at[50], at[92]);    MULADD(at[51], at[91]);    MULADD(at[52], at[90]);    MULADD(at[53], at[89]);    MULADD(at[54], at[88]);    MULADD(at[55], at[87]);    MULADD(at[56], at[86]);    MULADD(at[57], at[85]);    MULADD(at[58], at[84]);    MULADD(at[59], at[83]);    MULADD(at[60], at[82]);    MULADD(at[61], at[81]);    MULADD(at[62], at[80]);    MULADD(at[63], at[79]); 
-   COMBA_STORE(C->dp[78]);
-   /* 79 */
-   COMBA_FORWARD;
-   MULADD(at[16], at[127]);    MULADD(at[17], at[126]);    MULADD(at[18], at[125]);    MULADD(at[19], at[124]);    MULADD(at[20], at[123]);    MULADD(at[21], at[122]);    MULADD(at[22], at[121]);    MULADD(at[23], at[120]);    MULADD(at[24], at[119]);    MULADD(at[25], at[118]);    MULADD(at[26], at[117]);    MULADD(at[27], at[116]);    MULADD(at[28], at[115]);    MULADD(at[29], at[114]);    MULADD(at[30], at[113]);    MULADD(at[31], at[112]);    MULADD(at[32], at[111]);    MULADD(at[33], at[110]);    MULADD(at[34], at[109]);    MULADD(at[35], at[108]);    MULADD(at[36], at[107]);    MULADD(at[37], at[106]);    MULADD(at[38], at[105]);    MULADD(at[39], at[104]);    MULADD(at[40], at[103]);    MULADD(at[41], at[102]);    MULADD(at[42], at[101]);    MULADD(at[43], at[100]);    MULADD(at[44], at[99]);    MULADD(at[45], at[98]);    MULADD(at[46], at[97]);    MULADD(at[47], at[96]);    MULADD(at[48], at[95]);    MULADD(at[49], at[94]);    MULADD(at[50], at[93]);    MULADD(at[51], at[92]);    MULADD(at[52], at[91]);    MULADD(at[53], at[90]);    MULADD(at[54], at[89]);    MULADD(at[55], at[88]);    MULADD(at[56], at[87]);    MULADD(at[57], at[86]);    MULADD(at[58], at[85]);    MULADD(at[59], at[84]);    MULADD(at[60], at[83]);    MULADD(at[61], at[82]);    MULADD(at[62], at[81]);    MULADD(at[63], at[80]); 
-   COMBA_STORE(C->dp[79]);
-   /* 80 */
-   COMBA_FORWARD;
-   MULADD(at[17], at[127]);    MULADD(at[18], at[126]);    MULADD(at[19], at[125]);    MULADD(at[20], at[124]);    MULADD(at[21], at[123]);    MULADD(at[22], at[122]);    MULADD(at[23], at[121]);    MULADD(at[24], at[120]);    MULADD(at[25], at[119]);    MULADD(at[26], at[118]);    MULADD(at[27], at[117]);    MULADD(at[28], at[116]);    MULADD(at[29], at[115]);    MULADD(at[30], at[114]);    MULADD(at[31], at[113]);    MULADD(at[32], at[112]);    MULADD(at[33], at[111]);    MULADD(at[34], at[110]);    MULADD(at[35], at[109]);    MULADD(at[36], at[108]);    MULADD(at[37], at[107]);    MULADD(at[38], at[106]);    MULADD(at[39], at[105]);    MULADD(at[40], at[104]);    MULADD(at[41], at[103]);    MULADD(at[42], at[102]);    MULADD(at[43], at[101]);    MULADD(at[44], at[100]);    MULADD(at[45], at[99]);    MULADD(at[46], at[98]);    MULADD(at[47], at[97]);    MULADD(at[48], at[96]);    MULADD(at[49], at[95]);    MULADD(at[50], at[94]);    MULADD(at[51], at[93]);    MULADD(at[52], at[92]);    MULADD(at[53], at[91]);    MULADD(at[54], at[90]);    MULADD(at[55], at[89]);    MULADD(at[56], at[88]);    MULADD(at[57], at[87]);    MULADD(at[58], at[86]);    MULADD(at[59], at[85]);    MULADD(at[60], at[84]);    MULADD(at[61], at[83]);    MULADD(at[62], at[82]);    MULADD(at[63], at[81]); 
-   COMBA_STORE(C->dp[80]);
-   /* 81 */
-   COMBA_FORWARD;
-   MULADD(at[18], at[127]);    MULADD(at[19], at[126]);    MULADD(at[20], at[125]);    MULADD(at[21], at[124]);    MULADD(at[22], at[123]);    MULADD(at[23], at[122]);    MULADD(at[24], at[121]);    MULADD(at[25], at[120]);    MULADD(at[26], at[119]);    MULADD(at[27], at[118]);    MULADD(at[28], at[117]);    MULADD(at[29], at[116]);    MULADD(at[30], at[115]);    MULADD(at[31], at[114]);    MULADD(at[32], at[113]);    MULADD(at[33], at[112]);    MULADD(at[34], at[111]);    MULADD(at[35], at[110]);    MULADD(at[36], at[109]);    MULADD(at[37], at[108]);    MULADD(at[38], at[107]);    MULADD(at[39], at[106]);    MULADD(at[40], at[105]);    MULADD(at[41], at[104]);    MULADD(at[42], at[103]);    MULADD(at[43], at[102]);    MULADD(at[44], at[101]);    MULADD(at[45], at[100]);    MULADD(at[46], at[99]);    MULADD(at[47], at[98]);    MULADD(at[48], at[97]);    MULADD(at[49], at[96]);    MULADD(at[50], at[95]);    MULADD(at[51], at[94]);    MULADD(at[52], at[93]);    MULADD(at[53], at[92]);    MULADD(at[54], at[91]);    MULADD(at[55], at[90]);    MULADD(at[56], at[89]);    MULADD(at[57], at[88]);    MULADD(at[58], at[87]);    MULADD(at[59], at[86]);    MULADD(at[60], at[85]);    MULADD(at[61], at[84]);    MULADD(at[62], at[83]);    MULADD(at[63], at[82]); 
-   COMBA_STORE(C->dp[81]);
-   /* 82 */
-   COMBA_FORWARD;
-   MULADD(at[19], at[127]);    MULADD(at[20], at[126]);    MULADD(at[21], at[125]);    MULADD(at[22], at[124]);    MULADD(at[23], at[123]);    MULADD(at[24], at[122]);    MULADD(at[25], at[121]);    MULADD(at[26], at[120]);    MULADD(at[27], at[119]);    MULADD(at[28], at[118]);    MULADD(at[29], at[117]);    MULADD(at[30], at[116]);    MULADD(at[31], at[115]);    MULADD(at[32], at[114]);    MULADD(at[33], at[113]);    MULADD(at[34], at[112]);    MULADD(at[35], at[111]);    MULADD(at[36], at[110]);    MULADD(at[37], at[109]);    MULADD(at[38], at[108]);    MULADD(at[39], at[107]);    MULADD(at[40], at[106]);    MULADD(at[41], at[105]);    MULADD(at[42], at[104]);    MULADD(at[43], at[103]);    MULADD(at[44], at[102]);    MULADD(at[45], at[101]);    MULADD(at[46], at[100]);    MULADD(at[47], at[99]);    MULADD(at[48], at[98]);    MULADD(at[49], at[97]);    MULADD(at[50], at[96]);    MULADD(at[51], at[95]);    MULADD(at[52], at[94]);    MULADD(at[53], at[93]);    MULADD(at[54], at[92]);    MULADD(at[55], at[91]);    MULADD(at[56], at[90]);    MULADD(at[57], at[89]);    MULADD(at[58], at[88]);    MULADD(at[59], at[87]);    MULADD(at[60], at[86]);    MULADD(at[61], at[85]);    MULADD(at[62], at[84]);    MULADD(at[63], at[83]); 
-   COMBA_STORE(C->dp[82]);
-   /* 83 */
-   COMBA_FORWARD;
-   MULADD(at[20], at[127]);    MULADD(at[21], at[126]);    MULADD(at[22], at[125]);    MULADD(at[23], at[124]);    MULADD(at[24], at[123]);    MULADD(at[25], at[122]);    MULADD(at[26], at[121]);    MULADD(at[27], at[120]);    MULADD(at[28], at[119]);    MULADD(at[29], at[118]);    MULADD(at[30], at[117]);    MULADD(at[31], at[116]);    MULADD(at[32], at[115]);    MULADD(at[33], at[114]);    MULADD(at[34], at[113]);    MULADD(at[35], at[112]);    MULADD(at[36], at[111]);    MULADD(at[37], at[110]);    MULADD(at[38], at[109]);    MULADD(at[39], at[108]);    MULADD(at[40], at[107]);    MULADD(at[41], at[106]);    MULADD(at[42], at[105]);    MULADD(at[43], at[104]);    MULADD(at[44], at[103]);    MULADD(at[45], at[102]);    MULADD(at[46], at[101]);    MULADD(at[47], at[100]);    MULADD(at[48], at[99]);    MULADD(at[49], at[98]);    MULADD(at[50], at[97]);    MULADD(at[51], at[96]);    MULADD(at[52], at[95]);    MULADD(at[53], at[94]);    MULADD(at[54], at[93]);    MULADD(at[55], at[92]);    MULADD(at[56], at[91]);    MULADD(at[57], at[90]);    MULADD(at[58], at[89]);    MULADD(at[59], at[88]);    MULADD(at[60], at[87]);    MULADD(at[61], at[86]);    MULADD(at[62], at[85]);    MULADD(at[63], at[84]); 
-   COMBA_STORE(C->dp[83]);
-   /* 84 */
-   COMBA_FORWARD;
-   MULADD(at[21], at[127]);    MULADD(at[22], at[126]);    MULADD(at[23], at[125]);    MULADD(at[24], at[124]);    MULADD(at[25], at[123]);    MULADD(at[26], at[122]);    MULADD(at[27], at[121]);    MULADD(at[28], at[120]);    MULADD(at[29], at[119]);    MULADD(at[30], at[118]);    MULADD(at[31], at[117]);    MULADD(at[32], at[116]);    MULADD(at[33], at[115]);    MULADD(at[34], at[114]);    MULADD(at[35], at[113]);    MULADD(at[36], at[112]);    MULADD(at[37], at[111]);    MULADD(at[38], at[110]);    MULADD(at[39], at[109]);    MULADD(at[40], at[108]);    MULADD(at[41], at[107]);    MULADD(at[42], at[106]);    MULADD(at[43], at[105]);    MULADD(at[44], at[104]);    MULADD(at[45], at[103]);    MULADD(at[46], at[102]);    MULADD(at[47], at[101]);    MULADD(at[48], at[100]);    MULADD(at[49], at[99]);    MULADD(at[50], at[98]);    MULADD(at[51], at[97]);    MULADD(at[52], at[96]);    MULADD(at[53], at[95]);    MULADD(at[54], at[94]);    MULADD(at[55], at[93]);    MULADD(at[56], at[92]);    MULADD(at[57], at[91]);    MULADD(at[58], at[90]);    MULADD(at[59], at[89]);    MULADD(at[60], at[88]);    MULADD(at[61], at[87]);    MULADD(at[62], at[86]);    MULADD(at[63], at[85]); 
-   COMBA_STORE(C->dp[84]);
-   /* 85 */
-   COMBA_FORWARD;
-   MULADD(at[22], at[127]);    MULADD(at[23], at[126]);    MULADD(at[24], at[125]);    MULADD(at[25], at[124]);    MULADD(at[26], at[123]);    MULADD(at[27], at[122]);    MULADD(at[28], at[121]);    MULADD(at[29], at[120]);    MULADD(at[30], at[119]);    MULADD(at[31], at[118]);    MULADD(at[32], at[117]);    MULADD(at[33], at[116]);    MULADD(at[34], at[115]);    MULADD(at[35], at[114]);    MULADD(at[36], at[113]);    MULADD(at[37], at[112]);    MULADD(at[38], at[111]);    MULADD(at[39], at[110]);    MULADD(at[40], at[109]);    MULADD(at[41], at[108]);    MULADD(at[42], at[107]);    MULADD(at[43], at[106]);    MULADD(at[44], at[105]);    MULADD(at[45], at[104]);    MULADD(at[46], at[103]);    MULADD(at[47], at[102]);    MULADD(at[48], at[101]);    MULADD(at[49], at[100]);    MULADD(at[50], at[99]);    MULADD(at[51], at[98]);    MULADD(at[52], at[97]);    MULADD(at[53], at[96]);    MULADD(at[54], at[95]);    MULADD(at[55], at[94]);    MULADD(at[56], at[93]);    MULADD(at[57], at[92]);    MULADD(at[58], at[91]);    MULADD(at[59], at[90]);    MULADD(at[60], at[89]);    MULADD(at[61], at[88]);    MULADD(at[62], at[87]);    MULADD(at[63], at[86]); 
-   COMBA_STORE(C->dp[85]);
-   /* 86 */
-   COMBA_FORWARD;
-   MULADD(at[23], at[127]);    MULADD(at[24], at[126]);    MULADD(at[25], at[125]);    MULADD(at[26], at[124]);    MULADD(at[27], at[123]);    MULADD(at[28], at[122]);    MULADD(at[29], at[121]);    MULADD(at[30], at[120]);    MULADD(at[31], at[119]);    MULADD(at[32], at[118]);    MULADD(at[33], at[117]);    MULADD(at[34], at[116]);    MULADD(at[35], at[115]);    MULADD(at[36], at[114]);    MULADD(at[37], at[113]);    MULADD(at[38], at[112]);    MULADD(at[39], at[111]);    MULADD(at[40], at[110]);    MULADD(at[41], at[109]);    MULADD(at[42], at[108]);    MULADD(at[43], at[107]);    MULADD(at[44], at[106]);    MULADD(at[45], at[105]);    MULADD(at[46], at[104]);    MULADD(at[47], at[103]);    MULADD(at[48], at[102]);    MULADD(at[49], at[101]);    MULADD(at[50], at[100]);    MULADD(at[51], at[99]);    MULADD(at[52], at[98]);    MULADD(at[53], at[97]);    MULADD(at[54], at[96]);    MULADD(at[55], at[95]);    MULADD(at[56], at[94]);    MULADD(at[57], at[93]);    MULADD(at[58], at[92]);    MULADD(at[59], at[91]);    MULADD(at[60], at[90]);    MULADD(at[61], at[89]);    MULADD(at[62], at[88]);    MULADD(at[63], at[87]); 
-   COMBA_STORE(C->dp[86]);
-   /* 87 */
-   COMBA_FORWARD;
-   MULADD(at[24], at[127]);    MULADD(at[25], at[126]);    MULADD(at[26], at[125]);    MULADD(at[27], at[124]);    MULADD(at[28], at[123]);    MULADD(at[29], at[122]);    MULADD(at[30], at[121]);    MULADD(at[31], at[120]);    MULADD(at[32], at[119]);    MULADD(at[33], at[118]);    MULADD(at[34], at[117]);    MULADD(at[35], at[116]);    MULADD(at[36], at[115]);    MULADD(at[37], at[114]);    MULADD(at[38], at[113]);    MULADD(at[39], at[112]);    MULADD(at[40], at[111]);    MULADD(at[41], at[110]);    MULADD(at[42], at[109]);    MULADD(at[43], at[108]);    MULADD(at[44], at[107]);    MULADD(at[45], at[106]);    MULADD(at[46], at[105]);    MULADD(at[47], at[104]);    MULADD(at[48], at[103]);    MULADD(at[49], at[102]);    MULADD(at[50], at[101]);    MULADD(at[51], at[100]);    MULADD(at[52], at[99]);    MULADD(at[53], at[98]);    MULADD(at[54], at[97]);    MULADD(at[55], at[96]);    MULADD(at[56], at[95]);    MULADD(at[57], at[94]);    MULADD(at[58], at[93]);    MULADD(at[59], at[92]);    MULADD(at[60], at[91]);    MULADD(at[61], at[90]);    MULADD(at[62], at[89]);    MULADD(at[63], at[88]); 
-   COMBA_STORE(C->dp[87]);
-   /* 88 */
-   COMBA_FORWARD;
-   MULADD(at[25], at[127]);    MULADD(at[26], at[126]);    MULADD(at[27], at[125]);    MULADD(at[28], at[124]);    MULADD(at[29], at[123]);    MULADD(at[30], at[122]);    MULADD(at[31], at[121]);    MULADD(at[32], at[120]);    MULADD(at[33], at[119]);    MULADD(at[34], at[118]);    MULADD(at[35], at[117]);    MULADD(at[36], at[116]);    MULADD(at[37], at[115]);    MULADD(at[38], at[114]);    MULADD(at[39], at[113]);    MULADD(at[40], at[112]);    MULADD(at[41], at[111]);    MULADD(at[42], at[110]);    MULADD(at[43], at[109]);    MULADD(at[44], at[108]);    MULADD(at[45], at[107]);    MULADD(at[46], at[106]);    MULADD(at[47], at[105]);    MULADD(at[48], at[104]);    MULADD(at[49], at[103]);    MULADD(at[50], at[102]);    MULADD(at[51], at[101]);    MULADD(at[52], at[100]);    MULADD(at[53], at[99]);    MULADD(at[54], at[98]);    MULADD(at[55], at[97]);    MULADD(at[56], at[96]);    MULADD(at[57], at[95]);    MULADD(at[58], at[94]);    MULADD(at[59], at[93]);    MULADD(at[60], at[92]);    MULADD(at[61], at[91]);    MULADD(at[62], at[90]);    MULADD(at[63], at[89]); 
-   COMBA_STORE(C->dp[88]);
-   /* 89 */
-   COMBA_FORWARD;
-   MULADD(at[26], at[127]);    MULADD(at[27], at[126]);    MULADD(at[28], at[125]);    MULADD(at[29], at[124]);    MULADD(at[30], at[123]);    MULADD(at[31], at[122]);    MULADD(at[32], at[121]);    MULADD(at[33], at[120]);    MULADD(at[34], at[119]);    MULADD(at[35], at[118]);    MULADD(at[36], at[117]);    MULADD(at[37], at[116]);    MULADD(at[38], at[115]);    MULADD(at[39], at[114]);    MULADD(at[40], at[113]);    MULADD(at[41], at[112]);    MULADD(at[42], at[111]);    MULADD(at[43], at[110]);    MULADD(at[44], at[109]);    MULADD(at[45], at[108]);    MULADD(at[46], at[107]);    MULADD(at[47], at[106]);    MULADD(at[48], at[105]);    MULADD(at[49], at[104]);    MULADD(at[50], at[103]);    MULADD(at[51], at[102]);    MULADD(at[52], at[101]);    MULADD(at[53], at[100]);    MULADD(at[54], at[99]);    MULADD(at[55], at[98]);    MULADD(at[56], at[97]);    MULADD(at[57], at[96]);    MULADD(at[58], at[95]);    MULADD(at[59], at[94]);    MULADD(at[60], at[93]);    MULADD(at[61], at[92]);    MULADD(at[62], at[91]);    MULADD(at[63], at[90]); 
-   COMBA_STORE(C->dp[89]);
-   /* 90 */
-   COMBA_FORWARD;
-   MULADD(at[27], at[127]);    MULADD(at[28], at[126]);    MULADD(at[29], at[125]);    MULADD(at[30], at[124]);    MULADD(at[31], at[123]);    MULADD(at[32], at[122]);    MULADD(at[33], at[121]);    MULADD(at[34], at[120]);    MULADD(at[35], at[119]);    MULADD(at[36], at[118]);    MULADD(at[37], at[117]);    MULADD(at[38], at[116]);    MULADD(at[39], at[115]);    MULADD(at[40], at[114]);    MULADD(at[41], at[113]);    MULADD(at[42], at[112]);    MULADD(at[43], at[111]);    MULADD(at[44], at[110]);    MULADD(at[45], at[109]);    MULADD(at[46], at[108]);    MULADD(at[47], at[107]);    MULADD(at[48], at[106]);    MULADD(at[49], at[105]);    MULADD(at[50], at[104]);    MULADD(at[51], at[103]);    MULADD(at[52], at[102]);    MULADD(at[53], at[101]);    MULADD(at[54], at[100]);    MULADD(at[55], at[99]);    MULADD(at[56], at[98]);    MULADD(at[57], at[97]);    MULADD(at[58], at[96]);    MULADD(at[59], at[95]);    MULADD(at[60], at[94]);    MULADD(at[61], at[93]);    MULADD(at[62], at[92]);    MULADD(at[63], at[91]); 
-   COMBA_STORE(C->dp[90]);
-   /* 91 */
-   COMBA_FORWARD;
-   MULADD(at[28], at[127]);    MULADD(at[29], at[126]);    MULADD(at[30], at[125]);    MULADD(at[31], at[124]);    MULADD(at[32], at[123]);    MULADD(at[33], at[122]);    MULADD(at[34], at[121]);    MULADD(at[35], at[120]);    MULADD(at[36], at[119]);    MULADD(at[37], at[118]);    MULADD(at[38], at[117]);    MULADD(at[39], at[116]);    MULADD(at[40], at[115]);    MULADD(at[41], at[114]);    MULADD(at[42], at[113]);    MULADD(at[43], at[112]);    MULADD(at[44], at[111]);    MULADD(at[45], at[110]);    MULADD(at[46], at[109]);    MULADD(at[47], at[108]);    MULADD(at[48], at[107]);    MULADD(at[49], at[106]);    MULADD(at[50], at[105]);    MULADD(at[51], at[104]);    MULADD(at[52], at[103]);    MULADD(at[53], at[102]);    MULADD(at[54], at[101]);    MULADD(at[55], at[100]);    MULADD(at[56], at[99]);    MULADD(at[57], at[98]);    MULADD(at[58], at[97]);    MULADD(at[59], at[96]);    MULADD(at[60], at[95]);    MULADD(at[61], at[94]);    MULADD(at[62], at[93]);    MULADD(at[63], at[92]); 
-   COMBA_STORE(C->dp[91]);
-   /* 92 */
-   COMBA_FORWARD;
-   MULADD(at[29], at[127]);    MULADD(at[30], at[126]);    MULADD(at[31], at[125]);    MULADD(at[32], at[124]);    MULADD(at[33], at[123]);    MULADD(at[34], at[122]);    MULADD(at[35], at[121]);    MULADD(at[36], at[120]);    MULADD(at[37], at[119]);    MULADD(at[38], at[118]);    MULADD(at[39], at[117]);    MULADD(at[40], at[116]);    MULADD(at[41], at[115]);    MULADD(at[42], at[114]);    MULADD(at[43], at[113]);    MULADD(at[44], at[112]);    MULADD(at[45], at[111]);    MULADD(at[46], at[110]);    MULADD(at[47], at[109]);    MULADD(at[48], at[108]);    MULADD(at[49], at[107]);    MULADD(at[50], at[106]);    MULADD(at[51], at[105]);    MULADD(at[52], at[104]);    MULADD(at[53], at[103]);    MULADD(at[54], at[102]);    MULADD(at[55], at[101]);    MULADD(at[56], at[100]);    MULADD(at[57], at[99]);    MULADD(at[58], at[98]);    MULADD(at[59], at[97]);    MULADD(at[60], at[96]);    MULADD(at[61], at[95]);    MULADD(at[62], at[94]);    MULADD(at[63], at[93]); 
-   COMBA_STORE(C->dp[92]);
-   /* 93 */
-   COMBA_FORWARD;
-   MULADD(at[30], at[127]);    MULADD(at[31], at[126]);    MULADD(at[32], at[125]);    MULADD(at[33], at[124]);    MULADD(at[34], at[123]);    MULADD(at[35], at[122]);    MULADD(at[36], at[121]);    MULADD(at[37], at[120]);    MULADD(at[38], at[119]);    MULADD(at[39], at[118]);    MULADD(at[40], at[117]);    MULADD(at[41], at[116]);    MULADD(at[42], at[115]);    MULADD(at[43], at[114]);    MULADD(at[44], at[113]);    MULADD(at[45], at[112]);    MULADD(at[46], at[111]);    MULADD(at[47], at[110]);    MULADD(at[48], at[109]);    MULADD(at[49], at[108]);    MULADD(at[50], at[107]);    MULADD(at[51], at[106]);    MULADD(at[52], at[105]);    MULADD(at[53], at[104]);    MULADD(at[54], at[103]);    MULADD(at[55], at[102]);    MULADD(at[56], at[101]);    MULADD(at[57], at[100]);    MULADD(at[58], at[99]);    MULADD(at[59], at[98]);    MULADD(at[60], at[97]);    MULADD(at[61], at[96]);    MULADD(at[62], at[95]);    MULADD(at[63], at[94]); 
-   COMBA_STORE(C->dp[93]);
-   /* 94 */
-   COMBA_FORWARD;
-   MULADD(at[31], at[127]);    MULADD(at[32], at[126]);    MULADD(at[33], at[125]);    MULADD(at[34], at[124]);    MULADD(at[35], at[123]);    MULADD(at[36], at[122]);    MULADD(at[37], at[121]);    MULADD(at[38], at[120]);    MULADD(at[39], at[119]);    MULADD(at[40], at[118]);    MULADD(at[41], at[117]);    MULADD(at[42], at[116]);    MULADD(at[43], at[115]);    MULADD(at[44], at[114]);    MULADD(at[45], at[113]);    MULADD(at[46], at[112]);    MULADD(at[47], at[111]);    MULADD(at[48], at[110]);    MULADD(at[49], at[109]);    MULADD(at[50], at[108]);    MULADD(at[51], at[107]);    MULADD(at[52], at[106]);    MULADD(at[53], at[105]);    MULADD(at[54], at[104]);    MULADD(at[55], at[103]);    MULADD(at[56], at[102]);    MULADD(at[57], at[101]);    MULADD(at[58], at[100]);    MULADD(at[59], at[99]);    MULADD(at[60], at[98]);    MULADD(at[61], at[97]);    MULADD(at[62], at[96]);    MULADD(at[63], at[95]); 
-   COMBA_STORE(C->dp[94]);
-   /* 95 */
-   COMBA_FORWARD;
-   MULADD(at[32], at[127]);    MULADD(at[33], at[126]);    MULADD(at[34], at[125]);    MULADD(at[35], at[124]);    MULADD(at[36], at[123]);    MULADD(at[37], at[122]);    MULADD(at[38], at[121]);    MULADD(at[39], at[120]);    MULADD(at[40], at[119]);    MULADD(at[41], at[118]);    MULADD(at[42], at[117]);    MULADD(at[43], at[116]);    MULADD(at[44], at[115]);    MULADD(at[45], at[114]);    MULADD(at[46], at[113]);    MULADD(at[47], at[112]);    MULADD(at[48], at[111]);    MULADD(at[49], at[110]);    MULADD(at[50], at[109]);    MULADD(at[51], at[108]);    MULADD(at[52], at[107]);    MULADD(at[53], at[106]);    MULADD(at[54], at[105]);    MULADD(at[55], at[104]);    MULADD(at[56], at[103]);    MULADD(at[57], at[102]);    MULADD(at[58], at[101]);    MULADD(at[59], at[100]);    MULADD(at[60], at[99]);    MULADD(at[61], at[98]);    MULADD(at[62], at[97]);    MULADD(at[63], at[96]); 
-   COMBA_STORE(C->dp[95]);
-   /* 96 */
-   COMBA_FORWARD;
-   MULADD(at[33], at[127]);    MULADD(at[34], at[126]);    MULADD(at[35], at[125]);    MULADD(at[36], at[124]);    MULADD(at[37], at[123]);    MULADD(at[38], at[122]);    MULADD(at[39], at[121]);    MULADD(at[40], at[120]);    MULADD(at[41], at[119]);    MULADD(at[42], at[118]);    MULADD(at[43], at[117]);    MULADD(at[44], at[116]);    MULADD(at[45], at[115]);    MULADD(at[46], at[114]);    MULADD(at[47], at[113]);    MULADD(at[48], at[112]);    MULADD(at[49], at[111]);    MULADD(at[50], at[110]);    MULADD(at[51], at[109]);    MULADD(at[52], at[108]);    MULADD(at[53], at[107]);    MULADD(at[54], at[106]);    MULADD(at[55], at[105]);    MULADD(at[56], at[104]);    MULADD(at[57], at[103]);    MULADD(at[58], at[102]);    MULADD(at[59], at[101]);    MULADD(at[60], at[100]);    MULADD(at[61], at[99]);    MULADD(at[62], at[98]);    MULADD(at[63], at[97]); 
-   COMBA_STORE(C->dp[96]);
-   /* 97 */
-   COMBA_FORWARD;
-   MULADD(at[34], at[127]);    MULADD(at[35], at[126]);    MULADD(at[36], at[125]);    MULADD(at[37], at[124]);    MULADD(at[38], at[123]);    MULADD(at[39], at[122]);    MULADD(at[40], at[121]);    MULADD(at[41], at[120]);    MULADD(at[42], at[119]);    MULADD(at[43], at[118]);    MULADD(at[44], at[117]);    MULADD(at[45], at[116]);    MULADD(at[46], at[115]);    MULADD(at[47], at[114]);    MULADD(at[48], at[113]);    MULADD(at[49], at[112]);    MULADD(at[50], at[111]);    MULADD(at[51], at[110]);    MULADD(at[52], at[109]);    MULADD(at[53], at[108]);    MULADD(at[54], at[107]);    MULADD(at[55], at[106]);    MULADD(at[56], at[105]);    MULADD(at[57], at[104]);    MULADD(at[58], at[103]);    MULADD(at[59], at[102]);    MULADD(at[60], at[101]);    MULADD(at[61], at[100]);    MULADD(at[62], at[99]);    MULADD(at[63], at[98]); 
-   COMBA_STORE(C->dp[97]);
-   /* 98 */
-   COMBA_FORWARD;
-   MULADD(at[35], at[127]);    MULADD(at[36], at[126]);    MULADD(at[37], at[125]);    MULADD(at[38], at[124]);    MULADD(at[39], at[123]);    MULADD(at[40], at[122]);    MULADD(at[41], at[121]);    MULADD(at[42], at[120]);    MULADD(at[43], at[119]);    MULADD(at[44], at[118]);    MULADD(at[45], at[117]);    MULADD(at[46], at[116]);    MULADD(at[47], at[115]);    MULADD(at[48], at[114]);    MULADD(at[49], at[113]);    MULADD(at[50], at[112]);    MULADD(at[51], at[111]);    MULADD(at[52], at[110]);    MULADD(at[53], at[109]);    MULADD(at[54], at[108]);    MULADD(at[55], at[107]);    MULADD(at[56], at[106]);    MULADD(at[57], at[105]);    MULADD(at[58], at[104]);    MULADD(at[59], at[103]);    MULADD(at[60], at[102]);    MULADD(at[61], at[101]);    MULADD(at[62], at[100]);    MULADD(at[63], at[99]); 
-   COMBA_STORE(C->dp[98]);
-   /* 99 */
-   COMBA_FORWARD;
-   MULADD(at[36], at[127]);    MULADD(at[37], at[126]);    MULADD(at[38], at[125]);    MULADD(at[39], at[124]);    MULADD(at[40], at[123]);    MULADD(at[41], at[122]);    MULADD(at[42], at[121]);    MULADD(at[43], at[120]);    MULADD(at[44], at[119]);    MULADD(at[45], at[118]);    MULADD(at[46], at[117]);    MULADD(at[47], at[116]);    MULADD(at[48], at[115]);    MULADD(at[49], at[114]);    MULADD(at[50], at[113]);    MULADD(at[51], at[112]);    MULADD(at[52], at[111]);    MULADD(at[53], at[110]);    MULADD(at[54], at[109]);    MULADD(at[55], at[108]);    MULADD(at[56], at[107]);    MULADD(at[57], at[106]);    MULADD(at[58], at[105]);    MULADD(at[59], at[104]);    MULADD(at[60], at[103]);    MULADD(at[61], at[102]);    MULADD(at[62], at[101]);    MULADD(at[63], at[100]); 
-   COMBA_STORE(C->dp[99]);
-   /* 100 */
-   COMBA_FORWARD;
-   MULADD(at[37], at[127]);    MULADD(at[38], at[126]);    MULADD(at[39], at[125]);    MULADD(at[40], at[124]);    MULADD(at[41], at[123]);    MULADD(at[42], at[122]);    MULADD(at[43], at[121]);    MULADD(at[44], at[120]);    MULADD(at[45], at[119]);    MULADD(at[46], at[118]);    MULADD(at[47], at[117]);    MULADD(at[48], at[116]);    MULADD(at[49], at[115]);    MULADD(at[50], at[114]);    MULADD(at[51], at[113]);    MULADD(at[52], at[112]);    MULADD(at[53], at[111]);    MULADD(at[54], at[110]);    MULADD(at[55], at[109]);    MULADD(at[56], at[108]);    MULADD(at[57], at[107]);    MULADD(at[58], at[106]);    MULADD(at[59], at[105]);    MULADD(at[60], at[104]);    MULADD(at[61], at[103]);    MULADD(at[62], at[102]);    MULADD(at[63], at[101]); 
-   COMBA_STORE(C->dp[100]);
-   /* 101 */
-   COMBA_FORWARD;
-   MULADD(at[38], at[127]);    MULADD(at[39], at[126]);    MULADD(at[40], at[125]);    MULADD(at[41], at[124]);    MULADD(at[42], at[123]);    MULADD(at[43], at[122]);    MULADD(at[44], at[121]);    MULADD(at[45], at[120]);    MULADD(at[46], at[119]);    MULADD(at[47], at[118]);    MULADD(at[48], at[117]);    MULADD(at[49], at[116]);    MULADD(at[50], at[115]);    MULADD(at[51], at[114]);    MULADD(at[52], at[113]);    MULADD(at[53], at[112]);    MULADD(at[54], at[111]);    MULADD(at[55], at[110]);    MULADD(at[56], at[109]);    MULADD(at[57], at[108]);    MULADD(at[58], at[107]);    MULADD(at[59], at[106]);    MULADD(at[60], at[105]);    MULADD(at[61], at[104]);    MULADD(at[62], at[103]);    MULADD(at[63], at[102]); 
-   COMBA_STORE(C->dp[101]);
-   /* 102 */
-   COMBA_FORWARD;
-   MULADD(at[39], at[127]);    MULADD(at[40], at[126]);    MULADD(at[41], at[125]);    MULADD(at[42], at[124]);    MULADD(at[43], at[123]);    MULADD(at[44], at[122]);    MULADD(at[45], at[121]);    MULADD(at[46], at[120]);    MULADD(at[47], at[119]);    MULADD(at[48], at[118]);    MULADD(at[49], at[117]);    MULADD(at[50], at[116]);    MULADD(at[51], at[115]);    MULADD(at[52], at[114]);    MULADD(at[53], at[113]);    MULADD(at[54], at[112]);    MULADD(at[55], at[111]);    MULADD(at[56], at[110]);    MULADD(at[57], at[109]);    MULADD(at[58], at[108]);    MULADD(at[59], at[107]);    MULADD(at[60], at[106]);    MULADD(at[61], at[105]);    MULADD(at[62], at[104]);    MULADD(at[63], at[103]); 
-   COMBA_STORE(C->dp[102]);
-   /* 103 */
-   COMBA_FORWARD;
-   MULADD(at[40], at[127]);    MULADD(at[41], at[126]);    MULADD(at[42], at[125]);    MULADD(at[43], at[124]);    MULADD(at[44], at[123]);    MULADD(at[45], at[122]);    MULADD(at[46], at[121]);    MULADD(at[47], at[120]);    MULADD(at[48], at[119]);    MULADD(at[49], at[118]);    MULADD(at[50], at[117]);    MULADD(at[51], at[116]);    MULADD(at[52], at[115]);    MULADD(at[53], at[114]);    MULADD(at[54], at[113]);    MULADD(at[55], at[112]);    MULADD(at[56], at[111]);    MULADD(at[57], at[110]);    MULADD(at[58], at[109]);    MULADD(at[59], at[108]);    MULADD(at[60], at[107]);    MULADD(at[61], at[106]);    MULADD(at[62], at[105]);    MULADD(at[63], at[104]); 
-   COMBA_STORE(C->dp[103]);
-   /* 104 */
-   COMBA_FORWARD;
-   MULADD(at[41], at[127]);    MULADD(at[42], at[126]);    MULADD(at[43], at[125]);    MULADD(at[44], at[124]);    MULADD(at[45], at[123]);    MULADD(at[46], at[122]);    MULADD(at[47], at[121]);    MULADD(at[48], at[120]);    MULADD(at[49], at[119]);    MULADD(at[50], at[118]);    MULADD(at[51], at[117]);    MULADD(at[52], at[116]);    MULADD(at[53], at[115]);    MULADD(at[54], at[114]);    MULADD(at[55], at[113]);    MULADD(at[56], at[112]);    MULADD(at[57], at[111]);    MULADD(at[58], at[110]);    MULADD(at[59], at[109]);    MULADD(at[60], at[108]);    MULADD(at[61], at[107]);    MULADD(at[62], at[106]);    MULADD(at[63], at[105]); 
-   COMBA_STORE(C->dp[104]);
-   /* 105 */
-   COMBA_FORWARD;
-   MULADD(at[42], at[127]);    MULADD(at[43], at[126]);    MULADD(at[44], at[125]);    MULADD(at[45], at[124]);    MULADD(at[46], at[123]);    MULADD(at[47], at[122]);    MULADD(at[48], at[121]);    MULADD(at[49], at[120]);    MULADD(at[50], at[119]);    MULADD(at[51], at[118]);    MULADD(at[52], at[117]);    MULADD(at[53], at[116]);    MULADD(at[54], at[115]);    MULADD(at[55], at[114]);    MULADD(at[56], at[113]);    MULADD(at[57], at[112]);    MULADD(at[58], at[111]);    MULADD(at[59], at[110]);    MULADD(at[60], at[109]);    MULADD(at[61], at[108]);    MULADD(at[62], at[107]);    MULADD(at[63], at[106]); 
-   COMBA_STORE(C->dp[105]);
-   /* 106 */
-   COMBA_FORWARD;
-   MULADD(at[43], at[127]);    MULADD(at[44], at[126]);    MULADD(at[45], at[125]);    MULADD(at[46], at[124]);    MULADD(at[47], at[123]);    MULADD(at[48], at[122]);    MULADD(at[49], at[121]);    MULADD(at[50], at[120]);    MULADD(at[51], at[119]);    MULADD(at[52], at[118]);    MULADD(at[53], at[117]);    MULADD(at[54], at[116]);    MULADD(at[55], at[115]);    MULADD(at[56], at[114]);    MULADD(at[57], at[113]);    MULADD(at[58], at[112]);    MULADD(at[59], at[111]);    MULADD(at[60], at[110]);    MULADD(at[61], at[109]);    MULADD(at[62], at[108]);    MULADD(at[63], at[107]); 
-   COMBA_STORE(C->dp[106]);
-   /* 107 */
-   COMBA_FORWARD;
-   MULADD(at[44], at[127]);    MULADD(at[45], at[126]);    MULADD(at[46], at[125]);    MULADD(at[47], at[124]);    MULADD(at[48], at[123]);    MULADD(at[49], at[122]);    MULADD(at[50], at[121]);    MULADD(at[51], at[120]);    MULADD(at[52], at[119]);    MULADD(at[53], at[118]);    MULADD(at[54], at[117]);    MULADD(at[55], at[116]);    MULADD(at[56], at[115]);    MULADD(at[57], at[114]);    MULADD(at[58], at[113]);    MULADD(at[59], at[112]);    MULADD(at[60], at[111]);    MULADD(at[61], at[110]);    MULADD(at[62], at[109]);    MULADD(at[63], at[108]); 
-   COMBA_STORE(C->dp[107]);
-   /* 108 */
-   COMBA_FORWARD;
-   MULADD(at[45], at[127]);    MULADD(at[46], at[126]);    MULADD(at[47], at[125]);    MULADD(at[48], at[124]);    MULADD(at[49], at[123]);    MULADD(at[50], at[122]);    MULADD(at[51], at[121]);    MULADD(at[52], at[120]);    MULADD(at[53], at[119]);    MULADD(at[54], at[118]);    MULADD(at[55], at[117]);    MULADD(at[56], at[116]);    MULADD(at[57], at[115]);    MULADD(at[58], at[114]);    MULADD(at[59], at[113]);    MULADD(at[60], at[112]);    MULADD(at[61], at[111]);    MULADD(at[62], at[110]);    MULADD(at[63], at[109]); 
-   COMBA_STORE(C->dp[108]);
-   /* 109 */
-   COMBA_FORWARD;
-   MULADD(at[46], at[127]);    MULADD(at[47], at[126]);    MULADD(at[48], at[125]);    MULADD(at[49], at[124]);    MULADD(at[50], at[123]);    MULADD(at[51], at[122]);    MULADD(at[52], at[121]);    MULADD(at[53], at[120]);    MULADD(at[54], at[119]);    MULADD(at[55], at[118]);    MULADD(at[56], at[117]);    MULADD(at[57], at[116]);    MULADD(at[58], at[115]);    MULADD(at[59], at[114]);    MULADD(at[60], at[113]);    MULADD(at[61], at[112]);    MULADD(at[62], at[111]);    MULADD(at[63], at[110]); 
-   COMBA_STORE(C->dp[109]);
-   /* 110 */
-   COMBA_FORWARD;
-   MULADD(at[47], at[127]);    MULADD(at[48], at[126]);    MULADD(at[49], at[125]);    MULADD(at[50], at[124]);    MULADD(at[51], at[123]);    MULADD(at[52], at[122]);    MULADD(at[53], at[121]);    MULADD(at[54], at[120]);    MULADD(at[55], at[119]);    MULADD(at[56], at[118]);    MULADD(at[57], at[117]);    MULADD(at[58], at[116]);    MULADD(at[59], at[115]);    MULADD(at[60], at[114]);    MULADD(at[61], at[113]);    MULADD(at[62], at[112]);    MULADD(at[63], at[111]); 
-   COMBA_STORE(C->dp[110]);
-   /* 111 */
-   COMBA_FORWARD;
-   MULADD(at[48], at[127]);    MULADD(at[49], at[126]);    MULADD(at[50], at[125]);    MULADD(at[51], at[124]);    MULADD(at[52], at[123]);    MULADD(at[53], at[122]);    MULADD(at[54], at[121]);    MULADD(at[55], at[120]);    MULADD(at[56], at[119]);    MULADD(at[57], at[118]);    MULADD(at[58], at[117]);    MULADD(at[59], at[116]);    MULADD(at[60], at[115]);    MULADD(at[61], at[114]);    MULADD(at[62], at[113]);    MULADD(at[63], at[112]); 
-   COMBA_STORE(C->dp[111]);
-   /* 112 */
-   COMBA_FORWARD;
-   MULADD(at[49], at[127]);    MULADD(at[50], at[126]);    MULADD(at[51], at[125]);    MULADD(at[52], at[124]);    MULADD(at[53], at[123]);    MULADD(at[54], at[122]);    MULADD(at[55], at[121]);    MULADD(at[56], at[120]);    MULADD(at[57], at[119]);    MULADD(at[58], at[118]);    MULADD(at[59], at[117]);    MULADD(at[60], at[116]);    MULADD(at[61], at[115]);    MULADD(at[62], at[114]);    MULADD(at[63], at[113]); 
-   COMBA_STORE(C->dp[112]);
-   /* 113 */
-   COMBA_FORWARD;
-   MULADD(at[50], at[127]);    MULADD(at[51], at[126]);    MULADD(at[52], at[125]);    MULADD(at[53], at[124]);    MULADD(at[54], at[123]);    MULADD(at[55], at[122]);    MULADD(at[56], at[121]);    MULADD(at[57], at[120]);    MULADD(at[58], at[119]);    MULADD(at[59], at[118]);    MULADD(at[60], at[117]);    MULADD(at[61], at[116]);    MULADD(at[62], at[115]);    MULADD(at[63], at[114]); 
-   COMBA_STORE(C->dp[113]);
-   /* 114 */
-   COMBA_FORWARD;
-   MULADD(at[51], at[127]);    MULADD(at[52], at[126]);    MULADD(at[53], at[125]);    MULADD(at[54], at[124]);    MULADD(at[55], at[123]);    MULADD(at[56], at[122]);    MULADD(at[57], at[121]);    MULADD(at[58], at[120]);    MULADD(at[59], at[119]);    MULADD(at[60], at[118]);    MULADD(at[61], at[117]);    MULADD(at[62], at[116]);    MULADD(at[63], at[115]); 
-   COMBA_STORE(C->dp[114]);
-   /* 115 */
-   COMBA_FORWARD;
-   MULADD(at[52], at[127]);    MULADD(at[53], at[126]);    MULADD(at[54], at[125]);    MULADD(at[55], at[124]);    MULADD(at[56], at[123]);    MULADD(at[57], at[122]);    MULADD(at[58], at[121]);    MULADD(at[59], at[120]);    MULADD(at[60], at[119]);    MULADD(at[61], at[118]);    MULADD(at[62], at[117]);    MULADD(at[63], at[116]); 
-   COMBA_STORE(C->dp[115]);
-   /* 116 */
-   COMBA_FORWARD;
-   MULADD(at[53], at[127]);    MULADD(at[54], at[126]);    MULADD(at[55], at[125]);    MULADD(at[56], at[124]);    MULADD(at[57], at[123]);    MULADD(at[58], at[122]);    MULADD(at[59], at[121]);    MULADD(at[60], at[120]);    MULADD(at[61], at[119]);    MULADD(at[62], at[118]);    MULADD(at[63], at[117]); 
-   COMBA_STORE(C->dp[116]);
-   /* 117 */
-   COMBA_FORWARD;
-   MULADD(at[54], at[127]);    MULADD(at[55], at[126]);    MULADD(at[56], at[125]);    MULADD(at[57], at[124]);    MULADD(at[58], at[123]);    MULADD(at[59], at[122]);    MULADD(at[60], at[121]);    MULADD(at[61], at[120]);    MULADD(at[62], at[119]);    MULADD(at[63], at[118]); 
-   COMBA_STORE(C->dp[117]);
-   /* 118 */
-   COMBA_FORWARD;
-   MULADD(at[55], at[127]);    MULADD(at[56], at[126]);    MULADD(at[57], at[125]);    MULADD(at[58], at[124]);    MULADD(at[59], at[123]);    MULADD(at[60], at[122]);    MULADD(at[61], at[121]);    MULADD(at[62], at[120]);    MULADD(at[63], at[119]); 
-   COMBA_STORE(C->dp[118]);
-   /* 119 */
-   COMBA_FORWARD;
-   MULADD(at[56], at[127]);    MULADD(at[57], at[126]);    MULADD(at[58], at[125]);    MULADD(at[59], at[124]);    MULADD(at[60], at[123]);    MULADD(at[61], at[122]);    MULADD(at[62], at[121]);    MULADD(at[63], at[120]); 
-   COMBA_STORE(C->dp[119]);
-   /* 120 */
-   COMBA_FORWARD;
-   MULADD(at[57], at[127]);    MULADD(at[58], at[126]);    MULADD(at[59], at[125]);    MULADD(at[60], at[124]);    MULADD(at[61], at[123]);    MULADD(at[62], at[122]);    MULADD(at[63], at[121]); 
-   COMBA_STORE(C->dp[120]);
-   /* 121 */
-   COMBA_FORWARD;
-   MULADD(at[58], at[127]);    MULADD(at[59], at[126]);    MULADD(at[60], at[125]);    MULADD(at[61], at[124]);    MULADD(at[62], at[123]);    MULADD(at[63], at[122]); 
-   COMBA_STORE(C->dp[121]);
-   /* 122 */
-   COMBA_FORWARD;
-   MULADD(at[59], at[127]);    MULADD(at[60], at[126]);    MULADD(at[61], at[125]);    MULADD(at[62], at[124]);    MULADD(at[63], at[123]); 
-   COMBA_STORE(C->dp[122]);
-   /* 123 */
-   COMBA_FORWARD;
-   MULADD(at[60], at[127]);    MULADD(at[61], at[126]);    MULADD(at[62], at[125]);    MULADD(at[63], at[124]); 
-   COMBA_STORE(C->dp[123]);
-   /* 124 */
-   COMBA_FORWARD;
-   MULADD(at[61], at[127]);    MULADD(at[62], at[126]);    MULADD(at[63], at[125]); 
-   COMBA_STORE(C->dp[124]);
-   /* 125 */
-   COMBA_FORWARD;
-   MULADD(at[62], at[127]);    MULADD(at[63], at[126]); 
-   COMBA_STORE(C->dp[125]);
-   /* 126 */
-   COMBA_FORWARD;
-   MULADD(at[63], at[127]); 
-   COMBA_STORE(C->dp[126]);
-   COMBA_STORE2(C->dp[127]);
-   C->used = 128;
-   C->sign = A->sign ^ B->sign;
-   fp_clamp(C);
-   COMBA_FINI;
-
-#ifdef WOLFSSL_SMALL_STACK
-   XFREE(at, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-#endif
-   return FP_OKAY;
-}
-#endif

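The block above is the tail of one of the fully unrolled Comba multipliers: each /* k */ section computes output column k as the sum of all digit products at[i]*at[j] whose indices land in that column, stores the low digit with COMBA_STORE, and carries the remainder forward with COMBA_FORWARD. A minimal loop-form sketch of the same scheme follows, assuming 16-bit digits so that a single 64-bit accumulator can absorb every column carry; the wolfSSL code instead keeps full-width fp_digit values and the three-digit c0/c1/c2 accumulator.

    /* Loop-form sketch of the Comba multiplication these .i files unroll.
     * Assumption: 16-bit digits with one 64-bit accumulator wide enough
     * to absorb all column carries (the real code uses full-width digits
     * with the three-digit c0/c1/c2 accumulator instead). */
    #include <stdint.h>
    #include <stddef.h>

    typedef uint16_t digit;

    /* c[0..2n-1] = a[0..n-1] * b[0..n-1]; requires n >= 1 */
    static void comba_mul(digit *c, const digit *a, const digit *b, size_t n)
    {
        uint64_t acc = 0;                      /* running column sum + carry */
        for (size_t k = 0; k + 1 < 2 * n; k++) {
            size_t lo = (k < n) ? 0 : k - n + 1;
            size_t hi = (k < n) ? k : n - 1;
            for (size_t i = lo; i <= hi; i++)  /* the MULADD chain, column k */
                acc += (uint64_t)a[i] * b[k - i];
            c[k] = (digit)acc;                 /* COMBA_STORE: low digit out */
            acc >>= 16;                        /* COMBA_FORWARD: keep carry  */
        }
        c[2 * n - 1] = (digit)acc;             /* COMBA_STORE2: top digit    */
    }

Unrolling removes all of this loop control at the cost of code size, which is why each deleted .i file runs to hundreds or thousands of lines.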
+ 0 - 107
lib/wolfssl/wolfcrypt/src/fp_mul_comba_7.i

@@ -1,107 +0,0 @@
-/* fp_mul_comba_7.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#ifdef TFM_MUL7
-int fp_mul_comba7(fp_int *A, fp_int *B, fp_int *C)
-{
-   fp_digit c0, c1, c2;
-#ifndef WOLFSSL_SMALL_STACK
-   fp_digit at[14];
-#else
-   fp_digit *at;
-#endif
-
-#ifdef WOLFSSL_SMALL_STACK
-   at = (fp_digit*)XMALLOC(sizeof(fp_digit) * 14, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-   if (at == NULL)
-       return FP_MEM;
-#endif
-
-   XMEMCPY(at, A->dp, 7 * sizeof(fp_digit));
-   XMEMCPY(at+7, B->dp, 7 * sizeof(fp_digit));
-   COMBA_START;
-
-   COMBA_CLEAR;
-   /* 0 */
-   MULADD(at[0], at[7]); 
-   COMBA_STORE(C->dp[0]);
-   /* 1 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[8]);    MULADD(at[1], at[7]); 
-   COMBA_STORE(C->dp[1]);
-   /* 2 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[9]);    MULADD(at[1], at[8]);    MULADD(at[2], at[7]); 
-   COMBA_STORE(C->dp[2]);
-   /* 3 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[10]);    MULADD(at[1], at[9]);    MULADD(at[2], at[8]);    MULADD(at[3], at[7]); 
-   COMBA_STORE(C->dp[3]);
-   /* 4 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[11]);    MULADD(at[1], at[10]);    MULADD(at[2], at[9]);    MULADD(at[3], at[8]);    MULADD(at[4], at[7]); 
-   COMBA_STORE(C->dp[4]);
-   /* 5 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[12]);    MULADD(at[1], at[11]);    MULADD(at[2], at[10]);    MULADD(at[3], at[9]);    MULADD(at[4], at[8]);    MULADD(at[5], at[7]); 
-   COMBA_STORE(C->dp[5]);
-   /* 6 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[13]);    MULADD(at[1], at[12]);    MULADD(at[2], at[11]);    MULADD(at[3], at[10]);    MULADD(at[4], at[9]);    MULADD(at[5], at[8]);    MULADD(at[6], at[7]); 
-   COMBA_STORE(C->dp[6]);
-   /* 7 */
-   COMBA_FORWARD;
-   MULADD(at[1], at[13]);    MULADD(at[2], at[12]);    MULADD(at[3], at[11]);    MULADD(at[4], at[10]);    MULADD(at[5], at[9]);    MULADD(at[6], at[8]); 
-   COMBA_STORE(C->dp[7]);
-   /* 8 */
-   COMBA_FORWARD;
-   MULADD(at[2], at[13]);    MULADD(at[3], at[12]);    MULADD(at[4], at[11]);    MULADD(at[5], at[10]);    MULADD(at[6], at[9]); 
-   COMBA_STORE(C->dp[8]);
-   /* 9 */
-   COMBA_FORWARD;
-   MULADD(at[3], at[13]);    MULADD(at[4], at[12]);    MULADD(at[5], at[11]);    MULADD(at[6], at[10]); 
-   COMBA_STORE(C->dp[9]);
-   /* 10 */
-   COMBA_FORWARD;
-   MULADD(at[4], at[13]);    MULADD(at[5], at[12]);    MULADD(at[6], at[11]); 
-   COMBA_STORE(C->dp[10]);
-   /* 11 */
-   COMBA_FORWARD;
-   MULADD(at[5], at[13]);    MULADD(at[6], at[12]); 
-   COMBA_STORE(C->dp[11]);
-   /* 12 */
-   COMBA_FORWARD;
-   MULADD(at[6], at[13]); 
-   COMBA_STORE(C->dp[12]);
-   COMBA_STORE2(C->dp[13]);
-   C->used = 14;
-   C->sign = A->sign ^ B->sign;
-   fp_clamp(C);
-   COMBA_FINI;
-
-#ifdef WOLFSSL_SMALL_STACK
-   XFREE(at, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-#endif
-   return FP_OKAY;
-}
-#endif

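Each variant declares fp_digit c0, c1, c2 because, with full-width digits, a single double-wide word cannot hold an entire column sum; the carries are spread across three digits instead. The following is a hedged rendering of the portable macros, paraphrased from the TomsFastMath-style generic path rather than copied from the wolfSSL headers; fp_word is assumed to be twice as wide as fp_digit, with DIGIT_BIT bits per digit.

    /* Illustrative portable Comba accumulator macros (a paraphrase of the
     * TomsFastMath-style generic path, not the exact wolfSSL definitions).
     * Assumes fp_word has twice the bits of fp_digit (DIGIT_BIT of them). */
    #define COMBA_CLEAR     c0 = c1 = c2 = 0
    #define COMBA_FORWARD   do { c0 = c1; c1 = c2; c2 = 0; } while (0)
    #define COMBA_STORE(x)  (x) = c0           /* low digit of the column  */
    #define COMBA_STORE2(x) (x) = c1           /* final leftover carry     */
    #define MULADD(i, j)                                             \
        do {                                                         \
            fp_word t = (fp_word)c0 + (fp_word)(i) * (fp_word)(j);   \
            c0 = (fp_digit)t;                                        \
            t  = (fp_word)c1 + (t >> DIGIT_BIT);                     \
            c1 = (fp_digit)t;                                        \
            c2 += (fp_digit)(t >> DIGIT_BIT);                        \
        } while (0)

Three digits suffice: even the longest column in the 64-digit multiplier sums 64 products of at most (2^DIGIT_BIT - 1)^2 each, which cannot carry past c2. Note also the WOLFSSL_SMALL_STACK branch in every variant, which moves the at[] scratch buffer to the heap; that matters on stack-constrained embedded targets.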
+ 0 - 115
lib/wolfssl/wolfcrypt/src/fp_mul_comba_8.i

@@ -1,115 +0,0 @@
-/* fp_mul_comba_8.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#ifdef TFM_MUL8
-int fp_mul_comba8(fp_int *A, fp_int *B, fp_int *C)
-{
-   fp_digit c0, c1, c2;
-#ifndef WOLFSSL_SMALL_STACK
-   fp_digit at[16];
-#else
-   fp_digit *at;
-#endif
-
-#ifdef WOLFSSL_SMALL_STACK
-   at = (fp_digit*)XMALLOC(sizeof(fp_digit) * 16, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-   if (at == NULL)
-       return FP_MEM;
-#endif
-
-   XMEMCPY(at, A->dp, 8 * sizeof(fp_digit));
-   XMEMCPY(at+8, B->dp, 8 * sizeof(fp_digit));
-   COMBA_START;
-
-   COMBA_CLEAR;
-   /* 0 */
-   MULADD(at[0], at[8]); 
-   COMBA_STORE(C->dp[0]);
-   /* 1 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[9]);    MULADD(at[1], at[8]); 
-   COMBA_STORE(C->dp[1]);
-   /* 2 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[10]);    MULADD(at[1], at[9]);    MULADD(at[2], at[8]); 
-   COMBA_STORE(C->dp[2]);
-   /* 3 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[11]);    MULADD(at[1], at[10]);    MULADD(at[2], at[9]);    MULADD(at[3], at[8]); 
-   COMBA_STORE(C->dp[3]);
-   /* 4 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[12]);    MULADD(at[1], at[11]);    MULADD(at[2], at[10]);    MULADD(at[3], at[9]);    MULADD(at[4], at[8]); 
-   COMBA_STORE(C->dp[4]);
-   /* 5 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[13]);    MULADD(at[1], at[12]);    MULADD(at[2], at[11]);    MULADD(at[3], at[10]);    MULADD(at[4], at[9]);    MULADD(at[5], at[8]); 
-   COMBA_STORE(C->dp[5]);
-   /* 6 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[14]);    MULADD(at[1], at[13]);    MULADD(at[2], at[12]);    MULADD(at[3], at[11]);    MULADD(at[4], at[10]);    MULADD(at[5], at[9]);    MULADD(at[6], at[8]); 
-   COMBA_STORE(C->dp[6]);
-   /* 7 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[15]);    MULADD(at[1], at[14]);    MULADD(at[2], at[13]);    MULADD(at[3], at[12]);    MULADD(at[4], at[11]);    MULADD(at[5], at[10]);    MULADD(at[6], at[9]);    MULADD(at[7], at[8]); 
-   COMBA_STORE(C->dp[7]);
-   /* 8 */
-   COMBA_FORWARD;
-   MULADD(at[1], at[15]);    MULADD(at[2], at[14]);    MULADD(at[3], at[13]);    MULADD(at[4], at[12]);    MULADD(at[5], at[11]);    MULADD(at[6], at[10]);    MULADD(at[7], at[9]); 
-   COMBA_STORE(C->dp[8]);
-   /* 9 */
-   COMBA_FORWARD;
-   MULADD(at[2], at[15]);    MULADD(at[3], at[14]);    MULADD(at[4], at[13]);    MULADD(at[5], at[12]);    MULADD(at[6], at[11]);    MULADD(at[7], at[10]); 
-   COMBA_STORE(C->dp[9]);
-   /* 10 */
-   COMBA_FORWARD;
-   MULADD(at[3], at[15]);    MULADD(at[4], at[14]);    MULADD(at[5], at[13]);    MULADD(at[6], at[12]);    MULADD(at[7], at[11]); 
-   COMBA_STORE(C->dp[10]);
-   /* 11 */
-   COMBA_FORWARD;
-   MULADD(at[4], at[15]);    MULADD(at[5], at[14]);    MULADD(at[6], at[13]);    MULADD(at[7], at[12]); 
-   COMBA_STORE(C->dp[11]);
-   /* 12 */
-   COMBA_FORWARD;
-   MULADD(at[5], at[15]);    MULADD(at[6], at[14]);    MULADD(at[7], at[13]); 
-   COMBA_STORE(C->dp[12]);
-   /* 13 */
-   COMBA_FORWARD;
-   MULADD(at[6], at[15]);    MULADD(at[7], at[14]); 
-   COMBA_STORE(C->dp[13]);
-   /* 14 */
-   COMBA_FORWARD;
-   MULADD(at[7], at[15]); 
-   COMBA_STORE(C->dp[14]);
-   COMBA_STORE2(C->dp[15]);
-   C->used = 16;
-   C->sign = A->sign ^ B->sign;
-   fp_clamp(C);
-   COMBA_FINI;
-
-#ifdef WOLFSSL_SMALL_STACK
-   XFREE(at, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-#endif
-   return FP_OKAY;
-}
-#endif

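Counting the calls gives the cost of each unrolled body directly: for an n-digit multiply the code issues exactly

    \sum_{k=0}^{2n-2} \bigl( \min(k,\,n-1) - \max(0,\,k-n+1) + 1 \bigr) = n^2

MULADDs, plus 2n - 1 COMBA_STOREs and one COMBA_STORE2. The 8-digit body above spells out 1 + 2 + ... + 8 + ... + 2 + 1 = 64 MULADDs and fills all 16 output digits.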
+ 0 - 123
lib/wolfssl/wolfcrypt/src/fp_mul_comba_9.i

@@ -1,123 +0,0 @@
-/* fp_mul_comba_9.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#ifdef TFM_MUL9
-int fp_mul_comba9(fp_int *A, fp_int *B, fp_int *C)
-{
-   fp_digit c0, c1, c2;
-#ifndef WOLFSSL_SMALL_STACK
-   fp_digit at[18];
-#else
-   fp_digit *at;
-#endif
-
-#ifdef WOLFSSL_SMALL_STACK
-   at = (fp_digit*)XMALLOC(sizeof(fp_digit) * 18, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-   if (at == NULL)
-       return FP_MEM;
-#endif
-
-   XMEMCPY(at, A->dp, 9 * sizeof(fp_digit));
-   XMEMCPY(at+9, B->dp, 9 * sizeof(fp_digit));
-   COMBA_START;
-
-   COMBA_CLEAR;
-   /* 0 */
-   MULADD(at[0], at[9]); 
-   COMBA_STORE(C->dp[0]);
-   /* 1 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[10]);    MULADD(at[1], at[9]); 
-   COMBA_STORE(C->dp[1]);
-   /* 2 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[11]);    MULADD(at[1], at[10]);    MULADD(at[2], at[9]); 
-   COMBA_STORE(C->dp[2]);
-   /* 3 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[12]);    MULADD(at[1], at[11]);    MULADD(at[2], at[10]);    MULADD(at[3], at[9]); 
-   COMBA_STORE(C->dp[3]);
-   /* 4 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[13]);    MULADD(at[1], at[12]);    MULADD(at[2], at[11]);    MULADD(at[3], at[10]);    MULADD(at[4], at[9]); 
-   COMBA_STORE(C->dp[4]);
-   /* 5 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[14]);    MULADD(at[1], at[13]);    MULADD(at[2], at[12]);    MULADD(at[3], at[11]);    MULADD(at[4], at[10]);    MULADD(at[5], at[9]); 
-   COMBA_STORE(C->dp[5]);
-   /* 6 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[15]);    MULADD(at[1], at[14]);    MULADD(at[2], at[13]);    MULADD(at[3], at[12]);    MULADD(at[4], at[11]);    MULADD(at[5], at[10]);    MULADD(at[6], at[9]); 
-   COMBA_STORE(C->dp[6]);
-   /* 7 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[16]);    MULADD(at[1], at[15]);    MULADD(at[2], at[14]);    MULADD(at[3], at[13]);    MULADD(at[4], at[12]);    MULADD(at[5], at[11]);    MULADD(at[6], at[10]);    MULADD(at[7], at[9]); 
-   COMBA_STORE(C->dp[7]);
-   /* 8 */
-   COMBA_FORWARD;
-   MULADD(at[0], at[17]);    MULADD(at[1], at[16]);    MULADD(at[2], at[15]);    MULADD(at[3], at[14]);    MULADD(at[4], at[13]);    MULADD(at[5], at[12]);    MULADD(at[6], at[11]);    MULADD(at[7], at[10]);    MULADD(at[8], at[9]); 
-   COMBA_STORE(C->dp[8]);
-   /* 9 */
-   COMBA_FORWARD;
-   MULADD(at[1], at[17]);    MULADD(at[2], at[16]);    MULADD(at[3], at[15]);    MULADD(at[4], at[14]);    MULADD(at[5], at[13]);    MULADD(at[6], at[12]);    MULADD(at[7], at[11]);    MULADD(at[8], at[10]); 
-   COMBA_STORE(C->dp[9]);
-   /* 10 */
-   COMBA_FORWARD;
-   MULADD(at[2], at[17]);    MULADD(at[3], at[16]);    MULADD(at[4], at[15]);    MULADD(at[5], at[14]);    MULADD(at[6], at[13]);    MULADD(at[7], at[12]);    MULADD(at[8], at[11]); 
-   COMBA_STORE(C->dp[10]);
-   /* 11 */
-   COMBA_FORWARD;
-   MULADD(at[3], at[17]);    MULADD(at[4], at[16]);    MULADD(at[5], at[15]);    MULADD(at[6], at[14]);    MULADD(at[7], at[13]);    MULADD(at[8], at[12]); 
-   COMBA_STORE(C->dp[11]);
-   /* 12 */
-   COMBA_FORWARD;
-   MULADD(at[4], at[17]);    MULADD(at[5], at[16]);    MULADD(at[6], at[15]);    MULADD(at[7], at[14]);    MULADD(at[8], at[13]); 
-   COMBA_STORE(C->dp[12]);
-   /* 13 */
-   COMBA_FORWARD;
-   MULADD(at[5], at[17]);    MULADD(at[6], at[16]);    MULADD(at[7], at[15]);    MULADD(at[8], at[14]); 
-   COMBA_STORE(C->dp[13]);
-   /* 14 */
-   COMBA_FORWARD;
-   MULADD(at[6], at[17]);    MULADD(at[7], at[16]);    MULADD(at[8], at[15]); 
-   COMBA_STORE(C->dp[14]);
-   /* 15 */
-   COMBA_FORWARD;
-   MULADD(at[7], at[17]);    MULADD(at[8], at[16]); 
-   COMBA_STORE(C->dp[15]);
-   /* 16 */
-   COMBA_FORWARD;
-   MULADD(at[8], at[17]); 
-   COMBA_STORE(C->dp[16]);
-   COMBA_STORE2(C->dp[17]);
-   C->used = 18;
-   C->sign = A->sign ^ B->sign;
-   fp_clamp(C);
-   COMBA_FINI;
-
-#ifdef WOLFSSL_SMALL_STACK
-   XFREE(at, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-#endif
-   return FP_OKAY;
-}
-#endif

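The last and largest member of the set, fp_mul_comba_small_set.i below, bundles the unrolled bodies for operands of 1 through 16 digits (hence the at[32] scratch buffer) behind a single switch on MAX(A->used, B->used), so the matching body is selected by operand size at run time. In sketch form, with hypothetical helper names standing in for the cases that the real file inlines:

    /* Sketch of the size dispatch in fp_mul_comba_small. The helper names
     * are hypothetical; the real file inlines each unrolled body directly
     * inside its switch case. */
    int fp_mul_comba_small(fp_int *A, fp_int *B, fp_int *C)
    {
        switch (MAX(A->used, B->used)) {
        case 1:  return mul_comba_1(A, B, C);
        case 2:  return mul_comba_2(A, B, C);
        /* ... one case per operand size ... */
        case 16: return mul_comba_16(A, B, C);
        }
        return FP_OKAY; /* assumption: callers only route sizes 1..16 here */
    }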
+ 0 - 1268
lib/wolfssl/wolfcrypt/src/fp_mul_comba_small_set.i

@@ -1,1268 +0,0 @@
-/* fp_mul_comba_small_set.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#if defined(TFM_SMALL_SET)
-int fp_mul_comba_small(fp_int *A, fp_int *B, fp_int *C)
-{
-   fp_digit c0, c1, c2;
-#ifndef WOLFSSL_SMALL_STACK
-   fp_digit at[32];
-#else
-   fp_digit *at;
-#endif
-
-#ifdef WOLFSSL_SMALL_STACK
-   at = (fp_digit*)XMALLOC(sizeof(fp_digit) * 32, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-   if (at == NULL)
-       return FP_MEM;
-#endif
-
-   switch (MAX(A->used, B->used)) { 
-
-   case 1:
-      XMEMCPY(at, A->dp, 1 * sizeof(fp_digit));
-      XMEMCPY(at+1, B->dp, 1 * sizeof(fp_digit));
-      COMBA_START;
-
-      COMBA_CLEAR;
-      /* 0 */
-      MULADD(at[0], at[1]); 
-      COMBA_STORE(C->dp[0]);
-      COMBA_STORE2(C->dp[1]);
-      C->used = 2;
-      C->sign = A->sign ^ B->sign;
-      fp_clamp(C);
-      COMBA_FINI;
-      break;
-
-   case 2:
-      XMEMCPY(at, A->dp, 2 * sizeof(fp_digit));
-      XMEMCPY(at+2, B->dp, 2 * sizeof(fp_digit));
-      COMBA_START;
-
-      COMBA_CLEAR;
-      /* 0 */
-      MULADD(at[0], at[2]); 
-      COMBA_STORE(C->dp[0]);
-      /* 1 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[3]);       MULADD(at[1], at[2]); 
-      COMBA_STORE(C->dp[1]);
-      /* 2 */
-      COMBA_FORWARD;
-      MULADD(at[1], at[3]); 
-      COMBA_STORE(C->dp[2]);
-      COMBA_STORE2(C->dp[3]);
-      C->used = 4;
-      C->sign = A->sign ^ B->sign;
-      fp_clamp(C);
-      COMBA_FINI;
-      break;
-
-   case 3:
-      XMEMCPY(at, A->dp, 3 * sizeof(fp_digit));
-      XMEMCPY(at+3, B->dp, 3 * sizeof(fp_digit));
-      COMBA_START;
-
-      COMBA_CLEAR;
-      /* 0 */
-      MULADD(at[0], at[3]); 
-      COMBA_STORE(C->dp[0]);
-      /* 1 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[4]);       MULADD(at[1], at[3]); 
-      COMBA_STORE(C->dp[1]);
-      /* 2 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[5]);       MULADD(at[1], at[4]);       MULADD(at[2], at[3]); 
-      COMBA_STORE(C->dp[2]);
-      /* 3 */
-      COMBA_FORWARD;
-      MULADD(at[1], at[5]);       MULADD(at[2], at[4]); 
-      COMBA_STORE(C->dp[3]);
-      /* 4 */
-      COMBA_FORWARD;
-      MULADD(at[2], at[5]); 
-      COMBA_STORE(C->dp[4]);
-      COMBA_STORE2(C->dp[5]);
-      C->used = 6;
-      C->sign = A->sign ^ B->sign;
-      fp_clamp(C);
-      COMBA_FINI;
-      break;
-
-   case 4:
-      XMEMCPY(at, A->dp, 4 * sizeof(fp_digit));
-      XMEMCPY(at+4, B->dp, 4 * sizeof(fp_digit));
-      COMBA_START;
-
-      COMBA_CLEAR;
-      /* 0 */
-      MULADD(at[0], at[4]); 
-      COMBA_STORE(C->dp[0]);
-      /* 1 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[5]);       MULADD(at[1], at[4]); 
-      COMBA_STORE(C->dp[1]);
-      /* 2 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[6]);       MULADD(at[1], at[5]);       MULADD(at[2], at[4]); 
-      COMBA_STORE(C->dp[2]);
-      /* 3 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[7]);       MULADD(at[1], at[6]);       MULADD(at[2], at[5]);       MULADD(at[3], at[4]); 
-      COMBA_STORE(C->dp[3]);
-      /* 4 */
-      COMBA_FORWARD;
-      MULADD(at[1], at[7]);       MULADD(at[2], at[6]);       MULADD(at[3], at[5]); 
-      COMBA_STORE(C->dp[4]);
-      /* 5 */
-      COMBA_FORWARD;
-      MULADD(at[2], at[7]);       MULADD(at[3], at[6]); 
-      COMBA_STORE(C->dp[5]);
-      /* 6 */
-      COMBA_FORWARD;
-      MULADD(at[3], at[7]); 
-      COMBA_STORE(C->dp[6]);
-      COMBA_STORE2(C->dp[7]);
-      C->used = 8;
-      C->sign = A->sign ^ B->sign;
-      fp_clamp(C);
-      COMBA_FINI;
-      break;
-
-   case 5:
-      XMEMCPY(at, A->dp, 5 * sizeof(fp_digit));
-      XMEMCPY(at+5, B->dp, 5 * sizeof(fp_digit));
-      COMBA_START;
-
-      COMBA_CLEAR;
-      /* 0 */
-      MULADD(at[0], at[5]); 
-      COMBA_STORE(C->dp[0]);
-      /* 1 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[6]);       MULADD(at[1], at[5]); 
-      COMBA_STORE(C->dp[1]);
-      /* 2 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[7]);       MULADD(at[1], at[6]);       MULADD(at[2], at[5]); 
-      COMBA_STORE(C->dp[2]);
-      /* 3 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[8]);       MULADD(at[1], at[7]);       MULADD(at[2], at[6]);       MULADD(at[3], at[5]); 
-      COMBA_STORE(C->dp[3]);
-      /* 4 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[9]);       MULADD(at[1], at[8]);       MULADD(at[2], at[7]);       MULADD(at[3], at[6]);       MULADD(at[4], at[5]); 
-      COMBA_STORE(C->dp[4]);
-      /* 5 */
-      COMBA_FORWARD;
-      MULADD(at[1], at[9]);       MULADD(at[2], at[8]);       MULADD(at[3], at[7]);       MULADD(at[4], at[6]); 
-      COMBA_STORE(C->dp[5]);
-      /* 6 */
-      COMBA_FORWARD;
-      MULADD(at[2], at[9]);       MULADD(at[3], at[8]);       MULADD(at[4], at[7]); 
-      COMBA_STORE(C->dp[6]);
-      /* 7 */
-      COMBA_FORWARD;
-      MULADD(at[3], at[9]);       MULADD(at[4], at[8]); 
-      COMBA_STORE(C->dp[7]);
-      /* 8 */
-      COMBA_FORWARD;
-      MULADD(at[4], at[9]); 
-      COMBA_STORE(C->dp[8]);
-      COMBA_STORE2(C->dp[9]);
-      C->used = 10;
-      C->sign = A->sign ^ B->sign;
-      fp_clamp(C);
-      COMBA_FINI;
-      break;
-
-   case 6:
-      XMEMCPY(at, A->dp, 6 * sizeof(fp_digit));
-      XMEMCPY(at+6, B->dp, 6 * sizeof(fp_digit));
-      COMBA_START;
-
-      COMBA_CLEAR;
-      /* 0 */
-      MULADD(at[0], at[6]); 
-      COMBA_STORE(C->dp[0]);
-      /* 1 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[7]);       MULADD(at[1], at[6]); 
-      COMBA_STORE(C->dp[1]);
-      /* 2 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[8]);       MULADD(at[1], at[7]);       MULADD(at[2], at[6]); 
-      COMBA_STORE(C->dp[2]);
-      /* 3 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[9]);       MULADD(at[1], at[8]);       MULADD(at[2], at[7]);       MULADD(at[3], at[6]); 
-      COMBA_STORE(C->dp[3]);
-      /* 4 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[10]);       MULADD(at[1], at[9]);       MULADD(at[2], at[8]);       MULADD(at[3], at[7]);       MULADD(at[4], at[6]); 
-      COMBA_STORE(C->dp[4]);
-      /* 5 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[11]);       MULADD(at[1], at[10]);       MULADD(at[2], at[9]);       MULADD(at[3], at[8]);       MULADD(at[4], at[7]);       MULADD(at[5], at[6]); 
-      COMBA_STORE(C->dp[5]);
-      /* 6 */
-      COMBA_FORWARD;
-      MULADD(at[1], at[11]);       MULADD(at[2], at[10]);       MULADD(at[3], at[9]);       MULADD(at[4], at[8]);       MULADD(at[5], at[7]); 
-      COMBA_STORE(C->dp[6]);
-      /* 7 */
-      COMBA_FORWARD;
-      MULADD(at[2], at[11]);       MULADD(at[3], at[10]);       MULADD(at[4], at[9]);       MULADD(at[5], at[8]); 
-      COMBA_STORE(C->dp[7]);
-      /* 8 */
-      COMBA_FORWARD;
-      MULADD(at[3], at[11]);       MULADD(at[4], at[10]);       MULADD(at[5], at[9]); 
-      COMBA_STORE(C->dp[8]);
-      /* 9 */
-      COMBA_FORWARD;
-      MULADD(at[4], at[11]);       MULADD(at[5], at[10]); 
-      COMBA_STORE(C->dp[9]);
-      /* 10 */
-      COMBA_FORWARD;
-      MULADD(at[5], at[11]); 
-      COMBA_STORE(C->dp[10]);
-      COMBA_STORE2(C->dp[11]);
-      C->used = 12;
-      C->sign = A->sign ^ B->sign;
-      fp_clamp(C);
-      COMBA_FINI;
-      break;
-
-   case 7:
-      XMEMCPY(at, A->dp, 7 * sizeof(fp_digit));
-      XMEMCPY(at+7, B->dp, 7 * sizeof(fp_digit));
-      COMBA_START;
-
-      COMBA_CLEAR;
-      /* 0 */
-      MULADD(at[0], at[7]); 
-      COMBA_STORE(C->dp[0]);
-      /* 1 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[8]);       MULADD(at[1], at[7]); 
-      COMBA_STORE(C->dp[1]);
-      /* 2 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[9]);       MULADD(at[1], at[8]);       MULADD(at[2], at[7]); 
-      COMBA_STORE(C->dp[2]);
-      /* 3 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[10]);       MULADD(at[1], at[9]);       MULADD(at[2], at[8]);       MULADD(at[3], at[7]); 
-      COMBA_STORE(C->dp[3]);
-      /* 4 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[11]);       MULADD(at[1], at[10]);       MULADD(at[2], at[9]);       MULADD(at[3], at[8]);       MULADD(at[4], at[7]); 
-      COMBA_STORE(C->dp[4]);
-      /* 5 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[12]);       MULADD(at[1], at[11]);       MULADD(at[2], at[10]);       MULADD(at[3], at[9]);       MULADD(at[4], at[8]);       MULADD(at[5], at[7]); 
-      COMBA_STORE(C->dp[5]);
-      /* 6 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[13]);       MULADD(at[1], at[12]);       MULADD(at[2], at[11]);       MULADD(at[3], at[10]);       MULADD(at[4], at[9]);       MULADD(at[5], at[8]);       MULADD(at[6], at[7]); 
-      COMBA_STORE(C->dp[6]);
-      /* 7 */
-      COMBA_FORWARD;
-      MULADD(at[1], at[13]);       MULADD(at[2], at[12]);       MULADD(at[3], at[11]);       MULADD(at[4], at[10]);       MULADD(at[5], at[9]);       MULADD(at[6], at[8]); 
-      COMBA_STORE(C->dp[7]);
-      /* 8 */
-      COMBA_FORWARD;
-      MULADD(at[2], at[13]);       MULADD(at[3], at[12]);       MULADD(at[4], at[11]);       MULADD(at[5], at[10]);       MULADD(at[6], at[9]); 
-      COMBA_STORE(C->dp[8]);
-      /* 9 */
-      COMBA_FORWARD;
-      MULADD(at[3], at[13]);       MULADD(at[4], at[12]);       MULADD(at[5], at[11]);       MULADD(at[6], at[10]); 
-      COMBA_STORE(C->dp[9]);
-      /* 10 */
-      COMBA_FORWARD;
-      MULADD(at[4], at[13]);       MULADD(at[5], at[12]);       MULADD(at[6], at[11]); 
-      COMBA_STORE(C->dp[10]);
-      /* 11 */
-      COMBA_FORWARD;
-      MULADD(at[5], at[13]);       MULADD(at[6], at[12]); 
-      COMBA_STORE(C->dp[11]);
-      /* 12 */
-      COMBA_FORWARD;
-      MULADD(at[6], at[13]); 
-      COMBA_STORE(C->dp[12]);
-      COMBA_STORE2(C->dp[13]);
-      C->used = 14;
-      C->sign = A->sign ^ B->sign;
-      fp_clamp(C);
-      COMBA_FINI;
-      break;
-
-   case 8:
-      XMEMCPY(at, A->dp, 8 * sizeof(fp_digit));
-      XMEMCPY(at+8, B->dp, 8 * sizeof(fp_digit));
-      COMBA_START;
-
-      COMBA_CLEAR;
-      /* 0 */
-      MULADD(at[0], at[8]); 
-      COMBA_STORE(C->dp[0]);
-      /* 1 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[9]);       MULADD(at[1], at[8]); 
-      COMBA_STORE(C->dp[1]);
-      /* 2 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[10]);       MULADD(at[1], at[9]);       MULADD(at[2], at[8]); 
-      COMBA_STORE(C->dp[2]);
-      /* 3 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[11]);       MULADD(at[1], at[10]);       MULADD(at[2], at[9]);       MULADD(at[3], at[8]); 
-      COMBA_STORE(C->dp[3]);
-      /* 4 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[12]);       MULADD(at[1], at[11]);       MULADD(at[2], at[10]);       MULADD(at[3], at[9]);       MULADD(at[4], at[8]); 
-      COMBA_STORE(C->dp[4]);
-      /* 5 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[13]);       MULADD(at[1], at[12]);       MULADD(at[2], at[11]);       MULADD(at[3], at[10]);       MULADD(at[4], at[9]);       MULADD(at[5], at[8]); 
-      COMBA_STORE(C->dp[5]);
-      /* 6 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[14]);       MULADD(at[1], at[13]);       MULADD(at[2], at[12]);       MULADD(at[3], at[11]);       MULADD(at[4], at[10]);       MULADD(at[5], at[9]);       MULADD(at[6], at[8]); 
-      COMBA_STORE(C->dp[6]);
-      /* 7 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[15]);       MULADD(at[1], at[14]);       MULADD(at[2], at[13]);       MULADD(at[3], at[12]);       MULADD(at[4], at[11]);       MULADD(at[5], at[10]);       MULADD(at[6], at[9]);       MULADD(at[7], at[8]); 
-      COMBA_STORE(C->dp[7]);
-      /* 8 */
-      COMBA_FORWARD;
-      MULADD(at[1], at[15]);       MULADD(at[2], at[14]);       MULADD(at[3], at[13]);       MULADD(at[4], at[12]);       MULADD(at[5], at[11]);       MULADD(at[6], at[10]);       MULADD(at[7], at[9]); 
-      COMBA_STORE(C->dp[8]);
-      /* 9 */
-      COMBA_FORWARD;
-      MULADD(at[2], at[15]);       MULADD(at[3], at[14]);       MULADD(at[4], at[13]);       MULADD(at[5], at[12]);       MULADD(at[6], at[11]);       MULADD(at[7], at[10]); 
-      COMBA_STORE(C->dp[9]);
-      /* 10 */
-      COMBA_FORWARD;
-      MULADD(at[3], at[15]);       MULADD(at[4], at[14]);       MULADD(at[5], at[13]);       MULADD(at[6], at[12]);       MULADD(at[7], at[11]); 
-      COMBA_STORE(C->dp[10]);
-      /* 11 */
-      COMBA_FORWARD;
-      MULADD(at[4], at[15]);       MULADD(at[5], at[14]);       MULADD(at[6], at[13]);       MULADD(at[7], at[12]); 
-      COMBA_STORE(C->dp[11]);
-      /* 12 */
-      COMBA_FORWARD;
-      MULADD(at[5], at[15]);       MULADD(at[6], at[14]);       MULADD(at[7], at[13]); 
-      COMBA_STORE(C->dp[12]);
-      /* 13 */
-      COMBA_FORWARD;
-      MULADD(at[6], at[15]);       MULADD(at[7], at[14]); 
-      COMBA_STORE(C->dp[13]);
-      /* 14 */
-      COMBA_FORWARD;
-      MULADD(at[7], at[15]); 
-      COMBA_STORE(C->dp[14]);
-      COMBA_STORE2(C->dp[15]);
-      C->used = 16;
-      C->sign = A->sign ^ B->sign;
-      fp_clamp(C);
-      COMBA_FINI;
-      break;
-
-   case 9:
-      XMEMCPY(at, A->dp, 9 * sizeof(fp_digit));
-      XMEMCPY(at+9, B->dp, 9 * sizeof(fp_digit));
-      COMBA_START;
-
-      COMBA_CLEAR;
-      /* 0 */
-      MULADD(at[0], at[9]); 
-      COMBA_STORE(C->dp[0]);
-      /* 1 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[10]);       MULADD(at[1], at[9]); 
-      COMBA_STORE(C->dp[1]);
-      /* 2 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[11]);       MULADD(at[1], at[10]);       MULADD(at[2], at[9]); 
-      COMBA_STORE(C->dp[2]);
-      /* 3 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[12]);       MULADD(at[1], at[11]);       MULADD(at[2], at[10]);       MULADD(at[3], at[9]); 
-      COMBA_STORE(C->dp[3]);
-      /* 4 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[13]);       MULADD(at[1], at[12]);       MULADD(at[2], at[11]);       MULADD(at[3], at[10]);       MULADD(at[4], at[9]); 
-      COMBA_STORE(C->dp[4]);
-      /* 5 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[14]);       MULADD(at[1], at[13]);       MULADD(at[2], at[12]);       MULADD(at[3], at[11]);       MULADD(at[4], at[10]);       MULADD(at[5], at[9]); 
-      COMBA_STORE(C->dp[5]);
-      /* 6 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[15]);       MULADD(at[1], at[14]);       MULADD(at[2], at[13]);       MULADD(at[3], at[12]);       MULADD(at[4], at[11]);       MULADD(at[5], at[10]);       MULADD(at[6], at[9]); 
-      COMBA_STORE(C->dp[6]);
-      /* 7 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[16]);       MULADD(at[1], at[15]);       MULADD(at[2], at[14]);       MULADD(at[3], at[13]);       MULADD(at[4], at[12]);       MULADD(at[5], at[11]);       MULADD(at[6], at[10]);       MULADD(at[7], at[9]); 
-      COMBA_STORE(C->dp[7]);
-      /* 8 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[17]);       MULADD(at[1], at[16]);       MULADD(at[2], at[15]);       MULADD(at[3], at[14]);       MULADD(at[4], at[13]);       MULADD(at[5], at[12]);       MULADD(at[6], at[11]);       MULADD(at[7], at[10]);       MULADD(at[8], at[9]); 
-      COMBA_STORE(C->dp[8]);
-      /* 9 */
-      COMBA_FORWARD;
-      MULADD(at[1], at[17]);       MULADD(at[2], at[16]);       MULADD(at[3], at[15]);       MULADD(at[4], at[14]);       MULADD(at[5], at[13]);       MULADD(at[6], at[12]);       MULADD(at[7], at[11]);       MULADD(at[8], at[10]); 
-      COMBA_STORE(C->dp[9]);
-      /* 10 */
-      COMBA_FORWARD;
-      MULADD(at[2], at[17]);       MULADD(at[3], at[16]);       MULADD(at[4], at[15]);       MULADD(at[5], at[14]);       MULADD(at[6], at[13]);       MULADD(at[7], at[12]);       MULADD(at[8], at[11]); 
-      COMBA_STORE(C->dp[10]);
-      /* 11 */
-      COMBA_FORWARD;
-      MULADD(at[3], at[17]);       MULADD(at[4], at[16]);       MULADD(at[5], at[15]);       MULADD(at[6], at[14]);       MULADD(at[7], at[13]);       MULADD(at[8], at[12]); 
-      COMBA_STORE(C->dp[11]);
-      /* 12 */
-      COMBA_FORWARD;
-      MULADD(at[4], at[17]);       MULADD(at[5], at[16]);       MULADD(at[6], at[15]);       MULADD(at[7], at[14]);       MULADD(at[8], at[13]); 
-      COMBA_STORE(C->dp[12]);
-      /* 13 */
-      COMBA_FORWARD;
-      MULADD(at[5], at[17]);       MULADD(at[6], at[16]);       MULADD(at[7], at[15]);       MULADD(at[8], at[14]); 
-      COMBA_STORE(C->dp[13]);
-      /* 14 */
-      COMBA_FORWARD;
-      MULADD(at[6], at[17]);       MULADD(at[7], at[16]);       MULADD(at[8], at[15]); 
-      COMBA_STORE(C->dp[14]);
-      /* 15 */
-      COMBA_FORWARD;
-      MULADD(at[7], at[17]);       MULADD(at[8], at[16]); 
-      COMBA_STORE(C->dp[15]);
-      /* 16 */
-      COMBA_FORWARD;
-      MULADD(at[8], at[17]); 
-      COMBA_STORE(C->dp[16]);
-      COMBA_STORE2(C->dp[17]);
-      C->used = 18;
-      C->sign = A->sign ^ B->sign;
-      fp_clamp(C);
-      COMBA_FINI;
-      break;
-
-   case 10:
-      XMEMCPY(at, A->dp, 10 * sizeof(fp_digit));
-      XMEMCPY(at+10, B->dp, 10 * sizeof(fp_digit));
-      COMBA_START;
-
-      COMBA_CLEAR;
-      /* 0 */
-      MULADD(at[0], at[10]); 
-      COMBA_STORE(C->dp[0]);
-      /* 1 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[11]);       MULADD(at[1], at[10]); 
-      COMBA_STORE(C->dp[1]);
-      /* 2 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[12]);       MULADD(at[1], at[11]);       MULADD(at[2], at[10]); 
-      COMBA_STORE(C->dp[2]);
-      /* 3 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[13]);       MULADD(at[1], at[12]);       MULADD(at[2], at[11]);       MULADD(at[3], at[10]); 
-      COMBA_STORE(C->dp[3]);
-      /* 4 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[14]);       MULADD(at[1], at[13]);       MULADD(at[2], at[12]);       MULADD(at[3], at[11]);       MULADD(at[4], at[10]); 
-      COMBA_STORE(C->dp[4]);
-      /* 5 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[15]);       MULADD(at[1], at[14]);       MULADD(at[2], at[13]);       MULADD(at[3], at[12]);       MULADD(at[4], at[11]);       MULADD(at[5], at[10]); 
-      COMBA_STORE(C->dp[5]);
-      /* 6 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[16]);       MULADD(at[1], at[15]);       MULADD(at[2], at[14]);       MULADD(at[3], at[13]);       MULADD(at[4], at[12]);       MULADD(at[5], at[11]);       MULADD(at[6], at[10]); 
-      COMBA_STORE(C->dp[6]);
-      /* 7 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[17]);       MULADD(at[1], at[16]);       MULADD(at[2], at[15]);       MULADD(at[3], at[14]);       MULADD(at[4], at[13]);       MULADD(at[5], at[12]);       MULADD(at[6], at[11]);       MULADD(at[7], at[10]); 
-      COMBA_STORE(C->dp[7]);
-      /* 8 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[18]);       MULADD(at[1], at[17]);       MULADD(at[2], at[16]);       MULADD(at[3], at[15]);       MULADD(at[4], at[14]);       MULADD(at[5], at[13]);       MULADD(at[6], at[12]);       MULADD(at[7], at[11]);       MULADD(at[8], at[10]); 
-      COMBA_STORE(C->dp[8]);
-      /* 9 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[19]);       MULADD(at[1], at[18]);       MULADD(at[2], at[17]);       MULADD(at[3], at[16]);       MULADD(at[4], at[15]);       MULADD(at[5], at[14]);       MULADD(at[6], at[13]);       MULADD(at[7], at[12]);       MULADD(at[8], at[11]);       MULADD(at[9], at[10]); 
-      COMBA_STORE(C->dp[9]);
-      /* 10 */
-      COMBA_FORWARD;
-      MULADD(at[1], at[19]);       MULADD(at[2], at[18]);       MULADD(at[3], at[17]);       MULADD(at[4], at[16]);       MULADD(at[5], at[15]);       MULADD(at[6], at[14]);       MULADD(at[7], at[13]);       MULADD(at[8], at[12]);       MULADD(at[9], at[11]); 
-      COMBA_STORE(C->dp[10]);
-      /* 11 */
-      COMBA_FORWARD;
-      MULADD(at[2], at[19]);       MULADD(at[3], at[18]);       MULADD(at[4], at[17]);       MULADD(at[5], at[16]);       MULADD(at[6], at[15]);       MULADD(at[7], at[14]);       MULADD(at[8], at[13]);       MULADD(at[9], at[12]); 
-      COMBA_STORE(C->dp[11]);
-      /* 12 */
-      COMBA_FORWARD;
-      MULADD(at[3], at[19]);       MULADD(at[4], at[18]);       MULADD(at[5], at[17]);       MULADD(at[6], at[16]);       MULADD(at[7], at[15]);       MULADD(at[8], at[14]);       MULADD(at[9], at[13]); 
-      COMBA_STORE(C->dp[12]);
-      /* 13 */
-      COMBA_FORWARD;
-      MULADD(at[4], at[19]);       MULADD(at[5], at[18]);       MULADD(at[6], at[17]);       MULADD(at[7], at[16]);       MULADD(at[8], at[15]);       MULADD(at[9], at[14]); 
-      COMBA_STORE(C->dp[13]);
-      /* 14 */
-      COMBA_FORWARD;
-      MULADD(at[5], at[19]);       MULADD(at[6], at[18]);       MULADD(at[7], at[17]);       MULADD(at[8], at[16]);       MULADD(at[9], at[15]); 
-      COMBA_STORE(C->dp[14]);
-      /* 15 */
-      COMBA_FORWARD;
-      MULADD(at[6], at[19]);       MULADD(at[7], at[18]);       MULADD(at[8], at[17]);       MULADD(at[9], at[16]); 
-      COMBA_STORE(C->dp[15]);
-      /* 16 */
-      COMBA_FORWARD;
-      MULADD(at[7], at[19]);       MULADD(at[8], at[18]);       MULADD(at[9], at[17]); 
-      COMBA_STORE(C->dp[16]);
-      /* 17 */
-      COMBA_FORWARD;
-      MULADD(at[8], at[19]);       MULADD(at[9], at[18]); 
-      COMBA_STORE(C->dp[17]);
-      /* 18 */
-      COMBA_FORWARD;
-      MULADD(at[9], at[19]); 
-      COMBA_STORE(C->dp[18]);
-      COMBA_STORE2(C->dp[19]);
-      C->used = 20;
-      C->sign = A->sign ^ B->sign;
-      fp_clamp(C);
-      COMBA_FINI;
-      break;
-
-   case 11:
-      XMEMCPY(at, A->dp, 11 * sizeof(fp_digit));
-      XMEMCPY(at+11, B->dp, 11 * sizeof(fp_digit));
-      COMBA_START;
-
-      COMBA_CLEAR;
-      /* 0 */
-      MULADD(at[0], at[11]); 
-      COMBA_STORE(C->dp[0]);
-      /* 1 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[12]);       MULADD(at[1], at[11]); 
-      COMBA_STORE(C->dp[1]);
-      /* 2 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[13]);       MULADD(at[1], at[12]);       MULADD(at[2], at[11]); 
-      COMBA_STORE(C->dp[2]);
-      /* 3 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[14]);       MULADD(at[1], at[13]);       MULADD(at[2], at[12]);       MULADD(at[3], at[11]); 
-      COMBA_STORE(C->dp[3]);
-      /* 4 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[15]);       MULADD(at[1], at[14]);       MULADD(at[2], at[13]);       MULADD(at[3], at[12]);       MULADD(at[4], at[11]); 
-      COMBA_STORE(C->dp[4]);
-      /* 5 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[16]);       MULADD(at[1], at[15]);       MULADD(at[2], at[14]);       MULADD(at[3], at[13]);       MULADD(at[4], at[12]);       MULADD(at[5], at[11]); 
-      COMBA_STORE(C->dp[5]);
-      /* 6 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[17]);       MULADD(at[1], at[16]);       MULADD(at[2], at[15]);       MULADD(at[3], at[14]);       MULADD(at[4], at[13]);       MULADD(at[5], at[12]);       MULADD(at[6], at[11]); 
-      COMBA_STORE(C->dp[6]);
-      /* 7 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[18]);       MULADD(at[1], at[17]);       MULADD(at[2], at[16]);       MULADD(at[3], at[15]);       MULADD(at[4], at[14]);       MULADD(at[5], at[13]);       MULADD(at[6], at[12]);       MULADD(at[7], at[11]); 
-      COMBA_STORE(C->dp[7]);
-      /* 8 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[19]);       MULADD(at[1], at[18]);       MULADD(at[2], at[17]);       MULADD(at[3], at[16]);       MULADD(at[4], at[15]);       MULADD(at[5], at[14]);       MULADD(at[6], at[13]);       MULADD(at[7], at[12]);       MULADD(at[8], at[11]); 
-      COMBA_STORE(C->dp[8]);
-      /* 9 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[20]);       MULADD(at[1], at[19]);       MULADD(at[2], at[18]);       MULADD(at[3], at[17]);       MULADD(at[4], at[16]);       MULADD(at[5], at[15]);       MULADD(at[6], at[14]);       MULADD(at[7], at[13]);       MULADD(at[8], at[12]);       MULADD(at[9], at[11]); 
-      COMBA_STORE(C->dp[9]);
-      /* 10 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[21]);       MULADD(at[1], at[20]);       MULADD(at[2], at[19]);       MULADD(at[3], at[18]);       MULADD(at[4], at[17]);       MULADD(at[5], at[16]);       MULADD(at[6], at[15]);       MULADD(at[7], at[14]);       MULADD(at[8], at[13]);       MULADD(at[9], at[12]);       MULADD(at[10], at[11]); 
-      COMBA_STORE(C->dp[10]);
-      /* 11 */
-      COMBA_FORWARD;
-      MULADD(at[1], at[21]);       MULADD(at[2], at[20]);       MULADD(at[3], at[19]);       MULADD(at[4], at[18]);       MULADD(at[5], at[17]);       MULADD(at[6], at[16]);       MULADD(at[7], at[15]);       MULADD(at[8], at[14]);       MULADD(at[9], at[13]);       MULADD(at[10], at[12]); 
-      COMBA_STORE(C->dp[11]);
-      /* 12 */
-      COMBA_FORWARD;
-      MULADD(at[2], at[21]);       MULADD(at[3], at[20]);       MULADD(at[4], at[19]);       MULADD(at[5], at[18]);       MULADD(at[6], at[17]);       MULADD(at[7], at[16]);       MULADD(at[8], at[15]);       MULADD(at[9], at[14]);       MULADD(at[10], at[13]); 
-      COMBA_STORE(C->dp[12]);
-      /* 13 */
-      COMBA_FORWARD;
-      MULADD(at[3], at[21]);       MULADD(at[4], at[20]);       MULADD(at[5], at[19]);       MULADD(at[6], at[18]);       MULADD(at[7], at[17]);       MULADD(at[8], at[16]);       MULADD(at[9], at[15]);       MULADD(at[10], at[14]); 
-      COMBA_STORE(C->dp[13]);
-      /* 14 */
-      COMBA_FORWARD;
-      MULADD(at[4], at[21]);       MULADD(at[5], at[20]);       MULADD(at[6], at[19]);       MULADD(at[7], at[18]);       MULADD(at[8], at[17]);       MULADD(at[9], at[16]);       MULADD(at[10], at[15]); 
-      COMBA_STORE(C->dp[14]);
-      /* 15 */
-      COMBA_FORWARD;
-      MULADD(at[5], at[21]);       MULADD(at[6], at[20]);       MULADD(at[7], at[19]);       MULADD(at[8], at[18]);       MULADD(at[9], at[17]);       MULADD(at[10], at[16]); 
-      COMBA_STORE(C->dp[15]);
-      /* 16 */
-      COMBA_FORWARD;
-      MULADD(at[6], at[21]);       MULADD(at[7], at[20]);       MULADD(at[8], at[19]);       MULADD(at[9], at[18]);       MULADD(at[10], at[17]); 
-      COMBA_STORE(C->dp[16]);
-      /* 17 */
-      COMBA_FORWARD;
-      MULADD(at[7], at[21]);       MULADD(at[8], at[20]);       MULADD(at[9], at[19]);       MULADD(at[10], at[18]); 
-      COMBA_STORE(C->dp[17]);
-      /* 18 */
-      COMBA_FORWARD;
-      MULADD(at[8], at[21]);       MULADD(at[9], at[20]);       MULADD(at[10], at[19]); 
-      COMBA_STORE(C->dp[18]);
-      /* 19 */
-      COMBA_FORWARD;
-      MULADD(at[9], at[21]);       MULADD(at[10], at[20]); 
-      COMBA_STORE(C->dp[19]);
-      /* 20 */
-      COMBA_FORWARD;
-      MULADD(at[10], at[21]); 
-      COMBA_STORE(C->dp[20]);
-      COMBA_STORE2(C->dp[21]);
-      C->used = 22;
-      C->sign = A->sign ^ B->sign;
-      fp_clamp(C);
-      COMBA_FINI;
-      break;
-
-   case 12:
-      XMEMCPY(at, A->dp, 12 * sizeof(fp_digit));
-      XMEMCPY(at+12, B->dp, 12 * sizeof(fp_digit));
-      COMBA_START;
-
-      COMBA_CLEAR;
-      /* 0 */
-      MULADD(at[0], at[12]); 
-      COMBA_STORE(C->dp[0]);
-      /* 1 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[13]);       MULADD(at[1], at[12]); 
-      COMBA_STORE(C->dp[1]);
-      /* 2 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[14]);       MULADD(at[1], at[13]);       MULADD(at[2], at[12]); 
-      COMBA_STORE(C->dp[2]);
-      /* 3 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[15]);       MULADD(at[1], at[14]);       MULADD(at[2], at[13]);       MULADD(at[3], at[12]); 
-      COMBA_STORE(C->dp[3]);
-      /* 4 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[16]);       MULADD(at[1], at[15]);       MULADD(at[2], at[14]);       MULADD(at[3], at[13]);       MULADD(at[4], at[12]); 
-      COMBA_STORE(C->dp[4]);
-      /* 5 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[17]);       MULADD(at[1], at[16]);       MULADD(at[2], at[15]);       MULADD(at[3], at[14]);       MULADD(at[4], at[13]);       MULADD(at[5], at[12]); 
-      COMBA_STORE(C->dp[5]);
-      /* 6 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[18]);       MULADD(at[1], at[17]);       MULADD(at[2], at[16]);       MULADD(at[3], at[15]);       MULADD(at[4], at[14]);       MULADD(at[5], at[13]);       MULADD(at[6], at[12]); 
-      COMBA_STORE(C->dp[6]);
-      /* 7 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[19]);       MULADD(at[1], at[18]);       MULADD(at[2], at[17]);       MULADD(at[3], at[16]);       MULADD(at[4], at[15]);       MULADD(at[5], at[14]);       MULADD(at[6], at[13]);       MULADD(at[7], at[12]); 
-      COMBA_STORE(C->dp[7]);
-      /* 8 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[20]);       MULADD(at[1], at[19]);       MULADD(at[2], at[18]);       MULADD(at[3], at[17]);       MULADD(at[4], at[16]);       MULADD(at[5], at[15]);       MULADD(at[6], at[14]);       MULADD(at[7], at[13]);       MULADD(at[8], at[12]); 
-      COMBA_STORE(C->dp[8]);
-      /* 9 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[21]);       MULADD(at[1], at[20]);       MULADD(at[2], at[19]);       MULADD(at[3], at[18]);       MULADD(at[4], at[17]);       MULADD(at[5], at[16]);       MULADD(at[6], at[15]);       MULADD(at[7], at[14]);       MULADD(at[8], at[13]);       MULADD(at[9], at[12]); 
-      COMBA_STORE(C->dp[9]);
-      /* 10 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[22]);       MULADD(at[1], at[21]);       MULADD(at[2], at[20]);       MULADD(at[3], at[19]);       MULADD(at[4], at[18]);       MULADD(at[5], at[17]);       MULADD(at[6], at[16]);       MULADD(at[7], at[15]);       MULADD(at[8], at[14]);       MULADD(at[9], at[13]);       MULADD(at[10], at[12]); 
-      COMBA_STORE(C->dp[10]);
-      /* 11 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[23]);       MULADD(at[1], at[22]);       MULADD(at[2], at[21]);       MULADD(at[3], at[20]);       MULADD(at[4], at[19]);       MULADD(at[5], at[18]);       MULADD(at[6], at[17]);       MULADD(at[7], at[16]);       MULADD(at[8], at[15]);       MULADD(at[9], at[14]);       MULADD(at[10], at[13]);       MULADD(at[11], at[12]); 
-      COMBA_STORE(C->dp[11]);
-      /* 12 */
-      COMBA_FORWARD;
-      MULADD(at[1], at[23]);       MULADD(at[2], at[22]);       MULADD(at[3], at[21]);       MULADD(at[4], at[20]);       MULADD(at[5], at[19]);       MULADD(at[6], at[18]);       MULADD(at[7], at[17]);       MULADD(at[8], at[16]);       MULADD(at[9], at[15]);       MULADD(at[10], at[14]);       MULADD(at[11], at[13]); 
-      COMBA_STORE(C->dp[12]);
-      /* 13 */
-      COMBA_FORWARD;
-      MULADD(at[2], at[23]);       MULADD(at[3], at[22]);       MULADD(at[4], at[21]);       MULADD(at[5], at[20]);       MULADD(at[6], at[19]);       MULADD(at[7], at[18]);       MULADD(at[8], at[17]);       MULADD(at[9], at[16]);       MULADD(at[10], at[15]);       MULADD(at[11], at[14]); 
-      COMBA_STORE(C->dp[13]);
-      /* 14 */
-      COMBA_FORWARD;
-      MULADD(at[3], at[23]);       MULADD(at[4], at[22]);       MULADD(at[5], at[21]);       MULADD(at[6], at[20]);       MULADD(at[7], at[19]);       MULADD(at[8], at[18]);       MULADD(at[9], at[17]);       MULADD(at[10], at[16]);       MULADD(at[11], at[15]); 
-      COMBA_STORE(C->dp[14]);
-      /* 15 */
-      COMBA_FORWARD;
-      MULADD(at[4], at[23]);       MULADD(at[5], at[22]);       MULADD(at[6], at[21]);       MULADD(at[7], at[20]);       MULADD(at[8], at[19]);       MULADD(at[9], at[18]);       MULADD(at[10], at[17]);       MULADD(at[11], at[16]); 
-      COMBA_STORE(C->dp[15]);
-      /* 16 */
-      COMBA_FORWARD;
-      MULADD(at[5], at[23]);       MULADD(at[6], at[22]);       MULADD(at[7], at[21]);       MULADD(at[8], at[20]);       MULADD(at[9], at[19]);       MULADD(at[10], at[18]);       MULADD(at[11], at[17]); 
-      COMBA_STORE(C->dp[16]);
-      /* 17 */
-      COMBA_FORWARD;
-      MULADD(at[6], at[23]);       MULADD(at[7], at[22]);       MULADD(at[8], at[21]);       MULADD(at[9], at[20]);       MULADD(at[10], at[19]);       MULADD(at[11], at[18]); 
-      COMBA_STORE(C->dp[17]);
-      /* 18 */
-      COMBA_FORWARD;
-      MULADD(at[7], at[23]);       MULADD(at[8], at[22]);       MULADD(at[9], at[21]);       MULADD(at[10], at[20]);       MULADD(at[11], at[19]); 
-      COMBA_STORE(C->dp[18]);
-      /* 19 */
-      COMBA_FORWARD;
-      MULADD(at[8], at[23]);       MULADD(at[9], at[22]);       MULADD(at[10], at[21]);       MULADD(at[11], at[20]); 
-      COMBA_STORE(C->dp[19]);
-      /* 20 */
-      COMBA_FORWARD;
-      MULADD(at[9], at[23]);       MULADD(at[10], at[22]);       MULADD(at[11], at[21]); 
-      COMBA_STORE(C->dp[20]);
-      /* 21 */
-      COMBA_FORWARD;
-      MULADD(at[10], at[23]);       MULADD(at[11], at[22]); 
-      COMBA_STORE(C->dp[21]);
-      /* 22 */
-      COMBA_FORWARD;
-      MULADD(at[11], at[23]); 
-      COMBA_STORE(C->dp[22]);
-      COMBA_STORE2(C->dp[23]);
-      C->used = 24;
-      C->sign = A->sign ^ B->sign;
-      fp_clamp(C);
-      COMBA_FINI;
-      break;
-
-   case 13:
-      XMEMCPY(at, A->dp, 13 * sizeof(fp_digit));
-      XMEMCPY(at+13, B->dp, 13 * sizeof(fp_digit));
-      COMBA_START;
-
-      COMBA_CLEAR;
-      /* 0 */
-      MULADD(at[0], at[13]); 
-      COMBA_STORE(C->dp[0]);
-      /* 1 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[14]);       MULADD(at[1], at[13]); 
-      COMBA_STORE(C->dp[1]);
-      /* 2 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[15]);       MULADD(at[1], at[14]);       MULADD(at[2], at[13]); 
-      COMBA_STORE(C->dp[2]);
-      /* 3 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[16]);       MULADD(at[1], at[15]);       MULADD(at[2], at[14]);       MULADD(at[3], at[13]); 
-      COMBA_STORE(C->dp[3]);
-      /* 4 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[17]);       MULADD(at[1], at[16]);       MULADD(at[2], at[15]);       MULADD(at[3], at[14]);       MULADD(at[4], at[13]); 
-      COMBA_STORE(C->dp[4]);
-      /* 5 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[18]);       MULADD(at[1], at[17]);       MULADD(at[2], at[16]);       MULADD(at[3], at[15]);       MULADD(at[4], at[14]);       MULADD(at[5], at[13]); 
-      COMBA_STORE(C->dp[5]);
-      /* 6 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[19]);       MULADD(at[1], at[18]);       MULADD(at[2], at[17]);       MULADD(at[3], at[16]);       MULADD(at[4], at[15]);       MULADD(at[5], at[14]);       MULADD(at[6], at[13]); 
-      COMBA_STORE(C->dp[6]);
-      /* 7 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[20]);       MULADD(at[1], at[19]);       MULADD(at[2], at[18]);       MULADD(at[3], at[17]);       MULADD(at[4], at[16]);       MULADD(at[5], at[15]);       MULADD(at[6], at[14]);       MULADD(at[7], at[13]); 
-      COMBA_STORE(C->dp[7]);
-      /* 8 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[21]);       MULADD(at[1], at[20]);       MULADD(at[2], at[19]);       MULADD(at[3], at[18]);       MULADD(at[4], at[17]);       MULADD(at[5], at[16]);       MULADD(at[6], at[15]);       MULADD(at[7], at[14]);       MULADD(at[8], at[13]); 
-      COMBA_STORE(C->dp[8]);
-      /* 9 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[22]);       MULADD(at[1], at[21]);       MULADD(at[2], at[20]);       MULADD(at[3], at[19]);       MULADD(at[4], at[18]);       MULADD(at[5], at[17]);       MULADD(at[6], at[16]);       MULADD(at[7], at[15]);       MULADD(at[8], at[14]);       MULADD(at[9], at[13]); 
-      COMBA_STORE(C->dp[9]);
-      /* 10 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[23]);       MULADD(at[1], at[22]);       MULADD(at[2], at[21]);       MULADD(at[3], at[20]);       MULADD(at[4], at[19]);       MULADD(at[5], at[18]);       MULADD(at[6], at[17]);       MULADD(at[7], at[16]);       MULADD(at[8], at[15]);       MULADD(at[9], at[14]);       MULADD(at[10], at[13]); 
-      COMBA_STORE(C->dp[10]);
-      /* 11 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[24]);       MULADD(at[1], at[23]);       MULADD(at[2], at[22]);       MULADD(at[3], at[21]);       MULADD(at[4], at[20]);       MULADD(at[5], at[19]);       MULADD(at[6], at[18]);       MULADD(at[7], at[17]);       MULADD(at[8], at[16]);       MULADD(at[9], at[15]);       MULADD(at[10], at[14]);       MULADD(at[11], at[13]); 
-      COMBA_STORE(C->dp[11]);
-      /* 12 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[25]);       MULADD(at[1], at[24]);       MULADD(at[2], at[23]);       MULADD(at[3], at[22]);       MULADD(at[4], at[21]);       MULADD(at[5], at[20]);       MULADD(at[6], at[19]);       MULADD(at[7], at[18]);       MULADD(at[8], at[17]);       MULADD(at[9], at[16]);       MULADD(at[10], at[15]);       MULADD(at[11], at[14]);       MULADD(at[12], at[13]); 
-      COMBA_STORE(C->dp[12]);
-      /* 13 */
-      COMBA_FORWARD;
-      MULADD(at[1], at[25]);       MULADD(at[2], at[24]);       MULADD(at[3], at[23]);       MULADD(at[4], at[22]);       MULADD(at[5], at[21]);       MULADD(at[6], at[20]);       MULADD(at[7], at[19]);       MULADD(at[8], at[18]);       MULADD(at[9], at[17]);       MULADD(at[10], at[16]);       MULADD(at[11], at[15]);       MULADD(at[12], at[14]); 
-      COMBA_STORE(C->dp[13]);
-      /* 14 */
-      COMBA_FORWARD;
-      MULADD(at[2], at[25]);       MULADD(at[3], at[24]);       MULADD(at[4], at[23]);       MULADD(at[5], at[22]);       MULADD(at[6], at[21]);       MULADD(at[7], at[20]);       MULADD(at[8], at[19]);       MULADD(at[9], at[18]);       MULADD(at[10], at[17]);       MULADD(at[11], at[16]);       MULADD(at[12], at[15]); 
-      COMBA_STORE(C->dp[14]);
-      /* 15 */
-      COMBA_FORWARD;
-      MULADD(at[3], at[25]);       MULADD(at[4], at[24]);       MULADD(at[5], at[23]);       MULADD(at[6], at[22]);       MULADD(at[7], at[21]);       MULADD(at[8], at[20]);       MULADD(at[9], at[19]);       MULADD(at[10], at[18]);       MULADD(at[11], at[17]);       MULADD(at[12], at[16]); 
-      COMBA_STORE(C->dp[15]);
-      /* 16 */
-      COMBA_FORWARD;
-      MULADD(at[4], at[25]);       MULADD(at[5], at[24]);       MULADD(at[6], at[23]);       MULADD(at[7], at[22]);       MULADD(at[8], at[21]);       MULADD(at[9], at[20]);       MULADD(at[10], at[19]);       MULADD(at[11], at[18]);       MULADD(at[12], at[17]); 
-      COMBA_STORE(C->dp[16]);
-      /* 17 */
-      COMBA_FORWARD;
-      MULADD(at[5], at[25]);       MULADD(at[6], at[24]);       MULADD(at[7], at[23]);       MULADD(at[8], at[22]);       MULADD(at[9], at[21]);       MULADD(at[10], at[20]);       MULADD(at[11], at[19]);       MULADD(at[12], at[18]); 
-      COMBA_STORE(C->dp[17]);
-      /* 18 */
-      COMBA_FORWARD;
-      MULADD(at[6], at[25]);       MULADD(at[7], at[24]);       MULADD(at[8], at[23]);       MULADD(at[9], at[22]);       MULADD(at[10], at[21]);       MULADD(at[11], at[20]);       MULADD(at[12], at[19]); 
-      COMBA_STORE(C->dp[18]);
-      /* 19 */
-      COMBA_FORWARD;
-      MULADD(at[7], at[25]);       MULADD(at[8], at[24]);       MULADD(at[9], at[23]);       MULADD(at[10], at[22]);       MULADD(at[11], at[21]);       MULADD(at[12], at[20]); 
-      COMBA_STORE(C->dp[19]);
-      /* 20 */
-      COMBA_FORWARD;
-      MULADD(at[8], at[25]);       MULADD(at[9], at[24]);       MULADD(at[10], at[23]);       MULADD(at[11], at[22]);       MULADD(at[12], at[21]); 
-      COMBA_STORE(C->dp[20]);
-      /* 21 */
-      COMBA_FORWARD;
-      MULADD(at[9], at[25]);       MULADD(at[10], at[24]);       MULADD(at[11], at[23]);       MULADD(at[12], at[22]); 
-      COMBA_STORE(C->dp[21]);
-      /* 22 */
-      COMBA_FORWARD;
-      MULADD(at[10], at[25]);       MULADD(at[11], at[24]);       MULADD(at[12], at[23]); 
-      COMBA_STORE(C->dp[22]);
-      /* 23 */
-      COMBA_FORWARD;
-      MULADD(at[11], at[25]);       MULADD(at[12], at[24]); 
-      COMBA_STORE(C->dp[23]);
-      /* 24 */
-      COMBA_FORWARD;
-      MULADD(at[12], at[25]); 
-      COMBA_STORE(C->dp[24]);
-      COMBA_STORE2(C->dp[25]);
-      C->used = 26;
-      C->sign = A->sign ^ B->sign;
-      fp_clamp(C);
-      COMBA_FINI;
-      break;
-
-   case 14:
-      XMEMCPY(at, A->dp, 14 * sizeof(fp_digit));
-      XMEMCPY(at+14, B->dp, 14 * sizeof(fp_digit));
-      COMBA_START;
-
-      COMBA_CLEAR;
-      /* 0 */
-      MULADD(at[0], at[14]); 
-      COMBA_STORE(C->dp[0]);
-      /* 1 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[15]);       MULADD(at[1], at[14]); 
-      COMBA_STORE(C->dp[1]);
-      /* 2 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[16]);       MULADD(at[1], at[15]);       MULADD(at[2], at[14]); 
-      COMBA_STORE(C->dp[2]);
-      /* 3 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[17]);       MULADD(at[1], at[16]);       MULADD(at[2], at[15]);       MULADD(at[3], at[14]); 
-      COMBA_STORE(C->dp[3]);
-      /* 4 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[18]);       MULADD(at[1], at[17]);       MULADD(at[2], at[16]);       MULADD(at[3], at[15]);       MULADD(at[4], at[14]); 
-      COMBA_STORE(C->dp[4]);
-      /* 5 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[19]);       MULADD(at[1], at[18]);       MULADD(at[2], at[17]);       MULADD(at[3], at[16]);       MULADD(at[4], at[15]);       MULADD(at[5], at[14]); 
-      COMBA_STORE(C->dp[5]);
-      /* 6 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[20]);       MULADD(at[1], at[19]);       MULADD(at[2], at[18]);       MULADD(at[3], at[17]);       MULADD(at[4], at[16]);       MULADD(at[5], at[15]);       MULADD(at[6], at[14]); 
-      COMBA_STORE(C->dp[6]);
-      /* 7 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[21]);       MULADD(at[1], at[20]);       MULADD(at[2], at[19]);       MULADD(at[3], at[18]);       MULADD(at[4], at[17]);       MULADD(at[5], at[16]);       MULADD(at[6], at[15]);       MULADD(at[7], at[14]); 
-      COMBA_STORE(C->dp[7]);
-      /* 8 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[22]);       MULADD(at[1], at[21]);       MULADD(at[2], at[20]);       MULADD(at[3], at[19]);       MULADD(at[4], at[18]);       MULADD(at[5], at[17]);       MULADD(at[6], at[16]);       MULADD(at[7], at[15]);       MULADD(at[8], at[14]); 
-      COMBA_STORE(C->dp[8]);
-      /* 9 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[23]);       MULADD(at[1], at[22]);       MULADD(at[2], at[21]);       MULADD(at[3], at[20]);       MULADD(at[4], at[19]);       MULADD(at[5], at[18]);       MULADD(at[6], at[17]);       MULADD(at[7], at[16]);       MULADD(at[8], at[15]);       MULADD(at[9], at[14]); 
-      COMBA_STORE(C->dp[9]);
-      /* 10 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[24]);       MULADD(at[1], at[23]);       MULADD(at[2], at[22]);       MULADD(at[3], at[21]);       MULADD(at[4], at[20]);       MULADD(at[5], at[19]);       MULADD(at[6], at[18]);       MULADD(at[7], at[17]);       MULADD(at[8], at[16]);       MULADD(at[9], at[15]);       MULADD(at[10], at[14]); 
-      COMBA_STORE(C->dp[10]);
-      /* 11 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[25]);       MULADD(at[1], at[24]);       MULADD(at[2], at[23]);       MULADD(at[3], at[22]);       MULADD(at[4], at[21]);       MULADD(at[5], at[20]);       MULADD(at[6], at[19]);       MULADD(at[7], at[18]);       MULADD(at[8], at[17]);       MULADD(at[9], at[16]);       MULADD(at[10], at[15]);       MULADD(at[11], at[14]); 
-      COMBA_STORE(C->dp[11]);
-      /* 12 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[26]);       MULADD(at[1], at[25]);       MULADD(at[2], at[24]);       MULADD(at[3], at[23]);       MULADD(at[4], at[22]);       MULADD(at[5], at[21]);       MULADD(at[6], at[20]);       MULADD(at[7], at[19]);       MULADD(at[8], at[18]);       MULADD(at[9], at[17]);       MULADD(at[10], at[16]);       MULADD(at[11], at[15]);       MULADD(at[12], at[14]); 
-      COMBA_STORE(C->dp[12]);
-      /* 13 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[27]);       MULADD(at[1], at[26]);       MULADD(at[2], at[25]);       MULADD(at[3], at[24]);       MULADD(at[4], at[23]);       MULADD(at[5], at[22]);       MULADD(at[6], at[21]);       MULADD(at[7], at[20]);       MULADD(at[8], at[19]);       MULADD(at[9], at[18]);       MULADD(at[10], at[17]);       MULADD(at[11], at[16]);       MULADD(at[12], at[15]);       MULADD(at[13], at[14]); 
-      COMBA_STORE(C->dp[13]);
-      /* 14 */
-      COMBA_FORWARD;
-      MULADD(at[1], at[27]);       MULADD(at[2], at[26]);       MULADD(at[3], at[25]);       MULADD(at[4], at[24]);       MULADD(at[5], at[23]);       MULADD(at[6], at[22]);       MULADD(at[7], at[21]);       MULADD(at[8], at[20]);       MULADD(at[9], at[19]);       MULADD(at[10], at[18]);       MULADD(at[11], at[17]);       MULADD(at[12], at[16]);       MULADD(at[13], at[15]); 
-      COMBA_STORE(C->dp[14]);
-      /* 15 */
-      COMBA_FORWARD;
-      MULADD(at[2], at[27]);       MULADD(at[3], at[26]);       MULADD(at[4], at[25]);       MULADD(at[5], at[24]);       MULADD(at[6], at[23]);       MULADD(at[7], at[22]);       MULADD(at[8], at[21]);       MULADD(at[9], at[20]);       MULADD(at[10], at[19]);       MULADD(at[11], at[18]);       MULADD(at[12], at[17]);       MULADD(at[13], at[16]); 
-      COMBA_STORE(C->dp[15]);
-      /* 16 */
-      COMBA_FORWARD;
-      MULADD(at[3], at[27]);       MULADD(at[4], at[26]);       MULADD(at[5], at[25]);       MULADD(at[6], at[24]);       MULADD(at[7], at[23]);       MULADD(at[8], at[22]);       MULADD(at[9], at[21]);       MULADD(at[10], at[20]);       MULADD(at[11], at[19]);       MULADD(at[12], at[18]);       MULADD(at[13], at[17]); 
-      COMBA_STORE(C->dp[16]);
-      /* 17 */
-      COMBA_FORWARD;
-      MULADD(at[4], at[27]);       MULADD(at[5], at[26]);       MULADD(at[6], at[25]);       MULADD(at[7], at[24]);       MULADD(at[8], at[23]);       MULADD(at[9], at[22]);       MULADD(at[10], at[21]);       MULADD(at[11], at[20]);       MULADD(at[12], at[19]);       MULADD(at[13], at[18]); 
-      COMBA_STORE(C->dp[17]);
-      /* 18 */
-      COMBA_FORWARD;
-      MULADD(at[5], at[27]);       MULADD(at[6], at[26]);       MULADD(at[7], at[25]);       MULADD(at[8], at[24]);       MULADD(at[9], at[23]);       MULADD(at[10], at[22]);       MULADD(at[11], at[21]);       MULADD(at[12], at[20]);       MULADD(at[13], at[19]); 
-      COMBA_STORE(C->dp[18]);
-      /* 19 */
-      COMBA_FORWARD;
-      MULADD(at[6], at[27]);       MULADD(at[7], at[26]);       MULADD(at[8], at[25]);       MULADD(at[9], at[24]);       MULADD(at[10], at[23]);       MULADD(at[11], at[22]);       MULADD(at[12], at[21]);       MULADD(at[13], at[20]); 
-      COMBA_STORE(C->dp[19]);
-      /* 20 */
-      COMBA_FORWARD;
-      MULADD(at[7], at[27]);       MULADD(at[8], at[26]);       MULADD(at[9], at[25]);       MULADD(at[10], at[24]);       MULADD(at[11], at[23]);       MULADD(at[12], at[22]);       MULADD(at[13], at[21]); 
-      COMBA_STORE(C->dp[20]);
-      /* 21 */
-      COMBA_FORWARD;
-      MULADD(at[8], at[27]);       MULADD(at[9], at[26]);       MULADD(at[10], at[25]);       MULADD(at[11], at[24]);       MULADD(at[12], at[23]);       MULADD(at[13], at[22]); 
-      COMBA_STORE(C->dp[21]);
-      /* 22 */
-      COMBA_FORWARD;
-      MULADD(at[9], at[27]);       MULADD(at[10], at[26]);       MULADD(at[11], at[25]);       MULADD(at[12], at[24]);       MULADD(at[13], at[23]); 
-      COMBA_STORE(C->dp[22]);
-      /* 23 */
-      COMBA_FORWARD;
-      MULADD(at[10], at[27]);       MULADD(at[11], at[26]);       MULADD(at[12], at[25]);       MULADD(at[13], at[24]); 
-      COMBA_STORE(C->dp[23]);
-      /* 24 */
-      COMBA_FORWARD;
-      MULADD(at[11], at[27]);       MULADD(at[12], at[26]);       MULADD(at[13], at[25]); 
-      COMBA_STORE(C->dp[24]);
-      /* 25 */
-      COMBA_FORWARD;
-      MULADD(at[12], at[27]);       MULADD(at[13], at[26]); 
-      COMBA_STORE(C->dp[25]);
-      /* 26 */
-      COMBA_FORWARD;
-      MULADD(at[13], at[27]); 
-      COMBA_STORE(C->dp[26]);
-      COMBA_STORE2(C->dp[27]);
-      C->used = 28;
-      C->sign = A->sign ^ B->sign;
-      fp_clamp(C);
-      COMBA_FINI;
-      break;
-
-   case 15:
-      XMEMCPY(at, A->dp, 15 * sizeof(fp_digit));
-      XMEMCPY(at+15, B->dp, 15 * sizeof(fp_digit));
-      COMBA_START;
-
-      COMBA_CLEAR;
-      /* 0 */
-      MULADD(at[0], at[15]); 
-      COMBA_STORE(C->dp[0]);
-      /* 1 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[16]);       MULADD(at[1], at[15]); 
-      COMBA_STORE(C->dp[1]);
-      /* 2 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[17]);       MULADD(at[1], at[16]);       MULADD(at[2], at[15]); 
-      COMBA_STORE(C->dp[2]);
-      /* 3 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[18]);       MULADD(at[1], at[17]);       MULADD(at[2], at[16]);       MULADD(at[3], at[15]); 
-      COMBA_STORE(C->dp[3]);
-      /* 4 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[19]);       MULADD(at[1], at[18]);       MULADD(at[2], at[17]);       MULADD(at[3], at[16]);       MULADD(at[4], at[15]); 
-      COMBA_STORE(C->dp[4]);
-      /* 5 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[20]);       MULADD(at[1], at[19]);       MULADD(at[2], at[18]);       MULADD(at[3], at[17]);       MULADD(at[4], at[16]);       MULADD(at[5], at[15]); 
-      COMBA_STORE(C->dp[5]);
-      /* 6 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[21]);       MULADD(at[1], at[20]);       MULADD(at[2], at[19]);       MULADD(at[3], at[18]);       MULADD(at[4], at[17]);       MULADD(at[5], at[16]);       MULADD(at[6], at[15]); 
-      COMBA_STORE(C->dp[6]);
-      /* 7 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[22]);       MULADD(at[1], at[21]);       MULADD(at[2], at[20]);       MULADD(at[3], at[19]);       MULADD(at[4], at[18]);       MULADD(at[5], at[17]);       MULADD(at[6], at[16]);       MULADD(at[7], at[15]); 
-      COMBA_STORE(C->dp[7]);
-      /* 8 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[23]);       MULADD(at[1], at[22]);       MULADD(at[2], at[21]);       MULADD(at[3], at[20]);       MULADD(at[4], at[19]);       MULADD(at[5], at[18]);       MULADD(at[6], at[17]);       MULADD(at[7], at[16]);       MULADD(at[8], at[15]); 
-      COMBA_STORE(C->dp[8]);
-      /* 9 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[24]);       MULADD(at[1], at[23]);       MULADD(at[2], at[22]);       MULADD(at[3], at[21]);       MULADD(at[4], at[20]);       MULADD(at[5], at[19]);       MULADD(at[6], at[18]);       MULADD(at[7], at[17]);       MULADD(at[8], at[16]);       MULADD(at[9], at[15]); 
-      COMBA_STORE(C->dp[9]);
-      /* 10 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[25]);       MULADD(at[1], at[24]);       MULADD(at[2], at[23]);       MULADD(at[3], at[22]);       MULADD(at[4], at[21]);       MULADD(at[5], at[20]);       MULADD(at[6], at[19]);       MULADD(at[7], at[18]);       MULADD(at[8], at[17]);       MULADD(at[9], at[16]);       MULADD(at[10], at[15]); 
-      COMBA_STORE(C->dp[10]);
-      /* 11 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[26]);       MULADD(at[1], at[25]);       MULADD(at[2], at[24]);       MULADD(at[3], at[23]);       MULADD(at[4], at[22]);       MULADD(at[5], at[21]);       MULADD(at[6], at[20]);       MULADD(at[7], at[19]);       MULADD(at[8], at[18]);       MULADD(at[9], at[17]);       MULADD(at[10], at[16]);       MULADD(at[11], at[15]); 
-      COMBA_STORE(C->dp[11]);
-      /* 12 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[27]);       MULADD(at[1], at[26]);       MULADD(at[2], at[25]);       MULADD(at[3], at[24]);       MULADD(at[4], at[23]);       MULADD(at[5], at[22]);       MULADD(at[6], at[21]);       MULADD(at[7], at[20]);       MULADD(at[8], at[19]);       MULADD(at[9], at[18]);       MULADD(at[10], at[17]);       MULADD(at[11], at[16]);       MULADD(at[12], at[15]); 
-      COMBA_STORE(C->dp[12]);
-      /* 13 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[28]);       MULADD(at[1], at[27]);       MULADD(at[2], at[26]);       MULADD(at[3], at[25]);       MULADD(at[4], at[24]);       MULADD(at[5], at[23]);       MULADD(at[6], at[22]);       MULADD(at[7], at[21]);       MULADD(at[8], at[20]);       MULADD(at[9], at[19]);       MULADD(at[10], at[18]);       MULADD(at[11], at[17]);       MULADD(at[12], at[16]);       MULADD(at[13], at[15]); 
-      COMBA_STORE(C->dp[13]);
-      /* 14 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[29]);       MULADD(at[1], at[28]);       MULADD(at[2], at[27]);       MULADD(at[3], at[26]);       MULADD(at[4], at[25]);       MULADD(at[5], at[24]);       MULADD(at[6], at[23]);       MULADD(at[7], at[22]);       MULADD(at[8], at[21]);       MULADD(at[9], at[20]);       MULADD(at[10], at[19]);       MULADD(at[11], at[18]);       MULADD(at[12], at[17]);       MULADD(at[13], at[16]);       MULADD(at[14], at[15]); 
-      COMBA_STORE(C->dp[14]);
-      /* 15 */
-      COMBA_FORWARD;
-      MULADD(at[1], at[29]);       MULADD(at[2], at[28]);       MULADD(at[3], at[27]);       MULADD(at[4], at[26]);       MULADD(at[5], at[25]);       MULADD(at[6], at[24]);       MULADD(at[7], at[23]);       MULADD(at[8], at[22]);       MULADD(at[9], at[21]);       MULADD(at[10], at[20]);       MULADD(at[11], at[19]);       MULADD(at[12], at[18]);       MULADD(at[13], at[17]);       MULADD(at[14], at[16]); 
-      COMBA_STORE(C->dp[15]);
-      /* 16 */
-      COMBA_FORWARD;
-      MULADD(at[2], at[29]);       MULADD(at[3], at[28]);       MULADD(at[4], at[27]);       MULADD(at[5], at[26]);       MULADD(at[6], at[25]);       MULADD(at[7], at[24]);       MULADD(at[8], at[23]);       MULADD(at[9], at[22]);       MULADD(at[10], at[21]);       MULADD(at[11], at[20]);       MULADD(at[12], at[19]);       MULADD(at[13], at[18]);       MULADD(at[14], at[17]); 
-      COMBA_STORE(C->dp[16]);
-      /* 17 */
-      COMBA_FORWARD;
-      MULADD(at[3], at[29]);       MULADD(at[4], at[28]);       MULADD(at[5], at[27]);       MULADD(at[6], at[26]);       MULADD(at[7], at[25]);       MULADD(at[8], at[24]);       MULADD(at[9], at[23]);       MULADD(at[10], at[22]);       MULADD(at[11], at[21]);       MULADD(at[12], at[20]);       MULADD(at[13], at[19]);       MULADD(at[14], at[18]); 
-      COMBA_STORE(C->dp[17]);
-      /* 18 */
-      COMBA_FORWARD;
-      MULADD(at[4], at[29]);       MULADD(at[5], at[28]);       MULADD(at[6], at[27]);       MULADD(at[7], at[26]);       MULADD(at[8], at[25]);       MULADD(at[9], at[24]);       MULADD(at[10], at[23]);       MULADD(at[11], at[22]);       MULADD(at[12], at[21]);       MULADD(at[13], at[20]);       MULADD(at[14], at[19]); 
-      COMBA_STORE(C->dp[18]);
-      /* 19 */
-      COMBA_FORWARD;
-      MULADD(at[5], at[29]);       MULADD(at[6], at[28]);       MULADD(at[7], at[27]);       MULADD(at[8], at[26]);       MULADD(at[9], at[25]);       MULADD(at[10], at[24]);       MULADD(at[11], at[23]);       MULADD(at[12], at[22]);       MULADD(at[13], at[21]);       MULADD(at[14], at[20]); 
-      COMBA_STORE(C->dp[19]);
-      /* 20 */
-      COMBA_FORWARD;
-      MULADD(at[6], at[29]);       MULADD(at[7], at[28]);       MULADD(at[8], at[27]);       MULADD(at[9], at[26]);       MULADD(at[10], at[25]);       MULADD(at[11], at[24]);       MULADD(at[12], at[23]);       MULADD(at[13], at[22]);       MULADD(at[14], at[21]); 
-      COMBA_STORE(C->dp[20]);
-      /* 21 */
-      COMBA_FORWARD;
-      MULADD(at[7], at[29]);       MULADD(at[8], at[28]);       MULADD(at[9], at[27]);       MULADD(at[10], at[26]);       MULADD(at[11], at[25]);       MULADD(at[12], at[24]);       MULADD(at[13], at[23]);       MULADD(at[14], at[22]); 
-      COMBA_STORE(C->dp[21]);
-      /* 22 */
-      COMBA_FORWARD;
-      MULADD(at[8], at[29]);       MULADD(at[9], at[28]);       MULADD(at[10], at[27]);       MULADD(at[11], at[26]);       MULADD(at[12], at[25]);       MULADD(at[13], at[24]);       MULADD(at[14], at[23]); 
-      COMBA_STORE(C->dp[22]);
-      /* 23 */
-      COMBA_FORWARD;
-      MULADD(at[9], at[29]);       MULADD(at[10], at[28]);       MULADD(at[11], at[27]);       MULADD(at[12], at[26]);       MULADD(at[13], at[25]);       MULADD(at[14], at[24]); 
-      COMBA_STORE(C->dp[23]);
-      /* 24 */
-      COMBA_FORWARD;
-      MULADD(at[10], at[29]);       MULADD(at[11], at[28]);       MULADD(at[12], at[27]);       MULADD(at[13], at[26]);       MULADD(at[14], at[25]); 
-      COMBA_STORE(C->dp[24]);
-      /* 25 */
-      COMBA_FORWARD;
-      MULADD(at[11], at[29]);       MULADD(at[12], at[28]);       MULADD(at[13], at[27]);       MULADD(at[14], at[26]); 
-      COMBA_STORE(C->dp[25]);
-      /* 26 */
-      COMBA_FORWARD;
-      MULADD(at[12], at[29]);       MULADD(at[13], at[28]);       MULADD(at[14], at[27]); 
-      COMBA_STORE(C->dp[26]);
-      /* 27 */
-      COMBA_FORWARD;
-      MULADD(at[13], at[29]);       MULADD(at[14], at[28]); 
-      COMBA_STORE(C->dp[27]);
-      /* 28 */
-      COMBA_FORWARD;
-      MULADD(at[14], at[29]); 
-      COMBA_STORE(C->dp[28]);
-      COMBA_STORE2(C->dp[29]);
-      C->used = 30;
-      C->sign = A->sign ^ B->sign;
-      fp_clamp(C);
-      COMBA_FINI;
-      break;
-
-   case 16:
-      XMEMCPY(at, A->dp, 16 * sizeof(fp_digit));
-      XMEMCPY(at+16, B->dp, 16 * sizeof(fp_digit));
-      COMBA_START;
-
-      COMBA_CLEAR;
-      /* 0 */
-      MULADD(at[0], at[16]); 
-      COMBA_STORE(C->dp[0]);
-      /* 1 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[17]);       MULADD(at[1], at[16]); 
-      COMBA_STORE(C->dp[1]);
-      /* 2 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[18]);       MULADD(at[1], at[17]);       MULADD(at[2], at[16]); 
-      COMBA_STORE(C->dp[2]);
-      /* 3 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[19]);       MULADD(at[1], at[18]);       MULADD(at[2], at[17]);       MULADD(at[3], at[16]); 
-      COMBA_STORE(C->dp[3]);
-      /* 4 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[20]);       MULADD(at[1], at[19]);       MULADD(at[2], at[18]);       MULADD(at[3], at[17]);       MULADD(at[4], at[16]); 
-      COMBA_STORE(C->dp[4]);
-      /* 5 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[21]);       MULADD(at[1], at[20]);       MULADD(at[2], at[19]);       MULADD(at[3], at[18]);       MULADD(at[4], at[17]);       MULADD(at[5], at[16]); 
-      COMBA_STORE(C->dp[5]);
-      /* 6 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[22]);       MULADD(at[1], at[21]);       MULADD(at[2], at[20]);       MULADD(at[3], at[19]);       MULADD(at[4], at[18]);       MULADD(at[5], at[17]);       MULADD(at[6], at[16]); 
-      COMBA_STORE(C->dp[6]);
-      /* 7 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[23]);       MULADD(at[1], at[22]);       MULADD(at[2], at[21]);       MULADD(at[3], at[20]);       MULADD(at[4], at[19]);       MULADD(at[5], at[18]);       MULADD(at[6], at[17]);       MULADD(at[7], at[16]); 
-      COMBA_STORE(C->dp[7]);
-      /* 8 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[24]);       MULADD(at[1], at[23]);       MULADD(at[2], at[22]);       MULADD(at[3], at[21]);       MULADD(at[4], at[20]);       MULADD(at[5], at[19]);       MULADD(at[6], at[18]);       MULADD(at[7], at[17]);       MULADD(at[8], at[16]); 
-      COMBA_STORE(C->dp[8]);
-      /* 9 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[25]);       MULADD(at[1], at[24]);       MULADD(at[2], at[23]);       MULADD(at[3], at[22]);       MULADD(at[4], at[21]);       MULADD(at[5], at[20]);       MULADD(at[6], at[19]);       MULADD(at[7], at[18]);       MULADD(at[8], at[17]);       MULADD(at[9], at[16]); 
-      COMBA_STORE(C->dp[9]);
-      /* 10 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[26]);       MULADD(at[1], at[25]);       MULADD(at[2], at[24]);       MULADD(at[3], at[23]);       MULADD(at[4], at[22]);       MULADD(at[5], at[21]);       MULADD(at[6], at[20]);       MULADD(at[7], at[19]);       MULADD(at[8], at[18]);       MULADD(at[9], at[17]);       MULADD(at[10], at[16]); 
-      COMBA_STORE(C->dp[10]);
-      /* 11 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[27]);       MULADD(at[1], at[26]);       MULADD(at[2], at[25]);       MULADD(at[3], at[24]);       MULADD(at[4], at[23]);       MULADD(at[5], at[22]);       MULADD(at[6], at[21]);       MULADD(at[7], at[20]);       MULADD(at[8], at[19]);       MULADD(at[9], at[18]);       MULADD(at[10], at[17]);       MULADD(at[11], at[16]); 
-      COMBA_STORE(C->dp[11]);
-      /* 12 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[28]);       MULADD(at[1], at[27]);       MULADD(at[2], at[26]);       MULADD(at[3], at[25]);       MULADD(at[4], at[24]);       MULADD(at[5], at[23]);       MULADD(at[6], at[22]);       MULADD(at[7], at[21]);       MULADD(at[8], at[20]);       MULADD(at[9], at[19]);       MULADD(at[10], at[18]);       MULADD(at[11], at[17]);       MULADD(at[12], at[16]); 
-      COMBA_STORE(C->dp[12]);
-      /* 13 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[29]);       MULADD(at[1], at[28]);       MULADD(at[2], at[27]);       MULADD(at[3], at[26]);       MULADD(at[4], at[25]);       MULADD(at[5], at[24]);       MULADD(at[6], at[23]);       MULADD(at[7], at[22]);       MULADD(at[8], at[21]);       MULADD(at[9], at[20]);       MULADD(at[10], at[19]);       MULADD(at[11], at[18]);       MULADD(at[12], at[17]);       MULADD(at[13], at[16]); 
-      COMBA_STORE(C->dp[13]);
-      /* 14 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[30]);       MULADD(at[1], at[29]);       MULADD(at[2], at[28]);       MULADD(at[3], at[27]);       MULADD(at[4], at[26]);       MULADD(at[5], at[25]);       MULADD(at[6], at[24]);       MULADD(at[7], at[23]);       MULADD(at[8], at[22]);       MULADD(at[9], at[21]);       MULADD(at[10], at[20]);       MULADD(at[11], at[19]);       MULADD(at[12], at[18]);       MULADD(at[13], at[17]);       MULADD(at[14], at[16]); 
-      COMBA_STORE(C->dp[14]);
-      /* 15 */
-      COMBA_FORWARD;
-      MULADD(at[0], at[31]);       MULADD(at[1], at[30]);       MULADD(at[2], at[29]);       MULADD(at[3], at[28]);       MULADD(at[4], at[27]);       MULADD(at[5], at[26]);       MULADD(at[6], at[25]);       MULADD(at[7], at[24]);       MULADD(at[8], at[23]);       MULADD(at[9], at[22]);       MULADD(at[10], at[21]);       MULADD(at[11], at[20]);       MULADD(at[12], at[19]);       MULADD(at[13], at[18]);       MULADD(at[14], at[17]);       MULADD(at[15], at[16]); 
-      COMBA_STORE(C->dp[15]);
-      /* 16 */
-      COMBA_FORWARD;
-      MULADD(at[1], at[31]);       MULADD(at[2], at[30]);       MULADD(at[3], at[29]);       MULADD(at[4], at[28]);       MULADD(at[5], at[27]);       MULADD(at[6], at[26]);       MULADD(at[7], at[25]);       MULADD(at[8], at[24]);       MULADD(at[9], at[23]);       MULADD(at[10], at[22]);       MULADD(at[11], at[21]);       MULADD(at[12], at[20]);       MULADD(at[13], at[19]);       MULADD(at[14], at[18]);       MULADD(at[15], at[17]); 
-      COMBA_STORE(C->dp[16]);
-      /* 17 */
-      COMBA_FORWARD;
-      MULADD(at[2], at[31]);       MULADD(at[3], at[30]);       MULADD(at[4], at[29]);       MULADD(at[5], at[28]);       MULADD(at[6], at[27]);       MULADD(at[7], at[26]);       MULADD(at[8], at[25]);       MULADD(at[9], at[24]);       MULADD(at[10], at[23]);       MULADD(at[11], at[22]);       MULADD(at[12], at[21]);       MULADD(at[13], at[20]);       MULADD(at[14], at[19]);       MULADD(at[15], at[18]); 
-      COMBA_STORE(C->dp[17]);
-      /* 18 */
-      COMBA_FORWARD;
-      MULADD(at[3], at[31]);       MULADD(at[4], at[30]);       MULADD(at[5], at[29]);       MULADD(at[6], at[28]);       MULADD(at[7], at[27]);       MULADD(at[8], at[26]);       MULADD(at[9], at[25]);       MULADD(at[10], at[24]);       MULADD(at[11], at[23]);       MULADD(at[12], at[22]);       MULADD(at[13], at[21]);       MULADD(at[14], at[20]);       MULADD(at[15], at[19]); 
-      COMBA_STORE(C->dp[18]);
-      /* 19 */
-      COMBA_FORWARD;
-      MULADD(at[4], at[31]);       MULADD(at[5], at[30]);       MULADD(at[6], at[29]);       MULADD(at[7], at[28]);       MULADD(at[8], at[27]);       MULADD(at[9], at[26]);       MULADD(at[10], at[25]);       MULADD(at[11], at[24]);       MULADD(at[12], at[23]);       MULADD(at[13], at[22]);       MULADD(at[14], at[21]);       MULADD(at[15], at[20]); 
-      COMBA_STORE(C->dp[19]);
-      /* 20 */
-      COMBA_FORWARD;
-      MULADD(at[5], at[31]);       MULADD(at[6], at[30]);       MULADD(at[7], at[29]);       MULADD(at[8], at[28]);       MULADD(at[9], at[27]);       MULADD(at[10], at[26]);       MULADD(at[11], at[25]);       MULADD(at[12], at[24]);       MULADD(at[13], at[23]);       MULADD(at[14], at[22]);       MULADD(at[15], at[21]); 
-      COMBA_STORE(C->dp[20]);
-      /* 21 */
-      COMBA_FORWARD;
-      MULADD(at[6], at[31]);       MULADD(at[7], at[30]);       MULADD(at[8], at[29]);       MULADD(at[9], at[28]);       MULADD(at[10], at[27]);       MULADD(at[11], at[26]);       MULADD(at[12], at[25]);       MULADD(at[13], at[24]);       MULADD(at[14], at[23]);       MULADD(at[15], at[22]); 
-      COMBA_STORE(C->dp[21]);
-      /* 22 */
-      COMBA_FORWARD;
-      MULADD(at[7], at[31]);       MULADD(at[8], at[30]);       MULADD(at[9], at[29]);       MULADD(at[10], at[28]);       MULADD(at[11], at[27]);       MULADD(at[12], at[26]);       MULADD(at[13], at[25]);       MULADD(at[14], at[24]);       MULADD(at[15], at[23]); 
-      COMBA_STORE(C->dp[22]);
-      /* 23 */
-      COMBA_FORWARD;
-      MULADD(at[8], at[31]);       MULADD(at[9], at[30]);       MULADD(at[10], at[29]);       MULADD(at[11], at[28]);       MULADD(at[12], at[27]);       MULADD(at[13], at[26]);       MULADD(at[14], at[25]);       MULADD(at[15], at[24]); 
-      COMBA_STORE(C->dp[23]);
-      /* 24 */
-      COMBA_FORWARD;
-      MULADD(at[9], at[31]);       MULADD(at[10], at[30]);       MULADD(at[11], at[29]);       MULADD(at[12], at[28]);       MULADD(at[13], at[27]);       MULADD(at[14], at[26]);       MULADD(at[15], at[25]); 
-      COMBA_STORE(C->dp[24]);
-      /* 25 */
-      COMBA_FORWARD;
-      MULADD(at[10], at[31]);       MULADD(at[11], at[30]);       MULADD(at[12], at[29]);       MULADD(at[13], at[28]);       MULADD(at[14], at[27]);       MULADD(at[15], at[26]); 
-      COMBA_STORE(C->dp[25]);
-      /* 26 */
-      COMBA_FORWARD;
-      MULADD(at[11], at[31]);       MULADD(at[12], at[30]);       MULADD(at[13], at[29]);       MULADD(at[14], at[28]);       MULADD(at[15], at[27]); 
-      COMBA_STORE(C->dp[26]);
-      /* 27 */
-      COMBA_FORWARD;
-      MULADD(at[12], at[31]);       MULADD(at[13], at[30]);       MULADD(at[14], at[29]);       MULADD(at[15], at[28]); 
-      COMBA_STORE(C->dp[27]);
-      /* 28 */
-      COMBA_FORWARD;
-      MULADD(at[13], at[31]);       MULADD(at[14], at[30]);       MULADD(at[15], at[29]); 
-      COMBA_STORE(C->dp[28]);
-      /* 29 */
-      COMBA_FORWARD;
-      MULADD(at[14], at[31]);       MULADD(at[15], at[30]); 
-      COMBA_STORE(C->dp[29]);
-      /* 30 */
-      COMBA_FORWARD;
-      MULADD(at[15], at[31]); 
-      COMBA_STORE(C->dp[30]);
-      COMBA_STORE2(C->dp[31]);
-      C->used = 32;
-      C->sign = A->sign ^ B->sign;
-      fp_clamp(C);
-      COMBA_FINI;
-      break;
-
-   default:
-      break;
-   }
-
-#ifdef WOLFSSL_SMALL_STACK
-   XFREE(at, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-#endif
-   return FP_OKAY;
-}
-
-#endif
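
For reference, each unrolled case N above evaluates the product one output column at a time: MULADD folds a single digit product into a three-digit carry accumulator, COMBA_STORE writes the finished column to C->dp, and COMBA_FORWARD shifts the accumulator down one digit for the next column. The same scheme in loop form, as a minimal sketch; the function name and the fixed 32-bit digit width are illustrative assumptions (wolfSSL's fp_digit/fp_word widths depend on the build), and the real routines additionally set C->sign and call fp_clamp:

    #include <stdint.h>

    /* Minimal comba multiplication sketch: c = a * b, digits stored least
     * significant first; c must have room for na + nb digits. */
    static void comba_mul_sketch(const uint32_t *a, int na,
                                 const uint32_t *b, int nb, uint32_t *c)
    {
        uint32_t c0 = 0, c1 = 0, c2 = 0;      /* three-digit accumulator */
        for (int col = 0; col < na + nb - 1; col++) {
            int lo = (col >= nb) ? col - nb + 1 : 0;
            for (int i = lo; i <= col && i < na; i++) {
                /* MULADD: add a[i] * b[col - i] into (c2:c1:c0) */
                uint64_t t = (uint64_t)a[i] * b[col - i];
                uint64_t s = (uint64_t)c0 + (uint32_t)t;
                c0 = (uint32_t)s;
                s = (s >> 32) + c1 + (uint32_t)(t >> 32);
                c1 = (uint32_t)s;
                c2 += (uint32_t)(s >> 32);
            }
            c[col] = c0;                      /* COMBA_STORE   */
            c0 = c1; c1 = c2; c2 = 0;         /* COMBA_FORWARD */
        }
        c[na + nb - 1] = c0;                  /* COMBA_STORE2  */
    }

The unrolled cases exist only to strip the loop and index arithmetic for fixed digit counts; the column sums they compute are exactly the ones above.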

lib/wolfssl/wolfcrypt/src/fp_sqr_comba_12.i (+0 -177, file removed)

@@ -1,177 +0,0 @@
-/* fp_sqr_comba_12.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#ifdef TFM_SQR12
-int fp_sqr_comba12(fp_int *A, fp_int *B)
-{
-   fp_digit *a, c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0;
-#ifdef TFM_ISO
-   fp_word tt;
-#endif
-#ifndef WOLFSSL_SMALL_STACK
-   fp_digit b[24];
-#else
-   fp_digit *b;
-#endif
-
-#ifdef WOLFSSL_SMALL_STACK
-   b = (fp_digit*)XMALLOC(sizeof(fp_digit) * 24, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-   if (b == NULL)
-      return FP_MEM;
-#endif
-
-   a = A->dp;
-   COMBA_START; 
-
-   /* clear carries */
-   CLEAR_CARRY;
-
-   /* output 0 */
-   SQRADD(a[0],a[0]);
-   COMBA_STORE(b[0]);
-
-   /* output 1 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[1]); 
-   COMBA_STORE(b[1]);
-
-   /* output 2 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]); 
-   COMBA_STORE(b[2]);
-
-   /* output 3 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]); 
-   COMBA_STORE(b[3]);
-
-   /* output 4 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]); 
-   COMBA_STORE(b[4]);
-
-   /* output 5 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB; 
-   COMBA_STORE(b[5]);
-
-   /* output 6 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]); 
-   COMBA_STORE(b[6]);
-
-   /* output 7 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB; 
-   COMBA_STORE(b[7]);
-
-   /* output 8 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]); 
-   COMBA_STORE(b[8]);
-
-   /* output 9 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB; 
-   COMBA_STORE(b[9]);
-
-   /* output 10 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]); 
-   COMBA_STORE(b[10]);
-
-   /* output 11 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB; 
-   COMBA_STORE(b[11]);
-
-   /* output 12 */
-   CARRY_FORWARD;
-   SQRADDSC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]); 
-   COMBA_STORE(b[12]);
-
-   /* output 13 */
-   CARRY_FORWARD;
-   SQRADDSC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB; 
-   COMBA_STORE(b[13]);
-
-   /* output 14 */
-   CARRY_FORWARD;
-   SQRADDSC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]); 
-   COMBA_STORE(b[14]);
-
-   /* output 15 */
-   CARRY_FORWARD;
-   SQRADDSC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB; 
-   COMBA_STORE(b[15]);
-
-   /* output 16 */
-   CARRY_FORWARD;
-   SQRADDSC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]); 
-   COMBA_STORE(b[16]);
-
-   /* output 17 */
-   CARRY_FORWARD;
-   SQRADDSC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB; 
-   COMBA_STORE(b[17]);
-
-   /* output 18 */
-   CARRY_FORWARD;
-   SQRADD2(a[7], a[11]); SQRADD2(a[8], a[10]); SQRADD(a[9], a[9]); 
-   COMBA_STORE(b[18]);
-
-   /* output 19 */
-   CARRY_FORWARD;
-   SQRADD2(a[8], a[11]); SQRADD2(a[9], a[10]); 
-   COMBA_STORE(b[19]);
-
-   /* output 20 */
-   CARRY_FORWARD;
-   SQRADD2(a[9], a[11]); SQRADD(a[10], a[10]); 
-   COMBA_STORE(b[20]);
-
-   /* output 21 */
-   CARRY_FORWARD;
-   SQRADD2(a[10], a[11]); 
-   COMBA_STORE(b[21]);
-
-   /* output 22 */
-   CARRY_FORWARD;
-   SQRADD(a[11], a[11]); 
-   COMBA_STORE(b[22]);
-   COMBA_STORE2(b[23]);
-   COMBA_FINI;
-
-   B->used = 24;
-   B->sign = FP_ZPOS;
-   XMEMCPY(B->dp, b, 24 * sizeof(fp_digit));
-   fp_clamp(B);
-
-#ifdef WOLFSSL_SMALL_STACK
-   XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-#endif
-   return FP_OKAY;
-}
-#endif
-
-
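
The squaring variants above need roughly half the multiplies of the general product: since a[i]*a[j] equals a[j]*a[i], each off-diagonal product is computed once and counted twice (SQRADD2, or the SQRADDSC/SQRADDAC/SQRADDDB chain for wider columns), while each diagonal square is added once (SQRADD). The same idea in loop form, under the same illustrative 32-bit digit assumption as the multiplication sketch earlier (function name hypothetical):

    #include <stdint.h>

    /* Comba squaring sketch: b = a * a; b must hold 2 * n digits.
     * Cross products are added twice, diagonal squares once. */
    static void comba_sqr_sketch(const uint32_t *a, int n, uint32_t *b)
    {
        uint32_t c0 = 0, c1 = 0, c2 = 0;
        for (int col = 0; col < 2 * n - 1; col++) {
            int lo = (col >= n) ? col - n + 1 : 0;
            for (int i = lo; 2 * i <= col; i++) {
                uint64_t t = (uint64_t)a[i] * a[col - i];
                int reps = (2 * i == col) ? 1 : 2;  /* SQRADD vs SQRADD2 */
                while (reps-- > 0) {
                    uint64_t s = (uint64_t)c0 + (uint32_t)t;
                    c0 = (uint32_t)s;
                    s = (s >> 32) + c1 + (uint32_t)(t >> 32);
                    c1 = (uint32_t)s;
                    c2 += (uint32_t)(s >> 32);
                }
            }
            b[col] = c0;
            c0 = c1; c1 = c2; c2 = 0;
        }
        b[2 * n - 1] = c0;
    }

Note also that fp_sqr_comba12 accumulates into the local b[] and copies into B->dp only at the end, so in-place squaring (B aliasing A) stays correct.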

lib/wolfssl/wolfcrypt/src/fp_sqr_comba_17.i (+0 -227, file removed)

@@ -1,227 +0,0 @@
-/* fp_sqr_comba_17.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#ifdef TFM_SQR17
-int fp_sqr_comba17(fp_int *A, fp_int *B)
-{
-   fp_digit *a, c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0;
-#ifdef TFM_ISO
-   fp_word tt;
-#endif
-#ifndef WOLFSSL_SMALL_STACK
-   fp_digit b[34];
-#else
-   fp_digit *b;
-#endif
-
-#ifdef WOLFSSL_SMALL_STACK
-   b = (fp_digit*)XMALLOC(sizeof(fp_digit) * 34, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-   if (b == NULL)
-      return FP_MEM;
-#endif
-
-   a = A->dp;
-   COMBA_START; 
-
-   /* clear carries */
-   CLEAR_CARRY;
-
-   /* output 0 */
-   SQRADD(a[0],a[0]);
-   COMBA_STORE(b[0]);
-
-   /* output 1 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[1]); 
-   COMBA_STORE(b[1]);
-
-   /* output 2 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]); 
-   COMBA_STORE(b[2]);
-
-   /* output 3 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]); 
-   COMBA_STORE(b[3]);
-
-   /* output 4 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]); 
-   COMBA_STORE(b[4]);
-
-   /* output 5 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB; 
-   COMBA_STORE(b[5]);
-
-   /* output 6 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]); 
-   COMBA_STORE(b[6]);
-
-   /* output 7 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB; 
-   COMBA_STORE(b[7]);
-
-   /* output 8 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]); 
-   COMBA_STORE(b[8]);
-
-   /* output 9 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB; 
-   COMBA_STORE(b[9]);
-
-   /* output 10 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]); 
-   COMBA_STORE(b[10]);
-
-   /* output 11 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB; 
-   COMBA_STORE(b[11]);
-
-   /* output 12 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]); 
-   COMBA_STORE(b[12]);
-
-   /* output 13 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB; 
-   COMBA_STORE(b[13]);
-
-   /* output 14 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[14]); SQRADDAC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]); 
-   COMBA_STORE(b[14]);
-
-   /* output 15 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[15]); SQRADDAC(a[1], a[14]); SQRADDAC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB; 
-   COMBA_STORE(b[15]);
-
-   /* output 16 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[16]); SQRADDAC(a[1], a[15]); SQRADDAC(a[2], a[14]); SQRADDAC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]); 
-   COMBA_STORE(b[16]);
-
-   /* output 17 */
-   CARRY_FORWARD;
-   SQRADDSC(a[1], a[16]); SQRADDAC(a[2], a[15]); SQRADDAC(a[3], a[14]); SQRADDAC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB; 
-   COMBA_STORE(b[17]);
-
-   /* output 18 */
-   CARRY_FORWARD;
-   SQRADDSC(a[2], a[16]); SQRADDAC(a[3], a[15]); SQRADDAC(a[4], a[14]); SQRADDAC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]); 
-   COMBA_STORE(b[18]);
-
-   /* output 19 */
-   CARRY_FORWARD;
-   SQRADDSC(a[3], a[16]); SQRADDAC(a[4], a[15]); SQRADDAC(a[5], a[14]); SQRADDAC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB; 
-   COMBA_STORE(b[19]);
-
-   /* output 20 */
-   CARRY_FORWARD;
-   SQRADDSC(a[4], a[16]); SQRADDAC(a[5], a[15]); SQRADDAC(a[6], a[14]); SQRADDAC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]); 
-   COMBA_STORE(b[20]);
-
-   /* output 21 */
-   CARRY_FORWARD;
-   SQRADDSC(a[5], a[16]); SQRADDAC(a[6], a[15]); SQRADDAC(a[7], a[14]); SQRADDAC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB; 
-   COMBA_STORE(b[21]);
-
-   /* output 22 */
-   CARRY_FORWARD;
-   SQRADDSC(a[6], a[16]); SQRADDAC(a[7], a[15]); SQRADDAC(a[8], a[14]); SQRADDAC(a[9], a[13]); SQRADDAC(a[10], a[12]); SQRADDDB; SQRADD(a[11], a[11]); 
-   COMBA_STORE(b[22]);
-
-   /* output 23 */
-   CARRY_FORWARD;
-   SQRADDSC(a[7], a[16]); SQRADDAC(a[8], a[15]); SQRADDAC(a[9], a[14]); SQRADDAC(a[10], a[13]); SQRADDAC(a[11], a[12]); SQRADDDB; 
-   COMBA_STORE(b[23]);
-
-   /* output 24 */
-   CARRY_FORWARD;
-   SQRADDSC(a[8], a[16]); SQRADDAC(a[9], a[15]); SQRADDAC(a[10], a[14]); SQRADDAC(a[11], a[13]); SQRADDDB; SQRADD(a[12], a[12]); 
-   COMBA_STORE(b[24]);
-
-   /* output 25 */
-   CARRY_FORWARD;
-   SQRADDSC(a[9], a[16]); SQRADDAC(a[10], a[15]); SQRADDAC(a[11], a[14]); SQRADDAC(a[12], a[13]); SQRADDDB; 
-   COMBA_STORE(b[25]);
-
-   /* output 26 */
-   CARRY_FORWARD;
-   SQRADDSC(a[10], a[16]); SQRADDAC(a[11], a[15]); SQRADDAC(a[12], a[14]); SQRADDDB; SQRADD(a[13], a[13]); 
-   COMBA_STORE(b[26]);
-
-   /* output 27 */
-   CARRY_FORWARD;
-   SQRADDSC(a[11], a[16]); SQRADDAC(a[12], a[15]); SQRADDAC(a[13], a[14]); SQRADDDB; 
-   COMBA_STORE(b[27]);
-
-   /* output 28 */
-   CARRY_FORWARD;
-   SQRADD2(a[12], a[16]); SQRADD2(a[13], a[15]); SQRADD(a[14], a[14]); 
-   COMBA_STORE(b[28]);
-
-   /* output 29 */
-   CARRY_FORWARD;
-   SQRADD2(a[13], a[16]); SQRADD2(a[14], a[15]); 
-   COMBA_STORE(b[29]);
-
-   /* output 30 */
-   CARRY_FORWARD;
-   SQRADD2(a[14], a[16]); SQRADD(a[15], a[15]); 
-   COMBA_STORE(b[30]);
-
-   /* output 31 */
-   CARRY_FORWARD;
-   SQRADD2(a[15], a[16]); 
-   COMBA_STORE(b[31]);
-
-   /* output 32 */
-   CARRY_FORWARD;
-   SQRADD(a[16], a[16]); 
-   COMBA_STORE(b[32]);
-   COMBA_STORE2(b[33]);
-   COMBA_FINI;
-
-   B->used = 34;
-   B->sign = FP_ZPOS;
-   XMEMCPY(B->dp, b, 34 * sizeof(fp_digit));
-   fp_clamp(B);
-
-#ifdef WOLFSSL_SMALL_STACK
-   XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-#endif
-   return FP_OKAY;
-}
-#endif
-
-
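
All of these deleted squaring files share the scratch-buffer pattern visible above: results accumulate in a fixed stack array by default, or in a heap block under WOLFSSL_SMALL_STACK so constrained targets do not pay dozens of fp_digits of stack per call. Condensed, with N standing in for the per-function digit count (N is a placeholder here, not a real define):

    #ifndef WOLFSSL_SMALL_STACK
       fp_digit b[N];                        /* default: stack scratch */
    #else
       fp_digit *b = (fp_digit*)XMALLOC(sizeof(fp_digit) * N, NULL,
                                        DYNAMIC_TYPE_TMP_BUFFER);
       if (b == NULL)
          return FP_MEM;                     /* the only failure mode */
    #endif

       /* ... column accumulation into b[0..N-1] ... */

    #ifdef WOLFSSL_SMALL_STACK
       XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    #endif
       return FP_OKAY;

A consequence worth noting: these routines can only return FP_MEM in WOLFSSL_SMALL_STACK builds; in default builds they always return FP_OKAY.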

lib/wolfssl/wolfcrypt/src/fp_sqr_comba_20.i (+0 -257, file removed)

@@ -1,257 +0,0 @@
-/* fp_sqr_comba_20.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#ifdef TFM_SQR20
-int fp_sqr_comba20(fp_int *A, fp_int *B)
-{
-   fp_digit *a, c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0;
-#ifdef TFM_ISO
-   fp_word tt;
-#endif
-#ifndef WOLFSSL_SMALL_STACK
-   fp_digit b[40];
-#else
-   fp_digit *b;
-#endif
-
-#ifdef WOLFSSL_SMALL_STACK
-   b = (fp_digit*)XMALLOC(sizeof(fp_digit) * 40, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-   if (b == NULL)
-      return FP_MEM;
-#endif
-
-   a = A->dp;
-   COMBA_START; 
-
-   /* clear carries */
-   CLEAR_CARRY;
-
-   /* output 0 */
-   SQRADD(a[0],a[0]);
-   COMBA_STORE(b[0]);
-
-   /* output 1 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[1]); 
-   COMBA_STORE(b[1]);
-
-   /* output 2 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]); 
-   COMBA_STORE(b[2]);
-
-   /* output 3 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]); 
-   COMBA_STORE(b[3]);
-
-   /* output 4 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]); 
-   COMBA_STORE(b[4]);
-
-   /* output 5 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB; 
-   COMBA_STORE(b[5]);
-
-   /* output 6 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]); 
-   COMBA_STORE(b[6]);
-
-   /* output 7 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB; 
-   COMBA_STORE(b[7]);
-
-   /* output 8 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]); 
-   COMBA_STORE(b[8]);
-
-   /* output 9 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB; 
-   COMBA_STORE(b[9]);
-
-   /* output 10 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]); 
-   COMBA_STORE(b[10]);
-
-   /* output 11 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB; 
-   COMBA_STORE(b[11]);
-
-   /* output 12 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]); 
-   COMBA_STORE(b[12]);
-
-   /* output 13 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB; 
-   COMBA_STORE(b[13]);
-
-   /* output 14 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[14]); SQRADDAC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]); 
-   COMBA_STORE(b[14]);
-
-   /* output 15 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[15]); SQRADDAC(a[1], a[14]); SQRADDAC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB; 
-   COMBA_STORE(b[15]);
-
-   /* output 16 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[16]); SQRADDAC(a[1], a[15]); SQRADDAC(a[2], a[14]); SQRADDAC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]); 
-   COMBA_STORE(b[16]);
-
-   /* output 17 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[17]); SQRADDAC(a[1], a[16]); SQRADDAC(a[2], a[15]); SQRADDAC(a[3], a[14]); SQRADDAC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB; 
-   COMBA_STORE(b[17]);
-
-   /* output 18 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[18]); SQRADDAC(a[1], a[17]); SQRADDAC(a[2], a[16]); SQRADDAC(a[3], a[15]); SQRADDAC(a[4], a[14]); SQRADDAC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]); 
-   COMBA_STORE(b[18]);
-
-   /* output 19 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[19]); SQRADDAC(a[1], a[18]); SQRADDAC(a[2], a[17]); SQRADDAC(a[3], a[16]); SQRADDAC(a[4], a[15]); SQRADDAC(a[5], a[14]); SQRADDAC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB; 
-   COMBA_STORE(b[19]);
-
-   /* output 20 */
-   CARRY_FORWARD;
-   SQRADDSC(a[1], a[19]); SQRADDAC(a[2], a[18]); SQRADDAC(a[3], a[17]); SQRADDAC(a[4], a[16]); SQRADDAC(a[5], a[15]); SQRADDAC(a[6], a[14]); SQRADDAC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]); 
-   COMBA_STORE(b[20]);
-
-   /* output 21 */
-   CARRY_FORWARD;
-   SQRADDSC(a[2], a[19]); SQRADDAC(a[3], a[18]); SQRADDAC(a[4], a[17]); SQRADDAC(a[5], a[16]); SQRADDAC(a[6], a[15]); SQRADDAC(a[7], a[14]); SQRADDAC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB; 
-   COMBA_STORE(b[21]);
-
-   /* output 22 */
-   CARRY_FORWARD;
-   SQRADDSC(a[3], a[19]); SQRADDAC(a[4], a[18]); SQRADDAC(a[5], a[17]); SQRADDAC(a[6], a[16]); SQRADDAC(a[7], a[15]); SQRADDAC(a[8], a[14]); SQRADDAC(a[9], a[13]); SQRADDAC(a[10], a[12]); SQRADDDB; SQRADD(a[11], a[11]); 
-   COMBA_STORE(b[22]);
-
-   /* output 23 */
-   CARRY_FORWARD;
-   SQRADDSC(a[4], a[19]); SQRADDAC(a[5], a[18]); SQRADDAC(a[6], a[17]); SQRADDAC(a[7], a[16]); SQRADDAC(a[8], a[15]); SQRADDAC(a[9], a[14]); SQRADDAC(a[10], a[13]); SQRADDAC(a[11], a[12]); SQRADDDB; 
-   COMBA_STORE(b[23]);
-
-   /* output 24 */
-   CARRY_FORWARD;
-   SQRADDSC(a[5], a[19]); SQRADDAC(a[6], a[18]); SQRADDAC(a[7], a[17]); SQRADDAC(a[8], a[16]); SQRADDAC(a[9], a[15]); SQRADDAC(a[10], a[14]); SQRADDAC(a[11], a[13]); SQRADDDB; SQRADD(a[12], a[12]); 
-   COMBA_STORE(b[24]);
-
-   /* output 25 */
-   CARRY_FORWARD;
-   SQRADDSC(a[6], a[19]); SQRADDAC(a[7], a[18]); SQRADDAC(a[8], a[17]); SQRADDAC(a[9], a[16]); SQRADDAC(a[10], a[15]); SQRADDAC(a[11], a[14]); SQRADDAC(a[12], a[13]); SQRADDDB; 
-   COMBA_STORE(b[25]);
-
-   /* output 26 */
-   CARRY_FORWARD;
-   SQRADDSC(a[7], a[19]); SQRADDAC(a[8], a[18]); SQRADDAC(a[9], a[17]); SQRADDAC(a[10], a[16]); SQRADDAC(a[11], a[15]); SQRADDAC(a[12], a[14]); SQRADDDB; SQRADD(a[13], a[13]); 
-   COMBA_STORE(b[26]);
-
-   /* output 27 */
-   CARRY_FORWARD;
-   SQRADDSC(a[8], a[19]); SQRADDAC(a[9], a[18]); SQRADDAC(a[10], a[17]); SQRADDAC(a[11], a[16]); SQRADDAC(a[12], a[15]); SQRADDAC(a[13], a[14]); SQRADDDB; 
-   COMBA_STORE(b[27]);
-
-   /* output 28 */
-   CARRY_FORWARD;
-   SQRADDSC(a[9], a[19]); SQRADDAC(a[10], a[18]); SQRADDAC(a[11], a[17]); SQRADDAC(a[12], a[16]); SQRADDAC(a[13], a[15]); SQRADDDB; SQRADD(a[14], a[14]); 
-   COMBA_STORE(b[28]);
-
-   /* output 29 */
-   CARRY_FORWARD;
-   SQRADDSC(a[10], a[19]); SQRADDAC(a[11], a[18]); SQRADDAC(a[12], a[17]); SQRADDAC(a[13], a[16]); SQRADDAC(a[14], a[15]); SQRADDDB; 
-   COMBA_STORE(b[29]);
-
-   /* output 30 */
-   CARRY_FORWARD;
-   SQRADDSC(a[11], a[19]); SQRADDAC(a[12], a[18]); SQRADDAC(a[13], a[17]); SQRADDAC(a[14], a[16]); SQRADDDB; SQRADD(a[15], a[15]); 
-   COMBA_STORE(b[30]);
-
-   /* output 31 */
-   CARRY_FORWARD;
-   SQRADDSC(a[12], a[19]); SQRADDAC(a[13], a[18]); SQRADDAC(a[14], a[17]); SQRADDAC(a[15], a[16]); SQRADDDB; 
-   COMBA_STORE(b[31]);
-
-   /* output 32 */
-   CARRY_FORWARD;
-   SQRADDSC(a[13], a[19]); SQRADDAC(a[14], a[18]); SQRADDAC(a[15], a[17]); SQRADDDB; SQRADD(a[16], a[16]); 
-   COMBA_STORE(b[32]);
-
-   /* output 33 */
-   CARRY_FORWARD;
-   SQRADDSC(a[14], a[19]); SQRADDAC(a[15], a[18]); SQRADDAC(a[16], a[17]); SQRADDDB; 
-   COMBA_STORE(b[33]);
-
-   /* output 34 */
-   CARRY_FORWARD;
-   SQRADD2(a[15], a[19]); SQRADD2(a[16], a[18]); SQRADD(a[17], a[17]); 
-   COMBA_STORE(b[34]);
-
-   /* output 35 */
-   CARRY_FORWARD;
-   SQRADD2(a[16], a[19]); SQRADD2(a[17], a[18]); 
-   COMBA_STORE(b[35]);
-
-   /* output 36 */
-   CARRY_FORWARD;
-   SQRADD2(a[17], a[19]); SQRADD(a[18], a[18]); 
-   COMBA_STORE(b[36]);
-
-   /* output 37 */
-   CARRY_FORWARD;
-   SQRADD2(a[18], a[19]); 
-   COMBA_STORE(b[37]);
-
-   /* output 38 */
-   CARRY_FORWARD;
-   SQRADD(a[19], a[19]); 
-   COMBA_STORE(b[38]);
-   COMBA_STORE2(b[39]);
-   COMBA_FINI;
-
-   B->used = 40;
-   B->sign = FP_ZPOS;
-   XMEMCPY(B->dp, b, 40 * sizeof(fp_digit));
-   fp_clamp(B);
-
-#ifdef WOLFSSL_SMALL_STACK
-   XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-#endif
-   return FP_OKAY;
-}
-#endif
-
-
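
The three-macro sequence used for the wide middle columns above works in two stages: SQRADDSC seeds a secondary accumulator (sc2:sc1:sc0) with one cross product, SQRADDAC adds further cross products into it, and SQRADDDB folds the subtotal into the main chain twice, doubling all symmetric terms in a single pass rather than doubling each product individually. An illustrative portable-C reading follows; the real macros live in wolfSSL's tfm headers with platform-specific variants, so the definitions below (the _SK names and the 32-bit digit width) are assumptions for exposition only:

    #include <stdint.h>

    /* Illustrative stand-ins: fp_digit as 32-bit, fp_word as 64-bit. */
    static uint32_t c0, c1, c2;      /* main column accumulator */
    static uint32_t sc0, sc1, sc2;   /* cross-product subtotal  */

    #define SQRADDSC_SK(i, j) do {                                   \
        uint64_t t = (uint64_t)(i) * (uint64_t)(j);                  \
        sc0 = (uint32_t)t; sc1 = (uint32_t)(t >> 32); sc2 = 0;       \
    } while (0)

    #define SQRADDAC_SK(i, j) do {                                   \
        uint64_t t = (uint64_t)sc0 + (uint64_t)(i) * (uint64_t)(j);  \
        sc0 = (uint32_t)t;                                           \
        t = (t >> 32) + sc1;                                         \
        sc1 = (uint32_t)t; sc2 += (uint32_t)(t >> 32);               \
    } while (0)

    #define SQRADDDB_SK() do {                                       \
        uint64_t t = (uint64_t)sc0 + sc0 + c0;                       \
        c0 = (uint32_t)t;                                            \
        t = (t >> 32) + (uint64_t)sc1 + sc1 + c1;                    \
        c1 = (uint32_t)t;                                            \
        c2 = c2 + sc2 + sc2 + (uint32_t)(t >> 32);                   \
    } while (0)

Summing the cross products once and doubling the subtotal saves one doubled carry chain per product compared with applying SQRADD2 to every pair.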

lib/wolfssl/wolfcrypt/src/fp_sqr_comba_24.i (+0 -297, file removed)

@@ -1,297 +0,0 @@
-/* fp_sqr_comba_24.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#ifdef TFM_SQR24
-int fp_sqr_comba24(fp_int *A, fp_int *B)
-{
-   fp_digit *a, c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0;
-#ifdef TFM_ISO
-   fp_word tt;
-#endif
-#ifndef WOLFSSL_SMALL_STACK
-   fp_digit b[48];
-#else
-   fp_digit *b;
-#endif
-
-#ifdef WOLFSSL_SMALL_STACK
-   b = (fp_digit*)XMALLOC(sizeof(fp_digit) * 48, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-   if (b == NULL)
-      return FP_MEM;
-#endif
-
-   a = A->dp;
-   COMBA_START; 
-
-   /* clear carries */
-   CLEAR_CARRY;
-
-   /* output 0 */
-   SQRADD(a[0],a[0]);
-   COMBA_STORE(b[0]);
-
-   /* output 1 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[1]); 
-   COMBA_STORE(b[1]);
-
-   /* output 2 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]); 
-   COMBA_STORE(b[2]);
-
-   /* output 3 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]); 
-   COMBA_STORE(b[3]);
-
-   /* output 4 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]); 
-   COMBA_STORE(b[4]);
-
-   /* output 5 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB; 
-   COMBA_STORE(b[5]);
-
-   /* output 6 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]); 
-   COMBA_STORE(b[6]);
-
-   /* output 7 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB; 
-   COMBA_STORE(b[7]);
-
-   /* output 8 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]); 
-   COMBA_STORE(b[8]);
-
-   /* output 9 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB; 
-   COMBA_STORE(b[9]);
-
-   /* output 10 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]); 
-   COMBA_STORE(b[10]);
-
-   /* output 11 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB; 
-   COMBA_STORE(b[11]);
-
-   /* output 12 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]); 
-   COMBA_STORE(b[12]);
-
-   /* output 13 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB; 
-   COMBA_STORE(b[13]);
-
-   /* output 14 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[14]); SQRADDAC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]); 
-   COMBA_STORE(b[14]);
-
-   /* output 15 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[15]); SQRADDAC(a[1], a[14]); SQRADDAC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB; 
-   COMBA_STORE(b[15]);
-
-   /* output 16 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[16]); SQRADDAC(a[1], a[15]); SQRADDAC(a[2], a[14]); SQRADDAC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]); 
-   COMBA_STORE(b[16]);
-
-   /* output 17 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[17]); SQRADDAC(a[1], a[16]); SQRADDAC(a[2], a[15]); SQRADDAC(a[3], a[14]); SQRADDAC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB; 
-   COMBA_STORE(b[17]);
-
-   /* output 18 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[18]); SQRADDAC(a[1], a[17]); SQRADDAC(a[2], a[16]); SQRADDAC(a[3], a[15]); SQRADDAC(a[4], a[14]); SQRADDAC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]); 
-   COMBA_STORE(b[18]);
-
-   /* output 19 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[19]); SQRADDAC(a[1], a[18]); SQRADDAC(a[2], a[17]); SQRADDAC(a[3], a[16]); SQRADDAC(a[4], a[15]); SQRADDAC(a[5], a[14]); SQRADDAC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB; 
-   COMBA_STORE(b[19]);
-
-   /* output 20 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[20]); SQRADDAC(a[1], a[19]); SQRADDAC(a[2], a[18]); SQRADDAC(a[3], a[17]); SQRADDAC(a[4], a[16]); SQRADDAC(a[5], a[15]); SQRADDAC(a[6], a[14]); SQRADDAC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]); 
-   COMBA_STORE(b[20]);
-
-   /* output 21 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[21]); SQRADDAC(a[1], a[20]); SQRADDAC(a[2], a[19]); SQRADDAC(a[3], a[18]); SQRADDAC(a[4], a[17]); SQRADDAC(a[5], a[16]); SQRADDAC(a[6], a[15]); SQRADDAC(a[7], a[14]); SQRADDAC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB; 
-   COMBA_STORE(b[21]);
-
-   /* output 22 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[22]); SQRADDAC(a[1], a[21]); SQRADDAC(a[2], a[20]); SQRADDAC(a[3], a[19]); SQRADDAC(a[4], a[18]); SQRADDAC(a[5], a[17]); SQRADDAC(a[6], a[16]); SQRADDAC(a[7], a[15]); SQRADDAC(a[8], a[14]); SQRADDAC(a[9], a[13]); SQRADDAC(a[10], a[12]); SQRADDDB; SQRADD(a[11], a[11]); 
-   COMBA_STORE(b[22]);
-
-   /* output 23 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[23]); SQRADDAC(a[1], a[22]); SQRADDAC(a[2], a[21]); SQRADDAC(a[3], a[20]); SQRADDAC(a[4], a[19]); SQRADDAC(a[5], a[18]); SQRADDAC(a[6], a[17]); SQRADDAC(a[7], a[16]); SQRADDAC(a[8], a[15]); SQRADDAC(a[9], a[14]); SQRADDAC(a[10], a[13]); SQRADDAC(a[11], a[12]); SQRADDDB; 
-   COMBA_STORE(b[23]);
-
-   /* output 24 */
-   CARRY_FORWARD;
-   SQRADDSC(a[1], a[23]); SQRADDAC(a[2], a[22]); SQRADDAC(a[3], a[21]); SQRADDAC(a[4], a[20]); SQRADDAC(a[5], a[19]); SQRADDAC(a[6], a[18]); SQRADDAC(a[7], a[17]); SQRADDAC(a[8], a[16]); SQRADDAC(a[9], a[15]); SQRADDAC(a[10], a[14]); SQRADDAC(a[11], a[13]); SQRADDDB; SQRADD(a[12], a[12]); 
-   COMBA_STORE(b[24]);
-
-   /* output 25 */
-   CARRY_FORWARD;
-   SQRADDSC(a[2], a[23]); SQRADDAC(a[3], a[22]); SQRADDAC(a[4], a[21]); SQRADDAC(a[5], a[20]); SQRADDAC(a[6], a[19]); SQRADDAC(a[7], a[18]); SQRADDAC(a[8], a[17]); SQRADDAC(a[9], a[16]); SQRADDAC(a[10], a[15]); SQRADDAC(a[11], a[14]); SQRADDAC(a[12], a[13]); SQRADDDB; 
-   COMBA_STORE(b[25]);
-
-   /* output 26 */
-   CARRY_FORWARD;
-   SQRADDSC(a[3], a[23]); SQRADDAC(a[4], a[22]); SQRADDAC(a[5], a[21]); SQRADDAC(a[6], a[20]); SQRADDAC(a[7], a[19]); SQRADDAC(a[8], a[18]); SQRADDAC(a[9], a[17]); SQRADDAC(a[10], a[16]); SQRADDAC(a[11], a[15]); SQRADDAC(a[12], a[14]); SQRADDDB; SQRADD(a[13], a[13]); 
-   COMBA_STORE(b[26]);
-
-   /* output 27 */
-   CARRY_FORWARD;
-   SQRADDSC(a[4], a[23]); SQRADDAC(a[5], a[22]); SQRADDAC(a[6], a[21]); SQRADDAC(a[7], a[20]); SQRADDAC(a[8], a[19]); SQRADDAC(a[9], a[18]); SQRADDAC(a[10], a[17]); SQRADDAC(a[11], a[16]); SQRADDAC(a[12], a[15]); SQRADDAC(a[13], a[14]); SQRADDDB; 
-   COMBA_STORE(b[27]);
-
-   /* output 28 */
-   CARRY_FORWARD;
-   SQRADDSC(a[5], a[23]); SQRADDAC(a[6], a[22]); SQRADDAC(a[7], a[21]); SQRADDAC(a[8], a[20]); SQRADDAC(a[9], a[19]); SQRADDAC(a[10], a[18]); SQRADDAC(a[11], a[17]); SQRADDAC(a[12], a[16]); SQRADDAC(a[13], a[15]); SQRADDDB; SQRADD(a[14], a[14]); 
-   COMBA_STORE(b[28]);
-
-   /* output 29 */
-   CARRY_FORWARD;
-   SQRADDSC(a[6], a[23]); SQRADDAC(a[7], a[22]); SQRADDAC(a[8], a[21]); SQRADDAC(a[9], a[20]); SQRADDAC(a[10], a[19]); SQRADDAC(a[11], a[18]); SQRADDAC(a[12], a[17]); SQRADDAC(a[13], a[16]); SQRADDAC(a[14], a[15]); SQRADDDB; 
-   COMBA_STORE(b[29]);
-
-   /* output 30 */
-   CARRY_FORWARD;
-   SQRADDSC(a[7], a[23]); SQRADDAC(a[8], a[22]); SQRADDAC(a[9], a[21]); SQRADDAC(a[10], a[20]); SQRADDAC(a[11], a[19]); SQRADDAC(a[12], a[18]); SQRADDAC(a[13], a[17]); SQRADDAC(a[14], a[16]); SQRADDDB; SQRADD(a[15], a[15]); 
-   COMBA_STORE(b[30]);
-
-   /* output 31 */
-   CARRY_FORWARD;
-   SQRADDSC(a[8], a[23]); SQRADDAC(a[9], a[22]); SQRADDAC(a[10], a[21]); SQRADDAC(a[11], a[20]); SQRADDAC(a[12], a[19]); SQRADDAC(a[13], a[18]); SQRADDAC(a[14], a[17]); SQRADDAC(a[15], a[16]); SQRADDDB; 
-   COMBA_STORE(b[31]);
-
-   /* output 32 */
-   CARRY_FORWARD;
-   SQRADDSC(a[9], a[23]); SQRADDAC(a[10], a[22]); SQRADDAC(a[11], a[21]); SQRADDAC(a[12], a[20]); SQRADDAC(a[13], a[19]); SQRADDAC(a[14], a[18]); SQRADDAC(a[15], a[17]); SQRADDDB; SQRADD(a[16], a[16]); 
-   COMBA_STORE(b[32]);
-
-   /* output 33 */
-   CARRY_FORWARD;
-   SQRADDSC(a[10], a[23]); SQRADDAC(a[11], a[22]); SQRADDAC(a[12], a[21]); SQRADDAC(a[13], a[20]); SQRADDAC(a[14], a[19]); SQRADDAC(a[15], a[18]); SQRADDAC(a[16], a[17]); SQRADDDB; 
-   COMBA_STORE(b[33]);
-
-   /* output 34 */
-   CARRY_FORWARD;
-   SQRADDSC(a[11], a[23]); SQRADDAC(a[12], a[22]); SQRADDAC(a[13], a[21]); SQRADDAC(a[14], a[20]); SQRADDAC(a[15], a[19]); SQRADDAC(a[16], a[18]); SQRADDDB; SQRADD(a[17], a[17]); 
-   COMBA_STORE(b[34]);
-
-   /* output 35 */
-   CARRY_FORWARD;
-   SQRADDSC(a[12], a[23]); SQRADDAC(a[13], a[22]); SQRADDAC(a[14], a[21]); SQRADDAC(a[15], a[20]); SQRADDAC(a[16], a[19]); SQRADDAC(a[17], a[18]); SQRADDDB; 
-   COMBA_STORE(b[35]);
-
-   /* output 36 */
-   CARRY_FORWARD;
-   SQRADDSC(a[13], a[23]); SQRADDAC(a[14], a[22]); SQRADDAC(a[15], a[21]); SQRADDAC(a[16], a[20]); SQRADDAC(a[17], a[19]); SQRADDDB; SQRADD(a[18], a[18]); 
-   COMBA_STORE(b[36]);
-
-   /* output 37 */
-   CARRY_FORWARD;
-   SQRADDSC(a[14], a[23]); SQRADDAC(a[15], a[22]); SQRADDAC(a[16], a[21]); SQRADDAC(a[17], a[20]); SQRADDAC(a[18], a[19]); SQRADDDB; 
-   COMBA_STORE(b[37]);
-
-   /* output 38 */
-   CARRY_FORWARD;
-   SQRADDSC(a[15], a[23]); SQRADDAC(a[16], a[22]); SQRADDAC(a[17], a[21]); SQRADDAC(a[18], a[20]); SQRADDDB; SQRADD(a[19], a[19]); 
-   COMBA_STORE(b[38]);
-
-   /* output 39 */
-   CARRY_FORWARD;
-   SQRADDSC(a[16], a[23]); SQRADDAC(a[17], a[22]); SQRADDAC(a[18], a[21]); SQRADDAC(a[19], a[20]); SQRADDDB; 
-   COMBA_STORE(b[39]);
-
-   /* output 40 */
-   CARRY_FORWARD;
-   SQRADDSC(a[17], a[23]); SQRADDAC(a[18], a[22]); SQRADDAC(a[19], a[21]); SQRADDDB; SQRADD(a[20], a[20]); 
-   COMBA_STORE(b[40]);
-
-   /* output 41 */
-   CARRY_FORWARD;
-   SQRADDSC(a[18], a[23]); SQRADDAC(a[19], a[22]); SQRADDAC(a[20], a[21]); SQRADDDB; 
-   COMBA_STORE(b[41]);
-
-   /* output 42 */
-   CARRY_FORWARD;
-   SQRADD2(a[19], a[23]); SQRADD2(a[20], a[22]); SQRADD(a[21], a[21]); 
-   COMBA_STORE(b[42]);
-
-   /* output 43 */
-   CARRY_FORWARD;
-   SQRADD2(a[20], a[23]); SQRADD2(a[21], a[22]); 
-   COMBA_STORE(b[43]);
-
-   /* output 44 */
-   CARRY_FORWARD;
-   SQRADD2(a[21], a[23]); SQRADD(a[22], a[22]); 
-   COMBA_STORE(b[44]);
-
-   /* output 45 */
-   CARRY_FORWARD;
-   SQRADD2(a[22], a[23]); 
-   COMBA_STORE(b[45]);
-
-   /* output 46 */
-   CARRY_FORWARD;
-   SQRADD(a[23], a[23]); 
-   COMBA_STORE(b[46]);
-   COMBA_STORE2(b[47]);
-   COMBA_FINI;
-
-   B->used = 48;
-   B->sign = FP_ZPOS;
-   XMEMCPY(B->dp, b, 48 * sizeof(fp_digit));
-   fp_clamp(B);
-
-#ifdef WOLFSSL_SMALL_STACK
-   XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-#endif
-   return FP_OKAY;
-}
-#endif
-
-

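(To make the macro sequence concrete: output 5 of fp_sqr_comba24() above expands to three cross products summed once and then doubled on entry to the main carry chain. A plain-C restatement — a hypothetical helper, using unsigned __int128, a GCC/Clang extension, purely for brevity; the real macros keep everything in digit-sized registers:

```c
#include <stdint.h>

/* What one "sum then double" column amounts to, e.g. output 5 above:
 *   SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]);
 *   SQRADDAC(a[2], a[3]); SQRADDDB;
 * SC starts the sub-sum, AC extends it, and DB doubles the whole
 * sub-sum while folding it into the c0/c1/c2 chain. */
static unsigned __int128 column5(const uint32_t a[6])
{
    unsigned __int128 s = (unsigned __int128)a[0] * a[5]
                        + (unsigned __int128)a[1] * a[4]
                        + (unsigned __int128)a[2] * a[3];
    return 2 * s;   /* column 5's contribution before the carry split */
}
```

Doubling once at the end rather than doubling each product is what lets the wide middle columns get by with one extra accumulator instead of a longer carry chain.)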
+ 0 - 337
lib/wolfssl/wolfcrypt/src/fp_sqr_comba_28.i

@@ -1,337 +0,0 @@
-/* fp_sqr_comba_28.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#ifdef TFM_SQR28
-int fp_sqr_comba28(fp_int *A, fp_int *B)
-{
-   fp_digit *a, c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0;
-#ifdef TFM_ISO
-   fp_word tt;
-#endif
-#ifndef WOLFSSL_SMALL_STACK
-   fp_digit b[56];
-#else
-   fp_digit *b;
-#endif
-
-#ifdef WOLFSSL_SMALL_STACK
-   b = (fp_digit*)XMALLOC(sizeof(fp_digit) * 56, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-   if (b == NULL)
-      return FP_MEM;
-#endif
-
-   a = A->dp;
-   COMBA_START; 
-
-   /* clear carries */
-   CLEAR_CARRY;
-
-   /* output 0 */
-   SQRADD(a[0],a[0]);
-   COMBA_STORE(b[0]);
-
-   /* output 1 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[1]); 
-   COMBA_STORE(b[1]);
-
-   /* output 2 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]); 
-   COMBA_STORE(b[2]);
-
-   /* output 3 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]); 
-   COMBA_STORE(b[3]);
-
-   /* output 4 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]); 
-   COMBA_STORE(b[4]);
-
-   /* output 5 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB; 
-   COMBA_STORE(b[5]);
-
-   /* output 6 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]); 
-   COMBA_STORE(b[6]);
-
-   /* output 7 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB; 
-   COMBA_STORE(b[7]);
-
-   /* output 8 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]); 
-   COMBA_STORE(b[8]);
-
-   /* output 9 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB; 
-   COMBA_STORE(b[9]);
-
-   /* output 10 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]); 
-   COMBA_STORE(b[10]);
-
-   /* output 11 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB; 
-   COMBA_STORE(b[11]);
-
-   /* output 12 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]); 
-   COMBA_STORE(b[12]);
-
-   /* output 13 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB; 
-   COMBA_STORE(b[13]);
-
-   /* output 14 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[14]); SQRADDAC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]); 
-   COMBA_STORE(b[14]);
-
-   /* output 15 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[15]); SQRADDAC(a[1], a[14]); SQRADDAC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB; 
-   COMBA_STORE(b[15]);
-
-   /* output 16 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[16]); SQRADDAC(a[1], a[15]); SQRADDAC(a[2], a[14]); SQRADDAC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]); 
-   COMBA_STORE(b[16]);
-
-   /* output 17 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[17]); SQRADDAC(a[1], a[16]); SQRADDAC(a[2], a[15]); SQRADDAC(a[3], a[14]); SQRADDAC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB; 
-   COMBA_STORE(b[17]);
-
-   /* output 18 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[18]); SQRADDAC(a[1], a[17]); SQRADDAC(a[2], a[16]); SQRADDAC(a[3], a[15]); SQRADDAC(a[4], a[14]); SQRADDAC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]); 
-   COMBA_STORE(b[18]);
-
-   /* output 19 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[19]); SQRADDAC(a[1], a[18]); SQRADDAC(a[2], a[17]); SQRADDAC(a[3], a[16]); SQRADDAC(a[4], a[15]); SQRADDAC(a[5], a[14]); SQRADDAC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB; 
-   COMBA_STORE(b[19]);
-
-   /* output 20 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[20]); SQRADDAC(a[1], a[19]); SQRADDAC(a[2], a[18]); SQRADDAC(a[3], a[17]); SQRADDAC(a[4], a[16]); SQRADDAC(a[5], a[15]); SQRADDAC(a[6], a[14]); SQRADDAC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]); 
-   COMBA_STORE(b[20]);
-
-   /* output 21 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[21]); SQRADDAC(a[1], a[20]); SQRADDAC(a[2], a[19]); SQRADDAC(a[3], a[18]); SQRADDAC(a[4], a[17]); SQRADDAC(a[5], a[16]); SQRADDAC(a[6], a[15]); SQRADDAC(a[7], a[14]); SQRADDAC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB; 
-   COMBA_STORE(b[21]);
-
-   /* output 22 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[22]); SQRADDAC(a[1], a[21]); SQRADDAC(a[2], a[20]); SQRADDAC(a[3], a[19]); SQRADDAC(a[4], a[18]); SQRADDAC(a[5], a[17]); SQRADDAC(a[6], a[16]); SQRADDAC(a[7], a[15]); SQRADDAC(a[8], a[14]); SQRADDAC(a[9], a[13]); SQRADDAC(a[10], a[12]); SQRADDDB; SQRADD(a[11], a[11]); 
-   COMBA_STORE(b[22]);
-
-   /* output 23 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[23]); SQRADDAC(a[1], a[22]); SQRADDAC(a[2], a[21]); SQRADDAC(a[3], a[20]); SQRADDAC(a[4], a[19]); SQRADDAC(a[5], a[18]); SQRADDAC(a[6], a[17]); SQRADDAC(a[7], a[16]); SQRADDAC(a[8], a[15]); SQRADDAC(a[9], a[14]); SQRADDAC(a[10], a[13]); SQRADDAC(a[11], a[12]); SQRADDDB; 
-   COMBA_STORE(b[23]);
-
-   /* output 24 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[24]); SQRADDAC(a[1], a[23]); SQRADDAC(a[2], a[22]); SQRADDAC(a[3], a[21]); SQRADDAC(a[4], a[20]); SQRADDAC(a[5], a[19]); SQRADDAC(a[6], a[18]); SQRADDAC(a[7], a[17]); SQRADDAC(a[8], a[16]); SQRADDAC(a[9], a[15]); SQRADDAC(a[10], a[14]); SQRADDAC(a[11], a[13]); SQRADDDB; SQRADD(a[12], a[12]); 
-   COMBA_STORE(b[24]);
-
-   /* output 25 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[25]); SQRADDAC(a[1], a[24]); SQRADDAC(a[2], a[23]); SQRADDAC(a[3], a[22]); SQRADDAC(a[4], a[21]); SQRADDAC(a[5], a[20]); SQRADDAC(a[6], a[19]); SQRADDAC(a[7], a[18]); SQRADDAC(a[8], a[17]); SQRADDAC(a[9], a[16]); SQRADDAC(a[10], a[15]); SQRADDAC(a[11], a[14]); SQRADDAC(a[12], a[13]); SQRADDDB; 
-   COMBA_STORE(b[25]);
-
-   /* output 26 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[26]); SQRADDAC(a[1], a[25]); SQRADDAC(a[2], a[24]); SQRADDAC(a[3], a[23]); SQRADDAC(a[4], a[22]); SQRADDAC(a[5], a[21]); SQRADDAC(a[6], a[20]); SQRADDAC(a[7], a[19]); SQRADDAC(a[8], a[18]); SQRADDAC(a[9], a[17]); SQRADDAC(a[10], a[16]); SQRADDAC(a[11], a[15]); SQRADDAC(a[12], a[14]); SQRADDDB; SQRADD(a[13], a[13]); 
-   COMBA_STORE(b[26]);
-
-   /* output 27 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[27]); SQRADDAC(a[1], a[26]); SQRADDAC(a[2], a[25]); SQRADDAC(a[3], a[24]); SQRADDAC(a[4], a[23]); SQRADDAC(a[5], a[22]); SQRADDAC(a[6], a[21]); SQRADDAC(a[7], a[20]); SQRADDAC(a[8], a[19]); SQRADDAC(a[9], a[18]); SQRADDAC(a[10], a[17]); SQRADDAC(a[11], a[16]); SQRADDAC(a[12], a[15]); SQRADDAC(a[13], a[14]); SQRADDDB; 
-   COMBA_STORE(b[27]);
-
-   /* output 28 */
-   CARRY_FORWARD;
-   SQRADDSC(a[1], a[27]); SQRADDAC(a[2], a[26]); SQRADDAC(a[3], a[25]); SQRADDAC(a[4], a[24]); SQRADDAC(a[5], a[23]); SQRADDAC(a[6], a[22]); SQRADDAC(a[7], a[21]); SQRADDAC(a[8], a[20]); SQRADDAC(a[9], a[19]); SQRADDAC(a[10], a[18]); SQRADDAC(a[11], a[17]); SQRADDAC(a[12], a[16]); SQRADDAC(a[13], a[15]); SQRADDDB; SQRADD(a[14], a[14]); 
-   COMBA_STORE(b[28]);
-
-   /* output 29 */
-   CARRY_FORWARD;
-   SQRADDSC(a[2], a[27]); SQRADDAC(a[3], a[26]); SQRADDAC(a[4], a[25]); SQRADDAC(a[5], a[24]); SQRADDAC(a[6], a[23]); SQRADDAC(a[7], a[22]); SQRADDAC(a[8], a[21]); SQRADDAC(a[9], a[20]); SQRADDAC(a[10], a[19]); SQRADDAC(a[11], a[18]); SQRADDAC(a[12], a[17]); SQRADDAC(a[13], a[16]); SQRADDAC(a[14], a[15]); SQRADDDB; 
-   COMBA_STORE(b[29]);
-
-   /* output 30 */
-   CARRY_FORWARD;
-   SQRADDSC(a[3], a[27]); SQRADDAC(a[4], a[26]); SQRADDAC(a[5], a[25]); SQRADDAC(a[6], a[24]); SQRADDAC(a[7], a[23]); SQRADDAC(a[8], a[22]); SQRADDAC(a[9], a[21]); SQRADDAC(a[10], a[20]); SQRADDAC(a[11], a[19]); SQRADDAC(a[12], a[18]); SQRADDAC(a[13], a[17]); SQRADDAC(a[14], a[16]); SQRADDDB; SQRADD(a[15], a[15]); 
-   COMBA_STORE(b[30]);
-
-   /* output 31 */
-   CARRY_FORWARD;
-   SQRADDSC(a[4], a[27]); SQRADDAC(a[5], a[26]); SQRADDAC(a[6], a[25]); SQRADDAC(a[7], a[24]); SQRADDAC(a[8], a[23]); SQRADDAC(a[9], a[22]); SQRADDAC(a[10], a[21]); SQRADDAC(a[11], a[20]); SQRADDAC(a[12], a[19]); SQRADDAC(a[13], a[18]); SQRADDAC(a[14], a[17]); SQRADDAC(a[15], a[16]); SQRADDDB; 
-   COMBA_STORE(b[31]);
-
-   /* output 32 */
-   CARRY_FORWARD;
-   SQRADDSC(a[5], a[27]); SQRADDAC(a[6], a[26]); SQRADDAC(a[7], a[25]); SQRADDAC(a[8], a[24]); SQRADDAC(a[9], a[23]); SQRADDAC(a[10], a[22]); SQRADDAC(a[11], a[21]); SQRADDAC(a[12], a[20]); SQRADDAC(a[13], a[19]); SQRADDAC(a[14], a[18]); SQRADDAC(a[15], a[17]); SQRADDDB; SQRADD(a[16], a[16]); 
-   COMBA_STORE(b[32]);
-
-   /* output 33 */
-   CARRY_FORWARD;
-   SQRADDSC(a[6], a[27]); SQRADDAC(a[7], a[26]); SQRADDAC(a[8], a[25]); SQRADDAC(a[9], a[24]); SQRADDAC(a[10], a[23]); SQRADDAC(a[11], a[22]); SQRADDAC(a[12], a[21]); SQRADDAC(a[13], a[20]); SQRADDAC(a[14], a[19]); SQRADDAC(a[15], a[18]); SQRADDAC(a[16], a[17]); SQRADDDB; 
-   COMBA_STORE(b[33]);
-
-   /* output 34 */
-   CARRY_FORWARD;
-   SQRADDSC(a[7], a[27]); SQRADDAC(a[8], a[26]); SQRADDAC(a[9], a[25]); SQRADDAC(a[10], a[24]); SQRADDAC(a[11], a[23]); SQRADDAC(a[12], a[22]); SQRADDAC(a[13], a[21]); SQRADDAC(a[14], a[20]); SQRADDAC(a[15], a[19]); SQRADDAC(a[16], a[18]); SQRADDDB; SQRADD(a[17], a[17]); 
-   COMBA_STORE(b[34]);
-
-   /* output 35 */
-   CARRY_FORWARD;
-   SQRADDSC(a[8], a[27]); SQRADDAC(a[9], a[26]); SQRADDAC(a[10], a[25]); SQRADDAC(a[11], a[24]); SQRADDAC(a[12], a[23]); SQRADDAC(a[13], a[22]); SQRADDAC(a[14], a[21]); SQRADDAC(a[15], a[20]); SQRADDAC(a[16], a[19]); SQRADDAC(a[17], a[18]); SQRADDDB; 
-   COMBA_STORE(b[35]);
-
-   /* output 36 */
-   CARRY_FORWARD;
-   SQRADDSC(a[9], a[27]); SQRADDAC(a[10], a[26]); SQRADDAC(a[11], a[25]); SQRADDAC(a[12], a[24]); SQRADDAC(a[13], a[23]); SQRADDAC(a[14], a[22]); SQRADDAC(a[15], a[21]); SQRADDAC(a[16], a[20]); SQRADDAC(a[17], a[19]); SQRADDDB; SQRADD(a[18], a[18]); 
-   COMBA_STORE(b[36]);
-
-   /* output 37 */
-   CARRY_FORWARD;
-   SQRADDSC(a[10], a[27]); SQRADDAC(a[11], a[26]); SQRADDAC(a[12], a[25]); SQRADDAC(a[13], a[24]); SQRADDAC(a[14], a[23]); SQRADDAC(a[15], a[22]); SQRADDAC(a[16], a[21]); SQRADDAC(a[17], a[20]); SQRADDAC(a[18], a[19]); SQRADDDB; 
-   COMBA_STORE(b[37]);
-
-   /* output 38 */
-   CARRY_FORWARD;
-   SQRADDSC(a[11], a[27]); SQRADDAC(a[12], a[26]); SQRADDAC(a[13], a[25]); SQRADDAC(a[14], a[24]); SQRADDAC(a[15], a[23]); SQRADDAC(a[16], a[22]); SQRADDAC(a[17], a[21]); SQRADDAC(a[18], a[20]); SQRADDDB; SQRADD(a[19], a[19]); 
-   COMBA_STORE(b[38]);
-
-   /* output 39 */
-   CARRY_FORWARD;
-   SQRADDSC(a[12], a[27]); SQRADDAC(a[13], a[26]); SQRADDAC(a[14], a[25]); SQRADDAC(a[15], a[24]); SQRADDAC(a[16], a[23]); SQRADDAC(a[17], a[22]); SQRADDAC(a[18], a[21]); SQRADDAC(a[19], a[20]); SQRADDDB; 
-   COMBA_STORE(b[39]);
-
-   /* output 40 */
-   CARRY_FORWARD;
-   SQRADDSC(a[13], a[27]); SQRADDAC(a[14], a[26]); SQRADDAC(a[15], a[25]); SQRADDAC(a[16], a[24]); SQRADDAC(a[17], a[23]); SQRADDAC(a[18], a[22]); SQRADDAC(a[19], a[21]); SQRADDDB; SQRADD(a[20], a[20]); 
-   COMBA_STORE(b[40]);
-
-   /* output 41 */
-   CARRY_FORWARD;
-   SQRADDSC(a[14], a[27]); SQRADDAC(a[15], a[26]); SQRADDAC(a[16], a[25]); SQRADDAC(a[17], a[24]); SQRADDAC(a[18], a[23]); SQRADDAC(a[19], a[22]); SQRADDAC(a[20], a[21]); SQRADDDB; 
-   COMBA_STORE(b[41]);
-
-   /* output 42 */
-   CARRY_FORWARD;
-   SQRADDSC(a[15], a[27]); SQRADDAC(a[16], a[26]); SQRADDAC(a[17], a[25]); SQRADDAC(a[18], a[24]); SQRADDAC(a[19], a[23]); SQRADDAC(a[20], a[22]); SQRADDDB; SQRADD(a[21], a[21]); 
-   COMBA_STORE(b[42]);
-
-   /* output 43 */
-   CARRY_FORWARD;
-   SQRADDSC(a[16], a[27]); SQRADDAC(a[17], a[26]); SQRADDAC(a[18], a[25]); SQRADDAC(a[19], a[24]); SQRADDAC(a[20], a[23]); SQRADDAC(a[21], a[22]); SQRADDDB; 
-   COMBA_STORE(b[43]);
-
-   /* output 44 */
-   CARRY_FORWARD;
-   SQRADDSC(a[17], a[27]); SQRADDAC(a[18], a[26]); SQRADDAC(a[19], a[25]); SQRADDAC(a[20], a[24]); SQRADDAC(a[21], a[23]); SQRADDDB; SQRADD(a[22], a[22]); 
-   COMBA_STORE(b[44]);
-
-   /* output 45 */
-   CARRY_FORWARD;
-   SQRADDSC(a[18], a[27]); SQRADDAC(a[19], a[26]); SQRADDAC(a[20], a[25]); SQRADDAC(a[21], a[24]); SQRADDAC(a[22], a[23]); SQRADDDB; 
-   COMBA_STORE(b[45]);
-
-   /* output 46 */
-   CARRY_FORWARD;
-   SQRADDSC(a[19], a[27]); SQRADDAC(a[20], a[26]); SQRADDAC(a[21], a[25]); SQRADDAC(a[22], a[24]); SQRADDDB; SQRADD(a[23], a[23]); 
-   COMBA_STORE(b[46]);
-
-   /* output 47 */
-   CARRY_FORWARD;
-   SQRADDSC(a[20], a[27]); SQRADDAC(a[21], a[26]); SQRADDAC(a[22], a[25]); SQRADDAC(a[23], a[24]); SQRADDDB; 
-   COMBA_STORE(b[47]);
-
-   /* output 48 */
-   CARRY_FORWARD;
-   SQRADDSC(a[21], a[27]); SQRADDAC(a[22], a[26]); SQRADDAC(a[23], a[25]); SQRADDDB; SQRADD(a[24], a[24]); 
-   COMBA_STORE(b[48]);
-
-   /* output 49 */
-   CARRY_FORWARD;
-   SQRADDSC(a[22], a[27]); SQRADDAC(a[23], a[26]); SQRADDAC(a[24], a[25]); SQRADDDB; 
-   COMBA_STORE(b[49]);
-
-   /* output 50 */
-   CARRY_FORWARD;
-   SQRADD2(a[23], a[27]); SQRADD2(a[24], a[26]); SQRADD(a[25], a[25]); 
-   COMBA_STORE(b[50]);
-
-   /* output 51 */
-   CARRY_FORWARD;
-   SQRADD2(a[24], a[27]); SQRADD2(a[25], a[26]); 
-   COMBA_STORE(b[51]);
-
-   /* output 52 */
-   CARRY_FORWARD;
-   SQRADD2(a[25], a[27]); SQRADD(a[26], a[26]); 
-   COMBA_STORE(b[52]);
-
-   /* output 53 */
-   CARRY_FORWARD;
-   SQRADD2(a[26], a[27]); 
-   COMBA_STORE(b[53]);
-
-   /* output 54 */
-   CARRY_FORWARD;
-   SQRADD(a[27], a[27]); 
-   COMBA_STORE(b[54]);
-   COMBA_STORE2(b[55]);
-   COMBA_FINI;
-
-   B->used = 56;
-   B->sign = FP_ZPOS;
-   XMEMCPY(B->dp, b, 56 * sizeof(fp_digit));
-   fp_clamp(B);
-
-#ifdef WOLFSSL_SMALL_STACK
-   XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-#endif
-   return FP_OKAY;
-}
-#endif
-
-

+ 0 - 73
lib/wolfssl/wolfcrypt/src/fp_sqr_comba_3.i

@@ -1,73 +0,0 @@
-/* fp_sqr_comba_3.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#ifdef TFM_SQR3
-int fp_sqr_comba3(fp_int *A, fp_int *B)
-{
-   fp_digit *a, b[6], c0, c1, c2;
-#ifdef TFM_ISO
-   fp_word tt;
-#endif
-
-   a = A->dp;
-   COMBA_START; 
-
-   /* clear carries */
-   CLEAR_CARRY;
-
-   /* output 0 */
-   SQRADD(a[0],a[0]);
-   COMBA_STORE(b[0]);
-
-   /* output 1 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[1]); 
-   COMBA_STORE(b[1]);
-
-   /* output 2 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]); 
-   COMBA_STORE(b[2]);
-
-   /* output 3 */
-   CARRY_FORWARD;
-   SQRADD2(a[1], a[2]); 
-   COMBA_STORE(b[3]);
-
-   /* output 4 */
-   CARRY_FORWARD;
-   SQRADD(a[2], a[2]); 
-   COMBA_STORE(b[4]);
-   COMBA_STORE2(b[5]);
-   COMBA_FINI;
-
-   B->used = 6;
-   B->sign = FP_ZPOS;
-   XMEMCPY(B->dp, b, 6 * sizeof(fp_digit));
-   fp_clamp(B);
-
-   return FP_OKAY;
-}
-#endif
-
-

+ 0 - 377
lib/wolfssl/wolfcrypt/src/fp_sqr_comba_32.i

@@ -1,377 +0,0 @@
-/* fp_sqr_comba_32.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#ifdef TFM_SQR32
-int fp_sqr_comba32(fp_int *A, fp_int *B)
-{
-   fp_digit *a, c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0;
-#ifdef TFM_ISO
-   fp_word tt;
-#endif
-#ifndef WOLFSSL_SMALL_STACK
-   fp_digit b[64];
-#else
-   fp_digit *b;
-#endif
-
-#ifdef WOLFSSL_SMALL_STACK
-   b = (fp_digit*)XMALLOC(sizeof(fp_digit) * 64, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-   if (b == NULL)
-      return FP_MEM;
-#endif
-
-   a = A->dp;
-   COMBA_START; 
-
-   /* clear carries */
-   CLEAR_CARRY;
-
-   /* output 0 */
-   SQRADD(a[0],a[0]);
-   COMBA_STORE(b[0]);
-
-   /* output 1 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[1]); 
-   COMBA_STORE(b[1]);
-
-   /* output 2 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]); 
-   COMBA_STORE(b[2]);
-
-   /* output 3 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]); 
-   COMBA_STORE(b[3]);
-
-   /* output 4 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]); 
-   COMBA_STORE(b[4]);
-
-   /* output 5 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB; 
-   COMBA_STORE(b[5]);
-
-   /* output 6 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]); 
-   COMBA_STORE(b[6]);
-
-   /* output 7 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB; 
-   COMBA_STORE(b[7]);
-
-   /* output 8 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]); 
-   COMBA_STORE(b[8]);
-
-   /* output 9 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB; 
-   COMBA_STORE(b[9]);
-
-   /* output 10 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]); 
-   COMBA_STORE(b[10]);
-
-   /* output 11 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB; 
-   COMBA_STORE(b[11]);
-
-   /* output 12 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]); 
-   COMBA_STORE(b[12]);
-
-   /* output 13 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB; 
-   COMBA_STORE(b[13]);
-
-   /* output 14 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[14]); SQRADDAC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]); 
-   COMBA_STORE(b[14]);
-
-   /* output 15 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[15]); SQRADDAC(a[1], a[14]); SQRADDAC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB; 
-   COMBA_STORE(b[15]);
-
-   /* output 16 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[16]); SQRADDAC(a[1], a[15]); SQRADDAC(a[2], a[14]); SQRADDAC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]); 
-   COMBA_STORE(b[16]);
-
-   /* output 17 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[17]); SQRADDAC(a[1], a[16]); SQRADDAC(a[2], a[15]); SQRADDAC(a[3], a[14]); SQRADDAC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB; 
-   COMBA_STORE(b[17]);
-
-   /* output 18 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[18]); SQRADDAC(a[1], a[17]); SQRADDAC(a[2], a[16]); SQRADDAC(a[3], a[15]); SQRADDAC(a[4], a[14]); SQRADDAC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]); 
-   COMBA_STORE(b[18]);
-
-   /* output 19 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[19]); SQRADDAC(a[1], a[18]); SQRADDAC(a[2], a[17]); SQRADDAC(a[3], a[16]); SQRADDAC(a[4], a[15]); SQRADDAC(a[5], a[14]); SQRADDAC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB; 
-   COMBA_STORE(b[19]);
-
-   /* output 20 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[20]); SQRADDAC(a[1], a[19]); SQRADDAC(a[2], a[18]); SQRADDAC(a[3], a[17]); SQRADDAC(a[4], a[16]); SQRADDAC(a[5], a[15]); SQRADDAC(a[6], a[14]); SQRADDAC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]); 
-   COMBA_STORE(b[20]);
-
-   /* output 21 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[21]); SQRADDAC(a[1], a[20]); SQRADDAC(a[2], a[19]); SQRADDAC(a[3], a[18]); SQRADDAC(a[4], a[17]); SQRADDAC(a[5], a[16]); SQRADDAC(a[6], a[15]); SQRADDAC(a[7], a[14]); SQRADDAC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB; 
-   COMBA_STORE(b[21]);
-
-   /* output 22 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[22]); SQRADDAC(a[1], a[21]); SQRADDAC(a[2], a[20]); SQRADDAC(a[3], a[19]); SQRADDAC(a[4], a[18]); SQRADDAC(a[5], a[17]); SQRADDAC(a[6], a[16]); SQRADDAC(a[7], a[15]); SQRADDAC(a[8], a[14]); SQRADDAC(a[9], a[13]); SQRADDAC(a[10], a[12]); SQRADDDB; SQRADD(a[11], a[11]); 
-   COMBA_STORE(b[22]);
-
-   /* output 23 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[23]); SQRADDAC(a[1], a[22]); SQRADDAC(a[2], a[21]); SQRADDAC(a[3], a[20]); SQRADDAC(a[4], a[19]); SQRADDAC(a[5], a[18]); SQRADDAC(a[6], a[17]); SQRADDAC(a[7], a[16]); SQRADDAC(a[8], a[15]); SQRADDAC(a[9], a[14]); SQRADDAC(a[10], a[13]); SQRADDAC(a[11], a[12]); SQRADDDB; 
-   COMBA_STORE(b[23]);
-
-   /* output 24 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[24]); SQRADDAC(a[1], a[23]); SQRADDAC(a[2], a[22]); SQRADDAC(a[3], a[21]); SQRADDAC(a[4], a[20]); SQRADDAC(a[5], a[19]); SQRADDAC(a[6], a[18]); SQRADDAC(a[7], a[17]); SQRADDAC(a[8], a[16]); SQRADDAC(a[9], a[15]); SQRADDAC(a[10], a[14]); SQRADDAC(a[11], a[13]); SQRADDDB; SQRADD(a[12], a[12]); 
-   COMBA_STORE(b[24]);
-
-   /* output 25 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[25]); SQRADDAC(a[1], a[24]); SQRADDAC(a[2], a[23]); SQRADDAC(a[3], a[22]); SQRADDAC(a[4], a[21]); SQRADDAC(a[5], a[20]); SQRADDAC(a[6], a[19]); SQRADDAC(a[7], a[18]); SQRADDAC(a[8], a[17]); SQRADDAC(a[9], a[16]); SQRADDAC(a[10], a[15]); SQRADDAC(a[11], a[14]); SQRADDAC(a[12], a[13]); SQRADDDB; 
-   COMBA_STORE(b[25]);
-
-   /* output 26 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[26]); SQRADDAC(a[1], a[25]); SQRADDAC(a[2], a[24]); SQRADDAC(a[3], a[23]); SQRADDAC(a[4], a[22]); SQRADDAC(a[5], a[21]); SQRADDAC(a[6], a[20]); SQRADDAC(a[7], a[19]); SQRADDAC(a[8], a[18]); SQRADDAC(a[9], a[17]); SQRADDAC(a[10], a[16]); SQRADDAC(a[11], a[15]); SQRADDAC(a[12], a[14]); SQRADDDB; SQRADD(a[13], a[13]); 
-   COMBA_STORE(b[26]);
-
-   /* output 27 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[27]); SQRADDAC(a[1], a[26]); SQRADDAC(a[2], a[25]); SQRADDAC(a[3], a[24]); SQRADDAC(a[4], a[23]); SQRADDAC(a[5], a[22]); SQRADDAC(a[6], a[21]); SQRADDAC(a[7], a[20]); SQRADDAC(a[8], a[19]); SQRADDAC(a[9], a[18]); SQRADDAC(a[10], a[17]); SQRADDAC(a[11], a[16]); SQRADDAC(a[12], a[15]); SQRADDAC(a[13], a[14]); SQRADDDB; 
-   COMBA_STORE(b[27]);
-
-   /* output 28 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[28]); SQRADDAC(a[1], a[27]); SQRADDAC(a[2], a[26]); SQRADDAC(a[3], a[25]); SQRADDAC(a[4], a[24]); SQRADDAC(a[5], a[23]); SQRADDAC(a[6], a[22]); SQRADDAC(a[7], a[21]); SQRADDAC(a[8], a[20]); SQRADDAC(a[9], a[19]); SQRADDAC(a[10], a[18]); SQRADDAC(a[11], a[17]); SQRADDAC(a[12], a[16]); SQRADDAC(a[13], a[15]); SQRADDDB; SQRADD(a[14], a[14]); 
-   COMBA_STORE(b[28]);
-
-   /* output 29 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[29]); SQRADDAC(a[1], a[28]); SQRADDAC(a[2], a[27]); SQRADDAC(a[3], a[26]); SQRADDAC(a[4], a[25]); SQRADDAC(a[5], a[24]); SQRADDAC(a[6], a[23]); SQRADDAC(a[7], a[22]); SQRADDAC(a[8], a[21]); SQRADDAC(a[9], a[20]); SQRADDAC(a[10], a[19]); SQRADDAC(a[11], a[18]); SQRADDAC(a[12], a[17]); SQRADDAC(a[13], a[16]); SQRADDAC(a[14], a[15]); SQRADDDB; 
-   COMBA_STORE(b[29]);
-
-   /* output 30 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[30]); SQRADDAC(a[1], a[29]); SQRADDAC(a[2], a[28]); SQRADDAC(a[3], a[27]); SQRADDAC(a[4], a[26]); SQRADDAC(a[5], a[25]); SQRADDAC(a[6], a[24]); SQRADDAC(a[7], a[23]); SQRADDAC(a[8], a[22]); SQRADDAC(a[9], a[21]); SQRADDAC(a[10], a[20]); SQRADDAC(a[11], a[19]); SQRADDAC(a[12], a[18]); SQRADDAC(a[13], a[17]); SQRADDAC(a[14], a[16]); SQRADDDB; SQRADD(a[15], a[15]); 
-   COMBA_STORE(b[30]);
-
-   /* output 31 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[31]); SQRADDAC(a[1], a[30]); SQRADDAC(a[2], a[29]); SQRADDAC(a[3], a[28]); SQRADDAC(a[4], a[27]); SQRADDAC(a[5], a[26]); SQRADDAC(a[6], a[25]); SQRADDAC(a[7], a[24]); SQRADDAC(a[8], a[23]); SQRADDAC(a[9], a[22]); SQRADDAC(a[10], a[21]); SQRADDAC(a[11], a[20]); SQRADDAC(a[12], a[19]); SQRADDAC(a[13], a[18]); SQRADDAC(a[14], a[17]); SQRADDAC(a[15], a[16]); SQRADDDB; 
-   COMBA_STORE(b[31]);
-
-   /* output 32 */
-   CARRY_FORWARD;
-   SQRADDSC(a[1], a[31]); SQRADDAC(a[2], a[30]); SQRADDAC(a[3], a[29]); SQRADDAC(a[4], a[28]); SQRADDAC(a[5], a[27]); SQRADDAC(a[6], a[26]); SQRADDAC(a[7], a[25]); SQRADDAC(a[8], a[24]); SQRADDAC(a[9], a[23]); SQRADDAC(a[10], a[22]); SQRADDAC(a[11], a[21]); SQRADDAC(a[12], a[20]); SQRADDAC(a[13], a[19]); SQRADDAC(a[14], a[18]); SQRADDAC(a[15], a[17]); SQRADDDB; SQRADD(a[16], a[16]); 
-   COMBA_STORE(b[32]);
-
-   /* output 33 */
-   CARRY_FORWARD;
-   SQRADDSC(a[2], a[31]); SQRADDAC(a[3], a[30]); SQRADDAC(a[4], a[29]); SQRADDAC(a[5], a[28]); SQRADDAC(a[6], a[27]); SQRADDAC(a[7], a[26]); SQRADDAC(a[8], a[25]); SQRADDAC(a[9], a[24]); SQRADDAC(a[10], a[23]); SQRADDAC(a[11], a[22]); SQRADDAC(a[12], a[21]); SQRADDAC(a[13], a[20]); SQRADDAC(a[14], a[19]); SQRADDAC(a[15], a[18]); SQRADDAC(a[16], a[17]); SQRADDDB; 
-   COMBA_STORE(b[33]);
-
-   /* output 34 */
-   CARRY_FORWARD;
-   SQRADDSC(a[3], a[31]); SQRADDAC(a[4], a[30]); SQRADDAC(a[5], a[29]); SQRADDAC(a[6], a[28]); SQRADDAC(a[7], a[27]); SQRADDAC(a[8], a[26]); SQRADDAC(a[9], a[25]); SQRADDAC(a[10], a[24]); SQRADDAC(a[11], a[23]); SQRADDAC(a[12], a[22]); SQRADDAC(a[13], a[21]); SQRADDAC(a[14], a[20]); SQRADDAC(a[15], a[19]); SQRADDAC(a[16], a[18]); SQRADDDB; SQRADD(a[17], a[17]); 
-   COMBA_STORE(b[34]);
-
-   /* output 35 */
-   CARRY_FORWARD;
-   SQRADDSC(a[4], a[31]); SQRADDAC(a[5], a[30]); SQRADDAC(a[6], a[29]); SQRADDAC(a[7], a[28]); SQRADDAC(a[8], a[27]); SQRADDAC(a[9], a[26]); SQRADDAC(a[10], a[25]); SQRADDAC(a[11], a[24]); SQRADDAC(a[12], a[23]); SQRADDAC(a[13], a[22]); SQRADDAC(a[14], a[21]); SQRADDAC(a[15], a[20]); SQRADDAC(a[16], a[19]); SQRADDAC(a[17], a[18]); SQRADDDB; 
-   COMBA_STORE(b[35]);
-
-   /* output 36 */
-   CARRY_FORWARD;
-   SQRADDSC(a[5], a[31]); SQRADDAC(a[6], a[30]); SQRADDAC(a[7], a[29]); SQRADDAC(a[8], a[28]); SQRADDAC(a[9], a[27]); SQRADDAC(a[10], a[26]); SQRADDAC(a[11], a[25]); SQRADDAC(a[12], a[24]); SQRADDAC(a[13], a[23]); SQRADDAC(a[14], a[22]); SQRADDAC(a[15], a[21]); SQRADDAC(a[16], a[20]); SQRADDAC(a[17], a[19]); SQRADDDB; SQRADD(a[18], a[18]); 
-   COMBA_STORE(b[36]);
-
-   /* output 37 */
-   CARRY_FORWARD;
-   SQRADDSC(a[6], a[31]); SQRADDAC(a[7], a[30]); SQRADDAC(a[8], a[29]); SQRADDAC(a[9], a[28]); SQRADDAC(a[10], a[27]); SQRADDAC(a[11], a[26]); SQRADDAC(a[12], a[25]); SQRADDAC(a[13], a[24]); SQRADDAC(a[14], a[23]); SQRADDAC(a[15], a[22]); SQRADDAC(a[16], a[21]); SQRADDAC(a[17], a[20]); SQRADDAC(a[18], a[19]); SQRADDDB; 
-   COMBA_STORE(b[37]);
-
-   /* output 38 */
-   CARRY_FORWARD;
-   SQRADDSC(a[7], a[31]); SQRADDAC(a[8], a[30]); SQRADDAC(a[9], a[29]); SQRADDAC(a[10], a[28]); SQRADDAC(a[11], a[27]); SQRADDAC(a[12], a[26]); SQRADDAC(a[13], a[25]); SQRADDAC(a[14], a[24]); SQRADDAC(a[15], a[23]); SQRADDAC(a[16], a[22]); SQRADDAC(a[17], a[21]); SQRADDAC(a[18], a[20]); SQRADDDB; SQRADD(a[19], a[19]); 
-   COMBA_STORE(b[38]);
-
-   /* output 39 */
-   CARRY_FORWARD;
-   SQRADDSC(a[8], a[31]); SQRADDAC(a[9], a[30]); SQRADDAC(a[10], a[29]); SQRADDAC(a[11], a[28]); SQRADDAC(a[12], a[27]); SQRADDAC(a[13], a[26]); SQRADDAC(a[14], a[25]); SQRADDAC(a[15], a[24]); SQRADDAC(a[16], a[23]); SQRADDAC(a[17], a[22]); SQRADDAC(a[18], a[21]); SQRADDAC(a[19], a[20]); SQRADDDB; 
-   COMBA_STORE(b[39]);
-
-   /* output 40 */
-   CARRY_FORWARD;
-   SQRADDSC(a[9], a[31]); SQRADDAC(a[10], a[30]); SQRADDAC(a[11], a[29]); SQRADDAC(a[12], a[28]); SQRADDAC(a[13], a[27]); SQRADDAC(a[14], a[26]); SQRADDAC(a[15], a[25]); SQRADDAC(a[16], a[24]); SQRADDAC(a[17], a[23]); SQRADDAC(a[18], a[22]); SQRADDAC(a[19], a[21]); SQRADDDB; SQRADD(a[20], a[20]); 
-   COMBA_STORE(b[40]);
-
-   /* output 41 */
-   CARRY_FORWARD;
-   SQRADDSC(a[10], a[31]); SQRADDAC(a[11], a[30]); SQRADDAC(a[12], a[29]); SQRADDAC(a[13], a[28]); SQRADDAC(a[14], a[27]); SQRADDAC(a[15], a[26]); SQRADDAC(a[16], a[25]); SQRADDAC(a[17], a[24]); SQRADDAC(a[18], a[23]); SQRADDAC(a[19], a[22]); SQRADDAC(a[20], a[21]); SQRADDDB; 
-   COMBA_STORE(b[41]);
-
-   /* output 42 */
-   CARRY_FORWARD;
-   SQRADDSC(a[11], a[31]); SQRADDAC(a[12], a[30]); SQRADDAC(a[13], a[29]); SQRADDAC(a[14], a[28]); SQRADDAC(a[15], a[27]); SQRADDAC(a[16], a[26]); SQRADDAC(a[17], a[25]); SQRADDAC(a[18], a[24]); SQRADDAC(a[19], a[23]); SQRADDAC(a[20], a[22]); SQRADDDB; SQRADD(a[21], a[21]); 
-   COMBA_STORE(b[42]);
-
-   /* output 43 */
-   CARRY_FORWARD;
-   SQRADDSC(a[12], a[31]); SQRADDAC(a[13], a[30]); SQRADDAC(a[14], a[29]); SQRADDAC(a[15], a[28]); SQRADDAC(a[16], a[27]); SQRADDAC(a[17], a[26]); SQRADDAC(a[18], a[25]); SQRADDAC(a[19], a[24]); SQRADDAC(a[20], a[23]); SQRADDAC(a[21], a[22]); SQRADDDB; 
-   COMBA_STORE(b[43]);
-
-   /* output 44 */
-   CARRY_FORWARD;
-   SQRADDSC(a[13], a[31]); SQRADDAC(a[14], a[30]); SQRADDAC(a[15], a[29]); SQRADDAC(a[16], a[28]); SQRADDAC(a[17], a[27]); SQRADDAC(a[18], a[26]); SQRADDAC(a[19], a[25]); SQRADDAC(a[20], a[24]); SQRADDAC(a[21], a[23]); SQRADDDB; SQRADD(a[22], a[22]); 
-   COMBA_STORE(b[44]);
-
-   /* output 45 */
-   CARRY_FORWARD;
-   SQRADDSC(a[14], a[31]); SQRADDAC(a[15], a[30]); SQRADDAC(a[16], a[29]); SQRADDAC(a[17], a[28]); SQRADDAC(a[18], a[27]); SQRADDAC(a[19], a[26]); SQRADDAC(a[20], a[25]); SQRADDAC(a[21], a[24]); SQRADDAC(a[22], a[23]); SQRADDDB; 
-   COMBA_STORE(b[45]);
-
-   /* output 46 */
-   CARRY_FORWARD;
-   SQRADDSC(a[15], a[31]); SQRADDAC(a[16], a[30]); SQRADDAC(a[17], a[29]); SQRADDAC(a[18], a[28]); SQRADDAC(a[19], a[27]); SQRADDAC(a[20], a[26]); SQRADDAC(a[21], a[25]); SQRADDAC(a[22], a[24]); SQRADDDB; SQRADD(a[23], a[23]); 
-   COMBA_STORE(b[46]);
-
-   /* output 47 */
-   CARRY_FORWARD;
-   SQRADDSC(a[16], a[31]); SQRADDAC(a[17], a[30]); SQRADDAC(a[18], a[29]); SQRADDAC(a[19], a[28]); SQRADDAC(a[20], a[27]); SQRADDAC(a[21], a[26]); SQRADDAC(a[22], a[25]); SQRADDAC(a[23], a[24]); SQRADDDB; 
-   COMBA_STORE(b[47]);
-
-   /* output 48 */
-   CARRY_FORWARD;
-   SQRADDSC(a[17], a[31]); SQRADDAC(a[18], a[30]); SQRADDAC(a[19], a[29]); SQRADDAC(a[20], a[28]); SQRADDAC(a[21], a[27]); SQRADDAC(a[22], a[26]); SQRADDAC(a[23], a[25]); SQRADDDB; SQRADD(a[24], a[24]); 
-   COMBA_STORE(b[48]);
-
-   /* output 49 */
-   CARRY_FORWARD;
-   SQRADDSC(a[18], a[31]); SQRADDAC(a[19], a[30]); SQRADDAC(a[20], a[29]); SQRADDAC(a[21], a[28]); SQRADDAC(a[22], a[27]); SQRADDAC(a[23], a[26]); SQRADDAC(a[24], a[25]); SQRADDDB; 
-   COMBA_STORE(b[49]);
-
-   /* output 50 */
-   CARRY_FORWARD;
-   SQRADDSC(a[19], a[31]); SQRADDAC(a[20], a[30]); SQRADDAC(a[21], a[29]); SQRADDAC(a[22], a[28]); SQRADDAC(a[23], a[27]); SQRADDAC(a[24], a[26]); SQRADDDB; SQRADD(a[25], a[25]); 
-   COMBA_STORE(b[50]);
-
-   /* output 51 */
-   CARRY_FORWARD;
-   SQRADDSC(a[20], a[31]); SQRADDAC(a[21], a[30]); SQRADDAC(a[22], a[29]); SQRADDAC(a[23], a[28]); SQRADDAC(a[24], a[27]); SQRADDAC(a[25], a[26]); SQRADDDB; 
-   COMBA_STORE(b[51]);
-
-   /* output 52 */
-   CARRY_FORWARD;
-   SQRADDSC(a[21], a[31]); SQRADDAC(a[22], a[30]); SQRADDAC(a[23], a[29]); SQRADDAC(a[24], a[28]); SQRADDAC(a[25], a[27]); SQRADDDB; SQRADD(a[26], a[26]); 
-   COMBA_STORE(b[52]);
-
-   /* output 53 */
-   CARRY_FORWARD;
-   SQRADDSC(a[22], a[31]); SQRADDAC(a[23], a[30]); SQRADDAC(a[24], a[29]); SQRADDAC(a[25], a[28]); SQRADDAC(a[26], a[27]); SQRADDDB; 
-   COMBA_STORE(b[53]);
-
-   /* output 54 */
-   CARRY_FORWARD;
-   SQRADDSC(a[23], a[31]); SQRADDAC(a[24], a[30]); SQRADDAC(a[25], a[29]); SQRADDAC(a[26], a[28]); SQRADDDB; SQRADD(a[27], a[27]); 
-   COMBA_STORE(b[54]);
-
-   /* output 55 */
-   CARRY_FORWARD;
-   SQRADDSC(a[24], a[31]); SQRADDAC(a[25], a[30]); SQRADDAC(a[26], a[29]); SQRADDAC(a[27], a[28]); SQRADDDB; 
-   COMBA_STORE(b[55]);
-
-   /* output 56 */
-   CARRY_FORWARD;
-   SQRADDSC(a[25], a[31]); SQRADDAC(a[26], a[30]); SQRADDAC(a[27], a[29]); SQRADDDB; SQRADD(a[28], a[28]); 
-   COMBA_STORE(b[56]);
-
-   /* output 57 */
-   CARRY_FORWARD;
-   SQRADDSC(a[26], a[31]); SQRADDAC(a[27], a[30]); SQRADDAC(a[28], a[29]); SQRADDDB; 
-   COMBA_STORE(b[57]);
-
-   /* output 58 */
-   CARRY_FORWARD;
-   SQRADD2(a[27], a[31]); SQRADD2(a[28], a[30]); SQRADD(a[29], a[29]); 
-   COMBA_STORE(b[58]);
-
-   /* output 59 */
-   CARRY_FORWARD;
-   SQRADD2(a[28], a[31]); SQRADD2(a[29], a[30]); 
-   COMBA_STORE(b[59]);
-
-   /* output 60 */
-   CARRY_FORWARD;
-   SQRADD2(a[29], a[31]); SQRADD(a[30], a[30]); 
-   COMBA_STORE(b[60]);
-
-   /* output 61 */
-   CARRY_FORWARD;
-   SQRADD2(a[30], a[31]); 
-   COMBA_STORE(b[61]);
-
-   /* output 62 */
-   CARRY_FORWARD;
-   SQRADD(a[31], a[31]); 
-   COMBA_STORE(b[62]);
-   COMBA_STORE2(b[63]);
-   COMBA_FINI;
-
-   B->used = 64;
-   B->sign = FP_ZPOS;
-   XMEMCPY(B->dp, b, 64 * sizeof(fp_digit));
-   fp_clamp(B);
-
-#ifdef WOLFSSL_SMALL_STACK
-   XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-#endif
-   return FP_OKAY;
-}
-#endif
-
-

+ 0 - 97
lib/wolfssl/wolfcrypt/src/fp_sqr_comba_4.i

@@ -1,97 +0,0 @@
-/* fp_sqr_comba_4.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#ifdef TFM_SQR4
-int fp_sqr_comba4(fp_int *A, fp_int *B)
-{
-   fp_digit *a, c0, c1, c2;
-#ifdef TFM_ISO
-   fp_word tt;
-#endif
-#ifndef WOLFSSL_SMALL_STACK
-   fp_digit b[8];
-#else
-   fp_digit *b;
-#endif
-
-#ifdef WOLFSSL_SMALL_STACK
-   b = (fp_digit*)XMALLOC(sizeof(fp_digit) * 8, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-   if (b == NULL)
-      return FP_MEM;
-#endif
-
-   a = A->dp;
-   COMBA_START; 
-
-   /* clear carries */
-   CLEAR_CARRY;
-
-   /* output 0 */
-   SQRADD(a[0],a[0]);
-   COMBA_STORE(b[0]);
-
-   /* output 1 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[1]); 
-   COMBA_STORE(b[1]);
-
-   /* output 2 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]); 
-   COMBA_STORE(b[2]);
-
-   /* output 3 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]); 
-   COMBA_STORE(b[3]);
-
-   /* output 4 */
-   CARRY_FORWARD;
-   SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]); 
-   COMBA_STORE(b[4]);
-
-   /* output 5 */
-   CARRY_FORWARD;
-   SQRADD2(a[2], a[3]); 
-   COMBA_STORE(b[5]);
-
-   /* output 6 */
-   CARRY_FORWARD;
-   SQRADD(a[3], a[3]); 
-   COMBA_STORE(b[6]);
-   COMBA_STORE2(b[7]);
-   COMBA_FINI;
-
-   B->used = 8;
-   B->sign = FP_ZPOS;
-   XMEMCPY(B->dp, b, 8 * sizeof(fp_digit));
-   fp_clamp(B);
-
-#ifdef WOLFSSL_SMALL_STACK
-   XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-#endif
-   return FP_OKAY;
-}
-#endif
-
-

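(One structural difference among these deleted variants is worth noting: fp_sqr_comba3() is small enough that its six-digit scratch array always lives on the stack, whereas fp_sqr_comba4() and every larger variant move the scratch buffer to the heap when WOLFSSL_SMALL_STACK is defined, which adds the FP_MEM failure path and the trailing XFREE. A generic restatement of that pattern — illustrative only, with SMALL_STACK standing in for WOLFSSL_SMALL_STACK and libc calls for the XMALLOC/XMEMCPY/XFREE wrappers:

```c
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Scratch-buffer pattern shared by fp_sqr_comba4() and up;
 * 48 digits matches the 24-digit input case. */
int sqr_scratch_pattern(const uint32_t *a, uint32_t *out)
{
#ifndef SMALL_STACK
    uint32_t b[48];                        /* scratch on the stack   */
#else
    uint32_t *b = malloc(48 * sizeof(uint32_t));
    if (b == NULL)
        return -1;                         /* FP_MEM in the original */
#endif

    (void)a;                               /* the unrolled columns   */
    memset(b, 0, 48 * sizeof(uint32_t));   /* would fill b[0..47]    */

    memcpy(out, b, 48 * sizeof(uint32_t)); /* copy into B->dp        */

#ifdef SMALL_STACK
    free(b);
#endif
    return 0;                              /* FP_OKAY                */
}
```

Writing the full result into a separate buffer before the XMEMCPY also lets B alias A safely, since a[] is still being read while the columns are computed.)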
+ 0 - 537
lib/wolfssl/wolfcrypt/src/fp_sqr_comba_48.i

@@ -1,537 +0,0 @@
-/* fp_sqr_comba_48.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#ifdef TFM_SQR48
-int fp_sqr_comba48(fp_int *A, fp_int *B)
-{
-   fp_digit *a, c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0;
-#ifdef TFM_ISO
-   fp_word tt;
-#endif
-#ifndef WOLFSSL_SMALL_STACK
-   fp_digit b[96];
-#else
-   fp_digit *b;
-#endif
-
-#ifdef WOLFSSL_SMALL_STACK
-   b = (fp_digit*)XMALLOC(sizeof(fp_digit) * 96, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-   if (b == NULL)
-      return FP_MEM;
-#endif
-
-   a = A->dp;
-   COMBA_START; 
-
-   /* clear carries */
-   CLEAR_CARRY;
-
-   /* output 0 */
-   SQRADD(a[0],a[0]);
-   COMBA_STORE(b[0]);
-
-   /* output 1 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[1]); 
-   COMBA_STORE(b[1]);
-
-   /* output 2 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]); 
-   COMBA_STORE(b[2]);
-
-   /* output 3 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]); 
-   COMBA_STORE(b[3]);
-
-   /* output 4 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]); 
-   COMBA_STORE(b[4]);
-
-   /* output 5 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB; 
-   COMBA_STORE(b[5]);
-
-   /* output 6 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]); 
-   COMBA_STORE(b[6]);
-
-   /* output 7 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB; 
-   COMBA_STORE(b[7]);
-
-   /* output 8 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]); 
-   COMBA_STORE(b[8]);
-
-   /* output 9 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB; 
-   COMBA_STORE(b[9]);
-
-   /* output 10 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]); 
-   COMBA_STORE(b[10]);
-
-   /* output 11 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB; 
-   COMBA_STORE(b[11]);
-
-   /* output 12 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]); 
-   COMBA_STORE(b[12]);
-
-   /* output 13 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB; 
-   COMBA_STORE(b[13]);
-
-   /* output 14 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[14]); SQRADDAC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]); 
-   COMBA_STORE(b[14]);
-
-   /* output 15 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[15]); SQRADDAC(a[1], a[14]); SQRADDAC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB; 
-   COMBA_STORE(b[15]);
-
-   /* output 16 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[16]); SQRADDAC(a[1], a[15]); SQRADDAC(a[2], a[14]); SQRADDAC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]); 
-   COMBA_STORE(b[16]);
-
-   /* output 17 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[17]); SQRADDAC(a[1], a[16]); SQRADDAC(a[2], a[15]); SQRADDAC(a[3], a[14]); SQRADDAC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB; 
-   COMBA_STORE(b[17]);
-
-   /* output 18 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[18]); SQRADDAC(a[1], a[17]); SQRADDAC(a[2], a[16]); SQRADDAC(a[3], a[15]); SQRADDAC(a[4], a[14]); SQRADDAC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]); 
-   COMBA_STORE(b[18]);
-
-   /* output 19 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[19]); SQRADDAC(a[1], a[18]); SQRADDAC(a[2], a[17]); SQRADDAC(a[3], a[16]); SQRADDAC(a[4], a[15]); SQRADDAC(a[5], a[14]); SQRADDAC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB; 
-   COMBA_STORE(b[19]);
-
-   /* output 20 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[20]); SQRADDAC(a[1], a[19]); SQRADDAC(a[2], a[18]); SQRADDAC(a[3], a[17]); SQRADDAC(a[4], a[16]); SQRADDAC(a[5], a[15]); SQRADDAC(a[6], a[14]); SQRADDAC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]); 
-   COMBA_STORE(b[20]);
-
-   /* output 21 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[21]); SQRADDAC(a[1], a[20]); SQRADDAC(a[2], a[19]); SQRADDAC(a[3], a[18]); SQRADDAC(a[4], a[17]); SQRADDAC(a[5], a[16]); SQRADDAC(a[6], a[15]); SQRADDAC(a[7], a[14]); SQRADDAC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB; 
-   COMBA_STORE(b[21]);
-
-   /* output 22 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[22]); SQRADDAC(a[1], a[21]); SQRADDAC(a[2], a[20]); SQRADDAC(a[3], a[19]); SQRADDAC(a[4], a[18]); SQRADDAC(a[5], a[17]); SQRADDAC(a[6], a[16]); SQRADDAC(a[7], a[15]); SQRADDAC(a[8], a[14]); SQRADDAC(a[9], a[13]); SQRADDAC(a[10], a[12]); SQRADDDB; SQRADD(a[11], a[11]); 
-   COMBA_STORE(b[22]);
-
-   /* output 23 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[23]); SQRADDAC(a[1], a[22]); SQRADDAC(a[2], a[21]); SQRADDAC(a[3], a[20]); SQRADDAC(a[4], a[19]); SQRADDAC(a[5], a[18]); SQRADDAC(a[6], a[17]); SQRADDAC(a[7], a[16]); SQRADDAC(a[8], a[15]); SQRADDAC(a[9], a[14]); SQRADDAC(a[10], a[13]); SQRADDAC(a[11], a[12]); SQRADDDB; 
-   COMBA_STORE(b[23]);
-
-   /* output 24 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[24]); SQRADDAC(a[1], a[23]); SQRADDAC(a[2], a[22]); SQRADDAC(a[3], a[21]); SQRADDAC(a[4], a[20]); SQRADDAC(a[5], a[19]); SQRADDAC(a[6], a[18]); SQRADDAC(a[7], a[17]); SQRADDAC(a[8], a[16]); SQRADDAC(a[9], a[15]); SQRADDAC(a[10], a[14]); SQRADDAC(a[11], a[13]); SQRADDDB; SQRADD(a[12], a[12]); 
-   COMBA_STORE(b[24]);
-
-   /* output 25 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[25]); SQRADDAC(a[1], a[24]); SQRADDAC(a[2], a[23]); SQRADDAC(a[3], a[22]); SQRADDAC(a[4], a[21]); SQRADDAC(a[5], a[20]); SQRADDAC(a[6], a[19]); SQRADDAC(a[7], a[18]); SQRADDAC(a[8], a[17]); SQRADDAC(a[9], a[16]); SQRADDAC(a[10], a[15]); SQRADDAC(a[11], a[14]); SQRADDAC(a[12], a[13]); SQRADDDB; 
-   COMBA_STORE(b[25]);
-
-   /* output 26 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[26]); SQRADDAC(a[1], a[25]); SQRADDAC(a[2], a[24]); SQRADDAC(a[3], a[23]); SQRADDAC(a[4], a[22]); SQRADDAC(a[5], a[21]); SQRADDAC(a[6], a[20]); SQRADDAC(a[7], a[19]); SQRADDAC(a[8], a[18]); SQRADDAC(a[9], a[17]); SQRADDAC(a[10], a[16]); SQRADDAC(a[11], a[15]); SQRADDAC(a[12], a[14]); SQRADDDB; SQRADD(a[13], a[13]); 
-   COMBA_STORE(b[26]);
-
-   /* output 27 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[27]); SQRADDAC(a[1], a[26]); SQRADDAC(a[2], a[25]); SQRADDAC(a[3], a[24]); SQRADDAC(a[4], a[23]); SQRADDAC(a[5], a[22]); SQRADDAC(a[6], a[21]); SQRADDAC(a[7], a[20]); SQRADDAC(a[8], a[19]); SQRADDAC(a[9], a[18]); SQRADDAC(a[10], a[17]); SQRADDAC(a[11], a[16]); SQRADDAC(a[12], a[15]); SQRADDAC(a[13], a[14]); SQRADDDB; 
-   COMBA_STORE(b[27]);
-
-   /* output 28 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[28]); SQRADDAC(a[1], a[27]); SQRADDAC(a[2], a[26]); SQRADDAC(a[3], a[25]); SQRADDAC(a[4], a[24]); SQRADDAC(a[5], a[23]); SQRADDAC(a[6], a[22]); SQRADDAC(a[7], a[21]); SQRADDAC(a[8], a[20]); SQRADDAC(a[9], a[19]); SQRADDAC(a[10], a[18]); SQRADDAC(a[11], a[17]); SQRADDAC(a[12], a[16]); SQRADDAC(a[13], a[15]); SQRADDDB; SQRADD(a[14], a[14]); 
-   COMBA_STORE(b[28]);
-
-   /* output 29 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[29]); SQRADDAC(a[1], a[28]); SQRADDAC(a[2], a[27]); SQRADDAC(a[3], a[26]); SQRADDAC(a[4], a[25]); SQRADDAC(a[5], a[24]); SQRADDAC(a[6], a[23]); SQRADDAC(a[7], a[22]); SQRADDAC(a[8], a[21]); SQRADDAC(a[9], a[20]); SQRADDAC(a[10], a[19]); SQRADDAC(a[11], a[18]); SQRADDAC(a[12], a[17]); SQRADDAC(a[13], a[16]); SQRADDAC(a[14], a[15]); SQRADDDB; 
-   COMBA_STORE(b[29]);
-
-   /* output 30 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[30]); SQRADDAC(a[1], a[29]); SQRADDAC(a[2], a[28]); SQRADDAC(a[3], a[27]); SQRADDAC(a[4], a[26]); SQRADDAC(a[5], a[25]); SQRADDAC(a[6], a[24]); SQRADDAC(a[7], a[23]); SQRADDAC(a[8], a[22]); SQRADDAC(a[9], a[21]); SQRADDAC(a[10], a[20]); SQRADDAC(a[11], a[19]); SQRADDAC(a[12], a[18]); SQRADDAC(a[13], a[17]); SQRADDAC(a[14], a[16]); SQRADDDB; SQRADD(a[15], a[15]); 
-   COMBA_STORE(b[30]);
-
-   /* output 31 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[31]); SQRADDAC(a[1], a[30]); SQRADDAC(a[2], a[29]); SQRADDAC(a[3], a[28]); SQRADDAC(a[4], a[27]); SQRADDAC(a[5], a[26]); SQRADDAC(a[6], a[25]); SQRADDAC(a[7], a[24]); SQRADDAC(a[8], a[23]); SQRADDAC(a[9], a[22]); SQRADDAC(a[10], a[21]); SQRADDAC(a[11], a[20]); SQRADDAC(a[12], a[19]); SQRADDAC(a[13], a[18]); SQRADDAC(a[14], a[17]); SQRADDAC(a[15], a[16]); SQRADDDB; 
-   COMBA_STORE(b[31]);
-
-   /* output 32 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[32]); SQRADDAC(a[1], a[31]); SQRADDAC(a[2], a[30]); SQRADDAC(a[3], a[29]); SQRADDAC(a[4], a[28]); SQRADDAC(a[5], a[27]); SQRADDAC(a[6], a[26]); SQRADDAC(a[7], a[25]); SQRADDAC(a[8], a[24]); SQRADDAC(a[9], a[23]); SQRADDAC(a[10], a[22]); SQRADDAC(a[11], a[21]); SQRADDAC(a[12], a[20]); SQRADDAC(a[13], a[19]); SQRADDAC(a[14], a[18]); SQRADDAC(a[15], a[17]); SQRADDDB; SQRADD(a[16], a[16]); 
-   COMBA_STORE(b[32]);
-
-   /* output 33 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[33]); SQRADDAC(a[1], a[32]); SQRADDAC(a[2], a[31]); SQRADDAC(a[3], a[30]); SQRADDAC(a[4], a[29]); SQRADDAC(a[5], a[28]); SQRADDAC(a[6], a[27]); SQRADDAC(a[7], a[26]); SQRADDAC(a[8], a[25]); SQRADDAC(a[9], a[24]); SQRADDAC(a[10], a[23]); SQRADDAC(a[11], a[22]); SQRADDAC(a[12], a[21]); SQRADDAC(a[13], a[20]); SQRADDAC(a[14], a[19]); SQRADDAC(a[15], a[18]); SQRADDAC(a[16], a[17]); SQRADDDB; 
-   COMBA_STORE(b[33]);
-
-   /* output 34 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[34]); SQRADDAC(a[1], a[33]); SQRADDAC(a[2], a[32]); SQRADDAC(a[3], a[31]); SQRADDAC(a[4], a[30]); SQRADDAC(a[5], a[29]); SQRADDAC(a[6], a[28]); SQRADDAC(a[7], a[27]); SQRADDAC(a[8], a[26]); SQRADDAC(a[9], a[25]); SQRADDAC(a[10], a[24]); SQRADDAC(a[11], a[23]); SQRADDAC(a[12], a[22]); SQRADDAC(a[13], a[21]); SQRADDAC(a[14], a[20]); SQRADDAC(a[15], a[19]); SQRADDAC(a[16], a[18]); SQRADDDB; SQRADD(a[17], a[17]); 
-   COMBA_STORE(b[34]);
-
-   /* output 35 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[35]); SQRADDAC(a[1], a[34]); SQRADDAC(a[2], a[33]); SQRADDAC(a[3], a[32]); SQRADDAC(a[4], a[31]); SQRADDAC(a[5], a[30]); SQRADDAC(a[6], a[29]); SQRADDAC(a[7], a[28]); SQRADDAC(a[8], a[27]); SQRADDAC(a[9], a[26]); SQRADDAC(a[10], a[25]); SQRADDAC(a[11], a[24]); SQRADDAC(a[12], a[23]); SQRADDAC(a[13], a[22]); SQRADDAC(a[14], a[21]); SQRADDAC(a[15], a[20]); SQRADDAC(a[16], a[19]); SQRADDAC(a[17], a[18]); SQRADDDB; 
-   COMBA_STORE(b[35]);
-
-   /* output 36 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[36]); SQRADDAC(a[1], a[35]); SQRADDAC(a[2], a[34]); SQRADDAC(a[3], a[33]); SQRADDAC(a[4], a[32]); SQRADDAC(a[5], a[31]); SQRADDAC(a[6], a[30]); SQRADDAC(a[7], a[29]); SQRADDAC(a[8], a[28]); SQRADDAC(a[9], a[27]); SQRADDAC(a[10], a[26]); SQRADDAC(a[11], a[25]); SQRADDAC(a[12], a[24]); SQRADDAC(a[13], a[23]); SQRADDAC(a[14], a[22]); SQRADDAC(a[15], a[21]); SQRADDAC(a[16], a[20]); SQRADDAC(a[17], a[19]); SQRADDDB; SQRADD(a[18], a[18]); 
-   COMBA_STORE(b[36]);
-
-   /* output 37 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[37]); SQRADDAC(a[1], a[36]); SQRADDAC(a[2], a[35]); SQRADDAC(a[3], a[34]); SQRADDAC(a[4], a[33]); SQRADDAC(a[5], a[32]); SQRADDAC(a[6], a[31]); SQRADDAC(a[7], a[30]); SQRADDAC(a[8], a[29]); SQRADDAC(a[9], a[28]); SQRADDAC(a[10], a[27]); SQRADDAC(a[11], a[26]); SQRADDAC(a[12], a[25]); SQRADDAC(a[13], a[24]); SQRADDAC(a[14], a[23]); SQRADDAC(a[15], a[22]); SQRADDAC(a[16], a[21]); SQRADDAC(a[17], a[20]); SQRADDAC(a[18], a[19]); SQRADDDB; 
-   COMBA_STORE(b[37]);
-
-   /* output 38 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[38]); SQRADDAC(a[1], a[37]); SQRADDAC(a[2], a[36]); SQRADDAC(a[3], a[35]); SQRADDAC(a[4], a[34]); SQRADDAC(a[5], a[33]); SQRADDAC(a[6], a[32]); SQRADDAC(a[7], a[31]); SQRADDAC(a[8], a[30]); SQRADDAC(a[9], a[29]); SQRADDAC(a[10], a[28]); SQRADDAC(a[11], a[27]); SQRADDAC(a[12], a[26]); SQRADDAC(a[13], a[25]); SQRADDAC(a[14], a[24]); SQRADDAC(a[15], a[23]); SQRADDAC(a[16], a[22]); SQRADDAC(a[17], a[21]); SQRADDAC(a[18], a[20]); SQRADDDB; SQRADD(a[19], a[19]); 
-   COMBA_STORE(b[38]);
-
-   /* output 39 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[39]); SQRADDAC(a[1], a[38]); SQRADDAC(a[2], a[37]); SQRADDAC(a[3], a[36]); SQRADDAC(a[4], a[35]); SQRADDAC(a[5], a[34]); SQRADDAC(a[6], a[33]); SQRADDAC(a[7], a[32]); SQRADDAC(a[8], a[31]); SQRADDAC(a[9], a[30]); SQRADDAC(a[10], a[29]); SQRADDAC(a[11], a[28]); SQRADDAC(a[12], a[27]); SQRADDAC(a[13], a[26]); SQRADDAC(a[14], a[25]); SQRADDAC(a[15], a[24]); SQRADDAC(a[16], a[23]); SQRADDAC(a[17], a[22]); SQRADDAC(a[18], a[21]); SQRADDAC(a[19], a[20]); SQRADDDB; 
-   COMBA_STORE(b[39]);
-
-   /* output 40 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[40]); SQRADDAC(a[1], a[39]); SQRADDAC(a[2], a[38]); SQRADDAC(a[3], a[37]); SQRADDAC(a[4], a[36]); SQRADDAC(a[5], a[35]); SQRADDAC(a[6], a[34]); SQRADDAC(a[7], a[33]); SQRADDAC(a[8], a[32]); SQRADDAC(a[9], a[31]); SQRADDAC(a[10], a[30]); SQRADDAC(a[11], a[29]); SQRADDAC(a[12], a[28]); SQRADDAC(a[13], a[27]); SQRADDAC(a[14], a[26]); SQRADDAC(a[15], a[25]); SQRADDAC(a[16], a[24]); SQRADDAC(a[17], a[23]); SQRADDAC(a[18], a[22]); SQRADDAC(a[19], a[21]); SQRADDDB; SQRADD(a[20], a[20]); 
-   COMBA_STORE(b[40]);
-
-   /* output 41 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[41]); SQRADDAC(a[1], a[40]); SQRADDAC(a[2], a[39]); SQRADDAC(a[3], a[38]); SQRADDAC(a[4], a[37]); SQRADDAC(a[5], a[36]); SQRADDAC(a[6], a[35]); SQRADDAC(a[7], a[34]); SQRADDAC(a[8], a[33]); SQRADDAC(a[9], a[32]); SQRADDAC(a[10], a[31]); SQRADDAC(a[11], a[30]); SQRADDAC(a[12], a[29]); SQRADDAC(a[13], a[28]); SQRADDAC(a[14], a[27]); SQRADDAC(a[15], a[26]); SQRADDAC(a[16], a[25]); SQRADDAC(a[17], a[24]); SQRADDAC(a[18], a[23]); SQRADDAC(a[19], a[22]); SQRADDAC(a[20], a[21]); SQRADDDB; 
-   COMBA_STORE(b[41]);
-
-   /* output 42 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[42]); SQRADDAC(a[1], a[41]); SQRADDAC(a[2], a[40]); SQRADDAC(a[3], a[39]); SQRADDAC(a[4], a[38]); SQRADDAC(a[5], a[37]); SQRADDAC(a[6], a[36]); SQRADDAC(a[7], a[35]); SQRADDAC(a[8], a[34]); SQRADDAC(a[9], a[33]); SQRADDAC(a[10], a[32]); SQRADDAC(a[11], a[31]); SQRADDAC(a[12], a[30]); SQRADDAC(a[13], a[29]); SQRADDAC(a[14], a[28]); SQRADDAC(a[15], a[27]); SQRADDAC(a[16], a[26]); SQRADDAC(a[17], a[25]); SQRADDAC(a[18], a[24]); SQRADDAC(a[19], a[23]); SQRADDAC(a[20], a[22]); SQRADDDB; SQRADD(a[21], a[21]); 
-   COMBA_STORE(b[42]);
-
-   /* output 43 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[43]); SQRADDAC(a[1], a[42]); SQRADDAC(a[2], a[41]); SQRADDAC(a[3], a[40]); SQRADDAC(a[4], a[39]); SQRADDAC(a[5], a[38]); SQRADDAC(a[6], a[37]); SQRADDAC(a[7], a[36]); SQRADDAC(a[8], a[35]); SQRADDAC(a[9], a[34]); SQRADDAC(a[10], a[33]); SQRADDAC(a[11], a[32]); SQRADDAC(a[12], a[31]); SQRADDAC(a[13], a[30]); SQRADDAC(a[14], a[29]); SQRADDAC(a[15], a[28]); SQRADDAC(a[16], a[27]); SQRADDAC(a[17], a[26]); SQRADDAC(a[18], a[25]); SQRADDAC(a[19], a[24]); SQRADDAC(a[20], a[23]); SQRADDAC(a[21], a[22]); SQRADDDB; 
-   COMBA_STORE(b[43]);
-
-   /* output 44 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[44]); SQRADDAC(a[1], a[43]); SQRADDAC(a[2], a[42]); SQRADDAC(a[3], a[41]); SQRADDAC(a[4], a[40]); SQRADDAC(a[5], a[39]); SQRADDAC(a[6], a[38]); SQRADDAC(a[7], a[37]); SQRADDAC(a[8], a[36]); SQRADDAC(a[9], a[35]); SQRADDAC(a[10], a[34]); SQRADDAC(a[11], a[33]); SQRADDAC(a[12], a[32]); SQRADDAC(a[13], a[31]); SQRADDAC(a[14], a[30]); SQRADDAC(a[15], a[29]); SQRADDAC(a[16], a[28]); SQRADDAC(a[17], a[27]); SQRADDAC(a[18], a[26]); SQRADDAC(a[19], a[25]); SQRADDAC(a[20], a[24]); SQRADDAC(a[21], a[23]); SQRADDDB; SQRADD(a[22], a[22]); 
-   COMBA_STORE(b[44]);
-
-   /* output 45 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[45]); SQRADDAC(a[1], a[44]); SQRADDAC(a[2], a[43]); SQRADDAC(a[3], a[42]); SQRADDAC(a[4], a[41]); SQRADDAC(a[5], a[40]); SQRADDAC(a[6], a[39]); SQRADDAC(a[7], a[38]); SQRADDAC(a[8], a[37]); SQRADDAC(a[9], a[36]); SQRADDAC(a[10], a[35]); SQRADDAC(a[11], a[34]); SQRADDAC(a[12], a[33]); SQRADDAC(a[13], a[32]); SQRADDAC(a[14], a[31]); SQRADDAC(a[15], a[30]); SQRADDAC(a[16], a[29]); SQRADDAC(a[17], a[28]); SQRADDAC(a[18], a[27]); SQRADDAC(a[19], a[26]); SQRADDAC(a[20], a[25]); SQRADDAC(a[21], a[24]); SQRADDAC(a[22], a[23]); SQRADDDB; 
-   COMBA_STORE(b[45]);
-
-   /* output 46 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[46]); SQRADDAC(a[1], a[45]); SQRADDAC(a[2], a[44]); SQRADDAC(a[3], a[43]); SQRADDAC(a[4], a[42]); SQRADDAC(a[5], a[41]); SQRADDAC(a[6], a[40]); SQRADDAC(a[7], a[39]); SQRADDAC(a[8], a[38]); SQRADDAC(a[9], a[37]); SQRADDAC(a[10], a[36]); SQRADDAC(a[11], a[35]); SQRADDAC(a[12], a[34]); SQRADDAC(a[13], a[33]); SQRADDAC(a[14], a[32]); SQRADDAC(a[15], a[31]); SQRADDAC(a[16], a[30]); SQRADDAC(a[17], a[29]); SQRADDAC(a[18], a[28]); SQRADDAC(a[19], a[27]); SQRADDAC(a[20], a[26]); SQRADDAC(a[21], a[25]); SQRADDAC(a[22], a[24]); SQRADDDB; SQRADD(a[23], a[23]); 
-   COMBA_STORE(b[46]);
-
-   /* output 47 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[47]); SQRADDAC(a[1], a[46]); SQRADDAC(a[2], a[45]); SQRADDAC(a[3], a[44]); SQRADDAC(a[4], a[43]); SQRADDAC(a[5], a[42]); SQRADDAC(a[6], a[41]); SQRADDAC(a[7], a[40]); SQRADDAC(a[8], a[39]); SQRADDAC(a[9], a[38]); SQRADDAC(a[10], a[37]); SQRADDAC(a[11], a[36]); SQRADDAC(a[12], a[35]); SQRADDAC(a[13], a[34]); SQRADDAC(a[14], a[33]); SQRADDAC(a[15], a[32]); SQRADDAC(a[16], a[31]); SQRADDAC(a[17], a[30]); SQRADDAC(a[18], a[29]); SQRADDAC(a[19], a[28]); SQRADDAC(a[20], a[27]); SQRADDAC(a[21], a[26]); SQRADDAC(a[22], a[25]); SQRADDAC(a[23], a[24]); SQRADDDB; 
-   COMBA_STORE(b[47]);
-
-   /* output 48 */
-   CARRY_FORWARD;
-   SQRADDSC(a[1], a[47]); SQRADDAC(a[2], a[46]); SQRADDAC(a[3], a[45]); SQRADDAC(a[4], a[44]); SQRADDAC(a[5], a[43]); SQRADDAC(a[6], a[42]); SQRADDAC(a[7], a[41]); SQRADDAC(a[8], a[40]); SQRADDAC(a[9], a[39]); SQRADDAC(a[10], a[38]); SQRADDAC(a[11], a[37]); SQRADDAC(a[12], a[36]); SQRADDAC(a[13], a[35]); SQRADDAC(a[14], a[34]); SQRADDAC(a[15], a[33]); SQRADDAC(a[16], a[32]); SQRADDAC(a[17], a[31]); SQRADDAC(a[18], a[30]); SQRADDAC(a[19], a[29]); SQRADDAC(a[20], a[28]); SQRADDAC(a[21], a[27]); SQRADDAC(a[22], a[26]); SQRADDAC(a[23], a[25]); SQRADDDB; SQRADD(a[24], a[24]); 
-   COMBA_STORE(b[48]);
-
-   /* output 49 */
-   CARRY_FORWARD;
-   SQRADDSC(a[2], a[47]); SQRADDAC(a[3], a[46]); SQRADDAC(a[4], a[45]); SQRADDAC(a[5], a[44]); SQRADDAC(a[6], a[43]); SQRADDAC(a[7], a[42]); SQRADDAC(a[8], a[41]); SQRADDAC(a[9], a[40]); SQRADDAC(a[10], a[39]); SQRADDAC(a[11], a[38]); SQRADDAC(a[12], a[37]); SQRADDAC(a[13], a[36]); SQRADDAC(a[14], a[35]); SQRADDAC(a[15], a[34]); SQRADDAC(a[16], a[33]); SQRADDAC(a[17], a[32]); SQRADDAC(a[18], a[31]); SQRADDAC(a[19], a[30]); SQRADDAC(a[20], a[29]); SQRADDAC(a[21], a[28]); SQRADDAC(a[22], a[27]); SQRADDAC(a[23], a[26]); SQRADDAC(a[24], a[25]); SQRADDDB; 
-   COMBA_STORE(b[49]);
-
-   /* output 50 */
-   CARRY_FORWARD;
-   SQRADDSC(a[3], a[47]); SQRADDAC(a[4], a[46]); SQRADDAC(a[5], a[45]); SQRADDAC(a[6], a[44]); SQRADDAC(a[7], a[43]); SQRADDAC(a[8], a[42]); SQRADDAC(a[9], a[41]); SQRADDAC(a[10], a[40]); SQRADDAC(a[11], a[39]); SQRADDAC(a[12], a[38]); SQRADDAC(a[13], a[37]); SQRADDAC(a[14], a[36]); SQRADDAC(a[15], a[35]); SQRADDAC(a[16], a[34]); SQRADDAC(a[17], a[33]); SQRADDAC(a[18], a[32]); SQRADDAC(a[19], a[31]); SQRADDAC(a[20], a[30]); SQRADDAC(a[21], a[29]); SQRADDAC(a[22], a[28]); SQRADDAC(a[23], a[27]); SQRADDAC(a[24], a[26]); SQRADDDB; SQRADD(a[25], a[25]); 
-   COMBA_STORE(b[50]);
-
-   /* output 51 */
-   CARRY_FORWARD;
-   SQRADDSC(a[4], a[47]); SQRADDAC(a[5], a[46]); SQRADDAC(a[6], a[45]); SQRADDAC(a[7], a[44]); SQRADDAC(a[8], a[43]); SQRADDAC(a[9], a[42]); SQRADDAC(a[10], a[41]); SQRADDAC(a[11], a[40]); SQRADDAC(a[12], a[39]); SQRADDAC(a[13], a[38]); SQRADDAC(a[14], a[37]); SQRADDAC(a[15], a[36]); SQRADDAC(a[16], a[35]); SQRADDAC(a[17], a[34]); SQRADDAC(a[18], a[33]); SQRADDAC(a[19], a[32]); SQRADDAC(a[20], a[31]); SQRADDAC(a[21], a[30]); SQRADDAC(a[22], a[29]); SQRADDAC(a[23], a[28]); SQRADDAC(a[24], a[27]); SQRADDAC(a[25], a[26]); SQRADDDB; 
-   COMBA_STORE(b[51]);
-
-   /* output 52 */
-   CARRY_FORWARD;
-   SQRADDSC(a[5], a[47]); SQRADDAC(a[6], a[46]); SQRADDAC(a[7], a[45]); SQRADDAC(a[8], a[44]); SQRADDAC(a[9], a[43]); SQRADDAC(a[10], a[42]); SQRADDAC(a[11], a[41]); SQRADDAC(a[12], a[40]); SQRADDAC(a[13], a[39]); SQRADDAC(a[14], a[38]); SQRADDAC(a[15], a[37]); SQRADDAC(a[16], a[36]); SQRADDAC(a[17], a[35]); SQRADDAC(a[18], a[34]); SQRADDAC(a[19], a[33]); SQRADDAC(a[20], a[32]); SQRADDAC(a[21], a[31]); SQRADDAC(a[22], a[30]); SQRADDAC(a[23], a[29]); SQRADDAC(a[24], a[28]); SQRADDAC(a[25], a[27]); SQRADDDB; SQRADD(a[26], a[26]); 
-   COMBA_STORE(b[52]);
-
-   /* output 53 */
-   CARRY_FORWARD;
-   SQRADDSC(a[6], a[47]); SQRADDAC(a[7], a[46]); SQRADDAC(a[8], a[45]); SQRADDAC(a[9], a[44]); SQRADDAC(a[10], a[43]); SQRADDAC(a[11], a[42]); SQRADDAC(a[12], a[41]); SQRADDAC(a[13], a[40]); SQRADDAC(a[14], a[39]); SQRADDAC(a[15], a[38]); SQRADDAC(a[16], a[37]); SQRADDAC(a[17], a[36]); SQRADDAC(a[18], a[35]); SQRADDAC(a[19], a[34]); SQRADDAC(a[20], a[33]); SQRADDAC(a[21], a[32]); SQRADDAC(a[22], a[31]); SQRADDAC(a[23], a[30]); SQRADDAC(a[24], a[29]); SQRADDAC(a[25], a[28]); SQRADDAC(a[26], a[27]); SQRADDDB; 
-   COMBA_STORE(b[53]);
-
-   /* output 54 */
-   CARRY_FORWARD;
-   SQRADDSC(a[7], a[47]); SQRADDAC(a[8], a[46]); SQRADDAC(a[9], a[45]); SQRADDAC(a[10], a[44]); SQRADDAC(a[11], a[43]); SQRADDAC(a[12], a[42]); SQRADDAC(a[13], a[41]); SQRADDAC(a[14], a[40]); SQRADDAC(a[15], a[39]); SQRADDAC(a[16], a[38]); SQRADDAC(a[17], a[37]); SQRADDAC(a[18], a[36]); SQRADDAC(a[19], a[35]); SQRADDAC(a[20], a[34]); SQRADDAC(a[21], a[33]); SQRADDAC(a[22], a[32]); SQRADDAC(a[23], a[31]); SQRADDAC(a[24], a[30]); SQRADDAC(a[25], a[29]); SQRADDAC(a[26], a[28]); SQRADDDB; SQRADD(a[27], a[27]); 
-   COMBA_STORE(b[54]);
-
-   /* output 55 */
-   CARRY_FORWARD;
-   SQRADDSC(a[8], a[47]); SQRADDAC(a[9], a[46]); SQRADDAC(a[10], a[45]); SQRADDAC(a[11], a[44]); SQRADDAC(a[12], a[43]); SQRADDAC(a[13], a[42]); SQRADDAC(a[14], a[41]); SQRADDAC(a[15], a[40]); SQRADDAC(a[16], a[39]); SQRADDAC(a[17], a[38]); SQRADDAC(a[18], a[37]); SQRADDAC(a[19], a[36]); SQRADDAC(a[20], a[35]); SQRADDAC(a[21], a[34]); SQRADDAC(a[22], a[33]); SQRADDAC(a[23], a[32]); SQRADDAC(a[24], a[31]); SQRADDAC(a[25], a[30]); SQRADDAC(a[26], a[29]); SQRADDAC(a[27], a[28]); SQRADDDB; 
-   COMBA_STORE(b[55]);
-
-   /* output 56 */
-   CARRY_FORWARD;
-   SQRADDSC(a[9], a[47]); SQRADDAC(a[10], a[46]); SQRADDAC(a[11], a[45]); SQRADDAC(a[12], a[44]); SQRADDAC(a[13], a[43]); SQRADDAC(a[14], a[42]); SQRADDAC(a[15], a[41]); SQRADDAC(a[16], a[40]); SQRADDAC(a[17], a[39]); SQRADDAC(a[18], a[38]); SQRADDAC(a[19], a[37]); SQRADDAC(a[20], a[36]); SQRADDAC(a[21], a[35]); SQRADDAC(a[22], a[34]); SQRADDAC(a[23], a[33]); SQRADDAC(a[24], a[32]); SQRADDAC(a[25], a[31]); SQRADDAC(a[26], a[30]); SQRADDAC(a[27], a[29]); SQRADDDB; SQRADD(a[28], a[28]); 
-   COMBA_STORE(b[56]);
-
-   /* output 57 */
-   CARRY_FORWARD;
-   SQRADDSC(a[10], a[47]); SQRADDAC(a[11], a[46]); SQRADDAC(a[12], a[45]); SQRADDAC(a[13], a[44]); SQRADDAC(a[14], a[43]); SQRADDAC(a[15], a[42]); SQRADDAC(a[16], a[41]); SQRADDAC(a[17], a[40]); SQRADDAC(a[18], a[39]); SQRADDAC(a[19], a[38]); SQRADDAC(a[20], a[37]); SQRADDAC(a[21], a[36]); SQRADDAC(a[22], a[35]); SQRADDAC(a[23], a[34]); SQRADDAC(a[24], a[33]); SQRADDAC(a[25], a[32]); SQRADDAC(a[26], a[31]); SQRADDAC(a[27], a[30]); SQRADDAC(a[28], a[29]); SQRADDDB; 
-   COMBA_STORE(b[57]);
-
-   /* output 58 */
-   CARRY_FORWARD;
-   SQRADDSC(a[11], a[47]); SQRADDAC(a[12], a[46]); SQRADDAC(a[13], a[45]); SQRADDAC(a[14], a[44]); SQRADDAC(a[15], a[43]); SQRADDAC(a[16], a[42]); SQRADDAC(a[17], a[41]); SQRADDAC(a[18], a[40]); SQRADDAC(a[19], a[39]); SQRADDAC(a[20], a[38]); SQRADDAC(a[21], a[37]); SQRADDAC(a[22], a[36]); SQRADDAC(a[23], a[35]); SQRADDAC(a[24], a[34]); SQRADDAC(a[25], a[33]); SQRADDAC(a[26], a[32]); SQRADDAC(a[27], a[31]); SQRADDAC(a[28], a[30]); SQRADDDB; SQRADD(a[29], a[29]); 
-   COMBA_STORE(b[58]);
-
-   /* output 59 */
-   CARRY_FORWARD;
-   SQRADDSC(a[12], a[47]); SQRADDAC(a[13], a[46]); SQRADDAC(a[14], a[45]); SQRADDAC(a[15], a[44]); SQRADDAC(a[16], a[43]); SQRADDAC(a[17], a[42]); SQRADDAC(a[18], a[41]); SQRADDAC(a[19], a[40]); SQRADDAC(a[20], a[39]); SQRADDAC(a[21], a[38]); SQRADDAC(a[22], a[37]); SQRADDAC(a[23], a[36]); SQRADDAC(a[24], a[35]); SQRADDAC(a[25], a[34]); SQRADDAC(a[26], a[33]); SQRADDAC(a[27], a[32]); SQRADDAC(a[28], a[31]); SQRADDAC(a[29], a[30]); SQRADDDB; 
-   COMBA_STORE(b[59]);
-
-   /* output 60 */
-   CARRY_FORWARD;
-   SQRADDSC(a[13], a[47]); SQRADDAC(a[14], a[46]); SQRADDAC(a[15], a[45]); SQRADDAC(a[16], a[44]); SQRADDAC(a[17], a[43]); SQRADDAC(a[18], a[42]); SQRADDAC(a[19], a[41]); SQRADDAC(a[20], a[40]); SQRADDAC(a[21], a[39]); SQRADDAC(a[22], a[38]); SQRADDAC(a[23], a[37]); SQRADDAC(a[24], a[36]); SQRADDAC(a[25], a[35]); SQRADDAC(a[26], a[34]); SQRADDAC(a[27], a[33]); SQRADDAC(a[28], a[32]); SQRADDAC(a[29], a[31]); SQRADDDB; SQRADD(a[30], a[30]); 
-   COMBA_STORE(b[60]);
-
-   /* output 61 */
-   CARRY_FORWARD;
-   SQRADDSC(a[14], a[47]); SQRADDAC(a[15], a[46]); SQRADDAC(a[16], a[45]); SQRADDAC(a[17], a[44]); SQRADDAC(a[18], a[43]); SQRADDAC(a[19], a[42]); SQRADDAC(a[20], a[41]); SQRADDAC(a[21], a[40]); SQRADDAC(a[22], a[39]); SQRADDAC(a[23], a[38]); SQRADDAC(a[24], a[37]); SQRADDAC(a[25], a[36]); SQRADDAC(a[26], a[35]); SQRADDAC(a[27], a[34]); SQRADDAC(a[28], a[33]); SQRADDAC(a[29], a[32]); SQRADDAC(a[30], a[31]); SQRADDDB; 
-   COMBA_STORE(b[61]);
-
-   /* output 62 */
-   CARRY_FORWARD;
-   SQRADDSC(a[15], a[47]); SQRADDAC(a[16], a[46]); SQRADDAC(a[17], a[45]); SQRADDAC(a[18], a[44]); SQRADDAC(a[19], a[43]); SQRADDAC(a[20], a[42]); SQRADDAC(a[21], a[41]); SQRADDAC(a[22], a[40]); SQRADDAC(a[23], a[39]); SQRADDAC(a[24], a[38]); SQRADDAC(a[25], a[37]); SQRADDAC(a[26], a[36]); SQRADDAC(a[27], a[35]); SQRADDAC(a[28], a[34]); SQRADDAC(a[29], a[33]); SQRADDAC(a[30], a[32]); SQRADDDB; SQRADD(a[31], a[31]); 
-   COMBA_STORE(b[62]);
-
-   /* output 63 */
-   CARRY_FORWARD;
-   SQRADDSC(a[16], a[47]); SQRADDAC(a[17], a[46]); SQRADDAC(a[18], a[45]); SQRADDAC(a[19], a[44]); SQRADDAC(a[20], a[43]); SQRADDAC(a[21], a[42]); SQRADDAC(a[22], a[41]); SQRADDAC(a[23], a[40]); SQRADDAC(a[24], a[39]); SQRADDAC(a[25], a[38]); SQRADDAC(a[26], a[37]); SQRADDAC(a[27], a[36]); SQRADDAC(a[28], a[35]); SQRADDAC(a[29], a[34]); SQRADDAC(a[30], a[33]); SQRADDAC(a[31], a[32]); SQRADDDB; 
-   COMBA_STORE(b[63]);
-
-   /* output 64 */
-   CARRY_FORWARD;
-   SQRADDSC(a[17], a[47]); SQRADDAC(a[18], a[46]); SQRADDAC(a[19], a[45]); SQRADDAC(a[20], a[44]); SQRADDAC(a[21], a[43]); SQRADDAC(a[22], a[42]); SQRADDAC(a[23], a[41]); SQRADDAC(a[24], a[40]); SQRADDAC(a[25], a[39]); SQRADDAC(a[26], a[38]); SQRADDAC(a[27], a[37]); SQRADDAC(a[28], a[36]); SQRADDAC(a[29], a[35]); SQRADDAC(a[30], a[34]); SQRADDAC(a[31], a[33]); SQRADDDB; SQRADD(a[32], a[32]); 
-   COMBA_STORE(b[64]);
-
-   /* output 65 */
-   CARRY_FORWARD;
-   SQRADDSC(a[18], a[47]); SQRADDAC(a[19], a[46]); SQRADDAC(a[20], a[45]); SQRADDAC(a[21], a[44]); SQRADDAC(a[22], a[43]); SQRADDAC(a[23], a[42]); SQRADDAC(a[24], a[41]); SQRADDAC(a[25], a[40]); SQRADDAC(a[26], a[39]); SQRADDAC(a[27], a[38]); SQRADDAC(a[28], a[37]); SQRADDAC(a[29], a[36]); SQRADDAC(a[30], a[35]); SQRADDAC(a[31], a[34]); SQRADDAC(a[32], a[33]); SQRADDDB; 
-   COMBA_STORE(b[65]);
-
-   /* output 66 */
-   CARRY_FORWARD;
-   SQRADDSC(a[19], a[47]); SQRADDAC(a[20], a[46]); SQRADDAC(a[21], a[45]); SQRADDAC(a[22], a[44]); SQRADDAC(a[23], a[43]); SQRADDAC(a[24], a[42]); SQRADDAC(a[25], a[41]); SQRADDAC(a[26], a[40]); SQRADDAC(a[27], a[39]); SQRADDAC(a[28], a[38]); SQRADDAC(a[29], a[37]); SQRADDAC(a[30], a[36]); SQRADDAC(a[31], a[35]); SQRADDAC(a[32], a[34]); SQRADDDB; SQRADD(a[33], a[33]); 
-   COMBA_STORE(b[66]);
-
-   /* output 67 */
-   CARRY_FORWARD;
-   SQRADDSC(a[20], a[47]); SQRADDAC(a[21], a[46]); SQRADDAC(a[22], a[45]); SQRADDAC(a[23], a[44]); SQRADDAC(a[24], a[43]); SQRADDAC(a[25], a[42]); SQRADDAC(a[26], a[41]); SQRADDAC(a[27], a[40]); SQRADDAC(a[28], a[39]); SQRADDAC(a[29], a[38]); SQRADDAC(a[30], a[37]); SQRADDAC(a[31], a[36]); SQRADDAC(a[32], a[35]); SQRADDAC(a[33], a[34]); SQRADDDB; 
-   COMBA_STORE(b[67]);
-
-   /* output 68 */
-   CARRY_FORWARD;
-   SQRADDSC(a[21], a[47]); SQRADDAC(a[22], a[46]); SQRADDAC(a[23], a[45]); SQRADDAC(a[24], a[44]); SQRADDAC(a[25], a[43]); SQRADDAC(a[26], a[42]); SQRADDAC(a[27], a[41]); SQRADDAC(a[28], a[40]); SQRADDAC(a[29], a[39]); SQRADDAC(a[30], a[38]); SQRADDAC(a[31], a[37]); SQRADDAC(a[32], a[36]); SQRADDAC(a[33], a[35]); SQRADDDB; SQRADD(a[34], a[34]); 
-   COMBA_STORE(b[68]);
-
-   /* output 69 */
-   CARRY_FORWARD;
-   SQRADDSC(a[22], a[47]); SQRADDAC(a[23], a[46]); SQRADDAC(a[24], a[45]); SQRADDAC(a[25], a[44]); SQRADDAC(a[26], a[43]); SQRADDAC(a[27], a[42]); SQRADDAC(a[28], a[41]); SQRADDAC(a[29], a[40]); SQRADDAC(a[30], a[39]); SQRADDAC(a[31], a[38]); SQRADDAC(a[32], a[37]); SQRADDAC(a[33], a[36]); SQRADDAC(a[34], a[35]); SQRADDDB; 
-   COMBA_STORE(b[69]);
-
-   /* output 70 */
-   CARRY_FORWARD;
-   SQRADDSC(a[23], a[47]); SQRADDAC(a[24], a[46]); SQRADDAC(a[25], a[45]); SQRADDAC(a[26], a[44]); SQRADDAC(a[27], a[43]); SQRADDAC(a[28], a[42]); SQRADDAC(a[29], a[41]); SQRADDAC(a[30], a[40]); SQRADDAC(a[31], a[39]); SQRADDAC(a[32], a[38]); SQRADDAC(a[33], a[37]); SQRADDAC(a[34], a[36]); SQRADDDB; SQRADD(a[35], a[35]); 
-   COMBA_STORE(b[70]);
-
-   /* output 71 */
-   CARRY_FORWARD;
-   SQRADDSC(a[24], a[47]); SQRADDAC(a[25], a[46]); SQRADDAC(a[26], a[45]); SQRADDAC(a[27], a[44]); SQRADDAC(a[28], a[43]); SQRADDAC(a[29], a[42]); SQRADDAC(a[30], a[41]); SQRADDAC(a[31], a[40]); SQRADDAC(a[32], a[39]); SQRADDAC(a[33], a[38]); SQRADDAC(a[34], a[37]); SQRADDAC(a[35], a[36]); SQRADDDB; 
-   COMBA_STORE(b[71]);
-
-   /* output 72 */
-   CARRY_FORWARD;
-   SQRADDSC(a[25], a[47]); SQRADDAC(a[26], a[46]); SQRADDAC(a[27], a[45]); SQRADDAC(a[28], a[44]); SQRADDAC(a[29], a[43]); SQRADDAC(a[30], a[42]); SQRADDAC(a[31], a[41]); SQRADDAC(a[32], a[40]); SQRADDAC(a[33], a[39]); SQRADDAC(a[34], a[38]); SQRADDAC(a[35], a[37]); SQRADDDB; SQRADD(a[36], a[36]); 
-   COMBA_STORE(b[72]);
-
-   /* output 73 */
-   CARRY_FORWARD;
-   SQRADDSC(a[26], a[47]); SQRADDAC(a[27], a[46]); SQRADDAC(a[28], a[45]); SQRADDAC(a[29], a[44]); SQRADDAC(a[30], a[43]); SQRADDAC(a[31], a[42]); SQRADDAC(a[32], a[41]); SQRADDAC(a[33], a[40]); SQRADDAC(a[34], a[39]); SQRADDAC(a[35], a[38]); SQRADDAC(a[36], a[37]); SQRADDDB; 
-   COMBA_STORE(b[73]);
-
-   /* output 74 */
-   CARRY_FORWARD;
-   SQRADDSC(a[27], a[47]); SQRADDAC(a[28], a[46]); SQRADDAC(a[29], a[45]); SQRADDAC(a[30], a[44]); SQRADDAC(a[31], a[43]); SQRADDAC(a[32], a[42]); SQRADDAC(a[33], a[41]); SQRADDAC(a[34], a[40]); SQRADDAC(a[35], a[39]); SQRADDAC(a[36], a[38]); SQRADDDB; SQRADD(a[37], a[37]); 
-   COMBA_STORE(b[74]);
-
-   /* output 75 */
-   CARRY_FORWARD;
-   SQRADDSC(a[28], a[47]); SQRADDAC(a[29], a[46]); SQRADDAC(a[30], a[45]); SQRADDAC(a[31], a[44]); SQRADDAC(a[32], a[43]); SQRADDAC(a[33], a[42]); SQRADDAC(a[34], a[41]); SQRADDAC(a[35], a[40]); SQRADDAC(a[36], a[39]); SQRADDAC(a[37], a[38]); SQRADDDB; 
-   COMBA_STORE(b[75]);
-
-   /* output 76 */
-   CARRY_FORWARD;
-   SQRADDSC(a[29], a[47]); SQRADDAC(a[30], a[46]); SQRADDAC(a[31], a[45]); SQRADDAC(a[32], a[44]); SQRADDAC(a[33], a[43]); SQRADDAC(a[34], a[42]); SQRADDAC(a[35], a[41]); SQRADDAC(a[36], a[40]); SQRADDAC(a[37], a[39]); SQRADDDB; SQRADD(a[38], a[38]); 
-   COMBA_STORE(b[76]);
-
-   /* output 77 */
-   CARRY_FORWARD;
-   SQRADDSC(a[30], a[47]); SQRADDAC(a[31], a[46]); SQRADDAC(a[32], a[45]); SQRADDAC(a[33], a[44]); SQRADDAC(a[34], a[43]); SQRADDAC(a[35], a[42]); SQRADDAC(a[36], a[41]); SQRADDAC(a[37], a[40]); SQRADDAC(a[38], a[39]); SQRADDDB; 
-   COMBA_STORE(b[77]);
-
-   /* output 78 */
-   CARRY_FORWARD;
-   SQRADDSC(a[31], a[47]); SQRADDAC(a[32], a[46]); SQRADDAC(a[33], a[45]); SQRADDAC(a[34], a[44]); SQRADDAC(a[35], a[43]); SQRADDAC(a[36], a[42]); SQRADDAC(a[37], a[41]); SQRADDAC(a[38], a[40]); SQRADDDB; SQRADD(a[39], a[39]); 
-   COMBA_STORE(b[78]);
-
-   /* output 79 */
-   CARRY_FORWARD;
-   SQRADDSC(a[32], a[47]); SQRADDAC(a[33], a[46]); SQRADDAC(a[34], a[45]); SQRADDAC(a[35], a[44]); SQRADDAC(a[36], a[43]); SQRADDAC(a[37], a[42]); SQRADDAC(a[38], a[41]); SQRADDAC(a[39], a[40]); SQRADDDB; 
-   COMBA_STORE(b[79]);
-
-   /* output 80 */
-   CARRY_FORWARD;
-   SQRADDSC(a[33], a[47]); SQRADDAC(a[34], a[46]); SQRADDAC(a[35], a[45]); SQRADDAC(a[36], a[44]); SQRADDAC(a[37], a[43]); SQRADDAC(a[38], a[42]); SQRADDAC(a[39], a[41]); SQRADDDB; SQRADD(a[40], a[40]); 
-   COMBA_STORE(b[80]);
-
-   /* output 81 */
-   CARRY_FORWARD;
-   SQRADDSC(a[34], a[47]); SQRADDAC(a[35], a[46]); SQRADDAC(a[36], a[45]); SQRADDAC(a[37], a[44]); SQRADDAC(a[38], a[43]); SQRADDAC(a[39], a[42]); SQRADDAC(a[40], a[41]); SQRADDDB; 
-   COMBA_STORE(b[81]);
-
-   /* output 82 */
-   CARRY_FORWARD;
-   SQRADDSC(a[35], a[47]); SQRADDAC(a[36], a[46]); SQRADDAC(a[37], a[45]); SQRADDAC(a[38], a[44]); SQRADDAC(a[39], a[43]); SQRADDAC(a[40], a[42]); SQRADDDB; SQRADD(a[41], a[41]); 
-   COMBA_STORE(b[82]);
-
-   /* output 83 */
-   CARRY_FORWARD;
-   SQRADDSC(a[36], a[47]); SQRADDAC(a[37], a[46]); SQRADDAC(a[38], a[45]); SQRADDAC(a[39], a[44]); SQRADDAC(a[40], a[43]); SQRADDAC(a[41], a[42]); SQRADDDB; 
-   COMBA_STORE(b[83]);
-
-   /* output 84 */
-   CARRY_FORWARD;
-   SQRADDSC(a[37], a[47]); SQRADDAC(a[38], a[46]); SQRADDAC(a[39], a[45]); SQRADDAC(a[40], a[44]); SQRADDAC(a[41], a[43]); SQRADDDB; SQRADD(a[42], a[42]); 
-   COMBA_STORE(b[84]);
-
-   /* output 85 */
-   CARRY_FORWARD;
-   SQRADDSC(a[38], a[47]); SQRADDAC(a[39], a[46]); SQRADDAC(a[40], a[45]); SQRADDAC(a[41], a[44]); SQRADDAC(a[42], a[43]); SQRADDDB; 
-   COMBA_STORE(b[85]);
-
-   /* output 86 */
-   CARRY_FORWARD;
-   SQRADDSC(a[39], a[47]); SQRADDAC(a[40], a[46]); SQRADDAC(a[41], a[45]); SQRADDAC(a[42], a[44]); SQRADDDB; SQRADD(a[43], a[43]); 
-   COMBA_STORE(b[86]);
-
-   /* output 87 */
-   CARRY_FORWARD;
-   SQRADDSC(a[40], a[47]); SQRADDAC(a[41], a[46]); SQRADDAC(a[42], a[45]); SQRADDAC(a[43], a[44]); SQRADDDB; 
-   COMBA_STORE(b[87]);
-
-   /* output 88 */
-   CARRY_FORWARD;
-   SQRADDSC(a[41], a[47]); SQRADDAC(a[42], a[46]); SQRADDAC(a[43], a[45]); SQRADDDB; SQRADD(a[44], a[44]); 
-   COMBA_STORE(b[88]);
-
-   /* output 89 */
-   CARRY_FORWARD;
-   SQRADDSC(a[42], a[47]); SQRADDAC(a[43], a[46]); SQRADDAC(a[44], a[45]); SQRADDDB; 
-   COMBA_STORE(b[89]);
-
-   /* output 90 */
-   CARRY_FORWARD;
-   SQRADD2(a[43], a[47]); SQRADD2(a[44], a[46]); SQRADD(a[45], a[45]); 
-   COMBA_STORE(b[90]);
-
-   /* output 91 */
-   CARRY_FORWARD;
-   SQRADD2(a[44], a[47]); SQRADD2(a[45], a[46]); 
-   COMBA_STORE(b[91]);
-
-   /* output 92 */
-   CARRY_FORWARD;
-   SQRADD2(a[45], a[47]); SQRADD(a[46], a[46]); 
-   COMBA_STORE(b[92]);
-
-   /* output 93 */
-   CARRY_FORWARD;
-   SQRADD2(a[46], a[47]); 
-   COMBA_STORE(b[93]);
-
-   /* output 94 */
-   CARRY_FORWARD;
-   SQRADD(a[47], a[47]); 
-   COMBA_STORE(b[94]);
-   COMBA_STORE2(b[95]);
-   COMBA_FINI;
-
-   B->used = 96;
-   B->sign = FP_ZPOS;
-   XMEMCPY(B->dp, b, 96 * sizeof(fp_digit));
-   fp_clamp(B);
-
-#ifdef WOLFSSL_SMALL_STACK
-   XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-#endif
-   return FP_OKAY;
-}
-#endif
-
-

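The fp_sqr_comba_N.i files removed in this commit are machine-generated, fully unrolled Comba squarings: output column k collects every product a[i]*a[j] with i + j == k, counts each cross product twice and the diagonal square once, then forwards the overflow into column k+1. As a rough sketch of that schedule (illustrative only, not wolfSSL code; it assumes a deliberately small 16-bit digit so one uint64_t can stand in for the c2:c1:c0 carry triple):

#include <stdint.h>
#include <stdio.h>

#define N 6                 /* digit count; each deleted .i file hard-codes one N */
typedef uint16_t digit;     /* small digit keeps every column inside 64 bits */

/* Square a (little-endian, N digits) into b (2*N digits), one column at a time. */
static void comba_sqr(const digit a[N], digit b[2 * N])
{
    uint64_t acc = 0;                         /* column sum plus carry-in */

    for (int k = 0; k < 2 * N; k++) {         /* leftover bits in acc play CARRY_FORWARD */
        int lo = (k < N) ? 0 : k - N + 1;     /* lowest i keeping both indexes in range */
        for (int i = lo; i <= k - i; i++) {
            uint64_t p = (uint64_t)a[i] * a[k - i];
            acc += (i == k - i) ? p : 2 * p;  /* SQRADD vs SQRADD2/SQRADDDB */
        }
        b[k] = (digit)acc;                    /* COMBA_STORE */
        acc >>= 16;                           /* carry into the next column */
    }
}

int main(void)
{
    digit a[N] = { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF };
    digit b[2 * N];
    comba_sqr(a, b);                          /* (2^96 - 1)^2 */
    for (int k = 2 * N; k-- > 0; )
        printf("%04x", (unsigned)b[k]);       /* prints fff...fe000...001 */
    putchar('\n');
    return 0;
}

The generated files trade this pair of loops for straight-line code, which is what lets the compiler keep the accumulator digits in registers; it is also why the 64-digit instance further below runs to roughly 700 deleted lines.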
+ 0 - 117
lib/wolfssl/wolfcrypt/src/fp_sqr_comba_6.i

@@ -1,117 +0,0 @@
-/* fp_sqr_comba_6.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#ifdef TFM_SQR6
-int fp_sqr_comba6(fp_int *A, fp_int *B)
-{
-   fp_digit *a, c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0;
-#ifdef TFM_ISO
-   fp_word tt;
-#endif
-#ifndef WOLFSSL_SMALL_STACK
-   fp_digit b[12];
-#else
-   fp_digit *b;
-#endif
-
-#ifdef WOLFSSL_SMALL_STACK
-   b = (fp_digit*)XMALLOC(sizeof(fp_digit) * 12, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-   if (b == NULL)
-      return FP_MEM;
-#endif
-
-   a = A->dp;
-   COMBA_START; 
-
-   /* clear carries */
-   CLEAR_CARRY;
-
-   /* output 0 */
-   SQRADD(a[0],a[0]);
-   COMBA_STORE(b[0]);
-
-   /* output 1 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[1]); 
-   COMBA_STORE(b[1]);
-
-   /* output 2 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]); 
-   COMBA_STORE(b[2]);
-
-   /* output 3 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]); 
-   COMBA_STORE(b[3]);
-
-   /* output 4 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]); 
-   COMBA_STORE(b[4]);
-
-   /* output 5 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB; 
-   COMBA_STORE(b[5]);
-
-   /* output 6 */
-   CARRY_FORWARD;
-   SQRADD2(a[1], a[5]); SQRADD2(a[2], a[4]); SQRADD(a[3], a[3]); 
-   COMBA_STORE(b[6]);
-
-   /* output 7 */
-   CARRY_FORWARD;
-   SQRADD2(a[2], a[5]); SQRADD2(a[3], a[4]); 
-   COMBA_STORE(b[7]);
-
-   /* output 8 */
-   CARRY_FORWARD;
-   SQRADD2(a[3], a[5]); SQRADD(a[4], a[4]); 
-   COMBA_STORE(b[8]);
-
-   /* output 9 */
-   CARRY_FORWARD;
-   SQRADD2(a[4], a[5]); 
-   COMBA_STORE(b[9]);
-
-   /* output 10 */
-   CARRY_FORWARD;
-   SQRADD(a[5], a[5]); 
-   COMBA_STORE(b[10]);
-   COMBA_STORE2(b[11]);
-   COMBA_FINI;
-
-   B->used = 12;
-   B->sign = FP_ZPOS;
-   XMEMCPY(B->dp, b, 12 * sizeof(fp_digit));
-   fp_clamp(B);
-
-#ifdef WOLFSSL_SMALL_STACK
-   XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-#endif
-   return FP_OKAY;
-}
-#endif
-
-

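Two update schemes alternate in the deleted bodies above and below: short columns use SQRADD (a product added once, always the diagonal a[i]^2 here) and SQRADD2 (a cross product added twice), while long columns fold all cross products into a side accumulator via SQRADDSC/SQRADDAC and then double that sum once with SQRADDDB, so the doubling and its carries happen once per column instead of once per product. The stand-ins below model only the arithmetic meaning of those macros, reconstructed from how the generated code uses them; the real fp_digit/fp_word definitions spread the state across the c2:c1:c0 and sc2:sc1:sc0 digit triples, often in inline assembly, and COMBA_STORE takes an lvalue rather than a pointer because it is a macro.

#include <stdint.h>
#include <stdio.h>

typedef uint16_t digit;   /* stand-in for fp_digit (assumed 16-bit for readability) */

static uint64_t col;      /* models the c2:c1:c0 column accumulator  */
static uint64_t sub;      /* models the sc2:sc1:sc0 side accumulator */

static void SQRADD(digit x, digit y)   { col += (uint64_t)x * y;     } /* x*y once       */
static void SQRADD2(digit x, digit y)  { col += 2 * (uint64_t)x * y; } /* 2*x*y at once  */
static void SQRADDSC(digit x, digit y) { sub  = (uint64_t)x * y;     } /* start side sum */
static void SQRADDAC(digit x, digit y) { sub += (uint64_t)x * y;     } /* add to it      */
static void SQRADDDB(void)             { col += 2 * sub;             } /* double it in   */
static void CARRY_FORWARD(void)        { col >>= 16;                 } /* c0=c1; c1=c2   */
static void COMBA_STORE(digit *d)      { *d = (digit)col;            } /* b[k] = c0      */

int main(void)
{
    /* Replay the deleted "output 4" and "output 5" blocks of fp_sqr_comba_6.i
     * in isolation, i.e. without the carries from columns 0..3. */
    digit a[6] = { 1, 2, 3, 4, 5, 6 };
    digit b4, b5;

    SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]);
    COMBA_STORE(&b4);                          /* 2*1*5 + 2*2*4 + 3*3 = 35 */

    CARRY_FORWARD();
    SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB();
    COMBA_STORE(&b5);                          /* 2*(1*6 + 2*5 + 3*4) = 56 */

    printf("col4=%u col5=%u\n", (unsigned)b4, (unsigned)b5);
    return 0;
}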
+ 0 - 697
lib/wolfssl/wolfcrypt/src/fp_sqr_comba_64.i

@@ -1,697 +0,0 @@
-/* fp_sqr_comba_64.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#ifdef TFM_SQR64
-int fp_sqr_comba64(fp_int *A, fp_int *B)
-{
-   fp_digit *a, c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0;
-#ifdef TFM_ISO
-   fp_word tt;
-#endif
-#ifndef WOLFSSL_SMALL_STACK
-   fp_digit b[128];
-#else
-   fp_digit *b;
-#endif
-
-#ifdef WOLFSSL_SMALL_STACK
-   b = (fp_digit*)XMALLOC(sizeof(fp_digit) * 128, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-   if (b == NULL)
-      return FP_MEM;
-#endif
-
-   a = A->dp;
-   COMBA_START; 
-
-   /* clear carries */
-   CLEAR_CARRY;
-
-   /* output 0 */
-   SQRADD(a[0],a[0]);
-   COMBA_STORE(b[0]);
-
-   /* output 1 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[1]); 
-   COMBA_STORE(b[1]);
-
-   /* output 2 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]); 
-   COMBA_STORE(b[2]);
-
-   /* output 3 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]); 
-   COMBA_STORE(b[3]);
-
-   /* output 4 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]); 
-   COMBA_STORE(b[4]);
-
-   /* output 5 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB; 
-   COMBA_STORE(b[5]);
-
-   /* output 6 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]); 
-   COMBA_STORE(b[6]);
-
-   /* output 7 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB; 
-   COMBA_STORE(b[7]);
-
-   /* output 8 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]); 
-   COMBA_STORE(b[8]);
-
-   /* output 9 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB; 
-   COMBA_STORE(b[9]);
-
-   /* output 10 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]); 
-   COMBA_STORE(b[10]);
-
-   /* output 11 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB; 
-   COMBA_STORE(b[11]);
-
-   /* output 12 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]); 
-   COMBA_STORE(b[12]);
-
-   /* output 13 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB; 
-   COMBA_STORE(b[13]);
-
-   /* output 14 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[14]); SQRADDAC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]); 
-   COMBA_STORE(b[14]);
-
-   /* output 15 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[15]); SQRADDAC(a[1], a[14]); SQRADDAC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB; 
-   COMBA_STORE(b[15]);
-
-   /* output 16 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[16]); SQRADDAC(a[1], a[15]); SQRADDAC(a[2], a[14]); SQRADDAC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]); 
-   COMBA_STORE(b[16]);
-
-   /* output 17 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[17]); SQRADDAC(a[1], a[16]); SQRADDAC(a[2], a[15]); SQRADDAC(a[3], a[14]); SQRADDAC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB; 
-   COMBA_STORE(b[17]);
-
-   /* output 18 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[18]); SQRADDAC(a[1], a[17]); SQRADDAC(a[2], a[16]); SQRADDAC(a[3], a[15]); SQRADDAC(a[4], a[14]); SQRADDAC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]); 
-   COMBA_STORE(b[18]);
-
-   /* output 19 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[19]); SQRADDAC(a[1], a[18]); SQRADDAC(a[2], a[17]); SQRADDAC(a[3], a[16]); SQRADDAC(a[4], a[15]); SQRADDAC(a[5], a[14]); SQRADDAC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB; 
-   COMBA_STORE(b[19]);
-
-   /* output 20 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[20]); SQRADDAC(a[1], a[19]); SQRADDAC(a[2], a[18]); SQRADDAC(a[3], a[17]); SQRADDAC(a[4], a[16]); SQRADDAC(a[5], a[15]); SQRADDAC(a[6], a[14]); SQRADDAC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]); 
-   COMBA_STORE(b[20]);
-
-   /* output 21 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[21]); SQRADDAC(a[1], a[20]); SQRADDAC(a[2], a[19]); SQRADDAC(a[3], a[18]); SQRADDAC(a[4], a[17]); SQRADDAC(a[5], a[16]); SQRADDAC(a[6], a[15]); SQRADDAC(a[7], a[14]); SQRADDAC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB; 
-   COMBA_STORE(b[21]);
-
-   /* output 22 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[22]); SQRADDAC(a[1], a[21]); SQRADDAC(a[2], a[20]); SQRADDAC(a[3], a[19]); SQRADDAC(a[4], a[18]); SQRADDAC(a[5], a[17]); SQRADDAC(a[6], a[16]); SQRADDAC(a[7], a[15]); SQRADDAC(a[8], a[14]); SQRADDAC(a[9], a[13]); SQRADDAC(a[10], a[12]); SQRADDDB; SQRADD(a[11], a[11]); 
-   COMBA_STORE(b[22]);
-
-   /* output 23 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[23]); SQRADDAC(a[1], a[22]); SQRADDAC(a[2], a[21]); SQRADDAC(a[3], a[20]); SQRADDAC(a[4], a[19]); SQRADDAC(a[5], a[18]); SQRADDAC(a[6], a[17]); SQRADDAC(a[7], a[16]); SQRADDAC(a[8], a[15]); SQRADDAC(a[9], a[14]); SQRADDAC(a[10], a[13]); SQRADDAC(a[11], a[12]); SQRADDDB; 
-   COMBA_STORE(b[23]);
-
-   /* output 24 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[24]); SQRADDAC(a[1], a[23]); SQRADDAC(a[2], a[22]); SQRADDAC(a[3], a[21]); SQRADDAC(a[4], a[20]); SQRADDAC(a[5], a[19]); SQRADDAC(a[6], a[18]); SQRADDAC(a[7], a[17]); SQRADDAC(a[8], a[16]); SQRADDAC(a[9], a[15]); SQRADDAC(a[10], a[14]); SQRADDAC(a[11], a[13]); SQRADDDB; SQRADD(a[12], a[12]); 
-   COMBA_STORE(b[24]);
-
-   /* output 25 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[25]); SQRADDAC(a[1], a[24]); SQRADDAC(a[2], a[23]); SQRADDAC(a[3], a[22]); SQRADDAC(a[4], a[21]); SQRADDAC(a[5], a[20]); SQRADDAC(a[6], a[19]); SQRADDAC(a[7], a[18]); SQRADDAC(a[8], a[17]); SQRADDAC(a[9], a[16]); SQRADDAC(a[10], a[15]); SQRADDAC(a[11], a[14]); SQRADDAC(a[12], a[13]); SQRADDDB; 
-   COMBA_STORE(b[25]);
-
-   /* output 26 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[26]); SQRADDAC(a[1], a[25]); SQRADDAC(a[2], a[24]); SQRADDAC(a[3], a[23]); SQRADDAC(a[4], a[22]); SQRADDAC(a[5], a[21]); SQRADDAC(a[6], a[20]); SQRADDAC(a[7], a[19]); SQRADDAC(a[8], a[18]); SQRADDAC(a[9], a[17]); SQRADDAC(a[10], a[16]); SQRADDAC(a[11], a[15]); SQRADDAC(a[12], a[14]); SQRADDDB; SQRADD(a[13], a[13]); 
-   COMBA_STORE(b[26]);
-
-   /* output 27 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[27]); SQRADDAC(a[1], a[26]); SQRADDAC(a[2], a[25]); SQRADDAC(a[3], a[24]); SQRADDAC(a[4], a[23]); SQRADDAC(a[5], a[22]); SQRADDAC(a[6], a[21]); SQRADDAC(a[7], a[20]); SQRADDAC(a[8], a[19]); SQRADDAC(a[9], a[18]); SQRADDAC(a[10], a[17]); SQRADDAC(a[11], a[16]); SQRADDAC(a[12], a[15]); SQRADDAC(a[13], a[14]); SQRADDDB; 
-   COMBA_STORE(b[27]);
-
-   /* output 28 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[28]); SQRADDAC(a[1], a[27]); SQRADDAC(a[2], a[26]); SQRADDAC(a[3], a[25]); SQRADDAC(a[4], a[24]); SQRADDAC(a[5], a[23]); SQRADDAC(a[6], a[22]); SQRADDAC(a[7], a[21]); SQRADDAC(a[8], a[20]); SQRADDAC(a[9], a[19]); SQRADDAC(a[10], a[18]); SQRADDAC(a[11], a[17]); SQRADDAC(a[12], a[16]); SQRADDAC(a[13], a[15]); SQRADDDB; SQRADD(a[14], a[14]); 
-   COMBA_STORE(b[28]);
-
-   /* output 29 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[29]); SQRADDAC(a[1], a[28]); SQRADDAC(a[2], a[27]); SQRADDAC(a[3], a[26]); SQRADDAC(a[4], a[25]); SQRADDAC(a[5], a[24]); SQRADDAC(a[6], a[23]); SQRADDAC(a[7], a[22]); SQRADDAC(a[8], a[21]); SQRADDAC(a[9], a[20]); SQRADDAC(a[10], a[19]); SQRADDAC(a[11], a[18]); SQRADDAC(a[12], a[17]); SQRADDAC(a[13], a[16]); SQRADDAC(a[14], a[15]); SQRADDDB; 
-   COMBA_STORE(b[29]);
-
-   /* output 30 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[30]); SQRADDAC(a[1], a[29]); SQRADDAC(a[2], a[28]); SQRADDAC(a[3], a[27]); SQRADDAC(a[4], a[26]); SQRADDAC(a[5], a[25]); SQRADDAC(a[6], a[24]); SQRADDAC(a[7], a[23]); SQRADDAC(a[8], a[22]); SQRADDAC(a[9], a[21]); SQRADDAC(a[10], a[20]); SQRADDAC(a[11], a[19]); SQRADDAC(a[12], a[18]); SQRADDAC(a[13], a[17]); SQRADDAC(a[14], a[16]); SQRADDDB; SQRADD(a[15], a[15]); 
-   COMBA_STORE(b[30]);
-
-   /* output 31 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[31]); SQRADDAC(a[1], a[30]); SQRADDAC(a[2], a[29]); SQRADDAC(a[3], a[28]); SQRADDAC(a[4], a[27]); SQRADDAC(a[5], a[26]); SQRADDAC(a[6], a[25]); SQRADDAC(a[7], a[24]); SQRADDAC(a[8], a[23]); SQRADDAC(a[9], a[22]); SQRADDAC(a[10], a[21]); SQRADDAC(a[11], a[20]); SQRADDAC(a[12], a[19]); SQRADDAC(a[13], a[18]); SQRADDAC(a[14], a[17]); SQRADDAC(a[15], a[16]); SQRADDDB; 
-   COMBA_STORE(b[31]);
-
-   /* output 32 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[32]); SQRADDAC(a[1], a[31]); SQRADDAC(a[2], a[30]); SQRADDAC(a[3], a[29]); SQRADDAC(a[4], a[28]); SQRADDAC(a[5], a[27]); SQRADDAC(a[6], a[26]); SQRADDAC(a[7], a[25]); SQRADDAC(a[8], a[24]); SQRADDAC(a[9], a[23]); SQRADDAC(a[10], a[22]); SQRADDAC(a[11], a[21]); SQRADDAC(a[12], a[20]); SQRADDAC(a[13], a[19]); SQRADDAC(a[14], a[18]); SQRADDAC(a[15], a[17]); SQRADDDB; SQRADD(a[16], a[16]); 
-   COMBA_STORE(b[32]);
-
-   /* output 33 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[33]); SQRADDAC(a[1], a[32]); SQRADDAC(a[2], a[31]); SQRADDAC(a[3], a[30]); SQRADDAC(a[4], a[29]); SQRADDAC(a[5], a[28]); SQRADDAC(a[6], a[27]); SQRADDAC(a[7], a[26]); SQRADDAC(a[8], a[25]); SQRADDAC(a[9], a[24]); SQRADDAC(a[10], a[23]); SQRADDAC(a[11], a[22]); SQRADDAC(a[12], a[21]); SQRADDAC(a[13], a[20]); SQRADDAC(a[14], a[19]); SQRADDAC(a[15], a[18]); SQRADDAC(a[16], a[17]); SQRADDDB; 
-   COMBA_STORE(b[33]);
-
-   /* output 34 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[34]); SQRADDAC(a[1], a[33]); SQRADDAC(a[2], a[32]); SQRADDAC(a[3], a[31]); SQRADDAC(a[4], a[30]); SQRADDAC(a[5], a[29]); SQRADDAC(a[6], a[28]); SQRADDAC(a[7], a[27]); SQRADDAC(a[8], a[26]); SQRADDAC(a[9], a[25]); SQRADDAC(a[10], a[24]); SQRADDAC(a[11], a[23]); SQRADDAC(a[12], a[22]); SQRADDAC(a[13], a[21]); SQRADDAC(a[14], a[20]); SQRADDAC(a[15], a[19]); SQRADDAC(a[16], a[18]); SQRADDDB; SQRADD(a[17], a[17]); 
-   COMBA_STORE(b[34]);
-
-   /* output 35 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[35]); SQRADDAC(a[1], a[34]); SQRADDAC(a[2], a[33]); SQRADDAC(a[3], a[32]); SQRADDAC(a[4], a[31]); SQRADDAC(a[5], a[30]); SQRADDAC(a[6], a[29]); SQRADDAC(a[7], a[28]); SQRADDAC(a[8], a[27]); SQRADDAC(a[9], a[26]); SQRADDAC(a[10], a[25]); SQRADDAC(a[11], a[24]); SQRADDAC(a[12], a[23]); SQRADDAC(a[13], a[22]); SQRADDAC(a[14], a[21]); SQRADDAC(a[15], a[20]); SQRADDAC(a[16], a[19]); SQRADDAC(a[17], a[18]); SQRADDDB; 
-   COMBA_STORE(b[35]);
-
-   /* output 36 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[36]); SQRADDAC(a[1], a[35]); SQRADDAC(a[2], a[34]); SQRADDAC(a[3], a[33]); SQRADDAC(a[4], a[32]); SQRADDAC(a[5], a[31]); SQRADDAC(a[6], a[30]); SQRADDAC(a[7], a[29]); SQRADDAC(a[8], a[28]); SQRADDAC(a[9], a[27]); SQRADDAC(a[10], a[26]); SQRADDAC(a[11], a[25]); SQRADDAC(a[12], a[24]); SQRADDAC(a[13], a[23]); SQRADDAC(a[14], a[22]); SQRADDAC(a[15], a[21]); SQRADDAC(a[16], a[20]); SQRADDAC(a[17], a[19]); SQRADDDB; SQRADD(a[18], a[18]); 
-   COMBA_STORE(b[36]);
-
-   /* output 37 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[37]); SQRADDAC(a[1], a[36]); SQRADDAC(a[2], a[35]); SQRADDAC(a[3], a[34]); SQRADDAC(a[4], a[33]); SQRADDAC(a[5], a[32]); SQRADDAC(a[6], a[31]); SQRADDAC(a[7], a[30]); SQRADDAC(a[8], a[29]); SQRADDAC(a[9], a[28]); SQRADDAC(a[10], a[27]); SQRADDAC(a[11], a[26]); SQRADDAC(a[12], a[25]); SQRADDAC(a[13], a[24]); SQRADDAC(a[14], a[23]); SQRADDAC(a[15], a[22]); SQRADDAC(a[16], a[21]); SQRADDAC(a[17], a[20]); SQRADDAC(a[18], a[19]); SQRADDDB; 
-   COMBA_STORE(b[37]);
-
-   /* output 38 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[38]); SQRADDAC(a[1], a[37]); SQRADDAC(a[2], a[36]); SQRADDAC(a[3], a[35]); SQRADDAC(a[4], a[34]); SQRADDAC(a[5], a[33]); SQRADDAC(a[6], a[32]); SQRADDAC(a[7], a[31]); SQRADDAC(a[8], a[30]); SQRADDAC(a[9], a[29]); SQRADDAC(a[10], a[28]); SQRADDAC(a[11], a[27]); SQRADDAC(a[12], a[26]); SQRADDAC(a[13], a[25]); SQRADDAC(a[14], a[24]); SQRADDAC(a[15], a[23]); SQRADDAC(a[16], a[22]); SQRADDAC(a[17], a[21]); SQRADDAC(a[18], a[20]); SQRADDDB; SQRADD(a[19], a[19]); 
-   COMBA_STORE(b[38]);
-
-   /* output 39 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[39]); SQRADDAC(a[1], a[38]); SQRADDAC(a[2], a[37]); SQRADDAC(a[3], a[36]); SQRADDAC(a[4], a[35]); SQRADDAC(a[5], a[34]); SQRADDAC(a[6], a[33]); SQRADDAC(a[7], a[32]); SQRADDAC(a[8], a[31]); SQRADDAC(a[9], a[30]); SQRADDAC(a[10], a[29]); SQRADDAC(a[11], a[28]); SQRADDAC(a[12], a[27]); SQRADDAC(a[13], a[26]); SQRADDAC(a[14], a[25]); SQRADDAC(a[15], a[24]); SQRADDAC(a[16], a[23]); SQRADDAC(a[17], a[22]); SQRADDAC(a[18], a[21]); SQRADDAC(a[19], a[20]); SQRADDDB; 
-   COMBA_STORE(b[39]);
-
-   /* output 40 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[40]); SQRADDAC(a[1], a[39]); SQRADDAC(a[2], a[38]); SQRADDAC(a[3], a[37]); SQRADDAC(a[4], a[36]); SQRADDAC(a[5], a[35]); SQRADDAC(a[6], a[34]); SQRADDAC(a[7], a[33]); SQRADDAC(a[8], a[32]); SQRADDAC(a[9], a[31]); SQRADDAC(a[10], a[30]); SQRADDAC(a[11], a[29]); SQRADDAC(a[12], a[28]); SQRADDAC(a[13], a[27]); SQRADDAC(a[14], a[26]); SQRADDAC(a[15], a[25]); SQRADDAC(a[16], a[24]); SQRADDAC(a[17], a[23]); SQRADDAC(a[18], a[22]); SQRADDAC(a[19], a[21]); SQRADDDB; SQRADD(a[20], a[20]); 
-   COMBA_STORE(b[40]);
-
-   /* output 41 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[41]); SQRADDAC(a[1], a[40]); SQRADDAC(a[2], a[39]); SQRADDAC(a[3], a[38]); SQRADDAC(a[4], a[37]); SQRADDAC(a[5], a[36]); SQRADDAC(a[6], a[35]); SQRADDAC(a[7], a[34]); SQRADDAC(a[8], a[33]); SQRADDAC(a[9], a[32]); SQRADDAC(a[10], a[31]); SQRADDAC(a[11], a[30]); SQRADDAC(a[12], a[29]); SQRADDAC(a[13], a[28]); SQRADDAC(a[14], a[27]); SQRADDAC(a[15], a[26]); SQRADDAC(a[16], a[25]); SQRADDAC(a[17], a[24]); SQRADDAC(a[18], a[23]); SQRADDAC(a[19], a[22]); SQRADDAC(a[20], a[21]); SQRADDDB; 
-   COMBA_STORE(b[41]);
-
-   /* output 42 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[42]); SQRADDAC(a[1], a[41]); SQRADDAC(a[2], a[40]); SQRADDAC(a[3], a[39]); SQRADDAC(a[4], a[38]); SQRADDAC(a[5], a[37]); SQRADDAC(a[6], a[36]); SQRADDAC(a[7], a[35]); SQRADDAC(a[8], a[34]); SQRADDAC(a[9], a[33]); SQRADDAC(a[10], a[32]); SQRADDAC(a[11], a[31]); SQRADDAC(a[12], a[30]); SQRADDAC(a[13], a[29]); SQRADDAC(a[14], a[28]); SQRADDAC(a[15], a[27]); SQRADDAC(a[16], a[26]); SQRADDAC(a[17], a[25]); SQRADDAC(a[18], a[24]); SQRADDAC(a[19], a[23]); SQRADDAC(a[20], a[22]); SQRADDDB; SQRADD(a[21], a[21]); 
-   COMBA_STORE(b[42]);
-
-   /* output 43 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[43]); SQRADDAC(a[1], a[42]); SQRADDAC(a[2], a[41]); SQRADDAC(a[3], a[40]); SQRADDAC(a[4], a[39]); SQRADDAC(a[5], a[38]); SQRADDAC(a[6], a[37]); SQRADDAC(a[7], a[36]); SQRADDAC(a[8], a[35]); SQRADDAC(a[9], a[34]); SQRADDAC(a[10], a[33]); SQRADDAC(a[11], a[32]); SQRADDAC(a[12], a[31]); SQRADDAC(a[13], a[30]); SQRADDAC(a[14], a[29]); SQRADDAC(a[15], a[28]); SQRADDAC(a[16], a[27]); SQRADDAC(a[17], a[26]); SQRADDAC(a[18], a[25]); SQRADDAC(a[19], a[24]); SQRADDAC(a[20], a[23]); SQRADDAC(a[21], a[22]); SQRADDDB; 
-   COMBA_STORE(b[43]);
-
-   /* output 44 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[44]); SQRADDAC(a[1], a[43]); SQRADDAC(a[2], a[42]); SQRADDAC(a[3], a[41]); SQRADDAC(a[4], a[40]); SQRADDAC(a[5], a[39]); SQRADDAC(a[6], a[38]); SQRADDAC(a[7], a[37]); SQRADDAC(a[8], a[36]); SQRADDAC(a[9], a[35]); SQRADDAC(a[10], a[34]); SQRADDAC(a[11], a[33]); SQRADDAC(a[12], a[32]); SQRADDAC(a[13], a[31]); SQRADDAC(a[14], a[30]); SQRADDAC(a[15], a[29]); SQRADDAC(a[16], a[28]); SQRADDAC(a[17], a[27]); SQRADDAC(a[18], a[26]); SQRADDAC(a[19], a[25]); SQRADDAC(a[20], a[24]); SQRADDAC(a[21], a[23]); SQRADDDB; SQRADD(a[22], a[22]); 
-   COMBA_STORE(b[44]);
-
-   /* output 45 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[45]); SQRADDAC(a[1], a[44]); SQRADDAC(a[2], a[43]); SQRADDAC(a[3], a[42]); SQRADDAC(a[4], a[41]); SQRADDAC(a[5], a[40]); SQRADDAC(a[6], a[39]); SQRADDAC(a[7], a[38]); SQRADDAC(a[8], a[37]); SQRADDAC(a[9], a[36]); SQRADDAC(a[10], a[35]); SQRADDAC(a[11], a[34]); SQRADDAC(a[12], a[33]); SQRADDAC(a[13], a[32]); SQRADDAC(a[14], a[31]); SQRADDAC(a[15], a[30]); SQRADDAC(a[16], a[29]); SQRADDAC(a[17], a[28]); SQRADDAC(a[18], a[27]); SQRADDAC(a[19], a[26]); SQRADDAC(a[20], a[25]); SQRADDAC(a[21], a[24]); SQRADDAC(a[22], a[23]); SQRADDDB; 
-   COMBA_STORE(b[45]);
-
-   /* output 46 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[46]); SQRADDAC(a[1], a[45]); SQRADDAC(a[2], a[44]); SQRADDAC(a[3], a[43]); SQRADDAC(a[4], a[42]); SQRADDAC(a[5], a[41]); SQRADDAC(a[6], a[40]); SQRADDAC(a[7], a[39]); SQRADDAC(a[8], a[38]); SQRADDAC(a[9], a[37]); SQRADDAC(a[10], a[36]); SQRADDAC(a[11], a[35]); SQRADDAC(a[12], a[34]); SQRADDAC(a[13], a[33]); SQRADDAC(a[14], a[32]); SQRADDAC(a[15], a[31]); SQRADDAC(a[16], a[30]); SQRADDAC(a[17], a[29]); SQRADDAC(a[18], a[28]); SQRADDAC(a[19], a[27]); SQRADDAC(a[20], a[26]); SQRADDAC(a[21], a[25]); SQRADDAC(a[22], a[24]); SQRADDDB; SQRADD(a[23], a[23]); 
-   COMBA_STORE(b[46]);
-
-   /* output 47 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[47]); SQRADDAC(a[1], a[46]); SQRADDAC(a[2], a[45]); SQRADDAC(a[3], a[44]); SQRADDAC(a[4], a[43]); SQRADDAC(a[5], a[42]); SQRADDAC(a[6], a[41]); SQRADDAC(a[7], a[40]); SQRADDAC(a[8], a[39]); SQRADDAC(a[9], a[38]); SQRADDAC(a[10], a[37]); SQRADDAC(a[11], a[36]); SQRADDAC(a[12], a[35]); SQRADDAC(a[13], a[34]); SQRADDAC(a[14], a[33]); SQRADDAC(a[15], a[32]); SQRADDAC(a[16], a[31]); SQRADDAC(a[17], a[30]); SQRADDAC(a[18], a[29]); SQRADDAC(a[19], a[28]); SQRADDAC(a[20], a[27]); SQRADDAC(a[21], a[26]); SQRADDAC(a[22], a[25]); SQRADDAC(a[23], a[24]); SQRADDDB; 
-   COMBA_STORE(b[47]);
-
-   /* output 48 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[48]); SQRADDAC(a[1], a[47]); SQRADDAC(a[2], a[46]); SQRADDAC(a[3], a[45]); SQRADDAC(a[4], a[44]); SQRADDAC(a[5], a[43]); SQRADDAC(a[6], a[42]); SQRADDAC(a[7], a[41]); SQRADDAC(a[8], a[40]); SQRADDAC(a[9], a[39]); SQRADDAC(a[10], a[38]); SQRADDAC(a[11], a[37]); SQRADDAC(a[12], a[36]); SQRADDAC(a[13], a[35]); SQRADDAC(a[14], a[34]); SQRADDAC(a[15], a[33]); SQRADDAC(a[16], a[32]); SQRADDAC(a[17], a[31]); SQRADDAC(a[18], a[30]); SQRADDAC(a[19], a[29]); SQRADDAC(a[20], a[28]); SQRADDAC(a[21], a[27]); SQRADDAC(a[22], a[26]); SQRADDAC(a[23], a[25]); SQRADDDB; SQRADD(a[24], a[24]); 
-   COMBA_STORE(b[48]);
-
-   /* output 49 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[49]); SQRADDAC(a[1], a[48]); SQRADDAC(a[2], a[47]); SQRADDAC(a[3], a[46]); SQRADDAC(a[4], a[45]); SQRADDAC(a[5], a[44]); SQRADDAC(a[6], a[43]); SQRADDAC(a[7], a[42]); SQRADDAC(a[8], a[41]); SQRADDAC(a[9], a[40]); SQRADDAC(a[10], a[39]); SQRADDAC(a[11], a[38]); SQRADDAC(a[12], a[37]); SQRADDAC(a[13], a[36]); SQRADDAC(a[14], a[35]); SQRADDAC(a[15], a[34]); SQRADDAC(a[16], a[33]); SQRADDAC(a[17], a[32]); SQRADDAC(a[18], a[31]); SQRADDAC(a[19], a[30]); SQRADDAC(a[20], a[29]); SQRADDAC(a[21], a[28]); SQRADDAC(a[22], a[27]); SQRADDAC(a[23], a[26]); SQRADDAC(a[24], a[25]); SQRADDDB; 
-   COMBA_STORE(b[49]);
-
-   /* output 50 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[50]); SQRADDAC(a[1], a[49]); SQRADDAC(a[2], a[48]); SQRADDAC(a[3], a[47]); SQRADDAC(a[4], a[46]); SQRADDAC(a[5], a[45]); SQRADDAC(a[6], a[44]); SQRADDAC(a[7], a[43]); SQRADDAC(a[8], a[42]); SQRADDAC(a[9], a[41]); SQRADDAC(a[10], a[40]); SQRADDAC(a[11], a[39]); SQRADDAC(a[12], a[38]); SQRADDAC(a[13], a[37]); SQRADDAC(a[14], a[36]); SQRADDAC(a[15], a[35]); SQRADDAC(a[16], a[34]); SQRADDAC(a[17], a[33]); SQRADDAC(a[18], a[32]); SQRADDAC(a[19], a[31]); SQRADDAC(a[20], a[30]); SQRADDAC(a[21], a[29]); SQRADDAC(a[22], a[28]); SQRADDAC(a[23], a[27]); SQRADDAC(a[24], a[26]); SQRADDDB; SQRADD(a[25], a[25]); 
-   COMBA_STORE(b[50]);
-
-   /* output 51 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[51]); SQRADDAC(a[1], a[50]); SQRADDAC(a[2], a[49]); SQRADDAC(a[3], a[48]); SQRADDAC(a[4], a[47]); SQRADDAC(a[5], a[46]); SQRADDAC(a[6], a[45]); SQRADDAC(a[7], a[44]); SQRADDAC(a[8], a[43]); SQRADDAC(a[9], a[42]); SQRADDAC(a[10], a[41]); SQRADDAC(a[11], a[40]); SQRADDAC(a[12], a[39]); SQRADDAC(a[13], a[38]); SQRADDAC(a[14], a[37]); SQRADDAC(a[15], a[36]); SQRADDAC(a[16], a[35]); SQRADDAC(a[17], a[34]); SQRADDAC(a[18], a[33]); SQRADDAC(a[19], a[32]); SQRADDAC(a[20], a[31]); SQRADDAC(a[21], a[30]); SQRADDAC(a[22], a[29]); SQRADDAC(a[23], a[28]); SQRADDAC(a[24], a[27]); SQRADDAC(a[25], a[26]); SQRADDDB; 
-   COMBA_STORE(b[51]);
-
-   /* output 52 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[52]); SQRADDAC(a[1], a[51]); SQRADDAC(a[2], a[50]); SQRADDAC(a[3], a[49]); SQRADDAC(a[4], a[48]); SQRADDAC(a[5], a[47]); SQRADDAC(a[6], a[46]); SQRADDAC(a[7], a[45]); SQRADDAC(a[8], a[44]); SQRADDAC(a[9], a[43]); SQRADDAC(a[10], a[42]); SQRADDAC(a[11], a[41]); SQRADDAC(a[12], a[40]); SQRADDAC(a[13], a[39]); SQRADDAC(a[14], a[38]); SQRADDAC(a[15], a[37]); SQRADDAC(a[16], a[36]); SQRADDAC(a[17], a[35]); SQRADDAC(a[18], a[34]); SQRADDAC(a[19], a[33]); SQRADDAC(a[20], a[32]); SQRADDAC(a[21], a[31]); SQRADDAC(a[22], a[30]); SQRADDAC(a[23], a[29]); SQRADDAC(a[24], a[28]); SQRADDAC(a[25], a[27]); SQRADDDB; SQRADD(a[26], a[26]); 
-   COMBA_STORE(b[52]);
-
-   /* output 53 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[53]); SQRADDAC(a[1], a[52]); SQRADDAC(a[2], a[51]); SQRADDAC(a[3], a[50]); SQRADDAC(a[4], a[49]); SQRADDAC(a[5], a[48]); SQRADDAC(a[6], a[47]); SQRADDAC(a[7], a[46]); SQRADDAC(a[8], a[45]); SQRADDAC(a[9], a[44]); SQRADDAC(a[10], a[43]); SQRADDAC(a[11], a[42]); SQRADDAC(a[12], a[41]); SQRADDAC(a[13], a[40]); SQRADDAC(a[14], a[39]); SQRADDAC(a[15], a[38]); SQRADDAC(a[16], a[37]); SQRADDAC(a[17], a[36]); SQRADDAC(a[18], a[35]); SQRADDAC(a[19], a[34]); SQRADDAC(a[20], a[33]); SQRADDAC(a[21], a[32]); SQRADDAC(a[22], a[31]); SQRADDAC(a[23], a[30]); SQRADDAC(a[24], a[29]); SQRADDAC(a[25], a[28]); SQRADDAC(a[26], a[27]); SQRADDDB; 
-   COMBA_STORE(b[53]);
-
-   /* output 54 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[54]); SQRADDAC(a[1], a[53]); SQRADDAC(a[2], a[52]); SQRADDAC(a[3], a[51]); SQRADDAC(a[4], a[50]); SQRADDAC(a[5], a[49]); SQRADDAC(a[6], a[48]); SQRADDAC(a[7], a[47]); SQRADDAC(a[8], a[46]); SQRADDAC(a[9], a[45]); SQRADDAC(a[10], a[44]); SQRADDAC(a[11], a[43]); SQRADDAC(a[12], a[42]); SQRADDAC(a[13], a[41]); SQRADDAC(a[14], a[40]); SQRADDAC(a[15], a[39]); SQRADDAC(a[16], a[38]); SQRADDAC(a[17], a[37]); SQRADDAC(a[18], a[36]); SQRADDAC(a[19], a[35]); SQRADDAC(a[20], a[34]); SQRADDAC(a[21], a[33]); SQRADDAC(a[22], a[32]); SQRADDAC(a[23], a[31]); SQRADDAC(a[24], a[30]); SQRADDAC(a[25], a[29]); SQRADDAC(a[26], a[28]); SQRADDDB; SQRADD(a[27], a[27]); 
-   COMBA_STORE(b[54]);
-
-   /* output 55 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[55]); SQRADDAC(a[1], a[54]); SQRADDAC(a[2], a[53]); SQRADDAC(a[3], a[52]); SQRADDAC(a[4], a[51]); SQRADDAC(a[5], a[50]); SQRADDAC(a[6], a[49]); SQRADDAC(a[7], a[48]); SQRADDAC(a[8], a[47]); SQRADDAC(a[9], a[46]); SQRADDAC(a[10], a[45]); SQRADDAC(a[11], a[44]); SQRADDAC(a[12], a[43]); SQRADDAC(a[13], a[42]); SQRADDAC(a[14], a[41]); SQRADDAC(a[15], a[40]); SQRADDAC(a[16], a[39]); SQRADDAC(a[17], a[38]); SQRADDAC(a[18], a[37]); SQRADDAC(a[19], a[36]); SQRADDAC(a[20], a[35]); SQRADDAC(a[21], a[34]); SQRADDAC(a[22], a[33]); SQRADDAC(a[23], a[32]); SQRADDAC(a[24], a[31]); SQRADDAC(a[25], a[30]); SQRADDAC(a[26], a[29]); SQRADDAC(a[27], a[28]); SQRADDDB; 
-   COMBA_STORE(b[55]);
-
-   /* output 56 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[56]); SQRADDAC(a[1], a[55]); SQRADDAC(a[2], a[54]); SQRADDAC(a[3], a[53]); SQRADDAC(a[4], a[52]); SQRADDAC(a[5], a[51]); SQRADDAC(a[6], a[50]); SQRADDAC(a[7], a[49]); SQRADDAC(a[8], a[48]); SQRADDAC(a[9], a[47]); SQRADDAC(a[10], a[46]); SQRADDAC(a[11], a[45]); SQRADDAC(a[12], a[44]); SQRADDAC(a[13], a[43]); SQRADDAC(a[14], a[42]); SQRADDAC(a[15], a[41]); SQRADDAC(a[16], a[40]); SQRADDAC(a[17], a[39]); SQRADDAC(a[18], a[38]); SQRADDAC(a[19], a[37]); SQRADDAC(a[20], a[36]); SQRADDAC(a[21], a[35]); SQRADDAC(a[22], a[34]); SQRADDAC(a[23], a[33]); SQRADDAC(a[24], a[32]); SQRADDAC(a[25], a[31]); SQRADDAC(a[26], a[30]); SQRADDAC(a[27], a[29]); SQRADDDB; SQRADD(a[28], a[28]); 
-   COMBA_STORE(b[56]);
-
-   /* output 57 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[57]); SQRADDAC(a[1], a[56]); SQRADDAC(a[2], a[55]); SQRADDAC(a[3], a[54]); SQRADDAC(a[4], a[53]); SQRADDAC(a[5], a[52]); SQRADDAC(a[6], a[51]); SQRADDAC(a[7], a[50]); SQRADDAC(a[8], a[49]); SQRADDAC(a[9], a[48]); SQRADDAC(a[10], a[47]); SQRADDAC(a[11], a[46]); SQRADDAC(a[12], a[45]); SQRADDAC(a[13], a[44]); SQRADDAC(a[14], a[43]); SQRADDAC(a[15], a[42]); SQRADDAC(a[16], a[41]); SQRADDAC(a[17], a[40]); SQRADDAC(a[18], a[39]); SQRADDAC(a[19], a[38]); SQRADDAC(a[20], a[37]); SQRADDAC(a[21], a[36]); SQRADDAC(a[22], a[35]); SQRADDAC(a[23], a[34]); SQRADDAC(a[24], a[33]); SQRADDAC(a[25], a[32]); SQRADDAC(a[26], a[31]); SQRADDAC(a[27], a[30]); SQRADDAC(a[28], a[29]); SQRADDDB; 
-   COMBA_STORE(b[57]);
-
-   /* output 58 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[58]); SQRADDAC(a[1], a[57]); SQRADDAC(a[2], a[56]); SQRADDAC(a[3], a[55]); SQRADDAC(a[4], a[54]); SQRADDAC(a[5], a[53]); SQRADDAC(a[6], a[52]); SQRADDAC(a[7], a[51]); SQRADDAC(a[8], a[50]); SQRADDAC(a[9], a[49]); SQRADDAC(a[10], a[48]); SQRADDAC(a[11], a[47]); SQRADDAC(a[12], a[46]); SQRADDAC(a[13], a[45]); SQRADDAC(a[14], a[44]); SQRADDAC(a[15], a[43]); SQRADDAC(a[16], a[42]); SQRADDAC(a[17], a[41]); SQRADDAC(a[18], a[40]); SQRADDAC(a[19], a[39]); SQRADDAC(a[20], a[38]); SQRADDAC(a[21], a[37]); SQRADDAC(a[22], a[36]); SQRADDAC(a[23], a[35]); SQRADDAC(a[24], a[34]); SQRADDAC(a[25], a[33]); SQRADDAC(a[26], a[32]); SQRADDAC(a[27], a[31]); SQRADDAC(a[28], a[30]); SQRADDDB; SQRADD(a[29], a[29]); 
-   COMBA_STORE(b[58]);
-
-   /* output 59 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[59]); SQRADDAC(a[1], a[58]); SQRADDAC(a[2], a[57]); SQRADDAC(a[3], a[56]); SQRADDAC(a[4], a[55]); SQRADDAC(a[5], a[54]); SQRADDAC(a[6], a[53]); SQRADDAC(a[7], a[52]); SQRADDAC(a[8], a[51]); SQRADDAC(a[9], a[50]); SQRADDAC(a[10], a[49]); SQRADDAC(a[11], a[48]); SQRADDAC(a[12], a[47]); SQRADDAC(a[13], a[46]); SQRADDAC(a[14], a[45]); SQRADDAC(a[15], a[44]); SQRADDAC(a[16], a[43]); SQRADDAC(a[17], a[42]); SQRADDAC(a[18], a[41]); SQRADDAC(a[19], a[40]); SQRADDAC(a[20], a[39]); SQRADDAC(a[21], a[38]); SQRADDAC(a[22], a[37]); SQRADDAC(a[23], a[36]); SQRADDAC(a[24], a[35]); SQRADDAC(a[25], a[34]); SQRADDAC(a[26], a[33]); SQRADDAC(a[27], a[32]); SQRADDAC(a[28], a[31]); SQRADDAC(a[29], a[30]); SQRADDDB; 
-   COMBA_STORE(b[59]);
-
-   /* output 60 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[60]); SQRADDAC(a[1], a[59]); SQRADDAC(a[2], a[58]); SQRADDAC(a[3], a[57]); SQRADDAC(a[4], a[56]); SQRADDAC(a[5], a[55]); SQRADDAC(a[6], a[54]); SQRADDAC(a[7], a[53]); SQRADDAC(a[8], a[52]); SQRADDAC(a[9], a[51]); SQRADDAC(a[10], a[50]); SQRADDAC(a[11], a[49]); SQRADDAC(a[12], a[48]); SQRADDAC(a[13], a[47]); SQRADDAC(a[14], a[46]); SQRADDAC(a[15], a[45]); SQRADDAC(a[16], a[44]); SQRADDAC(a[17], a[43]); SQRADDAC(a[18], a[42]); SQRADDAC(a[19], a[41]); SQRADDAC(a[20], a[40]); SQRADDAC(a[21], a[39]); SQRADDAC(a[22], a[38]); SQRADDAC(a[23], a[37]); SQRADDAC(a[24], a[36]); SQRADDAC(a[25], a[35]); SQRADDAC(a[26], a[34]); SQRADDAC(a[27], a[33]); SQRADDAC(a[28], a[32]); SQRADDAC(a[29], a[31]); SQRADDDB; SQRADD(a[30], a[30]); 
-   COMBA_STORE(b[60]);
-
-   /* output 61 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[61]); SQRADDAC(a[1], a[60]); SQRADDAC(a[2], a[59]); SQRADDAC(a[3], a[58]); SQRADDAC(a[4], a[57]); SQRADDAC(a[5], a[56]); SQRADDAC(a[6], a[55]); SQRADDAC(a[7], a[54]); SQRADDAC(a[8], a[53]); SQRADDAC(a[9], a[52]); SQRADDAC(a[10], a[51]); SQRADDAC(a[11], a[50]); SQRADDAC(a[12], a[49]); SQRADDAC(a[13], a[48]); SQRADDAC(a[14], a[47]); SQRADDAC(a[15], a[46]); SQRADDAC(a[16], a[45]); SQRADDAC(a[17], a[44]); SQRADDAC(a[18], a[43]); SQRADDAC(a[19], a[42]); SQRADDAC(a[20], a[41]); SQRADDAC(a[21], a[40]); SQRADDAC(a[22], a[39]); SQRADDAC(a[23], a[38]); SQRADDAC(a[24], a[37]); SQRADDAC(a[25], a[36]); SQRADDAC(a[26], a[35]); SQRADDAC(a[27], a[34]); SQRADDAC(a[28], a[33]); SQRADDAC(a[29], a[32]); SQRADDAC(a[30], a[31]); SQRADDDB; 
-   COMBA_STORE(b[61]);
-
-   /* output 62 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[62]); SQRADDAC(a[1], a[61]); SQRADDAC(a[2], a[60]); SQRADDAC(a[3], a[59]); SQRADDAC(a[4], a[58]); SQRADDAC(a[5], a[57]); SQRADDAC(a[6], a[56]); SQRADDAC(a[7], a[55]); SQRADDAC(a[8], a[54]); SQRADDAC(a[9], a[53]); SQRADDAC(a[10], a[52]); SQRADDAC(a[11], a[51]); SQRADDAC(a[12], a[50]); SQRADDAC(a[13], a[49]); SQRADDAC(a[14], a[48]); SQRADDAC(a[15], a[47]); SQRADDAC(a[16], a[46]); SQRADDAC(a[17], a[45]); SQRADDAC(a[18], a[44]); SQRADDAC(a[19], a[43]); SQRADDAC(a[20], a[42]); SQRADDAC(a[21], a[41]); SQRADDAC(a[22], a[40]); SQRADDAC(a[23], a[39]); SQRADDAC(a[24], a[38]); SQRADDAC(a[25], a[37]); SQRADDAC(a[26], a[36]); SQRADDAC(a[27], a[35]); SQRADDAC(a[28], a[34]); SQRADDAC(a[29], a[33]); SQRADDAC(a[30], a[32]); SQRADDDB; SQRADD(a[31], a[31]); 
-   COMBA_STORE(b[62]);
-
-   /* output 63 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[63]); SQRADDAC(a[1], a[62]); SQRADDAC(a[2], a[61]); SQRADDAC(a[3], a[60]); SQRADDAC(a[4], a[59]); SQRADDAC(a[5], a[58]); SQRADDAC(a[6], a[57]); SQRADDAC(a[7], a[56]); SQRADDAC(a[8], a[55]); SQRADDAC(a[9], a[54]); SQRADDAC(a[10], a[53]); SQRADDAC(a[11], a[52]); SQRADDAC(a[12], a[51]); SQRADDAC(a[13], a[50]); SQRADDAC(a[14], a[49]); SQRADDAC(a[15], a[48]); SQRADDAC(a[16], a[47]); SQRADDAC(a[17], a[46]); SQRADDAC(a[18], a[45]); SQRADDAC(a[19], a[44]); SQRADDAC(a[20], a[43]); SQRADDAC(a[21], a[42]); SQRADDAC(a[22], a[41]); SQRADDAC(a[23], a[40]); SQRADDAC(a[24], a[39]); SQRADDAC(a[25], a[38]); SQRADDAC(a[26], a[37]); SQRADDAC(a[27], a[36]); SQRADDAC(a[28], a[35]); SQRADDAC(a[29], a[34]); SQRADDAC(a[30], a[33]); SQRADDAC(a[31], a[32]); SQRADDDB; 
-   COMBA_STORE(b[63]);
-
-   /* output 64 */
-   CARRY_FORWARD;
-   SQRADDSC(a[1], a[63]); SQRADDAC(a[2], a[62]); SQRADDAC(a[3], a[61]); SQRADDAC(a[4], a[60]); SQRADDAC(a[5], a[59]); SQRADDAC(a[6], a[58]); SQRADDAC(a[7], a[57]); SQRADDAC(a[8], a[56]); SQRADDAC(a[9], a[55]); SQRADDAC(a[10], a[54]); SQRADDAC(a[11], a[53]); SQRADDAC(a[12], a[52]); SQRADDAC(a[13], a[51]); SQRADDAC(a[14], a[50]); SQRADDAC(a[15], a[49]); SQRADDAC(a[16], a[48]); SQRADDAC(a[17], a[47]); SQRADDAC(a[18], a[46]); SQRADDAC(a[19], a[45]); SQRADDAC(a[20], a[44]); SQRADDAC(a[21], a[43]); SQRADDAC(a[22], a[42]); SQRADDAC(a[23], a[41]); SQRADDAC(a[24], a[40]); SQRADDAC(a[25], a[39]); SQRADDAC(a[26], a[38]); SQRADDAC(a[27], a[37]); SQRADDAC(a[28], a[36]); SQRADDAC(a[29], a[35]); SQRADDAC(a[30], a[34]); SQRADDAC(a[31], a[33]); SQRADDDB; SQRADD(a[32], a[32]); 
-   COMBA_STORE(b[64]);
-
-   /* output 65 */
-   CARRY_FORWARD;
-   SQRADDSC(a[2], a[63]); SQRADDAC(a[3], a[62]); SQRADDAC(a[4], a[61]); SQRADDAC(a[5], a[60]); SQRADDAC(a[6], a[59]); SQRADDAC(a[7], a[58]); SQRADDAC(a[8], a[57]); SQRADDAC(a[9], a[56]); SQRADDAC(a[10], a[55]); SQRADDAC(a[11], a[54]); SQRADDAC(a[12], a[53]); SQRADDAC(a[13], a[52]); SQRADDAC(a[14], a[51]); SQRADDAC(a[15], a[50]); SQRADDAC(a[16], a[49]); SQRADDAC(a[17], a[48]); SQRADDAC(a[18], a[47]); SQRADDAC(a[19], a[46]); SQRADDAC(a[20], a[45]); SQRADDAC(a[21], a[44]); SQRADDAC(a[22], a[43]); SQRADDAC(a[23], a[42]); SQRADDAC(a[24], a[41]); SQRADDAC(a[25], a[40]); SQRADDAC(a[26], a[39]); SQRADDAC(a[27], a[38]); SQRADDAC(a[28], a[37]); SQRADDAC(a[29], a[36]); SQRADDAC(a[30], a[35]); SQRADDAC(a[31], a[34]); SQRADDAC(a[32], a[33]); SQRADDDB; 
-   COMBA_STORE(b[65]);
-
-   /* output 66 */
-   CARRY_FORWARD;
-   SQRADDSC(a[3], a[63]); SQRADDAC(a[4], a[62]); SQRADDAC(a[5], a[61]); SQRADDAC(a[6], a[60]); SQRADDAC(a[7], a[59]); SQRADDAC(a[8], a[58]); SQRADDAC(a[9], a[57]); SQRADDAC(a[10], a[56]); SQRADDAC(a[11], a[55]); SQRADDAC(a[12], a[54]); SQRADDAC(a[13], a[53]); SQRADDAC(a[14], a[52]); SQRADDAC(a[15], a[51]); SQRADDAC(a[16], a[50]); SQRADDAC(a[17], a[49]); SQRADDAC(a[18], a[48]); SQRADDAC(a[19], a[47]); SQRADDAC(a[20], a[46]); SQRADDAC(a[21], a[45]); SQRADDAC(a[22], a[44]); SQRADDAC(a[23], a[43]); SQRADDAC(a[24], a[42]); SQRADDAC(a[25], a[41]); SQRADDAC(a[26], a[40]); SQRADDAC(a[27], a[39]); SQRADDAC(a[28], a[38]); SQRADDAC(a[29], a[37]); SQRADDAC(a[30], a[36]); SQRADDAC(a[31], a[35]); SQRADDAC(a[32], a[34]); SQRADDDB; SQRADD(a[33], a[33]); 
-   COMBA_STORE(b[66]);
-
-   /* output 67 */
-   CARRY_FORWARD;
-   SQRADDSC(a[4], a[63]); SQRADDAC(a[5], a[62]); SQRADDAC(a[6], a[61]); SQRADDAC(a[7], a[60]); SQRADDAC(a[8], a[59]); SQRADDAC(a[9], a[58]); SQRADDAC(a[10], a[57]); SQRADDAC(a[11], a[56]); SQRADDAC(a[12], a[55]); SQRADDAC(a[13], a[54]); SQRADDAC(a[14], a[53]); SQRADDAC(a[15], a[52]); SQRADDAC(a[16], a[51]); SQRADDAC(a[17], a[50]); SQRADDAC(a[18], a[49]); SQRADDAC(a[19], a[48]); SQRADDAC(a[20], a[47]); SQRADDAC(a[21], a[46]); SQRADDAC(a[22], a[45]); SQRADDAC(a[23], a[44]); SQRADDAC(a[24], a[43]); SQRADDAC(a[25], a[42]); SQRADDAC(a[26], a[41]); SQRADDAC(a[27], a[40]); SQRADDAC(a[28], a[39]); SQRADDAC(a[29], a[38]); SQRADDAC(a[30], a[37]); SQRADDAC(a[31], a[36]); SQRADDAC(a[32], a[35]); SQRADDAC(a[33], a[34]); SQRADDDB; 
-   COMBA_STORE(b[67]);
-
-   /* output 68 */
-   CARRY_FORWARD;
-   SQRADDSC(a[5], a[63]); SQRADDAC(a[6], a[62]); SQRADDAC(a[7], a[61]); SQRADDAC(a[8], a[60]); SQRADDAC(a[9], a[59]); SQRADDAC(a[10], a[58]); SQRADDAC(a[11], a[57]); SQRADDAC(a[12], a[56]); SQRADDAC(a[13], a[55]); SQRADDAC(a[14], a[54]); SQRADDAC(a[15], a[53]); SQRADDAC(a[16], a[52]); SQRADDAC(a[17], a[51]); SQRADDAC(a[18], a[50]); SQRADDAC(a[19], a[49]); SQRADDAC(a[20], a[48]); SQRADDAC(a[21], a[47]); SQRADDAC(a[22], a[46]); SQRADDAC(a[23], a[45]); SQRADDAC(a[24], a[44]); SQRADDAC(a[25], a[43]); SQRADDAC(a[26], a[42]); SQRADDAC(a[27], a[41]); SQRADDAC(a[28], a[40]); SQRADDAC(a[29], a[39]); SQRADDAC(a[30], a[38]); SQRADDAC(a[31], a[37]); SQRADDAC(a[32], a[36]); SQRADDAC(a[33], a[35]); SQRADDDB; SQRADD(a[34], a[34]); 
-   COMBA_STORE(b[68]);
-
-   /* output 69 */
-   CARRY_FORWARD;
-   SQRADDSC(a[6], a[63]); SQRADDAC(a[7], a[62]); SQRADDAC(a[8], a[61]); SQRADDAC(a[9], a[60]); SQRADDAC(a[10], a[59]); SQRADDAC(a[11], a[58]); SQRADDAC(a[12], a[57]); SQRADDAC(a[13], a[56]); SQRADDAC(a[14], a[55]); SQRADDAC(a[15], a[54]); SQRADDAC(a[16], a[53]); SQRADDAC(a[17], a[52]); SQRADDAC(a[18], a[51]); SQRADDAC(a[19], a[50]); SQRADDAC(a[20], a[49]); SQRADDAC(a[21], a[48]); SQRADDAC(a[22], a[47]); SQRADDAC(a[23], a[46]); SQRADDAC(a[24], a[45]); SQRADDAC(a[25], a[44]); SQRADDAC(a[26], a[43]); SQRADDAC(a[27], a[42]); SQRADDAC(a[28], a[41]); SQRADDAC(a[29], a[40]); SQRADDAC(a[30], a[39]); SQRADDAC(a[31], a[38]); SQRADDAC(a[32], a[37]); SQRADDAC(a[33], a[36]); SQRADDAC(a[34], a[35]); SQRADDDB; 
-   COMBA_STORE(b[69]);
-
-   /* output 70 */
-   CARRY_FORWARD;
-   SQRADDSC(a[7], a[63]); SQRADDAC(a[8], a[62]); SQRADDAC(a[9], a[61]); SQRADDAC(a[10], a[60]); SQRADDAC(a[11], a[59]); SQRADDAC(a[12], a[58]); SQRADDAC(a[13], a[57]); SQRADDAC(a[14], a[56]); SQRADDAC(a[15], a[55]); SQRADDAC(a[16], a[54]); SQRADDAC(a[17], a[53]); SQRADDAC(a[18], a[52]); SQRADDAC(a[19], a[51]); SQRADDAC(a[20], a[50]); SQRADDAC(a[21], a[49]); SQRADDAC(a[22], a[48]); SQRADDAC(a[23], a[47]); SQRADDAC(a[24], a[46]); SQRADDAC(a[25], a[45]); SQRADDAC(a[26], a[44]); SQRADDAC(a[27], a[43]); SQRADDAC(a[28], a[42]); SQRADDAC(a[29], a[41]); SQRADDAC(a[30], a[40]); SQRADDAC(a[31], a[39]); SQRADDAC(a[32], a[38]); SQRADDAC(a[33], a[37]); SQRADDAC(a[34], a[36]); SQRADDDB; SQRADD(a[35], a[35]); 
-   COMBA_STORE(b[70]);
-
-   /* output 71 */
-   CARRY_FORWARD;
-   SQRADDSC(a[8], a[63]); SQRADDAC(a[9], a[62]); SQRADDAC(a[10], a[61]); SQRADDAC(a[11], a[60]); SQRADDAC(a[12], a[59]); SQRADDAC(a[13], a[58]); SQRADDAC(a[14], a[57]); SQRADDAC(a[15], a[56]); SQRADDAC(a[16], a[55]); SQRADDAC(a[17], a[54]); SQRADDAC(a[18], a[53]); SQRADDAC(a[19], a[52]); SQRADDAC(a[20], a[51]); SQRADDAC(a[21], a[50]); SQRADDAC(a[22], a[49]); SQRADDAC(a[23], a[48]); SQRADDAC(a[24], a[47]); SQRADDAC(a[25], a[46]); SQRADDAC(a[26], a[45]); SQRADDAC(a[27], a[44]); SQRADDAC(a[28], a[43]); SQRADDAC(a[29], a[42]); SQRADDAC(a[30], a[41]); SQRADDAC(a[31], a[40]); SQRADDAC(a[32], a[39]); SQRADDAC(a[33], a[38]); SQRADDAC(a[34], a[37]); SQRADDAC(a[35], a[36]); SQRADDDB; 
-   COMBA_STORE(b[71]);
-
-   /* output 72 */
-   CARRY_FORWARD;
-   SQRADDSC(a[9], a[63]); SQRADDAC(a[10], a[62]); SQRADDAC(a[11], a[61]); SQRADDAC(a[12], a[60]); SQRADDAC(a[13], a[59]); SQRADDAC(a[14], a[58]); SQRADDAC(a[15], a[57]); SQRADDAC(a[16], a[56]); SQRADDAC(a[17], a[55]); SQRADDAC(a[18], a[54]); SQRADDAC(a[19], a[53]); SQRADDAC(a[20], a[52]); SQRADDAC(a[21], a[51]); SQRADDAC(a[22], a[50]); SQRADDAC(a[23], a[49]); SQRADDAC(a[24], a[48]); SQRADDAC(a[25], a[47]); SQRADDAC(a[26], a[46]); SQRADDAC(a[27], a[45]); SQRADDAC(a[28], a[44]); SQRADDAC(a[29], a[43]); SQRADDAC(a[30], a[42]); SQRADDAC(a[31], a[41]); SQRADDAC(a[32], a[40]); SQRADDAC(a[33], a[39]); SQRADDAC(a[34], a[38]); SQRADDAC(a[35], a[37]); SQRADDDB; SQRADD(a[36], a[36]); 
-   COMBA_STORE(b[72]);
-
-   /* output 73 */
-   CARRY_FORWARD;
-   SQRADDSC(a[10], a[63]); SQRADDAC(a[11], a[62]); SQRADDAC(a[12], a[61]); SQRADDAC(a[13], a[60]); SQRADDAC(a[14], a[59]); SQRADDAC(a[15], a[58]); SQRADDAC(a[16], a[57]); SQRADDAC(a[17], a[56]); SQRADDAC(a[18], a[55]); SQRADDAC(a[19], a[54]); SQRADDAC(a[20], a[53]); SQRADDAC(a[21], a[52]); SQRADDAC(a[22], a[51]); SQRADDAC(a[23], a[50]); SQRADDAC(a[24], a[49]); SQRADDAC(a[25], a[48]); SQRADDAC(a[26], a[47]); SQRADDAC(a[27], a[46]); SQRADDAC(a[28], a[45]); SQRADDAC(a[29], a[44]); SQRADDAC(a[30], a[43]); SQRADDAC(a[31], a[42]); SQRADDAC(a[32], a[41]); SQRADDAC(a[33], a[40]); SQRADDAC(a[34], a[39]); SQRADDAC(a[35], a[38]); SQRADDAC(a[36], a[37]); SQRADDDB; 
-   COMBA_STORE(b[73]);
-
-   /* output 74 */
-   CARRY_FORWARD;
-   SQRADDSC(a[11], a[63]); SQRADDAC(a[12], a[62]); SQRADDAC(a[13], a[61]); SQRADDAC(a[14], a[60]); SQRADDAC(a[15], a[59]); SQRADDAC(a[16], a[58]); SQRADDAC(a[17], a[57]); SQRADDAC(a[18], a[56]); SQRADDAC(a[19], a[55]); SQRADDAC(a[20], a[54]); SQRADDAC(a[21], a[53]); SQRADDAC(a[22], a[52]); SQRADDAC(a[23], a[51]); SQRADDAC(a[24], a[50]); SQRADDAC(a[25], a[49]); SQRADDAC(a[26], a[48]); SQRADDAC(a[27], a[47]); SQRADDAC(a[28], a[46]); SQRADDAC(a[29], a[45]); SQRADDAC(a[30], a[44]); SQRADDAC(a[31], a[43]); SQRADDAC(a[32], a[42]); SQRADDAC(a[33], a[41]); SQRADDAC(a[34], a[40]); SQRADDAC(a[35], a[39]); SQRADDAC(a[36], a[38]); SQRADDDB; SQRADD(a[37], a[37]); 
-   COMBA_STORE(b[74]);
-
-   /* output 75 */
-   CARRY_FORWARD;
-   SQRADDSC(a[12], a[63]); SQRADDAC(a[13], a[62]); SQRADDAC(a[14], a[61]); SQRADDAC(a[15], a[60]); SQRADDAC(a[16], a[59]); SQRADDAC(a[17], a[58]); SQRADDAC(a[18], a[57]); SQRADDAC(a[19], a[56]); SQRADDAC(a[20], a[55]); SQRADDAC(a[21], a[54]); SQRADDAC(a[22], a[53]); SQRADDAC(a[23], a[52]); SQRADDAC(a[24], a[51]); SQRADDAC(a[25], a[50]); SQRADDAC(a[26], a[49]); SQRADDAC(a[27], a[48]); SQRADDAC(a[28], a[47]); SQRADDAC(a[29], a[46]); SQRADDAC(a[30], a[45]); SQRADDAC(a[31], a[44]); SQRADDAC(a[32], a[43]); SQRADDAC(a[33], a[42]); SQRADDAC(a[34], a[41]); SQRADDAC(a[35], a[40]); SQRADDAC(a[36], a[39]); SQRADDAC(a[37], a[38]); SQRADDDB; 
-   COMBA_STORE(b[75]);
-
-   /* output 76 */
-   CARRY_FORWARD;
-   SQRADDSC(a[13], a[63]); SQRADDAC(a[14], a[62]); SQRADDAC(a[15], a[61]); SQRADDAC(a[16], a[60]); SQRADDAC(a[17], a[59]); SQRADDAC(a[18], a[58]); SQRADDAC(a[19], a[57]); SQRADDAC(a[20], a[56]); SQRADDAC(a[21], a[55]); SQRADDAC(a[22], a[54]); SQRADDAC(a[23], a[53]); SQRADDAC(a[24], a[52]); SQRADDAC(a[25], a[51]); SQRADDAC(a[26], a[50]); SQRADDAC(a[27], a[49]); SQRADDAC(a[28], a[48]); SQRADDAC(a[29], a[47]); SQRADDAC(a[30], a[46]); SQRADDAC(a[31], a[45]); SQRADDAC(a[32], a[44]); SQRADDAC(a[33], a[43]); SQRADDAC(a[34], a[42]); SQRADDAC(a[35], a[41]); SQRADDAC(a[36], a[40]); SQRADDAC(a[37], a[39]); SQRADDDB; SQRADD(a[38], a[38]); 
-   COMBA_STORE(b[76]);
-
-   /* output 77 */
-   CARRY_FORWARD;
-   SQRADDSC(a[14], a[63]); SQRADDAC(a[15], a[62]); SQRADDAC(a[16], a[61]); SQRADDAC(a[17], a[60]); SQRADDAC(a[18], a[59]); SQRADDAC(a[19], a[58]); SQRADDAC(a[20], a[57]); SQRADDAC(a[21], a[56]); SQRADDAC(a[22], a[55]); SQRADDAC(a[23], a[54]); SQRADDAC(a[24], a[53]); SQRADDAC(a[25], a[52]); SQRADDAC(a[26], a[51]); SQRADDAC(a[27], a[50]); SQRADDAC(a[28], a[49]); SQRADDAC(a[29], a[48]); SQRADDAC(a[30], a[47]); SQRADDAC(a[31], a[46]); SQRADDAC(a[32], a[45]); SQRADDAC(a[33], a[44]); SQRADDAC(a[34], a[43]); SQRADDAC(a[35], a[42]); SQRADDAC(a[36], a[41]); SQRADDAC(a[37], a[40]); SQRADDAC(a[38], a[39]); SQRADDDB; 
-   COMBA_STORE(b[77]);
-
-   /* output 78 */
-   CARRY_FORWARD;
-   SQRADDSC(a[15], a[63]); SQRADDAC(a[16], a[62]); SQRADDAC(a[17], a[61]); SQRADDAC(a[18], a[60]); SQRADDAC(a[19], a[59]); SQRADDAC(a[20], a[58]); SQRADDAC(a[21], a[57]); SQRADDAC(a[22], a[56]); SQRADDAC(a[23], a[55]); SQRADDAC(a[24], a[54]); SQRADDAC(a[25], a[53]); SQRADDAC(a[26], a[52]); SQRADDAC(a[27], a[51]); SQRADDAC(a[28], a[50]); SQRADDAC(a[29], a[49]); SQRADDAC(a[30], a[48]); SQRADDAC(a[31], a[47]); SQRADDAC(a[32], a[46]); SQRADDAC(a[33], a[45]); SQRADDAC(a[34], a[44]); SQRADDAC(a[35], a[43]); SQRADDAC(a[36], a[42]); SQRADDAC(a[37], a[41]); SQRADDAC(a[38], a[40]); SQRADDDB; SQRADD(a[39], a[39]); 
-   COMBA_STORE(b[78]);
-
-   /* output 79 */
-   CARRY_FORWARD;
-   SQRADDSC(a[16], a[63]); SQRADDAC(a[17], a[62]); SQRADDAC(a[18], a[61]); SQRADDAC(a[19], a[60]); SQRADDAC(a[20], a[59]); SQRADDAC(a[21], a[58]); SQRADDAC(a[22], a[57]); SQRADDAC(a[23], a[56]); SQRADDAC(a[24], a[55]); SQRADDAC(a[25], a[54]); SQRADDAC(a[26], a[53]); SQRADDAC(a[27], a[52]); SQRADDAC(a[28], a[51]); SQRADDAC(a[29], a[50]); SQRADDAC(a[30], a[49]); SQRADDAC(a[31], a[48]); SQRADDAC(a[32], a[47]); SQRADDAC(a[33], a[46]); SQRADDAC(a[34], a[45]); SQRADDAC(a[35], a[44]); SQRADDAC(a[36], a[43]); SQRADDAC(a[37], a[42]); SQRADDAC(a[38], a[41]); SQRADDAC(a[39], a[40]); SQRADDDB; 
-   COMBA_STORE(b[79]);
-
-   /* output 80 */
-   CARRY_FORWARD;
-   SQRADDSC(a[17], a[63]); SQRADDAC(a[18], a[62]); SQRADDAC(a[19], a[61]); SQRADDAC(a[20], a[60]); SQRADDAC(a[21], a[59]); SQRADDAC(a[22], a[58]); SQRADDAC(a[23], a[57]); SQRADDAC(a[24], a[56]); SQRADDAC(a[25], a[55]); SQRADDAC(a[26], a[54]); SQRADDAC(a[27], a[53]); SQRADDAC(a[28], a[52]); SQRADDAC(a[29], a[51]); SQRADDAC(a[30], a[50]); SQRADDAC(a[31], a[49]); SQRADDAC(a[32], a[48]); SQRADDAC(a[33], a[47]); SQRADDAC(a[34], a[46]); SQRADDAC(a[35], a[45]); SQRADDAC(a[36], a[44]); SQRADDAC(a[37], a[43]); SQRADDAC(a[38], a[42]); SQRADDAC(a[39], a[41]); SQRADDDB; SQRADD(a[40], a[40]); 
-   COMBA_STORE(b[80]);
-
-   /* output 81 */
-   CARRY_FORWARD;
-   SQRADDSC(a[18], a[63]); SQRADDAC(a[19], a[62]); SQRADDAC(a[20], a[61]); SQRADDAC(a[21], a[60]); SQRADDAC(a[22], a[59]); SQRADDAC(a[23], a[58]); SQRADDAC(a[24], a[57]); SQRADDAC(a[25], a[56]); SQRADDAC(a[26], a[55]); SQRADDAC(a[27], a[54]); SQRADDAC(a[28], a[53]); SQRADDAC(a[29], a[52]); SQRADDAC(a[30], a[51]); SQRADDAC(a[31], a[50]); SQRADDAC(a[32], a[49]); SQRADDAC(a[33], a[48]); SQRADDAC(a[34], a[47]); SQRADDAC(a[35], a[46]); SQRADDAC(a[36], a[45]); SQRADDAC(a[37], a[44]); SQRADDAC(a[38], a[43]); SQRADDAC(a[39], a[42]); SQRADDAC(a[40], a[41]); SQRADDDB; 
-   COMBA_STORE(b[81]);
-
-   /* output 82 */
-   CARRY_FORWARD;
-   SQRADDSC(a[19], a[63]); SQRADDAC(a[20], a[62]); SQRADDAC(a[21], a[61]); SQRADDAC(a[22], a[60]); SQRADDAC(a[23], a[59]); SQRADDAC(a[24], a[58]); SQRADDAC(a[25], a[57]); SQRADDAC(a[26], a[56]); SQRADDAC(a[27], a[55]); SQRADDAC(a[28], a[54]); SQRADDAC(a[29], a[53]); SQRADDAC(a[30], a[52]); SQRADDAC(a[31], a[51]); SQRADDAC(a[32], a[50]); SQRADDAC(a[33], a[49]); SQRADDAC(a[34], a[48]); SQRADDAC(a[35], a[47]); SQRADDAC(a[36], a[46]); SQRADDAC(a[37], a[45]); SQRADDAC(a[38], a[44]); SQRADDAC(a[39], a[43]); SQRADDAC(a[40], a[42]); SQRADDDB; SQRADD(a[41], a[41]); 
-   COMBA_STORE(b[82]);
-
-   /* output 83 */
-   CARRY_FORWARD;
-   SQRADDSC(a[20], a[63]); SQRADDAC(a[21], a[62]); SQRADDAC(a[22], a[61]); SQRADDAC(a[23], a[60]); SQRADDAC(a[24], a[59]); SQRADDAC(a[25], a[58]); SQRADDAC(a[26], a[57]); SQRADDAC(a[27], a[56]); SQRADDAC(a[28], a[55]); SQRADDAC(a[29], a[54]); SQRADDAC(a[30], a[53]); SQRADDAC(a[31], a[52]); SQRADDAC(a[32], a[51]); SQRADDAC(a[33], a[50]); SQRADDAC(a[34], a[49]); SQRADDAC(a[35], a[48]); SQRADDAC(a[36], a[47]); SQRADDAC(a[37], a[46]); SQRADDAC(a[38], a[45]); SQRADDAC(a[39], a[44]); SQRADDAC(a[40], a[43]); SQRADDAC(a[41], a[42]); SQRADDDB; 
-   COMBA_STORE(b[83]);
-
-   /* output 84 */
-   CARRY_FORWARD;
-   SQRADDSC(a[21], a[63]); SQRADDAC(a[22], a[62]); SQRADDAC(a[23], a[61]); SQRADDAC(a[24], a[60]); SQRADDAC(a[25], a[59]); SQRADDAC(a[26], a[58]); SQRADDAC(a[27], a[57]); SQRADDAC(a[28], a[56]); SQRADDAC(a[29], a[55]); SQRADDAC(a[30], a[54]); SQRADDAC(a[31], a[53]); SQRADDAC(a[32], a[52]); SQRADDAC(a[33], a[51]); SQRADDAC(a[34], a[50]); SQRADDAC(a[35], a[49]); SQRADDAC(a[36], a[48]); SQRADDAC(a[37], a[47]); SQRADDAC(a[38], a[46]); SQRADDAC(a[39], a[45]); SQRADDAC(a[40], a[44]); SQRADDAC(a[41], a[43]); SQRADDDB; SQRADD(a[42], a[42]); 
-   COMBA_STORE(b[84]);
-
-   /* output 85 */
-   CARRY_FORWARD;
-   SQRADDSC(a[22], a[63]); SQRADDAC(a[23], a[62]); SQRADDAC(a[24], a[61]); SQRADDAC(a[25], a[60]); SQRADDAC(a[26], a[59]); SQRADDAC(a[27], a[58]); SQRADDAC(a[28], a[57]); SQRADDAC(a[29], a[56]); SQRADDAC(a[30], a[55]); SQRADDAC(a[31], a[54]); SQRADDAC(a[32], a[53]); SQRADDAC(a[33], a[52]); SQRADDAC(a[34], a[51]); SQRADDAC(a[35], a[50]); SQRADDAC(a[36], a[49]); SQRADDAC(a[37], a[48]); SQRADDAC(a[38], a[47]); SQRADDAC(a[39], a[46]); SQRADDAC(a[40], a[45]); SQRADDAC(a[41], a[44]); SQRADDAC(a[42], a[43]); SQRADDDB; 
-   COMBA_STORE(b[85]);
-
-   /* output 86 */
-   CARRY_FORWARD;
-   SQRADDSC(a[23], a[63]); SQRADDAC(a[24], a[62]); SQRADDAC(a[25], a[61]); SQRADDAC(a[26], a[60]); SQRADDAC(a[27], a[59]); SQRADDAC(a[28], a[58]); SQRADDAC(a[29], a[57]); SQRADDAC(a[30], a[56]); SQRADDAC(a[31], a[55]); SQRADDAC(a[32], a[54]); SQRADDAC(a[33], a[53]); SQRADDAC(a[34], a[52]); SQRADDAC(a[35], a[51]); SQRADDAC(a[36], a[50]); SQRADDAC(a[37], a[49]); SQRADDAC(a[38], a[48]); SQRADDAC(a[39], a[47]); SQRADDAC(a[40], a[46]); SQRADDAC(a[41], a[45]); SQRADDAC(a[42], a[44]); SQRADDDB; SQRADD(a[43], a[43]); 
-   COMBA_STORE(b[86]);
-
-   /* output 87 */
-   CARRY_FORWARD;
-   SQRADDSC(a[24], a[63]); SQRADDAC(a[25], a[62]); SQRADDAC(a[26], a[61]); SQRADDAC(a[27], a[60]); SQRADDAC(a[28], a[59]); SQRADDAC(a[29], a[58]); SQRADDAC(a[30], a[57]); SQRADDAC(a[31], a[56]); SQRADDAC(a[32], a[55]); SQRADDAC(a[33], a[54]); SQRADDAC(a[34], a[53]); SQRADDAC(a[35], a[52]); SQRADDAC(a[36], a[51]); SQRADDAC(a[37], a[50]); SQRADDAC(a[38], a[49]); SQRADDAC(a[39], a[48]); SQRADDAC(a[40], a[47]); SQRADDAC(a[41], a[46]); SQRADDAC(a[42], a[45]); SQRADDAC(a[43], a[44]); SQRADDDB; 
-   COMBA_STORE(b[87]);
-
-   /* output 88 */
-   CARRY_FORWARD;
-   SQRADDSC(a[25], a[63]); SQRADDAC(a[26], a[62]); SQRADDAC(a[27], a[61]); SQRADDAC(a[28], a[60]); SQRADDAC(a[29], a[59]); SQRADDAC(a[30], a[58]); SQRADDAC(a[31], a[57]); SQRADDAC(a[32], a[56]); SQRADDAC(a[33], a[55]); SQRADDAC(a[34], a[54]); SQRADDAC(a[35], a[53]); SQRADDAC(a[36], a[52]); SQRADDAC(a[37], a[51]); SQRADDAC(a[38], a[50]); SQRADDAC(a[39], a[49]); SQRADDAC(a[40], a[48]); SQRADDAC(a[41], a[47]); SQRADDAC(a[42], a[46]); SQRADDAC(a[43], a[45]); SQRADDDB; SQRADD(a[44], a[44]); 
-   COMBA_STORE(b[88]);
-
-   /* output 89 */
-   CARRY_FORWARD;
-   SQRADDSC(a[26], a[63]); SQRADDAC(a[27], a[62]); SQRADDAC(a[28], a[61]); SQRADDAC(a[29], a[60]); SQRADDAC(a[30], a[59]); SQRADDAC(a[31], a[58]); SQRADDAC(a[32], a[57]); SQRADDAC(a[33], a[56]); SQRADDAC(a[34], a[55]); SQRADDAC(a[35], a[54]); SQRADDAC(a[36], a[53]); SQRADDAC(a[37], a[52]); SQRADDAC(a[38], a[51]); SQRADDAC(a[39], a[50]); SQRADDAC(a[40], a[49]); SQRADDAC(a[41], a[48]); SQRADDAC(a[42], a[47]); SQRADDAC(a[43], a[46]); SQRADDAC(a[44], a[45]); SQRADDDB; 
-   COMBA_STORE(b[89]);
-
-   /* output 90 */
-   CARRY_FORWARD;
-   SQRADDSC(a[27], a[63]); SQRADDAC(a[28], a[62]); SQRADDAC(a[29], a[61]); SQRADDAC(a[30], a[60]); SQRADDAC(a[31], a[59]); SQRADDAC(a[32], a[58]); SQRADDAC(a[33], a[57]); SQRADDAC(a[34], a[56]); SQRADDAC(a[35], a[55]); SQRADDAC(a[36], a[54]); SQRADDAC(a[37], a[53]); SQRADDAC(a[38], a[52]); SQRADDAC(a[39], a[51]); SQRADDAC(a[40], a[50]); SQRADDAC(a[41], a[49]); SQRADDAC(a[42], a[48]); SQRADDAC(a[43], a[47]); SQRADDAC(a[44], a[46]); SQRADDDB; SQRADD(a[45], a[45]); 
-   COMBA_STORE(b[90]);
-
-   /* output 91 */
-   CARRY_FORWARD;
-   SQRADDSC(a[28], a[63]); SQRADDAC(a[29], a[62]); SQRADDAC(a[30], a[61]); SQRADDAC(a[31], a[60]); SQRADDAC(a[32], a[59]); SQRADDAC(a[33], a[58]); SQRADDAC(a[34], a[57]); SQRADDAC(a[35], a[56]); SQRADDAC(a[36], a[55]); SQRADDAC(a[37], a[54]); SQRADDAC(a[38], a[53]); SQRADDAC(a[39], a[52]); SQRADDAC(a[40], a[51]); SQRADDAC(a[41], a[50]); SQRADDAC(a[42], a[49]); SQRADDAC(a[43], a[48]); SQRADDAC(a[44], a[47]); SQRADDAC(a[45], a[46]); SQRADDDB; 
-   COMBA_STORE(b[91]);
-
-   /* output 92 */
-   CARRY_FORWARD;
-   SQRADDSC(a[29], a[63]); SQRADDAC(a[30], a[62]); SQRADDAC(a[31], a[61]); SQRADDAC(a[32], a[60]); SQRADDAC(a[33], a[59]); SQRADDAC(a[34], a[58]); SQRADDAC(a[35], a[57]); SQRADDAC(a[36], a[56]); SQRADDAC(a[37], a[55]); SQRADDAC(a[38], a[54]); SQRADDAC(a[39], a[53]); SQRADDAC(a[40], a[52]); SQRADDAC(a[41], a[51]); SQRADDAC(a[42], a[50]); SQRADDAC(a[43], a[49]); SQRADDAC(a[44], a[48]); SQRADDAC(a[45], a[47]); SQRADDDB; SQRADD(a[46], a[46]); 
-   COMBA_STORE(b[92]);
-
-   /* output 93 */
-   CARRY_FORWARD;
-   SQRADDSC(a[30], a[63]); SQRADDAC(a[31], a[62]); SQRADDAC(a[32], a[61]); SQRADDAC(a[33], a[60]); SQRADDAC(a[34], a[59]); SQRADDAC(a[35], a[58]); SQRADDAC(a[36], a[57]); SQRADDAC(a[37], a[56]); SQRADDAC(a[38], a[55]); SQRADDAC(a[39], a[54]); SQRADDAC(a[40], a[53]); SQRADDAC(a[41], a[52]); SQRADDAC(a[42], a[51]); SQRADDAC(a[43], a[50]); SQRADDAC(a[44], a[49]); SQRADDAC(a[45], a[48]); SQRADDAC(a[46], a[47]); SQRADDDB; 
-   COMBA_STORE(b[93]);
-
-   /* output 94 */
-   CARRY_FORWARD;
-   SQRADDSC(a[31], a[63]); SQRADDAC(a[32], a[62]); SQRADDAC(a[33], a[61]); SQRADDAC(a[34], a[60]); SQRADDAC(a[35], a[59]); SQRADDAC(a[36], a[58]); SQRADDAC(a[37], a[57]); SQRADDAC(a[38], a[56]); SQRADDAC(a[39], a[55]); SQRADDAC(a[40], a[54]); SQRADDAC(a[41], a[53]); SQRADDAC(a[42], a[52]); SQRADDAC(a[43], a[51]); SQRADDAC(a[44], a[50]); SQRADDAC(a[45], a[49]); SQRADDAC(a[46], a[48]); SQRADDDB; SQRADD(a[47], a[47]); 
-   COMBA_STORE(b[94]);
-
-   /* output 95 */
-   CARRY_FORWARD;
-   SQRADDSC(a[32], a[63]); SQRADDAC(a[33], a[62]); SQRADDAC(a[34], a[61]); SQRADDAC(a[35], a[60]); SQRADDAC(a[36], a[59]); SQRADDAC(a[37], a[58]); SQRADDAC(a[38], a[57]); SQRADDAC(a[39], a[56]); SQRADDAC(a[40], a[55]); SQRADDAC(a[41], a[54]); SQRADDAC(a[42], a[53]); SQRADDAC(a[43], a[52]); SQRADDAC(a[44], a[51]); SQRADDAC(a[45], a[50]); SQRADDAC(a[46], a[49]); SQRADDAC(a[47], a[48]); SQRADDDB; 
-   COMBA_STORE(b[95]);
-
-   /* output 96 */
-   CARRY_FORWARD;
-   SQRADDSC(a[33], a[63]); SQRADDAC(a[34], a[62]); SQRADDAC(a[35], a[61]); SQRADDAC(a[36], a[60]); SQRADDAC(a[37], a[59]); SQRADDAC(a[38], a[58]); SQRADDAC(a[39], a[57]); SQRADDAC(a[40], a[56]); SQRADDAC(a[41], a[55]); SQRADDAC(a[42], a[54]); SQRADDAC(a[43], a[53]); SQRADDAC(a[44], a[52]); SQRADDAC(a[45], a[51]); SQRADDAC(a[46], a[50]); SQRADDAC(a[47], a[49]); SQRADDDB; SQRADD(a[48], a[48]); 
-   COMBA_STORE(b[96]);
-
-   /* output 97 */
-   CARRY_FORWARD;
-   SQRADDSC(a[34], a[63]); SQRADDAC(a[35], a[62]); SQRADDAC(a[36], a[61]); SQRADDAC(a[37], a[60]); SQRADDAC(a[38], a[59]); SQRADDAC(a[39], a[58]); SQRADDAC(a[40], a[57]); SQRADDAC(a[41], a[56]); SQRADDAC(a[42], a[55]); SQRADDAC(a[43], a[54]); SQRADDAC(a[44], a[53]); SQRADDAC(a[45], a[52]); SQRADDAC(a[46], a[51]); SQRADDAC(a[47], a[50]); SQRADDAC(a[48], a[49]); SQRADDDB; 
-   COMBA_STORE(b[97]);
-
-   /* output 98 */
-   CARRY_FORWARD;
-   SQRADDSC(a[35], a[63]); SQRADDAC(a[36], a[62]); SQRADDAC(a[37], a[61]); SQRADDAC(a[38], a[60]); SQRADDAC(a[39], a[59]); SQRADDAC(a[40], a[58]); SQRADDAC(a[41], a[57]); SQRADDAC(a[42], a[56]); SQRADDAC(a[43], a[55]); SQRADDAC(a[44], a[54]); SQRADDAC(a[45], a[53]); SQRADDAC(a[46], a[52]); SQRADDAC(a[47], a[51]); SQRADDAC(a[48], a[50]); SQRADDDB; SQRADD(a[49], a[49]); 
-   COMBA_STORE(b[98]);
-
-   /* output 99 */
-   CARRY_FORWARD;
-   SQRADDSC(a[36], a[63]); SQRADDAC(a[37], a[62]); SQRADDAC(a[38], a[61]); SQRADDAC(a[39], a[60]); SQRADDAC(a[40], a[59]); SQRADDAC(a[41], a[58]); SQRADDAC(a[42], a[57]); SQRADDAC(a[43], a[56]); SQRADDAC(a[44], a[55]); SQRADDAC(a[45], a[54]); SQRADDAC(a[46], a[53]); SQRADDAC(a[47], a[52]); SQRADDAC(a[48], a[51]); SQRADDAC(a[49], a[50]); SQRADDDB; 
-   COMBA_STORE(b[99]);
-
-   /* output 100 */
-   CARRY_FORWARD;
-   SQRADDSC(a[37], a[63]); SQRADDAC(a[38], a[62]); SQRADDAC(a[39], a[61]); SQRADDAC(a[40], a[60]); SQRADDAC(a[41], a[59]); SQRADDAC(a[42], a[58]); SQRADDAC(a[43], a[57]); SQRADDAC(a[44], a[56]); SQRADDAC(a[45], a[55]); SQRADDAC(a[46], a[54]); SQRADDAC(a[47], a[53]); SQRADDAC(a[48], a[52]); SQRADDAC(a[49], a[51]); SQRADDDB; SQRADD(a[50], a[50]); 
-   COMBA_STORE(b[100]);
-
-   /* output 101 */
-   CARRY_FORWARD;
-   SQRADDSC(a[38], a[63]); SQRADDAC(a[39], a[62]); SQRADDAC(a[40], a[61]); SQRADDAC(a[41], a[60]); SQRADDAC(a[42], a[59]); SQRADDAC(a[43], a[58]); SQRADDAC(a[44], a[57]); SQRADDAC(a[45], a[56]); SQRADDAC(a[46], a[55]); SQRADDAC(a[47], a[54]); SQRADDAC(a[48], a[53]); SQRADDAC(a[49], a[52]); SQRADDAC(a[50], a[51]); SQRADDDB; 
-   COMBA_STORE(b[101]);
-
-   /* output 102 */
-   CARRY_FORWARD;
-   SQRADDSC(a[39], a[63]); SQRADDAC(a[40], a[62]); SQRADDAC(a[41], a[61]); SQRADDAC(a[42], a[60]); SQRADDAC(a[43], a[59]); SQRADDAC(a[44], a[58]); SQRADDAC(a[45], a[57]); SQRADDAC(a[46], a[56]); SQRADDAC(a[47], a[55]); SQRADDAC(a[48], a[54]); SQRADDAC(a[49], a[53]); SQRADDAC(a[50], a[52]); SQRADDDB; SQRADD(a[51], a[51]); 
-   COMBA_STORE(b[102]);
-
-   /* output 103 */
-   CARRY_FORWARD;
-   SQRADDSC(a[40], a[63]); SQRADDAC(a[41], a[62]); SQRADDAC(a[42], a[61]); SQRADDAC(a[43], a[60]); SQRADDAC(a[44], a[59]); SQRADDAC(a[45], a[58]); SQRADDAC(a[46], a[57]); SQRADDAC(a[47], a[56]); SQRADDAC(a[48], a[55]); SQRADDAC(a[49], a[54]); SQRADDAC(a[50], a[53]); SQRADDAC(a[51], a[52]); SQRADDDB; 
-   COMBA_STORE(b[103]);
-
-   /* output 104 */
-   CARRY_FORWARD;
-   SQRADDSC(a[41], a[63]); SQRADDAC(a[42], a[62]); SQRADDAC(a[43], a[61]); SQRADDAC(a[44], a[60]); SQRADDAC(a[45], a[59]); SQRADDAC(a[46], a[58]); SQRADDAC(a[47], a[57]); SQRADDAC(a[48], a[56]); SQRADDAC(a[49], a[55]); SQRADDAC(a[50], a[54]); SQRADDAC(a[51], a[53]); SQRADDDB; SQRADD(a[52], a[52]); 
-   COMBA_STORE(b[104]);
-
-   /* output 105 */
-   CARRY_FORWARD;
-   SQRADDSC(a[42], a[63]); SQRADDAC(a[43], a[62]); SQRADDAC(a[44], a[61]); SQRADDAC(a[45], a[60]); SQRADDAC(a[46], a[59]); SQRADDAC(a[47], a[58]); SQRADDAC(a[48], a[57]); SQRADDAC(a[49], a[56]); SQRADDAC(a[50], a[55]); SQRADDAC(a[51], a[54]); SQRADDAC(a[52], a[53]); SQRADDDB; 
-   COMBA_STORE(b[105]);
-
-   /* output 106 */
-   CARRY_FORWARD;
-   SQRADDSC(a[43], a[63]); SQRADDAC(a[44], a[62]); SQRADDAC(a[45], a[61]); SQRADDAC(a[46], a[60]); SQRADDAC(a[47], a[59]); SQRADDAC(a[48], a[58]); SQRADDAC(a[49], a[57]); SQRADDAC(a[50], a[56]); SQRADDAC(a[51], a[55]); SQRADDAC(a[52], a[54]); SQRADDDB; SQRADD(a[53], a[53]); 
-   COMBA_STORE(b[106]);
-
-   /* output 107 */
-   CARRY_FORWARD;
-   SQRADDSC(a[44], a[63]); SQRADDAC(a[45], a[62]); SQRADDAC(a[46], a[61]); SQRADDAC(a[47], a[60]); SQRADDAC(a[48], a[59]); SQRADDAC(a[49], a[58]); SQRADDAC(a[50], a[57]); SQRADDAC(a[51], a[56]); SQRADDAC(a[52], a[55]); SQRADDAC(a[53], a[54]); SQRADDDB; 
-   COMBA_STORE(b[107]);
-
-   /* output 108 */
-   CARRY_FORWARD;
-   SQRADDSC(a[45], a[63]); SQRADDAC(a[46], a[62]); SQRADDAC(a[47], a[61]); SQRADDAC(a[48], a[60]); SQRADDAC(a[49], a[59]); SQRADDAC(a[50], a[58]); SQRADDAC(a[51], a[57]); SQRADDAC(a[52], a[56]); SQRADDAC(a[53], a[55]); SQRADDDB; SQRADD(a[54], a[54]); 
-   COMBA_STORE(b[108]);
-
-   /* output 109 */
-   CARRY_FORWARD;
-   SQRADDSC(a[46], a[63]); SQRADDAC(a[47], a[62]); SQRADDAC(a[48], a[61]); SQRADDAC(a[49], a[60]); SQRADDAC(a[50], a[59]); SQRADDAC(a[51], a[58]); SQRADDAC(a[52], a[57]); SQRADDAC(a[53], a[56]); SQRADDAC(a[54], a[55]); SQRADDDB; 
-   COMBA_STORE(b[109]);
-
-   /* output 110 */
-   CARRY_FORWARD;
-   SQRADDSC(a[47], a[63]); SQRADDAC(a[48], a[62]); SQRADDAC(a[49], a[61]); SQRADDAC(a[50], a[60]); SQRADDAC(a[51], a[59]); SQRADDAC(a[52], a[58]); SQRADDAC(a[53], a[57]); SQRADDAC(a[54], a[56]); SQRADDDB; SQRADD(a[55], a[55]); 
-   COMBA_STORE(b[110]);
-
-   /* output 111 */
-   CARRY_FORWARD;
-   SQRADDSC(a[48], a[63]); SQRADDAC(a[49], a[62]); SQRADDAC(a[50], a[61]); SQRADDAC(a[51], a[60]); SQRADDAC(a[52], a[59]); SQRADDAC(a[53], a[58]); SQRADDAC(a[54], a[57]); SQRADDAC(a[55], a[56]); SQRADDDB; 
-   COMBA_STORE(b[111]);
-
-   /* output 112 */
-   CARRY_FORWARD;
-   SQRADDSC(a[49], a[63]); SQRADDAC(a[50], a[62]); SQRADDAC(a[51], a[61]); SQRADDAC(a[52], a[60]); SQRADDAC(a[53], a[59]); SQRADDAC(a[54], a[58]); SQRADDAC(a[55], a[57]); SQRADDDB; SQRADD(a[56], a[56]); 
-   COMBA_STORE(b[112]);
-
-   /* output 113 */
-   CARRY_FORWARD;
-   SQRADDSC(a[50], a[63]); SQRADDAC(a[51], a[62]); SQRADDAC(a[52], a[61]); SQRADDAC(a[53], a[60]); SQRADDAC(a[54], a[59]); SQRADDAC(a[55], a[58]); SQRADDAC(a[56], a[57]); SQRADDDB; 
-   COMBA_STORE(b[113]);
-
-   /* output 114 */
-   CARRY_FORWARD;
-   SQRADDSC(a[51], a[63]); SQRADDAC(a[52], a[62]); SQRADDAC(a[53], a[61]); SQRADDAC(a[54], a[60]); SQRADDAC(a[55], a[59]); SQRADDAC(a[56], a[58]); SQRADDDB; SQRADD(a[57], a[57]); 
-   COMBA_STORE(b[114]);
-
-   /* output 115 */
-   CARRY_FORWARD;
-   SQRADDSC(a[52], a[63]); SQRADDAC(a[53], a[62]); SQRADDAC(a[54], a[61]); SQRADDAC(a[55], a[60]); SQRADDAC(a[56], a[59]); SQRADDAC(a[57], a[58]); SQRADDDB; 
-   COMBA_STORE(b[115]);
-
-   /* output 116 */
-   CARRY_FORWARD;
-   SQRADDSC(a[53], a[63]); SQRADDAC(a[54], a[62]); SQRADDAC(a[55], a[61]); SQRADDAC(a[56], a[60]); SQRADDAC(a[57], a[59]); SQRADDDB; SQRADD(a[58], a[58]); 
-   COMBA_STORE(b[116]);
-
-   /* output 117 */
-   CARRY_FORWARD;
-   SQRADDSC(a[54], a[63]); SQRADDAC(a[55], a[62]); SQRADDAC(a[56], a[61]); SQRADDAC(a[57], a[60]); SQRADDAC(a[58], a[59]); SQRADDDB; 
-   COMBA_STORE(b[117]);
-
-   /* output 118 */
-   CARRY_FORWARD;
-   SQRADDSC(a[55], a[63]); SQRADDAC(a[56], a[62]); SQRADDAC(a[57], a[61]); SQRADDAC(a[58], a[60]); SQRADDDB; SQRADD(a[59], a[59]); 
-   COMBA_STORE(b[118]);
-
-   /* output 119 */
-   CARRY_FORWARD;
-   SQRADDSC(a[56], a[63]); SQRADDAC(a[57], a[62]); SQRADDAC(a[58], a[61]); SQRADDAC(a[59], a[60]); SQRADDDB; 
-   COMBA_STORE(b[119]);
-
-   /* output 120 */
-   CARRY_FORWARD;
-   SQRADDSC(a[57], a[63]); SQRADDAC(a[58], a[62]); SQRADDAC(a[59], a[61]); SQRADDDB; SQRADD(a[60], a[60]); 
-   COMBA_STORE(b[120]);
-
-   /* output 121 */
-   CARRY_FORWARD;
-   SQRADDSC(a[58], a[63]); SQRADDAC(a[59], a[62]); SQRADDAC(a[60], a[61]); SQRADDDB; 
-   COMBA_STORE(b[121]);
-
-   /* output 122 */
-   CARRY_FORWARD;
-   SQRADD2(a[59], a[63]); SQRADD2(a[60], a[62]); SQRADD(a[61], a[61]); 
-   COMBA_STORE(b[122]);
-
-   /* output 123 */
-   CARRY_FORWARD;
-   SQRADD2(a[60], a[63]); SQRADD2(a[61], a[62]); 
-   COMBA_STORE(b[123]);
-
-   /* output 124 */
-   CARRY_FORWARD;
-   SQRADD2(a[61], a[63]); SQRADD(a[62], a[62]); 
-   COMBA_STORE(b[124]);
-
-   /* output 125 */
-   CARRY_FORWARD;
-   SQRADD2(a[62], a[63]); 
-   COMBA_STORE(b[125]);
-
-   /* output 126 */
-   CARRY_FORWARD;
-   SQRADD(a[63], a[63]); 
-   COMBA_STORE(b[126]);
-   COMBA_STORE2(b[127]);
-   COMBA_FINI;
-
-   B->used = 128;
-   B->sign = FP_ZPOS;
-   XMEMCPY(B->dp, b, 128 * sizeof(fp_digit));
-   fp_clamp(B);
-
-#ifdef WOLFSSL_SMALL_STACK
-   XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-#endif
-   return FP_OKAY;
-}
-#endif
-
-
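
Note on the removed code above: fp_sqr_comba_64 is pure Comba squaring, fully unrolled. Each "output k" block sums every product a[i]*a[j] with i + j == k into the three-digit column accumulator (c0, c1, c2), doubling the cross products and adding the diagonal square a[k/2]*a[k/2] once; COMBA_STORE then writes the low digit and CARRY_FORWARD shifts the column. A minimal portable model of the two simplest macros — a sketch only, assuming hypothetical 32-bit digit and 64-bit word types (digit_t, word_t, acc0..acc2 are illustrative names, not wolfSSL's):

    #include <stdint.h>

    typedef uint32_t digit_t;
    typedef uint64_t word_t;

    /* Three-digit column accumulator, mirroring c0/c1/c2 above. */
    static digit_t acc0, acc1, acc2;

    /* Models SQRADD(x, y): add one product into the column,
     * rippling the carry up through all three digits. */
    static void sqr_add(digit_t x, digit_t y)
    {
        word_t t = (word_t)acc0 + (word_t)x * y;
        acc0 = (digit_t)t;
        t = (word_t)acc1 + (t >> 32);
        acc1 = (digit_t)t;
        acc2 += (digit_t)(t >> 32);
    }

    /* Models SQRADD2(x, y): a cross product counts twice, since
     * a[i]*a[j] and a[j]*a[i] both land in column i + j. */
    static void sqr_add2(digit_t x, digit_t y)
    {
        sqr_add(x, y);
        sqr_add(x, y);
    }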

+ 0 - 127
lib/wolfssl/wolfcrypt/src/fp_sqr_comba_7.i

@@ -1,127 +0,0 @@
-/* fp_sqr_comba_7.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#ifdef TFM_SQR7
-int fp_sqr_comba7(fp_int *A, fp_int *B)
-{
-   fp_digit *a, c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0;
-#ifdef TFM_ISO
-   fp_word tt;
-#endif
-#ifndef WOLFSSL_SMALL_STACK
-   fp_digit b[14];
-#else
-   fp_digit *b;
-#endif
-
-#ifdef WOLFSSL_SMALL_STACK
-   b = (fp_digit*)XMALLOC(sizeof(fp_digit) * 14, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-   if (b == NULL)
-      return FP_MEM;
-#endif
-
-   a = A->dp;
-   COMBA_START; 
-
-   /* clear carries */
-   CLEAR_CARRY;
-
-   /* output 0 */
-   SQRADD(a[0],a[0]);
-   COMBA_STORE(b[0]);
-
-   /* output 1 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[1]); 
-   COMBA_STORE(b[1]);
-
-   /* output 2 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]); 
-   COMBA_STORE(b[2]);
-
-   /* output 3 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]); 
-   COMBA_STORE(b[3]);
-
-   /* output 4 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]); 
-   COMBA_STORE(b[4]);
-
-   /* output 5 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB; 
-   COMBA_STORE(b[5]);
-
-   /* output 6 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]); 
-   COMBA_STORE(b[6]);
-
-   /* output 7 */
-   CARRY_FORWARD;
-   SQRADDSC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB; 
-   COMBA_STORE(b[7]);
-
-   /* output 8 */
-   CARRY_FORWARD;
-   SQRADD2(a[2], a[6]); SQRADD2(a[3], a[5]); SQRADD(a[4], a[4]); 
-   COMBA_STORE(b[8]);
-
-   /* output 9 */
-   CARRY_FORWARD;
-   SQRADD2(a[3], a[6]); SQRADD2(a[4], a[5]); 
-   COMBA_STORE(b[9]);
-
-   /* output 10 */
-   CARRY_FORWARD;
-   SQRADD2(a[4], a[6]); SQRADD(a[5], a[5]); 
-   COMBA_STORE(b[10]);
-
-   /* output 11 */
-   CARRY_FORWARD;
-   SQRADD2(a[5], a[6]); 
-   COMBA_STORE(b[11]);
-
-   /* output 12 */
-   CARRY_FORWARD;
-   SQRADD(a[6], a[6]); 
-   COMBA_STORE(b[12]);
-   COMBA_STORE2(b[13]);
-   COMBA_FINI;
-
-   B->used = 14;
-   B->sign = FP_ZPOS;
-   XMEMCPY(B->dp, b, 14 * sizeof(fp_digit));
-   fp_clamp(B);
-
-#ifdef WOLFSSL_SMALL_STACK
-   XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-#endif
-   return FP_OKAY;
-}
-#endif
-
-
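
In the longer columns the deleted code switches from SQRADD2 to the SQRADDSC/SQRADDAC/SQRADDDB trio: the distinct cross products of a column are summed once into the secondary accumulator (sc0, sc1, sc2), and that partial sum is then folded into the main column twice, so the whole run is doubled with two multi-digit additions instead of doubling every product. A sketch of that pattern under the same hypothetical 32-bit digit model as above (all names illustrative):

    #include <stdint.h>

    typedef uint32_t digit_t;
    typedef uint64_t word_t;

    static digit_t acc0, acc1, acc2;   /* main column (c0..c2)    */
    static digit_t sub0, sub1, sub2;   /* sub-sum     (sc0..sc2)  */

    /* Models SQRADDSC: start the sub-sum with the first product. */
    static void sqr_addsc(digit_t x, digit_t y)
    {
        word_t t = (word_t)x * y;
        sub0 = (digit_t)t;
        sub1 = (digit_t)(t >> 32);
        sub2 = 0;
    }

    /* Models SQRADDAC: accumulate another product into the sub-sum. */
    static void sqr_addac(digit_t x, digit_t y)
    {
        word_t t = (word_t)sub0 + (word_t)x * y;
        sub0 = (digit_t)t;
        t = (word_t)sub1 + (t >> 32);
        sub1 = (digit_t)t;
        sub2 += (digit_t)(t >> 32);
    }

    /* Models SQRADDDB: add the sub-sum into the column twice. */
    static void sqr_adddb(void)
    {
        int k;
        for (k = 0; k < 2; k++) {
            word_t t = (word_t)acc0 + sub0;
            acc0 = (digit_t)t;
            t = (word_t)acc1 + sub1 + (t >> 32);
            acc1 = (digit_t)t;
            acc2 += sub2 + (digit_t)(t >> 32);
        }
    }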

+ 0 - 137
lib/wolfssl/wolfcrypt/src/fp_sqr_comba_8.i

@@ -1,137 +0,0 @@
-/* fp_sqr_comba_8.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#ifdef TFM_SQR8
-int fp_sqr_comba8(fp_int *A, fp_int *B)
-{
-   fp_digit *a, c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0;
-#ifdef TFM_ISO
-   fp_word tt;
-#endif
-#ifndef WOLFSSL_SMALL_STACK
-   fp_digit b[16];
-#else
-   fp_digit *b;
-#endif
-
-#ifdef WOLFSSL_SMALL_STACK
-   b = (fp_digit*)XMALLOC(sizeof(fp_digit) * 16, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-   if (b == NULL)
-      return FP_MEM;
-#endif
-
-   a = A->dp;
-   COMBA_START; 
-
-   /* clear carries */
-   CLEAR_CARRY;
-
-   /* output 0 */
-   SQRADD(a[0],a[0]);
-   COMBA_STORE(b[0]);
-
-   /* output 1 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[1]); 
-   COMBA_STORE(b[1]);
-
-   /* output 2 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]); 
-   COMBA_STORE(b[2]);
-
-   /* output 3 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]); 
-   COMBA_STORE(b[3]);
-
-   /* output 4 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]); 
-   COMBA_STORE(b[4]);
-
-   /* output 5 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB; 
-   COMBA_STORE(b[5]);
-
-   /* output 6 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]); 
-   COMBA_STORE(b[6]);
-
-   /* output 7 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB; 
-   COMBA_STORE(b[7]);
-
-   /* output 8 */
-   CARRY_FORWARD;
-   SQRADDSC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]); 
-   COMBA_STORE(b[8]);
-
-   /* output 9 */
-   CARRY_FORWARD;
-   SQRADDSC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB; 
-   COMBA_STORE(b[9]);
-
-   /* output 10 */
-   CARRY_FORWARD;
-   SQRADD2(a[3], a[7]); SQRADD2(a[4], a[6]); SQRADD(a[5], a[5]); 
-   COMBA_STORE(b[10]);
-
-   /* output 11 */
-   CARRY_FORWARD;
-   SQRADD2(a[4], a[7]); SQRADD2(a[5], a[6]); 
-   COMBA_STORE(b[11]);
-
-   /* output 12 */
-   CARRY_FORWARD;
-   SQRADD2(a[5], a[7]); SQRADD(a[6], a[6]); 
-   COMBA_STORE(b[12]);
-
-   /* output 13 */
-   CARRY_FORWARD;
-   SQRADD2(a[6], a[7]); 
-   COMBA_STORE(b[13]);
-
-   /* output 14 */
-   CARRY_FORWARD;
-   SQRADD(a[7], a[7]); 
-   COMBA_STORE(b[14]);
-   COMBA_STORE2(b[15]);
-   COMBA_FINI;
-
-   B->used = 16;
-   B->sign = FP_ZPOS;
-   XMEMCPY(B->dp, b, 16 * sizeof(fp_digit));
-   fp_clamp(B);
-
-#ifdef WOLFSSL_SMALL_STACK
-   XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-#endif
-   return FP_OKAY;
-}
-#endif
-
-
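
Every removed fp_sqr_comba_N.i file is the same algorithm unrolled for one fixed operand size; with them gone, squaring presumably falls back to a generic loop, trading some speed for a much smaller binary. A minimal generic equivalent — illustrative only, not the actual wolfSSL fallback — assuming hypothetical 16-bit digits so the 64-bit column accumulator cannot overflow at realistic sizes:

    #include <stdint.h>

    /* Square a[0..used-1] into b[0..2*used-1], least significant
     * digit first: one pass per output column, as in the unrolled
     * files. */
    static void sqr_comba_generic(const uint16_t *a, int used,
                                  uint16_t *b)
    {
        uint64_t acc = 0;                     /* running column    */
        int k, i;
        for (k = 0; k < 2 * used - 1; k++) {
            int lo = (k < used) ? 0 : k - used + 1;
            for (i = lo; 2 * i <= k; i++) {
                int j = k - i;
                if (i == j)                   /* diagonal: once    */
                    acc += (uint64_t)a[i] * a[j];
                else                          /* cross: doubled    */
                    acc += 2ull * (uint64_t)a[i] * a[j];
            }
            b[k] = (uint16_t)acc;             /* COMBA_STORE       */
            acc >>= 16;                       /* CARRY_FORWARD     */
        }
        b[2 * used - 1] = (uint16_t)acc;      /* COMBA_STORE2      */
    }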

+ 0 - 147
lib/wolfssl/wolfcrypt/src/fp_sqr_comba_9.i

@@ -1,147 +0,0 @@
-/* fp_sqr_comba_9.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#ifdef TFM_SQR9
-int fp_sqr_comba9(fp_int *A, fp_int *B)
-{
-   fp_digit *a, c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0;
-#ifdef TFM_ISO
-   fp_word tt;
-#endif
-#ifndef WOLFSSL_SMALL_STACK
-   fp_digit b[18];
-#else
-   fp_digit *b;
-#endif
-
-#ifdef WOLFSSL_SMALL_STACK
-   b = (fp_digit*)XMALLOC(sizeof(fp_digit) * 18, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-   if (b == NULL)
-      return FP_MEM;
-#endif
-
-   a = A->dp;
-   COMBA_START; 
-
-   /* clear carries */
-   CLEAR_CARRY;
-
-   /* output 0 */
-   SQRADD(a[0],a[0]);
-   COMBA_STORE(b[0]);
-
-   /* output 1 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[1]); 
-   COMBA_STORE(b[1]);
-
-   /* output 2 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]); 
-   COMBA_STORE(b[2]);
-
-   /* output 3 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]); 
-   COMBA_STORE(b[3]);
-
-   /* output 4 */
-   CARRY_FORWARD;
-   SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]); 
-   COMBA_STORE(b[4]);
-
-   /* output 5 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB; 
-   COMBA_STORE(b[5]);
-
-   /* output 6 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]); 
-   COMBA_STORE(b[6]);
-
-   /* output 7 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB; 
-   COMBA_STORE(b[7]);
-
-   /* output 8 */
-   CARRY_FORWARD;
-   SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]); 
-   COMBA_STORE(b[8]);
-
-   /* output 9 */
-   CARRY_FORWARD;
-   SQRADDSC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB; 
-   COMBA_STORE(b[9]);
-
-   /* output 10 */
-   CARRY_FORWARD;
-   SQRADDSC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]); 
-   COMBA_STORE(b[10]);
-
-   /* output 11 */
-   CARRY_FORWARD;
-   SQRADDSC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB; 
-   COMBA_STORE(b[11]);
-
-   /* output 12 */
-   CARRY_FORWARD;
-   SQRADD2(a[4], a[8]); SQRADD2(a[5], a[7]); SQRADD(a[6], a[6]); 
-   COMBA_STORE(b[12]);
-
-   /* output 13 */
-   CARRY_FORWARD;
-   SQRADD2(a[5], a[8]); SQRADD2(a[6], a[7]); 
-   COMBA_STORE(b[13]);
-
-   /* output 14 */
-   CARRY_FORWARD;
-   SQRADD2(a[6], a[8]); SQRADD(a[7], a[7]); 
-   COMBA_STORE(b[14]);
-
-   /* output 15 */
-   CARRY_FORWARD;
-   SQRADD2(a[7], a[8]); 
-   COMBA_STORE(b[15]);
-
-   /* output 16 */
-   CARRY_FORWARD;
-   SQRADD(a[8], a[8]); 
-   COMBA_STORE(b[16]);
-   COMBA_STORE2(b[17]);
-   COMBA_FINI;
-
-   B->used = 18;
-   B->sign = FP_ZPOS;
-   XMEMCPY(B->dp, b, 18 * sizeof(fp_digit));
-   fp_clamp(B);
-
-#ifdef WOLFSSL_SMALL_STACK
-   XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-#endif
-   return FP_OKAY;
-}
-#endif
-
-
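
One convention repeats in every deleted file: the 2*N-digit scratch buffer b lives on the stack by default, but under WOLFSSL_SMALL_STACK it is heap-allocated with XMALLOC and released with XFREE, and an allocation failure returns FP_MEM before any work is done. A stripped-down sketch of that pattern using plain malloc/free in place of wolfSSL's wrappers (the body comment stands in for the unrolled comba code; the FP_MEM value here is illustrative only):

    #include <stdlib.h>

    #define FP_OKAY  0
    #define FP_MEM  (-2)               /* illustrative value only */

    int sqr_with_scratch(void)
    {
        enum { DIGITS = 18 };          /* e.g. fp_sqr_comba9: 2*9 */
    #ifndef WOLFSSL_SMALL_STACK
        unsigned int b[DIGITS];        /* scratch on the stack    */
    #else
        unsigned int *b = (unsigned int*)malloc(DIGITS * sizeof(unsigned int));
        if (b == NULL)
            return FP_MEM;             /* fail before touching B  */
    #endif

        /* ... comba body fills b[0..DIGITS-1], then the result   */
        /* is copied out and clamped, exactly as above ...        */
        (void)b;

    #ifdef WOLFSSL_SMALL_STACK
        free(b);
    #endif
        return FP_OKAY;
    }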

+ 0 - 1558
lib/wolfssl/wolfcrypt/src/fp_sqr_comba_small_set.i

@@ -1,1558 +0,0 @@
-/* fp_sqr_comba_small_set.i
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-
-
-#if defined(TFM_SMALL_SET)
-int fp_sqr_comba_small(fp_int *A, fp_int *B)
-{
-   fp_digit *a, c0, c1, c2, sc0 = 0, sc1 = 0, sc2 = 0;
-#ifdef TFM_ISO
-   fp_word tt;
-#endif
-#ifndef WOLFSSL_SMALL_STACK
-   fp_digit b[32];
-#else
-   fp_digit *b;
-#endif
-
-#ifdef WOLFSSL_SMALL_STACK
-   b = (fp_digit*)XMALLOC(sizeof(fp_digit) * 32, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-   if (b == NULL)
-      return FP_MEM;
-#endif
-
-   switch (A->used) { 
-   case 1:
-      a = A->dp;
-      COMBA_START; 
-
-      /* clear carries */
-      CLEAR_CARRY;
-
-      /* output 0 */
-      SQRADD(a[0],a[0]);
-      COMBA_STORE(b[0]);
-      COMBA_STORE2(b[1]);
-      COMBA_FINI;
-
-      B->used = 2;
-      B->sign = FP_ZPOS;
-      XMEMCPY(B->dp, b, 2 * sizeof(fp_digit));
-      fp_clamp(B);
-      break;
-
-   case 2:
-      a = A->dp;
-      COMBA_START; 
-
-      /* clear carries */
-      CLEAR_CARRY;
-
-      /* output 0 */
-      SQRADD(a[0],a[0]);
-      COMBA_STORE(b[0]);
-
-      /* output 1 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[1]); 
-      COMBA_STORE(b[1]);
-
-      /* output 2 */
-      CARRY_FORWARD;
-      SQRADD(a[1], a[1]); 
-      COMBA_STORE(b[2]);
-      COMBA_STORE2(b[3]);
-      COMBA_FINI;
-
-      B->used = 4;
-      B->sign = FP_ZPOS;
-      XMEMCPY(B->dp, b, 4 * sizeof(fp_digit));
-      fp_clamp(B);
-      break;
-
-   case 3:
-      a = A->dp;
-      COMBA_START; 
-
-      /* clear carries */
-      CLEAR_CARRY;
-
-      /* output 0 */
-      SQRADD(a[0],a[0]);
-      COMBA_STORE(b[0]);
-
-      /* output 1 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[1]); 
-      COMBA_STORE(b[1]);
-
-      /* output 2 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[2]);    SQRADD(a[1], a[1]); 
-      COMBA_STORE(b[2]);
-
-      /* output 3 */
-      CARRY_FORWARD;
-      SQRADD2(a[1], a[2]); 
-      COMBA_STORE(b[3]);
-
-      /* output 4 */
-      CARRY_FORWARD;
-      SQRADD(a[2], a[2]); 
-      COMBA_STORE(b[4]);
-      COMBA_STORE2(b[5]);
-      COMBA_FINI;
-
-      B->used = 6;
-      B->sign = FP_ZPOS;
-      XMEMCPY(B->dp, b, 6 * sizeof(fp_digit));
-      fp_clamp(B);
-      break;
-
-   case 4:
-      a = A->dp;
-      COMBA_START; 
-
-      /* clear carries */
-      CLEAR_CARRY;
-
-      /* output 0 */
-      SQRADD(a[0],a[0]);
-      COMBA_STORE(b[0]);
-
-      /* output 1 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[1]); 
-      COMBA_STORE(b[1]);
-
-      /* output 2 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[2]);    SQRADD(a[1], a[1]); 
-      COMBA_STORE(b[2]);
-
-      /* output 3 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[3]);    SQRADD2(a[1], a[2]); 
-      COMBA_STORE(b[3]);
-
-      /* output 4 */
-      CARRY_FORWARD;
-      SQRADD2(a[1], a[3]);    SQRADD(a[2], a[2]); 
-      COMBA_STORE(b[4]);
-
-      /* output 5 */
-      CARRY_FORWARD;
-      SQRADD2(a[2], a[3]); 
-      COMBA_STORE(b[5]);
-
-      /* output 6 */
-      CARRY_FORWARD;
-      SQRADD(a[3], a[3]); 
-      COMBA_STORE(b[6]);
-      COMBA_STORE2(b[7]);
-      COMBA_FINI;
-
-      B->used = 8;
-      B->sign = FP_ZPOS;
-      XMEMCPY(B->dp, b, 8 * sizeof(fp_digit));
-      fp_clamp(B);
-      break;
-
-   case 5:
-      a = A->dp;
-      COMBA_START; 
-
-      /* clear carries */
-      CLEAR_CARRY;
-
-      /* output 0 */
-      SQRADD(a[0],a[0]);
-      COMBA_STORE(b[0]);
-
-      /* output 1 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[1]); 
-      COMBA_STORE(b[1]);
-
-      /* output 2 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[2]);    SQRADD(a[1], a[1]); 
-      COMBA_STORE(b[2]);
-
-      /* output 3 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[3]);    SQRADD2(a[1], a[2]); 
-      COMBA_STORE(b[3]);
-
-      /* output 4 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[4]);    SQRADD2(a[1], a[3]);    SQRADD(a[2], a[2]); 
-      COMBA_STORE(b[4]);
-
-      /* output 5 */
-      CARRY_FORWARD;
-      SQRADD2(a[1], a[4]);    SQRADD2(a[2], a[3]); 
-      COMBA_STORE(b[5]);
-
-      /* output 6 */
-      CARRY_FORWARD;
-      SQRADD2(a[2], a[4]);    SQRADD(a[3], a[3]); 
-      COMBA_STORE(b[6]);
-
-      /* output 7 */
-      CARRY_FORWARD;
-      SQRADD2(a[3], a[4]); 
-      COMBA_STORE(b[7]);
-
-      /* output 8 */
-      CARRY_FORWARD;
-      SQRADD(a[4], a[4]); 
-      COMBA_STORE(b[8]);
-      COMBA_STORE2(b[9]);
-      COMBA_FINI;
-
-      B->used = 10;
-      B->sign = FP_ZPOS;
-      XMEMCPY(B->dp, b, 10 * sizeof(fp_digit));
-      fp_clamp(B);
-      break;
-
-   case 6:
-      a = A->dp;
-      COMBA_START; 
-
-      /* clear carries */
-      CLEAR_CARRY;
-
-      /* output 0 */
-      SQRADD(a[0],a[0]);
-      COMBA_STORE(b[0]);
-
-      /* output 1 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[1]); 
-      COMBA_STORE(b[1]);
-
-      /* output 2 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[2]);    SQRADD(a[1], a[1]); 
-      COMBA_STORE(b[2]);
-
-      /* output 3 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[3]);    SQRADD2(a[1], a[2]); 
-      COMBA_STORE(b[3]);
-
-      /* output 4 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[4]);    SQRADD2(a[1], a[3]);    SQRADD(a[2], a[2]); 
-      COMBA_STORE(b[4]);
-
-      /* output 5 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB; 
-      COMBA_STORE(b[5]);
-
-      /* output 6 */
-      CARRY_FORWARD;
-      SQRADD2(a[1], a[5]);    SQRADD2(a[2], a[4]);    SQRADD(a[3], a[3]); 
-      COMBA_STORE(b[6]);
-
-      /* output 7 */
-      CARRY_FORWARD;
-      SQRADD2(a[2], a[5]);    SQRADD2(a[3], a[4]); 
-      COMBA_STORE(b[7]);
-
-      /* output 8 */
-      CARRY_FORWARD;
-      SQRADD2(a[3], a[5]);    SQRADD(a[4], a[4]); 
-      COMBA_STORE(b[8]);
-
-      /* output 9 */
-      CARRY_FORWARD;
-      SQRADD2(a[4], a[5]); 
-      COMBA_STORE(b[9]);
-
-      /* output 10 */
-      CARRY_FORWARD;
-      SQRADD(a[5], a[5]); 
-      COMBA_STORE(b[10]);
-      COMBA_STORE2(b[11]);
-      COMBA_FINI;
-
-      B->used = 12;
-      B->sign = FP_ZPOS;
-      XMEMCPY(B->dp, b, 12 * sizeof(fp_digit));
-      fp_clamp(B);
-      break;
-
-   case 7:
-      a = A->dp;
-      COMBA_START; 
-
-      /* clear carries */
-      CLEAR_CARRY;
-
-      /* output 0 */
-      SQRADD(a[0],a[0]);
-      COMBA_STORE(b[0]);
-
-      /* output 1 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[1]); 
-      COMBA_STORE(b[1]);
-
-      /* output 2 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[2]);    SQRADD(a[1], a[1]); 
-      COMBA_STORE(b[2]);
-
-      /* output 3 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[3]);    SQRADD2(a[1], a[2]); 
-      COMBA_STORE(b[3]);
-
-      /* output 4 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[4]);    SQRADD2(a[1], a[3]);    SQRADD(a[2], a[2]); 
-      COMBA_STORE(b[4]);
-
-      /* output 5 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB; 
-      COMBA_STORE(b[5]);
-
-      /* output 6 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]); 
-      COMBA_STORE(b[6]);
-
-      /* output 7 */
-      CARRY_FORWARD;
-   SQRADDSC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB; 
-      COMBA_STORE(b[7]);
-
-      /* output 8 */
-      CARRY_FORWARD;
-      SQRADD2(a[2], a[6]);    SQRADD2(a[3], a[5]);    SQRADD(a[4], a[4]); 
-      COMBA_STORE(b[8]);
-
-      /* output 9 */
-      CARRY_FORWARD;
-      SQRADD2(a[3], a[6]);    SQRADD2(a[4], a[5]); 
-      COMBA_STORE(b[9]);
-
-      /* output 10 */
-      CARRY_FORWARD;
-      SQRADD2(a[4], a[6]);    SQRADD(a[5], a[5]); 
-      COMBA_STORE(b[10]);
-
-      /* output 11 */
-      CARRY_FORWARD;
-      SQRADD2(a[5], a[6]); 
-      COMBA_STORE(b[11]);
-
-      /* output 12 */
-      CARRY_FORWARD;
-      SQRADD(a[6], a[6]); 
-      COMBA_STORE(b[12]);
-      COMBA_STORE2(b[13]);
-      COMBA_FINI;
-
-      B->used = 14;
-      B->sign = FP_ZPOS;
-      XMEMCPY(B->dp, b, 14 * sizeof(fp_digit));
-      fp_clamp(B);
-      break;
-
-   case 8:
-      a = A->dp;
-      COMBA_START; 
-
-      /* clear carries */
-      CLEAR_CARRY;
-
-      /* output 0 */
-      SQRADD(a[0],a[0]);
-      COMBA_STORE(b[0]);
-
-      /* output 1 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[1]); 
-      COMBA_STORE(b[1]);
-
-      /* output 2 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[2]);    SQRADD(a[1], a[1]); 
-      COMBA_STORE(b[2]);
-
-      /* output 3 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[3]);    SQRADD2(a[1], a[2]); 
-      COMBA_STORE(b[3]);
-
-      /* output 4 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[4]);    SQRADD2(a[1], a[3]);    SQRADD(a[2], a[2]); 
-      COMBA_STORE(b[4]);
-
-      /* output 5 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB; 
-      COMBA_STORE(b[5]);
-
-      /* output 6 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]); 
-      COMBA_STORE(b[6]);
-
-      /* output 7 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB; 
-      COMBA_STORE(b[7]);
-
-      /* output 8 */
-      CARRY_FORWARD;
-   SQRADDSC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]); 
-      COMBA_STORE(b[8]);
-
-      /* output 9 */
-      CARRY_FORWARD;
-   SQRADDSC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB; 
-      COMBA_STORE(b[9]);
-
-      /* output 10 */
-      CARRY_FORWARD;
-      SQRADD2(a[3], a[7]);    SQRADD2(a[4], a[6]);    SQRADD(a[5], a[5]); 
-      COMBA_STORE(b[10]);
-
-      /* output 11 */
-      CARRY_FORWARD;
-      SQRADD2(a[4], a[7]);    SQRADD2(a[5], a[6]); 
-      COMBA_STORE(b[11]);
-
-      /* output 12 */
-      CARRY_FORWARD;
-      SQRADD2(a[5], a[7]);    SQRADD(a[6], a[6]); 
-      COMBA_STORE(b[12]);
-
-      /* output 13 */
-      CARRY_FORWARD;
-      SQRADD2(a[6], a[7]); 
-      COMBA_STORE(b[13]);
-
-      /* output 14 */
-      CARRY_FORWARD;
-      SQRADD(a[7], a[7]); 
-      COMBA_STORE(b[14]);
-      COMBA_STORE2(b[15]);
-      COMBA_FINI;
-
-      B->used = 16;
-      B->sign = FP_ZPOS;
-      XMEMCPY(B->dp, b, 16 * sizeof(fp_digit));
-      fp_clamp(B);
-      break;
-
-   case 9:
-      a = A->dp;
-      COMBA_START; 
-
-      /* clear carries */
-      CLEAR_CARRY;
-
-      /* output 0 */
-      SQRADD(a[0],a[0]);
-      COMBA_STORE(b[0]);
-
-      /* output 1 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[1]); 
-      COMBA_STORE(b[1]);
-
-      /* output 2 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[2]);    SQRADD(a[1], a[1]); 
-      COMBA_STORE(b[2]);
-
-      /* output 3 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[3]);    SQRADD2(a[1], a[2]); 
-      COMBA_STORE(b[3]);
-
-      /* output 4 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[4]);    SQRADD2(a[1], a[3]);    SQRADD(a[2], a[2]); 
-      COMBA_STORE(b[4]);
-
-      /* output 5 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB; 
-      COMBA_STORE(b[5]);
-
-      /* output 6 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]); 
-      COMBA_STORE(b[6]);
-
-      /* output 7 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB; 
-      COMBA_STORE(b[7]);
-
-      /* output 8 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]); 
-      COMBA_STORE(b[8]);
-
-      /* output 9 */
-      CARRY_FORWARD;
-   SQRADDSC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB; 
-      COMBA_STORE(b[9]);
-
-      /* output 10 */
-      CARRY_FORWARD;
-   SQRADDSC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]); 
-      COMBA_STORE(b[10]);
-
-      /* output 11 */
-      CARRY_FORWARD;
-   SQRADDSC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB; 
-      COMBA_STORE(b[11]);
-
-      /* output 12 */
-      CARRY_FORWARD;
-      SQRADD2(a[4], a[8]);    SQRADD2(a[5], a[7]);    SQRADD(a[6], a[6]); 
-      COMBA_STORE(b[12]);
-
-      /* output 13 */
-      CARRY_FORWARD;
-      SQRADD2(a[5], a[8]);    SQRADD2(a[6], a[7]); 
-      COMBA_STORE(b[13]);
-
-      /* output 14 */
-      CARRY_FORWARD;
-      SQRADD2(a[6], a[8]);    SQRADD(a[7], a[7]); 
-      COMBA_STORE(b[14]);
-
-      /* output 15 */
-      CARRY_FORWARD;
-      SQRADD2(a[7], a[8]); 
-      COMBA_STORE(b[15]);
-
-      /* output 16 */
-      CARRY_FORWARD;
-      SQRADD(a[8], a[8]); 
-      COMBA_STORE(b[16]);
-      COMBA_STORE2(b[17]);
-      COMBA_FINI;
-
-      B->used = 18;
-      B->sign = FP_ZPOS;
-      XMEMCPY(B->dp, b, 18 * sizeof(fp_digit));
-      fp_clamp(B);
-      break;
-
-   case 10:
-      a = A->dp;
-      COMBA_START; 
-
-      /* clear carries */
-      CLEAR_CARRY;
-
-      /* output 0 */
-      SQRADD(a[0],a[0]);
-      COMBA_STORE(b[0]);
-
-      /* output 1 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[1]); 
-      COMBA_STORE(b[1]);
-
-      /* output 2 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[2]);    SQRADD(a[1], a[1]); 
-      COMBA_STORE(b[2]);
-
-      /* output 3 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[3]);    SQRADD2(a[1], a[2]); 
-      COMBA_STORE(b[3]);
-
-      /* output 4 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[4]);    SQRADD2(a[1], a[3]);    SQRADD(a[2], a[2]); 
-      COMBA_STORE(b[4]);
-
-      /* output 5 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB; 
-      COMBA_STORE(b[5]);
-
-      /* output 6 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]); 
-      COMBA_STORE(b[6]);
-
-      /* output 7 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB; 
-      COMBA_STORE(b[7]);
-
-      /* output 8 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]); 
-      COMBA_STORE(b[8]);
-
-      /* output 9 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB; 
-      COMBA_STORE(b[9]);
-
-      /* output 10 */
-      CARRY_FORWARD;
-   SQRADDSC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]); 
-      COMBA_STORE(b[10]);
-
-      /* output 11 */
-      CARRY_FORWARD;
-   SQRADDSC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB; 
-      COMBA_STORE(b[11]);
-
-      /* output 12 */
-      CARRY_FORWARD;
-   SQRADDSC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]); 
-      COMBA_STORE(b[12]);
-
-      /* output 13 */
-      CARRY_FORWARD;
-   SQRADDSC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB; 
-      COMBA_STORE(b[13]);
-
-      /* output 14 */
-      CARRY_FORWARD;
-      SQRADD2(a[5], a[9]);    SQRADD2(a[6], a[8]);    SQRADD(a[7], a[7]); 
-      COMBA_STORE(b[14]);
-
-      /* output 15 */
-      CARRY_FORWARD;
-      SQRADD2(a[6], a[9]);    SQRADD2(a[7], a[8]); 
-      COMBA_STORE(b[15]);
-
-      /* output 16 */
-      CARRY_FORWARD;
-      SQRADD2(a[7], a[9]);    SQRADD(a[8], a[8]); 
-      COMBA_STORE(b[16]);
-
-      /* output 17 */
-      CARRY_FORWARD;
-      SQRADD2(a[8], a[9]); 
-      COMBA_STORE(b[17]);
-
-      /* output 18 */
-      CARRY_FORWARD;
-      SQRADD(a[9], a[9]); 
-      COMBA_STORE(b[18]);
-      COMBA_STORE2(b[19]);
-      COMBA_FINI;
-
-      B->used = 20;
-      B->sign = FP_ZPOS;
-      XMEMCPY(B->dp, b, 20 * sizeof(fp_digit));
-      fp_clamp(B);
-      break;
-
-   case 11:
-      a = A->dp;
-      COMBA_START; 
-
-      /* clear carries */
-      CLEAR_CARRY;
-
-      /* output 0 */
-      SQRADD(a[0],a[0]);
-      COMBA_STORE(b[0]);
-
-      /* output 1 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[1]); 
-      COMBA_STORE(b[1]);
-
-      /* output 2 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[2]);    SQRADD(a[1], a[1]); 
-      COMBA_STORE(b[2]);
-
-      /* output 3 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[3]);    SQRADD2(a[1], a[2]); 
-      COMBA_STORE(b[3]);
-
-      /* output 4 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[4]);    SQRADD2(a[1], a[3]);    SQRADD(a[2], a[2]); 
-      COMBA_STORE(b[4]);
-
-      /* output 5 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB; 
-      COMBA_STORE(b[5]);
-
-      /* output 6 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]); 
-      COMBA_STORE(b[6]);
-
-      /* output 7 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB; 
-      COMBA_STORE(b[7]);
-
-      /* output 8 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]); 
-      COMBA_STORE(b[8]);
-
-      /* output 9 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB; 
-      COMBA_STORE(b[9]);
-
-      /* output 10 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]); 
-      COMBA_STORE(b[10]);
-
-      /* output 11 */
-      CARRY_FORWARD;
-   SQRADDSC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB; 
-      COMBA_STORE(b[11]);
-
-      /* output 12 */
-      CARRY_FORWARD;
-   SQRADDSC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]); 
-      COMBA_STORE(b[12]);
-
-      /* output 13 */
-      CARRY_FORWARD;
-   SQRADDSC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB; 
-      COMBA_STORE(b[13]);
-
-      /* output 14 */
-      CARRY_FORWARD;
-   SQRADDSC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]); 
-      COMBA_STORE(b[14]);
-
-      /* output 15 */
-      CARRY_FORWARD;
-   SQRADDSC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB; 
-      COMBA_STORE(b[15]);
-
-      /* output 16 */
-      CARRY_FORWARD;
-      SQRADD2(a[6], a[10]);    SQRADD2(a[7], a[9]);    SQRADD(a[8], a[8]); 
-      COMBA_STORE(b[16]);
-
-      /* output 17 */
-      CARRY_FORWARD;
-      SQRADD2(a[7], a[10]);    SQRADD2(a[8], a[9]); 
-      COMBA_STORE(b[17]);
-
-      /* output 18 */
-      CARRY_FORWARD;
-      SQRADD2(a[8], a[10]);    SQRADD(a[9], a[9]); 
-      COMBA_STORE(b[18]);
-
-      /* output 19 */
-      CARRY_FORWARD;
-      SQRADD2(a[9], a[10]); 
-      COMBA_STORE(b[19]);
-
-      /* output 20 */
-      CARRY_FORWARD;
-      SQRADD(a[10], a[10]); 
-      COMBA_STORE(b[20]);
-      COMBA_STORE2(b[21]);
-      COMBA_FINI;
-
-      B->used = 22;
-      B->sign = FP_ZPOS;
-      XMEMCPY(B->dp, b, 22 * sizeof(fp_digit));
-      fp_clamp(B);
-      break;
-
-   case 12:
-      a = A->dp;
-      COMBA_START; 
-
-      /* clear carries */
-      CLEAR_CARRY;
-
-      /* output 0 */
-      SQRADD(a[0],a[0]);
-      COMBA_STORE(b[0]);
-
-      /* output 1 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[1]); 
-      COMBA_STORE(b[1]);
-
-      /* output 2 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[2]);    SQRADD(a[1], a[1]); 
-      COMBA_STORE(b[2]);
-
-      /* output 3 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[3]);    SQRADD2(a[1], a[2]); 
-      COMBA_STORE(b[3]);
-
-      /* output 4 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[4]);    SQRADD2(a[1], a[3]);    SQRADD(a[2], a[2]); 
-      COMBA_STORE(b[4]);
-
-      /* output 5 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB; 
-      COMBA_STORE(b[5]);
-
-      /* output 6 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]); 
-      COMBA_STORE(b[6]);
-
-      /* output 7 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB; 
-      COMBA_STORE(b[7]);
-
-      /* output 8 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]); 
-      COMBA_STORE(b[8]);
-
-      /* output 9 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB; 
-      COMBA_STORE(b[9]);
-
-      /* output 10 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]); 
-      COMBA_STORE(b[10]);
-
-      /* output 11 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB; 
-      COMBA_STORE(b[11]);
-
-      /* output 12 */
-      CARRY_FORWARD;
-   SQRADDSC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]); 
-      COMBA_STORE(b[12]);
-
-      /* output 13 */
-      CARRY_FORWARD;
-   SQRADDSC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB; 
-      COMBA_STORE(b[13]);
-
-      /* output 14 */
-      CARRY_FORWARD;
-   SQRADDSC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]); 
-      COMBA_STORE(b[14]);
-
-      /* output 15 */
-      CARRY_FORWARD;
-   SQRADDSC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB; 
-      COMBA_STORE(b[15]);
-
-      /* output 16 */
-      CARRY_FORWARD;
-   SQRADDSC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]); 
-      COMBA_STORE(b[16]);
-
-      /* output 17 */
-      CARRY_FORWARD;
-   SQRADDSC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB; 
-      COMBA_STORE(b[17]);
-
-      /* output 18 */
-      CARRY_FORWARD;
-      SQRADD2(a[7], a[11]);    SQRADD2(a[8], a[10]);    SQRADD(a[9], a[9]); 
-      COMBA_STORE(b[18]);
-
-      /* output 19 */
-      CARRY_FORWARD;
-      SQRADD2(a[8], a[11]);    SQRADD2(a[9], a[10]); 
-      COMBA_STORE(b[19]);
-
-      /* output 20 */
-      CARRY_FORWARD;
-      SQRADD2(a[9], a[11]);    SQRADD(a[10], a[10]); 
-      COMBA_STORE(b[20]);
-
-      /* output 21 */
-      CARRY_FORWARD;
-      SQRADD2(a[10], a[11]); 
-      COMBA_STORE(b[21]);
-
-      /* output 22 */
-      CARRY_FORWARD;
-      SQRADD(a[11], a[11]); 
-      COMBA_STORE(b[22]);
-      COMBA_STORE2(b[23]);
-      COMBA_FINI;
-
-      B->used = 24;
-      B->sign = FP_ZPOS;
-      XMEMCPY(B->dp, b, 24 * sizeof(fp_digit));
-      fp_clamp(B);
-      break;
-
-   case 13:
-      a = A->dp;
-      COMBA_START; 
-
-      /* clear carries */
-      CLEAR_CARRY;
-
-      /* output 0 */
-      SQRADD(a[0],a[0]);
-      COMBA_STORE(b[0]);
-
-      /* output 1 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[1]); 
-      COMBA_STORE(b[1]);
-
-      /* output 2 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[2]);    SQRADD(a[1], a[1]); 
-      COMBA_STORE(b[2]);
-
-      /* output 3 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[3]);    SQRADD2(a[1], a[2]); 
-      COMBA_STORE(b[3]);
-
-      /* output 4 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[4]);    SQRADD2(a[1], a[3]);    SQRADD(a[2], a[2]); 
-      COMBA_STORE(b[4]);
-
-      /* output 5 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB; 
-      COMBA_STORE(b[5]);
-
-      /* output 6 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]); 
-      COMBA_STORE(b[6]);
-
-      /* output 7 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB; 
-      COMBA_STORE(b[7]);
-
-      /* output 8 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]); 
-      COMBA_STORE(b[8]);
-
-      /* output 9 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB; 
-      COMBA_STORE(b[9]);
-
-      /* output 10 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]); 
-      COMBA_STORE(b[10]);
-
-      /* output 11 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB; 
-      COMBA_STORE(b[11]);
-
-      /* output 12 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]); 
-      COMBA_STORE(b[12]);
-
-      /* output 13 */
-      CARRY_FORWARD;
-   SQRADDSC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB; 
-      COMBA_STORE(b[13]);
-
-      /* output 14 */
-      CARRY_FORWARD;
-   SQRADDSC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]); 
-      COMBA_STORE(b[14]);
-
-      /* output 15 */
-      CARRY_FORWARD;
-   SQRADDSC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB; 
-      COMBA_STORE(b[15]);
-
-      /* output 16 */
-      CARRY_FORWARD;
-   SQRADDSC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]); 
-      COMBA_STORE(b[16]);
-
-      /* output 17 */
-      CARRY_FORWARD;
-   SQRADDSC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB; 
-      COMBA_STORE(b[17]);
-
-      /* output 18 */
-      CARRY_FORWARD;
-   SQRADDSC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]); 
-      COMBA_STORE(b[18]);
-
-      /* output 19 */
-      CARRY_FORWARD;
-   SQRADDSC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB; 
-      COMBA_STORE(b[19]);
-
-      /* output 20 */
-      CARRY_FORWARD;
-      SQRADD2(a[8], a[12]);    SQRADD2(a[9], a[11]);    SQRADD(a[10], a[10]); 
-      COMBA_STORE(b[20]);
-
-      /* output 21 */
-      CARRY_FORWARD;
-      SQRADD2(a[9], a[12]);    SQRADD2(a[10], a[11]); 
-      COMBA_STORE(b[21]);
-
-      /* output 22 */
-      CARRY_FORWARD;
-      SQRADD2(a[10], a[12]);    SQRADD(a[11], a[11]); 
-      COMBA_STORE(b[22]);
-
-      /* output 23 */
-      CARRY_FORWARD;
-      SQRADD2(a[11], a[12]); 
-      COMBA_STORE(b[23]);
-
-      /* output 24 */
-      CARRY_FORWARD;
-      SQRADD(a[12], a[12]); 
-      COMBA_STORE(b[24]);
-      COMBA_STORE2(b[25]);
-      COMBA_FINI;
-
-      B->used = 26;
-      B->sign = FP_ZPOS;
-      XMEMCPY(B->dp, b, 26 * sizeof(fp_digit));
-      fp_clamp(B);
-      break;
-
-   case 14:
-      a = A->dp;
-      COMBA_START; 
-
-      /* clear carries */
-      CLEAR_CARRY;
-
-      /* output 0 */
-      SQRADD(a[0],a[0]);
-      COMBA_STORE(b[0]);
-
-      /* output 1 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[1]); 
-      COMBA_STORE(b[1]);
-
-      /* output 2 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[2]);    SQRADD(a[1], a[1]); 
-      COMBA_STORE(b[2]);
-
-      /* output 3 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[3]);    SQRADD2(a[1], a[2]); 
-      COMBA_STORE(b[3]);
-
-      /* output 4 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[4]);    SQRADD2(a[1], a[3]);    SQRADD(a[2], a[2]); 
-      COMBA_STORE(b[4]);
-
-      /* output 5 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB; 
-      COMBA_STORE(b[5]);
-
-      /* output 6 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]); 
-      COMBA_STORE(b[6]);
-
-      /* output 7 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB; 
-      COMBA_STORE(b[7]);
-
-      /* output 8 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]); 
-      COMBA_STORE(b[8]);
-
-      /* output 9 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB; 
-      COMBA_STORE(b[9]);
-
-      /* output 10 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]); 
-      COMBA_STORE(b[10]);
-
-      /* output 11 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB; 
-      COMBA_STORE(b[11]);
-
-      /* output 12 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]); 
-      COMBA_STORE(b[12]);
-
-      /* output 13 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB; 
-      COMBA_STORE(b[13]);
-
-      /* output 14 */
-      CARRY_FORWARD;
-   SQRADDSC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]); 
-      COMBA_STORE(b[14]);
-
-      /* output 15 */
-      CARRY_FORWARD;
-   SQRADDSC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB; 
-      COMBA_STORE(b[15]);
-
-      /* output 16 */
-      CARRY_FORWARD;
-   SQRADDSC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]); 
-      COMBA_STORE(b[16]);
-
-      /* output 17 */
-      CARRY_FORWARD;
-   SQRADDSC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB; 
-      COMBA_STORE(b[17]);
-
-      /* output 18 */
-      CARRY_FORWARD;
-   SQRADDSC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]); 
-      COMBA_STORE(b[18]);
-
-      /* output 19 */
-      CARRY_FORWARD;
-   SQRADDSC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB; 
-      COMBA_STORE(b[19]);
-
-      /* output 20 */
-      CARRY_FORWARD;
-   SQRADDSC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]); 
-      COMBA_STORE(b[20]);
-
-      /* output 21 */
-      CARRY_FORWARD;
-   SQRADDSC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB; 
-      COMBA_STORE(b[21]);
-
-      /* output 22 */
-      CARRY_FORWARD;
-      SQRADD2(a[9], a[13]);    SQRADD2(a[10], a[12]);    SQRADD(a[11], a[11]); 
-      COMBA_STORE(b[22]);
-
-      /* output 23 */
-      CARRY_FORWARD;
-      SQRADD2(a[10], a[13]);    SQRADD2(a[11], a[12]); 
-      COMBA_STORE(b[23]);
-
-      /* output 24 */
-      CARRY_FORWARD;
-      SQRADD2(a[11], a[13]);    SQRADD(a[12], a[12]); 
-      COMBA_STORE(b[24]);
-
-      /* output 25 */
-      CARRY_FORWARD;
-      SQRADD2(a[12], a[13]); 
-      COMBA_STORE(b[25]);
-
-      /* output 26 */
-      CARRY_FORWARD;
-      SQRADD(a[13], a[13]); 
-      COMBA_STORE(b[26]);
-      COMBA_STORE2(b[27]);
-      COMBA_FINI;
-
-      B->used = 28;
-      B->sign = FP_ZPOS;
-      XMEMCPY(B->dp, b, 28 * sizeof(fp_digit));
-      fp_clamp(B);
-      break;
-
-   case 15:
-      a = A->dp;
-      COMBA_START; 
-
-      /* clear carries */
-      CLEAR_CARRY;
-
-      /* output 0 */
-      SQRADD(a[0],a[0]);
-      COMBA_STORE(b[0]);
-
-      /* output 1 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[1]); 
-      COMBA_STORE(b[1]);
-
-      /* output 2 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[2]);    SQRADD(a[1], a[1]); 
-      COMBA_STORE(b[2]);
-
-      /* output 3 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[3]);    SQRADD2(a[1], a[2]); 
-      COMBA_STORE(b[3]);
-
-      /* output 4 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[4]);    SQRADD2(a[1], a[3]);    SQRADD(a[2], a[2]); 
-      COMBA_STORE(b[4]);
-
-      /* output 5 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB; 
-      COMBA_STORE(b[5]);
-
-      /* output 6 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]); 
-      COMBA_STORE(b[6]);
-
-      /* output 7 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB; 
-      COMBA_STORE(b[7]);
-
-      /* output 8 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]); 
-      COMBA_STORE(b[8]);
-
-      /* output 9 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB; 
-      COMBA_STORE(b[9]);
-
-      /* output 10 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]); 
-      COMBA_STORE(b[10]);
-
-      /* output 11 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB; 
-      COMBA_STORE(b[11]);
-
-      /* output 12 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]); 
-      COMBA_STORE(b[12]);
-
-      /* output 13 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB; 
-      COMBA_STORE(b[13]);
-
-      /* output 14 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[14]); SQRADDAC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]); 
-      COMBA_STORE(b[14]);
-
-      /* output 15 */
-      CARRY_FORWARD;
-   SQRADDSC(a[1], a[14]); SQRADDAC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB; 
-      COMBA_STORE(b[15]);
-
-      /* output 16 */
-      CARRY_FORWARD;
-   SQRADDSC(a[2], a[14]); SQRADDAC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]); 
-      COMBA_STORE(b[16]);
-
-      /* output 17 */
-      CARRY_FORWARD;
-   SQRADDSC(a[3], a[14]); SQRADDAC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB; 
-      COMBA_STORE(b[17]);
-
-      /* output 18 */
-      CARRY_FORWARD;
-   SQRADDSC(a[4], a[14]); SQRADDAC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]); 
-      COMBA_STORE(b[18]);
-
-      /* output 19 */
-      CARRY_FORWARD;
-   SQRADDSC(a[5], a[14]); SQRADDAC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB; 
-      COMBA_STORE(b[19]);
-
-      /* output 20 */
-      CARRY_FORWARD;
-   SQRADDSC(a[6], a[14]); SQRADDAC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]); 
-      COMBA_STORE(b[20]);
-
-      /* output 21 */
-      CARRY_FORWARD;
-   SQRADDSC(a[7], a[14]); SQRADDAC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB; 
-      COMBA_STORE(b[21]);
-
-      /* output 22 */
-      CARRY_FORWARD;
-   SQRADDSC(a[8], a[14]); SQRADDAC(a[9], a[13]); SQRADDAC(a[10], a[12]); SQRADDDB; SQRADD(a[11], a[11]); 
-      COMBA_STORE(b[22]);
-
-      /* output 23 */
-      CARRY_FORWARD;
-   SQRADDSC(a[9], a[14]); SQRADDAC(a[10], a[13]); SQRADDAC(a[11], a[12]); SQRADDDB; 
-      COMBA_STORE(b[23]);
-
-      /* output 24 */
-      CARRY_FORWARD;
-      SQRADD2(a[10], a[14]);    SQRADD2(a[11], a[13]);    SQRADD(a[12], a[12]); 
-      COMBA_STORE(b[24]);
-
-      /* output 25 */
-      CARRY_FORWARD;
-      SQRADD2(a[11], a[14]);    SQRADD2(a[12], a[13]); 
-      COMBA_STORE(b[25]);
-
-      /* output 26 */
-      CARRY_FORWARD;
-      SQRADD2(a[12], a[14]);    SQRADD(a[13], a[13]); 
-      COMBA_STORE(b[26]);
-
-      /* output 27 */
-      CARRY_FORWARD;
-      SQRADD2(a[13], a[14]); 
-      COMBA_STORE(b[27]);
-
-      /* output 28 */
-      CARRY_FORWARD;
-      SQRADD(a[14], a[14]); 
-      COMBA_STORE(b[28]);
-      COMBA_STORE2(b[29]);
-      COMBA_FINI;
-
-      B->used = 30;
-      B->sign = FP_ZPOS;
-      XMEMCPY(B->dp, b, 30 * sizeof(fp_digit));
-      fp_clamp(B);
-      break;
-
-   case 16:
-      a = A->dp;
-      COMBA_START; 
-
-      /* clear carries */
-      CLEAR_CARRY;
-
-      /* output 0 */
-      SQRADD(a[0],a[0]);
-      COMBA_STORE(b[0]);
-
-      /* output 1 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[1]); 
-      COMBA_STORE(b[1]);
-
-      /* output 2 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[2]);    SQRADD(a[1], a[1]); 
-      COMBA_STORE(b[2]);
-
-      /* output 3 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[3]);    SQRADD2(a[1], a[2]); 
-      COMBA_STORE(b[3]);
-
-      /* output 4 */
-      CARRY_FORWARD;
-      SQRADD2(a[0], a[4]);    SQRADD2(a[1], a[3]);    SQRADD(a[2], a[2]); 
-      COMBA_STORE(b[4]);
-
-      /* output 5 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB; 
-      COMBA_STORE(b[5]);
-
-      /* output 6 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]); 
-      COMBA_STORE(b[6]);
-
-      /* output 7 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB; 
-      COMBA_STORE(b[7]);
-
-      /* output 8 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]); 
-      COMBA_STORE(b[8]);
-
-      /* output 9 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB; 
-      COMBA_STORE(b[9]);
-
-      /* output 10 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]); 
-      COMBA_STORE(b[10]);
-
-      /* output 11 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB; 
-      COMBA_STORE(b[11]);
-
-      /* output 12 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]); 
-      COMBA_STORE(b[12]);
-
-      /* output 13 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB; 
-      COMBA_STORE(b[13]);
-
-      /* output 14 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[14]); SQRADDAC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]); 
-      COMBA_STORE(b[14]);
-
-      /* output 15 */
-      CARRY_FORWARD;
-   SQRADDSC(a[0], a[15]); SQRADDAC(a[1], a[14]); SQRADDAC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB; 
-      COMBA_STORE(b[15]);
-
-      /* output 16 */
-      CARRY_FORWARD;
-   SQRADDSC(a[1], a[15]); SQRADDAC(a[2], a[14]); SQRADDAC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]); 
-      COMBA_STORE(b[16]);
-
-      /* output 17 */
-      CARRY_FORWARD;
-   SQRADDSC(a[2], a[15]); SQRADDAC(a[3], a[14]); SQRADDAC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB; 
-      COMBA_STORE(b[17]);
-
-      /* output 18 */
-      CARRY_FORWARD;
-   SQRADDSC(a[3], a[15]); SQRADDAC(a[4], a[14]); SQRADDAC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]); 
-      COMBA_STORE(b[18]);
-
-      /* output 19 */
-      CARRY_FORWARD;
-   SQRADDSC(a[4], a[15]); SQRADDAC(a[5], a[14]); SQRADDAC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB; 
-      COMBA_STORE(b[19]);
-
-      /* output 20 */
-      CARRY_FORWARD;
-   SQRADDSC(a[5], a[15]); SQRADDAC(a[6], a[14]); SQRADDAC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]); 
-      COMBA_STORE(b[20]);
-
-      /* output 21 */
-      CARRY_FORWARD;
-   SQRADDSC(a[6], a[15]); SQRADDAC(a[7], a[14]); SQRADDAC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB; 
-      COMBA_STORE(b[21]);
-
-      /* output 22 */
-      CARRY_FORWARD;
-   SQRADDSC(a[7], a[15]); SQRADDAC(a[8], a[14]); SQRADDAC(a[9], a[13]); SQRADDAC(a[10], a[12]); SQRADDDB; SQRADD(a[11], a[11]); 
-      COMBA_STORE(b[22]);
-
-      /* output 23 */
-      CARRY_FORWARD;
-   SQRADDSC(a[8], a[15]); SQRADDAC(a[9], a[14]); SQRADDAC(a[10], a[13]); SQRADDAC(a[11], a[12]); SQRADDDB; 
-      COMBA_STORE(b[23]);
-
-      /* output 24 */
-      CARRY_FORWARD;
-   SQRADDSC(a[9], a[15]); SQRADDAC(a[10], a[14]); SQRADDAC(a[11], a[13]); SQRADDDB; SQRADD(a[12], a[12]); 
-      COMBA_STORE(b[24]);
-
-      /* output 25 */
-      CARRY_FORWARD;
-   SQRADDSC(a[10], a[15]); SQRADDAC(a[11], a[14]); SQRADDAC(a[12], a[13]); SQRADDDB; 
-      COMBA_STORE(b[25]);
-
-      /* output 26 */
-      CARRY_FORWARD;
-      SQRADD2(a[11], a[15]);    SQRADD2(a[12], a[14]);    SQRADD(a[13], a[13]); 
-      COMBA_STORE(b[26]);
-
-      /* output 27 */
-      CARRY_FORWARD;
-      SQRADD2(a[12], a[15]);    SQRADD2(a[13], a[14]); 
-      COMBA_STORE(b[27]);
-
-      /* output 28 */
-      CARRY_FORWARD;
-      SQRADD2(a[13], a[15]);    SQRADD(a[14], a[14]); 
-      COMBA_STORE(b[28]);
-
-      /* output 29 */
-      CARRY_FORWARD;
-      SQRADD2(a[14], a[15]); 
-      COMBA_STORE(b[29]);
-
-      /* output 30 */
-      CARRY_FORWARD;
-      SQRADD(a[15], a[15]); 
-      COMBA_STORE(b[30]);
-      COMBA_STORE2(b[31]);
-      COMBA_FINI;
-
-      B->used = 32;
-      B->sign = FP_ZPOS;
-      XMEMCPY(B->dp, b, 32 * sizeof(fp_digit));
-      fp_clamp(B);
-      break;
-
-   default:
-      break;
-   }
-
-#ifdef WOLFSSL_SMALL_STACK
-   XFREE(b, NULL, DYNAMIC_TYPE_TMP_BUFFER);
-#endif
-   return FP_OKAY;
-}
-
-#endif /* TFM_SMALL_SET */
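
The unrolled TFM_SMALL_SET cases removed above all follow the same
column-wise (comba) squaring schedule: output digit k sums every product
a[i]*a[j] with i + j == k, counting each cross product twice and each
square once, then forwards the carry into the next column. A minimal
illustrative sketch of that schedule -- not the wolfSSL macros; the digit
and accumulator widths here are deliberately chosen so a column sum cannot
overflow:

    #include <stdint.h>

    typedef uint16_t fp_digit_s;   /* small sketch digit keeps the */
    typedef uint64_t fp_word_s;    /* 64-bit accumulator overflow-free */
    #define DIGIT_BIT_S 16

    /* Comba-style squaring sketch: out[] must hold 2*n digits. */
    static void sqr_comba_generic(const fp_digit_s *a, int n,
                                  fp_digit_s *out)
    {
        fp_word_s acc = 0;                       /* column sum + carry */
        for (int k = 0; k <= 2 * n - 2; k++) {
            int lo = (k < n) ? 0 : k - n + 1;
            for (int i = lo; i <= k - i; i++) {
                fp_word_s t = (fp_word_s)a[i] * a[k - i];
                acc += (i < k - i) ? 2 * t : t;  /* SQRADD2 vs SQRADD */
            }
            out[k] = (fp_digit_s)acc;            /* COMBA_STORE */
            acc >>= DIGIT_BIT_S;                 /* CARRY_FORWARD */
        }
        out[2 * n - 1] = (fp_digit_s)acc;        /* COMBA_STORE2 */
    }

The fixed-size cases deleted above exist because fully unrolling this
double loop for a known n removes all index arithmetic and lets the
SQRADDSC/SQRADDAC/SQRADDDB macros keep the column sum in three registers.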

File diff not shown because of its large size
+ 488 - 488
lib/wolfssl/wolfcrypt/src/ge_448.c


+ 0 - 22
lib/wolfssl/wolfcrypt/src/ge_low_mem.c

@@ -441,28 +441,6 @@ void ge_scalarmult_base(ge_p3 *R,const unsigned char *nonce)
 }
 
 
-/* pack the point h into array s */
-void ge_p3_tobytes(unsigned char *s,const ge_p3 *h)
-{
-    byte x[F25519_SIZE];
-    byte y[F25519_SIZE];
-    byte z1[F25519_SIZE];
-    byte parity;
-
-    fe_inv__distinct(z1, h->Z);
-    fe_mul__distinct(x, h->X, z1);
-    fe_mul__distinct(y, h->Y, z1);
-
-    fe_normalize(x);
-    fe_normalize(y);
-
-    parity = (x[0] & 1) << 7;
-    lm_copy(s, y);
-    fe_normalize(s);
-    s[31] |= parity;
-}
-
-
 /* pack the point h into array s */
 void ge_tobytes(unsigned char *s,const ge_p2 *h)
 {
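
For reference, the deleted ge_p3_tobytes and the surviving ge_tobytes
implement the same Ed25519 point compression rule: normalize to affine
coordinates with one field inversion, emit the 32-byte little-endian y,
and fold the sign (low bit) of x into the top bit of the last byte. A
sketch of just that rule, reusing the fe_* helpers visible above:

    /* Compress a projective point to 32 bytes (sketch). */
    static void point_compress(unsigned char s[32], const byte *X,
                               const byte *Y, const byte *Z)
    {
        byte x[F25519_SIZE], y[F25519_SIZE], zinv[F25519_SIZE];

        fe_inv__distinct(zinv, Z);      /* one inversion ...        */
        fe_mul__distinct(x, X, zinv);   /* ... gives affine x = X/Z */
        fe_mul__distinct(y, Y, zinv);   /* and affine y = Y/Z       */
        fe_normalize(x);
        fe_normalize(y);

        lm_copy(s, y);                  /* little-endian y          */
        fe_normalize(s);
        s[31] |= (x[0] & 1) << 7;       /* sign of x in top bit     */
    }

A verifier recovers x from the curve equation, x^2 = (y^2 - 1)/(d*y^2 + 1),
and picks the square root whose low bit matches the stored sign.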

+ 115 - 148
lib/wolfssl/wolfcrypt/src/ge_operations.c

@@ -58,19 +58,23 @@
 
 static void ge_p2_0(ge_p2 *h);
 #ifndef CURVED25519_ASM
+#if defined(HAVE_ED25519_SIGN) || defined(HAVE_ED25519_MAKE_KEY)
 static void ge_precomp_0(ge_precomp *h);
+#endif
 static void ge_p3_to_p2(ge_p2 *r,const ge_p3 *p);
 #endif
 static WC_INLINE void ge_p3_to_cached(ge_cached *r,const ge_p3 *p);
+
+#ifndef CURVED25519_ASM
 static void ge_p1p1_to_p2(ge_p2 *r,const ge_p1p1 *p);
 static WC_INLINE void ge_p1p1_to_p3(ge_p3 *r,const ge_p1p1 *p);
 static WC_INLINE void ge_p2_dbl(ge_p1p1 *r,const ge_p2 *p);
 static void ge_p3_dbl(ge_p1p1 *r,const ge_p3 *p);
-
 static WC_INLINE void ge_madd(ge_p1p1 *r,const ge_p3 *p,const ge_precomp *q);
 static WC_INLINE void ge_msub(ge_p1p1 *r,const ge_p3 *p,const ge_precomp *q);
 static WC_INLINE void ge_add(ge_p1p1 *r,const ge_p3 *p,const ge_cached *q);
 static WC_INLINE void ge_sub(ge_p1p1 *r,const ge_p3 *p,const ge_cached *q);
+#endif
 
 /*
 ge means group element.
@@ -95,28 +99,6 @@ Representations:
 #define ORDER_4     0x1dea2f
 #define ORDER_5     0xa6f7c
 
-#ifdef CURVED25519_ASM_32BIT
-word64 load_3(const unsigned char *in)
-{
-  word64 result;
-  result = (word64) in[0];
-  result |= ((word64) in[1]) << 8;
-  result |= ((word64) in[2]) << 16;
-  return result;
-}
-
-
-word64 load_4(const unsigned char *in)
-{
-  word64 result;
-  result = (word64) in[0];
-  result |= ((word64) in[1]) << 8;
-  result |= ((word64) in[2]) << 16;
-  result |= ((word64) in[3]) << 24;
-  return result;
-}
-#endif
-
 /*
 Input:
   s[0]+256*s[1]+...+256^63*s[63] = s
@@ -126,6 +108,7 @@ Output:
   where l = 2^252 + 27742317777372353535851937790883648493.
   Overwrites s in place.
 */
+#ifndef CURVED25519_ASM
 void sc_reduce(byte* s)
 {
     sword64 t[24];
@@ -638,7 +621,9 @@ void sc_muladd(byte* s, const byte* a, const byte* b, const byte* c)
     s[30] = (byte)(t[11] >>  9);
     s[31] = (byte)(t[11] >> 17);
 }
+#endif
 #else
+#ifndef CURVED25519_ASM
 static word64 load_6(const byte* a)
 {
     word64 n;
@@ -744,38 +729,38 @@ void sc_reduce(byte* s)
     carry = t[ 3] >> 42; t[ 4] += carry; t[ 3] &= MASK_42;
     carry = t[ 4] >> 42; t[ 5] += carry; t[ 4] &= MASK_42;
 
-    s[ 0] = (t[ 0] >>  0);
-    s[ 1] = (t[ 0] >>  8);
-    s[ 2] = (t[ 0] >> 16);
-    s[ 3] = (t[ 0] >> 24);
-    s[ 4] = (t[ 0] >> 32);
-    s[ 5] = (t[ 0] >> 40) | (t[ 1] <<  2);
-    s[ 6] = (t[ 1] >>  6);
-    s[ 7] = (t[ 1] >> 14);
-    s[ 8] = (t[ 1] >> 22);
-    s[ 9] = (t[ 1] >> 30);
-    s[10] = (t[ 1] >> 38) | (t[ 2] <<  4);
-    s[11] = (t[ 2] >>  4);
-    s[12] = (t[ 2] >> 12);
-    s[13] = (t[ 2] >> 20);
-    s[14] = (t[ 2] >> 28);
-    s[15] = (t[ 2] >> 36) | (t[ 3] <<  6);
-    s[16] = (t[ 3] >>  2);
-    s[17] = (t[ 3] >> 10);
-    s[18] = (t[ 3] >> 18);
-    s[19] = (t[ 3] >> 26);
-    s[20] = (t[ 3] >> 34);
-    s[21] = (t[ 4] >>  0);
-    s[22] = (t[ 4] >>  8);
-    s[23] = (t[ 4] >> 16);
-    s[24] = (t[ 4] >> 24);
-    s[25] = (t[ 4] >> 32);
-    s[26] = (t[ 4] >> 40) | (t[ 5] <<  2);
-    s[27] = (t[ 5] >>  6);
-    s[28] = (t[ 5] >> 14);
-    s[29] = (t[ 5] >> 22);
-    s[30] = (t[ 5] >> 30);
-    s[31] = (t[ 5] >> 38);
+    s[ 0] = (byte)(t[ 0] >>  0);
+    s[ 1] = (byte)(t[ 0] >>  8);
+    s[ 2] = (byte)(t[ 0] >> 16);
+    s[ 3] = (byte)(t[ 0] >> 24);
+    s[ 4] = (byte)(t[ 0] >> 32);
+    s[ 5] = (byte)(t[ 0] >> 40) | (byte)(t[ 1] <<  2);
+    s[ 6] = (byte)(t[ 1] >>  6);
+    s[ 7] = (byte)(t[ 1] >> 14);
+    s[ 8] = (byte)(t[ 1] >> 22);
+    s[ 9] = (byte)(t[ 1] >> 30);
+    s[10] = (byte)(t[ 1] >> 38) | (byte)(t[ 2] <<  4);
+    s[11] = (byte)(t[ 2] >>  4);
+    s[12] = (byte)(t[ 2] >> 12);
+    s[13] = (byte)(t[ 2] >> 20);
+    s[14] = (byte)(t[ 2] >> 28);
+    s[15] = (byte)(t[ 2] >> 36) | (byte)(t[ 3] <<  6);
+    s[16] = (byte)(t[ 3] >>  2);
+    s[17] = (byte)(t[ 3] >> 10);
+    s[18] = (byte)(t[ 3] >> 18);
+    s[19] = (byte)(t[ 3] >> 26);
+    s[20] = (byte)(t[ 3] >> 34);
+    s[21] = (byte)(t[ 4] >>  0);
+    s[22] = (byte)(t[ 4] >>  8);
+    s[23] = (byte)(t[ 4] >> 16);
+    s[24] = (byte)(t[ 4] >> 24);
+    s[25] = (byte)(t[ 4] >> 32);
+    s[26] = (byte)(t[ 4] >> 40) | (byte)(t[ 5] <<  2);
+    s[27] = (byte)(t[ 5] >>  6);
+    s[28] = (byte)(t[ 5] >> 14);
+    s[29] = (byte)(t[ 5] >> 22);
+    s[30] = (byte)(t[ 5] >> 30);
+    s[31] = (byte)(t[ 5] >> 38);
 }
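
The casts added throughout sc_reduce (and sc_muladd below) make the
limb-to-byte narrowing explicit, silencing conversion warnings without
changing any computed value. The only subtle lines are the ones that
splice two limbs: each half is truncated to a byte before the OR, so bits
8 and up of the shifted limb cannot leak into the result. A stand-alone
sketch of the pattern, with hypothetical limb arguments:

    #include <stdint.h>
    typedef uint8_t  byte_s;
    typedef uint64_t word64_s;

    /* Pack the low 48 bits of two 42-bit limbs, sc_reduce style. */
    static void pack_two_limbs(byte_s *s, word64_s t0, word64_s t1)
    {
        s[0] = (byte_s)(t0 >>  0);
        s[1] = (byte_s)(t0 >>  8);
        s[2] = (byte_s)(t0 >> 16);
        s[3] = (byte_s)(t0 >> 24);
        s[4] = (byte_s)(t0 >> 32);
        /* byte 5 mixes the top 2 bits of t0 with the low 6 of t1 */
        s[5] = (byte_s)(t0 >> 40) | (byte_s)(t1 << 2);
    }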
 
 /*
@@ -896,59 +881,57 @@ void sc_muladd(byte* s, const byte* a, const byte* b, const byte* c)
     carry = t[ 3] >> 42; t[ 4] += carry; t[ 3] &= MASK_42;
     carry = t[ 4] >> 42; t[ 5] += carry; t[ 4] &= MASK_42;
 
-    s[ 0] = (t[ 0] >>  0);
-    s[ 1] = (t[ 0] >>  8);
-    s[ 2] = (t[ 0] >> 16);
-    s[ 3] = (t[ 0] >> 24);
-    s[ 4] = (t[ 0] >> 32);
-    s[ 5] = (t[ 0] >> 40) | (t[ 1] <<  2);
-    s[ 6] = (t[ 1] >>  6);
-    s[ 7] = (t[ 1] >> 14);
-    s[ 8] = (t[ 1] >> 22);
-    s[ 9] = (t[ 1] >> 30);
-    s[10] = (t[ 1] >> 38) | (t[ 2] <<  4);
-    s[11] = (t[ 2] >>  4);
-    s[12] = (t[ 2] >> 12);
-    s[13] = (t[ 2] >> 20);
-    s[14] = (t[ 2] >> 28);
-    s[15] = (t[ 2] >> 36) | (t[ 3] <<  6);
-    s[16] = (t[ 3] >>  2);
-    s[17] = (t[ 3] >> 10);
-    s[18] = (t[ 3] >> 18);
-    s[19] = (t[ 3] >> 26);
-    s[20] = (t[ 3] >> 34);
-    s[21] = (t[ 4] >>  0);
-    s[22] = (t[ 4] >>  8);
-    s[23] = (t[ 4] >> 16);
-    s[24] = (t[ 4] >> 24);
-    s[25] = (t[ 4] >> 32);
-    s[26] = (t[ 4] >> 40) | (t[ 5] <<  2);
-    s[27] = (t[ 5] >>  6);
-    s[28] = (t[ 5] >> 14);
-    s[29] = (t[ 5] >> 22);
-    s[30] = (t[ 5] >> 30);
-    s[31] = (t[ 5] >> 38);
+    s[ 0] = (byte)(t[ 0] >>  0);
+    s[ 1] = (byte)(t[ 0] >>  8);
+    s[ 2] = (byte)(t[ 0] >> 16);
+    s[ 3] = (byte)(t[ 0] >> 24);
+    s[ 4] = (byte)(t[ 0] >> 32);
+    s[ 5] = (byte)(t[ 0] >> 40) | (byte)(t[ 1] <<  2);
+    s[ 6] = (byte)(t[ 1] >>  6);
+    s[ 7] = (byte)(t[ 1] >> 14);
+    s[ 8] = (byte)(t[ 1] >> 22);
+    s[ 9] = (byte)(t[ 1] >> 30);
+    s[10] = (byte)(t[ 1] >> 38) | (byte)(t[ 2] <<  4);
+    s[11] = (byte)(t[ 2] >>  4);
+    s[12] = (byte)(t[ 2] >> 12);
+    s[13] = (byte)(t[ 2] >> 20);
+    s[14] = (byte)(t[ 2] >> 28);
+    s[15] = (byte)(t[ 2] >> 36) | (byte)(t[ 3] <<  6);
+    s[16] = (byte)(t[ 3] >>  2);
+    s[17] = (byte)(t[ 3] >> 10);
+    s[18] = (byte)(t[ 3] >> 18);
+    s[19] = (byte)(t[ 3] >> 26);
+    s[20] = (byte)(t[ 3] >> 34);
+    s[21] = (byte)(t[ 4] >>  0);
+    s[22] = (byte)(t[ 4] >>  8);
+    s[23] = (byte)(t[ 4] >> 16);
+    s[24] = (byte)(t[ 4] >> 24);
+    s[25] = (byte)(t[ 4] >> 32);
+    s[26] = (byte)(t[ 4] >> 40) | (byte)(t[ 5] <<  2);
+    s[27] = (byte)(t[ 5] >>  6);
+    s[28] = (byte)(t[ 5] >> 14);
+    s[29] = (byte)(t[ 5] >> 22);
+    s[30] = (byte)(t[ 5] >> 30);
+    s[31] = (byte)(t[ 5] >> 38);
 }
+#endif /* !CURVED25519_ASM */
 #endif /* !HAVE___UINT128_T || NO_CURVED25519_128BIT */
 
 int ge_compress_key(byte* out, const byte* xIn, const byte* yIn, word32 keySz)
 {
-    ge     x,y,z;
-    ge_p3  g;
-    byte   bArray[ED25519_KEY_SIZE];
+    ge_p2  g;
+    ALIGN16 byte bArray[ED25519_KEY_SIZE];
+    ALIGN16 byte x[ED25519_PUB_KEY_SIZE];
+    ALIGN16 byte y[ED25519_PUB_KEY_SIZE];
     word32 i;
 
-    fe_0(x);
-    fe_0(y);
-    fe_1(z);
-    fe_frombytes(x, xIn);
-    fe_frombytes(y, yIn);
-
-    fe_copy(g.X, x);
-    fe_copy(g.Y, y);
-    fe_copy(g.Z, z);
+    XMEMCPY(x, xIn, ED25519_PUB_KEY_SIZE);
+    XMEMCPY(y, yIn, ED25519_PUB_KEY_SIZE);
+    fe_frombytes(g.X, x);
+    fe_frombytes(g.Y, y);
+    fe_1(g.Z);
 
-    ge_p3_tobytes(bArray, &g);
+    ge_tobytes(bArray, &g);
 
     for (i = 0; i < keySz; i++) {
         out[keySz - 1 - i] = bArray[i];
@@ -961,9 +944,9 @@ int ge_compress_key(byte* out, const byte* xIn, const byte* yIn, word32 keySz)
 /*
 r = p + q
 */
+#ifndef CURVED25519_ASM
 static WC_INLINE void ge_add(ge_p1p1 *r,const ge_p3 *p,const ge_cached *q)
 {
-#ifndef CURVED25519_ASM
     ge t0;
     fe_add(r->X,p->Y,p->X);
     fe_sub(r->Y,p->Y,p->X);
@@ -976,31 +959,27 @@ static WC_INLINE void ge_add(ge_p1p1 *r,const ge_p3 *p,const ge_cached *q)
     fe_add(r->Y,r->Z,r->Y);
     fe_add(r->Z,t0,r->T);
     fe_sub(r->T,t0,r->T);
-#else
-    fe_ge_add(r->X, r->Y, r->Z, r->T, p->X, p->Y, p->Z, p->T, q->Z, q->T2d,
-              q->YplusX, q->YminusX);
-#endif
 }
+#endif
 
 
 #ifndef CURVED25519_ASM
 /* ge_scalar mult base */
-static unsigned char equal(signed char b,signed char c)
+static unsigned char equal(unsigned char b,unsigned char c)
 {
-  unsigned char ub = b;
-  unsigned char uc = c;
-  unsigned char x = ub ^ uc; /* 0: yes; 1..255: no */
+  unsigned char x = b ^ c; /* 0: yes; 1..255: no */
   word32 y = x; /* 0: yes; 1..255: no */
   y -= 1; /* 4294967295: yes; 0..254: no */
   y >>= 31; /* 1: yes; 0: no */
   return (unsigned char)y;
 }
 
-
+#if defined(HAVE_ED25519_SIGN) || defined(HAVE_ED25519_MAKE_KEY)
 static unsigned char negative(signed char b)
 {
   return ((unsigned char)b) >> 7;
 }
+#endif
 
 
 static WC_INLINE void cmov(ge_precomp *t,const ge_precomp *u,unsigned char b,
@@ -1013,6 +992,7 @@ static WC_INLINE void cmov(ge_precomp *t,const ge_precomp *u,unsigned char b,
 }
 #endif
 
+#if defined(HAVE_ED25519_SIGN) || defined(HAVE_ED25519_MAKE_KEY)
 #ifdef CURVED25519_ASM_64BIT
 static const ge_precomp base[64][8] = {
 {
@@ -9098,7 +9078,7 @@ static void ge_select(ge_precomp *t,int pos,signed char b)
 #ifndef CURVED25519_ASM
   ge_precomp minust;
   unsigned char bnegative = negative(b);
-  unsigned char babs = b - (((-bnegative) & b) << 1);
+  unsigned char babs = (unsigned char)(b - (((-bnegative) & b) << 1));
 
   ge_precomp_0(t);
   cmov(t,&base[pos][0],babs,1);
@@ -9148,7 +9128,7 @@ void ge_scalarmult_base(ge_p3 *h,const unsigned char *a)
     e[i] += carry;
     carry = e[i] + 8;
     carry >>= 4;
-    e[i] -= carry << 4;
+    e[i] -= (signed char)(carry << 4);
   }
   e[63] += carry;
   /* each e[i] is between -8 and 8 */
@@ -9190,6 +9170,7 @@ void ge_scalarmult_base(ge_p3 *h,const unsigned char *a)
   }
 #endif
 }
+#endif /* HAVE_ED25519_SIGN || HAVE_ED25519_MAKE_KEY */
 
 
 #define SLIDE_SIZE 256
@@ -9209,9 +9190,9 @@ static void slide(signed char *r,const unsigned char *a)
       for (b = 1;b <= 6 && i + b < SLIDE_SIZE;++b) {
         if (r[i + b]) {
           if (r[i] + (r[i + b] << b) <= 15) {
-            r[i] += r[i + b] << b; r[i + b] = 0;
+            r[i] += (signed char)(r[i + b] << b); r[i + b] = 0;
           } else if (r[i] - (r[i + b] << b) >= -15) {
-            r[i] -= r[i + b] << b;
+            r[i] -= (signed char)(r[i + b] << b);
             for (k = i + b;k < SLIDE_SIZE;++k) {
               if (!r[k]) {
                 r[k] = 1;
@@ -9598,9 +9579,9 @@ int ge_frombytes_negate_vartime(ge_p3 *h,const unsigned char *s)
 r = p + q
 */
 
+#ifndef CURVED25519_ASM
 static WC_INLINE void ge_madd(ge_p1p1 *r,const ge_p3 *p,const ge_precomp *q)
 {
-#ifndef CURVED25519_ASM
     ge t0;
     fe_add(r->X,p->Y,p->X);
     fe_sub(r->Y,p->Y,p->X);
@@ -9612,11 +9593,8 @@ static WC_INLINE void ge_madd(ge_p1p1 *r,const ge_p3 *p,const ge_precomp *q)
     fe_add(r->Y,r->Z,r->Y);
     fe_add(r->Z,t0,r->T);
     fe_sub(r->T,t0,r->T);
-#else
-    fe_ge_madd(r->X, r->Y, r->Z, r->T, p->X, p->Y, p->Z, p->T, q->xy2d,
-              q->yplusx, q->yminusx);
-#endif
 }
+#endif
 
 
 /* ge msub */
@@ -9625,9 +9603,9 @@ static WC_INLINE void ge_madd(ge_p1p1 *r,const ge_p3 *p,const ge_precomp *q)
 r = p - q
 */
 
+#ifndef CURVED25519_ASM
 static WC_INLINE void ge_msub(ge_p1p1 *r,const ge_p3 *p,const ge_precomp *q)
 {
-#ifndef CURVED25519_ASM
     ge t0;
     fe_add(r->X,p->Y,p->X);
     fe_sub(r->Y,p->Y,p->X);
@@ -9639,11 +9617,8 @@ static WC_INLINE void ge_msub(ge_p1p1 *r,const ge_p3 *p,const ge_precomp *q)
     fe_add(r->Y,r->Z,r->Y);
     fe_sub(r->Z,t0,r->T);
     fe_add(r->T,t0,r->T);
-#else
-    fe_ge_msub(r->X, r->Y, r->Z, r->T, p->X, p->Y, p->Z, p->T, q->xy2d,
-              q->yplusx, q->yminusx);
-#endif
 }
+#endif
 
 
 /* ge p1p1 to p2 */
@@ -9651,16 +9626,14 @@ static WC_INLINE void ge_msub(ge_p1p1 *r,const ge_p3 *p,const ge_precomp *q)
 r = p
 */
 
+#ifndef CURVED25519_ASM
 static void ge_p1p1_to_p2(ge_p2 *r,const ge_p1p1 *p)
 {
-#ifndef CURVED25519_ASM
   fe_mul(r->X,p->X,p->T);
   fe_mul(r->Y,p->Y,p->Z);
   fe_mul(r->Z,p->Z,p->T);
-#else
-  fe_ge_to_p2(r->X, r->Y, r->Z, p->X, p->Y, p->Z, p->T);
-#endif
 }
+#endif
 
 
 /* ge p1p1 to p3 */
@@ -9669,17 +9642,15 @@ static void ge_p1p1_to_p2(ge_p2 *r,const ge_p1p1 *p)
 r = p
 */
 
+#ifndef CURVED25519_ASM
 static WC_INLINE void ge_p1p1_to_p3(ge_p3 *r,const ge_p1p1 *p)
 {
-#ifndef CURVED25519_ASM
   fe_mul(r->X,p->X,p->T);
   fe_mul(r->Y,p->Y,p->Z);
   fe_mul(r->Z,p->Z,p->T);
   fe_mul(r->T,p->X,p->Y);
-#else
-  fe_ge_to_p3(r->X, r->Y, r->Z, r->T, p->X, p->Y, p->Z, p->T);
-#endif
 }
+#endif
 
 
 /* ge p2 0 */
@@ -9698,9 +9669,9 @@ static void ge_p2_0(ge_p2 *h)
 r = 2 * p
 */
 
+#ifndef CURVED25519_ASM
 static WC_INLINE void ge_p2_dbl(ge_p1p1 *r,const ge_p2 *p)
 {
-#ifndef CURVED25519_ASM
     ge t0;
     fe_sq(r->X,p->X);
     fe_sq(r->Z,p->Y);
@@ -9711,10 +9682,8 @@ static WC_INLINE void ge_p2_dbl(ge_p1p1 *r,const ge_p2 *p)
     fe_sub(r->Z,r->Z,r->X);
     fe_sub(r->X,t0,r->Y);
     fe_sub(r->T,r->T,r->Z);
-#else
-    fe_ge_dbl(r->X, r->Y, r->Z, r->T, p->X, p->Y, p->Z);
-#endif
 }
+#endif
 
 
 /* ge p3 dble */
@@ -9723,16 +9692,14 @@ static WC_INLINE void ge_p2_dbl(ge_p1p1 *r,const ge_p2 *p)
 r = 2 * p
 */
 
+#ifndef CURVED25519_ASM
 static void ge_p3_dbl(ge_p1p1 *r,const ge_p3 *p)
 {
-#ifndef CURVED25519_ASM
     ge_p2 q;
     ge_p3_to_p2(&q,p);
     ge_p2_dbl(r,&q);
-#else
-    fe_ge_dbl(r->X, r->Y, r->Z, r->T, p->X, p->Y, p->Z);
-#endif
 }
+#endif
 
 
 /* ge p3 to cached */
@@ -9786,6 +9753,7 @@ static void ge_p3_to_p2(ge_p2 *r,const ge_p3 *p)
 #endif
 
 
+#ifdef GE_P3_TOBYTES_IMPL
 /* ge p3 tobytes */
 void ge_p3_tobytes(unsigned char *s,const ge_p3 *h)
 {
@@ -9797,11 +9765,13 @@ void ge_p3_tobytes(unsigned char *s,const ge_p3 *h)
   fe_mul(x,h->X,recip);
   fe_mul(y,h->Y,recip);
   fe_tobytes(s,y);
-  s[31] ^= fe_isnegative(x) << 7;
+  s[31] ^= (unsigned char)(fe_isnegative(x) << 7);
 }
+#endif
 
 
 #ifndef CURVED25519_ASM
+#if defined(HAVE_ED25519_SIGN) || defined(HAVE_ED25519_MAKE_KEY)
 /* ge_precomp_0 */
 static void ge_precomp_0(ge_precomp *h)
 {
@@ -9810,6 +9780,7 @@ static void ge_precomp_0(ge_precomp *h)
   fe_0(h->xy2d);
 }
 #endif
+#endif
 
 
 /* ge_sub */
@@ -9817,9 +9788,9 @@ static void ge_precomp_0(ge_precomp *h)
 r = p - q
 */
 
+#ifndef CURVED25519_ASM
 static WC_INLINE void ge_sub(ge_p1p1 *r,const ge_p3 *p,const ge_cached *q)
 {
-#ifndef CURVED25519_ASM
     ge t0;
     fe_add(r->X,p->Y,p->X);
     fe_sub(r->Y,p->Y,p->X);
@@ -9832,12 +9803,8 @@ static WC_INLINE void ge_sub(ge_p1p1 *r,const ge_p3 *p,const ge_cached *q)
     fe_add(r->Y,r->Z,r->Y);
     fe_sub(r->Z,t0,r->T);
     fe_add(r->T,t0,r->T);
-#else
-    fe_ge_sub(r->X, r->Y, r->Z, r->T, p->X, p->Y, p->Z, p->T, q->Z, q->T2d,
-              q->YplusX, q->YminusX);
-#endif
 }
-
+#endif
 
 /* ge tobytes */
 void ge_tobytes(unsigned char *s,const ge_p2 *h)
@@ -9850,7 +9817,7 @@ void ge_tobytes(unsigned char *s,const ge_p2 *h)
   fe_mul(x,h->X,recip);
   fe_mul(y,h->Y,recip);
   fe_tobytes(s,y);
-  s[31] ^= fe_isnegative(x) << 7;
+  s[31] ^= (unsigned char)(fe_isnegative(x) << 7);
 }
 
 #endif /* !ED25519_SMALL */
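
Several of the hunks above tighten the constant-time table lookup used by
ge_select: equal() now compares unsigned bytes directly, and together with
cmov() it lets every precomputed slot be scanned and conditionally copied
with arithmetic masks instead of a secret-dependent branch or array index.
The idiom in isolation, as a stand-alone sketch independent of the ge
types:

    #include <stdint.h>

    /* 1 if b == c, else 0 -- no data-dependent branch */
    static unsigned char ct_equal(unsigned char b, unsigned char c)
    {
        uint32_t y = (uint32_t)(b ^ c);  /* 0 iff equal              */
        y -= 1;                          /* wraps to all-ones iff equal */
        return (unsigned char)(y >> 31); /* top bit -> 0 or 1        */
    }

    /* Copy src over dst only when flag == 1, in constant time. */
    static void ct_cmov(uint32_t *dst, const uint32_t *src,
                        unsigned int n, unsigned char flag)
    {
        uint32_t mask = (uint32_t)0 - flag;  /* 0x0 or 0xffffffff    */
        for (unsigned int i = 0; i < n; i++)
            dst[i] ^= mask & (dst[i] ^ src[i]);
    }

ge_select then walks all eight table entries and uses ct_equal(index,
babs) as the cmov flag, so the memory access pattern never depends on the
secret digit.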

+ 109 - 1
lib/wolfssl/wolfcrypt/src/hash.c

@@ -59,7 +59,8 @@ enum Hash_Sum  {
     SHA3_384h = 422,
     SHA3_512h = 423,
     SHAKE128h = 424,
-    SHAKE256h = 425
+    SHAKE256h = 425,
+    SM3h      = 640     /* 0x2A,0x81,0x1C,0xCF,0x55,0x01,0x83,0x11 */
 };
 #endif /* !NO_ASN */
 
@@ -121,6 +122,11 @@ enum wc_HashType wc_HashTypeConvert(int hashType)
             eHashType = WC_HASH_TYPE_SHA3_512;
             break;
     #endif /* WOLFSSL_SHA3 */
+    #ifdef WOLFSSL_SM3
+        case WC_SM3:
+            eHashType = WC_HASH_TYPE_SM3;
+            break;
+    #endif
         default:
             eHashType = WC_HASH_TYPE_NONE;
             break;
@@ -222,6 +228,11 @@ int wc_HashGetOID(enum wc_HashType hash_type)
             oid = SHAKE256h;
             break;
     #endif
+    #ifdef WOLFSSL_SM3
+        case WC_HASH_TYPE_SM3:
+            oid = SM3h;
+            break;
+    #endif
 
         /* Not Supported */
         case WC_HASH_TYPE_MD4:
@@ -289,6 +300,11 @@ enum wc_HashType wc_OidGetHash(int oid)
             hash_type = WC_HASH_TYPE_SHA3_512;
             break;
     #endif /* WOLFSSL_SHA3 */
+    #ifdef WOLFSSL_SM3
+        case SM3h:
+            hash_type = WC_HASH_TYPE_SM3;
+            break;
+    #endif
         default:
             break;
     }
@@ -395,6 +411,12 @@ int wc_HashGetDigestSize(enum wc_HashType hash_type)
         #endif
             break;
 
+    #ifdef WOLFSSL_SM3
+        case WC_HASH_TYPE_SM3:
+            dig_size = WC_SM3_DIGEST_SIZE;
+            break;
+    #endif
+
         /* Not Supported */
     #if defined(WOLFSSL_SHA3) && defined(WOLFSSL_SHAKE128)
         case WC_HASH_TYPE_SHAKE128:
@@ -508,6 +530,12 @@ int wc_HashGetBlockSize(enum wc_HashType hash_type)
         #endif
             break;
 
+    #ifdef WOLFSSL_SM3
+        case WC_HASH_TYPE_SM3:
+            block_size = WC_SM3_BLOCK_SIZE;
+            break;
+    #endif
+
         /* Not Supported */
     #if defined(WOLFSSL_SHA3) && defined(WOLFSSL_SHAKE128)
         case WC_HASH_TYPE_SHAKE128:
@@ -626,6 +654,12 @@ int wc_Hash(enum wc_HashType hash_type, const byte* data,
 #endif
             break;
 
+    #ifdef WOLFSSL_SM3
+        case WC_HASH_TYPE_SM3:
+            ret = wc_Sm3Hash(data, data_len, hash);
+            break;
+    #endif
+
         /* Not Supported */
         case WC_HASH_TYPE_MD2:
         case WC_HASH_TYPE_MD4:
@@ -723,6 +757,12 @@ int wc_HashInit_ex(wc_HashAlg* hash, enum wc_HashType type, void* heap,
 #endif
             break;
 
+    #ifdef WOLFSSL_SM3
+        case WC_HASH_TYPE_SM3:
+            ret = wc_InitSm3(&hash->sm3, heap, devId);
+            break;
+    #endif
+
         /* not supported */
         case WC_HASH_TYPE_MD5_SHA:
         case WC_HASH_TYPE_MD2:
@@ -829,6 +869,12 @@ int wc_HashUpdate(wc_HashAlg* hash, enum wc_HashType type, const byte* data,
 #endif
             break;
 
+    #ifdef WOLFSSL_SM3
+        case WC_HASH_TYPE_SM3:
+            ret = wc_Sm3Update(&hash->sm3, data, dataSz);
+            break;
+    #endif
+
         /* not supported */
         case WC_HASH_TYPE_MD5_SHA:
         case WC_HASH_TYPE_MD2:
@@ -926,6 +972,12 @@ int wc_HashFinal(wc_HashAlg* hash, enum wc_HashType type, byte* out)
 #endif
             break;
 
+    #ifdef WOLFSSL_SM3
+        case WC_HASH_TYPE_SM3:
+            ret = wc_Sm3Final(&hash->sm3, out);
+            break;
+    #endif
+
         /* not supported */
         case WC_HASH_TYPE_MD5_SHA:
         case WC_HASH_TYPE_MD2:
@@ -1035,6 +1087,13 @@ int wc_HashFree(wc_HashAlg* hash, enum wc_HashType type)
 #endif
             break;
 
+    #ifdef WOLFSSL_SM3
+        case WC_HASH_TYPE_SM3:
+            wc_Sm3Free(&hash->sm3);
+            ret = 0;
+            break;
+    #endif
+
         /* not supported */
         case WC_HASH_TYPE_MD5_SHA:
         case WC_HASH_TYPE_MD2:
@@ -1110,6 +1169,12 @@ int wc_HashSetFlags(wc_HashAlg* hash, enum wc_HashType type, word32 flags)
 #endif
             break;
 
+    #ifdef WOLFSSL_SM3
+        case WC_HASH_TYPE_SM3:
+            ret = wc_Sm3SetFlags(&hash->sm3, flags);
+            break;
+    #endif
+
         /* not supported */
         case WC_HASH_TYPE_MD5_SHA:
         case WC_HASH_TYPE_MD2:
@@ -1183,6 +1248,12 @@ int wc_HashGetFlags(wc_HashAlg* hash, enum wc_HashType type, word32* flags)
 #endif
             break;
 
+    #ifdef WOLFSSL_SM3
+        case WC_HASH_TYPE_SM3:
+            ret = wc_Sm3GetFlags(&hash->sm3, flags);
+            break;
+    #endif
+
         /* not supported */
         case WC_HASH_TYPE_MD5_SHA:
         case WC_HASH_TYPE_MD2:
@@ -1763,6 +1834,43 @@ int wc_HashGetFlags(wc_HashAlg* hash, enum wc_HashType type, word32* flags)
 #endif /* WOLFSSL_SHAKE_256 */
 #endif /* WOLFSSL_SHA3 */
 
+#ifdef WOLFSSL_SM3
+    int wc_Sm3Hash(const byte* data, word32 len, byte* hash)
+    {
+        int ret = 0;
+    #ifdef WOLFSSL_SMALL_STACK
+        wc_Sm3* sm3;
+    #else
+        wc_Sm3 sm3[1];
+    #endif
+
+    #ifdef WOLFSSL_SMALL_STACK
+        sm3 = (wc_Sm3*)XMALLOC(sizeof(wc_Sm3), NULL, DYNAMIC_TYPE_TMP_BUFFER);
+        if (sm3 == NULL)
+            return MEMORY_E;
+    #endif
+
+        if ((ret = wc_InitSm3(sm3, NULL, INVALID_DEVID)) != 0) {
+            WOLFSSL_MSG("InitSm3 failed");
+        }
+        else {
+            if ((ret = wc_Sm3Update(sm3, data, len)) != 0) {
+                WOLFSSL_MSG("Sm3Update failed");
+            }
+            else if ((ret = wc_Sm3Final(sm3, hash)) != 0) {
+                WOLFSSL_MSG("Sm3Final failed");
+            }
+            wc_Sm3Free(sm3);
+        }
+
+    #ifdef WOLFSSL_SMALL_STACK
+        XFREE(sm3, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+    #endif
+
+        return ret;
+    }
+#endif /* WOLFSSL_SM3 */
+
 #endif /* !NO_HASH_WRAPPER */
 
 #ifdef WOLFSSL_HASH_KEEP
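
With the SM3 cases wired into the wrapper, callers reach the new digest
either through the one-shot helper added above or through the generic
streaming API with WC_HASH_TYPE_SM3. A usage sketch, assuming a build
configured with WOLFSSL_SM3:

    #include <wolfssl/wolfcrypt/hash.h>

    int sm3_digest_demo(const byte* msg, word32 msgSz,
                        byte out[WC_SM3_DIGEST_SIZE])
    {
        /* one-shot */
        int ret = wc_Hash(WC_HASH_TYPE_SM3, msg, msgSz,
                          out, WC_SM3_DIGEST_SIZE);
        if (ret != 0)
            return ret;

        /* streaming path, same result */
        {
            wc_HashAlg h;
            ret = wc_HashInit(&h, WC_HASH_TYPE_SM3);
            if (ret == 0)
                ret = wc_HashUpdate(&h, WC_HASH_TYPE_SM3, msg, msgSz);
            if (ret == 0)
                ret = wc_HashFinal(&h, WC_HASH_TYPE_SM3, out);
            wc_HashFree(&h, WC_HASH_TYPE_SM3);
        }
        return ret;
    }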

+ 76 - 72
lib/wolfssl/wolfcrypt/src/hmac.c

@@ -65,82 +65,14 @@
 #endif
 
 
-/* fips wrapper calls, user can call direct */
-/* If building for old FIPS. */
-#if defined(HAVE_FIPS) && \
-    (!defined(HAVE_FIPS_VERSION) || (HAVE_FIPS_VERSION < 2))
-
-    /* does init */
-    int wc_HmacSetKey(Hmac* hmac, int type, const byte* key, word32 keySz)
-    {
-        if (hmac == NULL || (key == NULL && keySz != 0) ||
-           !(type == WC_MD5 || type == WC_SHA || type == WC_SHA256 ||
-                type == WC_SHA384 || type == WC_SHA512)) {
-            return BAD_FUNC_ARG;
-        }
-
-        return HmacSetKey_fips(hmac, type, key, keySz);
-    }
-    int wc_HmacUpdate(Hmac* hmac, const byte* in, word32 sz)
-    {
-        if (hmac == NULL || (in == NULL && sz > 0)) {
-            return BAD_FUNC_ARG;
-        }
-
-        return HmacUpdate_fips(hmac, in, sz);
-    }
-    int wc_HmacFinal(Hmac* hmac, byte* out)
-    {
-        if (hmac == NULL) {
-            return BAD_FUNC_ARG;
-        }
-
-        return HmacFinal_fips(hmac, out);
-    }
-    int wolfSSL_GetHmacMaxSize(void)
-    {
-        return CyaSSL_GetHmacMaxSize();
-    }
-
-    int wc_HmacInit(Hmac* hmac, void* heap, int devId)
-    {
-    #ifndef WOLFSSL_KCAPI_HMAC
-        (void)hmac;
-        (void)heap;
-        (void)devId;
-        return 0;
-    #else
-        return HmacInit(hmac, heap, devId);
-    #endif
-    }
-    void wc_HmacFree(Hmac* hmac)
-    {
-    #ifndef WOLFSSL_KCAPI_HMAC
-        (void)hmac;
-    #else
-        HmacFree(hmac);
-    #endif
-    }
-
-    #ifdef HAVE_HKDF
-        int wc_HKDF(int type, const byte* inKey, word32 inKeySz,
-                    const byte* salt, word32 saltSz,
-                    const byte* info, word32 infoSz,
-                    byte* out, word32 outSz)
-        {
-            return HKDF(type, inKey, inKeySz, salt, saltSz,
-                info, infoSz, out, outSz);
-        }
-    #endif /* HAVE_HKDF */
-
-#else /* else build without fips, or for new fips */
-
-
 int wc_HmacSizeByType(int type)
 {
     int ret;
 
     if (!(type == WC_MD5 || type == WC_SHA ||
+    #ifdef WOLFSSL_SM3
+            type == WC_SM3 ||
+    #endif
             type == WC_SHA224 || type == WC_SHA256 ||
             type == WC_SHA384 || type == WC_SHA512 ||
             type == WC_SHA3_224 || type == WC_SHA3_256 ||
@@ -200,7 +132,12 @@ int wc_HmacSizeByType(int type)
         case WC_SHA3_512:
             ret = WC_SHA3_512_DIGEST_SIZE;
             break;
+    #endif /* WOLFSSL_SHA3 */
 
+    #ifdef WOLFSSL_SM3
+        case WC_SM3:
+            ret = WC_SM3_DIGEST_SIZE;
+            break;
     #endif
 
         default:
@@ -278,6 +215,12 @@ int _InitHmac(Hmac* hmac, int type, void* heap)
     #endif
     #endif
 
+    #ifdef WOLFSSL_SM3
+        case WC_SM3:
+            ret = wc_InitSm3(&hmac->hash.sm3, heap, devId);
+            break;
+    #endif
+
         default:
             ret = BAD_FUNC_ARG;
             break;
@@ -306,6 +249,9 @@ int wc_HmacSetKey(Hmac* hmac, int type, const byte* key, word32 length)
 
     if (hmac == NULL || (key == NULL && length != 0) ||
        !(type == WC_MD5 || type == WC_SHA ||
+    #ifdef WOLFSSL_SM3
+            type == WC_SM3 ||
+    #endif
             type == WC_SHA224 || type == WC_SHA256 ||
             type == WC_SHA384 || type == WC_SHA512 ||
             type == WC_SHA3_224 || type == WC_SHA3_256 ||
@@ -558,6 +504,27 @@ int wc_HmacSetKey(Hmac* hmac, int type, const byte* key, word32 length)
     #endif
     #endif /* WOLFSSL_SHA3 */
 
+    #ifdef WOLFSSL_SM3
+        case WC_SM3:
+            hmac_block_size = WC_SM3_BLOCK_SIZE;
+            if (length <= WC_SM3_BLOCK_SIZE) {
+                if (key != NULL) {
+                    XMEMCPY(ip, key, length);
+                }
+            }
+            else {
+                ret = wc_Sm3Update(&hmac->hash.sm3, key, length);
+                if (ret != 0)
+                    break;
+                ret = wc_Sm3Final(&hmac->hash.sm3, ip);
+                if (ret != 0)
+                    break;
+
+                length = WC_SM3_DIGEST_SIZE;
+            }
+            break;
+    #endif
+
         default:
             return BAD_FUNC_ARG;
     }
@@ -670,6 +637,13 @@ static int HmacKeyInnerHash(Hmac* hmac)
     #endif
     #endif /* WOLFSSL_SHA3 */
 
+    #ifdef WOLFSSL_SM3
+        case WC_SM3:
+            ret = wc_Sm3Update(&hmac->hash.sm3, (byte*)hmac->ipad,
+                                                             WC_SM3_BLOCK_SIZE);
+            break;
+    #endif
+
         default:
             break;
     }
@@ -776,6 +750,12 @@ int wc_HmacUpdate(Hmac* hmac, const byte* msg, word32 length)
     #endif
     #endif /* WOLFSSL_SHA3 */
 
+    #ifdef WOLFSSL_SM3
+        case WC_SM3:
+            ret = wc_Sm3Update(&hmac->hash.sm3, msg, length);
+            break;
+    #endif
+
         default:
             break;
     }
@@ -993,6 +973,23 @@ int wc_HmacFinal(Hmac* hmac, byte* hash)
     #endif
     #endif /* WOLFSSL_SHA3 */
 
+    #ifdef WOLFSSL_SM3
+        case WC_SM3:
+            ret = wc_Sm3Final(&hmac->hash.sm3, (byte*)hmac->innerHash);
+            if (ret != 0)
+                break;
+            ret = wc_Sm3Update(&hmac->hash.sm3, (byte*)hmac->opad,
+                                                             WC_SM3_BLOCK_SIZE);
+            if (ret != 0)
+                break;
+            ret = wc_Sm3Update(&hmac->hash.sm3, (byte*)hmac->innerHash,
+                                                            WC_SM3_DIGEST_SIZE);
+            if (ret != 0)
+                break;
+            ret = wc_Sm3Final(&hmac->hash.sm3, hash);
+            break;
+    #endif
+
         default:
             ret = BAD_FUNC_ARG;
             break;
@@ -1167,9 +1164,17 @@ void wc_HmacFree(Hmac* hmac)
     #endif
     #endif /* WOLFSSL_SHA3 */
 
+    #ifdef WOLFSSL_SM3
+        case WC_SM3:
+            wc_Sm3Free(&hmac->hash.sm3);
+            break;
+    #endif
+
         default:
             break;
     }
+
+    ForceZero(hmac, sizeof(*hmac));
 }
 #endif /* WOLFSSL_KCAPI_HMAC */
 
@@ -1366,5 +1371,4 @@ int wolfSSL_GetHmacMaxSize(void)
 
 #endif /* HAVE_HKDF */
 
-#endif /* HAVE_FIPS */
 #endif /* NO_HMAC */
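
The HMAC side mirrors this: WC_SM3 is now accepted by wc_HmacSetKey, keys
longer than WC_SM3_BLOCK_SIZE (64) are first digested down to 32 bytes, and
wc_HmacFree additionally ForceZero()s the whole struct. A hedged usage sketch
(function name is illustrative):

    #include <wolfssl/wolfcrypt/hmac.h>
    #include <wolfssl/wolfcrypt/sm3.h>

    /* HMAC-SM3 over msg; mac must hold WC_SM3_DIGEST_SIZE bytes. */
    static int hmac_sm3_sketch(const byte* key, word32 keySz,
                               const byte* msg, word32 msgSz, byte* mac)
    {
        Hmac hmac;
        int ret = wc_HmacInit(&hmac, NULL, INVALID_DEVID);
        if (ret != 0)
            return ret;
        ret = wc_HmacSetKey(&hmac, WC_SM3, key, keySz);
        if (ret == 0)
            ret = wc_HmacUpdate(&hmac, msg, msgSz);
        if (ret == 0)
            ret = wc_HmacFinal(&hmac, mac);
        wc_HmacFree(&hmac); /* also clears key material now */
        return ret;
    }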

+ 21 - 19
lib/wolfssl/wolfcrypt/src/hpke.c

@@ -117,7 +117,7 @@ static int I2OSP(int n, int w, byte* out)
     }
 
     /* make sure the byte string is cleared */
-    XMEMSET( out, 0, w );
+    XMEMSET(out, 0, (size_t)w);
 
     for (i = 0; i < w && n > 0; i++) {
         out[w-(i + 1)] = (byte)n;
@@ -138,9 +138,9 @@ int wc_HpkeInit(Hpke* hpke, int kem, int kdf, int aead, void* heap)
     }
 
     XMEMSET(hpke, 0, sizeof(*hpke));
-    hpke->kem = kem;
-    hpke->kdf = kdf;
-    hpke->aead = aead;
+    hpke->kem = (word32)kem;
+    hpke->kdf = (word32)kdf;
+    hpke->aead = (word32)aead;
     hpke->heap = heap;
 
     /* set kem_suite_id */
@@ -177,7 +177,7 @@ int wc_HpkeInit(Hpke* hpke, int kem, int kdf, int aead, void* heap)
             hpke->curve_id = ECC_SECP256R1;
             hpke->Nsecret = WC_SHA256_DIGEST_SIZE;
             hpke->Nh = WC_SHA256_DIGEST_SIZE;
-            hpke->Ndh = wc_ecc_get_curve_size_from_id(hpke->curve_id);
+            hpke->Ndh = (word32)wc_ecc_get_curve_size_from_id(hpke->curve_id);
             hpke->Npk = 1 + hpke->Ndh * 2;
             break;
 #endif
@@ -187,7 +187,7 @@ int wc_HpkeInit(Hpke* hpke, int kem, int kdf, int aead, void* heap)
             hpke->curve_id = ECC_SECP384R1;
             hpke->Nsecret = WC_SHA384_DIGEST_SIZE;
             hpke->Nh = WC_SHA384_DIGEST_SIZE;
-            hpke->Ndh = wc_ecc_get_curve_size_from_id(hpke->curve_id);
+            hpke->Ndh = (word32)wc_ecc_get_curve_size_from_id(hpke->curve_id);
             hpke->Npk = 1 + hpke->Ndh * 2;
             break;
 #endif
@@ -197,7 +197,7 @@ int wc_HpkeInit(Hpke* hpke, int kem, int kdf, int aead, void* heap)
             hpke->curve_id = ECC_SECP521R1;
             hpke->Nsecret = WC_SHA512_DIGEST_SIZE;
             hpke->Nh = WC_SHA512_DIGEST_SIZE;
-            hpke->Ndh = wc_ecc_get_curve_size_from_id(hpke->curve_id);
+            hpke->Ndh = (word32)wc_ecc_get_curve_size_from_id(hpke->curve_id);
             hpke->Npk = 1 + hpke->Ndh * 2;
             break;
 #endif
@@ -272,7 +272,7 @@ int wc_HpkeInit(Hpke* hpke, int kem, int kdf, int aead, void* heap)
     }
 
     if ((int)hpke->Ndh < 0) {
-        return hpke->Ndh;
+        return (int)hpke->Ndh;
     }
 
     return ret;
@@ -332,7 +332,7 @@ int wc_HpkeGenerateKeyPair(Hpke* hpke, void** keypair, WC_RNG* rng)
         ret = MEMORY_E;
 
     if (ret != 0 && *keypair != NULL) {
-        wc_HpkeFreeKey(hpke, hpke->kem, *keypair, hpke->heap);
+        wc_HpkeFreeKey(hpke, (word16)hpke->kem, *keypair, hpke->heap);
         *keypair = NULL;
     }
 
@@ -373,7 +373,7 @@ int wc_HpkeSerializePublicKey(Hpke* hpke, void* key, byte* out, word16* outSz)
             break;
     }
 
-    *outSz = tmpOutSz;
+    *outSz = (word16)tmpOutSz;
 
     return ret;
 }
@@ -430,7 +430,7 @@ int wc_HpkeDeserializePublicKey(Hpke* hpke, void** key, const byte* in,
         ret = MEMORY_E;
 
     if (ret != 0 && *key != NULL) {
-        wc_HpkeFreeKey(hpke, hpke->kem, *key, hpke->heap);
+        wc_HpkeFreeKey(hpke, (word16)hpke->kem, *key, hpke->heap);
         *key = NULL;
     }
 
@@ -547,7 +547,7 @@ static int wc_HpkeLabeledExpand(Hpke* hpke, byte* suite_id, word32 suite_id_len,
 #endif
 
     /* copy length */
-    ret = I2OSP(L, 2, labeled_info);
+    ret = I2OSP((int)L, 2, labeled_info);
     labeled_info_p = labeled_info + 2;
 
     if (ret == 0) {
@@ -593,7 +593,7 @@ static int wc_HpkeContextComputeNonce(Hpke* hpke, HpkeBaseContext* context,
 
     /* convert the sequence into a byte string with the same length as the
      * nonce */
-    ret = I2OSP(context->seq, hpke->Nn, seq_bytes);
+    ret = I2OSP(context->seq, (int)hpke->Nn, seq_bytes);
     if (ret == 0) {
         xorbufout(out, context->base_nonce, seq_bytes, hpke->Nn);
     }
@@ -759,8 +759,8 @@ static int wc_HpkeEncap(Hpke* hpke, void* ephemeralKey, void* receiverKey,
         return BAD_FUNC_ARG;
     }
 
-    receiverPubKeySz = hpke->Npk;
-    ephemeralPubKeySz = hpke->Npk;
+    receiverPubKeySz = (word16)hpke->Npk;
+    ephemeralPubKeySz = (word16)hpke->Npk;
 
 #ifdef WOLFSSL_SMALL_STACK
     dh = (byte*)XMALLOC(hpke->Ndh, hpke->heap, DYNAMIC_TYPE_TMP_BUFFER);
@@ -785,8 +785,10 @@ static int wc_HpkeEncap(Hpke* hpke, void* ephemeralKey, void* receiverKey,
 #ifdef ECC_TIMING_RESISTANT
             rng = wc_rng_new(NULL, 0, hpke->heap);
 
-            if (rng == NULL)
-                return RNG_FAILURE_E;
+            if (rng == NULL) {
+                ret = RNG_FAILURE_E;
+                break;
+            }
 
             wc_ecc_set_rng((ecc_key*)ephemeralKey, rng);
 #endif
@@ -990,7 +992,7 @@ static int wc_HpkeDecap(Hpke* hpke, void* receiverKey, const byte* pubKey,
         return BAD_FUNC_ARG;
     }
 
-    receiverPubKeySz = hpke->Npk;
+    receiverPubKeySz = (word16)hpke->Npk;
 
 #ifdef WOLFSSL_SMALL_STACK
     dh = (byte*)XMALLOC(hpke->Ndh, hpke->heap, DYNAMIC_TYPE_TMP_BUFFER);
@@ -1048,7 +1050,7 @@ static int wc_HpkeDecap(Hpke* hpke, void* receiverKey, const byte* pubKey,
         }
 
     if (ephemeralKey != NULL)
-        wc_HpkeFreeKey(hpke, hpke->kem, ephemeralKey, hpke->heap);
+        wc_HpkeFreeKey(hpke, (word16)hpke->kem, ephemeralKey, hpke->heap);
 
     if (ret == 0) {
         /* copy pubKey into kemContext */
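
I2OSP here is the RFC 8017 primitive that RFC 9180 builds on: encode n as a
w-byte big-endian string. HPKE uses it for the 2-byte length prefix in
LabeledExpand and for the Nn-byte sequence counter XORed into the base nonce
(Nn is 12 for the AES-GCM AEADs). A standalone sketch along the same lines,
separate from the static function above:

    /* Big-endian encode n into w bytes; returns -1 if n does not fit. */
    static int i2osp_sketch(unsigned int n, int w, unsigned char* out)
    {
        int i;
        for (i = 0; i < w; i++)
            out[i] = 0;
        for (i = 0; i < w && n > 0; i++) {
            out[w - (i + 1)] = (unsigned char)n;
            n >>= 8;
        }
        return (n > 0) ? -1 : 0;
    }

    /* i2osp_sketch(32, 2, buf) -> { 0x00, 0x20 }: the "L" prefix for a
     * 32-byte LabeledExpand output. */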

+ 0 - 212
lib/wolfssl/wolfcrypt/src/include.am

@@ -1,212 +0,0 @@
-# vim:ft=automake
-# All paths should be given relative to the root
-
-ASYNC_FILES =						\
-	wolfcrypt/src/port/cavium/cavium_nitrox.c	\
-	wolfcrypt/src/port/intel/quickassist.c		\
-	wolfcrypt/src/port/intel/quickassist_mem.c
-
-BUILT_SOURCES+= $(ASYNC_FILES)
-
-MAINTAINERCLEANFILES+= $(ASYNC_FILES)
-
-EXTRA_DIST += wolfcrypt/src/misc.c
-EXTRA_DIST += wolfcrypt/src/evp.c
-EXTRA_DIST += wolfcrypt/src/asm.c
-EXTRA_DIST += wolfcrypt/src/aes_asm.asm
-EXTRA_DIST += wolfcrypt/src/aes_gcm_asm.asm
-EXTRA_DIST += wolfcrypt/src/wc_dsp.c
-EXTRA_DIST += wolfcrypt/src/sp_dsp32.c
-EXTRA_DIST += wolfcrypt/src/sp_x86_64_asm.asm
-
-EXTRA_DIST += \
-              wolfcrypt/src/ecc_fp.c \
-              wolfcrypt/src/fp_mont_small.i \
-              wolfcrypt/src/fp_mul_comba_12.i \
-              wolfcrypt/src/fp_mul_comba_17.i \
-              wolfcrypt/src/fp_mul_comba_20.i \
-              wolfcrypt/src/fp_mul_comba_24.i \
-              wolfcrypt/src/fp_mul_comba_28.i \
-              wolfcrypt/src/fp_mul_comba_32.i \
-              wolfcrypt/src/fp_mul_comba_3.i \
-              wolfcrypt/src/fp_mul_comba_48.i \
-              wolfcrypt/src/fp_mul_comba_4.i \
-              wolfcrypt/src/fp_mul_comba_64.i \
-              wolfcrypt/src/fp_mul_comba_6.i \
-              wolfcrypt/src/fp_mul_comba_7.i \
-              wolfcrypt/src/fp_mul_comba_8.i \
-              wolfcrypt/src/fp_mul_comba_9.i \
-              wolfcrypt/src/fp_mul_comba_small_set.i \
-              wolfcrypt/src/fp_sqr_comba_12.i \
-              wolfcrypt/src/fp_sqr_comba_17.i \
-              wolfcrypt/src/fp_sqr_comba_20.i \
-              wolfcrypt/src/fp_sqr_comba_24.i \
-              wolfcrypt/src/fp_sqr_comba_28.i \
-              wolfcrypt/src/fp_sqr_comba_32.i \
-              wolfcrypt/src/fp_sqr_comba_3.i \
-              wolfcrypt/src/fp_sqr_comba_48.i \
-              wolfcrypt/src/fp_sqr_comba_4.i \
-              wolfcrypt/src/fp_sqr_comba_64.i \
-              wolfcrypt/src/fp_sqr_comba_6.i \
-              wolfcrypt/src/fp_sqr_comba_7.i \
-              wolfcrypt/src/fp_sqr_comba_8.i \
-              wolfcrypt/src/fp_sqr_comba_9.i \
-              wolfcrypt/src/fp_sqr_comba_small_set.i \
-              wolfcrypt/src/fe_x25519_128.i
-
-EXTRA_DIST += wolfcrypt/src/port/ti/ti-aes.c \
-              wolfcrypt/src/port/ti/ti-des3.c \
-              wolfcrypt/src/port/ti/ti-hash.c \
-              wolfcrypt/src/port/ti/ti-ccm.c \
-              wolfcrypt/src/port/pic32/pic32mz-crypt.c \
-              wolfcrypt/src/port/nrf51.c \
-              wolfcrypt/src/port/arm/armv8-aes.c \
-              wolfcrypt/src/port/arm/armv8-sha256.c \
-              wolfcrypt/src/port/arm/armv8-chacha.c \
-              wolfcrypt/src/port/nxp/ksdk_port.c \
-              wolfcrypt/src/port/nxp/dcp_port.c \
-              wolfcrypt/src/port/nxp/se050_port.c \
-              wolfcrypt/src/port/nxp/README.md \
-              wolfcrypt/src/port/atmel/README.md \
-              wolfcrypt/src/port/xilinx/xil-sha3.c \
-              wolfcrypt/src/port/xilinx/xil-aesgcm.c \
-              wolfcrypt/src/port/xilinx/xil-versal-glue.c \
-              wolfcrypt/src/port/xilinx/xil-versal-trng.c \
-              wolfcrypt/src/port/caam/caam_aes.c \
-              wolfcrypt/src/port/caam/caam_driver.c \
-              wolfcrypt/src/port/caam/caam_error.c \
-              wolfcrypt/src/port/caam/caam_qnx.c \
-              wolfcrypt/src/port/caam/caam_integrity.c \
-              wolfcrypt/src/port/caam/caam_sha.c \
-              wolfcrypt/src/port/caam/caam_doc.pdf \
-              wolfcrypt/src/port/caam/wolfcaam_init.c \
-              wolfcrypt/src/port/caam/wolfcaam_seco.c \
-              wolfcrypt/src/port/caam/wolfcaam_qnx.c \
-              wolfcrypt/src/port/caam/wolfcaam_x25519.c \
-              wolfcrypt/src/port/caam/wolfcaam_ecdsa.c \
-              wolfcrypt/src/port/caam/wolfcaam_cmac.c \
-              wolfcrypt/src/port/caam/wolfcaam_hash.c \
-              wolfcrypt/src/port/caam/wolfcaam_rsa.c \
-              wolfcrypt/src/port/caam/wolfcaam_hmac.c \
-              wolfcrypt/src/port/caam/wolfcaam_aes.c \
-              wolfcrypt/src/port/caam/wolfcaam_fsl_nxp.c \
-              wolfcrypt/src/port/silabs/silabs_aes.c \
-              wolfcrypt/src/port/silabs/silabs_ecc.c \
-              wolfcrypt/src/port/silabs/silabs_hash.c \
-              wolfcrypt/src/port/silabs/silabs_random.c \
-              wolfcrypt/src/port/silabs/README.md \
-              wolfcrypt/src/port/st/stm32.c \
-              wolfcrypt/src/port/st/stsafe.c \
-              wolfcrypt/src/port/st/README.md \
-              wolfcrypt/src/port/af_alg/afalg_aes.c \
-              wolfcrypt/src/port/af_alg/afalg_hash.c \
-              wolfcrypt/src/port/kcapi/kcapi_aes.c \
-              wolfcrypt/src/port/kcapi/kcapi_hash.c \
-              wolfcrypt/src/port/kcapi/kcapi_hmac.c \
-              wolfcrypt/src/port/kcapi/kcapi_ecc.c \
-              wolfcrypt/src/port/kcapi/kcapi_rsa.c \
-              wolfcrypt/src/port/kcapi/kcapi_dh.c \
-              wolfcrypt/src/port/kcapi/README.md \
-              wolfcrypt/src/port/devcrypto/devcrypto_hash.c \
-              wolfcrypt/src/port/devcrypto/wc_devcrypto.c \
-              wolfcrypt/src/port/devcrypto/README.md \
-              wolfcrypt/src/port/mynewt/mynewt_port.c \
-              wolfcrypt/src/port/Espressif/esp32_aes.c \
-              wolfcrypt/src/port/Espressif/esp32_sha.c \
-              wolfcrypt/src/port/Espressif/esp32_util.c \
-              wolfcrypt/src/port/Espressif/esp32_mp.c \
-              wolfcrypt/src/port/Espressif/README.md \
-              wolfcrypt/src/port/arm/cryptoCell.c \
-              wolfcrypt/src/port/arm/cryptoCellHash.c \
-              wolfcrypt/src/port/Renesas/renesas_tsip_aes.c \
-              wolfcrypt/src/port/Renesas/renesas_tsip_sha.c \
-              wolfcrypt/src/port/Renesas/renesas_tsip_util.c \
-              wolfcrypt/src/port/Renesas/renesas_sce_util.c \
-              wolfcrypt/src/port/Renesas/renesas_sce_aes.c \
-              wolfcrypt/src/port/Renesas/renesas_sce_sha.c \
-              wolfcrypt/src/port/Renesas/renesas_sce_rsa.c \
-              wolfcrypt/src/port/Renesas/renesas_common.c \
-              wolfcrypt/src/port/Renesas/renesas_rx64_hw_sha.c \
-              wolfcrypt/src/port/Renesas/renesas_rx64_hw_util.c \
-              wolfcrypt/src/port/Renesas/README.md \
-              wolfcrypt/src/port/cypress/psoc6_crypto.c
-
-$(ASYNC_FILES):
-	$(AM_V_at)touch $(srcdir)/$@
-
-if BUILD_CRYPTOCB
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/cryptocb.c
-endif
-
-if BUILD_PKCS11
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/wc_pkcs11.c
-endif
-
-if BUILD_DEVCRYPTO
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/devcrypto/devcrypto_ecdsa.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/devcrypto/devcrypto_x25519.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/devcrypto/devcrypto_rsa.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/devcrypto/devcrypto_hmac.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/devcrypto/devcrypto_hash.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/devcrypto/devcrypto_aes.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/devcrypto/wc_devcrypto.c
-endif
-
-if BUILD_CAVIUM
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/cavium/cavium_nitrox.c
-endif
-EXTRA_DIST += wolfcrypt/src/port/cavium/README.md
-
-if BUILD_OCTEON_SYNC
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/cavium/cavium_octeon_sync.c
-endif
-EXTRA_DIST += wolfcrypt/src/port/cavium/README_Octeon.md
-
-if BUILD_INTEL_QA
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/intel/quickassist.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/intel/quickassist_mem.c
-endif
-EXTRA_DIST += wolfcrypt/src/port/intel/README.md
-
-if BUILD_INTEL_QA_SYNC
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/intel/quickassist_sync.c
-endif
-
-if BUILD_CRYPTOAUTHLIB
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/atmel/atmel.c
-endif
-
-if BUILD_IOTSAFE
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/iotsafe/iotsafe.c
-endif
-
-
-if BUILD_CAAM
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/caam/wolfcaam_init.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/caam/wolfcaam_qnx.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/caam/wolfcaam_seco.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/caam/wolfcaam_x25519.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/caam/wolfcaam_ecdsa.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/caam/wolfcaam_cmac.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/caam/wolfcaam_aes.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/caam/wolfcaam_hash.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/caam/wolfcaam_rsa.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/caam/wolfcaam_hmac.c
-endif
-
-if BUILD_SE050
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/nxp/se050_port.c
-endif
-
-if BUILD_PSA
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/psa/psa.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/psa/psa_hash.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/psa/psa_aes.c
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/psa/psa_pkcbs.c
-endif
-EXTRA_DIST += wolfcrypt/src/port/psa/README.md
-
-if BUILD_MAXQ10XX
-src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/maxim/maxq10xx.c
-endif
-EXTRA_DIST += wolfcrypt/src/port/maxim/README.md

+ 16 - 5
lib/wolfssl/wolfcrypt/src/integer.c

@@ -553,6 +553,15 @@ int mp_exch (mp_int * a, mp_int * b)
   return MP_OKAY;
 }
 
+int mp_cond_swap_ct_ex (mp_int * a, mp_int * b, int c, int m, mp_int * t)
+{
+    (void)c;
+    (void)t;
+    if (m == 1)
+        mp_exch(a, b);
+    return MP_OKAY;
+}
+
 int mp_cond_swap_ct (mp_int * a, mp_int * b, int c, int m)
 {
     (void)c;
@@ -946,7 +955,7 @@ int wolfcrypt_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
   }
 
 #ifdef BN_MP_EXPTMOD_BASE_2
-  if (G->used == 1 && G->dp[0] == 2) {
+  if (G->used == 1 && G->dp[0] == 2 && mp_isodd(P) == MP_YES) {
     return mp_exptmod_base_2(X, P, Y);
   }
 #endif
@@ -976,7 +985,7 @@ int wolfcrypt_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
   }
 #endif
 
-  /* if the modulus is odd or dr != 0 use the montgomery method */
+  /* if the modulus is odd use the montgomery method; dr != 0 selects a known special reduction */
 #ifdef BN_MP_EXPTMOD_FAST_C
   if (mp_isodd (P) == MP_YES || dr !=  0) {
     return mp_exptmod_fast (G, X, P, Y, dr);
@@ -1976,7 +1985,6 @@ int mp_dr_is_modulus(mp_int *a)
    return 1;
 }
 
-
 /* computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
  *
  * Uses a left-to-right k-ary sliding window to compute the modular
@@ -2104,7 +2112,10 @@ int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y,
      if ((err = mp_reduce_2k_setup(P, &mp)) != MP_OKAY) {
         goto LBL_M;
      }
-     redux = mp_reduce_2k;
+     /* mp of zero is not usable */
+     if (mp != 0) {
+         redux = mp_reduce_2k;
+     }
 #endif
   }
 
@@ -3314,7 +3325,7 @@ int mp_init_size (mp_int * a, int size)
 }
 
 
-/* the jist of squaring...
+/* the gist of squaring...
  * you do like mult except the offset of the tmpx [one that
  * starts closer to zero] can't equal the offset of tmpy.
  * So basically you set up iy like before then you min it with
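
mp_cond_swap_ct_ex above is this backend's fallback: it ignores the word
count c and the scratch t and simply exchanges when m == 1, so it is not
constant time. Backends that do resist timing analysis implement the same
contract as a masked swap, roughly like this sketch (not this file's code):

    /* Swap the first c words of a and b iff m == 1, with no branch on m. */
    static void cond_swap_words(unsigned long* a, unsigned long* b,
                                int c, int m)
    {
        unsigned long mask = (unsigned long)0 - (unsigned long)(m & 1);
        int i;
        for (i = 0; i < c; i++) {
            unsigned long t = mask & (a[i] ^ b[i]); /* 0 when m == 0 */
            a[i] ^= t;
            b[i] ^= t;
        }
    }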

+ 15 - 1
lib/wolfssl/wolfcrypt/src/kdf.c

@@ -113,6 +113,13 @@ int wc_PRF(byte* result, word32 resLen, const byte* secret,
         break;
     #endif
 
+    #ifdef WOLFSSL_SM3
+        case sm3_mac:
+            hash = WC_SM3;
+            len  = WC_SM3_DIGEST_SIZE;
+        break;
+    #endif
+
     #ifndef NO_SHA
         case sha_mac:
             hash = WC_SHA;
@@ -129,7 +136,7 @@ int wc_PRF(byte* result, word32 resLen, const byte* secret,
     if (lastLen)
         times += 1;
 
-    /* times == 0 iif resLen == 0, but times == 0 abides clang static analyzer
+    /* times == 0 iff resLen == 0, but testing times == 0 satisfies the clang static analyzer
        while resLen == 0 doesn't */
     if (times == 0)
         return BAD_FUNC_ARG;
@@ -376,6 +383,13 @@ int wc_PRF_TLS(byte* digest, word32 digLen, const byte* secret, word32 secLen,
                 len = WC_SHA512_DIGEST_SIZE;
                 break;
             #endif
+
+            #ifdef WOLFSSL_SM3
+            case WC_SM3:
+                len = WC_SM3_DIGEST_SIZE;
+                break;
+            #endif
+
             default:
                 return BAD_FUNC_ARG;
         }
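
Both PRF paths emit len bytes per hash iteration, so the block count is
resLen / len, plus one more when a partial block remains; with WOLFSSL_SM3
the per-iteration length can now also be WC_SM3_DIGEST_SIZE (32). The count
logic in isolation:

    /* Iterations needed to fill resLen output bytes, len bytes each. */
    static unsigned int prf_times(unsigned int resLen, unsigned int len)
    {
        unsigned int times = resLen / len;
        if (resLen % len)
            times += 1;
        return times; /* 0 only when resLen == 0, which is rejected */
    }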

+ 24 - 11
lib/wolfssl/wolfcrypt/src/logging.c

@@ -127,6 +127,7 @@ THREAD_LS_T void *StackSizeCheck_stackOffsetPointer = 0;
 /* Set these to default values initially. */
 static wolfSSL_Logging_cb log_function = NULL;
 static int loggingEnabled = 0;
+THREAD_LS_T const char* log_prefix = NULL;
 
 #if defined(WOLFSSL_APACHE_MYNEWT)
 #include "log/log.h"
@@ -186,6 +187,15 @@ void wolfSSL_Debugging_OFF(void)
 #endif
 }
 
+WOLFSSL_API void wolfSSL_SetLoggingPrefix(const char* prefix)
+{
+#ifdef DEBUG_WOLFSSL
+    log_prefix = prefix;
+#else
+    (void)prefix;
+#endif
+}
+
 #ifdef WOLFSSL_FUNC_TIME
 /* WARNING: This code is only to be used for debugging performance.
  *          The code is not thread-safe.
@@ -316,14 +326,17 @@ static void wolfssl_log(const int logLevel, const char *const logMessage)
       defined(HAVE_STACK_SIZE_VERBOSE) && defined(HAVE_STACK_SIZE_VERBOSE_LOG)
         STACK_SIZE_CHECKPOINT_MSG(logMessage);
 #else
-        fprintf(stderr, "%s\n", logMessage);
+        if (log_prefix != NULL)
+            fprintf(stderr, "[%s]: %s\n", log_prefix, logMessage);
+        else
+            fprintf(stderr, "%s\n", logMessage);
 #endif
     }
 }
 
 #ifndef WOLFSSL_DEBUG_ERRORS_ONLY
 
-#if !defined(_WIN32) && defined(XVSNPRINTF) && !defined(NO_WOLFSSL_MSG_EX)
+#if defined(XVSNPRINTF) && !defined(NO_WOLFSSL_MSG_EX)
 #include <stdarg.h> /* for var args */
 #ifndef WOLFSSL_MSG_EX_BUF_SZ
 #define WOLFSSL_MSG_EX_BUF_SZ 100
@@ -477,7 +490,7 @@ static int get_abs_idx(int relative_idx)
         return (int)((wc_errors.head_idx + wc_errors.count - 1)
                       % ERROR_QUEUE_MAX);
     }
-    return (int)((wc_errors.head_idx + relative_idx) % ERROR_QUEUE_MAX);
+    return (int)((wc_errors.head_idx + (size_t)relative_idx) % ERROR_QUEUE_MAX);
 }
 
 /**
@@ -526,13 +539,13 @@ static int pass_entry(struct wc_error_entry *entry,
 static void set_entry(struct wc_error_entry *entry, int error,
                       const char *file, const char *reason, int line)
 {
-    int sz;
+    size_t sz;
 
     XMEMSET(entry, 0, sizeof(struct wc_error_entry));
     entry->err = error;
 
     entry->line  = line;
-    sz = (int)XSTRLEN(reason);
+    sz = XSTRLEN(reason);
     if (sz > WOLFSSL_MAX_ERROR_SZ - 1) {
         sz = WOLFSSL_MAX_ERROR_SZ - 1;
     }
@@ -541,7 +554,7 @@ static void set_entry(struct wc_error_entry *entry, int error,
         entry->reason[WOLFSSL_MAX_ERROR_SZ - 1] = '\0';
     }
 
-    sz = (int)XSTRLEN(file);
+    sz = XSTRLEN(file);
     if (sz > WOLFSSL_MAX_ERROR_SZ - 1) {
         sz = WOLFSSL_MAX_ERROR_SZ - 1;
     }
@@ -628,7 +641,7 @@ void wc_RemoveErrorNode(int relative_idx)
         if (abs_idx >= (int)wc_errors.head_idx) {
             /* removed entry sits "above" head (or is head),
              * move entries below it "up" */
-            move_count = (abs_idx - (int)wc_errors.head_idx);
+            move_count = (size_t)abs_idx - wc_errors.head_idx;
             if (move_count > 0) {
                 XMEMMOVE(&wc_errors.entries[wc_errors.head_idx + 1],
                          &wc_errors.entries[wc_errors.head_idx],
@@ -642,7 +655,7 @@ void wc_RemoveErrorNode(int relative_idx)
              * move entries above it "down" */
             int last_idx = get_abs_idx(-1);
             if (last_idx >= abs_idx) {  /* this SHOULD always be true */
-                move_count = (last_idx - abs_idx);
+                move_count = (size_t)(last_idx - abs_idx);
                 if (move_count > 0) {
                     XMEMMOVE(&wc_errors.entries[abs_idx],
                              &wc_errors.entries[abs_idx + 1],
@@ -725,7 +738,7 @@ unsigned long wc_PeekErrorNodeLineData(const char **file, int *line,
 
 /**
  * Get the error value at the HEAD of the ERR queue or 0 if the queue
- * is emtpy. The HEAD entry is removed by this call.
+ * is empty. The HEAD entry is removed by this call.
  */
 unsigned long wc_GetErrorNodeErr(void)
 {
@@ -746,7 +759,7 @@ unsigned long wc_GetErrorNodeErr(void)
             wc_ClearErrorNodes();
         }
     }
-    return ret;
+    return (unsigned long)ret;
 }
 
 #if !defined(NO_FILESYSTEM) && !defined(NO_STDIO_FILESYSTEM)
@@ -1495,7 +1508,7 @@ void WOLFSSL_ERROR(int error)
                     "wolfSSL error occurred, error = %d line:%u file:%s",
                     error, line, file);
 
-            if (wc_AddErrorNode(error, line, buffer, (char*)file) != 0) {
+            if (wc_AddErrorNode(error, (int)line, buffer, (char*)file) != 0) {
                 WOLFSSL_MSG("Error creating logging node");
                 /* with void function there is no return here, continue on
                  * to unlock mutex and log what buffer was created. */
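
wolfSSL_SetLoggingPrefix stores a thread-local tag that wolfssl_log() now
prints as "[prefix]: message", useful when several threads share one stderr.
A minimal sketch for a DEBUG_WOLFSSL build:

    #include <wolfssl/wolfcrypt/logging.h>

    static void enable_tagged_logging(void)
    {
        wolfSSL_SetLoggingPrefix("tls-worker"); /* per-thread */
        (void)wolfSSL_Debugging_ON();
        WOLFSSL_MSG("handshake start"); /* "[tls-worker]: handshake start" */
    }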

+ 1 - 1
lib/wolfssl/wolfcrypt/src/md2.c

@@ -107,7 +107,7 @@ void wc_Md2Update(Md2* md2, const byte* data, word32 len)
                     t = md2->X[j+6] ^= S[t];
                     t = md2->X[j+7] ^= S[t];
                 }
-                t = (t + i) & 0xFF;
+                t = (byte)((t + i) & 0xFF);
             }
         }
     }
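
The added cast only affects diagnostics: t is a byte, so in t + i both
operands promote to int and the assignment back narrows. Distilled, with
hypothetical values:

    unsigned char t = 0xF0;
    int i = 0x25;
    /* 0xF0 + 0x25 promotes to int 0x115; masking and casting keeps the
     * low byte 0x15 without a -Wconversion warning. */
    t = (unsigned char)((t + i) & 0xFF);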

+ 63 - 56
lib/wolfssl/wolfcrypt/src/memory.c

@@ -33,15 +33,6 @@
 
 #include <wolfssl/wolfcrypt/settings.h>
 
-/* check old macros @wc_fips */
-#if defined(USE_CYASSL_MEMORY) && !defined(USE_WOLFSSL_MEMORY)
-    #define USE_WOLFSSL_MEMORY
-#endif
-#if defined(CYASSL_MALLOC_CHECK) && !defined(WOLFSSL_MALLOC_CHECK)
-    #define WOLFSSL_MALLOC_CHECK
-#endif
-
-
 /*
 Possible memory options:
  * NO_WOLFSSL_MEMORY:               Disables wolf memory callback support. When not defined settings.h defines USE_WOLFSSL_MEMORY.
@@ -128,6 +119,51 @@ int wolfSSL_GetAllocators(wolfSSL_Malloc_cb*  mf,
     return 0;
 }
 
+#ifdef WOLFSSL_MEM_FAIL_COUNT
+static wolfSSL_Mutex memFailMutex;
+int mem_fail_allocs = 0;
+int mem_fail_frees = 0;
+int mem_fail_cnt = 0;
+
+void wc_MemFailCount_Init()
+{
+    char* cnt;
+    wc_InitMutex(&memFailMutex);
+    cnt = getenv("MEM_FAIL_CNT");
+    if (cnt != NULL) {
+        fprintf(stderr, "MemFailCount At: %d\n", mem_fail_cnt);
+        mem_fail_cnt = atoi(cnt);
+    }
+}
+static int wc_MemFailCount_AllocMem(void)
+{
+    int ret = 1;
+
+    wc_LockMutex(&memFailMutex);
+    if ((mem_fail_cnt > 0) && (mem_fail_cnt <= mem_fail_allocs + 1)) {
+        ret = 0;
+    }
+    else {
+        mem_fail_allocs++;
+    }
+    wc_UnLockMutex(&memFailMutex);
+
+    return ret;
+}
+static void wc_MemFailCount_FreeMem(void)
+{
+    wc_LockMutex(&memFailMutex);
+    mem_fail_frees++;
+    wc_UnLockMutex(&memFailMutex);
+}
+void wc_MemFailCount_Free()
+{
+    wc_FreeMutex(&memFailMutex);
+    fprintf(stderr, "MemFailCount Total: %d\n", mem_fail_allocs);
+    fprintf(stderr, "MemFailCount Frees: %d\n", mem_fail_frees);
+}
+#endif
+
 #ifndef WOLFSSL_STATIC_MEMORY
 #ifdef WOLFSSL_CHECK_MEM_ZERO
 
@@ -269,50 +305,6 @@ void wc_MemZero_Check(void* addr, size_t len)
 }
 #endif /* WOLFSSL_CHECK_MEM_ZERO */
 
-#ifdef WOLFSSL_MEM_FAIL_COUNT
-static wolfSSL_Mutex memFailMutex;
-int mem_fail_allocs = 0;
-int mem_fail_frees = 0;
-int mem_fail_cnt = 0;
-
-void wc_MemFailCount_Init()
-{
-    wc_InitMutex(&memFailMutex);
-    char* cnt = getenv("MEM_FAIL_CNT");
-    if (cnt != NULL) {
-        fprintf(stderr, "MemFailCount At: %d\n", mem_fail_cnt);
-        mem_fail_cnt = atoi(cnt);
-    }
-}
-static int wc_MemFailCount_AllocMem(void)
-{
-    int ret = 1;
-
-    wc_LockMutex(&memFailMutex);
-    if ((mem_fail_cnt > 0) && (mem_fail_cnt <= mem_fail_allocs + 1)) {
-        ret = 0;
-    }
-    else {
-        mem_fail_allocs++;
-    }
-    wc_UnLockMutex(&memFailMutex);
-
-    return ret;
-}
-static void wc_MemFailCount_FreeMem(void)
-{
-    wc_LockMutex(&memFailMutex);
-    mem_fail_frees++;
-    wc_UnLockMutex(&memFailMutex);
-}
-void wc_MemFailCount_Free()
-{
-    wc_FreeMutex(&memFailMutex);
-    fprintf(stderr, "MemFailCount Total: %d\n", mem_fail_allocs);
-    fprintf(stderr, "MemFailCount Frees: %d\n", mem_fail_frees);
-}
-#endif
-
 #ifdef WOLFSSL_DEBUG_MEMORY
 void* wolfSSL_Malloc(size_t size, const char* func, unsigned int line)
 #else
@@ -1324,8 +1316,13 @@ void *xmalloc(size_t n, void* heap, int type, const char* func,
     }
 #endif
 
-    if (malloc_function)
+    if (malloc_function) {
+#ifndef WOLFSSL_STATIC_MEMORY
         p32 = malloc_function(n + sizeof(word32) * 4);
+#else
+        p32 = malloc_function(n + sizeof(word32) * 4, heap, type);
+#endif
+    }
     else
         p32 = malloc(n + sizeof(word32) * 4);
 
@@ -1362,8 +1359,13 @@ void *xrealloc(void *p, size_t n, void* heap, int type, const char* func,
         oldLen = oldp32[0];
     }
 
-    if (realloc_function)
+    if (realloc_function) {
+#ifndef WOLFSSL_STATIC_MEMORY
         p32 = realloc_function(oldp32, n + sizeof(word32) * 4);
+#else
+        p32 = realloc_function(oldp32, n + sizeof(word32) * 4, heap, type);
+#endif
+    }
     else
         p32 = realloc(oldp32, n + sizeof(word32) * 4);
 
@@ -1403,8 +1405,13 @@ void xfree(void *p, void* heap, int type, const char* func, const char* file,
         fprintf(stderr, "Free: %p -> %u (%d) at %s:%s:%u\n", p, p32[0], type,
                                                               func, file, line);
 
-        if (free_function)
+        if (free_function) {
+#ifndef WOLFSSL_STATIC_MEMORY
             free_function(p32);
+#else
+            free_function(p32, heap, type);
+#endif
+        }
         else
             free(p32);
     }
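
Relocating the WOLFSSL_MEM_FAIL_COUNT block above the WOLFSSL_STATIC_MEMORY
guard makes the fault-injection counters available to the static-memory build
too: once MEM_FAIL_CNT allocations have been requested, every further XMALLOC
returns NULL. A test-harness sketch, assuming such a build (the extern
declarations stand in for whatever header exposes these helpers):

    #include <stdlib.h>

    extern void wc_MemFailCount_Init(void);
    extern void wc_MemFailCount_Free(void);

    static int run_with_alloc_faults(int (*testcase)(void))
    {
        int ret;
        setenv("MEM_FAIL_CNT", "5", 1); /* 5th allocation onward fails */
        wc_MemFailCount_Init();
        ret = testcase(); /* should fail cleanly, not crash or leak */
        wc_MemFailCount_Free(); /* reports alloc/free totals on stderr */
        return ret;
    }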

+ 45 - 10
lib/wolfssl/wolfcrypt/src/misc.c

@@ -104,13 +104,13 @@ masking and clearing memory logic.
 
     WC_MISC_STATIC WC_INLINE word32 rotlFixed(word32 x, word32 y)
     {
-        return (x << y) | (x >> (sizeof(y) * 8 - y));
+        return (x << y) | (x >> (sizeof(x) * 8 - y));
     }
 
 /* This routine performs a right circular arithmetic shift of <x> by <y> value. */
     WC_MISC_STATIC WC_INLINE word32 rotrFixed(word32 x, word32 y)
     {
-        return (x >> y) | (x << (sizeof(y) * 8 - y));
+        return (x >> y) | (x << (sizeof(x) * 8 - y));
     }
 
 #endif
@@ -120,14 +120,14 @@ masking and clearing memory logic.
 /* This routine performs a left circular arithmetic shift of <x> by <y> value */
 WC_MISC_STATIC WC_INLINE word16 rotlFixed16(word16 x, word16 y)
 {
-    return (x << y) | (x >> (sizeof(y) * 8 - y));
+    return (x << y) | (x >> (sizeof(x) * 8 - y));
 }
 
 
 /* This routine performs a right circular arithmetic shift of <x> by <y> value */
 WC_MISC_STATIC WC_INLINE word16 rotrFixed16(word16 x, word16 y)
 {
-    return (x >> y) | (x << (sizeof(y) * 8 - y));
+    return (x >> y) | (x << (sizeof(x) * 8 - y));
 }
 
 #endif /* WC_RC2 */
@@ -273,11 +273,11 @@ WC_MISC_STATIC WC_INLINE void xorbufout(void* out, const void* buf,
 {
     word32      i;
     byte*       o;
-    byte*       b;
+    const byte* b;
     const byte* m;
 
     o = (byte*)out;
-    b = (byte*)buf;
+    b = (const byte*)buf;
     m = (const byte*)mask;
 
 
@@ -285,6 +285,15 @@ WC_MISC_STATIC WC_INLINE void xorbufout(void* out, const void* buf,
             ((wc_ptr_t)b) % WOLFSSL_WORD_SIZE &&
             ((wc_ptr_t)b) % WOLFSSL_WORD_SIZE ==
                         ((wc_ptr_t)m) % WOLFSSL_WORD_SIZE) {
+        /* type-punning helpers */
+        union {
+            byte* bp;
+            wolfssl_word* wp;
+        } tpo;
+        union {
+            const byte* bp;
+            const wolfssl_word* wp;
+        } tpb, tpm;
         /* Alignment checks out. Possible to XOR words. */
         /* Move alignment so that it lines up with a
          * WOLFSSL_WORD_SIZE boundary */
@@ -292,8 +301,13 @@ WC_MISC_STATIC WC_INLINE void xorbufout(void* out, const void* buf,
             *(o++) = (byte)(*(b++) ^ *(m++));
             count--;
         }
-        XorWordsOut( (wolfssl_word**)&o, (const wolfssl_word**)&b,
-                     (const wolfssl_word**)&m, count / WOLFSSL_WORD_SIZE);
+        tpo.bp = o;
+        tpb.bp = b;
+        tpm.bp = m;
+        XorWordsOut( &tpo.wp, &tpb.wp, &tpm.wp, count / WOLFSSL_WORD_SIZE);
+        o = tpo.bp;
+        b = tpb.bp;
+        m = tpm.bp;
         count %= WOLFSSL_WORD_SIZE;
     }
 
@@ -326,6 +340,15 @@ WC_MISC_STATIC WC_INLINE void xorbuf(void* buf, const void* mask, word32 count)
 
     if (((wc_ptr_t)b) % WOLFSSL_WORD_SIZE ==
             ((wc_ptr_t)m) % WOLFSSL_WORD_SIZE) {
+        /* type-punning helpers */
+        union {
+            byte* bp;
+            wolfssl_word* wp;
+        } tpb;
+        union {
+            const byte* bp;
+            const wolfssl_word* wp;
+        } tpm;
         /* Alignment checks out. Possible to XOR words. */
         /* Move alignment so that it lines up with a
          * WOLFSSL_WORD_SIZE boundary */
@@ -333,8 +356,11 @@ WC_MISC_STATIC WC_INLINE void xorbuf(void* buf, const void* mask, word32 count)
             *(b++) ^= *(m++);
             count--;
         }
-        XorWords( (wolfssl_word**)&b,
-                  (const wolfssl_word**)&m, count / WOLFSSL_WORD_SIZE);
+        tpb.bp = b;
+        tpm.bp = m;
+        XorWords( &tpb.wp, &tpm.wp, count / WOLFSSL_WORD_SIZE);
+        b = tpb.bp;
+        m = tpm.bp;
         count %= WOLFSSL_WORD_SIZE;
     }
 
@@ -472,6 +498,15 @@ WC_MISC_STATIC WC_INLINE void ato32(const byte* c, word32* wc_u32)
                (word32)c[3];
 }
 
+/* convert opaque to 32 bit integer. Interpret as little endian. */
+WC_MISC_STATIC WC_INLINE void ato32le(const byte* c, word32* wc_u32)
+{
+    *wc_u32 =  (word32)c[0] |
+              ((word32)c[1] << 8) |
+              ((word32)c[2] << 16) |
+              ((word32)c[3] << 24);
+}
+
 
 WC_MISC_STATIC WC_INLINE word32 btoi(byte b)
 {
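
ato32le complements the big-endian ato32: the same four bytes decode to
different words depending on byte order. A standalone check using plain C
types:

    #include <assert.h>

    static void endian_decode_check(void)
    {
        const unsigned char c[4] = { 0x01, 0x02, 0x03, 0x04 };
        unsigned int be = ((unsigned int)c[0] << 24) |
                          ((unsigned int)c[1] << 16) |
                          ((unsigned int)c[2] << 8) | (unsigned int)c[3];
        unsigned int le = (unsigned int)c[0] | ((unsigned int)c[1] << 8) |
                          ((unsigned int)c[2] << 16) |
                          ((unsigned int)c[3] << 24);
        assert(be == 0x01020304u);
        assert(le == 0x04030201u);
    }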

+ 5 - 4
lib/wolfssl/wolfcrypt/src/pkcs12.c

@@ -28,7 +28,8 @@
 #include <wolfssl/wolfcrypt/settings.h>
 
 #if defined(HAVE_PKCS12) && \
-    !defined(NO_ASN) && !defined(NO_PWDBASED) && !defined(NO_HMAC)
+    !defined(NO_ASN) && !defined(NO_PWDBASED) && !defined(NO_HMAC) && \
+    !defined(NO_CERTS)
 
 #include <wolfssl/wolfcrypt/asn.h>
 #include <wolfssl/wolfcrypt/asn_public.h>
@@ -1229,7 +1230,7 @@ static int PKCS12_CheckConstructedZero(byte* data, word32 dataSz, word32* idx)
 static int PKCS12_CoalesceOctetStrings(WC_PKCS12* pkcs12, byte* data,
         word32 dataSz, word32* idx, int* curIdx)
 {
-    byte*  mergedData = NULL; /* buffer for concatonated strings */
+    byte*  mergedData = NULL; /* buffer for concatenated strings */
     word32 mergedSz = 0;      /* total size of merged strings */
     int    encryptedContentSz = 0;
     int    originalEncSz = 0;
@@ -1243,7 +1244,7 @@ static int PKCS12_CoalesceOctetStrings(WC_PKCS12* pkcs12, byte* data,
         ret = ASN_PARSE_E;
     }
 
-    /* Loop through octet strings and concatonate them without
+    /* Loop through octet strings and concatenate them without
      * the tags and length */
     while ((int)*idx < originalEncSz + *curIdx) {
         if (GetASNTag(data, idx, &tag, dataSz) < 0) {
@@ -1281,7 +1282,7 @@ static int PKCS12_CoalesceOctetStrings(WC_PKCS12* pkcs12, byte* data,
     *idx += SetLength(mergedSz, &data[*idx]);
 
     if (mergedSz > 0) {
-        /* Copy over concatonated octet strings into data buffer */
+        /* Copy over concatenated octet strings into data buffer */
         XMEMCPY(&data[*idx], mergedData, mergedSz);
 
         XFREE(mergedData, pkcs12->heap, DYNAMIC_TYPE_PKCS);

+ 251 - 106
lib/wolfssl/wolfcrypt/src/pkcs7.c

@@ -2374,6 +2374,7 @@ static int PKCS7_EncodeSigned(PKCS7* pkcs7, ESD* esd,
     word32 totalSz, total2Sz;
     int idx = 0, ret = 0;
     int digEncAlgoId, digEncAlgoType;
+    int keyIdSize;
     byte* flatSignedAttribs = NULL;
     word32 flatSignedAttribsSz = 0;
 
@@ -2392,6 +2393,13 @@ static int PKCS7_EncodeSigned(PKCS7* pkcs7, ESD* esd,
         return BAD_FUNC_ARG;
     }
 
+#if defined(WOLFSSL_SM2) && defined(WOLFSSL_SM3)
+    keyIdSize = wc_HashGetDigestSize(wc_HashTypeConvert(HashIdAlg(
+           pkcs7->publicKeyOID)));
+#else
+    keyIdSize = KEYID_SIZE;
+#endif
+
 #ifdef WOLFSSL_SMALL_STACK
     signedDataOid = (byte *)XMALLOC(MAX_OID_SZ, pkcs7->heap, DYNAMIC_TYPE_TMP_BUFFER);
     if (signedDataOid == NULL) {
@@ -2485,11 +2493,10 @@ static int PKCS7_EncodeSigned(PKCS7* pkcs7, ESD* esd,
 
     } else if (pkcs7->sidType == CMS_SKID) {
         /* SubjectKeyIdentifier */
-        esd->issuerSKIDSz = SetOctetString(KEYID_SIZE, esd->issuerSKID);
-        esd->issuerSKIDSeqSz = SetExplicit(0, esd->issuerSKIDSz + KEYID_SIZE,
+        esd->issuerSKIDSz = SetOctetString(keyIdSize, esd->issuerSKID);
+        esd->issuerSKIDSeqSz = SetExplicit(0, esd->issuerSKIDSz + keyIdSize,
                                            esd->issuerSKIDSeq);
-        signerInfoSz += (esd->issuerSKIDSz + esd->issuerSKIDSeqSz +
-                         KEYID_SIZE);
+        signerInfoSz += (esd->issuerSKIDSz + esd->issuerSKIDSeqSz + keyIdSize);
 
         /* version MUST be 3 */
         esd->signerVersionSz = SetMyVersion(3, esd->signerVersion, 0);
@@ -2723,8 +2730,8 @@ static int PKCS7_EncodeSigned(PKCS7* pkcs7, ESD* esd,
         idx += esd->issuerSKIDSeqSz;
         XMEMCPY(output2 + idx, esd->issuerSKID, esd->issuerSKIDSz);
         idx += esd->issuerSKIDSz;
-        XMEMCPY(output2 + idx, pkcs7->issuerSubjKeyId, KEYID_SIZE);
-        idx += KEYID_SIZE;
+        XMEMCPY(output2 + idx, pkcs7->issuerSubjKeyId, keyIdSize);
+        idx += keyIdSize;
     } else if (pkcs7->sidType == DEGENERATE_SID) {
         /* no signer infos in degenerate case */
     } else {
@@ -3779,7 +3786,10 @@ static int wc_PKCS7_VerifyContentMessageDigest(PKCS7* pkcs7,
                                                word32 hashSz)
 {
     int ret = 0, digestSz = 0, innerAttribSz = 0;
+    int contentLen = 0;
     word32 idx = 0;
+    word32 contentIdx = 0;
+    byte* content = NULL;
     byte* digestBuf = NULL;
 #ifdef WOLFSSL_SMALL_STACK
     byte* digest = NULL;
@@ -3838,7 +3848,29 @@ static int wc_PKCS7_VerifyContentMessageDigest(PKCS7* pkcs7,
 #endif
         XMEMSET(digest, 0, MAX_PKCS7_DIGEST_SZ);
 
-        ret = wc_Hash(hashType, pkcs7->content, pkcs7->contentSz, digest,
+        content = pkcs7->content;
+        contentLen = pkcs7->contentSz;
+
+        if (pkcs7->contentIsPkcs7Type == 1) {
+            /* Content follows PKCS#7 RFC, which defines type as ANY. CMS
+             * mandates OCTET_STRING which has already been stripped off.
+             * For PKCS#7 message digest calculation, digest is calculated
+             * only on the "value" of the DER encoding. As such, advance past
+             * the tag and length */
+            if (contentLen > 1) {
+                contentIdx++;
+            }
+
+            if (GetLength_ex(content, &contentIdx, &contentLen,
+                    contentLen, 1) < 0) {
+                #ifdef WOLFSSL_SMALL_STACK
+                    XFREE(digest, pkcs7->heap, DYNAMIC_TYPE_TMP_BUFFER);
+                #endif
+                return ASN_PARSE_E;
+            }
+        }
+
+        ret = wc_Hash(hashType, content + contentIdx, contentLen, digest,
                       MAX_PKCS7_DIGEST_SZ);
         if (ret < 0) {
             WOLFSSL_MSG("Error hashing PKCS7 content for verification");
@@ -4428,11 +4460,13 @@ static int PKCS7_VerifySignedData(PKCS7* pkcs7, const byte* hashBuf,
     byte* cert = NULL;
     byte* signedAttrib = NULL;
     byte* contentType = NULL;
+    int encapContentInfoLen = 0;
     int contentSz = 0, sigSz = 0, certSz = 0, signedAttribSz = 0;
     word32 localIdx, start;
     byte degenerate = 0;
     byte detached = 0;
     byte tag = 0;
+    word16 contentIsPkcs7Type = 0;
 #ifdef ASN_BER_TO_DER
     byte* der;
 #endif
@@ -4642,7 +4676,7 @@ static int PKCS7_VerifySignedData(PKCS7* pkcs7, const byte* hashBuf,
 
         #endif
             /* Get the inner ContentInfo sequence */
-            if (GetSequence_ex(pkiMsg, &idx, &length, pkiMsgSz,
+            if (GetSequence_ex(pkiMsg, &idx, &encapContentInfoLen, pkiMsgSz,
                         NO_USER_CHECK) < 0)
                 ret = ASN_PARSE_E;
 
@@ -4650,7 +4684,8 @@ static int PKCS7_VerifySignedData(PKCS7* pkcs7, const byte* hashBuf,
             if (ret == 0) {
                 int isIndef = 0;
                 word32 tmpIdx = idx;
-                if (length == 0 && pkiMsg[idx-1] == ASN_INDEF_LENGTH) {
+                if (encapContentInfoLen == 0 &&
+                    pkiMsg[idx-1] == ASN_INDEF_LENGTH) {
                     isIndef = 1;
                 }
                 if (GetASNObjectId(pkiMsg, &idx, &length, pkiMsgSz) == 0) {
@@ -4675,7 +4710,7 @@ static int PKCS7_VerifySignedData(PKCS7* pkcs7, const byte* hashBuf,
             if (ret != 0)
                 break;
 
-            /* Check for content info, it could be omitted when degenerate */
+            /* Check for content, it could be omitted when degenerate */
             localIdx = idx;
             ret = 0;
             if (localIdx + 1 > pkiMsgSz) {
@@ -4683,75 +4718,114 @@ static int PKCS7_VerifySignedData(PKCS7* pkcs7, const byte* hashBuf,
                 break;
             }
 
+            /* Set error state if no more data left in ContentInfo, meaning
+             * no content - may be detached. Will recover from error below */
+            if ((encapContentInfoLen != 0) &&
+                (encapContentInfoLen - contentTypeSz == 0)) {
+                ret = ASN_PARSE_E;
+            }
+
+            /* PKCS#7 spec:
+             *     content [0] EXPLICIT ANY DEFINED BY contentType OPTIONAL
+             * CMS spec:
+             *     eContent [0] EXPLICIT OCTET STRING OPTIONAL
+             */
             if (ret == 0 && GetASNTag(pkiMsg, &localIdx, &tag, pkiMsgSz) != 0)
                 ret = ASN_PARSE_E;
 
             if (ret == 0 && tag != (ASN_CONSTRUCTED | ASN_CONTEXT_SPECIFIC | 0))
                 ret = ASN_PARSE_E;
 
+            /* Get length of inner eContent payload. For CMS, spec defines
+             * OCTET_STRING will be next. If so, we use the length retrieved
+             * there. PKCS#7 spec defines ANY as eContent type. In this case
+             * we fall back and save this content length for use later */
             if (ret == 0 && GetLength_ex(pkiMsg, &localIdx, &length, pkiMsgSz,
-                        NO_USER_CHECK) <= 0)
+                        NO_USER_CHECK) <= 0) {
                 ret = ASN_PARSE_E;
+            }
 
             if (localIdx >= pkiMsgSz) {
                 ret = BUFFER_E;
             }
 
+            /* Save idx to back up in case of PKCS#7 eContent */
+            start = localIdx;
+
             /* get length of content in the case that there is multiple parts */
             if (ret == 0 && GetASNTag(pkiMsg, &localIdx, &tag, pkiMsgSz) < 0)
                 ret = ASN_PARSE_E;
 
-            if (ret == 0 && tag == (ASN_OCTET_STRING | ASN_CONSTRUCTED)) {
-                multiPart = 1;
+            if (ret == 0 &&
+                (tag != (ASN_OCTET_STRING | ASN_CONSTRUCTED) &&
+                (tag != ASN_OCTET_STRING))) {
 
-                /* Get length of all OCTET_STRINGs. */
-                if (GetLength_ex(pkiMsg, &localIdx, &contentLen, pkiMsgSz,
-                            NO_USER_CHECK) < 0)
+                /* If reached end of ContentInfo, or we see the next element
+                 * ([0] IMPLICIT CertificateSet), set error state. Either
+                 * true error or detached */
+                if (tag == (ASN_CONSTRUCTED | ASN_CONTEXT_SPECIFIC | 0)) {
                     ret = ASN_PARSE_E;
-
-                /* Check whether there is one OCTET_STRING inside. */
-                start = localIdx;
-                if (localIdx >= pkiMsgSz) {
-                    ret = BUFFER_E;
                 }
 
-                if (ret == 0 && GetASNTag(pkiMsg, &localIdx, &tag, pkiMsgSz)
-                        != 0)
-                    ret = ASN_PARSE_E;
+                /* Back up before getting tag, process as PKCS#7 ANY and use
+                 * this as start of content. */
+                localIdx = start;
+                pkcs7->contentIsPkcs7Type = 1;
+            }
+            else {
+                /* CMS eContent OCTET_STRING */
+                if (ret == 0 && tag == (ASN_OCTET_STRING | ASN_CONSTRUCTED)) {
+                    multiPart = 1;
 
-                if (ret == 0 && tag != ASN_OCTET_STRING)
-                    ret = ASN_PARSE_E;
+                    /* Get length of all OCTET_STRINGs. */
+                    if (GetLength_ex(pkiMsg, &localIdx, &contentLen, pkiMsgSz,
+                                NO_USER_CHECK) < 0)
+                        ret = ASN_PARSE_E;
 
-                if (ret == 0 && GetLength_ex(pkiMsg, &localIdx, &length,
-                            pkiMsgSz, NO_USER_CHECK) < 0)
-                    ret = ASN_PARSE_E;
+                    /* Check whether there is one OCTET_STRING inside. */
+                    start = localIdx;
+                    if (localIdx >= pkiMsgSz) {
+                        ret = BUFFER_E;
+                    }
 
-                if (ret == 0) {
-                    /* Use single OCTET_STRING directly, or reset length. */
-                    if (localIdx - start + length == (word32)contentLen) {
-                        multiPart = 0;
-                    } else {
-                        /* reset length to outer OCTET_STRING for bundle size
-                         * check below */
-                        length = contentLen;
+                    if (ret == 0 && GetASNTag(pkiMsg, &localIdx, &tag, pkiMsgSz)
+                            != 0)
+                        ret = ASN_PARSE_E;
+
+                    if (ret == 0 && tag != ASN_OCTET_STRING)
+                        ret = ASN_PARSE_E;
+
+                    if (ret == 0 && GetLength_ex(pkiMsg, &localIdx, &length,
+                                pkiMsgSz, NO_USER_CHECK) < 0)
+                        ret = ASN_PARSE_E;
+
+                    if (ret == 0) {
+                        /* Use single OCTET_STRING directly, or reset length. */
+                        if (localIdx - start + length == (word32)contentLen) {
+                            multiPart = 0;
+                        } else {
+                            /* reset length to outer OCTET_STRING for bundle
+                             * size check below */
+                            length = contentLen;
+                        }
+                        localIdx = start;
                     }
-                    localIdx = start;
-                }
 
-                if (ret != 0) {
-                    /* failed ASN1 parsing during OCTET_STRING checks */
-                    break;
+                    if (ret != 0) {
+                        /* failed ASN1 parsing during OCTET_STRING checks */
+                        break;
+                    }
                 }
-            }
 
-            /* get length of content in case of single part */
-            if (ret == 0 && !multiPart) {
-                if (tag != ASN_OCTET_STRING)
-                    ret = ASN_PARSE_E;
+                /* get length of content in case of single part */
+                if (ret == 0 && !multiPart) {
+                    if (tag != ASN_OCTET_STRING)
+                        ret = ASN_PARSE_E;
 
-                if (ret == 0 && GetLength_ex(pkiMsg, &localIdx,
-                            &length, pkiMsgSz, NO_USER_CHECK) < 0)
-                    ret = ASN_PARSE_E;
+                    if (ret == 0 && GetLength_ex(pkiMsg, &localIdx,
+                                &length, pkiMsgSz, NO_USER_CHECK) < 0)
+                        ret = ASN_PARSE_E;
+                }
             }
 
             /* update idx if successful */
@@ -5104,6 +5178,7 @@ static int PKCS7_VerifySignedData(PKCS7* pkcs7, const byte* hashBuf,
                     pkcs7->der = NULL;
         #endif
                     version = pkcs7->version;
+                    contentIsPkcs7Type = pkcs7->contentIsPkcs7Type;
 
                     if (ret == 0) {
                         byte isDynamic = (byte)pkcs7->isDynamic;
@@ -5139,6 +5214,9 @@ static int PKCS7_VerifySignedData(PKCS7* pkcs7, const byte* hashBuf,
                             contentDynamic = NULL;
                         }
 
+                        /* Restore content is PKCS#7 flag */
+                        pkcs7->contentIsPkcs7Type = contentIsPkcs7Type;
+
                     #ifndef NO_PKCS7_STREAM
                         pkcs7->stream = stream;
                     #endif
@@ -5661,6 +5739,7 @@ static WC_PKCS7_KARI* wc_PKCS7_KariNew(PKCS7* pkcs7, byte direction)
         XFREE(kari, pkcs7->heap, DYNAMIC_TYPE_PKCS7);
         return NULL;
     }
+    XMEMSET(kari->decoded, 0, sizeof(DecodedCert));
 
     kari->recipKey = (ecc_key*)XMALLOC(sizeof(ecc_key), pkcs7->heap,
                                        DYNAMIC_TYPE_PKCS7);
@@ -5764,29 +5843,30 @@ static int wc_PKCS7_KariParseRecipCert(WC_PKCS7_KARI* kari, const byte* cert,
     int ret;
     word32 idx;
 
-    if (kari == NULL || kari->decoded == NULL ||
-        cert == NULL || certSz == 0)
+    if (kari == NULL || kari->decoded == NULL) {
         return BAD_FUNC_ARG;
+    }
 
     /* decode certificate */
-    InitDecodedCert(kari->decoded, (byte*)cert, certSz, kari->heap);
-    kari->decodedInit = 1;
-    ret = ParseCert(kari->decoded, CA_TYPE, NO_VERIFY, 0);
-    if (ret < 0)
-        return ret;
+    if (cert != NULL) {
+        InitDecodedCert(kari->decoded, (byte*)cert, certSz, kari->heap);
+        kari->decodedInit = 1;
+        ret = ParseCert(kari->decoded, CA_TYPE, NO_VERIFY, 0);
+        if (ret < 0)
+            return ret;
 
-    /* only supports ECDSA for now */
-    if (kari->decoded->keyOID != ECDSAk) {
-        WOLFSSL_MSG("CMS KARI only supports ECDSA key types");
-        return BAD_FUNC_ARG;
-    }
+        /* only supports ECDSA for now */
+        if (kari->decoded->keyOID != ECDSAk) {
+            WOLFSSL_MSG("CMS KARI only supports ECDSA key types");
+            return BAD_FUNC_ARG;
+        }
 
-    /* make sure subject key id was read from cert */
-    if (kari->decoded->extSubjKeyIdSet == 0) {
-        WOLFSSL_MSG("Failed to read subject key ID from recipient cert");
-        return BAD_FUNC_ARG;
+        /* make sure subject key id was read from cert */
+        if (kari->decoded->extSubjKeyIdSet == 0) {
+            WOLFSSL_MSG("Failed to read subject key ID from recipient cert");
+            return BAD_FUNC_ARG;
+        }
     }
-
     ret = wc_ecc_init_ex(kari->recipKey, kari->heap, kari->devId);
     if (ret != 0)
         return ret;
@@ -5795,6 +5875,10 @@ static int wc_PKCS7_KariParseRecipCert(WC_PKCS7_KARI* kari, const byte* cert,
 
     /* get recip public key */
     if (kari->direction == WC_PKCS7_ENCODE) {
+        if (cert == NULL) {
+            WOLFSSL_MSG("Error recipient cert can not be null with encode");
+            return BAD_FUNC_ARG;
+        }
 
         idx = 0;
         ret = wc_EccPublicKeyDecode(kari->decoded->publicKey, &idx,
@@ -6023,11 +6107,15 @@ static int wc_PKCS7_KariGenerateKEK(WC_PKCS7_KARI* kari, WC_RNG* rng,
     (!defined(HAVE_FIPS_VERSION) || (HAVE_FIPS_VERSION != 2))) && \
     !defined(HAVE_SELFTEST)
     ret = wc_ecc_set_rng(kari->senderKey, rng);
-    if (ret != 0)
+    if (ret != 0) {
+        XFREE(secret, kari->heap, DYNAMIC_TYPE_PKCS7);
         return ret;
+    }
     ret = wc_ecc_set_rng(kari->recipKey, rng);
-    if (ret != 0)
+    if (ret != 0) {
+        XFREE(secret, kari->heap, DYNAMIC_TYPE_PKCS7);
         return ret;
+    }
 #else
     (void)rng;
 #endif
@@ -6118,6 +6206,7 @@ int wc_PKCS7_AddRecipient_KARI(PKCS7* pkcs7, const byte* cert, word32 certSz,
     int ret = 0;
     int keySz, direction = 0;
     int blockKeySz = 0;
+    int keyIdSize;
 
     /* ASN.1 layout */
     int totalSz = 0;
@@ -6169,6 +6258,13 @@ int wc_PKCS7_AddRecipient_KARI(PKCS7* pkcs7, const byte* cert, word32 certSz,
     byte encryptedKey[MAX_ENCRYPTED_KEY_SZ];
 #endif
 
+#if defined(WOLFSSL_SM2) && defined(WOLFSSL_SM3)
+    keyIdSize = wc_HashGetDigestSize(wc_HashTypeConvert(HashIdAlg(
+           pkcs7->publicKeyOID)));
+#else
+    keyIdSize = KEYID_SIZE;
+#endif
+
     /* allocate and init memory for recipient */
     recip = (Pkcs7EncodedRecip*)XMALLOC(sizeof(Pkcs7EncodedRecip), pkcs7->heap,
                                  DYNAMIC_TYPE_PKCS7);
@@ -6294,12 +6390,12 @@ int wc_PKCS7_AddRecipient_KARI(PKCS7* pkcs7, const byte* cert, word32 certSz,
     totalSz += (encryptedKeyOctetSz + encryptedKeySz);
 
     /* SubjectKeyIdentifier */
-    subjKeyIdOctetSz = SetOctetString(KEYID_SIZE, subjKeyIdOctet);
-    totalSz += (subjKeyIdOctetSz + KEYID_SIZE);
+    subjKeyIdOctetSz = SetOctetString(keyIdSize, subjKeyIdOctet);
+    totalSz += (subjKeyIdOctetSz + keyIdSize);
 
     /* RecipientKeyIdentifier IMPLICIT [0] */
     recipKeyIdSeqSz = SetImplicit(ASN_SEQUENCE, 0, subjKeyIdOctetSz +
-                                  KEYID_SIZE, recipKeyIdSeq);
+                                  keyIdSize, recipKeyIdSeq);
     totalSz += recipKeyIdSeqSz;
 
     /* RecipientEncryptedKey */
@@ -6423,8 +6519,8 @@ int wc_PKCS7_AddRecipient_KARI(PKCS7* pkcs7, const byte* cert, word32 certSz,
     XMEMCPY(recip->recip + idx, subjKeyIdOctet, subjKeyIdOctetSz);
     idx += subjKeyIdOctetSz;
     /* subject key id */
-    XMEMCPY(recip->recip + idx, kari->decoded->extSubjKeyId, KEYID_SIZE);
-    idx += KEYID_SIZE;
+    XMEMCPY(recip->recip + idx, kari->decoded->extSubjKeyId, keyIdSize);
+    idx += keyIdSize;
     XMEMCPY(recip->recip + idx, encryptedKeyOctet, encryptedKeyOctetSz);
     idx += encryptedKeyOctetSz;
     /* encrypted CEK */
@@ -6473,6 +6569,7 @@ int wc_PKCS7_AddRecipient_KTRI(PKCS7* pkcs7, const byte* cert, word32 certSz,
     WC_RNG rng;
     word32 idx = 0;
     word32 encryptedKeySz = 0;
+    int keyIdSize;
 
     int ret = 0, blockKeySz;
     int verSz = 0, issuerSz = 0, snSz = 0, keyEncAlgSz = 0;
@@ -6599,6 +6696,13 @@ int wc_PKCS7_AddRecipient_KTRI(PKCS7* pkcs7, const byte* cert, word32 certSz,
         return ret;
     }
 
+#if defined(WOLFSSL_SM2) && defined(WOLFSSL_SM3)
+    keyIdSize = wc_HashGetDigestSize(wc_HashTypeConvert(HashIdAlg(
+           decoded->signatureOID)));
+#else
+    keyIdSize = KEYID_SIZE;
+#endif
+
     if (sidType == CMS_ISSUER_AND_SERIAL_NUMBER) {
 
         /* version, must be 0 for IssuerAndSerialNumber */
@@ -6655,7 +6759,7 @@ int wc_PKCS7_AddRecipient_KTRI(PKCS7* pkcs7, const byte* cert, word32 certSz,
         verSz = SetMyVersion(2, ver, 0);
         recip->recipVersion = 2;
 
-        issuerSKIDSz = SetLength(KEYID_SIZE, issuerSKID);
+        issuerSKIDSz = SetLength(keyIdSize, issuerSKID);
     } else {
         FreeDecodedCert(decoded);
 #ifdef WOLFSSL_SMALL_STACK
@@ -6815,10 +6919,10 @@ int wc_PKCS7_AddRecipient_KTRI(PKCS7* pkcs7, const byte* cert, word32 certSz,
 
     } else {
         recipSeqSz = SetSequence(verSz + ASN_TAG_SZ + issuerSKIDSz +
-                                 KEYID_SIZE + keyEncAlgSz + encKeyOctetStrSz +
+                                 keyIdSize + keyEncAlgSz + encKeyOctetStrSz +
                                  encryptedKeySz, recipSeq);
 
-        if (recipSeqSz + verSz + ASN_TAG_SZ + issuerSKIDSz + KEYID_SIZE +
+        if (recipSeqSz + verSz + ASN_TAG_SZ + issuerSKIDSz + keyIdSize +
             keyEncAlgSz + encKeyOctetStrSz + encryptedKeySz > MAX_RECIP_SZ) {
             WOLFSSL_MSG("RecipientInfo output buffer too small");
             FreeDecodedCert(decoded);
@@ -6852,8 +6956,8 @@ int wc_PKCS7_AddRecipient_KTRI(PKCS7* pkcs7, const byte* cert, word32 certSz,
         idx += ASN_TAG_SZ;
         XMEMCPY(recip->recip + idx, issuerSKID, issuerSKIDSz);
         idx += issuerSKIDSz;
-        XMEMCPY(recip->recip + idx, pkcs7->issuerSubjKeyId, KEYID_SIZE);
-        idx += KEYID_SIZE;
+        XMEMCPY(recip->recip + idx, pkcs7->issuerSubjKeyId, keyIdSize);
+        idx += keyIdSize;
     }
     XMEMCPY(recip->recip + idx, keyAlgArray, keyEncAlgSz);
     idx += keyEncAlgSz;
@@ -8567,6 +8671,7 @@ static int wc_PKCS7_DecryptKtri(PKCS7* pkcs7, byte* in, word32 inSz,
 {
     int length, encryptedKeySz = 0, ret = 0;
     int keySz, version, sidType = 0;
+    int keyIdSize;
     word32 encOID = 0;
     word32 keyIdx;
     byte   issuerHash[KEYID_SIZE];
@@ -8593,6 +8698,13 @@ static int wc_PKCS7_DecryptKtri(PKCS7* pkcs7, byte* in, word32 inSz,
     RsaKey privKey[1];
 #endif
 
+#if defined(WOLFSSL_SM2) && defined(WOLFSSL_SM3)
+    keyIdSize = wc_HashGetDigestSize(wc_HashTypeConvert(HashIdAlg(
+           pkcs7->publicKeyOID)));
+#else
+    keyIdSize = KEYID_SIZE;
+#endif
+
     switch (pkcs7->state) {
         case WC_PKCS7_DECRYPT_KTRI:
         #ifndef NO_PKCS7_STREAM
@@ -8680,11 +8792,12 @@ static int wc_PKCS7_DecryptKtri(PKCS7* pkcs7, byte* in, word32 inSz,
                 if (GetSequence(pkiMsg, idx, &length, pkiMsgSz) < 0)
                     return ASN_PARSE_E;
 
-                if (GetNameHash(pkiMsg, idx, issuerHash, pkiMsgSz) < 0)
+                if (GetNameHash_ex(pkiMsg, idx, issuerHash, pkiMsgSz,
+                                   pkcs7->publicKeyOID) < 0)
                     return ASN_PARSE_E;
 
                 /* if we found correct recipient, issuer hashes will match */
-                if (XMEMCMP(issuerHash, pkcs7->issuerHash, KEYID_SIZE) == 0) {
+                if (XMEMCMP(issuerHash, pkcs7->issuerHash, keyIdSize) == 0) {
                     *recipFound = 1;
                 }
 
@@ -8732,15 +8845,15 @@ static int wc_PKCS7_DecryptKtri(PKCS7* pkcs7, byte* in, word32 inSz,
                 if (GetLength(pkiMsg, idx, &length, pkiMsgSz) < 0)
                     return ASN_PARSE_E;
 
-                if (KEYID_SIZE > pkiMsgSz - (*idx))
+                if ((word32)keyIdSize > pkiMsgSz - (*idx))
                     return BUFFER_E;
 
                 /* if we found correct recipient, SKID will match */
                 if (XMEMCMP(pkiMsg + (*idx), pkcs7->issuerSubjKeyId,
-                            KEYID_SIZE) == 0) {
+                            keyIdSize) == 0) {
                     *recipFound = 1;
                 }
-                (*idx) += KEYID_SIZE;
+                (*idx) += keyIdSize;
             }
 
             if (GetAlgoId(pkiMsg, idx, &encOID, oidKeyType, pkiMsgSz) < 0)
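
One detail worth noting in the hunk above: the bounds check casts `keyIdSize` to `word32` because `pkiMsgSz - (*idx)` is unsigned. The int operand would be implicitly converted for the comparison anyway, but the explicit cast makes the unsigned comparison deliberate and keeps -Wsign-compare quiet. A small standalone illustration (`pkiMsgSz` and `idx` as stand-ins):

```c
/* keyIdSize is an int; pkiMsgSz - (*idx) is word32 (unsigned). The
 * explicit cast documents that an unsigned comparison is intended. */
int    keyIdSize = 20;                /* e.g. SHA-1 sized SKID */
word32 remaining = pkiMsgSz - (*idx); /* bytes left in the message */
if ((word32)keyIdSize > remaining)
    return BUFFER_E;                  /* not enough bytes for the SKID */
```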
@@ -9149,11 +9262,19 @@ static int wc_PKCS7_KariGetSubjectKeyIdentifier(WC_PKCS7_KARI* kari,
 {
     int length;
     byte tag;
+    int keyIdSize;
 
     if (kari == NULL || pkiMsg == NULL || idx == NULL || recipFound == NULL ||
             rid == NULL)
         return BAD_FUNC_ARG;
 
+#if defined(WOLFSSL_SM2) && defined(WOLFSSL_SM3)
+    keyIdSize = wc_HashGetDigestSize(wc_HashTypeConvert(HashIdAlg(
+           kari->decoded->signatureOID)));
+#else
+    keyIdSize = KEYID_SIZE;
+#endif
+
     /* remove RecipientKeyIdentifier IMPLICIT [0] */
     if (GetASNTag(pkiMsg, idx, &tag, pkiMsgSz) < 0) {
         return ASN_PARSE_E;
@@ -9178,14 +9299,14 @@ static int wc_PKCS7_KariGetSubjectKeyIdentifier(WC_PKCS7_KARI* kari,
     if (GetLength(pkiMsg, idx, &length, pkiMsgSz) < 0)
         return ASN_PARSE_E;
 
-    if (length != KEYID_SIZE)
+    if (length != keyIdSize)
         return ASN_PARSE_E;
 
-    XMEMCPY(rid, pkiMsg + (*idx), KEYID_SIZE);
+    XMEMCPY(rid, pkiMsg + (*idx), keyIdSize);
     (*idx) += length;
 
     /* subject key id should match if recipient found */
-    if (XMEMCMP(rid, kari->decoded->extSubjKeyId, KEYID_SIZE) == 0) {
+    if (XMEMCMP(rid, kari->decoded->extSubjKeyId, keyIdSize) == 0) {
         *recipFound = 1;
     }
 
@@ -9200,6 +9321,7 @@ static int wc_PKCS7_KariGetIssuerAndSerialNumber(WC_PKCS7_KARI* kari,
                         int* recipFound, byte* rid)
 {
     int length, ret;
+    int keyIdSize;
 #ifdef WOLFSSL_SMALL_STACK
     mp_int* serial;
     mp_int* recipSerial;
@@ -9212,15 +9334,31 @@ static int wc_PKCS7_KariGetIssuerAndSerialNumber(WC_PKCS7_KARI* kari,
         return BAD_FUNC_ARG;
     }
 
+#if defined(WOLFSSL_SM2) && defined(WOLFSSL_SM3)
+    keyIdSize = wc_HashGetDigestSize(wc_HashTypeConvert(HashIdAlg(
+           kari->decoded->signatureOID)));
+#else
+    keyIdSize = KEYID_SIZE;
+#endif
+
     /* remove IssuerAndSerialNumber */
     if (GetSequence(pkiMsg, idx, &length, pkiMsgSz) < 0)
         return ASN_PARSE_E;
 
-    if (GetNameHash(pkiMsg, idx, rid, pkiMsgSz) < 0)
+    if (GetNameHash_ex(pkiMsg, idx, rid, pkiMsgSz,
+                       kari->decoded->signatureOID) < 0) {
         return ASN_PARSE_E;
+    }
 
     /* if we found correct recipient, issuer hashes will match */
-    if (XMEMCMP(rid, kari->decoded->issuerHash, KEYID_SIZE) == 0) {
+    if (kari->decodedInit == 1) {
+        if (XMEMCMP(rid, kari->decoded->issuerHash, keyIdSize) == 0) {
+            *recipFound = 1;
+        }
+    }
+    else {
+        /* cannot confirm recipient serial number when no cert is provided */
+        WOLFSSL_MSG("No recipient cert loaded to match with CMS serial number");
         *recipFound = 1;
     }
 
@@ -9246,7 +9384,9 @@ static int wc_PKCS7_KariGetIssuerAndSerialNumber(WC_PKCS7_KARI* kari,
         return ASN_PARSE_E;
     }
 
-    ret = mp_read_unsigned_bin(recipSerial, kari->decoded->serial,
+    ret = mp_init(recipSerial);
+    if (ret == MP_OKAY)
+        ret = mp_read_unsigned_bin(recipSerial, kari->decoded->serial,
                              kari->decoded->serialSz);
     if (ret != MP_OKAY) {
         mp_clear(serial);
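
The mp_init() inserted above fixes a use of `recipSerial` before initialization: mp_read_unsigned_bin() expects an already-initialized mp_int. The corrected lifecycle in outline (`serialBytes`/`serialSz` are placeholder inputs):

```c
/* mp_int lifecycle restored by the fix: init -> read -> use -> clear. */
mp_int serial;
if (mp_init(&serial) == MP_OKAY) {
    if (mp_read_unsigned_bin(&serial, serialBytes, serialSz) == MP_OKAY) {
        /* ... compare with mp_cmp(), etc. ... */
    }
    mp_clear(&serial);
}
```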
@@ -9258,7 +9398,8 @@ static int wc_PKCS7_KariGetIssuerAndSerialNumber(WC_PKCS7_KARI* kari,
         return ret;
     }
 
-    if (mp_cmp(recipSerial, serial) != MP_EQ) {
+    if (kari->decodedInit == 1 &&
+            mp_cmp(recipSerial, serial) != MP_EQ) {
         mp_clear(serial);
         mp_clear(recipSerial);
         WOLFSSL_MSG("CMS serial number does not match recipient");
@@ -9876,6 +10017,7 @@ static int wc_PKCS7_DecryptKari(PKCS7* pkcs7, byte* in, word32 inSz,
     int ret, keySz;
     int encryptedKeySz;
     int direction = 0;
+    int keyIdSize;
     word32 keyAgreeOID, keyWrapOID;
     byte rid[KEYID_SIZE];
 
@@ -9893,12 +10035,17 @@ static int wc_PKCS7_DecryptKari(PKCS7* pkcs7, byte* in, word32 inSz,
 
     WOLFSSL_ENTER("wc_PKCS7_DecryptKari");
     if (pkcs7 == NULL || pkiMsg == NULL ||
-            ((pkcs7->singleCert == NULL || pkcs7->singleCertSz == 0) &&
-              pkcs7->wrapCEKCb == NULL) ||
         idx == NULL || decryptedKey == NULL || decryptedKeySz == NULL) {
         return BAD_FUNC_ARG;
     }
 
+#if defined(WOLFSSL_SM2) && defined(WOLFSSL_SM3)
+    keyIdSize = wc_HashGetDigestSize(wc_HashTypeConvert(HashIdAlg(
+           pkcs7->publicKeyOID)));
+#else
+    keyIdSize = KEYID_SIZE;
+#endif
+
     switch (pkcs7->state) {
         case WC_PKCS7_DECRYPT_KARI: {
             WC_PKCS7_KARI* kari;
@@ -9928,17 +10075,15 @@ static int wc_PKCS7_DecryptKari(PKCS7* pkcs7, byte* in, word32 inSz,
             encryptedKeySz = MAX_ENCRYPTED_KEY_SZ;
 
             /* parse cert and key */
-            if (pkcs7->singleCert != NULL) {
-                ret = wc_PKCS7_KariParseRecipCert(kari, (byte*)pkcs7->singleCert,
-                                              pkcs7->singleCertSz, pkcs7->privateKey,
-                                              pkcs7->privateKeySz);
-                if (ret != 0) {
-                    wc_PKCS7_KariFree(kari);
-                #ifdef WOLFSSL_SMALL_STACK
-                    XFREE(encryptedKey, pkcs7->heap, DYNAMIC_TYPE_PKCS7);
-                #endif
-                    return ret;
-                }
+            ret = wc_PKCS7_KariParseRecipCert(kari, (byte*)pkcs7->singleCert,
+                                          pkcs7->singleCertSz, pkcs7->privateKey,
+                                          pkcs7->privateKeySz);
+            if (ret != 0) {
+                wc_PKCS7_KariFree(kari);
+            #ifdef WOLFSSL_SMALL_STACK
+                XFREE(encryptedKey, pkcs7->heap, DYNAMIC_TYPE_PKCS7);
+            #endif
+                return ret;
             }
 
             /* remove OriginatorIdentifierOrKey */
@@ -10054,7 +10199,7 @@ static int wc_PKCS7_DecryptKari(PKCS7* pkcs7, byte* in, word32 inSz,
                 tmpKeySz = (word32)ret;
 
                 keySz = pkcs7->wrapCEKCb(pkcs7, encryptedKey, encryptedKeySz,
-                        rid, KEYID_SIZE, tmpKeyDer, tmpKeySz,
+                        rid, keyIdSize, tmpKeyDer, tmpKeySz,
                         decryptedKey, *decryptedKeySz,
                         keyWrapOID, (int)PKCS7_KARI, direction);
                 XFREE(tmpKeyDer, pkcs7->heap, DYNAMIC_TYPE_TMP_BUFFER);
@@ -11982,7 +12127,7 @@ WOLFSSL_API int wc_PKCS7_DecodeAuthEnvelopedData(PKCS7* pkcs7, byte* in,
                 encodedAttribs = pkiMsg + idx;
                 idx++;
 
-                if (GetLength(pkiMsg, &idx, &length, pkiMsgSz) < 0)
+                if (GetLength(pkiMsg, &idx, &length, pkiMsgSz) <= 0)
                     ret = ASN_PARSE_E;
             #ifndef NO_PKCS7_STREAM
                 pkcs7->stream->expected = length;

+ 0 - 1130
lib/wolfssl/wolfcrypt/src/poly1305_asm.S

@@ -1,1130 +0,0 @@
-/* poly1305_asm
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-#ifdef WOLFSSL_USER_SETTINGS
-#ifdef WOLFSSL_USER_SETTINGS_ASM
-/*
- * user_settings_asm.h is a file generated by the script user_settings_asm.sh.
- * The script takes in a user_settings.h and produces user_settings_asm.h, which
- * is a stripped down version of user_settings.h containing only preprocessor
- * directives. This makes the header safe to include in assembly (.S) files.
- */
-#include "user_settings_asm.h"
-#else
-/*
- * Note: if user_settings.h contains any C code (e.g. a typedef or function
- * prototype), including it here in an assembly (.S) file will cause an
- * assembler failure. See user_settings_asm.h above.
- */
-#include "user_settings.h"
-#endif /* WOLFSSL_USER_SETTINGS_ASM */
-#endif /* WOLFSSL_USER_SETTINGS */
-
-#ifndef HAVE_INTEL_AVX1
-#define HAVE_INTEL_AVX1
-#endif /* HAVE_INTEL_AVX1 */
-#ifndef NO_AVX2_SUPPORT
-#define HAVE_INTEL_AVX2
-#endif /* NO_AVX2_SUPPORT */
-
-#ifdef WOLFSSL_X86_64_BUILD
-#ifdef HAVE_INTEL_AVX1
-#ifndef __APPLE__
-.text
-.globl	poly1305_setkey_avx
-.type	poly1305_setkey_avx,@function
-.align	16
-poly1305_setkey_avx:
-#else
-.section	__TEXT,__text
-.globl	_poly1305_setkey_avx
-.p2align	4
-_poly1305_setkey_avx:
-#endif /* __APPLE__ */
-        movabsq	$0xffffffc0fffffff, %r10
-        movabsq	$0xffffffc0ffffffc, %r11
-        movq	(%rsi), %rdx
-        movq	8(%rsi), %rax
-        movq	16(%rsi), %rcx
-        movq	24(%rsi), %r8
-        andq	%r10, %rdx
-        andq	%r11, %rax
-        movq	%rdx, %r10
-        movq	%rax, %r11
-        xorq	%r9, %r9
-        movq	%rdx, (%rdi)
-        movq	%rax, 8(%rdi)
-        movq	%r9, 24(%rdi)
-        movq	%r9, 32(%rdi)
-        movq	%r9, 40(%rdi)
-        movq	%rcx, 48(%rdi)
-        movq	%r8, 56(%rdi)
-        movq	%r9, 352(%rdi)
-        movq	%r9, 408(%rdi)
-        movq	%rdx, 360(%rdi)
-        movq	%rax, 416(%rdi)
-        addq	%rdx, %r10
-        addq	%rax, %r11
-        movq	%r10, 368(%rdi)
-        movq	%r11, 424(%rdi)
-        addq	%rdx, %r10
-        addq	%rax, %r11
-        movq	%r10, 376(%rdi)
-        movq	%r11, 432(%rdi)
-        addq	%rdx, %r10
-        addq	%rax, %r11
-        movq	%r10, 384(%rdi)
-        movq	%r11, 440(%rdi)
-        addq	%rdx, %r10
-        addq	%rax, %r11
-        movq	%r10, 392(%rdi)
-        movq	%r11, 448(%rdi)
-        addq	%rdx, %r10
-        addq	%rax, %r11
-        movq	%r10, 400(%rdi)
-        movq	%r11, 456(%rdi)
-        movq	%r9, 608(%rdi)
-        movb	$0x01, 616(%rdi)
-        repz retq
-#ifndef __APPLE__
-.size	poly1305_setkey_avx,.-poly1305_setkey_avx
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	poly1305_block_avx
-.type	poly1305_block_avx,@function
-.align	16
-poly1305_block_avx:
-#else
-.section	__TEXT,__text
-.globl	_poly1305_block_avx
-.p2align	4
-_poly1305_block_avx:
-#endif /* __APPLE__ */
-        pushq	%r15
-        pushq	%rbx
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        movq	(%rdi), %r15
-        movq	8(%rdi), %rbx
-        movq	24(%rdi), %r8
-        movq	32(%rdi), %r9
-        movq	40(%rdi), %r10
-        xorq	%r14, %r14
-        movb	616(%rdi), %r14b
-        # h += m
-        movq	(%rsi), %r11
-        movq	8(%rsi), %r12
-        addq	%r11, %r8
-        adcq	%r12, %r9
-        movq	%rbx, %rax
-        adcq	%r14, %r10
-        # r[1] * h[0] => rdx, rax ==> t2, t1
-        mulq	%r8
-        movq	%rax, %r12
-        movq	%rdx, %r13
-        # r[0] * h[1] => rdx, rax ++> t2, t1
-        movq	%r15, %rax
-        mulq	%r9
-        addq	%rax, %r12
-        movq	%r15, %rax
-        adcq	%rdx, %r13
-        # r[0] * h[0] => rdx, rax ==> t4, t0
-        mulq	%r8
-        movq	%rax, %r11
-        movq	%rdx, %r8
-        # r[1] * h[1] => rdx, rax =+> t3, t2
-        movq	%rbx, %rax
-        mulq	%r9
-        #   r[0] * h[2] +> t2
-        addq	352(%rdi,%r10,8), %r13
-        movq	%rdx, %r14
-        addq	%r8, %r12
-        adcq	%rax, %r13
-        #   r[1] * h[2] +> t3
-        adcq	408(%rdi,%r10,8), %r14
-        # r * h in r14, r13, r12, r11
-        # h = (r * h) mod 2^130 - 5
-        movq	%r13, %r10
-        andq	$-4, %r13
-        andq	$3, %r10
-        addq	%r13, %r11
-        movq	%r13, %r8
-        adcq	%r14, %r12
-        adcq	$0x00, %r10
-        shrdq	$2, %r14, %r8
-        shrq	$2, %r14
-        addq	%r11, %r8
-        adcq	%r14, %r12
-        movq	%r12, %r9
-        adcq	$0x00, %r10
-        # h in r10, r9, r8
-        # Store h to ctx
-        movq	%r8, 24(%rdi)
-        movq	%r9, 32(%rdi)
-        movq	%r10, 40(%rdi)
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        popq	%rbx
-        popq	%r15
-        repz retq
-#ifndef __APPLE__
-.size	poly1305_block_avx,.-poly1305_block_avx
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	poly1305_blocks_avx
-.type	poly1305_blocks_avx,@function
-.align	16
-poly1305_blocks_avx:
-#else
-.section	__TEXT,__text
-.globl	_poly1305_blocks_avx
-.p2align	4
-_poly1305_blocks_avx:
-#endif /* __APPLE__ */
-        pushq	%r15
-        pushq	%rbx
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        movq	%rdx, %rcx
-        movq	(%rdi), %r15
-        movq	8(%rdi), %rbx
-        movq	24(%rdi), %r8
-        movq	32(%rdi), %r9
-        movq	40(%rdi), %r10
-L_poly1305_avx_blocks_start:
-        # h += m
-        movq	(%rsi), %r11
-        movq	8(%rsi), %r12
-        addq	%r11, %r8
-        adcq	%r12, %r9
-        movq	%rbx, %rax
-        adcq	$0x00, %r10
-        # r[1] * h[0] => rdx, rax ==> t2, t1
-        mulq	%r8
-        movq	%rax, %r12
-        movq	%rdx, %r13
-        # r[0] * h[1] => rdx, rax ++> t2, t1
-        movq	%r15, %rax
-        mulq	%r9
-        addq	%rax, %r12
-        movq	%r15, %rax
-        adcq	%rdx, %r13
-        # r[0] * h[0] => rdx, rax ==> t4, t0
-        mulq	%r8
-        movq	%rax, %r11
-        movq	%rdx, %r8
-        # r[1] * h[1] => rdx, rax =+> t3, t2
-        movq	%rbx, %rax
-        mulq	%r9
-        #   r[0] * h[2] +> t2
-        addq	360(%rdi,%r10,8), %r13
-        movq	%rdx, %r14
-        addq	%r8, %r12
-        adcq	%rax, %r13
-        #   r[1] * h[2] +> t3
-        adcq	416(%rdi,%r10,8), %r14
-        # r * h in r14, r13, r12, r11
-        # h = (r * h) mod 2^130 - 5
-        movq	%r13, %r10
-        andq	$-4, %r13
-        andq	$3, %r10
-        addq	%r13, %r11
-        movq	%r13, %r8
-        adcq	%r14, %r12
-        adcq	$0x00, %r10
-        shrdq	$2, %r14, %r8
-        shrq	$2, %r14
-        addq	%r11, %r8
-        adcq	%r14, %r12
-        movq	%r12, %r9
-        adcq	$0x00, %r10
-        # h in r10, r9, r8
-        # Next block from message
-        addq	$16, %rsi
-        subq	$16, %rcx
-        jg	L_poly1305_avx_blocks_start
-        # Store h to ctx
-        movq	%r8, 24(%rdi)
-        movq	%r9, 32(%rdi)
-        movq	%r10, 40(%rdi)
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        popq	%rbx
-        popq	%r15
-        repz retq
-#ifndef __APPLE__
-.size	poly1305_blocks_avx,.-poly1305_blocks_avx
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	poly1305_final_avx
-.type	poly1305_final_avx,@function
-.align	16
-poly1305_final_avx:
-#else
-.section	__TEXT,__text
-.globl	_poly1305_final_avx
-.p2align	4
-_poly1305_final_avx:
-#endif /* __APPLE__ */
-        pushq	%rbx
-        pushq	%r12
-        movq	%rsi, %rbx
-        movq	608(%rdi), %rax
-        testq	%rax, %rax
-        je	L_poly1305_avx_final_no_more
-        movb	$0x01, 480(%rdi,%rax,1)
-        jmp	L_poly1305_avx_final_cmp_rem
-L_poly1305_avx_final_zero_rem:
-        movb	$0x00, 480(%rdi,%rax,1)
-L_poly1305_avx_final_cmp_rem:
-        incb	%al
-        cmpq	$16, %rax
-        jl	L_poly1305_avx_final_zero_rem
-        movb	$0x00, 616(%rdi)
-        leaq	480(%rdi), %rsi
-#ifndef __APPLE__
-        callq	poly1305_block_avx@plt
-#else
-        callq	_poly1305_block_avx
-#endif /* __APPLE__ */
-L_poly1305_avx_final_no_more:
-        movq	24(%rdi), %rax
-        movq	32(%rdi), %rdx
-        movq	40(%rdi), %rcx
-        movq	48(%rdi), %r11
-        movq	56(%rdi), %r12
-        # h %= p
-        # h = (h + pad)
-        # mod 2^130 - 5
-        movq	%rcx, %r8
-        andq	$3, %rcx
-        shrq	$2, %r8
-        #   Multiply by 5
-        leaq	0(%r8,%r8,4), %r8
-        addq	%r8, %rax
-        adcq	$0x00, %rdx
-        adcq	$0x00, %rcx
-        # Fixup when between (1 << 130) - 1 and (1 << 130) - 5
-        movq	%rax, %r8
-        movq	%rdx, %r9
-        movq	%rcx, %r10
-        addq	$5, %r8
-        adcq	$0x00, %r9
-        adcq	$0x00, %r10
-        cmpq	$4, %r10
-        cmoveq	%r8, %rax
-        cmoveq	%r9, %rdx
-        # h += pad
-        addq	%r11, %rax
-        adcq	%r12, %rdx
-        movq	%rax, (%rbx)
-        movq	%rdx, 8(%rbx)
-        # Zero out r
-        movq	$0x00, (%rdi)
-        movq	$0x00, 8(%rdi)
-        # Zero out h
-        movq	$0x00, 24(%rdi)
-        movq	$0x00, 32(%rdi)
-        movq	$0x00, 40(%rdi)
-        # Zero out pad
-        movq	$0x00, 48(%rdi)
-        movq	$0x00, 56(%rdi)
-        popq	%r12
-        popq	%rbx
-        repz retq
-#ifndef __APPLE__
-.size	poly1305_final_avx,.-poly1305_final_avx
-#endif /* __APPLE__ */
-#endif /* HAVE_INTEL_AVX1 */
-#ifdef HAVE_INTEL_AVX2
-#ifndef __APPLE__
-.text
-.globl	poly1305_calc_powers_avx2
-.type	poly1305_calc_powers_avx2,@function
-.align	16
-poly1305_calc_powers_avx2:
-#else
-.section	__TEXT,__text
-.globl	_poly1305_calc_powers_avx2
-.p2align	4
-_poly1305_calc_powers_avx2:
-#endif /* __APPLE__ */
-        pushq	%r12
-        pushq	%r13
-        pushq	%r14
-        pushq	%r15
-        pushq	%rbx
-        pushq	%rbp
-        movq	(%rdi), %rcx
-        movq	8(%rdi), %r8
-        xorq	%r9, %r9
-        # Convert to 26 bits in 32
-        movq	%rcx, %rax
-        movq	%rcx, %rdx
-        movq	%rcx, %rsi
-        movq	%r8, %rbx
-        movq	%r8, %rbp
-        shrq	$26, %rdx
-        shrdq	$52, %r8, %rsi
-        shrq	$14, %rbx
-        shrdq	$40, %r9, %rbp
-        andq	$0x3ffffff, %rax
-        andq	$0x3ffffff, %rdx
-        andq	$0x3ffffff, %rsi
-        andq	$0x3ffffff, %rbx
-        andq	$0x3ffffff, %rbp
-        movl	%eax, 224(%rdi)
-        movl	%edx, 228(%rdi)
-        movl	%esi, 232(%rdi)
-        movl	%ebx, 236(%rdi)
-        movl	%ebp, 240(%rdi)
-        movl	$0x00, 244(%rdi)
-        # Square 128-bit
-        movq	%r8, %rax
-        mulq	%rcx
-        xorq	%r13, %r13
-        movq	%rax, %r11
-        movq	%rdx, %r12
-        addq	%rax, %r11
-        adcq	%rdx, %r12
-        adcq	$0x00, %r13
-        movq	%rcx, %rax
-        mulq	%rax
-        movq	%rax, %r10
-        movq	%rdx, %r15
-        movq	%r8, %rax
-        mulq	%rax
-        addq	%r15, %r11
-        adcq	%rax, %r12
-        adcq	%rdx, %r13
-        # Reduce 256-bit to 130-bit
-        movq	%r12, %rax
-        movq	%r13, %rdx
-        andq	$-4, %rax
-        andq	$3, %r12
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        shrdq	$2, %rdx, %rax
-        shrq	$2, %rdx
-        addq	%rax, %r10
-        adcq	%rdx, %r11
-        adcq	$0x00, %r12
-        movq	%r12, %rax
-        shrq	$2, %rax
-        leaq	0(%rax,%rax,4), %rax
-        andq	$3, %r12
-        addq	%rax, %r10
-        adcq	$0x00, %r11
-        adcq	$0x00, %r12
-        # Convert to 26 bits in 32
-        movq	%r10, %rax
-        movq	%r10, %rdx
-        movq	%r10, %rsi
-        movq	%r11, %rbx
-        movq	%r11, %rbp
-        shrq	$26, %rdx
-        shrdq	$52, %r11, %rsi
-        shrq	$14, %rbx
-        shrdq	$40, %r12, %rbp
-        andq	$0x3ffffff, %rax
-        andq	$0x3ffffff, %rdx
-        andq	$0x3ffffff, %rsi
-        andq	$0x3ffffff, %rbx
-        andq	$0x3ffffff, %rbp
-        movl	%eax, 256(%rdi)
-        movl	%edx, 260(%rdi)
-        movl	%esi, 264(%rdi)
-        movl	%ebx, 268(%rdi)
-        movl	%ebp, 272(%rdi)
-        movl	$0x00, 276(%rdi)
-        # Multiply 128-bit by 130-bit
-        #   r1[0] * r2[0]
-        movq	%rcx, %rax
-        mulq	%r10
-        movq	%rax, %r13
-        movq	%rdx, %r14
-        #   r1[0] * r2[1]
-        movq	%rcx, %rax
-        mulq	%r11
-        movq	$0x00, %r15
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        #   r1[1] * r2[0]
-        movq	%r8, %rax
-        mulq	%r10
-        movq	$0x00, %rsi
-        addq	%rax, %r14
-        adcq	%rdx, %r15
-        adcq	$0x00, %rsi
-        #   r1[0] * r2[2]
-        movq	%rcx, %rax
-        mulq	%r12
-        addq	%rax, %r15
-        adcq	%rdx, %rsi
-        #   r1[1] * r2[1]
-        movq	%r8, %rax
-        mulq	%r11
-        movq	$0x00, %rbx
-        addq	%rax, %r15
-        adcq	%rdx, %rsi
-        adcq	$0x00, %rbx
-        #   r1[1] * r2[2]
-        movq	%r8, %rax
-        mulq	%r12
-        addq	%rax, %rsi
-        adcq	%rdx, %rbx
-        # Reduce 260-bit to 130-bit
-        movq	%r15, %rax
-        movq	%rsi, %rdx
-        movq	%rbx, %rbx
-        andq	$-4, %rax
-        andq	$3, %r15
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	%rbx, %r15
-        shrdq	$2, %rdx, %rax
-        shrdq	$2, %rbx, %rdx
-        shrq	$2, %rbx
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        adcq	%rbx, %r15
-        movq	%r15, %rax
-        andq	$3, %r15
-        shrq	$2, %rax
-        leaq	0(%rax,%rax,4), %rax
-        addq	%rax, %r13
-        adcq	$0x00, %r14
-        adcq	$0x00, %r15
-        # Convert to 26 bits in 32
-        movq	%r13, %rax
-        movq	%r13, %rdx
-        movq	%r13, %rsi
-        movq	%r14, %rbx
-        movq	%r14, %rbp
-        shrq	$26, %rdx
-        shrdq	$52, %r14, %rsi
-        shrq	$14, %rbx
-        shrdq	$40, %r15, %rbp
-        andq	$0x3ffffff, %rax
-        andq	$0x3ffffff, %rdx
-        andq	$0x3ffffff, %rsi
-        andq	$0x3ffffff, %rbx
-        andq	$0x3ffffff, %rbp
-        movl	%eax, 288(%rdi)
-        movl	%edx, 292(%rdi)
-        movl	%esi, 296(%rdi)
-        movl	%ebx, 300(%rdi)
-        movl	%ebp, 304(%rdi)
-        movl	$0x00, 308(%rdi)
-        # Square 130-bit
-        movq	%r11, %rax
-        mulq	%r10
-        xorq	%r13, %r13
-        movq	%rax, %r8
-        movq	%rdx, %r9
-        addq	%rax, %r8
-        adcq	%rdx, %r9
-        adcq	$0x00, %r13
-        movq	%r10, %rax
-        mulq	%rax
-        movq	%rax, %rcx
-        movq	%rdx, %r15
-        movq	%r11, %rax
-        mulq	%rax
-        addq	%r15, %r8
-        adcq	%rax, %r9
-        adcq	%rdx, %r13
-        movq	%r12, %rax
-        mulq	%rax
-        movq	%rax, %r14
-        movq	%r12, %rax
-        mulq	%r10
-        addq	%rax, %r9
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        addq	%rax, %r9
-        adcq	%rdx, %r13
-        adcq	$0x00, %r14
-        movq	%r12, %rax
-        mulq	%r11
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        addq	%rax, %r13
-        adcq	%rdx, %r14
-        # Reduce 260-bit to 130-bit
-        movq	%r9, %rax
-        movq	%r13, %rdx
-        movq	%r14, %r15
-        andq	$-4, %rax
-        andq	$3, %r9
-        addq	%rax, %rcx
-        adcq	%rdx, %r8
-        adcq	%r15, %r9
-        shrdq	$2, %rdx, %rax
-        shrdq	$2, %r15, %rdx
-        shrq	$2, %r15
-        addq	%rax, %rcx
-        adcq	%rdx, %r8
-        adcq	%r15, %r9
-        movq	%r9, %rax
-        andq	$3, %r9
-        shrq	$2, %rax
-        leaq	0(%rax,%rax,4), %rax
-        addq	%rax, %rcx
-        adcq	$0x00, %r8
-        adcq	$0x00, %r9
-        # Convert to 26 bits in 32
-        movq	%rcx, %rax
-        movq	%rcx, %rdx
-        movq	%rcx, %rsi
-        movq	%r8, %rbx
-        movq	%r8, %rbp
-        shrq	$26, %rdx
-        shrdq	$52, %r8, %rsi
-        shrq	$14, %rbx
-        shrdq	$40, %r9, %rbp
-        andq	$0x3ffffff, %rax
-        andq	$0x3ffffff, %rdx
-        andq	$0x3ffffff, %rsi
-        andq	$0x3ffffff, %rbx
-        andq	$0x3ffffff, %rbp
-        movl	%eax, 320(%rdi)
-        movl	%edx, 324(%rdi)
-        movl	%esi, 328(%rdi)
-        movl	%ebx, 332(%rdi)
-        movl	%ebp, 336(%rdi)
-        movl	$0x00, 340(%rdi)
-        popq	%rbp
-        popq	%rbx
-        popq	%r15
-        popq	%r14
-        popq	%r13
-        popq	%r12
-        repz retq
-#ifndef __APPLE__
-.size	poly1305_calc_powers_avx2,.-poly1305_calc_powers_avx2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	poly1305_setkey_avx2
-.type	poly1305_setkey_avx2,@function
-.align	16
-poly1305_setkey_avx2:
-#else
-.section	__TEXT,__text
-.globl	_poly1305_setkey_avx2
-.p2align	4
-_poly1305_setkey_avx2:
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-        callq	poly1305_setkey_avx@plt
-#else
-        callq	_poly1305_setkey_avx
-#endif /* __APPLE__ */
-        vpxor	%ymm0, %ymm0, %ymm0
-        vmovdqu	%ymm0, 64(%rdi)
-        vmovdqu	%ymm0, 96(%rdi)
-        vmovdqu	%ymm0, 128(%rdi)
-        vmovdqu	%ymm0, 160(%rdi)
-        vmovdqu	%ymm0, 192(%rdi)
-        movq	$0x00, 608(%rdi)
-        movw	$0x00, 616(%rdi)
-        repz retq
-#ifndef __APPLE__
-.size	poly1305_setkey_avx2,.-poly1305_setkey_avx2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	32
-#else
-.p2align	5
-#endif /* __APPLE__ */
-L_poly1305_avx2_blocks_mask:
-.quad	0x3ffffff, 0x3ffffff
-.quad	0x3ffffff, 0x3ffffff
-#ifndef __APPLE__
-.data
-#else
-.section	__DATA,__data
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.align	32
-#else
-.p2align	5
-#endif /* __APPLE__ */
-L_poly1305_avx2_blocks_hibit:
-.quad	0x1000000, 0x1000000
-.quad	0x1000000, 0x1000000
-#ifndef __APPLE__
-.text
-.globl	poly1305_blocks_avx2
-.type	poly1305_blocks_avx2,@function
-.align	16
-poly1305_blocks_avx2:
-#else
-.section	__TEXT,__text
-.globl	_poly1305_blocks_avx2
-.p2align	4
-_poly1305_blocks_avx2:
-#endif /* __APPLE__ */
-        pushq	%r12
-        pushq	%rbx
-        subq	$0x140, %rsp
-        movq	%rsp, %rcx
-        andq	$-32, %rcx
-        addq	$32, %rcx
-        vpxor	%ymm15, %ymm15, %ymm15
-        movq	%rcx, %rbx
-        leaq	64(%rdi), %rax
-        addq	$0xa0, %rbx
-        cmpw	$0x00, 616(%rdi)
-        jne	L_poly1305_avx2_blocks_begin_h
-        # Load the message data
-        vmovdqu	(%rsi), %ymm0
-        vmovdqu	32(%rsi), %ymm1
-        vperm2i128	$32, %ymm1, %ymm0, %ymm2
-        vperm2i128	$49, %ymm1, %ymm0, %ymm0
-        vpunpckldq	%ymm0, %ymm2, %ymm1
-        vpunpckhdq	%ymm0, %ymm2, %ymm3
-        vpunpckldq	%ymm15, %ymm1, %ymm0
-        vpunpckhdq	%ymm15, %ymm1, %ymm1
-        vpunpckldq	%ymm15, %ymm3, %ymm2
-        vpunpckhdq	%ymm15, %ymm3, %ymm3
-        vmovdqu	L_poly1305_avx2_blocks_hibit(%rip), %ymm4
-        vpsllq	$6, %ymm1, %ymm1
-        vpsllq	$12, %ymm2, %ymm2
-        vpsllq	$18, %ymm3, %ymm3
-        vmovdqu	L_poly1305_avx2_blocks_mask(%rip), %ymm14
-        # Reduce, in place, the message data
-        vpsrlq	$26, %ymm0, %ymm10
-        vpsrlq	$26, %ymm3, %ymm11
-        vpand	%ymm14, %ymm0, %ymm0
-        vpand	%ymm14, %ymm3, %ymm3
-        vpaddq	%ymm1, %ymm10, %ymm1
-        vpaddq	%ymm4, %ymm11, %ymm4
-        vpsrlq	$26, %ymm1, %ymm10
-        vpsrlq	$26, %ymm4, %ymm11
-        vpand	%ymm14, %ymm1, %ymm1
-        vpand	%ymm14, %ymm4, %ymm4
-        vpaddq	%ymm2, %ymm10, %ymm2
-        vpslld	$2, %ymm11, %ymm12
-        vpaddd	%ymm12, %ymm11, %ymm12
-        vpsrlq	$26, %ymm2, %ymm10
-        vpaddq	%ymm0, %ymm12, %ymm0
-        vpsrlq	$26, %ymm0, %ymm11
-        vpand	%ymm14, %ymm2, %ymm2
-        vpand	%ymm14, %ymm0, %ymm0
-        vpaddq	%ymm3, %ymm10, %ymm3
-        vpaddq	%ymm1, %ymm11, %ymm1
-        vpsrlq	$26, %ymm3, %ymm10
-        vpand	%ymm14, %ymm3, %ymm3
-        vpaddq	%ymm4, %ymm10, %ymm4
-        addq	$0x40, %rsi
-        subq	$0x40, %rdx
-        jz	L_poly1305_avx2_blocks_store
-        jmp	L_poly1305_avx2_blocks_load_r4
-L_poly1305_avx2_blocks_begin_h:
-        # Load the H values.
-        vmovdqu	(%rax), %ymm0
-        vmovdqu	32(%rax), %ymm1
-        vmovdqu	64(%rax), %ymm2
-        vmovdqu	96(%rax), %ymm3
-        vmovdqu	128(%rax), %ymm4
-        # Check if there is a power of r to load - otherwise use r^4.
-        cmpb	$0x00, 616(%rdi)
-        je	L_poly1305_avx2_blocks_load_r4
-        # Load the 4 powers of r - r^4, r^3, r^2, r^1.
-        vmovdqu	224(%rdi), %ymm8
-        vmovdqu	256(%rdi), %ymm7
-        vmovdqu	288(%rdi), %ymm6
-        vmovdqu	320(%rdi), %ymm5
-        vpermq	$0xd8, %ymm5, %ymm5
-        vpermq	$0xd8, %ymm6, %ymm6
-        vpermq	$0xd8, %ymm7, %ymm7
-        vpermq	$0xd8, %ymm8, %ymm8
-        vpunpcklqdq	%ymm6, %ymm5, %ymm10
-        vpunpckhqdq	%ymm6, %ymm5, %ymm11
-        vpunpcklqdq	%ymm8, %ymm7, %ymm12
-        vpunpckhqdq	%ymm8, %ymm7, %ymm13
-        vperm2i128	$32, %ymm12, %ymm10, %ymm5
-        vperm2i128	$49, %ymm12, %ymm10, %ymm7
-        vperm2i128	$32, %ymm13, %ymm11, %ymm9
-        vpsrlq	$32, %ymm5, %ymm6
-        vpsrlq	$32, %ymm7, %ymm8
-        jmp	L_poly1305_avx2_blocks_mul_5
-L_poly1305_avx2_blocks_load_r4:
-        # Load r^4 into all four positions.
-        vmovdqu	320(%rdi), %ymm13
-        vpermq	$0x00, %ymm13, %ymm5
-        vpsrlq	$32, %ymm13, %ymm14
-        vpermq	$0x55, %ymm13, %ymm7
-        vpermq	$0xaa, %ymm13, %ymm9
-        vpermq	$0x00, %ymm14, %ymm6
-        vpermq	$0x55, %ymm14, %ymm8
-L_poly1305_avx2_blocks_mul_5:
-        # Multiply top 4 26-bit values of all four H by 5
-        vpslld	$2, %ymm6, %ymm10
-        vpslld	$2, %ymm7, %ymm11
-        vpslld	$2, %ymm8, %ymm12
-        vpslld	$2, %ymm9, %ymm13
-        vpaddq	%ymm10, %ymm6, %ymm10
-        vpaddq	%ymm11, %ymm7, %ymm11
-        vpaddq	%ymm12, %ymm8, %ymm12
-        vpaddq	%ymm13, %ymm9, %ymm13
-        # Store powers of r and multiple of 5 for use in multiply.
-        vmovdqa	%ymm10, (%rbx)
-        vmovdqa	%ymm11, 32(%rbx)
-        vmovdqa	%ymm12, 64(%rbx)
-        vmovdqa	%ymm13, 96(%rbx)
-        vmovdqa	%ymm5, (%rcx)
-        vmovdqa	%ymm6, 32(%rcx)
-        vmovdqa	%ymm7, 64(%rcx)
-        vmovdqa	%ymm8, 96(%rcx)
-        vmovdqa	%ymm9, 128(%rcx)
-        vmovdqu	L_poly1305_avx2_blocks_mask(%rip), %ymm14
-        # If not finished then loop over data
-        cmpb	$0x01, 616(%rdi)
-        jne	L_poly1305_avx2_blocks_start
-        # Do last multiply, reduce, add the four H together and move to
-        # 32-bit registers
-        vpmuludq	(%rbx), %ymm4, %ymm5
-        vpmuludq	32(%rbx), %ymm3, %ymm10
-        vpmuludq	32(%rbx), %ymm4, %ymm6
-        vpmuludq	64(%rbx), %ymm3, %ymm11
-        vpmuludq	64(%rbx), %ymm4, %ymm7
-        vpaddq	%ymm5, %ymm10, %ymm5
-        vpmuludq	64(%rbx), %ymm2, %ymm12
-        vpmuludq	96(%rbx), %ymm4, %ymm8
-        vpaddq	%ymm6, %ymm11, %ymm6
-        vpmuludq	96(%rbx), %ymm1, %ymm13
-        vpmuludq	96(%rbx), %ymm2, %ymm10
-        vpaddq	%ymm5, %ymm12, %ymm5
-        vpmuludq	96(%rbx), %ymm3, %ymm11
-        vpmuludq	(%rcx), %ymm3, %ymm12
-        vpaddq	%ymm5, %ymm13, %ymm5
-        vpmuludq	(%rcx), %ymm4, %ymm9
-        vpaddq	%ymm6, %ymm10, %ymm6
-        vpmuludq	(%rcx), %ymm0, %ymm13
-        vpaddq	%ymm7, %ymm11, %ymm7
-        vpmuludq	(%rcx), %ymm1, %ymm10
-        vpaddq	%ymm8, %ymm12, %ymm8
-        vpmuludq	(%rcx), %ymm2, %ymm11
-        vpmuludq	32(%rcx), %ymm2, %ymm12
-        vpaddq	%ymm5, %ymm13, %ymm5
-        vpmuludq	32(%rcx), %ymm3, %ymm13
-        vpaddq	%ymm6, %ymm10, %ymm6
-        vpmuludq	32(%rcx), %ymm0, %ymm10
-        vpaddq	%ymm7, %ymm11, %ymm7
-        vpmuludq	32(%rcx), %ymm1, %ymm11
-        vpaddq	%ymm8, %ymm12, %ymm8
-        vpmuludq	64(%rcx), %ymm1, %ymm12
-        vpaddq	%ymm9, %ymm13, %ymm9
-        vpmuludq	64(%rcx), %ymm2, %ymm13
-        vpaddq	%ymm6, %ymm10, %ymm6
-        vpmuludq	64(%rcx), %ymm0, %ymm10
-        vpaddq	%ymm7, %ymm11, %ymm7
-        vpmuludq	96(%rcx), %ymm0, %ymm11
-        vpaddq	%ymm8, %ymm12, %ymm8
-        vpmuludq	96(%rcx), %ymm1, %ymm12
-        vpaddq	%ymm9, %ymm13, %ymm9
-        vpaddq	%ymm7, %ymm10, %ymm7
-        vpmuludq	128(%rcx), %ymm0, %ymm13
-        vpaddq	%ymm8, %ymm11, %ymm8
-        vpaddq	%ymm9, %ymm12, %ymm9
-        vpaddq	%ymm9, %ymm13, %ymm9
-        vpsrlq	$26, %ymm5, %ymm10
-        vpsrlq	$26, %ymm8, %ymm11
-        vpand	%ymm14, %ymm5, %ymm5
-        vpand	%ymm14, %ymm8, %ymm8
-        vpaddq	%ymm6, %ymm10, %ymm6
-        vpaddq	%ymm9, %ymm11, %ymm9
-        vpsrlq	$26, %ymm6, %ymm10
-        vpsrlq	$26, %ymm9, %ymm11
-        vpand	%ymm14, %ymm6, %ymm1
-        vpand	%ymm14, %ymm9, %ymm4
-        vpaddq	%ymm7, %ymm10, %ymm7
-        vpslld	$2, %ymm11, %ymm12
-        vpaddd	%ymm12, %ymm11, %ymm12
-        vpsrlq	$26, %ymm7, %ymm10
-        vpaddq	%ymm5, %ymm12, %ymm5
-        vpsrlq	$26, %ymm5, %ymm11
-        vpand	%ymm14, %ymm7, %ymm2
-        vpand	%ymm14, %ymm5, %ymm0
-        vpaddq	%ymm8, %ymm10, %ymm8
-        vpaddq	%ymm1, %ymm11, %ymm1
-        vpsrlq	$26, %ymm8, %ymm10
-        vpand	%ymm14, %ymm8, %ymm3
-        vpaddq	%ymm4, %ymm10, %ymm4
-        vpsrldq	$8, %ymm0, %ymm5
-        vpsrldq	$8, %ymm1, %ymm6
-        vpsrldq	$8, %ymm2, %ymm7
-        vpsrldq	$8, %ymm3, %ymm8
-        vpsrldq	$8, %ymm4, %ymm9
-        vpaddq	%ymm0, %ymm5, %ymm0
-        vpaddq	%ymm1, %ymm6, %ymm1
-        vpaddq	%ymm2, %ymm7, %ymm2
-        vpaddq	%ymm3, %ymm8, %ymm3
-        vpaddq	%ymm4, %ymm9, %ymm4
-        vpermq	$2, %ymm0, %ymm5
-        vpermq	$2, %ymm1, %ymm6
-        vpermq	$2, %ymm2, %ymm7
-        vpermq	$2, %ymm3, %ymm8
-        vpermq	$2, %ymm4, %ymm9
-        vpaddq	%ymm0, %ymm5, %ymm0
-        vpaddq	%ymm1, %ymm6, %ymm1
-        vpaddq	%ymm2, %ymm7, %ymm2
-        vpaddq	%ymm3, %ymm8, %ymm3
-        vpaddq	%ymm4, %ymm9, %ymm4
-        vmovd	%xmm0, %r8d
-        vmovd	%xmm1, %r9d
-        vmovd	%xmm2, %r10d
-        vmovd	%xmm3, %r11d
-        vmovd	%xmm4, %r12d
-        jmp	L_poly1305_avx2_blocks_end_calc
-L_poly1305_avx2_blocks_start:
-        vmovdqu	(%rsi), %ymm5
-        vmovdqu	32(%rsi), %ymm6
-        vperm2i128	$32, %ymm6, %ymm5, %ymm7
-        vperm2i128	$49, %ymm6, %ymm5, %ymm5
-        vpunpckldq	%ymm5, %ymm7, %ymm6
-        vpunpckhdq	%ymm5, %ymm7, %ymm8
-        vpunpckldq	%ymm15, %ymm6, %ymm5
-        vpunpckhdq	%ymm15, %ymm6, %ymm6
-        vpunpckldq	%ymm15, %ymm8, %ymm7
-        vpunpckhdq	%ymm15, %ymm8, %ymm8
-        vmovdqu	L_poly1305_avx2_blocks_hibit(%rip), %ymm9
-        vpsllq	$6, %ymm6, %ymm6
-        vpsllq	$12, %ymm7, %ymm7
-        vpsllq	$18, %ymm8, %ymm8
-        vpmuludq	(%rbx), %ymm4, %ymm10
-        vpaddq	%ymm5, %ymm10, %ymm5
-        vpmuludq	32(%rbx), %ymm3, %ymm10
-        vpmuludq	32(%rbx), %ymm4, %ymm11
-        vpaddq	%ymm6, %ymm11, %ymm6
-        vpmuludq	64(%rbx), %ymm3, %ymm11
-        vpmuludq	64(%rbx), %ymm4, %ymm12
-        vpaddq	%ymm7, %ymm12, %ymm7
-        vpaddq	%ymm5, %ymm10, %ymm5
-        vpmuludq	64(%rbx), %ymm2, %ymm12
-        vpmuludq	96(%rbx), %ymm4, %ymm13
-        vpaddq	%ymm8, %ymm13, %ymm8
-        vpaddq	%ymm6, %ymm11, %ymm6
-        vpmuludq	96(%rbx), %ymm1, %ymm13
-        vpmuludq	96(%rbx), %ymm2, %ymm10
-        vpaddq	%ymm5, %ymm12, %ymm5
-        vpmuludq	96(%rbx), %ymm3, %ymm11
-        vpmuludq	(%rcx), %ymm3, %ymm12
-        vpaddq	%ymm5, %ymm13, %ymm5
-        vpmuludq	(%rcx), %ymm4, %ymm13
-        vpaddq	%ymm9, %ymm13, %ymm9
-        vpaddq	%ymm6, %ymm10, %ymm6
-        vpmuludq	(%rcx), %ymm0, %ymm13
-        vpaddq	%ymm7, %ymm11, %ymm7
-        vpmuludq	(%rcx), %ymm1, %ymm10
-        vpaddq	%ymm8, %ymm12, %ymm8
-        vpmuludq	(%rcx), %ymm2, %ymm11
-        vpmuludq	32(%rcx), %ymm2, %ymm12
-        vpaddq	%ymm5, %ymm13, %ymm5
-        vpmuludq	32(%rcx), %ymm3, %ymm13
-        vpaddq	%ymm6, %ymm10, %ymm6
-        vpmuludq	32(%rcx), %ymm0, %ymm10
-        vpaddq	%ymm7, %ymm11, %ymm7
-        vpmuludq	32(%rcx), %ymm1, %ymm11
-        vpaddq	%ymm8, %ymm12, %ymm8
-        vpmuludq	64(%rcx), %ymm1, %ymm12
-        vpaddq	%ymm9, %ymm13, %ymm9
-        vpmuludq	64(%rcx), %ymm2, %ymm13
-        vpaddq	%ymm6, %ymm10, %ymm6
-        vpmuludq	64(%rcx), %ymm0, %ymm10
-        vpaddq	%ymm7, %ymm11, %ymm7
-        vpmuludq	96(%rcx), %ymm0, %ymm11
-        vpaddq	%ymm8, %ymm12, %ymm8
-        vpmuludq	96(%rcx), %ymm1, %ymm12
-        vpaddq	%ymm9, %ymm13, %ymm9
-        vpaddq	%ymm7, %ymm10, %ymm7
-        vpmuludq	128(%rcx), %ymm0, %ymm13
-        vpaddq	%ymm8, %ymm11, %ymm8
-        vpaddq	%ymm9, %ymm12, %ymm9
-        vpaddq	%ymm9, %ymm13, %ymm9
-        vpsrlq	$26, %ymm5, %ymm10
-        vpsrlq	$26, %ymm8, %ymm11
-        vpand	%ymm14, %ymm5, %ymm5
-        vpand	%ymm14, %ymm8, %ymm8
-        vpaddq	%ymm6, %ymm10, %ymm6
-        vpaddq	%ymm9, %ymm11, %ymm9
-        vpsrlq	$26, %ymm6, %ymm10
-        vpsrlq	$26, %ymm9, %ymm11
-        vpand	%ymm14, %ymm6, %ymm1
-        vpand	%ymm14, %ymm9, %ymm4
-        vpaddq	%ymm7, %ymm10, %ymm7
-        vpslld	$2, %ymm11, %ymm12
-        vpaddd	%ymm12, %ymm11, %ymm12
-        vpsrlq	$26, %ymm7, %ymm10
-        vpaddq	%ymm5, %ymm12, %ymm5
-        vpsrlq	$26, %ymm5, %ymm11
-        vpand	%ymm14, %ymm7, %ymm2
-        vpand	%ymm14, %ymm5, %ymm0
-        vpaddq	%ymm8, %ymm10, %ymm8
-        vpaddq	%ymm1, %ymm11, %ymm1
-        vpsrlq	$26, %ymm8, %ymm10
-        vpand	%ymm14, %ymm8, %ymm3
-        vpaddq	%ymm4, %ymm10, %ymm4
-        addq	$0x40, %rsi
-        subq	$0x40, %rdx
-        jnz	L_poly1305_avx2_blocks_start
-L_poly1305_avx2_blocks_store:
-        # Store four H values - state
-        vmovdqu	%ymm0, (%rax)
-        vmovdqu	%ymm1, 32(%rax)
-        vmovdqu	%ymm2, 64(%rax)
-        vmovdqu	%ymm3, 96(%rax)
-        vmovdqu	%ymm4, 128(%rax)
-L_poly1305_avx2_blocks_end_calc:
-        cmpb	$0x00, 616(%rdi)
-        je	L_poly1305_avx2_blocks_complete
-        movq	%r8, %rax
-        movq	%r10, %rdx
-        movq	%r12, %rcx
-        shrq	$12, %rdx
-        shrq	$24, %rcx
-        shlq	$26, %r9
-        shlq	$52, %r10
-        shlq	$14, %r11
-        shlq	$40, %r12
-        addq	%r9, %rax
-        adcq	%r10, %rax
-        adcq	%r11, %rdx
-        adcq	%r12, %rdx
-        adcq	$0x00, %rcx
-        movq	%rcx, %r8
-        andq	$3, %rcx
-        shrq	$2, %r8
-        leaq	0(%r8,%r8,4), %r8
-        addq	%r8, %rax
-        adcq	$0x00, %rdx
-        adcq	$0x00, %rcx
-        movq	%rax, 24(%rdi)
-        movq	%rdx, 32(%rdi)
-        movq	%rcx, 40(%rdi)
-L_poly1305_avx2_blocks_complete:
-        movb	$0x01, 617(%rdi)
-        addq	$0x140, %rsp
-        popq	%rbx
-        popq	%r12
-        repz retq
-#ifndef __APPLE__
-.size	poly1305_blocks_avx2,.-poly1305_blocks_avx2
-#endif /* __APPLE__ */
-#ifndef __APPLE__
-.text
-.globl	poly1305_final_avx2
-.type	poly1305_final_avx2,@function
-.align	16
-poly1305_final_avx2:
-#else
-.section	__TEXT,__text
-.globl	_poly1305_final_avx2
-.p2align	4
-_poly1305_final_avx2:
-#endif /* __APPLE__ */
-        movb	$0x01, 616(%rdi)
-        movb	617(%rdi), %cl
-        cmpb	$0x00, %cl
-        je	L_poly1305_avx2_final_done_blocks_X4
-        pushq	%rsi
-        movq	$0x40, %rdx
-        xorq	%rsi, %rsi
-#ifndef __APPLE__
-        callq	poly1305_blocks_avx2@plt
-#else
-        callq	_poly1305_blocks_avx2
-#endif /* __APPLE__ */
-        popq	%rsi
-L_poly1305_avx2_final_done_blocks_X4:
-        movq	608(%rdi), %rax
-        movq	%rax, %rcx
-        andq	$-16, %rcx
-        cmpb	$0x00, %cl
-        je	L_poly1305_avx2_final_done_blocks
-        pushq	%rcx
-        pushq	%rax
-        pushq	%rsi
-        movq	%rcx, %rdx
-        leaq	480(%rdi), %rsi
-#ifndef __APPLE__
-        callq	poly1305_blocks_avx@plt
-#else
-        callq	_poly1305_blocks_avx
-#endif /* __APPLE__ */
-        popq	%rsi
-        popq	%rax
-        popq	%rcx
-L_poly1305_avx2_final_done_blocks:
-        subq	%rcx, 608(%rdi)
-        xorq	%rdx, %rdx
-        jmp	L_poly1305_avx2_final_cmp_copy
-L_poly1305_avx2_final_start_copy:
-        movb	480(%rdi,%rcx,1), %r8b
-        movb	%r8b, 480(%rdi,%rdx,1)
-        incb	%cl
-        incb	%dl
-L_poly1305_avx2_final_cmp_copy:
-        cmp	%rcx, %rax
-        jne	L_poly1305_avx2_final_start_copy
-#ifndef __APPLE__
-        callq	poly1305_final_avx@plt
-#else
-        callq	_poly1305_final_avx
-#endif /* __APPLE__ */
-        vpxor	%ymm0, %ymm0, %ymm0
-        vmovdqu	%ymm0, 64(%rdi)
-        vmovdqu	%ymm0, 96(%rdi)
-        vmovdqu	%ymm0, 128(%rdi)
-        vmovdqu	%ymm0, 160(%rdi)
-        vmovdqu	%ymm0, 192(%rdi)
-        vmovdqu	%ymm0, 224(%rdi)
-        vmovdqu	%ymm0, 256(%rdi)
-        vmovdqu	%ymm0, 288(%rdi)
-        vmovdqu	%ymm0, 320(%rdi)
-        movq	$0x00, 608(%rdi)
-        movw	$0x00, 616(%rdi)
-        repz retq
-#ifndef __APPLE__
-.size	poly1305_final_avx2,.-poly1305_final_avx2
-#endif /* __APPLE__ */
-#endif /* HAVE_INTEL_AVX2 */
-#endif /* WOLFSSL_X86_64_BUILD */
-
-#if defined(__linux__) && defined(__ELF__)
-.section	.note.GNU-stack,"",%progbits
-#endif

+ 0 - 109
lib/wolfssl/wolfcrypt/src/port/Espressif/README.md

@@ -1,109 +0,0 @@
-# ESP32 Port
-
-Support for the ESP32-WROOM-32 on-board crypto hardware acceleration for symmetric AES, SHA1/SHA256/SHA384/SHA512, and the RSA primitives mul, mulmod, and exptmod.
-
-## ESP32 Acceleration
-
-For details about ESP32 HW acceleration, see the [Technical Reference Manual](https://espressif.com/sites/default/files/documentation/esp32_technical_reference_manual_en.pdf)
-
-### Building
-
-To enable hw acceleration:
-
-* Uncomment `#define WOLFSSL_ESPIDF` in `/path/to/wolfssl/wolfssl/wolfcrypt/settings.h`
-* Uncomment `#define WOLFSSL_ESPWROOM32` in `/path/to/wolfssl/wolfssl/wolfcrypt/settings.h`
-
-To disable portions of the hardware acceleration you can optionally define:
-
-```c
-/* Disables SHA, AES and RSA acceleration */
-#define NO_ESP32WROOM32_CRYPT
-/* Disables AES acceleration */
-#define NO_WOLFSSL_ESP32WROOM32_CRYPT_AES
-/* Disables SHA acceleration */
-#define NO_WOLFSSL_ESP32WROOM32_CRYPT_HASH
-/* Disables RSA primitive acceleration */
-#define NO_WOLFSSL_ESP32WROOM32_CRYPT_RSA_PRI
-```
-
-### Coding
-
-In your application you must include `<wolfssl/wolfcrypt/settings.h>` before any other wolfSSL headers. If building the sources directly we recommend defining `WOLFSSL_USER_SETTINGS` and adding your own `user_settings.h` file. You can find a good reference for this in `IDE/GCC-ARM/Header/user_settings.h`.
-
-
-### Benchmarks
-
-w/ `USE_FAST_MATH` and `WOLFSSL_SMALL_STACK` options
-
-Software-only implementation:
-
-```
-AES-128-CBC-enc      1 MB took 1.001 seconds,    1.146 MB/s
-AES-128-CBC-dec      1 MB took 1.017 seconds,    1.104 MB/s
-AES-192-CBC-enc      1 MB took 1.018 seconds,    1.055 MB/s
-AES-192-CBC-dec      1 MB took 1.006 seconds,    1.019 MB/s
-AES-256-CBC-enc   1000 KB took 1.000 seconds, 1000.000 KB/s
-AES-256-CBC-dec    975 KB took 1.007 seconds,  968.222 KB/s
-AES-128-GCM-enc    350 KB took 1.055 seconds,  331.754 KB/s
-AES-128-GCM-dec    350 KB took 1.054 seconds,  332.068 KB/s
-AES-192-GCM-enc    325 KB took 1.013 seconds,  320.829 KB/s
-AES-192-GCM-dec    325 KB took 1.013 seconds,  320.829 KB/s
-AES-256-GCM-enc    325 KB took 1.041 seconds,  312.200 KB/s
-AES-256-GCM-dec    325 KB took 1.041 seconds,  312.200 KB/s
-SHA                  6 MB took 1.004 seconds,    5.714 MB/s
-SHA-256              2 MB took 1.006 seconds,    1.747 MB/s
-SHA-384              1 MB took 1.011 seconds,    1.159 MB/s
-SHA-512              1 MB took 1.009 seconds,    1.161 MB/s
-HMAC-SHA             6 MB took 1.001 seconds,    5.634 MB/s
-HMAC-SHA256          2 MB took 1.000 seconds,    1.733 MB/s
-HMAC-SHA384          1 MB took 1.004 seconds,    1.046 MB/s
-HMAC-SHA512          1 MB took 1.002 seconds,    1.048 MB/s
-RSA     2048 public         16 ops took 1.056 sec, avg 66.000 ms, 15.152 ops/sec
-RSA     2048 private         2 ops took 2.488 sec, avg 1244.000 ms, 0.804 ops/sec
-ECC      256 key gen         4 ops took 1.101 sec, avg 275.250 ms, 3.633 ops/sec
-ECDHE    256 agree           4 ops took 1.098 sec, avg 274.500 ms, 3.643 ops/sec
-ECDSA    256 sign            4 ops took 1.111 sec, avg 277.750 ms, 3.600 ops/sec
-ECDSA    256 verify          2 ops took 1.099 sec, avg 549.500 ms, 1.820 ops/sec
-```
-
-Hardware acceleration:
-
-
-```
-AES-128-CBC-enc      6 MB took 1.004 seconds,    5.958 MB/s
-AES-128-CBC-dec      5 MB took 1.002 seconds,    5.287 MB/s
-AES-192-CBC-enc      6 MB took 1.004 seconds,    5.958 MB/s
-AES-192-CBC-dec      5 MB took 1.002 seconds,    5.287 MB/s
-AES-256-CBC-enc      6 MB took 1.001 seconds,    5.951 MB/s
-AES-256-CBC-dec      5 MB took 1.004 seconds,    5.277 MB/s
-AES-128-GCM-enc    375 KB took 1.067 seconds,  351.453 KB/s
-AES-128-GCM-dec    375 KB took 1.067 seconds,  351.453 KB/s
-AES-192-GCM-enc    350 KB took 1.010 seconds,  346.535 KB/s
-AES-192-GCM-dec    350 KB took 1.009 seconds,  346.878 KB/s
-AES-256-GCM-enc    350 KB took 1.016 seconds,  344.488 KB/s
-AES-256-GCM-dec    350 KB took 1.016 seconds,  344.488 KB/s
-SHA                 14 MB took 1.000 seconds,   14.062 MB/s
-SHA-256             15 MB took 1.000 seconds,   15.234 MB/s
-SHA-384             17 MB took 1.000 seconds,   17.383 MB/s
-SHA-512             18 MB took 1.001 seconds,   17.512 MB/s
-HMAC-SHA            14 MB took 1.000 seconds,   13.818 MB/s
-HMAC-SHA256         15 MB took 1.001 seconds,   14.951 MB/s
-HMAC-SHA384         17 MB took 1.001 seconds,   16.683 MB/s
-HMAC-SHA512         17 MB took 1.000 seconds,   16.943 MB/s
-RSA     2048 public         20 ops took 1.017 sec, avg 50.850 ms, 19.666 ops/sec
-RSA     2048 private         4 ops took 1.059 sec, avg 264.750 ms, 3.777 ops/sec
-ECC      256 key gen         4 ops took 1.092 sec, avg 273.000 ms, 3.663 ops/sec
-ECDHE    256 agree           4 ops took 1.089 sec, avg 272.250 ms, 3.673 ops/sec
-ECDSA    256 sign            4 ops took 1.101 sec, avg 275.250 ms, 3.633 ops/sec
-ECDSA    256 verify          2 ops took 1.092 sec, avg 546.000 ms, 1.832 ops/sec
-```
-
-Conditions:  
-- Model    : ESP32-WROOM-32  
-- CPU Speed: 240 MHz  
-- ESP-IDF  : v3.3-beta1-39-g6cb37ecc5 (commit hash: 6cb37ecc5)  
-- OS       : Ubuntu 18.04.1 LTS (Bionic Beaver)
-
-## Support
-
-Email us at [support@wolfssl.com](mailto:support@wolfssl.com).

+ 0 - 432
lib/wolfssl/wolfcrypt/src/port/Espressif/esp32_aes.c

@@ -1,432 +0,0 @@
-/* esp32_aes.c
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-/*
- * WOLFSSL_SUCCESS and WOLFSSL_FAILURE values should only
- * be used in the ssl layer, not in wolfCrypt
- */
-#include <string.h>
-#include <stdio.h>
-
-#ifdef HAVE_CONFIG_H
-    #include <config.h>
-#endif
-#include <wolfssl/wolfcrypt/settings.h>
-
-#ifndef NO_AES
-
-#if defined(WOLFSSL_ESP32WROOM32_CRYPT) && \
-    !defined(NO_WOLFSSL_ESP32WROOM32_CRYPT_AES)
-#include "sdkconfig.h" /* programmatically generated from sdkconfig */
-#include <wolfssl/wolfcrypt/aes.h>
-#include "wolfssl/wolfcrypt/port/Espressif/esp32-crypt.h"
-#include <wolfssl/wolfcrypt/error-crypt.h>
-
-/* breadcrumb tag text for ESP_LOG() */
-static const char* TAG = "wolf_hw_aes";
-
-/* mutex */
-static wolfSSL_Mutex aes_mutex;
-
-/* keep track of whether esp aes is initialized */
-static int espaes_CryptHwMutexInit = 0;
-
-/*
-* lock hw engine.
-* this should be called before using engine.
-*
-* returns 0 if the hw lock was initialized and mutex lock
-*/
-static int esp_aes_hw_InUse()
-{
-    int ret = 0;
-
-    ESP_LOGV(TAG, "enter esp_aes_hw_InUse");
-
-    if (espaes_CryptHwMutexInit == 0) {
-        ret = esp_CryptHwMutexInit(&aes_mutex);
-        if (ret == 0) {
-            /* flag esp aes as initialized */
-            espaes_CryptHwMutexInit = 1;
-        }
-        else {
-            ESP_LOGE(TAG, "aes mutex initialization failed.");
-        }
-    }
-    else {
-        /* esp aes has already been initialized */
-    }
-
-    if (ret == 0) {
-        /* lock hardware */
-        ret = esp_CryptHwMutexLock(&aes_mutex, portMAX_DELAY);
-    }
-    else {
-        ESP_LOGE(TAG, "aes engine lock failed.");
-    }
-
-
-    if (ret == 0) {
-        /* Enable AES hardware */
-        periph_module_enable(PERIPH_AES_MODULE);
-
-        #if CONFIG_IDF_TARGET_ESP32S3
-        /* Select working mode. Can be typical or DMA.
-         * 0 => typical
-         * 1 => DMA */
-        DPORT_REG_WRITE(AES_DMA_ENABLE_REG, 0);
-        #endif
-    }
-
-    ESP_LOGV(TAG, "leave esp_aes_hw_InUse");
-    return ret;
-} /* esp_aes_hw_InUse */
-
-/*
-*   release hw engine
-*/
-static void esp_aes_hw_Leave( void )
-{
-    ESP_LOGV(TAG, "enter esp_aes_hw_Leave");
-    /* Disable AES hardware */
-    periph_module_disable(PERIPH_AES_MODULE);
-
-    /* unlock */
-    esp_CryptHwMutexUnLock(&aes_mutex);
-
-    ESP_LOGV(TAG, "leave esp_aes_hw_Leave");
-} /* esp_aes_hw_Leave */
-
-/*
- * set key to hardware key registers.
- * return 0 on success; BAD_FUNC_ARG if mode isn't supported.
- */
-static int esp_aes_hw_Set_KeyMode(Aes *ctx, ESP32_AESPROCESS mode)
-{
-    int ret = 0;
-    word32 i;
-    word32 mode_ = 0;
-
-    ESP_LOGV(TAG, "  enter esp_aes_hw_Set_KeyMode");
-
-    /* check mode */
-    if (mode == ESP32_AES_UPDATEKEY_ENCRYPT) {
-        mode_ = 0;
-    }
-    else {
-        if (mode == ESP32_AES_UPDATEKEY_DECRYPT) {
-            mode_ = 4;
-        }
-        else {
-            ESP_LOGE(TAG, "  >> unexpected error.");
-            ret = BAD_FUNC_ARG;
-        }
-    } /* if mode */
-
-    if (ret == 0) {
-
-        /* update key */
-        for (i = 0; i < (ctx->keylen) / sizeof(word32); i++) {
-            DPORT_REG_WRITE(AES_KEY_BASE + (i * 4), *(((word32*)ctx->key) + i));
-        }
-
-        /*
-        ** ESP32: see table 22-1 in ESP32 Technical Reference
-        ** ESP32S3: see table 19-2 in ESP32S3 Technical Reference
-        ** mode     Algorithm             ESP32   ESP32S3
-        **   0       AES-128 Encryption     y        y
-        **   1       AES-192 Encryption     y        n
-        **   2       AES-256 Encryption     y        y
-        **   4       AES-128 Decryption     y        y
-        **   5       AES-192 Decryption     y        n
-        **   6       AES-256 Decryption     y        y
-        */
-        switch(ctx->keylen){
-            case 24: mode_ += 1; break;
-            case 32: mode_ += 2; break;
-            default: break;
-        }
-
-    #if CONFIG_IDF_TARGET_ESP32S3
-        if (mode_ == 1 || mode_ == 5 || mode_ == 7) {
-            ESP_LOGE(TAG, "esp_aes_hw_Set_KeyMode unsupported mode: %i", mode_);
-            ret = BAD_FUNC_ARG;
-        }
-    #endif
-
-        if (ret == 0) {
-            DPORT_REG_WRITE(AES_MODE_REG, mode_);
-        }
-        ESP_LOGV(TAG, "  leave esp_aes_hw_Setkey");
-    }
-
-    return ret;
-} /* esp_aes_hw_Set_KeyMode */
-
-/*
- * esp_aes_bk
- * Process one block of AES
- * in: block of 16 bytes (4 x word32) to process
- * out: result of processing input bytes.
- */
-static void esp_aes_bk(const byte* in, byte* out)
-{
-    const word32 *inwords = (const word32 *)in;
-
-#if ESP_IDF_VERSION_MAJOR >= 4
-    uint32_t *outwords    = (uint32_t *)out;
-#else
-    word32 *outwords      = (word32 *)out;
-#endif
-
-    ESP_LOGV(TAG, "enter esp_aes_bk");
-#if CONFIG_IDF_TARGET_ESP32S3
-    /* See esp32 - s3 technical reference manual:
-    ** 19.4.3 Operation process using CPU working mode.
-    ** The ESP32-S3 also supports a DMA mode.
-    **
-    ** Copy text for encrypting/decrypting blocks: */
-    DPORT_REG_WRITE(AES_TEXT_IN_BASE, inwords[0]);
-    DPORT_REG_WRITE(AES_TEXT_IN_BASE + 4, inwords[1]);
-    DPORT_REG_WRITE(AES_TEXT_IN_BASE + 8, inwords[2]);
-    DPORT_REG_WRITE(AES_TEXT_IN_BASE + 12, inwords[3]);
-
-    /* start engine */
-    DPORT_REG_WRITE(AES_TRIGGER_REG, 1);
-
-    /* wait until finishing the process */
-    while (DPORT_REG_READ(AES_STATE_REG) != 0) {
-        /* waiting for the hardware accelerator to complete operation. */
-    }
-
-    /* read-out blocks */
-    esp_dport_access_read_buffer(outwords, AES_TEXT_OUT_BASE, 4);
-#else
-    /* copy text for encrypting/decrypting blocks */
-    DPORT_REG_WRITE(AES_TEXT_BASE, inwords[0]);
-    DPORT_REG_WRITE(AES_TEXT_BASE + 4, inwords[1]);
-    DPORT_REG_WRITE(AES_TEXT_BASE + 8, inwords[2]);
-    DPORT_REG_WRITE(AES_TEXT_BASE + 12, inwords[3]);
-
-    /* start engine */
-    DPORT_REG_WRITE(AES_START_REG, 1);
-
-    /* wait until finishing the process */
-    while (DPORT_REG_READ(AES_IDLE_REG) != 1) {
-        /* waiting for the hardware accelerator to complete operation. */
-    }
-
-    /* read-out blocks */
-    esp_dport_access_read_buffer(outwords, AES_TEXT_BASE, 4);
-#endif
-
-    ESP_LOGV(TAG, "leave esp_aes_bk");
-} /* esp_aes_bk */
-
-/*
-* wc_esp32AesEncrypt
-* @brief: encrypt one input block into the output block
-* @param aes: pointer to the AES object used to encrypt data
-* @param in : pointer to the input buffer containing the plain text block
-* @param out: pointer to the output buffer in which to store the cipher text
-*             of the encrypted message
-* @return: 0 on success, BAD_FUNC_ARG if the AES algorithm isn't supported.
-*/
-int wc_esp32AesEncrypt(Aes *aes, const byte* in, byte* out)
-{
-    int ret = 0;
-
-    ESP_LOGV(TAG, "enter wc_esp32AesEncrypt");
-    /* lock the hw engine */
-    ret = esp_aes_hw_InUse();
-
-    if (ret == 0) {
-        ret = esp_aes_hw_Set_KeyMode(aes, ESP32_AES_UPDATEKEY_ENCRYPT);
-        if (ret != 0) {
-            ESP_LOGE(TAG, "wc_esp32AesEncrypt failed during esp_aes_hw_Set_KeyMode");
-        }
-    }
-
-    if (ret == 0) {
-        /* process one block of AES */
-        esp_aes_bk(in, out);
-    }
-
-    /* release hw */
-    esp_aes_hw_Leave();
-    return ret;
-} /* wc_esp32AesEncrypt */
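-
-/*
-** Single-block usage sketch (hypothetical buffers; assumes the Aes key
-** was previously loaded, e.g. via wc_AesSetKey):
-**
-**   byte plain[AES_BLOCK_SIZE] = { 0 };
-**   byte cipher[AES_BLOCK_SIZE];
-**   int  ret = wc_esp32AesEncrypt(&aes, plain, cipher);
-*/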
-
-/*
-* wc_esp32AesDecrypt
-* @brief: decrypt one input block into the output block
-* @param aes: pointer to the AES object used to decrypt data
-* @param in : pointer to the input buffer containing the cipher text block
-* @param out: pointer to the output buffer in which to store the plain text
-*             of the decrypted message
-* @return: 0 on success, BAD_FUNC_ARG if the AES algorithm isn't supported.
-*/
-int wc_esp32AesDecrypt(Aes *aes, const byte* in, byte* out)
-{
-    int ret;
-
-    ESP_LOGV(TAG, "enter wc_esp32AesDecrypt");
-    /* lock the hw engine */
-    ret = esp_aes_hw_InUse();
-
-    if (ret == 0) {
-        /* load the key into the hardware key registers */
-        ret = esp_aes_hw_Set_KeyMode(aes, ESP32_AES_UPDATEKEY_DECRYPT);
-        if (ret != 0) {
-            ESP_LOGE(TAG, "wc_esp32AesDecrypt failed during esp_aes_hw_Set_KeyMode");
-        }
-    }
-
-    if (ret == 0) {
-        /* process one block of AES */
-        esp_aes_bk(in, out);
-    }
-
-    /* release hw engine */
-    esp_aes_hw_Leave();
-
-    return ret;
-} /* wc_esp32AesDecrypt */
-
-/*
-* wc_esp32AesCbcEncrypt
-* @brief: Encrypts a plain text message from the input buffer, and places
-*         the resulting cipher text into the output buffer using cipher
-*         block chaining with AES
-* @param aes: pointer to the AES object used to encrypt data
-* @param out: pointer to the output buffer in which to store the cipher text
-*             of the encrypted message
-* @param in : pointer to the input buffer containing plain text to be encrypted
-* @param sz : size of the input message
-* @return: 0 on success, BAD_FUNC_ARG if the AES algorithm isn't supported.
-*/
-int wc_esp32AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
-{
-    int ret;
-    int i;
-    int offset = 0;
-    word32 blocks = (sz / AES_BLOCK_SIZE);
-    byte *iv;
-    byte temp_block[AES_BLOCK_SIZE];
-
-    ESP_LOGV(TAG, "enter wc_esp32AesCbcEncrypt");
-
-    iv = (byte*)aes->reg;
-
-    ret = esp_aes_hw_InUse();
-
-    if (ret == 0) {
-        ret = esp_aes_hw_Set_KeyMode(aes, ESP32_AES_UPDATEKEY_ENCRYPT);
-        if (ret != 0) {
-            ESP_LOGE(TAG, "wc_esp32AesCbcEncrypt failed HW Set KeyMode");
-        }
-    } /* if set esp_aes_hw_InUse successful */
-
-    if (ret == 0) {
-        while (blocks--) {
-            XMEMCPY(temp_block, in + offset, AES_BLOCK_SIZE);
-
-            /* XOR block with IV for CBC */
-            for (i = 0; i < AES_BLOCK_SIZE; i++) {
-                temp_block[i] ^= iv[i];
-            }
-
-            esp_aes_bk(temp_block, (out + offset));
-
-            offset += AES_BLOCK_SIZE;
-
-            /* store IV for next block */
-            XMEMCPY(iv, out + offset - AES_BLOCK_SIZE, AES_BLOCK_SIZE);
-        } /* while (blocks--) */
-    } /* if Set Mode successful (ret == 0) */
-
-    esp_aes_hw_Leave();
-    ESP_LOGV(TAG, "leave wc_esp32AesCbcEncrypt");
-    return ret;
-} /* wc_esp32AesCbcEncrypt */
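-
-/*
-** The CBC chaining implemented above, in equation form:
-**   C[0] = E_K(P[0] XOR IV),   C[i] = E_K(P[i] XOR C[i-1])
-** with the running IV carried in aes->reg between calls.
-*/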
-
-/*
-* wc_esp32AesCbcDecrypt
-* @brief: Decrypts a cipher text message from the input buffer, and places
-*         the resulting plain text into the output buffer using cipher
-*         block chaining with AES
-* @param aes: pointer to the AES object used to decrypt data
-* @param out: pointer to the output buffer in which to store the plain text
-*             of the decrypted message
-* @param in : pointer to the input buffer containing cipher text to be decrypted
-* @param sz : size of the input message
-* @return: 0 on success, BAD_FUNC_ARG if the AES algorithm isn't supported.
-*/
-int wc_esp32AesCbcDecrypt(Aes* aes, byte* out, const byte* in, word32 sz)
-{
-    int ret;
-
-    int i;
-    int offset = 0;
-    word32 blocks = (sz / AES_BLOCK_SIZE);
-    byte* iv;
-    byte temp_block[AES_BLOCK_SIZE];
-
-    ESP_LOGV(TAG, "enter wc_esp32AesCbcDecrypt");
-
-    iv = (byte*)aes->reg;
-
-    ret = esp_aes_hw_InUse();
-
-    if (ret == 0) {
-        ret = esp_aes_hw_Set_KeyMode(aes, ESP32_AES_UPDATEKEY_DECRYPT);
-        if (ret != 0) {
-            ESP_LOGE(TAG, "wc_esp32AesCbcDecrypt failed HW Set KeyMode");
-        }
-    }
-
-    if (ret == 0) {
-        while (blocks--) {
-            XMEMCPY(temp_block, in + offset, AES_BLOCK_SIZE);
-
-            esp_aes_bk((in + offset), (out + offset));
-
-            /* XOR block with IV for CBC */
-            for (i = 0; i < AES_BLOCK_SIZE; i++) {
-                (out + offset)[i] ^= iv[i];
-            }
-
-            /* store IV for next block */
-            XMEMCPY(iv, temp_block, AES_BLOCK_SIZE);
-
-            offset += AES_BLOCK_SIZE;
-        } /* while (blocks--) */
-    } /* if Set Mode was successful (ret == 0) */
-
-    esp_aes_hw_Leave();
-    ESP_LOGV(TAG, "leave wc_esp32AesCbcDecrypt");
-    return ret;
-} /* wc_esp32AesCbcDecrypt */
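-
-/*
-** And its inverse:  P[i] = D_K(C[i]) XOR C[i-1], with C[-1] = IV.
-** Note temp_block preserves the ciphertext before decryption so the
-** chaining value survives in-place operation (in == out).
-*/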
-
-#endif /* WOLFSSL_ESP32WROOM32_CRYPT */
-#endif /* NO_AES */

+ 0 - 914
lib/wolfssl/wolfcrypt/src/port/Espressif/esp32_mp.c

@@ -1,914 +0,0 @@
-/* esp32_mp.c
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-#include <string.h>
-#include <stdio.h>
-
-#ifdef HAVE_CONFIG_H
-    #include <config.h>
-#endif
-#include <wolfssl/wolfcrypt/settings.h>
-
-#include "wolfssl/wolfcrypt/logging.h"
-
-#if !defined(NO_RSA) || defined(HAVE_ECC)
-
-#if defined(WOLFSSL_ESP32WROOM32_CRYPT_RSA_PRI) && \
-   !defined(NO_WOLFSSL_ESP32WROOM32_CRYPT_RSA_PRI)
-
-#ifdef NO_INLINE
-    #include <wolfssl/wolfcrypt/misc.h>
-#else
-    #define WOLFSSL_MISC_INCLUDED
-    #include <wolfcrypt/src/misc.c>
-#endif
-#include <wolfssl/wolfcrypt/wolfmath.h>
-
-static const char* const TAG = "wolfssl_mp";
-
-#define ESP_HW_RSAMAX_BIT           4096
-#define ESP_HW_MULTI_RSAMAX_BITS    2048
-#define ESP_HW_RSAMIN_BIT           512
-#define BYTE_TO_WORDS(s)            (((s+3)>>2))           /* (s+(4-1))/ 4    */
-#define BITS_TO_WORDS(s)            (((s+31)>>3)>>2)       /* (s+(32-1))/ 8/ 4*/
-#define BITS_IN_ONE_WORD            32
-
-#define MP_NG   -1
-
-#define ESP_TIMEOUT(cnt)         (cnt >= ESP_RSA_TIMEOUT_CNT)
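-
-/* e.g. BYTE_TO_WORDS(7) = 2 and BITS_TO_WORDS(512) = 16; a 2048-bit
-** operand occupies BITS_TO_WORDS(2048) = 64 words of accelerator memory. */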
-
-/* mutex */
-static wolfSSL_Mutex mp_mutex;
-static int espmp_CryptHwMutexInit = 0;
-/*
-* check if the HW is ready before accessing it
-*
-* When the RSA Accelerator is released from reset, the register RSA_CLEAN_REG
-* reads 0 and an initialization process begins. Hardware initializes the four
-* memory blocks by setting them to 0. After initialization is complete,
-* RSA_CLEAN_REG reads 1. For this reason, software should query RSA_CLEAN_REG
-* after being released from reset, and before writing to any RSA Accelerator
-* memory blocks or registers for the first time.
-*/
-static int esp_mp_hw_wait_clean(void)
-{
-    int ret = MP_OKAY;
-    word32 timeout = 0;
-
-#if CONFIG_IDF_TARGET_ESP32S3
-
-    while (!ESP_TIMEOUT(++timeout) && DPORT_REG_READ(RSA_QUERY_CLEAN_REG) != 1)
-    {
-      /*  wait. expected delay 1 to 2 uS  */
-    }
-#else
-    /* RSA_CLEAN_REG is now called RSA_QUERY_CLEAN_REG. hwcrypto_reg.h maintains
-     * RSA_CLEAN_REG for backwards compatibility, so this block might not be
-     * needed. */
-    while(!ESP_TIMEOUT(++timeout) && DPORT_REG_READ(RSA_CLEAN_REG) != 1) {
-        /*  wait. expected delay 1 to 2 uS  */
-    }
-#endif
-
-    if (ESP_TIMEOUT(timeout)) {
-        ESP_LOGE(TAG, "esp_mp_hw_wait_clean waiting HW ready timed out.");
-        ret = MP_NG;
-    }
-    return ret;
-}
-
-/*
-* esp_mp_hw_lock()
-*
-* Lock HW engine.
-* This should be called before using engine.
-*
-* Returns 0 if the HW lock was initialized and the mutex was locked.
-*
-* See Chapter 24:
-*  https://www.espressif.com/sites/default/files/documentation/esp32_technical_reference_manual_en.pdf
-*
-* The RSA Accelerator is activated by enabling the corresponding peripheral
-* clock, and by clearing the DPORT_RSA_PD bit in the DPORT_RSA_PD_CTRL_REG
-* register. This releases the RSA Accelerator from reset.
-*
-* When the RSA Accelerator is released from reset, the register RSA_CLEAN_REG
-* reads 0 and an initialization process begins. Hardware initializes the four
-* memory blocks by setting them to 0. After initialization is complete,
-* RSA_CLEAN_REG reads 1. For this reason, software should query RSA_CLEAN_REG
-* after being released from reset, and before writing to any RSA Accelerator
-* memory blocks or registers for the first time.
-*/
-static int esp_mp_hw_lock()
-{
-    int ret = 0;
-
-    ESP_LOGV(TAG, "enter esp_mp_hw_lock");
-
-    if (espmp_CryptHwMutexInit == 0) {
-        ret = esp_CryptHwMutexInit(&mp_mutex);
-        if (ret == 0) {
-            /* flag esp mp as initialized */
-            espmp_CryptHwMutexInit = 1;
-        }
-        else {
-            ESP_LOGE(TAG, "mp mutex initialization failed.");
-        }
-    }
-    else {
-        /* ESP MP has already been initialized */
-    }
-
-    if (ret == 0) {
-        /* lock hardware */
-        ret = esp_CryptHwMutexLock(&mp_mutex, portMAX_DELAY);
-        if (ret != 0) {
-            ESP_LOGE(TAG, "mp engine lock failed.");
-            ret = MP_NG;
-        }
-    }
-
-#if CONFIG_IDF_TARGET_ESP32S3
-    /* Activate the RSA accelerator. See 20.3 of the ESP32-S3 technical manual.
-     * periph_module_enable doesn't seem to be documented and is in a private
-     * folder in the v5 release. Maybe it will be deprecated? */
-    if (ret == 0) {
-        periph_module_enable(PERIPH_RSA_MODULE);
-
-        /* clear bit to enable hardware operation; (set to disable) */
-        DPORT_REG_CLR_BIT(SYSTEM_RSA_PD_CTRL_REG, SYSTEM_RSA_MEM_PD);
-    }
-#else
-    /* Enable RSA hardware */
-    if (ret == 0) {
-        periph_module_enable(PERIPH_RSA_MODULE);
-
-        /* clear bit to enable hardware operation; (set to disable) */
-        DPORT_REG_CLR_BIT(DPORT_RSA_PD_CTRL_REG, DPORT_RSA_PD);
-    }
-#endif
-
-    /* reminder: wait until RSA_CLEAN_REG reads 1
-     *  see esp_mp_hw_wait_clean()
-     */
-
-    ESP_LOGV(TAG, "leave esp_mp_hw_lock");
-    return ret;
-}
-
-/*
-*   Release HW engine
-*/
-static void esp_mp_hw_unlock( void )
-{
-#if CONFIG_IDF_TARGET_ESP32S3
-    /* Deactivate the RSA accelerator. See 20.3 of the ESP32-S3 technical manual.
-     * periph_module_enable doesn't seem to be documented and is in a private
-     * folder in the v5 release. Maybe it will be deprecated? */
-    DPORT_REG_SET_BIT(SYSTEM_RSA_PD_CTRL_REG, SYSTEM_RSA_MEM_PD);
-    periph_module_disable(PERIPH_RSA_MODULE);
-
-#else
-    /* set bit to disable hardware operation; (clear to enable) */
-    DPORT_REG_SET_BIT(DPORT_RSA_PD_CTRL_REG, DPORT_RSA_PD);
-
-    /* Disable RSA hardware */
-    periph_module_disable(PERIPH_RSA_MODULE);
-#endif
-
-    /* unlock */
-    esp_CryptHwMutexUnLock(&mp_mutex);
-}
-
-/* this is based on an article by Cetin Kaya Koc,
- * A New Algorithm for Inversion: mod p^k, June 28 2017 */
-static int esp_calc_Mdash(MATH_INT_T *M, word32 k, mp_digit* md)
-{
-    int i;
-    int xi;
-    int b0 = 1;
-    int bi;
-    word32  N = 0;
-    word32  x;
-
-    N = M->dp[0];
-    bi = b0;
-    x  = 0;
-
-    for (i = 0; i < k; i++) {
-        xi = bi % 2;
-        if (xi < 0) {
-            xi *= -1;
-        }
-        bi = (bi - N * xi) / 2;
-        x |= (xi << i);
-    }
-    /* 2's complement */
-    *md = ~x + 1;
-    return MP_OKAY;
-}
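-
-/*
-** Sanity-check sketch for esp_calc_Mdash (hypothetical test code, assuming
-** a 32-bit mp_digit): for an odd modulus M the returned md is -M^-1 mod 2^32,
-** i.e. M * M' == -1 (mod 2^32):
-**
-**   mp_digit md = 0;
-**   esp_calc_Mdash(M, 32, &md);
-**   word32 check = (word32)(M->dp[0] * md);   expect check == 0xFFFFFFFF
-*/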
-
-/* start HW process */
-static void process_start(word32 reg)
-{
-     /* clear interrupt */
-    DPORT_REG_WRITE(RSA_INTERRUPT_REG, 1);
-    /* start process  */
-    DPORT_REG_WRITE(reg, 1);
-}
-
-/* wait until done */
-static int wait_until_done(word32 reg)
-{
-    word32 timeout = 0;
-    /* wait until done && not timeout */
-    while (!ESP_TIMEOUT(++timeout) &&
-                DPORT_REG_READ(reg) != 1) {
-        /* wait */
-    }
-
-    /* clear interrupt */
-    DPORT_REG_WRITE(RSA_INTERRUPT_REG, 1);
-
-    if (ESP_TIMEOUT(timeout)) {
-        ESP_LOGE(TAG, "RSA operation timed out.");
-        return MP_NG;
-    }
-
-    return MP_OKAY;
-}
-
-/* read data from a memory block into an mp int          */
-static void esp_memblock_to_mpint(word32 mem_address,
-                                  MATH_INT_T* mp,
-                                  word32 numwords)
-{
-    esp_dport_access_read_buffer((uint32_t*)mp->dp, mem_address, numwords);
-    mp->used = numwords;
-}
-
-/* write an mp int into a memory block
- */
-static void esp_mpint_to_memblock(word32 mem_address, const MATH_INT_T* mp,
-                                                      const word32 bits,
-                                                      const word32 hwords)
-{
-    /* init */
-    word32 i;
-    word32 len = (bits / 8 + ((bits & 7) != 0 ? 1 : 0));
-
-    len = (len + sizeof(word32)-1) / sizeof(word32);
-
-    for (i=0; i < hwords; i++) {
-        if (i < len) {
-            DPORT_REG_WRITE(mem_address + (i * sizeof(word32)), mp->dp[i]);
-        }
-        else {
-            DPORT_REG_WRITE(mem_address + (i * sizeof(word32)), 0);
-        }
-    }
-}
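-
-/* For example: writing a 700-bit operand with hwords = 32 stores
-** len = 22 data words (700 bits -> 88 bytes -> 22 words) and zero-fills
-** the remaining 10, so stale accelerator memory never leaks into a result. */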
-
-/* return the needed number of HW words.
- * supported word lengths:
- *  words : {16,   32,   48,   64,   80,   96,  112,  128}
- *  bits  : {512, 1024, 1536, 2048, 2560, 3072, 3584, 4096}
- */
-static word32 words2hwords(word32 wd)
-{
-    const word32 bit_shift  = 4;
-
-    return (((wd + 0xf) >> bit_shift) << bit_shift);
-}
-
-/* count the number of words needed for bits */
-static word32 bits2words(word32 bits)
-{
-    /* 32 bits */
-    const word32 d = sizeof(word32) * WOLFSSL_BIT_SIZE;
-
-    return ((bits + (d - 1)) / d);
-}
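-
-/* e.g. bits2words(1024) = 32 and words2hwords(33) = 48: operand sizes are
-** always rounded up to the next 16-word (512-bit) hardware block size. */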
-
-/* get rinv */
-static int esp_get_rinv(MATH_INT_T *rinv, MATH_INT_T *M, word32 exp)
-{
-    int ret = 0;
-
-    /* 2^(exp)*/
-    if ((ret = mp_2expt(rinv, exp)) != MP_OKAY) {
-        ESP_LOGE(TAG, "failed to calculate mp_2expt()");
-        return ret;
-    }
-
-    /* r_inv = R^2 mod M(=P) */
-    if (ret == 0 && (ret = mp_mod(rinv, M, rinv)) != MP_OKAY) {
-        ESP_LOGE(TAG, "failed to calculate mp_mod()");
-        return ret;
-    }
-
-    return ret;
-}
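-
-/*
-** For example: a 512-bit modulus M uses n = 16 hardware words, so
-** exp = 16 * 32 * 2 = 1024 and r_inv = 2^1024 mod M, i.e. R^2 mod M
-** for the Montgomery radix R = b^n = 2^512.
-*/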
-
-/* Z = X * Y;  */
-int esp_mp_mul(MATH_INT_T* X, MATH_INT_T* Y, MATH_INT_T* Z)
-{
-    int ret;
-
-#ifdef WOLFSSL_SP_INT_NEGATIVE
-    /* neg check: X*Y becomes negative */
-    int neg;
-
-    /* aka (X->sign == Y->sign) ? MP_ZPOS : MP_NEG; , but with mp_isneg(): */
-    neg = (mp_isneg(X) == mp_isneg(Y)) ? MP_ZPOS : MP_NEG;
-    if (neg) {
-        /* Negative numbers are relatively infrequent.
-         * May be interesting during verbose debugging: */
-        ESP_LOGV(TAG, "mp_isneg(X) = %d; mp_isneg(Y) = %d; neg = %d ",
-                       mp_isneg(X),      mp_isneg(Y),           neg);
-    }
-#endif
-    ret = MP_OKAY; /* assume success until proven wrong */
-
-#if CONFIG_IDF_TARGET_ESP32S3
-
-    int BitsInX = mp_count_bits(X);
-    int BitsInY = mp_count_bits(Y);
-
-    /* X & Y must be represented by the same number of bits. Must be
-     * enough to represent the larger one. */
-    int MinXYBits = max(BitsInX, BitsInY);
-
-    /* Figure out how many words we need to represent each operand & the result. */
-    int WordsForOperand = bits2words(MinXYBits);
-    int WordsForResult = bits2words(BitsInX + BitsInY);
-
-    /* Make sure we are within capabilities of hardware. */
-    if ( (WordsForOperand * BITS_IN_ONE_WORD) > ESP_HW_MULTI_RSAMAX_BITS ) {
-        ESP_LOGW(TAG, "exceeds max bit length(2048)");
-        return MP_VAL; /*  Error: value is not able to be used. */
-    }
-
-    /* Steps to perform large number multiplication. Calculates Z = X x Y. The number of
-     * bits in the operands (X, Y) is N. N can be 32x, where x = {1,2,3,...64}, so the
-     * maximum number of bits in the X and Y is 2048.
-     * See 20.3.3 of ESP32-S3 technical manual
-     *  1. Lock the hardware so no-one else uses it and wait until it is ready.
-     *  2. Enable/disable interrupt that signals completion -- we don't use the interrupt.
-     *  3. Write number of words required for result to the RSA_MODE_REG (now called RSA_LENGTH_REG).
-     *     Number of words required for the result is 2 * words for operand - 1
-     *  4. Load X, Y operands to memory blocks. Note the Y value must be
-     *     written right-aligned.
-     *  5. Start the operation by writing 1 to RSA_MULT_START_REG, then wait for it
-     *     to complete by monitoring RSA_IDLE_REG (which is now called RSA_QUERY_INTERRUPT_REG).
-     *  6. Read the result out.
-     *  7. Release the hardware lock so others can use it.
-     *  x. Clear the interrupt flag, if you used it (we don't). */
-
-    /* 1. lock HW for use & wait until it is ready. */
-    if ( ((ret = esp_mp_hw_lock()) != MP_OKAY) ||
-         ((ret = esp_mp_hw_wait_clean()) != MP_OKAY) ) {
-        return ret;
-    }
-
-    /* 2. Disable completion interrupt signal; we don't use it.
-    **    0 => no interrupt; 1 => interrupt on completion. */
-    DPORT_REG_WRITE(RSA_INTERRUPT_REG, 0);
-
-    /* 3. Write number of words required for result. */
-    if ( (WordsForOperand * BITS_IN_ONE_WORD * 2) > ESP_HW_RSAMAX_BIT) {
-        ESP_LOGW(TAG, "result exceeds max bit length");
-        return MP_VAL; /*  Error: value is not able to be used. */
-    }
-    DPORT_REG_WRITE(RSA_LENGTH_REG, (WordsForOperand * 2 - 1) );
-
-    /* 4. Load X, Y operands. Maximum is 64 words (64*8*4 = 2048 bits) */
-    esp_mpint_to_memblock(RSA_MEM_X_BLOCK_BASE,
-                          X, BitsInX, WordsForOperand);
-    esp_mpint_to_memblock(RSA_MEM_Z_BLOCK_BASE + WordsForOperand * 4,
-                          Y, BitsInY, WordsForOperand);
-
-
-    /* 5. Start operation and wait until it completes. */
-    process_start(RSA_MULT_START_REG);
-    ret = wait_until_done(RSA_QUERY_INTERRUPT_REG);
-    if (MP_OKAY != ret) {
-        return ret;
-    }
-
-    /* 6. read the result from MEM_Z              */
-    esp_memblock_to_mpint(RSA_MEM_Z_BLOCK_BASE, Z, WordsForResult);
-
-    /* 7. clear and release HW                    */
-    esp_mp_hw_unlock();
-
-    /* end if CONFIG_IDF_TARGET_ESP32S3 */
-
-#else /* not CONFIG_IDF_TARGET_ESP32S3 */
-    /* assumed to be regular Xtensa here */
-    word32 Xs;
-    word32 Ys;
-    word32 Zs;
-    word32 maxWords_sz;
-    word32 hwWords_sz;
-
-    /* get the operand sizes in bits */
-    Xs = mp_count_bits(X);
-    Ys = mp_count_bits(Y);
-    Zs = Xs + Ys;
-
-    /* maximum bits and words for writing to HW */
-    maxWords_sz = bits2words(max(Xs, Ys));
-    hwWords_sz  = words2hwords(maxWords_sz);
-
-    /* sanity check */
-    if ((hwWords_sz << 5) > ESP_HW_MULTI_RSAMAX_BITS) {
-        ESP_LOGW(TAG, "exceeds max bit length(2048)");
-        return MP_VAL; /*  Error: value is not able to be used. */
-    }
-
-    /*Steps to use HW in the following order:
-    * 1. wait until clean HW engine
-    * 2. Write(2*N/512bits - 1 + 8) to MULT_MODE_REG
-    * 3. Write X and Y to memory blocks
-    *    need to write data to each memory block only according to the length
-    *    of the number.
-    * 4. Write 1  to MUL_START_REG
-    * 5. Wait for the first operation to be done. Poll INTERRUPT_REG until it reads 1.
-    *    (Or until the INTER interrupt is generated.)
-    * 6. Write 1 to RSA_INTERRUPT_REG to clear the interrupt.
-    * 7. Read the Z from RSA_Z_MEM
-    * 8. Write 1 to RSA_INTERRUPT_REG to clear the interrupt.
-    * 9. Release the HW engine
-    */
-    /* lock HW for use */
-    if ((ret = esp_mp_hw_lock()) != MP_OKAY) {
-        return ret;
-    }
-
-    if ((ret = esp_mp_hw_wait_clean()) != MP_OKAY) {
-        return ret;
-    }
-
-    /* step.1  (2*N/512) => N/256. 512 bits => 16 words */
-    DPORT_REG_WRITE(RSA_MULT_MODE_REG, (hwWords_sz >> 3) - 1 + 8);
-    /* step.2 write X and Y into memory */
-    esp_mpint_to_memblock(RSA_MEM_X_BLOCK_BASE,
-                          X,
-                          Xs,
-                          hwWords_sz);
-    /* Y (left-extend)                        */
-    esp_mpint_to_memblock(RSA_MEM_Z_BLOCK_BASE + (hwWords_sz<<2),
-                          Y,
-                          Ys,
-                          hwWords_sz);
-    /* step.3 start process                           */
-    process_start(RSA_MULT_START_REG);
-
-    /* step.4,5 wait until done                       */
-    ret = wait_until_done(RSA_INTERRUPT_REG);
-    if (ret != MP_OKAY) {
-        ESP_LOGE(TAG, "wait_until_done failed.");
-        return ret;
-    }
-    /* step.6 read the result from MEM_Z             */
-    esp_memblock_to_mpint(RSA_MEM_Z_BLOCK_BASE, Z, BITS_TO_WORDS(Zs));
-
-    /* step.7 clear and release HW                    */
-    esp_mp_hw_unlock();
-
-#endif /* CONFIG_IDF_TARGET_ESP32S3 or not */
-
-    /* common exit for all chipset types */
-#ifdef WOLFSSL_SP_INT_NEGATIVE
-    if (!mp_iszero(Z) && neg) {
-        /* for non-zero negative numbers, set negative flag for our result:
-         *   Z->sign = FP_NEG */
-        mp_setneg(Z);
-    }
-#endif
-
-    return ret;
-}
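-
-/*
-** Minimal usage sketch (hypothetical caller; wolfSSL normally reaches this
-** through its MATH_INT_T multiplication paths):
-**
-**   MATH_INT_T X, Y, Z;
-**   mp_init_multi(&X, &Y, &Z, NULL, NULL, NULL);
-**   mp_set(&X, 12345);
-**   mp_set(&Y, 67890);
-**   if (esp_mp_mul(&X, &Y, &Z) == MP_OKAY) {
-**       Z now holds 838102050
-**   }
-*/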
-
-/* Z = X * Y (mod M)                                  */
-int esp_mp_mulmod(MATH_INT_T* X, MATH_INT_T* Y, MATH_INT_T* M, MATH_INT_T* Z)
-{
-    int ret = 0;
-    int negcheck;
-    word32 Xs;
-    word32 Ys;
-    word32 Ms;
-    word32 maxWords_sz;
-    word32 hwWords_sz;
-    word32 zwords;
-
-    MATH_INT_T r_inv;
-    MATH_INT_T tmpZ;
-    mp_digit mp;
-
-    uint32_t Exponent;
-#if CONFIG_IDF_TARGET_ESP32S3
-    uint32_t OperandBits;
-    int WordsForOperand;
-#endif
-
-    /* neg check - X*Y becomes negative */
-    negcheck = mp_isneg(X) != mp_isneg(Y) ? 1 : 0;
-
-    /* get the operand sizes in bits */
-    Xs = mp_count_bits(X);
-    Ys = mp_count_bits(Y);
-    Ms = mp_count_bits(M);
-
-    /* maximum bits and words for writing to HW */
-    maxWords_sz = bits2words(max(Xs, max(Ys, Ms)));
-    zwords      = bits2words(min(Ms, Xs + Ys));
-    hwWords_sz  = words2hwords(maxWords_sz);
-
-    if ((hwWords_sz << 5) > ESP_HW_RSAMAX_BIT) {
-        ESP_LOGE(TAG, "exceeds HW maximum bits");
-        return MP_VAL; /*  Error: value is not able to be used. */
-    }
-    /* calculate r_inv = R^2 mod M
-    *    where: R = b^n, and b = 2^32
-    *    accordingly R^2 = 2^(n*32*2)
-    */
-#if CONFIG_IDF_TARGET_ESP32S3
-    Exponent = maxWords_sz * BITS_IN_ONE_WORD * 2;
-#else
-    Exponent = hwWords_sz << 6;
-#endif
-    ret = mp_init_multi(&tmpZ, &r_inv, NULL, NULL, NULL, NULL);
-    if (ret == 0 && (ret = esp_get_rinv(&r_inv, M, Exponent)) != MP_OKAY) {
-        ESP_LOGE(TAG, "calculate r_inv failed.");
-        mp_clear(&tmpZ);
-        mp_clear(&r_inv);
-        return ret;
-    }
-
-    /* lock HW for use */
-    if ((ret = esp_mp_hw_lock()) != MP_OKAY) {
-        mp_clear(&tmpZ);
-        mp_clear(&r_inv);
-        return ret;
-    }
-    /* Calculate M' */
-    if ((ret = esp_calc_Mdash(M, 32/* bits */, &mp)) != MP_OKAY) {
-        ESP_LOGE(TAG, "failed to calculate M dash");
-        mp_clear(&tmpZ);
-        mp_clear(&r_inv);
-        return ret;
-    }
-
-#if CONFIG_IDF_TARGET_ESP32S3
-    /* Steps to perform large number modular multiplication. Calculates Z = (X x Y) modulo M.
-     * The number of bits in the operands (X, Y) is N. N can be 32x, where x = {1,2,3,...64}, so the
-     * maximum number of bits in the X and Y is 2048. We must use the same number of words to represent
-     * the bits in X, Y and M.
-     * See 20.3.3 of ESP32-S3 technical manual
-     *  1. Wait until the hardware is ready.
-     *  2. Enable/disable interrupt that signals completion -- we don't use the interrupt.
-     *  3. Write the number of words required to represent the operands to the
-     *     RSA_MODE_REG (now called RSA_LENGTH_REG).
-     *  4. Write M' value into RSA_M_PRIME_REG (now called RSA_M_DASH_REG).
-     *  5. Load X, Y, M, r' operands to memory blocks.
-     *  6. Start the operation by writing 1 to RSA_MOD_MULT_START_REG, then wait for it
-     *     to complete by monitoring RSA_IDLE_REG (which is now called RSA_QUERY_INTERRUPT_REG).
-     *  7. Read the result out.
-     *  8. Release the hardware lock so others can use it.
-     *  x. Clear the interrupt flag, if you used it (we don't). */
-
-    /* 1. Wait until hardware is ready. */
-    if ((ret = esp_mp_hw_wait_clean()) != MP_OKAY) {
-        return ret;
-    }
-
-    /* 2. Disable completion interrupt signal; we don't use it.
-    **    0 => no interrupt; 1 => interrupt on completion. */
-    DPORT_REG_WRITE(RSA_INTERRUPT_REG, 0);
-
-    /* 3. Write (N_result_bits/32 - 1) to the RSA_MODE_REG. */
-    OperandBits = max(max(Xs, Ys), Ms);
-    if (OperandBits > ESP_HW_MULTI_RSAMAX_BITS) {
-        ESP_LOGW(TAG, "result exceeds max bit length");
-        return MP_VAL; /*  Error: value is not able to be used. */
-    }
-    WordsForOperand = bits2words(OperandBits);
-    DPORT_REG_WRITE(RSA_LENGTH_REG, WordsForOperand - 1);
-
-    /* 4. Write M' value into RSA_M_PRIME_REG (now called RSA_M_DASH_REG) */
-    DPORT_REG_WRITE(RSA_M_DASH_REG, mp);
-
-    /* Select acceleration options. */
-    DPORT_REG_WRITE(RSA_CONSTANT_TIME_REG, 0);
-
-    /* 5. Load X, Y, M, r' operands.
-     * Note RSA_MEM_RB_BLOCK_BASE == RSA_MEM_Z_BLOCK_BASE on the ESP32-S3 */
-    esp_mpint_to_memblock(RSA_MEM_X_BLOCK_BASE, X, Xs, hwWords_sz);
-    esp_mpint_to_memblock(RSA_MEM_Y_BLOCK_BASE, Y, Ys, hwWords_sz);
-    esp_mpint_to_memblock(RSA_MEM_M_BLOCK_BASE, M, Ms, hwWords_sz);
-    esp_mpint_to_memblock(RSA_MEM_RB_BLOCK_BASE, &r_inv,
-                          mp_count_bits(&r_inv), hwWords_sz);
-
-    /* 6. Start operation and wait until it completes. */
-    process_start(RSA_MOD_MULT_START_REG);
-    ret = wait_until_done(RSA_QUERY_INTERRUPT_REG);
-    if (MP_OKAY != ret) {
-        return ret;
-    }
-
-    /* 7. read the result from MEM_Z              */
-    esp_memblock_to_mpint(RSA_MEM_Z_BLOCK_BASE, &tmpZ, zwords);
-
-    /* 8. clear and release HW                    */
-    esp_mp_hw_unlock();
-
-    if (negcheck) {
-        mp_sub(M, &tmpZ, &tmpZ);
-    }
-
-    mp_copy(&tmpZ, Z);
-    mp_clear(&tmpZ);
-    mp_clear(&r_inv);
-
-    return ret;
-    /* end if CONFIG_IDF_TARGET_ESP32S3 */
-#else
-    /* non-S3 Xtensa */
-
-    /*Steps to use HW in the following order:
-    * 1. wait until clean HW engine
-    * 2. Write(N/512bits - 1) to MULT_MODE_REG
-    * 3. Write X,M(=G, X, P) to memory blocks
-    *    need to write data to each memory block only according to the length
-    *    of the number.
-    * 4. Write M' to M_PRIME_REG
-    * 5. Write 1  to MODEXP_START_REG
-    * 6. Wait for the first operation to be done. Poll INTERRUPT_REG until it reads 1.
-    *    (Or until the INTER interrupt is generated.)
-    * 7. Write 1 to RSA_INTERRUPT_REG to clear the interrupt.
-    * 8. Write Y to RSA_X_MEM
-    * 9. Write 1 to RSA_MULT_START_REG
-    * 10. Wait for the second operation to be completed. Poll INTERRUPT_REG until it reads 1.
-    * 11. Read the Z from RSA_Z_MEM
-    * 12. Write 1 to RSA_INTERRUPT_REG to clear the interrupt.
-    * 13. Release the HW engine
-    */
-
-    if ( (ret = esp_mp_hw_wait_clean()) != MP_OKAY ) {
-        return ret;
-    }
-    /* step.1                     512 bits => 16 words */
-    DPORT_REG_WRITE(RSA_MULT_MODE_REG, (hwWords_sz >> 4) - 1);
-
-    /* step.2 write X, M and r_inv into memory */
-    esp_mpint_to_memblock(RSA_MEM_X_BLOCK_BASE, X, Xs, hwWords_sz);
-    esp_mpint_to_memblock(RSA_MEM_M_BLOCK_BASE, M, Ms, hwWords_sz);
-    esp_mpint_to_memblock(RSA_MEM_Z_BLOCK_BASE,
-                          &r_inv,
-                          mp_count_bits(&r_inv),
-                          hwWords_sz);
-
-    /* step.3 write M' into memory                   */
-    DPORT_REG_WRITE(RSA_M_DASH_REG, mp);
-
-    /* step.4 start process                           */
-    process_start(RSA_MULT_START_REG);
-
-    /* step.5,6 wait until done                       */
-    ret = wait_until_done(RSA_INTERRUPT_REG);
-    /* step.7 Y to MEM_X                              */
-    esp_mpint_to_memblock(RSA_MEM_X_BLOCK_BASE, Y, Ys, hwWords_sz);
-
-    /* step.8 start process                           */
-    process_start(RSA_MULT_START_REG);
-
-    /* step.9,11 wait until done                      */
-    if (ret == MP_OKAY) {
-        ret = wait_until_done(RSA_INTERRUPT_REG);
-    }
-
-    /* step.12 read the result from MEM_Z             */
-    esp_memblock_to_mpint(RSA_MEM_Z_BLOCK_BASE, &tmpZ, zwords);
-
-    /* step.13 clear and release HW                   */
-    esp_mp_hw_unlock();
-
-    /* additional step: needed for a known issue where
-     * the result Z can be greater than M */
-    if (mp_cmp(&tmpZ, M) == MP_GT) {
-        /*  Z -= M  */
-        mp_sub(&tmpZ, M, &tmpZ);
-    }
-    if (negcheck) {
-        mp_sub(M, &tmpZ, &tmpZ);
-    }
-
-    mp_copy(&tmpZ, Z);
-
-    mp_clear(&tmpZ);
-    mp_clear(&r_inv);
-
-    return ret;
-#endif
-}
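-
-/*
-** Sign-handling note for the code above: the accelerator works on
-** magnitudes only, so when exactly one operand is negative (negcheck)
-** the result is folded back into [0, M) via tmpZ = M - tmpZ, since
-** (-a) mod M == M - (a mod M) for 0 < a mod M < M.
-*/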
-
-/* Large Number Modular Exponentiation
- *
- *    Z = X^Y mod M
- *
- * See:
- *  ESP32, Chapter 24, https://www.espressif.com/sites/default/files/documentation/esp32_technical_reference_manual_en.pdf
- *  ESP32s3, section 20.3.1, https://www.espressif.com/sites/default/files/documentation/esp32-s3_technical_reference_manual_en.pdf
- * The operation is based on Montgomery multiplication. Aside from the
- * arguments X, Y, and M, two additional ones are needed: r and M'.
- * These arguments are calculated in advance by software.
- *
- * The RSA Accelerator supports operand lengths of N ∈ {512, 1024, 1536,
- * 2048, 2560, 3072, 3584, 4096} bits on the ESP32 and N ∈ [32, 4096] bits
- * on the ESP32-S3. The bit length of arguments Z, X, Y, M, and r can be
- * any one from the N set, but all numbers in a calculation must be of the
- * same length. The bit length of M' is always 32.
- *
- * Note some DH references may use: Y = (G ^ X) mod P
- */
-int esp_mp_exptmod(MATH_INT_T* X, MATH_INT_T* Y, word32 Ys, MATH_INT_T* M, MATH_INT_T* Z)
-{
-    int ret = 0;
-
-    word32 Xs;
-    word32 Ms;
-    word32 maxWords_sz;
-    word32 hwWords_sz;
-
-    MATH_INT_T r_inv;
-    mp_digit mp;
-
-#if CONFIG_IDF_TARGET_ESP32S3
-    uint32_t OperandBits;
-    uint32_t WordsForOperand;
-#endif
-
-    /* get the operand sizes in bits */
-    Xs = mp_count_bits(X);
-    Ms = mp_count_bits(M);
-    /* maximum bits and words for writing to HW */
-    maxWords_sz = bits2words(max(Xs, max(Ys, Ms)));
-    hwWords_sz  = words2hwords(maxWords_sz);
-
-    if ((hwWords_sz << 5) > ESP_HW_RSAMAX_BIT) {
-        ESP_LOGE(TAG, "exceeds HW maximum bits");
-        return MP_VAL; /*  Error: value is not able to be used. */
-    }
-    /* calculate r_inv = R^2 mod M
-    *    where: R = b^n, and b = 2^32
-    *    accordingly R^2 = 2^(n*32*2)
-    */
-    ret = mp_init(&r_inv);
-    if ( (ret == 0) &&
-         ((ret = esp_get_rinv(&r_inv, M, (hwWords_sz << 6))) != MP_OKAY) ) {
-        ESP_LOGE(TAG, "calculate r_inv failed.");
-        mp_clear(&r_inv);
-        return ret;
-    }
-    /* lock and init the HW                           */
-    if ( (ret = esp_mp_hw_lock()) != MP_OKAY ) {
-        mp_clear(&r_inv);
-        return ret;
-    }
-    /* calc M' */
-    /* if Pm is odd, uses mp_montgomery_setup() */
-    if ( (ret = esp_calc_Mdash(M, 32/* bits */, &mp)) != MP_OKAY ) {
-        ESP_LOGE(TAG, "failed to calculate M dash");
-        mp_clear(&r_inv);
-        return ret;
-    }
-
-#if CONFIG_IDF_TARGET_ESP32S3
-    /* Steps to perform large number modular exponentiation. Calculates Z = (X ^ Y) modulo M.
-     * The number of bits in the operands (X, Y) is N. N can be 32x, where x = {1,2,3,...64}, so the
-     * maximum number of bits in the X and Y is 2048.
-     * See 20.3.3 of ESP32-S3 technical manual
-     *  1. Wait until the hardware is ready.
-     *  2. Enable/disable interrupt that signals completion -- we don't use the interrupt.
-     *  3. Write (N_bits/32 - 1) to the RSA_MODE_REG (now called RSA_LENGTH_REG).
-     *     Here N_bits is the maximum number of bits in X, Y and M.
-     *  4. Write M' value into RSA_M_PRIME_REG (now called RSA_M_DASH_REG).
-     *  5. Load X, Y, M, r' operands to memory blocks.
-     *  6. Start the operation by writing 1 to RSA_MODEXP_START_REG, then wait for it
-     *     to complete by monitoring RSA_IDLE_REG (which is now called RSA_QUERY_INTERRUPT_REG).
-     *  7. Read the result out.
-     *  8. Release the hardware lock so others can use it.
-     *  x. Clear the interrupt flag, if you used it (we don't). */
-
-    /* 1. Wait until hardware is ready. */
-    if ((ret = esp_mp_hw_wait_clean()) != MP_OKAY) {
-        return ret;
-    }
-
-    /* 2. Disable completion interrupt signal; we don't use it.
-    **    0 => no interrupt; 1 => interrupt on completion. */
-    DPORT_REG_WRITE(RSA_INTERRUPT_REG, 0);
-
-    /* 3. Write (N_result_bits/32 - 1) to the RSA_MODE_REG. */
-    OperandBits = max(max(Xs, Ys), Ms);
-    if (OperandBits > ESP_HW_MULTI_RSAMAX_BITS) {
-        ESP_LOGW(TAG, "result exceeds max bit length");
-        return MP_VAL; /*  Error: value is not able to be used. */
-    }
-    WordsForOperand = bits2words(OperandBits);
-    DPORT_REG_WRITE(RSA_LENGTH_REG, WordsForOperand - 1);
-
-    /* 4. Write M' value into RSA_M_PRIME_REG (now called RSA_M_DASH_REG) */
-    DPORT_REG_WRITE(RSA_M_DASH_REG, mp);
-
-    /* 5. Load X, Y, M, r' operands. */
-    esp_mpint_to_memblock(RSA_MEM_X_BLOCK_BASE, X, Xs, hwWords_sz);
-    esp_mpint_to_memblock(RSA_MEM_Y_BLOCK_BASE, Y, Ys, hwWords_sz);
-    esp_mpint_to_memblock(RSA_MEM_M_BLOCK_BASE, M, Ms, hwWords_sz);
-    esp_mpint_to_memblock(RSA_MEM_Z_BLOCK_BASE, &r_inv,
-                          mp_count_bits(&r_inv), hwWords_sz);
-
-    /* 6. Start operation and wait until it completes. */
-    process_start(RSA_MODEXP_START_REG);
-    ret = wait_until_done(RSA_QUERY_INTERRUPT_REG);
-    if (MP_OKAY != ret) {
-        return ret;
-    }
-
-    /* 7. read the result from MEM_Z              */
-    esp_memblock_to_mpint(RSA_MEM_Z_BLOCK_BASE, Z, BITS_TO_WORDS(Ms));
-
-    /* 8. clear and release HW                    */
-    esp_mp_hw_unlock();
-
-    mp_clear(&r_inv);
-
-    return ret;
-    /* end if CONFIG_IDF_TARGET_ESP32S3 */
-#else
-    /* non-ESP32S3 Xtensa (regular ESP32) */
-
-    /* Steps to use HW in the following order:
-    * 1. Write(N/512bits - 1) to MODEXP_MODE_REG
-    * 2. Write X, Y, M and r_inv to memory blocks
-    *    need to write data to each memory block only according to the length
-    *    of the number.
-    * 3. Write M' to M_PRIME_REG
-    * 4. Write 1  to MODEXP_START_REG
-    * 5. Wait for the operation to be done. Poll INTERRUPT_REG until it reads 1.
-    *    (Or until the INTER interrupt is generated.)
-    * 6. Read the result Z(=Y) from Z_MEM
-    * 7. Write 1 to INTERRUPT_REG to clear the interrupt.
-    */
-    if ((ret = esp_mp_hw_wait_clean()) != MP_OKAY) {
-        return ret;
-    }
-
-    /* step.1                                         */
-    DPORT_REG_WRITE(RSA_MODEXP_MODE_REG, (hwWords_sz >> 4) - 1);
-    /* step.2 write X (G), Y (X), M (P) and r_inv into memory */
-    esp_mpint_to_memblock(RSA_MEM_X_BLOCK_BASE, X, Xs, hwWords_sz);
-    esp_mpint_to_memblock(RSA_MEM_Y_BLOCK_BASE, Y, Ys, hwWords_sz);
-    esp_mpint_to_memblock(RSA_MEM_M_BLOCK_BASE, M, Ms, hwWords_sz);
-    esp_mpint_to_memblock(RSA_MEM_Z_BLOCK_BASE,
-                          &r_inv,
-                          mp_count_bits(&r_inv),
-                          hwWords_sz);
-    /* step.3 write M' into memory                    */
-    DPORT_REG_WRITE(RSA_M_DASH_REG, mp);
-    /* step.4 start process                           */
-    process_start(RSA_START_MODEXP_REG);
-
-    /* step.5 wait until done                         */
-    ret = wait_until_done(RSA_INTERRUPT_REG);
-    /* step.6 read the result from memory             */
-    esp_memblock_to_mpint(RSA_MEM_Z_BLOCK_BASE, Z, BITS_TO_WORDS(Ms));
-    /* step.7 clear and release HW                    */
-    esp_mp_hw_unlock();
-
-    mp_clear(&r_inv);
-
-    return ret;
-#endif
-}
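-
-/*
-** Worked example of the operation above (tiny numbers for illustration
-** only; the ESP32 hardware minimum is 512-bit operands):
-**   X = 4, Y = 13, M = 497  =>  Z = 4^13 mod 497 = 445
-*/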
-
-#endif /* WOLFSSL_ESP32WROOM32_CRYPT_RSA_PRI) &&
-        * !NO_WOLFSSL_ESP32WROOM32_CRYPT_RSA_PRI */
-
-#endif /* !NO_RSA || HAVE_ECC */

+ 0 - 1368
lib/wolfssl/wolfcrypt/src/port/Espressif/esp32_sha.c

@@ -1,1368 +0,0 @@
-/* esp32_sha.c
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-#ifdef HAVE_CONFIG_H
-    #include <config.h>
-#endif
-
-#include <wolfssl/wolfcrypt/settings.h>
-/*****************************************************************************/
-/* this entire file content is excluded when NO_SHA and NO_SHA256 are both
- * defined and neither WC_SHA384 nor WC_SHA512 is in use
- */
-#if !defined(NO_SHA) || !defined(NO_SHA256) || defined(WC_SHA384) || \
-     defined(WC_SHA512)
-
-#include "wolfssl/wolfcrypt/logging.h"
-
-
-/* this entire file content is excluded if not using HW hash acceleration */
-#if defined(WOLFSSL_ESP32WROOM32_CRYPT) && \
-   !defined(NO_WOLFSSL_ESP32WROOM32_CRYPT_HASH)
-
-/* TODO this may be chip type dependent: add support for others */
-#include <hal/clk_gate_ll.h> /* ESP32-WROOM */
-
-#include <wolfssl/wolfcrypt/sha.h>
-#include <wolfssl/wolfcrypt/sha256.h>
-#include <wolfssl/wolfcrypt/sha512.h>
-
-#include "wolfssl/wolfcrypt/port/Espressif/esp32-crypt.h"
-#include "wolfssl/wolfcrypt/error-crypt.h"
-
-#ifdef NO_INLINE
-    #include <wolfssl/wolfcrypt/misc.h>
-#else
-    #define WOLFSSL_MISC_INCLUDED
-    #include <wolfcrypt/src/misc.c>
-#endif
-
-static const char* TAG = "wolf_hw_sha";
-
-#ifdef NO_SHA
-    #define WC_SHA_DIGEST_SIZE 20
-#endif
-
-/* RTOS mutex or just InUse variable  */
-#if defined(SINGLE_THREADED)
-    static int InUse = 0;
-#else
-    static wolfSSL_Mutex sha_mutex = NULL;
-
-    #if defined(DEBUG_WOLFSSL)
-        /* Only when debugging, we'll keep track of block numbers. */
-        static int this_block_num = 0;
-    #endif
-#endif
-
-/* esp_sha_init
-**
-**   ctx: any wolfSSL ctx from any hash algo
-**   hash_type: the specific wolfSSL enum for hash type
-**
-** Initializes ctx based on chipset capabilities and current state.
-** Active HW states, such as from during a copy operation, are demoted to SW.
-** For hash_type not available in HW, set SW mode.
-**
-** See esp_sha_init_ctx(ctx)
-*/
-int esp_sha_init(WC_ESP32SHA* ctx, enum wc_HashType hash_type)
-{
-    int ret = 0;
-
-#if defined(CONFIG_IDF_TARGET_ESP32) || defined(CONFIG_IDF_TARGET_ESP32S3)
-    switch (hash_type) { /* check each wolfSSL hash type WC_[n] */
-        case WC_HASH_TYPE_SHA:
-            ctx->sha_type = SHA1; /* assign Espressif SHA HW type */
-            ret = esp_sha_init_ctx(ctx);
-            break;
-
-        case WC_HASH_TYPE_SHA256:
-            ctx->sha_type = SHA2_256; /* assign Espressif SHA HW type */
-            ret = esp_sha_init_ctx(ctx);
-            break;
-
-    #ifdef CONFIG_IDF_TARGET_ESP32S3
-        case  WC_HASH_TYPE_SHA384:
-            /* TODO is SHA384 really not supported on -S3? */
-            ctx->mode = ESP32_SHA_SW;
-            ctx->sha_type = SHA2_384; /* Espressif type, but we won't use HW */
-            break;
-    #else
-        case  WC_HASH_TYPE_SHA384:
-            ctx->sha_type = SHA2_384; /* assign Espressif SHA HW type */
-            ret = esp_sha_init_ctx(ctx);
-            break;
-    #endif
-
-        case WC_HASH_TYPE_SHA512:
-            ctx->sha_type = SHA2_512; /* assign Espressif SHA HW type */
-            ret = esp_sha_init_ctx(ctx);
-            break;
-
-    #ifndef WOLFSSL_NOSHA512_224
-        case WC_HASH_TYPE_SHA512_224:
-            /* Don't call init, always SW as there's no HW. */
-            ctx->mode = ESP32_SHA_SW;
-            ctx->sha_type = SHA2_512; /* Espressif type, but we won't use HW */
-            break;
-    #endif
-
-    #ifndef WOLFSSL_NOSHA512_256
-        case WC_HASH_TYPE_SHA512_256:
-            /* Don't call init, always SW as there's no HW. */
-            ctx->mode = ESP32_SHA_SW;
-            ctx->sha_type = SHA2_512; /* Espressif type, but we won't use HW */
-            break;
-    #endif
-
-        default:
-           ret = esp_sha_init_ctx(ctx);
-           ESP_LOGW(TAG, "Unexpected hash_type in esp_sha_init");
-           break;
-    }
-#else
-    /* other chipsets will be implemented here */
-#endif /* defined(CONFIG_IDF_TARGET_ESP32) || defined(CONFIG_IDF_TARGET_ESP32S3) */
-
-    return ret;
-}
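-
-/*
-** e.g. a caller using wc_InitSha256() typically arrives here with
-** hash_type = WC_HASH_TYPE_SHA256, which maps to the Espressif SHA2_256
-** engine type before esp_sha_init_ctx() decides between HW and SW.
-*/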
-
-/* we'll call a separate init as there's only 1 HW acceleration */
-int esp_sha_init_ctx(WC_ESP32SHA* ctx)
-{
-    if (ctx->initializer == NULL) {
-        ESP_LOGV(TAG, "regular init of blank WC_ESP32SHA ctx");
-
-        /* we'll keep track of who initialized this */
-        ctx->initializer = ctx; /* save our address in the initializer */
-        ctx->mode = ESP32_SHA_INIT;
-    }
-    else {
-        /* things may be more interesting when previously initialized */
-        if (ctx->initializer == ctx) {
-            /* We're likely re-using an existing object previously initialized.
-            ** There's of course a non-zero probability that garbage data has
-            ** the same pointer value, but that's highly unlikely; we'd need
-            ** to discard, then re-init at the same memory location to get a
-            ** matching initializer. */
-            ESP_LOGV(TAG, "re-using existing WC_ESP32SHA ctx");
-
-            /* we should never have an unexpected mode in a known ctx */
-            switch (ctx->mode) {
-                case ESP32_SHA_INIT:
-                case ESP32_SHA_SW:
-                    /* nothing interesting here */
-                    break;
-
-                case ESP32_SHA_HW:
-                    /* This will be dealt with below: likely demote to SW */
-                    break;
-
-                case ESP32_SHA_HW_COPY:
-                    /* This is an interesting mode, caller gave HW mode hint */
-                    ESP_LOGI(TAG, "ALERT: ESP32_SHA_HW_COPY?");
-                    break;
-
-                default:
-                    /* This should almost never occur. We'd need to have an
-                    ** uninitialized ctx that just happens to include the
-                    ** breadcrumb initializer with the same address. */
-                    ESP_LOGW(TAG, "ALERT: unexpected WC_ESP32SHA ctx mode: "
-                                  "%d. ", ctx->mode);
-                    ctx->mode = ESP32_SHA_INIT;
-                    break;
-            }
-            /* We don't need to do anything here,
-            ** this section for diagnostics only.
-            ** May need to unlock HW, below. */
-        } /* ctx->initializer == ctx */
-        else {
-            /* We may end up here with either dirty memory
-            ** or copied SHA ctx.
-            **
-            ** Any copy function should have already set mode = ESP32_SHA_INIT.
-            **
-            ** In either case, initialize: */
-            ctx->initializer = ctx; /* set a new address */
-
-            /* Always set to ESP32_SHA_INIT, but give debug info as to why: */
-            switch (ctx->mode) {
-                case ESP32_SHA_INIT:
-                    /* if we are already in init mode, nothing to do. */
-                    break;
-
-                case ESP32_SHA_SW:
-                    /* this should rarely, if ever occur */
-                    ESP_LOGW(TAG, "ALERT: unexpected SW WC_ESP32SHA ctx mode. "
-                                  "Copied? Revert to ESP32_SHA_INIT.");
-                    ctx->mode = ESP32_SHA_INIT;
-                    break;
-
-                case ESP32_SHA_HW:
-                    /* this should rarely, if ever occur. */
-                    ESP_LOGW(TAG, "ALERT: unexpected HW WC_ESP32SHA ctx mode. "
-                                  "Copied?");
-                    ctx->mode = ESP32_SHA_INIT;
-                    break;
-
-                case ESP32_SHA_HW_COPY:
-                    /* This is an interesting but acceptable situation:
-                    ** an anticipated active HW copy that will demote to SW. */
-                    ESP_LOGV(TAG, "HW WC_ESP32SHA ctx mode = ESP32_SHA_HW_COPY.");
-                    break;
-
-                default:
-                    /* this will frequently occur during new init */
-                    ESP_LOGV(TAG, "ALERT: unexpected WC_ESP32SHA ctx mode. "
-                                  "Uninitialized?");
-                    ctx->mode = ESP32_SHA_INIT;
-                    break;
-            } /* switch */
-        } /* ctx->initializer != ctx */
-    } /* ctx->initializer != NULL */
-
-    /*
-    ** After possibly changing the mode (above) handle current mode:
-    */
-    switch (ctx->mode) {
-        case ESP32_SHA_INIT:
-            /* Likely a fresh, new SHA, as desired. */
-            ESP_LOGV(TAG, "Normal ESP32_SHA_INIT");
-            break;
-
-        case ESP32_SHA_HW:
-            /* We're already in hardware mode, so release. */
-            /* Interesting, but normal. */
-            ESP_LOGV(TAG, ">> HW unlock.");
-
-            /* During init is the ONLY TIME we call unlock.
-            ** If there's a problem, likely some undesired operation
-            ** outside of wolfSSL.
-            */
-            esp_sha_hw_unlock(ctx);
-            ctx->mode = ESP32_SHA_INIT;
-            break;
-
-        case ESP32_SHA_HW_COPY:
-            /* When we init during a known active HW copy, revert to SW. */
-            ESP_LOGV(TAG, "Planned revert to SW during copy.");
-            ctx->mode = ESP32_SHA_SW;
-            break;
-
-        case ESP32_SHA_SW:
-            /* This is an interesting situation: likely a call when
-            ** another SHA in progress, but copied. */
-            ESP_LOGV(TAG, ">> SW Set to init.");
-            ctx->mode = ESP32_SHA_INIT;
-            break;
-
-        case ESP32_SHA_FAIL_NEED_UNROLL:
-            /* Oh, how did we get here? likely uninitialized SHA memory.
-            ** User code logic may need attention. */
-            ESP_LOGW(TAG, "ALERT: \nESP32_SHA_FAIL_NEED_UNROLL\n");
-            ctx->mode = ESP32_SHA_INIT;
-            break;
-
-        default:
-            /* Most likely corrupted memory. */
-            ESP_LOGW(TAG, "ALERT: \nunexpected mode value: "
-                          "%d \n", ctx->mode);
-            ctx->mode = ESP32_SHA_INIT;
-            break;
-    } /* switch (ctx->mode)  */
-
-    /* reminder: always start isfirstblock = 1 (true) when using HW engine */
-    /* we're always on the first block at init time (not zero-based!) */
-    ctx->isfirstblock = true;
-    ctx->lockDepth = 0; /* new objects will always start with lock depth = 0 */
-
-    return 0; /* Always return success. We assume all issues handled, above. */
-} /* esp_sha_init_ctx */
-
-/*
-** internal SHA ctx copy for ESP HW
-*/
-int esp_sha_ctx_copy(struct wc_Sha* src, struct wc_Sha* dst)
-{
-    int ret;
-    if (src->ctx.mode == ESP32_SHA_HW) {
-        /* this is an interesting situation to copy HW digest to SW */
-        ESP_LOGV(TAG, "esp_sha_ctx_copy esp_sha_digest_process");
-
-        /* Get a copy of the HW digest, but don't process it. */
-        ret = esp_sha_digest_process(dst, 0);
-        if (ret == 0) {
-            /* note we arrived here only because the src is already in HW mode */
-            dst->ctx.mode = ESP32_SHA_HW_COPY; /* provide init hint to SW revert */
-
-            /* initializer will be set during init */
-            ret = esp_sha_init(&(dst->ctx), WC_HASH_TYPE_SHA);
-            if (ret != 0) {
-                ESP_LOGE(TAG, "Error during esp_sha_ctx_copy in esp_sha_init.");
-            }
-        }
-        else {
-            ESP_LOGE(TAG, "Error during esp_sha_ctx_copy in esp_sha_digest_process.");
-        }
-
-        if (dst->ctx.mode == ESP32_SHA_SW) {
-            /* The normal revert to SW in copy is expected */
-            ESP_LOGV(TAG, "Confirmed SHA Copy set to SW");
-        }
-        else {
-            /* However NOT reverting to SW is not right.
-            ** This should never happen. */
-            ESP_LOGW(TAG, "SHA Copy NOT set to SW");
-        }
-    } /* (src->ctx.mode == ESP32_SHA_HW */
-    else { /* src not in HW mode, ok to copy. */
-        /*
-        ** reminder XMEMCOPY, above: dst->ctx = src->ctx;
-        ** No special HW init needed in SW mode.
-        ** but we need to set our initializer breadcrumb: */
-        dst->ctx.initializer = &(dst->ctx); /* assign new breadcrumb to dst */
-        ret = 0;
-    }
-
-    return ret;
-} /* esp_sha_ctx_copy */
-
-/*
-** internal sha224 ctx copy (no ESP HW)
-*/
-int esp_sha224_ctx_copy(struct wc_Sha256* src, struct wc_Sha256* dst)
-{
-    /* There's no 224 hardware on ESP32 */
-    dst->ctx.initializer = &dst->ctx; /* assign the initializer to dst */
-
-    /* always set to SW, as there's no ESP32 HW for SHA224.
-    ** TODO: add support for ESP32-S2, ESP32-S3, ESP32-C3 here.
-    */
-    dst->ctx.mode = ESP32_SHA_SW;
-    return 0;
-} /* esp_sha224_ctx_copy */
-
-/*
-** internal sha256 ctx copy for ESP HW
-*/
-int esp_sha256_ctx_copy(struct wc_Sha256* src, struct wc_Sha256* dst)
-{
-    int ret;
-    if (src->ctx.mode == ESP32_SHA_HW) {
-        /* Get a copy of the HW digest, but don't process it. */
-        ESP_LOGI(TAG, "esp_sha256_ctx_copy esp_sha256_digest_process");
-        ret = esp_sha256_digest_process(dst, 0);
-
-        if (ret == 0) {
-            /* provide init hint to possibly SW revert */
-            dst->ctx.mode = ESP32_SHA_HW_COPY;
-
-            /* initializer breadcrumb will be set during init */
-            ret = esp_sha_init(&(dst->ctx), WC_HASH_TYPE_SHA256 );
-        }
-
-        if (dst->ctx.mode == ESP32_SHA_SW) {
-            ESP_LOGV(TAG, "Confirmed wc_Sha256 Copy set to SW");
-        }
-        else {
-            ESP_LOGW(TAG, "wc_Sha256 Copy NOT set to SW");
-        }
-    } /* (src->ctx.mode == ESP32_SHA_HW) */
-    else {
-        ret = 0;
-        /*
-        ** reminder this happened in XMEMCOPY: dst->ctx = src->ctx;
-        ** No special HW init needed in SW mode.
-        ** but we need to set our initializer: */
-        dst->ctx.initializer = &dst->ctx; /* assign the initializer to dst */
-    } /* not (src->ctx.mode == ESP32_SHA_HW) */
-
-    return ret;
-} /* esp_sha256_ctx_copy */
-
-/*
-** internal sha384 ctx copy for ESP HW
-*/
-int esp_sha384_ctx_copy(struct wc_Sha512* src, struct wc_Sha512* dst)
-{
-    int ret;
-    if (src->ctx.mode == ESP32_SHA_HW) {
-        /* Get a copy of the HW digest, but don't process it. */
-        ESP_LOGI(TAG, "esp_sha384_ctx_copy esp_sha512_digest_process");
-        ret = esp_sha512_digest_process(dst, 0);
-        if (ret == 0) {
-            /* provide init hint to SW revert */
-            dst->ctx.mode = ESP32_SHA_HW_COPY;
-
-            /* initializer will be set during init */
-            ret = esp_sha_init(&(dst->ctx), WC_HASH_TYPE_SHA384);
-            if (ret != 0) {
-                ESP_LOGE(TAG, "Error during esp_sha384_ctx_copy in esp_sha_init.");
-            }
-        }
-        else {
-            ESP_LOGE(TAG, "Error during esp_sha384_ctx_copy in esp_sha512_digest_process.");
-        }
-
-        /* just some diagnostic runtime info */
-        if (dst->ctx.mode == ESP32_SHA_SW) {
-            ESP_LOGV(TAG, "Confirmed wc_Sha512 Copy set to SW");
-        }
-        else {
-            ESP_LOGW(TAG, "wc_Sha512 Copy NOT set to SW");
-        }
-    } /* src->ctx.mode == ESP32_SHA_HW */
-    else {
-        ret = 0;
-        /*
-        ** reminder this happened in XMEMCOPY, above: dst->ctx = src->ctx;
-        ** No special HW init needed in SW mode.
-        ** but we need to set our initializer: */
-        dst->ctx.initializer = &dst->ctx; /* assign the initializer to dst */
-    } /* not (src->ctx.mode == ESP32_SHA_HW) */
-
-    return ret;
-} /* esp_sha384_ctx_copy */
-
-/*
-** Internal sha512 ctx copy for ESP HW.
-** If HW already active, fall back to SW for this ctx.
-*/
-int esp_sha512_ctx_copy(struct wc_Sha512* src, struct wc_Sha512* dst)
-{
-    int ret;
-    if (src->ctx.mode == ESP32_SHA_HW) {
-        /* Get a copy of the HW digest, but don't process it. */
-        ESP_LOGI(TAG, "esp_sha512_ctx_copy esp_sha512_digest_process");
-        ret = esp_sha512_digest_process(dst, 0);
-
-        if (ret == 0) {
-            /* provide init hint to SW revert */
-            dst->ctx.mode = ESP32_SHA_HW_COPY;
-
-            /* initializer will be set during init
-            ** reminder we should never arrive here for
-            ** ESP32 SHA512/224 or SHA512/256, as there's no HW */
-            ret = esp_sha_init(&(dst->ctx), WC_HASH_TYPE_SHA512);
-        }
-
-        if (dst->ctx.mode == ESP32_SHA_SW) {
-            ESP_LOGV(TAG, "Confirmed wc_Sha512 Copy set to SW");
-        }
-        else {
-            ESP_LOGW(TAG, "wc_Sha512 Copy NOT set to SW");
-        }
-    } /* src->ctx.mode == ESP32_SHA_HW */
-    else {
-        ret = 0;
-        /* reminder this happened in XMEMCOPY, above: dst->ctx = src->ctx;
-        ** No special HW init needed when not in active HW mode.
-        ** but we need to set our initializer breadcrumb: */
-        dst->ctx.initializer = &dst->ctx; /*breadcrumb is this ctx address */
-    }
-
-    return ret;
-} /* esp_sha512_ctx_copy */
-
-/*
-** determine the digest size, depending on SHA type.
-**
-** See FIPS PUB 180-4, Section 1.
-**
-** see the ESP32 sha.h for values:
-**
-**  enum SHA_TYPE {
-**      SHA1 = 0,
-**      SHA2_256,
-**      SHA2_384,
-**      SHA2_512,
-**      SHA_INVALID = -1,
-**  };
-**
-** given the SHA_TYPE (see Espressif sha.h) return WC digest size.
-**
-** Returns zero for bad digest size type request.
-**
-*/
-static word32 wc_esp_sha_digest_size(enum SHA_TYPE type)
-{
-    word32 ret = 0;
-    ESP_LOGV(TAG, "  wc_esp_sha_digest_size");
-
-    switch (type) {
-    #ifndef NO_SHA
-        case SHA1: /* typically 20 bytes */
-            ret = WC_SHA_DIGEST_SIZE;
-            break;
-    #endif
-    #ifdef WOLFSSL_SHA224
-    /*
-        no SHA224 HW at this time.
-        case SHA2_224:
-            ret = WC_SHA224_DIGEST_SIZE;
-            break;
-    */
-    #endif
-    #ifndef NO_SHA256
-        case SHA2_256: /* typically 32 bytes */
-            ret = WC_SHA256_DIGEST_SIZE;
-            break;
-    #endif
-    #ifdef WOLFSSL_SHA384
-        case SHA2_384:
-            ret = WC_SHA384_DIGEST_SIZE;
-            break;
-    #endif
-    #ifdef WOLFSSL_SHA512
-        case SHA2_512: /* typically 64 bytes */
-            ret = WC_SHA512_DIGEST_SIZE;
-            break;
-    #endif
-        default:
-            ESP_LOGE(TAG, "Bad SHA type in wc_esp_sha_digest_size");
-            ret = 0;
-            break;
-    }
-
-    return ret; /* Return value is a size, not an error code. */
-} /* wc_esp_sha_digest_size */
-
-/*
-** Wait until all engines become idle.
-*/
-static int wc_esp_wait_until_idle(void)
-{
-    int ret = 0; /* assume success */
-
-#if defined(CONFIG_IDF_TARGET_ESP32C3) || defined(CONFIG_IDF_TARGET_ESP32C6)
-    /* ESP32-C3 RISC-V TODO */
-#elif defined(CONFIG_IDF_TARGET_ESP32S3)
-    while (REG_READ(SHA_BUSY_REG)) {
-      /* do nothing while waiting. */
-    }
-#else
-    while ((DPORT_REG_READ(SHA_1_BUSY_REG)   != 0) ||
-           (DPORT_REG_READ(SHA_256_BUSY_REG) != 0) ||
-           (DPORT_REG_READ(SHA_384_BUSY_REG) != 0) ||
-           (DPORT_REG_READ(SHA_512_BUSY_REG) != 0)) {
-        /* do nothing while waiting. */
-    }
-#endif
-
-    return ret;
-} /* wc_esp_wait_until_idle */
-
-/*
-** Hack alert: there really should be something implemented
-** in Espressif periph_ctrl.c to detect the ref_counts[periph] depth.
-**
-** Since there is not at this time, we have this brute-force method:
-**
-** when trying to unwrap an arbitrary depth of peripheral-enable(s),
-** we check the register upon *enable* to see if we actually did enable it.
-**
-** Note that enable / disable only takes effect when ref_counts[periph] == 0.
-**
-** TODO: check if this works on other ESP32 platforms: ESP32-C3,
-** ESP32-S3, etc. (A: generally, no. RISC-V has a different HW accelerator.)
-*/
-int esp_unroll_sha_module_enable(WC_ESP32SHA* ctx)
-{
-    /* if we end up here, there was a prior unexpected fail and
-     * we need to unroll enables */
-    int ret = 0; /* assume success unless proven otherwise */
-    int actual_unroll_count = 0;
-    int max_unroll_count = 1000; /* never get stuck in a hardware wait loop */
-
-#if defined(CONFIG_IDF_TARGET_ESP32)
-    uint32_t this_sha_mask; /* this is the bit-mask for our SHA CLK_EN_REG */
-#endif
-
-    if (ctx == NULL) {
-        ESP_LOGE(TAG, "esp_unroll_sha_module_enable called with null ctx.");
-        return BAD_FUNC_ARG;
-    }
-
-#if defined(CONFIG_IDF_TARGET_ESP32C3) || defined(CONFIG_IDF_TARGET_ESP32C6)
-    /*  RISC-V Architecture: TODO */
-#else
-    /* Xtensa Architecture */
-
-    /* unwind prior calls to THIS ctx. decrement ref_counts[periph] */
-    /* only when ref_counts[periph] == 0 does something actually happen */
-
-    /* once the value we read is a 0 in the DPORT_PERI_CLK_EN_REG bit
-     * then we have fully unrolled the enables via ref_counts[periph]==0 */
-#if CONFIG_IDF_TARGET_ESP32S3
-    /* once the value we read is a 0 in the DPORT_PERI_CLK_EN_REG bit
-     * then we have fully unrolled the enables via ref_counts[periph]==0 */
-    while (periph_ll_periph_enabled(PERIPH_SHA_MODULE)) {
-#else
-    /* this is the bit-mask for our SHA CLK_EN_REG */
-    this_sha_mask = periph_ll_get_clk_en_mask(PERIPH_SHA_MODULE);
-    asm volatile("memw");
-    while ((this_sha_mask & *(uint32_t*)DPORT_PERI_CLK_EN_REG) != 0) {
-#endif /* CONFIG_IDF_TARGET_ESP32S3 */
-        periph_module_disable(PERIPH_SHA_MODULE);
-        asm volatile("memw");
-        actual_unroll_count++;
-        ESP_LOGI(TAG, "unroll not yet successful. try #%d",
-                 actual_unroll_count);
-
-        /* we'll only try this some unreasonable number of times
-         * before giving up */
-        if (actual_unroll_count > max_unroll_count) {
-            ret = -1; /* failed to unroll */
-            break;
-        }
-    }
-#endif /* else; not RISC-V */
-    if (ret == 0) {
-        if (ctx->lockDepth != actual_unroll_count) {
-            /* This could be a warning of wonkiness in the RTOS environment.
-            ** We were successful, but not at the expected depth count.
-            **
-            ** This should never happen unless someone else called
-            ** periph_module_disable() or threading is not working properly.
-            **/
-            ESP_LOGW(TAG, "warning lockDepth mismatch.");
-        }
-        ctx->lockDepth = 0;
-        ctx->mode = ESP32_SHA_INIT;
-    }
-    else {
-        /* This should never occur. Something must have gone seriously
-        ** wrong. Check for non-wolfSSL outside calls that may have enabled HW.
-        */
-        ESP_LOGE(TAG, "Failed to unroll after %d attempts.",
-                      actual_unroll_count);
-        ESP_LOGI(TAG, "Setting ctx->mode = ESP32_SHA_SW");
-        ctx->mode = ESP32_SHA_SW;
-    }
-    return ret;
-} /* esp_unroll_sha_module_enable */
-
-/*
-** Lock the HW engine.
-** This should be called before using the engine.
-*/
-int esp_sha_try_hw_lock(WC_ESP32SHA* ctx)
-{
-    int ret = 0;
-
-    if (ctx == NULL) {
-        ESP_LOGE(TAG, " esp_sha_try_hw_lock called with NULL ctx");
-        return BAD_FUNC_ARG;
-    }
-
-    ESP_LOGV(TAG, "enter esp_sha_hw_lock %x", (int)ctx->initializer);
-
-    /* Init mutex
-     *
-     * Note that even single thread mode may calculate hashes
-     * concurrently, so we still need to keep track of the
-     * engine being busy or not.
-     **/
-#if defined(SINGLE_THREADED)
-    if (ctx->mode == ESP32_SHA_INIT) {
-        if (!InUse) {
-            ctx->mode = ESP32_SHA_HW;
-            InUse = 1;
-        }
-        else {
-            ctx->mode = ESP32_SHA_SW;
-        }
-    }
-    else {
-        /* this should not happen */
-        ESP_LOGE(TAG, "unexpected error in esp_sha_try_hw_lock.");
-        return -1;
-    }
-#else /* not defined(SINGLE_THREADED) */
-    /*
-    ** There's only one SHA engine for all the hash types,
-    ** so when any hash is in use, no others can use it:
-    ** fall back to SW.
-    **
-    ** Here is some sample code to test the unrolling of SHA enables:
-    **
-
-    periph_module_enable(PERIPH_SHA_MODULE);
-    ctx->lockDepth++;
-    periph_module_enable(PERIPH_SHA_MODULE);
-    ctx->lockDepth++;
-    ctx->mode = ESP32_FAIL_NEED_INIT;
-
-    **
-    */
-
-    if (sha_mutex == NULL) {
-        ESP_LOGV(TAG, "Initializing sha_mutex");
-
-        /* created, but not yet locked */
-        ret = esp_CryptHwMutexInit(&sha_mutex);
-        if (ret == 0) {
-            ESP_LOGV(TAG, "esp_CryptHwMutexInit sha_mutex init success.");
-        }
-        else {
-            ESP_LOGE(TAG, "esp_CryptHwMutexInit sha_mutex failed.");
-            sha_mutex = 0;
-
-            ESP_LOGI(TAG, "Revert to ctx->mode = ESP32_SHA_SW.");
-            ctx->mode = ESP32_SHA_SW;
-            return 0; /* success, just not using HW */
-        }
-    }
-
-    /* check if this SHA has been operated as SW or HW, or not yet init */
-    if (ctx->mode == ESP32_SHA_INIT) {
-        /* try to lock the HW engine */
-        ESP_LOGV(TAG, "ESP32_SHA_INIT\n");
-
-        /* we don't wait:
-        ** either the engine is free, or we fall back to SW
-        **/
-        if (esp_CryptHwMutexLock(&sha_mutex, (TickType_t)0) == 0) {
-            /* check to see if we had a prior fail and need to unroll enables */
-            ret = esp_unroll_sha_module_enable(ctx);
-            ESP_LOGV(TAG, "Hardware Mode, lock depth = %d,  %x",
-                          ctx->lockDepth, (int)ctx->initializer);
-
-            if (ctx->lockDepth > 0) {
-                /* it is unlikely that this would ever occur,
-                ** as the mutex should be gate keeping */
-                ESP_LOGW(TAG, "WARNING: Hardware Mode "
-                              "interesting lock depth = %d,  %x",
-                              ctx->lockDepth, (int)ctx->initializer);
-            }
-        }
-        else {
-            /* We should have otherwise anticipated this; how did we get here?
-            ** This code should rarely, ideally never be reached. */
-            ESP_LOGI(TAG, "\nHardware in use; Mode REVERT to ESP32_SHA_SW\n");
-            ctx->mode = ESP32_SHA_SW;
-            return 0; /* success, but revert to SW */
-        }
-    } /* (ctx->mode == ESP32_SHA_INIT) */
-    else {
-        /* this should not happen: called during mode != ESP32_SHA_INIT  */
-        ESP_LOGE(TAG, "unexpected error in esp_sha_try_hw_lock.");
-        return -1;
-    }
-#endif /* not defined(SINGLE_THREADED) */
-
-#if defined(CONFIG_IDF_TARGET_ESP32C3) || defined(CONFIG_IDF_TARGET_ESP32C6)
-    /* ESP32-C3 RISC-V TODO */
-#else
-    if (ret == 0) {
-        ctx->lockDepth++; /* depth for THIS ctx (there could be others!) */
-        periph_module_enable(PERIPH_SHA_MODULE);
-        ctx->mode = ESP32_SHA_HW;
-    }
-    else {
-        ESP_LOGW(TAG, ">>>> Other problem; Mode REVERT to ESP32_SHA_SW");
-        ctx->mode = ESP32_SHA_SW;
-    }
-#endif
-    ESP_LOGV(TAG, "leave esp_sha_hw_lock");
-
-    return ret;
-} /* esp_sha_try_hw_lock */
-
-/*
-** Release the HW engine. When we don't have it locked, the SHA module is DISABLED.
-*/
-int esp_sha_hw_unlock(WC_ESP32SHA* ctx)
-{
-    ESP_LOGV(TAG, "enter esp_sha_hw_unlock");
-
-#if defined(CONFIG_IDF_TARGET_ESP32C3) || defined(CONFIG_IDF_TARGET_ESP32C6)
-    /* ESP32-C3 RISC-V TODO */
-#else
-    /* Disable SHA hardware */
-    periph_module_disable(PERIPH_SHA_MODULE);
-#endif
-    /* We'll keep track of our lock depth.
-     * In case of unexpected results, all the periph_module_enable() calls
-     * need to be unwound with matching periph_module_disable() calls.
-     *
-     * See ref_counts[periph] in file: periph_ctrl.c */
-    if (ctx->lockDepth > 0) {
-        ctx->lockDepth--;
-    }
-    else {
-        ctx->lockDepth = 0;
-    }
-
-#if defined(SINGLE_THREADED)
-    InUse = 0;
-#else
-    /* unlock HW engine for next use */
-    esp_CryptHwMutexUnLock(&sha_mutex);
-#endif
-    ESP_LOGV(TAG, "leave esp_sha_hw_unlock, %x", (int)ctx->initializer);
-    return 0;
-} /* esp_sha_hw_unlock */
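-
-/*
-** Illustrative lock-usage sketch (example only, not part of the API here):
-** a caller pairs esp_sha_try_hw_lock() with esp_sha_hw_unlock() and honors
-** the SW fallback the lock attempt may have selected. The helper name and
-** the single-block flow below are hypothetical.
-*/
-#if 0
-static int example_sha256_one_block(struct wc_Sha256* sha, const byte* block)
-{
-    int ret = esp_sha_try_hw_lock(&sha->ctx); /* may revert ctx to SW mode */
-    if (ret == 0 && sha->ctx.mode == ESP32_SHA_HW) {
-        ret = esp_sha256_process(sha, block); /* HW path */
-        esp_sha_hw_unlock(&sha->ctx);         /* always release the engine */
-    }
-    /* when ctx.mode == ESP32_SHA_SW, the caller uses the SW implementation */
-    return ret;
-}
-#endif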
-
-/*
-* Start the SHA process using the HW engine.
-* Assumes the registers are already loaded.
-* Returns a negative error code upon failure.
-*/
-static int esp_sha_start_process(WC_ESP32SHA* sha)
-{
-    int ret = 0;
-#if defined(CONFIG_IDF_TARGET_ESP32S3)
-    uint8_t HardwareAlgorithm;
-#endif
-
-    if (sha == NULL) {
-        return BAD_FUNC_ARG;
-    }
-
-    ESP_LOGV(TAG, "    enter esp_sha_start_process");
-
-    #if defined(CONFIG_IDF_TARGET_ESP32C3) || defined(CONFIG_IDF_TARGET_ESP32C6)
-        /* ESP32-C3 RISC-V TODO */
-    #elif defined(CONFIG_IDF_TARGET_ESP32S3)
-
-    /* Translate from the wolfSSL SHA type to the hardware algorithm. */
-    HardwareAlgorithm = 0;
-    switch (sha->sha_type) {
-        case SHA1:
-            HardwareAlgorithm = 0;
-            break;
-        case SHA2_256:
-            HardwareAlgorithm = 2;
-            break;
-    #if defined(WOLFSSL_SHA384)
-        case SHA2_384:
-            HardwareAlgorithm = 3;
-            break;
-    #endif
-    #if defined(WOLFSSL_SHA512)
-        case SHA2_512:
-            HardwareAlgorithm = 4;
-            break;
-    #endif
-        default:
-            /* Unsupported SHA mode. */
-            sha->mode = ESP32_SHA_FAIL_NEED_UNROLL;
-            return -1;
-    }
-
-    REG_WRITE(SHA_MODE_REG, HardwareAlgorithm);
-
-    if (sha->isfirstblock) {
-        REG_WRITE(SHA_START_REG, 1);
-        sha->isfirstblock = false;
-
-        ESP_LOGV(TAG, "      set sha->isfirstblock = 0");
-
-    #if defined(DEBUG_WOLFSSL)
-        this_block_num = 1; /* one-based counter, just for debug info */
-    #endif
-    } /* first block */
-    else {
-        REG_WRITE(SHA_CONTINUE_REG, 1);
-
-    #if defined(DEBUG_WOLFSSL)
-        this_block_num++; /* one-based counter */
-        ESP_LOGV(TAG, "      continue block #%d", this_block_num);
-    #endif
-    } /* not first block */
-
-#else /* not ESP32S3 */
-    if (sha->isfirstblock) {
-        /* start registers for first message block
-         * we don't make any relational memory position assumptions.
-         */
-        switch (sha->sha_type) {
-            case SHA1:
-                DPORT_REG_WRITE(SHA_1_START_REG, 1);
-                break;
-
-            case SHA2_256:
-                DPORT_REG_WRITE(SHA_256_START_REG, 1);
-                break;
-
-        #if defined(WOLFSSL_SHA384)
-            case SHA2_384:
-                DPORT_REG_WRITE(SHA_384_START_REG, 1);
-                break;
-        #endif
-
-        #if defined(WOLFSSL_SHA512)
-            case SHA2_512:
-                DPORT_REG_WRITE(SHA_512_START_REG, 1);
-                break;
-        #endif
-
-            default:
-                sha->mode = ESP32_SHA_FAIL_NEED_UNROLL;
-                ret = -1;
-                break;
-        }
-
-        sha->isfirstblock = false;
-        ESP_LOGV(TAG, "      set sha->isfirstblock = 0");
-
-    #if defined(DEBUG_WOLFSSL)
-        this_block_num = 1; /* one-based counter, just for debug info */
-    #endif
-
-    }
-    else {
-
-        /* continue registers for next message block.
-         * we don't make any relational memory position assumptions
-         * for future chip architecture changes.
-         */
-        switch (sha->sha_type) {
-            case SHA1:
-                DPORT_REG_WRITE(SHA_1_CONTINUE_REG, 1);
-                break;
-
-            case SHA2_256:
-                DPORT_REG_WRITE(SHA_256_CONTINUE_REG, 1);
-                break;
-
-        #if defined(WOLFSSL_SHA384)
-            case SHA2_384:
-                DPORT_REG_WRITE(SHA_384_CONTINUE_REG, 1);
-                break;
-        #endif
-
-        #if defined(WOLFSSL_SHA512)
-            case SHA2_512:
-                DPORT_REG_WRITE(SHA_512_CONTINUE_REG, 1);
-                break;
-        #endif
-
-            default:
-                /* error for unsupported other values */
-                sha->mode = ESP32_SHA_FAIL_NEED_UNROLL;
-                ret = -1;
-                break;
-        }
-
-    #if defined(DEBUG_WOLFSSL)
-        this_block_num++; /* one-based counter */
-        ESP_LOGV(TAG, "      continue block #%d", this_block_num);
-    #endif
-    }
-    #endif /* not CONFIG_IDF_TARGET_ESP32S3 */
-
-    ESP_LOGV(TAG, "    leave esp_sha_start_process");
-
-    return ret;
-} /* esp_sha_start_process */
-
-/*
-** process message block
-*/
-static int wc_esp_process_block(WC_ESP32SHA* ctx, /* see ctx->sha_type */
-                                 const word32* data,
-                                 word32 len)
-{
-    int ret = 0; /* assume success */
-    word32 word32_to_save = (len) / (sizeof(word32));
-#ifdef CONFIG_IDF_TARGET_ESP32S3
-    uint32_t* MessageSource;
-    uint32_t* AcceleratorMessage;
-#else
-    int i;
-#endif
-    ESP_LOGV(TAG, "  enter esp_process_block");
-    if (word32_to_save > 0x31) {
-        word32_to_save = 0x31;
-        ESP_LOGE(TAG, "  ERROR esp_process_block length exceeds 0x31 words.");
-    }
-
-    /* wait until the engine is available */
-    ret = wc_esp_wait_until_idle();
-
-#if CONFIG_IDF_TARGET_ESP32S3
-    MessageSource = (uint32_t*)data;
-    AcceleratorMessage = (uint32_t*)(SHA_TEXT_BASE);
-    while (word32_to_save--) {
-      /* Must swap endianness of data loaded into the hardware accelerator to
-       * produce the correct result. Using DPORT_REG_WRITE doesn't avoid this
-       * for the ESP32-S3.
-       * Note: the data sheet claims we also need to swap endianness across
-       * 64-bit words when doing SHA-512, but the SHA-512 result is not
-       * correct if you do that. */
-      DPORT_REG_WRITE(AcceleratorMessage, __builtin_bswap32(*MessageSource));
-      ++AcceleratorMessage;
-      ++MessageSource;
-    } /*  (word32_to_save--) */
-
-#else
-    /* load the message data into HW, one 32-bit word at a time */
-    for (i = 0; i < word32_to_save; i++) {
-        /* by using DPORT_REG_WRITE, we avoid the need
-         * to call __builtin_bswap32 to address endianness.
-         *
-         * a useful watch array cast to watch at runtime:
-         *   ((uint32_t[32])  (*(volatile uint32_t *)(SHA_TEXT_BASE)))
-         *
-         * Write value to DPORT register (does not require protecting)
-         */
-    #if defined(CONFIG_IDF_TARGET_ESP32C3) || defined(CONFIG_IDF_TARGET_ESP32C6)
-        /* ESP32-C3 RISC-V TODO */
-    #else
-        DPORT_REG_WRITE(SHA_TEXT_BASE + (i*sizeof(word32)), *(data + i));
-    #endif
-        /* memw confirmed auto inserted by compiler here */
-    }
-#endif
-
-    /* notify HW to start process
-     * see ctx->sha_type
-     * reg data does not change until we are ready to read */
-    ret = esp_sha_start_process(ctx);
-
-    ESP_LOGV(TAG, "  leave esp_process_block");
-    return ret;
-} /* wc_esp_process_block */
-
-/*
-** retrieve SHA digest from memory
-*/
-int wc_esp_digest_state(WC_ESP32SHA* ctx, byte* hash)
-{
-    word32 digestSz;
-
-#if CONFIG_IDF_TARGET_ESP32S3
-    uint64_t* pHash64Buffer;
-    uint32_t* pHashDestination;
-    size_t szHashWords;
-    size_t szHash64Words;
-#endif
-
-    ESP_LOGV(TAG, "enter esp_digest_state");
-
-    if (ctx == NULL) {
-        return BAD_FUNC_ARG;
-    }
-
-    /* sanity check */
-    digestSz = wc_esp_sha_digest_size(ctx->sha_type);
-    if (digestSz == 0) {
-        ctx->mode = ESP32_SHA_FAIL_NEED_UNROLL;
-        ESP_LOGE(TAG, "unexpected error. sha_type is invalid.");
-        return -1;
-    }
-#if CONFIG_IDF_TARGET_ESP32S3
-    if (ctx->isfirstblock == true) {
-        /* no hardware use yet. Nothing to do yet */
-        return 0;
-    }
-
-    /* wait until idle */
-    wc_esp_wait_until_idle();
-
-    /* read hash result into buffer & flip endianness */
-    pHashDestination = (uint32_t*)hash;
-    szHashWords = wc_esp_sha_digest_size(ctx->sha_type) / sizeof(uint32_t);
-    esp_dport_access_read_buffer(pHashDestination, SHA_H_BASE, szHashWords);
-
-    if (ctx->sha_type == SHA2_512) {
-        /* Although we don't have to swap endianness on 64-bit words
-        ** at the input, we do for the output. */
-        szHash64Words = szHashWords / 2;
-        pHash64Buffer = (uint64_t*)pHashDestination;
-        while (szHash64Words--) {
-            *pHash64Buffer = __builtin_bswap64(*pHash64Buffer);
-            ++pHash64Buffer;
-        }
-    } /*  (ctx->sha_type == SHA2_512) */
-    else {
-        while (szHashWords--) {
-            *pHashDestination = __builtin_bswap32(*pHashDestination);
-            ++pHashDestination;
-        }
-    } /* not (ctx->sha_type == SHA2_512) */
-
-    /* end if CONFIG_IDF_TARGET_ESP32S3 */
-#else
-    /* not CONFIG_IDF_TARGET_ESP32S3 */
-    /* wait until idle */
-    wc_esp_wait_until_idle();
-
-    /* each sha_type register is at a different location */
-#if defined(CONFIG_IDF_TARGET_ESP32C3) || defined(CONFIG_IDF_TARGET_ESP32C6)
-    /* ESP32-C3 RISC-V TODO */
-#else
-    switch (ctx->sha_type) {
-        case SHA1:
-            DPORT_REG_WRITE(SHA_1_LOAD_REG, 1);
-            break;
-
-        case SHA2_256:
-            DPORT_REG_WRITE(SHA_256_LOAD_REG, 1);
-            break;
-
-    #if defined(WOLFSSL_SHA384)
-        case SHA2_384:
-            DPORT_REG_WRITE(SHA_384_LOAD_REG, 1);
-            break;
-    #endif
-
-    #if defined(WOLFSSL_SHA512)
-        case SHA2_512:
-            DPORT_REG_WRITE(SHA_512_LOAD_REG, 1);
-            break;
-    #endif
-
-        default:
-            ctx->mode = ESP32_SHA_FAIL_NEED_UNROLL;
-            return -1;
-    }
-
-    if (ctx->isfirstblock == true) {
-        /* no hardware use yet. Nothing to do yet */
-        return 0;
-    }
-
-    /* LOAD final digest */
-
-    wc_esp_wait_until_idle();
-
-    /* MEMW instructions before volatile memory references to guarantee
-     * sequential consistency. At least one MEMW should be executed in
-     * between every load or store to a volatile variable
-     */
-    asm volatile("memw");
-
-    /* put result in hash variable.
-     *
-     * ALERT - hardware specific. See esp_hw_support/port/esp32/dport_access.c
-     *
-     * note we read 4-byte word32's here via DPORT_SEQUENCE_REG_READ
-     *
-     *  example:
-     *    DPORT_SEQUENCE_REG_READ(address + i * 4);
-     */
-
-    esp_dport_access_read_buffer(
-#if ESP_IDF_VERSION_MAJOR >= 4
-        (uint32_t*)(hash), /* the result will be found in hash upon exit */
-#else
-        (word32*)(hash), /* the result will be found in hash upon exit */
-#endif
-        SHA_TEXT_BASE,   /* there's a fixed reg addr for all SHA */
-        digestSz / sizeof(word32) /* # 4-byte */
-    );
-#endif
-
-#if defined(WOLFSSL_SHA512) || defined(WOLFSSL_SHA384)
-    if (ctx->sha_type == SHA2_384 || ctx->sha_type == SHA2_512) {
-        word32  i;
-        word32* pwrd1 = (word32*)(hash);
-        /* swap the two 32-bit words within each 64-bit value */
-        for (i = 0; i < WC_SHA512_DIGEST_SIZE / 4; i += 2) {
-            pwrd1[i]     ^= pwrd1[i + 1];
-            pwrd1[i + 1] ^= pwrd1[i];
-            pwrd1[i]     ^= pwrd1[i + 1];
-        }
-    }
-#endif
-#endif /* not CONFIG_IDF_TARGET_ESP32S3 */
-
-    ESP_LOGV(TAG, "leave esp_digest_state");
-    return 0;
-} /* wc_esp_digest_state */
-
-#ifndef NO_SHA
-/*
-** sha1 process
-*/
-int esp_sha_process(struct wc_Sha* sha, const byte* data)
-{
-    int ret = 0;
-
-    ESP_LOGV(TAG, "enter esp_sha_process");
-
-    ret = wc_esp_process_block(&sha->ctx, (const word32*)data,
-                               WC_SHA_BLOCK_SIZE);
-
-    ESP_LOGV(TAG, "leave esp_sha_process");
-
-    return ret;
-} /* esp_sha_process */
-
-/*
-** retrieve sha1 digest
-*/
-int esp_sha_digest_process(struct wc_Sha* sha, byte blockprocess)
-{
-    int ret = 0;
-
-    ESP_LOGV(TAG, "enter esp_sha_digest_process");
-
-    if (blockprocess) {
-        wc_esp_process_block(&sha->ctx, sha->buffer, WC_SHA_BLOCK_SIZE);
-    }
-
-    ret = wc_esp_digest_state(&sha->ctx, (byte*)sha->digest);
-
-    ESP_LOGV(TAG, "leave esp_sha_digest_process");
-
-    return ret;
-} /* esp_sha_digest_process */
-#endif /* NO_SHA */
-
-
-#ifndef NO_SHA256
-/*
-** sha256 process
-**
-** repeatedly call this for [N] blocks of [WC_SHA256_BLOCK_SIZE] bytes of data
-*/
-int esp_sha256_process(struct wc_Sha256* sha, const byte* data)
-{
-    int ret = 0;
-
-    ESP_LOGV(TAG, "  enter esp_sha256_process");
-
-    if (sha->ctx.sha_type == SHA2_256) {
-#if defined(DEBUG_WOLFSSL_VERBOSE)
-        ESP_LOGV(TAG, "    confirmed SHA type call match");
-#endif
-    }
-    else {
-        ret = -1;
-        ESP_LOGE(TAG, "    ERROR SHA type call mismatch");
-    }
-
-    if (ret == 0) {
-        ret = wc_esp_process_block(&sha->ctx, (const word32*)data,
-                                   WC_SHA256_BLOCK_SIZE);
-    }
-
-    ESP_LOGV(TAG, "  leave esp_sha256_process");
-
-    return ret;
-} /* esp_sha256_process */
-
-/*
-** retrieve sha256 digest
-**
-** note that wc_Sha256Final() in sha256.c expects to reverse the byte
-** order itself, even though we could have returned it in the right order.
-*/
-int esp_sha256_digest_process(struct wc_Sha256* sha, byte blockprocess)
-{
-    int ret = 0;
-
-    ESP_LOGV(TAG, "enter esp_sha256_digest_process");
-
-    if (blockprocess) {
-        wc_esp_process_block(&sha->ctx, sha->buffer, WC_SHA256_BLOCK_SIZE);
-    }
-
-    ret = wc_esp_digest_state(&sha->ctx, (byte*)sha->digest);
-
-    ESP_LOGV(TAG, "leave esp_sha256_digest_process");
-    return ret;
-} /* esp_sha256_digest_process */
-
-
-#endif /* NO_SHA256 */
-
-#if defined(WOLFSSL_SHA512) || defined(WOLFSSL_SHA384)
-/*
-** sha512 process. this is used for sha384 too.
-*/
-int esp_sha512_block(struct wc_Sha512* sha, const word32* data, byte isfinal)
-{
-    int ret = 0; /* assume success */
-    ESP_LOGV(TAG, "enter esp_sha512_block");
-    /* start register offset */
-
-    /* note that in SW mode, wolfSSL uses 64-bit words */
-    if (sha->ctx.mode == ESP32_SHA_SW) {
-        ByteReverseWords64(sha->buffer,
-                           sha->buffer,
-                           WC_SHA512_BLOCK_SIZE);
-        if (isfinal) {
-            sha->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2] =
-                                        sha->hiLen;
-            sha->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 1] =
-                                        sha->loLen;
-        }
-    }
-    else {
-        /* when we are in HW mode, Espressif uses 32-bit words */
-        ByteReverseWords((word32*)sha->buffer,
-                         (word32*)sha->buffer,
-                         WC_SHA512_BLOCK_SIZE);
-
-        if (isfinal) {
-            sha->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2] =
-                                        rotlFixed64(sha->hiLen, 32U);
-            sha->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 1] =
-                                        rotlFixed64(sha->loLen, 32U);
-        }
-
-        ret = wc_esp_process_block(&sha->ctx, data, WC_SHA512_BLOCK_SIZE);
-    }
-    ESP_LOGV(TAG, "leave esp_sha512_block");
-    return ret;
-} /* esp_sha512_block */
-
-/*
-** sha512 process. this is used for sha384 too.
-*/
-int esp_sha512_process(struct wc_Sha512* sha)
-{
-    int ret = 0; /* assume success */
-    word32 *data = (word32*)sha->buffer;
-
-    ESP_LOGV(TAG, "enter esp_sha512_process");
-
-    ret = esp_sha512_block(sha, data, 0);
-
-    ESP_LOGV(TAG, "leave esp_sha512_process");
-    return ret;
-} /* esp_sha512_process */
-
-/*
-** retrieve sha512 digest. this is used for sha384, sha512-224, sha512-256 too.
-*/
-int esp_sha512_digest_process(struct wc_Sha512* sha, byte blockproc)
-{
-    int ret = 0;
-    ESP_LOGV(TAG, "enter esp_sha512_digest_process");
-
-    if (blockproc) {
-        word32* data = (word32*)sha->buffer;
-
-        ret = esp_sha512_block(sha, data, 1);
-    }
-    if (sha->ctx.mode == ESP32_SHA_HW) {
-        ret = wc_esp_digest_state(&sha->ctx, (byte*)sha->digest);
-    }
-    else {
-        ESP_LOGW(TAG, "Call esp_sha512_digest_process in non-HW mode?");
-    }
-
-    ESP_LOGV(TAG, "leave esp_sha512_digest_process");
-    return ret;
-} /* esp_sha512_digest_process */
-#endif /* WOLFSSL_SHA512 || WOLFSSL_SHA384 */
-#endif /* WOLFSSL_ESP32WROOM32_CRYPT */
-#endif /* !defined(NO_SHA) ||... */

+ 0 - 335
lib/wolfssl/wolfcrypt/src/port/Espressif/esp32_util.c

@@ -1,335 +0,0 @@
-/* esp32_util.c
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-#include <wolfssl/wolfcrypt/settings.h>
-#include <wolfssl/version.h>
-
-#if defined(WOLFSSL_ESP32WROOM32_CRYPT) && \
-  (!defined(NO_AES)        || !defined(NO_SHA) || !defined(NO_SHA256) ||\
-   defined(WOLFSSL_SHA384) || defined(WOLFSSL_SHA512))
-
-#include <wolfssl/wolfcrypt/wc_port.h>
-#include <wolfssl/wolfcrypt/error-crypt.h>
-#include <wolfssl/wolfcrypt/logging.h>
-
-
-/*
- * initialize our mutex used to lock hardware access
- *
- * returns:
- *   0 upon success,
- *   BAD_MUTEX_E for null mutex
- *   other value from wc_InitMutex()
- *
- */
-int esp_CryptHwMutexInit(wolfSSL_Mutex* mutex) {
-    if (mutex == NULL) {
-        return BAD_MUTEX_E;
-    }
-
-    return wc_InitMutex(mutex);
-}
-
-/*
- * call the ESP-IDF mutex lock; xSemaphoreTake
- *
- */
-int esp_CryptHwMutexLock(wolfSSL_Mutex* mutex, TickType_t xBlockTime) {
-    if (mutex == NULL) {
-        WOLFSSL_ERROR_MSG("esp_CryptHwMutexLock called with null mutex");
-        return BAD_MUTEX_E;
-    }
-
-#ifdef SINGLE_THREADED
-    return wc_LockMutex(mutex); /* xSemaphoreTake with portMAX_DELAY */
-#else
-    return ((xSemaphoreTake( *mutex, xBlockTime ) == pdTRUE) ? 0 : BAD_MUTEX_E);
-#endif
-}
-
-/*
- * call the ESP-IDF mutex UNlock; xSemaphoreGive
- *
- */
-int esp_CryptHwMutexUnLock(wolfSSL_Mutex* mutex) {
-    if (mutex == NULL) {
-        WOLFSSL_ERROR_MSG("esp_CryptHwMutexLock called with null mutex");
-        return BAD_MUTEX_E;
-    }
-
-#ifdef SINGLE_THREADED
-    return wc_UnLockMutex(mutex);
-#else
-    xSemaphoreGive(*mutex);
-    return 0;
-#endif
-}
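-
-/*
-** Illustrative init/lock/unlock sketch (example only): this mirrors how the
-** SHA port guards hardware access. The static mutex and function name below
-** are hypothetical.
-*/
-#if 0
-static wolfSSL_Mutex example_mutex = NULL;
-
-static int example_locked_hw_access(void)
-{
-    int ret = 0;
-    if (example_mutex == NULL) {
-        ret = esp_CryptHwMutexInit(&example_mutex); /* created, not locked */
-    }
-    if (ret == 0 &&
-        esp_CryptHwMutexLock(&example_mutex, (TickType_t)0) == 0) {
-        /* ... exclusive hardware access here ... */
-        esp_CryptHwMutexUnLock(&example_mutex);
-    }
-    return ret;
-}
-#endif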
-
-/*
-** Version / Platform info.
-**
-** This could evolve into a wolfSSL-wide feature. For now, here only. See:
-** https://github.com/wolfSSL/wolfssl/pull/6149
-*/
-#if defined(WOLFSSL_ESPIDF)
-    #include <esp_log.h>
-    #include "sdkconfig.h"
-    const char* TAG = "Version Info";
-    #define WOLFSSL_VERSION_PRINTF(...) ESP_LOGI(TAG, __VA_ARGS__)
-#else
-    #include <stdio.h>
-    #define WOLFSSL_VERSION_PRINTF(...) { printf(__VA_ARGS__); printf("\n"); }
-#endif
-
-/*
-*******************************************************************************
-** Specific Platforms
-*******************************************************************************
-*/
-
-/*
-** Specific platforms: Espressif
-*/
-#if defined(WOLFSSL_ESPIDF)
-static int ShowExtendedSystemInfo_platform_espressif(void)
-{
-#if defined(CONFIG_ESP32_DEFAULT_CPU_FREQ_MHZ)
-    WOLFSSL_VERSION_PRINTF("CONFIG_ESP32_DEFAULT_CPU_FREQ_MHZ: %u MHz",
-                           CONFIG_ESP32_DEFAULT_CPU_FREQ_MHZ);
-#endif
-
-#if CONFIG_IDF_TARGET_ESP32
-
-    WOLFSSL_VERSION_PRINTF("Xthal_have_ccount: %u",
-                           Xthal_have_ccount);
-
-    /* this is the legacy stack size */
-#if defined(CONFIG_MAIN_TASK_STACK_SIZE)
-    WOLFSSL_VERSION_PRINTF("CONFIG_MAIN_TASK_STACK_SIZE: %d",
-                           CONFIG_MAIN_TASK_STACK_SIZE);
-#endif
-
-    /* this is the modern stack size */
-#if defined(CONFIG_ESP_MAIN_TASK_STACK_SIZE)
-    WOLFSSL_VERSION_PRINTF("CONFIG_ESP_MAIN_TASK_STACK_SIZE: %d",
-                           CONFIG_ESP_MAIN_TASK_STACK_SIZE);
-#endif
-
-#if defined(CONFIG_TIMER_TASK_STACK_SIZE)
-    WOLFSSL_VERSION_PRINTF("CONFIG_TIMER_TASK_STACK_SIZE: %d",
-                           CONFIG_TIMER_TASK_STACK_SIZE);
-#endif
-
-#if defined(CONFIG_TIMER_TASK_STACK_DEPTH)
-    WOLFSSL_VERSION_PRINTF("CONFIG_TIMER_TASK_STACK_DEPTH: %d",
-                           CONFIG_TIMER_TASK_STACK_DEPTH);
-#endif
-
-#if defined(SINGLE_THREADED)
-    /* see also HAVE_STACK_SIZE_VERBOSE */
-    char thisHWM = 0;
-    WOLFSSL_VERSION_PRINTF("Stack HWM: %x", (size_t) &thisHWM);
-#else
-    WOLFSSL_VERSION_PRINTF("Stack HWM: %d",
-                           uxTaskGetStackHighWaterMark(NULL));
-#endif
-
-#elif CONFIG_IDF_TARGET_ESP32S2
-    WOLFSSL_VERSION_PRINTF("Xthal_have_ccount = %u",
-                           Xthal_have_ccount);
-#elif CONFIG_IDF_TARGET_ESP32C6
-    /* not supported at this time */
-#elif CONFIG_IDF_TARGET_ESP32C3
-    /* not supported at this time */
-#elif CONFIG_IDF_TARGET_ESP32S3
-    WOLFSSL_VERSION_PRINTF("Xthal_have_ccount = %u",
-                           Xthal_have_ccount);
-#elif CONFIG_IDF_TARGET_ESP32H2
-    /* not supported at this time */
-#elif CONFIG_IDF_TARGET_ESP32C2
-    /* not supported at this time */
-#else
-    /* not supported at this time */
-#endif
-
-    /* check to see if we are using hardware encryption */
-#if defined(NO_ESP32WROOM32_CRYPT)
-    WOLFSSL_VERSION_PRINTF("NO_ESP32WROOM32_CRYPT defined! "
-                           "HW acceleration DISABLED.");
-#else
-    /* first show what platform hardware acceleration is enabled
-    ** (some new platforms may not be supported yet) */
-#if defined(CONFIG_IDF_TARGET_ESP32)
-    WOLFSSL_VERSION_PRINTF("ESP32WROOM32_CRYPT is enabled for ESP32.");
-#elif defined(CONFIG_IDF_TARGET_ESP32S2)
-    WOLFSSL_VERSION_PRINTF("ESP32WROOM32_CRYPT is enabled for ESP32-S2.");
-#elif defined(CONFIG_IDF_TARGET_ESP32S3)
-    WOLFSSL_VERSION_PRINTF("ESP32WROOM32_CRYPT is enabled for ESP32-S3.");
-#else
-#error "ESP32WROOM32_CRYPT not yet supported on this IDF TARGET"
-#endif
-
-    /* Even though enabled, some specifics may be disabled */
-#if defined(NO_WOLFSSL_ESP32WROOM32_CRYPT_HASH)
-    WOLFSSL_VERSION_PRINTF("NO_WOLFSSL_ESP32WROOM32_CRYPT_HASH is defined! "
-                           "(disabled HW SHA).");
-#endif
-
-#if defined(NO_WOLFSSL_ESP32WROOM32_CRYPT_AES)
-    WOLFSSL_VERSION_PRINTF("NO_WOLFSSL_ESP32WROOM32_CRYPT_AES is defined! "
-                           "(disabled HW AES).");
-#endif
-
-#if defined(NO_WOLFSSL_ESP32WROOM32_CRYPT_RSA_PRI)
-    WOLFSSL_VERSION_PRINTF("NO_WOLFSSL_ESP32WROOM32_CRYPT_RSA_PRI defined! "
-                           "(disabled HW RSA)");
-#endif
-#endif
-
-    return 0;
-}
-#endif
-
-/*
-*******************************************************************************
-** All Platforms
-*******************************************************************************
-*/
-
-/*
-** All platforms: git details
-*/
-static int ShowExtendedSystemInfo_git(void)
-{
-#if defined(HAVE_WC_INTROSPECTION) && !defined(ALLOW_BINARY_MISMATCH_INTROSPECTION)
-#pragma message("WARNING: both HAVE_VERSION_EXTENDED_INFO and " \
-                "HAVE_WC_INTROSPECTION are enabled. Some extended " \
-                "information details will not be available.")
-
-    WOLFSSL_VERSION_PRINTF("HAVE_WC_INTROSPECTION enabled. "
-                           "Some extended system details not available.");
-#else
-    /* Display some interesting git values that may change,
-    ** but are not desired for introspection, which requires object code
-    ** to be maximally bitwise-invariant.
-    */
-#if defined(LIBWOLFSSL_VERSION_GIT_ORIGIN)
-        /* git config --get remote.origin.url */
-    WOLFSSL_VERSION_PRINTF("LIBWOLFSSL_VERSION_GIT_ORIGIN = %s",
-                           LIBWOLFSSL_VERSION_GIT_ORIGIN);
-#endif
-
-#if defined(LIBWOLFSSL_VERSION_GIT_BRANCH)
-    /* git rev-parse --abbrev-ref HEAD */
-    WOLFSSL_VERSION_PRINTF("LIBWOLFSSL_VERSION_GIT_BRANCH = %s",
-                           LIBWOLFSSL_VERSION_GIT_BRANCH);
-#endif
-
-#if defined(LIBWOLFSSL_VERSION_GIT_HASH)
-    WOLFSSL_VERSION_PRINTF("LIBWOLFSSL_VERSION_GIT_HASH = %s",
-                           LIBWOLFSSL_VERSION_GIT_HASH);
-#endif
-
-#if defined(LIBWOLFSSL_VERSION_GIT_SHORT_HASH)
-    WOLFSSL_VERSION_PRINTF("LIBWOLFSSL_VERSION_GIT_SHORT_HASH = %s",
-                           LIBWOLFSSL_VERSION_GIT_SHORT_HASH);
-#endif
-
-#if defined(LIBWOLFSSL_VERSION_GIT_HASH_DATE)
-    WOLFSSL_VERSION_PRINTF("LIBWOLFSSL_VERSION_GIT_HASH_DATE = %s",
-                           LIBWOLFSSL_VERSION_GIT_HASH_DATE);
-#endif
-
-#endif /* else not HAVE_WC_INTROSPECTION */
-    return 0;
-}
-
-/*
-** All platforms: thread details
-*/
-static int ShowExtendedSystemInfo_thread(void)
-{
-    /* all platforms: stack high water mark check */
-#if defined(SINGLE_THREADED)
-    WOLFSSL_VERSION_PRINTF("SINGLE_THREADED");
-#else
-    WOLFSSL_VERSION_PRINTF("NOT SINGLE_THREADED");
-#endif
-    return 0;
-}
-
-/*
-** All Platforms: platform details
-*/
-static int ShowExtendedSystemInfo_platform(void)
-{
-#if defined(WOLFSSL_ESPIDF)
-#if defined(CONFIG_IDF_TARGET)
-    WOLFSSL_VERSION_PRINTF("CONFIG_IDF_TARGET = %s",
-                           CONFIG_IDF_TARGET);
-    ShowExtendedSystemInfo_platform_espressif();
-#endif
-#endif
-    return 0;
-}
-
-/*
-*******************************************************************************
-** The public ShowExtendedSystemInfo()
-*******************************************************************************
-*/
-
-int ShowExtendedSystemInfo(void)
-{
-    WOLFSSL_VERSION_PRINTF("Extended Version and Platform Information.");
-
-#if defined(LIBWOLFSSL_VERSION_STRING)
-    WOLFSSL_VERSION_PRINTF("LIBWOLFSSL_VERSION_STRING = %s",
-                           LIBWOLFSSL_VERSION_STRING);
-#endif
-
-#if defined(LIBWOLFSSL_VERSION_HEX)
-    WOLFSSL_VERSION_PRINTF("LIBWOLFSSL_VERSION_HEX = %x",
-                           LIBWOLFSSL_VERSION_HEX);
-#endif
-
-#if defined(WOLFSSL_MULTI_INSTALL_WARNING)
-    /* CMake may have detected undesired multiple installs, so give warning. */
-    WOLFSSL_VERSION_PRINTF("");
-    WOLFSSL_VERSION_PRINTF("WARNING: Multiple wolfSSL installs found.");
-    WOLFSSL_VERSION_PRINTF("Check ESP-IDF and local project [components] directory.");
-    WOLFSSL_VERSION_PRINTF("");
-#endif
-
-    ShowExtendedSystemInfo_git(); /* may be limited during active introspection */
-    ShowExtendedSystemInfo_platform();
-    ShowExtendedSystemInfo_thread();
-    return 0;
-}
-
-
-
-int esp_ShowExtendedSystemInfo(void)
-{
-    return ShowExtendedSystemInfo();
-}
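-
-/*
-** Illustrative call site (example only): print the report once at startup.
-** app_main() is the standard ESP-IDF entry point; this placement is a
-** hypothetical usage sketch, not part of this file.
-*/
-#if 0
-void app_main(void)
-{
-    esp_ShowExtendedSystemInfo();
-    /* ... application code ... */
-}
-#endif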
-
-#endif
-

+ 0 - 195
lib/wolfssl/wolfcrypt/src/port/Renesas/README.md

@@ -1,195 +0,0 @@
-# TSIP FIT Module port
-
-Support for the TSIP FIT driver, providing symmetric AES and SHA1/SHA256 hardware
-acceleration and TLS-linked capability, including verification of the Root CA,
-the server certificate, or intermediate certificates.
-
-## Overview
-The Renesas TSIP FIT module can be used with wolfSSL by setting the *WOLFSSL_RENESAS_TSIP* definition.
-
-The port includes the following examples:
-
-* simple tls_client/tls_server
-* crypt test
-* crypt benchmark
-
-The *user_settings.h* file enables some of the hardened settings.
-
-## Requirements
-
-### 1. [Renesas TSIP FIT module](https://www.renesas.com/us/en/products/software-tools/software-os-middleware-driver/security-crypto/trusted-secure-ip-driver.html)
-[FIT module](https://www.renesas.com/us/en/products/software-tools/software-os-middleware-driver/software-package/fit.html)
-Note : The included example program is tested with <u>TSIP FIT version **1.06**</u>.
-
-### 2. [e2studio](https://www.renesas.com/us/en/products/software-tools/tools/ide/e2studio.html)
-
-### 3. Evaluation Board that supports TSIP
-Note : The included example program is tested with [GR-ROSE](http://gadget.renesas.com/en/product/rose.html), which is classified as RX65N.
-
-## Setup and Build wolfSSL library
-1. Uncomment `#define WOLFSSL_RENESAS_TSIP` in `/path/to/wolfssl/wolfssl/wolfcrypt/settings.h`
-2. Uncomment `#define WOLFSSL_RENESAS_RX65N` in `/path/to/wolfssl/wolfssl/wolfcrypt/settings.h`
-3. Open the project file at /path/to/wolfssl/IDE/Renesas/e2studio/Projects/wolfssl/ with e2studio and build it to create the wolfSSL library
-
-Note : The FIT module source files must be generated in advance in order to compile wolfSSL
-when `WOLFSSL_RENESAS_TSIP` and `WOLFSSL_RENESAS_RX65N` are enabled. Please see
-"Setup and Build an example program" below in this readme for creating the FIT module files.
-
-To disable portions of the hardware acceleration you can optionally define:
-
-```c
-/* Disable SHA acceleration */
-#define NO_WOLFSSL_RENESAS_TSIP_CRYPT_HASH
-/* Disable TLS-linked acceleration */
-#define NO_WOLFSSL_RENESAS_TSIP_TLS_SESSION
-```
-
-## Benchmarks
-
-Platform:
-Renesas   : e2Studio v7.4.0
-ToolChain : Renesas CCRX version 3.00.00
-TSIP FIT  : version 1.0.6
-Board     : [GR-ROSE](http://gadget.renesas.com/en/product/rose.html)
-wolfSSL   : 4.1.0
-
-### Software only implementation:
-
-*hash / RNG throughput*
-```
-RNG                200 KB took 1.099 seconds,  182.000 KB/s
-SHA                  1 MB took 1.005 seconds,    1.166 MB/s
-SHA-256            425 KB took 1.038 seconds,  409.520 KB/s
-```
-
-*TLS establishment time*
-```
-TLS_RSA_WITH_AES_128_CBC_SHA          : 0.651 (s)
-TLS_RSA_WITH_AES_128_CBC_SHA256       : 0.651 (s)
-TLS_RSA_WITH_AES_256_CBC_SHA          : 0.642 (s)
-TLS_RSA_WITH_AES_256_CBC_SHA256       : 0.662 (s)
-TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256 : 2.050 (s)
-```
-### Hardware acceleration:
-
-*hash / RNG throughput*
-```
-RNG                  1 MB took 1.011 seconds,    1.038 MB/s
-SHA                 12 MB took 1.001 seconds,   11.515 MB/s
-SHA-256             13 MB took 1.001 seconds,   12.900 MB/s
-```
-
-*TLS establishment time with TLS-linked capability*
-*Perform full TLS-linked capability*
-```
-TLS_RSA_WITH_AES_128_CBC_SHA          : 0.141 (s)
-TLS_RSA_WITH_AES_128_CBC_SHA256       : 0.141 (s)
-TLS_RSA_WITH_AES_256_CBC_SHA          : 0.141 (s)
-TLS_RSA_WITH_AES_256_CBC_SHA256       : 0.144 (s)
-```
-
-*Perform certificate verification by TSIP TLS-linked API*
-```
-TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256 : 1.721 (s)
-```
-
-## Setup and Build an example program
-An example program expects the following FIT modules:
-
-* r_bsp
-* r_cmt_rx
-* r_config
-* r_ether_rx
-* r_sys_time_rx
-* r_t4_driver_rx
-* r_t4_rx
-* r_tsip_rx
-
-The needed source files can be generated by creating a dummy project that includes the Renesas Smart Configurator, following the steps below:
-
-1. Create a dummy project including the Renesas Smart Configurator for your evaluation board type
-2. Open the Smart Configurator and add the FIT modules listed above
-   You will need to increase the *User Stack Size* and *Heap Size* properties of r_bsp.
-   Change the IP ADDRESS and PORT NUMBER in r_t4_rx_config.h
-   `#define T4_CFG_FIXED_IP_ADDRESS_CH0   192,168,1,33`
-   `#define T4_CFG_TCP_REPID1_PORT_NUMBER 11111`
-   Note: other configuration may need to be modified based on the evaluation board.
-
-   When using GR-ROSE, you can choose "GR-ROSE" from the "board" drop-down list on the "board" tab and then follow the settings below:
-
-   Go to component tab and open r_ether_rx properties:
-   Ethernet interface : RMII
-   The register bus of PHY0 for ETHER0/1: Use ETHER0
-   Resource, ETHERC: Check ETHERC0_RMII
-
-   Go to component tab and open r_t4_rx properties:
-   Enable/Disable DHCP function : 0
-   IP address for ch0, when DHCP disable : 192,168,1,33
-   TCP REPID1 port number : 11111
-
-   Go to the pins tab, select the Ethernet controller,
-   and check the pins to be used
-
-3. Generate the source code
-Now the FIT modules can be copied into an example project.
-4. Make an "smc_gen" folder under /path/to/wolfssl/IDE/Renesas/e2studio/Projects/test/src/
-5. Copy the FIT modules into the folder created at step 4.
-6. Open the example project file at /path/to/wolfssl/IDE/Renesas/e2studio/Projects/test/ with e2studio
-7. Enable a macro definition in /path/to/wolfssl/IDE/Renesas/e2studio/Projects/test/src/wolfssl_demo.h for the application type
-
-```c
-#define CRYPT_TEST     /* enable crypt test */
-#define BENCHMARK      /* enable benchmark application */
-#define TLS_CLIENT     /* enable simple tls client application */
-#define TLS_SERVER     /* enable simple tls server application */
-#define USE_TSIP_TLS   /* to inform user key and flash keying, when using TSIP */
-```
-
-   Note: CRYPT_TEST and BENCHMARK can be enabled at the same time. TLS_CLIENT and TLS_SERVER cannot be enabled together with the other definitions.
-8. Set up the debug configuration based on your debug hardware
-
-## Run client/server program on the device
-When testing the embedded client or server on the device, it is recommended to test against one
-of the standard wolfSSL example applications running on a desktop machine.
-
-
-For the embedded client, an example server command for running on a desktop machine, IP address 192.168.1.45, is as follows:
-`./example/server/server -b -d -i`
-
-
-For the embedded server, an example client command for running on a desktop machine is as follows:
-`./example/client/client -h 192.168.1.33 -p 11111`
-
-## Modify an example program
-To use your own TSIP keys with the TSIP TLS-linked API, you need your own flash keyring, a PSS-signed signature, and an RSA key.
-
-### Create flash keyring and use it in an example program
-1. Please follow the instructions in the TSIP manual, chapter 7, Key Data Operations.
-2. Copy and paste the s_flash[] data into the s_flash[] array in example-program/key_data.c
-`const uint32_t s_flash[] =`
-
-### Create an RSA key pair for Root CA verification and use it in an example program
-To use the TSIP TLS-linked APIs, an RSA key pair and a Root CA certificate bundle signature made with RSA 2048 PSS and SHA256 are needed.
-The shell and Perl scripts in /path/to/wolfssl/IDE/Renesas/e2studio/Projects/tools/ can be used for this purpose.
-
-* `generate_rsa_keypair.sh`: generates an RSA 2048-bit key pair; shows the modulus and public exponent when the "-s" option is specified
-* `rsa_pss_sign.sh`: signs the file with the specified private key
-* `genhexbuf.pl`: generates a C header file containing a byte array built from the file specified in the script
-
-The modulus and public exponent shown by `generate_rsa_keypair.sh` can be used as input data to
-the Renesas Secure Flash Programmer to generate encrypted RSA keys for TSIP TLS-linked API use.
-Please follow the instructions on how to generate RSA keys in the TSIP manual.
-
-The byte array of the signed signature generated by `genhexbuf.pl` can replace the signature data in key_data.c of an example program.
-
-The encrypted RSA key and the generated byte array of the signed signature need to be provided to the wolfSSL library before loading the CA certificate.
-Please see the `SetTsipTlskey()` function in an example program for how to provide them.
-
-### Coding
-
-In your application you must include <wolfssl/wolfcrypt/settings.h> before any other wolfSSL headers.
-If building the sources directly we recommend defining `WOLFSSL_USER_SETTINGS` and adding your own `user_settings.h` file.
-You can find a good reference for this in `/path/to/Renesas/e2studio/Projects/common/user_settings.h`.
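-
-A minimal sketch of such a `user_settings.h` is shown below (illustrative only; take the actual values from the reference file above):
-
-```c
-/* user_settings.h - minimal illustrative sketch */
-#define WOLFSSL_RENESAS_TSIP
-#define WOLFSSL_RENESAS_RX65N
-/* optionally disable parts of the hardware acceleration: */
-/* #define NO_WOLFSSL_RENESAS_TSIP_CRYPT_HASH */
-```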
-
-## Support
-For questions, please email [support@wolfssl.com]

+ 0 - 1300
lib/wolfssl/wolfcrypt/src/port/Renesas/renesas_common.c

@@ -1,1300 +0,0 @@
-/* renesas_common.c
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-#include <wolfssl/wolfcrypt/settings.h>
-
-#if defined(WOLFSSL_RENESAS_SCEPROTECT) \
-    || defined(WOLFSSL_RENESAS_SCEPROTECT_CRYPTONLY) \
-    || defined(WOLFSSL_RENESAS_TSIP_TLS)
-
-#if defined(WOLFSSL_RENESAS_SCEPROTECT) || \
-    defined(WOLFSSL_RENESAS_SCEPROTECT_CRYPTONLY)
-  #include <wolfssl/wolfcrypt/port/Renesas/renesas-sce-crypt.h>
-  #define cmn_hw_lock    wc_sce_hw_lock
-  #define cmn_hw_unlock  wc_sce_hw_unlock
-#elif defined(WOLFSSL_RENESAS_TSIP_TLS)
-  #include <wolfssl/wolfcrypt/port/Renesas/renesas-tsip-crypt.h>
-  #define cmn_hw_lock    tsip_hw_lock
-  #define cmn_hw_unlock  tsip_hw_unlock
-#endif
-
-#include <wolfssl/wolfcrypt/wc_port.h>
-#include <wolfssl/wolfcrypt/types.h>
-#include <wolfssl/wolfcrypt/asn.h>
-#include <wolfssl/internal.h>
-#include <wolfssl/error-ssl.h>
-#include <wolfssl/wolfcrypt/error-crypt.h>
-#include <wolfssl/wolfcrypt/logging.h>
-
-uint32_t   g_CAscm_Idx = (uint32_t)-1; /* index of CM table    */
-static int gdevId = 7890;           /* initial dev Id for Crypt Callback */
-
-#ifdef WOLF_CRYPTO_CB
-
-#include <wolfssl/wolfcrypt/cryptocb.h>
-
-WOLFSSL_LOCAL int Renesas_cmn_Cleanup(WOLFSSL* ssl)
-{
-    int ret = 0;
-    WOLFSSL_ENTER("Renesas_cmn_Cleanup");
-
-#if defined(WOLFSSL_RENESAS_TSIP_TLS)
-    ret = tsip_TlsCleanup(ssl);
-#endif
-    
-    WOLFSSL_LEAVE("Renesas_cmn_Cleanup", ret);
-    return ret;
-}
-WOLFSSL_LOCAL int Renesas_cmn_RsaSignCb(WOLFSSL* ssl,
-                                const unsigned char* in, unsigned int inSz,
-                                unsigned char* out, word32* outSz,
-                                const unsigned char* keyDer, unsigned int keySz,
-                                void* ctx)
-{
-    int ret = CRYPTOCB_UNAVAILABLE;
-    WOLFSSL_ENTER("Renesas_cmn_RsaSignCb");
-
-    /* This is just a stub function that provides no logic */
-
-    WOLFSSL_LEAVE("Renesas_cmn_RsaSignCb", ret);
-    return ret;
-}
-/* This function is a callback passed to wolfSSL_CTX_SetRsaSignCheckCb.
- * It tries to verify the signature passed to it by decrypting with a public
- * key.  
- * returns 0 on success, CRYPTOCB_UNAVAILABLE when public key is not set.
- */
-WOLFSSL_LOCAL int Renesas_cmn_RsaSignCheckCb(WOLFSSL* ssl,
-                                unsigned char* sig, unsigned int sigSz,
-                                unsigned char** out,
-                                const unsigned char* keyDer, unsigned int keySz,
-                                void* ctx)
-{
-    int ret = CRYPTOCB_UNAVAILABLE;
-    WOLFSSL_ENTER("Renesas_cmn_RsaSignCheckCb");
-
-    #if defined(WOLFSSL_RENESAS_TSIP)
-    ret = tsip_VerifyRsaPkcsCb(ssl, sig, sigSz, out, keyDer, keySz, ctx);
-    #endif /* WOLFSSL_RENESAS_TSIP */
-
-    WOLFSSL_LEAVE("Renesas_cmn_RsaSignCheckCb", ret);
-    return ret;
-}
-
-WOLFSSL_LOCAL int Renesas_cmn_EccSignCb(WOLFSSL* ssl,
-                                const unsigned char* in, unsigned int inSz,
-                                unsigned char* out, word32* outSz,
-                                const unsigned char* keyDer, unsigned int keySz,
-                                void* ctx)
-{
-    int ret = CRYPTOCB_UNAVAILABLE;
-    WOLFSSL_ENTER("Renesas_cmn_EccSignCb");
-
-    /* This is just a stub function that provides no logic */
-    
-    WOLFSSL_LEAVE("Renesas_cmn_EccSignCb", ret);
-    return ret;
-}
-/* Renesas Security Library Common Callback
- * For Crypto Callbacks
- *
- * devIdArg device Id
- * info     pointer to wc_CryptInfo
- * ctx      Crypto Callback context
- * return  0 on success, otherwise MEMORY_E or BAD_FUNC_ARG on failure
- */
-static int Renesas_cmn_CryptoDevCb(int devIdArg, wc_CryptoInfo* info, void* ctx)
-{
-    int ret = NOT_COMPILED_IN; /* return this to bypass HW and use SW */
-
-    WOLFSSL_ENTER("Renesas_cmn_CryptoDevCb");
-
-#if defined(WOLFSSL_RENESAS_TSIP_TLS)
-    TsipUserCtx*      cbInfo = (TsipUserCtx*)ctx;
-#elif defined(WOLFSSL_RENESAS_SCEPROTECT) || \
-        defined(WOLFSSL_RENESAS_SCEPROTECT_CRYPTONLY)
-    User_SCEPKCbInfo* cbInfo = (User_SCEPKCbInfo*)ctx;
-#endif
-
-    if (info == NULL || ctx == NULL)
-        return BAD_FUNC_ARG;
-
-#ifdef DEBUG_WOLFSSL
-    printf("CryptoDevCb: Algo Type %d session key set: %d\n",
-                                    info->algo_type, cbInfo->session_key_set);
-#endif
-
-#if defined(WOLFSSL_RENESAS_TSIP)
-    ret = CRYPTOCB_UNAVAILABLE;
-
-    if (info->algo_type == WC_ALGO_TYPE_CIPHER) {
-
-    #if !defined(NO_AES) || !defined(NO_DES3)
-    #ifdef HAVE_AESGCM
-        if (info->cipher.type == WC_CIPHER_AES_GCM &&
-            cbInfo->session_key_set == 1) {
-
-            if (info->cipher.enc) {
-                ret = wc_tsip_AesGcmEncrypt(
-                        info->cipher.aesgcm_enc.aes,
-                        (byte*)info->cipher.aesgcm_enc.out,
-                        (byte*)info->cipher.aesgcm_enc.in,
-                        info->cipher.aesgcm_enc.sz,
-                        (byte*)info->cipher.aesgcm_enc.iv,
-                        info->cipher.aesgcm_enc.ivSz,
-                        (byte*)info->cipher.aesgcm_enc.authTag,
-                        info->cipher.aesgcm_enc.authTagSz,
-                        (byte*)info->cipher.aesgcm_enc.authIn,
-                        info->cipher.aesgcm_enc.authInSz,
-                        (void*)ctx);
-
-            }
-            else {
-                ret = wc_tsip_AesGcmDecrypt(
-                        info->cipher.aesgcm_dec.aes,
-                        (byte*)info->cipher.aesgcm_dec.out,
-                        (byte*)info->cipher.aesgcm_dec.in,
-                        info->cipher.aesgcm_dec.sz,
-                        (byte*)info->cipher.aesgcm_dec.iv,
-                        info->cipher.aesgcm_dec.ivSz,
-                        (byte*)info->cipher.aesgcm_dec.authTag,
-                        info->cipher.aesgcm_dec.authTagSz,
-                        (byte*)info->cipher.aesgcm_dec.authIn,
-                        info->cipher.aesgcm_dec.authInSz,
-                        (void*)ctx);
-            }
-        }
-    #endif /* HAVE_AESGCM */
-    #ifdef HAVE_AES_CBC
-        if (info->cipher.type == WC_CIPHER_AES_CBC &&
-            cbInfo->session_key_set == 1) {
-
-            if (info->cipher.enc) {
-                ret = wc_tsip_AesCbcEncrypt(
-                    info->cipher.aescbc.aes,
-                    (byte*)info->cipher.aescbc.out,
-                    (byte*)info->cipher.aescbc.in,
-                    info->cipher.aescbc.sz);
-
-            }
-            else {
-                ret = wc_tsip_AesCbcDecrypt(
-                    info->cipher.aescbc.aes,
-                    (byte*)info->cipher.aescbc.out,
-                    (byte*)info->cipher.aescbc.in,
-                    info->cipher.aescbc.sz);
-            }
-        }
-    #endif /* HAVE_AES_CBC */
-    #endif /* !NO_AES || !NO_DES3 */
-    }
-    /* Called for signing.
-     * Only the RSA PKCS#1 v1.5 padding scheme can be handled here.
-     */
-    if (info->algo_type == WC_ALGO_TYPE_PK) {
-        #if !defined(NO_RSA)
-        if (info->pk.type == WC_PK_TYPE_RSA) {
-            if (info->pk.rsa.type == RSA_PRIVATE_ENCRYPT) {
-                ret = tsip_SignRsaPkcs(info, ctx);
-            }
-        }
-        #endif /* NO_RSA */
-        #if defined(HAVE_ECC)
-        else if (info->pk.type == WC_PK_TYPE_ECDSA_SIGN) {
-            ret = tsip_SignEcdsa(info, ctx);
-        }
-        #endif /* HAVE_ECC */
-    }
-#elif defined(WOLFSSL_RENESAS_SCEPROTECT) ||\
-        defined(WOLFSSL_RENESAS_SCEPROTECT_CRYPTONLY)
-
-    if (info->algo_type == WC_ALGO_TYPE_CIPHER) {
-
-    #if !defined(NO_AES) || !defined(NO_DES3)
-    #ifdef HAVE_AESGCM
-        if (info->cipher.type == WC_CIPHER_AES_GCM) {
-
-            if (info->cipher.enc &&
-                (cbInfo->keyflgs_tls.bits.session_key_set == 1 ||
-                 (cbInfo->keyflgs_crypt.bits.aes256_installedkey_set == 1 &&
-                  info->cipher.aesgcm_enc.aes->keylen == 32) ||
-                 (cbInfo->keyflgs_crypt.bits.aes128_installedkey_set == 1 &&
-                  info->cipher.aesgcm_enc.aes->keylen == 16))) {
-
-                if (cbInfo->keyflgs_crypt.bits.aes256_installedkey_set == 1 &&
-                  info->cipher.aesgcm_enc.aes->keylen == 32) {
-
-                    XMEMCPY(&info->cipher.aesgcm_enc.aes->ctx.sce_wrapped_key,
-                        &cbInfo->sce_wrapped_key_aes256,
-                        sizeof(sce_aes_wrapped_key_t));
-                    info->cipher.aesgcm_enc.aes->ctx.keySize = 32;
-
-                }
-                else if (
-                    cbInfo->keyflgs_crypt.bits.aes128_installedkey_set == 1 &&
-                    info->cipher.aesgcm_enc.aes->keylen == 16) {
-
-                    XMEMCPY(&info->cipher.aesgcm_enc.aes->ctx.sce_wrapped_key,
-                            &cbInfo->sce_wrapped_key_aes128,
-                            sizeof(sce_aes_wrapped_key_t));
-                    info->cipher.aesgcm_enc.aes->ctx.keySize = 16;
-                }
-
-                ret = wc_sce_AesGcmEncrypt(
-                        info->cipher.aesgcm_enc.aes,
-                        (byte*)info->cipher.aesgcm_enc.out,
-                        (byte*)info->cipher.aesgcm_enc.in,
-                        info->cipher.aesgcm_enc.sz,
-                        (byte*)info->cipher.aesgcm_enc.iv,
-                        info->cipher.aesgcm_enc.ivSz,
-                        (byte*)info->cipher.aesgcm_enc.authTag,
-                        info->cipher.aesgcm_enc.authTagSz,
-                        (byte*)info->cipher.aesgcm_enc.authIn,
-                        info->cipher.aesgcm_enc.authInSz,
-                        (void*)ctx);
-
-            }
-            else if (cbInfo->keyflgs_tls.bits.session_key_set == 1 ||
-                    (cbInfo->keyflgs_crypt.bits.aes256_installedkey_set == 1 &&
-                       info->cipher.aesgcm_dec.aes->keylen == 32) ||
-                    (cbInfo->keyflgs_crypt.bits.aes128_installedkey_set == 1 &&
-                       info->cipher.aesgcm_dec.aes->keylen == 16)) {
-
-                if (cbInfo->keyflgs_crypt.bits.aes256_installedkey_set == 1 &&
-                  info->cipher.aesgcm_dec.aes->keylen == 32) {
-
-                    XMEMCPY(&info->cipher.aesgcm_dec.aes->ctx.sce_wrapped_key,
-                            &cbInfo->sce_wrapped_key_aes256,
-                            sizeof(sce_aes_wrapped_key_t));
-                    info->cipher.aesgcm_dec.aes->ctx.keySize = 32;
-
-                }
-                else if (
-                    cbInfo->keyflgs_crypt.bits.aes128_installedkey_set == 1 &&
-                    info->cipher.aesgcm_dec.aes->keylen == 16) {
-
-                    XMEMCPY(&info->cipher.aesgcm_dec.aes->ctx.sce_wrapped_key,
-                            &cbInfo->sce_wrapped_key_aes128,
-                            sizeof(sce_aes_wrapped_key_t));
-                    info->cipher.aesgcm_dec.aes->ctx.keySize = 16;
-                }
-
-                ret = wc_sce_AesGcmDecrypt(
-                        info->cipher.aesgcm_dec.aes,
-                        (byte*)info->cipher.aesgcm_dec.out,
-                        (byte*)info->cipher.aesgcm_dec.in,
-                        info->cipher.aesgcm_dec.sz,
-                        (byte*)info->cipher.aesgcm_dec.iv,
-                        info->cipher.aesgcm_dec.ivSz,
-                        (byte*)info->cipher.aesgcm_dec.authTag,
-                        info->cipher.aesgcm_dec.authTagSz,
-                        (byte*)info->cipher.aesgcm_dec.authIn,
-                        info->cipher.aesgcm_dec.authInSz,
-                        (void*)ctx);
-            }
-        }
-    #endif /* HAVE_AESGCM */
-    #ifdef HAVE_AES_CBC
-        if ((info->cipher.type == WC_CIPHER_AES_CBC) &&
-            (cbInfo->keyflgs_tls.bits.session_key_set == 1 ||
-            (cbInfo->keyflgs_crypt.bits.aes256_installedkey_set == 1 &&
-                info->cipher.aescbc.aes->keylen == 32) ||
-            (cbInfo->keyflgs_crypt.bits.aes128_installedkey_set == 1 &&
-                info->cipher.aescbc.aes->keylen == 16))) {
-
-                if (info->cipher.enc) {
-                    if (
-                    cbInfo->keyflgs_crypt.bits.aes256_installedkey_set == 1 &&
-                    info->cipher.aescbc.aes->keylen == 32) {
-                        XMEMCPY(&info->cipher.aescbc.aes->ctx.sce_wrapped_key,
-                                &cbInfo->sce_wrapped_key_aes256,
-                                sizeof(sce_aes_wrapped_key_t));
-                        info->cipher.aescbc.aes->ctx.keySize = 32;
-
-                    }
-                    else if (
-                        cbInfo->keyflgs_crypt.bits.aes128_installedkey_set == 1 
-                        && info->cipher.aescbc.aes->keylen == 16) {
-                        XMEMCPY(&info->cipher.aescbc.aes->ctx.sce_wrapped_key,
-                                &cbInfo->sce_wrapped_key_aes128,
-                                sizeof(sce_aes_wrapped_key_t));
-                        info->cipher.aescbc.aes->ctx.keySize = 16;
-                    }
-
-                    ret = wc_sce_AesCbcEncrypt(
-                        info->cipher.aescbc.aes,
-                        (byte*)info->cipher.aescbc.out,
-                        (byte*)info->cipher.aescbc.in,
-                        info->cipher.aescbc.sz);
-                }
-                else {
-                    if (
-                    cbInfo->keyflgs_crypt.bits.aes256_installedkey_set == 1 &&
-                    info->cipher.aescbc.aes->keylen == 32) {
-                        XMEMCPY(&info->cipher.aescbc.aes->ctx.sce_wrapped_key,
-                                &cbInfo->sce_wrapped_key_aes256,
-                                sizeof(sce_aes_wrapped_key_t));
-                        info->cipher.aescbc.aes->ctx.keySize = 32;
-                    }  else if (
-                        cbInfo->keyflgs_crypt.bits.aes128_installedkey_set == 1
-                        && info->cipher.aescbc.aes->keylen == 16) {
-                        XMEMCPY(&info->cipher.aescbc.aes->ctx.sce_wrapped_key,
-                                &cbInfo->sce_wrapped_key_aes128,
-                                sizeof(sce_aes_wrapped_key_t));
-                        info->cipher.aescbc.aes->ctx.keySize = 16;
-                    }
-
-                    ret = wc_sce_AesCbcDecrypt(
-                        info->cipher.aescbc.aes,
-                        (byte*)info->cipher.aescbc.out,
-                        (byte*)info->cipher.aescbc.in,
-                        info->cipher.aescbc.sz);
-                }
-        }
-    #endif /* HAVE_AES_CBC */
-    #endif /* !NO_AES || !NO_DES3 */
-    }
-    #if !defined(NO_RSA) && defined(WOLFSSL_RENESAS_SCEPROTECT_CRYPTONLY)
-    else if (info->algo_type == WC_ALGO_TYPE_PK) {
-        
-       #if !defined(NO_RSA)
-       #if defined(WOLFSSL_KEY_GEN)
-        if (info->pk.type == WC_PK_TYPE_RSA_KEYGEN &&
-            (info->pk.rsakg.size == 1024 ||
-             info->pk.rsakg.size == 2048)) {
-            ret = wc_sce_MakeRsaKey(info->pk.rsakg.size, (void*)ctx);
-        }
-       #endif
-        if (info->pk.type == WC_PK_TYPE_RSA) {
-            /* To perform RSA on the SCE, wrapped keys must be installed
-             * in advance. The SCE supports only 1024- or 2048-bit keys;
-             * otherwise the operation falls through to software.
-             */
-            if (cbInfo->keyflgs_crypt.bits.rsapri2048_installedkey_set == 1 ||
-                cbInfo->keyflgs_crypt.bits.rsapub2048_installedkey_set == 1 ||
-                cbInfo->keyflgs_crypt.bits.rsapri1024_installedkey_set == 1 ||
-                cbInfo->keyflgs_crypt.bits.rsapub1024_installedkey_set == 1) {
-
-                if (info->pk.rsa.type == RSA_PRIVATE_DECRYPT ||
-                    info->pk.rsa.type == RSA_PUBLIC_ENCRYPT) {
-                    ret = wc_sce_RsaFunction(info->pk.rsa.in,
-                                        info->pk.rsa.inLen,
-                                        info->pk.rsa.out,
-                                        info->pk.rsa.outLen,
-                                        info->pk.rsa.type,
-                                        info->pk.rsa.key,
-                                        info->pk.rsa.rng,
-                                        (void*)ctx);
-                }
-                else if (info->pk.rsa.type == RSA_PRIVATE_ENCRYPT /* sign */){
-                   ret = wc_sce_RsaSign(info->pk.rsa.in,
-                                        info->pk.rsa.inLen,
-                                        info->pk.rsa.out,
-                                        info->pk.rsa.outLen,
-                                        info->pk.rsa.key,
-                                        (void*)ctx);
-                }
-                else if (info->pk.rsa.type == RSA_PUBLIC_DECRYPT /* verify */) {
-                    ret = wc_sce_RsaVerify(info->pk.rsa.in,
-                                        info->pk.rsa.inLen,
-                                        info->pk.rsa.out,
-                                        info->pk.rsa.outLen,
-                                        info->pk.rsa.key,
-                                        (void*)ctx);
-                }
-            }
-            else {
-                WOLFSSL_MSG(
-                    "SCE handles only 1024- or 2048-bit RSA keys. "
-                    "Either the key size is unsupported or no wrapped key "
-                    "is installed. The RSA operation falls through to SW.");
-            }
-        }
-       #endif /* !NO_RSA */
-    }
-    #endif /* !NO_RSA && WOLFSSL_RENESAS_SCEPROTECT_CRYPTONLY */
-#endif /* TSIP or SCE */
-
-    (void)devIdArg;
-    (void)ctx;
-    WOLFSSL_LEAVE("Renesas_cmn_CryptoDevCb", ret);
-    return ret;
-}
-
-/* Renesas Security Library Common Entry Point
- * Checks whether the security engine is usable for the session.
- *
- * ssl     : a pointer to the WOLFSSL object
- * session_key_generated : whether the session key has been generated
- * return  1 if usable, 0 if not
- */
-int Renesas_cmn_usable(const WOLFSSL* ssl, byte session_key_generated)
-{
-    int ret = 0;
-
-    #if defined(WOLFSSL_RENESAS_TSIP_TLS)
-        ret = tsip_usable(ssl, session_key_generated);
-    #elif defined(WOLFSSL_RENESAS_SCEPROTECT) ||\
-            defined(WOLFSSL_RENESAS_SCEPROTECT_CRYPTONLY)
-        ret = wc_sce_usable(ssl, session_key_generated);
-    #endif
-
-    return ret;
-}
-
-/* Renesas Security Library Common Method
- * Crypt Callback initialization
- *
- * ssl     : a pointer to the WOLFSSL object
- * ctx     : callback context
- * return  a valid device Id on success, otherwise INVALID_DEVID.
- *         Device Ids start at 7890 and increment by one on each
- *         successful call.
- */
-int wc_CryptoCb_CryptInitRenesasCmn(WOLFSSL* ssl, void* ctx)
-{
-    (void)ssl;
-    (void)ctx;
-
- #if defined(WOLFSSL_RENESAS_TSIP_TLS)
-    TsipUserCtx* cbInfo = (TsipUserCtx*)ctx;
- #elif defined(WOLFSSL_RENESAS_SCEPROTECT) || \
-       defined(WOLFSSL_RENESAS_SCEPROTECT_CRYPTONLY)
-    User_SCEPKCbInfo* cbInfo = (User_SCEPKCbInfo*)ctx;
- #endif
-
-    if (cbInfo == NULL
-   #if !defined(WOLFSSL_RENESAS_SCEPROTECT_CRYPTONLY) && \
-       !defined(HAVE_RENESAS_SYNC)
-        || ssl == NULL) {
-   #else
-     ) {
-   #endif
-        printf("Invalid devId\n");
-        return INVALID_DEVID;
-    }
-    /* need exclusive control because of static variable */
-    if ((cmn_hw_lock()) == 0) {
-        cbInfo->devId = gdevId++;
-        cmn_hw_unlock();
-    }
-    else {
-        WOLFSSL_MSG("Failed to lock tsip hw");
-        return INVALID_DEVID;
-    }
-    
-    if (wc_CryptoCb_RegisterDevice(cbInfo->devId, 
-                            Renesas_cmn_CryptoDevCb, cbInfo) < 0) {
-        /* undo devId number */
-        gdevId--;
-        return INVALID_DEVID;
-    }
-
-   #if !defined(WOLFSSL_RENESAS_SCEPROTECT_CRYPTONLY) && \
-       !defined(HAVE_RENESAS_SYNC)
-    if (ssl)
-        wolfSSL_SetDevId(ssl, cbInfo->devId);
-   #endif
-    /* sanity check for overflow */
-    if (gdevId < 0) {
-        gdevId = 7890;
-    }
-    
-    return cbInfo->devId;
-}
-
-/* Renesas Security Library Common Method
- * Clean up CryptCb
- *
- * id     : a pointer to device id to unregister
- * no return value
- */
-void wc_CryptoCb_CleanupRenesasCmn(int* id)
-{
-    wc_CryptoCb_UnRegisterDevice(*id);
-    *id = INVALID_DEVID;
-}
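-
-/* Usage sketch (illustrative only, not part of the original sources):
- * a caller would typically pair the init and cleanup entry points
- * around a TLS session, assuming a user-allocated context `userCtx`:
- *
- *     TsipUserCtx userCtx;                        // or User_SCEPKCbInfo
- *     int devId = wc_CryptoCb_CryptInitRenesasCmn(ssl, &userCtx);
- *     if (devId != INVALID_DEVID) {
- *         // ... run the TLS session; crypto ops are offloaded ...
- *         wc_CryptoCb_CleanupRenesasCmn(&devId);  // unregister device
- *     }
- */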
-
-#endif /* WOLF_CRYPTO_CB */
-#endif /* WOLFSSL_RENESAS_SCEPROTECT || WOLFSSL_RENESAS_SCEPROTECT_CRYPTONLY ||
-          WOLFSSL_RENESAS_TSIP_TLS */
-
-#if defined(WOLFSSL_RENESAS_SCEPROTECT) || defined(WOLFSSL_RENESAS_TSIP_TLS)
-
-/* Renesas Security Library Common Method
- * Check whether the CA at the given index can be used for SCE/TSIP,
- * i.e. whether the CA has already been verified by SCE/TSIP
- *
- * cmIdx  : CA index
- * return 1 if it can be used, otherwise 0
- */
-WOLFSSL_LOCAL byte Renesas_cmn_checkCA(word32 cmIdx)
-{
-    WOLFSSL_ENTER("Renesas_cmn_checkCA");
-    return (cmIdx == g_CAscm_Idx? 1:0);
-}
-
-/* check if the root CA has been verified by TSIP/SCE,
- * and it exists in the CM table.
- */
-static byte sce_tsip_rootCAverified(void)
-{
-    WOLFSSL_ENTER("sce_tsip_rootCAverified");
-    return (g_CAscm_Idx != (uint32_t)-1 ? 1:0);
-}
-/* Renesas Security Library Common Callback
- * Callback for RSA verify
- *
- * ssl      the WOLFSSL object
- * sig      Buffer holding the signature
- * sigSz    Length of the signature in bytes
- * out      Buffer to hold the recovered hash
- * key      Buffer holding the RSA key
- * keySz    Length of the key in bytes
- * return FSP_SUCCESS(0) on success, otherwise an FSP/TSIP error code
- */
-WOLFSSL_LOCAL int Renesas_cmn_RsaVerify(WOLFSSL* ssl, unsigned char* sig,
-          unsigned int sigSz, unsigned char** out,
-          const unsigned char* key, unsigned int keySz, void* ctx)
-{
-    int ret = 0;
-
-    WOLFSSL_ENTER("Renesas_cmn_RsaVerify");
-
-#if defined(WOLFSSL_RENESAS_TSIP_TLS)
-    ret = wc_tsip_RsaVerify(ssl, sig, sigSz, out, key, keySz, ctx);
-
-    if (ret == 0) {
-        /* Set Callback for SharedSecret when successful */
-        wolfSSL_CTX_SetEccSharedSecretCb(ssl->ctx, wc_tsip_EccSharedSecret);
-        wolfSSL_SetEccSharedSecretCtx(ssl, ctx);
-    }
-    else {
-        WOLFSSL_MSG("failed wc_tsip_RsaVerify");
-        wolfSSL_CTX_SetEccSharedSecretCb(ssl->ctx, NULL);
-        wolfSSL_SetEccSharedSecretCtx(ssl, NULL);
-    }
-
-#elif defined(WOLFSSL_RENESAS_SCEPROTECT)
-    ret = wc_SCE_RsaVerify(ssl, sig, sigSz, out, key, keySz, ctx);
-
-    if (ret == 0) {
-        /* Set Callback for SharedSecret when successful */
-        wolfSSL_CTX_SetEccSharedSecretCb(ssl->ctx, SCE_EccSharedSecret);
-        wolfSSL_SetEccSharedSecretCtx(ssl, ctx);
-    }
-    else {
-        WOLFSSL_MSG("failed R_SCE_TLS_ServerKeyExchangeVerify");
-        wolfSSL_CTX_SetEccSharedSecretCb(ssl->ctx, NULL);
-        wolfSSL_SetEccSharedSecretCtx(ssl, NULL);
-    }
-#endif
-
-    return ret;
-}
-/* Renesas Security Library Common Callback
- * Callback for ECC verify
- *
- * ssl      the WOLFSSL object
- * sig      Buffer holding the signature
- * sigSz    Length of the signature in bytes
- * hash     Buffer holding the hash
- * hashSz   Length of the hash
- * key      Buffer holding the ECC key
- * keySz    Length of the key in bytes
- * result   Pointer to an int that indicates whether verification succeeded
- * return FSP_SUCCESS(0) on success, otherwise an FSP/TSIP error code
- */
-WOLFSSL_LOCAL int Renesas_cmn_EccVerify(WOLFSSL* ssl, const unsigned char* sig,
-          unsigned int sigSz, const unsigned char* hash, unsigned int hashSz,
-        const unsigned char* key, unsigned int keySz, int* result, void* ctx)
-{
-    int ret = 0;
-
-    WOLFSSL_ENTER("Renesas_cmn_EccVerify");
-
-#if defined(WOLFSSL_RENESAS_TSIP)
-    ret = wc_tsip_EccVerify(ssl, sig, sigSz, hash, hashSz, key, keySz,
-                                                        result, ctx);
-    if (ret == 0 && *result == 1) {
-        /* Set callback for SharedSecret when being successful */
-        wolfSSL_CTX_SetEccSharedSecretCb(ssl->ctx, wc_tsip_EccSharedSecret);
-        wolfSSL_SetEccSharedSecretCtx(ssl, ctx);
-    }
-    else {
-        WOLFSSL_MSG("failed wc_tsip_EccVerify");
-        wolfSSL_CTX_SetEccSharedSecretCb(ssl->ctx, NULL);
-        wolfSSL_SetEccSharedSecretCtx(ssl, NULL);
-    }
-#elif defined(WOLFSSL_RENESAS_SCEPROTECT)
-    ret = wc_SCE_EccVerify(ssl, sig, sigSz, hash, hashSz, key, keySz,
-                                                        result, ctx);
-    if (ret == 0 && *result == 1) {
-        /* Set callback for SharedSecret when being successful */
-        wolfSSL_CTX_SetEccSharedSecretCb(ssl->ctx, SCE_EccSharedSecret);
-        wolfSSL_SetEccSharedSecretCtx(ssl, ctx);
-    }
-    else {
-        WOLFSSL_MSG("failed R_SCE_TLS_ServerKeyExchangeVerify");
-        wolfSSL_CTX_SetEccSharedSecretCb(ssl->ctx, NULL);
-        wolfSSL_SetEccSharedSecretCtx(ssl, NULL);
-    }
-#endif
-
-    return ret;
-}
-/* Renesas Security Library Common Entry Point
- * For root CA verification
- *
- * cert        Buffer holding the cert
- * cert_len    Length of the cert
- * key_n_start Byte position of the public key in the cert
- * key_n_len   Length of the public key in bytes
- * key_e_start Byte position of the public key exponent in the cert
- * key_e_len   Length of the public key exponent
- * cm_row      CA index
- * return FSP_SUCCESS(0) on success, otherwise WOLFSSL_FATAL_ERROR
- */
-int wc_Renesas_cmn_RootCertVerify(const byte* cert, word32 cert_len, 
-        word32 key_n_start, word32 key_n_len, word32 key_e_start, 
-        word32 key_e_len, word32 cm_row)
-{
-    int ret;
-
-    WOLFSSL_ENTER("wc_Renesas_cmn_RootCertVerify");
-
-    if (sce_tsip_rootCAverified() == 0) {
-
-    #if defined(WOLFSSL_RENESAS_TSIP_TLS)
-        ret = wc_tsip_tls_RootCertVerify(cert, cert_len, key_n_start,
-                key_n_len, key_e_start, key_e_len, cm_row);
-        if (ret != TSIP_SUCCESS) {
-            ret = WOLFSSL_FATAL_ERROR;
-        }
-    #elif defined(WOLFSSL_RENESAS_SCEPROTECT)
-
-        ret = wc_sce_tls_RootCertVerify(cert, cert_len, key_n_start,
-                key_n_len, key_e_start, key_e_len, cm_row);
-        if (ret != FSP_SUCCESS) {
-            ret = WOLFSSL_FATAL_ERROR;
-        }
-    #endif
-
-    }
-    else {
-        /* already verified. skipped */
-        ret = 0;
-    }
-    WOLFSSL_LEAVE("wc_Renesas_cmn_RootCertVerify", ret);
-    return ret;
-}
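-
-/* Illustrative call (hypothetical offsets, for explanation only): the
- * byte positions would come from certificate parsing, e.g.:
- *
- *     ret = wc_Renesas_cmn_RootCertVerify(cert, certSz,
- *               keyNStart, keyNLen,    // modulus offset/length in cert
- *               keyEStart, keyELen,    // exponent offset/length in cert
- *               0);                    // CA index (cm_row)
- *     // ret == 0: the root CA is now usable by TSIP/SCE
- */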
-
-/* Renesas Security Library Common Callback
- * Callback for TLS finished
- *
- * ssl      the WOLFSSL object
- * side     CLIENT or SERVER
- * handshake_hash hash of the handshake messages so far
- * hashSz   Length of the handshake hash
- * hashes   Buffer receiving the verify data calculated by the SCE/TSIP
- *          pseudo-random function
- * return FSP_SUCCESS(0) on success, otherwise an FSP/TSIP error code
- */
-WOLFSSL_LOCAL int Renesas_cmn_TlsFinished(WOLFSSL* ssl, const byte *side,
-                            const byte *handshake_hash, word32 hashSz,
-                            byte *hashes, void* ctx)
-{
-    int ret = -1;
-
-    (void)hashSz;
-    (void)ctx;
-
-    WOLFSSL_ENTER("Renesas_cmn_TlsFinished");
-
-    if (Renesas_cmn_usable(ssl, 1)) {
- #if defined(WOLFSSL_RENESAS_TSIP_TLS)
-        ret = wc_tsip_generateVerifyData(ssl->arrays->tsip_masterSecret,
-                            side, handshake_hash, hashes);
- #elif defined(WOLFSSL_RENESAS_SCEPROTECT)
-         ret = wc_sce_generateVerifyData(ssl->arrays->sce_masterSecret,
-                   side, handshake_hash, hashes);
- #endif
-    }
-    else
-        ret = PROTOCOLCB_UNAVAILABLE;
-
-    return ret;
-}
-
-/* Renesas Security Library Common Callback
- * Callback for setting Encrypt Keys.
- * Register callback for setting Encrypt Keys when keys are generated 
- * by SCE/TSIP
- *
- * ssl      the WOLFSSL object
- * ctx      Callback context
- * return 0 on success, -1 when keys are not generated by SCE/TSIP
- */
-static int Renesas_cmn_EncryptKeys(WOLFSSL* ssl, void* ctx)
-{
-    int ret;
-
-    WOLFSSL_ENTER("Renesas_cmn_EncryptKeys");
-
-    /* sanity check */
-    if (ssl == NULL || ctx == NULL)
-        return BAD_FUNC_ARG;
-
- #if defined(WOLFSSL_RENESAS_TSIP_TLS)
-    TsipUserCtx* cbInfo = (TsipUserCtx*)ctx;
-    
-    if (cbInfo->session_key_set == 1) {
- #elif defined(WOLFSSL_RENESAS_SCEPROTECT)
-    User_SCEPKCbInfo* cbInfo = (User_SCEPKCbInfo*)ctx;
-
-
-    if (cbInfo->keyflgs_tls.bits.session_key_set == 1) {
- #endif
-        ret = 0;
-
-        wolfSSL_CTX_SetTlsFinishedCb(ssl->ctx, Renesas_cmn_TlsFinished);
-        wolfSSL_SetTlsFinishedCtx(ssl, cbInfo);
-    }
-    else {
-        wolfSSL_CTX_SetTlsFinishedCb(ssl->ctx, NULL);
-        wolfSSL_SetTlsFinishedCtx(ssl, NULL);
-        ret = -1;
-    }
-
-    return ret;
-}
-
-/* Renesas Security Library Common Callback
- * Callback for Session Key generation
- * Register callback for Set Keys when keys are successfully
- * generated by SCE/TSIP
- *
- * ssl      the WOLFSSL object
- * ctx      Callback context
- * return FSP_SUCCESS(0) on success, otherwise SCE/TSIP error code
- */
-WOLFSSL_LOCAL int Renesas_cmn_generateSessionKey(WOLFSSL* ssl, void* ctx)
-{
-    int ret = -1;
-#if defined(WOLFSSL_RENESAS_TSIP_TLS)
-    TsipUserCtx*      cbInfo = (TsipUserCtx*)ctx;
-#elif defined(WOLFSSL_RENESAS_SCEPROTECT)
-    User_SCEPKCbInfo* cbInfo = (User_SCEPKCbInfo*)ctx;
-#endif
-    (void)ctx;
- 
-    WOLFSSL_ENTER("Renesas_cmn_generateSessionKey");
-    if (Renesas_cmn_usable(ssl, 0)) {
-#if defined(WOLFSSL_RENESAS_TSIP_TLS)
-        ret = wc_tsip_generateSessionKey(ssl, (TsipUserCtx*)ctx, cbInfo->devId);
-#elif defined(WOLFSSL_RENESAS_SCEPROTECT)
-        ret = wc_sce_generateSessionKey(ssl, ctx, cbInfo->devId);
-#endif
-    } 
-    else {
-         ret = PROTOCOLCB_UNAVAILABLE;
-    }
-    
-    if (ret == 0) {
-        wolfSSL_CTX_SetEncryptKeysCb(ssl->ctx, Renesas_cmn_EncryptKeys);
-        wolfSSL_SetEncryptKeysCtx(ssl, ctx);
-    }
-    else {
-        wolfSSL_CTX_SetEncryptKeysCb(ssl->ctx, NULL);
-        wolfSSL_SetEncryptKeysCtx(ssl, NULL);
-    }
-
-    return ret;
-}
-
-/* Renesas Security Library Common Callback
- * Callback for premaster secret generation by SCE/TSIP
- *
- * ssl       the WOLFSSL object
- * premaster Buffer to hold the premaster secret
- * preSz     Length of the premaster secret
- * ctx       Callback context
- * return FSP_SUCCESS(0) on success, otherwise PROTOCOLCB_UNAVAILABLE
- *         so that the caller can continue processing if desired
- */
-WOLFSSL_LOCAL int Renesas_cmn_generatePremasterSecret(WOLFSSL* ssl,
-                            byte *premaster, word32 preSz, void* ctx)
-{
-    int ret;
-
-    (void) ctx;
-    (void) ssl;
-
-    WOLFSSL_ENTER("Renesas_cmn_generatePremasterSecret");
-
-#if defined(WOLFSSL_RENESAS_TSIP_TLS)
-    if (Renesas_cmn_usable(ssl, 0)) {
-        ret = wc_tsip_generatePremasterSecret(premaster, preSz);
-        ssl->arrays->preMasterSz = preSz;
-    }
-    else
-        ret = PROTOCOLCB_UNAVAILABLE;
-
-#elif defined(WOLFSSL_RENESAS_SCEPROTECT)
-    if (Renesas_cmn_usable(ssl, 0)) {
-        ret = wc_sce_generatePremasterSecret(premaster, preSz);
-        ssl->arrays->preMasterSz = preSz;
-    }
-    else
-        ret = PROTOCOLCB_UNAVAILABLE;
-#endif
-
-    return ret;
-}
-
-/* Renesas Security Library Common Callback
- * Callback for Master Secret generation
- * Register callback for Session Key Generation when master secret is
- * successfully generated by SCE/TSIP
- *
- * ssl       the WOLFSSL object
- * ctx       Callback context
- * return FSP_SUCCESS(0) on success, otherwise PROTOCOLCB_UNAVAILABLE
- *        so that the caller can continue processing if desired
- */
-WOLFSSL_LOCAL int Renesas_cmn_genMasterSecret(struct WOLFSSL* ssl, void* ctx)
-{
-    int ret = WOLFSSL_NOT_IMPLEMENTED;
-
-    (void) ret;
-    (void) ctx;
-
-    WOLFSSL_ENTER("Renesas_cmn_genMasterSecret");
-
- #if defined(WOLFSSL_RENESAS_TSIP_TLS)
-
-    if (Renesas_cmn_usable(ssl, 0)) {
-    #if (WOLFSSL_RENESAS_TSIP_VER >= 109)
-        ret = wc_tsip_generateMasterSecretEx(
-                            ssl->options.cipherSuite0,
-                            ssl->options.cipherSuite,
-                            ssl->arrays->preMasterSecret,
-                            ssl->arrays->clientRandom,
-                            ssl->arrays->serverRandom,
-                            ssl->arrays->tsip_masterSecret);
-    #else
-        ret = wc_tsip_generateMasterSecret(
-                            ssl->arrays->preMasterSecret,
-                            ssl->arrays->clientRandom,
-                            ssl->arrays->serverRandom,
-                            ssl->arrays->tsip_masterSecret);
-    #endif
-
-        if (ret == 0) {
-            wc_tsip_storeKeyCtx(ssl, (TsipUserCtx*)ctx);
-            /* set Session Key generation Callback for use */
-            wolfSSL_CTX_SetGenSessionKeyCb(ssl->ctx,
-                                                Renesas_cmn_generateSessionKey);
-            wolfSSL_SetGenSessionKeyCtx(ssl, ctx);
-        }
-        else {
-            wolfSSL_CTX_SetGenSessionKeyCb(ssl->ctx, NULL);
-            wolfSSL_SetGenSessionKeyCtx(ssl, NULL);
-        }
-    }
-    else
-        ret = PROTOCOLCB_UNAVAILABLE;
-
- #elif defined(WOLFSSL_RENESAS_SCEPROTECT)
-    if (Renesas_cmn_usable(ssl, 0)) {
-        ret = wc_sce_generateMasterSecret(
-                            ssl->options.cipherSuite0,
-                            ssl->options.cipherSuite,
-                            ssl->arrays->preMasterSecret,
-                            ssl->arrays->clientRandom,
-                            ssl->arrays->serverRandom,
-                            ssl->arrays->sce_masterSecret);
-        if (ret == 0) {
-            wc_sce_storeKeyCtx(ssl, ctx);
-            /* set Session Key generation Callback for use */
-            wolfSSL_CTX_SetGenSessionKeyCb(ssl->ctx,
-                                                Renesas_cmn_generateSessionKey);
-            wolfSSL_SetGenSessionKeyCtx(ssl, ctx);
-        }
-        else {
-            wolfSSL_CTX_SetGenSessionKeyCb(ssl->ctx, NULL);
-            wolfSSL_SetGenSessionKeyCtx(ssl, NULL);
-        }
-    }
-    else {
-        WOLFSSL_MSG("PROTOCOLCB_UNAVAILABLE\n");
-        ret = PROTOCOLCB_UNAVAILABLE;
-    }
-
- #endif
-    return ret;
-}
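-
-/* Callback chain overview (summary comment, no new behavior): each
- * stage registers the next one only on success, so the engine-backed
- * path is taken only when every prior step succeeded:
- *
- *     Renesas_cmn_genMasterSecret
- *       -> Renesas_cmn_generateSessionKey
- *            -> Renesas_cmn_EncryptKeys
- *                 -> Renesas_cmn_TlsFinished
- *
- * On any failure the next stage is unregistered and wolfSSL falls back
- * to its software implementation.
- */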
-
-/* Renesas Security Library Common Callback
- * Callback for RSA encryption
- *
- * ssl       the WOLFSSL object
- * in        Buffer holding the plaintext
- * inSz      Length of the plaintext
- * out       Buffer to hold the ciphertext
- * outSz     Length of the ciphertext buffer
- * keyDer    Buffer holding the key in DER format
- * keySz     Length of the DER key
- * ctx       Callback context
- * return FSP_SUCCESS(0) on success, otherwise CRYPTOCB_UNAVAILABLE
- *         so that the caller can continue processing if desired
- */
-WOLFSSL_LOCAL int Renesas_cmn_RsaEnc(WOLFSSL* ssl, const unsigned char* in,
-       unsigned int inSz, unsigned char* out, word32* outSz,
-       const unsigned char* keyDer, unsigned int keySz, void* ctx)
-{
-    int ret;
-    int EncSz;
-
-    (void)ctx;
-    (void)in;
-    (void)inSz;
-    (void)keyDer;
-    (void)keySz;
-    (void)EncSz;
-
-    WOLFSSL_ENTER("Renesas_cmn_RsaEnc");
-
-    /* sanity check */
-    if (ssl == NULL || in == NULL || out == NULL || keyDer == NULL ||
-            ctx == NULL) {
-        return BAD_FUNC_ARG;
-    }
-
-    EncSz = wc_RsaEncryptSize(ssl->peerRsaKey);
-
- #if defined(WOLFSSL_RENESAS_TSIP_TLS)
-    if (tsip_usable(ssl, 0)) {
-        if (EncSz == 256) {
-            ret = wc_tsip_generateEncryptPreMasterSecret(ssl, out, outSz);
-        }
-        else {
-            WOLFSSL_MSG("TSIP can only handle 256 bytes for RSA encrypt size.");
-            ret = CRYPTOCB_UNAVAILABLE;
-        }
-    }
-    else {
-        ret = CRYPTOCB_UNAVAILABLE;
-    }
-
- #elif defined(WOLFSSL_RENESAS_SCEPROTECT)
-    if (wc_sce_usable(ssl, 0) && EncSz == 256) {
-        ret = wc_sce_generateEncryptPreMasterSecret(ssl, out, outSz);
-    }
-    else {
-        if (EncSz != 256)
-            WOLFSSL_MSG("SCE cannot be used because the RSA encrypt "
-                        "size is not 256 bytes (2048 bits).");
-
-        ret = CRYPTOCB_UNAVAILABLE;
-    }
- #endif
-    return ret;
-}
-
-/* Renesas Security Library Common Callback
- * Callback for HMAC verification
- *
- * ssl       the WOLFSSL object
- * message   Buffer holding the message
- * messageSz Length of the message
- * macSz     Length of the MAC in bytes
- * content   Type of the inner content
- * ctx       Callback context
- * return FSP_SUCCESS(0) on success, otherwise PROTOCOLCB_UNAVAILABLE
- *         so that the caller can continue processing if desired
- */
-WOLFSSL_LOCAL int Renesas_cmn_VerifyHmac(WOLFSSL *ssl, const byte* message,
-                    word32 messageSz, word32 macSz, word32 content, void* ctx)
-{
-    int ret;
-    (void)ctx;
-
-    WOLFSSL_ENTER("Renesas_cmn_VerifyHmac");
-
-#if defined(WOLFSSL_RENESAS_TSIP_TLS)
-    if (tsip_usable(ssl, 1)) {
-        ret = wc_tsip_ShaXHmacVerify(ssl, message, messageSz, macSz, content);
-    }
-    else
-        ret = PROTOCOLCB_UNAVAILABLE;
-#elif defined(WOLFSSL_RENESAS_SCEPROTECT)
-    if (wc_sce_usable(ssl, 1)) {
-        ret = wc_sce_Sha256VerifyHmac(ssl, message, messageSz, macSz, content);
-    }
-    else
-        ret = PROTOCOLCB_UNAVAILABLE;
-#endif
-
-    return ret;
-}
-
-#ifndef WOLFSSL_AEAD_ONLY
-/* Renesas Security Library Common Callback
- * Callback for TLS HMAC
- *
- * ssl       the WOLFSSL object
- * digest    Buffer to hold the HMAC digest
- * in        Buffer holding the input data
- * sz        Length of the input data
- * padSz     Length of the padding
- * content   Type of the inner content
- * verify    Whether this is a verify operation
- * epochOrder Epoch ordering flag, passed through to TLS_hmac
- * return FSP_SUCCESS(0) on success, otherwise an error code
- */
-WOLFSSL_LOCAL int Renesas_cmn_TLS_hmac(WOLFSSL* ssl, byte* digest,
-  const byte* in, word32 sz, int padSz, int content, int verify, int epochOrder)
-{
-    int ret;
-    byte   myInner[WOLFSSL_TLS_HMAC_INNER_SZ];
-
-    WOLFSSL_ENTER("Renesas_cmn_TLS_hmac");
-
-#if defined(WOLFSSL_RENESAS_TSIP_TLS)
-    if (Renesas_cmn_usable(ssl, 1)) {
-
-        wolfSSL_SetTlsHmacInner(ssl, myInner, sz, content, verify);
-
-        if (ssl->specs.hash_size == WC_SHA_DIGEST_SIZE) {
-            ret = wc_tsip_Sha1HmacGenerate(ssl, myInner,
-                                    WOLFSSL_TLS_HMAC_INNER_SZ, in, sz, digest);
-        }
-        else if (ssl->specs.hash_size == WC_SHA256_DIGEST_SIZE) {
-            ret = wc_tsip_Sha256HmacGenerate(ssl, myInner,
-                                    WOLFSSL_TLS_HMAC_INNER_SZ, in, sz, digest);
-        }
-        else {
-            ret = TSIP_MAC_DIGSZ_E;
-        }
-    }
-    else {
-        WOLFSSL_MSG("TLS_hmac is used instead of TSIP");
-        /* fall through to original TLS hmac method when TSIP cannot be used */
-        ret = TLS_hmac(ssl, digest, in, sz, padSz, content, verify, epochOrder);
-
-    }
-
-#elif defined(WOLFSSL_RENESAS_SCEPROTECT)
-    if (Renesas_cmn_usable(ssl, 1)) {
-        if (ssl->specs.hash_size == WC_SHA256_DIGEST_SIZE) {
-            wolfSSL_SetTlsHmacInner(ssl, myInner, sz, content, verify);
-            ret = wc_sce_Sha256GenerateHmac(ssl, myInner, 
-                                    WOLFSSL_TLS_HMAC_INNER_SZ, in, sz, digest);
-        }
-        else
-            ret = TSIP_MAC_DIGSZ_E;
-    }
-    else {
-        /* fall through to original TLS hmac method when SCE cannot be used */
-        ret = TLS_hmac(ssl, digest, in, sz, padSz, content, verify, epochOrder);
-    }
-
-#endif
-
-    return ret;
-}
-#endif /* !WOLFSSL_AEAD_ONLY */
-
-/* Renesas Security Library Common Callback
- * Callback for Signature PK RSA verify
- *
- * sig      Buffer holding the signature
- * sigSz    Length of the signature in bytes
- * out      Buffer to hold the recovered hash
- * keyDer   Buffer holding the RSA key
- * keySz    Length of the key in bytes
- * ctx      Callback context
- * return FSP_SUCCESS(0) on success, otherwise CRYPTOCB_UNAVAILABLE
- *         so that the caller can continue processing if desired
- */
-WOLFSSL_LOCAL int Renesas_cmn_SigPkCbRsaVerify(unsigned char* sig,
-        unsigned int sigSz, unsigned char** out, const unsigned char* keyDer,
-        unsigned int keySz, void* ctx)
-{
-    int ret;
-    CertAttribute*  CertAtt;
-
-    (void)out;
-    (void)keyDer;
-    (void)keySz;
-
-    WOLFSSL_ENTER("Renesas_cmn_SigPkCbRsaVerify");
-
-    /* sanity check */
-    if (sig == NULL || out == NULL || keyDer == NULL || ctx == NULL)
-        return BAD_FUNC_ARG;
-
-    CertAtt = (CertAttribute*)ctx;
-    if (!CertAtt) {
-        return CRYPTOCB_UNAVAILABLE;
-    }
-
-#if defined(WOLFSSL_RENESAS_TSIP_TLS)
-    if (CertAtt->keyIndex != NULL) {
-        ret = wc_tsip_tls_CertVerify(CertAtt->cert, CertAtt->certSz, sig, sigSz,
-                                 CertAtt->pubkey_n_start - CertAtt->certBegin,
-                                 CertAtt->pubkey_n_len - 1,
-                                 CertAtt->pubkey_e_start - CertAtt->certBegin,
-                                 CertAtt->pubkey_e_len -1,
-                                 (uint8_t*)CertAtt->keyIndex);
-        if (ret == 0) {
-            CertAtt->verifyByTSIP_SCE = 1;
-        }
-        else {
-            WOLFSSL_MSG("RSA Verify by TSIP didn't match");
-            ret = ASN_SIG_CONFIRM_E;
-        }
-    }
-    else
-        ret = CRYPTOCB_UNAVAILABLE;
-
-#elif defined(WOLFSSL_RENESAS_SCEPROTECT)
-    if (CertAtt->keyIndex != NULL) {
-        ret = wc_sce_tls_CertVerify(CertAtt->cert, CertAtt->certSz, sig, sigSz,
-                                 CertAtt->pubkey_n_start - CertAtt->certBegin,
-                                 CertAtt->pubkey_n_len - 1,
-                                 CertAtt->pubkey_e_start - CertAtt->certBegin,
-                                 CertAtt->pubkey_e_len -1,
-                                 (uint8_t*)CertAtt->keyIndex);
-        if (ret == 0) {
-            CertAtt->verifyByTSIP_SCE = 1;
-        }
-        else {
-            WOLFSSL_MSG("RSA Verify by SCE didn't match");
-            ret = ASN_SIG_CONFIRM_E;
-        }
-    }
-    else
-        ret = CRYPTOCB_UNAVAILABLE;
-#endif
-
-    return ret;
-}
-
-/* Renesas Security Library Common Callback
- * Callback for Signature PK ECC verify
- *
- * sig      Buffer holding the signature
- * sigSz    Length of the signature in bytes
- * hash     Buffer holding the hash
- * hashSz   Length of the hash
- * keyDer   Buffer holding the ECC key
- * keySz    Length of the key in bytes
- * result   Pointer to an int that receives the verification result
- * ctx      Callback context
- * return FSP_SUCCESS(0) on success, otherwise CRYPTOCB_UNAVAILABLE
- *         so that the caller can continue processing if desired
- */
-WOLFSSL_LOCAL int Renesas_cmn_SigPkCbEccVerify(const unsigned char* sig,
-        unsigned int sigSz, const unsigned char* hash, unsigned int hashSz,
-       const unsigned char* keyDer, unsigned int keySz,
-       int* result, void* ctx)
-{
-    int ret;
-    CertAttribute*  CertAtt;
-
-    (void)result;
-    (void)keyDer;
-    (void)keySz;
-    (void)hash;
-    (void)hashSz;
-
-    WOLFSSL_ENTER("Renesas_cmn_SigPkCbEccVerify");
-
-    /* sanity check */
-    if (sig == NULL || keyDer == NULL || hash == NULL || ctx == NULL ||
-        result == NULL)
-        return BAD_FUNC_ARG;
-
-
-    CertAtt = (CertAttribute*)ctx;
-    if (!CertAtt) {
-        return CRYPTOCB_UNAVAILABLE;
-    }
-
-#if defined(WOLFSSL_RENESAS_TSIP_TLS)
-    if (CertAtt->keyIndex != NULL) {
-        ret = wc_tsip_tls_CertVerify(CertAtt->cert, CertAtt->certSz, sig, sigSz,
-                                 CertAtt->pubkey_n_start - CertAtt->certBegin,
-                                 CertAtt->pubkey_n_len - 1,
-                                 CertAtt->pubkey_e_start - CertAtt->certBegin,
-                                 CertAtt->pubkey_e_len -1,
-                                 (uint8_t*)CertAtt->keyIndex);
-        if (ret == 0) {
-            CertAtt->verifyByTSIP_SCE = 1;
-            *result = 1;
-        }
-        else {
-            WOLFSSL_MSG("RSA Verify by TSIP didn't match");
-            ret = ASN_SIG_CONFIRM_E;
-        }
-    }
-    else
-        ret = CRYPTOCB_UNAVAILABLE;
-#elif defined(WOLFSSL_RENESAS_SCEPROTECT)
-    if (CertAtt->keyIndex != NULL) {
-        ret = wc_sce_tls_CertVerify(CertAtt->cert, CertAtt->certSz, sig, sigSz,
-                                 CertAtt->pubkey_n_start - CertAtt->certBegin,
-                                 CertAtt->pubkey_n_len - 1,
-                                 CertAtt->pubkey_e_start - CertAtt->certBegin,
-                                 CertAtt->pubkey_e_len -1,
-                                 (uint8_t*)CertAtt->keyIndex);
-        if (ret == 0) {
-            CertAtt->verifyByTSIP_SCE = 1;
-            *result = 1;
-        }
-        else {
-            WOLFSSL_MSG("RSA Verify by SCE didn't match");
-            ret = ASN_SIG_CONFIRM_E;
-        }
-    }
-    else
-        ret = CRYPTOCB_UNAVAILABLE;
-#endif
-
-    return ret;
-}
-
-#endif /* SCEPROTECT || TSIP */

+ 0 - 443
lib/wolfssl/wolfcrypt/src/port/Renesas/renesas_rx64_hw_sha.c

@@ -1,443 +0,0 @@
-/* renesas_rx64_hw_sha.c
- *
- * Contributed by Johnson Controls Tyco IP Holdings LLP.
- *
- * Use of this Software is subject to the GPLv2 License
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-#include <string.h>
-#include <stdio.h>
-
-#ifdef HAVE_CONFIG_H
-    #include <config.h>
-#endif
-#include <wolfssl/wolfcrypt/settings.h>
-
-#if !defined(NO_SHA) || !defined(NO_SHA256)
-
-#include <wolfssl/wolfcrypt/logging.h>
-
-#if defined(WOLFSSL_RENESAS_RX64_HASH)
-
-#include <wolfssl/wolfcrypt/error-crypt.h>
-#include <wolfssl/wolfcrypt/port/Renesas/renesas-rx64-hw-crypt.h>
-
-#include <wolfssl/wolfcrypt/sha.h>
-
-typedef union
-{
-    R_sha1   sha1;
-    R_sha224 sha224;
-    R_sha256 sha256;
-} R_Sha_Data;
-
-/**
-Default SHA Hash Data When Input Msg Buffers are NULL.
-
-The source of this data can be obtained from a simple python
-program that requests the hash of an empty input argument.
-Example:
-import hashlib
-print("SHA default/empty hash values")
-print(f"SHA1   {hashlib.sha1(b'').hexdigest()}")
-print(f"SHA224 {hashlib.sha224(b'').hexdigest()}")
-print(f"SHA256 {hashlib.sha256(b'').hexdigest()}")
-
-OR
-
-The following website also provides data for these hashes when
-an empty buffer is given as input:
-https://www.di-mgt.com.au/sha_testvectors.html
-**/
-
-static byte const DefaultShaHashData[] =
-{
-  0xDA, 0x39, 0xA3, 0xEE, 0x5E, 0x6B, 0x4B, 0x0D,
-  0x32, 0x55, 0xBF, 0xEF, 0x95, 0x60, 0x18, 0x90,
-  0xAF, 0xD8, 0x07, 0x09
-};
-
-static byte const DefaultSha224HashData[] =
-{
-  0xD1, 0x4A, 0x02, 0x8C, 0x2A, 0x3A, 0x2B, 0xC9,
-  0x47, 0x61, 0x02, 0xBB, 0x28, 0x82, 0x34, 0xC4,
-  0x15, 0xA2, 0xB0, 0x1F, 0x82, 0x8E, 0xA6, 0x2A,
-  0xC5, 0xB3, 0xE4, 0x2F
-};
-
-static byte const DefaultSha256HashData[] =
-{
-  0xE3, 0xB0, 0xC4, 0x42, 0x98, 0xFC, 0x1C, 0x14,
-  0x9A, 0xFB, 0xF4, 0xC8, 0x99, 0x6F, 0xB9, 0x24,
-  0x27, 0xAE, 0x41, 0xE4, 0x64, 0x9B, 0x93, 0x4C,
-  0xA4, 0x95, 0x99, 0x1B, 0x78, 0x52, 0xB8, 0x55
-};
-
-/**
- * @brief Calculate a SHA hash using the RX64 SHA subsystem.
- *
- * @param[in] data buffer with data to sha
- * @param[in] len length of data
- * @param[out] out Output buffer to store sha result in
- * @param[in] sha_type Sha type to calculate, from RX64_SHA_TYPE
- * @return int R_PROCESS_COMPLETE (0) on success, see r_sha.h for failure codes.
- */
-int RX64_ShaCalc(byte* data, word32 len, byte* out, word32 sha_type)
-{
-    int ret;
-    uint8_t flag = R_SHA_INIT;
-    word32 index = 0;
-    uint16_t chunk_length;
-    R_Sha_Data work_sha;
-
-    if (data == NULL || len == 0 ||
-        out == NULL || sha_type >= NUM_RX64_SHA_TYPES)
-    {
-        return BAD_FUNC_ARG;
-    }
-
-    XMEMSET(&work_sha, 0, sizeof(work_sha));
-
-    rx64_hw_lock();
-    do {
-        /*
-        The hardware functions can only accept UINT16_MAX bytes at a time.
-        To work around this break the buffer up into chunks and pass the
-        R_SHA_FINISH flag with the last chunk.
-        */
-        if (len - index <= UINT16_MAX) {
-            flag = flag | R_SHA_FINISH;
-            chunk_length = len - index;
-        } else {
-            chunk_length = UINT16_MAX;
-        }
-        /* Based on the hash type call the correct hardware function. */
-        if (sha_type == RX64_SHA1) {
-            ret = R_Sha1_HashDigest(&data[index], out, chunk_length, flag,
-                                    &work_sha.sha1);
-        } else if (sha_type == RX64_SHA224) {
-            ret = R_Sha224_HashDigest(&data[index], out, chunk_length, flag,
-                                      &work_sha.sha224);
-        } else if (sha_type == RX64_SHA256) {
-            ret = R_Sha256_HashDigest(&data[index], out, chunk_length, flag,
-                                      &work_sha.sha256);
-        }
-        if (ret != R_PROCESS_COMPLETE) {
-            /* On failure break, unlock hardware, return error. */
-            break;
-        }
-        index += chunk_length;
-        flag = R_SHA_ADD;
-    } while (index < len);
-
-    rx64_hw_unlock();
-    return ret;
-}
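-
-/* Usage sketch (illustrative only): hashing a caller-provided buffer
- * `msg` of length `msgLen` in one call:
- *
- *     byte digest[32];    // 32 bytes for SHA-256
- *     int rc = RX64_ShaCalc(msg, msgLen, digest, RX64_SHA256);
- *     if (rc == R_PROCESS_COMPLETE) {
- *         // digest now holds the SHA-256 of msg
- *     }
- *
- * Inputs larger than UINT16_MAX bytes are chunked internally by the
- * loop above.
- */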
-
-/**
- * @brief Free a hash for use with the RX64 SHA subsystem.
- *
- * @param[in] hash The hash to free
- */
-static void RX64_HashFree(wolfssl_RX64_HW_Hash* hash)
-{
-    if (hash == NULL)
-        return;
-
-    if (hash->msg != NULL) {
-        XFREE(hash->msg, hash->heap, DYNAMIC_TYPE_TMP_BUFFER);
-        hash->msg = NULL;
-    }
-}
-
-/**
- * @brief Initialize a hash for use with the RX64 SHA subsystem.
- *
- * @param[in] hash The hash to initialize
- * @param[in] heap Optional pointer to memory to use.
- * @param devId Unused
- * @param[in] sha_type The SHA type for this hash
- * @return int 0 on success, BAD_FUNC_ARG on failure
- */
-static int RX64_HashInit(wolfssl_RX64_HW_Hash* hash, void* heap, int devId,
-    word32 sha_type)
-{
-    if (hash == NULL) {
-        return BAD_FUNC_ARG;
-    }
-
-    (void)devId;
-    XMEMSET(hash, 0, sizeof(wolfssl_RX64_HW_Hash));
-
-    hash->heap = heap;
-    hash->len  = 0;
-    hash->used = 0;
-    hash->msg  = NULL;
-    hash->sha_type = sha_type;
-
-    return 0;
-}
-
-/**
- * @brief Add data to the hash with the RX64 SHA subsystem.
- *
- * Note that due to limitations of the RX64 hardware
- * and its inability to save intermediate state,
- * this function just appends the data to a buffer
- * that is processed later when HashFinal or HashGet is called.
- *
- * @param[in] hash Hash structure
- * @param[in] data data to hash
- * @param[in] sz size of the data
- * @return int 0 on success, BAD_FUNC_ARG or MEMORY_E on failure
- */
-static int RX64_HashUpdate(wolfssl_RX64_HW_Hash* hash,
-                           const byte* data, word32 sz)
-{
-    if (hash == NULL || (sz > 0 && data == NULL)) {
-        return BAD_FUNC_ARG;
-    }
-
-    if (hash->len < hash->used + sz) {
-        if (hash->msg == NULL) {
-            hash->msg = (byte*)XMALLOC(hash->used + sz, hash->heap,
-                    DYNAMIC_TYPE_TMP_BUFFER);
-        } else {
-            byte* pt = (byte*)XREALLOC(hash->msg, hash->used + sz, hash->heap,
-                    DYNAMIC_TYPE_TMP_BUFFER);
-            if (pt == NULL) {
-                return MEMORY_E;
-            }
-            hash->msg = pt;
-        }
-        if (hash->msg == NULL) {
-            return MEMORY_E;
-        }
-        hash->len = hash->used + sz;
-    }
-    XMEMCPY(hash->msg + hash->used, data, sz);
-    hash->used += sz;
-
-    return 0;
-}
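-
-/* Behavioral sketch (illustrative only): updates only append to the
- * internal buffer; nothing is hashed until final/get is called:
- *
- *     RX64_HashUpdate(&h, part1, len1);  // buffered
- *     RX64_HashUpdate(&h, part2, len2);  // appended to the same buffer
- *     RX64_HashFinal(&h, out);           // whole buffer hashed here
- */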
-
-/**
- * @brief Calculate hash value with the RX64 SHA subsystem and reset the hash.
- *
- * @param[in] hash Structure containing the information on what to hash
- * @param[out] out Sha hash
- * @retval int R_PROCESS_COMPLETE (0) on success.
- * @retval int BAD_FUNC_ARG or see r_sha.h on failure.
- */
-static int RX64_HashFinal(wolfssl_RX64_HW_Hash* hash, byte* out)
-{
-    int ret = R_PROCESS_COMPLETE;
-    void* heap;
-
-    if (hash == NULL || out == NULL) {
-        return BAD_FUNC_ARG;
-    }
-    if (hash->sha_type != RX64_SHA1 &&
-        hash->sha_type != RX64_SHA224 &&
-        hash->sha_type != RX64_SHA256)
-    {
-        return BAD_FUNC_ARG;
-    }
-
-    heap = hash->heap;
-
-    /*
-    The RX64 HW SHA operations consider an empty message to be an error,
-    but some wolfSSL operations expect SHA to succeed on empty input
-    buffers (e.g. DeriveHandshakeSecret()).
-    Thus we must return the expected default SHA hash values here, since
-    TLS decrypt operations expect the specific SW hash that results from
-    an empty/NULL input buffer.
-    */
-    if ((hash->msg == NULL) && (hash->len == 0) && (hash->used == 0))
-    {
-        if (hash->sha_type == RX64_SHA1)
-        {
-            XMEMCPY(out, DefaultShaHashData, sizeof(DefaultShaHashData));
-        }
-        else if (hash->sha_type == RX64_SHA224)
-        {
-            XMEMCPY(out, DefaultSha224HashData, sizeof(DefaultSha224HashData));
-        }
-        else if (hash->sha_type == RX64_SHA256)
-        {
-            XMEMCPY(out, DefaultSha256HashData, sizeof(DefaultSha256HashData));
-        }
-    }
-    else
-    {
-        /* Utilize RX64 SHA HW Acceleration for normal SHA operations. */
-        ret = RX64_ShaCalc(hash->msg, hash->len, out, hash->sha_type);
-        if (ret != R_PROCESS_COMPLETE)
-        {
-            return ret;
-        }
-    }
-
-    RX64_HashFree(hash);
-    return RX64_HashInit(hash, heap, 0, hash->sha_type);
-}
-
-/**
- * @brief Calculate hash value with the RX64 SHA subsystem.
- *
- * @param[in] hash Structure containing the information on what to hash
- * @param[out] out Sha hash
- * @retval int R_PROCESS_COMPLETE (0) on success.
- * @retval int BAD_FUNC_ARG or see r_sha.h on failure.
- */
-static int RX64_HashGet(wolfssl_RX64_HW_Hash* hash, byte* out)
-{
-    int ret;
-
-    if (hash == NULL || out == NULL) {
-        return BAD_FUNC_ARG;
-    }
-    if (hash->sha_type != RX64_SHA1 &&
-        hash->sha_type != RX64_SHA224 &&
-        hash->sha_type != RX64_SHA256)
-    {
-        return BAD_FUNC_ARG;
-    }
-
-    ret = RX64_ShaCalc(hash->msg, hash->len, out, hash->sha_type);
-    if (ret != R_PROCESS_COMPLETE) {
-        return ret;
-    }
-
-    return 0;
-}
-
-/**
- * @brief Copy a hash for use with the RX64 SHA subsystem.
- *
- * @param[in] src Source hash structure
- * @param[out] dst Destination hash structure
- * @return int 0 on success, BAD_FUNC_ARG on failure
- */
-static int RX64_HashCopy(wolfssl_RX64_HW_Hash* src, wolfssl_RX64_HW_Hash* dst)
-{
-    if (src == NULL || dst == NULL) {
-        return BAD_FUNC_ARG;
-    }
-
-    XMEMCPY(dst, src, sizeof(wolfssl_RX64_HW_Hash));
-
-    if (src->len > 0 && src->msg != NULL) {
-        dst->msg = (byte*)XMALLOC(src->len, dst->heap, DYNAMIC_TYPE_TMP_BUFFER);
-        if (dst->msg == NULL) {
-            return MEMORY_E;
-        }
-        XMEMCPY(dst->msg, src->msg, src->len);
-    }
-
-    return 0;
-}
-
-/* WolfCrypt wrapper function for RX64 SHA1 Init */
-int wc_InitSha_ex(wc_Sha* sha, void* heap, int devId)
-{
-    return RX64_HashInit((wolfssl_RX64_HW_Hash*)sha, heap, devId, RX64_SHA1);
-}
-/* WolfCrypt wrapper function for RX64 SHA1 Update */
-int wc_ShaUpdate(wc_Sha* sha, const byte* in, word32 sz)
-{
-    return RX64_HashUpdate((wolfssl_RX64_HW_Hash*)sha, in, sz);
-}
-/* WolfCrypt wrapper function for RX64 SHA1 Final */
-int wc_ShaFinal(wc_Sha* sha, byte* hash)
-{
-    return RX64_HashFinal((wolfssl_RX64_HW_Hash*)sha, hash);
-}
-/* WolfCrypt wrapper function for RX64 SHA1 Get */
-int wc_ShaGetHash(wc_Sha* sha, byte* hash)
-{
-    return RX64_HashGet((wolfssl_RX64_HW_Hash*)sha, hash);
-}
-/* WolfCrypt wrapper function for RX64 SHA1 Copy */
-int wc_ShaCopy(wc_Sha* src, wc_Sha* dst)
-{
-    return RX64_HashCopy((wolfssl_RX64_HW_Hash*)src, (wolfssl_RX64_HW_Hash*)dst);
-}
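-
-/* Usage sketch (illustrative only): these wrappers keep the standard
- * wolfCrypt API, so existing callers work unchanged:
- *
- *     wc_Sha sha;
- *     byte digest[WC_SHA_DIGEST_SIZE];
- *     if (wc_InitSha_ex(&sha, NULL, 0) == 0) {
- *         wc_ShaUpdate(&sha, data, dataSz);  // buffers the input
- *         wc_ShaFinal(&sha, digest);         // runs the HW hash
- *     }
- */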
-
-#if defined(WOLFSSL_SHA224)
-#include <wolfssl/wolfcrypt/sha256.h>
-
-/* WolfCrypt wrapper function for RX64 SHA224 Init */
-int wc_InitSha224_ex(wc_Sha224* sha, void* heap, int devId)
-{
-    return RX64_HashInit((wolfssl_RX64_HW_Hash*)sha, heap, devId, RX64_SHA224);
-}
-/* WolfCrypt wrapper function for RX64 SHA224 Update */
-int wc_Sha224Update(wc_Sha224* sha, const byte* in, word32 sz)
-{
-    return RX64_HashUpdate((wolfssl_RX64_HW_Hash*)sha, in, sz);
-}
-/* WolfCrypt wrapper function for RX64 SHA224 Final */
-int wc_Sha224Final(wc_Sha224* sha, byte* hash)
-{
-    return RX64_HashFinal((wolfssl_RX64_HW_Hash*)sha, hash);
-}
-/* WolfCrypt wrapper function for RX64 SHA224 Get */
-int wc_Sha224GetHash(wc_Sha224* sha, byte* hash)
-{
-    return RX64_HashGet((wolfssl_RX64_HW_Hash*)sha, hash);
-}
-/* WolfCrypt wrapper function for RX64 SHA224 Copy */
-int wc_Sha224Copy(wc_Sha224* src, wc_Sha224* dst)
-{
-    return RX64_HashCopy((wolfssl_RX64_HW_Hash*)src, (wolfssl_RX64_HW_Hash*)dst);
-}
-#endif /* WOLFSSL_SHA224 */
-
-#if !defined(NO_SHA256)
-#include <wolfssl/wolfcrypt/sha256.h>
-
-/* WolfCrypt wrapper function for RX64 SHA256 Init */
-int wc_InitSha256_ex(wc_Sha256* sha, void* heap, int devId)
-{
-    return RX64_HashInit((wolfssl_RX64_HW_Hash*)sha, heap, devId, RX64_SHA256);
-}
-/* WolfCrypt wrapper function for RX64 SHA256 Update */
-int wc_Sha256Update(wc_Sha256* sha, const byte* in, word32 sz)
-{
-    return RX64_HashUpdate((wolfssl_RX64_HW_Hash*)sha, in, sz);
-}
-/* WolfCrypt wrapper function for RX64 SHA256 Final */
-int wc_Sha256Final(wc_Sha256* sha, byte* hash)
-{
-    return RX64_HashFinal((wolfssl_RX64_HW_Hash*)sha, hash);
-}
-/* WolfCrypt wrapper function for RX64 SHA256 Get */
-int wc_Sha256GetHash(wc_Sha256* sha, byte* hash)
-{
-    return RX64_HashGet((wolfssl_RX64_HW_Hash*)sha, hash);
-}
-/* WolfCrypt wrapper function for RX64 SHA256 Copy */
-int wc_Sha256Copy(wc_Sha256* src, wc_Sha256* dst)
-{
-    return RX64_HashCopy((wolfssl_RX64_HW_Hash*)src, (wolfssl_RX64_HW_Hash*)dst);
-}
-#endif /* !NO_SHA256 */
-#endif /* WOLFSSL_RENESAS_RX64_HASH */
-#endif /* #if !defined(NO_SHA) || !defined(NO_SHA256) */

+ 0 - 106
lib/wolfssl/wolfcrypt/src/port/Renesas/renesas_rx64_hw_util.c

@@ -1,106 +0,0 @@
-/* renesas_rx64_hw_util.c
- *
- * Contributed by Johnson Controls Tyco IP Holdings LLP.
- *
- * Use of this Software is subject to the GPLv2 License
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-#ifdef HAVE_CONFIG_H
-    #include <config.h>
-#endif
-#include <wolfssl/wolfcrypt/settings.h>
-
-#if defined(WOLFSSL_RENESAS_RX64_HASH)
-
-#include <wolfssl/wolfcrypt/port/Renesas/renesas-rx64-hw-crypt.h>
-
-#include <stdio.h>
-/* mutex */
-static wolfSSL_Mutex rx64_hw_mutex;
-static int rx64_hw_CryptHwMutexInit_ = 0;
-
-/*
- * Lock the hw engine.
- * This should be called before using the engine.
- */
-int rx64_hw_lock(void)
-{
-    int ret = 0;
-
-    WOLFSSL_MSG("enter rx64_hw_lock");
-
-    if (rx64_hw_CryptHwMutexInit_ == 0) {
-        ret = wc_InitMutex(&rx64_hw_mutex);
-        if (ret == 0) {
-            rx64_hw_CryptHwMutexInit_ = 1;
-        } else {
-            WOLFSSL_MSG(" mutex initialization failed.");
-            return -1;
-        }
-    }
-    if (wc_LockMutex(&rx64_hw_mutex) != 0) {
-        /* this should not happen */
-        return -1;
-    }
-
-    WOLFSSL_MSG("leave rx64_hw_lock");
-    return ret;
-}
-
-/*
- * Release the hw engine.
- */
-void rx64_hw_unlock(void)
-{
-    WOLFSSL_MSG("enter rx64_hw_unlock");
-    /* unlock hw engine for next use */
-    wc_UnLockMutex(&rx64_hw_mutex);
-    WOLFSSL_MSG("leave rx64_hw_unlock");
-}
-
-/* open RX64 HW drivers for use */
-void rx64_hw_Open(void)
-{
-    if (rx64_hw_lock() == 0) {
-        /* Enable the SHA coprocessor function. */
-        R_Sha_Init();
-        /* unlock hw */
-        rx64_hw_unlock();
-    } else {
-        WOLFSSL_MSG("Failed to lock rx64 hw \n");
-    }
-}
-
-/* close RX64 HW driver */
-void rx64_hw_Close(void)
-{
-    if (rx64_hw_lock() == 0) {
-        /* Disable the SHA coprocessor function. */
-        R_Sha_Close();
-        /* unlock hw */
-        rx64_hw_unlock();
-    } else {
-        WOLFSSL_MSG("Failed to unlock rx64 hw \n");
-    }
-}
-
-
-#endif /* WOLFSSL_RENESAS_RX64_HASH */

+ 0 - 589
lib/wolfssl/wolfcrypt/src/port/Renesas/renesas_sce_aes.c

@@ -1,589 +0,0 @@
-/* renesas_sce_aes.c
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-#ifdef HAVE_CONFIG_H
-    #include <config.h>
-#endif
-#include <wolfssl/wolfcrypt/settings.h>
-
-#ifndef NO_AES
-
-#if (defined(WOLFSSL_RENESAS_SCEPROTECT) || \
-     defined(WOLFSSL_RENESAS_SCEPROTECT_CRYPTONLY)) && \
-    !defined(NO_WOLFSSL_RENESAS_SCEPROTECT_AES)
-
-#include <wolfssl/wolfcrypt/wc_port.h>
-#include <wolfssl/wolfcrypt/error-crypt.h>
-#include <wolfssl/internal.h>
-#include <wolfssl/wolfcrypt/aes.h>
-#include "wolfssl/wolfcrypt/port/Renesas/renesas-sce-crypt.h"
-
-#ifdef NO_INLINE
-    #include <wolfssl/wolfcrypt/misc.h>
-#else
-    #define WOLFSSL_MISC_INCLUDED
-    #include <wolfcrypt/src/misc.c>
-#endif
-
-struct Aes;
-
-#define SCE_AES_GCM_AUTH_TAG_SIZE  16
-
-typedef fsp_err_t (*aesGcmEncInitFn)
-        (sce_gcm_handle_t*, sce_aes_wrapped_key_t*, uint8_t*, uint32_t);
-typedef fsp_err_t (*aesGcmEncUpdateFn)
-        (sce_gcm_handle_t*,uint8_t*, uint8_t*, uint32_t, uint8_t*, uint32_t);
-typedef fsp_err_t (*aesGcmEncFinalFn)
-        (sce_gcm_handle_t*, uint8_t*, uint32_t*, uint8_t*);
-
-typedef fsp_err_t (*aesGcmDecInitFn)
-        (sce_gcm_handle_t*, sce_aes_wrapped_key_t*, uint8_t*, uint32_t);
-typedef fsp_err_t (*aesGcmDecUpdateFn)
-        (sce_gcm_handle_t*,uint8_t*, uint8_t*, uint32_t, uint8_t*, uint32_t);
-typedef fsp_err_t (*aesGcmDecFinalFn)
-        (sce_gcm_handle_t*, uint8_t*, uint32_t*, uint8_t*, uint32_t);
-
-/* Perform AES-GCM encryption via SCE
- *
- * aes       The AES object.
- * out       Buffer to hold the ciphertext
- * in        Buffer holding the plaintext
- * sz        Length of the plaintext/ciphertext in bytes
- * iv        Buffer holding the IV/nonce
- * ivSz      Length of the IV/nonce in bytes
- * authTag   Buffer to hold the authentication tag
- * authTagSz Length of the authentication tag in bytes
- * authIn    Buffer holding the additional authenticated data
- * authInSz  Length of the additional authenticated data in bytes
- * ctx       The callback context
- * return FSP_SUCCESS(0) on success, otherwise a negative value
- */
-WOLFSSL_LOCAL int  wc_sce_AesGcmEncrypt(struct Aes* aes, byte* out,
-                              const byte* in, word32 sz,
-                              byte* iv, word32 ivSz,
-                              byte* authTag, word32 authTagSz,
-                              const byte* authIn, word32 authInSz,
-                              void* ctx)
-{
-    int ret;
-    sce_gcm_handle_t    _handle;
-    uint32_t            dataLen = sz;
-    User_SCEPKCbInfo    *info = (User_SCEPKCbInfo*)ctx;
-
-    aesGcmEncInitFn     initFn;
-    aesGcmEncUpdateFn   updateFn;
-    aesGcmEncFinalFn    finalFn;
-
-    uint8_t* plainBuf  = NULL;
-    uint8_t* cipherBuf = NULL;
-    uint8_t* aTagBuf   = NULL;
-    uint8_t  delta;
-    const uint8_t* iv_l = NULL;
-    uint32_t ivSz_l = 0;
-
-    sce_hmac_sha_wrapped_key_t key_client_mac;
-    sce_hmac_sha_wrapped_key_t key_server_mac;
-    sce_aes_wrapped_key_t      key_client_aes;
-    sce_aes_wrapped_key_t      key_server_aes;
-
-    /* sanity check */
-    if (aes == NULL || authTagSz > AES_BLOCK_SIZE || ivSz == 0 || ctx == NULL) {
-        return BAD_FUNC_ARG;
-    }
-
-    if (authTagSz < WOLFSSL_MIN_AUTH_TAG_SZ) {
-        WOLFSSL_MSG("GcmEncrypt authTagSz too small error");
-        return BAD_FUNC_ARG;
-    }
-
-    if (aes->ctx.keySize != 16 && aes->ctx.keySize != 32) {
-        WOLFSSL_MSG("keySize is invalid, neither 16 or 32.");
-        return BAD_FUNC_ARG;
-    }
-
-    if (aes->ctx.keySize == 16) {
-        initFn   = R_SCE_AES128GCM_EncryptInit;
-        updateFn = R_SCE_AES128GCM_EncryptUpdate;
-        finalFn  = R_SCE_AES128GCM_EncryptFinal;
-    }
-    else {
-        initFn   = R_SCE_AES256GCM_EncryptInit;
-        updateFn = R_SCE_AES256GCM_EncryptUpdate;
-        finalFn  = R_SCE_AES256GCM_EncryptFinal;
-    }
-
-
-    /* check if AES GCM can be used by SCE */
-    if ((ret = wc_sce_hw_lock()) == 0) {
-
-        /* Allocate buffers for the plaintext, ciphertext and authTag to
-         * make sure those buffers are 32-bit aligned, as the SCE requires.
-         */
-        delta = ((sz % AES_BLOCK_SIZE) == 0) ? 0 :
-                    AES_BLOCK_SIZE - (sz % AES_BLOCK_SIZE);
-        plainBuf  = XMALLOC(sz, aes->heap, DYNAMIC_TYPE_AES);
-        cipherBuf = XMALLOC(sz + delta, aes->heap, DYNAMIC_TYPE_AES);
-        aTagBuf   = XMALLOC(SCE_AES_GCM_AUTH_TAG_SIZE, aes->heap,
-                                                        DYNAMIC_TYPE_AES);
-
-        if (plainBuf == NULL || cipherBuf == NULL || aTagBuf == NULL) {
-            WOLFSSL_MSG("wc_sce_AesGcmEncrypt: buffer allocation failed");
-            ret = -1;
-        }
-
-        if (ret == 0) {
-            XMEMCPY(plainBuf, in, sz);
-            XMEMSET((void*)cipherBuf, 0, sz + delta);
-            XMEMSET((void*)authTag,   0, authTagSz);
-        }
-        
-      #if defined(WOLFSSL_RENESAS_SCEPROTECT)
-       if (ret == 0 &&
-           info->keyflgs_tls.bits.session_key_set == 1) {
-            /* generate AES-GCM session key. The key stored in
-             * Aes.ctx.tsip_keyIdx is not used here.
-             */
-            ret = R_SCE_TLS_SessionKeyGenerate(
-                    info->sce_cipher,
-                    (uint32_t*)info->sce_masterSecret,
-                    (uint8_t*) info->sce_clientRandom,
-                    (uint8_t*) info->sce_serverRandom,
-                    &iv[AESGCM_IMP_IV_SZ], /* use exp_IV */
-                    &key_client_mac,
-                    &key_server_mac,
-                    &key_client_aes,
-                    &key_server_aes,
-                    NULL, NULL);
-            if (ret != FSP_SUCCESS) {
-                WOLFSSL_MSG("R_SCE_TLS_SessionKeyGenerate failed");
-                ret = -1;
-            }
-
-        }
-        else {
-       #else
-        if (ret == 0) {
-       #endif
-            if (info->keyflgs_crypt.bits.aes256_installedkey_set == 1 ||
-                info->keyflgs_crypt.bits.aes128_installedkey_set == 1) {
-                if (aes->ctx.keySize == 32) {
-                    XMEMCPY(&key_client_aes, 
-                        (sce_aes_wrapped_key_t*)info->sce_wrapped_key_aes256,
-                        sizeof(sce_aes_wrapped_key_t));
-                }
-                else {
-                    XMEMCPY(&key_client_aes, 
-                        (sce_aes_wrapped_key_t*)info->sce_wrapped_key_aes128,
-                        sizeof(sce_aes_wrapped_key_t));
-                }
-                iv_l = iv;
-                ivSz_l = ivSz;
-            }
-            else {
-                WOLFSSL_MSG("AES key for SCE is not set.");
-                ret = -1;
-            }
-        }
-
-        if (ret == 0) {
-
-            /* when the session key has been generated, it is already coupled
-             * to the IV and iv_l/ivSz_l stay NULL/0; otherwise the caller's
-             * iv and ivSz are passed to the init function.
-             */
-            ret = initFn(&_handle, &key_client_aes, (uint8_t*)iv_l, ivSz_l);
-
-            if (ret == FSP_SUCCESS) {
-                ret = updateFn(&_handle, NULL, NULL, 0UL, (uint8_t*)authIn,
-                                                                    authInSz);
-            }
-            if (ret == FSP_SUCCESS) {
-                ret = updateFn(&_handle, plainBuf, cipherBuf, sz, NULL, 0UL);
-            }
-            if (ret != FSP_SUCCESS) {
-                WOLFSSL_MSG("R_SCE_AesXXXGcmEncryptUpdate2: failed");
-                ret = -1;
-            }
-
-            if (ret == FSP_SUCCESS) {
-                /* Once R_SCE_AesxxxGcmEncryptInit or R_SCE_AesxxxEncryptUpdate
-                 * has been called, R_SCE_AesxxxGcmEncryptFinal must be called
-                 * regardless of the result of the previous call. Otherwise,
-                 * SCE cannot come out of its error state and all subsequent
-                 * API calls will fail.
-                 */
-                dataLen = 0;
-                ret = finalFn(&_handle,
-                           cipherBuf + (sz + delta - AES_BLOCK_SIZE),
-                              &dataLen,
-                              aTagBuf);
-
-                if (ret == FSP_SUCCESS) {
-                   /* copy encrypted data to out */
-                    if (sz != dataLen) {
-                        WOLFSSL_MSG("sz is not equal to dataLen!!!!");
-                        ret = -1;
-                    } else {
-                        XMEMCPY(out, cipherBuf, dataLen);
-                        /* copy auth tag to caller's buffer */
-                        XMEMCPY((void*)authTag, (void*)aTagBuf,
-                                    min(authTagSz, SCE_AES_GCM_AUTH_TAG_SIZE));
-                    }
-                }
-                else {
-                    WOLFSSL_MSG("R_SCE_AesxxxGcmEncryptFinal: failed");
-                    ret = -1;
-                }
-            }
-        }
-
-        XFREE(plainBuf,  aes->heap, DYNAMIC_TYPE_AES);
-        XFREE(cipherBuf, aes->heap, DYNAMIC_TYPE_AES);
-        XFREE(aTagBuf,   aes->heap, DYNAMIC_TYPE_AES);
-
-        wc_sce_hw_unlock();
-
-    }
-
-    return ret;
-}
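
The delta arithmetic above rounds the working ciphertext buffer up to a whole number of AES blocks, because the SCE final call always writes a full 16-byte block. A minimal standalone sketch of that computation; the helper name sce_padded_len is chosen here for illustration:

    #include <stdio.h>

    #define AES_BLOCK_SIZE 16  /* AES block size in bytes */

    /* Round sz up to the next multiple of AES_BLOCK_SIZE; the difference
     * ("delta" in the code above) is the extra room the final call needs. */
    static unsigned int sce_padded_len(unsigned int sz)
    {
        unsigned int delta = (sz % AES_BLOCK_SIZE == 0) ? 0
                           : AES_BLOCK_SIZE - (sz % AES_BLOCK_SIZE);
        return sz + delta;
    }

    int main(void)
    {
        printf("%u -> %u\n", 20u, sce_padded_len(20)); /* 20 -> 32 */
        printf("%u -> %u\n", 32u, sce_padded_len(32)); /* 32 -> 32 */
        return 0;
    }
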
-/* Perform AES-GCM decryption by SCE
- *
- * aes       The AES object.
- * out       Buffer to hold the plain text
- * in        Buffer holding the cipher text
- * sz        Length of cipher text/plain text in bytes
- * iv        Buffer holding the IV/nonce
- * ivSz      Length of the IV/nonce in bytes
- * authTag   Buffer holding the authentication tag to verify
- * authTagSz Length of the authentication tag in bytes
- * authIn    Buffer holding additional authentication data
- * authInSz  Length of additional authentication data in bytes
- * ctx       The callback context
- * return    FSP_SUCCESS(0) on success, otherwise negative value
- */
-WOLFSSL_LOCAL int  wc_sce_AesGcmDecrypt(struct Aes* aes, byte* out,
-                          const byte* in, word32 sz,
-                          const byte* iv, word32 ivSz,
-                          const byte* authTag, word32 authTagSz,
-                          const byte* authIn, word32 authInSz,
-                          void* ctx)
-{
-    int ret;
-    sce_gcm_handle_t _handle;
-    uint32_t            dataLen = sz;
-    User_SCEPKCbInfo    *info = (User_SCEPKCbInfo*)ctx;
-
-    aesGcmDecInitFn     initFn;
-    aesGcmDecUpdateFn   updateFn;
-    aesGcmDecFinalFn    finalFn;
-
-    uint8_t* cipherBuf = NULL;
-    uint8_t* plainBuf  = NULL;
-    uint8_t* aTagBuf = NULL;
-    uint8_t  delta;
-    const uint8_t* iv_l = NULL;
-    uint32_t ivSz_l = 0;
-
-    sce_hmac_sha_wrapped_key_t key_client_mac;
-    sce_hmac_sha_wrapped_key_t key_server_mac;
-    sce_aes_wrapped_key_t      key_client_aes;
-    sce_aes_wrapped_key_t      key_server_aes;
-
-    /* sanity check */
-    if (aes == NULL || authTagSz > AES_BLOCK_SIZE || ivSz == 0 || ctx == NULL) {
-        return BAD_FUNC_ARG;
-    }
-
-    if (authTagSz < WOLFSSL_MIN_AUTH_TAG_SZ) {
-        WOLFSSL_MSG("GcmEncrypt authTagSz too small error");
-        return BAD_FUNC_ARG;
-    }
-
-    if (aes->ctx.keySize != 16 && aes->ctx.keySize != 32) {
-        WOLFSSL_MSG("keySize is invalid, neither 16 or 32.");
-        return BAD_FUNC_ARG;
-    }
-
-    if (aes->ctx.keySize == 16) {
-        initFn   = R_SCE_AES128GCM_DecryptInit;
-        updateFn = R_SCE_AES128GCM_DecryptUpdate;
-        finalFn  = R_SCE_AES128GCM_DecryptFinal;
-    }
-    else {
-        initFn   = R_SCE_AES256GCM_DecryptInit;
-        updateFn = R_SCE_AES256GCM_DecryptUpdate;
-        finalFn  = R_SCE_AES256GCM_DecryptFinal;
-    }
-
-
-    if ((ret = wc_sce_hw_lock()) == 0) {
-        /* allocate buffers for plaintext, ciphertext and authTag to make
-         * sure those buffers are 32-bit aligned, as SCE requires.
-         */
-        delta = ((sz % AES_BLOCK_SIZE) == 0) ? 0 :
-                                     AES_BLOCK_SIZE - (sz % AES_BLOCK_SIZE);
-        cipherBuf = XMALLOC(sz, aes->heap, DYNAMIC_TYPE_AES);
-        plainBuf  = XMALLOC(sz + delta, aes->heap, DYNAMIC_TYPE_AES);
-        aTagBuf   = XMALLOC(SCE_AES_GCM_AUTH_TAG_SIZE, aes->heap,
-                                                        DYNAMIC_TYPE_AES);
-
-        if (plainBuf == NULL || cipherBuf == NULL || aTagBuf == NULL) {
-            ret = -1;
-        }
-
-        if (ret == 0) {
-            XMEMSET((void*)plainBuf,  0, sz);
-            XMEMCPY(cipherBuf, in, sz);
-            XMEMCPY(aTagBuf, authTag, authTagSz);
-        }
-       #if defined(WOLFSSL_RENESAS_SCEPROTECT)
-        if (ret == 0 &&
-            info->keyflgs_tls.bits.session_key_set == 1) {
-            /* generate AES-GCM session key. The key stored in
-             * Aes.ctx.tsip_keyIdx is not used here.
-             */
-            ret = R_SCE_TLS_SessionKeyGenerate(
-                    info->sce_cipher,
-                    (uint32_t*)info->sce_masterSecret,
-                    (uint8_t*) info->sce_clientRandom,
-                    (uint8_t*) info->sce_serverRandom,
-                    (uint8_t*)&iv[AESGCM_IMP_IV_SZ], /* use exp_IV */
-                    &key_client_mac,
-                    &key_server_mac,
-                    &key_client_aes,
-                    &key_server_aes,
-                    NULL, NULL);
-            if (ret != FSP_SUCCESS) {
-                WOLFSSL_MSG("R_SCE_TLS_SessionKeyGenerate failed");
-                ret = -1;
-            }
-        }
-        else {
-       #else
-        if (ret == 0) {
-       #endif
-            if (info->keyflgs_crypt.bits.aes256_installedkey_set == 1 ||
-                info->keyflgs_crypt.bits.aes128_installedkey_set == 1) {
-                if (aes->ctx.keySize == 32) {
-                    XMEMCPY(&key_server_aes, 
-                        (sce_aes_wrapped_key_t*)info->sce_wrapped_key_aes256,
-                        sizeof(sce_aes_wrapped_key_t));
-                }
-                else {
-                    XMEMCPY(&key_server_aes, 
-                        (sce_aes_wrapped_key_t*)info->sce_wrapped_key_aes128,
-                        sizeof(sce_aes_wrapped_key_t));
-                }
-                iv_l = iv;
-                ivSz_l = ivSz;
-            }
-            else {
-                WOLFSSL_MSG("AES key for SCE is not set.");
-                ret = -1;
-            }
-        }
-        
-        if (ret == 0) {
-            /* when the session key has been generated, it is already coupled
-             * to the IV and iv_l/ivSz_l stay NULL/0; otherwise the caller's
-             * iv and ivSz are passed to the init function.
-             */
-            ret = initFn(&_handle, &key_server_aes, (uint8_t*)iv_l, ivSz_l);
-
-
-            if (ret == FSP_SUCCESS) {
-                /* pass only the AAD and its size before passing cipher text */
-                ret = updateFn(&_handle, NULL, NULL, 0UL, (uint8_t*)authIn,
-                                                                    authInSz);
-            }
-            if (ret == FSP_SUCCESS) {
-                ret = updateFn(&_handle, cipherBuf, plainBuf, sz, NULL, 0UL);
-            }
-            if (ret != FSP_SUCCESS) {
-                WOLFSSL_MSG("R_SCE_AesXXXGcmDecryptUpdate: failed in decrypt");
-                ret = -1;
-            }
-
-            if (ret == FSP_SUCCESS) {
-                dataLen = 0;
-                ret = finalFn(&_handle,
-                                  plainBuf + (sz + delta - AES_BLOCK_SIZE),
-                            &dataLen,
-                            aTagBuf,
-                            min(16, authTagSz));
-
-                if (ret == FSP_SUCCESS) {
-                    /* copy plain data to out */
-                    if (sz != dataLen) {
-                        WOLFSSL_MSG("sz is not equal to dataLen!!!!");
-                        ret = -1;
-                    }
-                    else {
-                        XMEMCPY(out, plainBuf, dataLen);
-                    }
-                }
-                else {
-                    WOLFSSL_MSG("R_SCE_AesXXXGcmDecryptFinal: failed");
-                    ret = -1;
-                }
-            }
-        }
-
-        XFREE(aTagBuf,   aes->heap, DYNAMIC_TYPE_AES);
-        XFREE(plainBuf,  aes->heap, DYNAMIC_TYPE_AES);
-        XFREE(cipherBuf, aes->heap, DYNAMIC_TYPE_AES);
-
-        wc_sce_hw_unlock();
-    }
-
-    return ret;
-}
-/* Perform AES-CBC encryption by SCE
- *
- * aes    The AES object.
- * out    Buffer to hold the cipher text
- * in     Buffer holding the plain text
- * sz     Length of cipher text/plain text in bytes
- * return FSP_SUCCESS(0) on success, otherwise negative value
- */
-WOLFSSL_LOCAL int wc_sce_AesCbcEncrypt(struct Aes* aes, byte* out,
-                                                const byte* in, word32 sz)
-{
-    sce_aes_handle_t _handle;
-    word32 ret;
-    word32 blocks = (sz / AES_BLOCK_SIZE);
-    uint32_t dataLength;
-    byte *iv;
-
-    if ((in == NULL) || (out == NULL) || (aes == NULL))
-      return BAD_FUNC_ARG;
-
-    /* while doing a TLS handshake, the SCE driver keeps the true key and
-     * IV on the device; the iv passed here is a dummy */
-    iv = (uint8_t*)aes->reg;
-
-    if ((ret = wc_sce_hw_lock()) != 0) {
-        WOLFSSL_MSG("Failed to lock");
-        return ret;
-    }
-
-    if (aes->ctx.keySize == 16) {
-        ret = R_SCE_AES128CBC_EncryptInit(&_handle, &aes->ctx.sce_wrapped_key, iv);
-    }
-    else if (aes->ctx.keySize == 32) {
-        ret = R_SCE_AES256CBC_EncryptInit(&_handle, &aes->ctx.sce_wrapped_key, iv);
-    }
-    else {
-        WOLFSSL_MSG("invalid key Size for SCE. Key size is neither 16 or 32.");
-        wc_sce_hw_unlock();
-        return -1;
-    }
-
-    while (ret == FSP_SUCCESS && blocks--) {
-
-        if (aes->ctx.keySize == 16)
-            ret = R_SCE_AES128CBC_EncryptUpdate(&_handle, (uint8_t*)in,
-                                    (uint8_t*)out, (uint32_t)AES_BLOCK_SIZE);
-        else
-            ret = R_SCE_AES256CBC_EncryptUpdate(&_handle, (uint8_t*)in,
-                                    (uint8_t*)out, (uint32_t)AES_BLOCK_SIZE);
-
-        in  += AES_BLOCK_SIZE;
-        out += AES_BLOCK_SIZE;
-    }
-
-    if (ret == FSP_SUCCESS) {
-        if (aes->ctx.keySize == 16) {
-            ret = R_SCE_AES128CBC_EncryptFinal(&_handle, out, &dataLength);
-        }
-        else {
-            ret = R_SCE_AES256CBC_EncryptFinal(&_handle, out, &dataLength);
-        }
-    }
-    else {
-        WOLFSSL_MSG("SCE AES CBC encryption failed");
-        ret = -1;
-    }
-
-    wc_sce_hw_unlock();
-    return ret;
-}
-/* Perform AES-CBC decryption by SCE
- *
- * aes    The AES object.
- * out    Buffer to hold the plain text
- * in     Buffer holding the cipher text
- * sz     Length of cipher text/plain text in bytes
- * return FSP_SUCCESS(0) on success, otherwise negative value
- */
-WOLFSSL_LOCAL int wc_sce_AesCbcDecrypt(struct Aes* aes, byte* out,
-                                                const byte* in, word32 sz)
-{
-    sce_aes_handle_t _handle;
-    word32 ret;
-    word32 blocks = (sz / AES_BLOCK_SIZE);
-    uint32_t dataLength;
-    byte *iv;
-
-    if ((in == NULL) || (out == NULL) || (aes == NULL))
-      return BAD_FUNC_ARG;
-
-    iv = (uint8_t*)aes->reg;
-
-    if ((ret = wc_sce_hw_lock()) != 0) {
-        WOLFSSL_MSG("Failed to lock");
-        return ret;
-    }
-
-    if (aes->ctx.keySize == 16) {
-        ret = R_SCE_AES128CBC_DecryptInit(&_handle, &aes->ctx.sce_wrapped_key, iv);
-    }
-    else if (aes->ctx.keySize == 32) {
-        ret = R_SCE_AES256CBC_DecryptInit(&_handle, &aes->ctx.sce_wrapped_key, iv);
-    }
-    else {
-        WOLFSSL_MSG("Invalid key size for SCE; key size is neither 16 nor 32.");
-        wc_sce_hw_unlock();
-        return -1;
-    }
-
-    while (ret == FSP_SUCCESS && blocks--) {
-
-        if (aes->ctx.keySize == 16)
-            ret = R_SCE_AES128CBC_DecryptUpdate(&_handle, (uint8_t*)in,
-                                        (uint8_t*)out, (uint32_t)AES_BLOCK_SIZE);
-        else
-            ret = R_SCE_AES256CBC_DecryptUpdate(&_handle, (uint8_t*)in,
-                                        (uint8_t*)out, (uint32_t)AES_BLOCK_SIZE);
-
-        in  += AES_BLOCK_SIZE;
-        out += AES_BLOCK_SIZE;
-    }
-
-    if (ret == FSP_SUCCESS) {
-        if (aes->ctx.keySize == 16)
-            ret = R_SCE_AES128CBC_DecryptFinal(&_handle, out, &dataLength);
-        else
-            ret = R_SCE_AES256CBC_DecryptFinal(&_handle, out, &dataLength);
-    }
-    else {
-        WOLFSSL_MSG("SCE AES CBC decryption failed");
-        ret = -1;
-    }
-
-    wc_sce_hw_unlock();
-    return ret;
-}
-
-#endif /* WOLFSSL_RENESAS_TSIP_CRYPT */
-#endif /* NO_AES */
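
In wolfSSL these SCE routines are normally reached through the wolfCrypt crypto-callback layer rather than called directly (the port wires this up via its own common callback in renesas_cmn.c). A minimal sketch of how a device callback could route AES-GCM encryption to wc_sce_AesGcmEncrypt, assuming the callback types from wolfssl/wolfcrypt/cryptocb.h; the device id MY_SCE_DEVID and the context are illustrative:

    #include <wolfssl/wolfcrypt/settings.h>
    #include <wolfssl/wolfcrypt/error-crypt.h>
    #include <wolfssl/wolfcrypt/cryptocb.h>
    #include <wolfssl/wolfcrypt/port/Renesas/renesas-sce-crypt.h>

    #define MY_SCE_DEVID 7890   /* illustrative device id */

    /* Route AES-GCM encrypt requests to SCE; everything else falls back
     * to the software implementation via CRYPTOCB_UNAVAILABLE. */
    static int my_sce_devcb(int devId, wc_CryptoInfo* info, void* ctx)
    {
        (void)devId;
        if (info->algo_type == WC_ALGO_TYPE_CIPHER &&
            info->cipher.type == WC_CIPHER_AES_GCM && info->cipher.enc) {
            return wc_sce_AesGcmEncrypt(info->cipher.aesgcm_enc.aes,
                    info->cipher.aesgcm_enc.out, info->cipher.aesgcm_enc.in,
                    info->cipher.aesgcm_enc.sz,
                    (byte*)info->cipher.aesgcm_enc.iv,
                    info->cipher.aesgcm_enc.ivSz,
                    info->cipher.aesgcm_enc.authTag,
                    info->cipher.aesgcm_enc.authTagSz,
                    info->cipher.aesgcm_enc.authIn,
                    info->cipher.aesgcm_enc.authInSz, ctx);
        }
        return CRYPTOCB_UNAVAILABLE; /* let software handle it */
    }

    /* registration, e.g. at startup:
     *   wc_CryptoCb_RegisterDevice(MY_SCE_DEVID, my_sce_devcb, &myCbInfo);
     * where myCbInfo is a User_SCEPKCbInfo the application owns. */
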

+ 0 - 437
lib/wolfssl/wolfcrypt/src/port/Renesas/renesas_sce_rsa.c

@@ -1,437 +0,0 @@
-/* renesas_sce_rsa.c
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
- 
-#if !defined(NO_RSA) && \
-    defined(WOLFSSL_RENESAS_SCEPROTECT_CRYPTONLY)
-
-#include <string.h>
-#include <stdio.h>
-
-#ifdef HAVE_CONFIG_H
-    #include <config.h>
-#endif
-
-#include <wolfssl/wolfcrypt/settings.h>
-#include <wolfssl/wolfcrypt/logging.h>
-#include <wolfssl/wolfcrypt/error-crypt.h>
-#include <wolfssl/wolfcrypt/rsa.h>
-#include <wolfssl/wolfcrypt/port/Renesas/renesas-sce-crypt.h>
-
-/* Make an RSA key for SCE and set it in the callback ctx
- * Assumed to be called from the crypt callback
- *
- * size   desired key length in bits; 1024 and 2048 bits are supported
- * ctx    Callback context including pointers to hold the generated key
- * return FSP_SUCCESS(0) on success, otherwise negative value
- */
-WOLFSSL_LOCAL int wc_sce_MakeRsaKey(int size, void* ctx)
-{
-    fsp_err_t        ret;
-    User_SCEPKCbInfo *info = (User_SCEPKCbInfo*)ctx;
-
-    sce_rsa1024_wrapped_pair_key_t *wrapped_pair1024_key = NULL;
-    sce_rsa2048_wrapped_pair_key_t *wrapped_pair2048_key = NULL;
-
-    /* sanity check */
-    if (ctx == NULL)
-        return BAD_FUNC_ARG;
-    
-
-    if ((ret = wc_sce_hw_lock()) == 0) {
-        if (size == 1024) {
-            wrapped_pair1024_key = 
-            (sce_rsa1024_wrapped_pair_key_t*)XMALLOC(
-                sizeof(sce_rsa1024_wrapped_pair_key_t), NULL, 
-                                                DYNAMIC_TYPE_RSA_BUFFER);
-            if (wrapped_pair1024_key == NULL) {
-                wc_sce_hw_unlock();
-                return MEMORY_E;
-            }
-                
-            ret = R_SCE_RSA1024_WrappedKeyPairGenerate(wrapped_pair1024_key);
-        }
-        else if (size == 2048) {
-            wrapped_pair2048_key =
-            (sce_rsa2048_wrapped_pair_key_t*)XMALLOC(
-                sizeof(sce_rsa2048_wrapped_pair_key_t), NULL,
-                                                DYNAMIC_TYPE_RSA_BUFFER);
-            if (wrapped_pair2048_key == NULL) {
-                wc_sce_hw_unlock();
-                return MEMORY_E;
-            }
-                
-            ret = R_SCE_RSA2048_WrappedKeyPairGenerate(wrapped_pair2048_key);
-        }
-        else {
-            wc_sce_hw_unlock();
-            return CRYPTOCB_UNAVAILABLE;
-        }
-
-        if (ret == FSP_SUCCESS) {
-            if (size == 1024) {
-                if (info->sce_wrapped_key_rsapri1024 != NULL) {
-                    XFREE(info->sce_wrapped_key_rsapri1024, NULL, 
-                                                DYNAMIC_TYPE_RSA_BUFFER);
-                }
-                if (info->sce_wrapped_key_rsapub1024 != NULL) {
-                    XFREE(info->sce_wrapped_key_rsapub1024, NULL, 
-                                                DYNAMIC_TYPE_RSA_BUFFER);
-                }
-                info->sce_wrapped_key_rsapri1024 = 
-                (sce_rsa1024_private_wrapped_key_t*)XMALLOC(
-                    sizeof(sce_rsa1024_private_wrapped_key_t), NULL, 
-                                                DYNAMIC_TYPE_RSA_BUFFER);
-                    
-                if (info->sce_wrapped_key_rsapri1024 == NULL) {
-                    XFREE(wrapped_pair1024_key, 0, DYNAMIC_TYPE_RSA_BUFFER);
-                    wc_sce_hw_unlock();
-                    return MEMORY_E;
-                }
-                
-                info->sce_wrapped_key_rsapub1024 =
-                (sce_rsa1024_public_wrapped_key_t*)XMALLOC(
-                    sizeof(sce_rsa1024_public_wrapped_key_t), NULL, 
-                                                DYNAMIC_TYPE_RSA_BUFFER);
-                    
-                if (info->sce_wrapped_key_rsapub1024 == NULL) {
-                    XFREE(wrapped_pair1024_key, 0, DYNAMIC_TYPE_RSA_BUFFER);
-                    XFREE(info->sce_wrapped_key_rsapri1024, 0,
-                                                DYNAMIC_TYPE_RSA_BUFFER);
-                    info->sce_wrapped_key_rsapri1024 = NULL;
-                    wc_sce_hw_unlock();
-                    return MEMORY_E;
-                }
-                /* copy generated key pair and free malloced key */
-                XMEMCPY(info->sce_wrapped_key_rsapri1024, 
-                                    &wrapped_pair1024_key->priv_key,
-                                    sizeof(sce_rsa1024_private_wrapped_key_t));
-                XMEMCPY(info->sce_wrapped_key_rsapub1024, 
-                                    &wrapped_pair1024_key->pub_key,
-                                    sizeof(sce_rsa1024_public_wrapped_key_t));
-                XFREE(wrapped_pair1024_key, 0, DYNAMIC_TYPE_RSA_BUFFER);
-                
-                info->keyflgs_crypt.bits.rsapri1024_installedkey_set = 1;
-                info->keyflgs_crypt.bits.rsapub1024_installedkey_set = 1;
-            }
-            else if (size == 2048) {
-                if (info->sce_wrapped_key_rsapri2048 != NULL) {
-                    XFREE(info->sce_wrapped_key_rsapri2048, NULL, 
-                                    DYNAMIC_TYPE_RSA_BUFFER);
-                }
-                if (info->sce_wrapped_key_rsapub2048 != NULL) {
-                    XFREE(info->sce_wrapped_key_rsapub2048, NULL, 
-                                    DYNAMIC_TYPE_RSA_BUFFER);
-                }
-                info->sce_wrapped_key_rsapri2048 = 
-                (sce_rsa2048_private_wrapped_key_t*)XMALLOC(
-                    sizeof(sce_rsa2048_private_wrapped_key_t), NULL, 
-                                    DYNAMIC_TYPE_RSA_BUFFER);
-                    
-                if (info->sce_wrapped_key_rsapri2048 == NULL) {
-                    XFREE(wrapped_pair2048_key, 0, DYNAMIC_TYPE_RSA_BUFFER);
-                    wc_sce_hw_unlock();
-                    return MEMORY_E;
-                }
-                
-                info->sce_wrapped_key_rsapub2048 =
-                (sce_rsa2048_public_wrapped_key_t*)XMALLOC(
-                    sizeof(sce_rsa2048_public_wrapped_key_t), NULL, 
-                                    DYNAMIC_TYPE_RSA_BUFFER);
-                    
-                if (info->sce_wrapped_key_rsapub2048 == NULL) {
-                    XFREE(wrapped_pair2048_key, 0, DYNAMIC_TYPE_RSA_BUFFER);
-                    XFREE(info->sce_wrapped_key_rsapri2048, 0,
-                                    DYNAMIC_TYPE_RSA_BUFFER);
-                    info->sce_wrapped_key_rsapri2048 = NULL;
-                    wc_sce_hw_unlock();
-                    return MEMORY_E;
-                }
-                /* copy generated key pair and free malloced key */
-                XMEMCPY(info->sce_wrapped_key_rsapri2048, 
-                            &wrapped_pair2048_key->priv_key,
-                            sizeof(sce_rsa2048_private_wrapped_key_t));
-                XMEMCPY(info->sce_wrapped_key_rsapub2048, 
-                            &wrapped_pair2048_key->pub_key,
-                            sizeof(sce_rsa2048_public_wrapped_key_t));
-                XFREE(wrapped_pair2048_key, 0, DYNAMIC_TYPE_RSA_BUFFER);
-                
-                info->keyflgs_crypt.bits.rsapri2048_installedkey_set = 1;
-                info->keyflgs_crypt.bits.rsapub2048_installedkey_set = 1;
-                
-            }
-        }
-        else {
-            WOLFSSL_MSG("Failed to generate key pair by SCE");
-            XFREE(wrapped_pair1024_key, 0, DYNAMIC_TYPE_RSA_BUFFER);
-            XFREE(wrapped_pair2048_key, 0, DYNAMIC_TYPE_RSA_BUFFER);
-            wc_sce_hw_unlock();
-            return CRYPTOCB_UNAVAILABLE;
-        }
-
-        wc_sce_hw_unlock();
-    }
-
-    return ret;
-}
-
-/* Perform RSA encryption/decryption by SCE
- * Assumed to be called from the crypt callback
- *
- * in     Buffer holding the input (plain text for encrypt,
- *        cipher text for decrypt)
- * inLen  Length of the input in bytes
- * out    Buffer to hold the output
- * outLen Length of the out buffer in bytes
- * type   RSA_PUBLIC_ENCRYPT or RSA_PRIVATE_DECRYPT
- * key    RSA key object
- * rng    RNG object
- * ctx    Callback context
- * return FSP_SUCCESS(0) on success, otherwise negative value
- */
-WOLFSSL_LOCAL int wc_sce_RsaFunction(const byte* in, word32 inLen, byte* out,
-                    word32 outLen, int type, struct RsaKey* key, 
-                    struct WC_RNG* rng, void* ctx)
-{
-    int ret;
-    
-    sce_rsa_byte_data_t plain;
-    sce_rsa_byte_data_t cipher;
-    User_SCEPKCbInfo    *info = (User_SCEPKCbInfo*)ctx;
-    
-    int keySize;
-    
-    (void) key;
-    (void) rng;
-    
-    /* sanity check */
-    if (in == NULL || out == NULL || outLen == 0 || ctx == NULL) {
-        return BAD_FUNC_ARG;
-    }
-    
-    keySize = 0;
-    if (info->keyflgs_crypt.bits.rsapri2048_installedkey_set == 1 ||
-        info->keyflgs_crypt.bits.rsapub2048_installedkey_set == 1 )
-        keySize = 2048;
-    else if (info->keyflgs_crypt.bits.rsapri1024_installedkey_set == 1 ||
-             info->keyflgs_crypt.bits.rsapub1024_installedkey_set == 1 )
-        keySize = 1024;
-    
-    if (keySize == 0) {
-        WOLFSSL_MSG("keySize is invalid, neither 128 or 256 bytes, "
-                                                        "1024 or 2048 bits.");
-        return BAD_FUNC_ARG;
-    }
-    
-    if ((ret = wc_sce_hw_lock()) == 0) {
-        if (type == RSA_PUBLIC_ENCRYPT) {
-            
-            plain.pdata = (byte*)in;
-            plain.data_length = inLen;
-            cipher.pdata = out;
-            cipher.data_length = outLen;
-
-            if (keySize == 1024) {
-                if(info->keyflgs_crypt.bits.rsapub1024_installedkey_set == 1)
-                    ret = R_SCE_RSAES_PKCS1024_Encrypt(&plain, &cipher,
-                        (sce_rsa1024_public_wrapped_key_t*)
-                            info->sce_wrapped_key_rsapub1024);
-                else {
-                    WOLFSSL_MSG("wrapped public 1024-bit key is not set.");
-                    wc_sce_hw_unlock();
-                    return BAD_FUNC_ARG;
-                }
-            }
-            else {
-                if(info->keyflgs_crypt.bits.rsapub2048_installedkey_set == 1)
-                    ret = R_SCE_RSAES_PKCS2048_Encrypt(&plain, &cipher,
-                            (sce_rsa2048_public_wrapped_key_t*)
-                                info->sce_wrapped_key_rsapub2048);
-                else {
-                    WOLFSSL_MSG("wrapped public 2048-bit key is not set.");
-                    wc_sce_hw_unlock();
-                    return BAD_FUNC_ARG;
-                }
-            }
-        }
-        else if (type == RSA_PRIVATE_DECRYPT) {
-            plain.pdata = out;
-            plain.data_length = outLen;
-            cipher.pdata = (byte*)in;
-            cipher.data_length = inLen;
-            
-            if (keySize == 1024) {
-                if(info->keyflgs_crypt.bits.rsapri1024_installedkey_set == 1)
-                    ret = R_SCE_RSAES_PKCS1024_Decrypt(&cipher, &plain,
-                            (sce_rsa1024_private_wrapped_key_t*)
-                                info->sce_wrapped_key_rsapri1024);
-                else {
-                    WOLFSSL_MSG("wrapped private 1024-bit key is not set.");
-                    wc_sce_hw_unlock();
-                    return BAD_FUNC_ARG;
-                }
-            }
-            else {
-                if(info->keyflgs_crypt.bits.rsapri2048_installedkey_set == 1)
-                    ret = R_SCE_RSAES_PKCS2048_Decrypt(&cipher, &plain,
-                            (sce_rsa2048_private_wrapped_key_t*)
-                                info->sce_wrapped_key_rsapri2048);
-                else {
-                    WOLFSSL_MSG("wrapped private 2048-bit key is not set.");
-                    wc_sce_hw_unlock();
-                    return BAD_FUNC_ARG;
-                }
-            }
-        }
-        
-        wc_sce_hw_unlock();
-    }
-    return ret;
-}
-
-/* Perform RSA sign by SCE
- * Assumed to be called from the crypt callback
- *
- * in     Buffer holding the message or hash to sign
- * inLen  Length of the input in bytes
- * out    Buffer to hold the generated signature
- * outLen Pointer to the signature length in bytes
- * key    RSA key object
- * ctx    The callback context
- * return FSP_SUCCESS(0) on success, otherwise negative value
- */
- 
-WOLFSSL_LOCAL int wc_sce_RsaSign(const byte* in, word32 inLen, byte* out,
-                    word32* outLen, struct RsaKey* key, void* ctx)
-{
-    int ret;
-    
-    sce_rsa_byte_data_t message_hash;
-    sce_rsa_byte_data_t signature;
-    User_SCEPKCbInfo    *info = (User_SCEPKCbInfo*)ctx;
-    int keySize;
-    
-    (void) key;
-    
-    /* sanity check */
-    if (in == NULL || out == NULL || outLen == NULL ||
-                                key == NULL || ctx == NULL){
-        return BAD_FUNC_ARG;
-    }
-    
-    keySize = 0;
-    if (info->keyflgs_crypt.bits.rsapri2048_installedkey_set == 1 ||
-        info->keyflgs_crypt.bits.rsapub2048_installedkey_set == 1 )
-        keySize = 2048;
-    else if (info->keyflgs_crypt.bits.rsapri1024_installedkey_set == 1 ||
-             info->keyflgs_crypt.bits.rsapub1024_installedkey_set == 1 )
-        keySize = 1024;
-        
-    if (keySize == 0) {
-        WOLFSSL_MSG("keySize is invalid, neither 1024 or 2048 bits.");
-        return BAD_FUNC_ARG;
-    }
-    
-    message_hash.pdata = (uint8_t*)in;
-    message_hash.data_length = inLen;
-    message_hash.data_type =
-            info->keyflgs_crypt.bits.message_type; /* message 0, hash 1 */
-    signature.pdata = out;
-    signature.data_length = *outLen;
-    
-    if ((ret = wc_sce_hw_lock()) == 0) {
-        if (keySize == 1024) {
-            
-            ret = R_SCE_RSASSA_PKCS1024_SignatureGenerate(&message_hash, 
-                        &signature,
-                        (sce_rsa1024_private_wrapped_key_t *)
-                                    info->sce_wrapped_key_rsapri1024,
-                        HW_SCE_RSA_HASH_SHA256);
-        }
-        else {
-            
-            ret = R_SCE_RSASSA_PKCS2048_SignatureGenerate(&message_hash, 
-                        &signature,
-                        (sce_rsa2048_private_wrapped_key_t *)
-                                    info->sce_wrapped_key_rsapri2048,
-                        HW_SCE_RSA_HASH_SHA256);
-        }
-        
-        wc_sce_hw_unlock();
-    }
-    
-    return ret;
-}
-
-/* Perform RSA verify by SCE
- * Assumed to be called from the crypt callback
- *
- * in     Buffer holding the message or hash that was signed
- * inLen  Length of the input in bytes
- * out    Buffer holding the signature to verify
- * outLen Pointer to the signature length in bytes
- * key    RSA key object
- * ctx    The callback context
- * return FSP_SUCCESS(0) on success, otherwise negative value
- */
- 
-WOLFSSL_LOCAL int wc_sce_RsaVerify(const byte* in, word32 inLen, byte* out,
-                    word32* outLen,struct RsaKey* key, void* ctx)
-{
-    int ret;
-    
-    sce_rsa_byte_data_t message_hash;
-    sce_rsa_byte_data_t signature;
-    User_SCEPKCbInfo    *info = (User_SCEPKCbInfo*)ctx;
-    int keySize;
-    
-    (void) key;
-    
-    /* sanity check */
-    if (in == NULL || out == NULL || outLen == NULL ||
-                                key == NULL || ctx == NULL){
-        return BAD_FUNC_ARG;
-    }
-    
-    keySize = 0;
-    if (info->keyflgs_crypt.bits.rsapri2048_installedkey_set == 1 ||
-        info->keyflgs_crypt.bits.rsapub2048_installedkey_set == 1 )
-        keySize = 2048;
-    else if (info->keyflgs_crypt.bits.rsapri1024_installedkey_set == 1 ||
-             info->keyflgs_crypt.bits.rsapub1024_installedkey_set == 1 )
-        keySize = 1024;
-        
-    if (keySize == 0) {
-        WOLFSSL_MSG("keySize is invalid, neither 1024 or 2048 bits.");
-        return BAD_FUNC_ARG;
-    }
-    
-    
-    message_hash.pdata = (uint8_t*)in;
-    message_hash.data_length = inLen;
-    message_hash.data_type =
-            info->keyflgs_crypt.bits.message_type; /* message 0, hash 1 */
-
-    signature.pdata = out;
-    signature.data_length = *outLen;
-    
-    if ((ret = wc_sce_hw_lock()) == 0) {
-        if (keySize == 1024) {
-            
-            ret = R_SCE_RSASSA_PKCS1024_SignatureVerify(&signature,
-                  &message_hash,
-                  (sce_rsa1024_public_wrapped_key_t *)
-                        info->sce_wrapped_key_rsapub1024,
-                  HW_SCE_RSA_HASH_SHA256);
-        }
-        else {
-            ret = R_SCE_RSASSA_PKCS2048_SignatureVerify(&signature,
-                    &message_hash,
-                    (sce_rsa2048_public_wrapped_key_t *)
-                        info->sce_wrapped_key_rsapub2048,
-                    HW_SCE_RSA_HASH_SHA256);
-        }
-        
-        wc_sce_hw_unlock();
-    }
-    
-    return ret;
-}
-
-#endif /* !NO_RSA && WOLFSSL_RENESAS_SCEPROTECT_CRYPTONLY */
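
A quick sanity check of the sizes these PKCS#1 v1.5 routines work with: outputs are always exactly the modulus length, and v1.5 encryption padding reserves at least 11 bytes, so a 1024-bit key carries at most 117 plaintext bytes and a 2048-bit key at most 245. A small standalone check:

    #include <stdio.h>

    int main(void)
    {
        int bits[2] = { 1024, 2048 };   /* key sizes the SCE port supports */
        int i;

        for (i = 0; i < 2; i++) {
            int k = bits[i] / 8;        /* modulus length in bytes */
            /* RSAES-PKCS1-v1_5 padding costs at least 11 bytes */
            printf("RSA-%d: output %d bytes, max plaintext %d bytes\n",
                   bits[i], k, k - 11);
        }
        return 0;
    }
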

+ 0 - 267
lib/wolfssl/wolfcrypt/src/port/Renesas/renesas_sce_sha.c

@@ -1,267 +0,0 @@
-/* renesas_sce_sha.c
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-#include <string.h>
-#include <stdio.h>
-
-#ifdef HAVE_CONFIG_H
-    #include <config.h>
-#endif
-#include <wolfssl/wolfcrypt/settings.h>
-
-#if !defined(NO_SHA256)
-
-#include <wolfssl/wolfcrypt/logging.h>
-
-#if defined(WOLFSSL_RENESAS_SCEPROTECT) || \
-    defined(WOLFSSL_RENESAS_SCEPROTECT_CRYPTONLY)
-
-#include <wolfssl/wolfcrypt/error-crypt.h>
-#include <wolfssl/wolfcrypt/port/Renesas/renesas-sce-crypt.h>
-
-/* Free up allocation for msg
- *
- * hash    The SCE Hash object.
- * no return value
- */
-static void SCEHashFree(wolfssl_SCE_Hash* hash)
-{
-    if (hash == NULL)
-        return;
-
-    if (hash->msg != NULL) {
-        XFREE(hash->msg, hash->heap, DYNAMIC_TYPE_TMP_BUFFER);
-        hash->msg = NULL;
-    }
-}
-/* Initialize Hash object
- *
- * hash    The SCE Hash object.
- * heap    Heap hint to use for allocations, if available
- * devId   device Id
- * return  0 on success, BAD_FUNC_ARG when hash is NULL
- */
-static int SCEHashInit(wolfssl_SCE_Hash* hash, void* heap, int devId,
-    word32 sha_type)
-{
-    if (hash == NULL) {
-        return BAD_FUNC_ARG;
-    }
-
-    (void)devId;
-    XMEMSET(hash, 0, sizeof(wolfssl_SCE_Hash));
-
-    hash->heap = heap;
-    hash->len  = 0;
-    hash->used = 0;
-    hash->msg  = NULL;
-    hash->sha_type = sha_type;
-
-    return 0;
-}
-
-/* Add data to msg (work buffer) for the final hash operation
- *
- * hash    The SCE Hash object.
- * data    Buffer holding the plain text to hash
- * sz      Length of data
- * return  0 on success, otherwise MEMORY_E or BAD_FUNC_ARG on failure
- */
-static int SCEHashUpdate(wolfssl_SCE_Hash* hash, const byte* data, word32 sz)
-{
-    if (hash == NULL || (sz > 0 && data == NULL)) {
-        return BAD_FUNC_ARG;
-    }
-
-    if (hash->len < hash->used + sz) {
-        if (hash->msg == NULL) {
-            hash->msg = (byte*)XMALLOC(hash->used + sz, hash->heap,
-                    DYNAMIC_TYPE_TMP_BUFFER);
-        }
-        else {
-#ifdef FREERTOS
-            byte* pt = (byte*)XMALLOC(hash->used + sz, hash->heap,
-                    DYNAMIC_TYPE_TMP_BUFFER);
-            if (pt == NULL) {
-                return MEMORY_E;
-            }
-            XMEMCPY(pt, hash->msg, hash->used);
-            XFREE(hash->msg, hash->heap, DYNAMIC_TYPE_TMP_BUFFER);
-            hash->msg = NULL;
-            hash->msg = pt;
-#else
-            byte* pt = (byte*)XREALLOC(hash->msg, hash->used + sz, hash->heap,
-                    DYNAMIC_TYPE_TMP_BUFFER);
-            if (pt == NULL) {
-                return MEMORY_E;
-            }
-            hash->msg = pt;
-#endif
-        }
-        if (hash->msg == NULL) {
-            return MEMORY_E;
-        }
-        hash->len = hash->used + sz;
-    }
-    XMEMCPY(hash->msg + hash->used, data , sz);
-    hash->used += sz;
-
-    return 0;
-}
-
-/* Perform hash operation using accumulated msg
- *
- * hash    The SCE Hash object.
- * out     Buffer to hold hashed text
- * outSz   Length of out
- * return  FSP_SUCCESS(0) on success,
- *         otherwise BAD_FUNC_ARG or FSP Error code on failure
- */
-static int SCEHashFinal(wolfssl_SCE_Hash* hash, byte* out, word32 outSz)
-{
-    int ret;
-    void* heap;
-    sce_sha_md5_handle_t handle;
-    uint32_t sz;
-
-    fsp_err_t (*Init)(sce_sha_md5_handle_t*);
-    fsp_err_t (*Update)(sce_sha_md5_handle_t*, uint8_t*, uint32_t);
-    fsp_err_t (*Final )(sce_sha_md5_handle_t*, uint8_t*, uint32_t*);
-
-    if (hash == NULL || out == NULL) {
-        return BAD_FUNC_ARG;
-    }
-
-    if (hash->sha_type == SCE_SHA256) {
-        Init = R_SCE_SHA256_Init;
-        Update = R_SCE_SHA256_Update;
-        Final = R_SCE_SHA256_Final;
-    }
-    else
-        return BAD_FUNC_ARG;
-
-    heap = hash->heap;
-
-    wc_sce_hw_lock();
-
-    if (Init(&handle) == FSP_SUCCESS) {
-        ret = Update(&handle, (uint8_t*)hash->msg, hash->used);
-        if (ret == FSP_SUCCESS) {
-            ret = Final(&handle, out, (uint32_t*)&sz);
-            if (ret != FSP_SUCCESS || sz != outSz) {
-                wc_sce_hw_unlock();
-                return ret;
-            }
-        }
-    }
-    wc_sce_hw_unlock();
-
-    SCEHashFree(hash);
-    return SCEHashInit(hash, heap, 0, hash->sha_type);
-}
-/* Hash operation to message and return a result */
-static int SCEHashGet(wolfssl_SCE_Hash* hash, byte* out, word32 outSz)
-{
-    int ret;
-    sce_sha_md5_handle_t handle;
-    uint32_t sz;
-
-    fsp_err_t (*Init)(sce_sha_md5_handle_t*);
-    fsp_err_t (*Update)(sce_sha_md5_handle_t*, uint8_t*, uint32_t);
-    fsp_err_t (*Final )(sce_sha_md5_handle_t*, uint8_t*, uint32_t*);
-
-    if (hash == NULL || out == NULL) {
-        return BAD_FUNC_ARG;
-    }
-
-    if (hash->sha_type == SCE_SHA256) {
-        Init = R_SCE_SHA256_Init;
-        Update = R_SCE_SHA256_Update;
-        Final = R_SCE_SHA256_Final;
-    }
-    else
-        return BAD_FUNC_ARG;
-
-    wc_sce_hw_lock();
-
-    if (Init(&handle) == FSP_SUCCESS) {
-        ret = Update(&handle, (uint8_t*)hash->msg, hash->used);
-        if (ret == FSP_SUCCESS) {
-            ret = Final(&handle, out, &sz);
-            if (ret != FSP_SUCCESS || sz != outSz) {
-                wc_sce_hw_unlock();
-                return ret;
-            }
-        }
-    }
-
-    wc_sce_hw_unlock();
-
-    return 0;
-}
-/* copy hash result from src to dst */
-static int SCEHashCopy(wolfssl_SCE_Hash* src, wolfssl_SCE_Hash* dst)
-{
-    if (src == NULL || dst == NULL) {
-        return BAD_FUNC_ARG;
-    }
-
-    XMEMCPY(dst, src, sizeof(wolfssl_SCE_Hash));
-
-    if (src->len > 0 && src->msg != NULL) {
-        dst->msg = (byte*)XMALLOC(src->len, dst->heap, DYNAMIC_TYPE_TMP_BUFFER);
-        if (dst->msg == NULL) {
-            return MEMORY_E;
-        }
-        XMEMCPY(dst->msg, src->msg, src->len);
-    }
-
-    return 0;
-}
-
-#if !defined(NO_SHA256)
-#include <wolfssl/wolfcrypt/sha256.h>
-
-/*  wrapper for wc_InitSha256_ex */
-int wc_InitSha256_ex(wc_Sha256* sha, void* heap, int devId)
-{
-    return SCEHashInit(sha, heap, devId, SCE_SHA256);
-}
-/*  wrapper for wc_Sha256Update */
-int wc_Sha256Update(wc_Sha256* sha, const byte* in, word32 sz)
-{
-    return SCEHashUpdate(sha, in, sz);
-}
-/*  wrapper for wc_Sha256Final */
-int wc_Sha256Final(wc_Sha256* sha, byte* hash)
-{
-    return SCEHashFinal(sha, hash, WC_SHA256_DIGEST_SIZE);
-}
-/*  wrapper for wc_Sha256GetHash */
-int wc_Sha256GetHash(wc_Sha256* sha, byte* hash)
-{
-    return SCEHashGet(sha, hash, WC_SHA256_DIGEST_SIZE);
-}
-/*  wrapper for wc_Sha256Copy */
-int wc_Sha256Copy(wc_Sha256* src, wc_Sha256* dst)
-{
-    return SCEHashCopy(src, dst);
-}
-#endif /* !NO_SHA256 */
-#endif /* WOLFSSL_RENESAS_SCEPROTECT || WOLFSSL_RENESAS_SCEPROTECT_CRYPTONLY */
-#endif /* !NO_SHA256 */
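
Because these wrappers replace the standard wc_Sha256 entry points when the SCE port is enabled, callers keep the usual init/update/final flow; the port buffers the whole message and lets SCE hash it in one pass at final time. A usage sketch with the wolfCrypt API shown above:

    #include <wolfssl/wolfcrypt/settings.h>
    #include <wolfssl/wolfcrypt/sha256.h>

    int hash_message(const byte* msg, word32 msgSz,
                     byte digest[WC_SHA256_DIGEST_SIZE])
    {
        wc_Sha256 sha;
        int ret = wc_InitSha256_ex(&sha, NULL, INVALID_DEVID);
        if (ret == 0)
            ret = wc_Sha256Update(&sha, msg, msgSz); /* may be repeated */
        if (ret == 0)
            ret = wc_Sha256Final(&sha, digest); /* SCE hashes buffered msg */
        return ret;
    }
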

+ 0 - 1160
lib/wolfssl/wolfcrypt/src/port/Renesas/renesas_sce_util.c

@@ -1,1160 +0,0 @@
-/* renesas_sce_util.c
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-#include <wolfssl/wolfcrypt/settings.h>
-
-#if defined(WOLFSSL_RENESAS_SCEPROTECT) || \
-    defined(WOLFSSL_RENESAS_SCEPROTECT_CRYPTONLY)
-
-#include <wolfssl/wolfcrypt/wc_port.h>
-#include <wolfssl/wolfcrypt/error-crypt.h>
-
-#include <wolfssl/wolfcrypt/port/Renesas/renesas-sce-crypt.h>
-#include <wolfssl/wolfcrypt/port/Renesas/renesas_cmn.h>
-#include <wolfssl/wolfcrypt/memory.h>
-#include <wolfssl/wolfcrypt/error-crypt.h>
-#include <wolfssl/wolfcrypt/aes.h>
-#include <wolfssl/ssl.h>
-#include <wolfssl/internal.h>
-
-#include <stdio.h>
-
-#if defined(DEBUG_PK_CB)
-    #define WOLFSSL_PKMSG(_f_, ...) printf(_f_, ##__VA_ARGS__)
-#else
-    #define WOLFSSL_PKMSG(_f_, ...)
-#endif
-
-#if defined(WOLFSSL_RENESAS_SCEPROTECT_ECC)
-WOLFSSL_GLOBAL SCE_PKCbInfo gSCE_PKCbInfo;
-#endif
-
-/* these variables are expected to be defined by the user application */
-extern sce_instance_ctrl_t sce_ctrl;
-extern sce_cfg_t sce_cfg;
-
-static const byte*  ca_cert_sig;
-static sce_key_data g_user_key_info;
-
-static uint32_t     g_encrypted_publicCA_key[HW_SCE_SINST_WORD_SIZE];
-extern uint32_t     g_CAscm_Idx;          /* index of CM table    */
-wolfSSL_Mutex       sce_mutex;
-static int          sce_CryptHwMutexInit_ = 0;
-static uint32_t     sce_sess_idx = 0;
-
-/* Mutex Init */
-static int sce_CryptHwMutexInit(wolfSSL_Mutex* mutex)
-{
-    return wc_InitMutex(mutex);
-}
-/* Mutex Lock */
-static int sce_CryptHwMutexLock(wolfSSL_Mutex* mutex)
-{
-    return wc_LockMutex(mutex);
-}
-/* Mutex Unlock */
-static int sce_CryptHwMutexUnLock(wolfSSL_Mutex* mutex)
-{
-    return wc_UnLockMutex(mutex);
-}
-
-/*
-* Lock the HW engine.
-* This must be called before using the engine.
-*/
-WOLFSSL_LOCAL int wc_sce_hw_lock()
-{
-    int ret = 0;
-
-    if (sce_CryptHwMutexInit_ == 0) {
-
-        ret = sce_CryptHwMutexInit(&sce_mutex);
-
-        if (ret == 0) {
-            sce_CryptHwMutexInit_ = 1;
-        }
-        else {
-            WOLFSSL_MSG(" mutex initialization failed.");
-            return -1;
-        }
-    }
-    if (sce_CryptHwMutexLock(&sce_mutex) != 0) {
-        /* this should not happen */
-        return -1;
-    }
-
-    return ret;
-}
-
-/*
-* Release the HW engine.
-*/
-WOLFSSL_LOCAL void wc_sce_hw_unlock(void)
-{
-    sce_CryptHwMutexUnLock(&sce_mutex);
-}
-
-/* Open the SCE driver for use */
-WOLFSSL_LOCAL int wc_sce_Open()
-{
-    int ret;
-    WOLFSSL_ENTER("wc_sce_Open");
-    if ((ret = wc_sce_hw_lock()) == 0) {
-
-        ret = R_SCE_Open(&sce_ctrl, &sce_cfg);
-
-        if (ret != FSP_SUCCESS) {
-            WOLFSSL_MSG("RENESAS SCE Open failed");
-        }
-        if (ret == FSP_SUCCESS && g_user_key_info.encrypted_user_tls_key) {
-
-            ret = R_SCE_TLS_RootCertificateRSA2048PublicKeyInstall(
-                    g_user_key_info.encrypted_provisioning_key,
-                    g_user_key_info.iv,
-                    g_user_key_info.encrypted_user_tls_key,
-                    &g_user_key_info.user_rsa2048_tls_wrappedkey); /* OUT */
-
-            /* init vars */
-             g_CAscm_Idx = (uint32_t)-1;
-
-        }
-        else {
-            WOLFSSL_MSG("Failed to lock sce hw ");
-        }
-
-    }
-
-    /* unlock hw */
-    wc_sce_hw_unlock();
-
-    WOLFSSL_LEAVE("wc_sce_Open", ret);
-    return ret;
-}
-
-/* Close the SCE driver */
-WOLFSSL_LOCAL void wc_sce_Close()
-{
-    int ret;
-    WOLFSSL_ENTER("sce Close");
-
-    if ((ret = wc_sce_hw_lock()) == 0) {
-        /* close SCE */
-        ret = R_SCE_Close(&sce_ctrl);
-
-        /* unlock hw */
-        wc_sce_hw_unlock();
-        if (ret != FSP_SUCCESS) {
-            WOLFSSL_MSG("RENESAS SCE Close failed");
-        }
-    }
-    else {
-        WOLFSSL_MSG("Failed to unlock sce hw ");
-    }
-}
-
-#ifndef WOLFSSL_RENESAS_SCEPROTECT_CRYPTONLY
-
-#if defined(WOLFSSL_RENESAS_SCEPROTECT_ECC)
-/* Verify Server Key Exchange while doing ECDH key exchange */
-static int SCE_ServerKeyExVerify(uint32_t type, WOLFSSL* ssl, const uint8_t* sig,
-                                                      uint32_t sigSz, void* ctx)
-{
-    int ret = WOLFSSL_FAILURE;
-    User_SCEPKCbInfo* cbInfo;
-    byte qx[MAX_ECC_BYTES], qy[MAX_ECC_BYTES];
-    byte *peerkey = NULL;
-
-    word32 qxLen = sizeof(qx), qyLen = sizeof(qy);
-    (void) sigSz;
-
-    /* sanity check */
-    if (ssl == NULL || sig == NULL || ctx == NULL)
-        return ret;
-
-    cbInfo = (User_SCEPKCbInfo*)ctx;
-
-    /* export the peer's public key */
-    ret = wc_ecc_export_public_raw(ssl->peerEccKey, qx, &qxLen, qy, &qyLen);
-    WOLFSSL_PKMSG("qxLen %d qyLen %d\n", qxLen, qyLen);
-    if (ret != 0) {
-        WOLFSSL_MSG("failed to export peer ecc key");
-        return ret;
-    }
-    /* make peer ECC key data for SCE:
-     * 0 padding (24 bits) || 04 (8 bits) || Qx (256 bits) || Qy (256 bits) */
-    peerkey = (byte*)XMALLOC((3 + 1 + qxLen + qyLen), NULL, DYNAMIC_TYPE_TMP_BUFFER);
-    if (peerkey == NULL) {
-        WOLFSSL_MSG("failed to malloc ecc key");
-        return WOLFSSL_FAILURE;
-    }
-
-    XMEMSET(peerkey, 0, (3 + 1 + qxLen + qyLen));
-    peerkey[3] = ECC_POINT_UNCOMP;
-    XMEMCPY(&peerkey[4], qx, qxLen);
-    XMEMCPY(&peerkey[4+qxLen], qy, qyLen);
-
-    if ((ret = wc_sce_hw_lock()) == 0) {
-        /* 0 : RSA 2048bit, 1 : Reserved, 2 : ECDSA P-256 */
-        ret = R_SCE_TLS_ServerKeyExchangeVerify(
-            type,
-            (uint8_t*) ssl->arrays->clientRandom,
-            (uint8_t*) ssl->arrays->serverRandom,
-            (uint8_t*) peerkey,
-            (uint8_t*) sig,
-            (uint32_t*)ssl->peerSceTsipEncRsaKeyIndex,
-            (uint32_t*)cbInfo->encrypted_ephemeral_ecdh_public_key);
-
-        if (ret != FSP_SUCCESS) {
-            WOLFSSL_MSG("failed R_SCE_TLS_ServerKeyExchangeVerify");
-            cbInfo->keyflgs_tls.bits.pk_key_set = 0;
-        }
-        else {
-            ret = WOLFSSL_SUCCESS;
-            cbInfo->keyflgs_tls.bits.pk_key_set = 1;
-        }
-    }
-    else {
-        WOLFSSL_MSG("Failed to lock sce hw ");
-    }
-
-    /* unlock hw */
-    wc_sce_hw_unlock();
-
-    XFREE(peerkey, 0, DYNAMIC_TYPE_TMP_BUFFER);
-    return ret;
-}
-/* Callback for Rsa Verify */
-WOLFSSL_LOCAL int wc_SCE_RsaVerify(WOLFSSL* ssl, byte* sig, uint32_t sigSz,
-        uint8_t** out, const byte* key, uint32_t keySz, void* ctx)
-{
-    int ret = WOLFSSL_FAILURE;
-    (void) out;
-    (void) key;
-    (void) keySz;
-
-    WOLFSSL_PKMSG("SCE RSA Verify: sigSz %d, keySz %d\n", sigSz, keySz);
-
-    if (wc_sce_usable(ssl, 0))
-      ret = SCE_ServerKeyExVerify(0, ssl, sig, sigSz, ctx);
-    else
-      ret = CRYPTOCB_UNAVAILABLE;
-
-    if (ret == WOLFSSL_SUCCESS)
-        ret = 0; /* 0 for success */
-
-    WOLFSSL_PKMSG("SCE RSA Verify: ret %d\n", ret);
-
-    return ret;
-}
-/* Callback for Ecc Verify */
-WOLFSSL_LOCAL int wc_SCE_EccVerify(WOLFSSL* ssl, const uint8_t* sig,
-        uint32_t sigSz,  const uint8_t* hash, uint32_t hashSz,
-        const uint8_t* key, uint32_t keySz, int* result, void* ctx)
-{
-    int ret = WOLFSSL_FAILURE;
-    uint8_t *sigforSCE;
-    const byte rs_size = HW_SCE_ECDSA_DATA_BYTE_SIZE/2;
-    byte offset = 0x3;
-    (void) sigSz;
-    (void) hash;
-    (void) hashSz;
-    (void) key;
-    (void) keySz;
-
-    sigforSCE = NULL;
-
-    WOLFSSL_PKMSG("SCE ECC Verify: ssl->options.serverState = %d sigSz %d, hashSz %d, keySz %d\n",
-                    ssl->options.serverState, sigSz, hashSz, keySz);
-
-    if (!wc_sce_usable(ssl, 0)) {
-      WOLFSSL_PKMSG("Cannot handle cipher suite by SCE");
-      return CRYPTOCB_UNAVAILABLE;
-    }
-
-    if ((sigforSCE = (uint8_t*)XMALLOC(HW_SCE_ECDSA_DATA_BYTE_SIZE, NULL,
-                                                  DYNAMIC_TYPE_TEMP)) == NULL) {
-        WOLFSSL_MSG("failed to malloc memory");
-        return MEMORY_E;
-    }
-    /* initialization */
-    XMEMSET(sigforSCE, 0, HW_SCE_ECDSA_DATA_BYTE_SIZE);
-
-    /* r */
-    if (sig[offset] == 0x20) {
-        XMEMCPY(sigforSCE, &sig[offset+1], rs_size);
-
-        offset = 0x25;
-        /* s */
-        if (sig[offset] == 0x20) {
-          XMEMCPY(&sigforSCE[rs_size], &sig[offset+1], rs_size);
-        }
-        else {
-          XMEMCPY(&sigforSCE[rs_size], &sig[offset+2], rs_size);
-        }
-    }
-    else {
-        XMEMCPY(sigforSCE, &sig[offset+2], rs_size);
-
-        offset = 0x26;
-        /* s */
-        if (sig[offset] == rs_size) {
-          XMEMCPY(&sigforSCE[rs_size], &sig[offset+1], rs_size);
-        }
-        else {
-          XMEMCPY(&sigforSCE[rs_size], &sig[offset+2], rs_size);
-        }
-    }
-
-    ret = SCE_ServerKeyExVerify(2, ssl, sigforSCE, 64, ctx);
-
-    if (sigforSCE)
-        XFREE(sigforSCE, NULL, DYNAMIC_TYPE_TEMP);
-
-    if (ret == WOLFSSL_SUCCESS) {
-        *result = 1;
-        ret = 0; /* for success */
-    }
-    else
-        *result = 0;
-
-    WOLFSSL_PKMSG("SCE ECC Verify: ret %d, result %d\n", ret, *result);
-
-    return ret;
-}
-
-/* Callback for ECC shared secret */
-WOLFSSL_LOCAL int SCE_EccSharedSecret(WOLFSSL* ssl, ecc_key* otherKey,
-        uint8_t* pubKeyDer, unsigned int* pubKeySz,
-        uint8_t* out, unsigned int* outlen, int side, void* ctx)
-{
-    int       ret = WOLFSSL_SUCCESS;
-    (void) otherKey;
-    (void) side;
-
-    User_SCEPKCbInfo* cbInfo = (User_SCEPKCbInfo*)ctx;
-
-    (void)ssl;
-    (void)cbInfo;
-
-    /* sanity check */
-    if (ssl == NULL || pubKeyDer == NULL || pubKeySz == NULL ||
-        out == NULL || outlen == NULL || ctx == NULL)
-      return WOLFSSL_FAILURE;
-
-    WOLFSSL_PKMSG("PK ECC PMS: Side %s, Peer Curve %d\n",
-        side == WOLFSSL_CLIENT_END ? "client" : "server", otherKey->dp->id);
-
-    if (cbInfo->keyflgs_tls.bits.pk_key_set == 1) {
-        if ((ret = wc_sce_hw_lock()) == 0) {
-            /* Generate ECC public key pair */
-            ret = R_SCE_TLS_ECC_secp256r1_EphemeralWrappedKeyPairGenerate(
-                &cbInfo->ecc_p256_wrapped_key,
-                (uint8_t*)&cbInfo->ecc_ecdh_public_key/* Qx 32 bytes and Qy 32 bytes*/ );
-            if (ret != FSP_SUCCESS) {
-                WOLFSSL_PKMSG("Failed secp256r1_EphemeralWrappedKeyPairGenerate %d\n", ret);
-                return ret;
-            }
-
-            /* copy the generated ECDH public key into the buffer */
-            pubKeyDer[0] = ECC_POINT_UNCOMP;
-            *pubKeySz = 1 + sizeof(cbInfo->ecc_ecdh_public_key);
-            XMEMCPY(&pubKeyDer[1], &cbInfo->ecc_ecdh_public_key,
-                        sizeof(cbInfo->ecc_ecdh_public_key));
-
-            /* Generate Premaster Secret */
-            ret = R_SCE_TLS_PreMasterSecretGenerateForECC_secp256r1(
-                        (uint32_t*)&cbInfo->encrypted_ephemeral_ecdh_public_key,
-                        &cbInfo->ecc_p256_wrapped_key,
-                        (uint32_t*)out/* pre-master secret 64 bytes */);
-            if (ret != FSP_SUCCESS) {
-                WOLFSSL_PKMSG("Failed PreMasterSecretGenerateForECC_secp256r1 %d\n", ret);
-                return ret;
-            }
-            else {
-                /* set master secret generation callback for use */
-                wolfSSL_CTX_SetGenMasterSecretCb(ssl->ctx, Renesas_cmn_genMasterSecret);
-                wolfSSL_SetGenMasterSecretCtx(ssl, cbInfo);
-            }
-        }
-        else {
-            WOLFSSL_MSG("Failed to lock sce hw ");
-        }
-
-        /* unlock hw */
-        wc_sce_hw_unlock();
-
-        *outlen = 64;
-        WOLFSSL_PKMSG("PK ECC PMS: ret %d, PubKeySz %d, OutLen %d\n", ret, *pubKeySz, *outlen);
-    }
-
-    return ret;
-}
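
Both the verify and shared-secret paths above serialize P-256 points in the uncompressed SEC1 form the SCE driver expects: a 0x04 tag byte followed by the 32-byte Qx and Qy coordinates (the verify path additionally left-pads with three zero bytes to word-align the buffer). A small sketch of that encoding; the helper name is illustrative:

    #include <string.h>
    #include <stdint.h>

    #define P256_COORD_SZ 32  /* bytes per P-256 coordinate */

    /* Encode an uncompressed SEC1 point: 04 || Qx || Qy (65 bytes). */
    static void encode_uncompressed_point(
            uint8_t out[1 + 2 * P256_COORD_SZ],
            const uint8_t qx[P256_COORD_SZ],
            const uint8_t qy[P256_COORD_SZ])
    {
        out[0] = 0x04;  /* ECC_POINT_UNCOMP */
        memcpy(&out[1], qx, P256_COORD_SZ);
        memcpy(&out[1 + P256_COORD_SZ], qy, P256_COORD_SZ);
    }
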
-#endif
-
-/* Return the SCE library's enumeration for the given TLS cipher suite */
-static uint32_t GetSceCipherSuite(
-                    uint8_t cipherSuiteFirst,
-                    uint8_t cipherSuite)
-{
-    WOLFSSL_ENTER("GetSceCipherSuite");
-    uint32_t sceCipher;
-
-    if (cipherSuiteFirst == CIPHER_BYTE)
-    {
-        switch(cipherSuite) {
-            case TLS_RSA_WITH_AES_128_CBC_SHA256:
-                sceCipher = SCE_TLS_RSA_WITH_AES_128_CBC_SHA256;
-                break;
-
-            case TLS_RSA_WITH_AES_256_CBC_SHA256:
-                sceCipher = SCE_TLS_RSA_WITH_AES_256_CBC_SHA256;
-                break;
-
-            default:
-                sceCipher = (uint32_t)WOLFSSL_SCE_ILLEGAL_CIPHERSUITE;
-                break;
-        }
-        WOLFSSL_MSG("<< GetSceCipherSuite");
-        return sceCipher;
-    }
-    else if (cipherSuiteFirst == ECC_BYTE)
-    {
-        sceCipher = (uint32_t)WOLFSSL_SCE_ILLEGAL_CIPHERSUITE;
-        switch (cipherSuite) {
-
-            case TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256:
-                sceCipher = SCE_TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256;
-                break;
-
-            case TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256:
-                sceCipher = SCE_TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256;
-                break;
-
-            case TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256:
-                sceCipher = SCE_TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256;
-                break;
-
-            case TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256:
-                sceCipher = SCE_TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256;
-                break;
-
-            default:
-                sceCipher = (uint32_t)WOLFSSL_SCE_ILLEGAL_CIPHERSUITE;
-                break;
-        }
-    }
-    else{
-        sceCipher = (uint32_t)WOLFSSL_SCE_ILLEGAL_CIPHERSUITE;
-    }
-
-    WOLFSSL_MSG("<< GetSceCipherSuite");
-
-    return sceCipher;
-}
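
For reference, the two selector bytes tested in this function are the standard TLS cipher-suite code points: CIPHER_BYTE (0x00) prefixes the plain RSA suites and ECC_BYTE (0xC0) the ECDHE suites. A standalone table of the suites this function maps, using the IANA assignments (values from the TLS registry, not taken from this file):

    #include <stdio.h>

    /* Standard IANA TLS cipher-suite code points for the mapped suites. */
    struct suite { unsigned char b0, b1; const char* name; };

    static const struct suite sce_suites[] = {
        { 0x00, 0x3C, "TLS_RSA_WITH_AES_128_CBC_SHA256" },
        { 0x00, 0x3D, "TLS_RSA_WITH_AES_256_CBC_SHA256" },
        { 0xC0, 0x23, "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256" },
        { 0xC0, 0x27, "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256" },
        { 0xC0, 0x2B, "TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256" },
        { 0xC0, 0x2F, "TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256" },
    };

    int main(void)
    {
        unsigned i;
        for (i = 0; i < sizeof(sce_suites) / sizeof(sce_suites[0]); i++)
            printf("{0x%02X,0x%02X} %s\n", sce_suites[i].b0,
                   sce_suites[i].b1, sce_suites[i].name);
        return 0;
    }
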
-/* check if SCE TLS functions can be used for the cipher suite  */
-/*                                                              */
-/* ssl     : a pointer to the WOLFSSL object                    */
-/* session_key_generated : whether the session key has been     */
-/*                         generated                            */
-/* return  1 for usable, 0 for unusable                         */
-WOLFSSL_LOCAL int wc_sce_usable(const WOLFSSL *ssl,
-                                                uint8_t session_key_generated)
-{
-    WOLFSSL_ENTER("sce_usable");
-    uint32_t sceCipher;
-    byte side;
-    const Ciphers *enc;
-    const Ciphers *dec;
-
-    /* sanity check */
-    if (ssl == NULL)
-        return BAD_FUNC_ARG;
-
-    /* when the RSA key index == NULL, SCE wasn't used for cert          */
-    /* verification; in that case, we cannot use SCE for the session.    */
-    if (!ssl->peerSceTsipEncRsaKeyIndex)
-        return 0;
-
-    /* when Extended Master Secret is enabled, we cannot use SCE.        */
-    if (ssl->options.haveEMS)
-        return 0;
-
-    /* when session_key_generated is set to 1, the session key is        */
-    /* expected to have been created already.                            */
-    if (session_key_generated) {
-        enc = &ssl->encrypt;
-        dec = &ssl->decrypt;
-        if (enc == NULL || dec == NULL) {
-            /* something wrong */
-            return 0;
-        }
-        if (enc->aes == NULL || dec->aes == NULL) {
-            return 0;
-        }
-        if (enc->aes->ctx.setup == 0) {
-            /* session key for SCE is not created */
-            return 0;
-        }
-    }
-    /* check whether SCE supports the negotiated cipher suite */
-    sceCipher = GetSceCipherSuite(ssl->options.cipherSuite0,
-                                     ssl->options.cipherSuite);
-    side = ssl->options.side;
-
-    if (sceCipher != (uint32_t)WOLFSSL_SCE_ILLEGAL_CIPHERSUITE 
-                                   && side == WOLFSSL_CLIENT_END)
-        return 1;
-    else
-        return 0;
-}
-
-/* Generate HMAC using SHA-256 */
-WOLFSSL_LOCAL int wc_sce_Sha256GenerateHmac(const WOLFSSL *ssl,const uint8_t* myInner,
-        uint32_t innerSz,const uint8_t* in, uint32_t sz, byte* digest)
-{
-    WOLFSSL_ENTER("sce_Sha256HmacGenerate");
-
-    sce_hmac_sha_handle_t _handle;
-    sce_hmac_sha_wrapped_key_t wrapped_key;
-    int ret;
-
-    if ((ssl == NULL) || (myInner == NULL) || (in == NULL) ||
-        (digest == NULL))
-      return BAD_FUNC_ARG;
-
-    wrapped_key = ssl->keys.sce_client_write_MAC_secret;
-
-    if ((ret = wc_sce_hw_lock()) != 0) {
-        WOLFSSL_MSG("hw lock failed");
-        return ret;
-    }
-
-    ret = R_SCE_SHA256HMAC_GenerateInit(
-                &_handle,
-                &wrapped_key);
-
-    if (ret == FSP_SUCCESS)
-        ret = R_SCE_SHA256HMAC_GenerateUpdate(
-                &_handle,
-                (uint8_t*)myInner,
-                innerSz);
-
-    if (ret == FSP_SUCCESS)
-        ret = R_SCE_SHA256HMAC_GenerateUpdate(
-                &_handle,
-                (uint8_t*)in,
-                sz);
-
-    if (ret == FSP_SUCCESS)
-        ret = R_SCE_SHA256HMAC_GenerateFinal(
-                &_handle,
-                digest);
-
-    if (ret != FSP_SUCCESS)
-        ret = WOLFSSL_FAILURE;
-
-    /* unlock hw */
-    wc_sce_hw_unlock();
-    WOLFSSL_LEAVE("sce_Sha256HmacGenerate", ret);
-    return ret;
-}
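
The generate path above follows the SCE driver's three-phase shape: Init binds the wrapped MAC key, Update absorbs first the TLS inner header and then the record body, and Final emits the 32-byte digest. A compact sketch of the same Init/Update/Final pattern over a single buffer, assuming the R_SCE_SHA256HMAC_* API used above (hardware locking via wc_sce_hw_lock()/wc_sce_hw_unlock() omitted for brevity):

    /* Sketch: one-shot HMAC-SHA256 over msg with an SCE wrapped key. */
    static int sce_hmac_sha256_oneshot(sce_hmac_sha_wrapped_key_t* key,
                                       const uint8_t* msg, uint32_t msgSz,
                                       uint8_t* digest /* 32 bytes */)
    {
        sce_hmac_sha_handle_t handle;
        int ret = R_SCE_SHA256HMAC_GenerateInit(&handle, key);
        if (ret == FSP_SUCCESS)
            ret = R_SCE_SHA256HMAC_GenerateUpdate(&handle, (uint8_t*)msg, msgSz);
        if (ret == FSP_SUCCESS)
            ret = R_SCE_SHA256HMAC_GenerateFinal(&handle, digest);
        return (ret == FSP_SUCCESS) ? 0 : WOLFSSL_FAILURE;
    }
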
-
-/* Verify HMAC */
-WOLFSSL_LOCAL int wc_sce_Sha256VerifyHmac(const WOLFSSL *ssl,
-        const uint8_t* message, uint32_t messageSz,
-        uint32_t macSz, uint32_t content)
-{
-    WOLFSSL_ENTER("sce_Sha256HmacVerify");
-
-    sce_hmac_sha_handle_t _handle;
-    sce_hmac_sha_wrapped_key_t wrapped_key;
-    byte   myInner[WOLFSSL_TLS_HMAC_INNER_SZ];
-    int ret;
-
-    if ((ssl == NULL) || (message == NULL))
-        return BAD_FUNC_ARG;
-
-    wrapped_key = ssl->keys.sce_server_write_MAC_secret;
-
-    if ((ret = wc_sce_hw_lock()) != 0) {
-        WOLFSSL_MSG("hw lock failed");
-        return ret;
-    }
-
-    wolfSSL_SetTlsHmacInner((WOLFSSL*)ssl, myInner,
-                            (word32)messageSz, (int)content, 1);
-
-    ret = R_SCE_SHA256HMAC_VerifyInit(
-                &_handle,
-                &wrapped_key);
-
-    if (ret == FSP_SUCCESS)
-        ret = R_SCE_SHA256HMAC_VerifyUpdate(
-                &_handle,
-                (uint8_t*)myInner,
-                WOLFSSL_TLS_HMAC_INNER_SZ);
-
-    if (ret == FSP_SUCCESS)
-        ret = R_SCE_SHA256HMAC_VerifyUpdate(
-                &_handle,
-                (uint8_t*)message,
-                (uint32_t)messageSz);
-
-    if (ret == FSP_SUCCESS)
-        ret = R_SCE_SHA256HMAC_VerifyFinal(
-                &_handle,
-                (uint8_t*)(message+messageSz),
-                (uint32_t)macSz);
-
-    if (ret != FSP_SUCCESS) {
-        WOLFSSL_MSG("SCE Mac verification failed");
-    }
-
-    /* unlock hw */
-    wc_sce_hw_unlock();
-    WOLFSSL_LEAVE("sce_Sha256HmacVerify", ret);
-    return ret;
-}
-
-/* generate Verify Data based on master secret */
-WOLFSSL_LOCAL int wc_sce_generateVerifyData(const uint8_t *ms, /* master secret */
-                           const uint8_t *side, const uint8_t *handshake_hash,
-                           uint8_t *hashes /* out */)
-{
-    WOLFSSL_ENTER("sce_generateVerifyData");
-    int ret;
-    uint32_t l_side = SCE_TLS_GENERATE_CLIENT_VERIFY;
-
-    if ((ms == NULL) || (side == NULL) || (handshake_hash == NULL) ||
-        (hashes == NULL))
-      return BAD_FUNC_ARG;
-
-    if (XSTRNCMP((const char*)side, (const char*)kTlsServerFinStr,
-                                                FINISHED_LABEL_SZ) == 0)
-    {
-        l_side = SCE_TLS_GENERATE_SERVER_VERIFY;
-    }
-
-    if ((ret = wc_sce_hw_lock()) == 0) {
-        ret = R_SCE_TLS_VerifyDataGenerate(l_side, (uint32_t*)ms,
-                       (uint8_t*)handshake_hash, hashes/* out */);
-        if (ret != FSP_SUCCESS) {
-            WOLFSSL_MSG("R_SCE_TLS_VerifyDataGenerate failed");
-        }
-    }
-    /* unlock hw */
-    wc_sce_hw_unlock();
-    WOLFSSL_LEAVE("sce_generateVerifyData", ret);
-    return ret;
-}
-
-/* generate keys for TLS communication */
-WOLFSSL_LOCAL int wc_sce_generateSessionKey(WOLFSSL *ssl,
-                User_SCEPKCbInfo* cbInfo, int devId)
-{
-    WOLFSSL_MSG("sce_generateSessionKey()");
-    int ret;
-    Ciphers *enc;
-    Ciphers *dec;
-    sce_hmac_sha_wrapped_key_t key_client_mac;
-    sce_hmac_sha_wrapped_key_t key_server_mac;
-    sce_aes_wrapped_key_t key_client_aes;
-    sce_aes_wrapped_key_t key_server_aes;
-    uint32_t sceCS = GetSceCipherSuite(ssl->options.cipherSuite0,
-                                         ssl->options.cipherSuite);
-
-    if (ssl== NULL || cbInfo == NULL)
-      return BAD_FUNC_ARG;
-
-
-    if ((ret = wc_sce_hw_lock()) == 0) {
-        if (sceCS == TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256 ||
-                sceCS == TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256)
-        {
-            WOLFSSL_MSG("Session key for AES-GCM generation skipped.");
-
-            wolfSSL_KeepArrays(ssl);
-            ret = FSP_SUCCESS;
-
-        }
-        else {
-            ret = R_SCE_TLS_SessionKeyGenerate(
-                    GetSceCipherSuite(
-                        ssl->options.cipherSuite0,
-                        ssl->options.cipherSuite),
-                    (uint32_t*)ssl->arrays->sce_masterSecret,
-                    (uint8_t*) ssl->arrays->clientRandom,
-                    (uint8_t*) ssl->arrays->serverRandom,
-                    NULL,
-                    &key_client_mac,
-                    &key_server_mac,
-                    &key_client_aes,
-                    &key_server_aes,
-                    NULL, NULL);
-        }
-
-        if (ret != FSP_SUCCESS) {
-            WOLFSSL_MSG("R_SCE_TLS_SessionKeyGenerate failed");
-        }
-        else {
-            /* succeeded creating session keys */
-            /* alloc aes instance for both enc and dec */
-            enc = &ssl->encrypt;
-            dec = &ssl->decrypt;
-
-            if (enc) {
-                if (enc->aes == NULL) {
-                    enc->aes = (Aes*)XMALLOC(sizeof(Aes), ssl->heap,
-                                                    DYNAMIC_TYPE_CIPHER);
-                    if (enc->aes == NULL)
-                        return MEMORY_E;
-                }
-
-                XMEMSET(enc->aes, 0, sizeof(Aes));
-            }
-            if (dec) {
-                if (dec->aes == NULL) {
-                    dec->aes = (Aes*)XMALLOC(sizeof(Aes), ssl->heap,
-                                                    DYNAMIC_TYPE_CIPHER);
-                    if (dec->aes == NULL) {
-                        if (enc) {
-                            XFREE(enc->aes, ssl->heap, DYNAMIC_TYPE_CIPHER);
-                        }
-                        return MEMORY_E;
-                    }
-                }
-
-                XMEMSET(dec->aes, 0, sizeof(Aes));
-            }
-            /* copy key index into aes */
-            if (ssl->options.side == PROVISION_CLIENT) {
-                XMEMCPY(&enc->aes->ctx.sce_wrapped_key, &key_client_aes,
-                                                    sizeof(key_client_aes));
-                XMEMCPY(&dec->aes->ctx.sce_wrapped_key, &key_server_aes,
-                                                    sizeof(key_server_aes));
-            }
-            else {
-                XMEMCPY(&enc->aes->ctx.sce_wrapped_key, &key_server_aes,
-                                                    sizeof(key_server_aes));
-                XMEMCPY(&dec->aes->ctx.sce_wrapped_key, &key_client_aes,
-                                                    sizeof(key_client_aes));
-            }
-            /* copy mac key index into keys */
-            ssl->keys.sce_client_write_MAC_secret = key_client_mac;
-            ssl->keys.sce_server_write_MAC_secret = key_server_mac;
-
-            /* set up key size and mark as ready */
-            if (enc) {
-                enc->aes->ctx.keySize = ssl->specs.key_size;
-                enc->aes->ctx.setup = 1;
-                /* ready for use */
-                enc->setup = 1;
-            }
-            /* set up key size and mark as ready */
-            if (dec) {
-                dec->aes->ctx.keySize = ssl->specs.key_size;
-                dec->aes->ctx.setup = 1;
-                /* ready for use */
-                dec->setup = 1;
-            }
-
-            if (cbInfo->sce_cipher == SCE_TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256 ||
-                cbInfo->sce_cipher == SCE_TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256) {
-                enc->aes->nonceSz = AEAD_MAX_IMP_SZ;
-                dec->aes->nonceSz = AEAD_MAX_IMP_SZ;
-            }
-            enc->aes->devId = devId;
-            dec->aes->devId = devId;
-
-            /* marked as session key is set */
-            cbInfo->keyflgs_tls.bits.session_key_set = 1;
-        }
-        /* unlock hw */
-        wc_sce_hw_unlock();
-    }
-    else {
-        WOLFSSL_LEAVE("hw lock failed", ret);
-    }
-
-    WOLFSSL_LEAVE("sce_generateSessionKey", ret);
-    return ret;
-}
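
One detail worth isolating from the function above is key orientation: the encrypt direction always receives this endpoint's own write key and the decrypt direction the peer's, so the client and server branches are mirror images. A hedged sketch of just that selection rule, using the wrapped-key type from the code above:

    /* Sketch: encrypt with our write key, decrypt with the peer's. */
    static void sce_pick_session_keys(int isClient,
                                      sce_aes_wrapped_key_t* clientKey,
                                      sce_aes_wrapped_key_t* serverKey,
                                      sce_aes_wrapped_key_t** encKey,
                                      sce_aes_wrapped_key_t** decKey)
    {
        if (isClient) {
            *encKey = clientKey;
            *decKey = serverKey;
        }
        else {
            *encKey = serverKey;
            *decKey = clientKey;
        }
    }
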
-
-/* generate master secret based on pre-master which is generated by SCE */
-WOLFSSL_LOCAL int wc_sce_generateMasterSecret(
-        uint8_t        cipherSuiteFirst,
-        uint8_t        cipherSuite,
-        const uint8_t *pr, /* pre-master    */
-        const uint8_t *cr, /* client random */
-        const uint8_t *sr, /* server random */
-        uint8_t *ms)
-{
-    WOLFSSL_ENTER("sce_generateMasterSecretEx");
-    int ret;
-
-    if ((pr == NULL) || (cr == NULL) || (sr == NULL) ||
-        (ms == NULL))
-      return BAD_FUNC_ARG;
-
-    uint32_t sceCS = GetSceCipherSuite(cipherSuiteFirst, cipherSuite);
-    if (sceCS == (uint32_t)WOLFSSL_SCE_ILLEGAL_CIPHERSUITE)
-        return BAD_FUNC_ARG;
-
-    if ((ret = wc_sce_hw_lock()) == 0) {
-        ret = R_SCE_TLS_MasterSecretGenerate(
-            sceCS,
-            (uint32_t*)pr,
-            (uint8_t*)cr, (uint8_t*)sr, (uint32_t*)ms);
-        if (ret != FSP_SUCCESS) {
-            WOLFSSL_MSG("R_SCE_TLS_MasterSecretGenerate failed");
-        }
-        /* unlock hw */
-        wc_sce_hw_unlock();
-    }
-    else {
-        WOLFSSL_MSG(" hw lock failed ");
-    }
-    WOLFSSL_LEAVE("sce_generateMasterSecret", ret);
-    return ret;
-}
-
-/* generate pre-master secret by SCE */
-WOLFSSL_LOCAL int wc_sce_generatePremasterSecret(uint8_t *premaster,
-                                                        uint32_t preSz)
-{
-    WOLFSSL_ENTER("sce_generatePremasterSecret");
-    int ret;
-
-    if (premaster == NULL)
-      return BAD_FUNC_ARG;
-
-    if ((ret = wc_sce_hw_lock()) == 0 && preSz >=
-                                    (SCE_TLS_MASTER_SECRET_WORD_SIZE*4)) {
-            /* generate pre-master, 80 bytes */
-            ret = R_SCE_TLS_PreMasterSecretGenerateForRSA2048((uint32_t*)premaster);
-            if (ret != FSP_SUCCESS) {
-                WOLFSSL_MSG(" R_SCE_TLS_GeneratePreMasterSecret failed");
-            }
-        /* unlock hw */
-        wc_sce_hw_unlock();
-    }
-    else {
-        WOLFSSL_MSG(" hw lock failed or preSz is smaller than 80");
-    }
-
-    WOLFSSL_LEAVE("sce_generatePremasterSecret", ret);
-    return ret;
-}
-
-
-/*
-* generate encrypted pre-master secret by SCE
-*/
-WOLFSSL_LOCAL int wc_sce_generateEncryptPreMasterSecret(
-        WOLFSSL*    ssl,
-        uint8_t*       out,
-        uint32_t*     outSz)
-{
-    WOLFSSL_MSG("sce_generateEncryptPreMasterSecret");
-    int ret;
-
-    if ((ssl == NULL) || (out == NULL) || (outSz == NULL))
-      return BAD_FUNC_ARG;
-
-    if ((ret = wc_sce_hw_lock()) == 0) {
-        if (*outSz >= 256)
-            ret = R_SCE_TLS_PreMasterSecretEncryptWithRSA2048(
-                        (uint32_t*)ssl->peerSceTsipEncRsaKeyIndex,
-                        (uint32_t*)ssl->arrays->preMasterSecret,
-                        (uint8_t*)out);
-        else
-            ret = -1;
-
-        if (ret != FSP_SUCCESS) {
-            WOLFSSL_MSG("R_SCE_TLS_PreMasterSecretEncryptWithRSA2048 failed");
-        }
-        else {
-            *outSz = 256; /* SCE can only handle RSA-2048 */
-            /* set GenMaster Callback for Master secret generation */
-            void* ctx = wolfSSL_GetRsaVerifyCtx(ssl);
-            wolfSSL_CTX_SetGenMasterSecretCb(ssl->ctx,
-                                                Renesas_cmn_genMasterSecret);
-            wolfSSL_SetGenMasterSecretCtx(ssl, ctx);
-        }
-        wc_sce_hw_unlock();
-
-    }
-    else {
-        WOLFSSL_MSG(" hw lock failed ");
-    }
-    WOLFSSL_LEAVE("sce_generateEncryptPreMasterSecret", ret);
-    return ret;
-}
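
Because the SCE wraps only RSA-2048 here, the encrypted pre-master secret is always exactly 256 bytes, which is what the *outSz >= 256 check and the assignment after a successful call both reflect. A hypothetical call-site sketch:

    /* Sketch: encPre receives the RSA-2048-encrypted pre-master secret;
     * encPreSz is set to 256 by the callee on success. */
    static int app_encrypt_premaster(WOLFSSL* ssl, uint8_t encPre[256])
    {
        uint32_t encPreSz = 256;
        return wc_sce_generateEncryptPreMasterSecret(ssl, encPre, &encPreSz);
    }
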
-
-
-/* Certificate verification by SCE */
-WOLFSSL_LOCAL int wc_sce_tls_CertVerify(
-        const uint8_t* cert,       uint32_t certSz,
-        const uint8_t* signature,  uint32_t sigSz,
-        uint32_t      key_n_start, uint32_t key_n_len,
-        uint32_t      key_e_start, uint32_t key_e_len,
-        uint8_t*      sce_encPublickey)
-{
-    WOLFSSL_ENTER("sce_tls_CertVerify");
-    int ret;
-    uint8_t *sigforSCE;
-    uint8_t *pSig;
-    const byte rs_size = 0x20;
-    byte offset = 0x3;
-    (void)sigSz;
-
-    sigforSCE = NULL;
-    pSig = NULL;
-
-    if (cert == NULL)
-      return BAD_FUNC_ARG;
-
-    if (!signature) {
-        WOLFSSL_MSG(" signature for ca verification is not set");
-        return -1;
-    }
-    if (!sce_encPublickey) {
-        WOLFSSL_MSG(" sce_encPublickey is NULL.");
-        return -1;
-    }
-
-    if (g_user_key_info.encrypted_user_tls_key_type ==
-                                SCE_TLS_PUBLIC_KEY_TYPE_ECDSA_P256/*ECDSA*/) {
-
-      if ((sigforSCE = (uint8_t*)XMALLOC(HW_SCE_ECDSA_DATA_BYTE_SIZE, NULL,
-                                                  DYNAMIC_TYPE_TEMP)) == NULL) {
-        WOLFSSL_MSG("failed to malloc memory");
-        return MEMORY_E;
-      }
-      /* initialization */
-      XMEMSET(sigforSCE, 0, HW_SCE_ECDSA_DATA_BYTE_SIZE);
-
-      if (signature[offset] == 0x20) {
-        XMEMCPY(sigforSCE, &signature[offset+1], rs_size);
-
-        offset = 0x25;
-        if (signature[offset] == 0x20) {
-          XMEMCPY(&sigforSCE[rs_size], &signature[offset+1], rs_size);
-        }
-        else {
-          XMEMCPY(&sigforSCE[rs_size], &signature[offset+2], rs_size);
-        }
-      }
-      else {
-        XMEMCPY(sigforSCE, &signature[offset+2], rs_size);
-        offset = 0x26;
-
-        if (signature[offset] == rs_size) {
-          XMEMCPY(&sigforSCE[rs_size], &signature[offset+1], rs_size);
-        }
-        else {
-          XMEMCPY(&sigforSCE[rs_size], &signature[offset+2], rs_size);
-        }
-      }
-      pSig = sigforSCE;
-    }
-    else {
-      pSig = (uint8_t*)signature;
-    }
-
-
-    if ((ret = wc_sce_hw_lock()) == 0) {
-        ret = R_SCE_TLS_CertificateVerify(
-                g_user_key_info.encrypted_user_tls_key_type,
-                (uint32_t*)g_encrypted_publicCA_key,/* encrypted public key  */
-                (uint8_t*)cert,                    /* certificate der        */
-                certSz,                            /* length of der          */
-                (uint8_t*)pSig,                 /* sign data by RSA PSS   */
-                key_n_start,  /* start position of public key n in bytes     */
-                (key_n_start + key_n_len),     /* length of the public key n */
-                key_e_start,                   /* start pos, key e in bytes  */
-                (key_e_start + key_e_len),     /* length of the public key e */
-                (uint32_t*)sce_encPublickey    /* returned encrypted key     */
-                );
-        if (ret != FSP_SUCCESS) {
-            WOLFSSL_MSG(" R_TSIP_TlsCertificateVerification() failed");
-        }
-        if (sigforSCE) {
-          XFREE(sigforSCE, NULL, DYNAMIC_TYPE_TEMP);
-        }
-        wc_sce_hw_unlock();
-    }
-    else {
-        WOLFSSL_MSG(" hw lock failed ");
-    }
-    WOLFSSL_LEAVE("sce_tls_CertVerify", ret);
-    return ret;
-}
-
-/* Root Certificate verification */
-WOLFSSL_LOCAL int wc_sce_tls_RootCertVerify(
-        const uint8_t* cert,        uint32_t cert_len,
-        uint32_t      key_n_start,    uint32_t key_n_len,
-        uint32_t      key_e_start,    uint32_t key_e_len,
-        uint32_t      cm_row)
-{
-    int ret;
-    /* call to generate encrypted public key for certificate verification */
-    uint8_t *signature = (uint8_t*)ca_cert_sig;
-
-    WOLFSSL_ENTER("wc_sce_tls_RootCertVerify");
-
-    if (cert == NULL)
-      return BAD_FUNC_ARG;
-
-    if (!signature) {
-        WOLFSSL_MSG(" signature for ca verification is not set");
-        return -1;
-    }
-
-    if ((ret = wc_sce_hw_lock()) == 0) {
-        ret = R_SCE_TLS_RootCertificateVerify(
-                g_user_key_info.encrypted_user_tls_key_type,
-                (uint8_t*)cert,             /* CA cert */
-                (uint32_t)cert_len,         /* length of CA cert */
-                key_n_start,                /* Byte position of public key */
-                (key_n_start + key_n_len),
-                key_e_start,
-                (key_e_start + key_e_len),
-                (uint8_t*)ca_cert_sig,      /* RSA 2048 PSS with SHA256 */
-                g_encrypted_publicCA_key);  /* RSA-2048 public key 560 bytes */
-                                            /* ECDSA 96 bytes */
-        if (ret != FSP_SUCCESS) {
-            WOLFSSL_MSG(" R_SCE_TLS_RootCertificateVerify() failed");
-        }
-        else {
-            g_CAscm_Idx = cm_row;
-        }
-        wc_sce_hw_unlock();
-    }
-    else {
-        WOLFSSL_MSG(" hw lock failed ");
-    }
-    WOLFSSL_LEAVE("wc_sce_tls_RootCertVerify", ret);
-    return ret;
-}
-
-/*  store elements for session key generation into ssl->keys.
- *  return 0 on success, negative value on failure
- */
-WOLFSSL_LOCAL int wc_sce_storeKeyCtx(WOLFSSL* ssl, User_SCEPKCbInfo* info)
-{
-    int ret = 0;
-
-    WOLFSSL_ENTER("sce_storeKeyCtx");
-
-    if (ssl == NULL || info == NULL)
-        ret = BAD_FUNC_ARG;
-
-    if (ret == 0) {
-        XMEMCPY(info->sce_masterSecret, ssl->arrays->sce_masterSecret,
-                                                SCE_TLS_MASTERSECRET_SIZE);
-        XMEMCPY(info->sce_clientRandom, ssl->arrays->clientRandom, 32);
-        XMEMCPY(info->sce_serverRandom, ssl->arrays->serverRandom, 32);
-
-        info->sce_cipher = (uint8_t)GetSceCipherSuite(ssl->options.cipherSuite0,
-                               ssl->options.cipherSuite);
-    }
-    WOLFSSL_LEAVE("sce_storeKeyCtx", ret);
-    return ret;
-}
-
-/* inform wolfSSL of the CA certificate signature             */
-/* the signature format expects RSA-2048 PSS with SHA-256     */
-WOLFSSL_API void wc_sce_inform_cert_sign(const uint8_t *sign)
-{
-    if (sign)
-        ca_cert_sig = sign;
-}
-
-/* let wolfSSL know the user key information used in TLS operations by SCE */
-WOLFSSL_API void wc_sce_inform_user_keys(
-    uint8_t* encrypted_provisioning_key,
-    uint8_t* iv,
-    uint8_t* encrypted_user_tls_key,
-    uint32_t encrypted_user_tls_key_type)
-{
-    WOLFSSL_ENTER("sce_inform_user_keys");
-    g_user_key_info.encrypted_provisioning_key = NULL;
-    g_user_key_info.iv = NULL;
-    g_user_key_info.encrypted_user_tls_key = NULL;
-
-    if (encrypted_provisioning_key ) {
-        g_user_key_info.encrypted_provisioning_key = encrypted_provisioning_key;
-    }
-    if (iv) {
-        g_user_key_info.iv = iv;
-    }
-    if (encrypted_user_tls_key) {
-        g_user_key_info.encrypted_user_tls_key = encrypted_user_tls_key;
-    }
-
-    g_user_key_info.encrypted_user_tls_key_type = encrypted_user_tls_key_type;
-
-    WOLFSSL_MSG("sce_inform_user_keys");
-}
-
-
-/* Set callbacks needed for SCE TLS API handling */
-WOLFSSL_API void wc_sce_set_callbacks(WOLFSSL_CTX* ctx)
-{
-    wolfSSL_CTX_SetEccVerifyCb(ctx, Renesas_cmn_EccVerify);
-    wolfSSL_CTX_SetRsaVerifyCb(ctx, Renesas_cmn_RsaVerify);
-    wolfSSL_CTX_SetGenPreMasterCb(ctx, Renesas_cmn_generatePremasterSecret);
-    wolfSSL_CTX_SetRsaEncCb(ctx, Renesas_cmn_RsaEnc);
-    wolfSSL_CTX_SetVerifyMacCb(ctx, Renesas_cmn_VerifyHmac);
-
-    /* reset callbacks */
-    wolfSSL_CTX_SetEccSharedSecretCb(ctx, NULL);
-}
-/* Set callback contexts needed for SCE TLS API handling */
-WOLFSSL_API int wc_sce_set_callback_ctx(WOLFSSL* ssl, void* user_ctx)
-{
-    if (sce_sess_idx > MAX_SCE_CBINDEX) {
-        WOLFSSL_MSG("exceeds maximum session index");
-        return -1;
-    }
-    gSCE_PKCbInfo.user_PKCbInfo[sce_sess_idx] = (User_SCEPKCbInfo*)user_ctx;
-    gSCE_PKCbInfo.user_PKCbInfo[sce_sess_idx]->keyflgs_tls.bits.pk_key_set = 0;
-    gSCE_PKCbInfo.user_PKCbInfo[sce_sess_idx]->keyflgs_tls.bits.session_key_set = 0;
-
-    wolfSSL_SetEccVerifyCtx(ssl, user_ctx);
-    wolfSSL_SetRsaEncCtx(ssl, user_ctx);
-    wolfSSL_SetRsaVerifyCtx(ssl, user_ctx);
-    wolfSSL_SetGenPreMasterCtx(ssl, user_ctx);
-    wolfSSL_SetEccSharedSecretCtx(ssl, NULL);
-    wolfSSL_SetVerifyMacCtx(ssl, user_ctx);
-
-    /* set up crypt callback */
-    wc_CryptoCb_CryptInitRenesasCmn(ssl, user_ctx);
-
-    gSCE_PKCbInfo.num_session = ++sce_sess_idx;
-
-    return 0;
-}
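
The two functions above are the context-wide and per-session halves of SCE enablement: verify/encrypt callbacks are installed once on the WOLFSSL_CTX, while key flags and the crypt callback are bound per WOLFSSL session. A hedged end-to-end setup sketch using only APIs declared in this file; the key buffers and their provisioning are assumed to exist elsewhere in the application:

    /* Hypothetical setup sequence for SCE-assisted TLS. */
    static User_SCEPKCbInfo sceCbInfo;   /* one instance per session */

    static void app_setup_sce(WOLFSSL_CTX* ctx, WOLFSSL* ssl,
                              uint8_t* provisioningKey, uint8_t* iv,
                              uint8_t* userTlsKey, const uint8_t* caCertSig)
    {
        /* hand the wrapped user keys and the CA signature to the port */
        wc_sce_inform_user_keys(provisioningKey, iv, userTlsKey,
                                SCE_TLS_PUBLIC_KEY_TYPE_ECDSA_P256);
        wc_sce_inform_cert_sign(caCertSig);

        /* install PK callbacks on the context, then bind the session ctx */
        wc_sce_set_callbacks(ctx);
        wc_sce_set_callback_ctx(ssl, (void*)&sceCbInfo);
    }
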
-#endif /*  !WOLFSSL_RENESAS_SCEPROTECT_CRYPTONLY */
-
-#endif /* WOLFSSL_RENESAS_SCEPROTECT || WOLFSSL_RENESAS_SCEPROTECT_CRYPTONLY */

+ 0 - 963
lib/wolfssl/wolfcrypt/src/port/Renesas/renesas_tsip_aes.c

@@ -1,963 +0,0 @@
-/* renesas_tsip_aes.c
- *
- * Copyright (C) 2006-2023 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-
-#include <string.h>
-#include <stdio.h>
-
-#ifdef HAVE_CONFIG_H
-    #include <config.h>
-#endif
-#include <wolfssl/wolfcrypt/settings.h>
-
-#ifndef NO_AES
-
-#if defined(WOLFSSL_RENESAS_TSIP_CRYPT) && \
-    !defined(NO_WOLFSSL_RENESAS_TSIP_CRYPT_AES)
-
-#include <wolfssl/wolfcrypt/wc_port.h>
-#include <wolfssl/wolfcrypt/error-crypt.h>
-#include <wolfssl/internal.h>
-#include <wolfssl/wolfcrypt/aes.h>
-#include "wolfssl/wolfcrypt/port/Renesas/renesas-tsip-crypt.h"
-#ifdef NO_INLINE
-    #include <wolfssl/wolfcrypt/misc.h>
-#else
-    #define WOLFSSL_MISC_INCLUDED
-    #include <wolfcrypt/src/misc.c>
-#endif
-
-#define TSIP_AES_GCM_AUTH_TAG_SIZE  16
-
-typedef e_tsip_err_t (*aesGcmEncInitFn)
-        (tsip_gcm_handle_t*, tsip_aes_key_index_t*, uint8_t*, uint32_t);
-typedef e_tsip_err_t (*aesGcmEncUpdateFn)
-        (tsip_gcm_handle_t*,uint8_t*, uint8_t*, uint32_t, uint8_t*, uint32_t);
-typedef e_tsip_err_t (*aesGcmEncFinalFn)
-        (tsip_gcm_handle_t*, uint8_t*, uint32_t*, uint8_t*);
-typedef e_tsip_err_t (*aesGcmDecInitFn)
-        (tsip_gcm_handle_t*, tsip_aes_key_index_t*, uint8_t*, uint32_t);
-typedef e_tsip_err_t (*aesGcmDecUpdateFn)
-        (tsip_gcm_handle_t*,uint8_t*, uint8_t*, uint32_t, uint8_t*, uint32_t);
-typedef e_tsip_err_t (*aesGcmDecFinalFn)
-        (tsip_gcm_handle_t*, uint8_t*, uint32_t*, uint8_t*, uint32_t);
-
-
-
-/* function pointer type defs for TLSv13 handshake AES-GCM/CCM encryption */
-typedef e_tsip_err_t (*Tls13AesEncInitFn)
-        (tsip_tls13_handle_t*, e_tsip_tls13_phase_t, e_tsip_tls13_mode_t,
-         e_tsip_tls13_cipher_suite_t, tsip_aes_key_index_t*, uint32_t);
-typedef e_tsip_err_t (*Tls13AesEncUpdateFn)
-        (tsip_tls13_handle_t*, uint8_t*, uint8_t*, uint32_t);
-typedef e_tsip_err_t (*Tls13AesEncFinalFn)
-        (tsip_tls13_handle_t*, uint8_t*, uint32_t*);
-
-/* function pointer type defs for TLSv13 handshake AES-GCM/CCM decryption */
-typedef e_tsip_err_t (*Tls13AesDecInitFn)
-        (tsip_tls13_handle_t*, e_tsip_tls13_phase_t, e_tsip_tls13_mode_t,
-         e_tsip_tls13_cipher_suite_t, tsip_aes_key_index_t*, uint32_t);
-typedef e_tsip_err_t (*Tls13AesDecUpdateFn)
-        (tsip_tls13_handle_t*, uint8_t*, uint8_t*, uint32_t);
-typedef e_tsip_err_t (*Tls13AesDecFinalFn)
-        (tsip_tls13_handle_t*, uint8_t*, uint32_t*);
-
-
-
-
-
-/*  encrypt plain data.
- *  
- *  return cipher data size on success, negative value on failure.
- *         CRYPTOCB_UNAVAILABLE may be returned.   
- */
-WOLFSSL_LOCAL int tsip_Tls13AesEncrypt(
-                            struct WOLFSSL* ssl,
-                            byte* output,
-                            const byte* input,
-                            word16 sz)
-{
-    int ret = 0;
-    e_tsip_err_t    err = TSIP_SUCCESS;
-    TsipUserCtx*    tuc = NULL;
-    e_tsip_tls13_cipher_suite_t cs;
-    word32  cipher[(AES_BLOCK_SIZE + TSIP_AES_GCM_AUTH_TAG_SIZE) /
-                                                             sizeof(word32)];
-    word32  plain[AES_BLOCK_SIZE / sizeof(word32)];
-    int             idxIn,idxOut;
-    uint32_t        remain;
-    uint32_t        dataSz, finalSz;
-    e_tsip_tls13_phase_t phase;
-    tsip_aes_key_index_t* key = NULL;
-
-    WOLFSSL_ENTER("tsip_Tls13AesEncrypt");
-    
-    if ((ssl == NULL) || (input == NULL) || (output == NULL) || (sz == 0)) {
-        return BAD_FUNC_ARG;
-    } 
-
-    if (ssl->options.side != WOLFSSL_CLIENT_END) {
-        return CRYPTOCB_UNAVAILABLE;   /* expected to fall back to S/W */
-    }
-
-    /* get user context for TSIP */
-    tuc = ssl->RenesasUserCtx;    
-    if (tuc == NULL) {
-        WOLFSSL_MSG("TsipUserCtx hasn't been set to ssl.");
-        return CRYPTOCB_UNAVAILABLE;
-    }
-
-    /* select the appropriate encryption key and phase */
-    if (ssl->options.handShakeDone) {
-        if (!tuc->ClientWriteTrafficKey_set) {
-            WOLFSSL_MSG("TSIP wasn't involved in the key-exchange.");
-            return CRYPTOCB_UNAVAILABLE;
-        }
-
-        key = &(tuc->clientAppWriteKey13Idx);
-        phase = TSIP_TLS13_PHASE_APPLICATION;
-    }
-    else {
-        if (!tuc->HandshakeClientTrafficKey_set) {
-            WOLFSSL_MSG("TSIP wasn't involved in the key-exchange.");
-            return CRYPTOCB_UNAVAILABLE;
-        }
-
-        key = &(tuc->clientWriteKey13Idx);
-        phase = TSIP_TLS13_PHASE_HANDSHAKE;
-    }
-
-    /* select AES mode */
-    if (ssl->specs.bulk_cipher_algorithm == wolfssl_aes_gcm)
-        cs = TSIP_TLS13_CIPHER_SUITE_AES_128_GCM_SHA256;
-    else if (ssl->specs.bulk_cipher_algorithm == wolfssl_aes_ccm)
-        cs = TSIP_TLS13_CIPHER_SUITE_AES_128_CCM_SHA256;
-    else
-        return CRYPTOCB_UNAVAILABLE;
-
-    remain  = sz;
-    finalSz = 0;
-
-    if ((ret = tsip_hw_lock()) == 0) {
-
-        err = R_TSIP_Tls13EncryptInit(
-                                    &(tuc->handle13),
-                                    phase,
-                                    TSIP_TLS13_MODE_FULL_HANDSHAKE,
-                                    cs,
-                                    key,
-                                    sz);
-        
-        if (err != TSIP_SUCCESS) {
-            WOLFSSL_MSG("R_TSIP_Tls13DecryptUpdate error");
-            ret = WC_HW_E;
-        }
-
-        idxIn  = 0;
-        idxOut = 0;
-
-        while (err == TSIP_SUCCESS && remain > 0) {
-
-            dataSz = min(remain, AES_BLOCK_SIZE);
-            ForceZero(plain, sizeof(plain));
-            ForceZero(cipher, sizeof(cipher));
-            XMEMCPY(plain, input + idxIn, dataSz); 
-
-
-            err = R_TSIP_Tls13EncryptUpdate(
-                                    &(tuc->handle13),
-                                    (uint8_t*)plain,
-                                    (uint8_t*)cipher,
-                                    dataSz);
-            
-            if (err == TSIP_SUCCESS) {
-                if (dataSz >= AES_BLOCK_SIZE) {
-                    XMEMCPY(output + idxOut, cipher, dataSz);
-                    idxOut += dataSz;
-                }
-                idxIn  += dataSz;   
-                remain -= dataSz;
-            }
-            else {
-                WOLFSSL_MSG("R_TSIP_Tls13DecryptUpdate error");
-                ret = WC_HW_E;
-            }
-        }
-
-        if (err == TSIP_SUCCESS) {
-
-            ForceZero(cipher, sizeof(cipher));
-            /* R_TSIP_Tls13EncryptFinal outputs encrypted content and auth-data
-             * to the buffer.
-             */
-            err = R_TSIP_Tls13EncryptFinal(
-                                    &(tuc->handle13),
-                                    (uint8_t*)cipher,
-                                    &finalSz);          /* total output size */
-
-            if (err == TSIP_SUCCESS) {
-                XMEMCPY(output + idxOut, cipher, finalSz - idxOut);
-                ret = finalSz;
-            }
-            else {
-                WOLFSSL_MSG("R_TSIP_Tls13EncryptFinal error");
-                ret = WC_HW_E;
-            }
-        }
-        tsip_hw_unlock();
-    }
-
-    WOLFSSL_LEAVE("tsip_Tls13AesEncrypt", ret);
-    return ret;
-}
-
-
-
-
-/* decrypt encrypted handshake data for TLSv1.3
- * AES-GCM or AES-CCM can be used
- * return 0 on success, negative value on error.
- */
-WOLFSSL_LOCAL int tsip_Tls13AesDecrypt(
-                            struct WOLFSSL* ssl,
-                            byte* output,
-                            const byte* input,
-                            word16 sz)
-{
-    int ret = 0;
-    e_tsip_err_t    err = TSIP_SUCCESS;
-    TsipUserCtx*    tuc = NULL;
-    e_tsip_tls13_cipher_suite_t cs;
-    word32          cipher[AES_BLOCK_SIZE / sizeof(word32)];
-    word32          plain[AES_BLOCK_SIZE / sizeof(word32)];
-    int             idxIn,idxOut;
-    int             blocks;
-    uint32_t        remain,conRemain;
-    uint32_t        dataSz, finalSz;
-    e_tsip_tls13_phase_t     phase;
-    tsip_aes_key_index_t* key = NULL;
-
-    WOLFSSL_ENTER("tsip_Tls13AesDecrypt");
-
-    if ((ssl == NULL) || (input == NULL) || (output == NULL) || (sz == 0)) {
-        return BAD_FUNC_ARG;
-    } 
-
-    if (ssl->options.side != WOLFSSL_CLIENT_END) {
-        return CRYPTOCB_UNAVAILABLE;   /* expected to fall back to S/W */
-    }
-
-    /* get user context for TSIP */
-    tuc = ssl->RenesasUserCtx;    
-    if (tuc == NULL) {
-        WOLFSSL_MSG("TsipUserCtx hasn't been set to ssl.");
-        return CRYPTOCB_UNAVAILABLE;
-    }
-
-    /* select the appropriate encryption key and phase */
-    if (ssl->options.handShakeDone) {
-        if (!tuc->ServerWriteTrafficKey_set) {
-            WOLFSSL_MSG("TSIP wasn't involved in the key-exchange.");
-            return CRYPTOCB_UNAVAILABLE;
-        }
-
-        key = &(tuc->serverAppWriteKey13Idx);
-        phase = TSIP_TLS13_PHASE_APPLICATION;
-    }
-    else {
-        if (!tuc->HandshakeServerTrafficKey_set) {
-            WOLFSSL_MSG("TSIP wasn't involved in the key-exchange.");
-            return CRYPTOCB_UNAVAILABLE;
-        }
-
-        key = &(tuc->serverWriteKey13Idx);
-        phase = TSIP_TLS13_PHASE_HANDSHAKE;
-    }
-
-    /* select AES mode */
-    if (ssl->specs.bulk_cipher_algorithm == wolfssl_aes_gcm)
-        cs = TSIP_TLS13_CIPHER_SUITE_AES_128_GCM_SHA256;
-    else if (ssl->specs.bulk_cipher_algorithm == wolfssl_aes_ccm)
-        cs = TSIP_TLS13_CIPHER_SUITE_AES_128_CCM_SHA256;
-    else
-        return CRYPTOCB_UNAVAILABLE;
-
-
-    blocks    = sz / AES_BLOCK_SIZE;
-    remain    = sz;
-    conRemain = sz - TSIP_AES_GCM_AUTH_TAG_SIZE;
-    
-    if ((ret = tsip_hw_lock()) == 0) {
-
-        err = R_TSIP_Tls13DecryptInit(
-                                    &(tuc->handle13),
-                                    phase,
-                                    TSIP_TLS13_MODE_FULL_HANDSHAKE,
-                                    cs,
-                                    key,
-                                    sz);
-        
-        if (err != TSIP_SUCCESS) {
-            WOLFSSL_MSG("R_TSIP_Tls13DecryptInit error");
-            ret = WC_HW_E;
-        }
-
-        idxIn  = 0;
-        idxOut = 0;
-
-        while (err == TSIP_SUCCESS && (blocks--) >= 0) {
-
-            dataSz = min(remain, AES_BLOCK_SIZE);
-            XMEMCPY(cipher, input + idxIn, dataSz);
-            ForceZero(plain, AES_BLOCK_SIZE);
-
-            err = R_TSIP_Tls13DecryptUpdate(
-                                    &(tuc->handle13),
-                                    (uint8_t*)cipher,
-                                    (uint8_t*)plain,
-                                    dataSz);
-            
-            if (err == TSIP_SUCCESS) {
-                if (dataSz >= AES_BLOCK_SIZE && conRemain >= AES_BLOCK_SIZE) {
-                    XMEMCPY(output + idxOut, plain, dataSz);
-                    idxOut += dataSz;
-                    conRemain -= min(conRemain, dataSz);
-                }
-                idxIn  += dataSz;   
-                remain -= dataSz;
-            }
-            else {
-                WOLFSSL_MSG("R_TSIP_Tls13DecryptUpdate error");
-                ret = WC_HW_E;
-            }
-        }
-
-        if (err == TSIP_SUCCESS) {
-            err = R_TSIP_Tls13DecryptFinal(
-                                    &(tuc->handle13),
-                                    (uint8_t*)plain,
-                                    &finalSz); /* total size will be returned */
-
-            if (err == TSIP_SUCCESS) {
-                XMEMCPY(output + idxOut, plain, conRemain);
-            }
-            else if (err == TSIP_ERR_AUTHENTICATION) {
-                WOLFSSL_MSG("tsip_Tls13AesDecrypt authentication error");
-                ret = AES_GCM_AUTH_E;
-            }
-            else {
-                WOLFSSL_MSG("R_TSIP_Tls13DecryptFinal error");
-                ret = WC_HW_E;
-            }
-        }
-        tsip_hw_unlock();
-    }
-
-    WOLFSSL_LEAVE("tsip_Tls13AesDecrypt", ret);
-    return ret;
-}
-
-
-
-#if (WOLFSSL_RENESAS_TSIP_VER >= 109)
-#ifdef WOLF_CRYPTO_CB
-
-WOLFSSL_LOCAL int wc_tsip_AesCipher(int devIdArg, wc_CryptoInfo* info, 
-                                                                    void* ctx)
-{
-    int ret = NOT_COMPILED_IN;
-    TsipUserCtx* cbInfo = (TsipUserCtx*)ctx;
-
-    WOLFSSL_ENTER("wc_tsip_AesCipher");
-
-    if (info == NULL || ctx == NULL)
-        return BAD_FUNC_ARG;
-    
-    if (info->algo_type == WC_ALGO_TYPE_CIPHER) {
-
-#if !defined(NO_AES) || !defined(NO_DES3)
-#ifdef HAVE_AESGCM
-        if (info->cipher.type == WC_CIPHER_AES_GCM &&
-            cbInfo->session_key_set == 1) {
-
-            if (info->cipher.enc) {
-                ret = wc_tsip_AesGcmEncrypt(
-                        info->cipher.aesgcm_enc.aes,
-                        (byte*)info->cipher.aesgcm_enc.out,
-                        (byte*)info->cipher.aesgcm_enc.in,
-                        info->cipher.aesgcm_enc.sz,
-                        (byte*)info->cipher.aesgcm_enc.iv,
-                        info->cipher.aesgcm_enc.ivSz,
-                        (byte*)info->cipher.aesgcm_enc.authTag,
-                        info->cipher.aesgcm_enc.authTagSz,
-                        (byte*)info->cipher.aesgcm_enc.authIn,
-                        info->cipher.aesgcm_enc.authInSz,
-                        (void*)ctx);
-
-            }
-            else {
-                ret = wc_tsip_AesGcmDecrypt(
-                        info->cipher.aesgcm_dec.aes,
-                        (byte*)info->cipher.aesgcm_dec.out,
-                        (byte*)info->cipher.aesgcm_dec.in,
-                        info->cipher.aesgcm_dec.sz,
-                        (byte*)info->cipher.aesgcm_dec.iv,
-                        info->cipher.aesgcm_dec.ivSz,
-                        (byte*)info->cipher.aesgcm_dec.authTag,
-                        info->cipher.aesgcm_dec.authTagSz,
-                        (byte*)info->cipher.aesgcm_dec.authIn,
-                        info->cipher.aesgcm_dec.authInSz,
-                        (void*)ctx);
-            }
-        }
-    #endif /* HAVE_AESGCM */
-    #ifdef HAVE_AES_CBC
-        if (info->cipher.type == WC_CIPHER_AES_CBC &&
-            cbInfo->session_key_set == 1) {
-
-            if (info->cipher.enc) {
-                ret = wc_tsip_AesCbcEncrypt(
-                    info->cipher.aescbc.aes,
-                    (byte*)info->cipher.aescbc.out,
-                    (byte*)info->cipher.aescbc.in,
-                    info->cipher.aescbc.sz);
-
-            }
-            else {
-                ret = wc_tsip_AesCbcDecrypt(
-                    info->cipher.aescbc.aes,
-                    (byte*)info->cipher.aescbc.out,
-                    (byte*)info->cipher.aescbc.in,
-                    info->cipher.aescbc.sz);
-            }
-        }
-    #endif /* HAVE_AES_CBC */
-    #endif /* !NO_AES || !NO_DES3 */
-
-    }
-    WOLFSSL_LEAVE("wc_tsip_AesCipher", ret);
-    return ret;
-}
-#endif /* WOLF_CRYPTO_CB */
-#endif /* WOLFSSL_RENESAS_TSIP_VER >= 109 */
-
-
-
-int wc_tsip_AesCbcEncrypt(struct Aes* aes, byte* out, const byte* in, word32 sz)
-{
-    tsip_aes_handle_t _handle;
-    int ret;
-    word32 blocks = (sz / AES_BLOCK_SIZE);
-    uint32_t dataLength;
-    byte *iv;
-
-    if ((in == NULL) || (out == NULL) || (aes == NULL))
-      return BAD_FUNC_ARG;
-
-    /* during the TLS handshake, the TSIP driver keeps the true key and IV *
-     * on the device; the iv passed here is a dummy                        */
-    iv = (uint8_t*)aes->reg;
-
-    if ((ret = tsip_hw_lock()) != 0) {
-        WOLFSSL_MSG("Failed to lock");
-        return ret;
-    }
-
-    if (aes->ctx.keySize == 16) {
-        ret = R_TSIP_Aes128CbcEncryptInit(&_handle, &aes->ctx.tsip_keyIdx, iv);
-    }
-    else if (aes->ctx.keySize == 32) {
-        ret = R_TSIP_Aes256CbcEncryptInit(&_handle, &aes->ctx.tsip_keyIdx, iv);
-    }
-    else {
-        tsip_hw_unlock();
-        return -1;
-    }
-
-    while (ret == TSIP_SUCCESS && blocks--) {
-        if (aes->ctx.keySize == 16)
-            ret = R_TSIP_Aes128CbcEncryptUpdate(&_handle, (uint8_t*)in,
-                                    (uint8_t*)out, (uint32_t)AES_BLOCK_SIZE);
-        else
-            ret = R_TSIP_Aes256CbcEncryptUpdate(&_handle, (uint8_t*)in,
-                                    (uint8_t*)out, (uint32_t)AES_BLOCK_SIZE);
-
-        in  += AES_BLOCK_SIZE;
-        out += AES_BLOCK_SIZE;
-    }
-
-    if (ret == TSIP_SUCCESS) {
-        if (aes->ctx.keySize == 16) {
-            ret = R_TSIP_Aes128CbcEncryptFinal(&_handle, out, &dataLength);
-        }
-        else {
-            ret = R_TSIP_Aes256CbcEncryptFinal(&_handle, out, &dataLength);
-        }
-    }
-    else {
-        WOLFSSL_MSG("TSIP AES CBC encryption failed");
-        ret = -1;
-    }
-
-    tsip_hw_unlock();
-    return ret;
-}
-
-int wc_tsip_AesCbcDecrypt(struct Aes* aes, byte* out, const byte* in, word32 sz)
-{
-    tsip_aes_handle_t _handle;
-    int ret;
-    word32 blocks = (sz / AES_BLOCK_SIZE);
-    uint32_t dataLength;
-    byte *iv;
-
-    if ((in == NULL) || (out == NULL) || (aes == NULL))
-      return BAD_FUNC_ARG;
-
-    iv = (uint8_t*)aes->reg;
-
-    if ((ret = tsip_hw_lock()) != 0) {
-        WOLFSSL_MSG("Failed to lock");
-        return ret;
-    }
-
-    if (aes->ctx.keySize == 16) {
-        ret = R_TSIP_Aes128CbcDecryptInit(&_handle, &aes->ctx.tsip_keyIdx, iv);
-    }
-    else if (aes->ctx.keySize == 32) {
-        ret = R_TSIP_Aes256CbcDecryptInit(&_handle, &aes->ctx.tsip_keyIdx, iv);
-    }
-    else {
-        tsip_hw_unlock();
-        return -1;
-    }
-
-    while (ret == TSIP_SUCCESS && blocks--) {
-
-        if (aes->ctx.keySize == 16)
-            ret = R_TSIP_Aes128CbcDecryptUpdate(&_handle, (uint8_t*)in,
-                                        (uint8_t*)out, (uint32_t)AES_BLOCK_SIZE);
-        else
-            ret = R_TSIP_Aes256CbcDecryptUpdate(&_handle, (uint8_t*)in,
-                                        (uint8_t*)out, (uint32_t)AES_BLOCK_SIZE);
-
-        in  += AES_BLOCK_SIZE;
-        out += AES_BLOCK_SIZE;
-    }
-
-    if (ret == TSIP_SUCCESS) {
-        if (aes->ctx.keySize == 16)
-            ret = R_TSIP_Aes128CbcDecryptFinal(&_handle, out, &dataLength);
-        else
-            ret = R_TSIP_Aes256CbcDecryptFinal(&_handle, out, &dataLength);
-    }
-    else {
-        WOLFSSL_MSG("TSIP AES CBC decryption failed");
-        ret = -1;
-    }
-
-    tsip_hw_unlock();
-    return ret;
-}
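
Both CBC helpers above advance in whole AES_BLOCK_SIZE steps (sz / AES_BLOCK_SIZE iterations), so any padding is assumed to have been applied by the TLS record layer before the call. A hypothetical round-trip sketch, assuming an Aes object whose ctx.tsip_keyIdx and ctx.keySize were provisioned through the TSIP key paths:

    /* Sketch: single-block CBC round trip with a provisioned Aes object. */
    static int tsip_cbc_roundtrip(struct Aes* aes)
    {
        byte plain[AES_BLOCK_SIZE]  = {0};
        byte cipher[AES_BLOCK_SIZE] = {0};
        byte check[AES_BLOCK_SIZE]  = {0};

        if (wc_tsip_AesCbcEncrypt(aes, cipher, plain, sizeof(plain)) != TSIP_SUCCESS)
            return -1;
        if (wc_tsip_AesCbcDecrypt(aes, check, cipher, sizeof(cipher)) != TSIP_SUCCESS)
            return -1;
        return 0; /* check[] should now equal plain[] */
    }
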
-/*
- * Encrypt plain data, then output the encrypted data and authentication tag.
- * The session key used for encryption is generated inside this function;
- * the key previously generated and stored in Aes is not used.
- * parameters
- *  - aes:  Aes structure
- *  - out:  buffer where the cipher text is output
- *  - in:   buffer where the plain data is stored
- *  - sz:   size of the plain data; also the output size
- *  - iv:   IV consisting of a 4-byte implicit IV and an 8-byte explicit IV
- *  - ivSz: size of iv
- *  - authTag: buffer where the authentication data is output
- *  - authTagSz: buffer size for the authentication data
- *  - authIn: buffer holding the Additional Authentication Data (AAD)
- *  - authInSz: AAD size
- *  - ctx:   TsipUserCtx
- * return 0 on success, otherwise on error.
- * Note: As of TSIP v1.13, only 128- and 256-bit key sizes are accepted.
- */
-int wc_tsip_AesGcmEncrypt(
-    struct Aes*  aes,     byte* out,
-    const  byte* in,       word32 sz,
-           byte* iv,       word32 ivSz,
-           byte* authTag,  word32 authTagSz, /* auth Tag */
-    const  byte* authIn,   word32 authInSz,  /* AAD */
-           void* ctx)
-{
-    int                 ret = -1;
-    e_tsip_err_t        err;
-    tsip_gcm_handle_t   hdl;
-    uint32_t            dataLen = sz;
-    uint32_t            cipherBufSz;
-
-    aesGcmEncInitFn     initFn;
-    aesGcmEncUpdateFn   updateFn;
-    aesGcmEncFinalFn    finalFn;
-
-    uint8_t* plainBuf  = NULL;
-    uint8_t* cipherBuf = NULL;
-    uint8_t* aTagBuf   = NULL;
-    uint8_t* aadBuf    = NULL;
-    const uint8_t* iv_l = NULL;
-    uint32_t ivSz_l = 0;
-    
-    tsip_aes_key_index_t key_client_aes;
-    TsipUserCtx *userCtx;
-
-    WOLFSSL_ENTER("wc_tsip_AesGcmEncrypt");
-
-    if (aes == NULL || ctx == NULL || (ivSz == 0)   ||
-       (sz != 0       && (in == NULL  || out == NULL)) ||
-       (ivSz != 0     &&  iv == NULL) ||
-       (authInSz != 0 && authIn == NULL)) {
-        WOLFSSL_LEAVE("wc_tsip_AesGcmEncrypt", BAD_FUNC_ARG);
-        return BAD_FUNC_ARG;
-    }
-    /* TSIP can handle 128 and 256 bit key only */
-    if (aes->ctx.keySize != 16 && aes->ctx.keySize != 32) {
-        WOLFSSL_MSG("illegal key size");
-        WOLFSSL_LEAVE("wc_tsip_AesGcmEncrypt", BAD_FUNC_ARG);
-        return  BAD_FUNC_ARG;
-    }
-
-    if (aes->ctx.keySize == 16) {
-        initFn   = R_TSIP_Aes128GcmEncryptInit;
-        updateFn = R_TSIP_Aes128GcmEncryptUpdate;
-        finalFn  = R_TSIP_Aes128GcmEncryptFinal;
-    }
-    else {
-        initFn   = R_TSIP_Aes256GcmEncryptInit;
-        updateFn = R_TSIP_Aes256GcmEncryptUpdate;
-        finalFn  = R_TSIP_Aes256GcmEncryptFinal;
-    }
-
-    userCtx = (TsipUserCtx*)ctx;
-
-    /* buffer for cipher data output must be a multiple of AES_BLOCK_SIZE */
-    cipherBufSz = ((sz / AES_BLOCK_SIZE) + 1) * AES_BLOCK_SIZE;
-
-    if ((ret = tsip_hw_lock()) == 0) {
-
-        /* allocate buffers for plaintext, ciphertext, authTag and AAD to
-         * make sure those buffers are 32-bit aligned, as TSIP requires.
-         */
-        plainBuf  = XMALLOC(sz, aes->heap, DYNAMIC_TYPE_AES);
-        cipherBuf = XMALLOC(cipherBufSz, aes->heap, DYNAMIC_TYPE_AES);
-        aTagBuf   = XMALLOC(TSIP_AES_GCM_AUTH_TAG_SIZE, aes->heap,
-                                                        DYNAMIC_TYPE_AES);
-        aadBuf    = XMALLOC(authInSz, aes->heap, DYNAMIC_TYPE_AES);
-
-        if (plainBuf == NULL || cipherBuf == NULL || aTagBuf == NULL ||
-                                                      aadBuf == NULL ) {
-            WOLFSSL_MSG("wc_tsip_AesGcmEncrypt: buffer allocation failed");
-            ret = -1;
-        }
-
-        if (ret == 0) {
-            XMEMCPY(plainBuf, in, sz);
-            ForceZero(cipherBuf, cipherBufSz);
-            ForceZero(authTag, authTagSz);
-            XMEMCPY(aadBuf, authIn, authInSz);
-        }
-
-        if (ret == 0 && 
-            userCtx->session_key_set == 1) {
-            /* generate AES-GCM session key. The key stored in
-             * Aes.ctx.tsip_keyIdx is not used here.
-             */
-            err = R_TSIP_TlsGenerateSessionKey(
-                    userCtx->tsip_cipher,
-                    (uint32_t*)userCtx->tsip_masterSecret,
-                    (uint8_t*) userCtx->tsip_clientRandom,
-                    (uint8_t*) userCtx->tsip_serverRandom,
-                    &iv[AESGCM_IMP_IV_SZ], /* use exp_IV */
-                    NULL,
-                    NULL,
-                    &key_client_aes,
-                    NULL,
-                    NULL, NULL);
-            if (err != TSIP_SUCCESS) {
-
-                WOLFSSL_MSG("R_TSIP_TlsGenerateSessionKey failed");
-                ret = -1;
-            }
-        } else if (userCtx->user_aes128_key_set == 1 || 
-                   userCtx->user_aes256_key_set == 1) {
-            if (aes->ctx.keySize == 32) {
-                XMEMCPY(&key_client_aes, &userCtx->user_aes256_key_index,
-                        sizeof(tsip_aes_key_index_t));
-            }
-            else {
-                 XMEMCPY(&key_client_aes, &userCtx->user_aes128_key_index,
-                        sizeof(tsip_aes_key_index_t));
-            }
-            
-            iv_l = iv;
-            ivSz_l = ivSz;
-            
-        }
-
-        if (ret == 0) {
-
-            /* Since the generated session key is coupled to the IV, there is
-             * no need to pass the IV to the init function.
-             * An IV is expected only when users create their own key.
-             */
-            err = initFn(&hdl, &key_client_aes, (uint8_t*)iv_l, ivSz_l);
-
-            if (err == TSIP_SUCCESS) {
-                err = updateFn(&hdl, NULL, NULL, 0UL, (uint8_t*)aadBuf, authInSz);
-            }
-            if (err == TSIP_SUCCESS) {
-                err = updateFn(&hdl, plainBuf, cipherBuf, sz, NULL, 0UL);
-            }
-            if (err != TSIP_SUCCESS) {
-                WOLFSSL_MSG("R_TSIP_AesXXXGcmEncryptUpdate: failed");
-                ret = -1;
-            }
-
-            /* Once R_TSIP_AesxxxGcmEncryptInit or R_TSIP_AesxxxEncryptUpdate
-            * has been called, R_TSIP_AesxxxGcmEncryptFinal must be called
-            * regardless of the result of the previous call. Otherwise, TSIP
-            * cannot come out of its error state and all subsequent API calls
-            * will fail.
-            */
-            dataLen = 0;
-            err = finalFn(&hdl,
-                          cipherBuf + (sz / AES_BLOCK_SIZE) * AES_BLOCK_SIZE,
-                          &dataLen,
-                          aTagBuf); /* the 16-byte auth tag will be output */
-
-            if (err == TSIP_SUCCESS) {
-                /* copy encrypted data to out */
-                XMEMCPY(out, cipherBuf, sz);
-
-                /* copy auth tag to caller's buffer */
-                XMEMCPY((void*)authTag, (void*)aTagBuf,
-                                min(authTagSz, TSIP_AES_GCM_AUTH_TAG_SIZE ));
-
-            }
-            else {
-                WOLFSSL_MSG("R_TSIP_AesxxxGcmEncryptFinal: failed");
-                ret = -1;
-            }
-        }
-
-        XFREE(plainBuf,  aes->heap, DYNAMIC_TYPE_AES);
-        XFREE(cipherBuf, aes->heap, DYNAMIC_TYPE_AES);
-        XFREE(aTagBuf,   aes->heap, DYNAMIC_TYPE_AES);
-        XFREE(aadBuf,    aes->heap, DYNAMIC_TYPE_AES);
-
-        tsip_hw_unlock();
-    }
-    return ret;
-}
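
A hypothetical call-site sketch for the AEAD wrapper above, following the contract in its header comment: a 12-byte IV split into 4 implicit and 8 explicit bytes, a 16-byte tag, and a TsipUserCtx passed through the ctx parameter:

    /* Sketch: aes and userCtx are assumed to be a provisioned Aes object
     * and a populated TsipUserCtx respectively. */
    static int tsip_gcm_encrypt_example(struct Aes* aes, TsipUserCtx* userCtx)
    {
        byte iv[12]  = {0};   /* 4-byte implicit IV || 8-byte explicit IV */
        byte tag[16] = {0};   /* TSIP_AES_GCM_AUTH_TAG_SIZE */
        byte aad[13] = {0};   /* e.g. a TLS record header */
        byte pt[64]  = {0};
        byte ct[64]  = {0};

        return wc_tsip_AesGcmEncrypt(aes, ct, pt, sizeof(pt),
                                     iv, sizeof(iv),
                                     tag, sizeof(tag),
                                     aad, sizeof(aad),
                                     (void*)userCtx);
    }
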
-/*
- * Decrypt cipher data into plain data and output the authentication tag.
- * The session key used for decryption is generated inside this function;
- * the key previously generated and stored in Aes is not used.
- * parameters
- *  - aes:  Aes structure
- *  - out:  buffer where the plain text is output
- *  - in:   buffer where the cipher data is stored
- *  - sz:   size of the cipher data; also the output size
- *  - iv:   IV consisting of a 4-byte implicit IV and an 8-byte explicit IV
- *  - ivSz: size of iv
- *  - authTag: buffer where the authentication data is stored
- *  - authTagSz: buffer size for the authentication data
- *  - authIn: buffer where the Additional Authentication Data (AAD) is stored
- *  - authInSz: AAD size
- *  - ctx:   TsipUserCtx
- * return 0 on success, otherwise on error.
- * Note: As of TSIP v1.13, only 128- and 256-bit key sizes are accepted.
- */
-int wc_tsip_AesGcmDecrypt(
-    Aes*        aes,      byte* out,
-    const byte* in,       word32 sz,
-    const byte* iv,       word32 ivSz,
-    const byte* authTag,  word32 authTagSz,
-    const byte* authIn,   word32 authInSz,
-          void* ctx)
-{
-    int                 ret = -1;
-    e_tsip_err_t        err;
-    tsip_gcm_handle_t   hdl;
-
-    uint32_t            dataLen;
-    uint32_t            plainBufSz;
-
-    aesGcmDecInitFn     initFn;
-    aesGcmDecUpdateFn   updateFn;
-    aesGcmDecFinalFn    finalFn;
-
-    uint8_t* cipherBuf = NULL;
-    uint8_t* plainBuf  = NULL;
-    uint8_t* aTagBuf   = NULL;
-    uint8_t* aadBuf    = NULL;
-    const uint8_t* iv_l = NULL;
-    uint32_t ivSz_l = 0;
-    
-    tsip_aes_key_index_t key_server_aes;
-    TsipUserCtx *userCtx;
-
-    WOLFSSL_ENTER("wc_tsip_AesGcmDecrypt");
-
-    if (aes == NULL || in == NULL || out == NULL || sz == 0 || ctx == NULL ||
-        iv == NULL ||
-        (authInSz != 0 && authIn == NULL) ||
-        (authInSz == 0 && authIn != NULL) ||
-        (authTagSz != 0 && authTag == NULL) ||
-        (authTagSz == 0 && authTag != NULL)) {
-        WOLFSSL_LEAVE("wc_tsip_AesGcmDecrypt", BAD_FUNC_ARG);
-        return BAD_FUNC_ARG;
-    }
-    if (aes->ctx.keySize != 16 && aes->ctx.keySize != 32) {
-        WOLFSSL_MSG("illegal key size");
-        WOLFSSL_LEAVE("wc_tsip_AesGcmDecrypt", BAD_FUNC_ARG);
-        return  BAD_FUNC_ARG;
-    }
-
-    if (aes->ctx.keySize == 16) {
-        initFn   = R_TSIP_Aes128GcmDecryptInit;
-        updateFn = R_TSIP_Aes128GcmDecryptUpdate;
-        finalFn  = R_TSIP_Aes128GcmDecryptFinal;
-    }
-    else {
-        initFn   = R_TSIP_Aes256GcmDecryptInit;
-        updateFn = R_TSIP_Aes256GcmDecryptUpdate;
-        finalFn  = R_TSIP_Aes256GcmDecryptFinal;
-    }
-
-    userCtx = (TsipUserCtx *)ctx;
-
-    /* buffer for plain data output must be a multiple of AES_BLOCK_SIZE */
-    plainBufSz = ((sz / AES_BLOCK_SIZE) + 1) * AES_BLOCK_SIZE;
-
-    if ((ret = tsip_hw_lock()) == 0) {
-
-        /* allocate buffers for plaintext, ciphertext, authTag and AAD.
-         * TSIP requires those buffers to be 32-bit aligned.
-         */
-        cipherBuf = XMALLOC(sz, aes->heap, DYNAMIC_TYPE_AES);
-        plainBuf  = XMALLOC(plainBufSz, aes->heap, DYNAMIC_TYPE_AES);
-        aTagBuf   = XMALLOC(TSIP_AES_GCM_AUTH_TAG_SIZE, aes->heap,
-                                                        DYNAMIC_TYPE_AES);
-        aadBuf    = XMALLOC(authInSz, aes->heap, DYNAMIC_TYPE_AES);
-
-        if (plainBuf == NULL || cipherBuf == NULL || aTagBuf == NULL ||
-                                                        aadBuf == NULL) {
-            ret = -1;
-        }
-
-        if (ret == 0) {
-            ForceZero(plainBuf, plainBufSz);
-            XMEMCPY(cipherBuf, in, sz);
-            ForceZero(aTagBuf, TSIP_AES_GCM_AUTH_TAG_SIZE);
-            XMEMCPY(aTagBuf, authTag, min(authTagSz, TSIP_AES_GCM_AUTH_TAG_SIZE));
-            XMEMCPY(aadBuf, authIn, authInSz);
-        }
-
-        if (ret == 0 && 
-            userCtx->session_key_set == 1) {
-            /* generate AES-GCM session key. The key stored in
-             * Aes.ctx.tsip_keyIdx is not used here.
-             */
-            err = R_TSIP_TlsGenerateSessionKey(
-                    userCtx->tsip_cipher,
-                    (uint32_t*)userCtx->tsip_masterSecret,
-                    (uint8_t*) userCtx->tsip_clientRandom,
-                    (uint8_t*) userCtx->tsip_serverRandom,
-                    (uint8_t*)&iv[AESGCM_IMP_IV_SZ], /* use exp_IV */
-                    NULL,
-                    NULL,
-                    NULL,
-                    &key_server_aes,
-                    NULL, NULL);
-            if (err != TSIP_SUCCESS) {
-                WOLFSSL_MSG("R_TSIP_TlsGenerateSessionKey failed");
-                ret = -1;
-            }
-        } else if (userCtx->user_aes128_key_set == 1 || 
-                   userCtx->user_aes256_key_set == 1) {
-            if (aes->ctx.keySize == 32) {
-                XMEMCPY(&key_server_aes, &userCtx->user_aes256_key_index,
-                        sizeof(tsip_aes_key_index_t));
-            }
-            else {
-                 XMEMCPY(&key_server_aes, &userCtx->user_aes128_key_index,
-                        sizeof(tsip_aes_key_index_t));
-            }
-            
-            iv_l = iv;
-            ivSz_l = ivSz;
-            
-        }
-
-        if (ret == 0) {
-            /* since key_index carries the IV and its size, there is no need
-             * to pass them to the init function; pass NULL and 0 as the 3rd
-             * and 4th parameters respectively.
-             * An IV is expected only when users create their own key.
-             */
-            err = initFn(&hdl, &key_server_aes, (uint8_t*)iv_l, ivSz_l);
-
-            if (err == TSIP_SUCCESS) {
-                /* pass only AAD and it's size before passing cipher text */
-                err = updateFn(&hdl, NULL, NULL, 0UL, (uint8_t*)authIn,
-                                                                    authInSz);
-            }
-            if (err == TSIP_SUCCESS) {
-                err = updateFn(&hdl, cipherBuf, plainBuf, sz, NULL, 0UL);
-            }
-            if (err != TSIP_SUCCESS) {
-                WOLFSSL_MSG("R_TSIP_AesXXXGcmDecryptUpdate: failed in decrypt");
-                ret = -1;
-            }
-            if (err == TSIP_SUCCESS) {
-                dataLen = 0;
-                err = finalFn(&hdl,
-                        plainBuf + (sz / AES_BLOCK_SIZE) * AES_BLOCK_SIZE,
-                        &dataLen,
-                        aTagBuf,
-                        min(16, authTagSz)); /* TSIP accepts up to 16 bytes */
-            }
-            if (err == TSIP_SUCCESS) {
-                /* copy plain data to out */
-                XMEMCPY(out, plainBuf, sz);
-            }
-            else {
-                WOLFSSL_MSG("R_TSIP_AesXXXGcmDecryptFinal: failed");
-                ret = -1;
-            }
-        }
-
-        XFREE(plainBuf,  aes->heap, DYNAMIC_TYPE_AES);
-        XFREE(cipherBuf, aes->heap, DYNAMIC_TYPE_AES);
-        XFREE(aTagBuf,   aes->heap, DYNAMIC_TYPE_AES);
-        XFREE(aadBuf,    aes->heap, DYNAMIC_TYPE_AES);
-
-        tsip_hw_unlock();
-    }
-    WOLFSSL_LEAVE("wc_tsip_AesGcmDecrypt", ret);
-    return ret;
-}
-#endif /* WOLFSSL_RENESAS_TSIP_CRYPT */
-#endif /* NO_AES */

Some files were not shown because of the large number of changes