pcompress/crypto/aes/aesni-x86_64.s

2729 lines
58 KiB
ArmAsm

;#
;# This file is a part of Pcompress, a chunked parallel multi-
;# algorithm lossless compression and decompression program.
;#
;# Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
;# Use is subject to license terms.
;#
;# This program is free software; you can redistribute it and/or
;# modify it under the terms of the GNU Lesser General Public
;# License as published by the Free Software Foundation; either
;# version 3 of the License, or (at your option) any later version.
;#
;# This program is distributed in the hope that it will be useful,
;# but WITHOUT ANY WARRANTY; without even the implied warranty of
;# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;# Lesser General Public License for more details.
;#
;# You should have received a copy of the GNU Lesser General Public
;# License along with this program.
;# If not, see <http://www.gnu.org/licenses/>.
;#
;# moinakg@belenix.org, http://moinakg.wordpress.com/
;#
;#
;# NOTE:
;# This file was obtained from the OpenSSL distribution and as such is
;# governed by the OpenSSL license in addition to the license text mentioned
;# above. A copy of those license terms is included in the file:
;# OPENSSL.LICENSE
;#
;# Only the OpenSSL license terms will apply when this file is used outside
;# of this software project.
;#
;#
;# ====================================================================
;# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
;# project. The module is, however, dual licensed under OpenSSL and
;# CRYPTOGAMS licenses depending on where you obtain it. For further
;# details see http://www.openssl.org/~appro/cryptogams/.
;# ====================================================================
;#
;# This module implements support for Intel AES-NI extension. In
;# OpenSSL context it's used with Intel engine, but can also be used as
;# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
;# details].
;#
;# Performance.
;#
;# Given aes(enc|dec) instructions' latency asymptotic performance for
;# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
;# processed with 128-bit key. And given their throughput asymptotic
;# performance for parallelizable modes is 1.25 cycles per byte. Being
;# asymptotic limit it's not something you commonly achieve in reality,
;# but how close does one get? Below are results collected for
;# different modes and block sized. Pairs of numbers are for en-/
;# decryption.
;#
;# 16-byte 64-byte 256-byte 1-KB 8-KB
;# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
;# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
;# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
;# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
;# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
;# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
;#
;# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
;# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
;# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
;# The results were collected with specially crafted speed.c benchmark
;# in order to compare them with results reported in "Intel Advanced
;# Encryption Standard (AES) New Instruction Set" White Paper Revision
;# 3.0 dated May 2010. All above results are consistently better. This
;# module also provides better performance for block sizes smaller than
;# 128 bytes in points *not* represented in the above table.
;#
;# Looking at the results for 8-KB buffer.
;#
;# CFB and OFB results are far from the limit, because implementation
;# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
;# single-block aesni_encrypt, which is not the most optimal way to go.
;# CBC encrypt result is unexpectedly high and there is no documented
;# explanation for it. Seemingly there is a small penalty for feeding
;# the result back to AES unit the way it's done in CBC mode. There is
;# nothing one can do and the result appears optimal. CCM result is
;# identical to CBC, because CBC-MAC is essentially CBC encrypt without
;# saving output. CCM CTR "stays invisible," because it's neatly
;# interleaved wih CBC-MAC. This provides ~30% improvement over
;# "straghtforward" CCM implementation with CTR and CBC-MAC performed
;# disjointly. Parallelizable modes practically achieve the theoretical
;# limit.
;#
;# Looking at how results vary with buffer size.
;#
;# Curves are practically saturated at 1-KB buffer size. In most cases
;# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
;# CTR curve doesn't follow this pattern and is "slowest" changing one
;# with "256-byte" result being 87% of "8-KB." This is because overhead
;# in CTR mode is most computationally intensive. Small-block CCM
;# decrypt is slower than encrypt, because first CTR and last CBC-MAC
;# iterations can't be interleaved.
;#
;# Results for 192- and 256-bit keys.
;#
;# EVP-free results were observed to scale perfectly with number of
;# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
;# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
;# are a tad smaller, because the above mentioned penalty biases all
;# results by same constant value. In similar way function call
;# overhead affects small-block performance, as well as OFB and CFB
;# results. Differences are not large, most common coefficients are
;# 10/11.7 and 10/13.4 (as opposite to 10/12.0 and 10/14.0), but one
;# observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
;# January 2011
;#
;# While Westmere processor features 6 cycles latency for aes[enc|dec]
;# instructions, which can be scheduled every second cycle, Sandy
;# Bridge spends 8 cycles per instruction, but it can schedule them
;# every cycle. This means that code targeting Westmere would perform
;# suboptimally on Sandy Bridge. Therefore this update.
;#
;# In addition, non-parallelizable CBC encrypt (as well as CCM) is
;# optimized. Relative improvement might appear modest, 8% on Westmere,
;# but in absolute terms it's 3.77 cycles per byte encrypted with
;# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
;# should be compared to asymptotic limits of 3.75 for Westmere and
;# 5.00 for Sandy Bridge. Actually, the fact that they get this close
;# to asymptotic limits is quite amazing. Indeed, the limit is
;# calculated as latency times number of rounds, 10 for 128-bit key,
;# and divided by 16, the number of bytes in block, or in other words
;# it accounts *solely* for aesenc instructions. But there are extra
;# instructions, and numbers so close to the asymptotic limits mean
;# that it's as if it takes as little as *one* additional cycle to
;# execute all of them. How is it possible? It is possible thanks to
;# out-of-order execution logic, which manages to overlap post-
;# processing of previous block, things like saving the output, with
;# actual encryption of current block, as well as pre-processing of
;# current block, things like fetching input and xor-ing it with
;# 0-round element of the key schedule, with actual encryption of
;# previous block. Keep this in mind...
;#
;# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
;# performance is achieved by interleaving instructions working on
;# independent blocks. In which case asymptotic limit for such modes
;# can be obtained by dividing above mentioned numbers by AES
;# instructions' interleave factor. Westmere can execute at most 3
;# instructions at a time, meaning that optimal interleave factor is 3,
;# and that's where the "magic" number of 1.25 come from. "Optimal
;# interleave factor" means that increase of interleave factor does
;# not improve performance. The formula has proven to reflect reality
;# pretty well on Westmere... Sandy Bridge on the other hand can
;# execute up to 8 AES instructions at a time, so how does varying
;# interleave factor affect the performance? Here is table for ECB
;# (numbers are cycles per byte processed with 128-bit key):
;#
;# instruction interleave factor 3x 6x 8x
;# theoretical asymptotic limit 1.67 0.83 0.625
;# measured performance for 8KB block 1.05 0.86 0.84
;#
;# "as if" interleave factor 4.7x 5.8x 6.0x
;#
;# Further data for other parallelizable modes:
;#
;# CBC decrypt 1.16 0.93 0.93
;# CTR 1.14 0.91 n/a
;#
;# Well, given 3x column it's probably inappropriate to call the limit
;# asymptotic, if it can be surpassed, isn't it? What happens there?
;# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
;# magic is responsible for this. Processor overlaps not only the
;# additional instructions with AES ones, but even AES instuctions
;# processing adjacent triplets of independent blocks. In the 6x case
;# additional instructions still claim disproportionally small amount
;# of additional cycles, but in 8x case number of instructions must be
;# a tad too high for out-of-order logic to cope with, and AES unit
;# remains underutilized... As you can see 8x interleave is hardly
;# justifiable, so there no need to feel bad that 32-bit aesni-x86.pl
;# utilizies 6x interleave because of limited register bank capacity.
;#
;# Higher interleave factors do have negative impact on Westmere
;# performance. While for ECB mode it's negligible ~1.5%, other
;# parallelizables perform ~5% worse, which is outweighed by ~25%
;# improvement on Sandy Bridge. To balance regression on Westmere
;# CTR mode was implemented with 6x aesenc interleave factor.
;# April 2011
;#
;# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing
;# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like
;# in CTR mode AES instruction interleave factor was chosen to be 6x.
;#
.text
.globl aesni_encrypt
.type aesni_encrypt,@function
.align 16
aesni_encrypt:
movups (%rdi),%xmm2
movl 240(%rdx),%eax
movups (%rdx),%xmm0
movups 16(%rdx),%xmm1
leaq 32(%rdx),%rdx
xorps %xmm0,%xmm2
.Loop_enc1_1:
.byte 102,15,56,220,209
decl %eax
movups (%rdx),%xmm1
leaq 16(%rdx),%rdx
jnz .Loop_enc1_1
.byte 102,15,56,221,209
movups %xmm2,(%rsi)
.byte 0xf3,0xc3
.size aesni_encrypt,.-aesni_encrypt
.globl aesni_decrypt
.type aesni_decrypt,@function
.align 16
aesni_decrypt:
movups (%rdi),%xmm2
movl 240(%rdx),%eax
movups (%rdx),%xmm0
movups 16(%rdx),%xmm1
leaq 32(%rdx),%rdx
xorps %xmm0,%xmm2
.Loop_dec1_2:
.byte 102,15,56,222,209
decl %eax
movups (%rdx),%xmm1
leaq 16(%rdx),%rdx
jnz .Loop_dec1_2
.byte 102,15,56,223,209
movups %xmm2,(%rsi)
.byte 0xf3,0xc3
.size aesni_decrypt, .-aesni_decrypt
.type _aesni_encrypt3,@function
.align 16
_aesni_encrypt3:
movups (%rcx),%xmm0
shrl $1,%eax
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
movups (%rcx),%xmm0
.Lenc_loop3:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
decl %eax
.byte 102,15,56,220,225
movups 16(%rcx),%xmm1
.byte 102,15,56,220,208
.byte 102,15,56,220,216
leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
movups (%rcx),%xmm0
jnz .Lenc_loop3
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
.byte 102,15,56,221,208
.byte 102,15,56,221,216
.byte 102,15,56,221,224
.byte 0xf3,0xc3
.size _aesni_encrypt3,.-_aesni_encrypt3
.type _aesni_decrypt3,@function
.align 16
_aesni_decrypt3:
movups (%rcx),%xmm0
shrl $1,%eax
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
movups (%rcx),%xmm0
.Ldec_loop3:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
decl %eax
.byte 102,15,56,222,225
movups 16(%rcx),%xmm1
.byte 102,15,56,222,208
.byte 102,15,56,222,216
leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
movups (%rcx),%xmm0
jnz .Ldec_loop3
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
.byte 102,15,56,223,208
.byte 102,15,56,223,216
.byte 102,15,56,223,224
.byte 0xf3,0xc3
.size _aesni_decrypt3,.-_aesni_decrypt3
.type _aesni_encrypt4,@function
.align 16
_aesni_encrypt4:
movups (%rcx),%xmm0
shrl $1,%eax
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
xorps %xmm0,%xmm5
movups (%rcx),%xmm0
.Lenc_loop4:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
movups 16(%rcx),%xmm1
.byte 102,15,56,220,208
.byte 102,15,56,220,216
leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
movups (%rcx),%xmm0
jnz .Lenc_loop4
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,221,208
.byte 102,15,56,221,216
.byte 102,15,56,221,224
.byte 102,15,56,221,232
.byte 0xf3,0xc3
.size _aesni_encrypt4,.-_aesni_encrypt4
.type _aesni_decrypt4,@function
.align 16
_aesni_decrypt4:
movups (%rcx),%xmm0
shrl $1,%eax
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
xorps %xmm0,%xmm4
xorps %xmm0,%xmm5
movups (%rcx),%xmm0
.Ldec_loop4:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
movups 16(%rcx),%xmm1
.byte 102,15,56,222,208
.byte 102,15,56,222,216
leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
movups (%rcx),%xmm0
jnz .Ldec_loop4
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,223,208
.byte 102,15,56,223,216
.byte 102,15,56,223,224
.byte 102,15,56,223,232
.byte 0xf3,0xc3
.size _aesni_decrypt4,.-_aesni_decrypt4
.type _aesni_encrypt6,@function
.align 16
_aesni_encrypt6:
movups (%rcx),%xmm0
shrl $1,%eax
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
.byte 102,15,56,220,209
pxor %xmm0,%xmm4
.byte 102,15,56,220,217
pxor %xmm0,%xmm5
.byte 102,15,56,220,225
pxor %xmm0,%xmm6
.byte 102,15,56,220,233
pxor %xmm0,%xmm7
decl %eax
.byte 102,15,56,220,241
movups (%rcx),%xmm0
.byte 102,15,56,220,249
jmp .Lenc_loop6_enter
.align 16
.Lenc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
.Lenc_loop6_enter:
movups 16(%rcx),%xmm1
.byte 102,15,56,220,208
.byte 102,15,56,220,216
leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
movups (%rcx),%xmm0
jnz .Lenc_loop6
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
.byte 102,15,56,221,208
.byte 102,15,56,221,216
.byte 102,15,56,221,224
.byte 102,15,56,221,232
.byte 102,15,56,221,240
.byte 102,15,56,221,248
.byte 0xf3,0xc3
.size _aesni_encrypt6,.-_aesni_encrypt6
.type _aesni_decrypt6,@function
.align 16
_aesni_decrypt6:
movups (%rcx),%xmm0
shrl $1,%eax
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
pxor %xmm0,%xmm3
.byte 102,15,56,222,209
pxor %xmm0,%xmm4
.byte 102,15,56,222,217
pxor %xmm0,%xmm5
.byte 102,15,56,222,225
pxor %xmm0,%xmm6
.byte 102,15,56,222,233
pxor %xmm0,%xmm7
decl %eax
.byte 102,15,56,222,241
movups (%rcx),%xmm0
.byte 102,15,56,222,249
jmp .Ldec_loop6_enter
.align 16
.Ldec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
.Ldec_loop6_enter:
movups 16(%rcx),%xmm1
.byte 102,15,56,222,208
.byte 102,15,56,222,216
leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
movups (%rcx),%xmm0
jnz .Ldec_loop6
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
.byte 102,15,56,223,208
.byte 102,15,56,223,216
.byte 102,15,56,223,224
.byte 102,15,56,223,232
.byte 102,15,56,223,240
.byte 102,15,56,223,248
.byte 0xf3,0xc3
.size _aesni_decrypt6,.-_aesni_decrypt6
.type _aesni_encrypt8,@function
.align 16
_aesni_encrypt8:
movups (%rcx),%xmm0
shrl $1,%eax
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
.byte 102,15,56,220,209
pxor %xmm0,%xmm4
.byte 102,15,56,220,217
pxor %xmm0,%xmm5
.byte 102,15,56,220,225
pxor %xmm0,%xmm6
.byte 102,15,56,220,233
pxor %xmm0,%xmm7
decl %eax
.byte 102,15,56,220,241
pxor %xmm0,%xmm8
.byte 102,15,56,220,249
pxor %xmm0,%xmm9
movups (%rcx),%xmm0
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups 16(%rcx),%xmm1
jmp .Lenc_loop8_enter
.align 16
.Lenc_loop8:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
movups 16(%rcx),%xmm1
.Lenc_loop8_enter:
.byte 102,15,56,220,208
.byte 102,15,56,220,216
leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
.byte 102,68,15,56,220,192
.byte 102,68,15,56,220,200
movups (%rcx),%xmm0
jnz .Lenc_loop8
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
.byte 102,68,15,56,220,193
.byte 102,68,15,56,220,201
.byte 102,15,56,221,208
.byte 102,15,56,221,216
.byte 102,15,56,221,224
.byte 102,15,56,221,232
.byte 102,15,56,221,240
.byte 102,15,56,221,248
.byte 102,68,15,56,221,192
.byte 102,68,15,56,221,200
.byte 0xf3,0xc3
.size _aesni_encrypt8,.-_aesni_encrypt8
.type _aesni_decrypt8,@function
.align 16
_aesni_decrypt8:
movups (%rcx),%xmm0
shrl $1,%eax
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
xorps %xmm0,%xmm3
.byte 102,15,56,222,209
pxor %xmm0,%xmm4
.byte 102,15,56,222,217
pxor %xmm0,%xmm5
.byte 102,15,56,222,225
pxor %xmm0,%xmm6
.byte 102,15,56,222,233
pxor %xmm0,%xmm7
decl %eax
.byte 102,15,56,222,241
pxor %xmm0,%xmm8
.byte 102,15,56,222,249
pxor %xmm0,%xmm9
movups (%rcx),%xmm0
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 16(%rcx),%xmm1
jmp .Ldec_loop8_enter
.align 16
.Ldec_loop8:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 16(%rcx),%xmm1
.Ldec_loop8_enter:
.byte 102,15,56,222,208
.byte 102,15,56,222,216
leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
.byte 102,68,15,56,222,192
.byte 102,68,15,56,222,200
movups (%rcx),%xmm0
jnz .Ldec_loop8
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
.byte 102,15,56,223,208
.byte 102,15,56,223,216
.byte 102,15,56,223,224
.byte 102,15,56,223,232
.byte 102,15,56,223,240
.byte 102,15,56,223,248
.byte 102,68,15,56,223,192
.byte 102,68,15,56,223,200
.byte 0xf3,0xc3
.size _aesni_decrypt8,.-_aesni_decrypt8
.globl aesni_ecb_encrypt
.type aesni_ecb_encrypt,@function
.align 16
aesni_ecb_encrypt:
andq $-16,%rdx
jz .Lecb_ret
movl 240(%rcx),%eax
movups (%rcx),%xmm0
movq %rcx,%r11
movl %eax,%r10d
testl %r8d,%r8d
jz .Lecb_decrypt
cmpq $128,%rdx
jb .Lecb_enc_tail
movdqu (%rdi),%xmm2
movdqu 16(%rdi),%xmm3
movdqu 32(%rdi),%xmm4
movdqu 48(%rdi),%xmm5
movdqu 64(%rdi),%xmm6
movdqu 80(%rdi),%xmm7
movdqu 96(%rdi),%xmm8
movdqu 112(%rdi),%xmm9
leaq 128(%rdi),%rdi
subq $128,%rdx
jmp .Lecb_enc_loop8_enter
.align 16
.Lecb_enc_loop8:
movups %xmm2,(%rsi)
movq %r11,%rcx
movdqu (%rdi),%xmm2
movl %r10d,%eax
movups %xmm3,16(%rsi)
movdqu 16(%rdi),%xmm3
movups %xmm4,32(%rsi)
movdqu 32(%rdi),%xmm4
movups %xmm5,48(%rsi)
movdqu 48(%rdi),%xmm5
movups %xmm6,64(%rsi)
movdqu 64(%rdi),%xmm6
movups %xmm7,80(%rsi)
movdqu 80(%rdi),%xmm7
movups %xmm8,96(%rsi)
movdqu 96(%rdi),%xmm8
movups %xmm9,112(%rsi)
leaq 128(%rsi),%rsi
movdqu 112(%rdi),%xmm9
leaq 128(%rdi),%rdi
.Lecb_enc_loop8_enter:
call _aesni_encrypt8
subq $128,%rdx
jnc .Lecb_enc_loop8
movups %xmm2,(%rsi)
movq %r11,%rcx
movups %xmm3,16(%rsi)
movl %r10d,%eax
movups %xmm4,32(%rsi)
movups %xmm5,48(%rsi)
movups %xmm6,64(%rsi)
movups %xmm7,80(%rsi)
movups %xmm8,96(%rsi)
movups %xmm9,112(%rsi)
leaq 128(%rsi),%rsi
addq $128,%rdx
jz .Lecb_ret
.Lecb_enc_tail:
movups (%rdi),%xmm2
cmpq $32,%rdx
jb .Lecb_enc_one
movups 16(%rdi),%xmm3
je .Lecb_enc_two
movups 32(%rdi),%xmm4
cmpq $64,%rdx
jb .Lecb_enc_three
movups 48(%rdi),%xmm5
je .Lecb_enc_four
movups 64(%rdi),%xmm6
cmpq $96,%rdx
jb .Lecb_enc_five
movups 80(%rdi),%xmm7
je .Lecb_enc_six
movdqu 96(%rdi),%xmm8
call _aesni_encrypt8
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
movups %xmm4,32(%rsi)
movups %xmm5,48(%rsi)
movups %xmm6,64(%rsi)
movups %xmm7,80(%rsi)
movups %xmm8,96(%rsi)
jmp .Lecb_ret
.align 16
.Lecb_enc_one:
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
.Loop_enc1_3:
.byte 102,15,56,220,209
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz .Loop_enc1_3
.byte 102,15,56,221,209
movups %xmm2,(%rsi)
jmp .Lecb_ret
.align 16
.Lecb_enc_two:
xorps %xmm4,%xmm4
call _aesni_encrypt3
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
jmp .Lecb_ret
.align 16
.Lecb_enc_three:
call _aesni_encrypt3
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
movups %xmm4,32(%rsi)
jmp .Lecb_ret
.align 16
.Lecb_enc_four:
call _aesni_encrypt4
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
movups %xmm4,32(%rsi)
movups %xmm5,48(%rsi)
jmp .Lecb_ret
.align 16
.Lecb_enc_five:
xorps %xmm7,%xmm7
call _aesni_encrypt6
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
movups %xmm4,32(%rsi)
movups %xmm5,48(%rsi)
movups %xmm6,64(%rsi)
jmp .Lecb_ret
.align 16
.Lecb_enc_six:
call _aesni_encrypt6
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
movups %xmm4,32(%rsi)
movups %xmm5,48(%rsi)
movups %xmm6,64(%rsi)
movups %xmm7,80(%rsi)
jmp .Lecb_ret
.align 16
.Lecb_decrypt:
cmpq $128,%rdx
jb .Lecb_dec_tail
movdqu (%rdi),%xmm2
movdqu 16(%rdi),%xmm3
movdqu 32(%rdi),%xmm4
movdqu 48(%rdi),%xmm5
movdqu 64(%rdi),%xmm6
movdqu 80(%rdi),%xmm7
movdqu 96(%rdi),%xmm8
movdqu 112(%rdi),%xmm9
leaq 128(%rdi),%rdi
subq $128,%rdx
jmp .Lecb_dec_loop8_enter
.align 16
.Lecb_dec_loop8:
movups %xmm2,(%rsi)
movq %r11,%rcx
movdqu (%rdi),%xmm2
movl %r10d,%eax
movups %xmm3,16(%rsi)
movdqu 16(%rdi),%xmm3
movups %xmm4,32(%rsi)
movdqu 32(%rdi),%xmm4
movups %xmm5,48(%rsi)
movdqu 48(%rdi),%xmm5
movups %xmm6,64(%rsi)
movdqu 64(%rdi),%xmm6
movups %xmm7,80(%rsi)
movdqu 80(%rdi),%xmm7
movups %xmm8,96(%rsi)
movdqu 96(%rdi),%xmm8
movups %xmm9,112(%rsi)
leaq 128(%rsi),%rsi
movdqu 112(%rdi),%xmm9
leaq 128(%rdi),%rdi
.Lecb_dec_loop8_enter:
call _aesni_decrypt8
movups (%r11),%xmm0
subq $128,%rdx
jnc .Lecb_dec_loop8
movups %xmm2,(%rsi)
movq %r11,%rcx
movups %xmm3,16(%rsi)
movl %r10d,%eax
movups %xmm4,32(%rsi)
movups %xmm5,48(%rsi)
movups %xmm6,64(%rsi)
movups %xmm7,80(%rsi)
movups %xmm8,96(%rsi)
movups %xmm9,112(%rsi)
leaq 128(%rsi),%rsi
addq $128,%rdx
jz .Lecb_ret
.Lecb_dec_tail:
movups (%rdi),%xmm2
cmpq $32,%rdx
jb .Lecb_dec_one
movups 16(%rdi),%xmm3
je .Lecb_dec_two
movups 32(%rdi),%xmm4
cmpq $64,%rdx
jb .Lecb_dec_three
movups 48(%rdi),%xmm5
je .Lecb_dec_four
movups 64(%rdi),%xmm6
cmpq $96,%rdx
jb .Lecb_dec_five
movups 80(%rdi),%xmm7
je .Lecb_dec_six
movups 96(%rdi),%xmm8
movups (%rcx),%xmm0
call _aesni_decrypt8
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
movups %xmm4,32(%rsi)
movups %xmm5,48(%rsi)
movups %xmm6,64(%rsi)
movups %xmm7,80(%rsi)
movups %xmm8,96(%rsi)
jmp .Lecb_ret
.align 16
.Lecb_dec_one:
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
.Loop_dec1_4:
.byte 102,15,56,222,209
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz .Loop_dec1_4
.byte 102,15,56,223,209
movups %xmm2,(%rsi)
jmp .Lecb_ret
.align 16
.Lecb_dec_two:
xorps %xmm4,%xmm4
call _aesni_decrypt3
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
jmp .Lecb_ret
.align 16
.Lecb_dec_three:
call _aesni_decrypt3
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
movups %xmm4,32(%rsi)
jmp .Lecb_ret
.align 16
.Lecb_dec_four:
call _aesni_decrypt4
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
movups %xmm4,32(%rsi)
movups %xmm5,48(%rsi)
jmp .Lecb_ret
.align 16
.Lecb_dec_five:
xorps %xmm7,%xmm7
call _aesni_decrypt6
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
movups %xmm4,32(%rsi)
movups %xmm5,48(%rsi)
movups %xmm6,64(%rsi)
jmp .Lecb_ret
.align 16
.Lecb_dec_six:
call _aesni_decrypt6
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
movups %xmm4,32(%rsi)
movups %xmm5,48(%rsi)
movups %xmm6,64(%rsi)
movups %xmm7,80(%rsi)
.Lecb_ret:
.byte 0xf3,0xc3
.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
.globl aesni_ccm64_encrypt_blocks
.type aesni_ccm64_encrypt_blocks,@function
.align 16
aesni_ccm64_encrypt_blocks:
movl 240(%rcx),%eax
movdqu (%r8),%xmm9
movdqa .Lincrement64(%rip),%xmm6
movdqa .Lbswap_mask(%rip),%xmm7
shrl $1,%eax
leaq 0(%rcx),%r11
movdqu (%r9),%xmm3
movdqa %xmm9,%xmm2
movl %eax,%r10d
.byte 102,68,15,56,0,207
jmp .Lccm64_enc_outer
.align 16
.Lccm64_enc_outer:
movups (%r11),%xmm0
movl %r10d,%eax
movups (%rdi),%xmm8
xorps %xmm0,%xmm2
movups 16(%r11),%xmm1
xorps %xmm8,%xmm0
leaq 32(%r11),%rcx
xorps %xmm0,%xmm3
movups (%rcx),%xmm0
.Lccm64_enc2_loop:
.byte 102,15,56,220,209
decl %eax
.byte 102,15,56,220,217
movups 16(%rcx),%xmm1
.byte 102,15,56,220,208
leaq 32(%rcx),%rcx
.byte 102,15,56,220,216
movups 0(%rcx),%xmm0
jnz .Lccm64_enc2_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
paddq %xmm6,%xmm9
.byte 102,15,56,221,208
.byte 102,15,56,221,216
decq %rdx
leaq 16(%rdi),%rdi
xorps %xmm2,%xmm8
movdqa %xmm9,%xmm2
movups %xmm8,(%rsi)
leaq 16(%rsi),%rsi
.byte 102,15,56,0,215
jnz .Lccm64_enc_outer
movups %xmm3,(%r9)
.byte 0xf3,0xc3
.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
.globl aesni_ccm64_decrypt_blocks
.type aesni_ccm64_decrypt_blocks,@function
.align 16
aesni_ccm64_decrypt_blocks:
movl 240(%rcx),%eax
movups (%r8),%xmm9
movdqu (%r9),%xmm3
movdqa .Lincrement64(%rip),%xmm6
movdqa .Lbswap_mask(%rip),%xmm7
movaps %xmm9,%xmm2
movl %eax,%r10d
movq %rcx,%r11
.byte 102,68,15,56,0,207
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
.Loop_enc1_5:
.byte 102,15,56,220,209
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz .Loop_enc1_5
.byte 102,15,56,221,209
movups (%rdi),%xmm8
paddq %xmm6,%xmm9
leaq 16(%rdi),%rdi
jmp .Lccm64_dec_outer
.align 16
.Lccm64_dec_outer:
xorps %xmm2,%xmm8
movdqa %xmm9,%xmm2
movl %r10d,%eax
movups %xmm8,(%rsi)
leaq 16(%rsi),%rsi
.byte 102,15,56,0,215
subq $1,%rdx
jz .Lccm64_dec_break
movups (%r11),%xmm0
shrl $1,%eax
movups 16(%r11),%xmm1
xorps %xmm0,%xmm8
leaq 32(%r11),%rcx
xorps %xmm0,%xmm2
xorps %xmm8,%xmm3
movups (%rcx),%xmm0
.Lccm64_dec2_loop:
.byte 102,15,56,220,209
decl %eax
.byte 102,15,56,220,217
movups 16(%rcx),%xmm1
.byte 102,15,56,220,208
leaq 32(%rcx),%rcx
.byte 102,15,56,220,216
movups 0(%rcx),%xmm0
jnz .Lccm64_dec2_loop
movups (%rdi),%xmm8
paddq %xmm6,%xmm9
.byte 102,15,56,220,209
.byte 102,15,56,220,217
leaq 16(%rdi),%rdi
.byte 102,15,56,221,208
.byte 102,15,56,221,216
jmp .Lccm64_dec_outer
.align 16
.Lccm64_dec_break:
movups (%r11),%xmm0
movups 16(%r11),%xmm1
xorps %xmm0,%xmm8
leaq 32(%r11),%r11
xorps %xmm8,%xmm3
.Loop_enc1_6:
.byte 102,15,56,220,217
decl %eax
movups (%r11),%xmm1
leaq 16(%r11),%r11
jnz .Loop_enc1_6
.byte 102,15,56,221,217
movups %xmm3,(%r9)
.byte 0xf3,0xc3
.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
.globl aesni_ctr32_encrypt_blocks
.type aesni_ctr32_encrypt_blocks,@function
.align 16
aesni_ctr32_encrypt_blocks:
cmpq $1,%rdx
je .Lctr32_one_shortcut
movdqu (%r8),%xmm14
movdqa .Lbswap_mask(%rip),%xmm15
xorl %eax,%eax
.byte 102,69,15,58,22,242,3
.byte 102,68,15,58,34,240,3
movl 240(%rcx),%eax
bswapl %r10d
pxor %xmm12,%xmm12
pxor %xmm13,%xmm13
.byte 102,69,15,58,34,226,0
leaq 3(%r10),%r11
.byte 102,69,15,58,34,235,0
incl %r10d
.byte 102,69,15,58,34,226,1
incq %r11
.byte 102,69,15,58,34,235,1
incl %r10d
.byte 102,69,15,58,34,226,2
incq %r11
.byte 102,69,15,58,34,235,2
movdqa %xmm12,-40(%rsp)
.byte 102,69,15,56,0,231
movdqa %xmm13,-24(%rsp)
.byte 102,69,15,56,0,239
pshufd $192,%xmm12,%xmm2
pshufd $128,%xmm12,%xmm3
pshufd $64,%xmm12,%xmm4
cmpq $6,%rdx
jb .Lctr32_tail
shrl $1,%eax
movq %rcx,%r11
movl %eax,%r10d
subq $6,%rdx
jmp .Lctr32_loop6
.align 16
.Lctr32_loop6:
pshufd $192,%xmm13,%xmm5
por %xmm14,%xmm2
movups (%r11),%xmm0
pshufd $128,%xmm13,%xmm6
por %xmm14,%xmm3
movups 16(%r11),%xmm1
pshufd $64,%xmm13,%xmm7
por %xmm14,%xmm4
por %xmm14,%xmm5
xorps %xmm0,%xmm2
por %xmm14,%xmm6
por %xmm14,%xmm7
pxor %xmm0,%xmm3
.byte 102,15,56,220,209
leaq 32(%r11),%rcx
pxor %xmm0,%xmm4
.byte 102,15,56,220,217
movdqa .Lincrement32(%rip),%xmm13
pxor %xmm0,%xmm5
.byte 102,15,56,220,225
movdqa -40(%rsp),%xmm12
pxor %xmm0,%xmm6
.byte 102,15,56,220,233
pxor %xmm0,%xmm7
movups (%rcx),%xmm0
decl %eax
.byte 102,15,56,220,241
.byte 102,15,56,220,249
jmp .Lctr32_enc_loop6_enter
.align 16
.Lctr32_enc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
.Lctr32_enc_loop6_enter:
movups 16(%rcx),%xmm1
.byte 102,15,56,220,208
.byte 102,15,56,220,216
leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
movups (%rcx),%xmm0
jnz .Lctr32_enc_loop6
.byte 102,15,56,220,209
paddd %xmm13,%xmm12
.byte 102,15,56,220,217
paddd -24(%rsp),%xmm13
.byte 102,15,56,220,225
movdqa %xmm12,-40(%rsp)
.byte 102,15,56,220,233
movdqa %xmm13,-24(%rsp)
.byte 102,15,56,220,241
.byte 102,69,15,56,0,231
.byte 102,15,56,220,249
.byte 102,69,15,56,0,239
.byte 102,15,56,221,208
movups (%rdi),%xmm8
.byte 102,15,56,221,216
movups 16(%rdi),%xmm9
.byte 102,15,56,221,224
movups 32(%rdi),%xmm10
.byte 102,15,56,221,232
movups 48(%rdi),%xmm11
.byte 102,15,56,221,240
movups 64(%rdi),%xmm1
.byte 102,15,56,221,248
movups 80(%rdi),%xmm0
leaq 96(%rdi),%rdi
xorps %xmm2,%xmm8
pshufd $192,%xmm12,%xmm2
xorps %xmm3,%xmm9
pshufd $128,%xmm12,%xmm3
movups %xmm8,(%rsi)
xorps %xmm4,%xmm10
pshufd $64,%xmm12,%xmm4
movups %xmm9,16(%rsi)
xorps %xmm5,%xmm11
movups %xmm10,32(%rsi)
xorps %xmm6,%xmm1
movups %xmm11,48(%rsi)
xorps %xmm7,%xmm0
movups %xmm1,64(%rsi)
movups %xmm0,80(%rsi)
leaq 96(%rsi),%rsi
movl %r10d,%eax
subq $6,%rdx
jnc .Lctr32_loop6
addq $6,%rdx
jz .Lctr32_done
movq %r11,%rcx
leal 1(%rax,%rax,1),%eax
.Lctr32_tail:
por %xmm14,%xmm2
movups (%rdi),%xmm8
cmpq $2,%rdx
jb .Lctr32_one
por %xmm14,%xmm3
movups 16(%rdi),%xmm9
je .Lctr32_two
pshufd $192,%xmm13,%xmm5
por %xmm14,%xmm4
movups 32(%rdi),%xmm10
cmpq $4,%rdx
jb .Lctr32_three
pshufd $128,%xmm13,%xmm6
por %xmm14,%xmm5
movups 48(%rdi),%xmm11
je .Lctr32_four
por %xmm14,%xmm6
xorps %xmm7,%xmm7
call _aesni_encrypt6
movups 64(%rdi),%xmm1
xorps %xmm2,%xmm8
xorps %xmm3,%xmm9
movups %xmm8,(%rsi)
xorps %xmm4,%xmm10
movups %xmm9,16(%rsi)
xorps %xmm5,%xmm11
movups %xmm10,32(%rsi)
xorps %xmm6,%xmm1
movups %xmm11,48(%rsi)
movups %xmm1,64(%rsi)
jmp .Lctr32_done
.align 16
.Lctr32_one_shortcut:
movups (%r8),%xmm2
movups (%rdi),%xmm8
movl 240(%rcx),%eax
.Lctr32_one:
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
.Loop_enc1_7:
.byte 102,15,56,220,209
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz .Loop_enc1_7
.byte 102,15,56,221,209
xorps %xmm2,%xmm8
movups %xmm8,(%rsi)
jmp .Lctr32_done
.align 16
.Lctr32_two:
xorps %xmm4,%xmm4
call _aesni_encrypt3
xorps %xmm2,%xmm8
xorps %xmm3,%xmm9
movups %xmm8,(%rsi)
movups %xmm9,16(%rsi)
jmp .Lctr32_done
.align 16
.Lctr32_three:
call _aesni_encrypt3
xorps %xmm2,%xmm8
xorps %xmm3,%xmm9
movups %xmm8,(%rsi)
xorps %xmm4,%xmm10
movups %xmm9,16(%rsi)
movups %xmm10,32(%rsi)
jmp .Lctr32_done
.align 16
.Lctr32_four:
call _aesni_encrypt4
xorps %xmm2,%xmm8
xorps %xmm3,%xmm9
movups %xmm8,(%rsi)
xorps %xmm4,%xmm10
movups %xmm9,16(%rsi)
xorps %xmm5,%xmm11
movups %xmm10,32(%rsi)
movups %xmm11,48(%rsi)
.Lctr32_done:
.byte 0xf3,0xc3
.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
.globl aesni_xts_encrypt
.type aesni_xts_encrypt,@function
.align 16
aesni_xts_encrypt:
leaq -104(%rsp),%rsp
movups (%r9),%xmm15
movl 240(%r8),%eax
movl 240(%rcx),%r10d
movups (%r8),%xmm0
movups 16(%r8),%xmm1
leaq 32(%r8),%r8
xorps %xmm0,%xmm15
.Loop_enc1_8:
.byte 102,68,15,56,220,249
decl %eax
movups (%r8),%xmm1
leaq 16(%r8),%r8
jnz .Loop_enc1_8
.byte 102,68,15,56,221,249
movq %rcx,%r11
movl %r10d,%eax
movq %rdx,%r9
andq $-16,%rdx
movdqa .Lxts_magic(%rip),%xmm8
pxor %xmm14,%xmm14
pcmpgtd %xmm15,%xmm14
pshufd $19,%xmm14,%xmm9
pxor %xmm14,%xmm14
movdqa %xmm15,%xmm10
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
pcmpgtd %xmm15,%xmm14
pxor %xmm9,%xmm15
pshufd $19,%xmm14,%xmm9
pxor %xmm14,%xmm14
movdqa %xmm15,%xmm11
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
pcmpgtd %xmm15,%xmm14
pxor %xmm9,%xmm15
pshufd $19,%xmm14,%xmm9
pxor %xmm14,%xmm14
movdqa %xmm15,%xmm12
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
pcmpgtd %xmm15,%xmm14
pxor %xmm9,%xmm15
pshufd $19,%xmm14,%xmm9
pxor %xmm14,%xmm14
movdqa %xmm15,%xmm13
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
pcmpgtd %xmm15,%xmm14
pxor %xmm9,%xmm15
subq $96,%rdx
jc .Lxts_enc_short
shrl $1,%eax
subl $1,%eax
movl %eax,%r10d
jmp .Lxts_enc_grandloop
.align 16
.Lxts_enc_grandloop:
pshufd $19,%xmm14,%xmm9
movdqa %xmm15,%xmm14
paddq %xmm15,%xmm15
movdqu 0(%rdi),%xmm2
pand %xmm8,%xmm9
movdqu 16(%rdi),%xmm3
pxor %xmm9,%xmm15
movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
movdqu 48(%rdi),%xmm5
pxor %xmm11,%xmm3
movdqu 64(%rdi),%xmm6
pxor %xmm12,%xmm4
movdqu 80(%rdi),%xmm7
leaq 96(%rdi),%rdi
pxor %xmm13,%xmm5
movups (%r11),%xmm0
pxor %xmm14,%xmm6
pxor %xmm15,%xmm7
movups 16(%r11),%xmm1
pxor %xmm0,%xmm2
pxor %xmm0,%xmm3
movdqa %xmm10,0(%rsp)
.byte 102,15,56,220,209
leaq 32(%r11),%rcx
pxor %xmm0,%xmm4
movdqa %xmm11,16(%rsp)
.byte 102,15,56,220,217
pxor %xmm0,%xmm5
movdqa %xmm12,32(%rsp)
.byte 102,15,56,220,225
pxor %xmm0,%xmm6
movdqa %xmm13,48(%rsp)
.byte 102,15,56,220,233
pxor %xmm0,%xmm7
movups (%rcx),%xmm0
decl %eax
movdqa %xmm14,64(%rsp)
.byte 102,15,56,220,241
movdqa %xmm15,80(%rsp)
.byte 102,15,56,220,249
pxor %xmm14,%xmm14
pcmpgtd %xmm15,%xmm14
jmp .Lxts_enc_loop6_enter
.align 16
.Lxts_enc_loop6:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
decl %eax
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
.Lxts_enc_loop6_enter:
movups 16(%rcx),%xmm1
.byte 102,15,56,220,208
.byte 102,15,56,220,216
leaq 32(%rcx),%rcx
.byte 102,15,56,220,224
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
movups (%rcx),%xmm0
jnz .Lxts_enc_loop6
pshufd $19,%xmm14,%xmm9
pxor %xmm14,%xmm14
paddq %xmm15,%xmm15
.byte 102,15,56,220,209
pand %xmm8,%xmm9
.byte 102,15,56,220,217
pcmpgtd %xmm15,%xmm14
.byte 102,15,56,220,225
pxor %xmm9,%xmm15
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
movups 16(%rcx),%xmm1
pshufd $19,%xmm14,%xmm9
pxor %xmm14,%xmm14
movdqa %xmm15,%xmm10
paddq %xmm15,%xmm15
.byte 102,15,56,220,208
pand %xmm8,%xmm9
.byte 102,15,56,220,216
pcmpgtd %xmm15,%xmm14
.byte 102,15,56,220,224
pxor %xmm9,%xmm15
.byte 102,15,56,220,232
.byte 102,15,56,220,240
.byte 102,15,56,220,248
movups 32(%rcx),%xmm0
pshufd $19,%xmm14,%xmm9
pxor %xmm14,%xmm14
movdqa %xmm15,%xmm11
paddq %xmm15,%xmm15
.byte 102,15,56,220,209
pand %xmm8,%xmm9
.byte 102,15,56,220,217
pcmpgtd %xmm15,%xmm14
.byte 102,15,56,220,225
pxor %xmm9,%xmm15
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
pshufd $19,%xmm14,%xmm9
pxor %xmm14,%xmm14
movdqa %xmm15,%xmm12
paddq %xmm15,%xmm15
.byte 102,15,56,221,208
pand %xmm8,%xmm9
.byte 102,15,56,221,216
pcmpgtd %xmm15,%xmm14
.byte 102,15,56,221,224
pxor %xmm9,%xmm15
.byte 102,15,56,221,232
.byte 102,15,56,221,240
.byte 102,15,56,221,248
pshufd $19,%xmm14,%xmm9
pxor %xmm14,%xmm14
movdqa %xmm15,%xmm13
paddq %xmm15,%xmm15
xorps 0(%rsp),%xmm2
pand %xmm8,%xmm9
xorps 16(%rsp),%xmm3
pcmpgtd %xmm15,%xmm14
pxor %xmm9,%xmm15
xorps 32(%rsp),%xmm4
movups %xmm2,0(%rsi)
xorps 48(%rsp),%xmm5
movups %xmm3,16(%rsi)
xorps 64(%rsp),%xmm6
movups %xmm4,32(%rsi)
xorps 80(%rsp),%xmm7
movups %xmm5,48(%rsi)
movl %r10d,%eax
movups %xmm6,64(%rsi)
movups %xmm7,80(%rsi)
leaq 96(%rsi),%rsi
subq $96,%rdx
jnc .Lxts_enc_grandloop
leal 3(%rax,%rax,1),%eax
movq %r11,%rcx
movl %eax,%r10d
.Lxts_enc_short:
addq $96,%rdx
jz .Lxts_enc_done
cmpq $32,%rdx
jb .Lxts_enc_one
je .Lxts_enc_two
cmpq $64,%rdx
jb .Lxts_enc_three
je .Lxts_enc_four
pshufd $19,%xmm14,%xmm9
movdqa %xmm15,%xmm14
paddq %xmm15,%xmm15
movdqu (%rdi),%xmm2
pand %xmm8,%xmm9
movdqu 16(%rdi),%xmm3
pxor %xmm9,%xmm15
movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
movdqu 48(%rdi),%xmm5
pxor %xmm11,%xmm3
movdqu 64(%rdi),%xmm6
leaq 80(%rdi),%rdi
pxor %xmm12,%xmm4
pxor %xmm13,%xmm5
pxor %xmm14,%xmm6
call _aesni_encrypt6
xorps %xmm10,%xmm2
movdqa %xmm15,%xmm10
xorps %xmm11,%xmm3
xorps %xmm12,%xmm4
movdqu %xmm2,(%rsi)
xorps %xmm13,%xmm5
movdqu %xmm3,16(%rsi)
xorps %xmm14,%xmm6
movdqu %xmm4,32(%rsi)
movdqu %xmm5,48(%rsi)
movdqu %xmm6,64(%rsi)
leaq 80(%rsi),%rsi
jmp .Lxts_enc_done
.align 16
.Lxts_enc_one:
movups (%rdi),%xmm2
leaq 16(%rdi),%rdi
xorps %xmm10,%xmm2
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
.Loop_enc1_9:
.byte 102,15,56,220,209
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz .Loop_enc1_9
.byte 102,15,56,221,209
xorps %xmm10,%xmm2
movdqa %xmm11,%xmm10
movups %xmm2,(%rsi)
leaq 16(%rsi),%rsi
jmp .Lxts_enc_done
.align 16
.Lxts_enc_two:
movups (%rdi),%xmm2
movups 16(%rdi),%xmm3
leaq 32(%rdi),%rdi
xorps %xmm10,%xmm2
xorps %xmm11,%xmm3
call _aesni_encrypt3
xorps %xmm10,%xmm2
movdqa %xmm12,%xmm10
xorps %xmm11,%xmm3
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
leaq 32(%rsi),%rsi
jmp .Lxts_enc_done
.align 16
.Lxts_enc_three:
movups (%rdi),%xmm2
movups 16(%rdi),%xmm3
movups 32(%rdi),%xmm4
leaq 48(%rdi),%rdi
xorps %xmm10,%xmm2
xorps %xmm11,%xmm3
xorps %xmm12,%xmm4
call _aesni_encrypt3
xorps %xmm10,%xmm2
movdqa %xmm13,%xmm10
xorps %xmm11,%xmm3
xorps %xmm12,%xmm4
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
movups %xmm4,32(%rsi)
leaq 48(%rsi),%rsi
jmp .Lxts_enc_done
.align 16
.Lxts_enc_four:
movups (%rdi),%xmm2
movups 16(%rdi),%xmm3
movups 32(%rdi),%xmm4
xorps %xmm10,%xmm2
movups 48(%rdi),%xmm5
leaq 64(%rdi),%rdi
xorps %xmm11,%xmm3
xorps %xmm12,%xmm4
xorps %xmm13,%xmm5
call _aesni_encrypt4
xorps %xmm10,%xmm2
movdqa %xmm15,%xmm10
xorps %xmm11,%xmm3
xorps %xmm12,%xmm4
movups %xmm2,(%rsi)
xorps %xmm13,%xmm5
movups %xmm3,16(%rsi)
movups %xmm4,32(%rsi)
movups %xmm5,48(%rsi)
leaq 64(%rsi),%rsi
jmp .Lxts_enc_done
.align 16
.Lxts_enc_done:
andq $15,%r9
jz .Lxts_enc_ret
movq %r9,%rdx
.Lxts_enc_steal:
movzbl (%rdi),%eax
movzbl -16(%rsi),%ecx
leaq 1(%rdi),%rdi
movb %al,-16(%rsi)
movb %cl,0(%rsi)
leaq 1(%rsi),%rsi
subq $1,%rdx
jnz .Lxts_enc_steal
subq %r9,%rsi
movq %r11,%rcx
movl %r10d,%eax
movups -16(%rsi),%xmm2
xorps %xmm10,%xmm2
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
.Loop_enc1_10:
.byte 102,15,56,220,209
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz .Loop_enc1_10
.byte 102,15,56,221,209
xorps %xmm10,%xmm2
movups %xmm2,-16(%rsi)
.Lxts_enc_ret:
leaq 104(%rsp),%rsp
.Lxts_enc_epilogue:
.byte 0xf3,0xc3
.size aesni_xts_encrypt,.-aesni_xts_encrypt
.globl aesni_xts_decrypt
.type aesni_xts_decrypt,@function
.align 16
aesni_xts_decrypt:
leaq -104(%rsp),%rsp
movups (%r9),%xmm15
movl 240(%r8),%eax
movl 240(%rcx),%r10d
movups (%r8),%xmm0
movups 16(%r8),%xmm1
leaq 32(%r8),%r8
xorps %xmm0,%xmm15
.Loop_enc1_11:
.byte 102,68,15,56,220,249
decl %eax
movups (%r8),%xmm1
leaq 16(%r8),%r8
jnz .Loop_enc1_11
.byte 102,68,15,56,221,249
xorl %eax,%eax
testq $15,%rdx
setnz %al
shlq $4,%rax
subq %rax,%rdx
movq %rcx,%r11
movl %r10d,%eax
movq %rdx,%r9
andq $-16,%rdx
movdqa .Lxts_magic(%rip),%xmm8
pxor %xmm14,%xmm14
pcmpgtd %xmm15,%xmm14
pshufd $19,%xmm14,%xmm9
pxor %xmm14,%xmm14
movdqa %xmm15,%xmm10
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
pcmpgtd %xmm15,%xmm14
pxor %xmm9,%xmm15
pshufd $19,%xmm14,%xmm9
pxor %xmm14,%xmm14
movdqa %xmm15,%xmm11
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
pcmpgtd %xmm15,%xmm14
pxor %xmm9,%xmm15
pshufd $19,%xmm14,%xmm9
pxor %xmm14,%xmm14
movdqa %xmm15,%xmm12
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
pcmpgtd %xmm15,%xmm14
pxor %xmm9,%xmm15
pshufd $19,%xmm14,%xmm9
pxor %xmm14,%xmm14
movdqa %xmm15,%xmm13
paddq %xmm15,%xmm15
pand %xmm8,%xmm9
pcmpgtd %xmm15,%xmm14
pxor %xmm9,%xmm15
subq $96,%rdx
jc .Lxts_dec_short
shrl $1,%eax
subl $1,%eax
movl %eax,%r10d
jmp .Lxts_dec_grandloop
.align 16
.Lxts_dec_grandloop:
pshufd $19,%xmm14,%xmm9
movdqa %xmm15,%xmm14
paddq %xmm15,%xmm15
movdqu 0(%rdi),%xmm2
pand %xmm8,%xmm9
movdqu 16(%rdi),%xmm3
pxor %xmm9,%xmm15
movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
movdqu 48(%rdi),%xmm5
pxor %xmm11,%xmm3
movdqu 64(%rdi),%xmm6
pxor %xmm12,%xmm4
movdqu 80(%rdi),%xmm7
leaq 96(%rdi),%rdi
pxor %xmm13,%xmm5
movups (%r11),%xmm0
pxor %xmm14,%xmm6
pxor %xmm15,%xmm7
movups 16(%r11),%xmm1
pxor %xmm0,%xmm2
pxor %xmm0,%xmm3
movdqa %xmm10,0(%rsp)
.byte 102,15,56,222,209
leaq 32(%r11),%rcx
pxor %xmm0,%xmm4
movdqa %xmm11,16(%rsp)
.byte 102,15,56,222,217
pxor %xmm0,%xmm5
movdqa %xmm12,32(%rsp)
.byte 102,15,56,222,225
pxor %xmm0,%xmm6
movdqa %xmm13,48(%rsp)
.byte 102,15,56,222,233
pxor %xmm0,%xmm7
movups (%rcx),%xmm0
decl %eax
movdqa %xmm14,64(%rsp)
.byte 102,15,56,222,241
movdqa %xmm15,80(%rsp)
.byte 102,15,56,222,249
pxor %xmm14,%xmm14
pcmpgtd %xmm15,%xmm14
jmp .Lxts_dec_loop6_enter
.align 16
.Lxts_dec_loop6:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
decl %eax
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
.Lxts_dec_loop6_enter:
movups 16(%rcx),%xmm1
.byte 102,15,56,222,208
.byte 102,15,56,222,216
leaq 32(%rcx),%rcx
.byte 102,15,56,222,224
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
movups (%rcx),%xmm0
jnz .Lxts_dec_loop6
pshufd $19,%xmm14,%xmm9
pxor %xmm14,%xmm14
paddq %xmm15,%xmm15
.byte 102,15,56,222,209
pand %xmm8,%xmm9
.byte 102,15,56,222,217
pcmpgtd %xmm15,%xmm14
.byte 102,15,56,222,225
pxor %xmm9,%xmm15
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
movups 16(%rcx),%xmm1
pshufd $19,%xmm14,%xmm9
pxor %xmm14,%xmm14
movdqa %xmm15,%xmm10
paddq %xmm15,%xmm15
.byte 102,15,56,222,208
pand %xmm8,%xmm9
.byte 102,15,56,222,216
pcmpgtd %xmm15,%xmm14
.byte 102,15,56,222,224
pxor %xmm9,%xmm15
.byte 102,15,56,222,232
.byte 102,15,56,222,240
.byte 102,15,56,222,248
movups 32(%rcx),%xmm0
pshufd $19,%xmm14,%xmm9
pxor %xmm14,%xmm14
movdqa %xmm15,%xmm11
paddq %xmm15,%xmm15
.byte 102,15,56,222,209
pand %xmm8,%xmm9
.byte 102,15,56,222,217
pcmpgtd %xmm15,%xmm14
.byte 102,15,56,222,225
pxor %xmm9,%xmm15
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
pshufd $19,%xmm14,%xmm9
pxor %xmm14,%xmm14
movdqa %xmm15,%xmm12
paddq %xmm15,%xmm15
.byte 102,15,56,223,208
pand %xmm8,%xmm9
.byte 102,15,56,223,216
pcmpgtd %xmm15,%xmm14
.byte 102,15,56,223,224
pxor %xmm9,%xmm15
.byte 102,15,56,223,232
.byte 102,15,56,223,240
.byte 102,15,56,223,248
pshufd $19,%xmm14,%xmm9
pxor %xmm14,%xmm14
movdqa %xmm15,%xmm13
paddq %xmm15,%xmm15
xorps 0(%rsp),%xmm2
pand %xmm8,%xmm9
xorps 16(%rsp),%xmm3
pcmpgtd %xmm15,%xmm14
pxor %xmm9,%xmm15
xorps 32(%rsp),%xmm4
movups %xmm2,0(%rsi)
xorps 48(%rsp),%xmm5
movups %xmm3,16(%rsi)
xorps 64(%rsp),%xmm6
movups %xmm4,32(%rsi)
xorps 80(%rsp),%xmm7
movups %xmm5,48(%rsi)
movl %r10d,%eax
movups %xmm6,64(%rsi)
movups %xmm7,80(%rsi)
leaq 96(%rsi),%rsi
subq $96,%rdx
jnc .Lxts_dec_grandloop
leal 3(%rax,%rax,1),%eax
movq %r11,%rcx
movl %eax,%r10d
.Lxts_dec_short:
addq $96,%rdx
jz .Lxts_dec_done
cmpq $32,%rdx
jb .Lxts_dec_one
je .Lxts_dec_two
cmpq $64,%rdx
jb .Lxts_dec_three
je .Lxts_dec_four
pshufd $19,%xmm14,%xmm9
movdqa %xmm15,%xmm14
paddq %xmm15,%xmm15
movdqu (%rdi),%xmm2
pand %xmm8,%xmm9
movdqu 16(%rdi),%xmm3
pxor %xmm9,%xmm15
movdqu 32(%rdi),%xmm4
pxor %xmm10,%xmm2
movdqu 48(%rdi),%xmm5
pxor %xmm11,%xmm3
movdqu 64(%rdi),%xmm6
leaq 80(%rdi),%rdi
pxor %xmm12,%xmm4
pxor %xmm13,%xmm5
pxor %xmm14,%xmm6
call _aesni_decrypt6
xorps %xmm10,%xmm2
xorps %xmm11,%xmm3
xorps %xmm12,%xmm4
movdqu %xmm2,(%rsi)
xorps %xmm13,%xmm5
movdqu %xmm3,16(%rsi)
xorps %xmm14,%xmm6
movdqu %xmm4,32(%rsi)
pxor %xmm14,%xmm14
movdqu %xmm5,48(%rsi)
pcmpgtd %xmm15,%xmm14
movdqu %xmm6,64(%rsi)
leaq 80(%rsi),%rsi
pshufd $19,%xmm14,%xmm11
andq $15,%r9
jz .Lxts_dec_ret
movdqa %xmm15,%xmm10
paddq %xmm15,%xmm15
pand %xmm8,%xmm11
pxor %xmm15,%xmm11
jmp .Lxts_dec_done2
.align 16
.Lxts_dec_one:
movups (%rdi),%xmm2
leaq 16(%rdi),%rdi
xorps %xmm10,%xmm2
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
.Loop_dec1_12:
.byte 102,15,56,222,209
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz .Loop_dec1_12
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movdqa %xmm11,%xmm10
movups %xmm2,(%rsi)
movdqa %xmm12,%xmm11
leaq 16(%rsi),%rsi
jmp .Lxts_dec_done
.align 16
.Lxts_dec_two:
movups (%rdi),%xmm2
movups 16(%rdi),%xmm3
leaq 32(%rdi),%rdi
xorps %xmm10,%xmm2
xorps %xmm11,%xmm3
call _aesni_decrypt3
xorps %xmm10,%xmm2
movdqa %xmm12,%xmm10
xorps %xmm11,%xmm3
movdqa %xmm13,%xmm11
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
leaq 32(%rsi),%rsi
jmp .Lxts_dec_done
.align 16
.Lxts_dec_three:
movups (%rdi),%xmm2
movups 16(%rdi),%xmm3
movups 32(%rdi),%xmm4
leaq 48(%rdi),%rdi
xorps %xmm10,%xmm2
xorps %xmm11,%xmm3
xorps %xmm12,%xmm4
call _aesni_decrypt3
xorps %xmm10,%xmm2
movdqa %xmm13,%xmm10
xorps %xmm11,%xmm3
movdqa %xmm15,%xmm11
xorps %xmm12,%xmm4
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
movups %xmm4,32(%rsi)
leaq 48(%rsi),%rsi
jmp .Lxts_dec_done
.align 16
.Lxts_dec_four:
pshufd $19,%xmm14,%xmm9
movdqa %xmm15,%xmm14
paddq %xmm15,%xmm15
movups (%rdi),%xmm2
pand %xmm8,%xmm9
movups 16(%rdi),%xmm3
pxor %xmm9,%xmm15
movups 32(%rdi),%xmm4
xorps %xmm10,%xmm2
movups 48(%rdi),%xmm5
leaq 64(%rdi),%rdi
xorps %xmm11,%xmm3
xorps %xmm12,%xmm4
xorps %xmm13,%xmm5
call _aesni_decrypt4
xorps %xmm10,%xmm2
movdqa %xmm14,%xmm10
xorps %xmm11,%xmm3
movdqa %xmm15,%xmm11
xorps %xmm12,%xmm4
movups %xmm2,(%rsi)
xorps %xmm13,%xmm5
movups %xmm3,16(%rsi)
movups %xmm4,32(%rsi)
movups %xmm5,48(%rsi)
leaq 64(%rsi),%rsi
jmp .Lxts_dec_done
.align 16
.Lxts_dec_done:
andq $15,%r9
jz .Lxts_dec_ret
.Lxts_dec_done2:
movq %r9,%rdx
movq %r11,%rcx
movl %r10d,%eax
movups (%rdi),%xmm2
xorps %xmm11,%xmm2
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
.Loop_dec1_13:
.byte 102,15,56,222,209
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz .Loop_dec1_13
.byte 102,15,56,223,209
xorps %xmm11,%xmm2
movups %xmm2,(%rsi)
.Lxts_dec_steal:
movzbl 16(%rdi),%eax
movzbl (%rsi),%ecx
leaq 1(%rdi),%rdi
movb %al,(%rsi)
movb %cl,16(%rsi)
leaq 1(%rsi),%rsi
subq $1,%rdx
jnz .Lxts_dec_steal
subq %r9,%rsi
movq %r11,%rcx
movl %r10d,%eax
movups (%rsi),%xmm2
xorps %xmm10,%xmm2
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
.Loop_dec1_14:
.byte 102,15,56,222,209
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz .Loop_dec1_14
.byte 102,15,56,223,209
xorps %xmm10,%xmm2
movups %xmm2,(%rsi)
.Lxts_dec_ret:
leaq 104(%rsp),%rsp
.Lxts_dec_epilogue:
.byte 0xf3,0xc3
.size aesni_xts_decrypt,.-aesni_xts_decrypt
.globl aesni_cbc_encrypt
.type aesni_cbc_encrypt,@function
.align 16
aesni_cbc_encrypt:
testq %rdx,%rdx
jz .Lcbc_ret
movl 240(%rcx),%r10d
movq %rcx,%r11
testl %r9d,%r9d
jz .Lcbc_decrypt
movups (%r8),%xmm2
movl %r10d,%eax
cmpq $16,%rdx
jb .Lcbc_enc_tail
subq $16,%rdx
jmp .Lcbc_enc_loop
.align 16
.Lcbc_enc_loop:
movups (%rdi),%xmm3
leaq 16(%rdi),%rdi
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
xorps %xmm0,%xmm3
leaq 32(%rcx),%rcx
xorps %xmm3,%xmm2
.Loop_enc1_15:
.byte 102,15,56,220,209
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz .Loop_enc1_15
.byte 102,15,56,221,209
movl %r10d,%eax
movq %r11,%rcx
movups %xmm2,0(%rsi)
leaq 16(%rsi),%rsi
subq $16,%rdx
jnc .Lcbc_enc_loop
addq $16,%rdx
jnz .Lcbc_enc_tail
movups %xmm2,(%r8)
jmp .Lcbc_ret
.Lcbc_enc_tail:
movq %rdx,%rcx
xchgq %rdi,%rsi
.long 0x9066A4F3
movl $16,%ecx
subq %rdx,%rcx
xorl %eax,%eax
.long 0x9066AAF3
leaq -16(%rdi),%rdi
movl %r10d,%eax
movq %rdi,%rsi
movq %r11,%rcx
xorq %rdx,%rdx
jmp .Lcbc_enc_loop
.align 16
.Lcbc_decrypt:
movups (%r8),%xmm9
movl %r10d,%eax
cmpq $112,%rdx
jbe .Lcbc_dec_tail
shrl $1,%r10d
subq $112,%rdx
movl %r10d,%eax
movaps %xmm9,-24(%rsp)
jmp .Lcbc_dec_loop8_enter
.align 16
.Lcbc_dec_loop8:
movaps %xmm0,-24(%rsp)
movups %xmm9,(%rsi)
leaq 16(%rsi),%rsi
.Lcbc_dec_loop8_enter:
movups (%rcx),%xmm0
movups (%rdi),%xmm2
movups 16(%rdi),%xmm3
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
movdqu 32(%rdi),%xmm4
xorps %xmm0,%xmm2
movdqu 48(%rdi),%xmm5
xorps %xmm0,%xmm3
movdqu 64(%rdi),%xmm6
.byte 102,15,56,222,209
pxor %xmm0,%xmm4
movdqu 80(%rdi),%xmm7
.byte 102,15,56,222,217
pxor %xmm0,%xmm5
movdqu 96(%rdi),%xmm8
.byte 102,15,56,222,225
pxor %xmm0,%xmm6
movdqu 112(%rdi),%xmm9
.byte 102,15,56,222,233
pxor %xmm0,%xmm7
decl %eax
.byte 102,15,56,222,241
pxor %xmm0,%xmm8
.byte 102,15,56,222,249
pxor %xmm0,%xmm9
movups (%rcx),%xmm0
.byte 102,68,15,56,222,193
.byte 102,68,15,56,222,201
movups 16(%rcx),%xmm1
call .Ldec_loop8_enter
movups (%rdi),%xmm1
movups 16(%rdi),%xmm0
xorps -24(%rsp),%xmm2
xorps %xmm1,%xmm3
movups 32(%rdi),%xmm1
xorps %xmm0,%xmm4
movups 48(%rdi),%xmm0
xorps %xmm1,%xmm5
movups 64(%rdi),%xmm1
xorps %xmm0,%xmm6
movups 80(%rdi),%xmm0
xorps %xmm1,%xmm7
movups 96(%rdi),%xmm1
xorps %xmm0,%xmm8
movups 112(%rdi),%xmm0
xorps %xmm1,%xmm9
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
movups %xmm4,32(%rsi)
movups %xmm5,48(%rsi)
movl %r10d,%eax
movups %xmm6,64(%rsi)
movq %r11,%rcx
movups %xmm7,80(%rsi)
leaq 128(%rdi),%rdi
movups %xmm8,96(%rsi)
leaq 112(%rsi),%rsi
subq $128,%rdx
ja .Lcbc_dec_loop8
movaps %xmm9,%xmm2
movaps %xmm0,%xmm9
addq $112,%rdx
jle .Lcbc_dec_tail_collected
movups %xmm2,(%rsi)
leal 1(%r10,%r10,1),%eax
leaq 16(%rsi),%rsi
.Lcbc_dec_tail:
movups (%rdi),%xmm2
movaps %xmm2,%xmm8
cmpq $16,%rdx
jbe .Lcbc_dec_one
movups 16(%rdi),%xmm3
movaps %xmm3,%xmm7
cmpq $32,%rdx
jbe .Lcbc_dec_two
movups 32(%rdi),%xmm4
movaps %xmm4,%xmm6
cmpq $48,%rdx
jbe .Lcbc_dec_three
movups 48(%rdi),%xmm5
cmpq $64,%rdx
jbe .Lcbc_dec_four
movups 64(%rdi),%xmm6
cmpq $80,%rdx
jbe .Lcbc_dec_five
movups 80(%rdi),%xmm7
cmpq $96,%rdx
jbe .Lcbc_dec_six
movups 96(%rdi),%xmm8
movaps %xmm9,-24(%rsp)
call _aesni_decrypt8
movups (%rdi),%xmm1
movups 16(%rdi),%xmm0
xorps -24(%rsp),%xmm2
xorps %xmm1,%xmm3
movups 32(%rdi),%xmm1
xorps %xmm0,%xmm4
movups 48(%rdi),%xmm0
xorps %xmm1,%xmm5
movups 64(%rdi),%xmm1
xorps %xmm0,%xmm6
movups 80(%rdi),%xmm0
xorps %xmm1,%xmm7
movups 96(%rdi),%xmm9
xorps %xmm0,%xmm8
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
movups %xmm4,32(%rsi)
movups %xmm5,48(%rsi)
movups %xmm6,64(%rsi)
movups %xmm7,80(%rsi)
leaq 96(%rsi),%rsi
movaps %xmm8,%xmm2
subq $112,%rdx
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_one:
movups (%rcx),%xmm0
movups 16(%rcx),%xmm1
leaq 32(%rcx),%rcx
xorps %xmm0,%xmm2
.Loop_dec1_16:
.byte 102,15,56,222,209
decl %eax
movups (%rcx),%xmm1
leaq 16(%rcx),%rcx
jnz .Loop_dec1_16
.byte 102,15,56,223,209
xorps %xmm9,%xmm2
movaps %xmm8,%xmm9
subq $16,%rdx
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_two:
xorps %xmm4,%xmm4
call _aesni_decrypt3
xorps %xmm9,%xmm2
xorps %xmm8,%xmm3
movups %xmm2,(%rsi)
movaps %xmm7,%xmm9
movaps %xmm3,%xmm2
leaq 16(%rsi),%rsi
subq $32,%rdx
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_three:
call _aesni_decrypt3
xorps %xmm9,%xmm2
xorps %xmm8,%xmm3
movups %xmm2,(%rsi)
xorps %xmm7,%xmm4
movups %xmm3,16(%rsi)
movaps %xmm6,%xmm9
movaps %xmm4,%xmm2
leaq 32(%rsi),%rsi
subq $48,%rdx
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_four:
call _aesni_decrypt4
xorps %xmm9,%xmm2
movups 48(%rdi),%xmm9
xorps %xmm8,%xmm3
movups %xmm2,(%rsi)
xorps %xmm7,%xmm4
movups %xmm3,16(%rsi)
xorps %xmm6,%xmm5
movups %xmm4,32(%rsi)
movaps %xmm5,%xmm2
leaq 48(%rsi),%rsi
subq $64,%rdx
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_five:
xorps %xmm7,%xmm7
call _aesni_decrypt6
movups 16(%rdi),%xmm1
movups 32(%rdi),%xmm0
xorps %xmm9,%xmm2
xorps %xmm8,%xmm3
xorps %xmm1,%xmm4
movups 48(%rdi),%xmm1
xorps %xmm0,%xmm5
movups 64(%rdi),%xmm9
xorps %xmm1,%xmm6
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
movups %xmm4,32(%rsi)
movups %xmm5,48(%rsi)
leaq 64(%rsi),%rsi
movaps %xmm6,%xmm2
subq $80,%rdx
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_six:
call _aesni_decrypt6
movups 16(%rdi),%xmm1
movups 32(%rdi),%xmm0
xorps %xmm9,%xmm2
xorps %xmm8,%xmm3
xorps %xmm1,%xmm4
movups 48(%rdi),%xmm1
xorps %xmm0,%xmm5
movups 64(%rdi),%xmm0
xorps %xmm1,%xmm6
movups 80(%rdi),%xmm9
xorps %xmm0,%xmm7
movups %xmm2,(%rsi)
movups %xmm3,16(%rsi)
movups %xmm4,32(%rsi)
movups %xmm5,48(%rsi)
movups %xmm6,64(%rsi)
leaq 80(%rsi),%rsi
movaps %xmm7,%xmm2
subq $96,%rdx
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_tail_collected:
andq $15,%rdx
movups %xmm9,(%r8)
jnz .Lcbc_dec_tail_partial
movups %xmm2,(%rsi)
jmp .Lcbc_dec_ret
.align 16
.Lcbc_dec_tail_partial:
movaps %xmm2,-24(%rsp)
movq $16,%rcx
movq %rsi,%rdi
subq %rdx,%rcx
leaq -24(%rsp),%rsi
.long 0x9066A4F3
.Lcbc_dec_ret:
.Lcbc_ret:
.byte 0xf3,0xc3
.size aesni_cbc_encrypt,.-aesni_cbc_encrypt
.globl aesni_set_decrypt_key
.type aesni_set_decrypt_key,@function
.align 16
aesni_set_decrypt_key:
.byte 0x48,0x83,0xEC,0x08
call __aesni_set_encrypt_key
shll $4,%esi
testl %eax,%eax
jnz .Ldec_key_ret
leaq 16(%rdx,%rsi,1),%rdi
movups (%rdx),%xmm0
movups (%rdi),%xmm1
movups %xmm0,(%rdi)
movups %xmm1,(%rdx)
leaq 16(%rdx),%rdx
leaq -16(%rdi),%rdi
.Ldec_key_inverse:
movups (%rdx),%xmm0
movups (%rdi),%xmm1
.byte 102,15,56,219,192
.byte 102,15,56,219,201
leaq 16(%rdx),%rdx
leaq -16(%rdi),%rdi
movups %xmm0,16(%rdi)
movups %xmm1,-16(%rdx)
cmpq %rdx,%rdi
ja .Ldec_key_inverse
movups (%rdx),%xmm0
.byte 102,15,56,219,192
movups %xmm0,(%rdi)
.Ldec_key_ret:
addq $8,%rsp
.byte 0xf3,0xc3
.LSEH_end_set_decrypt_key:
.size aesni_set_decrypt_key,.-aesni_set_decrypt_key
.globl aesni_set_encrypt_key
.type aesni_set_encrypt_key,@function
.align 16
aesni_set_encrypt_key:
__aesni_set_encrypt_key:
.byte 0x48,0x83,0xEC,0x08
movq $-1,%rax
testq %rdi,%rdi
jz .Lenc_key_ret
testq %rdx,%rdx
jz .Lenc_key_ret
movups (%rdi),%xmm0
xorps %xmm4,%xmm4
leaq 16(%rdx),%rax
cmpl $256,%esi
je .L14rounds
cmpl $192,%esi
je .L12rounds
cmpl $128,%esi
jne .Lbad_keybits
.L10rounds:
movl $9,%esi
movups %xmm0,(%rdx)
.byte 102,15,58,223,200,1
call .Lkey_expansion_128_cold
.byte 102,15,58,223,200,2
call .Lkey_expansion_128
.byte 102,15,58,223,200,4
call .Lkey_expansion_128
.byte 102,15,58,223,200,8
call .Lkey_expansion_128
.byte 102,15,58,223,200,16
call .Lkey_expansion_128
.byte 102,15,58,223,200,32
call .Lkey_expansion_128
.byte 102,15,58,223,200,64
call .Lkey_expansion_128
.byte 102,15,58,223,200,128
call .Lkey_expansion_128
.byte 102,15,58,223,200,27
call .Lkey_expansion_128
.byte 102,15,58,223,200,54
call .Lkey_expansion_128
movups %xmm0,(%rax)
movl %esi,80(%rax)
xorl %eax,%eax
jmp .Lenc_key_ret
.align 16
.L12rounds:
movq 16(%rdi),%xmm2
movl $11,%esi
movups %xmm0,(%rdx)
.byte 102,15,58,223,202,1
call .Lkey_expansion_192a_cold
.byte 102,15,58,223,202,2
call .Lkey_expansion_192b
.byte 102,15,58,223,202,4
call .Lkey_expansion_192a
.byte 102,15,58,223,202,8
call .Lkey_expansion_192b
.byte 102,15,58,223,202,16
call .Lkey_expansion_192a
.byte 102,15,58,223,202,32
call .Lkey_expansion_192b
.byte 102,15,58,223,202,64
call .Lkey_expansion_192a
.byte 102,15,58,223,202,128
call .Lkey_expansion_192b
movups %xmm0,(%rax)
movl %esi,48(%rax)
xorq %rax,%rax
jmp .Lenc_key_ret
.align 16
.L14rounds:
movups 16(%rdi),%xmm2
movl $13,%esi
leaq 16(%rax),%rax
movups %xmm0,(%rdx)
movups %xmm2,16(%rdx)
.byte 102,15,58,223,202,1
call .Lkey_expansion_256a_cold
.byte 102,15,58,223,200,1
call .Lkey_expansion_256b
.byte 102,15,58,223,202,2
call .Lkey_expansion_256a
.byte 102,15,58,223,200,2
call .Lkey_expansion_256b
.byte 102,15,58,223,202,4
call .Lkey_expansion_256a
.byte 102,15,58,223,200,4
call .Lkey_expansion_256b
.byte 102,15,58,223,202,8
call .Lkey_expansion_256a
.byte 102,15,58,223,200,8
call .Lkey_expansion_256b
.byte 102,15,58,223,202,16
call .Lkey_expansion_256a
.byte 102,15,58,223,200,16
call .Lkey_expansion_256b
.byte 102,15,58,223,202,32
call .Lkey_expansion_256a
.byte 102,15,58,223,200,32
call .Lkey_expansion_256b
.byte 102,15,58,223,202,64
call .Lkey_expansion_256a
movups %xmm0,(%rax)
movl %esi,16(%rax)
xorq %rax,%rax
jmp .Lenc_key_ret
.align 16
.Lbad_keybits:
movq $-2,%rax
.Lenc_key_ret:
addq $8,%rsp
.byte 0xf3,0xc3
.LSEH_end_set_encrypt_key:
.align 16
.Lkey_expansion_128:
movups %xmm0,(%rax)
leaq 16(%rax),%rax
.Lkey_expansion_128_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $255,%xmm1,%xmm1
xorps %xmm1,%xmm0
.byte 0xf3,0xc3
.align 16
.Lkey_expansion_192a:
movups %xmm0,(%rax)
leaq 16(%rax),%rax
.Lkey_expansion_192a_cold:
movaps %xmm2,%xmm5
.Lkey_expansion_192b_warm:
shufps $16,%xmm0,%xmm4
movdqa %xmm2,%xmm3
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
pslldq $4,%xmm3
xorps %xmm4,%xmm0
pshufd $85,%xmm1,%xmm1
pxor %xmm3,%xmm2
pxor %xmm1,%xmm0
pshufd $255,%xmm0,%xmm3
pxor %xmm3,%xmm2
.byte 0xf3,0xc3
.align 16
.Lkey_expansion_192b:
movaps %xmm0,%xmm3
shufps $68,%xmm0,%xmm5
movups %xmm5,(%rax)
shufps $78,%xmm2,%xmm3
movups %xmm3,16(%rax)
leaq 32(%rax),%rax
jmp .Lkey_expansion_192b_warm
.align 16
.Lkey_expansion_256a:
movups %xmm2,(%rax)
leaq 16(%rax),%rax
.Lkey_expansion_256a_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $255,%xmm1,%xmm1
xorps %xmm1,%xmm0
.byte 0xf3,0xc3
.align 16
.Lkey_expansion_256b:
movups %xmm0,(%rax)
leaq 16(%rax),%rax
shufps $16,%xmm2,%xmm4
xorps %xmm4,%xmm2
shufps $140,%xmm2,%xmm4
xorps %xmm4,%xmm2
shufps $170,%xmm1,%xmm1
xorps %xmm1,%xmm2
.byte 0xf3,0xc3
.size aesni_set_encrypt_key,.-aesni_set_encrypt_key
.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
.align 64
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lincrement32:
.long 6,6,6,0
.Lincrement64:
.long 1,0,0,0
.Lxts_magic:
.long 0x87,0,1,0
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64