# Multi-block (interleaved) AES-NI CBC encrypt/decrypt, x86-64 ELF,
# AT&T/GAS syntax.  Generated-style code in the OpenSSL aesni-mb family.
.text
|
|
|
|
|
|
|
|
.globl	aesni_multi_cbc_encrypt
.type	aesni_multi_cbc_encrypt,@function
.align	32
aesni_multi_cbc_encrypt:
.cfi_startproc
	# Encrypt up to 4 independent AES-CBC streams in interleave.
	#   %rdi = array of per-stream descriptors; as addressed below each
	#          slot holds {input ptr, output ptr, 32-bit block count,
	#          16-byte IV} at a 40-byte stride -- TODO confirm against caller
	#   %rsi = expanded AES key schedule (rounds count at offset 240)
	#   %edx = job count
	# With >=2 jobs and AVX present (bit 28 of OPENSSL_ia32cap_P word 1),
	# divert to the 8-lane AVX implementation.
	cmpl	$2,%edx
	jb	.Lenc_non_avx
	movl	OPENSSL_ia32cap_P+4(%rip),%ecx
	testl	$268435456,%ecx			# 1<<28 = AVX feature bit
	jnz	_avx_cbc_enc_shortcut
	jmp	.Lenc_non_avx

.align	16
.Lenc_non_avx:
	# Prologue: keep original %rsp in %rax so the epilogue can undo the
	# 64-byte alignment; save all callee-saved GPRs.
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	# Scratch frame (64-byte aligned):
	#   16(%rsp) = saved %rsp
	#   24(%rsp) = saved outer job counter
	#   32..47(%rsp) = four 32-bit per-lane remaining block counts
	subq	$48,%rsp
	andq	$-64,%rsp
	movq	%rax,16(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x10,0x06,0x23,0x08

.Lenc4x_body:
	movdqu	(%rsi),%xmm12			# round key 0
	leaq	120(%rsi),%rsi			# bias key pointer (disp8 reach)
	leaq	80(%rdi),%rdi			# bias descriptor pointer

.Lenc4x_loop_grande:
	movl	%edx,24(%rsp)			# stash outer counter
	xorl	%edx,%edx			# %edx = max block count over lanes
	# Lane 0: load in/out pointers, block count, IV.  Idle lanes
	# (count <= 0) get %rsp as a harmless dummy input pointer.
	movl	-64(%rdi),%ecx
	movq	-80(%rdi),%r8			# %r8 = input ptr, lane 0
	cmpl	%edx,%ecx
	movq	-72(%rdi),%r12			# %r12 = output ptr, lane 0
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	-56(%rdi),%xmm2			# IV, lane 0
	movl	%ecx,32(%rsp)
	cmovleq	%rsp,%r8
	# Lane 1
	movl	-24(%rdi),%ecx
	movq	-40(%rdi),%r9
	cmpl	%edx,%ecx
	movq	-32(%rdi),%r13
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	-16(%rdi),%xmm3			# IV, lane 1
	movl	%ecx,36(%rsp)
	cmovleq	%rsp,%r9
	# Lane 2
	movl	16(%rdi),%ecx
	movq	0(%rdi),%r10
	cmpl	%edx,%ecx
	movq	8(%rdi),%r14
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	24(%rdi),%xmm4			# IV, lane 2
	movl	%ecx,40(%rsp)
	cmovleq	%rsp,%r10
	# Lane 3
	movl	56(%rdi),%ecx
	movq	40(%rdi),%r11
	cmpl	%edx,%ecx
	movq	48(%rdi),%r15
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	64(%rdi),%xmm5			# IV, lane 3
	movl	%ecx,44(%rsp)
	cmovleq	%rsp,%r11
	testl	%edx,%edx
	jz	.Lenc4x_done			# nothing to do in any lane

	# First CBC step per lane: state = IV ^ rk0 ^ plaintext[0].
	movups	16-120(%rsi),%xmm1		# round key 1
	pxor	%xmm12,%xmm2
	movups	32-120(%rsi),%xmm0		# round key 2
	pxor	%xmm12,%xmm3
	movl	240-120(%rsi),%eax		# %eax = number of rounds
	pxor	%xmm12,%xmm4
	movdqu	(%r8),%xmm6
	pxor	%xmm12,%xmm5
	movdqu	(%r9),%xmm7
	pxor	%xmm6,%xmm2
	movdqu	(%r10),%xmm8
	pxor	%xmm7,%xmm3
	movdqu	(%r11),%xmm9
	pxor	%xmm8,%xmm4
	pxor	%xmm9,%xmm5
	movdqa	32(%rsp),%xmm10			# per-lane counters as a vector
	xorq	%rbx,%rbx			# %rbx = running byte offset
	jmp	.Loop_enc4x

.align	32
.Loop_enc4x:
	# One CBC block for all four lanes, rounds interleaved with pointer
	# bookkeeping.  A lane whose counter is exhausted is redirected to
	# scratch below the frame (%rbp) so it keeps running harmlessly.
	addq	$16,%rbx
	leaq	16(%rsp),%rbp
	movl	$1,%ecx
	subq	%rbx,%rbp

	.byte	102,15,56,220,209		# aesenc %xmm1,%xmm2
	prefetcht0	31(%r8,%rbx,1)		# prefetch next input, lane 0
	prefetcht0	31(%r9,%rbx,1)		# lane 1
	.byte	102,15,56,220,217		# aesenc %xmm1,%xmm3
	prefetcht0	31(%r10,%rbx,1)		# lane 2
	prefetcht0	31(%r11,%rbx,1)		# lane 3 (was a duplicated %r10;
						# decrypt path below uses %r10/%r11)
	.byte	102,15,56,220,225		# aesenc %xmm1,%xmm4
	.byte	102,15,56,220,233		# aesenc %xmm1,%xmm5
	movups	48-120(%rsi),%xmm1
	cmpl	32(%rsp),%ecx			# lane 0 exhausted?
	.byte	102,15,56,220,208		# aesenc %xmm0,%xmm2
	.byte	102,15,56,220,216
	.byte	102,15,56,220,224
	cmovgeq	%rbp,%r8			# park exhausted lane on scratch
	cmovgq	%rbp,%r12
	.byte	102,15,56,220,232
	movups	-56(%rsi),%xmm0
	cmpl	36(%rsp),%ecx			# lane 1 exhausted?
	.byte	102,15,56,220,209
	.byte	102,15,56,220,217
	.byte	102,15,56,220,225
	cmovgeq	%rbp,%r9
	cmovgq	%rbp,%r13
	.byte	102,15,56,220,233
	movups	-40(%rsi),%xmm1
	cmpl	40(%rsp),%ecx			# lane 2 exhausted?
	.byte	102,15,56,220,208
	.byte	102,15,56,220,216
	.byte	102,15,56,220,224
	cmovgeq	%rbp,%r10
	cmovgq	%rbp,%r14
	.byte	102,15,56,220,232
	movups	-24(%rsi),%xmm0
	cmpl	44(%rsp),%ecx			# lane 3 exhausted?
	.byte	102,15,56,220,209
	.byte	102,15,56,220,217
	.byte	102,15,56,220,225
	cmovgeq	%rbp,%r11
	cmovgq	%rbp,%r15
	.byte	102,15,56,220,233
	movups	-8(%rsi),%xmm1
	movdqa	%xmm10,%xmm11
	.byte	102,15,56,220,208
	prefetcht0	15(%r12,%rbx,1)		# prefetch output cachelines
	prefetcht0	15(%r13,%rbx,1)
	.byte	102,15,56,220,216
	prefetcht0	15(%r14,%rbx,1)
	prefetcht0	15(%r15,%rbx,1)
	.byte	102,15,56,220,224
	.byte	102,15,56,220,232
	movups	128-120(%rsi),%xmm0
	pxor	%xmm12,%xmm12

	# Decrement still-positive lane counters as a vector:
	# xmm11 = (counter > 0) ? -1 : 0; counter += xmm11.
	.byte	102,15,56,220,209
	pcmpgtd	%xmm12,%xmm11
	movdqu	-120(%rsi),%xmm12		# reload round key 0
	.byte	102,15,56,220,217
	paddd	%xmm11,%xmm10
	movdqa	%xmm10,32(%rsp)
	.byte	102,15,56,220,225
	.byte	102,15,56,220,233
	movups	144-120(%rsi),%xmm1

	cmpl	$11,%eax			# 10/12/14-round key length?

	.byte	102,15,56,220,208
	.byte	102,15,56,220,216
	.byte	102,15,56,220,224
	.byte	102,15,56,220,232
	movups	160-120(%rsi),%xmm0

	jb	.Lenc4x_tail			# AES-128: done with mid rounds

	.byte	102,15,56,220,209		# AES-192/256 extra rounds
	.byte	102,15,56,220,217
	.byte	102,15,56,220,225
	.byte	102,15,56,220,233
	movups	176-120(%rsi),%xmm1

	.byte	102,15,56,220,208
	.byte	102,15,56,220,216
	.byte	102,15,56,220,224
	.byte	102,15,56,220,232
	movups	192-120(%rsi),%xmm0

	je	.Lenc4x_tail			# AES-192

	.byte	102,15,56,220,209		# AES-256 extra rounds
	.byte	102,15,56,220,217
	.byte	102,15,56,220,225
	.byte	102,15,56,220,233
	movups	208-120(%rsi),%xmm1

	.byte	102,15,56,220,208
	.byte	102,15,56,220,216
	.byte	102,15,56,220,224
	.byte	102,15,56,220,232
	movups	224-120(%rsi),%xmm0
	jmp	.Lenc4x_tail

.align	32
.Lenc4x_tail:
	# Last two rounds; fold next plaintext block (^rk0) into each lane
	# ahead of the next iteration, then store this block's ciphertext.
	.byte	102,15,56,220,209
	.byte	102,15,56,220,217
	.byte	102,15,56,220,225
	.byte	102,15,56,220,233
	movdqu	(%r8,%rbx,1),%xmm6		# next plaintext, lane 0
	movdqu	16-120(%rsi),%xmm1

	.byte	102,15,56,221,208		# aesenclast %xmm0,%xmm2
	movdqu	(%r9,%rbx,1),%xmm7
	pxor	%xmm12,%xmm6
	.byte	102,15,56,221,216		# aesenclast %xmm0,%xmm3
	movdqu	(%r10,%rbx,1),%xmm8
	pxor	%xmm12,%xmm7
	.byte	102,15,56,221,224		# aesenclast %xmm0,%xmm4
	movdqu	(%r11,%rbx,1),%xmm9
	pxor	%xmm12,%xmm8
	.byte	102,15,56,221,232		# aesenclast %xmm0,%xmm5
	movdqu	32-120(%rsi),%xmm0
	pxor	%xmm12,%xmm9

	movups	%xmm2,-16(%r12,%rbx,1)		# store ciphertext, lane 0
	pxor	%xmm6,%xmm2			# CBC chain into next block
	movups	%xmm3,-16(%r13,%rbx,1)
	pxor	%xmm7,%xmm3
	movups	%xmm4,-16(%r14,%rbx,1)
	pxor	%xmm8,%xmm4
	movups	%xmm5,-16(%r15,%rbx,1)
	pxor	%xmm9,%xmm5

	decl	%edx				# blocks remaining (max lane)
	jnz	.Loop_enc4x

	movq	16(%rsp),%rax
.cfi_def_cfa	%rax,8
	movl	24(%rsp),%edx			# restore outer counter

	leaq	160(%rdi),%rdi			# next group of 4 descriptors
	decl	%edx
	jnz	.Lenc4x_loop_grande

.Lenc4x_done:
	# Epilogue: reload callee-saved registers from the saved frame.
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lenc4x_epilogue:
	.byte	0xf3,0xc3			# rep ret
.cfi_endproc
.size	aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt
|
|
|
|
.globl	aesni_multi_cbc_decrypt
.type	aesni_multi_cbc_decrypt,@function
.align	32
aesni_multi_cbc_decrypt:
.cfi_startproc
	# Decrypt up to 4 independent AES-CBC streams in interleave.
	# Register contract mirrors aesni_multi_cbc_encrypt:
	#   %rdi = per-stream descriptor array (in ptr, out ptr, 32-bit block
	#          count, 16-byte IV; 40-byte stride as addressed below --
	#          TODO confirm layout against the C caller)
	#   %rsi = expanded AES key schedule (rounds count at offset 240)
	#   %edx = job count
	# With >=2 jobs and AVX (bit 28 of OPENSSL_ia32cap_P word 1) divert
	# to the 8-lane AVX implementation.
	cmpl	$2,%edx
	jb	.Ldec_non_avx
	movl	OPENSSL_ia32cap_P+4(%rip),%ecx
	testl	$268435456,%ecx			# 1<<28 = AVX feature bit
	jnz	_avx_cbc_dec_shortcut
	jmp	.Ldec_non_avx

.align	16
.Ldec_non_avx:
	# Prologue: original %rsp kept in %rax; save callee-saved GPRs.
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	# 48-byte scratch frame, 64-byte aligned: 16(%rsp) saved %rsp,
	# 24(%rsp) outer counter, 32..47(%rsp) four per-lane block counts.
	subq	$48,%rsp
	andq	$-64,%rsp
	movq	%rax,16(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x10,0x06,0x23,0x08

.Ldec4x_body:
	movdqu	(%rsi),%xmm12			# round key 0
	leaq	120(%rsi),%rsi			# bias key pointer
	leaq	80(%rdi),%rdi			# bias descriptor pointer

.Ldec4x_loop_grande:
	movl	%edx,24(%rsp)			# stash outer counter
	xorl	%edx,%edx			# %edx = max block count over lanes
	# Lane 0: pointers, count, IV; idle lanes get %rsp as dummy input.
	movl	-64(%rdi),%ecx
	movq	-80(%rdi),%r8			# input ptr, lane 0
	cmpl	%edx,%ecx
	movq	-72(%rdi),%r12			# output ptr, lane 0
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	-56(%rdi),%xmm6			# IV (previous ciphertext), lane 0
	movl	%ecx,32(%rsp)
	cmovleq	%rsp,%r8
	# Lane 1
	movl	-24(%rdi),%ecx
	movq	-40(%rdi),%r9
	cmpl	%edx,%ecx
	movq	-32(%rdi),%r13
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	-16(%rdi),%xmm7			# IV, lane 1
	movl	%ecx,36(%rsp)
	cmovleq	%rsp,%r9
	# Lane 2
	movl	16(%rdi),%ecx
	movq	0(%rdi),%r10
	cmpl	%edx,%ecx
	movq	8(%rdi),%r14
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	24(%rdi),%xmm8			# IV, lane 2
	movl	%ecx,40(%rsp)
	cmovleq	%rsp,%r10
	# Lane 3
	movl	56(%rdi),%ecx
	movq	40(%rdi),%r11
	cmpl	%edx,%ecx
	movq	48(%rdi),%r15
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	64(%rdi),%xmm9			# IV, lane 3
	movl	%ecx,44(%rsp)
	cmovleq	%rsp,%r11
	testl	%edx,%edx
	jz	.Ldec4x_done			# all lanes empty

	# Load first ciphertext block per lane; xmm2..5 = ct ^ rk0.
	movups	16-120(%rsi),%xmm1
	movups	32-120(%rsi),%xmm0
	movl	240-120(%rsi),%eax		# %eax = number of rounds
	movdqu	(%r8),%xmm2
	movdqu	(%r9),%xmm3
	pxor	%xmm12,%xmm2
	movdqu	(%r10),%xmm4
	pxor	%xmm12,%xmm3
	movdqu	(%r11),%xmm5
	pxor	%xmm12,%xmm4
	pxor	%xmm12,%xmm5
	movdqa	32(%rsp),%xmm10			# per-lane counters as a vector
	xorq	%rbx,%rbx			# %rbx = running byte offset
	jmp	.Loop_dec4x

.align	32
.Loop_dec4x:
	# One CBC block for all four lanes, rounds interleaved with pointer
	# bookkeeping; exhausted lanes are parked on scratch below the frame.
	addq	$16,%rbx
	leaq	16(%rsp),%rbp
	movl	$1,%ecx
	subq	%rbx,%rbp

	.byte	102,15,56,222,209		# aesdec %xmm1,%xmm2
	prefetcht0	31(%r8,%rbx,1)		# prefetch next input, lanes 0-3
	prefetcht0	31(%r9,%rbx,1)
	.byte	102,15,56,222,217		# aesdec %xmm1,%xmm3
	prefetcht0	31(%r10,%rbx,1)
	prefetcht0	31(%r11,%rbx,1)
	.byte	102,15,56,222,225		# aesdec %xmm1,%xmm4
	.byte	102,15,56,222,233		# aesdec %xmm1,%xmm5
	movups	48-120(%rsi),%xmm1
	cmpl	32(%rsp),%ecx			# lane 0 exhausted?
	.byte	102,15,56,222,208		# aesdec %xmm0,%xmm2
	.byte	102,15,56,222,216
	.byte	102,15,56,222,224
	cmovgeq	%rbp,%r8			# park exhausted lane on scratch
	cmovgq	%rbp,%r12
	.byte	102,15,56,222,232
	movups	-56(%rsi),%xmm0
	cmpl	36(%rsp),%ecx			# lane 1 exhausted?
	.byte	102,15,56,222,209
	.byte	102,15,56,222,217
	.byte	102,15,56,222,225
	cmovgeq	%rbp,%r9
	cmovgq	%rbp,%r13
	.byte	102,15,56,222,233
	movups	-40(%rsi),%xmm1
	cmpl	40(%rsp),%ecx			# lane 2 exhausted?
	.byte	102,15,56,222,208
	.byte	102,15,56,222,216
	.byte	102,15,56,222,224
	cmovgeq	%rbp,%r10
	cmovgq	%rbp,%r14
	.byte	102,15,56,222,232
	movups	-24(%rsi),%xmm0
	cmpl	44(%rsp),%ecx			# lane 3 exhausted?
	.byte	102,15,56,222,209
	.byte	102,15,56,222,217
	.byte	102,15,56,222,225
	cmovgeq	%rbp,%r11
	cmovgq	%rbp,%r15
	.byte	102,15,56,222,233
	movups	-8(%rsi),%xmm1
	movdqa	%xmm10,%xmm11
	.byte	102,15,56,222,208
	prefetcht0	15(%r12,%rbx,1)		# prefetch output cachelines
	prefetcht0	15(%r13,%rbx,1)
	.byte	102,15,56,222,216
	prefetcht0	15(%r14,%rbx,1)
	prefetcht0	15(%r15,%rbx,1)
	.byte	102,15,56,222,224
	.byte	102,15,56,222,232
	movups	128-120(%rsi),%xmm0
	pxor	%xmm12,%xmm12

	# Vector-decrement still-positive lane counters:
	# xmm11 = (counter > 0) ? -1 : 0; counter += xmm11.
	.byte	102,15,56,222,209
	pcmpgtd	%xmm12,%xmm11
	movdqu	-120(%rsi),%xmm12		# reload round key 0
	.byte	102,15,56,222,217
	paddd	%xmm11,%xmm10
	movdqa	%xmm10,32(%rsp)
	.byte	102,15,56,222,225
	.byte	102,15,56,222,233
	movups	144-120(%rsi),%xmm1

	cmpl	$11,%eax			# 10/12/14-round key length?

	.byte	102,15,56,222,208
	.byte	102,15,56,222,216
	.byte	102,15,56,222,224
	.byte	102,15,56,222,232
	movups	160-120(%rsi),%xmm0

	jb	.Ldec4x_tail			# AES-128

	.byte	102,15,56,222,209		# AES-192/256 extra rounds
	.byte	102,15,56,222,217
	.byte	102,15,56,222,225
	.byte	102,15,56,222,233
	movups	176-120(%rsi),%xmm1

	.byte	102,15,56,222,208
	.byte	102,15,56,222,216
	.byte	102,15,56,222,224
	.byte	102,15,56,222,232
	movups	192-120(%rsi),%xmm0

	je	.Ldec4x_tail			# AES-192

	.byte	102,15,56,222,209		# AES-256 extra rounds
	.byte	102,15,56,222,217
	.byte	102,15,56,222,225
	.byte	102,15,56,222,233
	movups	208-120(%rsi),%xmm1

	.byte	102,15,56,222,208
	.byte	102,15,56,222,216
	.byte	102,15,56,222,224
	.byte	102,15,56,222,232
	movups	224-120(%rsi),%xmm0
	jmp	.Ldec4x_tail

.align	32
.Ldec4x_tail:
	# Last round: fold xmm0 (last round key) with the previous ciphertext
	# block (xmm6..9) so aesdeclast completes block-decrypt + CBC-xor in
	# one step per lane.
	.byte	102,15,56,222,209
	.byte	102,15,56,222,217
	.byte	102,15,56,222,225
	pxor	%xmm0,%xmm6
	pxor	%xmm0,%xmm7
	.byte	102,15,56,222,233
	movdqu	16-120(%rsi),%xmm1
	pxor	%xmm0,%xmm8
	pxor	%xmm0,%xmm9
	movdqu	32-120(%rsi),%xmm0

	.byte	102,15,56,223,214		# aesdeclast %xmm6,%xmm2
	.byte	102,15,56,223,223		# aesdeclast %xmm7,%xmm3
	movdqu	-16(%r8,%rbx,1),%xmm6		# save ciphertext as next chain value
	movdqu	-16(%r9,%rbx,1),%xmm7
	.byte	102,65,15,56,223,224		# aesdeclast %xmm8,%xmm4
	.byte	102,65,15,56,223,233		# aesdeclast %xmm9,%xmm5
	movdqu	-16(%r10,%rbx,1),%xmm8
	movdqu	-16(%r11,%rbx,1),%xmm9

	# Store plaintext, then load and pre-whiten the next ciphertext block.
	movups	%xmm2,-16(%r12,%rbx,1)
	movdqu	(%r8,%rbx,1),%xmm2
	movups	%xmm3,-16(%r13,%rbx,1)
	movdqu	(%r9,%rbx,1),%xmm3
	pxor	%xmm12,%xmm2
	movups	%xmm4,-16(%r14,%rbx,1)
	movdqu	(%r10,%rbx,1),%xmm4
	pxor	%xmm12,%xmm3
	movups	%xmm5,-16(%r15,%rbx,1)
	movdqu	(%r11,%rbx,1),%xmm5
	pxor	%xmm12,%xmm4
	pxor	%xmm12,%xmm5

	decl	%edx				# blocks remaining (max lane)
	jnz	.Loop_dec4x

	movq	16(%rsp),%rax
.cfi_def_cfa	%rax,8
	movl	24(%rsp),%edx			# restore outer counter

	leaq	160(%rdi),%rdi			# next group of 4 descriptors
	decl	%edx
	jnz	.Ldec4x_loop_grande

.Ldec4x_done:
	# Epilogue: reload callee-saved registers from the saved frame.
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Ldec4x_epilogue:
	.byte	0xf3,0xc3			# rep ret
.cfi_endproc
.size	aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
|
|
.type	aesni_multi_cbc_encrypt_avx,@function
.align	32
aesni_multi_cbc_encrypt_avx:
.cfi_startproc
_avx_cbc_enc_shortcut:
	# 8-lane AVX AES-CBC encrypt; reached from aesni_multi_cbc_encrypt
	# when AVX is available.  Same descriptor layout as the 4x path
	# (40-byte stride), eight streams per outer iteration.
	#   %rdi = descriptor array, %rsi = key schedule, %edx = job count.
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	# 192-byte scratch frame, 128-byte aligned:
	#   16(%rsp)       saved %rsp
	#   32..63(%rsp)   eight 32-bit per-lane block counts
	#   64..127(%rsp)  eight 64-bit out-in pointer deltas
	#   128..191(%rsp) staging area addressed through %rbp
	subq	$192,%rsp
	andq	$-128,%rsp
	movq	%rax,16(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x10,0x06,0x23,0x08

.Lenc8x_body:
	vzeroupper
	vmovdqu	(%rsi),%xmm15			# round key 0
	leaq	120(%rsi),%rsi			# bias key pointer
	leaq	160(%rdi),%rdi			# bias descriptor pointer
	shrl	$1,%edx				# 8 lanes: half the outer iterations

.Lenc8x_loop_grande:
	xorl	%edx,%edx			# %edx = max block count over lanes
	# Per lane: load counter, input ptr, and (out - in) delta; stash the
	# delta at 64+8k(%rsp); park idle lanes (count<=0) on %rsp.
	movl	-144(%rdi),%ecx			# lane 0
	movq	-160(%rdi),%r8
	cmpl	%edx,%ecx
	movq	-152(%rdi),%rbx
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	-136(%rdi),%xmm2		# IV, lane 0
	movl	%ecx,32(%rsp)
	cmovleq	%rsp,%r8
	subq	%r8,%rbx			# %rbx = out - in
	movq	%rbx,64(%rsp)

	movl	-104(%rdi),%ecx			# lane 1
	movq	-120(%rdi),%r9
	cmpl	%edx,%ecx
	movq	-112(%rdi),%rbp
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	-96(%rdi),%xmm3
	movl	%ecx,36(%rsp)
	cmovleq	%rsp,%r9
	subq	%r9,%rbp
	movq	%rbp,72(%rsp)

	movl	-64(%rdi),%ecx			# lane 2
	movq	-80(%rdi),%r10
	cmpl	%edx,%ecx
	movq	-72(%rdi),%rbp
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	-56(%rdi),%xmm4
	movl	%ecx,40(%rsp)
	cmovleq	%rsp,%r10
	subq	%r10,%rbp
	movq	%rbp,80(%rsp)

	movl	-24(%rdi),%ecx			# lane 3
	movq	-40(%rdi),%r11
	cmpl	%edx,%ecx
	movq	-32(%rdi),%rbp
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	-16(%rdi),%xmm5
	movl	%ecx,44(%rsp)
	cmovleq	%rsp,%r11
	subq	%r11,%rbp
	movq	%rbp,88(%rsp)

	movl	16(%rdi),%ecx			# lane 4
	movq	0(%rdi),%r12
	cmpl	%edx,%ecx
	movq	8(%rdi),%rbp
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	24(%rdi),%xmm6
	movl	%ecx,48(%rsp)
	cmovleq	%rsp,%r12
	subq	%r12,%rbp
	movq	%rbp,96(%rsp)

	movl	56(%rdi),%ecx			# lane 5
	movq	40(%rdi),%r13
	cmpl	%edx,%ecx
	movq	48(%rdi),%rbp
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	64(%rdi),%xmm7
	movl	%ecx,52(%rsp)
	cmovleq	%rsp,%r13
	subq	%r13,%rbp
	movq	%rbp,104(%rsp)

	movl	96(%rdi),%ecx			# lane 6
	movq	80(%rdi),%r14
	cmpl	%edx,%ecx
	movq	88(%rdi),%rbp
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	104(%rdi),%xmm8
	movl	%ecx,56(%rsp)
	cmovleq	%rsp,%r14
	subq	%r14,%rbp
	movq	%rbp,112(%rsp)

	movl	136(%rdi),%ecx			# lane 7
	movq	120(%rdi),%r15
	cmpl	%edx,%ecx
	movq	128(%rdi),%rbp
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	144(%rdi),%xmm9
	movl	%ecx,60(%rsp)
	cmovleq	%rsp,%r15
	subq	%r15,%rbp
	movq	%rbp,120(%rsp)
	testl	%edx,%edx
	jz	.Lenc8x_done			# all lanes empty

	vmovups	16-120(%rsi),%xmm1		# round keys 1 and 2
	vmovups	32-120(%rsi),%xmm0
	movl	240-120(%rsi),%eax		# %eax = number of rounds

	# First CBC step per lane: state = IV ^ (plaintext[0] ^ rk0).
	# xmm10..13 are reused in pairs for the whitened plaintext blocks.
	vpxor	(%r8),%xmm15,%xmm10
	leaq	128(%rsp),%rbp			# staging area
	vpxor	(%r9),%xmm15,%xmm11
	vpxor	(%r10),%xmm15,%xmm12
	vpxor	(%r11),%xmm15,%xmm13
	vpxor	%xmm10,%xmm2,%xmm2
	vpxor	(%r12),%xmm15,%xmm10
	vpxor	%xmm11,%xmm3,%xmm3
	vpxor	(%r13),%xmm15,%xmm11
	vpxor	%xmm12,%xmm4,%xmm4
	vpxor	(%r14),%xmm15,%xmm12
	vpxor	%xmm13,%xmm5,%xmm5
	vpxor	(%r15),%xmm15,%xmm13
	vpxor	%xmm10,%xmm6,%xmm6
	movl	$1,%ecx				# constant 1 for counter compares
	vpxor	%xmm11,%xmm7,%xmm7
	vpxor	%xmm12,%xmm8,%xmm8
	vpxor	%xmm13,%xmm9,%xmm9
	jmp	.Loop_enc8x

.align	32
.Loop_enc8x:
	# One CBC block across 8 lanes.  Per lane: advance the input pointer
	# by the stored delta trick, prefetch, pre-whiten the next plaintext
	# block, and redirect exhausted lanes to %rsp.  Round keys alternate
	# through xmm1/xmm0 while they are reloaded in flight.
	# --- lane 0 (%r8) ---
	vaesenc	%xmm1,%xmm2,%xmm2
	cmpl	32+0(%rsp),%ecx
	vaesenc	%xmm1,%xmm3,%xmm3
	prefetcht0	31(%r8)
	vaesenc	%xmm1,%xmm4,%xmm4
	vaesenc	%xmm1,%xmm5,%xmm5
	leaq	(%r8,%rbx,1),%rbx
	cmovgeq	%rsp,%r8
	vaesenc	%xmm1,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesenc	%xmm1,%xmm7,%xmm7
	subq	%r8,%rbx
	vaesenc	%xmm1,%xmm8,%xmm8
	vpxor	16(%r8),%xmm15,%xmm10		# next plaintext ^ rk0
	movq	%rbx,64+0(%rsp)
	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	-72(%rsi),%xmm1
	leaq	16(%r8,%rbx,1),%r8
	vmovdqu	%xmm10,0(%rbp)			# stage whitened block
	# --- lane 1 (%r9) ---
	vaesenc	%xmm0,%xmm2,%xmm2
	cmpl	32+4(%rsp),%ecx
	movq	64+8(%rsp),%rbx
	vaesenc	%xmm0,%xmm3,%xmm3
	prefetcht0	31(%r9)
	vaesenc	%xmm0,%xmm4,%xmm4
	vaesenc	%xmm0,%xmm5,%xmm5
	leaq	(%r9,%rbx,1),%rbx
	cmovgeq	%rsp,%r9
	vaesenc	%xmm0,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesenc	%xmm0,%xmm7,%xmm7
	subq	%r9,%rbx
	vaesenc	%xmm0,%xmm8,%xmm8
	vpxor	16(%r9),%xmm15,%xmm11
	movq	%rbx,64+8(%rsp)
	vaesenc	%xmm0,%xmm9,%xmm9
	vmovups	-56(%rsi),%xmm0
	leaq	16(%r9,%rbx,1),%r9
	vmovdqu	%xmm11,16(%rbp)
	# --- lane 2 (%r10) ---
	vaesenc	%xmm1,%xmm2,%xmm2
	cmpl	32+8(%rsp),%ecx
	movq	64+16(%rsp),%rbx
	vaesenc	%xmm1,%xmm3,%xmm3
	prefetcht0	31(%r10)
	vaesenc	%xmm1,%xmm4,%xmm4
	prefetcht0	15(%r8)
	vaesenc	%xmm1,%xmm5,%xmm5
	leaq	(%r10,%rbx,1),%rbx
	cmovgeq	%rsp,%r10
	vaesenc	%xmm1,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesenc	%xmm1,%xmm7,%xmm7
	subq	%r10,%rbx
	vaesenc	%xmm1,%xmm8,%xmm8
	vpxor	16(%r10),%xmm15,%xmm12
	movq	%rbx,64+16(%rsp)
	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	-40(%rsi),%xmm1
	leaq	16(%r10,%rbx,1),%r10
	vmovdqu	%xmm12,32(%rbp)
	# --- lane 3 (%r11) ---
	vaesenc	%xmm0,%xmm2,%xmm2
	cmpl	32+12(%rsp),%ecx
	movq	64+24(%rsp),%rbx
	vaesenc	%xmm0,%xmm3,%xmm3
	prefetcht0	31(%r11)
	vaesenc	%xmm0,%xmm4,%xmm4
	prefetcht0	15(%r9)
	vaesenc	%xmm0,%xmm5,%xmm5
	leaq	(%r11,%rbx,1),%rbx
	cmovgeq	%rsp,%r11
	vaesenc	%xmm0,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesenc	%xmm0,%xmm7,%xmm7
	subq	%r11,%rbx
	vaesenc	%xmm0,%xmm8,%xmm8
	vpxor	16(%r11),%xmm15,%xmm13
	movq	%rbx,64+24(%rsp)
	vaesenc	%xmm0,%xmm9,%xmm9
	vmovups	-24(%rsi),%xmm0
	leaq	16(%r11,%rbx,1),%r11
	vmovdqu	%xmm13,48(%rbp)
	# --- lane 4 (%r12); xmm10..13 now held in registers, no staging ---
	vaesenc	%xmm1,%xmm2,%xmm2
	cmpl	32+16(%rsp),%ecx
	movq	64+32(%rsp),%rbx
	vaesenc	%xmm1,%xmm3,%xmm3
	prefetcht0	31(%r12)
	vaesenc	%xmm1,%xmm4,%xmm4
	prefetcht0	15(%r10)
	vaesenc	%xmm1,%xmm5,%xmm5
	leaq	(%r12,%rbx,1),%rbx
	cmovgeq	%rsp,%r12
	vaesenc	%xmm1,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesenc	%xmm1,%xmm7,%xmm7
	subq	%r12,%rbx
	vaesenc	%xmm1,%xmm8,%xmm8
	vpxor	16(%r12),%xmm15,%xmm10
	movq	%rbx,64+32(%rsp)
	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	-8(%rsi),%xmm1
	leaq	16(%r12,%rbx,1),%r12
	# --- lane 5 (%r13) ---
	vaesenc	%xmm0,%xmm2,%xmm2
	cmpl	32+20(%rsp),%ecx
	movq	64+40(%rsp),%rbx
	vaesenc	%xmm0,%xmm3,%xmm3
	prefetcht0	31(%r13)
	vaesenc	%xmm0,%xmm4,%xmm4
	prefetcht0	15(%r11)
	vaesenc	%xmm0,%xmm5,%xmm5
	leaq	(%rbx,%r13,1),%rbx
	cmovgeq	%rsp,%r13
	vaesenc	%xmm0,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesenc	%xmm0,%xmm7,%xmm7
	subq	%r13,%rbx
	vaesenc	%xmm0,%xmm8,%xmm8
	vpxor	16(%r13),%xmm15,%xmm11
	movq	%rbx,64+40(%rsp)
	vaesenc	%xmm0,%xmm9,%xmm9
	vmovups	8(%rsi),%xmm0
	leaq	16(%r13,%rbx,1),%r13
	# --- lane 6 (%r14) ---
	vaesenc	%xmm1,%xmm2,%xmm2
	cmpl	32+24(%rsp),%ecx
	movq	64+48(%rsp),%rbx
	vaesenc	%xmm1,%xmm3,%xmm3
	prefetcht0	31(%r14)
	vaesenc	%xmm1,%xmm4,%xmm4
	prefetcht0	15(%r12)
	vaesenc	%xmm1,%xmm5,%xmm5
	leaq	(%r14,%rbx,1),%rbx
	cmovgeq	%rsp,%r14
	vaesenc	%xmm1,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesenc	%xmm1,%xmm7,%xmm7
	subq	%r14,%rbx
	vaesenc	%xmm1,%xmm8,%xmm8
	vpxor	16(%r14),%xmm15,%xmm12
	movq	%rbx,64+48(%rsp)
	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	24(%rsi),%xmm1
	leaq	16(%r14,%rbx,1),%r14
	# --- lane 7 (%r15) ---
	vaesenc	%xmm0,%xmm2,%xmm2
	cmpl	32+28(%rsp),%ecx
	movq	64+56(%rsp),%rbx
	vaesenc	%xmm0,%xmm3,%xmm3
	prefetcht0	31(%r15)
	vaesenc	%xmm0,%xmm4,%xmm4
	prefetcht0	15(%r13)
	vaesenc	%xmm0,%xmm5,%xmm5
	leaq	(%r15,%rbx,1),%rbx
	cmovgeq	%rsp,%r15
	vaesenc	%xmm0,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesenc	%xmm0,%xmm7,%xmm7
	subq	%r15,%rbx
	vaesenc	%xmm0,%xmm8,%xmm8
	vpxor	16(%r15),%xmm15,%xmm13
	movq	%rbx,64+56(%rsp)
	vaesenc	%xmm0,%xmm9,%xmm9
	vmovups	40(%rsi),%xmm0
	leaq	16(%r15,%rbx,1),%r15
	vmovdqu	32(%rsp),%xmm14			# lane counters 0..3 as vector
	prefetcht0	15(%r14)
	prefetcht0	15(%r15)
	cmpl	$11,%eax			# 10/12/14-round key length?
	jb	.Lenc8x_tail			# AES-128

	vaesenc	%xmm1,%xmm2,%xmm2		# AES-192/256 extra rounds
	vaesenc	%xmm1,%xmm3,%xmm3
	vaesenc	%xmm1,%xmm4,%xmm4
	vaesenc	%xmm1,%xmm5,%xmm5
	vaesenc	%xmm1,%xmm6,%xmm6
	vaesenc	%xmm1,%xmm7,%xmm7
	vaesenc	%xmm1,%xmm8,%xmm8
	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	176-120(%rsi),%xmm1

	vaesenc	%xmm0,%xmm2,%xmm2
	vaesenc	%xmm0,%xmm3,%xmm3
	vaesenc	%xmm0,%xmm4,%xmm4
	vaesenc	%xmm0,%xmm5,%xmm5
	vaesenc	%xmm0,%xmm6,%xmm6
	vaesenc	%xmm0,%xmm7,%xmm7
	vaesenc	%xmm0,%xmm8,%xmm8
	vaesenc	%xmm0,%xmm9,%xmm9
	vmovups	192-120(%rsi),%xmm0
	je	.Lenc8x_tail			# AES-192

	vaesenc	%xmm1,%xmm2,%xmm2		# AES-256 extra rounds
	vaesenc	%xmm1,%xmm3,%xmm3
	vaesenc	%xmm1,%xmm4,%xmm4
	vaesenc	%xmm1,%xmm5,%xmm5
	vaesenc	%xmm1,%xmm6,%xmm6
	vaesenc	%xmm1,%xmm7,%xmm7
	vaesenc	%xmm1,%xmm8,%xmm8
	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	208-120(%rsi),%xmm1

	vaesenc	%xmm0,%xmm2,%xmm2
	vaesenc	%xmm0,%xmm3,%xmm3
	vaesenc	%xmm0,%xmm4,%xmm4
	vaesenc	%xmm0,%xmm5,%xmm5
	vaesenc	%xmm0,%xmm6,%xmm6
	vaesenc	%xmm0,%xmm7,%xmm7
	vaesenc	%xmm0,%xmm8,%xmm8
	vaesenc	%xmm0,%xmm9,%xmm9
	vmovups	224-120(%rsi),%xmm0

.Lenc8x_tail:
	# Penultimate round, interleaved with the vectorized decrement of the
	# lane counters (lanes 0-3 here, lanes 4-7 below):
	# mask = (counter > 0) ? -1 : 0; counter += mask.
	vaesenc	%xmm1,%xmm2,%xmm2
	vpxor	%xmm15,%xmm15,%xmm15
	vaesenc	%xmm1,%xmm3,%xmm3
	vaesenc	%xmm1,%xmm4,%xmm4
	vpcmpgtd	%xmm15,%xmm14,%xmm15
	vaesenc	%xmm1,%xmm5,%xmm5
	vaesenc	%xmm1,%xmm6,%xmm6
	vpaddd	%xmm14,%xmm15,%xmm15
	vmovdqu	48(%rsp),%xmm14			# counters for lanes 4..7
	vaesenc	%xmm1,%xmm7,%xmm7
	movq	64(%rsp),%rbx			# lane 0 out-in delta
	vaesenc	%xmm1,%xmm8,%xmm8
	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	16-120(%rsi),%xmm1		# reload round key 1

	vaesenclast	%xmm0,%xmm2,%xmm2
	vmovdqa	%xmm15,32(%rsp)			# store decremented counters 0..3
	vpxor	%xmm15,%xmm15,%xmm15
	vaesenclast	%xmm0,%xmm3,%xmm3
	vaesenclast	%xmm0,%xmm4,%xmm4
	vpcmpgtd	%xmm15,%xmm14,%xmm15
	vaesenclast	%xmm0,%xmm5,%xmm5
	vaesenclast	%xmm0,%xmm6,%xmm6
	vpaddd	%xmm15,%xmm14,%xmm14
	vmovdqu	-120(%rsi),%xmm15		# reload round key 0
	vaesenclast	%xmm0,%xmm7,%xmm7
	vaesenclast	%xmm0,%xmm8,%xmm8
	vmovdqa	%xmm14,48(%rsp)			# store decremented counters 4..7
	vaesenclast	%xmm0,%xmm9,%xmm9
	vmovups	32-120(%rsi),%xmm0		# reload round key 2

	# Write ciphertext through out = in + delta (input pointers were
	# already advanced), then chain into the staged/whitened next block.
	vmovups	%xmm2,-16(%r8)
	subq	%rbx,%r8			# restore input pointer
	vpxor	0(%rbp),%xmm2,%xmm2
	vmovups	%xmm3,-16(%r9)
	subq	72(%rsp),%r9
	vpxor	16(%rbp),%xmm3,%xmm3
	vmovups	%xmm4,-16(%r10)
	subq	80(%rsp),%r10
	vpxor	32(%rbp),%xmm4,%xmm4
	vmovups	%xmm5,-16(%r11)
	subq	88(%rsp),%r11
	vpxor	48(%rbp),%xmm5,%xmm5
	vmovups	%xmm6,-16(%r12)
	subq	96(%rsp),%r12
	vpxor	%xmm10,%xmm6,%xmm6
	vmovups	%xmm7,-16(%r13)
	subq	104(%rsp),%r13
	vpxor	%xmm11,%xmm7,%xmm7
	vmovups	%xmm8,-16(%r14)
	subq	112(%rsp),%r14
	vpxor	%xmm12,%xmm8,%xmm8
	vmovups	%xmm9,-16(%r15)
	subq	120(%rsp),%r15
	vpxor	%xmm13,%xmm9,%xmm9

	decl	%edx				# blocks remaining (max lane)
	jnz	.Loop_enc8x

	movq	16(%rsp),%rax
.cfi_def_cfa	%rax,8

.Lenc8x_done:
	# Epilogue: clear upper YMM state and restore callee-saved registers.
	vzeroupper
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lenc8x_epilogue:
	.byte	0xf3,0xc3			# rep ret
.cfi_endproc
.size	aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
|
|
|
|
.type aesni_multi_cbc_decrypt_avx,@function
|
|
.align 32
|
|
aesni_multi_cbc_decrypt_avx:
|
|
.cfi_startproc
|
|
_avx_cbc_dec_shortcut:
|
|
movq %rsp,%rax
|
|
.cfi_def_cfa_register %rax
|
|
pushq %rbx
|
|
.cfi_offset %rbx,-16
|
|
pushq %rbp
|
|
.cfi_offset %rbp,-24
|
|
pushq %r12
|
|
.cfi_offset %r12,-32
|
|
pushq %r13
|
|
.cfi_offset %r13,-40
|
|
pushq %r14
|
|
.cfi_offset %r14,-48
|
|
pushq %r15
|
|
.cfi_offset %r15,-56
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
subq $256,%rsp
|
|
andq $-256,%rsp
|
|
subq $192,%rsp
|
|
movq %rax,16(%rsp)
|
|
.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08
|
|
|
|
.Ldec8x_body:
|
|
vzeroupper
|
|
vmovdqu (%rsi),%xmm15
|
|
leaq 120(%rsi),%rsi
|
|
leaq 160(%rdi),%rdi
|
|
shrl $1,%edx
|
|
|
|
.Ldec8x_loop_grande:
|
|
|
|
xorl %edx,%edx
|
|
|
|
movl -144(%rdi),%ecx
|
|
|
|
movq -160(%rdi),%r8
|
|
cmpl %edx,%ecx
|
|
|
|
movq -152(%rdi),%rbx
|
|
cmovgl %ecx,%edx
|
|
testl %ecx,%ecx
|
|
|
|
vmovdqu -136(%rdi),%xmm2
|
|
movl %ecx,32(%rsp)
|
|
cmovleq %rsp,%r8
|
|
subq %r8,%rbx
|
|
movq %rbx,64(%rsp)
|
|
vmovdqu %xmm2,192(%rsp)
|
|
|
|
movl -104(%rdi),%ecx
|
|
|
|
movq -120(%rdi),%r9
|
|
cmpl %edx,%ecx
|
|
|
|
movq -112(%rdi),%rbp
|
|
cmovgl %ecx,%edx
|
|
testl %ecx,%ecx
|
|
|
|
vmovdqu -96(%rdi),%xmm3
|
|
movl %ecx,36(%rsp)
|
|
cmovleq %rsp,%r9
|
|
subq %r9,%rbp
|
|
movq %rbp,72(%rsp)
|
|
vmovdqu %xmm3,208(%rsp)
|
|
|
|
movl -64(%rdi),%ecx
|
|
|
|
movq -80(%rdi),%r10
|
|
cmpl %edx,%ecx
|
|
|
|
movq -72(%rdi),%rbp
|
|
cmovgl %ecx,%edx
|
|
testl %ecx,%ecx
|
|
|
|
vmovdqu -56(%rdi),%xmm4
|
|
movl %ecx,40(%rsp)
|
|
cmovleq %rsp,%r10
|
|
subq %r10,%rbp
|
|
movq %rbp,80(%rsp)
|
|
vmovdqu %xmm4,224(%rsp)
|
|
|
|
movl -24(%rdi),%ecx
|
|
|
|
movq -40(%rdi),%r11
|
|
cmpl %edx,%ecx
|
|
|
|
movq -32(%rdi),%rbp
|
|
cmovgl %ecx,%edx
|
|
testl %ecx,%ecx
|
|
|
|
vmovdqu -16(%rdi),%xmm5
|
|
movl %ecx,44(%rsp)
|
|
cmovleq %rsp,%r11
|
|
subq %r11,%rbp
|
|
movq %rbp,88(%rsp)
|
|
vmovdqu %xmm5,240(%rsp)
|
|
|
|
movl 16(%rdi),%ecx
|
|
|
|
movq 0(%rdi),%r12
|
|
cmpl %edx,%ecx
|
|
|
|
movq 8(%rdi),%rbp
|
|
cmovgl %ecx,%edx
|
|
testl %ecx,%ecx
|
|
|
|
vmovdqu 24(%rdi),%xmm6
|
|
movl %ecx,48(%rsp)
|
|
cmovleq %rsp,%r12
|
|
subq %r12,%rbp
|
|
movq %rbp,96(%rsp)
|
|
vmovdqu %xmm6,256(%rsp)
|
|
|
|
movl 56(%rdi),%ecx
|
|
|
|
movq 40(%rdi),%r13
|
|
cmpl %edx,%ecx
|
|
|
|
movq 48(%rdi),%rbp
|
|
cmovgl %ecx,%edx
|
|
testl %ecx,%ecx
|
|
|
|
vmovdqu 64(%rdi),%xmm7
|
|
movl %ecx,52(%rsp)
|
|
cmovleq %rsp,%r13
|
|
subq %r13,%rbp
|
|
movq %rbp,104(%rsp)
|
|
vmovdqu %xmm7,272(%rsp)
|
|
|
|
movl 96(%rdi),%ecx
|
|
|
|
movq 80(%rdi),%r14
|
|
cmpl %edx,%ecx
|
|
|
|
movq 88(%rdi),%rbp
|
|
cmovgl %ecx,%edx
|
|
testl %ecx,%ecx
|
|
|
|
vmovdqu 104(%rdi),%xmm8
|
|
movl %ecx,56(%rsp)
|
|
cmovleq %rsp,%r14
|
|
subq %r14,%rbp
|
|
movq %rbp,112(%rsp)
|
|
vmovdqu %xmm8,288(%rsp)
|
|
|
|
movl 136(%rdi),%ecx
|
|
|
|
movq 120(%rdi),%r15
|
|
cmpl %edx,%ecx
|
|
|
|
movq 128(%rdi),%rbp
|
|
cmovgl %ecx,%edx
|
|
testl %ecx,%ecx
|
|
|
|
vmovdqu 144(%rdi),%xmm9
|
|
movl %ecx,60(%rsp)
|
|
cmovleq %rsp,%r15
|
|
subq %r15,%rbp
|
|
movq %rbp,120(%rsp)
|
|
vmovdqu %xmm9,304(%rsp)
|
|
testl %edx,%edx
|
|
jz .Ldec8x_done
|
|
|
|
vmovups 16-120(%rsi),%xmm1
|
|
vmovups 32-120(%rsi),%xmm0
|
|
movl 240-120(%rsi),%eax
|
|
leaq 192+128(%rsp),%rbp
|
|
|
|
vmovdqu (%r8),%xmm2
|
|
vmovdqu (%r9),%xmm3
|
|
vmovdqu (%r10),%xmm4
|
|
vmovdqu (%r11),%xmm5
|
|
vmovdqu (%r12),%xmm6
|
|
vmovdqu (%r13),%xmm7
|
|
vmovdqu (%r14),%xmm8
|
|
vmovdqu (%r15),%xmm9
|
|
vmovdqu %xmm2,0(%rbp)
|
|
vpxor %xmm15,%xmm2,%xmm2
|
|
vmovdqu %xmm3,16(%rbp)
|
|
vpxor %xmm15,%xmm3,%xmm3
|
|
vmovdqu %xmm4,32(%rbp)
|
|
vpxor %xmm15,%xmm4,%xmm4
|
|
vmovdqu %xmm5,48(%rbp)
|
|
vpxor %xmm15,%xmm5,%xmm5
|
|
vmovdqu %xmm6,64(%rbp)
|
|
vpxor %xmm15,%xmm6,%xmm6
|
|
vmovdqu %xmm7,80(%rbp)
|
|
vpxor %xmm15,%xmm7,%xmm7
|
|
vmovdqu %xmm8,96(%rbp)
|
|
vpxor %xmm15,%xmm8,%xmm8
|
|
vmovdqu %xmm9,112(%rbp)
|
|
vpxor %xmm15,%xmm9,%xmm9
|
|
xorq $0x80,%rbp
|
|
movl $1,%ecx
|
|
jmp .Loop_dec8x
|
|
|
|
.align 32
|
|
.Loop_dec8x:
|
|
vaesdec %xmm1,%xmm2,%xmm2
|
|
cmpl 32+0(%rsp),%ecx
|
|
vaesdec %xmm1,%xmm3,%xmm3
|
|
prefetcht0 31(%r8)
|
|
vaesdec %xmm1,%xmm4,%xmm4
|
|
vaesdec %xmm1,%xmm5,%xmm5
|
|
leaq (%r8,%rbx,1),%rbx
|
|
cmovgeq %rsp,%r8
|
|
vaesdec %xmm1,%xmm6,%xmm6
|
|
cmovgq %rsp,%rbx
|
|
vaesdec %xmm1,%xmm7,%xmm7
|
|
subq %r8,%rbx
|
|
vaesdec %xmm1,%xmm8,%xmm8
|
|
vmovdqu 16(%r8),%xmm10
|
|
movq %rbx,64+0(%rsp)
|
|
vaesdec %xmm1,%xmm9,%xmm9
|
|
vmovups -72(%rsi),%xmm1
|
|
leaq 16(%r8,%rbx,1),%r8
|
|
vmovdqu %xmm10,128(%rsp)
|
|
vaesdec %xmm0,%xmm2,%xmm2
|
|
cmpl 32+4(%rsp),%ecx
|
|
movq 64+8(%rsp),%rbx
|
|
vaesdec %xmm0,%xmm3,%xmm3
|
|
prefetcht0 31(%r9)
|
|
vaesdec %xmm0,%xmm4,%xmm4
|
|
vaesdec %xmm0,%xmm5,%xmm5
|
|
leaq (%r9,%rbx,1),%rbx
|
|
cmovgeq %rsp,%r9
|
|
vaesdec %xmm0,%xmm6,%xmm6
|
|
cmovgq %rsp,%rbx
|
|
vaesdec %xmm0,%xmm7,%xmm7
|
|
subq %r9,%rbx
|
|
vaesdec %xmm0,%xmm8,%xmm8
|
|
vmovdqu 16(%r9),%xmm11
|
|
movq %rbx,64+8(%rsp)
|
|
vaesdec %xmm0,%xmm9,%xmm9
|
|
vmovups -56(%rsi),%xmm0
|
|
leaq 16(%r9,%rbx,1),%r9
|
|
vmovdqu %xmm11,144(%rsp)
|
|
vaesdec %xmm1,%xmm2,%xmm2
|
|
cmpl 32+8(%rsp),%ecx
|
|
movq 64+16(%rsp),%rbx
|
|
vaesdec %xmm1,%xmm3,%xmm3
|
|
prefetcht0 31(%r10)
|
|
vaesdec %xmm1,%xmm4,%xmm4
|
|
prefetcht0 15(%r8)
|
|
vaesdec %xmm1,%xmm5,%xmm5
|
|
leaq (%r10,%rbx,1),%rbx
|
|
cmovgeq %rsp,%r10
|
|
vaesdec %xmm1,%xmm6,%xmm6
|
|
cmovgq %rsp,%rbx
|
|
vaesdec %xmm1,%xmm7,%xmm7
|
|
subq %r10,%rbx
|
|
vaesdec %xmm1,%xmm8,%xmm8
|
|
vmovdqu 16(%r10),%xmm12
|
|
movq %rbx,64+16(%rsp)
|
|
vaesdec %xmm1,%xmm9,%xmm9
|
|
vmovups -40(%rsi),%xmm1
|
|
leaq 16(%r10,%rbx,1),%r10
|
|
vmovdqu %xmm12,160(%rsp)
|
|
vaesdec %xmm0,%xmm2,%xmm2
|
|
cmpl 32+12(%rsp),%ecx
|
|
movq 64+24(%rsp),%rbx
|
|
vaesdec %xmm0,%xmm3,%xmm3
|
|
prefetcht0 31(%r11)
|
|
vaesdec %xmm0,%xmm4,%xmm4
|
|
prefetcht0 15(%r9)
|
|
vaesdec %xmm0,%xmm5,%xmm5
|
|
leaq (%r11,%rbx,1),%rbx
|
|
cmovgeq %rsp,%r11
|
|
vaesdec %xmm0,%xmm6,%xmm6
|
|
cmovgq %rsp,%rbx
|
|
vaesdec %xmm0,%xmm7,%xmm7
|
|
subq %r11,%rbx
|
|
vaesdec %xmm0,%xmm8,%xmm8
|
|
vmovdqu 16(%r11),%xmm13
|
|
movq %rbx,64+24(%rsp)
|
|
vaesdec %xmm0,%xmm9,%xmm9
|
|
vmovups -24(%rsi),%xmm0
|
|
leaq 16(%r11,%rbx,1),%r11
|
|
vmovdqu %xmm13,176(%rsp)
|
|
vaesdec %xmm1,%xmm2,%xmm2
|
|
cmpl 32+16(%rsp),%ecx
|
|
movq 64+32(%rsp),%rbx
|
|
vaesdec %xmm1,%xmm3,%xmm3
|
|
prefetcht0 31(%r12)
|
|
vaesdec %xmm1,%xmm4,%xmm4
|
|
prefetcht0 15(%r10)
|
|
vaesdec %xmm1,%xmm5,%xmm5
|
|
leaq (%r12,%rbx,1),%rbx
|
|
cmovgeq %rsp,%r12
|
|
vaesdec %xmm1,%xmm6,%xmm6
|
|
cmovgq %rsp,%rbx
|
|
vaesdec %xmm1,%xmm7,%xmm7
|
|
subq %r12,%rbx
|
|
vaesdec %xmm1,%xmm8,%xmm8
|
|
vmovdqu 16(%r12),%xmm10
|
|
movq %rbx,64+32(%rsp)
|
|
vaesdec %xmm1,%xmm9,%xmm9
|
|
vmovups -8(%rsi),%xmm1
|
|
leaq 16(%r12,%rbx,1),%r12
|
|
vaesdec %xmm0,%xmm2,%xmm2
|
|
cmpl 32+20(%rsp),%ecx
|
|
movq 64+40(%rsp),%rbx
|
|
vaesdec %xmm0,%xmm3,%xmm3
|
|
prefetcht0 31(%r13)
|
|
vaesdec %xmm0,%xmm4,%xmm4
|
|
prefetcht0 15(%r11)
|
|
vaesdec %xmm0,%xmm5,%xmm5
|
|
leaq (%rbx,%r13,1),%rbx
|
|
cmovgeq %rsp,%r13
|
|
vaesdec %xmm0,%xmm6,%xmm6
|
|
cmovgq %rsp,%rbx
|
|
vaesdec %xmm0,%xmm7,%xmm7
|
|
subq %r13,%rbx
|
|
vaesdec %xmm0,%xmm8,%xmm8
|
|
vmovdqu 16(%r13),%xmm11
|
|
movq %rbx,64+40(%rsp)
|
|
vaesdec %xmm0,%xmm9,%xmm9
|
|
vmovups 8(%rsi),%xmm0
|
|
leaq 16(%r13,%rbx,1),%r13
|
|
vaesdec %xmm1,%xmm2,%xmm2
|
|
cmpl 32+24(%rsp),%ecx
|
|
movq 64+48(%rsp),%rbx
|
|
vaesdec %xmm1,%xmm3,%xmm3
|
|
prefetcht0 31(%r14)
|
|
vaesdec %xmm1,%xmm4,%xmm4
|
|
prefetcht0 15(%r12)
|
|
vaesdec %xmm1,%xmm5,%xmm5
|
|
leaq (%r14,%rbx,1),%rbx
|
|
cmovgeq %rsp,%r14
|
|
vaesdec %xmm1,%xmm6,%xmm6
|
|
cmovgq %rsp,%rbx
|
|
vaesdec %xmm1,%xmm7,%xmm7
|
|
subq %r14,%rbx
|
|
vaesdec %xmm1,%xmm8,%xmm8
|
|
vmovdqu 16(%r14),%xmm12
|
|
movq %rbx,64+48(%rsp)
|
|
vaesdec %xmm1,%xmm9,%xmm9
|
|
vmovups 24(%rsi),%xmm1
|
|
leaq 16(%r14,%rbx,1),%r14
|
|
vaesdec %xmm0,%xmm2,%xmm2
|
|
cmpl 32+28(%rsp),%ecx
|
|
movq 64+56(%rsp),%rbx
|
|
vaesdec %xmm0,%xmm3,%xmm3
|
|
prefetcht0 31(%r15)
|
|
vaesdec %xmm0,%xmm4,%xmm4
|
|
prefetcht0 15(%r13)
|
|
vaesdec %xmm0,%xmm5,%xmm5
|
|
leaq (%r15,%rbx,1),%rbx
|
|
cmovgeq %rsp,%r15
|
|
vaesdec %xmm0,%xmm6,%xmm6
|
|
cmovgq %rsp,%rbx
|
|
vaesdec %xmm0,%xmm7,%xmm7
|
|
subq %r15,%rbx
|
|
vaesdec %xmm0,%xmm8,%xmm8
|
|
vmovdqu 16(%r15),%xmm13
|
|
movq %rbx,64+56(%rsp)
|
|
vaesdec %xmm0,%xmm9,%xmm9
|
|
vmovups 40(%rsi),%xmm0
|
|
leaq 16(%r15,%rbx,1),%r15
|
|
vmovdqu 32(%rsp),%xmm14
|
|
prefetcht0 15(%r14)
|
|
prefetcht0 15(%r15)
|
|
cmpl $11,%eax
|
|
jb .Ldec8x_tail
|
|
|
|
vaesdec %xmm1,%xmm2,%xmm2
|
|
vaesdec %xmm1,%xmm3,%xmm3
|
|
vaesdec %xmm1,%xmm4,%xmm4
|
|
vaesdec %xmm1,%xmm5,%xmm5
|
|
vaesdec %xmm1,%xmm6,%xmm6
|
|
vaesdec %xmm1,%xmm7,%xmm7
|
|
vaesdec %xmm1,%xmm8,%xmm8
|
|
vaesdec %xmm1,%xmm9,%xmm9
|
|
vmovups 176-120(%rsi),%xmm1
|
|
|
|
vaesdec %xmm0,%xmm2,%xmm2
|
|
vaesdec %xmm0,%xmm3,%xmm3
|
|
vaesdec %xmm0,%xmm4,%xmm4
|
|
vaesdec %xmm0,%xmm5,%xmm5
|
|
vaesdec %xmm0,%xmm6,%xmm6
|
|
vaesdec %xmm0,%xmm7,%xmm7
|
|
vaesdec %xmm0,%xmm8,%xmm8
|
|
vaesdec %xmm0,%xmm9,%xmm9
|
|
vmovups 192-120(%rsi),%xmm0
|
|
je .Ldec8x_tail
|
|
|
|
vaesdec %xmm1,%xmm2,%xmm2
|
|
vaesdec %xmm1,%xmm3,%xmm3
|
|
vaesdec %xmm1,%xmm4,%xmm4
|
|
vaesdec %xmm1,%xmm5,%xmm5
|
|
vaesdec %xmm1,%xmm6,%xmm6
|
|
vaesdec %xmm1,%xmm7,%xmm7
|
|
vaesdec %xmm1,%xmm8,%xmm8
|
|
vaesdec %xmm1,%xmm9,%xmm9
|
|
vmovups 208-120(%rsi),%xmm1
|
|
|
|
vaesdec %xmm0,%xmm2,%xmm2
|
|
vaesdec %xmm0,%xmm3,%xmm3
|
|
vaesdec %xmm0,%xmm4,%xmm4
|
|
vaesdec %xmm0,%xmm5,%xmm5
|
|
vaesdec %xmm0,%xmm6,%xmm6
|
|
vaesdec %xmm0,%xmm7,%xmm7
|
|
vaesdec %xmm0,%xmm8,%xmm8
|
|
vaesdec %xmm0,%xmm9,%xmm9
|
|
vmovups 224-120(%rsi),%xmm0
|
|
|
|
.Ldec8x_tail:
|
|
vaesdec %xmm1,%xmm2,%xmm2
|
|
vpxor %xmm15,%xmm15,%xmm15
|
|
vaesdec %xmm1,%xmm3,%xmm3
|
|
vaesdec %xmm1,%xmm4,%xmm4
|
|
vpcmpgtd %xmm15,%xmm14,%xmm15
|
|
vaesdec %xmm1,%xmm5,%xmm5
|
|
vaesdec %xmm1,%xmm6,%xmm6
|
|
vpaddd %xmm14,%xmm15,%xmm15
|
|
vmovdqu 48(%rsp),%xmm14
|
|
vaesdec %xmm1,%xmm7,%xmm7
|
|
movq 64(%rsp),%rbx
|
|
vaesdec %xmm1,%xmm8,%xmm8
|
|
vaesdec %xmm1,%xmm9,%xmm9
|
|
vmovups 16-120(%rsi),%xmm1
|
|
|
|
vaesdeclast %xmm0,%xmm2,%xmm2
|
|
vmovdqa %xmm15,32(%rsp)
|
|
vpxor %xmm15,%xmm15,%xmm15
|
|
vaesdeclast %xmm0,%xmm3,%xmm3
|
|
vpxor 0(%rbp),%xmm2,%xmm2
|
|
vaesdeclast %xmm0,%xmm4,%xmm4
|
|
vpxor 16(%rbp),%xmm3,%xmm3
|
|
vpcmpgtd %xmm15,%xmm14,%xmm15
|
|
vaesdeclast %xmm0,%xmm5,%xmm5
|
|
vpxor 32(%rbp),%xmm4,%xmm4
|
|
vaesdeclast %xmm0,%xmm6,%xmm6
|
|
vpxor 48(%rbp),%xmm5,%xmm5
|
|
vpaddd %xmm15,%xmm14,%xmm14
|
|
vmovdqu -120(%rsi),%xmm15
|
|
vaesdeclast %xmm0,%xmm7,%xmm7
|
|
vpxor 64(%rbp),%xmm6,%xmm6
|
|
vaesdeclast %xmm0,%xmm8,%xmm8
|
|
vpxor 80(%rbp),%xmm7,%xmm7
|
|
vmovdqa %xmm14,48(%rsp)
|
|
vaesdeclast %xmm0,%xmm9,%xmm9
|
|
vpxor 96(%rbp),%xmm8,%xmm8
|
|
vmovups 32-120(%rsi),%xmm0
|
|
|
|
vmovups %xmm2,-16(%r8)
|
|
subq %rbx,%r8
|
|
vmovdqu 128+0(%rsp),%xmm2
|
|
vpxor 112(%rbp),%xmm9,%xmm9
|
|
vmovups %xmm3,-16(%r9)
|
|
subq 72(%rsp),%r9
|
|
vmovdqu %xmm2,0(%rbp)
|
|
vpxor %xmm15,%xmm2,%xmm2
|
|
vmovdqu 128+16(%rsp),%xmm3
|
|
vmovups %xmm4,-16(%r10)
|
|
subq 80(%rsp),%r10
|
|
vmovdqu %xmm3,16(%rbp)
|
|
vpxor %xmm15,%xmm3,%xmm3
|
|
vmovdqu 128+32(%rsp),%xmm4
|
|
vmovups %xmm5,-16(%r11)
|
|
subq 88(%rsp),%r11
|
|
vmovdqu %xmm4,32(%rbp)
|
|
vpxor %xmm15,%xmm4,%xmm4
|
|
vmovdqu 128+48(%rsp),%xmm5
|
|
vmovups %xmm6,-16(%r12)
|
|
subq 96(%rsp),%r12
|
|
vmovdqu %xmm5,48(%rbp)
|
|
vpxor %xmm15,%xmm5,%xmm5
|
|
vmovdqu %xmm10,64(%rbp)
|
|
vpxor %xmm10,%xmm15,%xmm6
|
|
vmovups %xmm7,-16(%r13)
|
|
subq 104(%rsp),%r13
|
|
vmovdqu %xmm11,80(%rbp)
|
|
vpxor %xmm11,%xmm15,%xmm7
|
|
vmovups %xmm8,-16(%r14)
|
|
subq 112(%rsp),%r14
|
|
vmovdqu %xmm12,96(%rbp)
|
|
vpxor %xmm12,%xmm15,%xmm8
|
|
vmovups %xmm9,-16(%r15)
|
|
subq 120(%rsp),%r15
|
|
vmovdqu %xmm13,112(%rbp)
|
|
vpxor %xmm13,%xmm15,%xmm9
|
|
|
|
xorq $128,%rbp
|
|
decl %edx
|
|
jnz .Loop_dec8x
|
|
|
|
movq 16(%rsp),%rax
|
|
.cfi_def_cfa %rax,8
|
|
|
|
|
|
|
|
|
|
|
|
.Ldec8x_done:
|
|
vzeroupper
|
|
movq -48(%rax),%r15
|
|
.cfi_restore %r15
|
|
movq -40(%rax),%r14
|
|
.cfi_restore %r14
|
|
movq -32(%rax),%r13
|
|
.cfi_restore %r13
|
|
movq -24(%rax),%r12
|
|
.cfi_restore %r12
|
|
movq -16(%rax),%rbp
|
|
.cfi_restore %rbp
|
|
movq -8(%rax),%rbx
|
|
.cfi_restore %rbx
|
|
leaq (%rax),%rsp
|
|
.cfi_def_cfa_register %rsp
|
|
.Ldec8x_epilogue:
|
|
.byte 0xf3,0xc3
|
|
.cfi_endproc
|
|
.size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
|
|
# ELF NT_GNU_PROPERTY_TYPE_0 note: advertises x86 ISA features (IBT and
# SHSTK, i.e. Intel CET support) so the linker and loader can enforce them.
# Layout follows the ELF gABI note format: namesz, descsz, type, name, desc.
.section ".note.gnu.property", "a"
.p2align 3                      # note must be 8-byte aligned on ELF64
.long 1f - 0f                   # namesz: length of owner name ("GNU\0" = 4)
.long 4f - 1f                   # descsz: length of the descriptor that follows
.long 5                         # type: NT_GNU_PROPERTY_TYPE_0
0:
# "GNU" encoded with .byte, since .asciz isn't supported
# on Solaris.
.byte 0x47                      # 'G'
.byte 0x4e                      # 'N'
.byte 0x55                      # 'U'
.byte 0                         # NUL terminator of the owner name
1:
.p2align 3                      # descriptor is 8-byte aligned on ELF64
.long 0xc0000002                # pr_type: GNU_PROPERTY_X86_FEATURE_1_AND
.long 3f - 2f                   # pr_datasz: size of the property payload (4)
2:
.long 3                         # GNU_PROPERTY_X86_FEATURE_1_IBT (bit 0)
                                # | GNU_PROPERTY_X86_FEATURE_1_SHSTK (bit 1)
3:
.p2align 3                      # pad the property entry to an 8-byte boundary
4: