# rsaz-2k-avx512.s - x86-64 AVX-512 IFMA assembly for RSA-2048 modular
# multiplication (OpenSSL 3.2.1, crypto/bn; generated code, AT&T syntax).
# Path: dockerfile/examples/openssl/openssl-3.2.1-src/crypto/bn/rsaz-2k-avx512.s
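#
# ossl_rsaz_avx512ifma_eligible() - runtime CPU capability probe.
# Returns non-zero iff word 2 of OPENSSL_ia32cap_P (CPUID.(EAX=7,ECX=0):EBX)
# has all of AVX512F, AVX512DQ, AVX512IFMA and AVX512VL set; 2149777408 is
# that combined bit mask, 0x80230000.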
.globl ossl_rsaz_avx512ifma_eligible
.type ossl_rsaz_avx512ifma_eligible,@function
.align 32
ossl_rsaz_avx512ifma_eligible:
movl OPENSSL_ia32cap_P+8(%rip),%ecx
xorl %eax,%eax
andl $2149777408,%ecx
cmpl $2149777408,%ecx
cmovel %ecx,%eax
.byte 0xf3,0xc3 # rep ret
.size ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible
.text
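# ossl_rsaz_amm52x20_x1_ifma256 - almost Montgomery multiplication on
# 20 limbs of 52 bits (radix 2^52, 1040 bits). Rough C-level view
# (cf. crypto/bn/rsaz_exp.h in the same tree):
#
#   void ossl_rsaz_amm52x20_x1_ifma256(uint64_t *res, const uint64_t *a,
#                                      const uint64_t *b, const uint64_t *m,
#                                      uint64_t k0);
#
# Computes res congruent to a*b*2^(-1040) mod m with VPMADD52 (IFMA) on
# 256-bit vectors. "Almost": limbs are normalized below 2^52, but the value
# is not guaranteed fully reduced below m. k0 is the Montgomery constant
# -m^(-1); only its low 52 bits are used. SysV args: rdi=res, rsi=a, rdx=b
# (moved to r11, since mulx reads rdx implicitly), rcx=m, r8=k0.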
.globl ossl_rsaz_amm52x20_x1_ifma256
.type ossl_rsaz_amm52x20_x1_ifma256,@function
.align 32
ossl_rsaz_amm52x20_x1_ifma256:
.cfi_startproc
.byte 243,15,30,250 # endbr64
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lossl_rsaz_amm52x20_x1_ifma256_body:
vpxord %ymm0,%ymm0,%ymm0
vmovdqa64 %ymm0,%ymm3
vmovdqa64 %ymm0,%ymm16
vmovdqa64 %ymm0,%ymm17
vmovdqa64 %ymm0,%ymm18
vmovdqa64 %ymm0,%ymm19
xorl %r9d,%r9d
movq %rdx,%r11
movq $0xfffffffffffff,%rax # 2^52 - 1, the 52-bit limb mask
movl $5,%ebx
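# Main loop: 5 iterations, each consuming four 64-bit words of b
# (5 x 4 = 20). Per word, the scalar unit (mulx) forms the low-limb
# product and the Montgomery quotient digit, while vpmadd52luq /
# vpmadd52huq accumulate the low/high 52-bit partial products into
# ymm3,ymm16-ymm19; the valignq chain shifts the 20-limb accumulator
# right by one limb per reduction step.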
.align 32
.Lloop5:
movq 0(%r11),%r13
vpbroadcastq %r13,%ymm1
movq 0(%rsi),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
movq %r12,%r10
adcq $0,%r10
movq %r8,%r13
imulq %r9,%r13
andq %rax,%r13
vpbroadcastq %r13,%ymm2
movq 0(%rcx),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
adcq %r12,%r10
shrq $52,%r9
salq $12,%r10
orq %r10,%r9
vpmadd52luq 0(%rsi),%ymm1,%ymm3
vpmadd52luq 32(%rsi),%ymm1,%ymm16
vpmadd52luq 64(%rsi),%ymm1,%ymm17
vpmadd52luq 96(%rsi),%ymm1,%ymm18
vpmadd52luq 128(%rsi),%ymm1,%ymm19
vpmadd52luq 0(%rcx),%ymm2,%ymm3
vpmadd52luq 32(%rcx),%ymm2,%ymm16
vpmadd52luq 64(%rcx),%ymm2,%ymm17
vpmadd52luq 96(%rcx),%ymm2,%ymm18
vpmadd52luq 128(%rcx),%ymm2,%ymm19
valignq $1,%ymm3,%ymm16,%ymm3
valignq $1,%ymm16,%ymm17,%ymm16
valignq $1,%ymm17,%ymm18,%ymm17
valignq $1,%ymm18,%ymm19,%ymm18
valignq $1,%ymm19,%ymm0,%ymm19
vmovq %xmm3,%r13
addq %r13,%r9
vpmadd52huq 0(%rsi),%ymm1,%ymm3
vpmadd52huq 32(%rsi),%ymm1,%ymm16
vpmadd52huq 64(%rsi),%ymm1,%ymm17
vpmadd52huq 96(%rsi),%ymm1,%ymm18
vpmadd52huq 128(%rsi),%ymm1,%ymm19
vpmadd52huq 0(%rcx),%ymm2,%ymm3
vpmadd52huq 32(%rcx),%ymm2,%ymm16
vpmadd52huq 64(%rcx),%ymm2,%ymm17
vpmadd52huq 96(%rcx),%ymm2,%ymm18
vpmadd52huq 128(%rcx),%ymm2,%ymm19
movq 8(%r11),%r13
vpbroadcastq %r13,%ymm1
movq 0(%rsi),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
movq %r12,%r10
adcq $0,%r10
movq %r8,%r13
imulq %r9,%r13
andq %rax,%r13
vpbroadcastq %r13,%ymm2
movq 0(%rcx),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
adcq %r12,%r10
shrq $52,%r9
salq $12,%r10
orq %r10,%r9
vpmadd52luq 0(%rsi),%ymm1,%ymm3
vpmadd52luq 32(%rsi),%ymm1,%ymm16
vpmadd52luq 64(%rsi),%ymm1,%ymm17
vpmadd52luq 96(%rsi),%ymm1,%ymm18
vpmadd52luq 128(%rsi),%ymm1,%ymm19
vpmadd52luq 0(%rcx),%ymm2,%ymm3
vpmadd52luq 32(%rcx),%ymm2,%ymm16
vpmadd52luq 64(%rcx),%ymm2,%ymm17
vpmadd52luq 96(%rcx),%ymm2,%ymm18
vpmadd52luq 128(%rcx),%ymm2,%ymm19
valignq $1,%ymm3,%ymm16,%ymm3
valignq $1,%ymm16,%ymm17,%ymm16
valignq $1,%ymm17,%ymm18,%ymm17
valignq $1,%ymm18,%ymm19,%ymm18
valignq $1,%ymm19,%ymm0,%ymm19
vmovq %xmm3,%r13
addq %r13,%r9
vpmadd52huq 0(%rsi),%ymm1,%ymm3
vpmadd52huq 32(%rsi),%ymm1,%ymm16
vpmadd52huq 64(%rsi),%ymm1,%ymm17
vpmadd52huq 96(%rsi),%ymm1,%ymm18
vpmadd52huq 128(%rsi),%ymm1,%ymm19
vpmadd52huq 0(%rcx),%ymm2,%ymm3
vpmadd52huq 32(%rcx),%ymm2,%ymm16
vpmadd52huq 64(%rcx),%ymm2,%ymm17
vpmadd52huq 96(%rcx),%ymm2,%ymm18
vpmadd52huq 128(%rcx),%ymm2,%ymm19
movq 16(%r11),%r13
vpbroadcastq %r13,%ymm1
movq 0(%rsi),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
movq %r12,%r10
adcq $0,%r10
movq %r8,%r13
imulq %r9,%r13
andq %rax,%r13
vpbroadcastq %r13,%ymm2
movq 0(%rcx),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
adcq %r12,%r10
shrq $52,%r9
salq $12,%r10
orq %r10,%r9
vpmadd52luq 0(%rsi),%ymm1,%ymm3
vpmadd52luq 32(%rsi),%ymm1,%ymm16
vpmadd52luq 64(%rsi),%ymm1,%ymm17
vpmadd52luq 96(%rsi),%ymm1,%ymm18
vpmadd52luq 128(%rsi),%ymm1,%ymm19
vpmadd52luq 0(%rcx),%ymm2,%ymm3
vpmadd52luq 32(%rcx),%ymm2,%ymm16
vpmadd52luq 64(%rcx),%ymm2,%ymm17
vpmadd52luq 96(%rcx),%ymm2,%ymm18
vpmadd52luq 128(%rcx),%ymm2,%ymm19
valignq $1,%ymm3,%ymm16,%ymm3
valignq $1,%ymm16,%ymm17,%ymm16
valignq $1,%ymm17,%ymm18,%ymm17
valignq $1,%ymm18,%ymm19,%ymm18
valignq $1,%ymm19,%ymm0,%ymm19
vmovq %xmm3,%r13
addq %r13,%r9
vpmadd52huq 0(%rsi),%ymm1,%ymm3
vpmadd52huq 32(%rsi),%ymm1,%ymm16
vpmadd52huq 64(%rsi),%ymm1,%ymm17
vpmadd52huq 96(%rsi),%ymm1,%ymm18
vpmadd52huq 128(%rsi),%ymm1,%ymm19
vpmadd52huq 0(%rcx),%ymm2,%ymm3
vpmadd52huq 32(%rcx),%ymm2,%ymm16
vpmadd52huq 64(%rcx),%ymm2,%ymm17
vpmadd52huq 96(%rcx),%ymm2,%ymm18
vpmadd52huq 128(%rcx),%ymm2,%ymm19
movq 24(%r11),%r13
vpbroadcastq %r13,%ymm1
movq 0(%rsi),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
movq %r12,%r10
adcq $0,%r10
movq %r8,%r13
imulq %r9,%r13
andq %rax,%r13
vpbroadcastq %r13,%ymm2
movq 0(%rcx),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
adcq %r12,%r10
shrq $52,%r9
salq $12,%r10
orq %r10,%r9
vpmadd52luq 0(%rsi),%ymm1,%ymm3
vpmadd52luq 32(%rsi),%ymm1,%ymm16
vpmadd52luq 64(%rsi),%ymm1,%ymm17
vpmadd52luq 96(%rsi),%ymm1,%ymm18
vpmadd52luq 128(%rsi),%ymm1,%ymm19
vpmadd52luq 0(%rcx),%ymm2,%ymm3
vpmadd52luq 32(%rcx),%ymm2,%ymm16
vpmadd52luq 64(%rcx),%ymm2,%ymm17
vpmadd52luq 96(%rcx),%ymm2,%ymm18
vpmadd52luq 128(%rcx),%ymm2,%ymm19
valignq $1,%ymm3,%ymm16,%ymm3
valignq $1,%ymm16,%ymm17,%ymm16
valignq $1,%ymm17,%ymm18,%ymm17
valignq $1,%ymm18,%ymm19,%ymm18
valignq $1,%ymm19,%ymm0,%ymm19
vmovq %xmm3,%r13
addq %r13,%r9
vpmadd52huq 0(%rsi),%ymm1,%ymm3
vpmadd52huq 32(%rsi),%ymm1,%ymm16
vpmadd52huq 64(%rsi),%ymm1,%ymm17
vpmadd52huq 96(%rsi),%ymm1,%ymm18
vpmadd52huq 128(%rsi),%ymm1,%ymm19
vpmadd52huq 0(%rcx),%ymm2,%ymm3
vpmadd52huq 32(%rcx),%ymm2,%ymm16
vpmadd52huq 64(%rcx),%ymm2,%ymm17
vpmadd52huq 96(%rcx),%ymm2,%ymm18
vpmadd52huq 128(%rcx),%ymm2,%ymm19
leaq 32(%r11),%r11
decl %ebx
jne .Lloop5
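# Normalization: merge the scalar limb r9 into lane 0, fold each lane's
# bits above 52 into the next lane, then resolve the remaining carries.
# The "greater-than" compares ($6) flag lanes that generate a carry and
# the "equal" compares ($0) flag lanes that propagate one; the
# shlb/addb/adcb/xorb chain ripples those carries across all 20 limbs in
# scalar registers, and the masked vpsubq plus the final vpandq leaves
# every limb below 2^52 before the result is stored to res.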
vpbroadcastq %r9,%ymm0
vpblendd $3,%ymm0,%ymm3,%ymm3
vpsrlq $52,%ymm3,%ymm0
vpsrlq $52,%ymm16,%ymm1
vpsrlq $52,%ymm17,%ymm2
vpsrlq $52,%ymm18,%ymm25
vpsrlq $52,%ymm19,%ymm26
valignq $3,%ymm25,%ymm26,%ymm26
valignq $3,%ymm2,%ymm25,%ymm25
valignq $3,%ymm1,%ymm2,%ymm2
valignq $3,%ymm0,%ymm1,%ymm1
valignq $3,.Lzeros(%rip),%ymm0,%ymm0
vpandq .Lmask52x4(%rip),%ymm3,%ymm3
vpandq .Lmask52x4(%rip),%ymm16,%ymm16
vpandq .Lmask52x4(%rip),%ymm17,%ymm17
vpandq .Lmask52x4(%rip),%ymm18,%ymm18
vpandq .Lmask52x4(%rip),%ymm19,%ymm19
vpaddq %ymm0,%ymm3,%ymm3
vpaddq %ymm1,%ymm16,%ymm16
vpaddq %ymm2,%ymm17,%ymm17
vpaddq %ymm25,%ymm18,%ymm18
vpaddq %ymm26,%ymm19,%ymm19
vpcmpuq $6,.Lmask52x4(%rip),%ymm3,%k1
vpcmpuq $6,.Lmask52x4(%rip),%ymm16,%k2
vpcmpuq $6,.Lmask52x4(%rip),%ymm17,%k3
vpcmpuq $6,.Lmask52x4(%rip),%ymm18,%k4
vpcmpuq $6,.Lmask52x4(%rip),%ymm19,%k5
kmovb %k1,%r14d
kmovb %k2,%r13d
kmovb %k3,%r12d
kmovb %k4,%r11d
kmovb %k5,%r10d
vpcmpuq $0,.Lmask52x4(%rip),%ymm3,%k1
vpcmpuq $0,.Lmask52x4(%rip),%ymm16,%k2
vpcmpuq $0,.Lmask52x4(%rip),%ymm17,%k3
vpcmpuq $0,.Lmask52x4(%rip),%ymm18,%k4
vpcmpuq $0,.Lmask52x4(%rip),%ymm19,%k5
kmovb %k1,%r9d
kmovb %k2,%r8d
kmovb %k3,%ebx
kmovb %k4,%ecx
kmovb %k5,%edx
shlb $4,%r13b
orb %r13b,%r14b
shlb $4,%r11b
orb %r11b,%r12b
addb %r14b,%r14b
adcb %r12b,%r12b
adcb %r10b,%r10b
shlb $4,%r8b
orb %r8b,%r9b
shlb $4,%cl
orb %cl,%bl
addb %r9b,%r14b
adcb %bl,%r12b
adcb %dl,%r10b
xorb %r9b,%r14b
xorb %bl,%r12b
xorb %dl,%r10b
kmovb %r14d,%k1
shrb $4,%r14b
kmovb %r14d,%k2
kmovb %r12d,%k3
shrb $4,%r12b
kmovb %r12d,%k4
kmovb %r10d,%k5
vpsubq .Lmask52x4(%rip),%ymm3,%ymm3{%k1}
vpsubq .Lmask52x4(%rip),%ymm16,%ymm16{%k2}
vpsubq .Lmask52x4(%rip),%ymm17,%ymm17{%k3}
vpsubq .Lmask52x4(%rip),%ymm18,%ymm18{%k4}
vpsubq .Lmask52x4(%rip),%ymm19,%ymm19{%k5}
vpandq .Lmask52x4(%rip),%ymm3,%ymm3
vpandq .Lmask52x4(%rip),%ymm16,%ymm16
vpandq .Lmask52x4(%rip),%ymm17,%ymm17
vpandq .Lmask52x4(%rip),%ymm18,%ymm18
vpandq .Lmask52x4(%rip),%ymm19,%ymm19
vmovdqu64 %ymm3,0(%rdi)
vmovdqu64 %ymm16,32(%rdi)
vmovdqu64 %ymm17,64(%rdi)
vmovdqu64 %ymm18,96(%rdi)
vmovdqu64 %ymm19,128(%rdi)
vzeroupper
movq 0(%rsp),%r15
.cfi_restore %r15
movq 8(%rsp),%r14
.cfi_restore %r14
movq 16(%rsp),%r13
.cfi_restore %r13
movq 24(%rsp),%r12
.cfi_restore %r12
movq 32(%rsp),%rbp
.cfi_restore %rbp
movq 40(%rsp),%rbx
.cfi_restore %rbx
leaq 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lossl_rsaz_amm52x20_x1_ifma256_epilogue:
.byte 0xf3,0xc3 # rep ret
.cfi_endproc
.size ossl_rsaz_amm52x20_x1_ifma256, .-ossl_rsaz_amm52x20_x1_ifma256
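# .Lmask52x4: four copies of 2^52 - 1, the per-lane limb mask.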
.data
.align 32
.Lmask52x4:
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.text
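# ossl_rsaz_amm52x20_x2_ifma256 - two independent 20-limb almost Montgomery
# multiplications in one pass (typically the p- and q-halves of an RSA-2048
# CRT exponentiation). Operands are laid out back to back at byte offsets
# 0 and 160 (20 limbs x 8 bytes); unlike the x1 variant, r8 points to an
# array of two Montgomery constants (see the movq (%r8) and movq 8(%r8)
# loads below).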
.globl ossl_rsaz_amm52x20_x2_ifma256
.type ossl_rsaz_amm52x20_x2_ifma256,@function
.align 32
ossl_rsaz_amm52x20_x2_ifma256:
.cfi_startproc
.byte 243,15,30,250 # endbr64
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lossl_rsaz_amm52x20_x2_ifma256_body:
vpxord %ymm0,%ymm0,%ymm0
vmovdqa64 %ymm0,%ymm3
vmovdqa64 %ymm0,%ymm16
vmovdqa64 %ymm0,%ymm17
vmovdqa64 %ymm0,%ymm18
vmovdqa64 %ymm0,%ymm19
vmovdqa64 %ymm0,%ymm4
vmovdqa64 %ymm0,%ymm20
vmovdqa64 %ymm0,%ymm21
vmovdqa64 %ymm0,%ymm22
vmovdqa64 %ymm0,%ymm23
xorl %r9d,%r9d
xorl %r15d,%r15d
movq %rdx,%r11
movq $0xfffffffffffff,%rax # 2^52 - 1, the 52-bit limb mask
movl $20,%ebx
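# Main loop: 20 iterations, one 64-bit word of each b operand per
# iteration, interleaving the two multiplications over disjoint
# accumulator sets (ymm3,ymm16-ymm19 and ymm4,ymm20-ymm23).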
.align 32
.Lloop20:
movq 0(%r11),%r13
vpbroadcastq %r13,%ymm1
movq 0(%rsi),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
movq %r12,%r10
adcq $0,%r10
movq (%r8),%r13
imulq %r9,%r13
andq %rax,%r13
vpbroadcastq %r13,%ymm2
movq 0(%rcx),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
adcq %r12,%r10
shrq $52,%r9
salq $12,%r10
orq %r10,%r9
vpmadd52luq 0(%rsi),%ymm1,%ymm3
vpmadd52luq 32(%rsi),%ymm1,%ymm16
vpmadd52luq 64(%rsi),%ymm1,%ymm17
vpmadd52luq 96(%rsi),%ymm1,%ymm18
vpmadd52luq 128(%rsi),%ymm1,%ymm19
vpmadd52luq 0(%rcx),%ymm2,%ymm3
vpmadd52luq 32(%rcx),%ymm2,%ymm16
vpmadd52luq 64(%rcx),%ymm2,%ymm17
vpmadd52luq 96(%rcx),%ymm2,%ymm18
vpmadd52luq 128(%rcx),%ymm2,%ymm19
valignq $1,%ymm3,%ymm16,%ymm3
valignq $1,%ymm16,%ymm17,%ymm16
valignq $1,%ymm17,%ymm18,%ymm17
valignq $1,%ymm18,%ymm19,%ymm18
valignq $1,%ymm19,%ymm0,%ymm19
vmovq %xmm3,%r13
addq %r13,%r9
vpmadd52huq 0(%rsi),%ymm1,%ymm3
vpmadd52huq 32(%rsi),%ymm1,%ymm16
vpmadd52huq 64(%rsi),%ymm1,%ymm17
vpmadd52huq 96(%rsi),%ymm1,%ymm18
vpmadd52huq 128(%rsi),%ymm1,%ymm19
vpmadd52huq 0(%rcx),%ymm2,%ymm3
vpmadd52huq 32(%rcx),%ymm2,%ymm16
vpmadd52huq 64(%rcx),%ymm2,%ymm17
vpmadd52huq 96(%rcx),%ymm2,%ymm18
vpmadd52huq 128(%rcx),%ymm2,%ymm19
movq 160(%r11),%r13
vpbroadcastq %r13,%ymm1
movq 160(%rsi),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r15
movq %r12,%r10
adcq $0,%r10
movq 8(%r8),%r13
imulq %r15,%r13
andq %rax,%r13
vpbroadcastq %r13,%ymm2
movq 160(%rcx),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r15
adcq %r12,%r10
shrq $52,%r15
salq $12,%r10
orq %r10,%r15
vpmadd52luq 160(%rsi),%ymm1,%ymm4
vpmadd52luq 192(%rsi),%ymm1,%ymm20
vpmadd52luq 224(%rsi),%ymm1,%ymm21
vpmadd52luq 256(%rsi),%ymm1,%ymm22
vpmadd52luq 288(%rsi),%ymm1,%ymm23
vpmadd52luq 160(%rcx),%ymm2,%ymm4
vpmadd52luq 192(%rcx),%ymm2,%ymm20
vpmadd52luq 224(%rcx),%ymm2,%ymm21
vpmadd52luq 256(%rcx),%ymm2,%ymm22
vpmadd52luq 288(%rcx),%ymm2,%ymm23
valignq $1,%ymm4,%ymm20,%ymm4
valignq $1,%ymm20,%ymm21,%ymm20
valignq $1,%ymm21,%ymm22,%ymm21
valignq $1,%ymm22,%ymm23,%ymm22
valignq $1,%ymm23,%ymm0,%ymm23
vmovq %xmm4,%r13
addq %r13,%r15
vpmadd52huq 160(%rsi),%ymm1,%ymm4
vpmadd52huq 192(%rsi),%ymm1,%ymm20
vpmadd52huq 224(%rsi),%ymm1,%ymm21
vpmadd52huq 256(%rsi),%ymm1,%ymm22
vpmadd52huq 288(%rsi),%ymm1,%ymm23
vpmadd52huq 160(%rcx),%ymm2,%ymm4
vpmadd52huq 192(%rcx),%ymm2,%ymm20
vpmadd52huq 224(%rcx),%ymm2,%ymm21
vpmadd52huq 256(%rcx),%ymm2,%ymm22
vpmadd52huq 288(%rcx),%ymm2,%ymm23
leaq 8(%r11),%r11
decl %ebx
jne .Lloop20
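# Normalize both 20-limb results exactly as in the x1 epilogue: first the
# ymm3,ymm16-ymm19 half (scalar limb r9), then the ymm4,ymm20-ymm23 half
# (scalar limb r15).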
vpbroadcastq %r9,%ymm0
vpblendd $3,%ymm0,%ymm3,%ymm3
vpsrlq $52,%ymm3,%ymm0
vpsrlq $52,%ymm16,%ymm1
vpsrlq $52,%ymm17,%ymm2
vpsrlq $52,%ymm18,%ymm25
vpsrlq $52,%ymm19,%ymm26
valignq $3,%ymm25,%ymm26,%ymm26
valignq $3,%ymm2,%ymm25,%ymm25
valignq $3,%ymm1,%ymm2,%ymm2
valignq $3,%ymm0,%ymm1,%ymm1
valignq $3,.Lzeros(%rip),%ymm0,%ymm0
vpandq .Lmask52x4(%rip),%ymm3,%ymm3
vpandq .Lmask52x4(%rip),%ymm16,%ymm16
vpandq .Lmask52x4(%rip),%ymm17,%ymm17
vpandq .Lmask52x4(%rip),%ymm18,%ymm18
vpandq .Lmask52x4(%rip),%ymm19,%ymm19
vpaddq %ymm0,%ymm3,%ymm3
vpaddq %ymm1,%ymm16,%ymm16
vpaddq %ymm2,%ymm17,%ymm17
vpaddq %ymm25,%ymm18,%ymm18
vpaddq %ymm26,%ymm19,%ymm19
vpcmpuq $6,.Lmask52x4(%rip),%ymm3,%k1
vpcmpuq $6,.Lmask52x4(%rip),%ymm16,%k2
vpcmpuq $6,.Lmask52x4(%rip),%ymm17,%k3
vpcmpuq $6,.Lmask52x4(%rip),%ymm18,%k4
vpcmpuq $6,.Lmask52x4(%rip),%ymm19,%k5
kmovb %k1,%r14d
kmovb %k2,%r13d
kmovb %k3,%r12d
kmovb %k4,%r11d
kmovb %k5,%r10d
vpcmpuq $0,.Lmask52x4(%rip),%ymm3,%k1
vpcmpuq $0,.Lmask52x4(%rip),%ymm16,%k2
vpcmpuq $0,.Lmask52x4(%rip),%ymm17,%k3
vpcmpuq $0,.Lmask52x4(%rip),%ymm18,%k4
vpcmpuq $0,.Lmask52x4(%rip),%ymm19,%k5
kmovb %k1,%r9d
kmovb %k2,%r8d
kmovb %k3,%ebx
kmovb %k4,%ecx
kmovb %k5,%edx
shlb $4,%r13b
orb %r13b,%r14b
shlb $4,%r11b
orb %r11b,%r12b
addb %r14b,%r14b
adcb %r12b,%r12b
adcb %r10b,%r10b
shlb $4,%r8b
orb %r8b,%r9b
shlb $4,%cl
orb %cl,%bl
addb %r9b,%r14b
adcb %bl,%r12b
adcb %dl,%r10b
xorb %r9b,%r14b
xorb %bl,%r12b
xorb %dl,%r10b
kmovb %r14d,%k1
shrb $4,%r14b
kmovb %r14d,%k2
kmovb %r12d,%k3
shrb $4,%r12b
kmovb %r12d,%k4
kmovb %r10d,%k5
vpsubq .Lmask52x4(%rip),%ymm3,%ymm3{%k1}
vpsubq .Lmask52x4(%rip),%ymm16,%ymm16{%k2}
vpsubq .Lmask52x4(%rip),%ymm17,%ymm17{%k3}
vpsubq .Lmask52x4(%rip),%ymm18,%ymm18{%k4}
vpsubq .Lmask52x4(%rip),%ymm19,%ymm19{%k5}
vpandq .Lmask52x4(%rip),%ymm3,%ymm3
vpandq .Lmask52x4(%rip),%ymm16,%ymm16
vpandq .Lmask52x4(%rip),%ymm17,%ymm17
vpandq .Lmask52x4(%rip),%ymm18,%ymm18
vpandq .Lmask52x4(%rip),%ymm19,%ymm19
vpbroadcastq %r15,%ymm0
vpblendd $3,%ymm0,%ymm4,%ymm4
vpsrlq $52,%ymm4,%ymm0
vpsrlq $52,%ymm20,%ymm1
vpsrlq $52,%ymm21,%ymm2
vpsrlq $52,%ymm22,%ymm25
vpsrlq $52,%ymm23,%ymm26
valignq $3,%ymm25,%ymm26,%ymm26
valignq $3,%ymm2,%ymm25,%ymm25
valignq $3,%ymm1,%ymm2,%ymm2
valignq $3,%ymm0,%ymm1,%ymm1
valignq $3,.Lzeros(%rip),%ymm0,%ymm0
vpandq .Lmask52x4(%rip),%ymm4,%ymm4
vpandq .Lmask52x4(%rip),%ymm20,%ymm20
vpandq .Lmask52x4(%rip),%ymm21,%ymm21
vpandq .Lmask52x4(%rip),%ymm22,%ymm22
vpandq .Lmask52x4(%rip),%ymm23,%ymm23
vpaddq %ymm0,%ymm4,%ymm4
vpaddq %ymm1,%ymm20,%ymm20
vpaddq %ymm2,%ymm21,%ymm21
vpaddq %ymm25,%ymm22,%ymm22
vpaddq %ymm26,%ymm23,%ymm23
vpcmpuq $6,.Lmask52x4(%rip),%ymm4,%k1
vpcmpuq $6,.Lmask52x4(%rip),%ymm20,%k2
vpcmpuq $6,.Lmask52x4(%rip),%ymm21,%k3
vpcmpuq $6,.Lmask52x4(%rip),%ymm22,%k4
vpcmpuq $6,.Lmask52x4(%rip),%ymm23,%k5
kmovb %k1,%r14d
kmovb %k2,%r13d
kmovb %k3,%r12d
kmovb %k4,%r11d
kmovb %k5,%r10d
vpcmpuq $0,.Lmask52x4(%rip),%ymm4,%k1
vpcmpuq $0,.Lmask52x4(%rip),%ymm20,%k2
vpcmpuq $0,.Lmask52x4(%rip),%ymm21,%k3
vpcmpuq $0,.Lmask52x4(%rip),%ymm22,%k4
vpcmpuq $0,.Lmask52x4(%rip),%ymm23,%k5
kmovb %k1,%r9d
kmovb %k2,%r8d
kmovb %k3,%ebx
kmovb %k4,%ecx
kmovb %k5,%edx
shlb $4,%r13b
orb %r13b,%r14b
shlb $4,%r11b
orb %r11b,%r12b
addb %r14b,%r14b
adcb %r12b,%r12b
adcb %r10b,%r10b
shlb $4,%r8b
orb %r8b,%r9b
shlb $4,%cl
orb %cl,%bl
addb %r9b,%r14b
adcb %bl,%r12b
adcb %dl,%r10b
xorb %r9b,%r14b
xorb %bl,%r12b
xorb %dl,%r10b
kmovb %r14d,%k1
shrb $4,%r14b
kmovb %r14d,%k2
kmovb %r12d,%k3
shrb $4,%r12b
kmovb %r12d,%k4
kmovb %r10d,%k5
vpsubq .Lmask52x4(%rip),%ymm4,%ymm4{%k1}
vpsubq .Lmask52x4(%rip),%ymm20,%ymm20{%k2}
vpsubq .Lmask52x4(%rip),%ymm21,%ymm21{%k3}
vpsubq .Lmask52x4(%rip),%ymm22,%ymm22{%k4}
vpsubq .Lmask52x4(%rip),%ymm23,%ymm23{%k5}
vpandq .Lmask52x4(%rip),%ymm4,%ymm4
vpandq .Lmask52x4(%rip),%ymm20,%ymm20
vpandq .Lmask52x4(%rip),%ymm21,%ymm21
vpandq .Lmask52x4(%rip),%ymm22,%ymm22
vpandq .Lmask52x4(%rip),%ymm23,%ymm23
vmovdqu64 %ymm3,0(%rdi)
vmovdqu64 %ymm16,32(%rdi)
vmovdqu64 %ymm17,64(%rdi)
vmovdqu64 %ymm18,96(%rdi)
vmovdqu64 %ymm19,128(%rdi)
vmovdqu64 %ymm4,160(%rdi)
vmovdqu64 %ymm20,192(%rdi)
vmovdqu64 %ymm21,224(%rdi)
vmovdqu64 %ymm22,256(%rdi)
vmovdqu64 %ymm23,288(%rdi)
vzeroupper
movq 0(%rsp),%r15
.cfi_restore %r15
movq 8(%rsp),%r14
.cfi_restore %r14
movq 16(%rsp),%r13
.cfi_restore %r13
movq 24(%rsp),%r12
.cfi_restore %r12
movq 32(%rsp),%rbp
.cfi_restore %rbp
movq 40(%rsp),%rbx
.cfi_restore %rbx
leaq 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lossl_rsaz_amm52x20_x2_ifma256_epilogue:
.byte 0xf3,0xc3 # rep ret
.cfi_endproc
.size ossl_rsaz_amm52x20_x2_ifma256, .-ossl_rsaz_amm52x20_x2_ifma256
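# ossl_extract_multiplier_2x20_win5(out, table, idx1, idx2) - constant-time
# gather of two 20-limb multipliers from a window-5 precomputation table
# (32 entries x 320 bytes = 10240 bytes, hence the end pointer in rax).
# Every entry is read and only the ones matching idx1/idx2 are blended in
# under mask, so the memory access pattern is independent of the secret
# indices. SysV args: rdi=out, rsi=table, rdx=idx1, rcx=idx2.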
.text
.align 32
.globl ossl_extract_multiplier_2x20_win5
.type ossl_extract_multiplier_2x20_win5,@function
ossl_extract_multiplier_2x20_win5:
.cfi_startproc
.byte 243,15,30,250 # endbr64
vmovdqa64 .Lones(%rip),%ymm24
vpbroadcastq %rdx,%ymm22
vpbroadcastq %rcx,%ymm23
leaq 10240(%rsi),%rax
vpxor %xmm0,%xmm0,%xmm0
vmovdqa64 %ymm0,%ymm21
vmovdqa64 %ymm0,%ymm1
vmovdqa64 %ymm0,%ymm2
vmovdqa64 %ymm0,%ymm3
vmovdqa64 %ymm0,%ymm4
vmovdqa64 %ymm0,%ymm5
vmovdqa64 %ymm0,%ymm16
vmovdqa64 %ymm0,%ymm17
vmovdqa64 %ymm0,%ymm18
vmovdqa64 %ymm0,%ymm19
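# Scan all 32 table entries: ymm21 holds the running entry index,
# incremented by .Lones each pass and compared against the broadcast
# idx1 (mask k1, first 160 bytes) and idx2 (mask k2, second 160 bytes);
# matching rows are blended into the accumulators.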
.align 32
.Lloop:
vpcmpq $0,%ymm21,%ymm22,%k1
vpcmpq $0,%ymm21,%ymm23,%k2
vmovdqu64 0(%rsi),%ymm20
vpblendmq %ymm20,%ymm0,%ymm0{%k1}
vmovdqu64 32(%rsi),%ymm20
vpblendmq %ymm20,%ymm1,%ymm1{%k1}
vmovdqu64 64(%rsi),%ymm20
vpblendmq %ymm20,%ymm2,%ymm2{%k1}
vmovdqu64 96(%rsi),%ymm20
vpblendmq %ymm20,%ymm3,%ymm3{%k1}
vmovdqu64 128(%rsi),%ymm20
vpblendmq %ymm20,%ymm4,%ymm4{%k1}
vmovdqu64 160(%rsi),%ymm20
vpblendmq %ymm20,%ymm5,%ymm5{%k2}
vmovdqu64 192(%rsi),%ymm20
vpblendmq %ymm20,%ymm16,%ymm16{%k2}
vmovdqu64 224(%rsi),%ymm20
vpblendmq %ymm20,%ymm17,%ymm17{%k2}
vmovdqu64 256(%rsi),%ymm20
vpblendmq %ymm20,%ymm18,%ymm18{%k2}
vmovdqu64 288(%rsi),%ymm20
vpblendmq %ymm20,%ymm19,%ymm19{%k2}
vpaddq %ymm24,%ymm21,%ymm21
addq $320,%rsi
cmpq %rsi,%rax
jne .Lloop
vmovdqu64 %ymm0,0(%rdi)
vmovdqu64 %ymm1,32(%rdi)
vmovdqu64 %ymm2,64(%rdi)
vmovdqu64 %ymm3,96(%rdi)
vmovdqu64 %ymm4,128(%rdi)
vmovdqu64 %ymm5,160(%rdi)
vmovdqu64 %ymm16,192(%rdi)
vmovdqu64 %ymm17,224(%rdi)
vmovdqu64 %ymm18,256(%rdi)
vmovdqu64 %ymm19,288(%rdi)
.byte 0xf3,0xc3 # rep ret
.cfi_endproc
.size ossl_extract_multiplier_2x20_win5, .-ossl_extract_multiplier_2x20_win5
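# .Lones: all-ones increment for the table-scan index above.
# .Lzeros: zero vector, also used by the valignq carry shifts in the AMM
# normalization code.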
.data
.align 32
.Lones:
.quad 1,1,1,1
.Lzeros:
.quad 0,0,0,0
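# GNU property note: type NT_GNU_PROPERTY_TYPE_0 (5), property
# GNU_PROPERTY_X86_FEATURE_1_AND (0xc0000002) with value 3 = IBT | SHSTK,
# matching the endbr64 instructions at each entry point.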
.section ".note.gnu.property", "a"
.p2align 3
.long 1f - 0f
.long 4f - 1f
.long 5
0:
# "GNU" encoded with .byte, since .asciz isn't supported
# on Solaris.
.byte 0x47
.byte 0x4e
.byte 0x55
.byte 0
1:
.p2align 3
.long 0xc0000002
.long 3f - 2f
2:
.long 3
3:
.p2align 3
4: