# 1375 lines, 31 KiB.
# x86-64 GAS assembly (AT&T syntax), AVX-512 IFMA code path.
# NOTE: the "ArmAsm" language tag from the extraction tool is a misdetection.
|
.text

#------------------------------------------------------------------------------
# void ossl_rsaz_amm52x40_x1_ifma256(u64 *res, const u64 *a,
#                                    const u64 *b, const u64 *m, u64 k0)
# ABI:   SysV AMD64
# In:    %rdi = res  — output, 40 digits of 52 bits each (320 bytes)
#        %rsi = a    — first multiplicand, 40x52-bit radix-2^52 digits
#        %rdx = b    — second multiplicand (copied to %r11; %rdx is needed
#                      as the implicit mulx operand)
#        %rcx = m    — modulus, 40x52-bit digits
#        %r8  = k0   — Montgomery factor, -1/m[0] mod 2^52 (passed by value)
# Out:   res[0..39] = Montgomery product, each digit normalized to 52 bits
# Uses:  %ymm0-%ymm12, %ymm23-%ymm29, %k1-%k7; clobbers %rax,%rbx,%r9-%r15
# Notes: requires AVX-512 IFMA (vpmadd52luq/vpmadd52huq) + AVX-512 VL.
#        Accumulator lives in %ymm3..%ymm12 (10 regs x 4 qwords = 40 digits);
#        %r9 carries the scalar low-digit overflow between iterations.
#------------------------------------------------------------------------------
.globl	ossl_rsaz_amm52x40_x1_ifma256
.type	ossl_rsaz_amm52x40_x1_ifma256,@function
.align	32
ossl_rsaz_amm52x40_x1_ifma256:
.cfi_startproc
.byte	243,15,30,250			# endbr64 (CET/IBT landing pad)
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56

	# Zero the 40-digit accumulator (ymm3..ymm12) and the shift-in zero reg.
	vpxord	%ymm0,%ymm0,%ymm0
	vmovdqa64	%ymm0,%ymm3
	vmovdqa64	%ymm0,%ymm4
	vmovdqa64	%ymm0,%ymm5
	vmovdqa64	%ymm0,%ymm6
	vmovdqa64	%ymm0,%ymm7
	vmovdqa64	%ymm0,%ymm8
	vmovdqa64	%ymm0,%ymm9
	vmovdqa64	%ymm0,%ymm10
	vmovdqa64	%ymm0,%ymm11
	vmovdqa64	%ymm0,%ymm12

	xorl	%r9d,%r9d		# scalar carry accumulator = 0

	movq	%rdx,%r11		# b — free %rdx for mulx
	movq	$0xfffffffffffff,%rax	# 52-bit digit mask

	movl	$10,%ebx		# 10 outer iterations x 4 unrolled = 40

.align 32
.Lloop10:
	# ---- iteration 0: multiplier digit b[i+0] ----
	movq	0(%r11),%r13		# r13 = b[i]
	vpbroadcastq	%r13,%ymm1	# ymm1 = {b[i] x4}
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12		# r12:r13 = a[0]*b[i]
	addq	%r13,%r9		# fold low half into scalar acc
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13		# y = k0 * acc
	andq	%rax,%r13		# y &= 2^52-1

	vpbroadcastq	%r13,%ymm2	# ymm2 = {y x4}
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12		# r12:r13 = m[0]*y
	addq	%r13,%r9		# acc low 52 bits become zero
	adcq	%r12,%r10

	shrq	$52,%r9			# carry = acc >> 52
	salq	$12,%r10
	orq	%r10,%r9		# merge high product bits

	# acc += a[] * b[i]  (low 52-bit partial products)
	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm4
	vpmadd52luq	64(%rsi),%ymm1,%ymm5
	vpmadd52luq	96(%rsi),%ymm1,%ymm6
	vpmadd52luq	128(%rsi),%ymm1,%ymm7
	vpmadd52luq	160(%rsi),%ymm1,%ymm8
	vpmadd52luq	192(%rsi),%ymm1,%ymm9
	vpmadd52luq	224(%rsi),%ymm1,%ymm10
	vpmadd52luq	256(%rsi),%ymm1,%ymm11
	vpmadd52luq	288(%rsi),%ymm1,%ymm12

	# acc += m[] * y  (low 52-bit partial products)
	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm4
	vpmadd52luq	64(%rcx),%ymm2,%ymm5
	vpmadd52luq	96(%rcx),%ymm2,%ymm6
	vpmadd52luq	128(%rcx),%ymm2,%ymm7
	vpmadd52luq	160(%rcx),%ymm2,%ymm8
	vpmadd52luq	192(%rcx),%ymm2,%ymm9
	vpmadd52luq	224(%rcx),%ymm2,%ymm10
	vpmadd52luq	256(%rcx),%ymm2,%ymm11
	vpmadd52luq	288(%rcx),%ymm2,%ymm12

	# Shift the whole accumulator right by one 64-bit digit.
	valignq	$1,%ymm3,%ymm4,%ymm3
	valignq	$1,%ymm4,%ymm5,%ymm4
	valignq	$1,%ymm5,%ymm6,%ymm5
	valignq	$1,%ymm6,%ymm7,%ymm6
	valignq	$1,%ymm7,%ymm8,%ymm7
	valignq	$1,%ymm8,%ymm9,%ymm8
	valignq	$1,%ymm9,%ymm10,%ymm9
	valignq	$1,%ymm10,%ymm11,%ymm10
	valignq	$1,%ymm11,%ymm12,%ymm11
	valignq	$1,%ymm12,%ymm0,%ymm12

	vmovq	%xmm3,%r13
	addq	%r13,%r9		# fold new lowest digit into carry

	# acc += high 52-bit halves of a[]*b[i] and m[]*y
	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm4
	vpmadd52huq	64(%rsi),%ymm1,%ymm5
	vpmadd52huq	96(%rsi),%ymm1,%ymm6
	vpmadd52huq	128(%rsi),%ymm1,%ymm7
	vpmadd52huq	160(%rsi),%ymm1,%ymm8
	vpmadd52huq	192(%rsi),%ymm1,%ymm9
	vpmadd52huq	224(%rsi),%ymm1,%ymm10
	vpmadd52huq	256(%rsi),%ymm1,%ymm11
	vpmadd52huq	288(%rsi),%ymm1,%ymm12

	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm4
	vpmadd52huq	64(%rcx),%ymm2,%ymm5
	vpmadd52huq	96(%rcx),%ymm2,%ymm6
	vpmadd52huq	128(%rcx),%ymm2,%ymm7
	vpmadd52huq	160(%rcx),%ymm2,%ymm8
	vpmadd52huq	192(%rcx),%ymm2,%ymm9
	vpmadd52huq	224(%rcx),%ymm2,%ymm10
	vpmadd52huq	256(%rcx),%ymm2,%ymm11
	vpmadd52huq	288(%rcx),%ymm2,%ymm12

	# ---- iteration 1: multiplier digit b[i+1] (same scheme) ----
	movq	8(%r11),%r13
	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm4
	vpmadd52luq	64(%rsi),%ymm1,%ymm5
	vpmadd52luq	96(%rsi),%ymm1,%ymm6
	vpmadd52luq	128(%rsi),%ymm1,%ymm7
	vpmadd52luq	160(%rsi),%ymm1,%ymm8
	vpmadd52luq	192(%rsi),%ymm1,%ymm9
	vpmadd52luq	224(%rsi),%ymm1,%ymm10
	vpmadd52luq	256(%rsi),%ymm1,%ymm11
	vpmadd52luq	288(%rsi),%ymm1,%ymm12

	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm4
	vpmadd52luq	64(%rcx),%ymm2,%ymm5
	vpmadd52luq	96(%rcx),%ymm2,%ymm6
	vpmadd52luq	128(%rcx),%ymm2,%ymm7
	vpmadd52luq	160(%rcx),%ymm2,%ymm8
	vpmadd52luq	192(%rcx),%ymm2,%ymm9
	vpmadd52luq	224(%rcx),%ymm2,%ymm10
	vpmadd52luq	256(%rcx),%ymm2,%ymm11
	vpmadd52luq	288(%rcx),%ymm2,%ymm12

	valignq	$1,%ymm3,%ymm4,%ymm3
	valignq	$1,%ymm4,%ymm5,%ymm4
	valignq	$1,%ymm5,%ymm6,%ymm5
	valignq	$1,%ymm6,%ymm7,%ymm6
	valignq	$1,%ymm7,%ymm8,%ymm7
	valignq	$1,%ymm8,%ymm9,%ymm8
	valignq	$1,%ymm9,%ymm10,%ymm9
	valignq	$1,%ymm10,%ymm11,%ymm10
	valignq	$1,%ymm11,%ymm12,%ymm11
	valignq	$1,%ymm12,%ymm0,%ymm12

	vmovq	%xmm3,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm4
	vpmadd52huq	64(%rsi),%ymm1,%ymm5
	vpmadd52huq	96(%rsi),%ymm1,%ymm6
	vpmadd52huq	128(%rsi),%ymm1,%ymm7
	vpmadd52huq	160(%rsi),%ymm1,%ymm8
	vpmadd52huq	192(%rsi),%ymm1,%ymm9
	vpmadd52huq	224(%rsi),%ymm1,%ymm10
	vpmadd52huq	256(%rsi),%ymm1,%ymm11
	vpmadd52huq	288(%rsi),%ymm1,%ymm12

	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm4
	vpmadd52huq	64(%rcx),%ymm2,%ymm5
	vpmadd52huq	96(%rcx),%ymm2,%ymm6
	vpmadd52huq	128(%rcx),%ymm2,%ymm7
	vpmadd52huq	160(%rcx),%ymm2,%ymm8
	vpmadd52huq	192(%rcx),%ymm2,%ymm9
	vpmadd52huq	224(%rcx),%ymm2,%ymm10
	vpmadd52huq	256(%rcx),%ymm2,%ymm11
	vpmadd52huq	288(%rcx),%ymm2,%ymm12

	# ---- iteration 2: multiplier digit b[i+2] (same scheme) ----
	movq	16(%r11),%r13
	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm4
	vpmadd52luq	64(%rsi),%ymm1,%ymm5
	vpmadd52luq	96(%rsi),%ymm1,%ymm6
	vpmadd52luq	128(%rsi),%ymm1,%ymm7
	vpmadd52luq	160(%rsi),%ymm1,%ymm8
	vpmadd52luq	192(%rsi),%ymm1,%ymm9
	vpmadd52luq	224(%rsi),%ymm1,%ymm10
	vpmadd52luq	256(%rsi),%ymm1,%ymm11
	vpmadd52luq	288(%rsi),%ymm1,%ymm12

	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm4
	vpmadd52luq	64(%rcx),%ymm2,%ymm5
	vpmadd52luq	96(%rcx),%ymm2,%ymm6
	vpmadd52luq	128(%rcx),%ymm2,%ymm7
	vpmadd52luq	160(%rcx),%ymm2,%ymm8
	vpmadd52luq	192(%rcx),%ymm2,%ymm9
	vpmadd52luq	224(%rcx),%ymm2,%ymm10
	vpmadd52luq	256(%rcx),%ymm2,%ymm11
	vpmadd52luq	288(%rcx),%ymm2,%ymm12

	valignq	$1,%ymm3,%ymm4,%ymm3
	valignq	$1,%ymm4,%ymm5,%ymm4
	valignq	$1,%ymm5,%ymm6,%ymm5
	valignq	$1,%ymm6,%ymm7,%ymm6
	valignq	$1,%ymm7,%ymm8,%ymm7
	valignq	$1,%ymm8,%ymm9,%ymm8
	valignq	$1,%ymm9,%ymm10,%ymm9
	valignq	$1,%ymm10,%ymm11,%ymm10
	valignq	$1,%ymm11,%ymm12,%ymm11
	valignq	$1,%ymm12,%ymm0,%ymm12

	vmovq	%xmm3,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm4
	vpmadd52huq	64(%rsi),%ymm1,%ymm5
	vpmadd52huq	96(%rsi),%ymm1,%ymm6
	vpmadd52huq	128(%rsi),%ymm1,%ymm7
	vpmadd52huq	160(%rsi),%ymm1,%ymm8
	vpmadd52huq	192(%rsi),%ymm1,%ymm9
	vpmadd52huq	224(%rsi),%ymm1,%ymm10
	vpmadd52huq	256(%rsi),%ymm1,%ymm11
	vpmadd52huq	288(%rsi),%ymm1,%ymm12

	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm4
	vpmadd52huq	64(%rcx),%ymm2,%ymm5
	vpmadd52huq	96(%rcx),%ymm2,%ymm6
	vpmadd52huq	128(%rcx),%ymm2,%ymm7
	vpmadd52huq	160(%rcx),%ymm2,%ymm8
	vpmadd52huq	192(%rcx),%ymm2,%ymm9
	vpmadd52huq	224(%rcx),%ymm2,%ymm10
	vpmadd52huq	256(%rcx),%ymm2,%ymm11
	vpmadd52huq	288(%rcx),%ymm2,%ymm12

	# ---- iteration 3: multiplier digit b[i+3] (same scheme) ----
	movq	24(%r11),%r13
	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm4
	vpmadd52luq	64(%rsi),%ymm1,%ymm5
	vpmadd52luq	96(%rsi),%ymm1,%ymm6
	vpmadd52luq	128(%rsi),%ymm1,%ymm7
	vpmadd52luq	160(%rsi),%ymm1,%ymm8
	vpmadd52luq	192(%rsi),%ymm1,%ymm9
	vpmadd52luq	224(%rsi),%ymm1,%ymm10
	vpmadd52luq	256(%rsi),%ymm1,%ymm11
	vpmadd52luq	288(%rsi),%ymm1,%ymm12

	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm4
	vpmadd52luq	64(%rcx),%ymm2,%ymm5
	vpmadd52luq	96(%rcx),%ymm2,%ymm6
	vpmadd52luq	128(%rcx),%ymm2,%ymm7
	vpmadd52luq	160(%rcx),%ymm2,%ymm8
	vpmadd52luq	192(%rcx),%ymm2,%ymm9
	vpmadd52luq	224(%rcx),%ymm2,%ymm10
	vpmadd52luq	256(%rcx),%ymm2,%ymm11
	vpmadd52luq	288(%rcx),%ymm2,%ymm12

	valignq	$1,%ymm3,%ymm4,%ymm3
	valignq	$1,%ymm4,%ymm5,%ymm4
	valignq	$1,%ymm5,%ymm6,%ymm5
	valignq	$1,%ymm6,%ymm7,%ymm6
	valignq	$1,%ymm7,%ymm8,%ymm7
	valignq	$1,%ymm8,%ymm9,%ymm8
	valignq	$1,%ymm9,%ymm10,%ymm9
	valignq	$1,%ymm10,%ymm11,%ymm10
	valignq	$1,%ymm11,%ymm12,%ymm11
	valignq	$1,%ymm12,%ymm0,%ymm12

	vmovq	%xmm3,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm4
	vpmadd52huq	64(%rsi),%ymm1,%ymm5
	vpmadd52huq	96(%rsi),%ymm1,%ymm6
	vpmadd52huq	128(%rsi),%ymm1,%ymm7
	vpmadd52huq	160(%rsi),%ymm1,%ymm8
	vpmadd52huq	192(%rsi),%ymm1,%ymm9
	vpmadd52huq	224(%rsi),%ymm1,%ymm10
	vpmadd52huq	256(%rsi),%ymm1,%ymm11
	vpmadd52huq	288(%rsi),%ymm1,%ymm12

	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm4
	vpmadd52huq	64(%rcx),%ymm2,%ymm5
	vpmadd52huq	96(%rcx),%ymm2,%ymm6
	vpmadd52huq	128(%rcx),%ymm2,%ymm7
	vpmadd52huq	160(%rcx),%ymm2,%ymm8
	vpmadd52huq	192(%rcx),%ymm2,%ymm9
	vpmadd52huq	224(%rcx),%ymm2,%ymm10
	vpmadd52huq	256(%rcx),%ymm2,%ymm11
	vpmadd52huq	288(%rcx),%ymm2,%ymm12

	leaq	32(%r11),%r11		# advance b by 4 digits
	decl	%ebx
	jne	.Lloop10

	# Merge the scalar carry into the lowest accumulator digit.
	vpbroadcastq	%r9,%ymm0
	vpblendd	$3,%ymm0,%ymm3,%ymm3

	# ---- digit normalization: propagate >52-bit overflow one digit up ----
	vpsrlq	$52,%ymm3,%ymm0
	vpsrlq	$52,%ymm4,%ymm1
	vpsrlq	$52,%ymm5,%ymm2
	vpsrlq	$52,%ymm6,%ymm23
	vpsrlq	$52,%ymm7,%ymm24
	vpsrlq	$52,%ymm8,%ymm25
	vpsrlq	$52,%ymm9,%ymm26
	vpsrlq	$52,%ymm10,%ymm27
	vpsrlq	$52,%ymm11,%ymm28
	vpsrlq	$52,%ymm12,%ymm29

	# Shift carries left by one qword across the register chain.
	valignq	$3,%ymm28,%ymm29,%ymm29
	valignq	$3,%ymm27,%ymm28,%ymm28
	valignq	$3,%ymm26,%ymm27,%ymm27
	valignq	$3,%ymm25,%ymm26,%ymm26
	valignq	$3,%ymm24,%ymm25,%ymm25
	valignq	$3,%ymm23,%ymm24,%ymm24
	valignq	$3,%ymm2,%ymm23,%ymm23
	valignq	$3,%ymm1,%ymm2,%ymm2
	valignq	$3,%ymm0,%ymm1,%ymm1
	valignq	$3,.Lzeros(%rip),%ymm0,%ymm0

	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
	vpandq	.Lmask52x4(%rip),%ymm4,%ymm4
	vpandq	.Lmask52x4(%rip),%ymm5,%ymm5
	vpandq	.Lmask52x4(%rip),%ymm6,%ymm6
	vpandq	.Lmask52x4(%rip),%ymm7,%ymm7
	vpandq	.Lmask52x4(%rip),%ymm8,%ymm8
	vpandq	.Lmask52x4(%rip),%ymm9,%ymm9
	vpandq	.Lmask52x4(%rip),%ymm10,%ymm10
	vpandq	.Lmask52x4(%rip),%ymm11,%ymm11
	vpandq	.Lmask52x4(%rip),%ymm12,%ymm12

	vpaddq	%ymm0,%ymm3,%ymm3
	vpaddq	%ymm1,%ymm4,%ymm4
	vpaddq	%ymm2,%ymm5,%ymm5
	vpaddq	%ymm23,%ymm6,%ymm6
	vpaddq	%ymm24,%ymm7,%ymm7
	vpaddq	%ymm25,%ymm8,%ymm8
	vpaddq	%ymm26,%ymm9,%ymm9
	vpaddq	%ymm27,%ymm10,%ymm10
	vpaddq	%ymm28,%ymm11,%ymm11
	vpaddq	%ymm29,%ymm12,%ymm12

	# ---- branchless carry ripple across all 40 digits ----
	# bit i of {r14..r10} = "digit i overflowed" (unsigned > mask)
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm3,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm4,%k2
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	shlb	$4,%r13b
	orb	%r13b,%r14b

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm5,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm6,%k2
	kmovb	%k1,%r13d
	kmovb	%k2,%r12d
	shlb	$4,%r12b
	orb	%r12b,%r13b

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm7,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm8,%k2
	kmovb	%k1,%r12d
	kmovb	%k2,%r11d
	shlb	$4,%r11b
	orb	%r11b,%r12b

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm9,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm10,%k2
	kmovb	%k1,%r11d
	kmovb	%k2,%r10d
	shlb	$4,%r10b
	orb	%r10b,%r11b

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm11,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm12,%k2
	kmovb	%k1,%r10d
	kmovb	%k2,%r9d
	shlb	$4,%r9b
	orb	%r9b,%r10b

	# carry-out bits shifted up by one via add-with-carry chain
	addb	%r14b,%r14b
	adcb	%r13b,%r13b
	adcb	%r12b,%r12b
	adcb	%r11b,%r11b
	adcb	%r10b,%r10b

	# bit i of {r9,r8,dl,cl,bl} = "digit i saturated" (== mask)
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm3,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm4,%k2
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	shlb	$4,%r8b
	orb	%r8b,%r9b

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm5,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm6,%k2
	kmovb	%k1,%r8d
	kmovb	%k2,%edx
	shlb	$4,%dl
	orb	%dl,%r8b

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm7,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm8,%k2
	kmovb	%k1,%edx
	kmovb	%k2,%ecx
	shlb	$4,%cl
	orb	%cl,%dl

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm9,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm10,%k2
	kmovb	%k1,%ecx
	kmovb	%k2,%ebx
	shlb	$4,%bl
	orb	%bl,%cl

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm11,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm12,%k2
	kmovb	%k1,%ebx
	kmovb	%k2,%eax
	shlb	$4,%al
	orb	%al,%bl

	# carry + saturated, then xor: classic vectorized carry-propagation trick
	addb	%r9b,%r14b
	adcb	%r8b,%r13b
	adcb	%dl,%r12b
	adcb	%cl,%r11b
	adcb	%bl,%r10b

	xorb	%r9b,%r14b
	xorb	%r8b,%r13b
	xorb	%dl,%r12b
	xorb	%cl,%r11b
	xorb	%bl,%r10b

	kmovb	%r14d,%k1
	shrb	$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r13d,%k3
	shrb	$4,%r13b
	kmovb	%r13d,%k4
	kmovb	%r12d,%k5
	shrb	$4,%r12b
	kmovb	%r12d,%k6
	kmovb	%r11d,%k7

	# apply carries: subtract 2^52-1 (i.e. add 1 mod 2^52) in flagged lanes
	vpsubq	.Lmask52x4(%rip),%ymm3,%ymm3{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm4,%ymm4{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm5,%ymm5{%k3}
	vpsubq	.Lmask52x4(%rip),%ymm6,%ymm6{%k4}
	vpsubq	.Lmask52x4(%rip),%ymm7,%ymm7{%k5}
	vpsubq	.Lmask52x4(%rip),%ymm8,%ymm8{%k6}
	vpsubq	.Lmask52x4(%rip),%ymm9,%ymm9{%k7}

	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
	vpandq	.Lmask52x4(%rip),%ymm4,%ymm4
	vpandq	.Lmask52x4(%rip),%ymm5,%ymm5
	vpandq	.Lmask52x4(%rip),%ymm6,%ymm6
	vpandq	.Lmask52x4(%rip),%ymm7,%ymm7
	vpandq	.Lmask52x4(%rip),%ymm8,%ymm8
	vpandq	.Lmask52x4(%rip),%ymm9,%ymm9

	shrb	$4,%r11b
	kmovb	%r11d,%k1
	kmovb	%r10d,%k2
	shrb	$4,%r10b
	kmovb	%r10d,%k3

	vpsubq	.Lmask52x4(%rip),%ymm10,%ymm10{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm11,%ymm11{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm12,%ymm12{%k3}

	vpandq	.Lmask52x4(%rip),%ymm10,%ymm10
	vpandq	.Lmask52x4(%rip),%ymm11,%ymm11
	vpandq	.Lmask52x4(%rip),%ymm12,%ymm12

	# store the 40 normalized digits
	vmovdqu64	%ymm3,0(%rdi)
	vmovdqu64	%ymm4,32(%rdi)
	vmovdqu64	%ymm5,64(%rdi)
	vmovdqu64	%ymm6,96(%rdi)
	vmovdqu64	%ymm7,128(%rdi)
	vmovdqu64	%ymm8,160(%rdi)
	vmovdqu64	%ymm9,192(%rdi)
	vmovdqu64	%ymm10,224(%rdi)
	vmovdqu64	%ymm11,256(%rdi)
	vmovdqu64	%ymm12,288(%rdi)

	vzeroupper			# back to SSE-safe state per SysV ABI
	leaq	(%rsp),%rax
.cfi_def_cfa_register	%rax
	movq	0(%rax),%r15
.cfi_restore	%r15
	movq	8(%rax),%r14
.cfi_restore	%r14
	movq	16(%rax),%r13
.cfi_restore	%r13
	movq	24(%rax),%r12
.cfi_restore	%r12
	movq	32(%rax),%rbp
.cfi_restore	%rbp
	movq	40(%rax),%rbx
.cfi_restore	%rbx
	leaq	48(%rax),%rsp
.cfi_def_cfa	%rsp,8
.Lossl_rsaz_amm52x40_x1_ifma256_epilogue:
.byte	0xf3,0xc3			# rep ret
.cfi_endproc
.size	ossl_rsaz_amm52x40_x1_ifma256, .-ossl_rsaz_amm52x40_x1_ifma256
|
||
|
.data
.align	32
# Four-lane 52-bit digit mask (2^52 - 1 per qword), used by the functions
# above for radix-2^52 normalization and carry propagation.
# NOTE(review): .Lzeros is also referenced from this file but is defined in a
# part of the source not visible here — confirm it remains present.
.Lmask52x4:
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.text
|
||
|
|
||
|
.globl ossl_rsaz_amm52x40_x2_ifma256
|
||
|
.type ossl_rsaz_amm52x40_x2_ifma256,@function
|
||
|
.align 32
|
||
|
ossl_rsaz_amm52x40_x2_ifma256:
|
||
|
.cfi_startproc
|
||
|
.byte 243,15,30,250
|
||
|
pushq %rbx
|
||
|
.cfi_adjust_cfa_offset 8
|
||
|
.cfi_offset %rbx,-16
|
||
|
pushq %rbp
|
||
|
.cfi_adjust_cfa_offset 8
|
||
|
.cfi_offset %rbp,-24
|
||
|
pushq %r12
|
||
|
.cfi_adjust_cfa_offset 8
|
||
|
.cfi_offset %r12,-32
|
||
|
pushq %r13
|
||
|
.cfi_adjust_cfa_offset 8
|
||
|
.cfi_offset %r13,-40
|
||
|
pushq %r14
|
||
|
.cfi_adjust_cfa_offset 8
|
||
|
.cfi_offset %r14,-48
|
||
|
pushq %r15
|
||
|
.cfi_adjust_cfa_offset 8
|
||
|
.cfi_offset %r15,-56
|
||
|
|
||
|
vpxord %ymm0,%ymm0,%ymm0
|
||
|
vmovdqa64 %ymm0,%ymm3
|
||
|
vmovdqa64 %ymm0,%ymm4
|
||
|
vmovdqa64 %ymm0,%ymm5
|
||
|
vmovdqa64 %ymm0,%ymm6
|
||
|
vmovdqa64 %ymm0,%ymm7
|
||
|
vmovdqa64 %ymm0,%ymm8
|
||
|
vmovdqa64 %ymm0,%ymm9
|
||
|
vmovdqa64 %ymm0,%ymm10
|
||
|
vmovdqa64 %ymm0,%ymm11
|
||
|
vmovdqa64 %ymm0,%ymm12
|
||
|
|
||
|
vmovdqa64 %ymm0,%ymm13
|
||
|
vmovdqa64 %ymm0,%ymm14
|
||
|
vmovdqa64 %ymm0,%ymm15
|
||
|
vmovdqa64 %ymm0,%ymm16
|
||
|
vmovdqa64 %ymm0,%ymm17
|
||
|
vmovdqa64 %ymm0,%ymm18
|
||
|
vmovdqa64 %ymm0,%ymm19
|
||
|
vmovdqa64 %ymm0,%ymm20
|
||
|
vmovdqa64 %ymm0,%ymm21
|
||
|
vmovdqa64 %ymm0,%ymm22
|
||
|
|
||
|
|
||
|
xorl %r9d,%r9d
|
||
|
xorl %r15d,%r15d
|
||
|
|
||
|
movq %rdx,%r11
|
||
|
movq $0xfffffffffffff,%rax
|
||
|
|
||
|
movl $40,%ebx
|
||
|
|
||
|
.align 32
|
||
|
.Lloop40:
|
||
|
movq 0(%r11),%r13
|
||
|
|
||
|
vpbroadcastq %r13,%ymm1
|
||
|
movq 0(%rsi),%rdx
|
||
|
mulxq %r13,%r13,%r12
|
||
|
addq %r13,%r9
|
||
|
movq %r12,%r10
|
||
|
adcq $0,%r10
|
||
|
|
||
|
movq (%r8),%r13
|
||
|
imulq %r9,%r13
|
||
|
andq %rax,%r13
|
||
|
|
||
|
vpbroadcastq %r13,%ymm2
|
||
|
movq 0(%rcx),%rdx
|
||
|
mulxq %r13,%r13,%r12
|
||
|
addq %r13,%r9
|
||
|
adcq %r12,%r10
|
||
|
|
||
|
shrq $52,%r9
|
||
|
salq $12,%r10
|
||
|
orq %r10,%r9
|
||
|
|
||
|
vpmadd52luq 0(%rsi),%ymm1,%ymm3
|
||
|
vpmadd52luq 32(%rsi),%ymm1,%ymm4
|
||
|
vpmadd52luq 64(%rsi),%ymm1,%ymm5
|
||
|
vpmadd52luq 96(%rsi),%ymm1,%ymm6
|
||
|
vpmadd52luq 128(%rsi),%ymm1,%ymm7
|
||
|
vpmadd52luq 160(%rsi),%ymm1,%ymm8
|
||
|
vpmadd52luq 192(%rsi),%ymm1,%ymm9
|
||
|
vpmadd52luq 224(%rsi),%ymm1,%ymm10
|
||
|
vpmadd52luq 256(%rsi),%ymm1,%ymm11
|
||
|
vpmadd52luq 288(%rsi),%ymm1,%ymm12
|
||
|
|
||
|
vpmadd52luq 0(%rcx),%ymm2,%ymm3
|
||
|
vpmadd52luq 32(%rcx),%ymm2,%ymm4
|
||
|
vpmadd52luq 64(%rcx),%ymm2,%ymm5
|
||
|
vpmadd52luq 96(%rcx),%ymm2,%ymm6
|
||
|
vpmadd52luq 128(%rcx),%ymm2,%ymm7
|
||
|
vpmadd52luq 160(%rcx),%ymm2,%ymm8
|
||
|
vpmadd52luq 192(%rcx),%ymm2,%ymm9
|
||
|
vpmadd52luq 224(%rcx),%ymm2,%ymm10
|
||
|
vpmadd52luq 256(%rcx),%ymm2,%ymm11
|
||
|
vpmadd52luq 288(%rcx),%ymm2,%ymm12
|
||
|
|
||
|
|
||
|
valignq $1,%ymm3,%ymm4,%ymm3
|
||
|
valignq $1,%ymm4,%ymm5,%ymm4
|
||
|
valignq $1,%ymm5,%ymm6,%ymm5
|
||
|
valignq $1,%ymm6,%ymm7,%ymm6
|
||
|
valignq $1,%ymm7,%ymm8,%ymm7
|
||
|
valignq $1,%ymm8,%ymm9,%ymm8
|
||
|
valignq $1,%ymm9,%ymm10,%ymm9
|
||
|
valignq $1,%ymm10,%ymm11,%ymm10
|
||
|
valignq $1,%ymm11,%ymm12,%ymm11
|
||
|
valignq $1,%ymm12,%ymm0,%ymm12
|
||
|
|
||
|
vmovq %xmm3,%r13
|
||
|
addq %r13,%r9
|
||
|
|
||
|
vpmadd52huq 0(%rsi),%ymm1,%ymm3
|
||
|
vpmadd52huq 32(%rsi),%ymm1,%ymm4
|
||
|
vpmadd52huq 64(%rsi),%ymm1,%ymm5
|
||
|
vpmadd52huq 96(%rsi),%ymm1,%ymm6
|
||
|
vpmadd52huq 128(%rsi),%ymm1,%ymm7
|
||
|
vpmadd52huq 160(%rsi),%ymm1,%ymm8
|
||
|
vpmadd52huq 192(%rsi),%ymm1,%ymm9
|
||
|
vpmadd52huq 224(%rsi),%ymm1,%ymm10
|
||
|
vpmadd52huq 256(%rsi),%ymm1,%ymm11
|
||
|
vpmadd52huq 288(%rsi),%ymm1,%ymm12
|
||
|
|
||
|
vpmadd52huq 0(%rcx),%ymm2,%ymm3
|
||
|
vpmadd52huq 32(%rcx),%ymm2,%ymm4
|
||
|
vpmadd52huq 64(%rcx),%ymm2,%ymm5
|
||
|
vpmadd52huq 96(%rcx),%ymm2,%ymm6
|
||
|
vpmadd52huq 128(%rcx),%ymm2,%ymm7
|
||
|
vpmadd52huq 160(%rcx),%ymm2,%ymm8
|
||
|
vpmadd52huq 192(%rcx),%ymm2,%ymm9
|
||
|
vpmadd52huq 224(%rcx),%ymm2,%ymm10
|
||
|
vpmadd52huq 256(%rcx),%ymm2,%ymm11
|
||
|
vpmadd52huq 288(%rcx),%ymm2,%ymm12
|
||
|
movq 320(%r11),%r13
|
||
|
|
||
|
vpbroadcastq %r13,%ymm1
|
||
|
movq 320(%rsi),%rdx
|
||
|
mulxq %r13,%r13,%r12
|
||
|
addq %r13,%r15
|
||
|
movq %r12,%r10
|
||
|
adcq $0,%r10
|
||
|
|
||
|
movq 8(%r8),%r13
|
||
|
imulq %r15,%r13
|
||
|
andq %rax,%r13
|
||
|
|
||
|
vpbroadcastq %r13,%ymm2
|
||
|
movq 320(%rcx),%rdx
|
||
|
mulxq %r13,%r13,%r12
|
||
|
addq %r13,%r15
|
||
|
adcq %r12,%r10
|
||
|
|
||
|
shrq $52,%r15
|
||
|
salq $12,%r10
|
||
|
orq %r10,%r15
|
||
|
|
||
|
vpmadd52luq 320(%rsi),%ymm1,%ymm13
|
||
|
vpmadd52luq 352(%rsi),%ymm1,%ymm14
|
||
|
vpmadd52luq 384(%rsi),%ymm1,%ymm15
|
||
|
vpmadd52luq 416(%rsi),%ymm1,%ymm16
|
||
|
vpmadd52luq 448(%rsi),%ymm1,%ymm17
|
||
|
vpmadd52luq 480(%rsi),%ymm1,%ymm18
|
||
|
vpmadd52luq 512(%rsi),%ymm1,%ymm19
|
||
|
vpmadd52luq 544(%rsi),%ymm1,%ymm20
|
||
|
vpmadd52luq 576(%rsi),%ymm1,%ymm21
|
||
|
vpmadd52luq 608(%rsi),%ymm1,%ymm22
|
||
|
|
||
|
vpmadd52luq 320(%rcx),%ymm2,%ymm13
|
||
|
vpmadd52luq 352(%rcx),%ymm2,%ymm14
|
||
|
vpmadd52luq 384(%rcx),%ymm2,%ymm15
|
||
|
vpmadd52luq 416(%rcx),%ymm2,%ymm16
|
||
|
vpmadd52luq 448(%rcx),%ymm2,%ymm17
|
||
|
vpmadd52luq 480(%rcx),%ymm2,%ymm18
|
||
|
vpmadd52luq 512(%rcx),%ymm2,%ymm19
|
||
|
vpmadd52luq 544(%rcx),%ymm2,%ymm20
|
||
|
vpmadd52luq 576(%rcx),%ymm2,%ymm21
|
||
|
vpmadd52luq 608(%rcx),%ymm2,%ymm22
|
||
|
|
||
|
|
||
|
valignq $1,%ymm13,%ymm14,%ymm13
|
||
|
valignq $1,%ymm14,%ymm15,%ymm14
|
||
|
valignq $1,%ymm15,%ymm16,%ymm15
|
||
|
valignq $1,%ymm16,%ymm17,%ymm16
|
||
|
valignq $1,%ymm17,%ymm18,%ymm17
|
||
|
valignq $1,%ymm18,%ymm19,%ymm18
|
||
|
valignq $1,%ymm19,%ymm20,%ymm19
|
||
|
valignq $1,%ymm20,%ymm21,%ymm20
|
||
|
valignq $1,%ymm21,%ymm22,%ymm21
|
||
|
valignq $1,%ymm22,%ymm0,%ymm22
|
||
|
|
||
|
vmovq %xmm13,%r13
|
||
|
addq %r13,%r15
|
||
|
|
||
|
vpmadd52huq 320(%rsi),%ymm1,%ymm13
|
||
|
vpmadd52huq 352(%rsi),%ymm1,%ymm14
|
||
|
vpmadd52huq 384(%rsi),%ymm1,%ymm15
|
||
|
vpmadd52huq 416(%rsi),%ymm1,%ymm16
|
||
|
vpmadd52huq 448(%rsi),%ymm1,%ymm17
|
||
|
vpmadd52huq 480(%rsi),%ymm1,%ymm18
|
||
|
vpmadd52huq 512(%rsi),%ymm1,%ymm19
|
||
|
vpmadd52huq 544(%rsi),%ymm1,%ymm20
|
||
|
vpmadd52huq 576(%rsi),%ymm1,%ymm21
|
||
|
vpmadd52huq 608(%rsi),%ymm1,%ymm22
|
||
|
|
||
|
vpmadd52huq 320(%rcx),%ymm2,%ymm13
|
||
|
vpmadd52huq 352(%rcx),%ymm2,%ymm14
|
||
|
vpmadd52huq 384(%rcx),%ymm2,%ymm15
|
||
|
vpmadd52huq 416(%rcx),%ymm2,%ymm16
|
||
|
vpmadd52huq 448(%rcx),%ymm2,%ymm17
|
||
|
vpmadd52huq 480(%rcx),%ymm2,%ymm18
|
||
|
vpmadd52huq 512(%rcx),%ymm2,%ymm19
|
||
|
vpmadd52huq 544(%rcx),%ymm2,%ymm20
|
||
|
vpmadd52huq 576(%rcx),%ymm2,%ymm21
|
||
|
vpmadd52huq 608(%rcx),%ymm2,%ymm22
|
||
|
leaq 8(%r11),%r11
|
||
|
decl %ebx
|
||
|
jne .Lloop40
|
||
|
|
||
|
vpbroadcastq %r9,%ymm0
|
||
|
vpblendd $3,%ymm0,%ymm3,%ymm3
|
||
|
|
||
|
|
||
|
|
||
|
vpsrlq $52,%ymm3,%ymm0
|
||
|
vpsrlq $52,%ymm4,%ymm1
|
||
|
vpsrlq $52,%ymm5,%ymm2
|
||
|
vpsrlq $52,%ymm6,%ymm23
|
||
|
vpsrlq $52,%ymm7,%ymm24
|
||
|
vpsrlq $52,%ymm8,%ymm25
|
||
|
vpsrlq $52,%ymm9,%ymm26
|
||
|
vpsrlq $52,%ymm10,%ymm27
|
||
|
vpsrlq $52,%ymm11,%ymm28
|
||
|
vpsrlq $52,%ymm12,%ymm29
|
||
|
|
||
|
|
||
|
valignq $3,%ymm28,%ymm29,%ymm29
|
||
|
valignq $3,%ymm27,%ymm28,%ymm28
|
||
|
valignq $3,%ymm26,%ymm27,%ymm27
|
||
|
valignq $3,%ymm25,%ymm26,%ymm26
|
||
|
valignq $3,%ymm24,%ymm25,%ymm25
|
||
|
valignq $3,%ymm23,%ymm24,%ymm24
|
||
|
valignq $3,%ymm2,%ymm23,%ymm23
|
||
|
valignq $3,%ymm1,%ymm2,%ymm2
|
||
|
valignq $3,%ymm0,%ymm1,%ymm1
|
||
|
valignq $3,.Lzeros(%rip),%ymm0,%ymm0
|
||
|
|
||
|
|
||
|
vpandq .Lmask52x4(%rip),%ymm3,%ymm3
|
||
|
vpandq .Lmask52x4(%rip),%ymm4,%ymm4
|
||
|
vpandq .Lmask52x4(%rip),%ymm5,%ymm5
|
||
|
vpandq .Lmask52x4(%rip),%ymm6,%ymm6
|
||
|
vpandq .Lmask52x4(%rip),%ymm7,%ymm7
|
||
|
vpandq .Lmask52x4(%rip),%ymm8,%ymm8
|
||
|
vpandq .Lmask52x4(%rip),%ymm9,%ymm9
|
||
|
vpandq .Lmask52x4(%rip),%ymm10,%ymm10
|
||
|
vpandq .Lmask52x4(%rip),%ymm11,%ymm11
|
||
|
vpandq .Lmask52x4(%rip),%ymm12,%ymm12
|
||
|
|
||
|
|
||
|
vpaddq %ymm0,%ymm3,%ymm3
|
||
|
vpaddq %ymm1,%ymm4,%ymm4
|
||
|
vpaddq %ymm2,%ymm5,%ymm5
|
||
|
vpaddq %ymm23,%ymm6,%ymm6
|
||
|
vpaddq %ymm24,%ymm7,%ymm7
|
||
|
vpaddq %ymm25,%ymm8,%ymm8
|
||
|
vpaddq %ymm26,%ymm9,%ymm9
|
||
|
vpaddq %ymm27,%ymm10,%ymm10
|
||
|
vpaddq %ymm28,%ymm11,%ymm11
|
||
|
vpaddq %ymm29,%ymm12,%ymm12
|
||
|
|
||
|
|
||
|
|
||
|
vpcmpuq $6,.Lmask52x4(%rip),%ymm3,%k1
|
||
|
vpcmpuq $6,.Lmask52x4(%rip),%ymm4,%k2
|
||
|
kmovb %k1,%r14d
|
||
|
kmovb %k2,%r13d
|
||
|
shlb $4,%r13b
|
||
|
orb %r13b,%r14b
|
||
|
|
||
|
vpcmpuq $6,.Lmask52x4(%rip),%ymm5,%k1
|
||
|
vpcmpuq $6,.Lmask52x4(%rip),%ymm6,%k2
|
||
|
kmovb %k1,%r13d
|
||
|
kmovb %k2,%r12d
|
||
|
shlb $4,%r12b
|
||
|
orb %r12b,%r13b
|
||
|
|
||
|
vpcmpuq $6,.Lmask52x4(%rip),%ymm7,%k1
|
||
|
vpcmpuq $6,.Lmask52x4(%rip),%ymm8,%k2
|
||
|
kmovb %k1,%r12d
|
||
|
kmovb %k2,%r11d
|
||
|
shlb $4,%r11b
|
||
|
orb %r11b,%r12b
|
||
|
|
||
|
vpcmpuq $6,.Lmask52x4(%rip),%ymm9,%k1
|
||
|
vpcmpuq $6,.Lmask52x4(%rip),%ymm10,%k2
|
||
|
kmovb %k1,%r11d
|
||
|
kmovb %k2,%r10d
|
||
|
shlb $4,%r10b
|
||
|
orb %r10b,%r11b
|
||
|
|
||
|
vpcmpuq $6,.Lmask52x4(%rip),%ymm11,%k1
|
||
|
vpcmpuq $6,.Lmask52x4(%rip),%ymm12,%k2
|
||
|
kmovb %k1,%r10d
|
||
|
kmovb %k2,%r9d
|
||
|
shlb $4,%r9b
|
||
|
orb %r9b,%r10b
|
||
|
|
||
|
addb %r14b,%r14b
|
||
|
adcb %r13b,%r13b
|
||
|
adcb %r12b,%r12b
|
||
|
adcb %r11b,%r11b
|
||
|
adcb %r10b,%r10b
|
||
|
|
||
|
|
||
|
vpcmpuq $0,.Lmask52x4(%rip),%ymm3,%k1
|
||
|
vpcmpuq $0,.Lmask52x4(%rip),%ymm4,%k2
|
||
|
kmovb %k1,%r9d
|
||
|
kmovb %k2,%r8d
|
||
|
shlb $4,%r8b
|
||
|
orb %r8b,%r9b
|
||
|
|
||
|
vpcmpuq $0,.Lmask52x4(%rip),%ymm5,%k1
|
||
|
vpcmpuq $0,.Lmask52x4(%rip),%ymm6,%k2
|
||
|
kmovb %k1,%r8d
|
||
|
kmovb %k2,%edx
|
||
|
shlb $4,%dl
|
||
|
orb %dl,%r8b
|
||
|
|
||
|
vpcmpuq $0,.Lmask52x4(%rip),%ymm7,%k1
|
||
|
vpcmpuq $0,.Lmask52x4(%rip),%ymm8,%k2
|
||
|
kmovb %k1,%edx
|
||
|
kmovb %k2,%ecx
|
||
|
shlb $4,%cl
|
||
|
orb %cl,%dl
|
||
|
|
||
|
vpcmpuq $0,.Lmask52x4(%rip),%ymm9,%k1
|
||
|
vpcmpuq $0,.Lmask52x4(%rip),%ymm10,%k2
|
||
|
kmovb %k1,%ecx
|
||
|
kmovb %k2,%ebx
|
||
|
shlb $4,%bl
|
||
|
orb %bl,%cl
|
||
|
|
||
|
vpcmpuq $0,.Lmask52x4(%rip),%ymm11,%k1
|
||
|
vpcmpuq $0,.Lmask52x4(%rip),%ymm12,%k2
|
||
|
kmovb %k1,%ebx
|
||
|
kmovb %k2,%eax
|
||
|
shlb $4,%al
|
||
|
orb %al,%bl
|
||
|
|
||
|
addb %r9b,%r14b
|
||
|
adcb %r8b,%r13b
|
||
|
adcb %dl,%r12b
|
||
|
adcb %cl,%r11b
|
||
|
adcb %bl,%r10b
|
||
|
|
||
|
xorb %r9b,%r14b
|
||
|
xorb %r8b,%r13b
|
||
|
xorb %dl,%r12b
|
||
|
xorb %cl,%r11b
|
||
|
xorb %bl,%r10b
|
||
|
|
||
|
kmovb %r14d,%k1
|
||
|
shrb $4,%r14b
|
||
|
kmovb %r14d,%k2
|
||
|
kmovb %r13d,%k3
|
||
|
shrb $4,%r13b
|
||
|
kmovb %r13d,%k4
|
||
|
kmovb %r12d,%k5
|
||
|
shrb $4,%r12b
|
||
|
kmovb %r12d,%k6
|
||
|
kmovb %r11d,%k7
|
||
|
|
||
|
vpsubq .Lmask52x4(%rip),%ymm3,%ymm3{%k1}
|
||
|
vpsubq .Lmask52x4(%rip),%ymm4,%ymm4{%k2}
|
||
|
vpsubq .Lmask52x4(%rip),%ymm5,%ymm5{%k3}
|
||
|
vpsubq .Lmask52x4(%rip),%ymm6,%ymm6{%k4}
|
||
|
vpsubq .Lmask52x4(%rip),%ymm7,%ymm7{%k5}
|
||
|
vpsubq .Lmask52x4(%rip),%ymm8,%ymm8{%k6}
|
||
|
vpsubq .Lmask52x4(%rip),%ymm9,%ymm9{%k7}
|
||
|
|
||
|
vpandq .Lmask52x4(%rip),%ymm3,%ymm3
|
||
|
vpandq .Lmask52x4(%rip),%ymm4,%ymm4
|
||
|
vpandq .Lmask52x4(%rip),%ymm5,%ymm5
|
||
|
vpandq .Lmask52x4(%rip),%ymm6,%ymm6
|
||
|
vpandq .Lmask52x4(%rip),%ymm7,%ymm7
|
||
|
vpandq .Lmask52x4(%rip),%ymm8,%ymm8
|
||
|
vpandq .Lmask52x4(%rip),%ymm9,%ymm9
|
||
|
|
||
|
shrb $4,%r11b
|
||
|
kmovb %r11d,%k1
|
||
|
kmovb %r10d,%k2
|
||
|
shrb $4,%r10b
|
||
|
kmovb %r10d,%k3
|
||
|
|
||
|
vpsubq .Lmask52x4(%rip),%ymm10,%ymm10{%k1}
|
||
|
vpsubq .Lmask52x4(%rip),%ymm11,%ymm11{%k2}
|
||
|
vpsubq .Lmask52x4(%rip),%ymm12,%ymm12{%k3}
|
||
|
|
||
|
vpandq .Lmask52x4(%rip),%ymm10,%ymm10
|
||
|
vpandq .Lmask52x4(%rip),%ymm11,%ymm11
|
||
|
vpandq .Lmask52x4(%rip),%ymm12,%ymm12
|
||
|
|
||
|
vpbroadcastq %r15,%ymm0
|
||
|
vpblendd $3,%ymm0,%ymm13,%ymm13
|
||
|
|
||
|
|
||
|
|
||
|
vpsrlq $52,%ymm13,%ymm0
|
||
|
vpsrlq $52,%ymm14,%ymm1
|
||
|
vpsrlq $52,%ymm15,%ymm2
|
||
|
vpsrlq $52,%ymm16,%ymm23
|
||
|
vpsrlq $52,%ymm17,%ymm24
|
||
|
vpsrlq $52,%ymm18,%ymm25
|
||
|
vpsrlq $52,%ymm19,%ymm26
|
||
|
vpsrlq $52,%ymm20,%ymm27
|
||
|
vpsrlq $52,%ymm21,%ymm28
|
||
|
vpsrlq $52,%ymm22,%ymm29
|
||
|
|
||
|
|
||
|
valignq $3,%ymm28,%ymm29,%ymm29
|
||
|
valignq $3,%ymm27,%ymm28,%ymm28
|
||
|
valignq $3,%ymm26,%ymm27,%ymm27
|
||
|
valignq $3,%ymm25,%ymm26,%ymm26
|
||
|
valignq $3,%ymm24,%ymm25,%ymm25
|
||
|
valignq $3,%ymm23,%ymm24,%ymm24
|
||
|
valignq $3,%ymm2,%ymm23,%ymm23
|
||
|
valignq $3,%ymm1,%ymm2,%ymm2
|
||
|
valignq $3,%ymm0,%ymm1,%ymm1
|
||
|
valignq $3,.Lzeros(%rip),%ymm0,%ymm0
|
||
|
|
||
|
|
||
|
vpandq .Lmask52x4(%rip),%ymm13,%ymm13
|
||
|
vpandq .Lmask52x4(%rip),%ymm14,%ymm14
|
||
|
vpandq .Lmask52x4(%rip),%ymm15,%ymm15
|
||
|
vpandq .Lmask52x4(%rip),%ymm16,%ymm16
|
||
|
vpandq .Lmask52x4(%rip),%ymm17,%ymm17
|
||
|
vpandq .Lmask52x4(%rip),%ymm18,%ymm18
|
||
|
vpandq .Lmask52x4(%rip),%ymm19,%ymm19
|
||
|
vpandq .Lmask52x4(%rip),%ymm20,%ymm20
|
||
|
vpandq .Lmask52x4(%rip),%ymm21,%ymm21
|
||
|
vpandq .Lmask52x4(%rip),%ymm22,%ymm22
|
||
|
|
||
|
|
||
|
vpaddq %ymm0,%ymm13,%ymm13
|
||
|
vpaddq %ymm1,%ymm14,%ymm14
|
||
|
vpaddq %ymm2,%ymm15,%ymm15
|
||
|
vpaddq %ymm23,%ymm16,%ymm16
|
||
|
vpaddq %ymm24,%ymm17,%ymm17
|
||
|
vpaddq %ymm25,%ymm18,%ymm18
|
||
|
vpaddq %ymm26,%ymm19,%ymm19
|
||
|
vpaddq %ymm27,%ymm20,%ymm20
|
||
|
vpaddq %ymm28,%ymm21,%ymm21
|
||
|
vpaddq %ymm29,%ymm22,%ymm22
|
||
|
|
||
|
|
||
|
|
||
|
vpcmpuq $6,.Lmask52x4(%rip),%ymm13,%k1
|
||
|
vpcmpuq $6,.Lmask52x4(%rip),%ymm14,%k2
|
||
|
kmovb %k1,%r14d
|
||
|
kmovb %k2,%r13d
|
||
|
shlb $4,%r13b
|
||
|
orb %r13b,%r14b
|
||
|
|
||
|
vpcmpuq $6,.Lmask52x4(%rip),%ymm15,%k1
|
||
|
vpcmpuq $6,.Lmask52x4(%rip),%ymm16,%k2
|
||
|
kmovb %k1,%r13d
|
||
|
kmovb %k2,%r12d
|
||
|
shlb $4,%r12b
|
||
|
orb %r12b,%r13b
|
||
|
|
||
|
vpcmpuq $6,.Lmask52x4(%rip),%ymm17,%k1
|
||
|
vpcmpuq $6,.Lmask52x4(%rip),%ymm18,%k2
|
||
|
kmovb %k1,%r12d
|
||
|
kmovb %k2,%r11d
|
||
|
shlb $4,%r11b
|
||
|
orb %r11b,%r12b
|
||
|
|
||
|
vpcmpuq $6,.Lmask52x4(%rip),%ymm19,%k1
|
||
|
vpcmpuq $6,.Lmask52x4(%rip),%ymm20,%k2
|
||
|
kmovb %k1,%r11d
|
||
|
kmovb %k2,%r10d
|
||
|
shlb $4,%r10b
|
||
|
orb %r10b,%r11b
|
||
|
|
||
|
vpcmpuq $6,.Lmask52x4(%rip),%ymm21,%k1
|
||
|
vpcmpuq $6,.Lmask52x4(%rip),%ymm22,%k2
|
||
|
kmovb %k1,%r10d
|
||
|
kmovb %k2,%r9d
|
||
|
shlb $4,%r9b
|
||
|
orb %r9b,%r10b
|
||
|
|
||
|
addb %r14b,%r14b
|
||
|
adcb %r13b,%r13b
|
||
|
adcb %r12b,%r12b
|
||
|
adcb %r11b,%r11b
|
||
|
adcb %r10b,%r10b
|
||
|
|
||
|
|
||
|
vpcmpuq $0,.Lmask52x4(%rip),%ymm13,%k1
|
||
|
vpcmpuq $0,.Lmask52x4(%rip),%ymm14,%k2
|
||
|
kmovb %k1,%r9d
|
||
|
kmovb %k2,%r8d
|
||
|
shlb $4,%r8b
|
||
|
orb %r8b,%r9b
|
||
|
|
||
|
vpcmpuq $0,.Lmask52x4(%rip),%ymm15,%k1
|
||
|
vpcmpuq $0,.Lmask52x4(%rip),%ymm16,%k2
|
||
|
kmovb %k1,%r8d
|
||
|
kmovb %k2,%edx
|
||
|
shlb $4,%dl
|
||
|
orb %dl,%r8b
|
||
|
|
||
|
vpcmpuq $0,.Lmask52x4(%rip),%ymm17,%k1
|
||
|
vpcmpuq $0,.Lmask52x4(%rip),%ymm18,%k2
|
||
|
kmovb %k1,%edx
|
||
|
kmovb %k2,%ecx
|
||
|
shlb $4,%cl
|
||
|
orb %cl,%dl
|
||
|
|
||
|
vpcmpuq $0,.Lmask52x4(%rip),%ymm19,%k1
|
||
|
vpcmpuq $0,.Lmask52x4(%rip),%ymm20,%k2
|
||
|
kmovb %k1,%ecx
|
||
|
kmovb %k2,%ebx
|
||
|
shlb $4,%bl
|
||
|
orb %bl,%cl
|
||
|
|
||
|
vpcmpuq $0,.Lmask52x4(%rip),%ymm21,%k1
|
||
|
vpcmpuq $0,.Lmask52x4(%rip),%ymm22,%k2
|
||
|
kmovb %k1,%ebx
|
||
|
kmovb %k2,%eax
|
||
|
shlb $4,%al
|
||
|
orb %al,%bl
|
||
|
|
||
|
addb %r9b,%r14b
|
||
|
adcb %r8b,%r13b
|
||
|
adcb %dl,%r12b
|
||
|
adcb %cl,%r11b
|
||
|
adcb %bl,%r10b
|
||
|
|
||
|
xorb %r9b,%r14b
|
||
|
xorb %r8b,%r13b
|
||
|
xorb %dl,%r12b
|
||
|
xorb %cl,%r11b
|
||
|
xorb %bl,%r10b
|
||
|
|
||
|
kmovb %r14d,%k1
|
||
|
shrb $4,%r14b
|
||
|
kmovb %r14d,%k2
|
||
|
kmovb %r13d,%k3
|
||
|
shrb $4,%r13b
|
||
|
kmovb %r13d,%k4
|
||
|
kmovb %r12d,%k5
|
||
|
shrb $4,%r12b
|
||
|
kmovb %r12d,%k6
|
||
|
kmovb %r11d,%k7
|
||
|
|
||
|
vpsubq .Lmask52x4(%rip),%ymm13,%ymm13{%k1}
|
||
|
vpsubq .Lmask52x4(%rip),%ymm14,%ymm14{%k2}
|
||
|
vpsubq .Lmask52x4(%rip),%ymm15,%ymm15{%k3}
|
||
|
vpsubq .Lmask52x4(%rip),%ymm16,%ymm16{%k4}
|
||
|
vpsubq .Lmask52x4(%rip),%ymm17,%ymm17{%k5}
|
||
|
vpsubq .Lmask52x4(%rip),%ymm18,%ymm18{%k6}
|
||
|
vpsubq .Lmask52x4(%rip),%ymm19,%ymm19{%k7}
|
||
|
|
||
|
vpandq .Lmask52x4(%rip),%ymm13,%ymm13
|
||
|
vpandq .Lmask52x4(%rip),%ymm14,%ymm14
|
||
|
vpandq .Lmask52x4(%rip),%ymm15,%ymm15
|
||
|
vpandq .Lmask52x4(%rip),%ymm16,%ymm16
|
||
|
vpandq .Lmask52x4(%rip),%ymm17,%ymm17
|
||
|
vpandq .Lmask52x4(%rip),%ymm18,%ymm18
|
||
|
vpandq .Lmask52x4(%rip),%ymm19,%ymm19
|
||
|
|
||
|
shrb $4,%r11b
|
||
|
kmovb %r11d,%k1
|
||
|
kmovb %r10d,%k2
|
||
|
shrb $4,%r10b
|
||
|
kmovb %r10d,%k3
|
||
|
|
||
|
vpsubq .Lmask52x4(%rip),%ymm20,%ymm20{%k1}
|
||
|
vpsubq .Lmask52x4(%rip),%ymm21,%ymm21{%k2}
|
||
|
vpsubq .Lmask52x4(%rip),%ymm22,%ymm22{%k3}
|
||
|
|
||
|
vpandq .Lmask52x4(%rip),%ymm20,%ymm20
|
||
|
vpandq .Lmask52x4(%rip),%ymm21,%ymm21
|
||
|
vpandq .Lmask52x4(%rip),%ymm22,%ymm22
|
||
|
|
||
|
vmovdqu64 %ymm3,0(%rdi)
|
||
|
vmovdqu64 %ymm4,32(%rdi)
|
||
|
vmovdqu64 %ymm5,64(%rdi)
|
||
|
vmovdqu64 %ymm6,96(%rdi)
|
||
|
vmovdqu64 %ymm7,128(%rdi)
|
||
|
vmovdqu64 %ymm8,160(%rdi)
|
||
|
vmovdqu64 %ymm9,192(%rdi)
|
||
|
vmovdqu64 %ymm10,224(%rdi)
|
||
|
vmovdqu64 %ymm11,256(%rdi)
|
||
|
vmovdqu64 %ymm12,288(%rdi)
|
||
|
|
||
|
vmovdqu64 %ymm13,320(%rdi)
|
||
|
vmovdqu64 %ymm14,352(%rdi)
|
||
|
vmovdqu64 %ymm15,384(%rdi)
|
||
|
vmovdqu64 %ymm16,416(%rdi)
|
||
|
vmovdqu64 %ymm17,448(%rdi)
|
||
|
vmovdqu64 %ymm18,480(%rdi)
|
||
|
vmovdqu64 %ymm19,512(%rdi)
|
||
|
vmovdqu64 %ymm20,544(%rdi)
|
||
|
vmovdqu64 %ymm21,576(%rdi)
|
||
|
vmovdqu64 %ymm22,608(%rdi)
|
||
|
|
||
|
vzeroupper
|
||
|
leaq (%rsp),%rax
|
||
|
.cfi_def_cfa_register %rax
|
||
|
movq 0(%rax),%r15
|
||
|
.cfi_restore %r15
|
||
|
movq 8(%rax),%r14
|
||
|
.cfi_restore %r14
|
||
|
movq 16(%rax),%r13
|
||
|
.cfi_restore %r13
|
||
|
movq 24(%rax),%r12
|
||
|
.cfi_restore %r12
|
||
|
movq 32(%rax),%rbp
|
||
|
.cfi_restore %rbp
|
||
|
movq 40(%rax),%rbx
|
||
|
.cfi_restore %rbx
|
||
|
leaq 48(%rax),%rsp
|
||
|
.cfi_def_cfa %rsp,8
|
||
|
.Lossl_rsaz_amm52x40_x2_ifma256_epilogue:
|
||
|
.byte 0xf3,0xc3
|
||
|
.cfi_endproc
|
||
|
.size ossl_rsaz_amm52x40_x2_ifma256, .-ossl_rsaz_amm52x40_x2_ifma256
|
||
|
.text

# Constant-time extraction of two 40-digit (52-bit radix) multipliers from a
# win5 (2^5 = 32-entry) precomputed table.  Each table entry holds 2x40
# quadwords = 640 bytes; 32 entries -> 20480 bytes total.
#
# Presumed C signature — TODO confirm against the caller:
#   void ossl_extract_multiplier_2x40_win5(uint64_t *out,         /* %rdi */
#                                          const uint64_t *table, /* %rsi */
#                                          int idx1,              /* %rdx */
#                                          int idx2);             /* %rcx */
#
# Every table entry is read regardless of the index values, and the selected
# entry is merged in with a masked blend, so the memory-access pattern is
# independent of the (secret) indices.
.align 32
.globl ossl_extract_multiplier_2x40_win5
.type ossl_extract_multiplier_2x40_win5,@function
ossl_extract_multiplier_2x40_win5:
.cfi_startproc
.byte 243,15,30,250                     # endbr64 (CET/IBT landing pad)
vmovdqa64 .Lones(%rip),%ymm24           # {1,1,1,1}: per-iteration counter step
vpbroadcastq %rdx,%ymm22                # broadcast idx1 to all lanes
vpbroadcastq %rcx,%ymm23                # broadcast idx2 to all lanes
leaq 20480(%rsi),%rax                   # %rax = table end (table + 32*640)


movq %rsi,%r10                          # save table base for the second pass


# Zero the ten 4x64 accumulators for the first 40-digit half.
vpxor %xmm0,%xmm0,%xmm0
vmovdqa64 %ymm0,%ymm1
vmovdqa64 %ymm0,%ymm2
vmovdqa64 %ymm0,%ymm3
vmovdqa64 %ymm0,%ymm4
vmovdqa64 %ymm0,%ymm5
vmovdqa64 %ymm0,%ymm16
vmovdqa64 %ymm0,%ymm17
vmovdqa64 %ymm0,%ymm18
vmovdqa64 %ymm0,%ymm19
vpxorq %ymm21,%ymm21,%ymm21             # %ymm21 = running entry counter (0..31)
.align 32
.Lloop_0:
# k1 = all-ones iff counter == idx1; the blend below then takes the freshly
# loaded table words, otherwise the accumulators pass through unchanged.
vpcmpq $0,%ymm21,%ymm22,%k1             # predicate 0 = EQ
vmovdqu64 0(%rsi),%ymm20
vpblendmq %ymm20,%ymm0,%ymm0{%k1}
vmovdqu64 32(%rsi),%ymm20
vpblendmq %ymm20,%ymm1,%ymm1{%k1}
vmovdqu64 64(%rsi),%ymm20
vpblendmq %ymm20,%ymm2,%ymm2{%k1}
vmovdqu64 96(%rsi),%ymm20
vpblendmq %ymm20,%ymm3,%ymm3{%k1}
vmovdqu64 128(%rsi),%ymm20
vpblendmq %ymm20,%ymm4,%ymm4{%k1}
vmovdqu64 160(%rsi),%ymm20
vpblendmq %ymm20,%ymm5,%ymm5{%k1}
vmovdqu64 192(%rsi),%ymm20
vpblendmq %ymm20,%ymm16,%ymm16{%k1}
vmovdqu64 224(%rsi),%ymm20
vpblendmq %ymm20,%ymm17,%ymm17{%k1}
vmovdqu64 256(%rsi),%ymm20
vpblendmq %ymm20,%ymm18,%ymm18{%k1}
vmovdqu64 288(%rsi),%ymm20
vpblendmq %ymm20,%ymm19,%ymm19{%k1}
vpaddq %ymm24,%ymm21,%ymm21             # counter += 1 in every lane
addq $640,%rsi                          # next table entry
cmpq %rsi,%rax
jne .Lloop_0
# Store the selected first half: out[0..39] (bytes 0..319).
vmovdqu64 %ymm0,0(%rdi)
vmovdqu64 %ymm1,32(%rdi)
vmovdqu64 %ymm2,64(%rdi)
vmovdqu64 %ymm3,96(%rdi)
vmovdqu64 %ymm4,128(%rdi)
vmovdqu64 %ymm5,160(%rdi)
vmovdqu64 %ymm16,192(%rdi)
vmovdqu64 %ymm17,224(%rdi)
vmovdqu64 %ymm18,256(%rdi)
vmovdqu64 %ymm19,288(%rdi)
movq %r10,%rsi                          # rewind to table base for second pass
vpxorq %ymm21,%ymm21,%ymm21             # reset entry counter
.align 32
.Lloop_320:
# Second pass: same scan, selecting the second 40-digit half (entry bytes
# 320..639) under idx2.  Exactly one iteration's full-mask blend overwrites
# the stale accumulator contents left over from the first pass.
vpcmpq $0,%ymm21,%ymm23,%k1
vmovdqu64 320(%rsi),%ymm20
vpblendmq %ymm20,%ymm0,%ymm0{%k1}
vmovdqu64 352(%rsi),%ymm20
vpblendmq %ymm20,%ymm1,%ymm1{%k1}
vmovdqu64 384(%rsi),%ymm20
vpblendmq %ymm20,%ymm2,%ymm2{%k1}
vmovdqu64 416(%rsi),%ymm20
vpblendmq %ymm20,%ymm3,%ymm3{%k1}
vmovdqu64 448(%rsi),%ymm20
vpblendmq %ymm20,%ymm4,%ymm4{%k1}
vmovdqu64 480(%rsi),%ymm20
vpblendmq %ymm20,%ymm5,%ymm5{%k1}
vmovdqu64 512(%rsi),%ymm20
vpblendmq %ymm20,%ymm16,%ymm16{%k1}
vmovdqu64 544(%rsi),%ymm20
vpblendmq %ymm20,%ymm17,%ymm17{%k1}
vmovdqu64 576(%rsi),%ymm20
vpblendmq %ymm20,%ymm18,%ymm18{%k1}
vmovdqu64 608(%rsi),%ymm20
vpblendmq %ymm20,%ymm19,%ymm19{%k1}
vpaddq %ymm24,%ymm21,%ymm21
addq $640,%rsi
cmpq %rsi,%rax
jne .Lloop_320
# Store the selected second half: out[40..79] (bytes 320..639).
vmovdqu64 %ymm0,320(%rdi)
vmovdqu64 %ymm1,352(%rdi)
vmovdqu64 %ymm2,384(%rdi)
vmovdqu64 %ymm3,416(%rdi)
vmovdqu64 %ymm4,448(%rdi)
vmovdqu64 %ymm5,480(%rdi)
vmovdqu64 %ymm16,512(%rdi)
vmovdqu64 %ymm17,544(%rdi)
vmovdqu64 %ymm18,576(%rdi)
vmovdqu64 %ymm19,608(%rdi)

.byte 0xf3,0xc3                         # rep ret
.cfi_endproc
.size ossl_extract_multiplier_2x40_win5, .-ossl_extract_multiplier_2x40_win5
|
||
|
.data
.align 32
# {1,1,1,1}: per-lane increment for the constant-time selection counter.
.Lones:
.quad 1,1,1,1
# {0,0,0,0}: all-zero vector (used e.g. as the shift-in operand of valignq
# during carry propagation elsewhere in this file).
.Lzeros:
.quad 0,0,0,0
|
||
|
# NT_GNU_PROPERTY_TYPE_0 note marking this object as CET-compatible
# (GNU_PROPERTY_X86_FEATURE_1_AND: IBT | SHSTK).
.section ".note.gnu.property", "a"
.p2align 3
.long 1f - 0f                           # n_namesz
.long 4f - 1f                           # n_descsz
.long 5                                 # n_type = NT_GNU_PROPERTY_TYPE_0
0:
# "GNU" encoded with .byte, since .asciz isn't supported
# on Solaris.
.byte 0x47
.byte 0x4e
.byte 0x55
.byte 0
1:
.p2align 3
.long 0xc0000002                        # pr_type = GNU_PROPERTY_X86_FEATURE_1_AND
.long 3f - 2f                           # pr_datasz
2:
.long 3                                 # bit0 IBT | bit1 SHSTK
3:
.p2align 3
4: