# ---------------------------------------------------------------------------
# RSAZ AVX-512 IFMA primitives for 40x52-bit-limb operands (perlasm output).
#
# Syntax: AT&T / GAS.  ABI: System V AMD64.
# Requires AVX-512F/VL/IFMA (vpmadd52luq/vpmadd52huq on %ymm, masking, %ymm16+).
#
# NOTE(review): argument roles below are inferred from register usage in this
# file ("amm52x40" presumably = almost-Montgomery multiplication, 40 limbs of
# 52 bits, radix 2^52) — confirm against the OpenSSL C prototypes.
# ---------------------------------------------------------------------------
.text
.globl	ossl_rsaz_amm52x40_x1_ifma256
.type	ossl_rsaz_amm52x40_x1_ifma256,@function
.align	32

# ossl_rsaz_amm52x40_x1_ifma256:
#   %rdi = out (10 x 32-byte vectors = 40 limbs)
#   %rsi = a   (40 limbs, base 2^52)
#   %rdx = b   (copied to %r11; consumed one 64-bit word per inner step)
#   %rcx = modulus m (40 limbs)
#   %r8  = k0 (Montgomery factor, passed by value)
# Clobbers: %rax,%r9,%r10,%r11,%r13 etc.; callee-saved regs are pushed/restored.
ossl_rsaz_amm52x40_x1_ifma256:
.cfi_startproc
.byte	243,15,30,250			# endbr64 (f3 0f 1e fa) for CET/IBT
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
	# Zero the accumulator vectors %ymm3..%ymm12 (10 x 4 limbs = 40 limbs).
	vpxord	%ymm0,%ymm0,%ymm0
	vmovdqa64	%ymm0,%ymm3
	vmovdqa64	%ymm0,%ymm4
	vmovdqa64	%ymm0,%ymm5
	vmovdqa64	%ymm0,%ymm6
	vmovdqa64	%ymm0,%ymm7
	vmovdqa64	%ymm0,%ymm8
	vmovdqa64	%ymm0,%ymm9
	vmovdqa64	%ymm0,%ymm10
	vmovdqa64	%ymm0,%ymm11
	vmovdqa64	%ymm0,%ymm12
	xorl	%r9d,%r9d		# %r9 = scalar accumulator for limb 0
	movq	%rdx,%r11		# %r11 = b (mulxq needs %rdx as implicit src)
	movq	$0xfffffffffffff,%rax	# %rax = 2^52 - 1 limb mask
	movl	$10,%ebx		# 10 outer iterations x 4 unrolled = 40 steps
.align	32
.Lloop10:
	# --- step 1 of 4: b-word at 0(%r11) ---
	# Scalar part: acc0 += b*a[0]; y = (k0*acc0) mod 2^52; acc0 += y*m[0];
	# then shift acc0 right by 52, folding in the high product bits.
	movq	0(%r11),%r13
	vpbroadcastq	%r13,%ymm1	# %ymm1 = broadcast b-word
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10
	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13		# %r13 = y (52-bit quotient digit)
	vpbroadcastq	%r13,%ymm2	# %ymm2 = broadcast y
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10
	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9
	# Vector part: acc += b*a (low 52 bits of products) + y*m (low 52 bits).
	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm4
	vpmadd52luq	64(%rsi),%ymm1,%ymm5
	vpmadd52luq	96(%rsi),%ymm1,%ymm6
	vpmadd52luq	128(%rsi),%ymm1,%ymm7
	vpmadd52luq	160(%rsi),%ymm1,%ymm8
	vpmadd52luq	192(%rsi),%ymm1,%ymm9
	vpmadd52luq	224(%rsi),%ymm1,%ymm10
	vpmadd52luq	256(%rsi),%ymm1,%ymm11
	vpmadd52luq	288(%rsi),%ymm1,%ymm12
	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm4
	vpmadd52luq	64(%rcx),%ymm2,%ymm5
	vpmadd52luq	96(%rcx),%ymm2,%ymm6
	vpmadd52luq	128(%rcx),%ymm2,%ymm7
	vpmadd52luq	160(%rcx),%ymm2,%ymm8
	vpmadd52luq	192(%rcx),%ymm2,%ymm9
	vpmadd52luq	224(%rcx),%ymm2,%ymm10
	vpmadd52luq	256(%rcx),%ymm2,%ymm11
	vpmadd52luq	288(%rcx),%ymm2,%ymm12
	# Shift the whole 40-limb accumulator down by one limb (valignq $1
	# pulls the lowest qword of the next vector into the top position).
	valignq	$1,%ymm3,%ymm4,%ymm3
	valignq	$1,%ymm4,%ymm5,%ymm4
	valignq	$1,%ymm5,%ymm6,%ymm5
	valignq	$1,%ymm6,%ymm7,%ymm6
	valignq	$1,%ymm7,%ymm8,%ymm7
	valignq	$1,%ymm8,%ymm9,%ymm8
	valignq	$1,%ymm9,%ymm10,%ymm9
	valignq	$1,%ymm10,%ymm11,%ymm10
	valignq	$1,%ymm11,%ymm12,%ymm11
	valignq	$1,%ymm12,%ymm0,%ymm12
	vmovq	%xmm3,%r13
	addq	%r13,%r9		# fold shifted-out limb into scalar acc
	# High halves of the same products.
	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm4
	vpmadd52huq	64(%rsi),%ymm1,%ymm5
	vpmadd52huq	96(%rsi),%ymm1,%ymm6
	vpmadd52huq	128(%rsi),%ymm1,%ymm7
	vpmadd52huq	160(%rsi),%ymm1,%ymm8
	vpmadd52huq	192(%rsi),%ymm1,%ymm9
	vpmadd52huq	224(%rsi),%ymm1,%ymm10
	vpmadd52huq	256(%rsi),%ymm1,%ymm11
	vpmadd52huq	288(%rsi),%ymm1,%ymm12
	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm4
	vpmadd52huq	64(%rcx),%ymm2,%ymm5
	vpmadd52huq	96(%rcx),%ymm2,%ymm6
	vpmadd52huq	128(%rcx),%ymm2,%ymm7
	vpmadd52huq	160(%rcx),%ymm2,%ymm8
	vpmadd52huq	192(%rcx),%ymm2,%ymm9
	vpmadd52huq	224(%rcx),%ymm2,%ymm10
	vpmadd52huq	256(%rcx),%ymm2,%ymm11
	vpmadd52huq	288(%rcx),%ymm2,%ymm12
	# --- step 2 of 4: b-word at 8(%r11) (identical structure) ---
	movq	8(%r11),%r13
	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10
	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13
	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10
	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9
	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm4
	vpmadd52luq	64(%rsi),%ymm1,%ymm5
	vpmadd52luq	96(%rsi),%ymm1,%ymm6
	vpmadd52luq	128(%rsi),%ymm1,%ymm7
	vpmadd52luq	160(%rsi),%ymm1,%ymm8
	vpmadd52luq	192(%rsi),%ymm1,%ymm9
	vpmadd52luq	224(%rsi),%ymm1,%ymm10
	vpmadd52luq	256(%rsi),%ymm1,%ymm11
	vpmadd52luq	288(%rsi),%ymm1,%ymm12
	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm4
	vpmadd52luq	64(%rcx),%ymm2,%ymm5
	vpmadd52luq	96(%rcx),%ymm2,%ymm6
	vpmadd52luq	128(%rcx),%ymm2,%ymm7
	vpmadd52luq	160(%rcx),%ymm2,%ymm8
	vpmadd52luq	192(%rcx),%ymm2,%ymm9
	vpmadd52luq	224(%rcx),%ymm2,%ymm10
	vpmadd52luq	256(%rcx),%ymm2,%ymm11
	vpmadd52luq	288(%rcx),%ymm2,%ymm12
	valignq	$1,%ymm3,%ymm4,%ymm3
	valignq	$1,%ymm4,%ymm5,%ymm4
	valignq	$1,%ymm5,%ymm6,%ymm5
	valignq	$1,%ymm6,%ymm7,%ymm6
	valignq	$1,%ymm7,%ymm8,%ymm7
	valignq	$1,%ymm8,%ymm9,%ymm8
	valignq	$1,%ymm9,%ymm10,%ymm9
	valignq	$1,%ymm10,%ymm11,%ymm10
	valignq	$1,%ymm11,%ymm12,%ymm11
	valignq	$1,%ymm12,%ymm0,%ymm12
	vmovq	%xmm3,%r13
	addq	%r13,%r9
	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm4
	vpmadd52huq	64(%rsi),%ymm1,%ymm5
	vpmadd52huq	96(%rsi),%ymm1,%ymm6
	vpmadd52huq	128(%rsi),%ymm1,%ymm7
	vpmadd52huq	160(%rsi),%ymm1,%ymm8
	vpmadd52huq	192(%rsi),%ymm1,%ymm9
	vpmadd52huq	224(%rsi),%ymm1,%ymm10
	vpmadd52huq	256(%rsi),%ymm1,%ymm11
	vpmadd52huq	288(%rsi),%ymm1,%ymm12
	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm4
	vpmadd52huq	64(%rcx),%ymm2,%ymm5
	vpmadd52huq	96(%rcx),%ymm2,%ymm6
	vpmadd52huq	128(%rcx),%ymm2,%ymm7
	vpmadd52huq	160(%rcx),%ymm2,%ymm8
	vpmadd52huq	192(%rcx),%ymm2,%ymm9
	vpmadd52huq	224(%rcx),%ymm2,%ymm10
	vpmadd52huq	256(%rcx),%ymm2,%ymm11
	vpmadd52huq	288(%rcx),%ymm2,%ymm12
	# --- step 3 of 4: b-word at 16(%r11) ---
	movq	16(%r11),%r13
	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10
	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13
	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10
	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9
	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm4
	vpmadd52luq	64(%rsi),%ymm1,%ymm5
	vpmadd52luq	96(%rsi),%ymm1,%ymm6
	vpmadd52luq	128(%rsi),%ymm1,%ymm7
	vpmadd52luq	160(%rsi),%ymm1,%ymm8
	vpmadd52luq	192(%rsi),%ymm1,%ymm9
	vpmadd52luq	224(%rsi),%ymm1,%ymm10
	vpmadd52luq	256(%rsi),%ymm1,%ymm11
	vpmadd52luq	288(%rsi),%ymm1,%ymm12
	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm4
	vpmadd52luq	64(%rcx),%ymm2,%ymm5
	vpmadd52luq	96(%rcx),%ymm2,%ymm6
	vpmadd52luq	128(%rcx),%ymm2,%ymm7
	vpmadd52luq	160(%rcx),%ymm2,%ymm8
	vpmadd52luq	192(%rcx),%ymm2,%ymm9
	vpmadd52luq	224(%rcx),%ymm2,%ymm10
	vpmadd52luq	256(%rcx),%ymm2,%ymm11
	vpmadd52luq	288(%rcx),%ymm2,%ymm12
	valignq	$1,%ymm3,%ymm4,%ymm3
	valignq	$1,%ymm4,%ymm5,%ymm4
	valignq	$1,%ymm5,%ymm6,%ymm5
	valignq	$1,%ymm6,%ymm7,%ymm6
	valignq	$1,%ymm7,%ymm8,%ymm7
	valignq	$1,%ymm8,%ymm9,%ymm8
	valignq	$1,%ymm9,%ymm10,%ymm9
	valignq	$1,%ymm10,%ymm11,%ymm10
	valignq	$1,%ymm11,%ymm12,%ymm11
	valignq	$1,%ymm12,%ymm0,%ymm12
	vmovq	%xmm3,%r13
	addq	%r13,%r9
	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm4
	vpmadd52huq	64(%rsi),%ymm1,%ymm5
	vpmadd52huq	96(%rsi),%ymm1,%ymm6
	vpmadd52huq	128(%rsi),%ymm1,%ymm7
	vpmadd52huq	160(%rsi),%ymm1,%ymm8
	vpmadd52huq	192(%rsi),%ymm1,%ymm9
	vpmadd52huq	224(%rsi),%ymm1,%ymm10
	vpmadd52huq	256(%rsi),%ymm1,%ymm11
	vpmadd52huq	288(%rsi),%ymm1,%ymm12
	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm4
	vpmadd52huq	64(%rcx),%ymm2,%ymm5
	vpmadd52huq	96(%rcx),%ymm2,%ymm6
	vpmadd52huq	128(%rcx),%ymm2,%ymm7
	vpmadd52huq	160(%rcx),%ymm2,%ymm8
	vpmadd52huq	192(%rcx),%ymm2,%ymm9
	vpmadd52huq	224(%rcx),%ymm2,%ymm10
	vpmadd52huq	256(%rcx),%ymm2,%ymm11
	vpmadd52huq	288(%rcx),%ymm2,%ymm12
	# --- step 4 of 4: b-word at 24(%r11) ---
	movq	24(%r11),%r13
	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10
	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13
	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10
	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9
	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm4
	vpmadd52luq	64(%rsi),%ymm1,%ymm5
	vpmadd52luq	96(%rsi),%ymm1,%ymm6
	vpmadd52luq	128(%rsi),%ymm1,%ymm7
	vpmadd52luq	160(%rsi),%ymm1,%ymm8
	vpmadd52luq	192(%rsi),%ymm1,%ymm9
	vpmadd52luq	224(%rsi),%ymm1,%ymm10
	vpmadd52luq	256(%rsi),%ymm1,%ymm11
	vpmadd52luq	288(%rsi),%ymm1,%ymm12
	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm4
	vpmadd52luq	64(%rcx),%ymm2,%ymm5
	vpmadd52luq	96(%rcx),%ymm2,%ymm6
	vpmadd52luq	128(%rcx),%ymm2,%ymm7
	vpmadd52luq	160(%rcx),%ymm2,%ymm8
	vpmadd52luq	192(%rcx),%ymm2,%ymm9
	vpmadd52luq	224(%rcx),%ymm2,%ymm10
	vpmadd52luq	256(%rcx),%ymm2,%ymm11
	vpmadd52luq	288(%rcx),%ymm2,%ymm12
	valignq	$1,%ymm3,%ymm4,%ymm3
	valignq	$1,%ymm4,%ymm5,%ymm4
	valignq	$1,%ymm5,%ymm6,%ymm5
	valignq	$1,%ymm6,%ymm7,%ymm6
	valignq	$1,%ymm7,%ymm8,%ymm7
	valignq	$1,%ymm8,%ymm9,%ymm8
	valignq	$1,%ymm9,%ymm10,%ymm9
	valignq	$1,%ymm10,%ymm11,%ymm10
	valignq	$1,%ymm11,%ymm12,%ymm11
	valignq	$1,%ymm12,%ymm0,%ymm12
	vmovq	%xmm3,%r13
	addq	%r13,%r9
	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm4
	vpmadd52huq	64(%rsi),%ymm1,%ymm5
	vpmadd52huq	96(%rsi),%ymm1,%ymm6
	vpmadd52huq	128(%rsi),%ymm1,%ymm7
	vpmadd52huq	160(%rsi),%ymm1,%ymm8
	vpmadd52huq	192(%rsi),%ymm1,%ymm9
	vpmadd52huq	224(%rsi),%ymm1,%ymm10
	vpmadd52huq	256(%rsi),%ymm1,%ymm11
	vpmadd52huq	288(%rsi),%ymm1,%ymm12
	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm4
	vpmadd52huq	64(%rcx),%ymm2,%ymm5
	vpmadd52huq	96(%rcx),%ymm2,%ymm6
	vpmadd52huq	128(%rcx),%ymm2,%ymm7
	vpmadd52huq	160(%rcx),%ymm2,%ymm8
	vpmadd52huq	192(%rcx),%ymm2,%ymm9
	vpmadd52huq	224(%rcx),%ymm2,%ymm10
	vpmadd52huq	256(%rcx),%ymm2,%ymm11
	vpmadd52huq	288(%rcx),%ymm2,%ymm12
	leaq	32(%r11),%r11		# advance b by 4 words
	decl	%ebx
	jne	.Lloop10
	# Normalization: re-insert scalar acc into limb 0, then propagate the
	# >52-bit overflow of each limb into the next limb position.
	vpbroadcastq	%r9,%ymm0
	vpblendd	$3,%ymm0,%ymm3,%ymm3
	vpsrlq	$52,%ymm3,%ymm0
	vpsrlq	$52,%ymm4,%ymm1
	vpsrlq	$52,%ymm5,%ymm2
	vpsrlq	$52,%ymm6,%ymm23
	vpsrlq	$52,%ymm7,%ymm24
	vpsrlq	$52,%ymm8,%ymm25
	vpsrlq	$52,%ymm9,%ymm26
	vpsrlq	$52,%ymm10,%ymm27
	vpsrlq	$52,%ymm11,%ymm28
	vpsrlq	$52,%ymm12,%ymm29
	# Shift the carry vectors up by one limb (valignq $3).
	valignq	$3,%ymm28,%ymm29,%ymm29
	valignq	$3,%ymm27,%ymm28,%ymm28
	valignq	$3,%ymm26,%ymm27,%ymm27
	valignq	$3,%ymm25,%ymm26,%ymm26
	valignq	$3,%ymm24,%ymm25,%ymm25
	valignq	$3,%ymm23,%ymm24,%ymm24
	valignq	$3,%ymm2,%ymm23,%ymm23
	valignq	$3,%ymm1,%ymm2,%ymm2
	valignq	$3,%ymm0,%ymm1,%ymm1
	valignq	$3,.Lzeros(%rip),%ymm0,%ymm0
	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
	vpandq	.Lmask52x4(%rip),%ymm4,%ymm4
	vpandq	.Lmask52x4(%rip),%ymm5,%ymm5
	vpandq	.Lmask52x4(%rip),%ymm6,%ymm6
	vpandq	.Lmask52x4(%rip),%ymm7,%ymm7
	vpandq	.Lmask52x4(%rip),%ymm8,%ymm8
	vpandq	.Lmask52x4(%rip),%ymm9,%ymm9
	vpandq	.Lmask52x4(%rip),%ymm10,%ymm10
	vpandq	.Lmask52x4(%rip),%ymm11,%ymm11
	vpandq	.Lmask52x4(%rip),%ymm12,%ymm12
	vpaddq	%ymm0,%ymm3,%ymm3
	vpaddq	%ymm1,%ymm4,%ymm4
	vpaddq	%ymm2,%ymm5,%ymm5
	vpaddq	%ymm23,%ymm6,%ymm6
	vpaddq	%ymm24,%ymm7,%ymm7
	vpaddq	%ymm25,%ymm8,%ymm8
	vpaddq	%ymm26,%ymm9,%ymm9
	vpaddq	%ymm27,%ymm10,%ymm10
	vpaddq	%ymm28,%ymm11,%ymm11
	vpaddq	%ymm29,%ymm12,%ymm12
	# Branch-free carry propagation across limbs: build a bit per limb for
	# "limb > mask" (vpcmpuq $6) and "limb == mask" (vpcmpuq $0), then ripple
	# carries through the bytes with addb/adcb and apply the resulting
	# correction masks with masked vpsubq — constant time by construction.
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm3,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm4,%k2
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	shlb	$4,%r13b
	orb	%r13b,%r14b
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm5,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm6,%k2
	kmovb	%k1,%r13d
	kmovb	%k2,%r12d
	shlb	$4,%r12b
	orb	%r12b,%r13b
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm7,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm8,%k2
	kmovb	%k1,%r12d
	kmovb	%k2,%r11d
	shlb	$4,%r11b
	orb	%r11b,%r12b
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm9,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm10,%k2
	kmovb	%k1,%r11d
	kmovb	%k2,%r10d
	shlb	$4,%r10b
	orb	%r10b,%r11b
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm11,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm12,%k2
	kmovb	%k1,%r10d
	kmovb	%k2,%r9d
	shlb	$4,%r9b
	orb	%r9b,%r10b
	addb	%r14b,%r14b
	adcb	%r13b,%r13b
	adcb	%r12b,%r12b
	adcb	%r11b,%r11b
	adcb	%r10b,%r10b
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm3,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm4,%k2
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	shlb	$4,%r8b
	orb	%r8b,%r9b
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm5,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm6,%k2
	kmovb	%k1,%r8d
	kmovb	%k2,%edx
	shlb	$4,%dl
	orb	%dl,%r8b
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm7,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm8,%k2
	kmovb	%k1,%edx
	kmovb	%k2,%ecx
	shlb	$4,%cl
	orb	%cl,%dl
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm9,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm10,%k2
	kmovb	%k1,%ecx
	kmovb	%k2,%ebx
	shlb	$4,%bl
	orb	%bl,%cl
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm11,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm12,%k2
	kmovb	%k1,%ebx
	kmovb	%k2,%eax
	shlb	$4,%al
	orb	%al,%bl
	addb	%r9b,%r14b
	adcb	%r8b,%r13b
	adcb	%dl,%r12b
	adcb	%cl,%r11b
	adcb	%bl,%r10b
	xorb	%r9b,%r14b
	xorb	%r8b,%r13b
	xorb	%dl,%r12b
	xorb	%cl,%r11b
	xorb	%bl,%r10b
	kmovb	%r14d,%k1
	shrb	$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r13d,%k3
	shrb	$4,%r13b
	kmovb	%r13d,%k4
	kmovb	%r12d,%k5
	shrb	$4,%r12b
	kmovb	%r12d,%k6
	kmovb	%r11d,%k7
	vpsubq	.Lmask52x4(%rip),%ymm3,%ymm3{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm4,%ymm4{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm5,%ymm5{%k3}
	vpsubq	.Lmask52x4(%rip),%ymm6,%ymm6{%k4}
	vpsubq	.Lmask52x4(%rip),%ymm7,%ymm7{%k5}
	vpsubq	.Lmask52x4(%rip),%ymm8,%ymm8{%k6}
	vpsubq	.Lmask52x4(%rip),%ymm9,%ymm9{%k7}
	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
	vpandq	.Lmask52x4(%rip),%ymm4,%ymm4
	vpandq	.Lmask52x4(%rip),%ymm5,%ymm5
	vpandq	.Lmask52x4(%rip),%ymm6,%ymm6
	vpandq	.Lmask52x4(%rip),%ymm7,%ymm7
	vpandq	.Lmask52x4(%rip),%ymm8,%ymm8
	vpandq	.Lmask52x4(%rip),%ymm9,%ymm9
	shrb	$4,%r11b
	kmovb	%r11d,%k1
	kmovb	%r10d,%k2
	shrb	$4,%r10b
	kmovb	%r10d,%k3
	vpsubq	.Lmask52x4(%rip),%ymm10,%ymm10{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm11,%ymm11{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm12,%ymm12{%k3}
	vpandq	.Lmask52x4(%rip),%ymm10,%ymm10
	vpandq	.Lmask52x4(%rip),%ymm11,%ymm11
	vpandq	.Lmask52x4(%rip),%ymm12,%ymm12
	# Store the 40-limb result.
	vmovdqu64	%ymm3,0(%rdi)
	vmovdqu64	%ymm4,32(%rdi)
	vmovdqu64	%ymm5,64(%rdi)
	vmovdqu64	%ymm6,96(%rdi)
	vmovdqu64	%ymm7,128(%rdi)
	vmovdqu64	%ymm8,160(%rdi)
	vmovdqu64	%ymm9,192(%rdi)
	vmovdqu64	%ymm10,224(%rdi)
	vmovdqu64	%ymm11,256(%rdi)
	vmovdqu64	%ymm12,288(%rdi)
	vzeroupper
	leaq	(%rsp),%rax
.cfi_def_cfa_register	%rax
	movq	0(%rax),%r15
.cfi_restore	%r15
	movq	8(%rax),%r14
.cfi_restore	%r14
	movq	16(%rax),%r13
.cfi_restore	%r13
	movq	24(%rax),%r12
.cfi_restore	%r12
	movq	32(%rax),%rbp
.cfi_restore	%rbp
	movq	40(%rax),%rbx
.cfi_restore	%rbx
	leaq	48(%rax),%rsp
.cfi_def_cfa	%rsp,8
.Lossl_rsaz_amm52x40_x1_ifma256_epilogue:
.byte	0xf3,0xc3			# rep ret
.cfi_endproc
.size	ossl_rsaz_amm52x40_x1_ifma256, .-ossl_rsaz_amm52x40_x1_ifma256
.data
.align	32
# Four copies of the 52-bit limb mask (one per qword lane).
.Lmask52x4:
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.text
.globl	ossl_rsaz_amm52x40_x2_ifma256
.type	ossl_rsaz_amm52x40_x2_ifma256,@function
.align	32

# ossl_rsaz_amm52x40_x2_ifma256: two independent 40-limb multiplications
# interleaved (lanes at byte offsets 0.. and 320.. of each operand array).
#   %rdi = out (2 x 40 limbs)   %rsi = a   %rdx = b   %rcx = m
#   %r8  = pointer to two k0 values (note: dereferenced here, unlike x1).
ossl_rsaz_amm52x40_x2_ifma256:
.cfi_startproc
.byte	243,15,30,250			# endbr64
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
	# Accumulators: %ymm3-%ymm12 for instance 0, %ymm13-%ymm22 for instance 1.
	vpxord	%ymm0,%ymm0,%ymm0
	vmovdqa64	%ymm0,%ymm3
	vmovdqa64	%ymm0,%ymm4
	vmovdqa64	%ymm0,%ymm5
	vmovdqa64	%ymm0,%ymm6
	vmovdqa64	%ymm0,%ymm7
	vmovdqa64	%ymm0,%ymm8
	vmovdqa64	%ymm0,%ymm9
	vmovdqa64	%ymm0,%ymm10
	vmovdqa64	%ymm0,%ymm11
	vmovdqa64	%ymm0,%ymm12
	vmovdqa64	%ymm0,%ymm13
	vmovdqa64	%ymm0,%ymm14
	vmovdqa64	%ymm0,%ymm15
	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm0,%ymm17
	vmovdqa64	%ymm0,%ymm18
	vmovdqa64	%ymm0,%ymm19
	vmovdqa64	%ymm0,%ymm20
	vmovdqa64	%ymm0,%ymm21
	vmovdqa64	%ymm0,%ymm22
	xorl	%r9d,%r9d		# scalar acc, instance 0
	xorl	%r15d,%r15d		# scalar acc, instance 1
	movq	%rdx,%r11
	movq	$0xfffffffffffff,%rax
	movl	$40,%ebx		# 40 iterations, one b-word each, no unroll
.align	32
.Lloop40:
	# Instance 0: same per-word step as the x1 kernel (k0 loaded from (%r8)).
	movq	0(%r11),%r13
	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10
	movq	(%r8),%r13
	imulq	%r9,%r13
	andq	%rax,%r13
	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10
	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9
	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm4
	vpmadd52luq	64(%rsi),%ymm1,%ymm5
	vpmadd52luq	96(%rsi),%ymm1,%ymm6
	vpmadd52luq	128(%rsi),%ymm1,%ymm7
	vpmadd52luq	160(%rsi),%ymm1,%ymm8
	vpmadd52luq	192(%rsi),%ymm1,%ymm9
	vpmadd52luq	224(%rsi),%ymm1,%ymm10
	vpmadd52luq	256(%rsi),%ymm1,%ymm11
	vpmadd52luq	288(%rsi),%ymm1,%ymm12
	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm4
	vpmadd52luq	64(%rcx),%ymm2,%ymm5
	vpmadd52luq	96(%rcx),%ymm2,%ymm6
	vpmadd52luq	128(%rcx),%ymm2,%ymm7
	vpmadd52luq	160(%rcx),%ymm2,%ymm8
	vpmadd52luq	192(%rcx),%ymm2,%ymm9
	vpmadd52luq	224(%rcx),%ymm2,%ymm10
	vpmadd52luq	256(%rcx),%ymm2,%ymm11
	vpmadd52luq	288(%rcx),%ymm2,%ymm12
	valignq	$1,%ymm3,%ymm4,%ymm3
	valignq	$1,%ymm4,%ymm5,%ymm4
	valignq	$1,%ymm5,%ymm6,%ymm5
	valignq	$1,%ymm6,%ymm7,%ymm6
	valignq	$1,%ymm7,%ymm8,%ymm7
	valignq	$1,%ymm8,%ymm9,%ymm8
	valignq	$1,%ymm9,%ymm10,%ymm9
	valignq	$1,%ymm10,%ymm11,%ymm10
	valignq	$1,%ymm11,%ymm12,%ymm11
	valignq	$1,%ymm12,%ymm0,%ymm12
	vmovq	%xmm3,%r13
	addq	%r13,%r9
	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm4
	vpmadd52huq	64(%rsi),%ymm1,%ymm5
	vpmadd52huq	96(%rsi),%ymm1,%ymm6
	vpmadd52huq	128(%rsi),%ymm1,%ymm7
	vpmadd52huq	160(%rsi),%ymm1,%ymm8
	vpmadd52huq	192(%rsi),%ymm1,%ymm9
	vpmadd52huq	224(%rsi),%ymm1,%ymm10
	vpmadd52huq	256(%rsi),%ymm1,%ymm11
	vpmadd52huq	288(%rsi),%ymm1,%ymm12
	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm4
	vpmadd52huq	64(%rcx),%ymm2,%ymm5
	vpmadd52huq	96(%rcx),%ymm2,%ymm6
	vpmadd52huq	128(%rcx),%ymm2,%ymm7
	vpmadd52huq	160(%rcx),%ymm2,%ymm8
	vpmadd52huq	192(%rcx),%ymm2,%ymm9
	vpmadd52huq	224(%rcx),%ymm2,%ymm10
	vpmadd52huq	256(%rcx),%ymm2,%ymm11
	vpmadd52huq	288(%rcx),%ymm2,%ymm12
	# Instance 1: operands at +320 bytes, scalar acc %r15, k0 from 8(%r8).
	movq	320(%r11),%r13
	vpbroadcastq	%r13,%ymm1
	movq	320(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r15
	movq	%r12,%r10
	adcq	$0,%r10
	movq	8(%r8),%r13
	imulq	%r15,%r13
	andq	%rax,%r13
	vpbroadcastq	%r13,%ymm2
	movq	320(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r15
	adcq	%r12,%r10
	shrq	$52,%r15
	salq	$12,%r10
	orq	%r10,%r15
	vpmadd52luq	320(%rsi),%ymm1,%ymm13
	vpmadd52luq	352(%rsi),%ymm1,%ymm14
	vpmadd52luq	384(%rsi),%ymm1,%ymm15
	vpmadd52luq	416(%rsi),%ymm1,%ymm16
	vpmadd52luq	448(%rsi),%ymm1,%ymm17
	vpmadd52luq	480(%rsi),%ymm1,%ymm18
	vpmadd52luq	512(%rsi),%ymm1,%ymm19
	vpmadd52luq	544(%rsi),%ymm1,%ymm20
	vpmadd52luq	576(%rsi),%ymm1,%ymm21
	vpmadd52luq	608(%rsi),%ymm1,%ymm22
	vpmadd52luq	320(%rcx),%ymm2,%ymm13
	vpmadd52luq	352(%rcx),%ymm2,%ymm14
	vpmadd52luq	384(%rcx),%ymm2,%ymm15
	vpmadd52luq	416(%rcx),%ymm2,%ymm16
	vpmadd52luq	448(%rcx),%ymm2,%ymm17
	vpmadd52luq	480(%rcx),%ymm2,%ymm18
	vpmadd52luq	512(%rcx),%ymm2,%ymm19
	vpmadd52luq	544(%rcx),%ymm2,%ymm20
	vpmadd52luq	576(%rcx),%ymm2,%ymm21
	vpmadd52luq	608(%rcx),%ymm2,%ymm22
	valignq	$1,%ymm13,%ymm14,%ymm13
	valignq	$1,%ymm14,%ymm15,%ymm14
	valignq	$1,%ymm15,%ymm16,%ymm15
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm20,%ymm19
	valignq	$1,%ymm20,%ymm21,%ymm20
	valignq	$1,%ymm21,%ymm22,%ymm21
	valignq	$1,%ymm22,%ymm0,%ymm22
	vmovq	%xmm13,%r13
	addq	%r13,%r15
	vpmadd52huq	320(%rsi),%ymm1,%ymm13
	vpmadd52huq	352(%rsi),%ymm1,%ymm14
	vpmadd52huq	384(%rsi),%ymm1,%ymm15
	vpmadd52huq	416(%rsi),%ymm1,%ymm16
	vpmadd52huq	448(%rsi),%ymm1,%ymm17
	vpmadd52huq	480(%rsi),%ymm1,%ymm18
	vpmadd52huq	512(%rsi),%ymm1,%ymm19
	vpmadd52huq	544(%rsi),%ymm1,%ymm20
	vpmadd52huq	576(%rsi),%ymm1,%ymm21
	vpmadd52huq	608(%rsi),%ymm1,%ymm22
	vpmadd52huq	320(%rcx),%ymm2,%ymm13
	vpmadd52huq	352(%rcx),%ymm2,%ymm14
	vpmadd52huq	384(%rcx),%ymm2,%ymm15
	vpmadd52huq	416(%rcx),%ymm2,%ymm16
	vpmadd52huq	448(%rcx),%ymm2,%ymm17
	vpmadd52huq	480(%rcx),%ymm2,%ymm18
	vpmadd52huq	512(%rcx),%ymm2,%ymm19
	vpmadd52huq	544(%rcx),%ymm2,%ymm20
	vpmadd52huq	576(%rcx),%ymm2,%ymm21
	vpmadd52huq	608(%rcx),%ymm2,%ymm22
	leaq	8(%r11),%r11		# next b word
	decl	%ebx
	jne	.Lloop40
	# Normalization + carry propagation for instance 0 (same scheme as x1).
	vpbroadcastq	%r9,%ymm0
	vpblendd	$3,%ymm0,%ymm3,%ymm3
	vpsrlq	$52,%ymm3,%ymm0
	vpsrlq	$52,%ymm4,%ymm1
	vpsrlq	$52,%ymm5,%ymm2
	vpsrlq	$52,%ymm6,%ymm23
	vpsrlq	$52,%ymm7,%ymm24
	vpsrlq	$52,%ymm8,%ymm25
	vpsrlq	$52,%ymm9,%ymm26
	vpsrlq	$52,%ymm10,%ymm27
	vpsrlq	$52,%ymm11,%ymm28
	vpsrlq	$52,%ymm12,%ymm29
	valignq	$3,%ymm28,%ymm29,%ymm29
	valignq	$3,%ymm27,%ymm28,%ymm28
	valignq	$3,%ymm26,%ymm27,%ymm27
	valignq	$3,%ymm25,%ymm26,%ymm26
	valignq	$3,%ymm24,%ymm25,%ymm25
	valignq	$3,%ymm23,%ymm24,%ymm24
	valignq	$3,%ymm2,%ymm23,%ymm23
	valignq	$3,%ymm1,%ymm2,%ymm2
	valignq	$3,%ymm0,%ymm1,%ymm1
	valignq	$3,.Lzeros(%rip),%ymm0,%ymm0
	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
	vpandq	.Lmask52x4(%rip),%ymm4,%ymm4
	vpandq	.Lmask52x4(%rip),%ymm5,%ymm5
	vpandq	.Lmask52x4(%rip),%ymm6,%ymm6
	vpandq	.Lmask52x4(%rip),%ymm7,%ymm7
	vpandq	.Lmask52x4(%rip),%ymm8,%ymm8
	vpandq	.Lmask52x4(%rip),%ymm9,%ymm9
	vpandq	.Lmask52x4(%rip),%ymm10,%ymm10
	vpandq	.Lmask52x4(%rip),%ymm11,%ymm11
	vpandq	.Lmask52x4(%rip),%ymm12,%ymm12
	vpaddq	%ymm0,%ymm3,%ymm3
	vpaddq	%ymm1,%ymm4,%ymm4
	vpaddq	%ymm2,%ymm5,%ymm5
	vpaddq	%ymm23,%ymm6,%ymm6
	vpaddq	%ymm24,%ymm7,%ymm7
	vpaddq	%ymm25,%ymm8,%ymm8
	vpaddq	%ymm26,%ymm9,%ymm9
	vpaddq	%ymm27,%ymm10,%ymm10
	vpaddq	%ymm28,%ymm11,%ymm11
	vpaddq	%ymm29,%ymm12,%ymm12
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm3,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm4,%k2
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	shlb	$4,%r13b
	orb	%r13b,%r14b
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm5,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm6,%k2
	kmovb	%k1,%r13d
	kmovb	%k2,%r12d
	shlb	$4,%r12b
	orb	%r12b,%r13b
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm7,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm8,%k2
	kmovb	%k1,%r12d
	kmovb	%k2,%r11d
	shlb	$4,%r11b
	orb	%r11b,%r12b
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm9,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm10,%k2
	kmovb	%k1,%r11d
	kmovb	%k2,%r10d
	shlb	$4,%r10b
	orb	%r10b,%r11b
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm11,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm12,%k2
	kmovb	%k1,%r10d
	kmovb	%k2,%r9d
	shlb	$4,%r9b
	orb	%r9b,%r10b
	addb	%r14b,%r14b
	adcb	%r13b,%r13b
	adcb	%r12b,%r12b
	adcb	%r11b,%r11b
	adcb	%r10b,%r10b
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm3,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm4,%k2
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	shlb	$4,%r8b
	orb	%r8b,%r9b
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm5,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm6,%k2
	kmovb	%k1,%r8d
	kmovb	%k2,%edx
	shlb	$4,%dl
	orb	%dl,%r8b
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm7,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm8,%k2
	kmovb	%k1,%edx
	kmovb	%k2,%ecx
	shlb	$4,%cl
	orb	%cl,%dl
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm9,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm10,%k2
	kmovb	%k1,%ecx
	kmovb	%k2,%ebx
	shlb	$4,%bl
	orb	%bl,%cl
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm11,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm12,%k2
	kmovb	%k1,%ebx
	kmovb	%k2,%eax
	shlb	$4,%al
	orb	%al,%bl
	addb	%r9b,%r14b
	adcb	%r8b,%r13b
	adcb	%dl,%r12b
	adcb	%cl,%r11b
	adcb	%bl,%r10b
	xorb	%r9b,%r14b
	xorb	%r8b,%r13b
	xorb	%dl,%r12b
	xorb	%cl,%r11b
	xorb	%bl,%r10b
	kmovb	%r14d,%k1
	shrb	$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r13d,%k3
	shrb	$4,%r13b
	kmovb	%r13d,%k4
	kmovb	%r12d,%k5
	shrb	$4,%r12b
	kmovb	%r12d,%k6
	kmovb	%r11d,%k7
	vpsubq	.Lmask52x4(%rip),%ymm3,%ymm3{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm4,%ymm4{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm5,%ymm5{%k3}
	vpsubq	.Lmask52x4(%rip),%ymm6,%ymm6{%k4}
	vpsubq	.Lmask52x4(%rip),%ymm7,%ymm7{%k5}
	vpsubq	.Lmask52x4(%rip),%ymm8,%ymm8{%k6}
	vpsubq	.Lmask52x4(%rip),%ymm9,%ymm9{%k7}
	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
	vpandq	.Lmask52x4(%rip),%ymm4,%ymm4
	vpandq	.Lmask52x4(%rip),%ymm5,%ymm5
	vpandq	.Lmask52x4(%rip),%ymm6,%ymm6
	vpandq	.Lmask52x4(%rip),%ymm7,%ymm7
	vpandq	.Lmask52x4(%rip),%ymm8,%ymm8
	vpandq	.Lmask52x4(%rip),%ymm9,%ymm9
	shrb	$4,%r11b
	kmovb	%r11d,%k1
	kmovb	%r10d,%k2
	shrb	$4,%r10b
	kmovb	%r10d,%k3
	vpsubq	.Lmask52x4(%rip),%ymm10,%ymm10{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm11,%ymm11{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm12,%ymm12{%k3}
	vpandq	.Lmask52x4(%rip),%ymm10,%ymm10
	vpandq	.Lmask52x4(%rip),%ymm11,%ymm11
	vpandq	.Lmask52x4(%rip),%ymm12,%ymm12
	# Normalization + carry propagation for instance 1 (%ymm13-%ymm22, %r15).
	vpbroadcastq	%r15,%ymm0
	vpblendd	$3,%ymm0,%ymm13,%ymm13
	vpsrlq	$52,%ymm13,%ymm0
	vpsrlq	$52,%ymm14,%ymm1
	vpsrlq	$52,%ymm15,%ymm2
	vpsrlq	$52,%ymm16,%ymm23
	vpsrlq	$52,%ymm17,%ymm24
	vpsrlq	$52,%ymm18,%ymm25
	vpsrlq	$52,%ymm19,%ymm26
	vpsrlq	$52,%ymm20,%ymm27
	vpsrlq	$52,%ymm21,%ymm28
	vpsrlq	$52,%ymm22,%ymm29
	valignq	$3,%ymm28,%ymm29,%ymm29
	valignq	$3,%ymm27,%ymm28,%ymm28
	valignq	$3,%ymm26,%ymm27,%ymm27
	valignq	$3,%ymm25,%ymm26,%ymm26
	valignq	$3,%ymm24,%ymm25,%ymm25
	valignq	$3,%ymm23,%ymm24,%ymm24
	valignq	$3,%ymm2,%ymm23,%ymm23
	valignq	$3,%ymm1,%ymm2,%ymm2
	valignq	$3,%ymm0,%ymm1,%ymm1
	valignq	$3,.Lzeros(%rip),%ymm0,%ymm0
	vpandq	.Lmask52x4(%rip),%ymm13,%ymm13
	vpandq	.Lmask52x4(%rip),%ymm14,%ymm14
	vpandq	.Lmask52x4(%rip),%ymm15,%ymm15
	vpandq	.Lmask52x4(%rip),%ymm16,%ymm16
	vpandq	.Lmask52x4(%rip),%ymm17,%ymm17
	vpandq	.Lmask52x4(%rip),%ymm18,%ymm18
	vpandq	.Lmask52x4(%rip),%ymm19,%ymm19
	vpandq	.Lmask52x4(%rip),%ymm20,%ymm20
	vpandq	.Lmask52x4(%rip),%ymm21,%ymm21
	vpandq	.Lmask52x4(%rip),%ymm22,%ymm22
	vpaddq	%ymm0,%ymm13,%ymm13
	vpaddq	%ymm1,%ymm14,%ymm14
	vpaddq	%ymm2,%ymm15,%ymm15
	vpaddq	%ymm23,%ymm16,%ymm16
	vpaddq	%ymm24,%ymm17,%ymm17
	vpaddq	%ymm25,%ymm18,%ymm18
	vpaddq	%ymm26,%ymm19,%ymm19
	vpaddq	%ymm27,%ymm20,%ymm20
	vpaddq	%ymm28,%ymm21,%ymm21
	vpaddq	%ymm29,%ymm22,%ymm22
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm13,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm14,%k2
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	shlb	$4,%r13b
	orb	%r13b,%r14b
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm15,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm16,%k2
	kmovb	%k1,%r13d
	kmovb	%k2,%r12d
	shlb	$4,%r12b
	orb	%r12b,%r13b
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm17,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm18,%k2
	kmovb	%k1,%r12d
	kmovb	%k2,%r11d
	shlb	$4,%r11b
	orb	%r11b,%r12b
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm19,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm20,%k2
	kmovb	%k1,%r11d
	kmovb	%k2,%r10d
	shlb	$4,%r10b
	orb	%r10b,%r11b
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm21,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm22,%k2
	kmovb	%k1,%r10d
	kmovb	%k2,%r9d
	shlb	$4,%r9b
	orb	%r9b,%r10b
	addb	%r14b,%r14b
	adcb	%r13b,%r13b
	adcb	%r12b,%r12b
	adcb	%r11b,%r11b
	adcb	%r10b,%r10b
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm13,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm14,%k2
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	shlb	$4,%r8b
	orb	%r8b,%r9b
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm15,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm16,%k2
	kmovb	%k1,%r8d
	kmovb	%k2,%edx
	shlb	$4,%dl
	orb	%dl,%r8b
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm17,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm18,%k2
	kmovb	%k1,%edx
	kmovb	%k2,%ecx
	shlb	$4,%cl
	orb	%cl,%dl
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm19,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm20,%k2
	kmovb	%k1,%ecx
	kmovb	%k2,%ebx
	shlb	$4,%bl
	orb	%bl,%cl
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm21,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm22,%k2
	kmovb	%k1,%ebx
	kmovb	%k2,%eax
	shlb	$4,%al
	orb	%al,%bl
	addb	%r9b,%r14b
	adcb	%r8b,%r13b
	adcb	%dl,%r12b
	adcb	%cl,%r11b
	adcb	%bl,%r10b
	xorb	%r9b,%r14b
	xorb	%r8b,%r13b
	xorb	%dl,%r12b
	xorb	%cl,%r11b
	xorb	%bl,%r10b
	kmovb	%r14d,%k1
	shrb	$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r13d,%k3
	shrb	$4,%r13b
	kmovb	%r13d,%k4
	kmovb	%r12d,%k5
	shrb	$4,%r12b
	kmovb	%r12d,%k6
	kmovb	%r11d,%k7
	vpsubq	.Lmask52x4(%rip),%ymm13,%ymm13{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm14,%ymm14{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm15,%ymm15{%k3}
	vpsubq	.Lmask52x4(%rip),%ymm16,%ymm16{%k4}
	vpsubq	.Lmask52x4(%rip),%ymm17,%ymm17{%k5}
	vpsubq	.Lmask52x4(%rip),%ymm18,%ymm18{%k6}
	vpsubq	.Lmask52x4(%rip),%ymm19,%ymm19{%k7}
	vpandq	.Lmask52x4(%rip),%ymm13,%ymm13
	vpandq	.Lmask52x4(%rip),%ymm14,%ymm14
	vpandq	.Lmask52x4(%rip),%ymm15,%ymm15
	vpandq	.Lmask52x4(%rip),%ymm16,%ymm16
	vpandq	.Lmask52x4(%rip),%ymm17,%ymm17
	vpandq	.Lmask52x4(%rip),%ymm18,%ymm18
	vpandq	.Lmask52x4(%rip),%ymm19,%ymm19
	shrb	$4,%r11b
	kmovb	%r11d,%k1
	kmovb	%r10d,%k2
	shrb	$4,%r10b
	kmovb	%r10d,%k3
	vpsubq	.Lmask52x4(%rip),%ymm20,%ymm20{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm21,%ymm21{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm22,%ymm22{%k3}
	vpandq	.Lmask52x4(%rip),%ymm20,%ymm20
	vpandq	.Lmask52x4(%rip),%ymm21,%ymm21
	vpandq	.Lmask52x4(%rip),%ymm22,%ymm22
	# Store both 40-limb results.
	vmovdqu64	%ymm3,0(%rdi)
	vmovdqu64	%ymm4,32(%rdi)
	vmovdqu64	%ymm5,64(%rdi)
	vmovdqu64	%ymm6,96(%rdi)
	vmovdqu64	%ymm7,128(%rdi)
	vmovdqu64	%ymm8,160(%rdi)
	vmovdqu64	%ymm9,192(%rdi)
	vmovdqu64	%ymm10,224(%rdi)
	vmovdqu64	%ymm11,256(%rdi)
	vmovdqu64	%ymm12,288(%rdi)
	vmovdqu64	%ymm13,320(%rdi)
	vmovdqu64	%ymm14,352(%rdi)
	vmovdqu64	%ymm15,384(%rdi)
	vmovdqu64	%ymm16,416(%rdi)
	vmovdqu64	%ymm17,448(%rdi)
	vmovdqu64	%ymm18,480(%rdi)
	vmovdqu64	%ymm19,512(%rdi)
	vmovdqu64	%ymm20,544(%rdi)
	vmovdqu64	%ymm21,576(%rdi)
	vmovdqu64	%ymm22,608(%rdi)
	vzeroupper
	leaq	(%rsp),%rax
.cfi_def_cfa_register	%rax
	movq	0(%rax),%r15
.cfi_restore	%r15
	movq	8(%rax),%r14
.cfi_restore	%r14
	movq	16(%rax),%r13
.cfi_restore	%r13
	movq	24(%rax),%r12
.cfi_restore	%r12
	movq	32(%rax),%rbp
.cfi_restore	%rbp
	movq	40(%rax),%rbx
.cfi_restore	%rbx
	leaq	48(%rax),%rsp
.cfi_def_cfa	%rsp,8
.Lossl_rsaz_amm52x40_x2_ifma256_epilogue:
.byte	0xf3,0xc3			# rep ret
.cfi_endproc
.size	ossl_rsaz_amm52x40_x2_ifma256, .-ossl_rsaz_amm52x40_x2_ifma256
.text
.align	32
.globl	ossl_extract_multiplier_2x40_win5
.type	ossl_extract_multiplier_2x40_win5,@function

# ossl_extract_multiplier_2x40_win5:
# Constant-time table lookup for window size 5: scans all 32 entries
# (20480 / 640 bytes = 2^5) of a table of 2x40-limb values and blends the
# entry whose index equals the requested one, so the memory access pattern
# is independent of the secret index.
#   %rdi = out   %rsi = table base   %rdx = index for first half
#   %rcx = index for second half (+320-byte lane)
# NOTE(review): no vzeroupper before return even though %ymm0-%ymm5 (VEX
# range) are written — confirm this matches the upstream OpenSSL source.
ossl_extract_multiplier_2x40_win5:
.cfi_startproc
.byte	243,15,30,250			# endbr64
	vmovdqa64	.Lones(%rip),%ymm24	# per-iteration index increment
	vpbroadcastq	%rdx,%ymm22		# target index, first half
	vpbroadcastq	%rcx,%ymm23		# target index, second half
	leaq	20480(%rsi),%rax		# end-of-table sentinel
	movq	%rsi,%r10			# keep base for the second pass
	vpxor	%xmm0,%xmm0,%xmm0
	vmovdqa64	%ymm0,%ymm1
	vmovdqa64	%ymm0,%ymm2
	vmovdqa64	%ymm0,%ymm3
	vmovdqa64	%ymm0,%ymm4
	vmovdqa64	%ymm0,%ymm5
	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm0,%ymm17
	vmovdqa64	%ymm0,%ymm18
	vmovdqa64	%ymm0,%ymm19
	vpxorq	%ymm21,%ymm21,%ymm21	# %ymm21 = running entry index
.align	32
.Lloop_0:
	# k1 = (current index == target); blend this entry in iff it matches.
	vpcmpq	$0,%ymm21,%ymm22,%k1
	vmovdqu64	0(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm0,%ymm0{%k1}
	vmovdqu64	32(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm1,%ymm1{%k1}
	vmovdqu64	64(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm2,%ymm2{%k1}
	vmovdqu64	96(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm3,%ymm3{%k1}
	vmovdqu64	128(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm4,%ymm4{%k1}
	vmovdqu64	160(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm5,%ymm5{%k1}
	vmovdqu64	192(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm16,%ymm16{%k1}
	vmovdqu64	224(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm17,%ymm17{%k1}
	vmovdqu64	256(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm18,%ymm18{%k1}
	vmovdqu64	288(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm19,%ymm19{%k1}
	vpaddq	%ymm24,%ymm21,%ymm21
	addq	$640,%rsi
	cmpq	%rsi,%rax
	jne	.Lloop_0
	vmovdqu64	%ymm0,0(%rdi)
	vmovdqu64	%ymm1,32(%rdi)
	vmovdqu64	%ymm2,64(%rdi)
	vmovdqu64	%ymm3,96(%rdi)
	vmovdqu64	%ymm4,128(%rdi)
	vmovdqu64	%ymm5,160(%rdi)
	vmovdqu64	%ymm16,192(%rdi)
	vmovdqu64	%ymm17,224(%rdi)
	vmovdqu64	%ymm18,256(%rdi)
	vmovdqu64	%ymm19,288(%rdi)
	# Second pass: same scan over the +320-byte lane with index %rcx.
	movq	%r10,%rsi
	vpxorq	%ymm21,%ymm21,%ymm21
.align	32
.Lloop_320:
	vpcmpq	$0,%ymm21,%ymm23,%k1
	vmovdqu64	320(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm0,%ymm0{%k1}
	vmovdqu64	352(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm1,%ymm1{%k1}
	vmovdqu64	384(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm2,%ymm2{%k1}
	vmovdqu64	416(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm3,%ymm3{%k1}
	vmovdqu64	448(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm4,%ymm4{%k1}
	vmovdqu64	480(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm5,%ymm5{%k1}
	vmovdqu64	512(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm16,%ymm16{%k1}
	vmovdqu64	544(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm17,%ymm17{%k1}
	vmovdqu64	576(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm18,%ymm18{%k1}
	vmovdqu64	608(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm19,%ymm19{%k1}
	vpaddq	%ymm24,%ymm21,%ymm21
	addq	$640,%rsi
	cmpq	%rsi,%rax
	jne	.Lloop_320
	vmovdqu64	%ymm0,320(%rdi)
	vmovdqu64	%ymm1,352(%rdi)
	vmovdqu64	%ymm2,384(%rdi)
	vmovdqu64	%ymm3,416(%rdi)
	vmovdqu64	%ymm4,448(%rdi)
	vmovdqu64	%ymm5,480(%rdi)
	vmovdqu64	%ymm16,512(%rdi)
	vmovdqu64	%ymm17,544(%rdi)
	vmovdqu64	%ymm18,576(%rdi)
	vmovdqu64	%ymm19,608(%rdi)
.byte	0xf3,0xc3			# rep ret
.cfi_endproc
.size	ossl_extract_multiplier_2x40_win5, .-ossl_extract_multiplier_2x40_win5
.data
.align	32
.Lones:
.quad	1,1,1,1
.Lzeros:
.quad	0,0,0,0
# GNU property note marking the object as IBT/SHSTK compatible.
.section	".note.gnu.property", "a"
.p2align	3
.long	1f - 0f
.long	4f - 1f
.long	5
0:
	# "GNU" encoded with .byte, since .asciz isn't supported
	# on Solaris.
	.byte	0x47
	.byte	0x4e
	.byte	0x55
	.byte	0
1:
	.p2align	3
	.long	0xc0000002
	.long	3f - 2f
2:
	.long	3
3:
	.p2align	3
4: