#-----------------------------------------------------------------------
# RSAZ helpers built on AVX-512 IFMA (256-bit vector forms), x86-64.
#
# Syntax : GNU as, AT&T operand order (source first, destination last).
# Target : ELF (uses @function, .size and a .note.gnu.property section).
# ABI    : arguments are taken from rdi, rsi, rdx, rcx, r8, which are the
#          System V AMD64 integer argument registers.
#
# Data layout used by the multiplication routines: every operand is an
# array of 20 quadwords (160 bytes, five YMM registers of four lanes).
# The vpmadd52 instructions consume the low 52 bits of each quadword, so
# each quadword is presumably one 52-bit digit (NOTE(review): confirm
# against the C callers).
#-----------------------------------------------------------------------

	.text

#-----------------------------------------------------------------------
# rsaz_save_callee_saved / rsaz_restore_callee_saved
#
# Push rbx, rbp, r12-r15 with matching CFI, and undo it again. The
# restore variant reloads the registers with moves and then releases the
# 48 bytes in one step. Must be expanded inside .cfi_startproc /
# .cfi_endproc.
#-----------------------------------------------------------------------
.macro rsaz_save_callee_saved
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.endm

.macro rsaz_restore_callee_saved
	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.endm

#-----------------------------------------------------------------------
# amm52x20_step b_off, k0, acc, op_off, v0, v1, v2, v3, v4
#
# One multiply-accumulate-reduce-shift step for a single quadword of the
# operand addressed by r11. The shape (multiply by k0, mask to 52 bits,
# add a multiple of the third operand, drop the low digit) is that of a
# word-serial Montgomery reduction step.
#
#   b_off   byte offset of the quadword to consume, relative to r11
#   k0      operand that yields the k0 constant (register or memory)
#   acc     64-bit scalar register carrying the lowest digit between steps
#   op_off  byte offset of the operand set inside the rsi and rcx arrays
#   v0..v4  YMM register numbers of the 20-lane vector accumulator
#
# Expects: rax = 2^52-1, ymm0 = 0, rsi and rcx = operand bases.
# Clobbers: rdx, r10, r12, r13, ymm1, ymm2, flags.
#-----------------------------------------------------------------------
.macro amm52x20_step b_off, k0, acc, op_off, v0, v1, v2, v3, v4
	# Scalar handling of the lowest digit: acc += lo(a[0] * b[i]).
	movq	\b_off(%r11),%r13
	vpbroadcastq	%r13,%ymm1		# ymm1 = b[i] in every lane
	movq	\op_off(%rsi),%rdx
	mulxq	%r13,%r13,%r12			# r12:r13 = a[0] * b[i]
	addq	%r13,\acc
	movq	%r12,%r10
	adcq	$0,%r10				# r10 = high half plus carry

	# Reduction factor: y = (k0 * acc) mod 2^52.
	movq	\k0,%r13
	imulq	\acc,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm2		# ymm2 = y in every lane
	movq	\op_off(%rcx),%rdx
	mulxq	%r13,%r13,%r12			# r12:r13 = m[0] * y
	addq	%r13,\acc
	adcq	%r12,%r10

	# Drop the low 52 bits of the 128-bit value r10:acc.
	shrq	$52,\acc
	salq	$12,%r10
	orq	%r10,\acc

	# Low 52-bit halves of the products a[]*b[i] and m[]*y.
	vpmadd52luq	\op_off(%rsi),%ymm1,%ymm\v0
	vpmadd52luq	\op_off+32(%rsi),%ymm1,%ymm\v1
	vpmadd52luq	\op_off+64(%rsi),%ymm1,%ymm\v2
	vpmadd52luq	\op_off+96(%rsi),%ymm1,%ymm\v3
	vpmadd52luq	\op_off+128(%rsi),%ymm1,%ymm\v4

	vpmadd52luq	\op_off(%rcx),%ymm2,%ymm\v0
	vpmadd52luq	\op_off+32(%rcx),%ymm2,%ymm\v1
	vpmadd52luq	\op_off+64(%rcx),%ymm2,%ymm\v2
	vpmadd52luq	\op_off+96(%rcx),%ymm2,%ymm\v3
	vpmadd52luq	\op_off+128(%rcx),%ymm2,%ymm\v4

	# Shift the 20-lane accumulator down by one lane; zero enters at
	# the top from ymm0.
	valignq	$1,%ymm\v0,%ymm\v1,%ymm\v0
	valignq	$1,%ymm\v1,%ymm\v2,%ymm\v1
	valignq	$1,%ymm\v2,%ymm\v3,%ymm\v2
	valignq	$1,%ymm\v3,%ymm\v4,%ymm\v3
	valignq	$1,%ymm\v4,%ymm0,%ymm\v4

	# Fold the new lowest lane into the scalar accumulator.
	vmovq	%xmm\v0,%r13
	addq	%r13,\acc

	# High 52-bit halves, accumulated after the shift so they land one
	# digit above the matching low halves.
	vpmadd52huq	\op_off(%rsi),%ymm1,%ymm\v0
	vpmadd52huq	\op_off+32(%rsi),%ymm1,%ymm\v1
	vpmadd52huq	\op_off+64(%rsi),%ymm1,%ymm\v2
	vpmadd52huq	\op_off+96(%rsi),%ymm1,%ymm\v3
	vpmadd52huq	\op_off+128(%rsi),%ymm1,%ymm\v4

	vpmadd52huq	\op_off(%rcx),%ymm2,%ymm\v0
	vpmadd52huq	\op_off+32(%rcx),%ymm2,%ymm\v1
	vpmadd52huq	\op_off+64(%rcx),%ymm2,%ymm\v2
	vpmadd52huq	\op_off+96(%rcx),%ymm2,%ymm\v3
	vpmadd52huq	\op_off+128(%rcx),%ymm2,%ymm\v4
.endm

#-----------------------------------------------------------------------
# amm52x20_normalize acc, v0, v1, v2, v3, v4
#
# Bring the 20-lane accumulator back to 52 bits per lane.
#
#   acc     scalar register holding the lowest digit (read first)
#   v0..v4  YMM register numbers of the accumulator (updated in place)
#
# Clobbers: ymm0-ymm2, ymm25, ymm26, k1-k5, r8-r14, rbx, rcx, rdx, flags.
#-----------------------------------------------------------------------
.macro amm52x20_normalize acc, v0, v1, v2, v3, v4
	# Put the scalar low digit back into lane 0.
	vpbroadcastq	\acc,%ymm0
	vpblendd	$3,%ymm0,%ymm\v0,%ymm\v0

	# Carry of every lane is whatever sits above bit 51.
	vpsrlq	$52,%ymm\v0,%ymm0
	vpsrlq	$52,%ymm\v1,%ymm1
	vpsrlq	$52,%ymm\v2,%ymm2
	vpsrlq	$52,%ymm\v3,%ymm25
	vpsrlq	$52,%ymm\v4,%ymm26

	# Move every carry one lane up; lane 0 receives zero.
	valignq	$3,%ymm25,%ymm26,%ymm26
	valignq	$3,%ymm2,%ymm25,%ymm25
	valignq	$3,%ymm1,%ymm2,%ymm2
	valignq	$3,%ymm0,%ymm1,%ymm1
	valignq	$3,.Lzeros(%rip),%ymm0,%ymm0

	# Keep the low 52 bits of each lane ...
	vpandq	.Lmask52x4(%rip),%ymm\v0,%ymm\v0
	vpandq	.Lmask52x4(%rip),%ymm\v1,%ymm\v1
	vpandq	.Lmask52x4(%rip),%ymm\v2,%ymm\v2
	vpandq	.Lmask52x4(%rip),%ymm\v3,%ymm\v3
	vpandq	.Lmask52x4(%rip),%ymm\v4,%ymm\v4

	# ... and add the carry coming from the lane below.
	vpaddq	%ymm0,%ymm\v0,%ymm\v0
	vpaddq	%ymm1,%ymm\v1,%ymm\v1
	vpaddq	%ymm2,%ymm\v2,%ymm\v2
	vpaddq	%ymm25,%ymm\v3,%ymm\v3
	vpaddq	%ymm26,%ymm\v4,%ymm\v4

	# The addition can overflow a lane again. Predicate 6 (unsigned
	# greater-than): lanes above 2^52-1, which generate a carry.
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm\v0,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm\v1,%k2
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm\v2,%k3
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm\v3,%k4
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm\v4,%k5
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	kmovb	%k3,%r12d
	kmovb	%k4,%r11d
	kmovb	%k5,%r10d

	# Predicate 0 (equal): lanes exactly 2^52-1, which pass a carry on.
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm\v0,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm\v1,%k2
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm\v2,%k3
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm\v3,%k4
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm\v4,%k5
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	kmovb	%k3,%ebx
	kmovb	%k4,%ecx
	kmovb	%k5,%edx

	# Pack the 4-bit lane masks into bytes: r14b = lanes 0-7,
	# r12b = lanes 8-15, r10b = lanes 16-19 of the generate mask.
	shlb	$4,%r13b
	orb	%r13b,%r14b
	shlb	$4,%r11b
	orb	%r11b,%r12b

	# Shift the generate mask up by one lane (a carry hits the next lane).
	addb	%r14b,%r14b
	adcb	%r12b,%r12b
	adcb	%r10b,%r10b

	# Same packing for the propagate mask: r9b, bl, dl.
	shlb	$4,%r8b
	orb	%r8b,%r9b
	shlb	$4,%cl
	orb	%cl,%bl

	# Binary addition ripples carries through runs of propagate lanes;
	# the xor then leaves a 1 for every lane that receives a carry.
	addb	%r9b,%r14b
	adcb	%bl,%r12b
	adcb	%dl,%r10b

	xorb	%r9b,%r14b
	xorb	%bl,%r12b
	xorb	%dl,%r10b

	kmovb	%r14d,%k1
	shrb	$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r12d,%k3
	shrb	$4,%r12b
	kmovb	%r12d,%k4
	kmovb	%r10d,%k5

	# Subtracting 2^52-1 and masking to 52 bits equals adding one
	# modulo 2^52, applied only to lanes that receive a carry.
	vpsubq	.Lmask52x4(%rip),%ymm\v0,%ymm\v0{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm\v1,%ymm\v1{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm\v2,%ymm\v2{%k3}
	vpsubq	.Lmask52x4(%rip),%ymm\v3,%ymm\v3{%k4}
	vpsubq	.Lmask52x4(%rip),%ymm\v4,%ymm\v4{%k5}

	vpandq	.Lmask52x4(%rip),%ymm\v0,%ymm\v0
	vpandq	.Lmask52x4(%rip),%ymm\v1,%ymm\v1
	vpandq	.Lmask52x4(%rip),%ymm\v2,%ymm\v2
	vpandq	.Lmask52x4(%rip),%ymm\v3,%ymm\v3
	vpandq	.Lmask52x4(%rip),%ymm\v4,%ymm\v4
.endm

#-----------------------------------------------------------------------
# extract_select off, acc, kreg
#
# Load 32 bytes of the current table entry (rsi + off) and merge them
# into ymm<acc> for the lanes selected by mask register k<kreg>.
# Clobbers ymm20.
#-----------------------------------------------------------------------
.macro extract_select off, acc, kreg
	vmovdqu64	\off(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm\acc,%ymm\acc{%k\kreg}
.endm

#-----------------------------------------------------------------------
# ossl_rsaz_avx512ifma_eligible
#
# Out:   eax = 0x80230000 when all four tested capability bits are set
#        in the dword at OPENSSL_ia32cap_P+8, otherwise 0.
# Clobb: ecx, flags.
#
# Bits 16, 17, 21 and 31 are tested. If that dword mirrors
# CPUID.(EAX=7,ECX=0):EBX, as OPENSSL_ia32cap(3) describes, these are
# AVX512F, AVX512DQ, AVX512IFMA and AVX512VL (NOTE(review): confirm).
#-----------------------------------------------------------------------
.globl	ossl_rsaz_avx512ifma_eligible
.type	ossl_rsaz_avx512ifma_eligible,@function
.align	32
ossl_rsaz_avx512ifma_eligible:
	# endbr64: the property note at the end of this file advertises
	# IBT, so every global entry point has to be a valid indirect
	# branch target.
	.byte	243,15,30,250
	movl	OPENSSL_ia32cap_P+8(%rip),%ecx
	xorl	%eax,%eax
	andl	$0x80230000,%ecx
	cmpl	$0x80230000,%ecx
	cmovel	%ecx,%eax
	.byte	0xf3,0xc3			# rep ret
.size	ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible

#-----------------------------------------------------------------------
# ossl_rsaz_amm52x20_x1_ifma256
#
# The name suggests an "almost Montgomery multiplication" on 20 digits
# of 52 bits; the code performs 20 amm52x20_step iterations followed by
# one normalisation.
#
# In:    rdi = result, 20 quadwords written
#        rsi = first operand, 20 quadwords
#        rdx = second operand, 20 quadwords, consumed one per step
#        rcx = third operand, 20 quadwords (presumably the modulus)
#        r8  = k0 constant, passed by value
# Out:   20 quadwords at rdi, each masked to 52 bits.
# Clobb: rax, rcx, rdx, r8-r11, ymm0-ymm3, ymm16-ymm19, ymm25, ymm26,
#        k1-k5, flags. rbx, rbp, r12-r15 are saved and restored.
#-----------------------------------------------------------------------
.globl	ossl_rsaz_amm52x20_x1_ifma256
.type	ossl_rsaz_amm52x20_x1_ifma256,@function
.align	32
ossl_rsaz_amm52x20_x1_ifma256:
.cfi_startproc
	.byte	243,15,30,250			# endbr64
	rsaz_save_callee_saved
.Lossl_rsaz_amm52x20_x1_ifma256_body:

	# Zero the vector accumulator; ymm0 stays zero for the whole loop.
	vpxord	%ymm0,%ymm0,%ymm0
.irp reg, 3, 16, 17, 18, 19
	vmovdqa64	%ymm0,%ymm\reg
.endr

	xorl	%r9d,%r9d			# scalar low-digit accumulator
	movq	%rdx,%r11			# rdx is needed by mulx
	movq	$0xfffffffffffff,%rax		# 52-bit digit mask

	movl	$5,%ebx				# five passes of four steps

.align	32
.Lloop5:
	amm52x20_step	0, %r8, %r9, 0, 3, 16, 17, 18, 19
	amm52x20_step	8, %r8, %r9, 0, 3, 16, 17, 18, 19
	amm52x20_step	16, %r8, %r9, 0, 3, 16, 17, 18, 19
	amm52x20_step	24, %r8, %r9, 0, 3, 16, 17, 18, 19
	leaq	32(%r11),%r11
	decl	%ebx
	jne	.Lloop5

	amm52x20_normalize	%r9, 3, 16, 17, 18, 19

	vmovdqu64	%ymm3,0(%rdi)
	vmovdqu64	%ymm16,32(%rdi)
	vmovdqu64	%ymm17,64(%rdi)
	vmovdqu64	%ymm18,96(%rdi)
	vmovdqu64	%ymm19,128(%rdi)

	vzeroupper
	rsaz_restore_callee_saved
.Lossl_rsaz_amm52x20_x1_ifma256_epilogue:
	.byte	0xf3,0xc3			# rep ret
.cfi_endproc
.size	ossl_rsaz_amm52x20_x1_ifma256, .-ossl_rsaz_amm52x20_x1_ifma256

#-----------------------------------------------------------------------
# ossl_rsaz_amm52x20_x2_ifma256
#
# Two independent instances of the x1 computation, interleaved step by
# step. The second instance uses the operands 160 bytes after the first
# in every array.
#
# In:    rdi = results, 2 x 20 quadwords written
#        rsi = first operands, 2 x 20 quadwords
#        rdx = second operands, 2 x 20 quadwords
#        rcx = third operands, 2 x 20 quadwords (presumably the moduli)
#        r8  = pointer to two k0 quadwords, one per instance
# Out:   40 quadwords at rdi, each masked to 52 bits.
# Clobb: rax, rcx, rdx, r8-r11, ymm0-ymm4, ymm16-ymm23, ymm25, ymm26,
#        k1-k5, flags. rbx, rbp, r12-r15 are saved and restored.
#-----------------------------------------------------------------------
.globl	ossl_rsaz_amm52x20_x2_ifma256
.type	ossl_rsaz_amm52x20_x2_ifma256,@function
.align	32
ossl_rsaz_amm52x20_x2_ifma256:
.cfi_startproc
	.byte	243,15,30,250			# endbr64
	rsaz_save_callee_saved
.Lossl_rsaz_amm52x20_x2_ifma256_body:

	# Zero both vector accumulators; ymm0 stays zero for the whole loop.
	vpxord	%ymm0,%ymm0,%ymm0
.irp reg, 3, 16, 17, 18, 19, 4, 20, 21, 22, 23
	vmovdqa64	%ymm0,%ymm\reg
.endr

	xorl	%r9d,%r9d			# low digit, first instance
	xorl	%r15d,%r15d			# low digit, second instance

	movq	%rdx,%r11			# rdx is needed by mulx
	movq	$0xfffffffffffff,%rax		# 52-bit digit mask

	movl	$20,%ebx			# one step per instance per pass

.align	32
.Lloop20:
	amm52x20_step	0, (%r8), %r9, 0, 3, 16, 17, 18, 19
	amm52x20_step	160, 8(%r8), %r15, 160, 4, 20, 21, 22, 23
	leaq	8(%r11),%r11
	decl	%ebx
	jne	.Lloop20

	# The first normalisation overwrites r9 but leaves r15 alone, so
	# the second low digit is still intact for the second call.
	amm52x20_normalize	%r9, 3, 16, 17, 18, 19
	amm52x20_normalize	%r15, 4, 20, 21, 22, 23

	vmovdqu64	%ymm3,0(%rdi)
	vmovdqu64	%ymm16,32(%rdi)
	vmovdqu64	%ymm17,64(%rdi)
	vmovdqu64	%ymm18,96(%rdi)
	vmovdqu64	%ymm19,128(%rdi)

	vmovdqu64	%ymm4,160(%rdi)
	vmovdqu64	%ymm20,192(%rdi)
	vmovdqu64	%ymm21,224(%rdi)
	vmovdqu64	%ymm22,256(%rdi)
	vmovdqu64	%ymm23,288(%rdi)

	vzeroupper
	rsaz_restore_callee_saved
.Lossl_rsaz_amm52x20_x2_ifma256_epilogue:
	.byte	0xf3,0xc3			# rep ret
.cfi_endproc
.size	ossl_rsaz_amm52x20_x2_ifma256, .-ossl_rsaz_amm52x20_x2_ifma256

#-----------------------------------------------------------------------
# ossl_extract_multiplier_2x20_win5
#
# Copy two 160-byte halves out of a table of 32 entries of 320 bytes
# each (10240 bytes in total). All 32 entries are always read; the
# wanted data is picked with compare masks instead of an indexed load.
#
# In:    rdi = output, 320 bytes written
#        rsi = table base
#        rdx = entry index for the first half (bytes 0-159 of an entry)
#        rcx = entry index for the second half (bytes 160-319)
# Out:   320 bytes at rdi. A half stays all zero when its index matches
#        no entry number in 0..31.
# Clobb: rax, rsi, ymm0-ymm5, ymm16-ymm24, k1, k2, flags.
#-----------------------------------------------------------------------
.text
.align	32
.globl	ossl_extract_multiplier_2x20_win5
.type	ossl_extract_multiplier_2x20_win5,@function
ossl_extract_multiplier_2x20_win5:
.cfi_startproc
	.byte	243,15,30,250			# endbr64
	vmovdqa64	.Lones(%rip),%ymm24	# per-lane increment
	vpbroadcastq	%rdx,%ymm22		# wanted index, first half
	vpbroadcastq	%rcx,%ymm23		# wanted index, second half
	leaq	10240(%rsi),%rax		# end of table

	# ymm21 is the running entry number; the rest collect the result.
	vpxor	%xmm0,%xmm0,%xmm0
.irp reg, 21, 1, 2, 3, 4, 5, 16, 17, 18, 19
	vmovdqa64	%ymm0,%ymm\reg
.endr

.align	32
.Lloop:
	vpcmpq	$0,%ymm21,%ymm22,%k1		# all lanes set on a match
	vpcmpq	$0,%ymm21,%ymm23,%k2
	extract_select	0, 0, 1
	extract_select	32, 1, 1
	extract_select	64, 2, 1
	extract_select	96, 3, 1
	extract_select	128, 4, 1
	extract_select	160, 5, 2
	extract_select	192, 16, 2
	extract_select	224, 17, 2
	extract_select	256, 18, 2
	extract_select	288, 19, 2
	vpaddq	%ymm24,%ymm21,%ymm21
	addq	$320,%rsi
	cmpq	%rsi,%rax
	jne	.Lloop

	vmovdqu64	%ymm0,0(%rdi)
	vmovdqu64	%ymm1,32(%rdi)
	vmovdqu64	%ymm2,64(%rdi)
	vmovdqu64	%ymm3,96(%rdi)
	vmovdqu64	%ymm4,128(%rdi)
	vmovdqu64	%ymm5,160(%rdi)
	vmovdqu64	%ymm16,192(%rdi)
	vmovdqu64	%ymm17,224(%rdi)
	vmovdqu64	%ymm18,256(%rdi)
	vmovdqu64	%ymm19,288(%rdi)

	# Leave the upper YMM halves clean for the caller, as the two
	# multiplication routines above do.
	vzeroupper
	.byte	0xf3,0xc3			# rep ret
.cfi_endproc
.size	ossl_extract_multiplier_2x20_win5, .-ossl_extract_multiplier_2x20_win5

#-----------------------------------------------------------------------
# Read-only constants. All three are 32 bytes long, so each one keeps
# the 32-byte alignment that the vmovdqa64 load of .Lones relies on.
#-----------------------------------------------------------------------
.section	.rodata
.align	32
.Lmask52x4:
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.Lones:
.quad	1,1,1,1
.Lzeros:
.quad	0,0,0,0

#-----------------------------------------------------------------------
# GNU property note: type 5 (NT_GNU_PROPERTY_TYPE_0), property
# 0xc0000002 (x86 feature-1 AND word) with value 3, i.e. bit 0 (IBT)
# and bit 1 (SHSTK).
#-----------------------------------------------------------------------
	.section ".note.gnu.property", "a"
	.p2align 3
	.long 1f - 0f
	.long 4f - 1f
	.long 5
0:
	# "GNU" encoded with .byte, since .asciz isn't supported
	# on Solaris.
	.byte 0x47
	.byte 0x4e
	.byte 0x55
	.byte 0
1:
	.p2align 3
	.long 0xc0000002
	.long 3f - 2f
2:
	.long 3
3:
	.p2align 3
4: