/* Do not modify. This file is auto-generated from rsaz-2k-avxifma.pl. */
.text

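# Returns nonzero iff the CPU advertises AVX-IFMA. Bit 23 (0x800000) of
# the OPENSSL_ia32cap_P word at offset 20 caches the AVX-IFMA feature
# flag (CPUID.(EAX=7,ECX=1):EAX[23]).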
.globl	ossl_rsaz_avxifma_eligible
.type	ossl_rsaz_avxifma_eligible,@function
.align	32
ossl_rsaz_avxifma_eligible:
	movl	OPENSSL_ia32cap_P+20(%rip),%ecx
	xorl	%eax,%eax
	andl	$8388608,%ecx
	cmpl	$8388608,%ecx
	cmovel	%ecx,%eax
	.byte	0xf3,0xc3
.size	ossl_rsaz_avxifma_eligible, .-ossl_rsaz_avxifma_eligible
.text

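# Almost Montgomery multiplication over 20 limbs of 52 bits (1040 bits,
# one half of RSA-2048); "almost" in the sense that the output is only
# partially reduced. Per the SysV ABI the arguments are assumed to be:
# %rdi = result, %rsi = a, %rdx = b, %rcx = modulus, %r8 = k0 (the
# Montgomery constant for the modulus).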
.globl	ossl_rsaz_amm52x20_x1_avxifma256
.type	ossl_rsaz_amm52x20_x1_avxifma256,@function
.align	32
ossl_rsaz_amm52x20_x1_avxifma256:
.cfi_startproc
.byte	243,15,30,250
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lossl_rsaz_amm52x20_x1_avxifma256_body:


	vpxor	%ymm0,%ymm0,%ymm0
	vmovapd	%ymm0,%ymm3
	vmovapd	%ymm0,%ymm5
	vmovapd	%ymm0,%ymm6
	vmovapd	%ymm0,%ymm7
	vmovapd	%ymm0,%ymm8

	xorl	%r9d,%r9d

	movq	%rdx,%r11
	movq	$0xfffffffffffff,%rax


	movl	$5,%ebx

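# Main loop: 5 iterations, each consuming four 64-bit digits of b
# (4x unrolled). Per digit: a scalar mulx forms b[i]*a[0], the quotient
# digit is derived from it via k0 (%r8), and the running scalar carry is
# kept in %r9. The {vex} pseudo-prefix below forces the VEX-encoded
# (AVX-IFMA) form of vpmadd52luq/vpmadd52huq instead of the EVEX
# (AVX512-IFMA) form.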
.align	32
.Lloop5:
	movq	0(%r11),%r13

	vpbroadcastq	0(%r11),%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vmovq	%r13,%xmm2
	vpbroadcastq	%xmm2,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	leaq	-168(%rsp),%rsp
{vex}	vpmadd52luq	0(%rsi),%ymm1,%ymm3
{vex}	vpmadd52luq	32(%rsi),%ymm1,%ymm5
{vex}	vpmadd52luq	64(%rsi),%ymm1,%ymm6
{vex}	vpmadd52luq	96(%rsi),%ymm1,%ymm7
{vex}	vpmadd52luq	128(%rsi),%ymm1,%ymm8

{vex}	vpmadd52luq	0(%rcx),%ymm2,%ymm3
{vex}	vpmadd52luq	32(%rcx),%ymm2,%ymm5
{vex}	vpmadd52luq	64(%rcx),%ymm2,%ymm6
{vex}	vpmadd52luq	96(%rcx),%ymm2,%ymm7
{vex}	vpmadd52luq	128(%rcx),%ymm2,%ymm8


	vmovdqu	%ymm3,0(%rsp)
	vmovdqu	%ymm5,32(%rsp)
	vmovdqu	%ymm6,64(%rsp)
	vmovdqu	%ymm7,96(%rsp)
	vmovdqu	%ymm8,128(%rsp)
	movq	$0,160(%rsp)

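# Spill the accumulator to the 168-byte scratch area, zero the slot on
# top, and reload 8 bytes higher: this shifts the redundant accumulator
# down by one 64-bit lane, retiring the limb just consumed above.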
	vmovdqu	8(%rsp),%ymm3
	vmovdqu	40(%rsp),%ymm5
	vmovdqu	72(%rsp),%ymm6
	vmovdqu	104(%rsp),%ymm7
	vmovdqu	136(%rsp),%ymm8

	addq	8(%rsp),%r9

{vex}	vpmadd52huq	0(%rsi),%ymm1,%ymm3
{vex}	vpmadd52huq	32(%rsi),%ymm1,%ymm5
{vex}	vpmadd52huq	64(%rsi),%ymm1,%ymm6
{vex}	vpmadd52huq	96(%rsi),%ymm1,%ymm7
{vex}	vpmadd52huq	128(%rsi),%ymm1,%ymm8

{vex}	vpmadd52huq	0(%rcx),%ymm2,%ymm3
{vex}	vpmadd52huq	32(%rcx),%ymm2,%ymm5
{vex}	vpmadd52huq	64(%rcx),%ymm2,%ymm6
{vex}	vpmadd52huq	96(%rcx),%ymm2,%ymm7
{vex}	vpmadd52huq	128(%rcx),%ymm2,%ymm8
	leaq	168(%rsp),%rsp
	movq	8(%r11),%r13

	vpbroadcastq	8(%r11),%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vmovq	%r13,%xmm2
	vpbroadcastq	%xmm2,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	leaq	-168(%rsp),%rsp
{vex}	vpmadd52luq	0(%rsi),%ymm1,%ymm3
{vex}	vpmadd52luq	32(%rsi),%ymm1,%ymm5
{vex}	vpmadd52luq	64(%rsi),%ymm1,%ymm6
{vex}	vpmadd52luq	96(%rsi),%ymm1,%ymm7
{vex}	vpmadd52luq	128(%rsi),%ymm1,%ymm8

{vex}	vpmadd52luq	0(%rcx),%ymm2,%ymm3
{vex}	vpmadd52luq	32(%rcx),%ymm2,%ymm5
{vex}	vpmadd52luq	64(%rcx),%ymm2,%ymm6
{vex}	vpmadd52luq	96(%rcx),%ymm2,%ymm7
{vex}	vpmadd52luq	128(%rcx),%ymm2,%ymm8


	vmovdqu	%ymm3,0(%rsp)
	vmovdqu	%ymm5,32(%rsp)
	vmovdqu	%ymm6,64(%rsp)
	vmovdqu	%ymm7,96(%rsp)
	vmovdqu	%ymm8,128(%rsp)
	movq	$0,160(%rsp)

	vmovdqu	8(%rsp),%ymm3
	vmovdqu	40(%rsp),%ymm5
	vmovdqu	72(%rsp),%ymm6
	vmovdqu	104(%rsp),%ymm7
	vmovdqu	136(%rsp),%ymm8

	addq	8(%rsp),%r9

{vex}	vpmadd52huq	0(%rsi),%ymm1,%ymm3
{vex}	vpmadd52huq	32(%rsi),%ymm1,%ymm5
{vex}	vpmadd52huq	64(%rsi),%ymm1,%ymm6
{vex}	vpmadd52huq	96(%rsi),%ymm1,%ymm7
{vex}	vpmadd52huq	128(%rsi),%ymm1,%ymm8

{vex}	vpmadd52huq	0(%rcx),%ymm2,%ymm3
{vex}	vpmadd52huq	32(%rcx),%ymm2,%ymm5
{vex}	vpmadd52huq	64(%rcx),%ymm2,%ymm6
{vex}	vpmadd52huq	96(%rcx),%ymm2,%ymm7
{vex}	vpmadd52huq	128(%rcx),%ymm2,%ymm8
	leaq	168(%rsp),%rsp
	movq	16(%r11),%r13

	vpbroadcastq	16(%r11),%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vmovq	%r13,%xmm2
	vpbroadcastq	%xmm2,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	leaq	-168(%rsp),%rsp
{vex}	vpmadd52luq	0(%rsi),%ymm1,%ymm3
{vex}	vpmadd52luq	32(%rsi),%ymm1,%ymm5
{vex}	vpmadd52luq	64(%rsi),%ymm1,%ymm6
{vex}	vpmadd52luq	96(%rsi),%ymm1,%ymm7
{vex}	vpmadd52luq	128(%rsi),%ymm1,%ymm8

{vex}	vpmadd52luq	0(%rcx),%ymm2,%ymm3
{vex}	vpmadd52luq	32(%rcx),%ymm2,%ymm5
{vex}	vpmadd52luq	64(%rcx),%ymm2,%ymm6
{vex}	vpmadd52luq	96(%rcx),%ymm2,%ymm7
{vex}	vpmadd52luq	128(%rcx),%ymm2,%ymm8


	vmovdqu	%ymm3,0(%rsp)
	vmovdqu	%ymm5,32(%rsp)
	vmovdqu	%ymm6,64(%rsp)
	vmovdqu	%ymm7,96(%rsp)
	vmovdqu	%ymm8,128(%rsp)
	movq	$0,160(%rsp)

	vmovdqu	8(%rsp),%ymm3
	vmovdqu	40(%rsp),%ymm5
	vmovdqu	72(%rsp),%ymm6
	vmovdqu	104(%rsp),%ymm7
	vmovdqu	136(%rsp),%ymm8

	addq	8(%rsp),%r9

{vex}	vpmadd52huq	0(%rsi),%ymm1,%ymm3
{vex}	vpmadd52huq	32(%rsi),%ymm1,%ymm5
{vex}	vpmadd52huq	64(%rsi),%ymm1,%ymm6
{vex}	vpmadd52huq	96(%rsi),%ymm1,%ymm7
{vex}	vpmadd52huq	128(%rsi),%ymm1,%ymm8

{vex}	vpmadd52huq	0(%rcx),%ymm2,%ymm3
{vex}	vpmadd52huq	32(%rcx),%ymm2,%ymm5
{vex}	vpmadd52huq	64(%rcx),%ymm2,%ymm6
{vex}	vpmadd52huq	96(%rcx),%ymm2,%ymm7
{vex}	vpmadd52huq	128(%rcx),%ymm2,%ymm8
	leaq	168(%rsp),%rsp
	movq	24(%r11),%r13

	vpbroadcastq	24(%r11),%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vmovq	%r13,%xmm2
	vpbroadcastq	%xmm2,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	leaq	-168(%rsp),%rsp
{vex}	vpmadd52luq	0(%rsi),%ymm1,%ymm3
{vex}	vpmadd52luq	32(%rsi),%ymm1,%ymm5
{vex}	vpmadd52luq	64(%rsi),%ymm1,%ymm6
{vex}	vpmadd52luq	96(%rsi),%ymm1,%ymm7
{vex}	vpmadd52luq	128(%rsi),%ymm1,%ymm8

{vex}	vpmadd52luq	0(%rcx),%ymm2,%ymm3
{vex}	vpmadd52luq	32(%rcx),%ymm2,%ymm5
{vex}	vpmadd52luq	64(%rcx),%ymm2,%ymm6
{vex}	vpmadd52luq	96(%rcx),%ymm2,%ymm7
{vex}	vpmadd52luq	128(%rcx),%ymm2,%ymm8


	vmovdqu	%ymm3,0(%rsp)
	vmovdqu	%ymm5,32(%rsp)
	vmovdqu	%ymm6,64(%rsp)
	vmovdqu	%ymm7,96(%rsp)
	vmovdqu	%ymm8,128(%rsp)
	movq	$0,160(%rsp)

	vmovdqu	8(%rsp),%ymm3
	vmovdqu	40(%rsp),%ymm5
	vmovdqu	72(%rsp),%ymm6
	vmovdqu	104(%rsp),%ymm7
	vmovdqu	136(%rsp),%ymm8

	addq	8(%rsp),%r9

{vex}	vpmadd52huq	0(%rsi),%ymm1,%ymm3
{vex}	vpmadd52huq	32(%rsi),%ymm1,%ymm5
{vex}	vpmadd52huq	64(%rsi),%ymm1,%ymm6
{vex}	vpmadd52huq	96(%rsi),%ymm1,%ymm7
{vex}	vpmadd52huq	128(%rsi),%ymm1,%ymm8

{vex}	vpmadd52huq	0(%rcx),%ymm2,%ymm3
{vex}	vpmadd52huq	32(%rcx),%ymm2,%ymm5
{vex}	vpmadd52huq	64(%rcx),%ymm2,%ymm6
{vex}	vpmadd52huq	96(%rcx),%ymm2,%ymm7
{vex}	vpmadd52huq	128(%rcx),%ymm2,%ymm8
	leaq	168(%rsp),%rsp
	leaq	32(%r11),%r11
	decl	%ebx
	jne	.Lloop5

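# Out of the loop: merge the scalar carry (%r9) into the lowest vector
# limb, then normalize the redundant representation: extract the bits
# above 52 from each limb, rotate them up by one 64-bit lane across the
# five registers, mask every limb back to 52 bits, and add the carries.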
	vmovq	%r9,%xmm0
	vpbroadcastq	%xmm0,%ymm0
	vpblendd	$3,%ymm0,%ymm3,%ymm3



	vpsrlq	$52,%ymm3,%ymm0
	vpsrlq	$52,%ymm5,%ymm1
	vpsrlq	$52,%ymm6,%ymm2
	vpsrlq	$52,%ymm7,%ymm13
	vpsrlq	$52,%ymm8,%ymm14


	vpermq	$144,%ymm14,%ymm14
	vpermq	$3,%ymm13,%ymm15
	vblendpd	$1,%ymm15,%ymm14,%ymm14

	vpermq	$144,%ymm13,%ymm13
	vpermq	$3,%ymm2,%ymm15
	vblendpd	$1,%ymm15,%ymm13,%ymm13

	vpermq	$144,%ymm2,%ymm2
	vpermq	$3,%ymm1,%ymm15
	vblendpd	$1,%ymm15,%ymm2,%ymm2

	vpermq	$144,%ymm1,%ymm1
	vpermq	$3,%ymm0,%ymm15
	vblendpd	$1,%ymm15,%ymm1,%ymm1

	vpermq	$144,%ymm0,%ymm0
	vpand	.Lhigh64x3(%rip),%ymm0,%ymm0


	vpand	.Lmask52x4(%rip),%ymm3,%ymm3
	vpand	.Lmask52x4(%rip),%ymm5,%ymm5
	vpand	.Lmask52x4(%rip),%ymm6,%ymm6
	vpand	.Lmask52x4(%rip),%ymm7,%ymm7
	vpand	.Lmask52x4(%rip),%ymm8,%ymm8


	vpaddq	%ymm0,%ymm3,%ymm3
	vpaddq	%ymm1,%ymm5,%ymm5
	vpaddq	%ymm2,%ymm6,%ymm6
	vpaddq	%ymm13,%ymm7,%ymm7
	vpaddq	%ymm14,%ymm8,%ymm8



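# Branch-free final reduction: flag limbs that exceed the 52-bit mask
# (gt) or hit it exactly (eq); the packed sign masks drive a conditional
# subtraction below.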
	vpcmpgtq	.Lmask52x4(%rip),%ymm3,%ymm0
	vpcmpgtq	.Lmask52x4(%rip),%ymm5,%ymm1
	vpcmpgtq	.Lmask52x4(%rip),%ymm6,%ymm2
	vpcmpgtq	.Lmask52x4(%rip),%ymm7,%ymm13
	vpcmpgtq	.Lmask52x4(%rip),%ymm8,%ymm14
	vmovmskpd	%ymm0,%r14d
	vmovmskpd	%ymm1,%r13d
	vmovmskpd	%ymm2,%r12d
	vmovmskpd	%ymm13,%r11d
	vmovmskpd	%ymm14,%r10d


	vpcmpeqq	.Lmask52x4(%rip),%ymm3,%ymm0
	vpcmpeqq	.Lmask52x4(%rip),%ymm5,%ymm1
	vpcmpeqq	.Lmask52x4(%rip),%ymm6,%ymm2
	vpcmpeqq	.Lmask52x4(%rip),%ymm7,%ymm13
	vpcmpeqq	.Lmask52x4(%rip),%ymm8,%ymm14
	vmovmskpd	%ymm0,%r9d
	vmovmskpd	%ymm1,%r8d
	vmovmskpd	%ymm2,%ebx
	vmovmskpd	%ymm13,%ecx
	vmovmskpd	%ymm14,%edx



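# Carry propagation over the packed 4-bit masks: doubling the "gt" bits
# (addb/adcb) shifts each generated carry up to the next limb, adding
# the "eq" bits lets a carry ripple through runs of saturated limbs, and
# the final xor recovers the per-limb carry-in mask that indexes
# .Lkmasklut.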
	shlb	$4,%r13b
	orb	%r13b,%r14b
	shlb	$4,%r11b
	orb	%r11b,%r12b

	addb	%r14b,%r14b
	adcb	%r12b,%r12b
	adcb	%r10b,%r10b

	shlb	$4,%r8b
	orb	%r8b,%r9b
	shlb	$4,%cl
	orb	%cl,%bl

	addb	%r9b,%r14b
	adcb	%bl,%r12b
	adcb	%dl,%r10b

	xorb	%r9b,%r14b
	xorb	%bl,%r12b
	xorb	%dl,%r10b

	leaq	.Lkmasklut(%rip),%rdx

	movb	%r14b,%r13b
	andq	$0xf,%r14
	vpsubq	.Lmask52x4(%rip),%ymm3,%ymm0
	shlq	$5,%r14
	vmovapd	(%rdx,%r14,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm3,%ymm3

	shrb	$4,%r13b
	andq	$0xf,%r13
	vpsubq	.Lmask52x4(%rip),%ymm5,%ymm0
	shlq	$5,%r13
	vmovapd	(%rdx,%r13,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm5,%ymm5

	movb	%r12b,%r11b
	andq	$0xf,%r12
	vpsubq	.Lmask52x4(%rip),%ymm6,%ymm0
	shlq	$5,%r12
	vmovapd	(%rdx,%r12,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm6,%ymm6

	shrb	$4,%r11b
	andq	$0xf,%r11
	vpsubq	.Lmask52x4(%rip),%ymm7,%ymm0
	shlq	$5,%r11
	vmovapd	(%rdx,%r11,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm7,%ymm7

	andq	$0xf,%r10
	vpsubq	.Lmask52x4(%rip),%ymm8,%ymm0
	shlq	$5,%r10
	vmovapd	(%rdx,%r10,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm8,%ymm8


	vpand	.Lmask52x4(%rip),%ymm3,%ymm3
	vpand	.Lmask52x4(%rip),%ymm5,%ymm5
	vpand	.Lmask52x4(%rip),%ymm6,%ymm6
	vpand	.Lmask52x4(%rip),%ymm7,%ymm7
	vpand	.Lmask52x4(%rip),%ymm8,%ymm8

	vmovdqu	%ymm3,0(%rdi)
	vmovdqu	%ymm5,32(%rdi)
	vmovdqu	%ymm6,64(%rdi)
	vmovdqu	%ymm7,96(%rdi)
	vmovdqu	%ymm8,128(%rdi)

	vzeroupper
	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lossl_rsaz_amm52x20_x1_avxifma256_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ossl_rsaz_amm52x20_x1_avxifma256, .-ossl_rsaz_amm52x20_x1_avxifma256
.section	.rodata
.align	32
.Lmask52x4:
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.Lhigh64x3:
.quad	0x0
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
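# 16 blend masks of 4 quadwords each, indexed by a 4-bit mask shifted
# left by 5: entry i has 64-bit lane j all-ones iff bit j of i is set.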
.Lkmasklut:

.quad	0x0
.quad	0x0
.quad	0x0
.quad	0x0

.quad	0xffffffffffffffff
.quad	0x0
.quad	0x0
.quad	0x0

.quad	0x0
.quad	0xffffffffffffffff
.quad	0x0
.quad	0x0

.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0x0
.quad	0x0

.quad	0x0
.quad	0x0
.quad	0xffffffffffffffff
.quad	0x0

.quad	0xffffffffffffffff
.quad	0x0
.quad	0xffffffffffffffff
.quad	0x0

.quad	0x0
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0x0

.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0x0

.quad	0x0
.quad	0x0
.quad	0x0
.quad	0xffffffffffffffff

.quad	0xffffffffffffffff
.quad	0x0
.quad	0x0
.quad	0xffffffffffffffff

.quad	0x0
.quad	0xffffffffffffffff
.quad	0x0
.quad	0xffffffffffffffff

.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0x0
.quad	0xffffffffffffffff

.quad	0x0
.quad	0x0
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff

.quad	0xffffffffffffffff
.quad	0x0
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff

.quad	0x0
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff

.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.text

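# Two independent 20x52-bit almost Montgomery multiplications in one
# pass (e.g. the mod-p and mod-q halves of an RSA-2048 CRT exponent).
# The second set of operands is assumed to live at byte offset 160
# (limbs 20..39); unlike the x1 variant, %r8 is a pointer to two k0
# values, one per half.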
.globl	ossl_rsaz_amm52x20_x2_avxifma256
.type	ossl_rsaz_amm52x20_x2_avxifma256,@function
.align	32
ossl_rsaz_amm52x20_x2_avxifma256:
.cfi_startproc
.byte	243,15,30,250
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lossl_rsaz_amm52x20_x2_avxifma256_body:


	vpxor	%ymm0,%ymm0,%ymm0
	vmovapd	%ymm0,%ymm3
	vmovapd	%ymm0,%ymm5
	vmovapd	%ymm0,%ymm6
	vmovapd	%ymm0,%ymm7
	vmovapd	%ymm0,%ymm8
	vmovapd	%ymm0,%ymm4
	vmovapd	%ymm0,%ymm9
	vmovapd	%ymm0,%ymm10
	vmovapd	%ymm0,%ymm11
	vmovapd	%ymm0,%ymm12

	xorl	%r9d,%r9d
	xorl	%r15d,%r15d

	movq	%rdx,%r11
	movq	$0xfffffffffffff,%rax

	movl	$20,%ebx

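# Main loop: 20 iterations, one 64-bit digit of b per iteration,
# processing the first half (carry %r9, k0 at 0(%r8)) and then the
# second half (carry %r15, k0 at 8(%r8)) back to back.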
.align	32
.Lloop20:
	movq	0(%r11),%r13

	vpbroadcastq	0(%r11),%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	(%r8),%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vmovq	%r13,%xmm2
	vpbroadcastq	%xmm2,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	leaq	-168(%rsp),%rsp
{vex}	vpmadd52luq	0(%rsi),%ymm1,%ymm3
{vex}	vpmadd52luq	32(%rsi),%ymm1,%ymm5
{vex}	vpmadd52luq	64(%rsi),%ymm1,%ymm6
{vex}	vpmadd52luq	96(%rsi),%ymm1,%ymm7
{vex}	vpmadd52luq	128(%rsi),%ymm1,%ymm8

{vex}	vpmadd52luq	0(%rcx),%ymm2,%ymm3
{vex}	vpmadd52luq	32(%rcx),%ymm2,%ymm5
{vex}	vpmadd52luq	64(%rcx),%ymm2,%ymm6
{vex}	vpmadd52luq	96(%rcx),%ymm2,%ymm7
{vex}	vpmadd52luq	128(%rcx),%ymm2,%ymm8


	vmovdqu	%ymm3,0(%rsp)
	vmovdqu	%ymm5,32(%rsp)
	vmovdqu	%ymm6,64(%rsp)
	vmovdqu	%ymm7,96(%rsp)
	vmovdqu	%ymm8,128(%rsp)
	movq	$0,160(%rsp)

	vmovdqu	8(%rsp),%ymm3
	vmovdqu	40(%rsp),%ymm5
	vmovdqu	72(%rsp),%ymm6
	vmovdqu	104(%rsp),%ymm7
	vmovdqu	136(%rsp),%ymm8

	addq	8(%rsp),%r9

{vex}	vpmadd52huq	0(%rsi),%ymm1,%ymm3
{vex}	vpmadd52huq	32(%rsi),%ymm1,%ymm5
{vex}	vpmadd52huq	64(%rsi),%ymm1,%ymm6
{vex}	vpmadd52huq	96(%rsi),%ymm1,%ymm7
{vex}	vpmadd52huq	128(%rsi),%ymm1,%ymm8

{vex}	vpmadd52huq	0(%rcx),%ymm2,%ymm3
{vex}	vpmadd52huq	32(%rcx),%ymm2,%ymm5
{vex}	vpmadd52huq	64(%rcx),%ymm2,%ymm6
{vex}	vpmadd52huq	96(%rcx),%ymm2,%ymm7
{vex}	vpmadd52huq	128(%rcx),%ymm2,%ymm8
	leaq	168(%rsp),%rsp
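# Second half: the same digit index against the operands at byte
# offset 160, accumulating into %ymm4,%ymm9..%ymm12.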
	movq	160(%r11),%r13

	vpbroadcastq	160(%r11),%ymm1
	movq	160(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r15
	movq	%r12,%r10
	adcq	$0,%r10

	movq	8(%r8),%r13
	imulq	%r15,%r13
	andq	%rax,%r13

	vmovq	%r13,%xmm2
	vpbroadcastq	%xmm2,%ymm2
	movq	160(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r15
	adcq	%r12,%r10

	shrq	$52,%r15
	salq	$12,%r10
	orq	%r10,%r15

	leaq	-168(%rsp),%rsp
{vex}	vpmadd52luq	160(%rsi),%ymm1,%ymm4
{vex}	vpmadd52luq	192(%rsi),%ymm1,%ymm9
{vex}	vpmadd52luq	224(%rsi),%ymm1,%ymm10
{vex}	vpmadd52luq	256(%rsi),%ymm1,%ymm11
{vex}	vpmadd52luq	288(%rsi),%ymm1,%ymm12

{vex}	vpmadd52luq	160(%rcx),%ymm2,%ymm4
{vex}	vpmadd52luq	192(%rcx),%ymm2,%ymm9
{vex}	vpmadd52luq	224(%rcx),%ymm2,%ymm10
{vex}	vpmadd52luq	256(%rcx),%ymm2,%ymm11
{vex}	vpmadd52luq	288(%rcx),%ymm2,%ymm12


	vmovdqu	%ymm4,0(%rsp)
	vmovdqu	%ymm9,32(%rsp)
	vmovdqu	%ymm10,64(%rsp)
	vmovdqu	%ymm11,96(%rsp)
	vmovdqu	%ymm12,128(%rsp)
	movq	$0,160(%rsp)

	vmovdqu	8(%rsp),%ymm4
	vmovdqu	40(%rsp),%ymm9
	vmovdqu	72(%rsp),%ymm10
	vmovdqu	104(%rsp),%ymm11
	vmovdqu	136(%rsp),%ymm12

	addq	8(%rsp),%r15

{vex}	vpmadd52huq	160(%rsi),%ymm1,%ymm4
{vex}	vpmadd52huq	192(%rsi),%ymm1,%ymm9
{vex}	vpmadd52huq	224(%rsi),%ymm1,%ymm10
{vex}	vpmadd52huq	256(%rsi),%ymm1,%ymm11
{vex}	vpmadd52huq	288(%rsi),%ymm1,%ymm12

{vex}	vpmadd52huq	160(%rcx),%ymm2,%ymm4
{vex}	vpmadd52huq	192(%rcx),%ymm2,%ymm9
{vex}	vpmadd52huq	224(%rcx),%ymm2,%ymm10
{vex}	vpmadd52huq	256(%rcx),%ymm2,%ymm11
{vex}	vpmadd52huq	288(%rcx),%ymm2,%ymm12
	leaq	168(%rsp),%rsp
	leaq	8(%r11),%r11
	decl	%ebx
	jne	.Lloop20

	vmovq	%r9,%xmm0
	vpbroadcastq	%xmm0,%ymm0
	vpblendd	$3,%ymm0,%ymm3,%ymm3



	vpsrlq	$52,%ymm3,%ymm0
	vpsrlq	$52,%ymm5,%ymm1
	vpsrlq	$52,%ymm6,%ymm2
	vpsrlq	$52,%ymm7,%ymm13
	vpsrlq	$52,%ymm8,%ymm14


	vpermq	$144,%ymm14,%ymm14
	vpermq	$3,%ymm13,%ymm15
	vblendpd	$1,%ymm15,%ymm14,%ymm14

	vpermq	$144,%ymm13,%ymm13
	vpermq	$3,%ymm2,%ymm15
	vblendpd	$1,%ymm15,%ymm13,%ymm13

	vpermq	$144,%ymm2,%ymm2
	vpermq	$3,%ymm1,%ymm15
	vblendpd	$1,%ymm15,%ymm2,%ymm2

	vpermq	$144,%ymm1,%ymm1
	vpermq	$3,%ymm0,%ymm15
	vblendpd	$1,%ymm15,%ymm1,%ymm1

	vpermq	$144,%ymm0,%ymm0
	vpand	.Lhigh64x3(%rip),%ymm0,%ymm0


	vpand	.Lmask52x4(%rip),%ymm3,%ymm3
	vpand	.Lmask52x4(%rip),%ymm5,%ymm5
	vpand	.Lmask52x4(%rip),%ymm6,%ymm6
	vpand	.Lmask52x4(%rip),%ymm7,%ymm7
	vpand	.Lmask52x4(%rip),%ymm8,%ymm8


	vpaddq	%ymm0,%ymm3,%ymm3
	vpaddq	%ymm1,%ymm5,%ymm5
	vpaddq	%ymm2,%ymm6,%ymm6
	vpaddq	%ymm13,%ymm7,%ymm7
	vpaddq	%ymm14,%ymm8,%ymm8



	vpcmpgtq	.Lmask52x4(%rip),%ymm3,%ymm0
	vpcmpgtq	.Lmask52x4(%rip),%ymm5,%ymm1
	vpcmpgtq	.Lmask52x4(%rip),%ymm6,%ymm2
	vpcmpgtq	.Lmask52x4(%rip),%ymm7,%ymm13
	vpcmpgtq	.Lmask52x4(%rip),%ymm8,%ymm14
	vmovmskpd	%ymm0,%r14d
	vmovmskpd	%ymm1,%r13d
	vmovmskpd	%ymm2,%r12d
	vmovmskpd	%ymm13,%r11d
	vmovmskpd	%ymm14,%r10d


	vpcmpeqq	.Lmask52x4(%rip),%ymm3,%ymm0
	vpcmpeqq	.Lmask52x4(%rip),%ymm5,%ymm1
	vpcmpeqq	.Lmask52x4(%rip),%ymm6,%ymm2
	vpcmpeqq	.Lmask52x4(%rip),%ymm7,%ymm13
	vpcmpeqq	.Lmask52x4(%rip),%ymm8,%ymm14
	vmovmskpd	%ymm0,%r9d
	vmovmskpd	%ymm1,%r8d
	vmovmskpd	%ymm2,%ebx
	vmovmskpd	%ymm13,%ecx
	vmovmskpd	%ymm14,%edx



	shlb	$4,%r13b
	orb	%r13b,%r14b
	shlb	$4,%r11b
	orb	%r11b,%r12b

	addb	%r14b,%r14b
	adcb	%r12b,%r12b
	adcb	%r10b,%r10b

	shlb	$4,%r8b
	orb	%r8b,%r9b
	shlb	$4,%cl
	orb	%cl,%bl

	addb	%r9b,%r14b
	adcb	%bl,%r12b
	adcb	%dl,%r10b

	xorb	%r9b,%r14b
	xorb	%bl,%r12b
	xorb	%dl,%r10b

	leaq	.Lkmasklut(%rip),%rdx

	movb	%r14b,%r13b
	andq	$0xf,%r14
	vpsubq	.Lmask52x4(%rip),%ymm3,%ymm0
	shlq	$5,%r14
	vmovapd	(%rdx,%r14,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm3,%ymm3

	shrb	$4,%r13b
	andq	$0xf,%r13
	vpsubq	.Lmask52x4(%rip),%ymm5,%ymm0
	shlq	$5,%r13
	vmovapd	(%rdx,%r13,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm5,%ymm5

	movb	%r12b,%r11b
	andq	$0xf,%r12
	vpsubq	.Lmask52x4(%rip),%ymm6,%ymm0
	shlq	$5,%r12
	vmovapd	(%rdx,%r12,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm6,%ymm6

	shrb	$4,%r11b
	andq	$0xf,%r11
	vpsubq	.Lmask52x4(%rip),%ymm7,%ymm0
	shlq	$5,%r11
	vmovapd	(%rdx,%r11,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm7,%ymm7

	andq	$0xf,%r10
	vpsubq	.Lmask52x4(%rip),%ymm8,%ymm0
	shlq	$5,%r10
	vmovapd	(%rdx,%r10,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm8,%ymm8


	vpand	.Lmask52x4(%rip),%ymm3,%ymm3
	vpand	.Lmask52x4(%rip),%ymm5,%ymm5
	vpand	.Lmask52x4(%rip),%ymm6,%ymm6
	vpand	.Lmask52x4(%rip),%ymm7,%ymm7
	vpand	.Lmask52x4(%rip),%ymm8,%ymm8

	vmovq	%r15,%xmm0
	vpbroadcastq	%xmm0,%ymm0
	vpblendd	$3,%ymm0,%ymm4,%ymm4



	vpsrlq	$52,%ymm4,%ymm0
	vpsrlq	$52,%ymm9,%ymm1
	vpsrlq	$52,%ymm10,%ymm2
	vpsrlq	$52,%ymm11,%ymm13
	vpsrlq	$52,%ymm12,%ymm14


	vpermq	$144,%ymm14,%ymm14
	vpermq	$3,%ymm13,%ymm15
	vblendpd	$1,%ymm15,%ymm14,%ymm14

	vpermq	$144,%ymm13,%ymm13
	vpermq	$3,%ymm2,%ymm15
	vblendpd	$1,%ymm15,%ymm13,%ymm13

	vpermq	$144,%ymm2,%ymm2
	vpermq	$3,%ymm1,%ymm15
	vblendpd	$1,%ymm15,%ymm2,%ymm2

	vpermq	$144,%ymm1,%ymm1
	vpermq	$3,%ymm0,%ymm15
	vblendpd	$1,%ymm15,%ymm1,%ymm1

	vpermq	$144,%ymm0,%ymm0
	vpand	.Lhigh64x3(%rip),%ymm0,%ymm0


	vpand	.Lmask52x4(%rip),%ymm4,%ymm4
	vpand	.Lmask52x4(%rip),%ymm9,%ymm9
	vpand	.Lmask52x4(%rip),%ymm10,%ymm10
	vpand	.Lmask52x4(%rip),%ymm11,%ymm11
	vpand	.Lmask52x4(%rip),%ymm12,%ymm12


	vpaddq	%ymm0,%ymm4,%ymm4
	vpaddq	%ymm1,%ymm9,%ymm9
	vpaddq	%ymm2,%ymm10,%ymm10
	vpaddq	%ymm13,%ymm11,%ymm11
	vpaddq	%ymm14,%ymm12,%ymm12



	vpcmpgtq	.Lmask52x4(%rip),%ymm4,%ymm0
	vpcmpgtq	.Lmask52x4(%rip),%ymm9,%ymm1
	vpcmpgtq	.Lmask52x4(%rip),%ymm10,%ymm2
	vpcmpgtq	.Lmask52x4(%rip),%ymm11,%ymm13
	vpcmpgtq	.Lmask52x4(%rip),%ymm12,%ymm14
	vmovmskpd	%ymm0,%r14d
	vmovmskpd	%ymm1,%r13d
	vmovmskpd	%ymm2,%r12d
	vmovmskpd	%ymm13,%r11d
	vmovmskpd	%ymm14,%r10d


	vpcmpeqq	.Lmask52x4(%rip),%ymm4,%ymm0
	vpcmpeqq	.Lmask52x4(%rip),%ymm9,%ymm1
	vpcmpeqq	.Lmask52x4(%rip),%ymm10,%ymm2
	vpcmpeqq	.Lmask52x4(%rip),%ymm11,%ymm13
	vpcmpeqq	.Lmask52x4(%rip),%ymm12,%ymm14
	vmovmskpd	%ymm0,%r9d
	vmovmskpd	%ymm1,%r8d
	vmovmskpd	%ymm2,%ebx
	vmovmskpd	%ymm13,%ecx
	vmovmskpd	%ymm14,%edx



	shlb	$4,%r13b
	orb	%r13b,%r14b
	shlb	$4,%r11b
	orb	%r11b,%r12b

	addb	%r14b,%r14b
	adcb	%r12b,%r12b
	adcb	%r10b,%r10b

	shlb	$4,%r8b
	orb	%r8b,%r9b
	shlb	$4,%cl
	orb	%cl,%bl

	addb	%r9b,%r14b
	adcb	%bl,%r12b
	adcb	%dl,%r10b

	xorb	%r9b,%r14b
	xorb	%bl,%r12b
	xorb	%dl,%r10b

	leaq	.Lkmasklut(%rip),%rdx

	movb	%r14b,%r13b
	andq	$0xf,%r14
	vpsubq	.Lmask52x4(%rip),%ymm4,%ymm0
	shlq	$5,%r14
	vmovapd	(%rdx,%r14,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm4,%ymm4

	shrb	$4,%r13b
	andq	$0xf,%r13
	vpsubq	.Lmask52x4(%rip),%ymm9,%ymm0
	shlq	$5,%r13
	vmovapd	(%rdx,%r13,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm9,%ymm9

	movb	%r12b,%r11b
	andq	$0xf,%r12
	vpsubq	.Lmask52x4(%rip),%ymm10,%ymm0
	shlq	$5,%r12
	vmovapd	(%rdx,%r12,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm10,%ymm10

	shrb	$4,%r11b
	andq	$0xf,%r11
	vpsubq	.Lmask52x4(%rip),%ymm11,%ymm0
	shlq	$5,%r11
	vmovapd	(%rdx,%r11,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm11,%ymm11

	andq	$0xf,%r10
	vpsubq	.Lmask52x4(%rip),%ymm12,%ymm0
	shlq	$5,%r10
	vmovapd	(%rdx,%r10,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm12,%ymm12


	vpand	.Lmask52x4(%rip),%ymm4,%ymm4
	vpand	.Lmask52x4(%rip),%ymm9,%ymm9
	vpand	.Lmask52x4(%rip),%ymm10,%ymm10
	vpand	.Lmask52x4(%rip),%ymm11,%ymm11
	vpand	.Lmask52x4(%rip),%ymm12,%ymm12

	vmovdqu	%ymm3,0(%rdi)
	vmovdqu	%ymm5,32(%rdi)
	vmovdqu	%ymm6,64(%rdi)
	vmovdqu	%ymm7,96(%rdi)
	vmovdqu	%ymm8,128(%rdi)

	vmovdqu	%ymm4,160(%rdi)
	vmovdqu	%ymm9,192(%rdi)
	vmovdqu	%ymm10,224(%rdi)
	vmovdqu	%ymm11,256(%rdi)
	vmovdqu	%ymm12,288(%rdi)

	vzeroupper
	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lossl_rsaz_amm52x20_x2_avxifma256_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ossl_rsaz_amm52x20_x2_avxifma256, .-ossl_rsaz_amm52x20_x2_avxifma256
.text

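# Constant-time gather of two 20-limb multipliers from a window-5
# precomputed table (32 entries of 320 bytes). Every entry is read and
# blended in on an index match, so the access pattern is independent of
# the secret indices. Assumed arguments: %rdi = out, %rsi = table,
# %rdx and %rcx = the two table indices.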
.align	32
.globl	ossl_extract_multiplier_2x20_win5_avx
.type	ossl_extract_multiplier_2x20_win5_avx,@function
ossl_extract_multiplier_2x20_win5_avx:
.cfi_startproc
.byte	243,15,30,250
	vmovapd	.Lones(%rip),%ymm14
	vmovq	%rdx,%xmm10
	vpbroadcastq	%xmm10,%ymm12
	vmovq	%rcx,%xmm10
	vpbroadcastq	%xmm10,%ymm13
	leaq	10240(%rsi),%rax
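# %rax = end of table: 32 entries x 320 bytes = 10240.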


	vpxor	%xmm0,%xmm0,%xmm0
	vmovapd	%ymm0,%ymm11
	vmovapd	%ymm0,%ymm1
	vmovapd	%ymm0,%ymm2
	vmovapd	%ymm0,%ymm3
	vmovapd	%ymm0,%ymm4
	vmovapd	%ymm0,%ymm5
	vmovapd	%ymm0,%ymm6
	vmovapd	%ymm0,%ymm7
	vmovapd	%ymm0,%ymm8
	vmovapd	%ymm0,%ymm9

.align	32
.Lloop:
	vpcmpeqq	%ymm11,%ymm12,%ymm15
	vmovdqu	0(%rsi),%ymm10
	vblendvpd	%ymm15,%ymm10,%ymm0,%ymm0
	vmovdqu	32(%rsi),%ymm10
	vblendvpd	%ymm15,%ymm10,%ymm1,%ymm1
	vmovdqu	64(%rsi),%ymm10
	vblendvpd	%ymm15,%ymm10,%ymm2,%ymm2
	vmovdqu	96(%rsi),%ymm10
	vblendvpd	%ymm15,%ymm10,%ymm3,%ymm3
	vmovdqu	128(%rsi),%ymm10
	vblendvpd	%ymm15,%ymm10,%ymm4,%ymm4
	vpcmpeqq	%ymm11,%ymm13,%ymm15
	vmovdqu	160(%rsi),%ymm10
	vblendvpd	%ymm15,%ymm10,%ymm5,%ymm5
	vmovdqu	192(%rsi),%ymm10
	vblendvpd	%ymm15,%ymm10,%ymm6,%ymm6
	vmovdqu	224(%rsi),%ymm10
	vblendvpd	%ymm15,%ymm10,%ymm7,%ymm7
	vmovdqu	256(%rsi),%ymm10
	vblendvpd	%ymm15,%ymm10,%ymm8,%ymm8
	vmovdqu	288(%rsi),%ymm10
	vblendvpd	%ymm15,%ymm10,%ymm9,%ymm9
	vpaddq	%ymm14,%ymm11,%ymm11
	addq	$320,%rsi
	cmpq	%rsi,%rax
	jne	.Lloop
	vmovdqu	%ymm0,0(%rdi)
	vmovdqu	%ymm1,32(%rdi)
	vmovdqu	%ymm2,64(%rdi)
	vmovdqu	%ymm3,96(%rdi)
	vmovdqu	%ymm4,128(%rdi)
	vmovdqu	%ymm5,160(%rdi)
	vmovdqu	%ymm6,192(%rdi)
	vmovdqu	%ymm7,224(%rdi)
	vmovdqu	%ymm8,256(%rdi)
	vmovdqu	%ymm9,288(%rdi)
	.byte	0xf3,0xc3
.cfi_endproc
.size	ossl_extract_multiplier_2x20_win5_avx, .-ossl_extract_multiplier_2x20_win5_avx
.section	.rodata
.align	32
.Lones:
.quad	1,1,1,1
.Lzeros:
.quad	0,0,0,0
	.section ".note.gnu.property", "a"
	.p2align 3
	.long 1f - 0f
	.long 4f - 1f
	.long 5
0:
	# "GNU" encoded with .byte, since .asciz isn't supported
	# on Solaris.
	.byte 0x47
	.byte 0x4e
	.byte 0x55
	.byte 0
1:
	.p2align 3
	.long 0xc0000002
	.long 3f - 2f
2:
	.long 3
3:
	.p2align 3
4:
