/* xref: /freebsd/sys/crypto/openssl/amd64/rsaz-3k-avxifma.S (revision 4757b351ea9d59d71d4a38b82506d2d16fcd560d) */
/* Do not modify. This file is auto-generated from rsaz-3k-avxifma.pl. */
.text

#
# ossl_rsaz_amm52x30_x1_avxifma256
#
# Almost-Montgomery multiplication over 30 x 52-bit digits (3K-bit RSA),
# AVX-IFMA (256-bit {vex}-encoded VPMADD52) flavour.  C-equivalent signature
# per the usual rsaz perlasm convention -- TODO confirm against
# rsaz-3k-avxifma.pl:
#
#   void ossl_rsaz_amm52x30_x1_avxifma256(BN_ULONG *res, const BN_ULONG *a,
#                                         const BN_ULONG *b, const BN_ULONG *m,
#                                         BN_ULONG k0);
#
# ABI:  SysV AMD64.
# In:   rdi = res (8x 32-byte stores at the end)
#       rsi = a   (read as 8 ymm vectors, offsets 0..224)
#       rdx = b   (copied to r11; consumed one 64-bit digit at a time)
#       rcx = m   (read as 8 ymm vectors, offsets 0..224)
#       r8  = k0  (multiplied into the low accumulator to form the per-digit
#                  reduction factor -- presumably -1/m[0] mod 2^52; confirm)
# Regs: rax = 0xfffffffffffff (52-bit digit mask)
#       r9  = scalar accumulator/carry for digit 0, folded back each round
#       ebx = group counter: 7 groups of 4 digits in .Lloop7, plus a 2-digit
#             tail after the loop = 30 digits of b in total
#       ymm3..ymm10 = 30-digit redundant accumulator (4 digits per register)
# Note: no calls are made, so the 264-byte scratch area is simply carved from
#       rsp around each round; CFI tracking relies on the final
#       .cfi_def_cfa_register/.cfi_def_cfa pair in the epilogue.
#
.globl	ossl_rsaz_amm52x30_x1_avxifma256
.type	ossl_rsaz_amm52x30_x1_avxifma256,@function
.align	32
ossl_rsaz_amm52x30_x1_avxifma256:
.cfi_startproc
.byte	243,15,30,250			# endbr64
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56

	# Zero the 8-register vector accumulator.
	vpxor	%ymm0,%ymm0,%ymm0
	vmovapd	%ymm0,%ymm3
	vmovapd	%ymm0,%ymm4
	vmovapd	%ymm0,%ymm5
	vmovapd	%ymm0,%ymm6
	vmovapd	%ymm0,%ymm7
	vmovapd	%ymm0,%ymm8
	vmovapd	%ymm0,%ymm9
	vmovapd	%ymm0,%ymm10

	xorl	%r9d,%r9d		# digit-0 scalar accumulator = 0

	movq	%rdx,%r11		# r11 = b (rdx is clobbered by mulx below)
	movq	$0xfffffffffffff,%rax	# 52-bit mask


	movl	$7,%ebx			# 7 groups x 4 digits, then 2-digit tail

.align	32
.Lloop7:
	# ---- digit b[4*i+0]: scalar low-word bookkeeping ----
	movq	0(%r11),%r13		# r13 = current digit of b

	vpbroadcastq	0(%r11),%ymm1	# ymm1 = broadcast b-digit
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12		# r12:r13 = a[0] * b-digit
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13		# reduction factor = k0 * acc
	andq	%rax,%r13		# ... masked to 52 bits

	vmovq	%r13,%xmm2
	vpbroadcastq	%xmm2,%ymm2	# ymm2 = broadcast reduction factor
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12		# r12:r13 = m[0] * factor
	addq	%r13,%r9
	adcq	%r12,%r10

	# Fold the 64-bit carry pair down to the next 52-bit digit position.
	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	leaq	-264(%rsp),%rsp		# scratch for shifted accumulator spill

	# acc += a * b-digit (low 52-bit halves)
{vex}	vpmadd52luq	0(%rsi),%ymm1,%ymm3
{vex}	vpmadd52luq	32(%rsi),%ymm1,%ymm4
{vex}	vpmadd52luq	64(%rsi),%ymm1,%ymm5
{vex}	vpmadd52luq	96(%rsi),%ymm1,%ymm6
{vex}	vpmadd52luq	128(%rsi),%ymm1,%ymm7
{vex}	vpmadd52luq	160(%rsi),%ymm1,%ymm8
{vex}	vpmadd52luq	192(%rsi),%ymm1,%ymm9
{vex}	vpmadd52luq	224(%rsi),%ymm1,%ymm10

	# acc += m * reduction factor (low 52-bit halves)
{vex}	vpmadd52luq	0(%rcx),%ymm2,%ymm3
{vex}	vpmadd52luq	32(%rcx),%ymm2,%ymm4
{vex}	vpmadd52luq	64(%rcx),%ymm2,%ymm5
{vex}	vpmadd52luq	96(%rcx),%ymm2,%ymm6
{vex}	vpmadd52luq	128(%rcx),%ymm2,%ymm7
{vex}	vpmadd52luq	160(%rcx),%ymm2,%ymm8
{vex}	vpmadd52luq	192(%rcx),%ymm2,%ymm9
{vex}	vpmadd52luq	224(%rcx),%ymm2,%ymm10

	# Shift the whole accumulator right by one 64-bit lane via memory:
	# spill, then reload 8 bytes in (digit 0 falls off; zero shifted in).
	vmovdqu	%ymm3,0(%rsp)
	vmovdqu	%ymm4,32(%rsp)
	vmovdqu	%ymm5,64(%rsp)
	vmovdqu	%ymm6,96(%rsp)
	vmovdqu	%ymm7,128(%rsp)
	vmovdqu	%ymm8,160(%rsp)
	vmovdqu	%ymm9,192(%rsp)
	vmovdqu	%ymm10,224(%rsp)
	movq	$0,256(%rsp)

	vmovdqu	8(%rsp),%ymm3
	vmovdqu	40(%rsp),%ymm4
	vmovdqu	72(%rsp),%ymm5
	vmovdqu	104(%rsp),%ymm6
	vmovdqu	136(%rsp),%ymm7
	vmovdqu	168(%rsp),%ymm8
	vmovdqu	200(%rsp),%ymm9
	vmovdqu	232(%rsp),%ymm10

	addq	8(%rsp),%r9		# fold new digit 0 into scalar acc

	# acc += a * b-digit (high 52-bit halves)
{vex}	vpmadd52huq	0(%rsi),%ymm1,%ymm3
{vex}	vpmadd52huq	32(%rsi),%ymm1,%ymm4
{vex}	vpmadd52huq	64(%rsi),%ymm1,%ymm5
{vex}	vpmadd52huq	96(%rsi),%ymm1,%ymm6
{vex}	vpmadd52huq	128(%rsi),%ymm1,%ymm7
{vex}	vpmadd52huq	160(%rsi),%ymm1,%ymm8
{vex}	vpmadd52huq	192(%rsi),%ymm1,%ymm9
{vex}	vpmadd52huq	224(%rsi),%ymm1,%ymm10

	# acc += m * reduction factor (high 52-bit halves)
{vex}	vpmadd52huq	0(%rcx),%ymm2,%ymm3
{vex}	vpmadd52huq	32(%rcx),%ymm2,%ymm4
{vex}	vpmadd52huq	64(%rcx),%ymm2,%ymm5
{vex}	vpmadd52huq	96(%rcx),%ymm2,%ymm6
{vex}	vpmadd52huq	128(%rcx),%ymm2,%ymm7
{vex}	vpmadd52huq	160(%rcx),%ymm2,%ymm8
{vex}	vpmadd52huq	192(%rcx),%ymm2,%ymm9
{vex}	vpmadd52huq	224(%rcx),%ymm2,%ymm10

	leaq	264(%rsp),%rsp
	# ---- digit b[4*i+1]: same round with the next digit ----
	movq	8(%r11),%r13

	vpbroadcastq	8(%r11),%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vmovq	%r13,%xmm2
	vpbroadcastq	%xmm2,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	leaq	-264(%rsp),%rsp

{vex}	vpmadd52luq	0(%rsi),%ymm1,%ymm3
{vex}	vpmadd52luq	32(%rsi),%ymm1,%ymm4
{vex}	vpmadd52luq	64(%rsi),%ymm1,%ymm5
{vex}	vpmadd52luq	96(%rsi),%ymm1,%ymm6
{vex}	vpmadd52luq	128(%rsi),%ymm1,%ymm7
{vex}	vpmadd52luq	160(%rsi),%ymm1,%ymm8
{vex}	vpmadd52luq	192(%rsi),%ymm1,%ymm9
{vex}	vpmadd52luq	224(%rsi),%ymm1,%ymm10

{vex}	vpmadd52luq	0(%rcx),%ymm2,%ymm3
{vex}	vpmadd52luq	32(%rcx),%ymm2,%ymm4
{vex}	vpmadd52luq	64(%rcx),%ymm2,%ymm5
{vex}	vpmadd52luq	96(%rcx),%ymm2,%ymm6
{vex}	vpmadd52luq	128(%rcx),%ymm2,%ymm7
{vex}	vpmadd52luq	160(%rcx),%ymm2,%ymm8
{vex}	vpmadd52luq	192(%rcx),%ymm2,%ymm9
{vex}	vpmadd52luq	224(%rcx),%ymm2,%ymm10


	vmovdqu	%ymm3,0(%rsp)
	vmovdqu	%ymm4,32(%rsp)
	vmovdqu	%ymm5,64(%rsp)
	vmovdqu	%ymm6,96(%rsp)
	vmovdqu	%ymm7,128(%rsp)
	vmovdqu	%ymm8,160(%rsp)
	vmovdqu	%ymm9,192(%rsp)
	vmovdqu	%ymm10,224(%rsp)
	movq	$0,256(%rsp)

	vmovdqu	8(%rsp),%ymm3
	vmovdqu	40(%rsp),%ymm4
	vmovdqu	72(%rsp),%ymm5
	vmovdqu	104(%rsp),%ymm6
	vmovdqu	136(%rsp),%ymm7
	vmovdqu	168(%rsp),%ymm8
	vmovdqu	200(%rsp),%ymm9
	vmovdqu	232(%rsp),%ymm10

	addq	8(%rsp),%r9

{vex}	vpmadd52huq	0(%rsi),%ymm1,%ymm3
{vex}	vpmadd52huq	32(%rsi),%ymm1,%ymm4
{vex}	vpmadd52huq	64(%rsi),%ymm1,%ymm5
{vex}	vpmadd52huq	96(%rsi),%ymm1,%ymm6
{vex}	vpmadd52huq	128(%rsi),%ymm1,%ymm7
{vex}	vpmadd52huq	160(%rsi),%ymm1,%ymm8
{vex}	vpmadd52huq	192(%rsi),%ymm1,%ymm9
{vex}	vpmadd52huq	224(%rsi),%ymm1,%ymm10

{vex}	vpmadd52huq	0(%rcx),%ymm2,%ymm3
{vex}	vpmadd52huq	32(%rcx),%ymm2,%ymm4
{vex}	vpmadd52huq	64(%rcx),%ymm2,%ymm5
{vex}	vpmadd52huq	96(%rcx),%ymm2,%ymm6
{vex}	vpmadd52huq	128(%rcx),%ymm2,%ymm7
{vex}	vpmadd52huq	160(%rcx),%ymm2,%ymm8
{vex}	vpmadd52huq	192(%rcx),%ymm2,%ymm9
{vex}	vpmadd52huq	224(%rcx),%ymm2,%ymm10

	leaq	264(%rsp),%rsp
	# ---- digit b[4*i+2] ----
	movq	16(%r11),%r13

	vpbroadcastq	16(%r11),%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vmovq	%r13,%xmm2
	vpbroadcastq	%xmm2,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	leaq	-264(%rsp),%rsp

{vex}	vpmadd52luq	0(%rsi),%ymm1,%ymm3
{vex}	vpmadd52luq	32(%rsi),%ymm1,%ymm4
{vex}	vpmadd52luq	64(%rsi),%ymm1,%ymm5
{vex}	vpmadd52luq	96(%rsi),%ymm1,%ymm6
{vex}	vpmadd52luq	128(%rsi),%ymm1,%ymm7
{vex}	vpmadd52luq	160(%rsi),%ymm1,%ymm8
{vex}	vpmadd52luq	192(%rsi),%ymm1,%ymm9
{vex}	vpmadd52luq	224(%rsi),%ymm1,%ymm10

{vex}	vpmadd52luq	0(%rcx),%ymm2,%ymm3
{vex}	vpmadd52luq	32(%rcx),%ymm2,%ymm4
{vex}	vpmadd52luq	64(%rcx),%ymm2,%ymm5
{vex}	vpmadd52luq	96(%rcx),%ymm2,%ymm6
{vex}	vpmadd52luq	128(%rcx),%ymm2,%ymm7
{vex}	vpmadd52luq	160(%rcx),%ymm2,%ymm8
{vex}	vpmadd52luq	192(%rcx),%ymm2,%ymm9
{vex}	vpmadd52luq	224(%rcx),%ymm2,%ymm10


	vmovdqu	%ymm3,0(%rsp)
	vmovdqu	%ymm4,32(%rsp)
	vmovdqu	%ymm5,64(%rsp)
	vmovdqu	%ymm6,96(%rsp)
	vmovdqu	%ymm7,128(%rsp)
	vmovdqu	%ymm8,160(%rsp)
	vmovdqu	%ymm9,192(%rsp)
	vmovdqu	%ymm10,224(%rsp)
	movq	$0,256(%rsp)

	vmovdqu	8(%rsp),%ymm3
	vmovdqu	40(%rsp),%ymm4
	vmovdqu	72(%rsp),%ymm5
	vmovdqu	104(%rsp),%ymm6
	vmovdqu	136(%rsp),%ymm7
	vmovdqu	168(%rsp),%ymm8
	vmovdqu	200(%rsp),%ymm9
	vmovdqu	232(%rsp),%ymm10

	addq	8(%rsp),%r9

{vex}	vpmadd52huq	0(%rsi),%ymm1,%ymm3
{vex}	vpmadd52huq	32(%rsi),%ymm1,%ymm4
{vex}	vpmadd52huq	64(%rsi),%ymm1,%ymm5
{vex}	vpmadd52huq	96(%rsi),%ymm1,%ymm6
{vex}	vpmadd52huq	128(%rsi),%ymm1,%ymm7
{vex}	vpmadd52huq	160(%rsi),%ymm1,%ymm8
{vex}	vpmadd52huq	192(%rsi),%ymm1,%ymm9
{vex}	vpmadd52huq	224(%rsi),%ymm1,%ymm10

{vex}	vpmadd52huq	0(%rcx),%ymm2,%ymm3
{vex}	vpmadd52huq	32(%rcx),%ymm2,%ymm4
{vex}	vpmadd52huq	64(%rcx),%ymm2,%ymm5
{vex}	vpmadd52huq	96(%rcx),%ymm2,%ymm6
{vex}	vpmadd52huq	128(%rcx),%ymm2,%ymm7
{vex}	vpmadd52huq	160(%rcx),%ymm2,%ymm8
{vex}	vpmadd52huq	192(%rcx),%ymm2,%ymm9
{vex}	vpmadd52huq	224(%rcx),%ymm2,%ymm10

	leaq	264(%rsp),%rsp
	# ---- digit b[4*i+3] ----
	movq	24(%r11),%r13

	vpbroadcastq	24(%r11),%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vmovq	%r13,%xmm2
	vpbroadcastq	%xmm2,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	leaq	-264(%rsp),%rsp

{vex}	vpmadd52luq	0(%rsi),%ymm1,%ymm3
{vex}	vpmadd52luq	32(%rsi),%ymm1,%ymm4
{vex}	vpmadd52luq	64(%rsi),%ymm1,%ymm5
{vex}	vpmadd52luq	96(%rsi),%ymm1,%ymm6
{vex}	vpmadd52luq	128(%rsi),%ymm1,%ymm7
{vex}	vpmadd52luq	160(%rsi),%ymm1,%ymm8
{vex}	vpmadd52luq	192(%rsi),%ymm1,%ymm9
{vex}	vpmadd52luq	224(%rsi),%ymm1,%ymm10

{vex}	vpmadd52luq	0(%rcx),%ymm2,%ymm3
{vex}	vpmadd52luq	32(%rcx),%ymm2,%ymm4
{vex}	vpmadd52luq	64(%rcx),%ymm2,%ymm5
{vex}	vpmadd52luq	96(%rcx),%ymm2,%ymm6
{vex}	vpmadd52luq	128(%rcx),%ymm2,%ymm7
{vex}	vpmadd52luq	160(%rcx),%ymm2,%ymm8
{vex}	vpmadd52luq	192(%rcx),%ymm2,%ymm9
{vex}	vpmadd52luq	224(%rcx),%ymm2,%ymm10


	vmovdqu	%ymm3,0(%rsp)
	vmovdqu	%ymm4,32(%rsp)
	vmovdqu	%ymm5,64(%rsp)
	vmovdqu	%ymm6,96(%rsp)
	vmovdqu	%ymm7,128(%rsp)
	vmovdqu	%ymm8,160(%rsp)
	vmovdqu	%ymm9,192(%rsp)
	vmovdqu	%ymm10,224(%rsp)
	movq	$0,256(%rsp)

	vmovdqu	8(%rsp),%ymm3
	vmovdqu	40(%rsp),%ymm4
	vmovdqu	72(%rsp),%ymm5
	vmovdqu	104(%rsp),%ymm6
	vmovdqu	136(%rsp),%ymm7
	vmovdqu	168(%rsp),%ymm8
	vmovdqu	200(%rsp),%ymm9
	vmovdqu	232(%rsp),%ymm10

	addq	8(%rsp),%r9

{vex}	vpmadd52huq	0(%rsi),%ymm1,%ymm3
{vex}	vpmadd52huq	32(%rsi),%ymm1,%ymm4
{vex}	vpmadd52huq	64(%rsi),%ymm1,%ymm5
{vex}	vpmadd52huq	96(%rsi),%ymm1,%ymm6
{vex}	vpmadd52huq	128(%rsi),%ymm1,%ymm7
{vex}	vpmadd52huq	160(%rsi),%ymm1,%ymm8
{vex}	vpmadd52huq	192(%rsi),%ymm1,%ymm9
{vex}	vpmadd52huq	224(%rsi),%ymm1,%ymm10

{vex}	vpmadd52huq	0(%rcx),%ymm2,%ymm3
{vex}	vpmadd52huq	32(%rcx),%ymm2,%ymm4
{vex}	vpmadd52huq	64(%rcx),%ymm2,%ymm5
{vex}	vpmadd52huq	96(%rcx),%ymm2,%ymm6
{vex}	vpmadd52huq	128(%rcx),%ymm2,%ymm7
{vex}	vpmadd52huq	160(%rcx),%ymm2,%ymm8
{vex}	vpmadd52huq	192(%rcx),%ymm2,%ymm9
{vex}	vpmadd52huq	224(%rcx),%ymm2,%ymm10

	leaq	264(%rsp),%rsp
	leaq	32(%r11),%r11		# advance b by 4 digits
	decl	%ebx
	jne	.Lloop7
	# ---- tail: digits b[28] and b[29] ----
	movq	0(%r11),%r13

	vpbroadcastq	0(%r11),%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vmovq	%r13,%xmm2
	vpbroadcastq	%xmm2,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	leaq	-264(%rsp),%rsp

{vex}	vpmadd52luq	0(%rsi),%ymm1,%ymm3
{vex}	vpmadd52luq	32(%rsi),%ymm1,%ymm4
{vex}	vpmadd52luq	64(%rsi),%ymm1,%ymm5
{vex}	vpmadd52luq	96(%rsi),%ymm1,%ymm6
{vex}	vpmadd52luq	128(%rsi),%ymm1,%ymm7
{vex}	vpmadd52luq	160(%rsi),%ymm1,%ymm8
{vex}	vpmadd52luq	192(%rsi),%ymm1,%ymm9
{vex}	vpmadd52luq	224(%rsi),%ymm1,%ymm10

{vex}	vpmadd52luq	0(%rcx),%ymm2,%ymm3
{vex}	vpmadd52luq	32(%rcx),%ymm2,%ymm4
{vex}	vpmadd52luq	64(%rcx),%ymm2,%ymm5
{vex}	vpmadd52luq	96(%rcx),%ymm2,%ymm6
{vex}	vpmadd52luq	128(%rcx),%ymm2,%ymm7
{vex}	vpmadd52luq	160(%rcx),%ymm2,%ymm8
{vex}	vpmadd52luq	192(%rcx),%ymm2,%ymm9
{vex}	vpmadd52luq	224(%rcx),%ymm2,%ymm10


	vmovdqu	%ymm3,0(%rsp)
	vmovdqu	%ymm4,32(%rsp)
	vmovdqu	%ymm5,64(%rsp)
	vmovdqu	%ymm6,96(%rsp)
	vmovdqu	%ymm7,128(%rsp)
	vmovdqu	%ymm8,160(%rsp)
	vmovdqu	%ymm9,192(%rsp)
	vmovdqu	%ymm10,224(%rsp)
	movq	$0,256(%rsp)

	vmovdqu	8(%rsp),%ymm3
	vmovdqu	40(%rsp),%ymm4
	vmovdqu	72(%rsp),%ymm5
	vmovdqu	104(%rsp),%ymm6
	vmovdqu	136(%rsp),%ymm7
	vmovdqu	168(%rsp),%ymm8
	vmovdqu	200(%rsp),%ymm9
	vmovdqu	232(%rsp),%ymm10

	addq	8(%rsp),%r9

{vex}	vpmadd52huq	0(%rsi),%ymm1,%ymm3
{vex}	vpmadd52huq	32(%rsi),%ymm1,%ymm4
{vex}	vpmadd52huq	64(%rsi),%ymm1,%ymm5
{vex}	vpmadd52huq	96(%rsi),%ymm1,%ymm6
{vex}	vpmadd52huq	128(%rsi),%ymm1,%ymm7
{vex}	vpmadd52huq	160(%rsi),%ymm1,%ymm8
{vex}	vpmadd52huq	192(%rsi),%ymm1,%ymm9
{vex}	vpmadd52huq	224(%rsi),%ymm1,%ymm10

{vex}	vpmadd52huq	0(%rcx),%ymm2,%ymm3
{vex}	vpmadd52huq	32(%rcx),%ymm2,%ymm4
{vex}	vpmadd52huq	64(%rcx),%ymm2,%ymm5
{vex}	vpmadd52huq	96(%rcx),%ymm2,%ymm6
{vex}	vpmadd52huq	128(%rcx),%ymm2,%ymm7
{vex}	vpmadd52huq	160(%rcx),%ymm2,%ymm8
{vex}	vpmadd52huq	192(%rcx),%ymm2,%ymm9
{vex}	vpmadd52huq	224(%rcx),%ymm2,%ymm10

	leaq	264(%rsp),%rsp
	movq	8(%r11),%r13

	vpbroadcastq	8(%r11),%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vmovq	%r13,%xmm2
	vpbroadcastq	%xmm2,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	leaq	-264(%rsp),%rsp

{vex}	vpmadd52luq	0(%rsi),%ymm1,%ymm3
{vex}	vpmadd52luq	32(%rsi),%ymm1,%ymm4
{vex}	vpmadd52luq	64(%rsi),%ymm1,%ymm5
{vex}	vpmadd52luq	96(%rsi),%ymm1,%ymm6
{vex}	vpmadd52luq	128(%rsi),%ymm1,%ymm7
{vex}	vpmadd52luq	160(%rsi),%ymm1,%ymm8
{vex}	vpmadd52luq	192(%rsi),%ymm1,%ymm9
{vex}	vpmadd52luq	224(%rsi),%ymm1,%ymm10

{vex}	vpmadd52luq	0(%rcx),%ymm2,%ymm3
{vex}	vpmadd52luq	32(%rcx),%ymm2,%ymm4
{vex}	vpmadd52luq	64(%rcx),%ymm2,%ymm5
{vex}	vpmadd52luq	96(%rcx),%ymm2,%ymm6
{vex}	vpmadd52luq	128(%rcx),%ymm2,%ymm7
{vex}	vpmadd52luq	160(%rcx),%ymm2,%ymm8
{vex}	vpmadd52luq	192(%rcx),%ymm2,%ymm9
{vex}	vpmadd52luq	224(%rcx),%ymm2,%ymm10


	vmovdqu	%ymm3,0(%rsp)
	vmovdqu	%ymm4,32(%rsp)
	vmovdqu	%ymm5,64(%rsp)
	vmovdqu	%ymm6,96(%rsp)
	vmovdqu	%ymm7,128(%rsp)
	vmovdqu	%ymm8,160(%rsp)
	vmovdqu	%ymm9,192(%rsp)
	vmovdqu	%ymm10,224(%rsp)
	movq	$0,256(%rsp)

	vmovdqu	8(%rsp),%ymm3
	vmovdqu	40(%rsp),%ymm4
	vmovdqu	72(%rsp),%ymm5
	vmovdqu	104(%rsp),%ymm6
	vmovdqu	136(%rsp),%ymm7
	vmovdqu	168(%rsp),%ymm8
	vmovdqu	200(%rsp),%ymm9
	vmovdqu	232(%rsp),%ymm10

	addq	8(%rsp),%r9

{vex}	vpmadd52huq	0(%rsi),%ymm1,%ymm3
{vex}	vpmadd52huq	32(%rsi),%ymm1,%ymm4
{vex}	vpmadd52huq	64(%rsi),%ymm1,%ymm5
{vex}	vpmadd52huq	96(%rsi),%ymm1,%ymm6
{vex}	vpmadd52huq	128(%rsi),%ymm1,%ymm7
{vex}	vpmadd52huq	160(%rsi),%ymm1,%ymm8
{vex}	vpmadd52huq	192(%rsi),%ymm1,%ymm9
{vex}	vpmadd52huq	224(%rsi),%ymm1,%ymm10

{vex}	vpmadd52huq	0(%rcx),%ymm2,%ymm3
{vex}	vpmadd52huq	32(%rcx),%ymm2,%ymm4
{vex}	vpmadd52huq	64(%rcx),%ymm2,%ymm5
{vex}	vpmadd52huq	96(%rcx),%ymm2,%ymm6
{vex}	vpmadd52huq	128(%rcx),%ymm2,%ymm7
{vex}	vpmadd52huq	160(%rcx),%ymm2,%ymm8
{vex}	vpmadd52huq	192(%rcx),%ymm2,%ymm9
{vex}	vpmadd52huq	224(%rcx),%ymm2,%ymm10

	leaq	264(%rsp),%rsp

	# ---- normalization: merge scalar digit 0 back into the vector acc ----
	vmovq	%r9,%xmm0
	vpbroadcastq	%xmm0,%ymm0
	vpblendd	$3,%ymm0,%ymm3,%ymm3

	# Extract the per-lane carries (bits 52..63 of every digit) and shift
	# them left by one 64-bit lane across the whole 8-register accumulator.
	vpsrlq	$52,%ymm3,%ymm0
	vpsrlq	$52,%ymm4,%ymm1
	vpsrlq	$52,%ymm5,%ymm2
	vpsrlq	$52,%ymm6,%ymm11
	vpsrlq	$52,%ymm7,%ymm12
	vpsrlq	$52,%ymm8,%ymm13
	vpsrlq	$52,%ymm9,%ymm14
	vpsrlq	$52,%ymm10,%ymm15

	leaq	-32(%rsp),%rsp
	vmovupd	%ymm3,(%rsp)		# ymm3 is needed as a temporary below

	# Rotate each carry vector left one lane, pulling the top lane of the
	# next-lower carry vector into lane 0.
	vpermq	$144,%ymm15,%ymm15
	vpermq	$3,%ymm14,%ymm3
	vblendpd	$1,%ymm3,%ymm15,%ymm15

	vpermq	$144,%ymm14,%ymm14
	vpermq	$3,%ymm13,%ymm3
	vblendpd	$1,%ymm3,%ymm14,%ymm14

	vpermq	$144,%ymm13,%ymm13
	vpermq	$3,%ymm12,%ymm3
	vblendpd	$1,%ymm3,%ymm13,%ymm13

	vpermq	$144,%ymm12,%ymm12
	vpermq	$3,%ymm11,%ymm3
	vblendpd	$1,%ymm3,%ymm12,%ymm12

	vpermq	$144,%ymm11,%ymm11
	vpermq	$3,%ymm2,%ymm3
	vblendpd	$1,%ymm3,%ymm11,%ymm11

	vpermq	$144,%ymm2,%ymm2
	vpermq	$3,%ymm1,%ymm3
	vblendpd	$1,%ymm3,%ymm2,%ymm2

	vpermq	$144,%ymm1,%ymm1
	vpermq	$3,%ymm0,%ymm3
	vblendpd	$1,%ymm3,%ymm1,%ymm1

	vpermq	$144,%ymm0,%ymm0
	vpand	.Lhigh64x3(%rip),%ymm0,%ymm0	# no carry into digit 0

	vmovupd	(%rsp),%ymm3
	leaq	32(%rsp),%rsp

	# Mask accumulator digits to 52 bits, then add the shifted carries.
	vpand	.Lmask52x4(%rip),%ymm3,%ymm3
	vpand	.Lmask52x4(%rip),%ymm4,%ymm4
	vpand	.Lmask52x4(%rip),%ymm5,%ymm5
	vpand	.Lmask52x4(%rip),%ymm6,%ymm6
	vpand	.Lmask52x4(%rip),%ymm7,%ymm7
	vpand	.Lmask52x4(%rip),%ymm8,%ymm8
	vpand	.Lmask52x4(%rip),%ymm9,%ymm9
	vpand	.Lmask52x4(%rip),%ymm10,%ymm10

	vpaddq	%ymm0,%ymm3,%ymm3
	vpaddq	%ymm1,%ymm4,%ymm4
	vpaddq	%ymm2,%ymm5,%ymm5
	vpaddq	%ymm11,%ymm6,%ymm6
	vpaddq	%ymm12,%ymm7,%ymm7
	vpaddq	%ymm13,%ymm8,%ymm8
	vpaddq	%ymm14,%ymm9,%ymm9
	vpaddq	%ymm15,%ymm10,%ymm10

	# Build per-digit "overflowed" (> mask) bitmasks, 4 bits per ymm,
	# packed into bytes r14b/r13b/r12b/r11b (low digits first).
	vpcmpgtq	.Lmask52x4(%rip),%ymm3,%ymm0
	vpcmpgtq	.Lmask52x4(%rip),%ymm4,%ymm1
	vmovmskpd	%ymm0,%r14d
	vmovmskpd	%ymm1,%r13d
	shlb	$4,%r13b
	orb	%r13b,%r14b

	vpcmpgtq	.Lmask52x4(%rip),%ymm5,%ymm2
	vpcmpgtq	.Lmask52x4(%rip),%ymm6,%ymm11
	vmovmskpd	%ymm2,%r13d
	vmovmskpd	%ymm11,%r12d
	shlb	$4,%r12b
	orb	%r12b,%r13b

	vpcmpgtq	.Lmask52x4(%rip),%ymm7,%ymm12
	vpcmpgtq	.Lmask52x4(%rip),%ymm8,%ymm13
	vmovmskpd	%ymm12,%r12d
	vmovmskpd	%ymm13,%r11d
	shlb	$4,%r11b
	orb	%r11b,%r12b

	vpcmpgtq	.Lmask52x4(%rip),%ymm9,%ymm14
	vpcmpgtq	.Lmask52x4(%rip),%ymm10,%ymm15
	vmovmskpd	%ymm14,%r11d
	vmovmskpd	%ymm15,%r10d
	shlb	$4,%r10b
	orb	%r10b,%r11b

	# Propagate: shift "overflow" masks up one digit via the add/adc chain.
	addb	%r14b,%r14b
	adcb	%r13b,%r13b
	adcb	%r12b,%r12b
	adcb	%r11b,%r11b

	# Build per-digit "saturated" (== mask) bitmasks the same way.
	vpcmpeqq	.Lmask52x4(%rip),%ymm3,%ymm0
	vpcmpeqq	.Lmask52x4(%rip),%ymm4,%ymm1
	vmovmskpd	%ymm0,%r9d
	vmovmskpd	%ymm1,%r8d
	shlb	$4,%r8b
	orb	%r8b,%r9b

	vpcmpeqq	.Lmask52x4(%rip),%ymm5,%ymm2
	vpcmpeqq	.Lmask52x4(%rip),%ymm6,%ymm11
	vmovmskpd	%ymm2,%r8d
	vmovmskpd	%ymm11,%edx
	shlb	$4,%dl
	orb	%dl,%r8b

	vpcmpeqq	.Lmask52x4(%rip),%ymm7,%ymm12
	vpcmpeqq	.Lmask52x4(%rip),%ymm8,%ymm13
	vmovmskpd	%ymm12,%edx
	vmovmskpd	%ymm13,%ecx
	shlb	$4,%cl
	orb	%cl,%dl

	vpcmpeqq	.Lmask52x4(%rip),%ymm9,%ymm14
	vpcmpeqq	.Lmask52x4(%rip),%ymm10,%ymm15
	vmovmskpd	%ymm14,%ecx
	vmovmskpd	%ymm15,%ebx
	shlb	$4,%bl
	orb	%bl,%cl

	# carry-select: ripple the overflow through runs of saturated digits
	addb	%r9b,%r14b
	adcb	%r8b,%r13b
	adcb	%dl,%r12b
	adcb	%cl,%r11b

	xorb	%r9b,%r14b
	xorb	%r8b,%r13b
	xorb	%dl,%r12b
	xorb	%cl,%r11b

	leaq	.Lkmasklut(%rip),%rdx	# rdx = blend-mask LUT base

	# For each group of 4 digits: conditionally subtract the 52-bit mask
	# (i.e. apply +1 borrow) in the lanes selected by the nibble masks.
	movb	%r14b,%r10b
	andq	$0xf,%r14
	vpsubq	.Lmask52x4(%rip),%ymm3,%ymm0
	shlq	$5,%r14
	vmovapd	(%rdx,%r14,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm3,%ymm3

	shrb	$4,%r10b
	andq	$0xf,%r10
	vpsubq	.Lmask52x4(%rip),%ymm4,%ymm0
	shlq	$5,%r10
	vmovapd	(%rdx,%r10,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm4,%ymm4

	movb	%r13b,%r10b
	andq	$0xf,%r13
	vpsubq	.Lmask52x4(%rip),%ymm5,%ymm0
	shlq	$5,%r13
	vmovapd	(%rdx,%r13,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm5,%ymm5

	shrb	$4,%r10b
	andq	$0xf,%r10
	vpsubq	.Lmask52x4(%rip),%ymm6,%ymm0
	shlq	$5,%r10
	vmovapd	(%rdx,%r10,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm6,%ymm6

	movb	%r12b,%r10b
	andq	$0xf,%r12
	vpsubq	.Lmask52x4(%rip),%ymm7,%ymm0
	shlq	$5,%r12
	vmovapd	(%rdx,%r12,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm7,%ymm7

	shrb	$4,%r10b
	andq	$0xf,%r10
	vpsubq	.Lmask52x4(%rip),%ymm8,%ymm0
	shlq	$5,%r10
	vmovapd	(%rdx,%r10,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm8,%ymm8

	movb	%r11b,%r10b
	andq	$0xf,%r11
	vpsubq	.Lmask52x4(%rip),%ymm9,%ymm0
	shlq	$5,%r11
	vmovapd	(%rdx,%r11,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm9,%ymm9

	shrb	$4,%r10b
	andq	$0xf,%r10
	vpsubq	.Lmask52x4(%rip),%ymm10,%ymm0
	shlq	$5,%r10
	vmovapd	(%rdx,%r10,1),%ymm2
	vblendvpd	%ymm2,%ymm0,%ymm10,%ymm10

	# Final 52-bit masking of every digit.
	vpand	.Lmask52x4(%rip),%ymm3,%ymm3
	vpand	.Lmask52x4(%rip),%ymm4,%ymm4
	vpand	.Lmask52x4(%rip),%ymm5,%ymm5
	vpand	.Lmask52x4(%rip),%ymm6,%ymm6
	vpand	.Lmask52x4(%rip),%ymm7,%ymm7
	vpand	.Lmask52x4(%rip),%ymm8,%ymm8
	vpand	.Lmask52x4(%rip),%ymm9,%ymm9

	vpand	.Lmask52x4(%rip),%ymm10,%ymm10

	# Store the 30-digit result (32 unused pad digits included by layout).
	vmovdqu	%ymm3,0(%rdi)
	vmovdqu	%ymm4,32(%rdi)
	vmovdqu	%ymm5,64(%rdi)
	vmovdqu	%ymm6,96(%rdi)
	vmovdqu	%ymm7,128(%rdi)
	vmovdqu	%ymm8,160(%rdi)
	vmovdqu	%ymm9,192(%rdi)
	vmovdqu	%ymm10,224(%rdi)

	vzeroupper			# leave AVX state clean for SSE callers
	leaq	(%rsp),%rax
.cfi_def_cfa_register	%rax
	movq	0(%rax),%r15
.cfi_restore	%r15
	movq	8(%rax),%r14
.cfi_restore	%r14
	movq	16(%rax),%r13
.cfi_restore	%r13
	movq	24(%rax),%r12
.cfi_restore	%r12
	movq	32(%rax),%rbp
.cfi_restore	%rbp
	movq	40(%rax),%rbx
.cfi_restore	%rbx
	leaq	48(%rax),%rsp
.cfi_def_cfa	%rsp,8
.Lossl_rsaz_amm52x30_x1_avxifma256_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	ossl_rsaz_amm52x30_x1_avxifma256, .-ossl_rsaz_amm52x30_x1_avxifma256
# Constant tables shared by the amm52x30 routines.
.section	.rodata
.align	32
# Four lanes of the 52-bit digit mask (2^52 - 1).
.Lmask52x4:
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
# Zeroes lane 0, keeps lanes 1..3 — used to drop the carry into digit 0
# after the lane-rotate in the normalization step.
.Lhigh64x3:
.quad	0x0
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
# 16 x 32-byte blend masks: entry i has 64-bit lane j = all-ones iff bit j
# of i is set.  Indexed (nibble << 5) to turn a 4-bit digit mask into a
# vblendvpd selector.
.Lkmasklut:

.quad	0x0
.quad	0x0
.quad	0x0
.quad	0x0

.quad	0xffffffffffffffff
.quad	0x0
.quad	0x0
.quad	0x0

.quad	0x0
.quad	0xffffffffffffffff
.quad	0x0
.quad	0x0

.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0x0
.quad	0x0

.quad	0x0
.quad	0x0
.quad	0xffffffffffffffff
.quad	0x0

.quad	0xffffffffffffffff
.quad	0x0
.quad	0xffffffffffffffff
.quad	0x0

.quad	0x0
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0x0

.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0x0

.quad	0x0
.quad	0x0
.quad	0x0
.quad	0xffffffffffffffff

.quad	0xffffffffffffffff
.quad	0x0
.quad	0x0
.quad	0xffffffffffffffff

.quad	0x0
.quad	0xffffffffffffffff
.quad	0x0
.quad	0xffffffffffffffff

.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0x0
.quad	0xffffffffffffffff

.quad	0x0
.quad	0x0
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff

.quad	0xffffffffffffffff
.quad	0x0
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff

.quad	0x0
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff

.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.quad	0xffffffffffffffff
.text
902
903.globl	ossl_rsaz_amm52x30_x2_avxifma256
904.type	ossl_rsaz_amm52x30_x2_avxifma256,@function
905.align	32
906ossl_rsaz_amm52x30_x2_avxifma256:
907.cfi_startproc
908.byte	243,15,30,250
909	pushq	%rbx
910.cfi_adjust_cfa_offset	8
911.cfi_offset	%rbx,-16
912	pushq	%rbp
913.cfi_adjust_cfa_offset	8
914.cfi_offset	%rbp,-24
915	pushq	%r12
916.cfi_adjust_cfa_offset	8
917.cfi_offset	%r12,-32
918	pushq	%r13
919.cfi_adjust_cfa_offset	8
920.cfi_offset	%r13,-40
921	pushq	%r14
922.cfi_adjust_cfa_offset	8
923.cfi_offset	%r14,-48
924	pushq	%r15
925.cfi_adjust_cfa_offset	8
926.cfi_offset	%r15,-56
927
928	vpxor	%ymm0,%ymm0,%ymm0
929	vmovapd	%ymm0,%ymm3
930	vmovapd	%ymm0,%ymm4
931	vmovapd	%ymm0,%ymm5
932	vmovapd	%ymm0,%ymm6
933	vmovapd	%ymm0,%ymm7
934	vmovapd	%ymm0,%ymm8
935	vmovapd	%ymm0,%ymm9
936	vmovapd	%ymm0,%ymm10
937
938	xorl	%r9d,%r9d
939
940	movq	%rdx,%r11
941	movq	$0xfffffffffffff,%rax
942
943	movl	$30,%ebx
944
945.align	32
946.Lloop30:
947	movq	0(%r11),%r13
948
949	vpbroadcastq	0(%r11),%ymm1
950	movq	0(%rsi),%rdx
951	mulxq	%r13,%r13,%r12
952	addq	%r13,%r9
953	movq	%r12,%r10
954	adcq	$0,%r10
955
956	movq	(%r8),%r13
957	imulq	%r9,%r13
958	andq	%rax,%r13
959
960	vmovq	%r13,%xmm2
961	vpbroadcastq	%xmm2,%ymm2
962	movq	0(%rcx),%rdx
963	mulxq	%r13,%r13,%r12
964	addq	%r13,%r9
965	adcq	%r12,%r10
966
967	shrq	$52,%r9
968	salq	$12,%r10
969	orq	%r10,%r9
970
971	leaq	-264(%rsp),%rsp
972
973{vex}	vpmadd52luq	0(%rsi),%ymm1,%ymm3
974{vex}	vpmadd52luq	32(%rsi),%ymm1,%ymm4
975{vex}	vpmadd52luq	64(%rsi),%ymm1,%ymm5
976{vex}	vpmadd52luq	96(%rsi),%ymm1,%ymm6
977{vex}	vpmadd52luq	128(%rsi),%ymm1,%ymm7
978{vex}	vpmadd52luq	160(%rsi),%ymm1,%ymm8
979{vex}	vpmadd52luq	192(%rsi),%ymm1,%ymm9
980{vex}	vpmadd52luq	224(%rsi),%ymm1,%ymm10
981
982{vex}	vpmadd52luq	0(%rcx),%ymm2,%ymm3
983{vex}	vpmadd52luq	32(%rcx),%ymm2,%ymm4
984{vex}	vpmadd52luq	64(%rcx),%ymm2,%ymm5
985{vex}	vpmadd52luq	96(%rcx),%ymm2,%ymm6
986{vex}	vpmadd52luq	128(%rcx),%ymm2,%ymm7
987{vex}	vpmadd52luq	160(%rcx),%ymm2,%ymm8
988{vex}	vpmadd52luq	192(%rcx),%ymm2,%ymm9
989{vex}	vpmadd52luq	224(%rcx),%ymm2,%ymm10
990
991
992	vmovdqu	%ymm3,0(%rsp)
993	vmovdqu	%ymm4,32(%rsp)
994	vmovdqu	%ymm5,64(%rsp)
995	vmovdqu	%ymm6,96(%rsp)
996	vmovdqu	%ymm7,128(%rsp)
997	vmovdqu	%ymm8,160(%rsp)
998	vmovdqu	%ymm9,192(%rsp)
999	vmovdqu	%ymm10,224(%rsp)
1000	movq	$0,256(%rsp)
1001
1002	vmovdqu	8(%rsp),%ymm3
1003	vmovdqu	40(%rsp),%ymm4
1004	vmovdqu	72(%rsp),%ymm5
1005	vmovdqu	104(%rsp),%ymm6
1006	vmovdqu	136(%rsp),%ymm7
1007	vmovdqu	168(%rsp),%ymm8
1008	vmovdqu	200(%rsp),%ymm9
1009	vmovdqu	232(%rsp),%ymm10
1010
1011	addq	8(%rsp),%r9
1012
1013{vex}	vpmadd52huq	0(%rsi),%ymm1,%ymm3
1014{vex}	vpmadd52huq	32(%rsi),%ymm1,%ymm4
1015{vex}	vpmadd52huq	64(%rsi),%ymm1,%ymm5
1016{vex}	vpmadd52huq	96(%rsi),%ymm1,%ymm6
1017{vex}	vpmadd52huq	128(%rsi),%ymm1,%ymm7
1018{vex}	vpmadd52huq	160(%rsi),%ymm1,%ymm8
1019{vex}	vpmadd52huq	192(%rsi),%ymm1,%ymm9
1020{vex}	vpmadd52huq	224(%rsi),%ymm1,%ymm10
1021
1022{vex}	vpmadd52huq	0(%rcx),%ymm2,%ymm3
1023{vex}	vpmadd52huq	32(%rcx),%ymm2,%ymm4
1024{vex}	vpmadd52huq	64(%rcx),%ymm2,%ymm5
1025{vex}	vpmadd52huq	96(%rcx),%ymm2,%ymm6
1026{vex}	vpmadd52huq	128(%rcx),%ymm2,%ymm7
1027{vex}	vpmadd52huq	160(%rcx),%ymm2,%ymm8
1028{vex}	vpmadd52huq	192(%rcx),%ymm2,%ymm9
1029{vex}	vpmadd52huq	224(%rcx),%ymm2,%ymm10
1030
1031	leaq	264(%rsp),%rsp
1032	leaq	8(%r11),%r11
1033	decl	%ebx
1034	jne	.Lloop30
1035
1036	pushq	%r11
1037	pushq	%rsi
1038	pushq	%rcx
1039	pushq	%r8
1040
1041	vmovq	%r9,%xmm0
1042	vpbroadcastq	%xmm0,%ymm0
1043	vpblendd	$3,%ymm0,%ymm3,%ymm3
1044
1045
1046
1047	vpsrlq	$52,%ymm3,%ymm0
1048	vpsrlq	$52,%ymm4,%ymm1
1049	vpsrlq	$52,%ymm5,%ymm2
1050	vpsrlq	$52,%ymm6,%ymm11
1051	vpsrlq	$52,%ymm7,%ymm12
1052	vpsrlq	$52,%ymm8,%ymm13
1053	vpsrlq	$52,%ymm9,%ymm14
1054	vpsrlq	$52,%ymm10,%ymm15
1055
1056	leaq	-32(%rsp),%rsp
1057	vmovupd	%ymm3,(%rsp)
1058
1059
1060	vpermq	$144,%ymm15,%ymm15
1061	vpermq	$3,%ymm14,%ymm3
1062	vblendpd	$1,%ymm3,%ymm15,%ymm15
1063
1064	vpermq	$144,%ymm14,%ymm14
1065	vpermq	$3,%ymm13,%ymm3
1066	vblendpd	$1,%ymm3,%ymm14,%ymm14
1067
1068	vpermq	$144,%ymm13,%ymm13
1069	vpermq	$3,%ymm12,%ymm3
1070	vblendpd	$1,%ymm3,%ymm13,%ymm13
1071
1072	vpermq	$144,%ymm12,%ymm12
1073	vpermq	$3,%ymm11,%ymm3
1074	vblendpd	$1,%ymm3,%ymm12,%ymm12
1075
1076	vpermq	$144,%ymm11,%ymm11
1077	vpermq	$3,%ymm2,%ymm3
1078	vblendpd	$1,%ymm3,%ymm11,%ymm11
1079
1080	vpermq	$144,%ymm2,%ymm2
1081	vpermq	$3,%ymm1,%ymm3
1082	vblendpd	$1,%ymm3,%ymm2,%ymm2
1083
1084	vpermq	$144,%ymm1,%ymm1
1085	vpermq	$3,%ymm0,%ymm3
1086	vblendpd	$1,%ymm3,%ymm1,%ymm1
1087
1088	vpermq	$144,%ymm0,%ymm0
1089	vpand	.Lhigh64x3(%rip),%ymm0,%ymm0
1090
1091	vmovupd	(%rsp),%ymm3
1092	leaq	32(%rsp),%rsp
1093
1094
1095	vpand	.Lmask52x4(%rip),%ymm3,%ymm3
1096	vpand	.Lmask52x4(%rip),%ymm4,%ymm4
1097	vpand	.Lmask52x4(%rip),%ymm5,%ymm5
1098	vpand	.Lmask52x4(%rip),%ymm6,%ymm6
1099	vpand	.Lmask52x4(%rip),%ymm7,%ymm7
1100	vpand	.Lmask52x4(%rip),%ymm8,%ymm8
1101	vpand	.Lmask52x4(%rip),%ymm9,%ymm9
1102	vpand	.Lmask52x4(%rip),%ymm10,%ymm10
1103
1104
1105	vpaddq	%ymm0,%ymm3,%ymm3
1106	vpaddq	%ymm1,%ymm4,%ymm4
1107	vpaddq	%ymm2,%ymm5,%ymm5
1108	vpaddq	%ymm11,%ymm6,%ymm6
1109	vpaddq	%ymm12,%ymm7,%ymm7
1110	vpaddq	%ymm13,%ymm8,%ymm8
1111	vpaddq	%ymm14,%ymm9,%ymm9
1112	vpaddq	%ymm15,%ymm10,%ymm10
1113
1114
1115
1116	vpcmpgtq	.Lmask52x4(%rip),%ymm3,%ymm0
1117	vpcmpgtq	.Lmask52x4(%rip),%ymm4,%ymm1
1118	vmovmskpd	%ymm0,%r14d
1119	vmovmskpd	%ymm1,%r13d
1120	shlb	$4,%r13b
1121	orb	%r13b,%r14b
1122
1123	vpcmpgtq	.Lmask52x4(%rip),%ymm5,%ymm2
1124	vpcmpgtq	.Lmask52x4(%rip),%ymm6,%ymm11
1125	vmovmskpd	%ymm2,%r13d
1126	vmovmskpd	%ymm11,%r12d
1127	shlb	$4,%r12b
1128	orb	%r12b,%r13b
1129
1130	vpcmpgtq	.Lmask52x4(%rip),%ymm7,%ymm12
1131	vpcmpgtq	.Lmask52x4(%rip),%ymm8,%ymm13
1132	vmovmskpd	%ymm12,%r12d
1133	vmovmskpd	%ymm13,%r11d
1134	shlb	$4,%r11b
1135	orb	%r11b,%r12b
1136
1137	vpcmpgtq	.Lmask52x4(%rip),%ymm9,%ymm14
1138	vpcmpgtq	.Lmask52x4(%rip),%ymm10,%ymm15
1139	vmovmskpd	%ymm14,%r11d
1140	vmovmskpd	%ymm15,%r10d
1141	shlb	$4,%r10b
1142	orb	%r10b,%r11b
1143
1144	addb	%r14b,%r14b
1145	adcb	%r13b,%r13b
1146	adcb	%r12b,%r12b
1147	adcb	%r11b,%r11b
1148
1149
1150	vpcmpeqq	.Lmask52x4(%rip),%ymm3,%ymm0
1151	vpcmpeqq	.Lmask52x4(%rip),%ymm4,%ymm1
1152	vmovmskpd	%ymm0,%r9d
1153	vmovmskpd	%ymm1,%r8d
1154	shlb	$4,%r8b
1155	orb	%r8b,%r9b
1156
1157	vpcmpeqq	.Lmask52x4(%rip),%ymm5,%ymm2
1158	vpcmpeqq	.Lmask52x4(%rip),%ymm6,%ymm11
1159	vmovmskpd	%ymm2,%r8d
1160	vmovmskpd	%ymm11,%edx
1161	shlb	$4,%dl
1162	orb	%dl,%r8b
1163
1164	vpcmpeqq	.Lmask52x4(%rip),%ymm7,%ymm12
1165	vpcmpeqq	.Lmask52x4(%rip),%ymm8,%ymm13
1166	vmovmskpd	%ymm12,%edx
1167	vmovmskpd	%ymm13,%ecx
1168	shlb	$4,%cl
1169	orb	%cl,%dl
1170
1171	vpcmpeqq	.Lmask52x4(%rip),%ymm9,%ymm14
1172	vpcmpeqq	.Lmask52x4(%rip),%ymm10,%ymm15
1173	vmovmskpd	%ymm14,%ecx
1174	vmovmskpd	%ymm15,%ebx
1175	shlb	$4,%bl
1176	orb	%bl,%cl
1177
1178	addb	%r9b,%r14b
1179	adcb	%r8b,%r13b
1180	adcb	%dl,%r12b
1181	adcb	%cl,%r11b
1182
1183	xorb	%r9b,%r14b
1184	xorb	%r8b,%r13b
1185	xorb	%dl,%r12b
1186	xorb	%cl,%r11b
1187
1188	leaq	.Lkmasklut(%rip),%rdx
1189
1190	movb	%r14b,%r10b
1191	andq	$0xf,%r14
1192	vpsubq	.Lmask52x4(%rip),%ymm3,%ymm0
1193	shlq	$5,%r14
1194	vmovapd	(%rdx,%r14,1),%ymm2
1195	vblendvpd	%ymm2,%ymm0,%ymm3,%ymm3
1196
1197	shrb	$4,%r10b
1198	andq	$0xf,%r10
1199	vpsubq	.Lmask52x4(%rip),%ymm4,%ymm0
1200	shlq	$5,%r10
1201	vmovapd	(%rdx,%r10,1),%ymm2
1202	vblendvpd	%ymm2,%ymm0,%ymm4,%ymm4
1203
1204	movb	%r13b,%r10b
1205	andq	$0xf,%r13
1206	vpsubq	.Lmask52x4(%rip),%ymm5,%ymm0
1207	shlq	$5,%r13
1208	vmovapd	(%rdx,%r13,1),%ymm2
1209	vblendvpd	%ymm2,%ymm0,%ymm5,%ymm5
1210
1211	shrb	$4,%r10b
1212	andq	$0xf,%r10
1213	vpsubq	.Lmask52x4(%rip),%ymm6,%ymm0
1214	shlq	$5,%r10
1215	vmovapd	(%rdx,%r10,1),%ymm2
1216	vblendvpd	%ymm2,%ymm0,%ymm6,%ymm6
1217
1218	movb	%r12b,%r10b
1219	andq	$0xf,%r12
1220	vpsubq	.Lmask52x4(%rip),%ymm7,%ymm0
1221	shlq	$5,%r12
1222	vmovapd	(%rdx,%r12,1),%ymm2
1223	vblendvpd	%ymm2,%ymm0,%ymm7,%ymm7
1224
1225	shrb	$4,%r10b
1226	andq	$0xf,%r10
1227	vpsubq	.Lmask52x4(%rip),%ymm8,%ymm0
1228	shlq	$5,%r10
1229	vmovapd	(%rdx,%r10,1),%ymm2
1230	vblendvpd	%ymm2,%ymm0,%ymm8,%ymm8
1231
1232	movb	%r11b,%r10b
1233	andq	$0xf,%r11
1234	vpsubq	.Lmask52x4(%rip),%ymm9,%ymm0
1235	shlq	$5,%r11
1236	vmovapd	(%rdx,%r11,1),%ymm2
1237	vblendvpd	%ymm2,%ymm0,%ymm9,%ymm9
1238
1239	shrb	$4,%r10b
1240	andq	$0xf,%r10
1241	vpsubq	.Lmask52x4(%rip),%ymm10,%ymm0
1242	shlq	$5,%r10
1243	vmovapd	(%rdx,%r10,1),%ymm2
1244	vblendvpd	%ymm2,%ymm0,%ymm10,%ymm10
1245
1246	vpand	.Lmask52x4(%rip),%ymm3,%ymm3
1247	vpand	.Lmask52x4(%rip),%ymm4,%ymm4
1248	vpand	.Lmask52x4(%rip),%ymm5,%ymm5
1249	vpand	.Lmask52x4(%rip),%ymm6,%ymm6
1250	vpand	.Lmask52x4(%rip),%ymm7,%ymm7
1251	vpand	.Lmask52x4(%rip),%ymm8,%ymm8
1252	vpand	.Lmask52x4(%rip),%ymm9,%ymm9
1253
1254	vpand	.Lmask52x4(%rip),%ymm10,%ymm10
1255	popq	%r8
1256	popq	%rcx
1257	popq	%rsi
1258	popq	%r11
1259
1260	vmovdqu	%ymm3,0(%rdi)
1261	vmovdqu	%ymm4,32(%rdi)
1262	vmovdqu	%ymm5,64(%rdi)
1263	vmovdqu	%ymm6,96(%rdi)
1264	vmovdqu	%ymm7,128(%rdi)
1265	vmovdqu	%ymm8,160(%rdi)
1266	vmovdqu	%ymm9,192(%rdi)
1267	vmovdqu	%ymm10,224(%rdi)
1268
1269	xorl	%r15d,%r15d
1270
1271	leaq	16(%r11),%r11
1272	movq	$0xfffffffffffff,%rax
1273
1274	movl	$30,%ebx
1275
1276	vpxor	%ymm0,%ymm0,%ymm0
1277	vmovapd	%ymm0,%ymm3
1278	vmovapd	%ymm0,%ymm4
1279	vmovapd	%ymm0,%ymm5
1280	vmovapd	%ymm0,%ymm6
1281	vmovapd	%ymm0,%ymm7
1282	vmovapd	%ymm0,%ymm8
1283	vmovapd	%ymm0,%ymm9
1284	vmovapd	%ymm0,%ymm10
1285.align	32
1286.Lloop40:
1287	movq	0(%r11),%r13
1288
1289	vpbroadcastq	0(%r11),%ymm1
1290	movq	256(%rsi),%rdx
1291	mulxq	%r13,%r13,%r12
1292	addq	%r13,%r9
1293	movq	%r12,%r10
1294	adcq	$0,%r10
1295
1296	movq	8(%r8),%r13
1297	imulq	%r9,%r13
1298	andq	%rax,%r13
1299
1300	vmovq	%r13,%xmm2
1301	vpbroadcastq	%xmm2,%ymm2
1302	movq	256(%rcx),%rdx
1303	mulxq	%r13,%r13,%r12
1304	addq	%r13,%r9
1305	adcq	%r12,%r10
1306
1307	shrq	$52,%r9
1308	salq	$12,%r10
1309	orq	%r10,%r9
1310
1311	leaq	-264(%rsp),%rsp
1312
1313{vex}	vpmadd52luq	256(%rsi),%ymm1,%ymm3
1314{vex}	vpmadd52luq	288(%rsi),%ymm1,%ymm4
1315{vex}	vpmadd52luq	320(%rsi),%ymm1,%ymm5
1316{vex}	vpmadd52luq	352(%rsi),%ymm1,%ymm6
1317{vex}	vpmadd52luq	384(%rsi),%ymm1,%ymm7
1318{vex}	vpmadd52luq	416(%rsi),%ymm1,%ymm8
1319{vex}	vpmadd52luq	448(%rsi),%ymm1,%ymm9
1320{vex}	vpmadd52luq	480(%rsi),%ymm1,%ymm10
1321
1322{vex}	vpmadd52luq	256(%rcx),%ymm2,%ymm3
1323{vex}	vpmadd52luq	288(%rcx),%ymm2,%ymm4
1324{vex}	vpmadd52luq	320(%rcx),%ymm2,%ymm5
1325{vex}	vpmadd52luq	352(%rcx),%ymm2,%ymm6
1326{vex}	vpmadd52luq	384(%rcx),%ymm2,%ymm7
1327{vex}	vpmadd52luq	416(%rcx),%ymm2,%ymm8
1328{vex}	vpmadd52luq	448(%rcx),%ymm2,%ymm9
1329{vex}	vpmadd52luq	480(%rcx),%ymm2,%ymm10
1330
1331
1332	vmovdqu	%ymm3,0(%rsp)
1333	vmovdqu	%ymm4,32(%rsp)
1334	vmovdqu	%ymm5,64(%rsp)
1335	vmovdqu	%ymm6,96(%rsp)
1336	vmovdqu	%ymm7,128(%rsp)
1337	vmovdqu	%ymm8,160(%rsp)
1338	vmovdqu	%ymm9,192(%rsp)
1339	vmovdqu	%ymm10,224(%rsp)
1340	movq	$0,256(%rsp)
1341
1342	vmovdqu	8(%rsp),%ymm3
1343	vmovdqu	40(%rsp),%ymm4
1344	vmovdqu	72(%rsp),%ymm5
1345	vmovdqu	104(%rsp),%ymm6
1346	vmovdqu	136(%rsp),%ymm7
1347	vmovdqu	168(%rsp),%ymm8
1348	vmovdqu	200(%rsp),%ymm9
1349	vmovdqu	232(%rsp),%ymm10
1350
1351	addq	8(%rsp),%r9
1352
1353{vex}	vpmadd52huq	256(%rsi),%ymm1,%ymm3
1354{vex}	vpmadd52huq	288(%rsi),%ymm1,%ymm4
1355{vex}	vpmadd52huq	320(%rsi),%ymm1,%ymm5
1356{vex}	vpmadd52huq	352(%rsi),%ymm1,%ymm6
1357{vex}	vpmadd52huq	384(%rsi),%ymm1,%ymm7
1358{vex}	vpmadd52huq	416(%rsi),%ymm1,%ymm8
1359{vex}	vpmadd52huq	448(%rsi),%ymm1,%ymm9
1360{vex}	vpmadd52huq	480(%rsi),%ymm1,%ymm10
1361
1362{vex}	vpmadd52huq	256(%rcx),%ymm2,%ymm3
1363{vex}	vpmadd52huq	288(%rcx),%ymm2,%ymm4
1364{vex}	vpmadd52huq	320(%rcx),%ymm2,%ymm5
1365{vex}	vpmadd52huq	352(%rcx),%ymm2,%ymm6
1366{vex}	vpmadd52huq	384(%rcx),%ymm2,%ymm7
1367{vex}	vpmadd52huq	416(%rcx),%ymm2,%ymm8
1368{vex}	vpmadd52huq	448(%rcx),%ymm2,%ymm9
1369{vex}	vpmadd52huq	480(%rcx),%ymm2,%ymm10
1370
1371	leaq	264(%rsp),%rsp
1372	leaq	8(%r11),%r11
1373	decl	%ebx
1374	jne	.Lloop40
1375
1376	vmovq	%r9,%xmm0
1377	vpbroadcastq	%xmm0,%ymm0
1378	vpblendd	$3,%ymm0,%ymm3,%ymm3
1379
1380
1381
1382	vpsrlq	$52,%ymm3,%ymm0
1383	vpsrlq	$52,%ymm4,%ymm1
1384	vpsrlq	$52,%ymm5,%ymm2
1385	vpsrlq	$52,%ymm6,%ymm11
1386	vpsrlq	$52,%ymm7,%ymm12
1387	vpsrlq	$52,%ymm8,%ymm13
1388	vpsrlq	$52,%ymm9,%ymm14
1389	vpsrlq	$52,%ymm10,%ymm15
1390
1391	leaq	-32(%rsp),%rsp
1392	vmovupd	%ymm3,(%rsp)
1393
1394
1395	vpermq	$144,%ymm15,%ymm15
1396	vpermq	$3,%ymm14,%ymm3
1397	vblendpd	$1,%ymm3,%ymm15,%ymm15
1398
1399	vpermq	$144,%ymm14,%ymm14
1400	vpermq	$3,%ymm13,%ymm3
1401	vblendpd	$1,%ymm3,%ymm14,%ymm14
1402
1403	vpermq	$144,%ymm13,%ymm13
1404	vpermq	$3,%ymm12,%ymm3
1405	vblendpd	$1,%ymm3,%ymm13,%ymm13
1406
1407	vpermq	$144,%ymm12,%ymm12
1408	vpermq	$3,%ymm11,%ymm3
1409	vblendpd	$1,%ymm3,%ymm12,%ymm12
1410
1411	vpermq	$144,%ymm11,%ymm11
1412	vpermq	$3,%ymm2,%ymm3
1413	vblendpd	$1,%ymm3,%ymm11,%ymm11
1414
1415	vpermq	$144,%ymm2,%ymm2
1416	vpermq	$3,%ymm1,%ymm3
1417	vblendpd	$1,%ymm3,%ymm2,%ymm2
1418
1419	vpermq	$144,%ymm1,%ymm1
1420	vpermq	$3,%ymm0,%ymm3
1421	vblendpd	$1,%ymm3,%ymm1,%ymm1
1422
1423	vpermq	$144,%ymm0,%ymm0
1424	vpand	.Lhigh64x3(%rip),%ymm0,%ymm0
1425
1426	vmovupd	(%rsp),%ymm3
1427	leaq	32(%rsp),%rsp
1428
1429
1430	vpand	.Lmask52x4(%rip),%ymm3,%ymm3
1431	vpand	.Lmask52x4(%rip),%ymm4,%ymm4
1432	vpand	.Lmask52x4(%rip),%ymm5,%ymm5
1433	vpand	.Lmask52x4(%rip),%ymm6,%ymm6
1434	vpand	.Lmask52x4(%rip),%ymm7,%ymm7
1435	vpand	.Lmask52x4(%rip),%ymm8,%ymm8
1436	vpand	.Lmask52x4(%rip),%ymm9,%ymm9
1437	vpand	.Lmask52x4(%rip),%ymm10,%ymm10
1438
1439
1440	vpaddq	%ymm0,%ymm3,%ymm3
1441	vpaddq	%ymm1,%ymm4,%ymm4
1442	vpaddq	%ymm2,%ymm5,%ymm5
1443	vpaddq	%ymm11,%ymm6,%ymm6
1444	vpaddq	%ymm12,%ymm7,%ymm7
1445	vpaddq	%ymm13,%ymm8,%ymm8
1446	vpaddq	%ymm14,%ymm9,%ymm9
1447	vpaddq	%ymm15,%ymm10,%ymm10
1448
1449
1450
1451	vpcmpgtq	.Lmask52x4(%rip),%ymm3,%ymm0
1452	vpcmpgtq	.Lmask52x4(%rip),%ymm4,%ymm1
1453	vmovmskpd	%ymm0,%r14d
1454	vmovmskpd	%ymm1,%r13d
1455	shlb	$4,%r13b
1456	orb	%r13b,%r14b
1457
1458	vpcmpgtq	.Lmask52x4(%rip),%ymm5,%ymm2
1459	vpcmpgtq	.Lmask52x4(%rip),%ymm6,%ymm11
1460	vmovmskpd	%ymm2,%r13d
1461	vmovmskpd	%ymm11,%r12d
1462	shlb	$4,%r12b
1463	orb	%r12b,%r13b
1464
1465	vpcmpgtq	.Lmask52x4(%rip),%ymm7,%ymm12
1466	vpcmpgtq	.Lmask52x4(%rip),%ymm8,%ymm13
1467	vmovmskpd	%ymm12,%r12d
1468	vmovmskpd	%ymm13,%r11d
1469	shlb	$4,%r11b
1470	orb	%r11b,%r12b
1471
1472	vpcmpgtq	.Lmask52x4(%rip),%ymm9,%ymm14
1473	vpcmpgtq	.Lmask52x4(%rip),%ymm10,%ymm15
1474	vmovmskpd	%ymm14,%r11d
1475	vmovmskpd	%ymm15,%r10d
1476	shlb	$4,%r10b
1477	orb	%r10b,%r11b
1478
1479	addb	%r14b,%r14b
1480	adcb	%r13b,%r13b
1481	adcb	%r12b,%r12b
1482	adcb	%r11b,%r11b
1483
1484
1485	vpcmpeqq	.Lmask52x4(%rip),%ymm3,%ymm0
1486	vpcmpeqq	.Lmask52x4(%rip),%ymm4,%ymm1
1487	vmovmskpd	%ymm0,%r9d
1488	vmovmskpd	%ymm1,%r8d
1489	shlb	$4,%r8b
1490	orb	%r8b,%r9b
1491
1492	vpcmpeqq	.Lmask52x4(%rip),%ymm5,%ymm2
1493	vpcmpeqq	.Lmask52x4(%rip),%ymm6,%ymm11
1494	vmovmskpd	%ymm2,%r8d
1495	vmovmskpd	%ymm11,%edx
1496	shlb	$4,%dl
1497	orb	%dl,%r8b
1498
1499	vpcmpeqq	.Lmask52x4(%rip),%ymm7,%ymm12
1500	vpcmpeqq	.Lmask52x4(%rip),%ymm8,%ymm13
1501	vmovmskpd	%ymm12,%edx
1502	vmovmskpd	%ymm13,%ecx
1503	shlb	$4,%cl
1504	orb	%cl,%dl
1505
1506	vpcmpeqq	.Lmask52x4(%rip),%ymm9,%ymm14
1507	vpcmpeqq	.Lmask52x4(%rip),%ymm10,%ymm15
1508	vmovmskpd	%ymm14,%ecx
1509	vmovmskpd	%ymm15,%ebx
1510	shlb	$4,%bl
1511	orb	%bl,%cl
1512
1513	addb	%r9b,%r14b
1514	adcb	%r8b,%r13b
1515	adcb	%dl,%r12b
1516	adcb	%cl,%r11b
1517
1518	xorb	%r9b,%r14b
1519	xorb	%r8b,%r13b
1520	xorb	%dl,%r12b
1521	xorb	%cl,%r11b
1522
1523	leaq	.Lkmasklut(%rip),%rdx
1524
1525	movb	%r14b,%r10b
1526	andq	$0xf,%r14
1527	vpsubq	.Lmask52x4(%rip),%ymm3,%ymm0
1528	shlq	$5,%r14
1529	vmovapd	(%rdx,%r14,1),%ymm2
1530	vblendvpd	%ymm2,%ymm0,%ymm3,%ymm3
1531
1532	shrb	$4,%r10b
1533	andq	$0xf,%r10
1534	vpsubq	.Lmask52x4(%rip),%ymm4,%ymm0
1535	shlq	$5,%r10
1536	vmovapd	(%rdx,%r10,1),%ymm2
1537	vblendvpd	%ymm2,%ymm0,%ymm4,%ymm4
1538
1539	movb	%r13b,%r10b
1540	andq	$0xf,%r13
1541	vpsubq	.Lmask52x4(%rip),%ymm5,%ymm0
1542	shlq	$5,%r13
1543	vmovapd	(%rdx,%r13,1),%ymm2
1544	vblendvpd	%ymm2,%ymm0,%ymm5,%ymm5
1545
1546	shrb	$4,%r10b
1547	andq	$0xf,%r10
1548	vpsubq	.Lmask52x4(%rip),%ymm6,%ymm0
1549	shlq	$5,%r10
1550	vmovapd	(%rdx,%r10,1),%ymm2
1551	vblendvpd	%ymm2,%ymm0,%ymm6,%ymm6
1552
1553	movb	%r12b,%r10b
1554	andq	$0xf,%r12
1555	vpsubq	.Lmask52x4(%rip),%ymm7,%ymm0
1556	shlq	$5,%r12
1557	vmovapd	(%rdx,%r12,1),%ymm2
1558	vblendvpd	%ymm2,%ymm0,%ymm7,%ymm7
1559
1560	shrb	$4,%r10b
1561	andq	$0xf,%r10
1562	vpsubq	.Lmask52x4(%rip),%ymm8,%ymm0
1563	shlq	$5,%r10
1564	vmovapd	(%rdx,%r10,1),%ymm2
1565	vblendvpd	%ymm2,%ymm0,%ymm8,%ymm8
1566
1567	movb	%r11b,%r10b
1568	andq	$0xf,%r11
1569	vpsubq	.Lmask52x4(%rip),%ymm9,%ymm0
1570	shlq	$5,%r11
1571	vmovapd	(%rdx,%r11,1),%ymm2
1572	vblendvpd	%ymm2,%ymm0,%ymm9,%ymm9
1573
1574	shrb	$4,%r10b
1575	andq	$0xf,%r10
1576	vpsubq	.Lmask52x4(%rip),%ymm10,%ymm0
1577	shlq	$5,%r10
1578	vmovapd	(%rdx,%r10,1),%ymm2
1579	vblendvpd	%ymm2,%ymm0,%ymm10,%ymm10
1580
1581	vpand	.Lmask52x4(%rip),%ymm3,%ymm3
1582	vpand	.Lmask52x4(%rip),%ymm4,%ymm4
1583	vpand	.Lmask52x4(%rip),%ymm5,%ymm5
1584	vpand	.Lmask52x4(%rip),%ymm6,%ymm6
1585	vpand	.Lmask52x4(%rip),%ymm7,%ymm7
1586	vpand	.Lmask52x4(%rip),%ymm8,%ymm8
1587	vpand	.Lmask52x4(%rip),%ymm9,%ymm9
1588
1589	vpand	.Lmask52x4(%rip),%ymm10,%ymm10
1590
1591	vmovdqu	%ymm3,256(%rdi)
1592	vmovdqu	%ymm4,288(%rdi)
1593	vmovdqu	%ymm5,320(%rdi)
1594	vmovdqu	%ymm6,352(%rdi)
1595	vmovdqu	%ymm7,384(%rdi)
1596	vmovdqu	%ymm8,416(%rdi)
1597	vmovdqu	%ymm9,448(%rdi)
1598	vmovdqu	%ymm10,480(%rdi)
1599
1600	vzeroupper
1601	leaq	(%rsp),%rax
1602.cfi_def_cfa_register	%rax
1603	movq	0(%rax),%r15
1604.cfi_restore	%r15
1605	movq	8(%rax),%r14
1606.cfi_restore	%r14
1607	movq	16(%rax),%r13
1608.cfi_restore	%r13
1609	movq	24(%rax),%r12
1610.cfi_restore	%r12
1611	movq	32(%rax),%rbp
1612.cfi_restore	%rbp
1613	movq	40(%rax),%rbx
1614.cfi_restore	%rbx
1615	leaq	48(%rax),%rsp
1616.cfi_def_cfa	%rsp,8
1617.Lossl_rsaz_amm52x30_x2_avxifma256_epilogue:
1618	.byte	0xf3,0xc3
1619.cfi_endproc
1620.size	ossl_rsaz_amm52x30_x2_avxifma256, .-ossl_rsaz_amm52x30_x2_avxifma256
1621.text
1622
1623.align	32
1624.globl	ossl_extract_multiplier_2x30_win5_avx
1625.type	ossl_extract_multiplier_2x30_win5_avx,@function
1626ossl_extract_multiplier_2x30_win5_avx:
1627.cfi_startproc
1628.byte	243,15,30,250
1629	vmovapd	.Lones(%rip),%ymm12
1630	vmovq	%rdx,%xmm8
1631	vpbroadcastq	%xmm8,%ymm10
1632	vmovq	%rcx,%xmm8
1633	vpbroadcastq	%xmm8,%ymm11
1634	leaq	16384(%rsi),%rax
1635
1636
1637	vpxor	%xmm0,%xmm0,%xmm0
1638	vmovapd	%ymm0,%ymm9
1639	vmovapd	%ymm0,%ymm1
1640	vmovapd	%ymm0,%ymm2
1641	vmovapd	%ymm0,%ymm3
1642	vmovapd	%ymm0,%ymm4
1643	vmovapd	%ymm0,%ymm5
1644	vmovapd	%ymm0,%ymm6
1645	vmovapd	%ymm0,%ymm7
1646
1647.align	32
1648.Lloop:
1649	vpcmpeqq	%ymm9,%ymm10,%ymm13
1650	vmovdqu	0(%rsi),%ymm8
1651
1652	vblendvpd	%ymm13,%ymm8,%ymm0,%ymm0
1653	vmovdqu	32(%rsi),%ymm8
1654
1655	vblendvpd	%ymm13,%ymm8,%ymm1,%ymm1
1656	vmovdqu	64(%rsi),%ymm8
1657
1658	vblendvpd	%ymm13,%ymm8,%ymm2,%ymm2
1659	vmovdqu	96(%rsi),%ymm8
1660
1661	vblendvpd	%ymm13,%ymm8,%ymm3,%ymm3
1662	vmovdqu	128(%rsi),%ymm8
1663
1664	vblendvpd	%ymm13,%ymm8,%ymm4,%ymm4
1665	vmovdqu	160(%rsi),%ymm8
1666
1667	vblendvpd	%ymm13,%ymm8,%ymm5,%ymm5
1668	vmovdqu	192(%rsi),%ymm8
1669
1670	vblendvpd	%ymm13,%ymm8,%ymm6,%ymm6
1671	vmovdqu	224(%rsi),%ymm8
1672
1673	vblendvpd	%ymm13,%ymm8,%ymm7,%ymm7
1674	vpaddq	%ymm12,%ymm9,%ymm9
1675	addq	$512,%rsi
1676	cmpq	%rsi,%rax
1677	jne	.Lloop
1678	vmovdqu	%ymm0,0(%rdi)
1679	vmovdqu	%ymm1,32(%rdi)
1680	vmovdqu	%ymm2,64(%rdi)
1681	vmovdqu	%ymm3,96(%rdi)
1682	vmovdqu	%ymm4,128(%rdi)
1683	vmovdqu	%ymm5,160(%rdi)
1684	vmovdqu	%ymm6,192(%rdi)
1685	vmovdqu	%ymm7,224(%rdi)
1686	leaq	-16384(%rax),%rsi
1687
1688
1689	vpxor	%xmm0,%xmm0,%xmm0
1690	vmovapd	%ymm0,%ymm9
1691	vmovapd	%ymm0,%ymm0
1692	vmovapd	%ymm0,%ymm1
1693	vmovapd	%ymm0,%ymm2
1694	vmovapd	%ymm0,%ymm3
1695	vmovapd	%ymm0,%ymm4
1696	vmovapd	%ymm0,%ymm5
1697	vmovapd	%ymm0,%ymm6
1698	vmovapd	%ymm0,%ymm7
1699
1700.align	32
1701.Lloop_8_15:
1702	vpcmpeqq	%ymm9,%ymm11,%ymm13
1703	vmovdqu	256(%rsi),%ymm8
1704
1705	vblendvpd	%ymm13,%ymm8,%ymm0,%ymm0
1706	vmovdqu	288(%rsi),%ymm8
1707
1708	vblendvpd	%ymm13,%ymm8,%ymm1,%ymm1
1709	vmovdqu	320(%rsi),%ymm8
1710
1711	vblendvpd	%ymm13,%ymm8,%ymm2,%ymm2
1712	vmovdqu	352(%rsi),%ymm8
1713
1714	vblendvpd	%ymm13,%ymm8,%ymm3,%ymm3
1715	vmovdqu	384(%rsi),%ymm8
1716
1717	vblendvpd	%ymm13,%ymm8,%ymm4,%ymm4
1718	vmovdqu	416(%rsi),%ymm8
1719
1720	vblendvpd	%ymm13,%ymm8,%ymm5,%ymm5
1721	vmovdqu	448(%rsi),%ymm8
1722
1723	vblendvpd	%ymm13,%ymm8,%ymm6,%ymm6
1724	vmovdqu	480(%rsi),%ymm8
1725
1726	vblendvpd	%ymm13,%ymm8,%ymm7,%ymm7
1727	vpaddq	%ymm12,%ymm9,%ymm9
1728	addq	$512,%rsi
1729	cmpq	%rsi,%rax
1730	jne	.Lloop_8_15
1731	vmovdqu	%ymm0,256(%rdi)
1732	vmovdqu	%ymm1,288(%rdi)
1733	vmovdqu	%ymm2,320(%rdi)
1734	vmovdqu	%ymm3,352(%rdi)
1735	vmovdqu	%ymm4,384(%rdi)
1736	vmovdqu	%ymm5,416(%rdi)
1737	vmovdqu	%ymm6,448(%rdi)
1738	vmovdqu	%ymm7,480(%rdi)
1739
1740	.byte	0xf3,0xc3
1741.cfi_endproc
1742.size	ossl_extract_multiplier_2x30_win5_avx, .-ossl_extract_multiplier_2x30_win5_avx
1743.section	.rodata
1744.align	32
# {1,1,1,1}: per-lane counter increment used by the table-scan loops above.
1745.Lones:
1746.quad	1,1,1,1
# {0,0,0,0}: all-zero constant; not referenced in this part of the file -
# presumably used by the amm52x30 routines earlier in the file (verify).
1747.Lzeros:
1748.quad	0,0,0,0
# ELF note NT_GNU_PROPERTY_TYPE_0: advertises Intel CET compatibility
# (IBT + SHSTK) for this object so the linker can propagate the property.
1749	.section ".note.gnu.property", "a"
1750	.p2align 3
1751	.long 1f - 0f		# n_namesz: length of owner name ("GNU\0" = 4)
1752	.long 4f - 1f		# n_descsz: length of the descriptor payload
1753	.long 5			# n_type = NT_GNU_PROPERTY_TYPE_0
17540:
1755	# "GNU" encoded with .byte, since .asciz isn't supported
1756	# on Solaris.
1757	.byte 0x47		# 'G'
1758	.byte 0x4e		# 'N'
1759	.byte 0x55		# 'U'
1760	.byte 0			# NUL terminator
17611:
1762	.p2align 3
1763	.long 0xc0000002	# pr_type = GNU_PROPERTY_X86_FEATURE_1_AND
1764	.long 3f - 2f		# pr_datasz
17652:
1766	.long 3			# FEATURE_1_IBT | FEATURE_1_SHSTK
17673:
1768	.p2align 3		# pad descriptor to 8-byte alignment
17694:
1770