xref: /freebsd/sys/crypto/openssl/amd64/rsaz-3k-avx512.S (revision 4757b351ea9d59d71d4a38b82506d2d16fcd560d)
1/* Do not modify. This file is auto-generated from rsaz-3k-avx512.pl. */
/*
 * ossl_rsaz_amm52x30_x1_ifma256
 *
 * NOTE(review): going by the symbol name this looks like the 30-digit,
 * radix-2^52 "almost Montgomery multiplication" of OpenSSL's RSAZ code.
 * The C prototype is not visible in this file - confirm against the
 * generator script before relying on the parameter meanings below.
 *
 * Register use as seen in the code (AT&T syntax):
 *   rdi - destination; eight 32-byte vectors (256 bytes) are stored to it
 *   rsi - first operand, read as 256 bytes of qwords
 *   rdx - second operand; copied to r11 and consumed one qword per step,
 *         30 steps in total (7 loop passes of 4 steps plus 2 tail steps)
 *   rcx - third operand, read as 256 bytes of qwords (presumably the
 *         modulus - TODO confirm)
 *   r8  - 64-bit value multiplied with the running sum to derive the
 *         per-step factor held in ymm2 (presumably the Montgomery
 *         constant k0 - TODO confirm)
 *
 * ymm3..ymm10 hold the 32-lane accumulator, ymm0 stays zero during the
 * multiply phase, r9 is a scalar accumulator for the lowest lane and
 * rax holds the 52-bit mask.
 */
2.text
3
4.globl	ossl_rsaz_amm52x30_x1_ifma256
5.type	ossl_rsaz_amm52x30_x1_ifma256,@function
6.align	32
7ossl_rsaz_amm52x30_x1_ifma256:
8.cfi_startproc
/* Bytes F3 0F 1E FA encode the endbr64 instruction. */
9.byte	243,15,30,250
/* Save rbx, rbp and r12-r15; the .cfi_* lines record where each one lands. */
10	pushq	%rbx
11.cfi_adjust_cfa_offset	8
12.cfi_offset	%rbx,-16
13	pushq	%rbp
14.cfi_adjust_cfa_offset	8
15.cfi_offset	%rbp,-24
16	pushq	%r12
17.cfi_adjust_cfa_offset	8
18.cfi_offset	%r12,-32
19	pushq	%r13
20.cfi_adjust_cfa_offset	8
21.cfi_offset	%r13,-40
22	pushq	%r14
23.cfi_adjust_cfa_offset	8
24.cfi_offset	%r14,-48
25	pushq	%r15
26.cfi_adjust_cfa_offset	8
27.cfi_offset	%r15,-56
28
/* Zero ymm0 and the eight accumulator vectors ymm3..ymm10. */
29	vpxord	%ymm0,%ymm0,%ymm0
30	vmovdqa64	%ymm0,%ymm3
31	vmovdqa64	%ymm0,%ymm4
32	vmovdqa64	%ymm0,%ymm5
33	vmovdqa64	%ymm0,%ymm6
34	vmovdqa64	%ymm0,%ymm7
35	vmovdqa64	%ymm0,%ymm8
36	vmovdqa64	%ymm0,%ymm9
37	vmovdqa64	%ymm0,%ymm10
38
/* r9 = scalar accumulator for the lowest lane, starts at zero. */
39	xorl	%r9d,%r9d
40
/* rdx is needed as the implicit mulx source, so walk the operand via r11. */
41	movq	%rdx,%r11
/* rax = 2^52 - 1, the mask for one 52-bit digit. */
42	movq	$0xfffffffffffff,%rax
43
44
/* Seven passes of the four-step unrolled loop below (28 steps). */
45	movl	$7,%ebx
46
47.align	32
48.Lloop7:
/* ---- step using the qword at 0(%r11) ---- */
49	movq	0(%r11),%r13
50
/* ymm1 = that qword broadcast to all lanes. */
51	vpbroadcastq	%r13,%ymm1
/* r12:r13 = qword at 0(%rsi) * r13; low half added to r9, r10 = high half + carry. */
52	movq	0(%rsi),%rdx
53	mulxq	%r13,%r13,%r12
54	addq	%r13,%r9
55	movq	%r12,%r10
56	adcq	$0,%r10
57
/* r13 = (r8 * r9) masked to 52 bits. */
58	movq	%r8,%r13
59	imulq	%r9,%r13
60	andq	%rax,%r13
61
/* ymm2 = that factor broadcast; r10:r9 += qword at 0(%rcx) * factor. */
62	vpbroadcastq	%r13,%ymm2
63	movq	0(%rcx),%rdx
64	mulxq	%r13,%r13,%r12
65	addq	%r13,%r9
66	adcq	%r12,%r10
67
/* r9 = low 64 bits of (r10:r9 >> 52). */
68	shrq	$52,%r9
69	salq	$12,%r10
70	orq	%r10,%r9
71
/* Add the low 52 bits of each lane product (rsi operand * ymm1). */
72	vpmadd52luq	0(%rsi),%ymm1,%ymm3
73	vpmadd52luq	32(%rsi),%ymm1,%ymm4
74	vpmadd52luq	64(%rsi),%ymm1,%ymm5
75	vpmadd52luq	96(%rsi),%ymm1,%ymm6
76	vpmadd52luq	128(%rsi),%ymm1,%ymm7
77	vpmadd52luq	160(%rsi),%ymm1,%ymm8
78	vpmadd52luq	192(%rsi),%ymm1,%ymm9
79	vpmadd52luq	224(%rsi),%ymm1,%ymm10
80
/* Add the low 52 bits of each lane product (rcx operand * ymm2). */
81	vpmadd52luq	0(%rcx),%ymm2,%ymm3
82	vpmadd52luq	32(%rcx),%ymm2,%ymm4
83	vpmadd52luq	64(%rcx),%ymm2,%ymm5
84	vpmadd52luq	96(%rcx),%ymm2,%ymm6
85	vpmadd52luq	128(%rcx),%ymm2,%ymm7
86	vpmadd52luq	160(%rcx),%ymm2,%ymm8
87	vpmadd52luq	192(%rcx),%ymm2,%ymm9
88	vpmadd52luq	224(%rcx),%ymm2,%ymm10
89
90
/*
 * Shift the 32-lane accumulator down by one qword lane: each vector takes
 * its own lanes 1..3 plus lane 0 of the next vector; the zero register
 * ymm0 feeds the top lane of ymm10.
 */
91	valignq	$1,%ymm3,%ymm4,%ymm3
92	valignq	$1,%ymm4,%ymm5,%ymm4
93	valignq	$1,%ymm5,%ymm6,%ymm5
94	valignq	$1,%ymm6,%ymm7,%ymm6
95	valignq	$1,%ymm7,%ymm8,%ymm7
96	valignq	$1,%ymm8,%ymm9,%ymm8
97	valignq	$1,%ymm9,%ymm10,%ymm9
98	valignq	$1,%ymm10,%ymm0,%ymm10
99
/* Fold the new lowest lane into the scalar accumulator r9. */
100	vmovq	%xmm3,%r13
101	addq	%r13,%r9
102
/* Add the high 52 bits of each lane product into the shifted accumulator. */
103	vpmadd52huq	0(%rsi),%ymm1,%ymm3
104	vpmadd52huq	32(%rsi),%ymm1,%ymm4
105	vpmadd52huq	64(%rsi),%ymm1,%ymm5
106	vpmadd52huq	96(%rsi),%ymm1,%ymm6
107	vpmadd52huq	128(%rsi),%ymm1,%ymm7
108	vpmadd52huq	160(%rsi),%ymm1,%ymm8
109	vpmadd52huq	192(%rsi),%ymm1,%ymm9
110	vpmadd52huq	224(%rsi),%ymm1,%ymm10
111
112	vpmadd52huq	0(%rcx),%ymm2,%ymm3
113	vpmadd52huq	32(%rcx),%ymm2,%ymm4
114	vpmadd52huq	64(%rcx),%ymm2,%ymm5
115	vpmadd52huq	96(%rcx),%ymm2,%ymm6
116	vpmadd52huq	128(%rcx),%ymm2,%ymm7
117	vpmadd52huq	160(%rcx),%ymm2,%ymm8
118	vpmadd52huq	192(%rcx),%ymm2,%ymm9
119	vpmadd52huq	224(%rcx),%ymm2,%ymm10
/* ---- step using the qword at 8(%r11); same sequence as above ---- */
120	movq	8(%r11),%r13
121
122	vpbroadcastq	%r13,%ymm1
123	movq	0(%rsi),%rdx
124	mulxq	%r13,%r13,%r12
125	addq	%r13,%r9
126	movq	%r12,%r10
127	adcq	$0,%r10
128
129	movq	%r8,%r13
130	imulq	%r9,%r13
131	andq	%rax,%r13
132
133	vpbroadcastq	%r13,%ymm2
134	movq	0(%rcx),%rdx
135	mulxq	%r13,%r13,%r12
136	addq	%r13,%r9
137	adcq	%r12,%r10
138
139	shrq	$52,%r9
140	salq	$12,%r10
141	orq	%r10,%r9
142
143	vpmadd52luq	0(%rsi),%ymm1,%ymm3
144	vpmadd52luq	32(%rsi),%ymm1,%ymm4
145	vpmadd52luq	64(%rsi),%ymm1,%ymm5
146	vpmadd52luq	96(%rsi),%ymm1,%ymm6
147	vpmadd52luq	128(%rsi),%ymm1,%ymm7
148	vpmadd52luq	160(%rsi),%ymm1,%ymm8
149	vpmadd52luq	192(%rsi),%ymm1,%ymm9
150	vpmadd52luq	224(%rsi),%ymm1,%ymm10
151
152	vpmadd52luq	0(%rcx),%ymm2,%ymm3
153	vpmadd52luq	32(%rcx),%ymm2,%ymm4
154	vpmadd52luq	64(%rcx),%ymm2,%ymm5
155	vpmadd52luq	96(%rcx),%ymm2,%ymm6
156	vpmadd52luq	128(%rcx),%ymm2,%ymm7
157	vpmadd52luq	160(%rcx),%ymm2,%ymm8
158	vpmadd52luq	192(%rcx),%ymm2,%ymm9
159	vpmadd52luq	224(%rcx),%ymm2,%ymm10
160
161
162	valignq	$1,%ymm3,%ymm4,%ymm3
163	valignq	$1,%ymm4,%ymm5,%ymm4
164	valignq	$1,%ymm5,%ymm6,%ymm5
165	valignq	$1,%ymm6,%ymm7,%ymm6
166	valignq	$1,%ymm7,%ymm8,%ymm7
167	valignq	$1,%ymm8,%ymm9,%ymm8
168	valignq	$1,%ymm9,%ymm10,%ymm9
169	valignq	$1,%ymm10,%ymm0,%ymm10
170
171	vmovq	%xmm3,%r13
172	addq	%r13,%r9
173
174	vpmadd52huq	0(%rsi),%ymm1,%ymm3
175	vpmadd52huq	32(%rsi),%ymm1,%ymm4
176	vpmadd52huq	64(%rsi),%ymm1,%ymm5
177	vpmadd52huq	96(%rsi),%ymm1,%ymm6
178	vpmadd52huq	128(%rsi),%ymm1,%ymm7
179	vpmadd52huq	160(%rsi),%ymm1,%ymm8
180	vpmadd52huq	192(%rsi),%ymm1,%ymm9
181	vpmadd52huq	224(%rsi),%ymm1,%ymm10
182
183	vpmadd52huq	0(%rcx),%ymm2,%ymm3
184	vpmadd52huq	32(%rcx),%ymm2,%ymm4
185	vpmadd52huq	64(%rcx),%ymm2,%ymm5
186	vpmadd52huq	96(%rcx),%ymm2,%ymm6
187	vpmadd52huq	128(%rcx),%ymm2,%ymm7
188	vpmadd52huq	160(%rcx),%ymm2,%ymm8
189	vpmadd52huq	192(%rcx),%ymm2,%ymm9
190	vpmadd52huq	224(%rcx),%ymm2,%ymm10
/* ---- step using the qword at 16(%r11); same sequence as above ---- */
191	movq	16(%r11),%r13
192
193	vpbroadcastq	%r13,%ymm1
194	movq	0(%rsi),%rdx
195	mulxq	%r13,%r13,%r12
196	addq	%r13,%r9
197	movq	%r12,%r10
198	adcq	$0,%r10
199
200	movq	%r8,%r13
201	imulq	%r9,%r13
202	andq	%rax,%r13
203
204	vpbroadcastq	%r13,%ymm2
205	movq	0(%rcx),%rdx
206	mulxq	%r13,%r13,%r12
207	addq	%r13,%r9
208	adcq	%r12,%r10
209
210	shrq	$52,%r9
211	salq	$12,%r10
212	orq	%r10,%r9
213
214	vpmadd52luq	0(%rsi),%ymm1,%ymm3
215	vpmadd52luq	32(%rsi),%ymm1,%ymm4
216	vpmadd52luq	64(%rsi),%ymm1,%ymm5
217	vpmadd52luq	96(%rsi),%ymm1,%ymm6
218	vpmadd52luq	128(%rsi),%ymm1,%ymm7
219	vpmadd52luq	160(%rsi),%ymm1,%ymm8
220	vpmadd52luq	192(%rsi),%ymm1,%ymm9
221	vpmadd52luq	224(%rsi),%ymm1,%ymm10
222
223	vpmadd52luq	0(%rcx),%ymm2,%ymm3
224	vpmadd52luq	32(%rcx),%ymm2,%ymm4
225	vpmadd52luq	64(%rcx),%ymm2,%ymm5
226	vpmadd52luq	96(%rcx),%ymm2,%ymm6
227	vpmadd52luq	128(%rcx),%ymm2,%ymm7
228	vpmadd52luq	160(%rcx),%ymm2,%ymm8
229	vpmadd52luq	192(%rcx),%ymm2,%ymm9
230	vpmadd52luq	224(%rcx),%ymm2,%ymm10
231
232
233	valignq	$1,%ymm3,%ymm4,%ymm3
234	valignq	$1,%ymm4,%ymm5,%ymm4
235	valignq	$1,%ymm5,%ymm6,%ymm5
236	valignq	$1,%ymm6,%ymm7,%ymm6
237	valignq	$1,%ymm7,%ymm8,%ymm7
238	valignq	$1,%ymm8,%ymm9,%ymm8
239	valignq	$1,%ymm9,%ymm10,%ymm9
240	valignq	$1,%ymm10,%ymm0,%ymm10
241
242	vmovq	%xmm3,%r13
243	addq	%r13,%r9
244
245	vpmadd52huq	0(%rsi),%ymm1,%ymm3
246	vpmadd52huq	32(%rsi),%ymm1,%ymm4
247	vpmadd52huq	64(%rsi),%ymm1,%ymm5
248	vpmadd52huq	96(%rsi),%ymm1,%ymm6
249	vpmadd52huq	128(%rsi),%ymm1,%ymm7
250	vpmadd52huq	160(%rsi),%ymm1,%ymm8
251	vpmadd52huq	192(%rsi),%ymm1,%ymm9
252	vpmadd52huq	224(%rsi),%ymm1,%ymm10
253
254	vpmadd52huq	0(%rcx),%ymm2,%ymm3
255	vpmadd52huq	32(%rcx),%ymm2,%ymm4
256	vpmadd52huq	64(%rcx),%ymm2,%ymm5
257	vpmadd52huq	96(%rcx),%ymm2,%ymm6
258	vpmadd52huq	128(%rcx),%ymm2,%ymm7
259	vpmadd52huq	160(%rcx),%ymm2,%ymm8
260	vpmadd52huq	192(%rcx),%ymm2,%ymm9
261	vpmadd52huq	224(%rcx),%ymm2,%ymm10
/* ---- step using the qword at 24(%r11); same sequence as above ---- */
262	movq	24(%r11),%r13
263
264	vpbroadcastq	%r13,%ymm1
265	movq	0(%rsi),%rdx
266	mulxq	%r13,%r13,%r12
267	addq	%r13,%r9
268	movq	%r12,%r10
269	adcq	$0,%r10
270
271	movq	%r8,%r13
272	imulq	%r9,%r13
273	andq	%rax,%r13
274
275	vpbroadcastq	%r13,%ymm2
276	movq	0(%rcx),%rdx
277	mulxq	%r13,%r13,%r12
278	addq	%r13,%r9
279	adcq	%r12,%r10
280
281	shrq	$52,%r9
282	salq	$12,%r10
283	orq	%r10,%r9
284
285	vpmadd52luq	0(%rsi),%ymm1,%ymm3
286	vpmadd52luq	32(%rsi),%ymm1,%ymm4
287	vpmadd52luq	64(%rsi),%ymm1,%ymm5
288	vpmadd52luq	96(%rsi),%ymm1,%ymm6
289	vpmadd52luq	128(%rsi),%ymm1,%ymm7
290	vpmadd52luq	160(%rsi),%ymm1,%ymm8
291	vpmadd52luq	192(%rsi),%ymm1,%ymm9
292	vpmadd52luq	224(%rsi),%ymm1,%ymm10
293
294	vpmadd52luq	0(%rcx),%ymm2,%ymm3
295	vpmadd52luq	32(%rcx),%ymm2,%ymm4
296	vpmadd52luq	64(%rcx),%ymm2,%ymm5
297	vpmadd52luq	96(%rcx),%ymm2,%ymm6
298	vpmadd52luq	128(%rcx),%ymm2,%ymm7
299	vpmadd52luq	160(%rcx),%ymm2,%ymm8
300	vpmadd52luq	192(%rcx),%ymm2,%ymm9
301	vpmadd52luq	224(%rcx),%ymm2,%ymm10
302
303
304	valignq	$1,%ymm3,%ymm4,%ymm3
305	valignq	$1,%ymm4,%ymm5,%ymm4
306	valignq	$1,%ymm5,%ymm6,%ymm5
307	valignq	$1,%ymm6,%ymm7,%ymm6
308	valignq	$1,%ymm7,%ymm8,%ymm7
309	valignq	$1,%ymm8,%ymm9,%ymm8
310	valignq	$1,%ymm9,%ymm10,%ymm9
311	valignq	$1,%ymm10,%ymm0,%ymm10
312
313	vmovq	%xmm3,%r13
314	addq	%r13,%r9
315
316	vpmadd52huq	0(%rsi),%ymm1,%ymm3
317	vpmadd52huq	32(%rsi),%ymm1,%ymm4
318	vpmadd52huq	64(%rsi),%ymm1,%ymm5
319	vpmadd52huq	96(%rsi),%ymm1,%ymm6
320	vpmadd52huq	128(%rsi),%ymm1,%ymm7
321	vpmadd52huq	160(%rsi),%ymm1,%ymm8
322	vpmadd52huq	192(%rsi),%ymm1,%ymm9
323	vpmadd52huq	224(%rsi),%ymm1,%ymm10
324
325	vpmadd52huq	0(%rcx),%ymm2,%ymm3
326	vpmadd52huq	32(%rcx),%ymm2,%ymm4
327	vpmadd52huq	64(%rcx),%ymm2,%ymm5
328	vpmadd52huq	96(%rcx),%ymm2,%ymm6
329	vpmadd52huq	128(%rcx),%ymm2,%ymm7
330	vpmadd52huq	160(%rcx),%ymm2,%ymm8
331	vpmadd52huq	192(%rcx),%ymm2,%ymm9
332	vpmadd52huq	224(%rcx),%ymm2,%ymm10
/* Advance r11 by the four qwords just consumed and loop while ebx != 0. */
333	leaq	32(%r11),%r11
334	decl	%ebx
335	jne	.Lloop7
/* ---- tail step 29: qword at 0(%r11); same sequence as in the loop ---- */
336	movq	0(%r11),%r13
337
338	vpbroadcastq	%r13,%ymm1
339	movq	0(%rsi),%rdx
340	mulxq	%r13,%r13,%r12
341	addq	%r13,%r9
342	movq	%r12,%r10
343	adcq	$0,%r10
344
345	movq	%r8,%r13
346	imulq	%r9,%r13
347	andq	%rax,%r13
348
349	vpbroadcastq	%r13,%ymm2
350	movq	0(%rcx),%rdx
351	mulxq	%r13,%r13,%r12
352	addq	%r13,%r9
353	adcq	%r12,%r10
354
355	shrq	$52,%r9
356	salq	$12,%r10
357	orq	%r10,%r9
358
359	vpmadd52luq	0(%rsi),%ymm1,%ymm3
360	vpmadd52luq	32(%rsi),%ymm1,%ymm4
361	vpmadd52luq	64(%rsi),%ymm1,%ymm5
362	vpmadd52luq	96(%rsi),%ymm1,%ymm6
363	vpmadd52luq	128(%rsi),%ymm1,%ymm7
364	vpmadd52luq	160(%rsi),%ymm1,%ymm8
365	vpmadd52luq	192(%rsi),%ymm1,%ymm9
366	vpmadd52luq	224(%rsi),%ymm1,%ymm10
367
368	vpmadd52luq	0(%rcx),%ymm2,%ymm3
369	vpmadd52luq	32(%rcx),%ymm2,%ymm4
370	vpmadd52luq	64(%rcx),%ymm2,%ymm5
371	vpmadd52luq	96(%rcx),%ymm2,%ymm6
372	vpmadd52luq	128(%rcx),%ymm2,%ymm7
373	vpmadd52luq	160(%rcx),%ymm2,%ymm8
374	vpmadd52luq	192(%rcx),%ymm2,%ymm9
375	vpmadd52luq	224(%rcx),%ymm2,%ymm10
376
377
378	valignq	$1,%ymm3,%ymm4,%ymm3
379	valignq	$1,%ymm4,%ymm5,%ymm4
380	valignq	$1,%ymm5,%ymm6,%ymm5
381	valignq	$1,%ymm6,%ymm7,%ymm6
382	valignq	$1,%ymm7,%ymm8,%ymm7
383	valignq	$1,%ymm8,%ymm9,%ymm8
384	valignq	$1,%ymm9,%ymm10,%ymm9
385	valignq	$1,%ymm10,%ymm0,%ymm10
386
387	vmovq	%xmm3,%r13
388	addq	%r13,%r9
389
390	vpmadd52huq	0(%rsi),%ymm1,%ymm3
391	vpmadd52huq	32(%rsi),%ymm1,%ymm4
392	vpmadd52huq	64(%rsi),%ymm1,%ymm5
393	vpmadd52huq	96(%rsi),%ymm1,%ymm6
394	vpmadd52huq	128(%rsi),%ymm1,%ymm7
395	vpmadd52huq	160(%rsi),%ymm1,%ymm8
396	vpmadd52huq	192(%rsi),%ymm1,%ymm9
397	vpmadd52huq	224(%rsi),%ymm1,%ymm10
398
399	vpmadd52huq	0(%rcx),%ymm2,%ymm3
400	vpmadd52huq	32(%rcx),%ymm2,%ymm4
401	vpmadd52huq	64(%rcx),%ymm2,%ymm5
402	vpmadd52huq	96(%rcx),%ymm2,%ymm6
403	vpmadd52huq	128(%rcx),%ymm2,%ymm7
404	vpmadd52huq	160(%rcx),%ymm2,%ymm8
405	vpmadd52huq	192(%rcx),%ymm2,%ymm9
406	vpmadd52huq	224(%rcx),%ymm2,%ymm10
/* ---- tail step 30: qword at 8(%r11); same sequence as in the loop ---- */
407	movq	8(%r11),%r13
408
409	vpbroadcastq	%r13,%ymm1
410	movq	0(%rsi),%rdx
411	mulxq	%r13,%r13,%r12
412	addq	%r13,%r9
413	movq	%r12,%r10
414	adcq	$0,%r10
415
416	movq	%r8,%r13
417	imulq	%r9,%r13
418	andq	%rax,%r13
419
420	vpbroadcastq	%r13,%ymm2
421	movq	0(%rcx),%rdx
422	mulxq	%r13,%r13,%r12
423	addq	%r13,%r9
424	adcq	%r12,%r10
425
426	shrq	$52,%r9
427	salq	$12,%r10
428	orq	%r10,%r9
429
430	vpmadd52luq	0(%rsi),%ymm1,%ymm3
431	vpmadd52luq	32(%rsi),%ymm1,%ymm4
432	vpmadd52luq	64(%rsi),%ymm1,%ymm5
433	vpmadd52luq	96(%rsi),%ymm1,%ymm6
434	vpmadd52luq	128(%rsi),%ymm1,%ymm7
435	vpmadd52luq	160(%rsi),%ymm1,%ymm8
436	vpmadd52luq	192(%rsi),%ymm1,%ymm9
437	vpmadd52luq	224(%rsi),%ymm1,%ymm10
438
439	vpmadd52luq	0(%rcx),%ymm2,%ymm3
440	vpmadd52luq	32(%rcx),%ymm2,%ymm4
441	vpmadd52luq	64(%rcx),%ymm2,%ymm5
442	vpmadd52luq	96(%rcx),%ymm2,%ymm6
443	vpmadd52luq	128(%rcx),%ymm2,%ymm7
444	vpmadd52luq	160(%rcx),%ymm2,%ymm8
445	vpmadd52luq	192(%rcx),%ymm2,%ymm9
446	vpmadd52luq	224(%rcx),%ymm2,%ymm10
447
448
449	valignq	$1,%ymm3,%ymm4,%ymm3
450	valignq	$1,%ymm4,%ymm5,%ymm4
451	valignq	$1,%ymm5,%ymm6,%ymm5
452	valignq	$1,%ymm6,%ymm7,%ymm6
453	valignq	$1,%ymm7,%ymm8,%ymm7
454	valignq	$1,%ymm8,%ymm9,%ymm8
455	valignq	$1,%ymm9,%ymm10,%ymm9
456	valignq	$1,%ymm10,%ymm0,%ymm10
457
458	vmovq	%xmm3,%r13
459	addq	%r13,%r9
460
461	vpmadd52huq	0(%rsi),%ymm1,%ymm3
462	vpmadd52huq	32(%rsi),%ymm1,%ymm4
463	vpmadd52huq	64(%rsi),%ymm1,%ymm5
464	vpmadd52huq	96(%rsi),%ymm1,%ymm6
465	vpmadd52huq	128(%rsi),%ymm1,%ymm7
466	vpmadd52huq	160(%rsi),%ymm1,%ymm8
467	vpmadd52huq	192(%rsi),%ymm1,%ymm9
468	vpmadd52huq	224(%rsi),%ymm1,%ymm10
469
470	vpmadd52huq	0(%rcx),%ymm2,%ymm3
471	vpmadd52huq	32(%rcx),%ymm2,%ymm4
472	vpmadd52huq	64(%rcx),%ymm2,%ymm5
473	vpmadd52huq	96(%rcx),%ymm2,%ymm6
474	vpmadd52huq	128(%rcx),%ymm2,%ymm7
475	vpmadd52huq	160(%rcx),%ymm2,%ymm8
476	vpmadd52huq	192(%rcx),%ymm2,%ymm9
477	vpmadd52huq	224(%rcx),%ymm2,%ymm10
478
/* Replace lane 0 of ymm3 with the scalar accumulator r9 (ymm0 is reused as scratch from here on). */
479	vpbroadcastq	%r9,%ymm0
480	vpblendd	$3,%ymm0,%ymm3,%ymm3
481
482
483
/* Carry normalisation: take the bits above bit 51 of every lane ... */
484	vpsrlq	$52,%ymm3,%ymm0
485	vpsrlq	$52,%ymm4,%ymm1
486	vpsrlq	$52,%ymm5,%ymm2
487	vpsrlq	$52,%ymm6,%ymm19
488	vpsrlq	$52,%ymm7,%ymm20
489	vpsrlq	$52,%ymm8,%ymm21
490	vpsrlq	$52,%ymm9,%ymm22
491	vpsrlq	$52,%ymm10,%ymm23
492
493
/* ... move each of them up by one lane (.Lzeros feeds lane 0 of ymm0) ... */
494	valignq	$3,%ymm22,%ymm23,%ymm23
495	valignq	$3,%ymm21,%ymm22,%ymm22
496	valignq	$3,%ymm20,%ymm21,%ymm21
497	valignq	$3,%ymm19,%ymm20,%ymm20
498	valignq	$3,%ymm2,%ymm19,%ymm19
499	valignq	$3,%ymm1,%ymm2,%ymm2
500	valignq	$3,%ymm0,%ymm1,%ymm1
501	valignq	$3,.Lzeros(%rip),%ymm0,%ymm0
502
503
/* ... reduce every lane to its low 52 bits ... */
504	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
505	vpandq	.Lmask52x4(%rip),%ymm4,%ymm4
506	vpandq	.Lmask52x4(%rip),%ymm5,%ymm5
507	vpandq	.Lmask52x4(%rip),%ymm6,%ymm6
508	vpandq	.Lmask52x4(%rip),%ymm7,%ymm7
509	vpandq	.Lmask52x4(%rip),%ymm8,%ymm8
510	vpandq	.Lmask52x4(%rip),%ymm9,%ymm9
511	vpandq	.Lmask52x4(%rip),%ymm10,%ymm10
512
513
/* ... and add the shifted carries. */
514	vpaddq	%ymm0,%ymm3,%ymm3
515	vpaddq	%ymm1,%ymm4,%ymm4
516	vpaddq	%ymm2,%ymm5,%ymm5
517	vpaddq	%ymm19,%ymm6,%ymm6
518	vpaddq	%ymm20,%ymm7,%ymm7
519	vpaddq	%ymm21,%ymm8,%ymm8
520	vpaddq	%ymm22,%ymm9,%ymm9
521	vpaddq	%ymm23,%ymm10,%ymm10
522
523
524
/*
 * Compare predicate 6 (unsigned greater-than): mark lanes that now exceed
 * 2^52-1. Two 4-bit vector masks are packed per byte, giving a 32-bit
 * lane map spread over r14b (lowest lanes), r13b, r12b, r11b.
 */
525	vpcmpuq	$6,.Lmask52x4(%rip),%ymm3,%k1
526	vpcmpuq	$6,.Lmask52x4(%rip),%ymm4,%k2
527	kmovb	%k1,%r14d
528	kmovb	%k2,%r13d
529	shlb	$4,%r13b
530	orb	%r13b,%r14b
531
532	vpcmpuq	$6,.Lmask52x4(%rip),%ymm5,%k1
533	vpcmpuq	$6,.Lmask52x4(%rip),%ymm6,%k2
534	kmovb	%k1,%r13d
535	kmovb	%k2,%r12d
536	shlb	$4,%r12b
537	orb	%r12b,%r13b
538
539	vpcmpuq	$6,.Lmask52x4(%rip),%ymm7,%k1
540	vpcmpuq	$6,.Lmask52x4(%rip),%ymm8,%k2
541	kmovb	%k1,%r12d
542	kmovb	%k2,%r11d
543	shlb	$4,%r11b
544	orb	%r11b,%r12b
545
546	vpcmpuq	$6,.Lmask52x4(%rip),%ymm9,%k1
547	vpcmpuq	$6,.Lmask52x4(%rip),%ymm10,%k2
548	kmovb	%k1,%r11d
549	kmovb	%k2,%r10d
550	shlb	$4,%r10b
551	orb	%r10b,%r11b
552
/* Shift the 32-bit lane map left by one bit through the add/adc chain. */
553	addb	%r14b,%r14b
554	adcb	%r13b,%r13b
555	adcb	%r12b,%r12b
556	adcb	%r11b,%r11b
557
558
/* Compare predicate 0 (equal): lanes equal to 2^52-1, packed the same way into r9b, r8b, dl, cl. */
559	vpcmpuq	$0,.Lmask52x4(%rip),%ymm3,%k1
560	vpcmpuq	$0,.Lmask52x4(%rip),%ymm4,%k2
561	kmovb	%k1,%r9d
562	kmovb	%k2,%r8d
563	shlb	$4,%r8b
564	orb	%r8b,%r9b
565
566	vpcmpuq	$0,.Lmask52x4(%rip),%ymm5,%k1
567	vpcmpuq	$0,.Lmask52x4(%rip),%ymm6,%k2
568	kmovb	%k1,%r8d
569	kmovb	%k2,%edx
570	shlb	$4,%dl
571	orb	%dl,%r8b
572
573	vpcmpuq	$0,.Lmask52x4(%rip),%ymm7,%k1
574	vpcmpuq	$0,.Lmask52x4(%rip),%ymm8,%k2
575	kmovb	%k1,%edx
576	kmovb	%k2,%ecx
577	shlb	$4,%cl
578	orb	%cl,%dl
579
580	vpcmpuq	$0,.Lmask52x4(%rip),%ymm9,%k1
581	vpcmpuq	$0,.Lmask52x4(%rip),%ymm10,%k2
582	kmovb	%k1,%ecx
583	kmovb	%k2,%ebx
584	shlb	$4,%bl
585	orb	%bl,%cl
586
/* Add the "equal" map to the shifted "greater" map so a carry ripples through runs of saturated lanes. */
587	addb	%r9b,%r14b
588	adcb	%r8b,%r13b
589	adcb	%dl,%r12b
590	adcb	%cl,%r11b
591
/* XOR with the "equal" map leaves the set of lanes that must take the correction below. */
592	xorb	%r9b,%r14b
593	xorb	%r8b,%r13b
594	xorb	%dl,%r12b
595	xorb	%cl,%r11b
596
/* Hand each nibble to its own mask register (only the low four bits matter for a ymm qword op). */
597	kmovb	%r14d,%k1
598	shrb	$4,%r14b
599	kmovb	%r14d,%k2
600	kmovb	%r13d,%k3
601	shrb	$4,%r13b
602	kmovb	%r13d,%k4
603	kmovb	%r12d,%k5
604	shrb	$4,%r12b
605	kmovb	%r12d,%k6
606	kmovb	%r11d,%k7
607
/* Subtract 2^52-1 in the selected lanes, then mask every lane back to 52 bits. */
608	vpsubq	.Lmask52x4(%rip),%ymm3,%ymm3{%k1}
609	vpsubq	.Lmask52x4(%rip),%ymm4,%ymm4{%k2}
610	vpsubq	.Lmask52x4(%rip),%ymm5,%ymm5{%k3}
611	vpsubq	.Lmask52x4(%rip),%ymm6,%ymm6{%k4}
612	vpsubq	.Lmask52x4(%rip),%ymm7,%ymm7{%k5}
613	vpsubq	.Lmask52x4(%rip),%ymm8,%ymm8{%k6}
614	vpsubq	.Lmask52x4(%rip),%ymm9,%ymm9{%k7}
615
616	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
617	vpandq	.Lmask52x4(%rip),%ymm4,%ymm4
618	vpandq	.Lmask52x4(%rip),%ymm5,%ymm5
619	vpandq	.Lmask52x4(%rip),%ymm6,%ymm6
620	vpandq	.Lmask52x4(%rip),%ymm7,%ymm7
621	vpandq	.Lmask52x4(%rip),%ymm8,%ymm8
622	vpandq	.Lmask52x4(%rip),%ymm9,%ymm9
623
/* k1..k7 are all in use above, so the eighth vector reuses k1 afterwards. */
624	shrb	$4,%r11b
625	kmovb	%r11d,%k1
626
627	vpsubq	.Lmask52x4(%rip),%ymm10,%ymm10{%k1}
628
629	vpandq	.Lmask52x4(%rip),%ymm10,%ymm10
630
/* Store the 256-byte result to the buffer at rdi. */
631	vmovdqu64	%ymm3,0(%rdi)
632	vmovdqu64	%ymm4,32(%rdi)
633	vmovdqu64	%ymm5,64(%rdi)
634	vmovdqu64	%ymm6,96(%rdi)
635	vmovdqu64	%ymm7,128(%rdi)
636	vmovdqu64	%ymm8,160(%rdi)
637	vmovdqu64	%ymm9,192(%rdi)
638	vmovdqu64	%ymm10,224(%rdi)
639
/* Epilogue: reload the six saved registers (reverse push order) and release their 48 bytes. */
640	vzeroupper
641	leaq	(%rsp),%rax
642.cfi_def_cfa_register	%rax
643	movq	0(%rax),%r15
644.cfi_restore	%r15
645	movq	8(%rax),%r14
646.cfi_restore	%r14
647	movq	16(%rax),%r13
648.cfi_restore	%r13
649	movq	24(%rax),%r12
650.cfi_restore	%r12
651	movq	32(%rax),%rbp
652.cfi_restore	%rbp
653	movq	40(%rax),%rbx
654.cfi_restore	%rbx
655	leaq	48(%rax),%rsp
656.cfi_def_cfa	%rsp,8
657.Lossl_rsaz_amm52x30_x1_ifma256_epilogue:
/* Bytes F3 C3 encode "rep ret". */
658	.byte	0xf3,0xc3
659.cfi_endproc
660.size	ossl_rsaz_amm52x30_x1_ifma256, .-ossl_rsaz_amm52x30_x1_ifma256
/*
 * .Lmask52x4: four qwords of 2^52-1, 32-byte aligned. Used as a memory
 * operand by the vpandq / vpcmpuq / vpsubq instructions of the
 * carry-normalisation code in this file. Execution returns to .text after it.
 */
661.section	.rodata
662.align	32
663.Lmask52x4:
664.quad	0xfffffffffffff
665.quad	0xfffffffffffff
666.quad	0xfffffffffffff
667.quad	0xfffffffffffff
668.text
669
/*
 * ossl_rsaz_amm52x30_x2_ifma256
 *
 * Runs two independent multiply-accumulate-and-normalise computations of
 * the same shape, interleaved step by step in one 30-pass loop.
 * NOTE(review): presumably the dual-operand variant of OpenSSL's 30-digit
 * radix-2^52 "almost Montgomery multiplication"; the C prototype is not
 * visible in this file - confirm against the generator script.
 *
 * Register use as seen in the code (AT&T syntax):
 *   rdi - destination; 512 bytes are stored (two 256-byte results)
 *   rsi - first operands: bytes 0..255 for set 1, 256..511 for set 2
 *   rdx - second operands; copied to r11, one qword per step read at
 *         0(%r11) for set 1 and at 256(%r11) for set 2
 *   rcx - third operands, laid out like rsi (presumably the two moduli -
 *         TODO confirm)
 *   r8  - pointer to two qwords: (%r8) is used for set 1 and 8(%r8) for
 *         set 2 (presumably the two k0 constants - TODO confirm)
 *
 * Set 1 accumulates in ymm3..ymm10 with scalar r9; set 2 accumulates in
 * ymm11..ymm18 with scalar r15. ymm0 stays zero during the loop and rax
 * holds the 52-bit mask.
 */
670.globl	ossl_rsaz_amm52x30_x2_ifma256
671.type	ossl_rsaz_amm52x30_x2_ifma256,@function
672.align	32
673ossl_rsaz_amm52x30_x2_ifma256:
674.cfi_startproc
/* Bytes F3 0F 1E FA encode the endbr64 instruction. */
675.byte	243,15,30,250
/* Save rbx, rbp and r12-r15; the .cfi_* lines record where each one lands. */
676	pushq	%rbx
677.cfi_adjust_cfa_offset	8
678.cfi_offset	%rbx,-16
679	pushq	%rbp
680.cfi_adjust_cfa_offset	8
681.cfi_offset	%rbp,-24
682	pushq	%r12
683.cfi_adjust_cfa_offset	8
684.cfi_offset	%r12,-32
685	pushq	%r13
686.cfi_adjust_cfa_offset	8
687.cfi_offset	%r13,-40
688	pushq	%r14
689.cfi_adjust_cfa_offset	8
690.cfi_offset	%r14,-48
691	pushq	%r15
692.cfi_adjust_cfa_offset	8
693.cfi_offset	%r15,-56
694
/* Zero ymm0 and the set-1 accumulator ymm3..ymm10. */
695	vpxord	%ymm0,%ymm0,%ymm0
696	vmovdqa64	%ymm0,%ymm3
697	vmovdqa64	%ymm0,%ymm4
698	vmovdqa64	%ymm0,%ymm5
699	vmovdqa64	%ymm0,%ymm6
700	vmovdqa64	%ymm0,%ymm7
701	vmovdqa64	%ymm0,%ymm8
702	vmovdqa64	%ymm0,%ymm9
703	vmovdqa64	%ymm0,%ymm10
704
/* Zero the set-2 accumulator ymm11..ymm18. */
705	vmovdqa64	%ymm0,%ymm11
706	vmovdqa64	%ymm0,%ymm12
707	vmovdqa64	%ymm0,%ymm13
708	vmovdqa64	%ymm0,%ymm14
709	vmovdqa64	%ymm0,%ymm15
710	vmovdqa64	%ymm0,%ymm16
711	vmovdqa64	%ymm0,%ymm17
712	vmovdqa64	%ymm0,%ymm18
713
714
/* r9 / r15 = scalar accumulators for the lowest lane of set 1 / set 2. */
715	xorl	%r9d,%r9d
716	xorl	%r15d,%r15d
717
/* rdx is needed as the implicit mulx source, so walk the operand via r11. */
718	movq	%rdx,%r11
/* rax = 2^52 - 1, the mask for one 52-bit digit. */
719	movq	$0xfffffffffffff,%rax
720
/* 30 passes, each handling one qword per set. */
721	movl	$30,%ebx
722
723.align	32
724.Lloop30:
/* ---- set 1: qword at 0(%r11) ---- */
725	movq	0(%r11),%r13
726
/* ymm1 = that qword broadcast; r12:r13 = qword at 0(%rsi) * r13, added into r10:r9. */
727	vpbroadcastq	%r13,%ymm1
728	movq	0(%rsi),%rdx
729	mulxq	%r13,%r13,%r12
730	addq	%r13,%r9
731	movq	%r12,%r10
732	adcq	$0,%r10
733
/* r13 = (qword at (%r8) * r9) masked to 52 bits. */
734	movq	(%r8),%r13
735	imulq	%r9,%r13
736	andq	%rax,%r13
737
/* ymm2 = that factor broadcast; r10:r9 += qword at 0(%rcx) * factor. */
738	vpbroadcastq	%r13,%ymm2
739	movq	0(%rcx),%rdx
740	mulxq	%r13,%r13,%r12
741	addq	%r13,%r9
742	adcq	%r12,%r10
743
/* r9 = low 64 bits of (r10:r9 >> 52). */
744	shrq	$52,%r9
745	salq	$12,%r10
746	orq	%r10,%r9
747
/* Add the low 52 bits of each lane product into ymm3..ymm10. */
748	vpmadd52luq	0(%rsi),%ymm1,%ymm3
749	vpmadd52luq	32(%rsi),%ymm1,%ymm4
750	vpmadd52luq	64(%rsi),%ymm1,%ymm5
751	vpmadd52luq	96(%rsi),%ymm1,%ymm6
752	vpmadd52luq	128(%rsi),%ymm1,%ymm7
753	vpmadd52luq	160(%rsi),%ymm1,%ymm8
754	vpmadd52luq	192(%rsi),%ymm1,%ymm9
755	vpmadd52luq	224(%rsi),%ymm1,%ymm10
756
757	vpmadd52luq	0(%rcx),%ymm2,%ymm3
758	vpmadd52luq	32(%rcx),%ymm2,%ymm4
759	vpmadd52luq	64(%rcx),%ymm2,%ymm5
760	vpmadd52luq	96(%rcx),%ymm2,%ymm6
761	vpmadd52luq	128(%rcx),%ymm2,%ymm7
762	vpmadd52luq	160(%rcx),%ymm2,%ymm8
763	vpmadd52luq	192(%rcx),%ymm2,%ymm9
764	vpmadd52luq	224(%rcx),%ymm2,%ymm10
765
766
/* Shift the set-1 accumulator down by one qword lane; zero enters the top lane of ymm10. */
767	valignq	$1,%ymm3,%ymm4,%ymm3
768	valignq	$1,%ymm4,%ymm5,%ymm4
769	valignq	$1,%ymm5,%ymm6,%ymm5
770	valignq	$1,%ymm6,%ymm7,%ymm6
771	valignq	$1,%ymm7,%ymm8,%ymm7
772	valignq	$1,%ymm8,%ymm9,%ymm8
773	valignq	$1,%ymm9,%ymm10,%ymm9
774	valignq	$1,%ymm10,%ymm0,%ymm10
775
/* Fold the new lowest lane into r9. */
776	vmovq	%xmm3,%r13
777	addq	%r13,%r9
778
/* Add the high 52 bits of each lane product into the shifted accumulator. */
779	vpmadd52huq	0(%rsi),%ymm1,%ymm3
780	vpmadd52huq	32(%rsi),%ymm1,%ymm4
781	vpmadd52huq	64(%rsi),%ymm1,%ymm5
782	vpmadd52huq	96(%rsi),%ymm1,%ymm6
783	vpmadd52huq	128(%rsi),%ymm1,%ymm7
784	vpmadd52huq	160(%rsi),%ymm1,%ymm8
785	vpmadd52huq	192(%rsi),%ymm1,%ymm9
786	vpmadd52huq	224(%rsi),%ymm1,%ymm10
787
788	vpmadd52huq	0(%rcx),%ymm2,%ymm3
789	vpmadd52huq	32(%rcx),%ymm2,%ymm4
790	vpmadd52huq	64(%rcx),%ymm2,%ymm5
791	vpmadd52huq	96(%rcx),%ymm2,%ymm6
792	vpmadd52huq	128(%rcx),%ymm2,%ymm7
793	vpmadd52huq	160(%rcx),%ymm2,%ymm8
794	vpmadd52huq	192(%rcx),%ymm2,%ymm9
795	vpmadd52huq	224(%rcx),%ymm2,%ymm10
/* ---- set 2: qword at 256(%r11); same sequence with r15, 8(%r8) and ymm11..ymm18 ---- */
796	movq	256(%r11),%r13
797
798	vpbroadcastq	%r13,%ymm1
799	movq	256(%rsi),%rdx
800	mulxq	%r13,%r13,%r12
801	addq	%r13,%r15
802	movq	%r12,%r10
803	adcq	$0,%r10
804
805	movq	8(%r8),%r13
806	imulq	%r15,%r13
807	andq	%rax,%r13
808
809	vpbroadcastq	%r13,%ymm2
810	movq	256(%rcx),%rdx
811	mulxq	%r13,%r13,%r12
812	addq	%r13,%r15
813	adcq	%r12,%r10
814
815	shrq	$52,%r15
816	salq	$12,%r10
817	orq	%r10,%r15
818
819	vpmadd52luq	256(%rsi),%ymm1,%ymm11
820	vpmadd52luq	288(%rsi),%ymm1,%ymm12
821	vpmadd52luq	320(%rsi),%ymm1,%ymm13
822	vpmadd52luq	352(%rsi),%ymm1,%ymm14
823	vpmadd52luq	384(%rsi),%ymm1,%ymm15
824	vpmadd52luq	416(%rsi),%ymm1,%ymm16
825	vpmadd52luq	448(%rsi),%ymm1,%ymm17
826	vpmadd52luq	480(%rsi),%ymm1,%ymm18
827
828	vpmadd52luq	256(%rcx),%ymm2,%ymm11
829	vpmadd52luq	288(%rcx),%ymm2,%ymm12
830	vpmadd52luq	320(%rcx),%ymm2,%ymm13
831	vpmadd52luq	352(%rcx),%ymm2,%ymm14
832	vpmadd52luq	384(%rcx),%ymm2,%ymm15
833	vpmadd52luq	416(%rcx),%ymm2,%ymm16
834	vpmadd52luq	448(%rcx),%ymm2,%ymm17
835	vpmadd52luq	480(%rcx),%ymm2,%ymm18
836
837
838	valignq	$1,%ymm11,%ymm12,%ymm11
839	valignq	$1,%ymm12,%ymm13,%ymm12
840	valignq	$1,%ymm13,%ymm14,%ymm13
841	valignq	$1,%ymm14,%ymm15,%ymm14
842	valignq	$1,%ymm15,%ymm16,%ymm15
843	valignq	$1,%ymm16,%ymm17,%ymm16
844	valignq	$1,%ymm17,%ymm18,%ymm17
845	valignq	$1,%ymm18,%ymm0,%ymm18
846
847	vmovq	%xmm11,%r13
848	addq	%r13,%r15
849
850	vpmadd52huq	256(%rsi),%ymm1,%ymm11
851	vpmadd52huq	288(%rsi),%ymm1,%ymm12
852	vpmadd52huq	320(%rsi),%ymm1,%ymm13
853	vpmadd52huq	352(%rsi),%ymm1,%ymm14
854	vpmadd52huq	384(%rsi),%ymm1,%ymm15
855	vpmadd52huq	416(%rsi),%ymm1,%ymm16
856	vpmadd52huq	448(%rsi),%ymm1,%ymm17
857	vpmadd52huq	480(%rsi),%ymm1,%ymm18
858
859	vpmadd52huq	256(%rcx),%ymm2,%ymm11
860	vpmadd52huq	288(%rcx),%ymm2,%ymm12
861	vpmadd52huq	320(%rcx),%ymm2,%ymm13
862	vpmadd52huq	352(%rcx),%ymm2,%ymm14
863	vpmadd52huq	384(%rcx),%ymm2,%ymm15
864	vpmadd52huq	416(%rcx),%ymm2,%ymm16
865	vpmadd52huq	448(%rcx),%ymm2,%ymm17
866	vpmadd52huq	480(%rcx),%ymm2,%ymm18
/* Advance r11 by one qword and loop while ebx != 0. */
867	leaq	8(%r11),%r11
868	decl	%ebx
869	jne	.Lloop30
870
/* ==== normalise set 1 (ymm3..ymm10, scalar r9) ==== */
/* Replace lane 0 of ymm3 with r9 (ymm0 is reused as scratch from here on). */
871	vpbroadcastq	%r9,%ymm0
872	vpblendd	$3,%ymm0,%ymm3,%ymm3
873
874
875
/* Take the bits above bit 51 of every lane ... */
876	vpsrlq	$52,%ymm3,%ymm0
877	vpsrlq	$52,%ymm4,%ymm1
878	vpsrlq	$52,%ymm5,%ymm2
879	vpsrlq	$52,%ymm6,%ymm19
880	vpsrlq	$52,%ymm7,%ymm20
881	vpsrlq	$52,%ymm8,%ymm21
882	vpsrlq	$52,%ymm9,%ymm22
883	vpsrlq	$52,%ymm10,%ymm23
884
885
/* ... move each of them up by one lane (.Lzeros feeds lane 0 of ymm0) ... */
886	valignq	$3,%ymm22,%ymm23,%ymm23
887	valignq	$3,%ymm21,%ymm22,%ymm22
888	valignq	$3,%ymm20,%ymm21,%ymm21
889	valignq	$3,%ymm19,%ymm20,%ymm20
890	valignq	$3,%ymm2,%ymm19,%ymm19
891	valignq	$3,%ymm1,%ymm2,%ymm2
892	valignq	$3,%ymm0,%ymm1,%ymm1
893	valignq	$3,.Lzeros(%rip),%ymm0,%ymm0
894
895
/* ... reduce every lane to its low 52 bits ... */
896	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
897	vpandq	.Lmask52x4(%rip),%ymm4,%ymm4
898	vpandq	.Lmask52x4(%rip),%ymm5,%ymm5
899	vpandq	.Lmask52x4(%rip),%ymm6,%ymm6
900	vpandq	.Lmask52x4(%rip),%ymm7,%ymm7
901	vpandq	.Lmask52x4(%rip),%ymm8,%ymm8
902	vpandq	.Lmask52x4(%rip),%ymm9,%ymm9
903	vpandq	.Lmask52x4(%rip),%ymm10,%ymm10
904
905
/* ... and add the shifted carries. */
906	vpaddq	%ymm0,%ymm3,%ymm3
907	vpaddq	%ymm1,%ymm4,%ymm4
908	vpaddq	%ymm2,%ymm5,%ymm5
909	vpaddq	%ymm19,%ymm6,%ymm6
910	vpaddq	%ymm20,%ymm7,%ymm7
911	vpaddq	%ymm21,%ymm8,%ymm8
912	vpaddq	%ymm22,%ymm9,%ymm9
913	vpaddq	%ymm23,%ymm10,%ymm10
914
915
916
/* Predicate 6 (unsigned greater-than): lanes above 2^52-1, packed as a 32-bit map in r14b, r13b, r12b, r11b. */
917	vpcmpuq	$6,.Lmask52x4(%rip),%ymm3,%k1
918	vpcmpuq	$6,.Lmask52x4(%rip),%ymm4,%k2
919	kmovb	%k1,%r14d
920	kmovb	%k2,%r13d
921	shlb	$4,%r13b
922	orb	%r13b,%r14b
923
924	vpcmpuq	$6,.Lmask52x4(%rip),%ymm5,%k1
925	vpcmpuq	$6,.Lmask52x4(%rip),%ymm6,%k2
926	kmovb	%k1,%r13d
927	kmovb	%k2,%r12d
928	shlb	$4,%r12b
929	orb	%r12b,%r13b
930
931	vpcmpuq	$6,.Lmask52x4(%rip),%ymm7,%k1
932	vpcmpuq	$6,.Lmask52x4(%rip),%ymm8,%k2
933	kmovb	%k1,%r12d
934	kmovb	%k2,%r11d
935	shlb	$4,%r11b
936	orb	%r11b,%r12b
937
938	vpcmpuq	$6,.Lmask52x4(%rip),%ymm9,%k1
939	vpcmpuq	$6,.Lmask52x4(%rip),%ymm10,%k2
940	kmovb	%k1,%r11d
941	kmovb	%k2,%r10d
942	shlb	$4,%r10b
943	orb	%r10b,%r11b
944
/* Shift the 32-bit lane map left by one bit through the add/adc chain. */
945	addb	%r14b,%r14b
946	adcb	%r13b,%r13b
947	adcb	%r12b,%r12b
948	adcb	%r11b,%r11b
949
950
/* Predicate 0 (equal): lanes equal to 2^52-1, packed into r9b, r8b, dl, cl. */
951	vpcmpuq	$0,.Lmask52x4(%rip),%ymm3,%k1
952	vpcmpuq	$0,.Lmask52x4(%rip),%ymm4,%k2
953	kmovb	%k1,%r9d
954	kmovb	%k2,%r8d
955	shlb	$4,%r8b
956	orb	%r8b,%r9b
957
958	vpcmpuq	$0,.Lmask52x4(%rip),%ymm5,%k1
959	vpcmpuq	$0,.Lmask52x4(%rip),%ymm6,%k2
960	kmovb	%k1,%r8d
961	kmovb	%k2,%edx
962	shlb	$4,%dl
963	orb	%dl,%r8b
964
965	vpcmpuq	$0,.Lmask52x4(%rip),%ymm7,%k1
966	vpcmpuq	$0,.Lmask52x4(%rip),%ymm8,%k2
967	kmovb	%k1,%edx
968	kmovb	%k2,%ecx
969	shlb	$4,%cl
970	orb	%cl,%dl
971
972	vpcmpuq	$0,.Lmask52x4(%rip),%ymm9,%k1
973	vpcmpuq	$0,.Lmask52x4(%rip),%ymm10,%k2
974	kmovb	%k1,%ecx
975	kmovb	%k2,%ebx
976	shlb	$4,%bl
977	orb	%bl,%cl
978
/* Ripple the carry through runs of saturated lanes, then XOR to get the lanes to correct. */
979	addb	%r9b,%r14b
980	adcb	%r8b,%r13b
981	adcb	%dl,%r12b
982	adcb	%cl,%r11b
983
984	xorb	%r9b,%r14b
985	xorb	%r8b,%r13b
986	xorb	%dl,%r12b
987	xorb	%cl,%r11b
988
/* Hand each nibble to its own mask register. */
989	kmovb	%r14d,%k1
990	shrb	$4,%r14b
991	kmovb	%r14d,%k2
992	kmovb	%r13d,%k3
993	shrb	$4,%r13b
994	kmovb	%r13d,%k4
995	kmovb	%r12d,%k5
996	shrb	$4,%r12b
997	kmovb	%r12d,%k6
998	kmovb	%r11d,%k7
999
/* Subtract 2^52-1 in the selected lanes, then mask every lane back to 52 bits. */
1000	vpsubq	.Lmask52x4(%rip),%ymm3,%ymm3{%k1}
1001	vpsubq	.Lmask52x4(%rip),%ymm4,%ymm4{%k2}
1002	vpsubq	.Lmask52x4(%rip),%ymm5,%ymm5{%k3}
1003	vpsubq	.Lmask52x4(%rip),%ymm6,%ymm6{%k4}
1004	vpsubq	.Lmask52x4(%rip),%ymm7,%ymm7{%k5}
1005	vpsubq	.Lmask52x4(%rip),%ymm8,%ymm8{%k6}
1006	vpsubq	.Lmask52x4(%rip),%ymm9,%ymm9{%k7}
1007
1008	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
1009	vpandq	.Lmask52x4(%rip),%ymm4,%ymm4
1010	vpandq	.Lmask52x4(%rip),%ymm5,%ymm5
1011	vpandq	.Lmask52x4(%rip),%ymm6,%ymm6
1012	vpandq	.Lmask52x4(%rip),%ymm7,%ymm7
1013	vpandq	.Lmask52x4(%rip),%ymm8,%ymm8
1014	vpandq	.Lmask52x4(%rip),%ymm9,%ymm9
1015
/* The eighth vector reuses k1 once the first seven are done. */
1016	shrb	$4,%r11b
1017	kmovb	%r11d,%k1
1018
1019	vpsubq	.Lmask52x4(%rip),%ymm10,%ymm10{%k1}
1020
1021	vpandq	.Lmask52x4(%rip),%ymm10,%ymm10
1022
/* ==== normalise set 2 (ymm11..ymm18, scalar r15); same sequence as for set 1 ==== */
1023	vpbroadcastq	%r15,%ymm0
1024	vpblendd	$3,%ymm0,%ymm11,%ymm11
1025
1026
1027
1028	vpsrlq	$52,%ymm11,%ymm0
1029	vpsrlq	$52,%ymm12,%ymm1
1030	vpsrlq	$52,%ymm13,%ymm2
1031	vpsrlq	$52,%ymm14,%ymm19
1032	vpsrlq	$52,%ymm15,%ymm20
1033	vpsrlq	$52,%ymm16,%ymm21
1034	vpsrlq	$52,%ymm17,%ymm22
1035	vpsrlq	$52,%ymm18,%ymm23
1036
1037
1038	valignq	$3,%ymm22,%ymm23,%ymm23
1039	valignq	$3,%ymm21,%ymm22,%ymm22
1040	valignq	$3,%ymm20,%ymm21,%ymm21
1041	valignq	$3,%ymm19,%ymm20,%ymm20
1042	valignq	$3,%ymm2,%ymm19,%ymm19
1043	valignq	$3,%ymm1,%ymm2,%ymm2
1044	valignq	$3,%ymm0,%ymm1,%ymm1
1045	valignq	$3,.Lzeros(%rip),%ymm0,%ymm0
1046
1047
1048	vpandq	.Lmask52x4(%rip),%ymm11,%ymm11
1049	vpandq	.Lmask52x4(%rip),%ymm12,%ymm12
1050	vpandq	.Lmask52x4(%rip),%ymm13,%ymm13
1051	vpandq	.Lmask52x4(%rip),%ymm14,%ymm14
1052	vpandq	.Lmask52x4(%rip),%ymm15,%ymm15
1053	vpandq	.Lmask52x4(%rip),%ymm16,%ymm16
1054	vpandq	.Lmask52x4(%rip),%ymm17,%ymm17
1055	vpandq	.Lmask52x4(%rip),%ymm18,%ymm18
1056
1057
1058	vpaddq	%ymm0,%ymm11,%ymm11
1059	vpaddq	%ymm1,%ymm12,%ymm12
1060	vpaddq	%ymm2,%ymm13,%ymm13
1061	vpaddq	%ymm19,%ymm14,%ymm14
1062	vpaddq	%ymm20,%ymm15,%ymm15
1063	vpaddq	%ymm21,%ymm16,%ymm16
1064	vpaddq	%ymm22,%ymm17,%ymm17
1065	vpaddq	%ymm23,%ymm18,%ymm18
1066
1067
1068
1069	vpcmpuq	$6,.Lmask52x4(%rip),%ymm11,%k1
1070	vpcmpuq	$6,.Lmask52x4(%rip),%ymm12,%k2
1071	kmovb	%k1,%r14d
1072	kmovb	%k2,%r13d
1073	shlb	$4,%r13b
1074	orb	%r13b,%r14b
1075
1076	vpcmpuq	$6,.Lmask52x4(%rip),%ymm13,%k1
1077	vpcmpuq	$6,.Lmask52x4(%rip),%ymm14,%k2
1078	kmovb	%k1,%r13d
1079	kmovb	%k2,%r12d
1080	shlb	$4,%r12b
1081	orb	%r12b,%r13b
1082
1083	vpcmpuq	$6,.Lmask52x4(%rip),%ymm15,%k1
1084	vpcmpuq	$6,.Lmask52x4(%rip),%ymm16,%k2
1085	kmovb	%k1,%r12d
1086	kmovb	%k2,%r11d
1087	shlb	$4,%r11b
1088	orb	%r11b,%r12b
1089
1090	vpcmpuq	$6,.Lmask52x4(%rip),%ymm17,%k1
1091	vpcmpuq	$6,.Lmask52x4(%rip),%ymm18,%k2
1092	kmovb	%k1,%r11d
1093	kmovb	%k2,%r10d
1094	shlb	$4,%r10b
1095	orb	%r10b,%r11b
1096
1097	addb	%r14b,%r14b
1098	adcb	%r13b,%r13b
1099	adcb	%r12b,%r12b
1100	adcb	%r11b,%r11b
1101
1102
1103	vpcmpuq	$0,.Lmask52x4(%rip),%ymm11,%k1
1104	vpcmpuq	$0,.Lmask52x4(%rip),%ymm12,%k2
1105	kmovb	%k1,%r9d
1106	kmovb	%k2,%r8d
1107	shlb	$4,%r8b
1108	orb	%r8b,%r9b
1109
1110	vpcmpuq	$0,.Lmask52x4(%rip),%ymm13,%k1
1111	vpcmpuq	$0,.Lmask52x4(%rip),%ymm14,%k2
1112	kmovb	%k1,%r8d
1113	kmovb	%k2,%edx
1114	shlb	$4,%dl
1115	orb	%dl,%r8b
1116
1117	vpcmpuq	$0,.Lmask52x4(%rip),%ymm15,%k1
1118	vpcmpuq	$0,.Lmask52x4(%rip),%ymm16,%k2
1119	kmovb	%k1,%edx
1120	kmovb	%k2,%ecx
1121	shlb	$4,%cl
1122	orb	%cl,%dl
1123
1124	vpcmpuq	$0,.Lmask52x4(%rip),%ymm17,%k1
1125	vpcmpuq	$0,.Lmask52x4(%rip),%ymm18,%k2
1126	kmovb	%k1,%ecx
1127	kmovb	%k2,%ebx
1128	shlb	$4,%bl
1129	orb	%bl,%cl
1130
1131	addb	%r9b,%r14b
1132	adcb	%r8b,%r13b
1133	adcb	%dl,%r12b
1134	adcb	%cl,%r11b
1135
1136	xorb	%r9b,%r14b
1137	xorb	%r8b,%r13b
1138	xorb	%dl,%r12b
1139	xorb	%cl,%r11b
1140
1141	kmovb	%r14d,%k1
1142	shrb	$4,%r14b
1143	kmovb	%r14d,%k2
1144	kmovb	%r13d,%k3
1145	shrb	$4,%r13b
1146	kmovb	%r13d,%k4
1147	kmovb	%r12d,%k5
1148	shrb	$4,%r12b
1149	kmovb	%r12d,%k6
1150	kmovb	%r11d,%k7
1151
1152	vpsubq	.Lmask52x4(%rip),%ymm11,%ymm11{%k1}
1153	vpsubq	.Lmask52x4(%rip),%ymm12,%ymm12{%k2}
1154	vpsubq	.Lmask52x4(%rip),%ymm13,%ymm13{%k3}
1155	vpsubq	.Lmask52x4(%rip),%ymm14,%ymm14{%k4}
1156	vpsubq	.Lmask52x4(%rip),%ymm15,%ymm15{%k5}
1157	vpsubq	.Lmask52x4(%rip),%ymm16,%ymm16{%k6}
1158	vpsubq	.Lmask52x4(%rip),%ymm17,%ymm17{%k7}
1159
1160	vpandq	.Lmask52x4(%rip),%ymm11,%ymm11
1161	vpandq	.Lmask52x4(%rip),%ymm12,%ymm12
1162	vpandq	.Lmask52x4(%rip),%ymm13,%ymm13
1163	vpandq	.Lmask52x4(%rip),%ymm14,%ymm14
1164	vpandq	.Lmask52x4(%rip),%ymm15,%ymm15
1165	vpandq	.Lmask52x4(%rip),%ymm16,%ymm16
1166	vpandq	.Lmask52x4(%rip),%ymm17,%ymm17
1167
1168	shrb	$4,%r11b
1169	kmovb	%r11d,%k1
1170
1171	vpsubq	.Lmask52x4(%rip),%ymm18,%ymm18{%k1}
1172
1173	vpandq	.Lmask52x4(%rip),%ymm18,%ymm18
1174
/* Store set 1 to bytes 0..255 of the buffer at rdi ... */
1175	vmovdqu64	%ymm3,0(%rdi)
1176	vmovdqu64	%ymm4,32(%rdi)
1177	vmovdqu64	%ymm5,64(%rdi)
1178	vmovdqu64	%ymm6,96(%rdi)
1179	vmovdqu64	%ymm7,128(%rdi)
1180	vmovdqu64	%ymm8,160(%rdi)
1181	vmovdqu64	%ymm9,192(%rdi)
1182	vmovdqu64	%ymm10,224(%rdi)
1183
/* ... and set 2 to bytes 256..511. */
1184	vmovdqu64	%ymm11,256(%rdi)
1185	vmovdqu64	%ymm12,288(%rdi)
1186	vmovdqu64	%ymm13,320(%rdi)
1187	vmovdqu64	%ymm14,352(%rdi)
1188	vmovdqu64	%ymm15,384(%rdi)
1189	vmovdqu64	%ymm16,416(%rdi)
1190	vmovdqu64	%ymm17,448(%rdi)
1191	vmovdqu64	%ymm18,480(%rdi)
1192
/* Epilogue: reload the six saved registers (reverse push order) and release their 48 bytes. */
1193	vzeroupper
1194	leaq	(%rsp),%rax
1195.cfi_def_cfa_register	%rax
1196	movq	0(%rax),%r15
1197.cfi_restore	%r15
1198	movq	8(%rax),%r14
1199.cfi_restore	%r14
1200	movq	16(%rax),%r13
1201.cfi_restore	%r13
1202	movq	24(%rax),%r12
1203.cfi_restore	%r12
1204	movq	32(%rax),%rbp
1205.cfi_restore	%rbp
1206	movq	40(%rax),%rbx
1207.cfi_restore	%rbx
1208	leaq	48(%rax),%rsp
1209.cfi_def_cfa	%rsp,8
1210.Lossl_rsaz_amm52x30_x2_ifma256_epilogue:
/* Bytes F3 C3 encode "rep ret". */
1211	.byte	0xf3,0xc3
1212.cfi_endproc
1213.size	ossl_rsaz_amm52x30_x2_ifma256, .-ossl_rsaz_amm52x30_x2_ifma256
/*
 * ossl_extract_multiplier_2x30_win5
 *
 * Copies one 512-byte record out of a table by scanning the whole table
 * and selecting with mask registers, not by indexing.
 *
 * Register use as seen in the code (AT&T syntax):
 *   rdi - destination, 512 bytes are stored
 *   rsi - table base; 32 entries of 512 bytes are read (end = rsi + 16384)
 *   rdx - entry index whose first 256 bytes are selected
 *   rcx - entry index whose second 256 bytes are selected
 *
 * Every entry is loaded no matter what the indices are, and there is no
 * index-dependent branch or address (presumably so the access pattern does
 * not reveal the indices - TODO confirm intent). If an index matches no
 * entry, the corresponding output half stays zero.
 */
1214.text
1215
1216.align	32
1217.globl	ossl_extract_multiplier_2x30_win5
1218.type	ossl_extract_multiplier_2x30_win5,@function
1219ossl_extract_multiplier_2x30_win5:
1220.cfi_startproc
/* Bytes F3 0F 1E FA encode the endbr64 instruction. */
1221.byte	243,15,30,250
/* ymm30 = all-ones step, ymm28 / ymm29 = the two wanted indices broadcast, rax = end of table. */
1222	vmovdqa64	.Lones(%rip),%ymm30
1223	vpbroadcastq	%rdx,%ymm28
1224	vpbroadcastq	%rcx,%ymm29
1225	leaq	16384(%rsi),%rax
1226
1227
/* Zero the 16 result vectors and the running entry counter ymm27. */
1228	vpxor	%xmm0,%xmm0,%xmm0
1229	vmovdqa64	%ymm0,%ymm27
1230	vmovdqa64	%ymm0,%ymm1
1231	vmovdqa64	%ymm0,%ymm2
1232	vmovdqa64	%ymm0,%ymm3
1233	vmovdqa64	%ymm0,%ymm4
1234	vmovdqa64	%ymm0,%ymm5
1235	vmovdqa64	%ymm0,%ymm16
1236	vmovdqa64	%ymm0,%ymm17
1237	vmovdqa64	%ymm0,%ymm18
1238	vmovdqa64	%ymm0,%ymm19
1239	vmovdqa64	%ymm0,%ymm20
1240	vmovdqa64	%ymm0,%ymm21
1241	vmovdqa64	%ymm0,%ymm22
1242	vmovdqa64	%ymm0,%ymm23
1243	vmovdqa64	%ymm0,%ymm24
1244	vmovdqa64	%ymm0,%ymm25
1245
1246.align	32
1247.Lloop:
/* k1 / k2 are all-set when the counter equals the first / second index (predicate 0 = equal). */
1248	vpcmpq	$0,%ymm27,%ymm28,%k1
1249	vpcmpq	$0,%ymm27,%ymm29,%k2
/* First 256 bytes of the entry: take the loaded data where k1 is set, else keep the result register. */
1250	vmovdqu64	0(%rsi),%ymm26
1251	vpblendmq	%ymm26,%ymm0,%ymm0{%k1}
1252	vmovdqu64	32(%rsi),%ymm26
1253	vpblendmq	%ymm26,%ymm1,%ymm1{%k1}
1254	vmovdqu64	64(%rsi),%ymm26
1255	vpblendmq	%ymm26,%ymm2,%ymm2{%k1}
1256	vmovdqu64	96(%rsi),%ymm26
1257	vpblendmq	%ymm26,%ymm3,%ymm3{%k1}
1258	vmovdqu64	128(%rsi),%ymm26
1259	vpblendmq	%ymm26,%ymm4,%ymm4{%k1}
1260	vmovdqu64	160(%rsi),%ymm26
1261	vpblendmq	%ymm26,%ymm5,%ymm5{%k1}
1262	vmovdqu64	192(%rsi),%ymm26
1263	vpblendmq	%ymm26,%ymm16,%ymm16{%k1}
1264	vmovdqu64	224(%rsi),%ymm26
1265	vpblendmq	%ymm26,%ymm17,%ymm17{%k1}
/* Second 256 bytes of the entry: same selection under k2. */
1266	vmovdqu64	256(%rsi),%ymm26
1267	vpblendmq	%ymm26,%ymm18,%ymm18{%k2}
1268	vmovdqu64	288(%rsi),%ymm26
1269	vpblendmq	%ymm26,%ymm19,%ymm19{%k2}
1270	vmovdqu64	320(%rsi),%ymm26
1271	vpblendmq	%ymm26,%ymm20,%ymm20{%k2}
1272	vmovdqu64	352(%rsi),%ymm26
1273	vpblendmq	%ymm26,%ymm21,%ymm21{%k2}
1274	vmovdqu64	384(%rsi),%ymm26
1275	vpblendmq	%ymm26,%ymm22,%ymm22{%k2}
1276	vmovdqu64	416(%rsi),%ymm26
1277	vpblendmq	%ymm26,%ymm23,%ymm23{%k2}
1278	vmovdqu64	448(%rsi),%ymm26
1279	vpblendmq	%ymm26,%ymm24,%ymm24{%k2}
1280	vmovdqu64	480(%rsi),%ymm26
1281	vpblendmq	%ymm26,%ymm25,%ymm25{%k2}
/* Counter += 1, advance to the next 512-byte entry, stop at the end pointer. */
1282	vpaddq	%ymm30,%ymm27,%ymm27
1283	addq	$512,%rsi
1284	cmpq	%rsi,%rax
1285	jne	.Lloop
/* Write the selected 512 bytes to the buffer at rdi. */
1286	vmovdqu64	%ymm0,0(%rdi)
1287	vmovdqu64	%ymm1,32(%rdi)
1288	vmovdqu64	%ymm2,64(%rdi)
1289	vmovdqu64	%ymm3,96(%rdi)
1290	vmovdqu64	%ymm4,128(%rdi)
1291	vmovdqu64	%ymm5,160(%rdi)
1292	vmovdqu64	%ymm16,192(%rdi)
1293	vmovdqu64	%ymm17,224(%rdi)
1294	vmovdqu64	%ymm18,256(%rdi)
1295	vmovdqu64	%ymm19,288(%rdi)
1296	vmovdqu64	%ymm20,320(%rdi)
1297	vmovdqu64	%ymm21,352(%rdi)
1298	vmovdqu64	%ymm22,384(%rdi)
1299	vmovdqu64	%ymm23,416(%rdi)
1300	vmovdqu64	%ymm24,448(%rdi)
1301	vmovdqu64	%ymm25,480(%rdi)
1302
/* Bytes F3 C3 encode "rep ret". */
1303	.byte	0xf3,0xc3
1304.cfi_endproc
1305.size	ossl_extract_multiplier_2x30_win5, .-ossl_extract_multiplier_2x30_win5
/*
 * Read-only vector constants, 32-byte aligned:
 *   .Lones  - four qwords of 1; the per-entry counter increment in
 *             ossl_extract_multiplier_2x30_win5 (loaded with an aligned move)
 *   .Lzeros - four qwords of 0; the zero source for the valignq lane
 *             shifts in the two rsaz_amm52x30 routines
 */
1306.section	.rodata
1307.align	32
1308.Lones:
1309.quad	1,1,1,1
1310.Lzeros:
1311.quad	0,0,0,0
/*
 * ELF note in section .note.gnu.property.
 * Layout: name size, descriptor size, note type, name ("GNU" plus NUL),
 * then a descriptor holding one property record (type, data size, data),
 * each part padded to 8 bytes by .p2align 3.
 * NOTE(review): note type 5, property type 0xc0000002 and value 3 are
 * believed to be NT_GNU_PROPERTY_TYPE_0, GNU_PROPERTY_X86_FEATURE_1_AND
 * and the IBT|SHSTK bits - confirm against the x86-64 psABI.
 */
1312	.section ".note.gnu.property", "a"
1313	.p2align 3
/* Name size = distance between labels 0 and 1 (4 bytes). */
1314	.long 1f - 0f
/* Descriptor size = distance between labels 1 and 4. */
1315	.long 4f - 1f
/* Note type. */
1316	.long 5
13170:
1318	# "GNU" encoded with .byte, since .asciz isn't supported
1319	# on Solaris.
1320	.byte 0x47
1321	.byte 0x4e
1322	.byte 0x55
1323	.byte 0
13241:
1325	.p2align 3
/* Property type, then the size of its data (labels 2 to 3, 4 bytes). */
1326	.long 0xc0000002
1327	.long 3f - 2f
13282:
/* Property data. */
1329	.long 3
13303:
1331	.p2align 3
13324:
1333