xref: /freebsd/sys/crypto/openssl/amd64/rsaz-4k-avx512.S (revision 4757b351ea9d59d71d4a38b82506d2d16fcd560d)
1/* Do not modify. This file is auto-generated from rsaz-4k-avx512.pl. */
2.text
3
/*
 * ossl_rsaz_amm52x40_x1_ifma256
 *
 * Multiply-accumulate routine over 40 qword "digits" (10 ymm registers of
 * 4 lanes each), using the 52-bit IFMA instructions vpmadd52luq/vpmadd52huq.
 * NOTE(review): the name and structure suggest an "almost Montgomery
 * multiplication" in radix 2^52 -- confirm against the generator script.
 *
 * Register use as visible in the code (System V argument registers):
 *   %rdi  destination; receives 10 x 32-byte stores (offsets 0..288)
 *   %rsi  first operand; read as 10 x 32-byte vectors, qword 0 also as scalar
 *   %rdx  second operand; copied to %r11 and consumed one qword per step
 *   %rcx  third operand (presumably the modulus -- confirm); 10 vectors
 *   %r8   scalar factor used to derive the per-step multiplier for %rcx
 *         (presumably the Montgomery constant k0 -- confirm)
 * Internal state:
 *   %ymm3..%ymm12  40-lane accumulator, %ymm0 zero during the loop
 *   %r9            scalar copy of the lowest accumulator digit
 *   %rax           constant 2^52-1 during the loop
 *   %ebx           loop counter
 * After the loop %rax, %rbx, %rcx, %rdx and %r8..%r14 are reused as scratch.
 */
4.globl	ossl_rsaz_amm52x40_x1_ifma256
5.type	ossl_rsaz_amm52x40_x1_ifma256,@function
6.align	32
7ossl_rsaz_amm52x40_x1_ifma256:
8.cfi_startproc
/* Bytes F3 0F 1E FA encode endbr64. */
9.byte	243,15,30,250
/* Save the callee-saved registers; the epilogue reloads them from the stack. */
10	pushq	%rbx
11.cfi_adjust_cfa_offset	8
12.cfi_offset	%rbx,-16
13	pushq	%rbp
14.cfi_adjust_cfa_offset	8
15.cfi_offset	%rbp,-24
16	pushq	%r12
17.cfi_adjust_cfa_offset	8
18.cfi_offset	%r12,-32
19	pushq	%r13
20.cfi_adjust_cfa_offset	8
21.cfi_offset	%r13,-40
22	pushq	%r14
23.cfi_adjust_cfa_offset	8
24.cfi_offset	%r14,-48
25	pushq	%r15
26.cfi_adjust_cfa_offset	8
27.cfi_offset	%r15,-56
28
/* Zero %ymm0 and the ten accumulator registers %ymm3..%ymm12. */
29	vpxord	%ymm0,%ymm0,%ymm0
30	vmovdqa64	%ymm0,%ymm3
31	vmovdqa64	%ymm0,%ymm4
32	vmovdqa64	%ymm0,%ymm5
33	vmovdqa64	%ymm0,%ymm6
34	vmovdqa64	%ymm0,%ymm7
35	vmovdqa64	%ymm0,%ymm8
36	vmovdqa64	%ymm0,%ymm9
37	vmovdqa64	%ymm0,%ymm10
38	vmovdqa64	%ymm0,%ymm11
39	vmovdqa64	%ymm0,%ymm12
40
/* Scalar low-digit accumulator starts at zero. */
41	xorl	%r9d,%r9d
42
/* %rdx is needed as the implicit mulx operand, so keep the pointer in %r11. */
43	movq	%rdx,%r11
/* %rax = 2^52-1, the 52-bit digit mask. */
44	movq	$0xfffffffffffff,%rax
45
46
/* 10 iterations, each unrolled into 4 steps (offsets 0/8/16/24 from %r11). */
47	movl	$10,%ebx
48
49.align	32
50.Lloop10:
/* ---- step 1 of 4: qword at 0(%r11) ---- */
51	movq	0(%r11),%r13
52
/* %r12:%r13 = qword0(%rsi) * current qword; add it into %r10:%r9. */
53	vpbroadcastq	%r13,%ymm1
54	movq	0(%rsi),%rdx
55	mulxq	%r13,%r13,%r12
56	addq	%r13,%r9
57	movq	%r12,%r10
58	adcq	$0,%r10
59
/* Second multiplier: (%r8 * %r9) truncated to 52 bits. */
60	movq	%r8,%r13
61	imulq	%r9,%r13
62	andq	%rax,%r13
63
/* Add qword0(%rcx) * second multiplier into %r10:%r9. */
64	vpbroadcastq	%r13,%ymm2
65	movq	0(%rcx),%rdx
66	mulxq	%r13,%r13,%r12
67	addq	%r13,%r9
68	adcq	%r12,%r10
69
/* %r9 = low 64 bits of (%r10:%r9 >> 52). */
70	shrq	$52,%r9
71	salq	$12,%r10
72	orq	%r10,%r9
73
/* Accumulate the low 52 bits of (%rsi) lanes times %ymm1. */
74	vpmadd52luq	0(%rsi),%ymm1,%ymm3
75	vpmadd52luq	32(%rsi),%ymm1,%ymm4
76	vpmadd52luq	64(%rsi),%ymm1,%ymm5
77	vpmadd52luq	96(%rsi),%ymm1,%ymm6
78	vpmadd52luq	128(%rsi),%ymm1,%ymm7
79	vpmadd52luq	160(%rsi),%ymm1,%ymm8
80	vpmadd52luq	192(%rsi),%ymm1,%ymm9
81	vpmadd52luq	224(%rsi),%ymm1,%ymm10
82	vpmadd52luq	256(%rsi),%ymm1,%ymm11
83	vpmadd52luq	288(%rsi),%ymm1,%ymm12
84
/* Accumulate the low 52 bits of (%rcx) lanes times %ymm2. */
85	vpmadd52luq	0(%rcx),%ymm2,%ymm3
86	vpmadd52luq	32(%rcx),%ymm2,%ymm4
87	vpmadd52luq	64(%rcx),%ymm2,%ymm5
88	vpmadd52luq	96(%rcx),%ymm2,%ymm6
89	vpmadd52luq	128(%rcx),%ymm2,%ymm7
90	vpmadd52luq	160(%rcx),%ymm2,%ymm8
91	vpmadd52luq	192(%rcx),%ymm2,%ymm9
92	vpmadd52luq	224(%rcx),%ymm2,%ymm10
93	vpmadd52luq	256(%rcx),%ymm2,%ymm11
94	vpmadd52luq	288(%rcx),%ymm2,%ymm12
95
96
/* Shift the 40-lane accumulator down by one lane; zero (%ymm0) enters at the top. */
97	valignq	$1,%ymm3,%ymm4,%ymm3
98	valignq	$1,%ymm4,%ymm5,%ymm4
99	valignq	$1,%ymm5,%ymm6,%ymm5
100	valignq	$1,%ymm6,%ymm7,%ymm6
101	valignq	$1,%ymm7,%ymm8,%ymm7
102	valignq	$1,%ymm8,%ymm9,%ymm8
103	valignq	$1,%ymm9,%ymm10,%ymm9
104	valignq	$1,%ymm10,%ymm11,%ymm10
105	valignq	$1,%ymm11,%ymm12,%ymm11
106	valignq	$1,%ymm12,%ymm0,%ymm12
107
/* Fold the new lowest lane into the scalar accumulator. */
108	vmovq	%xmm3,%r13
109	addq	%r13,%r9
110
/* Accumulate the high 52 bits of the same products into the shifted lanes. */
111	vpmadd52huq	0(%rsi),%ymm1,%ymm3
112	vpmadd52huq	32(%rsi),%ymm1,%ymm4
113	vpmadd52huq	64(%rsi),%ymm1,%ymm5
114	vpmadd52huq	96(%rsi),%ymm1,%ymm6
115	vpmadd52huq	128(%rsi),%ymm1,%ymm7
116	vpmadd52huq	160(%rsi),%ymm1,%ymm8
117	vpmadd52huq	192(%rsi),%ymm1,%ymm9
118	vpmadd52huq	224(%rsi),%ymm1,%ymm10
119	vpmadd52huq	256(%rsi),%ymm1,%ymm11
120	vpmadd52huq	288(%rsi),%ymm1,%ymm12
121
122	vpmadd52huq	0(%rcx),%ymm2,%ymm3
123	vpmadd52huq	32(%rcx),%ymm2,%ymm4
124	vpmadd52huq	64(%rcx),%ymm2,%ymm5
125	vpmadd52huq	96(%rcx),%ymm2,%ymm6
126	vpmadd52huq	128(%rcx),%ymm2,%ymm7
127	vpmadd52huq	160(%rcx),%ymm2,%ymm8
128	vpmadd52huq	192(%rcx),%ymm2,%ymm9
129	vpmadd52huq	224(%rcx),%ymm2,%ymm10
130	vpmadd52huq	256(%rcx),%ymm2,%ymm11
131	vpmadd52huq	288(%rcx),%ymm2,%ymm12
/* ---- step 2 of 4: qword at 8(%r11); same sequence as step 1 ---- */
132	movq	8(%r11),%r13
133
134	vpbroadcastq	%r13,%ymm1
135	movq	0(%rsi),%rdx
136	mulxq	%r13,%r13,%r12
137	addq	%r13,%r9
138	movq	%r12,%r10
139	adcq	$0,%r10
140
141	movq	%r8,%r13
142	imulq	%r9,%r13
143	andq	%rax,%r13
144
145	vpbroadcastq	%r13,%ymm2
146	movq	0(%rcx),%rdx
147	mulxq	%r13,%r13,%r12
148	addq	%r13,%r9
149	adcq	%r12,%r10
150
151	shrq	$52,%r9
152	salq	$12,%r10
153	orq	%r10,%r9
154
155	vpmadd52luq	0(%rsi),%ymm1,%ymm3
156	vpmadd52luq	32(%rsi),%ymm1,%ymm4
157	vpmadd52luq	64(%rsi),%ymm1,%ymm5
158	vpmadd52luq	96(%rsi),%ymm1,%ymm6
159	vpmadd52luq	128(%rsi),%ymm1,%ymm7
160	vpmadd52luq	160(%rsi),%ymm1,%ymm8
161	vpmadd52luq	192(%rsi),%ymm1,%ymm9
162	vpmadd52luq	224(%rsi),%ymm1,%ymm10
163	vpmadd52luq	256(%rsi),%ymm1,%ymm11
164	vpmadd52luq	288(%rsi),%ymm1,%ymm12
165
166	vpmadd52luq	0(%rcx),%ymm2,%ymm3
167	vpmadd52luq	32(%rcx),%ymm2,%ymm4
168	vpmadd52luq	64(%rcx),%ymm2,%ymm5
169	vpmadd52luq	96(%rcx),%ymm2,%ymm6
170	vpmadd52luq	128(%rcx),%ymm2,%ymm7
171	vpmadd52luq	160(%rcx),%ymm2,%ymm8
172	vpmadd52luq	192(%rcx),%ymm2,%ymm9
173	vpmadd52luq	224(%rcx),%ymm2,%ymm10
174	vpmadd52luq	256(%rcx),%ymm2,%ymm11
175	vpmadd52luq	288(%rcx),%ymm2,%ymm12
176
177
178	valignq	$1,%ymm3,%ymm4,%ymm3
179	valignq	$1,%ymm4,%ymm5,%ymm4
180	valignq	$1,%ymm5,%ymm6,%ymm5
181	valignq	$1,%ymm6,%ymm7,%ymm6
182	valignq	$1,%ymm7,%ymm8,%ymm7
183	valignq	$1,%ymm8,%ymm9,%ymm8
184	valignq	$1,%ymm9,%ymm10,%ymm9
185	valignq	$1,%ymm10,%ymm11,%ymm10
186	valignq	$1,%ymm11,%ymm12,%ymm11
187	valignq	$1,%ymm12,%ymm0,%ymm12
188
189	vmovq	%xmm3,%r13
190	addq	%r13,%r9
191
192	vpmadd52huq	0(%rsi),%ymm1,%ymm3
193	vpmadd52huq	32(%rsi),%ymm1,%ymm4
194	vpmadd52huq	64(%rsi),%ymm1,%ymm5
195	vpmadd52huq	96(%rsi),%ymm1,%ymm6
196	vpmadd52huq	128(%rsi),%ymm1,%ymm7
197	vpmadd52huq	160(%rsi),%ymm1,%ymm8
198	vpmadd52huq	192(%rsi),%ymm1,%ymm9
199	vpmadd52huq	224(%rsi),%ymm1,%ymm10
200	vpmadd52huq	256(%rsi),%ymm1,%ymm11
201	vpmadd52huq	288(%rsi),%ymm1,%ymm12
202
203	vpmadd52huq	0(%rcx),%ymm2,%ymm3
204	vpmadd52huq	32(%rcx),%ymm2,%ymm4
205	vpmadd52huq	64(%rcx),%ymm2,%ymm5
206	vpmadd52huq	96(%rcx),%ymm2,%ymm6
207	vpmadd52huq	128(%rcx),%ymm2,%ymm7
208	vpmadd52huq	160(%rcx),%ymm2,%ymm8
209	vpmadd52huq	192(%rcx),%ymm2,%ymm9
210	vpmadd52huq	224(%rcx),%ymm2,%ymm10
211	vpmadd52huq	256(%rcx),%ymm2,%ymm11
212	vpmadd52huq	288(%rcx),%ymm2,%ymm12
/* ---- step 3 of 4: qword at 16(%r11); same sequence as step 1 ---- */
213	movq	16(%r11),%r13
214
215	vpbroadcastq	%r13,%ymm1
216	movq	0(%rsi),%rdx
217	mulxq	%r13,%r13,%r12
218	addq	%r13,%r9
219	movq	%r12,%r10
220	adcq	$0,%r10
221
222	movq	%r8,%r13
223	imulq	%r9,%r13
224	andq	%rax,%r13
225
226	vpbroadcastq	%r13,%ymm2
227	movq	0(%rcx),%rdx
228	mulxq	%r13,%r13,%r12
229	addq	%r13,%r9
230	adcq	%r12,%r10
231
232	shrq	$52,%r9
233	salq	$12,%r10
234	orq	%r10,%r9
235
236	vpmadd52luq	0(%rsi),%ymm1,%ymm3
237	vpmadd52luq	32(%rsi),%ymm1,%ymm4
238	vpmadd52luq	64(%rsi),%ymm1,%ymm5
239	vpmadd52luq	96(%rsi),%ymm1,%ymm6
240	vpmadd52luq	128(%rsi),%ymm1,%ymm7
241	vpmadd52luq	160(%rsi),%ymm1,%ymm8
242	vpmadd52luq	192(%rsi),%ymm1,%ymm9
243	vpmadd52luq	224(%rsi),%ymm1,%ymm10
244	vpmadd52luq	256(%rsi),%ymm1,%ymm11
245	vpmadd52luq	288(%rsi),%ymm1,%ymm12
246
247	vpmadd52luq	0(%rcx),%ymm2,%ymm3
248	vpmadd52luq	32(%rcx),%ymm2,%ymm4
249	vpmadd52luq	64(%rcx),%ymm2,%ymm5
250	vpmadd52luq	96(%rcx),%ymm2,%ymm6
251	vpmadd52luq	128(%rcx),%ymm2,%ymm7
252	vpmadd52luq	160(%rcx),%ymm2,%ymm8
253	vpmadd52luq	192(%rcx),%ymm2,%ymm9
254	vpmadd52luq	224(%rcx),%ymm2,%ymm10
255	vpmadd52luq	256(%rcx),%ymm2,%ymm11
256	vpmadd52luq	288(%rcx),%ymm2,%ymm12
257
258
259	valignq	$1,%ymm3,%ymm4,%ymm3
260	valignq	$1,%ymm4,%ymm5,%ymm4
261	valignq	$1,%ymm5,%ymm6,%ymm5
262	valignq	$1,%ymm6,%ymm7,%ymm6
263	valignq	$1,%ymm7,%ymm8,%ymm7
264	valignq	$1,%ymm8,%ymm9,%ymm8
265	valignq	$1,%ymm9,%ymm10,%ymm9
266	valignq	$1,%ymm10,%ymm11,%ymm10
267	valignq	$1,%ymm11,%ymm12,%ymm11
268	valignq	$1,%ymm12,%ymm0,%ymm12
269
270	vmovq	%xmm3,%r13
271	addq	%r13,%r9
272
273	vpmadd52huq	0(%rsi),%ymm1,%ymm3
274	vpmadd52huq	32(%rsi),%ymm1,%ymm4
275	vpmadd52huq	64(%rsi),%ymm1,%ymm5
276	vpmadd52huq	96(%rsi),%ymm1,%ymm6
277	vpmadd52huq	128(%rsi),%ymm1,%ymm7
278	vpmadd52huq	160(%rsi),%ymm1,%ymm8
279	vpmadd52huq	192(%rsi),%ymm1,%ymm9
280	vpmadd52huq	224(%rsi),%ymm1,%ymm10
281	vpmadd52huq	256(%rsi),%ymm1,%ymm11
282	vpmadd52huq	288(%rsi),%ymm1,%ymm12
283
284	vpmadd52huq	0(%rcx),%ymm2,%ymm3
285	vpmadd52huq	32(%rcx),%ymm2,%ymm4
286	vpmadd52huq	64(%rcx),%ymm2,%ymm5
287	vpmadd52huq	96(%rcx),%ymm2,%ymm6
288	vpmadd52huq	128(%rcx),%ymm2,%ymm7
289	vpmadd52huq	160(%rcx),%ymm2,%ymm8
290	vpmadd52huq	192(%rcx),%ymm2,%ymm9
291	vpmadd52huq	224(%rcx),%ymm2,%ymm10
292	vpmadd52huq	256(%rcx),%ymm2,%ymm11
293	vpmadd52huq	288(%rcx),%ymm2,%ymm12
/* ---- step 4 of 4: qword at 24(%r11); same sequence as step 1 ---- */
294	movq	24(%r11),%r13
295
296	vpbroadcastq	%r13,%ymm1
297	movq	0(%rsi),%rdx
298	mulxq	%r13,%r13,%r12
299	addq	%r13,%r9
300	movq	%r12,%r10
301	adcq	$0,%r10
302
303	movq	%r8,%r13
304	imulq	%r9,%r13
305	andq	%rax,%r13
306
307	vpbroadcastq	%r13,%ymm2
308	movq	0(%rcx),%rdx
309	mulxq	%r13,%r13,%r12
310	addq	%r13,%r9
311	adcq	%r12,%r10
312
313	shrq	$52,%r9
314	salq	$12,%r10
315	orq	%r10,%r9
316
317	vpmadd52luq	0(%rsi),%ymm1,%ymm3
318	vpmadd52luq	32(%rsi),%ymm1,%ymm4
319	vpmadd52luq	64(%rsi),%ymm1,%ymm5
320	vpmadd52luq	96(%rsi),%ymm1,%ymm6
321	vpmadd52luq	128(%rsi),%ymm1,%ymm7
322	vpmadd52luq	160(%rsi),%ymm1,%ymm8
323	vpmadd52luq	192(%rsi),%ymm1,%ymm9
324	vpmadd52luq	224(%rsi),%ymm1,%ymm10
325	vpmadd52luq	256(%rsi),%ymm1,%ymm11
326	vpmadd52luq	288(%rsi),%ymm1,%ymm12
327
328	vpmadd52luq	0(%rcx),%ymm2,%ymm3
329	vpmadd52luq	32(%rcx),%ymm2,%ymm4
330	vpmadd52luq	64(%rcx),%ymm2,%ymm5
331	vpmadd52luq	96(%rcx),%ymm2,%ymm6
332	vpmadd52luq	128(%rcx),%ymm2,%ymm7
333	vpmadd52luq	160(%rcx),%ymm2,%ymm8
334	vpmadd52luq	192(%rcx),%ymm2,%ymm9
335	vpmadd52luq	224(%rcx),%ymm2,%ymm10
336	vpmadd52luq	256(%rcx),%ymm2,%ymm11
337	vpmadd52luq	288(%rcx),%ymm2,%ymm12
338
339
340	valignq	$1,%ymm3,%ymm4,%ymm3
341	valignq	$1,%ymm4,%ymm5,%ymm4
342	valignq	$1,%ymm5,%ymm6,%ymm5
343	valignq	$1,%ymm6,%ymm7,%ymm6
344	valignq	$1,%ymm7,%ymm8,%ymm7
345	valignq	$1,%ymm8,%ymm9,%ymm8
346	valignq	$1,%ymm9,%ymm10,%ymm9
347	valignq	$1,%ymm10,%ymm11,%ymm10
348	valignq	$1,%ymm11,%ymm12,%ymm11
349	valignq	$1,%ymm12,%ymm0,%ymm12
350
351	vmovq	%xmm3,%r13
352	addq	%r13,%r9
353
354	vpmadd52huq	0(%rsi),%ymm1,%ymm3
355	vpmadd52huq	32(%rsi),%ymm1,%ymm4
356	vpmadd52huq	64(%rsi),%ymm1,%ymm5
357	vpmadd52huq	96(%rsi),%ymm1,%ymm6
358	vpmadd52huq	128(%rsi),%ymm1,%ymm7
359	vpmadd52huq	160(%rsi),%ymm1,%ymm8
360	vpmadd52huq	192(%rsi),%ymm1,%ymm9
361	vpmadd52huq	224(%rsi),%ymm1,%ymm10
362	vpmadd52huq	256(%rsi),%ymm1,%ymm11
363	vpmadd52huq	288(%rsi),%ymm1,%ymm12
364
365	vpmadd52huq	0(%rcx),%ymm2,%ymm3
366	vpmadd52huq	32(%rcx),%ymm2,%ymm4
367	vpmadd52huq	64(%rcx),%ymm2,%ymm5
368	vpmadd52huq	96(%rcx),%ymm2,%ymm6
369	vpmadd52huq	128(%rcx),%ymm2,%ymm7
370	vpmadd52huq	160(%rcx),%ymm2,%ymm8
371	vpmadd52huq	192(%rcx),%ymm2,%ymm9
372	vpmadd52huq	224(%rcx),%ymm2,%ymm10
373	vpmadd52huq	256(%rcx),%ymm2,%ymm11
374	vpmadd52huq	288(%rcx),%ymm2,%ymm12
/* Advance by the four qwords just consumed and loop until %ebx reaches zero. */
375	leaq	32(%r11),%r11
376	decl	%ebx
377	jne	.Lloop10
378
/* Replace lane 0 of %ymm3 with the scalar accumulator %r9. */
379	vpbroadcastq	%r9,%ymm0
380	vpblendd	$3,%ymm0,%ymm3,%ymm3
381
382
383
/* Extract the bits above bit 51 of every lane (the per-lane carries). */
384	vpsrlq	$52,%ymm3,%ymm0
385	vpsrlq	$52,%ymm4,%ymm1
386	vpsrlq	$52,%ymm5,%ymm2
387	vpsrlq	$52,%ymm6,%ymm23
388	vpsrlq	$52,%ymm7,%ymm24
389	vpsrlq	$52,%ymm8,%ymm25
390	vpsrlq	$52,%ymm9,%ymm26
391	vpsrlq	$52,%ymm10,%ymm27
392	vpsrlq	$52,%ymm11,%ymm28
393	vpsrlq	$52,%ymm12,%ymm29
394
395
/*
 * Move every carry up by one lane so it lines up with the next higher digit.
 * .Lzeros is defined outside this block; presumably a zero vector -- confirm.
 */
396	valignq	$3,%ymm28,%ymm29,%ymm29
397	valignq	$3,%ymm27,%ymm28,%ymm28
398	valignq	$3,%ymm26,%ymm27,%ymm27
399	valignq	$3,%ymm25,%ymm26,%ymm26
400	valignq	$3,%ymm24,%ymm25,%ymm25
401	valignq	$3,%ymm23,%ymm24,%ymm24
402	valignq	$3,%ymm2,%ymm23,%ymm23
403	valignq	$3,%ymm1,%ymm2,%ymm2
404	valignq	$3,%ymm0,%ymm1,%ymm1
405	valignq	$3,.Lzeros(%rip),%ymm0,%ymm0
406
407
/* Keep only the low 52 bits of every lane. */
408	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
409	vpandq	.Lmask52x4(%rip),%ymm4,%ymm4
410	vpandq	.Lmask52x4(%rip),%ymm5,%ymm5
411	vpandq	.Lmask52x4(%rip),%ymm6,%ymm6
412	vpandq	.Lmask52x4(%rip),%ymm7,%ymm7
413	vpandq	.Lmask52x4(%rip),%ymm8,%ymm8
414	vpandq	.Lmask52x4(%rip),%ymm9,%ymm9
415	vpandq	.Lmask52x4(%rip),%ymm10,%ymm10
416	vpandq	.Lmask52x4(%rip),%ymm11,%ymm11
417	vpandq	.Lmask52x4(%rip),%ymm12,%ymm12
418
419
/* Add the shifted carries to the masked digits. */
420	vpaddq	%ymm0,%ymm3,%ymm3
421	vpaddq	%ymm1,%ymm4,%ymm4
422	vpaddq	%ymm2,%ymm5,%ymm5
423	vpaddq	%ymm23,%ymm6,%ymm6
424	vpaddq	%ymm24,%ymm7,%ymm7
425	vpaddq	%ymm25,%ymm8,%ymm8
426	vpaddq	%ymm26,%ymm9,%ymm9
427	vpaddq	%ymm27,%ymm10,%ymm10
428	vpaddq	%ymm28,%ymm11,%ymm11
429	vpaddq	%ymm29,%ymm12,%ymm12
430
431
432
/*
 * Collect one bit per lane for "lane > 2^52-1" (vpcmpuq predicate 6, unsigned
 * greater-than). Two ymm registers (4 bits each) are packed per byte:
 * %r14b (lowest lanes), %r13b, %r12b, %r11b, %r10b (highest lanes).
 */
433	vpcmpuq	$6,.Lmask52x4(%rip),%ymm3,%k1
434	vpcmpuq	$6,.Lmask52x4(%rip),%ymm4,%k2
435	kmovb	%k1,%r14d
436	kmovb	%k2,%r13d
437	shlb	$4,%r13b
438	orb	%r13b,%r14b
439
440	vpcmpuq	$6,.Lmask52x4(%rip),%ymm5,%k1
441	vpcmpuq	$6,.Lmask52x4(%rip),%ymm6,%k2
442	kmovb	%k1,%r13d
443	kmovb	%k2,%r12d
444	shlb	$4,%r12b
445	orb	%r12b,%r13b
446
447	vpcmpuq	$6,.Lmask52x4(%rip),%ymm7,%k1
448	vpcmpuq	$6,.Lmask52x4(%rip),%ymm8,%k2
449	kmovb	%k1,%r12d
450	kmovb	%k2,%r11d
451	shlb	$4,%r11b
452	orb	%r11b,%r12b
453
454	vpcmpuq	$6,.Lmask52x4(%rip),%ymm9,%k1
455	vpcmpuq	$6,.Lmask52x4(%rip),%ymm10,%k2
456	kmovb	%k1,%r11d
457	kmovb	%k2,%r10d
458	shlb	$4,%r10b
459	orb	%r10b,%r11b
460
461	vpcmpuq	$6,.Lmask52x4(%rip),%ymm11,%k1
462	vpcmpuq	$6,.Lmask52x4(%rip),%ymm12,%k2
463	kmovb	%k1,%r10d
464	kmovb	%k2,%r9d
465	shlb	$4,%r9b
466	orb	%r9b,%r10b
467
/* Shift the 40-bit "greater" mask left by one bit across the five bytes. */
468	addb	%r14b,%r14b
469	adcb	%r13b,%r13b
470	adcb	%r12b,%r12b
471	adcb	%r11b,%r11b
472	adcb	%r10b,%r10b
473
474
/*
 * Collect one bit per lane for "lane == 2^52-1" (vpcmpuq predicate 0) into
 * %r9b, %r8b, %dl, %cl, %bl. The argument registers are scratch from here on.
 */
475	vpcmpuq	$0,.Lmask52x4(%rip),%ymm3,%k1
476	vpcmpuq	$0,.Lmask52x4(%rip),%ymm4,%k2
477	kmovb	%k1,%r9d
478	kmovb	%k2,%r8d
479	shlb	$4,%r8b
480	orb	%r8b,%r9b
481
482	vpcmpuq	$0,.Lmask52x4(%rip),%ymm5,%k1
483	vpcmpuq	$0,.Lmask52x4(%rip),%ymm6,%k2
484	kmovb	%k1,%r8d
485	kmovb	%k2,%edx
486	shlb	$4,%dl
487	orb	%dl,%r8b
488
489	vpcmpuq	$0,.Lmask52x4(%rip),%ymm7,%k1
490	vpcmpuq	$0,.Lmask52x4(%rip),%ymm8,%k2
491	kmovb	%k1,%edx
492	kmovb	%k2,%ecx
493	shlb	$4,%cl
494	orb	%cl,%dl
495
496	vpcmpuq	$0,.Lmask52x4(%rip),%ymm9,%k1
497	vpcmpuq	$0,.Lmask52x4(%rip),%ymm10,%k2
498	kmovb	%k1,%ecx
499	kmovb	%k2,%ebx
500	shlb	$4,%bl
501	orb	%bl,%cl
502
503	vpcmpuq	$0,.Lmask52x4(%rip),%ymm11,%k1
504	vpcmpuq	$0,.Lmask52x4(%rip),%ymm12,%k2
505	kmovb	%k1,%ebx
506	kmovb	%k2,%eax
507	shlb	$4,%al
508	orb	%al,%bl
509
/*
 * Add the "equal" mask to the shifted "greater" mask with the carry chained
 * through all five bytes, so a carry ripples across runs of saturated lanes.
 */
510	addb	%r9b,%r14b
511	adcb	%r8b,%r13b
512	adcb	%dl,%r12b
513	adcb	%cl,%r11b
514	adcb	%bl,%r10b
515
/* XOR with the "equal" mask: set bits now mark the lanes to adjust below. */
516	xorb	%r9b,%r14b
517	xorb	%r8b,%r13b
518	xorb	%dl,%r12b
519	xorb	%cl,%r11b
520	xorb	%bl,%r10b
521
/* Load the low and high nibble of each byte into a mask register per ymm. */
522	kmovb	%r14d,%k1
523	shrb	$4,%r14b
524	kmovb	%r14d,%k2
525	kmovb	%r13d,%k3
526	shrb	$4,%r13b
527	kmovb	%r13d,%k4
528	kmovb	%r12d,%k5
529	shrb	$4,%r12b
530	kmovb	%r12d,%k6
531	kmovb	%r11d,%k7
532
/*
 * In the selected lanes subtract 2^52-1; together with the 52-bit mask that
 * follows, this equals adding 1 modulo 2^52.
 */
533	vpsubq	.Lmask52x4(%rip),%ymm3,%ymm3{%k1}
534	vpsubq	.Lmask52x4(%rip),%ymm4,%ymm4{%k2}
535	vpsubq	.Lmask52x4(%rip),%ymm5,%ymm5{%k3}
536	vpsubq	.Lmask52x4(%rip),%ymm6,%ymm6{%k4}
537	vpsubq	.Lmask52x4(%rip),%ymm7,%ymm7{%k5}
538	vpsubq	.Lmask52x4(%rip),%ymm8,%ymm8{%k6}
539	vpsubq	.Lmask52x4(%rip),%ymm9,%ymm9{%k7}
540
541	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
542	vpandq	.Lmask52x4(%rip),%ymm4,%ymm4
543	vpandq	.Lmask52x4(%rip),%ymm5,%ymm5
544	vpandq	.Lmask52x4(%rip),%ymm6,%ymm6
545	vpandq	.Lmask52x4(%rip),%ymm7,%ymm7
546	vpandq	.Lmask52x4(%rip),%ymm8,%ymm8
547	vpandq	.Lmask52x4(%rip),%ymm9,%ymm9
548
/* Only %k1..%k7 are used, so the last three registers reuse %k1..%k3. */
549	shrb	$4,%r11b
550	kmovb	%r11d,%k1
551	kmovb	%r10d,%k2
552	shrb	$4,%r10b
553	kmovb	%r10d,%k3
554
555	vpsubq	.Lmask52x4(%rip),%ymm10,%ymm10{%k1}
556	vpsubq	.Lmask52x4(%rip),%ymm11,%ymm11{%k2}
557	vpsubq	.Lmask52x4(%rip),%ymm12,%ymm12{%k3}
558
559	vpandq	.Lmask52x4(%rip),%ymm10,%ymm10
560	vpandq	.Lmask52x4(%rip),%ymm11,%ymm11
561	vpandq	.Lmask52x4(%rip),%ymm12,%ymm12
562
/* Store the 40 result digits to the destination (unaligned stores). */
563	vmovdqu64	%ymm3,0(%rdi)
564	vmovdqu64	%ymm4,32(%rdi)
565	vmovdqu64	%ymm5,64(%rdi)
566	vmovdqu64	%ymm6,96(%rdi)
567	vmovdqu64	%ymm7,128(%rdi)
568	vmovdqu64	%ymm8,160(%rdi)
569	vmovdqu64	%ymm9,192(%rdi)
570	vmovdqu64	%ymm10,224(%rdi)
571	vmovdqu64	%ymm11,256(%rdi)
572	vmovdqu64	%ymm12,288(%rdi)
573
/*
 * Epilogue: clear the upper vector state, reload the six saved registers
 * through %rax in reverse push order, then release the 48 bytes of saves.
 */
574	vzeroupper
575	leaq	(%rsp),%rax
576.cfi_def_cfa_register	%rax
577	movq	0(%rax),%r15
578.cfi_restore	%r15
579	movq	8(%rax),%r14
580.cfi_restore	%r14
581	movq	16(%rax),%r13
582.cfi_restore	%r13
583	movq	24(%rax),%r12
584.cfi_restore	%r12
585	movq	32(%rax),%rbp
586.cfi_restore	%rbp
587	movq	40(%rax),%rbx
588.cfi_restore	%rbx
589	leaq	48(%rax),%rsp
590.cfi_def_cfa	%rsp,8
591.Lossl_rsaz_amm52x40_x1_ifma256_epilogue:
592
/* Bytes F3 C3 encode "rep ret". */
593	.byte	0xf3,0xc3
594.cfi_endproc
595.size	ossl_rsaz_amm52x40_x1_ifma256, .-ossl_rsaz_amm52x40_x1_ifma256
/*
 * .Lmask52x4: four qwords each holding 2^52-1 (low 52 bits set), 32-byte
 * aligned, i.e. one ymm register worth of per-lane 52-bit masks. Referenced
 * as .Lmask52x4(%rip) by the vpandq, vpcmpuq and vpsubq instructions in this
 * file. The trailing .text directive switches back to the code section.
 */
596.section	.rodata
597.align	32
598.Lmask52x4:
599.quad	0xfffffffffffff
600.quad	0xfffffffffffff
601.quad	0xfffffffffffff
602.quad	0xfffffffffffff
603.text
604
/*
 * ossl_rsaz_amm52x40_x2_ifma256
 *
 * Two independent 40-digit multiply-accumulate computations interleaved in
 * one loop, using vpmadd52luq/vpmadd52huq. The second instance uses the same
 * pointers at byte offset 320 (40 qwords) and a second scalar factor.
 * NOTE(review): the name and structure suggest two "almost Montgomery
 * multiplications" in radix 2^52 -- confirm against the generator script.
 *
 * Register use as visible in the code (System V argument registers):
 *   %rdi  destination; 20 x 32-byte stores (offsets 0..288 and 320..608)
 *   %rsi  first operands, at offsets 0.. and 320..
 *   %rdx  second operands; copied to %r11, one qword of each per iteration
 *         (offsets 0 and 320 from the moving pointer)
 *   %rcx  third operands (presumably the moduli -- confirm), offsets 0.. / 320..
 *   %r8   pointer to two qword scalar factors, read at (%r8) and 8(%r8)
 *         (presumably the Montgomery constants k0 -- confirm)
 * Internal state:
 *   %ymm3..%ymm12 / %r9    accumulator and low-digit scalar, first instance
 *   %ymm13..%ymm22 / %r15  accumulator and low-digit scalar, second instance
 *   %ymm0 zero during the loop, %rax = 2^52-1 during the loop, %ebx counter
 * After the loop %rax, %rbx, %rcx, %rdx and %r8..%r14 are reused as scratch.
 */
605.globl	ossl_rsaz_amm52x40_x2_ifma256
606.type	ossl_rsaz_amm52x40_x2_ifma256,@function
607.align	32
608ossl_rsaz_amm52x40_x2_ifma256:
609.cfi_startproc
/* Bytes F3 0F 1E FA encode endbr64. */
610.byte	243,15,30,250
/* Save the callee-saved registers; the epilogue reloads them from the stack. */
611	pushq	%rbx
612.cfi_adjust_cfa_offset	8
613.cfi_offset	%rbx,-16
614	pushq	%rbp
615.cfi_adjust_cfa_offset	8
616.cfi_offset	%rbp,-24
617	pushq	%r12
618.cfi_adjust_cfa_offset	8
619.cfi_offset	%r12,-32
620	pushq	%r13
621.cfi_adjust_cfa_offset	8
622.cfi_offset	%r13,-40
623	pushq	%r14
624.cfi_adjust_cfa_offset	8
625.cfi_offset	%r14,-48
626	pushq	%r15
627.cfi_adjust_cfa_offset	8
628.cfi_offset	%r15,-56
629
/* Zero %ymm0 and the first instance's accumulator %ymm3..%ymm12. */
630	vpxord	%ymm0,%ymm0,%ymm0
631	vmovdqa64	%ymm0,%ymm3
632	vmovdqa64	%ymm0,%ymm4
633	vmovdqa64	%ymm0,%ymm5
634	vmovdqa64	%ymm0,%ymm6
635	vmovdqa64	%ymm0,%ymm7
636	vmovdqa64	%ymm0,%ymm8
637	vmovdqa64	%ymm0,%ymm9
638	vmovdqa64	%ymm0,%ymm10
639	vmovdqa64	%ymm0,%ymm11
640	vmovdqa64	%ymm0,%ymm12
641
/* Zero the second instance's accumulator %ymm13..%ymm22. */
642	vmovdqa64	%ymm0,%ymm13
643	vmovdqa64	%ymm0,%ymm14
644	vmovdqa64	%ymm0,%ymm15
645	vmovdqa64	%ymm0,%ymm16
646	vmovdqa64	%ymm0,%ymm17
647	vmovdqa64	%ymm0,%ymm18
648	vmovdqa64	%ymm0,%ymm19
649	vmovdqa64	%ymm0,%ymm20
650	vmovdqa64	%ymm0,%ymm21
651	vmovdqa64	%ymm0,%ymm22
652
653
/* Both scalar low-digit accumulators start at zero. */
654	xorl	%r9d,%r9d
655	xorl	%r15d,%r15d
656
/* %rdx is needed as the implicit mulx operand, so keep the pointer in %r11. */
657	movq	%rdx,%r11
/* %rax = 2^52-1, the 52-bit digit mask. */
658	movq	$0xfffffffffffff,%rax
659
/* 40 iterations; each consumes one qword for each of the two instances. */
660	movl	$40,%ebx
661
662.align	32
663.Lloop40:
/* ---- first instance: qword at 0(%r11), operands at offset 0 ---- */
664	movq	0(%r11),%r13
665
/* %r12:%r13 = qword0(%rsi) * current qword; add it into %r10:%r9. */
666	vpbroadcastq	%r13,%ymm1
667	movq	0(%rsi),%rdx
668	mulxq	%r13,%r13,%r12
669	addq	%r13,%r9
670	movq	%r12,%r10
671	adcq	$0,%r10
672
/* Second multiplier: (first scalar factor * %r9) truncated to 52 bits. */
673	movq	(%r8),%r13
674	imulq	%r9,%r13
675	andq	%rax,%r13
676
/* Add qword0(%rcx) * second multiplier into %r10:%r9. */
677	vpbroadcastq	%r13,%ymm2
678	movq	0(%rcx),%rdx
679	mulxq	%r13,%r13,%r12
680	addq	%r13,%r9
681	adcq	%r12,%r10
682
/* %r9 = low 64 bits of (%r10:%r9 >> 52). */
683	shrq	$52,%r9
684	salq	$12,%r10
685	orq	%r10,%r9
686
/* Accumulate the low 52 bits of the lane products. */
687	vpmadd52luq	0(%rsi),%ymm1,%ymm3
688	vpmadd52luq	32(%rsi),%ymm1,%ymm4
689	vpmadd52luq	64(%rsi),%ymm1,%ymm5
690	vpmadd52luq	96(%rsi),%ymm1,%ymm6
691	vpmadd52luq	128(%rsi),%ymm1,%ymm7
692	vpmadd52luq	160(%rsi),%ymm1,%ymm8
693	vpmadd52luq	192(%rsi),%ymm1,%ymm9
694	vpmadd52luq	224(%rsi),%ymm1,%ymm10
695	vpmadd52luq	256(%rsi),%ymm1,%ymm11
696	vpmadd52luq	288(%rsi),%ymm1,%ymm12
697
698	vpmadd52luq	0(%rcx),%ymm2,%ymm3
699	vpmadd52luq	32(%rcx),%ymm2,%ymm4
700	vpmadd52luq	64(%rcx),%ymm2,%ymm5
701	vpmadd52luq	96(%rcx),%ymm2,%ymm6
702	vpmadd52luq	128(%rcx),%ymm2,%ymm7
703	vpmadd52luq	160(%rcx),%ymm2,%ymm8
704	vpmadd52luq	192(%rcx),%ymm2,%ymm9
705	vpmadd52luq	224(%rcx),%ymm2,%ymm10
706	vpmadd52luq	256(%rcx),%ymm2,%ymm11
707	vpmadd52luq	288(%rcx),%ymm2,%ymm12
708
709
/* Shift the 40-lane accumulator down by one lane; zero (%ymm0) enters at the top. */
710	valignq	$1,%ymm3,%ymm4,%ymm3
711	valignq	$1,%ymm4,%ymm5,%ymm4
712	valignq	$1,%ymm5,%ymm6,%ymm5
713	valignq	$1,%ymm6,%ymm7,%ymm6
714	valignq	$1,%ymm7,%ymm8,%ymm7
715	valignq	$1,%ymm8,%ymm9,%ymm8
716	valignq	$1,%ymm9,%ymm10,%ymm9
717	valignq	$1,%ymm10,%ymm11,%ymm10
718	valignq	$1,%ymm11,%ymm12,%ymm11
719	valignq	$1,%ymm12,%ymm0,%ymm12
720
/* Fold the new lowest lane into the scalar accumulator. */
721	vmovq	%xmm3,%r13
722	addq	%r13,%r9
723
/* Accumulate the high 52 bits of the same products into the shifted lanes. */
724	vpmadd52huq	0(%rsi),%ymm1,%ymm3
725	vpmadd52huq	32(%rsi),%ymm1,%ymm4
726	vpmadd52huq	64(%rsi),%ymm1,%ymm5
727	vpmadd52huq	96(%rsi),%ymm1,%ymm6
728	vpmadd52huq	128(%rsi),%ymm1,%ymm7
729	vpmadd52huq	160(%rsi),%ymm1,%ymm8
730	vpmadd52huq	192(%rsi),%ymm1,%ymm9
731	vpmadd52huq	224(%rsi),%ymm1,%ymm10
732	vpmadd52huq	256(%rsi),%ymm1,%ymm11
733	vpmadd52huq	288(%rsi),%ymm1,%ymm12
734
735	vpmadd52huq	0(%rcx),%ymm2,%ymm3
736	vpmadd52huq	32(%rcx),%ymm2,%ymm4
737	vpmadd52huq	64(%rcx),%ymm2,%ymm5
738	vpmadd52huq	96(%rcx),%ymm2,%ymm6
739	vpmadd52huq	128(%rcx),%ymm2,%ymm7
740	vpmadd52huq	160(%rcx),%ymm2,%ymm8
741	vpmadd52huq	192(%rcx),%ymm2,%ymm9
742	vpmadd52huq	224(%rcx),%ymm2,%ymm10
743	vpmadd52huq	256(%rcx),%ymm2,%ymm11
744	vpmadd52huq	288(%rcx),%ymm2,%ymm12
/*
 * ---- second instance: qword at 320(%r11), operands at offset 320 ----
 * Same sequence as above with %r15 as scalar accumulator, %ymm13..%ymm22 as
 * vector accumulator and the scalar factor read from 8(%r8).
 */
745	movq	320(%r11),%r13
746
747	vpbroadcastq	%r13,%ymm1
748	movq	320(%rsi),%rdx
749	mulxq	%r13,%r13,%r12
750	addq	%r13,%r15
751	movq	%r12,%r10
752	adcq	$0,%r10
753
754	movq	8(%r8),%r13
755	imulq	%r15,%r13
756	andq	%rax,%r13
757
758	vpbroadcastq	%r13,%ymm2
759	movq	320(%rcx),%rdx
760	mulxq	%r13,%r13,%r12
761	addq	%r13,%r15
762	adcq	%r12,%r10
763
764	shrq	$52,%r15
765	salq	$12,%r10
766	orq	%r10,%r15
767
768	vpmadd52luq	320(%rsi),%ymm1,%ymm13
769	vpmadd52luq	352(%rsi),%ymm1,%ymm14
770	vpmadd52luq	384(%rsi),%ymm1,%ymm15
771	vpmadd52luq	416(%rsi),%ymm1,%ymm16
772	vpmadd52luq	448(%rsi),%ymm1,%ymm17
773	vpmadd52luq	480(%rsi),%ymm1,%ymm18
774	vpmadd52luq	512(%rsi),%ymm1,%ymm19
775	vpmadd52luq	544(%rsi),%ymm1,%ymm20
776	vpmadd52luq	576(%rsi),%ymm1,%ymm21
777	vpmadd52luq	608(%rsi),%ymm1,%ymm22
778
779	vpmadd52luq	320(%rcx),%ymm2,%ymm13
780	vpmadd52luq	352(%rcx),%ymm2,%ymm14
781	vpmadd52luq	384(%rcx),%ymm2,%ymm15
782	vpmadd52luq	416(%rcx),%ymm2,%ymm16
783	vpmadd52luq	448(%rcx),%ymm2,%ymm17
784	vpmadd52luq	480(%rcx),%ymm2,%ymm18
785	vpmadd52luq	512(%rcx),%ymm2,%ymm19
786	vpmadd52luq	544(%rcx),%ymm2,%ymm20
787	vpmadd52luq	576(%rcx),%ymm2,%ymm21
788	vpmadd52luq	608(%rcx),%ymm2,%ymm22
789
790
791	valignq	$1,%ymm13,%ymm14,%ymm13
792	valignq	$1,%ymm14,%ymm15,%ymm14
793	valignq	$1,%ymm15,%ymm16,%ymm15
794	valignq	$1,%ymm16,%ymm17,%ymm16
795	valignq	$1,%ymm17,%ymm18,%ymm17
796	valignq	$1,%ymm18,%ymm19,%ymm18
797	valignq	$1,%ymm19,%ymm20,%ymm19
798	valignq	$1,%ymm20,%ymm21,%ymm20
799	valignq	$1,%ymm21,%ymm22,%ymm21
800	valignq	$1,%ymm22,%ymm0,%ymm22
801
802	vmovq	%xmm13,%r13
803	addq	%r13,%r15
804
805	vpmadd52huq	320(%rsi),%ymm1,%ymm13
806	vpmadd52huq	352(%rsi),%ymm1,%ymm14
807	vpmadd52huq	384(%rsi),%ymm1,%ymm15
808	vpmadd52huq	416(%rsi),%ymm1,%ymm16
809	vpmadd52huq	448(%rsi),%ymm1,%ymm17
810	vpmadd52huq	480(%rsi),%ymm1,%ymm18
811	vpmadd52huq	512(%rsi),%ymm1,%ymm19
812	vpmadd52huq	544(%rsi),%ymm1,%ymm20
813	vpmadd52huq	576(%rsi),%ymm1,%ymm21
814	vpmadd52huq	608(%rsi),%ymm1,%ymm22
815
816	vpmadd52huq	320(%rcx),%ymm2,%ymm13
817	vpmadd52huq	352(%rcx),%ymm2,%ymm14
818	vpmadd52huq	384(%rcx),%ymm2,%ymm15
819	vpmadd52huq	416(%rcx),%ymm2,%ymm16
820	vpmadd52huq	448(%rcx),%ymm2,%ymm17
821	vpmadd52huq	480(%rcx),%ymm2,%ymm18
822	vpmadd52huq	512(%rcx),%ymm2,%ymm19
823	vpmadd52huq	544(%rcx),%ymm2,%ymm20
824	vpmadd52huq	576(%rcx),%ymm2,%ymm21
825	vpmadd52huq	608(%rcx),%ymm2,%ymm22
/* Advance by one qword and loop until %ebx reaches zero. */
826	leaq	8(%r11),%r11
827	decl	%ebx
828	jne	.Lloop40
829
/*
 * ==== Normalize the first instance (%ymm3..%ymm12) ====
 * Replace lane 0 of %ymm3 with the scalar accumulator %r9.
 */
830	vpbroadcastq	%r9,%ymm0
831	vpblendd	$3,%ymm0,%ymm3,%ymm3
832
833
834
/* Extract the bits above bit 51 of every lane (the per-lane carries). */
835	vpsrlq	$52,%ymm3,%ymm0
836	vpsrlq	$52,%ymm4,%ymm1
837	vpsrlq	$52,%ymm5,%ymm2
838	vpsrlq	$52,%ymm6,%ymm23
839	vpsrlq	$52,%ymm7,%ymm24
840	vpsrlq	$52,%ymm8,%ymm25
841	vpsrlq	$52,%ymm9,%ymm26
842	vpsrlq	$52,%ymm10,%ymm27
843	vpsrlq	$52,%ymm11,%ymm28
844	vpsrlq	$52,%ymm12,%ymm29
845
846
/*
 * Move every carry up by one lane so it lines up with the next higher digit.
 * .Lzeros is defined outside this block; presumably a zero vector -- confirm.
 */
847	valignq	$3,%ymm28,%ymm29,%ymm29
848	valignq	$3,%ymm27,%ymm28,%ymm28
849	valignq	$3,%ymm26,%ymm27,%ymm27
850	valignq	$3,%ymm25,%ymm26,%ymm26
851	valignq	$3,%ymm24,%ymm25,%ymm25
852	valignq	$3,%ymm23,%ymm24,%ymm24
853	valignq	$3,%ymm2,%ymm23,%ymm23
854	valignq	$3,%ymm1,%ymm2,%ymm2
855	valignq	$3,%ymm0,%ymm1,%ymm1
856	valignq	$3,.Lzeros(%rip),%ymm0,%ymm0
857
858
/* Keep only the low 52 bits of every lane. */
859	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
860	vpandq	.Lmask52x4(%rip),%ymm4,%ymm4
861	vpandq	.Lmask52x4(%rip),%ymm5,%ymm5
862	vpandq	.Lmask52x4(%rip),%ymm6,%ymm6
863	vpandq	.Lmask52x4(%rip),%ymm7,%ymm7
864	vpandq	.Lmask52x4(%rip),%ymm8,%ymm8
865	vpandq	.Lmask52x4(%rip),%ymm9,%ymm9
866	vpandq	.Lmask52x4(%rip),%ymm10,%ymm10
867	vpandq	.Lmask52x4(%rip),%ymm11,%ymm11
868	vpandq	.Lmask52x4(%rip),%ymm12,%ymm12
869
870
/* Add the shifted carries to the masked digits. */
871	vpaddq	%ymm0,%ymm3,%ymm3
872	vpaddq	%ymm1,%ymm4,%ymm4
873	vpaddq	%ymm2,%ymm5,%ymm5
874	vpaddq	%ymm23,%ymm6,%ymm6
875	vpaddq	%ymm24,%ymm7,%ymm7
876	vpaddq	%ymm25,%ymm8,%ymm8
877	vpaddq	%ymm26,%ymm9,%ymm9
878	vpaddq	%ymm27,%ymm10,%ymm10
879	vpaddq	%ymm28,%ymm11,%ymm11
880	vpaddq	%ymm29,%ymm12,%ymm12
881
882
883
/*
 * Collect one bit per lane for "lane > 2^52-1" (vpcmpuq predicate 6, unsigned
 * greater-than). Two ymm registers (4 bits each) are packed per byte:
 * %r14b (lowest lanes), %r13b, %r12b, %r11b, %r10b (highest lanes).
 */
884	vpcmpuq	$6,.Lmask52x4(%rip),%ymm3,%k1
885	vpcmpuq	$6,.Lmask52x4(%rip),%ymm4,%k2
886	kmovb	%k1,%r14d
887	kmovb	%k2,%r13d
888	shlb	$4,%r13b
889	orb	%r13b,%r14b
890
891	vpcmpuq	$6,.Lmask52x4(%rip),%ymm5,%k1
892	vpcmpuq	$6,.Lmask52x4(%rip),%ymm6,%k2
893	kmovb	%k1,%r13d
894	kmovb	%k2,%r12d
895	shlb	$4,%r12b
896	orb	%r12b,%r13b
897
898	vpcmpuq	$6,.Lmask52x4(%rip),%ymm7,%k1
899	vpcmpuq	$6,.Lmask52x4(%rip),%ymm8,%k2
900	kmovb	%k1,%r12d
901	kmovb	%k2,%r11d
902	shlb	$4,%r11b
903	orb	%r11b,%r12b
904
905	vpcmpuq	$6,.Lmask52x4(%rip),%ymm9,%k1
906	vpcmpuq	$6,.Lmask52x4(%rip),%ymm10,%k2
907	kmovb	%k1,%r11d
908	kmovb	%k2,%r10d
909	shlb	$4,%r10b
910	orb	%r10b,%r11b
911
912	vpcmpuq	$6,.Lmask52x4(%rip),%ymm11,%k1
913	vpcmpuq	$6,.Lmask52x4(%rip),%ymm12,%k2
914	kmovb	%k1,%r10d
915	kmovb	%k2,%r9d
916	shlb	$4,%r9b
917	orb	%r9b,%r10b
918
/* Shift the 40-bit "greater" mask left by one bit across the five bytes. */
919	addb	%r14b,%r14b
920	adcb	%r13b,%r13b
921	adcb	%r12b,%r12b
922	adcb	%r11b,%r11b
923	adcb	%r10b,%r10b
924
925
/*
 * Collect one bit per lane for "lane == 2^52-1" (vpcmpuq predicate 0) into
 * %r9b, %r8b, %dl, %cl, %bl. The argument registers are scratch from here on.
 */
926	vpcmpuq	$0,.Lmask52x4(%rip),%ymm3,%k1
927	vpcmpuq	$0,.Lmask52x4(%rip),%ymm4,%k2
928	kmovb	%k1,%r9d
929	kmovb	%k2,%r8d
930	shlb	$4,%r8b
931	orb	%r8b,%r9b
932
933	vpcmpuq	$0,.Lmask52x4(%rip),%ymm5,%k1
934	vpcmpuq	$0,.Lmask52x4(%rip),%ymm6,%k2
935	kmovb	%k1,%r8d
936	kmovb	%k2,%edx
937	shlb	$4,%dl
938	orb	%dl,%r8b
939
940	vpcmpuq	$0,.Lmask52x4(%rip),%ymm7,%k1
941	vpcmpuq	$0,.Lmask52x4(%rip),%ymm8,%k2
942	kmovb	%k1,%edx
943	kmovb	%k2,%ecx
944	shlb	$4,%cl
945	orb	%cl,%dl
946
947	vpcmpuq	$0,.Lmask52x4(%rip),%ymm9,%k1
948	vpcmpuq	$0,.Lmask52x4(%rip),%ymm10,%k2
949	kmovb	%k1,%ecx
950	kmovb	%k2,%ebx
951	shlb	$4,%bl
952	orb	%bl,%cl
953
954	vpcmpuq	$0,.Lmask52x4(%rip),%ymm11,%k1
955	vpcmpuq	$0,.Lmask52x4(%rip),%ymm12,%k2
956	kmovb	%k1,%ebx
957	kmovb	%k2,%eax
958	shlb	$4,%al
959	orb	%al,%bl
960
/*
 * Add the "equal" mask to the shifted "greater" mask with the carry chained
 * through all five bytes, so a carry ripples across runs of saturated lanes.
 */
961	addb	%r9b,%r14b
962	adcb	%r8b,%r13b
963	adcb	%dl,%r12b
964	adcb	%cl,%r11b
965	adcb	%bl,%r10b
966
/* XOR with the "equal" mask: set bits now mark the lanes to adjust below. */
967	xorb	%r9b,%r14b
968	xorb	%r8b,%r13b
969	xorb	%dl,%r12b
970	xorb	%cl,%r11b
971	xorb	%bl,%r10b
972
/* Load the low and high nibble of each byte into a mask register per ymm. */
973	kmovb	%r14d,%k1
974	shrb	$4,%r14b
975	kmovb	%r14d,%k2
976	kmovb	%r13d,%k3
977	shrb	$4,%r13b
978	kmovb	%r13d,%k4
979	kmovb	%r12d,%k5
980	shrb	$4,%r12b
981	kmovb	%r12d,%k6
982	kmovb	%r11d,%k7
983
/*
 * In the selected lanes subtract 2^52-1; together with the 52-bit mask that
 * follows, this equals adding 1 modulo 2^52.
 */
984	vpsubq	.Lmask52x4(%rip),%ymm3,%ymm3{%k1}
985	vpsubq	.Lmask52x4(%rip),%ymm4,%ymm4{%k2}
986	vpsubq	.Lmask52x4(%rip),%ymm5,%ymm5{%k3}
987	vpsubq	.Lmask52x4(%rip),%ymm6,%ymm6{%k4}
988	vpsubq	.Lmask52x4(%rip),%ymm7,%ymm7{%k5}
989	vpsubq	.Lmask52x4(%rip),%ymm8,%ymm8{%k6}
990	vpsubq	.Lmask52x4(%rip),%ymm9,%ymm9{%k7}
991
992	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
993	vpandq	.Lmask52x4(%rip),%ymm4,%ymm4
994	vpandq	.Lmask52x4(%rip),%ymm5,%ymm5
995	vpandq	.Lmask52x4(%rip),%ymm6,%ymm6
996	vpandq	.Lmask52x4(%rip),%ymm7,%ymm7
997	vpandq	.Lmask52x4(%rip),%ymm8,%ymm8
998	vpandq	.Lmask52x4(%rip),%ymm9,%ymm9
999
/* Only %k1..%k7 are used, so the last three registers reuse %k1..%k3. */
1000	shrb	$4,%r11b
1001	kmovb	%r11d,%k1
1002	kmovb	%r10d,%k2
1003	shrb	$4,%r10b
1004	kmovb	%r10d,%k3
1005
1006	vpsubq	.Lmask52x4(%rip),%ymm10,%ymm10{%k1}
1007	vpsubq	.Lmask52x4(%rip),%ymm11,%ymm11{%k2}
1008	vpsubq	.Lmask52x4(%rip),%ymm12,%ymm12{%k3}
1009
1010	vpandq	.Lmask52x4(%rip),%ymm10,%ymm10
1011	vpandq	.Lmask52x4(%rip),%ymm11,%ymm11
1012	vpandq	.Lmask52x4(%rip),%ymm12,%ymm12
1013
/*
 * ==== Normalize the second instance (%ymm13..%ymm22) ====
 * Same sequence as for the first instance; %r15 supplies lane 0 of %ymm13.
 */
1014	vpbroadcastq	%r15,%ymm0
1015	vpblendd	$3,%ymm0,%ymm13,%ymm13
1016
1017
1018
/* Per-lane carries (bits above bit 51). */
1019	vpsrlq	$52,%ymm13,%ymm0
1020	vpsrlq	$52,%ymm14,%ymm1
1021	vpsrlq	$52,%ymm15,%ymm2
1022	vpsrlq	$52,%ymm16,%ymm23
1023	vpsrlq	$52,%ymm17,%ymm24
1024	vpsrlq	$52,%ymm18,%ymm25
1025	vpsrlq	$52,%ymm19,%ymm26
1026	vpsrlq	$52,%ymm20,%ymm27
1027	vpsrlq	$52,%ymm21,%ymm28
1028	vpsrlq	$52,%ymm22,%ymm29
1029
1030
/* Move every carry up by one lane. */
1031	valignq	$3,%ymm28,%ymm29,%ymm29
1032	valignq	$3,%ymm27,%ymm28,%ymm28
1033	valignq	$3,%ymm26,%ymm27,%ymm27
1034	valignq	$3,%ymm25,%ymm26,%ymm26
1035	valignq	$3,%ymm24,%ymm25,%ymm25
1036	valignq	$3,%ymm23,%ymm24,%ymm24
1037	valignq	$3,%ymm2,%ymm23,%ymm23
1038	valignq	$3,%ymm1,%ymm2,%ymm2
1039	valignq	$3,%ymm0,%ymm1,%ymm1
1040	valignq	$3,.Lzeros(%rip),%ymm0,%ymm0
1041
1042
/* Keep only the low 52 bits of every lane. */
1043	vpandq	.Lmask52x4(%rip),%ymm13,%ymm13
1044	vpandq	.Lmask52x4(%rip),%ymm14,%ymm14
1045	vpandq	.Lmask52x4(%rip),%ymm15,%ymm15
1046	vpandq	.Lmask52x4(%rip),%ymm16,%ymm16
1047	vpandq	.Lmask52x4(%rip),%ymm17,%ymm17
1048	vpandq	.Lmask52x4(%rip),%ymm18,%ymm18
1049	vpandq	.Lmask52x4(%rip),%ymm19,%ymm19
1050	vpandq	.Lmask52x4(%rip),%ymm20,%ymm20
1051	vpandq	.Lmask52x4(%rip),%ymm21,%ymm21
1052	vpandq	.Lmask52x4(%rip),%ymm22,%ymm22
1053
1054
/* Add the shifted carries to the masked digits. */
1055	vpaddq	%ymm0,%ymm13,%ymm13
1056	vpaddq	%ymm1,%ymm14,%ymm14
1057	vpaddq	%ymm2,%ymm15,%ymm15
1058	vpaddq	%ymm23,%ymm16,%ymm16
1059	vpaddq	%ymm24,%ymm17,%ymm17
1060	vpaddq	%ymm25,%ymm18,%ymm18
1061	vpaddq	%ymm26,%ymm19,%ymm19
1062	vpaddq	%ymm27,%ymm20,%ymm20
1063	vpaddq	%ymm28,%ymm21,%ymm21
1064	vpaddq	%ymm29,%ymm22,%ymm22
1065
1066
1067
/* "lane > 2^52-1" bits, packed into %r14b, %r13b, %r12b, %r11b, %r10b. */
1068	vpcmpuq	$6,.Lmask52x4(%rip),%ymm13,%k1
1069	vpcmpuq	$6,.Lmask52x4(%rip),%ymm14,%k2
1070	kmovb	%k1,%r14d
1071	kmovb	%k2,%r13d
1072	shlb	$4,%r13b
1073	orb	%r13b,%r14b
1074
1075	vpcmpuq	$6,.Lmask52x4(%rip),%ymm15,%k1
1076	vpcmpuq	$6,.Lmask52x4(%rip),%ymm16,%k2
1077	kmovb	%k1,%r13d
1078	kmovb	%k2,%r12d
1079	shlb	$4,%r12b
1080	orb	%r12b,%r13b
1081
1082	vpcmpuq	$6,.Lmask52x4(%rip),%ymm17,%k1
1083	vpcmpuq	$6,.Lmask52x4(%rip),%ymm18,%k2
1084	kmovb	%k1,%r12d
1085	kmovb	%k2,%r11d
1086	shlb	$4,%r11b
1087	orb	%r11b,%r12b
1088
1089	vpcmpuq	$6,.Lmask52x4(%rip),%ymm19,%k1
1090	vpcmpuq	$6,.Lmask52x4(%rip),%ymm20,%k2
1091	kmovb	%k1,%r11d
1092	kmovb	%k2,%r10d
1093	shlb	$4,%r10b
1094	orb	%r10b,%r11b
1095
1096	vpcmpuq	$6,.Lmask52x4(%rip),%ymm21,%k1
1097	vpcmpuq	$6,.Lmask52x4(%rip),%ymm22,%k2
1098	kmovb	%k1,%r10d
1099	kmovb	%k2,%r9d
1100	shlb	$4,%r9b
1101	orb	%r9b,%r10b
1102
/* Shift the 40-bit "greater" mask left by one bit across the five bytes. */
1103	addb	%r14b,%r14b
1104	adcb	%r13b,%r13b
1105	adcb	%r12b,%r12b
1106	adcb	%r11b,%r11b
1107	adcb	%r10b,%r10b
1108
1109
/* "lane == 2^52-1" bits, packed into %r9b, %r8b, %dl, %cl, %bl. */
1110	vpcmpuq	$0,.Lmask52x4(%rip),%ymm13,%k1
1111	vpcmpuq	$0,.Lmask52x4(%rip),%ymm14,%k2
1112	kmovb	%k1,%r9d
1113	kmovb	%k2,%r8d
1114	shlb	$4,%r8b
1115	orb	%r8b,%r9b
1116
1117	vpcmpuq	$0,.Lmask52x4(%rip),%ymm15,%k1
1118	vpcmpuq	$0,.Lmask52x4(%rip),%ymm16,%k2
1119	kmovb	%k1,%r8d
1120	kmovb	%k2,%edx
1121	shlb	$4,%dl
1122	orb	%dl,%r8b
1123
1124	vpcmpuq	$0,.Lmask52x4(%rip),%ymm17,%k1
1125	vpcmpuq	$0,.Lmask52x4(%rip),%ymm18,%k2
1126	kmovb	%k1,%edx
1127	kmovb	%k2,%ecx
1128	shlb	$4,%cl
1129	orb	%cl,%dl
1130
1131	vpcmpuq	$0,.Lmask52x4(%rip),%ymm19,%k1
1132	vpcmpuq	$0,.Lmask52x4(%rip),%ymm20,%k2
1133	kmovb	%k1,%ecx
1134	kmovb	%k2,%ebx
1135	shlb	$4,%bl
1136	orb	%bl,%cl
1137
1138	vpcmpuq	$0,.Lmask52x4(%rip),%ymm21,%k1
1139	vpcmpuq	$0,.Lmask52x4(%rip),%ymm22,%k2
1140	kmovb	%k1,%ebx
1141	kmovb	%k2,%eax
1142	shlb	$4,%al
1143	orb	%al,%bl
1144
/* Chained add of the two masks, then XOR with the "equal" mask (as above). */
1145	addb	%r9b,%r14b
1146	adcb	%r8b,%r13b
1147	adcb	%dl,%r12b
1148	adcb	%cl,%r11b
1149	adcb	%bl,%r10b
1150
1151	xorb	%r9b,%r14b
1152	xorb	%r8b,%r13b
1153	xorb	%dl,%r12b
1154	xorb	%cl,%r11b
1155	xorb	%bl,%r10b
1156
/* Load the low and high nibble of each byte into a mask register per ymm. */
1157	kmovb	%r14d,%k1
1158	shrb	$4,%r14b
1159	kmovb	%r14d,%k2
1160	kmovb	%r13d,%k3
1161	shrb	$4,%r13b
1162	kmovb	%r13d,%k4
1163	kmovb	%r12d,%k5
1164	shrb	$4,%r12b
1165	kmovb	%r12d,%k6
1166	kmovb	%r11d,%k7
1167
/* Masked subtract of 2^52-1 followed by the 52-bit mask: +1 modulo 2^52. */
1168	vpsubq	.Lmask52x4(%rip),%ymm13,%ymm13{%k1}
1169	vpsubq	.Lmask52x4(%rip),%ymm14,%ymm14{%k2}
1170	vpsubq	.Lmask52x4(%rip),%ymm15,%ymm15{%k3}
1171	vpsubq	.Lmask52x4(%rip),%ymm16,%ymm16{%k4}
1172	vpsubq	.Lmask52x4(%rip),%ymm17,%ymm17{%k5}
1173	vpsubq	.Lmask52x4(%rip),%ymm18,%ymm18{%k6}
1174	vpsubq	.Lmask52x4(%rip),%ymm19,%ymm19{%k7}
1175
1176	vpandq	.Lmask52x4(%rip),%ymm13,%ymm13
1177	vpandq	.Lmask52x4(%rip),%ymm14,%ymm14
1178	vpandq	.Lmask52x4(%rip),%ymm15,%ymm15
1179	vpandq	.Lmask52x4(%rip),%ymm16,%ymm16
1180	vpandq	.Lmask52x4(%rip),%ymm17,%ymm17
1181	vpandq	.Lmask52x4(%rip),%ymm18,%ymm18
1182	vpandq	.Lmask52x4(%rip),%ymm19,%ymm19
1183
1184	shrb	$4,%r11b
1185	kmovb	%r11d,%k1
1186	kmovb	%r10d,%k2
1187	shrb	$4,%r10b
1188	kmovb	%r10d,%k3
1189
1190	vpsubq	.Lmask52x4(%rip),%ymm20,%ymm20{%k1}
1191	vpsubq	.Lmask52x4(%rip),%ymm21,%ymm21{%k2}
1192	vpsubq	.Lmask52x4(%rip),%ymm22,%ymm22{%k3}
1193
1194	vpandq	.Lmask52x4(%rip),%ymm20,%ymm20
1195	vpandq	.Lmask52x4(%rip),%ymm21,%ymm21
1196	vpandq	.Lmask52x4(%rip),%ymm22,%ymm22
1197
/* Store the first instance's 40 digits at offsets 0..288. */
1198	vmovdqu64	%ymm3,0(%rdi)
1199	vmovdqu64	%ymm4,32(%rdi)
1200	vmovdqu64	%ymm5,64(%rdi)
1201	vmovdqu64	%ymm6,96(%rdi)
1202	vmovdqu64	%ymm7,128(%rdi)
1203	vmovdqu64	%ymm8,160(%rdi)
1204	vmovdqu64	%ymm9,192(%rdi)
1205	vmovdqu64	%ymm10,224(%rdi)
1206	vmovdqu64	%ymm11,256(%rdi)
1207	vmovdqu64	%ymm12,288(%rdi)
1208
/* Store the second instance's 40 digits at offsets 320..608. */
1209	vmovdqu64	%ymm13,320(%rdi)
1210	vmovdqu64	%ymm14,352(%rdi)
1211	vmovdqu64	%ymm15,384(%rdi)
1212	vmovdqu64	%ymm16,416(%rdi)
1213	vmovdqu64	%ymm17,448(%rdi)
1214	vmovdqu64	%ymm18,480(%rdi)
1215	vmovdqu64	%ymm19,512(%rdi)
1216	vmovdqu64	%ymm20,544(%rdi)
1217	vmovdqu64	%ymm21,576(%rdi)
1218	vmovdqu64	%ymm22,608(%rdi)
1219
/*
 * Epilogue: clear the upper vector state, reload the six saved registers
 * through %rax in reverse push order, then release the 48 bytes of saves.
 */
1220	vzeroupper
1221	leaq	(%rsp),%rax
1222.cfi_def_cfa_register	%rax
1223	movq	0(%rax),%r15
1224.cfi_restore	%r15
1225	movq	8(%rax),%r14
1226.cfi_restore	%r14
1227	movq	16(%rax),%r13
1228.cfi_restore	%r13
1229	movq	24(%rax),%r12
1230.cfi_restore	%r12
1231	movq	32(%rax),%rbp
1232.cfi_restore	%rbp
1233	movq	40(%rax),%rbx
1234.cfi_restore	%rbx
1235	leaq	48(%rax),%rsp
1236.cfi_def_cfa	%rsp,8
1237.Lossl_rsaz_amm52x40_x2_ifma256_epilogue:
/* Bytes F3 C3 encode "rep ret". */
1238	.byte	0xf3,0xc3
1239.cfi_endproc
1240.size	ossl_rsaz_amm52x40_x2_ifma256, .-ossl_rsaz_amm52x40_x2_ifma256
/*
 * ossl_extract_multiplier_2x40_win5
 *
 * Copies one 640-byte table entry (as two independently indexed 320-byte
 * halves) into the output buffer.  The entry is chosen with compare masks,
 * not with an index-dependent address: every one of the 32 entries is
 * loaded on every call, so the sequence of memory reads does not depend
 * on the index values.
 *
 * Registers on entry (as used by the code below):
 *   %rdi  output buffer; 640 bytes are written
 *   %rsi  table base; 32 entries of 640 bytes each (20480 bytes) are read
 *   %rdx  index of the entry whose bytes 0..319 go to out[0..319]
 *   %rcx  index of the entry whose bytes 320..639 go to out[320..639]
 *
 * Clobbers: %rax, %rsi, %r10, %k1, flags, %ymm0-%ymm5, %ymm16-%ymm24
 *           (upper halves of %ymm0-%ymm15 are cleared on exit).
 *
 * Notes:
 *   - All 64 bits of %rdx / %rcx take part in the comparison.
 *   - An index in %rdx outside 0..31 matches no entry and leaves the first
 *     output half all zero.  The accumulators are not re-zeroed before the
 *     second pass, so an index in %rcx outside 0..31 leaves the second
 *     output half equal to what was written to the first half.
 */
.text

.align	32
.globl	ossl_extract_multiplier_2x40_win5
.type	ossl_extract_multiplier_2x40_win5,@function
ossl_extract_multiplier_2x40_win5:
.cfi_startproc
.byte	243,15,30,250				/* endbr64 */
	vmovdqa64	.Lones(%rip),%ymm24	/* ymm24 = {1,1,1,1}: per-lane counter step */
	vpbroadcastq	%rdx,%ymm22		/* ymm22 = first index in every lane */
	vpbroadcastq	%rcx,%ymm23		/* ymm23 = second index in every lane */
	leaq	20480(%rsi),%rax		/* rax = table end (32 entries * 640 bytes) */

	/* Keep the table base so the second pass can restart from it. */
	movq	%rsi,%r10

	/* Zero the ten accumulators (ymm0-5, ymm16-19) and the entry counter. */
	vpxor	%xmm0,%xmm0,%xmm0
	vmovdqa64	%ymm0,%ymm1
	vmovdqa64	%ymm0,%ymm2
	vmovdqa64	%ymm0,%ymm3
	vmovdqa64	%ymm0,%ymm4
	vmovdqa64	%ymm0,%ymm5
	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm0,%ymm17
	vmovdqa64	%ymm0,%ymm18
	vmovdqa64	%ymm0,%ymm19
	vpxorq	%ymm21,%ymm21,%ymm21		/* ymm21 = current entry number, starts at 0 */

	/*
	 * Pass 1: walk all entries; k1 is all-ones only for the entry whose
	 * number equals the first index, and only then do the accumulators
	 * take the loaded data.
	 */
.align	32
.Lloop_0:
	vpcmpq	$0,%ymm21,%ymm22,%k1		/* k1 = (first index == entry number) */
	vmovdqu64	0(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm0,%ymm0{%k1}
	vmovdqu64	32(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm1,%ymm1{%k1}
	vmovdqu64	64(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm2,%ymm2{%k1}
	vmovdqu64	96(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm3,%ymm3{%k1}
	vmovdqu64	128(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm4,%ymm4{%k1}
	vmovdqu64	160(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm5,%ymm5{%k1}
	vmovdqu64	192(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm16,%ymm16{%k1}
	vmovdqu64	224(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm17,%ymm17{%k1}
	vmovdqu64	256(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm18,%ymm18{%k1}
	vmovdqu64	288(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm19,%ymm19{%k1}
	vpaddq	%ymm24,%ymm21,%ymm21		/* entry number += 1 */
	addq	$640,%rsi			/* advance to the next entry */
	cmpq	%rsi,%rax
	jne	.Lloop_0

	/* Write the first selected half to out[0..319]. */
	vmovdqu64	%ymm0,0(%rdi)
	vmovdqu64	%ymm1,32(%rdi)
	vmovdqu64	%ymm2,64(%rdi)
	vmovdqu64	%ymm3,96(%rdi)
	vmovdqu64	%ymm4,128(%rdi)
	vmovdqu64	%ymm5,160(%rdi)
	vmovdqu64	%ymm16,192(%rdi)
	vmovdqu64	%ymm17,224(%rdi)
	vmovdqu64	%ymm18,256(%rdi)
	vmovdqu64	%ymm19,288(%rdi)

	/* Rewind to the table base and restart the entry counter. */
	movq	%r10,%rsi
	vpxorq	%ymm21,%ymm21,%ymm21

	/*
	 * Pass 2: same walk, comparing against the second index and reading
	 * bytes 320..639 of each entry.  The accumulators still hold the
	 * pass-1 result and are overwritten by the matching entry.
	 */
.align	32
.Lloop_320:
	vpcmpq	$0,%ymm21,%ymm23,%k1		/* k1 = (second index == entry number) */
	vmovdqu64	320(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm0,%ymm0{%k1}
	vmovdqu64	352(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm1,%ymm1{%k1}
	vmovdqu64	384(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm2,%ymm2{%k1}
	vmovdqu64	416(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm3,%ymm3{%k1}
	vmovdqu64	448(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm4,%ymm4{%k1}
	vmovdqu64	480(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm5,%ymm5{%k1}
	vmovdqu64	512(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm16,%ymm16{%k1}
	vmovdqu64	544(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm17,%ymm17{%k1}
	vmovdqu64	576(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm18,%ymm18{%k1}
	vmovdqu64	608(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm19,%ymm19{%k1}
	vpaddq	%ymm24,%ymm21,%ymm21		/* entry number += 1 */
	addq	$640,%rsi			/* advance to the next entry */
	cmpq	%rsi,%rax
	jne	.Lloop_320

	/* Write the second selected half to out[320..639]. */
	vmovdqu64	%ymm0,320(%rdi)
	vmovdqu64	%ymm1,352(%rdi)
	vmovdqu64	%ymm2,384(%rdi)
	vmovdqu64	%ymm3,416(%rdi)
	vmovdqu64	%ymm4,448(%rdi)
	vmovdqu64	%ymm5,480(%rdi)
	vmovdqu64	%ymm16,512(%rdi)
	vmovdqu64	%ymm17,544(%rdi)
	vmovdqu64	%ymm18,576(%rdi)
	vmovdqu64	%ymm19,608(%rdi)

	/*
	 * The 256-bit writes to ymm0-ymm5 above leave their upper halves
	 * dirty.  Clear them before returning, as the
	 * ossl_rsaz_amm52x40_x2_ifma256 epilogue in this file does, so that
	 * legacy SSE code in the caller does not pay AVX/SSE transition
	 * penalties.  All results are already stored at this point.
	 */
	vzeroupper

	.byte	0xf3,0xc3			/* rep ret */
.cfi_endproc
.size	ossl_extract_multiplier_2x40_win5, .-ossl_extract_multiplier_2x40_win5
/* Read-only 32-byte-aligned vector constants, four 64-bit lanes each. */
.section	.rodata
.align	32
/*
 * All lanes 1: loaded by ossl_extract_multiplier_2x40_win5 as the per-lane
 * increment for its table-entry counter.
 */
.Lones:
.quad	1,1,1,1
/* All lanes 0. */
.Lzeros:
.quad	0,0,0,0
/*
 * ELF .note.gnu.property note.  Layout: name size, descriptor size, note
 * type, the name "GNU\0", then one property record.  Per the x86-64 psABI
 * the record below is GNU_PROPERTY_X86_FEATURE_1_AND with value 3
 * (IBT | SHSTK), marking this object as compatible with CET.
 */
	.section ".note.gnu.property", "a"
	.p2align 3
	.long 1f - 0f			/* name size: bytes between labels 0 and 1 */
	.long 4f - 1f			/* descriptor size: bytes between labels 1 and 4 */
	.long 5				/* note type (NT_GNU_PROPERTY_TYPE_0) */
0:
	# "GNU" encoded with .byte, since .asciz isn't supported
	# on Solaris.
	.byte 0x47
	.byte 0x4e
	.byte 0x55
	.byte 0
1:
	.p2align 3
	.long 0xc0000002		/* property type (GNU_PROPERTY_X86_FEATURE_1_AND) */
	.long 3f - 2f			/* property data size: bytes between labels 2 and 3 */
2:
	.long 3				/* property data: feature bits 0 and 1 set */
3:
	.p2align 3
4:
1376