/* xref: /freebsd/sys/crypto/openssl/amd64/rsaz-2k-avx512.S (revision 4757b351ea9d59d71d4a38b82506d2d16fcd560d) */
/* Do not modify. This file is auto-generated from rsaz-2k-avx512.pl. */

# int ossl_rsaz_avx512ifma_eligible(void)
# Returns non-zero iff OPENSSL_ia32cap_P advertises every CPU capability bit
# in mask 2149777408 (0x80220800) of capability word 2 (leaf-7 features as
# recorded by OPENSSL_cpuid_setup).  The mask is emitted by the generator
# script rsaz-2k-avx512.pl and gates the AVX-512/IFMA routines below
# (presumably AVX512F/DQ/IFMA/VL — TODO confirm against OPENSSL_ia32cap(3)).
# Clobbers: eax, ecx, flags.  Constant-time; no stack use (leaf).
.globl	ossl_rsaz_avx512ifma_eligible
.type	ossl_rsaz_avx512ifma_eligible,@function
.align	32
ossl_rsaz_avx512ifma_eligible:
	movl	OPENSSL_ia32cap_P+8(%rip),%ecx	# capability word 2
	xorl	%eax,%eax			# default return value: 0 (not eligible)
	andl	$2149777408,%ecx		# keep only the required feature bits
	cmpl	$2149777408,%ecx		# all of them present?
	cmovel	%ecx,%eax			# yes: return the non-zero mask
	.byte	0xf3,0xc3			# rep ret (legacy ret-predictor-friendly return)
.size	ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible
.text

# void ossl_rsaz_amm52x20_x1_ifma256(uint64_t *res, const uint64_t *a,
#                                    const uint64_t *b, const uint64_t *m,
#                                    uint64_t k0)
# ABI: SysV AMD64 — rdi=res, rsi=a, rdx=b, rcx=m, r8=k0.
# One almost-Montgomery multiplication over 20 limbs of 52 bits each
# (20*52 = 1040 bits, i.e. an RSA-2K half), using AVX-512 IFMA
# (vpmadd52luq/vpmadd52huq) on 256-bit vectors — per the generator
# rsaz-2k-avx512.pl; see that script for the algorithm derivation.
# Accumulator layout: ymm3,ymm16..ymm19 hold the 20 result limbs
# (4 limbs per ymm); r9 accumulates the scalar carry of limb 0;
# rax holds the 52-bit limb mask 2^52-1.  The main loop runs 5
# iterations, each unrolled 4x (one unroll per b limb), consuming
# 4 b-limbs (32 bytes) per iteration.
.globl	ossl_rsaz_amm52x20_x1_ifma256
.type	ossl_rsaz_amm52x20_x1_ifma256,@function
.align	32
ossl_rsaz_amm52x20_x1_ifma256:
.cfi_startproc
.byte	243,15,30,250			# endbr64 (CET/IBT landing pad)
	pushq	%rbx			# save all callee-saved GPRs used below
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lossl_rsaz_amm52x20_x1_ifma256_body:

	# Zero the 20-limb accumulator (ymm3, ymm16..ymm19) and keep ymm0 = 0
	# as the shift-in source for the valignq lane rotations below.
	vpxord	%ymm0,%ymm0,%ymm0
	vmovdqa64	%ymm0,%ymm3
	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm0,%ymm17
	vmovdqa64	%ymm0,%ymm18
	vmovdqa64	%ymm0,%ymm19

	xorl	%r9d,%r9d		# r9 = scalar carry accumulator = 0

	movq	%rdx,%r11		# r11 = b (rdx is clobbered by mulx below)
	movq	$0xfffffffffffff,%rax	# rax = 2^52 - 1, the radix-52 limb mask

	movl	$5,%ebx			# 5 outer iterations x 4 unrolled = 20 b-limbs

.align	32
.Lloop5:
	# ---- unroll 0: multiply-accumulate with b[0] of this group ----
	movq	0(%r11),%r13		# r13 = current b limb

	vpbroadcastq	%r13,%ymm1	# ymm1 = b limb in all lanes
	movq	0(%rsi),%rdx		# scalar low product: a[0] * b-limb
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9		# fold low half into carry acc
	movq	%r12,%r10
	adcq	$0,%r10			# r10 = high half (+ carry)

	movq	%r8,%r13		# quotient digit: k0 * acc mod 2^52
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm2	# ymm2 = quotient digit in all lanes
	movq	0(%rcx),%rdx		# scalar m[0] * digit, folded into acc
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9			# shift acc right by one radix-52 digit
	salq	$12,%r10
	orq	%r10,%r9

	# Vector low-half products: acc += a[] * b-limb (52-bit lanes)
	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm16
	vpmadd52luq	64(%rsi),%ymm1,%ymm17
	vpmadd52luq	96(%rsi),%ymm1,%ymm18
	vpmadd52luq	128(%rsi),%ymm1,%ymm19

	# acc += m[] * quotient digit (low halves)
	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm16
	vpmadd52luq	64(%rcx),%ymm2,%ymm17
	vpmadd52luq	96(%rcx),%ymm2,%ymm18
	vpmadd52luq	128(%rcx),%ymm2,%ymm19

	# Shift the whole 20-limb accumulator down one limb (valignq chain),
	# zero shifted in at the top from ymm0.
	valignq	$1,%ymm3,%ymm16,%ymm3
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19

	vmovq	%xmm3,%r13		# fold shifted-out limb 0 into scalar acc
	addq	%r13,%r9

	# High-half products for the same b-limb / quotient digit
	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm16
	vpmadd52huq	64(%rsi),%ymm1,%ymm17
	vpmadd52huq	96(%rsi),%ymm1,%ymm18
	vpmadd52huq	128(%rsi),%ymm1,%ymm19

	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm16
	vpmadd52huq	64(%rcx),%ymm2,%ymm17
	vpmadd52huq	96(%rcx),%ymm2,%ymm18
	vpmadd52huq	128(%rcx),%ymm2,%ymm19
	# ---- unroll 1: same sequence with b[1] of this group ----
	movq	8(%r11),%r13

	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm16
	vpmadd52luq	64(%rsi),%ymm1,%ymm17
	vpmadd52luq	96(%rsi),%ymm1,%ymm18
	vpmadd52luq	128(%rsi),%ymm1,%ymm19

	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm16
	vpmadd52luq	64(%rcx),%ymm2,%ymm17
	vpmadd52luq	96(%rcx),%ymm2,%ymm18
	vpmadd52luq	128(%rcx),%ymm2,%ymm19

	valignq	$1,%ymm3,%ymm16,%ymm3
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19

	vmovq	%xmm3,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm16
	vpmadd52huq	64(%rsi),%ymm1,%ymm17
	vpmadd52huq	96(%rsi),%ymm1,%ymm18
	vpmadd52huq	128(%rsi),%ymm1,%ymm19

	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm16
	vpmadd52huq	64(%rcx),%ymm2,%ymm17
	vpmadd52huq	96(%rcx),%ymm2,%ymm18
	vpmadd52huq	128(%rcx),%ymm2,%ymm19
	# ---- unroll 2: same sequence with b[2] of this group ----
	movq	16(%r11),%r13

	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm16
	vpmadd52luq	64(%rsi),%ymm1,%ymm17
	vpmadd52luq	96(%rsi),%ymm1,%ymm18
	vpmadd52luq	128(%rsi),%ymm1,%ymm19

	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm16
	vpmadd52luq	64(%rcx),%ymm2,%ymm17
	vpmadd52luq	96(%rcx),%ymm2,%ymm18
	vpmadd52luq	128(%rcx),%ymm2,%ymm19

	valignq	$1,%ymm3,%ymm16,%ymm3
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19

	vmovq	%xmm3,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm16
	vpmadd52huq	64(%rsi),%ymm1,%ymm17
	vpmadd52huq	96(%rsi),%ymm1,%ymm18
	vpmadd52huq	128(%rsi),%ymm1,%ymm19

	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm16
	vpmadd52huq	64(%rcx),%ymm2,%ymm17
	vpmadd52huq	96(%rcx),%ymm2,%ymm18
	vpmadd52huq	128(%rcx),%ymm2,%ymm19
	# ---- unroll 3: same sequence with b[3] of this group ----
	movq	24(%r11),%r13

	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	%r8,%r13
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm16
	vpmadd52luq	64(%rsi),%ymm1,%ymm17
	vpmadd52luq	96(%rsi),%ymm1,%ymm18
	vpmadd52luq	128(%rsi),%ymm1,%ymm19

	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm16
	vpmadd52luq	64(%rcx),%ymm2,%ymm17
	vpmadd52luq	96(%rcx),%ymm2,%ymm18
	vpmadd52luq	128(%rcx),%ymm2,%ymm19

	valignq	$1,%ymm3,%ymm16,%ymm3
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19

	vmovq	%xmm3,%r13
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm16
	vpmadd52huq	64(%rsi),%ymm1,%ymm17
	vpmadd52huq	96(%rsi),%ymm1,%ymm18
	vpmadd52huq	128(%rsi),%ymm1,%ymm19

	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm16
	vpmadd52huq	64(%rcx),%ymm2,%ymm17
	vpmadd52huq	96(%rcx),%ymm2,%ymm18
	vpmadd52huq	128(%rcx),%ymm2,%ymm19
	leaq	32(%r11),%r11		# advance b by the 4 limbs just consumed
	decl	%ebx
	jne	.Lloop5

	# Merge the scalar carry accumulator back into limb 0 of the vector
	# accumulator (vpblendd $3 selects the low qword from ymm0).
	vpbroadcastq	%r9,%ymm0
	vpblendd	$3,%ymm0,%ymm3,%ymm3

	# Normalization step 1: extract each lane's overflow above 52 bits...
	vpsrlq	$52,%ymm3,%ymm0
	vpsrlq	$52,%ymm16,%ymm1
	vpsrlq	$52,%ymm17,%ymm2
	vpsrlq	$52,%ymm18,%ymm25
	vpsrlq	$52,%ymm19,%ymm26

	# ...shift the carries up one limb position across the register chain...
	valignq	$3,%ymm25,%ymm26,%ymm26
	valignq	$3,%ymm2,%ymm25,%ymm25
	valignq	$3,%ymm1,%ymm2,%ymm2
	valignq	$3,%ymm0,%ymm1,%ymm1
	valignq	$3,.Lzeros(%rip),%ymm0,%ymm0

	# ...truncate each limb to 52 bits...
	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
	vpandq	.Lmask52x4(%rip),%ymm16,%ymm16
	vpandq	.Lmask52x4(%rip),%ymm17,%ymm17
	vpandq	.Lmask52x4(%rip),%ymm18,%ymm18
	vpandq	.Lmask52x4(%rip),%ymm19,%ymm19

	# ...and add the propagated carries.  Limbs may now exceed 2^52-1 by 1.
	vpaddq	%ymm0,%ymm3,%ymm3
	vpaddq	%ymm1,%ymm16,%ymm16
	vpaddq	%ymm2,%ymm17,%ymm17
	vpaddq	%ymm25,%ymm18,%ymm18
	vpaddq	%ymm26,%ymm19,%ymm19

	# Normalization step 2 (branchless ripple-carry): build per-limb
	# "overflowed" ($6 = unsigned greater-than mask) ...
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm3,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm16,%k2
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm17,%k3
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm18,%k4
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm19,%k5
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	kmovb	%k3,%r12d
	kmovb	%k4,%r11d
	kmovb	%k5,%r10d

	# ...and "saturated" ($0 = equal-to-mask) bitmasks.
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm3,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm16,%k2
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm17,%k3
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm18,%k4
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm19,%k5
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	kmovb	%k3,%ebx
	kmovb	%k4,%ecx
	kmovb	%k5,%edx

	# Pack the 20 mask bits into bytes and compute which limbs must be
	# decremented: (overflow<<1 + saturated) XOR saturated propagates a
	# carry through runs of saturated limbs (classic mask-add trick).
	shlb	$4,%r13b
	orb	%r13b,%r14b
	shlb	$4,%r11b
	orb	%r11b,%r12b

	addb	%r14b,%r14b		# shift overflow bits left by one limb,
	adcb	%r12b,%r12b		# carrying across the packed bytes
	adcb	%r10b,%r10b

	shlb	$4,%r8b
	orb	%r8b,%r9b
	shlb	$4,%cl
	orb	%cl,%bl

	addb	%r9b,%r14b		# add saturated masks (ripples carries)
	adcb	%bl,%r12b
	adcb	%dl,%r10b

	xorb	%r9b,%r14b		# final per-limb borrow selector
	xorb	%bl,%r12b
	xorb	%dl,%r10b

	kmovb	%r14d,%k1		# unpack selectors back into k-masks
	shrb	$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r12d,%k3
	shrb	$4,%r12b
	kmovb	%r12d,%k4
	kmovb	%r10d,%k5

	# Selected limbs: subtract 2^52-1 (i.e. add 1 and clear), then mask.
	vpsubq	.Lmask52x4(%rip),%ymm3,%ymm3{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm16,%ymm16{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm17,%ymm17{%k3}
	vpsubq	.Lmask52x4(%rip),%ymm18,%ymm18{%k4}
	vpsubq	.Lmask52x4(%rip),%ymm19,%ymm19{%k5}

	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
	vpandq	.Lmask52x4(%rip),%ymm16,%ymm16
	vpandq	.Lmask52x4(%rip),%ymm17,%ymm17
	vpandq	.Lmask52x4(%rip),%ymm18,%ymm18
	vpandq	.Lmask52x4(%rip),%ymm19,%ymm19

	# Store the 20 normalized limbs to res.
	vmovdqu64	%ymm3,0(%rdi)
	vmovdqu64	%ymm16,32(%rdi)
	vmovdqu64	%ymm17,64(%rdi)
	vmovdqu64	%ymm18,96(%rdi)
	vmovdqu64	%ymm19,128(%rdi)

	vzeroupper			# required before returning to SSE/C code
	movq	0(%rsp),%r15		# restore callee-saved registers
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lossl_rsaz_amm52x20_x1_ifma256_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	ossl_rsaz_amm52x20_x1_ifma256, .-ossl_rsaz_amm52x20_x1_ifma256
.section	.rodata
.align	32
# Four copies of 2^52-1: the per-lane radix-52 limb mask used by the
# vpandq/vpcmpuq/vpsubq normalization code in the amm52x20 routines.
.Lmask52x4:
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.quad	0xfffffffffffff
.text

# void ossl_rsaz_amm52x20_x2_ifma256(uint64_t *out, const uint64_t *a,
#                                    const uint64_t *b, const uint64_t *m,
#                                    const uint64_t k0[2])
# ABI: SysV AMD64 — rdi=out, rsi=a, rdx=b, rcx=m, r8=&k0[2].
# Two independent 20x52-bit almost-Montgomery multiplications interleaved
# in one pass (e.g. the p and q halves of an RSA-2K CRT exponentiation —
# per the generator rsaz-2k-avx512.pl).  The first operand set lives at
# byte offsets 0..159 of a/b/m with k0[0] (accumulator: r9, limbs in
# ymm3,ymm16..19); the second at offsets 160..319 with k0[1]
# (accumulator: r15, limbs in ymm4,ymm20..23).  Unlike the x1 variant,
# the loop runs 20 iterations, one b-limb of each half per iteration.
.globl	ossl_rsaz_amm52x20_x2_ifma256
.type	ossl_rsaz_amm52x20_x2_ifma256,@function
.align	32
ossl_rsaz_amm52x20_x2_ifma256:
.cfi_startproc
.byte	243,15,30,250			# endbr64 (CET/IBT landing pad)
	pushq	%rbx			# save callee-saved GPRs
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lossl_rsaz_amm52x20_x2_ifma256_body:

	# Zero both 20-limb accumulators; ymm0 stays zero for the valignq chain.
	vpxord	%ymm0,%ymm0,%ymm0
	vmovdqa64	%ymm0,%ymm3
	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm0,%ymm17
	vmovdqa64	%ymm0,%ymm18
	vmovdqa64	%ymm0,%ymm19
	vmovdqa64	%ymm0,%ymm4
	vmovdqa64	%ymm0,%ymm20
	vmovdqa64	%ymm0,%ymm21
	vmovdqa64	%ymm0,%ymm22
	vmovdqa64	%ymm0,%ymm23

	xorl	%r9d,%r9d		# scalar carry acc, first half
	xorl	%r15d,%r15d		# scalar carry acc, second half

	movq	%rdx,%r11		# r11 = b (rdx is clobbered by mulx)
	movq	$0xfffffffffffff,%rax	# rax = 2^52 - 1

	movl	$20,%ebx		# one b-limb of each half per iteration

.align	32
.Lloop20:
	# ---- first half: b-limb at 0(%r11), k0[0], acc r9/ymm3,16..19 ----
	movq	0(%r11),%r13

	vpbroadcastq	%r13,%ymm1
	movq	0(%rsi),%rdx
	mulxq	%r13,%r13,%r12		# scalar a[0]*b-limb
	addq	%r13,%r9
	movq	%r12,%r10
	adcq	$0,%r10

	movq	(%r8),%r13		# quotient digit = k0[0]*acc mod 2^52
	imulq	%r9,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm2
	movq	0(%rcx),%rdx
	mulxq	%r13,%r13,%r12		# scalar m[0]*digit
	addq	%r13,%r9
	adcq	%r12,%r10

	shrq	$52,%r9			# shift acc down one radix-52 digit
	salq	$12,%r10
	orq	%r10,%r9

	vpmadd52luq	0(%rsi),%ymm1,%ymm3
	vpmadd52luq	32(%rsi),%ymm1,%ymm16
	vpmadd52luq	64(%rsi),%ymm1,%ymm17
	vpmadd52luq	96(%rsi),%ymm1,%ymm18
	vpmadd52luq	128(%rsi),%ymm1,%ymm19

	vpmadd52luq	0(%rcx),%ymm2,%ymm3
	vpmadd52luq	32(%rcx),%ymm2,%ymm16
	vpmadd52luq	64(%rcx),%ymm2,%ymm17
	vpmadd52luq	96(%rcx),%ymm2,%ymm18
	vpmadd52luq	128(%rcx),%ymm2,%ymm19

	# Rotate accumulator down one limb; zero enters at the top.
	valignq	$1,%ymm3,%ymm16,%ymm3
	valignq	$1,%ymm16,%ymm17,%ymm16
	valignq	$1,%ymm17,%ymm18,%ymm17
	valignq	$1,%ymm18,%ymm19,%ymm18
	valignq	$1,%ymm19,%ymm0,%ymm19

	vmovq	%xmm3,%r13		# fold shifted-out limb into scalar acc
	addq	%r13,%r9

	vpmadd52huq	0(%rsi),%ymm1,%ymm3
	vpmadd52huq	32(%rsi),%ymm1,%ymm16
	vpmadd52huq	64(%rsi),%ymm1,%ymm17
	vpmadd52huq	96(%rsi),%ymm1,%ymm18
	vpmadd52huq	128(%rsi),%ymm1,%ymm19

	vpmadd52huq	0(%rcx),%ymm2,%ymm3
	vpmadd52huq	32(%rcx),%ymm2,%ymm16
	vpmadd52huq	64(%rcx),%ymm2,%ymm17
	vpmadd52huq	96(%rcx),%ymm2,%ymm18
	vpmadd52huq	128(%rcx),%ymm2,%ymm19
	# ---- second half: b-limb at 160(%r11), k0[1], acc r15/ymm4,20..23 ----
	movq	160(%r11),%r13

	vpbroadcastq	%r13,%ymm1
	movq	160(%rsi),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r15
	movq	%r12,%r10
	adcq	$0,%r10

	movq	8(%r8),%r13		# quotient digit uses k0[1]
	imulq	%r15,%r13
	andq	%rax,%r13

	vpbroadcastq	%r13,%ymm2
	movq	160(%rcx),%rdx
	mulxq	%r13,%r13,%r12
	addq	%r13,%r15
	adcq	%r12,%r10

	shrq	$52,%r15
	salq	$12,%r10
	orq	%r10,%r15

	vpmadd52luq	160(%rsi),%ymm1,%ymm4
	vpmadd52luq	192(%rsi),%ymm1,%ymm20
	vpmadd52luq	224(%rsi),%ymm1,%ymm21
	vpmadd52luq	256(%rsi),%ymm1,%ymm22
	vpmadd52luq	288(%rsi),%ymm1,%ymm23

	vpmadd52luq	160(%rcx),%ymm2,%ymm4
	vpmadd52luq	192(%rcx),%ymm2,%ymm20
	vpmadd52luq	224(%rcx),%ymm2,%ymm21
	vpmadd52luq	256(%rcx),%ymm2,%ymm22
	vpmadd52luq	288(%rcx),%ymm2,%ymm23

	valignq	$1,%ymm4,%ymm20,%ymm4
	valignq	$1,%ymm20,%ymm21,%ymm20
	valignq	$1,%ymm21,%ymm22,%ymm21
	valignq	$1,%ymm22,%ymm23,%ymm22
	valignq	$1,%ymm23,%ymm0,%ymm23

	vmovq	%xmm4,%r13
	addq	%r13,%r15

	vpmadd52huq	160(%rsi),%ymm1,%ymm4
	vpmadd52huq	192(%rsi),%ymm1,%ymm20
	vpmadd52huq	224(%rsi),%ymm1,%ymm21
	vpmadd52huq	256(%rsi),%ymm1,%ymm22
	vpmadd52huq	288(%rsi),%ymm1,%ymm23

	vpmadd52huq	160(%rcx),%ymm2,%ymm4
	vpmadd52huq	192(%rcx),%ymm2,%ymm20
	vpmadd52huq	224(%rcx),%ymm2,%ymm21
	vpmadd52huq	256(%rcx),%ymm2,%ymm22
	vpmadd52huq	288(%rcx),%ymm2,%ymm23
	leaq	8(%r11),%r11		# next b-limb (both halves step together)
	decl	%ebx
	jne	.Lloop20

	# ---- normalize first half (same sequence as the x1 routine) ----
	vpbroadcastq	%r9,%ymm0	# merge scalar carry into limb 0
	vpblendd	$3,%ymm0,%ymm3,%ymm3

	vpsrlq	$52,%ymm3,%ymm0		# per-lane carries above 52 bits
	vpsrlq	$52,%ymm16,%ymm1
	vpsrlq	$52,%ymm17,%ymm2
	vpsrlq	$52,%ymm18,%ymm25
	vpsrlq	$52,%ymm19,%ymm26

	valignq	$3,%ymm25,%ymm26,%ymm26	# shift carries up one limb
	valignq	$3,%ymm2,%ymm25,%ymm25
	valignq	$3,%ymm1,%ymm2,%ymm2
	valignq	$3,%ymm0,%ymm1,%ymm1
	valignq	$3,.Lzeros(%rip),%ymm0,%ymm0

	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3	# truncate limbs to 52 bits
	vpandq	.Lmask52x4(%rip),%ymm16,%ymm16
	vpandq	.Lmask52x4(%rip),%ymm17,%ymm17
	vpandq	.Lmask52x4(%rip),%ymm18,%ymm18
	vpandq	.Lmask52x4(%rip),%ymm19,%ymm19

	vpaddq	%ymm0,%ymm3,%ymm3	# add propagated carries
	vpaddq	%ymm1,%ymm16,%ymm16
	vpaddq	%ymm2,%ymm17,%ymm17
	vpaddq	%ymm25,%ymm18,%ymm18
	vpaddq	%ymm26,%ymm19,%ymm19

	# Branchless ripple-carry fixup (see x1 routine for the trick).
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm3,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm16,%k2
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm17,%k3
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm18,%k4
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm19,%k5
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	kmovb	%k3,%r12d
	kmovb	%k4,%r11d
	kmovb	%k5,%r10d

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm3,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm16,%k2
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm17,%k3
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm18,%k4
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm19,%k5
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	kmovb	%k3,%ebx
	kmovb	%k4,%ecx
	kmovb	%k5,%edx

	shlb	$4,%r13b
	orb	%r13b,%r14b
	shlb	$4,%r11b
	orb	%r11b,%r12b

	addb	%r14b,%r14b
	adcb	%r12b,%r12b
	adcb	%r10b,%r10b

	shlb	$4,%r8b
	orb	%r8b,%r9b
	shlb	$4,%cl
	orb	%cl,%bl

	addb	%r9b,%r14b
	adcb	%bl,%r12b
	adcb	%dl,%r10b

	xorb	%r9b,%r14b
	xorb	%bl,%r12b
	xorb	%dl,%r10b

	kmovb	%r14d,%k1
	shrb	$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r12d,%k3
	shrb	$4,%r12b
	kmovb	%r12d,%k4
	kmovb	%r10d,%k5

	vpsubq	.Lmask52x4(%rip),%ymm3,%ymm3{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm16,%ymm16{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm17,%ymm17{%k3}
	vpsubq	.Lmask52x4(%rip),%ymm18,%ymm18{%k4}
	vpsubq	.Lmask52x4(%rip),%ymm19,%ymm19{%k5}

	vpandq	.Lmask52x4(%rip),%ymm3,%ymm3
	vpandq	.Lmask52x4(%rip),%ymm16,%ymm16
	vpandq	.Lmask52x4(%rip),%ymm17,%ymm17
	vpandq	.Lmask52x4(%rip),%ymm18,%ymm18
	vpandq	.Lmask52x4(%rip),%ymm19,%ymm19

	# ---- normalize second half (ymm4,ymm20..23 with carry r15) ----
	vpbroadcastq	%r15,%ymm0
	vpblendd	$3,%ymm0,%ymm4,%ymm4

	vpsrlq	$52,%ymm4,%ymm0
	vpsrlq	$52,%ymm20,%ymm1
	vpsrlq	$52,%ymm21,%ymm2
	vpsrlq	$52,%ymm22,%ymm25
	vpsrlq	$52,%ymm23,%ymm26

	valignq	$3,%ymm25,%ymm26,%ymm26
	valignq	$3,%ymm2,%ymm25,%ymm25
	valignq	$3,%ymm1,%ymm2,%ymm2
	valignq	$3,%ymm0,%ymm1,%ymm1
	valignq	$3,.Lzeros(%rip),%ymm0,%ymm0

	vpandq	.Lmask52x4(%rip),%ymm4,%ymm4
	vpandq	.Lmask52x4(%rip),%ymm20,%ymm20
	vpandq	.Lmask52x4(%rip),%ymm21,%ymm21
	vpandq	.Lmask52x4(%rip),%ymm22,%ymm22
	vpandq	.Lmask52x4(%rip),%ymm23,%ymm23

	vpaddq	%ymm0,%ymm4,%ymm4
	vpaddq	%ymm1,%ymm20,%ymm20
	vpaddq	%ymm2,%ymm21,%ymm21
	vpaddq	%ymm25,%ymm22,%ymm22
	vpaddq	%ymm26,%ymm23,%ymm23

	vpcmpuq	$6,.Lmask52x4(%rip),%ymm4,%k1
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm20,%k2
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm21,%k3
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm22,%k4
	vpcmpuq	$6,.Lmask52x4(%rip),%ymm23,%k5
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	kmovb	%k3,%r12d
	kmovb	%k4,%r11d
	kmovb	%k5,%r10d

	vpcmpuq	$0,.Lmask52x4(%rip),%ymm4,%k1
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm20,%k2
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm21,%k3
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm22,%k4
	vpcmpuq	$0,.Lmask52x4(%rip),%ymm23,%k5
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	kmovb	%k3,%ebx
	kmovb	%k4,%ecx
	kmovb	%k5,%edx

	shlb	$4,%r13b
	orb	%r13b,%r14b
	shlb	$4,%r11b
	orb	%r11b,%r12b

	addb	%r14b,%r14b
	adcb	%r12b,%r12b
	adcb	%r10b,%r10b

	shlb	$4,%r8b
	orb	%r8b,%r9b
	shlb	$4,%cl
	orb	%cl,%bl

	addb	%r9b,%r14b
	adcb	%bl,%r12b
	adcb	%dl,%r10b

	xorb	%r9b,%r14b
	xorb	%bl,%r12b
	xorb	%dl,%r10b

	kmovb	%r14d,%k1
	shrb	$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r12d,%k3
	shrb	$4,%r12b
	kmovb	%r12d,%k4
	kmovb	%r10d,%k5

	vpsubq	.Lmask52x4(%rip),%ymm4,%ymm4{%k1}
	vpsubq	.Lmask52x4(%rip),%ymm20,%ymm20{%k2}
	vpsubq	.Lmask52x4(%rip),%ymm21,%ymm21{%k3}
	vpsubq	.Lmask52x4(%rip),%ymm22,%ymm22{%k4}
	vpsubq	.Lmask52x4(%rip),%ymm23,%ymm23{%k5}

	vpandq	.Lmask52x4(%rip),%ymm4,%ymm4
	vpandq	.Lmask52x4(%rip),%ymm20,%ymm20
	vpandq	.Lmask52x4(%rip),%ymm21,%ymm21
	vpandq	.Lmask52x4(%rip),%ymm22,%ymm22
	vpandq	.Lmask52x4(%rip),%ymm23,%ymm23

	# Store both 20-limb results: first half at out[0..159],
	# second half at out[160..319].
	vmovdqu64	%ymm3,0(%rdi)
	vmovdqu64	%ymm16,32(%rdi)
	vmovdqu64	%ymm17,64(%rdi)
	vmovdqu64	%ymm18,96(%rdi)
	vmovdqu64	%ymm19,128(%rdi)

	vmovdqu64	%ymm4,160(%rdi)
	vmovdqu64	%ymm20,192(%rdi)
	vmovdqu64	%ymm21,224(%rdi)
	vmovdqu64	%ymm22,256(%rdi)
	vmovdqu64	%ymm23,288(%rdi)

	vzeroupper			# required before returning to SSE/C code
	movq	0(%rsp),%r15		# restore callee-saved registers
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lossl_rsaz_amm52x20_x2_ifma256_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	ossl_rsaz_amm52x20_x2_ifma256, .-ossl_rsaz_amm52x20_x2_ifma256
.text

# void ossl_extract_multiplier_2x20_win5(uint64_t *out, const uint64_t *table,
#                                        uint64_t idx1, uint64_t idx2)
# ABI: SysV AMD64 — rdi=out, rsi=table, rdx=idx1, rcx=idx2.
# Constant-time gather of two 20-limb multipliers from a window-5
# precomputation table: the loop touches all 10240/320 = 32 table entries
# (each entry = 2 x 20 qword limbs = 320 bytes) and uses masked blends,
# so memory access pattern is independent of the secret indices.
# Entry i's first half is selected into ymm0..5's k1-guarded lanes when
# i == idx1, its second half via k2 when i == idx2.
# Leaf function: no stack use; clobbers rax, rsi, k1/k2 and many ymm regs.
.align	32
.globl	ossl_extract_multiplier_2x20_win5
.type	ossl_extract_multiplier_2x20_win5,@function
ossl_extract_multiplier_2x20_win5:
.cfi_startproc
.byte	243,15,30,250			# endbr64 (CET/IBT landing pad)
	vmovdqa64	.Lones(%rip),%ymm24	# per-loop counter increment (1,1,1,1)
	vpbroadcastq	%rdx,%ymm22	# idx1 in all lanes
	vpbroadcastq	%rcx,%ymm23	# idx2 in all lanes
	leaq	10240(%rsi),%rax	# end of table (32 entries x 320 bytes)

	# Zero the running counter (ymm21) and the ten output accumulators.
	vpxor	%xmm0,%xmm0,%xmm0
	vmovdqa64	%ymm0,%ymm21
	vmovdqa64	%ymm0,%ymm1
	vmovdqa64	%ymm0,%ymm2
	vmovdqa64	%ymm0,%ymm3
	vmovdqa64	%ymm0,%ymm4
	vmovdqa64	%ymm0,%ymm5
	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm0,%ymm17
	vmovdqa64	%ymm0,%ymm18
	vmovdqa64	%ymm0,%ymm19

.align	32
.Lloop:
	vpcmpq	$0,%ymm21,%ymm22,%k1	# k1 = (entry index == idx1)
	vpcmpq	$0,%ymm21,%ymm23,%k2	# k2 = (entry index == idx2)
	# First half of the entry (limbs 0..19) selected under k1 ...
	vmovdqu64	0(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm0,%ymm0{%k1}
	vmovdqu64	32(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm1,%ymm1{%k1}
	vmovdqu64	64(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm2,%ymm2{%k1}
	vmovdqu64	96(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm3,%ymm3{%k1}
	vmovdqu64	128(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm4,%ymm4{%k1}
	# ... second half (limbs 20..39) selected under k2.
	vmovdqu64	160(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm5,%ymm5{%k2}
	vmovdqu64	192(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm16,%ymm16{%k2}
	vmovdqu64	224(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm17,%ymm17{%k2}
	vmovdqu64	256(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm18,%ymm18{%k2}
	vmovdqu64	288(%rsi),%ymm20
	vpblendmq	%ymm20,%ymm19,%ymm19{%k2}
	vpaddq	%ymm24,%ymm21,%ymm21	# entry index += 1
	addq	$320,%rsi		# next table entry
	cmpq	%rsi,%rax
	jne	.Lloop
	# Write the two gathered 20-limb multipliers.
	vmovdqu64	%ymm0,0(%rdi)
	vmovdqu64	%ymm1,32(%rdi)
	vmovdqu64	%ymm2,64(%rdi)
	vmovdqu64	%ymm3,96(%rdi)
	vmovdqu64	%ymm4,128(%rdi)
	vmovdqu64	%ymm5,160(%rdi)
	vmovdqu64	%ymm16,192(%rdi)
	vmovdqu64	%ymm17,224(%rdi)
	vmovdqu64	%ymm18,256(%rdi)
	vmovdqu64	%ymm19,288(%rdi)
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	ossl_extract_multiplier_2x20_win5, .-ossl_extract_multiplier_2x20_win5
.section	.rodata
.align	32
# Vector of four 1s: counter increment for the constant-time table scan.
.Lones:
.quad	1,1,1,1
# Vector of four 0s: zero source for valignq carry-propagation shifts.
.Lzeros:
.quad	0,0,0,0
	# ELF .note.gnu.property marking this object as CET-compatible:
	# note type 0xc0000002 (GNU_PROPERTY_X86_FEATURE_1_AND) with value 3
	# (IBT | SHSTK) — consistent with the endbr64 landing pads
	# (.byte 243,15,30,250) at each function entry above.
	.section ".note.gnu.property", "a"
	.p2align 3
	.long 1f - 0f			# name size
	.long 4f - 1f			# descriptor size
	.long 5				# note type: NT_GNU_PROPERTY_TYPE_0
0:
	# "GNU" encoded with .byte, since .asciz isn't supported
	# on Solaris.
	.byte 0x47
	.byte 0x4e
	.byte 0x55
	.byte 0
1:
	.p2align 3
	.long 0xc0000002		# GNU_PROPERTY_X86_FEATURE_1_AND
	.long 3f - 2f			# property data size
2:
	.long 3				# IBT | SHSTK
3:
	.p2align 3
4:
918