/* xref: /freebsd/sys/crypto/openssl/amd64/aesni-mb-x86_64.S (revision 25fb30bd9abc492359ad1f66901a06cb8cd08370) */
/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from aesni-mb-x86_64.pl. */
.text

/*
 * aesni_multi_cbc_encrypt
 *
 * CBC-encrypts up to four independent buffers ("lanes") side by side with
 * AES-NI, one 16-byte block per lane per inner-loop iteration.
 *
 * Inputs, as read by the code below:
 *   %rdi  array of 40-byte lane descriptors, four per group:
 *           +0   input pointer   (8 bytes)
 *           +8   output pointer  (8 bytes)
 *           +16  block count     (signed 32-bit; <= 0 means the lane is idle)
 *           +24  IV              (16 bytes)
 *   %rsi  key schedule: round key 0 at +0, one round key every 16 bytes,
 *         32-bit round count at +240
 *   %edx  number of four-lane groups.  With %edx >= 2 and bit 28 of the
 *         dword at OPENSSL_ia32cap_P+4 set, control is transferred to
 *         _avx_cbc_enc_shortcut instead.
 *         NOTE(review): presumably called from C as
 *         (descriptor array, key, num) - confirm against the prototype.
 *
 * Stack frame (%rsp is aligned down to 64 bytes):
 *    0(%rsp)  16-byte dummy block; finished lanes store their output here
 *   16(%rsp)  entry value of %rsp, used to find the saved registers
 *   24(%rsp)  remaining group count
 *   32(%rsp)  four 32-bit remaining-block counters, one per lane
 *
 * Register roles inside the loops:
 *   %r8-%r11    lane input pointers      %r12-%r15  lane output pointers
 *   %rbx        byte offset into every lane (advances by 16 per block)
 *   %rbp        16(%rsp) - %rbx, the redirect target for finished lanes
 *   %edx        largest block count of the group (inner-loop trip count)
 *   %eax        round count loaded from key+240
 *   %xmm2-5     CBC state per lane       %xmm6-9    next input ^ round key 0
 *   %xmm0/1     alternating round keys   %xmm12     round key 0 / zero
 *   %xmm10/11   lane counters / decrement mask
 *
 * AES instructions are emitted as raw bytes.  102,15,56,220,M is aesenc and
 * 102,15,56,221,M is aesenclast; M = 209/217/225/233 applies %xmm1 to
 * %xmm2/3/4/5 and M = 208/216/224/232 applies %xmm0 to %xmm2/3/4/5.
 *
 * %rbx, %rbp and %r12-%r15 are saved on entry and restored on exit.  No
 * store to the descriptor array appears in this routine.
 */
.globl	aesni_multi_cbc_encrypt
.type	aesni_multi_cbc_encrypt,@function
.align	32
aesni_multi_cbc_encrypt:
.cfi_startproc
	/* Fewer than two groups: always take the four-lane code below. */
	cmpl	$2,%edx
	jb	.Lenc_non_avx
	/* 268435456 == 1<<28: capability bit that selects the eight-lane code. */
	movl	OPENSSL_ia32cap_P+4(%rip),%ecx
	testl	$268435456,%ecx
	jnz	_avx_cbc_enc_shortcut
	jmp	.Lenc_non_avx
.align	16
.Lenc_non_avx:
	movq	%rsp,%rax		/* %rax = entry %rsp; saved registers live below it */
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	subq	$48,%rsp
	andq	$-64,%rsp
	movq	%rax,16(%rsp)
	/* DW_CFA_def_cfa_expression: CFA = *(%rsp + 16) + 8 */
.cfi_escape	0x0f,0x05,0x77,0x10,0x06,0x23,0x08

.Lenc4x_body:
	movdqu	(%rsi),%xmm12		/* round key 0 */
	leaq	120(%rsi),%rsi		/* key pointer is biased by +120 from here on */
	leaq	80(%rdi),%rdi		/* descriptor pointer is biased by +80 */

	/*
	 * Outer loop, once per group of four descriptors.  For every lane:
	 * load the pointers, IV and block count, track the maximum count in
	 * %edx, copy the count to the stack, and point the input of a lane
	 * whose count is <= 0 at the stack frame.
	 */
.Lenc4x_loop_grande:
	movl	%edx,24(%rsp)
	xorl	%edx,%edx
	/* lane 0 */
	movl	-64(%rdi),%ecx
	movq	-80(%rdi),%r8
	cmpl	%edx,%ecx
	movq	-72(%rdi),%r12
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	-56(%rdi),%xmm2
	movl	%ecx,32(%rsp)
	cmovleq	%rsp,%r8
	/* lane 1 */
	movl	-24(%rdi),%ecx
	movq	-40(%rdi),%r9
	cmpl	%edx,%ecx
	movq	-32(%rdi),%r13
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	-16(%rdi),%xmm3
	movl	%ecx,36(%rsp)
	cmovleq	%rsp,%r9
	/* lane 2 */
	movl	16(%rdi),%ecx
	movq	0(%rdi),%r10
	cmpl	%edx,%ecx
	movq	8(%rdi),%r14
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	24(%rdi),%xmm4
	movl	%ecx,40(%rsp)
	cmovleq	%rsp,%r10
	/* lane 3 */
	movl	56(%rdi),%ecx
	movq	40(%rdi),%r11
	cmpl	%edx,%ecx
	movq	48(%rdi),%r15
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	64(%rdi),%xmm5
	movl	%ecx,44(%rsp)
	cmovleq	%rsp,%r11
	testl	%edx,%edx
	jz	.Lenc4x_done		/* no lane in this group has work: leave */

	/* state = IV ^ round key 0 ^ first input block, for each lane */
	movups	16-120(%rsi),%xmm1
	pxor	%xmm12,%xmm2
	movups	32-120(%rsi),%xmm0
	pxor	%xmm12,%xmm3
	movl	240-120(%rsi),%eax
	pxor	%xmm12,%xmm4
	movdqu	(%r8),%xmm6
	pxor	%xmm12,%xmm5
	movdqu	(%r9),%xmm7
	pxor	%xmm6,%xmm2
	movdqu	(%r10),%xmm8
	pxor	%xmm7,%xmm3
	movdqu	(%r11),%xmm9
	pxor	%xmm8,%xmm4
	pxor	%xmm9,%xmm5
	movdqa	32(%rsp),%xmm10		/* the four lane counters */
	xorq	%rbx,%rbx
	jmp	.Loop_enc4x

	/*
	 * Inner loop, one block per lane per pass.  AES rounds are interleaved
	 * with pointer bookkeeping: with %ecx == 1, "cmpl counter,%ecx" followed
	 * by cmovge/cmovg redirects a lane's input once its counter is <= 1 and
	 * its output once its counter is <= 0.  %rbp is chosen so that
	 * (%rbp,%rbx) is 16(%rsp) and -16(%rbp,%rbx) is 0(%rsp).
	 */
.align	32
.Loop_enc4x:
	addq	$16,%rbx
	leaq	16(%rsp),%rbp
	movl	$1,%ecx
	subq	%rbx,%rbp

.byte	102,15,56,220,209
	prefetcht0	31(%r8,%rbx,1)
	prefetcht0	31(%r9,%rbx,1)
.byte	102,15,56,220,217
	prefetcht0	31(%r10,%rbx,1)
	/* Lane 3 input; this line used to repeat the %r10 prefetch above. */
	prefetcht0	31(%r11,%rbx,1)
.byte	102,15,56,220,225
.byte	102,15,56,220,233
	movups	48-120(%rsi),%xmm1
	cmpl	32(%rsp),%ecx
.byte	102,15,56,220,208
.byte	102,15,56,220,216
.byte	102,15,56,220,224
	cmovgeq	%rbp,%r8
	cmovgq	%rbp,%r12
.byte	102,15,56,220,232
	movups	-56(%rsi),%xmm0
	cmpl	36(%rsp),%ecx
.byte	102,15,56,220,209
.byte	102,15,56,220,217
.byte	102,15,56,220,225
	cmovgeq	%rbp,%r9
	cmovgq	%rbp,%r13
.byte	102,15,56,220,233
	movups	-40(%rsi),%xmm1
	cmpl	40(%rsp),%ecx
.byte	102,15,56,220,208
.byte	102,15,56,220,216
.byte	102,15,56,220,224
	cmovgeq	%rbp,%r10
	cmovgq	%rbp,%r14
.byte	102,15,56,220,232
	movups	-24(%rsi),%xmm0
	cmpl	44(%rsp),%ecx
.byte	102,15,56,220,209
.byte	102,15,56,220,217
.byte	102,15,56,220,225
	cmovgeq	%rbp,%r11
	cmovgq	%rbp,%r15
.byte	102,15,56,220,233
	movups	-8(%rsi),%xmm1
	movdqa	%xmm10,%xmm11
.byte	102,15,56,220,208
	prefetcht0	15(%r12,%rbx,1)
	prefetcht0	15(%r13,%rbx,1)
.byte	102,15,56,220,216
	prefetcht0	15(%r14,%rbx,1)
	prefetcht0	15(%r15,%rbx,1)
.byte	102,15,56,220,224
.byte	102,15,56,220,232
	movups	128-120(%rsi),%xmm0
	pxor	%xmm12,%xmm12		/* zero, as the comparand for pcmpgtd */

	/* counters: mask = (counter > 0) ? -1 : 0; counter += mask */
.byte	102,15,56,220,209
	pcmpgtd	%xmm12,%xmm11
	movdqu	-120(%rsi),%xmm12	/* reload round key 0 */
.byte	102,15,56,220,217
	paddd	%xmm11,%xmm10
	movdqa	%xmm10,32(%rsp)
.byte	102,15,56,220,225
.byte	102,15,56,220,233
	movups	144-120(%rsi),%xmm1

	/*
	 * The flags set here are consumed by the jb and je below; only
	 * AES-round bytes and movups loads sit in between.  A round count
	 * below 11 skips four extra rounds, exactly 11 skips two.
	 */
	cmpl	$11,%eax

.byte	102,15,56,220,208
.byte	102,15,56,220,216
.byte	102,15,56,220,224
.byte	102,15,56,220,232
	movups	160-120(%rsi),%xmm0

	jb	.Lenc4x_tail

.byte	102,15,56,220,209
.byte	102,15,56,220,217
.byte	102,15,56,220,225
.byte	102,15,56,220,233
	movups	176-120(%rsi),%xmm1

.byte	102,15,56,220,208
.byte	102,15,56,220,216
.byte	102,15,56,220,224
.byte	102,15,56,220,232
	movups	192-120(%rsi),%xmm0

	je	.Lenc4x_tail

.byte	102,15,56,220,209
.byte	102,15,56,220,217
.byte	102,15,56,220,225
.byte	102,15,56,220,233
	movups	208-120(%rsi),%xmm1

.byte	102,15,56,220,208
.byte	102,15,56,220,216
.byte	102,15,56,220,224
.byte	102,15,56,220,232
	movups	224-120(%rsi),%xmm0
	jmp	.Lenc4x_tail

	/*
	 * Last two rounds.  Meanwhile the next input block of each lane is
	 * loaded and XORed with round key 0, and %xmm1/%xmm0 are reloaded with
	 * the keys at key+16 and key+32 for the next pass.
	 */
.align	32
.Lenc4x_tail:
.byte	102,15,56,220,209
.byte	102,15,56,220,217
.byte	102,15,56,220,225
.byte	102,15,56,220,233
	movdqu	(%r8,%rbx,1),%xmm6
	movdqu	16-120(%rsi),%xmm1

.byte	102,15,56,221,208
	movdqu	(%r9,%rbx,1),%xmm7
	pxor	%xmm12,%xmm6
.byte	102,15,56,221,216
	movdqu	(%r10,%rbx,1),%xmm8
	pxor	%xmm12,%xmm7
.byte	102,15,56,221,224
	movdqu	(%r11,%rbx,1),%xmm9
	pxor	%xmm12,%xmm8
.byte	102,15,56,221,232
	movdqu	32-120(%rsi),%xmm0
	pxor	%xmm12,%xmm9

	/* store the block just produced, then chain it into the next state */
	movups	%xmm2,-16(%r12,%rbx,1)
	pxor	%xmm6,%xmm2
	movups	%xmm3,-16(%r13,%rbx,1)
	pxor	%xmm7,%xmm3
	movups	%xmm4,-16(%r14,%rbx,1)
	pxor	%xmm8,%xmm4
	movups	%xmm5,-16(%r15,%rbx,1)
	pxor	%xmm9,%xmm5

	decl	%edx
	jnz	.Loop_enc4x

	movq	16(%rsp),%rax		/* %eax held the round count; recover entry %rsp */
.cfi_def_cfa	%rax,8
	movl	24(%rsp),%edx

	leaq	160(%rdi),%rdi		/* next group: 4 descriptors of 40 bytes */
	decl	%edx
	jnz	.Lenc4x_loop_grande

.Lenc4x_done:
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lenc4x_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.cfi_endproc
.size	aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt
288
/*
 * aesni_multi_cbc_decrypt
 *
 * CBC-decrypts up to four independent buffers ("lanes") side by side with
 * AES-NI, one 16-byte block per lane per inner-loop iteration.
 *
 * Inputs, as read by the code below:
 *   %rdi  array of 40-byte lane descriptors, four per group:
 *           +0   input pointer   (8 bytes)
 *           +8   output pointer  (8 bytes)
 *           +16  block count     (signed 32-bit; <= 0 means the lane is idle)
 *           +24  IV              (16 bytes)
 *   %rsi  key schedule: round key 0 at +0, one round key every 16 bytes,
 *         32-bit round count at +240
 *   %edx  number of four-lane groups.  With %edx >= 2 and bit 28 of the
 *         dword at OPENSSL_ia32cap_P+4 set, control is transferred to
 *         _avx_cbc_dec_shortcut instead.
 *         NOTE(review): presumably called from C as
 *         (descriptor array, key, num) - confirm against the prototype.
 *
 * Stack frame (%rsp is aligned down to 64 bytes):
 *    0(%rsp)  16-byte dummy block; finished lanes store their output here
 *   16(%rsp)  entry value of %rsp, used to find the saved registers
 *   24(%rsp)  remaining group count
 *   32(%rsp)  four 32-bit remaining-block counters, one per lane
 *
 * Register roles inside the loops:
 *   %r8-%r11    lane input pointers      %r12-%r15  lane output pointers
 *   %rbx        byte offset into every lane (advances by 16 per block)
 *   %rbp        16(%rsp) - %rbx, the redirect target for finished lanes
 *   %edx        largest block count of the group (inner-loop trip count)
 *   %eax        round count loaded from key+240
 *   %xmm2-5     block being decrypted    %xmm6-9    chaining value per lane
 *   %xmm0/1     alternating round keys   %xmm12     round key 0 / zero
 *   %xmm10/11   lane counters / decrement mask
 *
 * AES instructions are emitted as raw bytes.  102,15,56,222,M is aesdec;
 * M = 209/217/225/233 applies %xmm1 to %xmm2/3/4/5 and M = 208/216/224/232
 * applies %xmm0 to %xmm2/3/4/5.  The four aesdeclast encodings in the tail
 * (opcode byte 223) use %xmm6-%xmm9 as the key operand for %xmm2-%xmm5.
 *
 * %rbx, %rbp and %r12-%r15 are saved on entry and restored on exit.  No
 * store to the descriptor array appears in this routine.
 */
.globl	aesni_multi_cbc_decrypt
.type	aesni_multi_cbc_decrypt,@function
.align	32
aesni_multi_cbc_decrypt:
.cfi_startproc
	/* Fewer than two groups: always take the four-lane code below. */
	cmpl	$2,%edx
	jb	.Ldec_non_avx
	/* 268435456 == 1<<28: capability bit that selects the eight-lane code. */
	movl	OPENSSL_ia32cap_P+4(%rip),%ecx
	testl	$268435456,%ecx
	jnz	_avx_cbc_dec_shortcut
	jmp	.Ldec_non_avx
.align	16
.Ldec_non_avx:
	movq	%rsp,%rax		/* %rax = entry %rsp; saved registers live below it */
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	subq	$48,%rsp
	andq	$-64,%rsp
	movq	%rax,16(%rsp)
	/* DW_CFA_def_cfa_expression: CFA = *(%rsp + 16) + 8 */
.cfi_escape	0x0f,0x05,0x77,0x10,0x06,0x23,0x08

.Ldec4x_body:
	movdqu	(%rsi),%xmm12		/* round key 0 */
	leaq	120(%rsi),%rsi		/* key pointer is biased by +120 from here on */
	leaq	80(%rdi),%rdi		/* descriptor pointer is biased by +80 */

	/*
	 * Outer loop, once per group of four descriptors.  For every lane:
	 * load the pointers, IV and block count, track the maximum count in
	 * %edx, copy the count to the stack, and point the input of a lane
	 * whose count is <= 0 at the stack frame.
	 */
.Ldec4x_loop_grande:
	movl	%edx,24(%rsp)
	xorl	%edx,%edx
	/* lane 0 */
	movl	-64(%rdi),%ecx
	movq	-80(%rdi),%r8
	cmpl	%edx,%ecx
	movq	-72(%rdi),%r12
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	-56(%rdi),%xmm6
	movl	%ecx,32(%rsp)
	cmovleq	%rsp,%r8
	/* lane 1 */
	movl	-24(%rdi),%ecx
	movq	-40(%rdi),%r9
	cmpl	%edx,%ecx
	movq	-32(%rdi),%r13
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	-16(%rdi),%xmm7
	movl	%ecx,36(%rsp)
	cmovleq	%rsp,%r9
	/* lane 2 */
	movl	16(%rdi),%ecx
	movq	0(%rdi),%r10
	cmpl	%edx,%ecx
	movq	8(%rdi),%r14
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	24(%rdi),%xmm8
	movl	%ecx,40(%rsp)
	cmovleq	%rsp,%r10
	/* lane 3 */
	movl	56(%rdi),%ecx
	movq	40(%rdi),%r11
	cmpl	%edx,%ecx
	movq	48(%rdi),%r15
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	64(%rdi),%xmm9
	movl	%ecx,44(%rsp)
	cmovleq	%rsp,%r11
	testl	%edx,%edx
	jz	.Ldec4x_done		/* no lane in this group has work: leave */

	/* state = first input block ^ round key 0, for each lane */
	movups	16-120(%rsi),%xmm1
	movups	32-120(%rsi),%xmm0
	movl	240-120(%rsi),%eax
	movdqu	(%r8),%xmm2
	movdqu	(%r9),%xmm3
	pxor	%xmm12,%xmm2
	movdqu	(%r10),%xmm4
	pxor	%xmm12,%xmm3
	movdqu	(%r11),%xmm5
	pxor	%xmm12,%xmm4
	pxor	%xmm12,%xmm5
	movdqa	32(%rsp),%xmm10		/* the four lane counters */
	xorq	%rbx,%rbx
	jmp	.Loop_dec4x

	/*
	 * Inner loop, one block per lane per pass.  AES rounds are interleaved
	 * with pointer bookkeeping: with %ecx == 1, "cmpl counter,%ecx" followed
	 * by cmovge/cmovg redirects a lane's input once its counter is <= 1 and
	 * its output once its counter is <= 0.  %rbp is chosen so that
	 * (%rbp,%rbx) is 16(%rsp) and -16(%rbp,%rbx) is 0(%rsp).
	 */
.align	32
.Loop_dec4x:
	addq	$16,%rbx
	leaq	16(%rsp),%rbp
	movl	$1,%ecx
	subq	%rbx,%rbp

.byte	102,15,56,222,209
	prefetcht0	31(%r8,%rbx,1)
	prefetcht0	31(%r9,%rbx,1)
.byte	102,15,56,222,217
	prefetcht0	31(%r10,%rbx,1)
	prefetcht0	31(%r11,%rbx,1)
.byte	102,15,56,222,225
.byte	102,15,56,222,233
	movups	48-120(%rsi),%xmm1
	cmpl	32(%rsp),%ecx
.byte	102,15,56,222,208
.byte	102,15,56,222,216
.byte	102,15,56,222,224
	cmovgeq	%rbp,%r8
	cmovgq	%rbp,%r12
.byte	102,15,56,222,232
	movups	-56(%rsi),%xmm0
	cmpl	36(%rsp),%ecx
.byte	102,15,56,222,209
.byte	102,15,56,222,217
.byte	102,15,56,222,225
	cmovgeq	%rbp,%r9
	cmovgq	%rbp,%r13
.byte	102,15,56,222,233
	movups	-40(%rsi),%xmm1
	cmpl	40(%rsp),%ecx
.byte	102,15,56,222,208
.byte	102,15,56,222,216
.byte	102,15,56,222,224
	cmovgeq	%rbp,%r10
	cmovgq	%rbp,%r14
.byte	102,15,56,222,232
	movups	-24(%rsi),%xmm0
	cmpl	44(%rsp),%ecx
.byte	102,15,56,222,209
.byte	102,15,56,222,217
.byte	102,15,56,222,225
	cmovgeq	%rbp,%r11
	cmovgq	%rbp,%r15
.byte	102,15,56,222,233
	movups	-8(%rsi),%xmm1
	movdqa	%xmm10,%xmm11
.byte	102,15,56,222,208
	prefetcht0	15(%r12,%rbx,1)
	prefetcht0	15(%r13,%rbx,1)
.byte	102,15,56,222,216
	prefetcht0	15(%r14,%rbx,1)
	prefetcht0	15(%r15,%rbx,1)
.byte	102,15,56,222,224
.byte	102,15,56,222,232
	movups	128-120(%rsi),%xmm0
	pxor	%xmm12,%xmm12		/* zero, as the comparand for pcmpgtd */

	/* counters: mask = (counter > 0) ? -1 : 0; counter += mask */
.byte	102,15,56,222,209
	pcmpgtd	%xmm12,%xmm11
	movdqu	-120(%rsi),%xmm12	/* reload round key 0 */
.byte	102,15,56,222,217
	paddd	%xmm11,%xmm10
	movdqa	%xmm10,32(%rsp)
.byte	102,15,56,222,225
.byte	102,15,56,222,233
	movups	144-120(%rsi),%xmm1

	/*
	 * The flags set here are consumed by the jb and je below; only
	 * AES-round bytes and movups loads sit in between.  A round count
	 * below 11 skips four extra rounds, exactly 11 skips two.
	 */
	cmpl	$11,%eax

.byte	102,15,56,222,208
.byte	102,15,56,222,216
.byte	102,15,56,222,224
.byte	102,15,56,222,232
	movups	160-120(%rsi),%xmm0

	jb	.Ldec4x_tail

.byte	102,15,56,222,209
.byte	102,15,56,222,217
.byte	102,15,56,222,225
.byte	102,15,56,222,233
	movups	176-120(%rsi),%xmm1

.byte	102,15,56,222,208
.byte	102,15,56,222,216
.byte	102,15,56,222,224
.byte	102,15,56,222,232
	movups	192-120(%rsi),%xmm0

	je	.Ldec4x_tail

.byte	102,15,56,222,209
.byte	102,15,56,222,217
.byte	102,15,56,222,225
.byte	102,15,56,222,233
	movups	208-120(%rsi),%xmm1

.byte	102,15,56,222,208
.byte	102,15,56,222,216
.byte	102,15,56,222,224
.byte	102,15,56,222,232
	movups	224-120(%rsi),%xmm0
	jmp	.Ldec4x_tail

	/*
	 * Last two rounds.  The chaining value of each lane is XORed into the
	 * final round key (%xmm0) first, so the aesdeclast step also applies
	 * the CBC XOR.  %xmm1/%xmm0 are then reloaded with the keys at key+16
	 * and key+32 for the next pass.
	 */
.align	32
.Ldec4x_tail:
.byte	102,15,56,222,209
.byte	102,15,56,222,217
.byte	102,15,56,222,225
	pxor	%xmm0,%xmm6
	pxor	%xmm0,%xmm7
.byte	102,15,56,222,233
	movdqu	16-120(%rsi),%xmm1
	pxor	%xmm0,%xmm8
	pxor	%xmm0,%xmm9
	movdqu	32-120(%rsi),%xmm0

	/*
	 * aesdeclast %xmm6..9 into %xmm2..5.  The input block just consumed is
	 * re-read as the next chaining value before the output store below,
	 * which comes later in program order.
	 */
.byte	102,15,56,223,214
.byte	102,15,56,223,223
	movdqu	-16(%r8,%rbx,1),%xmm6
	movdqu	-16(%r9,%rbx,1),%xmm7
.byte	102,65,15,56,223,224
.byte	102,65,15,56,223,233
	movdqu	-16(%r10,%rbx,1),%xmm8
	movdqu	-16(%r11,%rbx,1),%xmm9

	/* store the block just produced, fetch the next input ^ round key 0 */
	movups	%xmm2,-16(%r12,%rbx,1)
	movdqu	(%r8,%rbx,1),%xmm2
	movups	%xmm3,-16(%r13,%rbx,1)
	movdqu	(%r9,%rbx,1),%xmm3
	pxor	%xmm12,%xmm2
	movups	%xmm4,-16(%r14,%rbx,1)
	movdqu	(%r10,%rbx,1),%xmm4
	pxor	%xmm12,%xmm3
	movups	%xmm5,-16(%r15,%rbx,1)
	movdqu	(%r11,%rbx,1),%xmm5
	pxor	%xmm12,%xmm4
	pxor	%xmm12,%xmm5

	decl	%edx
	jnz	.Loop_dec4x

	movq	16(%rsp),%rax		/* %eax held the round count; recover entry %rsp */
.cfi_def_cfa	%rax,8
	movl	24(%rsp),%edx

	leaq	160(%rdi),%rdi		/* next group: 4 descriptors of 40 bytes */
	decl	%edx
	jnz	.Ldec4x_loop_grande

.Ldec4x_done:
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Ldec4x_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.cfi_endproc
.size	aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
/*
 * aesni_multi_cbc_encrypt_avx (entered through _avx_cbc_enc_shortcut)
 *
 * Eight-lane AVX variant of the CBC encryptor: one 16-byte block per lane
 * per inner-loop iteration.  The symbol has no .globl directive here; the
 * only entry visible in this file is the jump to _avx_cbc_enc_shortcut.
 *
 * Inputs, as read by the code below:
 *   %rdi  array of eight 40-byte lane descriptors:
 *           +0   input pointer   (8 bytes)
 *           +8   output pointer  (8 bytes)
 *           +16  block count     (signed 32-bit; <= 0 means the lane is idle)
 *           +24  IV              (16 bytes)
 *   %rsi  key schedule: round key 0 at +0, one round key every 16 bytes,
 *         32-bit round count at +240
 *   %edx  group count.  It is halved on entry, but the result is overwritten
 *         by the xorl at .Lenc8x_loop_grande before being read, and nothing
 *         branches back to that label: exactly one batch of eight
 *         descriptors is processed per call.
 *
 * Stack frame (%rsp is aligned down to 128 bytes):
 *     0(%rsp)  16-byte dummy block; finished lanes store their output here
 *    16(%rsp)  entry value of %rsp, used to find the saved registers
 *    32(%rsp)  eight 32-bit remaining-block counters
 *    64(%rsp)  eight 64-bit (output - input) pointer differences
 *   128(%rsp)  four 16-byte slots (addressed through %rbp) holding the next
 *              input block ^ round key 0 for lanes 0-3
 *
 * Register roles inside the loop:
 *   %r8-%r15    one pointer per lane; it holds the input position at the top
 *               of a pass and is temporarily turned into output+16 for the
 *               store in the tail
 *   %rbx        scratch for the current lane's (output - input) difference
 *   %rbp        128(%rsp)
 *   %ecx        constant 1         %edx  largest block count (trip count)
 *   %eax        round count loaded from key+240
 *   %xmm2-9     CBC state per lane %xmm0/1  alternating round keys
 *   %xmm10-13   next input ^ round key 0 (lanes 0-3 are spilled to the
 *               stack, lanes 4-7 stay in registers)
 *   %xmm14/15   counters and mask in the tail; %xmm15 is round key 0 otherwise
 *
 * %rbx, %rbp and %r12-%r15 are saved on entry and restored on exit;
 * vzeroupper is issued on entry and before the restore sequence.  No store
 * to the descriptor array appears in this routine.
 */
.type	aesni_multi_cbc_encrypt_avx,@function
.align	32
aesni_multi_cbc_encrypt_avx:
.cfi_startproc
_avx_cbc_enc_shortcut:
	movq	%rsp,%rax		/* %rax = entry %rsp; saved registers live below it */
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	subq	$192,%rsp
	andq	$-128,%rsp
	movq	%rax,16(%rsp)
	/* DW_CFA_def_cfa_expression: CFA = *(%rsp + 16) + 8 */
.cfi_escape	0x0f,0x05,0x77,0x10,0x06,0x23,0x08

.Lenc8x_body:
	vzeroupper
	vmovdqu	(%rsi),%xmm15		/* round key 0 */
	leaq	120(%rsi),%rsi		/* key pointer is biased by +120 from here on */
	leaq	160(%rdi),%rdi		/* descriptor pointer is biased by +160 */
	shrl	$1,%edx			/* result unused: %edx is cleared just below */

	/*
	 * Per lane: load pointers, IV and block count, track the maximum count
	 * in %edx, copy the count to the stack, point the input of a lane
	 * whose count is <= 0 at the stack frame, and record output - input.
	 * Lane 0 computes its difference in %rbx, which the first pass of
	 * .Loop_enc8x consumes directly; the other lanes use %rbp as scratch.
	 */
.Lenc8x_loop_grande:

	xorl	%edx,%edx
	/* lane 0 */
	movl	-144(%rdi),%ecx
	movq	-160(%rdi),%r8
	cmpl	%edx,%ecx
	movq	-152(%rdi),%rbx
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	-136(%rdi),%xmm2
	movl	%ecx,32(%rsp)
	cmovleq	%rsp,%r8
	subq	%r8,%rbx
	movq	%rbx,64(%rsp)
	/* lane 1 */
	movl	-104(%rdi),%ecx
	movq	-120(%rdi),%r9
	cmpl	%edx,%ecx
	movq	-112(%rdi),%rbp
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	-96(%rdi),%xmm3
	movl	%ecx,36(%rsp)
	cmovleq	%rsp,%r9
	subq	%r9,%rbp
	movq	%rbp,72(%rsp)
	/* lane 2 */
	movl	-64(%rdi),%ecx
	movq	-80(%rdi),%r10
	cmpl	%edx,%ecx
	movq	-72(%rdi),%rbp
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	-56(%rdi),%xmm4
	movl	%ecx,40(%rsp)
	cmovleq	%rsp,%r10
	subq	%r10,%rbp
	movq	%rbp,80(%rsp)
	/* lane 3 */
	movl	-24(%rdi),%ecx
	movq	-40(%rdi),%r11
	cmpl	%edx,%ecx
	movq	-32(%rdi),%rbp
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	-16(%rdi),%xmm5
	movl	%ecx,44(%rsp)
	cmovleq	%rsp,%r11
	subq	%r11,%rbp
	movq	%rbp,88(%rsp)
	/* lane 4 */
	movl	16(%rdi),%ecx
	movq	0(%rdi),%r12
	cmpl	%edx,%ecx
	movq	8(%rdi),%rbp
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	24(%rdi),%xmm6
	movl	%ecx,48(%rsp)
	cmovleq	%rsp,%r12
	subq	%r12,%rbp
	movq	%rbp,96(%rsp)
	/* lane 5 */
	movl	56(%rdi),%ecx
	movq	40(%rdi),%r13
	cmpl	%edx,%ecx
	movq	48(%rdi),%rbp
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	64(%rdi),%xmm7
	movl	%ecx,52(%rsp)
	cmovleq	%rsp,%r13
	subq	%r13,%rbp
	movq	%rbp,104(%rsp)
	/* lane 6 */
	movl	96(%rdi),%ecx
	movq	80(%rdi),%r14
	cmpl	%edx,%ecx
	movq	88(%rdi),%rbp
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	104(%rdi),%xmm8
	movl	%ecx,56(%rsp)
	cmovleq	%rsp,%r14
	subq	%r14,%rbp
	movq	%rbp,112(%rsp)
	/* lane 7 */
	movl	136(%rdi),%ecx
	movq	120(%rdi),%r15
	cmpl	%edx,%ecx
	movq	128(%rdi),%rbp
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	144(%rdi),%xmm9
	movl	%ecx,60(%rsp)
	cmovleq	%rsp,%r15
	subq	%r15,%rbp
	movq	%rbp,120(%rsp)
	testl	%edx,%edx
	jz	.Lenc8x_done		/* no lane has work: leave */

	vmovups	16-120(%rsi),%xmm1
	vmovups	32-120(%rsi),%xmm0
	movl	240-120(%rsi),%eax

	/* state = IV ^ round key 0 ^ first input block, for each lane */
	vpxor	(%r8),%xmm15,%xmm10
	leaq	128(%rsp),%rbp
	vpxor	(%r9),%xmm15,%xmm11
	vpxor	(%r10),%xmm15,%xmm12
	vpxor	(%r11),%xmm15,%xmm13
	vpxor	%xmm10,%xmm2,%xmm2
	vpxor	(%r12),%xmm15,%xmm10
	vpxor	%xmm11,%xmm3,%xmm3
	vpxor	(%r13),%xmm15,%xmm11
	vpxor	%xmm12,%xmm4,%xmm4
	vpxor	(%r14),%xmm15,%xmm12
	vpxor	%xmm13,%xmm5,%xmm5
	vpxor	(%r15),%xmm15,%xmm13
	vpxor	%xmm10,%xmm6,%xmm6
	movl	$1,%ecx
	vpxor	%xmm11,%xmm7,%xmm7
	vpxor	%xmm12,%xmm8,%xmm8
	vpxor	%xmm13,%xmm9,%xmm9
	jmp	.Loop_enc8x

	/*
	 * Inner loop.  Each of the first eight AES rounds is interleaved with
	 * the bookkeeping of one lane:
	 *   %rbx = input + difference       (the output position)
	 *   counter <= 1: input  := %rsp    (cmovge; %ecx is 1)
	 *   counter <= 0: output := %rsp    (cmovg)
	 *   the new difference is stored back, the block at 16(input) is XORed
	 *   with round key 0 as the next plaintext, and the lane pointer
	 *   becomes output + 16 so the tail can store through -16(pointer).
	 */
.align	32
.Loop_enc8x:
	/* lane 0 */
	vaesenc	%xmm1,%xmm2,%xmm2
	cmpl	32+0(%rsp),%ecx
	vaesenc	%xmm1,%xmm3,%xmm3
	prefetcht0	31(%r8)
	vaesenc	%xmm1,%xmm4,%xmm4
	vaesenc	%xmm1,%xmm5,%xmm5
	leaq	(%r8,%rbx,1),%rbx
	cmovgeq	%rsp,%r8
	vaesenc	%xmm1,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesenc	%xmm1,%xmm7,%xmm7
	subq	%r8,%rbx
	vaesenc	%xmm1,%xmm8,%xmm8
	vpxor	16(%r8),%xmm15,%xmm10
	movq	%rbx,64+0(%rsp)
	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	-72(%rsi),%xmm1
	leaq	16(%r8,%rbx,1),%r8
	vmovdqu	%xmm10,0(%rbp)
	/* lane 1 */
	vaesenc	%xmm0,%xmm2,%xmm2
	cmpl	32+4(%rsp),%ecx
	movq	64+8(%rsp),%rbx
	vaesenc	%xmm0,%xmm3,%xmm3
	prefetcht0	31(%r9)
	vaesenc	%xmm0,%xmm4,%xmm4
	vaesenc	%xmm0,%xmm5,%xmm5
	leaq	(%r9,%rbx,1),%rbx
	cmovgeq	%rsp,%r9
	vaesenc	%xmm0,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesenc	%xmm0,%xmm7,%xmm7
	subq	%r9,%rbx
	vaesenc	%xmm0,%xmm8,%xmm8
	vpxor	16(%r9),%xmm15,%xmm11
	movq	%rbx,64+8(%rsp)
	vaesenc	%xmm0,%xmm9,%xmm9
	vmovups	-56(%rsi),%xmm0
	leaq	16(%r9,%rbx,1),%r9
	vmovdqu	%xmm11,16(%rbp)
	/* lane 2 */
	vaesenc	%xmm1,%xmm2,%xmm2
	cmpl	32+8(%rsp),%ecx
	movq	64+16(%rsp),%rbx
	vaesenc	%xmm1,%xmm3,%xmm3
	prefetcht0	31(%r10)
	vaesenc	%xmm1,%xmm4,%xmm4
	prefetcht0	15(%r8)
	vaesenc	%xmm1,%xmm5,%xmm5
	leaq	(%r10,%rbx,1),%rbx
	cmovgeq	%rsp,%r10
	vaesenc	%xmm1,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesenc	%xmm1,%xmm7,%xmm7
	subq	%r10,%rbx
	vaesenc	%xmm1,%xmm8,%xmm8
	vpxor	16(%r10),%xmm15,%xmm12
	movq	%rbx,64+16(%rsp)
	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	-40(%rsi),%xmm1
	leaq	16(%r10,%rbx,1),%r10
	vmovdqu	%xmm12,32(%rbp)
	/* lane 3 */
	vaesenc	%xmm0,%xmm2,%xmm2
	cmpl	32+12(%rsp),%ecx
	movq	64+24(%rsp),%rbx
	vaesenc	%xmm0,%xmm3,%xmm3
	prefetcht0	31(%r11)
	vaesenc	%xmm0,%xmm4,%xmm4
	prefetcht0	15(%r9)
	vaesenc	%xmm0,%xmm5,%xmm5
	leaq	(%r11,%rbx,1),%rbx
	cmovgeq	%rsp,%r11
	vaesenc	%xmm0,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesenc	%xmm0,%xmm7,%xmm7
	subq	%r11,%rbx
	vaesenc	%xmm0,%xmm8,%xmm8
	vpxor	16(%r11),%xmm15,%xmm13
	movq	%rbx,64+24(%rsp)
	vaesenc	%xmm0,%xmm9,%xmm9
	vmovups	-24(%rsi),%xmm0
	leaq	16(%r11,%rbx,1),%r11
	vmovdqu	%xmm13,48(%rbp)
	/* lane 4 (from here on the staged block stays in %xmm10-%xmm13) */
	vaesenc	%xmm1,%xmm2,%xmm2
	cmpl	32+16(%rsp),%ecx
	movq	64+32(%rsp),%rbx
	vaesenc	%xmm1,%xmm3,%xmm3
	prefetcht0	31(%r12)
	vaesenc	%xmm1,%xmm4,%xmm4
	prefetcht0	15(%r10)
	vaesenc	%xmm1,%xmm5,%xmm5
	leaq	(%r12,%rbx,1),%rbx
	cmovgeq	%rsp,%r12
	vaesenc	%xmm1,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesenc	%xmm1,%xmm7,%xmm7
	subq	%r12,%rbx
	vaesenc	%xmm1,%xmm8,%xmm8
	vpxor	16(%r12),%xmm15,%xmm10
	movq	%rbx,64+32(%rsp)
	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	-8(%rsi),%xmm1
	leaq	16(%r12,%rbx,1),%r12
	/* lane 5 */
	vaesenc	%xmm0,%xmm2,%xmm2
	cmpl	32+20(%rsp),%ecx
	movq	64+40(%rsp),%rbx
	vaesenc	%xmm0,%xmm3,%xmm3
	prefetcht0	31(%r13)
	vaesenc	%xmm0,%xmm4,%xmm4
	prefetcht0	15(%r11)
	vaesenc	%xmm0,%xmm5,%xmm5
	leaq	(%rbx,%r13,1),%rbx
	cmovgeq	%rsp,%r13
	vaesenc	%xmm0,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesenc	%xmm0,%xmm7,%xmm7
	subq	%r13,%rbx
	vaesenc	%xmm0,%xmm8,%xmm8
	vpxor	16(%r13),%xmm15,%xmm11
	movq	%rbx,64+40(%rsp)
	vaesenc	%xmm0,%xmm9,%xmm9
	vmovups	8(%rsi),%xmm0
	leaq	16(%r13,%rbx,1),%r13
	/* lane 6 */
	vaesenc	%xmm1,%xmm2,%xmm2
	cmpl	32+24(%rsp),%ecx
	movq	64+48(%rsp),%rbx
	vaesenc	%xmm1,%xmm3,%xmm3
	prefetcht0	31(%r14)
	vaesenc	%xmm1,%xmm4,%xmm4
	prefetcht0	15(%r12)
	vaesenc	%xmm1,%xmm5,%xmm5
	leaq	(%r14,%rbx,1),%rbx
	cmovgeq	%rsp,%r14
	vaesenc	%xmm1,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesenc	%xmm1,%xmm7,%xmm7
	subq	%r14,%rbx
	vaesenc	%xmm1,%xmm8,%xmm8
	vpxor	16(%r14),%xmm15,%xmm12
	movq	%rbx,64+48(%rsp)
	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	24(%rsi),%xmm1
	leaq	16(%r14,%rbx,1),%r14
	/* lane 7 */
	vaesenc	%xmm0,%xmm2,%xmm2
	cmpl	32+28(%rsp),%ecx
	movq	64+56(%rsp),%rbx
	vaesenc	%xmm0,%xmm3,%xmm3
	prefetcht0	31(%r15)
	vaesenc	%xmm0,%xmm4,%xmm4
	prefetcht0	15(%r13)
	vaesenc	%xmm0,%xmm5,%xmm5
	leaq	(%r15,%rbx,1),%rbx
	cmovgeq	%rsp,%r15
	vaesenc	%xmm0,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesenc	%xmm0,%xmm7,%xmm7
	subq	%r15,%rbx
	vaesenc	%xmm0,%xmm8,%xmm8
	vpxor	16(%r15),%xmm15,%xmm13
	movq	%rbx,64+56(%rsp)
	vaesenc	%xmm0,%xmm9,%xmm9
	vmovups	40(%rsi),%xmm0
	leaq	16(%r15,%rbx,1),%r15
	vmovdqu	32(%rsp),%xmm14		/* counters of lanes 0-3 */
	prefetcht0	15(%r14)
	prefetcht0	15(%r15)
	/*
	 * The je further down reuses these flags; only vaesenc and vmovups
	 * sit in between.  A round count below 11 skips four extra rounds,
	 * exactly 11 skips two.
	 */
	cmpl	$11,%eax
	jb	.Lenc8x_tail

	vaesenc	%xmm1,%xmm2,%xmm2
	vaesenc	%xmm1,%xmm3,%xmm3
	vaesenc	%xmm1,%xmm4,%xmm4
	vaesenc	%xmm1,%xmm5,%xmm5
	vaesenc	%xmm1,%xmm6,%xmm6
	vaesenc	%xmm1,%xmm7,%xmm7
	vaesenc	%xmm1,%xmm8,%xmm8
	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	176-120(%rsi),%xmm1

	vaesenc	%xmm0,%xmm2,%xmm2
	vaesenc	%xmm0,%xmm3,%xmm3
	vaesenc	%xmm0,%xmm4,%xmm4
	vaesenc	%xmm0,%xmm5,%xmm5
	vaesenc	%xmm0,%xmm6,%xmm6
	vaesenc	%xmm0,%xmm7,%xmm7
	vaesenc	%xmm0,%xmm8,%xmm8
	vaesenc	%xmm0,%xmm9,%xmm9
	vmovups	192-120(%rsi),%xmm0
	je	.Lenc8x_tail

	vaesenc	%xmm1,%xmm2,%xmm2
	vaesenc	%xmm1,%xmm3,%xmm3
	vaesenc	%xmm1,%xmm4,%xmm4
	vaesenc	%xmm1,%xmm5,%xmm5
	vaesenc	%xmm1,%xmm6,%xmm6
	vaesenc	%xmm1,%xmm7,%xmm7
	vaesenc	%xmm1,%xmm8,%xmm8
	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	208-120(%rsi),%xmm1

	vaesenc	%xmm0,%xmm2,%xmm2
	vaesenc	%xmm0,%xmm3,%xmm3
	vaesenc	%xmm0,%xmm4,%xmm4
	vaesenc	%xmm0,%xmm5,%xmm5
	vaesenc	%xmm0,%xmm6,%xmm6
	vaesenc	%xmm0,%xmm7,%xmm7
	vaesenc	%xmm0,%xmm8,%xmm8
	vaesenc	%xmm0,%xmm9,%xmm9
	vmovups	224-120(%rsi),%xmm0

	/*
	 * Last two rounds, interleaved with the counter update: for each
	 * group of four counters, mask = (counter > 0) ? -1 : 0 and
	 * counter += mask.  %xmm15 serves as zero and mask here and is
	 * reloaded with round key 0 afterwards; %xmm1/%xmm0 are reloaded
	 * with the keys at key+16 and key+32 for the next pass.
	 */
.Lenc8x_tail:
	vaesenc	%xmm1,%xmm2,%xmm2
	vpxor	%xmm15,%xmm15,%xmm15
	vaesenc	%xmm1,%xmm3,%xmm3
	vaesenc	%xmm1,%xmm4,%xmm4
	vpcmpgtd	%xmm15,%xmm14,%xmm15
	vaesenc	%xmm1,%xmm5,%xmm5
	vaesenc	%xmm1,%xmm6,%xmm6
	vpaddd	%xmm14,%xmm15,%xmm15
	vmovdqu	48(%rsp),%xmm14		/* counters of lanes 4-7 */
	vaesenc	%xmm1,%xmm7,%xmm7
	movq	64(%rsp),%rbx		/* lane 0 difference, for the subq below and the next pass */
	vaesenc	%xmm1,%xmm8,%xmm8
	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	16-120(%rsi),%xmm1

	vaesenclast	%xmm0,%xmm2,%xmm2
	vmovdqa	%xmm15,32(%rsp)
	vpxor	%xmm15,%xmm15,%xmm15
	vaesenclast	%xmm0,%xmm3,%xmm3
	vaesenclast	%xmm0,%xmm4,%xmm4
	vpcmpgtd	%xmm15,%xmm14,%xmm15
	vaesenclast	%xmm0,%xmm5,%xmm5
	vaesenclast	%xmm0,%xmm6,%xmm6
	vpaddd	%xmm15,%xmm14,%xmm14
	vmovdqu	-120(%rsi),%xmm15	/* reload round key 0 */
	vaesenclast	%xmm0,%xmm7,%xmm7
	vaesenclast	%xmm0,%xmm8,%xmm8
	vmovdqa	%xmm14,48(%rsp)
	vaesenclast	%xmm0,%xmm9,%xmm9
	vmovups	32-120(%rsi),%xmm0

	/*
	 * Store each block just produced, turn the lane pointer back into an
	 * input position (output+16 minus the difference), and chain the
	 * staged next block into the state.
	 */
	vmovups	%xmm2,-16(%r8)
	subq	%rbx,%r8
	vpxor	0(%rbp),%xmm2,%xmm2
	vmovups	%xmm3,-16(%r9)
	subq	72(%rsp),%r9
	vpxor	16(%rbp),%xmm3,%xmm3
	vmovups	%xmm4,-16(%r10)
	subq	80(%rsp),%r10
	vpxor	32(%rbp),%xmm4,%xmm4
	vmovups	%xmm5,-16(%r11)
	subq	88(%rsp),%r11
	vpxor	48(%rbp),%xmm5,%xmm5
	vmovups	%xmm6,-16(%r12)
	subq	96(%rsp),%r12
	vpxor	%xmm10,%xmm6,%xmm6
	vmovups	%xmm7,-16(%r13)
	subq	104(%rsp),%r13
	vpxor	%xmm11,%xmm7,%xmm7
	vmovups	%xmm8,-16(%r14)
	subq	112(%rsp),%r14
	vpxor	%xmm12,%xmm8,%xmm8
	vmovups	%xmm9,-16(%r15)
	subq	120(%rsp),%r15
	vpxor	%xmm13,%xmm9,%xmm9

	decl	%edx
	jnz	.Loop_enc8x

	movq	16(%rsp),%rax		/* %eax held the round count; recover entry %rsp */
.cfi_def_cfa	%rax,8

.Lenc8x_done:
	vzeroupper
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lenc8x_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.cfi_endproc
.size	aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
1015
1016.type	aesni_multi_cbc_decrypt_avx,@function
1017.align	32
1018aesni_multi_cbc_decrypt_avx:
1019.cfi_startproc
1020_avx_cbc_dec_shortcut:
1021	movq	%rsp,%rax
1022.cfi_def_cfa_register	%rax
1023	pushq	%rbx
1024.cfi_offset	%rbx,-16
1025	pushq	%rbp
1026.cfi_offset	%rbp,-24
1027	pushq	%r12
1028.cfi_offset	%r12,-32
1029	pushq	%r13
1030.cfi_offset	%r13,-40
1031	pushq	%r14
1032.cfi_offset	%r14,-48
1033	pushq	%r15
1034.cfi_offset	%r15,-56
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044	subq	$256,%rsp
1045	andq	$-256,%rsp
1046	subq	$192,%rsp
1047	movq	%rax,16(%rsp)
1048.cfi_escape	0x0f,0x05,0x77,0x10,0x06,0x23,0x08
1049
1050.Ldec8x_body:
1051	vzeroupper
1052	vmovdqu	(%rsi),%xmm15
1053	leaq	120(%rsi),%rsi
1054	leaq	160(%rdi),%rdi
1055	shrl	$1,%edx
1056
1057.Ldec8x_loop_grande:
1058
1059	xorl	%edx,%edx
1060	movl	-144(%rdi),%ecx
1061	movq	-160(%rdi),%r8
1062	cmpl	%edx,%ecx
1063	movq	-152(%rdi),%rbx
1064	cmovgl	%ecx,%edx
1065	testl	%ecx,%ecx
1066	vmovdqu	-136(%rdi),%xmm2
1067	movl	%ecx,32(%rsp)
1068	cmovleq	%rsp,%r8
1069	subq	%r8,%rbx
1070	movq	%rbx,64(%rsp)
1071	vmovdqu	%xmm2,192(%rsp)
1072	movl	-104(%rdi),%ecx
1073	movq	-120(%rdi),%r9
1074	cmpl	%edx,%ecx
1075	movq	-112(%rdi),%rbp
1076	cmovgl	%ecx,%edx
1077	testl	%ecx,%ecx
1078	vmovdqu	-96(%rdi),%xmm3
1079	movl	%ecx,36(%rsp)
1080	cmovleq	%rsp,%r9
1081	subq	%r9,%rbp
1082	movq	%rbp,72(%rsp)
1083	vmovdqu	%xmm3,208(%rsp)
1084	movl	-64(%rdi),%ecx
1085	movq	-80(%rdi),%r10
1086	cmpl	%edx,%ecx
1087	movq	-72(%rdi),%rbp
1088	cmovgl	%ecx,%edx
1089	testl	%ecx,%ecx
1090	vmovdqu	-56(%rdi),%xmm4
1091	movl	%ecx,40(%rsp)
1092	cmovleq	%rsp,%r10
1093	subq	%r10,%rbp
1094	movq	%rbp,80(%rsp)
1095	vmovdqu	%xmm4,224(%rsp)
1096	movl	-24(%rdi),%ecx
1097	movq	-40(%rdi),%r11
1098	cmpl	%edx,%ecx
1099	movq	-32(%rdi),%rbp
1100	cmovgl	%ecx,%edx
1101	testl	%ecx,%ecx
1102	vmovdqu	-16(%rdi),%xmm5
1103	movl	%ecx,44(%rsp)
1104	cmovleq	%rsp,%r11
1105	subq	%r11,%rbp
1106	movq	%rbp,88(%rsp)
1107	vmovdqu	%xmm5,240(%rsp)
1108	movl	16(%rdi),%ecx
1109	movq	0(%rdi),%r12
1110	cmpl	%edx,%ecx
1111	movq	8(%rdi),%rbp
1112	cmovgl	%ecx,%edx
1113	testl	%ecx,%ecx
1114	vmovdqu	24(%rdi),%xmm6
1115	movl	%ecx,48(%rsp)
1116	cmovleq	%rsp,%r12
1117	subq	%r12,%rbp
1118	movq	%rbp,96(%rsp)
1119	vmovdqu	%xmm6,256(%rsp)
1120	movl	56(%rdi),%ecx
1121	movq	40(%rdi),%r13
1122	cmpl	%edx,%ecx
1123	movq	48(%rdi),%rbp
1124	cmovgl	%ecx,%edx
1125	testl	%ecx,%ecx
1126	vmovdqu	64(%rdi),%xmm7
1127	movl	%ecx,52(%rsp)
1128	cmovleq	%rsp,%r13
1129	subq	%r13,%rbp
1130	movq	%rbp,104(%rsp)
1131	vmovdqu	%xmm7,272(%rsp)
1132	movl	96(%rdi),%ecx
1133	movq	80(%rdi),%r14
1134	cmpl	%edx,%ecx
1135	movq	88(%rdi),%rbp
1136	cmovgl	%ecx,%edx
1137	testl	%ecx,%ecx
1138	vmovdqu	104(%rdi),%xmm8
1139	movl	%ecx,56(%rsp)
1140	cmovleq	%rsp,%r14
1141	subq	%r14,%rbp
1142	movq	%rbp,112(%rsp)
1143	vmovdqu	%xmm8,288(%rsp)
1144	movl	136(%rdi),%ecx
1145	movq	120(%rdi),%r15
1146	cmpl	%edx,%ecx
1147	movq	128(%rdi),%rbp
1148	cmovgl	%ecx,%edx
1149	testl	%ecx,%ecx
1150	vmovdqu	144(%rdi),%xmm9
1151	movl	%ecx,60(%rsp)
1152	cmovleq	%rsp,%r15
1153	subq	%r15,%rbp
1154	movq	%rbp,120(%rsp)
1155	vmovdqu	%xmm9,304(%rsp)
1156	testl	%edx,%edx
1157	jz	.Ldec8x_done
1158
1159	vmovups	16-120(%rsi),%xmm1
1160	vmovups	32-120(%rsi),%xmm0
1161	movl	240-120(%rsi),%eax
1162	leaq	192+128(%rsp),%rbp
1163
1164	vmovdqu	(%r8),%xmm2
1165	vmovdqu	(%r9),%xmm3
1166	vmovdqu	(%r10),%xmm4
1167	vmovdqu	(%r11),%xmm5
1168	vmovdqu	(%r12),%xmm6
1169	vmovdqu	(%r13),%xmm7
1170	vmovdqu	(%r14),%xmm8
1171	vmovdqu	(%r15),%xmm9
1172	vmovdqu	%xmm2,0(%rbp)
1173	vpxor	%xmm15,%xmm2,%xmm2
1174	vmovdqu	%xmm3,16(%rbp)
1175	vpxor	%xmm15,%xmm3,%xmm3
1176	vmovdqu	%xmm4,32(%rbp)
1177	vpxor	%xmm15,%xmm4,%xmm4
1178	vmovdqu	%xmm5,48(%rbp)
1179	vpxor	%xmm15,%xmm5,%xmm5
1180	vmovdqu	%xmm6,64(%rbp)
1181	vpxor	%xmm15,%xmm6,%xmm6
1182	vmovdqu	%xmm7,80(%rbp)
1183	vpxor	%xmm15,%xmm7,%xmm7
1184	vmovdqu	%xmm8,96(%rbp)
1185	vpxor	%xmm15,%xmm8,%xmm8
1186	vmovdqu	%xmm9,112(%rbp)
1187	vpxor	%xmm15,%xmm9,%xmm9
1188	xorq	$0x80,%rbp
1189	movl	$1,%ecx
1190	jmp	.Loop_dec8x
1191
1192.align	32
1193.Loop_dec8x:
1194	vaesdec	%xmm1,%xmm2,%xmm2
1195	cmpl	32+0(%rsp),%ecx
1196	vaesdec	%xmm1,%xmm3,%xmm3
1197	prefetcht0	31(%r8)
1198	vaesdec	%xmm1,%xmm4,%xmm4
1199	vaesdec	%xmm1,%xmm5,%xmm5
1200	leaq	(%r8,%rbx,1),%rbx
1201	cmovgeq	%rsp,%r8
1202	vaesdec	%xmm1,%xmm6,%xmm6
1203	cmovgq	%rsp,%rbx
1204	vaesdec	%xmm1,%xmm7,%xmm7
1205	subq	%r8,%rbx
1206	vaesdec	%xmm1,%xmm8,%xmm8
1207	vmovdqu	16(%r8),%xmm10
1208	movq	%rbx,64+0(%rsp)
1209	vaesdec	%xmm1,%xmm9,%xmm9
1210	vmovups	-72(%rsi),%xmm1
1211	leaq	16(%r8,%rbx,1),%r8
1212	vmovdqu	%xmm10,128(%rsp)
1213	vaesdec	%xmm0,%xmm2,%xmm2
1214	cmpl	32+4(%rsp),%ecx
1215	movq	64+8(%rsp),%rbx
1216	vaesdec	%xmm0,%xmm3,%xmm3
1217	prefetcht0	31(%r9)
1218	vaesdec	%xmm0,%xmm4,%xmm4
1219	vaesdec	%xmm0,%xmm5,%xmm5
1220	leaq	(%r9,%rbx,1),%rbx
1221	cmovgeq	%rsp,%r9
1222	vaesdec	%xmm0,%xmm6,%xmm6
1223	cmovgq	%rsp,%rbx
1224	vaesdec	%xmm0,%xmm7,%xmm7
1225	subq	%r9,%rbx
1226	vaesdec	%xmm0,%xmm8,%xmm8
1227	vmovdqu	16(%r9),%xmm11
1228	movq	%rbx,64+8(%rsp)
1229	vaesdec	%xmm0,%xmm9,%xmm9
1230	vmovups	-56(%rsi),%xmm0
1231	leaq	16(%r9,%rbx,1),%r9
1232	vmovdqu	%xmm11,144(%rsp)
1233	vaesdec	%xmm1,%xmm2,%xmm2
1234	cmpl	32+8(%rsp),%ecx
1235	movq	64+16(%rsp),%rbx
1236	vaesdec	%xmm1,%xmm3,%xmm3
1237	prefetcht0	31(%r10)
1238	vaesdec	%xmm1,%xmm4,%xmm4
1239	prefetcht0	15(%r8)
1240	vaesdec	%xmm1,%xmm5,%xmm5
1241	leaq	(%r10,%rbx,1),%rbx
1242	cmovgeq	%rsp,%r10
1243	vaesdec	%xmm1,%xmm6,%xmm6
1244	cmovgq	%rsp,%rbx
1245	vaesdec	%xmm1,%xmm7,%xmm7
1246	subq	%r10,%rbx
1247	vaesdec	%xmm1,%xmm8,%xmm8
1248	vmovdqu	16(%r10),%xmm12
1249	movq	%rbx,64+16(%rsp)
1250	vaesdec	%xmm1,%xmm9,%xmm9
1251	vmovups	-40(%rsi),%xmm1
1252	leaq	16(%r10,%rbx,1),%r10
1253	vmovdqu	%xmm12,160(%rsp)
1254	vaesdec	%xmm0,%xmm2,%xmm2
1255	cmpl	32+12(%rsp),%ecx
1256	movq	64+24(%rsp),%rbx
1257	vaesdec	%xmm0,%xmm3,%xmm3
1258	prefetcht0	31(%r11)
1259	vaesdec	%xmm0,%xmm4,%xmm4
1260	prefetcht0	15(%r9)
1261	vaesdec	%xmm0,%xmm5,%xmm5
1262	leaq	(%r11,%rbx,1),%rbx
1263	cmovgeq	%rsp,%r11
1264	vaesdec	%xmm0,%xmm6,%xmm6
1265	cmovgq	%rsp,%rbx
1266	vaesdec	%xmm0,%xmm7,%xmm7
1267	subq	%r11,%rbx
1268	vaesdec	%xmm0,%xmm8,%xmm8
1269	vmovdqu	16(%r11),%xmm13
1270	movq	%rbx,64+24(%rsp)
1271	vaesdec	%xmm0,%xmm9,%xmm9
1272	vmovups	-24(%rsi),%xmm0
1273	leaq	16(%r11,%rbx,1),%r11
1274	vmovdqu	%xmm13,176(%rsp)
1275	vaesdec	%xmm1,%xmm2,%xmm2
1276	cmpl	32+16(%rsp),%ecx
1277	movq	64+32(%rsp),%rbx
1278	vaesdec	%xmm1,%xmm3,%xmm3
1279	prefetcht0	31(%r12)
1280	vaesdec	%xmm1,%xmm4,%xmm4
1281	prefetcht0	15(%r10)
1282	vaesdec	%xmm1,%xmm5,%xmm5
1283	leaq	(%r12,%rbx,1),%rbx
1284	cmovgeq	%rsp,%r12
1285	vaesdec	%xmm1,%xmm6,%xmm6
1286	cmovgq	%rsp,%rbx
1287	vaesdec	%xmm1,%xmm7,%xmm7
1288	subq	%r12,%rbx
1289	vaesdec	%xmm1,%xmm8,%xmm8
1290	vmovdqu	16(%r12),%xmm10
1291	movq	%rbx,64+32(%rsp)
1292	vaesdec	%xmm1,%xmm9,%xmm9
1293	vmovups	-8(%rsi),%xmm1
1294	leaq	16(%r12,%rbx,1),%r12
1295	vaesdec	%xmm0,%xmm2,%xmm2
1296	cmpl	32+20(%rsp),%ecx
1297	movq	64+40(%rsp),%rbx
1298	vaesdec	%xmm0,%xmm3,%xmm3
1299	prefetcht0	31(%r13)
1300	vaesdec	%xmm0,%xmm4,%xmm4
1301	prefetcht0	15(%r11)
1302	vaesdec	%xmm0,%xmm5,%xmm5
1303	leaq	(%rbx,%r13,1),%rbx
1304	cmovgeq	%rsp,%r13
1305	vaesdec	%xmm0,%xmm6,%xmm6
1306	cmovgq	%rsp,%rbx
1307	vaesdec	%xmm0,%xmm7,%xmm7
1308	subq	%r13,%rbx
1309	vaesdec	%xmm0,%xmm8,%xmm8
1310	vmovdqu	16(%r13),%xmm11
1311	movq	%rbx,64+40(%rsp)
1312	vaesdec	%xmm0,%xmm9,%xmm9
1313	vmovups	8(%rsi),%xmm0
1314	leaq	16(%r13,%rbx,1),%r13
1315	vaesdec	%xmm1,%xmm2,%xmm2
1316	cmpl	32+24(%rsp),%ecx
1317	movq	64+48(%rsp),%rbx
1318	vaesdec	%xmm1,%xmm3,%xmm3
1319	prefetcht0	31(%r14)
1320	vaesdec	%xmm1,%xmm4,%xmm4
1321	prefetcht0	15(%r12)
1322	vaesdec	%xmm1,%xmm5,%xmm5
1323	leaq	(%r14,%rbx,1),%rbx
1324	cmovgeq	%rsp,%r14
1325	vaesdec	%xmm1,%xmm6,%xmm6
1326	cmovgq	%rsp,%rbx
1327	vaesdec	%xmm1,%xmm7,%xmm7
1328	subq	%r14,%rbx
1329	vaesdec	%xmm1,%xmm8,%xmm8
1330	vmovdqu	16(%r14),%xmm12
1331	movq	%rbx,64+48(%rsp)
1332	vaesdec	%xmm1,%xmm9,%xmm9
1333	vmovups	24(%rsi),%xmm1
1334	leaq	16(%r14,%rbx,1),%r14
1335	vaesdec	%xmm0,%xmm2,%xmm2
1336	cmpl	32+28(%rsp),%ecx
1337	movq	64+56(%rsp),%rbx
1338	vaesdec	%xmm0,%xmm3,%xmm3
1339	prefetcht0	31(%r15)
1340	vaesdec	%xmm0,%xmm4,%xmm4
1341	prefetcht0	15(%r13)
1342	vaesdec	%xmm0,%xmm5,%xmm5
1343	leaq	(%r15,%rbx,1),%rbx
1344	cmovgeq	%rsp,%r15
1345	vaesdec	%xmm0,%xmm6,%xmm6
1346	cmovgq	%rsp,%rbx
1347	vaesdec	%xmm0,%xmm7,%xmm7
1348	subq	%r15,%rbx
1349	vaesdec	%xmm0,%xmm8,%xmm8
1350	vmovdqu	16(%r15),%xmm13
1351	movq	%rbx,64+56(%rsp)
1352	vaesdec	%xmm0,%xmm9,%xmm9
1353	vmovups	40(%rsi),%xmm0
1354	leaq	16(%r15,%rbx,1),%r15
1355	vmovdqu	32(%rsp),%xmm14
1356	prefetcht0	15(%r14)
1357	prefetcht0	15(%r15)
1358	cmpl	$11,%eax
1359	jb	.Ldec8x_tail
1360
1361	vaesdec	%xmm1,%xmm2,%xmm2
1362	vaesdec	%xmm1,%xmm3,%xmm3
1363	vaesdec	%xmm1,%xmm4,%xmm4
1364	vaesdec	%xmm1,%xmm5,%xmm5
1365	vaesdec	%xmm1,%xmm6,%xmm6
1366	vaesdec	%xmm1,%xmm7,%xmm7
1367	vaesdec	%xmm1,%xmm8,%xmm8
1368	vaesdec	%xmm1,%xmm9,%xmm9
1369	vmovups	176-120(%rsi),%xmm1
1370
1371	vaesdec	%xmm0,%xmm2,%xmm2
1372	vaesdec	%xmm0,%xmm3,%xmm3
1373	vaesdec	%xmm0,%xmm4,%xmm4
1374	vaesdec	%xmm0,%xmm5,%xmm5
1375	vaesdec	%xmm0,%xmm6,%xmm6
1376	vaesdec	%xmm0,%xmm7,%xmm7
1377	vaesdec	%xmm0,%xmm8,%xmm8
1378	vaesdec	%xmm0,%xmm9,%xmm9
1379	vmovups	192-120(%rsi),%xmm0
1380	je	.Ldec8x_tail
1381
1382	vaesdec	%xmm1,%xmm2,%xmm2
1383	vaesdec	%xmm1,%xmm3,%xmm3
1384	vaesdec	%xmm1,%xmm4,%xmm4
1385	vaesdec	%xmm1,%xmm5,%xmm5
1386	vaesdec	%xmm1,%xmm6,%xmm6
1387	vaesdec	%xmm1,%xmm7,%xmm7
1388	vaesdec	%xmm1,%xmm8,%xmm8
1389	vaesdec	%xmm1,%xmm9,%xmm9
1390	vmovups	208-120(%rsi),%xmm1
1391
1392	vaesdec	%xmm0,%xmm2,%xmm2
1393	vaesdec	%xmm0,%xmm3,%xmm3
1394	vaesdec	%xmm0,%xmm4,%xmm4
1395	vaesdec	%xmm0,%xmm5,%xmm5
1396	vaesdec	%xmm0,%xmm6,%xmm6
1397	vaesdec	%xmm0,%xmm7,%xmm7
1398	vaesdec	%xmm0,%xmm8,%xmm8
1399	vaesdec	%xmm0,%xmm9,%xmm9
1400	vmovups	224-120(%rsi),%xmm0
1401
1402.Ldec8x_tail:
1403	vaesdec	%xmm1,%xmm2,%xmm2
1404	vpxor	%xmm15,%xmm15,%xmm15
1405	vaesdec	%xmm1,%xmm3,%xmm3
1406	vaesdec	%xmm1,%xmm4,%xmm4
1407	vpcmpgtd	%xmm15,%xmm14,%xmm15
1408	vaesdec	%xmm1,%xmm5,%xmm5
1409	vaesdec	%xmm1,%xmm6,%xmm6
1410	vpaddd	%xmm14,%xmm15,%xmm15
1411	vmovdqu	48(%rsp),%xmm14
1412	vaesdec	%xmm1,%xmm7,%xmm7
1413	movq	64(%rsp),%rbx
1414	vaesdec	%xmm1,%xmm8,%xmm8
1415	vaesdec	%xmm1,%xmm9,%xmm9
1416	vmovups	16-120(%rsi),%xmm1
1417
1418	vaesdeclast	%xmm0,%xmm2,%xmm2
1419	vmovdqa	%xmm15,32(%rsp)
1420	vpxor	%xmm15,%xmm15,%xmm15
1421	vaesdeclast	%xmm0,%xmm3,%xmm3
1422	vpxor	0(%rbp),%xmm2,%xmm2
1423	vaesdeclast	%xmm0,%xmm4,%xmm4
1424	vpxor	16(%rbp),%xmm3,%xmm3
1425	vpcmpgtd	%xmm15,%xmm14,%xmm15
1426	vaesdeclast	%xmm0,%xmm5,%xmm5
1427	vpxor	32(%rbp),%xmm4,%xmm4
1428	vaesdeclast	%xmm0,%xmm6,%xmm6
1429	vpxor	48(%rbp),%xmm5,%xmm5
1430	vpaddd	%xmm15,%xmm14,%xmm14
1431	vmovdqu	-120(%rsi),%xmm15
1432	vaesdeclast	%xmm0,%xmm7,%xmm7
1433	vpxor	64(%rbp),%xmm6,%xmm6
1434	vaesdeclast	%xmm0,%xmm8,%xmm8
1435	vpxor	80(%rbp),%xmm7,%xmm7
1436	vmovdqa	%xmm14,48(%rsp)
1437	vaesdeclast	%xmm0,%xmm9,%xmm9
1438	vpxor	96(%rbp),%xmm8,%xmm8
1439	vmovups	32-120(%rsi),%xmm0
1440
1441	vmovups	%xmm2,-16(%r8)
1442	subq	%rbx,%r8
1443	vmovdqu	128+0(%rsp),%xmm2
1444	vpxor	112(%rbp),%xmm9,%xmm9
1445	vmovups	%xmm3,-16(%r9)
1446	subq	72(%rsp),%r9
1447	vmovdqu	%xmm2,0(%rbp)
1448	vpxor	%xmm15,%xmm2,%xmm2
1449	vmovdqu	128+16(%rsp),%xmm3
1450	vmovups	%xmm4,-16(%r10)
1451	subq	80(%rsp),%r10
1452	vmovdqu	%xmm3,16(%rbp)
1453	vpxor	%xmm15,%xmm3,%xmm3
1454	vmovdqu	128+32(%rsp),%xmm4
1455	vmovups	%xmm5,-16(%r11)
1456	subq	88(%rsp),%r11
1457	vmovdqu	%xmm4,32(%rbp)
1458	vpxor	%xmm15,%xmm4,%xmm4
1459	vmovdqu	128+48(%rsp),%xmm5
1460	vmovups	%xmm6,-16(%r12)
1461	subq	96(%rsp),%r12
1462	vmovdqu	%xmm5,48(%rbp)
1463	vpxor	%xmm15,%xmm5,%xmm5
1464	vmovdqu	%xmm10,64(%rbp)
1465	vpxor	%xmm10,%xmm15,%xmm6
1466	vmovups	%xmm7,-16(%r13)
1467	subq	104(%rsp),%r13
1468	vmovdqu	%xmm11,80(%rbp)
1469	vpxor	%xmm11,%xmm15,%xmm7
1470	vmovups	%xmm8,-16(%r14)
1471	subq	112(%rsp),%r14
1472	vmovdqu	%xmm12,96(%rbp)
1473	vpxor	%xmm12,%xmm15,%xmm8
1474	vmovups	%xmm9,-16(%r15)
1475	subq	120(%rsp),%r15
1476	vmovdqu	%xmm13,112(%rbp)
1477	vpxor	%xmm13,%xmm15,%xmm9
1478
1479	xorq	$128,%rbp
1480	decl	%edx
1481	jnz	.Loop_dec8x
1482
1483	movq	16(%rsp),%rax
1484.cfi_def_cfa	%rax,8
1485
1486
1487
1488
1489
1490.Ldec8x_done:
1491	vzeroupper
1492	movq	-48(%rax),%r15
1493.cfi_restore	%r15
1494	movq	-40(%rax),%r14
1495.cfi_restore	%r14
1496	movq	-32(%rax),%r13
1497.cfi_restore	%r13
1498	movq	-24(%rax),%r12
1499.cfi_restore	%r12
1500	movq	-16(%rax),%rbp
1501.cfi_restore	%rbp
1502	movq	-8(%rax),%rbx
1503.cfi_restore	%rbx
1504	leaq	(%rax),%rsp
1505.cfi_def_cfa_register	%rsp
1506.Ldec8x_epilogue:
1507	.byte	0xf3,0xc3
1508.cfi_endproc
1509.size	aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
1510