xref: /freebsd/sys/crypto/openssl/amd64/aesni-mb-x86_64.S (revision 0e8011faf58b743cc652e3b2ad0f7671227610df)
1/* Do not modify. This file is auto-generated from aesni-mb-x86_64.pl. */
2.text
3
4
5
/*
 * aesni_multi_cbc_encrypt
 *
 * AES-CBC encryption of several independent streams at once.  This entry
 * point handles four streams ("lanes") per pass with AES-NI instructions
 * emitted as raw opcodes, and hands off to _avx_cbc_enc_shortcut when the
 * dispatch test below allows it.
 *
 * Arguments, as consumed by the code:
 *   %rdi  array of 40-byte stream descriptors, four per group:
 *           +0   qword     input pointer
 *           +8   qword     output pointer
 *           +16  dword     number of 16-byte blocks (signed; <= 0 = idle lane)
 *           +24  16 bytes  chaining value XORed into the first block
 *   %rsi  key schedule: 16-byte round keys starting at offset 0 and a
 *         dword at offset 240 that selects the round count
 *   %edx  number of four-descriptor groups to process
 *
 * Opcode legend (102,15,56 is the 66 0F 38 prefix/escape):
 *   .byte 102,15,56,220,M   aesenc      (220 = 0xDC)
 *   .byte 102,15,56,221,M   aesenclast  (221 = 0xDD)
 *   M = 209/217/225/233 : round key %xmm1 applied to %xmm2/%xmm3/%xmm4/%xmm5
 *   M = 208/216/224/232 : round key %xmm0 applied to %xmm2/%xmm3/%xmm4/%xmm5
 *
 * Frame after the prologue (%rsp is 64-byte aligned):
 *   0(%rsp)   16-byte sink that absorbs stores from finished lanes
 *   16(%rsp)  caller's %rsp; finished lanes also read their dummy input here
 *   24(%rsp)  remaining group count
 *   32(%rsp)  four dword per-lane remaining-block counters
 */
6.globl	aesni_multi_cbc_encrypt
7.type	aesni_multi_cbc_encrypt,@function
8.align	32
9aesni_multi_cbc_encrypt:
10.cfi_startproc
/*
 * Dispatch.  Fewer than two groups (unsigned compare) always takes the
 * four-lane path.  Otherwise bit 28 (268435456 == 1<<28) of the dword at
 * OPENSSL_ia32cap_P+4 selects the eight-lane entry point; presumably this is
 * the CPU's AVX capability flag -- confirm against the OPENSSL_ia32cap docs.
 */
11	cmpl	$2,%edx
12	jb	.Lenc_non_avx
13	movl	OPENSSL_ia32cap_P+4(%rip),%ecx
14	testl	$268435456,%ecx
15	jnz	_avx_cbc_enc_shortcut
16	jmp	.Lenc_non_avx
17.align	16
18.Lenc_non_avx:
/* Prologue: remember the caller's %rsp in %rax, save callee-saved registers. */
19	movq	%rsp,%rax
20.cfi_def_cfa_register	%rax
21	pushq	%rbx
22.cfi_offset	%rbx,-16
23	pushq	%rbp
24.cfi_offset	%rbp,-24
25	pushq	%r12
26.cfi_offset	%r12,-32
27	pushq	%r13
28.cfi_offset	%r13,-40
29	pushq	%r14
30.cfi_offset	%r14,-48
31	pushq	%r15
32.cfi_offset	%r15,-56
33
34
35
36
37
38
/* Reserve at least 48 bytes, align down to 64, park the caller's %rsp. */
39	subq	$48,%rsp
40	andq	$-64,%rsp
41	movq	%rax,16(%rsp)
/* DW_CFA_def_cfa_expression: CFA = *(%rsp + 16) + 8 */
42.cfi_escape	0x0f,0x05,0x77,0x10,0x06,0x23,0x08
43
44.Lenc4x_body:
/*
 * %xmm12 = round key 0.  Both base pointers are biased (+120 on the key
 * schedule, +80 on the descriptor array); every later displacement is
 * written relative to the biased value.
 */
45	movdqu	(%rsi),%xmm12
46	leaq	120(%rsi),%rsi
47	leaq	80(%rdi),%rdi
48
49.Lenc4x_loop_grande:
/*
 * Load one group of four descriptors.
 *   %r8..%r11   input pointers  (replaced by %rsp when the count is <= 0)
 *   %r12..%r15  output pointers
 *   %xmm2..5    chaining values
 *   %edx        signed maximum of the four block counts, floored at 0
 * The testl result is consumed by the cmovleq two instructions later;
 * the movdqu/movl in between do not write flags.
 */
50	movl	%edx,24(%rsp)
51	xorl	%edx,%edx
52
53	movl	-64(%rdi),%ecx
54	movq	-80(%rdi),%r8
55	cmpl	%edx,%ecx
56	movq	-72(%rdi),%r12
57	cmovgl	%ecx,%edx
58	testl	%ecx,%ecx
59
60	movdqu	-56(%rdi),%xmm2
61	movl	%ecx,32(%rsp)
62	cmovleq	%rsp,%r8
63
64	movl	-24(%rdi),%ecx
65	movq	-40(%rdi),%r9
66	cmpl	%edx,%ecx
67	movq	-32(%rdi),%r13
68	cmovgl	%ecx,%edx
69	testl	%ecx,%ecx
70
71	movdqu	-16(%rdi),%xmm3
72	movl	%ecx,36(%rsp)
73	cmovleq	%rsp,%r9
74
75	movl	16(%rdi),%ecx
76	movq	0(%rdi),%r10
77	cmpl	%edx,%ecx
78	movq	8(%rdi),%r14
79	cmovgl	%ecx,%edx
80	testl	%ecx,%ecx
81
82	movdqu	24(%rdi),%xmm4
83	movl	%ecx,40(%rsp)
84	cmovleq	%rsp,%r10
85
86	movl	56(%rdi),%ecx
87	movq	40(%rdi),%r11
88	cmpl	%edx,%ecx
89	movq	48(%rdi),%r15
90	cmovgl	%ecx,%edx
91	testl	%ecx,%ecx
92
93	movdqu	64(%rdi),%xmm5
94	movl	%ecx,44(%rsp)
95	cmovleq	%rsp,%r11
/* Nothing to do in this group: leave (this also ends the outer loop). */
96	testl	%edx,%edx
97	jz	.Lenc4x_done
98
/*
 * Prime the pipeline: %xmm1/%xmm0 = round keys at schedule offsets 16/32,
 * %eax = round-count selector (schedule offset 240), and each lane's state
 * becomes chaining value XOR round key 0 XOR first input block.
 * %xmm10 = the four block counters, %rbx = 0 = byte offset into the streams.
 */
99	movups	16-120(%rsi),%xmm1
100	pxor	%xmm12,%xmm2
101	movups	32-120(%rsi),%xmm0
102	pxor	%xmm12,%xmm3
103	movl	240-120(%rsi),%eax
104	pxor	%xmm12,%xmm4
105	movdqu	(%r8),%xmm6
106	pxor	%xmm12,%xmm5
107	movdqu	(%r9),%xmm7
108	pxor	%xmm6,%xmm2
109	movdqu	(%r10),%xmm8
110	pxor	%xmm7,%xmm3
111	movdqu	(%r11),%xmm9
112	pxor	%xmm8,%xmm4
113	pxor	%xmm9,%xmm5
114	movdqa	32(%rsp),%xmm10
115	xorq	%rbx,%rbx
116	jmp	.Loop_enc4x
117
118.align	32
119.Loop_enc4x:
/*
 * One 16-byte block per lane per iteration.  %rbx is the running byte offset
 * (advanced past the block being encrypted), %ecx is the constant 1 that the
 * per-lane counters are compared against, and %rbp is chosen so that
 * (%rbp,%rbx) == 16(%rsp) and -16(%rbp,%rbx) == 0(%rsp).
 * Per lane: counter <= 1 swaps the input pointer for %rbp (the block in
 * flight is the last one, so the next-block load must stay off the stream);
 * counter <= 0 swaps the output pointer as well.
 * Scalar and prefetch instructions are interleaved with the AES rounds; each
 * cmovcc pair consumes the flags of the cmpl a few lines above it, and the
 * aesenc/movups in between leave flags untouched.
 */
120	addq	$16,%rbx
121	leaq	16(%rsp),%rbp
122	movl	$1,%ecx
123	subq	%rbx,%rbp
124
125.byte	102,15,56,220,209
126	prefetcht0	31(%r8,%rbx,1)	/* next input, lane 0 */
127	prefetcht0	31(%r9,%rbx,1)	/* next input, lane 1 */
128.byte	102,15,56,220,217
129	prefetcht0	31(%r10,%rbx,1)	/* next input, lane 2 */
/*
 * Lane 3 is read through %r11 (see the movdqu (%r11,%rbx,1) in the tail),
 * so that is the pointer to hint here.  prefetcht0 is only a hint and does
 * not fault.  If this file is regenerated, check that aesni-mb-x86_64.pl
 * emits the lane-3 pointer at this position.
 */
130	prefetcht0	31(%r11,%rbx,1)	/* next input, lane 3 */
131.byte	102,15,56,220,225
132.byte	102,15,56,220,233
133	movups	48-120(%rsi),%xmm1
134	cmpl	32(%rsp),%ecx	/* lane 0: 1 vs remaining blocks */
135.byte	102,15,56,220,208
136.byte	102,15,56,220,216
137.byte	102,15,56,220,224
138	cmovgeq	%rbp,%r8
139	cmovgq	%rbp,%r12
140.byte	102,15,56,220,232
141	movups	-56(%rsi),%xmm0
142	cmpl	36(%rsp),%ecx	/* lane 1 */
143.byte	102,15,56,220,209
144.byte	102,15,56,220,217
145.byte	102,15,56,220,225
146	cmovgeq	%rbp,%r9
147	cmovgq	%rbp,%r13
148.byte	102,15,56,220,233
149	movups	-40(%rsi),%xmm1
150	cmpl	40(%rsp),%ecx	/* lane 2 */
151.byte	102,15,56,220,208
152.byte	102,15,56,220,216
153.byte	102,15,56,220,224
154	cmovgeq	%rbp,%r10
155	cmovgq	%rbp,%r14
156.byte	102,15,56,220,232
157	movups	-24(%rsi),%xmm0
158	cmpl	44(%rsp),%ecx	/* lane 3 */
159.byte	102,15,56,220,209
160.byte	102,15,56,220,217
161.byte	102,15,56,220,225
162	cmovgeq	%rbp,%r11
163	cmovgq	%rbp,%r15
164.byte	102,15,56,220,233
165	movups	-8(%rsi),%xmm1
/*
 * Counter update: %xmm11 becomes a per-lane mask (-1 where the counter is
 * > 0) and is added to the counters, so only live lanes are decremented.
 * %xmm12 is borrowed as the zero operand and then reloaded with round key 0.
 * The output prefetches use the (possibly redirected) output pointers.
 */
166	movdqa	%xmm10,%xmm11
167.byte	102,15,56,220,208
168	prefetcht0	15(%r12,%rbx,1)
169	prefetcht0	15(%r13,%rbx,1)
170.byte	102,15,56,220,216
171	prefetcht0	15(%r14,%rbx,1)
172	prefetcht0	15(%r15,%rbx,1)
173.byte	102,15,56,220,224
174.byte	102,15,56,220,232
175	movups	128-120(%rsi),%xmm0
176	pxor	%xmm12,%xmm12
177
178.byte	102,15,56,220,209
179	pcmpgtd	%xmm12,%xmm11
180	movdqu	-120(%rsi),%xmm12
181.byte	102,15,56,220,217
182	paddd	%xmm11,%xmm10
183	movdqa	%xmm10,32(%rsp)
184.byte	102,15,56,220,225
185.byte	102,15,56,220,233
186	movups	144-120(%rsi),%xmm1
187
/*
 * Round-count switch.  The flags of this compare feed both the jb and the
 * je below; nothing in between writes flags.  Selector below 11: 9 aesenc +
 * aesenclast in total; equal to 11: 11 + aesenclast; above 11: 13 +
 * aesenclast.
 */
188	cmpl	$11,%eax
189
190.byte	102,15,56,220,208
191.byte	102,15,56,220,216
192.byte	102,15,56,220,224
193.byte	102,15,56,220,232
194	movups	160-120(%rsi),%xmm0
195
196	jb	.Lenc4x_tail
197
198.byte	102,15,56,220,209
199.byte	102,15,56,220,217
200.byte	102,15,56,220,225
201.byte	102,15,56,220,233
202	movups	176-120(%rsi),%xmm1
203
204.byte	102,15,56,220,208
205.byte	102,15,56,220,216
206.byte	102,15,56,220,224
207.byte	102,15,56,220,232
208	movups	192-120(%rsi),%xmm0
209
210	je	.Lenc4x_tail
211
212.byte	102,15,56,220,209
213.byte	102,15,56,220,217
214.byte	102,15,56,220,225
215.byte	102,15,56,220,233
216	movups	208-120(%rsi),%xmm1
217
218.byte	102,15,56,220,208
219.byte	102,15,56,220,216
220.byte	102,15,56,220,224
221.byte	102,15,56,220,232
222	movups	224-120(%rsi),%xmm0
223	jmp	.Lenc4x_tail
224
225.align	32
226.Lenc4x_tail:
/*
 * Final two rounds (aesenc with %xmm1, aesenclast with %xmm0).  While they
 * run, fetch each lane's next input block and pre-XOR it with round key 0
 * (%xmm12), and reload %xmm1/%xmm0 with the round keys at offsets 16/32 for
 * the next iteration.  After the ciphertext is stored, XORing the prepared
 * value into it gives ciphertext XOR next input XOR round key 0: the CBC
 * chained state with round 0 already applied.
 */
227.byte	102,15,56,220,209
228.byte	102,15,56,220,217
229.byte	102,15,56,220,225
230.byte	102,15,56,220,233
231	movdqu	(%r8,%rbx,1),%xmm6
232	movdqu	16-120(%rsi),%xmm1
233
234.byte	102,15,56,221,208
235	movdqu	(%r9,%rbx,1),%xmm7
236	pxor	%xmm12,%xmm6
237.byte	102,15,56,221,216
238	movdqu	(%r10,%rbx,1),%xmm8
239	pxor	%xmm12,%xmm7
240.byte	102,15,56,221,224
241	movdqu	(%r11,%rbx,1),%xmm9
242	pxor	%xmm12,%xmm8
243.byte	102,15,56,221,232
244	movdqu	32-120(%rsi),%xmm0
245	pxor	%xmm12,%xmm9
246
247	movups	%xmm2,-16(%r12,%rbx,1)
248	pxor	%xmm6,%xmm2
249	movups	%xmm3,-16(%r13,%rbx,1)
250	pxor	%xmm7,%xmm3
251	movups	%xmm4,-16(%r14,%rbx,1)
252	pxor	%xmm8,%xmm4
253	movups	%xmm5,-16(%r15,%rbx,1)
254	pxor	%xmm9,%xmm5
255
/* %edx counts down from the longest lane's block count. */
256	decl	%edx
257	jnz	.Loop_enc4x
258
/*
 * Group finished: %eax held the round selector, so reload the caller's
 * %rsp into %rax, fetch the remaining group count, and step to the next
 * four descriptors (4 * 40 = 160 bytes).
 */
259	movq	16(%rsp),%rax
260.cfi_def_cfa	%rax,8
261	movl	24(%rsp),%edx
262
263
264
265
266
267
268
269
270
271
272
273	leaq	160(%rdi),%rdi
274	decl	%edx
275	jnz	.Lenc4x_loop_grande
276
277.Lenc4x_done:
/* Epilogue: restore callee-saved registers relative to the caller's %rsp. */
278	movq	-48(%rax),%r15
279.cfi_restore	%r15
280	movq	-40(%rax),%r14
281.cfi_restore	%r14
282	movq	-32(%rax),%r13
283.cfi_restore	%r13
284	movq	-24(%rax),%r12
285.cfi_restore	%r12
286	movq	-16(%rax),%rbp
287.cfi_restore	%rbp
288	movq	-8(%rax),%rbx
289.cfi_restore	%rbx
290	leaq	(%rax),%rsp
291.cfi_def_cfa_register	%rsp
292.Lenc4x_epilogue:
293	.byte	0xf3,0xc3	/* rep ret */
294.cfi_endproc
295.size	aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt
296
/*
 * aesni_multi_cbc_decrypt
 *
 * AES-CBC decryption of several independent streams at once.  This entry
 * point handles four streams ("lanes") per pass with AES-NI instructions
 * emitted as raw opcodes, and hands off to _avx_cbc_dec_shortcut when the
 * dispatch test below allows it.
 *
 * Arguments, as consumed by the code:
 *   %rdi  array of 40-byte stream descriptors, four per group:
 *           +0   qword     input pointer
 *           +8   qword     output pointer
 *           +16  dword     number of 16-byte blocks (signed; <= 0 = idle lane)
 *           +24  16 bytes  chaining value for the first block
 *   %rsi  key schedule: 16-byte round keys starting at offset 0 and a
 *         dword at offset 240 that selects the round count
 *   %edx  number of four-descriptor groups to process
 *
 * Opcode legend (102,15,56 is the 66 0F 38 prefix/escape; a 65 inserted
 * after the 102 is a REX.B prefix selecting %xmm8..%xmm15 as the source):
 *   .byte 102,15,56,222,M   aesdec      (222 = 0xDE)
 *   .byte 102,15,56,223,M   aesdeclast  (223 = 0xDF)
 *   M = 209/217/225/233 : round key %xmm1 applied to %xmm2/%xmm3/%xmm4/%xmm5
 *   M = 208/216/224/232 : round key %xmm0 applied to %xmm2/%xmm3/%xmm4/%xmm5
 *   102,15,56,223,214     aesdeclast %xmm6,%xmm2
 *   102,15,56,223,223     aesdeclast %xmm7,%xmm3
 *   102,65,15,56,223,224  aesdeclast %xmm8,%xmm4
 *   102,65,15,56,223,233  aesdeclast %xmm9,%xmm5
 *
 * Frame after the prologue (%rsp is 64-byte aligned):
 *   0(%rsp)   16-byte sink that absorbs stores from finished lanes
 *   16(%rsp)  caller's %rsp; finished lanes also read their dummy input here
 *   24(%rsp)  remaining group count
 *   32(%rsp)  four dword per-lane remaining-block counters
 */
297.globl	aesni_multi_cbc_decrypt
298.type	aesni_multi_cbc_decrypt,@function
299.align	32
300aesni_multi_cbc_decrypt:
301.cfi_startproc
/*
 * Dispatch.  Fewer than two groups (unsigned compare) always takes the
 * four-lane path.  Otherwise bit 28 (268435456 == 1<<28) of the dword at
 * OPENSSL_ia32cap_P+4 selects the eight-lane entry point; presumably this is
 * the CPU's AVX capability flag -- confirm against the OPENSSL_ia32cap docs.
 */
302	cmpl	$2,%edx
303	jb	.Ldec_non_avx
304	movl	OPENSSL_ia32cap_P+4(%rip),%ecx
305	testl	$268435456,%ecx
306	jnz	_avx_cbc_dec_shortcut
307	jmp	.Ldec_non_avx
308.align	16
309.Ldec_non_avx:
/* Prologue: remember the caller's %rsp in %rax, save callee-saved registers. */
310	movq	%rsp,%rax
311.cfi_def_cfa_register	%rax
312	pushq	%rbx
313.cfi_offset	%rbx,-16
314	pushq	%rbp
315.cfi_offset	%rbp,-24
316	pushq	%r12
317.cfi_offset	%r12,-32
318	pushq	%r13
319.cfi_offset	%r13,-40
320	pushq	%r14
321.cfi_offset	%r14,-48
322	pushq	%r15
323.cfi_offset	%r15,-56
324
325
326
327
328
329
/* Reserve at least 48 bytes, align down to 64, park the caller's %rsp. */
330	subq	$48,%rsp
331	andq	$-64,%rsp
332	movq	%rax,16(%rsp)
/* DW_CFA_def_cfa_expression: CFA = *(%rsp + 16) + 8 */
333.cfi_escape	0x0f,0x05,0x77,0x10,0x06,0x23,0x08
334
335.Ldec4x_body:
/*
 * %xmm12 = round key 0.  Both base pointers are biased (+120 on the key
 * schedule, +80 on the descriptor array); every later displacement is
 * written relative to the biased value.
 */
336	movdqu	(%rsi),%xmm12
337	leaq	120(%rsi),%rsi
338	leaq	80(%rdi),%rdi
339
340.Ldec4x_loop_grande:
/*
 * Load one group of four descriptors.
 *   %r8..%r11   input pointers  (replaced by %rsp when the count is <= 0)
 *   %r12..%r15  output pointers
 *   %xmm6..9    chaining values
 *   %edx        signed maximum of the four block counts, floored at 0
 * The testl result is consumed by the cmovleq two instructions later;
 * the movdqu/movl in between do not write flags.
 */
341	movl	%edx,24(%rsp)
342	xorl	%edx,%edx
343
344	movl	-64(%rdi),%ecx
345	movq	-80(%rdi),%r8
346	cmpl	%edx,%ecx
347	movq	-72(%rdi),%r12
348	cmovgl	%ecx,%edx
349	testl	%ecx,%ecx
350
351	movdqu	-56(%rdi),%xmm6
352	movl	%ecx,32(%rsp)
353	cmovleq	%rsp,%r8
354
355	movl	-24(%rdi),%ecx
356	movq	-40(%rdi),%r9
357	cmpl	%edx,%ecx
358	movq	-32(%rdi),%r13
359	cmovgl	%ecx,%edx
360	testl	%ecx,%ecx
361
362	movdqu	-16(%rdi),%xmm7
363	movl	%ecx,36(%rsp)
364	cmovleq	%rsp,%r9
365
366	movl	16(%rdi),%ecx
367	movq	0(%rdi),%r10
368	cmpl	%edx,%ecx
369	movq	8(%rdi),%r14
370	cmovgl	%ecx,%edx
371	testl	%ecx,%ecx
372
373	movdqu	24(%rdi),%xmm8
374	movl	%ecx,40(%rsp)
375	cmovleq	%rsp,%r10
376
377	movl	56(%rdi),%ecx
378	movq	40(%rdi),%r11
379	cmpl	%edx,%ecx
380	movq	48(%rdi),%r15
381	cmovgl	%ecx,%edx
382	testl	%ecx,%ecx
383
384	movdqu	64(%rdi),%xmm9
385	movl	%ecx,44(%rsp)
386	cmovleq	%rsp,%r11
/* Nothing to do in this group: leave (this also ends the outer loop). */
387	testl	%edx,%edx
388	jz	.Ldec4x_done
389
/*
 * Prime the pipeline: %xmm1/%xmm0 = round keys at schedule offsets 16/32,
 * %eax = round-count selector (schedule offset 240), and each lane's state
 * %xmm2..5 = first input block XOR round key 0.
 * %xmm10 = the four block counters, %rbx = 0 = byte offset into the streams.
 */
390	movups	16-120(%rsi),%xmm1
391	movups	32-120(%rsi),%xmm0
392	movl	240-120(%rsi),%eax
393	movdqu	(%r8),%xmm2
394	movdqu	(%r9),%xmm3
395	pxor	%xmm12,%xmm2
396	movdqu	(%r10),%xmm4
397	pxor	%xmm12,%xmm3
398	movdqu	(%r11),%xmm5
399	pxor	%xmm12,%xmm4
400	pxor	%xmm12,%xmm5
401	movdqa	32(%rsp),%xmm10
402	xorq	%rbx,%rbx
403	jmp	.Loop_dec4x
404
405.align	32
406.Loop_dec4x:
/*
 * One 16-byte block per lane per iteration.  %rbx is the running byte offset
 * (advanced past the block being decrypted), %ecx is the constant 1 that the
 * per-lane counters are compared against, and %rbp is chosen so that
 * (%rbp,%rbx) == 16(%rsp) and -16(%rbp,%rbx) == 0(%rsp).
 * Per lane: counter <= 1 swaps the input pointer for %rbp (the block in
 * flight is the last one, so the next-block load must stay off the stream);
 * counter <= 0 swaps the output pointer as well.
 * Scalar and prefetch instructions are interleaved with the AES rounds; each
 * cmovcc pair consumes the flags of the cmpl a few lines above it, and the
 * aesdec/movups in between leave flags untouched.
 */
407	addq	$16,%rbx
408	leaq	16(%rsp),%rbp
409	movl	$1,%ecx
410	subq	%rbx,%rbp
411
412.byte	102,15,56,222,209
413	prefetcht0	31(%r8,%rbx,1)	/* next input, lane 0 */
414	prefetcht0	31(%r9,%rbx,1)	/* next input, lane 1 */
415.byte	102,15,56,222,217
416	prefetcht0	31(%r10,%rbx,1)	/* next input, lane 2 */
417	prefetcht0	31(%r11,%rbx,1)	/* next input, lane 3 */
418.byte	102,15,56,222,225
419.byte	102,15,56,222,233
420	movups	48-120(%rsi),%xmm1
421	cmpl	32(%rsp),%ecx	/* lane 0: 1 vs remaining blocks */
422.byte	102,15,56,222,208
423.byte	102,15,56,222,216
424.byte	102,15,56,222,224
425	cmovgeq	%rbp,%r8
426	cmovgq	%rbp,%r12
427.byte	102,15,56,222,232
428	movups	-56(%rsi),%xmm0
429	cmpl	36(%rsp),%ecx	/* lane 1 */
430.byte	102,15,56,222,209
431.byte	102,15,56,222,217
432.byte	102,15,56,222,225
433	cmovgeq	%rbp,%r9
434	cmovgq	%rbp,%r13
435.byte	102,15,56,222,233
436	movups	-40(%rsi),%xmm1
437	cmpl	40(%rsp),%ecx	/* lane 2 */
438.byte	102,15,56,222,208
439.byte	102,15,56,222,216
440.byte	102,15,56,222,224
441	cmovgeq	%rbp,%r10
442	cmovgq	%rbp,%r14
443.byte	102,15,56,222,232
444	movups	-24(%rsi),%xmm0
445	cmpl	44(%rsp),%ecx	/* lane 3 */
446.byte	102,15,56,222,209
447.byte	102,15,56,222,217
448.byte	102,15,56,222,225
449	cmovgeq	%rbp,%r11
450	cmovgq	%rbp,%r15
451.byte	102,15,56,222,233
452	movups	-8(%rsi),%xmm1
/*
 * Counter update: %xmm11 becomes a per-lane mask (-1 where the counter is
 * > 0) and is added to the counters, so only live lanes are decremented.
 * %xmm12 is borrowed as the zero operand and then reloaded with round key 0.
 * The output prefetches use the (possibly redirected) output pointers.
 */
453	movdqa	%xmm10,%xmm11
454.byte	102,15,56,222,208
455	prefetcht0	15(%r12,%rbx,1)
456	prefetcht0	15(%r13,%rbx,1)
457.byte	102,15,56,222,216
458	prefetcht0	15(%r14,%rbx,1)
459	prefetcht0	15(%r15,%rbx,1)
460.byte	102,15,56,222,224
461.byte	102,15,56,222,232
462	movups	128-120(%rsi),%xmm0
463	pxor	%xmm12,%xmm12
464
465.byte	102,15,56,222,209
466	pcmpgtd	%xmm12,%xmm11
467	movdqu	-120(%rsi),%xmm12
468.byte	102,15,56,222,217
469	paddd	%xmm11,%xmm10
470	movdqa	%xmm10,32(%rsp)
471.byte	102,15,56,222,225
472.byte	102,15,56,222,233
473	movups	144-120(%rsi),%xmm1
474
/*
 * Round-count switch.  The flags of this compare feed both the jb and the
 * je below; nothing in between writes flags.  Selector below 11: 9 aesdec +
 * aesdeclast in total; equal to 11: 11 + aesdeclast; above 11: 13 +
 * aesdeclast.
 */
475	cmpl	$11,%eax
476
477.byte	102,15,56,222,208
478.byte	102,15,56,222,216
479.byte	102,15,56,222,224
480.byte	102,15,56,222,232
481	movups	160-120(%rsi),%xmm0
482
483	jb	.Ldec4x_tail
484
485.byte	102,15,56,222,209
486.byte	102,15,56,222,217
487.byte	102,15,56,222,225
488.byte	102,15,56,222,233
489	movups	176-120(%rsi),%xmm1
490
491.byte	102,15,56,222,208
492.byte	102,15,56,222,216
493.byte	102,15,56,222,224
494.byte	102,15,56,222,232
495	movups	192-120(%rsi),%xmm0
496
497	je	.Ldec4x_tail
498
499.byte	102,15,56,222,209
500.byte	102,15,56,222,217
501.byte	102,15,56,222,225
502.byte	102,15,56,222,233
503	movups	208-120(%rsi),%xmm1
504
505.byte	102,15,56,222,208
506.byte	102,15,56,222,216
507.byte	102,15,56,222,224
508.byte	102,15,56,222,232
509	movups	224-120(%rsi),%xmm0
510	jmp	.Ldec4x_tail
511
512.align	32
513.Ldec4x_tail:
/*
 * Last aesdec round with %xmm1.  The final round key (%xmm0) is XORed into
 * the chaining values %xmm6..9, which are then used as the aesdeclast round
 * key, so the last key addition and the CBC XOR happen in one instruction.
 * %xmm1/%xmm0 are reloaded with the round keys at offsets 16/32 for the
 * next iteration.
 */
514.byte	102,15,56,222,209
515.byte	102,15,56,222,217
516.byte	102,15,56,222,225
517	pxor	%xmm0,%xmm6
518	pxor	%xmm0,%xmm7
519.byte	102,15,56,222,233
520	movdqu	16-120(%rsi),%xmm1
521	pxor	%xmm0,%xmm8
522	pxor	%xmm0,%xmm9
523	movdqu	32-120(%rsi),%xmm0
524
/*
 * %xmm6..9 are reloaded with the input block just consumed (the next
 * chaining value).  These loads come before the output stores below, so the
 * block is captured before an output that aliases the input can overwrite it.
 */
525.byte	102,15,56,223,214
526.byte	102,15,56,223,223
527	movdqu	-16(%r8,%rbx,1),%xmm6
528	movdqu	-16(%r9,%rbx,1),%xmm7
529.byte	102,65,15,56,223,224
530.byte	102,65,15,56,223,233
531	movdqu	-16(%r10,%rbx,1),%xmm8
532	movdqu	-16(%r11,%rbx,1),%xmm9
533
/* Store the output blocks; load the next input blocks XOR round key 0. */
534	movups	%xmm2,-16(%r12,%rbx,1)
535	movdqu	(%r8,%rbx,1),%xmm2
536	movups	%xmm3,-16(%r13,%rbx,1)
537	movdqu	(%r9,%rbx,1),%xmm3
538	pxor	%xmm12,%xmm2
539	movups	%xmm4,-16(%r14,%rbx,1)
540	movdqu	(%r10,%rbx,1),%xmm4
541	pxor	%xmm12,%xmm3
542	movups	%xmm5,-16(%r15,%rbx,1)
543	movdqu	(%r11,%rbx,1),%xmm5
544	pxor	%xmm12,%xmm4
545	pxor	%xmm12,%xmm5
546
/* %edx counts down from the longest lane's block count. */
547	decl	%edx
548	jnz	.Loop_dec4x
549
/*
 * Group finished: %eax held the round selector, so reload the caller's
 * %rsp into %rax, fetch the remaining group count, and step to the next
 * four descriptors (4 * 40 = 160 bytes).
 */
550	movq	16(%rsp),%rax
551.cfi_def_cfa	%rax,8
552	movl	24(%rsp),%edx
553
554	leaq	160(%rdi),%rdi
555	decl	%edx
556	jnz	.Ldec4x_loop_grande
557
558.Ldec4x_done:
/* Epilogue: restore callee-saved registers relative to the caller's %rsp. */
559	movq	-48(%rax),%r15
560.cfi_restore	%r15
561	movq	-40(%rax),%r14
562.cfi_restore	%r14
563	movq	-32(%rax),%r13
564.cfi_restore	%r13
565	movq	-24(%rax),%r12
566.cfi_restore	%r12
567	movq	-16(%rax),%rbp
568.cfi_restore	%rbp
569	movq	-8(%rax),%rbx
570.cfi_restore	%rbx
571	leaq	(%rax),%rsp
572.cfi_def_cfa_register	%rsp
573.Ldec4x_epilogue:
574	.byte	0xf3,0xc3	/* rep ret */
575.cfi_endproc
576.size	aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
/*
 * aesni_multi_cbc_encrypt_avx (file-local; entered through the
 * _avx_cbc_enc_shortcut label from aesni_multi_cbc_encrypt)
 *
 * AES-CBC encryption of eight independent streams ("lanes") at once using
 * VEX-encoded AES instructions, one 16-byte block per lane per iteration.
 *
 * Arguments, as consumed by the code:
 *   %rdi  array of eight 40-byte stream descriptors:
 *           +0   qword     input pointer
 *           +8   qword     output pointer
 *           +16  dword     number of 16-byte blocks (signed; <= 0 = idle lane)
 *           +24  16 bytes  chaining value XORed into the first block
 *   %rsi  key schedule: 16-byte round keys starting at offset 0 and a
 *         dword at offset 240 that selects the round count
 *   %edx  group count from the caller (see the NOTE at .Lenc8x_body)
 *
 * Frame after the prologue (%rsp is 128-byte aligned):
 *   0(%rsp)     16-byte sink that absorbs stores from finished lanes
 *   16(%rsp)    caller's %rsp; finished lanes also read dummy input here
 *   32(%rsp)    eight dword per-lane remaining-block counters
 *   64(%rsp)    eight qword (output - input) pointer differences
 *   128(%rsp)   four 16-byte slots (addressed through %rbp) holding the
 *               prepared next input of lanes 0..3
 *
 * Register roles inside the loop:
 *   %r8..%r15   per-lane stream pointer (input pointer at loop top, output
 *               pointer + 16 between a lane's bookkeeping and the tail)
 *   %xmm2..9    per-lane cipher state
 *   %xmm10..13  prepared next input (lanes 0..3 spilled, 4..7 kept live)
 *   %xmm14      counter scratch, %xmm15 round key 0 / zero scratch
 *   %xmm1,%xmm0 alternating round keys, %eax round selector, %ecx constant 1
 */
577.type	aesni_multi_cbc_encrypt_avx,@function
578.align	32
579aesni_multi_cbc_encrypt_avx:
580.cfi_startproc
581_avx_cbc_enc_shortcut:
/* Prologue: remember the caller's %rsp in %rax, save callee-saved registers. */
582	movq	%rsp,%rax
583.cfi_def_cfa_register	%rax
584	pushq	%rbx
585.cfi_offset	%rbx,-16
586	pushq	%rbp
587.cfi_offset	%rbp,-24
588	pushq	%r12
589.cfi_offset	%r12,-32
590	pushq	%r13
591.cfi_offset	%r13,-40
592	pushq	%r14
593.cfi_offset	%r14,-48
594	pushq	%r15
595.cfi_offset	%r15,-56
596
597
598
599
600
601
602
603
/* Reserve at least 192 bytes, align down to 128, park the caller's %rsp. */
604	subq	$192,%rsp
605	andq	$-128,%rsp
606	movq	%rax,16(%rsp)
/* DW_CFA_def_cfa_expression: CFA = *(%rsp + 16) + 8 */
607.cfi_escape	0x0f,0x05,0x77,0x10,0x06,0x23,0x08
608
609.Lenc8x_body:
/*
 * %xmm15 = round key 0.  Both base pointers are biased (+120 on the key
 * schedule, +160 on the descriptor array); every later displacement is
 * written relative to the biased value.
 * NOTE(review): the halved group count produced by the shrl is overwritten
 * by the xorl right after the label below, and nothing in this function
 * branches back to .Lenc8x_loop_grande, so a single group of eight
 * descriptors is processed per call.
 */
610	vzeroupper
611	vmovdqu	(%rsi),%xmm15
612	leaq	120(%rsi),%rsi
613	leaq	160(%rdi),%rdi
614	shrl	$1,%edx
615
616.Lenc8x_loop_grande:
617
/*
 * Load the eight descriptors.  Per lane: the input pointer goes to
 * %r8..%r15 (replaced by %rsp when the count is <= 0), the chaining value to
 * %xmm2..%xmm9, the count to its slot at 32(%rsp), and output - input to
 * its slot at 64(%rsp).  %edx accumulates the signed maximum count, floored
 * at 0.  Lane 0 computes its difference in %rbx, which stays live into the
 * loop; the other lanes use %rbp as a temporary.
 */
618	xorl	%edx,%edx
619
/* lane 0 */
620	movl	-144(%rdi),%ecx
621
622	movq	-160(%rdi),%r8
623	cmpl	%edx,%ecx
624
625	movq	-152(%rdi),%rbx
626	cmovgl	%ecx,%edx
627	testl	%ecx,%ecx
628
629	vmovdqu	-136(%rdi),%xmm2
630	movl	%ecx,32(%rsp)
631	cmovleq	%rsp,%r8
632	subq	%r8,%rbx
633	movq	%rbx,64(%rsp)
634
/* lane 1 */
635	movl	-104(%rdi),%ecx
636
637	movq	-120(%rdi),%r9
638	cmpl	%edx,%ecx
639
640	movq	-112(%rdi),%rbp
641	cmovgl	%ecx,%edx
642	testl	%ecx,%ecx
643
644	vmovdqu	-96(%rdi),%xmm3
645	movl	%ecx,36(%rsp)
646	cmovleq	%rsp,%r9
647	subq	%r9,%rbp
648	movq	%rbp,72(%rsp)
649
/* lane 2 */
650	movl	-64(%rdi),%ecx
651
652	movq	-80(%rdi),%r10
653	cmpl	%edx,%ecx
654
655	movq	-72(%rdi),%rbp
656	cmovgl	%ecx,%edx
657	testl	%ecx,%ecx
658
659	vmovdqu	-56(%rdi),%xmm4
660	movl	%ecx,40(%rsp)
661	cmovleq	%rsp,%r10
662	subq	%r10,%rbp
663	movq	%rbp,80(%rsp)
664
/* lane 3 */
665	movl	-24(%rdi),%ecx
666
667	movq	-40(%rdi),%r11
668	cmpl	%edx,%ecx
669
670	movq	-32(%rdi),%rbp
671	cmovgl	%ecx,%edx
672	testl	%ecx,%ecx
673
674	vmovdqu	-16(%rdi),%xmm5
675	movl	%ecx,44(%rsp)
676	cmovleq	%rsp,%r11
677	subq	%r11,%rbp
678	movq	%rbp,88(%rsp)
679
/* lane 4 */
680	movl	16(%rdi),%ecx
681
682	movq	0(%rdi),%r12
683	cmpl	%edx,%ecx
684
685	movq	8(%rdi),%rbp
686	cmovgl	%ecx,%edx
687	testl	%ecx,%ecx
688
689	vmovdqu	24(%rdi),%xmm6
690	movl	%ecx,48(%rsp)
691	cmovleq	%rsp,%r12
692	subq	%r12,%rbp
693	movq	%rbp,96(%rsp)
694
/* lane 5 */
695	movl	56(%rdi),%ecx
696
697	movq	40(%rdi),%r13
698	cmpl	%edx,%ecx
699
700	movq	48(%rdi),%rbp
701	cmovgl	%ecx,%edx
702	testl	%ecx,%ecx
703
704	vmovdqu	64(%rdi),%xmm7
705	movl	%ecx,52(%rsp)
706	cmovleq	%rsp,%r13
707	subq	%r13,%rbp
708	movq	%rbp,104(%rsp)
709
/* lane 6 */
710	movl	96(%rdi),%ecx
711
712	movq	80(%rdi),%r14
713	cmpl	%edx,%ecx
714
715	movq	88(%rdi),%rbp
716	cmovgl	%ecx,%edx
717	testl	%ecx,%ecx
718
719	vmovdqu	104(%rdi),%xmm8
720	movl	%ecx,56(%rsp)
721	cmovleq	%rsp,%r14
722	subq	%r14,%rbp
723	movq	%rbp,112(%rsp)
724
/* lane 7 */
725	movl	136(%rdi),%ecx
726
727	movq	120(%rdi),%r15
728	cmpl	%edx,%ecx
729
730	movq	128(%rdi),%rbp
731	cmovgl	%ecx,%edx
732	testl	%ecx,%ecx
733
734	vmovdqu	144(%rdi),%xmm9
735	movl	%ecx,60(%rsp)
736	cmovleq	%rsp,%r15
737	subq	%r15,%rbp
738	movq	%rbp,120(%rsp)
/* All eight lanes idle: nothing to do.  %rax still holds the caller's %rsp. */
739	testl	%edx,%edx
740	jz	.Lenc8x_done
741
/*
 * Prime the pipeline: %xmm1/%xmm0 = round keys at schedule offsets 16/32,
 * %eax = round-count selector (schedule offset 240), %rbp = base of the
 * four spill slots at 128(%rsp), %ecx = constant 1.  Each lane's state
 * becomes chaining value XOR round key 0 XOR first input block.
 */
742	vmovups	16-120(%rsi),%xmm1
743	vmovups	32-120(%rsi),%xmm0
744	movl	240-120(%rsi),%eax
745
746	vpxor	(%r8),%xmm15,%xmm10
747	leaq	128(%rsp),%rbp
748	vpxor	(%r9),%xmm15,%xmm11
749	vpxor	(%r10),%xmm15,%xmm12
750	vpxor	(%r11),%xmm15,%xmm13
751	vpxor	%xmm10,%xmm2,%xmm2
752	vpxor	(%r12),%xmm15,%xmm10
753	vpxor	%xmm11,%xmm3,%xmm3
754	vpxor	(%r13),%xmm15,%xmm11
755	vpxor	%xmm12,%xmm4,%xmm4
756	vpxor	(%r14),%xmm15,%xmm12
757	vpxor	%xmm13,%xmm5,%xmm5
758	vpxor	(%r15),%xmm15,%xmm13
759	vpxor	%xmm10,%xmm6,%xmm6
760	movl	$1,%ecx
761	vpxor	%xmm11,%xmm7,%xmm7
762	vpxor	%xmm12,%xmm8,%xmm8
763	vpxor	%xmm13,%xmm9,%xmm9
764	jmp	.Loop_enc8x
765
766.align	32
767.Loop_enc8x:
/*
 * Each of the first eight AES rounds is interleaved with the bookkeeping
 * of one lane.  For lane i (pointer P, difference D loaded into %rbx):
 *   cmpl   counter_i,%ecx     1 vs remaining blocks
 *   leaq   (P,D),%rbx         %rbx = output pointer
 *   cmovge %rsp -> P          counter <= 1: take next input from the frame
 *   cmovg  %rsp -> %rbx       counter <= 0: send output to the frame too
 *   subq   P,%rbx             new D = output - input, written back to slot i
 *   vpxor  16(P),%xmm15,...   prepared next input = next block XOR key 0
 *   leaq   16(P,D),P          P = output pointer + 16 (used by the tail)
 * The vaesenc/prefetch/lea instructions between each cmpl and its cmovcc
 * pair leave flags untouched.  Lane 0 enters with D already in %rbx (from
 * the descriptor load on the first pass, reloaded in the tail afterwards).
 */
/* round key at offset 16; lane 0 bookkeeping */
768	vaesenc	%xmm1,%xmm2,%xmm2
769	cmpl	32+0(%rsp),%ecx
770	vaesenc	%xmm1,%xmm3,%xmm3
771	prefetcht0	31(%r8)
772	vaesenc	%xmm1,%xmm4,%xmm4
773	vaesenc	%xmm1,%xmm5,%xmm5
774	leaq	(%r8,%rbx,1),%rbx
775	cmovgeq	%rsp,%r8
776	vaesenc	%xmm1,%xmm6,%xmm6
777	cmovgq	%rsp,%rbx
778	vaesenc	%xmm1,%xmm7,%xmm7
779	subq	%r8,%rbx
780	vaesenc	%xmm1,%xmm8,%xmm8
781	vpxor	16(%r8),%xmm15,%xmm10
782	movq	%rbx,64+0(%rsp)
783	vaesenc	%xmm1,%xmm9,%xmm9
784	vmovups	-72(%rsi),%xmm1
785	leaq	16(%r8,%rbx,1),%r8
786	vmovdqu	%xmm10,0(%rbp)
/* round key at offset 32; lane 1 bookkeeping */
787	vaesenc	%xmm0,%xmm2,%xmm2
788	cmpl	32+4(%rsp),%ecx
789	movq	64+8(%rsp),%rbx
790	vaesenc	%xmm0,%xmm3,%xmm3
791	prefetcht0	31(%r9)
792	vaesenc	%xmm0,%xmm4,%xmm4
793	vaesenc	%xmm0,%xmm5,%xmm5
794	leaq	(%r9,%rbx,1),%rbx
795	cmovgeq	%rsp,%r9
796	vaesenc	%xmm0,%xmm6,%xmm6
797	cmovgq	%rsp,%rbx
798	vaesenc	%xmm0,%xmm7,%xmm7
799	subq	%r9,%rbx
800	vaesenc	%xmm0,%xmm8,%xmm8
801	vpxor	16(%r9),%xmm15,%xmm11
802	movq	%rbx,64+8(%rsp)
803	vaesenc	%xmm0,%xmm9,%xmm9
804	vmovups	-56(%rsi),%xmm0
805	leaq	16(%r9,%rbx,1),%r9
806	vmovdqu	%xmm11,16(%rbp)
/* round key at offset 48; lane 2 bookkeeping */
807	vaesenc	%xmm1,%xmm2,%xmm2
808	cmpl	32+8(%rsp),%ecx
809	movq	64+16(%rsp),%rbx
810	vaesenc	%xmm1,%xmm3,%xmm3
811	prefetcht0	31(%r10)
812	vaesenc	%xmm1,%xmm4,%xmm4
813	prefetcht0	15(%r8)
814	vaesenc	%xmm1,%xmm5,%xmm5
815	leaq	(%r10,%rbx,1),%rbx
816	cmovgeq	%rsp,%r10
817	vaesenc	%xmm1,%xmm6,%xmm6
818	cmovgq	%rsp,%rbx
819	vaesenc	%xmm1,%xmm7,%xmm7
820	subq	%r10,%rbx
821	vaesenc	%xmm1,%xmm8,%xmm8
822	vpxor	16(%r10),%xmm15,%xmm12
823	movq	%rbx,64+16(%rsp)
824	vaesenc	%xmm1,%xmm9,%xmm9
825	vmovups	-40(%rsi),%xmm1
826	leaq	16(%r10,%rbx,1),%r10
827	vmovdqu	%xmm12,32(%rbp)
/* round key at offset 64; lane 3 bookkeeping */
828	vaesenc	%xmm0,%xmm2,%xmm2
829	cmpl	32+12(%rsp),%ecx
830	movq	64+24(%rsp),%rbx
831	vaesenc	%xmm0,%xmm3,%xmm3
832	prefetcht0	31(%r11)
833	vaesenc	%xmm0,%xmm4,%xmm4
834	prefetcht0	15(%r9)
835	vaesenc	%xmm0,%xmm5,%xmm5
836	leaq	(%r11,%rbx,1),%rbx
837	cmovgeq	%rsp,%r11
838	vaesenc	%xmm0,%xmm6,%xmm6
839	cmovgq	%rsp,%rbx
840	vaesenc	%xmm0,%xmm7,%xmm7
841	subq	%r11,%rbx
842	vaesenc	%xmm0,%xmm8,%xmm8
843	vpxor	16(%r11),%xmm15,%xmm13
844	movq	%rbx,64+24(%rsp)
845	vaesenc	%xmm0,%xmm9,%xmm9
846	vmovups	-24(%rsi),%xmm0
847	leaq	16(%r11,%rbx,1),%r11
848	vmovdqu	%xmm13,48(%rbp)
/*
 * round key at offset 80; lane 4 bookkeeping.  From here on the prepared
 * next input stays in %xmm10..%xmm13 instead of being spilled.
 */
849	vaesenc	%xmm1,%xmm2,%xmm2
850	cmpl	32+16(%rsp),%ecx
851	movq	64+32(%rsp),%rbx
852	vaesenc	%xmm1,%xmm3,%xmm3
853	prefetcht0	31(%r12)
854	vaesenc	%xmm1,%xmm4,%xmm4
855	prefetcht0	15(%r10)
856	vaesenc	%xmm1,%xmm5,%xmm5
857	leaq	(%r12,%rbx,1),%rbx
858	cmovgeq	%rsp,%r12
859	vaesenc	%xmm1,%xmm6,%xmm6
860	cmovgq	%rsp,%rbx
861	vaesenc	%xmm1,%xmm7,%xmm7
862	subq	%r12,%rbx
863	vaesenc	%xmm1,%xmm8,%xmm8
864	vpxor	16(%r12),%xmm15,%xmm10
865	movq	%rbx,64+32(%rsp)
866	vaesenc	%xmm1,%xmm9,%xmm9
867	vmovups	-8(%rsi),%xmm1
868	leaq	16(%r12,%rbx,1),%r12
/* round key at offset 96; lane 5 bookkeeping */
869	vaesenc	%xmm0,%xmm2,%xmm2
870	cmpl	32+20(%rsp),%ecx
871	movq	64+40(%rsp),%rbx
872	vaesenc	%xmm0,%xmm3,%xmm3
873	prefetcht0	31(%r13)
874	vaesenc	%xmm0,%xmm4,%xmm4
875	prefetcht0	15(%r11)
876	vaesenc	%xmm0,%xmm5,%xmm5
877	leaq	(%rbx,%r13,1),%rbx
878	cmovgeq	%rsp,%r13
879	vaesenc	%xmm0,%xmm6,%xmm6
880	cmovgq	%rsp,%rbx
881	vaesenc	%xmm0,%xmm7,%xmm7
882	subq	%r13,%rbx
883	vaesenc	%xmm0,%xmm8,%xmm8
884	vpxor	16(%r13),%xmm15,%xmm11
885	movq	%rbx,64+40(%rsp)
886	vaesenc	%xmm0,%xmm9,%xmm9
887	vmovups	8(%rsi),%xmm0
888	leaq	16(%r13,%rbx,1),%r13
/* round key at offset 112; lane 6 bookkeeping */
889	vaesenc	%xmm1,%xmm2,%xmm2
890	cmpl	32+24(%rsp),%ecx
891	movq	64+48(%rsp),%rbx
892	vaesenc	%xmm1,%xmm3,%xmm3
893	prefetcht0	31(%r14)
894	vaesenc	%xmm1,%xmm4,%xmm4
895	prefetcht0	15(%r12)
896	vaesenc	%xmm1,%xmm5,%xmm5
897	leaq	(%r14,%rbx,1),%rbx
898	cmovgeq	%rsp,%r14
899	vaesenc	%xmm1,%xmm6,%xmm6
900	cmovgq	%rsp,%rbx
901	vaesenc	%xmm1,%xmm7,%xmm7
902	subq	%r14,%rbx
903	vaesenc	%xmm1,%xmm8,%xmm8
904	vpxor	16(%r14),%xmm15,%xmm12
905	movq	%rbx,64+48(%rsp)
906	vaesenc	%xmm1,%xmm9,%xmm9
907	vmovups	24(%rsi),%xmm1
908	leaq	16(%r14,%rbx,1),%r14
/* round key at offset 128; lane 7 bookkeeping */
909	vaesenc	%xmm0,%xmm2,%xmm2
910	cmpl	32+28(%rsp),%ecx
911	movq	64+56(%rsp),%rbx
912	vaesenc	%xmm0,%xmm3,%xmm3
913	prefetcht0	31(%r15)
914	vaesenc	%xmm0,%xmm4,%xmm4
915	prefetcht0	15(%r13)
916	vaesenc	%xmm0,%xmm5,%xmm5
917	leaq	(%r15,%rbx,1),%rbx
918	cmovgeq	%rsp,%r15
919	vaesenc	%xmm0,%xmm6,%xmm6
920	cmovgq	%rsp,%rbx
921	vaesenc	%xmm0,%xmm7,%xmm7
922	subq	%r15,%rbx
923	vaesenc	%xmm0,%xmm8,%xmm8
924	vpxor	16(%r15),%xmm15,%xmm13
925	movq	%rbx,64+56(%rsp)
926	vaesenc	%xmm0,%xmm9,%xmm9
927	vmovups	40(%rsi),%xmm0
928	leaq	16(%r15,%rbx,1),%r15
/*
 * %xmm14 = counters of lanes 0..3 (updated in the tail).
 * Round-count switch: the flags of the cmpl feed both the jb here and the
 * je further down; only vaesenc/vmovups sit in between.  Selector below 11:
 * 9 vaesenc + vaesenclast in total; equal to 11: 11 + last; above 11: 13 +
 * last.  At this point %xmm1/%xmm0 hold the round keys at offsets 144/160.
 */
929	vmovdqu	32(%rsp),%xmm14
930	prefetcht0	15(%r14)
931	prefetcht0	15(%r15)
932	cmpl	$11,%eax
933	jb	.Lenc8x_tail
934
935	vaesenc	%xmm1,%xmm2,%xmm2
936	vaesenc	%xmm1,%xmm3,%xmm3
937	vaesenc	%xmm1,%xmm4,%xmm4
938	vaesenc	%xmm1,%xmm5,%xmm5
939	vaesenc	%xmm1,%xmm6,%xmm6
940	vaesenc	%xmm1,%xmm7,%xmm7
941	vaesenc	%xmm1,%xmm8,%xmm8
942	vaesenc	%xmm1,%xmm9,%xmm9
943	vmovups	176-120(%rsi),%xmm1
944
945	vaesenc	%xmm0,%xmm2,%xmm2
946	vaesenc	%xmm0,%xmm3,%xmm3
947	vaesenc	%xmm0,%xmm4,%xmm4
948	vaesenc	%xmm0,%xmm5,%xmm5
949	vaesenc	%xmm0,%xmm6,%xmm6
950	vaesenc	%xmm0,%xmm7,%xmm7
951	vaesenc	%xmm0,%xmm8,%xmm8
952	vaesenc	%xmm0,%xmm9,%xmm9
953	vmovups	192-120(%rsi),%xmm0
954	je	.Lenc8x_tail
955
956	vaesenc	%xmm1,%xmm2,%xmm2
957	vaesenc	%xmm1,%xmm3,%xmm3
958	vaesenc	%xmm1,%xmm4,%xmm4
959	vaesenc	%xmm1,%xmm5,%xmm5
960	vaesenc	%xmm1,%xmm6,%xmm6
961	vaesenc	%xmm1,%xmm7,%xmm7
962	vaesenc	%xmm1,%xmm8,%xmm8
963	vaesenc	%xmm1,%xmm9,%xmm9
964	vmovups	208-120(%rsi),%xmm1
965
966	vaesenc	%xmm0,%xmm2,%xmm2
967	vaesenc	%xmm0,%xmm3,%xmm3
968	vaesenc	%xmm0,%xmm4,%xmm4
969	vaesenc	%xmm0,%xmm5,%xmm5
970	vaesenc	%xmm0,%xmm6,%xmm6
971	vaesenc	%xmm0,%xmm7,%xmm7
972	vaesenc	%xmm0,%xmm8,%xmm8
973	vaesenc	%xmm0,%xmm9,%xmm9
974	vmovups	224-120(%rsi),%xmm0
975
976.Lenc8x_tail:
/*
 * Last vaesenc round, interleaved with the counter update for lanes 0..3:
 * %xmm15 is zeroed, turned into a mask (-1 where the counter is > 0) and
 * added to the counters, so only live lanes are decremented.  %rbx is
 * reloaded with lane 0's pointer difference and %xmm1 with the round key
 * at offset 16 for the next iteration.
 */
977	vaesenc	%xmm1,%xmm2,%xmm2
978	vpxor	%xmm15,%xmm15,%xmm15
979	vaesenc	%xmm1,%xmm3,%xmm3
980	vaesenc	%xmm1,%xmm4,%xmm4
981	vpcmpgtd	%xmm15,%xmm14,%xmm15
982	vaesenc	%xmm1,%xmm5,%xmm5
983	vaesenc	%xmm1,%xmm6,%xmm6
984	vpaddd	%xmm14,%xmm15,%xmm15
985	vmovdqu	48(%rsp),%xmm14
986	vaesenc	%xmm1,%xmm7,%xmm7
987	movq	64(%rsp),%rbx
988	vaesenc	%xmm1,%xmm8,%xmm8
989	vaesenc	%xmm1,%xmm9,%xmm9
990	vmovups	16-120(%rsi),%xmm1
991
/*
 * vaesenclast, interleaved with the same counter update for lanes 4..7
 * (counters at 48(%rsp)).  Afterwards %xmm15 is reloaded with round key 0
 * and %xmm0 with the round key at offset 32.
 */
992	vaesenclast	%xmm0,%xmm2,%xmm2
993	vmovdqa	%xmm15,32(%rsp)
994	vpxor	%xmm15,%xmm15,%xmm15
995	vaesenclast	%xmm0,%xmm3,%xmm3
996	vaesenclast	%xmm0,%xmm4,%xmm4
997	vpcmpgtd	%xmm15,%xmm14,%xmm15
998	vaesenclast	%xmm0,%xmm5,%xmm5
999	vaesenclast	%xmm0,%xmm6,%xmm6
1000	vpaddd	%xmm15,%xmm14,%xmm14
1001	vmovdqu	-120(%rsi),%xmm15
1002	vaesenclast	%xmm0,%xmm7,%xmm7
1003	vaesenclast	%xmm0,%xmm8,%xmm8
1004	vmovdqa	%xmm14,48(%rsp)
1005	vaesenclast	%xmm0,%xmm9,%xmm9
1006	vmovups	32-120(%rsi),%xmm0
1007
/*
 * Per lane: store the ciphertext at (output pointer + 16) - 16, subtract
 * the pointer difference so the register is an input pointer again, and
 * XOR the prepared next input (spill slot for lanes 0..3, %xmm10..13 for
 * lanes 4..7) into the ciphertext to form the next CBC state.
 */
1008	vmovups	%xmm2,-16(%r8)
1009	subq	%rbx,%r8
1010	vpxor	0(%rbp),%xmm2,%xmm2
1011	vmovups	%xmm3,-16(%r9)
1012	subq	72(%rsp),%r9
1013	vpxor	16(%rbp),%xmm3,%xmm3
1014	vmovups	%xmm4,-16(%r10)
1015	subq	80(%rsp),%r10
1016	vpxor	32(%rbp),%xmm4,%xmm4
1017	vmovups	%xmm5,-16(%r11)
1018	subq	88(%rsp),%r11
1019	vpxor	48(%rbp),%xmm5,%xmm5
1020	vmovups	%xmm6,-16(%r12)
1021	subq	96(%rsp),%r12
1022	vpxor	%xmm10,%xmm6,%xmm6
1023	vmovups	%xmm7,-16(%r13)
1024	subq	104(%rsp),%r13
1025	vpxor	%xmm11,%xmm7,%xmm7
1026	vmovups	%xmm8,-16(%r14)
1027	subq	112(%rsp),%r14
1028	vpxor	%xmm12,%xmm8,%xmm8
1029	vmovups	%xmm9,-16(%r15)
1030	subq	120(%rsp),%r15
1031	vpxor	%xmm13,%xmm9,%xmm9
1032
/* %edx counts down from the longest lane's block count. */
1033	decl	%edx
1034	jnz	.Loop_enc8x
1035
/* %eax held the round selector; reload the caller's %rsp into %rax. */
1036	movq	16(%rsp),%rax
1037.cfi_def_cfa	%rax,8
1038
1039
1040
1041
1042
1043.Lenc8x_done:
/* Epilogue: restore callee-saved registers relative to the caller's %rsp. */
1044	vzeroupper
1045	movq	-48(%rax),%r15
1046.cfi_restore	%r15
1047	movq	-40(%rax),%r14
1048.cfi_restore	%r14
1049	movq	-32(%rax),%r13
1050.cfi_restore	%r13
1051	movq	-24(%rax),%r12
1052.cfi_restore	%r12
1053	movq	-16(%rax),%rbp
1054.cfi_restore	%rbp
1055	movq	-8(%rax),%rbx
1056.cfi_restore	%rbx
1057	leaq	(%rax),%rsp
1058.cfi_def_cfa_register	%rsp
1059.Lenc8x_epilogue:
1060	.byte	0xf3,0xc3	/* rep ret */
1061.cfi_endproc
1062.size	aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
1063
1064.type	aesni_multi_cbc_decrypt_avx,@function
1065.align	32
1066aesni_multi_cbc_decrypt_avx:
1067.cfi_startproc
1068_avx_cbc_dec_shortcut:
1069	movq	%rsp,%rax
1070.cfi_def_cfa_register	%rax
1071	pushq	%rbx
1072.cfi_offset	%rbx,-16
1073	pushq	%rbp
1074.cfi_offset	%rbp,-24
1075	pushq	%r12
1076.cfi_offset	%r12,-32
1077	pushq	%r13
1078.cfi_offset	%r13,-40
1079	pushq	%r14
1080.cfi_offset	%r14,-48
1081	pushq	%r15
1082.cfi_offset	%r15,-56
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092	subq	$256,%rsp
1093	andq	$-256,%rsp
1094	subq	$192,%rsp
1095	movq	%rax,16(%rsp)
1096.cfi_escape	0x0f,0x05,0x77,0x10,0x06,0x23,0x08
1097
1098.Ldec8x_body:
1099	vzeroupper
1100	vmovdqu	(%rsi),%xmm15
1101	leaq	120(%rsi),%rsi
1102	leaq	160(%rdi),%rdi
1103	shrl	$1,%edx
1104
1105.Ldec8x_loop_grande:
1106
1107	xorl	%edx,%edx
1108
1109	movl	-144(%rdi),%ecx
1110
1111	movq	-160(%rdi),%r8
1112	cmpl	%edx,%ecx
1113
1114	movq	-152(%rdi),%rbx
1115	cmovgl	%ecx,%edx
1116	testl	%ecx,%ecx
1117
1118	vmovdqu	-136(%rdi),%xmm2
1119	movl	%ecx,32(%rsp)
1120	cmovleq	%rsp,%r8
1121	subq	%r8,%rbx
1122	movq	%rbx,64(%rsp)
1123	vmovdqu	%xmm2,192(%rsp)
1124
1125	movl	-104(%rdi),%ecx
1126
1127	movq	-120(%rdi),%r9
1128	cmpl	%edx,%ecx
1129
1130	movq	-112(%rdi),%rbp
1131	cmovgl	%ecx,%edx
1132	testl	%ecx,%ecx
1133
1134	vmovdqu	-96(%rdi),%xmm3
1135	movl	%ecx,36(%rsp)
1136	cmovleq	%rsp,%r9
1137	subq	%r9,%rbp
1138	movq	%rbp,72(%rsp)
1139	vmovdqu	%xmm3,208(%rsp)
1140
1141	movl	-64(%rdi),%ecx
1142
1143	movq	-80(%rdi),%r10
1144	cmpl	%edx,%ecx
1145
1146	movq	-72(%rdi),%rbp
1147	cmovgl	%ecx,%edx
1148	testl	%ecx,%ecx
1149
1150	vmovdqu	-56(%rdi),%xmm4
1151	movl	%ecx,40(%rsp)
1152	cmovleq	%rsp,%r10
1153	subq	%r10,%rbp
1154	movq	%rbp,80(%rsp)
1155	vmovdqu	%xmm4,224(%rsp)
1156
1157	movl	-24(%rdi),%ecx
1158
1159	movq	-40(%rdi),%r11
1160	cmpl	%edx,%ecx
1161
1162	movq	-32(%rdi),%rbp
1163	cmovgl	%ecx,%edx
1164	testl	%ecx,%ecx
1165
1166	vmovdqu	-16(%rdi),%xmm5
1167	movl	%ecx,44(%rsp)
1168	cmovleq	%rsp,%r11
1169	subq	%r11,%rbp
1170	movq	%rbp,88(%rsp)
1171	vmovdqu	%xmm5,240(%rsp)
1172
1173	movl	16(%rdi),%ecx
1174
1175	movq	0(%rdi),%r12
1176	cmpl	%edx,%ecx
1177
1178	movq	8(%rdi),%rbp
1179	cmovgl	%ecx,%edx
1180	testl	%ecx,%ecx
1181
1182	vmovdqu	24(%rdi),%xmm6
1183	movl	%ecx,48(%rsp)
1184	cmovleq	%rsp,%r12
1185	subq	%r12,%rbp
1186	movq	%rbp,96(%rsp)
1187	vmovdqu	%xmm6,256(%rsp)
1188
1189	movl	56(%rdi),%ecx
1190
1191	movq	40(%rdi),%r13
1192	cmpl	%edx,%ecx
1193
1194	movq	48(%rdi),%rbp
1195	cmovgl	%ecx,%edx
1196	testl	%ecx,%ecx
1197
1198	vmovdqu	64(%rdi),%xmm7
1199	movl	%ecx,52(%rsp)
1200	cmovleq	%rsp,%r13
1201	subq	%r13,%rbp
1202	movq	%rbp,104(%rsp)
1203	vmovdqu	%xmm7,272(%rsp)
1204
1205	movl	96(%rdi),%ecx
1206
1207	movq	80(%rdi),%r14
1208	cmpl	%edx,%ecx
1209
1210	movq	88(%rdi),%rbp
1211	cmovgl	%ecx,%edx
1212	testl	%ecx,%ecx
1213
1214	vmovdqu	104(%rdi),%xmm8
1215	movl	%ecx,56(%rsp)
1216	cmovleq	%rsp,%r14
1217	subq	%r14,%rbp
1218	movq	%rbp,112(%rsp)
1219	vmovdqu	%xmm8,288(%rsp)
1220
1221	movl	136(%rdi),%ecx
1222
1223	movq	120(%rdi),%r15
1224	cmpl	%edx,%ecx
1225
1226	movq	128(%rdi),%rbp
1227	cmovgl	%ecx,%edx
1228	testl	%ecx,%ecx
1229
1230	vmovdqu	144(%rdi),%xmm9
1231	movl	%ecx,60(%rsp)
1232	cmovleq	%rsp,%r15
1233	subq	%r15,%rbp
1234	movq	%rbp,120(%rsp)
1235	vmovdqu	%xmm9,304(%rsp)
1236	testl	%edx,%edx
1237	jz	.Ldec8x_done
1238
1239	vmovups	16-120(%rsi),%xmm1
1240	vmovups	32-120(%rsi),%xmm0
1241	movl	240-120(%rsi),%eax
1242	leaq	192+128(%rsp),%rbp
1243
1244	vmovdqu	(%r8),%xmm2
1245	vmovdqu	(%r9),%xmm3
1246	vmovdqu	(%r10),%xmm4
1247	vmovdqu	(%r11),%xmm5
1248	vmovdqu	(%r12),%xmm6
1249	vmovdqu	(%r13),%xmm7
1250	vmovdqu	(%r14),%xmm8
1251	vmovdqu	(%r15),%xmm9
1252	vmovdqu	%xmm2,0(%rbp)
1253	vpxor	%xmm15,%xmm2,%xmm2
1254	vmovdqu	%xmm3,16(%rbp)
1255	vpxor	%xmm15,%xmm3,%xmm3
1256	vmovdqu	%xmm4,32(%rbp)
1257	vpxor	%xmm15,%xmm4,%xmm4
1258	vmovdqu	%xmm5,48(%rbp)
1259	vpxor	%xmm15,%xmm5,%xmm5
1260	vmovdqu	%xmm6,64(%rbp)
1261	vpxor	%xmm15,%xmm6,%xmm6
1262	vmovdqu	%xmm7,80(%rbp)
1263	vpxor	%xmm15,%xmm7,%xmm7
1264	vmovdqu	%xmm8,96(%rbp)
1265	vpxor	%xmm15,%xmm8,%xmm8
1266	vmovdqu	%xmm9,112(%rbp)
1267	vpxor	%xmm15,%xmm9,%xmm9
1268	xorq	$0x80,%rbp
1269	movl	$1,%ecx
1270	jmp	.Loop_dec8x
1271
1272.align	32
1273.Loop_dec8x:
1274	vaesdec	%xmm1,%xmm2,%xmm2
1275	cmpl	32+0(%rsp),%ecx
1276	vaesdec	%xmm1,%xmm3,%xmm3
1277	prefetcht0	31(%r8)
1278	vaesdec	%xmm1,%xmm4,%xmm4
1279	vaesdec	%xmm1,%xmm5,%xmm5
1280	leaq	(%r8,%rbx,1),%rbx
1281	cmovgeq	%rsp,%r8
1282	vaesdec	%xmm1,%xmm6,%xmm6
1283	cmovgq	%rsp,%rbx
1284	vaesdec	%xmm1,%xmm7,%xmm7
1285	subq	%r8,%rbx
1286	vaesdec	%xmm1,%xmm8,%xmm8
1287	vmovdqu	16(%r8),%xmm10
1288	movq	%rbx,64+0(%rsp)
1289	vaesdec	%xmm1,%xmm9,%xmm9
1290	vmovups	-72(%rsi),%xmm1
1291	leaq	16(%r8,%rbx,1),%r8
1292	vmovdqu	%xmm10,128(%rsp)
1293	vaesdec	%xmm0,%xmm2,%xmm2
1294	cmpl	32+4(%rsp),%ecx
1295	movq	64+8(%rsp),%rbx
1296	vaesdec	%xmm0,%xmm3,%xmm3
1297	prefetcht0	31(%r9)
1298	vaesdec	%xmm0,%xmm4,%xmm4
1299	vaesdec	%xmm0,%xmm5,%xmm5
1300	leaq	(%r9,%rbx,1),%rbx
1301	cmovgeq	%rsp,%r9
1302	vaesdec	%xmm0,%xmm6,%xmm6
1303	cmovgq	%rsp,%rbx
1304	vaesdec	%xmm0,%xmm7,%xmm7
1305	subq	%r9,%rbx
1306	vaesdec	%xmm0,%xmm8,%xmm8
1307	vmovdqu	16(%r9),%xmm11
1308	movq	%rbx,64+8(%rsp)
1309	vaesdec	%xmm0,%xmm9,%xmm9
1310	vmovups	-56(%rsi),%xmm0
1311	leaq	16(%r9,%rbx,1),%r9
1312	vmovdqu	%xmm11,144(%rsp)
1313	vaesdec	%xmm1,%xmm2,%xmm2
1314	cmpl	32+8(%rsp),%ecx
1315	movq	64+16(%rsp),%rbx
1316	vaesdec	%xmm1,%xmm3,%xmm3
1317	prefetcht0	31(%r10)
1318	vaesdec	%xmm1,%xmm4,%xmm4
1319	prefetcht0	15(%r8)
1320	vaesdec	%xmm1,%xmm5,%xmm5
1321	leaq	(%r10,%rbx,1),%rbx
1322	cmovgeq	%rsp,%r10
1323	vaesdec	%xmm1,%xmm6,%xmm6
1324	cmovgq	%rsp,%rbx
1325	vaesdec	%xmm1,%xmm7,%xmm7
1326	subq	%r10,%rbx
1327	vaesdec	%xmm1,%xmm8,%xmm8
1328	vmovdqu	16(%r10),%xmm12
1329	movq	%rbx,64+16(%rsp)
1330	vaesdec	%xmm1,%xmm9,%xmm9
1331	vmovups	-40(%rsi),%xmm1
1332	leaq	16(%r10,%rbx,1),%r10
1333	vmovdqu	%xmm12,160(%rsp)
1334	vaesdec	%xmm0,%xmm2,%xmm2
1335	cmpl	32+12(%rsp),%ecx
1336	movq	64+24(%rsp),%rbx
1337	vaesdec	%xmm0,%xmm3,%xmm3
1338	prefetcht0	31(%r11)
1339	vaesdec	%xmm0,%xmm4,%xmm4
1340	prefetcht0	15(%r9)
1341	vaesdec	%xmm0,%xmm5,%xmm5
1342	leaq	(%r11,%rbx,1),%rbx
1343	cmovgeq	%rsp,%r11
1344	vaesdec	%xmm0,%xmm6,%xmm6
1345	cmovgq	%rsp,%rbx
1346	vaesdec	%xmm0,%xmm7,%xmm7
1347	subq	%r11,%rbx
1348	vaesdec	%xmm0,%xmm8,%xmm8
1349	vmovdqu	16(%r11),%xmm13
1350	movq	%rbx,64+24(%rsp)
1351	vaesdec	%xmm0,%xmm9,%xmm9
1352	vmovups	-24(%rsi),%xmm0
1353	leaq	16(%r11,%rbx,1),%r11
1354	vmovdqu	%xmm13,176(%rsp)
1355	vaesdec	%xmm1,%xmm2,%xmm2
1356	cmpl	32+16(%rsp),%ecx
1357	movq	64+32(%rsp),%rbx
1358	vaesdec	%xmm1,%xmm3,%xmm3
1359	prefetcht0	31(%r12)
1360	vaesdec	%xmm1,%xmm4,%xmm4
1361	prefetcht0	15(%r10)
1362	vaesdec	%xmm1,%xmm5,%xmm5
1363	leaq	(%r12,%rbx,1),%rbx
1364	cmovgeq	%rsp,%r12
1365	vaesdec	%xmm1,%xmm6,%xmm6
1366	cmovgq	%rsp,%rbx
1367	vaesdec	%xmm1,%xmm7,%xmm7
1368	subq	%r12,%rbx
1369	vaesdec	%xmm1,%xmm8,%xmm8
1370	vmovdqu	16(%r12),%xmm10
1371	movq	%rbx,64+32(%rsp)
1372	vaesdec	%xmm1,%xmm9,%xmm9
1373	vmovups	-8(%rsi),%xmm1
1374	leaq	16(%r12,%rbx,1),%r12
1375	vaesdec	%xmm0,%xmm2,%xmm2
1376	cmpl	32+20(%rsp),%ecx
1377	movq	64+40(%rsp),%rbx
1378	vaesdec	%xmm0,%xmm3,%xmm3
1379	prefetcht0	31(%r13)
1380	vaesdec	%xmm0,%xmm4,%xmm4
1381	prefetcht0	15(%r11)
1382	vaesdec	%xmm0,%xmm5,%xmm5
1383	leaq	(%rbx,%r13,1),%rbx
1384	cmovgeq	%rsp,%r13
1385	vaesdec	%xmm0,%xmm6,%xmm6
1386	cmovgq	%rsp,%rbx
1387	vaesdec	%xmm0,%xmm7,%xmm7
1388	subq	%r13,%rbx
1389	vaesdec	%xmm0,%xmm8,%xmm8
1390	vmovdqu	16(%r13),%xmm11
1391	movq	%rbx,64+40(%rsp)
1392	vaesdec	%xmm0,%xmm9,%xmm9
1393	vmovups	8(%rsi),%xmm0
1394	leaq	16(%r13,%rbx,1),%r13
1395	vaesdec	%xmm1,%xmm2,%xmm2
1396	cmpl	32+24(%rsp),%ecx
1397	movq	64+48(%rsp),%rbx
1398	vaesdec	%xmm1,%xmm3,%xmm3
1399	prefetcht0	31(%r14)
1400	vaesdec	%xmm1,%xmm4,%xmm4
1401	prefetcht0	15(%r12)
1402	vaesdec	%xmm1,%xmm5,%xmm5
1403	leaq	(%r14,%rbx,1),%rbx
1404	cmovgeq	%rsp,%r14
1405	vaesdec	%xmm1,%xmm6,%xmm6
1406	cmovgq	%rsp,%rbx
1407	vaesdec	%xmm1,%xmm7,%xmm7
1408	subq	%r14,%rbx
1409	vaesdec	%xmm1,%xmm8,%xmm8
1410	vmovdqu	16(%r14),%xmm12
1411	movq	%rbx,64+48(%rsp)
1412	vaesdec	%xmm1,%xmm9,%xmm9
1413	vmovups	24(%rsi),%xmm1
1414	leaq	16(%r14,%rbx,1),%r14
1415	vaesdec	%xmm0,%xmm2,%xmm2
1416	cmpl	32+28(%rsp),%ecx
1417	movq	64+56(%rsp),%rbx
1418	vaesdec	%xmm0,%xmm3,%xmm3
1419	prefetcht0	31(%r15)
1420	vaesdec	%xmm0,%xmm4,%xmm4
1421	prefetcht0	15(%r13)
1422	vaesdec	%xmm0,%xmm5,%xmm5
1423	leaq	(%r15,%rbx,1),%rbx
1424	cmovgeq	%rsp,%r15
1425	vaesdec	%xmm0,%xmm6,%xmm6
1426	cmovgq	%rsp,%rbx
1427	vaesdec	%xmm0,%xmm7,%xmm7
1428	subq	%r15,%rbx
1429	vaesdec	%xmm0,%xmm8,%xmm8
1430	vmovdqu	16(%r15),%xmm13
1431	movq	%rbx,64+56(%rsp)
1432	vaesdec	%xmm0,%xmm9,%xmm9
1433	vmovups	40(%rsi),%xmm0
1434	leaq	16(%r15,%rbx,1),%r15
1435	vmovdqu	32(%rsp),%xmm14
1436	prefetcht0	15(%r14)
1437	prefetcht0	15(%r15)
1438	cmpl	$11,%eax
1439	jb	.Ldec8x_tail
1440
1441	vaesdec	%xmm1,%xmm2,%xmm2
1442	vaesdec	%xmm1,%xmm3,%xmm3
1443	vaesdec	%xmm1,%xmm4,%xmm4
1444	vaesdec	%xmm1,%xmm5,%xmm5
1445	vaesdec	%xmm1,%xmm6,%xmm6
1446	vaesdec	%xmm1,%xmm7,%xmm7
1447	vaesdec	%xmm1,%xmm8,%xmm8
1448	vaesdec	%xmm1,%xmm9,%xmm9
1449	vmovups	176-120(%rsi),%xmm1
1450
1451	vaesdec	%xmm0,%xmm2,%xmm2
1452	vaesdec	%xmm0,%xmm3,%xmm3
1453	vaesdec	%xmm0,%xmm4,%xmm4
1454	vaesdec	%xmm0,%xmm5,%xmm5
1455	vaesdec	%xmm0,%xmm6,%xmm6
1456	vaesdec	%xmm0,%xmm7,%xmm7
1457	vaesdec	%xmm0,%xmm8,%xmm8
1458	vaesdec	%xmm0,%xmm9,%xmm9
1459	vmovups	192-120(%rsi),%xmm0
1460	je	.Ldec8x_tail
1461
1462	vaesdec	%xmm1,%xmm2,%xmm2
1463	vaesdec	%xmm1,%xmm3,%xmm3
1464	vaesdec	%xmm1,%xmm4,%xmm4
1465	vaesdec	%xmm1,%xmm5,%xmm5
1466	vaesdec	%xmm1,%xmm6,%xmm6
1467	vaesdec	%xmm1,%xmm7,%xmm7
1468	vaesdec	%xmm1,%xmm8,%xmm8
1469	vaesdec	%xmm1,%xmm9,%xmm9
1470	vmovups	208-120(%rsi),%xmm1
1471
1472	vaesdec	%xmm0,%xmm2,%xmm2
1473	vaesdec	%xmm0,%xmm3,%xmm3
1474	vaesdec	%xmm0,%xmm4,%xmm4
1475	vaesdec	%xmm0,%xmm5,%xmm5
1476	vaesdec	%xmm0,%xmm6,%xmm6
1477	vaesdec	%xmm0,%xmm7,%xmm7
1478	vaesdec	%xmm0,%xmm8,%xmm8
1479	vaesdec	%xmm0,%xmm9,%xmm9
1480	vmovups	224-120(%rsi),%xmm0
1481
1482.Ldec8x_tail:
1483	vaesdec	%xmm1,%xmm2,%xmm2
1484	vpxor	%xmm15,%xmm15,%xmm15
1485	vaesdec	%xmm1,%xmm3,%xmm3
1486	vaesdec	%xmm1,%xmm4,%xmm4
1487	vpcmpgtd	%xmm15,%xmm14,%xmm15
1488	vaesdec	%xmm1,%xmm5,%xmm5
1489	vaesdec	%xmm1,%xmm6,%xmm6
1490	vpaddd	%xmm14,%xmm15,%xmm15
1491	vmovdqu	48(%rsp),%xmm14
1492	vaesdec	%xmm1,%xmm7,%xmm7
1493	movq	64(%rsp),%rbx
1494	vaesdec	%xmm1,%xmm8,%xmm8
1495	vaesdec	%xmm1,%xmm9,%xmm9
1496	vmovups	16-120(%rsi),%xmm1
1497
1498	vaesdeclast	%xmm0,%xmm2,%xmm2
1499	vmovdqa	%xmm15,32(%rsp)
1500	vpxor	%xmm15,%xmm15,%xmm15
1501	vaesdeclast	%xmm0,%xmm3,%xmm3
1502	vpxor	0(%rbp),%xmm2,%xmm2
1503	vaesdeclast	%xmm0,%xmm4,%xmm4
1504	vpxor	16(%rbp),%xmm3,%xmm3
1505	vpcmpgtd	%xmm15,%xmm14,%xmm15
1506	vaesdeclast	%xmm0,%xmm5,%xmm5
1507	vpxor	32(%rbp),%xmm4,%xmm4
1508	vaesdeclast	%xmm0,%xmm6,%xmm6
1509	vpxor	48(%rbp),%xmm5,%xmm5
1510	vpaddd	%xmm15,%xmm14,%xmm14
1511	vmovdqu	-120(%rsi),%xmm15
1512	vaesdeclast	%xmm0,%xmm7,%xmm7
1513	vpxor	64(%rbp),%xmm6,%xmm6
1514	vaesdeclast	%xmm0,%xmm8,%xmm8
1515	vpxor	80(%rbp),%xmm7,%xmm7
1516	vmovdqa	%xmm14,48(%rsp)
1517	vaesdeclast	%xmm0,%xmm9,%xmm9
1518	vpxor	96(%rbp),%xmm8,%xmm8
1519	vmovups	32-120(%rsi),%xmm0
1520
1521	vmovups	%xmm2,-16(%r8)
1522	subq	%rbx,%r8
1523	vmovdqu	128+0(%rsp),%xmm2
1524	vpxor	112(%rbp),%xmm9,%xmm9
1525	vmovups	%xmm3,-16(%r9)
1526	subq	72(%rsp),%r9
1527	vmovdqu	%xmm2,0(%rbp)
1528	vpxor	%xmm15,%xmm2,%xmm2
1529	vmovdqu	128+16(%rsp),%xmm3
1530	vmovups	%xmm4,-16(%r10)
1531	subq	80(%rsp),%r10
1532	vmovdqu	%xmm3,16(%rbp)
1533	vpxor	%xmm15,%xmm3,%xmm3
1534	vmovdqu	128+32(%rsp),%xmm4
1535	vmovups	%xmm5,-16(%r11)
1536	subq	88(%rsp),%r11
1537	vmovdqu	%xmm4,32(%rbp)
1538	vpxor	%xmm15,%xmm4,%xmm4
1539	vmovdqu	128+48(%rsp),%xmm5
1540	vmovups	%xmm6,-16(%r12)
1541	subq	96(%rsp),%r12
1542	vmovdqu	%xmm5,48(%rbp)
1543	vpxor	%xmm15,%xmm5,%xmm5
1544	vmovdqu	%xmm10,64(%rbp)
1545	vpxor	%xmm10,%xmm15,%xmm6
1546	vmovups	%xmm7,-16(%r13)
1547	subq	104(%rsp),%r13
1548	vmovdqu	%xmm11,80(%rbp)
1549	vpxor	%xmm11,%xmm15,%xmm7
1550	vmovups	%xmm8,-16(%r14)
1551	subq	112(%rsp),%r14
1552	vmovdqu	%xmm12,96(%rbp)
1553	vpxor	%xmm12,%xmm15,%xmm8
1554	vmovups	%xmm9,-16(%r15)
1555	subq	120(%rsp),%r15
1556	vmovdqu	%xmm13,112(%rbp)
1557	vpxor	%xmm13,%xmm15,%xmm9
1558
1559	xorq	$128,%rbp
1560	decl	%edx
1561	jnz	.Loop_dec8x
1562
1563	movq	16(%rsp),%rax
1564.cfi_def_cfa	%rax,8
1565
1566
1567
1568
1569
1570.Ldec8x_done:
1571	vzeroupper
1572	movq	-48(%rax),%r15
1573.cfi_restore	%r15
1574	movq	-40(%rax),%r14
1575.cfi_restore	%r14
1576	movq	-32(%rax),%r13
1577.cfi_restore	%r13
1578	movq	-24(%rax),%r12
1579.cfi_restore	%r12
1580	movq	-16(%rax),%rbp
1581.cfi_restore	%rbp
1582	movq	-8(%rax),%rbx
1583.cfi_restore	%rbx
1584	leaq	(%rax),%rsp
1585.cfi_def_cfa_register	%rsp
1586.Ldec8x_epilogue:
1587	.byte	0xf3,0xc3
1588.cfi_endproc
1589.size	aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
1590	.section ".note.gnu.property", "a"	# "a" = allocatable section
	# A single ELF note record follows: a three-word header (name size,
	# descriptor size, note type), the owner name, then the descriptor.
	# Both sizes are differences of the numeric local labels 0..4 below,
	# so the assembler derives them from the bytes actually emitted.
1591	.p2align 3	# begin the note on an 8-byte boundary
1592	.long 1f - 0f	# name size: bytes between labels 0 and 1 (the four .byte lines)
1593	.long 4f - 1f	# descriptor size: bytes between labels 1 and 4, trailing padding included
1594	.long 5	# note type; 5 is NT_GNU_PROPERTY_TYPE_0 per the Linux gABI extensions
15950:	# start of the owner name
1596	# "GNU" encoded with .byte, since .asciz isn't supported
1597	# on Solaris.
1598	.byte 0x47	# 'G'
1599	.byte 0x4e	# 'N'
1600	.byte 0x55	# 'U'
1601	.byte 0	# NUL terminator, counted in the name size
16021:	# end of name / start of descriptor
1603	.p2align 3	# descriptor is kept 8-byte aligned
1604	.long 0xc0000002	# property type; GNU_PROPERTY_X86_FEATURE_1_AND per the x86-64 psABI
1605	.long 3f - 2f	# property data size: bytes between labels 2 and 3 (one .long)
16062:	# start of property data
1607	.long 3	# bits 0 and 1 set; the psABI names these IBT and SHSTK (CET markers)
16083:	# end of property data
1609	.p2align 3	# pad the property to an 8-byte multiple
16104:	# end of descriptor (after padding)
1611