/* xref: /freebsd/sys/crypto/openssl/amd64/chacha-x86_64.S (revision c0855eaa3ee9614804b6bd6a255aa9f71e095f43) */
/* Do not modify. This file is auto-generated from chacha-x86_64.pl. */
.text

/*
 * Constant pool for the ChaCha20 code in this file.
 *
 * The pool is emitted into .text.  The first group starts on a 64-byte
 * boundary and every labelled entry in it is 16 or 32 bytes long, so each
 * label is at least 16-byte aligned; the second group is re-aligned to 64
 * and built from 64-byte entries.  That alignment is what allows the code
 * to load entries with movdqa.
 *
 * Entries referenced by the code visible in this part of the file:
 *   .Lone    - {1,0,0,0}: adds 1 to lane 0 (the block counter word)
 *   .Linc    - {0,1,2,3}: per-lane counter offsets
 *   .Lfour   - {4,4,4,4}: counter step of four
 *   .Lrot16  - pshufb mask: rotates each 32-bit lane left by 16 bits
 *   .Lrot24  - pshufb mask: rotates each 32-bit lane left by 8 bits
 *              (the same as right by 24)
 *   .Lsigma  - first state row, ASCII "expand 32-byte k" plus a NUL
 * The remaining entries (.Lzero, .Lincy, .Leight, .Ltwoy, .Lzeroz,
 * .Lfourz, .Lincz, .Lsixteen) are not referenced in the visible part;
 * presumably they serve wider-vector code paths further down the file.
 */
.align	64
.Lzero:
.long	0,0,0,0
.Lone:
.long	1,0,0,0
.Linc:
.long	0,1,2,3
.Lfour:
.long	4,4,4,4
.Lincy:
.long	0,2,4,6,1,3,5,7
.Leight:
.long	8,8,8,8,8,8,8,8
.Lrot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Ltwoy:
.long	2,0,0,0, 2,0,0,0
.align	64
.Lzeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.Lsigma:
/* ASCII "expand 32-byte k", NUL-terminated */
.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
/* ASCII "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated */
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
/*
 * ChaCha20_ctr32 - XOR a buffer with ChaCha20 keystream using only the
 * integer registers, or dispatch to the SIMD code when available.
 *
 * ABI:  System V AMD64.
 * In:   rdi = output buffer
 *       rsi = input buffer
 *       rdx = length in bytes (0 returns immediately)
 *       rcx = 32 bytes of key material (state words 4-11)
 *       r8  = 16 bytes supplying state words 12-15; word 12 is used as a
 *             32-bit block counter and is advanced by one per 64-byte
 *             block.  The caller's copy is only read, never written back.
 * Out:  no return value is set up.
 * Clobbers: rax, rcx, rdx, rsi, rdi, r8-r11, xmm0-xmm4, flags.
 *           rbx, rbp and r12-r15 are saved and restored.
 *
 * Stack frame after the prologue (rsp is 16-byte aligned: 8 bytes of
 * return address + 48 bytes of pushes + 88 bytes of locals):
 *    0(%rsp)..15   keystream words 0-3 (written on the tail path only)
 *   16(%rsp)..31   original state words 4-7
 *   32(%rsp)..47   spill area for working words 8-11
 *   48(%rsp)..63   original state words 12-15 (counter in word 12)
 *   64(%rsp)       remaining length
 *   72(%rsp)       input pointer
 *   80(%rsp)       output pointer
 *
 * Register roles while the rounds run:
 *   eax,ebx,ecx,edx = words 0-3     r8d-r11d  = words 4-7
 *   esi,edi         = two of words 8-11 (the other two live in the
 *                     spill area)   r12d-r15d = words 12-15
 *   ebp             = double-round counter
 *   xmm2 = original words 8-11, xmm3 = words 12-15 for the current
 *   block, xmm4 = {1,0,0,0}
 */
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,@function
.align	64
ChaCha20_ctr32:
.cfi_startproc
	cmpq	$0,%rdx
	je	.Lno_data			/* zero length: plain return */
	/* r10 keeps these capability bits live for the SIMD entry points */
	movq	OPENSSL_ia32cap_P+4(%rip),%r10
	testl	$512,%r10d			/* bit 9 set: use the SSSE3 path */
	jnz	.LChaCha20_ssse3

	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
	subq	$64+24,%rsp
.cfi_adjust_cfa_offset	64+24
/* .Lctr32_body is not referenced anywhere in the visible part of the file */
.Lctr32_body:


	movdqu	(%rcx),%xmm1			/* state words 4-7 */
	movdqu	16(%rcx),%xmm2			/* state words 8-11 */
	movdqu	(%r8),%xmm3			/* state words 12-15 */
	movdqa	.Lone(%rip),%xmm4		/* counter increment {1,0,0,0} */


	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	%rdx,%rbp			/* rbp = bytes remaining */
	jmp	.Loop_outer

/* One pass per 64-byte keystream block. */
.align	32
.Loop_outer:
	/* words 0-3: "expand 32-byte k" as four little-endian dwords */
	movl	$0x61707865,%eax
	movl	$0x3320646e,%ebx
	movl	$0x79622d32,%ecx
	movl	$0x6b206574,%edx
	movl	16(%rsp),%r8d
	movl	20(%rsp),%r9d
	movl	24(%rsp),%r10d
	movl	28(%rsp),%r11d
	movd	%xmm3,%r12d			/* word 12: current block counter */
	movl	52(%rsp),%r13d
	movl	56(%rsp),%r14d
	movl	60(%rsp),%r15d

	/* rbp, rsi and rdi are reused below, so park their values first */
	movq	%rbp,64+0(%rsp)
	movl	$10,%ebp			/* 10 double rounds */
	movq	%rsi,64+8(%rsp)
.byte	102,72,15,126,214			/* movq %xmm2,%rsi: words 8,9 */
	movq	%rdi,64+16(%rsp)
	movq	%rsi,%rdi
	shrq	$32,%rdi			/* esi = word 8, edi = word 9 */
	jmp	.Loop

/*
 * One double round per iteration.  Each group of 24 instructions below
 * interleaves two quarter-rounds of the form
 *   a += b; d ^= a; d <<<= 16;  c += d; b ^= c; b <<<= 12;
 *   a += b; d ^= a; d <<<= 8;   c += d; b ^= c; b <<<= 7;
 */
.align	32
.Loop:
	/* columns (0,4,8,12) and (1,5,9,13) */
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$16,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$16,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$12,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$12,%r9d
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$8,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$8,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$7,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$7,%r9d
	/* spill words 8,9 and bring in words 10,11 */
	movl	%esi,32(%rsp)
	movl	%edi,36(%rsp)
	movl	40(%rsp),%esi
	movl	44(%rsp),%edi
	/* columns (2,6,10,14) and (3,7,11,15) */
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$16,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$16,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$12,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$12,%r11d
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$8,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$8,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$7,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$7,%r11d
	/* diagonals (0,5,10,15) and (1,6,11,12); esi,edi still hold 10,11 */
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$16,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$16,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$12,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$12,%r10d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$8,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$8,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$7,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$7,%r10d
	/* spill words 10,11 and bring back words 8,9 */
	movl	%esi,40(%rsp)
	movl	%edi,44(%rsp)
	movl	32(%rsp),%esi
	movl	36(%rsp),%edi
	/* diagonals (2,7,8,13) and (3,4,9,14) */
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$16,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$16,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$12,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$12,%r8d
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$8,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$8,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$7,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$7,%r8d
	decl	%ebp
	jnz	.Loop
	/* complete the spill area so 32..47(%rsp) holds working words 8-11 */
	movl	%edi,36(%rsp)
	movl	%esi,32(%rsp)
	movq	64(%rsp),%rbp			/* bytes remaining */
	movdqa	%xmm2,%xmm1			/* original words 8-11 */
	movq	64+8(%rsp),%rsi			/* input pointer */
	paddd	%xmm4,%xmm3			/* counter for the next block */
	movq	64+16(%rsp),%rdi		/* output pointer */

	/* feed-forward: add the block's input state to the working state */
	addl	$0x61707865,%eax
	addl	$0x3320646e,%ebx
	addl	$0x79622d32,%ecx
	addl	$0x6b206574,%edx
	addl	16(%rsp),%r8d
	addl	20(%rsp),%r9d
	addl	24(%rsp),%r10d
	addl	28(%rsp),%r11d
	addl	48(%rsp),%r12d			/* stack still has this block's counter */
	addl	52(%rsp),%r13d
	addl	56(%rsp),%r14d
	addl	60(%rsp),%r15d
	paddd	32(%rsp),%xmm1			/* xmm1 = keystream words 8-11 */

	cmpq	$64,%rbp
	jb	.Ltail				/* fewer than 64 bytes left */

	/* full block: XOR 64 input bytes with the keystream */
	xorl	0(%rsi),%eax
	xorl	4(%rsi),%ebx
	xorl	8(%rsi),%ecx
	xorl	12(%rsi),%edx
	xorl	16(%rsi),%r8d
	xorl	20(%rsi),%r9d
	xorl	24(%rsi),%r10d
	xorl	28(%rsi),%r11d
	movdqu	32(%rsi),%xmm0
	xorl	48(%rsi),%r12d
	xorl	52(%rsi),%r13d
	xorl	56(%rsi),%r14d
	xorl	60(%rsi),%r15d
	leaq	64(%rsi),%rsi
	pxor	%xmm1,%xmm0

	/* reset the stack copy for the next block */
	movdqa	%xmm2,32(%rsp)			/* original words 8-11 */
	movd	%xmm3,48(%rsp)			/* incremented counter */

	movl	%eax,0(%rdi)
	movl	%ebx,4(%rdi)
	movl	%ecx,8(%rdi)
	movl	%edx,12(%rdi)
	movl	%r8d,16(%rdi)
	movl	%r9d,20(%rdi)
	movl	%r10d,24(%rdi)
	movl	%r11d,28(%rdi)
	movdqu	%xmm0,32(%rdi)
	movl	%r12d,48(%rdi)
	movl	%r13d,52(%rdi)
	movl	%r14d,56(%rdi)
	movl	%r15d,60(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rbp
	jnz	.Loop_outer

	jmp	.Ldone

/* Partial last block: stage all 64 keystream bytes at 0..63(%rsp). */
.align	16
.Ltail:
	movl	%eax,0(%rsp)
	movl	%ebx,4(%rsp)
	xorq	%rbx,%rbx			/* rbx = byte index */
	movl	%ecx,8(%rsp)
	movl	%edx,12(%rsp)
	movl	%r8d,16(%rsp)
	movl	%r9d,20(%rsp)
	movl	%r10d,24(%rsp)
	movl	%r11d,28(%rsp)
	movdqa	%xmm1,32(%rsp)
	movl	%r12d,48(%rsp)
	movl	%r13d,52(%rsp)
	movl	%r14d,56(%rsp)
	movl	%r15d,60(%rsp)

/* out[i] = in[i] ^ keystream[i] for the remaining rbp (1..63) bytes */
.Loop_tail:
	movzbl	(%rsi,%rbx,1),%eax
	movzbl	(%rsp,%rbx,1),%edx
	leaq	1(%rbx),%rbx
	xorl	%edx,%eax
	movb	%al,-1(%rdi,%rbx,1)
	decq	%rbp
	jnz	.Loop_tail

.Ldone:
	/* rsi = rsp + 88 + 48: the stack pointer value at function entry */
	leaq	64+24+48(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lno_data:
	.byte	0xf3,0xc3			/* rep ret */
.cfi_endproc
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
/*
 * ChaCha20_ssse3 - one 64-byte block at a time, with the whole state in
 * four xmm registers.
 *
 * The symbol is not exported.  The code is entered by jumping to
 * .LChaCha20_ssse3 from ChaCha20_ctr32, so the argument registers are
 * those of ChaCha20_ctr32 and the length is already known to be non-zero:
 *   rdi = output, rsi = input, rdx = length in bytes,
 *   rcx = 32 bytes of key material, r8 = 16 bytes of words 12-15,
 *   r10d = capability bits loaded by ChaCha20_ctr32.
 * .Ldo_sse3_after_all is additionally a jump target from ChaCha20_4x,
 * which has set r9 = rsp by itself before jumping here.
 *
 * Register roles:
 *   r9   = stack pointer at entry, restored on exit
 *   r8   = double-round counter (after the counter block has been loaded)
 *   xmm0-xmm3 = state rows a, b, c, d (words 0-3, 4-7, 8-11, 12-15)
 *   xmm4, xmm5 = scratch
 *   xmm6 = .Lrot16 mask, xmm7 = .Lrot24 mask
 * Stack: 72 bytes, which makes rsp 16-byte aligned; 0..63(%rsp) holds the
 * input state of the current block (rows at 0, 16, 32, 48) and is reused
 * for the keystream on the tail path.
 * Clobbers: rax, rcx, rdx, rsi, rdi, r8, r9, xmm0-xmm7, flags.
 */
.type	ChaCha20_ssse3,@function
.align	32
ChaCha20_ssse3:
.cfi_startproc
.LChaCha20_ssse3:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	testl	$2048,%r10d			/* bit 11 set: use the 4xop code */
	jnz	.LChaCha20_4xop
	cmpq	$128,%rdx
	je	.LChaCha20_128			/* exactly 128 bytes */
	ja	.LChaCha20_4x			/* more than 128 bytes */

.Ldo_sse3_after_all:
	subq	$64+8,%rsp
	movdqa	.Lsigma(%rip),%xmm0
	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lrot16(%rip),%xmm6
	movdqa	.Lrot24(%rip),%xmm7

	/* keep the input state for the feed-forward after the rounds */
	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	$10,%r8				/* 10 double rounds */
	jmp	.Loop_ssse3

/* Every block after the first: reload the state and bump the counter. */
.align	32
.Loop_outer_ssse3:
	movdqa	.Lone(%rip),%xmm3
	movdqa	0(%rsp),%xmm0
	movdqa	16(%rsp),%xmm1
	movdqa	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3			/* row d with word 12 + 1 */
	movq	$10,%r8
	movdqa	%xmm3,48(%rsp)
	jmp	.Loop_ssse3

/*
 * One double round per iteration, four quarter-rounds in parallel per
 * half.  pshufb with xmm6/xmm7 rotates every lane left by 16/8 bits; the
 * psrld/pslld/por triples rotate left by 12 and by 7.
 */
.align	32
.Loop_ssse3:
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222				/* pshufb %xmm6,%xmm3 */
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223				/* pshufb %xmm7,%xmm3 */
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	/* rotate rows b, c, d by 1, 2, 3 lanes: diagonals become columns */
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm1,%xmm1
	pshufd	$147,%xmm3,%xmm3
	nop
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222				/* pshufb %xmm6,%xmm3 */
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223				/* pshufb %xmm7,%xmm3 */
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	/* undo the lane rotation */
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm1,%xmm1
	pshufd	$57,%xmm3,%xmm3
	decq	%r8
	jnz	.Loop_ssse3
	/* feed-forward: add the block's input state */
	paddd	0(%rsp),%xmm0
	paddd	16(%rsp),%xmm1
	paddd	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3

	cmpq	$64,%rdx
	jb	.Ltail_ssse3			/* fewer than 64 bytes left */

	/* full block: XOR 64 input bytes with the keystream */
	movdqu	0(%rsi),%xmm4
	movdqu	16(%rsi),%xmm5
	pxor	%xmm4,%xmm0
	movdqu	32(%rsi),%xmm4
	pxor	%xmm5,%xmm1
	movdqu	48(%rsi),%xmm5
	leaq	64(%rsi),%rsi
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm3

	movdqu	%xmm0,0(%rdi)
	movdqu	%xmm1,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rdx
	jnz	.Loop_outer_ssse3

	jmp	.Ldone_ssse3

/* Partial last block: stage the keystream on the stack, then go bytewise. */
.align	16
.Ltail_ssse3:
	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	xorq	%r8,%r8				/* r8 = byte index */

/* out[i] = in[i] ^ keystream[i] for the remaining rdx (1..63) bytes */
.Loop_tail_ssse3:
	movzbl	(%rsi,%r8,1),%eax
	movzbl	(%rsp,%r8,1),%ecx
	leaq	1(%r8),%r8
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r8,1)
	decq	%rdx
	jnz	.Loop_tail_ssse3

.Ldone_ssse3:
	leaq	(%r9),%rsp			/* drop the frame */
.cfi_def_cfa_register	%rsp
/* .Lssse3_epilogue is not referenced anywhere in the visible part of the file */
.Lssse3_epilogue:
	.byte	0xf3,0xc3			/* rep ret */
.cfi_endproc
.size	ChaCha20_ssse3,.-ChaCha20_ssse3
/*
 * ChaCha20_128 - process exactly 128 bytes as two 64-byte blocks computed
 * side by side.
 *
 * The symbol is not exported.  The only visible entry is the
 * "je .LChaCha20_128" in ChaCha20_ssse3, taken when rdx == 128, so the
 * code does not look at rdx at all.
 * In:   rdi = output, rsi = input,
 *       rcx = 32 bytes of key material, r8 = 16 bytes of words 12-15.
 *
 * Register roles (rows a, b, c, d = words 0-3, 4-7, 8-11, 12-15):
 *   first block:   a = xmm8,  b = xmm9,  c = xmm2, d = xmm3
 *   second block:  a = xmm10, b = xmm11, c = xmm0, d = xmm1
 *                  (its word 12 is the first block's word 12 plus 1)
 *   xmm4, xmm5 = scratch; xmm6 = .Lrot16 mask; xmm7 = .Lrot24 mask
 *   r8 = double-round counter, r9 = stack pointer at entry
 * Stack: 72 bytes, which makes rsp 16-byte aligned; 0..63(%rsp) holds the
 * first block's input state for the feed-forward.
 * Clobbers: r8, r9, xmm0-xmm11, flags.
 */
.type	ChaCha20_128,@function
.align	32
ChaCha20_128:
.cfi_startproc
.LChaCha20_128:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	subq	$64+8,%rsp
	movdqa	.Lsigma(%rip),%xmm8
	movdqu	(%rcx),%xmm9
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lone(%rip),%xmm1
	movdqa	.Lrot16(%rip),%xmm6
	movdqa	.Lrot24(%rip),%xmm7

	/* duplicate rows a-c for the second block and save the input state */
	movdqa	%xmm8,%xmm10
	movdqa	%xmm8,0(%rsp)
	movdqa	%xmm9,%xmm11
	movdqa	%xmm9,16(%rsp)
	movdqa	%xmm2,%xmm0
	movdqa	%xmm2,32(%rsp)
	paddd	%xmm3,%xmm1			/* second block: row d with word 12 + 1 */
	movdqa	%xmm3,48(%rsp)
	movq	$10,%r8				/* 10 double rounds */
	jmp	.Loop_128

/*
 * One double round per iteration for both blocks, instruction streams
 * interleaved.  pshufb with xmm6/xmm7 rotates every lane left by 16/8
 * bits; the psrld/pslld/por triples rotate left by 12 and by 7.
 */
.align	32
.Loop_128:
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,222				/* pshufb %xmm6,%xmm3 */
.byte	102,15,56,0,206				/* pshufb %xmm6,%xmm1 */
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$20,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$12,%xmm4
	psrld	$20,%xmm11
	por	%xmm4,%xmm9
	pslld	$12,%xmm5
	por	%xmm5,%xmm11
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,223				/* pshufb %xmm7,%xmm3 */
.byte	102,15,56,0,207				/* pshufb %xmm7,%xmm1 */
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$25,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$7,%xmm4
	psrld	$25,%xmm11
	por	%xmm4,%xmm9
	pslld	$7,%xmm5
	por	%xmm5,%xmm11
	/* rotate rows b, c, d by 1, 2, 3 lanes: diagonals become columns */
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm9,%xmm9
	pshufd	$147,%xmm3,%xmm3
	pshufd	$78,%xmm0,%xmm0
	pshufd	$57,%xmm11,%xmm11
	pshufd	$147,%xmm1,%xmm1
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,222				/* pshufb %xmm6,%xmm3 */
.byte	102,15,56,0,206				/* pshufb %xmm6,%xmm1 */
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$20,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$12,%xmm4
	psrld	$20,%xmm11
	por	%xmm4,%xmm9
	pslld	$12,%xmm5
	por	%xmm5,%xmm11
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,223				/* pshufb %xmm7,%xmm3 */
.byte	102,15,56,0,207				/* pshufb %xmm7,%xmm1 */
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$25,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$7,%xmm4
	psrld	$25,%xmm11
	por	%xmm4,%xmm9
	pslld	$7,%xmm5
	por	%xmm5,%xmm11
	/* undo the lane rotation */
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm9,%xmm9
	pshufd	$57,%xmm3,%xmm3
	pshufd	$78,%xmm0,%xmm0
	pshufd	$147,%xmm11,%xmm11
	pshufd	$57,%xmm1,%xmm1
	decq	%r8
	jnz	.Loop_128
	/* feed-forward for the first block */
	paddd	0(%rsp),%xmm8
	paddd	16(%rsp),%xmm9
	paddd	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3
	/* feed-forward for the second block; its row d input was saved row + .Lone */
	paddd	.Lone(%rip),%xmm1
	paddd	0(%rsp),%xmm10
	paddd	16(%rsp),%xmm11
	paddd	32(%rsp),%xmm0
	paddd	48(%rsp),%xmm1

	/* XOR the 128 input bytes with the two keystream blocks */
	movdqu	0(%rsi),%xmm4
	movdqu	16(%rsi),%xmm5
	pxor	%xmm4,%xmm8
	movdqu	32(%rsi),%xmm4
	pxor	%xmm5,%xmm9
	movdqu	48(%rsi),%xmm5
	pxor	%xmm4,%xmm2
	movdqu	64(%rsi),%xmm4
	pxor	%xmm5,%xmm3
	movdqu	80(%rsi),%xmm5
	pxor	%xmm4,%xmm10
	movdqu	96(%rsi),%xmm4
	pxor	%xmm5,%xmm11
	movdqu	112(%rsi),%xmm5
	pxor	%xmm4,%xmm0
	pxor	%xmm5,%xmm1

	movdqu	%xmm8,0(%rdi)
	movdqu	%xmm9,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)
	movdqu	%xmm10,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm0,96(%rdi)
	movdqu	%xmm1,112(%rdi)
	leaq	(%r9),%rsp			/* drop the frame */
.cfi_def_cfa_register	%rsp
/* .L128_epilogue is not referenced anywhere in the visible part of the file */
.L128_epilogue:
	.byte	0xf3,0xc3			/* rep ret */
.cfi_endproc
.size	ChaCha20_128,.-ChaCha20_128
622bc3d5698SJohn Baldwin.type	ChaCha20_4x,@function
623bc3d5698SJohn Baldwin.align	32
624bc3d5698SJohn BaldwinChaCha20_4x:
625bc3d5698SJohn Baldwin.cfi_startproc
626bc3d5698SJohn Baldwin.LChaCha20_4x:
627bc3d5698SJohn Baldwin	movq	%rsp,%r9
628bc3d5698SJohn Baldwin.cfi_def_cfa_register	%r9
629bc3d5698SJohn Baldwin	movq	%r10,%r11
630bc3d5698SJohn Baldwin	shrq	$32,%r10
631bc3d5698SJohn Baldwin	testq	$32,%r10
632bc3d5698SJohn Baldwin	jnz	.LChaCha20_8x
633bc3d5698SJohn Baldwin	cmpq	$192,%rdx
634bc3d5698SJohn Baldwin	ja	.Lproceed4x
635bc3d5698SJohn Baldwin
636bc3d5698SJohn Baldwin	andq	$71303168,%r11
637bc3d5698SJohn Baldwin	cmpq	$4194304,%r11
638bc3d5698SJohn Baldwin	je	.Ldo_sse3_after_all
639bc3d5698SJohn Baldwin
640bc3d5698SJohn Baldwin.Lproceed4x:
641bc3d5698SJohn Baldwin	subq	$0x140+8,%rsp
642bc3d5698SJohn Baldwin	movdqa	.Lsigma(%rip),%xmm11
643bc3d5698SJohn Baldwin	movdqu	(%rcx),%xmm15
644bc3d5698SJohn Baldwin	movdqu	16(%rcx),%xmm7
645bc3d5698SJohn Baldwin	movdqu	(%r8),%xmm3
646bc3d5698SJohn Baldwin	leaq	256(%rsp),%rcx
647bc3d5698SJohn Baldwin	leaq	.Lrot16(%rip),%r10
648bc3d5698SJohn Baldwin	leaq	.Lrot24(%rip),%r11
649bc3d5698SJohn Baldwin
650bc3d5698SJohn Baldwin	pshufd	$0x00,%xmm11,%xmm8
651bc3d5698SJohn Baldwin	pshufd	$0x55,%xmm11,%xmm9
652bc3d5698SJohn Baldwin	movdqa	%xmm8,64(%rsp)
653bc3d5698SJohn Baldwin	pshufd	$0xaa,%xmm11,%xmm10
654bc3d5698SJohn Baldwin	movdqa	%xmm9,80(%rsp)
655bc3d5698SJohn Baldwin	pshufd	$0xff,%xmm11,%xmm11
656bc3d5698SJohn Baldwin	movdqa	%xmm10,96(%rsp)
657bc3d5698SJohn Baldwin	movdqa	%xmm11,112(%rsp)
658bc3d5698SJohn Baldwin
659bc3d5698SJohn Baldwin	pshufd	$0x00,%xmm15,%xmm12
660bc3d5698SJohn Baldwin	pshufd	$0x55,%xmm15,%xmm13
661bc3d5698SJohn Baldwin	movdqa	%xmm12,128-256(%rcx)
662bc3d5698SJohn Baldwin	pshufd	$0xaa,%xmm15,%xmm14
663bc3d5698SJohn Baldwin	movdqa	%xmm13,144-256(%rcx)
664bc3d5698SJohn Baldwin	pshufd	$0xff,%xmm15,%xmm15
665bc3d5698SJohn Baldwin	movdqa	%xmm14,160-256(%rcx)
666bc3d5698SJohn Baldwin	movdqa	%xmm15,176-256(%rcx)
667bc3d5698SJohn Baldwin
668bc3d5698SJohn Baldwin	pshufd	$0x00,%xmm7,%xmm4
669bc3d5698SJohn Baldwin	pshufd	$0x55,%xmm7,%xmm5
670bc3d5698SJohn Baldwin	movdqa	%xmm4,192-256(%rcx)
671bc3d5698SJohn Baldwin	pshufd	$0xaa,%xmm7,%xmm6
672bc3d5698SJohn Baldwin	movdqa	%xmm5,208-256(%rcx)
673bc3d5698SJohn Baldwin	pshufd	$0xff,%xmm7,%xmm7
674bc3d5698SJohn Baldwin	movdqa	%xmm6,224-256(%rcx)
675bc3d5698SJohn Baldwin	movdqa	%xmm7,240-256(%rcx)
676bc3d5698SJohn Baldwin
677bc3d5698SJohn Baldwin	pshufd	$0x00,%xmm3,%xmm0
678bc3d5698SJohn Baldwin	pshufd	$0x55,%xmm3,%xmm1
679bc3d5698SJohn Baldwin	paddd	.Linc(%rip),%xmm0
680bc3d5698SJohn Baldwin	pshufd	$0xaa,%xmm3,%xmm2
681bc3d5698SJohn Baldwin	movdqa	%xmm1,272-256(%rcx)
682bc3d5698SJohn Baldwin	pshufd	$0xff,%xmm3,%xmm3
683bc3d5698SJohn Baldwin	movdqa	%xmm2,288-256(%rcx)
684bc3d5698SJohn Baldwin	movdqa	%xmm3,304-256(%rcx)
685bc3d5698SJohn Baldwin
686bc3d5698SJohn Baldwin	jmp	.Loop_enter4x
687bc3d5698SJohn Baldwin
688bc3d5698SJohn Baldwin.align	32
689bc3d5698SJohn Baldwin.Loop_outer4x:
690bc3d5698SJohn Baldwin	movdqa	64(%rsp),%xmm8
691bc3d5698SJohn Baldwin	movdqa	80(%rsp),%xmm9
692bc3d5698SJohn Baldwin	movdqa	96(%rsp),%xmm10
693bc3d5698SJohn Baldwin	movdqa	112(%rsp),%xmm11
694bc3d5698SJohn Baldwin	movdqa	128-256(%rcx),%xmm12
695bc3d5698SJohn Baldwin	movdqa	144-256(%rcx),%xmm13
696bc3d5698SJohn Baldwin	movdqa	160-256(%rcx),%xmm14
697bc3d5698SJohn Baldwin	movdqa	176-256(%rcx),%xmm15
698bc3d5698SJohn Baldwin	movdqa	192-256(%rcx),%xmm4
699bc3d5698SJohn Baldwin	movdqa	208-256(%rcx),%xmm5
700bc3d5698SJohn Baldwin	movdqa	224-256(%rcx),%xmm6
701bc3d5698SJohn Baldwin	movdqa	240-256(%rcx),%xmm7
702bc3d5698SJohn Baldwin	movdqa	256-256(%rcx),%xmm0
703bc3d5698SJohn Baldwin	movdqa	272-256(%rcx),%xmm1
704bc3d5698SJohn Baldwin	movdqa	288-256(%rcx),%xmm2
705bc3d5698SJohn Baldwin	movdqa	304-256(%rcx),%xmm3
706bc3d5698SJohn Baldwin	paddd	.Lfour(%rip),%xmm0
707bc3d5698SJohn Baldwin
708bc3d5698SJohn Baldwin.Loop_enter4x:
709bc3d5698SJohn Baldwin	movdqa	%xmm6,32(%rsp)
710bc3d5698SJohn Baldwin	movdqa	%xmm7,48(%rsp)
711bc3d5698SJohn Baldwin	movdqa	(%r10),%xmm7
712bc3d5698SJohn Baldwin	movl	$10,%eax
713bc3d5698SJohn Baldwin	movdqa	%xmm0,256-256(%rcx)
714bc3d5698SJohn Baldwin	jmp	.Loop4x
715bc3d5698SJohn Baldwin
716bc3d5698SJohn Baldwin.align	32
717bc3d5698SJohn Baldwin.Loop4x:
718bc3d5698SJohn Baldwin	paddd	%xmm12,%xmm8
719bc3d5698SJohn Baldwin	paddd	%xmm13,%xmm9
720bc3d5698SJohn Baldwin	pxor	%xmm8,%xmm0
721bc3d5698SJohn Baldwin	pxor	%xmm9,%xmm1
722bc3d5698SJohn Baldwin.byte	102,15,56,0,199
723bc3d5698SJohn Baldwin.byte	102,15,56,0,207
724bc3d5698SJohn Baldwin	paddd	%xmm0,%xmm4
725bc3d5698SJohn Baldwin	paddd	%xmm1,%xmm5
726bc3d5698SJohn Baldwin	pxor	%xmm4,%xmm12
727bc3d5698SJohn Baldwin	pxor	%xmm5,%xmm13
728bc3d5698SJohn Baldwin	movdqa	%xmm12,%xmm6
729bc3d5698SJohn Baldwin	pslld	$12,%xmm12
730bc3d5698SJohn Baldwin	psrld	$20,%xmm6
731bc3d5698SJohn Baldwin	movdqa	%xmm13,%xmm7
732bc3d5698SJohn Baldwin	pslld	$12,%xmm13
733bc3d5698SJohn Baldwin	por	%xmm6,%xmm12
734bc3d5698SJohn Baldwin	psrld	$20,%xmm7
735bc3d5698SJohn Baldwin	movdqa	(%r11),%xmm6
736bc3d5698SJohn Baldwin	por	%xmm7,%xmm13
737bc3d5698SJohn Baldwin	paddd	%xmm12,%xmm8
738bc3d5698SJohn Baldwin	paddd	%xmm13,%xmm9
739bc3d5698SJohn Baldwin	pxor	%xmm8,%xmm0
740bc3d5698SJohn Baldwin	pxor	%xmm9,%xmm1
741bc3d5698SJohn Baldwin.byte	102,15,56,0,198
742bc3d5698SJohn Baldwin.byte	102,15,56,0,206
743bc3d5698SJohn Baldwin	paddd	%xmm0,%xmm4
744bc3d5698SJohn Baldwin	paddd	%xmm1,%xmm5
745bc3d5698SJohn Baldwin	pxor	%xmm4,%xmm12
746bc3d5698SJohn Baldwin	pxor	%xmm5,%xmm13
747bc3d5698SJohn Baldwin	movdqa	%xmm12,%xmm7
748bc3d5698SJohn Baldwin	pslld	$7,%xmm12
749bc3d5698SJohn Baldwin	psrld	$25,%xmm7
750bc3d5698SJohn Baldwin	movdqa	%xmm13,%xmm6
751bc3d5698SJohn Baldwin	pslld	$7,%xmm13
752bc3d5698SJohn Baldwin	por	%xmm7,%xmm12
753bc3d5698SJohn Baldwin	psrld	$25,%xmm6
754bc3d5698SJohn Baldwin	movdqa	(%r10),%xmm7
755bc3d5698SJohn Baldwin	por	%xmm6,%xmm13
756bc3d5698SJohn Baldwin	movdqa	%xmm4,0(%rsp)
757bc3d5698SJohn Baldwin	movdqa	%xmm5,16(%rsp)
758bc3d5698SJohn Baldwin	movdqa	32(%rsp),%xmm4
759bc3d5698SJohn Baldwin	movdqa	48(%rsp),%xmm5
760bc3d5698SJohn Baldwin	paddd	%xmm14,%xmm10
761bc3d5698SJohn Baldwin	paddd	%xmm15,%xmm11
762bc3d5698SJohn Baldwin	pxor	%xmm10,%xmm2
763bc3d5698SJohn Baldwin	pxor	%xmm11,%xmm3
764bc3d5698SJohn Baldwin.byte	102,15,56,0,215
765bc3d5698SJohn Baldwin.byte	102,15,56,0,223
766bc3d5698SJohn Baldwin	paddd	%xmm2,%xmm4
767bc3d5698SJohn Baldwin	paddd	%xmm3,%xmm5
768bc3d5698SJohn Baldwin	pxor	%xmm4,%xmm14
769bc3d5698SJohn Baldwin	pxor	%xmm5,%xmm15
770bc3d5698SJohn Baldwin	movdqa	%xmm14,%xmm6
771bc3d5698SJohn Baldwin	pslld	$12,%xmm14
772bc3d5698SJohn Baldwin	psrld	$20,%xmm6
773bc3d5698SJohn Baldwin	movdqa	%xmm15,%xmm7
774bc3d5698SJohn Baldwin	pslld	$12,%xmm15
775bc3d5698SJohn Baldwin	por	%xmm6,%xmm14
776bc3d5698SJohn Baldwin	psrld	$20,%xmm7
777bc3d5698SJohn Baldwin	movdqa	(%r11),%xmm6
778bc3d5698SJohn Baldwin	por	%xmm7,%xmm15
779bc3d5698SJohn Baldwin	paddd	%xmm14,%xmm10
780bc3d5698SJohn Baldwin	paddd	%xmm15,%xmm11
781bc3d5698SJohn Baldwin	pxor	%xmm10,%xmm2
782bc3d5698SJohn Baldwin	pxor	%xmm11,%xmm3
783bc3d5698SJohn Baldwin.byte	102,15,56,0,214
784bc3d5698SJohn Baldwin.byte	102,15,56,0,222
785bc3d5698SJohn Baldwin	paddd	%xmm2,%xmm4
786bc3d5698SJohn Baldwin	paddd	%xmm3,%xmm5
787bc3d5698SJohn Baldwin	pxor	%xmm4,%xmm14
788bc3d5698SJohn Baldwin	pxor	%xmm5,%xmm15
789bc3d5698SJohn Baldwin	movdqa	%xmm14,%xmm7
790bc3d5698SJohn Baldwin	pslld	$7,%xmm14
791bc3d5698SJohn Baldwin	psrld	$25,%xmm7
792bc3d5698SJohn Baldwin	movdqa	%xmm15,%xmm6
793bc3d5698SJohn Baldwin	pslld	$7,%xmm15
794bc3d5698SJohn Baldwin	por	%xmm7,%xmm14
795bc3d5698SJohn Baldwin	psrld	$25,%xmm6
796bc3d5698SJohn Baldwin	movdqa	(%r10),%xmm7
797bc3d5698SJohn Baldwin	por	%xmm6,%xmm15
798bc3d5698SJohn Baldwin	paddd	%xmm13,%xmm8
799bc3d5698SJohn Baldwin	paddd	%xmm14,%xmm9
800bc3d5698SJohn Baldwin	pxor	%xmm8,%xmm3
801bc3d5698SJohn Baldwin	pxor	%xmm9,%xmm0
802bc3d5698SJohn Baldwin.byte	102,15,56,0,223
803bc3d5698SJohn Baldwin.byte	102,15,56,0,199
804bc3d5698SJohn Baldwin	paddd	%xmm3,%xmm4
805bc3d5698SJohn Baldwin	paddd	%xmm0,%xmm5
806bc3d5698SJohn Baldwin	pxor	%xmm4,%xmm13
807bc3d5698SJohn Baldwin	pxor	%xmm5,%xmm14
808bc3d5698SJohn Baldwin	movdqa	%xmm13,%xmm6
809bc3d5698SJohn Baldwin	pslld	$12,%xmm13
810bc3d5698SJohn Baldwin	psrld	$20,%xmm6
811bc3d5698SJohn Baldwin	movdqa	%xmm14,%xmm7
812bc3d5698SJohn Baldwin	pslld	$12,%xmm14
813bc3d5698SJohn Baldwin	por	%xmm6,%xmm13
814bc3d5698SJohn Baldwin	psrld	$20,%xmm7
815bc3d5698SJohn Baldwin	movdqa	(%r11),%xmm6
816bc3d5698SJohn Baldwin	por	%xmm7,%xmm14
817bc3d5698SJohn Baldwin	paddd	%xmm13,%xmm8
818bc3d5698SJohn Baldwin	paddd	%xmm14,%xmm9
819bc3d5698SJohn Baldwin	pxor	%xmm8,%xmm3
820bc3d5698SJohn Baldwin	pxor	%xmm9,%xmm0
821bc3d5698SJohn Baldwin.byte	102,15,56,0,222
822bc3d5698SJohn Baldwin.byte	102,15,56,0,198
823bc3d5698SJohn Baldwin	paddd	%xmm3,%xmm4
824bc3d5698SJohn Baldwin	paddd	%xmm0,%xmm5
825bc3d5698SJohn Baldwin	pxor	%xmm4,%xmm13
826bc3d5698SJohn Baldwin	pxor	%xmm5,%xmm14
827bc3d5698SJohn Baldwin	movdqa	%xmm13,%xmm7
828bc3d5698SJohn Baldwin	pslld	$7,%xmm13
829bc3d5698SJohn Baldwin	psrld	$25,%xmm7
830bc3d5698SJohn Baldwin	movdqa	%xmm14,%xmm6
831bc3d5698SJohn Baldwin	pslld	$7,%xmm14
832bc3d5698SJohn Baldwin	por	%xmm7,%xmm13
833bc3d5698SJohn Baldwin	psrld	$25,%xmm6
834bc3d5698SJohn Baldwin	movdqa	(%r10),%xmm7
835bc3d5698SJohn Baldwin	por	%xmm6,%xmm14
836bc3d5698SJohn Baldwin	movdqa	%xmm4,32(%rsp)
837bc3d5698SJohn Baldwin	movdqa	%xmm5,48(%rsp)
838bc3d5698SJohn Baldwin	movdqa	0(%rsp),%xmm4
839bc3d5698SJohn Baldwin	movdqa	16(%rsp),%xmm5
840bc3d5698SJohn Baldwin	paddd	%xmm15,%xmm10
841bc3d5698SJohn Baldwin	paddd	%xmm12,%xmm11
842bc3d5698SJohn Baldwin	pxor	%xmm10,%xmm1
843bc3d5698SJohn Baldwin	pxor	%xmm11,%xmm2
844bc3d5698SJohn Baldwin.byte	102,15,56,0,207
845bc3d5698SJohn Baldwin.byte	102,15,56,0,215
846bc3d5698SJohn Baldwin	paddd	%xmm1,%xmm4
847bc3d5698SJohn Baldwin	paddd	%xmm2,%xmm5
848bc3d5698SJohn Baldwin	pxor	%xmm4,%xmm15
849bc3d5698SJohn Baldwin	pxor	%xmm5,%xmm12
850bc3d5698SJohn Baldwin	movdqa	%xmm15,%xmm6
851bc3d5698SJohn Baldwin	pslld	$12,%xmm15
852bc3d5698SJohn Baldwin	psrld	$20,%xmm6
853bc3d5698SJohn Baldwin	movdqa	%xmm12,%xmm7
854bc3d5698SJohn Baldwin	pslld	$12,%xmm12
855bc3d5698SJohn Baldwin	por	%xmm6,%xmm15
856bc3d5698SJohn Baldwin	psrld	$20,%xmm7
857bc3d5698SJohn Baldwin	movdqa	(%r11),%xmm6
858bc3d5698SJohn Baldwin	por	%xmm7,%xmm12
859bc3d5698SJohn Baldwin	paddd	%xmm15,%xmm10
860bc3d5698SJohn Baldwin	paddd	%xmm12,%xmm11
861bc3d5698SJohn Baldwin	pxor	%xmm10,%xmm1
862bc3d5698SJohn Baldwin	pxor	%xmm11,%xmm2
863bc3d5698SJohn Baldwin.byte	102,15,56,0,206
864bc3d5698SJohn Baldwin.byte	102,15,56,0,214
865bc3d5698SJohn Baldwin	paddd	%xmm1,%xmm4
866bc3d5698SJohn Baldwin	paddd	%xmm2,%xmm5
867bc3d5698SJohn Baldwin	pxor	%xmm4,%xmm15
868bc3d5698SJohn Baldwin	pxor	%xmm5,%xmm12
869bc3d5698SJohn Baldwin	movdqa	%xmm15,%xmm7
870bc3d5698SJohn Baldwin	pslld	$7,%xmm15
871bc3d5698SJohn Baldwin	psrld	$25,%xmm7
872bc3d5698SJohn Baldwin	movdqa	%xmm12,%xmm6
873bc3d5698SJohn Baldwin	pslld	$7,%xmm12
874bc3d5698SJohn Baldwin	por	%xmm7,%xmm15
875bc3d5698SJohn Baldwin	psrld	$25,%xmm6
876bc3d5698SJohn Baldwin	movdqa	(%r10),%xmm7
877bc3d5698SJohn Baldwin	por	%xmm6,%xmm12
878bc3d5698SJohn Baldwin	decl	%eax
879bc3d5698SJohn Baldwin	jnz	.Loop4x
880bc3d5698SJohn Baldwin
881bc3d5698SJohn Baldwin	paddd	64(%rsp),%xmm8
882bc3d5698SJohn Baldwin	paddd	80(%rsp),%xmm9
883bc3d5698SJohn Baldwin	paddd	96(%rsp),%xmm10
884bc3d5698SJohn Baldwin	paddd	112(%rsp),%xmm11
885bc3d5698SJohn Baldwin
886bc3d5698SJohn Baldwin	movdqa	%xmm8,%xmm6
887bc3d5698SJohn Baldwin	punpckldq	%xmm9,%xmm8
888bc3d5698SJohn Baldwin	movdqa	%xmm10,%xmm7
889bc3d5698SJohn Baldwin	punpckldq	%xmm11,%xmm10
890bc3d5698SJohn Baldwin	punpckhdq	%xmm9,%xmm6
891bc3d5698SJohn Baldwin	punpckhdq	%xmm11,%xmm7
892bc3d5698SJohn Baldwin	movdqa	%xmm8,%xmm9
893bc3d5698SJohn Baldwin	punpcklqdq	%xmm10,%xmm8
894bc3d5698SJohn Baldwin	movdqa	%xmm6,%xmm11
895bc3d5698SJohn Baldwin	punpcklqdq	%xmm7,%xmm6
896bc3d5698SJohn Baldwin	punpckhqdq	%xmm10,%xmm9
897bc3d5698SJohn Baldwin	punpckhqdq	%xmm7,%xmm11
898bc3d5698SJohn Baldwin	paddd	128-256(%rcx),%xmm12
899bc3d5698SJohn Baldwin	paddd	144-256(%rcx),%xmm13
900bc3d5698SJohn Baldwin	paddd	160-256(%rcx),%xmm14
901bc3d5698SJohn Baldwin	paddd	176-256(%rcx),%xmm15
902bc3d5698SJohn Baldwin
903bc3d5698SJohn Baldwin	movdqa	%xmm8,0(%rsp)
904bc3d5698SJohn Baldwin	movdqa	%xmm9,16(%rsp)
905bc3d5698SJohn Baldwin	movdqa	32(%rsp),%xmm8
906bc3d5698SJohn Baldwin	movdqa	48(%rsp),%xmm9
907bc3d5698SJohn Baldwin
908bc3d5698SJohn Baldwin	movdqa	%xmm12,%xmm10
909bc3d5698SJohn Baldwin	punpckldq	%xmm13,%xmm12
910bc3d5698SJohn Baldwin	movdqa	%xmm14,%xmm7
911bc3d5698SJohn Baldwin	punpckldq	%xmm15,%xmm14
912bc3d5698SJohn Baldwin	punpckhdq	%xmm13,%xmm10
913bc3d5698SJohn Baldwin	punpckhdq	%xmm15,%xmm7
914bc3d5698SJohn Baldwin	movdqa	%xmm12,%xmm13
915bc3d5698SJohn Baldwin	punpcklqdq	%xmm14,%xmm12
916bc3d5698SJohn Baldwin	movdqa	%xmm10,%xmm15
917bc3d5698SJohn Baldwin	punpcklqdq	%xmm7,%xmm10
918bc3d5698SJohn Baldwin	punpckhqdq	%xmm14,%xmm13
919bc3d5698SJohn Baldwin	punpckhqdq	%xmm7,%xmm15
920bc3d5698SJohn Baldwin	paddd	192-256(%rcx),%xmm4
921bc3d5698SJohn Baldwin	paddd	208-256(%rcx),%xmm5
922bc3d5698SJohn Baldwin	paddd	224-256(%rcx),%xmm8
923bc3d5698SJohn Baldwin	paddd	240-256(%rcx),%xmm9
924bc3d5698SJohn Baldwin
925bc3d5698SJohn Baldwin	movdqa	%xmm6,32(%rsp)
926bc3d5698SJohn Baldwin	movdqa	%xmm11,48(%rsp)
927bc3d5698SJohn Baldwin
928bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm14
929bc3d5698SJohn Baldwin	punpckldq	%xmm5,%xmm4
930bc3d5698SJohn Baldwin	movdqa	%xmm8,%xmm7
931bc3d5698SJohn Baldwin	punpckldq	%xmm9,%xmm8
932bc3d5698SJohn Baldwin	punpckhdq	%xmm5,%xmm14
933bc3d5698SJohn Baldwin	punpckhdq	%xmm9,%xmm7
934bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm5
935bc3d5698SJohn Baldwin	punpcklqdq	%xmm8,%xmm4
936bc3d5698SJohn Baldwin	movdqa	%xmm14,%xmm9
937bc3d5698SJohn Baldwin	punpcklqdq	%xmm7,%xmm14
938bc3d5698SJohn Baldwin	punpckhqdq	%xmm8,%xmm5
939bc3d5698SJohn Baldwin	punpckhqdq	%xmm7,%xmm9
940bc3d5698SJohn Baldwin	paddd	256-256(%rcx),%xmm0
941bc3d5698SJohn Baldwin	paddd	272-256(%rcx),%xmm1
942bc3d5698SJohn Baldwin	paddd	288-256(%rcx),%xmm2
943bc3d5698SJohn Baldwin	paddd	304-256(%rcx),%xmm3
944bc3d5698SJohn Baldwin
945bc3d5698SJohn Baldwin	movdqa	%xmm0,%xmm8
946bc3d5698SJohn Baldwin	punpckldq	%xmm1,%xmm0
947bc3d5698SJohn Baldwin	movdqa	%xmm2,%xmm7
948bc3d5698SJohn Baldwin	punpckldq	%xmm3,%xmm2
949bc3d5698SJohn Baldwin	punpckhdq	%xmm1,%xmm8
950bc3d5698SJohn Baldwin	punpckhdq	%xmm3,%xmm7
951bc3d5698SJohn Baldwin	movdqa	%xmm0,%xmm1
952bc3d5698SJohn Baldwin	punpcklqdq	%xmm2,%xmm0
953bc3d5698SJohn Baldwin	movdqa	%xmm8,%xmm3
954bc3d5698SJohn Baldwin	punpcklqdq	%xmm7,%xmm8
955bc3d5698SJohn Baldwin	punpckhqdq	%xmm2,%xmm1
956bc3d5698SJohn Baldwin	punpckhqdq	%xmm7,%xmm3
957bc3d5698SJohn Baldwin	cmpq	$256,%rdx
958bc3d5698SJohn Baldwin	jb	.Ltail4x
959bc3d5698SJohn Baldwin
960bc3d5698SJohn Baldwin	movdqu	0(%rsi),%xmm6
961bc3d5698SJohn Baldwin	movdqu	16(%rsi),%xmm11
962bc3d5698SJohn Baldwin	movdqu	32(%rsi),%xmm2
963bc3d5698SJohn Baldwin	movdqu	48(%rsi),%xmm7
964bc3d5698SJohn Baldwin	pxor	0(%rsp),%xmm6
965bc3d5698SJohn Baldwin	pxor	%xmm12,%xmm11
966bc3d5698SJohn Baldwin	pxor	%xmm4,%xmm2
967bc3d5698SJohn Baldwin	pxor	%xmm0,%xmm7
968bc3d5698SJohn Baldwin
969bc3d5698SJohn Baldwin	movdqu	%xmm6,0(%rdi)
970bc3d5698SJohn Baldwin	movdqu	64(%rsi),%xmm6
971bc3d5698SJohn Baldwin	movdqu	%xmm11,16(%rdi)
972bc3d5698SJohn Baldwin	movdqu	80(%rsi),%xmm11
973bc3d5698SJohn Baldwin	movdqu	%xmm2,32(%rdi)
974bc3d5698SJohn Baldwin	movdqu	96(%rsi),%xmm2
975bc3d5698SJohn Baldwin	movdqu	%xmm7,48(%rdi)
976bc3d5698SJohn Baldwin	movdqu	112(%rsi),%xmm7
977bc3d5698SJohn Baldwin	leaq	128(%rsi),%rsi
978bc3d5698SJohn Baldwin	pxor	16(%rsp),%xmm6
979bc3d5698SJohn Baldwin	pxor	%xmm13,%xmm11
980bc3d5698SJohn Baldwin	pxor	%xmm5,%xmm2
981bc3d5698SJohn Baldwin	pxor	%xmm1,%xmm7
982bc3d5698SJohn Baldwin
983bc3d5698SJohn Baldwin	movdqu	%xmm6,64(%rdi)
984bc3d5698SJohn Baldwin	movdqu	0(%rsi),%xmm6
985bc3d5698SJohn Baldwin	movdqu	%xmm11,80(%rdi)
986bc3d5698SJohn Baldwin	movdqu	16(%rsi),%xmm11
987bc3d5698SJohn Baldwin	movdqu	%xmm2,96(%rdi)
988bc3d5698SJohn Baldwin	movdqu	32(%rsi),%xmm2
989bc3d5698SJohn Baldwin	movdqu	%xmm7,112(%rdi)
990bc3d5698SJohn Baldwin	leaq	128(%rdi),%rdi
991bc3d5698SJohn Baldwin	movdqu	48(%rsi),%xmm7
992bc3d5698SJohn Baldwin	pxor	32(%rsp),%xmm6
993bc3d5698SJohn Baldwin	pxor	%xmm10,%xmm11
994bc3d5698SJohn Baldwin	pxor	%xmm14,%xmm2
995bc3d5698SJohn Baldwin	pxor	%xmm8,%xmm7
996bc3d5698SJohn Baldwin
997bc3d5698SJohn Baldwin	movdqu	%xmm6,0(%rdi)
998bc3d5698SJohn Baldwin	movdqu	64(%rsi),%xmm6
999bc3d5698SJohn Baldwin	movdqu	%xmm11,16(%rdi)
1000bc3d5698SJohn Baldwin	movdqu	80(%rsi),%xmm11
1001bc3d5698SJohn Baldwin	movdqu	%xmm2,32(%rdi)
1002bc3d5698SJohn Baldwin	movdqu	96(%rsi),%xmm2
1003bc3d5698SJohn Baldwin	movdqu	%xmm7,48(%rdi)
1004bc3d5698SJohn Baldwin	movdqu	112(%rsi),%xmm7
1005bc3d5698SJohn Baldwin	leaq	128(%rsi),%rsi
1006bc3d5698SJohn Baldwin	pxor	48(%rsp),%xmm6
1007bc3d5698SJohn Baldwin	pxor	%xmm15,%xmm11
1008bc3d5698SJohn Baldwin	pxor	%xmm9,%xmm2
1009bc3d5698SJohn Baldwin	pxor	%xmm3,%xmm7
1010bc3d5698SJohn Baldwin	movdqu	%xmm6,64(%rdi)
1011bc3d5698SJohn Baldwin	movdqu	%xmm11,80(%rdi)
1012bc3d5698SJohn Baldwin	movdqu	%xmm2,96(%rdi)
1013bc3d5698SJohn Baldwin	movdqu	%xmm7,112(%rdi)
1014bc3d5698SJohn Baldwin	leaq	128(%rdi),%rdi
1015bc3d5698SJohn Baldwin
1016bc3d5698SJohn Baldwin	subq	$256,%rdx
1017bc3d5698SJohn Baldwin	jnz	.Loop_outer4x
1018bc3d5698SJohn Baldwin
1019bc3d5698SJohn Baldwin	jmp	.Ldone4x
1020bc3d5698SJohn Baldwin
/*
 * .Ltail4x: fewer than 256 bytes remain in %rdx (reached through the jb that
 * follows cmpq $256,%rdx).  Pick the handler that XORs as many whole 64-byte
 * blocks as the remaining length allows.  Each handler later tests the flags
 * of the cmpq that selected it (je .Ldone4x) without comparing again, so the
 * flags set here are still live at the branch target.
 */
1021bc3d5698SJohn Baldwin.Ltail4x:
1022bc3d5698SJohn Baldwin	cmpq	$192,%rdx
1023bc3d5698SJohn Baldwin	jae	.L192_or_more4x
1024bc3d5698SJohn Baldwin	cmpq	$128,%rdx
1025bc3d5698SJohn Baldwin	jae	.L128_or_more4x
1026bc3d5698SJohn Baldwin	cmpq	$64,%rdx
1027bc3d5698SJohn Baldwin	jae	.L64_or_more4x
1028bc3d5698SJohn Baldwin
1029bc3d5698SJohn Baldwin
/*
 * Fewer than 64 bytes: %r10 = 0 is the byte index used by .Loop_tail4x.
 * 0(%rsp) already holds the first 16 keystream bytes of this block; spill
 * the other three quarters (%xmm12, %xmm4, %xmm0) behind it so the byte loop
 * can read the whole 64-byte keystream block from the stack.
 */
1030bc3d5698SJohn Baldwin	xorq	%r10,%r10
1031bc3d5698SJohn Baldwin
1032bc3d5698SJohn Baldwin	movdqa	%xmm12,16(%rsp)
1033bc3d5698SJohn Baldwin	movdqa	%xmm4,32(%rsp)
1034bc3d5698SJohn Baldwin	movdqa	%xmm0,48(%rsp)
1035bc3d5698SJohn Baldwin	jmp	.Loop_tail4x
1036bc3d5698SJohn Baldwin
1037bc3d5698SJohn Baldwin.align	32
/*
 * .L64_or_more4x: 64..127 bytes remain.  XOR one whole 64-byte block, then
 * hand whatever is left to the byte loop.
 * Keystream for this block is 0(%rsp), %xmm12, %xmm4, %xmm0.
 */
1038bc3d5698SJohn Baldwin.L64_or_more4x:
1039bc3d5698SJohn Baldwin	movdqu	0(%rsi),%xmm6
1040bc3d5698SJohn Baldwin	movdqu	16(%rsi),%xmm11
1041bc3d5698SJohn Baldwin	movdqu	32(%rsi),%xmm2
1042bc3d5698SJohn Baldwin	movdqu	48(%rsi),%xmm7
1043bc3d5698SJohn Baldwin	pxor	0(%rsp),%xmm6
1044bc3d5698SJohn Baldwin	pxor	%xmm12,%xmm11
1045bc3d5698SJohn Baldwin	pxor	%xmm4,%xmm2
1046bc3d5698SJohn Baldwin	pxor	%xmm0,%xmm7
1047bc3d5698SJohn Baldwin	movdqu	%xmm6,0(%rdi)
1048bc3d5698SJohn Baldwin	movdqu	%xmm11,16(%rdi)
1049bc3d5698SJohn Baldwin	movdqu	%xmm2,32(%rdi)
1050bc3d5698SJohn Baldwin	movdqu	%xmm7,48(%rdi)
/*
 * ZF still reflects cmpq $64,%rdx from .Ltail4x (the SSE moves and pxor
 * above do not write EFLAGS): exactly 64 bytes means nothing is left.
 */
1051bc3d5698SJohn Baldwin	je	.Ldone4x
1052bc3d5698SJohn Baldwin
/*
 * Stage the next keystream block (16(%rsp), %xmm13, %xmm5, %xmm1) at
 * 0..63(%rsp) for .Loop_tail4x.  16(%rsp) is fetched first because that slot
 * is overwritten below.  Both pointers advance by 64, the byte index %r10 is
 * cleared and the count drops by the 64 bytes just written.
 */
1053bc3d5698SJohn Baldwin	movdqa	16(%rsp),%xmm6
1054bc3d5698SJohn Baldwin	leaq	64(%rsi),%rsi
1055bc3d5698SJohn Baldwin	xorq	%r10,%r10
1056bc3d5698SJohn Baldwin	movdqa	%xmm6,0(%rsp)
1057bc3d5698SJohn Baldwin	movdqa	%xmm13,16(%rsp)
1058bc3d5698SJohn Baldwin	leaq	64(%rdi),%rdi
1059bc3d5698SJohn Baldwin	movdqa	%xmm5,32(%rsp)
1060bc3d5698SJohn Baldwin	subq	$64,%rdx
1061bc3d5698SJohn Baldwin	movdqa	%xmm1,48(%rsp)
1062bc3d5698SJohn Baldwin	jmp	.Loop_tail4x
1063bc3d5698SJohn Baldwin
1064bc3d5698SJohn Baldwin.align	32
/*
 * .L128_or_more4x: 128..191 bytes remain.  XOR two whole 64-byte blocks,
 * then hand whatever is left to the byte loop.
 * Keystream: block 0 = 0(%rsp), %xmm12, %xmm4, %xmm0
 *            block 1 = 16(%rsp), %xmm13, %xmm5, %xmm1
 * Loads for block 1 are interleaved with the stores for block 0.
 */
1065bc3d5698SJohn Baldwin.L128_or_more4x:
1066bc3d5698SJohn Baldwin	movdqu	0(%rsi),%xmm6
1067bc3d5698SJohn Baldwin	movdqu	16(%rsi),%xmm11
1068bc3d5698SJohn Baldwin	movdqu	32(%rsi),%xmm2
1069bc3d5698SJohn Baldwin	movdqu	48(%rsi),%xmm7
1070bc3d5698SJohn Baldwin	pxor	0(%rsp),%xmm6
1071bc3d5698SJohn Baldwin	pxor	%xmm12,%xmm11
1072bc3d5698SJohn Baldwin	pxor	%xmm4,%xmm2
1073bc3d5698SJohn Baldwin	pxor	%xmm0,%xmm7
1074bc3d5698SJohn Baldwin
1075bc3d5698SJohn Baldwin	movdqu	%xmm6,0(%rdi)
1076bc3d5698SJohn Baldwin	movdqu	64(%rsi),%xmm6
1077bc3d5698SJohn Baldwin	movdqu	%xmm11,16(%rdi)
1078bc3d5698SJohn Baldwin	movdqu	80(%rsi),%xmm11
1079bc3d5698SJohn Baldwin	movdqu	%xmm2,32(%rdi)
1080bc3d5698SJohn Baldwin	movdqu	96(%rsi),%xmm2
1081bc3d5698SJohn Baldwin	movdqu	%xmm7,48(%rdi)
1082bc3d5698SJohn Baldwin	movdqu	112(%rsi),%xmm7
1083bc3d5698SJohn Baldwin	pxor	16(%rsp),%xmm6
1084bc3d5698SJohn Baldwin	pxor	%xmm13,%xmm11
1085bc3d5698SJohn Baldwin	pxor	%xmm5,%xmm2
1086bc3d5698SJohn Baldwin	pxor	%xmm1,%xmm7
1087bc3d5698SJohn Baldwin	movdqu	%xmm6,64(%rdi)
1088bc3d5698SJohn Baldwin	movdqu	%xmm11,80(%rdi)
1089bc3d5698SJohn Baldwin	movdqu	%xmm2,96(%rdi)
1090bc3d5698SJohn Baldwin	movdqu	%xmm7,112(%rdi)
/*
 * ZF still reflects cmpq $128,%rdx from .Ltail4x (nothing above writes
 * EFLAGS): exactly 128 bytes means nothing is left.
 */
1091bc3d5698SJohn Baldwin	je	.Ldone4x
1092bc3d5698SJohn Baldwin
/*
 * Stage the next keystream block (32(%rsp), %xmm10, %xmm14, %xmm8) at
 * 0..63(%rsp) for .Loop_tail4x.  32(%rsp) is fetched first because that slot
 * is overwritten below.  Both pointers advance by 128, the byte index %r10
 * is cleared and the count drops by the 128 bytes just written.
 */
1093bc3d5698SJohn Baldwin	movdqa	32(%rsp),%xmm6
1094bc3d5698SJohn Baldwin	leaq	128(%rsi),%rsi
1095bc3d5698SJohn Baldwin	xorq	%r10,%r10
1096bc3d5698SJohn Baldwin	movdqa	%xmm6,0(%rsp)
1097bc3d5698SJohn Baldwin	movdqa	%xmm10,16(%rsp)
1098bc3d5698SJohn Baldwin	leaq	128(%rdi),%rdi
1099bc3d5698SJohn Baldwin	movdqa	%xmm14,32(%rsp)
1100bc3d5698SJohn Baldwin	subq	$128,%rdx
1101bc3d5698SJohn Baldwin	movdqa	%xmm8,48(%rsp)
1102bc3d5698SJohn Baldwin	jmp	.Loop_tail4x
1103bc3d5698SJohn Baldwin
1104bc3d5698SJohn Baldwin.align	32
/*
 * .L192_or_more4x: 192..255 bytes remain.  XOR three whole 64-byte blocks,
 * then fall through into the byte loop for whatever is left.
 * Keystream: block 0 = 0(%rsp), %xmm12, %xmm4, %xmm0
 *            block 1 = 16(%rsp), %xmm13, %xmm5, %xmm1
 *            block 2 = 32(%rsp), %xmm10, %xmm14, %xmm8
 * Loads for the next block are interleaved with stores for the current one.
 */
1105bc3d5698SJohn Baldwin.L192_or_more4x:
1106bc3d5698SJohn Baldwin	movdqu	0(%rsi),%xmm6
1107bc3d5698SJohn Baldwin	movdqu	16(%rsi),%xmm11
1108bc3d5698SJohn Baldwin	movdqu	32(%rsi),%xmm2
1109bc3d5698SJohn Baldwin	movdqu	48(%rsi),%xmm7
1110bc3d5698SJohn Baldwin	pxor	0(%rsp),%xmm6
1111bc3d5698SJohn Baldwin	pxor	%xmm12,%xmm11
1112bc3d5698SJohn Baldwin	pxor	%xmm4,%xmm2
1113bc3d5698SJohn Baldwin	pxor	%xmm0,%xmm7
1114bc3d5698SJohn Baldwin
1115bc3d5698SJohn Baldwin	movdqu	%xmm6,0(%rdi)
1116bc3d5698SJohn Baldwin	movdqu	64(%rsi),%xmm6
1117bc3d5698SJohn Baldwin	movdqu	%xmm11,16(%rdi)
1118bc3d5698SJohn Baldwin	movdqu	80(%rsi),%xmm11
1119bc3d5698SJohn Baldwin	movdqu	%xmm2,32(%rdi)
1120bc3d5698SJohn Baldwin	movdqu	96(%rsi),%xmm2
1121bc3d5698SJohn Baldwin	movdqu	%xmm7,48(%rdi)
1122bc3d5698SJohn Baldwin	movdqu	112(%rsi),%xmm7
/* Input pointer moves on by 128; leaq leaves EFLAGS untouched. */
1123bc3d5698SJohn Baldwin	leaq	128(%rsi),%rsi
1124bc3d5698SJohn Baldwin	pxor	16(%rsp),%xmm6
1125bc3d5698SJohn Baldwin	pxor	%xmm13,%xmm11
1126bc3d5698SJohn Baldwin	pxor	%xmm5,%xmm2
1127bc3d5698SJohn Baldwin	pxor	%xmm1,%xmm7
1128bc3d5698SJohn Baldwin
1129bc3d5698SJohn Baldwin	movdqu	%xmm6,64(%rdi)
1130bc3d5698SJohn Baldwin	movdqu	0(%rsi),%xmm6
1131bc3d5698SJohn Baldwin	movdqu	%xmm11,80(%rdi)
1132bc3d5698SJohn Baldwin	movdqu	16(%rsi),%xmm11
1133bc3d5698SJohn Baldwin	movdqu	%xmm2,96(%rdi)
1134bc3d5698SJohn Baldwin	movdqu	32(%rsi),%xmm2
1135bc3d5698SJohn Baldwin	movdqu	%xmm7,112(%rdi)
/* Output pointer moves on by 128 as well. */
1136bc3d5698SJohn Baldwin	leaq	128(%rdi),%rdi
1137bc3d5698SJohn Baldwin	movdqu	48(%rsi),%xmm7
1138bc3d5698SJohn Baldwin	pxor	32(%rsp),%xmm6
1139bc3d5698SJohn Baldwin	pxor	%xmm10,%xmm11
1140bc3d5698SJohn Baldwin	pxor	%xmm14,%xmm2
1141bc3d5698SJohn Baldwin	pxor	%xmm8,%xmm7
1142bc3d5698SJohn Baldwin	movdqu	%xmm6,0(%rdi)
1143bc3d5698SJohn Baldwin	movdqu	%xmm11,16(%rdi)
1144bc3d5698SJohn Baldwin	movdqu	%xmm2,32(%rdi)
1145bc3d5698SJohn Baldwin	movdqu	%xmm7,48(%rdi)
/*
 * ZF still reflects cmpq $192,%rdx from .Ltail4x (nothing above writes
 * EFLAGS): exactly 192 bytes means nothing is left.
 */
1146bc3d5698SJohn Baldwin	je	.Ldone4x
1147bc3d5698SJohn Baldwin
/*
 * Stage the last keystream block (48(%rsp), %xmm15, %xmm9, %xmm3) at
 * 0..63(%rsp).  48(%rsp) is fetched first because that slot is overwritten
 * below.  The extra 64 brings both pointers to +192 in total; the byte index
 * %r10 is cleared and the count drops by the 192 bytes just written.
 * Execution then falls through into .Loop_tail4x.
 */
1148bc3d5698SJohn Baldwin	movdqa	48(%rsp),%xmm6
1149bc3d5698SJohn Baldwin	leaq	64(%rsi),%rsi
1150bc3d5698SJohn Baldwin	xorq	%r10,%r10
1151bc3d5698SJohn Baldwin	movdqa	%xmm6,0(%rsp)
1152bc3d5698SJohn Baldwin	movdqa	%xmm15,16(%rsp)
1153bc3d5698SJohn Baldwin	leaq	64(%rdi),%rdi
1154bc3d5698SJohn Baldwin	movdqa	%xmm9,32(%rsp)
1155bc3d5698SJohn Baldwin	subq	$192,%rdx
1156bc3d5698SJohn Baldwin	movdqa	%xmm3,48(%rsp)
1157bc3d5698SJohn Baldwin
/*
 * .Loop_tail4x: byte-at-a-time tail.  For i = 0 .. %rdx-1:
 *     (%rdi)[i] = (%rsi)[i] ^ (%rsp)[i]
 * %r10 is the index i (cleared by every path that jumps or falls in here),
 * %rdx the number of bytes left, and the stack holds the staged keystream
 * block.  %eax and %ecx are scratch, so the %rcx value used earlier as a
 * base for -256(%rcx) addressing is gone after the first iteration.
 * The count is tested only after a byte has been processed, so the loop
 * must be entered with %rdx != 0.
 */
1158bc3d5698SJohn Baldwin.Loop_tail4x:
1159bc3d5698SJohn Baldwin	movzbl	(%rsi,%r10,1),%eax
1160bc3d5698SJohn Baldwin	movzbl	(%rsp,%r10,1),%ecx
/* Index is bumped before the store, hence the -1 displacement below. */
1161bc3d5698SJohn Baldwin	leaq	1(%r10),%r10
1162bc3d5698SJohn Baldwin	xorl	%ecx,%eax
1163bc3d5698SJohn Baldwin	movb	%al,-1(%rdi,%r10,1)
1164bc3d5698SJohn Baldwin	decq	%rdx
1165bc3d5698SJohn Baldwin	jnz	.Loop_tail4x
1166bc3d5698SJohn Baldwin
/*
 * .Ldone4x: common exit of ChaCha20_4x.  The stack pointer is reloaded from
 * %r9 -- presumably the value of %rsp saved at function entry, which is
 * outside this view (ChaCha20_4xop below does movq %rsp,%r9 in its prologue);
 * confirm against the prologue.  The CFI directive tells the unwinder that
 * the CFA is computed from %rsp again from this point on.
 */
1167bc3d5698SJohn Baldwin.Ldone4x:
1168bc3d5698SJohn Baldwin	leaq	(%r9),%rsp
1169bc3d5698SJohn Baldwin.cfi_def_cfa_register	%rsp
1170bc3d5698SJohn Baldwin.L4x_epilogue:
/* 0xf3,0xc3 is the two-byte `rep ret` encoding of the return instruction. */
1171bc3d5698SJohn Baldwin	.byte	0xf3,0xc3
1172bc3d5698SJohn Baldwin.cfi_endproc
1173bc3d5698SJohn Baldwin.size	ChaCha20_4x,.-ChaCha20_4x
1174bc3d5698SJohn Baldwin.type	ChaCha20_4xop,@function
1175bc3d5698SJohn Baldwin.align	32
1176bc3d5698SJohn BaldwinChaCha20_4xop:
1177bc3d5698SJohn Baldwin.cfi_startproc
1178bc3d5698SJohn Baldwin.LChaCha20_4xop:
1179bc3d5698SJohn Baldwin	movq	%rsp,%r9
1180bc3d5698SJohn Baldwin.cfi_def_cfa_register	%r9
1181bc3d5698SJohn Baldwin	subq	$0x140+8,%rsp
1182bc3d5698SJohn Baldwin	vzeroupper
1183bc3d5698SJohn Baldwin
1184bc3d5698SJohn Baldwin	vmovdqa	.Lsigma(%rip),%xmm11
1185bc3d5698SJohn Baldwin	vmovdqu	(%rcx),%xmm3
1186bc3d5698SJohn Baldwin	vmovdqu	16(%rcx),%xmm15
1187bc3d5698SJohn Baldwin	vmovdqu	(%r8),%xmm7
1188bc3d5698SJohn Baldwin	leaq	256(%rsp),%rcx
1189bc3d5698SJohn Baldwin
1190bc3d5698SJohn Baldwin	vpshufd	$0x00,%xmm11,%xmm8
1191bc3d5698SJohn Baldwin	vpshufd	$0x55,%xmm11,%xmm9
1192bc3d5698SJohn Baldwin	vmovdqa	%xmm8,64(%rsp)
1193bc3d5698SJohn Baldwin	vpshufd	$0xaa,%xmm11,%xmm10
1194bc3d5698SJohn Baldwin	vmovdqa	%xmm9,80(%rsp)
1195bc3d5698SJohn Baldwin	vpshufd	$0xff,%xmm11,%xmm11
1196bc3d5698SJohn Baldwin	vmovdqa	%xmm10,96(%rsp)
1197bc3d5698SJohn Baldwin	vmovdqa	%xmm11,112(%rsp)
1198bc3d5698SJohn Baldwin
1199bc3d5698SJohn Baldwin	vpshufd	$0x00,%xmm3,%xmm0
1200bc3d5698SJohn Baldwin	vpshufd	$0x55,%xmm3,%xmm1
1201bc3d5698SJohn Baldwin	vmovdqa	%xmm0,128-256(%rcx)
1202bc3d5698SJohn Baldwin	vpshufd	$0xaa,%xmm3,%xmm2
1203bc3d5698SJohn Baldwin	vmovdqa	%xmm1,144-256(%rcx)
1204bc3d5698SJohn Baldwin	vpshufd	$0xff,%xmm3,%xmm3
1205bc3d5698SJohn Baldwin	vmovdqa	%xmm2,160-256(%rcx)
1206bc3d5698SJohn Baldwin	vmovdqa	%xmm3,176-256(%rcx)
1207bc3d5698SJohn Baldwin
1208bc3d5698SJohn Baldwin	vpshufd	$0x00,%xmm15,%xmm12
1209bc3d5698SJohn Baldwin	vpshufd	$0x55,%xmm15,%xmm13
1210bc3d5698SJohn Baldwin	vmovdqa	%xmm12,192-256(%rcx)
1211bc3d5698SJohn Baldwin	vpshufd	$0xaa,%xmm15,%xmm14
1212bc3d5698SJohn Baldwin	vmovdqa	%xmm13,208-256(%rcx)
1213bc3d5698SJohn Baldwin	vpshufd	$0xff,%xmm15,%xmm15
1214bc3d5698SJohn Baldwin	vmovdqa	%xmm14,224-256(%rcx)
1215bc3d5698SJohn Baldwin	vmovdqa	%xmm15,240-256(%rcx)
1216bc3d5698SJohn Baldwin
1217bc3d5698SJohn Baldwin	vpshufd	$0x00,%xmm7,%xmm4
1218bc3d5698SJohn Baldwin	vpshufd	$0x55,%xmm7,%xmm5
1219bc3d5698SJohn Baldwin	vpaddd	.Linc(%rip),%xmm4,%xmm4
1220bc3d5698SJohn Baldwin	vpshufd	$0xaa,%xmm7,%xmm6
1221bc3d5698SJohn Baldwin	vmovdqa	%xmm5,272-256(%rcx)
1222bc3d5698SJohn Baldwin	vpshufd	$0xff,%xmm7,%xmm7
1223bc3d5698SJohn Baldwin	vmovdqa	%xmm6,288-256(%rcx)
1224bc3d5698SJohn Baldwin	vmovdqa	%xmm7,304-256(%rcx)
1225bc3d5698SJohn Baldwin
1226bc3d5698SJohn Baldwin	jmp	.Loop_enter4xop
1227bc3d5698SJohn Baldwin
1228bc3d5698SJohn Baldwin.align	32
1229bc3d5698SJohn Baldwin.Loop_outer4xop:
1230bc3d5698SJohn Baldwin	vmovdqa	64(%rsp),%xmm8
1231bc3d5698SJohn Baldwin	vmovdqa	80(%rsp),%xmm9
1232bc3d5698SJohn Baldwin	vmovdqa	96(%rsp),%xmm10
1233bc3d5698SJohn Baldwin	vmovdqa	112(%rsp),%xmm11
1234bc3d5698SJohn Baldwin	vmovdqa	128-256(%rcx),%xmm0
1235bc3d5698SJohn Baldwin	vmovdqa	144-256(%rcx),%xmm1
1236bc3d5698SJohn Baldwin	vmovdqa	160-256(%rcx),%xmm2
1237bc3d5698SJohn Baldwin	vmovdqa	176-256(%rcx),%xmm3
1238bc3d5698SJohn Baldwin	vmovdqa	192-256(%rcx),%xmm12
1239bc3d5698SJohn Baldwin	vmovdqa	208-256(%rcx),%xmm13
1240bc3d5698SJohn Baldwin	vmovdqa	224-256(%rcx),%xmm14
1241bc3d5698SJohn Baldwin	vmovdqa	240-256(%rcx),%xmm15
1242bc3d5698SJohn Baldwin	vmovdqa	256-256(%rcx),%xmm4
1243bc3d5698SJohn Baldwin	vmovdqa	272-256(%rcx),%xmm5
1244bc3d5698SJohn Baldwin	vmovdqa	288-256(%rcx),%xmm6
1245bc3d5698SJohn Baldwin	vmovdqa	304-256(%rcx),%xmm7
1246bc3d5698SJohn Baldwin	vpaddd	.Lfour(%rip),%xmm4,%xmm4
1247bc3d5698SJohn Baldwin
1248bc3d5698SJohn Baldwin.Loop_enter4xop:
1249bc3d5698SJohn Baldwin	movl	$10,%eax
1250bc3d5698SJohn Baldwin	vmovdqa	%xmm4,256-256(%rcx)
1251bc3d5698SJohn Baldwin	jmp	.Loop4xop
1252bc3d5698SJohn Baldwin
1253bc3d5698SJohn Baldwin.align	32
1254bc3d5698SJohn Baldwin.Loop4xop:
1255bc3d5698SJohn Baldwin	vpaddd	%xmm0,%xmm8,%xmm8
1256bc3d5698SJohn Baldwin	vpaddd	%xmm1,%xmm9,%xmm9
1257bc3d5698SJohn Baldwin	vpaddd	%xmm2,%xmm10,%xmm10
1258bc3d5698SJohn Baldwin	vpaddd	%xmm3,%xmm11,%xmm11
1259bc3d5698SJohn Baldwin	vpxor	%xmm4,%xmm8,%xmm4
1260bc3d5698SJohn Baldwin	vpxor	%xmm5,%xmm9,%xmm5
1261bc3d5698SJohn Baldwin	vpxor	%xmm6,%xmm10,%xmm6
1262bc3d5698SJohn Baldwin	vpxor	%xmm7,%xmm11,%xmm7
1263bc3d5698SJohn Baldwin.byte	143,232,120,194,228,16
1264bc3d5698SJohn Baldwin.byte	143,232,120,194,237,16
1265bc3d5698SJohn Baldwin.byte	143,232,120,194,246,16
1266bc3d5698SJohn Baldwin.byte	143,232,120,194,255,16
1267bc3d5698SJohn Baldwin	vpaddd	%xmm4,%xmm12,%xmm12
1268bc3d5698SJohn Baldwin	vpaddd	%xmm5,%xmm13,%xmm13
1269bc3d5698SJohn Baldwin	vpaddd	%xmm6,%xmm14,%xmm14
1270bc3d5698SJohn Baldwin	vpaddd	%xmm7,%xmm15,%xmm15
1271bc3d5698SJohn Baldwin	vpxor	%xmm0,%xmm12,%xmm0
1272bc3d5698SJohn Baldwin	vpxor	%xmm1,%xmm13,%xmm1
1273bc3d5698SJohn Baldwin	vpxor	%xmm14,%xmm2,%xmm2
1274bc3d5698SJohn Baldwin	vpxor	%xmm15,%xmm3,%xmm3
1275bc3d5698SJohn Baldwin.byte	143,232,120,194,192,12
1276bc3d5698SJohn Baldwin.byte	143,232,120,194,201,12
1277bc3d5698SJohn Baldwin.byte	143,232,120,194,210,12
1278bc3d5698SJohn Baldwin.byte	143,232,120,194,219,12
1279bc3d5698SJohn Baldwin	vpaddd	%xmm8,%xmm0,%xmm8
1280bc3d5698SJohn Baldwin	vpaddd	%xmm9,%xmm1,%xmm9
1281bc3d5698SJohn Baldwin	vpaddd	%xmm2,%xmm10,%xmm10
1282bc3d5698SJohn Baldwin	vpaddd	%xmm3,%xmm11,%xmm11
1283bc3d5698SJohn Baldwin	vpxor	%xmm4,%xmm8,%xmm4
1284bc3d5698SJohn Baldwin	vpxor	%xmm5,%xmm9,%xmm5
1285bc3d5698SJohn Baldwin	vpxor	%xmm6,%xmm10,%xmm6
1286bc3d5698SJohn Baldwin	vpxor	%xmm7,%xmm11,%xmm7
1287bc3d5698SJohn Baldwin.byte	143,232,120,194,228,8
1288bc3d5698SJohn Baldwin.byte	143,232,120,194,237,8
1289bc3d5698SJohn Baldwin.byte	143,232,120,194,246,8
1290bc3d5698SJohn Baldwin.byte	143,232,120,194,255,8
1291bc3d5698SJohn Baldwin	vpaddd	%xmm4,%xmm12,%xmm12
1292bc3d5698SJohn Baldwin	vpaddd	%xmm5,%xmm13,%xmm13
1293bc3d5698SJohn Baldwin	vpaddd	%xmm6,%xmm14,%xmm14
1294bc3d5698SJohn Baldwin	vpaddd	%xmm7,%xmm15,%xmm15
1295bc3d5698SJohn Baldwin	vpxor	%xmm0,%xmm12,%xmm0
1296bc3d5698SJohn Baldwin	vpxor	%xmm1,%xmm13,%xmm1
1297bc3d5698SJohn Baldwin	vpxor	%xmm14,%xmm2,%xmm2
1298bc3d5698SJohn Baldwin	vpxor	%xmm15,%xmm3,%xmm3
1299bc3d5698SJohn Baldwin.byte	143,232,120,194,192,7
1300bc3d5698SJohn Baldwin.byte	143,232,120,194,201,7
1301bc3d5698SJohn Baldwin.byte	143,232,120,194,210,7
1302bc3d5698SJohn Baldwin.byte	143,232,120,194,219,7
1303bc3d5698SJohn Baldwin	vpaddd	%xmm1,%xmm8,%xmm8
1304bc3d5698SJohn Baldwin	vpaddd	%xmm2,%xmm9,%xmm9
1305bc3d5698SJohn Baldwin	vpaddd	%xmm3,%xmm10,%xmm10
1306bc3d5698SJohn Baldwin	vpaddd	%xmm0,%xmm11,%xmm11
1307bc3d5698SJohn Baldwin	vpxor	%xmm7,%xmm8,%xmm7
1308bc3d5698SJohn Baldwin	vpxor	%xmm4,%xmm9,%xmm4
1309bc3d5698SJohn Baldwin	vpxor	%xmm5,%xmm10,%xmm5
1310bc3d5698SJohn Baldwin	vpxor	%xmm6,%xmm11,%xmm6
1311bc3d5698SJohn Baldwin.byte	143,232,120,194,255,16
1312bc3d5698SJohn Baldwin.byte	143,232,120,194,228,16
1313bc3d5698SJohn Baldwin.byte	143,232,120,194,237,16
1314bc3d5698SJohn Baldwin.byte	143,232,120,194,246,16
1315bc3d5698SJohn Baldwin	vpaddd	%xmm7,%xmm14,%xmm14
1316bc3d5698SJohn Baldwin	vpaddd	%xmm4,%xmm15,%xmm15
1317bc3d5698SJohn Baldwin	vpaddd	%xmm5,%xmm12,%xmm12
1318bc3d5698SJohn Baldwin	vpaddd	%xmm6,%xmm13,%xmm13
1319bc3d5698SJohn Baldwin	vpxor	%xmm1,%xmm14,%xmm1
1320bc3d5698SJohn Baldwin	vpxor	%xmm2,%xmm15,%xmm2
1321bc3d5698SJohn Baldwin	vpxor	%xmm12,%xmm3,%xmm3
1322bc3d5698SJohn Baldwin	vpxor	%xmm13,%xmm0,%xmm0
1323bc3d5698SJohn Baldwin.byte	143,232,120,194,201,12
1324bc3d5698SJohn Baldwin.byte	143,232,120,194,210,12
1325bc3d5698SJohn Baldwin.byte	143,232,120,194,219,12
1326bc3d5698SJohn Baldwin.byte	143,232,120,194,192,12
1327bc3d5698SJohn Baldwin	vpaddd	%xmm8,%xmm1,%xmm8
1328bc3d5698SJohn Baldwin	vpaddd	%xmm9,%xmm2,%xmm9
1329bc3d5698SJohn Baldwin	vpaddd	%xmm3,%xmm10,%xmm10
1330bc3d5698SJohn Baldwin	vpaddd	%xmm0,%xmm11,%xmm11
1331bc3d5698SJohn Baldwin	vpxor	%xmm7,%xmm8,%xmm7
1332bc3d5698SJohn Baldwin	vpxor	%xmm4,%xmm9,%xmm4
1333bc3d5698SJohn Baldwin	vpxor	%xmm5,%xmm10,%xmm5
1334bc3d5698SJohn Baldwin	vpxor	%xmm6,%xmm11,%xmm6
1335bc3d5698SJohn Baldwin.byte	143,232,120,194,255,8
1336bc3d5698SJohn Baldwin.byte	143,232,120,194,228,8
1337bc3d5698SJohn Baldwin.byte	143,232,120,194,237,8
1338bc3d5698SJohn Baldwin.byte	143,232,120,194,246,8
1339bc3d5698SJohn Baldwin	vpaddd	%xmm7,%xmm14,%xmm14
1340bc3d5698SJohn Baldwin	vpaddd	%xmm4,%xmm15,%xmm15
1341bc3d5698SJohn Baldwin	vpaddd	%xmm5,%xmm12,%xmm12
1342bc3d5698SJohn Baldwin	vpaddd	%xmm6,%xmm13,%xmm13
1343bc3d5698SJohn Baldwin	vpxor	%xmm1,%xmm14,%xmm1
1344bc3d5698SJohn Baldwin	vpxor	%xmm2,%xmm15,%xmm2
1345bc3d5698SJohn Baldwin	vpxor	%xmm12,%xmm3,%xmm3
1346bc3d5698SJohn Baldwin	vpxor	%xmm13,%xmm0,%xmm0
1347bc3d5698SJohn Baldwin.byte	143,232,120,194,201,7
1348bc3d5698SJohn Baldwin.byte	143,232,120,194,210,7
1349bc3d5698SJohn Baldwin.byte	143,232,120,194,219,7
1350bc3d5698SJohn Baldwin.byte	143,232,120,194,192,7
1351bc3d5698SJohn Baldwin	decl	%eax
1352bc3d5698SJohn Baldwin	jnz	.Loop4xop
1353bc3d5698SJohn Baldwin
1354bc3d5698SJohn Baldwin	vpaddd	64(%rsp),%xmm8,%xmm8
1355bc3d5698SJohn Baldwin	vpaddd	80(%rsp),%xmm9,%xmm9
1356bc3d5698SJohn Baldwin	vpaddd	96(%rsp),%xmm10,%xmm10
1357bc3d5698SJohn Baldwin	vpaddd	112(%rsp),%xmm11,%xmm11
1358bc3d5698SJohn Baldwin
1359bc3d5698SJohn Baldwin	vmovdqa	%xmm14,32(%rsp)
1360bc3d5698SJohn Baldwin	vmovdqa	%xmm15,48(%rsp)
1361bc3d5698SJohn Baldwin
1362bc3d5698SJohn Baldwin	vpunpckldq	%xmm9,%xmm8,%xmm14
1363bc3d5698SJohn Baldwin	vpunpckldq	%xmm11,%xmm10,%xmm15
1364bc3d5698SJohn Baldwin	vpunpckhdq	%xmm9,%xmm8,%xmm8
1365bc3d5698SJohn Baldwin	vpunpckhdq	%xmm11,%xmm10,%xmm10
1366bc3d5698SJohn Baldwin	vpunpcklqdq	%xmm15,%xmm14,%xmm9
1367bc3d5698SJohn Baldwin	vpunpckhqdq	%xmm15,%xmm14,%xmm14
1368bc3d5698SJohn Baldwin	vpunpcklqdq	%xmm10,%xmm8,%xmm11
1369bc3d5698SJohn Baldwin	vpunpckhqdq	%xmm10,%xmm8,%xmm8
1370bc3d5698SJohn Baldwin	vpaddd	128-256(%rcx),%xmm0,%xmm0
1371bc3d5698SJohn Baldwin	vpaddd	144-256(%rcx),%xmm1,%xmm1
1372bc3d5698SJohn Baldwin	vpaddd	160-256(%rcx),%xmm2,%xmm2
1373bc3d5698SJohn Baldwin	vpaddd	176-256(%rcx),%xmm3,%xmm3
1374bc3d5698SJohn Baldwin
1375bc3d5698SJohn Baldwin	vmovdqa	%xmm9,0(%rsp)
1376bc3d5698SJohn Baldwin	vmovdqa	%xmm14,16(%rsp)
1377bc3d5698SJohn Baldwin	vmovdqa	32(%rsp),%xmm9
1378bc3d5698SJohn Baldwin	vmovdqa	48(%rsp),%xmm14
1379bc3d5698SJohn Baldwin
1380bc3d5698SJohn Baldwin	vpunpckldq	%xmm1,%xmm0,%xmm10
1381bc3d5698SJohn Baldwin	vpunpckldq	%xmm3,%xmm2,%xmm15
1382bc3d5698SJohn Baldwin	vpunpckhdq	%xmm1,%xmm0,%xmm0
1383bc3d5698SJohn Baldwin	vpunpckhdq	%xmm3,%xmm2,%xmm2
1384bc3d5698SJohn Baldwin	vpunpcklqdq	%xmm15,%xmm10,%xmm1
1385bc3d5698SJohn Baldwin	vpunpckhqdq	%xmm15,%xmm10,%xmm10
1386bc3d5698SJohn Baldwin	vpunpcklqdq	%xmm2,%xmm0,%xmm3
1387bc3d5698SJohn Baldwin	vpunpckhqdq	%xmm2,%xmm0,%xmm0
1388bc3d5698SJohn Baldwin	vpaddd	192-256(%rcx),%xmm12,%xmm12
1389bc3d5698SJohn Baldwin	vpaddd	208-256(%rcx),%xmm13,%xmm13
1390bc3d5698SJohn Baldwin	vpaddd	224-256(%rcx),%xmm9,%xmm9
1391bc3d5698SJohn Baldwin	vpaddd	240-256(%rcx),%xmm14,%xmm14
1392bc3d5698SJohn Baldwin
1393bc3d5698SJohn Baldwin	vpunpckldq	%xmm13,%xmm12,%xmm2
1394bc3d5698SJohn Baldwin	vpunpckldq	%xmm14,%xmm9,%xmm15
1395bc3d5698SJohn Baldwin	vpunpckhdq	%xmm13,%xmm12,%xmm12
1396bc3d5698SJohn Baldwin	vpunpckhdq	%xmm14,%xmm9,%xmm9
1397bc3d5698SJohn Baldwin	vpunpcklqdq	%xmm15,%xmm2,%xmm13
1398bc3d5698SJohn Baldwin	vpunpckhqdq	%xmm15,%xmm2,%xmm2
1399bc3d5698SJohn Baldwin	vpunpcklqdq	%xmm9,%xmm12,%xmm14
1400bc3d5698SJohn Baldwin	vpunpckhqdq	%xmm9,%xmm12,%xmm12
1401bc3d5698SJohn Baldwin	vpaddd	256-256(%rcx),%xmm4,%xmm4
1402bc3d5698SJohn Baldwin	vpaddd	272-256(%rcx),%xmm5,%xmm5
1403bc3d5698SJohn Baldwin	vpaddd	288-256(%rcx),%xmm6,%xmm6
1404bc3d5698SJohn Baldwin	vpaddd	304-256(%rcx),%xmm7,%xmm7
1405bc3d5698SJohn Baldwin
1406bc3d5698SJohn Baldwin	vpunpckldq	%xmm5,%xmm4,%xmm9
1407bc3d5698SJohn Baldwin	vpunpckldq	%xmm7,%xmm6,%xmm15
1408bc3d5698SJohn Baldwin	vpunpckhdq	%xmm5,%xmm4,%xmm4
1409bc3d5698SJohn Baldwin	vpunpckhdq	%xmm7,%xmm6,%xmm6
1410bc3d5698SJohn Baldwin	vpunpcklqdq	%xmm15,%xmm9,%xmm5
1411bc3d5698SJohn Baldwin	vpunpckhqdq	%xmm15,%xmm9,%xmm9
1412bc3d5698SJohn Baldwin	vpunpcklqdq	%xmm6,%xmm4,%xmm7
1413bc3d5698SJohn Baldwin	vpunpckhqdq	%xmm6,%xmm4,%xmm4
1414bc3d5698SJohn Baldwin	vmovdqa	0(%rsp),%xmm6
1415bc3d5698SJohn Baldwin	vmovdqa	16(%rsp),%xmm15
1416bc3d5698SJohn Baldwin
1417bc3d5698SJohn Baldwin	cmpq	$256,%rdx
1418bc3d5698SJohn Baldwin	jb	.Ltail4xop
1419bc3d5698SJohn Baldwin
1420bc3d5698SJohn Baldwin	vpxor	0(%rsi),%xmm6,%xmm6
1421bc3d5698SJohn Baldwin	vpxor	16(%rsi),%xmm1,%xmm1
1422bc3d5698SJohn Baldwin	vpxor	32(%rsi),%xmm13,%xmm13
1423bc3d5698SJohn Baldwin	vpxor	48(%rsi),%xmm5,%xmm5
1424bc3d5698SJohn Baldwin	vpxor	64(%rsi),%xmm15,%xmm15
1425bc3d5698SJohn Baldwin	vpxor	80(%rsi),%xmm10,%xmm10
1426bc3d5698SJohn Baldwin	vpxor	96(%rsi),%xmm2,%xmm2
1427bc3d5698SJohn Baldwin	vpxor	112(%rsi),%xmm9,%xmm9
1428bc3d5698SJohn Baldwin	leaq	128(%rsi),%rsi
1429bc3d5698SJohn Baldwin	vpxor	0(%rsi),%xmm11,%xmm11
1430bc3d5698SJohn Baldwin	vpxor	16(%rsi),%xmm3,%xmm3
1431bc3d5698SJohn Baldwin	vpxor	32(%rsi),%xmm14,%xmm14
1432bc3d5698SJohn Baldwin	vpxor	48(%rsi),%xmm7,%xmm7
1433bc3d5698SJohn Baldwin	vpxor	64(%rsi),%xmm8,%xmm8
1434bc3d5698SJohn Baldwin	vpxor	80(%rsi),%xmm0,%xmm0
1435bc3d5698SJohn Baldwin	vpxor	96(%rsi),%xmm12,%xmm12
1436bc3d5698SJohn Baldwin	vpxor	112(%rsi),%xmm4,%xmm4
1437bc3d5698SJohn Baldwin	leaq	128(%rsi),%rsi
1438bc3d5698SJohn Baldwin
1439bc3d5698SJohn Baldwin	vmovdqu	%xmm6,0(%rdi)
1440bc3d5698SJohn Baldwin	vmovdqu	%xmm1,16(%rdi)
1441bc3d5698SJohn Baldwin	vmovdqu	%xmm13,32(%rdi)
1442bc3d5698SJohn Baldwin	vmovdqu	%xmm5,48(%rdi)
1443bc3d5698SJohn Baldwin	vmovdqu	%xmm15,64(%rdi)
1444bc3d5698SJohn Baldwin	vmovdqu	%xmm10,80(%rdi)
1445bc3d5698SJohn Baldwin	vmovdqu	%xmm2,96(%rdi)
1446bc3d5698SJohn Baldwin	vmovdqu	%xmm9,112(%rdi)
1447bc3d5698SJohn Baldwin	leaq	128(%rdi),%rdi
1448bc3d5698SJohn Baldwin	vmovdqu	%xmm11,0(%rdi)
1449bc3d5698SJohn Baldwin	vmovdqu	%xmm3,16(%rdi)
1450bc3d5698SJohn Baldwin	vmovdqu	%xmm14,32(%rdi)
1451bc3d5698SJohn Baldwin	vmovdqu	%xmm7,48(%rdi)
1452bc3d5698SJohn Baldwin	vmovdqu	%xmm8,64(%rdi)
1453bc3d5698SJohn Baldwin	vmovdqu	%xmm0,80(%rdi)
1454bc3d5698SJohn Baldwin	vmovdqu	%xmm12,96(%rdi)
1455bc3d5698SJohn Baldwin	vmovdqu	%xmm4,112(%rdi)
1456bc3d5698SJohn Baldwin	leaq	128(%rdi),%rdi
1457bc3d5698SJohn Baldwin
1458bc3d5698SJohn Baldwin	subq	$256,%rdx
1459bc3d5698SJohn Baldwin	jnz	.Loop_outer4xop
1460bc3d5698SJohn Baldwin
1461bc3d5698SJohn Baldwin	jmp	.Ldone4xop
1462bc3d5698SJohn Baldwin
1463bc3d5698SJohn Baldwin.align	32
/*
 * .Ltail4xop: final partial chunk of the 4-way XOP path.
 * Reached by `jb` after `cmpq $256,%rdx`, so %rdx (remaining byte count)
 * is below 256 here.  The sixteen xmm registers hold the 256 bytes that the
 * full-block path XORs with the input, in 16-byte pieces.
 * Dispatch on how many whole 64-byte groups remain; all compares are
 * unsigned (jae).
 */
1464bc3d5698SJohn Baldwin.Ltail4xop:
1465bc3d5698SJohn Baldwin	cmpq	$192,%rdx
1466bc3d5698SJohn Baldwin	jae	.L192_or_more4xop
1467bc3d5698SJohn Baldwin	cmpq	$128,%rdx
1468bc3d5698SJohn Baldwin	jae	.L128_or_more4xop
1469bc3d5698SJohn Baldwin	cmpq	$64,%rdx
1470bc3d5698SJohn Baldwin	jae	.L64_or_more4xop
1471bc3d5698SJohn Baldwin
/*
 * Fewer than 64 bytes left: stage the four registers that the full-block
 * path XORs with input bytes 0..63 into 0..63(%rsp), clear the byte index
 * %r10, and finish byte by byte in .Loop_tail4xop.
 */
1472bc3d5698SJohn Baldwin	xorq	%r10,%r10
1473bc3d5698SJohn Baldwin	vmovdqa	%xmm6,0(%rsp)
1474bc3d5698SJohn Baldwin	vmovdqa	%xmm1,16(%rsp)
1475bc3d5698SJohn Baldwin	vmovdqa	%xmm13,32(%rsp)
1476bc3d5698SJohn Baldwin	vmovdqa	%xmm5,48(%rsp)
1477bc3d5698SJohn Baldwin	jmp	.Loop_tail4xop
1478bc3d5698SJohn Baldwin
1479bc3d5698SJohn Baldwin.align	32
/*
 * .L64_or_more4xop: 64 <= %rdx < 128.
 * XOR the first 64 input bytes at (%rsi) with xmm6/xmm1/xmm13/xmm5 and
 * store the result to (%rdi).
 */
1480bc3d5698SJohn Baldwin.L64_or_more4xop:
1481bc3d5698SJohn Baldwin	vpxor	0(%rsi),%xmm6,%xmm6
1482bc3d5698SJohn Baldwin	vpxor	16(%rsi),%xmm1,%xmm1
1483bc3d5698SJohn Baldwin	vpxor	32(%rsi),%xmm13,%xmm13
1484bc3d5698SJohn Baldwin	vpxor	48(%rsi),%xmm5,%xmm5
1485bc3d5698SJohn Baldwin	vmovdqu	%xmm6,0(%rdi)
1486bc3d5698SJohn Baldwin	vmovdqu	%xmm1,16(%rdi)
1487bc3d5698SJohn Baldwin	vmovdqu	%xmm13,32(%rdi)
1488bc3d5698SJohn Baldwin	vmovdqu	%xmm5,48(%rdi)
/*
 * ZF still comes from `cmpq $64,%rdx` in .Ltail4xop: the vector
 * instructions above do not write RFLAGS.  Taken when exactly 64 bytes
 * remained, i.e. nothing is left to do.
 */
1489bc3d5698SJohn Baldwin	je	.Ldone4xop
1490bc3d5698SJohn Baldwin
/*
 * 1..63 bytes remain.  Advance both pointers past the 64 bytes just done,
 * stage the next four registers (the ones the full-block path XORs with
 * input bytes 64..127) at 0..63(%rsp), reset byte index %r10 and reduce
 * the count before entering the byte loop.
 */
1491bc3d5698SJohn Baldwin	leaq	64(%rsi),%rsi
1492bc3d5698SJohn Baldwin	vmovdqa	%xmm15,0(%rsp)
1493bc3d5698SJohn Baldwin	xorq	%r10,%r10
1494bc3d5698SJohn Baldwin	vmovdqa	%xmm10,16(%rsp)
1495bc3d5698SJohn Baldwin	leaq	64(%rdi),%rdi
1496bc3d5698SJohn Baldwin	vmovdqa	%xmm2,32(%rsp)
1497bc3d5698SJohn Baldwin	subq	$64,%rdx
1498bc3d5698SJohn Baldwin	vmovdqa	%xmm9,48(%rsp)
1499bc3d5698SJohn Baldwin	jmp	.Loop_tail4xop
1500bc3d5698SJohn Baldwin
1501bc3d5698SJohn Baldwin.align	32
1502bc3d5698SJohn Baldwin.L128_or_more4xop:
1503bc3d5698SJohn Baldwin	vpxor	0(%rsi),%xmm6,%xmm6
1504bc3d5698SJohn Baldwin	vpxor	16(%rsi),%xmm1,%xmm1
1505bc3d5698SJohn Baldwin	vpxor	32(%rsi),%xmm13,%xmm13
1506bc3d5698SJohn Baldwin	vpxor	48(%rsi),%xmm5,%xmm5
1507bc3d5698SJohn Baldwin	vpxor	64(%rsi),%xmm15,%xmm15
1508bc3d5698SJohn Baldwin	vpxor	80(%rsi),%xmm10,%xmm10
1509bc3d5698SJohn Baldwin	vpxor	96(%rsi),%xmm2,%xmm2
1510bc3d5698SJohn Baldwin	vpxor	112(%rsi),%xmm9,%xmm9
1511bc3d5698SJohn Baldwin
1512bc3d5698SJohn Baldwin	vmovdqu	%xmm6,0(%rdi)
1513bc3d5698SJohn Baldwin	vmovdqu	%xmm1,16(%rdi)
1514bc3d5698SJohn Baldwin	vmovdqu	%xmm13,32(%rdi)
1515bc3d5698SJohn Baldwin	vmovdqu	%xmm5,48(%rdi)
1516bc3d5698SJohn Baldwin	vmovdqu	%xmm15,64(%rdi)
1517bc3d5698SJohn Baldwin	vmovdqu	%xmm10,80(%rdi)
1518bc3d5698SJohn Baldwin	vmovdqu	%xmm2,96(%rdi)
1519bc3d5698SJohn Baldwin	vmovdqu	%xmm9,112(%rdi)
1520bc3d5698SJohn Baldwin	je	.Ldone4xop
1521bc3d5698SJohn Baldwin
1522bc3d5698SJohn Baldwin	leaq	128(%rsi),%rsi
1523bc3d5698SJohn Baldwin	vmovdqa	%xmm11,0(%rsp)
1524bc3d5698SJohn Baldwin	xorq	%r10,%r10
1525bc3d5698SJohn Baldwin	vmovdqa	%xmm3,16(%rsp)
1526bc3d5698SJohn Baldwin	leaq	128(%rdi),%rdi
1527bc3d5698SJohn Baldwin	vmovdqa	%xmm14,32(%rsp)
1528bc3d5698SJohn Baldwin	subq	$128,%rdx
1529bc3d5698SJohn Baldwin	vmovdqa	%xmm7,48(%rsp)
1530bc3d5698SJohn Baldwin	jmp	.Loop_tail4xop
1531bc3d5698SJohn Baldwin
1532bc3d5698SJohn Baldwin.align	32
1533bc3d5698SJohn Baldwin.L192_or_more4xop:
1534bc3d5698SJohn Baldwin	vpxor	0(%rsi),%xmm6,%xmm6
1535bc3d5698SJohn Baldwin	vpxor	16(%rsi),%xmm1,%xmm1
1536bc3d5698SJohn Baldwin	vpxor	32(%rsi),%xmm13,%xmm13
1537bc3d5698SJohn Baldwin	vpxor	48(%rsi),%xmm5,%xmm5
1538bc3d5698SJohn Baldwin	vpxor	64(%rsi),%xmm15,%xmm15
1539bc3d5698SJohn Baldwin	vpxor	80(%rsi),%xmm10,%xmm10
1540bc3d5698SJohn Baldwin	vpxor	96(%rsi),%xmm2,%xmm2
1541bc3d5698SJohn Baldwin	vpxor	112(%rsi),%xmm9,%xmm9
1542bc3d5698SJohn Baldwin	leaq	128(%rsi),%rsi
1543bc3d5698SJohn Baldwin	vpxor	0(%rsi),%xmm11,%xmm11
1544bc3d5698SJohn Baldwin	vpxor	16(%rsi),%xmm3,%xmm3
1545bc3d5698SJohn Baldwin	vpxor	32(%rsi),%xmm14,%xmm14
1546bc3d5698SJohn Baldwin	vpxor	48(%rsi),%xmm7,%xmm7
1547bc3d5698SJohn Baldwin
1548bc3d5698SJohn Baldwin	vmovdqu	%xmm6,0(%rdi)
1549bc3d5698SJohn Baldwin	vmovdqu	%xmm1,16(%rdi)
1550bc3d5698SJohn Baldwin	vmovdqu	%xmm13,32(%rdi)
1551bc3d5698SJohn Baldwin	vmovdqu	%xmm5,48(%rdi)
1552bc3d5698SJohn Baldwin	vmovdqu	%xmm15,64(%rdi)
1553bc3d5698SJohn Baldwin	vmovdqu	%xmm10,80(%rdi)
1554bc3d5698SJohn Baldwin	vmovdqu	%xmm2,96(%rdi)
1555bc3d5698SJohn Baldwin	vmovdqu	%xmm9,112(%rdi)
1556bc3d5698SJohn Baldwin	leaq	128(%rdi),%rdi
1557bc3d5698SJohn Baldwin	vmovdqu	%xmm11,0(%rdi)
1558bc3d5698SJohn Baldwin	vmovdqu	%xmm3,16(%rdi)
1559bc3d5698SJohn Baldwin	vmovdqu	%xmm14,32(%rdi)
1560bc3d5698SJohn Baldwin	vmovdqu	%xmm7,48(%rdi)
1561bc3d5698SJohn Baldwin	je	.Ldone4xop
1562bc3d5698SJohn Baldwin
1563bc3d5698SJohn Baldwin	leaq	64(%rsi),%rsi
1564bc3d5698SJohn Baldwin	vmovdqa	%xmm8,0(%rsp)
1565bc3d5698SJohn Baldwin	xorq	%r10,%r10
1566bc3d5698SJohn Baldwin	vmovdqa	%xmm0,16(%rsp)
1567bc3d5698SJohn Baldwin	leaq	64(%rdi),%rdi
1568bc3d5698SJohn Baldwin	vmovdqa	%xmm12,32(%rsp)
1569bc3d5698SJohn Baldwin	subq	$192,%rdx
1570bc3d5698SJohn Baldwin	vmovdqa	%xmm4,48(%rsp)
1571bc3d5698SJohn Baldwin
/*
 * .Loop_tail4xop: byte-at-a-time tail.
 * For i = %r10 (starting at 0): (%rdi)[i] = (%rsi)[i] ^ (%rsp)[i],
 * repeated %rdx times.  The bytes at (%rsp) were staged by the branch that
 * jumped or fell through here.  Overwrites %eax, %ecx, %r10 and %rdx.
 * NOTE(review): %rdx must be non-zero on entry, otherwise decq wraps; the
 * exact-multiple-of-64 cases visible in this file branch to .Ldone4xop
 * instead - confirm the initial-entry guarantee in the caller.
 */
1572bc3d5698SJohn Baldwin.Loop_tail4xop:
1573bc3d5698SJohn Baldwin	movzbl	(%rsi,%r10,1),%eax
1574bc3d5698SJohn Baldwin	movzbl	(%rsp,%r10,1),%ecx
/* leaq increments the index without touching flags; the store below
   compensates with a -1 displacement. */
1575bc3d5698SJohn Baldwin	leaq	1(%r10),%r10
1576bc3d5698SJohn Baldwin	xorl	%ecx,%eax
1577bc3d5698SJohn Baldwin	movb	%al,-1(%rdi,%r10,1)
1578bc3d5698SJohn Baldwin	decq	%rdx
1579bc3d5698SJohn Baldwin	jnz	.Loop_tail4xop
1580bc3d5698SJohn Baldwin
/*
 * .Ldone4xop: common exit of ChaCha20_4xop.
 * vzeroupper clears the upper halves of the ymm registers before returning.
 * %rsp is reloaded from %r9; presumably %r9 holds the entry %rsp saved by
 * the prologue (outside this view) - confirm.  The CFA is switched back to
 * %rsp to match.
 */
1581bc3d5698SJohn Baldwin.Ldone4xop:
1582bc3d5698SJohn Baldwin	vzeroupper
1583bc3d5698SJohn Baldwin	leaq	(%r9),%rsp
1584bc3d5698SJohn Baldwin.cfi_def_cfa_register	%rsp
1585bc3d5698SJohn Baldwin.L4xop_epilogue:
/* 0xf3,0xc3 is the encoding of `rep ret` (REP prefix + near RET). */
1586bc3d5698SJohn Baldwin	.byte	0xf3,0xc3
1587bc3d5698SJohn Baldwin.cfi_endproc
1588bc3d5698SJohn Baldwin.size	ChaCha20_4xop,.-ChaCha20_4xop
/*
 * ChaCha20_8x: 8-way AVX2 code path (file-local symbol, no .globl here).
 *
 * Registers as used by the visible code:
 *   %rdi  output pointer (stores)        %rsi  input pointer (loads)
 *   %rdx  remaining byte count
 *   %rcx  on entry: pointer to 32 bytes read as two 16-byte halves
 *         (presumably the key - confirm against the caller)
 *   %r8   on entry: pointer to 16 bytes (presumably counter + nonce - confirm)
 *   %r9   copy of the entry %rsp, used as CFA register
 *
 * Stack frame (32-byte aligned, offsets from the aligned %rsp):
 *     0.. 127  scratch / spill area used by the round loop and tail code
 *   128.. 255  the four .Lsigma dwords, each replicated to all 8 lanes
 *   256.. 383  dwords 0..3 of (%rcx), replicated
 *   384.. 511  dwords 4..7 of (%rcx), replicated
 *   512.. 543  dword 0 of (%r8) plus per-lane offsets (.Lincy);
 *              written at .Loop_enter8x, not in this prologue
 *   544.. 639  dwords 1..3 of (%r8), replicated
 */
1589bc3d5698SJohn Baldwin.type	ChaCha20_8x,@function
1590bc3d5698SJohn Baldwin.align	32
1591bc3d5698SJohn BaldwinChaCha20_8x:
1592bc3d5698SJohn Baldwin.cfi_startproc
1593bc3d5698SJohn Baldwin.LChaCha20_8x:
1594bc3d5698SJohn Baldwin	movq	%rsp,%r9
1595bc3d5698SJohn Baldwin.cfi_def_cfa_register	%r9
/* Reserve the frame and align it to 32 bytes, as required by the
   vmovdqa accesses with ymm operands below. */
1596bc3d5698SJohn Baldwin	subq	$0x280+8,%rsp
1597bc3d5698SJohn Baldwin	andq	$-32,%rsp
1598bc3d5698SJohn Baldwin	vzeroupper
1599bc3d5698SJohn Baldwin
1600bc3d5698SJohn Baldwin
1601bc3d5698SJohn Baldwin
1602bc3d5698SJohn Baldwin
1603bc3d5698SJohn Baldwin
1604bc3d5698SJohn Baldwin
1605bc3d5698SJohn Baldwin
1606bc3d5698SJohn Baldwin
1607bc3d5698SJohn Baldwin
1608bc3d5698SJohn Baldwin
/* Load each 16-byte quantity into both 128-bit halves of a ymm register.
   .Lsigma is the ASCII constant "expand 32-byte k". */
1609bc3d5698SJohn Baldwin	vbroadcasti128	.Lsigma(%rip),%ymm11
1610bc3d5698SJohn Baldwin	vbroadcasti128	(%rcx),%ymm3
1611bc3d5698SJohn Baldwin	vbroadcasti128	16(%rcx),%ymm15
1612bc3d5698SJohn Baldwin	vbroadcasti128	(%r8),%ymm7
/* From here on %rcx and %rax are frame pointers biased by 256 and 512, so
   "N-256(%rcx)" / "N-512(%rax)" address offset N from %rsp (presumably so
   the displacements fit in a signed byte).  %r10 / %r11 point at the
   byte-shuffle masks .Lrot16 / .Lrot24. */
1613bc3d5698SJohn Baldwin	leaq	256(%rsp),%rcx
1614bc3d5698SJohn Baldwin	leaq	512(%rsp),%rax
1615bc3d5698SJohn Baldwin	leaq	.Lrot16(%rip),%r10
1616bc3d5698SJohn Baldwin	leaq	.Lrot24(%rip),%r11
1617bc3d5698SJohn Baldwin
/* vpshufd $0x00/$0x55/$0xaa/$0xff replicates dword 0/1/2/3 across each
   128-bit half; since both halves are equal, every lane gets that dword.
   Sigma -> ymm8..ymm11, saved at 128..255. */
1618bc3d5698SJohn Baldwin	vpshufd	$0x00,%ymm11,%ymm8
1619bc3d5698SJohn Baldwin	vpshufd	$0x55,%ymm11,%ymm9
1620bc3d5698SJohn Baldwin	vmovdqa	%ymm8,128-256(%rcx)
1621bc3d5698SJohn Baldwin	vpshufd	$0xaa,%ymm11,%ymm10
1622bc3d5698SJohn Baldwin	vmovdqa	%ymm9,160-256(%rcx)
1623bc3d5698SJohn Baldwin	vpshufd	$0xff,%ymm11,%ymm11
1624bc3d5698SJohn Baldwin	vmovdqa	%ymm10,192-256(%rcx)
1625bc3d5698SJohn Baldwin	vmovdqa	%ymm11,224-256(%rcx)
1626bc3d5698SJohn Baldwin
/* First 16 bytes from the entry %rcx -> ymm0..ymm3, saved at 256..383. */
1627bc3d5698SJohn Baldwin	vpshufd	$0x00,%ymm3,%ymm0
1628bc3d5698SJohn Baldwin	vpshufd	$0x55,%ymm3,%ymm1
1629bc3d5698SJohn Baldwin	vmovdqa	%ymm0,256-256(%rcx)
1630bc3d5698SJohn Baldwin	vpshufd	$0xaa,%ymm3,%ymm2
1631bc3d5698SJohn Baldwin	vmovdqa	%ymm1,288-256(%rcx)
1632bc3d5698SJohn Baldwin	vpshufd	$0xff,%ymm3,%ymm3
1633bc3d5698SJohn Baldwin	vmovdqa	%ymm2,320-256(%rcx)
1634bc3d5698SJohn Baldwin	vmovdqa	%ymm3,352-256(%rcx)
1635bc3d5698SJohn Baldwin
/* Second 16 bytes from the entry %rcx -> ymm12..ymm15, saved at 384..511. */
1636bc3d5698SJohn Baldwin	vpshufd	$0x00,%ymm15,%ymm12
1637bc3d5698SJohn Baldwin	vpshufd	$0x55,%ymm15,%ymm13
1638bc3d5698SJohn Baldwin	vmovdqa	%ymm12,384-512(%rax)
1639bc3d5698SJohn Baldwin	vpshufd	$0xaa,%ymm15,%ymm14
1640bc3d5698SJohn Baldwin	vmovdqa	%ymm13,416-512(%rax)
1641bc3d5698SJohn Baldwin	vpshufd	$0xff,%ymm15,%ymm15
1642bc3d5698SJohn Baldwin	vmovdqa	%ymm14,448-512(%rax)
1643bc3d5698SJohn Baldwin	vmovdqa	%ymm15,480-512(%rax)
1644bc3d5698SJohn Baldwin
/* 16 bytes from %r8 -> ymm4..ymm7.  Dword 0 additionally gets the per-lane
   offsets .Lincy = {0,2,4,6,1,3,5,7}; it is not stored here but at
   .Loop_enter8x.  Dwords 1..3 are saved at 544..639. */
1645bc3d5698SJohn Baldwin	vpshufd	$0x00,%ymm7,%ymm4
1646bc3d5698SJohn Baldwin	vpshufd	$0x55,%ymm7,%ymm5
1647bc3d5698SJohn Baldwin	vpaddd	.Lincy(%rip),%ymm4,%ymm4
1648bc3d5698SJohn Baldwin	vpshufd	$0xaa,%ymm7,%ymm6
1649bc3d5698SJohn Baldwin	vmovdqa	%ymm5,544-512(%rax)
1650bc3d5698SJohn Baldwin	vpshufd	$0xff,%ymm7,%ymm7
1651bc3d5698SJohn Baldwin	vmovdqa	%ymm6,576-512(%rax)
1652bc3d5698SJohn Baldwin	vmovdqa	%ymm7,608-512(%rax)
1653bc3d5698SJohn Baldwin
/* The registers already hold the initial state, so skip the reload at the
   top of the outer loop. */
1654bc3d5698SJohn Baldwin	jmp	.Loop_enter8x
1655bc3d5698SJohn Baldwin
1656bc3d5698SJohn Baldwin.align	32
/*
 * .Loop_outer8x: start of another 512-byte batch.
 * Reload all sixteen state vectors from the frame slots written by the
 * prologue (offsets 128..639 from %rsp), then add 8 to every lane of
 * ymm4 (.Leight holds eight dwords of 8).
 */
1657bc3d5698SJohn Baldwin.Loop_outer8x:
1658bc3d5698SJohn Baldwin	vmovdqa	128-256(%rcx),%ymm8
1659bc3d5698SJohn Baldwin	vmovdqa	160-256(%rcx),%ymm9
1660bc3d5698SJohn Baldwin	vmovdqa	192-256(%rcx),%ymm10
1661bc3d5698SJohn Baldwin	vmovdqa	224-256(%rcx),%ymm11
1662bc3d5698SJohn Baldwin	vmovdqa	256-256(%rcx),%ymm0
1663bc3d5698SJohn Baldwin	vmovdqa	288-256(%rcx),%ymm1
1664bc3d5698SJohn Baldwin	vmovdqa	320-256(%rcx),%ymm2
1665bc3d5698SJohn Baldwin	vmovdqa	352-256(%rcx),%ymm3
1666bc3d5698SJohn Baldwin	vmovdqa	384-512(%rax),%ymm12
1667bc3d5698SJohn Baldwin	vmovdqa	416-512(%rax),%ymm13
1668bc3d5698SJohn Baldwin	vmovdqa	448-512(%rax),%ymm14
1669bc3d5698SJohn Baldwin	vmovdqa	480-512(%rax),%ymm15
1670bc3d5698SJohn Baldwin	vmovdqa	512-512(%rax),%ymm4
1671bc3d5698SJohn Baldwin	vmovdqa	544-512(%rax),%ymm5
1672bc3d5698SJohn Baldwin	vmovdqa	576-512(%rax),%ymm6
1673bc3d5698SJohn Baldwin	vmovdqa	608-512(%rax),%ymm7
1674bc3d5698SJohn Baldwin	vpaddd	.Leight(%rip),%ymm4,%ymm4
1675bc3d5698SJohn Baldwin
/*
 * .Loop_enter8x: common entry (the prologue jumps here directly).
 * ymm14/ymm15 are spilled to 64/96(%rsp) because .Loop8x uses those two
 * registers as scratch and shuffle-mask registers; ymm15 is preloaded with
 * the mask at (%r10), which the prologue pointed at .Lrot16.
 * The updated ymm4 is written back to offset 512 so the next batch and the
 * post-loop add see it.
 */
1676bc3d5698SJohn Baldwin.Loop_enter8x:
1677bc3d5698SJohn Baldwin	vmovdqa	%ymm14,64(%rsp)
1678bc3d5698SJohn Baldwin	vmovdqa	%ymm15,96(%rsp)
1679bc3d5698SJohn Baldwin	vbroadcasti128	(%r10),%ymm15
1680bc3d5698SJohn Baldwin	vmovdqa	%ymm4,512-512(%rax)
/* %eax becomes the iteration counter for .Loop8x (10 passes).  This
   destroys the frame pointer in %rax, which is recomputed with
   `leaq 512(%rsp),%rax` right after the loop. */
1681bc3d5698SJohn Baldwin	movl	$10,%eax
1682bc3d5698SJohn Baldwin	jmp	.Loop8x
1683bc3d5698SJohn Baldwin
1684bc3d5698SJohn Baldwin.align	32
1685bc3d5698SJohn Baldwin.Loop8x:
1686bc3d5698SJohn Baldwin	vpaddd	%ymm0,%ymm8,%ymm8
1687bc3d5698SJohn Baldwin	vpxor	%ymm4,%ymm8,%ymm4
1688bc3d5698SJohn Baldwin	vpshufb	%ymm15,%ymm4,%ymm4
1689bc3d5698SJohn Baldwin	vpaddd	%ymm1,%ymm9,%ymm9
1690bc3d5698SJohn Baldwin	vpxor	%ymm5,%ymm9,%ymm5
1691bc3d5698SJohn Baldwin	vpshufb	%ymm15,%ymm5,%ymm5
1692bc3d5698SJohn Baldwin	vpaddd	%ymm4,%ymm12,%ymm12
1693bc3d5698SJohn Baldwin	vpxor	%ymm0,%ymm12,%ymm0
1694bc3d5698SJohn Baldwin	vpslld	$12,%ymm0,%ymm14
1695bc3d5698SJohn Baldwin	vpsrld	$20,%ymm0,%ymm0
1696bc3d5698SJohn Baldwin	vpor	%ymm0,%ymm14,%ymm0
1697bc3d5698SJohn Baldwin	vbroadcasti128	(%r11),%ymm14
1698bc3d5698SJohn Baldwin	vpaddd	%ymm5,%ymm13,%ymm13
1699bc3d5698SJohn Baldwin	vpxor	%ymm1,%ymm13,%ymm1
1700bc3d5698SJohn Baldwin	vpslld	$12,%ymm1,%ymm15
1701bc3d5698SJohn Baldwin	vpsrld	$20,%ymm1,%ymm1
1702bc3d5698SJohn Baldwin	vpor	%ymm1,%ymm15,%ymm1
1703bc3d5698SJohn Baldwin	vpaddd	%ymm0,%ymm8,%ymm8
1704bc3d5698SJohn Baldwin	vpxor	%ymm4,%ymm8,%ymm4
1705bc3d5698SJohn Baldwin	vpshufb	%ymm14,%ymm4,%ymm4
1706bc3d5698SJohn Baldwin	vpaddd	%ymm1,%ymm9,%ymm9
1707bc3d5698SJohn Baldwin	vpxor	%ymm5,%ymm9,%ymm5
1708bc3d5698SJohn Baldwin	vpshufb	%ymm14,%ymm5,%ymm5
1709bc3d5698SJohn Baldwin	vpaddd	%ymm4,%ymm12,%ymm12
1710bc3d5698SJohn Baldwin	vpxor	%ymm0,%ymm12,%ymm0
1711bc3d5698SJohn Baldwin	vpslld	$7,%ymm0,%ymm15
1712bc3d5698SJohn Baldwin	vpsrld	$25,%ymm0,%ymm0
1713bc3d5698SJohn Baldwin	vpor	%ymm0,%ymm15,%ymm0
1714bc3d5698SJohn Baldwin	vbroadcasti128	(%r10),%ymm15
1715bc3d5698SJohn Baldwin	vpaddd	%ymm5,%ymm13,%ymm13
1716bc3d5698SJohn Baldwin	vpxor	%ymm1,%ymm13,%ymm1
1717bc3d5698SJohn Baldwin	vpslld	$7,%ymm1,%ymm14
1718bc3d5698SJohn Baldwin	vpsrld	$25,%ymm1,%ymm1
1719bc3d5698SJohn Baldwin	vpor	%ymm1,%ymm14,%ymm1
1720bc3d5698SJohn Baldwin	vmovdqa	%ymm12,0(%rsp)
1721bc3d5698SJohn Baldwin	vmovdqa	%ymm13,32(%rsp)
1722bc3d5698SJohn Baldwin	vmovdqa	64(%rsp),%ymm12
1723bc3d5698SJohn Baldwin	vmovdqa	96(%rsp),%ymm13
1724bc3d5698SJohn Baldwin	vpaddd	%ymm2,%ymm10,%ymm10
1725bc3d5698SJohn Baldwin	vpxor	%ymm6,%ymm10,%ymm6
1726bc3d5698SJohn Baldwin	vpshufb	%ymm15,%ymm6,%ymm6
1727bc3d5698SJohn Baldwin	vpaddd	%ymm3,%ymm11,%ymm11
1728bc3d5698SJohn Baldwin	vpxor	%ymm7,%ymm11,%ymm7
1729bc3d5698SJohn Baldwin	vpshufb	%ymm15,%ymm7,%ymm7
1730bc3d5698SJohn Baldwin	vpaddd	%ymm6,%ymm12,%ymm12
1731bc3d5698SJohn Baldwin	vpxor	%ymm2,%ymm12,%ymm2
1732bc3d5698SJohn Baldwin	vpslld	$12,%ymm2,%ymm14
1733bc3d5698SJohn Baldwin	vpsrld	$20,%ymm2,%ymm2
1734bc3d5698SJohn Baldwin	vpor	%ymm2,%ymm14,%ymm2
1735bc3d5698SJohn Baldwin	vbroadcasti128	(%r11),%ymm14
1736bc3d5698SJohn Baldwin	vpaddd	%ymm7,%ymm13,%ymm13
1737bc3d5698SJohn Baldwin	vpxor	%ymm3,%ymm13,%ymm3
1738bc3d5698SJohn Baldwin	vpslld	$12,%ymm3,%ymm15
1739bc3d5698SJohn Baldwin	vpsrld	$20,%ymm3,%ymm3
1740bc3d5698SJohn Baldwin	vpor	%ymm3,%ymm15,%ymm3
1741bc3d5698SJohn Baldwin	vpaddd	%ymm2,%ymm10,%ymm10
1742bc3d5698SJohn Baldwin	vpxor	%ymm6,%ymm10,%ymm6
1743bc3d5698SJohn Baldwin	vpshufb	%ymm14,%ymm6,%ymm6
1744bc3d5698SJohn Baldwin	vpaddd	%ymm3,%ymm11,%ymm11
1745bc3d5698SJohn Baldwin	vpxor	%ymm7,%ymm11,%ymm7
1746bc3d5698SJohn Baldwin	vpshufb	%ymm14,%ymm7,%ymm7
1747bc3d5698SJohn Baldwin	vpaddd	%ymm6,%ymm12,%ymm12
1748bc3d5698SJohn Baldwin	vpxor	%ymm2,%ymm12,%ymm2
1749bc3d5698SJohn Baldwin	vpslld	$7,%ymm2,%ymm15
1750bc3d5698SJohn Baldwin	vpsrld	$25,%ymm2,%ymm2
1751bc3d5698SJohn Baldwin	vpor	%ymm2,%ymm15,%ymm2
1752bc3d5698SJohn Baldwin	vbroadcasti128	(%r10),%ymm15
1753bc3d5698SJohn Baldwin	vpaddd	%ymm7,%ymm13,%ymm13
1754bc3d5698SJohn Baldwin	vpxor	%ymm3,%ymm13,%ymm3
1755bc3d5698SJohn Baldwin	vpslld	$7,%ymm3,%ymm14
1756bc3d5698SJohn Baldwin	vpsrld	$25,%ymm3,%ymm3
1757bc3d5698SJohn Baldwin	vpor	%ymm3,%ymm14,%ymm3
1758bc3d5698SJohn Baldwin	vpaddd	%ymm1,%ymm8,%ymm8
1759bc3d5698SJohn Baldwin	vpxor	%ymm7,%ymm8,%ymm7
1760bc3d5698SJohn Baldwin	vpshufb	%ymm15,%ymm7,%ymm7
1761bc3d5698SJohn Baldwin	vpaddd	%ymm2,%ymm9,%ymm9
1762bc3d5698SJohn Baldwin	vpxor	%ymm4,%ymm9,%ymm4
1763bc3d5698SJohn Baldwin	vpshufb	%ymm15,%ymm4,%ymm4
1764bc3d5698SJohn Baldwin	vpaddd	%ymm7,%ymm12,%ymm12
1765bc3d5698SJohn Baldwin	vpxor	%ymm1,%ymm12,%ymm1
1766bc3d5698SJohn Baldwin	vpslld	$12,%ymm1,%ymm14
1767bc3d5698SJohn Baldwin	vpsrld	$20,%ymm1,%ymm1
1768bc3d5698SJohn Baldwin	vpor	%ymm1,%ymm14,%ymm1
1769bc3d5698SJohn Baldwin	vbroadcasti128	(%r11),%ymm14
1770bc3d5698SJohn Baldwin	vpaddd	%ymm4,%ymm13,%ymm13
1771bc3d5698SJohn Baldwin	vpxor	%ymm2,%ymm13,%ymm2
1772bc3d5698SJohn Baldwin	vpslld	$12,%ymm2,%ymm15
1773bc3d5698SJohn Baldwin	vpsrld	$20,%ymm2,%ymm2
1774bc3d5698SJohn Baldwin	vpor	%ymm2,%ymm15,%ymm2
1775bc3d5698SJohn Baldwin	vpaddd	%ymm1,%ymm8,%ymm8
1776bc3d5698SJohn Baldwin	vpxor	%ymm7,%ymm8,%ymm7
1777bc3d5698SJohn Baldwin	vpshufb	%ymm14,%ymm7,%ymm7
1778bc3d5698SJohn Baldwin	vpaddd	%ymm2,%ymm9,%ymm9
1779bc3d5698SJohn Baldwin	vpxor	%ymm4,%ymm9,%ymm4
1780bc3d5698SJohn Baldwin	vpshufb	%ymm14,%ymm4,%ymm4
1781bc3d5698SJohn Baldwin	vpaddd	%ymm7,%ymm12,%ymm12
1782bc3d5698SJohn Baldwin	vpxor	%ymm1,%ymm12,%ymm1
1783bc3d5698SJohn Baldwin	vpslld	$7,%ymm1,%ymm15
1784bc3d5698SJohn Baldwin	vpsrld	$25,%ymm1,%ymm1
1785bc3d5698SJohn Baldwin	vpor	%ymm1,%ymm15,%ymm1
1786bc3d5698SJohn Baldwin	vbroadcasti128	(%r10),%ymm15
1787bc3d5698SJohn Baldwin	vpaddd	%ymm4,%ymm13,%ymm13
1788bc3d5698SJohn Baldwin	vpxor	%ymm2,%ymm13,%ymm2
1789bc3d5698SJohn Baldwin	vpslld	$7,%ymm2,%ymm14
1790bc3d5698SJohn Baldwin	vpsrld	$25,%ymm2,%ymm2
1791bc3d5698SJohn Baldwin	vpor	%ymm2,%ymm14,%ymm2
1792bc3d5698SJohn Baldwin	vmovdqa	%ymm12,64(%rsp)
1793bc3d5698SJohn Baldwin	vmovdqa	%ymm13,96(%rsp)
1794bc3d5698SJohn Baldwin	vmovdqa	0(%rsp),%ymm12
1795bc3d5698SJohn Baldwin	vmovdqa	32(%rsp),%ymm13
1796bc3d5698SJohn Baldwin	vpaddd	%ymm3,%ymm10,%ymm10
1797bc3d5698SJohn Baldwin	vpxor	%ymm5,%ymm10,%ymm5
1798bc3d5698SJohn Baldwin	vpshufb	%ymm15,%ymm5,%ymm5
1799bc3d5698SJohn Baldwin	vpaddd	%ymm0,%ymm11,%ymm11
1800bc3d5698SJohn Baldwin	vpxor	%ymm6,%ymm11,%ymm6
1801bc3d5698SJohn Baldwin	vpshufb	%ymm15,%ymm6,%ymm6
1802bc3d5698SJohn Baldwin	vpaddd	%ymm5,%ymm12,%ymm12
1803bc3d5698SJohn Baldwin	vpxor	%ymm3,%ymm12,%ymm3
1804bc3d5698SJohn Baldwin	vpslld	$12,%ymm3,%ymm14
1805bc3d5698SJohn Baldwin	vpsrld	$20,%ymm3,%ymm3
1806bc3d5698SJohn Baldwin	vpor	%ymm3,%ymm14,%ymm3
1807bc3d5698SJohn Baldwin	vbroadcasti128	(%r11),%ymm14
1808bc3d5698SJohn Baldwin	vpaddd	%ymm6,%ymm13,%ymm13
1809bc3d5698SJohn Baldwin	vpxor	%ymm0,%ymm13,%ymm0
1810bc3d5698SJohn Baldwin	vpslld	$12,%ymm0,%ymm15
1811bc3d5698SJohn Baldwin	vpsrld	$20,%ymm0,%ymm0
1812bc3d5698SJohn Baldwin	vpor	%ymm0,%ymm15,%ymm0
1813bc3d5698SJohn Baldwin	vpaddd	%ymm3,%ymm10,%ymm10
1814bc3d5698SJohn Baldwin	vpxor	%ymm5,%ymm10,%ymm5
1815bc3d5698SJohn Baldwin	vpshufb	%ymm14,%ymm5,%ymm5
1816bc3d5698SJohn Baldwin	vpaddd	%ymm0,%ymm11,%ymm11
1817bc3d5698SJohn Baldwin	vpxor	%ymm6,%ymm11,%ymm6
1818bc3d5698SJohn Baldwin	vpshufb	%ymm14,%ymm6,%ymm6
1819bc3d5698SJohn Baldwin	vpaddd	%ymm5,%ymm12,%ymm12
1820bc3d5698SJohn Baldwin	vpxor	%ymm3,%ymm12,%ymm3
1821bc3d5698SJohn Baldwin	vpslld	$7,%ymm3,%ymm15
1822bc3d5698SJohn Baldwin	vpsrld	$25,%ymm3,%ymm3
1823bc3d5698SJohn Baldwin	vpor	%ymm3,%ymm15,%ymm3
1824bc3d5698SJohn Baldwin	vbroadcasti128	(%r10),%ymm15
1825bc3d5698SJohn Baldwin	vpaddd	%ymm6,%ymm13,%ymm13
1826bc3d5698SJohn Baldwin	vpxor	%ymm0,%ymm13,%ymm0
1827bc3d5698SJohn Baldwin	vpslld	$7,%ymm0,%ymm14
1828bc3d5698SJohn Baldwin	vpsrld	$25,%ymm0,%ymm0
1829bc3d5698SJohn Baldwin	vpor	%ymm0,%ymm14,%ymm0
1830bc3d5698SJohn Baldwin	decl	%eax
1831bc3d5698SJohn Baldwin	jnz	.Loop8x
1832bc3d5698SJohn Baldwin
1833bc3d5698SJohn Baldwin	leaq	512(%rsp),%rax
1834bc3d5698SJohn Baldwin	vpaddd	128-256(%rcx),%ymm8,%ymm8
1835bc3d5698SJohn Baldwin	vpaddd	160-256(%rcx),%ymm9,%ymm9
1836bc3d5698SJohn Baldwin	vpaddd	192-256(%rcx),%ymm10,%ymm10
1837bc3d5698SJohn Baldwin	vpaddd	224-256(%rcx),%ymm11,%ymm11
1838bc3d5698SJohn Baldwin
1839bc3d5698SJohn Baldwin	vpunpckldq	%ymm9,%ymm8,%ymm14
1840bc3d5698SJohn Baldwin	vpunpckldq	%ymm11,%ymm10,%ymm15
1841bc3d5698SJohn Baldwin	vpunpckhdq	%ymm9,%ymm8,%ymm8
1842bc3d5698SJohn Baldwin	vpunpckhdq	%ymm11,%ymm10,%ymm10
1843bc3d5698SJohn Baldwin	vpunpcklqdq	%ymm15,%ymm14,%ymm9
1844bc3d5698SJohn Baldwin	vpunpckhqdq	%ymm15,%ymm14,%ymm14
1845bc3d5698SJohn Baldwin	vpunpcklqdq	%ymm10,%ymm8,%ymm11
1846bc3d5698SJohn Baldwin	vpunpckhqdq	%ymm10,%ymm8,%ymm8
1847bc3d5698SJohn Baldwin	vpaddd	256-256(%rcx),%ymm0,%ymm0
1848bc3d5698SJohn Baldwin	vpaddd	288-256(%rcx),%ymm1,%ymm1
1849bc3d5698SJohn Baldwin	vpaddd	320-256(%rcx),%ymm2,%ymm2
1850bc3d5698SJohn Baldwin	vpaddd	352-256(%rcx),%ymm3,%ymm3
1851bc3d5698SJohn Baldwin
1852bc3d5698SJohn Baldwin	vpunpckldq	%ymm1,%ymm0,%ymm10
1853bc3d5698SJohn Baldwin	vpunpckldq	%ymm3,%ymm2,%ymm15
1854bc3d5698SJohn Baldwin	vpunpckhdq	%ymm1,%ymm0,%ymm0
1855bc3d5698SJohn Baldwin	vpunpckhdq	%ymm3,%ymm2,%ymm2
1856bc3d5698SJohn Baldwin	vpunpcklqdq	%ymm15,%ymm10,%ymm1
1857bc3d5698SJohn Baldwin	vpunpckhqdq	%ymm15,%ymm10,%ymm10
1858bc3d5698SJohn Baldwin	vpunpcklqdq	%ymm2,%ymm0,%ymm3
1859bc3d5698SJohn Baldwin	vpunpckhqdq	%ymm2,%ymm0,%ymm0
1860bc3d5698SJohn Baldwin	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
1861bc3d5698SJohn Baldwin	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
1862bc3d5698SJohn Baldwin	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
1863bc3d5698SJohn Baldwin	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
1864bc3d5698SJohn Baldwin	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
1865bc3d5698SJohn Baldwin	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
1866bc3d5698SJohn Baldwin	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
1867bc3d5698SJohn Baldwin	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
1868bc3d5698SJohn Baldwin	vmovdqa	%ymm15,0(%rsp)
1869bc3d5698SJohn Baldwin	vmovdqa	%ymm9,32(%rsp)
1870bc3d5698SJohn Baldwin	vmovdqa	64(%rsp),%ymm15
1871bc3d5698SJohn Baldwin	vmovdqa	96(%rsp),%ymm9
1872bc3d5698SJohn Baldwin
1873bc3d5698SJohn Baldwin	vpaddd	384-512(%rax),%ymm12,%ymm12
1874bc3d5698SJohn Baldwin	vpaddd	416-512(%rax),%ymm13,%ymm13
1875bc3d5698SJohn Baldwin	vpaddd	448-512(%rax),%ymm15,%ymm15
1876bc3d5698SJohn Baldwin	vpaddd	480-512(%rax),%ymm9,%ymm9
1877bc3d5698SJohn Baldwin
1878bc3d5698SJohn Baldwin	vpunpckldq	%ymm13,%ymm12,%ymm2
1879bc3d5698SJohn Baldwin	vpunpckldq	%ymm9,%ymm15,%ymm8
1880bc3d5698SJohn Baldwin	vpunpckhdq	%ymm13,%ymm12,%ymm12
1881bc3d5698SJohn Baldwin	vpunpckhdq	%ymm9,%ymm15,%ymm15
1882bc3d5698SJohn Baldwin	vpunpcklqdq	%ymm8,%ymm2,%ymm13
1883bc3d5698SJohn Baldwin	vpunpckhqdq	%ymm8,%ymm2,%ymm2
1884bc3d5698SJohn Baldwin	vpunpcklqdq	%ymm15,%ymm12,%ymm9
1885bc3d5698SJohn Baldwin	vpunpckhqdq	%ymm15,%ymm12,%ymm12
1886bc3d5698SJohn Baldwin	vpaddd	512-512(%rax),%ymm4,%ymm4
1887bc3d5698SJohn Baldwin	vpaddd	544-512(%rax),%ymm5,%ymm5
1888bc3d5698SJohn Baldwin	vpaddd	576-512(%rax),%ymm6,%ymm6
1889bc3d5698SJohn Baldwin	vpaddd	608-512(%rax),%ymm7,%ymm7
1890bc3d5698SJohn Baldwin
1891bc3d5698SJohn Baldwin	vpunpckldq	%ymm5,%ymm4,%ymm15
1892bc3d5698SJohn Baldwin	vpunpckldq	%ymm7,%ymm6,%ymm8
1893bc3d5698SJohn Baldwin	vpunpckhdq	%ymm5,%ymm4,%ymm4
1894bc3d5698SJohn Baldwin	vpunpckhdq	%ymm7,%ymm6,%ymm6
1895bc3d5698SJohn Baldwin	vpunpcklqdq	%ymm8,%ymm15,%ymm5
1896bc3d5698SJohn Baldwin	vpunpckhqdq	%ymm8,%ymm15,%ymm15
1897bc3d5698SJohn Baldwin	vpunpcklqdq	%ymm6,%ymm4,%ymm7
1898bc3d5698SJohn Baldwin	vpunpckhqdq	%ymm6,%ymm4,%ymm4
1899bc3d5698SJohn Baldwin	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
1900bc3d5698SJohn Baldwin	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
1901bc3d5698SJohn Baldwin	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
1902bc3d5698SJohn Baldwin	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
1903bc3d5698SJohn Baldwin	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
1904bc3d5698SJohn Baldwin	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
1905bc3d5698SJohn Baldwin	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
1906bc3d5698SJohn Baldwin	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
1907bc3d5698SJohn Baldwin	vmovdqa	0(%rsp),%ymm6
1908bc3d5698SJohn Baldwin	vmovdqa	32(%rsp),%ymm12
1909bc3d5698SJohn Baldwin
1910bc3d5698SJohn Baldwin	cmpq	$512,%rdx
1911bc3d5698SJohn Baldwin	jb	.Ltail8x
1912bc3d5698SJohn Baldwin
1913bc3d5698SJohn Baldwin	vpxor	0(%rsi),%ymm6,%ymm6
1914bc3d5698SJohn Baldwin	vpxor	32(%rsi),%ymm8,%ymm8
1915bc3d5698SJohn Baldwin	vpxor	64(%rsi),%ymm1,%ymm1
1916bc3d5698SJohn Baldwin	vpxor	96(%rsi),%ymm5,%ymm5
1917bc3d5698SJohn Baldwin	leaq	128(%rsi),%rsi
1918bc3d5698SJohn Baldwin	vmovdqu	%ymm6,0(%rdi)
1919bc3d5698SJohn Baldwin	vmovdqu	%ymm8,32(%rdi)
1920bc3d5698SJohn Baldwin	vmovdqu	%ymm1,64(%rdi)
1921bc3d5698SJohn Baldwin	vmovdqu	%ymm5,96(%rdi)
1922bc3d5698SJohn Baldwin	leaq	128(%rdi),%rdi
1923bc3d5698SJohn Baldwin
1924bc3d5698SJohn Baldwin	vpxor	0(%rsi),%ymm12,%ymm12
1925bc3d5698SJohn Baldwin	vpxor	32(%rsi),%ymm13,%ymm13
1926bc3d5698SJohn Baldwin	vpxor	64(%rsi),%ymm10,%ymm10
1927bc3d5698SJohn Baldwin	vpxor	96(%rsi),%ymm15,%ymm15
1928bc3d5698SJohn Baldwin	leaq	128(%rsi),%rsi
1929bc3d5698SJohn Baldwin	vmovdqu	%ymm12,0(%rdi)
1930bc3d5698SJohn Baldwin	vmovdqu	%ymm13,32(%rdi)
1931bc3d5698SJohn Baldwin	vmovdqu	%ymm10,64(%rdi)
1932bc3d5698SJohn Baldwin	vmovdqu	%ymm15,96(%rdi)
1933bc3d5698SJohn Baldwin	leaq	128(%rdi),%rdi
1934bc3d5698SJohn Baldwin
1935bc3d5698SJohn Baldwin	vpxor	0(%rsi),%ymm14,%ymm14
1936bc3d5698SJohn Baldwin	vpxor	32(%rsi),%ymm2,%ymm2
1937bc3d5698SJohn Baldwin	vpxor	64(%rsi),%ymm3,%ymm3
1938bc3d5698SJohn Baldwin	vpxor	96(%rsi),%ymm7,%ymm7
1939bc3d5698SJohn Baldwin	leaq	128(%rsi),%rsi
1940bc3d5698SJohn Baldwin	vmovdqu	%ymm14,0(%rdi)
1941bc3d5698SJohn Baldwin	vmovdqu	%ymm2,32(%rdi)
1942bc3d5698SJohn Baldwin	vmovdqu	%ymm3,64(%rdi)
1943bc3d5698SJohn Baldwin	vmovdqu	%ymm7,96(%rdi)
1944bc3d5698SJohn Baldwin	leaq	128(%rdi),%rdi
1945bc3d5698SJohn Baldwin
1946bc3d5698SJohn Baldwin	vpxor	0(%rsi),%ymm11,%ymm11
1947bc3d5698SJohn Baldwin	vpxor	32(%rsi),%ymm9,%ymm9
1948bc3d5698SJohn Baldwin	vpxor	64(%rsi),%ymm0,%ymm0
1949bc3d5698SJohn Baldwin	vpxor	96(%rsi),%ymm4,%ymm4
1950bc3d5698SJohn Baldwin	leaq	128(%rsi),%rsi
1951bc3d5698SJohn Baldwin	vmovdqu	%ymm11,0(%rdi)
1952bc3d5698SJohn Baldwin	vmovdqu	%ymm9,32(%rdi)
1953bc3d5698SJohn Baldwin	vmovdqu	%ymm0,64(%rdi)
1954bc3d5698SJohn Baldwin	vmovdqu	%ymm4,96(%rdi)
1955bc3d5698SJohn Baldwin	leaq	128(%rdi),%rdi
1956bc3d5698SJohn Baldwin
1957bc3d5698SJohn Baldwin	subq	$512,%rdx
1958bc3d5698SJohn Baldwin	jnz	.Loop_outer8x
1959bc3d5698SJohn Baldwin
1960bc3d5698SJohn Baldwin	jmp	.Ldone8x
1961bc3d5698SJohn Baldwin
/*
 * .Ltail8x: final partial chunk of the 8-way path.
 * Reached by `jb` after `cmpq $512,%rdx`, so fewer than 512 bytes remain.
 * Dispatch on the number of whole 64-byte groups left; all compares are
 * unsigned (jae).
 */
1962bc3d5698SJohn Baldwin.Ltail8x:
1963bc3d5698SJohn Baldwin	cmpq	$448,%rdx
1964bc3d5698SJohn Baldwin	jae	.L448_or_more8x
1965bc3d5698SJohn Baldwin	cmpq	$384,%rdx
1966bc3d5698SJohn Baldwin	jae	.L384_or_more8x
1967bc3d5698SJohn Baldwin	cmpq	$320,%rdx
1968bc3d5698SJohn Baldwin	jae	.L320_or_more8x
1969bc3d5698SJohn Baldwin	cmpq	$256,%rdx
1970bc3d5698SJohn Baldwin	jae	.L256_or_more8x
1971bc3d5698SJohn Baldwin	cmpq	$192,%rdx
1972bc3d5698SJohn Baldwin	jae	.L192_or_more8x
1973bc3d5698SJohn Baldwin	cmpq	$128,%rdx
1974bc3d5698SJohn Baldwin	jae	.L128_or_more8x
1975bc3d5698SJohn Baldwin	cmpq	$64,%rdx
1976bc3d5698SJohn Baldwin	jae	.L64_or_more8x
1977bc3d5698SJohn Baldwin
/*
 * Fewer than 64 bytes left: stage ymm6/ymm8 (the registers the full-block
 * path XORs with input bytes 0..63) at 0..63(%rsp) and clear %r10 before
 * jumping to .Loop_tail8x.  %r10 no longer needs to point at .Lrot16
 * because no further rounds run on this path.
 */
1978bc3d5698SJohn Baldwin	xorq	%r10,%r10
1979bc3d5698SJohn Baldwin	vmovdqa	%ymm6,0(%rsp)
1980bc3d5698SJohn Baldwin	vmovdqa	%ymm8,32(%rsp)
1981bc3d5698SJohn Baldwin	jmp	.Loop_tail8x
1982bc3d5698SJohn Baldwin
1983bc3d5698SJohn Baldwin.align	32
/*
 * .L64_or_more8x: 64 <= %rdx < 128.
 * XOR the first 64 input bytes with ymm6/ymm8 and store them to (%rdi).
 */
1984bc3d5698SJohn Baldwin.L64_or_more8x:
1985bc3d5698SJohn Baldwin	vpxor	0(%rsi),%ymm6,%ymm6
1986bc3d5698SJohn Baldwin	vpxor	32(%rsi),%ymm8,%ymm8
1987bc3d5698SJohn Baldwin	vmovdqu	%ymm6,0(%rdi)
1988bc3d5698SJohn Baldwin	vmovdqu	%ymm8,32(%rdi)
/* ZF is still the result of `cmpq $64,%rdx` in .Ltail8x (the vector
   instructions above leave RFLAGS alone): exit if exactly 64 bytes
   remained. */
1989bc3d5698SJohn Baldwin	je	.Ldone8x
1990bc3d5698SJohn Baldwin
/* 1..63 bytes remain: step both pointers over the finished 64 bytes, stage
   the next 64 bytes (ymm1/ymm5) at 0..63(%rsp), clear %r10 and reduce the
   count before jumping to .Loop_tail8x. */
1991bc3d5698SJohn Baldwin	leaq	64(%rsi),%rsi
1992bc3d5698SJohn Baldwin	xorq	%r10,%r10
1993bc3d5698SJohn Baldwin	vmovdqa	%ymm1,0(%rsp)
1994bc3d5698SJohn Baldwin	leaq	64(%rdi),%rdi
1995bc3d5698SJohn Baldwin	subq	$64,%rdx
1996bc3d5698SJohn Baldwin	vmovdqa	%ymm5,32(%rsp)
1997bc3d5698SJohn Baldwin	jmp	.Loop_tail8x
1998bc3d5698SJohn Baldwin
1999bc3d5698SJohn Baldwin.align	32
2000bc3d5698SJohn Baldwin.L128_or_more8x:
2001bc3d5698SJohn Baldwin	vpxor	0(%rsi),%ymm6,%ymm6
2002bc3d5698SJohn Baldwin	vpxor	32(%rsi),%ymm8,%ymm8
2003bc3d5698SJohn Baldwin	vpxor	64(%rsi),%ymm1,%ymm1
2004bc3d5698SJohn Baldwin	vpxor	96(%rsi),%ymm5,%ymm5
2005bc3d5698SJohn Baldwin	vmovdqu	%ymm6,0(%rdi)
2006bc3d5698SJohn Baldwin	vmovdqu	%ymm8,32(%rdi)
2007bc3d5698SJohn Baldwin	vmovdqu	%ymm1,64(%rdi)
2008bc3d5698SJohn Baldwin	vmovdqu	%ymm5,96(%rdi)
2009bc3d5698SJohn Baldwin	je	.Ldone8x
2010bc3d5698SJohn Baldwin
2011bc3d5698SJohn Baldwin	leaq	128(%rsi),%rsi
2012bc3d5698SJohn Baldwin	xorq	%r10,%r10
2013bc3d5698SJohn Baldwin	vmovdqa	%ymm12,0(%rsp)
2014bc3d5698SJohn Baldwin	leaq	128(%rdi),%rdi
2015bc3d5698SJohn Baldwin	subq	$128,%rdx
2016bc3d5698SJohn Baldwin	vmovdqa	%ymm13,32(%rsp)
2017bc3d5698SJohn Baldwin	jmp	.Loop_tail8x
2018bc3d5698SJohn Baldwin
2019bc3d5698SJohn Baldwin.align	32
2020bc3d5698SJohn Baldwin.L192_or_more8x:
2021bc3d5698SJohn Baldwin	vpxor	0(%rsi),%ymm6,%ymm6
2022bc3d5698SJohn Baldwin	vpxor	32(%rsi),%ymm8,%ymm8
2023bc3d5698SJohn Baldwin	vpxor	64(%rsi),%ymm1,%ymm1
2024bc3d5698SJohn Baldwin	vpxor	96(%rsi),%ymm5,%ymm5
2025bc3d5698SJohn Baldwin	vpxor	128(%rsi),%ymm12,%ymm12
2026bc3d5698SJohn Baldwin	vpxor	160(%rsi),%ymm13,%ymm13
2027bc3d5698SJohn Baldwin	vmovdqu	%ymm6,0(%rdi)
2028bc3d5698SJohn Baldwin	vmovdqu	%ymm8,32(%rdi)
2029bc3d5698SJohn Baldwin	vmovdqu	%ymm1,64(%rdi)
2030bc3d5698SJohn Baldwin	vmovdqu	%ymm5,96(%rdi)
2031bc3d5698SJohn Baldwin	vmovdqu	%ymm12,128(%rdi)
2032bc3d5698SJohn Baldwin	vmovdqu	%ymm13,160(%rdi)
2033bc3d5698SJohn Baldwin	je	.Ldone8x
2034bc3d5698SJohn Baldwin
2035bc3d5698SJohn Baldwin	leaq	192(%rsi),%rsi
2036bc3d5698SJohn Baldwin	xorq	%r10,%r10
2037bc3d5698SJohn Baldwin	vmovdqa	%ymm10,0(%rsp)
2038bc3d5698SJohn Baldwin	leaq	192(%rdi),%rdi
2039bc3d5698SJohn Baldwin	subq	$192,%rdx
2040bc3d5698SJohn Baldwin	vmovdqa	%ymm15,32(%rsp)
2041bc3d5698SJohn Baldwin	jmp	.Loop_tail8x
2042bc3d5698SJohn Baldwin
2043bc3d5698SJohn Baldwin.align	32
2044bc3d5698SJohn Baldwin.L256_or_more8x:
2045bc3d5698SJohn Baldwin	vpxor	0(%rsi),%ymm6,%ymm6
2046bc3d5698SJohn Baldwin	vpxor	32(%rsi),%ymm8,%ymm8
2047bc3d5698SJohn Baldwin	vpxor	64(%rsi),%ymm1,%ymm1
2048bc3d5698SJohn Baldwin	vpxor	96(%rsi),%ymm5,%ymm5
2049bc3d5698SJohn Baldwin	vpxor	128(%rsi),%ymm12,%ymm12
2050bc3d5698SJohn Baldwin	vpxor	160(%rsi),%ymm13,%ymm13
2051bc3d5698SJohn Baldwin	vpxor	192(%rsi),%ymm10,%ymm10
2052bc3d5698SJohn Baldwin	vpxor	224(%rsi),%ymm15,%ymm15
2053bc3d5698SJohn Baldwin	vmovdqu	%ymm6,0(%rdi)
2054bc3d5698SJohn Baldwin	vmovdqu	%ymm8,32(%rdi)
2055bc3d5698SJohn Baldwin	vmovdqu	%ymm1,64(%rdi)
2056bc3d5698SJohn Baldwin	vmovdqu	%ymm5,96(%rdi)
2057bc3d5698SJohn Baldwin	vmovdqu	%ymm12,128(%rdi)
2058bc3d5698SJohn Baldwin	vmovdqu	%ymm13,160(%rdi)
2059bc3d5698SJohn Baldwin	vmovdqu	%ymm10,192(%rdi)
2060bc3d5698SJohn Baldwin	vmovdqu	%ymm15,224(%rdi)
2061bc3d5698SJohn Baldwin	je	.Ldone8x
2062bc3d5698SJohn Baldwin
2063bc3d5698SJohn Baldwin	leaq	256(%rsi),%rsi
2064bc3d5698SJohn Baldwin	xorq	%r10,%r10
2065bc3d5698SJohn Baldwin	vmovdqa	%ymm14,0(%rsp)
2066bc3d5698SJohn Baldwin	leaq	256(%rdi),%rdi
2067bc3d5698SJohn Baldwin	subq	$256,%rdx
2068bc3d5698SJohn Baldwin	vmovdqa	%ymm2,32(%rsp)
2069bc3d5698SJohn Baldwin	jmp	.Loop_tail8x
2070bc3d5698SJohn Baldwin
2071bc3d5698SJohn Baldwin.align	32
2072bc3d5698SJohn Baldwin.L320_or_more8x:
2073bc3d5698SJohn Baldwin	vpxor	0(%rsi),%ymm6,%ymm6
2074bc3d5698SJohn Baldwin	vpxor	32(%rsi),%ymm8,%ymm8
2075bc3d5698SJohn Baldwin	vpxor	64(%rsi),%ymm1,%ymm1
2076bc3d5698SJohn Baldwin	vpxor	96(%rsi),%ymm5,%ymm5
2077bc3d5698SJohn Baldwin	vpxor	128(%rsi),%ymm12,%ymm12
2078bc3d5698SJohn Baldwin	vpxor	160(%rsi),%ymm13,%ymm13
2079bc3d5698SJohn Baldwin	vpxor	192(%rsi),%ymm10,%ymm10
2080bc3d5698SJohn Baldwin	vpxor	224(%rsi),%ymm15,%ymm15
2081bc3d5698SJohn Baldwin	vpxor	256(%rsi),%ymm14,%ymm14
2082bc3d5698SJohn Baldwin	vpxor	288(%rsi),%ymm2,%ymm2
2083bc3d5698SJohn Baldwin	vmovdqu	%ymm6,0(%rdi)
2084bc3d5698SJohn Baldwin	vmovdqu	%ymm8,32(%rdi)
2085bc3d5698SJohn Baldwin	vmovdqu	%ymm1,64(%rdi)
2086bc3d5698SJohn Baldwin	vmovdqu	%ymm5,96(%rdi)
2087bc3d5698SJohn Baldwin	vmovdqu	%ymm12,128(%rdi)
2088bc3d5698SJohn Baldwin	vmovdqu	%ymm13,160(%rdi)
2089bc3d5698SJohn Baldwin	vmovdqu	%ymm10,192(%rdi)
2090bc3d5698SJohn Baldwin	vmovdqu	%ymm15,224(%rdi)
2091bc3d5698SJohn Baldwin	vmovdqu	%ymm14,256(%rdi)
2092bc3d5698SJohn Baldwin	vmovdqu	%ymm2,288(%rdi)
2093bc3d5698SJohn Baldwin	je	.Ldone8x
2094bc3d5698SJohn Baldwin
2095bc3d5698SJohn Baldwin	leaq	320(%rsi),%rsi
2096bc3d5698SJohn Baldwin	xorq	%r10,%r10
2097bc3d5698SJohn Baldwin	vmovdqa	%ymm3,0(%rsp)
2098bc3d5698SJohn Baldwin	leaq	320(%rdi),%rdi
2099bc3d5698SJohn Baldwin	subq	$320,%rdx
2100bc3d5698SJohn Baldwin	vmovdqa	%ymm7,32(%rsp)
2101bc3d5698SJohn Baldwin	jmp	.Loop_tail8x
2102bc3d5698SJohn Baldwin
/*
 * .L384_or_more8x
 * XOR twelve 32-byte vectors (384 bytes) loaded from (%rsi) with
 * %ymm6,%ymm8,%ymm1,%ymm5,%ymm12,%ymm13,%ymm10,%ymm15,%ymm14,%ymm2,%ymm3,
 * %ymm7 and store the results to (%rdi). When bytes remain, stage
 * %ymm11/%ymm9 in the 64 bytes at 0(%rsp) and let .Loop_tail8x finish one
 * byte at a time.
 * The ymm registers presumably hold keystream produced earlier in
 * ChaCha20_8x (outside this chunk) -- confirm against the full function.
 */
2103bc3d5698SJohn Baldwin.align	32
2104bc3d5698SJohn Baldwin.L384_or_more8x:
2105bc3d5698SJohn Baldwin	vpxor	0(%rsi),%ymm6,%ymm6
2106bc3d5698SJohn Baldwin	vpxor	32(%rsi),%ymm8,%ymm8
2107bc3d5698SJohn Baldwin	vpxor	64(%rsi),%ymm1,%ymm1
2108bc3d5698SJohn Baldwin	vpxor	96(%rsi),%ymm5,%ymm5
2109bc3d5698SJohn Baldwin	vpxor	128(%rsi),%ymm12,%ymm12
2110bc3d5698SJohn Baldwin	vpxor	160(%rsi),%ymm13,%ymm13
2111bc3d5698SJohn Baldwin	vpxor	192(%rsi),%ymm10,%ymm10
2112bc3d5698SJohn Baldwin	vpxor	224(%rsi),%ymm15,%ymm15
2113bc3d5698SJohn Baldwin	vpxor	256(%rsi),%ymm14,%ymm14
2114bc3d5698SJohn Baldwin	vpxor	288(%rsi),%ymm2,%ymm2
2115bc3d5698SJohn Baldwin	vpxor	320(%rsi),%ymm3,%ymm3
2116bc3d5698SJohn Baldwin	vpxor	352(%rsi),%ymm7,%ymm7
2117bc3d5698SJohn Baldwin	vmovdqu	%ymm6,0(%rdi)
2118bc3d5698SJohn Baldwin	vmovdqu	%ymm8,32(%rdi)
2119bc3d5698SJohn Baldwin	vmovdqu	%ymm1,64(%rdi)
2120bc3d5698SJohn Baldwin	vmovdqu	%ymm5,96(%rdi)
2121bc3d5698SJohn Baldwin	vmovdqu	%ymm12,128(%rdi)
2122bc3d5698SJohn Baldwin	vmovdqu	%ymm13,160(%rdi)
2123bc3d5698SJohn Baldwin	vmovdqu	%ymm10,192(%rdi)
2124bc3d5698SJohn Baldwin	vmovdqu	%ymm15,224(%rdi)
2125bc3d5698SJohn Baldwin	vmovdqu	%ymm14,256(%rdi)
2126bc3d5698SJohn Baldwin	vmovdqu	%ymm2,288(%rdi)
2127bc3d5698SJohn Baldwin	vmovdqu	%ymm3,320(%rdi)
2128bc3d5698SJohn Baldwin	vmovdqu	%ymm7,352(%rdi)
	/*
	 * vpxor/vmovdqu do not modify EFLAGS, so this je tests flags that were
	 * set before this label was reached -- presumably a compare of %rdx
	 * against 384 in the dispatch code (not visible here; confirm).
	 */
2129bc3d5698SJohn Baldwin	je	.Ldone8x
2130bc3d5698SJohn Baldwin
	/*
	 * Tail setup: step both pointers past the 384 bytes just handled, zero
	 * the tail index %r10, and subtract 384 from the remaining count %rdx.
	 * vmovdqa faults on a misaligned address, so %rsp is assumed to be
	 * 32-byte aligned by the prologue (outside this chunk; TODO confirm).
	 */
2131bc3d5698SJohn Baldwin	leaq	384(%rsi),%rsi
2132bc3d5698SJohn Baldwin	xorq	%r10,%r10
2133bc3d5698SJohn Baldwin	vmovdqa	%ymm11,0(%rsp)
2134bc3d5698SJohn Baldwin	leaq	384(%rdi),%rdi
2135bc3d5698SJohn Baldwin	subq	$384,%rdx
2136bc3d5698SJohn Baldwin	vmovdqa	%ymm9,32(%rsp)
2137bc3d5698SJohn Baldwin	jmp	.Loop_tail8x
2138bc3d5698SJohn Baldwin
/*
 * .L448_or_more8x
 * XOR fourteen 32-byte vectors (448 bytes) loaded from (%rsi) with
 * %ymm6,%ymm8,%ymm1,%ymm5,%ymm12,%ymm13,%ymm10,%ymm15,%ymm14,%ymm2,%ymm3,
 * %ymm7,%ymm11,%ymm9 and store the results to (%rdi). When bytes remain,
 * stage %ymm0/%ymm4 in the 64 bytes at 0(%rsp) and continue into
 * .Loop_tail8x, which follows this block directly (no jmp is needed).
 * The ymm registers presumably hold keystream produced earlier in
 * ChaCha20_8x (outside this chunk) -- confirm against the full function.
 */
2139bc3d5698SJohn Baldwin.align	32
2140bc3d5698SJohn Baldwin.L448_or_more8x:
2141bc3d5698SJohn Baldwin	vpxor	0(%rsi),%ymm6,%ymm6
2142bc3d5698SJohn Baldwin	vpxor	32(%rsi),%ymm8,%ymm8
2143bc3d5698SJohn Baldwin	vpxor	64(%rsi),%ymm1,%ymm1
2144bc3d5698SJohn Baldwin	vpxor	96(%rsi),%ymm5,%ymm5
2145bc3d5698SJohn Baldwin	vpxor	128(%rsi),%ymm12,%ymm12
2146bc3d5698SJohn Baldwin	vpxor	160(%rsi),%ymm13,%ymm13
2147bc3d5698SJohn Baldwin	vpxor	192(%rsi),%ymm10,%ymm10
2148bc3d5698SJohn Baldwin	vpxor	224(%rsi),%ymm15,%ymm15
2149bc3d5698SJohn Baldwin	vpxor	256(%rsi),%ymm14,%ymm14
2150bc3d5698SJohn Baldwin	vpxor	288(%rsi),%ymm2,%ymm2
2151bc3d5698SJohn Baldwin	vpxor	320(%rsi),%ymm3,%ymm3
2152bc3d5698SJohn Baldwin	vpxor	352(%rsi),%ymm7,%ymm7
2153bc3d5698SJohn Baldwin	vpxor	384(%rsi),%ymm11,%ymm11
2154bc3d5698SJohn Baldwin	vpxor	416(%rsi),%ymm9,%ymm9
2155bc3d5698SJohn Baldwin	vmovdqu	%ymm6,0(%rdi)
2156bc3d5698SJohn Baldwin	vmovdqu	%ymm8,32(%rdi)
2157bc3d5698SJohn Baldwin	vmovdqu	%ymm1,64(%rdi)
2158bc3d5698SJohn Baldwin	vmovdqu	%ymm5,96(%rdi)
2159bc3d5698SJohn Baldwin	vmovdqu	%ymm12,128(%rdi)
2160bc3d5698SJohn Baldwin	vmovdqu	%ymm13,160(%rdi)
2161bc3d5698SJohn Baldwin	vmovdqu	%ymm10,192(%rdi)
2162bc3d5698SJohn Baldwin	vmovdqu	%ymm15,224(%rdi)
2163bc3d5698SJohn Baldwin	vmovdqu	%ymm14,256(%rdi)
2164bc3d5698SJohn Baldwin	vmovdqu	%ymm2,288(%rdi)
2165bc3d5698SJohn Baldwin	vmovdqu	%ymm3,320(%rdi)
2166bc3d5698SJohn Baldwin	vmovdqu	%ymm7,352(%rdi)
2167bc3d5698SJohn Baldwin	vmovdqu	%ymm11,384(%rdi)
2168bc3d5698SJohn Baldwin	vmovdqu	%ymm9,416(%rdi)
	/*
	 * vpxor/vmovdqu do not modify EFLAGS, so this je tests flags that were
	 * set before this label was reached -- presumably a compare of %rdx
	 * against 448 in the dispatch code (not visible here; confirm).
	 */
2169bc3d5698SJohn Baldwin	je	.Ldone8x
2170bc3d5698SJohn Baldwin
	/*
	 * Tail setup: step both pointers past the 448 bytes just handled, zero
	 * the tail index %r10, and subtract 448 from the remaining count %rdx.
	 * vmovdqa faults on a misaligned address, so %rsp is assumed to be
	 * 32-byte aligned by the prologue (outside this chunk; TODO confirm).
	 * Execution then continues into .Loop_tail8x below.
	 */
2171bc3d5698SJohn Baldwin	leaq	448(%rsi),%rsi
2172bc3d5698SJohn Baldwin	xorq	%r10,%r10
2173bc3d5698SJohn Baldwin	vmovdqa	%ymm0,0(%rsp)
2174bc3d5698SJohn Baldwin	leaq	448(%rdi),%rdi
2175bc3d5698SJohn Baldwin	subq	$448,%rdx
2176bc3d5698SJohn Baldwin	vmovdqa	%ymm4,32(%rsp)
2177bc3d5698SJohn Baldwin
/*
 * .Loop_tail8x
 * Byte-wise tail: for i = %r10 counting up, write
 *     (%rdi)[i] = (%rsi)[i] XOR (%rsp)[i]
 * and repeat %rdx times. Clobbers %eax, %ecx, %r10, %rdx and EFLAGS.
 * %rdx must be non-zero on entry; a zero would wrap in decq and the loop
 * would keep running. The entry paths visible in this chunk place only
 * 64 bytes at (%rsp), so %rdx is presumably below 64 here (TODO confirm
 * against the dispatch code outside this chunk).
 */
2178bc3d5698SJohn Baldwin.Loop_tail8x:
2179bc3d5698SJohn Baldwin	movzbl	(%rsi,%r10,1),%eax
2180bc3d5698SJohn Baldwin	movzbl	(%rsp,%r10,1),%ecx
2181bc3d5698SJohn Baldwin	leaq	1(%r10),%r10
2182bc3d5698SJohn Baldwin	xorl	%ecx,%eax
	/* %r10 was already incremented above, hence the -1 displacement. */
2183bc3d5698SJohn Baldwin	movb	%al,-1(%rdi,%r10,1)
2184bc3d5698SJohn Baldwin	decq	%rdx
2185bc3d5698SJohn Baldwin	jnz	.Loop_tail8x
2186bc3d5698SJohn Baldwin
/*
 * .Ldone8x
 * Common exit of ChaCha20_8x: clear every ymm register, reload %rsp from
 * %r9 and return. %r9 presumably carries the stack pointer saved by the
 * prologue (outside this chunk) -- confirm.
 * NOTE(review): nothing on this visible exit path overwrites the 64 bytes
 * that the tail paths stage at 0(%rsp)/32(%rsp) before %rsp is restored;
 * whether that matters is a question for the generator, chacha-x86_64.pl.
 */
2187bc3d5698SJohn Baldwin.Ldone8x:
2188bc3d5698SJohn Baldwin	vzeroall
2189bc3d5698SJohn Baldwin	leaq	(%r9),%rsp
	/* From here on the unwinder computes the CFA from %rsp again. */
2190bc3d5698SJohn Baldwin.cfi_def_cfa_register	%rsp
2191bc3d5698SJohn Baldwin.L8x_epilogue:
	/* 0xf3,0xc3 is the two-byte "rep ret" encoding (REP prefix + near RET). */
2192bc3d5698SJohn Baldwin	.byte	0xf3,0xc3
2193bc3d5698SJohn Baldwin.cfi_endproc
	/* Symbol size = distance from the ChaCha20_8x label to this point. */
2194bc3d5698SJohn Baldwin.size	ChaCha20_8x,.-ChaCha20_8x
/*
 * ELF note placed in the allocatable ("a") section .note.gnu.property.
 * Note header, three 32-bit words:
 *   namesz = 1f - 0f  (length of the name bytes, "GNU" plus NUL = 4)
 *   descsz = 4f - 1f  (descriptor length, including alignment padding)
 *   type   = 5        (NT_GNU_PROPERTY_TYPE_0)
 * NOTE(review): the property below declares IBT and SHSTK compatibility
 * for the whole object. IBT compatibility requires indirectly-called entry
 * points to begin with endbr64; that cannot be verified from this chunk.
 */
2195*c0855eaaSJohn Baldwin	.section ".note.gnu.property", "a"
2196*c0855eaaSJohn Baldwin	.p2align 3
2197*c0855eaaSJohn Baldwin	.long 1f - 0f
2198*c0855eaaSJohn Baldwin	.long 4f - 1f
2199*c0855eaaSJohn Baldwin	.long 5
2200*c0855eaaSJohn Baldwin0:
2201*c0855eaaSJohn Baldwin	# "GNU" encoded with .byte, since .asciz isn't supported
2202*c0855eaaSJohn Baldwin	# on Solaris.
2203*c0855eaaSJohn Baldwin	.byte 0x47
2204*c0855eaaSJohn Baldwin	.byte 0x4e
2205*c0855eaaSJohn Baldwin	.byte 0x55
2206*c0855eaaSJohn Baldwin	.byte 0
2207*c0855eaaSJohn Baldwin1:
	/*
	 * Descriptor: a single property, padded to an 8-byte boundary.
	 *   pr_type   = 0xc0000002  (GNU_PROPERTY_X86_FEATURE_1_AND)
	 *   pr_datasz = 3f - 2f     (4 bytes)
	 *   pr_data   = 3           (bit 0 = IBT, bit 1 = SHSTK)
	 */
2208*c0855eaaSJohn Baldwin	.p2align 3
2209*c0855eaaSJohn Baldwin	.long 0xc0000002
2210*c0855eaaSJohn Baldwin	.long 3f - 2f
2211*c0855eaaSJohn Baldwin2:
2212*c0855eaaSJohn Baldwin	.long 3
2213*c0855eaaSJohn Baldwin3:
2214*c0855eaaSJohn Baldwin	.p2align 3
2215*c0855eaaSJohn Baldwin4:
2216