xref: /freebsd/sys/crypto/openssl/amd64/ghash-x86_64.S (revision df21a004be237a1dccd03c7b47254625eea62fa9)
1/* Do not modify. This file is auto-generated from ghash-x86_64.pl. */
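/*
 * GHASH (GCM universal hashing) building blocks for x86_64, generated from
 * ghash-x86_64.pl.  Three implementations are provided and selected at run
 * time by the caller: a 4-bit table-driven version (gcm_gmult_4bit,
 * gcm_ghash_4bit), a PCLMULQDQ version (gcm_init/gmult/ghash_clmul) and an
 * AVX version (gcm_init/gmult/ghash_avx).  Throughout the file,
 * ".byte 243,15,30,250" at each entry point is the endbr64 CET marker, and
 * the ".byte 102,15,58,68,..." / ".byte 102,15,56,0,..." sequences are
 * pclmulqdq / pshufb instructions emitted as raw opcode bytes.
 */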
2.text
3
4
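/*
 * void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
 *
 * Multiplies the hash value Xi (%rdi) by the key H in GF(2^128) using the
 * 4-bit table method: two nibbles of Xi are consumed per pass through
 * .Loop1, with .Lrem_4bit supplying the reduction constants.
 */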
5.globl	gcm_gmult_4bit
6.type	gcm_gmult_4bit,@function
7.align	16
8gcm_gmult_4bit:
9.cfi_startproc
10.byte	243,15,30,250
11	pushq	%rbx
12.cfi_adjust_cfa_offset	8
13.cfi_offset	%rbx,-16
14	pushq	%rbp
15.cfi_adjust_cfa_offset	8
16.cfi_offset	%rbp,-24
17	pushq	%r12
18.cfi_adjust_cfa_offset	8
19.cfi_offset	%r12,-32
20	pushq	%r13
21.cfi_adjust_cfa_offset	8
22.cfi_offset	%r13,-40
23	pushq	%r14
24.cfi_adjust_cfa_offset	8
25.cfi_offset	%r14,-48
26	pushq	%r15
27.cfi_adjust_cfa_offset	8
28.cfi_offset	%r15,-56
29	subq	$280,%rsp
30.cfi_adjust_cfa_offset	280
31.Lgmult_prologue:
32
33	movzbq	15(%rdi),%r8
34	leaq	.Lrem_4bit(%rip),%r11
35	xorq	%rax,%rax
36	xorq	%rbx,%rbx
37	movb	%r8b,%al
38	movb	%r8b,%bl
39	shlb	$4,%al
40	movq	$14,%rcx
41	movq	8(%rsi,%rax,1),%r8
42	movq	(%rsi,%rax,1),%r9
43	andb	$0xf0,%bl
44	movq	%r8,%rdx
45	jmp	.Loop1
46
47.align	16
48.Loop1:
49	shrq	$4,%r8
50	andq	$0xf,%rdx
51	movq	%r9,%r10
52	movb	(%rdi,%rcx,1),%al
53	shrq	$4,%r9
54	xorq	8(%rsi,%rbx,1),%r8
55	shlq	$60,%r10
56	xorq	(%rsi,%rbx,1),%r9
57	movb	%al,%bl
58	xorq	(%r11,%rdx,8),%r9
59	movq	%r8,%rdx
60	shlb	$4,%al
61	xorq	%r10,%r8
62	decq	%rcx
63	js	.Lbreak1
64
65	shrq	$4,%r8
66	andq	$0xf,%rdx
67	movq	%r9,%r10
68	shrq	$4,%r9
69	xorq	8(%rsi,%rax,1),%r8
70	shlq	$60,%r10
71	xorq	(%rsi,%rax,1),%r9
72	andb	$0xf0,%bl
73	xorq	(%r11,%rdx,8),%r9
74	movq	%r8,%rdx
75	xorq	%r10,%r8
76	jmp	.Loop1
77
78.align	16
79.Lbreak1:
80	shrq	$4,%r8
81	andq	$0xf,%rdx
82	movq	%r9,%r10
83	shrq	$4,%r9
84	xorq	8(%rsi,%rax,1),%r8
85	shlq	$60,%r10
86	xorq	(%rsi,%rax,1),%r9
87	andb	$0xf0,%bl
88	xorq	(%r11,%rdx,8),%r9
89	movq	%r8,%rdx
90	xorq	%r10,%r8
91
92	shrq	$4,%r8
93	andq	$0xf,%rdx
94	movq	%r9,%r10
95	shrq	$4,%r9
96	xorq	8(%rsi,%rbx,1),%r8
97	shlq	$60,%r10
98	xorq	(%rsi,%rbx,1),%r9
99	xorq	%r10,%r8
100	xorq	(%r11,%rdx,8),%r9
101
102	bswapq	%r8
103	bswapq	%r9
104	movq	%r8,8(%rdi)
105	movq	%r9,(%rdi)
106
107	leaq	280+48(%rsp),%rsi
108.cfi_def_cfa	%rsi,8
109	movq	-8(%rsi),%rbx
110.cfi_restore	%rbx
111	leaq	(%rsi),%rsp
112.cfi_def_cfa_register	%rsp
113.Lgmult_epilogue:
114	.byte	0xf3,0xc3
115.cfi_endproc
116.size	gcm_gmult_4bit,.-gcm_gmult_4bit
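/*
 * void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
 *                     const u8 *inp, size_t len)
 *
 * Folds len bytes (a multiple of 16) from inp (%rdx) into Xi (%rdi), one
 * 16-byte block per iteration of .Louter_loop.
 */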
117.globl	gcm_ghash_4bit
118.type	gcm_ghash_4bit,@function
119.align	16
120gcm_ghash_4bit:
121.cfi_startproc
122.byte	243,15,30,250
123	pushq	%rbx
124.cfi_adjust_cfa_offset	8
125.cfi_offset	%rbx,-16
126	pushq	%rbp
127.cfi_adjust_cfa_offset	8
128.cfi_offset	%rbp,-24
129	pushq	%r12
130.cfi_adjust_cfa_offset	8
131.cfi_offset	%r12,-32
132	pushq	%r13
133.cfi_adjust_cfa_offset	8
134.cfi_offset	%r13,-40
135	pushq	%r14
136.cfi_adjust_cfa_offset	8
137.cfi_offset	%r14,-48
138	pushq	%r15
139.cfi_adjust_cfa_offset	8
140.cfi_offset	%r15,-56
141	subq	$280,%rsp
142.cfi_adjust_cfa_offset	280
143.Lghash_prologue:
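/*
 * The long straight-line block below unpacks the 16 Htable entries into
 * stack-local copies: each entry's low nibble (pre-shifted into the high
 * half of a byte) is collected at 0..15(%rsp), and a copy of the table
 * shifted right by 4 bits is stored around %rbp, so the byte-at-a-time
 * outer loop can combine two table lookups per input byte.
 */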
144	movq	%rdx,%r14
145	movq	%rcx,%r15
146	subq	$-128,%rsi
147	leaq	16+128(%rsp),%rbp
148	xorl	%edx,%edx
149	movq	0+0-128(%rsi),%r8
150	movq	0+8-128(%rsi),%rax
151	movb	%al,%dl
152	shrq	$4,%rax
153	movq	%r8,%r10
154	shrq	$4,%r8
155	movq	16+0-128(%rsi),%r9
156	shlb	$4,%dl
157	movq	16+8-128(%rsi),%rbx
158	shlq	$60,%r10
159	movb	%dl,0(%rsp)
160	orq	%r10,%rax
161	movb	%bl,%dl
162	shrq	$4,%rbx
163	movq	%r9,%r10
164	shrq	$4,%r9
165	movq	%r8,0(%rbp)
166	movq	32+0-128(%rsi),%r8
167	shlb	$4,%dl
168	movq	%rax,0-128(%rbp)
169	movq	32+8-128(%rsi),%rax
170	shlq	$60,%r10
171	movb	%dl,1(%rsp)
172	orq	%r10,%rbx
173	movb	%al,%dl
174	shrq	$4,%rax
175	movq	%r8,%r10
176	shrq	$4,%r8
177	movq	%r9,8(%rbp)
178	movq	48+0-128(%rsi),%r9
179	shlb	$4,%dl
180	movq	%rbx,8-128(%rbp)
181	movq	48+8-128(%rsi),%rbx
182	shlq	$60,%r10
183	movb	%dl,2(%rsp)
184	orq	%r10,%rax
185	movb	%bl,%dl
186	shrq	$4,%rbx
187	movq	%r9,%r10
188	shrq	$4,%r9
189	movq	%r8,16(%rbp)
190	movq	64+0-128(%rsi),%r8
191	shlb	$4,%dl
192	movq	%rax,16-128(%rbp)
193	movq	64+8-128(%rsi),%rax
194	shlq	$60,%r10
195	movb	%dl,3(%rsp)
196	orq	%r10,%rbx
197	movb	%al,%dl
198	shrq	$4,%rax
199	movq	%r8,%r10
200	shrq	$4,%r8
201	movq	%r9,24(%rbp)
202	movq	80+0-128(%rsi),%r9
203	shlb	$4,%dl
204	movq	%rbx,24-128(%rbp)
205	movq	80+8-128(%rsi),%rbx
206	shlq	$60,%r10
207	movb	%dl,4(%rsp)
208	orq	%r10,%rax
209	movb	%bl,%dl
210	shrq	$4,%rbx
211	movq	%r9,%r10
212	shrq	$4,%r9
213	movq	%r8,32(%rbp)
214	movq	96+0-128(%rsi),%r8
215	shlb	$4,%dl
216	movq	%rax,32-128(%rbp)
217	movq	96+8-128(%rsi),%rax
218	shlq	$60,%r10
219	movb	%dl,5(%rsp)
220	orq	%r10,%rbx
221	movb	%al,%dl
222	shrq	$4,%rax
223	movq	%r8,%r10
224	shrq	$4,%r8
225	movq	%r9,40(%rbp)
226	movq	112+0-128(%rsi),%r9
227	shlb	$4,%dl
228	movq	%rbx,40-128(%rbp)
229	movq	112+8-128(%rsi),%rbx
230	shlq	$60,%r10
231	movb	%dl,6(%rsp)
232	orq	%r10,%rax
233	movb	%bl,%dl
234	shrq	$4,%rbx
235	movq	%r9,%r10
236	shrq	$4,%r9
237	movq	%r8,48(%rbp)
238	movq	128+0-128(%rsi),%r8
239	shlb	$4,%dl
240	movq	%rax,48-128(%rbp)
241	movq	128+8-128(%rsi),%rax
242	shlq	$60,%r10
243	movb	%dl,7(%rsp)
244	orq	%r10,%rbx
245	movb	%al,%dl
246	shrq	$4,%rax
247	movq	%r8,%r10
248	shrq	$4,%r8
249	movq	%r9,56(%rbp)
250	movq	144+0-128(%rsi),%r9
251	shlb	$4,%dl
252	movq	%rbx,56-128(%rbp)
253	movq	144+8-128(%rsi),%rbx
254	shlq	$60,%r10
255	movb	%dl,8(%rsp)
256	orq	%r10,%rax
257	movb	%bl,%dl
258	shrq	$4,%rbx
259	movq	%r9,%r10
260	shrq	$4,%r9
261	movq	%r8,64(%rbp)
262	movq	160+0-128(%rsi),%r8
263	shlb	$4,%dl
264	movq	%rax,64-128(%rbp)
265	movq	160+8-128(%rsi),%rax
266	shlq	$60,%r10
267	movb	%dl,9(%rsp)
268	orq	%r10,%rbx
269	movb	%al,%dl
270	shrq	$4,%rax
271	movq	%r8,%r10
272	shrq	$4,%r8
273	movq	%r9,72(%rbp)
274	movq	176+0-128(%rsi),%r9
275	shlb	$4,%dl
276	movq	%rbx,72-128(%rbp)
277	movq	176+8-128(%rsi),%rbx
278	shlq	$60,%r10
279	movb	%dl,10(%rsp)
280	orq	%r10,%rax
281	movb	%bl,%dl
282	shrq	$4,%rbx
283	movq	%r9,%r10
284	shrq	$4,%r9
285	movq	%r8,80(%rbp)
286	movq	192+0-128(%rsi),%r8
287	shlb	$4,%dl
288	movq	%rax,80-128(%rbp)
289	movq	192+8-128(%rsi),%rax
290	shlq	$60,%r10
291	movb	%dl,11(%rsp)
292	orq	%r10,%rbx
293	movb	%al,%dl
294	shrq	$4,%rax
295	movq	%r8,%r10
296	shrq	$4,%r8
297	movq	%r9,88(%rbp)
298	movq	208+0-128(%rsi),%r9
299	shlb	$4,%dl
300	movq	%rbx,88-128(%rbp)
301	movq	208+8-128(%rsi),%rbx
302	shlq	$60,%r10
303	movb	%dl,12(%rsp)
304	orq	%r10,%rax
305	movb	%bl,%dl
306	shrq	$4,%rbx
307	movq	%r9,%r10
308	shrq	$4,%r9
309	movq	%r8,96(%rbp)
310	movq	224+0-128(%rsi),%r8
311	shlb	$4,%dl
312	movq	%rax,96-128(%rbp)
313	movq	224+8-128(%rsi),%rax
314	shlq	$60,%r10
315	movb	%dl,13(%rsp)
316	orq	%r10,%rbx
317	movb	%al,%dl
318	shrq	$4,%rax
319	movq	%r8,%r10
320	shrq	$4,%r8
321	movq	%r9,104(%rbp)
322	movq	240+0-128(%rsi),%r9
323	shlb	$4,%dl
324	movq	%rbx,104-128(%rbp)
325	movq	240+8-128(%rsi),%rbx
326	shlq	$60,%r10
327	movb	%dl,14(%rsp)
328	orq	%r10,%rax
329	movb	%bl,%dl
330	shrq	$4,%rbx
331	movq	%r9,%r10
332	shrq	$4,%r9
333	movq	%r8,112(%rbp)
334	shlb	$4,%dl
335	movq	%rax,112-128(%rbp)
336	shlq	$60,%r10
337	movb	%dl,15(%rsp)
338	orq	%r10,%rbx
339	movq	%r9,120(%rbp)
340	movq	%rbx,120-128(%rbp)
341	addq	$-128,%rsi
342	movq	8(%rdi),%r8
343	movq	0(%rdi),%r9
344	addq	%r14,%r15
345	leaq	.Lrem_8bit(%rip),%r11
346	jmp	.Louter_loop
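/*
 * Main GHASH loop: XOR the next 16-byte input block into Xi, then multiply
 * by H one input byte at a time, combining lookups in the shifted Htable
 * copies with the 8-bit reduction table .Lrem_8bit.
 */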
347.align	16
348.Louter_loop:
349	xorq	(%r14),%r9
350	movq	8(%r14),%rdx
351	leaq	16(%r14),%r14
352	xorq	%r8,%rdx
353	movq	%r9,(%rdi)
354	movq	%rdx,8(%rdi)
355	shrq	$32,%rdx
356	xorq	%rax,%rax
357	roll	$8,%edx
358	movb	%dl,%al
359	movzbl	%dl,%ebx
360	shlb	$4,%al
361	shrl	$4,%ebx
362	roll	$8,%edx
363	movq	8(%rsi,%rax,1),%r8
364	movq	(%rsi,%rax,1),%r9
365	movb	%dl,%al
366	movzbl	%dl,%ecx
367	shlb	$4,%al
368	movzbq	(%rsp,%rbx,1),%r12
369	shrl	$4,%ecx
370	xorq	%r8,%r12
371	movq	%r9,%r10
372	shrq	$8,%r8
373	movzbq	%r12b,%r12
374	shrq	$8,%r9
375	xorq	-128(%rbp,%rbx,8),%r8
376	shlq	$56,%r10
377	xorq	(%rbp,%rbx,8),%r9
378	roll	$8,%edx
379	xorq	8(%rsi,%rax,1),%r8
380	xorq	(%rsi,%rax,1),%r9
381	movb	%dl,%al
382	xorq	%r10,%r8
383	movzwq	(%r11,%r12,2),%r12
384	movzbl	%dl,%ebx
385	shlb	$4,%al
386	movzbq	(%rsp,%rcx,1),%r13
387	shrl	$4,%ebx
388	shlq	$48,%r12
389	xorq	%r8,%r13
390	movq	%r9,%r10
391	xorq	%r12,%r9
392	shrq	$8,%r8
393	movzbq	%r13b,%r13
394	shrq	$8,%r9
395	xorq	-128(%rbp,%rcx,8),%r8
396	shlq	$56,%r10
397	xorq	(%rbp,%rcx,8),%r9
398	roll	$8,%edx
399	xorq	8(%rsi,%rax,1),%r8
400	xorq	(%rsi,%rax,1),%r9
401	movb	%dl,%al
402	xorq	%r10,%r8
403	movzwq	(%r11,%r13,2),%r13
404	movzbl	%dl,%ecx
405	shlb	$4,%al
406	movzbq	(%rsp,%rbx,1),%r12
407	shrl	$4,%ecx
408	shlq	$48,%r13
409	xorq	%r8,%r12
410	movq	%r9,%r10
411	xorq	%r13,%r9
412	shrq	$8,%r8
413	movzbq	%r12b,%r12
414	movl	8(%rdi),%edx
415	shrq	$8,%r9
416	xorq	-128(%rbp,%rbx,8),%r8
417	shlq	$56,%r10
418	xorq	(%rbp,%rbx,8),%r9
419	roll	$8,%edx
420	xorq	8(%rsi,%rax,1),%r8
421	xorq	(%rsi,%rax,1),%r9
422	movb	%dl,%al
423	xorq	%r10,%r8
424	movzwq	(%r11,%r12,2),%r12
425	movzbl	%dl,%ebx
426	shlb	$4,%al
427	movzbq	(%rsp,%rcx,1),%r13
428	shrl	$4,%ebx
429	shlq	$48,%r12
430	xorq	%r8,%r13
431	movq	%r9,%r10
432	xorq	%r12,%r9
433	shrq	$8,%r8
434	movzbq	%r13b,%r13
435	shrq	$8,%r9
436	xorq	-128(%rbp,%rcx,8),%r8
437	shlq	$56,%r10
438	xorq	(%rbp,%rcx,8),%r9
439	roll	$8,%edx
440	xorq	8(%rsi,%rax,1),%r8
441	xorq	(%rsi,%rax,1),%r9
442	movb	%dl,%al
443	xorq	%r10,%r8
444	movzwq	(%r11,%r13,2),%r13
445	movzbl	%dl,%ecx
446	shlb	$4,%al
447	movzbq	(%rsp,%rbx,1),%r12
448	shrl	$4,%ecx
449	shlq	$48,%r13
450	xorq	%r8,%r12
451	movq	%r9,%r10
452	xorq	%r13,%r9
453	shrq	$8,%r8
454	movzbq	%r12b,%r12
455	shrq	$8,%r9
456	xorq	-128(%rbp,%rbx,8),%r8
457	shlq	$56,%r10
458	xorq	(%rbp,%rbx,8),%r9
459	roll	$8,%edx
460	xorq	8(%rsi,%rax,1),%r8
461	xorq	(%rsi,%rax,1),%r9
462	movb	%dl,%al
463	xorq	%r10,%r8
464	movzwq	(%r11,%r12,2),%r12
465	movzbl	%dl,%ebx
466	shlb	$4,%al
467	movzbq	(%rsp,%rcx,1),%r13
468	shrl	$4,%ebx
469	shlq	$48,%r12
470	xorq	%r8,%r13
471	movq	%r9,%r10
472	xorq	%r12,%r9
473	shrq	$8,%r8
474	movzbq	%r13b,%r13
475	shrq	$8,%r9
476	xorq	-128(%rbp,%rcx,8),%r8
477	shlq	$56,%r10
478	xorq	(%rbp,%rcx,8),%r9
479	roll	$8,%edx
480	xorq	8(%rsi,%rax,1),%r8
481	xorq	(%rsi,%rax,1),%r9
482	movb	%dl,%al
483	xorq	%r10,%r8
484	movzwq	(%r11,%r13,2),%r13
485	movzbl	%dl,%ecx
486	shlb	$4,%al
487	movzbq	(%rsp,%rbx,1),%r12
488	shrl	$4,%ecx
489	shlq	$48,%r13
490	xorq	%r8,%r12
491	movq	%r9,%r10
492	xorq	%r13,%r9
493	shrq	$8,%r8
494	movzbq	%r12b,%r12
495	movl	4(%rdi),%edx
496	shrq	$8,%r9
497	xorq	-128(%rbp,%rbx,8),%r8
498	shlq	$56,%r10
499	xorq	(%rbp,%rbx,8),%r9
500	roll	$8,%edx
501	xorq	8(%rsi,%rax,1),%r8
502	xorq	(%rsi,%rax,1),%r9
503	movb	%dl,%al
504	xorq	%r10,%r8
505	movzwq	(%r11,%r12,2),%r12
506	movzbl	%dl,%ebx
507	shlb	$4,%al
508	movzbq	(%rsp,%rcx,1),%r13
509	shrl	$4,%ebx
510	shlq	$48,%r12
511	xorq	%r8,%r13
512	movq	%r9,%r10
513	xorq	%r12,%r9
514	shrq	$8,%r8
515	movzbq	%r13b,%r13
516	shrq	$8,%r9
517	xorq	-128(%rbp,%rcx,8),%r8
518	shlq	$56,%r10
519	xorq	(%rbp,%rcx,8),%r9
520	roll	$8,%edx
521	xorq	8(%rsi,%rax,1),%r8
522	xorq	(%rsi,%rax,1),%r9
523	movb	%dl,%al
524	xorq	%r10,%r8
525	movzwq	(%r11,%r13,2),%r13
526	movzbl	%dl,%ecx
527	shlb	$4,%al
528	movzbq	(%rsp,%rbx,1),%r12
529	shrl	$4,%ecx
530	shlq	$48,%r13
531	xorq	%r8,%r12
532	movq	%r9,%r10
533	xorq	%r13,%r9
534	shrq	$8,%r8
535	movzbq	%r12b,%r12
536	shrq	$8,%r9
537	xorq	-128(%rbp,%rbx,8),%r8
538	shlq	$56,%r10
539	xorq	(%rbp,%rbx,8),%r9
540	roll	$8,%edx
541	xorq	8(%rsi,%rax,1),%r8
542	xorq	(%rsi,%rax,1),%r9
543	movb	%dl,%al
544	xorq	%r10,%r8
545	movzwq	(%r11,%r12,2),%r12
546	movzbl	%dl,%ebx
547	shlb	$4,%al
548	movzbq	(%rsp,%rcx,1),%r13
549	shrl	$4,%ebx
550	shlq	$48,%r12
551	xorq	%r8,%r13
552	movq	%r9,%r10
553	xorq	%r12,%r9
554	shrq	$8,%r8
555	movzbq	%r13b,%r13
556	shrq	$8,%r9
557	xorq	-128(%rbp,%rcx,8),%r8
558	shlq	$56,%r10
559	xorq	(%rbp,%rcx,8),%r9
560	roll	$8,%edx
561	xorq	8(%rsi,%rax,1),%r8
562	xorq	(%rsi,%rax,1),%r9
563	movb	%dl,%al
564	xorq	%r10,%r8
565	movzwq	(%r11,%r13,2),%r13
566	movzbl	%dl,%ecx
567	shlb	$4,%al
568	movzbq	(%rsp,%rbx,1),%r12
569	shrl	$4,%ecx
570	shlq	$48,%r13
571	xorq	%r8,%r12
572	movq	%r9,%r10
573	xorq	%r13,%r9
574	shrq	$8,%r8
575	movzbq	%r12b,%r12
576	movl	0(%rdi),%edx
577	shrq	$8,%r9
578	xorq	-128(%rbp,%rbx,8),%r8
579	shlq	$56,%r10
580	xorq	(%rbp,%rbx,8),%r9
581	roll	$8,%edx
582	xorq	8(%rsi,%rax,1),%r8
583	xorq	(%rsi,%rax,1),%r9
584	movb	%dl,%al
585	xorq	%r10,%r8
586	movzwq	(%r11,%r12,2),%r12
587	movzbl	%dl,%ebx
588	shlb	$4,%al
589	movzbq	(%rsp,%rcx,1),%r13
590	shrl	$4,%ebx
591	shlq	$48,%r12
592	xorq	%r8,%r13
593	movq	%r9,%r10
594	xorq	%r12,%r9
595	shrq	$8,%r8
596	movzbq	%r13b,%r13
597	shrq	$8,%r9
598	xorq	-128(%rbp,%rcx,8),%r8
599	shlq	$56,%r10
600	xorq	(%rbp,%rcx,8),%r9
601	roll	$8,%edx
602	xorq	8(%rsi,%rax,1),%r8
603	xorq	(%rsi,%rax,1),%r9
604	movb	%dl,%al
605	xorq	%r10,%r8
606	movzwq	(%r11,%r13,2),%r13
607	movzbl	%dl,%ecx
608	shlb	$4,%al
609	movzbq	(%rsp,%rbx,1),%r12
610	shrl	$4,%ecx
611	shlq	$48,%r13
612	xorq	%r8,%r12
613	movq	%r9,%r10
614	xorq	%r13,%r9
615	shrq	$8,%r8
616	movzbq	%r12b,%r12
617	shrq	$8,%r9
618	xorq	-128(%rbp,%rbx,8),%r8
619	shlq	$56,%r10
620	xorq	(%rbp,%rbx,8),%r9
621	roll	$8,%edx
622	xorq	8(%rsi,%rax,1),%r8
623	xorq	(%rsi,%rax,1),%r9
624	movb	%dl,%al
625	xorq	%r10,%r8
626	movzwq	(%r11,%r12,2),%r12
627	movzbl	%dl,%ebx
628	shlb	$4,%al
629	movzbq	(%rsp,%rcx,1),%r13
630	shrl	$4,%ebx
631	shlq	$48,%r12
632	xorq	%r8,%r13
633	movq	%r9,%r10
634	xorq	%r12,%r9
635	shrq	$8,%r8
636	movzbq	%r13b,%r13
637	shrq	$8,%r9
638	xorq	-128(%rbp,%rcx,8),%r8
639	shlq	$56,%r10
640	xorq	(%rbp,%rcx,8),%r9
641	roll	$8,%edx
642	xorq	8(%rsi,%rax,1),%r8
643	xorq	(%rsi,%rax,1),%r9
644	movb	%dl,%al
645	xorq	%r10,%r8
646	movzwq	(%r11,%r13,2),%r13
647	movzbl	%dl,%ecx
648	shlb	$4,%al
649	movzbq	(%rsp,%rbx,1),%r12
650	andl	$240,%ecx
651	shlq	$48,%r13
652	xorq	%r8,%r12
653	movq	%r9,%r10
654	xorq	%r13,%r9
655	shrq	$8,%r8
656	movzbq	%r12b,%r12
657	movl	-4(%rdi),%edx
658	shrq	$8,%r9
659	xorq	-128(%rbp,%rbx,8),%r8
660	shlq	$56,%r10
661	xorq	(%rbp,%rbx,8),%r9
662	movzwq	(%r11,%r12,2),%r12
663	xorq	8(%rsi,%rax,1),%r8
664	xorq	(%rsi,%rax,1),%r9
665	shlq	$48,%r12
666	xorq	%r10,%r8
667	xorq	%r12,%r9
668	movzbq	%r8b,%r13
669	shrq	$4,%r8
670	movq	%r9,%r10
671	shlb	$4,%r13b
672	shrq	$4,%r9
673	xorq	8(%rsi,%rcx,1),%r8
674	movzwq	(%r11,%r13,2),%r13
675	shlq	$60,%r10
676	xorq	(%rsi,%rcx,1),%r9
677	xorq	%r10,%r8
678	shlq	$48,%r13
679	bswapq	%r8
680	xorq	%r13,%r9
681	bswapq	%r9
682	cmpq	%r15,%r14
683	jb	.Louter_loop
684	movq	%r8,8(%rdi)
685	movq	%r9,(%rdi)
686
687	leaq	280+48(%rsp),%rsi
688.cfi_def_cfa	%rsi,8
689	movq	-48(%rsi),%r15
690.cfi_restore	%r15
691	movq	-40(%rsi),%r14
692.cfi_restore	%r14
693	movq	-32(%rsi),%r13
694.cfi_restore	%r13
695	movq	-24(%rsi),%r12
696.cfi_restore	%r12
697	movq	-16(%rsi),%rbp
698.cfi_restore	%rbp
699	movq	-8(%rsi),%rbx
700.cfi_restore	%rbx
701	leaq	0(%rsi),%rsp
702.cfi_def_cfa_register	%rsp
703.Lghash_epilogue:
704	.byte	0xf3,0xc3
705.cfi_endproc
706.size	gcm_ghash_4bit,.-gcm_ghash_4bit
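/*
 * void gcm_init_clmul(u128 Htable[16], const u64 H[2])
 *
 * PCLMULQDQ key setup: converts H into the reduced representation used by
 * the carry-less-multiply code and stores H, H^2, H^3 and H^4 at
 * 0/16/48/64(%rdi), with the Karatsuba "hi^lo" halves interleaved at
 * 32(%rdi) and 80(%rdi).
 */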
707.globl	gcm_init_clmul
708.type	gcm_init_clmul,@function
709.align	16
710gcm_init_clmul:
711.cfi_startproc
712.byte	243,15,30,250
713.L_init_clmul:
714	movdqu	(%rsi),%xmm2
715	pshufd	$78,%xmm2,%xmm2
716
717
718	pshufd	$255,%xmm2,%xmm4
719	movdqa	%xmm2,%xmm3
720	psllq	$1,%xmm2
721	pxor	%xmm5,%xmm5
722	psrlq	$63,%xmm3
723	pcmpgtd	%xmm4,%xmm5
724	pslldq	$8,%xmm3
725	por	%xmm3,%xmm2
726
727
728	pand	.L0x1c2_polynomial(%rip),%xmm5
729	pxor	%xmm5,%xmm2
730
731
732	pshufd	$78,%xmm2,%xmm6
733	movdqa	%xmm2,%xmm0
734	pxor	%xmm2,%xmm6
735	movdqa	%xmm0,%xmm1
736	pshufd	$78,%xmm0,%xmm3
737	pxor	%xmm0,%xmm3
738.byte	102,15,58,68,194,0
739.byte	102,15,58,68,202,17
740.byte	102,15,58,68,222,0
741	pxor	%xmm0,%xmm3
742	pxor	%xmm1,%xmm3
743
744	movdqa	%xmm3,%xmm4
745	psrldq	$8,%xmm3
746	pslldq	$8,%xmm4
747	pxor	%xmm3,%xmm1
748	pxor	%xmm4,%xmm0
749
750	movdqa	%xmm0,%xmm4
751	movdqa	%xmm0,%xmm3
752	psllq	$5,%xmm0
753	pxor	%xmm0,%xmm3
754	psllq	$1,%xmm0
755	pxor	%xmm3,%xmm0
756	psllq	$57,%xmm0
757	movdqa	%xmm0,%xmm3
758	pslldq	$8,%xmm0
759	psrldq	$8,%xmm3
760	pxor	%xmm4,%xmm0
761	pxor	%xmm3,%xmm1
762
763
764	movdqa	%xmm0,%xmm4
765	psrlq	$1,%xmm0
766	pxor	%xmm4,%xmm1
767	pxor	%xmm0,%xmm4
768	psrlq	$5,%xmm0
769	pxor	%xmm4,%xmm0
770	psrlq	$1,%xmm0
771	pxor	%xmm1,%xmm0
772	pshufd	$78,%xmm2,%xmm3
773	pshufd	$78,%xmm0,%xmm4
774	pxor	%xmm2,%xmm3
775	movdqu	%xmm2,0(%rdi)
776	pxor	%xmm0,%xmm4
777	movdqu	%xmm0,16(%rdi)
778.byte	102,15,58,15,227,8
779	movdqu	%xmm4,32(%rdi)
780	movdqa	%xmm0,%xmm1
781	pshufd	$78,%xmm0,%xmm3
782	pxor	%xmm0,%xmm3
783.byte	102,15,58,68,194,0
784.byte	102,15,58,68,202,17
785.byte	102,15,58,68,222,0
786	pxor	%xmm0,%xmm3
787	pxor	%xmm1,%xmm3
788
789	movdqa	%xmm3,%xmm4
790	psrldq	$8,%xmm3
791	pslldq	$8,%xmm4
792	pxor	%xmm3,%xmm1
793	pxor	%xmm4,%xmm0
794
795	movdqa	%xmm0,%xmm4
796	movdqa	%xmm0,%xmm3
797	psllq	$5,%xmm0
798	pxor	%xmm0,%xmm3
799	psllq	$1,%xmm0
800	pxor	%xmm3,%xmm0
801	psllq	$57,%xmm0
802	movdqa	%xmm0,%xmm3
803	pslldq	$8,%xmm0
804	psrldq	$8,%xmm3
805	pxor	%xmm4,%xmm0
806	pxor	%xmm3,%xmm1
807
808
809	movdqa	%xmm0,%xmm4
810	psrlq	$1,%xmm0
811	pxor	%xmm4,%xmm1
812	pxor	%xmm0,%xmm4
813	psrlq	$5,%xmm0
814	pxor	%xmm4,%xmm0
815	psrlq	$1,%xmm0
816	pxor	%xmm1,%xmm0
817	movdqa	%xmm0,%xmm5
818	movdqa	%xmm0,%xmm1
819	pshufd	$78,%xmm0,%xmm3
820	pxor	%xmm0,%xmm3
821.byte	102,15,58,68,194,0
822.byte	102,15,58,68,202,17
823.byte	102,15,58,68,222,0
824	pxor	%xmm0,%xmm3
825	pxor	%xmm1,%xmm3
826
827	movdqa	%xmm3,%xmm4
828	psrldq	$8,%xmm3
829	pslldq	$8,%xmm4
830	pxor	%xmm3,%xmm1
831	pxor	%xmm4,%xmm0
832
833	movdqa	%xmm0,%xmm4
834	movdqa	%xmm0,%xmm3
835	psllq	$5,%xmm0
836	pxor	%xmm0,%xmm3
837	psllq	$1,%xmm0
838	pxor	%xmm3,%xmm0
839	psllq	$57,%xmm0
840	movdqa	%xmm0,%xmm3
841	pslldq	$8,%xmm0
842	psrldq	$8,%xmm3
843	pxor	%xmm4,%xmm0
844	pxor	%xmm3,%xmm1
845
846
847	movdqa	%xmm0,%xmm4
848	psrlq	$1,%xmm0
849	pxor	%xmm4,%xmm1
850	pxor	%xmm0,%xmm4
851	psrlq	$5,%xmm0
852	pxor	%xmm4,%xmm0
853	psrlq	$1,%xmm0
854	pxor	%xmm1,%xmm0
855	pshufd	$78,%xmm5,%xmm3
856	pshufd	$78,%xmm0,%xmm4
857	pxor	%xmm5,%xmm3
858	movdqu	%xmm5,48(%rdi)
859	pxor	%xmm0,%xmm4
860	movdqu	%xmm0,64(%rdi)
861.byte	102,15,58,15,227,8
862	movdqu	%xmm4,80(%rdi)
863	.byte	0xf3,0xc3
864.cfi_endproc
865.size	gcm_init_clmul,.-gcm_init_clmul
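/*
 * void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16])
 *
 * Single GHASH multiplication with PCLMULQDQ: byte-swap Xi, perform one
 * Karatsuba carry-less multiply by H, reduce modulo the GHASH polynomial,
 * and byte-swap the result back into Xi.
 */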
866.globl	gcm_gmult_clmul
867.type	gcm_gmult_clmul,@function
868.align	16
869gcm_gmult_clmul:
870.cfi_startproc
871.byte	243,15,30,250
872.L_gmult_clmul:
873	movdqu	(%rdi),%xmm0
874	movdqa	.Lbswap_mask(%rip),%xmm5
875	movdqu	(%rsi),%xmm2
876	movdqu	32(%rsi),%xmm4
877.byte	102,15,56,0,197
878	movdqa	%xmm0,%xmm1
879	pshufd	$78,%xmm0,%xmm3
880	pxor	%xmm0,%xmm3
881.byte	102,15,58,68,194,0
882.byte	102,15,58,68,202,17
883.byte	102,15,58,68,220,0
884	pxor	%xmm0,%xmm3
885	pxor	%xmm1,%xmm3
886
887	movdqa	%xmm3,%xmm4
888	psrldq	$8,%xmm3
889	pslldq	$8,%xmm4
890	pxor	%xmm3,%xmm1
891	pxor	%xmm4,%xmm0
892
893	movdqa	%xmm0,%xmm4
894	movdqa	%xmm0,%xmm3
895	psllq	$5,%xmm0
896	pxor	%xmm0,%xmm3
897	psllq	$1,%xmm0
898	pxor	%xmm3,%xmm0
899	psllq	$57,%xmm0
900	movdqa	%xmm0,%xmm3
901	pslldq	$8,%xmm0
902	psrldq	$8,%xmm3
903	pxor	%xmm4,%xmm0
904	pxor	%xmm3,%xmm1
905
906
907	movdqa	%xmm0,%xmm4
908	psrlq	$1,%xmm0
909	pxor	%xmm4,%xmm1
910	pxor	%xmm0,%xmm4
911	psrlq	$5,%xmm0
912	pxor	%xmm4,%xmm0
913	psrlq	$1,%xmm0
914	pxor	%xmm1,%xmm0
915.byte	102,15,56,0,197
916	movdqu	%xmm0,(%rdi)
917	.byte	0xf3,0xc3
918.cfi_endproc
919.size	gcm_gmult_clmul,.-gcm_gmult_clmul
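/*
 * void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16],
 *                      const u8 *inp, size_t len)
 *
 * PCLMULQDQ bulk hashing.  When at least four blocks remain (and the CPU
 * check below does not divert to .Lskip4x), blocks are processed four at a
 * time with a single deferred reduction per group; shorter inputs fall
 * through to the two-block and one-block tails.
 */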
920.globl	gcm_ghash_clmul
921.type	gcm_ghash_clmul,@function
922.align	32
923gcm_ghash_clmul:
924.cfi_startproc
925.byte	243,15,30,250
926.L_ghash_clmul:
927	movdqa	.Lbswap_mask(%rip),%xmm10
928
929	movdqu	(%rdi),%xmm0
930	movdqu	(%rsi),%xmm2
931	movdqu	32(%rsi),%xmm7
932.byte	102,65,15,56,0,194
933
934	subq	$0x10,%rcx
935	jz	.Lodd_tail
936
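/*
 * The OPENSSL_ia32cap_P test a few lines below appears to be the usual
 * Silvermont/Atom detection (MOVBE present, XSAVE absent); on such cores
 * the wider 4-block path is skipped in favour of .Lskip4x.
 */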
937	movdqu	16(%rsi),%xmm6
938	movl	OPENSSL_ia32cap_P+4(%rip),%eax
939	cmpq	$0x30,%rcx
940	jb	.Lskip4x
941
942	andl	$71303168,%eax
943	cmpl	$4194304,%eax
944	je	.Lskip4x
945
946	subq	$0x30,%rcx
947	movq	$0xA040608020C0E000,%rax
948	movdqu	48(%rsi),%xmm14
949	movdqu	64(%rsi),%xmm15
950
951
952
953
954	movdqu	48(%rdx),%xmm3
955	movdqu	32(%rdx),%xmm11
956.byte	102,65,15,56,0,218
957.byte	102,69,15,56,0,218
958	movdqa	%xmm3,%xmm5
959	pshufd	$78,%xmm3,%xmm4
960	pxor	%xmm3,%xmm4
961.byte	102,15,58,68,218,0
962.byte	102,15,58,68,234,17
963.byte	102,15,58,68,231,0
964
965	movdqa	%xmm11,%xmm13
966	pshufd	$78,%xmm11,%xmm12
967	pxor	%xmm11,%xmm12
968.byte	102,68,15,58,68,222,0
969.byte	102,68,15,58,68,238,17
970.byte	102,68,15,58,68,231,16
971	xorps	%xmm11,%xmm3
972	xorps	%xmm13,%xmm5
973	movups	80(%rsi),%xmm7
974	xorps	%xmm12,%xmm4
975
976	movdqu	16(%rdx),%xmm11
977	movdqu	0(%rdx),%xmm8
978.byte	102,69,15,56,0,218
979.byte	102,69,15,56,0,194
980	movdqa	%xmm11,%xmm13
981	pshufd	$78,%xmm11,%xmm12
982	pxor	%xmm8,%xmm0
983	pxor	%xmm11,%xmm12
984.byte	102,69,15,58,68,222,0
985	movdqa	%xmm0,%xmm1
986	pshufd	$78,%xmm0,%xmm8
987	pxor	%xmm0,%xmm8
988.byte	102,69,15,58,68,238,17
989.byte	102,68,15,58,68,231,0
990	xorps	%xmm11,%xmm3
991	xorps	%xmm13,%xmm5
992
993	leaq	64(%rdx),%rdx
994	subq	$0x40,%rcx
995	jc	.Ltail4x
996
997	jmp	.Lmod4_loop
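/*
 * 4-block aggregated loop: the four per-block products are accumulated and
 * a single reduction (using .L7_mask and the 0x1c2 polynomial) is
 * interleaved with the loads and multiplies of the next group of blocks.
 */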
998.align	32
999.Lmod4_loop:
1000.byte	102,65,15,58,68,199,0
1001	xorps	%xmm12,%xmm4
1002	movdqu	48(%rdx),%xmm11
1003.byte	102,69,15,56,0,218
1004.byte	102,65,15,58,68,207,17
1005	xorps	%xmm3,%xmm0
1006	movdqu	32(%rdx),%xmm3
1007	movdqa	%xmm11,%xmm13
1008.byte	102,68,15,58,68,199,16
1009	pshufd	$78,%xmm11,%xmm12
1010	xorps	%xmm5,%xmm1
1011	pxor	%xmm11,%xmm12
1012.byte	102,65,15,56,0,218
1013	movups	32(%rsi),%xmm7
1014	xorps	%xmm4,%xmm8
1015.byte	102,68,15,58,68,218,0
1016	pshufd	$78,%xmm3,%xmm4
1017
1018	pxor	%xmm0,%xmm8
1019	movdqa	%xmm3,%xmm5
1020	pxor	%xmm1,%xmm8
1021	pxor	%xmm3,%xmm4
1022	movdqa	%xmm8,%xmm9
1023.byte	102,68,15,58,68,234,17
1024	pslldq	$8,%xmm8
1025	psrldq	$8,%xmm9
1026	pxor	%xmm8,%xmm0
1027	movdqa	.L7_mask(%rip),%xmm8
1028	pxor	%xmm9,%xmm1
1029.byte	102,76,15,110,200
1030
1031	pand	%xmm0,%xmm8
1032.byte	102,69,15,56,0,200
1033	pxor	%xmm0,%xmm9
1034.byte	102,68,15,58,68,231,0
1035	psllq	$57,%xmm9
1036	movdqa	%xmm9,%xmm8
1037	pslldq	$8,%xmm9
1038.byte	102,15,58,68,222,0
1039	psrldq	$8,%xmm8
1040	pxor	%xmm9,%xmm0
1041	pxor	%xmm8,%xmm1
1042	movdqu	0(%rdx),%xmm8
1043
1044	movdqa	%xmm0,%xmm9
1045	psrlq	$1,%xmm0
1046.byte	102,15,58,68,238,17
1047	xorps	%xmm11,%xmm3
1048	movdqu	16(%rdx),%xmm11
1049.byte	102,69,15,56,0,218
1050.byte	102,15,58,68,231,16
1051	xorps	%xmm13,%xmm5
1052	movups	80(%rsi),%xmm7
1053.byte	102,69,15,56,0,194
1054	pxor	%xmm9,%xmm1
1055	pxor	%xmm0,%xmm9
1056	psrlq	$5,%xmm0
1057
1058	movdqa	%xmm11,%xmm13
1059	pxor	%xmm12,%xmm4
1060	pshufd	$78,%xmm11,%xmm12
1061	pxor	%xmm9,%xmm0
1062	pxor	%xmm8,%xmm1
1063	pxor	%xmm11,%xmm12
1064.byte	102,69,15,58,68,222,0
1065	psrlq	$1,%xmm0
1066	pxor	%xmm1,%xmm0
1067	movdqa	%xmm0,%xmm1
1068.byte	102,69,15,58,68,238,17
1069	xorps	%xmm11,%xmm3
1070	pshufd	$78,%xmm0,%xmm8
1071	pxor	%xmm0,%xmm8
1072
1073.byte	102,68,15,58,68,231,0
1074	xorps	%xmm13,%xmm5
1075
1076	leaq	64(%rdx),%rdx
1077	subq	$0x40,%rcx
1078	jnc	.Lmod4_loop
1079
1080.Ltail4x:
1081.byte	102,65,15,58,68,199,0
1082.byte	102,65,15,58,68,207,17
1083.byte	102,68,15,58,68,199,16
1084	xorps	%xmm12,%xmm4
1085	xorps	%xmm3,%xmm0
1086	xorps	%xmm5,%xmm1
1087	pxor	%xmm0,%xmm1
1088	pxor	%xmm4,%xmm8
1089
1090	pxor	%xmm1,%xmm8
1091	pxor	%xmm0,%xmm1
1092
1093	movdqa	%xmm8,%xmm9
1094	psrldq	$8,%xmm8
1095	pslldq	$8,%xmm9
1096	pxor	%xmm8,%xmm1
1097	pxor	%xmm9,%xmm0
1098
1099	movdqa	%xmm0,%xmm4
1100	movdqa	%xmm0,%xmm3
1101	psllq	$5,%xmm0
1102	pxor	%xmm0,%xmm3
1103	psllq	$1,%xmm0
1104	pxor	%xmm3,%xmm0
1105	psllq	$57,%xmm0
1106	movdqa	%xmm0,%xmm3
1107	pslldq	$8,%xmm0
1108	psrldq	$8,%xmm3
1109	pxor	%xmm4,%xmm0
1110	pxor	%xmm3,%xmm1
1111
1112
1113	movdqa	%xmm0,%xmm4
1114	psrlq	$1,%xmm0
1115	pxor	%xmm4,%xmm1
1116	pxor	%xmm0,%xmm4
1117	psrlq	$5,%xmm0
1118	pxor	%xmm4,%xmm0
1119	psrlq	$1,%xmm0
1120	pxor	%xmm1,%xmm0
1121	addq	$0x40,%rcx
1122	jz	.Ldone
1123	movdqu	32(%rsi),%xmm7
1124	subq	$0x10,%rcx
1125	jz	.Lodd_tail
1126.Lskip4x:
1127
1128
1129
1130
1131
1132	movdqu	(%rdx),%xmm8
1133	movdqu	16(%rdx),%xmm3
1134.byte	102,69,15,56,0,194
1135.byte	102,65,15,56,0,218
1136	pxor	%xmm8,%xmm0
1137
1138	movdqa	%xmm3,%xmm5
1139	pshufd	$78,%xmm3,%xmm4
1140	pxor	%xmm3,%xmm4
1141.byte	102,15,58,68,218,0
1142.byte	102,15,58,68,234,17
1143.byte	102,15,58,68,231,0
1144
1145	leaq	32(%rdx),%rdx
1146	nop
1147	subq	$0x20,%rcx
1148	jbe	.Leven_tail
1149	nop
1150	jmp	.Lmod_loop
1151
1152.align	32
1153.Lmod_loop:
1154	movdqa	%xmm0,%xmm1
1155	movdqa	%xmm4,%xmm8
1156	pshufd	$78,%xmm0,%xmm4
1157	pxor	%xmm0,%xmm4
1158
1159.byte	102,15,58,68,198,0
1160.byte	102,15,58,68,206,17
1161.byte	102,15,58,68,231,16
1162
1163	pxor	%xmm3,%xmm0
1164	pxor	%xmm5,%xmm1
1165	movdqu	(%rdx),%xmm9
1166	pxor	%xmm0,%xmm8
1167.byte	102,69,15,56,0,202
1168	movdqu	16(%rdx),%xmm3
1169
1170	pxor	%xmm1,%xmm8
1171	pxor	%xmm9,%xmm1
1172	pxor	%xmm8,%xmm4
1173.byte	102,65,15,56,0,218
1174	movdqa	%xmm4,%xmm8
1175	psrldq	$8,%xmm8
1176	pslldq	$8,%xmm4
1177	pxor	%xmm8,%xmm1
1178	pxor	%xmm4,%xmm0
1179
1180	movdqa	%xmm3,%xmm5
1181
1182	movdqa	%xmm0,%xmm9
1183	movdqa	%xmm0,%xmm8
1184	psllq	$5,%xmm0
1185	pxor	%xmm0,%xmm8
1186.byte	102,15,58,68,218,0
1187	psllq	$1,%xmm0
1188	pxor	%xmm8,%xmm0
1189	psllq	$57,%xmm0
1190	movdqa	%xmm0,%xmm8
1191	pslldq	$8,%xmm0
1192	psrldq	$8,%xmm8
1193	pxor	%xmm9,%xmm0
1194	pshufd	$78,%xmm5,%xmm4
1195	pxor	%xmm8,%xmm1
1196	pxor	%xmm5,%xmm4
1197
1198	movdqa	%xmm0,%xmm9
1199	psrlq	$1,%xmm0
1200.byte	102,15,58,68,234,17
1201	pxor	%xmm9,%xmm1
1202	pxor	%xmm0,%xmm9
1203	psrlq	$5,%xmm0
1204	pxor	%xmm9,%xmm0
1205	leaq	32(%rdx),%rdx
1206	psrlq	$1,%xmm0
1207.byte	102,15,58,68,231,0
1208	pxor	%xmm1,%xmm0
1209
1210	subq	$0x20,%rcx
1211	ja	.Lmod_loop
1212
1213.Leven_tail:
1214	movdqa	%xmm0,%xmm1
1215	movdqa	%xmm4,%xmm8
1216	pshufd	$78,%xmm0,%xmm4
1217	pxor	%xmm0,%xmm4
1218
1219.byte	102,15,58,68,198,0
1220.byte	102,15,58,68,206,17
1221.byte	102,15,58,68,231,16
1222
1223	pxor	%xmm3,%xmm0
1224	pxor	%xmm5,%xmm1
1225	pxor	%xmm0,%xmm8
1226	pxor	%xmm1,%xmm8
1227	pxor	%xmm8,%xmm4
1228	movdqa	%xmm4,%xmm8
1229	psrldq	$8,%xmm8
1230	pslldq	$8,%xmm4
1231	pxor	%xmm8,%xmm1
1232	pxor	%xmm4,%xmm0
1233
1234	movdqa	%xmm0,%xmm4
1235	movdqa	%xmm0,%xmm3
1236	psllq	$5,%xmm0
1237	pxor	%xmm0,%xmm3
1238	psllq	$1,%xmm0
1239	pxor	%xmm3,%xmm0
1240	psllq	$57,%xmm0
1241	movdqa	%xmm0,%xmm3
1242	pslldq	$8,%xmm0
1243	psrldq	$8,%xmm3
1244	pxor	%xmm4,%xmm0
1245	pxor	%xmm3,%xmm1
1246
1247
1248	movdqa	%xmm0,%xmm4
1249	psrlq	$1,%xmm0
1250	pxor	%xmm4,%xmm1
1251	pxor	%xmm0,%xmm4
1252	psrlq	$5,%xmm0
1253	pxor	%xmm4,%xmm0
1254	psrlq	$1,%xmm0
1255	pxor	%xmm1,%xmm0
1256	testq	%rcx,%rcx
1257	jnz	.Ldone
1258
1259.Lodd_tail:
1260	movdqu	(%rdx),%xmm8
1261.byte	102,69,15,56,0,194
1262	pxor	%xmm8,%xmm0
1263	movdqa	%xmm0,%xmm1
1264	pshufd	$78,%xmm0,%xmm3
1265	pxor	%xmm0,%xmm3
1266.byte	102,15,58,68,194,0
1267.byte	102,15,58,68,202,17
1268.byte	102,15,58,68,223,0
1269	pxor	%xmm0,%xmm3
1270	pxor	%xmm1,%xmm3
1271
1272	movdqa	%xmm3,%xmm4
1273	psrldq	$8,%xmm3
1274	pslldq	$8,%xmm4
1275	pxor	%xmm3,%xmm1
1276	pxor	%xmm4,%xmm0
1277
1278	movdqa	%xmm0,%xmm4
1279	movdqa	%xmm0,%xmm3
1280	psllq	$5,%xmm0
1281	pxor	%xmm0,%xmm3
1282	psllq	$1,%xmm0
1283	pxor	%xmm3,%xmm0
1284	psllq	$57,%xmm0
1285	movdqa	%xmm0,%xmm3
1286	pslldq	$8,%xmm0
1287	psrldq	$8,%xmm3
1288	pxor	%xmm4,%xmm0
1289	pxor	%xmm3,%xmm1
1290
1291
1292	movdqa	%xmm0,%xmm4
1293	psrlq	$1,%xmm0
1294	pxor	%xmm4,%xmm1
1295	pxor	%xmm0,%xmm4
1296	psrlq	$5,%xmm0
1297	pxor	%xmm4,%xmm0
1298	psrlq	$1,%xmm0
1299	pxor	%xmm1,%xmm0
1300.Ldone:
1301.byte	102,65,15,56,0,194
1302	movdqu	%xmm0,(%rdi)
1303	.byte	0xf3,0xc3
1304.cfi_endproc
1305.size	gcm_ghash_clmul,.-gcm_ghash_clmul
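/*
 * void gcm_init_avx(u128 Htable[16], const u64 H[2])
 *
 * AVX key setup: like gcm_init_clmul, but precomputes the powers H^1..H^8
 * (plus the interleaved "hi^lo" Karatsuba halves) consumed by the
 * 8-blocks-per-iteration loop in gcm_ghash_avx.
 */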
1306.globl	gcm_init_avx
1307.type	gcm_init_avx,@function
1308.align	32
1309gcm_init_avx:
1310.cfi_startproc
1311.byte	243,15,30,250
1312	vzeroupper
1313
1314	vmovdqu	(%rsi),%xmm2
1315	vpshufd	$78,%xmm2,%xmm2
1316
1317
1318	vpshufd	$255,%xmm2,%xmm4
1319	vpsrlq	$63,%xmm2,%xmm3
1320	vpsllq	$1,%xmm2,%xmm2
1321	vpxor	%xmm5,%xmm5,%xmm5
1322	vpcmpgtd	%xmm4,%xmm5,%xmm5
1323	vpslldq	$8,%xmm3,%xmm3
1324	vpor	%xmm3,%xmm2,%xmm2
1325
1326
1327	vpand	.L0x1c2_polynomial(%rip),%xmm5,%xmm5
1328	vpxor	%xmm5,%xmm2,%xmm2
1329
1330	vpunpckhqdq	%xmm2,%xmm2,%xmm6
1331	vmovdqa	%xmm2,%xmm0
1332	vpxor	%xmm2,%xmm6,%xmm6
1333	movq	$4,%r10
1334	jmp	.Linit_start_avx
1335.align	32
1336.Linit_loop_avx:
1337	vpalignr	$8,%xmm3,%xmm4,%xmm5
1338	vmovdqu	%xmm5,-16(%rdi)
1339	vpunpckhqdq	%xmm0,%xmm0,%xmm3
1340	vpxor	%xmm0,%xmm3,%xmm3
1341	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
1342	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
1343	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
1344	vpxor	%xmm0,%xmm1,%xmm4
1345	vpxor	%xmm4,%xmm3,%xmm3
1346
1347	vpslldq	$8,%xmm3,%xmm4
1348	vpsrldq	$8,%xmm3,%xmm3
1349	vpxor	%xmm4,%xmm0,%xmm0
1350	vpxor	%xmm3,%xmm1,%xmm1
1351	vpsllq	$57,%xmm0,%xmm3
1352	vpsllq	$62,%xmm0,%xmm4
1353	vpxor	%xmm3,%xmm4,%xmm4
1354	vpsllq	$63,%xmm0,%xmm3
1355	vpxor	%xmm3,%xmm4,%xmm4
1356	vpslldq	$8,%xmm4,%xmm3
1357	vpsrldq	$8,%xmm4,%xmm4
1358	vpxor	%xmm3,%xmm0,%xmm0
1359	vpxor	%xmm4,%xmm1,%xmm1
1360
1361	vpsrlq	$1,%xmm0,%xmm4
1362	vpxor	%xmm0,%xmm1,%xmm1
1363	vpxor	%xmm4,%xmm0,%xmm0
1364	vpsrlq	$5,%xmm4,%xmm4
1365	vpxor	%xmm4,%xmm0,%xmm0
1366	vpsrlq	$1,%xmm0,%xmm0
1367	vpxor	%xmm1,%xmm0,%xmm0
1368.Linit_start_avx:
1369	vmovdqa	%xmm0,%xmm5
1370	vpunpckhqdq	%xmm0,%xmm0,%xmm3
1371	vpxor	%xmm0,%xmm3,%xmm3
1372	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
1373	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
1374	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
1375	vpxor	%xmm0,%xmm1,%xmm4
1376	vpxor	%xmm4,%xmm3,%xmm3
1377
1378	vpslldq	$8,%xmm3,%xmm4
1379	vpsrldq	$8,%xmm3,%xmm3
1380	vpxor	%xmm4,%xmm0,%xmm0
1381	vpxor	%xmm3,%xmm1,%xmm1
1382	vpsllq	$57,%xmm0,%xmm3
1383	vpsllq	$62,%xmm0,%xmm4
1384	vpxor	%xmm3,%xmm4,%xmm4
1385	vpsllq	$63,%xmm0,%xmm3
1386	vpxor	%xmm3,%xmm4,%xmm4
1387	vpslldq	$8,%xmm4,%xmm3
1388	vpsrldq	$8,%xmm4,%xmm4
1389	vpxor	%xmm3,%xmm0,%xmm0
1390	vpxor	%xmm4,%xmm1,%xmm1
1391
1392	vpsrlq	$1,%xmm0,%xmm4
1393	vpxor	%xmm0,%xmm1,%xmm1
1394	vpxor	%xmm4,%xmm0,%xmm0
1395	vpsrlq	$5,%xmm4,%xmm4
1396	vpxor	%xmm4,%xmm0,%xmm0
1397	vpsrlq	$1,%xmm0,%xmm0
1398	vpxor	%xmm1,%xmm0,%xmm0
1399	vpshufd	$78,%xmm5,%xmm3
1400	vpshufd	$78,%xmm0,%xmm4
1401	vpxor	%xmm5,%xmm3,%xmm3
1402	vmovdqu	%xmm5,0(%rdi)
1403	vpxor	%xmm0,%xmm4,%xmm4
1404	vmovdqu	%xmm0,16(%rdi)
1405	leaq	48(%rdi),%rdi
1406	subq	$1,%r10
1407	jnz	.Linit_loop_avx
1408
1409	vpalignr	$8,%xmm4,%xmm3,%xmm5
1410	vmovdqu	%xmm5,-16(%rdi)
1411
1412	vzeroupper
1413	.byte	0xf3,0xc3
1414.cfi_endproc
1415.size	gcm_init_avx,.-gcm_init_avx
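/* gcm_gmult_avx is an alias for the PCLMULQDQ path: it tail-jumps to .L_gmult_clmul. */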
1416.globl	gcm_gmult_avx
1417.type	gcm_gmult_avx,@function
1418.align	32
1419gcm_gmult_avx:
1420.cfi_startproc
1421.byte	243,15,30,250
1422	jmp	.L_gmult_clmul
1423.cfi_endproc
1424.size	gcm_gmult_avx,.-gcm_gmult_avx
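/*
 * void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16],
 *                    const u8 *inp, size_t len)
 *
 * AVX bulk hashing: inputs of 128 bytes or more are processed eight blocks
 * per iteration of .Loop8x_avx using the H^1..H^8 powers prepared by
 * gcm_init_avx; remaining blocks are handled by .Lshort_avx/.Ltail_avx.
 */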
1425.globl	gcm_ghash_avx
1426.type	gcm_ghash_avx,@function
1427.align	32
1428gcm_ghash_avx:
1429.cfi_startproc
1430.byte	243,15,30,250
1431	vzeroupper
1432
1433	vmovdqu	(%rdi),%xmm10
1434	leaq	.L0x1c2_polynomial(%rip),%r10
1435	leaq	64(%rsi),%rsi
1436	vmovdqu	.Lbswap_mask(%rip),%xmm13
1437	vpshufb	%xmm13,%xmm10,%xmm10
1438	cmpq	$0x80,%rcx
1439	jb	.Lshort_avx
1440	subq	$0x80,%rcx
1441
1442	vmovdqu	112(%rdx),%xmm14
1443	vmovdqu	0-64(%rsi),%xmm6
1444	vpshufb	%xmm13,%xmm14,%xmm14
1445	vmovdqu	32-64(%rsi),%xmm7
1446
1447	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1448	vmovdqu	96(%rdx),%xmm15
1449	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1450	vpxor	%xmm14,%xmm9,%xmm9
1451	vpshufb	%xmm13,%xmm15,%xmm15
1452	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1453	vmovdqu	16-64(%rsi),%xmm6
1454	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1455	vmovdqu	80(%rdx),%xmm14
1456	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1457	vpxor	%xmm15,%xmm8,%xmm8
1458
1459	vpshufb	%xmm13,%xmm14,%xmm14
1460	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1461	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1462	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1463	vmovdqu	48-64(%rsi),%xmm6
1464	vpxor	%xmm14,%xmm9,%xmm9
1465	vmovdqu	64(%rdx),%xmm15
1466	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1467	vmovdqu	80-64(%rsi),%xmm7
1468
1469	vpshufb	%xmm13,%xmm15,%xmm15
1470	vpxor	%xmm0,%xmm3,%xmm3
1471	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1472	vpxor	%xmm1,%xmm4,%xmm4
1473	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1474	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1475	vmovdqu	64-64(%rsi),%xmm6
1476	vpxor	%xmm2,%xmm5,%xmm5
1477	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1478	vpxor	%xmm15,%xmm8,%xmm8
1479
1480	vmovdqu	48(%rdx),%xmm14
1481	vpxor	%xmm3,%xmm0,%xmm0
1482	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1483	vpxor	%xmm4,%xmm1,%xmm1
1484	vpshufb	%xmm13,%xmm14,%xmm14
1485	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1486	vmovdqu	96-64(%rsi),%xmm6
1487	vpxor	%xmm5,%xmm2,%xmm2
1488	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1489	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1490	vmovdqu	128-64(%rsi),%xmm7
1491	vpxor	%xmm14,%xmm9,%xmm9
1492
1493	vmovdqu	32(%rdx),%xmm15
1494	vpxor	%xmm0,%xmm3,%xmm3
1495	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1496	vpxor	%xmm1,%xmm4,%xmm4
1497	vpshufb	%xmm13,%xmm15,%xmm15
1498	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1499	vmovdqu	112-64(%rsi),%xmm6
1500	vpxor	%xmm2,%xmm5,%xmm5
1501	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1502	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1503	vpxor	%xmm15,%xmm8,%xmm8
1504
1505	vmovdqu	16(%rdx),%xmm14
1506	vpxor	%xmm3,%xmm0,%xmm0
1507	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1508	vpxor	%xmm4,%xmm1,%xmm1
1509	vpshufb	%xmm13,%xmm14,%xmm14
1510	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1511	vmovdqu	144-64(%rsi),%xmm6
1512	vpxor	%xmm5,%xmm2,%xmm2
1513	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1514	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1515	vmovdqu	176-64(%rsi),%xmm7
1516	vpxor	%xmm14,%xmm9,%xmm9
1517
1518	vmovdqu	(%rdx),%xmm15
1519	vpxor	%xmm0,%xmm3,%xmm3
1520	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1521	vpxor	%xmm1,%xmm4,%xmm4
1522	vpshufb	%xmm13,%xmm15,%xmm15
1523	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1524	vmovdqu	160-64(%rsi),%xmm6
1525	vpxor	%xmm2,%xmm5,%xmm5
1526	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
1527
1528	leaq	128(%rdx),%rdx
1529	cmpq	$0x80,%rcx
1530	jb	.Ltail_avx
1531
1532	vpxor	%xmm10,%xmm15,%xmm15
1533	subq	$0x80,%rcx
1534	jmp	.Loop8x_avx
1535
1536.align	32
1537.Loop8x_avx:
1538	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1539	vmovdqu	112(%rdx),%xmm14
1540	vpxor	%xmm0,%xmm3,%xmm3
1541	vpxor	%xmm15,%xmm8,%xmm8
1542	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm10
1543	vpshufb	%xmm13,%xmm14,%xmm14
1544	vpxor	%xmm1,%xmm4,%xmm4
1545	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm11
1546	vmovdqu	0-64(%rsi),%xmm6
1547	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1548	vpxor	%xmm2,%xmm5,%xmm5
1549	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm12
1550	vmovdqu	32-64(%rsi),%xmm7
1551	vpxor	%xmm14,%xmm9,%xmm9
1552
1553	vmovdqu	96(%rdx),%xmm15
1554	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1555	vpxor	%xmm3,%xmm10,%xmm10
1556	vpshufb	%xmm13,%xmm15,%xmm15
1557	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1558	vxorps	%xmm4,%xmm11,%xmm11
1559	vmovdqu	16-64(%rsi),%xmm6
1560	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1561	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1562	vpxor	%xmm5,%xmm12,%xmm12
1563	vxorps	%xmm15,%xmm8,%xmm8
1564
1565	vmovdqu	80(%rdx),%xmm14
1566	vpxor	%xmm10,%xmm12,%xmm12
1567	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1568	vpxor	%xmm11,%xmm12,%xmm12
1569	vpslldq	$8,%xmm12,%xmm9
1570	vpxor	%xmm0,%xmm3,%xmm3
1571	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1572	vpsrldq	$8,%xmm12,%xmm12
1573	vpxor	%xmm9,%xmm10,%xmm10
1574	vmovdqu	48-64(%rsi),%xmm6
1575	vpshufb	%xmm13,%xmm14,%xmm14
1576	vxorps	%xmm12,%xmm11,%xmm11
1577	vpxor	%xmm1,%xmm4,%xmm4
1578	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1579	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1580	vmovdqu	80-64(%rsi),%xmm7
1581	vpxor	%xmm14,%xmm9,%xmm9
1582	vpxor	%xmm2,%xmm5,%xmm5
1583
1584	vmovdqu	64(%rdx),%xmm15
1585	vpalignr	$8,%xmm10,%xmm10,%xmm12
1586	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1587	vpshufb	%xmm13,%xmm15,%xmm15
1588	vpxor	%xmm3,%xmm0,%xmm0
1589	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1590	vmovdqu	64-64(%rsi),%xmm6
1591	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1592	vpxor	%xmm4,%xmm1,%xmm1
1593	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1594	vxorps	%xmm15,%xmm8,%xmm8
1595	vpxor	%xmm5,%xmm2,%xmm2
1596
1597	vmovdqu	48(%rdx),%xmm14
1598	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
1599	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1600	vpshufb	%xmm13,%xmm14,%xmm14
1601	vpxor	%xmm0,%xmm3,%xmm3
1602	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1603	vmovdqu	96-64(%rsi),%xmm6
1604	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1605	vpxor	%xmm1,%xmm4,%xmm4
1606	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1607	vmovdqu	128-64(%rsi),%xmm7
1608	vpxor	%xmm14,%xmm9,%xmm9
1609	vpxor	%xmm2,%xmm5,%xmm5
1610
1611	vmovdqu	32(%rdx),%xmm15
1612	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1613	vpshufb	%xmm13,%xmm15,%xmm15
1614	vpxor	%xmm3,%xmm0,%xmm0
1615	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1616	vmovdqu	112-64(%rsi),%xmm6
1617	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1618	vpxor	%xmm4,%xmm1,%xmm1
1619	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1620	vpxor	%xmm15,%xmm8,%xmm8
1621	vpxor	%xmm5,%xmm2,%xmm2
1622	vxorps	%xmm12,%xmm10,%xmm10
1623
1624	vmovdqu	16(%rdx),%xmm14
1625	vpalignr	$8,%xmm10,%xmm10,%xmm12
1626	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1627	vpshufb	%xmm13,%xmm14,%xmm14
1628	vpxor	%xmm0,%xmm3,%xmm3
1629	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1630	vmovdqu	144-64(%rsi),%xmm6
1631	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
1632	vxorps	%xmm11,%xmm12,%xmm12
1633	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1634	vpxor	%xmm1,%xmm4,%xmm4
1635	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1636	vmovdqu	176-64(%rsi),%xmm7
1637	vpxor	%xmm14,%xmm9,%xmm9
1638	vpxor	%xmm2,%xmm5,%xmm5
1639
1640	vmovdqu	(%rdx),%xmm15
1641	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1642	vpshufb	%xmm13,%xmm15,%xmm15
1643	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1644	vmovdqu	160-64(%rsi),%xmm6
1645	vpxor	%xmm12,%xmm15,%xmm15
1646	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
1647	vpxor	%xmm10,%xmm15,%xmm15
1648
1649	leaq	128(%rdx),%rdx
1650	subq	$0x80,%rcx
1651	jnc	.Loop8x_avx
1652
1653	addq	$0x80,%rcx
1654	jmp	.Ltail_no_xor_avx
1655
1656.align	32
1657.Lshort_avx:
1658	vmovdqu	-16(%rdx,%rcx,1),%xmm14
1659	leaq	(%rdx,%rcx,1),%rdx
1660	vmovdqu	0-64(%rsi),%xmm6
1661	vmovdqu	32-64(%rsi),%xmm7
1662	vpshufb	%xmm13,%xmm14,%xmm15
1663
1664	vmovdqa	%xmm0,%xmm3
1665	vmovdqa	%xmm1,%xmm4
1666	vmovdqa	%xmm2,%xmm5
1667	subq	$0x10,%rcx
1668	jz	.Ltail_avx
1669
1670	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1671	vpxor	%xmm0,%xmm3,%xmm3
1672	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1673	vpxor	%xmm15,%xmm8,%xmm8
1674	vmovdqu	-32(%rdx),%xmm14
1675	vpxor	%xmm1,%xmm4,%xmm4
1676	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1677	vmovdqu	16-64(%rsi),%xmm6
1678	vpshufb	%xmm13,%xmm14,%xmm15
1679	vpxor	%xmm2,%xmm5,%xmm5
1680	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1681	vpsrldq	$8,%xmm7,%xmm7
1682	subq	$0x10,%rcx
1683	jz	.Ltail_avx
1684
1685	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1686	vpxor	%xmm0,%xmm3,%xmm3
1687	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1688	vpxor	%xmm15,%xmm8,%xmm8
1689	vmovdqu	-48(%rdx),%xmm14
1690	vpxor	%xmm1,%xmm4,%xmm4
1691	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1692	vmovdqu	48-64(%rsi),%xmm6
1693	vpshufb	%xmm13,%xmm14,%xmm15
1694	vpxor	%xmm2,%xmm5,%xmm5
1695	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1696	vmovdqu	80-64(%rsi),%xmm7
1697	subq	$0x10,%rcx
1698	jz	.Ltail_avx
1699
1700	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1701	vpxor	%xmm0,%xmm3,%xmm3
1702	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1703	vpxor	%xmm15,%xmm8,%xmm8
1704	vmovdqu	-64(%rdx),%xmm14
1705	vpxor	%xmm1,%xmm4,%xmm4
1706	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1707	vmovdqu	64-64(%rsi),%xmm6
1708	vpshufb	%xmm13,%xmm14,%xmm15
1709	vpxor	%xmm2,%xmm5,%xmm5
1710	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1711	vpsrldq	$8,%xmm7,%xmm7
1712	subq	$0x10,%rcx
1713	jz	.Ltail_avx
1714
1715	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1716	vpxor	%xmm0,%xmm3,%xmm3
1717	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1718	vpxor	%xmm15,%xmm8,%xmm8
1719	vmovdqu	-80(%rdx),%xmm14
1720	vpxor	%xmm1,%xmm4,%xmm4
1721	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1722	vmovdqu	96-64(%rsi),%xmm6
1723	vpshufb	%xmm13,%xmm14,%xmm15
1724	vpxor	%xmm2,%xmm5,%xmm5
1725	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1726	vmovdqu	128-64(%rsi),%xmm7
1727	subq	$0x10,%rcx
1728	jz	.Ltail_avx
1729
1730	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1731	vpxor	%xmm0,%xmm3,%xmm3
1732	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1733	vpxor	%xmm15,%xmm8,%xmm8
1734	vmovdqu	-96(%rdx),%xmm14
1735	vpxor	%xmm1,%xmm4,%xmm4
1736	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1737	vmovdqu	112-64(%rsi),%xmm6
1738	vpshufb	%xmm13,%xmm14,%xmm15
1739	vpxor	%xmm2,%xmm5,%xmm5
1740	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1741	vpsrldq	$8,%xmm7,%xmm7
1742	subq	$0x10,%rcx
1743	jz	.Ltail_avx
1744
1745	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1746	vpxor	%xmm0,%xmm3,%xmm3
1747	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1748	vpxor	%xmm15,%xmm8,%xmm8
1749	vmovdqu	-112(%rdx),%xmm14
1750	vpxor	%xmm1,%xmm4,%xmm4
1751	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1752	vmovdqu	144-64(%rsi),%xmm6
1753	vpshufb	%xmm13,%xmm14,%xmm15
1754	vpxor	%xmm2,%xmm5,%xmm5
1755	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1756	vmovq	184-64(%rsi),%xmm7
1757	subq	$0x10,%rcx
1758	jmp	.Ltail_avx
1759
1760.align	32
1761.Ltail_avx:
1762	vpxor	%xmm10,%xmm15,%xmm15
1763.Ltail_no_xor_avx:
1764	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1765	vpxor	%xmm0,%xmm3,%xmm3
1766	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1767	vpxor	%xmm15,%xmm8,%xmm8
1768	vpxor	%xmm1,%xmm4,%xmm4
1769	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1770	vpxor	%xmm2,%xmm5,%xmm5
1771	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1772
1773	vmovdqu	(%r10),%xmm12
1774
1775	vpxor	%xmm0,%xmm3,%xmm10
1776	vpxor	%xmm1,%xmm4,%xmm11
1777	vpxor	%xmm2,%xmm5,%xmm5
1778
1779	vpxor	%xmm10,%xmm5,%xmm5
1780	vpxor	%xmm11,%xmm5,%xmm5
1781	vpslldq	$8,%xmm5,%xmm9
1782	vpsrldq	$8,%xmm5,%xmm5
1783	vpxor	%xmm9,%xmm10,%xmm10
1784	vpxor	%xmm5,%xmm11,%xmm11
1785
1786	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
1787	vpalignr	$8,%xmm10,%xmm10,%xmm10
1788	vpxor	%xmm9,%xmm10,%xmm10
1789
1790	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
1791	vpalignr	$8,%xmm10,%xmm10,%xmm10
1792	vpxor	%xmm11,%xmm10,%xmm10
1793	vpxor	%xmm9,%xmm10,%xmm10
1794
1795	cmpq	$0,%rcx
1796	jne	.Lshort_avx
1797
1798	vpshufb	%xmm13,%xmm10,%xmm10
1799	vmovdqu	%xmm10,(%rdi)
1800	vzeroupper
1801	.byte	0xf3,0xc3
1802.cfi_endproc
1803.size	gcm_ghash_avx,.-gcm_ghash_avx
1804.section	.rodata
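/*
 * Constants: .Lbswap_mask reverses byte order between memory and the
 * bit-reflected GHASH representation; .L0x1c2_polynomial encodes the
 * reduction polynomial (0xc2 in the top byte); .L7_mask feeds the CLMUL
 * 4-block loop; .Lrem_4bit and .Lrem_8bit are the reduction tables for the
 * 4-bit and 8-bit table-driven code.
 */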
1805.align	64
1806.Lbswap_mask:
1807.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
1808.L0x1c2_polynomial:
1809.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
1810.L7_mask:
1811.long	7,0,7,0
1812.L7_mask_poly:
1813.long	7,0,450,0
1814.align	64
1815.type	.Lrem_4bit,@object
1816.Lrem_4bit:
1817.long	0,0,0,471859200,0,943718400,0,610271232
1818.long	0,1887436800,0,1822425088,0,1220542464,0,1423966208
1819.long	0,3774873600,0,4246732800,0,3644850176,0,3311403008
1820.long	0,2441084928,0,2376073216,0,2847932416,0,3051356160
1821.type	.Lrem_8bit,@object
1822.Lrem_8bit:
1823.value	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
1824.value	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
1825.value	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
1826.value	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
1827.value	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
1828.value	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
1829.value	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
1830.value	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
1831.value	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
1832.value	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
1833.value	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
1834.value	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
1835.value	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
1836.value	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
1837.value	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
1838.value	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
1839.value	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
1840.value	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
1841.value	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
1842.value	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
1843.value	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
1844.value	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
1845.value	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
1846.value	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
1847.value	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
1848.value	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
1849.value	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
1850.value	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
1851.value	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
1852.value	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
1853.value	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
1854.value	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
1855
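/* The .byte string below is the ASCII identification
   "GHASH for x86_64, CRYPTOGAMS by <appro@openssl.org>". */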
1856.byte	71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1857.align	64
1858.previous
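/*
 * GNU property note: marks the object as Intel CET compatible
 * (0xc0000002 is GNU_PROPERTY_X86_FEATURE_1_AND; the value 3 sets the IBT
 * and SHSTK bits), matching the endbr64 markers at each entry point.
 */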
1859	.section ".note.gnu.property", "a"
1860	.p2align 3
1861	.long 1f - 0f
1862	.long 4f - 1f
1863	.long 5
18640:
1865	# "GNU" encoded with .byte, since .asciz isn't supported
1866	# on Solaris.
1867	.byte 0x47
1868	.byte 0x4e
1869	.byte 0x55
1870	.byte 0
18711:
1872	.p2align 3
1873	.long 0xc0000002
1874	.long 3f - 2f
18752:
1876	.long 3
18773:
1878	.p2align 3
18794:
1880