/* xref: /freebsd/sys/crypto/openssl/amd64/ghash-x86_64.S (revision 9729f076e4d93c5a37e78d427bfe0f1ab99bbcc6) */
/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from ghash-x86_64.pl. */
.text


/*
 * gcm_gmult_4bit
 *
 * ABI:  System V AMD64, AT&T syntax.
 * In:   %rdi = 16-byte block. It is read one byte at a time, from offset
 *              15 down to offset 0, and is overwritten with the result.
 *       %rsi = table of 16-byte entries, indexed by (nibble << 4).
 * Out:  the two result quadwords are byte-swapped and stored back at
 *       8(%rdi) and 0(%rdi).
 * Uses: .Lrem_4bit (8-byte entries, indexed by the 4 bits shifted out of
 *       the accumulator on every step).
 *       presumably: void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
 *       -- confirm against the C prototype.
 *
 * Register roles inside the loop:
 *       %r8  = accumulator, quadword loaded from offset 8 of a table entry
 *       %r9  = accumulator, quadword loaded from offset 0 of a table entry
 *       %r10 = bits carried from %r9 into the top of %r8 on each 4-bit shift
 *       %rdx = low 4 bits of %r8 before the shift (index into .Lrem_4bit)
 *       %rax = (low nibble of current byte) << 4
 *       %rbx = (high nibble of current byte) << 4, i.e. byte & 0xf0
 *       %rcx = offset of the next byte to fetch, counts 14 .. 0
 *
 * Six registers are pushed and 280 bytes of stack are reserved, but the
 * body writes only %rbx among the callee-saved registers, so the epilogue
 * reloads only %rbx.
 */
.globl	gcm_gmult_4bit
.type	gcm_gmult_4bit,@function
.align	16
gcm_gmult_4bit:
.cfi_startproc
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
	subq	$280,%rsp
.cfi_adjust_cfa_offset	280
.Lgmult_prologue:

	/* Seed the accumulator from the table entry for the low nibble of byte 15. */
	movzbq	15(%rdi),%r8
	leaq	.Lrem_4bit(%rip),%r11
	xorq	%rax,%rax
	xorq	%rbx,%rbx
	movb	%r8b,%al
	movb	%r8b,%bl
	shlb	$4,%al
	movq	$14,%rcx
	movq	8(%rsi,%rax,1),%r8
	movq	(%rsi,%rax,1),%r9
	andb	$0xf0,%bl
	movq	%r8,%rdx
	jmp	.Loop1

.align	16
.Loop1:
	/* Step for the high nibble of the previous byte; fetch the next byte. */
	shrq	$4,%r8
	andq	$0xf,%rdx
	movq	%r9,%r10
	movb	(%rdi,%rcx,1),%al
	shrq	$4,%r9
	xorq	8(%rsi,%rbx,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rbx,1),%r9
	movb	%al,%bl
	xorq	(%r11,%rdx,8),%r9
	movq	%r8,%rdx
	shlb	$4,%al
	xorq	%r10,%r8
	decq	%rcx
	js	.Lbreak1

	/* Step for the low nibble of the byte just fetched. */
	shrq	$4,%r8
	andq	$0xf,%rdx
	movq	%r9,%r10
	shrq	$4,%r9
	xorq	8(%rsi,%rax,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rax,1),%r9
	andb	$0xf0,%bl
	xorq	(%r11,%rdx,8),%r9
	movq	%r8,%rdx
	xorq	%r10,%r8
	jmp	.Loop1

.align	16
.Lbreak1:
	/* Byte 0 has been fetched: finish its low nibble, then its high nibble. */
	shrq	$4,%r8
	andq	$0xf,%rdx
	movq	%r9,%r10
	shrq	$4,%r9
	xorq	8(%rsi,%rax,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rax,1),%r9
	andb	$0xf0,%bl
	xorq	(%r11,%rdx,8),%r9
	movq	%r8,%rdx
	xorq	%r10,%r8

	shrq	$4,%r8
	andq	$0xf,%rdx
	movq	%r9,%r10
	shrq	$4,%r9
	xorq	8(%rsi,%rbx,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rbx,1),%r9
	xorq	%r10,%r8
	xorq	(%r11,%rdx,8),%r9

	/* Byte-swap both halves and write the result over the input block. */
	bswapq	%r8
	bswapq	%r9
	movq	%r8,8(%rdi)
	movq	%r9,(%rdi)

	/* %rsi = stack pointer as it was before the six pushes (280 + 6*8). */
	leaq	280+48(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lgmult_epilogue:
	/* 0xf3,0xc3 encodes "rep ret". */
	.byte	0xf3,0xc3
.cfi_endproc
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
/*
 * gcm_ghash_4bit
 *
 * ABI:  System V AMD64, AT&T syntax.
 * In:   %rdi = 16-byte state block (loaded at entry, used as scratch inside
 *              the loop, final value stored on exit)
 *       %rsi = table of sixteen 16-byte entries
 *       %rdx = input pointer (kept in %r14)
 *       %rcx = input length in bytes (%r15 = %r14 + %rcx = end pointer)
 * Out:  state block at (%rdi) updated.
 * Uses: .Lrem_8bit (2-byte entries indexed by one byte).
 *       presumably: void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
 *                                       const u8 *inp, size_t len)
 *       -- confirm against the C prototype.
 *
 * The loop consumes 16 input bytes per iteration and its condition
 * (%r14 < %r15, unsigned) is tested at the bottom, so the body always runs
 * at least once. Assumes callers pass a non-zero multiple of 16 -- TODO
 * confirm.
 *
 * Stack frame (280 bytes reserved, 272 used):
 *       0(%rsp) ..  15(%rsp)  one byte per table entry: the 4 bits shifted
 *                             out of the entry, placed in the high nibble
 *      16(%rsp) .. 143(%rsp)  = -128(%rbp): offset-8 quadword of each entry,
 *                             shifted right by 4 with the low nibble of the
 *                             offset-0 quadword moved into its top 4 bits
 *     144(%rsp) .. 271(%rsp)  =    0(%rbp): offset-0 quadword of each entry,
 *                             shifted right by 4
 *
 * All six callee-saved registers pushed here are reloaded in the epilogue.
 */
.globl	gcm_ghash_4bit
.type	gcm_ghash_4bit,@function
.align	16
gcm_ghash_4bit:
.cfi_startproc
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
	subq	$280,%rsp
.cfi_adjust_cfa_offset	280
.Lghash_prologue:
	movq	%rdx,%r14
	movq	%rcx,%r15
	/*
	 * %rsi is biased by +128 while the shifted table is built, so every
	 * displacement below lies in [-128, 120]. The bias is removed again
	 * before the main loop.
	 */
	subq	$-128,%rsi
	leaq	16+128(%rsp),%rbp
	xorl	%edx,%edx
	movq	0+0-128(%rsi),%r8
	movq	0+8-128(%rsi),%rax
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	16+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	16+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,0(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,0(%rbp)
	movq	32+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,0-128(%rbp)
	movq	32+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,1(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,8(%rbp)
	movq	48+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,8-128(%rbp)
	movq	48+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,2(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,16(%rbp)
	movq	64+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,16-128(%rbp)
	movq	64+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,3(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,24(%rbp)
	movq	80+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,24-128(%rbp)
	movq	80+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,4(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,32(%rbp)
	movq	96+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,32-128(%rbp)
	movq	96+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,5(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,40(%rbp)
	movq	112+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,40-128(%rbp)
	movq	112+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,6(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,48(%rbp)
	movq	128+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,48-128(%rbp)
	movq	128+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,7(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,56(%rbp)
	movq	144+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,56-128(%rbp)
	movq	144+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,8(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,64(%rbp)
	movq	160+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,64-128(%rbp)
	movq	160+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,9(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,72(%rbp)
	movq	176+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,72-128(%rbp)
	movq	176+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,10(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,80(%rbp)
	movq	192+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,80-128(%rbp)
	movq	192+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,11(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,88(%rbp)
	movq	208+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,88-128(%rbp)
	movq	208+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,12(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,96(%rbp)
	movq	224+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,96-128(%rbp)
	movq	224+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,13(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,104(%rbp)
	movq	240+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,104-128(%rbp)
	movq	240+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,14(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,112(%rbp)
	shlb	$4,%dl
	movq	%rax,112-128(%rbp)
	shlq	$60,%r10
	movb	%dl,15(%rsp)
	orq	%r10,%rbx
	movq	%r9,120(%rbp)
	movq	%rbx,120-128(%rbp)
	/* Undo the +128 bias; load the current state; %r15 becomes the end pointer. */
	addq	$-128,%rsi
	movq	8(%rdi),%r8
	movq	0(%rdi),%r9
	addq	%r14,%r15
	leaq	.Lrem_8bit(%rip),%r11
	jmp	.Louter_loop
.align	16
.Louter_loop:
	/*
	 * XOR the next 16 input bytes into the state and park the result at
	 * (%rdi); the loop body re-reads it from there 32 bits at a time.
	 * %rdx starts with bytes 12..15; each "roll $8,%edx" brings the next
	 * byte (highest offset first) into %dl.
	 */
	xorq	(%r14),%r9
	movq	8(%r14),%rdx
	leaq	16(%r14),%r14
	xorq	%r8,%rdx
	movq	%r9,(%rdi)
	movq	%rdx,8(%rdi)
	shrq	$32,%rdx
	xorq	%rax,%rax
	roll	$8,%edx
	movb	%dl,%al
	movzbl	%dl,%ebx
	shlb	$4,%al
	shrl	$4,%ebx
	roll	$8,%edx
	movq	8(%rsi,%rax,1),%r8
	movq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	xorq	%r8,%r12
	movq	%r9,%r10
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	/* Bytes 12..15 are consumed: reload bytes 8..11 of the parked state. */
	movl	8(%rdi),%edx
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	/* Reload bytes 4..7 of the parked state. */
	movl	4(%rdi),%edx
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	/* Reload bytes 0..3 of the parked state. */
	movl	0(%rdi),%edx
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	/* Last byte: keep its high nibble in place (byte & 0xf0) for a plain 4-bit step. */
	andl	$240,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	/*
	 * NOTE(review): this load reads the 4 bytes just below the state block
	 * and its result is never used: %rdx is not read again before it is
	 * overwritten at the top of .Louter_loop or the function returns.
	 * It looks like a generator artifact; it is kept here so the file
	 * stays in step with its generator -- confirm before removing.
	 */
	movl	-4(%rdi),%edx
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	movzwq	(%r11,%r12,2),%r12
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	shlq	$48,%r12
	xorq	%r10,%r8
	xorq	%r12,%r9
	movzbq	%r8b,%r13
	shrq	$4,%r8
	movq	%r9,%r10
	shlb	$4,%r13b
	shrq	$4,%r9
	xorq	8(%rsi,%rcx,1),%r8
	movzwq	(%r11,%r13,2),%r13
	shlq	$60,%r10
	xorq	(%rsi,%rcx,1),%r9
	xorq	%r10,%r8
	shlq	$48,%r13
	bswapq	%r8
	xorq	%r13,%r9
	bswapq	%r9
	/* Another 16-byte block remains while %r14 is below the end pointer. */
	cmpq	%r15,%r14
	jb	.Louter_loop
	movq	%r8,8(%rdi)
	movq	%r9,(%rdi)

	/* %rsi = stack pointer as it was before the six pushes (280 + 6*8). */
	leaq	280+48(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	0(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lghash_epilogue:
	/* 0xf3,0xc3 encodes "rep ret". */
	.byte	0xf3,0xc3
.cfi_endproc
.size	gcm_ghash_4bit,.-gcm_ghash_4bit
/*
 * gcm_init_clmul
 *
 * ABI:  System V AMD64, AT&T syntax. Leaf function, no stack use.
 * In:   %rsi = 16-byte source value (unaligned load)
 *       %rdi = output table; six 16-byte entries are written at offsets
 *              0, 16, 32, 48, 64 and 80 (unaligned stores)
 * Uses: .L0x1c2_polynomial; %xmm0-%xmm6.
 *       presumably: void gcm_init_clmul(u128 Htable[16], const u64 H[2])
 *       -- confirm against the C prototype.
 *
 * Steps:
 *   1. Swap the two quadwords of the source, shift the 128-bit value left
 *      by one bit, and XOR in .L0x1c2_polynomial when the bit shifted out
 *      was set. The result stays in %xmm2 and is stored at offset 0.
 *   2. Multiply by %xmm2 three times in a row, each time with three
 *      carry-less multiplies (low*low, high*high, folded*folded) followed
 *      by the shift/XOR reduction sequence. The three results are stored
 *      at offsets 16, 48 and 64.
 *   3. Offsets 32 and 80 each receive one packed register holding the
 *      (low XOR high) quadwords of the two entries stored just before.
 *
 * The ".byte 102,15,58,68,..." lines are pre-encoded PCLMULQDQ
 * instructions and ".byte 102,15,58,15,..." is PALIGNR; each one is
 * decoded in the comment next to it.
 */
.globl	gcm_init_clmul
.type	gcm_init_clmul,@function
.align	16
gcm_init_clmul:
.cfi_startproc
.L_init_clmul:
	movdqu	(%rsi),%xmm2
	pshufd	$78,%xmm2,%xmm2

	/* 128-bit shift left by 1; %xmm5 = all-ones mask if the top bit was set. */
	pshufd	$255,%xmm2,%xmm4
	movdqa	%xmm2,%xmm3
	psllq	$1,%xmm2
	pxor	%xmm5,%xmm5
	psrlq	$63,%xmm3
	pcmpgtd	%xmm4,%xmm5
	pslldq	$8,%xmm3
	por	%xmm3,%xmm2

	/* Conditionally fold in the polynomial constant. */
	pand	.L0x1c2_polynomial(%rip),%xmm5
	pxor	%xmm5,%xmm2

	/* %xmm6 = low XOR high quadword of %xmm2 (in both halves). */
	pshufd	$78,%xmm2,%xmm6
	movdqa	%xmm2,%xmm0
	pxor	%xmm2,%xmm6
	/* First product: %xmm0 = %xmm0 * %xmm2. */
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0	/* pclmulqdq $0x00,%xmm2,%xmm0 */
.byte	102,15,58,68,202,17	/* pclmulqdq $0x11,%xmm2,%xmm1 */
.byte	102,15,58,68,222,0	/* pclmulqdq $0x00,%xmm6,%xmm3 */
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	/* Reduction, first phase. */
	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1

	/* Reduction, second phase. */
	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	/* Store entries 0 and 1, then their packed folded halves as entry 2. */
	pshufd	$78,%xmm2,%xmm3
	pshufd	$78,%xmm0,%xmm4
	pxor	%xmm2,%xmm3
	movdqu	%xmm2,0(%rdi)
	pxor	%xmm0,%xmm4
	movdqu	%xmm0,16(%rdi)
.byte	102,15,58,15,227,8	/* palignr $8,%xmm3,%xmm4 */
	movdqu	%xmm4,32(%rdi)
	/* Second product: %xmm0 = %xmm0 * %xmm2. */
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0	/* pclmulqdq $0x00,%xmm2,%xmm0 */
.byte	102,15,58,68,202,17	/* pclmulqdq $0x11,%xmm2,%xmm1 */
.byte	102,15,58,68,222,0	/* pclmulqdq $0x00,%xmm6,%xmm3 */
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	/* Keep the second product in %xmm5; third product: %xmm0 = %xmm0 * %xmm2. */
	movdqa	%xmm0,%xmm5
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0	/* pclmulqdq $0x00,%xmm2,%xmm0 */
.byte	102,15,58,68,202,17	/* pclmulqdq $0x11,%xmm2,%xmm1 */
.byte	102,15,58,68,222,0	/* pclmulqdq $0x00,%xmm6,%xmm3 */
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	/* Store entries 3 and 4, then their packed folded halves as entry 5. */
	pshufd	$78,%xmm5,%xmm3
	pshufd	$78,%xmm0,%xmm4
	pxor	%xmm5,%xmm3
	movdqu	%xmm5,48(%rdi)
	pxor	%xmm0,%xmm4
	movdqu	%xmm0,64(%rdi)
.byte	102,15,58,15,227,8	/* palignr $8,%xmm3,%xmm4 */
	movdqu	%xmm4,80(%rdi)
	/* 0xf3,0xc3 encodes "rep ret". */
	.byte	0xf3,0xc3
.cfi_endproc
.size	gcm_init_clmul,.-gcm_init_clmul
/*
 * gcm_gmult_clmul
 *
 * ABI:  System V AMD64, AT&T syntax. Leaf function, no stack use.
 * In:   %rdi = 16-byte block, loaded unaligned and overwritten with the
 *              result
 *       %rsi = table; the entries at offsets 0 and 32 are read
 * Uses: .Lbswap_mask (loaded with movdqa, so it must be 16-byte aligned);
 *       %xmm0-%xmm5.
 *       presumably: void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16])
 *       -- confirm against the C prototype.
 *
 * The block is byte-shuffled with .Lbswap_mask, multiplied by the entry at
 * 0(%rsi) using three carry-less multiplies (the third takes its second
 * operand from the entry at 32(%rsi)), reduced with the shift/XOR
 * sequence, shuffled back and stored.
 *
 * The local label .L_gmult_clmul marks the first instruction so that
 * other code in this file can jump straight into the body.
 */
.globl	gcm_gmult_clmul
.type	gcm_gmult_clmul,@function
.align	16
gcm_gmult_clmul:
.cfi_startproc
.L_gmult_clmul:
	movdqu	(%rdi),%xmm0
	movdqa	.Lbswap_mask(%rip),%xmm5
	movdqu	(%rsi),%xmm2
	movdqu	32(%rsi),%xmm4
.byte	102,15,56,0,197		/* pshufb %xmm5,%xmm0 */
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0	/* pclmulqdq $0x00,%xmm2,%xmm0 */
.byte	102,15,58,68,202,17	/* pclmulqdq $0x11,%xmm2,%xmm1 */
.byte	102,15,58,68,220,0	/* pclmulqdq $0x00,%xmm4,%xmm3 */
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	/* Split the middle term across the low (%xmm0) and high (%xmm1) halves. */
	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	/* Reduction, first phase. */
	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1

	/* Reduction, second phase. */
	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
.byte	102,15,56,0,197		/* pshufb %xmm5,%xmm0 */
	movdqu	%xmm0,(%rdi)
	/* 0xf3,0xc3 encodes "rep ret". */
	.byte	0xf3,0xc3
.cfi_endproc
.size	gcm_gmult_clmul,.-gcm_gmult_clmul
/*
 * gcm_ghash_clmul
 *
 * ABI:  System V AMD64, AT&T syntax. Leaf function, no stack use.
 * In:   %rdi = 16-byte state block (loaded at entry, stored at .Ldone)
 *       %rsi = table; entries at offsets 0, 16, 32, 48, 64 and 80 are read
 *       %rdx = input pointer
 *       %rcx = input length in bytes
 * Uses: .Lbswap_mask and .L7_mask (both loaded with movdqa, so they must be
 *       16-byte aligned), OPENSSL_ia32cap_P; %rax, %xmm0-%xmm15.
 *       presumably: void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16],
 *                                        const u8 *inp, size_t len)
 *       -- confirm against the C prototype.
 *
 * Control flow, as driven by %rcx:
 *   - exactly 16 bytes:            .Lodd_tail (one block)
 *   - at least 64 bytes, and the capability word does not veto it:
 *                                  four blocks per pass (.Lmod4_loop,
 *                                  .Ltail4x), then the remainder below
 *   - otherwise:                   two blocks per pass (.Lskip4x,
 *                                  .Lmod_loop, .Leven_tail), then
 *                                  .Lodd_tail if one block is left over
 * There is no check for a zero length: %rcx = 0 wraps at the first subq
 * and blocks are still read. Assumes callers pass a non-zero multiple of
 * 16 -- TODO confirm.
 *
 * Key to the pre-encoded instructions (REX prefix bytes 65/68/69/76 select
 * %xmm8-%xmm15 operands):
 *   102,[REX,]15,58,68,modrm,imm   pclmulqdq $imm,xmm,xmm
 *   102,[REX,]15,56,0,modrm        pshufb    xmm,xmm
 *   102,76,15,110,200              movq      %rax,%xmm9
 */
.globl	gcm_ghash_clmul
.type	gcm_ghash_clmul,@function
.align	32
gcm_ghash_clmul:
.cfi_startproc
.L_ghash_clmul:
	movdqa	.Lbswap_mask(%rip),%xmm10

	movdqu	(%rdi),%xmm0
	movdqu	(%rsi),%xmm2
	movdqu	32(%rsi),%xmm7
.byte	102,65,15,56,0,194	/* pshufb %xmm10,%xmm0 */

	subq	$0x10,%rcx
	jz	.Lodd_tail

	movdqu	16(%rsi),%xmm6
	movl	OPENSSL_ia32cap_P+4(%rip),%eax
	/* Fewer than 0x30 bytes beyond the first block: no four-block pass. */
	cmpq	$0x30,%rcx
	jb	.Lskip4x

	/*
	 * 71303168 = (1<<26)|(1<<22) and 4194304 = 1<<22: skip the four-block
	 * path when bit 22 of this capability word is set and bit 26 is clear.
	 */
	andl	$71303168,%eax
	cmpl	$4194304,%eax
	je	.Lskip4x

	subq	$0x30,%rcx
	movq	$0xA040608020C0E000,%rax
	movdqu	48(%rsi),%xmm14
	movdqu	64(%rsi),%xmm15




	movdqu	48(%rdx),%xmm3
	movdqu	32(%rdx),%xmm11
.byte	102,65,15,56,0,218
.byte	102,69,15,56,0,218
	movdqa	%xmm3,%xmm5
	pshufd	$78,%xmm3,%xmm4
	pxor	%xmm3,%xmm4
.byte	102,15,58,68,218,0
.byte	102,15,58,68,234,17
.byte	102,15,58,68,231,0

	movdqa	%xmm11,%xmm13
	pshufd	$78,%xmm11,%xmm12
	pxor	%xmm11,%xmm12
.byte	102,68,15,58,68,222,0
.byte	102,68,15,58,68,238,17
.byte	102,68,15,58,68,231,16
	xorps	%xmm11,%xmm3
	xorps	%xmm13,%xmm5
	movups	80(%rsi),%xmm7
	xorps	%xmm12,%xmm4

	movdqu	16(%rdx),%xmm11
	movdqu	0(%rdx),%xmm8
.byte	102,69,15,56,0,218
.byte	102,69,15,56,0,194
	movdqa	%xmm11,%xmm13
	pshufd	$78,%xmm11,%xmm12
	pxor	%xmm8,%xmm0
	pxor	%xmm11,%xmm12
.byte	102,69,15,58,68,222,0
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm8
	pxor	%xmm0,%xmm8
.byte	102,69,15,58,68,238,17
.byte	102,68,15,58,68,231,0
	xorps	%xmm11,%xmm3
	xorps	%xmm13,%xmm5

	leaq	64(%rdx),%rdx
	subq	$0x40,%rcx
	jc	.Ltail4x

	jmp	.Lmod4_loop
.align	32
.Lmod4_loop:
	/* Four-block pass: 64 input bytes consumed per iteration. */
.byte	102,65,15,58,68,199,0
	xorps	%xmm12,%xmm4
	movdqu	48(%rdx),%xmm11
.byte	102,69,15,56,0,218
.byte	102,65,15,58,68,207,17
	xorps	%xmm3,%xmm0
	movdqu	32(%rdx),%xmm3
	movdqa	%xmm11,%xmm13
.byte	102,68,15,58,68,199,16
	pshufd	$78,%xmm11,%xmm12
	xorps	%xmm5,%xmm1
	pxor	%xmm11,%xmm12
.byte	102,65,15,56,0,218
	movups	32(%rsi),%xmm7
	xorps	%xmm4,%xmm8
.byte	102,68,15,58,68,218,0
	pshufd	$78,%xmm3,%xmm4

	pxor	%xmm0,%xmm8
	movdqa	%xmm3,%xmm5
	pxor	%xmm1,%xmm8
	pxor	%xmm3,%xmm4
	movdqa	%xmm8,%xmm9
.byte	102,68,15,58,68,234,17
	pslldq	$8,%xmm8
	psrldq	$8,%xmm9
	pxor	%xmm8,%xmm0
	movdqa	.L7_mask(%rip),%xmm8
	pxor	%xmm9,%xmm1
.byte	102,76,15,110,200	/* movq %rax,%xmm9 */

	pand	%xmm0,%xmm8
.byte	102,69,15,56,0,200
	pxor	%xmm0,%xmm9
.byte	102,68,15,58,68,231,0
	psllq	$57,%xmm9
	movdqa	%xmm9,%xmm8
	pslldq	$8,%xmm9
.byte	102,15,58,68,222,0
	psrldq	$8,%xmm8
	pxor	%xmm9,%xmm0
	pxor	%xmm8,%xmm1
	movdqu	0(%rdx),%xmm8

	movdqa	%xmm0,%xmm9
	psrlq	$1,%xmm0
.byte	102,15,58,68,238,17
	xorps	%xmm11,%xmm3
	movdqu	16(%rdx),%xmm11
.byte	102,69,15,56,0,218
.byte	102,15,58,68,231,16
	xorps	%xmm13,%xmm5
	movups	80(%rsi),%xmm7
.byte	102,69,15,56,0,194
	pxor	%xmm9,%xmm1
	pxor	%xmm0,%xmm9
	psrlq	$5,%xmm0

	movdqa	%xmm11,%xmm13
	pxor	%xmm12,%xmm4
	pshufd	$78,%xmm11,%xmm12
	pxor	%xmm9,%xmm0
	pxor	%xmm8,%xmm1
	pxor	%xmm11,%xmm12
.byte	102,69,15,58,68,222,0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	movdqa	%xmm0,%xmm1
.byte	102,69,15,58,68,238,17
	xorps	%xmm11,%xmm3
	pshufd	$78,%xmm0,%xmm8
	pxor	%xmm0,%xmm8

.byte	102,68,15,58,68,231,0
	xorps	%xmm13,%xmm5

	leaq	64(%rdx),%rdx
	subq	$0x40,%rcx
	jnc	.Lmod4_loop

.Ltail4x:
	/* Finish the last four-block group: combine, then reduce. */
.byte	102,65,15,58,68,199,0
.byte	102,65,15,58,68,207,17
.byte	102,68,15,58,68,199,16
	xorps	%xmm12,%xmm4
	xorps	%xmm3,%xmm0
	xorps	%xmm5,%xmm1
	pxor	%xmm0,%xmm1
	pxor	%xmm4,%xmm8

	pxor	%xmm1,%xmm8
	pxor	%xmm0,%xmm1

	movdqa	%xmm8,%xmm9
	psrldq	$8,%xmm8
	pslldq	$8,%xmm9
	pxor	%xmm8,%xmm1
	pxor	%xmm9,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	/* Undo the final over-subtraction; %rcx = bytes still to process. */
	addq	$0x40,%rcx
	jz	.Ldone
	movdqu	32(%rsi),%xmm7
	subq	$0x10,%rcx
	jz	.Lodd_tail
.Lskip4x:
	/* Two-block path. */





	movdqu	(%rdx),%xmm8
	movdqu	16(%rdx),%xmm3
.byte	102,69,15,56,0,194	/* pshufb %xmm10,%xmm8 */
.byte	102,65,15,56,0,218	/* pshufb %xmm10,%xmm3 */
	pxor	%xmm8,%xmm0

	movdqa	%xmm3,%xmm5
	pshufd	$78,%xmm3,%xmm4
	pxor	%xmm3,%xmm4
.byte	102,15,58,68,218,0
.byte	102,15,58,68,234,17
.byte	102,15,58,68,231,0

	leaq	32(%rdx),%rdx
	nop
	subq	$0x20,%rcx
	jbe	.Leven_tail
	nop
	jmp	.Lmod_loop

.align	32
.Lmod_loop:
	/* Two-block pass: 32 input bytes consumed per iteration. */
	movdqa	%xmm0,%xmm1
	movdqa	%xmm4,%xmm8
	pshufd	$78,%xmm0,%xmm4
	pxor	%xmm0,%xmm4

.byte	102,15,58,68,198,0
.byte	102,15,58,68,206,17
.byte	102,15,58,68,231,16

	pxor	%xmm3,%xmm0
	pxor	%xmm5,%xmm1
	movdqu	(%rdx),%xmm9
	pxor	%xmm0,%xmm8
.byte	102,69,15,56,0,202
	movdqu	16(%rdx),%xmm3

	pxor	%xmm1,%xmm8
	pxor	%xmm9,%xmm1
	pxor	%xmm8,%xmm4
.byte	102,65,15,56,0,218
	movdqa	%xmm4,%xmm8
	psrldq	$8,%xmm8
	pslldq	$8,%xmm4
	pxor	%xmm8,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm3,%xmm5

	movdqa	%xmm0,%xmm9
	movdqa	%xmm0,%xmm8
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm8
.byte	102,15,58,68,218,0
	psllq	$1,%xmm0
	pxor	%xmm8,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm8
	pslldq	$8,%xmm0
	psrldq	$8,%xmm8
	pxor	%xmm9,%xmm0
	pshufd	$78,%xmm5,%xmm4
	pxor	%xmm8,%xmm1
	pxor	%xmm5,%xmm4

	movdqa	%xmm0,%xmm9
	psrlq	$1,%xmm0
.byte	102,15,58,68,234,17
	pxor	%xmm9,%xmm1
	pxor	%xmm0,%xmm9
	psrlq	$5,%xmm0
	pxor	%xmm9,%xmm0
	leaq	32(%rdx),%rdx
	psrlq	$1,%xmm0
.byte	102,15,58,68,231,0
	pxor	%xmm1,%xmm0

	subq	$0x20,%rcx
	ja	.Lmod_loop

.Leven_tail:
	/* Finish the last pair of blocks: combine, then reduce. */
	movdqa	%xmm0,%xmm1
	movdqa	%xmm4,%xmm8
	pshufd	$78,%xmm0,%xmm4
	pxor	%xmm0,%xmm4

.byte	102,15,58,68,198,0
.byte	102,15,58,68,206,17
.byte	102,15,58,68,231,16

	pxor	%xmm3,%xmm0
	pxor	%xmm5,%xmm1
	pxor	%xmm0,%xmm8
	pxor	%xmm1,%xmm8
	pxor	%xmm8,%xmm4
	movdqa	%xmm4,%xmm8
	psrldq	$8,%xmm8
	pslldq	$8,%xmm4
	pxor	%xmm8,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	/* %rcx is zero here only when exactly one more block remains. */
	testq	%rcx,%rcx
	jnz	.Ldone

.Lodd_tail:
	/* Single-block step, same sequence as the one-block multiply. */
	movdqu	(%rdx),%xmm8
.byte	102,69,15,56,0,194	/* pshufb %xmm10,%xmm8 */
	pxor	%xmm8,%xmm0
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0	/* pclmulqdq $0x00,%xmm2,%xmm0 */
.byte	102,15,58,68,202,17	/* pclmulqdq $0x11,%xmm2,%xmm1 */
.byte	102,15,58,68,223,0	/* pclmulqdq $0x00,%xmm7,%xmm3 */
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
.Ldone:
	/* Shuffle the state back to memory byte order and store it. */
.byte	102,65,15,56,0,194	/* pshufb %xmm10,%xmm0 */
	movdqu	%xmm0,(%rdi)
	/* 0xf3,0xc3 encodes "rep ret". */
	.byte	0xf3,0xc3
.cfi_endproc
.size	gcm_ghash_clmul,.-gcm_ghash_clmul
/*
 * gcm_init_avx
 *
 * ABI:  System V AMD64, AT&T syntax. Leaf function, no stack use.
 * In:   %rsi = 16-byte source value (unaligned load)
 *       %rdi = output table; 4 passes x 48 bytes = 192 bytes are written
 *              (unaligned stores). %rdi is advanced as the table is filled.
 * Uses: .L0x1c2_polynomial; %r10 (pass counter, 4 .. 1), %xmm0-%xmm6.
 *       presumably: void gcm_init_avx(u128 Htable[16], const u64 H[2])
 *       -- confirm against the C prototype.
 *
 * Setup mirrors the non-AVX init routine: swap the quadwords of the source,
 * shift the 128-bit value left by one bit, and XOR in .L0x1c2_polynomial
 * when the bit shifted out was set (result in %xmm2).
 *
 * Each pass stores two consecutive products by %xmm2 at 0(%rdi) and
 * 16(%rdi) and advances %rdi by 48. The third 16-byte slot of the pass
 * (then at -16(%rdi)) receives the packed (low XOR high) quadwords of the
 * two entries; it is written at the top of the next pass, or after the
 * loop for the last pass.
 *
 * vzeroupper is issued on entry and again before returning.
 */
.globl	gcm_init_avx
.type	gcm_init_avx,@function
.align	32
gcm_init_avx:
.cfi_startproc
	vzeroupper

	vmovdqu	(%rsi),%xmm2
	vpshufd	$78,%xmm2,%xmm2

	/* 128-bit shift left by 1; %xmm5 = all-ones mask if the top bit was set. */
	vpshufd	$255,%xmm2,%xmm4
	vpsrlq	$63,%xmm2,%xmm3
	vpsllq	$1,%xmm2,%xmm2
	vpxor	%xmm5,%xmm5,%xmm5
	vpcmpgtd	%xmm4,%xmm5,%xmm5
	vpslldq	$8,%xmm3,%xmm3
	vpor	%xmm3,%xmm2,%xmm2

	/* Conditionally fold in the polynomial constant. */
	vpand	.L0x1c2_polynomial(%rip),%xmm5,%xmm5
	vpxor	%xmm5,%xmm2,%xmm2

	/* %xmm6 = low XOR high quadword of %xmm2 (in both halves). */
	vpunpckhqdq	%xmm2,%xmm2,%xmm6
	vmovdqa	%xmm2,%xmm0
	vpxor	%xmm2,%xmm6,%xmm6
	movq	$4,%r10
	jmp	.Linit_start_avx
.align	32
.Linit_loop_avx:
	/* Write the packed slot of the previous pass, then multiply once more. */
	vpalignr	$8,%xmm3,%xmm4,%xmm5
	vmovdqu	%xmm5,-16(%rdi)
	vpunpckhqdq	%xmm0,%xmm0,%xmm3
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
	vpxor	%xmm0,%xmm1,%xmm4
	vpxor	%xmm4,%xmm3,%xmm3

	vpslldq	$8,%xmm3,%xmm4
	vpsrldq	$8,%xmm3,%xmm3
	vpxor	%xmm4,%xmm0,%xmm0
	vpxor	%xmm3,%xmm1,%xmm1
	vpsllq	$57,%xmm0,%xmm3
	vpsllq	$62,%xmm0,%xmm4
	vpxor	%xmm3,%xmm4,%xmm4
	vpsllq	$63,%xmm0,%xmm3
	vpxor	%xmm3,%xmm4,%xmm4
	vpslldq	$8,%xmm4,%xmm3
	vpsrldq	$8,%xmm4,%xmm4
	vpxor	%xmm3,%xmm0,%xmm0
	vpxor	%xmm4,%xmm1,%xmm1

	vpsrlq	$1,%xmm0,%xmm4
	vpxor	%xmm0,%xmm1,%xmm1
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$5,%xmm4,%xmm4
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$1,%xmm0,%xmm0
	vpxor	%xmm1,%xmm0,%xmm0
.Linit_start_avx:
	/* %xmm5 keeps the current value; %xmm0 becomes the next product by %xmm2. */
	vmovdqa	%xmm0,%xmm5
	vpunpckhqdq	%xmm0,%xmm0,%xmm3
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
	vpxor	%xmm0,%xmm1,%xmm4
	vpxor	%xmm4,%xmm3,%xmm3

	vpslldq	$8,%xmm3,%xmm4
	vpsrldq	$8,%xmm3,%xmm3
	vpxor	%xmm4,%xmm0,%xmm0
	vpxor	%xmm3,%xmm1,%xmm1
	vpsllq	$57,%xmm0,%xmm3
	vpsllq	$62,%xmm0,%xmm4
	vpxor	%xmm3,%xmm4,%xmm4
	vpsllq	$63,%xmm0,%xmm3
	vpxor	%xmm3,%xmm4,%xmm4
	vpslldq	$8,%xmm4,%xmm3
	vpsrldq	$8,%xmm4,%xmm4
	vpxor	%xmm3,%xmm0,%xmm0
	vpxor	%xmm4,%xmm1,%xmm1

	vpsrlq	$1,%xmm0,%xmm4
	vpxor	%xmm0,%xmm1,%xmm1
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$5,%xmm4,%xmm4
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$1,%xmm0,%xmm0
	vpxor	%xmm1,%xmm0,%xmm0
	/* Store this pass's two entries and prepare their folded halves. */
	vpshufd	$78,%xmm5,%xmm3
	vpshufd	$78,%xmm0,%xmm4
	vpxor	%xmm5,%xmm3,%xmm3
	vmovdqu	%xmm5,0(%rdi)
	vpxor	%xmm0,%xmm4,%xmm4
	vmovdqu	%xmm0,16(%rdi)
	leaq	48(%rdi),%rdi
	subq	$1,%r10
	jnz	.Linit_loop_avx

	/*
	 * Packed slot of the last pass. NOTE(review): the two source operands
	 * are swapped relative to the in-loop vpalignr, so the last packed
	 * entry has its halves in the opposite order. This looks deliberate --
	 * confirm against the consumer before changing it.
	 */
	vpalignr	$8,%xmm4,%xmm3,%xmm5
	vmovdqu	%xmm5,-16(%rdi)

	vzeroupper
	/* 0xf3,0xc3 encodes "rep ret". */
	.byte	0xf3,0xc3
.cfi_endproc
.size	gcm_init_avx,.-gcm_init_avx
/*
 * gcm_gmult_avx
 *
 * Exported alias: transfers control straight to the local label
 * .L_gmult_clmul (defined elsewhere in this file) with all argument
 * registers untouched, so arguments, result and return are exactly those
 * of the code at that label.
 */
.globl	gcm_gmult_avx
.type	gcm_gmult_avx,@function
.align	32
gcm_gmult_avx:
.cfi_startproc
	jmp	.L_gmult_clmul
.cfi_endproc
.size	gcm_gmult_avx,.-gcm_gmult_avx
/* gcm_ghash_avx: folds a buffer of 16-byte blocks into a 128-bit running */
/* value using AVX carry-less multiplies (vpclmulqdq). */
/* */
/* Registers on entry, as used by the code below: */
/*   rdi - address of the 16-byte running value; read at the start and */
/*         written back just before returning */
/*   rsi - base of a table of 16-byte multiplier entries; bytes 0..191 */
/*         relative to the incoming pointer are read */
/*   rdx - input buffer */
/*   rcx - input length in bytes */
/* NOTE(review): presumably (Xi, Htable, inp, len) with the table produced */
/* by gcm_init_avx - confirm against the C prototype and the setup code. */
/* NOTE(review): the code appears to assume rcx is a non-zero multiple of */
/* 16; the short path only leaves on an exact zero count - confirm that */
/* callers guarantee this. */
/* */
/* Writes rcx, rdx, rsi, r10, flags and xmm0-xmm15; uses no stack. */
1419.globl	gcm_ghash_avx
1420.type	gcm_ghash_avx,@function
1421.align	32
1422gcm_ghash_avx:
1423.cfi_startproc
1424	vzeroupper
1425
/* xmm13 = byte-reversal mask, xmm10 = running value in reversed byte order, */
/* r10 = address of the reduction constant.  rsi is biased by +64, which is */
/* why every table access below is written as N-64(%rsi). */
1426	vmovdqu	(%rdi),%xmm10
1427	leaq	.L0x1c2_polynomial(%rip),%r10
1428	leaq	64(%rsi),%rsi
1429	vmovdqu	.Lbswap_mask(%rip),%xmm13
1430	vpshufb	%xmm13,%xmm10,%xmm10
/* Fewer than 128 bytes in total: take the block-at-a-time path. */
1431	cmpq	$0x80,%rcx
1432	jb	.Lshort_avx
1433	subq	$0x80,%rcx
1434
/* First batch of eight blocks, read from 112(%rdx) down to (%rdx), each */
/* byte-reversed and multiplied by successive table entries.  Partial */
/* products alternate between two register sets that are XORed into each */
/* other as the batch proceeds: */
/*   xmm0 / xmm3 - $0x00 products (low qword x low qword) */
/*   xmm1 / xmm4 - $0x11 products (high qword x high qword) */
/*   xmm2 / xmm5 - (high ^ low of the block, built in xmm8 or xmm9) times */
/*                 one qword of xmm7 ($0x00 = low, $0x10 = high) */
/* NOTE(review): the third product looks like the Karatsuba middle term, */
/* which requires the xmm7 entries to hold high ^ low of the multipliers; */
/* that setup is outside this chunk - confirm. */
1435	vmovdqu	112(%rdx),%xmm14
1436	vmovdqu	0-64(%rsi),%xmm6
1437	vpshufb	%xmm13,%xmm14,%xmm14
1438	vmovdqu	32-64(%rsi),%xmm7
1439
1440	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1441	vmovdqu	96(%rdx),%xmm15
1442	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1443	vpxor	%xmm14,%xmm9,%xmm9
1444	vpshufb	%xmm13,%xmm15,%xmm15
1445	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1446	vmovdqu	16-64(%rsi),%xmm6
1447	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1448	vmovdqu	80(%rdx),%xmm14
1449	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1450	vpxor	%xmm15,%xmm8,%xmm8
1451
1452	vpshufb	%xmm13,%xmm14,%xmm14
1453	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1454	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1455	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1456	vmovdqu	48-64(%rsi),%xmm6
1457	vpxor	%xmm14,%xmm9,%xmm9
1458	vmovdqu	64(%rdx),%xmm15
1459	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1460	vmovdqu	80-64(%rsi),%xmm7
1461
1462	vpshufb	%xmm13,%xmm15,%xmm15
1463	vpxor	%xmm0,%xmm3,%xmm3
1464	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1465	vpxor	%xmm1,%xmm4,%xmm4
1466	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1467	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1468	vmovdqu	64-64(%rsi),%xmm6
1469	vpxor	%xmm2,%xmm5,%xmm5
1470	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1471	vpxor	%xmm15,%xmm8,%xmm8
1472
1473	vmovdqu	48(%rdx),%xmm14
1474	vpxor	%xmm3,%xmm0,%xmm0
1475	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1476	vpxor	%xmm4,%xmm1,%xmm1
1477	vpshufb	%xmm13,%xmm14,%xmm14
1478	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1479	vmovdqu	96-64(%rsi),%xmm6
1480	vpxor	%xmm5,%xmm2,%xmm2
1481	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1482	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1483	vmovdqu	128-64(%rsi),%xmm7
1484	vpxor	%xmm14,%xmm9,%xmm9
1485
1486	vmovdqu	32(%rdx),%xmm15
1487	vpxor	%xmm0,%xmm3,%xmm3
1488	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1489	vpxor	%xmm1,%xmm4,%xmm4
1490	vpshufb	%xmm13,%xmm15,%xmm15
1491	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1492	vmovdqu	112-64(%rsi),%xmm6
1493	vpxor	%xmm2,%xmm5,%xmm5
1494	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1495	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1496	vpxor	%xmm15,%xmm8,%xmm8
1497
1498	vmovdqu	16(%rdx),%xmm14
1499	vpxor	%xmm3,%xmm0,%xmm0
1500	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1501	vpxor	%xmm4,%xmm1,%xmm1
1502	vpshufb	%xmm13,%xmm14,%xmm14
1503	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1504	vmovdqu	144-64(%rsi),%xmm6
1505	vpxor	%xmm5,%xmm2,%xmm2
1506	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1507	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1508	vmovdqu	176-64(%rsi),%xmm7
1509	vpxor	%xmm14,%xmm9,%xmm9
1510
/* The block at (%rdx) is loaded into xmm15 but not multiplied yet: the */
/* running value has to be XORed into it first (done at L-label targets */
/* below, either before the loop or at .Ltail_avx). */
1511	vmovdqu	(%rdx),%xmm15
1512	vpxor	%xmm0,%xmm3,%xmm3
1513	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1514	vpxor	%xmm1,%xmm4,%xmm4
1515	vpshufb	%xmm13,%xmm15,%xmm15
1516	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1517	vmovdqu	160-64(%rsi),%xmm6
1518	vpxor	%xmm2,%xmm5,%xmm5
1519	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
1520
/* Batch consumed.  With fewer than 128 bytes left, finish in the tail, */
/* which also XORs xmm10 into the pending block. */
1521	leaq	128(%rdx),%rdx
1522	cmpq	$0x80,%rcx
1523	jb	.Ltail_avx
1524
/* Otherwise XOR the running value into the pending block here, */
/* pre-subtract the next batch from the count, and enter the loop. */
1525	vpxor	%xmm10,%xmm15,%xmm15
1526	subq	$0x80,%rcx
1527	jmp	.Loop8x_avx
1528
/* Main loop, 128 bytes per iteration.  Three things are interleaved: */
/*  1. the outstanding multiply of the previous batch's pending block */
/*     xmm15 (results in xmm10 = low, xmm11 = high, xmm12 = middle), */
/*     combined with the accumulators xmm3, xmm4 and xmm5; */
/*  2. folding the middle term into xmm10/xmm11 and reducing the 256-bit */
/*     product in two steps, each a qword swap (vpalignr $8) plus a */
/*     multiply by the high qword of the constant at (%r10); */
/*  3. the multiplies for the next eight blocks, same scheme as above. */
/* At the bottom the reduced value (xmm12 ^ xmm10) is XORed into the next */
/* pending block in xmm15. */
1529.align	32
1530.Loop8x_avx:
1531	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1532	vmovdqu	112(%rdx),%xmm14
1533	vpxor	%xmm0,%xmm3,%xmm3
1534	vpxor	%xmm15,%xmm8,%xmm8
1535	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm10
1536	vpshufb	%xmm13,%xmm14,%xmm14
1537	vpxor	%xmm1,%xmm4,%xmm4
1538	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm11
1539	vmovdqu	0-64(%rsi),%xmm6
1540	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1541	vpxor	%xmm2,%xmm5,%xmm5
1542	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm12
1543	vmovdqu	32-64(%rsi),%xmm7
1544	vpxor	%xmm14,%xmm9,%xmm9
1545
/* xmm10, xmm11, xmm12 absorb the previous batch's accumulators while the */
/* new batch starts refilling xmm0-xmm5. */
1546	vmovdqu	96(%rdx),%xmm15
1547	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1548	vpxor	%xmm3,%xmm10,%xmm10
1549	vpshufb	%xmm13,%xmm15,%xmm15
1550	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1551	vxorps	%xmm4,%xmm11,%xmm11
1552	vmovdqu	16-64(%rsi),%xmm6
1553	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1554	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1555	vpxor	%xmm5,%xmm12,%xmm12
1556	vxorps	%xmm15,%xmm8,%xmm8
1557
/* middle ^= low ^ high; then its low qword is XORed into the high qword */
/* of xmm10 and its high qword into the low qword of xmm11. */
1558	vmovdqu	80(%rdx),%xmm14
1559	vpxor	%xmm10,%xmm12,%xmm12
1560	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1561	vpxor	%xmm11,%xmm12,%xmm12
1562	vpslldq	$8,%xmm12,%xmm9
1563	vpxor	%xmm0,%xmm3,%xmm3
1564	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1565	vpsrldq	$8,%xmm12,%xmm12
1566	vpxor	%xmm9,%xmm10,%xmm10
1567	vmovdqu	48-64(%rsi),%xmm6
1568	vpshufb	%xmm13,%xmm14,%xmm14
1569	vxorps	%xmm12,%xmm11,%xmm11
1570	vpxor	%xmm1,%xmm4,%xmm4
1571	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1572	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1573	vmovdqu	80-64(%rsi),%xmm7
1574	vpxor	%xmm14,%xmm9,%xmm9
1575	vpxor	%xmm2,%xmm5,%xmm5
1576
/* Reduction step 1 begins: xmm12 = xmm10 with its qwords swapped. */
1577	vmovdqu	64(%rdx),%xmm15
1578	vpalignr	$8,%xmm10,%xmm10,%xmm12
1579	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1580	vpshufb	%xmm13,%xmm15,%xmm15
1581	vpxor	%xmm3,%xmm0,%xmm0
1582	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1583	vmovdqu	64-64(%rsi),%xmm6
1584	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1585	vpxor	%xmm4,%xmm1,%xmm1
1586	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1587	vxorps	%xmm15,%xmm8,%xmm8
1588	vpxor	%xmm5,%xmm2,%xmm2
1589
/* xmm10 = low qword of xmm10 times the high qword of the constant. */
1590	vmovdqu	48(%rdx),%xmm14
1591	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
1592	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1593	vpshufb	%xmm13,%xmm14,%xmm14
1594	vpxor	%xmm0,%xmm3,%xmm3
1595	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1596	vmovdqu	96-64(%rsi),%xmm6
1597	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1598	vpxor	%xmm1,%xmm4,%xmm4
1599	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1600	vmovdqu	128-64(%rsi),%xmm7
1601	vpxor	%xmm14,%xmm9,%xmm9
1602	vpxor	%xmm2,%xmm5,%xmm5
1603
/* The last instruction of this group completes reduction step 1. */
1604	vmovdqu	32(%rdx),%xmm15
1605	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1606	vpshufb	%xmm13,%xmm15,%xmm15
1607	vpxor	%xmm3,%xmm0,%xmm0
1608	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1609	vmovdqu	112-64(%rsi),%xmm6
1610	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1611	vpxor	%xmm4,%xmm1,%xmm1
1612	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1613	vpxor	%xmm15,%xmm8,%xmm8
1614	vpxor	%xmm5,%xmm2,%xmm2
1615	vxorps	%xmm12,%xmm10,%xmm10
1616
/* Reduction step 2: swap, multiply by the constant again, and bring in */
/* the high half xmm11 through xmm12. */
1617	vmovdqu	16(%rdx),%xmm14
1618	vpalignr	$8,%xmm10,%xmm10,%xmm12
1619	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1620	vpshufb	%xmm13,%xmm14,%xmm14
1621	vpxor	%xmm0,%xmm3,%xmm3
1622	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1623	vmovdqu	144-64(%rsi),%xmm6
1624	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
1625	vxorps	%xmm11,%xmm12,%xmm12
1626	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1627	vpxor	%xmm1,%xmm4,%xmm4
1628	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1629	vmovdqu	176-64(%rsi),%xmm7
1630	vpxor	%xmm14,%xmm9,%xmm9
1631	vpxor	%xmm2,%xmm5,%xmm5
1632
/* New pending block: xmm15 = byte-reversed block at (%rdx), XORed with */
/* the reduced running value (xmm12 ^ xmm10). */
1633	vmovdqu	(%rdx),%xmm15
1634	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1635	vpshufb	%xmm13,%xmm15,%xmm15
1636	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1637	vmovdqu	160-64(%rsi),%xmm6
1638	vpxor	%xmm12,%xmm15,%xmm15
1639	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
1640	vpxor	%xmm10,%xmm15,%xmm15
1641
/* Advance the input and loop while subq does not borrow, i.e. while at */
/* least another 128 bytes were available. */
1642	leaq	128(%rdx),%rdx
1643	subq	$0x80,%rcx
1644	jnc	.Loop8x_avx
1645
/* Undo the last subtraction so rcx holds the leftover byte count again. */
/* xmm15 already contains the running value, so enter the tail past its */
/* XOR. */
1646	addq	$0x80,%rcx
1647	jmp	.Ltail_no_xor_avx
1648
/* Short path, reached with fewer than 128 bytes pending.  rdx is moved to */
/* the end of the pending input and blocks are read backwards at -16, -32, */
/* ... while table entries are read in ascending order.  Each step below */
/* multiplies the previously loaded block and loads the next one; the */
/* block loaded last is always multiplied at .Ltail_avx, where the running */
/* value is XORed into it first. */
1649.align	32
1650.Lshort_avx:
1651	vmovdqu	-16(%rdx,%rcx,1),%xmm14
1652	leaq	(%rdx,%rcx,1),%rdx
1653	vmovdqu	0-64(%rsi),%xmm6
1654	vmovdqu	32-64(%rsi),%xmm7
1655	vpshufb	%xmm13,%xmm14,%xmm15
1656
/* Copy xmm0-xmm2 into xmm3-xmm5.  The first XOR of each pair that follows */
/* (for example xmm0 ^ xmm3 into xmm3) then yields zero, so the */
/* accumulators start cleared whatever xmm0-xmm2 held on arrival. */
1657	vmovdqa	%xmm0,%xmm3
1658	vmovdqa	%xmm1,%xmm4
1659	vmovdqa	%xmm2,%xmm5
1660	subq	$0x10,%rcx
1661	jz	.Ltail_avx
1662
/* Steps alternate between vpsrldq $8 (bring the high qword of xmm7 down */
/* for the next $0x00 multiply) and loading a fresh xmm7 entry. */
1663	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1664	vpxor	%xmm0,%xmm3,%xmm3
1665	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1666	vpxor	%xmm15,%xmm8,%xmm8
1667	vmovdqu	-32(%rdx),%xmm14
1668	vpxor	%xmm1,%xmm4,%xmm4
1669	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1670	vmovdqu	16-64(%rsi),%xmm6
1671	vpshufb	%xmm13,%xmm14,%xmm15
1672	vpxor	%xmm2,%xmm5,%xmm5
1673	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1674	vpsrldq	$8,%xmm7,%xmm7
1675	subq	$0x10,%rcx
1676	jz	.Ltail_avx
1677
1678	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1679	vpxor	%xmm0,%xmm3,%xmm3
1680	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1681	vpxor	%xmm15,%xmm8,%xmm8
1682	vmovdqu	-48(%rdx),%xmm14
1683	vpxor	%xmm1,%xmm4,%xmm4
1684	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1685	vmovdqu	48-64(%rsi),%xmm6
1686	vpshufb	%xmm13,%xmm14,%xmm15
1687	vpxor	%xmm2,%xmm5,%xmm5
1688	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1689	vmovdqu	80-64(%rsi),%xmm7
1690	subq	$0x10,%rcx
1691	jz	.Ltail_avx
1692
1693	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1694	vpxor	%xmm0,%xmm3,%xmm3
1695	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1696	vpxor	%xmm15,%xmm8,%xmm8
1697	vmovdqu	-64(%rdx),%xmm14
1698	vpxor	%xmm1,%xmm4,%xmm4
1699	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1700	vmovdqu	64-64(%rsi),%xmm6
1701	vpshufb	%xmm13,%xmm14,%xmm15
1702	vpxor	%xmm2,%xmm5,%xmm5
1703	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1704	vpsrldq	$8,%xmm7,%xmm7
1705	subq	$0x10,%rcx
1706	jz	.Ltail_avx
1707
1708	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1709	vpxor	%xmm0,%xmm3,%xmm3
1710	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1711	vpxor	%xmm15,%xmm8,%xmm8
1712	vmovdqu	-80(%rdx),%xmm14
1713	vpxor	%xmm1,%xmm4,%xmm4
1714	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1715	vmovdqu	96-64(%rsi),%xmm6
1716	vpshufb	%xmm13,%xmm14,%xmm15
1717	vpxor	%xmm2,%xmm5,%xmm5
1718	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1719	vmovdqu	128-64(%rsi),%xmm7
1720	subq	$0x10,%rcx
1721	jz	.Ltail_avx
1722
1723	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1724	vpxor	%xmm0,%xmm3,%xmm3
1725	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1726	vpxor	%xmm15,%xmm8,%xmm8
1727	vmovdqu	-96(%rdx),%xmm14
1728	vpxor	%xmm1,%xmm4,%xmm4
1729	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1730	vmovdqu	112-64(%rsi),%xmm6
1731	vpshufb	%xmm13,%xmm14,%xmm15
1732	vpxor	%xmm2,%xmm5,%xmm5
1733	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1734	vpsrldq	$8,%xmm7,%xmm7
1735	subq	$0x10,%rcx
1736	jz	.Ltail_avx
1737
/* Last short step: the seventh block is loaded and the tail is entered */
/* unconditionally.  vmovq fetches only the upper 8 bytes of the entry at */
/* 176-64(%rsi) into the low qword of xmm7. */
1738	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1739	vpxor	%xmm0,%xmm3,%xmm3
1740	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1741	vpxor	%xmm15,%xmm8,%xmm8
1742	vmovdqu	-112(%rdx),%xmm14
1743	vpxor	%xmm1,%xmm4,%xmm4
1744	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1745	vmovdqu	144-64(%rsi),%xmm6
1746	vpshufb	%xmm13,%xmm14,%xmm15
1747	vpxor	%xmm2,%xmm5,%xmm5
1748	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1749	vmovq	184-64(%rsi),%xmm7
1750	subq	$0x10,%rcx
1751	jmp	.Ltail_avx
1752
/* Tail: XOR the running value into the pending block (skipped when */
/* entered at .Ltail_no_xor_avx), then perform its three multiplies. */
1753.align	32
1754.Ltail_avx:
1755	vpxor	%xmm10,%xmm15,%xmm15
1756.Ltail_no_xor_avx:
1757	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1758	vpxor	%xmm0,%xmm3,%xmm3
1759	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1760	vpxor	%xmm15,%xmm8,%xmm8
1761	vpxor	%xmm1,%xmm4,%xmm4
1762	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1763	vpxor	%xmm2,%xmm5,%xmm5
1764	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1765
1766	vmovdqu	(%r10),%xmm12
1767
/* Collapse the accumulators: xmm10 = low, xmm11 = high, xmm5 = middle. */
1768	vpxor	%xmm0,%xmm3,%xmm10
1769	vpxor	%xmm1,%xmm4,%xmm11
1770	vpxor	%xmm2,%xmm5,%xmm5
1771
/* middle ^= low ^ high, then split it across the two 128-bit halves. */
1772	vpxor	%xmm10,%xmm5,%xmm5
1773	vpxor	%xmm11,%xmm5,%xmm5
1774	vpslldq	$8,%xmm5,%xmm9
1775	vpsrldq	$8,%xmm5,%xmm5
1776	vpxor	%xmm9,%xmm10,%xmm10
1777	vpxor	%xmm5,%xmm11,%xmm11
1778
/* Reduction step 1: multiply the low qword of xmm10 by the high qword of */
/* the constant in xmm12, swap the qwords of xmm10, and XOR. */
1779	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
1780	vpalignr	$8,%xmm10,%xmm10,%xmm10
1781	vpxor	%xmm9,%xmm10,%xmm10
1782
/* Reduction step 2, plus the high half; the result is left in xmm10. */
1783	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
1784	vpalignr	$8,%xmm10,%xmm10,%xmm10
1785	vpxor	%xmm11,%xmm10,%xmm10
1786	vpxor	%xmm9,%xmm10,%xmm10
1787
/* Bytes still pending: process them through the short path. */
1788	cmpq	$0,%rcx
1789	jne	.Lshort_avx
1790
/* Restore the byte order, store the result, and zero the upper halves of */
/* the YMM registers.  The .byte pair 0xf3,0xc3 encodes "rep ret". */
1791	vpshufb	%xmm13,%xmm10,%xmm10
1792	vmovdqu	%xmm10,(%rdi)
1793	vzeroupper
1794	.byte	0xf3,0xc3
1795.cfi_endproc
1796.size	gcm_ghash_avx,.-gcm_ghash_avx
/* Vector constants.  The first two are used by gcm_ghash_avx; the two */
/* .L7_mask tables are not referenced in this part of the file. */
1797.align	64
/* vpshufb control bytes 15..0: reverses the byte order of a 16-byte vector. */
1798.Lbswap_mask:
1799.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
/* 128-bit constant: low qword 1, high qword 0xc200000000000000. */
/* gcm_ghash_avx multiplies by the high qword in its reduction steps. */
1800.L0x1c2_polynomial:
1801.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
/* Four dwords: 7, 0, 7, 0. */
1802.L7_mask:
1803.long	7,0,7,0
/* Four dwords: 7, 0, 450, 0 (450 = 0x1c2). */
1804.L7_mask_poly:
1805.long	7,0,450,0
/* .Lrem_4bit: 16 entries of 8 bytes, each written as two dwords with the */
/* low dword zero.  gcm_gmult_4bit indexes it with a 4-bit value scaled by */
/* 8 and XORs the entry into its accumulator.  Entry 1 has high dword */
/* 471859200 = 0x1C200000. */
1806.align	64
1807.type	.Lrem_4bit,@object
1808.Lrem_4bit:
1809.long	0,0,0,471859200,0,943718400,0,610271232
1810.long	0,1887436800,0,1822425088,0,1220542464,0,1423966208
1811.long	0,3774873600,0,4246732800,0,3644850176,0,3311403008
1812.long	0,2441084928,0,2376073216,0,2847932416,0,3051356160
/* .Lrem_8bit: 256 entries of 16 bits each (32 rows of 8); entry 1 is */
/* 0x01C2.  It is not referenced in this part of the file. */
/* NOTE(review): presumably the byte-indexed counterpart of .Lrem_4bit, */
/* used by a routine outside this chunk - confirm. */
1813.type	.Lrem_8bit,@object
1814.Lrem_8bit:
1815.value	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
1816.value	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
1817.value	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
1818.value	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
1819.value	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
1820.value	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
1821.value	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
1822.value	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
1823.value	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
1824.value	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
1825.value	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
1826.value	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
1827.value	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
1828.value	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
1829.value	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
1830.value	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
1831.value	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
1832.value	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
1833.value	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
1834.value	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
1835.value	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
1836.value	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
1837.value	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
1838.value	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
1839.value	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
1840.value	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
1841.value	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
1842.value	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
1843.value	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
1844.value	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
1845.value	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
1846.value	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
1847
/* NUL-terminated ASCII identification string: */
/* "GHASH for x86_64, CRYPTOGAMS by <appro@openssl.org>" */
1848.byte	71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1849.align	64
1850