xref: /freebsd/sys/crypto/openssl/amd64/ghash-x86_64.S (revision 2a63c3be158216222d89a073dcbd6a72ee4aab5a)
1/* Do not modify. This file is auto-generated from ghash-x86_64.pl. */
2.text
3
4
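/*
 * gcm_gmult_4bit(Xi, Htable) -- one GHASH multiplication, Xi = Xi * H in
 * GF(2^128), using the 4-bit table method and the .Lrem_4bit reduction
 * constants.  Arguments (SysV AMD64, as used by OpenSSL's gcm128.c):
 * %rdi = u64 Xi[2], %rsi = const u128 Htable[16].
 */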
5.globl	gcm_gmult_4bit
6.type	gcm_gmult_4bit,@function
7.align	16
8gcm_gmult_4bit:
9.cfi_startproc
10	pushq	%rbx
11.cfi_adjust_cfa_offset	8
12.cfi_offset	%rbx,-16
13	pushq	%rbp
14.cfi_adjust_cfa_offset	8
15.cfi_offset	%rbp,-24
16	pushq	%r12
17.cfi_adjust_cfa_offset	8
18.cfi_offset	%r12,-32
19	pushq	%r13
20.cfi_adjust_cfa_offset	8
21.cfi_offset	%r13,-40
22	pushq	%r14
23.cfi_adjust_cfa_offset	8
24.cfi_offset	%r14,-48
25	pushq	%r15
26.cfi_adjust_cfa_offset	8
27.cfi_offset	%r15,-56
28	subq	$280,%rsp
29.cfi_adjust_cfa_offset	280
30.Lgmult_prologue:
31
32	movzbq	15(%rdi),%r8
33	leaq	.Lrem_4bit(%rip),%r11
34	xorq	%rax,%rax
35	xorq	%rbx,%rbx
36	movb	%r8b,%al
37	movb	%r8b,%bl
38	shlb	$4,%al
39	movq	$14,%rcx
40	movq	8(%rsi,%rax,1),%r8
41	movq	(%rsi,%rax,1),%r9
42	andb	$0xf0,%bl
43	movq	%r8,%rdx
44	jmp	.Loop1
45
46.align	16
47.Loop1:
48	shrq	$4,%r8
49	andq	$0xf,%rdx
50	movq	%r9,%r10
51	movb	(%rdi,%rcx,1),%al
52	shrq	$4,%r9
53	xorq	8(%rsi,%rbx,1),%r8
54	shlq	$60,%r10
55	xorq	(%rsi,%rbx,1),%r9
56	movb	%al,%bl
57	xorq	(%r11,%rdx,8),%r9
58	movq	%r8,%rdx
59	shlb	$4,%al
60	xorq	%r10,%r8
61	decq	%rcx
62	js	.Lbreak1
63
64	shrq	$4,%r8
65	andq	$0xf,%rdx
66	movq	%r9,%r10
67	shrq	$4,%r9
68	xorq	8(%rsi,%rax,1),%r8
69	shlq	$60,%r10
70	xorq	(%rsi,%rax,1),%r9
71	andb	$0xf0,%bl
72	xorq	(%r11,%rdx,8),%r9
73	movq	%r8,%rdx
74	xorq	%r10,%r8
75	jmp	.Loop1
76
77.align	16
78.Lbreak1:
79	shrq	$4,%r8
80	andq	$0xf,%rdx
81	movq	%r9,%r10
82	shrq	$4,%r9
83	xorq	8(%rsi,%rax,1),%r8
84	shlq	$60,%r10
85	xorq	(%rsi,%rax,1),%r9
86	andb	$0xf0,%bl
87	xorq	(%r11,%rdx,8),%r9
88	movq	%r8,%rdx
89	xorq	%r10,%r8
90
91	shrq	$4,%r8
92	andq	$0xf,%rdx
93	movq	%r9,%r10
94	shrq	$4,%r9
95	xorq	8(%rsi,%rbx,1),%r8
96	shlq	$60,%r10
97	xorq	(%rsi,%rbx,1),%r9
98	xorq	%r10,%r8
99	xorq	(%r11,%rdx,8),%r9
100
101	bswapq	%r8
102	bswapq	%r9
103	movq	%r8,8(%rdi)
104	movq	%r9,(%rdi)
105
106	leaq	280+48(%rsp),%rsi
107.cfi_def_cfa	%rsi,8
108	movq	-8(%rsi),%rbx
109.cfi_restore	%rbx
110	leaq	(%rsi),%rsp
111.cfi_def_cfa_register	%rsp
112.Lgmult_epilogue:
113	.byte	0xf3,0xc3
114.cfi_endproc
115.size	gcm_gmult_4bit,.-gcm_gmult_4bit
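/*
 * gcm_ghash_4bit(Xi, Htable, inp, len) -- GHASH over a buffer: for each
 * 16-byte block, Xi = (Xi ^ block) * H, using a stack copy of Htable and
 * the .Lrem_8bit reduction table.  %rdi = Xi, %rsi = Htable, %rdx = inp,
 * %rcx = len in bytes (a multiple of 16).
 */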
116.globl	gcm_ghash_4bit
117.type	gcm_ghash_4bit,@function
118.align	16
119gcm_ghash_4bit:
120.cfi_startproc
121	pushq	%rbx
122.cfi_adjust_cfa_offset	8
123.cfi_offset	%rbx,-16
124	pushq	%rbp
125.cfi_adjust_cfa_offset	8
126.cfi_offset	%rbp,-24
127	pushq	%r12
128.cfi_adjust_cfa_offset	8
129.cfi_offset	%r12,-32
130	pushq	%r13
131.cfi_adjust_cfa_offset	8
132.cfi_offset	%r13,-40
133	pushq	%r14
134.cfi_adjust_cfa_offset	8
135.cfi_offset	%r14,-48
136	pushq	%r15
137.cfi_adjust_cfa_offset	8
138.cfi_offset	%r15,-56
139	subq	$280,%rsp
140.cfi_adjust_cfa_offset	280
141.Lghash_prologue:
142	movq	%rdx,%r14
143	movq	%rcx,%r15
144	subq	$-128,%rsi
145	leaq	16+128(%rsp),%rbp
146	xorl	%edx,%edx
147	movq	0+0-128(%rsi),%r8
148	movq	0+8-128(%rsi),%rax
149	movb	%al,%dl
150	shrq	$4,%rax
151	movq	%r8,%r10
152	shrq	$4,%r8
153	movq	16+0-128(%rsi),%r9
154	shlb	$4,%dl
155	movq	16+8-128(%rsi),%rbx
156	shlq	$60,%r10
157	movb	%dl,0(%rsp)
158	orq	%r10,%rax
159	movb	%bl,%dl
160	shrq	$4,%rbx
161	movq	%r9,%r10
162	shrq	$4,%r9
163	movq	%r8,0(%rbp)
164	movq	32+0-128(%rsi),%r8
165	shlb	$4,%dl
166	movq	%rax,0-128(%rbp)
167	movq	32+8-128(%rsi),%rax
168	shlq	$60,%r10
169	movb	%dl,1(%rsp)
170	orq	%r10,%rbx
171	movb	%al,%dl
172	shrq	$4,%rax
173	movq	%r8,%r10
174	shrq	$4,%r8
175	movq	%r9,8(%rbp)
176	movq	48+0-128(%rsi),%r9
177	shlb	$4,%dl
178	movq	%rbx,8-128(%rbp)
179	movq	48+8-128(%rsi),%rbx
180	shlq	$60,%r10
181	movb	%dl,2(%rsp)
182	orq	%r10,%rax
183	movb	%bl,%dl
184	shrq	$4,%rbx
185	movq	%r9,%r10
186	shrq	$4,%r9
187	movq	%r8,16(%rbp)
188	movq	64+0-128(%rsi),%r8
189	shlb	$4,%dl
190	movq	%rax,16-128(%rbp)
191	movq	64+8-128(%rsi),%rax
192	shlq	$60,%r10
193	movb	%dl,3(%rsp)
194	orq	%r10,%rbx
195	movb	%al,%dl
196	shrq	$4,%rax
197	movq	%r8,%r10
198	shrq	$4,%r8
199	movq	%r9,24(%rbp)
200	movq	80+0-128(%rsi),%r9
201	shlb	$4,%dl
202	movq	%rbx,24-128(%rbp)
203	movq	80+8-128(%rsi),%rbx
204	shlq	$60,%r10
205	movb	%dl,4(%rsp)
206	orq	%r10,%rax
207	movb	%bl,%dl
208	shrq	$4,%rbx
209	movq	%r9,%r10
210	shrq	$4,%r9
211	movq	%r8,32(%rbp)
212	movq	96+0-128(%rsi),%r8
213	shlb	$4,%dl
214	movq	%rax,32-128(%rbp)
215	movq	96+8-128(%rsi),%rax
216	shlq	$60,%r10
217	movb	%dl,5(%rsp)
218	orq	%r10,%rbx
219	movb	%al,%dl
220	shrq	$4,%rax
221	movq	%r8,%r10
222	shrq	$4,%r8
223	movq	%r9,40(%rbp)
224	movq	112+0-128(%rsi),%r9
225	shlb	$4,%dl
226	movq	%rbx,40-128(%rbp)
227	movq	112+8-128(%rsi),%rbx
228	shlq	$60,%r10
229	movb	%dl,6(%rsp)
230	orq	%r10,%rax
231	movb	%bl,%dl
232	shrq	$4,%rbx
233	movq	%r9,%r10
234	shrq	$4,%r9
235	movq	%r8,48(%rbp)
236	movq	128+0-128(%rsi),%r8
237	shlb	$4,%dl
238	movq	%rax,48-128(%rbp)
239	movq	128+8-128(%rsi),%rax
240	shlq	$60,%r10
241	movb	%dl,7(%rsp)
242	orq	%r10,%rbx
243	movb	%al,%dl
244	shrq	$4,%rax
245	movq	%r8,%r10
246	shrq	$4,%r8
247	movq	%r9,56(%rbp)
248	movq	144+0-128(%rsi),%r9
249	shlb	$4,%dl
250	movq	%rbx,56-128(%rbp)
251	movq	144+8-128(%rsi),%rbx
252	shlq	$60,%r10
253	movb	%dl,8(%rsp)
254	orq	%r10,%rax
255	movb	%bl,%dl
256	shrq	$4,%rbx
257	movq	%r9,%r10
258	shrq	$4,%r9
259	movq	%r8,64(%rbp)
260	movq	160+0-128(%rsi),%r8
261	shlb	$4,%dl
262	movq	%rax,64-128(%rbp)
263	movq	160+8-128(%rsi),%rax
264	shlq	$60,%r10
265	movb	%dl,9(%rsp)
266	orq	%r10,%rbx
267	movb	%al,%dl
268	shrq	$4,%rax
269	movq	%r8,%r10
270	shrq	$4,%r8
271	movq	%r9,72(%rbp)
272	movq	176+0-128(%rsi),%r9
273	shlb	$4,%dl
274	movq	%rbx,72-128(%rbp)
275	movq	176+8-128(%rsi),%rbx
276	shlq	$60,%r10
277	movb	%dl,10(%rsp)
278	orq	%r10,%rax
279	movb	%bl,%dl
280	shrq	$4,%rbx
281	movq	%r9,%r10
282	shrq	$4,%r9
283	movq	%r8,80(%rbp)
284	movq	192+0-128(%rsi),%r8
285	shlb	$4,%dl
286	movq	%rax,80-128(%rbp)
287	movq	192+8-128(%rsi),%rax
288	shlq	$60,%r10
289	movb	%dl,11(%rsp)
290	orq	%r10,%rbx
291	movb	%al,%dl
292	shrq	$4,%rax
293	movq	%r8,%r10
294	shrq	$4,%r8
295	movq	%r9,88(%rbp)
296	movq	208+0-128(%rsi),%r9
297	shlb	$4,%dl
298	movq	%rbx,88-128(%rbp)
299	movq	208+8-128(%rsi),%rbx
300	shlq	$60,%r10
301	movb	%dl,12(%rsp)
302	orq	%r10,%rax
303	movb	%bl,%dl
304	shrq	$4,%rbx
305	movq	%r9,%r10
306	shrq	$4,%r9
307	movq	%r8,96(%rbp)
308	movq	224+0-128(%rsi),%r8
309	shlb	$4,%dl
310	movq	%rax,96-128(%rbp)
311	movq	224+8-128(%rsi),%rax
312	shlq	$60,%r10
313	movb	%dl,13(%rsp)
314	orq	%r10,%rbx
315	movb	%al,%dl
316	shrq	$4,%rax
317	movq	%r8,%r10
318	shrq	$4,%r8
319	movq	%r9,104(%rbp)
320	movq	240+0-128(%rsi),%r9
321	shlb	$4,%dl
322	movq	%rbx,104-128(%rbp)
323	movq	240+8-128(%rsi),%rbx
324	shlq	$60,%r10
325	movb	%dl,14(%rsp)
326	orq	%r10,%rax
327	movb	%bl,%dl
328	shrq	$4,%rbx
329	movq	%r9,%r10
330	shrq	$4,%r9
331	movq	%r8,112(%rbp)
332	shlb	$4,%dl
333	movq	%rax,112-128(%rbp)
334	shlq	$60,%r10
335	movb	%dl,15(%rsp)
336	orq	%r10,%rbx
337	movq	%r9,120(%rbp)
338	movq	%rbx,120-128(%rbp)
339	addq	$-128,%rsi
340	movq	8(%rdi),%r8
341	movq	0(%rdi),%r9
342	addq	%r14,%r15
343	leaq	.Lrem_8bit(%rip),%r11
344	jmp	.Louter_loop
345.align	16
346.Louter_loop:
347	xorq	(%r14),%r9
348	movq	8(%r14),%rdx
349	leaq	16(%r14),%r14
350	xorq	%r8,%rdx
351	movq	%r9,(%rdi)
352	movq	%rdx,8(%rdi)
353	shrq	$32,%rdx
354	xorq	%rax,%rax
355	roll	$8,%edx
356	movb	%dl,%al
357	movzbl	%dl,%ebx
358	shlb	$4,%al
359	shrl	$4,%ebx
360	roll	$8,%edx
361	movq	8(%rsi,%rax,1),%r8
362	movq	(%rsi,%rax,1),%r9
363	movb	%dl,%al
364	movzbl	%dl,%ecx
365	shlb	$4,%al
366	movzbq	(%rsp,%rbx,1),%r12
367	shrl	$4,%ecx
368	xorq	%r8,%r12
369	movq	%r9,%r10
370	shrq	$8,%r8
371	movzbq	%r12b,%r12
372	shrq	$8,%r9
373	xorq	-128(%rbp,%rbx,8),%r8
374	shlq	$56,%r10
375	xorq	(%rbp,%rbx,8),%r9
376	roll	$8,%edx
377	xorq	8(%rsi,%rax,1),%r8
378	xorq	(%rsi,%rax,1),%r9
379	movb	%dl,%al
380	xorq	%r10,%r8
381	movzwq	(%r11,%r12,2),%r12
382	movzbl	%dl,%ebx
383	shlb	$4,%al
384	movzbq	(%rsp,%rcx,1),%r13
385	shrl	$4,%ebx
386	shlq	$48,%r12
387	xorq	%r8,%r13
388	movq	%r9,%r10
389	xorq	%r12,%r9
390	shrq	$8,%r8
391	movzbq	%r13b,%r13
392	shrq	$8,%r9
393	xorq	-128(%rbp,%rcx,8),%r8
394	shlq	$56,%r10
395	xorq	(%rbp,%rcx,8),%r9
396	roll	$8,%edx
397	xorq	8(%rsi,%rax,1),%r8
398	xorq	(%rsi,%rax,1),%r9
399	movb	%dl,%al
400	xorq	%r10,%r8
401	movzwq	(%r11,%r13,2),%r13
402	movzbl	%dl,%ecx
403	shlb	$4,%al
404	movzbq	(%rsp,%rbx,1),%r12
405	shrl	$4,%ecx
406	shlq	$48,%r13
407	xorq	%r8,%r12
408	movq	%r9,%r10
409	xorq	%r13,%r9
410	shrq	$8,%r8
411	movzbq	%r12b,%r12
412	movl	8(%rdi),%edx
413	shrq	$8,%r9
414	xorq	-128(%rbp,%rbx,8),%r8
415	shlq	$56,%r10
416	xorq	(%rbp,%rbx,8),%r9
417	roll	$8,%edx
418	xorq	8(%rsi,%rax,1),%r8
419	xorq	(%rsi,%rax,1),%r9
420	movb	%dl,%al
421	xorq	%r10,%r8
422	movzwq	(%r11,%r12,2),%r12
423	movzbl	%dl,%ebx
424	shlb	$4,%al
425	movzbq	(%rsp,%rcx,1),%r13
426	shrl	$4,%ebx
427	shlq	$48,%r12
428	xorq	%r8,%r13
429	movq	%r9,%r10
430	xorq	%r12,%r9
431	shrq	$8,%r8
432	movzbq	%r13b,%r13
433	shrq	$8,%r9
434	xorq	-128(%rbp,%rcx,8),%r8
435	shlq	$56,%r10
436	xorq	(%rbp,%rcx,8),%r9
437	roll	$8,%edx
438	xorq	8(%rsi,%rax,1),%r8
439	xorq	(%rsi,%rax,1),%r9
440	movb	%dl,%al
441	xorq	%r10,%r8
442	movzwq	(%r11,%r13,2),%r13
443	movzbl	%dl,%ecx
444	shlb	$4,%al
445	movzbq	(%rsp,%rbx,1),%r12
446	shrl	$4,%ecx
447	shlq	$48,%r13
448	xorq	%r8,%r12
449	movq	%r9,%r10
450	xorq	%r13,%r9
451	shrq	$8,%r8
452	movzbq	%r12b,%r12
453	shrq	$8,%r9
454	xorq	-128(%rbp,%rbx,8),%r8
455	shlq	$56,%r10
456	xorq	(%rbp,%rbx,8),%r9
457	roll	$8,%edx
458	xorq	8(%rsi,%rax,1),%r8
459	xorq	(%rsi,%rax,1),%r9
460	movb	%dl,%al
461	xorq	%r10,%r8
462	movzwq	(%r11,%r12,2),%r12
463	movzbl	%dl,%ebx
464	shlb	$4,%al
465	movzbq	(%rsp,%rcx,1),%r13
466	shrl	$4,%ebx
467	shlq	$48,%r12
468	xorq	%r8,%r13
469	movq	%r9,%r10
470	xorq	%r12,%r9
471	shrq	$8,%r8
472	movzbq	%r13b,%r13
473	shrq	$8,%r9
474	xorq	-128(%rbp,%rcx,8),%r8
475	shlq	$56,%r10
476	xorq	(%rbp,%rcx,8),%r9
477	roll	$8,%edx
478	xorq	8(%rsi,%rax,1),%r8
479	xorq	(%rsi,%rax,1),%r9
480	movb	%dl,%al
481	xorq	%r10,%r8
482	movzwq	(%r11,%r13,2),%r13
483	movzbl	%dl,%ecx
484	shlb	$4,%al
485	movzbq	(%rsp,%rbx,1),%r12
486	shrl	$4,%ecx
487	shlq	$48,%r13
488	xorq	%r8,%r12
489	movq	%r9,%r10
490	xorq	%r13,%r9
491	shrq	$8,%r8
492	movzbq	%r12b,%r12
493	movl	4(%rdi),%edx
494	shrq	$8,%r9
495	xorq	-128(%rbp,%rbx,8),%r8
496	shlq	$56,%r10
497	xorq	(%rbp,%rbx,8),%r9
498	roll	$8,%edx
499	xorq	8(%rsi,%rax,1),%r8
500	xorq	(%rsi,%rax,1),%r9
501	movb	%dl,%al
502	xorq	%r10,%r8
503	movzwq	(%r11,%r12,2),%r12
504	movzbl	%dl,%ebx
505	shlb	$4,%al
506	movzbq	(%rsp,%rcx,1),%r13
507	shrl	$4,%ebx
508	shlq	$48,%r12
509	xorq	%r8,%r13
510	movq	%r9,%r10
511	xorq	%r12,%r9
512	shrq	$8,%r8
513	movzbq	%r13b,%r13
514	shrq	$8,%r9
515	xorq	-128(%rbp,%rcx,8),%r8
516	shlq	$56,%r10
517	xorq	(%rbp,%rcx,8),%r9
518	roll	$8,%edx
519	xorq	8(%rsi,%rax,1),%r8
520	xorq	(%rsi,%rax,1),%r9
521	movb	%dl,%al
522	xorq	%r10,%r8
523	movzwq	(%r11,%r13,2),%r13
524	movzbl	%dl,%ecx
525	shlb	$4,%al
526	movzbq	(%rsp,%rbx,1),%r12
527	shrl	$4,%ecx
528	shlq	$48,%r13
529	xorq	%r8,%r12
530	movq	%r9,%r10
531	xorq	%r13,%r9
532	shrq	$8,%r8
533	movzbq	%r12b,%r12
534	shrq	$8,%r9
535	xorq	-128(%rbp,%rbx,8),%r8
536	shlq	$56,%r10
537	xorq	(%rbp,%rbx,8),%r9
538	roll	$8,%edx
539	xorq	8(%rsi,%rax,1),%r8
540	xorq	(%rsi,%rax,1),%r9
541	movb	%dl,%al
542	xorq	%r10,%r8
543	movzwq	(%r11,%r12,2),%r12
544	movzbl	%dl,%ebx
545	shlb	$4,%al
546	movzbq	(%rsp,%rcx,1),%r13
547	shrl	$4,%ebx
548	shlq	$48,%r12
549	xorq	%r8,%r13
550	movq	%r9,%r10
551	xorq	%r12,%r9
552	shrq	$8,%r8
553	movzbq	%r13b,%r13
554	shrq	$8,%r9
555	xorq	-128(%rbp,%rcx,8),%r8
556	shlq	$56,%r10
557	xorq	(%rbp,%rcx,8),%r9
558	roll	$8,%edx
559	xorq	8(%rsi,%rax,1),%r8
560	xorq	(%rsi,%rax,1),%r9
561	movb	%dl,%al
562	xorq	%r10,%r8
563	movzwq	(%r11,%r13,2),%r13
564	movzbl	%dl,%ecx
565	shlb	$4,%al
566	movzbq	(%rsp,%rbx,1),%r12
567	shrl	$4,%ecx
568	shlq	$48,%r13
569	xorq	%r8,%r12
570	movq	%r9,%r10
571	xorq	%r13,%r9
572	shrq	$8,%r8
573	movzbq	%r12b,%r12
574	movl	0(%rdi),%edx
575	shrq	$8,%r9
576	xorq	-128(%rbp,%rbx,8),%r8
577	shlq	$56,%r10
578	xorq	(%rbp,%rbx,8),%r9
579	roll	$8,%edx
580	xorq	8(%rsi,%rax,1),%r8
581	xorq	(%rsi,%rax,1),%r9
582	movb	%dl,%al
583	xorq	%r10,%r8
584	movzwq	(%r11,%r12,2),%r12
585	movzbl	%dl,%ebx
586	shlb	$4,%al
587	movzbq	(%rsp,%rcx,1),%r13
588	shrl	$4,%ebx
589	shlq	$48,%r12
590	xorq	%r8,%r13
591	movq	%r9,%r10
592	xorq	%r12,%r9
593	shrq	$8,%r8
594	movzbq	%r13b,%r13
595	shrq	$8,%r9
596	xorq	-128(%rbp,%rcx,8),%r8
597	shlq	$56,%r10
598	xorq	(%rbp,%rcx,8),%r9
599	roll	$8,%edx
600	xorq	8(%rsi,%rax,1),%r8
601	xorq	(%rsi,%rax,1),%r9
602	movb	%dl,%al
603	xorq	%r10,%r8
604	movzwq	(%r11,%r13,2),%r13
605	movzbl	%dl,%ecx
606	shlb	$4,%al
607	movzbq	(%rsp,%rbx,1),%r12
608	shrl	$4,%ecx
609	shlq	$48,%r13
610	xorq	%r8,%r12
611	movq	%r9,%r10
612	xorq	%r13,%r9
613	shrq	$8,%r8
614	movzbq	%r12b,%r12
615	shrq	$8,%r9
616	xorq	-128(%rbp,%rbx,8),%r8
617	shlq	$56,%r10
618	xorq	(%rbp,%rbx,8),%r9
619	roll	$8,%edx
620	xorq	8(%rsi,%rax,1),%r8
621	xorq	(%rsi,%rax,1),%r9
622	movb	%dl,%al
623	xorq	%r10,%r8
624	movzwq	(%r11,%r12,2),%r12
625	movzbl	%dl,%ebx
626	shlb	$4,%al
627	movzbq	(%rsp,%rcx,1),%r13
628	shrl	$4,%ebx
629	shlq	$48,%r12
630	xorq	%r8,%r13
631	movq	%r9,%r10
632	xorq	%r12,%r9
633	shrq	$8,%r8
634	movzbq	%r13b,%r13
635	shrq	$8,%r9
636	xorq	-128(%rbp,%rcx,8),%r8
637	shlq	$56,%r10
638	xorq	(%rbp,%rcx,8),%r9
639	roll	$8,%edx
640	xorq	8(%rsi,%rax,1),%r8
641	xorq	(%rsi,%rax,1),%r9
642	movb	%dl,%al
643	xorq	%r10,%r8
644	movzwq	(%r11,%r13,2),%r13
645	movzbl	%dl,%ecx
646	shlb	$4,%al
647	movzbq	(%rsp,%rbx,1),%r12
648	andl	$240,%ecx
649	shlq	$48,%r13
650	xorq	%r8,%r12
651	movq	%r9,%r10
652	xorq	%r13,%r9
653	shrq	$8,%r8
654	movzbq	%r12b,%r12
655	movl	-4(%rdi),%edx
656	shrq	$8,%r9
657	xorq	-128(%rbp,%rbx,8),%r8
658	shlq	$56,%r10
659	xorq	(%rbp,%rbx,8),%r9
660	movzwq	(%r11,%r12,2),%r12
661	xorq	8(%rsi,%rax,1),%r8
662	xorq	(%rsi,%rax,1),%r9
663	shlq	$48,%r12
664	xorq	%r10,%r8
665	xorq	%r12,%r9
666	movzbq	%r8b,%r13
667	shrq	$4,%r8
668	movq	%r9,%r10
669	shlb	$4,%r13b
670	shrq	$4,%r9
671	xorq	8(%rsi,%rcx,1),%r8
672	movzwq	(%r11,%r13,2),%r13
673	shlq	$60,%r10
674	xorq	(%rsi,%rcx,1),%r9
675	xorq	%r10,%r8
676	shlq	$48,%r13
677	bswapq	%r8
678	xorq	%r13,%r9
679	bswapq	%r9
680	cmpq	%r15,%r14
681	jb	.Louter_loop
682	movq	%r8,8(%rdi)
683	movq	%r9,(%rdi)
684
685	leaq	280+48(%rsp),%rsi
686.cfi_def_cfa	%rsi,8
687	movq	-48(%rsi),%r15
688.cfi_restore	%r15
689	movq	-40(%rsi),%r14
690.cfi_restore	%r14
691	movq	-32(%rsi),%r13
692.cfi_restore	%r13
693	movq	-24(%rsi),%r12
694.cfi_restore	%r12
695	movq	-16(%rsi),%rbp
696.cfi_restore	%rbp
697	movq	-8(%rsi),%rbx
698.cfi_restore	%rbx
699	leaq	0(%rsi),%rsp
700.cfi_def_cfa_register	%rsp
701.Lghash_epilogue:
702	.byte	0xf3,0xc3
703.cfi_endproc
704.size	gcm_ghash_4bit,.-gcm_ghash_4bit
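/*
 * gcm_init_clmul(Htable, Xi) -- precompute H, H^2, H^3, H^4 and their
 * Karatsuba-combined halves for the PCLMULQDQ routines below; %rdi = Htable,
 * %rsi = H.  The .byte 102,15,58,68,... lines are hand-assembled pclmulqdq
 * instructions.
 */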
705.globl	gcm_init_clmul
706.type	gcm_init_clmul,@function
707.align	16
708gcm_init_clmul:
709.cfi_startproc
710.L_init_clmul:
711	movdqu	(%rsi),%xmm2
712	pshufd	$78,%xmm2,%xmm2
713
714
715	pshufd	$255,%xmm2,%xmm4
716	movdqa	%xmm2,%xmm3
717	psllq	$1,%xmm2
718	pxor	%xmm5,%xmm5
719	psrlq	$63,%xmm3
720	pcmpgtd	%xmm4,%xmm5
721	pslldq	$8,%xmm3
722	por	%xmm3,%xmm2
723
724
725	pand	.L0x1c2_polynomial(%rip),%xmm5
726	pxor	%xmm5,%xmm2
727
728
729	pshufd	$78,%xmm2,%xmm6
730	movdqa	%xmm2,%xmm0
731	pxor	%xmm2,%xmm6
732	movdqa	%xmm0,%xmm1
733	pshufd	$78,%xmm0,%xmm3
734	pxor	%xmm0,%xmm3
735.byte	102,15,58,68,194,0
736.byte	102,15,58,68,202,17
737.byte	102,15,58,68,222,0
738	pxor	%xmm0,%xmm3
739	pxor	%xmm1,%xmm3
740
741	movdqa	%xmm3,%xmm4
742	psrldq	$8,%xmm3
743	pslldq	$8,%xmm4
744	pxor	%xmm3,%xmm1
745	pxor	%xmm4,%xmm0
746
747	movdqa	%xmm0,%xmm4
748	movdqa	%xmm0,%xmm3
749	psllq	$5,%xmm0
750	pxor	%xmm0,%xmm3
751	psllq	$1,%xmm0
752	pxor	%xmm3,%xmm0
753	psllq	$57,%xmm0
754	movdqa	%xmm0,%xmm3
755	pslldq	$8,%xmm0
756	psrldq	$8,%xmm3
757	pxor	%xmm4,%xmm0
758	pxor	%xmm3,%xmm1
759
760
761	movdqa	%xmm0,%xmm4
762	psrlq	$1,%xmm0
763	pxor	%xmm4,%xmm1
764	pxor	%xmm0,%xmm4
765	psrlq	$5,%xmm0
766	pxor	%xmm4,%xmm0
767	psrlq	$1,%xmm0
768	pxor	%xmm1,%xmm0
769	pshufd	$78,%xmm2,%xmm3
770	pshufd	$78,%xmm0,%xmm4
771	pxor	%xmm2,%xmm3
772	movdqu	%xmm2,0(%rdi)
773	pxor	%xmm0,%xmm4
774	movdqu	%xmm0,16(%rdi)
775.byte	102,15,58,15,227,8
776	movdqu	%xmm4,32(%rdi)
777	movdqa	%xmm0,%xmm1
778	pshufd	$78,%xmm0,%xmm3
779	pxor	%xmm0,%xmm3
780.byte	102,15,58,68,194,0
781.byte	102,15,58,68,202,17
782.byte	102,15,58,68,222,0
783	pxor	%xmm0,%xmm3
784	pxor	%xmm1,%xmm3
785
786	movdqa	%xmm3,%xmm4
787	psrldq	$8,%xmm3
788	pslldq	$8,%xmm4
789	pxor	%xmm3,%xmm1
790	pxor	%xmm4,%xmm0
791
792	movdqa	%xmm0,%xmm4
793	movdqa	%xmm0,%xmm3
794	psllq	$5,%xmm0
795	pxor	%xmm0,%xmm3
796	psllq	$1,%xmm0
797	pxor	%xmm3,%xmm0
798	psllq	$57,%xmm0
799	movdqa	%xmm0,%xmm3
800	pslldq	$8,%xmm0
801	psrldq	$8,%xmm3
802	pxor	%xmm4,%xmm0
803	pxor	%xmm3,%xmm1
804
805
806	movdqa	%xmm0,%xmm4
807	psrlq	$1,%xmm0
808	pxor	%xmm4,%xmm1
809	pxor	%xmm0,%xmm4
810	psrlq	$5,%xmm0
811	pxor	%xmm4,%xmm0
812	psrlq	$1,%xmm0
813	pxor	%xmm1,%xmm0
814	movdqa	%xmm0,%xmm5
815	movdqa	%xmm0,%xmm1
816	pshufd	$78,%xmm0,%xmm3
817	pxor	%xmm0,%xmm3
818.byte	102,15,58,68,194,0
819.byte	102,15,58,68,202,17
820.byte	102,15,58,68,222,0
821	pxor	%xmm0,%xmm3
822	pxor	%xmm1,%xmm3
823
824	movdqa	%xmm3,%xmm4
825	psrldq	$8,%xmm3
826	pslldq	$8,%xmm4
827	pxor	%xmm3,%xmm1
828	pxor	%xmm4,%xmm0
829
830	movdqa	%xmm0,%xmm4
831	movdqa	%xmm0,%xmm3
832	psllq	$5,%xmm0
833	pxor	%xmm0,%xmm3
834	psllq	$1,%xmm0
835	pxor	%xmm3,%xmm0
836	psllq	$57,%xmm0
837	movdqa	%xmm0,%xmm3
838	pslldq	$8,%xmm0
839	psrldq	$8,%xmm3
840	pxor	%xmm4,%xmm0
841	pxor	%xmm3,%xmm1
842
843
844	movdqa	%xmm0,%xmm4
845	psrlq	$1,%xmm0
846	pxor	%xmm4,%xmm1
847	pxor	%xmm0,%xmm4
848	psrlq	$5,%xmm0
849	pxor	%xmm4,%xmm0
850	psrlq	$1,%xmm0
851	pxor	%xmm1,%xmm0
852	pshufd	$78,%xmm5,%xmm3
853	pshufd	$78,%xmm0,%xmm4
854	pxor	%xmm5,%xmm3
855	movdqu	%xmm5,48(%rdi)
856	pxor	%xmm0,%xmm4
857	movdqu	%xmm0,64(%rdi)
858.byte	102,15,58,15,227,8
859	movdqu	%xmm4,80(%rdi)
860	.byte	0xf3,0xc3
861.cfi_endproc
862.size	gcm_init_clmul,.-gcm_init_clmul
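/*
 * gcm_gmult_clmul(Xi, Htable) -- one GHASH multiplication via PCLMULQDQ:
 * byte-swap Xi with .Lbswap_mask, carry-less multiply by H, reduce modulo
 * the GHASH polynomial with shifts, swap back and store.
 */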
863.globl	gcm_gmult_clmul
864.type	gcm_gmult_clmul,@function
865.align	16
866gcm_gmult_clmul:
867.cfi_startproc
868.L_gmult_clmul:
869	movdqu	(%rdi),%xmm0
870	movdqa	.Lbswap_mask(%rip),%xmm5
871	movdqu	(%rsi),%xmm2
872	movdqu	32(%rsi),%xmm4
873.byte	102,15,56,0,197
874	movdqa	%xmm0,%xmm1
875	pshufd	$78,%xmm0,%xmm3
876	pxor	%xmm0,%xmm3
877.byte	102,15,58,68,194,0
878.byte	102,15,58,68,202,17
879.byte	102,15,58,68,220,0
880	pxor	%xmm0,%xmm3
881	pxor	%xmm1,%xmm3
882
883	movdqa	%xmm3,%xmm4
884	psrldq	$8,%xmm3
885	pslldq	$8,%xmm4
886	pxor	%xmm3,%xmm1
887	pxor	%xmm4,%xmm0
888
889	movdqa	%xmm0,%xmm4
890	movdqa	%xmm0,%xmm3
891	psllq	$5,%xmm0
892	pxor	%xmm0,%xmm3
893	psllq	$1,%xmm0
894	pxor	%xmm3,%xmm0
895	psllq	$57,%xmm0
896	movdqa	%xmm0,%xmm3
897	pslldq	$8,%xmm0
898	psrldq	$8,%xmm3
899	pxor	%xmm4,%xmm0
900	pxor	%xmm3,%xmm1
901
902
903	movdqa	%xmm0,%xmm4
904	psrlq	$1,%xmm0
905	pxor	%xmm4,%xmm1
906	pxor	%xmm0,%xmm4
907	psrlq	$5,%xmm0
908	pxor	%xmm4,%xmm0
909	psrlq	$1,%xmm0
910	pxor	%xmm1,%xmm0
911.byte	102,15,56,0,197
912	movdqu	%xmm0,(%rdi)
913	.byte	0xf3,0xc3
914.cfi_endproc
915.size	gcm_gmult_clmul,.-gcm_gmult_clmul
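/*
 * gcm_ghash_clmul(Xi, Htable, inp, len) -- bulk GHASH via PCLMULQDQ.
 * Processes four blocks per iteration in .Lmod4_loop when len allows and
 * the OPENSSL_ia32cap_P check does not divert to the one/two-block path
 * (.Lskip4x); the remainder is handled in .Leven_tail / .Lodd_tail.
 */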
916.globl	gcm_ghash_clmul
917.type	gcm_ghash_clmul,@function
918.align	32
919gcm_ghash_clmul:
920.cfi_startproc
921.L_ghash_clmul:
922	movdqa	.Lbswap_mask(%rip),%xmm10
923
924	movdqu	(%rdi),%xmm0
925	movdqu	(%rsi),%xmm2
926	movdqu	32(%rsi),%xmm7
927.byte	102,65,15,56,0,194
928
929	subq	$0x10,%rcx
930	jz	.Lodd_tail
931
932	movdqu	16(%rsi),%xmm6
933	movl	OPENSSL_ia32cap_P+4(%rip),%eax
934	cmpq	$0x30,%rcx
935	jb	.Lskip4x
936
937	andl	$71303168,%eax
938	cmpl	$4194304,%eax
939	je	.Lskip4x
940
941	subq	$0x30,%rcx
942	movq	$0xA040608020C0E000,%rax
943	movdqu	48(%rsi),%xmm14
944	movdqu	64(%rsi),%xmm15
945
946
947
948
949	movdqu	48(%rdx),%xmm3
950	movdqu	32(%rdx),%xmm11
951.byte	102,65,15,56,0,218
952.byte	102,69,15,56,0,218
953	movdqa	%xmm3,%xmm5
954	pshufd	$78,%xmm3,%xmm4
955	pxor	%xmm3,%xmm4
956.byte	102,15,58,68,218,0
957.byte	102,15,58,68,234,17
958.byte	102,15,58,68,231,0
959
960	movdqa	%xmm11,%xmm13
961	pshufd	$78,%xmm11,%xmm12
962	pxor	%xmm11,%xmm12
963.byte	102,68,15,58,68,222,0
964.byte	102,68,15,58,68,238,17
965.byte	102,68,15,58,68,231,16
966	xorps	%xmm11,%xmm3
967	xorps	%xmm13,%xmm5
968	movups	80(%rsi),%xmm7
969	xorps	%xmm12,%xmm4
970
971	movdqu	16(%rdx),%xmm11
972	movdqu	0(%rdx),%xmm8
973.byte	102,69,15,56,0,218
974.byte	102,69,15,56,0,194
975	movdqa	%xmm11,%xmm13
976	pshufd	$78,%xmm11,%xmm12
977	pxor	%xmm8,%xmm0
978	pxor	%xmm11,%xmm12
979.byte	102,69,15,58,68,222,0
980	movdqa	%xmm0,%xmm1
981	pshufd	$78,%xmm0,%xmm8
982	pxor	%xmm0,%xmm8
983.byte	102,69,15,58,68,238,17
984.byte	102,68,15,58,68,231,0
985	xorps	%xmm11,%xmm3
986	xorps	%xmm13,%xmm5
987
988	leaq	64(%rdx),%rdx
989	subq	$0x40,%rcx
990	jc	.Ltail4x
991
992	jmp	.Lmod4_loop
993.align	32
994.Lmod4_loop:
995.byte	102,65,15,58,68,199,0
996	xorps	%xmm12,%xmm4
997	movdqu	48(%rdx),%xmm11
998.byte	102,69,15,56,0,218
999.byte	102,65,15,58,68,207,17
1000	xorps	%xmm3,%xmm0
1001	movdqu	32(%rdx),%xmm3
1002	movdqa	%xmm11,%xmm13
1003.byte	102,68,15,58,68,199,16
1004	pshufd	$78,%xmm11,%xmm12
1005	xorps	%xmm5,%xmm1
1006	pxor	%xmm11,%xmm12
1007.byte	102,65,15,56,0,218
1008	movups	32(%rsi),%xmm7
1009	xorps	%xmm4,%xmm8
1010.byte	102,68,15,58,68,218,0
1011	pshufd	$78,%xmm3,%xmm4
1012
1013	pxor	%xmm0,%xmm8
1014	movdqa	%xmm3,%xmm5
1015	pxor	%xmm1,%xmm8
1016	pxor	%xmm3,%xmm4
1017	movdqa	%xmm8,%xmm9
1018.byte	102,68,15,58,68,234,17
1019	pslldq	$8,%xmm8
1020	psrldq	$8,%xmm9
1021	pxor	%xmm8,%xmm0
1022	movdqa	.L7_mask(%rip),%xmm8
1023	pxor	%xmm9,%xmm1
1024.byte	102,76,15,110,200
1025
1026	pand	%xmm0,%xmm8
1027.byte	102,69,15,56,0,200
1028	pxor	%xmm0,%xmm9
1029.byte	102,68,15,58,68,231,0
1030	psllq	$57,%xmm9
1031	movdqa	%xmm9,%xmm8
1032	pslldq	$8,%xmm9
1033.byte	102,15,58,68,222,0
1034	psrldq	$8,%xmm8
1035	pxor	%xmm9,%xmm0
1036	pxor	%xmm8,%xmm1
1037	movdqu	0(%rdx),%xmm8
1038
1039	movdqa	%xmm0,%xmm9
1040	psrlq	$1,%xmm0
1041.byte	102,15,58,68,238,17
1042	xorps	%xmm11,%xmm3
1043	movdqu	16(%rdx),%xmm11
1044.byte	102,69,15,56,0,218
1045.byte	102,15,58,68,231,16
1046	xorps	%xmm13,%xmm5
1047	movups	80(%rsi),%xmm7
1048.byte	102,69,15,56,0,194
1049	pxor	%xmm9,%xmm1
1050	pxor	%xmm0,%xmm9
1051	psrlq	$5,%xmm0
1052
1053	movdqa	%xmm11,%xmm13
1054	pxor	%xmm12,%xmm4
1055	pshufd	$78,%xmm11,%xmm12
1056	pxor	%xmm9,%xmm0
1057	pxor	%xmm8,%xmm1
1058	pxor	%xmm11,%xmm12
1059.byte	102,69,15,58,68,222,0
1060	psrlq	$1,%xmm0
1061	pxor	%xmm1,%xmm0
1062	movdqa	%xmm0,%xmm1
1063.byte	102,69,15,58,68,238,17
1064	xorps	%xmm11,%xmm3
1065	pshufd	$78,%xmm0,%xmm8
1066	pxor	%xmm0,%xmm8
1067
1068.byte	102,68,15,58,68,231,0
1069	xorps	%xmm13,%xmm5
1070
1071	leaq	64(%rdx),%rdx
1072	subq	$0x40,%rcx
1073	jnc	.Lmod4_loop
1074
1075.Ltail4x:
1076.byte	102,65,15,58,68,199,0
1077.byte	102,65,15,58,68,207,17
1078.byte	102,68,15,58,68,199,16
1079	xorps	%xmm12,%xmm4
1080	xorps	%xmm3,%xmm0
1081	xorps	%xmm5,%xmm1
1082	pxor	%xmm0,%xmm1
1083	pxor	%xmm4,%xmm8
1084
1085	pxor	%xmm1,%xmm8
1086	pxor	%xmm0,%xmm1
1087
1088	movdqa	%xmm8,%xmm9
1089	psrldq	$8,%xmm8
1090	pslldq	$8,%xmm9
1091	pxor	%xmm8,%xmm1
1092	pxor	%xmm9,%xmm0
1093
1094	movdqa	%xmm0,%xmm4
1095	movdqa	%xmm0,%xmm3
1096	psllq	$5,%xmm0
1097	pxor	%xmm0,%xmm3
1098	psllq	$1,%xmm0
1099	pxor	%xmm3,%xmm0
1100	psllq	$57,%xmm0
1101	movdqa	%xmm0,%xmm3
1102	pslldq	$8,%xmm0
1103	psrldq	$8,%xmm3
1104	pxor	%xmm4,%xmm0
1105	pxor	%xmm3,%xmm1
1106
1107
1108	movdqa	%xmm0,%xmm4
1109	psrlq	$1,%xmm0
1110	pxor	%xmm4,%xmm1
1111	pxor	%xmm0,%xmm4
1112	psrlq	$5,%xmm0
1113	pxor	%xmm4,%xmm0
1114	psrlq	$1,%xmm0
1115	pxor	%xmm1,%xmm0
1116	addq	$0x40,%rcx
1117	jz	.Ldone
1118	movdqu	32(%rsi),%xmm7
1119	subq	$0x10,%rcx
1120	jz	.Lodd_tail
1121.Lskip4x:
1122
1123
1124
1125
1126
1127	movdqu	(%rdx),%xmm8
1128	movdqu	16(%rdx),%xmm3
1129.byte	102,69,15,56,0,194
1130.byte	102,65,15,56,0,218
1131	pxor	%xmm8,%xmm0
1132
1133	movdqa	%xmm3,%xmm5
1134	pshufd	$78,%xmm3,%xmm4
1135	pxor	%xmm3,%xmm4
1136.byte	102,15,58,68,218,0
1137.byte	102,15,58,68,234,17
1138.byte	102,15,58,68,231,0
1139
1140	leaq	32(%rdx),%rdx
1141	nop
1142	subq	$0x20,%rcx
1143	jbe	.Leven_tail
1144	nop
1145	jmp	.Lmod_loop
1146
1147.align	32
1148.Lmod_loop:
1149	movdqa	%xmm0,%xmm1
1150	movdqa	%xmm4,%xmm8
1151	pshufd	$78,%xmm0,%xmm4
1152	pxor	%xmm0,%xmm4
1153
1154.byte	102,15,58,68,198,0
1155.byte	102,15,58,68,206,17
1156.byte	102,15,58,68,231,16
1157
1158	pxor	%xmm3,%xmm0
1159	pxor	%xmm5,%xmm1
1160	movdqu	(%rdx),%xmm9
1161	pxor	%xmm0,%xmm8
1162.byte	102,69,15,56,0,202
1163	movdqu	16(%rdx),%xmm3
1164
1165	pxor	%xmm1,%xmm8
1166	pxor	%xmm9,%xmm1
1167	pxor	%xmm8,%xmm4
1168.byte	102,65,15,56,0,218
1169	movdqa	%xmm4,%xmm8
1170	psrldq	$8,%xmm8
1171	pslldq	$8,%xmm4
1172	pxor	%xmm8,%xmm1
1173	pxor	%xmm4,%xmm0
1174
1175	movdqa	%xmm3,%xmm5
1176
1177	movdqa	%xmm0,%xmm9
1178	movdqa	%xmm0,%xmm8
1179	psllq	$5,%xmm0
1180	pxor	%xmm0,%xmm8
1181.byte	102,15,58,68,218,0
1182	psllq	$1,%xmm0
1183	pxor	%xmm8,%xmm0
1184	psllq	$57,%xmm0
1185	movdqa	%xmm0,%xmm8
1186	pslldq	$8,%xmm0
1187	psrldq	$8,%xmm8
1188	pxor	%xmm9,%xmm0
1189	pshufd	$78,%xmm5,%xmm4
1190	pxor	%xmm8,%xmm1
1191	pxor	%xmm5,%xmm4
1192
1193	movdqa	%xmm0,%xmm9
1194	psrlq	$1,%xmm0
1195.byte	102,15,58,68,234,17
1196	pxor	%xmm9,%xmm1
1197	pxor	%xmm0,%xmm9
1198	psrlq	$5,%xmm0
1199	pxor	%xmm9,%xmm0
1200	leaq	32(%rdx),%rdx
1201	psrlq	$1,%xmm0
1202.byte	102,15,58,68,231,0
1203	pxor	%xmm1,%xmm0
1204
1205	subq	$0x20,%rcx
1206	ja	.Lmod_loop
1207
1208.Leven_tail:
1209	movdqa	%xmm0,%xmm1
1210	movdqa	%xmm4,%xmm8
1211	pshufd	$78,%xmm0,%xmm4
1212	pxor	%xmm0,%xmm4
1213
1214.byte	102,15,58,68,198,0
1215.byte	102,15,58,68,206,17
1216.byte	102,15,58,68,231,16
1217
1218	pxor	%xmm3,%xmm0
1219	pxor	%xmm5,%xmm1
1220	pxor	%xmm0,%xmm8
1221	pxor	%xmm1,%xmm8
1222	pxor	%xmm8,%xmm4
1223	movdqa	%xmm4,%xmm8
1224	psrldq	$8,%xmm8
1225	pslldq	$8,%xmm4
1226	pxor	%xmm8,%xmm1
1227	pxor	%xmm4,%xmm0
1228
1229	movdqa	%xmm0,%xmm4
1230	movdqa	%xmm0,%xmm3
1231	psllq	$5,%xmm0
1232	pxor	%xmm0,%xmm3
1233	psllq	$1,%xmm0
1234	pxor	%xmm3,%xmm0
1235	psllq	$57,%xmm0
1236	movdqa	%xmm0,%xmm3
1237	pslldq	$8,%xmm0
1238	psrldq	$8,%xmm3
1239	pxor	%xmm4,%xmm0
1240	pxor	%xmm3,%xmm1
1241
1242
1243	movdqa	%xmm0,%xmm4
1244	psrlq	$1,%xmm0
1245	pxor	%xmm4,%xmm1
1246	pxor	%xmm0,%xmm4
1247	psrlq	$5,%xmm0
1248	pxor	%xmm4,%xmm0
1249	psrlq	$1,%xmm0
1250	pxor	%xmm1,%xmm0
1251	testq	%rcx,%rcx
1252	jnz	.Ldone
1253
1254.Lodd_tail:
1255	movdqu	(%rdx),%xmm8
1256.byte	102,69,15,56,0,194
1257	pxor	%xmm8,%xmm0
1258	movdqa	%xmm0,%xmm1
1259	pshufd	$78,%xmm0,%xmm3
1260	pxor	%xmm0,%xmm3
1261.byte	102,15,58,68,194,0
1262.byte	102,15,58,68,202,17
1263.byte	102,15,58,68,223,0
1264	pxor	%xmm0,%xmm3
1265	pxor	%xmm1,%xmm3
1266
1267	movdqa	%xmm3,%xmm4
1268	psrldq	$8,%xmm3
1269	pslldq	$8,%xmm4
1270	pxor	%xmm3,%xmm1
1271	pxor	%xmm4,%xmm0
1272
1273	movdqa	%xmm0,%xmm4
1274	movdqa	%xmm0,%xmm3
1275	psllq	$5,%xmm0
1276	pxor	%xmm0,%xmm3
1277	psllq	$1,%xmm0
1278	pxor	%xmm3,%xmm0
1279	psllq	$57,%xmm0
1280	movdqa	%xmm0,%xmm3
1281	pslldq	$8,%xmm0
1282	psrldq	$8,%xmm3
1283	pxor	%xmm4,%xmm0
1284	pxor	%xmm3,%xmm1
1285
1286
1287	movdqa	%xmm0,%xmm4
1288	psrlq	$1,%xmm0
1289	pxor	%xmm4,%xmm1
1290	pxor	%xmm0,%xmm4
1291	psrlq	$5,%xmm0
1292	pxor	%xmm4,%xmm0
1293	psrlq	$1,%xmm0
1294	pxor	%xmm1,%xmm0
1295.Ldone:
1296.byte	102,65,15,56,0,194
1297	movdqu	%xmm0,(%rdi)
1298	.byte	0xf3,0xc3
1299.cfi_endproc
1300.size	gcm_ghash_clmul,.-gcm_ghash_clmul
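/*
 * gcm_init_avx(Htable, Xi) -- AVX variant of the table setup: precomputes
 * H^1..H^8 plus Karatsuba-combined values for gcm_ghash_avx; %rdi = Htable,
 * %rsi = H.
 */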
1301.globl	gcm_init_avx
1302.type	gcm_init_avx,@function
1303.align	32
1304gcm_init_avx:
1305.cfi_startproc
1306	vzeroupper
1307
1308	vmovdqu	(%rsi),%xmm2
1309	vpshufd	$78,%xmm2,%xmm2
1310
1311
1312	vpshufd	$255,%xmm2,%xmm4
1313	vpsrlq	$63,%xmm2,%xmm3
1314	vpsllq	$1,%xmm2,%xmm2
1315	vpxor	%xmm5,%xmm5,%xmm5
1316	vpcmpgtd	%xmm4,%xmm5,%xmm5
1317	vpslldq	$8,%xmm3,%xmm3
1318	vpor	%xmm3,%xmm2,%xmm2
1319
1320
1321	vpand	.L0x1c2_polynomial(%rip),%xmm5,%xmm5
1322	vpxor	%xmm5,%xmm2,%xmm2
1323
1324	vpunpckhqdq	%xmm2,%xmm2,%xmm6
1325	vmovdqa	%xmm2,%xmm0
1326	vpxor	%xmm2,%xmm6,%xmm6
1327	movq	$4,%r10
1328	jmp	.Linit_start_avx
1329.align	32
1330.Linit_loop_avx:
1331	vpalignr	$8,%xmm3,%xmm4,%xmm5
1332	vmovdqu	%xmm5,-16(%rdi)
1333	vpunpckhqdq	%xmm0,%xmm0,%xmm3
1334	vpxor	%xmm0,%xmm3,%xmm3
1335	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
1336	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
1337	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
1338	vpxor	%xmm0,%xmm1,%xmm4
1339	vpxor	%xmm4,%xmm3,%xmm3
1340
1341	vpslldq	$8,%xmm3,%xmm4
1342	vpsrldq	$8,%xmm3,%xmm3
1343	vpxor	%xmm4,%xmm0,%xmm0
1344	vpxor	%xmm3,%xmm1,%xmm1
1345	vpsllq	$57,%xmm0,%xmm3
1346	vpsllq	$62,%xmm0,%xmm4
1347	vpxor	%xmm3,%xmm4,%xmm4
1348	vpsllq	$63,%xmm0,%xmm3
1349	vpxor	%xmm3,%xmm4,%xmm4
1350	vpslldq	$8,%xmm4,%xmm3
1351	vpsrldq	$8,%xmm4,%xmm4
1352	vpxor	%xmm3,%xmm0,%xmm0
1353	vpxor	%xmm4,%xmm1,%xmm1
1354
1355	vpsrlq	$1,%xmm0,%xmm4
1356	vpxor	%xmm0,%xmm1,%xmm1
1357	vpxor	%xmm4,%xmm0,%xmm0
1358	vpsrlq	$5,%xmm4,%xmm4
1359	vpxor	%xmm4,%xmm0,%xmm0
1360	vpsrlq	$1,%xmm0,%xmm0
1361	vpxor	%xmm1,%xmm0,%xmm0
1362.Linit_start_avx:
1363	vmovdqa	%xmm0,%xmm5
1364	vpunpckhqdq	%xmm0,%xmm0,%xmm3
1365	vpxor	%xmm0,%xmm3,%xmm3
1366	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
1367	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
1368	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
1369	vpxor	%xmm0,%xmm1,%xmm4
1370	vpxor	%xmm4,%xmm3,%xmm3
1371
1372	vpslldq	$8,%xmm3,%xmm4
1373	vpsrldq	$8,%xmm3,%xmm3
1374	vpxor	%xmm4,%xmm0,%xmm0
1375	vpxor	%xmm3,%xmm1,%xmm1
1376	vpsllq	$57,%xmm0,%xmm3
1377	vpsllq	$62,%xmm0,%xmm4
1378	vpxor	%xmm3,%xmm4,%xmm4
1379	vpsllq	$63,%xmm0,%xmm3
1380	vpxor	%xmm3,%xmm4,%xmm4
1381	vpslldq	$8,%xmm4,%xmm3
1382	vpsrldq	$8,%xmm4,%xmm4
1383	vpxor	%xmm3,%xmm0,%xmm0
1384	vpxor	%xmm4,%xmm1,%xmm1
1385
1386	vpsrlq	$1,%xmm0,%xmm4
1387	vpxor	%xmm0,%xmm1,%xmm1
1388	vpxor	%xmm4,%xmm0,%xmm0
1389	vpsrlq	$5,%xmm4,%xmm4
1390	vpxor	%xmm4,%xmm0,%xmm0
1391	vpsrlq	$1,%xmm0,%xmm0
1392	vpxor	%xmm1,%xmm0,%xmm0
1393	vpshufd	$78,%xmm5,%xmm3
1394	vpshufd	$78,%xmm0,%xmm4
1395	vpxor	%xmm5,%xmm3,%xmm3
1396	vmovdqu	%xmm5,0(%rdi)
1397	vpxor	%xmm0,%xmm4,%xmm4
1398	vmovdqu	%xmm0,16(%rdi)
1399	leaq	48(%rdi),%rdi
1400	subq	$1,%r10
1401	jnz	.Linit_loop_avx
1402
1403	vpalignr	$8,%xmm4,%xmm3,%xmm5
1404	vmovdqu	%xmm5,-16(%rdi)
1405
1406	vzeroupper
1407	.byte	0xf3,0xc3
1408.cfi_endproc
1409.size	gcm_init_avx,.-gcm_init_avx
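/* gcm_gmult_avx -- same operation as gcm_gmult_clmul; just jumps to .L_gmult_clmul. */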
1410.globl	gcm_gmult_avx
1411.type	gcm_gmult_avx,@function
1412.align	32
1413gcm_gmult_avx:
1414.cfi_startproc
1415	jmp	.L_gmult_clmul
1416.cfi_endproc
1417.size	gcm_gmult_avx,.-gcm_gmult_avx
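/*
 * gcm_ghash_avx(Xi, Htable, inp, len) -- bulk GHASH using AVX encodings of
 * PCLMULQDQ: eight blocks per iteration in .Loop8x_avx with a deferred
 * reduction against .L0x1c2_polynomial (via %r10); inputs shorter than
 * 128 bytes go through .Lshort_avx / .Ltail_avx.
 */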
1418.globl	gcm_ghash_avx
1419.type	gcm_ghash_avx,@function
1420.align	32
1421gcm_ghash_avx:
1422.cfi_startproc
1423	vzeroupper
1424
1425	vmovdqu	(%rdi),%xmm10
1426	leaq	.L0x1c2_polynomial(%rip),%r10
1427	leaq	64(%rsi),%rsi
1428	vmovdqu	.Lbswap_mask(%rip),%xmm13
1429	vpshufb	%xmm13,%xmm10,%xmm10
1430	cmpq	$0x80,%rcx
1431	jb	.Lshort_avx
1432	subq	$0x80,%rcx
1433
1434	vmovdqu	112(%rdx),%xmm14
1435	vmovdqu	0-64(%rsi),%xmm6
1436	vpshufb	%xmm13,%xmm14,%xmm14
1437	vmovdqu	32-64(%rsi),%xmm7
1438
1439	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1440	vmovdqu	96(%rdx),%xmm15
1441	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1442	vpxor	%xmm14,%xmm9,%xmm9
1443	vpshufb	%xmm13,%xmm15,%xmm15
1444	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1445	vmovdqu	16-64(%rsi),%xmm6
1446	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1447	vmovdqu	80(%rdx),%xmm14
1448	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1449	vpxor	%xmm15,%xmm8,%xmm8
1450
1451	vpshufb	%xmm13,%xmm14,%xmm14
1452	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1453	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1454	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1455	vmovdqu	48-64(%rsi),%xmm6
1456	vpxor	%xmm14,%xmm9,%xmm9
1457	vmovdqu	64(%rdx),%xmm15
1458	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1459	vmovdqu	80-64(%rsi),%xmm7
1460
1461	vpshufb	%xmm13,%xmm15,%xmm15
1462	vpxor	%xmm0,%xmm3,%xmm3
1463	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1464	vpxor	%xmm1,%xmm4,%xmm4
1465	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1466	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1467	vmovdqu	64-64(%rsi),%xmm6
1468	vpxor	%xmm2,%xmm5,%xmm5
1469	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1470	vpxor	%xmm15,%xmm8,%xmm8
1471
1472	vmovdqu	48(%rdx),%xmm14
1473	vpxor	%xmm3,%xmm0,%xmm0
1474	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1475	vpxor	%xmm4,%xmm1,%xmm1
1476	vpshufb	%xmm13,%xmm14,%xmm14
1477	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1478	vmovdqu	96-64(%rsi),%xmm6
1479	vpxor	%xmm5,%xmm2,%xmm2
1480	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1481	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1482	vmovdqu	128-64(%rsi),%xmm7
1483	vpxor	%xmm14,%xmm9,%xmm9
1484
1485	vmovdqu	32(%rdx),%xmm15
1486	vpxor	%xmm0,%xmm3,%xmm3
1487	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1488	vpxor	%xmm1,%xmm4,%xmm4
1489	vpshufb	%xmm13,%xmm15,%xmm15
1490	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1491	vmovdqu	112-64(%rsi),%xmm6
1492	vpxor	%xmm2,%xmm5,%xmm5
1493	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1494	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1495	vpxor	%xmm15,%xmm8,%xmm8
1496
1497	vmovdqu	16(%rdx),%xmm14
1498	vpxor	%xmm3,%xmm0,%xmm0
1499	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1500	vpxor	%xmm4,%xmm1,%xmm1
1501	vpshufb	%xmm13,%xmm14,%xmm14
1502	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1503	vmovdqu	144-64(%rsi),%xmm6
1504	vpxor	%xmm5,%xmm2,%xmm2
1505	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1506	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1507	vmovdqu	176-64(%rsi),%xmm7
1508	vpxor	%xmm14,%xmm9,%xmm9
1509
1510	vmovdqu	(%rdx),%xmm15
1511	vpxor	%xmm0,%xmm3,%xmm3
1512	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1513	vpxor	%xmm1,%xmm4,%xmm4
1514	vpshufb	%xmm13,%xmm15,%xmm15
1515	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1516	vmovdqu	160-64(%rsi),%xmm6
1517	vpxor	%xmm2,%xmm5,%xmm5
1518	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
1519
1520	leaq	128(%rdx),%rdx
1521	cmpq	$0x80,%rcx
1522	jb	.Ltail_avx
1523
1524	vpxor	%xmm10,%xmm15,%xmm15
1525	subq	$0x80,%rcx
1526	jmp	.Loop8x_avx
1527
1528.align	32
1529.Loop8x_avx:
1530	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1531	vmovdqu	112(%rdx),%xmm14
1532	vpxor	%xmm0,%xmm3,%xmm3
1533	vpxor	%xmm15,%xmm8,%xmm8
1534	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm10
1535	vpshufb	%xmm13,%xmm14,%xmm14
1536	vpxor	%xmm1,%xmm4,%xmm4
1537	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm11
1538	vmovdqu	0-64(%rsi),%xmm6
1539	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1540	vpxor	%xmm2,%xmm5,%xmm5
1541	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm12
1542	vmovdqu	32-64(%rsi),%xmm7
1543	vpxor	%xmm14,%xmm9,%xmm9
1544
1545	vmovdqu	96(%rdx),%xmm15
1546	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1547	vpxor	%xmm3,%xmm10,%xmm10
1548	vpshufb	%xmm13,%xmm15,%xmm15
1549	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1550	vxorps	%xmm4,%xmm11,%xmm11
1551	vmovdqu	16-64(%rsi),%xmm6
1552	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1553	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1554	vpxor	%xmm5,%xmm12,%xmm12
1555	vxorps	%xmm15,%xmm8,%xmm8
1556
1557	vmovdqu	80(%rdx),%xmm14
1558	vpxor	%xmm10,%xmm12,%xmm12
1559	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1560	vpxor	%xmm11,%xmm12,%xmm12
1561	vpslldq	$8,%xmm12,%xmm9
1562	vpxor	%xmm0,%xmm3,%xmm3
1563	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1564	vpsrldq	$8,%xmm12,%xmm12
1565	vpxor	%xmm9,%xmm10,%xmm10
1566	vmovdqu	48-64(%rsi),%xmm6
1567	vpshufb	%xmm13,%xmm14,%xmm14
1568	vxorps	%xmm12,%xmm11,%xmm11
1569	vpxor	%xmm1,%xmm4,%xmm4
1570	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1571	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1572	vmovdqu	80-64(%rsi),%xmm7
1573	vpxor	%xmm14,%xmm9,%xmm9
1574	vpxor	%xmm2,%xmm5,%xmm5
1575
1576	vmovdqu	64(%rdx),%xmm15
1577	vpalignr	$8,%xmm10,%xmm10,%xmm12
1578	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1579	vpshufb	%xmm13,%xmm15,%xmm15
1580	vpxor	%xmm3,%xmm0,%xmm0
1581	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1582	vmovdqu	64-64(%rsi),%xmm6
1583	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1584	vpxor	%xmm4,%xmm1,%xmm1
1585	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1586	vxorps	%xmm15,%xmm8,%xmm8
1587	vpxor	%xmm5,%xmm2,%xmm2
1588
1589	vmovdqu	48(%rdx),%xmm14
1590	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
1591	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1592	vpshufb	%xmm13,%xmm14,%xmm14
1593	vpxor	%xmm0,%xmm3,%xmm3
1594	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1595	vmovdqu	96-64(%rsi),%xmm6
1596	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1597	vpxor	%xmm1,%xmm4,%xmm4
1598	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1599	vmovdqu	128-64(%rsi),%xmm7
1600	vpxor	%xmm14,%xmm9,%xmm9
1601	vpxor	%xmm2,%xmm5,%xmm5
1602
1603	vmovdqu	32(%rdx),%xmm15
1604	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1605	vpshufb	%xmm13,%xmm15,%xmm15
1606	vpxor	%xmm3,%xmm0,%xmm0
1607	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1608	vmovdqu	112-64(%rsi),%xmm6
1609	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1610	vpxor	%xmm4,%xmm1,%xmm1
1611	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1612	vpxor	%xmm15,%xmm8,%xmm8
1613	vpxor	%xmm5,%xmm2,%xmm2
1614	vxorps	%xmm12,%xmm10,%xmm10
1615
1616	vmovdqu	16(%rdx),%xmm14
1617	vpalignr	$8,%xmm10,%xmm10,%xmm12
1618	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1619	vpshufb	%xmm13,%xmm14,%xmm14
1620	vpxor	%xmm0,%xmm3,%xmm3
1621	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1622	vmovdqu	144-64(%rsi),%xmm6
1623	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
1624	vxorps	%xmm11,%xmm12,%xmm12
1625	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1626	vpxor	%xmm1,%xmm4,%xmm4
1627	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1628	vmovdqu	176-64(%rsi),%xmm7
1629	vpxor	%xmm14,%xmm9,%xmm9
1630	vpxor	%xmm2,%xmm5,%xmm5
1631
1632	vmovdqu	(%rdx),%xmm15
1633	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1634	vpshufb	%xmm13,%xmm15,%xmm15
1635	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1636	vmovdqu	160-64(%rsi),%xmm6
1637	vpxor	%xmm12,%xmm15,%xmm15
1638	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
1639	vpxor	%xmm10,%xmm15,%xmm15
1640
1641	leaq	128(%rdx),%rdx
1642	subq	$0x80,%rcx
1643	jnc	.Loop8x_avx
1644
1645	addq	$0x80,%rcx
1646	jmp	.Ltail_no_xor_avx
1647
1648.align	32
1649.Lshort_avx:
1650	vmovdqu	-16(%rdx,%rcx,1),%xmm14
1651	leaq	(%rdx,%rcx,1),%rdx
1652	vmovdqu	0-64(%rsi),%xmm6
1653	vmovdqu	32-64(%rsi),%xmm7
1654	vpshufb	%xmm13,%xmm14,%xmm15
1655
1656	vmovdqa	%xmm0,%xmm3
1657	vmovdqa	%xmm1,%xmm4
1658	vmovdqa	%xmm2,%xmm5
1659	subq	$0x10,%rcx
1660	jz	.Ltail_avx
1661
1662	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1663	vpxor	%xmm0,%xmm3,%xmm3
1664	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1665	vpxor	%xmm15,%xmm8,%xmm8
1666	vmovdqu	-32(%rdx),%xmm14
1667	vpxor	%xmm1,%xmm4,%xmm4
1668	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1669	vmovdqu	16-64(%rsi),%xmm6
1670	vpshufb	%xmm13,%xmm14,%xmm15
1671	vpxor	%xmm2,%xmm5,%xmm5
1672	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1673	vpsrldq	$8,%xmm7,%xmm7
1674	subq	$0x10,%rcx
1675	jz	.Ltail_avx
1676
1677	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1678	vpxor	%xmm0,%xmm3,%xmm3
1679	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1680	vpxor	%xmm15,%xmm8,%xmm8
1681	vmovdqu	-48(%rdx),%xmm14
1682	vpxor	%xmm1,%xmm4,%xmm4
1683	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1684	vmovdqu	48-64(%rsi),%xmm6
1685	vpshufb	%xmm13,%xmm14,%xmm15
1686	vpxor	%xmm2,%xmm5,%xmm5
1687	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1688	vmovdqu	80-64(%rsi),%xmm7
1689	subq	$0x10,%rcx
1690	jz	.Ltail_avx
1691
1692	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1693	vpxor	%xmm0,%xmm3,%xmm3
1694	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1695	vpxor	%xmm15,%xmm8,%xmm8
1696	vmovdqu	-64(%rdx),%xmm14
1697	vpxor	%xmm1,%xmm4,%xmm4
1698	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1699	vmovdqu	64-64(%rsi),%xmm6
1700	vpshufb	%xmm13,%xmm14,%xmm15
1701	vpxor	%xmm2,%xmm5,%xmm5
1702	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1703	vpsrldq	$8,%xmm7,%xmm7
1704	subq	$0x10,%rcx
1705	jz	.Ltail_avx
1706
1707	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1708	vpxor	%xmm0,%xmm3,%xmm3
1709	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1710	vpxor	%xmm15,%xmm8,%xmm8
1711	vmovdqu	-80(%rdx),%xmm14
1712	vpxor	%xmm1,%xmm4,%xmm4
1713	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1714	vmovdqu	96-64(%rsi),%xmm6
1715	vpshufb	%xmm13,%xmm14,%xmm15
1716	vpxor	%xmm2,%xmm5,%xmm5
1717	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1718	vmovdqu	128-64(%rsi),%xmm7
1719	subq	$0x10,%rcx
1720	jz	.Ltail_avx
1721
1722	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1723	vpxor	%xmm0,%xmm3,%xmm3
1724	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1725	vpxor	%xmm15,%xmm8,%xmm8
1726	vmovdqu	-96(%rdx),%xmm14
1727	vpxor	%xmm1,%xmm4,%xmm4
1728	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1729	vmovdqu	112-64(%rsi),%xmm6
1730	vpshufb	%xmm13,%xmm14,%xmm15
1731	vpxor	%xmm2,%xmm5,%xmm5
1732	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1733	vpsrldq	$8,%xmm7,%xmm7
1734	subq	$0x10,%rcx
1735	jz	.Ltail_avx
1736
1737	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1738	vpxor	%xmm0,%xmm3,%xmm3
1739	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1740	vpxor	%xmm15,%xmm8,%xmm8
1741	vmovdqu	-112(%rdx),%xmm14
1742	vpxor	%xmm1,%xmm4,%xmm4
1743	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1744	vmovdqu	144-64(%rsi),%xmm6
1745	vpshufb	%xmm13,%xmm14,%xmm15
1746	vpxor	%xmm2,%xmm5,%xmm5
1747	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1748	vmovq	184-64(%rsi),%xmm7
1749	subq	$0x10,%rcx
1750	jmp	.Ltail_avx
1751
1752.align	32
1753.Ltail_avx:
1754	vpxor	%xmm10,%xmm15,%xmm15
1755.Ltail_no_xor_avx:
1756	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1757	vpxor	%xmm0,%xmm3,%xmm3
1758	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1759	vpxor	%xmm15,%xmm8,%xmm8
1760	vpxor	%xmm1,%xmm4,%xmm4
1761	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1762	vpxor	%xmm2,%xmm5,%xmm5
1763	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1764
1765	vmovdqu	(%r10),%xmm12
1766
1767	vpxor	%xmm0,%xmm3,%xmm10
1768	vpxor	%xmm1,%xmm4,%xmm11
1769	vpxor	%xmm2,%xmm5,%xmm5
1770
1771	vpxor	%xmm10,%xmm5,%xmm5
1772	vpxor	%xmm11,%xmm5,%xmm5
1773	vpslldq	$8,%xmm5,%xmm9
1774	vpsrldq	$8,%xmm5,%xmm5
1775	vpxor	%xmm9,%xmm10,%xmm10
1776	vpxor	%xmm5,%xmm11,%xmm11
1777
1778	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
1779	vpalignr	$8,%xmm10,%xmm10,%xmm10
1780	vpxor	%xmm9,%xmm10,%xmm10
1781
1782	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
1783	vpalignr	$8,%xmm10,%xmm10,%xmm10
1784	vpxor	%xmm11,%xmm10,%xmm10
1785	vpxor	%xmm9,%xmm10,%xmm10
1786
1787	cmpq	$0,%rcx
1788	jne	.Lshort_avx
1789
1790	vpshufb	%xmm13,%xmm10,%xmm10
1791	vmovdqu	%xmm10,(%rdi)
1792	vzeroupper
1793	.byte	0xf3,0xc3
1794.cfi_endproc
1795.size	gcm_ghash_avx,.-gcm_ghash_avx
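/*
 * Constant data: .Lbswap_mask (pshufb byte-reversal mask),
 * .L0x1c2_polynomial (GHASH reduction constant), .L7_mask, and the
 * .Lrem_4bit / .Lrem_8bit reduction tables used by the non-CLMUL code,
 * followed by the CRYPTOGAMS attribution string.
 */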
1796.align	64
1797.Lbswap_mask:
1798.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
1799.L0x1c2_polynomial:
1800.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
1801.L7_mask:
1802.long	7,0,7,0
1803.L7_mask_poly:
1804.long	7,0,450,0
1805.align	64
1806.type	.Lrem_4bit,@object
1807.Lrem_4bit:
1808.long	0,0,0,471859200,0,943718400,0,610271232
1809.long	0,1887436800,0,1822425088,0,1220542464,0,1423966208
1810.long	0,3774873600,0,4246732800,0,3644850176,0,3311403008
1811.long	0,2441084928,0,2376073216,0,2847932416,0,3051356160
1812.type	.Lrem_8bit,@object
1813.Lrem_8bit:
1814.value	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
1815.value	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
1816.value	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
1817.value	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
1818.value	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
1819.value	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
1820.value	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
1821.value	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
1822.value	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
1823.value	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
1824.value	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
1825.value	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
1826.value	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
1827.value	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
1828.value	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
1829.value	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
1830.value	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
1831.value	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
1832.value	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
1833.value	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
1834.value	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
1835.value	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
1836.value	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
1837.value	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
1838.value	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
1839.value	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
1840.value	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
1841.value	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
1842.value	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
1843.value	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
1844.value	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
1845.value	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
1846
1847.byte	71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1848.align	64
1849