xref: /freebsd/sys/crypto/openssl/amd64/ghash-x86_64.S (revision 6fa42b91ca3f481912af98c4d49c44507eb1b8e1)
1/* Do not modify. This file is auto-generated from ghash-x86_64.pl. */
2.text
3
4
/*
 * gcm_gmult_4bit -- table-driven multiply, 4 bits at a time.
 *
 * In:   %rdi -> 16-byte value; read byte-by-byte (offset 15 down to 0)
 *               and overwritten with the result.
 *       %rsi -> table of 16-byte entries, indexed by (nibble << 4).
 * Uses: %rax %rbx %rcx %rdx %r8 %r9 %r10 %r11 and flags.
 *       .Lrem_4bit (defined outside this block) is read with 8-byte
 *       entries indexed by the 4 bits shifted out of the accumulator.
 *
 * (In OpenSSL's C prototype the arguments are presumably Xi and Htable;
 * the prototype is not visible in this file.)
 *
 * The prologue pushes all six callee-saved registers and reserves 280
 * bytes, but no instruction in this block writes %rbp or %r12-%r15 or
 * touches the reserved stack area, so the epilogue only reloads %rbx
 * and then moves %rsp straight past all six save slots.
 */
5.globl	gcm_gmult_4bit
6.type	gcm_gmult_4bit,@function
7.align	16
8gcm_gmult_4bit:
9.cfi_startproc
/* bytes F3 0F 1E FA = endbr64 */
10.byte	243,15,30,250
11	pushq	%rbx
12.cfi_adjust_cfa_offset	8
13.cfi_offset	%rbx,-16
14	pushq	%rbp
15.cfi_adjust_cfa_offset	8
16.cfi_offset	%rbp,-24
17	pushq	%r12
18.cfi_adjust_cfa_offset	8
19.cfi_offset	%r12,-32
20	pushq	%r13
21.cfi_adjust_cfa_offset	8
22.cfi_offset	%r13,-40
23	pushq	%r14
24.cfi_adjust_cfa_offset	8
25.cfi_offset	%r14,-48
26	pushq	%r15
27.cfi_adjust_cfa_offset	8
28.cfi_offset	%r15,-56
29	subq	$280,%rsp
30.cfi_adjust_cfa_offset	280
31.Lgmult_prologue:
32
/*
 * Start with the last byte (offset 15): %al = low nibble << 4 and
 * %bl = high nibble << 4 (after the andb below), both used as byte
 * offsets into the table.  %r8 (from entry+8) and %r9 (from entry+0)
 * form the 128-bit accumulator; %rcx = 14 is the next byte index.
 */
33	movzbq	15(%rdi),%r8
34	leaq	.Lrem_4bit(%rip),%r11
35	xorq	%rax,%rax
36	xorq	%rbx,%rbx
37	movb	%r8b,%al
38	movb	%r8b,%bl
39	shlb	$4,%al
40	movq	$14,%rcx
41	movq	8(%rsi,%rax,1),%r8
42	movq	(%rsi,%rax,1),%r9
43	andb	$0xf0,%bl
44	movq	%r8,%rdx
45	jmp	.Loop1
46
47.align	16
/*
 * Each pass handles two nibbles: shift the accumulator right by 4,
 * fold in the .Lrem_4bit entry for the 4 bits that dropped out, and
 * xor in a table entry.  The first half uses the pending high nibble
 * (%rbx) and fetches the next input byte; the second half uses that
 * byte's low nibble (%rax).
 */
48.Loop1:
49	shrq	$4,%r8
50	andq	$0xf,%rdx
51	movq	%r9,%r10
52	movb	(%rdi,%rcx,1),%al
53	shrq	$4,%r9
54	xorq	8(%rsi,%rbx,1),%r8
55	shlq	$60,%r10
56	xorq	(%rsi,%rbx,1),%r9
57	movb	%al,%bl
58	xorq	(%r11,%rdx,8),%r9
59	movq	%r8,%rdx
60	shlb	$4,%al
61	xorq	%r10,%r8
/* leave once byte 0 has been fetched (index goes negative) */
62	decq	%rcx
63	js	.Lbreak1
64
65	shrq	$4,%r8
66	andq	$0xf,%rdx
67	movq	%r9,%r10
68	shrq	$4,%r9
69	xorq	8(%rsi,%rax,1),%r8
70	shlq	$60,%r10
71	xorq	(%rsi,%rax,1),%r9
72	andb	$0xf0,%bl
73	xorq	(%r11,%rdx,8),%r9
74	movq	%r8,%rdx
75	xorq	%r10,%r8
76	jmp	.Loop1
77
78.align	16
/* Final byte: low nibble (%rax) step, then high nibble (%rbx) step. */
79.Lbreak1:
80	shrq	$4,%r8
81	andq	$0xf,%rdx
82	movq	%r9,%r10
83	shrq	$4,%r9
84	xorq	8(%rsi,%rax,1),%r8
85	shlq	$60,%r10
86	xorq	(%rsi,%rax,1),%r9
87	andb	$0xf0,%bl
88	xorq	(%r11,%rdx,8),%r9
89	movq	%r8,%rdx
90	xorq	%r10,%r8
91
92	shrq	$4,%r8
93	andq	$0xf,%rdx
94	movq	%r9,%r10
95	shrq	$4,%r9
96	xorq	8(%rsi,%rbx,1),%r8
97	shlq	$60,%r10
98	xorq	(%rsi,%rbx,1),%r9
99	xorq	%r10,%r8
100	xorq	(%r11,%rdx,8),%r9
101
/* byte-reverse both halves and write the result back in place */
102	bswapq	%r8
103	bswapq	%r9
104	movq	%r8,8(%rdi)
105	movq	%r9,(%rdi)
106
/* %rsi = %rsp as it was on entry (280 locals + 6 pushes = 280+48) */
107	leaq	280+48(%rsp),%rsi
108.cfi_def_cfa	%rsi,8
109	movq	-8(%rsi),%rbx
110.cfi_restore	%rbx
111	leaq	(%rsi),%rsp
112.cfi_def_cfa_register	%rsp
113.Lgmult_epilogue:
/* bytes F3 C3 = "rep ret" */
114	.byte	0xf3,0xc3
115.cfi_endproc
116.size	gcm_gmult_4bit,.-gcm_gmult_4bit
/*
 * gcm_ghash_4bit -- table-driven hash update over a run of 16-byte blocks.
 *
 * In:   %rdi -> 16-byte running state, read and updated in place
 *       %rsi -> table of sixteen 16-byte entries
 *       %rdx -> input data
 *       %rcx =  input length in bytes
 * Uses: all six callee-saved registers (saved and restored here), 272 of
 *       the 280 reserved stack bytes, and .Lrem_8bit (defined outside
 *       this block; read with 2-byte entries).
 *
 * The block loop tests its bound at the bottom, so one block is always
 * consumed; callers presumably pass a non-zero multiple of 16 -- verify
 * against the C caller.
 */
117.globl	gcm_ghash_4bit
118.type	gcm_ghash_4bit,@function
119.align	16
120gcm_ghash_4bit:
121.cfi_startproc
/* bytes F3 0F 1E FA = endbr64 */
122.byte	243,15,30,250
123	pushq	%rbx
124.cfi_adjust_cfa_offset	8
125.cfi_offset	%rbx,-16
126	pushq	%rbp
127.cfi_adjust_cfa_offset	8
128.cfi_offset	%rbp,-24
129	pushq	%r12
130.cfi_adjust_cfa_offset	8
131.cfi_offset	%r12,-32
132	pushq	%r13
133.cfi_adjust_cfa_offset	8
134.cfi_offset	%r13,-40
135	pushq	%r14
136.cfi_adjust_cfa_offset	8
137.cfi_offset	%r14,-48
138	pushq	%r15
139.cfi_adjust_cfa_offset	8
140.cfi_offset	%r15,-56
141	subq	$280,%rsp
142.cfi_adjust_cfa_offset	280
143.Lghash_prologue:
/*
 * %r14 = input pointer, %r15 = length (turned into an end pointer below).
 * %rsi is biased by +128 while the table is copied, so every entry is
 * addressed as k-128(%rsi); it is un-biased again before the main loop.
 *
 * For each table entry i (first qword A at +0, second qword B at +8) a
 * copy of the 128-bit value A:B shifted right by 4 bits is built on the
 * stack:
 *     i(%rsp)               byte : (B & 0xf) << 4
 *     8*i-128(%rbp) = rsp+16+8i  : (B >> 4) | (A << 60)
 *     8*i(%rbp)     = rsp+144+8i : A >> 4
 * with %rbp = %rsp + 16 + 128.
 */
144	movq	%rdx,%r14
145	movq	%rcx,%r15
146	subq	$-128,%rsi
147	leaq	16+128(%rsp),%rbp
148	xorl	%edx,%edx
149	movq	0+0-128(%rsi),%r8
150	movq	0+8-128(%rsi),%rax
151	movb	%al,%dl
152	shrq	$4,%rax
153	movq	%r8,%r10
154	shrq	$4,%r8
155	movq	16+0-128(%rsi),%r9
156	shlb	$4,%dl
157	movq	16+8-128(%rsi),%rbx
158	shlq	$60,%r10
159	movb	%dl,0(%rsp)
160	orq	%r10,%rax
161	movb	%bl,%dl
162	shrq	$4,%rbx
163	movq	%r9,%r10
164	shrq	$4,%r9
165	movq	%r8,0(%rbp)
166	movq	32+0-128(%rsi),%r8
167	shlb	$4,%dl
168	movq	%rax,0-128(%rbp)
169	movq	32+8-128(%rsi),%rax
170	shlq	$60,%r10
171	movb	%dl,1(%rsp)
172	orq	%r10,%rbx
173	movb	%al,%dl
174	shrq	$4,%rax
175	movq	%r8,%r10
176	shrq	$4,%r8
177	movq	%r9,8(%rbp)
178	movq	48+0-128(%rsi),%r9
179	shlb	$4,%dl
180	movq	%rbx,8-128(%rbp)
181	movq	48+8-128(%rsi),%rbx
182	shlq	$60,%r10
183	movb	%dl,2(%rsp)
184	orq	%r10,%rax
185	movb	%bl,%dl
186	shrq	$4,%rbx
187	movq	%r9,%r10
188	shrq	$4,%r9
189	movq	%r8,16(%rbp)
190	movq	64+0-128(%rsi),%r8
191	shlb	$4,%dl
192	movq	%rax,16-128(%rbp)
193	movq	64+8-128(%rsi),%rax
194	shlq	$60,%r10
195	movb	%dl,3(%rsp)
196	orq	%r10,%rbx
197	movb	%al,%dl
198	shrq	$4,%rax
199	movq	%r8,%r10
200	shrq	$4,%r8
201	movq	%r9,24(%rbp)
202	movq	80+0-128(%rsi),%r9
203	shlb	$4,%dl
204	movq	%rbx,24-128(%rbp)
205	movq	80+8-128(%rsi),%rbx
206	shlq	$60,%r10
207	movb	%dl,4(%rsp)
208	orq	%r10,%rax
209	movb	%bl,%dl
210	shrq	$4,%rbx
211	movq	%r9,%r10
212	shrq	$4,%r9
213	movq	%r8,32(%rbp)
214	movq	96+0-128(%rsi),%r8
215	shlb	$4,%dl
216	movq	%rax,32-128(%rbp)
217	movq	96+8-128(%rsi),%rax
218	shlq	$60,%r10
219	movb	%dl,5(%rsp)
220	orq	%r10,%rbx
221	movb	%al,%dl
222	shrq	$4,%rax
223	movq	%r8,%r10
224	shrq	$4,%r8
225	movq	%r9,40(%rbp)
226	movq	112+0-128(%rsi),%r9
227	shlb	$4,%dl
228	movq	%rbx,40-128(%rbp)
229	movq	112+8-128(%rsi),%rbx
230	shlq	$60,%r10
231	movb	%dl,6(%rsp)
232	orq	%r10,%rax
233	movb	%bl,%dl
234	shrq	$4,%rbx
235	movq	%r9,%r10
236	shrq	$4,%r9
237	movq	%r8,48(%rbp)
238	movq	128+0-128(%rsi),%r8
239	shlb	$4,%dl
240	movq	%rax,48-128(%rbp)
241	movq	128+8-128(%rsi),%rax
242	shlq	$60,%r10
243	movb	%dl,7(%rsp)
244	orq	%r10,%rbx
245	movb	%al,%dl
246	shrq	$4,%rax
247	movq	%r8,%r10
248	shrq	$4,%r8
249	movq	%r9,56(%rbp)
250	movq	144+0-128(%rsi),%r9
251	shlb	$4,%dl
252	movq	%rbx,56-128(%rbp)
253	movq	144+8-128(%rsi),%rbx
254	shlq	$60,%r10
255	movb	%dl,8(%rsp)
256	orq	%r10,%rax
257	movb	%bl,%dl
258	shrq	$4,%rbx
259	movq	%r9,%r10
260	shrq	$4,%r9
261	movq	%r8,64(%rbp)
262	movq	160+0-128(%rsi),%r8
263	shlb	$4,%dl
264	movq	%rax,64-128(%rbp)
265	movq	160+8-128(%rsi),%rax
266	shlq	$60,%r10
267	movb	%dl,9(%rsp)
268	orq	%r10,%rbx
269	movb	%al,%dl
270	shrq	$4,%rax
271	movq	%r8,%r10
272	shrq	$4,%r8
273	movq	%r9,72(%rbp)
274	movq	176+0-128(%rsi),%r9
275	shlb	$4,%dl
276	movq	%rbx,72-128(%rbp)
277	movq	176+8-128(%rsi),%rbx
278	shlq	$60,%r10
279	movb	%dl,10(%rsp)
280	orq	%r10,%rax
281	movb	%bl,%dl
282	shrq	$4,%rbx
283	movq	%r9,%r10
284	shrq	$4,%r9
285	movq	%r8,80(%rbp)
286	movq	192+0-128(%rsi),%r8
287	shlb	$4,%dl
288	movq	%rax,80-128(%rbp)
289	movq	192+8-128(%rsi),%rax
290	shlq	$60,%r10
291	movb	%dl,11(%rsp)
292	orq	%r10,%rbx
293	movb	%al,%dl
294	shrq	$4,%rax
295	movq	%r8,%r10
296	shrq	$4,%r8
297	movq	%r9,88(%rbp)
298	movq	208+0-128(%rsi),%r9
299	shlb	$4,%dl
300	movq	%rbx,88-128(%rbp)
301	movq	208+8-128(%rsi),%rbx
302	shlq	$60,%r10
303	movb	%dl,12(%rsp)
304	orq	%r10,%rax
305	movb	%bl,%dl
306	shrq	$4,%rbx
307	movq	%r9,%r10
308	shrq	$4,%r9
309	movq	%r8,96(%rbp)
310	movq	224+0-128(%rsi),%r8
311	shlb	$4,%dl
312	movq	%rax,96-128(%rbp)
313	movq	224+8-128(%rsi),%rax
314	shlq	$60,%r10
315	movb	%dl,13(%rsp)
316	orq	%r10,%rbx
317	movb	%al,%dl
318	shrq	$4,%rax
319	movq	%r8,%r10
320	shrq	$4,%r8
321	movq	%r9,104(%rbp)
322	movq	240+0-128(%rsi),%r9
323	shlb	$4,%dl
324	movq	%rbx,104-128(%rbp)
325	movq	240+8-128(%rsi),%rbx
326	shlq	$60,%r10
327	movb	%dl,14(%rsp)
328	orq	%r10,%rax
329	movb	%bl,%dl
330	shrq	$4,%rbx
331	movq	%r9,%r10
332	shrq	$4,%r9
333	movq	%r8,112(%rbp)
334	shlb	$4,%dl
335	movq	%rax,112-128(%rbp)
336	shlq	$60,%r10
337	movb	%dl,15(%rsp)
338	orq	%r10,%rbx
339	movq	%r9,120(%rbp)
340	movq	%rbx,120-128(%rbp)
/*
 * Undo the +128 bias, load the current state (%r8 = state+8,
 * %r9 = state+0) and turn %r15 into the end-of-input pointer.
 */
341	addq	$-128,%rsi
342	movq	8(%rdi),%r8
343	movq	0(%rdi),%r9
344	addq	%r14,%r15
345	leaq	.Lrem_8bit(%rip),%r11
346	jmp	.Louter_loop
347.align	16
/*
 * One pass per 16-byte block.  The block is xored into the state, the
 * sum is written back to (%rdi), and its bytes are then consumed one
 * at a time starting with offset 15: %edx holds one dword of the sum
 * and "roll $8" rotates the next byte into %dl; further dwords are
 * reloaded from 8(%rdi), 4(%rdi) and 0(%rdi).  Per byte, %al selects
 * an entry of the table at %rsi (low nibble << 4) and %ebx/%ecx
 * (high nibble) alternately index the shifted copy on the stack.  The
 * accumulator %r9:%r8 moves right by 8 bits per step and the .Lrem_8bit
 * entry for the byte shifted out is xored into the top 16 bits of %r9.
 */
348.Louter_loop:
349	xorq	(%r14),%r9
350	movq	8(%r14),%rdx
351	leaq	16(%r14),%r14
352	xorq	%r8,%rdx
353	movq	%r9,(%rdi)
354	movq	%rdx,8(%rdi)
355	shrq	$32,%rdx
356	xorq	%rax,%rax
357	roll	$8,%edx
358	movb	%dl,%al
359	movzbl	%dl,%ebx
360	shlb	$4,%al
361	shrl	$4,%ebx
362	roll	$8,%edx
363	movq	8(%rsi,%rax,1),%r8
364	movq	(%rsi,%rax,1),%r9
365	movb	%dl,%al
366	movzbl	%dl,%ecx
367	shlb	$4,%al
368	movzbq	(%rsp,%rbx,1),%r12
369	shrl	$4,%ecx
370	xorq	%r8,%r12
371	movq	%r9,%r10
372	shrq	$8,%r8
373	movzbq	%r12b,%r12
374	shrq	$8,%r9
375	xorq	-128(%rbp,%rbx,8),%r8
376	shlq	$56,%r10
377	xorq	(%rbp,%rbx,8),%r9
378	roll	$8,%edx
379	xorq	8(%rsi,%rax,1),%r8
380	xorq	(%rsi,%rax,1),%r9
381	movb	%dl,%al
382	xorq	%r10,%r8
383	movzwq	(%r11,%r12,2),%r12
384	movzbl	%dl,%ebx
385	shlb	$4,%al
386	movzbq	(%rsp,%rcx,1),%r13
387	shrl	$4,%ebx
388	shlq	$48,%r12
389	xorq	%r8,%r13
390	movq	%r9,%r10
391	xorq	%r12,%r9
392	shrq	$8,%r8
393	movzbq	%r13b,%r13
394	shrq	$8,%r9
395	xorq	-128(%rbp,%rcx,8),%r8
396	shlq	$56,%r10
397	xorq	(%rbp,%rcx,8),%r9
398	roll	$8,%edx
399	xorq	8(%rsi,%rax,1),%r8
400	xorq	(%rsi,%rax,1),%r9
401	movb	%dl,%al
402	xorq	%r10,%r8
403	movzwq	(%r11,%r13,2),%r13
404	movzbl	%dl,%ecx
405	shlb	$4,%al
406	movzbq	(%rsp,%rbx,1),%r12
407	shrl	$4,%ecx
408	shlq	$48,%r13
409	xorq	%r8,%r12
410	movq	%r9,%r10
411	xorq	%r13,%r9
412	shrq	$8,%r8
413	movzbq	%r12b,%r12
414	movl	8(%rdi),%edx
415	shrq	$8,%r9
416	xorq	-128(%rbp,%rbx,8),%r8
417	shlq	$56,%r10
418	xorq	(%rbp,%rbx,8),%r9
419	roll	$8,%edx
420	xorq	8(%rsi,%rax,1),%r8
421	xorq	(%rsi,%rax,1),%r9
422	movb	%dl,%al
423	xorq	%r10,%r8
424	movzwq	(%r11,%r12,2),%r12
425	movzbl	%dl,%ebx
426	shlb	$4,%al
427	movzbq	(%rsp,%rcx,1),%r13
428	shrl	$4,%ebx
429	shlq	$48,%r12
430	xorq	%r8,%r13
431	movq	%r9,%r10
432	xorq	%r12,%r9
433	shrq	$8,%r8
434	movzbq	%r13b,%r13
435	shrq	$8,%r9
436	xorq	-128(%rbp,%rcx,8),%r8
437	shlq	$56,%r10
438	xorq	(%rbp,%rcx,8),%r9
439	roll	$8,%edx
440	xorq	8(%rsi,%rax,1),%r8
441	xorq	(%rsi,%rax,1),%r9
442	movb	%dl,%al
443	xorq	%r10,%r8
444	movzwq	(%r11,%r13,2),%r13
445	movzbl	%dl,%ecx
446	shlb	$4,%al
447	movzbq	(%rsp,%rbx,1),%r12
448	shrl	$4,%ecx
449	shlq	$48,%r13
450	xorq	%r8,%r12
451	movq	%r9,%r10
452	xorq	%r13,%r9
453	shrq	$8,%r8
454	movzbq	%r12b,%r12
455	shrq	$8,%r9
456	xorq	-128(%rbp,%rbx,8),%r8
457	shlq	$56,%r10
458	xorq	(%rbp,%rbx,8),%r9
459	roll	$8,%edx
460	xorq	8(%rsi,%rax,1),%r8
461	xorq	(%rsi,%rax,1),%r9
462	movb	%dl,%al
463	xorq	%r10,%r8
464	movzwq	(%r11,%r12,2),%r12
465	movzbl	%dl,%ebx
466	shlb	$4,%al
467	movzbq	(%rsp,%rcx,1),%r13
468	shrl	$4,%ebx
469	shlq	$48,%r12
470	xorq	%r8,%r13
471	movq	%r9,%r10
472	xorq	%r12,%r9
473	shrq	$8,%r8
474	movzbq	%r13b,%r13
475	shrq	$8,%r9
476	xorq	-128(%rbp,%rcx,8),%r8
477	shlq	$56,%r10
478	xorq	(%rbp,%rcx,8),%r9
479	roll	$8,%edx
480	xorq	8(%rsi,%rax,1),%r8
481	xorq	(%rsi,%rax,1),%r9
482	movb	%dl,%al
483	xorq	%r10,%r8
484	movzwq	(%r11,%r13,2),%r13
485	movzbl	%dl,%ecx
486	shlb	$4,%al
487	movzbq	(%rsp,%rbx,1),%r12
488	shrl	$4,%ecx
489	shlq	$48,%r13
490	xorq	%r8,%r12
491	movq	%r9,%r10
492	xorq	%r13,%r9
493	shrq	$8,%r8
494	movzbq	%r12b,%r12
495	movl	4(%rdi),%edx
496	shrq	$8,%r9
497	xorq	-128(%rbp,%rbx,8),%r8
498	shlq	$56,%r10
499	xorq	(%rbp,%rbx,8),%r9
500	roll	$8,%edx
501	xorq	8(%rsi,%rax,1),%r8
502	xorq	(%rsi,%rax,1),%r9
503	movb	%dl,%al
504	xorq	%r10,%r8
505	movzwq	(%r11,%r12,2),%r12
506	movzbl	%dl,%ebx
507	shlb	$4,%al
508	movzbq	(%rsp,%rcx,1),%r13
509	shrl	$4,%ebx
510	shlq	$48,%r12
511	xorq	%r8,%r13
512	movq	%r9,%r10
513	xorq	%r12,%r9
514	shrq	$8,%r8
515	movzbq	%r13b,%r13
516	shrq	$8,%r9
517	xorq	-128(%rbp,%rcx,8),%r8
518	shlq	$56,%r10
519	xorq	(%rbp,%rcx,8),%r9
520	roll	$8,%edx
521	xorq	8(%rsi,%rax,1),%r8
522	xorq	(%rsi,%rax,1),%r9
523	movb	%dl,%al
524	xorq	%r10,%r8
525	movzwq	(%r11,%r13,2),%r13
526	movzbl	%dl,%ecx
527	shlb	$4,%al
528	movzbq	(%rsp,%rbx,1),%r12
529	shrl	$4,%ecx
530	shlq	$48,%r13
531	xorq	%r8,%r12
532	movq	%r9,%r10
533	xorq	%r13,%r9
534	shrq	$8,%r8
535	movzbq	%r12b,%r12
536	shrq	$8,%r9
537	xorq	-128(%rbp,%rbx,8),%r8
538	shlq	$56,%r10
539	xorq	(%rbp,%rbx,8),%r9
540	roll	$8,%edx
541	xorq	8(%rsi,%rax,1),%r8
542	xorq	(%rsi,%rax,1),%r9
543	movb	%dl,%al
544	xorq	%r10,%r8
545	movzwq	(%r11,%r12,2),%r12
546	movzbl	%dl,%ebx
547	shlb	$4,%al
548	movzbq	(%rsp,%rcx,1),%r13
549	shrl	$4,%ebx
550	shlq	$48,%r12
551	xorq	%r8,%r13
552	movq	%r9,%r10
553	xorq	%r12,%r9
554	shrq	$8,%r8
555	movzbq	%r13b,%r13
556	shrq	$8,%r9
557	xorq	-128(%rbp,%rcx,8),%r8
558	shlq	$56,%r10
559	xorq	(%rbp,%rcx,8),%r9
560	roll	$8,%edx
561	xorq	8(%rsi,%rax,1),%r8
562	xorq	(%rsi,%rax,1),%r9
563	movb	%dl,%al
564	xorq	%r10,%r8
565	movzwq	(%r11,%r13,2),%r13
566	movzbl	%dl,%ecx
567	shlb	$4,%al
568	movzbq	(%rsp,%rbx,1),%r12
569	shrl	$4,%ecx
570	shlq	$48,%r13
571	xorq	%r8,%r12
572	movq	%r9,%r10
573	xorq	%r13,%r9
574	shrq	$8,%r8
575	movzbq	%r12b,%r12
576	movl	0(%rdi),%edx
577	shrq	$8,%r9
578	xorq	-128(%rbp,%rbx,8),%r8
579	shlq	$56,%r10
580	xorq	(%rbp,%rbx,8),%r9
581	roll	$8,%edx
582	xorq	8(%rsi,%rax,1),%r8
583	xorq	(%rsi,%rax,1),%r9
584	movb	%dl,%al
585	xorq	%r10,%r8
586	movzwq	(%r11,%r12,2),%r12
587	movzbl	%dl,%ebx
588	shlb	$4,%al
589	movzbq	(%rsp,%rcx,1),%r13
590	shrl	$4,%ebx
591	shlq	$48,%r12
592	xorq	%r8,%r13
593	movq	%r9,%r10
594	xorq	%r12,%r9
595	shrq	$8,%r8
596	movzbq	%r13b,%r13
597	shrq	$8,%r9
598	xorq	-128(%rbp,%rcx,8),%r8
599	shlq	$56,%r10
600	xorq	(%rbp,%rcx,8),%r9
601	roll	$8,%edx
602	xorq	8(%rsi,%rax,1),%r8
603	xorq	(%rsi,%rax,1),%r9
604	movb	%dl,%al
605	xorq	%r10,%r8
606	movzwq	(%r11,%r13,2),%r13
607	movzbl	%dl,%ecx
608	shlb	$4,%al
609	movzbq	(%rsp,%rbx,1),%r12
610	shrl	$4,%ecx
611	shlq	$48,%r13
612	xorq	%r8,%r12
613	movq	%r9,%r10
614	xorq	%r13,%r9
615	shrq	$8,%r8
616	movzbq	%r12b,%r12
617	shrq	$8,%r9
618	xorq	-128(%rbp,%rbx,8),%r8
619	shlq	$56,%r10
620	xorq	(%rbp,%rbx,8),%r9
621	roll	$8,%edx
622	xorq	8(%rsi,%rax,1),%r8
623	xorq	(%rsi,%rax,1),%r9
624	movb	%dl,%al
625	xorq	%r10,%r8
626	movzwq	(%r11,%r12,2),%r12
627	movzbl	%dl,%ebx
628	shlb	$4,%al
629	movzbq	(%rsp,%rcx,1),%r13
630	shrl	$4,%ebx
631	shlq	$48,%r12
632	xorq	%r8,%r13
633	movq	%r9,%r10
634	xorq	%r12,%r9
635	shrq	$8,%r8
636	movzbq	%r13b,%r13
637	shrq	$8,%r9
638	xorq	-128(%rbp,%rcx,8),%r8
639	shlq	$56,%r10
640	xorq	(%rbp,%rcx,8),%r9
641	roll	$8,%edx
642	xorq	8(%rsi,%rax,1),%r8
643	xorq	(%rsi,%rax,1),%r9
644	movb	%dl,%al
645	xorq	%r10,%r8
646	movzwq	(%r11,%r13,2),%r13
647	movzbl	%dl,%ecx
648	shlb	$4,%al
649	movzbq	(%rsp,%rbx,1),%r12
/*
 * Last byte (offset 0): its high nibble is kept as (nibble << 4) in
 * %ecx and applied further down as a final 4-bit step against the
 * table at %rsi instead of the shifted copy on the stack.
 */
650	andl	$240,%ecx
651	shlq	$48,%r13
652	xorq	%r8,%r12
653	movq	%r9,%r10
654	xorq	%r13,%r9
655	shrq	$8,%r8
656	movzbq	%r12b,%r12
/*
 * NOTE(review): this load reads the 4 bytes immediately below the
 * state pointer, and %edx/%rdx is not read again before the loop top
 * (or the epilogue) overwrites or abandons it.  It looks like a dead
 * load emitted by the generator's regular "reload next dword" pattern;
 * it is harmless only as long as those 4 bytes are mapped -- confirm
 * against ghash-x86_64.pl before touching it.
 */
657	movl	-4(%rdi),%edx
658	shrq	$8,%r9
659	xorq	-128(%rbp,%rbx,8),%r8
660	shlq	$56,%r10
661	xorq	(%rbp,%rbx,8),%r9
662	movzwq	(%r11,%r12,2),%r12
663	xorq	8(%rsi,%rax,1),%r8
664	xorq	(%rsi,%rax,1),%r9
665	shlq	$48,%r12
666	xorq	%r10,%r8
667	xorq	%r12,%r9
668	movzbq	%r8b,%r13
669	shrq	$4,%r8
670	movq	%r9,%r10
671	shlb	$4,%r13b
672	shrq	$4,%r9
673	xorq	8(%rsi,%rcx,1),%r8
674	movzwq	(%r11,%r13,2),%r13
675	shlq	$60,%r10
676	xorq	(%rsi,%rcx,1),%r9
677	xorq	%r10,%r8
678	shlq	$48,%r13
679	bswapq	%r8
680	xorq	%r13,%r9
681	bswapq	%r9
/* unsigned compare: continue while input pointer < end pointer */
682	cmpq	%r15,%r14
683	jb	.Louter_loop
684	movq	%r8,8(%rdi)
685	movq	%r9,(%rdi)
686
/* %rsi = %rsp as it was on entry; reload all six saved registers */
687	leaq	280+48(%rsp),%rsi
688.cfi_def_cfa	%rsi,8
689	movq	-48(%rsi),%r15
690.cfi_restore	%r15
691	movq	-40(%rsi),%r14
692.cfi_restore	%r14
693	movq	-32(%rsi),%r13
694.cfi_restore	%r13
695	movq	-24(%rsi),%r12
696.cfi_restore	%r12
697	movq	-16(%rsi),%rbp
698.cfi_restore	%rbp
699	movq	-8(%rsi),%rbx
700.cfi_restore	%rbx
701	leaq	0(%rsi),%rsp
702.cfi_def_cfa_register	%rsp
703.Lghash_epilogue:
/* bytes F3 C3 = "rep ret" */
704	.byte	0xf3,0xc3
705.cfi_endproc
706.size	gcm_ghash_4bit,.-gcm_ghash_4bit
/*
 * gcm_init_clmul -- build the carry-less-multiply key table.
 *
 * In:   %rsi -> 16-byte input value (unaligned load)
 *       %rdi -> output table; 96 bytes are written (unaligned stores)
 * Uses: %xmm0-%xmm6 and flags; no stack, no callee-saved registers.
 *
 * Output layout, where V is the adjusted input value held in %xmm2 and
 * each product is reduced by the shift/xor sequence that follows it:
 *      0(%rdi)  V
 *     16(%rdi)  V*V
 *     32(%rdi)  the (high ^ low) qword of each of the two values above
 *     48(%rdi)  V*V*V
 *     64(%rdi)  V*V*V*V
 *     80(%rdi)  the (high ^ low) qword of each of those two values
 * (In GHASH terms these are presumably H, H^2, H^3, H^4 plus Karatsuba
 * helper values -- the C-side table type is not visible here.)
 *
 * The raw ".byte 102,15,58,68,<modrm>,<imm>" sequences are PCLMULQDQ
 * (66 0F 3A 44 /r ib) and ".byte 102,15,58,15,227,8" is
 * "palignr $8,%xmm3,%xmm4" (66 0F 3A 0F /r ib), spelled out as bytes.
 *
 * Review fix: the entry now starts with endbr64 (F3 0F 1E FA), like the
 * other exported entry points in this file, so an indirect call lands on
 * a valid branch target when CET/IBT is enforced.  The marker sits ahead
 * of .L_init_clmul, so a direct jump to that label is unaffected.
 * The lasting fix belongs in ghash-x86_64.pl, which generates this file.
 */
707.globl	gcm_init_clmul
708.type	gcm_init_clmul,@function
709.align	16
710gcm_init_clmul:
711.cfi_startproc
.byte	243,15,30,250
712.L_init_clmul:
713	movdqu	(%rsi),%xmm2
714	pshufd	$78,%xmm2,%xmm2
715
716
/*
 * Shift the 128-bit value left by one bit (carrying bit 63 of the low
 * qword into the high qword) and, if the original top bit was set, xor
 * in the .L0x1c2_polynomial constant (defined outside this block).
 * %xmm5 becomes an all-ones/all-zeros mask from the sign of the top
 * dword.
 */
717	pshufd	$255,%xmm2,%xmm4
718	movdqa	%xmm2,%xmm3
719	psllq	$1,%xmm2
720	pxor	%xmm5,%xmm5
721	psrlq	$63,%xmm3
722	pcmpgtd	%xmm4,%xmm5
723	pslldq	$8,%xmm3
724	por	%xmm3,%xmm2
725
726
727	pand	.L0x1c2_polynomial(%rip),%xmm5
728	pxor	%xmm5,%xmm2
729
730
/* %xmm6 = V with its qwords swapped, xored with V; %xmm0 = V */
731	pshufd	$78,%xmm2,%xmm6
732	movdqa	%xmm2,%xmm0
733	pxor	%xmm2,%xmm6
/* %xmm0 = %xmm0 * V: three carry-less multiplies, then reduction */
734	movdqa	%xmm0,%xmm1
735	pshufd	$78,%xmm0,%xmm3
736	pxor	%xmm0,%xmm3
/* pclmulqdq $0x00,%xmm2,%xmm0 / $0x11,%xmm2,%xmm1 / $0x00,%xmm6,%xmm3 */
737.byte	102,15,58,68,194,0
738.byte	102,15,58,68,202,17
739.byte	102,15,58,68,222,0
740	pxor	%xmm0,%xmm3
741	pxor	%xmm1,%xmm3
742
743	movdqa	%xmm3,%xmm4
744	psrldq	$8,%xmm3
745	pslldq	$8,%xmm4
746	pxor	%xmm3,%xmm1
747	pxor	%xmm4,%xmm0
748
749	movdqa	%xmm0,%xmm4
750	movdqa	%xmm0,%xmm3
751	psllq	$5,%xmm0
752	pxor	%xmm0,%xmm3
753	psllq	$1,%xmm0
754	pxor	%xmm3,%xmm0
755	psllq	$57,%xmm0
756	movdqa	%xmm0,%xmm3
757	pslldq	$8,%xmm0
758	psrldq	$8,%xmm3
759	pxor	%xmm4,%xmm0
760	pxor	%xmm3,%xmm1
761
762
763	movdqa	%xmm0,%xmm4
764	psrlq	$1,%xmm0
765	pxor	%xmm4,%xmm1
766	pxor	%xmm0,%xmm4
767	psrlq	$5,%xmm0
768	pxor	%xmm4,%xmm0
769	psrlq	$1,%xmm0
770	pxor	%xmm1,%xmm0
/* store V and V*V, then the combined (high ^ low) helper at +32 */
771	pshufd	$78,%xmm2,%xmm3
772	pshufd	$78,%xmm0,%xmm4
773	pxor	%xmm2,%xmm3
774	movdqu	%xmm2,0(%rdi)
775	pxor	%xmm0,%xmm4
776	movdqu	%xmm0,16(%rdi)
777.byte	102,15,58,15,227,8
778	movdqu	%xmm4,32(%rdi)
/* %xmm0 = %xmm0 * V again (third power) */
779	movdqa	%xmm0,%xmm1
780	pshufd	$78,%xmm0,%xmm3
781	pxor	%xmm0,%xmm3
782.byte	102,15,58,68,194,0
783.byte	102,15,58,68,202,17
784.byte	102,15,58,68,222,0
785	pxor	%xmm0,%xmm3
786	pxor	%xmm1,%xmm3
787
788	movdqa	%xmm3,%xmm4
789	psrldq	$8,%xmm3
790	pslldq	$8,%xmm4
791	pxor	%xmm3,%xmm1
792	pxor	%xmm4,%xmm0
793
794	movdqa	%xmm0,%xmm4
795	movdqa	%xmm0,%xmm3
796	psllq	$5,%xmm0
797	pxor	%xmm0,%xmm3
798	psllq	$1,%xmm0
799	pxor	%xmm3,%xmm0
800	psllq	$57,%xmm0
801	movdqa	%xmm0,%xmm3
802	pslldq	$8,%xmm0
803	psrldq	$8,%xmm3
804	pxor	%xmm4,%xmm0
805	pxor	%xmm3,%xmm1
806
807
808	movdqa	%xmm0,%xmm4
809	psrlq	$1,%xmm0
810	pxor	%xmm4,%xmm1
811	pxor	%xmm0,%xmm4
812	psrlq	$5,%xmm0
813	pxor	%xmm4,%xmm0
814	psrlq	$1,%xmm0
815	pxor	%xmm1,%xmm0
/* keep the third power in %xmm5, multiply once more for the fourth */
816	movdqa	%xmm0,%xmm5
817	movdqa	%xmm0,%xmm1
818	pshufd	$78,%xmm0,%xmm3
819	pxor	%xmm0,%xmm3
820.byte	102,15,58,68,194,0
821.byte	102,15,58,68,202,17
822.byte	102,15,58,68,222,0
823	pxor	%xmm0,%xmm3
824	pxor	%xmm1,%xmm3
825
826	movdqa	%xmm3,%xmm4
827	psrldq	$8,%xmm3
828	pslldq	$8,%xmm4
829	pxor	%xmm3,%xmm1
830	pxor	%xmm4,%xmm0
831
832	movdqa	%xmm0,%xmm4
833	movdqa	%xmm0,%xmm3
834	psllq	$5,%xmm0
835	pxor	%xmm0,%xmm3
836	psllq	$1,%xmm0
837	pxor	%xmm3,%xmm0
838	psllq	$57,%xmm0
839	movdqa	%xmm0,%xmm3
840	pslldq	$8,%xmm0
841	psrldq	$8,%xmm3
842	pxor	%xmm4,%xmm0
843	pxor	%xmm3,%xmm1
844
845
846	movdqa	%xmm0,%xmm4
847	psrlq	$1,%xmm0
848	pxor	%xmm4,%xmm1
849	pxor	%xmm0,%xmm4
850	psrlq	$5,%xmm0
851	pxor	%xmm4,%xmm0
852	psrlq	$1,%xmm0
853	pxor	%xmm1,%xmm0
/* store third and fourth powers and their helper at +48/+64/+80 */
854	pshufd	$78,%xmm5,%xmm3
855	pshufd	$78,%xmm0,%xmm4
856	pxor	%xmm5,%xmm3
857	movdqu	%xmm5,48(%rdi)
858	pxor	%xmm0,%xmm4
859	movdqu	%xmm0,64(%rdi)
860.byte	102,15,58,15,227,8
861	movdqu	%xmm4,80(%rdi)
/* bytes F3 C3 = "rep ret" */
862	.byte	0xf3,0xc3
863.cfi_endproc
864.size	gcm_init_clmul,.-gcm_init_clmul
/*
 * gcm_gmult_clmul -- one carry-less multiply of the state by the key.
 *
 * In:   %rdi -> 16-byte state, read and overwritten in place
 *       %rsi -> key table; 16 bytes at offset 0 and 16 bytes at offset
 *               32 are read
 * Uses: %xmm0-%xmm5 and flags; no stack, no callee-saved registers.
 *       .Lbswap_mask (defined outside this block) is loaded with an
 *       aligned move.
 *
 * .L_gmult_clmul sits after the endbr64 marker so that other entry
 * points can jump straight to the body.
 */
865.globl	gcm_gmult_clmul
866.type	gcm_gmult_clmul,@function
867.align	16
868gcm_gmult_clmul:
869.cfi_startproc
/* bytes F3 0F 1E FA = endbr64 */
870.byte	243,15,30,250
871.L_gmult_clmul:
872	movdqu	(%rdi),%xmm0
873	movdqa	.Lbswap_mask(%rip),%xmm5
874	movdqu	(%rsi),%xmm2
875	movdqu	32(%rsi),%xmm4
/* pshufb %xmm5,%xmm0 : permute the state bytes with .Lbswap_mask */
876.byte	102,15,56,0,197
877	movdqa	%xmm0,%xmm1
878	pshufd	$78,%xmm0,%xmm3
879	pxor	%xmm0,%xmm3
/* pclmulqdq $0x00,%xmm2,%xmm0 / $0x11,%xmm2,%xmm1 / $0x00,%xmm4,%xmm3 */
880.byte	102,15,58,68,194,0
881.byte	102,15,58,68,202,17
882.byte	102,15,58,68,220,0
883	pxor	%xmm0,%xmm3
884	pxor	%xmm1,%xmm3
885
/* fold the middle product into the low (%xmm0) and high (%xmm1) halves */
886	movdqa	%xmm3,%xmm4
887	psrldq	$8,%xmm3
888	pslldq	$8,%xmm4
889	pxor	%xmm3,%xmm1
890	pxor	%xmm4,%xmm0
891
/* reduction, phase 1: left shifts by 5, 1 and 57 */
892	movdqa	%xmm0,%xmm4
893	movdqa	%xmm0,%xmm3
894	psllq	$5,%xmm0
895	pxor	%xmm0,%xmm3
896	psllq	$1,%xmm0
897	pxor	%xmm3,%xmm0
898	psllq	$57,%xmm0
899	movdqa	%xmm0,%xmm3
900	pslldq	$8,%xmm0
901	psrldq	$8,%xmm3
902	pxor	%xmm4,%xmm0
903	pxor	%xmm3,%xmm1
904
905
/* reduction, phase 2: right shifts by 1, 5 and 1 */
906	movdqa	%xmm0,%xmm4
907	psrlq	$1,%xmm0
908	pxor	%xmm4,%xmm1
909	pxor	%xmm0,%xmm4
910	psrlq	$5,%xmm0
911	pxor	%xmm4,%xmm0
912	psrlq	$1,%xmm0
913	pxor	%xmm1,%xmm0
/* pshufb %xmm5,%xmm0 : permute back, then store the new state */
914.byte	102,15,56,0,197
915	movdqu	%xmm0,(%rdi)
/* bytes F3 C3 = "rep ret" */
916	.byte	0xf3,0xc3
917.cfi_endproc
918.size	gcm_gmult_clmul,.-gcm_gmult_clmul
/*
 * gcm_ghash_clmul -- carry-less-multiply hash update over 16-byte blocks.
 *
 * In:   %rdi -> 16-byte state, read and overwritten in place
 *       %rsi -> key table; offsets 0, 16, 32, 48, 64 and 80 are read
 *       %rdx -> input data (unaligned loads)
 *       %rcx =  input length in bytes
 * Uses: %rax, %xmm0-%xmm15 and flags; no stack, no callee-saved
 *       general-purpose registers.  Reads OPENSSL_ia32cap_P,
 *       .Lbswap_mask and .L7_mask, all defined outside this block.
 *
 * Paths: .Lmod4_loop handles four blocks per pass, .Lmod_loop two, and
 * .Lodd_tail a single trailing block.  The length is only ever tested
 * after subtracting, so a length of zero is not handled; callers
 * presumably pass a non-zero multiple of 16 -- verify against the C
 * caller.
 *
 * Raw byte sequences: "102,[REX,]15,58,68,modrm,imm" is PCLMULQDQ,
 * "102,[REX,]15,56,0,modrm" is PSHUFB, and "102,76,15,110,200" is
 * "movq %rax,%xmm9".
 */
919.globl	gcm_ghash_clmul
920.type	gcm_ghash_clmul,@function
921.align	32
922gcm_ghash_clmul:
923.cfi_startproc
/* bytes F3 0F 1E FA = endbr64 */
924.byte	243,15,30,250
925.L_ghash_clmul:
926	movdqa	.Lbswap_mask(%rip),%xmm10
927
928	movdqu	(%rdi),%xmm0
929	movdqu	(%rsi),%xmm2
930	movdqu	32(%rsi),%xmm7
/* pshufb %xmm10,%xmm0 : permute the state bytes with .Lbswap_mask */
931.byte	102,65,15,56,0,194
932
/* exactly one block: go straight to the single-block tail */
933	subq	$0x10,%rcx
934	jz	.Lodd_tail
935
936	movdqu	16(%rsi),%xmm6
937	movl	OPENSSL_ia32cap_P+4(%rip),%eax
/* fewer than four blocks in total: use the two-block loop */
938	cmpq	$0x30,%rcx
939	jb	.Lskip4x
940
/*
 * 71303168 = 0x04400000 (bits 22 and 26), 4194304 = 0x00400000 (bit 22):
 * the four-block path is skipped when bit 22 of that capability word is
 * set while bit 26 is clear.  NOTE(review): presumably these are the
 * MOVBE and XSAVE feature bits -- confirm against the OPENSSL_ia32cap
 * documentation.
 */
941	andl	$71303168,%eax
942	cmpl	$4194304,%eax
943	je	.Lskip4x
944
945	subq	$0x30,%rcx
/* constant later moved into %xmm9 inside .Lmod4_loop */
946	movq	$0xA040608020C0E000,%rax
947	movdqu	48(%rsi),%xmm14
948	movdqu	64(%rsi),%xmm15
949
950
951
952
/*
 * Prime the four-block pipeline: blocks at 48 and 32(%rdx) are
 * multiplied by the table entries in %xmm2 and %xmm6, blocks at 16 and
 * 0(%rdx) by %xmm14 and (later) %xmm15; the state is xored into the
 * first block only.
 */
953	movdqu	48(%rdx),%xmm3
954	movdqu	32(%rdx),%xmm11
955.byte	102,65,15,56,0,218
956.byte	102,69,15,56,0,218
957	movdqa	%xmm3,%xmm5
958	pshufd	$78,%xmm3,%xmm4
959	pxor	%xmm3,%xmm4
960.byte	102,15,58,68,218,0
961.byte	102,15,58,68,234,17
962.byte	102,15,58,68,231,0
963
964	movdqa	%xmm11,%xmm13
965	pshufd	$78,%xmm11,%xmm12
966	pxor	%xmm11,%xmm12
967.byte	102,68,15,58,68,222,0
968.byte	102,68,15,58,68,238,17
969.byte	102,68,15,58,68,231,16
970	xorps	%xmm11,%xmm3
971	xorps	%xmm13,%xmm5
972	movups	80(%rsi),%xmm7
973	xorps	%xmm12,%xmm4
974
975	movdqu	16(%rdx),%xmm11
976	movdqu	0(%rdx),%xmm8
977.byte	102,69,15,56,0,218
978.byte	102,69,15,56,0,194
979	movdqa	%xmm11,%xmm13
980	pshufd	$78,%xmm11,%xmm12
981	pxor	%xmm8,%xmm0
982	pxor	%xmm11,%xmm12
983.byte	102,69,15,58,68,222,0
984	movdqa	%xmm0,%xmm1
985	pshufd	$78,%xmm0,%xmm8
986	pxor	%xmm0,%xmm8
987.byte	102,69,15,58,68,238,17
988.byte	102,68,15,58,68,231,0
989	xorps	%xmm11,%xmm3
990	xorps	%xmm13,%xmm5
991
/* borrow (carry set) means fewer than four further blocks remain */
992	leaq	64(%rdx),%rdx
993	subq	$0x40,%rcx
994	jc	.Ltail4x
995
996	jmp	.Lmod4_loop
997.align	32
/*
 * Steady state: finish and reduce the previous group of four while the
 * multiplies for the next four input blocks are already being issued.
 */
998.Lmod4_loop:
999.byte	102,65,15,58,68,199,0
1000	xorps	%xmm12,%xmm4
1001	movdqu	48(%rdx),%xmm11
1002.byte	102,69,15,56,0,218
1003.byte	102,65,15,58,68,207,17
1004	xorps	%xmm3,%xmm0
1005	movdqu	32(%rdx),%xmm3
1006	movdqa	%xmm11,%xmm13
1007.byte	102,68,15,58,68,199,16
1008	pshufd	$78,%xmm11,%xmm12
1009	xorps	%xmm5,%xmm1
1010	pxor	%xmm11,%xmm12
1011.byte	102,65,15,56,0,218
1012	movups	32(%rsi),%xmm7
1013	xorps	%xmm4,%xmm8
1014.byte	102,68,15,58,68,218,0
1015	pshufd	$78,%xmm3,%xmm4
1016
1017	pxor	%xmm0,%xmm8
1018	movdqa	%xmm3,%xmm5
1019	pxor	%xmm1,%xmm8
1020	pxor	%xmm3,%xmm4
1021	movdqa	%xmm8,%xmm9
1022.byte	102,68,15,58,68,234,17
1023	pslldq	$8,%xmm8
1024	psrldq	$8,%xmm9
1025	pxor	%xmm8,%xmm0
1026	movdqa	.L7_mask(%rip),%xmm8
1027	pxor	%xmm9,%xmm1
/* movq %rax,%xmm9 : bring in the 64-bit constant loaded before the loop */
1028.byte	102,76,15,110,200
1029
1030	pand	%xmm0,%xmm8
/* pshufb %xmm8,%xmm9 : byte lookup in that constant, indexed by %xmm0 & .L7_mask */
1031.byte	102,69,15,56,0,200
1032	pxor	%xmm0,%xmm9
1033.byte	102,68,15,58,68,231,0
1034	psllq	$57,%xmm9
1035	movdqa	%xmm9,%xmm8
1036	pslldq	$8,%xmm9
1037.byte	102,15,58,68,222,0
1038	psrldq	$8,%xmm8
1039	pxor	%xmm9,%xmm0
1040	pxor	%xmm8,%xmm1
1041	movdqu	0(%rdx),%xmm8
1042
1043	movdqa	%xmm0,%xmm9
1044	psrlq	$1,%xmm0
1045.byte	102,15,58,68,238,17
1046	xorps	%xmm11,%xmm3
1047	movdqu	16(%rdx),%xmm11
1048.byte	102,69,15,56,0,218
1049.byte	102,15,58,68,231,16
1050	xorps	%xmm13,%xmm5
1051	movups	80(%rsi),%xmm7
1052.byte	102,69,15,56,0,194
1053	pxor	%xmm9,%xmm1
1054	pxor	%xmm0,%xmm9
1055	psrlq	$5,%xmm0
1056
1057	movdqa	%xmm11,%xmm13
1058	pxor	%xmm12,%xmm4
1059	pshufd	$78,%xmm11,%xmm12
1060	pxor	%xmm9,%xmm0
1061	pxor	%xmm8,%xmm1
1062	pxor	%xmm11,%xmm12
1063.byte	102,69,15,58,68,222,0
1064	psrlq	$1,%xmm0
1065	pxor	%xmm1,%xmm0
1066	movdqa	%xmm0,%xmm1
1067.byte	102,69,15,58,68,238,17
1068	xorps	%xmm11,%xmm3
1069	pshufd	$78,%xmm0,%xmm8
1070	pxor	%xmm0,%xmm8
1071
1072.byte	102,68,15,58,68,231,0
1073	xorps	%xmm13,%xmm5
1074
1075	leaq	64(%rdx),%rdx
1076	subq	$0x40,%rcx
1077	jnc	.Lmod4_loop
1078
/* Drain the pipeline: combine the four pending products and reduce. */
1079.Ltail4x:
1080.byte	102,65,15,58,68,199,0
1081.byte	102,65,15,58,68,207,17
1082.byte	102,68,15,58,68,199,16
1083	xorps	%xmm12,%xmm4
1084	xorps	%xmm3,%xmm0
1085	xorps	%xmm5,%xmm1
1086	pxor	%xmm0,%xmm1
1087	pxor	%xmm4,%xmm8
1088
1089	pxor	%xmm1,%xmm8
1090	pxor	%xmm0,%xmm1
1091
1092	movdqa	%xmm8,%xmm9
1093	psrldq	$8,%xmm8
1094	pslldq	$8,%xmm9
1095	pxor	%xmm8,%xmm1
1096	pxor	%xmm9,%xmm0
1097
1098	movdqa	%xmm0,%xmm4
1099	movdqa	%xmm0,%xmm3
1100	psllq	$5,%xmm0
1101	pxor	%xmm0,%xmm3
1102	psllq	$1,%xmm0
1103	pxor	%xmm3,%xmm0
1104	psllq	$57,%xmm0
1105	movdqa	%xmm0,%xmm3
1106	pslldq	$8,%xmm0
1107	psrldq	$8,%xmm3
1108	pxor	%xmm4,%xmm0
1109	pxor	%xmm3,%xmm1
1110
1111
1112	movdqa	%xmm0,%xmm4
1113	psrlq	$1,%xmm0
1114	pxor	%xmm4,%xmm1
1115	pxor	%xmm0,%xmm4
1116	psrlq	$5,%xmm0
1117	pxor	%xmm4,%xmm0
1118	psrlq	$1,%xmm0
1119	pxor	%xmm1,%xmm0
/*
 * Undo the last subtraction: %rcx = bytes still to process (0..0x30).
 * Zero -> done; exactly 16 -> single-block tail; otherwise fall into
 * the two-block code with %xmm7 reloaded from 32(%rsi).
 */
1120	addq	$0x40,%rcx
1121	jz	.Ldone
1122	movdqu	32(%rsi),%xmm7
1123	subq	$0x10,%rcx
1124	jz	.Lodd_tail
1125.Lskip4x:
1126
1127
1128
1129
1130
/*
 * Two-block path: the state is xored into the first block; the second
 * block is multiplied by the table entry in %xmm2 right away.
 */
1131	movdqu	(%rdx),%xmm8
1132	movdqu	16(%rdx),%xmm3
1133.byte	102,69,15,56,0,194
1134.byte	102,65,15,56,0,218
1135	pxor	%xmm8,%xmm0
1136
1137	movdqa	%xmm3,%xmm5
1138	pshufd	$78,%xmm3,%xmm4
1139	pxor	%xmm3,%xmm4
1140.byte	102,15,58,68,218,0
1141.byte	102,15,58,68,234,17
1142.byte	102,15,58,68,231,0
1143
1144	leaq	32(%rdx),%rdx
1145	nop
/* unsigned: at most one further block remains -> finish in .Leven_tail */
1146	subq	$0x20,%rcx
1147	jbe	.Leven_tail
1148	nop
1149	jmp	.Lmod_loop
1150
1151.align	32
1152.Lmod_loop:
1153	movdqa	%xmm0,%xmm1
1154	movdqa	%xmm4,%xmm8
1155	pshufd	$78,%xmm0,%xmm4
1156	pxor	%xmm0,%xmm4
1157
1158.byte	102,15,58,68,198,0
1159.byte	102,15,58,68,206,17
1160.byte	102,15,58,68,231,16
1161
1162	pxor	%xmm3,%xmm0
1163	pxor	%xmm5,%xmm1
1164	movdqu	(%rdx),%xmm9
1165	pxor	%xmm0,%xmm8
1166.byte	102,69,15,56,0,202
1167	movdqu	16(%rdx),%xmm3
1168
1169	pxor	%xmm1,%xmm8
1170	pxor	%xmm9,%xmm1
1171	pxor	%xmm8,%xmm4
1172.byte	102,65,15,56,0,218
1173	movdqa	%xmm4,%xmm8
1174	psrldq	$8,%xmm8
1175	pslldq	$8,%xmm4
1176	pxor	%xmm8,%xmm1
1177	pxor	%xmm4,%xmm0
1178
1179	movdqa	%xmm3,%xmm5
1180
1181	movdqa	%xmm0,%xmm9
1182	movdqa	%xmm0,%xmm8
1183	psllq	$5,%xmm0
1184	pxor	%xmm0,%xmm8
1185.byte	102,15,58,68,218,0
1186	psllq	$1,%xmm0
1187	pxor	%xmm8,%xmm0
1188	psllq	$57,%xmm0
1189	movdqa	%xmm0,%xmm8
1190	pslldq	$8,%xmm0
1191	psrldq	$8,%xmm8
1192	pxor	%xmm9,%xmm0
1193	pshufd	$78,%xmm5,%xmm4
1194	pxor	%xmm8,%xmm1
1195	pxor	%xmm5,%xmm4
1196
1197	movdqa	%xmm0,%xmm9
1198	psrlq	$1,%xmm0
1199.byte	102,15,58,68,234,17
1200	pxor	%xmm9,%xmm1
1201	pxor	%xmm0,%xmm9
1202	psrlq	$5,%xmm0
1203	pxor	%xmm9,%xmm0
1204	leaq	32(%rdx),%rdx
1205	psrlq	$1,%xmm0
1206.byte	102,15,58,68,231,0
1207	pxor	%xmm1,%xmm0
1208
1209	subq	$0x20,%rcx
1210	ja	.Lmod_loop
1211
/* Finish the pending pair of blocks and reduce. */
1212.Leven_tail:
1213	movdqa	%xmm0,%xmm1
1214	movdqa	%xmm4,%xmm8
1215	pshufd	$78,%xmm0,%xmm4
1216	pxor	%xmm0,%xmm4
1217
1218.byte	102,15,58,68,198,0
1219.byte	102,15,58,68,206,17
1220.byte	102,15,58,68,231,16
1221
1222	pxor	%xmm3,%xmm0
1223	pxor	%xmm5,%xmm1
1224	pxor	%xmm0,%xmm8
1225	pxor	%xmm1,%xmm8
1226	pxor	%xmm8,%xmm4
1227	movdqa	%xmm4,%xmm8
1228	psrldq	$8,%xmm8
1229	pslldq	$8,%xmm4
1230	pxor	%xmm8,%xmm1
1231	pxor	%xmm4,%xmm0
1232
1233	movdqa	%xmm0,%xmm4
1234	movdqa	%xmm0,%xmm3
1235	psllq	$5,%xmm0
1236	pxor	%xmm0,%xmm3
1237	psllq	$1,%xmm0
1238	pxor	%xmm3,%xmm0
1239	psllq	$57,%xmm0
1240	movdqa	%xmm0,%xmm3
1241	pslldq	$8,%xmm0
1242	psrldq	$8,%xmm3
1243	pxor	%xmm4,%xmm0
1244	pxor	%xmm3,%xmm1
1245
1246
1247	movdqa	%xmm0,%xmm4
1248	psrlq	$1,%xmm0
1249	pxor	%xmm4,%xmm1
1250	pxor	%xmm0,%xmm4
1251	psrlq	$5,%xmm0
1252	pxor	%xmm4,%xmm0
1253	psrlq	$1,%xmm0
1254	pxor	%xmm1,%xmm0
/*
 * %rcx is zero here only when exactly one block is left (the earlier
 * subtractions are biased by 16); any non-zero value means all input
 * has been consumed.
 */
1255	testq	%rcx,%rcx
1256	jnz	.Ldone
1257
/* Single trailing block: xor into the state, multiply by %xmm2, reduce. */
1258.Lodd_tail:
1259	movdqu	(%rdx),%xmm8
1260.byte	102,69,15,56,0,194
1261	pxor	%xmm8,%xmm0
1262	movdqa	%xmm0,%xmm1
1263	pshufd	$78,%xmm0,%xmm3
1264	pxor	%xmm0,%xmm3
1265.byte	102,15,58,68,194,0
1266.byte	102,15,58,68,202,17
1267.byte	102,15,58,68,223,0
1268	pxor	%xmm0,%xmm3
1269	pxor	%xmm1,%xmm3
1270
1271	movdqa	%xmm3,%xmm4
1272	psrldq	$8,%xmm3
1273	pslldq	$8,%xmm4
1274	pxor	%xmm3,%xmm1
1275	pxor	%xmm4,%xmm0
1276
1277	movdqa	%xmm0,%xmm4
1278	movdqa	%xmm0,%xmm3
1279	psllq	$5,%xmm0
1280	pxor	%xmm0,%xmm3
1281	psllq	$1,%xmm0
1282	pxor	%xmm3,%xmm0
1283	psllq	$57,%xmm0
1284	movdqa	%xmm0,%xmm3
1285	pslldq	$8,%xmm0
1286	psrldq	$8,%xmm3
1287	pxor	%xmm4,%xmm0
1288	pxor	%xmm3,%xmm1
1289
1290
1291	movdqa	%xmm0,%xmm4
1292	psrlq	$1,%xmm0
1293	pxor	%xmm4,%xmm1
1294	pxor	%xmm0,%xmm4
1295	psrlq	$5,%xmm0
1296	pxor	%xmm4,%xmm0
1297	psrlq	$1,%xmm0
1298	pxor	%xmm1,%xmm0
1299.Ldone:
/* pshufb %xmm10,%xmm0 : permute back, then store the new state */
1300.byte	102,65,15,56,0,194
1301	movdqu	%xmm0,(%rdi)
/* bytes F3 C3 = "rep ret" */
1302	.byte	0xf3,0xc3
1303.cfi_endproc
1304.size	gcm_ghash_clmul,.-gcm_ghash_clmul
/*
 * gcm_init_avx -- build the key table used by the AVX hash routine.
 *
 * In:   %rsi -> 16-byte input value (unaligned load)
 *       %rdi -> output table; 4 passes x 48 bytes = 192 bytes written.
 *               %rdi itself is advanced by 192.
 * Uses: %r10 (pass counter), %xmm0-%xmm6 and flags; no stack, no
 *       callee-saved registers.  vzeroupper is issued on entry and
 *       before returning.
 *
 * Each pass stores two consecutive products of the adjusted input
 * value V (%xmm2) at +0 and +16, and a vector holding the
 * (high ^ low) qword of each of the two at +32.  That third slot is
 * written from the next pass, or after the loop, through -16(%rdi).
 * (In GHASH terms the table presumably holds H^1..H^8 with Karatsuba
 * helpers -- the C-side table type is not visible here.)
 *
 * Review fix: the entry now starts with endbr64 (F3 0F 1E FA), like the
 * other exported entry points in this file, so an indirect call lands on
 * a valid branch target when CET/IBT is enforced.  The lasting fix
 * belongs in ghash-x86_64.pl, which generates this file.
 */
1305.globl	gcm_init_avx
1306.type	gcm_init_avx,@function
1307.align	32
1308gcm_init_avx:
1309.cfi_startproc
.byte	243,15,30,250
1310	vzeroupper
1311
1312	vmovdqu	(%rsi),%xmm2
1313	vpshufd	$78,%xmm2,%xmm2
1314
1315
/*
 * Shift the 128-bit value left by one bit and, if the original top bit
 * was set, xor in .L0x1c2_polynomial (defined outside this block).
 */
1316	vpshufd	$255,%xmm2,%xmm4
1317	vpsrlq	$63,%xmm2,%xmm3
1318	vpsllq	$1,%xmm2,%xmm2
1319	vpxor	%xmm5,%xmm5,%xmm5
1320	vpcmpgtd	%xmm4,%xmm5,%xmm5
1321	vpslldq	$8,%xmm3,%xmm3
1322	vpor	%xmm3,%xmm2,%xmm2
1323
1324
1325	vpand	.L0x1c2_polynomial(%rip),%xmm5,%xmm5
1326	vpxor	%xmm5,%xmm2,%xmm2
1327
/* %xmm6 = high qword of V xor V (its low qword is high ^ low); %xmm0 = V */
1328	vpunpckhqdq	%xmm2,%xmm2,%xmm6
1329	vmovdqa	%xmm2,%xmm0
1330	vpxor	%xmm2,%xmm6,%xmm6
1331	movq	$4,%r10
1332	jmp	.Linit_start_avx
1333.align	32
/*
 * Passes 2..4 enter here: first store the helper vector for the pair
 * written by the previous pass, then advance %xmm0 by one multiply.
 */
1334.Linit_loop_avx:
1335	vpalignr	$8,%xmm3,%xmm4,%xmm5
1336	vmovdqu	%xmm5,-16(%rdi)
1337	vpunpckhqdq	%xmm0,%xmm0,%xmm3
1338	vpxor	%xmm0,%xmm3,%xmm3
1339	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
1340	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
1341	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
1342	vpxor	%xmm0,%xmm1,%xmm4
1343	vpxor	%xmm4,%xmm3,%xmm3
1344
1345	vpslldq	$8,%xmm3,%xmm4
1346	vpsrldq	$8,%xmm3,%xmm3
1347	vpxor	%xmm4,%xmm0,%xmm0
1348	vpxor	%xmm3,%xmm1,%xmm1
1349	vpsllq	$57,%xmm0,%xmm3
1350	vpsllq	$62,%xmm0,%xmm4
1351	vpxor	%xmm3,%xmm4,%xmm4
1352	vpsllq	$63,%xmm0,%xmm3
1353	vpxor	%xmm3,%xmm4,%xmm4
1354	vpslldq	$8,%xmm4,%xmm3
1355	vpsrldq	$8,%xmm4,%xmm4
1356	vpxor	%xmm3,%xmm0,%xmm0
1357	vpxor	%xmm4,%xmm1,%xmm1
1358
1359	vpsrlq	$1,%xmm0,%xmm4
1360	vpxor	%xmm0,%xmm1,%xmm1
1361	vpxor	%xmm4,%xmm0,%xmm0
1362	vpsrlq	$5,%xmm4,%xmm4
1363	vpxor	%xmm4,%xmm0,%xmm0
1364	vpsrlq	$1,%xmm0,%xmm0
1365	vpxor	%xmm1,%xmm0,%xmm0
/*
 * Every pass: keep the current value in %xmm5, multiply %xmm0 by V once
 * more, and store the pair (%xmm5, %xmm0) at +0 and +16.
 */
1366.Linit_start_avx:
1367	vmovdqa	%xmm0,%xmm5
1368	vpunpckhqdq	%xmm0,%xmm0,%xmm3
1369	vpxor	%xmm0,%xmm3,%xmm3
1370	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
1371	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
1372	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
1373	vpxor	%xmm0,%xmm1,%xmm4
1374	vpxor	%xmm4,%xmm3,%xmm3
1375
1376	vpslldq	$8,%xmm3,%xmm4
1377	vpsrldq	$8,%xmm3,%xmm3
1378	vpxor	%xmm4,%xmm0,%xmm0
1379	vpxor	%xmm3,%xmm1,%xmm1
1380	vpsllq	$57,%xmm0,%xmm3
1381	vpsllq	$62,%xmm0,%xmm4
1382	vpxor	%xmm3,%xmm4,%xmm4
1383	vpsllq	$63,%xmm0,%xmm3
1384	vpxor	%xmm3,%xmm4,%xmm4
1385	vpslldq	$8,%xmm4,%xmm3
1386	vpsrldq	$8,%xmm4,%xmm4
1387	vpxor	%xmm3,%xmm0,%xmm0
1388	vpxor	%xmm4,%xmm1,%xmm1
1389
1390	vpsrlq	$1,%xmm0,%xmm4
1391	vpxor	%xmm0,%xmm1,%xmm1
1392	vpxor	%xmm4,%xmm0,%xmm0
1393	vpsrlq	$5,%xmm4,%xmm4
1394	vpxor	%xmm4,%xmm0,%xmm0
1395	vpsrlq	$1,%xmm0,%xmm0
1396	vpxor	%xmm1,%xmm0,%xmm0
1397	vpshufd	$78,%xmm5,%xmm3
1398	vpshufd	$78,%xmm0,%xmm4
1399	vpxor	%xmm5,%xmm3,%xmm3
1400	vmovdqu	%xmm5,0(%rdi)
1401	vpxor	%xmm0,%xmm4,%xmm4
1402	vmovdqu	%xmm0,16(%rdi)
1403	leaq	48(%rdi),%rdi
1404	subq	$1,%r10
1405	jnz	.Linit_loop_avx
1406
/*
 * Helper vector for the final pair.  The operand order here
 * (%xmm4,%xmm3) is the reverse of the in-loop store above (%xmm3,%xmm4).
 */
1407	vpalignr	$8,%xmm4,%xmm3,%xmm5
1408	vmovdqu	%xmm5,-16(%rdi)
1409
1410	vzeroupper
/* bytes F3 C3 = "rep ret" */
1411	.byte	0xf3,0xc3
1412.cfi_endproc
1413.size	gcm_init_avx,.-gcm_init_avx
/*
 * gcm_gmult_avx -- entry point that only forwards to the body of
 * gcm_gmult_clmul.  After its own endbr64 marker it tail-jumps to
 * .L_gmult_clmul with every register untouched, so inputs and outputs
 * are exactly those of gcm_gmult_clmul (%rdi, %rsi).
 */
1414.globl	gcm_gmult_avx
1415.type	gcm_gmult_avx,@function
1416.align	32
1417gcm_gmult_avx:
1418.cfi_startproc
/* bytes F3 0F 1E FA = endbr64 */
1419.byte	243,15,30,250
1420	jmp	.L_gmult_clmul
1421.cfi_endproc
1422.size	gcm_gmult_avx,.-gcm_gmult_avx
/*
 * gcm_ghash_avx -- fold a buffer of 16-byte blocks into a 16-byte hash
 * state using AVX and carry-less multiplication (vpclmulqdq).
 *
 * Register use as seen in this routine:
 *   %rdi  16-byte hash state: loaded on entry, stored back on exit
 *   %rsi  table of precomputed multipliers; biased by +64 on entry so the
 *         entries are addressed as N-64(%rsi)
 *   %rdx  input pointer, read in 16-byte blocks
 *   %rcx  remaining byte count
 * Besides the argument registers only %r10 and %xmm0-%xmm15 are written;
 * nothing is spilled to the stack.
 *
 * NOTE(review): presumably matches OpenSSL's
 *   void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16],
 *                      const u8 *inp, size_t len)
 * with len a non-zero multiple of 16 -- confirm against the callers. The
 * code itself never checks for len == 0 or for a partial trailing block.
 */
1423.globl	gcm_ghash_avx
1424.type	gcm_ghash_avx,@function
1425.align	32
1426gcm_ghash_avx:
1427.cfi_startproc
/* Bytes F3 0F 1E FA are the encoding of endbr64. */
1428.byte	243,15,30,250
1429	vzeroupper
1430
/*
 * xmm10 = hash state, byte-reversed with the .Lbswap_mask shuffle (the
 * mask stays in xmm13 for the whole routine). r10 = address of
 * .L0x1c2_polynomial, used by the reduction steps.
 */
1431	vmovdqu	(%rdi),%xmm10
1432	leaq	.L0x1c2_polynomial(%rip),%r10
1433	leaq	64(%rsi),%rsi
1434	vmovdqu	.Lbswap_mask(%rip),%xmm13
1435	vpshufb	%xmm13,%xmm10,%xmm10
/* Fewer than 0x80 (128) bytes, unsigned compare: take the short path. */
1436	cmpq	$0x80,%rcx
1437	jb	.Lshort_avx
1438	subq	$0x80,%rcx
1439
/*
 * First 128-byte group: start the carry-less multiplies for the blocks at
 * 112(%rdx) down to (%rdx). Each block is byte-reversed, then multiplied
 * low*low ($0x00) and high*high ($0x11) by a table entry held in xmm6.
 * Its two halves are also XORed together (vpunpckhqdq + vpxor) and that
 * value is multiplied by one half of a table entry held in xmm7
 * ($0x00 selects the low half of xmm7, $0x10 the high half).
 * Partial products accumulate alternately in xmm0/xmm3 (low), xmm1/xmm4
 * (high) and xmm2/xmm5 (folded halves).
 */
1440	vmovdqu	112(%rdx),%xmm14
1441	vmovdqu	0-64(%rsi),%xmm6
1442	vpshufb	%xmm13,%xmm14,%xmm14
1443	vmovdqu	32-64(%rsi),%xmm7
1444
1445	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1446	vmovdqu	96(%rdx),%xmm15
1447	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1448	vpxor	%xmm14,%xmm9,%xmm9
1449	vpshufb	%xmm13,%xmm15,%xmm15
1450	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1451	vmovdqu	16-64(%rsi),%xmm6
1452	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1453	vmovdqu	80(%rdx),%xmm14
1454	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1455	vpxor	%xmm15,%xmm8,%xmm8
1456
1457	vpshufb	%xmm13,%xmm14,%xmm14
1458	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1459	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1460	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1461	vmovdqu	48-64(%rsi),%xmm6
1462	vpxor	%xmm14,%xmm9,%xmm9
1463	vmovdqu	64(%rdx),%xmm15
1464	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1465	vmovdqu	80-64(%rsi),%xmm7
1466
1467	vpshufb	%xmm13,%xmm15,%xmm15
1468	vpxor	%xmm0,%xmm3,%xmm3
1469	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1470	vpxor	%xmm1,%xmm4,%xmm4
1471	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1472	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1473	vmovdqu	64-64(%rsi),%xmm6
1474	vpxor	%xmm2,%xmm5,%xmm5
1475	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1476	vpxor	%xmm15,%xmm8,%xmm8
1477
1478	vmovdqu	48(%rdx),%xmm14
1479	vpxor	%xmm3,%xmm0,%xmm0
1480	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1481	vpxor	%xmm4,%xmm1,%xmm1
1482	vpshufb	%xmm13,%xmm14,%xmm14
1483	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1484	vmovdqu	96-64(%rsi),%xmm6
1485	vpxor	%xmm5,%xmm2,%xmm2
1486	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1487	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1488	vmovdqu	128-64(%rsi),%xmm7
1489	vpxor	%xmm14,%xmm9,%xmm9
1490
1491	vmovdqu	32(%rdx),%xmm15
1492	vpxor	%xmm0,%xmm3,%xmm3
1493	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1494	vpxor	%xmm1,%xmm4,%xmm4
1495	vpshufb	%xmm13,%xmm15,%xmm15
1496	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1497	vmovdqu	112-64(%rsi),%xmm6
1498	vpxor	%xmm2,%xmm5,%xmm5
1499	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1500	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1501	vpxor	%xmm15,%xmm8,%xmm8
1502
1503	vmovdqu	16(%rdx),%xmm14
1504	vpxor	%xmm3,%xmm0,%xmm0
1505	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1506	vpxor	%xmm4,%xmm1,%xmm1
1507	vpshufb	%xmm13,%xmm14,%xmm14
1508	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1509	vmovdqu	144-64(%rsi),%xmm6
1510	vpxor	%xmm5,%xmm2,%xmm2
1511	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1512	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1513	vmovdqu	176-64(%rsi),%xmm7
1514	vpxor	%xmm14,%xmm9,%xmm9
1515
1516	vmovdqu	(%rdx),%xmm15
1517	vpxor	%xmm0,%xmm3,%xmm3
1518	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1519	vpxor	%xmm1,%xmm4,%xmm4
1520	vpshufb	%xmm13,%xmm15,%xmm15
1521	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1522	vmovdqu	160-64(%rsi),%xmm6
1523	vpxor	%xmm2,%xmm5,%xmm5
1524	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
1525
/*
 * Advance past the group. The block from (%rdx) is still pending in xmm15.
 * If fewer than 128 bytes remain, finish in .Ltail_avx (which XORs the
 * state into the pending block); otherwise XOR the state in here and enter
 * the main loop.
 */
1526	leaq	128(%rdx),%rdx
1527	cmpq	$0x80,%rcx
1528	jb	.Ltail_avx
1529
1530	vpxor	%xmm10,%xmm15,%xmm15
1531	subq	$0x80,%rcx
1532	jmp	.Loop8x_avx
1533
/*
 * Main loop, 128 bytes per iteration. Each pass finishes the multiply for
 * the pending block in xmm15, merges the three partial products into a
 * 256-bit value (xmm10 = low half, xmm11 = high half), reduces it in two
 * steps (the vpalignr / vpclmulqdq-by-(%r10) pairs), and in between starts
 * the multiplies for the next eight blocks.
 */
1534.align	32
1535.Loop8x_avx:
1536	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1537	vmovdqu	112(%rdx),%xmm14
1538	vpxor	%xmm0,%xmm3,%xmm3
1539	vpxor	%xmm15,%xmm8,%xmm8
1540	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm10
1541	vpshufb	%xmm13,%xmm14,%xmm14
1542	vpxor	%xmm1,%xmm4,%xmm4
1543	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm11
1544	vmovdqu	0-64(%rsi),%xmm6
1545	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1546	vpxor	%xmm2,%xmm5,%xmm5
1547	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm12
1548	vmovdqu	32-64(%rsi),%xmm7
1549	vpxor	%xmm14,%xmm9,%xmm9
1550
1551	vmovdqu	96(%rdx),%xmm15
1552	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1553	vpxor	%xmm3,%xmm10,%xmm10
1554	vpshufb	%xmm13,%xmm15,%xmm15
1555	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1556	vxorps	%xmm4,%xmm11,%xmm11
1557	vmovdqu	16-64(%rsi),%xmm6
1558	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1559	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1560	vpxor	%xmm5,%xmm12,%xmm12
1561	vxorps	%xmm15,%xmm8,%xmm8
1562
1563	vmovdqu	80(%rdx),%xmm14
1564	vpxor	%xmm10,%xmm12,%xmm12
1565	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1566	vpxor	%xmm11,%xmm12,%xmm12
1567	vpslldq	$8,%xmm12,%xmm9
1568	vpxor	%xmm0,%xmm3,%xmm3
1569	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1570	vpsrldq	$8,%xmm12,%xmm12
1571	vpxor	%xmm9,%xmm10,%xmm10
1572	vmovdqu	48-64(%rsi),%xmm6
1573	vpshufb	%xmm13,%xmm14,%xmm14
1574	vxorps	%xmm12,%xmm11,%xmm11
1575	vpxor	%xmm1,%xmm4,%xmm4
1576	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1577	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1578	vmovdqu	80-64(%rsi),%xmm7
1579	vpxor	%xmm14,%xmm9,%xmm9
1580	vpxor	%xmm2,%xmm5,%xmm5
1581
1582	vmovdqu	64(%rdx),%xmm15
1583	vpalignr	$8,%xmm10,%xmm10,%xmm12
1584	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1585	vpshufb	%xmm13,%xmm15,%xmm15
1586	vpxor	%xmm3,%xmm0,%xmm0
1587	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1588	vmovdqu	64-64(%rsi),%xmm6
1589	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1590	vpxor	%xmm4,%xmm1,%xmm1
1591	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1592	vxorps	%xmm15,%xmm8,%xmm8
1593	vpxor	%xmm5,%xmm2,%xmm2
1594
1595	vmovdqu	48(%rdx),%xmm14
1596	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
1597	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1598	vpshufb	%xmm13,%xmm14,%xmm14
1599	vpxor	%xmm0,%xmm3,%xmm3
1600	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1601	vmovdqu	96-64(%rsi),%xmm6
1602	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1603	vpxor	%xmm1,%xmm4,%xmm4
1604	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1605	vmovdqu	128-64(%rsi),%xmm7
1606	vpxor	%xmm14,%xmm9,%xmm9
1607	vpxor	%xmm2,%xmm5,%xmm5
1608
1609	vmovdqu	32(%rdx),%xmm15
1610	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1611	vpshufb	%xmm13,%xmm15,%xmm15
1612	vpxor	%xmm3,%xmm0,%xmm0
1613	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1614	vmovdqu	112-64(%rsi),%xmm6
1615	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1616	vpxor	%xmm4,%xmm1,%xmm1
1617	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1618	vpxor	%xmm15,%xmm8,%xmm8
1619	vpxor	%xmm5,%xmm2,%xmm2
1620	vxorps	%xmm12,%xmm10,%xmm10
1621
1622	vmovdqu	16(%rdx),%xmm14
1623	vpalignr	$8,%xmm10,%xmm10,%xmm12
1624	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1625	vpshufb	%xmm13,%xmm14,%xmm14
1626	vpxor	%xmm0,%xmm3,%xmm3
1627	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1628	vmovdqu	144-64(%rsi),%xmm6
1629	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
1630	vxorps	%xmm11,%xmm12,%xmm12
1631	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1632	vpxor	%xmm1,%xmm4,%xmm4
1633	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1634	vmovdqu	176-64(%rsi),%xmm7
1635	vpxor	%xmm14,%xmm9,%xmm9
1636	vpxor	%xmm2,%xmm5,%xmm5
1637
/*
 * The block at (%rdx) becomes the new pending block; the two remaining
 * pieces of the reduced previous result (xmm12 and xmm10) are XORed into
 * it, so the running state is carried inside xmm15.
 */
1638	vmovdqu	(%rdx),%xmm15
1639	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1640	vpshufb	%xmm13,%xmm15,%xmm15
1641	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1642	vmovdqu	160-64(%rsi),%xmm6
1643	vpxor	%xmm12,%xmm15,%xmm15
1644	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
1645	vpxor	%xmm10,%xmm15,%xmm15
1646
/*
 * Loop while at least 128 more bytes remain: subq sets CF once rcx drops
 * below 0x80, and the addq then restores the true remainder. The pending
 * block already contains the state, so enter the tail past its vpxor.
 */
1647	leaq	128(%rdx),%rdx
1648	subq	$0x80,%rcx
1649	jnc	.Loop8x_avx
1650
1651	addq	$0x80,%rcx
1652	jmp	.Ltail_no_xor_avx
1653
/*
 * Short path: starting from the last 16-byte block of the remaining input
 * (rdx is moved to the end of the input), walk backward one block at a
 * time. The last block is paired with the first table entry, the one
 * before it with the next entry, and so on. rcx drops by 0x10 per block
 * and control leaves for .Ltail_avx as soon as it reaches zero; the final
 * multiply for the block left in xmm15 is done there. At most seven blocks
 * are consumed per pass.
 */
1654.align	32
1655.Lshort_avx:
1656	vmovdqu	-16(%rdx,%rcx,1),%xmm14
1657	leaq	(%rdx,%rcx,1),%rdx
1658	vmovdqu	0-64(%rsi),%xmm6
1659	vmovdqu	32-64(%rsi),%xmm7
1660	vpshufb	%xmm13,%xmm14,%xmm15
1661
/*
 * Copy xmm0/xmm1/xmm2 into xmm3/xmm4/xmm5 so that the next
 * "vpxor xmm0,xmm3,xmm3" style instruction yields zero: this clears the
 * accumulators whatever the registers held before.
 */
1662	vmovdqa	%xmm0,%xmm3
1663	vmovdqa	%xmm1,%xmm4
1664	vmovdqa	%xmm2,%xmm5
1665	subq	$0x10,%rcx
1666	jz	.Ltail_avx
1667
1668	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1669	vpxor	%xmm0,%xmm3,%xmm3
1670	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1671	vpxor	%xmm15,%xmm8,%xmm8
1672	vmovdqu	-32(%rdx),%xmm14
1673	vpxor	%xmm1,%xmm4,%xmm4
1674	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1675	vmovdqu	16-64(%rsi),%xmm6
1676	vpshufb	%xmm13,%xmm14,%xmm15
1677	vpxor	%xmm2,%xmm5,%xmm5
1678	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
/* Move the high half of xmm7 down so the next $0x00 multiply uses it. */
1679	vpsrldq	$8,%xmm7,%xmm7
1680	subq	$0x10,%rcx
1681	jz	.Ltail_avx
1682
1683	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1684	vpxor	%xmm0,%xmm3,%xmm3
1685	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1686	vpxor	%xmm15,%xmm8,%xmm8
1687	vmovdqu	-48(%rdx),%xmm14
1688	vpxor	%xmm1,%xmm4,%xmm4
1689	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1690	vmovdqu	48-64(%rsi),%xmm6
1691	vpshufb	%xmm13,%xmm14,%xmm15
1692	vpxor	%xmm2,%xmm5,%xmm5
1693	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1694	vmovdqu	80-64(%rsi),%xmm7
1695	subq	$0x10,%rcx
1696	jz	.Ltail_avx
1697
1698	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1699	vpxor	%xmm0,%xmm3,%xmm3
1700	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1701	vpxor	%xmm15,%xmm8,%xmm8
1702	vmovdqu	-64(%rdx),%xmm14
1703	vpxor	%xmm1,%xmm4,%xmm4
1704	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1705	vmovdqu	64-64(%rsi),%xmm6
1706	vpshufb	%xmm13,%xmm14,%xmm15
1707	vpxor	%xmm2,%xmm5,%xmm5
1708	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1709	vpsrldq	$8,%xmm7,%xmm7
1710	subq	$0x10,%rcx
1711	jz	.Ltail_avx
1712
1713	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1714	vpxor	%xmm0,%xmm3,%xmm3
1715	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1716	vpxor	%xmm15,%xmm8,%xmm8
1717	vmovdqu	-80(%rdx),%xmm14
1718	vpxor	%xmm1,%xmm4,%xmm4
1719	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1720	vmovdqu	96-64(%rsi),%xmm6
1721	vpshufb	%xmm13,%xmm14,%xmm15
1722	vpxor	%xmm2,%xmm5,%xmm5
1723	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1724	vmovdqu	128-64(%rsi),%xmm7
1725	subq	$0x10,%rcx
1726	jz	.Ltail_avx
1727
1728	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1729	vpxor	%xmm0,%xmm3,%xmm3
1730	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1731	vpxor	%xmm15,%xmm8,%xmm8
1732	vmovdqu	-96(%rdx),%xmm14
1733	vpxor	%xmm1,%xmm4,%xmm4
1734	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1735	vmovdqu	112-64(%rsi),%xmm6
1736	vpshufb	%xmm13,%xmm14,%xmm15
1737	vpxor	%xmm2,%xmm5,%xmm5
1738	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1739	vpsrldq	$8,%xmm7,%xmm7
1740	subq	$0x10,%rcx
1741	jz	.Ltail_avx
1742
1743	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1744	vpxor	%xmm0,%xmm3,%xmm3
1745	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1746	vpxor	%xmm15,%xmm8,%xmm8
1747	vmovdqu	-112(%rdx),%xmm14
1748	vpxor	%xmm1,%xmm4,%xmm4
1749	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1750	vmovdqu	144-64(%rsi),%xmm6
1751	vpshufb	%xmm13,%xmm14,%xmm15
1752	vpxor	%xmm2,%xmm5,%xmm5
1753	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
/*
 * vmovq loads only the 8 bytes at table offset 184, i.e. the upper half
 * of the 16-byte entry at offset 176, into the low half of xmm7.
 */
1754	vmovq	184-64(%rsi),%xmm7
1755	subq	$0x10,%rcx
1756	jmp	.Ltail_avx
1757
/*
 * Tail: XOR the running state into the pending block. Callers that have
 * already done so enter at .Ltail_no_xor_avx instead.
 */
1758.align	32
1759.Ltail_avx:
1760	vpxor	%xmm10,%xmm15,%xmm15
1761.Ltail_no_xor_avx:
/* Finish the multiply for the pending block in xmm15. */
1762	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1763	vpxor	%xmm0,%xmm3,%xmm3
1764	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1765	vpxor	%xmm15,%xmm8,%xmm8
1766	vpxor	%xmm1,%xmm4,%xmm4
1767	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1768	vpxor	%xmm2,%xmm5,%xmm5
1769	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1770
/* xmm12 = the 16-byte constant at .L0x1c2_polynomial. */
1771	vmovdqu	(%r10),%xmm12
1772
/*
 * Merge the accumulators: xmm10 = low product, xmm11 = high product,
 * xmm5 = folded product XOR low XOR high. xmm5 is then split and XORed
 * into the top half of xmm10 and the bottom half of xmm11, giving the
 * 256-bit result as xmm11:xmm10.
 */
1773	vpxor	%xmm0,%xmm3,%xmm10
1774	vpxor	%xmm1,%xmm4,%xmm11
1775	vpxor	%xmm2,%xmm5,%xmm5
1776
1777	vpxor	%xmm10,%xmm5,%xmm5
1778	vpxor	%xmm11,%xmm5,%xmm5
1779	vpslldq	$8,%xmm5,%xmm9
1780	vpsrldq	$8,%xmm5,%xmm5
1781	vpxor	%xmm9,%xmm10,%xmm10
1782	vpxor	%xmm5,%xmm11,%xmm11
1783
/*
 * Two-step reduction: each step multiplies the low half of xmm10 by the
 * high half of the constant, swaps the halves of xmm10 (vpalignr $8) and
 * XORs the product in. The high half xmm11 is folded in during the second
 * step, leaving the new 128-bit state in xmm10.
 */
1784	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
1785	vpalignr	$8,%xmm10,%xmm10,%xmm10
1786	vpxor	%xmm9,%xmm10,%xmm10
1787
1788	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
1789	vpalignr	$8,%xmm10,%xmm10,%xmm10
1790	vpxor	%xmm11,%xmm10,%xmm10
1791	vpxor	%xmm9,%xmm10,%xmm10
1792
/* Input left over (rcx != 0): run another short pass. */
1793	cmpq	$0,%rcx
1794	jne	.Lshort_avx
1795
/* Byte-reverse the state back and store it through rdi. */
1796	vpshufb	%xmm13,%xmm10,%xmm10
1797	vmovdqu	%xmm10,(%rdi)
1798	vzeroupper
/* Bytes F3 C3 are the encoding of "rep ret". */
1799	.byte	0xf3,0xc3
1800.cfi_endproc
1801.size	gcm_ghash_avx,.-gcm_ghash_avx
/*
 * Constants for the vector GHASH code, starting on a 64-byte boundary.
 *
 * .Lbswap_mask        shuffle control bytes 15,14,...,0. gcm_ghash_avx
 *                     loads it into xmm13 and uses it with vpshufb to
 *                     reverse the byte order of 16-byte values.
 * .L0x1c2_polynomial  16 bytes with 0x01 in the lowest byte and 0xc2 in the
 *                     highest. gcm_ghash_avx addresses it through r10 and
 *                     multiplies by its high half in the reduction steps.
 * .L7_mask            dwords 7,0,7,0.
 * .L7_mask_poly       dwords 7,0,450,0 (450 = 0x1c2).
 * NOTE(review): the two .L7_* constants are not referenced in this part of
 * the file; presumably the CLMUL routines earlier in the file use them --
 * confirm.
 */
1802.align	64
1803.Lbswap_mask:
1804.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
1805.L0x1c2_polynomial:
1806.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
1807.L7_mask:
1808.long	7,0,7,0
1809.L7_mask_poly:
1810.long	7,0,450,0
/*
 * .Lrem_4bit -- 16 entries of 8 bytes each, written as pairs of dwords with
 * the low dword always zero. gcm_gmult_4bit takes the table address in r11
 * and XORs in the entry at (%r11,%rdx,8), where rdx has been masked to its
 * low 4 bits. Entry 1 is 471859200 = 0x1C200000 in the high dword, i.e. the
 * 64-bit value 0x1C20 shifted left by 48.
 */
1811.align	64
1812.type	.Lrem_4bit,@object
1813.Lrem_4bit:
1814.long	0,0,0,471859200,0,943718400,0,610271232
1815.long	0,1887436800,0,1822425088,0,1220542464,0,1423966208
1816.long	0,3774873600,0,4246732800,0,3644850176,0,3311403008
1817.long	0,2441084928,0,2376073216,0,2847932416,0,3051356160
/*
 * .Lrem_8bit -- 256 entries of 16 bits each (32 rows of 8), placed directly
 * after .Lrem_4bit with no extra alignment directive.
 * NOTE(review): no reference to this table appears in this part of the
 * file; presumably it is the byte-indexed counterpart of .Lrem_4bit used by
 * gcm_ghash_4bit -- confirm.
 */
1818.type	.Lrem_8bit,@object
1819.Lrem_8bit:
1820.value	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
1821.value	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
1822.value	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
1823.value	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
1824.value	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
1825.value	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
1826.value	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
1827.value	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
1828.value	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
1829.value	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
1830.value	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
1831.value	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
1832.value	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
1833.value	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
1834.value	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
1835.value	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
1836.value	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
1837.value	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
1838.value	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
1839.value	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
1840.value	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
1841.value	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
1842.value	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
1843.value	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
1844.value	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
1845.value	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
1846.value	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
1847.value	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
1848.value	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
1849.value	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
1850.value	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
1851.value	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
1852
/*
 * NUL-terminated ASCII identification string embedded in the object:
 * "GHASH for x86_64, CRYPTOGAMS by <appro@openssl.org>".
 * It has no label, so code cannot refer to it by name.
 */
1853.byte	71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1854.align	64
/*
 * ELF note in the allocatable section .note.gnu.property.
 * Layout, using the numeric local labels below:
 *   name size  = 1f - 0f  (the 4 bytes "GNU" plus NUL)
 *   desc size  = 4f - 1f
 *   note type  = 5
 *   name       = "GNU\0"
 *   descriptor = one property: type 0xc0000002, data size 3f - 2f (4 bytes),
 *                data value 3, padded to an 8-byte boundary
 * NOTE(review): per the x86-64 psABI, note type 5 is
 * NT_GNU_PROPERTY_TYPE_0, property 0xc0000002 is
 * GNU_PROPERTY_X86_FEATURE_1_AND, and value 3 sets both the IBT and SHSTK
 * bits -- confirm against the ABI document.
 */
1855	.section ".note.gnu.property", "a"
1856	.p2align 3
1857	.long 1f - 0f
1858	.long 4f - 1f
1859	.long 5
18600:
1861	# "GNU" encoded with .byte, since .asciz isn't supported
1862	# on Solaris.
1863	.byte 0x47
1864	.byte 0x4e
1865	.byte 0x55
1866	.byte 0
18671:
1868	.p2align 3
1869	.long 0xc0000002
1870	.long 3f - 2f
18712:
1872	.long 3
18733:
1874	.p2align 3
18754:
1876