xref: /titanic_50/usr/src/common/crypto/modes/amd64/gcm_intel.s (revision 8de5c4f463386063e184a851437d58080c6c626c)
1104d3bdeSDan OpenSolaris Anderson/*
2104d3bdeSDan OpenSolaris Anderson * CDDL HEADER START
3104d3bdeSDan OpenSolaris Anderson *
4104d3bdeSDan OpenSolaris Anderson * The contents of this file are subject to the terms of the
5104d3bdeSDan OpenSolaris Anderson * Common Development and Distribution License (the "License").
6104d3bdeSDan OpenSolaris Anderson * You may not use this file except in compliance with the License.
7104d3bdeSDan OpenSolaris Anderson *
8104d3bdeSDan OpenSolaris Anderson * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9104d3bdeSDan OpenSolaris Anderson * or http://www.opensolaris.org/os/licensing.
10104d3bdeSDan OpenSolaris Anderson * See the License for the specific language governing permissions
11104d3bdeSDan OpenSolaris Anderson * and limitations under the License.
12104d3bdeSDan OpenSolaris Anderson *
13104d3bdeSDan OpenSolaris Anderson * When distributing Covered Code, include this CDDL HEADER in each
14104d3bdeSDan OpenSolaris Anderson * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15104d3bdeSDan OpenSolaris Anderson * If applicable, add the following below this CDDL HEADER, with the
16104d3bdeSDan OpenSolaris Anderson * fields enclosed by brackets "[]" replaced with your own identifying
17104d3bdeSDan OpenSolaris Anderson * information: Portions Copyright [yyyy] [name of copyright owner]
18104d3bdeSDan OpenSolaris Anderson *
19104d3bdeSDan OpenSolaris Anderson * CDDL HEADER END
20104d3bdeSDan OpenSolaris Anderson */
21104d3bdeSDan OpenSolaris Anderson
22104d3bdeSDan OpenSolaris Anderson/*
23104d3bdeSDan OpenSolaris Anderson * Copyright (c) 2009 Intel Corporation
24104d3bdeSDan OpenSolaris Anderson * All Rights Reserved.
25104d3bdeSDan OpenSolaris Anderson */
26104d3bdeSDan OpenSolaris Anderson/*
27104d3bdeSDan OpenSolaris Anderson * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
28104d3bdeSDan OpenSolaris Anderson * Use is subject to license terms.
29104d3bdeSDan OpenSolaris Anderson */
30104d3bdeSDan OpenSolaris Anderson
31104d3bdeSDan OpenSolaris Anderson/*
32104d3bdeSDan OpenSolaris Anderson * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
33104d3bdeSDan OpenSolaris Anderson * instructions.  This file contains an accelerated
34104d3bdeSDan OpenSolaris Anderson * Galois Field Multiplication implementation.
35104d3bdeSDan OpenSolaris Anderson *
36104d3bdeSDan OpenSolaris Anderson * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
37104d3bdeSDan OpenSolaris Anderson * carry-less multiplication. More information about PCLMULQDQ can be
38104d3bdeSDan OpenSolaris Anderson * found at:
39104d3bdeSDan OpenSolaris Anderson * http://software.intel.com/en-us/articles/
40104d3bdeSDan OpenSolaris Anderson * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
41104d3bdeSDan OpenSolaris Anderson *
42104d3bdeSDan OpenSolaris Anderson */
43104d3bdeSDan OpenSolaris Anderson
44104d3bdeSDan OpenSolaris Anderson/*
45104d3bdeSDan OpenSolaris Anderson * ====================================================================
46104d3bdeSDan OpenSolaris Anderson * OpenSolaris OS modifications
47104d3bdeSDan OpenSolaris Anderson *
48104d3bdeSDan OpenSolaris Anderson * This source originates as file galois_hash_asm.c from
49104d3bdeSDan OpenSolaris Anderson * Intel Corporation dated September 21, 2009.
50104d3bdeSDan OpenSolaris Anderson *
51104d3bdeSDan OpenSolaris Anderson * This OpenSolaris version has these major changes from the original source:
52104d3bdeSDan OpenSolaris Anderson *
53104d3bdeSDan OpenSolaris Anderson * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
54104d3bdeSDan OpenSolaris Anderson * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function
55104d3bdeSDan OpenSolaris Anderson * definition for lint.
56104d3bdeSDan OpenSolaris Anderson *
57104d3bdeSDan OpenSolaris Anderson * 2. Formatted code, added comments, and added #includes and #defines.
58104d3bdeSDan OpenSolaris Anderson *
59*8de5c4f4SDan OpenSolaris Anderson * 3. If bit CR0.TS is set, clear the TS bit on entry and set it again on
60104d3bdeSDan OpenSolaris Anderson * exit (that is, after kpreempt_disable() and before kpreempt_enable()).
61104d3bdeSDan OpenSolaris Anderson * If the TS bit is not set, save and restore %xmm registers at the beginning
62104d3bdeSDan OpenSolaris Anderson * and end of function calls (%xmm* registers are not saved and restored
63104d3bdeSDan OpenSolaris Anderson * during kernel thread preemption).
64104d3bdeSDan OpenSolaris Anderson *
65*8de5c4f4SDan OpenSolaris Anderson * 4. Removed code to perform hashing.  This is already done with C macro
66104d3bdeSDan OpenSolaris Anderson * GHASH in gcm.c.  For better performance, this removed code should be
67104d3bdeSDan OpenSolaris Anderson * reintegrated in the future to replace the C GHASH macro.
68104d3bdeSDan OpenSolaris Anderson *
69*8de5c4f4SDan OpenSolaris Anderson * 5. Added code to byte swap 16-byte input and output.
70104d3bdeSDan OpenSolaris Anderson *
71*8de5c4f4SDan OpenSolaris Anderson * 6. Folded in comments from the original C source with embedded assembly
72104d3bdeSDan OpenSolaris Anderson * (SB_w_shift_xor.c)
73104d3bdeSDan OpenSolaris Anderson *
74*8de5c4f4SDan OpenSolaris Anderson * 7. Renamed function and reordered parameters to match OpenSolaris:
75104d3bdeSDan OpenSolaris Anderson * Intel interface:
76104d3bdeSDan OpenSolaris Anderson *	void galois_hash_asm(unsigned char *hk, unsigned char *s,
77104d3bdeSDan OpenSolaris Anderson *		unsigned char *d, int length)
78104d3bdeSDan OpenSolaris Anderson * OpenSolaris OS interface:
79104d3bdeSDan OpenSolaris Anderson *	void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
80104d3bdeSDan OpenSolaris Anderson * ====================================================================
81104d3bdeSDan OpenSolaris Anderson */
82104d3bdeSDan OpenSolaris Anderson
83104d3bdeSDan OpenSolaris Anderson
84104d3bdeSDan OpenSolaris Anderson#if defined(lint) || defined(__lint)
85104d3bdeSDan OpenSolaris Anderson
86104d3bdeSDan OpenSolaris Anderson#include <sys/types.h>
87104d3bdeSDan OpenSolaris Anderson
88104d3bdeSDan OpenSolaris Anderson/* ARGSUSED */
89104d3bdeSDan OpenSolaris Andersonvoid
90104d3bdeSDan OpenSolaris Andersongcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) {
	/* Intentionally empty: dummy C definition for lint; the real code is the assembly in the non-lint branch. */
91104d3bdeSDan OpenSolaris Anderson}
92104d3bdeSDan OpenSolaris Anderson
93104d3bdeSDan OpenSolaris Anderson#else	/* lint */
94104d3bdeSDan OpenSolaris Anderson
95104d3bdeSDan OpenSolaris Anderson#include <sys/asm_linkage.h>
96104d3bdeSDan OpenSolaris Anderson#include <sys/controlregs.h>
97104d3bdeSDan OpenSolaris Anderson#ifdef _KERNEL
98104d3bdeSDan OpenSolaris Anderson#include <sys/machprivregs.h>
99104d3bdeSDan OpenSolaris Anderson#endif
100104d3bdeSDan OpenSolaris Anderson
101104d3bdeSDan OpenSolaris Anderson#ifdef _KERNEL
102104d3bdeSDan OpenSolaris Anderson	/*
103104d3bdeSDan OpenSolaris Anderson	 * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv.  That is,
104104d3bdeSDan OpenSolaris Anderson	 * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it
105104d3bdeSDan OpenSolaris Anderson	 * uses it to pass P2 to syscall.  PROTECTED_CLTS therefore saves and
106104d3bdeSDan OpenSolaris Anderson	 * restores %rsi around CLTS when built for __xpv.  The STTS macro
107104d3bdeSDan OpenSolaris Anderson	 * clobbers %rsi too, but we don't care just before function exit.
108104d3bdeSDan OpenSolaris Anderson	 * The CLTS and STTS macros push and pop P1 (%rdi) already.
109104d3bdeSDan OpenSolaris Anderson	 */
110104d3bdeSDan OpenSolaris Anderson#ifdef __xpv
111104d3bdeSDan OpenSolaris Anderson#define	PROTECTED_CLTS \
112104d3bdeSDan OpenSolaris Anderson	push	%rsi; \
113104d3bdeSDan OpenSolaris Anderson	CLTS; \
114104d3bdeSDan OpenSolaris Anderson	pop	%rsi
115104d3bdeSDan OpenSolaris Anderson#else
116104d3bdeSDan OpenSolaris Anderson#define	PROTECTED_CLTS \
117104d3bdeSDan OpenSolaris Anderson	CLTS
118104d3bdeSDan OpenSolaris Anderson#endif	/* __xpv */
119104d3bdeSDan OpenSolaris Anderson
120104d3bdeSDan OpenSolaris Anderson	/*
121104d3bdeSDan OpenSolaris Anderson	 * Save %rbp and the incoming %rsp (in %rbp), then read %cr0 into tmpreg.
122104d3bdeSDan OpenSolaris Anderson	 * If CR0_TS is not set, align %rsp down to XMM_ALIGN and store
	 * %xmm0 - %xmm10 on the stack; otherwise clear CR0_TS (PROTECTED_CLTS).
	 * tmpreg must still hold this %cr0 value when
	 * SET_TS_OR_POP_XMM_REGISTERS runs, since that macro tests it again.
123104d3bdeSDan OpenSolaris Anderson	 */
124104d3bdeSDan OpenSolaris Anderson#define	CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg) \
125104d3bdeSDan OpenSolaris Anderson	push	%rbp; \
126104d3bdeSDan OpenSolaris Anderson	mov	%rsp, %rbp; \
127104d3bdeSDan OpenSolaris Anderson	movq	%cr0, tmpreg; \
128104d3bdeSDan OpenSolaris Anderson	testq	$CR0_TS, tmpreg; \
129104d3bdeSDan OpenSolaris Anderson	jnz	1f; \
130104d3bdeSDan OpenSolaris Anderson	and	$-XMM_ALIGN, %rsp; \
131104d3bdeSDan OpenSolaris Anderson	sub	$[XMM_SIZE * 11], %rsp; \
132104d3bdeSDan OpenSolaris Anderson	movaps	%xmm0, 160(%rsp); \
133104d3bdeSDan OpenSolaris Anderson	movaps	%xmm1, 144(%rsp); \
134104d3bdeSDan OpenSolaris Anderson	movaps	%xmm2, 128(%rsp); \
135104d3bdeSDan OpenSolaris Anderson	movaps	%xmm3, 112(%rsp); \
136104d3bdeSDan OpenSolaris Anderson	movaps	%xmm4, 96(%rsp); \
137104d3bdeSDan OpenSolaris Anderson	movaps	%xmm5, 80(%rsp); \
138104d3bdeSDan OpenSolaris Anderson	movaps	%xmm6, 64(%rsp); \
139104d3bdeSDan OpenSolaris Anderson	movaps	%xmm7, 48(%rsp); \
140104d3bdeSDan OpenSolaris Anderson	movaps	%xmm8, 32(%rsp); \
141104d3bdeSDan OpenSolaris Anderson	movaps	%xmm9, 16(%rsp); \
142104d3bdeSDan OpenSolaris Anderson	movaps	%xmm10, (%rsp); \
143104d3bdeSDan OpenSolaris Anderson	jmp	2f; \
144104d3bdeSDan OpenSolaris Anderson1: \
145104d3bdeSDan OpenSolaris Anderson	PROTECTED_CLTS; \
146104d3bdeSDan OpenSolaris Anderson2:
147104d3bdeSDan OpenSolaris Anderson
148104d3bdeSDan OpenSolaris Anderson
149104d3bdeSDan OpenSolaris Anderson	/*
150104d3bdeSDan OpenSolaris Anderson	 * If CR0_TS was not set on entry (per the %cr0 value still held in
151104d3bdeSDan OpenSolaris Anderson	 * tmpreg), reload %xmm0 - %xmm10 from the stack; otherwise set CR0_TS
	 * again with STTS.  Either way, restore %rsp from %rbp and pop %rbp.
152104d3bdeSDan OpenSolaris Anderson	 */
153104d3bdeSDan OpenSolaris Anderson#define	SET_TS_OR_POP_XMM_REGISTERS(tmpreg) \
154104d3bdeSDan OpenSolaris Anderson	testq	$CR0_TS, tmpreg; \
155104d3bdeSDan OpenSolaris Anderson	jnz	1f; \
156104d3bdeSDan OpenSolaris Anderson	movaps	(%rsp), %xmm10; \
157104d3bdeSDan OpenSolaris Anderson	movaps	16(%rsp), %xmm9; \
158104d3bdeSDan OpenSolaris Anderson	movaps	32(%rsp), %xmm8; \
159104d3bdeSDan OpenSolaris Anderson	movaps	48(%rsp), %xmm7; \
160104d3bdeSDan OpenSolaris Anderson	movaps	64(%rsp), %xmm6; \
161104d3bdeSDan OpenSolaris Anderson	movaps	80(%rsp), %xmm5; \
162104d3bdeSDan OpenSolaris Anderson	movaps	96(%rsp), %xmm4; \
163104d3bdeSDan OpenSolaris Anderson	movaps	112(%rsp), %xmm3; \
164104d3bdeSDan OpenSolaris Anderson	movaps	128(%rsp), %xmm2; \
165104d3bdeSDan OpenSolaris Anderson	movaps	144(%rsp), %xmm1; \
166104d3bdeSDan OpenSolaris Anderson	movaps	160(%rsp), %xmm0; \
167104d3bdeSDan OpenSolaris Anderson	jmp	2f; \
168104d3bdeSDan OpenSolaris Anderson1: \
169104d3bdeSDan OpenSolaris Anderson	STTS(tmpreg); \
170104d3bdeSDan OpenSolaris Anderson2: \
171104d3bdeSDan OpenSolaris Anderson	mov	%rbp, %rsp; \
172104d3bdeSDan OpenSolaris Anderson	pop	%rbp
173104d3bdeSDan OpenSolaris Anderson
174104d3bdeSDan OpenSolaris Anderson
175104d3bdeSDan OpenSolaris Anderson#else	/* !_KERNEL: no CR0.TS or %xmm save/restore; the macros expand to nothing */
176104d3bdeSDan OpenSolaris Anderson#define	PROTECTED_CLTS
177104d3bdeSDan OpenSolaris Anderson#define	CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg)
178104d3bdeSDan OpenSolaris Anderson#define	SET_TS_OR_POP_XMM_REGISTERS(tmpreg)
179104d3bdeSDan OpenSolaris Anderson#endif	/* _KERNEL */
180104d3bdeSDan OpenSolaris Anderson
181104d3bdeSDan OpenSolaris Anderson/*
182104d3bdeSDan OpenSolaris Anderson * Use this mask to byte-swap a 16-byte integer with the pshufb instruction.
 * The mask is aligned to XMM_ALIGN because it is loaded with movaps.
183104d3bdeSDan OpenSolaris Anderson */
184104d3bdeSDan OpenSolaris Anderson
185104d3bdeSDan OpenSolaris Anderson// C equivalent: static uint8_t byte_swap16_mask[] = {
186104d3bdeSDan OpenSolaris Anderson//	 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
187104d3bdeSDan OpenSolaris Anderson.text
188104d3bdeSDan OpenSolaris Anderson.align XMM_ALIGN
189104d3bdeSDan OpenSolaris Anderson.Lbyte_swap16_mask:
190104d3bdeSDan OpenSolaris Anderson	.byte	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
191104d3bdeSDan OpenSolaris Anderson
192104d3bdeSDan OpenSolaris Anderson
193104d3bdeSDan OpenSolaris Anderson
194104d3bdeSDan OpenSolaris Anderson/*
195104d3bdeSDan OpenSolaris Anderson * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
196104d3bdeSDan OpenSolaris Anderson *
197104d3bdeSDan OpenSolaris Anderson * Perform a carry-less multiplication (partial products are combined with
198104d3bdeSDan OpenSolaris Anderson * XOR instead of addition) on P1 and P2 and place the result in P3.
199104d3bdeSDan OpenSolaris Anderson *
200104d3bdeSDan OpenSolaris Anderson * Byte swap the input and the output.
201104d3bdeSDan OpenSolaris Anderson *
202104d3bdeSDan OpenSolaris Anderson * Note: x_in, y, and res each point to a 16-byte number
203104d3bdeSDan OpenSolaris Anderson * (an array of two 64-bit integers).
204104d3bdeSDan OpenSolaris Anderson *
205104d3bdeSDan OpenSolaris Anderson * Note2: For kernel code, caller is responsible for ensuring
206104d3bdeSDan OpenSolaris Anderson * kpreempt_disable() has been called.  This is because %xmm registers are
207104d3bdeSDan OpenSolaris Anderson * not saved/restored.  Clear and set the CR0.TS bit on entry and exit,
208104d3bdeSDan OpenSolaris Anderson * respectively, if TS is set on entry.  Otherwise, if TS is not set,
209104d3bdeSDan OpenSolaris Anderson * save and restore %xmm registers on the stack.
210104d3bdeSDan OpenSolaris Anderson *
211104d3bdeSDan OpenSolaris Anderson * Note3: Original Intel definition:
212104d3bdeSDan OpenSolaris Anderson * void galois_hash_asm(unsigned char *hk, unsigned char *s,
213104d3bdeSDan OpenSolaris Anderson *	unsigned char *d, int length)
214104d3bdeSDan OpenSolaris Anderson *
215104d3bdeSDan OpenSolaris Anderson * Note4: Register/parameter mapping:
216104d3bdeSDan OpenSolaris Anderson * Intel:
217104d3bdeSDan OpenSolaris Anderson *	Parameter 1: %rcx (copied to %xmm0)	hk or x_in
218104d3bdeSDan OpenSolaris Anderson *	Parameter 2: %rdx (copied to %xmm1)	s or y
219104d3bdeSDan OpenSolaris Anderson *	Parameter 3: %rdi (result)		d or res
220104d3bdeSDan OpenSolaris Anderson * OpenSolaris:
221104d3bdeSDan OpenSolaris Anderson *	Parameter 1: %rdi (copied to %xmm0)	x_in
222104d3bdeSDan OpenSolaris Anderson *	Parameter 2: %rsi (copied to %xmm1)	y
223104d3bdeSDan OpenSolaris Anderson *	Parameter 3: %rdx (result)		res
224104d3bdeSDan OpenSolaris Anderson */
225104d3bdeSDan OpenSolaris Anderson
226104d3bdeSDan OpenSolaris AndersonENTRY_NP(gcm_mul_pclmulqdq)
227104d3bdeSDan OpenSolaris Anderson	CLEAR_TS_OR_PUSH_XMM_REGISTERS(%r10)
228104d3bdeSDan OpenSolaris Anderson
229104d3bdeSDan OpenSolaris Anderson	//
230104d3bdeSDan OpenSolaris Anderson	// Copy Parameters (movdqu: unaligned 16-byte loads)
231104d3bdeSDan OpenSolaris Anderson	//
232104d3bdeSDan OpenSolaris Anderson	movdqu	(%rdi), %xmm0	// P1
233104d3bdeSDan OpenSolaris Anderson	movdqu	(%rsi), %xmm1	// P2
234104d3bdeSDan OpenSolaris Anderson
235104d3bdeSDan OpenSolaris Anderson	//
236104d3bdeSDan OpenSolaris Anderson	// Byte swap 16-byte input
237104d3bdeSDan OpenSolaris Anderson	//
238104d3bdeSDan OpenSolaris Anderson	lea	.Lbyte_swap16_mask(%rip), %rax
239104d3bdeSDan OpenSolaris Anderson	movaps	(%rax), %xmm10
240*8de5c4f4SDan OpenSolaris Anderson	pshufb	%xmm10, %xmm0
241*8de5c4f4SDan OpenSolaris Anderson	pshufb	%xmm10, %xmm1
242104d3bdeSDan OpenSolaris Anderson
243104d3bdeSDan OpenSolaris Anderson
244104d3bdeSDan OpenSolaris Anderson	//
245104d3bdeSDan OpenSolaris Anderson	// Multiply with the hash key
246104d3bdeSDan OpenSolaris Anderson	//
247104d3bdeSDan OpenSolaris Anderson	movdqu	%xmm0, %xmm3
248*8de5c4f4SDan OpenSolaris Anderson	pclmulqdq $0, %xmm1, %xmm3	// xmm3 holds a0*b0
249104d3bdeSDan OpenSolaris Anderson
250104d3bdeSDan OpenSolaris Anderson	movdqu	%xmm0, %xmm4
251*8de5c4f4SDan OpenSolaris Anderson	pclmulqdq $16, %xmm1, %xmm4	// xmm4 holds a0*b1
252104d3bdeSDan OpenSolaris Anderson
253104d3bdeSDan OpenSolaris Anderson	movdqu	%xmm0, %xmm5
254*8de5c4f4SDan OpenSolaris Anderson	pclmulqdq $1, %xmm1, %xmm5	// xmm5 holds a1*b0
255104d3bdeSDan OpenSolaris Anderson	movdqu	%xmm0, %xmm6
256*8de5c4f4SDan OpenSolaris Anderson	pclmulqdq $17, %xmm1, %xmm6	// xmm6 holds a1*b1
257104d3bdeSDan OpenSolaris Anderson
258104d3bdeSDan OpenSolaris Anderson	pxor	%xmm5, %xmm4	// xmm4 holds a0*b1 + a1*b0
259104d3bdeSDan OpenSolaris Anderson
260104d3bdeSDan OpenSolaris Anderson	movdqu	%xmm4, %xmm5	// move the contents of xmm4 to xmm5
261104d3bdeSDan OpenSolaris Anderson	psrldq	$8, %xmm4	// shift xmm4 by 64 bits to the right
262104d3bdeSDan OpenSolaris Anderson	pslldq	$8, %xmm5	// shift xmm5 by 64 bits to the left
263104d3bdeSDan OpenSolaris Anderson	pxor	%xmm5, %xmm3
264104d3bdeSDan OpenSolaris Anderson	pxor	%xmm4, %xmm6	// Register pair <xmm6:xmm3> holds the result
265104d3bdeSDan OpenSolaris Anderson				// of the carry-less multiplication of
266104d3bdeSDan OpenSolaris Anderson				// xmm0 by xmm1.
267104d3bdeSDan OpenSolaris Anderson
268104d3bdeSDan OpenSolaris Anderson	// We shift the result of the multiplication by one bit position
269104d3bdeSDan OpenSolaris Anderson	// to the left to account for the fact that the bits are reversed.
270104d3bdeSDan OpenSolaris Anderson	movdqu	%xmm3, %xmm7
271104d3bdeSDan OpenSolaris Anderson	movdqu	%xmm6, %xmm8
272104d3bdeSDan OpenSolaris Anderson	pslld	$1, %xmm3
273104d3bdeSDan OpenSolaris Anderson	pslld	$1, %xmm6
274104d3bdeSDan OpenSolaris Anderson	psrld	$31, %xmm7
275104d3bdeSDan OpenSolaris Anderson	psrld	$31, %xmm8
276104d3bdeSDan OpenSolaris Anderson	movdqu	%xmm7, %xmm9
277104d3bdeSDan OpenSolaris Anderson	pslldq	$4, %xmm8
278104d3bdeSDan OpenSolaris Anderson	pslldq	$4, %xmm7
279104d3bdeSDan OpenSolaris Anderson	psrldq	$12, %xmm9
280104d3bdeSDan OpenSolaris Anderson	por	%xmm7, %xmm3
281104d3bdeSDan OpenSolaris Anderson	por	%xmm8, %xmm6
282104d3bdeSDan OpenSolaris Anderson	por	%xmm9, %xmm6
283104d3bdeSDan OpenSolaris Anderson
284104d3bdeSDan OpenSolaris Anderson	//
285104d3bdeSDan OpenSolaris Anderson	// First phase of the reduction
286104d3bdeSDan OpenSolaris Anderson	//
287104d3bdeSDan OpenSolaris Anderson	// Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
288104d3bdeSDan OpenSolaris Anderson	// independently.
289104d3bdeSDan OpenSolaris Anderson	movdqu	%xmm3, %xmm7
290104d3bdeSDan OpenSolaris Anderson	movdqu	%xmm3, %xmm8
291104d3bdeSDan OpenSolaris Anderson	movdqu	%xmm3, %xmm9
292104d3bdeSDan OpenSolaris Anderson	pslld	$31, %xmm7	// packed left shift << 31
293104d3bdeSDan OpenSolaris Anderson	pslld	$30, %xmm8	// packed left shift << 30
294104d3bdeSDan OpenSolaris Anderson	pslld	$25, %xmm9	// packed left shift << 25
295104d3bdeSDan OpenSolaris Anderson	pxor	%xmm8, %xmm7	// xor the shifted versions
296104d3bdeSDan OpenSolaris Anderson	pxor	%xmm9, %xmm7
297104d3bdeSDan OpenSolaris Anderson	movdqu	%xmm7, %xmm8
298104d3bdeSDan OpenSolaris Anderson	pslldq	$12, %xmm7
299104d3bdeSDan OpenSolaris Anderson	psrldq	$4, %xmm8
300104d3bdeSDan OpenSolaris Anderson	pxor	%xmm7, %xmm3	// first phase of the reduction complete
301104d3bdeSDan OpenSolaris Anderson
302104d3bdeSDan OpenSolaris Anderson	//
303104d3bdeSDan OpenSolaris Anderson	// Second phase of the reduction
304104d3bdeSDan OpenSolaris Anderson	//
305104d3bdeSDan OpenSolaris Anderson	// Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
306104d3bdeSDan OpenSolaris Anderson	// shift operations.
307104d3bdeSDan OpenSolaris Anderson	movdqu	%xmm3, %xmm2
308104d3bdeSDan OpenSolaris Anderson	movdqu	%xmm3, %xmm4
309104d3bdeSDan OpenSolaris Anderson	movdqu	%xmm3, %xmm5
310104d3bdeSDan OpenSolaris Anderson	psrld	$1, %xmm2	// packed right shift >> 1
311104d3bdeSDan OpenSolaris Anderson	psrld	$2, %xmm4	// packed right shift >> 2
312104d3bdeSDan OpenSolaris Anderson	psrld	$7, %xmm5	// packed right shift >> 7
313104d3bdeSDan OpenSolaris Anderson	pxor	%xmm4, %xmm2	// xor the shifted versions
314104d3bdeSDan OpenSolaris Anderson	pxor	%xmm5, %xmm2
315104d3bdeSDan OpenSolaris Anderson	pxor	%xmm8, %xmm2
316104d3bdeSDan OpenSolaris Anderson	pxor	%xmm2, %xmm3
317104d3bdeSDan OpenSolaris Anderson	pxor	%xmm3, %xmm6	// the result is in xmm6
318104d3bdeSDan OpenSolaris Anderson
319104d3bdeSDan OpenSolaris Anderson	//
320104d3bdeSDan OpenSolaris Anderson	// Byte swap 16-byte result
321104d3bdeSDan OpenSolaris Anderson	//
322*8de5c4f4SDan OpenSolaris Anderson	pshufb	%xmm10, %xmm6	// %xmm10 has the swap mask
323104d3bdeSDan OpenSolaris Anderson
324104d3bdeSDan OpenSolaris Anderson	//
325104d3bdeSDan OpenSolaris Anderson	// Store the result
326104d3bdeSDan OpenSolaris Anderson	//
327104d3bdeSDan OpenSolaris Anderson	movdqu	%xmm6, (%rdx)	// P3
328104d3bdeSDan OpenSolaris Anderson
329104d3bdeSDan OpenSolaris Anderson
330104d3bdeSDan OpenSolaris Anderson	//
331104d3bdeSDan OpenSolaris Anderson	// Cleanup and Return
332104d3bdeSDan OpenSolaris Anderson	//
333104d3bdeSDan OpenSolaris Anderson	SET_TS_OR_POP_XMM_REGISTERS(%r10)
334104d3bdeSDan OpenSolaris Anderson	ret
335104d3bdeSDan OpenSolaris Anderson	SET_SIZE(gcm_mul_pclmulqdq)
336104d3bdeSDan OpenSolaris Anderson
337104d3bdeSDan OpenSolaris Anderson#endif	/* lint || __lint */
338