xref: /titanic_50/usr/src/common/crypto/modes/amd64/gcm_intel.s (revision cfed26cb92bcaf129a6022a551f26391e5841135)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2009 Intel Corporation
24 * All Rights Reserved.
25 */
26/*
27 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
28 * Use is subject to license terms.
29 */
30
31/*
32 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
33 * instructions.  This file contains an accelerated
34 * Galois Field Multiplication implementation.
35 *
36 * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
37 * carry-less multiplication. More information about PCLMULQDQ can be
38 * found at:
39 * http://software.intel.com/en-us/articles/
40 * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
41 *
42 */
43
44/*
45 * ====================================================================
46 * OpenSolaris OS modifications
47 *
48 * This source originates as file galois_hash_asm.c from
49 * Intel Corporation dated September 21, 2009.
50 *
51 * This OpenSolaris version has these major changes from the original source:
52 *
53 * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
54 * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function
55 * definition for lint.
56 *
57 * 2. Formatted code, added comments, and added #includes and #defines.
58 *
59 * 3. Commented out pclmulqdq and pshufb instructions and replaced with
60 * .byte sequences (as pclmulqdq isn't supported yet by all of the gas, as,
61 * and aw assemblers).
62 *
63 * 4. If bit CR0.TS is set on entry, clear it (after the caller has called
64 * kpreempt_disable()) and set it again on exit (before the caller calls
65 * kpreempt_enable()).  If the TS bit is not set, save and restore the %xmm
66 * registers at the beginning and end of the function (%xmm* registers are
67 * not saved and restored during kernel thread preemption).
68 *
69 * 5. Removed code to perform hashing.  This is already done with C macro
70 * GHASH in gcm.c.  For better performance, this removed code should be
71 * reintegrated in the future to replace the C GHASH macro.
72 *
73 * 6. Added code to byte swap 16-byte input and output.
74 *
75 * 7. Folded in comments from the original C source with embedded assembly
76 * (SB_w_shift_xor.c)
77 *
78 * 8. Renamed function and reordered parameters to match OpenSolaris:
79 * Intel interface:
80 *	void galois_hash_asm(unsigned char *hk, unsigned char *s,
81 *		unsigned char *d, int length)
82 * OpenSolaris OS interface:
83 *	void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
84 * ====================================================================
85 */
86
87
88#if defined(lint) || defined(__lint)
89
90#include <sys/types.h>
91
/* Dummy definition seen only by lint; the real routine is the assembly below. */
92/* ARGSUSED */
93void
94gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) {
95}
96
97#else	/* lint */
98
99#include <sys/asm_linkage.h>
100#include <sys/controlregs.h>
101#ifdef _KERNEL
102#include <sys/machprivregs.h>
103#endif
104
105#ifdef _KERNEL
106	/*
107	 * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv.  That is,
108	 * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it
109	 * uses it to pass P2 to syscall.
110	 * This also occurs with the STTS macro, but we don't care if
111	 * P2 (%rsi) is modified just before function exit.
112	 * The CLTS and STTS macros push and pop P1 (%rdi) already.
	 *
	 * PROTECTED_CLTS therefore wraps CLTS in a push/pop of %rsi when
	 * __xpv is defined, and is plain CLTS otherwise.
113	 */
114#ifdef __xpv
115#define	PROTECTED_CLTS \
116	push	%rsi; \
117	CLTS; \
118	pop	%rsi
119#else
120#define	PROTECTED_CLTS \
121	CLTS
122#endif	/* __xpv */
123
124	/*
125	 * Entry half of the %xmm protection pair.  Saves %rbp, copies %rsp
126	 * into %rbp, reads %cr0 into tmpreg, and then either:
	 *   - CR0_TS clear: aligns %rsp down to XMM_ALIGN, reserves
	 *     11 * XMM_SIZE bytes, and stores %xmm0 - %xmm10 there; or
	 *   - CR0_TS set: clears it with PROTECTED_CLTS and saves nothing.
	 * tmpreg must be left unmodified until SET_TS_OR_POP_XMM_REGISTERS,
	 * which tests CR0_TS in the same register.
127	 */
128#define	CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg) \
129	push	%rbp; \
130	mov	%rsp, %rbp; \
131	movq    %cr0, tmpreg; \
132	testq	$CR0_TS, tmpreg; \
133	jnz	1f; \
134	and	$-XMM_ALIGN, %rsp; \
135	sub	$[XMM_SIZE * 11], %rsp; \
136	movaps	%xmm0, 160(%rsp); \
137	movaps	%xmm1, 144(%rsp); \
138	movaps	%xmm2, 128(%rsp); \
139	movaps	%xmm3, 112(%rsp); \
140	movaps	%xmm4, 96(%rsp); \
141	movaps	%xmm5, 80(%rsp); \
142	movaps	%xmm6, 64(%rsp); \
143	movaps	%xmm7, 48(%rsp); \
144	movaps	%xmm8, 32(%rsp); \
145	movaps	%xmm9, 16(%rsp); \
146	movaps	%xmm10, (%rsp); \
147	jmp	2f; \
1481: \
149	PROTECTED_CLTS; \
1502:
151
152
153	/*
154	 * Exit half of the %xmm protection pair.  tmpreg must still hold the
155	 * %cr0 value read by CLEAR_TS_OR_PUSH_XMM_REGISTERS.  If CR0_TS was not
	 * set there, reload %xmm0 - %xmm10 from the stack; otherwise set CR0_TS
	 * again with STTS(tmpreg).  On both paths %rsp is then restored from
	 * %rbp and %rbp is popped.
156	 */
157#define	SET_TS_OR_POP_XMM_REGISTERS(tmpreg) \
158	testq	$CR0_TS, tmpreg; \
159	jnz	1f; \
160	movaps	(%rsp), %xmm10; \
161	movaps	16(%rsp), %xmm9; \
162	movaps	32(%rsp), %xmm8; \
163	movaps	48(%rsp), %xmm7; \
164	movaps	64(%rsp), %xmm6; \
165	movaps	80(%rsp), %xmm5; \
166	movaps	96(%rsp), %xmm4; \
167	movaps	112(%rsp), %xmm3; \
168	movaps	128(%rsp), %xmm2; \
169	movaps	144(%rsp), %xmm1; \
170	movaps	160(%rsp), %xmm0; \
171	jmp	2f; \
1721: \
173	STTS(tmpreg); \
1742: \
175	mov	%rbp, %rsp; \
176	pop	%rbp
177
178
179#else
/* Non-_KERNEL build: the CR0.TS / %xmm save-restore macros expand to nothing. */
180#define	PROTECTED_CLTS
181#define	CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg)
182#define	SET_TS_OR_POP_XMM_REGISTERS(tmpreg)
183#endif	/* _KERNEL */
184
185/*
186 * Use this mask to byte-swap a 16-byte integer with the pshufb instruction.
 * The table is aligned to XMM_ALIGN; gcm_mul_pclmulqdq loads it with
 * movaps, which needs an aligned memory operand.
187 */
188
189// static uint8_t byte_swap16_mask[] = {
190//	 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
191.text
192.align XMM_ALIGN
193.Lbyte_swap16_mask:
194	.byte	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
195
196
197
198/*
199 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
200 *
201 * Perform a carry-less multiplication (that is, combine the partial products
202 * with XOR instead of addition) on P1 and P2 and place the result in P3.
203 *
204 * Byte swap the input and the output.
205 *
206 * Note: x_in, y, and res each point to a 16-byte number
207 * (an array of two 64-bit integers).
208 *
209 * Note2: For kernel code, caller is responsible for ensuring
210 * kpreempt_disable() has been called.  This is because %xmm registers are
211 * not saved/restored.  Clear and set the CR0.TS bit on entry and exit,
212 * respectively,  if TS is set on entry.  Otherwise, if TS is not set,
213 * save and restore %xmm registers on the stack.
214 *
215 * Note3: Original Intel definition:
216 * void galois_hash_asm(unsigned char *hk, unsigned char *s,
217 *	unsigned char *d, int length)
218 *
219 * Note4: Register/parameter mapping:
220 * Intel:
221 *	Parameter 1: %rcx (copied to %xmm0)	hk or x_in
222 *	Parameter 2: %rdx (copied to %xmm1)	s or y
223 *	Parameter 3: %rdi (result)		d or res
224 * OpenSolaris:
225 *	Parameter 1: %rdi (copied to %xmm0)	x_in
226 *	Parameter 2: %rsi (copied to %xmm1)	y
227 *	Parameter 3: %rdx (result)		res
228 */
229
230ENTRY_NP(gcm_mul_pclmulqdq)
	// Kernel builds: %r10 receives %cr0 here and must stay unmodified
	// until SET_TS_OR_POP_XMM_REGISTERS(%r10) at the end of the function.
231	CLEAR_TS_OR_PUSH_XMM_REGISTERS(%r10)
232
233	//
234	// Copy Parameters (movdqu: operands need not be 16-byte aligned)
235	//
236	movdqu	(%rdi), %xmm0 // P1 (x_in)
237	movdqu	(%rsi), %xmm1 // P2 (y)
238
239	//
240	// Byte swap 16-byte input.  Each pshufb/pclmulqdq below is emitted
241	// as a .byte sequence; the intended instruction is the comment above it.
242	lea	.Lbyte_swap16_mask(%rip), %rax
243	movaps	(%rax), %xmm10	// xmm10 = swap mask, reused for the result
244	//pshufb	%xmm10, %xmm0
245	.byte	0x66, 0x41, 0x0f, 0x38, 0x00, 0xc2
246	//pshufb	%xmm10, %xmm1
247	.byte	0x66, 0x41, 0x0f, 0x38, 0x00, 0xca
248
249
250	//
251	// Multiply with the hash key
252	// (a = xmm0 = a1:a0, b = xmm1 = b1:b0; four 64x64 carry-less products)
253	movdqu	%xmm0, %xmm3
254	//pclmulqdq $0, %xmm1, %xmm3	// xmm3 holds a0*b0
255	.byte	0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x00
256
257	movdqu	%xmm0, %xmm4
258	//pclmulqdq $16, %xmm1, %xmm4	// xmm4 holds a0*b1
259	.byte	0x66, 0x0f, 0x3a, 0x44, 0xe1, 0x10
260
261	movdqu	%xmm0, %xmm5
262	//pclmulqdq $1, %xmm1, %xmm5	// xmm5 holds a1*b0
263	.byte	0x66, 0x0f, 0x3a, 0x44, 0xe9, 0x01
264	movdqu	%xmm0, %xmm6
265	//pclmulqdq $17, %xmm1, %xmm6	// xmm6 holds a1*b1
266	.byte	0x66, 0x0f, 0x3a, 0x44, 0xf1, 0x11
267
268	pxor	%xmm5, %xmm4	// xmm4 holds a0*b1 + a1*b0
269
270	movdqu	%xmm4, %xmm5	// move the contents of xmm4 to xmm5
271	psrldq	$8, %xmm4	// shift xmm4 right by 64 bits (8 bytes)
272	pslldq	$8, %xmm5	// shift xmm5 left by 64 bits (8 bytes)
273	pxor	%xmm5, %xmm3	// low half of the middle term into xmm3
274	pxor	%xmm4, %xmm6	// Register pair <xmm6:xmm3> holds the result
275				// of the carry-less multiplication of
276				// xmm0 by xmm1.
277
278	// We shift the result of the multiplication by one bit position
279	// to the left to account for the fact that the bits are reversed.
	// pslld/psrld act on each 32-bit lane, so the bit shifted out of
	// every lane (kept in xmm7, xmm8) is moved up one lane and OR-ed
	// back in; xmm9 carries the top bit of xmm3 into the bottom of xmm6.
280	movdqu	%xmm3, %xmm7
281	movdqu	%xmm6, %xmm8
282	pslld	$1, %xmm3
283	pslld	$1, %xmm6
284	psrld	$31, %xmm7
285	psrld	$31, %xmm8
286	movdqu	%xmm7, %xmm9
287	pslldq	$4, %xmm8
288	pslldq	$4, %xmm7
289	psrldq	$12, %xmm9
290	por	%xmm7, %xmm3
291	por	%xmm8, %xmm6
292	por	%xmm9, %xmm6
293
294	//
295	// First phase of the reduction
296	//
297	// Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
298	// independently.
299	movdqu	%xmm3, %xmm7
300	movdqu	%xmm3, %xmm8
301	movdqu	%xmm3, %xmm9
302	pslld	$31, %xmm7	// packed (per-dword) left shift: << 31
303	pslld	$30, %xmm8	// packed (per-dword) left shift: << 30
304	pslld	$25, %xmm9	// packed (per-dword) left shift: << 25
305	pxor	%xmm8, %xmm7	// xor the shifted versions
306	pxor	%xmm9, %xmm7
307	movdqu	%xmm7, %xmm8
308	pslldq	$12, %xmm7	// lowest dword moved to the top lane
309	psrldq	$4, %xmm8	// upper three dwords, used in second phase
310	pxor	%xmm7, %xmm3	// first phase of the reduction complete
311
312	//
313	// Second phase of the reduction
314	//
315	// Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
316	// shift operations.
317	movdqu	%xmm3, %xmm2	// copy for the >> 1 shift
318	movdqu	%xmm3, %xmm4	// copy for the >> 2 shift
319	movdqu	%xmm3, %xmm5	// copy for the >> 7 shift
320	psrld	$1, %xmm2	// packed (per-dword) right shift: >> 1
321	psrld	$2, %xmm4	// packed (per-dword) right shift: >> 2
322	psrld	$7, %xmm5	// packed (per-dword) right shift: >> 7
323	pxor	%xmm4, %xmm2	// xor the shifted versions
324	pxor	%xmm5, %xmm2
325	pxor	%xmm8, %xmm2	// fold in the bits saved in xmm8 by phase one
326	pxor	%xmm2, %xmm3
327	pxor	%xmm3, %xmm6	// the result is in xmm6
328
329	//
330	// Byte swap 16-byte result
331	//
332	//pshufb	%xmm10, %xmm6	// %xmm10 has the swap mask
333	.byte	0x66, 0x41, 0x0f, 0x38, 0x00, 0xf2
334
335	//
336	// Store the result (movdqu: res need not be 16-byte aligned)
337	//
338	movdqu	%xmm6, (%rdx) // P3
339
340
341	//
342	// Cleanup and Return
343	//
344	SET_TS_OR_POP_XMM_REGISTERS(%r10)
345	ret
346	SET_SIZE(gcm_mul_pclmulqdq)
347
348#endif  /* lint || __lint */
349