/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 2019 Joyent, Inc.
 */

#include <sys/asm_linkage.h>
#include <sys/regset.h>
#include <sys/privregs.h>

#include "assym.h"

/*
 * Do block operations using Streaming SIMD extensions
 */

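/*
 * These routines rely on non-temporal (streaming) stores -- movntdq and
 * movnti -- which push data through the write-combining buffers straight
 * to memory, bypassing the caches.  That avoids evicting useful cache
 * lines when clearing or copying whole blocks and pages, but it also
 * means each routine must issue an mfence before returning so that the
 * streaming stores are globally visible to other observers.
 */
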
#if defined(DEBUG)
#define	ASSERT_KPREEMPT_DISABLED(t, r32, msg)	\
	movq	%gs:CPU_THREAD, t;		\
	movsbl	T_PREEMPT(t), r32;		\
	testl	r32, r32;			\
	jne	5f;				\
	pushq	%rbp;				\
	movq	%rsp, %rbp;			\
	leaq	msg(%rip), %rdi;		\
	xorl	%eax, %eax;			\
	call	panic;				\
5:
#else	/* DEBUG */
#define	ASSERT_KPREEMPT_DISABLED(t, r32, msg)
#endif	/* DEBUG */
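
/*
 * In DEBUG kernels, ASSERT_KPREEMPT_DISABLED above amounts to roughly
 * this C (an illustrative sketch, not generated code):
 *
 *	if (curthread->t_preempt == 0)
 *		panic(msg);
 *
 * These routines borrow the %xmm registers without a full FPU context
 * switch, so the caller must keep kernel preemption disabled for the
 * duration.
 */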

#define	BLOCKSHIFT	6
#define	BLOCKSIZE	64	/* (1 << BLOCKSHIFT) */
#define	BLOCKMASK	63	/* (BLOCKSIZE - 1) */

#if (1 << BLOCKSHIFT) != BLOCKSIZE || BLOCKMASK != (BLOCKSIZE - 1)
#error	"mucked up constants"
#endif

#define	SAVE_XMM0(r)				\
	SAVE_XMM_PROLOG(r, 1);			\
	movdqa	%xmm0, (r)

#define	ZERO_LOOP_INIT_XMM(dst)			\
	pxor	%xmm0, %xmm0

#define	ZERO_LOOP_BODY_XMM(dst, cnt)		\
	movntdq	%xmm0, (dst);			\
	movntdq	%xmm0, 0x10(dst);		\
	movntdq	%xmm0, 0x20(dst);		\
	movntdq	%xmm0, 0x30(dst);		\
	addq	$BLOCKSIZE, dst;		\
	subq	$1, cnt

#define	ZERO_LOOP_FINI_XMM(dst)			\
	mfence

#define	RSTOR_XMM0(r)				\
	movdqa	0x0(r), %xmm0;			\
	RSTOR_XMM_EPILOG(r, 1)

	/*
	 * %rdi		dst
	 * %rsi		size
	 * %rax		saved %cr0 (#if DEBUG then %eax is t->t_preempt)
	 * %r8		pointer to %xmm register save area
	 */
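
/*
 * hwblkclr zeroes a BLOCKSIZE-aligned region whose size is a nonzero
 * multiple of BLOCKSIZE; anything else is punted to bzero().  The
 * assumed C prototype, matching the register usage above, is roughly
 * void hwblkclr(void *addr, size_t size).
 */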
	ENTRY(hwblkclr)
	pushq	%rbp
	movq	%rsp, %rbp
	testl	$BLOCKMASK, %edi	/* address must be BLOCKSIZE aligned */
	jne	.dobzero
	cmpq	$BLOCKSIZE, %rsi	/* size must be at least BLOCKSIZE */
	jl	.dobzero
	testq	$BLOCKMASK, %rsi	/* .. and be a multiple of BLOCKSIZE */
	jne	.dobzero
	shrq	$BLOCKSHIFT, %rsi

	ASSERT_KPREEMPT_DISABLED(%r11, %eax, .not_disabled)
	movq	%cr0, %rax
	clts
	testl	$CR0_TS, %eax
	jnz	1f

	SAVE_XMM0(%r8)
1:	ZERO_LOOP_INIT_XMM(%rdi)
9:	ZERO_LOOP_BODY_XMM(%rdi, %rsi)
	jnz	9b
	ZERO_LOOP_FINI_XMM(%rdi)

	testl	$CR0_TS, %eax
	jnz	2f
	RSTOR_XMM0(%r8)
2:	movq	%rax, %cr0
	leave
	ret
.dobzero:
	leave
	jmp	bzero
	SET_SIZE(hwblkclr)
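
/*
 * A rough C sketch of hwblkclr's flow; the helper names here are
 * illustrative stand-ins for the corresponding instructions and macros
 * (clts, SAVE_XMM0, RSTOR_XMM0, mfence), not real kernel functions:
 *
 *	void
 *	hwblkclr(void *addr, size_t size)
 *	{
 *		ulong_t cr0;
 *
 *		if (((uintptr_t)addr & BLOCKMASK) != 0 ||
 *		    size < BLOCKSIZE || (size & BLOCKMASK) != 0) {
 *			bzero(addr, size);
 *			return;
 *		}
 *		cr0 = getcr0();
 *		clts();
 *		if (!(cr0 & CR0_TS))
 *			save_xmm0();
 *		zero_blocks_with_movntdq(addr, size >> BLOCKSHIFT);
 *		mfence();
 *		if (!(cr0 & CR0_TS))
 *			restore_xmm0();
 *		setcr0(cr0);
 *	}
 *
 * %xmm0 is saved only when CR0_TS was clear on entry: if TS was set,
 * no thread's FPU state is live in the registers and there is nothing
 * to preserve.
 */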


#define	PREFETCH_START(src)			\
	prefetchnta	0x0(src);		\
	prefetchnta	0x40(src)

#define	SAVE_XMMS(r)				\
	SAVE_XMM_PROLOG(r, 8);			\
	movdqa	%xmm0, (r);			\
	movdqa	%xmm1, 0x10(r);			\
	movdqa	%xmm2, 0x20(r);			\
	movdqa	%xmm3, 0x30(r);			\
	movdqa	%xmm4, 0x40(r);			\
	movdqa	%xmm5, 0x50(r);			\
	movdqa	%xmm6, 0x60(r);			\
	movdqa	%xmm7, 0x70(r)

#define	COPY_LOOP_INIT_XMM(src)			\
	prefetchnta	0x80(src);		\
	prefetchnta	0xc0(src);		\
	movdqa	0x0(src), %xmm0;		\
	movdqa	0x10(src), %xmm1;		\
	movdqa	0x20(src), %xmm2;		\
	movdqa	0x30(src), %xmm3;		\
	movdqa	0x40(src), %xmm4;		\
	movdqa	0x50(src), %xmm5;		\
	movdqa	0x60(src), %xmm6;		\
	movdqa	0x70(src), %xmm7;		\
	addq	$0x80, src

#define	COPY_LOOP_BODY_XMM(src, dst, cnt)	\
	prefetchnta	0x80(src);		\
	prefetchnta	0xc0(src);		\
	prefetchnta	0x100(src);		\
	prefetchnta	0x140(src);		\
	movntdq	%xmm0, (dst);			\
	movntdq	%xmm1, 0x10(dst);		\
	movntdq	%xmm2, 0x20(dst);		\
	movntdq	%xmm3, 0x30(dst);		\
	movdqa	0x0(src), %xmm0;		\
	movdqa	0x10(src), %xmm1;		\
	movntdq	%xmm4, 0x40(dst);		\
	movntdq	%xmm5, 0x50(dst);		\
	movdqa	0x20(src), %xmm2;		\
	movdqa	0x30(src), %xmm3;		\
	movntdq	%xmm6, 0x60(dst);		\
	movntdq	%xmm7, 0x70(dst);		\
	movdqa	0x40(src), %xmm4;		\
	movdqa	0x50(src), %xmm5;		\
	addq	$0x80, dst;			\
	movdqa	0x60(src), %xmm6;		\
	movdqa	0x70(src), %xmm7;		\
	addq	$0x80, src;			\
	subl	$1, cnt
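
/*
 * COPY_LOOP_BODY_XMM is software-pipelined: each iteration streams out
 * the 128 bytes loaded on the previous iteration (movntdq) while
 * prefetching and loading the next 128 bytes (movdqa), overlapping
 * store and load latency.  COPY_LOOP_INIT_XMM primes the pipeline with
 * the first 128 bytes of the page, and COPY_LOOP_FINI_XMM drains it by
 * storing the last 128.
 */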

#define	COPY_LOOP_FINI_XMM(dst)			\
	movntdq	%xmm0, 0x0(dst);		\
	movntdq	%xmm1, 0x10(dst);		\
	movntdq	%xmm2, 0x20(dst);		\
	movntdq	%xmm3, 0x30(dst);		\
	movntdq	%xmm4, 0x40(dst);		\
	movntdq	%xmm5, 0x50(dst);		\
	movntdq	%xmm6, 0x60(dst);		\
	movntdq	%xmm7, 0x70(dst)

#define	RSTOR_XMMS(r)				\
	movdqa	0x0(r), %xmm0;			\
	movdqa	0x10(r), %xmm1;			\
	movdqa	0x20(r), %xmm2;			\
	movdqa	0x30(r), %xmm3;			\
	movdqa	0x40(r), %xmm4;			\
	movdqa	0x50(r), %xmm5;			\
	movdqa	0x60(r), %xmm6;			\
	movdqa	0x70(r), %xmm7;			\
	RSTOR_XMM_EPILOG(r, 8)

	/*
	 * %rdi		src
	 * %rsi		dst
	 * %rdx		#if DEBUG then curthread
	 * %ecx		loop count
	 * %rax		saved %cr0 (#if DEBUG then %eax is t->t_preempt)
	 * %r8		pointer to %xmm register save area
	 */
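
/*
 * hwblkpagecopy copies one full page using streaming stores.  The
 * assumed C prototype, matching the register usage above, is roughly
 * void hwblkpagecopy(const void *src, void *dst) -- note that the
 * source comes first.
 */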
	ENTRY(hwblkpagecopy)
	pushq	%rbp
	movq	%rsp, %rbp
	PREFETCH_START(%rdi)
	/*
	 * PAGESIZE is 4096 and each loop iteration moves 128 bytes, so a
	 * page is 32 chunks; the initial load (COPY_LOOP_INIT_XMM) and
	 * final store (COPY_LOOP_FINI_XMM) account for one chunk between
	 * them, leaving 31 trips through the loop body.
	 */
	movl	$_CONST(32 - 1), %ecx
	ASSERT_KPREEMPT_DISABLED(%rdx, %eax, .not_disabled)
	movq	%cr0, %rax
	clts
	testl	$CR0_TS, %eax
	jnz	3f
	SAVE_XMMS(%r8)
3:	COPY_LOOP_INIT_XMM(%rdi)
4:	COPY_LOOP_BODY_XMM(%rdi, %rsi, %ecx)
	jnz	4b
	COPY_LOOP_FINI_XMM(%rsi)
	testl	$CR0_TS, %eax
	jnz	5f
	RSTOR_XMMS(%r8)
5:	movq	%rax, %cr0
	mfence
	leave
	ret
	SET_SIZE(hwblkpagecopy)
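
/*
 * As in hwblkclr, the %xmm registers are only saved and restored when
 * %cr0.TS was clear on entry, i.e. when some thread's FPU state is
 * actually live in them; here all eight of %xmm0-%xmm7 are in play, so
 * SAVE_XMMS/RSTOR_XMMS spill the full set.
 *
 * The _no_xmm variants below use movnti -- a non-temporal 8-byte store
 * from a general-purpose register -- so they never touch the %xmm
 * registers or %cr0, making them usable from contexts where borrowing
 * the FPU state is not an option.
 */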

	ENTRY(block_zero_no_xmm)
	pushq	%rbp
	movq	%rsp, %rbp
	xorl	%eax, %eax
	addq	%rsi, %rdi
	negq	%rsi
1:
	movnti	%rax, (%rdi, %rsi)
	movnti	%rax, 8(%rdi, %rsi)
	movnti	%rax, 16(%rdi, %rsi)
	movnti	%rax, 24(%rdi, %rsi)
	addq	$32, %rsi
	jnz	1b
	mfence
	leave
	ret
	SET_SIZE(block_zero_no_xmm)
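
/*
 * block_zero_no_xmm above (and page_copy_no_xmm below) share a loop
 * idiom: the base pointer is advanced to the end of the buffer and the
 * count negated, so the addq that steps the index also sets ZF exactly
 * when the buffer is exhausted.  Roughly, in C, assuming the usual
 * block_zero_no_xmm(void *dst, int len) prototype with len a nonzero
 * multiple of 32 (the non-temporal stores and trailing mfence are
 * elided here):
 *
 *	void
 *	block_zero_no_xmm(void *dst, int len)
 *	{
 *		char *end = (char *)dst + len;
 *		long off;
 *
 *		for (off = -(long)len; off != 0; off += 32) {
 *			*(uint64_t *)(end + off) = 0;
 *			*(uint64_t *)(end + off + 8) = 0;
 *			*(uint64_t *)(end + off + 16) = 0;
 *			*(uint64_t *)(end + off + 24) = 0;
 *		}
 *	}
 */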


	ENTRY(page_copy_no_xmm)
	movq	$MMU_STD_PAGESIZE, %rcx
	addq	%rcx, %rdi
	addq	%rcx, %rsi
	negq	%rcx
1:
	movq	(%rsi, %rcx), %rax
	movnti	%rax, (%rdi, %rcx)
	movq	8(%rsi, %rcx), %rax
	movnti	%rax, 8(%rdi, %rcx)
	movq	16(%rsi, %rcx), %rax
	movnti	%rax, 16(%rdi, %rcx)
	movq	24(%rsi, %rcx), %rax
	movnti	%rax, 24(%rdi, %rcx)
	addq	$32, %rcx
	jnz	1b
	mfence
	ret
	SET_SIZE(page_copy_no_xmm)

#if defined(DEBUG)
	.text
.not_disabled:
	.string	"sseblk: preemption not disabled!"
#endif