xref: /titanic_44/usr/src/uts/intel/ia32/ml/sseblk.s (revision 749f21d359d8fbd020c974a1a5227316221bfc9c)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma	ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/asm_linkage.h>
#include <sys/regset.h>
#include <sys/privregs.h>

#if defined(__lint)
#include <sys/types.h>
#include <sys/archsystm.h>
#else
#include "assym.h"
#endif

/*
 * Do block operations using Streaming SIMD extensions
 */

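/*
 * This file provides hwblkclr(), hwblkpagecopy() and hat_pte_zero(), which
 * zero or copy blocks of memory using non-temporal (streaming) SSE stores
 * that avoid polluting the caches.  Callers are expected to run with kernel
 * preemption disabled; each routine clears CR0.TS with clts so the SIMD
 * instructions do not fault, saves and restores the %xmm registers it uses
 * only when CR0.TS was already clear (i.e. the caller's FPU state is live),
 * and restores the original %cr0 value before returning.
 */

/*
 * ASSERT_KPREEMPT_DISABLED(t, r32, msg) is the DEBUG-only sanity check used
 * below: it loads curthread into t, reads t->t_preempt into r32, and panics
 * with msg if the count is zero (i.e. preemption was not disabled).  On
 * non-DEBUG kernels it expands to nothing.
 */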
#if defined(DEBUG)
#if defined(__amd64)
#define	ASSERT_KPREEMPT_DISABLED(t, r32, msg)	\
	movq	%gs:CPU_THREAD, t;		\
	movsbl	T_PREEMPT(t), r32;		\
	testl	r32, r32;			\
	jne	5f;				\
	pushq	%rbp;				\
	movq	%rsp, %rbp;			\
	leaq	msg(%rip), %rdi;		\
	xorl	%eax, %eax;			\
	call	panic;				\
5:
#elif defined(__i386)
#define	ASSERT_KPREEMPT_DISABLED(t, r32, msg)	\
	movl	%gs:CPU_THREAD, t;		\
	movsbl	T_PREEMPT(t), r32;		\
	testl	r32, r32;			\
	jne	5f;				\
	pushl	%ebp;				\
	movl	%esp, %ebp;			\
	pushl	$msg;				\
	call	panic;				\
5:
#endif	/* __i386 */
#else	/* DEBUG */
#define	ASSERT_KPREEMPT_DISABLED(t, r32, msg)
#endif	/* DEBUG */

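/*
 * All block operations below work on BLOCKSIZE (64-byte) units; BLOCKMASK
 * is used to check both the alignment of the address and that the length
 * is a whole number of blocks, and BLOCKSHIFT converts a byte count into
 * a block count.
 */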
#define	BLOCKSHIFT	6
#define	BLOCKSIZE	64	/* (1 << BLOCKSHIFT) */
#define	BLOCKMASK	63	/* (BLOCKSIZE - 1) */

#if (1 << BLOCKSHIFT) != BLOCKSIZE || BLOCKMASK != (BLOCKSIZE - 1)
#error	"mucked up constants"
#endif

#if defined(__lint)

/*ARGSUSED*/
void
hwblkclr(void *addr, size_t size)
{}

#else	/* __lint */

#if defined(__amd64)
#define	ADD	addq
#define	SUB	subq
#else
#define	ADD	addl
#define	SUB	subl
#endif

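/*
 * Zero-loop primitives for hwblkclr(): ZERO_LOOP_INIT_XMM clears %xmm0,
 * ZERO_LOOP_BODY_XMM writes one 64-byte block with four movntdq
 * (non-temporal) stores and decrements the block count, and
 * ZERO_LOOP_FINI_XMM issues the mfence needed to make the weakly-ordered
 * streaming stores globally visible.  SAVE_XMM0/RSTOR_XMM0 spill and
 * reload the one %xmm register the loop uses, for the case where the
 * caller's FPU state is live.
 */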
#define	SAVE_XMM0(r)				\
	SAVE_XMM_PROLOG(r, 1);			\
	movdqa	%xmm0, (r)

#define	ZERO_LOOP_INIT_XMM(dst)			\
	pxor	%xmm0, %xmm0

#define	ZERO_LOOP_BODY_XMM(dst, cnt)		\
	movntdq	%xmm0, (dst);			\
	movntdq	%xmm0, 0x10(dst);		\
	movntdq	%xmm0, 0x20(dst);		\
	movntdq	%xmm0, 0x30(dst);		\
	ADD	$BLOCKSIZE, dst;		\
	SUB	$1, cnt

#define	ZERO_LOOP_FINI_XMM(dst)			\
	mfence

#define	RSTOR_XMM0(r)				\
	movdqa	0x0(r), %xmm0;			\
	RSTOR_XMM_EPILOG(r, 1)

#if defined(__amd64)

	/*
	 * %rdi		dst
	 * %rsi		size
	 * %rax		saved %cr0 (#if DEBUG then %eax is t->t_preempt)
	 * %r8		pointer to %xmm register save area
	 */
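	/*
	 * For reference, the flow below corresponds roughly to this C-level
	 * pseudocode (illustrative only, not part of the build; save_xmm0()
	 * and rstor_xmm0() stand for the SAVE_XMM0/RSTOR_XMM0 macros above):
	 *
	 *	hwblkclr(addr, size)
	 *	{
	 *		if (addr is not 64-byte aligned || size < 64 ||
	 *		    (size & BLOCKMASK) != 0)
	 *			return (bzero(addr, size));
	 *		cr0 = %cr0;
	 *		clts();
	 *		if ((cr0 & CR0_TS) == 0)
	 *			save_xmm0();
	 *		for (cnt = size >> BLOCKSHIFT; cnt != 0; cnt--)
	 *			write one 64-byte block with movntdq;
	 *		mfence();
	 *		if ((cr0 & CR0_TS) == 0)
	 *			rstor_xmm0();
	 *		%cr0 = cr0;
	 *	}
	 */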
	ENTRY(hwblkclr)
	pushq	%rbp
	movq	%rsp, %rbp
	testl	$BLOCKMASK, %edi	/* address must be BLOCKSIZE aligned */
	jne	.dobzero
	cmpq	$BLOCKSIZE, %rsi	/* size must be at least BLOCKSIZE */
	jl	.dobzero
	testq	$BLOCKMASK, %rsi	/* .. and be a multiple of BLOCKSIZE */
	jne	.dobzero
	shrq	$BLOCKSHIFT, %rsi

	ASSERT_KPREEMPT_DISABLED(%r11, %eax, .not_disabled)
	movq	%cr0, %rax
	clts
	testl	$CR0_TS, %eax
	jnz	1f

	SAVE_XMM0(%r8)
1:	ZERO_LOOP_INIT_XMM(%rdi)
9:	ZERO_LOOP_BODY_XMM(%rdi, %rsi)
	jnz	9b
	ZERO_LOOP_FINI_XMM(%rdi)

	testl	$CR0_TS, %eax
	jnz	2f
	RSTOR_XMM0(%r8)
2:	movq	%rax, %cr0
	leave
	ret
.dobzero:
	leave
	jmp	bzero
	SET_SIZE(hwblkclr)

#elif defined(__i386)

	/*
	 * %eax		dst
	 * %ecx		size in bytes, loop count
	 * %ebx		saved %cr0 (#if DEBUG then t->t_preempt)
	 * %edi		pointer to %xmm register save area
	 */
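	/*
	 * The i386 version takes its arguments off the stack, and the
	 * fallback path simply tail-jumps to bzero() with the caller's
	 * arguments still in place.  Callee-saved registers are spilled
	 * lazily: %ebx once the SSE path is committed, %esi only around
	 * the DEBUG assertion, and %edi only on the path where CR0.TS was
	 * clear and %xmm0 has to be preserved.
	 */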
	ENTRY(hwblkclr)
	movl	4(%esp), %eax
	movl	8(%esp), %ecx
	testl	$BLOCKMASK, %eax	/* address must be BLOCKSIZE aligned */
	jne	.dobzero
	cmpl	$BLOCKSIZE, %ecx	/* size must be at least BLOCKSIZE */
	jl	.dobzero
	testl	$BLOCKMASK, %ecx	/* .. and be a multiple of BLOCKSIZE */
	jne	.dobzero
	shrl	$BLOCKSHIFT, %ecx
	movl	0xc(%esp), %edx
	pushl	%ebx

	pushl	%esi
	ASSERT_KPREEMPT_DISABLED(%esi, %ebx, .not_disabled)
	popl	%esi
	movl	%cr0, %ebx
	clts
	testl	$CR0_TS, %ebx
	jnz	1f

	pushl	%edi
	SAVE_XMM0(%edi)
1:	ZERO_LOOP_INIT_XMM(%eax)
9:	ZERO_LOOP_BODY_XMM(%eax, %ecx)
	jnz	9b
	ZERO_LOOP_FINI_XMM(%eax)

	testl	$CR0_TS, %ebx
	jnz	2f
	RSTOR_XMM0(%edi)
	popl	%edi
2:	movl	%ebx, %cr0
	popl	%ebx
	ret
.dobzero:
	jmp	bzero
	SET_SIZE(hwblkclr)

#endif	/* __i386 */
#endif	/* __lint */


#if defined(__lint)

/*ARGSUSED*/
void
hwblkpagecopy(const void *src, void *dst)
{}

#else	/* __lint */

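/*
 * Copy-loop primitives for hwblkpagecopy().  PREFETCH_START primes the
 * first two cache lines of the source.  The loop itself is software
 * pipelined: COPY_LOOP_INIT_XMM loads the first 128 bytes into
 * %xmm0-%xmm7, each COPY_LOOP_BODY_XMM iteration streams the previously
 * loaded 128 bytes to the destination with movntdq while prefetching and
 * loading the next 128 bytes from the source, and COPY_LOOP_FINI_XMM
 * writes out the final 128 bytes.  The prefetchnta hint is used to
 * minimize cache pollution from the source reads, and the non-temporal
 * stores keep the destination page from displacing useful cache contents.
 * SAVE_XMMS/RSTOR_XMMS spill and reload all eight %xmm registers when the
 * caller's FPU state is live.
 */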
#define	PREFETCH_START(src)			\
	prefetchnta	0x0(src);		\
	prefetchnta	0x40(src)

#define	SAVE_XMMS(r)				\
	SAVE_XMM_PROLOG(r, 8);			\
	movdqa	%xmm0, (r);			\
	movdqa	%xmm1, 0x10(r);			\
	movdqa	%xmm2, 0x20(r);			\
	movdqa	%xmm3, 0x30(r);			\
	movdqa	%xmm4, 0x40(r);			\
	movdqa	%xmm5, 0x50(r);			\
	movdqa	%xmm6, 0x60(r);			\
	movdqa	%xmm7, 0x70(r)

#define	COPY_LOOP_INIT_XMM(src)			\
	prefetchnta	0x80(src);		\
	prefetchnta	0xc0(src);		\
	movdqa	0x0(src), %xmm0;		\
	movdqa	0x10(src), %xmm1;		\
	movdqa	0x20(src), %xmm2;		\
	movdqa	0x30(src), %xmm3;		\
	movdqa	0x40(src), %xmm4;		\
	movdqa	0x50(src), %xmm5;		\
	movdqa	0x60(src), %xmm6;		\
	movdqa	0x70(src), %xmm7;		\
	ADD	$0x80, src

#define	COPY_LOOP_BODY_XMM(src, dst, cnt)	\
	prefetchnta	0x80(src);		\
	prefetchnta	0xc0(src);		\
	prefetchnta	0x100(src);		\
	prefetchnta	0x140(src);		\
	movntdq	%xmm0, (dst);			\
	movntdq	%xmm1, 0x10(dst);		\
	movntdq	%xmm2, 0x20(dst);		\
	movntdq	%xmm3, 0x30(dst);		\
	movdqa	0x0(src), %xmm0;		\
	movdqa	0x10(src), %xmm1;		\
	movntdq	%xmm4, 0x40(dst);		\
	movntdq	%xmm5, 0x50(dst);		\
	movdqa	0x20(src), %xmm2;		\
	movdqa	0x30(src), %xmm3;		\
	movntdq	%xmm6, 0x60(dst);		\
	movntdq	%xmm7, 0x70(dst);		\
	movdqa	0x40(src), %xmm4;		\
	movdqa	0x50(src), %xmm5;		\
	ADD	$0x80, dst;			\
	movdqa	0x60(src), %xmm6;		\
	movdqa	0x70(src), %xmm7;		\
	ADD	$0x80, src;			\
	subl	$1, cnt

#define	COPY_LOOP_FINI_XMM(dst)			\
	movntdq	%xmm0, 0x0(dst);		\
	movntdq	%xmm1, 0x10(dst);		\
	movntdq	%xmm2, 0x20(dst);		\
	movntdq	%xmm3, 0x30(dst);		\
	movntdq	%xmm4, 0x40(dst);		\
	movntdq	%xmm5, 0x50(dst);		\
	movntdq	%xmm6, 0x60(dst);		\
	movntdq	%xmm7, 0x70(dst)

#define	RSTOR_XMMS(r)				\
	movdqa	0x0(r), %xmm0;			\
	movdqa	0x10(r), %xmm1;			\
	movdqa	0x20(r), %xmm2;			\
	movdqa	0x30(r), %xmm3;			\
	movdqa	0x40(r), %xmm4;			\
	movdqa	0x50(r), %xmm5;			\
	movdqa	0x60(r), %xmm6;			\
	movdqa	0x70(r), %xmm7;			\
	RSTOR_XMM_EPILOG(r, 8)

#if defined(__amd64)

	/*
	 * %rdi		src
	 * %rsi		dst
	 * %rdx		#if DEBUG then curthread
	 * %ecx		loop count
	 * %rax		saved %cr0 (#if DEBUG then %eax is t->t_preempt)
	 * %r8		pointer to %xmm register save area
	 */
	ENTRY(hwblkpagecopy)
	pushq	%rbp
	movq	%rsp, %rbp
	PREFETCH_START(%rdi)
	/*
	 * PAGESIZE is 4096, each loop moves 128 bytes, but the initial
	 * load and final store save us one loop count
	 */
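	/*
	 * 4096 / 128 = 32 iterations in total; COPY_LOOP_INIT_XMM does the
	 * first load and COPY_LOOP_FINI_XMM the last store, so the body only
	 * has to run 32 - 1 = 31 times.
	 */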
	movl	$_CONST(32 - 1), %ecx
	ASSERT_KPREEMPT_DISABLED(%rdx, %eax, .not_disabled)
	movq	%cr0, %rax
	clts
	testl	$CR0_TS, %eax
	jnz	3f
	SAVE_XMMS(%r8)
3:	COPY_LOOP_INIT_XMM(%rdi)
4:	COPY_LOOP_BODY_XMM(%rdi, %rsi, %ecx)
	jnz	4b
	COPY_LOOP_FINI_XMM(%rsi)
	testl	$CR0_TS, %eax
	jnz	5f
	RSTOR_XMMS(%r8)
5:	movq	%rax, %cr0
	mfence
	leave
	ret
	SET_SIZE(hwblkpagecopy)

#elif defined(__i386)

	/*
	 * %eax		src
	 * %edx		dst
	 * %ecx		loop count
	 * %ebx		saved %cr0 (#if DEBUG then t->t_preempt)
	 * %edi		pointer to %xmm register save area
	 * %esi		#if DEBUG temporary thread pointer
	 */
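	/*
	 * On i386 only %xmm0-%xmm7 exist and the copy loop uses all of
	 * them, so SAVE_XMMS/RSTOR_XMMS must spill the entire set whenever
	 * CR0.TS was clear on entry.  As in hwblkclr above, %esi is
	 * borrowed only for the DEBUG assertion and %edi only as the
	 * pointer to the %xmm save area.
	 */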
	ENTRY(hwblkpagecopy)
	movl	4(%esp), %eax
	movl	8(%esp), %edx
	PREFETCH_START(%eax)
	pushl	%ebx
	/*
	 * PAGESIZE is 4096, each loop moves 128 bytes, but the initial
	 * load and final store save us one loop count
	 */
	movl	$_CONST(32 - 1), %ecx
	pushl	%esi
	ASSERT_KPREEMPT_DISABLED(%esi, %ebx, .not_disabled)
	popl	%esi
	movl	%cr0, %ebx
	clts
	testl	$CR0_TS, %ebx
	jnz	3f
	pushl	%edi
	SAVE_XMMS(%edi)
3:	COPY_LOOP_INIT_XMM(%eax)
4:	COPY_LOOP_BODY_XMM(%eax, %edx, %ecx)
	jnz	4b
	COPY_LOOP_FINI_XMM(%edx)
	testl	$CR0_TS, %ebx
	jnz	5f
	RSTOR_XMMS(%edi)
	popl	%edi
5:	movl	%ebx, %cr0
	popl	%ebx
	mfence
	ret
	SET_SIZE(hwblkpagecopy)

#endif	/* __i386 */
#endif	/* __lint */


#if defined(__lint)

/*ARGSUSED*/
void
hat_pte_zero(void *dst, size_t len)
{}

#else

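/*
 * hat_pte_zero() zeroes len bytes of page table entries with non-temporal
 * movnti stores, 8 bytes at a time on amd64 and 4 bytes at a time on i386,
 * so len must be a non-zero multiple of the store size; the final mfence
 * makes the streaming stores visible before the caller proceeds.  Roughly
 * (illustrative pseudocode only, not part of the build):
 *
 *	hat_pte_zero(dst, len)
 *	{
 *		while (len != 0) {
 *			movnti(dst, 0);
 *			dst += stepsize;
 *			len -= stepsize;
 *		}
 *		mfence();
 *	}
 */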
#if defined(__amd64)

	ENTRY(hat_pte_zero)
	xorl	%eax, %eax
1:
	movnti	%rax, (%rdi)
	addq	$8, %rdi
	subq	$8, %rsi
	jnz	1b
	mfence
	ret
	SET_SIZE(hat_pte_zero)

#elif defined(__i386)

	ENTRY(hat_pte_zero)
	xorl	%eax, %eax
	movl	4(%esp), %edx
	movl	8(%esp), %ecx
1:
	movnti	%eax, (%edx)
	addl	$4, %edx
	subl	$4, %ecx
	jnz	1b
	mfence
	ret
	SET_SIZE(hat_pte_zero)

#endif	/* __i386 */

#endif	/* __lint */

#if defined(DEBUG) && !defined(__lint)
	.text
.not_disabled:
	.string	"sseblk: preemption not disabled!"
#endif
427