xref: /titanic_51/usr/src/uts/intel/ia32/ml/sseblk.s (revision 261a51afbf7133d9f7c89f1388050677f56b7d1a)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma	ident	"%Z%%M%	%I%	%E% SMI"
27
28#include <sys/asm_linkage.h>
29#include <sys/regset.h>
30#include <sys/privregs.h>
31
32#if defined(__lint)
33#include <sys/types.h>
34#include <sys/archsystm.h>
35#else
36#include "assym.h"
37#endif
38
39/*
40 * Do block operations using Streaming SIMD extensions
41 */
42
/*
 * ASSERT_KPREEMPT_DISABLED(t, r32, msg)
 *
 * DEBUG-only check.  Loads the current thread pointer from
 * %gs:CPU_THREAD into `t', sign-extends the byte at T_PREEMPT(t)
 * (t->t_preempt) into `r32', and calls panic(msg) when that value is
 * zero.  `t', `r32' and the flags are clobbered.  Falls through to local
 * label 5 when the check passes.
 *
 * In non-DEBUG builds the macro expands to nothing.
 */
#if !defined(DEBUG)
#define	ASSERT_KPREEMPT_DISABLED(t, r32, msg)
#elif defined(__amd64)
#define	ASSERT_KPREEMPT_DISABLED(t, r32, msg)	\
	movq	%gs:CPU_THREAD, t;		\
	movsbl	T_PREEMPT(t), r32;		\
	testl	r32, r32;			\
	jnz	5f;				\
	pushq	%rbp;				\
	movq	%rsp, %rbp;			\
	leaq	msg(%rip), %rdi;		\
	xorl	%eax, %eax;			\
	call	panic;				\
5:
#elif defined(__i386)
#define	ASSERT_KPREEMPT_DISABLED(t, r32, msg)	\
	movl	%gs:CPU_THREAD, t;		\
	movsbl	T_PREEMPT(t), r32;		\
	testl	r32, r32;			\
	jnz	5f;				\
	pushl	%ebp;				\
	movl	%esp, %ebp;			\
	pushl	$msg;				\
	call	panic;				\
5:
#endif	/* !DEBUG */
71
/*
 * Geometry of the unit cleared per iteration by hwblkclr().  The three
 * values are spelled out as literals; the preprocessor check below
 * refuses to build if they ever stop agreeing with each other.
 */
#define	BLOCKSHIFT	6
#define	BLOCKSIZE	64	/* (1 << BLOCKSHIFT) */
#define	BLOCKMASK	63	/* (BLOCKSIZE - 1) */

#if (1 << BLOCKSHIFT) != BLOCKSIZE
#error	"mucked up constants"
#elif BLOCKMASK != (BLOCKSIZE - 1)
#error	"mucked up constants"
#endif
79
80#if defined(__lint)
81
82/*ARGSUSED*/
83void
84hwblkclr(void *addr, size_t size)
85{}
86
87#else	/* __lint */
88
/*
 * Pointer-width add/subtract mnemonics, so the loop macros below can be
 * shared between the 32-bit and 64-bit builds.
 */
#if !defined(__amd64)
#define	ADD	addl
#define	SUB	subl
#else
#define	ADD	addq
#define	SUB	subq
#endif
96
/*
 * SAVE_XMM0(r): run SAVE_XMM_PROLOG(r, 1) to obtain a save area for one
 * register addressed by `r', then store %xmm0 there.
 */
#define	SAVE_XMM0(r)				\
	SAVE_XMM_PROLOG(r, 1);			\
	movdqa	%xmm0, 0x0(r)

/*
 * ZERO_LOOP_INIT_XMM(dst): clear %xmm0.  `dst' is accepted but unused.
 */
#define	ZERO_LOOP_INIT_XMM(dst)			\
	pxor	%xmm0, %xmm0

/*
 * ZERO_LOOP_BODY_XMM(dst, cnt): write one BLOCKSIZE (4 x 16 byte) block
 * of %xmm0 to `dst' with movntdq, advance `dst' by BLOCKSIZE and
 * decrement `cnt'.  The flags on exit are those of the final SUB, so the
 * caller can loop with jnz.
 */
#define	ZERO_LOOP_BODY_XMM(dst, cnt)		\
	movntdq	%xmm0, 0x0(dst);		\
	movntdq	%xmm0, 0x10(dst);		\
	movntdq	%xmm0, 0x20(dst);		\
	movntdq	%xmm0, 0x30(dst);		\
	ADD	$BLOCKSIZE, dst;		\
	SUB	$1, cnt

/*
 * ZERO_LOOP_FINI_XMM(dst): issue an mfence once the loop is done
 * (presumably to order the movntdq stores).  `dst' is accepted but
 * unused.
 */
#define	ZERO_LOOP_FINI_XMM(dst)			\
	mfence

/*
 * RSTOR_XMM0(r): reload %xmm0 from the save area at `r', then run
 * RSTOR_XMM_EPILOG(r, 1).
 */
#define	RSTOR_XMM0(r)				\
	movdqa	(r), %xmm0;			\
	RSTOR_XMM_EPILOG(r, 1)
118
119#if defined(__amd64)
120
/*
 * void hwblkclr(void *addr, size_t size)	-- amd64
 *
 * Zero `size' bytes at `addr' using movntdq stores of a cleared %xmm0.
 * The SSE path is only taken when addr is BLOCKSIZE aligned and size is
 * a non-zero multiple of BLOCKSIZE; anything else is handed to bzero()
 * with the original arguments still in %rdi/%rsi.
 *
 * %cr0 is saved, clts is executed, and %cr0 is written back before
 * returning.  %xmm0 is saved and restored around the loop only when
 * CR0_TS was clear on entry.
 *
 * Register usage:
 *	%rdi	dst
 *	%rsi	size in bytes, then number of BLOCKSIZE blocks
 *	%rax	saved %cr0 (#if DEBUG then %eax is first t->t_preempt)
 *	%r8	pointer to %xmm register save area
 *	%r11	#if DEBUG then current thread pointer
 */
	ENTRY(hwblkclr)
	pushq	%rbp
	movq	%rsp, %rbp
	testl	$BLOCKMASK, %edi	/* address must be BLOCKSIZE aligned */
	jnz	.dobzero
	cmpq	$BLOCKSIZE, %rsi	/* size must be at least BLOCKSIZE */
	jb	.dobzero		/* unsigned: size is a size_t */
	testq	$BLOCKMASK, %rsi	/* .. and be a multiple of BLOCKSIZE */
	jnz	.dobzero
	shrq	$BLOCKSHIFT, %rsi	/* bytes -> block count */

	ASSERT_KPREEMPT_DISABLED(%r11, %eax, .not_disabled)
	movq	%cr0, %rax		/* remember %cr0 for the exit path */
	clts
	testl	$CR0_TS, %eax
	jnz	1f			/* TS was set: skip saving %xmm0 */

	SAVE_XMM0(%r8)
1:	ZERO_LOOP_INIT_XMM(%rdi)
9:	ZERO_LOOP_BODY_XMM(%rdi, %rsi)
	jnz	9b			/* until the block count hits zero */
	ZERO_LOOP_FINI_XMM(%rdi)

	testl	$CR0_TS, %eax
	jnz	2f			/* nothing was saved, nothing to restore */
	RSTOR_XMM0(%r8)
2:	movq	%rax, %cr0		/* put %cr0 back as we found it */
	leave
	ret
.dobzero:
	leave				/* drop our frame, then tail-call bzero */
	jmp	bzero
	SET_SIZE(hwblkclr)
160
161#elif defined(__i386)
162
/*
 * void hwblkclr(void *addr, size_t size)	-- i386
 *
 * Zero `size' bytes at `addr' using movntdq stores of a cleared %xmm0.
 * The SSE path is only taken when addr is BLOCKSIZE aligned and size is
 * a non-zero multiple of BLOCKSIZE; anything else is handed to bzero()
 * with the caller's stack arguments untouched.
 *
 * %cr0 is saved, clts is executed, and %cr0 is written back before
 * returning.  %xmm0 is saved and restored around the loop only when
 * CR0_TS was clear on entry; %edi is pushed/popped on that path only.
 *
 * Register usage:
 *	%eax	dst
 *	%ecx	size in bytes, then loop count
 *	%ebx	saved %cr0 (#if DEBUG then t->t_preempt first)
 *	%edi	pointer to %xmm register save area
 *	%esi	#if DEBUG then temporary thread pointer (preserved)
 */
	ENTRY(hwblkclr)
	movl	4(%esp), %eax		/* addr */
	movl	8(%esp), %ecx		/* size */
	testl	$BLOCKMASK, %eax	/* address must be BLOCKSIZE aligned */
	jnz	.dobzero
	cmpl	$BLOCKSIZE, %ecx	/* size must be at least BLOCKSIZE */
	jb	.dobzero		/* unsigned: size is a size_t */
	testl	$BLOCKMASK, %ecx	/* .. and be a multiple of BLOCKSIZE */
	jnz	.dobzero
	shrl	$BLOCKSHIFT, %ecx	/* bytes -> block count */
	pushl	%ebx

	pushl	%esi
	ASSERT_KPREEMPT_DISABLED(%esi, %ebx, .not_disabled)
	popl	%esi
	movl	%cr0, %ebx		/* remember %cr0 for the exit path */
	clts
	testl	$CR0_TS, %ebx
	jnz	1f			/* TS was set: skip saving %xmm0 */

	pushl	%edi
	SAVE_XMM0(%edi)
1:	ZERO_LOOP_INIT_XMM(%eax)
9:	ZERO_LOOP_BODY_XMM(%eax, %ecx)
	jnz	9b			/* until the block count hits zero */
	ZERO_LOOP_FINI_XMM(%eax)

	testl	$CR0_TS, %ebx
	jnz	2f			/* nothing was saved, nothing to restore */
	RSTOR_XMM0(%edi)
	popl	%edi
2:	movl	%ebx, %cr0		/* put %cr0 back as we found it */
	popl	%ebx
	ret
.dobzero:
	jmp	bzero			/* stack is untouched here: tail-call */
	SET_SIZE(hwblkclr)
207
208#endif	/* __i386 */
209#endif	/* __lint */
210
211
212#if defined(__lint)
213
/*
 * Lint-only stub; the real hwblkpagecopy() is the assembly routine in
 * the !__lint half of this conditional.
 */
/*ARGSUSED*/
void
hwblkpagecopy(const void *src, void *dst)
{
}
218
219#else	/* __lint */
220
/*
 * PREFETCH_START(src): prefetchnta the first 128 bytes of the source.
 */
#define	PREFETCH_START(src)			\
	prefetchnta	(src);			\
	prefetchnta	0x40(src)

/*
 * SAVE_XMMS(r): run SAVE_XMM_PROLOG(r, 8) to obtain a save area for
 * eight registers addressed by `r', then store %xmm0-%xmm7 there.
 */
#define	SAVE_XMMS(r)				\
	SAVE_XMM_PROLOG(r, 8);			\
	movdqa	%xmm0, 0x0(r);			\
	movdqa	%xmm1, 0x10(r);			\
	movdqa	%xmm2, 0x20(r);			\
	movdqa	%xmm3, 0x30(r);			\
	movdqa	%xmm4, 0x40(r);			\
	movdqa	%xmm5, 0x50(r);			\
	movdqa	%xmm6, 0x60(r);			\
	movdqa	%xmm7, 0x70(r)

/*
 * COPY_LOOP_INIT_XMM(src): prime the pipeline.  Prefetch the next 128
 * bytes, load the first 128 bytes of `src' into %xmm0-%xmm7 and advance
 * `src' by 0x80.  Nothing is stored yet.
 */
#define	COPY_LOOP_INIT_XMM(src)			\
	prefetchnta	0x80(src);		\
	prefetchnta	0xc0(src);		\
	movdqa	(src), %xmm0;			\
	movdqa	0x10(src), %xmm1;		\
	movdqa	0x20(src), %xmm2;		\
	movdqa	0x30(src), %xmm3;		\
	movdqa	0x40(src), %xmm4;		\
	movdqa	0x50(src), %xmm5;		\
	movdqa	0x60(src), %xmm6;		\
	movdqa	0x70(src), %xmm7;		\
	ADD	$0x80, src

/*
 * COPY_LOOP_BODY_XMM(src, dst, cnt): one steady-state iteration.  The
 * 128 bytes already held in %xmm0-%xmm7 are written to `dst' with
 * movntdq, interleaved with movdqa loads of the next 128 bytes from
 * `src' into the registers just stored.  Both pointers advance by 0x80
 * and `cnt' is decremented with subl, so `cnt' must be a 32-bit
 * register.  The flags on exit are those of the subl, so the caller can
 * loop with jnz.
 */
#define	COPY_LOOP_BODY_XMM(src, dst, cnt)	\
	prefetchnta	0x80(src);		\
	prefetchnta	0xc0(src);		\
	prefetchnta	0x100(src);		\
	prefetchnta	0x140(src);		\
	movntdq	%xmm0, 0x0(dst);		\
	movntdq	%xmm1, 0x10(dst);		\
	movntdq	%xmm2, 0x20(dst);		\
	movntdq	%xmm3, 0x30(dst);		\
	movdqa	(src), %xmm0;			\
	movdqa	0x10(src), %xmm1;		\
	movntdq	%xmm4, 0x40(dst);		\
	movntdq	%xmm5, 0x50(dst);		\
	movdqa	0x20(src), %xmm2;		\
	movdqa	0x30(src), %xmm3;		\
	movntdq	%xmm6, 0x60(dst);		\
	movntdq	%xmm7, 0x70(dst);		\
	movdqa	0x40(src), %xmm4;		\
	movdqa	0x50(src), %xmm5;		\
	ADD	$0x80, dst;			\
	movdqa	0x60(src), %xmm6;		\
	movdqa	0x70(src), %xmm7;		\
	ADD	$0x80, src;			\
	subl	$1, cnt

/*
 * COPY_LOOP_FINI_XMM(dst): drain the pipeline by writing the last 128
 * bytes held in %xmm0-%xmm7 to `dst'.  `dst' is not advanced.
 */
#define	COPY_LOOP_FINI_XMM(dst)			\
	movntdq	%xmm0, (dst);			\
	movntdq	%xmm1, 0x10(dst);		\
	movntdq	%xmm2, 0x20(dst);		\
	movntdq	%xmm3, 0x30(dst);		\
	movntdq	%xmm4, 0x40(dst);		\
	movntdq	%xmm5, 0x50(dst);		\
	movntdq	%xmm6, 0x60(dst);		\
	movntdq	%xmm7, 0x70(dst)

/*
 * RSTOR_XMMS(r): reload %xmm0-%xmm7 from the save area at `r', then run
 * RSTOR_XMM_EPILOG(r, 8).
 */
#define	RSTOR_XMMS(r)				\
	movdqa	(r), %xmm0;			\
	movdqa	0x10(r), %xmm1;			\
	movdqa	0x20(r), %xmm2;			\
	movdqa	0x30(r), %xmm3;			\
	movdqa	0x40(r), %xmm4;			\
	movdqa	0x50(r), %xmm5;			\
	movdqa	0x60(r), %xmm6;			\
	movdqa	0x70(r), %xmm7;			\
	RSTOR_XMM_EPILOG(r, 8)
294
295#if defined(__amd64)
296
/*
 * void hwblkpagecopy(const void *src, void *dst)	-- amd64
 *
 * Copy one 4096-byte page from src to dst: movdqa loads into
 * %xmm0-%xmm7, movntdq stores out of them.
 *
 * %cr0 is saved, clts is executed, and %cr0 is written back before
 * returning.  %xmm0-%xmm7 are saved and restored around the copy only
 * when CR0_TS was clear on entry.
 *
 * Register usage:
 *	%rdi	src
 *	%rsi	dst
 *	%rdx	#if DEBUG then curthread
 *	%ecx	loop count
 *	%rax	saved %cr0 (#if DEBUG then %eax is t->t_preempt)
 *	%r8	pointer to %xmm register save area
 */
	ENTRY(hwblkpagecopy)
	pushq	%rbp
	movq	%rsp, %rbp
	PREFETCH_START(%rdi)
	/*
	 * PAGESIZE is 4096 and each loop iteration moves 128 bytes, but
	 * the initial load and the final store save us one loop count.
	 */
	movl	$_CONST(32 - 1), %ecx
	ASSERT_KPREEMPT_DISABLED(%rdx, %eax, .not_disabled)
	movq	%cr0, %rax		/* remember %cr0 for the exit path */
	clts
	testl	$CR0_TS, %eax
	jne	1f			/* TS was set: skip saving %xmm0-7 */
	SAVE_XMMS(%r8)
1:	COPY_LOOP_INIT_XMM(%rdi)
9:	COPY_LOOP_BODY_XMM(%rdi, %rsi, %ecx)
	jne	9b			/* until the loop count hits zero */
	COPY_LOOP_FINI_XMM(%rsi)
	testl	$CR0_TS, %eax
	jne	2f			/* nothing was saved, nothing to restore */
	RSTOR_XMMS(%r8)
2:	movq	%rax, %cr0		/* put %cr0 back as we found it */
	mfence
	leave
	ret
	SET_SIZE(hwblkpagecopy)
332
333#elif defined(__i386)
334
/*
 * void hwblkpagecopy(const void *src, void *dst)	-- i386
 *
 * Copy one 4096-byte page from src to dst: movdqa loads into
 * %xmm0-%xmm7, movntdq stores out of them.
 *
 * %cr0 is saved, clts is executed, and %cr0 is written back before
 * returning.  %xmm0-%xmm7 are saved and restored around the copy only
 * when CR0_TS was clear on entry; %edi is pushed/popped on that path
 * only.
 *
 * Register usage:
 *	%eax	src
 *	%edx	dst
 *	%ecx	loop count
 *	%ebx	saved %cr0 (#if DEBUG then t->t_preempt)
 *	%edi	pointer to %xmm register save area
 *	%esi	#if DEBUG temporary thread pointer
 */
	ENTRY(hwblkpagecopy)
	movl	4(%esp), %eax		/* src */
	movl	8(%esp), %edx		/* dst */
	PREFETCH_START(%eax)
	pushl	%ebx
	/*
	 * PAGESIZE is 4096 and each loop iteration moves 128 bytes, but
	 * the initial load and the final store save us one loop count.
	 */
	movl	$_CONST(32 - 1), %ecx
	pushl	%esi
	ASSERT_KPREEMPT_DISABLED(%esi, %ebx, .not_disabled)
	popl	%esi
	movl	%cr0, %ebx		/* remember %cr0 for the exit path */
	clts
	testl	$CR0_TS, %ebx
	jne	1f			/* TS was set: skip saving %xmm0-7 */
	pushl	%edi
	SAVE_XMMS(%edi)
1:	COPY_LOOP_INIT_XMM(%eax)
9:	COPY_LOOP_BODY_XMM(%eax, %edx, %ecx)
	jne	9b			/* until the loop count hits zero */
	COPY_LOOP_FINI_XMM(%edx)
	testl	$CR0_TS, %ebx
	jne	2f			/* nothing was saved, nothing to restore */
	RSTOR_XMMS(%edi)
	popl	%edi
2:	movl	%ebx, %cr0		/* put %cr0 back as we found it */
	popl	%ebx
	mfence
	ret
	SET_SIZE(hwblkpagecopy)
375
376#endif	/* __i386 */
377#endif	/* __lint */
378
379#if defined(__lint)
380
/*
 * block_zero_no_xmm() is the variant of hwblkclr() that does not touch
 * the XMM registers.  Callers must pass an aligned dst and an aligned
 * len.
 *
 * XXPV This needs to be performance tuned at some point.
 *	Is 4 the best number of iterations to unroll?
 *
 * This is the lint-only stub; the real routine is in assembly.
 */
/*ARGSUSED*/
void
block_zero_no_xmm(void *dst, int len)
{
}
392
393#else	/* __lint */
394
395#if defined(__amd64)
396
/*
 * void block_zero_no_xmm(void *dst, int len)	-- amd64
 *
 * Zero len bytes at dst with movnti stores, 32 bytes per iteration.
 * len must be a multiple of 32; a len of zero (or less) is a no-op.
 *
 * The loop indexes with a negative offset that counts up to zero:
 * %rdi is advanced to the end of the buffer and %rsi holds -len, so
 * (%rdi, %rsi) walks the buffer and the addq that advances the offset
 * also produces the loop-termination flags.
 *
 * Register usage:
 *	%rdi	dst, then dst + len
 *	%rsi	len, then negative byte offset from the end
 *	%rax	zero
 */
	ENTRY(block_zero_no_xmm)
	pushq	%rbp
	movq	%rsp, %rbp
	/*
	 * len is a 32-bit int, so the upper half of %rsi is not
	 * guaranteed by the calling convention; widen it before it is
	 * used in 64-bit address arithmetic.
	 */
	movslq	%esi, %rsi
	testq	%rsi, %rsi
	jle	2f			/* nothing to clear */
	xorl	%eax, %eax
	addq	%rsi, %rdi		/* %rdi = end of buffer */
	negq	%rsi			/* %rsi = -len */
1:
	movnti	%rax, (%rdi, %rsi)
	movnti	%rax, 8(%rdi, %rsi)
	movnti	%rax, 16(%rdi, %rsi)
	movnti	%rax, 24(%rdi, %rsi)
	addq	$32, %rsi
	jnz	1b			/* until the offset reaches zero */
	mfence
2:
	leave
	ret
	SET_SIZE(block_zero_no_xmm)
414
415#elif defined(__i386)
416
/*
 * void block_zero_no_xmm(void *dst, int len)	-- i386
 *
 * Zero len bytes at dst with movnti stores, 16 bytes per iteration.
 * len must be a multiple of 16; a len of zero (or less) is a no-op.
 *
 * The loop indexes with a negative offset that counts up to zero:
 * %edx is advanced to the end of the buffer and %ecx holds -len, so
 * (%edx, %ecx) walks the buffer and the addl that advances the offset
 * also produces the loop-termination flags.
 *
 * Register usage:
 *	%edx	dst, then dst + len
 *	%ecx	len, then negative byte offset from the end
 *	%eax	zero
 */
	ENTRY(block_zero_no_xmm)
	pushl	%ebp
	movl	%esp, %ebp
	xorl	%eax, %eax
	movl	8(%ebp), %edx		/* dst */
	movl	12(%ebp), %ecx		/* len */
	testl	%ecx, %ecx
	jle	2f			/* nothing to clear */
	addl	%ecx, %edx		/* %edx = end of buffer */
	negl	%ecx			/* %ecx = -len */
1:
	movnti	%eax, (%edx, %ecx)
	movnti	%eax, 4(%edx, %ecx)
	movnti	%eax, 8(%edx, %ecx)
	movnti	%eax, 12(%edx, %ecx)
	addl	$16, %ecx
	jnz	1b			/* until the offset reaches zero */
	mfence
2:
	leave
	ret
	SET_SIZE(block_zero_no_xmm)
436
437#endif	/* __i386 */
438#endif	/* __lint */
439
440
441#if defined(__lint)
442
/*
 * page_copy_no_xmm() is the page-copy variant that does not touch the
 * XMM registers.
 *
 * XXPV	This needs to be performance tuned at some point.
 *	Is 4 the right number of iterations to unroll?
 *	Is the load/store order optimal? Should it use prefetch?
 *
 * This is the lint-only stub; the real routine is in assembly.
 */
/*ARGSUSED*/
void
page_copy_no_xmm(void *dst, void *src)
{
}
454
455#else	/* __lint */
456
457#if defined(__amd64)
458
/*
 * void page_copy_no_xmm(void *dst, void *src)	-- amd64
 *
 * Copy MMU_STD_PAGESIZE bytes from src to dst through %rax, storing
 * with movnti, 32 bytes per iteration.
 *
 * Both pointers are first advanced to the end of their page and %rcx
 * holds the negative byte offset, counting up to zero; the addq that
 * advances the offset also produces the loop-termination flags.
 *
 * Register usage:
 *	%rdi	dst, then dst + MMU_STD_PAGESIZE
 *	%rsi	src, then src + MMU_STD_PAGESIZE
 *	%rcx	negative byte offset from the end of the page
 *	%rax	data in flight
 */
	ENTRY(page_copy_no_xmm)
	movq	$MMU_STD_PAGESIZE, %rcx
	addq	%rcx, %rdi		/* %rdi = end of dst page */
	addq	%rcx, %rsi		/* %rsi = end of src page */
	negq	%rcx			/* %rcx = -MMU_STD_PAGESIZE */
1:
	movq	0(%rsi, %rcx), %rax
	movnti	%rax, 0(%rdi, %rcx)
	movq	8(%rsi, %rcx), %rax
	movnti	%rax, 8(%rdi, %rcx)
	movq	16(%rsi, %rcx), %rax
	movnti	%rax, 16(%rdi, %rcx)
	movq	24(%rsi, %rcx), %rax
	movnti	%rax, 24(%rdi, %rcx)
	addq	$32, %rcx
	jne	1b			/* until the offset reaches zero */
	mfence
	ret
	SET_SIZE(page_copy_no_xmm)
478
479#elif defined(__i386)
480
/*
 * void page_copy_no_xmm(void *dst, void *src)	-- i386
 *
 * Copy MMU_STD_PAGESIZE bytes from src to dst through %eax, storing
 * with movnti, 16 bytes per iteration.
 *
 * Both pointers are first advanced to the end of their page and %ecx
 * holds the negative byte offset, counting up to zero; the addl that
 * advances the offset also produces the loop-termination flags.
 *
 * Register usage:
 *	%edx	dst, then dst + MMU_STD_PAGESIZE
 *	%esi	src, then src + MMU_STD_PAGESIZE (caller's %esi preserved)
 *	%ecx	negative byte offset from the end of the page
 *	%eax	data in flight
 */
	ENTRY(page_copy_no_xmm)
	pushl	%esi
	movl	$MMU_STD_PAGESIZE, %ecx
	movl	8(%esp), %edx		/* dst (offsets include saved %esi) */
	movl	12(%esp), %esi		/* src */
	addl	%ecx, %edx		/* %edx = end of dst page */
	addl	%ecx, %esi		/* %esi = end of src page */
	negl	%ecx			/* %ecx = -MMU_STD_PAGESIZE */
1:
	movl	0(%esi, %ecx), %eax
	movnti	%eax, 0(%edx, %ecx)
	movl	4(%esi, %ecx), %eax
	movnti	%eax, 4(%edx, %ecx)
	movl	8(%esi, %ecx), %eax
	movnti	%eax, 8(%edx, %ecx)
	movl	12(%esi, %ecx), %eax
	movnti	%eax, 12(%edx, %ecx)
	addl	$16, %ecx
	jne	1b			/* until the offset reaches zero */
	mfence
	popl	%esi
	ret
	SET_SIZE(page_copy_no_xmm)
504
505#endif	/* __i386 */
506#endif	/* __lint */
507
/*
 * Panic message handed to ASSERT_KPREEMPT_DISABLED() by the SSE block
 * routines in this file; only needed in DEBUG builds.
 */
#if !defined(__lint) && defined(DEBUG)
	.text
.not_disabled:
	.string	"sseblk: preemption not disabled!"
#endif	/* DEBUG && !__lint */
513