xref: /titanic_52/usr/src/uts/intel/ia32/ml/sseblk.s (revision ae115bc77f6fcde83175c75b4206dc2e50747966)
17c478bd9Sstevel@tonic-gate/*
27c478bd9Sstevel@tonic-gate * CDDL HEADER START
37c478bd9Sstevel@tonic-gate *
47c478bd9Sstevel@tonic-gate * The contents of this file are subject to the terms of the
5*ae115bc7Smrj * Common Development and Distribution License (the "License").
6*ae115bc7Smrj * You may not use this file except in compliance with the License.
77c478bd9Sstevel@tonic-gate *
87c478bd9Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bd9Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing.
107c478bd9Sstevel@tonic-gate * See the License for the specific language governing permissions
117c478bd9Sstevel@tonic-gate * and limitations under the License.
127c478bd9Sstevel@tonic-gate *
137c478bd9Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each
147c478bd9Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bd9Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the
167c478bd9Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying
177c478bd9Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bd9Sstevel@tonic-gate *
197c478bd9Sstevel@tonic-gate * CDDL HEADER END
207c478bd9Sstevel@tonic-gate */
217c478bd9Sstevel@tonic-gate/*
22*ae115bc7Smrj * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
237c478bd9Sstevel@tonic-gate * Use is subject to license terms.
247c478bd9Sstevel@tonic-gate */
257c478bd9Sstevel@tonic-gate
267c478bd9Sstevel@tonic-gate#pragma	ident	"%Z%%M%	%I%	%E% SMI"
277c478bd9Sstevel@tonic-gate
287c478bd9Sstevel@tonic-gate#include <sys/asm_linkage.h>
297c478bd9Sstevel@tonic-gate#include <sys/regset.h>
307c478bd9Sstevel@tonic-gate#include <sys/privregs.h>
317c478bd9Sstevel@tonic-gate
327c478bd9Sstevel@tonic-gate#if defined(__lint)
337c478bd9Sstevel@tonic-gate#include <sys/types.h>
347c478bd9Sstevel@tonic-gate#include <sys/archsystm.h>
357c478bd9Sstevel@tonic-gate#else
367c478bd9Sstevel@tonic-gate#include "assym.h"
377c478bd9Sstevel@tonic-gate#endif
387c478bd9Sstevel@tonic-gate
397c478bd9Sstevel@tonic-gate/*
407c478bd9Sstevel@tonic-gate * Do block operations using Streaming SIMD extensions
417c478bd9Sstevel@tonic-gate */
427c478bd9Sstevel@tonic-gate
/*
 * ASSERT_KPREEMPT_DISABLED(t, r32, msg) -- DEBUG-only sanity check.
 *
 * Loads the current thread pointer (%gs:CPU_THREAD) into t, sign-extends
 * the byte t->t_preempt into r32 and, if it is zero (i.e. kernel
 * preemption is NOT disabled), builds a stack frame and panics with msg.
 * On amd64, %eax is cleared before the call because panic() is variadic
 * (SysV ABI: %al = number of vector registers used).
 * Expands to nothing on non-DEBUG builds.
 * Uses numeric local label 5: -- callers must not collide with it.
 */
#if defined(DEBUG)
#if defined(__amd64)
#define	ASSERT_KPREEMPT_DISABLED(t, r32, msg)	\
	movq	%gs:CPU_THREAD, t;		\
	movsbl	T_PREEMPT(t), r32;		\
	testl	r32, r32;			\
	jne	5f;				\
	pushq	%rbp;				\
	movq	%rsp, %rbp;			\
	leaq	msg(%rip), %rdi;		\
	xorl	%eax, %eax;			\
	call	panic;				\
5:
#elif defined(__i386)
#define	ASSERT_KPREEMPT_DISABLED(t, r32, msg)	\
	movl	%gs:CPU_THREAD, t;		\
	movsbl	T_PREEMPT(t), r32;		\
	testl	r32, r32;			\
	jne	5f;				\
	pushl	%ebp;				\
	movl	%esp, %ebp;			\
	pushl	$msg;				\
	call	panic;				\
5:
#endif	/* __i386 */
#else	/* DEBUG */
#define	ASSERT_KPREEMPT_DISABLED(t, r32, msg)
#endif	/* DEBUG */
717c478bd9Sstevel@tonic-gate
/*
 * The block primitives below operate on 64-byte blocks; the three
 * constants must stay mutually consistent (compile-time checked below).
 */
#define	BLOCKSHIFT	6
#define	BLOCKSIZE	64	/* (1 << BLOCKSHIFT) */
#define	BLOCKMASK	63	/* (BLOCKSIZE - 1) */

#if (1 << BLOCKSHIFT) != BLOCKSIZE || BLOCKMASK != (BLOCKSIZE - 1)
#error	"mucked up constants"
#endif
797c478bd9Sstevel@tonic-gate
#if defined(__lint)

/*ARGSUSED*/
void
hwblkclr(void *addr, size_t size)
{}

#else	/* __lint */

/* Pointer-width add/sub so the macros below work for both ISAs. */
#if defined(__amd64)
#define	ADD	addq
#define	SUB	subq
#else
#define	ADD	addl
#define	SUB	subl
#endif

/*
 * Save %xmm0 before using it as the zero-source register.
 * SAVE_XMM_PROLOG(r, n) (from assym.h machinery) provides a save area
 * for n registers with its address in r; movdqa requires that area to
 * be 16-byte aligned.
 */
#define	SAVE_XMM0(r)				\
	SAVE_XMM_PROLOG(r, 1);			\
	movdqa	%xmm0, (r)

/* %xmm0 := 0; it supplies the 16 zero bytes for every movntdq below. */
#define	ZERO_LOOP_INIT_XMM(dst)			\
	pxor	%xmm0, %xmm0

/*
 * Zero one 64-byte block with four non-temporal (cache-bypassing)
 * 16-byte stores, advance dst, and decrement cnt -- the resulting ZF
 * is what the caller's jnz tests.
 */
#define	ZERO_LOOP_BODY_XMM(dst, cnt)		\
	movntdq	%xmm0, (dst);			\
	movntdq	%xmm0, 0x10(dst);		\
	movntdq	%xmm0, 0x20(dst);		\
	movntdq	%xmm0, 0x30(dst);		\
	ADD	$BLOCKSIZE, dst;		\
	SUB	$1, cnt

/* Fence so the non-temporal stores are globally visible before return. */
#define	ZERO_LOOP_FINI_XMM(dst)			\
	mfence

/* Undo SAVE_XMM0: restore %xmm0 and release the save area. */
#define	RSTOR_XMM0(r)				\
	movdqa	0x0(r), %xmm0;			\
	RSTOR_XMM_EPILOG(r, 1)
1187c478bd9Sstevel@tonic-gate
#if defined(__amd64)

	/*
	 * void hwblkclr(void *addr, size_t size)
	 *
	 * Zero a region with non-temporal SSE stores.  If addr is not
	 * BLOCKSIZE-aligned, or size is smaller than / not a multiple of
	 * BLOCKSIZE, fall through to plain bzero() instead.  Preemption
	 * must be disabled by the caller (asserted under DEBUG) because
	 * we borrow %xmm0 and toggle %cr0.
	 *
	 * %rdi		dst
	 * %rsi		size
	 * %rax		saved %cr0 (#if DEBUG then %eax is t->t_preempt)
	 * %r8		pointer to %xmm register save area
	 */
	ENTRY(hwblkclr)
	pushq	%rbp
	movq	%rsp, %rbp
	testl	$BLOCKMASK, %edi	/* address must be BLOCKSIZE aligned */
	jne	.dobzero
	cmpq	$BLOCKSIZE, %rsi	/* size must be at least BLOCKSIZE */
	jl	.dobzero
	testq	$BLOCKMASK, %rsi	/* .. and be a multiple of BLOCKSIZE */
	jne	.dobzero
	shrq	$BLOCKSHIFT, %rsi	/* %rsi = number of 64-byte blocks */

	ASSERT_KPREEMPT_DISABLED(%r11, %eax, .not_disabled)
	movq	%cr0, %rax		/* remember original %cr0 */
	clts				/* clear CR0.TS so SSE won't fault */
	testl	$CR0_TS, %eax
	jnz	1f			/* TS was set: skip saving %xmm0 */

	SAVE_XMM0(%r8)
1:	ZERO_LOOP_INIT_XMM(%rdi)
9:	ZERO_LOOP_BODY_XMM(%rdi, %rsi)
	jnz	9b			/* loop until block count hits 0 */
	ZERO_LOOP_FINI_XMM(%rdi)

	testl	$CR0_TS, %eax
	jnz	2f			/* nothing was saved; skip restore */
	RSTOR_XMM0(%r8)
2:	movq	%rax, %cr0		/* restore %cr0 (including TS) */
	leave
	ret
.dobzero:
	leave
	jmp	bzero			/* tail call; args still in place */
	SET_SIZE(hwblkclr)

#elif defined(__i386)

	/*
	 * void hwblkclr(void *addr, size_t size) -- ia32 variant.
	 * Same contract as the amd64 version above: non-temporal zeroing
	 * of a BLOCKSIZE-aligned, BLOCKSIZE-multiple region, falling back
	 * to bzero() otherwise; caller must have preemption disabled.
	 *
	 * %eax		dst
	 * %ecx		size in bytes, loop count
	 * %ebx		saved %cr0 (#if DEBUG then t->t_preempt)
	 * %edi		pointer to %xmm register save area
	 */
	ENTRY(hwblkclr)
	movl	4(%esp), %eax
	movl	8(%esp), %ecx
	testl	$BLOCKMASK, %eax	/* address must be BLOCKSIZE aligned */
	jne	.dobzero
	cmpl	$BLOCKSIZE, %ecx	/* size must be at least BLOCKSIZE */
	jl	.dobzero
	testl	$BLOCKMASK, %ecx 	/* .. and be a multiple of BLOCKSIZE */
	jne	.dobzero
	shrl	$BLOCKSHIFT, %ecx	/* %ecx = number of 64-byte blocks */
	movl	0xc(%esp), %edx		/* NOTE(review): reads past the two
					   declared args; %edx appears unused
					   below -- confirm intent */
	pushl	%ebx			/* %ebx is callee-saved */

	pushl	%esi			/* scratch for the DEBUG assert */
	ASSERT_KPREEMPT_DISABLED(%esi, %ebx, .not_disabled)
	popl	%esi
	movl	%cr0, %ebx		/* remember original %cr0 */
	clts				/* clear CR0.TS so SSE won't fault */
	testl	$CR0_TS, %ebx
	jnz	1f			/* TS was set: skip saving %xmm0 */

	pushl	%edi
	SAVE_XMM0(%edi)
1:	ZERO_LOOP_INIT_XMM(%eax)
9:	ZERO_LOOP_BODY_XMM(%eax, %ecx)
	jnz	9b			/* loop until block count hits 0 */
	ZERO_LOOP_FINI_XMM(%eax)

	testl	$CR0_TS, %ebx
	jnz	2f			/* nothing was saved; skip restore */
	RSTOR_XMM0(%edi)
	popl	%edi
2:	movl	%ebx, %cr0		/* restore %cr0 (including TS) */
	popl	%ebx
	ret
.dobzero:
	jmp	bzero			/* tail call; args still in place */
	SET_SIZE(hwblkclr)

#endif	/* __i386 */
#endif	/* __lint */
2107c478bd9Sstevel@tonic-gate
2117c478bd9Sstevel@tonic-gate
#if defined(__lint)

/*ARGSUSED*/
void
hwblkpagecopy(const void *src, void *dst)
{}

#else	/* __lint */

/* Prime the non-temporal prefetcher with the first two 64-byte lines. */
#define	PREFETCH_START(src)			\
	prefetchnta	0x0(src);		\
	prefetchnta	0x40(src)

/*
 * Save all eight %xmm registers into the save area provided by
 * SAVE_XMM_PROLOG (address left in r; must be 16-byte aligned for
 * movdqa).  Paired with RSTOR_XMMS below.
 */
#define	SAVE_XMMS(r)				\
	SAVE_XMM_PROLOG(r, 8);			\
	movdqa	%xmm0, (r);			\
	movdqa	%xmm1, 0x10(r);			\
	movdqa	%xmm2, 0x20(r);			\
	movdqa	%xmm3, 0x30(r);			\
	movdqa	%xmm4, 0x40(r);			\
	movdqa	%xmm5, 0x50(r);			\
	movdqa	%xmm6, 0x60(r);			\
	movdqa	%xmm7, 0x70(r)

/*
 * Prefetch ahead and preload the first 128 bytes into %xmm0-%xmm7,
 * then advance src.  The loop body stores these while loading the
 * next 128 bytes (software pipelining).
 */
#define	COPY_LOOP_INIT_XMM(src)			\
	prefetchnta	0x80(src);		\
	prefetchnta	0xc0(src);		\
	movdqa	0x0(src), %xmm0;		\
	movdqa	0x10(src), %xmm1;		\
	movdqa	0x20(src), %xmm2;		\
	movdqa	0x30(src), %xmm3;		\
	movdqa	0x40(src), %xmm4;		\
	movdqa	0x50(src), %xmm5;		\
	movdqa	0x60(src), %xmm6;		\
	movdqa	0x70(src), %xmm7;		\
	ADD	$0x80, src

/*
 * One pipelined iteration: non-temporally store the 128 bytes already
 * in %xmm0-%xmm7 to dst while reloading the registers from the next
 * 128 source bytes; loads and stores are interleaved to overlap their
 * latencies.  Decrements cnt last so ZF feeds the caller's jnz.
 */
#define	COPY_LOOP_BODY_XMM(src, dst, cnt)	\
	prefetchnta	0x80(src);		\
	prefetchnta	0xc0(src);		\
	prefetchnta	0x100(src);		\
	prefetchnta	0x140(src);		\
	movntdq	%xmm0, (dst);			\
	movntdq	%xmm1, 0x10(dst);		\
	movntdq	%xmm2, 0x20(dst);		\
	movntdq	%xmm3, 0x30(dst);		\
	movdqa	0x0(src), %xmm0;		\
	movdqa	0x10(src), %xmm1;		\
	movntdq	%xmm4, 0x40(dst);		\
	movntdq	%xmm5, 0x50(dst);		\
	movdqa	0x20(src), %xmm2;		\
	movdqa	0x30(src), %xmm3;		\
	movntdq	%xmm6, 0x60(dst);		\
	movntdq	%xmm7, 0x70(dst);		\
	movdqa	0x40(src), %xmm4;		\
	movdqa	0x50(src), %xmm5;		\
	ADD	$0x80, dst;			\
	movdqa	0x60(src), %xmm6;		\
	movdqa	0x70(src), %xmm7;		\
	ADD	$0x80, src;			\
	subl	$1, cnt

/* Drain the pipeline: store the final 128 bytes left in %xmm0-%xmm7. */
#define	COPY_LOOP_FINI_XMM(dst)			\
	movntdq	%xmm0, 0x0(dst);		\
	movntdq	%xmm1, 0x10(dst);		\
	movntdq	%xmm2, 0x20(dst);		\
	movntdq	%xmm3, 0x30(dst);		\
	movntdq	%xmm4, 0x40(dst);		\
	movntdq	%xmm5, 0x50(dst);		\
	movntdq %xmm6, 0x60(dst);		\
	movntdq	%xmm7, 0x70(dst)

/* Restore all eight %xmm registers and release the save area. */
#define	RSTOR_XMMS(r)				\
	movdqa	0x0(r), %xmm0;			\
	movdqa	0x10(r), %xmm1;			\
	movdqa	0x20(r), %xmm2;			\
	movdqa	0x30(r), %xmm3;			\
	movdqa	0x40(r), %xmm4;			\
	movdqa	0x50(r), %xmm5;			\
	movdqa	0x60(r), %xmm6;			\
	movdqa	0x70(r), %xmm7;			\
	RSTOR_XMM_EPILOG(r, 8)
2947c478bd9Sstevel@tonic-gate
#if defined(__amd64)

	/*
	 * void hwblkpagecopy(const void *src, void *dst)
	 *
	 * Copy one 4K page with SSE loads and non-temporal stores,
	 * 128 bytes per loop iteration.  Caller must have preemption
	 * disabled (asserted under DEBUG) since %xmm0-%xmm7 and %cr0
	 * are manipulated.
	 *
	 * %rdi		src
	 * %rsi		dst
	 * %rdx		#if DEBUG then curthread
	 * %ecx		loop count
	 * %rax		saved %cr0 (#if DEBUG then %eax is t->t_preempt)
	 * %r8		pointer to %xmm register save area
	 */
	ENTRY(hwblkpagecopy)
	pushq	%rbp
	movq	%rsp, %rbp
	PREFETCH_START(%rdi)
	/*
	 * PAGESIZE is 4096, each loop moves 128 bytes, but the initial
	 * load and final store save us one loop count
	 * (31 * 128 + 128 = 4096)
	 */
	movl	$_CONST(32 - 1), %ecx
	ASSERT_KPREEMPT_DISABLED(%rdx, %eax, .not_disabled)
	movq	%cr0, %rax		/* remember original %cr0 */
	clts				/* clear CR0.TS so SSE won't fault */
	testl	$CR0_TS, %eax
	jnz	3f			/* TS was set: skip saving xmms */
	SAVE_XMMS(%r8)
3:	COPY_LOOP_INIT_XMM(%rdi)
4:	COPY_LOOP_BODY_XMM(%rdi, %rsi, %ecx)
	jnz	4b			/* loop until count hits 0 */
	COPY_LOOP_FINI_XMM(%rsi)
	testl	$CR0_TS, %eax
	jnz	5f			/* nothing was saved; skip restore */
	RSTOR_XMMS(%r8)
5:	movq	%rax, %cr0		/* restore %cr0 (including TS) */
	mfence				/* order the non-temporal stores */
	leave
	ret
	SET_SIZE(hwblkpagecopy)

#elif defined(__i386)

	/*
	 * void hwblkpagecopy(const void *src, void *dst) -- ia32 variant.
	 * Same contract as the amd64 version above.
	 *
	 * %eax		src
	 * %edx		dst
	 * %ecx		loop count
	 * %ebx		saved %cr0 (#if DEBUG then t->t_preempt)
	 * %edi		pointer to %xmm register save area
	 * %esi		#if DEBUG temporary thread pointer
	 */
	ENTRY(hwblkpagecopy)
	movl	4(%esp), %eax
	movl	8(%esp), %edx
	PREFETCH_START(%eax)
	pushl	%ebx			/* %ebx is callee-saved */
	/*
	 * PAGESIZE is 4096, each loop moves 128 bytes, but the initial
	 * load and final store save us one loop count
	 * (31 * 128 + 128 = 4096)
	 */
	movl	$_CONST(32 - 1), %ecx
	pushl	%esi			/* scratch for the DEBUG assert */
	ASSERT_KPREEMPT_DISABLED(%esi, %ebx, .not_disabled)
	popl	%esi
	movl	%cr0, %ebx		/* remember original %cr0 */
	clts				/* clear CR0.TS so SSE won't fault */
	testl	$CR0_TS, %ebx
	jnz	3f			/* TS was set: skip saving xmms */
	pushl	%edi
	SAVE_XMMS(%edi)
3:	COPY_LOOP_INIT_XMM(%eax)
4:	COPY_LOOP_BODY_XMM(%eax, %edx, %ecx)
	jnz	4b			/* loop until count hits 0 */
	COPY_LOOP_FINI_XMM(%edx)
	testl	$CR0_TS, %ebx
	jnz	5f			/* nothing was saved; skip restore */
	RSTOR_XMMS(%edi)
	popl	%edi
5:	movl	%ebx, %cr0		/* restore %cr0 (including TS) */
	popl	%ebx
	mfence				/* order the non-temporal stores */
	ret
	SET_SIZE(hwblkpagecopy)

#endif	/* __i386 */
#endif	/* __lint */
3787c478bd9Sstevel@tonic-gate
#if defined(__lint)

/*
 * Version of hwblkclr which doesn't use XMM registers.
 * Note that it requires aligned dst and len.
 *
 * XXPV This needs to be performance tuned at some point.
 *	Is 4 the best number of iterations to unroll?
 */
/*ARGSUSED*/
void
block_zero_no_xmm(void *dst, int len)
{}

#else	/* __lint */

#if defined(__amd64)

	/*
	 * void block_zero_no_xmm(void *dst, int len)
	 * %rdi = dst, %rsi = len.
	 * Zeroes with 8-byte non-temporal stores, 32 bytes per
	 * iteration; len must be a nonzero multiple of 32 or the
	 * jnz-terminated loop will not stop.  Walks a negative index
	 * up toward zero from dst+len.
	 */
	ENTRY(block_zero_no_xmm)
	pushq	%rbp
	movq	%rsp, %rbp
	xorl	%eax, %eax		/* zero source for movnti */
	addq	%rsi, %rdi		/* %rdi = dst + len (end) */
	negq	%rsi			/* index runs -len .. 0 */
1:
	movnti	%rax, (%rdi, %rsi)
	movnti	%rax, 8(%rdi, %rsi)
	movnti	%rax, 16(%rdi, %rsi)
	movnti	%rax, 24(%rdi, %rsi)
	addq	$32, %rsi
	jnz	1b
	mfence				/* order the non-temporal stores */
	leave
	ret
	SET_SIZE(block_zero_no_xmm)

#elif defined(__i386)

	/*
	 * void block_zero_no_xmm(void *dst, int len) -- ia32 variant.
	 * Zeroes with 4-byte non-temporal stores, 16 bytes per
	 * iteration; len must be a nonzero multiple of 16 or the
	 * jnz-terminated loop will not stop.
	 */
	ENTRY(block_zero_no_xmm)
	pushl	%ebp
	movl	%esp, %ebp
	xorl	%eax, %eax		/* zero source for movnti */
	movl	8(%ebp), %edx		/* dst */
	movl	12(%ebp), %ecx		/* len */
	addl	%ecx, %edx		/* %edx = dst + len (end) */
	negl	%ecx			/* index runs -len .. 0 */
1:
	movnti	%eax, (%edx, %ecx)
	movnti	%eax, 4(%edx, %ecx)
	movnti	%eax, 8(%edx, %ecx)
	movnti	%eax, 12(%edx, %ecx)
	addl	$16, %ecx
	jnz	1b
	mfence				/* order the non-temporal stores */
	leave
	ret
	SET_SIZE(block_zero_no_xmm)

#endif	/* __i386 */
#endif	/* __lint */

440*ae115bc7Smrj
#if defined(__lint)

/*
 * Version of page copy which doesn't use XMM registers.
 *
 * XXPV	This needs to be performance tuned at some point.
 *	Is 4 the right number of iterations to unroll?
 *	Is the load/store order optimal? Should it use prefetch?
 */
/*ARGSUSED*/
void
page_copy_no_xmm(void *dst, void *src)
{}

#else	/* __lint */

#if defined(__amd64)

	/*
	 * void page_copy_no_xmm(void *dst, void *src)
	 * %rdi = dst, %rsi = src.
	 * Copies MMU_STD_PAGESIZE bytes via %rax with 8-byte
	 * non-temporal stores, 32 bytes per iteration, walking a
	 * negative index up toward zero from the page ends.
	 */
	ENTRY(page_copy_no_xmm)
	movq	$MMU_STD_PAGESIZE, %rcx
	addq	%rcx, %rdi		/* %rdi = end of dst page */
	addq	%rcx, %rsi		/* %rsi = end of src page */
	negq	%rcx			/* index runs -PAGESIZE .. 0 */
1:
	movq	(%rsi, %rcx), %rax
	movnti	%rax, (%rdi, %rcx)
	movq	8(%rsi, %rcx), %rax
	movnti	%rax, 8(%rdi, %rcx)
	movq	16(%rsi, %rcx), %rax
	movnti	%rax, 16(%rdi, %rcx)
	movq	24(%rsi, %rcx), %rax
	movnti	%rax, 24(%rdi, %rcx)
	addq	$32, %rcx
	jnz	1b
	mfence				/* order the non-temporal stores */
	ret
	SET_SIZE(page_copy_no_xmm)

#elif defined(__i386)

	/*
	 * void page_copy_no_xmm(void *dst, void *src) -- ia32 variant.
	 * Copies MMU_STD_PAGESIZE bytes via %eax with 4-byte
	 * non-temporal stores, 16 bytes per iteration.
	 */
	ENTRY(page_copy_no_xmm)
	pushl	%esi			/* %esi is callee-saved */
	movl	$MMU_STD_PAGESIZE, %ecx
	movl	8(%esp), %edx		/* dst (args shifted by the push) */
	movl	12(%esp), %esi		/* src */
	addl	%ecx, %edx		/* %edx = end of dst page */
	addl	%ecx, %esi		/* %esi = end of src page */
	negl	%ecx			/* index runs -PAGESIZE .. 0 */
1:
	movl	(%esi, %ecx), %eax
	movnti	%eax, (%edx, %ecx)
	movl	4(%esi, %ecx), %eax
	movnti	%eax, 4(%edx, %ecx)
	movl	8(%esi, %ecx), %eax
	movnti	%eax, 8(%edx, %ecx)
	movl	12(%esi, %ecx), %eax
	movnti	%eax, 12(%edx, %ecx)
	addl	$16, %ecx
	jnz	1b
	mfence				/* order the non-temporal stores */
	popl	%esi
	ret
	SET_SIZE(page_copy_no_xmm)

#endif	/* __i386 */
#endif	/* __lint */
5077c478bd9Sstevel@tonic-gate
#if defined(DEBUG) && !defined(__lint)
	.text
/* Panic message referenced by ASSERT_KPREEMPT_DISABLED call sites. */
.not_disabled:
	.string	"sseblk: preemption not disabled!"
#endif
513