/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/asm_linkage.h>
#include <sys/regset.h>
#include <sys/privregs.h>

#if defined(__lint)
#include <sys/types.h>
#include <sys/archsystm.h>
#else
#include "assym.h"
#endif

/*
 * Do block operations using Streaming SIMD extensions
 */

#if defined(DEBUG)
#if defined(__amd64)
#define	ASSERT_KPREEMPT_DISABLED(t, r32, msg)	\
	movq	%gs:CPU_THREAD, t;		\
	movsbl	T_PREEMPT(t), r32;		\
	testl	r32, r32;			\
	jne	5f;				\
	pushq	%rbp;				\
	movq	%rsp, %rbp;			\
	leaq	msg(%rip), %rdi;		\
	xorl	%eax, %eax;			\
	call	panic;				\
5:
#elif defined(__i386)
#define	ASSERT_KPREEMPT_DISABLED(t, r32, msg)	\
	movl	%gs:CPU_THREAD, t;		\
	movsbl	T_PREEMPT(t), r32;		\
	testl	r32, r32;			\
	jne	5f;				\
	pushl	%ebp;				\
	movl	%esp, %ebp;			\
	pushl	$msg;				\
	call	panic;				\
5:
#endif	/* __i386 */
#else	/* DEBUG */
#define	ASSERT_KPREEMPT_DISABLED(t, r32, msg)
#endif	/* DEBUG */

#define	BLOCKSHIFT	6
#define	BLOCKSIZE	64	/* (1 << BLOCKSHIFT) */
#define	BLOCKMASK	63	/* (BLOCKSIZE - 1) */

#if (1 << BLOCKSHIFT) != BLOCKSIZE || BLOCKMASK != (BLOCKSIZE - 1)
#error	"mucked up constants"
#endif

#if defined(__lint)

/*ARGSUSED*/
void
hwblkclr(void *addr, size_t size)
{}

#else	/* __lint */

#if defined(__amd64)
#define	ADD	addq
#define	SUB	subq
#else
#define	ADD	addl
#define	SUB	subl
#endif

#define	SAVE_XMM0(r)				\
	SAVE_XMM_PROLOG(r, 1);			\
	movdqa	%xmm0, (r)

#define	ZERO_LOOP_INIT_XMM(dst)			\
	pxor	%xmm0, %xmm0

#define	ZERO_LOOP_BODY_XMM(dst, cnt)		\
	movntdq	%xmm0, (dst);			\
	movntdq	%xmm0, 0x10(dst);		\
	movntdq	%xmm0, 0x20(dst);		\
	movntdq	%xmm0, 0x30(dst);		\
	ADD	$BLOCKSIZE, dst;		\
	SUB	$1, cnt

#define	ZERO_LOOP_FINI_XMM(dst)			\
	mfence

#define	RSTOR_XMM0(r)				\
	movdqa	0x0(r), %xmm0;			\
	RSTOR_XMM_EPILOG(r, 1)
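
/*
 * hwblkclr() zeroes a BLOCKSIZE-aligned region whose length is at least
 * BLOCKSIZE and a multiple of BLOCKSIZE, using non-temporal movntdq
 * stores of a zeroed %xmm0; anything that fails those checks is simply
 * handed to bzero().  The caller must have preemption disabled.  %cr0
 * is saved and TS cleared around the SSE use, and %xmm0 is saved and
 * restored only when TS was not already set (i.e. when live FPU state
 * is present).  Roughly, as an illustrative pseudo-C sketch only (not
 * part of the build):
 *
 *	void
 *	hwblkclr(void *addr, size_t size)
 *	{
 *		if (((uintptr_t)addr & BLOCKMASK) != 0 ||
 *		    size < BLOCKSIZE || (size & BLOCKMASK) != 0) {
 *			bzero(addr, size);
 *			return;
 *		}
 *		for (char *p = addr; size != 0; size -= BLOCKSIZE,
 *		    p += BLOCKSIZE)
 *			<store 64 zero bytes at p via four movntdq>;
 *		<mfence>;
 *	}
 */
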
#if defined(__amd64)

	/*
	 * %rdi		dst
	 * %rsi		size
	 * %rax		saved %cr0 (#if DEBUG then %eax is t->t_preempt)
	 * %r8		pointer to %xmm register save area
	 */
	ENTRY(hwblkclr)
	pushq	%rbp
	movq	%rsp, %rbp
	testl	$BLOCKMASK, %edi	/* address must be BLOCKSIZE aligned */
	jne	.dobzero
	cmpq	$BLOCKSIZE, %rsi	/* size must be at least BLOCKSIZE */
	jl	.dobzero
	testq	$BLOCKMASK, %rsi	/* .. and be a multiple of BLOCKSIZE */
	jne	.dobzero
	shrq	$BLOCKSHIFT, %rsi
	ASSERT_KPREEMPT_DISABLED(%r11, %eax, .not_disabled)
	movq	%cr0, %rax
	clts
	testl	$CR0_TS, %eax
	jnz	1f
	SAVE_XMM0(%r8)
1:	ZERO_LOOP_INIT_XMM(%rdi)
9:	ZERO_LOOP_BODY_XMM(%rdi, %rsi)
	jnz	9b
	ZERO_LOOP_FINI_XMM(%rdi)
	testl	$CR0_TS, %eax
	jnz	2f
	RSTOR_XMM0(%r8)
2:	movq	%rax, %cr0
	leave
	ret
.dobzero:
	leave
	jmp	bzero
	SET_SIZE(hwblkclr)

#elif defined(__i386)

	/*
	 * %eax		dst
	 * %ecx		size in bytes, loop count
	 * %ebx		saved %cr0 (#if DEBUG then t->t_preempt)
	 * %edi		pointer to %xmm register save area
	 */
	ENTRY(hwblkclr)
	movl	4(%esp), %eax
	movl	8(%esp), %ecx
	testl	$BLOCKMASK, %eax	/* address must be BLOCKSIZE aligned */
	jne	.dobzero
	cmpl	$BLOCKSIZE, %ecx	/* size must be at least BLOCKSIZE */
	jl	.dobzero
	testl	$BLOCKMASK, %ecx	/* .. and be a multiple of BLOCKSIZE */
	jne	.dobzero
	shrl	$BLOCKSHIFT, %ecx
	movl	0xc(%esp), %edx
	pushl	%ebx
	pushl	%esi
	ASSERT_KPREEMPT_DISABLED(%esi, %ebx, .not_disabled)
	popl	%esi
	movl	%cr0, %ebx
	clts
	testl	$CR0_TS, %ebx
	jnz	1f
	pushl	%edi
	SAVE_XMM0(%edi)
1:	ZERO_LOOP_INIT_XMM(%eax)
9:	ZERO_LOOP_BODY_XMM(%eax, %ecx)
	jnz	9b
	ZERO_LOOP_FINI_XMM(%eax)
	testl	$CR0_TS, %ebx
	jnz	2f
	RSTOR_XMM0(%edi)
	popl	%edi
2:	movl	%ebx, %cr0
	popl	%ebx
	ret
.dobzero:
	jmp	bzero
	SET_SIZE(hwblkclr)

#endif	/* __i386 */
#endif	/* __lint */

#if defined(__lint)

/*ARGSUSED*/
void
hwblkpagecopy(const void *src, void *dst)
{}

#else	/* __lint */

#define	PREFETCH_START(src)			\
	prefetchnta	0x0(src);		\
	prefetchnta	0x40(src)

#define	SAVE_XMMS(r)				\
	SAVE_XMM_PROLOG(r, 8);			\
	movdqa	%xmm0, (r);			\
	movdqa	%xmm1, 0x10(r);			\
	movdqa	%xmm2, 0x20(r);			\
	movdqa	%xmm3, 0x30(r);			\
	movdqa	%xmm4, 0x40(r);			\
	movdqa	%xmm5, 0x50(r);			\
	movdqa	%xmm6, 0x60(r);			\
	movdqa	%xmm7, 0x70(r)

#define	COPY_LOOP_INIT_XMM(src)			\
	prefetchnta	0x80(src);		\
	prefetchnta	0xc0(src);		\
	movdqa	0x0(src), %xmm0;		\
	movdqa	0x10(src), %xmm1;		\
	movdqa	0x20(src), %xmm2;		\
	movdqa	0x30(src), %xmm3;		\
	movdqa	0x40(src), %xmm4;		\
	movdqa	0x50(src), %xmm5;		\
	movdqa	0x60(src), %xmm6;		\
	movdqa	0x70(src), %xmm7;		\
	ADD	$0x80, src

#define	COPY_LOOP_BODY_XMM(src, dst, cnt)	\
	prefetchnta	0x80(src);		\
	prefetchnta	0xc0(src);		\
	prefetchnta	0x100(src);		\
	prefetchnta	0x140(src);		\
	movntdq	%xmm0, (dst);			\
	movntdq	%xmm1, 0x10(dst);		\
	movntdq	%xmm2, 0x20(dst);		\
	movntdq	%xmm3, 0x30(dst);		\
	movdqa	0x0(src), %xmm0;		\
	movdqa	0x10(src), %xmm1;		\
	movntdq	%xmm4, 0x40(dst);		\
	movntdq	%xmm5, 0x50(dst);		\
	movdqa	0x20(src), %xmm2;		\
	movdqa	0x30(src), %xmm3;		\
	movntdq	%xmm6, 0x60(dst);		\
	movntdq	%xmm7, 0x70(dst);		\
	movdqa	0x40(src), %xmm4;		\
	movdqa	0x50(src), %xmm5;		\
	ADD	$0x80, dst;			\
	movdqa	0x60(src), %xmm6;		\
	movdqa	0x70(src), %xmm7;		\
	ADD	$0x80, src;			\
	subl	$1, cnt

#define	COPY_LOOP_FINI_XMM(dst)			\
	movntdq	%xmm0, 0x0(dst);		\
	movntdq	%xmm1, 0x10(dst);		\
	movntdq	%xmm2, 0x20(dst);		\
	movntdq	%xmm3, 0x30(dst);		\
	movntdq	%xmm4, 0x40(dst);		\
	movntdq	%xmm5, 0x50(dst);		\
	movntdq	%xmm6, 0x60(dst);		\
	movntdq	%xmm7, 0x70(dst)

#define	RSTOR_XMMS(r)				\
	movdqa	0x0(r), %xmm0;			\
	movdqa	0x10(r), %xmm1;			\
	movdqa	0x20(r), %xmm2;			\
	movdqa	0x30(r), %xmm3;			\
	movdqa	0x40(r), %xmm4;			\
	movdqa	0x50(r), %xmm5;			\
	movdqa	0x60(r), %xmm6;			\
	movdqa	0x70(r), %xmm7;			\
	RSTOR_XMM_EPILOG(r, 8)
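
/*
 * hwblkpagecopy() copies one 4096-byte page from src to dst using
 * %xmm0-%xmm7, 128 bytes per iteration.  The loop is software-pipelined:
 * the movntdq stores of one 128-byte chunk are interleaved with the
 * movdqa loads of the next, which is why the loop count is 32 - 1 with a
 * separate initial load (COPY_LOOP_INIT_XMM) and final store
 * (COPY_LOOP_FINI_XMM).  prefetchnta runs ahead of the reads, and the
 * %cr0.TS and %xmm save/restore handling matches hwblkclr(), but covers
 * all eight registers.  Roughly, as an illustrative pseudo-C sketch only
 * (not part of the build):
 *
 *	void
 *	hwblkpagecopy(const void *src, void *dst)
 *	{
 *		<load first 128 bytes of src into %xmm0-%xmm7>;
 *		for (int i = 0; i < 32 - 1; i++) {
 *			<movntdq %xmm0-%xmm7 out to dst, reloading them
 *			    from the next 128 bytes of src as we go>;
 *			src += 128; dst += 128;
 *		}
 *		<store the last 128 bytes held in %xmm0-%xmm7 to dst>;
 *		<mfence>;
 *	}
 */
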
#if defined(__amd64)

	/*
	 * %rdi		src
	 * %rsi		dst
	 * %rdx		#if DEBUG then curthread
	 * %ecx		loop count
	 * %rax		saved %cr0 (#if DEBUG then %eax is t->t_preempt)
	 * %r8		pointer to %xmm register save area
	 */
	ENTRY(hwblkpagecopy)
	pushq	%rbp
	movq	%rsp, %rbp
	PREFETCH_START(%rdi)
	/*
	 * PAGESIZE is 4096, each loop moves 128 bytes, but the initial
	 * load and final store save us one loop count
	 */
	movl	$_CONST(32 - 1), %ecx
	ASSERT_KPREEMPT_DISABLED(%rdx, %eax, .not_disabled)
	movq	%cr0, %rax
	clts
	testl	$CR0_TS, %eax
	jnz	3f
	SAVE_XMMS(%r8)
3:	COPY_LOOP_INIT_XMM(%rdi)
4:	COPY_LOOP_BODY_XMM(%rdi, %rsi, %ecx)
	jnz	4b
	COPY_LOOP_FINI_XMM(%rsi)
	testl	$CR0_TS, %eax
	jnz	5f
	RSTOR_XMMS(%r8)
5:	movq	%rax, %cr0
	mfence
	leave
	ret
	SET_SIZE(hwblkpagecopy)

#elif defined(__i386)

	/*
	 * %eax		src
	 * %edx		dst
	 * %ecx		loop count
	 * %ebx		saved %cr0 (#if DEBUG then t->t_preempt)
	 * %edi		pointer to %xmm register save area
	 * %esi		#if DEBUG temporary thread pointer
	 */
	ENTRY(hwblkpagecopy)
	movl	4(%esp), %eax
	movl	8(%esp), %edx
	PREFETCH_START(%eax)
	pushl	%ebx
	/*
	 * PAGESIZE is 4096, each loop moves 128 bytes, but the initial
	 * load and final store save us one loop count
	 */
	movl	$_CONST(32 - 1), %ecx
	pushl	%esi
	ASSERT_KPREEMPT_DISABLED(%esi, %ebx, .not_disabled)
	popl	%esi
	movl	%cr0, %ebx
	clts
	testl	$CR0_TS, %ebx
	jnz	3f
	pushl	%edi
	SAVE_XMMS(%edi)
3:	COPY_LOOP_INIT_XMM(%eax)
4:	COPY_LOOP_BODY_XMM(%eax, %edx, %ecx)
	jnz	4b
	COPY_LOOP_FINI_XMM(%edx)
	testl	$CR0_TS, %ebx
	jnz	5f
	RSTOR_XMMS(%edi)
	popl	%edi
5:	movl	%ebx, %cr0
	popl	%ebx
	mfence
	ret
	SET_SIZE(hwblkpagecopy)

#endif	/* __i386 */
#endif	/* __lint */

#if defined(__lint)

/*
 * Version of hwblkclr which doesn't use XMM registers.
 * Note that it requires aligned dst and len.
 * A rough pseudo-C sketch of the no-XMM routines appears at the end
 * of this file.
 *
 * XXPV This needs to be performance tuned at some point.
 *	Is 4 the best number of iterations to unroll?
 */
/*ARGSUSED*/
void
block_zero_no_xmm(void *dst, int len)
{}

#else	/* __lint */

#if defined(__amd64)
	ENTRY(block_zero_no_xmm)
	pushq	%rbp
	movq	%rsp, %rbp
	xorl	%eax, %eax
	addq	%rsi, %rdi
	negq	%rsi
1:
	movnti	%rax, (%rdi, %rsi)
	movnti	%rax, 8(%rdi, %rsi)
	movnti	%rax, 16(%rdi, %rsi)
	movnti	%rax, 24(%rdi, %rsi)
	addq	$32, %rsi
	jnz	1b
	mfence
	leave
	ret
	SET_SIZE(block_zero_no_xmm)

#elif defined(__i386)
	ENTRY(block_zero_no_xmm)
	pushl	%ebp
	movl	%esp, %ebp
	xorl	%eax, %eax
	movl	8(%ebp), %edx
	movl	12(%ebp), %ecx
	addl	%ecx, %edx
	negl	%ecx
1:
	movnti	%eax, (%edx, %ecx)
	movnti	%eax, 4(%edx, %ecx)
	movnti	%eax, 8(%edx, %ecx)
	movnti	%eax, 12(%edx, %ecx)
	addl	$16, %ecx
	jnz	1b
	mfence
	leave
	ret
	SET_SIZE(block_zero_no_xmm)

#endif	/* __i386 */
#endif	/* __lint */

#if defined(__lint)

/*
 * Version of page copy which doesn't use XMM registers.
 *
 * XXPV This needs to be performance tuned at some point.
 *	Is 4 the right number of iterations to unroll?
 *	Is the load/store order optimal? Should it use prefetch?
 */
/*ARGSUSED*/
void
page_copy_no_xmm(void *dst, void *src)
{}

#else	/* __lint */

#if defined(__amd64)
	ENTRY(page_copy_no_xmm)
	movq	$MMU_STD_PAGESIZE, %rcx
	addq	%rcx, %rdi
	addq	%rcx, %rsi
	negq	%rcx
1:
	movq	(%rsi, %rcx), %rax
	movnti	%rax, (%rdi, %rcx)
	movq	8(%rsi, %rcx), %rax
	movnti	%rax, 8(%rdi, %rcx)
	movq	16(%rsi, %rcx), %rax
	movnti	%rax, 16(%rdi, %rcx)
	movq	24(%rsi, %rcx), %rax
	movnti	%rax, 24(%rdi, %rcx)
	addq	$32, %rcx
	jnz	1b
	mfence
	ret
	SET_SIZE(page_copy_no_xmm)

#elif defined(__i386)
	ENTRY(page_copy_no_xmm)
	pushl	%esi
	movl	$MMU_STD_PAGESIZE, %ecx
	movl	8(%esp), %edx
	movl	12(%esp), %esi
	addl	%ecx, %edx
	addl	%ecx, %esi
	negl	%ecx
1:
	movl	(%esi, %ecx), %eax
	movnti	%eax, (%edx, %ecx)
	movl	4(%esi, %ecx), %eax
	movnti	%eax, 4(%edx, %ecx)
	movl	8(%esi, %ecx), %eax
	movnti	%eax, 8(%edx, %ecx)
	movl	12(%esi, %ecx), %eax
	movnti	%eax, 12(%edx, %ecx)
	addl	$16, %ecx
	jnz	1b
	mfence
	popl	%esi
	ret
	SET_SIZE(page_copy_no_xmm)

#endif	/* __i386 */
#endif	/* __lint */

#if defined(DEBUG) && !defined(__lint)
	.text
.not_disabled:
	.string	"sseblk: preemption not disabled!"
#endif
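
/*
 * The no-XMM variants above (block_zero_no_xmm() and page_copy_no_xmm())
 * avoid the %xmm registers entirely: they rely on the movnti non-temporal
 * integer store (64-bit on amd64, 32-bit on i386), so no %cr0.TS handling
 * or FPU state save/restore is required, and each ends with an mfence.
 * Roughly, for the amd64 flavor, as an illustrative pseudo-C sketch only
 * (not part of the build):
 *
 *	void
 *	page_copy_no_xmm(void *dst, void *src)
 *	{
 *		for (size_t off = 0; off < MMU_STD_PAGESIZE; off += 8)
 *			<movnti the 8 bytes at src + off to dst + off>;
 *		<mfence>;
 *	}
 *
 * block_zero_no_xmm() follows the same pattern with a zero source, and
 * requires aligned dst and len as noted at its lint prototype.
 */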