/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

	.file	"%M%"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memset,function)

/*
 * void *memset(void *s, int c, size_t n)
 *
 * Stores the low byte of c into n consecutive bytes starting at s and
 * returns s.
 *
 * Arguments are read from the stack (after the %edi push below):
 *	 8(%esp) = s, 12(%esp) = c, 16(%esp) = n
 *
 * Strategy, chosen by n:
 *	n <= 20		byte stores only
 *	21 .. 256	4-byte stores plus a 0-3 byte tail
 *	257 .. 511	byte stores up to an 8-byte boundary, then as above
 *	n >= 512	stores up to a 64-byte boundary, then 64-byte SSE
 *			blocks; the remaining (< 64) bytes are finished
 *			by the 4-byte/byte paths.  When more than 262143
 *			bytes remain after alignment the blocks use
 *			non-temporal stores followed by a fence.
 *
 * Register roles:
 *	%edi	current store address (saved on entry, restored on exit)
 *	%ecx	count consumed by the rep string stores / block counter
 *	%eax	fill value; low byte replicated to all four bytes on
 *		every path except the direct n <= 20 byte path
 *	%ebx	bytes remaining (saved/restored where used)
 *	%esi	number of alignment bytes (saved/restored where used)
 *	%edx	scratch copy of the count in the 4-byte store path
 *	%xmm0	16 copies of the fill byte (SSE path only)
 */
	ENTRY(memset)
	pushl	%edi			/ preserve %edi for the caller
	movl	8(%esp), %edi		/ %edi = s
	movl	12(%esp), %eax		/ %al  = fill byte (low byte of c)
	movl	16(%esp), %ecx		/ %ecx = n

	cld				/ string stores must ascend

	cmpl	$20, %ecx		/ tiny request: plain byte stores
	jbe	.set_bytes		/ (only %al is needed there)

	andl	$0xff, %eax		/ keep the low byte only ...
	imul	$0x01010101, %eax	/ ... and copy it into all four bytes

	cmpl	$256, %ecx		/ small: skip the alignment work
	jbe	.set_words

	cmpl	$511, %ecx		/ medium: 8-byte align, then words
	jbe	.mid_check_align

	/
	/ Large request (n >= 512): SSE block stores.
	/
	pushl	%ebx			/ %ebx and %esi belong to the caller
	pushl	%esi

	/
	/ Advance %edi to a 64-byte boundary.
	/ (-s) mod 64 is the distance to the next boundary, 0 if on one.
	/
	movl	%ecx, %ebx		/ %ebx = bytes still to set
	movl	%edi, %esi
	negl	%esi
	andl	$63, %esi		/ %esi = alignment bytes (0 .. 63)
	jz	.sse_setup		/ already on a boundary
	subl	%esi, %ebx		/ account for the alignment bytes
	movl	%esi, %ecx
	shrl	$2, %ecx		/ whole 4-byte words of alignment
	rep; sstol
	movl	%esi, %ecx
	andl	$3, %ecx		/ leftover alignment bytes
	rep; sstob
	movl	%ebx, %ecx		/ %ecx = bytes left after alignment

.sse_setup:
	/
	/ Build 16 copies of the fill byte in %xmm0 through a stack temporary.
	/
	subl	$16, %esp		/ scratch area on the stack
	movl	%eax, (%esp)		/ four copies of the replicated word
	movl	%eax, 4(%esp)
	movl	%eax, 8(%esp)
	movl	%eax, 12(%esp)
	movups	(%esp), %xmm0		/ unaligned load: %esp alignment unknown
	addl	$16, %esp		/ release the scratch area

	shrl	$6, %ecx		/ %ecx = number of 64-byte blocks

	cmpl	$262143, %ebx		/ at or below this, use cached stores
	jbe	.sse_cached_loop
	jmp	.sse_stream_loop	/ hop over the alignment padding

	.align	16

.sse_stream_loop:
	movntps	%xmm0, (%edi)		/ non-temporal 64-byte block store
	movntps	%xmm0, 16(%edi)		/ (SSE form, not SSE2, so that it
	movntps	%xmm0, 32(%edi)		/ runs on more processors)
	movntps	%xmm0, 48(%edi)

	addl	$64, %edi		/ next block
	decl	%ecx
	jnz	.sse_stream_loop

	movl	%ebx, %ecx
	andl	$63, %ecx		/ %ecx = bytes left over (0 .. 63)
	popl	%esi			/ give back the caller registers
	popl	%ebx
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	cmpl	$20, %ecx		/ finish the tail with words or bytes
	ja	.set_words
	jmp	.set_bytes

	.align	16
.sse_cached_loop:
	movaps	%xmm0, (%edi)		/ aligned 64-byte block store
	movaps	%xmm0, 16(%edi)
	movaps	%xmm0, 32(%edi)
	movaps	%xmm0, 48(%edi)

	addl	$64, %edi		/ next block
	decl	%ecx
	jnz	.sse_cached_loop

	movl	%ebx, %ecx
	andl	$63, %ecx		/ %ecx = bytes left over (0 .. 63)
	popl	%esi			/ give back the caller registers
	popl	%ebx
	cmpl	$20, %ecx		/ finish the tail with words or bytes
	ja	.set_words
	jmp	.set_bytes

	/
	/ Medium request (257 .. 511 bytes).
	/
.mid_check_align:
	testl	$7, %edi		/ already 8-byte aligned?
	jz	.set_words		/ yes: straight to the word stores

.mid_align8:
	pushl	%ebx			/ %ebx and %esi belong to the caller
	pushl	%esi

	movl	%ecx, %ebx		/ %ebx = bytes still to set
	movl	%edi, %esi
	negl	%esi
	andl	$7, %esi		/ %esi = (-s) mod 8 = alignment bytes
	subl	%esi, %ebx		/ account for the alignment bytes
	movl	%esi, %ecx
	rep; sstob			/ byte stores up to the boundary
	movl	%ebx, %ecx		/ %ecx = bytes left after alignment
	popl	%esi			/ give back the caller registers
	popl	%ebx

	/
	/ %ecx bytes at %edi, %eax holds the replicated fill word.
	/
.set_words:
	movl	%ecx, %edx		/ remember the full count
	shrl	$2, %ecx		/ number of 4-byte words
	rep; sstol
	movl	%edx, %ecx
	andl	$3, %ecx		/ 0 .. 3 trailing bytes

	/
	/ %ecx bytes at %edi, %al holds the fill byte.
	/
.set_bytes:
	rep; sstob
	popl	%edi			/ restore the caller %edi
	movl	4(%esp), %eax		/ return value = original s
	ret
	SET_SIZE(memset)