/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

	.ident "%Z%%M% %I% %E% SMI"

	.file "%M%"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memset,function)

#include "SYS.h"

/*
 * memset(s, c, n)
 *
 * 32-bit x86, AT&T operand order, arguments taken from the stack.
 * After the initial push of %edi they are found at:
 *	 8(%esp)	s  destination address
 *	12(%esp)	c  fill value (only the low byte is used)
 *	16(%esp)	n  number of bytes to fill
 * Returns s in %eax.
 *
 * Saved and restored:	%edi always; %ebx and %esi on the paths that
 *			need them.
 * Overwritten:		%eax, %ecx, %edx (dword stage), %xmm0 (SSE stage),
 *			flags; the direction flag is cleared.
 *
 * Strategy by size (unsigned compares throughout):
 *	n <= 20		byte stores only
 *	n <= 256	dword stores plus byte tail, no alignment step
 *	n <= 511	byte stores up to an 8-byte boundary, then as above
 *	otherwise	align to 64 bytes, fill whole 64-byte blocks with
 *			SSE stores, then finish the tail with the dword or
 *			byte stage.  Non-temporal stores are chosen when
 *			more than 262143 bytes remain after alignment.
 *
 * Invariants on entry to every stage:
 *	%edi = next address to fill
 *	%ecx = bytes still to fill
 *	%eax = fill byte replicated into all four bytes
 *	       (the byte stage reads %al only, so it may be entered
 *	       before the replication has been done)
 */
	ENTRY(memset)
	pushl	%edi			/ callee-saved; used as the store pointer
	movl	8(%esp), %edi		/ %edi = s
	movl	12(%esp), %eax		/ %al  = c
	movl	16(%esp), %ecx		/ %ecx = n

	cld				/ string stores must move upward
	cmpl	$20, %ecx
	jbe	.do_bytes		/ tiny request: bytes only

	andl	$0xff, %eax		/ discard everything above the fill byte
	imul	$0x01010101, %eax	/ replicate the byte into all four lanes

	cmpl	$256, %ecx
	jbe	.do_words		/ small request: alignment is not worth it

	cmpl	$511, %ecx
	jbe	.mid_range		/ medium request: 8-byte align, then dwords

	/
	/ Large request: SSE block fill.
	/ %ebx tracks the bytes left once the pointer is 64-byte aligned,
	/ %esi holds the number of padding bytes needed to get there.
	/
	pushl	%ebx
	pushl	%esi

	movl	%ecx, %ebx		/ %ebx = n
	movl	%edi, %esi
	negl	%esi
	andl	$63, %esi		/ %esi = (-address) mod 64 = pad bytes
	jz	.blocks_ready		/ already on a 64-byte boundary
	subl	%esi, %ebx		/ bytes left after the padding
	movl	%esi, %ecx
	shrl	$2, %ecx		/ whole dwords of padding
	rep; sstol
	movl	%esi, %ecx
	andl	$3, %ecx		/ leftover padding bytes
	rep; sstob
	movl	%ebx, %ecx		/ %ecx = bytes left

.blocks_ready:
	shrl	$6, %ecx		/ %ecx = count of 64-byte blocks

	/
	/ Build a 16-byte pattern in scratch stack space and load it
	/ into %xmm0.  The scratch area has no known alignment, hence
	/ the unaligned load.
	/
	subl	$16, %esp
	movl	%eax, (%esp)
	movl	%eax, 4(%esp)
	movl	%eax, 8(%esp)
	movl	%eax, 12(%esp)
	movups	(%esp), %xmm0
	addl	$16, %esp		/ release the scratch space

	cmpl	$262143, %ebx		/ above this, bypass the cache
	ja	.stream_loop
	jmp	.cache_loop		/ explicit jump: do not run the padding

	.align	16

.stream_loop:
	movntps	%xmm0, (%edi)		/ non-temporal 64-byte block store
	movntps	%xmm0, 16(%edi)		/ SSE form, so SSE2 is not required
	movntps	%xmm0, 32(%edi)
	movntps	%xmm0, 48(%edi)

	addl	$64, %edi		/ advance to the next block
	dec	%ecx
	jnz	.stream_loop

	movl	%ebx, %ecx
	andl	$63, %ecx		/ %ecx = tail bytes past the last block
	popl	%esi			/ undo the two pushes, reverse order
	popl	%ebx
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	cmpl	$20, %ecx		/ pick the stage for the tail
	ja	.do_words
	jmp	.do_bytes

	.align	16
.cache_loop:
	movaps	%xmm0, (%edi)		/ ordinary aligned 64-byte block store
	movaps	%xmm0, 16(%edi)
	movaps	%xmm0, 32(%edi)
	movaps	%xmm0, 48(%edi)

	addl	$64, %edi		/ advance to the next block
	dec	%ecx
	jnz	.cache_loop

	movl	%ebx, %ecx
	andl	$63, %ecx		/ %ecx = tail bytes past the last block
	popl	%esi			/ undo the two pushes, reverse order
	popl	%ebx
	cmpl	$20, %ecx		/ pick the stage for the tail
	ja	.do_words
	jmp	.do_bytes

.mid_range:
	testl	$7, %edi		/ on an 8-byte boundary already?
	jz	.do_words

	/
	/ Fill single bytes up to the next 8-byte boundary.
	/
	pushl	%ebx
	pushl	%esi

	movl	%ecx, %ebx		/ %ebx = n
	movl	%edi, %esi
	negl	%esi
	andl	$7, %esi		/ %esi = (-address) mod 8 = pad bytes
	subl	%esi, %ebx		/ bytes left after the padding
	movl	%esi, %ecx
	rep; sstob
	movl	%ebx, %ecx		/ %ecx = bytes left
	popl	%esi			/ undo the two pushes, reverse order
	popl	%ebx

.do_words:
	movl	%ecx, %edx		/ keep the byte count for the tail
	shrl	$2, %ecx		/ whole dwords to store
	rep; sstol
	movl	%edx, %ecx
	andl	$3, %ecx		/ 0..3 bytes remain

.do_bytes:
	rep; sstob
	movl	8(%esp), %eax		/ return value: the original s
	popl	%edi
	ret
	SET_SIZE(memset)