/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

	.ident	"%Z%%M%	%I%	%E% SMI"

	.file	"%M%"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memset,function)

#include "SYS.h"

	ANSI_PRAGMA_WEAK2(_private_memset,memset,function)

/*
 * void *memset(void *s, int c, size_t n)
 *
 * Syntax:	AT&T operand order, Solaris as mnemonics (sstob/sstol are
 *		the byte/long store-string instructions).
 * ABI:		32-bit x86, arguments on the stack:
 *		    4(%esp) = s, 8(%esp) = c, 12(%esp) = n  (on entry)
 * Returns:	%eax = s
 * Clobbers:	%eax, %ecx, %edx, flags; %xmm0 on the SSE paths.
 *		%edi, %ebx and %esi are saved and restored.
 *		The direction flag is cleared.
 *
 * Strategy, selected by the byte count n:
 *	n <= 20		plain byte fill
 *	21 .. 256	long (4 byte) fill plus byte tail, no alignment
 *	257 .. 511	byte fill up to an 8 byte boundary, then as above
 *	n >= 512	fill up to a 64 byte boundary, then 64 byte blocks
 *			via SSE; when more than 262143 bytes remain after
 *			alignment the stores are non-temporal (movntps),
 *			otherwise ordinary aligned stores (movaps).  The
 *			final 0..63 bytes use the long/byte fill.
 */
	ENTRY(memset)
	pushl	%edi			/ save register variable
	movl	8(%esp),%edi		/ %edi = string address
	movl	12(%esp),%eax		/ %al = byte to duplicate
	movl	16(%esp),%ecx		/ %ecx = number of copies

	/ For all basic blocks in this routine, maintain the following
	/ entry conditions:	%eax has every byte set to the desired byte.
	/			NOTE: .byteset does not require this
	/			%ecx contains # bytes to set
	/			%edi contains address to set

	cld				/ make sure we go the right way...
	cmpl	$20,%ecx		/ strings of at most 20 chars are byte set
	jbe	.byteset

	andl	$0xff, %eax		/ trim anything above low byte
	imul	$0x01010101, %eax	/ extend low byte to each byte

	cmpl	$256, %ecx		/ smaller areas do not benefit from alignment
	jbe	.wordset

	cmpl	$511, %ecx		/ areas smaller than this should be wordset
	jbe	.check_wordset

	/
	/ prep work for sse temporal and non-temporal
	/

	pushl	%ebx			/ more registers are needed
	pushl	%esi			/ for alignment work

	/
	/ align address to 64 byte boundaries.
	/

	movl	%ecx, %ebx		/ save byte count
	movl	%edi, %esi		/ esi is scratch register
	andl	$63, %esi		/ offset of address within its 64 byte block
	neg	%esi			/ compute count of bytes
	addl	$64, %esi		/ needed to align
	andl	$63, %esi		/ to 64 byte align addr
	jz	.sse_aligned		/ skip alignment if not needed
	subl	%esi, %ebx		/ ebx contains remainder of bytes to set
	movl	%esi, %ecx		/ alignment bytes
	shrl	$2,%ecx			/ %ecx = number of words to set
	rep; sstol
	movl	%esi,%ecx
	andl	$3,%ecx			/ %ecx = number of bytes left
	rep; sstob
	movl	%ebx, %ecx		/ remainder to be set

.sse_aligned:

	shr	$6, %ecx		/ number of 64 byte blocks to set

	/
	/ load xmm0 with bytes to be set
	/
	subl	$16,%esp		/ give ourselves some working room on the stack
	movl	%eax,(%esp)		/ replicate eax into each of 4 dwords
	movl	%eax,4(%esp)		/ avoid pushl since it causes more interlocking
	movl	%eax,8(%esp)		/
	movl	%eax,12(%esp)		/
	movups	(%esp), %xmm0		/ unaligned load from stack into xmm0
	addl	$16,%esp		/ restore stack position

	cmpl	$262143, %ebx		/ blocks smaller than this allocate in the cache
	jbe	.sse_loop
	jmp	.sse_nt_loop		/ branch across alignment nops

	.align 16

.sse_nt_loop:
	movntps %xmm0, (%edi)		/ block non-temporal store
	movntps %xmm0, 16(%edi)		/ use sse rather than sse2
	movntps %xmm0, 32(%edi)		/ so we work more places
	movntps %xmm0, 48(%edi)		/

	addl	$64, %edi		/ increment dest address
	dec	%ecx			/ dec count of blocks
	jnz	.sse_nt_loop		/ jump if not done

	andl	$63, %ebx		/ remainder of bytes to set
	movl	%ebx, %ecx		/ ecx contains remainder of bytes to set
	popl	%esi			/ restore stack config
	popl	%ebx			/
	/ fence after the non-temporal stores, before the tail is filled
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	cmpl	$20, %ecx		/ compare and jump accordingly
	jbe	.byteset
	jmp	.wordset

	.align 16
.sse_loop:
	movaps %xmm0, (%edi)		/ block set with aligned SSE stores
	movaps %xmm0, 16(%edi)
	movaps %xmm0, 32(%edi)
	movaps %xmm0, 48(%edi)

	addl	$64, %edi		/ increment addr
	dec	%ecx			/ dec count of blocks
	jnz	.sse_loop		/ jump if not done

	andl	$63, %ebx		/ remainder of bytes to set
	movl	%ebx, %ecx		/ in %ecx as normal
	popl	%esi			/ restore stack config
	popl	%ebx			/
	cmpl	$20, %ecx
	jbe	.byteset
	jmp	.wordset

.check_wordset:
	testl	$7, %edi		/ is the address already 8 byte aligned?
	jz	.wordset		/ all ok

.align_wordset:
	/ Only %edx is needed as scratch here: it is caller-saved and
	/ .wordset overwrites it before reading it.  %ecx is 257..511 on
	/ entry, so removing at most 7 alignment bytes cannot underflow.
	movl	%ecx, %edx		/ %edx = total byte count
	movl	%edi, %ecx
	neg	%ecx
	andl	$7, %ecx		/ %ecx = (-address) & 7 = bytes to the boundary
	subl	%ecx, %edx		/ %edx = bytes left once aligned
	rep; sstob			/ byte fill up to the 8 byte boundary
	movl	%edx, %ecx		/ %ecx = remainder to be set

.wordset:
	movl	%ecx, %edx		/ save count
	shrl	$2,%ecx			/ %ecx = number of words to set
	rep; sstol
	movl	%edx,%ecx
	andl	$3,%ecx			/ %ecx = number of bytes left

.byteset:
	rep; sstob
	movl	8(%esp),%eax		/ return string address
	popl	%edi			/ restore register variable
	ret
	SET_SIZE(memset)