/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

	.ident	"%Z%%M%	%I%	%E% SMI"

	.file	"%M%"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

#include "SYS.h"

	ANSI_PRAGMA_WEAK2(_private_memcpy,memcpy,function)

	ENTRY(memmove)
	movl	0+12(%esp),%ecx		/ get number of bytes to move
	pushl	%esi			/ save off %edi, %esi and move destination
	pushl	%edi
	movl	8+ 4(%esp),%edi		/ destination buffer address
	movl	8+ 8(%esp),%esi		/ source buffer address
	movl	%edi, %eax
	testl	%ecx,%ecx
	jz	.Return

	cmpl	%esi,%edi		/ if (source addr > dest addr)
	leal	-1(%esi,%ecx),%edx	/ %edx = src + size - 1
	jle	.memcpy_post		/ jump if dst <= src
	cmpl	%edx,%edi
	jle	.CopyLeft		/ jump if dst <= src + size - 1
	jmp	.memcpy_post

	ENTRY(memcpy)
	pushl	%esi
	pushl	%edi

	movl	8+4(%esp),%edi		/ %edi = dest address
	movl	%edi, %eax		/ save this
	movl	8+8(%esp),%esi		/ %esi = source address
	movl	8+12(%esp),%ecx		/ %ecx = length of string
					/ %edx scratch register
					/ %eax scratch register
.memcpy_post:
	nop				/ this really helps, don't know why
					/ note: cld is perf death on P4
	cmpl	$63,%ecx
	ja	.move_sse		/ not worth doing sse for less

.movew:
	movl	%ecx,%edx		/ save byte cnt
	shrl	$2,%ecx			/ %ecx = number of words to move
	rep ; smovl			/ move the words


	andl	$0x3,%edx		/ %edx = number of bytes left to move
	jz	.Return			/ %edx <= 3, so just unroll the loop

	movb	(%esi), %cl
	movb	%cl, (%edi)
	decl	%edx
	jz	.Return
	movb	1(%esi), %cl
	movb	%cl, 1(%edi)
	decl	%edx
	jz	.Return
	movb	2(%esi), %cl
	movb	%cl, 2(%edi)

.Return:
	popl	%edi			/ restore register variables
	popl	%esi
	ret

.move_sse:
	/
	/ time to 16 byte align destination
	/
	andl	$15, %eax
	jnz	.sse_unaligned		/ jmp if dest is unaligned
.sse:					/ dest is aligned, check source
	movl	%ecx, %edx		/ get byte count
	shrl	$6, %edx		/ number of 64 byte blocks to move
	testl	$15, %esi
	jnz	.sse_da			/ go to slow loop if source is unaligned
	cmpl	$65535, %ecx
	ja	.sse_sa_nt_loop

	/
	/ use aligned load since we're lucky
	/
.sse_sa_loop:
	prefetcht0 568(%esi)		/ prefetch source & copy 64 byte at a time
	prefetcht0 568(%edi)		/ prefetch source & copy 64 byte at a time
	movaps	0(%esi), %xmm0
	movaps	%xmm0, 0(%edi)
	movaps	16(%esi), %xmm1
	movaps	%xmm1, 16(%edi)
	movaps	32(%esi), %xmm2
	movaps	%xmm2, 32(%edi)
	movaps	48(%esi), %xmm3
	movaps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_sa_loop
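/*
 * For reference, a minimal C sketch of the aligned 64-byte block loop
 * above, assuming the SSE intrinsics from <xmmintrin.h>; the function and
 * parameter names are illustrative only and are not part of this file.
 * The real loop additionally issues prefetcht0 hints 568 bytes ahead of
 * both pointers, which this sketch omits.
 *
 *	#include <xmmintrin.h>
 *
 *	static void
 *	copy64_aligned(char *dst, const char *src, unsigned int nblocks)
 *	{
 *		while (nblocks-- != 0) {
 *			// four 16-byte aligned loads, then four aligned stores
 *			__m128 r0 = _mm_load_ps((const float *)(src + 0));
 *			__m128 r1 = _mm_load_ps((const float *)(src + 16));
 *			__m128 r2 = _mm_load_ps((const float *)(src + 32));
 *			__m128 r3 = _mm_load_ps((const float *)(src + 48));
 *			_mm_store_ps((float *)(dst + 0), r0);
 *			_mm_store_ps((float *)(dst + 16), r1);
 *			_mm_store_ps((float *)(dst + 32), r2);
 *			_mm_store_ps((float *)(dst + 48), r3);
 *			src += 64;
 *			dst += 64;
 *		}
 *	}
 */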
.sse_cleanup:
	andl	$63, %ecx		/ compute remaining bytes
	movl	8+4(%esp), %eax		/ setup return value
	jz	.Return
	jmp	.movew

	/
	/ use aligned load since we're lucky
	/
	.align 16
.sse_sa_nt_loop:
	prefetchnta 16384(%esi)		/ prefetch source & copy 64 byte at a time
	movaps	(%esi), %xmm0
	movntps	%xmm0, 0(%edi)
	movaps	16(%esi), %xmm1
	movntps	%xmm1, 16(%edi)
	movaps	32(%esi), %xmm2
	movntps	%xmm2, 32(%edi)
	movaps	48(%esi), %xmm3
	movntps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_sa_nt_loop
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	jmp	.sse_cleanup

	/
	/ Make certain that destination buffer becomes aligned
	/
.sse_unaligned:
	neg	%eax			/ subtract from 16 and get destination
	andl	$15, %eax		/ aligned on a 16 byte boundary
	movl	%ecx, %edx		/ saved count
	subl	%eax, %ecx		/ subtract from byte count
	cmpl	$64, %ecx		/ after aligning, will we still have 64 bytes?
	cmovb	%edx, %ecx		/ if not, restore original byte count,
	cmovb	8+4(%esp), %eax		/ and restore return value,
	jb	.movew			/ and do a non-SSE move.
	xchg	%ecx, %eax		/ flip for copy
	rep ; smovb			/ move the bytes
	xchg	%ecx, %eax		/ flip back
	jmp	.sse

	.align 16
.sse_da:
	cmpl	$65535, %ecx
	jbe	.sse_da_loop

	/
	/ use unaligned load since source doesn't line up
	/
.sse_da_nt_loop:
	prefetchnta 16384(%esi)		/ prefetch source & copy 64 byte at a time
	movups	0(%esi), %xmm0
	movntps	%xmm0, 0(%edi)
	movups	16(%esi), %xmm1
	movntps	%xmm1, 16(%edi)
	movups	32(%esi), %xmm2
	movntps	%xmm2, 32(%edi)
	movups	48(%esi), %xmm3
	movntps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_da_nt_loop
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	jmp	.sse_cleanup
	/
	/ use unaligned load since source doesn't line up
	/
	.align 16
.sse_da_loop:
	prefetcht0 568(%esi)		/ prefetch source & copy 64 byte at a time
	prefetcht0 568(%edi)
	movups	0(%esi), %xmm0
	movaps	%xmm0, 0(%edi)
	movups	16(%esi), %xmm1
	movaps	%xmm1, 16(%edi)
	movups	32(%esi), %xmm2
	movaps	%xmm2, 32(%edi)
	movups	48(%esi), %xmm3
	movaps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_da_loop
	jmp	.sse_cleanup

	SET_SIZE(memcpy)


/ .CopyLeft handles the memmove case where we must perform the copy backwards,
/ because of overlap between src and dst. This is not particularly optimized.
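/*
 * A minimal C sketch of the backward copy performed below (illustrative
 * only; the names are not part of this file, and size_t needs <stddef.h>).
 * When dst overlaps the tail of src, copying from the last byte down to
 * the first preserves each source byte before it is overwritten:
 *
 *	void *
 *	copy_backward(void *dst, const void *src, size_t n)
 *	{
 *		char *d = (char *)dst + n;
 *		const char *s = (const char *)src + n;
 *
 *		while (n-- != 0)
 *			*--d = *--s;
 *		return (dst);
 *	}
 *
 * The assembly below additionally byte-aligns the source and then moves
 * four bytes at a time with std / rep smovl.
 */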

.CopyLeft:
	movl	$3,%eax			/ heavily used constant
	std				/ reverse direction bit (RtoL)
	cmpl	$12,%ecx		/ if (size <= 12)
	ja	.BigCopyLeft		/ {
	movl	%edx,%esi		/ src = src + size - 1
	leal	-1(%ecx,%edi),%edi	/ dst = dst + size - 1
	rep;	smovb			/ do the byte copy
	cld				/ reset direction flag to LtoR
	popl	%edi			/ }
	popl	%esi			/ restore registers
	movl	4(%esp),%eax		/ set up return value
	ret				/ return(dba);
.BigCopyLeft:				/ } else {
	xchgl	%edx,%ecx
	movl	%ecx,%esi		/ align source w/byte copy
	leal	-1(%edx,%edi),%edi
	andl	%eax,%ecx
	jz	.SkipAlignLeft
	addl	$1, %ecx		/ we need to ensure that future
	subl	%ecx,%edx		/ copy is done on aligned boundary
	rep;	smovb
.SkipAlignLeft:
	movl	%edx,%ecx
	subl	%eax,%esi
	shrl	$2,%ecx			/ do 4 byte copy RtoL
	subl	%eax,%edi
	rep;	smovl
	andl	%eax,%edx		/ do 1 byte copy of what's left
	jz	.CleanupReturnLeft
	movl	%edx,%ecx
	addl	%eax,%esi		/ rep; smovl instruction will decrement
	addl	%eax,%edi		/ %edi, %esi by four after each copy
					/ adding 3 will restore pointers to byte
					/ before last double word copied
					/ which is where they are expected to
					/ be for the single byte copy code
	rep;	smovb
.CleanupReturnLeft:
	cld				/ reset direction flag to LtoR
	popl	%edi
	popl	%esi			/ restore registers
	movl	4(%esp),%eax		/ set up return value
	ret				/ return(dba);
	SET_SIZE(memmove)
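
/*
 * Usage note (illustrative only, not part of the original file): memcpy()
 * assumes the buffers do not overlap, while memmove(), which reaches the
 * .CopyLeft path above when the destination overlaps the end of the
 * source, handles overlap correctly.  For example:
 *
 *	#include <string.h>
 *
 *	char buf[8] = "abcdef";
 *	// shift "abcdef" right by one byte within the same buffer; the
 *	// regions overlap, so memmove() is required and memcpy() would
 *	// be undefined behavior here
 *	memmove(buf + 1, buf, 6);
 */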