/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

	.ident	"%Z%%M% %I% %E% SMI"

	.file	"%M%"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

#include "SYS.h"

	ENTRY(memmove)
	movl	0+12(%esp),%ecx	/ get number of bytes to move
	pushl	%esi		/ save off %edi, %esi and move destination
	pushl	%edi
	movl	8+ 4(%esp),%edi	/ destination buffer address
	movl	8+ 8(%esp),%esi	/ source buffer address
	movl	%edi, %eax
	testl	%ecx,%ecx
	jz	.Return

	cmpl	%esi,%edi	/ if (source addr > dest addr)
	leal	-1(%esi,%ecx),%edx	/ %edx = src + size - 1
	jle	.memcpy_post	/ jump if dst <= src
	cmpl	%edx,%edi
	jle	.CopyLeft	/ jump if dst <= src + size - 1
	jmp	.memcpy_post

	ENTRY(memcpy)
	pushl	%esi
	pushl	%edi

	movl	8+4(%esp),%edi	/ %edi = dest address
	movl	%edi, %eax	/ save this
	movl	8+8(%esp),%esi	/ %esi = source address
	movl	8+12(%esp),%ecx	/ %ecx = length of string
				/ %edx scratch register
				/ %eax scratch register
.memcpy_post:
	nop			/ this really helps, don't know why
				/ note: cld is perf death on P4
	cmpl	$63,%ecx
	ja	.move_sse	/ not worth doing sse for less

.movew:
	movl	%ecx,%edx	/ save byte cnt
	shrl	$2,%ecx		/ %ecx = number of words to move
	rep ; smovl		/ move the words


	andl	$0x3,%edx	/ %edx = number of bytes left to move
	jz	.Return		/ %edx <= 3, so just unroll the loop

	movb	(%esi), %cl
	movb	%cl, (%edi)
	decl	%edx
	jz	.Return
	movb	1(%esi), %cl
	movb	%cl, 1(%edi)
	decl	%edx
	jz	.Return
	movb	2(%esi), %cl
	movb	%cl, 2(%edi)

.Return:
	popl	%edi		/ restore register variables
	popl	%esi
	ret
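
	/
	/ Overview of the SSE path below: the destination is brought to
	/ a 16 byte boundary if needed, then 64 bytes are moved per
	/ iteration.  Aligned (movaps) or unaligned (movups) loads are
	/ chosen from the source alignment, and copies larger than
	/ 65535 bytes use the non-temporal movntps stores, with a fence
	/ before the common cleanup at .sse_cleanup.
	/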

.move_sse:
	/
	/ time to 16 byte align destination
	/
	andl	$15, %eax
	jnz	.sse_unaligned	/ jmp if dest is unaligned
.sse:				/ dest is aligned, check source
	movl	%ecx, %edx	/ get byte count
	shrl	$6, %edx	/ number of 64 byte blocks to move
	testl	$15, %esi
	jnz	.sse_da		/ go to slow loop if source is unaligned
	cmpl	$65535, %ecx
	ja	.sse_sa_nt_loop

	/
	/ use aligned load since we're lucky
	/
.sse_sa_loop:
	prefetcht0 568(%esi)	/ prefetch source & copy 64 byte at a time
	prefetcht0 568(%edi)	/ prefetch source & copy 64 byte at a time
	movaps	0(%esi), %xmm0
	movaps	%xmm0, 0(%edi)
	movaps	16(%esi), %xmm1
	movaps	%xmm1, 16(%edi)
	movaps	32(%esi), %xmm2
	movaps	%xmm2, 32(%edi)
	movaps	48(%esi), %xmm3
	movaps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_sa_loop

.sse_cleanup:
	andl	$63, %ecx	/ compute remaining bytes
	movl	8+4(%esp), %eax	/ setup return value
	jz	.Return
	jmp	.movew

	/
	/ use aligned load since we're lucky
	/
	.align 16
.sse_sa_nt_loop:
	prefetchnta 16384(%esi)	/ prefetch source & copy 64 byte at a time
	movaps	(%esi), %xmm0
	movntps	%xmm0, 0(%edi)
	movaps	16(%esi), %xmm1
	movntps	%xmm1, 16(%edi)
	movaps	32(%esi), %xmm2
	movntps	%xmm2, 32(%edi)
	movaps	48(%esi), %xmm3
	movntps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_sa_nt_loop
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	jmp	.sse_cleanup

	/
	/ Make certain that destination buffer becomes aligned
	/
.sse_unaligned:
	neg	%eax		/ subtract from 16 and get destination
	andl	$15, %eax	/ aligned on a 16 byte boundary
	movl	%ecx, %edx	/ saved count
	subl	%eax, %ecx	/ subtract from byte count
	cmpl	$64, %ecx	/ after aligning, will we still have 64 bytes?
	cmovb	%edx, %ecx	/ if not, restore original byte count,
	cmovb	8+4(%esp), %eax	/ and restore return value,
	jb	.movew		/ and do a non-SSE move.
	xchg	%ecx, %eax	/ flip for copy
	rep ; smovb		/ move the bytes
	xchg	%ecx, %eax	/ flip back
	jmp	.sse

	.align 16
.sse_da:
	cmpl	$65535, %ecx
	jbe	.sse_da_loop

	/
	/ use unaligned load since source doesn't line up
	/
.sse_da_nt_loop:
	prefetchnta 16384(%esi)	/ prefetch source & copy 64 byte at a time
	movups	0(%esi), %xmm0
	movntps	%xmm0, 0(%edi)
	movups	16(%esi), %xmm1
	movntps	%xmm1, 16(%edi)
	movups	32(%esi), %xmm2
	movntps	%xmm2, 32(%edi)
	movups	48(%esi), %xmm3
	movntps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_da_nt_loop
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	jmp	.sse_cleanup
	/
	/ use unaligned load since source doesn't line up
	/
	.align 16
.sse_da_loop:
	prefetcht0 568(%esi)	/ prefetch source & copy 64 byte at a time
	prefetcht0 568(%edi)
	movups	0(%esi), %xmm0
	movaps	%xmm0, 0(%edi)
	movups	16(%esi), %xmm1
	movaps	%xmm1, 16(%edi)
	movups	32(%esi), %xmm2
	movaps	%xmm2, 32(%edi)
	movups	48(%esi), %xmm3
	movaps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_da_loop
	jmp	.sse_cleanup

	SET_SIZE(memcpy)


/ .CopyLeft handles the memmove case where we must perform the copy backwards,
/ because of overlap between src and dst. This is not particularly optimized.
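/ The memmove entry point branches here only when dst > src and the two
/ regions overlap, i.e. when src < dst <= src + size - 1; all other cases
/ go through the common forward path at .memcpy_post.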

.CopyLeft:
	movl	$3,%eax		/ heavily used constant
	std			/ reverse direction bit (RtoL)
	cmpl	$12,%ecx	/ if (size <= 12)
	ja	.BigCopyLeft	/ {
	movl	%edx,%esi	/ src = src + size - 1
	leal	-1(%ecx,%edi),%edi	/ dst = dst + size - 1
	rep;	smovb		/ do the byte copy
	cld			/ reset direction flag to LtoR
	popl	%edi		/ }
	popl	%esi		/ restore registers
	movl	4(%esp),%eax	/ set up return value
	ret			/ return(dba);
.BigCopyLeft:			/ } else {
	xchgl	%edx,%ecx
	movl	%ecx,%esi	/ align source w/byte copy
	leal	-1(%edx,%edi),%edi
	andl	%eax,%ecx
	jz	.SkipAlignLeft
	addl	$1, %ecx	/ we need to ensure that future
	subl	%ecx,%edx	/ copy is done on aligned boundary
	rep;	smovb
.SkipAlignLeft:
	movl	%edx,%ecx
	subl	%eax,%esi
	shrl	$2,%ecx		/ do 4 byte copy RtoL
	subl	%eax,%edi
	rep;	smovl
	andl	%eax,%edx	/ do 1 byte copy what's left
	jz	.CleanupReturnLeft
	movl	%edx,%ecx
	addl	%eax,%esi	/ rep; smovl instruction will decrement
	addl	%eax,%edi	/ %edi, %esi by four after each copy
				/ adding 3 will restore pointers to byte
				/ before last double word copied
				/ which is where they are expected to
				/ be for the single byte copy code
	rep;	smovb
.CleanupReturnLeft:
	cld			/ reset direction flag to LtoR
	popl	%edi
	popl	%esi		/ restore registers
	movl	4(%esp),%eax	/ set up return value
	ret			/ return(dba);
	SET_SIZE(memmove)
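
/ For sizes larger than 12, .CopyLeft above works in three backward passes:
/ a byte copy down to a 4 byte boundary in the source, a rep; smovl copy
/ for the aligned bulk, and a final byte copy for any 1-3 bytes left over.
/ Sizes of 12 or less are moved with a single backward rep; smovb.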