/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

	.file	"%M%"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

	/
	/ memmove() copies forward through the memcpy() code below unless
	/ dst lies within (src, src + size), in which case it copies
	/ backwards via .CopyLeft (see the C sketch at the end of this file).
	/
	ENTRY(memmove)
	movl	0+12(%esp),%ecx	/ get number of bytes to move
	pushl	%esi		/ save off %edi, %esi and move destination
	pushl	%edi
	movl	8+ 4(%esp),%edi	/ destination buffer address
	movl	8+ 8(%esp),%esi	/ source buffer address
	movl	%edi, %eax
	testl	%ecx,%ecx
	jz	.Return

	cmpl	%esi,%edi	/ if (source addr > dest addr)
	leal	-1(%esi,%ecx),%edx	/ %edx = src + size - 1
	jle	.memcpy_post	/ jump if dst <= src
	cmpl	%edx,%edi
	jle	.CopyLeft	/ jump if dst <= src + size - 1
	jmp	.memcpy_post

	ENTRY(memcpy)
	pushl	%esi
	pushl	%edi

	movl	8+4(%esp),%edi	/ %edi = dest address
	movl	%edi, %eax	/ save this
	movl	8+8(%esp),%esi	/ %esi = source address
	movl	8+12(%esp),%ecx	/ %ecx = length of string
				/ %edx scratch register
				/ %eax scratch register
.memcpy_post:
	nop			/ this really helps, don't know why
				/ note: cld is perf death on P4
	cmpl	$63,%ecx
	ja	.move_sse	/ not worth doing sse for less

.movew:
	movl	%ecx,%edx	/ save byte cnt
	shrl	$2,%ecx		/ %ecx = number of words to move
	rep ; smovl		/ move the words


	andl	$0x3,%edx	/ %edx = number of bytes left to move
	jz	.Return		/ %edx <= 3, so just unroll the loop

	movb	(%esi), %cl
	movb	%cl, (%edi)
	decl	%edx
	jz	.Return
	movb	1(%esi), %cl
	movb	%cl, 1(%edi)
	decl	%edx
	jz	.Return
	movb	2(%esi), %cl
	movb	%cl, 2(%edi)

.Return:
	popl	%edi		/ restore register variables
	popl	%esi
	ret

.move_sse:
	/
	/ time to 16 byte align destination
	/
	andl	$15, %eax
	jnz	.sse_unaligned	/ jmp if dest is unaligned
.sse:				/ dest is aligned, check source
	movl	%ecx, %edx	/ get byte count
	shrl	$6, %edx	/ number of 64 byte blocks to move
	testl	$15, %esi
	jnz	.sse_da		/ go to slow loop if source is unaligned
	cmpl	$65535, %ecx
	ja	.sse_sa_nt_loop

	/
	/ use aligned load since we're lucky
	/
.sse_sa_loop:
	prefetcht0 568(%esi)	/ prefetch source & copy 64 bytes at a time
	prefetcht0 568(%edi)	/ prefetch destination as well
	movaps	0(%esi), %xmm0
	movaps	%xmm0, 0(%edi)
	movaps	16(%esi), %xmm1
	movaps	%xmm1, 16(%edi)
	movaps	32(%esi), %xmm2
	movaps	%xmm2, 32(%edi)
	movaps	48(%esi), %xmm3
	movaps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_sa_loop

.sse_cleanup:
	andl	$63, %ecx	/ compute remaining bytes
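				/ fewer than 64 bytes remain; %eax was used
				/ as scratch by the alignment code above, so
				/ reload the return value (the original dest
				/ pointer) from the stack before finishing
				/ with the word-copy path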
	movl	8+4(%esp), %eax	/ set up return value
	jz	.Return
	jmp	.movew

	/
	/ use aligned loads again; the count is large, so use
	/ non-temporal (movntps) stores to avoid polluting the cache
	/
	.align 16
.sse_sa_nt_loop:
	prefetchnta 16384(%esi)	/ prefetch source & copy 64 bytes at a time
	movaps	(%esi), %xmm0
	movntps	%xmm0, 0(%edi)
	movaps	16(%esi), %xmm1
	movntps	%xmm1, 16(%edi)
	movaps	32(%esi), %xmm2
	movntps	%xmm2, 32(%edi)
	movaps	48(%esi), %xmm3
	movntps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_sa_nt_loop
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	jmp	.sse_cleanup

	/
	/ Make certain that destination buffer becomes aligned
	/
.sse_unaligned:
	neg	%eax		/ subtract from 16 and get destination
	andl	$15, %eax	/ aligned on a 16 byte boundary
	movl	%ecx, %edx	/ save original byte count
	subl	%eax, %ecx	/ subtract from byte count
	cmpl	$64, %ecx	/ after aligning, will we still have 64 bytes?
	cmovb	%edx, %ecx	/ if not, restore original byte count,
	cmovb	8+4(%esp), %eax	/ and restore return value,
	jb	.movew		/ and do a non-SSE move.
	xchg	%ecx, %eax	/ flip for copy
	rep ; smovb		/ move the bytes
	xchg	%ecx, %eax	/ flip back
	jmp	.sse

	.align 16
.sse_da:
	cmpl	$65535, %ecx
	jbe	.sse_da_loop

	/
	/ use unaligned loads since source doesn't line up; the count is
	/ large, so use non-temporal stores
	/
.sse_da_nt_loop:
	prefetchnta 16384(%esi)	/ prefetch source & copy 64 bytes at a time
	movups	0(%esi), %xmm0
	movntps	%xmm0, 0(%edi)
	movups	16(%esi), %xmm1
	movntps	%xmm1, 16(%edi)
	movups	32(%esi), %xmm2
	movntps	%xmm2, 32(%edi)
	movups	48(%esi), %xmm3
	movntps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_da_nt_loop
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	jmp	.sse_cleanup
	/
	/ use unaligned loads since source doesn't line up
	/
	.align 16
.sse_da_loop:
	prefetcht0 568(%esi)	/ prefetch source & copy 64 bytes at a time
	prefetcht0 568(%edi)	/ prefetch destination as well
	movups	0(%esi), %xmm0
	movaps	%xmm0, 0(%edi)
	movups	16(%esi), %xmm1
	movaps	%xmm1, 16(%edi)
	movups	32(%esi), %xmm2
	movaps	%xmm2, 32(%edi)
	movups	48(%esi), %xmm3
	movaps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_da_loop
	jmp	.sse_cleanup

	SET_SIZE(memcpy)


/ .CopyLeft handles the memmove case where we must perform the copy backwards,
/ because of overlap between src and dst. This is not particularly optimized.
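/
/ A rough C sketch of the backwards copy (an approximation only; the
/ assembly below also word-aligns the pointers and uses rep; smovl for
/ sizes larger than 12 bytes):
/
/	void *
/	copy_left(void *dst, const void *src, size_t n)
/	{
/		char *d = (char *)dst + n;
/		const char *s = (const char *)src + n;
/
/		while (n-- != 0)
/			*--d = *--s;
/		return (dst);
/	}
/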
.CopyLeft:
	movl	$3,%eax		/ heavily used constant
	std			/ reverse direction bit (RtoL)
	cmpl	$12,%ecx	/ if (size < 12)
	ja	.BigCopyLeft	/ {
	movl	%edx,%esi	/ src = src + size - 1
	leal	-1(%ecx,%edi),%edi	/ dst = dst + size - 1
	rep;	smovb		/ do the byte copy
	cld			/ reset direction flag to LtoR
	popl	%edi		/ }
	popl	%esi		/ restore registers
	movl	4(%esp),%eax	/ set up return value
	ret			/ return(dba);
.BigCopyLeft:			/ } else {
	xchgl	%edx,%ecx
	movl	%ecx,%esi	/ align source w/byte copy
	leal	-1(%edx,%edi),%edi
	andl	%eax,%ecx
	jz	.SkipAlignLeft
	addl	$1, %ecx	/ we need to ensure that the future
	subl	%ecx,%edx	/ copy is done on an aligned boundary
	rep;	smovb
.SkipAlignLeft:
	movl	%edx,%ecx
	subl	%eax,%esi
	shrl	$2,%ecx		/ do 4 byte copy RtoL
	subl	%eax,%edi
	rep;	smovl
	andl	%eax,%edx	/ do 1 byte copy of what's left
	jz	.CleanupReturnLeft
	movl	%edx,%ecx
	addl	%eax,%esi	/ rep; smovl instruction will decrement
	addl	%eax,%edi	/ %edi, %esi by four after each copy
				/ adding 3 will restore the pointers to the
				/ byte before the last double word copied,
				/ which is where they are expected to be
				/ for the single byte copy code
	rep;	smovb
.CleanupReturnLeft:
	cld			/ reset direction flag to LtoR
	popl	%edi
	popl	%esi		/ restore registers
	movl	4(%esp),%eax	/ set up return value
	ret			/ return(dba);
	SET_SIZE(memmove)
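/
/ For reference, the dispatch implemented by memmove() above is roughly the
/ following C (a sketch only -- the assembly compares pointers with signed
/ arithmetic, and the forward path switches to the 64-byte SSE block copy
/ for counts above 63 bytes; copy_left() refers to the sketch above
/ .CopyLeft):
/
/	void *
/	memmove(void *dst, const void *src, size_t n)
/	{
/		if (n == 0)
/			return (dst);
/		if ((char *)dst <= (const char *)src ||
/		    (char *)dst > (const char *)src + (n - 1))
/			return (memcpy(dst, src, n));	/* forward copy is safe */
/		return (copy_left(dst, src, n));	/* dst overlaps end of src */
/	}
/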