/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

	.file	"memcpy.s"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

	ENTRY(memmove)
	movl	0+12(%esp),%ecx	/ get number of bytes to move
	pushl	%esi		/ save off %edi, %esi and move destination
	pushl	%edi
	movl	8+ 4(%esp),%edi	/ destination buffer address
	movl	8+ 8(%esp),%esi	/ source buffer address
	movl	%edi, %eax
	testl	%ecx,%ecx
	jz	.Return

	cmpl	%esi,%edi	/ if (dest addr > source addr)
	leal	-1(%esi,%ecx),%edx	/ %edx = src + size - 1
	jbe	.memcpy_post	/ jump if dst <= src
	cmpl	%edx,%edi
	jbe	.CopyLeft	/ jump if dst <= src + size - 1
	jmp	.memcpy_post

	ENTRY(memcpy)
	pushl	%esi
	pushl	%edi

	movl	8+4(%esp),%edi	/ %edi = dest address
	movl	%edi, %eax	/ save this
	movl	8+8(%esp),%esi	/ %esi = source address
	movl	8+12(%esp),%ecx	/ %ecx = number of bytes to copy
				/ %edx scratch register
				/ %eax scratch register
.memcpy_post:
	nop			/ this really helps, don't know why
				/ note: cld is perf death on P4
	cmpl	$63,%ecx
	ja	.move_sse	/ not worth doing sse for less

.movew:
	movl	%ecx,%edx	/ save byte cnt
	shrl	$2,%ecx		/ %ecx = number of words to move
	rep ; smovl		/ move the words


	andl	$0x3,%edx	/ %edx = number of bytes left to move
	jz	.Return		/ %edx <= 3, so just unroll the loop

	movb	(%esi), %cl
	movb	%cl, (%edi)
	decl	%edx
	jz	.Return
	movb	1(%esi), %cl
	movb	%cl, 1(%edi)
	decl	%edx
	jz	.Return
	movb	2(%esi), %cl
	movb	%cl, 2(%edi)

.Return:
	popl	%edi		/ restore register variables
	popl	%esi
	ret

.move_sse:
	/
	/ time to 16 byte align destination
	/
	andl	$15, %eax
	jnz	.sse_unaligned	/ jmp if dest is unaligned
.sse:				/ dest is aligned, check source
	movl	%ecx, %edx	/ get byte count
	shrl	$6, %edx	/ number of 64 byte blocks to move
	testl	$15, %esi
	jnz	.sse_da		/ go to slow loop if source is unaligned
	cmpl	$65535, %ecx
	ja	.sse_sa_nt_loop

	/
	/ use aligned load since we're lucky
	/
.sse_sa_loop:
	prefetcht0 568(%esi)	/ prefetch source & copy 64 bytes at a time
	prefetcht0 568(%edi)	/ prefetch dest & copy 64 bytes at a time
	movaps	0(%esi), %xmm0
	movaps	%xmm0, 0(%edi)
	movaps	16(%esi), %xmm1
	movaps	%xmm1, 16(%edi)
	movaps	32(%esi), %xmm2
	movaps	%xmm2, 32(%edi)
	movaps	48(%esi), %xmm3
	movaps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_sa_loop

.sse_cleanup:
	andl	$63, %ecx	/ compute remaining bytes
	movl	8+4(%esp), %eax	/ setup return value
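				/ note: the movl above does not change the
				/ flags, so the jz below still tests the
				/ remainder left in %ecx by the andl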
	jz	.Return
	jmp	.movew

	/
	/ use aligned load since we're lucky
	/
	.align 16
.sse_sa_nt_loop:
	prefetchnta 16384(%esi)	/ prefetch source & copy 64 bytes at a time
	movaps	(%esi), %xmm0
	movntps	%xmm0, 0(%edi)
	movaps	16(%esi), %xmm1
	movntps	%xmm1, 16(%edi)
	movaps	32(%esi), %xmm2
	movntps	%xmm2, 32(%edi)
	movaps	48(%esi), %xmm3
	movntps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_sa_nt_loop
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	jmp	.sse_cleanup

	/
	/ Make certain that destination buffer becomes aligned
	/
.sse_unaligned:
	neg	%eax		/ subtract from 16 and get destination
	andl	$15, %eax	/ aligned on a 16 byte boundary
	movl	%ecx, %edx	/ saved count
	subl	%eax, %ecx	/ subtract from byte count
	cmpl	$64, %ecx	/ after aligning, will we still have 64 bytes?
	cmovb	%edx, %ecx	/ if not, restore original byte count,
	cmovb	8+4(%esp), %eax	/ and restore return value,
	jb	.movew		/ and do a non-SSE move.
	xchg	%ecx, %eax	/ flip for copy
	rep ; smovb		/ move the bytes
	xchg	%ecx, %eax	/ flip back
	jmp	.sse

	.align 16
.sse_da:
	cmpl	$65535, %ecx
	jbe	.sse_da_loop

	/
	/ use unaligned load since source doesn't line up
	/
.sse_da_nt_loop:
	prefetchnta 16384(%esi)	/ prefetch source & copy 64 bytes at a time
	movups	0(%esi), %xmm0
	movntps	%xmm0, 0(%edi)
	movups	16(%esi), %xmm1
	movntps	%xmm1, 16(%edi)
	movups	32(%esi), %xmm2
	movntps	%xmm2, 32(%edi)
	movups	48(%esi), %xmm3
	movntps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_da_nt_loop
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	jmp	.sse_cleanup
	/
	/ use unaligned load since source doesn't line up
	/
	.align 16
.sse_da_loop:
	prefetcht0 568(%esi)	/ prefetch source & copy 64 bytes at a time
	prefetcht0 568(%edi)	/ prefetch dest
	movups	0(%esi), %xmm0
	movaps	%xmm0, 0(%edi)
	movups	16(%esi), %xmm1
	movaps	%xmm1, 16(%edi)
	movups	32(%esi), %xmm2
	movaps	%xmm2, 32(%edi)
	movups	48(%esi), %xmm3
	movaps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_da_loop
	jmp	.sse_cleanup

	SET_SIZE(memcpy)


/ .CopyLeft handles the memmove case where we must perform the copy backwards,
/ because of overlap between src and dst. This is not particularly optimized.
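/ A forward (low-to-high) copy would overwrite source bytes before they are
/ read whenever dst is above src but still inside the source buffer, so the
/ direction flag is set and the bytes are copied from the last one down to
/ the first. The test made at ENTRY(memmove) above is, in effect:
/
/	if (n != 0 && dst > src && dst <= src + n - 1)
/		copy backwards			(.CopyLeft)
/	else
/		copy forwards			(.memcpy_post)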

.CopyLeft:
	movl	$3,%eax		/ heavily used constant
	std			/ reverse direction bit (RtoL)
	cmpl	$12,%ecx	/ if (size <= 12)
	ja	.BigCopyLeft	/ {
	movl	%edx,%esi	/ src = src + size - 1
	leal	-1(%ecx,%edi),%edi	/ dst = dst + size - 1
	rep;	smovb		/ do the byte copy
	cld			/ reset direction flag to LtoR
	popl	%edi		/ }
	popl	%esi		/ restore registers
	movl	4(%esp),%eax	/ set up return value
	ret			/ return(dba);
.BigCopyLeft:			/ } else {
	xchgl	%edx,%ecx
	movl	%ecx,%esi	/ align source w/byte copy
	leal	-1(%edx,%edi),%edi
	andl	%eax,%ecx
	jz	.SkipAlignLeft
	addl	$1, %ecx	/ we need to ensure that future
	subl	%ecx,%edx	/ copy is done on aligned boundary
	rep;	smovb
.SkipAlignLeft:
	movl	%edx,%ecx
	subl	%eax,%esi
	shrl	$2,%ecx		/ do 4 byte copy RtoL
	subl	%eax,%edi
	rep;	smovl
	andl	%eax,%edx	/ do 1 byte copy of what's left
	jz	.CleanupReturnLeft
	movl	%edx,%ecx
	addl	%eax,%esi	/ rep; smovl instruction will decrement
	addl	%eax,%edi	/ %edi, %esi by four after each copy
				/ adding 3 will restore pointers to byte
				/ before last double word copied
				/ which is where they are expected to
				/ be for the single byte copy code
	rep;	smovb
.CleanupReturnLeft:
	cld			/ reset direction flag to LtoR
	popl	%edi
	popl	%esi		/ restore registers
	movl	4(%esp),%eax	/ set up return value
	ret			/ return(dba);
	SET_SIZE(memmove)
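
/
/ Summary of the copy strategies above (n = byte count):
/	n <= 63		rep ; smovl plus a short unrolled byte tail (.movew)
/	n >= 64		the destination is first brought to 16-byte alignment,
/			then the copy runs in 64-byte SSE blocks, using movaps
/			loads when the source is also 16-byte aligned and
/			movups loads otherwise; if aligning would leave fewer
/			than 64 bytes, the code falls back to .movew
/	n > 65535	the SSE loops switch to non-temporal movntps stores
/			with prefetchnta, followed by a fence, so a very large
/			copy does not displace the cache contents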