/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

	.file	"memcpy.s"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

	ENTRY(memmove)
	movl	0+12(%esp),%ecx	/ get number of bytes to move
	pushl	%esi		/ save off %edi, %esi and move destination
	pushl	%edi
	movl	8+ 4(%esp),%edi	/ destination buffer address
	movl	8+ 8(%esp),%esi	/ source buffer address
	movl	%edi, %eax
	testl	%ecx,%ecx
	jz	.Return

	cmpl	%esi,%edi	/ if (source addr > dest addr)
	leal	-1(%esi,%ecx),%edx	/ %edx = src + size - 1
	jbe	.memcpy_post	/ jump if dst <= src

	cmpl	%edx,%edi
	jbe	.CopyLeft	/ jump if dst <= src + size - 1
	jmp	.memcpy_post

	ENTRY(memcpy)
	pushl	%esi
	pushl	%edi

	movl	8+4(%esp),%edi	/ %edi = dest address
	movl	%edi, %eax	/ save this
	movl	8+8(%esp),%esi	/ %esi = source address
	movl	8+12(%esp),%ecx	/ %ecx = length of string
				/ %edx scratch register
				/ %eax scratch register
.memcpy_post:
	nop			/ this really helps, don't know why
				/ note: cld is perf death on P4
	cmpl	$63,%ecx
	ja	.move_sse	/ not worth doing sse for less

.movew:
	movl	%ecx,%edx	/ save byte cnt
	shrl	$2,%ecx		/ %ecx = number of words to move
	rep ; smovl		/ move the words

	andl	$0x3,%edx	/ %edx = number of bytes left to move
	jz	.Return		/ %edx <= 3, so just unroll the loop

	movb	(%esi), %cl
	movb	%cl, (%edi)
	decl	%edx
	jz	.Return
	movb	1(%esi), %cl
	movb	%cl, 1(%edi)
	decl	%edx
	jz	.Return
	movb	2(%esi), %cl
	movb	%cl, 2(%edi)

.Return:
	popl	%edi		/ restore register variables
	popl	%esi
	ret

.move_sse:
	/
	/ time to 16 byte align destination
	/
	andl	$15, %eax
	jnz	.sse_unaligned	/ jmp if dest is unaligned
.sse:				/ dest is aligned, check source
	movl	%ecx, %edx	/ get byte count
	shrl	$6, %edx	/ number of 64 byte blocks to move
	testl	$15, %esi
	jnz	.sse_da		/ go to slow loop if source is unaligned
	cmpl	$65535, %ecx
	ja	.sse_sa_nt_loop

	/
	/ use aligned load since we're lucky
	/
.sse_sa_loop:
	prefetcht0 568(%esi)	/ prefetch source & copy 64 byte at a time
	prefetcht0 568(%edi)	/ prefetch source & copy 64 byte at a time
	movaps	0(%esi), %xmm0
	movaps	%xmm0, 0(%edi)
	movaps	16(%esi), %xmm1
	movaps	%xmm1, 16(%edi)
	movaps	32(%esi), %xmm2
	movaps	%xmm2, 32(%edi)
	movaps	48(%esi), %xmm3
	movaps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_sa_loop

.sse_cleanup:
	andl	$63, %ecx	/ compute remaining bytes
	movl	8+4(%esp), %eax	/ setup return value
	jz	.Return
	jmp	.movew

	/
	/ use aligned load since we're lucky
	/
	.align 16
.sse_sa_nt_loop:
	prefetchnta 16384(%esi)	/ prefetch source & copy 64 byte at a time
	movaps	(%esi), %xmm0
	movntps	%xmm0, 0(%edi)
	movaps	16(%esi), %xmm1
	movntps	%xmm1, 16(%edi)
	movaps	32(%esi), %xmm2
	movntps	%xmm2, 32(%edi)
	movaps	48(%esi), %xmm3
	movntps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_sa_nt_loop
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	jmp	.sse_cleanup
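/*
 * The .sse_unaligned fix-up below advances the copy just far enough to put
 * the destination on a 16 byte boundary before re-entering the SSE loop,
 * and falls back to the plain word copy when too little data would remain.
 * A minimal C sketch of the count math it performs (illustrative names,
 * not part of this file):
 *
 *	#include <stdint.h>
 *	#include <stddef.h>
 *
 *	static size_t
 *	sse_head_bytes(const void *dst, size_t len)
 *	{
 *		// caller guarantees len > 63 (the .move_sse path)
 *		size_t head = (size_t)(-(uintptr_t)dst) & 15; // to next 16B
 *		if (len - head < 64)	// not enough left for the 64-byte loop
 *			return (0);	// caller keeps len and uses .movew
 *		return (head);		// byte-copy head bytes, then enter .sse
 *	}
 */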
	/
	/ Make certain that destination buffer becomes aligned
	/
.sse_unaligned:
	neg	%eax		/ subtract from 16 and get destination
	andl	$15, %eax	/ aligned on a 16 byte boundary
	movl	%ecx, %edx	/ saved count
	subl	%eax, %ecx	/ subtract from byte count
	cmpl	$64, %ecx	/ after aligning, will we still have 64 bytes?
	cmovb	%edx, %ecx	/ if not, restore original byte count,
	cmovb	8+4(%esp), %eax	/ and restore return value,
	jb	.movew		/ and do a non-SSE move.
	xchg	%ecx, %eax	/ flip for copy
	rep ; smovb		/ move the bytes
	xchg	%ecx, %eax	/ flip back
	jmp	.sse

	.align 16
.sse_da:
	cmpl	$65535, %ecx
	jbe	.sse_da_loop

	/
	/ use unaligned load since source doesn't line up
	/
.sse_da_nt_loop:
	prefetchnta 16384(%esi)	/ prefetch source & copy 64 byte at a time
	movups	0(%esi), %xmm0
	movntps	%xmm0, 0(%edi)
	movups	16(%esi), %xmm1
	movntps	%xmm1, 16(%edi)
	movups	32(%esi), %xmm2
	movntps	%xmm2, 32(%edi)
	movups	48(%esi), %xmm3
	movntps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_da_nt_loop
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	jmp	.sse_cleanup

	/
	/ use unaligned load since source doesn't line up
	/
	.align 16
.sse_da_loop:
	prefetcht0 568(%esi)	/ prefetch source & copy 64 byte at a time
	prefetcht0 568(%edi)
	movups	0(%esi), %xmm0
	movaps	%xmm0, 0(%edi)
	movups	16(%esi), %xmm1
	movaps	%xmm1, 16(%edi)
	movups	32(%esi), %xmm2
	movaps	%xmm2, 32(%edi)
	movups	48(%esi), %xmm3
	movaps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_da_loop
	jmp	.sse_cleanup

	SET_SIZE(memcpy)

/ .CopyLeft handles the memmove case where we must perform the copy backwards,
/ because of overlap between src and dst. This is not particularly optimized.

.CopyLeft:
	movl	$3,%eax			/ heavily used constant
	std				/ reverse direction bit (RtoL)
	cmpl	$12,%ecx		/ if (size < 12)
	ja	.BigCopyLeft		/ {
	movl	%edx,%esi		/     src = src + size - 1
	leal	-1(%ecx,%edi),%edi	/     dst = dst + size - 1
	rep;	smovb			/     do the byte copy
	cld				/     reset direction flag to LtoR
	popl	%edi			/ }
	popl	%esi			/ restore registers
	movl	4(%esp),%eax		/ set up return value
	ret				/ return(dba);
.BigCopyLeft:				/ } else {
	xchgl	%edx,%ecx
	movl	%ecx,%esi		/ align source w/byte copy
	leal	-1(%edx,%edi),%edi
	andl	%eax,%ecx
	jz	.SkipAlignLeft
	addl	$1, %ecx		/ we need to ensure that future
	subl	%ecx,%edx		/ copy is done on aligned boundary
	rep;	smovb
.SkipAlignLeft:
	movl	%edx,%ecx
	subl	%eax,%esi
	shrl	$2,%ecx			/ do 4 byte copy RtoL
	subl	%eax,%edi
	rep;	smovl
	andl	%eax,%edx		/ do 1 byte copy of what's left
	jz	.CleanupReturnLeft
	movl	%edx,%ecx
	addl	%eax,%esi		/ rep; smovl instruction will decrement
	addl	%eax,%edi		/ %edi, %esi by four after each copy
					/ adding 3 will restore pointers to byte
					/ before last double word copied
					/ which is where they are expected to
					/ be for the single byte copy code
	rep;	smovb
.CleanupReturnLeft:
	cld				/ reset direction flag to LtoR
	popl	%edi
	popl	%esi			/ restore registers
	movl	4(%esp),%eax		/ set up return value
	ret				/ return(dba);
	SET_SIZE(memmove)
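/*
 * For reference, the direction choice made by memmove() and .CopyLeft above
 * corresponds to this C sketch (illustrative names, not part of this file):
 * the copy only runs backwards when the destination starts inside the source
 * range, where a forward copy would overwrite source bytes not yet read.
 *
 *	#include <stddef.h>
 *
 *	static void
 *	move_bytes(char *dst, const char *src, size_t n)
 *	{
 *		size_t i;
 *
 *		if (n == 0)
 *			return;
 *		if (dst <= src || dst > src + (n - 1)) {
 *			for (i = 0; i < n; i++)		// forward (.memcpy_post)
 *				dst[i] = src[i];
 *		} else {
 *			for (i = n; i > 0; i--)		// backward (.CopyLeft)
 *				dst[i - 1] = src[i - 1];
 *		}
 *	}
 */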