/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

Nelson .file "memcpy.s" 287c478bd9Sstevel@tonic-gate 297c478bd9Sstevel@tonic-gate#include <sys/asm_linkage.h> 307c478bd9Sstevel@tonic-gate 317c478bd9Sstevel@tonic-gate ANSI_PRAGMA_WEAK(memmove,function) 327c478bd9Sstevel@tonic-gate ANSI_PRAGMA_WEAK(memcpy,function) 337c478bd9Sstevel@tonic-gate 347c478bd9Sstevel@tonic-gate ENTRY(memmove) 357c478bd9Sstevel@tonic-gate movl 0+12(%esp),%ecx / get number of bytes to move 367c478bd9Sstevel@tonic-gate pushl %esi / save off %edi, %esi and move destination 377c478bd9Sstevel@tonic-gate pushl %edi 387c478bd9Sstevel@tonic-gate movl 8+ 4(%esp),%edi / destination buffer address 397c478bd9Sstevel@tonic-gate movl 8+ 8(%esp),%esi / source buffer address 407c478bd9Sstevel@tonic-gate movl %edi, %eax 417c478bd9Sstevel@tonic-gate testl %ecx,%ecx 427c478bd9Sstevel@tonic-gate jz .Return 437c478bd9Sstevel@tonic-gate 447c478bd9Sstevel@tonic-gate cmpl %esi,%edi / if (source addr > dest addr) 457c478bd9Sstevel@tonic-gate leal -1(%esi,%ecx),%edx / %edx = src + size - 1 46*46b59285SSudheer A jbe .memcpy_post / jump if dst <= src 477c478bd9Sstevel@tonic-gate cmpl %edx,%edi 48*46b59285SSudheer A jbe .CopyLeft / jump if dst <= src + size - 1 497c478bd9Sstevel@tonic-gate jmp .memcpy_post 507c478bd9Sstevel@tonic-gate 517c478bd9Sstevel@tonic-gate ENTRY(memcpy) 527c478bd9Sstevel@tonic-gate pushl %esi 537c478bd9Sstevel@tonic-gate pushl %edi 547c478bd9Sstevel@tonic-gate 557c478bd9Sstevel@tonic-gate movl 8+4(%esp),%edi / %edi = dest address 567c478bd9Sstevel@tonic-gate movl %edi, %eax / save this 577c478bd9Sstevel@tonic-gate movl 8+8(%esp),%esi / %esi = source address 587c478bd9Sstevel@tonic-gate movl 8+12(%esp),%ecx/ %ecx = length of string 597c478bd9Sstevel@tonic-gate / %edx scratch register 607c478bd9Sstevel@tonic-gate / %eax scratch register 617c478bd9Sstevel@tonic-gate.memcpy_post: 627c478bd9Sstevel@tonic-gate nop / this really helps, don't know why 637c478bd9Sstevel@tonic-gate / note: cld is perf death on P4 647c478bd9Sstevel@tonic-gate cmpl 
$63,%ecx 657c478bd9Sstevel@tonic-gate ja .move_sse / not worth doing sse for less 667c478bd9Sstevel@tonic-gate 677c478bd9Sstevel@tonic-gate.movew: 687c478bd9Sstevel@tonic-gate movl %ecx,%edx / save byte cnt 697c478bd9Sstevel@tonic-gate shrl $2,%ecx / %ecx = number of words to move 707c478bd9Sstevel@tonic-gate rep ; smovl / move the words 717c478bd9Sstevel@tonic-gate 727c478bd9Sstevel@tonic-gate 737c478bd9Sstevel@tonic-gate andl $0x3,%edx / %edx = number of bytes left to move 747c478bd9Sstevel@tonic-gate jz .Return / %edx <= 3, so just unroll the loop 757c478bd9Sstevel@tonic-gate 767c478bd9Sstevel@tonic-gate movb (%esi), %cl 777c478bd9Sstevel@tonic-gate movb %cl, (%edi) 787c478bd9Sstevel@tonic-gate decl %edx 797c478bd9Sstevel@tonic-gate jz .Return 807c478bd9Sstevel@tonic-gate movb 1(%esi), %cl 817c478bd9Sstevel@tonic-gate movb %cl, 1(%edi) 827c478bd9Sstevel@tonic-gate decl %edx 837c478bd9Sstevel@tonic-gate jz .Return 847c478bd9Sstevel@tonic-gate movb 2(%esi), %cl 857c478bd9Sstevel@tonic-gate movb %cl, 2(%edi) 867c478bd9Sstevel@tonic-gate 877c478bd9Sstevel@tonic-gate.Return: 887c478bd9Sstevel@tonic-gate popl %edi / restore register variables 897c478bd9Sstevel@tonic-gate popl %esi 907c478bd9Sstevel@tonic-gate ret 917c478bd9Sstevel@tonic-gate 927c478bd9Sstevel@tonic-gate.move_sse: 937c478bd9Sstevel@tonic-gate / 947c478bd9Sstevel@tonic-gate / time to 16 byte align destination 957c478bd9Sstevel@tonic-gate / 967c478bd9Sstevel@tonic-gate andl $15, %eax 977c478bd9Sstevel@tonic-gate jnz .sse_unaligned / jmp if dest is unaligned 987c478bd9Sstevel@tonic-gate.sse: / dest is aligned, check source 997c478bd9Sstevel@tonic-gate movl %ecx, %edx / get byte count 1007c478bd9Sstevel@tonic-gate shrl $6, %edx / number of 64 byte blocks to move 1017c478bd9Sstevel@tonic-gate testl $15, %esi 1027c478bd9Sstevel@tonic-gate jnz .sse_da / go to slow loop if source is unaligned 1037c478bd9Sstevel@tonic-gate cmpl $65535, %ecx 1047c478bd9Sstevel@tonic-gate ja .sse_sa_nt_loop 
1057c478bd9Sstevel@tonic-gate 1067c478bd9Sstevel@tonic-gate / 1077c478bd9Sstevel@tonic-gate / use aligned load since we're lucky 1087c478bd9Sstevel@tonic-gate / 1097c478bd9Sstevel@tonic-gate.sse_sa_loop: 1107c478bd9Sstevel@tonic-gate prefetcht0 568(%esi) / prefetch source & copy 64 byte at a time 1117c478bd9Sstevel@tonic-gate prefetcht0 568(%edi) / prefetch source & copy 64 byte at a time 1127c478bd9Sstevel@tonic-gate movaps 0(%esi), %xmm0 1137c478bd9Sstevel@tonic-gate movaps %xmm0, 0(%edi) 1147c478bd9Sstevel@tonic-gate movaps 16(%esi), %xmm1 1157c478bd9Sstevel@tonic-gate movaps %xmm1, 16(%edi) 1167c478bd9Sstevel@tonic-gate movaps 32(%esi), %xmm2 1177c478bd9Sstevel@tonic-gate movaps %xmm2, 32(%edi) 1187c478bd9Sstevel@tonic-gate movaps 48(%esi), %xmm3 1197c478bd9Sstevel@tonic-gate movaps %xmm3, 48(%edi) 1207c478bd9Sstevel@tonic-gate addl $64, %esi 1217c478bd9Sstevel@tonic-gate addl $64, %edi 1227c478bd9Sstevel@tonic-gate decl %edx 1237c478bd9Sstevel@tonic-gate jnz .sse_sa_loop 1247c478bd9Sstevel@tonic-gate 1257c478bd9Sstevel@tonic-gate.sse_cleanup: 1267c478bd9Sstevel@tonic-gate andl $63, %ecx / compute remaining bytes 1277c478bd9Sstevel@tonic-gate movl 8+4(%esp), %eax / setup return value 1287c478bd9Sstevel@tonic-gate jz .Return 1297c478bd9Sstevel@tonic-gate jmp .movew 1307c478bd9Sstevel@tonic-gate 1317c478bd9Sstevel@tonic-gate / 1327c478bd9Sstevel@tonic-gate / use aligned load since we're lucky 1337c478bd9Sstevel@tonic-gate / 1347c478bd9Sstevel@tonic-gate .align 16 1357c478bd9Sstevel@tonic-gate.sse_sa_nt_loop: 1367c478bd9Sstevel@tonic-gate prefetchnta 16384(%esi) / prefetch source & copy 64 byte at a time 1377c478bd9Sstevel@tonic-gate movaps (%esi), %xmm0 1387c478bd9Sstevel@tonic-gate movntps %xmm0, 0(%edi) 1397c478bd9Sstevel@tonic-gate movaps 16(%esi), %xmm1 1407c478bd9Sstevel@tonic-gate movntps %xmm1, 16(%edi) 1417c478bd9Sstevel@tonic-gate movaps 32(%esi), %xmm2 1427c478bd9Sstevel@tonic-gate movntps %xmm2, 32(%edi) 1437c478bd9Sstevel@tonic-gate movaps 48(%esi), 
%xmm3 1447c478bd9Sstevel@tonic-gate movntps %xmm3, 48(%edi) 1457c478bd9Sstevel@tonic-gate addl $64, %esi 1467c478bd9Sstevel@tonic-gate addl $64, %edi 1477c478bd9Sstevel@tonic-gate decl %edx 1487c478bd9Sstevel@tonic-gate jnz .sse_sa_nt_loop 1497c478bd9Sstevel@tonic-gate#if defined(_SSE2_INSN) 1507c478bd9Sstevel@tonic-gate mfence 1517c478bd9Sstevel@tonic-gate#elif defined(_SSE_INSN) 1527c478bd9Sstevel@tonic-gate sfence 1537c478bd9Sstevel@tonic-gate#else 1547c478bd9Sstevel@tonic-gate#error "Must have either SSE or SSE2" 1557c478bd9Sstevel@tonic-gate#endif 1567c478bd9Sstevel@tonic-gate jmp .sse_cleanup 1577c478bd9Sstevel@tonic-gate 1587c478bd9Sstevel@tonic-gate / 1597c478bd9Sstevel@tonic-gate / Make certain that destination buffer becomes aligned 1607c478bd9Sstevel@tonic-gate / 1617c478bd9Sstevel@tonic-gate.sse_unaligned: 1627c478bd9Sstevel@tonic-gate neg %eax / subtract from 16 and get destination 1637c478bd9Sstevel@tonic-gate andl $15, %eax / aligned on a 16 byte boundary 1647c478bd9Sstevel@tonic-gate movl %ecx, %edx / saved count 1657c478bd9Sstevel@tonic-gate subl %eax, %ecx / subtract from byte count 1667c478bd9Sstevel@tonic-gate cmpl $64, %ecx / after aligning, will we still have 64 bytes? 1677c478bd9Sstevel@tonic-gate cmovb %edx, %ecx / if not, restore original byte count, 1687c478bd9Sstevel@tonic-gate cmovb 8+4(%esp), %eax / and restore return value, 1697c478bd9Sstevel@tonic-gate jb .movew / and do a non-SSE move. 
1707c478bd9Sstevel@tonic-gate xchg %ecx, %eax / flip for copy 1717c478bd9Sstevel@tonic-gate rep ; smovb / move the bytes 1727c478bd9Sstevel@tonic-gate xchg %ecx, %eax / flip back 1737c478bd9Sstevel@tonic-gate jmp .sse 1747c478bd9Sstevel@tonic-gate 1757c478bd9Sstevel@tonic-gate .align 16 1767c478bd9Sstevel@tonic-gate.sse_da: 1777c478bd9Sstevel@tonic-gate cmpl $65535, %ecx 1787c478bd9Sstevel@tonic-gate jbe .sse_da_loop 1797c478bd9Sstevel@tonic-gate 1807c478bd9Sstevel@tonic-gate / 1817c478bd9Sstevel@tonic-gate / use unaligned load since source doesn't line up 1827c478bd9Sstevel@tonic-gate / 1837c478bd9Sstevel@tonic-gate.sse_da_nt_loop: 1847c478bd9Sstevel@tonic-gate prefetchnta 16384(%esi) / prefetch source & copy 64 byte at a time 1857c478bd9Sstevel@tonic-gate movups 0(%esi), %xmm0 1867c478bd9Sstevel@tonic-gate movntps %xmm0, 0(%edi) 1877c478bd9Sstevel@tonic-gate movups 16(%esi), %xmm1 1887c478bd9Sstevel@tonic-gate movntps %xmm1, 16(%edi) 1897c478bd9Sstevel@tonic-gate movups 32(%esi), %xmm2 1907c478bd9Sstevel@tonic-gate movntps %xmm2, 32(%edi) 1917c478bd9Sstevel@tonic-gate movups 48(%esi), %xmm3 1927c478bd9Sstevel@tonic-gate movntps %xmm3, 48(%edi) 1937c478bd9Sstevel@tonic-gate addl $64, %esi 1947c478bd9Sstevel@tonic-gate addl $64, %edi 1957c478bd9Sstevel@tonic-gate decl %edx 1967c478bd9Sstevel@tonic-gate jnz .sse_da_nt_loop 1977c478bd9Sstevel@tonic-gate#if defined(_SSE2_INSN) 1987c478bd9Sstevel@tonic-gate mfence 1997c478bd9Sstevel@tonic-gate#elif defined(_SSE_INSN) 2007c478bd9Sstevel@tonic-gate sfence 2017c478bd9Sstevel@tonic-gate#else 2027c478bd9Sstevel@tonic-gate#error "Must have either SSE or SSE2" 2037c478bd9Sstevel@tonic-gate#endif 2047c478bd9Sstevel@tonic-gate jmp .sse_cleanup 2057c478bd9Sstevel@tonic-gate / 2067c478bd9Sstevel@tonic-gate / use unaligned load since source doesn't line up 2077c478bd9Sstevel@tonic-gate / 2087c478bd9Sstevel@tonic-gate .align 16 2097c478bd9Sstevel@tonic-gate.sse_da_loop: 2107c478bd9Sstevel@tonic-gate prefetcht0 568(%esi) / prefetch 
source & copy 64 byte at a time 2117c478bd9Sstevel@tonic-gate prefetcht0 568(%edi) 2127c478bd9Sstevel@tonic-gate movups 0(%esi), %xmm0 2137c478bd9Sstevel@tonic-gate movaps %xmm0, 0(%edi) 2147c478bd9Sstevel@tonic-gate movups 16(%esi), %xmm1 2157c478bd9Sstevel@tonic-gate movaps %xmm1, 16(%edi) 2167c478bd9Sstevel@tonic-gate movups 32(%esi), %xmm2 2177c478bd9Sstevel@tonic-gate movaps %xmm2, 32(%edi) 2187c478bd9Sstevel@tonic-gate movups 48(%esi), %xmm3 2197c478bd9Sstevel@tonic-gate movaps %xmm3, 48(%edi) 2207c478bd9Sstevel@tonic-gate addl $64, %esi 2217c478bd9Sstevel@tonic-gate addl $64, %edi 2227c478bd9Sstevel@tonic-gate decl %edx 2237c478bd9Sstevel@tonic-gate jnz .sse_da_loop 2247c478bd9Sstevel@tonic-gate jmp .sse_cleanup 2257c478bd9Sstevel@tonic-gate 2267c478bd9Sstevel@tonic-gate SET_SIZE(memcpy) 2277c478bd9Sstevel@tonic-gate 2287c478bd9Sstevel@tonic-gate 2297c478bd9Sstevel@tonic-gate/ .CopyLeft handles the memmove case where we must perform the copy backwards, 2307c478bd9Sstevel@tonic-gate/ because of overlap between src and dst. This is not particularly optimized. 
2317c478bd9Sstevel@tonic-gate 2327c478bd9Sstevel@tonic-gate.CopyLeft: 2337c478bd9Sstevel@tonic-gate movl $3,%eax / heavily used constant 2347c478bd9Sstevel@tonic-gate std / reverse direction bit (RtoL) 2357c478bd9Sstevel@tonic-gate cmpl $12,%ecx / if (size < 12) 2367c478bd9Sstevel@tonic-gate ja .BigCopyLeft / { 2377c478bd9Sstevel@tonic-gate movl %edx,%esi / src = src + size - 1 2387c478bd9Sstevel@tonic-gate leal -1(%ecx,%edi),%edi / dst = dst + size - 1 2397c478bd9Sstevel@tonic-gate rep; smovb / do the byte copy 2407c478bd9Sstevel@tonic-gate cld / reset direction flag to LtoR 2417c478bd9Sstevel@tonic-gate popl %edi / } 2427c478bd9Sstevel@tonic-gate popl %esi / restore registers 2437c478bd9Sstevel@tonic-gate movl 4(%esp),%eax / set up return value 2447c478bd9Sstevel@tonic-gate ret / return(dba); 2457c478bd9Sstevel@tonic-gate.BigCopyLeft: / } else { 2467c478bd9Sstevel@tonic-gate xchgl %edx,%ecx 2477c478bd9Sstevel@tonic-gate movl %ecx,%esi / align source w/byte copy 2487c478bd9Sstevel@tonic-gate leal -1(%edx,%edi),%edi 2497c478bd9Sstevel@tonic-gate andl %eax,%ecx 2507c478bd9Sstevel@tonic-gate jz .SkipAlignLeft 2517c478bd9Sstevel@tonic-gate addl $1, %ecx / we need to insure that future 2527c478bd9Sstevel@tonic-gate subl %ecx,%edx / copy is done on aligned boundary 2537c478bd9Sstevel@tonic-gate rep; smovb 2547c478bd9Sstevel@tonic-gate.SkipAlignLeft: 2557c478bd9Sstevel@tonic-gate movl %edx,%ecx 2567c478bd9Sstevel@tonic-gate subl %eax,%esi 2577c478bd9Sstevel@tonic-gate shrl $2,%ecx / do 4 byte copy RtoL 2587c478bd9Sstevel@tonic-gate subl %eax,%edi 2597c478bd9Sstevel@tonic-gate rep; smovl 2607c478bd9Sstevel@tonic-gate andl %eax,%edx / do 1 byte copy whats left 2617c478bd9Sstevel@tonic-gate jz .CleanupReturnLeft 2627c478bd9Sstevel@tonic-gate movl %edx,%ecx 2637c478bd9Sstevel@tonic-gate addl %eax,%esi / rep; smovl instruction will decrement 2647c478bd9Sstevel@tonic-gate addl %eax,%edi / %edi, %esi by four after each copy 2657c478bd9Sstevel@tonic-gate / adding 3 will restore 
pointers to byte 2667c478bd9Sstevel@tonic-gate / before last double word copied 2677c478bd9Sstevel@tonic-gate / which is where they are expected to 2687c478bd9Sstevel@tonic-gate / be for the single byte copy code 2697c478bd9Sstevel@tonic-gate rep; smovb 2707c478bd9Sstevel@tonic-gate.CleanupReturnLeft: 2717c478bd9Sstevel@tonic-gate cld / reset direction flag to LtoR 2727c478bd9Sstevel@tonic-gate popl %edi 2737c478bd9Sstevel@tonic-gate popl %esi / restore registers 2747c478bd9Sstevel@tonic-gate movl 4(%esp),%eax / set up return value 2757c478bd9Sstevel@tonic-gate ret / return(dba); 2767c478bd9Sstevel@tonic-gate SET_SIZE(memmove) 277