/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

	.file	"memset.s"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memset,function)

/*
 * void *memset(void *s, int c, size_t n)
 *
 * Stores the low byte of c into the first n bytes starting at s and
 * returns s.
 *
 * Arguments are read from the stack (offsets are those seen after the
 * initial "pushl %edi"):
 *	 8(%esp)	s
 *	12(%esp)	c
 *	16(%esp)	n
 *
 * Result:	%eax = s
 * Modifies:	%eax, %ecx, %edx, flags, and %xmm0 on the SSE paths.
 *		%edi, %ebx and %esi are saved and restored.
 *
 * Strategy, selected by n:
 *	n <= 20		byte stores only (.byteset)
 *	21 .. 256	dword stores followed by a 0-3 byte tail (.wordset)
 *	257 .. 511	byte stores up to an 8 byte boundary, then .wordset
 *	n >= 512	dword/byte stores up to a 64 byte boundary, then
 *			64 byte SSE blocks; the remaining tail (< 64 bytes)
 *			is finished by .byteset or .wordset.  When more than
 *			262143 bytes remain after alignment the blocks are
 *			written with non-temporal stores followed by a fence,
 *			otherwise with ordinary aligned stores.
 */
	ENTRY(memset)
	pushl	%edi			/ save register variable
	movl	8(%esp),%edi		/ %edi = string address
	movl	12(%esp),%eax		/ %al = byte to duplicate
	movl	16(%esp),%ecx		/ %ecx = number of copies

	/ For all basic blocks in this routine, maintain the following
	/ entry conditions:	%eax has every byte set to the desired byte
	/			(NOTE: .byteset only needs %al)
	/			%ecx contains the number of bytes to set
	/			%edi contains the address to set

	cld				/ make sure we go the right way...
	cmpl	$20,%ecx		/ 20 bytes or fewer are simply byte set
	jbe	.byteset

	andl	$0xff, %eax		/ trim anything above low byte
	imul	$0x01010101, %eax	/ extend low byte to each byte

	cmpl	$256, %ecx		/ up to 256 bytes: alignment is not worth it
	jbe	.wordset

	cmpl	$511, %ecx		/ up to 511 bytes: aligned dword stores
	jbe	.check_wordset

	/
	/ prep work for sse temporal and non-temporal
	/

	pushl	%ebx			/ more registers are needed
	pushl	%esi			/ for alignment work

	/
	/ align address to 64 byte boundaries.
	/

	movl	%ecx, %ebx		/ %ebx = byte count
	movl	%edi, %esi		/ %esi is a scratch register
	neg	%esi			/ (-addr) mod 64 is the distance
	andl	$63, %esi		/ to the next 64 byte boundary
	jz	.sse_aligned		/ skip alignment if not needed
	subl	%esi, %ebx		/ %ebx = bytes left once aligned
	movl	%esi, %ecx		/ alignment bytes
	shrl	$2,%ecx			/ %ecx = number of dwords to set
	rep;	sstol
	movl	%esi,%ecx
	andl	$3,%ecx			/ %ecx = number of bytes left
	rep;	sstob
	movl	%ebx, %ecx		/ remainder to be set

.sse_aligned:

	shr	$6, %ecx		/ number of 64 byte blocks to set

	/
	/ load xmm0 with bytes to be set
	/
	subl	$16,%esp		/ give ourselves some working room on the stack
	movl	%eax,(%esp)		/ store %eax into each of the 4 dwords
	movl	%eax,4(%esp)		/ avoid pushl since it causes more interlocking
	movl	%eax,8(%esp)		/
	movl	%eax,12(%esp)		/
	movups	(%esp), %xmm0		/ unaligned load from stack into xmm0
	addl	$16,%esp		/ restore stack position

	cmpl	$262143, %ebx		/ areas up to this size allocate in the cache
	jbe	.sse_loop
	jmp	.sse_nt_loop		/ branch across alignment nops

	.align 16

.sse_nt_loop:
	movntps	%xmm0, (%edi)		/ block non-temporal store
	movntps	%xmm0, 16(%edi)		/ use sse rather than sse2
	movntps	%xmm0, 32(%edi)		/ so we work more places
	movntps	%xmm0, 48(%edi)		/

	addl	$64, %edi		/ increment dest address
	dec	%ecx			/ dec count of blocks
	jnz	.sse_nt_loop		/ jump if not done

	andl	$63, %ebx		/ tail bytes left after the 64 byte blocks
	movl	%ebx, %ecx		/ %ecx = remainder of bytes to set
	popl	%esi			/ restore stack config
	popl	%ebx			/
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	cmpl	$20, %ecx		/ finish the tail: bytes if <= 20,
	jbe	.byteset		/ otherwise dwords plus bytes
	jmp	.wordset

	.align 16
.sse_loop:
	movaps	%xmm0, (%edi)		/ block set w/ SSE (address is 64 byte aligned)
	movaps	%xmm0, 16(%edi)
	movaps	%xmm0, 32(%edi)
	movaps	%xmm0, 48(%edi)

	addl	$64, %edi		/ increment addr
	dec	%ecx			/ dec count of blocks
	jnz	.sse_loop		/ jump if not done

	andl	$63, %ebx		/ tail bytes left after the 64 byte blocks
	movl	%ebx, %ecx		/ in %ecx as normal
	popl	%esi			/ restore stack config
	popl	%ebx			/
	cmpl	$20, %ecx		/ finish the tail: bytes if <= 20,
	jbe	.byteset		/ otherwise dwords plus bytes
	jmp	.wordset

.check_wordset:
	testl	$7, %edi		/ already on an 8 byte boundary?
	jz	.wordset		/ all ok

.align_wordset:
	movl	%ecx, %edx		/ %edx = total byte count
	movl	%edi, %ecx
	neg	%ecx			/ (-addr) mod 8 is the distance
	andl	$7, %ecx		/ to the next 8 byte boundary (1-7 here)
	subl	%ecx, %edx		/ %edx = bytes left once aligned
	rep;	sstob			/ byte set the unaligned head
	movl	%edx, %ecx		/ %ecx = remainder of bytes to set

.wordset:
	movl	%ecx, %edx		/ save count
	shrl	$2,%ecx			/ %ecx = number of dwords to set
	rep;	sstol
	movl	%edx,%ecx
	andl	$3,%ecx			/ %ecx = number of bytes left

.byteset:
	rep;	sstob
	movl	8(%esp),%eax		/ return string address
	popl	%edi			/ restore register variable
	ret
	SET_SIZE(memset)