/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

	.file	"__align_cpy_4.s"

/*
 * __align_cpy_4(s1, s2, n)
 *
 * Word-wise copy: moves n bytes from a 4-byte aligned source to a
 * 4-byte aligned target, where n is a multiple of 4.
 *
 * Input:
 *	o0	address of target
 *	o1	address of source
 *	o2	number of bytes to copy (must be a multiple of 4)
 * Output:
 *	o0	address of target (never written by this routine)
 * Caller's registers that have been changed by this function:
 *	o1-o5, g1, g5
 *
 * Register roles inside the routine:
 *	g1	running target pointer (initialized from o0)
 *	o1	running source pointer
 *	o2	bytes remaining (8-byte path); data scratch (word path)
 *	o3-o5	data scratch
 *	g5	end address of source (word path only)
 *
 * Note:
 *	This helper routine will not be used by any 32-bit compilations.
 *	To do so would break binary compatibility with previous versions of
 *	Solaris.
 *
 * Assumptions:
 *	Source and target addresses are 4-byte aligned.
 *	Bytes to be copied are non-overlapping or _exactly_ overlapping.
 *	The number of bytes to be copied is a multiple of 4.
 *	Call will usually be made with a byte count of more than 4*4 and
 *	less than a few hundred bytes.  Legal values are 0 to MAX_SIZE_T.
 *
 * Strategy:
 *	If source and target have the same offset within an 8-byte unit,
 *	copy (after at most one leading word) with 8-byte ldx/stx, 16 bytes
 *	per loop iteration.  Otherwise copy with 4-byte lduw/st: first the
 *	n mod 16 leading bytes, then 16 bytes per loop iteration.
 *
 * Optimization attempt:
 *	Reasonable speed for a generic v9.
 */

#include <sys/asm_linkage.h>

	ENTRY(__align_cpy_4)
	brz,pn	%o2, .finish		! n == 0: nothing to copy.
	cmp	%o0, %o1		! (delay slot) target == source?
	be,pn	%xcc, .finish		! Exactly overlapping: nothing to do.
	and	%o0, 7, %o3		! (delay slot) o3 = target mod 8
	and	%o1, 7, %o4		! o4 = source mod 8
	cmp	%o3, %o4
	bne,pt	%icc, .mixed		! Offsets differ: only one of the two
					! pointers is 8-byte aligned.
	mov	%o0, %g1		! (delay slot) g1 = running target ptr.
	brz,pt	%o3, .aligned8		! Equal offsets of zero: both aligned.
	nop

	! Equal, non-zero offsets: move one word so that both pointers
	! become 8-byte aligned.
	lduw	[%o1], %o3
	subcc	%o2, 4, %o2		! n -= 4
	st	%o3, [%g1]
	bz,pn	%xcc, .finish		! That word was the entire copy.
	add	%g1, 4, %g1		! (delay slot)
	b	.aligned8
	add	%o1, 4, %o1		! (delay slot)

! Source and target are both 8-byte aligned from here on.  Move 16 bytes
! per iteration with a pair of ldx/stx.

	.align	32
.aligned8:
	cmp	%o2, 16
	bl,a,pn	%xcc, .tail		! Fewer than 16 bytes: 4, 8 or 12.
	cmp	%o2, 8			! (annulled unless taken) flags for .tail

	sub	%o2, 12, %o2		! Bias the count by -12 so that "bg"
					! after the subcc below means at least
					! 16 bytes are still left.
.copy16x:
	ldx	[%o1], %o3
	ldx	[%o1+8], %o4
	subcc	%o2, 16, %o2		! Sets the flags tested by bg below.
	stx	%o3, [%g1]
	stx	%o4, [%g1+8]
	add	%o1, 16, %o1
	bg,pt	%xcc, .copy16x		! Another full 16 bytes remain.
	add	%g1, 16, %g1		! (delay slot)

	addcc	%o2, 12, %o2		! Undo the bias: o2 = bytes left.
	bg,a,pt	%xcc, .tail		! 4, 8 or 12 bytes still to copy.
	cmp	%o2, 8			! (annulled unless taken) flags for .tail
	retl
	nop

.tail:					! Flags here come from "cmp %o2, 8".
	bl,a,pn	%xcc, .lastword		! Exactly one word left.
	lduw	[%o1], %o3		! (annulled unless taken)

	ldx	[%o1], %o3		! 8 or 12 left: move 8 of them.
	stx	%o3, [%g1]
	add	%o1, 8, %o1
	add	%g1, 8, %g1
	subcc	%o2, 8, %o2
	bg,a,pn	%xcc, .lastword		! One more word after the 8.
	lduw	[%o1], %o3		! (annulled unless taken)

	retl
	nop

.lastword:				! o3 already holds the final word.
	st	%o3, [%g1]

.finish:
	retl
	nop

! Only one of source and target is 8-byte aligned, so every access is a
! 4-byte lduw/st.  The leading n mod 16 bytes go first; the remainder is a
! multiple of 16 and is handled by .copy16w.  The loop terminates on the
! source pointer reaching g5, which frees o2 for use as a data register.

	.align	32			! Ultra cache line boundary.
.mixed:
	add	%o1, %o2, %g5		! g5 = end address of source.
	andcc	%o2, 15, %o3		! o3 = n mod 16
	bz,pn	%xcc, .copy16w		! No leading part: n is a multiple of 16.
	cmp	%o3, 4			! (delay slot)
	bz,pn	%xcc, .lead4
	cmp	%o3, 8			! (delay slot)
	bz,pn	%xcc, .lead8
	cmp	%o3, 12			! (delay slot)
	bz,pt	%xcc, .lead12
	nop
	illtrap	0			! Size not valid (not a multiple of 4).

.lead4:					! 4 leading bytes, then 16-byte groups.
	lduw	[%o1], %o2
	add	%o1, 4, %o1
	st	%o2, [%g1]
	cmp	%o1, %g5
	bl,a,pt	%xcc, .copy16w		! Source end not reached yet.
	add	%g1, 4, %g1		! (annulled unless taken)
	retl
	nop
.lead8:					! 8 leading bytes, then 16-byte groups.
	lduw	[%o1], %o2
	lduw	[%o1+4], %o3
	add	%o1, 8, %o1
	st	%o2, [%g1]
	st	%o3, [%g1+4]
	cmp	%o1, %g5
	bl,a,pt	%xcc, .copy16w		! Source end not reached yet.
	add	%g1, 8, %g1		! (annulled unless taken)
	retl
	nop
.lead12:				! 12 leading bytes, then 16-byte groups.
	lduw	[%o1], %o2
	lduw	[%o1+4], %o3
	lduw	[%o1+8], %o4
	add	%o1, 12, %o1
	st	%o2, [%g1]
	st	%o3, [%g1+4]
	st	%o4, [%g1+8]
	cmp	%o1, %g5
	bl,a,pt	%xcc, .copy16w		! Source end not reached yet.
	add	%g1, 12, %g1		! (annulled unless taken)
	retl
	nop
	.align	32			! Ultra cache line boundary.
.copy16w:				! 16 bytes per iteration, word at a time.
	lduw	[%o1], %o2
	lduw	[%o1+4], %o3
	lduw	[%o1+8], %o4
	lduw	[%o1+12], %o5
	add	%o1, 16, %o1
	st	%o2, [%g1]
	st	%o3, [%g1+4]
	cmp	%o1, %g5		! Stores do not disturb these flags.
	st	%o4, [%g1+8]
	st	%o5, [%g1+12]
	bl,a,pt	%xcc, .copy16w		! Source end not reached yet.
	add	%g1, 16, %g1		! (annulled unless taken)
	retl				! Target address is already in o0.
	nop

	SET_SIZE(__align_cpy_4)