/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2002 Advanced Micro Devices, Inc.
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the
 * following conditions are met:
 *
 * + Redistributions of source code must retain the above
 *   copyright notice, this list of conditions and the
 *   following disclaimer.
 *
 * + Redistributions in binary form must reproduce the above
 *   copyright notice, this list of conditions and the
 *   following disclaimer in the documentation and/or other
 *   materials provided with the distribution.
 *
 * + Neither the name of Advanced Micro Devices, Inc. nor the
 *   names of its contributors may be used to endorse or
 *   promote products derived from this software without
 *   specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
 * CONTRIBUTORS AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL ADVANCED MICRO DEVICES,
 * INC. OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * It is licensee's responsibility to comply with any export
 * regulations applicable in licensee's jurisdiction.
 */

	.ident "%Z%%M% %I% %E% SMI"

	.file "%M%"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memset,function)

#include "SYS.h"
#include "cache.h"

	ANSI_PRAGMA_WEAK2(_private_memset,memset,function)

/*
 * LABEL(x) builds a file-local label by pasting x onto ".memset".  The
 * empty comment inside the definition is the old-style token-paste idiom.
 * NOTE(review): this presumably requires a traditional-mode preprocessor;
 * confirm before changing how this file is preprocessed.
 */
#define LABEL(s) .memset/**/s

/*
 * void *memset(void *dst, int c, size_t n)
 *
 * Store the low byte of c into the n bytes starting at dst; return dst.
 *
 * Register use (arguments arrive in %rdi, %rsi, %rdx):
 *	%rdi	destination cursor, advanced as bytes are stored
 *	%rsi	fill byte replicated into all eight byte lanes
 *	%rdx	number of bytes still to be written
 *	%rax	original dst (the return value)
 *	%rcx	scratch: replication constant, then loop counters
 *	%r8	size of the chunk written with ordinary (cached) stores
 *
 * Strategy by length:
 *	n < 64		bit-tested stores at LABEL(1)
 *	n <= 256	32-byte unrolled loop, then LABEL(1) for the rest
 *	n > 256		align dst to 8 bytes; write up to the value of
 *			.amd64cache2 with ordinary stores (64-byte loop when
 *			fewer than 2048 bytes remain, rep stosq otherwise);
 *			write whatever is left with movnti
 *
 * Ordering constraint: several conditional branches consume the flags
 * set by a dec/inc that sits above a group of mov/lea instructions.
 * mov and lea do not modify flags, which is what makes that legal; do
 * not insert flag-writing instructions between them.
 *
 * Every return is written as "rep" followed by "ret".
 * NOTE(review): presumably the two-byte return form recommended by AMD
 * for a ret that is a branch target; confirm against the AMD guide.
 */
	ENTRY(memset)			/* (void *, int, size_t) */

	mov	$0x0101010101010101, %rcx	/* 0x01 in every byte lane */
	movzx	%sil, %rsi		/* discard all but the fill byte */
	imul	%rcx, %rsi		/* replicate 8 times */

LABEL(try1):
	cmp	$64, %rdx
	mov	%rdi, %rax		/* return memory block address (even for bzero ()) */
	jae	LABEL(1after)		/* mov left the cmp flags intact */

	/*
	 * Fewer than 64 bytes remain.  Each set bit of the count selects
	 * one store group of that size; only the low six bits of %dl are
	 * examined.  The stores are not necessarily naturally aligned.
	 */
LABEL(1):				/* 1-byte */
	test	$1, %dl
	jz	LABEL(1a)

	mov	%sil, (%rdi)
	inc	%rdi

LABEL(1a):
	test	$2, %dl
	jz	LABEL(1b)

	mov	%si, (%rdi)
	add	$2, %rdi

LABEL(1b):
	test	$4, %dl
	jz	LABEL(1c)

	mov	%esi, (%rdi)
	add	$4, %rdi

LABEL(1c):
	test	$8, %dl
	jz	LABEL(1d)

	mov	%rsi, (%rdi)
	add	$8, %rdi

LABEL(1d):
	test	$16, %dl
	jz	LABEL(1e)

	mov	%rsi, (%rdi)
	mov	%rsi, 8 (%rdi)
	add	$16, %rdi

LABEL(1e):

	test	$32, %dl
	jz	LABEL(1f)

	mov	%rsi, (%rdi)
	mov	%rsi, 8 (%rdi)
	mov	%rsi, 16 (%rdi)
	mov	%rsi, 24 (%rdi)
/*	add	$32, %rdi	*/	/* last group: cursor is dead */

LABEL(1f):

LABEL(exit):
	rep
	ret

	.p2align 4

LABEL(1after):

LABEL(32try):
	cmp	$256, %rdx
	ja	LABEL(32after)

	/*
	 * 32-byte blocks.  Also the re-entry point for the remainders
	 * left by the 64-byte and non-temporal loops, which mask %edx
	 * before jumping here.
	 */
LABEL(32):				/* 32-byte */
	mov	%edx, %ecx
	shr	$5, %ecx		/* ecx = number of 32-byte blocks */
	jz	LABEL(32skip)

	.p2align 4

	/* Unrolled twice; each half exits on the flags of its own dec. */
LABEL(32loop):
	dec	%ecx

	mov	%rsi, (%rdi)
	mov	%rsi, 8 (%rdi)
	mov	%rsi, 16 (%rdi)
	mov	%rsi, 24 (%rdi)

	lea	32 (%rdi), %rdi		/* lea, not add: keeps ZF from dec */

	jz	LABEL(32skip)

	dec	%ecx

	mov	%rsi, (%rdi)
	mov	%rsi, 8 (%rdi)
	mov	%rsi, 16 (%rdi)
	mov	%rsi, 24 (%rdi)

	lea	32 (%rdi), %rdi

	jnz	LABEL(32loop)

	.p2align 4

LABEL(32skip):
	and	$31, %edx		/* bytes beyond the last full block */
	jnz	LABEL(1)

	rep
	ret

	.p2align 4

LABEL(32after):

	/* 3DNow: use prefetch */
	/*
	 * NOTE(review): .amd64cache1 and .amd64cache2 are defined outside
	 * this file and reached through the _sref_() macro; presumably they
	 * hold cache-size tunables.  Confirm their meaning in cache.h.
	 */
	prefetchnta _sref_(.amd64cache1)	/* improves test further ahead on B0 */

LABEL(aligntry):
	mov	%edi, %ecx		/* align by destination */

	and	$7, %ecx		/* skip if already aligned */
	jz	LABEL(alignafter)

LABEL(align):				/* align */
	lea	-8 (%rcx, %rdx), %rdx	/* rdx -= 8 - (dst & 7) */
	sub	$8, %ecx		/* ecx = -(bytes needed to align) */

	.p2align 4

	/* Byte stores until ecx counts up to zero (dst is then 8-aligned). */
LABEL(alignloop):
	inc	%ecx

	mov	%sil, (%rdi)
	lea	1 (%rdi), %rdi		/* lea keeps ZF from inc */

	jnz	LABEL(alignloop)

	.p2align 4

LABEL(alignafter):
	mov	_sref_(.amd64cache2), %r8
	cmp	%rdx, %r8
	cmova	%rdx, %r8		/* r8 = min(.amd64cache2, rdx) */

	cmp	$2048, %rdx		/* this is slow for some block sizes */
	jb	LABEL(64)

	/*
	 * rep stosq takes its pattern from %rax, so the return value is
	 * parked in %rsi for the duration and swapped back afterwards.
	 */
LABEL(fast):				/* microcode */
	mov	%r8, %rcx
	and	$-8, %r8		/* r8 = bytes rep stosq will write */
	shr	$3, %rcx		/* rcx = quadword count */
/*	jz	LABEL(fastskip)	*/

	xchg	%rax, %rsi

	rep
	stosq

	xchg	%rax, %rsi

LABEL(fastskip):
	sub	%r8, %rdx
	ja	LABEL(64after)		/* bytes remain: non-temporal path */

	/*
	 * r8 never exceeds rdx, so falling through means rdx is zero and
	 * the branch below cannot be taken; it is kept as in the original.
	 */
	and	$7, %edx
	jnz	LABEL(1)

	rep
	ret

	.p2align 4

LABEL(64try):

	/*
	 * 64-byte blocks with ordinary stores.  The first block is peeled
	 * out of the loop.
	 * NOTE(review): there is no zero check on the block count, so this
	 * path needs r8 >= 128 on entry.  rdx is at least 250 here, so that
	 * holds provided .amd64cache2 is at least 128; confirm.
	 */
LABEL(64):				/* 64-byte */
	mov	%r8, %rcx
	and	$-64, %r8		/* r8 = bytes this loop will write */
	shr	$6, %rcx		/* rcx = number of 64-byte blocks */

	dec	%rcx			/* this iteration starts the prefetcher sooner */

	mov	%rsi, (%rdi)
	mov	%rsi, 8 (%rdi)
	mov	%rsi, 16 (%rdi)
	mov	%rsi, 24 (%rdi)
	mov	%rsi, 32 (%rdi)
	mov	%rsi, 40 (%rdi)
	mov	%rsi, 48 (%rdi)
	mov	%rsi, 56 (%rdi)

	lea	64 (%rdi), %rdi

	.p2align 4

LABEL(64loop):
	dec	%rcx

	mov	%rsi, (%rdi)
	mov	%rsi, 8 (%rdi)
	mov	%rsi, 16 (%rdi)
	mov	%rsi, 24 (%rdi)
	mov	%rsi, 32 (%rdi)
	mov	%rsi, 40 (%rdi)
	mov	%rsi, 48 (%rdi)
	mov	%rsi, 56 (%rdi)

	lea	64 (%rdi), %rdi		/* lea keeps ZF from dec */

	jnz	LABEL(64loop)

LABEL(64skip):
	sub	%r8, %rdx
	ja	LABEL(64after)		/* bytes remain: non-temporal path */

	/* As at fastskip: rdx is zero here, so this branch is not taken. */
	and	$63, %edx
	jnz	LABEL(32)

	rep
	ret

	.p2align 4

LABEL(64after):

LABEL(NTtry):

	/*
	 * Whatever is left after the cached chunk is written in 128-byte
	 * blocks with non-temporal stores (movnti).
	 */
LABEL(NT):				/* 128-byte */
	mov	%rdx, %rcx
	shr	$7, %rcx		/* rcx = number of 128-byte blocks */
	jz	LABEL(NTskip)

	.p2align 4

LABEL(NTloop):				/* on an MP system it would be better to prefetchnta 320 (%rdi) and 384 (%rdi) here, but not so on an 1P system */
	dec	%rcx

	movnti	%rsi, (%rdi)
	movnti	%rsi, 8 (%rdi)
	movnti	%rsi, 16 (%rdi)
	movnti	%rsi, 24 (%rdi)
	movnti	%rsi, 32 (%rdi)
	movnti	%rsi, 40 (%rdi)
	movnti	%rsi, 48 (%rdi)
	movnti	%rsi, 56 (%rdi)
	movnti	%rsi, 64 (%rdi)
	movnti	%rsi, 72 (%rdi)
	movnti	%rsi, 80 (%rdi)
	movnti	%rsi, 88 (%rdi)
	movnti	%rsi, 96 (%rdi)
	movnti	%rsi, 104 (%rdi)
	movnti	%rsi, 112 (%rdi)
	movnti	%rsi, 120 (%rdi)

	lea	128 (%rdi), %rdi	/* lea keeps ZF from dec */

	jnz	LABEL(NTloop)

	mfence				/* fence the non-temporal stores */

LABEL(NTskip):
	and	$127, %edx		/* bytes beyond the last 128-byte block */
	jnz	LABEL(32)

	rep
	ret

	SET_SIZE(memset)