/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2002 Advanced Micro Devices, Inc.
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the
 * following conditions are met:
 *
 * + Redistributions of source code must retain the above
 *   copyright notice, this list of conditions and the
 *   following disclaimer.
 *
 * + Redistributions in binary form must reproduce the above
 *   copyright notice, this list of conditions and the
 *   following disclaimer in the documentation and/or other
 *   materials provided with the distribution.
 *
 * + Neither the name of Advanced Micro Devices, Inc. nor the
 *   names of its contributors may be used to endorse or
 *   promote products derived from this software without
 *   specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
 * CONTRIBUTORS AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL ADVANCED MICRO DEVICES,
 * INC. OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * It is licensee's responsibility to comply with any export
 * regulations applicable in licensee's jurisdiction.
 */

	.ident	"%Z%%M% %I% %E% SMI"

	.file	"%M%"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memset,function)

#include "SYS.h"
#include "cache.h"

	ANSI_PRAGMA_WEAK2(_private_memset,memset,function)

/*
 * LABEL(s) forms the name of a label private to this routine by
 * appending s to ".memset".  NOTE(review): the empty comment between
 * ".memset" and "s" presumably relies on a traditional (pre-ANSI)
 * preprocessor removing it without inserting a space -- confirm the
 * build uses such a preprocessor for this file.
 */
#define	LABEL(s) .memset/**/s

/*
 * void *memset(void *s, int c, size_t n)
 *
 * Syntax: AT&T operand order (source, destination).
 *
 * Registers as the code below uses them:
 *   In:      %rdi = destination address
 *            %sil = fill byte (the rest of %rsi is discarded)
 *            %rdx = number of bytes to store
 *   Out:     %rax = original destination address
 *   Changes: %rcx, %rdx, %rsi, %rdi, %r8 and the flags
 *
 * The fill byte is replicated into all eight bytes of %rsi, then the
 * store strategy is chosen by size:
 *
 *   n < 64         LABEL(1): one store group per set bit of the count.
 *   64 <= n <= 256 LABEL(32): 32 bytes per step, then LABEL(1).
 *   n > 256        Store single bytes until %rdi is 8-byte aligned,
 *                  then take %r8 = min(.amd64cache2, bytes left):
 *                    bytes left <  2048: LABEL(64), 64 bytes per step;
 *                    bytes left >= 2048: LABEL(fast), rep stosq.
 *                  Whatever exceeds %r8 is written by LABEL(NT) with
 *                  movnti, 128 bytes per step, followed by mfence;
 *                  the final < 128 bytes go through LABEL(32).
 *
 * NOTE(review): _sref_, .amd64cache1 and .amd64cache2 are not defined
 * in this file; presumably they come from "cache.h" and describe cache
 * sizes -- confirm there.
 * NOTE(review): every return is written "rep; ret"; presumably this is
 * the two-byte return form recommended for AMD branch prediction --
 * confirm against the AMD optimization guide.
 */
	ENTRY(memset)			/* void *memset(void *, int, size_t) */

	mov    $0x0101010101010101, %rcx	/* 0x01 in each of the 8 bytes */
	movzx  %sil, %rsi			/* keep only the fill byte */
	imul   %rcx, %rsi			/* replicate 8 times */

LABEL(try1):
	cmp    $64, %rdx
	mov    %rdi, %rax	/* return memory block address (even for bzero ()) */
	jae    LABEL(1after)	/* mov left the cmp flags intact: n >= 64 */

/*
 * Tail stores.  Only bits 0..5 of %dl are examined: each set bit
 * (1, 2, 4, 8, 16, 32) stores that many bytes and advances %rdi.
 * Also entered from LABEL(32skip) and LABEL(fastskip) with the count
 * already masked.
 */
LABEL(1):				/* 1-byte */
	test   $1, %dl
	jz     LABEL(1a)

	mov    %sil, (%rdi)
	inc    %rdi

LABEL(1a):
	test   $2, %dl
	jz     LABEL(1b)

	mov    %si, (%rdi)
	add    $2, %rdi

LABEL(1b):
	test   $4, %dl
	jz     LABEL(1c)

	mov    %esi, (%rdi)
	add    $4, %rdi

LABEL(1c):
	test   $8, %dl
	jz     LABEL(1d)

	mov    %rsi, (%rdi)
	add    $8, %rdi

LABEL(1d):
	test   $16, %dl
	jz     LABEL(1e)

	mov    %rsi,   (%rdi)
	mov    %rsi, 8 (%rdi)
	add    $16, %rdi

LABEL(1e):

	test   $32, %dl
	jz     LABEL(1f)

	mov    %rsi,    (%rdi)
	mov    %rsi,  8 (%rdi)
	mov    %rsi, 16 (%rdi)
	mov    %rsi, 24 (%rdi)
/*	add    $32, %rdi	   (disabled: nothing is stored after this group) */

LABEL(1f):

LABEL(exit):
	rep
	ret

	.p2align 4

LABEL(1after):

LABEL(32try):
	cmp    $256, %rdx
	ja     LABEL(32after)		/* more than 256 bytes: align first */

/*
 * 32 bytes per step, unrolled twice.  %ecx = count / 32.  Also entered
 * from LABEL(64skip) and LABEL(NTskip) with %edx already masked.
 */
LABEL(32):				/* 32-byte */
	mov    %edx, %ecx
	shr    $5, %ecx
	jz     LABEL(32skip)		/* fewer than 32 bytes left */

	.p2align 4

LABEL(32loop):
	dec    %ecx

	mov    %rsi,    (%rdi)
	mov    %rsi,  8 (%rdi)
	mov    %rsi, 16 (%rdi)
	mov    %rsi, 24 (%rdi)

	lea    32 (%rdi), %rdi		/* lea and mov leave the dec flags intact */

	jz     LABEL(32skip)		/* tests the dec above */

	dec    %ecx

	mov    %rsi,    (%rdi)
	mov    %rsi,  8 (%rdi)
	mov    %rsi, 16 (%rdi)
	mov    %rsi, 24 (%rdi)

	lea    32 (%rdi), %rdi

	jnz    LABEL(32loop)		/* tests the second dec */

	.p2align 4

LABEL(32skip):
	and    $31, %edx		/* bytes left over after the 32-byte steps */
	jnz    LABEL(1)

	rep
	ret

	.p2align 4

LABEL(32after):

	/* 3DNow: use prefetch */
	prefetchnta _sref_(.amd64cache1)	/* improves test further ahead on B0 */

LABEL(aligntry):
	mov    %edi, %ecx		/* align by destination */

	and    $7, %ecx			/* skip if already aligned */
	jz     LABEL(alignafter)

/*
 * %rcx = destination address mod 8 (1..7 here).  The lea computes
 * %rdx = %rdx - (8 - %rcx), the count left once the destination is
 * aligned; the sub turns %ecx into -(8 - %rcx), which the loop counts
 * up to zero while storing one byte per pass.
 */
LABEL(align):				/* align */
	lea    -8 (%rcx, %rdx), %rdx
	sub    $8, %ecx

	.p2align 4

LABEL(alignloop):
	inc    %ecx

	mov    %sil, (%rdi)
	lea    1 (%rdi), %rdi		/* flags from inc are still live */

	jnz    LABEL(alignloop)

	.p2align 4

LABEL(alignafter):
	mov    _sref_(.amd64cache2), %r8
	cmp    %rdx, %r8
	cmova  %rdx, %r8		/* %r8 = min(.amd64cache2, %rdx), unsigned */

	cmp    $2048, %rdx		/* this is slow for some block sizes */
	jb     LABEL(64)

/*
 * rep stosq path: store %r8 / 8 quadwords.  stosq stores %rax, so the
 * fill pattern and the return value trade places around it.  %r8 is
 * rounded down to the number of bytes actually written.
 */
LABEL(fast):				/* microcode */
	mov    %r8, %rcx
	and    $-8, %r8
	shr    $3, %rcx
/*	jz     LABEL(fastskip)	   (disabled) */

	xchg   %rax, %rsi

	rep
	stosq

	xchg   %rax, %rsi

LABEL(fastskip):
	sub    %r8, %rdx		/* bytes still to be stored */
	ja     LABEL(64after)		/* taken whenever any are left */

	/*
	 * NOTE(review): %r8 never exceeds %rdx, so falling through means
	 * %rdx is zero and the jnz below looks never taken -- confirm.
	 */
	and    $7, %edx
	jnz    LABEL(1)

	rep
	ret

	.p2align 4

LABEL(64try):

/*
 * 64 bytes per step for %r8 bytes rounded down to a multiple of 64.
 * One step is peeled off ahead of the loop, and both the peeled step
 * and the loop decrement %rcx before the loop's jnz tests it.
 * NOTE(review): this needs %rcx >= 2 after the shift, i.e. %r8 >= 128.
 * %rdx is at least 250 here (more than 256 on entry, at most 7 bytes
 * used for alignment), so this holds as long as .amd64cache2 is at
 * least 128 -- confirm that value's lower bound.
 */
LABEL(64):				/* 64-byte */
	mov    %r8, %rcx
	and    $-64, %r8
	shr    $6, %rcx

	dec    %rcx			/* this iteration starts the prefetcher sooner */

	mov    %rsi,    (%rdi)
	mov    %rsi,  8 (%rdi)
	mov    %rsi, 16 (%rdi)
	mov    %rsi, 24 (%rdi)
	mov    %rsi, 32 (%rdi)
	mov    %rsi, 40 (%rdi)
	mov    %rsi, 48 (%rdi)
	mov    %rsi, 56 (%rdi)

	lea    64 (%rdi), %rdi

	.p2align 4

LABEL(64loop):
	dec    %rcx

	mov    %rsi,    (%rdi)
	mov    %rsi,  8 (%rdi)
	mov    %rsi, 16 (%rdi)
	mov    %rsi, 24 (%rdi)
	mov    %rsi, 32 (%rdi)
	mov    %rsi, 40 (%rdi)
	mov    %rsi, 48 (%rdi)
	mov    %rsi, 56 (%rdi)

	lea    64 (%rdi), %rdi		/* flags from dec are still live */

	jnz    LABEL(64loop)

LABEL(64skip):
	sub    %r8, %rdx		/* bytes still to be stored */
	ja     LABEL(64after)		/* taken whenever any are left */

	/*
	 * NOTE(review): as at LABEL(fastskip), %rdx is zero when this is
	 * reached, so the jnz below looks never taken -- confirm.
	 */
	and    $63, %edx
	jnz    LABEL(32)

	rep
	ret

	.p2align 4

LABEL(64after):

LABEL(NTtry):

/*
 * Remainder path: 128 bytes per step using movnti.  Skipped entirely
 * when fewer than 128 bytes are left.
 */
LABEL(NT):				/* 128-byte */
	mov    %rdx, %rcx
	shr    $7, %rcx
	jz     LABEL(NTskip)

	.p2align 4

LABEL(NTloop):				/* on an MP system it would be better to prefetchnta 320 (%rdi) and 384 (%rdi) here, but not so on an 1P system */
	dec    %rcx

	movnti %rsi,     (%rdi)
	movnti %rsi,   8 (%rdi)
	movnti %rsi,  16 (%rdi)
	movnti %rsi,  24 (%rdi)
	movnti %rsi,  32 (%rdi)
	movnti %rsi,  40 (%rdi)
	movnti %rsi,  48 (%rdi)
	movnti %rsi,  56 (%rdi)
	movnti %rsi,  64 (%rdi)
	movnti %rsi,  72 (%rdi)
	movnti %rsi,  80 (%rdi)
	movnti %rsi,  88 (%rdi)
	movnti %rsi,  96 (%rdi)
	movnti %rsi, 104 (%rdi)
	movnti %rsi, 112 (%rdi)
	movnti %rsi, 120 (%rdi)

	lea    128 (%rdi), %rdi		/* flags from dec are still live */

	jnz    LABEL(NTloop)

	mfence				/* executed once, after the last movnti step */

LABEL(NTskip):
	and    $127, %edx		/* bytes left over after the 128-byte steps */
	jnz    LABEL(32)

	rep
	ret

	SET_SIZE(memset)