1/* 2 * Copyright 2004 Sun Microsystems, Inc. All rights reserved. 3 * Use is subject to license terms. 4 */ 5 6/* 7 * Copyright (c) 2002 Advanced Micro Devices, Inc. 8 * 9 * All rights reserved. 10 * 11 * Redistribution and use in source and binary forms, with or 12 * without modification, are permitted provided that the 13 * following conditions are met: 14 * 15 * + Redistributions of source code must retain the above 16 * copyright notice, this list of conditions and the 17 * following disclaimer. 18 * 19 * + Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the 21 * following disclaimer in the documentation and/or other 22 * materials provided with the distribution. 23 * 24 * + Neither the name of Advanced Micro Devices, Inc. nor the 25 * names of its contributors may be used to endorse or 26 * promote products derived from this software without 27 * specific prior written permission. 28 * 29 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND 30 * CONTRIBUTORS AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, 31 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 32 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 33 * DISCLAIMED. IN NO EVENT SHALL ADVANCED MICRO DEVICES, 34 * INC. OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 35 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 36 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 37 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 38 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 39 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 40 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 41 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 42 * POSSIBILITY OF SUCH DAMAGE. 43 * 44 * It is licensee's responsibility to comply with any export 45 * regulations applicable in licensee's jurisdiction. 
*/

	.file	"memcmp.s"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memcmp,function)

#include "SYS.h"
#include "cache.h"

/*
 * File-local label helper: LABEL(x) expands to .memcmpx by pasting the
 * argument onto the .memcmp prefix through an empty comment.
 */
#define LABEL(s) .memcmp/**/s

/*
 * int memcmp(const void *s1, const void *s2, size_t n)
 *
 * Syntax: AT&T, run through the C preprocessor.
 *
 * In:
 *	%rdi = s1
 *	%rsi = s2
 *	%rdx = n (number of bytes to compare, full 64-bit value)
 * Out:
 *	%eax = 0 when the first n bytes are equal, otherwise
 *	       (unsigned byte at s1) - (unsigned byte at s2) taken at the
 *	       first position where the buffers differ.
 * Clobbers:
 *	%rax, %rcx, %rdx, %rsi, %rdi, %r8-%r11, flags.
 *	No stack is used and no callee-saved register is touched.
 *
 * Structure: the work is split into stages selected by the remaining
 * byte count in %rdx.
 *
 *	n < 8		byte loop
 *	n < 32		8-byte loop, then byte loop for the tail
 *	n <= 2048	32-byte loop, then the 8-byte stage
 *	n > 2048	advance until %rsi is 8-byte aligned, then a 64-byte
 *			loop, a 64-byte loop with prefetch, and a 128-byte
 *			loop with prefetch
 *
 * The wide loops only detect that a chunk differs. They then jump back
 * to a narrower stage without advancing the pointers, and the narrower
 * stage rescans the chunk; the byte loop finally computes the result.
 *
 * Because the wide loops re-enter the 32-byte stage with whatever count
 * is left in %rdx, the 32-byte and 8-byte stages must compute their
 * iteration counts from all 64 bits of %rdx. Using only the low 32 bits
 * would make them stop early (and possibly report equality) when 4 GiB
 * or more remain and the low 32 bits of the count happen to be small.
 *
 * .amd64cache1half and .amd64cache2half are not defined in this file;
 * presumably they hold half of the L1 and L2 data cache sizes and are
 * reached through _sref_ from the included headers - TODO confirm.
 */
	ENTRY(memcmp)		/* (const void *, const void*, size_t) */

LABEL(try1):
	cmp	$8, %rdx
	jae	LABEL(1after)		/* 8 or more bytes: try wider stages */

LABEL(1):				/* 1-byte */
	test	%rdx, %rdx
	mov	$0, %eax		/* mov, not xor: keeps ZF from test */
	jz	LABEL(exit)		/* nothing left: return 0 */

LABEL(1loop):
	movzbl	(%rdi), %eax
	movzbl	(%rsi), %ecx
	sub	%ecx, %eax		/* eax = *s1 - *s2 as unsigned bytes */
	jnz	LABEL(exit)		/* difference found: eax is the result */

	dec	%rdx

	lea	1 (%rdi), %rdi		/* lea leaves the flags from dec intact */
	lea	1 (%rsi), %rsi

	jnz	LABEL(1loop)
					/* count exhausted: eax is 0 here */
LABEL(exit):
	rep				/* NOTE(review): presumably the two-byte */
	ret				/* return form for a branch-target ret */

	.p2align 4

LABEL(1after):

LABEL(8try):
	cmp	$32, %rdx
	jae	LABEL(8after)

LABEL(8):				/* 8-byte */
	mov	%rdx, %rcx		/* 64-bit count: %rdx may exceed 4 GiB */
	shr	$3, %rcx		/* rcx = number of whole quadwords */
	jz	LABEL(1)

	.p2align 4

LABEL(8loop):
	mov	(%rsi), %rax
	cmp	(%rdi), %rax
	jne	LABEL(1)		/* byte loop locates the differing byte */

	sub	$8, %rdx
	dec	%rcx

	lea	8 (%rsi), %rsi
	lea	8 (%rdi), %rdi

	jnz	LABEL(8loop)		/* tests ZF from dec */

LABEL(8skip):
	and	$7, %edx		/* rdx < 8 here; sets ZF if no tail */
	jnz	LABEL(1)

	xor	%eax, %eax
	ret

	.p2align 4

LABEL(8after):

LABEL(32try):
	cmp	$2048, %rdx
	ja	LABEL(32after)

LABEL(32):				/* 32-byte */
	mov	%rdx, %rcx		/* 64-bit count: %rdx may exceed 4 GiB */
	shr	$5, %rcx		/* rcx = number of whole 32-byte chunks */
	jz	LABEL(8)

	.p2align 4

LABEL(32loop):
	mov	(%rsi), %rax
	mov	8 (%rsi), %r8
	mov	16 (%rsi), %r9
	mov	24 (%rsi), %r10
	sub	(%rdi), %rax
	sub	8 (%rdi), %r8
	sub	16 (%rdi), %r9
	sub	24 (%rdi), %r10

	or	%rax, %r8		/* fold the four differences together */
	or	%r9, %r10
	or	%r8, %r10
	jnz	LABEL(8)		/* chunk differs: rescan it by quadword */

	sub	$32, %rdx
	dec	%rcx

	lea	32 (%rsi), %rsi
	lea	32 (%rdi), %rdi

	jnz	LABEL(32loop)		/* tests ZF from dec */

LABEL(32skip):
	and	$31, %edx		/* rdx < 32 here; sets ZF if no tail */
	jnz	LABEL(8)

	xor	%eax, %eax
	ret

	.p2align 4

LABEL(32after):

	prefetchnta _sref_(.amd64cache1half)	/* 3DNow: use prefetch */

LABEL(srctry):
	mov	%esi, %r8d		/* align by source */

	and	$7, %r8d
	jz	LABEL(srcafter)		/* not unaligned */

LABEL(src):				/* align */
	lea	-8 (%r8, %rdx), %rdx	/* rdx -= 8 - (rsi & 7) */
	sub	$8, %r8d		/* r8d = -(bytes up to alignment) */


LABEL(srcloop):
	movzbl	(%rdi), %eax
	movzbl	(%rsi), %ecx
	sub	%ecx, %eax
	jnz	LABEL(exit)		/* difference found: eax is the result */

	inc	%r8d			/* counts up towards zero */

	lea	1 (%rdi), %rdi
	lea	1 (%rsi), %rsi

	jnz	LABEL(srcloop)		/* tests ZF from inc */

	.p2align 4

LABEL(srcafter):

LABEL(64try):
	mov	_sref_(.amd64cache1half), %rcx
	cmp	%rdx, %rcx
	cmova	%rdx, %rcx		/* rcx = min(.amd64cache1half, rdx) */

LABEL(64):				/* 64-byte */
	shr	$6, %rcx
	jz	LABEL(32)

	.p2align 4

LABEL(64loop):
	mov	(%rsi), %rax
	mov	8 (%rsi), %r8
	sub	(%rdi), %rax
	sub	8 (%rdi), %r8
	or	%r8, %rax

	mov	16 (%rsi), %r9
	mov	24 (%rsi), %r10
	sub	16 (%rdi), %r9
	sub	24 (%rdi), %r10
	or	%r10, %r9

	or	%r9, %rax
	jnz	LABEL(32)		/* first half differs: rescan */

	mov	32 (%rsi), %rax
	mov	40 (%rsi), %r8
	sub	32 (%rdi), %rax
	sub	40 (%rdi), %r8
	or	%r8, %rax

	mov	48 (%rsi), %r9
	mov	56 (%rsi), %r10
	sub	48 (%rdi), %r9
	sub	56 (%rdi), %r10
	or	%r10, %r9

	or	%r9, %rax
	jnz	LABEL(32)		/* second half differs: rescan */

	lea	64 (%rsi), %rsi
	lea	64 (%rdi), %rdi

	sub	$64, %rdx
	dec	%rcx
	jnz	LABEL(64loop)

LABEL(64skip):
	cmp	$2048, %rdx
	ja	LABEL(64after)

	test	%edx, %edx		/* rdx <= 2048 here */
	jnz	LABEL(32)

	xor	%eax, %eax
	ret

	.p2align 4

LABEL(64after):

LABEL(pretry):

LABEL(pre):				/* 64-byte prefetching */
	mov	_sref_(.amd64cache2half), %rcx
	cmp	%rdx, %rcx
	cmova	%rdx, %rcx		/* rcx = min(.amd64cache2half, rdx) */

	shr	$6, %rcx
	jz	LABEL(preskip)

	/*
	 * First iteration is peeled; it falls into preloop without
	 * testing the decremented count.
	 * NOTE(review): this relies on the count being at least 2 here,
	 * which depends on the value of .amd64cache2half - confirm.
	 */
	prefetchnta 512 (%rsi)		/* 3DNow: use prefetch */
	prefetchnta 512 (%rdi)		/* 3DNow: use prefetch */

	mov	(%rsi), %rax
	mov	8 (%rsi), %r9
	mov	16 (%rsi), %r10
	mov	24 (%rsi), %r11
	sub	(%rdi), %rax
	sub	8 (%rdi), %r9
	sub	16 (%rdi), %r10
	sub	24 (%rdi), %r11

	or	%r9, %rax
	or	%r11, %r10
	or	%r10, %rax
	jnz	LABEL(32)

	mov	32 (%rsi), %rax
	mov	40 (%rsi), %r9
	mov	48 (%rsi), %r10
	mov	56 (%rsi), %r11
	sub	32 (%rdi), %rax
	sub	40 (%rdi), %r9
	sub	48 (%rdi), %r10
	sub	56 (%rdi), %r11

	or	%r9, %rax
	or	%r11, %r10
	or	%r10, %rax
	jnz	LABEL(32)

	lea	64 (%rsi), %rsi
	lea	64 (%rdi), %rdi

	sub	$64, %rdx
	dec	%rcx

	.p2align 4

LABEL(preloop):
	prefetchnta 512 (%rsi)		/* 3DNow: use prefetch */
	prefetchnta 512 (%rdi)		/* 3DNow: use prefetch */

	mov	(%rsi), %rax
	mov	8 (%rsi), %r9
	mov	16 (%rsi), %r10
	mov	24 (%rsi), %r11
	sub	(%rdi), %rax
	sub	8 (%rdi), %r9
	sub	16 (%rdi), %r10
	sub	24 (%rdi), %r11

	or	%r9, %rax
	or	%r11, %r10
	or	%r10, %rax
	jnz	LABEL(32)

	mov	32 (%rsi), %rax
	mov	40 (%rsi), %r9
	mov	48 (%rsi), %r10
	mov	56 (%rsi), %r11
	sub	32 (%rdi), %rax
	sub	40 (%rdi), %r9
	sub	48 (%rdi), %r10
	sub	56 (%rdi), %r11

	or	%r9, %rax
	or	%r11, %r10
	or	%r10, %rax
	jnz	LABEL(32)

	lea	64 (%rsi), %rsi
	lea	64 (%rdi), %rdi

	sub	$64, %rdx
	dec	%rcx
	jnz	LABEL(preloop)


LABEL(preskip):
	cmp	$2048, %rdx
	ja	LABEL(preafter)

	test	%edx, %edx		/* rdx <= 2048 here */
	jnz	LABEL(32)

	xor	%eax, %eax
	ret

	.p2align 4

LABEL(preafter):

LABEL(128try):

LABEL(128):				/* 128-byte */
	mov	%rdx, %rcx
	shr	$7, %rcx		/* rcx = number of whole 128-byte chunks */
	jz	LABEL(128skip)

	.p2align 4

LABEL(128loop):
	prefetchnta 512 (%rsi)		/* 3DNow: use prefetch */
	prefetchnta 512 (%rdi)		/* 3DNow: use prefetch */

	mov	(%rsi), %rax
	mov	8 (%rsi), %r8
	sub	(%rdi), %rax
	sub	8 (%rdi), %r8
	mov	16 (%rsi), %r9
	mov	24 (%rsi), %r10
	sub	16 (%rdi), %r9
	sub	24 (%rdi), %r10

	or	%r8, %rax
	or	%r9, %r10
	or	%r10, %rax

	mov	32 (%rsi), %r8
	mov	40 (%rsi), %r9
	sub	32 (%rdi), %r8
	sub	40 (%rdi), %r9
	mov	48 (%rsi), %r10
	mov	56 (%rsi), %r11
	sub	48 (%rdi), %r10
	sub	56 (%rdi), %r11

	or	%r9, %r8
	or	%r11, %r10
	or	%r10, %r8

	or	%r8, %rax
	jnz	LABEL(32)		/* first 64 bytes differ: rescan */

	prefetchnta 576 (%rsi)		/* 3DNow: use prefetch */
	prefetchnta 576 (%rdi)		/* 3DNow: use prefetch */

	mov	64 (%rsi), %rax
	mov	72 (%rsi), %r8
	sub	64 (%rdi), %rax
	sub	72 (%rdi), %r8
	mov	80 (%rsi), %r9
	mov	88 (%rsi), %r10
	sub	80 (%rdi), %r9
	sub	88 (%rdi), %r10

	or	%r8, %rax
	or	%r9, %r10
	or	%r10, %rax

	mov	96 (%rsi), %r8
	mov	104 (%rsi), %r9
	sub	96 (%rdi), %r8
	sub	104 (%rdi), %r9
	mov	112 (%rsi), %r10
	mov	120 (%rsi), %r11
	sub	112 (%rdi), %r10
	sub	120 (%rdi), %r11

	or	%r9, %r8
	or	%r11, %r10
	or	%r10, %r8

	or	%r8, %rax
	jnz	LABEL(32)		/* second 64 bytes differ: rescan */

	sub	$128, %rdx
	dec	%rcx

	lea	128 (%rsi), %rsi
	lea	128 (%rdi), %rdi

	jnz	LABEL(128loop)		/* tests ZF from dec */

LABEL(128skip):
	and	$127, %edx		/* rdx < 128 here; sets ZF if no tail */
	jnz	LABEL(32)

	xor	%eax, %eax
	ret

	SET_SIZE(memcmp)