/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2002 Advanced Micro Devices, Inc.
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the
 * following conditions are met:
 *
 * + Redistributions of source code must retain the above
 *   copyright notice, this list of conditions and the
 *   following disclaimer.
 *
 * + Redistributions in binary form must reproduce the above
 *   copyright notice, this list of conditions and the
 *   following disclaimer in the documentation and/or other
 *   materials provided with the distribution.
 *
 * + Neither the name of Advanced Micro Devices, Inc. nor the
 *   names of its contributors may be used to endorse or
 *   promote products derived from this software without
 *   specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
 * CONTRIBUTORS AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL ADVANCED MICRO DEVICES,
 * INC. OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * It is licensee's responsibility to comply with any export
 * regulations applicable in licensee's jurisdiction.
 */

	.ident	"%Z%%M% %I% %E% SMI"

	.file	"%M%"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memcmp,function)

#include "SYS.h"
#include "cache.h"

#define LABEL(s) .memcmp/**/s

/*
 * memcmp(s1, s2, n)
 *
 * Syntax:   AT&T operand order (source, destination), run through cpp.
 * In:       %rdi = s1, %rsi = s2, %rdx = n (bytes still to compare;
 *           kept up to date as the pointers advance).
 * Out:      %eax = 0 when all n bytes match, otherwise the zero-extended
 *           byte at s1 minus the zero-extended byte at s2 at the first
 *           position where they differ.
 * Scratch:  %rax, %rcx, %r8, %r9, %r10, %r11 and the flags.
 *
 * The code is a ladder of progressively wider compare loops chosen by n:
 *
 *   n <  8      byte loop                          LABEL(1)
 *   n <  32     8-byte loop                        LABEL(8)
 *   n <= 2048   32-byte loop                       LABEL(32)
 *   n >  2048   advance byte-wise until %rsi is 8-byte aligned, then
 *               a 64-byte loop bounded by .amd64cache1half, a 64-byte
 *               loop with prefetchnta bounded by .amd64cache2half, and
 *               finally a 128-byte loop with prefetchnta.
 *
 * The wide loops only detect that a block differs (they OR together the
 * differences of its words).  On a mismatch, or when fewer bytes remain
 * than the loop's width, control drops to the next narrower stage with
 * the pointers still at the start of the block; the byte loop is what
 * finally produces the return value.
 *
 * NOTE(review): .amd64cache1half / .amd64cache2half and _sref_() are not
 * defined in this file (presumably they come from cache.h) -- confirm
 * their units and initial values there.
 *
 * The narrow stages can be entered from the wide ones with an arbitrarily
 * large count in %rdx, so every loop counter is computed from the full
 * 64-bit %rdx.  Deriving it from %edx would make the 8/32-byte stages
 * stop early (and then report equality) for lengths of 4 GiB and above.
 */

	ENTRY(memcmp)		/* (const void *, const void*, size_t) */

LABEL(try1):
	cmp	$8, %rdx
	jae	LABEL(1after)

LABEL(1):				/* 1-byte */
	test	%rdx, %rdx
	mov	$0, %eax		/* mov keeps ZF from the test above */
	jz	LABEL(exit)		/* nothing left: return 0 */

LABEL(1loop):
	movzbl	(%rdi), %eax
	movzbl	(%rsi), %ecx
	sub	%ecx, %eax		/* %eax = *s1 - *s2, also the result */
	jnz	LABEL(exit)

	dec	%rdx

	lea	1 (%rdi), %rdi		/* lea leaves the flags from dec intact */
	lea	1 (%rsi), %rsi

	jnz	LABEL(1loop)

LABEL(exit):
	rep
	ret

	.p2align 4

LABEL(1after):

LABEL(8try):
	cmp	$32, %rdx
	jae	LABEL(8after)

LABEL(8):				/* 8-byte */
	mov	%rdx, %rcx		/* full 64-bit count, see header */
	shr	$3, %rcx		/* %rcx = number of whole 8-byte words */
	jz	LABEL(1)

	.p2align 4

LABEL(8loop):
	mov	(%rsi), %rax
	cmp	(%rdi), %rax
	jne	LABEL(1)		/* let the byte loop locate the difference */

	sub	$8, %rdx
	dec	%rcx

	lea	8 (%rsi), %rsi
	lea	8 (%rdi), %rdi

	jnz	LABEL(8loop)

LABEL(8skip):
	and	$7, %edx		/* %rdx < 8 here; any tail bytes left? */
	jnz	LABEL(1)

	xor	%eax, %eax
	ret

	.p2align 4

LABEL(8after):

LABEL(32try):
	cmp	$2048, %rdx
	ja	LABEL(32after)

LABEL(32):				/* 32-byte */
	mov	%rdx, %rcx		/* full 64-bit count, see header */
	shr	$5, %rcx		/* %rcx = number of whole 32-byte blocks */
	jz	LABEL(8)

	.p2align 4

LABEL(32loop):
	mov	(%rsi), %rax
	mov	8 (%rsi), %r8
	mov	16 (%rsi), %r9
	mov	24 (%rsi), %r10
	sub	(%rdi), %rax
	sub	8 (%rdi), %r8
	sub	16 (%rdi), %r9
	sub	24 (%rdi), %r10

	or	%rax, %r8		/* non-zero if any of the 4 words differ */
	or	%r9, %r10
	or	%r8, %r10
	jnz	LABEL(8)

	sub	$32, %rdx
	dec	%rcx

	lea	32 (%rsi), %rsi
	lea	32 (%rdi), %rdi

	jnz	LABEL(32loop)

LABEL(32skip):
	and	$31, %edx		/* %rdx < 32 here; any tail bytes left? */
	jnz	LABEL(8)

	xor	%eax, %eax
	ret

	.p2align 4

LABEL(32after):

	prefetchnta _sref_(.amd64cache1half)	/* 3DNow: use prefetch */

LABEL(srctry):
	mov	%esi, %r8d		/* align by source */

	and	$7, %r8d
	jz	LABEL(srcafter)		/* not unaligned */

LABEL(src):				/* align */
	lea	-8 (%r8, %rdx), %rdx	/* n -= 8 - (s2 & 7) */
	sub	$8, %r8d		/* %r8d = -(bytes up to the boundary) */


LABEL(srcloop):
	movzbl	(%rdi), %eax
	movzbl	(%rsi), %ecx
	sub	%ecx, %eax
	jnz	LABEL(exit)

	inc	%r8d			/* counts up towards zero */

	lea	1 (%rdi), %rdi
	lea	1 (%rsi), %rsi

	jnz	LABEL(srcloop)

	.p2align 4

LABEL(srcafter):

LABEL(64try):
	mov	_sref_(.amd64cache1half), %rcx
	cmp	%rdx, %rcx
	cmova	%rdx, %rcx		/* %rcx = min(.amd64cache1half, n) */

LABEL(64):				/* 64-byte */
	shr	$6, %rcx
	jz	LABEL(32)

	.p2align 4

LABEL(64loop):
	mov	(%rsi), %rax
	mov	8 (%rsi), %r8
	sub	(%rdi), %rax
	sub	8 (%rdi), %r8
	or	%r8, %rax

	mov	16 (%rsi), %r9
	mov	24 (%rsi), %r10
	sub	16 (%rdi), %r9
	sub	24 (%rdi), %r10
	or	%r10, %r9

	or	%r9, %rax
	jnz	LABEL(32)

	mov	32 (%rsi), %rax
	mov	40 (%rsi), %r8
	sub	32 (%rdi), %rax
	sub	40 (%rdi), %r8
	or	%r8, %rax

	mov	48 (%rsi), %r9
	mov	56 (%rsi), %r10
	sub	48 (%rdi), %r9
	sub	56 (%rdi), %r10
	or	%r10, %r9

	or	%r9, %rax
	jnz	LABEL(32)

	lea	64 (%rsi), %rsi
	lea	64 (%rdi), %rdi

	sub	$64, %rdx
	dec	%rcx
	jnz	LABEL(64loop)

LABEL(64skip):
	cmp	$2048, %rdx
	ja	LABEL(64after)

	test	%edx, %edx		/* %rdx <= 2048 here, so %edx == %rdx */
	jnz	LABEL(32)

	xor	%eax, %eax
	ret

	.p2align 4

LABEL(64after):

LABEL(pretry):

LABEL(pre):				/* 64-byte prefetching */
	mov	_sref_(.amd64cache2half), %rcx
	cmp	%rdx, %rcx
	cmova	%rdx, %rcx		/* %rcx = min(.amd64cache2half, n) */

	shr	$6, %rcx
	jz	LABEL(preskip)

	/* First iteration, peeled out of the loop below. */
	prefetchnta 512 (%rsi)		/* 3DNow: use prefetch */
	prefetchnta 512 (%rdi)		/* 3DNow: use prefetch */

	mov	(%rsi), %rax
	mov	8 (%rsi), %r9
	mov	16 (%rsi), %r10
	mov	24 (%rsi), %r11
	sub	(%rdi), %rax
	sub	8 (%rdi), %r9
	sub	16 (%rdi), %r10
	sub	24 (%rdi), %r11

	or	%r9, %rax
	or	%r11, %r10
	or	%r10, %rax
	jnz	LABEL(32)

	mov	32 (%rsi), %rax
	mov	40 (%rsi), %r9
	mov	48 (%rsi), %r10
	mov	56 (%rsi), %r11
	sub	32 (%rdi), %rax
	sub	40 (%rdi), %r9
	sub	48 (%rdi), %r10
	sub	56 (%rdi), %r11

	or	%r9, %rax
	or	%r11, %r10
	or	%r10, %rax
	jnz	LABEL(32)

	lea	64 (%rsi), %rsi
	lea	64 (%rdi), %rdi

	sub	$64, %rdx
	dec	%rcx
	/*
	 * The peeled iteration may have been the only one; without this
	 * check the loop below would run with a wrapped-around counter.
	 */
	jz	LABEL(preskip)

	.p2align 4

LABEL(preloop):
	prefetchnta 512 (%rsi)		/* 3DNow: use prefetch */
	prefetchnta 512 (%rdi)		/* 3DNow: use prefetch */

	mov	(%rsi), %rax
	mov	8 (%rsi), %r9
	mov	16 (%rsi), %r10
	mov	24 (%rsi), %r11
	sub	(%rdi), %rax
	sub	8 (%rdi), %r9
	sub	16 (%rdi), %r10
	sub	24 (%rdi), %r11

	or	%r9, %rax
	or	%r11, %r10
	or	%r10, %rax
	jnz	LABEL(32)

	mov	32 (%rsi), %rax
	mov	40 (%rsi), %r9
	mov	48 (%rsi), %r10
	mov	56 (%rsi), %r11
	sub	32 (%rdi), %rax
	sub	40 (%rdi), %r9
	sub	48 (%rdi), %r10
	sub	56 (%rdi), %r11

	or	%r9, %rax
	or	%r11, %r10
	or	%r10, %rax
	jnz	LABEL(32)

	lea	64 (%rsi), %rsi
	lea	64 (%rdi), %rdi

	sub	$64, %rdx
	dec	%rcx
	jnz	LABEL(preloop)


LABEL(preskip):
	cmp	$2048, %rdx
	ja	LABEL(preafter)

	test	%edx, %edx		/* %rdx <= 2048 here, so %edx == %rdx */
	jnz	LABEL(32)

	xor	%eax, %eax
	ret

	.p2align 4

LABEL(preafter):

LABEL(128try):

LABEL(128):				/* 128-byte */
	mov	%rdx, %rcx
	shr	$7, %rcx		/* %rcx = number of whole 128-byte blocks */
	jz	LABEL(128skip)

	.p2align 4

LABEL(128loop):
	prefetchnta 512 (%rsi)		/* 3DNow: use prefetch */
	prefetchnta 512 (%rdi)		/* 3DNow: use prefetch */

	mov	(%rsi), %rax
	mov	8 (%rsi), %r8
	sub	(%rdi), %rax
	sub	8 (%rdi), %r8
	mov	16 (%rsi), %r9
	mov	24 (%rsi), %r10
	sub	16 (%rdi), %r9
	sub	24 (%rdi), %r10

	or	%r8, %rax
	or	%r9, %r10
	or	%r10, %rax

	mov	32 (%rsi), %r8
	mov	40 (%rsi), %r9
	sub	32 (%rdi), %r8
	sub	40 (%rdi), %r9
	mov	48 (%rsi), %r10
	mov	56 (%rsi), %r11
	sub	48 (%rdi), %r10
	sub	56 (%rdi), %r11

	or	%r9, %r8
	or	%r11, %r10
	or	%r10, %r8

	or	%r8, %rax		/* first 64 bytes of the block */
	jnz	LABEL(32)

	prefetchnta 576 (%rsi)		/* 3DNow: use prefetch */
	prefetchnta 576 (%rdi)		/* 3DNow: use prefetch */

	mov	64 (%rsi), %rax
	mov	72 (%rsi), %r8
	sub	64 (%rdi), %rax
	sub	72 (%rdi), %r8
	mov	80 (%rsi), %r9
	mov	88 (%rsi), %r10
	sub	80 (%rdi), %r9
	sub	88 (%rdi), %r10

	or	%r8, %rax
	or	%r9, %r10
	or	%r10, %rax

	mov	96 (%rsi), %r8
	mov	104 (%rsi), %r9
	sub	96 (%rdi), %r8
	sub	104 (%rdi), %r9
	mov	112 (%rsi), %r10
	mov	120 (%rsi), %r11
	sub	112 (%rdi), %r10
	sub	120 (%rdi), %r11

	or	%r9, %r8
	or	%r11, %r10
	or	%r10, %r8

	or	%r8, %rax		/* second 64 bytes of the block */
	jnz	LABEL(32)

	sub	$128, %rdx
	dec	%rcx

	lea	128 (%rsi), %rsi
	lea	128 (%rdi), %rdi

	jnz	LABEL(128loop)

LABEL(128skip):
	and	$127, %edx		/* %rdx < 128 here; any tail bytes left? */
	jnz	LABEL(32)

	xor	%eax, %eax
	ret

	SET_SIZE(memcmp)