/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2002 Advanced Micro Devices, Inc.
 *
 * All rights reserved.
 *
 * Redistribution and  use in source and binary  forms, with or
 * without  modification,  are   permitted  provided  that  the
 * following conditions are met:
 *
 * + Redistributions  of source  code  must  retain  the  above
 *   copyright  notice,   this  list  of   conditions  and  the
 *   following disclaimer.
 *
 * + Redistributions  in binary  form must reproduce  the above
 *   copyright  notice,   this  list  of   conditions  and  the
 *   following  disclaimer in  the  documentation and/or  other
 *   materials provided with the distribution.
 *
 * + Neither the  name of Advanced Micro Devices,  Inc. nor the
 *   names  of  its contributors  may  be  used  to endorse  or
 *   promote  products  derived   from  this  software  without
 *   specific prior written permission.
 *
 * THIS  SOFTWARE  IS PROVIDED  BY  THE  COPYRIGHT HOLDERS  AND
 * CONTRIBUTORS AS IS AND  ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING,  BUT NOT  LIMITED TO,  THE IMPLIED  WARRANTIES OF
 * MERCHANTABILITY  AND FITNESS  FOR A  PARTICULAR  PURPOSE ARE
 * DISCLAIMED.  IN  NO  EVENT  SHALL  ADVANCED  MICRO  DEVICES,
 * INC.  OR CONTRIBUTORS  BE LIABLE  FOR ANY  DIRECT, INDIRECT,
 * INCIDENTAL,  SPECIAL,  EXEMPLARY, OR  CONSEQUENTIAL  DAMAGES
 * (INCLUDING,  BUT NOT LIMITED  TO, PROCUREMENT  OF SUBSTITUTE
 * GOODS  OR  SERVICES;  LOSS  OF  USE, DATA,  OR  PROFITS;  OR
 * BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON  ANY THEORY OF
 * LIABILITY,  WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
 * (INCLUDING NEGLIGENCE  OR OTHERWISE) ARISING IN  ANY WAY OUT
 * OF  THE  USE  OF  THIS  SOFTWARE, EVEN  IF  ADVISED  OF  THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * It is  licensee's responsibility  to comply with  any export
 * regulations applicable in licensee's jurisdiction.
 */

	.ident	"%Z%%M% %I% %E% SMI"

	.file	"%M%"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memcmp,function)

#include "SYS.h"
#include "cache.h"

/* Builds function-private label names of the form .memcmp<suffix>. */
#define	LABEL(s) .memcmp/**/s

/*
 * int memcmp(const void *s1, const void *s2, size_t n)
 *
 * Syntax:  AT&T, run through the C preprocessor.
 * In:      %rdi = s1, %rsi = s2, %rdx = n (byte count, full 64 bits).
 * Out:     %eax = 0 if the first n bytes are equal, otherwise
 *          (unsigned byte of s1) - (unsigned byte of s2) at the first
 *          differing position.
 * Clobber: %rax, %rcx, %rdx, %rsi, %rdi, %r8-%r11, flags.
 *          No stack is used and no other register is written.
 *
 * Strategy (each stage keeps %rdx = bytes still to compare and advances
 * %rdi/%rsi together):
 *
 *	n <  8		byte loop
 *	n <  32		8-byte loop, then byte loop for the tail
 *	n <= 2048	32-byte loop, then 8-byte loop for the tail
 *	n >  2048	byte-compare until %rsi is 8-byte aligned, then
 *			64-byte loop for up to .amd64cache1half bytes,
 *			64-byte loop with prefetchnta for up to
 *			.amd64cache2half bytes, and finally a 128-byte
 *			loop with prefetchnta for whatever is left.
 *
 * .amd64cache1half / .amd64cache2half are defined outside this file;
 * presumably they hold half the L1 / L2 data cache size in bytes --
 * confirm against cache.h and the code that initializes them.
 *
 * The wide loops only detect THAT a block differs (they OR together the
 * word differences).  On a mismatch they jump, without advancing the
 * pointers or %rdx, to the next narrower stage, which re-scans the same
 * block; the byte loop ultimately produces the return value.
 */
	ENTRY(memcmp)		/* (const void *, const void*, size_t) */

LABEL(try1):
	cmp	$8, %rdx
	jae	LABEL(1after)

LABEL(1):				/* 1-byte */
	xor	%eax, %eax		/* result if n == 0; set before test */
	test	%rdx, %rdx		/* so jz below sees test's flags */
	jz	LABEL(exit)

LABEL(1loop):
	movzbl	(%rdi), %eax
	movzbl	(%rsi), %ecx
	sub	%ecx, %eax		/* %eax = *s1 - *s2 as unsigned bytes */
	jnz	LABEL(exit)

	dec	%rdx

	lea	1 (%rdi), %rdi		/* lea keeps dec's ZF for the jnz */
	lea	1 (%rsi), %rsi

	jnz	LABEL(1loop)

LABEL(exit):
	rep				/* "rep ret": two-byte return used */
	ret				/* because this ret is a branch target */

	.p2align 4

LABEL(1after):

LABEL(8try):
	cmp	$32, %rdx
	jae	LABEL(8after)

LABEL(8):				/* 8-byte */
	/*
	 * The iteration count is kept in the full 64-bit %rcx.  This block
	 * is also entered from the wide loops on a mismatch, where %rdx may
	 * be 4 GiB or more; a 32-bit count could reach zero before the
	 * differing word and the tail mask below would then discard the
	 * upper half of %rdx and report "equal".
	 */
	mov	%rdx, %rcx
	shr	$3, %rcx		/* %rcx = number of whole 8-byte words */
	jz	LABEL(1)

	.p2align 4

LABEL(8loop):
	mov	(%rsi), %rax
	cmp	(%rdi), %rax
	jne	LABEL(1)		/* let the byte loop find the byte */

	sub	$8, %rdx
	dec	%rcx

	lea	8 (%rsi), %rsi		/* lea keeps dec's ZF for the jnz */
	lea	8 (%rdi), %rdi

	jnz	LABEL(8loop)

LABEL(8skip):
	and	$7, %edx		/* here %rdx < 8: tail bytes, if any */
	jnz	LABEL(1)

	xor	%eax, %eax
	ret

	.p2align 4

LABEL(8after):

LABEL(32try):
	cmp	$2048, %rdx
	ja	LABEL(32after)

LABEL(32):				/* 32-byte */
	/*
	 * 64-bit count for the same reason as in the 8-byte block: the
	 * 64- and 128-byte loops jump here on a mismatch with an
	 * arbitrarily large %rdx.
	 */
	mov	%rdx, %rcx
	shr	$5, %rcx		/* %rcx = number of whole 32-byte blocks */
	jz	LABEL(8)

	.p2align 4

LABEL(32loop):
	mov	   (%rsi), %rax
	mov	 8 (%rsi), %r8
	mov	16 (%rsi), %r9
	mov	24 (%rsi), %r10
	sub	   (%rdi), %rax
	sub	 8 (%rdi), %r8
	sub	16 (%rdi), %r9
	sub	24 (%rdi), %r10

	or	%rax, %r8		/* non-zero iff any of the 4 words differ */
	or	%r9, %r10
	or	%r8, %r10
	jnz	LABEL(8)

	sub	$32, %rdx
	dec	%rcx

	lea	32 (%rsi), %rsi		/* lea keeps dec's ZF for the jnz */
	lea	32 (%rdi), %rdi

	jnz	LABEL(32loop)

LABEL(32skip):
	and	$31, %edx		/* here %rdx < 32: tail bytes, if any */
	jnz	LABEL(8)

	xor	%eax, %eax
	ret

	.p2align 4

LABEL(32after):

	prefetchnta _sref_(.amd64cache1half)	/* 3DNow: use prefetch */

LABEL(srctry):
	mov	%esi, %r8d		/* align by source */

	and	$7, %r8d		/* %r8 = s2 & 7, upper bits zero */
	jz	LABEL(srcafter)		/* not unaligned */

LABEL(src):				/* align */
	lea	-8 (%r8, %rdx), %rdx	/* n -= 8 - (s2 & 7) bytes done below */
	sub	$8, %r8d		/* negative count, incremented to zero */


LABEL(srcloop):
	movzbl	(%rdi), %eax
	movzbl	(%rsi), %ecx
	sub	%ecx, %eax
	jnz	LABEL(exit)

	inc	%r8d

	lea	1 (%rdi), %rdi		/* lea keeps inc's ZF for the jnz */
	lea	1 (%rsi), %rsi

	jnz	LABEL(srcloop)

	.p2align 4

LABEL(srcafter):

LABEL(64try):
	mov	_sref_(.amd64cache1half), %rcx
	cmp	%rdx, %rcx
	cmova	%rdx, %rcx		/* %rcx = min(.amd64cache1half, n) */

LABEL(64):				/* 64-byte */
	shr	$6, %rcx		/* %rcx = 64-byte blocks in this pass */
	jz	LABEL(32)

	.p2align 4

LABEL(64loop):
	mov	   (%rsi), %rax
	mov	 8 (%rsi), %r8
	sub	   (%rdi), %rax
	sub	 8 (%rdi), %r8
	or	%r8, %rax

	mov	16 (%rsi), %r9
	mov	24 (%rsi), %r10
	sub	16 (%rdi), %r9
	sub	24 (%rdi), %r10
	or	%r10, %r9

	or	%r9, %rax
	jnz	LABEL(32)		/* difference in bytes 0..31 */

	mov	32 (%rsi), %rax
	mov	40 (%rsi), %r8
	sub	32 (%rdi), %rax
	sub	40 (%rdi), %r8
	or	%r8, %rax

	mov	48 (%rsi), %r9
	mov	56 (%rsi), %r10
	sub	48 (%rdi), %r9
	sub	56 (%rdi), %r10
	or	%r10, %r9

	or	%r9, %rax
	jnz	LABEL(32)		/* difference in bytes 32..63 */

	lea	64 (%rsi), %rsi
	lea	64 (%rdi), %rdi

	sub	$64, %rdx
	dec	%rcx
	jnz	LABEL(64loop)

LABEL(64skip):
	cmp	$2048, %rdx
	ja	LABEL(64after)

	test	%edx, %edx		/* %rdx <= 2048 here, so %edx suffices */
	jnz	LABEL(32)

	xor	%eax, %eax
	ret

	.p2align 4

LABEL(64after):

LABEL(pretry):

LABEL(pre):				/* 64-byte prefetching */
	mov	_sref_(.amd64cache2half), %rcx
	cmp	%rdx, %rcx
	cmova	%rdx, %rcx		/* %rcx = min(.amd64cache2half, n) */

	shr	$6, %rcx		/* %rcx = 64-byte blocks in this pass */
	jz	LABEL(preskip)

	/* First iteration, peeled so that the loop head can be aligned. */
	prefetchnta 512 (%rsi)		/* 3DNow: use prefetch */
	prefetchnta 512 (%rdi)		/* 3DNow: use prefetch */

	mov	   (%rsi), %rax
	mov	 8 (%rsi), %r9
	mov	16 (%rsi), %r10
	mov	24 (%rsi), %r11
	sub	   (%rdi), %rax
	sub	 8 (%rdi), %r9
	sub	16 (%rdi), %r10
	sub	24 (%rdi), %r11

	or	%r9, %rax
	or	%r11, %r10
	or	%r10, %rax
	jnz	LABEL(32)

	mov	32 (%rsi), %rax
	mov	40 (%rsi), %r9
	mov	48 (%rsi), %r10
	mov	56 (%rsi), %r11
	sub	32 (%rdi), %rax
	sub	40 (%rdi), %r9
	sub	48 (%rdi), %r10
	sub	56 (%rdi), %r11

	or	%r9, %rax
	or	%r11, %r10
	or	%r10, %rax
	jnz	LABEL(32)

	lea	64 (%rsi), %rsi
	lea	64 (%rdi), %rdi

	sub	$64, %rdx
	dec	%rcx
	/*
	 * Guard: if the pass consisted of exactly one block, do not fall
	 * into the loop with a zero count (its dec would wrap around and
	 * the loop would run past the end of the buffers).
	 */
	jz	LABEL(preskip)

	.p2align 4

LABEL(preloop):
	prefetchnta 512 (%rsi)		/* 3DNow: use prefetch */
	prefetchnta 512 (%rdi)		/* 3DNow: use prefetch */

	mov	   (%rsi), %rax
	mov	 8 (%rsi), %r9
	mov	16 (%rsi), %r10
	mov	24 (%rsi), %r11
	sub	   (%rdi), %rax
	sub	 8 (%rdi), %r9
	sub	16 (%rdi), %r10
	sub	24 (%rdi), %r11

	or	%r9, %rax
	or	%r11, %r10
	or	%r10, %rax
	jnz	LABEL(32)

	mov	32 (%rsi), %rax
	mov	40 (%rsi), %r9
	mov	48 (%rsi), %r10
	mov	56 (%rsi), %r11
	sub	32 (%rdi), %rax
	sub	40 (%rdi), %r9
	sub	48 (%rdi), %r10
	sub	56 (%rdi), %r11

	or	%r9, %rax
	or	%r11, %r10
	or	%r10, %rax
	jnz	LABEL(32)

	lea	64 (%rsi), %rsi
	lea	64 (%rdi), %rdi

	sub	$64, %rdx
	dec	%rcx
	jnz	LABEL(preloop)


LABEL(preskip):
	cmp	$2048, %rdx
	ja	LABEL(preafter)

	test	%edx, %edx		/* %rdx <= 2048 here, so %edx suffices */
	jnz	LABEL(32)

	xor	%eax, %eax
	ret

	.p2align 4

LABEL(preafter):

LABEL(128try):

LABEL(128):				/* 128-byte */
	mov	%rdx, %rcx
	shr	$7, %rcx		/* %rcx = number of whole 128-byte blocks */
	jz	LABEL(128skip)

	.p2align 4

LABEL(128loop):
	prefetchnta 512 (%rsi)		/* 3DNow: use prefetch */
	prefetchnta 512 (%rdi)		/* 3DNow: use prefetch */

	mov	   (%rsi), %rax
	mov	 8 (%rsi), %r8
	sub	   (%rdi), %rax
	sub	 8 (%rdi), %r8
	mov	16 (%rsi), %r9
	mov	24 (%rsi), %r10
	sub	16 (%rdi), %r9
	sub	24 (%rdi), %r10

	or	%r8, %rax
	or	%r9, %r10
	or	%r10, %rax

	mov	32 (%rsi), %r8
	mov	40 (%rsi), %r9
	sub	32 (%rdi), %r8
	sub	40 (%rdi), %r9
	mov	48 (%rsi), %r10
	mov	56 (%rsi), %r11
	sub	48 (%rdi), %r10
	sub	56 (%rdi), %r11

	or	%r9, %r8
	or	%r11, %r10
	or	%r10, %r8

	or	%r8, %rax
	jnz	LABEL(32)		/* difference in bytes 0..63 */

	prefetchnta 576 (%rsi)		/* 3DNow: use prefetch */
	prefetchnta 576 (%rdi)		/* 3DNow: use prefetch */

	mov	 64 (%rsi), %rax
	mov	 72 (%rsi), %r8
	sub	 64 (%rdi), %rax
	sub	 72 (%rdi), %r8
	mov	 80 (%rsi), %r9
	mov	 88 (%rsi), %r10
	sub	 80 (%rdi), %r9
	sub	 88 (%rdi), %r10

	or	%r8, %rax
	or	%r9, %r10
	or	%r10, %rax

	mov	 96 (%rsi), %r8
	mov	104 (%rsi), %r9
	sub	 96 (%rdi), %r8
	sub	104 (%rdi), %r9
	mov	112 (%rsi), %r10
	mov	120 (%rsi), %r11
	sub	112 (%rdi), %r10
	sub	120 (%rdi), %r11

	or	%r9, %r8
	or	%r11, %r10
	or	%r10, %r8

	or	%r8, %rax
	jnz	LABEL(32)		/* difference in bytes 64..127 */

	sub	$128, %rdx
	dec	%rcx

	lea	128 (%rsi), %rsi	/* lea keeps dec's ZF for the jnz */
	lea	128 (%rdi), %rdi

	jnz	LABEL(128loop)

LABEL(128skip):
	and	$127, %edx		/* here %rdx < 128: tail bytes, if any */
	jnz	LABEL(32)

	xor	%eax, %eax
	ret

	SET_SIZE(memcmp)