/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, Intel Corporation
 * All rights reserved.
 */

/*
 * str[n]cpy - copy [n] chars from second operand into first operand
 */
#include "SYS.h"
#include "proc64_id.h"

#define LABEL(s) .strcpy##s

#ifdef USE_AS_STRNCPY
        ENTRY(strncpy)
        test    %edx, %edx
        jz      LABEL(strncpy_exitz)
        mov     %rdx, %r8
#else
        ENTRY(strcpy)                   /* (char *, const char *) */
        xor     %rdx, %rdx
#endif
        mov     %esi, %ecx
        and     $0xfffffffffffffff0, %rsi       /* force rsi 16 byte align */
        and     $0xf, %rcx
        mov     %rdi, %rax              /* save destination address for return value */

        pxor    %xmm0, %xmm0            /* clear %xmm0 for null char checks */
        pcmpeqb (%rsi), %xmm0           /* check 16 bytes in src for null */
        pmovmskb %xmm0, %edx
        shr     %cl, %edx               /* adjust for offset from 16-byte boundary */
        test    %edx, %edx              /* edx will be 0 if chars are non-null */
        jnz     LABEL(less16bytes)      /* null char found in first 16 bytes examined */
#ifdef USE_AS_STRNCPY
        /*
         * Check if the count is satisfied in the first 16 bytes examined.
         */
        lea     -16(%r8, %rcx), %r11
        cmp     $0, %r11
        jle     LABEL(less16bytes)
#endif
        mov     %rcx, %r9               /* rsi alignment offset */
        or      %edi, %ecx
        and     $0xf, %ecx
        lea     -16(%r9), %r10
        jz      LABEL(ashr_0)           /* src and dest are both 16 byte aligned */

        neg     %r10                    /* max src bytes remaining in current dqword */

        pxor    %xmm0, %xmm0            /* clear %xmm0, may be polluted by unaligned operation */
        pcmpeqb 16(%rsi), %xmm0         /* check next 16 bytes in src for a null */
        pmovmskb %xmm0, %edx
        test    %edx, %edx
        jnz     LABEL(less32bytes)      /* null char found in first 32 bytes examined */

#ifdef USE_AS_STRNCPY
        /*
         * If strncpy count <= 16 go to exit case
         */
        sub     $16, %r8
        jbe     LABEL(less32bytes_strncpy_truncation)
#endif
        /*
         * At least 16 bytes to copy to destination string. Move them now.
         * Don't worry about alignment.
         */
        mov     (%rsi, %r9), %rdx
        mov     %rdx, (%rdi)
        mov     8(%rsi, %r9), %rdx
        mov     %rdx, 8(%rdi)

        /*
         * So far destination rdi may be aligned by 16; re-calculate rsi and
         * jump to the corresponding src/dest relative offset case.
         *   rcx is offset of rsi
         *   rdx is offset of rdi
         */
        and     $0xfffffffffffffff0, %rdi       /* force rdi 16 byte align */
        mov     %rax, %rdx              /* rax contains original rdi */
        xor     %rdi, %rdx              /* same effect as "and $0xf, %rdx" */
#ifdef USE_AS_STRNCPY
        /*
         * Will now do 16 byte aligned stores. Stores may overlap some bytes
         * (ie store twice) if destination was unaligned. Compensate here.
         */
        add     %rdx, %r8               /* compensate for overlap */
#endif

        add     $16, %rdi               /* next 16 bytes for dest */

        /*
         * align src to 16-byte boundary.
Could be up or down depending on 115 * whether src offset - dest offset > 0 (up) or 116 * src offset - dest offset < 0 (down). 117 */ 118 sub %rdx, %r9 /* src offset - dest offset */ 119 120 lea 16(%r9, %rsi), %rsi 121 mov %esi, %ecx /* for new src offset */ 122 and $0xfffffffffffffff0, %rsi /* force rsi 16 byte align */ 123 124 and $0xf, %ecx /* new src offset is 0 if rsi/rdi have same alignment */ 125 jz LABEL(ashr_0) 126 127#ifdef USE_AS_STRNCPY 128 xor %edx, %edx /* In case unaligned_exit is taken */ 129#endif 130 /* 131 * Jump to case corresponding to source/dest string relative offsets 132 * Index = (16 + (src offset - dest offset)) % 16 133 */ 134 lea -16(%rcx), %r10 135 mov %rcx, %r9 136 neg %r10 /* max src bytes remaining in current dqword */ 137 lea LABEL(unaligned_table)(%rip), %r11 138 movslq (%r11, %rcx, 4), %rcx 139 lea (%r11, %rcx), %rcx 140 jmp *%rcx 141 142/* 143 * ashr_0 handles the following cases: 144 * src alignment offset = dest alignment offset 145 */ 146 .p2align 5 147LABEL(ashr_0): 148#ifdef USE_AS_STRNCPY 149 sub $16, %r8 150 jbe LABEL(strncpy_truncation_aligned) 151#endif 152 movdqa (%rsi), %xmm1 /* fetch 16 bytes from src string */ 153 movdqa %xmm1, (%rdi) /* store 16 bytes into dest string */ 154 add $16, %rsi 155 add $16, %rdi 156 pcmpeqb (%rsi), %xmm0 /* check 16 bytes in src for a null */ 157 pmovmskb %xmm0, %edx 158 159 test %edx, %edx /* edx will be 0 if chars are non-null */ 160 jnz LABEL(aligned_16bytes) /* exit tail */ 161 162LABEL(ashr_0_loop): 163#ifdef USE_AS_STRNCPY 164 sub $16, %r8 165 jbe LABEL(strncpy_truncation_aligned) 166#endif 167 movdqa (%rsi, %rcx), %xmm1 168 movdqa %xmm1, (%rdi, %rcx) 169 add $16, %rcx 170 pcmpeqb (%rsi, %rcx), %xmm0 171 pmovmskb %xmm0, %edx 172 test %edx, %edx 173 jnz LABEL(aligned_exit) 174 175#ifdef USE_AS_STRNCPY 176 sub $16, %r8 177 jbe LABEL(strncpy_truncation_aligned) 178#endif 179 movdqa (%rsi, %rcx), %xmm1 180 movdqa %xmm1, (%rdi, %rcx) 181 add $16, %rcx 182 pcmpeqb (%rsi, %rcx), %xmm0 183 pmovmskb %xmm0, %edx 184 test %edx, %edx 185 jnz LABEL(aligned_exit) 186 187#ifdef USE_AS_STRNCPY 188 sub $16, %r8 189 jbe LABEL(strncpy_truncation_aligned) 190#endif 191 movdqa (%rsi, %rcx), %xmm1 192 movdqa %xmm1, (%rdi, %rcx) 193 194 add $16, %rcx 195 pcmpeqb (%rsi, %rcx), %xmm0 196 pmovmskb %xmm0, %edx 197 test %edx, %edx 198 jnz LABEL(aligned_exit) 199 200#ifdef USE_AS_STRNCPY 201 sub $16, %r8 202 jbe LABEL(strncpy_truncation_aligned) 203#endif 204 movdqa (%rsi, %rcx), %xmm1 205 movdqa %xmm1, (%rdi, %rcx) 206 add $16, %rcx 207 pcmpeqb (%rsi, %rcx), %xmm0 208 pmovmskb %xmm0, %edx 209 test %edx, %edx 210 jz LABEL(ashr_0_loop) 211 jmp LABEL(aligned_exit) 212 213 214/* 215 * ashr_15 handles the following cases: 216 * (16 + (src offset - dest offset)) % 16 = 15 217 * 218 * Based on above operation, start from (%r9 + rsi) to the left of this cache 219 * bank, there is no null byte. 220 */ 221 .p2align 4 222LABEL(ashr_15): 223 xor %ecx, %ecx /* clear index */ 224#ifdef USE_AS_STRNCPY 225 cmp %r10, %r8 226 jbe LABEL(unaligned_exit) 227#endif 228 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ 229 jz LABEL(ashr_15_use_sse2) 230 231 .p2align 4 232LABEL(ashr_15_use_ssse3): 233 movdqa 16(%rsi, %rcx), %xmm3 234 pcmpeqb %xmm3, %xmm0 235 pmovmskb %xmm0, %edx 236 test %edx, %edx 237 jnz LABEL(unaligned_exit) 238#ifdef USE_AS_STRNCPY 239 sub $16, %r8 240 jbe LABEL(strncpy_truncation_unaligned) 241#endif 242 243 #palignr $15, (%rsi, %rcx), %xmm3 244 .byte 0x66, 0x0F, 0x3A ,0x0F 245 .byte 0x1c, 0x0e, 0x0f 246 247 movdqa %xmm3, (%rdi, %rcx) 248 add $16, %rcx 249 250#ifdef USE_AS_STRNCPY 251 cmp %r10, %r8 252 jbe LABEL(unaligned_exit) 253#endif 254 movdqa 16(%rsi, %rcx), %xmm3 255 pcmpeqb %xmm3, %xmm0 256 pmovmskb %xmm0, %edx 257 test %edx, %edx 258 jnz LABEL(unaligned_exit) 259#ifdef USE_AS_STRNCPY 260 sub $16, %r8 261 jbe LABEL(strncpy_truncation_unaligned) 262#endif 263 264 #palignr $15, (%rsi, %rcx), %xmm3 265 .byte 0x66, 0x0F, 0x3A ,0x0F 266 .byte 0x1c, 0x0e, 0x0f 267 268 movdqa %xmm3, (%rdi, %rcx) 269 add $16, %rcx 270 271#ifdef USE_AS_STRNCPY 272 cmp %r10, %r8 273 jbe LABEL(unaligned_exit) 274#endif 275 jmp LABEL(ashr_15_use_ssse3) 276 277 .p2align 4 278LABEL(ashr_15_use_sse2): 279 pcmpeqb 16(%rsi, %rcx), %xmm0 280 pmovmskb %xmm0, %edx 281 test %edx, %edx 282 jnz LABEL(unaligned_exit) 283#ifdef USE_AS_STRNCPY 284 sub $16, %r8 285 jbe LABEL(strncpy_truncation_unaligned) 286#endif 287 288 movdqa 16(%rsi, %rcx), %xmm3 289 movdqa (%rsi, %rcx), %xmm2 290 291 psrldq $15, %xmm2 292 pslldq $1, %xmm3 293 por %xmm2, %xmm3 294 295 movdqa %xmm3, (%rdi, %rcx) 296 add $16, %rcx 297#ifdef USE_AS_STRNCPY 298 cmp %r10, %r8 299 jbe LABEL(unaligned_exit) 300#endif 301 pcmpeqb 16(%rsi, %rcx), %xmm0 302 pmovmskb %xmm0, %edx 303 test %edx, %edx 304 jnz LABEL(unaligned_exit) 305#ifdef USE_AS_STRNCPY 306 sub $16, %r8 307 jbe LABEL(strncpy_truncation_unaligned) 308#endif 309 310 movdqa 16(%rsi, %rcx), %xmm3 311 movdqa (%rsi, %rcx), %xmm2 312 313 psrldq $15, %xmm2 314 pslldq $1, %xmm3 315 por %xmm2, %xmm3 316 317 movdqa %xmm3, (%rdi, %rcx) 318 add $16, %rcx 319#ifdef USE_AS_STRNCPY 320 cmp %r10, %r8 321 jbe LABEL(unaligned_exit) 322#endif 323 jmp LABEL(ashr_15_use_sse2) 324 325 326/* 327 * ashr_14 handles the following cases: 328 * (16 + (src offset - dest offset)) % 16 = 14 329 * 330 * Based on above operation, start from (%r9 + rsi) to the left of this cache 331 * bank, there is no null byte. 332 */ 333 .p2align 4 334LABEL(ashr_14): 335 xor %ecx, %ecx /* clear index */ 336#ifdef USE_AS_STRNCPY 337 cmp %r10, %r8 338 jbe LABEL(unaligned_exit) 339#endif 340 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ 341 jz LABEL(ashr_14_use_sse2) 342 343 .p2align 4 344LABEL(ashr_14_use_ssse3): 345 movdqa 16(%rsi, %rcx), %xmm3 346 pcmpeqb %xmm3, %xmm0 347 pmovmskb %xmm0, %edx 348 test %edx, %edx 349 jnz LABEL(unaligned_exit) 350#ifdef USE_AS_STRNCPY 351 sub $16, %r8 352 jbe LABEL(strncpy_truncation_unaligned) 353#endif 354 355 #palignr $14, (%rsi, %rcx), %xmm3 356 .byte 0x66, 0x0F, 0x3A ,0x0F 357 .byte 0x1c, 0x0e, 0x0e 358 359 movdqa %xmm3, (%rdi, %rcx) 360 add $16, %rcx 361 362#ifdef USE_AS_STRNCPY 363 cmp %r10, %r8 364 jbe LABEL(unaligned_exit) 365#endif 366 movdqa 16(%rsi, %rcx), %xmm3 367 pcmpeqb %xmm3, %xmm0 368 pmovmskb %xmm0, %edx 369 test %edx, %edx 370 jnz LABEL(unaligned_exit) 371#ifdef USE_AS_STRNCPY 372 sub $16, %r8 373 jbe LABEL(strncpy_truncation_unaligned) 374#endif 375 376 #palignr $14, (%rsi, %rcx), %xmm3 377 .byte 0x66, 0x0F, 0x3A ,0x0F 378 .byte 0x1c, 0x0e, 0x0e 379 380 movdqa %xmm3, (%rdi, %rcx) 381 add $16, %rcx 382#ifdef USE_AS_STRNCPY 383 cmp %r10, %r8 384 jbe LABEL(unaligned_exit) 385#endif 386 jmp LABEL(ashr_14_use_ssse3) 387 388 .p2align 4 389LABEL(ashr_14_use_sse2): 390 pcmpeqb 16(%rsi, %rcx), %xmm0 391 pmovmskb %xmm0, %edx 392 test %edx, %edx 393 jnz LABEL(unaligned_exit) 394#ifdef USE_AS_STRNCPY 395 sub $16, %r8 396 jbe LABEL(strncpy_truncation_unaligned) 397#endif 398 399 movdqa 16(%rsi, %rcx), %xmm3 400 movdqa (%rsi, %rcx), %xmm2 401 402 psrldq $14, %xmm2 403 pslldq $2, %xmm3 404 por %xmm2, %xmm3 405 406 movdqa %xmm3, (%rdi, %rcx) 407 add $16, %rcx 408 409#ifdef USE_AS_STRNCPY 410 cmp %r10, %r8 411 jbe LABEL(unaligned_exit) 412#endif 413 pcmpeqb 16(%rsi, %rcx), %xmm0 414 pmovmskb %xmm0, %edx 415 test %edx, %edx 416 jnz LABEL(unaligned_exit) 417#ifdef USE_AS_STRNCPY 418 sub $16, %r8 419 jbe LABEL(strncpy_truncation_unaligned) 420#endif 421 422 movdqa 16(%rsi, %rcx), %xmm3 423 movdqa (%rsi, %rcx), %xmm2 424 425 psrldq $14, %xmm2 426 pslldq $2, %xmm3 427 por %xmm2, %xmm3 428 429 movdqa %xmm3, (%rdi, %rcx) 430 add $16, %rcx 431#ifdef USE_AS_STRNCPY 432 cmp %r10, %r8 433 jbe LABEL(unaligned_exit) 434#endif 435 jmp LABEL(ashr_14_use_sse2) 436 437 438/* 439 * ashr_13 handles the following cases: 440 * (16 + (src offset - dest offset)) % 16 = 13 441 * 442 * Based on above operation, start from (%r9 + rsi) to the left of this cache 443 * bank, there is no null byte. 444 */ 445 .p2align 4 446LABEL(ashr_13): 447 xor %ecx, %ecx /* clear index */ 448#ifdef USE_AS_STRNCPY 449 cmp %r10, %r8 450 jbe LABEL(unaligned_exit) 451#endif 452 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ 453 jz LABEL(ashr_13_use_sse2) 454 455 .p2align 4 456LABEL(ashr_13_use_ssse3): 457 movdqa 16(%rsi, %rcx), %xmm3 458 pcmpeqb %xmm3, %xmm0 459 pmovmskb %xmm0, %edx 460 test %edx, %edx 461 jnz LABEL(unaligned_exit) 462#ifdef USE_AS_STRNCPY 463 sub $16, %r8 464 jbe LABEL(strncpy_truncation_unaligned) 465#endif 466 467 #palignr $13, (%rsi, %rcx), %xmm3 468 .byte 0x66, 0x0F, 0x3A ,0x0F 469 .byte 0x1c, 0x0e, 0x0d 470 471 movdqa %xmm3, (%rdi, %rcx) 472 add $16, %rcx 473 474#ifdef USE_AS_STRNCPY 475 cmp %r10, %r8 476 jbe LABEL(unaligned_exit) 477#endif 478 movdqa 16(%rsi, %rcx), %xmm3 479 pcmpeqb %xmm3, %xmm0 480 pmovmskb %xmm0, %edx 481 test %edx, %edx 482 jnz LABEL(unaligned_exit) 483#ifdef USE_AS_STRNCPY 484 sub $16, %r8 485 jbe LABEL(strncpy_truncation_unaligned) 486#endif 487 488 #palignr $13, (%rsi, %rcx), %xmm3 489 .byte 0x66, 0x0F, 0x3A ,0x0F 490 .byte 0x1c, 0x0e, 0x0d 491 492 movdqa %xmm3, (%rdi, %rcx) 493 add $16, %rcx 494#ifdef USE_AS_STRNCPY 495 cmp %r10, %r8 496 jbe LABEL(unaligned_exit) 497#endif 498 jmp LABEL(ashr_13_use_ssse3) 499 500 .p2align 4 501LABEL(ashr_13_use_sse2): 502 pcmpeqb 16(%rsi, %rcx), %xmm0 503 pmovmskb %xmm0, %edx 504 test %edx, %edx 505 jnz LABEL(unaligned_exit) 506#ifdef USE_AS_STRNCPY 507 sub $16, %r8 508 jbe LABEL(strncpy_truncation_unaligned) 509#endif 510 511 movdqa 16(%rsi, %rcx), %xmm3 512 movdqa (%rsi, %rcx), %xmm2 513 514 psrldq $13, %xmm2 515 pslldq $3, %xmm3 516 por %xmm2, %xmm3 517 518 movdqa %xmm3, (%rdi, %rcx) 519 add $16, %rcx 520 521#ifdef USE_AS_STRNCPY 522 cmp %r10, %r8 523 jbe LABEL(unaligned_exit) 524#endif 525 pcmpeqb 16(%rsi, %rcx), %xmm0 526 pmovmskb %xmm0, %edx 527 test %edx, %edx 528 jnz LABEL(unaligned_exit) 529#ifdef USE_AS_STRNCPY 530 sub $16, %r8 531 jbe LABEL(strncpy_truncation_unaligned) 532#endif 533 534 movdqa 16(%rsi, %rcx), %xmm3 535 movdqa (%rsi, %rcx), %xmm2 536 537 psrldq $13, %xmm2 538 pslldq $3, %xmm3 539 por %xmm2, %xmm3 540 541 movdqa %xmm3, (%rdi, %rcx) 542 add $16, %rcx 543#ifdef USE_AS_STRNCPY 544 cmp %r10, %r8 545 jbe LABEL(unaligned_exit) 546#endif 547 jmp LABEL(ashr_13_use_sse2) 548 549 550/* 551 * ashr_12 handles the following cases: 552 * (16 + (src offset - dest offset)) % 16 = 12 553 * 554 * Based on above operation, start from (%r9 + rsi) to the left of this cache 555 * bank, there is no null byte. 556 */ 557 .p2align 4 558LABEL(ashr_12): 559 xor %ecx, %ecx /* clear index */ 560#ifdef USE_AS_STRNCPY 561 cmp %r10, %r8 562 jbe LABEL(unaligned_exit) 563#endif 564 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ 565 jz LABEL(ashr_12_use_sse2) 566 567 .p2align 4 568LABEL(ashr_12_use_ssse3): 569 movdqa 16(%rsi, %rcx), %xmm3 570 pcmpeqb %xmm3, %xmm0 571 pmovmskb %xmm0, %edx 572 test %edx, %edx 573 jnz LABEL(unaligned_exit) 574#ifdef USE_AS_STRNCPY 575 sub $16, %r8 576 jbe LABEL(strncpy_truncation_unaligned) 577#endif 578 579 #palignr $12, (%rsi, %rcx), %xmm3 580 .byte 0x66, 0x0F, 0x3A ,0x0F 581 .byte 0x1c, 0x0e, 0x0c 582 583 movdqa %xmm3, (%rdi, %rcx) 584 add $16, %rcx 585 586#ifdef USE_AS_STRNCPY 587 cmp %r10, %r8 588 jbe LABEL(unaligned_exit) 589#endif 590 movdqa 16(%rsi, %rcx), %xmm3 591 pcmpeqb %xmm3, %xmm0 592 pmovmskb %xmm0, %edx 593 test %edx, %edx 594 jnz LABEL(unaligned_exit) 595#ifdef USE_AS_STRNCPY 596 sub $16, %r8 597 jbe LABEL(strncpy_truncation_unaligned) 598#endif 599 600 #palignr $12, (%rsi, %rcx), %xmm3 601 .byte 0x66, 0x0F, 0x3A ,0x0F 602 .byte 0x1c, 0x0e, 0x0c 603 604 movdqa %xmm3, (%rdi, %rcx) 605 add $16, %rcx 606#ifdef USE_AS_STRNCPY 607 cmp %r10, %r8 608 jbe LABEL(unaligned_exit) 609#endif 610 jmp LABEL(ashr_12_use_ssse3) 611 612 .p2align 4 613LABEL(ashr_12_use_sse2): 614 pcmpeqb 16(%rsi, %rcx), %xmm0 615 pmovmskb %xmm0, %edx 616 test %edx, %edx 617 jnz LABEL(unaligned_exit) 618#ifdef USE_AS_STRNCPY 619 sub $16, %r8 620 jbe LABEL(strncpy_truncation_unaligned) 621#endif 622 623 movdqa 16(%rsi, %rcx), %xmm3 624 movdqa (%rsi, %rcx), %xmm2 625 626 psrldq $12, %xmm2 627 pslldq $4, %xmm3 628 por %xmm2, %xmm3 629 630 movdqa %xmm3, (%rdi, %rcx) 631 add $16, %rcx 632 633#ifdef USE_AS_STRNCPY 634 cmp %r10, %r8 635 jbe LABEL(unaligned_exit) 636#endif 637 pcmpeqb 16(%rsi, %rcx), %xmm0 638 pmovmskb %xmm0, %edx 639 test %edx, %edx 640 jnz LABEL(unaligned_exit) 641#ifdef USE_AS_STRNCPY 642 sub $16, %r8 643 jbe LABEL(strncpy_truncation_unaligned) 644#endif 645 646 movdqa 16(%rsi, %rcx), %xmm3 647 movdqa (%rsi, %rcx), %xmm2 648 649 psrldq $12, %xmm2 650 pslldq $4, %xmm3 651 por %xmm2, %xmm3 652 653 movdqa %xmm3, (%rdi, %rcx) 654 add $16, %rcx 655#ifdef USE_AS_STRNCPY 656 cmp %r10, %r8 657 jbe LABEL(unaligned_exit) 658#endif 659 jmp LABEL(ashr_12_use_sse2) 660 661 662/* 663 * ashr_11 handles the following cases: 664 * (16 + (src offset - dest offset)) % 16 = 11 665 * 666 * Based on above operation, start from (%r9 + rsi) to the left of this cache 667 * bank, there is no null byte. 668 */ 669 .p2align 4 670LABEL(ashr_11): 671 xor %ecx, %ecx /* clear index */ 672#ifdef USE_AS_STRNCPY 673 cmp %r10, %r8 674 jbe LABEL(unaligned_exit) 675#endif 676 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ 677 jz LABEL(ashr_11_use_sse2) 678 679 .p2align 4 680LABEL(ashr_11_use_ssse3): 681 movdqa 16(%rsi, %rcx), %xmm3 682 pcmpeqb %xmm3, %xmm0 683 pmovmskb %xmm0, %edx 684 test %edx, %edx 685 jnz LABEL(unaligned_exit) 686#ifdef USE_AS_STRNCPY 687 sub $16, %r8 688 jbe LABEL(strncpy_truncation_unaligned) 689#endif 690 691 #palignr $11, (%rsi, %rcx), %xmm3 692 .byte 0x66, 0x0F, 0x3A ,0x0F 693 .byte 0x1c, 0x0e, 0x0b 694 695 movdqa %xmm3, (%rdi, %rcx) 696 add $16, %rcx 697 698#ifdef USE_AS_STRNCPY 699 cmp %r10, %r8 700 jbe LABEL(unaligned_exit) 701#endif 702 movdqa 16(%rsi, %rcx), %xmm3 703 pcmpeqb %xmm3, %xmm0 704 pmovmskb %xmm0, %edx 705 test %edx, %edx 706 jnz LABEL(unaligned_exit) 707#ifdef USE_AS_STRNCPY 708 sub $16, %r8 709 jbe LABEL(strncpy_truncation_unaligned) 710#endif 711 712 #palignr $11, (%rsi, %rcx), %xmm3 713 .byte 0x66, 0x0F, 0x3A ,0x0F 714 .byte 0x1c, 0x0e, 0x0b 715 716 movdqa %xmm3, (%rdi, %rcx) 717 add $16, %rcx 718#ifdef USE_AS_STRNCPY 719 cmp %r10, %r8 720 jbe LABEL(unaligned_exit) 721#endif 722 jmp LABEL(ashr_11_use_ssse3) 723 724 .p2align 4 725LABEL(ashr_11_use_sse2): 726 pcmpeqb 16(%rsi, %rcx), %xmm0 727 pmovmskb %xmm0, %edx 728 test %edx, %edx 729 jnz LABEL(unaligned_exit) 730#ifdef USE_AS_STRNCPY 731 sub $16, %r8 732 jbe LABEL(strncpy_truncation_unaligned) 733#endif 734 735 movdqa 16(%rsi, %rcx), %xmm3 736 movdqa (%rsi, %rcx), %xmm2 737 738 psrldq $11, %xmm2 739 pslldq $5, %xmm3 740 por %xmm2, %xmm3 741 742 movdqa %xmm3, (%rdi, %rcx) 743 add $16, %rcx 744 745#ifdef USE_AS_STRNCPY 746 cmp %r10, %r8 747 jbe LABEL(unaligned_exit) 748#endif 749 pcmpeqb 16(%rsi, %rcx), %xmm0 750 pmovmskb %xmm0, %edx 751 test %edx, %edx 752 jnz LABEL(unaligned_exit) 753#ifdef USE_AS_STRNCPY 754 sub $16, %r8 755 jbe LABEL(strncpy_truncation_unaligned) 756#endif 757 758 movdqa 16(%rsi, %rcx), %xmm3 759 movdqa (%rsi, %rcx), %xmm2 760 761 psrldq $11, %xmm2 762 pslldq $5, %xmm3 763 por %xmm2, %xmm3 764 765 movdqa %xmm3, (%rdi, %rcx) 766 add $16, %rcx 767#ifdef USE_AS_STRNCPY 768 cmp %r10, %r8 769 jbe LABEL(unaligned_exit) 770#endif 771 jmp LABEL(ashr_11_use_sse2) 772 773 774/* 775 * ashr_10 handles the following cases: 776 * (16 + (src offset - dest offset)) % 16 = 10 777 * 778 * Based on above operation, start from (%r9 + rsi) to the left of this cache 779 * bank, there is no null byte. 780 */ 781 .p2align 4 782LABEL(ashr_10): 783 xor %ecx, %ecx /* clear index */ 784#ifdef USE_AS_STRNCPY 785 cmp %r10, %r8 786 jbe LABEL(unaligned_exit) 787#endif 788 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ 789 jz LABEL(ashr_10_use_sse2) 790 791 .p2align 4 792LABEL(ashr_10_use_ssse3): 793 movdqa 16(%rsi, %rcx), %xmm3 794 pcmpeqb %xmm3, %xmm0 795 pmovmskb %xmm0, %edx 796 test %edx, %edx 797 jnz LABEL(unaligned_exit) 798#ifdef USE_AS_STRNCPY 799 sub $16, %r8 800 jbe LABEL(strncpy_truncation_unaligned) 801#endif 802 803 #palignr $10, (%rsi, %rcx), %xmm3 804 .byte 0x66, 0x0F, 0x3A ,0x0F 805 .byte 0x1c, 0x0e, 0x0a 806 807 movdqa %xmm3, (%rdi, %rcx) 808 add $16, %rcx 809 810#ifdef USE_AS_STRNCPY 811 cmp %r10, %r8 812 jbe LABEL(unaligned_exit) 813#endif 814 movdqa 16(%rsi, %rcx), %xmm3 815 pcmpeqb %xmm3, %xmm0 816 pmovmskb %xmm0, %edx 817 test %edx, %edx 818 jnz LABEL(unaligned_exit) 819#ifdef USE_AS_STRNCPY 820 sub $16, %r8 821 jbe LABEL(strncpy_truncation_unaligned) 822#endif 823 824 #palignr $10, (%rsi, %rcx), %xmm3 825 .byte 0x66, 0x0F, 0x3A ,0x0F 826 .byte 0x1c, 0x0e, 0x0a 827 828 movdqa %xmm3, (%rdi, %rcx) 829 add $16, %rcx 830#ifdef USE_AS_STRNCPY 831 cmp %r10, %r8 832 jbe LABEL(unaligned_exit) 833#endif 834 jmp LABEL(ashr_10_use_ssse3) 835 836 .p2align 4 837LABEL(ashr_10_use_sse2): 838 pcmpeqb 16(%rsi, %rcx), %xmm0 839 pmovmskb %xmm0, %edx 840 test %edx, %edx 841 jnz LABEL(unaligned_exit) 842#ifdef USE_AS_STRNCPY 843 sub $16, %r8 844 jbe LABEL(strncpy_truncation_unaligned) 845#endif 846 847 movdqa 16(%rsi, %rcx), %xmm3 848 movdqa (%rsi, %rcx), %xmm2 849 850 psrldq $10, %xmm2 851 pslldq $6, %xmm3 852 por %xmm2, %xmm3 853 854 movdqa %xmm3, (%rdi, %rcx) 855 add $16, %rcx 856 857#ifdef USE_AS_STRNCPY 858 cmp %r10, %r8 859 jbe LABEL(unaligned_exit) 860#endif 861 pcmpeqb 16(%rsi, %rcx), %xmm0 862 pmovmskb %xmm0, %edx 863 test %edx, %edx 864 jnz LABEL(unaligned_exit) 865#ifdef USE_AS_STRNCPY 866 sub $16, %r8 867 jbe LABEL(strncpy_truncation_unaligned) 868#endif 869 870 movdqa 16(%rsi, %rcx), %xmm3 871 movdqa (%rsi, %rcx), %xmm2 872 873 psrldq $10, %xmm2 874 pslldq $6, %xmm3 875 por %xmm2, %xmm3 876 877 movdqa %xmm3, (%rdi, %rcx) 878 add $16, %rcx 879#ifdef USE_AS_STRNCPY 880 cmp %r10, %r8 881 jbe LABEL(unaligned_exit) 882#endif 883 jmp LABEL(ashr_10_use_sse2) 884 885 886/* 887 * ashr_9 handles the following cases: 888 * (16 + (src offset - dest offset)) % 16 = 9 889 * 890 * Based on above operation, start from (%r9 + rsi) to the left of this cache 891 * bank, there is no null byte. 892 */ 893 .p2align 4 894LABEL(ashr_9): 895 xor %ecx, %ecx /* clear index */ 896#ifdef USE_AS_STRNCPY 897 cmp %r10, %r8 898 jbe LABEL(unaligned_exit) 899#endif 900 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ 901 jz LABEL(ashr_9_use_sse2) 902 903 .p2align 4 904LABEL(ashr_9_use_ssse3): 905 movdqa 16(%rsi, %rcx), %xmm3 906 pcmpeqb %xmm3, %xmm0 907 pmovmskb %xmm0, %edx 908 test %edx, %edx 909 jnz LABEL(unaligned_exit) 910#ifdef USE_AS_STRNCPY 911 sub $16, %r8 912 jbe LABEL(strncpy_truncation_unaligned) 913#endif 914 915 #palignr $9, (%rsi, %rcx), %xmm3 916 .byte 0x66, 0x0F, 0x3A ,0x0F 917 .byte 0x1c, 0x0e, 0x09 918 919 movdqa %xmm3, (%rdi, %rcx) 920 add $16, %rcx 921 922#ifdef USE_AS_STRNCPY 923 cmp %r10, %r8 924 jbe LABEL(unaligned_exit) 925#endif 926 movdqa 16(%rsi, %rcx), %xmm3 927 pcmpeqb %xmm3, %xmm0 928 pmovmskb %xmm0, %edx 929 test %edx, %edx 930 jnz LABEL(unaligned_exit) 931#ifdef USE_AS_STRNCPY 932 sub $16, %r8 933 jbe LABEL(strncpy_truncation_unaligned) 934#endif 935 936 #palignr $9, (%rsi, %rcx), %xmm3 937 .byte 0x66, 0x0F, 0x3A ,0x0F 938 .byte 0x1c, 0x0e, 0x09 939 940 movdqa %xmm3, (%rdi, %rcx) 941 add $16, %rcx 942#ifdef USE_AS_STRNCPY 943 cmp %r10, %r8 944 jbe LABEL(unaligned_exit) 945#endif 946 jmp LABEL(ashr_9_use_ssse3) 947 948 .p2align 4 949LABEL(ashr_9_use_sse2): 950 pcmpeqb 16(%rsi, %rcx), %xmm0 951 pmovmskb %xmm0, %edx 952 test %edx, %edx 953 jnz LABEL(unaligned_exit) 954#ifdef USE_AS_STRNCPY 955 sub $16, %r8 956 jbe LABEL(strncpy_truncation_unaligned) 957#endif 958 959 movdqa 16(%rsi, %rcx), %xmm3 960 movdqa (%rsi, %rcx), %xmm2 961 962 psrldq $9, %xmm2 963 pslldq $7, %xmm3 964 por %xmm2, %xmm3 965 966 movdqa %xmm3, (%rdi, %rcx) 967 add $16, %rcx 968 969#ifdef USE_AS_STRNCPY 970 cmp %r10, %r8 971 jbe LABEL(unaligned_exit) 972#endif 973 pcmpeqb 16(%rsi, %rcx), %xmm0 974 pmovmskb %xmm0, %edx 975 test %edx, %edx 976 jnz LABEL(unaligned_exit) 977#ifdef USE_AS_STRNCPY 978 sub $16, %r8 979 jbe LABEL(strncpy_truncation_unaligned) 980#endif 981 982 movdqa 16(%rsi, %rcx), %xmm3 983 movdqa (%rsi, %rcx), %xmm2 984 985 psrldq $9, %xmm2 986 pslldq $7, %xmm3 987 por %xmm2, %xmm3 988 989 movdqa %xmm3, (%rdi, %rcx) 990 add $16, %rcx 991#ifdef USE_AS_STRNCPY 992 cmp %r10, %r8 993 jbe LABEL(unaligned_exit) 994#endif 995 jmp LABEL(ashr_9_use_sse2) 996 997 998/* 999 * ashr_8 handles the following cases: 1000 * (16 + (src offset - dest offset)) % 16 = 8 1001 * 1002 * Based on above operation, start from (%r9 + rsi) to the left of this cache 1003 * bank, there is no null byte. 1004 */ 1005 .p2align 4 1006LABEL(ashr_8): 1007 xor %ecx, %ecx /* clear index */ 1008#ifdef USE_AS_STRNCPY 1009 cmp %r10, %r8 1010 jbe LABEL(unaligned_exit) 1011#endif 1012 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ 1013 jz LABEL(ashr_8_use_sse2) 1014 1015 .p2align 4 1016LABEL(ashr_8_use_ssse3): 1017 movdqa 16(%rsi, %rcx), %xmm3 1018 pcmpeqb %xmm3, %xmm0 1019 pmovmskb %xmm0, %edx 1020 test %edx, %edx 1021 jnz LABEL(unaligned_exit) 1022#ifdef USE_AS_STRNCPY 1023 sub $16, %r8 1024 jbe LABEL(strncpy_truncation_unaligned) 1025#endif 1026 1027 #palignr $8, (%rsi, %rcx), %xmm3 1028 .byte 0x66, 0x0F, 0x3A ,0x0F 1029 .byte 0x1c, 0x0e, 0x08 1030 1031 movdqa %xmm3, (%rdi, %rcx) 1032 add $16, %rcx 1033 1034#ifdef USE_AS_STRNCPY 1035 cmp %r10, %r8 1036 jbe LABEL(unaligned_exit) 1037#endif 1038 movdqa 16(%rsi, %rcx), %xmm3 1039 pcmpeqb %xmm3, %xmm0 1040 pmovmskb %xmm0, %edx 1041 test %edx, %edx 1042 jnz LABEL(unaligned_exit) 1043#ifdef USE_AS_STRNCPY 1044 sub $16, %r8 1045 jbe LABEL(strncpy_truncation_unaligned) 1046#endif 1047 1048 #palignr $8, (%rsi, %rcx), %xmm3 1049 .byte 0x66, 0x0F, 0x3A ,0x0F 1050 .byte 0x1c, 0x0e, 0x08 1051 1052 movdqa %xmm3, (%rdi, %rcx) 1053 add $16, %rcx 1054#ifdef USE_AS_STRNCPY 1055 cmp %r10, %r8 1056 jbe LABEL(unaligned_exit) 1057#endif 1058 jmp LABEL(ashr_8_use_ssse3) 1059 1060 .p2align 4 1061LABEL(ashr_8_use_sse2): 1062 pcmpeqb 16(%rsi, %rcx), %xmm0 1063 pmovmskb %xmm0, %edx 1064 test %edx, %edx 1065 jnz LABEL(unaligned_exit) 1066#ifdef USE_AS_STRNCPY 1067 sub $16, %r8 1068 jbe LABEL(strncpy_truncation_unaligned) 1069#endif 1070 1071 movdqa 16(%rsi, %rcx), %xmm3 1072 movdqa (%rsi, %rcx), %xmm2 1073 1074 psrldq $8, %xmm2 1075 pslldq $8, %xmm3 1076 por %xmm2, %xmm3 1077 1078 movdqa %xmm3, (%rdi, %rcx) 1079 add $16, %rcx 1080 1081#ifdef USE_AS_STRNCPY 1082 cmp %r10, %r8 1083 jbe LABEL(unaligned_exit) 1084#endif 1085 pcmpeqb 16(%rsi, %rcx), %xmm0 1086 pmovmskb %xmm0, %edx 1087 test %edx, %edx 1088 jnz LABEL(unaligned_exit) 1089#ifdef USE_AS_STRNCPY 1090 sub $16, %r8 1091 jbe LABEL(strncpy_truncation_unaligned) 1092#endif 1093 1094 movdqa 16(%rsi, %rcx), %xmm3 1095 movdqa (%rsi, %rcx), %xmm2 1096 1097 psrldq $8, %xmm2 1098 pslldq $8, %xmm3 1099 por %xmm2, %xmm3 1100 1101 movdqa %xmm3, (%rdi, %rcx) 1102 add $16, %rcx 1103#ifdef USE_AS_STRNCPY 1104 cmp %r10, %r8 1105 jbe LABEL(unaligned_exit) 1106#endif 1107 jmp LABEL(ashr_8_use_sse2) 1108 1109 1110/* 1111 * ashr_7 handles the following cases: 1112 * (16 + (src offset - dest offset)) % 16 = 7 1113 * 1114 * Based on above operation, start from (%r9 + rsi) to the left of this cache 1115 * bank, there is no null byte. 1116 */ 1117 .p2align 4 1118LABEL(ashr_7): 1119 xor %ecx, %ecx /* clear index */ 1120#ifdef USE_AS_STRNCPY 1121 cmp %r10, %r8 1122 jbe LABEL(unaligned_exit) 1123#endif 1124 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ 1125 jz LABEL(ashr_7_use_sse2) 1126 1127 .p2align 4 1128LABEL(ashr_7_use_ssse3): 1129 movdqa 16(%rsi, %rcx), %xmm3 1130 pcmpeqb %xmm3, %xmm0 1131 pmovmskb %xmm0, %edx 1132 test %edx, %edx 1133 jnz LABEL(unaligned_exit) 1134#ifdef USE_AS_STRNCPY 1135 sub $16, %r8 1136 jbe LABEL(strncpy_truncation_unaligned) 1137#endif 1138 1139 #palignr $7, (%rsi, %rcx), %xmm3 1140 .byte 0x66, 0x0F, 0x3A ,0x0F 1141 .byte 0x1c, 0x0e, 0x07 1142 1143 movdqa %xmm3, (%rdi, %rcx) 1144 add $16, %rcx 1145 1146#ifdef USE_AS_STRNCPY 1147 cmp %r10, %r8 1148 jbe LABEL(unaligned_exit) 1149#endif 1150 movdqa 16(%rsi, %rcx), %xmm3 1151 pcmpeqb %xmm3, %xmm0 1152 pmovmskb %xmm0, %edx 1153 test %edx, %edx 1154 jnz LABEL(unaligned_exit) 1155#ifdef USE_AS_STRNCPY 1156 sub $16, %r8 1157 jbe LABEL(strncpy_truncation_unaligned) 1158#endif 1159 1160 #palignr $7, (%rsi, %rcx), %xmm3 1161 .byte 0x66, 0x0F, 0x3A ,0x0F 1162 .byte 0x1c, 0x0e, 0x07 1163 1164 movdqa %xmm3, (%rdi, %rcx) 1165 add $16, %rcx 1166#ifdef USE_AS_STRNCPY 1167 cmp %r10, %r8 1168 jbe LABEL(unaligned_exit) 1169#endif 1170 jmp LABEL(ashr_7_use_ssse3) 1171 1172 .p2align 4 1173LABEL(ashr_7_use_sse2): 1174 pcmpeqb 16(%rsi, %rcx), %xmm0 1175 pmovmskb %xmm0, %edx 1176 test %edx, %edx 1177 jnz LABEL(unaligned_exit) 1178#ifdef USE_AS_STRNCPY 1179 sub $16, %r8 1180 jbe LABEL(strncpy_truncation_unaligned) 1181#endif 1182 1183 movdqa 16(%rsi, %rcx), %xmm3 1184 movdqa (%rsi, %rcx), %xmm2 1185 1186 psrldq $7, %xmm2 1187 pslldq $9, %xmm3 1188 por %xmm2, %xmm3 1189 1190 movdqa %xmm3, (%rdi, %rcx) 1191 add $16, %rcx 1192 1193#ifdef USE_AS_STRNCPY 1194 cmp %r10, %r8 1195 jbe LABEL(unaligned_exit) 1196#endif 1197 pcmpeqb 16(%rsi, %rcx), %xmm0 1198 pmovmskb %xmm0, %edx 1199 test %edx, %edx 1200 jnz LABEL(unaligned_exit) 1201#ifdef USE_AS_STRNCPY 1202 sub $16, %r8 1203 jbe LABEL(strncpy_truncation_unaligned) 1204#endif 1205 1206 movdqa 16(%rsi, %rcx), %xmm3 1207 movdqa (%rsi, %rcx), %xmm2 1208 1209 psrldq $7, %xmm2 1210 pslldq $9, %xmm3 1211 por %xmm2, %xmm3 1212 1213 movdqa %xmm3, (%rdi, %rcx) 1214 add $16, %rcx 1215#ifdef USE_AS_STRNCPY 1216 cmp %r10, %r8 1217 jbe LABEL(unaligned_exit) 1218#endif 1219 jmp LABEL(ashr_7_use_sse2) 1220 1221 1222/* 1223 * ashr_6 handles the following cases: 1224 * (16 + (src offset - dest offset)) % 16 = 6 1225 * 1226 * Based on above operation, start from (%r9 + rsi) to the left of this cache 1227 * bank, there is no null byte. 1228 */ 1229 .p2align 4 1230LABEL(ashr_6): 1231 xor %ecx, %ecx /* clear index */ 1232#ifdef USE_AS_STRNCPY 1233 cmp %r10, %r8 1234 jbe LABEL(unaligned_exit) 1235#endif 1236 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ 1237 jz LABEL(ashr_6_use_sse2) 1238 1239 .p2align 4 1240LABEL(ashr_6_use_ssse3): 1241 movdqa 16(%rsi, %rcx), %xmm3 1242 pcmpeqb %xmm3, %xmm0 1243 pmovmskb %xmm0, %edx 1244 test %edx, %edx 1245 jnz LABEL(unaligned_exit) 1246#ifdef USE_AS_STRNCPY 1247 sub $16, %r8 1248 jbe LABEL(strncpy_truncation_unaligned) 1249#endif 1250 1251 #palignr $6, (%rsi, %rcx), %xmm3 1252 .byte 0x66, 0x0F, 0x3A ,0x0F 1253 .byte 0x1c, 0x0e, 0x06 1254 1255 movdqa %xmm3, (%rdi, %rcx) 1256 add $16, %rcx 1257 1258#ifdef USE_AS_STRNCPY 1259 cmp %r10, %r8 1260 jbe LABEL(unaligned_exit) 1261#endif 1262 movdqa 16(%rsi, %rcx), %xmm3 1263 pcmpeqb %xmm3, %xmm0 1264 pmovmskb %xmm0, %edx 1265 test %edx, %edx 1266 jnz LABEL(unaligned_exit) 1267#ifdef USE_AS_STRNCPY 1268 sub $16, %r8 1269 jbe LABEL(strncpy_truncation_unaligned) 1270#endif 1271 1272 #palignr $6, (%rsi, %rcx), %xmm3 1273 .byte 0x66, 0x0F, 0x3A ,0x0F 1274 .byte 0x1c, 0x0e, 0x06 1275 1276 movdqa %xmm3, (%rdi, %rcx) 1277 add $16, %rcx 1278#ifdef USE_AS_STRNCPY 1279 cmp %r10, %r8 1280 jbe LABEL(unaligned_exit) 1281#endif 1282 jmp LABEL(ashr_6_use_ssse3) 1283 1284 .p2align 4 1285LABEL(ashr_6_use_sse2): 1286 pcmpeqb 16(%rsi, %rcx), %xmm0 1287 pmovmskb %xmm0, %edx 1288 test %edx, %edx 1289 jnz LABEL(unaligned_exit) 1290#ifdef USE_AS_STRNCPY 1291 sub $16, %r8 1292 jbe LABEL(strncpy_truncation_unaligned) 1293#endif 1294 1295 movdqa 16(%rsi, %rcx), %xmm3 1296 movdqa (%rsi, %rcx), %xmm2 1297 1298 psrldq $6, %xmm2 1299 pslldq $10, %xmm3 1300 por %xmm2, %xmm3 1301 1302 movdqa %xmm3, (%rdi, %rcx) 1303 add $16, %rcx 1304 1305#ifdef USE_AS_STRNCPY 1306 cmp %r10, %r8 1307 jbe LABEL(unaligned_exit) 1308#endif 1309 pcmpeqb 16(%rsi, %rcx), %xmm0 1310 pmovmskb %xmm0, %edx 1311 test %edx, %edx 1312 jnz LABEL(unaligned_exit) 1313#ifdef USE_AS_STRNCPY 1314 sub $16, %r8 1315 jbe LABEL(strncpy_truncation_unaligned) 1316#endif 1317 1318 movdqa 16(%rsi, %rcx), %xmm3 1319 movdqa (%rsi, %rcx), %xmm2 1320 1321 psrldq $6, %xmm2 1322 pslldq $10, %xmm3 1323 por %xmm2, %xmm3 1324 1325 movdqa %xmm3, (%rdi, %rcx) 1326 add $16, %rcx 1327#ifdef USE_AS_STRNCPY 1328 cmp %r10, %r8 1329 jbe LABEL(unaligned_exit) 1330#endif 1331 jmp LABEL(ashr_6_use_sse2) 1332 1333 1334/* 1335 * ashr_5 handles the following cases: 1336 * (16 + (src offset - dest offset)) % 16 = 5 1337 * 1338 * Based on above operation, start from (%r9 + rsi) to the left of this cache 1339 * bank, there is no null byte. 1340 */ 1341 .p2align 4 1342LABEL(ashr_5): 1343 xor %ecx, %ecx /* clear index */ 1344#ifdef USE_AS_STRNCPY 1345 cmp %r10, %r8 1346 jbe LABEL(unaligned_exit) 1347#endif 1348 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ 1349 jz LABEL(ashr_5_use_sse2) 1350 1351 .p2align 4 1352LABEL(ashr_5_use_ssse3): 1353 movdqa 16(%rsi, %rcx), %xmm3 1354 pcmpeqb %xmm3, %xmm0 1355 pmovmskb %xmm0, %edx 1356 test %edx, %edx 1357 jnz LABEL(unaligned_exit) 1358#ifdef USE_AS_STRNCPY 1359 sub $16, %r8 1360 jbe LABEL(strncpy_truncation_unaligned) 1361#endif 1362 1363 #palignr $5, (%rsi, %rcx), %xmm3 1364 .byte 0x66, 0x0F, 0x3A ,0x0F 1365 .byte 0x1c, 0x0e, 0x05 1366 1367 movdqa %xmm3, (%rdi, %rcx) 1368 add $16, %rcx 1369 1370#ifdef USE_AS_STRNCPY 1371 cmp %r10, %r8 1372 jbe LABEL(unaligned_exit) 1373#endif 1374 movdqa 16(%rsi, %rcx), %xmm3 1375 pcmpeqb %xmm3, %xmm0 1376 pmovmskb %xmm0, %edx 1377 test %edx, %edx 1378 jnz LABEL(unaligned_exit) 1379#ifdef USE_AS_STRNCPY 1380 sub $16, %r8 1381 jbe LABEL(strncpy_truncation_unaligned) 1382#endif 1383 1384 #palignr $5, (%rsi, %rcx), %xmm3 1385 .byte 0x66, 0x0F, 0x3A ,0x0F 1386 .byte 0x1c, 0x0e, 0x05 1387 1388 movdqa %xmm3, (%rdi, %rcx) 1389 add $16, %rcx 1390#ifdef USE_AS_STRNCPY 1391 cmp %r10, %r8 1392 jbe LABEL(unaligned_exit) 1393#endif 1394 jmp LABEL(ashr_5_use_ssse3) 1395 1396 .p2align 4 1397LABEL(ashr_5_use_sse2): 1398 pcmpeqb 16(%rsi, %rcx), %xmm0 1399 pmovmskb %xmm0, %edx 1400 test %edx, %edx 1401 jnz LABEL(unaligned_exit) 1402#ifdef USE_AS_STRNCPY 1403 sub $16, %r8 1404 jbe LABEL(strncpy_truncation_unaligned) 1405#endif 1406 1407 movdqa 16(%rsi, %rcx), %xmm3 1408 movdqa (%rsi, %rcx), %xmm2 1409 1410 psrldq $5, %xmm2 1411 pslldq $11, %xmm3 1412 por %xmm2, %xmm3 1413 1414 movdqa %xmm3, (%rdi, %rcx) 1415 add $16, %rcx 1416 1417#ifdef USE_AS_STRNCPY 1418 cmp %r10, %r8 1419 jbe LABEL(unaligned_exit) 1420#endif 1421 pcmpeqb 16(%rsi, %rcx), %xmm0 1422 pmovmskb %xmm0, %edx 1423 test %edx, %edx 1424 jnz LABEL(unaligned_exit) 1425#ifdef USE_AS_STRNCPY 1426 sub $16, %r8 1427 jbe LABEL(strncpy_truncation_unaligned) 1428#endif 1429 1430 movdqa 16(%rsi, %rcx), %xmm3 1431 movdqa (%rsi, %rcx), %xmm2 1432 1433 psrldq $5, %xmm2 1434 pslldq $11, %xmm3 1435 por %xmm2, %xmm3 1436 1437 movdqa %xmm3, (%rdi, %rcx) 1438 add $16, %rcx 1439#ifdef USE_AS_STRNCPY 1440 cmp %r10, %r8 1441 jbe LABEL(unaligned_exit) 1442#endif 1443 jmp LABEL(ashr_5_use_sse2) 1444 1445 1446/* 1447 * ashr_4 handles the following cases: 1448 * (16 + (src offset - dest offset)) % 16 = 4 1449 * 1450 * Based on above operation, start from (%r9 + rsi) to the left of this cache 1451 * bank, there is no null byte. 1452 */ 1453 .p2align 4 1454LABEL(ashr_4): 1455 xor %ecx, %ecx /* clear index */ 1456#ifdef USE_AS_STRNCPY 1457 cmp %r10, %r8 1458 jbe LABEL(unaligned_exit) 1459#endif 1460 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ 1461 jz LABEL(ashr_4_use_sse2) 1462 1463 .p2align 4 1464LABEL(ashr_4_use_ssse3): 1465 movdqa 16(%rsi, %rcx), %xmm3 1466 pcmpeqb %xmm3, %xmm0 1467 pmovmskb %xmm0, %edx 1468 test %edx, %edx 1469 jnz LABEL(unaligned_exit) 1470#ifdef USE_AS_STRNCPY 1471 sub $16, %r8 1472 jbe LABEL(strncpy_truncation_unaligned) 1473#endif 1474 1475 #palignr $4, (%rsi, %rcx), %xmm3 1476 .byte 0x66, 0x0F, 0x3A ,0x0F 1477 .byte 0x1c, 0x0e, 0x04 1478 1479 movdqa %xmm3, (%rdi, %rcx) 1480 add $16, %rcx 1481 1482#ifdef USE_AS_STRNCPY 1483 cmp %r10, %r8 1484 jbe LABEL(unaligned_exit) 1485#endif 1486 movdqa 16(%rsi, %rcx), %xmm3 1487 pcmpeqb %xmm3, %xmm0 1488 pmovmskb %xmm0, %edx 1489 test %edx, %edx 1490 jnz LABEL(unaligned_exit) 1491#ifdef USE_AS_STRNCPY 1492 sub $16, %r8 1493 jbe LABEL(strncpy_truncation_unaligned) 1494#endif 1495 1496 #palignr $4, (%rsi, %rcx), %xmm3 1497 .byte 0x66, 0x0F, 0x3A ,0x0F 1498 .byte 0x1c, 0x0e, 0x04 1499 1500 movdqa %xmm3, (%rdi, %rcx) 1501 add $16, %rcx 1502#ifdef USE_AS_STRNCPY 1503 cmp %r10, %r8 1504 jbe LABEL(unaligned_exit) 1505#endif 1506 jmp LABEL(ashr_4_use_ssse3) 1507 1508 .p2align 4 1509LABEL(ashr_4_use_sse2): 1510 pcmpeqb 16(%rsi, %rcx), %xmm0 1511 pmovmskb %xmm0, %edx 1512 test %edx, %edx 1513 jnz LABEL(unaligned_exit) 1514#ifdef USE_AS_STRNCPY 1515 sub $16, %r8 1516 jbe LABEL(strncpy_truncation_unaligned) 1517#endif 1518 1519 movdqa 16(%rsi, %rcx), %xmm3 1520 movdqa (%rsi, %rcx), %xmm2 1521 1522 psrldq $4, %xmm2 1523 pslldq $12, %xmm3 1524 por %xmm2, %xmm3 1525 1526 movdqa %xmm3, (%rdi, %rcx) 1527 add $16, %rcx 1528 1529#ifdef USE_AS_STRNCPY 1530 cmp %r10, %r8 1531 jbe LABEL(unaligned_exit) 1532#endif 1533 pcmpeqb 16(%rsi, %rcx), %xmm0 1534 pmovmskb %xmm0, %edx 1535 test %edx, %edx 1536 jnz LABEL(unaligned_exit) 1537#ifdef USE_AS_STRNCPY 1538 sub $16, %r8 1539 jbe LABEL(strncpy_truncation_unaligned) 1540#endif 1541 1542 movdqa 16(%rsi, %rcx), %xmm3 1543 movdqa (%rsi, %rcx), %xmm2 1544 1545 psrldq $4, %xmm2 1546 pslldq $12, %xmm3 1547 por %xmm2, %xmm3 1548 1549 movdqa %xmm3, (%rdi, %rcx) 1550 add $16, %rcx 1551#ifdef USE_AS_STRNCPY 1552 cmp %r10, %r8 1553 jbe LABEL(unaligned_exit) 1554#endif 1555 jmp LABEL(ashr_4_use_sse2) 1556 1557 1558/* 1559 * ashr_3 handles the following cases: 1560 * (16 + (src offset - dest offset)) % 16 = 3 1561 * 1562 * Based on above operation, start from (%r9 + rsi) to the left of this cache 1563 * bank, there is no null byte. 1564 */ 1565 .p2align 4 1566LABEL(ashr_3): 1567 xor %ecx, %ecx /* clear index */ 1568#ifdef USE_AS_STRNCPY 1569 cmp %r10, %r8 1570 jbe LABEL(unaligned_exit) 1571#endif 1572 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ 1573 jz LABEL(ashr_3_use_sse2) 1574 1575 .p2align 4 1576LABEL(ashr_3_use_ssse3): 1577 movdqa 16(%rsi, %rcx), %xmm3 1578 pcmpeqb %xmm3, %xmm0 1579 pmovmskb %xmm0, %edx 1580 test %edx, %edx 1581 jnz LABEL(unaligned_exit) 1582#ifdef USE_AS_STRNCPY 1583 sub $16, %r8 1584 jbe LABEL(strncpy_truncation_unaligned) 1585#endif 1586 1587 #palignr $3, (%rsi, %rcx), %xmm3 1588 .byte 0x66, 0x0F, 0x3A ,0x0F 1589 .byte 0x1c, 0x0e, 0x03 1590 1591 movdqa %xmm3, (%rdi, %rcx) 1592 add $16, %rcx 1593 1594#ifdef USE_AS_STRNCPY 1595 cmp %r10, %r8 1596 jbe LABEL(unaligned_exit) 1597#endif 1598 movdqa 16(%rsi, %rcx), %xmm3 1599 pcmpeqb %xmm3, %xmm0 1600 pmovmskb %xmm0, %edx 1601 test %edx, %edx 1602 jnz LABEL(unaligned_exit) 1603#ifdef USE_AS_STRNCPY 1604 sub $16, %r8 1605 jbe LABEL(strncpy_truncation_unaligned) 1606#endif 1607 1608 #palignr $3, (%rsi, %rcx), %xmm3 1609 .byte 0x66, 0x0F, 0x3A ,0x0F 1610 .byte 0x1c, 0x0e, 0x03 1611 1612 movdqa %xmm3, (%rdi, %rcx) 1613 add $16, %rcx 1614#ifdef USE_AS_STRNCPY 1615 cmp %r10, %r8 1616 jbe LABEL(unaligned_exit) 1617#endif 1618 jmp LABEL(ashr_3_use_ssse3) 1619 1620 .p2align 4 1621LABEL(ashr_3_use_sse2): 1622 pcmpeqb 16(%rsi, %rcx), %xmm0 1623 pmovmskb %xmm0, %edx 1624 test %edx, %edx 1625 jnz LABEL(unaligned_exit) 1626#ifdef USE_AS_STRNCPY 1627 sub $16, %r8 1628 jbe LABEL(strncpy_truncation_unaligned) 1629#endif 1630 1631 movdqa 16(%rsi, %rcx), %xmm3 1632 movdqa (%rsi, %rcx), %xmm2 1633 1634 psrldq $3, %xmm2 1635 pslldq $13, %xmm3 1636 por %xmm2, %xmm3 1637 1638 movdqa %xmm3, (%rdi, %rcx) 1639 add $16, %rcx 1640 1641#ifdef USE_AS_STRNCPY 1642 cmp %r10, %r8 1643 jbe LABEL(unaligned_exit) 1644#endif 1645 pcmpeqb 16(%rsi, %rcx), %xmm0 1646 pmovmskb %xmm0, %edx 1647 test %edx, %edx 1648 jnz LABEL(unaligned_exit) 1649#ifdef USE_AS_STRNCPY 1650 sub $16, %r8 1651 jbe LABEL(strncpy_truncation_unaligned) 1652#endif 1653 1654 movdqa 16(%rsi, %rcx), %xmm3 1655 movdqa (%rsi, %rcx), %xmm2 1656 1657 psrldq $3, %xmm2 1658 pslldq $13, %xmm3 1659 por %xmm2, %xmm3 1660 1661 movdqa %xmm3, (%rdi, %rcx) 1662 add $16, %rcx 1663#ifdef USE_AS_STRNCPY 1664 cmp %r10, %r8 1665 jbe LABEL(unaligned_exit) 1666#endif 1667 jmp LABEL(ashr_3_use_sse2) 1668 1669 1670/* 1671 * ashr_2 handles the following cases: 1672 * (16 + (src offset - dest offset)) % 16 = 2 1673 * 1674 * Based on above operation, start from (%r9 + rsi) to the left of this cache 1675 * bank, there is no null byte. 1676 */ 1677 .p2align 4 1678LABEL(ashr_2): 1679 xor %ecx, %ecx /* clear index */ 1680#ifdef USE_AS_STRNCPY 1681 cmp %r10, %r8 1682 jbe LABEL(unaligned_exit) 1683#endif 1684 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ 1685 jz LABEL(ashr_2_use_sse2) 1686 1687 .p2align 4 1688LABEL(ashr_2_use_ssse3): 1689 movdqa 16(%rsi, %rcx), %xmm3 1690 pcmpeqb %xmm3, %xmm0 1691 pmovmskb %xmm0, %edx 1692 test %edx, %edx 1693 jnz LABEL(unaligned_exit) 1694#ifdef USE_AS_STRNCPY 1695 sub $16, %r8 1696 jbe LABEL(strncpy_truncation_unaligned) 1697#endif 1698 1699 #palignr $2, (%rsi, %rcx), %xmm3 1700 .byte 0x66, 0x0F, 0x3A ,0x0F 1701 .byte 0x1c, 0x0e, 0x02 1702 1703 movdqa %xmm3, (%rdi, %rcx) 1704 add $16, %rcx 1705 1706#ifdef USE_AS_STRNCPY 1707 cmp %r10, %r8 1708 jbe LABEL(unaligned_exit) 1709#endif 1710 movdqa 16(%rsi, %rcx), %xmm3 1711 pcmpeqb %xmm3, %xmm0 1712 pmovmskb %xmm0, %edx 1713 test %edx, %edx 1714 jnz LABEL(unaligned_exit) 1715#ifdef USE_AS_STRNCPY 1716 sub $16, %r8 1717 jbe LABEL(strncpy_truncation_unaligned) 1718#endif 1719 1720 #palignr $2, (%rsi, %rcx), %xmm3 1721 .byte 0x66, 0x0F, 0x3A ,0x0F 1722 .byte 0x1c, 0x0e, 0x02 1723 1724 movdqa %xmm3, (%rdi, %rcx) 1725 add $16, %rcx 1726#ifdef USE_AS_STRNCPY 1727 cmp %r10, %r8 1728 jbe LABEL(unaligned_exit) 1729#endif 1730 jmp LABEL(ashr_2_use_ssse3) 1731 1732 .p2align 4 1733LABEL(ashr_2_use_sse2): 1734 pcmpeqb 16(%rsi, %rcx), %xmm0 1735 pmovmskb %xmm0, %edx 1736 test %edx, %edx 1737 jnz LABEL(unaligned_exit) 1738#ifdef USE_AS_STRNCPY 1739 sub $16, %r8 1740 jbe LABEL(strncpy_truncation_unaligned) 1741#endif 1742 1743 movdqa 16(%rsi, %rcx), %xmm3 1744 movdqa (%rsi, %rcx), %xmm2 1745 1746 psrldq $2, %xmm2 1747 pslldq $14, %xmm3 1748 por %xmm2, %xmm3 1749 1750 movdqa %xmm3, (%rdi, %rcx) 1751 add $16, %rcx 1752 1753#ifdef USE_AS_STRNCPY 1754 cmp %r10, %r8 1755 jbe LABEL(unaligned_exit) 1756#endif 1757 pcmpeqb 16(%rsi, %rcx), %xmm0 1758 pmovmskb %xmm0, %edx 1759 test %edx, %edx 1760 jnz LABEL(unaligned_exit) 1761#ifdef USE_AS_STRNCPY 1762 sub $16, %r8 1763 jbe LABEL(strncpy_truncation_unaligned) 1764#endif 1765 1766 movdqa 16(%rsi, %rcx), %xmm3 1767 movdqa (%rsi, %rcx), %xmm2 1768 1769 psrldq $2, %xmm2 1770 pslldq $14, %xmm3 1771 por %xmm2, %xmm3 1772 1773 movdqa %xmm3, (%rdi, %rcx) 1774 add $16, %rcx 1775#ifdef USE_AS_STRNCPY 1776 cmp %r10, %r8 1777 jbe LABEL(unaligned_exit) 1778#endif 1779 jmp LABEL(ashr_2_use_sse2) 1780 1781 1782/* 1783 * ashr_1 handles the following cases: 1784 * (16 + (src offset - dest offset)) % 16 = 1 1785 * 1786 * Based on above operation, start from (%r9 + rsi) to the left of this cache 1787 * bank, there is no null byte. 1788 */ 1789 .p2align 4 1790LABEL(ashr_1): 1791 xor %ecx, %ecx /* clear index */ 1792#ifdef USE_AS_STRNCPY 1793 cmp %r10, %r8 1794 jbe LABEL(unaligned_exit) 1795#endif 1796 testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ 1797 jz LABEL(ashr_1_use_sse2) 1798 1799 .p2align 4 1800LABEL(ashr_1_use_ssse3): 1801 movdqa 16(%rsi, %rcx), %xmm3 1802 pcmpeqb %xmm3, %xmm0 1803 pmovmskb %xmm0, %edx 1804 test %edx, %edx 1805 jnz LABEL(unaligned_exit) 1806#ifdef USE_AS_STRNCPY 1807 sub $16, %r8 1808 jbe LABEL(strncpy_truncation_unaligned) 1809#endif 1810 1811 #palignr $1, (%rsi, %rcx), %xmm3 1812 .byte 0x66, 0x0F, 0x3A ,0x0F 1813 .byte 0x1c, 0x0e, 0x01 1814 1815 movdqa %xmm3, (%rdi, %rcx) 1816 add $16, %rcx 1817 1818#ifdef USE_AS_STRNCPY 1819 cmp %r10, %r8 1820 jbe LABEL(unaligned_exit) 1821#endif 1822 movdqa 16(%rsi, %rcx), %xmm3 1823 pcmpeqb %xmm3, %xmm0 1824 pmovmskb %xmm0, %edx 1825 test %edx, %edx 1826 jnz LABEL(unaligned_exit) 1827#ifdef USE_AS_STRNCPY 1828 sub $16, %r8 1829 jbe LABEL(strncpy_truncation_unaligned) 1830#endif 1831 #palignr $1, (%rsi, %rcx), %xmm3 1832 .byte 0x66, 0x0F, 0x3A ,0x0F 1833 .byte 0x1c, 0x0e, 0x01 1834 1835 movdqa %xmm3, (%rdi, %rcx) 1836 add $16, %rcx 1837#ifdef USE_AS_STRNCPY 1838 cmp %r10, %r8 1839 jbe LABEL(unaligned_exit) 1840#endif 1841 jmp LABEL(ashr_1_use_ssse3) 1842 1843 .p2align 4 1844LABEL(ashr_1_use_sse2): 1845 pcmpeqb 16(%rsi, %rcx), %xmm0 1846 pmovmskb %xmm0, %edx 1847 test %edx, %edx 1848 jnz LABEL(unaligned_exit) 1849#ifdef USE_AS_STRNCPY 1850 sub $16, %r8 1851 jbe LABEL(strncpy_truncation_unaligned) 1852#endif 1853 movdqa 16(%rsi, %rcx), %xmm3 1854 movdqa (%rsi, %rcx), %xmm2 1855 1856 psrldq $1, %xmm2 1857 pslldq $15, %xmm3 1858 por %xmm2, %xmm3 1859 1860 movdqa %xmm3, (%rdi, %rcx) 1861 add $16, %rcx 1862 1863#ifdef USE_AS_STRNCPY 1864 cmp %r10, %r8 1865 jbe LABEL(unaligned_exit) 1866#endif 1867 pcmpeqb 16(%rsi, %rcx), %xmm0 1868 pmovmskb %xmm0, %edx 1869 test %edx, %edx 1870 jnz LABEL(unaligned_exit) 1871#ifdef USE_AS_STRNCPY 1872 sub $16, %r8 1873 jbe LABEL(strncpy_truncation_unaligned) 1874#endif 1875 1876 movdqa 16(%rsi, %rcx), %xmm3 1877 movdqa (%rsi, %rcx), %xmm2 1878 1879 psrldq $1, %xmm2 1880 pslldq $15, %xmm3 1881 por %xmm2, %xmm3 1882 1883 movdqa %xmm3, (%rdi, %rcx) 1884 add $16, %rcx 1885#ifdef USE_AS_STRNCPY 1886 cmp %r10, %r8 1887 jbe LABEL(unaligned_exit) 1888#endif 1889 jmp LABEL(ashr_1_use_sse2) 1890 1891 1892 /* 1893 * Exit tail code: 1894 * Up to 32 bytes are copied in the case of strcpy. 1895 */ 1896 .p2align 4 1897LABEL(less32bytes): 1898 xor %ecx, %ecx 1899LABEL(unaligned_exit): 1900 add %r9, %rsi /* r9 holds offset of rsi */ 1901 mov %rcx, %r9 1902 mov %r10, %rcx 1903 shl %cl, %edx /* after shl, calculate the exact number to be filled */ 1904 mov %r9, %rcx 1905 .p2align 4 1906LABEL(aligned_exit): 1907 add %rcx, %rdi /* locate exact address for rdi */ 1908LABEL(less16bytes): 1909 add %rcx, %rsi /* locate exact address for rsi */ 1910LABEL(aligned_16bytes): 1911#ifdef USE_AS_STRNCPY 1912 /* 1913 * Null found in 16bytes checked. Set bit in bitmask corresponding to 1914 * the strncpy count argument. We will copy to the null (inclusive) 1915 * or count whichever comes first. 1916 */ 1917 mov $1, %r9d 1918 lea -1(%r8), %rcx 1919 shl %cl, %r9d 1920 cmp $32, %r8 1921 ja LABEL(strncpy_tail) 1922 or %r9d, %edx 1923LABEL(strncpy_tail): 1924#endif 1925 /* 1926 * Check to see if BSF is fast on this processor. If not, use a 1927 * different exit tail. 
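 * When the USE_BSF flag is set in .memops_method, bsf turns the
 * pcmpeqb/pmovmskb bit mask directly into the index of the first null
 * byte and jumps through tail_table; otherwise the AMD_exit path below
 * selects the same tail_N handler by testing the mask bits one at a time.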
1928 */ 1929 testb $USE_BSF, .memops_method(%rip) 1930 jz LABEL(AMD_exit) 1931 bsf %rdx, %rcx /* Find byte with null char */ 1932 lea LABEL(tail_table)(%rip), %r11 1933 movslq (%r11, %rcx, 4), %rcx 1934 lea (%r11, %rcx), %rcx 1935 jmp *%rcx 1936 1937#ifdef USE_AS_STRNCPY 1938 /* 1939 * Count reached before null found. 1940 */ 1941 .p2align 4 1942LABEL(less32bytes_strncpy_truncation): 1943 xor %ecx, %ecx 1944LABEL(strncpy_truncation_unaligned): 1945 add %r9, %rsi /* next src char to copy */ 1946LABEL(strncpy_truncation_aligned): 1947 add %rcx, %rdi 1948 add %rcx, %rsi 1949 add $16, %r8 /* compensation */ 1950 lea -1(%r8), %rcx 1951 lea LABEL(tail_table)(%rip), %r11 1952 movslq (%r11, %rcx, 4), %rcx 1953 lea (%r11, %rcx), %rcx 1954 jmp *%rcx 1955 1956 .p2align 4 1957LABEL(strncpy_exitz): 1958 mov %rdi, %rax 1959 ret 1960#endif 1961 1962 .p2align 4 1963LABEL(AMD_exit): 1964 test %dl, %dl 1965 jz LABEL(AMD_exit_more_8) 1966 test $0x01, %dl 1967 jnz LABEL(tail_0) 1968 test $0x02, %dl 1969 jnz LABEL(tail_1) 1970 test $0x04, %dl 1971 jnz LABEL(tail_2) 1972 test $0x08, %dl 1973 jnz LABEL(tail_3) 1974 test $0x10, %dl 1975 jnz LABEL(tail_4) 1976 test $0x20, %dl 1977 jnz LABEL(tail_5) 1978 test $0x40, %dl 1979 jnz LABEL(tail_6) 1980 1981 .p2align 4 1982LABEL(tail_7): /* 8 bytes */ 1983 mov (%rsi), %rcx 1984 mov %rcx, (%rdi) 1985#ifdef USE_AS_STRNCPY 1986 mov $8, %cl 1987 sub $8, %r8 1988 jnz LABEL(strncpy_fill_tail) 1989#endif 1990 ret 1991 1992#ifdef USE_AS_STRNCPY 1993 /* 1994 * Null terminated src string shorter than count. Fill the rest of the 1995 * destination with null chars. 1996 */ 1997 .p2align 4 1998LABEL(strncpy_fill_tail): 1999 mov %rax, %rdx 2000 movzx %cl, %rax 2001 mov %r8, %rcx 2002 add %rax, %rdi 2003 xor %eax, %eax 2004 shr $3, %ecx 2005 jz LABEL(strncpy_fill_less_8) 2006 2007 rep stosq 2008LABEL(strncpy_fill_less_8): 2009 mov %r8, %rcx 2010 and $7, %rcx 2011 jz LABEL(strncpy_fill_return) 2012LABEL(strncpy_fill_less_7): 2013 sub $1, %ecx 2014 mov %al, (%rdi, %rcx) 2015 jnz LABEL(strncpy_fill_less_7) 2016LABEL(strncpy_fill_return): 2017 mov %rdx, %rax 2018 ret 2019#endif 2020 2021 .p2align 4 2022LABEL(tail_0): /* 1 byte */ 2023 mov (%rsi), %cl 2024 mov %cl, (%rdi) 2025#ifdef USE_AS_STRNCPY 2026 mov $1, %cl 2027 sub $1, %r8 2028 jnz LABEL(strncpy_fill_tail) 2029#endif 2030 ret 2031 2032 .p2align 4 2033LABEL(tail_1): /* 2 bytes */ 2034 mov (%rsi), %cx 2035 mov %cx, (%rdi) 2036#ifdef USE_AS_STRNCPY 2037 mov $2, %cl 2038 sub $2, %r8 2039 jnz LABEL(strncpy_fill_tail) 2040#endif 2041 ret 2042 2043 .p2align 4 2044LABEL(tail_2): /* 3 bytes */ 2045 mov (%rsi), %cx 2046 mov %cx, (%rdi) 2047 mov 1(%rsi), %cx 2048 mov %cx, 1(%rdi) 2049#ifdef USE_AS_STRNCPY 2050 mov $3, %cl 2051 sub $3, %r8 2052 jnz LABEL(strncpy_fill_tail) 2053#endif 2054 ret 2055 2056 .p2align 4 2057LABEL(tail_3): /* 4 bytes */ 2058 mov (%rsi), %ecx 2059 mov %ecx, (%rdi) 2060#ifdef USE_AS_STRNCPY 2061 mov $4, %cl 2062 sub $4, %r8 2063 jnz LABEL(strncpy_fill_tail) 2064#endif 2065 ret 2066 2067 .p2align 4 2068LABEL(tail_4): /* 5 bytes */ 2069 mov (%rsi), %ecx 2070 mov %ecx, (%rdi) 2071 mov 1(%rsi), %edx 2072 mov %edx, 1(%rdi) 2073#ifdef USE_AS_STRNCPY 2074 mov $5, %cl 2075 sub $5, %r8 2076 jnz LABEL(strncpy_fill_tail) 2077#endif 2078 ret 2079 2080 .p2align 4 2081LABEL(tail_5): /* 6 bytes */ 2082 mov (%rsi), %ecx 2083 mov %ecx, (%rdi) 2084 mov 2(%rsi), %edx 2085 mov %edx, 2(%rdi) 2086#ifdef USE_AS_STRNCPY 2087 mov $6, %cl 2088 sub $6, %r8 2089 jnz LABEL(strncpy_fill_tail) 2090#endif 2091 ret 2092 2093 .p2align 4 
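/*
 * Each tail_N handler copies the final N+1 bytes of the string (including
 * the terminating null when one was found) with a handful of scalar moves.
 * Sizes that are not 1, 2, 4 or 8 are handled with two overlapping moves
 * rather than a byte loop; e.g. the 7-byte handler (tail_6) just below does
 * a 4-byte move at offset 0 and another at offset 3, so byte 3 is simply
 * written twice.
 */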
2094LABEL(tail_6): /* 7 bytes */ 2095 mov (%rsi), %ecx 2096 mov %ecx, (%rdi) 2097 mov 3(%rsi), %edx 2098 mov %edx,3(%rdi) 2099#ifdef USE_AS_STRNCPY 2100 mov $7, %cl 2101 sub $7, %r8 2102 jnz LABEL(strncpy_fill_tail) 2103#endif 2104 ret 2105 2106 .p2align 4 2107LABEL(tail_8): /* 9 bytes */ 2108 mov (%rsi), %rcx 2109 mov %rcx, (%rdi) 2110 mov 5(%rsi), %edx 2111 mov %edx, 5(%rdi) 2112#ifdef USE_AS_STRNCPY 2113 mov $9, %cl 2114 sub $9, %r8 2115 jnz LABEL(strncpy_fill_tail) 2116#endif 2117 ret 2118 2119 .p2align 4 2120LABEL(AMD_exit_more_8): 2121 test %dh, %dh 2122 jz LABEL(AMD_exit_more_16) 2123 test $0x01, %dh 2124 jnz LABEL(tail_8) 2125 test $0x02, %dh 2126 jnz LABEL(tail_9) 2127 test $0x04, %dh 2128 jnz LABEL(tail_10) 2129 test $0x08, %dh 2130 jnz LABEL(tail_11) 2131 test $0x10, %dh 2132 jnz LABEL(tail_12) 2133 test $0x20, %dh 2134 jnz LABEL(tail_13) 2135 test $0x40, %dh 2136 jnz LABEL(tail_14) 2137 2138 .p2align 4 2139LABEL(tail_15): /* 16 bytes */ 2140 mov (%rsi), %rcx 2141 mov %rcx, (%rdi) 2142 mov 8(%rsi), %rdx 2143 mov %rdx, 8(%rdi) 2144#ifdef USE_AS_STRNCPY 2145 mov $16, %cl 2146 sub $16, %r8 2147 jnz LABEL(strncpy_fill_tail) 2148#endif 2149 ret 2150 2151 .p2align 4 2152LABEL(tail_9): /* 10 bytes */ 2153 mov (%rsi), %rcx 2154 mov %rcx, (%rdi) 2155 mov 6(%rsi), %edx 2156 mov %edx, 6(%rdi) 2157#ifdef USE_AS_STRNCPY 2158 mov $10, %cl 2159 sub $10, %r8 2160 jnz LABEL(strncpy_fill_tail) 2161#endif 2162 ret 2163 2164 .p2align 4 2165LABEL(tail_10): /* 11 bytes */ 2166 mov (%rsi), %rcx 2167 mov %rcx, (%rdi) 2168 mov 7(%rsi), %edx 2169 mov %edx, 7(%rdi) 2170#ifdef USE_AS_STRNCPY 2171 mov $11, %cl 2172 sub $11, %r8 2173 jnz LABEL(strncpy_fill_tail) 2174#endif 2175 ret 2176 2177 .p2align 4 2178LABEL(tail_11): /* 12 bytes */ 2179 mov (%rsi), %rcx 2180 mov %rcx, (%rdi) 2181 mov 8(%rsi), %edx 2182 mov %edx, 8(%rdi) 2183#ifdef USE_AS_STRNCPY 2184 mov $12, %cl 2185 sub $12, %r8 2186 jnz LABEL(strncpy_fill_tail) 2187#endif 2188 ret 2189 2190 .p2align 4 2191LABEL(tail_12): /* 13 bytes */ 2192 mov (%rsi), %rcx 2193 mov %rcx, (%rdi) 2194 mov 5(%rsi), %rcx 2195 mov %rcx, 5(%rdi) 2196#ifdef USE_AS_STRNCPY 2197 mov $13, %cl 2198 sub $13, %r8 2199 jnz LABEL(strncpy_fill_tail) 2200#endif 2201 ret 2202 2203 .p2align 4 2204LABEL(tail_13): /* 14 bytes */ 2205 mov (%rsi), %rcx 2206 mov %rcx, (%rdi) 2207 mov 6(%rsi), %rcx 2208 mov %rcx, 6(%rdi) 2209#ifdef USE_AS_STRNCPY 2210 mov $14, %cl 2211 sub $14, %r8 2212 jnz LABEL(strncpy_fill_tail) 2213#endif 2214 ret 2215 2216 .p2align 4 2217LABEL(tail_14): /* 15 bytes */ 2218 mov (%rsi), %rcx 2219 mov %rcx, (%rdi) 2220 mov 7(%rsi), %rcx 2221 mov %rcx, 7(%rdi) 2222#ifdef USE_AS_STRNCPY 2223 mov $15, %cl 2224 sub $15, %r8 2225 jnz LABEL(strncpy_fill_tail) 2226#endif 2227 ret 2228 2229 .p2align 4 2230LABEL(AMD_exit_more_16): 2231 shr $16, %edx 2232 test %dl, %dl 2233 jz LABEL(AMD_exit_more_24) 2234 test $0x01, %dl 2235 jnz LABEL(tail_16) 2236 test $0x02, %dl 2237 jnz LABEL(tail_17) 2238 test $0x04, %dl 2239 jnz LABEL(tail_18) 2240 test $0x08, %dl 2241 jnz LABEL(tail_19) 2242 test $0x10, %dl 2243 jnz LABEL(tail_20) 2244 test $0x20, %dl 2245 jnz LABEL(tail_21) 2246 test $0x40, %dl 2247 jnz LABEL(tail_22) 2248 2249 .p2align 4 2250LABEL(tail_23): /* 24 bytes */ 2251 mov (%rsi), %rcx 2252 mov %rcx, (%rdi) 2253 mov 8(%rsi), %rdx 2254 mov %rdx, 8(%rdi) 2255 mov 16(%rsi), %rcx 2256 mov %rcx, 16(%rdi) 2257#ifdef USE_AS_STRNCPY 2258 mov $24, %cl 2259 sub $24, %r8 2260 jnz LABEL(strncpy_fill_tail) 2261#endif 2262 ret 2263 2264 .p2align 4 2265LABEL(tail_16): /* 17 bytes */ 2266 mov 
(%rsi), %rcx 2267 mov %rcx, (%rdi) 2268 mov 8(%rsi), %rdx 2269 mov %rdx, 8(%rdi) 2270 mov 16(%rsi), %cl 2271 mov %cl, 16(%rdi) 2272#ifdef USE_AS_STRNCPY 2273 mov $17, %cl 2274 sub $17, %r8 2275 jnz LABEL(strncpy_fill_tail) 2276#endif 2277 ret 2278 2279 .p2align 4 2280LABEL(tail_17): /* 18 bytes */ 2281 mov (%rsi), %rcx 2282 mov %rcx, (%rdi) 2283 mov 8(%rsi), %rdx 2284 mov %rdx, 8(%rdi) 2285 mov 16(%rsi), %cx 2286 mov %cx, 16(%rdi) 2287#ifdef USE_AS_STRNCPY 2288 mov $18, %cl 2289 sub $18, %r8 2290 jnz LABEL(strncpy_fill_tail) 2291#endif 2292 ret 2293 2294 .p2align 4 2295LABEL(tail_18): /* 19 bytes */ 2296 mov (%rsi), %rcx 2297 mov %rcx, (%rdi) 2298 mov 8(%rsi), %rdx 2299 mov %rdx, 8(%rdi) 2300 mov 15(%rsi), %ecx 2301 mov %ecx,15(%rdi) 2302#ifdef USE_AS_STRNCPY 2303 mov $19, %cl 2304 sub $19, %r8 2305 jnz LABEL(strncpy_fill_tail) 2306#endif 2307 ret 2308 2309 .p2align 4 2310LABEL(tail_19): /* 20 bytes */ 2311 mov (%rsi), %rcx 2312 mov %rcx, (%rdi) 2313 mov 8(%rsi), %rdx 2314 mov %rdx, 8(%rdi) 2315 mov 16(%rsi), %ecx 2316 mov %ecx, 16(%rdi) 2317#ifdef USE_AS_STRNCPY 2318 mov $20, %cl 2319 sub $20, %r8 2320 jnz LABEL(strncpy_fill_tail) 2321#endif 2322 ret 2323 2324 .p2align 4 2325LABEL(tail_20): /* 21 bytes */ 2326 mov (%rsi), %rcx 2327 mov %rcx, (%rdi) 2328 mov 8(%rsi), %rdx 2329 mov %rdx, 8(%rdi) 2330 mov 13(%rsi), %rcx 2331 mov %rcx, 13(%rdi) 2332#ifdef USE_AS_STRNCPY 2333 mov $21, %cl 2334 sub $21, %r8 2335 jnz LABEL(strncpy_fill_tail) 2336#endif 2337 ret 2338 2339 .p2align 4 2340LABEL(tail_21): /* 22 bytes */ 2341 mov (%rsi), %rcx 2342 mov %rcx, (%rdi) 2343 mov 8(%rsi), %rdx 2344 mov %rdx, 8(%rdi) 2345 mov 14(%rsi), %rcx 2346 mov %rcx, 14(%rdi) 2347#ifdef USE_AS_STRNCPY 2348 mov $22, %cl 2349 sub $22, %r8 2350 jnz LABEL(strncpy_fill_tail) 2351#endif 2352 ret 2353 2354 .p2align 4 2355LABEL(tail_22): /* 23 bytes */ 2356 mov (%rsi), %rcx 2357 mov %rcx, (%rdi) 2358 mov 8(%rsi), %rdx 2359 mov %rdx, 8(%rdi) 2360 mov 15(%rsi), %rcx 2361 mov %rcx, 15(%rdi) 2362#ifdef USE_AS_STRNCPY 2363 mov $23, %cl 2364 sub $23, %r8 2365 jnz LABEL(strncpy_fill_tail) 2366#endif 2367 ret 2368 2369 .p2align 4 2370LABEL(AMD_exit_more_24): 2371 test $0x01, %dh 2372 jnz LABEL(tail_24) 2373 test $0x02, %dh 2374 jnz LABEL(tail_25) 2375 test $0x04, %dh 2376 jnz LABEL(tail_26) 2377 test $0x08, %dh 2378 jnz LABEL(tail_27) 2379 test $0x10, %dh 2380 jnz LABEL(tail_28) 2381 test $0x20, %dh 2382 jnz LABEL(tail_29) 2383 test $0x40, %dh 2384 jnz LABEL(tail_30) 2385 2386 .p2align 4 2387LABEL(tail_31): /* 32 bytes */ 2388 mov (%rsi), %rcx 2389 mov %rcx, (%rdi) 2390 mov 8(%rsi), %rdx 2391 mov %rdx, 8(%rdi) 2392 mov 16(%rsi), %rcx 2393 mov %rcx, 16(%rdi) 2394 mov 24(%rsi), %rdx 2395 mov %rdx, 24(%rdi) 2396#ifdef USE_AS_STRNCPY 2397 mov $32, %cl 2398 sub $32, %r8 2399 jnz LABEL(strncpy_fill_tail) 2400#endif 2401 ret 2402 2403 .p2align 4 2404LABEL(tail_24): /* 25 bytes */ 2405 mov (%rsi), %rcx 2406 mov %rcx, (%rdi) 2407 mov 8(%rsi), %rdx 2408 mov %rdx, 8(%rdi) 2409 mov 16(%rsi), %rcx 2410 mov %rcx, 16(%rdi) 2411 mov 21(%rsi), %edx 2412 mov %edx, 21(%rdi) 2413#ifdef USE_AS_STRNCPY 2414 mov $25, %cl 2415 sub $25, %r8 2416 jnz LABEL(strncpy_fill_tail) 2417#endif 2418 ret 2419 2420 .p2align 4 2421LABEL(tail_25): /* 26 bytes */ 2422 mov (%rsi), %rcx 2423 mov %rcx, (%rdi) 2424 mov 8(%rsi), %rdx 2425 mov %rdx, 8(%rdi) 2426 mov 16(%rsi), %rcx 2427 mov %rcx, 16(%rdi) 2428 mov 22(%rsi), %edx 2429 mov %edx, 22(%rdi) 2430#ifdef USE_AS_STRNCPY 2431 mov $26, %cl 2432 sub $26, %r8 2433 jnz LABEL(strncpy_fill_tail) 2434#endif 2435 ret 2436 2437 
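/*
 * Note on the USE_AS_STRNCPY epilogues in these tail handlers: when the
 * source string ends before the count is exhausted, control falls into
 * strncpy_fill_tail (above), which pads the rest of the destination with
 * null bytes, as strncpy(3C) requires.  Illustrative C sketch of the
 * intended semantics (simplified, not part of this file):
 *
 *	char *
 *	strncpy(char *dst, const char *src, size_t n)
 *	{
 *		size_t i = 0;
 *
 *		while (i < n && src[i] != '\0') {	// copy up to n chars
 *			dst[i] = src[i];
 *			i++;
 *		}
 *		while (i < n)				// pad remainder with nulls
 *			dst[i++] = '\0';
 *		return (dst);
 *	}
 */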
.p2align 4 2438LABEL(tail_26): /* 27 bytes */ 2439 mov (%rsi), %rcx 2440 mov %rcx, (%rdi) 2441 mov 8(%rsi), %rdx 2442 mov %rdx, 8(%rdi) 2443 mov 16(%rsi), %rcx 2444 mov %rcx, 16(%rdi) 2445 mov 23(%rsi), %edx 2446 mov %edx, 23(%rdi) 2447#ifdef USE_AS_STRNCPY 2448 mov $27, %cl 2449 sub $27, %r8 2450 jnz LABEL(strncpy_fill_tail) 2451#endif 2452 ret 2453 2454 .p2align 4 2455LABEL(tail_27): /* 28 bytes */ 2456 mov (%rsi), %rcx 2457 mov %rcx, (%rdi) 2458 mov 8(%rsi), %rdx 2459 mov %rdx, 8(%rdi) 2460 mov 16(%rsi), %rcx 2461 mov %rcx, 16(%rdi) 2462 mov 24(%rsi), %edx 2463 mov %edx, 24(%rdi) 2464#ifdef USE_AS_STRNCPY 2465 mov $28, %cl 2466 sub $28, %r8 2467 jnz LABEL(strncpy_fill_tail) 2468#endif 2469 ret 2470 2471 .p2align 4 2472LABEL(tail_28): /* 29 bytes */ 2473 mov (%rsi), %rcx 2474 mov %rcx, (%rdi) 2475 mov 8(%rsi), %rdx 2476 mov %rdx, 8(%rdi) 2477 mov 16(%rsi), %rcx 2478 mov %rcx, 16(%rdi) 2479 mov 21(%rsi), %rdx 2480 mov %rdx, 21(%rdi) 2481#ifdef USE_AS_STRNCPY 2482 mov $29, %cl 2483 sub $29, %r8 2484 jnz LABEL(strncpy_fill_tail) 2485#endif 2486 ret 2487 2488 .p2align 4 2489LABEL(tail_29): /* 30 bytes */ 2490 mov (%rsi), %rcx 2491 mov %rcx, (%rdi) 2492 mov 8(%rsi), %rdx 2493 mov %rdx, 8(%rdi) 2494 mov 16(%rsi), %rcx 2495 mov %rcx, 16(%rdi) 2496 mov 22(%rsi), %rdx 2497 mov %rdx, 22(%rdi) 2498#ifdef USE_AS_STRNCPY 2499 mov $30, %cl 2500 sub $30, %r8 2501 jnz LABEL(strncpy_fill_tail) 2502#endif 2503 ret 2504 2505 .p2align 4 2506LABEL(tail_30): /* 31 bytes */ 2507 mov (%rsi), %rcx 2508 mov %rcx, (%rdi) 2509 mov 8(%rsi), %rdx 2510 mov %rdx, 8(%rdi) 2511 mov 16(%rsi), %rcx 2512 mov %rcx, 16(%rdi) 2513 mov 23(%rsi), %rdx 2514 mov %rdx, 23(%rdi) 2515#ifdef USE_AS_STRNCPY 2516 mov $31, %cl 2517 sub $31, %r8 2518 jnz LABEL(strncpy_fill_tail) 2519#endif 2520 ret 2521 2522 .pushsection .rodata 2523 .p2align 4 2524LABEL(tail_table): 2525 .int LABEL(tail_0) - LABEL(tail_table) /* 1 byte */ 2526 .int LABEL(tail_1) - LABEL(tail_table) 2527 .int LABEL(tail_2) - LABEL(tail_table) 2528 .int LABEL(tail_3) - LABEL(tail_table) 2529 .int LABEL(tail_4) - LABEL(tail_table) 2530 .int LABEL(tail_5) - LABEL(tail_table) 2531 .int LABEL(tail_6) - LABEL(tail_table) 2532 .int LABEL(tail_7) - LABEL(tail_table) 2533 .int LABEL(tail_8) - LABEL(tail_table) 2534 .int LABEL(tail_9) - LABEL(tail_table) 2535 .int LABEL(tail_10) - LABEL(tail_table) 2536 .int LABEL(tail_11) - LABEL(tail_table) 2537 .int LABEL(tail_12) - LABEL(tail_table) 2538 .int LABEL(tail_13) - LABEL(tail_table) 2539 .int LABEL(tail_14) - LABEL(tail_table) 2540 .int LABEL(tail_15) - LABEL(tail_table) 2541 .int LABEL(tail_16) - LABEL(tail_table) 2542 .int LABEL(tail_17) - LABEL(tail_table) 2543 .int LABEL(tail_18) - LABEL(tail_table) 2544 .int LABEL(tail_19) - LABEL(tail_table) 2545 .int LABEL(tail_20) - LABEL(tail_table) 2546 .int LABEL(tail_21) - LABEL(tail_table) 2547 .int LABEL(tail_22) - LABEL(tail_table) 2548 .int LABEL(tail_23) - LABEL(tail_table) 2549 .int LABEL(tail_24) - LABEL(tail_table) 2550 .int LABEL(tail_25) - LABEL(tail_table) 2551 .int LABEL(tail_26) - LABEL(tail_table) 2552 .int LABEL(tail_27) - LABEL(tail_table) 2553 .int LABEL(tail_28) - LABEL(tail_table) 2554 .int LABEL(tail_29) - LABEL(tail_table) 2555 .int LABEL(tail_30) - LABEL(tail_table) 2556 .int LABEL(tail_31) - LABEL(tail_table) /* 32 bytes */ 2557 2558 .p2align 4 2559LABEL(unaligned_table): 2560 .int LABEL(ashr_0) - LABEL(unaligned_table) 2561 .int LABEL(ashr_1) - LABEL(unaligned_table) 2562 .int LABEL(ashr_2) - LABEL(unaligned_table) 2563 .int LABEL(ashr_3) - LABEL(unaligned_table) 
2564 .int LABEL(ashr_4) - LABEL(unaligned_table) 2565 .int LABEL(ashr_5) - LABEL(unaligned_table) 2566 .int LABEL(ashr_6) - LABEL(unaligned_table) 2567 .int LABEL(ashr_7) - LABEL(unaligned_table) 2568 .int LABEL(ashr_8) - LABEL(unaligned_table) 2569 .int LABEL(ashr_9) - LABEL(unaligned_table) 2570 .int LABEL(ashr_10) - LABEL(unaligned_table) 2571 .int LABEL(ashr_11) - LABEL(unaligned_table) 2572 .int LABEL(ashr_12) - LABEL(unaligned_table) 2573 .int LABEL(ashr_13) - LABEL(unaligned_table) 2574 .int LABEL(ashr_14) - LABEL(unaligned_table) 2575 .int LABEL(ashr_15) - LABEL(unaligned_table) 2576 .popsection 2577 2578#ifdef USE_AS_STRNCPY 2579 SET_SIZE(strncpy) 2580#else 2581 SET_SIZE(strcpy) /* (char *, const char *) */ 2582#endif 2583
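/*
 * Implementation notes (illustrative only, not assembled):
 *
 * The copy loops above work on 16-byte blocks.  Each source block is
 * compared byte-for-byte against zero with pcmpeqb and the result is
 * collapsed into a 16-bit mask with pmovmskb; a nonzero mask means the
 * terminating null lies in that block and the exit tails finish the copy.
 * When source and destination have different alignments, consecutive
 * source blocks are merged into one aligned store either with SSSE3
 * palignr or with the SSE2 psrldq/pslldq/por sequence, selected through
 * the USE_SSSE3 flag in .memops_method.
 *
 * A rough C equivalent of the aligned (ashr_0) loop, using SSE2
 * intrinsics; the identifiers below are descriptive only and do not
 * appear in this file.  Both pointers are assumed 16-byte aligned.
 *
 *	#include <emmintrin.h>
 *
 *	static size_t
 *	copy_aligned_blocks(char *dst, const char *src)
 *	{
 *		const __m128i zero = _mm_setzero_si128();
 *		size_t off = 0;
 *
 *		for (;;) {
 *			__m128i blk = _mm_load_si128((const __m128i *)(src + off));
 *			int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(blk, zero));
 *			if (mask != 0)
 *				break;			// null byte is in this block
 *			_mm_store_si128((__m128i *)(dst + off), blk);
 *			off += 16;
 *		}
 *		return (off);	// the assembly finishes via tail_table[bsf(mask)]
 *	}
 */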