1/*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org> 5*/ 6 7#include <machine/asm.h> 8#include <machine/param.h> 9 10 .weak strncmp 11 .set strncmp, __strncmp 12 .text 13 14ENTRY(__strncmp) 15 16 bic x8, x0, #0xf // x0 aligned to the boundary 17 and x9, x0, #0xf // x9 is the offset 18 bic x10, x1, #0xf // x1 aligned to the boundary 19 and x11, x1, #0xf // x11 is the offset 20 21 subs x2, x2, #1 22 b.lo .Lempty 23 24 mov x13, #-1 // save constants for later 25 mov x16, #0xf 26 27 /* 28 * Check if either string is located at end of page to avoid crossing 29 * into unmapped page. If so, we load 16 bytes from the nearest 30 * alignment boundary and shift based on the offset. 31 */ 32 33 add x3, x0, #16 // end of head 34 add x4, x1, #16 35 eor x3, x3, x0 36 eor x4, x4, x1 // bits that changed 37 orr x3, x3, x4 // in either str1 or str2 38 cmp x2,#16 39 b.lo .Llt16 40 tbz w3, #PAGE_SHIFT, .Lbegin 41 42 ldr q0, [x8] // load aligned head 43 ldr q1, [x10] 44 45 lsl x14, x9, #2 46 lsl x15, x11, #2 47 lsl x3, x13, x14 // string head 48 lsl x4, x13, x15 49 50 cmeq v5.16b, v0.16b, #0 51 cmeq v6.16b, v1.16b, #0 52 53 shrn v5.8b, v5.8h, #4 54 shrn v6.8b, v6.8h, #4 55 fmov x5, d5 56 fmov x6, d6 57 58 adrp x14, shift_data 59 add x14, x14, :lo12:shift_data 60 61 /* heads may cross page boundary, avoid unmapped loads */ 62 tst x5, x3 63 b.eq 0f 64 65 ldr q4, [x14, x9] // load permutation table 66 tbl v0.16b, {v0.16b}, v4.16b 67 68 b 1f 69 .p2align 4 700: 71 ldr q0, [x0] // load true head 721: 73 tst x6, x4 74 b.eq 0f 75 76 ldr q4, [x14, x11] 77 tbl v4.16b, {v1.16b}, v4.16b 78 79 b 1f 80 81 .p2align 4 82.Lbegin: 83 ldr q0, [x0] // load true heads 840: 85 ldr q4, [x1] 861: 87 cmeq v2.16b, v0.16b, #0 // NUL byte present? 88 cmeq v4.16b, v0.16b, v4.16b // which bytes match? 89 90 orn v2.16b, v2.16b, v4.16b // mismatch or NUL byte? 91 92 shrn v2.8b, v2.8h, #4 93 fmov x5, d2 94 95 cbnz x5, .Lhead_mismatch 96 /* load head and second chunk */ 97 ldr q2, [x8, #16] // load second chunk 98 ldr q3, [x10, #16] 99 100 add x2, x2, x11 101 sub x2, x2, #16 102 103 subs x9, x9, x11 // is a&0xf >= b&0xf 104 b.lo .Lswapped // if not swap operands 105 b .Lnormal 106 107 .p2align 4 108.Llt16: 109 /* 110 * Check if either string is located at end of page to avoid crossing 111 * into unmapped page. If so, we load 16 bytes from the nearest 112 * alignment boundary and shift based on the offset. 113 */ 114 tbz w3, #PAGE_SHIFT, 2f 115 116 ldr q0, [x8] // load aligned head 117 ldr q1, [x10] 118 119 lsl x14, x9, #2 120 lsl x15, x11, #2 121 lsl x3, x13, x14 // string head 122 lsl x4, x13, x15 123 124 /* Introduce a null byte match if the limit is within the aligned chunk */ 125 add x14, x2, x9 126 add x15, x2, x11 127 lsl x14, x14, #2 128 lsl x15, x15, #2 129 lsl x14, x16, x14 130 lsl x15, x16, x15 131 132 cmeq v5.16b, v0.16b, #0 133 cmeq v6.16b, v1.16b, #0 134 135 shrn v5.8b, v5.8h, #4 136 shrn v6.8b, v6.8h, #4 137 fmov x5, d5 138 fmov x6, d6 139 140 orr x5, x5, x14 // insert match at limit 141 orr x6, x6, x15 142 143 adrp x14, shift_data 144 add x14, x14, :lo12:shift_data 145 146 /* heads may cross page boundary, avoid unmapped loads */ 147 tst x5, x3 148 b.eq 0f 149 150 ldr q4, [x14, x9] // load permutation table 151 tbl v0.16b, {v0.16b}, v4.16b 152 153 b 1f 154 .p2align 4 1550: 156 ldr q0, [x0] // load true head 1571: 158 tst x6, x4 159 b.eq 0f 160 161 ldr q4, [x14, x11] 162 tbl v4.16b, {v1.16b}, v4.16b 163 164 b 1f 165 166 .p2align 4 1672: 168 ldr q0, [x0] // load true heads 1690: 170 ldr q4, [x1] 1711: 172 173 cmeq v2.16b, v0.16b, #0 // NUL byte present? 174 cmeq v4.16b, v0.16b, v4.16b // which bytes match? 175 176 bic v2.16b, v4.16b, v2.16b // match and not NUL byte 177 178 shrn v2.8b, v2.8h, #4 179 fmov x5, d2 180 lsl x4, x2, #2 181 lsl x4, x13, x4 182 orn x5, x4, x5 // mismatch or NUL byte? 183 184.Lhead_mismatch: 185 rbit x3, x5 186 clz x3, x3 // index of mismatch 187 lsr x3, x3, #2 188 ldrb w4, [x0, x3] 189 ldrb w5, [x1, x3] 190 sub w0, w4, w5 191 ret 192 193 .p2align 4 194.Lnormal: 195 sub x12, x10, x9 196 ldr q0, [x12, #16]! 197 sub x10, x10, x8 198 sub x11, x10, x9 199 200 cmeq v1.16b, v3.16b, #0 // NUL present? 201 cmeq v0.16b, v0.16b, v2.16b // Mismatch between chunks? 202 shrn v1.8b, v1.8h, #4 203 shrn v0.8b, v0.8h, #4 204 fmov x6, d1 205 fmov x5, d0 206 207 add x8, x8, #32 // advance to next iteration 208 209 lsl x4, x2, #2 210 lsl x4, x13, x4 211 orr x3, x6, x4 // introduce a null byte match 212 cmp x2, #16 // does the buffer end within x2 213 csel x6, x3, x6, lo 214 cbnz x6, .Lnulfound2 // NUL or end of buffer found? 215 mvn x5, x5 216 cbnz x5, .Lmismatch2 217 sub x2, x2, #16 218 cmp x2, #32 // end of buffer? 219 b.lo .Ltail 220 /* 221 * During the main loop, the layout of the two strings is something like: 222 * 223 * v ------1------ v ------2------ v 224 * X0: AAAAAAAAAAAAABBBBBBBBBBBBBBBB... 225 * X1: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC... 226 * 227 * where v indicates the alignment boundaries and corresponding chunks 228 * of the strings have the same letters. Chunk A has been checked in 229 * the previous iteration. This iteration, we first check that string 230 * X1 doesn't end within region 2, then we compare chunk B between the 231 * two strings. As X1 is known not to hold a NUL byte in regions 1 232 * and 2 at this point, this also ensures that x0 has not ended yet. 233 */ 234 .p2align 4 2350: 236 ldr q0, [x8, x11] 237 ldr q1, [x8, x10] 238 ldr q2, [x8] 239 240 cmeq v1.16b, v1.16b, #0 // end of string? 241 cmeq v0.16b, v0.16b, v2.16b // do the chunks match? 242 243 shrn v1.8b, v1.8h, #4 244 shrn v0.8b, v0.8h, #4 245 fmov x6, d1 246 fmov x5, d0 247 cbnz x6, .Lnulfound 248 mvn x5, x5 // any mismatches? 249 cbnz x5, .Lmismatch 250 251 add x8, x8, #16 252 253 /* main loop unrolled twice */ 254 ldr q0, [x8, x11] 255 ldr q1, [x8, x10] 256 ldr q2, [x8] 257 258 add x8, x8, #16 259 cmeq v1.16b, v1.16b, #0 260 cmeq v0.16b, v0.16b, v2.16b 261 262 shrn v1.8b, v1.8h, #4 263 shrn v0.8b, v0.8h, #4 264 fmov x6, d1 265 fmov x5, d0 266 cbnz x6, .Lnulfound2 267 mvn x5, x5 268 cbnz x5, .Lmismatch2 269 sub x2, x2, #32 270 cmp x2, #32 // end of buffer? 271 b.hs 0b // if yes, process tail 272 273 /* end of buffer will occur in next 32 bytes */ 274.Ltail: 275 ldr q0, [x8, x11] 276 ldr q1, [x8, x10] 277 ldr q2, [x8] 278 279 cmeq v1.16b, v1.16b, #0 // end of string? 280 cmeq v0.16b, v0.16b, v2.16b // do the chunks match? 281 282 shrn v1.8b, v1.8h, #4 283 shrn v0.8b, v0.8h, #4 284 fmov x6, d1 285 fmov x5, d0 286 287 /* 288 * If x2 <= 16 then we introduce a NUL byte in the 289 * result from CMEQ to avoid comparing further! 290 */ 291 292 lsl x4, x2, #2 293 lsl x4, x13, x4 294 orr x3, x6, x4 // introduce a null byte match 295 cmp x2, #16 // does the buffer end within x2 296 csel x6, x3, x6, lo 297 298 cbnz x6, .Lnulfound // NUL or end of string found 299 mvn x5, x5 300 cbnz x5, .Lmismatch 301 302 add x8, x8, #16 303 304 /* main loop unrolled twice */ 305 ldr q0, [x8, x11] 306 ldr q1, [x8, x10] 307 ldr q2, [x8] 308 309 add x8, x8, #16 310 cmeq v1.16b, v1.16b, #0 311 cmeq v0.16b, v0.16b, v2.16b 312 313 shrn v1.8b, v1.8h, #4 314 shrn v0.8b, v0.8h, #4 315 fmov x6, d1 316 fmov x5, d0 317 318 ubfiz x4, x2, #2, #4 // (x2 - 16) << 2 319 lsl x4, x13, x4 // take first half into account 320 orr x6, x6, x4 // introduce a null byte match 321 322.Lnulfound2: 323 sub x8, x8, #16 324 325.Lnulfound: 326 mov x4, x6 327 328 ubfiz x7, x9, #2, #4 329 lsl x6, x6, x7 // adjust NUL mask to indices 330 331 orn x5, x6, x5 332 cbnz x5, .Lmismatch 333 334 /* 335 * (x0) == (x1) and NUL is past the string. 336 * Compare (x1) with the corresponding part 337 * of the other string until the NUL byte. 338 */ 339 ldr q0, [x8, x9] 340 ldr q1, [x8, x10] 341 342 cmeq v1.16b, v0.16b, v1.16b 343 shrn v1.8b, v1.8h, #4 344 fmov x5, d1 345 346 orn x5, x4, x5 347 348 rbit x3, x5 349 clz x3, x3 350 lsr x5, x3, #2 351 352 add x10, x10, x8 // restore x10 pointer 353 add x8, x8, x9 // point to corresponding chunk 354 355 ldrb w4, [x8, x5] 356 ldrb w5, [x10, x5] 357 sub w0, w4, w5 358 ret 359 360 .p2align 4 361.Lmismatch2: 362 sub x8, x8, #16 // roll back second increment 363.Lmismatch: 364 rbit x3, x5 365 clz x3, x3 // index of mismatch 366 lsr x3, x3, #2 367 add x11, x8, x11 368 369 ldrb w4, [x8, x3] 370 ldrb w5, [x11, x3] 371 sub w0, w4, w5 // byte difference 372 ret 373 374 /* 375 * If (a&0xf) < (b&0xf), we do the same thing but with swapped 376 * operands. I found that this performs slightly better than 377 * using conditional moves to do the swap branchless. 378 */ 379 .p2align 4 380.Lswapped: 381 add x12, x8, x9 382 ldr q0, [x12, #16]! 383 sub x8, x8, x10 384 add x11, x8, x9 385 add x2,x2,x9 386 neg x9, x9 387 388 cmeq v1.16b, v2.16b, #0 389 cmeq v0.16b, v0.16b, v3.16b 390 shrn v1.8b, v1.8h, #4 391 shrn v0.8b, v0.8h, #4 392 fmov x6, d1 393 fmov x5, d0 394 395 add x10, x10, #32 396 397 lsl x4, x2, #2 398 lsl x4, x13, x4 399 orr x3,x6,x4 // introduce a null byte match 400 cmp x2,#16 401 csel x6, x3, x6, lo 402 cbnz x6, .Lnulfound2s 403 mvn x5, x5 404 cbnz x5, .Lmismatch2s 405 406 sub x2, x2, #16 407 cmp x2, #32 408 b.lo .Ltails 409 410 /* 411 * During the main loop, the layout of the two strings is something like: 412 * 413 * v ------1------ v ------2------ v 414 * X1: AAAAAAAAAAAAABBBBBBBBBBBBBBBB... 415 * X0: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC... 416 * 417 * where v indicates the alignment boundaries and corresponding chunks 418 * of the strings have the same letters. Chunk A has been checked in 419 * the previous iteration. This iteration, we first check that string 420 * X0 doesn't end within region 2, then we compare chunk B between the 421 * two strings. As X0 is known not to hold a NUL byte in regions 1 422 * and 2 at this point, this also ensures that X1 has not ended yet. 423 */ 424 .p2align 4 4250: 426 ldr q0, [x10, x11] 427 ldr q1, [x10, x8] 428 ldr q2, [x10] 429 430 cmeq v1.16b, v1.16b, #0 431 cmeq v0.16b, v0.16b, v2.16b 432 433 shrn v1.8b, v1.8h, #4 434 shrn v0.8b, v0.8h, #4 435 fmov x6, d1 436 fmov x5, d0 437 cbnz x6, .Lnulfounds 438 mvn x5, x5 439 cbnz x5, .Lmismatchs 440 441 add x10, x10, #16 442 443 /* main loop unrolled twice */ 444 ldr q0, [x10, x11] 445 ldr q1, [x10, x8] 446 ldr q2, [x10] 447 448 add x10, x10, #16 449 cmeq v1.16b, v1.16b, #0 450 cmeq v0.16b, v0.16b, v2.16b 451 452 shrn v1.8b, v1.8h, #4 453 shrn v0.8b, v0.8h, #4 454 fmov x6, d1 455 fmov x5, d0 456 cbnz x6, .Lnulfound2s 457 mvn x5, x5 458 cbnz x5, .Lmismatch2s 459 sub x2, x2, #32 460 cmp x2, #32 461 b.hs 0b 462 463.Ltails: 464 ldr q0, [x10, x11] 465 ldr q1, [x10, x8] 466 ldr q2, [x10] 467 468 cmeq v1.16b, v1.16b, #0 469 cmeq v0.16b, v0.16b, v2.16b 470 471 shrn v1.8b, v1.8h, #4 472 shrn v0.8b, v0.8h, #4 473 fmov x6, d1 474 fmov x5, d0 475 476 /* 477 * If x2 <= 16 then we introduce a NUL byte in the 478 * result from CMEQ to avoid comparing further! 479 */ 480 481 lsl x4, x2, #2 482 lsl x4, x13, x4 483 orr x3, x6, x4 // introduce a null byte match 484 cmp x2, #16 485 csel x6, x3, x6, lo 486 487 cbnz x6, .Lnulfounds 488 mvn x5, x5 489 cbnz x5, .Lmismatchs 490 491 add x10, x10, #16 492 493 ldr q0, [x10, x11] 494 ldr q1, [x10, x8] 495 ldr q2, [x10] 496 497 add x10, x10, #16 498 cmeq v1.16b, v1.16b, #0 499 cmeq v0.16b, v0.16b, v2.16b 500 501 shrn v1.8b, v1.8h, #4 502 shrn v0.8b, v0.8h, #4 503 fmov x6, d1 504 fmov x5, d0 505 506 ubfiz x4, x2, #2, #4 507 lsl x4, x13, x4 508 orr x6, x6, x4 // introduce a null byte match 509 510.Lnulfound2s: 511 sub x10, x10, #16 512.Lnulfounds: 513 mov x4, x6 514 515 ubfiz x7, x9, #2, #4 516 lsl x6, x6, x7 517 518 orn x5, x6, x5 519 520 cbnz x5, .Lmismatchs 521 522 ldr q0, [x10, x9] 523 ldr q1, [x10, x8] 524 525 cmeq v1.16b, v0.16b, v1.16b 526 shrn v1.8b, v1.8h, #4 527 fmov x5, d1 528 529 orn x5, x4, x5 530 531 rbit x3, x5 532 clz x3, x3 533 lsr x5, x3, #2 534 535 add x11, x10, x8 536 add x10, x10, x9 537 538 ldrb w4, [x10, x5] 539 ldrb w5, [x11, x5] 540 sub w0, w5, w4 541 ret 542 543 .p2align 4 544.Lmismatch2s: 545 sub x10, x10, #16 546.Lmismatchs: 547 rbit x3, x5 548 clz x3, x3 549 lsr x3, x3, #2 550 add x11, x10, x11 551 552 ldrb w4, [x10, x3] 553 ldrb w5, [x11, x3] 554 sub w0, w5, w4 555 ret 556 557 .p2align 4 558.Lempty: 559 eor x0, x0, x0 560 ret 561 562END(__strncmp) 563 564 .section .rodata 565 .p2align 4 566shift_data: 567 .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 568 .fill 16, 1, -1 569 .size shift_data, .-shift_data 570