1/*- 2 * Copyright (c) 2023 The FreeBSD Foundation 3 * 4 * This software was developed by Robert Clausecker <fuz@FreeBSD.org> 5 * under sponsorship from the FreeBSD Foundation. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE 27 */ 28 29#include <machine/asm.h> 30#include <machine/param.h> 31 32#include "amd64_archlevel.h" 33 34#define ALIGN_TEXT .p2align 4, 0x90 35 36ARCHFUNCS(strncmp) 37 ARCHFUNC(strncmp, scalar) 38 ARCHFUNC(strncmp, baseline) 39ENDARCHFUNCS(strncmp) 40 41/* 42 * This is just the scalar loop unrolled a bunch of times. 43 */ 44ARCHENTRY(strncmp, scalar) 45 xor %eax, %eax 46 sub $4, %rdx # 4 chars left to compare? 47 jbe 1f 48 49 ALIGN_TEXT 500: movzbl (%rdi), %ecx 51 test %ecx, %ecx # NUL char in first string? 52 jz .L0 53 cmpb (%rsi), %cl # mismatch between strings? 54 jnz .L0 55 56 movzbl 1(%rdi), %ecx 57 test %ecx, %ecx 58 jz .L1 59 cmpb 1(%rsi), %cl 60 jnz .L1 61 62 movzbl 2(%rdi), %ecx 63 test %ecx, %ecx 64 jz .L2 65 cmpb 2(%rsi), %cl 66 jnz .L2 67 68 movzbl 3(%rdi), %ecx 69 test %ecx, %ecx 70 jz .L3 71 cmpb 3(%rsi), %cl 72 jnz .L3 73 74 add $4, %rdi # advance to next iteration 75 add $4, %rsi 76 sub $4, %rdx 77 ja 0b 78 79 /* end of string within the next 4 characters */ 801: cmp $-4, %edx # end of string reached immediately? 81 jz .Leq 82 movzbl (%rdi), %ecx 83 test %ecx, %ecx 84 jz .L0 85 cmpb (%rsi), %cl 86 jnz .L0 87 88 cmp $-3, %edx # end of string reached after 1 char? 89 jz .Leq 90 movzbl 1(%rdi), %ecx 91 test %ecx, %ecx 92 jz .L1 93 cmpb 1(%rsi), %cl 94 jnz .L1 95 96 cmp $-2, %edx 97 jz .Leq 98 movzbl 2(%rdi), %ecx 99 test %ecx, %ecx 100 jz .L2 101 cmpb 2(%rsi), %cl 102 jnz .L2 103 104 cmp $-1, %edx # either end of string after 3 chars, 105 jz .Leq # or it boils down to the last char 106 107.L3: inc %eax 108.L2: inc %eax 109.L1: inc %eax 110.L0: movzbl (%rsi, %rax, 1), %ecx 111 movzbl (%rdi, %rax, 1), %eax 112 sub %ecx, %eax 113.Leq: ret 114ARCHEND(strncmp, scalar) 115 116ARCHENTRY(strncmp, baseline) 117 push %rbx 118 sub $1, %rdx # RDX--, so RDX points to the last byte to compare 119 jb .Lempty # where there any bytes to compare at all? 120 121 lea 15(%rdi), %r8d # end of head 122 lea 15(%rsi), %r9d 123 mov %edi, %eax 124 mov %esi, %ebx 125 xor %edi, %r8d # bits that changed between first and last byte 126 xor %esi, %r9d 127 and $~0xf, %rdi # align heads to 16 bytes 128 and $~0xf, %rsi 129 or %r8d, %r9d 130 and $0xf, %eax # offset from alignment 131 and $0xf, %ebx 132 movdqa (%rdi), %xmm0 # load aligned heads 133 movdqa (%rsi), %xmm2 134 pxor %xmm1, %xmm1 135 cmp $16, %rdx # end of buffer within the first 32 bytes? 136 jb .Llt16 137 138 test $PAGE_SIZE, %r9d # did the page change? 139 jz 0f # if not, take fast path 140 141 142 /* heads may cross page boundary, avoid unmapped loads */ 143 movdqa %xmm0, -32(%rsp) # stash copies of the heads on the stack 144 movdqa %xmm2, -16(%rsp) 145 mov $-1, %r8d 146 mov $-1, %r9d 147 mov %eax, %ecx 148 shl %cl, %r8d # string head in XMM0 149 mov %ebx, %ecx 150 shl %cl, %r9d # string head in XMM2 151 pcmpeqb %xmm1, %xmm0 152 pcmpeqb %xmm1, %xmm2 153 pmovmskb %xmm0, %r10d 154 pmovmskb %xmm2, %r11d 155 test %r8d, %r10d # NUL byte present in first string? 156 lea -32(%rsp), %r8 157 cmovz %rdi, %r8 158 test %r9d, %r11d # NUL byte present in second string? 159 lea -16(%rsp), %r9 160 cmovz %rsi, %r9 161 movdqu (%r8, %rax, 1), %xmm0 # load true (or fake) heads 162 movdqu (%r9, %rbx, 1), %xmm4 163 jmp 1f 164 165 /* rdx == 0 */ 166.Lempty: 167 xor %eax, %eax # zero-length buffers compare equal 168 pop %rbx 169 ret 170 1710: movdqu (%rdi, %rax, 1), %xmm0 # load true heads 172 movdqu (%rsi, %rbx, 1), %xmm4 1731: pxor %xmm2, %xmm2 174 pcmpeqb %xmm0, %xmm2 # NUL byte present? 175 pcmpeqb %xmm0, %xmm4 # which bytes match? 176 pandn %xmm4, %xmm2 # match and not NUL byte? 177 pmovmskb %xmm2, %r9d 178 xor $0xffff, %r9d # mismatch or NUL byte? 179 jnz .Lhead_mismatch 180 181 /* load head and second chunk */ 182 movdqa 16(%rdi), %xmm2 # load second chunks 183 movdqa 16(%rsi), %xmm3 184 lea -16(%rdx, %rbx, 1), %rdx # account for length of RSI chunk 185 sub %rbx, %rax # is a&0xf >= b&0xf? 186 jb .Lswapped # if not, proceed with swapped operands 187 jmp .Lnormal 188 189 /* buffer ends within the first 16 bytes */ 190.Llt16: test $PAGE_SIZE, %r9d # did the page change? 191 jz 0f # if not, take fast path 192 193 /* heads may cross page boundary */ 194 movdqa %xmm0, -32(%rsp) # stash copies of the heads on the stack 195 movdqa %xmm2, -16(%rsp) 196 mov $-1, %r8d 197 mov $-1, %r9d 198 mov %eax, %ecx 199 shl %cl, %r8d # string head in XMM0 200 mov %ebx, %ecx 201 shl %cl, %r9d # string head in XMM2 202 pcmpeqb %xmm1, %xmm0 203 pcmpeqb %xmm1, %xmm2 204 pmovmskb %xmm0, %r10d 205 pmovmskb %xmm2, %r11d 206 lea (%rdx, %rax, 1), %ecx # location of last buffer byte in xmm0 207 bts %ecx, %r10d # treat as if NUL byte present 208 lea (%rdx, %rbx, 1), %ecx 209 bts %ecx, %r11d 210 test %r8w, %r10w # NUL byte present in first string head? 211 lea -32(%rsp), %r8 212 cmovz %rdi, %r8 213 test %r9w, %r11w # NUL byte present in second string head? 214 lea -16(%rsp), %r9 215 cmovz %rsi, %r9 216 movdqu (%r8, %rax, 1), %xmm0 # load true (or fake) heads 217 movdqu (%r9, %rbx, 1), %xmm4 218 jmp 1f 219 2200: movdqu (%rdi, %rax, 1), %xmm0 # load true heads 221 movdqu (%rsi, %rbx, 1), %xmm4 2221: pxor %xmm2, %xmm2 223 pcmpeqb %xmm0, %xmm2 # NUL byte present? 224 pcmpeqb %xmm0, %xmm4 # which bytes match? 225 pandn %xmm4, %xmm2 # match and not NUL byte? 226 pmovmskb %xmm2, %r9d 227 btr %edx, %r9d # induce mismatch in last byte of buffer 228 not %r9d # mismatch or NUL byte? 229 230 /* mismatch in true heads */ 231 ALIGN_TEXT 232.Lhead_mismatch: 233 tzcnt %r9d, %r9d # where is the mismatch? 234 add %rax, %rdi # return to true heads 235 add %rbx, %rsi 236 movzbl (%rdi, %r9, 1), %eax # mismatching characters 237 movzbl (%rsi, %r9, 1), %ecx 238 sub %ecx, %eax 239 pop %rbx 240 ret 241 242 /* rax >= 0 */ 243 ALIGN_TEXT 244.Lnormal: 245 neg %rax 246 movdqu 16(%rsi, %rax, 1), %xmm0 247 sub %rdi, %rsi # express RSI as distance from RDI 248 lea (%rsi, %rax, 1), %rbx # point RBX to offset in second string 249 neg %rax # ... corresponding to RDI 250 pcmpeqb %xmm3, %xmm1 # NUL present? 251 pcmpeqb %xmm2, %xmm0 # Mismatch between chunks? 252 pmovmskb %xmm1, %r8d 253 pmovmskb %xmm0, %r9d 254 mov $16, %ecx 255 cmp %rcx, %rdx # does the buffer end within (RDI,RSI,1)? 256 cmovb %edx, %ecx # ECX = min(16, RDX) 257 add $32, %rdi # advance to next iteration 258 bts %ecx, %r8d # mark end-of-buffer as if there was a NUL byte 259 test %r8w, %r8w # NUL or end of buffer found? 260 jnz .Lnul_found2 261 xor $0xffff, %r9d 262 jnz .Lmismatch2 263 sub $48, %rdx # end of buffer within first main loop iteration? 264 jb .Ltail # if yes, process tail 265 266 /* 267 * During the main loop, the layout of the two strings is something like: 268 * 269 * v ------1------ v ------2------ v 270 * RDI: AAAAAAAAAAAAABBBBBBBBBBBBBBBB... 271 * RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC... 272 * 273 * where v indicates the alignment boundaries and corresponding chunks 274 * of the strings have the same letters. Chunk A has been checked in 275 * the previous iteration. This iteration, we first check that string 276 * RSI doesn't end within region 2, then we compare chunk B between the 277 * two strings. As RSI is known not to hold a NUL byte in regsions 1 278 * and 2 at this point, this also ensures that RDI has not ended yet. 279 */ 280 ALIGN_TEXT 2810: movdqu (%rdi, %rbx, 1), %xmm0 # chunk of 2nd string corresponding to RDI 282 pxor %xmm1, %xmm1 283 pcmpeqb (%rdi, %rsi, 1), %xmm1 # end of string in RSI? 284 pcmpeqb (%rdi), %xmm0 # where do the chunks match? 285 pmovmskb %xmm1, %r8d 286 pmovmskb %xmm0, %r9d 287 test %r8d, %r8d 288 jnz .Lnul_found 289 xor $0xffff, %r9d # any mismatches? 290 jnz .Lmismatch 291 292 /* main loop unrolled twice */ 293 movdqu 16(%rdi, %rbx, 1), %xmm0 294 pxor %xmm1, %xmm1 295 pcmpeqb 16(%rdi, %rsi, 1), %xmm1 296 pcmpeqb 16(%rdi), %xmm0 297 pmovmskb %xmm1, %r8d 298 pmovmskb %xmm0, %r9d 299 add $32, %rdi 300 test %r8d, %r8d 301 jnz .Lnul_found2 302 xor $0xffff, %r9d 303 jnz .Lmismatch2 304 sub $32, %rdx # end of buffer within next iteration? 305 jae 0b 306 307 /* end of buffer will occur in next 32 bytes */ 308.Ltail: movdqu (%rdi, %rbx, 1), %xmm0 # chunk of 2nd string corresponding to RDI 309 pxor %xmm1, %xmm1 310 pcmpeqb (%rdi, %rsi, 1), %xmm1 # end of string in RSI? 311 pcmpeqb (%rdi), %xmm0 # where do the chunks match? 312 pmovmskb %xmm1, %r8d 313 pmovmskb %xmm0, %r9d 314 bts %edx, %r8d # indicate NUL byte at last byte in buffer 315 test %r8w, %r8w # NUL byte in first chunk? 316 jnz .Lnul_found 317 xor $0xffff, %r9d # any mismatches? 318 jnz .Lmismatch 319 320 /* main loop unrolled twice */ 321 movdqu 16(%rdi, %rbx, 1), %xmm0 322 pxor %xmm1, %xmm1 323 pcmpeqb 16(%rdi, %rsi, 1), %xmm1 324 pcmpeqb 16(%rdi), %xmm0 325 pmovmskb %xmm1, %r8d 326 pmovmskb %xmm0, %r9d 327 sub $16, %edx # take first half into account 328 bts %edx, %r8d # indicate NUL byte at last byte in buffer 329 add $32, %rdi 330 331.Lnul_found2: 332 sub $16, %rdi 333 334.Lnul_found: 335 mov %eax, %ecx 336 mov %r8d, %r10d 337 shl %cl, %r8d # adjust NUL mask to positions in RDI/RBX 338 not %r9d # mask of mismatches 339 or %r8w, %r9w # NUL bytes als count as mismatches 340 jnz .Lmismatch 341 342 /* 343 * (RDI) == (RSI) and NUL is past the string. 344 * compare (RSI) with the corresponding part 345 * of the other string until the NUL byte. 346 */ 347 movdqu (%rdi, %rax, 1), %xmm0 348 pcmpeqb (%rdi, %rsi, 1), %xmm0 349 add %rdi, %rsi # restore RSI pointer 350 add %rax, %rdi # point RDI to chunk corresponding to (RSI) 351 pmovmskb %xmm0, %ecx # mask of matches 352 not %ecx # mask of mismatches 353 or %r10d, %ecx # mask of mismatches or NUL bytes 354 tzcnt %ecx, %ecx # location of first mismatch 355 movzbl (%rdi, %rcx, 1), %eax 356 movzbl (%rsi, %rcx, 1), %ecx 357 sub %ecx, %eax 358 pop %rbx 359 ret 360 361.Lmismatch2: 362 sub $16, %rdi 363 364 /* a mismatch has been found between RBX and RSI */ 365.Lmismatch: 366 tzcnt %r9d, %r9d # where is the mismatch? 367 add %rdi, %rbx # turn RBX from offset into pointer 368 movzbl (%rbx, %r9, 1), %ecx 369 movzbl (%rdi, %r9, 1), %eax 370 sub %ecx, %eax 371 pop %rbx 372 ret 373 374 /* rax < 0 */ 375 ALIGN_TEXT 376.Lswapped: 377 movdqu 16(%rdi, %rax, 1), %xmm0 378 sub %rsi, %rdi # express RDI as distance from RDI 379 lea (%rdi, %rax, 1), %rbx # point RBX to offset in first string 380 pcmpeqb %xmm2, %xmm1 # NUL present? 381 pcmpeqb %xmm3, %xmm0 # mismatch between chunks? 382 pmovmskb %xmm1, %r8d 383 pmovmskb %xmm0, %r9d 384 add %rax, %rdx # RDX points to buffer end in RSI 385 neg %rax # ... corresponding to RSI 386 mov $16, %ecx 387 cmp %rcx, %rdx # does the buffer end within (RSI,RDI,1)? 388 cmovb %edx, %ecx # ECX = min(16, RDX) 389 add $32, %rsi 390 bts %ecx, %r8d # mark end-of-buffer as if there was a NUL byte 391 test %r8w, %r8w # NUL or end of buffer found? 392 jnz .Lnul_found2s 393 xor $0xffff, %r9d 394 jnz .Lmismatch2s 395 sub $48, %rdx # end of buffer within first main loop iteration? 396 jb .Ltails # if yes, process tail 397 398 ALIGN_TEXT 3990: movdqu (%rsi, %rbx, 1), %xmm0 # chunk of 1st string corresponding to RSI 400 pxor %xmm1, %xmm1 401 pcmpeqb (%rsi, %rdi, 1), %xmm1 # end of string in RDI? 402 pcmpeqb (%rsi), %xmm0 # where do the chunks match? 403 pmovmskb %xmm1, %r8d 404 pmovmskb %xmm0, %r9d 405 test %r8d, %r8d 406 jnz .Lnul_founds 407 xor $0xffff, %r9d # any mismatches? 408 jnz .Lmismatchs 409 410 /* main loop unrolled twice */ 411 movdqu 16(%rsi, %rbx, 1), %xmm0 412 pxor %xmm1, %xmm1 413 pcmpeqb 16(%rsi, %rdi, 1), %xmm1 414 pcmpeqb 16(%rsi), %xmm0 415 pmovmskb %xmm1, %r8d 416 pmovmskb %xmm0, %r9d 417 add $32, %rsi 418 test %r8d, %r8d 419 jnz .Lnul_found2s 420 xor $0xffff, %r9d 421 jnz .Lmismatch2s 422 sub $32, %rdx # end of buffer within next iteration? 423 jae 0b 424 425 /* end of buffer will occur in next 32 bytes */ 426.Ltails: 427 movdqu (%rsi, %rbx, 1), %xmm0 # chunk of 1st string corresponding to RSI 428 pxor %xmm1, %xmm1 429 pcmpeqb (%rsi, %rdi, 1), %xmm1 # end of string in RDI? 430 pcmpeqb (%rsi), %xmm0 # where do the chunks match? 431 pmovmskb %xmm1, %r8d 432 pmovmskb %xmm0, %r9d 433 bts %edx, %r8d # indicate NUL byte at laste byte in buffer 434 test %r8w, %r8w # NUL byte in first chunk? 435 jnz .Lnul_founds 436 xor $0xffff, %r9d # any mismatches? 437 jnz .Lmismatchs 438 439 /* main loop unrolled twice */ 440 movdqu 16(%rsi, %rbx, 1), %xmm0 441 pxor %xmm1, %xmm1 442 pcmpeqb 16(%rsi, %rdi, 1), %xmm1 443 pcmpeqb 16(%rsi), %xmm0 444 pmovmskb %xmm1, %r8d 445 pmovmskb %xmm0, %r9d 446 sub $16, %edx # take first half into account 447 bts %edx, %r8d # indicate NUL byte at laste byte in buffer 448 add $32, %rsi 449 450.Lnul_found2s: 451 sub $16, %rsi 452 453.Lnul_founds: 454 mov %eax, %ecx 455 mov %r8d, %r10d 456 shl %cl, %r8d # adjust NUL mask to positions in RSI/RBX 457 not %r9d # mask of mismatches 458 or %r8w, %r9w # NUL bytes also count as mismatches 459 jnz .Lmismatchs 460 461 movdqu (%rsi, %rax, 1), %xmm0 462 pcmpeqb (%rsi, %rdi, 1), %xmm0 463 add %rsi, %rdi # restore RDI pointer 464 add %rax, %rsi # point RSI to chunk corresponding to (RDI) 465 pmovmskb %xmm0, %ecx # mask of matches 466 not %ecx # mask of mismatches 467 or %r10d, %ecx # mask of mismatches or NUL bytes 468 tzcnt %ecx, %ecx # location of first mismatch 469 movzbl (%rdi, %rcx, 1), %eax 470 movzbl (%rsi, %rcx, 1), %ecx 471 sub %ecx, %eax 472 pop %rbx 473 ret 474 475.Lmismatch2s: 476 sub $16, %rsi 477 478.Lmismatchs: 479 tzcnt %r9d, %r9d # where is the mismatch? 480 add %rsi, %rbx # turn RBX from offset into pointer 481 movzbl (%rbx, %r9, 1), %eax 482 movzbl (%rsi, %r9, 1), %ecx 483 sub %ecx, %eax 484 pop %rbx 485 ret 486ARCHEND(strncmp, baseline) 487 488 .section .note.GNU-stack,"",%progbits 489