/*
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * size_t strcspn(const char *s, const char *set) -- amd64 kernels.
 *
 * ABI:    SysV AMD64.  In: %rdi = s, %rsi = set.  Out: %rax = length of
 *         the initial segment of s containing no character from set.
 * The ARCHFUNCS/ARCHENTRY machinery (amd64_archlevel.h) dispatches to the
 * best kernel for the CPU's architecture level at load time -- presumably
 * scalar is the baseline and x86_64_v2 requires SSE4.2; confirm against
 * amd64_archlevel.h.
 */

#include <machine/asm.h>
#include <machine/param.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4,0x90	/* 16-byte alignment, nop filled */

ARCHFUNCS(strcspn)
	ARCHFUNC(strcspn, scalar)
	NOARCHFUNC
	ARCHFUNC(strcspn, x86_64_v2)
ENDARCHFUNCS(strcspn)

/*
 * Scalar kernel: build a 256-entry membership table for set on the stack,
 * then scan s four bytes per iteration until a table hit (match or NUL).
 * Sets of size 0 and 1 degrade to strlen() and strchrnul() respectively.
 */
ARCHENTRY(strcspn, scalar)
	push	%rbp			# align stack to enable function call
	mov	%rsp, %rbp
	sub	$256, %rsp		# allocate space for lookup table

	/* check for special cases */
	movzbl	(%rsi), %eax		# first character in the set
	test	%eax, %eax
	jz	.Lstrlen		# empty set: result is strlen(s)

	movzbl	1(%rsi), %edx		# second character in the set
	test	%edx, %edx
	jz	.Lstrchr		# one-char set: result via strchrnul(s)

	/*
	 * No special case matches -- prepare lookup table.
	 * Zero all 256 bytes: 8 iterations, each storing 4 x 8 bytes at
	 * offset %rcx*8; %ecx counts 28, 24, ..., 0, then wraps (jnc).
	 */
	xor	%r8d, %r8d
	mov	$28, %ecx
0:	mov	%r8, (%rsp, %rcx, 8)
	mov	%r8, 8(%rsp, %rcx, 8)
	mov	%r8, 16(%rsp, %rcx, 8)
	mov	%r8, 24(%rsp, %rcx, 8)
	sub	$4, %ecx
	jnc	0b

	add	$2, %rsi		# first two set bytes already loaded
	movb	$1, (%rsp, %rax, 1)	# register first chars in set
	movb	$1, (%rsp, %rdx, 1)
	mov	%rdi, %rax		# a copy of the source to iterate over

	/*
	 * Process remaining chars in set, two per iteration.  The NUL
	 * terminator is registered, too: it is stored before the
	 * end-of-set test, so a table hit also stops the scan at NUL.
	 */
	ALIGN_TEXT
0:	movzbl	(%rsi), %ecx
	movb	$1, (%rsp, %rcx, 1)
	test	%ecx, %ecx
	jz	1f

	movzbl	1(%rsi), %ecx
	movb	$1, (%rsp, %rcx, 1)
	test	%ecx, %ecx
	jz	1f

	add	$2, %rsi
	jmp	0b

	/*
	 * Find match: four table probes per iteration.  The exits 2f/3f/4f
	 * and the sub $3 fall-through adjust %rax (and %rdi) so that
	 * %rax - %rdi ends up equal to the offset of the matching byte.
	 */
	ALIGN_TEXT
1:	movzbl	(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	jne	2f			# match at offset 0 in this group

	movzbl	1(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	jne	3f			# match at offset 1

	movzbl	2(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	jne	4f			# match at offset 2

	movzbl	3(%rax), %ecx
	add	$4, %rax
	cmpb	$0, (%rsp, %rcx, 1)
	je	1b			# no match at offset 3: next group

	sub	$3, %rax		# undo advance, then +2 via 4f/3f below
4:	dec	%rdi			# +1 to the difference %rax - %rdi
3:	inc	%rax
2:	sub	%rdi, %rax		# number of characters preceding match
	leave
	ret

	/* set is empty, degrades to strlen */
.Lstrlen:
	leave
	jmp	CNAME(strlen)		# tail call; strlen's return is ours

	/* just one character in set, degrades to strchr */
.Lstrchr:
	mov	%rdi, (%rsp)		# stash a copy of the string
	mov	%eax, %esi		# find the character in the set
	call	CNAME(strchrnul)	# returns ptr to match or to NUL
	sub	(%rsp), %rax		# length of prefix before match
	leave
	ret
ARCHEND(strcspn, scalar)

	/*
	 * This kernel uses pcmpistri to do the heavy lifting.
	 * We provide five code paths, depending on set size:
	 *
	 *      0: call strlen()
	 *      1: call strchr()
	 *  2--16: one pcmpistri per 16 bytes of input
	 * 17--32: two pcmpistri per 16 bytes of input
	 *   >=33: fall back to look up table
	 *
	 * pcmpistri $0 (unsigned bytes, equal-any, least significant
	 * index) sets CF when a set byte occurs in the string chunk and
	 * ZF when the chunk contains a NUL, with the match index in %ecx.
	 * Hence: jb/jc = match, jbe = match or end of string, ja = neither.
	 *
	 * Stack layout: 0(%rsp)..31 scratch for the fake string head,
	 * 32(%rsp)..79 an aligned copy of up to three 16-byte set chunks
	 * (or, on the >=33 path, the full 256-byte lookup table).
	 */
ARCHENTRY(strcspn, x86_64_v2)
	push	%rbp
	mov	%rsp, %rbp
	sub	$256, %rsp

	/* check for special cases */
	movzbl	(%rsi), %eax
	test	%eax, %eax		# empty string?
	jz	.Lstrlenv2

	cmpb	$0, 1(%rsi)		# single character string?
	jz	.Lstrchrv2

	/*
	 * Find set size and copy up to 32 bytes to (%rsp).  The set is
	 * read in aligned 16-byte chunks (cannot fault past its page);
	 * junk bytes before the actual start are shifted out of the
	 * NUL-byte mask below.
	 */
	mov	%esi, %ecx
	and	$~0xf, %rsi		# align set pointer
	movdqa	(%rsi), %xmm0
	pxor	%xmm1, %xmm1
	and	$0xf, %ecx		# amount of bytes rsi is past alignment
	xor	%edx, %edx
	pcmpeqb	%xmm0, %xmm1		# end of string reached?
	movdqa	%xmm0, 32(%rsp)		# transfer head of set to stack
	pmovmskb %xmm1, %eax
	shr	%cl, %eax		# clear out junk before string
	test	%eax, %eax		# end of set reached?
	jnz	0f

	movdqa	16(%rsi), %xmm0		# second chunk of the set
	mov	$16, %edx
	sub	%ecx, %edx		# length of set preceding xmm0
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm0, %xmm1
	movdqa	%xmm0, 48(%rsp)
	movdqu	32(%rsp, %rcx, 1), %xmm2 # head of set
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	1f

	movdqa	32(%rsi), %xmm0		# third chunk
	add	$16, %edx
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm0, %xmm1
	movdqa	%xmm0, 64(%rsp)
	pmovmskb %xmm1, %eax
	test	%eax, %eax		# still not done?
	jz	.Lgt32v2		# 48+ bytes seen: use lookup table

0:	movdqu	32(%rsp, %rcx, 1), %xmm2 # head of set
1:	tzcnt	%eax, %eax		# offset of NUL in current chunk
	add	%eax, %edx		# length of set (excluding NUL byte)
	cmp	$32, %edx		# above 32 bytes?
	ja	.Lgt32v2

	/*
	 * At this point we know that we want to use pcmpistri.
	 * One last problem obtains: the head of the string is not
	 * aligned and may cross a page boundary, in which case reading
	 * 16 bytes could fault.  If so, we take the part before the
	 * page boundary and repeat the last byte to fill up the xmm
	 * register (the filler can only introduce spurious NUL/match
	 * bytes *after* the real ones, so the least-significant index
	 * is unaffected).
	 */
	mov	%rdi, %rax		# save original string pointer
	lea	15(%rdi), %esi		# last byte of the head
	xor	%edi, %esi		# bits differing between first/last byte
	test	$PAGE_SIZE, %esi	# does the head cross a page?
	jz	0f

	/* head crosses page: copy to stack to fix up */
	and	$~0xf, %rax		# align head pointer temporarily
	movzbl	15(%rax), %esi		# last head byte on the page
	movdqa	(%rax), %xmm0
	movabs	$0x0101010101010101, %r8
	imul	%r8, %rsi		# repeated 8 times
	movdqa	%xmm0, (%rsp)		# head word on stack
	mov	%rsi, 16(%rsp)		# followed by filler (last byte x8)
	mov	%rsi, 24(%rsp)
	mov	%edi, %eax
	and	$0xf, %eax		# offset of head from alignment
	add	%rsp, %rax		# pointer to fake head

0:	movdqu	(%rax), %xmm0		# load head (fake or real)
	lea	16(%rdi), %rax
	and	$~0xf, %rax		# second 16 bytes of string (aligned)
1:	cmp	$16, %edx		# 16--32 bytes?
	ja	.Lgt16v2


	/* set is 2--16 bytes in size: fits one pcmpistri operand (xmm2) */

	/* _SIDD_UBYTE_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT */
	pcmpistri $0, %xmm0, %xmm2	# match in head?
	jbe	.Lheadmatchv2

	/* main loop: two aligned 16-byte chunks per iteration */
	ALIGN_TEXT
0:	pcmpistri $0, (%rax), %xmm2
	jbe	1f			# match or end of string?
	pcmpistri $0, 16(%rax), %xmm2
	lea	32(%rax), %rax
	ja	0b			# match or end of string?

	/* hit in second chunk: %rax has advanced 32, step back to it */
3:	lea	-16(%rax), %rax		# go back to second half
1:	jc	2f			# jump if match found
	movdqa	(%rax), %xmm0		# reload string piece
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm1, %xmm0		# where is the NUL byte?
	pmovmskb %xmm0, %ecx
	tzcnt	%ecx, %ecx		# location of NUL byte in (%rax)
2:	sub	%rdi, %rax		# offset of %xmm0 from beginning of string
	add	%rcx, %rax		# prefix length before match/NUL
	leave
	ret

	/* match or string end within the (possibly fake) head chunk */
.Lheadmatchv2:
	jc	2f			# jump if match found
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %ecx
	tzcnt	%ecx, %ecx		# location of NUL byte
2:	mov	%ecx, %eax		# prefix length before match/NUL
	leave
	ret

	/* match in first set half during head */
.Lheadmatchv2first:
	mov	%ecx, %eax
	pcmpistri $0, %xmm0, %xmm3	# match in second set half?
	cmp	%ecx, %eax		# before the first half match?
	cmova	%ecx, %eax		# use the earlier match
	leave
	ret

.Lgt16v2:
	movdqu	48(%rsp, %rcx, 1), %xmm3 # second part of set

	/* set is 17--32 bytes in size: check both halves per chunk */
	pcmpistri $0, %xmm0, %xmm2	# match in first set half?
	jb	.Lheadmatchv2first
	pcmpistri $0, %xmm0, %xmm3	# match in second set half or end of string?
	jbe	.Lheadmatchv2

	ALIGN_TEXT
0:	movdqa	(%rax), %xmm0
	pcmpistri $0, %xmm0, %xmm2
	jb	4f			# match in first set half?
	pcmpistri $0, %xmm0, %xmm3
	jbe	1f			# match in second set half or end of string?
	movdqa	16(%rax), %xmm0
	add	$32, %rax
	pcmpistri $0, %xmm0, %xmm2
	jb	3f			# match in first set half?
	pcmpistri $0, %xmm0, %xmm3
	ja	0b			# neither match in 2nd half nor string end?

	/* match in second half or NUL */
	lea	-16(%rax), %rax		# go back to second half
1:	jc	2f			# jump if match found
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm1, %xmm0		# where is the NUL byte?
	pmovmskb %xmm0, %ecx
	tzcnt	%ecx, %ecx		# location of NUL byte in (%rax)
2:	sub	%rdi, %rax		# offset of %xmm0 from beginning of string
	add	%rcx, %rax		# prefix length before match/NUL
	leave
	ret

	/*
	 * Match in first half: a byte from the second set half may still
	 * match *earlier* in the same chunk, so take the lesser index.
	 */
3:	sub	$16, %rax		# go back to second half
4:	sub	%rdi, %rax		# offset of %xmm0 from beginning of string
	mov	%ecx, %edx
	pcmpistri $0, %xmm0, %xmm3	# match in second set half?
	cmp	%ecx, %edx		# before the first half match?
	cmova	%ecx, %edx		# use the earlier match
	add	%rdx, %rax		# return full offset
	leave
	ret

	/* set is empty, degrades to strlen */
.Lstrlenv2:
	leave
	jmp	CNAME(strlen)		# tail call; strlen's return is ours

	/* just one character in set, degrades to strchr */
.Lstrchrv2:
	mov	%rdi, (%rsp)		# stash a copy of the string
	mov	%eax, %esi		# find this character
	call	CNAME(strchrnul)	# returns ptr to match or to NUL
	sub	(%rsp), %rax		# length of prefix before match
	leave
	ret

	/* set is >=33 bytes in size: same algorithm as the scalar kernel */
.Lgt32v2:
	xorps	%xmm0, %xmm0
	mov	$256-64, %edx		# clear 64 bytes/iter, offsets 192..0

	/* clear out look up table */
0:	movaps	%xmm0, (%rsp, %rdx, 1)
	movaps	%xmm0, 16(%rsp, %rdx, 1)
	movaps	%xmm0, 32(%rsp, %rdx, 1)
	movaps	%xmm0, 48(%rsp, %rdx, 1)
	sub	$64, %edx
	jnc	0b

	add	%rcx, %rsi		# restore string pointer
	mov	%rdi, %rax		# keep a copy of the string

	/*
	 * Initialise look up table, four set bytes per iteration.  The
	 * NUL terminator is registered before the end test, so a table
	 * hit below also stops at the end of the string.
	 */
	ALIGN_TEXT
0:	movzbl	(%rsi), %ecx
	movb	$1, (%rsp, %rcx, 1)
	test	%ecx, %ecx
	jz	1f

	movzbl	1(%rsi), %ecx
	movb	$1, (%rsp, %rcx, 1)
	test	%ecx, %ecx
	jz	1f

	movzbl	2(%rsi), %ecx
	movb	$1, (%rsp, %rcx, 1)
	test	%ecx, %ecx
	jz	1f

	movzbl	3(%rsi), %ecx
	movb	$1, (%rsp, %rcx, 1)
	test	%ecx, %ecx
	jz	1f

	add	$4, %rsi
	jmp	0b

	/*
	 * Find match; same exit bookkeeping as the scalar kernel:
	 * 2f/3f/4f and the fall-through fix up %rax/%rdi so that
	 * %rax - %rdi is the offset of the matching byte.
	 */
	ALIGN_TEXT
1:	movzbl	(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	jne	2f			# match at offset 0 in this group

	movzbl	1(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	jne	3f			# match at offset 1

	movzbl	2(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	jne	4f			# match at offset 2

	movzbl	3(%rax), %ecx
	add	$4, %rax
	cmpb	$0, (%rsp, %rcx, 1)
	je	1b			# no match at offset 3: next group

	sub	$3, %rax		# undo advance, then +2 via 4f/3f below
4:	dec	%rdi			# +1 to the difference %rax - %rdi
3:	inc	%rax
2:	sub	%rdi, %rax		# number of characters preceding match
	leave
	ret
ARCHEND(strcspn, x86_64_v2)

	.section .note.GNU-stack,"",%progbits