/*
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/asm.h>
#include <machine/param.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4,0x90	/* 16-byte alignment, nop filled */

	.weak strcspn
	.set strcspn, __strcspn
ARCHFUNCS(__strcspn)
	ARCHFUNC(__strcspn, scalar)
	NOARCHFUNC
	ARCHFUNC(__strcspn, x86_64_v2)
ENDARCHFUNCS(__strcspn)

ARCHENTRY(__strcspn, scalar)
	push	%rbp			# align stack to enable function call
	mov	%rsp, %rbp
	sub	$256, %rsp		# allocate space for lookup table

	/* check for special cases */
	movzbl	(%rsi), %eax		# first character in the set
	test	%eax, %eax
	jz	.Lstrlen

	movzbl	1(%rsi), %edx		# second character in the set
	test	%edx, %edx
	jz	.Lstrchr

	/* no special case matches -- prepare lookup table */
	xor	%r8d, %r8d
	mov	$28, %ecx
0:	mov	%r8, (%rsp, %rcx, 8)
	mov	%r8, 8(%rsp, %rcx, 8)
	mov	%r8, 16(%rsp, %rcx, 8)
	mov	%r8, 24(%rsp, %rcx, 8)
	sub	$4, %ecx
	jnc	0b

	add	$2, %rsi
	movb	$1, (%rsp, %rax, 1)	# register first chars in set
	movb	$1, (%rsp, %rdx, 1)
	mov	%rdi, %rax		# a copy of the source to iterate over

	/* process remaining chars in set */
	ALIGN_TEXT
0:	movzbl	(%rsi), %ecx
	movb	$1, (%rsp, %rcx, 1)
	test	%ecx, %ecx
	jz	1f

	movzbl	1(%rsi), %ecx
	movb	$1, (%rsp, %rcx, 1)
	test	%ecx, %ecx
	jz	1f

	add	$2, %rsi
	jmp	0b

	/* find match */
	ALIGN_TEXT
1:	movzbl	(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	jne	2f

	movzbl	1(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	jne	3f

	movzbl	2(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	jne	4f

	movzbl	3(%rax), %ecx
	add	$4, %rax
	cmpb	$0, (%rsp, %rcx, 1)
	je	1b

	sub	$3, %rax
4:	dec	%rdi
3:	inc	%rax
2:	sub	%rdi, %rax		# number of characters preceding match
	leave
	ret

	/* set is empty, degrades to strlen */
.Lstrlen:
	leave
	jmp	CNAME(strlen)

	/* just one character in set, degrades to strchr */
.Lstrchr:
	mov	%rdi, (%rsp)		# stash a copy of the string
	mov	%eax, %esi		# find the character in the set
	call	CNAME(strchrnul)
	sub	(%rsp), %rax		# length of prefix before match
	leave
	ret
ARCHEND(__strcspn, scalar)
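
/*
 * For reference, the scalar kernel above is the classic table-driven
 * strcspn: build a 256-entry membership table from the set, then scan
 * the string until a byte that is in the table.  A minimal C sketch of
 * the same idea (hypothetical illustration only, not part of this
 * file's build):
 *
 *	size_t
 *	strcspn_scalar(const char *s, const char *set)
 *	{
 *		char table[256] = { 0 };
 *		size_t i;
 *
 *		while (*set != '\0')
 *			table[(unsigned char)*set++] = 1;
 *
 *		table[0] = 1;		// stop at the NUL terminator, too
 *
 *		for (i = 0; table[(unsigned char)s[i]] == 0; i++)
 *			;
 *
 *		return (i);
 *	}
 *
 * The assembly version additionally unrolls both loops and punts the
 * empty and one-character sets to strlen() and strchrnul() instead.
 */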

	/*
	 * This kernel uses pcmpistri to do the heavy lifting.
	 * We provide five code paths, depending on set size:
	 *
	 *	     0: call strlen()
	 *	     1: call strchr()
	 *	 2--16: one pcmpistri per 16 bytes of input
	 *	17--32: two pcmpistri per 16 bytes of input
	 *	  >=33: fall back to lookup table
	 */
ARCHENTRY(__strcspn, x86_64_v2)
	push	%rbp
	mov	%rsp, %rbp
	sub	$256, %rsp

	/* check for special cases */
	movzbl	(%rsi), %eax
	test	%eax, %eax		# empty set?
	jz	.Lstrlenv2

	cmpb	$0, 1(%rsi)		# single-character set?
	jz	.Lstrchrv2

	/* find set size and copy up to 32 bytes to (%rsp) */
	mov	%esi, %ecx
	and	$~0xf, %rsi		# align set pointer
	movdqa	(%rsi), %xmm0
	pxor	%xmm1, %xmm1
	and	$0xf, %ecx		# number of bytes rsi is past alignment
	xor	%edx, %edx
	pcmpeqb	%xmm0, %xmm1		# NUL byte in this chunk?
	movdqa	%xmm0, 32(%rsp)		# transfer head of set to stack
	pmovmskb %xmm1, %eax
	shr	%cl, %eax		# clear out junk before the set
	test	%eax, %eax		# end of set reached?
	jnz	0f

	movdqa	16(%rsi), %xmm0		# second chunk of the set
	mov	$16, %edx
	sub	%ecx, %edx		# length of set preceding xmm0
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm0, %xmm1
	movdqa	%xmm0, 48(%rsp)
	movdqu	32(%rsp, %rcx, 1), %xmm2 # head of set
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	1f

	movdqa	32(%rsi), %xmm0		# third chunk
	add	$16, %edx
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm0, %xmm1
	movdqa	%xmm0, 64(%rsp)
	pmovmskb %xmm1, %eax
	test	%eax, %eax		# still not done?
	jz	.Lgt32v2

0:	movdqu	32(%rsp, %rcx, 1), %xmm2 # head of set
1:	tzcnt	%eax, %eax
	add	%eax, %edx		# length of set (excluding NUL byte)
	cmp	$32, %edx		# above 32 bytes?
	ja	.Lgt32v2

	/*
	 * At this point we know that we want to use pcmpistri.
	 * One last problem remains: the head of the string is not
	 * aligned, so loading it may cross a page boundary.  If this
	 * is the case, we take the part before the page boundary and
	 * repeat the last byte to fill up the xmm register.
	 */
	mov	%rdi, %rax		# save original string pointer
	lea	15(%rdi), %esi		# last byte of the head
	xor	%edi, %esi
	test	$PAGE_SIZE, %esi	# does the head cross a page?
	jz	0f

	/* head crosses page: copy to stack to fix up */
	and	$~0xf, %rax		# align head pointer temporarily
	movzbl	15(%rax), %esi		# last head byte on the page
	movdqa	(%rax), %xmm0
	movabs	$0x0101010101010101, %r8
	imul	%r8, %rsi		# repeated 8 times
	movdqa	%xmm0, (%rsp)		# head word on stack
	mov	%rsi, 16(%rsp)		# followed by filler (last byte x8)
	mov	%rsi, 24(%rsp)
	mov	%edi, %eax
	and	$0xf, %eax		# offset of head from alignment
	add	%rsp, %rax		# pointer to fake head

0:	movdqu	(%rax), %xmm0		# load head (fake or real)
	lea	16(%rdi), %rax
	and	$~0xf, %rax		# second 16 bytes of string (aligned)
1:	cmp	$16, %edx		# 17--32 byte set?
	ja	.Lgt16v2
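
/*
 * A note on the pcmpistri idiom used below.  With an immediate of 0,
 * i.e. _SIDD_UBYTE_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT,
 * and the set in the last (AT&T syntax) operand, pcmpistri sets CF
 * iff some byte of the string chunk occurs in the set, sets ZF iff
 * the chunk contains a NUL byte, and leaves the index of the leftmost
 * match in %ecx (16 if there is none).  Hence the branches below:
 * ja is taken if neither match nor NUL, jbe if match or NUL, and jc
 * if a match was found.  One chunk test corresponds roughly to this
 * hypothetical intrinsics sketch:
 *
 *	int idx = _mm_cmpistri(set, chunk, 0);	// match index, or 16
 *	int any = _mm_cmpistrc(set, chunk, 0);	// CF: match found?
 *	int end = _mm_cmpistrz(set, chunk, 0);	// ZF: NUL in chunk?
 */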

	/* set is 2--16 bytes in size */

	/* _SIDD_UBYTE_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT */
	pcmpistri $0, %xmm0, %xmm2	# match in head?
	jbe	.Lheadmatchv2

	ALIGN_TEXT
0:	pcmpistri $0, (%rax), %xmm2
	jbe	1f			# match or end of string?
	pcmpistri $0, 16(%rax), %xmm2
	lea	32(%rax), %rax
	ja	0b			# match or end of string?

3:	lea	-16(%rax), %rax		# go back to second half
1:	jc	2f			# jump if match found
	movdqa	(%rax), %xmm0		# reload string piece
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm1, %xmm0		# where is the NUL byte?
	pmovmskb %xmm0, %ecx
	tzcnt	%ecx, %ecx		# location of NUL byte in (%rax)
2:	sub	%rdi, %rax		# offset of %xmm0 from beginning of string
	add	%rcx, %rax		# prefix length before match/NUL
	leave
	ret

.Lheadmatchv2:
	jc	2f			# jump if match found
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %ecx
	tzcnt	%ecx, %ecx		# location of NUL byte
2:	mov	%ecx, %eax		# prefix length before match/NUL
	leave
	ret

	/* match in first set half during head */
.Lheadmatchv2first:
	mov	%ecx, %eax
	pcmpistri $0, %xmm0, %xmm3	# match in second set half?
	cmp	%ecx, %eax		# before the first half match?
	cmova	%ecx, %eax		# use the earlier match
	leave
	ret

.Lgt16v2:
	movdqu	48(%rsp, %rcx, 1), %xmm3 # second part of set

	/* set is 17--32 bytes in size */
	pcmpistri $0, %xmm0, %xmm2	# match in first set half?
	jb	.Lheadmatchv2first
	pcmpistri $0, %xmm0, %xmm3	# match in second set half or end of string?
	jbe	.Lheadmatchv2

	ALIGN_TEXT
0:	movdqa	(%rax), %xmm0
	pcmpistri $0, %xmm0, %xmm2
	jb	4f			# match in first set half?
	pcmpistri $0, %xmm0, %xmm3
	jbe	1f			# match in second set half or end of string?
	movdqa	16(%rax), %xmm0
	add	$32, %rax
	pcmpistri $0, %xmm0, %xmm2
	jb	3f			# match in first set half?
	pcmpistri $0, %xmm0, %xmm3
	ja	0b			# neither match in 2nd half nor string end?

	/* match in second half or NUL */
	lea	-16(%rax), %rax		# go back to second half
1:	jc	2f			# jump if match found
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm1, %xmm0		# where is the NUL byte?
	pmovmskb %xmm0, %ecx
	tzcnt	%ecx, %ecx		# location of NUL byte in (%rax)
2:	sub	%rdi, %rax		# offset of %xmm0 from beginning of string
	add	%rcx, %rax		# prefix length before match/NUL
	leave
	ret

	/* match in first half */
3:	sub	$16, %rax		# go back to second half
4:	sub	%rdi, %rax		# offset of %xmm0 from beginning of string
	mov	%ecx, %edx
	pcmpistri $0, %xmm0, %xmm3	# match in second set half?
	cmp	%ecx, %edx		# before the first half match?
	cmova	%ecx, %edx		# use the earlier match
	add	%rdx, %rax		# return full offset
	leave
	ret
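
/*
 * Note on the 17--32 byte code path above: the set no longer fits
 * into a single xmm register, so every chunk is tested against both
 * set halves (%xmm2 and %xmm3) and the earlier of the two match
 * indices wins.  Schematically (hypothetical sketch):
 *
 *	idx1 = first index of a byte from set[0..15] in chunk, or 16
 *	idx2 = first index of a byte from set[16..31] in chunk, or 16
 *	idx  = idx1 < idx2 ? idx1 : idx2;	// the cmova above
 */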

	/* set is empty, degrades to strlen */
.Lstrlenv2:
	leave
	jmp	CNAME(strlen)

	/* just one character in set, degrades to strchr */
.Lstrchrv2:
	mov	%rdi, (%rsp)		# stash a copy of the string
	mov	%eax, %esi		# find this character
	call	CNAME(strchrnul)
	sub	(%rsp), %rax		# length of prefix before match
	leave
	ret

	/* set is >=33 bytes in size */
.Lgt32v2:
	xorps	%xmm0, %xmm0
	mov	$256-64, %edx

	/* clear out lookup table */
0:	movaps	%xmm0, (%rsp, %rdx, 1)
	movaps	%xmm0, 16(%rsp, %rdx, 1)
	movaps	%xmm0, 32(%rsp, %rdx, 1)
	movaps	%xmm0, 48(%rsp, %rdx, 1)
	sub	$64, %edx
	jnc	0b

	add	%rcx, %rsi		# restore set pointer
	mov	%rdi, %rax		# keep a copy of the string

	/* initialise lookup table */
	ALIGN_TEXT
0:	movzbl	(%rsi), %ecx
	movb	$1, (%rsp, %rcx, 1)
	test	%ecx, %ecx
	jz	1f

	movzbl	1(%rsi), %ecx
	movb	$1, (%rsp, %rcx, 1)
	test	%ecx, %ecx
	jz	1f

	movzbl	2(%rsi), %ecx
	movb	$1, (%rsp, %rcx, 1)
	test	%ecx, %ecx
	jz	1f

	movzbl	3(%rsi), %ecx
	movb	$1, (%rsp, %rcx, 1)
	test	%ecx, %ecx
	jz	1f

	add	$4, %rsi
	jmp	0b

	/* find match */
	ALIGN_TEXT
1:	movzbl	(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	jne	2f

	movzbl	1(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	jne	3f

	movzbl	2(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	jne	4f

	movzbl	3(%rax), %ecx
	add	$4, %rax
	cmpb	$0, (%rsp, %rcx, 1)
	je	1b

	sub	$3, %rax
4:	dec	%rdi
3:	inc	%rax
2:	sub	%rdi, %rax		# number of characters preceding match
	leave
	ret
ARCHEND(__strcspn, x86_64_v2)

	.section .note.GNU-stack,"",%progbits
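
/*
 * The contract all of the kernels above implement, as a quick smoke
 * test (hypothetical, not part of the build):
 *
 *	#include <assert.h>
 *	#include <string.h>
 *
 *	int
 *	main(void)
 *	{
 *		assert(strcspn("hello, world", "ow") == 4); // 'o' matches
 *		assert(strcspn("hello", "xyz") == 5);	// no match: strlen
 *		assert(strcspn("hello", "") == 5);	// empty set: strlen
 *		assert(strcspn("hello", "l") == 2);	// 1-char set: strchr
 *		return (0);
 *	}
 */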