/*-
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/asm.h>
#include <machine/param.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4, 0x90	/* 16-byte alignment, NOP-filled */

ARCHFUNCS(strspn)
	ARCHFUNC(strspn, scalar)
	NOARCHFUNC
	ARCHFUNC(strspn, x86_64_v2)
ENDARCHFUNCS(strspn)

ARCHENTRY(strspn, scalar)
	push	%rbp			# establish stack frame
	mov	%rsp, %rbp
	sub	$256, %rsp		# allocate space for lookup table

	/* check for special cases */
	movzbl	(%rsi), %edx		# first character in the set
	test	%edx, %edx
	jz	.Lzero			# empty set always returns 0

	movzbl	1(%rsi), %eax		# second character in the set
	test	%eax, %eax
	jz	.Lsingle

	/* no special case matches -- prepare lookup table */
	xor	%r8d, %r8d
	mov	$28, %ecx
0:	mov	%r8, (%rsp, %rcx, 8)
	mov	%r8, 8(%rsp, %rcx, 8)
	mov	%r8, 16(%rsp, %rcx, 8)
	mov	%r8, 24(%rsp, %rcx, 8)
	sub	$4, %ecx
	jnc	0b

	movb	$1, (%rsp, %rdx, 1)	# register first char in set
	add	$2, %rsi

	/* process remaining chars in set */
	ALIGN_TEXT
0:	movb	$1, (%rsp, %rax, 1)	# register previous char
	movzbl	(%rsi), %eax		# next char in set
	test	%eax, %eax		# end of string?
	jz	1f

	movb	$1, (%rsp, %rax, 1)
	add	$2, %rsi
	movzbl	-1(%rsi), %eax
	test	%eax, %eax
	jnz	0b

1:	mov	%rdi, %rax		# a copy of the source to iterate over

	/* find mismatch */
	ALIGN_TEXT
0:	movzbl	(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	je	2f

	movzbl	1(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	je	3f

	movzbl	2(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	je	4f

	movzbl	3(%rax), %ecx
	add	$4, %rax
	cmpb	$0, (%rsp, %rcx, 1)
	jne	0b

	sub	$3, %rax
4:	dec	%rdi
3:	inc	%rax
2:	sub	%rdi, %rax		# number of characters preceding match
	leave
	ret

	/* empty set never matches */
.Lzero:	xor	%eax, %eax
	leave
	ret

	/* find repeated single character */
	ALIGN_TEXT
.Lsingle:
	cmpb	%dl, (%rdi, %rax, 1)	# %rax is zero on entry
	jne	1f

	cmpb	%dl, 1(%rdi, %rax, 1)
	jne	2f

	cmpb	%dl, 2(%rdi, %rax, 1)
	jne	3f

	cmpb	%dl, 3(%rdi, %rax, 1)
	lea	4(%rax), %rax
	je	.Lsingle

	sub	$3, %rax
3:	inc	%rax
2:	inc	%rax
1:	leave
	ret
ARCHEND(strspn, scalar)
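
/*
 * For reference, the table-driven scalar kernel above behaves roughly
 * like the following C sketch (illustrative only; the assembly also
 * special-cases empty and single-character sets, which this omits):
 *
 *	size_t
 *	strspn(const char *s, const char *charset)
 *	{
 *		char table[256] = { 0 };
 *		size_t i;
 *
 *		while (*charset != '\0')
 *			table[(unsigned char)*charset++] = 1;
 *
 *		for (i = 0; table[(unsigned char)s[i]] != 0; i++)
 *			;
 *
 *		return (i);
 *	}
 *
 * Since table['\0'] stays 0, the scan loop stops at the NUL terminator
 * without a separate end-of-string check.
 */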

/*
 * This kernel uses pcmpistri to do the heavy lifting.
 * We provide three code paths, depending on set size:
 *
 *	 0--16: one pcmpistri per 16 bytes of input
 *	17--32: two pcmpistrm per 16 bytes of input
 *	 >=33: fall back to lookup table
 */
ARCHENTRY(strspn, x86_64_v2)
	push	%rbp
	mov	%rsp, %rbp
	sub	$256, %rsp

	/* find set size and copy up to 32 bytes to (%rsp) */
	mov	%esi, %ecx
	and	$~0xf, %rsi		# align set pointer
	movdqa	(%rsi), %xmm0
	pxor	%xmm1, %xmm1
	and	$0xf, %ecx		# amount of bytes rsi is past alignment
	xor	%edx, %edx
	pcmpeqb	%xmm0, %xmm1		# end of string reached?
	movdqa	%xmm0, 32(%rsp)		# transfer head of set to stack
	pmovmskb %xmm1, %eax
	shr	%cl, %eax		# clear out junk before string
	test	%eax, %eax		# end of set reached?
	jnz	0f

	movdqa	16(%rsi), %xmm0		# second chunk of the set
	mov	$16, %edx
	sub	%ecx, %edx		# length of set preceding xmm0
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm0, %xmm1
	movdqa	%xmm0, 48(%rsp)
	movdqu	32(%rsp, %rcx, 1), %xmm2 # head of set
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	1f

	movdqa	32(%rsi), %xmm0		# third chunk
	add	$16, %edx
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm0, %xmm1
	movdqa	%xmm0, 64(%rsp)
	pmovmskb %xmm1, %eax
	test	%eax, %eax		# still not done?
	jz	.Lgt32v2

0:	movdqu	32(%rsp, %rcx, 1), %xmm2 # head of set
1:	tzcnt	%eax, %eax
	add	%eax, %edx		# length of set (excluding NUL byte)
	cmp	$32, %edx		# above 32 bytes?
	ja	.Lgt32v2

	/*
	 * At this point we know that we want to use pcmpistri.
	 * One last problem obtains: the head of the string is not
	 * aligned and may cross a page boundary.  If this is the
	 * case, we take the part before the page boundary and
	 * repeat the last byte to fill up the xmm register.
	 */
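	/*
	 * In C, that fix-up corresponds roughly to the sketch below
	 * (hypothetical names; "s" stands for the string argument in
	 * %rdi; since the aligned 16-byte chunk ends exactly at the
	 * page boundary, the aligned load cannot fault; illustrative
	 * only, not the code that is assembled):
	 *
	 *	char buf[32] __aligned(16);
	 *	const char *aligned = (const char *)((uintptr_t)s & ~(uintptr_t)0xf);
	 *
	 *	memcpy(buf, aligned, 16);		// stays on the first page
	 *	memset(buf + 16, aligned[15], 16);	// repeat last byte on page
	 *	head = buf + ((uintptr_t)s & 0xf);	// fake head, same valid bytes
	 */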
	mov	%rdi, %rax		# save original string pointer
	lea	15(%rdi), %esi		# last byte of the head
	xor	%edi, %esi
	test	$PAGE_SIZE, %esi	# does the head cross a page?
	jz	0f

	/* head crosses page: copy to stack to fix up */
	and	$~0xf, %rax		# align head pointer temporarily
	movzbl	15(%rax), %esi		# last head byte on the page
	movdqa	(%rax), %xmm0
	movabs	$0x0101010101010101, %r8
	imul	%r8, %rsi		# repeated 8 times
	movdqa	%xmm0, (%rsp)		# head word on stack
	mov	%rsi, 16(%rsp)		# followed by filler (last byte x8)
	mov	%rsi, 24(%rsp)
	mov	%edi, %eax
	and	$0xf, %eax		# offset of head from alignment
	add	%rsp, %rax		# pointer to fake head

0:	movdqu	(%rax), %xmm1		# load head (fake or real)
	lea	16(%rdi), %rax
	and	$~0xf, %rax		# second 16 bytes of string (aligned)
1:	cmp	$16, %edx		# set is 17--32 bytes in size?
	ja	.Lgt16v2

	/* set is 0--16 bytes in size */

	/* _SIDD_UBYTE_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT|_SIDD_NEGATIVE_POLARITY */
	pcmpistri $0x10, %xmm1, %xmm2	# match in head?
	jc	.Lheadmismatchv2

	ALIGN_TEXT
0:	pcmpistri $0x10, (%rax), %xmm2
	jc	1f			# match or end of string?
	pcmpistri $0x10, 16(%rax), %xmm2
	lea	32(%rax), %rax
	jnc	0b			# match or end of string?

	sub	$16, %rax		# go back to second half
1:	sub	%rdi, %rax		# offset of (%rax) from beginning of string
	add	%rcx, %rax		# prefix length before match/NUL
	leave
	ret

.Lheadmismatchv2:
	mov	%ecx, %eax		# prefix length before mismatch/NUL
	leave
	ret

	/* set is 17--32 bytes in size */
.Lgt16v2:
	movdqu	48(%rsp, %rcx, 1), %xmm3 # second part of set

	/* _SIDD_UBYTE_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_BIT_MASK|_SIDD_NEGATIVE_POLARITY */
	pcmpistrm $0x10, %xmm1, %xmm2	# any mismatch in the first half?
	movdqa	%xmm0, %xmm4
	pcmpistrm $0x10, %xmm1, %xmm3	# any mismatch in the second half?
	ptest	%xmm0, %xmm4		# any entry that doesn't match either?
	jnz	2f

	ALIGN_TEXT
0:	movdqa	(%rax), %xmm1
	pcmpistrm $0x10, %xmm1, %xmm2
	movdqa	%xmm0, %xmm4
	pcmpistrm $0x10, %xmm1, %xmm3
	ptest	%xmm0, %xmm4
	jnz	1f
	movdqa	16(%rax), %xmm1
	add	$32, %rax
	pcmpistrm $0x10, %xmm1, %xmm2
	movdqa	%xmm0, %xmm4
	pcmpistrm $0x10, %xmm1, %xmm3
	ptest	%xmm0, %xmm4
	jz	0b

	sub	$16, %rax
1:	pand	%xmm4, %xmm0
	movd	%xmm0, %ecx
	sub	%rdi, %rax		# offset of %xmm1 from beginning of string
	tzcnt	%ecx, %ecx
	add	%rcx, %rax		# prefix length before match/NUL
	leave
	ret

	/* mismatch or string end in head */
2:	pand	%xmm4, %xmm0		# bit mask of mismatches (end of string counts)
	movd	%xmm0, %eax
	tzcnt	%eax, %eax		# prefix length before mismatch/NUL
	leave
	ret
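
	/*
	 * One iteration of the 17--32 byte loop above, rendered with
	 * SSE4.2 intrinsics (a sketch with made-up variable names;
	 * 0x10 is the mode constant documented above):
	 *
	 *	__m128i chunk = _mm_load_si128((const __m128i *)p);
	 *	__m128i lo = _mm_cmpistrm(set0, chunk, 0x10);	// not in 1st half
	 *	__m128i hi = _mm_cmpistrm(set1, chunk, 0x10);	// not in 2nd half
	 *
	 *	if (!_mm_testz_si128(lo, hi))	// byte in neither half, or NUL
	 *		break;			// tzcnt of (lo & hi) locates it
	 */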

	/* set is >=33 bytes in size */
.Lgt32v2:
	xorps	%xmm0, %xmm0
	mov	$256-64, %edx

	/* clear out lookup table */
0:	movaps	%xmm0, (%rsp, %rdx, 1)
	movaps	%xmm0, 16(%rsp, %rdx, 1)
	movaps	%xmm0, 32(%rsp, %rdx, 1)
	movaps	%xmm0, 48(%rsp, %rdx, 1)
	sub	$64, %edx
	jnc	0b

	add	%rcx, %rsi		# restore set pointer
	mov	%rdi, %rax		# keep a copy of the string

	/* initialise lookup table */
	movzbl	(%rsi), %ecx		# set is known not to be empty

	ALIGN_TEXT
0:	movb	$1, (%rsp, %rcx, 1)
	movzbl	1(%rsi), %ecx
	test	%ecx, %ecx
	jz	1f

	movb	$1, (%rsp, %rcx, 1)
	movzbl	2(%rsi), %ecx
	test	%ecx, %ecx
	jz	1f

	movb	$1, (%rsp, %rcx, 1)
	movzbl	3(%rsi), %ecx
	add	$4, %rsi
	test	%ecx, %ecx
	jz	1f

	movb	$1, (%rsp, %rcx, 1)
	movzbl	(%rsi), %ecx
	test	%ecx, %ecx
	jnz	0b

	/* find mismatch */
	ALIGN_TEXT
1:	movzbl	(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	je	2f

	movzbl	1(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	je	3f

	movzbl	2(%rax), %ecx
	cmpb	$0, (%rsp, %rcx, 1)
	je	4f

	movzbl	3(%rax), %ecx
	add	$4, %rax
	cmpb	$0, (%rsp, %rcx, 1)
	jne	1b

	sub	$3, %rax
4:	dec	%rdi
3:	inc	%rax
2:	sub	%rdi, %rax		# number of characters preceding match
	leave
	ret
ARCHEND(strspn, x86_64_v2)

	.section .note.GNU-stack,"",%progbits
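
/*
 * Illustrative check of the semantics all three kernels implement
 * (a minimal sketch, not part of the build):
 *
 *	#include <assert.h>
 *	#include <string.h>
 *
 *	int
 *	main(void)
 *	{
 *		assert(strspn("abcabc123", "cba") == 6);	// prefix in set
 *		assert(strspn("hello", "") == 0);		// empty set
 *		assert(strspn("aaab", "a") == 3);		// one-char set
 *		return (0);
 *	}
 */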