/*-
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4,0x90	# 16-byte alignment, nop-filled

	/* export strchrnul as a weak alias of __strchrnul */
	.weak	strchrnul
	.set	strchrnul, __strchrnul

/*
 * Implementation table for __strchrnul: a scalar and a baseline
 * variant are registered.  The ARCHFUNCS/ARCHFUNC/ENDARCHFUNCS
 * macros come from amd64_archlevel.h; presumably they choose one
 * variant according to the CPU architecture level -- see that
 * header for the actual selection mechanism.
 */
ARCHFUNCS(__strchrnul)
	ARCHFUNC(__strchrnul, scalar)
	ARCHFUNC(__strchrnul, baseline)
ENDARCHFUNCS(__strchrnul)

/*
 * strchrnul(str, c)
 * This is implemented like strlen(str), but we check for the
 * presence of both NUL and c in each iteration.
 *
 * In:	%rdi = str, %sil = c (bits of %esi above the low byte are ignored)
 * Out:	%rax = pointer to the first byte equal to c or to NUL
 *
 * Register use in the scalar variant:
 *	%rdi	8-byte aligned cursor into the string
 *	%rsi	c replicated into all 8 bytes
 *	%r8	0x0101010101010101, later negated for use with lea
 *	%r9	0x8080808080808080, selects the per-byte match flag
 *	%r10	0x01 in every byte of the first word that precedes str
 *	%rax, %rcx, %rdx, %r11	scratch
 *
 * All loads are 8-byte aligned, so the first load may include bytes
 * located before str and the last load may include bytes located
 * after the terminating NUL, both within the same aligned word.
 */
ARCHENTRY(__strchrnul, scalar)
	mov	%edi, %ecx
	and	$~7, %rdi		# align to 8 byte
	movzbl	%sil, %esi		# clear stray high bits
	movabs	$0x0101010101010101, %r8
	mov	(%rdi), %rax		# load first word
	imul	%r8, %rsi		# replicate char 8 times

	/*
	 * Unaligned input: align to 8 bytes.  Then proceed the same
	 * way as with aligned input, but prevent matches before the
	 * beginning of the string.  This is achieved by oring 0x01
	 * into each byte of the buffer before the string.
	 *
	 * %ecx holds the low address bits times 8; the 64-bit shift
	 * below only uses the low 6 bits of %cl, i.e. 8 * (str & 7).
	 */
	shl	$3, %ecx
	mov	%r8, %r10
	add	$8, %rdi
	shl	%cl, %r10		# 0x01 where the string is
	xor	%r8, %r10		# 0x01 where it is not
	neg	%r8			# negate 01..01 so we can use lea
	movabs	$0x8080808080808080, %r9

	mov	%rsi, %rcx
	xor	%rax, %rcx		# str ^ c
	or	%r10, %rax		# str without NUL bytes before it
	or	%r10, %rcx		# (str ^ c) without matches before it
	lea	(%rax, %r8, 1), %rdx	# str - 0x01..01
	lea	(%rcx, %r8, 1), %r11	# (str ^ c) - 0x01..01
	not	%rax			# ~str
	not	%rcx			# ~(str ^ c)
	and	%rdx, %rax		# (str - 0x01..01) & ~str
	and	%r11, %rcx		# ((str ^ c) - 0x01..01) & ~(str ^ c)
	or	%rcx, %rax		# matches for both
	and	%r9, %rax		# not including junk bytes
	jnz	1f

	/* main loop unrolled twice */
	ALIGN_TEXT
0:	mov	(%rdi), %rax		# str
	mov	%rsi, %rcx
	xor	%rax, %rcx		# str ^ c
	lea	(%rax, %r8, 1), %rdx	# str - 0x01..01
	lea	(%rcx, %r8, 1), %r11	# (str ^ c) - 0x01..01
	not	%rax			# ~str
	not	%rcx			# ~(str ^ c)
	and	%rdx, %rax		# (str - 0x01..01) & ~str
	and	%r11, %rcx		# ((str ^ c) - 0x01..01) & ~(str ^ c)
	or	%rcx, %rax		# matches for both
	and	%r9, %rax		# not including junk bits
	jnz	2f			# %rdi still points at this word

	mov	8(%rdi), %rax		# str
	add	$16, %rdi
	mov	%rsi, %rcx
	xor	%rax, %rcx		# str ^ c
	lea	(%rax, %r8, 1), %rdx	# str - 0x01..01
	lea	(%rcx, %r8, 1), %r11	# (str ^ c) - 0x01..01
	not	%rax			# ~str
	not	%rcx			# ~(str ^ c)
	and	%rdx, %rax		# (str - 0x01..01) & ~str
	and	%r11, %rcx		# ((str ^ c) - 0x01..01) & ~(str ^ c)
	or	%rcx, %rax		# matches for both
	and	%r9, %rax		# not including junk bits
	jz	0b

	/*
	 * NUL or c found.  Label 1 is reached with %rdi one word past
	 * the word that matched, label 2 with %rdi at the matching word.
	 */
1:	sub	$8, %rdi		# undo advance past buffer
2:	tzcnt	%rax, %rax		# first NUL or c byte match
	shr	$3, %eax		# scale from bit to byte index
	add	%rdi, %rax		# pointer to found c or NUL
	ret
ARCHEND(__strchrnul, scalar)

/*
 * SSE2 variant of strchrnul(str, c), processing 16 bytes per step.
 *
 * In:	%rdi = str, %esi = c (only the low byte ends up being compared)
 * Out:	%rax = pointer to the first byte equal to c or to NUL
 *
 * Register use:
 *	%rdi	16-byte aligned cursor into the string
 *	%xmm0	c replicated into all 16 bytes
 *	%xmm1	current chunk, then the combined match result
 *	%xmm2	zero, then the NUL match result
 *	%edx	head mask: bits cleared for the bytes preceding str
 *	%eax	one bit per byte of the chunk, set where NUL or c matched
 *
 * All loads are 16-byte aligned (movdqa), so the first load may
 * include bytes located before str; their match bits are masked
 * off with %edx.
 */
ARCHENTRY(__strchrnul, baseline)
	mov	%edi, %ecx
	and	$~0xf, %rdi		# align to 16 byte
	movdqa	(%rdi), %xmm1
	movd	%esi, %xmm0
	and	$0xf, %ecx		# distance from (%rdi) to start of string
	pxor	%xmm2, %xmm2
	mov	$-1, %edx
	punpcklbw %xmm0, %xmm0		# c -> cc
	shl	%cl, %edx		# bits corresponding to bytes in the string
	punpcklwd %xmm0, %xmm0		# cc -> cccc
	add	$16, %rdi

	/* check for match in head */
	pcmpeqb	%xmm1, %xmm2		# NUL bytes present?
	pshufd	$0, %xmm0, %xmm0	# cccc -> cccccccccccccccc
	pcmpeqb	%xmm0, %xmm1		# c present?
	por	%xmm2, %xmm1		# either present?
	pmovmskb %xmm1, %eax
	and	%edx, %eax		# match in the string?
	jnz	1f

	/* main loop unrolled twice */
	ALIGN_TEXT
0:	movdqa	(%rdi), %xmm1
	pxor	%xmm2, %xmm2
	pcmpeqb	%xmm1, %xmm2		# NUL bytes present?
	pcmpeqb	%xmm0, %xmm1		# c present?
	por	%xmm2, %xmm1		# either present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax		# match in the string?
	jnz	2f			# %rdi still points at this chunk

	movdqa	16(%rdi), %xmm1
	add	$32, %rdi
	pxor	%xmm2, %xmm2
	pcmpeqb	%xmm1, %xmm2		# NUL bytes present?
	pcmpeqb	%xmm0, %xmm1		# c present?
	por	%xmm2, %xmm1		# either present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax		# match in the string?
	jz	0b

	/*
	 * Label 1 is reached with %rdi one chunk past the chunk that
	 * matched, label 2 with %rdi at the matching chunk.
	 */
1:	sub	$16, %rdi		# undo advance past buffer
2:	tzcnt	%eax, %eax		# where is the match?
	add	%rdi, %rax		# pointer to found c or NUL
	ret
ARCHEND(__strchrnul, baseline)

	.section .note.GNU-stack,"",%progbits