/*-
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4,0x90	# 16-byte alignment, nop-filled

	/* strchrnul is a weak alias for the internal name __strchrnul */
	.weak	strchrnul
	.set	strchrnul, __strchrnul

/* implementations of __strchrnul provided by this file */
ARCHFUNCS(__strchrnul)
	ARCHFUNC(__strchrnul, scalar)
	ARCHFUNC(__strchrnul, baseline)
ENDARCHFUNCS(__strchrnul)

/*
 * strchrnul(str, c)
 *
 * Return a pointer to the first byte in str that is equal to
 * (char)c or NUL, whichever comes first.
 *
 * In:   %rdi = str, %esi = c (only the low byte is used)
 * Out:  %rax = pointer to the matching byte or to the terminating NUL
 *
 * This is implemented like strlen(str), but we check for the
 * presence of both NUL and c in each iteration.
 *
 * Scalar variant: processes one aligned 8-byte word at a time.
 * A byte b of a word x is detected as zero by
 *
 *     (x - 0x01..01) & ~x & 0x80..80
 *
 * The borrow of the subtraction can flag additional bytes, but only
 * bytes above a byte that truly is zero.  As we always pick the
 * lowest flagged byte, this is harmless as long as every truly zero
 * byte takes part in the final selection.
 *
 * Register use:
 *   %rdi  pointer to the next aligned word to be loaded
 *   %rsi  c replicated into all 8 bytes
 *   %r8   0x01..01, later negated for use with lea
 *   %r9   0x80..80
 *   %r10  0x01 in each byte of the first word preceding the string
 *   %rax, %rcx, %rdx, %r11  scratch
 */
ARCHENTRY(__strchrnul, scalar)
	mov	%edi, %ecx
	and	$~7, %rdi		# align to 8 byte
	movzbl	%sil, %esi		# clear stray high bits
	movabs	$0x0101010101010101, %r8
	mov	(%rdi), %rax		# load first word
	imul	%r8, %rsi		# replicate char 8 times
	movabs	$0x8080808080808080, %r9

	/*
	 * Unaligned input: align to 8 bytes.  Then proceed the same
	 * way as with aligned input, but prevent matches before the
	 * beginning of the string.  This is achieved by oring 0x01
	 * into each byte preceding the string, both in str and in
	 * str ^ c.  Such bytes are then nonzero, so they are neither
	 * detected as matches nor do they generate a borrow that
	 * could flag a following 0x01 byte inside the string.
	 *
	 * Merely masking out the match bits of these bytes afterwards
	 * is not sufficient: if the byte right before the string is a
	 * match and the string begins with 0x01 resp. c ^ 1, the
	 * borrow flags the first byte of the string while the true
	 * match that caused it is discarded.
	 */
	shl	$3, %ecx		# misalignment in bits (shl uses cl mod 64)
	mov	%r8, %r10
	add	$8, %rdi
	shl	%cl, %r10		# 0x01 where the string is
	xor	%r8, %r10		# 0x01 where it is not
	neg	%r8			# negate 01..01 so we can use lea

	mov	%rsi, %rcx
	xor	%rax, %rcx		# str ^ c
	or	%r10, %rax		# str without NUL bytes before the string
	or	%r10, %rcx		# str ^ c without matches before the string
	lea	(%rax, %r8, 1), %rdx	# str - 0x01..01
	lea	(%rcx, %r8, 1), %r11	# (str ^ c) - 0x01..01
	not	%rax			# ~str
	not	%rcx			# ~(str ^ c)
	and	%rdx, %rax		# (str - 0x01..01) & ~str
	and	%r11, %rcx		# ((str ^ c) - 0x01..01) & ~(str ^ c)
	or	%rcx, %rax		# matches for both
	and	%r9, %rax		# not including junk bits
	jnz	1f

	/* main loop unrolled twice */
	ALIGN_TEXT
0:	mov	(%rdi), %rax		# str
	mov	%rsi, %rcx
	xor	%rax, %rcx		# str ^ c
	lea	(%rax, %r8, 1), %rdx	# str - 0x01..01
	lea	(%rcx, %r8, 1), %r11	# (str ^ c) - 0x01..01
	not	%rax			# ~str
	not	%rcx			# ~(str ^ c)
	and	%rdx, %rax		# (str - 0x01..01) & ~str
	and	%r11, %rcx		# ((str ^ c) - 0x01..01) & ~(str ^ c)
	or	%rcx, %rax		# matches for both
	and	%r9, %rax		# not including junk bits
	jnz	2f			# match in the word at (%rdi)

	mov	8(%rdi), %rax		# str
	add	$16, %rdi
	mov	%rsi, %rcx
	xor	%rax, %rcx		# str ^ c
	lea	(%rax, %r8, 1), %rdx	# str - 0x01..01
	lea	(%rcx, %r8, 1), %r11	# (str ^ c) - 0x01..01
	not	%rax			# ~str
	not	%rcx			# ~(str ^ c)
	and	%rdx, %rax		# (str - 0x01..01) & ~str
	and	%r11, %rcx		# ((str ^ c) - 0x01..01) & ~(str ^ c)
	or	%rcx, %rax		# matches for both
	and	%r9, %rax		# not including junk bits
	jz	0b

	/* NUL or c found */
1:	sub	$8, %rdi		# undo advance past buffer
2:	tzcnt	%rax, %rax		# first NUL or c byte match
	shr	$3, %eax		# scale from bit to byte index
	add	%rdi, %rax		# pointer to found c or NUL
	ret
ARCHEND(__strchrnul, scalar)

/*
 * Baseline (SSE2) variant: processes one aligned 16-byte chunk at a
 * time.  Each chunk is compared bytewise against both NUL and c and
 * the results are collected into a 16-bit mask, one bit per byte.
 * As the comparison is exact for every byte, matches in the bytes
 * preceding the string can simply be masked out in the first chunk.
 *
 * Register use:
 *   %rdi   pointer to the next aligned chunk to be loaded
 *   %xmm0  c replicated into all 16 bytes
 *   %xmm1  current chunk, then the combined match vector
 *   %xmm2  zero, then the NUL match vector
 *   %edx   mask of the bytes in the first chunk belonging to the string
 *   %eax   match mask of the current chunk
 */
ARCHENTRY(__strchrnul, baseline)
	mov	%edi, %ecx
	and	$~0xf, %rdi		# align to 16 byte
	movdqa	(%rdi), %xmm1
	movd	%esi, %xmm0
	and	$0xf, %ecx		# distance from (%rdi) to start of string
	pxor	%xmm2, %xmm2
	mov	$-1, %edx
	punpcklbw %xmm0, %xmm0		# c -> cc
	shl	%cl, %edx		# bits corresponding to bytes in the string
	punpcklwd %xmm0, %xmm0		# cc -> cccc
	add	$16, %rdi

	/* check for match in head */
	pcmpeqb	%xmm1, %xmm2		# NUL bytes present?
	pshufd	$0, %xmm0, %xmm0	# cccc -> cccccccccccccccc
	pcmpeqb	%xmm0, %xmm1		# c present?
	por	%xmm2, %xmm1		# either present?
	pmovmskb %xmm1, %eax
	and	%edx, %eax		# match in the string?
	jnz	1f

	/* main loop unrolled twice */
	ALIGN_TEXT
0:	movdqa	(%rdi), %xmm1
	pxor	%xmm2, %xmm2
	pcmpeqb	%xmm1, %xmm2		# NUL bytes present?
	pcmpeqb	%xmm0, %xmm1		# c present?
	por	%xmm2, %xmm1		# either present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax		# match in the string?
	jnz	2f			# match in the chunk at (%rdi)

	movdqa	16(%rdi), %xmm1
	add	$32, %rdi
	pxor	%xmm2, %xmm2
	pcmpeqb	%xmm1, %xmm2		# NUL bytes present?
	pcmpeqb	%xmm0, %xmm1		# c present?
	por	%xmm2, %xmm1		# either present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax		# match in the string?
	jz	0b

	/* NUL or c found; %eax is nonzero here */
1:	sub	$16, %rdi		# undo advance past buffer
2:	tzcnt	%eax, %eax		# where is the match?
	add	%rdi, %rax		# pointer to found c or NUL
	ret
ARCHEND(__strchrnul, baseline)

	.section .note.GNU-stack,"",%progbits