1de12a689SRobert Clausecker/*- 2de12a689SRobert Clausecker * Copyright (c) 2023 The FreeBSD Foundation 3de12a689SRobert Clausecker * 4de12a689SRobert Clausecker * This software was developed by Robert Clausecker <fuz@FreeBSD.org> 5de12a689SRobert Clausecker * under sponsorship from the FreeBSD Foundation. 6de12a689SRobert Clausecker * 7de12a689SRobert Clausecker * Redistribution and use in source and binary forms, with or without 8de12a689SRobert Clausecker * modification, are permitted provided that the following conditions 9de12a689SRobert Clausecker * are met: 10de12a689SRobert Clausecker * 1. Redistributions of source code must retain the above copyright 11de12a689SRobert Clausecker * notice, this list of conditions and the following disclaimer. 12de12a689SRobert Clausecker * 2. Redistributions in binary form must reproduce the above copyright 13de12a689SRobert Clausecker * notice, this list of conditions and the following disclaimer in the 14de12a689SRobert Clausecker * documentation and/or other materials provided with the distribution. 15de12a689SRobert Clausecker * 16de12a689SRobert Clausecker * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND 17de12a689SRobert Clausecker * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18de12a689SRobert Clausecker * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19de12a689SRobert Clausecker * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20de12a689SRobert Clausecker * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21de12a689SRobert Clausecker * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22de12a689SRobert Clausecker * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23de12a689SRobert Clausecker * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24de12a689SRobert Clausecker * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25de12a689SRobert Clausecker * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26de12a689SRobert Clausecker * SUCH DAMAGE 27de12a689SRobert Clausecker */ 28de12a689SRobert Clausecker 29de12a689SRobert Clausecker#include <machine/asm.h> 30de12a689SRobert Clausecker 31de12a689SRobert Clausecker#include "amd64_archlevel.h" 32de12a689SRobert Clausecker 33de12a689SRobert Clausecker#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */ 34de12a689SRobert Clausecker 35de12a689SRobert Clausecker .weak memchr 36de12a689SRobert Clausecker .set memchr, __memchr 37de12a689SRobert ClauseckerARCHFUNCS(__memchr) 38de12a689SRobert Clausecker ARCHFUNC(__memchr, scalar) 39de12a689SRobert Clausecker ARCHFUNC(__memchr, baseline) 40de12a689SRobert ClauseckerENDARCHFUNCS(__memchr) 41de12a689SRobert Clausecker 42de12a689SRobert ClauseckerARCHENTRY(__memchr, scalar) 43de12a689SRobert Clausecker test %rdx, %rdx # empty input? 44de12a689SRobert Clausecker je .Lnomatch 45de12a689SRobert Clausecker 46de12a689SRobert Clausecker lea (, %rdi, 8), %ecx 47*b2618b65SRobert Clausecker mov $-1, %rax 48*b2618b65SRobert Clausecker add %rdi, %rdx # pointer to end of buffer or to end of 49*b2618b65SRobert Clausecker cmovc %rax, %rdx # address space (whichever comes first) 50de12a689SRobert Clausecker and $~7, %rdi # align to 8 bytes 51de12a689SRobert Clausecker mov (%rdi), %rax # load first word 52de12a689SRobert Clausecker movzbl %sil, %esi # clear stray high bits 53de12a689SRobert Clausecker movabs $0x0101010101010101, %r8 54de12a689SRobert Clausecker imul %r8, %rsi # replicate char 8 times 55de12a689SRobert Clausecker 56de12a689SRobert Clausecker /* compute head and tail masks */ 57de12a689SRobert Clausecker mov %r8, %r10 58de12a689SRobert Clausecker movabs $0x8080808080808080, %r9 59de12a689SRobert Clausecker shl %cl, %r10 # 0x01 where string head is 60de12a689SRobert Clausecker lea (, %rdx, 8), %ecx 61de12a689SRobert Clausecker xor %r8, %r10 # 0x01 where it is not 62de12a689SRobert Clausecker neg %r8 # negate 01..01 so we can use lea 63de12a689SRobert Clausecker mov %r9, %r11 64de12a689SRobert Clausecker xor %rsi, %rax # str ^ c (0x00 where str[i] == c) 65de12a689SRobert Clausecker neg %ecx 66de12a689SRobert Clausecker or %r10, %rax # except before the string 67de12a689SRobert Clausecker shr %cl, %r11 # 0x80 where string tail is 68de12a689SRobert Clausecker 69de12a689SRobert Clausecker add $8, %rdi # advance to next 8 bytes 70de12a689SRobert Clausecker cmp %rdx, %rdi # end of buffer reached during head? 71de12a689SRobert Clausecker jae .Ltail # and go to tail-processing code 72de12a689SRobert Clausecker 73de12a689SRobert Clausecker /* main loop, unrolled twice */ 74de12a689SRobert Clausecker ALIGN_TEXT 75de12a689SRobert Clausecker0: lea (%rax, %r8, 1), %rcx # (str ^ c) - 0x01..01 76de12a689SRobert Clausecker not %rax # ~(str ^ c) 77de12a689SRobert Clausecker and %r9, %rax # ((str^c) - 0x01..01) & ~(str^c) 78de12a689SRobert Clausecker and %rcx, %rax # not including junk bytes 79de12a689SRobert Clausecker jnz .Lmatch 80de12a689SRobert Clausecker 81de12a689SRobert Clausecker mov (%rdi), %rax 82de12a689SRobert Clausecker add $8, %rdi 83de12a689SRobert Clausecker xor %rsi, %rax # str ^ c 84de12a689SRobert Clausecker cmp %rdx, %rdi 85de12a689SRobert Clausecker jae .Ltail 86de12a689SRobert Clausecker 87de12a689SRobert Clausecker lea (%rax, %r8, 1), %rcx # (str ^ c) - 0x01..01 88de12a689SRobert Clausecker not %rax # ~(str ^ c) 89de12a689SRobert Clausecker and %r9, %rax # ((str^c) - 0x01..01) & ~(str^c) 90de12a689SRobert Clausecker and %rcx, %rax # not including junk bytes 91de12a689SRobert Clausecker jnz .Lmatch 92de12a689SRobert Clausecker 93de12a689SRobert Clausecker mov (%rdi), %rax 94de12a689SRobert Clausecker add $8, %rdi 95de12a689SRobert Clausecker xor %rsi, %rax # str ^ c 96de12a689SRobert Clausecker cmp %rdx, %rdi 97de12a689SRobert Clausecker jb 0b 98de12a689SRobert Clausecker 99de12a689SRobert Clausecker.Ltail: lea (%rax, %r8, 1), %rcx # (str ^ c) - 0x01..01 100de12a689SRobert Clausecker not %rax # ~(str ^ c) 101de12a689SRobert Clausecker and %r11, %rax # ((str^c) - 0x01..01) & ~(str^c) 102de12a689SRobert Clausecker and %rcx, %rax # not including junk bytes or bytes past buffer 103de12a689SRobert Clausecker jz .Lnomatch 104de12a689SRobert Clausecker 105de12a689SRobert Clausecker.Lmatch: 106de12a689SRobert Clausecker tzcnt %rax, %rax # first match 107de12a689SRobert Clausecker shr $3, %eax # scale from bit to byte index 108de12a689SRobert Clausecker lea -8(%rdi, %rax), %rax # pointer to found c 109de12a689SRobert Clausecker ret 110de12a689SRobert Clausecker 111de12a689SRobert Clausecker /* no match found */ 112de12a689SRobert Clausecker.Lnomatch: 113de12a689SRobert Clausecker xor %eax, %eax # return null pointer 114de12a689SRobert Clausecker ret 115de12a689SRobert ClauseckerARCHEND(__memchr, scalar) 116de12a689SRobert Clausecker 117de12a689SRobert ClauseckerARCHENTRY(__memchr, baseline) 118de12a689SRobert Clausecker test %rdx, %rdx # empty input? 119de12a689SRobert Clausecker je .Lnomatchb 120de12a689SRobert Clausecker 121de12a689SRobert Clausecker movd %esi, %xmm2 122de12a689SRobert Clausecker mov %edi, %ecx 123*b2618b65SRobert Clausecker mov $-1, %r9 124*b2618b65SRobert Clausecker add %rdi, %rdx # pointer to end of buffer or to end of 125*b2618b65SRobert Clausecker cmovc %r9, %rdx # address space (whichever comes first) 126de12a689SRobert Clausecker and $~0x1f, %rdi # align to 32 bytes 127de12a689SRobert Clausecker movdqa (%rdi), %xmm0 # load first 32 bytes 128de12a689SRobert Clausecker movdqa 16(%rdi), %xmm1 129de12a689SRobert Clausecker 130de12a689SRobert Clausecker punpcklbw %xmm2, %xmm2 # c -> cc 131de12a689SRobert Clausecker 132de12a689SRobert Clausecker shl %cl, %r9d # mask with zeroes before the string 133de12a689SRobert Clausecker 134de12a689SRobert Clausecker punpcklwd %xmm2, %xmm2 # cc -> cccc 135de12a689SRobert Clausecker 136de12a689SRobert Clausecker mov $-1, %r8d 137de12a689SRobert Clausecker xor %ecx, %ecx 138de12a689SRobert Clausecker sub %edx, %ecx # edx = -ecx 139de12a689SRobert Clausecker shr %cl, %r8d # bytes in tail that are part of the buffer 140de12a689SRobert Clausecker 141de12a689SRobert Clausecker pshufd $0, %xmm2, %xmm2 # cccc -> cccccccccccccccc 142de12a689SRobert Clausecker 143de12a689SRobert Clausecker add $32, %rdi # advance to next 32 bytes 144de12a689SRobert Clausecker mov $-1, %eax 145de12a689SRobert Clausecker cmp %rdx, %rdi # end of buffer reached during head? 146de12a689SRobert Clausecker cmovae %r8d, %eax # if yes, do combined head/tail processing 147de12a689SRobert Clausecker and %r9d, %eax # mask of bytes in head part of string 148de12a689SRobert Clausecker 149de12a689SRobert Clausecker /* process head */ 150de12a689SRobert Clausecker pcmpeqb %xmm2, %xmm1 151de12a689SRobert Clausecker pcmpeqb %xmm2, %xmm0 152de12a689SRobert Clausecker pmovmskb %xmm1, %esi 153de12a689SRobert Clausecker pmovmskb %xmm0, %ecx 154de12a689SRobert Clausecker shl $16, %esi 155de12a689SRobert Clausecker or %esi, %ecx # locations of matches 156de12a689SRobert Clausecker and %ecx, %eax # any match inside buffer? 157de12a689SRobert Clausecker jnz .Lprecisematchb 158de12a689SRobert Clausecker 159de12a689SRobert Clausecker cmp %rdx, %rdi # did the buffer end here? 160de12a689SRobert Clausecker jae .Lnomatchb # if yes we are done 161de12a689SRobert Clausecker 162de12a689SRobert Clausecker /* main loop */ 163de12a689SRobert Clausecker ALIGN_TEXT 164de12a689SRobert Clausecker0: movdqa (%rdi), %xmm0 # load next string chunk 165de12a689SRobert Clausecker movdqa 16(%rdi), %xmm1 166de12a689SRobert Clausecker add $32, %rdi 167de12a689SRobert Clausecker cmp %rdx, %rdi # ready for main loop? 168de12a689SRobert Clausecker jae .Ltailb 169de12a689SRobert Clausecker 170de12a689SRobert Clausecker pcmpeqb %xmm2, %xmm0 171de12a689SRobert Clausecker pcmpeqb %xmm2, %xmm1 172de12a689SRobert Clausecker por %xmm1, %xmm0 # match in either half? 173de12a689SRobert Clausecker pmovmskb %xmm0, %eax 174de12a689SRobert Clausecker test %eax, %eax 175de12a689SRobert Clausecker jz 0b 176de12a689SRobert Clausecker 177de12a689SRobert Clausecker.Lmatchb: 178de12a689SRobert Clausecker pcmpeqb -32(%rdi), %xmm2 # redo comparison of first 16 bytes 179de12a689SRobert Clausecker pmovmskb %xmm1, %ecx 180de12a689SRobert Clausecker pmovmskb %xmm2, %eax 181de12a689SRobert Clausecker shl $16, %ecx 182de12a689SRobert Clausecker or %ecx, %eax # location of matches 183de12a689SRobert Clausecker 184de12a689SRobert Clausecker.Lprecisematchb: 185de12a689SRobert Clausecker tzcnt %eax, %eax # find location of match 186de12a689SRobert Clausecker lea -32(%rdi, %rax, 1), %rax # point to matching byte 187de12a689SRobert Clausecker ret 188de12a689SRobert Clausecker 189de12a689SRobert Clausecker.Ltailb: 190de12a689SRobert Clausecker pcmpeqb %xmm2, %xmm1 191de12a689SRobert Clausecker pcmpeqb %xmm2, %xmm0 192de12a689SRobert Clausecker pmovmskb %xmm1, %edx 193de12a689SRobert Clausecker pmovmskb %xmm0, %eax 194de12a689SRobert Clausecker shl $16, %edx 195de12a689SRobert Clausecker or %edx, %eax # location of matches 196de12a689SRobert Clausecker and %r8d, %eax # mask out matches beyond buffer 197de12a689SRobert Clausecker bsf %eax, %edx # location of match 198de12a689SRobert Clausecker lea -32(%rdi, %rdx, 1), %rdx # pointer to match (if any) 199de12a689SRobert Clausecker cmovnz %rdx, %rax # point to match if present, 200de12a689SRobert Clausecker ret # else null pointer 201de12a689SRobert Clausecker 202de12a689SRobert Clausecker.Lnomatchb: 203de12a689SRobert Clausecker xor %eax, %eax # return null pointer 204de12a689SRobert Clausecker ret 205de12a689SRobert ClauseckerARCHEND(__memchr, baseline) 206de12a689SRobert Clausecker 207de12a689SRobert Clausecker .section .note.GNU-stack,"",%progbits 208