1/*- 2 * Copyright (c) 2023 The FreeBSD Foundation 3 * 4 * This software was developed by Robert Clausecker <fuz@FreeBSD.org> 5 * under sponsorship from the FreeBSD Foundation. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE 27 */ 28 29#include <machine/asm.h> 30 31#include "amd64_archlevel.h" 32 33#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */ 34 35 .weak memchr 36 .set memchr, __memchr 37ARCHFUNCS(__memchr) 38 ARCHFUNC(__memchr, scalar) 39 ARCHFUNC(__memchr, baseline) 40ENDARCHFUNCS(__memchr) 41 42ARCHENTRY(__memchr, scalar) 43 test %rdx, %rdx # empty input? 44 je .Lnomatch 45 46 lea (, %rdi, 8), %ecx 47 mov $-1, %rax 48 add %rdi, %rdx # pointer to end of buffer or to end of 49 cmovc %rax, %rdx # address space (whichever comes first) 50 and $~7, %rdi # align to 8 bytes 51 mov (%rdi), %rax # load first word 52 movzbl %sil, %esi # clear stray high bits 53 movabs $0x0101010101010101, %r8 54 imul %r8, %rsi # replicate char 8 times 55 56 /* compute head and tail masks */ 57 mov %r8, %r10 58 movabs $0x8080808080808080, %r9 59 shl %cl, %r10 # 0x01 where string head is 60 lea (, %rdx, 8), %ecx 61 xor %r8, %r10 # 0x01 where it is not 62 neg %r8 # negate 01..01 so we can use lea 63 mov %r9, %r11 64 xor %rsi, %rax # str ^ c (0x00 where str[i] == c) 65 neg %ecx 66 or %r10, %rax # except before the string 67 shr %cl, %r11 # 0x80 where string tail is 68 69 add $8, %rdi # advance to next 8 bytes 70 cmp %rdx, %rdi # end of buffer reached during head? 71 jae .Ltail # and go to tail-processing code 72 73 /* main loop, unrolled twice */ 74 ALIGN_TEXT 750: lea (%rax, %r8, 1), %rcx # (str ^ c) - 0x01..01 76 not %rax # ~(str ^ c) 77 and %r9, %rax # ((str^c) - 0x01..01) & ~(str^c) 78 and %rcx, %rax # not including junk bytes 79 jnz .Lmatch 80 81 mov (%rdi), %rax 82 add $8, %rdi 83 xor %rsi, %rax # str ^ c 84 cmp %rdx, %rdi 85 jae .Ltail 86 87 lea (%rax, %r8, 1), %rcx # (str ^ c) - 0x01..01 88 not %rax # ~(str ^ c) 89 and %r9, %rax # ((str^c) - 0x01..01) & ~(str^c) 90 and %rcx, %rax # not including junk bytes 91 jnz .Lmatch 92 93 mov (%rdi), %rax 94 add $8, %rdi 95 xor %rsi, %rax # str ^ c 96 cmp %rdx, %rdi 97 jb 0b 98 99.Ltail: lea (%rax, %r8, 1), %rcx # (str ^ c) - 0x01..01 100 not %rax # ~(str ^ c) 101 and %r11, %rax # ((str^c) - 0x01..01) & ~(str^c) 102 and %rcx, %rax # not including junk bytes or bytes past buffer 103 jz .Lnomatch 104 105.Lmatch: 106 tzcnt %rax, %rax # first match 107 shr $3, %eax # scale from bit to byte index 108 lea -8(%rdi, %rax), %rax # pointer to found c 109 ret 110 111 /* no match found */ 112.Lnomatch: 113 xor %eax, %eax # return null pointer 114 ret 115ARCHEND(__memchr, scalar) 116 117ARCHENTRY(__memchr, baseline) 118 test %rdx, %rdx # empty input? 119 je .Lnomatchb 120 121 movd %esi, %xmm2 122 mov %edi, %ecx 123 mov $-1, %r9 124 add %rdi, %rdx # pointer to end of buffer or to end of 125 cmovc %r9, %rdx # address space (whichever comes first) 126 and $~0x1f, %rdi # align to 32 bytes 127 movdqa (%rdi), %xmm0 # load first 32 bytes 128 movdqa 16(%rdi), %xmm1 129 130 punpcklbw %xmm2, %xmm2 # c -> cc 131 132 shl %cl, %r9d # mask with zeroes before the string 133 134 punpcklwd %xmm2, %xmm2 # cc -> cccc 135 136 mov $-1, %r8d 137 xor %ecx, %ecx 138 sub %edx, %ecx # edx = -ecx 139 shr %cl, %r8d # bytes in tail that are part of the buffer 140 141 pshufd $0, %xmm2, %xmm2 # cccc -> cccccccccccccccc 142 143 add $32, %rdi # advance to next 32 bytes 144 mov $-1, %eax 145 cmp %rdx, %rdi # end of buffer reached during head? 146 cmovae %r8d, %eax # if yes, do combined head/tail processing 147 and %r9d, %eax # mask of bytes in head part of string 148 149 /* process head */ 150 pcmpeqb %xmm2, %xmm1 151 pcmpeqb %xmm2, %xmm0 152 pmovmskb %xmm1, %esi 153 pmovmskb %xmm0, %ecx 154 shl $16, %esi 155 or %esi, %ecx # locations of matches 156 and %ecx, %eax # any match inside buffer? 157 jnz .Lprecisematchb 158 159 cmp %rdx, %rdi # did the buffer end here? 160 jae .Lnomatchb # if yes we are done 161 162 /* main loop */ 163 ALIGN_TEXT 1640: movdqa (%rdi), %xmm0 # load next string chunk 165 movdqa 16(%rdi), %xmm1 166 add $32, %rdi 167 cmp %rdx, %rdi # ready for main loop? 168 jae .Ltailb 169 170 pcmpeqb %xmm2, %xmm0 171 pcmpeqb %xmm2, %xmm1 172 por %xmm1, %xmm0 # match in either half? 173 pmovmskb %xmm0, %eax 174 test %eax, %eax 175 jz 0b 176 177.Lmatchb: 178 pcmpeqb -32(%rdi), %xmm2 # redo comparison of first 16 bytes 179 pmovmskb %xmm1, %ecx 180 pmovmskb %xmm2, %eax 181 shl $16, %ecx 182 or %ecx, %eax # location of matches 183 184.Lprecisematchb: 185 tzcnt %eax, %eax # find location of match 186 lea -32(%rdi, %rax, 1), %rax # point to matching byte 187 ret 188 189.Ltailb: 190 pcmpeqb %xmm2, %xmm1 191 pcmpeqb %xmm2, %xmm0 192 pmovmskb %xmm1, %edx 193 pmovmskb %xmm0, %eax 194 shl $16, %edx 195 or %edx, %eax # location of matches 196 and %r8d, %eax # mask out matches beyond buffer 197 bsf %eax, %edx # location of match 198 lea -32(%rdi, %rdx, 1), %rdx # pointer to match (if any) 199 cmovnz %rdx, %rax # point to match if present, 200 ret # else null pointer 201 202.Lnomatchb: 203 xor %eax, %eax # return null pointer 204 ret 205ARCHEND(__memchr, baseline) 206 207 .section .note.GNU-stack,"",%progbits 208