1*2ed514a2SRobert Clausecker/*- 2*2ed514a2SRobert Clausecker * Copyright (c) 2023 The FreeBSD Foundation 3*2ed514a2SRobert Clausecker * 4*2ed514a2SRobert Clausecker * This software was developed by Robert Clausecker <fuz@FreeBSD.org> 5*2ed514a2SRobert Clausecker * under sponsorship from the FreeBSD Foundation. 6*2ed514a2SRobert Clausecker * 7*2ed514a2SRobert Clausecker * Redistribution and use in source and binary forms, with or without 8*2ed514a2SRobert Clausecker * modification, are permitted provided that the following conditions 9*2ed514a2SRobert Clausecker * are met: 10*2ed514a2SRobert Clausecker * 1. Redistributions of source code must retain the above copyright 11*2ed514a2SRobert Clausecker * notice, this list of conditions and the following disclaimer. 12*2ed514a2SRobert Clausecker * 2. Redistributions in binary form must reproduce the above copyright 13*2ed514a2SRobert Clausecker * notice, this list of conditions and the following disclaimer in the 14*2ed514a2SRobert Clausecker * documentation and/or other materials provided with the distribution. 15*2ed514a2SRobert Clausecker * 16*2ed514a2SRobert Clausecker * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND 17*2ed514a2SRobert Clausecker * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18*2ed514a2SRobert Clausecker * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19*2ed514a2SRobert Clausecker * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20*2ed514a2SRobert Clausecker * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21*2ed514a2SRobert Clausecker * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22*2ed514a2SRobert Clausecker * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23*2ed514a2SRobert Clausecker * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24*2ed514a2SRobert Clausecker * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25*2ed514a2SRobert Clausecker * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26*2ed514a2SRobert Clausecker * SUCH DAMAGE 27*2ed514a2SRobert Clausecker */ 28*2ed514a2SRobert Clausecker 29*2ed514a2SRobert Clausecker#include <machine/asm.h> 30*2ed514a2SRobert Clausecker 31*2ed514a2SRobert Clausecker#include "amd64_archlevel.h" 32*2ed514a2SRobert Clausecker 33*2ed514a2SRobert Clausecker#define ALIGN_TEXT .p2align 4,0x90 # 16-byte alignment, nop-filled 34*2ed514a2SRobert Clausecker 35*2ed514a2SRobert Clausecker .weak rindex 36*2ed514a2SRobert Clausecker .set rindex, strrchr 37*2ed514a2SRobert Clausecker 38*2ed514a2SRobert ClauseckerARCHFUNCS(strrchr) 39*2ed514a2SRobert Clausecker ARCHFUNC(strrchr, scalar) 40*2ed514a2SRobert Clausecker ARCHFUNC(strrchr, baseline) 41*2ed514a2SRobert ClauseckerENDARCHFUNCS(strrchr) 42*2ed514a2SRobert Clausecker 43*2ed514a2SRobert ClauseckerARCHENTRY(strrchr, scalar) 44*2ed514a2SRobert Clausecker mov %edi, %ecx 45*2ed514a2SRobert Clausecker and $~7, %rdi # align to 8 byte 46*2ed514a2SRobert Clausecker movzbl %sil, %esi # clear stray high bits 47*2ed514a2SRobert Clausecker movabs $0x0101010101010101, %r8 48*2ed514a2SRobert Clausecker mov (%rdi), %rax # load first word 49*2ed514a2SRobert Clausecker imul %r8, %rsi # replicate char 8 times 50*2ed514a2SRobert Clausecker 51*2ed514a2SRobert Clausecker /* 52*2ed514a2SRobert Clausecker * Unaligned input: align to 8 bytes. Then proceed the same 53*2ed514a2SRobert Clausecker * way as with aligned input, but prevent matches before the 54*2ed514a2SRobert Clausecker * beginning of the string. This is achieved by oring 0x01 55*2ed514a2SRobert Clausecker * into each byte of the buffer before the string 56*2ed514a2SRobert Clausecker */ 57*2ed514a2SRobert Clausecker shl $3, %ecx 58*2ed514a2SRobert Clausecker mov %r8, %r10 59*2ed514a2SRobert Clausecker shl %cl, %r10 # 0x01 where the string is 60*2ed514a2SRobert Clausecker xor %r8, %r10 # 0x01 where it is not 61*2ed514a2SRobert Clausecker neg %r8 # negate 01..01 so we can use lea 62*2ed514a2SRobert Clausecker movabs $0x8080808080808080, %r9 63*2ed514a2SRobert Clausecker 64*2ed514a2SRobert Clausecker mov %rsi, %rcx 65*2ed514a2SRobert Clausecker xor %rax, %rcx # str ^ c 66*2ed514a2SRobert Clausecker or %r10, %rax # ensure str != 0 before string 67*2ed514a2SRobert Clausecker or %r10, %rcx # ensure str^c != 0 before string 68*2ed514a2SRobert Clausecker bswap %rcx # in reverse order, to find last match 69*2ed514a2SRobert Clausecker mov %rdi, %r10 # location of initial mismatch (if any) 70*2ed514a2SRobert Clausecker xor %r11, %r11 # initial mismatch (none) 71*2ed514a2SRobert Clausecker add $8, %rdi # advance to next iteration 72*2ed514a2SRobert Clausecker lea (%rax, %r8, 1), %rdx # str - 0x01..01 73*2ed514a2SRobert Clausecker not %rax # ~str 74*2ed514a2SRobert Clausecker and %rdx, %rax # (str - 0x01..01) & ~str 75*2ed514a2SRobert Clausecker and %r9, %rax # not including junk bits 76*2ed514a2SRobert Clausecker jnz 1f # end of string? 77*2ed514a2SRobert Clausecker 78*2ed514a2SRobert Clausecker lea (%rcx, %r8, 1), %rdx # (str ^ c) - 0x01..01 79*2ed514a2SRobert Clausecker not %rcx # ~(str ^ c) 80*2ed514a2SRobert Clausecker and %rdx, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c) 81*2ed514a2SRobert Clausecker and %r9, %rcx # not including junk bits 82*2ed514a2SRobert Clausecker mov %rcx, %r11 # remember mismatch in head 83*2ed514a2SRobert Clausecker jmp 0f 84*2ed514a2SRobert Clausecker 85*2ed514a2SRobert Clausecker /* main loop unrolled twice */ 86*2ed514a2SRobert Clausecker ALIGN_TEXT 87*2ed514a2SRobert Clausecker3: lea (%rcx, %r8, 1), %rdx # (str ^ c) - 0x01..01 88*2ed514a2SRobert Clausecker not %rcx # ~(str ^ c) 89*2ed514a2SRobert Clausecker and %rdx, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c) 90*2ed514a2SRobert Clausecker and %r9, %rcx # not including junk bits 91*2ed514a2SRobert Clausecker lea -8(%rdi), %rdx 92*2ed514a2SRobert Clausecker cmovnz %rdx, %r10 # remember location of current mismatch 93*2ed514a2SRobert Clausecker cmovnz %rcx, %r11 94*2ed514a2SRobert Clausecker 95*2ed514a2SRobert Clausecker0: mov (%rdi), %rax # str 96*2ed514a2SRobert Clausecker mov %rsi, %rcx 97*2ed514a2SRobert Clausecker xor %rax, %rcx # str ^ c 98*2ed514a2SRobert Clausecker bswap %rcx # in reverse order, to find last match 99*2ed514a2SRobert Clausecker lea (%rax, %r8, 1), %rdx # str - 0x01..01 100*2ed514a2SRobert Clausecker not %rax # ~str 101*2ed514a2SRobert Clausecker and %rdx, %rax # (str - 0x01..01) & ~str 102*2ed514a2SRobert Clausecker and %r9, %rax # not including junk bits 103*2ed514a2SRobert Clausecker jnz 2f # end of string? 104*2ed514a2SRobert Clausecker 105*2ed514a2SRobert Clausecker lea (%rcx, %r8, 1), %rdx # (str ^ c) - 0x01..01 106*2ed514a2SRobert Clausecker not %rcx # ~(str ^ c) 107*2ed514a2SRobert Clausecker and %rdx, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c) 108*2ed514a2SRobert Clausecker and %r9, %rcx # not including junk bits 109*2ed514a2SRobert Clausecker cmovnz %rdi, %r10 # remember location of current mismatch 110*2ed514a2SRobert Clausecker cmovnz %rcx, %r11 111*2ed514a2SRobert Clausecker 112*2ed514a2SRobert Clausecker mov 8(%rdi), %rax # str 113*2ed514a2SRobert Clausecker add $16, %rdi 114*2ed514a2SRobert Clausecker mov %rsi, %rcx 115*2ed514a2SRobert Clausecker xor %rax, %rcx # str ^ c 116*2ed514a2SRobert Clausecker bswap %rcx 117*2ed514a2SRobert Clausecker lea (%rax, %r8, 1), %rdx # str - 0x01..01 118*2ed514a2SRobert Clausecker not %rax # ~str 119*2ed514a2SRobert Clausecker and %rdx, %rax # (str - 0x01..01) & ~str 120*2ed514a2SRobert Clausecker and %r9, %rax # not including junk bits 121*2ed514a2SRobert Clausecker jz 3b # end of string? 122*2ed514a2SRobert Clausecker 123*2ed514a2SRobert Clausecker /* NUL found */ 124*2ed514a2SRobert Clausecker1: sub $8, %rdi # undo advance past buffer 125*2ed514a2SRobert Clausecker2: lea (%rcx, %r8, 1), %rdx # (str ^ c) - 0x01..01 126*2ed514a2SRobert Clausecker not %rcx # ~(str ^ c) 127*2ed514a2SRobert Clausecker and %rdx, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c) 128*2ed514a2SRobert Clausecker and %r9, %rcx # not including junk bits 129*2ed514a2SRobert Clausecker lea -1(%rax), %rdx 130*2ed514a2SRobert Clausecker xor %rdx, %rax # mask of bytes in the string 131*2ed514a2SRobert Clausecker bswap %rdx # in reverse order 132*2ed514a2SRobert Clausecker and %rdx, %rcx # c found in the tail? 133*2ed514a2SRobert Clausecker cmovnz %rdi, %r10 134*2ed514a2SRobert Clausecker cmovnz %rcx, %r11 135*2ed514a2SRobert Clausecker bswap %r11 # unreverse byte order 136*2ed514a2SRobert Clausecker bsr %r11, %rcx # last location of c in (R10) 137*2ed514a2SRobert Clausecker shr $3, %rcx # as byte offset 138*2ed514a2SRobert Clausecker lea (%r10, %rcx, 1), %rax # pointer to match 139*2ed514a2SRobert Clausecker test %r11, %r11 # was there actually a match? 140*2ed514a2SRobert Clausecker cmovz %r11, %rax # if not, return null pointer 141*2ed514a2SRobert Clausecker ret 142*2ed514a2SRobert ClauseckerARCHEND(strrchr, scalar) 143*2ed514a2SRobert Clausecker 144*2ed514a2SRobert ClauseckerARCHENTRY(strrchr, baseline) 145*2ed514a2SRobert Clausecker mov %edi, %ecx 146*2ed514a2SRobert Clausecker and $~0xf, %rdi # align to 16 bytes 147*2ed514a2SRobert Clausecker movdqa (%rdi), %xmm1 148*2ed514a2SRobert Clausecker movd %esi, %xmm0 149*2ed514a2SRobert Clausecker and $0xf, %ecx # offset from alignment 150*2ed514a2SRobert Clausecker pxor %xmm2, %xmm2 151*2ed514a2SRobert Clausecker mov $-1, %edx 152*2ed514a2SRobert Clausecker punpcklbw %xmm0, %xmm0 # c -> cc 153*2ed514a2SRobert Clausecker shl %cl, %edx # bits corresponding to bytes in the string 154*2ed514a2SRobert Clausecker punpcklwd %xmm0, %xmm0 # cc -> cccc 155*2ed514a2SRobert Clausecker xor %r8, %r8 # address of latest match 156*2ed514a2SRobert Clausecker mov $1, %esi # bit mask of latest match 157*2ed514a2SRobert Clausecker mov %rdi, %r9 # candidate location for next match 158*2ed514a2SRobert Clausecker add $16, %rdi # advance to next chunk 159*2ed514a2SRobert Clausecker 160*2ed514a2SRobert Clausecker /* check for match in head */ 161*2ed514a2SRobert Clausecker pcmpeqb %xmm1, %xmm2 # NUL byte present? 162*2ed514a2SRobert Clausecker pshufd $0, %xmm0, %xmm0 # cccc -> cccccccccccccccc 163*2ed514a2SRobert Clausecker pcmpeqb %xmm0, %xmm1 # c present? 164*2ed514a2SRobert Clausecker pmovmskb %xmm2, %eax 165*2ed514a2SRobert Clausecker pmovmskb %xmm1, %ecx 166*2ed514a2SRobert Clausecker and %edx, %ecx # c present in the string? 167*2ed514a2SRobert Clausecker and %edx, %eax # NUL present in the string? 168*2ed514a2SRobert Clausecker jnz .Lend2 169*2ed514a2SRobert Clausecker 170*2ed514a2SRobert Clausecker /* main loop unrolled twice */ 171*2ed514a2SRobert Clausecker ALIGN_TEXT 172*2ed514a2SRobert Clausecker0: movdqa (%rdi), %xmm1 173*2ed514a2SRobert Clausecker test %ecx, %ecx # was there a match in the last iter.? 174*2ed514a2SRobert Clausecker cmovnz %r9, %r8 # remember match if any 175*2ed514a2SRobert Clausecker cmovnz %ecx, %esi 176*2ed514a2SRobert Clausecker pxor %xmm2, %xmm2 177*2ed514a2SRobert Clausecker pcmpeqb %xmm1, %xmm2 # NUL byte present? 178*2ed514a2SRobert Clausecker pcmpeqb %xmm0, %xmm1 # c present? 179*2ed514a2SRobert Clausecker pmovmskb %xmm2, %eax 180*2ed514a2SRobert Clausecker pmovmskb %xmm1, %ecx 181*2ed514a2SRobert Clausecker test %eax, %eax # end of string in first half? 182*2ed514a2SRobert Clausecker jnz .Lend 183*2ed514a2SRobert Clausecker 184*2ed514a2SRobert Clausecker movdqa 16(%rdi), %xmm1 185*2ed514a2SRobert Clausecker test %ecx, %ecx # was there a match in the last iter.? 186*2ed514a2SRobert Clausecker cmovnz %rdi, %r8 # remember match if any 187*2ed514a2SRobert Clausecker cmovnz %ecx, %esi 188*2ed514a2SRobert Clausecker pxor %xmm2, %xmm2 189*2ed514a2SRobert Clausecker pcmpeqb %xmm1, %xmm2 # NUL byte present? 190*2ed514a2SRobert Clausecker pcmpeqb %xmm0, %xmm1 # c present? 191*2ed514a2SRobert Clausecker pmovmskb %xmm2, %eax 192*2ed514a2SRobert Clausecker pmovmskb %xmm1, %ecx 193*2ed514a2SRobert Clausecker lea 16(%rdi), %r9 194*2ed514a2SRobert Clausecker add $32, %rdi 195*2ed514a2SRobert Clausecker test %eax, %eax # end of string in second half? 196*2ed514a2SRobert Clausecker jz 0b 197*2ed514a2SRobert Clausecker 198*2ed514a2SRobert Clausecker ALIGN_TEXT 199*2ed514a2SRobert Clausecker.Lend2: sub $16, %rdi 200*2ed514a2SRobert Clausecker.Lend: lea -1(%rax), %edx 201*2ed514a2SRobert Clausecker xor %edx, %eax # mask of bytes in the string 202*2ed514a2SRobert Clausecker and %eax, %ecx # c found in the tail? 203*2ed514a2SRobert Clausecker cmovnz %rdi, %r8 204*2ed514a2SRobert Clausecker cmovnz %ecx, %esi 205*2ed514a2SRobert Clausecker bsr %esi, %esi # last location of c in (R8) 206*2ed514a2SRobert Clausecker lea (%r8, %rsi, 1), %rax # pointer to match 207*2ed514a2SRobert Clausecker ret 208*2ed514a2SRobert ClauseckerARCHEND(strrchr, baseline) 209*2ed514a2SRobert Clausecker .section .note.GNU-stack,"",%progbits 210