/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023 Robert Clausecker
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4, 0x90

/*
 * memrchr: locate the last occurrence of a byte in a buffer.
 *
 * Syntax:  GNU as, AT&T operand order (source, destination).
 * In:      %rdi = start of buffer
 *          %sil = byte to search for
 *          %rdx = length of buffer in bytes
 * Out:     %rax = address of the last matching byte, or 0 if there is none
 *
 * Two implementations are registered below.  NOTE(review): which one is
 * called at run time is decided by the ARCHFUNCS machinery declared in
 * amd64_archlevel.h, which is not visible from this file.
 */
ARCHFUNCS(memrchr)
	ARCHFUNC(memrchr, scalar)
	ARCHFUNC(memrchr, baseline)
ENDARCHFUNCS(memrchr)

/*
 * Scalar variant.
 *
 * Walks the buffer front to back in groups of four bytes and keeps the
 * address of the most recent match in %rax.  Within a group the last
 * match is selected with conditional moves instead of branches.
 *
 * Scratch: %rcx, %r8, %r9, %r10, %r11, flags.
 */
ARCHENTRY(memrchr, scalar)
	xor	%eax, %eax		# prospective return value
	sub	$4, %rdx		# 4 bytes left to process?
	jb	1f

	/* invariant: %rdx = bytes remaining from %rdi, minus 4 */
	ALIGN_TEXT
0:	xor	%r8, %r8
	lea	2(%rdi), %r10
	cmp	%sil, 2(%rdi)
	cmovne	%r8, %r10		# point to null if no match

	cmp	%sil, (%rdi)
	cmove	%rdi, %r8		# point to first char if match

	lea	1(%rdi), %r9
	cmp	%sil, 1(%rdi)
	cmovne	%r8, %r9		# point to first result if no match in second

	lea	3(%rdi), %r11
	cmp	%sil, 3(%rdi)
	cmovne	%r10, %r11		# point to third result if no match in fourth

	test	%r11, %r11
	cmovz	%r9, %r11		# take first pair match if none in second

	test	%r11, %r11
	cmovnz	%r11, %rax		# take match in current set if any

	add	$4, %rdi
	sub	$4, %rdx
	jae	0b

	/*
	 * Tail: %edx is now (bytes remaining - 4), i.e. -4 for no bytes
	 * left up to -1 for three bytes left.  The comparisons below are
	 * unsigned, so "jb" leaves as soon as the remaining count is
	 * exhausted.  The lea instructions sit before the cmp they
	 * accompany only for scheduling; lea does not modify flags.
	 */
1:	cmp	$-3, %edx		# at least one character left to process?
	jb	2f

	cmp	%sil, (%rdi)
	cmove	%rdi, %rax

	lea	1(%rdi), %rcx
	cmp	$-2, %edx		# at least two characters left to process?
	jb	2f

	cmp	%sil, 1(%rdi)
	cmove	%rcx, %rax

	lea	2(%rdi), %rcx
	cmp	$-1, %edx		# at least three characters left to process?
	jb	2f

	cmp	%sil, 2(%rdi)
	cmove	%rcx, %rax

2:	ret
ARCHEND(memrchr, scalar)

/*
 * Baseline variant (uses SSE2 instructions).
 *
 * The buffer is processed front to back in aligned 16 byte chunks.
 * Throughout, the pair (%rsi, %eax) describes the last chunk in which a
 * match was seen: %rsi is the chunk address and %eax its match bit mask
 * (bit i set means byte i of the chunk matched).  %eax is zero while no
 * match has been seen.  All writes to %eax are 32 bit operations, so the
 * upper half of %rax is always zero and a zero mask doubles as the null
 * return value.
 *
 * The first and last chunk are loaded whole even if the buffer covers
 * only part of them; bytes outside the buffer are masked out of the
 * match mask.
 *
 * Scratch: %rcx, %rsi, %r8, %r9, %xmm0, %xmm4, flags.
 */
ARCHENTRY(memrchr, baseline)
	movd		%esi, %xmm4
	test		%rdx, %rdx		# empty buffer?
	jz		.L0			# if yes, return immediately

	punpcklbw	%xmm4, %xmm4		# c -> cc
	mov		%edi, %ecx
	punpcklwd	%xmm4, %xmm4		# cc -> cccc
	and		$~0xf, %rdi		# align source pointer
	pshufd		$0, %xmm4, %xmm4	# cccc -> cccccccccccccccc
	and		$0xf, %ecx		# misalignment of the buffer start
	movdqa		%xmm4, %xmm0
	mov		$-1, %r8d
	pcmpeqb		(%rdi), %xmm0		# compare aligned head
	shl		%cl, %r8d		# mask of bytes in the head of the buffer
	pmovmskb	%xmm0, %eax

	/*
	 * %rdx += misalignment - 16.  The addition carries exactly when
	 * the buffer extends past the head chunk; cmc inverts that, so
	 * "jbe" (CF or ZF) is taken when the buffer ends inside or exactly
	 * at the end of the head chunk.
	 */
	sub		$16, %rcx
	and		%r8d, %eax		# match mask
	add		%rcx, %rdx		# advance past head
	cmc
	jbe		.Lrunt			# did the buffer end in the head?

	mov		%rdi, %rsi		# pointer to matching chunk
	add		$16, %rdi
	sub		$16, %rdx		# enough left for another round?
	jbe		1f

	/*
	 * main loop unrolled twice
	 * invariant: %rdx = bytes remaining from %rdi, minus 16 (> 0)
	 */
	ALIGN_TEXT
0:	movdqa		%xmm4, %xmm0
	pcmpeqb		(%rdi), %xmm0
	pmovmskb	%xmm0, %r8d

	cmp		$16, %rdx		# enough left for second chunk?
	jbe		2f

	movdqa		%xmm4, %xmm0
	pcmpeqb		16(%rdi), %xmm0
	pmovmskb	%xmm0, %ecx

	lea		16(%rdi), %r9
	test		%ecx, %ecx		# match found in second chunk?
	cmovz		%r8d, %ecx		# if not, use match data from first chunk
	cmovz		%rdi, %r9

	test		%ecx, %ecx		# any match found?
	cmovnz		%ecx, %eax		# if yes, overwrite previously found match
	cmovnz		%r9, %rsi

	add		$32, %rdi		# advance to next iteration
	sub		$32, %rdx		# advance to next chunks
	ja		0b

	/*
	 * process remaining 1--16 bytes
	 * %edx = (bytes remaining from %rdi) - 16, in the range -15..0
	 */
1:	pcmpeqb		(%rdi), %xmm4
	mov		$0xffff, %r8d
	xor		%ecx, %ecx
	sub		%edx, %ecx		# number of bytes to be masked out
	pmovmskb	%xmm4, %r9d
	shr		%cl, %r8d		# mask of bytes to be kept in the buffer
	and		%r9d, %r8d
	cmovnz		%r8d, %eax
	cmovnz		%rdi, %rsi

	/*
	 * The bit index goes to %ecx rather than back into %eax: the
	 * result of bsr is architecturally undefined when its source is
	 * zero, so the no-match return value must not depend on it.  With
	 * no match %rax still holds the all-zero mask, which is the null
	 * pointer to return.
	 */
	bsr		%eax, %ecx		# index of last match, ZF set if none
	lea		(%rsi, %rcx, 1), %rsi	# pointer to match (or junk)
	cmovnz		%rsi, %rax		# if any match was found, return it
	ret

	/* end of buffer reached within second half of the iteration */
2:	test		%r8d, %r8d		# match in previous chunk?
	cmovnz		%r8d, %eax		# if yes, overwrite previous chunks
	cmovnz		%rdi, %rsi
	add		$16, %rdi		# point to tail
	sub		$16, %edx		# only the low half is used by the tail
	jmp		1b			# handle tail the same otherwise

	/*
	 * runt: buffer ends within the head chunk.  %edx holds the negated
	 * number of bytes of the chunk that lie past the end of the buffer
	 * (range -15..0).  Bytes before the start of the buffer have
	 * already been masked out of %eax.
	 */
.Lrunt:	mov		$0xffff, %r8d
	xor		%ecx, %ecx
	sub		%edx, %ecx		# number of bytes to be masked out
	shr		%cl, %r8d		# mask of bytes to be kept in the buffer
	and		%r8d, %eax
	bsr		%eax, %ecx		# index of last match, ZF set if none
	lea		(%rdi, %rcx, 1), %rdi	# pointer to match (or junk)
	cmovnz		%rdi, %rax		# no match: %rax stays zero
	ret

	/* empty buffer: return a null pointer */
.L0:	xor		%eax, %eax
	ret
ARCHEND(memrchr, baseline)

	.section .note.GNU-stack, "", %progbits