1/*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2023 Robert Clausecker 5 */ 6 7#include <machine/asm.h> 8 9#include "amd64_archlevel.h" 10 11#define ALIGN_TEXT .p2align 4, 0x90 12 13ARCHFUNCS(memrchr) 14 ARCHFUNC(memrchr, scalar) 15 ARCHFUNC(memrchr, baseline) 16ENDARCHFUNCS(memrchr) 17 18ARCHENTRY(memrchr, scalar) 19 xor %eax, %eax # prospective return value 20 sub $4, %rdx # 4 bytes left to process? 21 jb 1f 22 23 ALIGN_TEXT 240: xor %r8, %r8 25 lea 2(%rdi), %r10 26 cmp %sil, 2(%rdi) 27 cmovne %r8, %r10 # point to null if no match 28 29 cmp %sil, (%rdi) 30 cmove %rdi, %r8 # point to first char if match 31 32 lea 1(%rdi), %r9 33 cmp %sil, 1(%rdi) 34 cmovne %r8, %r9 # point to first result if no match in second 35 36 lea 3(%rdi), %r11 37 cmp %sil, 3(%rdi) 38 cmovne %r10, %r11 39 40 test %r11, %r11 41 cmovz %r9, %r11 # take first pair match if none in second 42 43 test %r11, %r11 44 cmovnz %r11, %rax # take match in current set if any 45 46 add $4, %rdi 47 sub $4, %rdx 48 jae 0b 49 501: cmp $-3, %edx # a least one character left to process? 51 jb 2f 52 53 cmp %sil, (%rdi) 54 cmove %rdi, %rax 55 56 lea 1(%rdi), %rcx 57 cmp $-2, %edx # at least two characters left to process? 58 jb 2f 59 60 cmp %sil, 1(%rdi) 61 cmove %rcx, %rax 62 63 lea 2(%rdi), %rcx 64 cmp $-1, %edx # at least three character left to process? 65 jb 2f 66 67 cmp %sil, 2(%rdi) 68 cmove %rcx, %rax 69 702: ret 71ARCHEND(memrchr, scalar) 72 73ARCHENTRY(memrchr, baseline) 74 movd %esi, %xmm4 75 test %rdx, %rdx # empty buffer? 76 jz .L0 # if yes, return immediately 77 78 punpcklbw %xmm4, %xmm4 # c -> cc 79 mov %edi, %ecx 80 punpcklwd %xmm4, %xmm4 # cc -> cccc 81 and $~0xf, %rdi # align source pointer 82 pshufd $0, %xmm4, %xmm4 # cccc -> cccccccccccccccc 83 and $0xf, %ecx 84 movdqa %xmm4, %xmm0 85 mov $-1, %r8d 86 pcmpeqb (%rdi), %xmm0 # compare aligned head 87 shl %cl, %r8d # mask of bytes in the head of the buffer 88 pmovmskb %xmm0, %eax 89 90 sub $16, %rcx 91 and %r8d, %eax # match mask 92 add %rcx, %rdx # advance past head 93 cmc 94 jbe .Lrunt # did the string end in the buffer? 95 96 mov %rdi, %rsi # pointer to matching chunk 97 add $16, %rdi 98 sub $16, %rdx # enough left for another round? 99 jbe 1f 100 101 /* main loop unrolled twice */ 102 ALIGN_TEXT 1030: movdqa %xmm4, %xmm0 104 pcmpeqb (%rdi), %xmm0 105 pmovmskb %xmm0, %r8d 106 107 cmp $16, %rdx # enough left for second chunk? 108 jbe 2f 109 110 movdqa %xmm4, %xmm0 111 pcmpeqb 16(%rdi), %xmm0 112 pmovmskb %xmm0, %ecx 113 114 lea 16(%rdi), %r9 115 test %ecx, %ecx # match found in second chunk? 116 cmovz %r8d, %ecx # if not, use match data from first chunk 117 cmovz %rdi, %r9 118 119 test %ecx, %ecx # any match found? 120 cmovnz %ecx, %eax # if yes, overwrite previously found match 121 cmovnz %r9, %rsi 122 123 add $32, %rdi # advance to next iteration 124 sub $32, %rdx # advance to next chunks 125 ja 0b 126 127 /* process remaining 1--16 bytes */ 1281: pcmpeqb (%rdi), %xmm4 129 mov $0xffff, %r8d 130 xor %ecx, %ecx 131 sub %edx, %ecx # number of bytes to be masked out 132 pmovmskb %xmm4, %r9d 133 shr %cl, %r8d # mask of bytes to be kept in the buffer 134 and %r9d, %r8d 135 cmovnz %r8d, %eax 136 cmovnz %rdi, %rsi 137 bsr %eax, %eax 138 lea (%rsi, %rax, 1), %rsi # pointer to match (or junk) 139 cmovnz %rsi, %rax # if any match was found, return it 140 ret 141 142 /* end of chunk reached within first half iteration */ 1432: test %r8d, %r8d # match in previous chunk? 144 cmovnz %r8d, %eax # if yes, overwrite previous chunks 145 cmovnz %rdi, %rsi 146 add $16, %rdi # point to tail 147 sub $16, %edx 148 jmp 1b # handle tail the same otherwise 149 150 /* runt: string ends within head, edx has negated amount of invalid head bytes */ 151.Lrunt: mov $0xffff, %r8d 152 xor %ecx, %ecx 153 sub %edx, %ecx 154 shr %cl, %r8d 155 and %r8d, %eax 156 bsr %eax, %eax 157 lea (%rdi, %rax, 1), %rdi 158 cmovnz %rdi, %rax 159 ret 160 161 /* empty buffer: return a null pointer */ 162.L0: xor %eax, %eax 163 ret 164ARCHEND(memrchr, baseline) 165 166 .section .note.GNU-stack, "", %progbits 167