1/*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org> 5 */ 6 7#include <machine/asm.h> 8 9/* 10 * a0 - const void *b 11 * a1 - int c 12 * a2 - size_t len 13 */ 14ENTRY(memchr) 15 /* 16 * a0 - const char *ptr 17 * a1 - char cccccccc[8] 18 * a2 - char iter[8] 19 * a3 - uint8_t *end 20 * a4 - uint64_t *end_align 21 * a5 - uint64_t *end_unroll 22 */ 23 24 beqz a2, .Lno_match 25 26 /* c = (uint8_t) c */ 27 andi a1, a1, 0xFF 28 29 /* 30 * t0 = 0x0101010101010101 31 * t1 = 0x8080808080808080 32 * t2 = b << 3 33 * cccccccc = (uint8_t)c * t0 34 * end = b + len; 35 * ptr = b & ~0b111 36 */ 37 add a3, a0, a2 38 li t0, 0x01010101 39 sltu t2, a0, a3 40 slli t1, t0, 32 41 neg t2, t2 42 or t0, t0, t1 43 and a3, a3, t2 44 slli t1, t0, 7 45 slli t2, a0, 3 46 and a0, a0, ~0b111 47 mul a1, t0, a1 48 49 ld a2, (a0) 50 51 /* 52 * mask_start = REP8_0x01 ^ (REP8_0x01 << t2) 53 * iter = iter ^ cccccccc 54 * iter = iter | mask_start 55 */ 56 sll t2, t0, t2 57 xor a2, a2, a1 58 xor t2, t2, t0 59 or a2, a2, t2 60 61 /* has_zero(iter) 62 * end_align = (end + 7) & ~0b111; 63 */ 64 addi a4, a3, 7 65 not t2, a2 66 sub a2, a2, t0 67 and t2, t2, t1 68 andi a4, a4, ~0b111 69 and a2, a2, t2 70 71 /* ptr = ptr + 8 */ 72 addi a0, a0, 8 73 74 bnez a2, .Lfind_zero 75 76 /* if(ptr == end_align) */ 77 beq a0, a4, .Lno_match 78 79 /* end_unroll = end_align & ~0b1111 */ 80 andi a5, a4, ~0b1111 81 82 /* 83 * Instead of branching to check if `ptr` is 16-byte aligned: 84 * - Probe the next 8 bytes for `c` 85 * - Align `ptr` down to the nearest 16-byte boundary 86 * 87 * If `ptr` was already 16-byte aligned, those 8 bytes will be 88 * checked again inside the unrolled loop. 89 * 90 * This removes an unpredictable branch and improves performance. 91 */ 92 93 ld a2, (a0) 94 xor a2, a2, a1 95 96 not t2, a2 97 sub a2, a2, t0 98 and t2, t2, t1 99 and a2, a2, t2 100 101 addi a0, a0, 8 102 103 bnez a2, .Lfind_zero 104 105 andi a0, a0, ~0b1111 106 107 /* while(ptr != end_unroll) */ 108 beq a0, a5, .Lskip_loop 109.Lloop: 110 ld a2, (a0) 111 ld t3, 8(a0) 112 113 xor a2, a2, a1 114 xor t3, t3, a1 115 116 not t2, a2 117 not t4, t3 118 sub a2, a2, t0 119 sub t3, t3, t0 120 and t2, t2, t1 121 and t4, t4, t1 122 and a2, a2, t2 123 and t3, t3, t4 124 125 addi a0, a0, 8 126 127 bnez a2, .Lfind_zero 128 129 /* move into iter for find_zero */ 130 mv a2, t3 131 132 addi a0, a0, 8 133 134 bnez a2, .Lfind_zero 135 136 bne a0, a5, .Lloop 137.Lskip_loop: 138 139 /* there might be one 8byte left */ 140 beq a0, a4, .Lno_match 141 142 ld a2, (a0) 143 xor a2, a2, a1 144 145 not t2, a2 146 sub a2, a2, t0 147 and t2, t2, t1 148 and a2, a2, t2 149 150 addi a0, a0, 8 151 152 beqz a2, .Lno_match 153 154.Lfind_zero: 155 /* 156 * ptr = ptr - 8 157 * t1 = 0x0001020304050607 158 * iter = iter & (-iter) 159 * iter = iter >> 7 160 * iter = iter * t1 161 * iter = iter >> 56 162 */ 163 li t1, 0x10203000 164 neg t0, a2 165 slli t1, t1, 4 166 and a2, a2, t0 167 addi t1, t1, 0x405 168 srli a2, a2, 7 169 slli t1, t1, 16 170 addi a0, a0, -8 171 addi t1, t1, 0x607 172 mul a2, a2, t1 173 srli a2, a2, 56 174 175 /* left = end - ptr */ 176 sub t0, a3, a0 177 178 /* return iter < left ? ptr + iter : NULL */ 179 sltu t1, a2, t0 180 neg t1, t1 181 add a0, a0, a2 182 and a0, a0, t1 183 ret 184 185.Lno_match: 186 li a0, 0 187 ret 188END(memchr) 189