1*563efdd3SStrahinja Stanišić/*- 2*563efdd3SStrahinja Stanišić * SPDX-License-Identifier: BSD-2-Clause 3*563efdd3SStrahinja Stanišić * 4*563efdd3SStrahinja Stanišić * Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org> 5*563efdd3SStrahinja Stanišić */ 6*563efdd3SStrahinja Stanišić 7*563efdd3SStrahinja Stanišić#include <machine/asm.h> 8*563efdd3SStrahinja Stanišić 9*563efdd3SStrahinja Stanišić/* 10*563efdd3SStrahinja Stanišić * a0 - const void *b 11*563efdd3SStrahinja Stanišić * a1 - int c 12*563efdd3SStrahinja Stanišić * a2 - size_t len 13*563efdd3SStrahinja Stanišić */ 14*563efdd3SStrahinja StanišićENTRY(memchr) 15*563efdd3SStrahinja Stanišić /* 16*563efdd3SStrahinja Stanišić * a0 - const char *ptr 17*563efdd3SStrahinja Stanišić * a1 - char cccccccc[8] 18*563efdd3SStrahinja Stanišić * a2 - char iter[8] 19*563efdd3SStrahinja Stanišić * a3 - uint8_t *end 20*563efdd3SStrahinja Stanišić * a4 - uint64_t *end_align 21*563efdd3SStrahinja Stanišić * a5 - uint64_t *end_unroll 22*563efdd3SStrahinja Stanišić */ 23*563efdd3SStrahinja Stanišić 24*563efdd3SStrahinja Stanišić beqz a2, .Lno_match 25*563efdd3SStrahinja Stanišić 26*563efdd3SStrahinja Stanišić /* c = (uint8_t) c */ 27*563efdd3SStrahinja Stanišić andi a1, a1, 0xFF 28*563efdd3SStrahinja Stanišić 29*563efdd3SStrahinja Stanišić /* 30*563efdd3SStrahinja Stanišić * t0 = 0x0101010101010101 31*563efdd3SStrahinja Stanišić * t1 = 0x8080808080808080 32*563efdd3SStrahinja Stanišić * t2 = b << 3 33*563efdd3SStrahinja Stanišić * cccccccc = (uint8_t)c * t0 34*563efdd3SStrahinja Stanišić * end = b + len; 35*563efdd3SStrahinja Stanišić * ptr = b & ~0b111 36*563efdd3SStrahinja Stanišić */ 37*563efdd3SStrahinja Stanišić add a3, a0, a2 38*563efdd3SStrahinja Stanišić li t0, 0x01010101 39*563efdd3SStrahinja Stanišić sltu t2, a0, a3 40*563efdd3SStrahinja Stanišić slli t1, t0, 32 41*563efdd3SStrahinja Stanišić neg t2, t2 42*563efdd3SStrahinja Stanišić or t0, t0, t1 43*563efdd3SStrahinja Stanišić and a3, a3, t2 44*563efdd3SStrahinja Stanišić slli t1, t0, 7 45*563efdd3SStrahinja Stanišić slli t2, a0, 3 46*563efdd3SStrahinja Stanišić and a0, a0, ~0b111 47*563efdd3SStrahinja Stanišić mul a1, t0, a1 48*563efdd3SStrahinja Stanišić 49*563efdd3SStrahinja Stanišić ld a2, (a0) 50*563efdd3SStrahinja Stanišić 51*563efdd3SStrahinja Stanišić /* 52*563efdd3SStrahinja Stanišić * mask_start = REP8_0x01 ^ (REP8_0x01 << t2) 53*563efdd3SStrahinja Stanišić * iter = iter ^ cccccccc 54*563efdd3SStrahinja Stanišić * iter = iter | mask_start 55*563efdd3SStrahinja Stanišić */ 56*563efdd3SStrahinja Stanišić sll t2, t0, t2 57*563efdd3SStrahinja Stanišić xor a2, a2, a1 58*563efdd3SStrahinja Stanišić xor t2, t2, t0 59*563efdd3SStrahinja Stanišić or a2, a2, t2 60*563efdd3SStrahinja Stanišić 61*563efdd3SStrahinja Stanišić /* has_zero(iter) 62*563efdd3SStrahinja Stanišić * end_align = (end + 7) & ~0b111; 63*563efdd3SStrahinja Stanišić */ 64*563efdd3SStrahinja Stanišić addi a4, a3, 7 65*563efdd3SStrahinja Stanišić not t2, a2 66*563efdd3SStrahinja Stanišić sub a2, a2, t0 67*563efdd3SStrahinja Stanišić and t2, t2, t1 68*563efdd3SStrahinja Stanišić andi a4, a4, ~0b111 69*563efdd3SStrahinja Stanišić and a2, a2, t2 70*563efdd3SStrahinja Stanišić 71*563efdd3SStrahinja Stanišić /* ptr = ptr + 8 */ 72*563efdd3SStrahinja Stanišić addi a0, a0, 8 73*563efdd3SStrahinja Stanišić 74*563efdd3SStrahinja Stanišić bnez a2, .Lfind_zero 75*563efdd3SStrahinja Stanišić 76*563efdd3SStrahinja Stanišić /* if(ptr == end_align) */ 77*563efdd3SStrahinja Stanišić beq a0, a4, .Lno_match 78*563efdd3SStrahinja Stanišić 79*563efdd3SStrahinja Stanišić /* end_unroll = end_align & ~0b1111 */ 80*563efdd3SStrahinja Stanišić andi a5, a4, ~0b1111 81*563efdd3SStrahinja Stanišić 82*563efdd3SStrahinja Stanišić /* 83*563efdd3SStrahinja Stanišić * Instead of branching to check if `ptr` is 16-byte aligned: 84*563efdd3SStrahinja Stanišić * - Probe the next 8 bytes for `c` 85*563efdd3SStrahinja Stanišić * - Align `ptr` down to the nearest 16-byte boundary 86*563efdd3SStrahinja Stanišić * 87*563efdd3SStrahinja Stanišić * If `ptr` was already 16-byte aligned, those 8 bytes will be 88*563efdd3SStrahinja Stanišić * checked again inside the unrolled loop. 89*563efdd3SStrahinja Stanišić * 90*563efdd3SStrahinja Stanišić * This removes an unpredictable branch and improves performance. 91*563efdd3SStrahinja Stanišić */ 92*563efdd3SStrahinja Stanišić 93*563efdd3SStrahinja Stanišić ld a2, (a0) 94*563efdd3SStrahinja Stanišić xor a2, a2, a1 95*563efdd3SStrahinja Stanišić 96*563efdd3SStrahinja Stanišić not t2, a2 97*563efdd3SStrahinja Stanišić sub a2, a2, t0 98*563efdd3SStrahinja Stanišić and t2, t2, t1 99*563efdd3SStrahinja Stanišić and a2, a2, t2 100*563efdd3SStrahinja Stanišić 101*563efdd3SStrahinja Stanišić addi a0, a0, 8 102*563efdd3SStrahinja Stanišić 103*563efdd3SStrahinja Stanišić bnez a2, .Lfind_zero 104*563efdd3SStrahinja Stanišić 105*563efdd3SStrahinja Stanišić andi a0, a0, ~0b1111 106*563efdd3SStrahinja Stanišić 107*563efdd3SStrahinja Stanišić /* while(ptr != end_unroll) */ 108*563efdd3SStrahinja Stanišić beq a0, a5, .Lskip_loop 109*563efdd3SStrahinja Stanišić.Lloop: 110*563efdd3SStrahinja Stanišić ld a2, (a0) 111*563efdd3SStrahinja Stanišić ld t3, 8(a0) 112*563efdd3SStrahinja Stanišić 113*563efdd3SStrahinja Stanišić xor a2, a2, a1 114*563efdd3SStrahinja Stanišić xor t3, t3, a1 115*563efdd3SStrahinja Stanišić 116*563efdd3SStrahinja Stanišić not t2, a2 117*563efdd3SStrahinja Stanišić not t4, t3 118*563efdd3SStrahinja Stanišić sub a2, a2, t0 119*563efdd3SStrahinja Stanišić sub t3, t3, t0 120*563efdd3SStrahinja Stanišić and t2, t2, t1 121*563efdd3SStrahinja Stanišić and t4, t4, t1 122*563efdd3SStrahinja Stanišić and a2, a2, t2 123*563efdd3SStrahinja Stanišić and t3, t3, t4 124*563efdd3SStrahinja Stanišić 125*563efdd3SStrahinja Stanišić addi a0, a0, 8 126*563efdd3SStrahinja Stanišić 127*563efdd3SStrahinja Stanišić bnez a2, .Lfind_zero 128*563efdd3SStrahinja Stanišić 129*563efdd3SStrahinja Stanišić /* move into iter for find_zero */ 130*563efdd3SStrahinja Stanišić mv a2, t3 131*563efdd3SStrahinja Stanišić 132*563efdd3SStrahinja Stanišić addi a0, a0, 8 133*563efdd3SStrahinja Stanišić 134*563efdd3SStrahinja Stanišić bnez a2, .Lfind_zero 135*563efdd3SStrahinja Stanišić 136*563efdd3SStrahinja Stanišić bne a0, a5, .Lloop 137*563efdd3SStrahinja Stanišić.Lskip_loop: 138*563efdd3SStrahinja Stanišić 139*563efdd3SStrahinja Stanišić /* there might be one 8byte left */ 140*563efdd3SStrahinja Stanišić beq a0, a4, .Lno_match 141*563efdd3SStrahinja Stanišić 142*563efdd3SStrahinja Stanišić ld a2, (a0) 143*563efdd3SStrahinja Stanišić xor a2, a2, a1 144*563efdd3SStrahinja Stanišić 145*563efdd3SStrahinja Stanišić not t2, a2 146*563efdd3SStrahinja Stanišić sub a2, a2, t0 147*563efdd3SStrahinja Stanišić and t2, t2, t1 148*563efdd3SStrahinja Stanišić and a2, a2, t2 149*563efdd3SStrahinja Stanišić 150*563efdd3SStrahinja Stanišić addi a0, a0, 8 151*563efdd3SStrahinja Stanišić 152*563efdd3SStrahinja Stanišić beqz a2, .Lno_match 153*563efdd3SStrahinja Stanišić 154*563efdd3SStrahinja Stanišić.Lfind_zero: 155*563efdd3SStrahinja Stanišić /* 156*563efdd3SStrahinja Stanišić * ptr = ptr - 8 157*563efdd3SStrahinja Stanišić * t1 = 0x0001020304050607 158*563efdd3SStrahinja Stanišić * iter = iter & (-iter) 159*563efdd3SStrahinja Stanišić * iter = iter >> 7 160*563efdd3SStrahinja Stanišić * iter = iter * t1 161*563efdd3SStrahinja Stanišić * iter = iter >> 56 162*563efdd3SStrahinja Stanišić */ 163*563efdd3SStrahinja Stanišić li t1, 0x10203000 164*563efdd3SStrahinja Stanišić neg t0, a2 165*563efdd3SStrahinja Stanišić slli t1, t1, 4 166*563efdd3SStrahinja Stanišić and a2, a2, t0 167*563efdd3SStrahinja Stanišić addi t1, t1, 0x405 168*563efdd3SStrahinja Stanišić srli a2, a2, 7 169*563efdd3SStrahinja Stanišić slli t1, t1, 16 170*563efdd3SStrahinja Stanišić addi a0, a0, -8 171*563efdd3SStrahinja Stanišić addi t1, t1, 0x607 172*563efdd3SStrahinja Stanišić mul a2, a2, t1 173*563efdd3SStrahinja Stanišić srli a2, a2, 56 174*563efdd3SStrahinja Stanišić 175*563efdd3SStrahinja Stanišić /* left = end - ptr */ 176*563efdd3SStrahinja Stanišić sub t0, a3, a0 177*563efdd3SStrahinja Stanišić 178*563efdd3SStrahinja Stanišić /* return iter < left ? ptr + iter : NULL */ 179*563efdd3SStrahinja Stanišić sltu t1, a2, t0 180*563efdd3SStrahinja Stanišić neg t1, t1 181*563efdd3SStrahinja Stanišić add a0, a0, a2 182*563efdd3SStrahinja Stanišić and a0, a0, t1 183*563efdd3SStrahinja Stanišić ret 184*563efdd3SStrahinja Stanišić 185*563efdd3SStrahinja Stanišić.Lno_match: 186*563efdd3SStrahinja Stanišić li a0, 0 187*563efdd3SStrahinja Stanišić ret 188*563efdd3SStrahinja StanišićEND(memchr) 189