xref: /freebsd/lib/libc/riscv/string/memchr.S (revision 563efdd3bd5d5f94e356444bb64fd66e13dda5e1)
1*563efdd3SStrahinja Stanišić/*-
2*563efdd3SStrahinja Stanišić * SPDX-License-Identifier: BSD-2-Clause
3*563efdd3SStrahinja Stanišić *
4*563efdd3SStrahinja Stanišić * Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org>
5*563efdd3SStrahinja Stanišić */
6*563efdd3SStrahinja Stanišić
7*563efdd3SStrahinja Stanišić#include <machine/asm.h>
8*563efdd3SStrahinja Stanišić
9*563efdd3SStrahinja Stanišić/*
10*563efdd3SStrahinja Stanišić * a0 - const void *b
11*563efdd3SStrahinja Stanišić * a1 - int c
12*563efdd3SStrahinja Stanišić * a2 - size_t len
13*563efdd3SStrahinja Stanišić */
14*563efdd3SStrahinja StanišićENTRY(memchr)
15*563efdd3SStrahinja Stanišić	/*
16*563efdd3SStrahinja Stanišić	 * a0 - const char *ptr
17*563efdd3SStrahinja Stanišić	 * a1 - char cccccccc[8]
18*563efdd3SStrahinja Stanišić	 * a2 - char iter[8]
19*563efdd3SStrahinja Stanišić	 * a3 - uint8_t *end
20*563efdd3SStrahinja Stanišić	 * a4 - uint64_t *end_align
21*563efdd3SStrahinja Stanišić	 * a5 - uint64_t *end_unroll
22*563efdd3SStrahinja Stanišić	 */
23*563efdd3SStrahinja Stanišić
24*563efdd3SStrahinja Stanišić	beqz a2, .Lno_match
25*563efdd3SStrahinja Stanišić
26*563efdd3SStrahinja Stanišić	/* c = (uint8_t) c */
27*563efdd3SStrahinja Stanišić	andi a1, a1, 0xFF
28*563efdd3SStrahinja Stanišić
29*563efdd3SStrahinja Stanišić	/*
30*563efdd3SStrahinja Stanišić	 * t0 = 0x0101010101010101
31*563efdd3SStrahinja Stanišić	 * t1 = 0x8080808080808080
32*563efdd3SStrahinja Stanišić	 * t2 = b << 3
33*563efdd3SStrahinja Stanišić	 * cccccccc = (uint8_t)c * t0
34*563efdd3SStrahinja Stanišić	 * end = b + len;
35*563efdd3SStrahinja Stanišić	 * ptr = b & ~0b111
36*563efdd3SStrahinja Stanišić	 */
37*563efdd3SStrahinja Stanišić	add a3, a0, a2
38*563efdd3SStrahinja Stanišić	li t0, 0x01010101
39*563efdd3SStrahinja Stanišić	sltu t2, a0, a3
40*563efdd3SStrahinja Stanišić	slli t1, t0, 32
41*563efdd3SStrahinja Stanišić	neg t2, t2
42*563efdd3SStrahinja Stanišić	or t0, t0, t1
43*563efdd3SStrahinja Stanišić	and a3, a3, t2
44*563efdd3SStrahinja Stanišić	slli t1, t0, 7
45*563efdd3SStrahinja Stanišić	slli t2, a0, 3
46*563efdd3SStrahinja Stanišić	and a0, a0, ~0b111
47*563efdd3SStrahinja Stanišić	mul a1, t0, a1
48*563efdd3SStrahinja Stanišić
49*563efdd3SStrahinja Stanišić	ld a2, (a0)
50*563efdd3SStrahinja Stanišić
51*563efdd3SStrahinja Stanišić	/*
52*563efdd3SStrahinja Stanišić	 * mask_start = REP8_0x01 ^ (REP8_0x01 << t2)
53*563efdd3SStrahinja Stanišić	 * iter = iter ^ cccccccc
54*563efdd3SStrahinja Stanišić	 * iter = iter | mask_start
55*563efdd3SStrahinja Stanišić	 */
56*563efdd3SStrahinja Stanišić	sll t2, t0, t2
57*563efdd3SStrahinja Stanišić	xor a2, a2, a1
58*563efdd3SStrahinja Stanišić	xor t2, t2, t0
59*563efdd3SStrahinja Stanišić	or a2, a2, t2
60*563efdd3SStrahinja Stanišić
61*563efdd3SStrahinja Stanišić	/* has_zero(iter)
62*563efdd3SStrahinja Stanišić	 * end_align = (end + 7) & ~0b111;
63*563efdd3SStrahinja Stanišić	 */
64*563efdd3SStrahinja Stanišić	addi a4, a3, 7
65*563efdd3SStrahinja Stanišić	not t2, a2
66*563efdd3SStrahinja Stanišić	sub a2, a2, t0
67*563efdd3SStrahinja Stanišić	and t2, t2, t1
68*563efdd3SStrahinja Stanišić	andi a4, a4, ~0b111
69*563efdd3SStrahinja Stanišić	and a2, a2, t2
70*563efdd3SStrahinja Stanišić
71*563efdd3SStrahinja Stanišić	/* ptr = ptr + 8 */
72*563efdd3SStrahinja Stanišić	addi a0, a0, 8
73*563efdd3SStrahinja Stanišić
74*563efdd3SStrahinja Stanišić	bnez a2, .Lfind_zero
75*563efdd3SStrahinja Stanišić
76*563efdd3SStrahinja Stanišić	/* if(ptr == end_align) */
77*563efdd3SStrahinja Stanišić	beq a0, a4, .Lno_match
78*563efdd3SStrahinja Stanišić
79*563efdd3SStrahinja Stanišić	/* end_unroll = end_align & ~0b1111 */
80*563efdd3SStrahinja Stanišić	andi a5, a4, ~0b1111
81*563efdd3SStrahinja Stanišić
82*563efdd3SStrahinja Stanišić	/*
83*563efdd3SStrahinja Stanišić	 * Instead of branching to check if `ptr` is 16-byte aligned:
84*563efdd3SStrahinja Stanišić	 *   - Probe the next 8 bytes for `c`
85*563efdd3SStrahinja Stanišić	 *   - Align `ptr` down to the nearest 16-byte boundary
86*563efdd3SStrahinja Stanišić	 *
87*563efdd3SStrahinja Stanišić	 * If `ptr` was already 16-byte aligned, those 8 bytes will be
88*563efdd3SStrahinja Stanišić	 * checked again inside the unrolled loop.
89*563efdd3SStrahinja Stanišić	 *
90*563efdd3SStrahinja Stanišić	 * This removes an unpredictable branch and improves performance.
91*563efdd3SStrahinja Stanišić	 */
92*563efdd3SStrahinja Stanišić
93*563efdd3SStrahinja Stanišić	ld a2, (a0)
94*563efdd3SStrahinja Stanišić	xor a2, a2, a1
95*563efdd3SStrahinja Stanišić
96*563efdd3SStrahinja Stanišić	not t2, a2
97*563efdd3SStrahinja Stanišić	sub a2, a2, t0
98*563efdd3SStrahinja Stanišić	and t2, t2, t1
99*563efdd3SStrahinja Stanišić	and a2, a2, t2
100*563efdd3SStrahinja Stanišić
101*563efdd3SStrahinja Stanišić	addi a0, a0, 8
102*563efdd3SStrahinja Stanišić
103*563efdd3SStrahinja Stanišić	bnez a2, .Lfind_zero
104*563efdd3SStrahinja Stanišić
105*563efdd3SStrahinja Stanišić	andi a0, a0, ~0b1111
106*563efdd3SStrahinja Stanišić
107*563efdd3SStrahinja Stanišić	/* while(ptr != end_unroll) */
108*563efdd3SStrahinja Stanišić	beq a0, a5, .Lskip_loop
109*563efdd3SStrahinja Stanišić.Lloop:
110*563efdd3SStrahinja Stanišić	ld a2, (a0)
111*563efdd3SStrahinja Stanišić	ld t3, 8(a0)
112*563efdd3SStrahinja Stanišić
113*563efdd3SStrahinja Stanišić	xor a2, a2, a1
114*563efdd3SStrahinja Stanišić	xor t3, t3, a1
115*563efdd3SStrahinja Stanišić
116*563efdd3SStrahinja Stanišić	not t2, a2
117*563efdd3SStrahinja Stanišić	not t4, t3
118*563efdd3SStrahinja Stanišić	sub a2, a2, t0
119*563efdd3SStrahinja Stanišić	sub t3, t3, t0
120*563efdd3SStrahinja Stanišić	and t2, t2, t1
121*563efdd3SStrahinja Stanišić	and t4, t4, t1
122*563efdd3SStrahinja Stanišić	and a2, a2, t2
123*563efdd3SStrahinja Stanišić	and t3, t3, t4
124*563efdd3SStrahinja Stanišić
125*563efdd3SStrahinja Stanišić	addi a0, a0, 8
126*563efdd3SStrahinja Stanišić
127*563efdd3SStrahinja Stanišić	bnez a2, .Lfind_zero
128*563efdd3SStrahinja Stanišić
129*563efdd3SStrahinja Stanišić	/* move into iter for find_zero */
130*563efdd3SStrahinja Stanišić	mv a2, t3
131*563efdd3SStrahinja Stanišić
132*563efdd3SStrahinja Stanišić	addi a0, a0, 8
133*563efdd3SStrahinja Stanišić
134*563efdd3SStrahinja Stanišić	bnez a2, .Lfind_zero
135*563efdd3SStrahinja Stanišić
136*563efdd3SStrahinja Stanišić	bne a0, a5, .Lloop
137*563efdd3SStrahinja Stanišić.Lskip_loop:
138*563efdd3SStrahinja Stanišić
139*563efdd3SStrahinja Stanišić	/* there might be one 8byte left */
140*563efdd3SStrahinja Stanišić	beq a0, a4, .Lno_match
141*563efdd3SStrahinja Stanišić
142*563efdd3SStrahinja Stanišić	ld a2, (a0)
143*563efdd3SStrahinja Stanišić	xor a2, a2, a1
144*563efdd3SStrahinja Stanišić
145*563efdd3SStrahinja Stanišić	not t2, a2
146*563efdd3SStrahinja Stanišić	sub a2, a2, t0
147*563efdd3SStrahinja Stanišić	and t2, t2, t1
148*563efdd3SStrahinja Stanišić	and a2, a2, t2
149*563efdd3SStrahinja Stanišić
150*563efdd3SStrahinja Stanišić	addi a0, a0, 8
151*563efdd3SStrahinja Stanišić
152*563efdd3SStrahinja Stanišić	beqz a2, .Lno_match
153*563efdd3SStrahinja Stanišić
154*563efdd3SStrahinja Stanišić.Lfind_zero:
155*563efdd3SStrahinja Stanišić	/*
156*563efdd3SStrahinja Stanišić	 * ptr = ptr - 8
157*563efdd3SStrahinja Stanišić	 * t1 = 0x0001020304050607
158*563efdd3SStrahinja Stanišić	 * iter = iter & (-iter)
159*563efdd3SStrahinja Stanišić	 * iter = iter >> 7
160*563efdd3SStrahinja Stanišić	 * iter = iter * t1
161*563efdd3SStrahinja Stanišić	 * iter = iter >> 56
162*563efdd3SStrahinja Stanišić	 */
163*563efdd3SStrahinja Stanišić	li t1, 0x10203000
164*563efdd3SStrahinja Stanišić	neg t0, a2
165*563efdd3SStrahinja Stanišić	slli t1, t1, 4
166*563efdd3SStrahinja Stanišić	and a2, a2, t0
167*563efdd3SStrahinja Stanišić	addi t1, t1, 0x405
168*563efdd3SStrahinja Stanišić	srli a2, a2, 7
169*563efdd3SStrahinja Stanišić	slli t1, t1, 16
170*563efdd3SStrahinja Stanišić	addi a0, a0, -8
171*563efdd3SStrahinja Stanišić	addi t1, t1, 0x607
172*563efdd3SStrahinja Stanišić	mul a2, a2, t1
173*563efdd3SStrahinja Stanišić	srli a2, a2, 56
174*563efdd3SStrahinja Stanišić
175*563efdd3SStrahinja Stanišić	/* left = end - ptr */
176*563efdd3SStrahinja Stanišić	sub t0, a3, a0
177*563efdd3SStrahinja Stanišić
178*563efdd3SStrahinja Stanišić	/* return iter < left ? ptr + iter : NULL */
179*563efdd3SStrahinja Stanišić	sltu t1, a2, t0
180*563efdd3SStrahinja Stanišić	neg t1, t1
181*563efdd3SStrahinja Stanišić	add a0, a0, a2
182*563efdd3SStrahinja Stanišić	and a0, a0, t1
183*563efdd3SStrahinja Stanišić	ret
184*563efdd3SStrahinja Stanišić
185*563efdd3SStrahinja Stanišić.Lno_match:
186*563efdd3SStrahinja Stanišić	li a0, 0
187*563efdd3SStrahinja Stanišić	ret
188*563efdd3SStrahinja StanišićEND(memchr)
189