/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org>
 */

#include <machine/asm.h>

/*
 * a0 - const void *b
 * a1 - int c
 * a2 - size_t len
 */
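/*
 * Implements void *memchr(const void *b, int c, size_t len): the buffer
 * is scanned one 64-bit word at a time using SWAR (SIMD within a
 * register) bit tricks rather than a byte-at-a-time loop.
 */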
ENTRY(memchr)
	/*
	 * a0 - const char *ptr
	 * a1 - char cccccccc[8]
	 * a2 - char iter[8]
	 * a3 - uint8_t *end
	 * a4 - uint64_t *end_align
	 * a5 - uint64_t *end_unroll
	 */

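	/* len == 0 never matches */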
	beqz a2, .Lno_match

	/* c = (uint8_t) c */
	andi a1, a1, 0xFF

	/*
	 * t0 = 0x0101010101010101
	 * t1 = 0x8080808080808080
	 * t2 = b << 3
	 * cccccccc = (uint8_t)c * t0
	 * end = b + len;
	 * ptr = b & ~0b111
	 */
	add a3, a0, a2
	li t0, 0x01010101
	sltu t2, a0, a3
	slli t1, t0, 32
	neg t2, t2
	or t0, t0, t1
	and a3, a3, t2
	slli t1, t0, 7
	slli t2, a0, 3
	andi a0, a0, ~0b111
	mul a1, t0, a1
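	/*
	 * The sltu/neg/and sequence above computes
	 * end = (b < b + len) ? b + len : 0, clamping end to 0 without a
	 * branch when b + len wraps around the address space.
	 */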

	ld a2, (a0)

	/*
	 * mask_start = REP8_0x01 ^ (REP8_0x01 << t2)
	 * iter = iter ^ cccccccc
	 * iter = iter | mask_start
	 */
	sll t2, t0, t2
	xor a2, a2, a1
	xor t2, t2, t0
	or a2, a2, t2
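	/*
	 * mask_start holds 0x01 in each byte of the word that lies before
	 * b; OR-ing it into iter keeps those out-of-range bytes from
	 * registering as matches (a 0x01 byte never trips has_zero).
	 */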

	/*
	 * has_zero(iter)
	 * end_align = (end + 7) & ~0b111;
	 */
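	/*
	 * has_zero(v) is the usual SWAR test (v - REP8_0x01) & ~v & REP8_0x80,
	 * which is nonzero iff v contains a zero byte. Since iter was XORed
	 * with cccccccc, a zero byte in iter marks a byte equal to c.
	 */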
	addi a4, a3, 7
	not t2, a2
	sub a2, a2, t0
	and t2, t2, t1
	andi a4, a4, ~0b111
	and a2, a2, t2

	/* ptr = ptr + 8 */
	addi a0, a0, 8

	bnez a2, .Lfind_zero

	/* if(ptr == end_align) */
	beq a0, a4, .Lno_match

	/* end_unroll = end_align & ~0b1111 */
	andi a5, a4, ~0b1111
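	/*
	 * The unrolled loop consumes 16 bytes per iteration, so it may only
	 * run while a full 16-byte step stays within end_align; end_unroll
	 * is end_align rounded down to a multiple of 16.
	 */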

	/*
	 * Instead of branching to check if `ptr` is 16-byte aligned:
	 *   - Probe the next 8 bytes for `c`
	 *   - Align `ptr` down to the nearest 16-byte boundary
	 *
	 * If `ptr` was already 16-byte aligned, those 8 bytes will be
	 * checked again inside the unrolled loop.
	 *
	 * This removes an unpredictable branch and improves performance.
	 */

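	/*
	 * In rough C terms (probe_word here is shorthand for the same
	 * XOR + has_zero check used above, not a real helper):
	 *	probe_word(ptr);
	 *	ptr = (ptr + 8) & ~(uintptr_t)15;
	 */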
	ld a2, (a0)
	xor a2, a2, a1

	not t2, a2
	sub a2, a2, t0
	and t2, t2, t1
	and a2, a2, t2

	addi a0, a0, 8

	bnez a2, .Lfind_zero

	andi a0, a0, ~0b1111

	/* while(ptr != end_unroll) */
	beq a0, a5, .Lskip_loop
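	/* main loop: test two 8-byte words (16 bytes) per iteration */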
.Lloop:
	ld a2, (a0)
	ld t3, 8(a0)

	xor a2, a2, a1
	xor t3, t3, a1

	not t2, a2
	not t4, t3
	sub a2, a2, t0
	sub t3, t3, t0
	and t2, t2, t1
	and t4, t4, t1
	and a2, a2, t2
	and t3, t3, t4

	addi a0, a0, 8

	bnez a2, .Lfind_zero

	/* move into iter for find_zero */
	mv a2, t3

	addi a0, a0, 8

	bnez a2, .Lfind_zero

	bne a0, a5, .Lloop
.Lskip_loop:

	/* at most one full 8-byte word may remain */
	beq a0, a4, .Lno_match

	ld a2, (a0)
	xor a2, a2, a1

	not t2, a2
	sub a2, a2, t0
	and t2, t2, t1
	and a2, a2, t2

	addi a0, a0, 8

	beqz a2, .Lno_match

.Lfind_zero:
	/*
	 * ptr = ptr - 8
	 * t1 = 0x0001020304050607
	 * iter = iter & (-iter)
	 * iter = iter >> 7
	 * iter = iter * t1
	 * iter = iter >> 56
	 */
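	/*
	 * iter & -iter isolates the 0x80 bit of the first (lowest-address)
	 * matching byte; >> 7 turns it into 0x01 << (8 * i). Multiplying by
	 * 0x0001020304050607 then lands the byte index i in the top byte,
	 * which the final >> 56 extracts.
	 */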
	li t1, 0x10203000
	neg t0, a2
	slli t1, t1, 4
	and a2, a2, t0
	addi t1, t1, 0x405
	srli a2, a2, 7
	slli t1, t1, 16
	addi a0, a0, -8
	addi t1, t1, 0x607
	mul a2, a2, t1
	srli a2, a2, 56

	/* left = end - ptr */
	sub t0, a3, a0

	/* return iter < left ? ptr + iter : NULL */
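	/*
	 * Branchless select: t1 = -(iter < left) is all-ones on an
	 * in-range match and zero otherwise, so the AND below yields
	 * either ptr + iter or NULL.
	 */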
	sltu t1, a2, t0
	neg t1, t1
	add a0, a0, a2
	and a0, a0, t1
	ret

.Lno_match:
	li a0, 0
	ret
END(memchr)