/*
 * memrchr - find the last occurrence of a character in a memory region.
 *
 * Copyright (c) 2020-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "asmdefs.h"

#define srcin		x0
#define chrin		w1
#define cntin		x2
#define result		x0

#define src		x3
#define cntrem		x4
#define synd		x5
#define shift		x6
#define tmp		x7
#define end		x8
#define endm1		x9

#define vrepchr		v0
#define qdata		q1
#define vdata		v1
#define vhas_chr	v2
#define vend		v3
#define dend		d3

/*
   Core algorithm:
   For each 16-byte chunk we calculate a 64-bit nibble mask value with four
   bits per byte. We take 4 bits of every comparison byte with a shift right
   and narrow by 4 (shrn) instruction. Since the bits in the nibble mask
   reflect the order in which bytes occur in the original string, counting
   leading zeros identifies exactly which byte matched.  */
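
/* As an illustration (not part of the original source): the per-chunk
   syndrome could be written with NEON intrinsics roughly as

	uint8x16_t eq   = vceqq_u8 (data, vdupq_n_u8 (c));
	uint64_t   synd = vget_lane_u64 (vreinterpret_u64_u8 (
			      vshrn_n_u16 (vreinterpretq_u16_u8 (eq), 4)), 0);

   so that nibble j of synd is 0xf iff byte j of the chunk equals c
   (assuming little-endian byte order).  For example, if only byte 13 of a
   chunk matches, synd is 0x00f0000000000000, clz gives 8, and 8 >> 2 = 2
   bytes back from the last byte of the chunk (byte 15 - 2 = 13).  */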

ENTRY (__memrchr_aarch64)
	PTR_ARG (0)
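	/* end is one past the last byte of the buffer, endm1 is the last
	   byte itself, and src is the 16-byte aligned chunk containing that
	   byte.  A zero length can never match.  */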
	add	end, srcin, cntin
	sub	endm1, end, 1
	bic	src, endm1, 15
	cbz	cntin, L(nomatch)
	ld1	{vdata.16b}, [src]
	dup	vrepchr.16b, chrin
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
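	/* Build the nibble-per-byte syndrome for this chunk, then shift out
	   the nibbles of any bytes at or beyond end so that matches past the
	   end of the buffer are ignored.  */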
	neg	shift, end, lsl 2
	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
	fmov	synd, dend
	lsl	synd, synd, shift
	cbz	synd, L(start_loop)

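	/* Match in the first chunk: clz of the syndrome is 4 times the
	   distance from the last byte of the buffer back to the match.
	   Matches that fall before srcin return NULL.  */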
	clz	synd, synd
	sub	result, endm1, synd, lsr 2
	cmp	cntin, synd, lsr 2
	csel	result, result, xzr, hi
	ret

	nop
L(start_loop):
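	/* No match in the first chunk.  cntrem is the number of bytes left
	   below src; if the aligned chunk already covered the start of the
	   buffer there is nothing more to search.  */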
	subs	cntrem, src, srcin
	b.ls	L(nomatch)

	/* Make sure the loop will not over-read by a whole 16-byte chunk:
	   enter at loop32_2 when an odd number of 16-byte chunks remain,
	   otherwise advance src and start at loop32.  */
	sub	cntrem, cntrem, 1
	tbz	cntrem, 4, L(loop32_2)
	add	src, src, 16

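	/* Main loop: step backwards 32 bytes per iteration, comparing two
	   16-byte chunks.  umaxp folds each 128-bit compare result into 64
	   bits so a single scalar test detects whether any byte matched.  */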
	.p2align 5
L(loop32):
	ldr	qdata, [src, -32]!
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbnz	synd, L(end)

L(loop32_2):
	ldr	qdata, [src, -16]
	subs	cntrem, cntrem, 32
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	b.lo	L(end_2)
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbz	synd, L(loop32)
L(end_2):
	sub	src, src, 16
L(end):
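	/* src now points at the chunk that contained a match (or at the
	   final, possibly partial, chunk).  Rebuild the nibble-per-byte
	   syndrome, use clz to locate the match closest to the end, and
	   return NULL if the resulting address falls before srcin.  */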
	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
	fmov	synd, dend

	add	tmp, src, 15
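	/* On big-endian the syndrome nibbles are in reverse byte order, so
	   bit-reverse them before counting leading zeros.  */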
#ifdef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	synd, synd
	sub	tmp, tmp, synd, lsr 2
	cmp	tmp, srcin
	csel	result, tmp, xzr, hs
	ret

L(nomatch):
	mov	result, 0
	ret

END (__memrchr_aarch64)