xref: /freebsd/contrib/arm-optimized-routines/string/aarch64/memrchr.S (revision 924226fba12cc9a228c73b956e1b7fa24c60b055)
1/*
2 * memrchr - find the last occurrence of a character in a memory area.
3 *
4 * Copyright (c) 2020, Arm Limited.
5 * SPDX-License-Identifier: MIT
6 */
7
8/* Assumptions:
9 *
10 * ARMv8-a, AArch64, Advanced SIMD.
11 * MTE compatible.
12 */
13
14#include "../asmdefs.h"
15
/* Integer argument and return registers.  srcin (start of the buffer) and
   result (the returned pointer) both name x0; chrin is the character to find
   and cntin is the byte count. */
16#define srcin		x0
17#define chrin		w1
18#define cntin		x2
19#define result		x0
20
/* Integer working registers.
   src     address of the 16-byte chunk currently being examined
   cntrem  bytes still to be examined below src
   synd    64-bit syndrome moved out of the vector unit
   shift   left-shift amount applied to the first chunk's syndrome
   tmp     scratch; wtmp is the 32-bit name of the same register
   end     srcin + cntin
   endm1   end - 1  */
21#define src		x3
22#define cntrem		x4
23#define synd		x5
24#define shift		x6
25#define	tmp		x7
26#define wtmp		w7
27#define end		x8
28#define endm1		x9
29
/* Vector registers.
   vrepchr   chrin copied into all 16 byte lanes
   vdata     the loaded chunk; qdata is the 128-bit name of the same register
   vhas_chr  per-byte result of comparing vdata with vrepchr
   vrepmask  0xf00f in every 16-bit lane
   vend      pairwise-reduced compare result; dend is its low 64 bits  */
30#define vrepchr		v0
31#define qdata		q1
32#define vdata		v1
33#define vhas_chr	v2
34#define vrepmask	v3
35#define vend		v4
36#define dend		d4
37
38/*
39   Core algorithm:
40
41   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
42   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
43   requested character. Bits 4-7 must be zero. Bits 4-7 are set likewise for
44   odd bytes so that adjacent bytes can be merged. Since the bits in the
45   syndrome reflect the order in which things occur in the original buffer,
46   counting leading zeros identifies exactly which byte matched last.  */
47
/* __memrchr_aarch64

   Scan the cntin bytes that start at srcin, from the highest address down,
   for a byte equal to the low 8 bits of chrin.

   In:   x0 (srcin) = start of the buffer
         w1 (chrin) = character to find
         x2 (cntin) = number of bytes to scan
   Out:  x0 (result) = address of the last matching byte, or 0 when no byte
         matches or cntin is 0.
   Writes x3-x9, v0-v4 and the condition flags.  The stack is not used.

   All loads are 16 bytes wide from 16-byte aligned addresses, so a load may
   cover bytes outside [srcin, srcin + cntin); matches found there are
   discarded before returning.

   ENTRY, END, L and PTR_ARG are macros from asmdefs.h.
   NOTE(review): PTR_ARG (0) presumably normalises the pointer in x0 for
   ILP32 builds - confirm against asmdefs.h.  */
48ENTRY (__memrchr_aarch64)
49	PTR_ARG (0)
	/* end = one past the last byte, endm1 = the last byte, and src = endm1
	   rounded down to a 16-byte boundary.  */
50	add	end, srcin, cntin
51	sub	endm1, end, 1
52	bic	src, endm1, 15
	/* A zero count returns before any memory is read.  */
53	cbz	cntin, L(nomatch)
54	ld1	{vdata.16b}, [src]
55	dup	vrepchr.16b, chrin
	/* Viewed as bytes, vrepmask alternates 0x0f (even lanes) and 0xf0
	   (odd lanes).  */
56	mov	wtmp, 0xf00f
57	dup	vrepmask.8h, wtmp
58	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	/* shift = -(end * 4).  The lsl below uses it modulo 64, which moves the
	   nibble for the byte at endm1 to the top of synd and drops the nibbles
	   of any bytes at or beyond end.  */
59	neg	shift, end, lsl 2
60	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
61	addp	vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
62	fmov	synd, dend
63	lsl	synd, synd, shift
	/* No match among the in-range bytes of the last chunk: keep going.  */
64	cbz	synd, L(start_loop)
65
	/* clz / 4 is the distance back from endm1 to the last match.  If that
	   distance is not below cntin the match lies before srcin, so 0 is
	   returned instead.  */
66	clz	synd, synd
67	sub	result, endm1, synd, lsr 2
68	cmp	cntin, synd, lsr 2
69	csel	result, result, xzr, hi
70	ret
71
72L(start_loop):
	/* tmp = bytes of the buffer that were in the first chunk; cntrem = bytes
	   left below src.  Nothing left means no match.  */
73	sub	tmp, end, src
74	subs	cntrem, cntin, tmp
75	b.ls	L(nomatch)
76
77	/* Make sure that it won't overread by a 16-byte chunk */
	/* Bit 4 of cntrem + 15 is the low bit of the number of 16-byte chunks
	   left.  When that number is odd, enter at the second half of the
	   two-chunk loop so the loop ends on the lowest chunk.  */
78	add	tmp, cntrem, 15
79	tbnz	tmp, 4, L(loop32_2)
80
81	.p2align 4
82L(loop32):
	/* First half: step src down 16 bytes and load that chunk.  The umaxp
	   result is non-zero if any of the 16 bytes matched.  */
83	ldr	qdata, [src, -16]!
84	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
85	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
86	fmov	synd, dend
87	cbnz	synd, L(end)
88
89L(loop32_2):
	/* Second half: same as above, and account for both chunks of the
	   iteration.  When cntrem runs out this was the lowest chunk, so go to
	   L(end) whether or not it holds a match.  */
90	ldr	qdata, [src, -16]!
91	subs	cntrem, cntrem, 32
92	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
93	b.ls	L(end)
94	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
95	fmov	synd, dend
96	cbz	synd, L(loop32)
97L(end):
	/* Build the 4-bits-per-byte syndrome for the chunk at src.  */
98	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
99	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
100	fmov	synd, dend
101
	/* tmp = address of the highest byte in this chunk.  */
102	add	tmp, src, 15
	/* On big-endian builds the syndrome bits are reversed before counting.
	   NOTE(review): presumably because ldr of a q register places bytes in
	   the opposite lane order there - confirm.  */
103#ifdef __AARCH64EB__
104	rbit	synd, synd
105#endif
	/* tmp = address of the last match in the chunk.  With no match clz gives
	   64 and tmp becomes src - 1.  A candidate below srcin returns 0.  */
106	clz	synd, synd
107	sub	tmp, tmp, synd, lsr 2
108	cmp	tmp, srcin
109	csel	result, tmp, xzr, hs
110	ret
111
112L(nomatch):
	/* Reached when cntin is 0, or when the whole buffer was inside the first
	   chunk and held no match.  */
113	mov	result, 0
114	ret
115
116END (__memrchr_aarch64)
117
118