/*
 * memrchr - find last character in a memory zone.
 *
 * Copyright (c) 2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "../asmdefs.h"

#define srcin		x0
#define chrin		w1
#define cntin		x2
#define result		x0

#define src		x3
#define cntrem		x4
#define synd		x5
#define shift		x6
#define	tmp		x7
#define wtmp		w7
#define end		x8
#define endm1		x9

#define vrepchr		v0
#define qdata		q1
#define vdata		v1
#define vhas_chr	v2
#define vrepmask	v3
#define vend		v4
#define dend		d4

/*
   Core algorithm:

   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
   requested character; bits 4-7 must be zero. Bits 4-7 are set likewise for
   odd bytes so that adjacent bytes can be merged. Since the bits in the
   syndrome reflect the order in which bytes occur in the original string,
   counting leading zeros identifies exactly the last byte that matched.  */

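/* As a rough illustration, and only a sketch (little-endian case; the helper
   name chunk_syndrome is invented here and is not part of the build), the
   per-chunk syndrome computed by the vector code below is equivalent to:

	static inline uint64_t
	chunk_syndrome (const unsigned char *chunk, unsigned char c)
	{
		uint64_t synd = 0;
		for (int i = 0; i < 16; i++)
			if (chunk[i] == c)
				synd |= (uint64_t) 0xf << (4 * i);
		return synd;
	}

   With four syndrome bits per byte, the highest set bit of a non-zero
   syndrome belongs to the last matching byte, so its index within the
   chunk is 15 - (clz (synd) >> 2).  */
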
ENTRY (__memrchr_aarch64)
	PTR_ARG (0)
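	/* end points one past the buffer; src is the 16-byte aligned chunk
	   that contains the last byte.  */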
	add	end, srcin, cntin
	sub	endm1, end, 1
	bic	src, endm1, 15
	cbz	cntin, L(nomatch)
	ld1	{vdata.16b}, [src]
	dup	vrepchr.16b, chrin
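	/* 0xf00f per halfword keeps four syndrome bits per byte: the low
	   nibble marks even bytes and the high nibble odd bytes, so addp can
	   merge each adjacent pair into a single byte.  */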
	mov	wtmp, 0xf00f
	dup	vrepmask.8h, wtmp
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
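	/* Shifting the syndrome left by -(4 * end) mod 64 discards the bits
	   of bytes at or beyond end and moves the last valid byte's bits to
	   the top, so clz counts back from the last byte.  */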
	neg	shift, end, lsl 2
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
	fmov	synd, dend
	lsl	synd, synd, shift
	cbz	synd, L(start_loop)

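	/* Match in the first chunk: clz/4 is the distance back from the last
	   byte.  Return NULL if that distance falls outside the cntin bytes
	   actually requested.  */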
	clz	synd, synd
	sub	result, endm1, synd, lsr 2
	cmp	cntin, synd, lsr 2
	csel	result, result, xzr, hi
	ret

L(start_loop):
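	/* Count the bytes remaining below the chunk just examined; if there
	   are none left, the character was not found.  */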
	sub	tmp, end, src
	subs	cntrem, cntin, tmp
	b.ls	L(nomatch)

	/* If an odd number of 16-byte chunks remains, enter the unrolled
	   loop at its second half so that it cannot over-read a chunk.  */
	add	tmp, cntrem, 15
	tbnz	tmp, 4, L(loop32_2)

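	/* Main loop: walk backwards two 16-byte chunks per iteration.  Both
	   halves use umaxp only as a cheap any-match test; the ordered
	   syndrome is rebuilt at L(end).  */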
	.p2align 4
L(loop32):
	ldr	qdata, [src, -16]!
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbnz	synd, L(end)

L(loop32_2):
	ldr	qdata, [src, -16]!
	subs	cntrem, cntrem, 32
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	b.ls	L(end)
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbz	synd, L(loop32)
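	/* The loop used umaxp only to test for any match; now mask the
	   compare result and pairwise-add it into the ordered syndrome so
	   clz can locate the exact byte.  */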
L(end):
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend

	add	tmp, src, 15
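	/* On big-endian the syndrome's nibbles appear in reversed byte
	   order; rbit restores the little-endian layout assumed below.  */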
#ifdef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	synd, synd
	sub	tmp, tmp, synd, lsr 2
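	/* tmp now points at the last match in the chunk, but it may lie
	   below srcin when the chunk containing srcin is only partly inside
	   the buffer; return NULL in that case.  */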
	cmp	tmp, srcin
	csel	result, tmp, xzr, hs
	ret

L(nomatch):
	mov	result, 0
	ret

END (__memrchr_aarch64)