/*
 * memchr - find a character in a memory zone
 *
 * Copyright (c) 2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "../asmdefs.h"

#define srcin		x0
#define chrin		w1
#define cntin		x2
#define result		x0

#define src		x3
#define cntrem		x4
#define synd		x5
#define shift		x6
#define tmp		x7
#define wtmp		w7

#define vrepchr		v0
#define qdata		q1
#define vdata		v1
#define vhas_chr	v2
#define vrepmask	v3
#define vend		v4
#define dend		d4

/*
   Core algorithm:

   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
   requested character; bits 4-7 must be zero. Bits 4-7 are set likewise for
   odd bytes so that adjacent bytes can be merged. Since the bits in the
   syndrome reflect the order in which things occur in the original buffer,
   counting trailing zeros identifies exactly which byte matched.  */

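/* Illustration only, not assembled into the build: a minimal C model of the
   syndrome technique described above, assuming a little-endian target.  Byte
   i of a chunk maps to bits [4*i, 4*i+3] of the syndrome, so counting
   trailing zeros and dividing by four recovers the index of the first match.
   The helper names below are hypothetical and exist only for this sketch.

	#include <stdint.h>

	// Compute the 64-bit syndrome for one 16-byte chunk: 4 bits per
	// byte, set when that byte equals the searched-for character.
	static uint64_t
	chunk_syndrome (const uint8_t chunk[16], uint8_t c)
	{
		uint64_t synd = 0;
		for (int i = 0; i < 16; i++)
			if (chunk[i] == c)
				synd |= (uint64_t) 0xf << (4 * i);
		return synd;
	}

	// Index of the first matching byte; synd must be nonzero.
	static int
	first_match_index (uint64_t synd)
	{
		return __builtin_ctzll (synd) >> 2;
	}
 */
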
ENTRY (__memchr_aarch64_mte)
	PTR_ARG (0)
	SIZE_ARG (2)
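	/* Align the first load down to 16 bytes; the bytes before srcin are
	   shifted out of the syndrome below.  Each byte contributes four
	   syndrome bits, hence the "lsl shift, srcin, 2" scaling of the
	   misalignment.  */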
	bic	src, srcin, 15
	cbz	cntin, L(nomatch)
	ld1	{vdata.16b}, [src]
	dup	vrepchr.16b, chrin
	mov	wtmp, 0xf00f
	dup	vrepmask.8h, wtmp
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	lsl	shift, srcin, 2
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	lsr	synd, synd, shift
	cbz	synd, L(start_loop)

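	/* Match in the first chunk: rbit+clz locates the first set syndrome
	   bit, and dividing by four converts it to a byte offset.  The csel
	   returns NULL when the match lies at or beyond srcin + cntin.  */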
	rbit	synd, synd
	clz	synd, synd
	add	result, srcin, synd, lsr 2
	cmp	cntin, synd, lsr 2
	csel	result, result, xzr, hi
	ret

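	/* No match in the first chunk: enter the main loop unless the bytes
	   covered so far already exhaust cntin.  */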
L(start_loop):
	sub	tmp, src, srcin
	add	tmp, tmp, 16
	subs	cntrem, cntin, tmp
	b.ls	L(nomatch)

	/* Make sure the loop below won't over-read by a 16-byte chunk.  */
	add	tmp, cntrem, 15
	tbnz	tmp, 4, L(loop32_2)

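	/* Main loop: two 16-byte chunks per iteration.  UMAXP gives a cheap
	   nonzero test per chunk; the precise syndrome is only rebuilt at
	   L(end).  */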
	.p2align 4
L(loop32):
	ldr	qdata, [src, 16]!
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbnz	synd, L(end)

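	/* Second half of the unrolled loop: also decrements the remaining
	   count, falling through to L(end) once no more than 32 bytes are
	   left.  */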
L(loop32_2):
	ldr	qdata, [src, 16]!
	subs	cntrem, cntrem, 32
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	b.ls	L(end)
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbz	synd, L(loop32)
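	/* Tail: rebuild the precise 4-bit-per-byte syndrome for the current
	   chunk and bound the result by the end of the buffer, returning
	   NULL for any match at or beyond srcin + cntin.  */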
L(end):
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	add	tmp, srcin, cntin
	sub	cntrem, tmp, src
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	synd, synd
	cmp	cntrem, synd, lsr 2
	add	result, src, synd, lsr 2
	csel	result, result, xzr, hi
	ret

L(nomatch):
	mov	result, 0
	ret

END (__memchr_aarch64_mte)
