/* xref: /freebsd/contrib/arm-optimized-routines/string/aarch64/strnlen.S (revision e6bfd18d21b225af6a0ed67ceeaf1293b7b9eba5) */
/*
 * strnlen - calculate the length of a string with limit.
 *
 * Copyright (c) 2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "../asmdefs.h"

/* Register aliases used by __strnlen_aarch64.  */

/* Arguments and return value.  result shares x0 with srcin, so srcin is
   dead once result has been written.  */
#define srcin		x0
#define cntin		x1
#define result		x0

/* Integer scratch.  src is the read cursor (post-incremented by 16 on every
   load) and synd holds the 64-bit syndrome of the current chunk.  cntrem is
   the number of limit bytes left after the first chunk.  */
#define src		x2
#define synd		x3
#define cntrem		x5

/* shift, wtmp and tmp are three names for the same register (x4/w4); each
   value is consumed before the next name is written.  */
#define shift		x4
#define wtmp		w4
#define tmp		x4

/* Vector scratch.  qdata is the 128-bit scalar view of vdata (used by ldr)
   and dend is the low 64 bits of vend (used by fmov).  */
#define qdata		q0
#define vdata		v0
#define vhas_chr	v1
#define vrepmask	v2
#define vend		v3
#define dend		d3

/*
   Core algorithm:

   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
   per byte. For even bytes, bits 0-3 are set if the byte is NUL and bits 4-7
   must be zero. For odd bytes, bits 4-7 are set likewise and bits 0-3 must be
   zero, so that adjacent bytes can be merged. Since the bits in the syndrome
   reflect the order in which things occur in the original string, counting
   trailing zeros identifies exactly which byte is NUL.  */

/* __strnlen_aarch64 - length of a string, bounded by a byte limit.

   In:   srcin (x0) = start of the string
	 cntin (x1) = maximum number of bytes to examine
   Out:  result (x0) = offset of the first NUL byte from srcin, or cntin
	 when no NUL lies below that limit.
   Uses: x2-x5, v0-v3 and the condition flags.  No stack access.

   Memory is only ever read in 16-byte chunks at 16-byte aligned addresses,
   and the number of chunks read is bounded by the limit.  */
ENTRY (__strnlen_aarch64)
	/* Argument fix-up macros from asmdefs.h (definitions not visible
	   in this file).  */
	PTR_ARG (0)
	SIZE_ARG (1)

	/* First chunk: the aligned 16-byte block that contains srcin.  */
	bic	src, srcin, 15
	mov	wtmp, 0xf00f
	cbz	cntin, L(nomatch)	/* limit of 0: return 0 without loading */
	ld1	{vdata.16b}, [src], 16
	/* Per halfword 0xf00f: low nibble kept for even bytes, high nibble for
	   odd bytes, so a pairwise add packs two bytes into one.  */
	dup	vrepmask.8h, wtmp
	cmeq	vhas_chr.16b, vdata.16b, 0
	/* Four syndrome bits per byte; the register shift below only uses the
	   amount modulo 64, i.e. (srcin & 15) * 4.  */
	lsl	shift, srcin, 2
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	lsr	synd, synd, shift	/* drop the bytes that precede srcin */
	cbz	synd, L(start_loop)
L(finish):
	/* NUL found in the first chunk: trailing-zero count / 4 is its offset
	   from srcin.  Return min (offset, cntin).  */
	rbit	synd, synd
	clz	synd, synd
	lsr	result, synd, 2
	cmp	cntin, result
	csel	result, cntin, result, ls
	ret

L(start_loop):
	/* tmp = number of string bytes covered by the first chunk.  If the
	   limit does not reach past them, the answer is cntin.  */
	sub	tmp, src, srcin
	subs	cntrem, cntin, tmp
	b.ls	L(nomatch)

	/* Make sure that it won't overread by a 16-byte chunk: the loop reads
	   two chunks per iteration, so when an odd number of chunks is needed
	   to cover cntrem, enter at the second load.  */
	add	tmp, cntrem, 15
	tbnz	tmp, 4, L(loop32_2)

	.p2align 5
L(loop32):
	ldr	qdata, [src], 16
	cmeq	vhas_chr.16b, vdata.16b, 0
	/* umaxp is only an "any NUL?" test; the exact position is worked out
	   in L(end) from vhas_chr.  */
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbnz	synd, L(end)
L(loop32_2):
	ldr	qdata, [src], 16
	subs	cntrem, cntrem, 32
	cmeq	vhas_chr.16b, vdata.16b, 0
	/* Flags are still those of the subs: stop once the limit is used up,
	   whether or not this chunk holds a NUL.  */
	b.ls	L(end)
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbz	synd, L(loop32)

L(end):
	/* vhas_chr is the byte-wise NUL mask of the last chunk loaded and src
	   points 16 bytes past that chunk.  Build the 4-bit-per-byte syndrome
	   and convert it into an offset from srcin.  */
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	sub	src, src, 16
	mov	synd, vend.d[0]
	sub	result, src, srcin
	/* Big-endian builds skip the bit reversal and count leading zeros
	   directly.  */
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	synd, synd
	/* An all-zero syndrome gives clz = 64, i.e. 16 bytes, which places the
	   result at the end of the chunk; the clamp below then yields cntin.  */
	add	result, result, synd, lsr 2
	cmp	cntin, result
	csel	result, cntin, result, ls
	ret

L(nomatch):
	/* No NUL within the limit.  */
	mov	result, cntin
	ret

END (__strnlen_aarch64)