/* xref: /freebsd/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S (revision 9f23cbd6cae82fd77edfad7173432fa8dccd0a95) */
/*
 * strlen - calculate the length of a string.
 *
 * Copyright (c) 2020-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "asmdefs.h"

/* Register roles.  srcin and result both name x0: the incoming pointer is
   no longer needed by the time the length is written to that register.  */
#define srcin		x0	/* in:  pointer to the start of the string */
#define result		x0	/* out: length of the string in bytes */

#define src		x1	/* 16-byte aligned chunk address */
#define synd		x2	/* 64-bit NUL mask, four bits per string byte */
#define tmp		x3	/* scratch: leading-zero count of synd */
#define shift		x4	/* srcin * 4; used as a shift amount for synd */

/* data and vdata are the q and v views of register 0; vend and dend are
   the v and d views of register 2.  */
#define data		q0	/* current 16-byte chunk (ldr destination) */
#define vdata		v0	/* same chunk as byte lanes */
#define vhas_nul	v1	/* per byte: 0xff where the chunk byte is 0 */
#define vend		v2	/* compare result narrowed to 64 bits */
#define dend		d2	/* low 64 bits of vend, moved to synd */

/* Core algorithm:
   Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
   four bits per byte using the shrn instruction. A count of trailing zeros
   then identifies the first zero byte.  */

/* __strlen_aarch64_mte: strlen for AArch64 using Advanced SIMD.

   In:     x0 (srcin)  = pointer to the string.
   Out:    x0 (result) = number of bytes that precede the first NUL byte.
   Uses:   x1-x4 and v0-v2 as scratch.  The stack is not touched.

   Every load reads exactly one 16-byte aligned chunk: the first from
   srcin rounded down to a multiple of 16, the following ones at 16-byte
   steps from there.  Bytes of the first chunk that lie before srcin are
   loaded but removed from the mask before it is tested.

   ENTRY, END, L and PTR_ARG are macros supplied by asmdefs.h.
   NOTE(review): PTR_ARG (0) presumably normalises pointer argument 0 on
   ILP32 targets -- confirm against asmdefs.h.  */

ENTRY (__strlen_aarch64_mte)
	PTR_ARG (0)

	/* First chunk: the aligned 16 bytes that contain srcin.  */
	bic	src, srcin, 15			/* src = srcin rounded down to 16 */
	ld1	{vdata.16b}, [src]
	cmeq	vhas_nul.16b, vdata.16b, 0	/* 0xff in each lane holding a NUL */
	/* A register-specified lsr only uses the low six bits of the amount,
	   so srcin * 4 acts as (srcin % 16) * 4: one nibble of the mask per
	   byte between src and srcin.  */
	lsl	shift, srcin, 2
	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
	fmov	synd, dend
	lsr	synd, synd, shift		/* drop the bytes before srcin */
	cbz	synd, L(loop)			/* no NUL in the first chunk */

	/* The lowest set nibble marks the first NUL.  rbit + clz counts the
	   trailing zero bits; dividing by four turns bits into bytes.  */
	rbit	synd, synd
	clz	result, synd
	lsr	result, result, 2
	ret

	/* Main loop, two chunks per iteration.  While scanning, umaxp folds
	   the compare result to 64 bits that are nonzero iff some byte of the
	   chunk is NUL; the exact position is only computed after the loop.
	   Both exits reach L(loop_end) with src 16 bytes below the chunk that
	   holds the NUL and with vhas_nul describing that chunk.  */
	.p2align 5
L(loop):
	ldr	data, [src, 16]
	cmeq	vhas_nul.16b, vdata.16b, 0
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbnz	synd, L(loop_end)
	ldr	data, [src, 32]!		/* pre-index: src += 32, then load */
	cmeq	vhas_nul.16b, vdata.16b, 0
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbz	synd, L(loop)
	sub	src, src, 16			/* match the first exit's src */
L(loop_end):
	/* result = (src + 16 - srcin) + index of the NUL within the chunk.  */
	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
	sub	result, src, srcin
	fmov	synd, dend
	/* On big-endian the q-register ldr leaves the first string byte in the
	   most significant lane, so clz already finds it; only little-endian
	   needs the bit reversal.  */
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	add	result, result, 16
	clz	tmp, synd
	add	result, result, tmp, lsr 2	/* four mask bits per byte */
	ret

END (__strlen_aarch64_mte)


78