xref: /freebsd/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S (revision dd21556857e8d40f66bf5ad54754d9d52669ebf7)
1/*
2 * strlen - calculate the length of a string.
3 *
4 * Copyright (c) 2020-2022, Arm Limited.
5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6 */
7
8/* Assumptions:
9 *
10 * ARMv8-a, AArch64, Advanced SIMD.
11 * MTE compatible.
12 */
13
14#include "asmdefs.h"
15
/* Register aliases used by __strlen_aarch64_mte.  */

/* x0 has two names: the incoming string address and the returned length.  */
16#define srcin		x0
17#define result		x0
18
/* src:   address of the 16-byte chunk being examined.
   synd:  64-bit zero-byte mask copied out of dend.
   tmp:   scratch for the clz result.
   shift: srcin << 2, computed once at entry.  */
19#define src		x1
20#define	synd		x2
21#define tmp		x3
22#define shift		x4
23
/* data and vdata name the same register (q0 / v0): the 16 bytes just loaded.
   vhas_nul receives the result of comparing those bytes against zero.
   vend holds the narrowed mask; dend is the 64-bit view (d2) of vend (v2).  */
24#define data		q0
25#define vdata		v0
26#define vhas_nul	v1
27#define vend		v2
28#define dend		d2
29
30/* Core algorithm:
31   Process the string in 16-byte aligned chunks.  For each chunk, narrow the
32   byte-wise compare result into a 64-bit mask (shrn gives four bits per byte;
33   the main loop uses addhn instead).  Counting the trailing zeros of that mask
34   then identifies the first zero byte.  */
/* __strlen_aarch64_mte: length of a zero-terminated string.

   In:   srcin (x0)  = address of the first byte of the string.
   Out:  result (x0) = number of bytes that precede the first zero byte.
   Writes x1-x4 (src, synd, tmp, shift) and v0-v2.  The instructions below
   do not touch the stack.

   Every load reads 16 bytes from a 16-byte aligned address: src starts as
   srcin rounded down to a multiple of 16 and only advances in multiples
   of 16.  */
35ENTRY (__strlen_aarch64_mte)
	/* First chunk: the aligned 16 bytes that contain srcin.  */
36	bic	src, srcin, 15
37	ld1	{vdata.16b}, [src]
	/* Each byte lane of vhas_nul becomes 0xff where the data byte is zero.  */
38	cmeq	vhas_nul.16b, vdata.16b, 0
	/* shift = srcin * 4.  The lsr below uses only its low six bits, i.e.
	   4 * (srcin % 16): the mask bits of the bytes in front of srcin.  */
39	lsl	shift, srcin, 2
40	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
41	fmov	synd, dend
	/* Discard the nibbles of bytes before srcin; nibble 0 is now srcin[0].  */
42	lsr	synd, synd, shift
43	cbz	synd, L(next16)
44
	/* rbit followed by clz counts trailing zeros.  The mask has four bits
	   per byte, so dividing by four gives the byte index from srcin.  */
45	rbit	synd, synd
46	clz	result, synd
47	lsr	result, result, 2
48	ret
49
	/* Second chunk, at src + 16.  src is left unchanged when no zero byte is
	   found, so the pre-indexed load in L(loop) continues at src + 32.  */
50L(next16):
51	ldr	data, [src, 16]
52	cmeq	vhas_nul.16b, vdata.16b, 0
53	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
54	fmov	synd, dend
55	cbz	synd, L(loop)
	/* Zero byte found: point src at this chunk.  */
56	add	src, src, 16
	/* Little-endian builds reverse the mask so that clz counts trailing
	   zeros; builds with __AARCH64EB__ defined take clz of the mask as is.
	   NOTE(review): presumably because ldr of a q register places the
	   lowest-addressed byte at the most significant end on big-endian -
	   confirm against the architecture manual.  */
57#ifndef __AARCH64EB__
58	rbit	synd, synd
59#endif
	/* length = (src - srcin) + index of the zero byte within the chunk.  */
60	sub	result, src, srcin
61	clz	tmp, synd
62	add	result, result, tmp, lsr 2
63	ret
64
	/* Align the loop entry to a 32-byte boundary.  */
65	.p2align 5
	/* Main loop: two chunks per iteration.  The first load advances src by 32
	   with writeback; the second reads at the new src + 16.  addhn replaces
	   shrn here: the mask is still zero exactly when the chunk holds no zero
	   byte, but its bit layout differs (handled in L(loop_end)).  */
66L(loop):
67	ldr	data, [src, 32]!
68	cmeq	vhas_nul.16b, vdata.16b, 0
69	addhn	vend.8b, vhas_nul.8h, vhas_nul.8h
70	fmov	synd, dend
71	cbnz	synd, L(loop_end)
72	ldr	data, [src, 16]
73	cmeq	vhas_nul.16b, vdata.16b, 0
74	addhn	vend.8b, vhas_nul.8h, vhas_nul.8h
75	fmov	synd, dend
76	cbz	synd, L(loop)
	/* Hit in the second chunk: make src point at it, as L(loop_end) expects.  */
77	add	src, src, 16
	/* Here src is the address of the chunk that holds the first zero byte and
	   synd is its addhn mask.  With k the index of that byte in the chunk, the
	   lowest set bit of a little-endian mask is at 4*k (k even) or 4*k - 3
	   (k odd), so adding 3 before the final shift right by 2 yields k in both
	   cases.  Big-endian builds take clz of the unreversed mask and apply no
	   bias.  */
78L(loop_end):
79	sub	result, shift, src, lsl 2	/* (srcin - src) << 2.  */
80#ifndef __AARCH64EB__
81	rbit	synd, synd
	/* result = 4 * (srcin - src) - 3, negated by the subtraction below.  */
82	sub	result, result, 3
83#endif
84	clz	tmp, synd
	/* result = tmp + 4 * (src - srcin), plus 3 on little-endian.  */
85	sub	result, tmp, result
	/* Convert the bit count to a byte count.  */
86	lsr	result, result, 2
87	ret
88
89END (__strlen_aarch64_mte)
90