/* xref: /freebsd/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S (revision 072a4ba82a01476eaee33781ccd241033eefcf0b) */
/*
 * strlen - calculate the length of a string.
 *
 * Copyright (c) 2020-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "asmdefs.h"

/* Register aliases used by __strlen_aarch64_mte.  */

/* srcin is the incoming string pointer; result is the returned length.
   Both name x0, so srcin must not be read once result has been written.  */
#define srcin		x0
#define result		x0

/* src    - address of the 16-byte aligned chunk currently being examined.
   synd   - 64-bit syndrome mask moved out of the vector unit.
   tmp    - scratch for the leading-zero count at the end of the loop path.
   shift  - srcin * 4, used to shift mask bits of bytes before srcin away.  */
#define src		x1
#define synd		x2
#define tmp		x3
#define shift		x4

/* data/vdata are the 128-bit and vector views of register 0 (chunk bytes).
   vhas_nul receives the per-byte compare-with-zero result.
   vend/dend are the vector and 64-bit views of register 2 (narrowed mask).  */
#define data		q0
#define vdata		v0
#define vhas_nul	v1
#define vend		v2
#define dend		d2

/* Core algorithm:
   Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
   four bits per byte using the shrn instruction. A count-trailing-zeros
   operation then identifies the first zero byte.  */

/* __strlen_aarch64_mte: length of the NUL-terminated string at srcin.

   In:      srcin (x0)  - address of the first byte of the string.
   Out:     result (x0) - number of bytes preceding the first NUL byte.
   Scratch: x1-x4 and v0-v2; the stack is not touched.

   Memory is only read in whole 16-byte chunks at 16-byte aligned
   addresses, starting with the chunk that contains srcin and stopping at
   the chunk that contains the NUL.  */
ENTRY (__strlen_aarch64_mte)
	/* PTR_ARG is provided by asmdefs.h (not visible here); presumably it
	   normalises pointer argument 0 for the target ABI - confirm there.  */
	PTR_ARG (0)

	/* First chunk: load the aligned 16 bytes that contain srcin.  ld1 with
	   .16b elements puts memory byte i in lane i on either endianness, so
	   this path needs no byte-order conditional.  */
	bic	src, srcin, 15
	ld1	{vdata.16b}, [src]
	cmeq	vhas_nul.16b, vdata.16b, 0	/* 0xff in every lane holding NUL */
	lsl	shift, srcin, 2			/* 4 mask bits per byte; lsr uses shift mod 64 */
	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
	fmov	synd, dend
	lsr	synd, synd, shift		/* discard bytes that precede srcin */
	cbz	synd, L(loop)

	/* NUL found in the first chunk: length = trailing zero bits / 4.  */
	rbit	synd, synd
	clz	result, synd
	lsr	result, result, 2
	ret

	.p2align 5
	/* Main loop, unrolled twice.  umaxp with both sources equal folds the
	   16 compare lanes into the low 64 bits of vend, so synd is non-zero
	   exactly when the chunk contains a NUL.  */
L(loop):
	ldr	data, [src, 16]
	cmeq	vhas_nul.16b, vdata.16b, 0
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbnz	synd, L(loop_end)		/* NUL is in the chunk at src + 16 */
	ldr	data, [src, 32]!		/* src += 32, then load from the new src */
	cmeq	vhas_nul.16b, vdata.16b, 0
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbz	synd, L(loop)
	sub	src, src, 16			/* make src + 16 the chunk with the NUL */
L(loop_end):
	/* Here the NUL lies in the chunk at src + 16 and vhas_nul still holds
	   that chunk's compare result.  Build the 4-bits-per-byte mask.  */
	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
	sub	result, src, srcin
	fmov	synd, dend
#ifndef __AARCH64EB__
	/* Little-endian: the first byte sits in the low mask bits, so reverse
	   the mask to let clz locate it.  Big-endian builds skip this because
	   the ldr q load already places the first byte in the top bits.  */
	rbit	synd, synd
#endif
	add	result, result, 16
	clz	tmp, synd
	add	result, result, tmp, lsr 2	/* + byte index of the NUL in the chunk */
	ret

END (__strlen_aarch64_mte)