xref: /freebsd/contrib/arm-optimized-routines/string/aarch64/strlen-mte.S (revision a2464ee12761660f50d0b6f59f233949ebcacc87)
1/*
2 * strlen - calculate the length of a string.
3 *
4 * Copyright (c) 2020, Arm Limited.
5 * SPDX-License-Identifier: MIT
6 */
7
8/* Assumptions:
9 *
10 * ARMv8-a, AArch64, Advanced SIMD.
11 * MTE compatible.
12 */
13
14#include "../asmdefs.h"
15
/* Argument and return value. Both aliases name x0, so writing result
   destroys srcin.  */
16#define srcin		x0
17#define result		x0
18
/* General-purpose scratch registers.
   src   - address of the 16-byte aligned chunk currently being examined.
   synd  - 64-bit syndrome moved out of the vector unit.
   tmp   - scratch value, wtmp is the 32-bit view of the same register.
   shift - bit count used to drop syndrome bits of bytes before the string.  */
19#define src		x1
20#define	synd		x2
21#define tmp		x3
22#define wtmp		w3
23#define shift		x4
24
/* Vector registers. data and vdata are the q and v views of register 0.
   vend and dend are the v and d views of register 3, so dend is the low
   64 bits of vend.
   vhas_nul - per-byte result of comparing the chunk against zero.
   vrepmask - repeating 0xf00f halfword mask that selects one nibble per byte.  */
25#define data		q0
26#define vdata		v0
27#define vhas_nul	v1
28#define vrepmask	v2
29#define vend		v3
30#define dend		d3
31
32/* Core algorithm:
33
34   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
35   per byte. For even bytes, bits 0-3 are set if the relevant byte is NUL and
36   bits 4-7 must be zero. For odd bytes, bits 4-7 are set likewise and bits
37   0-3 must be zero, so that adjacent bytes can be merged. Since the bits in
38   the syndrome reflect the order in which bytes occur in the original string,
39   counting trailing zeros identifies exactly which byte is the first NUL.  */
40
/* Return in result (x0) the number of bytes that precede the first NUL byte
   of the string that srcin (x0) points to.

   Memory is only ever read in 16-byte chunks at 16-byte aligned addresses,
   starting with the aligned chunk that contains srcin. NOTE(review): this is
   presumably what the MTE compatible note in the file header relies on -
   confirm against the MTE tag granule size.

   Scratch registers written: src, synd, tmp, shift (x1-x4) and v0-v3.  */
41ENTRY (__strlen_aarch64_mte)
	/* PTR_ARG comes from asmdefs.h and is not visible here - presumably it
	   sanitises pointer argument 0 for ILP32 builds, confirm there.  */
42	PTR_ARG (0)
	/* First chunk: load the aligned 16 bytes that contain srcin. wtmp seeds
	   vrepmask with 0x0f in every even byte lane and 0xf0 in every odd one,
	   so each byte keeps one distinct nibble of its 0xff or 0x00 compare
	   result.  */
43	bic	src, srcin, 15
44	mov	wtmp, 0xf00f
45	ld1	{vdata.16b}, [src]
46	dup	vrepmask.8h, wtmp
47	cmeq	vhas_nul.16b, vdata.16b, 0
	/* shift = 4 * srcin. The register-controlled lsr below only uses the
	   low 6 bits of shift, i.e. 4 * (srcin mod 16), which is the number of
	   syndrome bits that belong to bytes in front of the string.  */
48	lsl	shift, srcin, 2
49	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
	/* Pairwise add merges the two nibbles of each byte pair into one byte.  */
50	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
51	fmov	synd, dend
	/* Drop the nibbles of bytes that precede srcin, then enter the loop if
	   no NUL is left in this chunk.  */
52	lsr	synd, synd, shift
53	cbz	synd, L(loop)
54
	/* NUL found in the first chunk. rbit followed by clz counts trailing
	   zero bits. After the shift above bit 0 corresponds to the byte at
	   srcin, so with four bits per byte the count divided by 4 is the
	   length.  */
55	rbit	synd, synd
56	clz	result, synd
57	lsr	result, result, 2
58	ret
59
60	.p2align 5
	/* Main loop: examine one aligned 16-byte chunk per iteration.  */
61L(loop):
	/* Pre-indexed load: src advances by 16 before the access.  */
62	ldr	data, [src, 16]!
63	cmeq	vhas_nul.16b, vdata.16b, 0
	/* Pairwise max folds the 16 compare bytes into the low 64 bits. The
	   result is non-zero iff some byte in the chunk is NUL.  */
64	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
65	fmov	synd, dend
66	cbz	synd, L(loop)
67
	/* A NUL lies in the chunk at src. Build the four-bits-per-byte syndrome
	   the same way as for the first chunk.  */
68	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
69	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
	/* Bytes between the start of the string and this chunk. This overwrites
	   srcin, because result and srcin are both x0.  */
70	sub	result, src, srcin
71	fmov	synd, dend
	/* On little-endian the first NUL is the lowest set nibble, so the bits
	   are reversed to let clz find it. NOTE(review): big-endian builds skip
	   the rbit, presumably because ldr of a q register places the lowest
	   addressed byte in the most significant lane there, whereas the first
	   chunk is loaded with ld1 and uses rbit unconditionally - confirm
	   against the Arm Architecture Reference Manual.  */
72#ifndef __AARCH64EB__
73	rbit	synd, synd
74#endif
	/* tmp = 4 * index of the first NUL within the chunk. Scale it down and
	   add it to the chunk offset to form the length.  */
75	clz	tmp, synd
76	add	result, result, tmp, lsr 2
77	ret
78
79END (__strlen_aarch64_mte)
80
81