/* Origin: /freebsd/contrib/arm-optimized-routines/string/aarch64/strcpy.S (revision 072a4ba82a01476eaee33781ccd241033eefcf0b) */
/*
 * strcpy/stpcpy - copy a string returning pointer to start/end.
 *
 * Copyright (c) 2020-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "asmdefs.h"

/* Register aliases used by the routine.  Two names that map to the same
   register (or to two views of it) share storage, so only one of them is
   live at any point.  */

/* Incoming arguments and the return value (result shares x0 with dstin).  */
#define dstin		x0
#define srcin		x1
#define result		x0

/* General-purpose scratch.  len/synd share x4 and tmp/shift share x5;
   dataw1/dataw2 are the 32-bit views of data1/data2.  */
#define src		x2
#define dst		x3
#define len		x4
#define synd		x4
#define tmp		x5
#define shift		x5
#define data1		x6
#define dataw1		w6
#define data2		x7
#define dataw2		w7

/* SIMD scratch.  dataq/vdata are views of v0, vhas_nul/dataq2 are views of
   v1, and vend/dend are views of v2.  */
#define dataq		q0
#define vdata		v0
#define vhas_nul	v1
#define vend		v2
#define dend		d2
#define dataq2		q1

/* Select the entry-point name and the stpcpy-only instruction wrapper.
   The instruction handed to IFSTPCPY contains commas, so the preprocessor
   splits it into several macro arguments; X,__VA_ARGS__ joins them back
   together.  Without BUILD_STPCPY the wrapper expands to nothing.  */
#ifdef BUILD_STPCPY
# define STRCPY __stpcpy_aarch64
# define IFSTPCPY(X,...) X,__VA_ARGS__
#else
# define STRCPY __strcpy_aarch64
# define IFSTPCPY(X,...)
#endif

/*
   Core algorithm:
   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
   per byte. We take 4 bits of every comparison byte with a shift-right-and-
   narrow-by-4 instruction (shrn). Since the bits in the nibble mask reflect the
   order in which things occur in the original string, counting leading zeros
   (after a bit reversal where the byte order requires it) identifies exactly
   which byte matched.  */

/* STRCPY (dstin = x0, srcin = x1)

   Copies the NUL-terminated string at srcin, terminator included, to dstin.

   Return value: in the strcpy build IFSTPCPY expands to nothing and no
   instruction below writes x0 (apart from whatever PTR_ARG expands to), so
   x0 still holds dstin on return.  With BUILD_STPCPY every return path
   first sets result to the address of the NUL written to the destination.

   Method: the NUL is searched for with aligned 16-byte loads, starting at
   srcin rounded down to a multiple of 16.  If it lies in the first two
   aligned chunks (len <= 31) the string is copied with two possibly
   overlapping accesses of 16, 8 or 4 bytes, or with a halfword/byte store
   for len <= 2.  Otherwise an unrolled loop copies two aligned source
   chunks per iteration and a final unaligned 16-byte copy ends on the NUL.

   Registers written: x2-x7, v0-v2, the condition flags (subs), and x0 in
   the stpcpy build.  No instruction in the body references sp.  */
ENTRY (STRCPY)
	PTR_ARG (0)
	PTR_ARG (1)

	/* First chunk: the aligned 16 bytes that contain srcin.  */
	bic	src, srcin, 15
	ld1	{vdata.16b}, [src]
	cmeq	vhas_nul.16b, vdata.16b, 0	/* 0xff in each lane holding NUL */
	lsl	shift, srcin, 2			/* 4 mask bits per byte */
	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
	fmov	synd, dend
	/* A register shift uses the amount modulo 64, i.e. 4 * (srcin & 15),
	   which discards the mask bits of the bytes in front of srcin.  */
	lsr	synd, synd, shift
	cbnz	synd, L(tail)

	/* Second chunk: the next aligned 16 bytes; src is advanced to it.  */
	ldr	dataq, [src, 16]!
	cmeq	vhas_nul.16b, vdata.16b, 0
	shrn	vend.8b, vhas_nul.8h, 4
	fmov	synd, dend
	cbz	synd, L(start_loop)

	/* NUL found in the second chunk:
	   len = (src - srcin) + index of the NUL within the chunk,
	   so 1 <= len <= 31.  */
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	sub	tmp, src, srcin
	clz	len, synd
	add	len, tmp, len, lsr 2		/* 4 mask bits per byte */
	tbz	len, 4, L(less16)		/* bit 4 clear: len < 16 */

	/* 16 <= len <= 31: two 16-byte copies, the second one ending on the
	   NUL (offset tmp = len - 15).  Both loads are issued before either
	   store.  */
	sub	tmp, len, 15
	ldr	dataq, [srcin]
	ldr	dataq2, [srcin, tmp]
	str	dataq, [dstin]
	str	dataq2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

L(tail):
	/* NUL found in the first chunk.  The mask was already shifted so that
	   bit 0 corresponds to srcin, hence len is the string length (< 16).
	   This rbit is not conditional on __AARCH64EB__, unlike the other
	   two decodes; the chunk here was loaded with ld1 {.16b} rather than
	   ldr q.  */
	rbit	synd, synd
	clz	len, synd
	lsr	len, len, 2
L(less16):
	/* len < 16 on entry.  */
	tbz	len, 3, L(less8)		/* bit 3 clear: len < 8 */

	/* 8 <= len <= 15: two 8-byte copies, the second ending on the NUL.  */
	sub	tmp, len, 7
	ldr	data1, [srcin]
	ldr	data2, [srcin, tmp]
	str	data1, [dstin]
	str	data2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

	.p2align 4
L(less8):
	/* len < 8 on entry.  */
	subs	tmp, len, 3
	b.lo	L(less4)			/* len < 3 */

	/* 3 <= len <= 7: two 4-byte copies, the second ending on the NUL.  */
	ldr	dataw1, [srcin]
	ldr	dataw2, [srcin, tmp]
	str	dataw1, [dstin]
	str	dataw2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

L(less4):
	/* len is 0, 1 or 2.  For a non-empty string copy two bytes (for
	   len == 1 the second byte is the NUL itself), then store the
	   terminator at dstin + len.  */
	cbz	len, L(zerobyte)
	ldrh	dataw1, [srcin]
	strh	dataw1, [dstin]
L(zerobyte):
	strb	wzr, [dstin, len]
	IFSTPCPY (add result, dstin, len)
	ret

	.p2align 4
L(start_loop):
	/* No NUL in the first two chunks.  Copy the first 16 source bytes
	   unaligned and set up dst so that dst + tmp == src, where
	   tmp = srcin - dstin is the constant source/destination offset.  */
	sub	tmp, srcin, dstin
	ldr	dataq2, [srcin]
	sub	dst, src, tmp
	str	dataq2, [dstin]
L(loop):
	/* Two chunks per iteration.  On entry dataq holds the NUL-free chunk
	   loaded from src and dst is where it belongs.  */
	str	dataq, [dst], 32
	ldr	dataq, [src, 16]
	cmeq	vhas_nul.16b, vdata.16b, 0
	/* Pairwise max: the low 64 bits are non-zero iff any lane matched.  */
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbnz	synd, L(loopend)
	str	dataq, [dst, -16]
	ldr	dataq, [src, 32]!
	cmeq	vhas_nul.16b, vdata.16b, 0
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbz	synd, L(loop)
	/* Match the cbnz exit above: dst - 16 is the destination of the
	   chunk that holds the NUL.  */
	add	dst, dst, 16
L(loopend):
	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
	fmov	synd, dend
	sub	dst, dst, 31
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	len, synd
	lsr	len, len, 2			/* index of the NUL in the chunk */
	/* (dst - 16) + len - 15: the final 16-byte copy ends on the NUL.  */
	add	dst, dst, len
	ldr	dataq, [dst, tmp]		/* dst + tmp: matching source */
	str	dataq, [dst]
	IFSTPCPY (add result, dst, 15)
	ret

END (STRCPY)