xref: /freebsd/lib/libc/amd64/string/stpcpy.S (revision 783d3ff6d7fae619db8a7990b8a6387de0c677b5)
/*-
 * Copyright (c) 2023, The FreeBSD Foundation
 *
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Portions of this software were developed by Robert Clausecker
 * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
 *
 * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcpy.S
 * written by J.T. Conklin <jtc@acorntoolworks.com> and
 * adapted by Guillaume Morin <guillaume@morinfr.org> to implement stpcpy
 * that was originally dedicated to the public domain
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4, 0x90

	.weak stpcpy
	.set stpcpy, __stpcpy
ARCHFUNCS(__stpcpy)
	ARCHFUNC(__stpcpy, scalar)
	ARCHFUNC(__stpcpy, baseline)
ENDARCHFUNCS(__stpcpy)

/*
 * This stpcpy implementation copies a byte at a time until the
 * source pointer is aligned to a word boundary; it then copies by
 * words until it finds a word containing a zero byte, and finally
 * copies by bytes until the end of the string is reached.
 *
 * While this may result in unaligned stores if the source and
 * destination pointers are unaligned with respect to each other,
 * it is still faster than copying byte by byte or incurring the
 * overhead of an implementation suitable for machines with strict
 * alignment requirements.
 */
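
/*
 * As a rough C sketch (illustrative only, not part of this file), the
 * word-at-a-time scan below boils down to the following test, using
 * the same constants that are loaded into %r8 and %r9; the helper
 * name is made up for this illustration:
 *
 *	static int may_contain_nul(uint64_t v)
 *	{
 *		return ((v - 0x0101010101010101ULL) &
 *		    0x8080808080808080ULL) != 0;
 *	}
 *
 * The test never misses a NUL byte, but it is conservative: it can
 * also fire when the word merely contains bytes with the high bit
 * set, so a hit is re-checked byte by byte.
 */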

ARCHENTRY(__stpcpy, scalar)
	movabsq $0x0101010101010101,%r8
	movabsq $0x8080808080808080,%r9

	/*
	 * Align source to a word boundary.
	 * Consider unrolling loop?
	 */
.Lalign:
	testb	$7,%sil
	je	.Lword_aligned
	movb	(%rsi),%dl
	incq	%rsi
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl
	jne	.Lalign
	movq	%rdi,%rax
	dec	%rax
	ret

	ALIGN_TEXT
.Lloop:
	movq	%rdx,(%rdi)
	addq	$8,%rdi
.Lword_aligned:
	movq	(%rsi),%rdx
	movq	%rdx,%rcx
	addq	$8,%rsi
	subq	%r8,%rcx
	testq	%r9,%rcx
	je	.Lloop

	/*
	 * In rare cases, the above loop may exit prematurely. We must
	 * return to the loop if none of the bytes in the word equal 0.
	 */
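
	/*
	 * (The word test is conservative because a byte in the range
	 * 0x81--0xff also ends up with its high bit set after the
	 * subtraction even though it is not NUL; only the byte-wise
	 * checks below can tell the two cases apart.)
	 */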

	movb	%dl,(%rdi)
	testb	%dl,%dl		/* 1st byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl		/* 2nd byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl		/* 3rd byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl		/* 4th byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl		/* 5th byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl		/* 6th byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl		/* 7th byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl		/* 8th byte == 0? */
	jne	.Lword_aligned
	decq	%rdi

.Ldone:
	movq	%rdi,%rax
	ret
ARCHEND(__stpcpy, scalar)

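/*
 * The baseline implementation relies on SSE2 (part of the amd64
 * baseline): the source is aligned down to 16 bytes so that all loads
 * are aligned, each 16-byte chunk is compared against zero with
 * pcmpeqb, and pmovmskb condenses the result into a bit mask whose
 * set bits mark NUL bytes.  Mask bits belonging to junk bytes before
 * the start of the string are shifted out.  Stores to the destination
 * are unaligned.
 *
 * As a rough C sketch of the chunk test (illustrative only, not part
 * of this file; the helper name is made up):
 *
 *	#include <emmintrin.h>
 *
 *	static int nul_mask(const char *p)	// p 16-byte aligned
 *	{
 *		__m128i chunk = _mm_load_si128((const __m128i *)p);
 *		__m128i eq = _mm_cmpeq_epi8(chunk, _mm_setzero_si128());
 *		return _mm_movemask_epi8(eq);	// bit i set iff p[i] == '\0'
 *	}
 */
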
ARCHENTRY(__stpcpy, baseline)
	mov	%esi, %ecx
	mov	%rdi, %rdx
	sub	%rsi, %rdi		# express destination as distance to source
	and	$~0xf, %rsi		# align source to 16 bytes
	movdqa	(%rsi), %xmm0		# head of string with junk before
	pxor	%xmm1, %xmm1
	and	$0xf, %ecx		# misalignment in bytes
	pcmpeqb	%xmm1, %xmm0		# NUL byte present?
	pmovmskb %xmm0, %eax
	shr	%cl, %eax		# clear out matches in junk bytes
	bsf	%eax, %eax		# find match if any
	jnz	.Lrunt

	/* first normal iteration: write head back if it succeeds */
	movdqa	16(%rsi), %xmm0		# 16 bytes of current iteration
	movdqu	(%rsi, %rcx, 1), %xmm2	# first 16 bytes of the string
	pcmpeqb	%xmm0, %xmm1		# NUL byte present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax		# find match if any
	jnz	.Lshorty

	movdqu	%xmm2, (%rdx)		# store beginning of string

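	/*
	 * In the loop below, a 16-byte chunk is stored to the
	 * destination only after it has been checked and found free of
	 * NUL bytes, so the loop never writes past the end of the
	 * string; the final, partial chunk is handled by an overlapping
	 * store after the loop.
	 */
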
	/* main loop, unrolled twice */
	ALIGN_TEXT
0:	movdqa	32(%rsi), %xmm2		# load current iteration
	movdqu	%xmm0, 16(%rsi, %rdi, 1) # write back previous iteration
	pxor	%xmm1, %xmm1
	add	$32, %rsi
	pcmpeqb	%xmm2, %xmm1		# NUL byte present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	1f

	movdqa	16(%rsi), %xmm0		# load current iteration
	movdqu	%xmm2, (%rsi, %rdi, 1)	# write back previous iteration
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm0, %xmm1		# NUL byte present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jz	0b

	/* end of string after main loop has iterated */
	add	$16, %rsi		# advance rsi to second unrolled half
1:	tzcnt	%eax, %eax		# find location of match
					# (behaves as bsf on pre-x86-64-v3 CPUs)
	add	%rsi, %rax		# point to NUL byte
	movdqu	-15(%rax), %xmm0	# last 16 bytes of string
	movdqu	%xmm0, -15(%rax, %rdi, 1) # copied to destination
	add	%rdi, %rax		# point to destination's NUL byte
	ret

	/* NUL encountered in second iteration */
.Lshorty:
	tzcnt	%eax, %eax
	add	$16, %eax		# account for length of first iteration
	sub	%ecx, %eax		# but not the parts before the string

	/* NUL encountered in first iteration */
.Lrunt:	lea	1(%rax), %edi		# string length including NUL byte
	add	%rcx, %rsi		# point to beginning of string
	add	%rdx, %rax		# point to NUL byte

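	/*
	 * Copy the remaining short string (at most 32 bytes including
	 * the NUL byte) with two possibly overlapping transfers: one
	 * from the start of the string and one ending at the NUL byte.
	 * At this point %edi holds the string length including the NUL
	 * byte, %rsi points to the start of the source string, %rdx to
	 * the destination, and %rax to the destination's NUL byte.
	 * Roughly, for the 16--32 byte case (C for illustration only):
	 *
	 *	memcpy(dst, src, 16);
	 *	memcpy(dst + len - 16, src + len - 16, 16);
	 */
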
	/* transfer 16--32 bytes */
.L1632:	cmp	$16, %edi
	jb	.L0815

	movdqu	-16(%rsi, %rdi, 1), %xmm0 # load last 16 bytes
	movdqu	%xmm2, (%rdx)		# store first 16 bytes
	movdqu	%xmm0, -15(%rax)	# store last 16 bytes
	ret

	/* transfer 8--15 bytes */
.L0815:	cmp	$8, %edi
	jb	.L0407

	mov	(%rsi), %rcx		# load first 8 bytes
	mov	-8(%rsi, %rdi, 1), %rdi	# load last 8 bytes
	mov	%rcx, (%rdx)		# store to dst
	mov	%rdi, -7(%rax)		# ditto
	ret

	/* transfer 4--7 bytes */
.L0407:	cmp	$4, %edi
	jb	.L0203

	mov	(%rsi), %ecx
	mov	-4(%rsi, %rdi, 1), %edi
	mov	%ecx, (%rdx)
	mov	%edi, -3(%rax)
	ret

	/* transfer 2--3 bytes */
.L0203:	cmp	$2, %edi
	jb	.L0101

	movzwl	(%rsi), %ecx
	mov	%cx, (%rdx)		# store first two bytes

	/* transfer 0 bytes (last byte is always NUL) */
.L0101:	movb	$0, (%rax)		# store terminating NUL byte
	ret
ARCHEND(__stpcpy, baseline)

	.section .note.GNU-stack,"",%progbits
