xref: /freebsd/lib/libc/amd64/string/stpcpy.S (revision 9fbea870286d53d906ffaf6b15ace8e40019a880)
/*-
 * Copyright (c) 2023, The FreeBSD Foundation
 *
 * SPDX-License-Expression: BSD-2-Clause
 *
 * Portions of this software were developed by Robert Clausecker
 * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
 *
 * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcpy.S
 * written by J.T. Conklin <jtc@acorntoolworks.com> and
 * adapted by Guillaume Morin <guillaume@morinfr.org> to implement stpcpy
 * that was originally dedicated to the public domain
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

/* Pad to a 16-byte (2^4) boundary, filling with 0x90 (one-byte NOP). */
#define ALIGN_TEXT	.p2align 4, 0x90

/*
 * stpcpy is exported as a weak alias of __stpcpy.
 *
 * NOTE(review): ARCHFUNCS/ARCHFUNC/ENDARCHFUNCS come from
 * amd64_archlevel.h; presumably they emit the table from which __stpcpy
 * is bound to one of the variants listed here -- confirm against that
 * header.  Each variant named below needs a matching
 * ARCHENTRY/ARCHEND pair in this file.
 */
	.weak stpcpy
	.set stpcpy, __stpcpy
ARCHFUNCS(__stpcpy)
	ARCHFUNC(__stpcpy, scalar)
	ARCHFUNC(__stpcpy, baseline)
ENDARCHFUNCS(__stpcpy)

/*
 * char *stpcpy(char *dst, const char *src) -- scalar variant.
 *
 * In:   %rdi = dst, %rsi = src
 * Out:  %rax = address of the NUL byte written to dst
 * Uses: %rcx, %rdx, %rsi, %rdi, %r8, %r9, flags
 *
 * This stpcpy implementation copies a byte at a time until the
 * source pointer is aligned to a word (8 byte) boundary, it then
 * copies by words until it finds a word that may contain a zero byte,
 * and finally copies that word by bytes until the end of the string
 * is reached.
 *
 * While this may result in unaligned stores if the source and
 * destination pointers are unaligned with respect to each other,
 * it is still faster than either byte copies or the overhead of
 * an implementation suitable for machines with strict alignment
 * requirements.
 */

ARCHENTRY(__stpcpy, scalar)
	movabsq	$0x0101010101010101,%r8	/* 0x01 in every byte */
	movabsq	$0x8080808080808080,%r9	/* 0x80 in every byte */

	/*
	 * Align source to a word boundary.
	 * Consider unrolling loop?
	 */
.Lalign:
	testb	$7,%sil
	je	.Lword_aligned
	movb	(%rsi),%dl
	incq	%rsi
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl
	jne	.Lalign
	leaq	-1(%rdi),%rax		/* NUL copied; %rdi is one past it */
	ret

	ALIGN_TEXT
.Lloop:
	movq	%rdx,(%rdi)		/* previous word had no NUL: store it */
	addq	$8,%rdi
.Lword_aligned:
	movq	(%rsi),%rdx
	movq	%rdx,%rcx
	addq	$8,%rsi
	/*
	 * (word - 0x01..01) & 0x80..80 is nonzero whenever some byte of
	 * the word is 0.
	 */
	subq	%r8,%rcx
	testq	%r9,%rcx
	je	.Lloop

	/*
	 * In rare cases, the above loop may exit prematurely (the test
	 * also fires for a word without NUL that has a byte >= 0x81).
	 * Copy the word one byte at a time, lowest byte first, and
	 * return to the loop if none of the bytes in the word equal 0.
	 */

	.rept	7			/* 1st through 7th byte */
	movb	%dl,(%rdi)
	testb	%dl,%dl			/* this byte == 0? */
	je	.Ldone			/* yes: %rdi points at the stored NUL */
	incq	%rdi
	shrq	$8,%rdx			/* bring the next byte into %dl */
	.endr

	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl			/* 8th byte == 0? */
	jne	.Lword_aligned		/* no NUL after all: resume word copy */
	decq	%rdi			/* step back onto the NUL */

.Ldone:
	movq	%rdi,%rax
	ret
ARCHEND(__stpcpy, scalar)

/*
 * char *stpcpy(char *dst, const char *src) -- baseline variant, using
 * SSE2 instructions on %xmm0-%xmm2.
 *
 * In:   %rdi = dst, %rsi = src
 * Out:  %rax = address of the NUL byte written to dst
 * Uses: %rcx, %rdx, %rsi, %rdi, %xmm0, %xmm1, %xmm2, flags
 *
 * Register roles after the prologue:
 *   %rsi  source cursor, rounded down to a 16 byte boundary
 *   %rdi  dst - src, so that (%rsi, %rdi, 1) is the destination address
 *         belonging to source address %rsi
 *   %rdx  the original dst
 *   %ecx  src & 0xf, i.e. the number of bytes in the first aligned chunk
 *         that precede the string
 *
 * The NUL search works on aligned 16 byte chunks.  If the NUL lies in
 * the first or second chunk, the string (1--32 bytes including the NUL)
 * is copied by the size-classed code at .L1632 and below, using a move
 * of the first and a move of the last bytes which may overlap.
 * Otherwise the main loop stores each chunk once the following one has
 * been loaded, and the end of the string is copied with one unaligned
 * 16 byte move ending at the NUL byte.
 */
ARCHENTRY(__stpcpy, baseline)
	mov	%esi, %ecx
	mov	%rdi, %rdx
	sub	%rsi, %rdi		# express destination as distance to source
	and	$~0xf, %rsi		# align source to 16 byte
	movdqa	(%rsi), %xmm0		# head of string with junk before
	pxor	%xmm1, %xmm1
	and	$0xf, %ecx		# misalignment in bytes
	pcmpeqb	%xmm1, %xmm0		# NUL byte present?
	pmovmskb %xmm0, %eax
	shr	%cl, %eax		# clear out matches in junk bytes
	bsf	%eax, %eax		# find match if any (ZF set if none)
	jnz	.Lrunt			# match: %eax = offset of NUL in string

	/* first normal iteration: write head back if it succeeds */
	movdqa	16(%rsi), %xmm0		# 16 bytes of current iteration
	movdqu	(%rsi, %rcx, 1), %xmm2	# first 16 bytes of the string
	pcmpeqb	%xmm0, %xmm1		# NUL byte present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax		# find match if any
	jnz	.Lshorty

	movdqu	%xmm2, (%rdx)		# store beginning of string

	/* main loop, unrolled twice */
	ALIGN_TEXT
0:	movdqa	32(%rsi), %xmm2		# load current iteration
	movdqu	%xmm0, 16(%rsi, %rdi, 1) # write back previous iteration
	pxor	%xmm1, %xmm1
	add	$32, %rsi
	pcmpeqb	%xmm2, %xmm1		# NUL byte present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	1f

	movdqa	16(%rsi), %xmm0		# load current iteration
	movdqu	%xmm2, (%rsi, %rdi, 1)	# write back previous iteration
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm0, %xmm1		# NUL byte present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jz	0b

	/* end of string after main loop has iterated */
	add	$16, %rsi		# advance rsi to second unrolled half
1:	tzcnt	%eax, %eax		# find location of match
					# (behaves as bsf on pre-x86-64-v3 CPUs)
	add	%rsi, %rax		# point to NUL byte
	/*
	 * At least two chunks without NUL have been passed, so the 16
	 * bytes ending at the NUL byte all belong to the string.
	 */
	movdqu	-15(%rax), %xmm0	# last 16 bytes of string
	movdqu	%xmm0, -15(%rax, %rdi, 1) # copied to destination
	add	%rdi, %rax		# point to destination's NUL byte
	ret

	/* NUL encountered in second iteration */
.Lshorty:
	tzcnt	%eax, %eax
	add	$16, %eax		# account for length of first iteration
	sub	%ecx, %eax		# but not the parts before the string

	/* NUL encountered in first iteration */
.Lrunt:	lea	1(%rax), %edi		# string length including NUL byte
	add	%rcx, %rsi		# point to beginning of string
	add	%rdx, %rax		# point to NUL byte (return value)

	/*
	 * transfer 16--32 bytes
	 *
	 * When coming from .Lrunt, %xmm2 has not been loaded.  This path
	 * is then only taken for a length of exactly 16, where the store
	 * of the last 16 bytes covers the same 16 bytes at (%rdx) and
	 * overwrites whatever %xmm2 held.
	 */
.L1632:	cmp	$16, %edi
	jb	.L0815

	movdqu	-16(%rsi, %rdi, 1), %xmm0 # load last 16 bytes
	movdqu	%xmm2, (%rdx)		# store first 16 bytes
	movdqu	%xmm0, -15(%rax)	# store last 16 bytes
	ret

	/* transfer 8--15 bytes */
.L0815:	cmp	$8, %edi
	jb	.L0407

	mov	(%rsi), %rcx		# load first 8 bytes
	mov	-8(%rsi, %rdi, 1), %rdi	# load last 8 bytes (length no longer needed)
	mov	%rcx, (%rdx)		# store to dst
	mov	%rdi, -7(%rax)		# ditto
	ret

	/* transfer 4--7 bytes */
.L0407:	cmp	$4, %edi
	jb	.L0203

	mov	(%rsi), %ecx		# load first 4 bytes
	mov	-4(%rsi, %rdi, 1), %edi	# load last 4 bytes
	mov	%ecx, (%rdx)		# store to dst
	mov	%edi, -3(%rax)		# ditto
	ret

	/* transfer 2--3 bytes */
.L0203:	cmp	$2, %edi
	jb	.L0101

	movzwl	(%rsi), %ecx
	mov	%cx, (%rdx)		# store first two bytes

	/* transfer 0 bytes (last byte is always NUL) */
.L0101:	movb	$0, (%rax)		# store terminating NUL byte
	ret
ARCHEND(__stpcpy, baseline)

	/* mark the object as not requiring an executable stack */
	.section .note.GNU-stack,"",%progbits
238