/* xref: /freebsd/lib/libc/amd64/string/strlcpy.S (revision 74d6cfad54d676299ee5e4695139461876dfd757) */
/*
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

/* Align code to a 16-byte (2^4) boundary, padding with 0x90 (one-byte NOP). */
#define ALIGN_TEXT	.p2align 4, 0x90

/*
 * strlcpy is exported as a weak alias of __strlcpy.
 *
 * The ARCHFUNCS table names the two implementations defined in this file,
 * "scalar" and "baseline".
 * NOTE(review): ARCHFUNCS/ARCHFUNC/ENDARCHFUNCS are not defined here
 * (presumably in amd64_archlevel.h); how one of the listed variants is
 * selected is not visible in this file -- confirm against that header.
 */
	.weak strlcpy
	.set strlcpy, __strlcpy
ARCHFUNCS(__strlcpy)
	ARCHFUNC(__strlcpy, scalar)
	ARCHFUNC(__strlcpy, baseline)
ENDARCHFUNCS(__strlcpy)

/*
 * size_t __strlcpy(char *dst, const char *src, size_t len) -- scalar variant
 *
 * Built on the C library's strlen() and memcpy(); equivalent to
 *
 *	srclen = strlen(src);
 *	if (len != 0) {
 *		n = min(len - 1, srclen);
 *		dst[n] = '\0';
 *		memcpy(dst, src, n);
 *	}
 *	return (srclen);
 *
 * In:		%rdi = dst, %rsi = src, %rdx = len
 * Out:		%rax = strlen(src)
 * Saved:	%rbx (callee-saved) carries strlen(src) across the memcpy call
 *		and is restored before returning.
 * Stack:	given the usual entry alignment (%rsp = 8 mod 16), %rsp is
 *		16-byte aligned at both calls: five 8-byte pushes are live at
 *		the strlen call, three at the memcpy call.
 */
ARCHENTRY(__strlcpy, scalar)
	push	%rbp		# establish stack frame
	mov	%rsp, %rbp
	push	%rsi		# src; lives at -8(%rbp), reloaded after strlen
	push	%rbx		# callee-saved, will hold the return value
	push	%rdi		# dst
	push	%rdx		# len
	mov	%rsi, %rdi
	call	CNAME(strlen)	# strlen(src)
	pop	%rdx		# len
	pop	%rdi		# dst
	mov	-8(%rbp), %rsi	# src
	mov	%rax, %rbx	# remember string length for return value
	sub	$1, %rdx	# do not copy into the final byte of the buffer
	jc	0f		# skip copying altogether if buffer was empty
	cmp	%rax, %rdx	# more room than there is string?
	cmova	%rax, %rdx	# rdx = min(len - 1, strlen(src))
	movb	$0, (%rdi, %rdx, 1) # NUL-terminate output buffer
	call	CNAME(memcpy)	# memcpy(dst, src, rdx)
0:	mov	%rbx, %rax	# restore return value
	pop	%rbx
	leave			# also drops the saved src slot
	ret
ARCHEND(__strlcpy, scalar)

/*
 * size_t __strlcpy(char *dst, const char *src, size_t len)
 * baseline variant, working on 16-byte chunks in XMM registers
 *
 * In:		%rdi = dst, %rsi = src, %rdx = len
 * Out:		%rax = strlen(src); if len != 0, dst holds the first
 *		min(len - 1, strlen(src)) bytes of src followed by a NUL byte
 * Scratch:	%rcx, %rdx, %rsi, %rdi, %r8, %r9, %xmm0--%xmm3, flags
 * Stack:	unused; len == 0 is handled by a tail jump to strlen()
 *
 * Register roles once the entry sequence has run:
 *	%r9	src as passed in
 *	%rsi	src rounded down to a 16-byte boundary, later advanced
 *		chunk by chunk
 *	%ecx	src & 0xf: bytes of the first chunk that precede the string
 *	%rdx	len - 1, the most bytes that may be copied; counted down
 *		as chunks are consumed
 *	%xmm1	zero between its uses as NUL-comparison scratch
 *
 * src is scanned through aligned 16-byte chunks only, and a chunk is
 * loaded only after the preceding one was found to hold no NUL byte.
 * Every tzcnt operand below has at least one bit set.
 */
ARCHENTRY(__strlcpy, baseline)
	sub		$1, %rdx		# do not count NUL byte in buffer length
	jb		.L0			# go to special code path if len was 0

	/* check the aligned chunk holding the string head for a NUL byte */
	mov		%esi, %ecx
	pxor		%xmm1, %xmm1
	mov		%rsi, %r9		# stash a copy of the source pointer for later
	and		$~0xf, %rsi		# round src down to a 16-byte boundary
	pcmpeqb		(%rsi), %xmm1		# NUL found in head?
	mov		$-1, %r8d
	and		$0xf, %ecx		# misalignment of src
	shl		%cl, %r8d		# mask of bytes in the string
	pmovmskb	%xmm1, %eax
	and		%r8d, %eax		# drop matches in front of the string
	jnz		.Lhead_nul

	movdqa		16(%rsi), %xmm3		# load second string chunk
	movdqu		(%r9), %xmm2		# load unaligned string head
	mov		$32, %r8d
	sub		%ecx, %r8d		# head length + length of second chunk
	pxor		%xmm1, %xmm1
	pcmpeqb		%xmm3, %xmm1		# NUL found in second chunk?

	sub		%r8, %rdx		# enough space left for the second chunk?
	jbe		.Lhead_buf_end

	/* process second chunk */
	pmovmskb	%xmm1, %eax
	test		%eax, %eax
	jnz		.Lsecond_nul

	/*
	 * string didn't end in second chunk and neither did buffer -- not a runt!
	 *
	 * After the two subtractions below %rdi holds dst - src, so that
	 * (%rsi, %rdi) is the destination of the source byte at (%rsi).
	 */
	movdqa		32(%rsi), %xmm0		# load next string chunk
	pxor		%xmm1, %xmm1
	movdqu		%xmm2, (%rdi)		# deposit head into buffer
	sub		%rcx, %rdi		# adjust RDI to correspond to RSI
	movdqu		%xmm3, 16(%rdi)		# deposit second chunk
	sub		%rsi, %rdi		# express RDI as distance from RSI
	add		$32, %rsi		# advance RSI past first two chunks
	sub		$16, %rdx		# enough left for another round?
	jbe		1f

	/*
	 * main loop unrolled twice
	 *
	 * At label 0: %xmm0 is the chunk at (%rsi), %xmm1 is zero and %rdx is
	 * the buffer space available from this chunk on, less 16.
	 */
	ALIGN_TEXT
0:	pcmpeqb		%xmm0, %xmm1		# NUL byte encountered?
	pmovmskb	%xmm1, %eax
	test		%eax, %eax
	jnz		3f

	movdqu		%xmm0, (%rsi, %rdi)
	movdqa		16(%rsi), %xmm0		# load next string chunk
	pxor		%xmm1, %xmm1
	cmp		$16, %rdx		# more than a full chunk left?
	jbe		2f

	add		$32, %rsi		# advance pointers to next chunk
	pcmpeqb		%xmm0, %xmm1		# NUL byte encountered?
	pmovmskb	%xmm1, %eax
	test		%eax, %eax
	jnz		4f

	movdqu		%xmm0, -16(%rsi, %rdi)
	movdqa		(%rsi), %xmm0		# load next string chunk
	pxor		%xmm1, %xmm1
	sub		$32, %rdx
	ja		0b

1:	sub		$16, %rsi		# undo second advancement
	add		$16, %edx		# back to 1--16 bytes of buffer left

	/*
	 * 1--16 bytes left in the buffer but string has not ended yet
	 *
	 * Here %xmm0 is the chunk at 16(%rsi), %edx the buffer space left
	 * for it and %xmm1 is zero.  The final 16 output bytes are copied with
	 * one unaligned load/store pair ending where the output ends; it
	 * overlaps bytes that were already stored.
	 */
2:	pcmpeqb		%xmm1, %xmm0		# NUL byte encountered?
	pmovmskb	%xmm0, %r8d
	mov		%r8d, %eax		# keep the plain NUL mask for the length search
	bts		%edx, %r8d		# treat end of buffer as end of string
	tzcnt		%r8d, %r8d		# find tail length
	add		%rsi, %rdi		# restore RDI
	movdqu		(%rsi, %r8, 1), %xmm0	# load string tail
	movdqu		%xmm0, (%rdi, %r8, 1)	# store string tail
	movb		$0, 16(%rdi, %r8, 1)	# NUL terminate

	/* continue to find the end of the string */
	test		%eax, %eax		# end of string already reached?
	jnz		1f

	/* copying is done; scan two chunks per round for the return value */
	ALIGN_TEXT
0:	pcmpeqb		32(%rsi), %xmm1
	pmovmskb	%xmm1, %eax
	pxor		%xmm1, %xmm1
	test		%eax, %eax
	jnz		2f

	pcmpeqb		48(%rsi), %xmm1
	pmovmskb	%xmm1, %eax
	add		$32, %rsi
	pxor		%xmm1, %xmm1
	test		%eax, %eax
	jz		0b

1:	sub		$16, %rsi		# undo second advancement
2:	tzcnt		%eax, %eax		# where is the NUL byte?
	sub		%r9, %rsi
	lea		32(%rsi, %rax, 1), %rax	# return string length
	ret

4:	sub		$16, %rsi		# undo second advancement
	add		$16, %rdx		# restore number of remaining bytes

	/*
	 * string has ended but buffer has not
	 *
	 * %eax is the NUL mask of the chunk at (%rsi).  The 16 bytes ending
	 * with the NUL byte are copied in one go, overlapping bytes that were
	 * already stored.
	 */
3:	tzcnt		%eax, %eax		# find length of string tail
	movdqu		-15(%rsi, %rax, 1), %xmm0 # load string tail (incl. NUL)
	add		%rsi, %rdi		# restore destination pointer
	movdqu		%xmm0, -15(%rdi, %rax, 1) # store string tail (incl. NUL)
	sub		%r9, %rsi		# string length to current chunk
	add		%rsi, %rax		# plus length of current chunk
	ret

	/*
	 * The buffer ends within the first two chunks and the head holds no
	 * NUL byte.  %xmm1 is the NUL comparison result of the second chunk.
	 * Bit positions in %r8 count bytes from the aligned %rsi.
	 */
.Lhead_buf_end:
	pmovmskb	%xmm1, %r8d
	add		$32, %edx		# restore edx to (len-1) + ecx
	mov		%r8d, %eax
	shl		$16, %r8d		# place 2nd chunk NUL mask into bits 16--31
	bts		%rdx, %r8		# treat end of buffer as end of string
	tzcnt		%r8, %rdx		# find string/buffer len from alignment boundary
	sub		%ecx, %edx		# find actual string/buffer len
	movb		$0, (%rdi, %rdx, 1)	# write NUL terminator

	/* continue to find the end of the string */
	test		%eax, %eax		# end of string already reached?
	jnz		1f

	ALIGN_TEXT
0:	pcmpeqb		32(%rsi), %xmm1
	pmovmskb	%xmm1, %eax
	pxor		%xmm1, %xmm1
	test		%eax, %eax
	jnz		2f

	pcmpeqb		48(%rsi), %xmm1
	pmovmskb	%xmm1, %eax
	add		$32, %rsi
	pxor		%xmm1, %xmm1
	test		%eax, %eax
	jz		0b

1:	sub		$16, %rsi		# undo second advancement
2:	tzcnt		%eax, %eax		# where is the NUL byte?
	sub		%r9, %rsi
	lea		32(%rsi, %rax, 1), %rax	# return string length
	jmp		.L0031			# rax is final; copy the rdx bytes

	/* NUL byte in the second chunk, buffer reaches beyond the second chunk */
.Lsecond_nul:
	add		%r8, %rdx		# restore buffer length
	tzcnt		%eax, %eax		# where is the NUL byte?
	lea		-16(%rcx), %r8d
	sub		%r8d, %eax		# string length
	cmp		%rax, %rdx		# is the string shorter than the buffer?
	cmova		%rax, %rdx		# copy only min(buflen, srclen) bytes
	movb		$0, (%rdi, %rdx, 1)	# write NUL terminator
.L0031:	cmp		$16, %rdx		# at least 16 bytes to copy (not incl NUL)?
	jb		.L0015

	/*
	 * copy 16--32 bytes with two possibly overlapping 16-byte moves
	 * (32 can only occur when coming from .Lhead_buf_end with aligned src)
	 */
	movdqu		(%r9), %xmm0		# load first 16 bytes
	movdqu		-16(%r9, %rdx, 1), %xmm1 # load last 16 bytes
	movdqu		%xmm0, (%rdi)
	movdqu		%xmm1, -16(%rdi, %rdx, 1)
	ret

.Lhead_nul:
	tzcnt		%eax, %eax		# where is the NUL byte?
	sub		%ecx, %eax		# ... from the beginning of the string?
	cmp		%rax, %rdx		# is the string shorter than the buffer?
	cmova		%rax, %rdx		# copy only min(buflen, srclen) bytes
	movb		$0, (%rdi, %rdx, 1)	# write NUL terminator

	/*
	 * process strings of 0--15 bytes (rdx: min(buflen, srclen), rax: srclen)
	 *
	 * Each size class copies its first and last 2/4/8 bytes with two
	 * possibly overlapping moves.  None of the paths below modifies %rax.
	 */
.L0015:	cmp		$8, %rdx		# at least 8 bytes to copy?
	jae		.L0815

	cmp		$4, %rdx		# at least 4 bytes to copy?
	jae		.L0407

	cmp		$2, %rdx		# at least 2 bytes to copy?
	jae		.L0203

	/* 0 or 1 bytes: the byte store may hit dst[0], so redo the NUL */
	movzbl		(%r9), %ecx		# load first byte from src
	mov		%cl, (%rdi)		# deposit into destination
	movb		$0, (%rdi, %rdx, 1)	# add NUL terminator (again)
	ret

.L0203:	movzwl		(%r9), %ecx
	movzwl		-2(%r9, %rdx, 1), %esi
	mov		%cx, (%rdi)
	mov		%si, -2(%rdi, %rdx, 1)
	ret

.L0407:	mov		(%r9), %ecx
	mov		-4(%r9, %rdx, 1), %esi
	mov		%ecx, (%rdi)
	mov		%esi, -4(%rdi, %rdx, 1)
	ret

.L0815:	mov		(%r9), %rcx
	mov		-8(%r9, %rdx, 1), %rsi
	mov		%rcx, (%rdi)
	mov		%rsi, -8(%rdi, %rdx, 1)
	ret

	/* length zero destination: just return the string length */
.L0:	mov		%rsi, %rdi
	jmp		CNAME(strlen)
ARCHEND(__strlcpy, baseline)

281*74d6cfadSRobert Clausecker	.section .note.GNU-stack,"",%progbits
282