xref: /freebsd/lib/libc/amd64/string/stpncpy.S (revision 90253d49db09a9b1490c448d05314f3e4bbfa468)
1*90253d49SRobert Clausecker/*
2*90253d49SRobert Clausecker * Copyright (c) 2023 The FreeBSD Foundation
3*90253d49SRobert Clausecker *
4*90253d49SRobert Clausecker * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
5*90253d49SRobert Clausecker * under sponsorship from the FreeBSD Foundation.
6*90253d49SRobert Clausecker *
7*90253d49SRobert Clausecker * Redistribution and use in source and binary forms, with or without
8*90253d49SRobert Clausecker * modification, are permitted provided that the following conditions
9*90253d49SRobert Clausecker * are met:
10*90253d49SRobert Clausecker * 1. Redistributions of source code must retain the above copyright
11*90253d49SRobert Clausecker *    notice, this list of conditions and the following disclaimer.
12*90253d49SRobert Clausecker * 2. Redistributions in binary form must reproduce the above copyright
13*90253d49SRobert Clausecker *    notice, this list of conditions and the following disclaimer in the
14*90253d49SRobert Clausecker *    documentation and/or other materials provided with the distribution.
15*90253d49SRobert Clausecker *
16*90253d49SRobert Clausecker * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
17*90253d49SRobert Clausecker * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18*90253d49SRobert Clausecker * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19*90253d49SRobert Clausecker * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20*90253d49SRobert Clausecker * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21*90253d49SRobert Clausecker * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22*90253d49SRobert Clausecker * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23*90253d49SRobert Clausecker * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24*90253d49SRobert Clausecker * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25*90253d49SRobert Clausecker * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26*90253d49SRobert Clausecker * SUCH DAMAGE
27*90253d49SRobert Clausecker */
28*90253d49SRobert Clausecker
29*90253d49SRobert Clausecker#include <machine/asm.h>
30*90253d49SRobert Clausecker
31*90253d49SRobert Clausecker#include "amd64_archlevel.h"
32*90253d49SRobert Clausecker
/* pad with 0x90 bytes up to the next 2^4 = 16 byte boundary; used in front of loop heads */
#define ALIGN_TEXT	.p2align 4, 0x90
34*90253d49SRobert Clausecker
	/*
	 * stpncpy is exported as a weak symbol with the same value as
	 * __stpncpy.  The ARCHFUNCS table names the two implementations
	 * defined in this file (scalar and baseline); the macros come
	 * from amd64_archlevel.h -- presumably they drive the choice of
	 * implementation, see that header for the exact mechanism.
	 */
	.weak stpncpy
	.set stpncpy, __stpncpy
ARCHFUNCS(__stpncpy)
	ARCHFUNC(__stpncpy, scalar)
	ARCHFUNC(__stpncpy, baseline)
ENDARCHFUNCS(__stpncpy)
41*90253d49SRobert Clausecker
/*
 * stpncpy(char *restrict rdi, const char *rsi, size_t rdx), scalar variant.
 *
 * Built from memchr, memcpy and memset:
 *   - look for a NUL byte in the first len bytes of src;
 *   - if there is none, copy len bytes and return dest + len;
 *   - otherwise copy the bytes before the NUL, zero the rest of the
 *     buffer with memset and return the address of the first zero
 *     byte (memset's return value).
 *
 * Frame layout (relative to %rbp):
 *    -8	len	(later: bytes left to clear)
 *   -16	dest	(later: dest + string length)
 *   -24	src
 *   -32	padding so that %rsp is 16-byte aligned at the first call
 */
ARCHENTRY(__stpncpy, scalar)
	push	%rbp			# establish stack frame
	mov	%rsp, %rbp

	push	%rdx			# save len
	push	%rdi			# save dest
	push	%rsi			# save src
	sub	$8, %rsp		# alignment padding

	mov	%rsi, %rdi
	xor	%esi, %esi
	call	CNAME(__memchr)		# rax = memchr(src, '\0', len)

	mov	-24(%rbp), %rsi		# src
	mov	-16(%rbp), %rdi		# dest
	lea	-16(%rbp), %rsp		# release padding and src slot

	test	%rax, %rax		# NUL found?
	jnz	.Lhavenul

	/* no NUL within len bytes: plain copy of the whole buffer */
	mov	-8(%rbp), %rdx
	call	CNAME(memcpy)		# rax = dest
	add	-8(%rbp), %rax		# return dest + len
	leave
	ret

	/* NUL found: copy the string, then zero what is left of dest */
.Lhavenul:
	sub	%rsi, %rax		# rax = number of bytes before the NUL
	mov	%rax, %rdx
	add	%rdx, -16(%rbp)		# saved dest now points behind the copy
	sub	%rdx, -8(%rbp)		# saved len is now the count left to clear
	call	CNAME(memcpy)

	pop	%rdi			# dest + string length
	pop	%rdx			# remaining length
	xor	%esi, %esi
	pop	%rbp
	jmp	CNAME(memset)		# tail call: clear remaining buffer
ARCHEND(__stpncpy, scalar)
80*90253d49SRobert Clausecker
	/*
	 * Mask table: 16 bytes of 0xff followed by 16 bytes of 0x00.
	 * A 16-byte load from .Lmask+n (0 <= n <= 16) yields a mask of
	 * 16-n 0xff bytes followed by n 0x00 bytes.  The baseline
	 * implementation loads from .Lmask+16-k to keep the first k
	 * bytes of a chunk and zero the others.
	 */
	.section	.rodata
.Lmask:	.quad		0xffffffffffffffff
	.quad		0xffffffffffffffff
	.quad		0x0000000000000000
	.quad		0x0000000000000000
90*90253d49SRobert Clausecker
/*
 * stpncpy(char *restrict rdi, const char *rsi, size_t rdx), SSE2 variant.
 *
 * Copies at most rdx bytes of the string at rsi to rdi, fills the rest
 * of the rdx byte destination with zero bytes and returns a pointer to
 * the first zero byte written, or rdi + rdx if the string did not end
 * within rdx bytes.
 *
 * The source is read in aligned 16-byte chunks.  An aligned chunk may be
 * read only if at least one of its bytes lies inside the source buffer
 * and before the end of the string; otherwise the read could touch an
 * unmapped page.
 *
 * Leaf function: the 32 byte bounce buffer lives below %rsp in the red
 * zone; bounce(%rsp) is 16-byte aligned given the ABI stack alignment
 * at function entry.
 *
 * Register use after the prologue:
 *   rsi	source, rounded down to a 16-byte boundary
 *   rcx	source misalignment (0--15)
 *   rdi	destination minus rcx, so rdi+i corresponds to rsi+i
 *   r10	buffer length counted from the alignment boundary; in the
 *		main loop: bytes of buffer behind the current chunk
 *   r8d	bit mask of NUL bytes in the current chunk
 *   r9d	bit mask of the head bytes belonging to the string
 *   xmm1	all zeroes (until the main loop is entered)
 */
ARCHENTRY(__stpncpy, baseline)
#define bounce		(-3*16-8)		/* location of on-stack bounce buffer */

	test		%rdx, %rdx		# no bytes to copy?
	jz		.L0

	mov		%esi, %ecx
	and		$~0xf, %rsi		# align source to 16 bytes
	movdqa		(%rsi), %xmm0		# load head
	and		$0xf, %ecx		# offset from alignment
	mov		$-1, %r9d
	lea		-32(%rcx), %rax		# set up overflow-proof comparison rdx+rcx<32
	shl		%cl, %r9d		# mask of bytes belonging to the string
	sub		%rcx, %rdi		# adjust RDI to correspond to RSI
	pxor		%xmm1, %xmm1
	movdqa		%xmm0, bounce(%rsp)	# stash copy of head on the stack
	pcmpeqb		%xmm1, %xmm0
	pmovmskb	%xmm0, %r8d

	lea		(%rdx, %rcx, 1), %r10	# buffer length from alignment boundary
	add		%rdx, %rax		# less than 2 chunks (32 bytes) to play with?
	jnc		.Lrunt			# if yes, use special runt processing

	movdqu		%xmm1, -16(%rdi, %r10, 1) # clear final bytes of destination
	and		%r9d, %r8d		# end of string within head?
	jnz		.Lheadnul

	movdqu		(%rsi, %rcx, 1), %xmm2	# load head from source buffer
	movdqu		%xmm2, (%rdi, %rcx, 1)	# and deposit

	add		$16, %rsi
	add		$16, %rdi
	sub		$32, %r10		# bytes left behind the chunk at RSI (may be 0)

	/*
	 * main loop unrolled twice
	 *
	 * On entry to 0: the chunk at RSI lies entirely within the
	 * buffer and R10 is the number of buffer bytes following it.
	 */
	ALIGN_TEXT
0:	movdqa		(%rsi), %xmm0
	pxor		%xmm1, %xmm1
	pcmpeqb		%xmm0, %xmm1		# NUL byte encountered?
	pmovmskb	%xmm1, %r8d
	test		%r8d, %r8d
	jnz		3f

	movdqu		%xmm0, (%rdi)
	cmp		$16, %r10		# more than a full chunk left?
	jbe		1f

	movdqa		16(%rsi), %xmm0
	add		$32, %rdi		# advance pointers to next chunk
	add		$32, %rsi
	pxor		%xmm1, %xmm1
	pcmpeqb		%xmm0, %xmm1		# NUL byte encountered?
	pmovmskb	%xmm1, %r8d
	test		%r8d, %r8d
	jnz		2f

	movdqu		%xmm0, -16(%rdi)
	sub		$32, %r10		# more than another full chunk left?
	ja		0b

	sub		$16, %rdi		# undo second advancement
	sub		$16, %rsi
	add		$16, %r10d		# restore number of remaining bytes

	/*
	 * 0--16 bytes left but string has not ended yet
	 *
	 * R10 is 0 only if the buffer ends exactly with the chunk just
	 * stored (buffer length from the alignment boundary is 32).  In
	 * that case the next chunk lies entirely outside of the source
	 * buffer and must not be read.
	 */
1:	lea		16(%rdi), %rax		# return value if nothing is left
	test		%r10d, %r10d		# buffer exhausted?
	jz		4f			# then we are done

	pxor		%xmm1, %xmm1
	pcmpeqb		16(%rsi), %xmm1		# NUL byte in source tail?
	pmovmskb	%xmm1, %r8d
	bts		%r10d, %r8d		# treat end of buffer as NUL
	tzcnt		%r8d, %r8d		# where is the NUL byte?
	movdqu		(%rsi, %r8, 1), %xmm0	# load source tail before NUL
	lea		16(%rdi, %r8, 1), %rax	# point return value to NUL byte
						# or end of buffer
	movdqu		%xmm0, (%rdi, %r8, 1)	# store tail into the buffer
4:	ret

2:	sub		$16, %rdi		# undo second advancement
	sub		$16, %rsi
	sub		$16, %r10

	/* string has ended and buffer has not */
3:	tzcnt		%r8d, %r8d		# where did the string end?
	lea		.Lmask+16(%rip), %rcx
	lea		(%rdi, %r8, 1), %rax 	# where the NUL byte will be
	neg		%r8
	movdqu		(%rcx, %r8, 1), %xmm1	# mask with FF where the string is,
						# 00 where it is not
	pand		%xmm1, %xmm0		# mask out bytes after the string
	movdqu		%xmm0, (%rdi)	 	# store masked current chunk
	pxor		%xmm1, %xmm1
	sub		$16, %r10		# another full chunk left?
	jbe		1f

	/* clear remaining destination buffer (tail has been cleared earlier) */
	ALIGN_TEXT
0:	movdqu		%xmm1, 16(%rdi)
	cmp		$16, %r10
	jbe		1f

	movdqu		%xmm1, 32(%rdi)
	add		$32, %rdi
	sub		$32, %r10
	ja		0b

1:	ret

	/* at least two chunks to play with and NUL while processing head */
.Lheadnul:
	movdqu		bounce(%rsp, %rcx, 1), %xmm0 # load start of source from stack
	tzcnt		%r8d, %r8d		# find location of NUL byte
	movdqu		%xmm0, (%rdi, %rcx, 1)	# deposit head in the destination
	movdqu		%xmm1, (%rdi, %r8, 1)	# clear out following bytes
	movdqu		%xmm1, 16(%rdi)		# clear out second chunk
	lea		(%rdi, %r8, 1), %rax	# make RAX point to the NUL byte

	add		$32, %rdi		# advance past first two chunks
	sub		$32+16, %r10		# advance past first three chunks
	jbe		1f			# did we pass the end of the buffer?

	/* clear remaining destination buffer (tail has been cleared earlier) */
	ALIGN_TEXT
0:	movdqu		%xmm1, (%rdi)		# clear out buffer chunk
	cmp		$16, %r10
	jbe		1f

	movdqu		%xmm1, 16(%rdi)
	add		$32, %rdi
	sub		$32, %r10
	ja		0b

1:	ret

	/*
	 * 1--31 bytes from the alignment boundary to the end of the
	 * buffer: bounce through the stack
	 */
.Lrunt:	movdqa		%xmm1, bounce+16(%rsp)	# clear out rest of on-stack copy
	bts		%r10d, %r8d		# treat end of buffer as end of string
	and		%r9d, %r8d		# ignore NUL bytes in front of the string
	test		$0x1ffff, %r8d		# end of string within first chunk or
						# end of buffer at most at its end?
	jnz		0f			# if yes, do not inspect second chunk
						# (it may lie outside of the source)

	movdqa		16(%rsi), %xmm0		# load second chunk of input
	movdqa		%xmm0, bounce+16(%rsp)	# stash copy on stack
	pcmpeqb		%xmm1, %xmm0		# NUL in second chunk?
	pmovmskb	%xmm0, %r9d
	shl		$16, %r9d
	or		%r9d, %r8d		# merge found NUL bytes into NUL mask

	/* end of string after one buffer */
0:	tzcnt		%r8d, %r8d		# location of last char in string
	movdqu		%xmm1, bounce(%rsp, %r8, 1) # clear bytes behind string
	lea		bounce(%rsp, %rcx, 1), %rsi # start of string copy on stack
	lea		(%rdi, %r8, 1), %rax	# return pointer to NUL byte

	cmp		$16, %edx		# at least 16 bytes to transfer?
	jae		.L1631

	mov		(%rsi), %r8		# load string head
	cmp		$8, %edx		# at least 8 bytes to transfer?
	jae		.L0815

	cmp		$4, %edx		# at least 4 bytes to transfer?
	jae		.L0407

	movzwl		-2(%rsi, %rdx, 1), %esi	# load last two bytes of string
	mov		%r8b, (%rdi, %rcx, 1)	# store first byte

	cmp		$2, %edx		# at least 2 bytes to transfer?
	jb		.L1

	mov		%si, -2(%rdi, %r10, 1)	# store last two bytes of string
.L1:	ret

	/* 16--31 bytes: two possibly overlapping 16 byte moves */
.L1631:	movdqu		(%rsi), %xmm0		# load first 16 bytes of string
	movdqu		-16(%rsi, %rdx, 1), %xmm1 # load last 16 bytes of string
	movdqu		%xmm0, (%rdi, %rcx, 1)
	movdqu		%xmm1, -16(%rdi, %r10, 1)
	ret

	/* 8--15 bytes: two possibly overlapping 8 byte moves */
.L0815:	mov		-8(%rsi, %rdx, 1), %rdx	# load last 8 bytes of string
	mov		%r8, (%rdi, %rcx, 1)
	mov		%rdx, -8(%rdi, %r10, 1)
	ret

	/* 4--7 bytes: two possibly overlapping 4 byte moves */
.L0407:	mov		-4(%rsi, %rdx, 1), %edx	# load last four bytes of string
	mov		%r8d, (%rdi, %rcx, 1)
	mov		%edx, -4(%rdi, %r10, 1)
	ret

	/* length 0 buffer: just return dest */
.L0:	mov		%rdi, %rax
	ret
ARCHEND(__stpncpy, baseline)
282*90253d49SRobert Clausecker
	/* empty .note.GNU-stack section: this object does not need an executable stack */
	.section .note.GNU-stack,"",%progbits
284