xref: /freebsd/lib/libc/amd64/string/stpncpy.S (revision 4f35a84b32412f5cf54e08cd97cd6eee407fb30e)
1/*
2 * Copyright (c) 2023 The FreeBSD Foundation
3 *
4 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
5 * under sponsorship from the FreeBSD Foundation.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE
27 */
28
29#include <machine/asm.h>
30
31#include "amd64_archlevel.h"
32
/* align hot code labels to 16 bytes, padding with NOPs (0x90) */
33#define ALIGN_TEXT	.p2align 4, 0x90
34
/* export stpncpy as a weak alias for __stpncpy */
35	.weak stpncpy
36	.set stpncpy, __stpncpy
/*
 * Architecture-level dispatch table (see amd64_archlevel.h): one entry
 * per implementation variant of __stpncpy.  Only the scalar variant is
 * currently enabled; the SSE baseline variant below is disabled pending
 * resolution of PR 291720.
 */
37ARCHFUNCS(__stpncpy)
38	ARCHFUNC(__stpncpy, scalar)
39#if 0 /* temporarily disabled cf. PR 291720 */
40	ARCHFUNC(__stpncpy, baseline)
41#endif
42ENDARCHFUNCS(__stpncpy)
43
/*
 * char *stpncpy(char *restrict dst, const char *restrict src, size_t len)
 * Scalar variant: delegates all work to memchr, memcpy, and memset.
 *
 * Stack frame layout (after the four pushes below):
 *	-8(%rbp)	len	(later shortened by strlen(src) if NUL found)
 *	-16(%rbp)	dst	(later advanced by strlen(src) if NUL found)
 *	-24(%rbp)	src
 *	-32(%rbp)	scratch; keeps %rsp 16-byte aligned at call sites
 */
44ARCHENTRY(__stpncpy, scalar)
45	push	%rbp		# establish stack frame
46	mov	%rsp, %rbp
47
48	push	%rdx
49	push	%rdi
50	push	%rsi
51	push	%rax		# dummy push for alignment
52
	/* locate the NUL terminator, if any, within the first len bytes */
53	mov	%rsi, %rdi
54	xor	%esi, %esi
55	call	CNAME(__memchr)	# memchr(src, '\0', len)
56	pop	%rcx		# dummy pop
57	pop	%rsi
58	mov	-16(%rbp), %rdi
59
60	test	%rax, %rax	# NUL found?
61	jz	.Lfullcopy
62
63	mov	%rax, %rdx
64	sub	%rsi, %rdx	# copy until the NUL byte
65	add	%rdx, -16(%rbp)	# advance destination by string length
66	sub	%rdx, -8(%rbp)	# and shorten buffer size by string length
67	call	CNAME(memcpy)
68
	/*
	 * Tail call: memset(dst + strlen, 0, len - strlen) NUL-pads the
	 * rest of the buffer and returns its first argument, which is
	 * exactly the value stpncpy must return: dst + strlen(src).
	 */
69	pop	%rdi
70	pop	%rdx
71	xor	%esi, %esi
72	pop	%rbp
73	jmp	CNAME(memset)	# clear remaining buffer
74
	/* no NUL within len bytes: copy len bytes and return dst + len */
75.Lfullcopy:
76	mov	-8(%rbp), %rdx
77	call	CNAME(memcpy)	# copy whole string
78	add	-8(%rbp), %rax	# point to dest[n]
79	leave
80	ret
81ARCHEND(__stpncpy, scalar)
82
	/*
	 * Sliding mask: an unaligned 16-byte load from .Lmask+n
	 * (0 <= n <= 16) yields 16-n bytes of 0xff followed by n bytes
	 * of 0x00.  The baseline code PANDs a chunk with such a mask to
	 * zero out the bytes that lie past the end of the string.
	 */
87	.section	.rodata
88.Lmask:	.quad		0xffffffffffffffff
89	.quad		0xffffffffffffffff
90	.quad		0x0000000000000000
91	.quad		0x0000000000000000
92
93/* stpncpy(char *restrict rdi, const char *rsi, size_t rdx) */
94ARCHENTRY(__stpncpy, baseline)
/*
 * NOTE(review): this variant is currently disabled in the dispatch table
 * above (cf. PR 291720); re-verify its behavior before re-enabling.
 *
 * Strategy: round the source pointer down to a 16-byte boundary so every
 * source chunk can be fetched with an aligned load (an aligned 16-byte
 * load never crosses a page boundary, so reading up to 15 bytes around
 * the string is safe).  Register roles:
 *	%rsi	source, rounded down to 16-byte alignment
 *	%rcx	original misalignment of the source (0--15)
 *	%rdi	destination, biased down by %rcx so it tracks %rsi
 *	%rdx	original buffer length n
 *	%r10	buffer length measured from the alignment boundary (n+%rcx)
 *	%r8	bit mask of NUL-byte positions in the current chunk(s)
 *	%r9d	bit mask of head bytes at/after the start of the string
 * The head chunk is stashed in a bounce buffer located in the red zone
 * (this function makes no calls, so the red zone is safe to use).
 */
95#define bounce		(-3*16-8)		/* location of on-stack bounce buffer */
96
97	test		%rdx, %rdx		# no bytes to copy?
98	jz		.L0
99
100	mov		%esi, %ecx
101	and		$~0xf, %rsi		# align source to 16 bytes
102	movdqa		(%rsi), %xmm0		# load head
103	and		$0xf, %ecx		# offset from alignment
104	mov		$-1, %r9d
105	lea		-33(%rcx), %rax		# set up overflow-proof comparison rdx+rcx<=32
106	shl		%cl, %r9d		# mask of bytes belonging to the string
107	sub		%rcx, %rdi		# adjust RDI to correspond to RSI
108	pxor		%xmm1, %xmm1
109	movdqa		%xmm0, bounce(%rsp)	# stash copy of head on the stack
110	pcmpeqb		%xmm1, %xmm0
111	pmovmskb	%xmm0, %r8d
112
113	lea		(%rdx, %rcx, 1), %r10	# buffer length from alignment boundary
114	add		%rdx, %rax		# less than 2 chunks (32 bytes) to play with?
115	jnc		.Lrunt			# if yes, use special runt processing
116
117	movdqu		%xmm1, -16(%rdi, %r10, 1) # clear final bytes of destination
118	and		%r9d, %r8d		# end of string within head?
119	jnz		.Lheadnul
120
	/* no NUL in the head: copy it with unaligned loads/stores */
121	movdqu		(%rsi, %rcx, 1), %xmm2	# load head from source buffer
122	movdqu		%xmm2, (%rdi, %rcx, 1)	# and deposit it there
123
124	add		$16, %rsi
125	add		$16, %rdi
126	sub		$32, %r10
127
128	/* main loop unrolled twice */
129	ALIGN_TEXT
1300:	movdqa		(%rsi), %xmm0
131	pxor		%xmm1, %xmm1
132	pcmpeqb		%xmm0, %xmm1		# NUL byte encountered?
133	pmovmskb	%xmm1, %r8d
134	test		%r8d, %r8d
135	jnz		3f
136
137	movdqu		%xmm0, (%rdi)
138	cmp		$16, %r10		# more than a full chunk left?
139	jbe		1f
140
141	movdqa		16(%rsi), %xmm0
142	add		$32, %rdi		# advance pointers to next chunk
143	add		$32, %rsi
144	pxor		%xmm1, %xmm1
145	pcmpeqb		%xmm0, %xmm1		# NUL byte encountered?
146	pmovmskb	%xmm1, %r8d
147	test		%r8d, %r8d
148	jnz		2f
149
150	movdqu		%xmm0, -16(%rdi)
151	sub		$32, %r10		# more than another full chunk left?
152	ja		0b
153
154	sub		$16, %rdi		# undo second advancement
155	sub		$16, %rsi
156	add		$16, %r10d		# restore number of remaining bytes
157
158	/* 1--16 bytes left but string has not ended yet */
1591:	pxor		%xmm1, %xmm1
160	pcmpeqb		16(%rsi), %xmm1		# NUL byte in source tail?
161	pmovmskb	%xmm1, %r8d
162	bts		%r10d, %r8d		# treat end of buffer as NUL
163	tzcnt		%r8d, %r8d		# where is the NUL byte?
164	movdqu		(%rsi, %r8, 1), %xmm0	# load source tail before NUL
165	lea		16(%rdi, %r8, 1), %rax	# point return value to NUL byte
166						# or end of buffer
167	movdqu		%xmm0, (%rdi, %r8, 1)	# store tail into the buffer
168	ret
169
1702:	sub		$16, %rdi		# undo second advancement
171	sub		$16, %rsi
172	sub		$16, %r10
173
	/* 3: NUL found in the current chunk with buffer space remaining */
174	/* string has ended and buffer has not */
1753:	tzcnt		%r8d, %r8d		# where did the string end?
176	lea		.Lmask+16(%rip), %rcx
177	lea		(%rdi, %r8, 1), %rax 	# where the NUL byte will be
178	neg		%r8
179	movdqu		(%rcx, %r8, 1), %xmm1	# mask with FF where the string is,
180						# 00 where it is not
181	pand		%xmm1, %xmm0		# mask out bytes after the string
182	movdqu		%xmm0, (%rdi)	 	# store masked current chunk
183	pxor		%xmm1, %xmm1
184	sub		$16, %r10		# another full chunk left?
185	jbe		1f
186
187	/* clear remaining destination buffer (tail has been cleared earlier) */
188	ALIGN_TEXT
1890:	movdqu		%xmm1, 16(%rdi)
190	cmp		$16, %r10
191	jbe		1f
192
193	movdqu		%xmm1, 32(%rdi)
194	add		$32, %rdi
195	sub		$32, %r10
196	ja		0b
197
1981:	ret
199
200	/* at least two chunks to play with and NUL while processing head */
201.Lheadnul:
202	movdqu		bounce(%rsp, %rcx, 1), %xmm0 # load start of source from stack
203	tzcnt		%r8d, %r8d		# find location of NUL byte
204	movdqu		%xmm0, (%rdi, %rcx, 1)	# deposit head in the destination
205	movdqu		%xmm1, (%rdi, %r8, 1)	# clear out following bytes
206	movdqu		%xmm1, 16(%rdi)		# clear out second chunk
207	lea		(%rdi, %r8, 1), %rax	# make RAX point to the NUL byte
208
209	add		$32, %rdi		# advance past first two chunks
210	sub		$32+16, %r10		# advance past first three chunks
211	jbe		1f			# did we pass the end of the buffer?
212
213	/* clear remaining destination buffer (tail has been cleared earlier) */
214	ALIGN_TEXT
2150:	movdqu		%xmm1, (%rdi)		# clear out buffer chunk
216	cmp		$16, %r10
217	jbe		1f
218
219	movdqu		%xmm1, 16(%rdi)
220	add		$32, %rdi
221	sub		$32, %r10
222	ja		0b
223
2241:	ret
225
226	/* 1--32 bytes to copy, bounce through the stack */
227.Lrunt:	movdqa		%xmm1, bounce+16(%rsp)	# clear out rest of on-stack copy
228	bts		%r10, %r8		# treat end of buffer as end of string
229	and		%r9d, %r8d		# mask out head before string
230	test		$0x1ffff, %r8d		# end of string within first chunk or right after?
231	jnz		0f			# if yes, do not inspect second buffer
232
233	movdqa		16(%rsi), %xmm0		# load second chunk of input
234	movdqa		%xmm0, bounce+16(%rsp)	# stash copy on stack
235	pcmpeqb		%xmm1, %xmm0		# NUL in second chunk?
236	pmovmskb	%xmm0, %r9d
237	shl		$16, %r9d		# second chunk occupies bits 16--31
238	or		%r9d, %r8d		# merge found NUL bytes into NUL mask
239
240	/* end of string after one buffer */
2410:	tzcnt		%r8d, %r8d		# location of last char in string
242	movdqu		%xmm1, bounce(%rsp, %r8, 1) # clear bytes behind string
243	lea		bounce(%rsp, %rcx, 1), %rsi # start of string copy on stack
244	lea		(%rdi, %r8, 1), %rax	# return pointer to NUL byte
245
	/*
	 * Copy the prepared runt from the bounce buffer to the destination,
	 * dispatching on length: .L1631 moves 16+ bytes with two possibly
	 * overlapping SSE stores, .L0815 8--15 bytes, .L0407 4--7 bytes,
	 * and the fall-through path handles 1--3 bytes.
	 */
246	cmp		$16, %edx		# at least 16 bytes to transfer?
247	jae		.L1631
248
249	mov		(%rsi), %r8		# load string head
250	cmp		$8, %edx		# at least 8 bytes to transfer?
251	jae		.L0815
252
253	cmp		$4, %edx		# at least 4 bytes to transfer?
254	jae		.L0407
255
256	movzwl		-2(%rsi, %rdx, 1), %esi	# load last two bytes of string
257	mov		%r8b, (%rdi, %rcx, 1)	# store first byte
258
259	cmp		$2, %edx		# at least 2 bytes to transfer?
260	jb		.L1
261
262	mov		%si, -2(%rdi, %r10, 1)	# store last two bytes of string
263.L1:	ret
264
265.L1631:	movdqu		(%rsi), %xmm0		# load first 16 bytes of string
266	movdqu		-16(%rsi, %rdx, 1), %xmm1 # load last 16 bytes of string
267	movdqu		%xmm0, (%rdi, %rcx, 1)
268	movdqu		%xmm1, -16(%rdi, %r10, 1)
269	ret
270
271.L0815:	mov		-8(%rsi, %rdx, 1), %rdx	# load last 8 bytes of string
272	mov		%r8, (%rdi, %rcx, 1)
273	mov		%rdx, -8(%rdi, %r10, 1)
274	ret
275
276.L0407:	mov		-4(%rsi, %rdx, 1), %edx	# load last four bytes of string
277	mov		%r8d, (%rdi, %rcx, 1)
278	mov		%edx, -4(%rdi, %r10, 1)
279	ret
280
281	/* length 0 buffer: just return dest */
282.L0:	mov		%rdi, %rax
283	ret
284ARCHEND(__stpncpy, baseline)
285
286	.section .note.GNU-stack,"",%progbits
287