xref: /freebsd/lib/libc/amd64/string/memccpy.S (revision 5ca8e32633c4ffbbcd6762e5888b6a4ba0708c6c)
/*
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4, 0x90

	.weak memccpy
	.set memccpy, __memccpy
ARCHFUNCS(__memccpy)
	ARCHFUNC(__memccpy, scalar)
	ARCHFUNC(__memccpy, baseline)
ENDARCHFUNCS(__memccpy)

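/*
 * The scalar implementation is built from the existing __memchr and
 * memcpy routines: __memchr locates the terminator c within the first
 * len bytes of src; memcpy then copies either everything up to and
 * including the terminator (returning a pointer just past it in dest)
 * or, if c was not found, all len bytes (returning NULL).
 */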
ARCHENTRY(__memccpy, scalar)
	push	%rbp			# establish stack frame
	mov	%rsp, %rbp
	push	%rax			# dummy push for alignment
	push	%rbx
	push	%rdi
	push	%rsi

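	/* move arguments into the registers __memchr(src, c, len) expects */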
	mov	%rsi, %rdi
	mov	%edx, %esi
	mov	%rcx, %rdx
	mov	%rcx, %rbx
	call	CNAME(__memchr)		# ptr = memchr(src, c, len)

	pop	%rsi
	pop	%rdi
	lea	1(%rax), %rdx
	sub	%rsi, %rdx		# size = ptr - src + 1
	mov	%rbx, %rcx
	lea	(%rdi, %rdx, 1), %rbx	# res = dest + size
	test	%rax, %rax		# if (ptr == NULL)
	cmovz	%rcx, %rdx		# size = len
	cmovz	%rax, %rbx		# res = NULL
	call	CNAME(memcpy)

	mov	%rbx, %rax		# return (res)
	pop	%rbx
	leave
	ret
ARCHEND(__memccpy, scalar)

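/*
 * The baseline implementation scans the source with SSE2: the terminator
 * c is broadcast to all 16 bytes of %xmm4 and compared against aligned
 * 16 byte chunks of the string with PCMPEQB.  The misaligned head, the
 * case where the buffer ends early, and copies of up to 32 bytes are
 * handled separately; longer strings go through a loop unrolled twice.
 */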
ARCHENTRY(__memccpy, baseline)
	sub		$1, %rcx		# RCX refers to last character in buffer
	jb		.L0			# go to special code path if len was 0

	movd		%edx, %xmm4
	mov		%rcx, %rdx
	punpcklbw	%xmm4, %xmm4		# c -> cc
	mov		%esi, %ecx
	punpcklwd	%xmm4, %xmm4		# cc -> cccc
	mov		%rsi, %r9		# stash a copy of the source pointer for later
	pshufd		$0, %xmm4, %xmm4	# cccc -> cccccccccccccccc
	and		$~0xf, %rsi
	movdqa		%xmm4, %xmm1
	pcmpeqb		(%rsi), %xmm1		# NUL found in head?
	mov		$-1, %r8d
	and		$0xf, %ecx
	shl		%cl, %r8d		# mask of bytes in the string
	pmovmskb	%xmm1, %eax
	and		%r8d, %eax
	jnz		.Lhead_nul

	movdqa		16(%rsi), %xmm3		# load second string chunk
	movdqu		(%r9), %xmm2		# load unaligned string head
	mov		$32, %r8d
	sub		%ecx, %r8d		# head length + length of second chunk
	movdqa		%xmm4, %xmm1
	pcmpeqb		%xmm3, %xmm1		# NUL found in second chunk?

	sub		%r8, %rdx		# enough space left for the second chunk?
	jb		.Lhead_buf_end

	/* process second chunk */
	pmovmskb	%xmm1, %eax
	test		%eax, %eax
	jnz		.Lsecond_nul

	/* string didn't end in second chunk and neither did buffer -- not a runt! */
	movdqa		32(%rsi), %xmm0		# load next string chunk
	movdqa		%xmm4, %xmm1
	movdqu		%xmm2, (%rdi)		# deposit head into buffer
	sub		%rcx, %rdi		# adjust RDI to correspond to RSI
	movdqu		%xmm3, 16(%rdi)		# deposit second chunk
	sub		%rsi, %rdi		# express RDI as distance from RSI
	add		$32, %rsi		# advance RSI past first two chunks
	sub		$16, %rdx		# enough left for another round?
	jb		1f

	/* main loop unrolled twice */
	ALIGN_TEXT
0:	pcmpeqb		%xmm0, %xmm1		# NUL byte encountered?
	pmovmskb	%xmm1, %eax
	test		%eax, %eax
	jnz		3f

	movdqu		%xmm0, (%rsi, %rdi)
	movdqa		16(%rsi), %xmm0		# load next string chunk
	movdqa		%xmm4, %xmm1
	cmp		$16, %rdx		# more than a full chunk left?
	jb		2f

	add		$32, %rsi		# advance pointers to next chunk
	pcmpeqb		%xmm0, %xmm1		# NUL byte encountered?
	pmovmskb	%xmm1, %eax
	test		%eax, %eax
	jnz		4f

	movdqu		%xmm0, -16(%rsi, %rdi)
	movdqa		(%rsi), %xmm0		# load next string chunk
	movdqa		%xmm4, %xmm1
	sub		$32, %rdx
	jae		0b

1:	sub		$16, %rsi		# undo second advancement
	add		$16, %edx

	/* 1--16 bytes left in the buffer but string has not ended yet */
2:	pcmpeqb		%xmm1, %xmm0		# NUL byte encountered?
	pmovmskb	%xmm0, %r8d
	mov		%r8d, %ecx
	bts		%edx, %r8d		# treat end of buffer as end of string
	or		$0x10000, %eax		# ensure TZCNT finds a set bit
	tzcnt		%r8d, %r8d		# find tail length
	add		%rsi, %rdi		# restore RDI
	movdqu		1(%rsi, %r8, 1), %xmm0	# load string tail
	movdqu		%xmm0, 1(%rdi, %r8, 1)	# store string tail
	lea		17(%rdi, %r8, 1), %rsi	# return value if terminator encountered
	xor		%eax, %eax		# return value if no terminator encountered
	bt		%r8d, %ecx		# terminator encountered inside buffer?
	cmovc		%rsi, %rax		# if yes, return pointer, else NULL
	ret

4:	sub		$16, %rsi		# undo second advancement
	add		$16, %rdx		# restore number of remaining bytes

	/* string has ended but buffer has not */
3:	tzcnt		%eax, %eax		# find length of string tail
	movdqu		-15(%rsi, %rax, 1), %xmm0 # load string tail (incl. NUL)
	add		%rsi, %rdi		# restore destination pointer
	movdqu		%xmm0, -15(%rdi, %rax, 1) # store string tail (incl. NUL)
	lea		1(%rdi, %rax, 1), %rax	# compute return value
	ret

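	/* buffer ends within the first two chunks: copy at most 32 bytes */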
.Lhead_buf_end:
	pmovmskb	%xmm1, %r8d
	add		$32, %edx		# restore edx to (len-1) + ecx
	shl		$16, %r8d		# place 2nd chunk NUL mask into bits 16--31
	mov		%r8d, %r10d
	bts		%rdx, %r8		# treat end of buffer as if terminator present
	xor		%eax, %eax		# return value if terminator not found
	tzcnt		%r8, %rdx		# find string/buffer len from alignment boundary
	lea		1(%rdi, %rdx, 1), %r8	# return value if terminator found + rcx
	sub		%rcx, %r8		# subtract rcx
	bt		%rdx, %r10		# was the terminator present?
	cmovc		%r8, %rax		# if yes, return pointer, else NULL
	sub		%ecx, %edx		# find actual string/buffer len
	jmp		.L0132

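	/* terminator found in the second 16 byte chunk */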
.Lsecond_nul:
	add		%r8, %rdx		# restore buffer length
	tzcnt		%eax, %r8d		# where is the NUL byte?
	lea		-16(%rcx), %eax
	sub		%eax, %r8d		# string length
	lea		1(%rdi, %r8, 1), %rax	# return value if NUL before end of buffer
	xor		%ecx, %ecx		# return value if not
	cmp		%r8, %rdx		# is the string shorter than the buffer?
	cmova		%r8, %rdx		# copy only min(buflen, srclen) bytes
	cmovb		%rcx, %rax		# return NUL if buffer ended before string
.L0132:	cmp		$16, %rdx		# at least 17 bytes to copy (not incl NUL)?
	jb		.L0116

	/* copy 17--32 bytes */
	movdqu		(%r9), %xmm0		# load first 16 bytes
	movdqu		-15(%r9, %rdx, 1), %xmm1 # load last 16 bytes
	movdqu		%xmm0, (%rdi)
	movdqu		%xmm1, -15(%rdi, %rdx, 1)
	ret

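	/* terminator found in the (possibly misaligned) head chunk */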
.Lhead_nul:
	tzcnt		%eax, %r8d		# where is the NUL byte?
	sub		%ecx, %r8d		# ... from the beginning of the string?
	lea		1(%rdi, %r8, 1), %rax	# return value if NUL before end of buffer
	xor		%ecx, %ecx		# return value if not
	cmp		%r8, %rdx		# is the string shorter than the buffer?
	cmova		%r8, %rdx		# copy only min(buflen, srclen) bytes
	cmovb		%rcx, %rax		# return NUL if buffer ended before string

	/* process strings of 1--16 bytes (rdx: min(buflen, srclen), rax: srclen) */
.L0116:	cmp		$8, %rdx		# at least 9 bytes to copy?
	jae		.L0916

	cmp		$4, %rdx		# at least 5 bytes to copy?
	jae		.L0508

	cmp		$2, %rdx		# at least 3 bytes to copy?
	jae		.L0304

	/* copy one or two bytes */
	movzbl		(%r9), %ecx		# load first byte from src
	movzbl		(%r9, %rdx, 1), %esi	# load last byte from src
	mov		%cl, (%rdi)		# deposit into destination
	mov		%sil, (%rdi, %rdx, 1)
	ret

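	/* copy three or four bytes */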
.L0304:	movzwl		(%r9), %ecx
	movzwl		-1(%r9, %rdx, 1), %esi
	mov		%cx, (%rdi)
	mov		%si, -1(%rdi, %rdx, 1)
	ret

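	/* copy five to eight bytes */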
.L0508:	mov		(%r9), %ecx
	mov		-3(%r9, %rdx, 1), %esi
	mov		%ecx, (%rdi)
	mov		%esi, -3(%rdi, %rdx, 1)
	ret

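	/* copy nine to sixteen bytes */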
.L0916:	mov		(%r9), %rcx
	mov		-7(%r9, %rdx, 1), %rsi
	mov		%rcx, (%rdi)
	mov		%rsi, -7(%rdi, %rdx, 1)
	ret

	/* length zero destination: return null pointer */
.L0:	xor		%eax, %eax
	ret
ARCHEND(__memccpy, baseline)

	.section .note.GNU-stack,"",%progbits