xref: /freebsd/lib/libc/amd64/string/memccpy.S (revision 9082398090bcf0ac333397d47e594b105ea3aefd)
/*
 * Copyright (c) 2023, 2024 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

/* Align to a 16-byte (2^4) boundary, padding with 0x90 (x86 nop) bytes. */
#define ALIGN_TEXT	.p2align 4, 0x90

/*
 * memccpy is exported as a weak alias of __memccpy.  __memccpy is declared
 * through the ARCHFUNCS table below, which lists the two variants that are
 * implemented in this file: "scalar" and "baseline".
 * NOTE(review): how one of the variants gets selected is defined by the
 * ARCHFUNCS macros in amd64_archlevel.h, which is not visible here --
 * confirm there.
 */
	.weak memccpy
	.set memccpy, __memccpy
ARCHFUNCS(__memccpy)
	ARCHFUNC(__memccpy, scalar)
	ARCHFUNC(__memccpy, baseline)
ENDARCHFUNCS(__memccpy)

/*
 * __memccpy, scalar variant, built from memchr and memcpy.
 *
 * In:	rdi = dest, rsi = src, edx = c, rcx = len
 * Out:	rax = dest + (offset of the first c in src) + 1, or NULL if
 *	memchr finds no c in the first len bytes of src
 *
 * Copies up to and including the first c, or all len bytes if there is
 * none.  rbx (callee-saved, so it survives the calls) carries len across
 * the memchr call and the result across the memcpy call.
 *
 * Stack: five 8-byte pushes plus the return address make 48 bytes, so
 * rsp is 16-byte aligned at the memchr call; popping rsi and rdi leaves
 * 32 bytes, so it is still aligned at the memcpy call.
 */
ARCHENTRY(__memccpy, scalar)
	push	%rbp			# establish stack frame
	mov	%rsp, %rbp
	push	%rax			# dummy push for alignment
	push	%rbx
	push	%rdi			# keep dest and src for the memcpy call
	push	%rsi

	mov	%rsi, %rdi		# memchr arguments: (src, c, len)
	mov	%edx, %esi
	mov	%rcx, %rdx
	mov	%rcx, %rbx		# len must survive the call
	call	CNAME(__memchr)		# ptr = memchr(src, c, len)

	pop	%rsi			# src
	pop	%rdi			# dest
	lea	1(%rax), %rdx		# computed unconditionally, fixed up below
	sub	%rsi, %rdx		# size = ptr - src + 1
	mov	%rbx, %rcx		# rcx = len, frees rbx for the result
	lea	(%rdi, %rdx, 1), %rbx	# res = dest + size
	test	%rax, %rax		# if (ptr == NULL)
	cmovz	%rcx, %rdx		# size = len
	cmovz	%rax, %rbx		# res = NULL
	call	CNAME(memcpy)		# memcpy(dest, src, size)

	mov	%rbx, %rax		# return (res)
	pop	%rbx
	leave				# drops the dummy slot and restores rbp
	ret
ARCHEND(__memccpy, scalar)

/*
 * __memccpy, baseline variant using 16-byte SSE vector operations.
 *
 * In:	rdi = dest, rsi = src, edx = c, rcx = len
 * Out:	rax = pointer just past the copy of the first c in dest, or
 *	NULL if len is 0 or c does not occur in the first len bytes
 *
 * Copies up to and including the first c, or all len bytes if there is
 * none.  src is inspected in 16-byte aligned chunks; a set bit in a
 * pcmpeqb/pmovmskb mask marks a byte equal to c.  Where the buffer may
 * end inside a chunk, bts sets an extra "induced" match bit at the last
 * buffer byte so that tzcnt yields the copy length either way, and bt
 * on a copy of the real mask then tells the two cases apart.
 *
 * Register roles after the setup code:
 *	xmm4	c replicated into all 16 bytes
 *	r9	src as passed in (unaligned)
 *	rsi	src rounded down to a 16-byte boundary
 *	rcx	src & 0xf, the offset of src from that boundary
 *	rdx	len - 1
 *	r11	rcx + rdx - 32
 *	r8d	match mask of the first aligned chunk (not yet masked)
 *	eax	-1 << cl, selects the mask bits at or after src
 *
 * The small-copy code at .L0132 and .L0116 expects rdx = number of
 * bytes to copy - 1 and rax = return value; it copies with two possibly
 * overlapping loads and stores.
 */
ARCHENTRY(__memccpy, baseline)
	sub		$1, %rcx		# RCX refers to last character in buffer
	jb		.L0			# go to special code path if len was 0

	movd		%edx, %xmm4
	mov		%rcx, %rdx
	punpcklbw	%xmm4, %xmm4		# c -> cc
	mov		%esi, %ecx
	punpcklwd	%xmm4, %xmm4		# cc -> cccc
	mov		%rsi, %r9		# stash a copy of the source pointer for later
	pshufd		$0, %xmm4, %xmm4	# cccc -> cccccccccccccccc
	and		$~0xf, %rsi
	movdqa		%xmm4, %xmm1
	pcmpeqb		(%rsi), %xmm1		# c found in head?
	and		$0xf, %ecx
	mov		$-1, %eax
	pmovmskb	%xmm1, %r8d
	lea		-32(%rcx), %r11
	shl		%cl, %eax		# mask of bytes in the string
	add		%rdx, %r11		# distance from alignment boundary - 32
	jnc		.Lrunt			# jump if the buffer ends within the first two chunks

	and		%r8d, %eax		# drop matches in front of src
	jz		0f			# skip ahead if c is not in the head

	/* match in first chunk */
	tzcnt		%eax, %edx		# where is c?
	sub		%ecx, %edx		# ... from the beginning of the string?
	lea		1(%rdi, %rdx, 1), %rax	# return value
	jmp		.L0116

	/*
	 * The buffer extends past the second chunk here, so the whole
	 * second chunk and the 16 bytes starting at src lie inside it.
	 */
0:	movdqa		16(%rsi), %xmm3		# load second string chunk
	movdqu		(%r9), %xmm2		# load unaligned string head
	movdqa		%xmm4, %xmm1
	pcmpeqb		%xmm3, %xmm1		# c found in second chunk?

	/* process second chunk */
	pmovmskb	%xmm1, %eax
	test		%eax, %eax
	jz		0f

	/* match in second chunk */
	tzcnt		%eax, %edx		# where is c?
	sub		$16, %ecx
	sub		%ecx, %edx		# adjust for alignment offset
	lea		1(%rdi, %rdx, 1), %rax	# return value
	jmp		.L0132

	/* c not found in second chunk: prepare for main loop */
0:	movdqa		32(%rsi), %xmm0		# load next string chunk
	movdqa		%xmm4, %xmm1
	movdqu		%xmm2, (%rdi)		# deposit head into buffer
	sub		%rcx, %rdi		# adjust RDI to correspond to RSI
	mov		%r11, %rdx
	movdqu		%xmm3, 16(%rdi)		# deposit second chunk
	sub		%rsi, %rdi		# express RDI as distance from RSI
	add		$32, %rsi		# advance RSI past first two chunks
	sub		$16, %rdx		# enough left for another round?
	jb		1f

	/*
	 * main loop unrolled twice
	 *
	 * On entry at 0: xmm0 holds the chunk at (rsi), xmm1 = xmm4,
	 * rdi = dest - src so that (rsi, rdi) addresses the destination
	 * of the chunk at (rsi), and rdx = offset of the last buffer
	 * byte from rsi, minus 16.
	 */
	ALIGN_TEXT
0:	pcmpeqb		%xmm0, %xmm1		# c encountered?
	pmovmskb	%xmm1, %eax
	test		%eax, %eax
	jnz		3f

	movdqu		%xmm0, (%rsi, %rdi)
	movdqa		16(%rsi), %xmm0		# load next string chunk
	movdqa		%xmm4, %xmm1
	cmp		$16, %rdx		# more than a full chunk left?
	jb		2f

	add		$32, %rsi		# advance pointers to next chunk
	pcmpeqb		%xmm0, %xmm1		# c encountered?
	pmovmskb	%xmm1, %eax
	test		%eax, %eax
	jnz		4f

	movdqu		%xmm0, -16(%rsi, %rdi)
	movdqa		(%rsi), %xmm0		# load next string chunk
	movdqa		%xmm4, %xmm1
	sub		$32, %rdx
	jae		0b

1:	sub		$16, %rsi		# undo second advancement
	add		$16, %edx

	/*
	 * 1--16 bytes left in the buffer but string has not ended yet
	 *
	 * xmm0 holds the chunk at 16(rsi) and edx is the offset of the
	 * last buffer byte within that chunk.
	 */
2:	pcmpeqb		%xmm1, %xmm0		# c encountered?
	pmovmskb	%xmm0, %r8d
	mov		%r8d, %ecx
	bts		%edx, %r8d		# treat end of buffer as end of string
	tzcnt		%r8d, %r8d		# find tail length
	add		%rsi, %rdi		# restore RDI
	movdqu		1(%rsi, %r8, 1), %xmm0	# load string tail
	movdqu		%xmm0, 1(%rdi, %r8, 1)	# store string tail
	lea		17(%rdi, %r8, 1), %rsi	# return value if terminator encountered
	xor		%eax, %eax		# return value if no terminator encountered
	bt		%r8d, %ecx		# terminator encountered inside buffer?
	cmovc		%rsi, %rax		# if yes, return pointer, else NULL
	ret

4:	sub		$16, %rsi		# undo second advancement

	/* terminator found and buffer has not ended yet */
3:	tzcnt		%eax, %eax		# find length of string tail
	movdqu		-15(%rsi, %rax, 1), %xmm0 # load string tail (incl. c)
	add		%rsi, %rdi		# restore destination pointer
	movdqu		%xmm0, -15(%rdi, %rax, 1) # store string tail (incl. c)
	lea		1(%rdi, %rax, 1), %rax	# compute return value
	ret

	/*
	 * buffer ends within the first two aligned chunks
	 *
	 * After the add below, r11d is the offset of the last buffer byte
	 * from the alignment boundary.  The second chunk is only read if
	 * neither c nor the end of the buffer lies in the first one.
	 */
	ALIGN_TEXT
.Lrunt:	add		$32, %r11d		# undo earlier decrement
	mov		%r8d, %r10d		# keep a copy of the original match mask
	bts		%r11d, %r8d		# induce match at buffer end
	and		%ax, %r8w		# is there a match in the first 16 bytes?
	jnz		0f			# if yes, skip looking at second chunk

	pcmpeqb		16(%rsi), %xmm4		# check for match in second chunk
	pmovmskb	%xmm4, %r8d
	shl		$16, %r8d		# place second chunk matches in bits 16--31
	mov		%r8d, %r10d		# keep a copy of the original match mask
	bts		%r11d, %r8d		# induce a match at buffer end

0:	xor		%eax, %eax		# return value if terminator not found
	tzcnt		%r8d, %edx		# find string/buffer length from alignment boundary
	lea		1(%rdi, %rdx, 1), %r8	# return value if terminator found + rcx
	sub		%rcx, %r8
	bt		%edx, %r10d		# was the terminator present?
	cmovc		%r8, %rax		# if yes, return pointer, else NULL
	sub		%ecx, %edx		# find actual string/buffer length

	ALIGN_TEXT
.L0132:	cmp		$16, %rdx		# at least 17 bytes to copy?
	jb		.L0116

	/* copy 17--32 bytes */
	movdqu		(%r9), %xmm0		# load first 16 bytes
	movdqu		-15(%r9, %rdx, 1), %xmm1 # load last 16 bytes
	movdqu		%xmm0, (%rdi)
	movdqu		%xmm1, -15(%rdi, %rdx, 1)
	ret

	/* copy 1--16 bytes (rdx: number of bytes to copy - 1, rax: return value) */
	ALIGN_TEXT
.L0116:	cmp		$8, %rdx		# at least 9 bytes to copy?
	jae		.L0916

	cmp		$4, %rdx		# at least 5 bytes to copy?
	jae		.L0508

	cmp		$2, %rdx		# at least 3 bytes to copy?
	jae		.L0304

	/* copy one or two bytes */
	movzbl		(%r9), %ecx		# load first byte from src
	movzbl		(%r9, %rdx, 1), %esi	# load last byte from src
	mov		%cl, (%rdi)		# deposit into destination
	mov		%sil, (%rdi, %rdx, 1)
	ret

	/* copy three or four bytes as two overlapping 2-byte moves */
.L0304:	movzwl		(%r9), %ecx
	movzwl		-1(%r9, %rdx, 1), %esi
	mov		%cx, (%rdi)
	mov		%si, -1(%rdi, %rdx, 1)
	ret

	/* copy five to eight bytes as two overlapping 4-byte moves */
.L0508:	mov		(%r9), %ecx
	mov		-3(%r9, %rdx, 1), %esi
	mov		%ecx, (%rdi)
	mov		%esi, -3(%rdi, %rdx, 1)
	ret

	/* copy nine to sixteen bytes as two overlapping 8-byte moves */
.L0916:	mov		(%r9), %rcx
	mov		-7(%r9, %rdx, 1), %rsi
	mov		%rcx, (%rdi)
	mov		%rsi, -7(%rdi, %rdx, 1)
	ret

	/* length zero destination: return null pointer */
.L0:	xor		%eax, %eax
	ret
ARCHEND(__memccpy, baseline)

260fc0e38a7SRobert Clausecker	.section .note.GNU-stack,"",%progbits
261