xref: /freebsd/lib/libc/amd64/string/strcat.S (revision aff9143a242c0012b0195b3666e03fa3b7cd33e8)
/*-
 * Copyright (c) 2023, The FreeBSD Foundation
 *
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Portions of this software were developed by Robert Clausecker
 * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
 *
 * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcat.S
 * written by J.T. Conklin <jtc@acorntoolworks.com>
 * that was originally dedicated to the public domain
 */
137e266fcdSAlan Cox
147e266fcdSAlan Cox#include <machine/asm.h>
157e266fcdSAlan Cox#if 0
167e266fcdSAlan Cox	RCSID("$NetBSD: strcat.S,v 1.4 2004/07/26 18:51:21 drochner Exp $")
177e266fcdSAlan Cox#endif
187e266fcdSAlan Cox
19*aff9143aSRobert Clausecker#include "amd64_archlevel.h"
20*aff9143aSRobert Clausecker
/*
 * Implementation table for strcat.  It lists the two variants defined
 * later in this file, scalar first and baseline second.
 *
 * NOTE(review): ARCHFUNCS/ARCHFUNC/ENDARCHFUNCS are macros from
 * amd64_archlevel.h; presumably they pick one listed variant per CPU
 * feature level and the entry order is significant -- confirm against
 * that header before reordering.
 */
ARCHFUNCS(strcat)
	ARCHFUNC(strcat, scalar)
	ARCHFUNC(strcat, baseline)
ENDARCHFUNCS(strcat)
25*aff9143aSRobert Clausecker
/*
 * char *strcat(char *dst, const char *src) -- scalar variant
 *
 * ABI:   System V AMD64, AT&T syntax.
 * In:    %rdi = dst, %rsi = src
 * Out:   %rax = dst (the value %rdi had on entry)
 * Uses:  %rcx, %rdx, %r8, %r9, flags.  Leaf routine, no stack use.
 *
 * Register roles:
 *   %rdi  cursor into dst (first the scan position, then the write position)
 *   %rsi  cursor into src
 *   %r8   0x01 replicated into every byte
 *   %r9   0x80 replicated into every byte
 *
 * Zero-byte test: for a 64-bit word w, (w - %r8) & %r9 is non-zero
 * whenever some byte of w is zero.  The test never misses a zero byte,
 * but it can also fire when no byte is zero (for example when a byte is
 * 0x81 or larger), so every hit is confirmed byte by byte and the word
 * loop is resumed if the hit was spurious.
 *
 * Word-sized loads are only issued from 8-byte aligned addresses, so a
 * load never straddles a page boundary even though it may read bytes
 * beyond the terminating NUL.
 */
ARCHENTRY(strcat, scalar)
	movq	%rdi,%rax		/* return value: original dst */
	movabsq	$0x0101010101010101,%r8
	movabsq	$0x8080808080808080,%r9

	/*
	 * Phase 1: find the NUL that terminates dst.
	 * Step one byte at a time until %rdi is 8-byte aligned.
	 * Consider unrolling loop?
	 */
.Lscan_align:
	testb	$7,%dil
	je	.Lscan_aligned
	cmpb	$0,(%rdi)
	je	.Lcopy			/* NUL found before reaching alignment */
	incq	%rdi
	jmp	.Lscan_align

	/*
	 * 16-byte alignment for the loop head.  .p2align is used because
	 * on x86 ELF targets ".align 4" requests only 4-byte alignment.
	 * The padding follows an unconditional jmp and is never executed.
	 */
	.p2align 4
.Lscan_aligned:
.Lscan_loop:
	movq	(%rdi),%rdx
	addq	$8,%rdi			/* %rdi now points past the loaded word */
	subq	%r8,%rdx
	testq	%r9,%rdx
	je	.Lscan_loop		/* no candidate zero byte in this word */

	/*
	 * The test above can fire without any byte being zero.  Check the
	 * eight bytes just loaded in address order; on a real NUL, rewind
	 * %rdi onto it and start copying.  If none is zero, go back to
	 * the word loop.
	 */

	cmpb	$0,-8(%rdi)	/* 1st byte == 0? */
	jne	1f
	subq	$8,%rdi
	jmp	.Lcopy

1:	cmpb	$0,-7(%rdi)	/* 2nd byte == 0? */
	jne	1f
	subq	$7,%rdi
	jmp	.Lcopy

1:	cmpb	$0,-6(%rdi)	/* 3rd byte == 0? */
	jne	1f
	subq	$6,%rdi
	jmp	.Lcopy

1:	cmpb	$0,-5(%rdi)	/* 4th byte == 0? */
	jne	1f
	subq	$5,%rdi
	jmp	.Lcopy

1:	cmpb	$0,-4(%rdi)	/* 5th byte == 0? */
	jne	1f
	subq	$4,%rdi
	jmp	.Lcopy

1:	cmpb	$0,-3(%rdi)	/* 6th byte == 0? */
	jne	1f
	subq	$3,%rdi
	jmp	.Lcopy

1:	cmpb	$0,-2(%rdi)	/* 7th byte == 0? */
	jne	1f
	subq	$2,%rdi
	jmp	.Lcopy

1:	cmpb	$0,-1(%rdi)	/* 8th byte == 0? */
	jne	.Lscan_loop	/* false alarm: keep scanning */
	subq	$1,%rdi

	/*
	 * Phase 2: copy src, including its NUL, to the end of dst.
	 * Here %rdi points at dst's terminating NUL.  Copy single bytes
	 * until %rsi is 8-byte aligned; only the source is aligned, the
	 * stores to dst may be unaligned.
	 * Consider unrolling loop?
	 */
.Lcopy:
.Lcopy_align:
	testb	$7,%sil
	je	.Lcopy_aligned
	movb	(%rsi),%dl
	incq	%rsi
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl
	jne	.Lcopy_align
	ret			/* the NUL was copied while aligning */

	/* Loop-head alignment; the padding follows a ret and never runs. */
	.p2align 4
.Lcopy_loop:
	movq	%rdx,(%rdi)	/* store the word verified on the last pass */
	addq	$8,%rdi
.Lcopy_aligned:
	movq	(%rsi),%rdx
	movq	%rdx,%rcx	/* test a copy; %rdx keeps the data to store */
	addq	$8,%rsi
	subq	%r8,%rcx
	testq	%r9,%rcx
	je	.Lcopy_loop	/* no candidate zero byte: store whole word */

	/*
	 * The word may contain the NUL, or the test fired spuriously.
	 * Store it one byte at a time, lowest-addressed byte first
	 * (shifting %rdx right by 8 each step), and stop right after
	 * storing a zero byte.  If all eight bytes were non-zero, go
	 * back to the word loop; %rsi was already advanced above.
	 */

	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl		/* 1st byte == 0? */
	je	.Ldone

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl		/* 2nd byte == 0? */
	je	.Ldone

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl		/* 3rd byte == 0? */
	je	.Ldone

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl		/* 4th byte == 0? */
	je	.Ldone

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl		/* 5th byte == 0? */
	je	.Ldone

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl		/* 6th byte == 0? */
	je	.Ldone

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl		/* 7th byte == 0? */
	je	.Ldone

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl		/* 8th byte == 0? */
	jne	.Lcopy_aligned	/* false alarm: keep copying words */

.Ldone:
	ret
ARCHEND(strcat, scalar)
180*aff9143aSRobert Clausecker
/*
 * char *strcat(char *dst, const char *src) -- baseline variant
 *
 * ABI:   System V AMD64, AT&T syntax.
 * In:    %rdi = dst, %rsi = src
 * Out:   %rax = dst
 *
 * Used if we have any SIMD at all: instead of scanning and copying
 * here, call strlen to find the end of dst and then __stpcpy to append
 * src.  The scalar implementation is better for the scalar case as it
 * avoids the function call overhead, but pessimal if we could call
 * SIMD routines instead.
 *
 * Frame layout after the prologue:
 *    0(%rbp)   caller's %rbp
 *   -8(%rbp)   src (saved %rsi)
 *  -16(%rbp)   caller's %rbx
 * The three pushes leave %rsp 16-byte aligned at both call sites.
 */
ARCHENTRY(strcat, baseline)
	pushq	%rbp
	movq	%rsp, %rbp
	pushq	%rsi			/* keep src across the strlen call */
	pushq	%rbx			/* callee-saved, holds dst below */
	movq	%rdi, %rbx		/* remember destination for later */
	call	CNAME(strlen)		/* %rax = strlen(dst) */
	movq	-8(%rbp), %rsi		/* reload src */
	leaq	(%rbx, %rax), %rdi	/* dst + strlen(dst) */
	call	CNAME(__stpcpy)		/* append src; its result is unused */
	movq	%rbx, %rax		/* return dst */
	popq	%rbx
	leave				/* also drops the saved src slot */
	ret
ARCHEND(strcat, baseline)
20293ab7586SKonstantin Belousov
20393ab7586SKonstantin Belousov	.section .note.GNU-stack,"",%progbits
204