/*-
 * Copyright (c) 2023, The FreeBSD Foundation
 *
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Portions of this software were developed by Robert Clausecker
 * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
 *
 * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcat.S
 * written by J.T. Conklin <jtc@acorntoolworks.com>
 * that was originally dedicated to the public domain
 */

#include <machine/asm.h>
#if 0
	RCSID("$NetBSD: strcat.S,v 1.4 2004/07/26 18:51:21 drochner Exp $")
#endif

#include "amd64_archlevel.h"

/*
 * Table of the available strcat implementations.  The ARCHFUNCS family
 * of macros comes from amd64_archlevel.h.
 */
ARCHFUNCS(strcat)
	ARCHFUNC(strcat, scalar)
	ARCHFUNC(strcat, baseline)
ENDARCHFUNCS(strcat)

/*
 * char *strcat(char *dst, const char *src) -- scalar implementation
 *
 * In:       %rdi = dst, %rsi = src
 * Out:      %rax = dst (the original destination pointer)
 * Clobbers: %rcx, %rdx, %rsi, %rdi, %r8, %r9, flags
 * Stack:    not used (leaf routine)
 *
 * Phase 1 (.Lscan) advances %rdi to the NUL terminating dst.
 * Phase 2 (.Lcopy) copies src, including its NUL, to that position.
 *
 * Both phases process 8 bytes at a time once the pointer being read
 * from is 8-byte aligned.  A word x is screened for a zero byte with
 *
 *	(x - 0x0101010101010101) & 0x8080808080808080
 *
 * which is zero only if no byte of x is zero.  The expression can also
 * be non-zero for words without a zero byte, so every hit is verified
 * byte by byte and the word loop is resumed on a false alarm.
 */
ARCHENTRY(strcat, scalar)
	movq	%rdi,%rax			/* return value: dst */
	movabsq	$0x0101010101010101,%r8		/* 0x01 in every byte */
	movabsq	$0x8080808080808080,%r9		/* 0x80 in every byte */

	/*
	 * Align destination to word boundary.
	 * Consider unrolling loop?
	 */
.Lscan:
.Lscan_align:
	testb	$7,%dil
	je	.Lscan_aligned
	cmpb	$0,(%rdi)
	je	.Lcopy				/* found end of dst */
	incq	%rdi
	jmp	.Lscan_align

	/* 16-byte align the loop head; the padding is never executed */
	.p2align 4
.Lscan_aligned:
.Lscan_loop:
	movq	(%rdi),%rdx
	addq	$8,%rdi				/* %rdi now points past the word */
	subq	%r8,%rdx
	testq	%r9,%rdx
	je	.Lscan_loop			/* certainly no zero byte */

	/*
	 * In rare cases, the above loop may exit prematurely. We must
	 * return to the loop if none of the bytes in the word equal 0.
	 */

	cmpb	$0,-8(%rdi)	/* 1st byte == 0? */
	jne	1f
	subq	$8,%rdi
	jmp	.Lcopy

1:	cmpb	$0,-7(%rdi)	/* 2nd byte == 0? */
	jne	1f
	subq	$7,%rdi
	jmp	.Lcopy

1:	cmpb	$0,-6(%rdi)	/* 3rd byte == 0? */
	jne	1f
	subq	$6,%rdi
	jmp	.Lcopy

1:	cmpb	$0,-5(%rdi)	/* 4th byte == 0? */
	jne	1f
	subq	$5,%rdi
	jmp	.Lcopy

1:	cmpb	$0,-4(%rdi)	/* 5th byte == 0? */
	jne	1f
	subq	$4,%rdi
	jmp	.Lcopy

1:	cmpb	$0,-3(%rdi)	/* 6th byte == 0? */
	jne	1f
	subq	$3,%rdi
	jmp	.Lcopy

1:	cmpb	$0,-2(%rdi)	/* 7th byte == 0? */
	jne	1f
	subq	$2,%rdi
	jmp	.Lcopy

1:	cmpb	$0,-1(%rdi)	/* 8th byte == 0? */
	jne	.Lscan_loop	/* false alarm, keep scanning */
	subq	$1,%rdi

	/*
	 * %rdi now points at the NUL terminating dst.
	 *
	 * Align source to a word boundary.
	 * Consider unrolling loop?
	 */
.Lcopy:
.Lcopy_align:
	testb	$7,%sil
	je	.Lcopy_aligned
	movb	(%rsi),%dl
	incq	%rsi
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl
	jne	.Lcopy_align
	ret				/* copied the NUL while aligning */

	/* 16-byte align the loop head; the padding is never executed */
	.p2align 4
.Lcopy_loop:
	movq	%rdx,(%rdi)		/* store a word known to hold no NUL */
	addq	$8,%rdi
.Lcopy_aligned:
	movq	(%rsi),%rdx
	movq	%rdx,%rcx		/* test a copy, keep the data in %rdx */
	addq	$8,%rsi
	subq	%r8,%rcx
	testq	%r9,%rcx
	je	.Lcopy_loop

	/*
	 * In rare cases, the above loop may exit prematurely. We must
	 * return to the loop if none of the bytes in the word equal 0.
	 *
	 * The word in %rdx is stored one byte at a time, lowest byte
	 * first, stopping after the first zero byte has been written.
	 */

	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl		/* 1st byte == 0? */
	je	.Ldone

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl		/* 2nd byte == 0? */
	je	.Ldone

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl		/* 3rd byte == 0? */
	je	.Ldone

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl		/* 4th byte == 0? */
	je	.Ldone

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl		/* 5th byte == 0? */
	je	.Ldone

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl		/* 6th byte == 0? */
	je	.Ldone

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl		/* 7th byte == 0? */
	je	.Ldone

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl		/* 8th byte == 0? */
	jne	.Lcopy_aligned	/* false alarm, all 8 bytes stored */

.Ldone:
	ret
ARCHEND(strcat, scalar)

/*
 * char *strcat(char *dst, const char *src) -- baseline implementation
 *
 * Call into strlen + strcpy if we have any SIMD at all.
 * The scalar implementation above is better for the scalar
 * case as it avoids the function call overhead, but pessimal
 * if we could call SIMD routines instead.
 *
 * In:  %rdi = dst, %rsi = src
 * Out: %rax = dst
 *
 * Frame layout after the prologue:
 *	 -8(%rbp)  saved src (%rsi), reloaded after the strlen call
 *	-16(%rbp)  saved caller %rbx; %rbx holds dst across both calls
 * The three pushes leave %rsp 16-byte aligned at each call, given the
 * standard entry alignment.
 */
ARCHENTRY(strcat, baseline)
	push	%rbp
	mov	%rsp, %rbp
	push	%rsi			# stash src at -8(%rbp)
	push	%rbx
	mov	%rdi, %rbx		# remember destination for later
	call	CNAME(strlen)		# strlen(dest)
	mov	-8(%rbp), %rsi		# reload src
	lea	(%rbx, %rax, 1), %rdi	# dest + strlen(dest)
	call	CNAME(__stpcpy)		# stpcpy(dest + strlen(dest), src)
	mov	%rbx, %rax		# return dest
	pop	%rbx
	leave				# also discards the src slot
	ret
ARCHEND(strcat, baseline)

	.section .note.GNU-stack,"",%progbits