/*-
 * Copyright (c) 2023, The FreeBSD Foundation
 *
 * SPDX-License-Expression: BSD-2-Clause
 *
 * Portions of this software were developed by Robert Clausecker
 * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
 *
 * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcat.S
 * written by J.T. Conklin <jtc@acorntoolworks.com>
 * that was originally dedicated to the public domain
 */

#include <machine/asm.h>
#if 0
	RCSID("$NetBSD: strcat.S,v 1.4 2004/07/26 18:51:21 drochner Exp $")
#endif

#include "amd64_archlevel.h"

/* Implementations of strcat provided by this file. */
ARCHFUNCS(strcat)
	ARCHFUNC(strcat, scalar)
	ARCHFUNC(strcat, baseline)
ENDARCHFUNCS(strcat)

/*
 * strcat, scalar variant (AT&T syntax, System V AMD64 calling convention).
 *
 * In:       %rdi = destination string, %rsi = source string
 * Out:      %rax = destination pointer exactly as it was passed in
 * Modifies: %rcx, %rdx, %rsi, %rdi, %r8, %r9, flags
 *
 * %r8 = 0x01 in every byte and %r9 = 0x80 in every byte.  A word w is
 * treated as a candidate for holding a NUL byte when (w - %r8) & %r9 is
 * nonzero.  The test can also fire for words that hold no NUL byte, so
 * every candidate is confirmed one byte at a time.
 */
ARCHENTRY(strcat, scalar)
	movq	%rdi,%rax		/* keep the return value */
	movabsq	$0x0101010101010101,%r8
	movabsq	$0x8080808080808080,%r9

	/*
	 * Phase 1: find the NUL that terminates the destination.
	 * Advance byte by byte until %rdi is a multiple of 8.
	 */
.Ldst_bytewise:
	testb	$7,%dil
	je	.Ldst_wordwise
	cmpb	$0,(%rdi)
	je	.Lappend
	incq	%rdi
	jmp	.Ldst_bytewise

	/* %rdi is 8-byte aligned here: examine one word per iteration. */
	.align	4
.Ldst_wordwise:
	movq	(%rdi),%rdx
	addq	$8,%rdi
	subq	%r8,%rdx
	testq	%r9,%rdx
	je	.Ldst_wordwise

	/*
	 * Candidate word found; %rdi already points just past it.
	 * Inspect its bytes in ascending address order.  On a hit, move
	 * %rdi back onto the NUL byte; if no byte is NUL the word test
	 * was a false alarm and scanning resumes with the next word.
	 */
	cmpb	$0,-8(%rdi)		/* byte 0 */
	jne	.Ldst_byte1
	subq	$8,%rdi
	jmp	.Lappend

.Ldst_byte1:
	cmpb	$0,-7(%rdi)		/* byte 1 */
	jne	.Ldst_byte2
	subq	$7,%rdi
	jmp	.Lappend

.Ldst_byte2:
	cmpb	$0,-6(%rdi)		/* byte 2 */
	jne	.Ldst_byte3
	subq	$6,%rdi
	jmp	.Lappend

.Ldst_byte3:
	cmpb	$0,-5(%rdi)		/* byte 3 */
	jne	.Ldst_byte4
	subq	$5,%rdi
	jmp	.Lappend

.Ldst_byte4:
	cmpb	$0,-4(%rdi)		/* byte 4 */
	jne	.Ldst_byte5
	subq	$4,%rdi
	jmp	.Lappend

.Ldst_byte5:
	cmpb	$0,-3(%rdi)		/* byte 5 */
	jne	.Ldst_byte6
	subq	$3,%rdi
	jmp	.Lappend

.Ldst_byte6:
	cmpb	$0,-2(%rdi)		/* byte 6 */
	jne	.Ldst_byte7
	subq	$2,%rdi
	jmp	.Lappend

.Ldst_byte7:
	cmpb	$0,-1(%rdi)		/* byte 7 */
	jne	.Ldst_wordwise		/* false alarm, keep scanning */
	subq	$1,%rdi

	/*
	 * Phase 2: copy the source, terminator included, to (%rdi).
	 * Copy byte by byte until %rsi is a multiple of 8; return at
	 * once if the terminator is copied during this stage.
	 */
.Lappend:
	testb	$7,%sil
	je	.Lsrc_wordwise
	movb	(%rsi),%dl
	incq	%rsi
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl
	jne	.Lappend
	ret

	/*
	 * Word loop, entered at .Lsrc_wordwise.  A word is stored only
	 * after the NUL test has passed it; %rcx is the scratch copy
	 * used for the test so that %rdx keeps the data.
	 */
	.align	4
.Lstore_word:
	movq	%rdx,(%rdi)
	addq	$8,%rdi
.Lsrc_wordwise:
	movq	(%rsi),%rdx
	movq	%rdx,%rcx
	addq	$8,%rsi
	subq	%r8,%rcx
	testq	%r9,%rcx
	je	.Lstore_word

	/*
	 * Candidate word in %rdx.  Store it one byte at a time, lowest
	 * byte first, and stop right after a NUL byte has been stored.
	 * If all eight bytes are nonzero the word test was a false
	 * alarm: the whole word has been stored, so go on with the next.
	 */
	movb	%dl,(%rdi)		/* byte 0 */
	incq	%rdi
	testb	%dl,%dl
	je	.Lfinished

	shrq	$8,%rdx
	movb	%dl,(%rdi)		/* byte 1 */
	incq	%rdi
	testb	%dl,%dl
	je	.Lfinished

	shrq	$8,%rdx
	movb	%dl,(%rdi)		/* byte 2 */
	incq	%rdi
	testb	%dl,%dl
	je	.Lfinished

	shrq	$8,%rdx
	movb	%dl,(%rdi)		/* byte 3 */
	incq	%rdi
	testb	%dl,%dl
	je	.Lfinished

	shrq	$8,%rdx
	movb	%dl,(%rdi)		/* byte 4 */
	incq	%rdi
	testb	%dl,%dl
	je	.Lfinished

	shrq	$8,%rdx
	movb	%dl,(%rdi)		/* byte 5 */
	incq	%rdi
	testb	%dl,%dl
	je	.Lfinished

	shrq	$8,%rdx
	movb	%dl,(%rdi)		/* byte 6 */
	incq	%rdi
	testb	%dl,%dl
	je	.Lfinished

	shrq	$8,%rdx
	movb	%dl,(%rdi)		/* byte 7 */
	incq	%rdi
	testb	%dl,%dl
	jne	.Lsrc_wordwise		/* false alarm, keep copying */

.Lfinished:
	ret
ARCHEND(strcat, scalar)

/*
 * strcat, baseline variant: used when any SIMD is available.
 *
 * Rather than scanning and copying itself, this variant calls strlen
 * and __stpcpy.  The scalar implementation above is the better choice
 * when only scalar code is available, because it avoids the function
 * call overhead, but it is pessimal when SIMD routines could be called
 * instead.
 *
 * In:  %rdi = destination string, %rsi = source string
 * Out: %rax = destination pointer exactly as it was passed in
 *
 * The three pushes below leave %rsp 16-byte aligned at both calls.
 */
ARCHENTRY(strcat, baseline)
	push	%rbp
	mov	%rsp, %rbp
	push	%rsi			/* park src at -8(%rbp) */
	push	%rbx			/* callee-saved, restored on exit */
	mov	%rdi, %rbx		/* dest must survive both calls */
	call	CNAME(strlen)		/* %rax = strlen(dest) */
	mov	-8(%rbp), %rsi		/* fetch src back */
	lea	(%rbx, %rax, 1), %rdi	/* %rdi = dest + strlen(dest) */
	call	CNAME(__stpcpy)		/* __stpcpy(dest + strlen(dest), src) */
	mov	%rbx, %rax		/* result is the original dest */
	pop	%rbx
	leave				/* also discards the parked src slot */
	ret
ARCHEND(strcat, baseline)

	.section .note.GNU-stack,"",%progbits