/*-
 * Copyright (c) 2023, The FreeBSD Foundation
 *
 * SPDX-License-Expression: BSD-2-Clause
 *
 * Portions of this software were developed by Robert Clausecker
 * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
 *
 * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcpy.S
 * written by J.T. Conklin <jtc@acorntoolworks.com> and
 * adapted by Guillaume Morin <guillaume@morinfr.org> to implement stpcpy
 * that was originally dedicated to the public domain
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

/* Pad to a 16-byte (2^4) boundary, filling with 0x90 bytes (x86 NOP). */
#define ALIGN_TEXT	.p2align 4, 0x90

	/* Export stpcpy as a weak alias for __stpcpy. */
	.weak stpcpy
	.set stpcpy, __stpcpy

/*
 * Table of __stpcpy variants.
 * NOTE(review): ARCHFUNCS/ARCHFUNC/ENDARCHFUNCS (and ARCHENTRY/ARCHEND
 * below) are defined in amd64_archlevel.h, which is not visible here;
 * presumably they arrange for one of the two entry points below to be
 * selected for the running CPU -- confirm against that header.
 */
ARCHFUNCS(__stpcpy)
	ARCHFUNC(__stpcpy, scalar)
	ARCHFUNC(__stpcpy, baseline)
ENDARCHFUNCS(__stpcpy)

/*
 * This stpcpy implementation copies a byte at a time until the
 * source pointer is aligned to a word boundary, it then copies by
 * words until it finds a word containing a zero byte, and finally
 * copies by bytes until the end of the string is reached.
 *
 * While this may result in unaligned stores if the source and
 * destination pointers are unaligned with respect to each other,
 * it is still faster than either byte copies or the overhead of
 * an implementation suitable for machines with strict alignment
 * requirements.
 *
 * Register use in the scalar variant, as established by the code:
 *   %rdi	in: destination; advanced as bytes are stored
 *   %rsi	in: source; advanced as bytes are loaded
 *   %rax	out: address of the NUL byte stored to the destination
 *   %rdx	data byte (%dl) or data word being copied
 *   %rcx	scratch for the zero-byte test
 *   %r8	constant 0x0101010101010101
 *   %r9	constant 0x8080808080808080
 */

ARCHENTRY(__stpcpy, scalar)
	movabsq	$0x0101010101010101,%r8
	movabsq	$0x8080808080808080,%r9

	/*
	 * Align source to a word boundary.
	 * Consider unrolling loop?
	 */
.Lalign:
	testb	$7,%sil		/* low three address bits clear? */
	je	.Lword_aligned
	movb	(%rsi),%dl
	incq	%rsi
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl		/* was the byte just copied the NUL? */
	jne	.Lalign
	/* %rdi is one past the NUL just stored; return the NUL's address. */
	movq	%rdi,%rax
	dec	%rax
	ret

	ALIGN_TEXT
.Lloop:
	/* The word in %rdx passed the test below: no zero byte, store it. */
	movq	%rdx,(%rdi)
	addq	$8,%rdi
.Lword_aligned:
	movq	(%rsi),%rdx
	movq	%rdx,%rcx
	addq	$8,%rsi		/* %rsi now points at the following word */
	/*
	 * (word - 0x01..01) & 0x80..80 is zero only if no byte of the
	 * word is zero, so a zero result means the word can be stored
	 * whole.
	 */
	subq	%r8,%rcx
	testq	%r9,%rcx
	je	.Lloop

	/*
	 * In rare cases, the above loop may exit prematurely. We must
	 * return to the loop if none of the bytes in the word equal 0.
	 *
	 * (The test above is also nonzero for a word without a zero
	 * byte if one of its bytes is 0x81 or larger.)
	 *
	 * The bytes of the word are copied one at a time starting with
	 * %dl, the byte loaded from the lowest address; each shrq moves
	 * the next byte into %dl.  On a NUL we jump to .Ldone with %rdi
	 * still pointing at the NUL just stored.
	 */

	movb	%dl,(%rdi)
	testb	%dl,%dl		/* 1st byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl		/* 2nd byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl		/* 3rd byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl		/* 4th byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl		/* 5th byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl		/* 6th byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl		/* 7th byte == 0? */
	je	.Ldone
	incq	%rdi

	/*
	 * Last byte: advance %rdi before testing so that re-entering the
	 * word loop continues right after these 8 bytes (%rsi was already
	 * advanced when the word was loaded).  If the byte was the NUL
	 * after all, step %rdi back onto it.
	 */
	shrq	$8,%rdx
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl		/* 8th byte == 0? */
	jne	.Lword_aligned
	decq	%rdi

.Ldone:
	movq	%rdi,%rax	/* %rdi points at the NUL in the destination */
	ret
ARCHEND(__stpcpy, scalar)

/*
 * Variant of __stpcpy that works in 16-byte chunks using %xmm0-%xmm2.
 *
 * In:  %rdi = destination, %rsi = source
 * Out: %rax = address of the NUL byte stored to the destination
 *
 * Outline, as implemented below:
 *  - The source pointer is rounded down to a 16-byte boundary so that
 *    chunk loads can use movdqa.  The first chunk may therefore start
 *    with bytes that precede the string; their matches are shifted out
 *    of the NUL mask.
 *  - %rdi is turned into the distance (destination - source), so the
 *    destination address matching a source address X is X + %rdi and
 *    only %rsi has to be advanced in the loop.
 *  - Each loop half loads the next chunk, stores the previous chunk
 *    (already known to contain no NUL) and then tests the new chunk.
 *  - Once the NUL is found after the loop has run, the final 16 bytes
 *    ending at the NUL are copied with one unaligned load/store pair
 *    that overlaps bytes already copied.
 *  - Strings whose NUL lies in the first two chunks are handled by the
 *    size-class code starting at .Lrunt, which copies the first and the
 *    last 16/8/4 bytes of the string with possibly overlapping accesses.
 *
 * Register use before .Lrunt:
 *   %rsi	source rounded down to 16 bytes, advanced by the loop
 *   %ecx	misalignment of the source (source address & 0xf)
 *   %rdx	original destination
 *   %rdi	destination - source
 *   %xmm1	zero for comparison before, NUL match mask after pcmpeqb
 *   %xmm0, %xmm2  chunk data
 */
ARCHENTRY(__stpcpy, baseline)
	mov	%esi, %ecx
	mov	%rdi, %rdx
	sub	%rsi, %rdi		# express destination as distance to source
	and	$~0xf, %rsi		# align source to 16 bytes
	movdqa	(%rsi), %xmm0		# head of string with junk before
	pxor	%xmm1, %xmm1
	and	$0xf, %ecx		# misalignment in bytes
	pcmpeqb	%xmm1, %xmm0		# NUL byte present?
	pmovmskb %xmm0, %eax
	shr	%cl, %eax		# clear out matches in junk bytes
	/*
	 * bsf sets ZF when its source operand is zero, so the branch is
	 * taken exactly when the head contains a NUL; %eax is then the
	 * offset of that NUL from the start of the string.
	 */
	bsf	%eax, %eax		# find match if any
	jnz	.Lrunt

	/* first normal iteration: write head back if it succeeds */
	movdqa	16(%rsi), %xmm0		# 16 bytes of current iteration
	movdqu	(%rsi, %rcx, 1), %xmm2	# first 16 bytes of the string
	/* %xmm1 is still all zero: the compare above wrote to %xmm0. */
	pcmpeqb	%xmm0, %xmm1		# NUL byte present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax		# find match if any
	jnz	.Lshorty

	movdqu	%xmm2, (%rdx)		# store beginning of string

	/* main loop, unrolled twice */
	ALIGN_TEXT
0:	movdqa	32(%rsi), %xmm2		# load current iteration
	movdqu	%xmm0, 16(%rsi, %rdi, 1) # write back previous iteration
	pxor	%xmm1, %xmm1
	add	$32, %rsi
	pcmpeqb	%xmm2, %xmm1		# NUL byte present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	1f			/* %rsi points at the chunk in %xmm2 */

	movdqa	16(%rsi), %xmm0		# load current iteration
	movdqu	%xmm2, (%rsi, %rdi, 1)	# write back previous iteration
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm0, %xmm1		# NUL byte present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jz	0b

	/* end of string after main loop has iterated */
	add	$16, %rsi		# advance rsi to second unrolled half
	/* Here %rsi points at the chunk containing the NUL, %eax is its mask. */
1:	tzcnt	%eax, %eax		# find location of match
					# (behaves as bsf on pre-x86-64-v3 CPUs)
	add	%rsi, %rax		# point to NUL byte
	movdqu	-15(%rax), %xmm0	# last 16 bytes of string
	movdqu	%xmm0, -15(%rax, %rdi, 1) # copied to destination
	add	%rdi, %rax		# point to destination's NUL byte
	ret

	/* NUL encountered in second iteration */
.Lshorty:
	tzcnt	%eax, %eax
	add	$16, %eax		# account for length of first iteration
	sub	%ecx, %eax		# but not the parts before the string

	/*
	 * NUL encountered in first iteration
	 *
	 * On entry %eax is the offset of the NUL from the start of the
	 * string.  From here on:
	 *   %edi	string length including the NUL byte
	 *   %rsi	start of the string (source)
	 *   %rdx	start of the destination
	 *   %rax	address of the NUL byte in the destination
	 *   %xmm2	first 16 bytes of the string, but only when
	 *		arriving via .Lshorty
	 */
.Lrunt:	lea	1(%rax), %edi		# string length including NUL byte
	add	%rcx, %rsi		# point to beginning of string
	add	%rdx, %rax		# point to NUL byte

	/*
	 * transfer 16--32 bytes
	 *
	 * When arriving via .Lrunt, %xmm2 has not been loaded.  The
	 * length can then be at most 16, and for a length of exactly 16
	 * the second store below covers the same 16 bytes as the first,
	 * so whatever was in %xmm2 is overwritten.
	 */
.L1632:	cmp	$16, %edi
	jb	.L0815

	movdqu	-16(%rsi, %rdi, 1), %xmm0 # load last 16 bytes
	movdqu	%xmm2, (%rdx)		# store first 16 bytes
	movdqu	%xmm0, -15(%rax)	# store last 16 bytes
	ret

	/* transfer 8--15 bytes */
.L0815:	cmp	$8, %edi
	jb	.L0407

	mov	(%rsi), %rcx		# load first 8 bytes
	mov	-8(%rsi, %rdi, 1), %rdi	# load last 8 bytes
	mov	%rcx, (%rdx)		# store to dst
	mov	%rdi, -7(%rax)		# ditto
	ret

	/* transfer 4--7 bytes */
.L0407:	cmp	$4, %edi
	jb	.L0203

	mov	(%rsi), %ecx		/* first 4 bytes */
	mov	-4(%rsi, %rdi, 1), %edi	/* last 4 bytes, ending with the NUL */
	mov	%ecx, (%rdx)
	mov	%edi, -3(%rax)
	ret

	/* transfer 2--3 bytes */
.L0203:	cmp	$2, %edi
	jb	.L0101

	movzwl	(%rsi), %ecx
	mov	%cx, (%rdx)		# store first two bytes
	/* fall through: the NUL store below supplies the final byte */

	/* transfer 0 bytes (last byte is always NUL) */
.L0101:	movb	$0, (%rax)		# store terminating NUL byte
	ret
ARCHEND(__stpcpy, baseline)

	/* Empty .note.GNU-stack section: no executable stack is requested. */
	.section .note.GNU-stack,"",%progbits