/*
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4, 0x90

/*
 * size_t strlcpy(char *dst, const char *src, size_t size)
 *
 * AT&T syntax, System V AMD64 calling convention.
 * In:   rdi = dst, rsi = src, rdx = size of the dst buffer
 * Out:  rax = strlen(src), no matter how much of it was copied
 *
 * At most size - 1 bytes are copied and a NUL is stored behind them.
 * With size == 0 nothing is stored at all.
 *
 * strlcpy is a weak alias of __strlcpy.  Two variants are defined below:
 * "scalar", which is built on strlen and memcpy, and "baseline", which
 * works on 16 byte XMM chunks.  NOTE(review): the ARCHFUNCS/ARCHFUNC
 * macros come from amd64_archlevel.h and presumably pick one variant by
 * CPU capability -- confirm against that header.
 */
	.weak	strlcpy
	.set	strlcpy, __strlcpy
ARCHFUNCS(__strlcpy)
	ARCHFUNC(__strlcpy, scalar)
	ARCHFUNC(__strlcpy, baseline)
ENDARCHFUNCS(__strlcpy)

/*
 * Scalar variant: strlen(src), then memcpy of min(size - 1, strlen(src))
 * bytes.  rbx (callee saved) carries the length across the memcpy call;
 * the four pushes behind the frame setup keep rsp 16 byte aligned at
 * both call sites.
 */
ARCHENTRY(__strlcpy, scalar)
	push	%rbp			# frame pointer; src lands at -8(%rbp)
	mov	%rsp, %rbp
	push	%rsi
	push	%rbx
	push	%rdi
	push	%rdx
	mov	%rsi, %rdi
	call	CNAME(strlen)		# rax = strlen(src)
	pop	%rdx
	pop	%rdi
	mov	-8(%rbp), %rsi		# reload src from its stack slot
	mov	%rax, %rbx		# keep the length, it is the return value
	sub	$1, %rdx		# reserve the last buffer byte for the NUL
	jc	.Lscalar_done		# size was 0: nothing may be stored
	cmp	%rax, %rdx		# room for more than the whole string?
	cmova	%rax, %rdx		# then copy only strlen(src) bytes
	movb	$0, (%rdi, %rdx, 1)	# terminate the output
	call	CNAME(memcpy)		# memcpy(dst, src, rdx)
.Lscalar_done:
	mov	%rbx, %rax		# return strlen(src)
	pop	%rbx
	leave
	ret
ARCHEND(__strlcpy, scalar)

/*
 * Baseline variant.  Register roles once the prologue has run:
 *   r9   original src pointer
 *   rsi  16 byte aligned cursor into the source
 *   rcx  misalignment of src (0--15)
 *   rdi  dst; inside the main loop the distance dst - src instead
 *   rdx  buffer bytes still available, not counting the NUL
 *   xmm1 zero between compares, compare result otherwise
 */
ARCHENTRY(__strlcpy, baseline)
	sub	$1, %rdx		# rdx = size - 1, the byte for the NUL is set aside
	jb	.Lzero_len		# size was 0: only the length is wanted

	mov	%esi, %ecx
	pxor	%xmm1, %xmm1
	mov	%rsi, %r9		# r9 keeps the unmodified src
	and	$~0xf, %rsi		# rsi = src rounded down to 16 bytes
	pcmpeqb	(%rsi), %xmm1		# compare the aligned head chunk with zero
	mov	$-1, %r8d
	and	$0xf, %ecx		# ecx = offset of src within the head chunk
	shl	%cl, %r8d		# r8d selects the bytes at or behind src
	pmovmskb %xmm1, %eax
	and	%r8d, %eax		# drop matches in front of the string
	jnz	.Lhead_nul

	movdqa	16(%rsi), %xmm3		# xmm3 = second aligned chunk
	movdqu	(%r9), %xmm2		# xmm2 = first 16 bytes of the string
	mov	$32, %r8d
	sub	%ecx, %r8d		# r8 = string bytes in head plus second chunk
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm3, %xmm1		# look for a NUL in the second chunk

	sub	%r8, %rdx		# does the buffer reach past the second chunk?
	jbe	.Lhead_buf_end

	/* the buffer reaches past the second chunk; does the string? */
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	.Lsecond_nul

	/* neither the string nor the buffer ended within the first two chunks */
	movdqa	32(%rsi), %xmm0		# xmm0 = third chunk
	pxor	%xmm1, %xmm1
	movdqu	%xmm2, (%rdi)		# store the head
	sub	%rcx, %rdi		# step rdi back so that it matches rsi
	movdqu	%xmm3, 16(%rdi)		# store the second chunk
	sub	%rsi, %rdi		# rdi = distance, (%rsi, %rdi) addresses dst
	add	$32, %rsi		# rsi -> third chunk
	sub	$16, %rdx		# does the buffer reach past this chunk?
	jbe	.Lbuf_tail_adj

	/* main copy loop, two chunks per iteration */
	ALIGN_TEXT
.Lcopy_loop:
	pcmpeqb	%xmm0, %xmm1		# NUL in the chunk at (%rsi)?
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	.Lstr_end

	movdqu	%xmm0, (%rsi, %rdi)
	movdqa	16(%rsi), %xmm0		# fetch the following chunk
	pxor	%xmm1, %xmm1
	cmp	$16, %rdx		# does the buffer reach past that one too?
	jbe	.Lbuf_tail

	add	$32, %rsi		# step two chunks ahead
	pcmpeqb	%xmm0, %xmm1		# NUL in the chunk at -16(%rsi)?
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	.Lstr_end_adj

	movdqu	%xmm0, -16(%rsi, %rdi)
	movdqa	(%rsi), %xmm0		# fetch the following chunk
	pxor	%xmm1, %xmm1
	sub	$32, %rdx
	ja	.Lcopy_loop

.Lbuf_tail_adj:
	sub	$16, %rsi		# xmm0 now is the chunk at 16(%rsi)
	add	$16, %edx		# edx = buffer bytes left for that chunk

	/* the buffer ends within the chunk in xmm0, the string not before it */
.Lbuf_tail:
	pcmpeqb	%xmm1, %xmm0		# xmm1 is zero: mark the NUL bytes in xmm0
	pmovmskb %xmm0, %r8d
	mov	%r8d, %eax		# eax = NUL mask alone, tested further down
	bts	%edx, %r8d		# the end of the buffer stops the copy as well
	tzcnt	%r8d, %r8d		# r8 = offset of the first stop in the chunk
	add	%rsi, %rdi		# turn rdi back into a pointer
	movdqu	(%rsi, %r8, 1), %xmm0	# the 16 bytes in front of the stop
	movdqu	%xmm0, (%rdi, %r8, 1)
	movb	$0, 16(%rdi, %r8, 1)	# terminate the output at the stop

	/* copying is finished; keep scanning for the end of the string */
	test	%eax, %eax		# did that chunk hold the NUL?
	jnz	.Lscan_adj

	ALIGN_TEXT
.Lscan_loop:
	pcmpeqb	32(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	pxor	%xmm1, %xmm1
	test	%eax, %eax
	jnz	.Lscan_done

	pcmpeqb	48(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	add	$32, %rsi
	pxor	%xmm1, %xmm1
	test	%eax, %eax
	jz	.Lscan_loop

.Lscan_adj:
	sub	$16, %rsi		# NUL chunk is at 16(%rsi); bias for the lea below
.Lscan_done:
	tzcnt	%eax, %eax		# offset of the NUL within its chunk
	sub	%r9, %rsi
	lea	32(%rsi, %rax, 1), %rax	# rax = strlen(src)
	ret

.Lstr_end_adj:
	sub	$16, %rsi		# rsi -> chunk holding the NUL
	add	$16, %rdx		# rdx is not read again on this path

	/* the string ends in the chunk at (%rsi) and the buffer has room for it */
.Lstr_end:
	tzcnt	%eax, %eax		# offset of the NUL within the chunk
	movdqu	-15(%rsi, %rax, 1), %xmm0 # 16 bytes ending with the NUL
	add	%rsi, %rdi		# turn rdi back into a pointer
	movdqu	%xmm0, -15(%rdi, %rax, 1) # store them, NUL included
	sub	%r9, %rsi		# bytes between src and this chunk
	add	%rsi, %rax		# plus the NUL offset: strlen(src)
	ret

	/* the buffer ends within the first two chunks */
.Lhead_buf_end:
	pmovmskb %xmm1, %r8d		# NUL mask of the second chunk
	add	$32, %edx		# edx = (size - 1) + ecx again
	mov	%r8d, %eax		# eax = NUL mask alone, tested further down
	shl	$16, %r8d		# second chunk covers bits 16--31
	bts	%rdx, %r8		# the end of the buffer stops the copy as well
	tzcnt	%r8, %rdx		# first stop, counted from the alignment boundary
	sub	%ecx, %edx		# rdx = bytes to copy, counted from src
	movb	$0, (%rdi, %rdx, 1)	# terminate the output

	/* keep scanning for the end of the string */
	test	%eax, %eax		# did the second chunk hold the NUL?
	jnz	.Lrunt_scan_adj

	ALIGN_TEXT
.Lrunt_scan_loop:
	pcmpeqb	32(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	pxor	%xmm1, %xmm1
	test	%eax, %eax
	jnz	.Lrunt_scan_done

	pcmpeqb	48(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	add	$32, %rsi
	pxor	%xmm1, %xmm1
	test	%eax, %eax
	jz	.Lrunt_scan_loop

.Lrunt_scan_adj:
	sub	$16, %rsi		# NUL chunk is at 16(%rsi); bias for the lea below
.Lrunt_scan_done:
	tzcnt	%eax, %eax		# offset of the NUL within its chunk
	sub	%r9, %rsi
	lea	32(%rsi, %rax, 1), %rax	# rax = strlen(src)
	jmp	.L0031			# rdx bytes are still to be copied

	/* the string ends in the second chunk, the buffer reaches past it */
.Lsecond_nul:
	add	%r8, %rdx		# rdx = size - 1 again
	tzcnt	%eax, %eax		# offset of the NUL in the second chunk
	lea	-16(%rcx), %r8d
	sub	%r8d, %eax		# rax = strlen(src)
	cmp	%rax, %rdx		# buffer larger than the string?
	cmova	%rax, %rdx		# rdx = min(size - 1, strlen(src))
	movb	$0, (%rdi, %rdx, 1)	# terminate the output
.L0031:
	cmp	$16, %rdx		# fewer than 16 bytes to copy?
	jb	.L0015

	/* 16 bytes or more: two 16 byte moves that may overlap */
	movdqu	(%r9), %xmm0		# first 16 bytes
	movdqu	-16(%r9, %rdx, 1), %xmm1 # last 16 bytes
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx, 1)
	ret

	/* the string ends in the head chunk */
.Lhead_nul:
	tzcnt	%eax, %eax		# offset of the NUL in the head chunk
	sub	%ecx, %eax		# rax = strlen(src)
	cmp	%rax, %rdx		# buffer larger than the string?
	cmova	%rax, %rdx		# rdx = min(size - 1, strlen(src))
	movb	$0, (%rdi, %rdx, 1)	# terminate the output

	/* copy rdx < 16 bytes; rax already holds strlen(src) */
.L0015:
	cmp	$8, %rdx
	jae	.L0815

	cmp	$4, %rdx
	jae	.L0407

	cmp	$2, %rdx
	jae	.L0203

	/* 0 or 1 bytes */
	movzbl	(%r9), %ecx		# first source byte
	mov	%cl, (%rdi)
	movb	$0, (%rdi, %rdx, 1)	# redo the NUL: with rdx == 0 it was just overwritten
	ret

	/* 2--3 bytes: first and last word, possibly overlapping */
.L0203:
	movzwl	(%r9), %ecx
	movzwl	-2(%r9, %rdx, 1), %esi
	mov	%cx, (%rdi)
	mov	%si, -2(%rdi, %rdx, 1)
	ret

	/* 4--7 bytes: first and last dword, possibly overlapping */
.L0407:
	mov	(%r9), %ecx
	mov	-4(%r9, %rdx, 1), %esi
	mov	%ecx, (%rdi)
	mov	%esi, -4(%rdi, %rdx, 1)
	ret

	/* 8--15 bytes: first and last qword, possibly overlapping */
.L0815:
	mov	(%r9), %rcx
	mov	-8(%r9, %rdx, 1), %rsi
	mov	%rcx, (%rdi)
	mov	%rsi, -8(%rdi, %rdx, 1)
	ret

	/* size was 0: nothing may be stored, the result is just strlen(src) */
.Lzero_len:
	mov	%rsi, %rdi
	jmp	CNAME(strlen)		# tail call, strlen returns to our caller
ARCHEND(__strlcpy, baseline)

	.section .note.GNU-stack,"",%progbits