/*
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE
 */

/*
 * NOTE(review): the operand of this #include was missing in the reviewed
 * text; <machine/asm.h> is assumed (CNAME() is used below) -- confirm.
 */
#include <machine/asm.h>

#include "amd64_archlevel.h"

/* align to 16 bytes, padding with NOP (0x90) */
#define ALIGN_TEXT	.p2align 4, 0x90

/*
 * size_t strlcpy(char *dst, const char *src, size_t len)
 *
 * Syntax: AT&T (GAS), preprocessed.  ABI: System V AMD64.
 * strlcpy is a weak alias of __strlcpy, for which two implementations
 * (scalar and baseline) are registered below.
 */
	.weak	strlcpy
	.set	strlcpy, __strlcpy
ARCHFUNCS(__strlcpy)
	ARCHFUNC(__strlcpy, scalar)
	ARCHFUNC(__strlcpy, baseline)
ENDARCHFUNCS(__strlcpy)

/*
 * __strlcpy, scalar variant: strlen() followed by memcpy().
 *
 * In:	%rdi = dst, %rsi = src, %rdx = len (size of the dst buffer)
 * Out:	%rax = strlen(src)
 *
 * Copies min(strlen(src), len - 1) bytes and NUL-terminates dst unless
 * len is 0, in which case dst is not touched.
 *
 * Stack: %rbp frame; src is kept at -8(%rbp) and the string length in
 * the callee-saved %rbx across the memcpy() call.  Counting the pushes,
 * %rsp is 16-byte aligned at both call sites.
 */
ARCHENTRY(__strlcpy, scalar)
	push	%rbp			# establish stack frame
	mov	%rsp, %rbp
	push	%rsi			# src, reloaded from -8(%rbp) below
	push	%rbx			# callee-saved, holds the return value
	push	%rdi
	push	%rdx
	mov	%rsi, %rdi
	call	CNAME(strlen)		# strlen(src)
	pop	%rdx
	pop	%rdi
	mov	-8(%rbp), %rsi
	mov	%rax, %rbx		# remember string length for return value
	sub	$1, %rdx		# do not copy into the final byte of the buffer
	jc	0f			# skip copying altogether if buffer was empty
	cmp	%rax, %rdx		# is the buffer longer than the input?
	cmova	%rax, %rdx		# if yes, only copy the part that fits
	movb	$0, (%rdi, %rdx, 1)	# NUL-terminate output buffer
	call	CNAME(memcpy)		# copy string to output
0:	mov	%rbx, %rax		# restore return value
	pop	%rbx
	leave
	ret
ARCHEND(__strlcpy, scalar)

/*
 * __strlcpy, baseline variant: processes the string in 16-byte chunks
 * using XMM registers.
 *
 * In:	%rdi = dst, %rsi = src, %rdx = len (size of the dst buffer)
 * Out:	%rax = strlen(src)
 *
 * Register roles:
 *	%r9	original src pointer
 *	%rsi	src rounded down to a 16-byte boundary, then chunk cursor
 *	%ecx	src & 0xf, the misalignment of src
 *	%rdx	buffer space left, not counting the byte kept for the NUL
 *	%rdi	dst; inside the main loop the distance dst - src, so that
 *		(%rsi, %rdi) addresses the destination of the current chunk
 *	%xmm1	zeroed before each pcmpeqb that looks for NUL bytes
 *
 * Layout: head (first, possibly partial chunk) and second chunk are
 * examined first.  If the string or the buffer ends within them, the
 * copy is done by the "runt" code at .L0031/.L0015 with overlapping
 * moves.  Otherwise the main loop copies whole chunks until either the
 * string ends (label 3) or the buffer is exhausted (label 2); in the
 * latter case the rest of the string is only scanned for its length.
 */
ARCHENTRY(__strlcpy, baseline)
	sub	$1, %rdx		# do not count NUL byte in buffer length
	jb	.L0			# go to special code path if len was 0

	mov	%esi, %ecx
	pxor	%xmm1, %xmm1
	mov	%rsi, %r9		# stash a copy of the source pointer for later
	and	$~0xf, %rsi
	pcmpeqb	(%rsi), %xmm1		# NUL found in head?
	mov	$-1, %r8d
	and	$0xf, %ecx
	shl	%cl, %r8d		# mask of bytes in the string
	pmovmskb %xmm1, %eax
	and	%r8d, %eax		# ignore matches in front of the string
	jnz	.Lhead_nul

	movdqa	16(%rsi), %xmm3		# load second string chunk
	movdqu	(%r9), %xmm2		# load unaligned string head
	mov	$32, %r8d
	sub	%ecx, %r8d		# head length + length of second chunk
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm3, %xmm1		# NUL found in second chunk?

	sub	%r8, %rdx		# enough space left for the second chunk?
	jbe	.Lhead_buf_end

	/* process second chunk */
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	.Lsecond_nul

	/* string didn't end in second chunk and neither did buffer -- not a runt! */
	movdqa	32(%rsi), %xmm0		# load next string chunk
	pxor	%xmm1, %xmm1
	movdqu	%xmm2, (%rdi)		# deposit head into buffer
	sub	%rcx, %rdi		# adjust RDI to correspond to RSI
	movdqu	%xmm3, 16(%rdi)		# deposit second chunk
	sub	%rsi, %rdi		# express RDI as distance from RSI
	add	$32, %rsi		# advance RSI past first two chunks
	sub	$16, %rdx		# enough left for another round?
	jbe	1f

	/*
	 * main loop unrolled twice
	 *
	 * At 0: %xmm0 holds the chunk at (%rsi), %xmm1 is zero and %rdx
	 * is the buffer space that remains after this chunk (> 0).
	 */
	ALIGN_TEXT
0:	pcmpeqb	%xmm0, %xmm1		# NUL byte encountered?
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	3f

	movdqu	%xmm0, (%rsi, %rdi)
	movdqa	16(%rsi), %xmm0		# load next string chunk
	pxor	%xmm1, %xmm1
	cmp	$16, %rdx		# more than a full chunk left?
	jbe	2f

	add	$32, %rsi		# advance pointers to next chunk
	pcmpeqb	%xmm0, %xmm1		# NUL byte encountered?
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	4f

	movdqu	%xmm0, -16(%rsi, %rdi)
	movdqa	(%rsi), %xmm0		# load next string chunk
	pxor	%xmm1, %xmm1
	sub	$32, %rdx
	ja	0b

1:	sub	$16, %rsi		# undo second advancement
	add	$16, %edx		# space left (1--16); 32 bit op clears the borrow

	/*
	 * 1--16 bytes left in the buffer but string has not ended yet
	 *
	 * Here %xmm0 holds the chunk at 16(%rsi) and %edx the space left.
	 * The tail is copied with one 16-byte move that ends at the last
	 * byte to be copied and overlaps bytes stored earlier.
	 */
2:	pcmpeqb	%xmm1, %xmm0		# NUL byte encountered?
	pmovmskb %xmm0, %r8d
	mov	%r8d, %eax		# keep the unmodified NUL mask
	bts	%edx, %r8d		# treat end of buffer as end of string
	tzcnt	%r8d, %r8d		# find tail length
	add	%rsi, %rdi		# restore RDI
	movdqu	(%rsi, %r8, 1), %xmm0	# load string tail
	movdqu	%xmm0, (%rdi, %r8, 1)	# store string tail
	movb	$0, 16(%rdi, %r8, 1)	# NUL terminate

	/* continue to find the end of the string */
	test	%eax, %eax		# end of string already reached?
	jnz	1f

	/* scan two chunks per iteration; nothing is copied any more */
	ALIGN_TEXT
0:	pcmpeqb	32(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	pxor	%xmm1, %xmm1
	test	%eax, %eax
	jnz	2f

	pcmpeqb	48(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	add	$32, %rsi
	pxor	%xmm1, %xmm1
	test	%eax, %eax
	jz	0b

1:	sub	$16, %rsi		# undo second advancement
2:	tzcnt	%eax, %eax		# where is the NUL byte?
	sub	%r9, %rsi
	lea	32(%rsi, %rax, 1), %rax	# return string length
	ret

4:	sub	$16, %rsi		# undo second advancement
	add	$16, %rdx		# restore number of remaining bytes

	/* string has ended but buffer has not */
3:	tzcnt	%eax, %eax		# find length of string tail
	movdqu	-15(%rsi, %rax, 1), %xmm0 # load string tail (incl. NUL)
	add	%rsi, %rdi		# restore destination pointer
	movdqu	%xmm0, -15(%rdi, %rax, 1) # store string tail (incl. NUL)
	sub	%r9, %rsi		# string length to current chunk
	add	%rsi, %rax		# plus length of current chunk
	ret

	/*
	 * The buffer ends within the head or the second chunk.  Determine
	 * min(srclen, len - 1), terminate the buffer, then scan on for the
	 * end of the string before copying at .L0031.
	 */
.Lhead_buf_end:
	pmovmskb %xmm1, %r8d
	add	$32, %edx		# restore edx to (len-1) + ecx
	mov	%r8d, %eax		# keep the unmodified NUL mask
	shl	$16, %r8d		# place 2nd chunk NUL mask into bits 16--31
	bts	%rdx, %r8		# treat end of buffer as end of string
	tzcnt	%r8, %rdx		# find string/buffer len from alignment boundary
	sub	%ecx, %edx		# find actual string/buffer len
	movb	$0, (%rdi, %rdx, 1)	# write NUL terminator

	/* continue to find the end of the string */
	test	%eax, %eax		# end of string already reached?
	jnz	1f

	/* scan two chunks per iteration; %xmm1 is zero on entry */
	ALIGN_TEXT
0:	pcmpeqb	32(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	pxor	%xmm1, %xmm1
	test	%eax, %eax
	jnz	2f

	pcmpeqb	48(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	add	$32, %rsi
	pxor	%xmm1, %xmm1
	test	%eax, %eax
	jz	0b

1:	sub	$16, %rsi		# undo second advancement
2:	tzcnt	%eax, %eax		# where is the NUL byte?
	sub	%r9, %rsi
	lea	32(%rsi, %rax, 1), %rax	# return string length
	jmp	.L0031

	/* the string ends in the second chunk and the buffer reaches beyond it */
.Lsecond_nul:
	add	%r8, %rdx		# restore buffer length
	tzcnt	%eax, %eax		# where is the NUL byte?
	lea	-16(%rcx), %r8d
	sub	%r8d, %eax		# string length
	cmp	%rax, %rdx		# is the string shorter than the buffer?
	cmova	%rax, %rdx		# copy only min(buflen, srclen) bytes
	movb	$0, (%rdi, %rdx, 1)	# write NUL terminator

	/* copy rdx bytes from the string start at r9; rax: srclen */
.L0031:	cmp	$16, %rdx		# at least 16 bytes to copy (not incl NUL)?
	jb	.L0015

	/* copy 16--32 bytes with two possibly overlapping 16-byte moves */
	movdqu	(%r9), %xmm0		# load first 16 bytes
	movdqu	-16(%r9, %rdx, 1), %xmm1 # load last 16 bytes
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx, 1)
	ret

	/* the string ends within the head */
.Lhead_nul:
	tzcnt	%eax, %eax		# where is the NUL byte?
	sub	%ecx, %eax		# ... from the beginning of the string?
	cmp	%rax, %rdx		# is the string shorter than the buffer?
	cmova	%rax, %rdx		# copy only min(buflen, srclen) bytes
	movb	$0, (%rdi, %rdx, 1)	# write NUL terminator

	/* process strings of 0--15 bytes (rdx: min(buflen, srclen), rax: srclen) */
.L0015:	cmp	$8, %rdx		# at least 8 bytes to copy?
	jae	.L0815

	cmp	$4, %rdx		# at least 4 bytes to copy?
	jae	.L0407

	cmp	$2, %rdx		# at least 2 bytes to copy?
	jae	.L0203

	/* 0--1 bytes: the byte store clobbers the NUL if rdx is 0 */
	movzbl	(%r9), %ecx		# load first byte from src
	mov	%cl, (%rdi)		# deposit into destination
	movb	$0, (%rdi, %rdx, 1)	# add NUL terminator (again)
	ret

	/* 2--3 bytes: first and last word, possibly overlapping */
.L0203:	movzwl	(%r9), %ecx
	movzwl	-2(%r9, %rdx, 1), %esi
	mov	%cx, (%rdi)
	mov	%si, -2(%rdi, %rdx, 1)
	ret

	/* 4--7 bytes: first and last dword, possibly overlapping */
.L0407:	mov	(%r9), %ecx
	mov	-4(%r9, %rdx, 1), %esi
	mov	%ecx, (%rdi)
	mov	%esi, -4(%rdi, %rdx, 1)
	ret

	/* 8--15 bytes: first and last qword, possibly overlapping */
.L0815:	mov	(%r9), %rcx
	mov	-8(%r9, %rdx, 1), %rsi
	mov	%rcx, (%rdi)
	mov	%rsi, -8(%rdi, %rdx, 1)
	ret

	/* length zero destination: just return the string length */
.L0:	mov	%rsi, %rdi
	jmp	CNAME(strlen)		# tail call, strlen(src) is the result
ARCHEND(__strlcpy, baseline)

	.section .note.GNU-stack,"",%progbits