/*
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4, 0x90

/*
 * strlcpy for amd64 -- AT&T syntax, run through the C preprocessor.
 *
 * Both variants below use the same register interface:
 *
 *   In:   rdi = destination buffer
 *         rsi = NUL-terminated source string
 *         rdx = size of the destination buffer in bytes
 *   Out:  rax = length of the source string (not counting its NUL)
 *
 * At most rdx - 1 bytes are copied and the destination is NUL-terminated
 * whenever rdx is nonzero.  If rdx is zero the destination is not touched.
 *
 * strlcpy is a weak alias for __strlcpy.  The ARCHFUNCS/ARCHFUNC/ARCHENTRY
 * macros come from amd64_archlevel.h; presumably they build the table from
 * which one of the variants listed here is selected -- see that header.
 */
	.weak	strlcpy
	.set	strlcpy, __strlcpy
ARCHFUNCS(__strlcpy)
	ARCHFUNC(__strlcpy, scalar)
	ARCHFUNC(__strlcpy, baseline)
ENDARCHFUNCS(__strlcpy)

/*
 * Scalar variant: strlen(src) followed by memcpy of min(len, size - 1) bytes.
 *
 * rbx (callee-saved) carries the source length across the memcpy call.
 * The source pointer is parked in the frame at -8(%rbp), the first slot
 * pushed after the frame is established.  With the frame pointer plus four
 * further pushes, rsp is 16-byte aligned at the strlen call (given the usual
 * alignment on entry); after the two pops it is aligned again for memcpy.
 */
ARCHENTRY(__strlcpy, scalar)
	push	%rbp			# establish stack frame
	mov	%rsp, %rbp
	push	%rsi			# src, reloaded later from -8(%rbp)
	push	%rbx			# callee-saved, used for the return value
	push	%rdi
	push	%rdx
	mov	%rsi, %rdi
	call	CNAME(strlen)		# strlen(src)
	pop	%rdx
	pop	%rdi
	mov	-8(%rbp), %rsi
	mov	%rax, %rbx		# remember string length for return value
	sub	$1, %rdx		# do not copy into the final byte of the buffer
	jc	0f			# skip copying altogether if buffer was empty
	cmp	%rax, %rdx		# is the buffer longer than the input?
	cmova	%rax, %rdx		# if yes, only copy the part that fits
	movb	$0, (%rdi, %rdx, 1)	# NUL-terminate output buffer
	call	CNAME(memcpy)		# copy string to output
0:	mov	%rbx, %rax		# restore return value
	pop	%rbx
	leave				# also discards the saved src slot
	ret
ARCHEND(__strlcpy, scalar)

/*
 * Baseline variant: processes the source in 16-byte chunks using SSE2
 * compares (pcmpeqb/pmovmskb).
 *
 * Register roles once past the initial length check:
 *   r9   original (possibly unaligned) source pointer; used for the final
 *        length computation and for the short-string copies
 *   rsi  source pointer rounded down to a 16-byte boundary, advanced as
 *        chunks are consumed
 *   ecx  misalignment of the source (src & 0xf)
 *   rdx  remaining room in the buffer (exact meaning noted per section)
 *   rdi  destination pointer; inside the main loop it instead holds the
 *        distance from rsi to the matching destination address, so that
 *        (%rsi, %rdi) addresses the destination
 *   xmm1 zero register / NUL comparison result
 *
 * Aligned loads may read bytes before the string or past its terminator,
 * but only within a 16-byte aligned chunk that holds string bytes.  A chunk
 * beyond the current one is loaded only after the current one was found to
 * contain no NUL.
 *
 * Every tzcnt below is given a nonzero operand (guarded by a jnz or made
 * nonzero by a preceding bts).  On processors lacking tzcnt the encoding
 * executes as bsf, which yields the same result for nonzero operands.
 *
 * Writes rax, rcx, rdx, rsi, rdi, r8, r9, xmm0-xmm3 and the flags.  No stack
 * is used; the zero-size case tail-calls strlen.
 */
ARCHENTRY(__strlcpy, baseline)
	sub	$1, %rdx		# do not count NUL byte in buffer length
	jb	.L0			# go to special code path if len was 0

	mov	%esi, %ecx
	pxor	%xmm1, %xmm1
	mov	%rsi, %r9		# stash a copy of the source pointer for later
	and	$~0xf, %rsi
	pcmpeqb	(%rsi), %xmm1		# NUL found in head?
	mov	$-1, %r8d
	and	$0xf, %ecx
	shl	%cl, %r8d		# mask of bytes in the string
	pmovmskb %xmm1, %eax
	and	%r8d, %eax		# ignore matches in front of the string
	jnz	.Lhead_nul

	movdqa	16(%rsi), %xmm3		# load second string chunk
	movdqu	(%r9), %xmm2		# load unaligned string head
	mov	$32, %r8d
	sub	%ecx, %r8d		# head length + length of second chunk
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm3, %xmm1		# NUL found in second chunk?

	sub	%r8, %rdx		# enough space left for the second chunk?
	jbe	.Lhead_buf_end

	/* process second chunk */
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	.Lsecond_nul

	/* string didn't end in second chunk and neither did buffer -- not a runt! */
	movdqa	32(%rsi), %xmm0		# load next string chunk
	pxor	%xmm1, %xmm1
	movdqu	%xmm2, (%rdi)		# deposit head into buffer
	sub	%rcx, %rdi		# adjust RDI to correspond to RSI
	movdqu	%xmm3, 16(%rdi)		# deposit second chunk
	sub	%rsi, %rdi		# express RDI as distance from RSI
	add	$32, %rsi		# advance RSI past first two chunks
	sub	$16, %rdx		# enough left for another round?
	jbe	1f

	/*
	 * main loop unrolled twice
	 *
	 * On entry at 0: xmm0 holds the chunk at (%rsi), xmm1 is zero and
	 * rdx is the room left in the buffer counted from that chunk, minus
	 * 16 (always > 0 here, so a full chunk can be stored).
	 */
	ALIGN_TEXT
0:	pcmpeqb	%xmm0, %xmm1		# NUL byte encountered?
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	3f

	movdqu	%xmm0, (%rsi, %rdi)
	movdqa	16(%rsi), %xmm0		# load next string chunk
	pxor	%xmm1, %xmm1
	cmp	$16, %rdx		# more than a full chunk left?
	jbe	2f

	add	$32, %rsi		# advance pointers to next chunk
	pcmpeqb	%xmm0, %xmm1		# NUL byte encountered?
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	4f

	movdqu	%xmm0, -16(%rsi, %rdi)
	movdqa	(%rsi), %xmm0		# load next string chunk
	pxor	%xmm1, %xmm1
	sub	$32, %rdx
	ja	0b

1:	sub	$16, %rsi		# undo second advancement
	add	$16, %edx		# 32-bit add also clears the upper half

	/*
	 * 1--16 bytes left in the buffer but string has not ended yet
	 *
	 * Here xmm0 holds the chunk at 16(%rsi) and edx (1..16) is the room
	 * left for it.  The tail is copied with one unaligned 16-byte move
	 * that ends on the last byte to copy; it overlaps bytes that were
	 * already stored.
	 */
2:	pcmpeqb	%xmm1, %xmm0		# NUL byte encountered?
	pmovmskb %xmm0, %r8d
	mov	%r8d, %eax		# keep the pure NUL mask for later
	bts	%edx, %r8d		# treat end of buffer as end of string
	tzcnt	%r8d, %r8d		# find tail length
	add	%rsi, %rdi		# restore RDI
	movdqu	(%rsi, %r8, 1), %xmm0	# load string tail
	movdqu	%xmm0, (%rdi, %r8, 1)	# store string tail
	movb	$0, 16(%rdi, %r8, 1)	# NUL terminate

	/* continue to find the end of the string */
	test	%eax, %eax		# end of string already reached?
	jnz	1f

	ALIGN_TEXT
0:	pcmpeqb	32(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	pxor	%xmm1, %xmm1
	test	%eax, %eax
	jnz	2f

	pcmpeqb	48(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	add	$32, %rsi
	pxor	%xmm1, %xmm1
	test	%eax, %eax
	jz	0b

1:	sub	$16, %rsi		# undo second advancement
2:	tzcnt	%eax, %eax		# where is the NUL byte?
	sub	%r9, %rsi
	lea	32(%rsi, %rax, 1), %rax	# return string length
	ret

4:	sub	$16, %rsi		# undo second advancement
	add	$16, %rdx		# restore number of remaining bytes

	/*
	 * string has ended but buffer has not
	 *
	 * xmm0's chunk lives at (%rsi); copy the 16 bytes ending at the NUL.
	 */
3:	tzcnt	%eax, %eax		# find length of string tail
	movdqu	-15(%rsi, %rax, 1), %xmm0 # load string tail (incl. NUL)
	add	%rsi, %rdi		# restore destination pointer
	movdqu	%xmm0, -15(%rdi, %rax, 1) # store string tail (incl. NUL)
	sub	%r9, %rsi		# string length to current chunk
	add	%rsi, %rax		# plus length of current chunk
	ret

	/*
	 * The buffer ends within the head or the second chunk.  Terminate
	 * the output now, then scan on for the end of the string to obtain
	 * the return value, and finally copy via the short-copy code.
	 */
.Lhead_buf_end:
	pmovmskb %xmm1, %r8d
	add	$32, %edx		# restore edx to (len-1) + ecx
	mov	%r8d, %eax
	shl	$16, %r8d		# place 2nd chunk NUL mask into bits 16--31
	bts	%rdx, %r8		# treat end of buffer as end of string
	tzcnt	%r8, %rdx		# find string/buffer len from alignment boundary
	sub	%ecx, %edx		# find actual string/buffer len
	movb	$0, (%rdi, %rdx, 1)	# write NUL terminator

	/* continue to find the end of the string */
	test	%eax, %eax		# end of string already reached?
	jnz	1f

	/* xmm1 is all zero here because the second chunk had no NUL */
	ALIGN_TEXT
0:	pcmpeqb	32(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	pxor	%xmm1, %xmm1
	test	%eax, %eax
	jnz	2f

	pcmpeqb	48(%rsi), %xmm1
	pmovmskb %xmm1, %eax
	add	$32, %rsi
	pxor	%xmm1, %xmm1
	test	%eax, %eax
	jz	0b

1:	sub	$16, %rsi
2:	tzcnt	%eax, %eax
	sub	%r9, %rsi
	lea	32(%rsi, %rax, 1), %rax	# return string length
	jmp	.L0031

.Lsecond_nul:
	add	%r8, %rdx		# restore buffer length
	tzcnt	%eax, %eax		# where is the NUL byte?
	lea	-16(%rcx), %r8d
	sub	%r8d, %eax		# string length
	cmp	%rax, %rdx		# is the string shorter than the buffer?
	cmova	%rax, %rdx		# copy only min(buflen, srclen) bytes
	movb	$0, (%rdi, %rdx, 1)	# write NUL terminator

	/*
	 * Short copies: rdx = number of bytes to copy, rax = srclen, r9 =
	 * source, rdi = destination, terminator already written.  Each size
	 * class copies its first and last N bytes, which may overlap.
	 */
.L0031:	cmp	$16, %rdx		# at least 16 bytes to copy (not incl NUL)?
	jb	.L0015

	/* copy 16--31 bytes (the same two moves also cover exactly 32) */
	movdqu	(%r9), %xmm0		# load first 16 bytes
	movdqu	-16(%r9, %rdx, 1), %xmm1 # load last 16 bytes
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx, 1)
	ret

.Lhead_nul:
	tzcnt	%eax, %eax		# where is the NUL byte?
	sub	%ecx, %eax		# ... from the beginning of the string?
	cmp	%rax, %rdx		# is the string shorter than the buffer?
	cmova	%rax, %rdx		# copy only min(buflen, srclen) bytes
	movb	$0, (%rdi, %rdx, 1)	# write NUL terminator

	/* process strings of 0--15 bytes (rdx: min(buflen, srclen), rax: srclen) */
.L0015:	cmp	$8, %rdx		# at least 8 bytes to copy?
	jae	.L0815

	cmp	$4, %rdx		# at least 4 bytes to copy?
	jae	.L0407

	cmp	$2, %rdx		# at least 2 bytes to copy?
	jae	.L0203

	/* 0 or 1 bytes: the unconditional byte store may hit the NUL slot */
	movzbl	(%r9), %ecx		# load first byte from src
	mov	%cl, (%rdi)		# deposit into destination
	movb	$0, (%rdi, %rdx, 1)	# add NUL terminator (again)
	ret

.L0203:	movzwl	(%r9), %ecx
	movzwl	-2(%r9, %rdx, 1), %esi
	mov	%cx, (%rdi)
	mov	%si, -2(%rdi, %rdx, 1)
	ret

.L0407:	mov	(%r9), %ecx
	mov	-4(%r9, %rdx, 1), %esi
	mov	%ecx, (%rdi)
	mov	%esi, -4(%rdi, %rdx, 1)
	ret

.L0815:	mov	(%r9), %rcx
	mov	-8(%r9, %rdx, 1), %rsi
	mov	%rcx, (%rdi)
	mov	%rsi, -8(%rdi, %rdx, 1)
	ret

	/* length zero destination: just return the string length */
.L0:	mov	%rsi, %rdi
	jmp	CNAME(strlen)		# tail call; strlen returns to our caller
ARCHEND(__strlcpy, baseline)

	.section .note.GNU-stack,"",%progbits