/*
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4, 0x90

	.weak stpncpy
	.set stpncpy, __stpncpy
ARCHFUNCS(__stpncpy)
	ARCHFUNC(__stpncpy, scalar)
	ARCHFUNC(__stpncpy, baseline)
ENDARCHFUNCS(__stpncpy)

/*
 * char *stpncpy(char *restrict rdi, const char *rsi, size_t rdx)
 *
 * Scalar variant, built from memchr, memcpy and memset.
 *
 * Returns in RAX a pointer to the first NUL byte written to the
 * destination, or dest + len if no NUL byte was found within the
 * first len bytes of the source.
 *
 * Frame layout after the prologue (relative to RBP):
 *	 -8(%rbp)	len	(later: len - strlen(src))
 *	-16(%rbp)	dest	(later: dest + strlen(src))
 *	-24(%rbp)	src
 *	-32(%rbp)	padding keeping RSP 16 byte aligned for the call
 */
ARCHENTRY(__stpncpy, scalar)
	push	%rbp		# establish stack frame
	mov	%rsp, %rbp

	push	%rdx		# save len
	push	%rdi		# save dest
	push	%rsi		# save src
	push	%rax		# dummy push for alignment

	mov	%rsi, %rdi
	xor	%esi, %esi
	call	CNAME(__memchr)	# memchr(src, '\0', len)
	pop	%rcx		# dummy pop
	pop	%rsi		# RSI = src
	mov	-16(%rbp), %rdi	# RDI = dest, slot stays on the stack

	test	%rax, %rax	# NUL found?
	jz	.Lfullcopy

	mov	%rax, %rdx
	sub	%rsi, %rdx	# copy until the NUL byte
	add	%rdx, -16(%rbp)	# advance destination by string length
	sub	%rdx, -8(%rbp)	# and shorten buffer size by string length
	call	CNAME(memcpy)

	pop	%rdi		# RDI = dest + string length
	pop	%rdx		# RDX = len - string length
	xor	%esi, %esi
	pop	%rbp
	jmp	CNAME(memset)	# clear remaining buffer (tail call, its
				# return value becomes our return value)

.Lfullcopy:
	mov	-8(%rbp), %rdx
	call	CNAME(memcpy)	# copy whole string
	add	-8(%rbp), %rax	# point to dest[n]
	leave
	ret
ARCHEND(__stpncpy, scalar)

	/*
	 * this mask allows us to generate masks of 16-n 0xff bytes
	 * followed by n 0x00 bytes by loading from .Lmask+n.
	 */
	.section	.rodata
.Lmask:	.quad		0xffffffffffffffff
	.quad		0xffffffffffffffff
	.quad		0x0000000000000000
	.quad		0x0000000000000000

/*
 * stpncpy(char *restrict rdi, const char *rsi, size_t rdx)
 *
 * SSE2 variant.  This is a leaf routine: it never adjusts RSP and keeps
 * a bounce buffer below the stack pointer, addressed as bounce(%rsp).
 *
 * Register roles once the head has been processed:
 *	RSI	source, rounded down to a 16 byte boundary
 *	RDI	destination, moved back by the same amount so that
 *		(%rdi) always corresponds to (%rsi)
 *	RCX	offset of the original source from that boundary (0--15)
 *	R10	buffer length counted from the alignment boundary (RDX+RCX),
 *		later the number of bytes left beyond the current position
 *	R8	bit mask of NUL bytes found in the current chunk(s)
 *	R9	mask of the bytes of the head chunk that belong to the string
 *	XMM1	all zeroes whenever it is stored to memory
 *
 * Chunks of the source are only loaded if they contain at least one
 * byte of the len byte source buffer.  As the loads are aligned they
 * then cannot touch a page the buffer does not reside on.
 */
ARCHENTRY(__stpncpy, baseline)
#define bounce		(-3*16-8)		/* location of on-stack bounce buffer */

	test		%rdx, %rdx		# no bytes to copy?
	jz		.L0

	mov		%esi, %ecx
	and		$~0xf, %rsi		# align source to 16 bytes
	movdqa		(%rsi), %xmm0		# load head
	and		$0xf, %ecx		# offset from alignment
	mov		$-1, %r9d
	lea		-33(%rcx), %rax		# set up overflow-proof comparison rdx+rcx<=32
	shl		%cl, %r9d		# mask of bytes belonging to the string
	sub		%rcx, %rdi		# adjust RDI to correspond to RSI
	pxor		%xmm1, %xmm1
	movdqa		%xmm0, bounce(%rsp)	# stash copy of head on the stack
	pcmpeqb		%xmm1, %xmm0
	pmovmskb	%xmm0, %r8d

	lea		(%rdx, %rcx, 1), %r10	# buffer length from alignment boundary
	add		%rdx, %rax		# no more than 2 chunks (32 bytes) to play with?
	jnc		.Lrunt			# if yes, use special runt processing

	/*
	 * From here on RDX+RCX >= 33, so the buffer extends into a third
	 * chunk and R10 is at least 1 when the main loop is entered.
	 */
	movdqu		%xmm1, -16(%rdi, %r10, 1) # clear final bytes of destination
	and		%r9d, %r8d		# end of string within head?
	jnz		.Lheadnul

	movdqu		(%rsi, %rcx, 1), %xmm2	# load head from source buffer
	movdqu		%xmm2, (%rdi, %rcx, 1)	# and deposit

	add		$16, %rsi
	add		$16, %rdi
	sub		$32, %r10		# R10 = bytes left beyond the chunk at RSI

	/* main loop unrolled twice */
	ALIGN_TEXT
0:	movdqa		(%rsi), %xmm0
	pxor		%xmm1, %xmm1
	pcmpeqb		%xmm0, %xmm1		# NUL byte encountered?
	pmovmskb	%xmm1, %r8d
	test		%r8d, %r8d
	jnz		3f

	movdqu		%xmm0, (%rdi)
	cmp		$16, %r10		# more than a full chunk left?
	jbe		1f

	movdqa		16(%rsi), %xmm0
	add		$32, %rdi		# advance pointers to next chunk
	add		$32, %rsi
	pxor		%xmm1, %xmm1
	pcmpeqb		%xmm0, %xmm1		# NUL byte encountered?
	pmovmskb	%xmm1, %r8d
	test		%r8d, %r8d
	jnz		2f

	movdqu		%xmm0, -16(%rdi)
	sub		$32, %r10		# more than another full chunk left?
	ja		0b

	sub		$16, %rdi		# undo second advancement
	sub		$16, %rsi
	add		$16, %r10d		# restore number of remaining bytes

	/*
	 * 1--16 bytes left but string has not ended yet.  The chunk at
	 * 16(%rsi) holds at least one byte of the buffer, so it is safe
	 * to load.
	 */
1:	pxor		%xmm1, %xmm1
	pcmpeqb		16(%rsi), %xmm1		# NUL byte in source tail?
	pmovmskb	%xmm1, %r8d
	bts		%r10d, %r8d		# treat end of buffer as NUL
	tzcnt		%r8d, %r8d		# where is the NUL byte?
	movdqu		(%rsi, %r8, 1), %xmm0	# load source tail before NUL
	lea		16(%rdi, %r8, 1), %rax	# point return value to NUL byte
						/* or end of buffer */
	movdqu		%xmm0, (%rdi, %r8, 1)	# store tail into the buffer
	ret

2:	sub		$16, %rdi		# undo second advancement
	sub		$16, %rsi
	sub		$16, %r10

	/* string has ended and buffer has not */
3:	tzcnt		%r8d, %r8d		# where did the string end?
	lea		.Lmask+16(%rip), %rcx
	lea		(%rdi, %r8, 1), %rax 	# where the NUL byte will be
	neg		%r8
	movdqu		(%rcx, %r8, 1), %xmm1	# mask with FF where the string is,
						/* 00 where it is not */
	pand		%xmm1, %xmm0		# mask out bytes after the string
	movdqu		%xmm0, (%rdi)	 	# store masked current chunk
	pxor		%xmm1, %xmm1
	sub		$16, %r10		# another full chunk left?
	jbe		1f

	/* clear remaining destination buffer (tail has been cleared earlier) */
	ALIGN_TEXT
0:	movdqu		%xmm1, 16(%rdi)
	cmp		$16, %r10
	jbe		1f

	movdqu		%xmm1, 32(%rdi)
	add		$32, %rdi
	sub		$32, %r10
	ja		0b

1:	ret

	/* at least two chunks to play with and NUL while processing head */
.Lheadnul:
	movdqu		bounce(%rsp, %rcx, 1), %xmm0 # load start of source from stack
	tzcnt		%r8d, %r8d		# find location of NUL byte
	movdqu		%xmm0, (%rdi, %rcx, 1)	# deposit head in the destination
	movdqu		%xmm1, (%rdi, %r8, 1)	# clear out following bytes
	movdqu		%xmm1, 16(%rdi)		# clear out second chunk
	lea		(%rdi, %r8, 1), %rax	# make RAX point to the NUL byte

	add		$32, %rdi		# advance past first two chunks
	sub		$32+16, %r10		# advance past first three chunks
	jbe		1f			# did we pass the end of the buffer?

	/* clear remaining destination buffer (tail has been cleared earlier) */
	ALIGN_TEXT
0:	movdqu		%xmm1, (%rdi)		# clear out buffer chunk
	cmp		$16, %r10
	jbe		1f

	movdqu		%xmm1, 16(%rdi)
	add		$32, %rdi
	sub		$32, %r10
	ja		0b

1:	ret

	/*
	 * 1--32 bytes to copy, bounce through the stack.
	 *
	 * R8 is turned into a mask in which the lowest set bit marks the
	 * end of the string or the end of the buffer, whichever comes
	 * first.  R10 ranges from 1 to 32 here, so the end of buffer
	 * marker may be bit 32 and R8 must be treated as a 64 bit value.
	 *
	 * The second chunk is only loaded if neither the string nor the
	 * buffer end within the first chunk or exactly at its end
	 * (bits 0--16 clear).  Otherwise the second chunk may lie wholly
	 * beyond the source buffer and must not be touched.
	 */
.Lrunt:	movdqa		%xmm1, bounce+16(%rsp)	# clear out rest of on-stack copy
	and		%r9d, %r8d		# ignore NUL bytes in front of the string
	bts		%r10, %r8		# treat end of buffer as end of string
	test		$0x1ffff, %r8d		# end within or right after first chunk?
	jnz		0f			# if yes, do not inspect second buffer

	movdqa		16(%rsi), %xmm0		# load second chunk of input
	movdqa		%xmm0, bounce+16(%rsp)	# stash copy on stack
	pcmpeqb		%xmm1, %xmm0		# NUL in second chunk?
	pmovmskb	%xmm0, %r9d
	shl		$16, %r9d
	or		%r9, %r8		# merge found NUL bytes into NUL mask
						/* (64 bit to preserve bit 32) */

	/* end of string after one buffer */
0:	tzcnt		%r8, %r8		# location of last char in string
						/* (R8 is never zero here) */
	movdqu		%xmm1, bounce(%rsp, %r8, 1) # clear bytes behind string
	lea		bounce(%rsp, %rcx, 1), %rsi # start of string copy on stack
	lea		(%rdi, %r8, 1), %rax	# return pointer to NUL byte

	cmp		$16, %edx		# at least 16 bytes to transfer?
	jae		.L1631

	mov		(%rsi), %r8		# load string head
	cmp		$8, %edx		# at least 8 bytes to transfer?
	jae		.L0815

	cmp		$4, %edx		# at least 4 bytes to transfer?
	jae		.L0407

	movzwl		-2(%rsi, %rdx, 1), %esi	# load last two bytes of string
	mov		%r8b, (%rdi, %rcx, 1)	# store first byte

	cmp		$2, %edx		# at least 2 bytes to transfer?
	jb		.L1

	mov		%si, -2(%rdi, %r10, 1)	# store last two bytes of string
.L1:	ret

	/* 16--32 bytes: two possibly overlapping 16 byte moves */
.L1631:	movdqu		(%rsi), %xmm0		# load first 16 bytes of string
	movdqu		-16(%rsi, %rdx, 1), %xmm1 # load last 16 bytes of string
	movdqu		%xmm0, (%rdi, %rcx, 1)
	movdqu		%xmm1, -16(%rdi, %r10, 1)
	ret

	/* 8--15 bytes: two possibly overlapping 8 byte moves */
.L0815:	mov		-8(%rsi, %rdx, 1), %rdx	# load last 8 bytes of string
	mov		%r8, (%rdi, %rcx, 1)
	mov		%rdx, -8(%rdi, %r10, 1)
	ret

	/* 4--7 bytes: two possibly overlapping 4 byte moves */
.L0407:	mov		-4(%rsi, %rdx, 1), %edx	# load last four bytes of string
	mov		%r8d, (%rdi, %rcx, 1)
	mov		%edx, -4(%rdi, %r10, 1)
	ret

	/* length 0 buffer: just return dest */
.L0:	mov		%rdi, %rax
	ret
ARCHEND(__stpncpy, baseline)

	.section .note.GNU-stack,"",%progbits