/*
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4, 0x90

/*
 * char *stpncpy(char *restrict dst, const char *restrict src, size_t len)
 *
 * Copy at most len bytes of the string src to dst, zero-filling the
 * remainder of the buffer; return a pointer to the NUL terminator in
 * dst, or to dst + len if src was not NUL-terminated within len bytes.
 *
 * AT&T syntax, SysV AMD64 ABI: dst = %rdi, src = %rsi, len = %rdx,
 * return value in %rax.  Runtime dispatch via ARCHFUNCS selects an
 * implementation; the baseline (SSE2) variant is currently disabled
 * (see the #if 0 below), leaving only the scalar variant active.
 */
	.weak	stpncpy
	.set	stpncpy, __stpncpy
ARCHFUNCS(__stpncpy)
	ARCHFUNC(__stpncpy, scalar)
#if 0 /* temporarily disabled cf. PR 291720 */
	ARCHFUNC(__stpncpy, baseline)
#endif
ENDARCHFUNCS(__stpncpy)

/*
 * Scalar implementation: delegate all the work to memchr/memcpy/memset.
 * Saved arguments live in a small stack frame so they survive the calls.
 */
ARCHENTRY(__stpncpy, scalar)
	push	%rbp			# establish stack frame
	mov	%rsp, %rbp

	/*
	 * Frame layout after the four pushes below:
	 *   -8(%rbp) = len (n)    — later shortened to the fill count
	 *  -16(%rbp) = dst        — later advanced past the copied string
	 *  -24(%rbp) = src
	 *  -32(%rbp) = scratch    — alignment padding only; keeps %rsp
	 *                           16-byte aligned at the calls below
	 */
	push	%rdx
	push	%rdi
	push	%rsi
	push	%rax			# dummy push for alignment

	mov	%rsi, %rdi		# first memchr argument: src
	xor	%esi, %esi		# second argument: '\0'
	call	CNAME(__memchr)		# memchr(src, '\0', len)
	pop	%rcx			# dummy pop
	pop	%rsi			# restore src
	mov	-16(%rbp), %rdi		# restore dst

	test	%rax, %rax		# NUL found?
	jz	.Lfullcopy

	/* NUL within the first len bytes: copy the string, then zero-fill */
	mov	%rax, %rdx
	sub	%rsi, %rdx		# copy until the NUL byte
	add	%rdx, -16(%rbp)		# advance destination by string length
	sub	%rdx, -8(%rbp)		# and shorten buffer size by string length
	call	CNAME(memcpy)

	pop	%rdi			# dst + strlen(src)
	pop	%rdx			# len - strlen(src)
	xor	%esi, %esi		# fill byte: '\0'
	pop	%rbp
	jmp	CNAME(memset)		# clear remaining buffer; memset returns
					# its first argument (dst + strlen),
					# which is exactly our return value

.Lfullcopy:
	/* no NUL within len bytes: copy the whole buffer, return dst + len */
	mov	-8(%rbp), %rdx
	call	CNAME(memcpy)		# copy whole string
	add	-8(%rbp), %rax		# point to dest[n]
	leave
	ret
ARCHEND(__stpncpy, scalar)

	/*
	 * this mask allows us to generate masks of 16-n 0xff bytes
	 * followed by n 0x00 bytes by loading from .Lmask+n.
	 */
	.section	.rodata
.Lmask:	.quad	0xffffffffffffffff
	.quad	0xffffffffffffffff
	.quad	0x0000000000000000
	.quad	0x0000000000000000

/* stpncpy(char *restrict rdi, const char *rsi, size_t rdx) */
/*
 * SSE2 baseline implementation.
 *
 * Register roles throughout:
 *   %rcx  = offset of src within its 16-byte alignment block
 *   %r9d  = bit mask of head bytes that actually belong to the string
 *   %r8d  = pcmpeqb/pmovmskb mask of NUL positions in the current chunk
 *   %r10  = buffer length counted from the alignment boundary (len + %rcx)
 *   %rax  = return value (pointer to NUL / end of buffer)
 *
 * The bounce buffer lies in the red zone (this is a leaf function,
 * so the 128 bytes below %rsp may be used without adjusting %rsp).
 */
ARCHENTRY(__stpncpy, baseline)
#define bounce (-3*16-8)		/* location of on-stack bounce buffer */

	test	%rdx, %rdx		# no bytes to copy?
	jz	.L0

	mov	%esi, %ecx
	and	$~0xf, %rsi		# align source to 16 bytes
	movdqa	(%rsi), %xmm0		# load head; the aligned load may fetch
					# bytes before src, but cannot cross a
					# page boundary
	and	$0xf, %ecx		# offset from alignment
	mov	$-1, %r9d
	lea	-33(%rcx), %rax		# set up overflow-proof comparison rdx+rcx<=32
	shl	%cl, %r9d		# mask of bytes belonging to the string
	sub	%rcx, %rdi		# adjust RDI to correspond to RSI
	pxor	%xmm1, %xmm1		# xmm1 = all zeroes (NUL comparand / fill)
	movdqa	%xmm0, bounce(%rsp)	# stash copy of head on the stack
	pcmpeqb	%xmm1, %xmm0		# find NUL bytes in the head
	pmovmskb %xmm0, %r8d

	lea	(%rdx, %rcx, 1), %r10	# buffer length from alignment boundary
	add	%rdx, %rax		# less than 2 chunks (32 bytes) to play with?
	jnc	.Lrunt			# if yes, use special runt processing

	movdqu	%xmm1, -16(%rdi, %r10, 1) # clear final bytes of destination
	and	%r9d, %r8d		# end of string within head?
	jnz	.Lheadnul

	movdqu	(%rsi, %rcx, 1), %xmm2	# load head from source buffer
	movdqu	%xmm2, (%rdi, %rcx, 1)	# and deposit

	add	$16, %rsi
	add	$16, %rdi
	sub	$32, %r10

	/* main loop unrolled twice */
	ALIGN_TEXT
0:	movdqa	(%rsi), %xmm0
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm0, %xmm1		# NUL byte encountered?
	pmovmskb %xmm1, %r8d
	test	%r8d, %r8d
	jnz	3f

	movdqu	%xmm0, (%rdi)
	cmp	$16, %r10		# more than a full chunk left?
	jbe	1f

	movdqa	16(%rsi), %xmm0
	add	$32, %rdi		# advance pointers to next chunk
	add	$32, %rsi
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm0, %xmm1		# NUL byte encountered?
	pmovmskb %xmm1, %r8d
	test	%r8d, %r8d
	jnz	2f

	movdqu	%xmm0, -16(%rdi)
	sub	$32, %r10		# more than another full chunk left?
	ja	0b

	sub	$16, %rdi		# undo second advancement
	sub	$16, %rsi
	add	$16, %r10d		# restore number of remaining bytes

	/* 1--16 bytes left but string has not ended yet */
1:	pxor	%xmm1, %xmm1
	pcmpeqb	16(%rsi), %xmm1		# NUL byte in source tail?
	pmovmskb %xmm1, %r8d
	bts	%r10d, %r8d		# treat end of buffer as NUL
	tzcnt	%r8d, %r8d		# where is the NUL byte?
	movdqu	(%rsi, %r8, 1), %xmm0	# load source tail before NUL
	lea	16(%rdi, %r8, 1), %rax	# point return value to NUL byte
					# or end of buffer
	movdqu	%xmm0, (%rdi, %r8, 1)	# store tail into the buffer
	ret

2:	sub	$16, %rdi		# undo second advancement
	sub	$16, %rsi
	sub	$16, %r10

	/* string has ended and buffer has not */
3:	tzcnt	%r8d, %r8d		# where did the string end?
	lea	.Lmask+16(%rip), %rcx	# lookup table for string/tail mask
	lea	(%rdi, %r8, 1), %rax	# where the NUL byte will be
	neg	%r8
	movdqu	(%rcx, %r8, 1), %xmm1	# mask with FF where the string is,
					# 00 where it is not
	pand	%xmm1, %xmm0		# mask out bytes after the string
	movdqu	%xmm0, (%rdi)		# store masked current chunk
	pxor	%xmm1, %xmm1
	sub	$16, %r10		# another full chunk left?
	jbe	1f

	/* clear remaining destination buffer (tail has been cleared earlier) */
	ALIGN_TEXT
0:	movdqu	%xmm1, 16(%rdi)
	cmp	$16, %r10
	jbe	1f

	movdqu	%xmm1, 32(%rdi)
	add	$32, %rdi
	sub	$32, %r10
	ja	0b

1:	ret

	/* at least two chunks to play with and NUL while processing head */
.Lheadnul:
	movdqu	bounce(%rsp, %rcx, 1), %xmm0 # load start of source from stack
	tzcnt	%r8d, %r8d		# find location of NUL byte
	movdqu	%xmm0, (%rdi, %rcx, 1)	# deposit head in the destination
	movdqu	%xmm1, (%rdi, %r8, 1)	# clear out following bytes
	movdqu	%xmm1, 16(%rdi)		# clear out second chunk
	lea	(%rdi, %r8, 1), %rax	# make RAX point to the NUL byte

	add	$32, %rdi		# advance past first two chunks
	sub	$32+16, %r10		# advance past first three chunks
	jbe	1f			# did we pass the end of the buffer?

	/* clear remaining destination buffer (tail has been cleared earlier) */
	ALIGN_TEXT
0:	movdqu	%xmm1, (%rdi)		# clear out buffer chunk
	cmp	$16, %r10
	jbe	1f

	movdqu	%xmm1, 16(%rdi)
	add	$32, %rdi
	sub	$32, %r10
	ja	0b

1:	ret

	/* 1--32 bytes to copy, bounce through the stack */
.Lrunt:	movdqa	%xmm1, bounce+16(%rsp)	# clear out rest of on-stack copy
	bts	%r10, %r8		# treat end of buffer as end of string
	and	%r9d, %r8d		# mask out head before string
	test	$0x1ffff, %r8d		# end of string within first chunk or right after?
	jnz	0f			# if yes, do not inspect second buffer

	movdqa	16(%rsi), %xmm0		# load second chunk of input
	movdqa	%xmm0, bounce+16(%rsp)	# stash copy on stack
	pcmpeqb	%xmm1, %xmm0		# NUL in second chunk?
	pmovmskb %xmm0, %r9d
	shl	$16, %r9d		# second chunk occupies mask bits 16--31
	or	%r9d, %r8d		# merge found NUL bytes into NUL mask

	/* end of string after one buffer */
0:	tzcnt	%r8d, %r8d		# location of last char in string
	movdqu	%xmm1, bounce(%rsp, %r8, 1) # clear bytes behind string
	lea	bounce(%rsp, %rcx, 1), %rsi # start of string copy on stack
	lea	(%rdi, %r8, 1), %rax	# return pointer to NUL byte

	/* copy from the bounce buffer with two possibly-overlapping stores */
	cmp	$16, %edx		# at least 16 bytes to transfer?
	jae	.L1631

	mov	(%rsi), %r8		# load string head
	cmp	$8, %edx		# at least 8 bytes to transfer?
	jae	.L0815

	cmp	$4, %edx		# at least 4 bytes to transfer?
	jae	.L0407

	movzwl	-2(%rsi, %rdx, 1), %esi	# load last two bytes of string
	mov	%r8b, (%rdi, %rcx, 1)	# store first byte

	cmp	$2, %edx		# at least 2 bytes to transfer?
	jb	.L1

	mov	%si, -2(%rdi, %r10, 1)	# store last two bytes of string
.L1:	ret

.L1631:	movdqu	(%rsi), %xmm0		# load first 16 bytes of string
	movdqu	-16(%rsi, %rdx, 1), %xmm1 # load last 16 bytes of string
	movdqu	%xmm0, (%rdi, %rcx, 1)
	movdqu	%xmm1, -16(%rdi, %r10, 1)
	ret

.L0815:	mov	-8(%rsi, %rdx, 1), %rdx	# load last 8 bytes of string
	mov	%r8, (%rdi, %rcx, 1)
	mov	%rdx, -8(%rdi, %r10, 1)
	ret

.L0407:	mov	-4(%rsi, %rdx, 1), %edx	# load last four bytes of string
	mov	%r8d, (%rdi, %rcx, 1)
	mov	%edx, -4(%rdi, %r10, 1)
	ret

	/* length 0 buffer: just return dest */
.L0:	mov	%rdi, %rax
	ret
ARCHEND(__stpncpy, baseline)

	.section	.note.GNU-stack,"",%progbits