/*
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4, 0x90

	/* expose stpncpy as a weak alias for the dispatched __stpncpy */
	.weak stpncpy
	.set stpncpy, __stpncpy
ARCHFUNCS(__stpncpy)
	ARCHFUNC(__stpncpy, scalar)
	ARCHFUNC(__stpncpy, baseline)
ENDARCHFUNCS(__stpncpy)

/*
 * char *stpncpy(char *restrict dst, const char *restrict src, size_t len)
 *
 * Scalar fallback, built from __memchr/memcpy/memset calls.
 * ABI: SysV AMD64.  In: %rdi = dst, %rsi = src, %rdx = len.
 * Out: %rax = pointer to the NUL terminator written into dst, or
 *      dst + len if no NUL occurs within the first len bytes of src.
 *
 * Stack frame after the pushes below:
 *   -8(%rbp)  = len  (rdx)
 *   -16(%rbp) = dst  (rdi)
 *   -24(%rbp) = src  (rsi)
 *   -32(%rbp) = dummy slot keeping %rsp 16-byte aligned at call sites
 */
ARCHENTRY(__stpncpy, scalar)
	push	%rbp			# establish stack frame
	mov	%rsp, %rbp

	push	%rdx
	push	%rdi
	push	%rsi
	push	%rax			# dummy push for alignment

	mov	%rsi, %rdi		# set up memchr(src, '\0', len)
	xor	%esi, %esi
	call	CNAME(__memchr)		# memchr(src, '\0', len)
	pop	%rcx			# dummy pop
	pop	%rsi			# %rsi = src
	mov	-16(%rbp), %rdi		# %rdi = dst

	test	%rax, %rax		# NUL found?
	jz	.Lfullcopy		# no: copy len bytes, nothing to clear

	mov	%rax, %rdx
	sub	%rsi, %rdx		# copy until the NUL byte
	add	%rdx, -16(%rbp)		# advance destination by string length
	sub	%rdx, -8(%rbp)		# and shorten buffer size by string length
	call	CNAME(memcpy)		# memcpy(dst, src, strlen(src))

	pop	%rdi			# %rdi = dst + strlen(src)
	pop	%rdx			# %rdx = len - strlen(src)
	xor	%esi, %esi
	pop	%rbp
	jmp	CNAME(memset)		# clear remaining buffer; memset returns
					# its dst argument, which is exactly the
					# stpncpy return value (first NUL in dst)

.Lfullcopy:
	mov	-8(%rbp), %rdx
	call	CNAME(memcpy)		# copy whole string
	add	-8(%rbp), %rax		# memcpy returned dst; point to dest[n]
	leave
	ret
ARCHEND(__stpncpy, scalar)

	/*
	 * this mask allows us to generate masks of 16-n 0xff bytes
	 * followed by n 0x00 bytes by loading from .Lmask+n.
	 */
	.section .rodata
.Lmask:	.quad	0xffffffffffffffff
	.quad	0xffffffffffffffff
	.quad	0x0000000000000000
	.quad	0x0000000000000000

/*
 * stpncpy(char *restrict rdi, const char *rsi, size_t rdx)
 *
 * SSE2 baseline variant.  The source is read in aligned 16-byte chunks
 * (over-reading within the source's page is safe since an aligned chunk
 * never crosses a page boundary); the destination is written unaligned.
 *
 * Register roles throughout:
 *   %rdi  = dst, biased down by the source misalignment so that
 *           (%rdi, offset) mirrors (%rsi, offset)
 *   %rsi  = src rounded down to 16-byte alignment
 *   %rcx  = src & 0xf (misalignment offset)
 *   %rdx  = original len (still needed by the runt tail copies)
 *   %r10  = buffer length counted from the alignment boundary (len + rcx)
 *   %r8d  = bitmask of NUL positions from pcmpeqb/pmovmskb
 *   %r9   = mask of head bytes that actually belong to the string
 *   %xmm1 = all-zero register used for comparisons and buffer clearing
 *
 * NOTE(review): tzcnt encodes as rep bsf and executes as plain bsf on
 * pre-BMI1 CPUs; every tzcnt operand below is nonzero (a bit was just
 * tested or set), where the two instructions agree.
 */
ARCHENTRY(__stpncpy, baseline)
#define bounce (-3*16-8)		/* location of on-stack bounce buffer;
					 * lives in the red zone -- this is a
					 * leaf function, so no %rsp adjustment
					 * is needed (SysV AMD64) */

	test	%rdx, %rdx		# no bytes to copy?
	jz	.L0

	mov	%esi, %ecx
	and	$~0xf, %rsi		# align source to 16 bytes
	movdqa	(%rsi), %xmm0		# load head
	and	$0xf, %ecx		# offset from alignment
	mov	$-1, %r9d
	lea	-32(%rcx), %rax		# set up overflow-proof comparison rdx+rcx<=32
	shl	%cl, %r9d		# mask of bytes belonging to the string
	sub	%rcx, %rdi		# adjust RDI to correspond to RSI
	pxor	%xmm1, %xmm1
	movdqa	%xmm0, bounce(%rsp)	# stash copy of head on the stack
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %r8d		# %r8d = NUL positions in head chunk

	lea	(%rdx, %rcx, 1), %r10	# buffer length from alignment boundary
	add	%rdx, %rax		# less than 2 chunks (32 bytes) to play with?
	jnc	.Lrunt			# if yes, use special runt processing

	movdqu	%xmm1, -16(%rdi, %r10, 1) # clear final bytes of destination
	and	%r9d, %r8d		# end of string within head?
	jnz	.Lheadnul

	movdqu	(%rsi, %rcx, 1), %xmm2	# load head from source buffer
	movdqu	%xmm2, (%rdi, %rcx, 1)	# and deposit it in the destination

	add	$16, %rsi
	add	$16, %rdi
	sub	$32, %r10		# account for head chunk + loop lookahead

	/* main loop unrolled twice */
	ALIGN_TEXT
0:	movdqa	(%rsi), %xmm0
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm0, %xmm1		# NUL byte encountered?
	pmovmskb %xmm1, %r8d
	test	%r8d, %r8d
	jnz	3f

	movdqu	%xmm0, (%rdi)
	cmp	$16, %r10		# more than a full chunk left?
	jbe	1f

	movdqa	16(%rsi), %xmm0
	add	$32, %rdi		# advance pointers to next chunk
	add	$32, %rsi
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm0, %xmm1		# NUL byte encountered?
	pmovmskb %xmm1, %r8d
	test	%r8d, %r8d
	jnz	2f

	movdqu	%xmm0, -16(%rdi)
	sub	$32, %r10		# more than another full chunk left?
	ja	0b

	sub	$16, %rdi		# undo second advancement
	sub	$16, %rsi
	add	$16, %r10d		# restore number of remaining bytes

	/* 1--16 bytes left but string has not ended yet */
1:	pxor	%xmm1, %xmm1
	pcmpeqb	16(%rsi), %xmm1		# NUL byte in source tail?
	pmovmskb %xmm1, %r8d
	bts	%r10d, %r8d		# treat end of buffer as NUL
	tzcnt	%r8d, %r8d		# where is the NUL byte?
	movdqu	(%rsi, %r8, 1), %xmm0	# load source tail before NUL
	lea	16(%rdi, %r8, 1), %rax	# point return value to NUL byte
					# or end of buffer
	movdqu	%xmm0, (%rdi, %r8, 1)	# store tail into the buffer
	ret

2:	sub	$16, %rdi		# undo second advancement
	sub	$16, %rsi
	sub	$16, %r10

	/* string has ended and buffer has not */
3:	tzcnt	%r8d, %r8d		# where did the string end?
	lea	.Lmask+16(%rip), %rcx
	lea	(%rdi, %r8, 1), %rax	# where the NUL byte will be
	neg	%r8
	movdqu	(%rcx, %r8, 1), %xmm1	# mask with FF where the string is,
					# 00 where it is not
	pand	%xmm1, %xmm0		# mask out bytes after the string
	movdqu	%xmm0, (%rdi)		# store masked current chunk
	pxor	%xmm1, %xmm1
	sub	$16, %r10		# another full chunk left?
	jbe	1f

	/* clear remaining destination buffer (tail has been cleared earlier) */
	ALIGN_TEXT
0:	movdqu	%xmm1, 16(%rdi)
	cmp	$16, %r10
	jbe	1f

	movdqu	%xmm1, 32(%rdi)
	add	$32, %rdi
	sub	$32, %r10
	ja	0b

1:	ret

	/* at least two chunks to play with and NUL while processing head */
.Lheadnul:
	movdqu	bounce(%rsp, %rcx, 1), %xmm0 # load start of source from stack
	tzcnt	%r8d, %r8d		# find location of NUL byte
	movdqu	%xmm0, (%rdi, %rcx, 1)	# deposit head in the destination
	movdqu	%xmm1, (%rdi, %r8, 1)	# clear out following bytes
	movdqu	%xmm1, 16(%rdi)		# clear out second chunk
	lea	(%rdi, %r8, 1), %rax	# make RAX point to the NUL byte

	add	$32, %rdi		# advance past first two chunks
	sub	$32+16, %r10		# advance past first three chunks
	jbe	1f			# did we pass the end of the buffer?

	/* clear remaining destination buffer (tail has been cleared earlier) */
	ALIGN_TEXT
0:	movdqu	%xmm1, (%rdi)		# clear out buffer chunk
	cmp	$16, %r10
	jbe	1f

	movdqu	%xmm1, 16(%rdi)
	add	$32, %rdi
	sub	$32, %r10
	ja	0b

1:	ret

	/* 1--32 bytes to copy, bounce through the stack */
.Lrunt:	movdqa	%xmm1, bounce+16(%rsp)	# clear out rest of on-stack copy
	bts	%r10d, %r8d		# treat end of buffer as end of string
	and	%r9w, %r8w		# end of string within first buffer?
	jnz	0f			# if yes, do not inspect second buffer

	movdqa	16(%rsi), %xmm0		# load second chunk of input
	movdqa	%xmm0, bounce+16(%rsp)	# stash copy on stack
	pcmpeqb	%xmm1, %xmm0		# NUL in second chunk?
	pmovmskb %xmm0, %r9d
	shl	$16, %r9d
	or	%r9d, %r8d		# merge found NUL bytes into NUL mask

	/* end of string after one buffer */
0:	tzcnt	%r8d, %r8d		# location of last char in string
	movdqu	%xmm1, bounce(%rsp, %r8, 1) # clear bytes behind string
	lea	bounce(%rsp, %rcx, 1), %rsi # start of string copy on stack
	lea	(%rdi, %r8, 1), %rax	# return pointer to NUL byte

	/*
	 * Copy rdx (1--32) bytes from the zero-padded bounce buffer to the
	 * destination with two possibly-overlapping loads/stores sized by
	 * powers of two; labels encode the size range handled (.LAABB =
	 * AA--BB bytes).
	 */
	cmp	$16, %edx		# at least 16 bytes to transfer?
	jae	.L1631

	mov	(%rsi), %r8		# load string head
	cmp	$8, %edx		# at least 8 bytes to transfer?
	jae	.L0815

	cmp	$4, %edx		# at least 4 bytes to transfer?
	jae	.L0407

	movzwl	-2(%rsi, %rdx, 1), %esi	# load last two bytes of string
	mov	%r8b, (%rdi, %rcx, 1)	# store first byte

	cmp	$2, %edx		# at least 2 bytes to transfer?
	jb	.L1

	mov	%si, -2(%rdi, %r10, 1)	# store last two bytes of string
.L1:	ret

.L1631:	movdqu	(%rsi), %xmm0		# load first 16 bytes of string
	movdqu	-16(%rsi, %rdx, 1), %xmm1 # load last 16 bytes of string
	movdqu	%xmm0, (%rdi, %rcx, 1)
	movdqu	%xmm1, -16(%rdi, %r10, 1)
	ret

.L0815:	mov	-8(%rsi, %rdx, 1), %rdx	# load last 8 bytes of string
	mov	%r8, (%rdi, %rcx, 1)
	mov	%rdx, -8(%rdi, %r10, 1)
	ret

.L0407:	mov	-4(%rsi, %rdx, 1), %edx	# load last four bytes of string
	mov	%r8d, (%rdi, %rcx, 1)
	mov	%edx, -4(%rdi, %r10, 1)
	ret

	/* length 0 buffer: just return dest */
.L0:	mov	%rdi, %rax
	ret
ARCHEND(__stpncpy, baseline)

	.section .note.GNU-stack,"",%progbits