/*
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4, 0x90

/*
 * void *memccpy(void *dst, const void *src, int c, size_t len)
 *
 * Syntax: GAS (AT&T).  Register usage follows the SysV AMD64 convention
 * as evidenced by the code below:
 *	In:	rdi = dst, rsi = src, edx = c, rcx = len
 *	Out:	rax = pointer just past the copy of c in dst, or NULL if c
 *		did not occur within the first len bytes of src
 *
 * memccpy is a weak alias of __memccpy.  The ARCHFUNCS table lists the
 * available implementations; the macros come from amd64_archlevel.h
 * (NOTE(review): presumably they select a variant at run time -- confirm
 * against that header).
 */
	.weak memccpy
	.set memccpy, __memccpy
ARCHFUNCS(__memccpy)
	ARCHFUNC(__memccpy, scalar)
	ARCHFUNC(__memccpy, baseline)
ENDARCHFUNCS(__memccpy)

/*
 * Scalar variant: locate c with __memchr(), then copy with memcpy().
 *
 * rbx (callee-saved) carries len across the first call and the return
 * value across the second.  Four pushes after the frame setup keep rsp
 * 16-byte aligned at the __memchr call; after popping rsi and rdi an even
 * number of slots remains, so the memcpy call is aligned as well.
 */
ARCHENTRY(__memccpy, scalar)
	push	%rbp			# establish stack frame
	mov	%rsp, %rbp
	push	%rax			# dummy push for alignment
	push	%rbx
	push	%rdi			# preserve dest across the call
	push	%rsi			# preserve src across the call

	mov	%rsi, %rdi
	mov	%edx, %esi
	mov	%rcx, %rdx
	mov	%rcx, %rbx		# keep len in a callee-saved register
	call	CNAME(__memchr)		# ptr = memchr(src, c, len)

	pop	%rsi
	pop	%rdi
	lea	1(%rax), %rdx
	sub	%rsi, %rdx		# size = ptr - src + 1
	mov	%rbx, %rcx
	lea	(%rdi, %rdx, 1), %rbx	# res = dest + size
	test	%rax, %rax		# if (ptr == NULL)
	cmovz	%rcx, %rdx		# size = len
	cmovz	%rax, %rbx		# res = NULL
	call	CNAME(memcpy)		# memcpy(dest, src, size)

	mov	%rbx, %rax		# return (res)
	pop	%rbx
	leave
	ret
ARCHEND(__memccpy, scalar)

/*
 * Baseline (SSE2) variant.  Leaf function, no stack usage.
 *
 * After the setup sequence:
 *	rdx  = len - 1 (offset of the last buffer byte from src)
 *	ecx  = src & 15 (misalignment of src)
 *	rsi  = src rounded down to a 16 byte boundary
 *	r9   = original src
 *	xmm4 = c replicated into all 16 bytes
 *
 * The source is scanned in aligned 16 byte chunks.  A chunk is only ever
 * loaded once it is known to contain at least one byte of the buffer, so
 * no load can touch a page the buffer does not reach.
 */
ARCHENTRY(__memccpy, baseline)
	sub		$1, %rcx		# RCX refers to last character in buffer
	jb		.L0			# go to special code path if len was 0

	movd		%edx, %xmm4
	mov		%rcx, %rdx
	punpcklbw	%xmm4, %xmm4		# c -> cc
	mov		%esi, %ecx
	punpcklwd	%xmm4, %xmm4		# cc -> cccc
	mov		%rsi, %r9		# stash a copy of the source pointer for later
	pshufd		$0, %xmm4, %xmm4	# cccc -> cccccccccccccccc
	and		$~0xf, %rsi
	movdqa		%xmm4, %xmm1
	pcmpeqb		(%rsi), %xmm1		# c found in head?
	mov		$-1, %r8d
	and		$0xf, %ecx
	shl		%cl, %r8d		# mask of bytes in the string
	pmovmskb	%xmm1, %eax
	and		%r8d, %eax		# ignore matches in front of src
	jnz		.Lhead_nul

	/*
	 * No match in the head.  There is no guarantee that c occurs in
	 * the source at all, so nothing beyond the end of the buffer may be
	 * read.  If the buffer ends inside the head chunk, finish here
	 * instead of loading the second chunk (which might lie on an
	 * unmapped page).
	 */
	mov		$15, %r8d
	sub		%ecx, %r8d		# offset of last head byte from src
	cmp		%r8, %rdx		# does the buffer end within the head?
	jbe		.Lhead_end

	movdqa		16(%rsi), %xmm3		# load second string chunk
	movdqu		(%r9), %xmm2		# load unaligned string head
	mov		$32, %r8d
	sub		%ecx, %r8d		# head length + length of second chunk
	movdqa		%xmm4, %xmm1
	pcmpeqb		%xmm3, %xmm1		# c found in second chunk?

	sub		%r8, %rdx		# enough space left for the second chunk?
	jb		.Lhead_buf_end

	/* process second chunk */
	pmovmskb	%xmm1, %eax
	test		%eax, %eax
	jnz		.Lsecond_nul

	/* string didn't end in second chunk and neither did buffer -- not a runt! */
	movdqa		32(%rsi), %xmm0		# load next string chunk
	movdqa		%xmm4, %xmm1
	movdqu		%xmm2, (%rdi)		# deposit head into buffer
	sub		%rcx, %rdi		# adjust RDI to correspond to RSI
	movdqu		%xmm3, 16(%rdi)		# deposit second chunk
	sub		%rsi, %rdi		# express RDI as distance from RSI
	add		$32, %rsi		# advance RSI past first two chunks
	sub		$16, %rdx		# enough left for another round?
	jb		1f

	/*
	 * main loop unrolled twice
	 *
	 * Invariant at 0: xmm0 holds the chunk at (rsi), xmm1 holds a copy
	 * of xmm4, and rdx >= 0 is the offset of the last buffer byte from
	 * the chunk following (rsi).
	 */
	ALIGN_TEXT
0:	pcmpeqb		%xmm0, %xmm1		# c encountered?
	pmovmskb	%xmm1, %eax
	test		%eax, %eax
	jnz		3f

	movdqu		%xmm0, (%rsi, %rdi)
	movdqa		16(%rsi), %xmm0		# load next string chunk
	movdqa		%xmm4, %xmm1
	cmp		$16, %rdx		# more than a full chunk left?
	jb		2f

	add		$32, %rsi		# advance pointers to next chunk
	pcmpeqb		%xmm0, %xmm1		# c encountered?
	pmovmskb	%xmm1, %eax
	test		%eax, %eax
	jnz		4f

	movdqu		%xmm0, -16(%rsi, %rdi)
	movdqa		(%rsi), %xmm0		# load next string chunk
	movdqa		%xmm4, %xmm1
	sub		$32, %rdx
	jae		0b

1:	sub		$16, %rsi		# undo second advancement
	add		$16, %edx		# edx = offset of last buffer byte in chunk

	/*
	 * 1--16 bytes left in the buffer but string has not ended yet
	 *
	 * Here xmm0 holds the chunk at 16(%rsi) and edx (0--15) is the
	 * offset of the last buffer byte within it.
	 */
2:	pcmpeqb		%xmm1, %xmm0		# c encountered?
	pmovmskb	%xmm0, %r8d
	mov		%r8d, %ecx		# keep the unmodified match mask
	bts		%edx, %r8d		# treat end of buffer as end of string
	tzcnt		%r8d, %r8d		# find tail length (BTS guarantees a set bit)
	add		%rsi, %rdi		# restore RDI
	movdqu		1(%rsi, %r8, 1), %xmm0	# load string tail
	movdqu		%xmm0, 1(%rdi, %r8, 1)	# store string tail
	lea		17(%rdi, %r8, 1), %rsi	# return value if terminator encountered
	xor		%eax, %eax		# return value if no terminator encountered
	bt		%r8d, %ecx		# terminator encountered inside buffer?
	cmovc		%rsi, %rax		# if yes, return pointer, else NULL
	ret

4:	sub		$16, %rsi		# undo second advancement
	add		$16, %rdx		# restore number of remaining bytes

	/* string has ended but buffer has not */
3:	tzcnt		%eax, %eax		# find length of string tail
	movdqu		-15(%rsi, %rax, 1), %xmm0 # load string tail (incl. c)
	add		%rsi, %rdi		# restore destination pointer
	movdqu		%xmm0, -15(%rdi, %rax, 1) # store string tail (incl. c)
	lea		1(%rdi, %rax, 1), %rax	# compute return value
	ret

	/* buffer reaches into the second chunk but not beyond it */
.Lhead_buf_end:
	pmovmskb	%xmm1, %r8d
	add		$32, %edx		# restore edx to (len-1) + ecx
	shl		$16, %r8d		# place 2nd chunk match mask into bits 16--31
	mov		%r8d, %r10d
	bts		%rdx, %r8		# treat end of buffer as if terminator present
	xor		%eax, %eax		# return value if terminator not found
	tzcnt		%r8, %rdx		# find string/buffer len from alignment boundary
	lea		1(%rdi, %rdx, 1), %r8	# return value if terminator found + rcx
	sub		%rcx, %r8		# subtract rcx
	bt		%rdx, %r10		# was the terminator present?
	cmovc		%r8, %rax		# if yes, return pointer, else NULL
	sub		%ecx, %edx		# find actual string/buffer len
	jmp		.L0132

	/* c found in the second chunk; the buffer extends past that chunk */
.Lsecond_nul:
	add		%r8, %rdx		# restore buffer length
	tzcnt		%eax, %r8d		# where is c?
	lea		-16(%rcx), %eax
	sub		%eax, %r8d		# string length
	lea		1(%rdi, %r8, 1), %rax	# return value if c before end of buffer
	xor		%ecx, %ecx		# return value if not
	cmp		%r8, %rdx		# is the string shorter than the buffer?
	cmova		%r8, %rdx		# copy only min(buflen, srclen) bytes
	cmovb		%rcx, %rax		# return NULL if buffer ended before string
.L0132:	cmp		$16, %rdx		# at least 17 bytes to copy?
	jb		.L0116

	/* copy 17--32 bytes */
	movdqu		(%r9), %xmm0		# load first 16 bytes
	movdqu		-15(%r9, %rdx, 1), %xmm1 # load last 16 bytes
	movdqu		%xmm0, (%rdi)
	movdqu		%xmm1, -15(%rdi, %rdx, 1)
	ret

	/* c found in the head chunk at or after src */
.Lhead_nul:
	tzcnt		%eax, %r8d		# where is c?
	sub		%ecx, %r8d		# ... from the beginning of the string?
	lea		1(%rdi, %r8, 1), %rax	# return value if c before end of buffer
	xor		%ecx, %ecx		# return value if not
	cmp		%r8, %rdx		# is the string shorter than the buffer?
	cmova		%r8, %rdx		# copy only min(buflen, srclen) bytes
	cmovb		%rcx, %rax		# return NULL if buffer ended before string

	/*
	 * copy 1--16 bytes
	 * (rdx: number of bytes to copy minus 1, rax: return value)
	 * Each size class copies two possibly overlapping pieces: one
	 * anchored at the start and one anchored at the end.
	 */
.L0116:	cmp		$8, %rdx		# at least 9 bytes to copy?
	jae		.L0916

	cmp		$4, %rdx		# at least 5 bytes to copy?
	jae		.L0508

	cmp		$2, %rdx		# at least 3 bytes to copy?
	jae		.L0304

	/* copy one or two bytes */
	movzbl		(%r9), %ecx		# load first byte from src
	movzbl		(%r9, %rdx, 1), %esi	# load last byte from src
	mov		%cl, (%rdi)		# deposit into destination
	mov		%sil, (%rdi, %rdx, 1)
	ret

	/* copy three or four bytes */
.L0304:	movzwl		(%r9), %ecx
	movzwl		-1(%r9, %rdx, 1), %esi
	mov		%cx, (%rdi)
	mov		%si, -1(%rdi, %rdx, 1)
	ret

	/* copy five to eight bytes */
.L0508:	mov		(%r9), %ecx
	mov		-3(%r9, %rdx, 1), %esi
	mov		%ecx, (%rdi)
	mov		%esi, -3(%rdi, %rdx, 1)
	ret

	/* copy nine to sixteen bytes */
.L0916:	mov		(%r9), %rcx
	mov		-7(%r9, %rdx, 1), %rsi
	mov		%rcx, (%rdi)
	mov		%rsi, -7(%rdi, %rdx, 1)
	ret

	/* buffer ends within the head chunk and c was not found in it */
.Lhead_end:
	xor		%eax, %eax		# no match: return NULL
	jmp		.L0116			# copy all rdx + 1 <= 16 buffer bytes

	/* length zero destination: return null pointer */
.L0:	xor		%eax, %eax
	ret
ARCHEND(__memccpy, baseline)

	.section .note.GNU-stack,"",%progbits