/*
 * Copyright (c) 2023, 2024 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

/*
 * void *memccpy(void *dst, const void *src, int c, size_t len)
 *
 * Copy bytes from src to dst, stopping after the first occurrence of
 * byte (unsigned char)c has been copied, or after len bytes, whichever
 * comes first.  Returns a pointer to the byte in dst one past the copy
 * of c, or NULL if c did not occur in the first len bytes of src.
 *
 * ABI:  SysV AMD64.  In: rdi = dst, rsi = src, edx = c, rcx = len.
 * Out:  rax = result.  Two variants are registered with the
 * amd64_archlevel dispatch machinery: a scalar one composed from
 * __memchr and memcpy, and a vectorized "baseline" one using SSE2.
 * NOTE(review): the baseline variant uses tzcnt, which decodes as
 * "rep bsf" on pre-BMI1 CPUs; every tzcnt operand here is provably
 * nonzero (a match is induced before each count), so bsf semantics
 * suffice -- presumably intentional; confirm against other baseline
 * string routines in this directory.
 */

#define ALIGN_TEXT .p2align 4, 0x90

	/* expose memccpy as a weak alias for the dispatched __memccpy */
	.weak memccpy
	.set memccpy, __memccpy
ARCHFUNCS(__memccpy)
	ARCHFUNC(__memccpy, scalar)
	ARCHFUNC(__memccpy, baseline)
ENDARCHFUNCS(__memccpy)

/*
 * Scalar variant: memccpy(dst, src, c, len) implemented as
 * ptr = memchr(src, c, len); then memcpy of either ptr - src + 1
 * bytes (c found) or len bytes (c not found).
 * Clobbers only volatile registers plus rbx, which is saved/restored.
 */
ARCHENTRY(__memccpy, scalar)
	push	%rbp			# establish stack frame
	mov	%rsp, %rbp
	push	%rax			# dummy push for alignment
	push	%rbx			# callee-saved; holds len across calls
	push	%rdi			# save dst
	push	%rsi			# save src
					# rsp is now 16-byte aligned for calls

	mov	%rsi, %rdi		# memchr arg 1: src
	mov	%edx, %esi		# memchr arg 2: c
	mov	%rcx, %rdx		# memchr arg 3: len
	mov	%rcx, %rbx		# keep len for the no-match case
	call	CNAME(__memchr)		# ptr = memchr(src, c, len)

	pop	%rsi			# restore src (memcpy arg 2)
	pop	%rdi			# restore dst (memcpy arg 1)
	lea	1(%rax), %rdx
	sub	%rsi, %rdx		# size = ptr - src + 1
	mov	%rbx, %rcx		# rcx = saved len
	lea	(%rdi, %rdx, 1), %rbx	# res = dest + size
	test	%rax, %rax		# if (ptr == NULL)
	cmovz	%rcx, %rdx		# size = len
	cmovz	%rax, %rbx		# res = NULL (rax is 0 here)
	call	CNAME(memcpy)		# copy the chosen number of bytes

	mov	%rbx, %rax		# return (res)
	pop	%rbx			# restore callee-saved rbx
	leave
	ret
ARCHEND(__memccpy, scalar)

/*
 * Baseline (SSE2) variant.  Strategy: scan src in 16-byte chunks from
 * a 16-byte-aligned base, looking for c with pcmpeqb/pmovmskb; copy
 * whole chunks through dst as they are cleared.  Short buffers and
 * the head/tail are handled with overlapping unaligned loads/stores
 * so no byte loop is ever needed.
 *
 * Register roles in this function (as used below):
 *   rdi = dst, rsi = aligned src chunk pointer, r9 = original src,
 *   xmm4 = c broadcast to all 16 lanes, rcx/ecx = misalignment offset,
 *   rdx = remaining length / later copy length, r11 = length bookkeeping.
 */
ARCHENTRY(__memccpy, baseline)
	sub	$1, %rcx		# RCX refers to last character in buffer
	jb	.L0			# go to special code path if len was 0

	movd	%edx, %xmm4
	mov	%rcx, %rdx		# rdx = len - 1
	punpcklbw %xmm4, %xmm4		# c -> cc
	mov	%esi, %ecx
	punpcklwd %xmm4, %xmm4		# cc -> cccc
	mov	%rsi, %r9		# stash a copy of the source pointer for later
	pshufd	$0, %xmm4, %xmm4	# cccc -> cccccccccccccccc
	and	$~0xf, %rsi		# round src down to 16-byte alignment
	movdqa	%xmm4, %xmm1
	pcmpeqb	(%rsi), %xmm1		# c found in head?
	and	$0xf, %ecx		# ecx = src misalignment (0..15)
	mov	$-1, %eax
	pmovmskb %xmm1, %r8d		# r8d = per-byte match mask for head
	lea	-32(%rcx), %r11
	shl	%cl, %eax		# mask of bytes in the string
	add	%rdx, %r11		# distance from alignment boundary - 32
	jnc	.Lrunt			# jump if buffer length is 32 or less
					# (jnc consumes carry from the add above)

	and	%r8d, %eax		# ignore matches before the real head
	jz	0f			# match (or induced match) found?

	/* match in first chunk */
	tzcnt	%eax, %edx		# where is c?
	sub	%ecx, %edx		# ... from the beginning of the string?
	lea	1(%rdi, %rdx, 1), %rax	# return value
	jmp	.L0116			# copy the 1--16 byte result

0:	movdqa	16(%rsi), %xmm3		# load second string chunk
	movdqu	(%r9), %xmm2		# load unaligned string head
	movdqa	%xmm4, %xmm1
	pcmpeqb	%xmm3, %xmm1		# c found in second chunk?

	/* process second chunk */
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jz	0f

	/* match in second chunk */
	tzcnt	%eax, %edx		# where is c?
	sub	$16, %ecx		# account for the chunk already skipped
	sub	%ecx, %edx		# adjust for alignment offset
	lea	1(%rdi, %rdx, 1), %rax	# return value
	jmp	.L0132			# copy the 17--32 byte result

	/* c not found in second chunk: prepare for main loop */
0:	movdqa	32(%rsi), %xmm0		# load next string chunk
	movdqa	%xmm4, %xmm1
	movdqu	%xmm2, (%rdi)		# deposit head into buffer
	sub	%rcx, %rdi		# adjust RDI to correspond to RSI
	mov	%r11, %rdx		# rdx = remaining length bookkeeping
	movdqu	%xmm3, 16(%rdi)		# deposit second chunk
	sub	%rsi, %rdi		# express RDI as distance from RSI
	add	$32, %rsi		# advance RSI past first two chunks
	sub	$16, %rdx		# enough left for another round?
	jb	1f

	/* main loop unrolled twice */
	ALIGN_TEXT
0:	pcmpeqb	%xmm0, %xmm1		# c encountered?
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	3f

	movdqu	%xmm0, (%rsi, %rdi)	# store cleared chunk (rdi is an offset)
	movdqa	16(%rsi), %xmm0		# load next string chunk
	movdqa	%xmm4, %xmm1
	cmp	$16, %rdx		# more than a full chunk left?
	jb	2f

	add	$32, %rsi		# advance pointers to next chunk
	pcmpeqb	%xmm0, %xmm1		# c encountered?
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	4f

	movdqu	%xmm0, -16(%rsi, %rdi)	# store second cleared chunk
	movdqa	(%rsi), %xmm0		# load next string chunk
	movdqa	%xmm4, %xmm1
	sub	$32, %rdx
	jae	0b

1:	sub	$16, %rsi		# undo second advancement
	add	$16, %edx

	/* 1--16 bytes left in the buffer but string has not ended yet */
2:	pcmpeqb	%xmm1, %xmm0		# c encountered?
	pmovmskb %xmm0, %r8d
	mov	%r8d, %ecx		# ecx = real match mask (before bts)
	bts	%edx, %r8d		# treat end of buffer as end of string
	tzcnt	%r8d, %r8d		# find tail length (r8d nonzero by bts)
	add	%rsi, %rdi		# restore RDI
	movdqu	1(%rsi, %r8, 1), %xmm0	# load string tail (overlapping copy)
	movdqu	%xmm0, 1(%rdi, %r8, 1)	# store string tail
	lea	17(%rdi, %r8, 1), %rsi	# return value if terminator encountered
	xor	%eax, %eax		# return value if no terminator encountered
	bt	%r8d, %ecx		# terminator encountered inside buffer?
	cmovc	%rsi, %rax		# if yes, return pointer, else NULL
	ret

4:	sub	$16, %rsi		# undo second advancement

	/* terminator found and buffer has not ended yet */
3:	tzcnt	%eax, %eax		# find length of string tail (eax != 0)
	movdqu	-15(%rsi, %rax, 1), %xmm0 # load string tail (incl. c)
	add	%rsi, %rdi		# restore destination pointer
	movdqu	%xmm0, -15(%rdi, %rax, 1) # store string tail (incl. c)
	lea	1(%rdi, %rax, 1), %rax	# compute return value
	ret

	/* buffer is 1--32 bytes in size */
	ALIGN_TEXT
.Lrunt:	add	$32, %r11d		# undo earlier decrement
	mov	%r8d, %r10d		# keep a copy of the original match mask
	bts	%r11d, %r8d		# induce match at buffer end
	and	%ax, %r8w		# is there a match in the first 16 bytes?
					# (16-bit and: only bits 0--15 masked)
	jnz	0f			# if yes, skip looking at second chunk

	pcmpeqb	16(%rsi), %xmm4		# check for match in second chunk
	pmovmskb %xmm4, %r8d
	shl	$16, %r8d		# place second chunk matches in bits 16--31
	mov	%r8d, %r10d		# keep a copy of the original match mask
	bts	%r11d, %r8d		# induce a match at buffer end

0:	xor	%eax, %eax		# return value if terminator not found
	tzcnt	%r8d, %edx		# find string/buffer length from alignment boundary
	lea	1(%rdi, %rdx, 1), %r8	# return value if terminator found + rcx
	sub	%rcx, %r8		# ... minus the alignment offset
	bt	%edx, %r10d		# was the terminator present?
	cmovc	%r8, %rax		# if yes, return pointer, else NULL
	sub	%ecx, %edx		# find actual string/buffer length

	ALIGN_TEXT
.L0132:	cmp	$16, %rdx		# at least 17 bytes to copy?
	jb	.L0116

	/* copy 17--32 bytes with two overlapping 16-byte moves */
	movdqu	(%r9), %xmm0		# load first 16 bytes
	movdqu	-15(%r9, %rdx, 1), %xmm1 # load last 16 bytes
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -15(%rdi, %rdx, 1)
	ret

	/* process strings of 1--16 bytes (rdx: min(buflen, srclen), rax: srclen) */
	ALIGN_TEXT
.L0116:	cmp	$8, %rdx		# at least 9 bytes to copy?
	jae	.L0916

	cmp	$4, %rdx		# at least 5 bytes to copy?
	jae	.L0508

	cmp	$2, %rdx		# at least 3 bytes to copy?
	jae	.L0304

	/* copy one or two bytes (the two may alias when rdx == 0) */
	movzbl	(%r9), %ecx		# load first byte from src
	movzbl	(%r9, %rdx, 1), %esi	# load last byte from src
	mov	%cl, (%rdi)		# deposit into destination
	mov	%sil, (%rdi, %rdx, 1)
	ret

	/* copy 3--4 bytes with overlapping word moves */
.L0304:	movzwl	(%r9), %ecx
	movzwl	-1(%r9, %rdx, 1), %esi
	mov	%cx, (%rdi)
	mov	%si, -1(%rdi, %rdx, 1)
	ret

	/* copy 5--8 bytes with overlapping dword moves */
.L0508:	mov	(%r9), %ecx
	mov	-3(%r9, %rdx, 1), %esi
	mov	%ecx, (%rdi)
	mov	%esi, -3(%rdi, %rdx, 1)
	ret

	/* copy 9--16 bytes with overlapping qword moves */
.L0916:	mov	(%r9), %rcx
	mov	-7(%r9, %rdx, 1), %rsi
	mov	%rcx, (%rdi)
	mov	%rsi, -7(%rdi, %rdx, 1)
	ret

	/* length zero destination: return null pointer */
.L0:	xor	%eax, %eax
	ret
ARCHEND(__memccpy, baseline)

	.section .note.GNU-stack,"",%progbits