/*
 * Copyright (c) 2023, 2024 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4, 0x90

/*
 * memccpy is a weak alias for __memccpy.  Two variants of __memccpy are
 * registered below: "scalar" and "baseline".
 */
	.weak memccpy
	.set memccpy, __memccpy
ARCHFUNCS(__memccpy)
	ARCHFUNC(__memccpy, scalar)
	ARCHFUNC(__memccpy, baseline)
ENDARCHFUNCS(__memccpy)

/*
 * __memccpy, scalar variant.
 *
 * In:	rdi = dest, rsi = src, edx = c, rcx = len
 * Out:	rax = dest + (number of bytes copied) if c was found by __memchr
 *	      within the first len bytes of src, else NULL
 *
 * Implemented as a __memchr call to locate c followed by a memcpy call
 * that copies either up to and including c, or all len bytes if c was
 * not found.  rbx carries len across the first call and the result
 * across the second; dest and src are kept on the stack in between.
 */
ARCHENTRY(__memccpy, scalar)
	push		%rbp			# establish stack frame
	mov		%rsp, %rbp
	push		%rax			# dummy push for alignment
	push		%rbx
	push		%rdi
	push		%rsi

	mov		%rsi, %rdi
	mov		%edx, %esi
	mov		%rcx, %rdx
	mov		%rcx, %rbx
	call		CNAME(__memchr)		# ptr = memchr(src, c, len)

	pop		%rsi
	pop		%rdi
	lea		1(%rax), %rdx
	sub		%rsi, %rdx		# size = ptr - src + 1
	mov		%rbx, %rcx
	lea		(%rdi, %rdx, 1), %rbx	# res = dest + size
	test		%rax, %rax		# if (ptr == NULL)
	cmovz		%rcx, %rdx		# size = len
	cmovz		%rax, %rbx		# res = NULL
	call		CNAME(memcpy)

	mov		%rbx, %rax		# return (res)
	pop		%rbx
	leave					# also discards the dummy alignment slot
	ret
ARCHEND(__memccpy, scalar)

/*
 * __memccpy, baseline variant.
 *
 * In:	rdi = dest, rsi = src, edx = c, rcx = len
 * Out:	rax = pointer just past the copy of c in dest if c was found
 *	      within the buffer, else NULL
 *
 * The source is examined in 16-byte chunks loaded from 16-byte aligned
 * addresses.  The first chunk is loaded from src rounded down to an
 * alignment boundary; match bits for bytes in front of src are masked
 * out.  Register roles after the setup code:
 *
 *	xmm4	c broadcast to all 16 bytes
 *	r9	original (unaligned) source pointer
 *	rsi	source pointer rounded down to a 16-byte boundary
 *	ecx	offset of src from that boundary (0--15)
 *	rdx	len - 1
 *	r11	(offset + len - 1) - 32
 *
 * Buffers that end within the first two aligned chunks are handled at
 * .Lrunt; all other buffers go through the unrolled main loop.
 */
ARCHENTRY(__memccpy, baseline)
	sub		$1, %rcx		# RCX refers to last character in buffer
	jb		.L0			# go to special code path if len was 0

	movd		%edx, %xmm4
	mov		%rcx, %rdx
	punpcklbw	%xmm4, %xmm4		# c -> cc
	mov		%esi, %ecx
	punpcklwd	%xmm4, %xmm4		# cc -> cccc
	mov		%rsi, %r9		# stash a copy of the source pointer for later
	pshufd		$0, %xmm4, %xmm4	# cccc -> cccccccccccccccc
	and		$~0xf, %rsi
	movdqa		%xmm4, %xmm1
	pcmpeqb		(%rsi), %xmm1		# c found in head?
	and		$0xf, %ecx
	mov		$-1, %eax
	pmovmskb	%xmm1, %r8d
	lea		-32(%rcx), %r11
	shl		%cl, %eax		# mask of bytes in the string
	add		%rdx, %r11		# distance from alignment boundary - 32
	jnc		.Lrunt			# jump if buffer length is 32 or less

	and		%r8d, %eax
	jz		0f			# match (or induced match) found?

	/* match in first chunk */
	tzcnt		%eax, %edx		# where is c?
	sub		%ecx, %edx		# ... from the beginning of the string?
	lea		1(%rdi, %rdx, 1), %rax	# return value
	jmp		.L0116

0:	movdqa		16(%rsi), %xmm3		# load second string chunk
	movdqu		(%r9), %xmm2		# load unaligned string head
	movdqa		%xmm4, %xmm1
	pcmpeqb		%xmm3, %xmm1		# c found in second chunk?

	/* process second chunk */
	pmovmskb	%xmm1, %eax
	test		%eax, %eax
	jz		0f

	/* match in second chunk */
	tzcnt		%eax, %edx		# where is c?
	sub		$16, %ecx
	sub		%ecx, %edx		# adjust for alignment offset
	lea		1(%rdi, %rdx, 1), %rax	# return value
	jmp		.L0132

	/* c not found in second chunk: prepare for main loop */
0:	movdqa		32(%rsi), %xmm0		# load next string chunk
	movdqa		%xmm4, %xmm1
	movdqu		%xmm2, (%rdi)		# deposit head into buffer
	sub		%rcx, %rdi		# adjust RDI to correspond to RSI
	mov		%r11, %rdx
	movdqu		%xmm3, 16(%rdi)		# deposit second chunk
	sub		%rsi, %rdi		# express RDI as distance from RSI
	add		$32, %rsi		# advance RSI past first two chunks
	sub		$16, %rdx		# enough left for another round?
	jb		1f

	/* main loop unrolled twice */
	ALIGN_TEXT
0:	pcmpeqb		%xmm0, %xmm1		# c encountered?
	pmovmskb	%xmm1, %eax
	test		%eax, %eax
	jnz		3f

	movdqu		%xmm0, (%rsi, %rdi)
	movdqa		16(%rsi), %xmm0		# load next string chunk
	movdqa		%xmm4, %xmm1
	cmp		$16, %rdx		# more than a full chunk left?
	jb		2f

	add		$32, %rsi		# advance pointers to next chunk
	pcmpeqb		%xmm0, %xmm1		# c encountered?
	pmovmskb	%xmm1, %eax
	test		%eax, %eax
	jnz		4f

	movdqu		%xmm0, -16(%rsi, %rdi)
	movdqa		(%rsi), %xmm0		# load next string chunk
	movdqa		%xmm4, %xmm1
	sub		$32, %rdx
	jae		0b

1:	sub		$16, %rsi		# undo second advancement
	add		$16, %edx

	/* 1--16 bytes left in the buffer but string has not ended yet */
2:	pcmpeqb		%xmm1, %xmm0		# c encountered?
	pmovmskb	%xmm0, %r8d
	mov		%r8d, %ecx
	bts		%edx, %r8d		# treat end of buffer as end of string
	tzcnt		%r8d, %r8d		# find tail length
	add		%rsi, %rdi		# restore RDI
	movdqu		1(%rsi, %r8, 1), %xmm0	# load string tail
	movdqu		%xmm0, 1(%rdi, %r8, 1)	# store string tail
	lea		17(%rdi, %r8, 1), %rsi	# return value if terminator encountered
	xor		%eax, %eax		# return value if no terminator encountered
	bt		%r8d, %ecx		# terminator encountered inside buffer?
	cmovc		%rsi, %rax		# if yes, return pointer, else NULL
	ret

4:	sub		$16, %rsi		# undo second advancement

	/* terminator found and buffer has not ended yet */
3:	tzcnt		%eax, %eax		# find length of string tail
	movdqu		-15(%rsi, %rax, 1), %xmm0 # load string tail (incl. c)
	add		%rsi, %rdi		# restore destination pointer
	movdqu		%xmm0, -15(%rdi, %rax, 1) # store string tail (incl. c)
	lea		1(%rdi, %rax, 1), %rax	# compute return value
	ret

	/* buffer is 1--32 bytes in size */
	ALIGN_TEXT
.Lrunt:	add		$32, %r11d		# undo earlier decrement
	mov		%r8d, %r10d		# keep a copy of the original match mask
	bts		%r11d, %r8d		# induce match at buffer end
	and		%ax, %r8w		# is there a match in the first 16 bytes?
	jnz		0f			# if yes, skip looking at second chunk

	pcmpeqb		16(%rsi), %xmm4		# check for match in second chunk
	pmovmskb	%xmm4, %r8d
	shl		$16, %r8d		# place second chunk matches in bits 16--31
	mov		%r8d, %r10d		# keep a copy of the original match mask
	bts		%r11d, %r8d		# induce a match at buffer end

0:	xor		%eax, %eax		# return value if terminator not found
	tzcnt		%r8d, %edx		# find string/buffer length from alignment boundary
	lea		1(%rdi, %rdx, 1), %r8	# return value if terminator found + rcx
	sub		%rcx, %r8
	bt		%edx, %r10d		# was the terminator present?
	cmovc		%r8, %rax		# if yes, return pointer, else NULL
	sub		%ecx, %edx		# find actual string/buffer length

	/*
	 * Copy tails.  On entry to .L0132 and .L0116, r9 is the original
	 * source pointer, rdi the destination, rdx the number of bytes
	 * to copy minus one and rax the final return value.  Each size
	 * class copies with two possibly overlapping moves: one anchored
	 * at the start and one anchored at the end of the range.
	 */
	ALIGN_TEXT
.L0132:	cmp		$16, %rdx		# at least 17 bytes to copy?
	jb		.L0116

	/* copy 17--32 bytes */
	movdqu		(%r9), %xmm0		# load first 16 bytes
	movdqu		-15(%r9, %rdx, 1), %xmm1 # load last 16 bytes
	movdqu		%xmm0, (%rdi)
	movdqu		%xmm1, -15(%rdi, %rdx, 1)
	ret

	/* copy 1--16 bytes (rdx: number of bytes to copy - 1, rax: return value) */
	ALIGN_TEXT
.L0116:	cmp		$8, %rdx		# at least 9 bytes to copy?
	jae		.L0916

	cmp		$4, %rdx		# at least 5 bytes to copy?
	jae		.L0508

	cmp		$2, %rdx		# at least 3 bytes to copy?
	jae		.L0304

	/* copy one or two bytes */
	movzbl		(%r9), %ecx		# load first byte from src
	movzbl		(%r9, %rdx, 1), %esi	# load last byte from src
	mov		%cl, (%rdi)		# deposit into destination
	mov		%sil, (%rdi, %rdx, 1)
	ret

	/* copy three or four bytes */
.L0304:	movzwl		(%r9), %ecx
	movzwl		-1(%r9, %rdx, 1), %esi
	mov		%cx, (%rdi)
	mov		%si, -1(%rdi, %rdx, 1)
	ret

	/* copy five to eight bytes */
.L0508:	mov		(%r9), %ecx
	mov		-3(%r9, %rdx, 1), %esi
	mov		%ecx, (%rdi)
	mov		%esi, -3(%rdi, %rdx, 1)
	ret

	/* copy nine to sixteen bytes */
.L0916:	mov		(%r9), %rcx
	mov		-7(%r9, %rdx, 1), %rsi
	mov		%rcx, (%rdi)
	mov		%rsi, -7(%rdi, %rdx, 1)
	ret

	/* length zero destination: return null pointer */
.L0:	xor		%eax, %eax
	ret
ARCHEND(__memccpy, baseline)

	.section .note.GNU-stack,"",%progbits