/*-
 * Copyright (c) 2023, The FreeBSD Foundation
 *
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Portions of this software were developed by Robert Clausecker
 * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
 *
 * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcpy.S
 * written by J.T. Conklin <jtc@acorntoolworks.com> and
 * adapted by Guillaume Morin <guillaume@morinfr.org> to implement stpcpy
 * that was originally dedicated to the public domain
 */

/*
 * char *stpcpy(char *dst, const char *src)
 *
 * Syntax: GNU as, AT&T operand order, preprocessed by cpp.
 * Register use in both implementations below follows the System V
 * AMD64 convention: dst in %rdi, src in %rsi, result in %rax.
 * Both are leaf routines that touch neither the stack nor any
 * callee-saved register.
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4, 0x90

	/* stpcpy is a weak alias for __stpcpy */
	.weak	stpcpy
	.set	stpcpy, __stpcpy

/*
 * List of the implementations of __stpcpy provided by this file.
 * NOTE(review): the ARCH* macros come from amd64_archlevel.h, which is
 * not visible here; presumably they pick one variant based on the CPU's
 * architecture level -- confirm against that header.
 */
ARCHFUNCS(__stpcpy)
	ARCHFUNC(__stpcpy, scalar)
	ARCHFUNC(__stpcpy, baseline)
ENDARCHFUNCS(__stpcpy)

/*
 * __stpcpy, scalar variant
 *
 * In:     %rdi = dst, %rsi = src
 * Out:    %rax = address of the NUL byte written to dst
 * Clobb:  %rcx, %rdx, %rsi, %rdi, %r8, %r9, flags
 *
 * This stpcpy implementation copies a byte at a time until the
 * source pointer is aligned to a word boundary.  It then copies by
 * words until it finds a word that may contain a zero byte, and
 * finally copies by bytes until the end of the string is reached.
 *
 * While this may result in unaligned stores if the source and
 * destination pointers are unaligned with respect to each other,
 * it is still faster than either byte copies or the overhead of
 * an implementation suitable for machines with strict alignment
 * requirements.
 */

ARCHENTRY(__stpcpy, scalar)
	movabsq	$0x0101010101010101,%r8	/* 0x01 in every byte */
	movabsq	$0x8080808080808080,%r9	/* 0x80 in every byte */

	/*
	 * Align source to a word boundary.
	 * Consider unrolling loop?
	 */
.Lalign:
	testb	$7,%sil
	je	.Lword_aligned
	movb	(%rsi),%dl
	incq	%rsi
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl
	jne	.Lalign
	leaq	-1(%rdi),%rax		/* %rdi is one past the NUL just stored */
	ret

	ALIGN_TEXT
.Lloop:
	movq	%rdx,(%rdi)		/* word held no zero byte: store it */
	addq	$8,%rdi
.Lword_aligned:
	movq	(%rsi),%rdx
	movq	%rdx,%rcx
	addq	$8,%rsi
	/*
	 * (word - 0x01..01) & 0x80..80 is nonzero whenever some byte of
	 * the word is zero.  It is also nonzero for words without a zero
	 * byte if they contain a byte in the range 0x81..0xff, so a
	 * nonzero result only means "possibly a zero byte".
	 */
	subq	%r8,%rcx
	testq	%r9,%rcx
	je	.Lloop

	/*
	 * In rare cases, the above loop may exit prematurely. We must
	 * return to the loop if none of the bytes in the word equal 0.
	 *
	 * The word in %rdx is stored one byte at a time, lowest byte
	 * first, stopping at the first zero byte.
	 */

	movb	%dl,(%rdi)
	testb	%dl,%dl		/* 1st byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl		/* 2nd byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl		/* 3rd byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl		/* 4th byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl		/* 5th byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl		/* 6th byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl		/* 7th byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl		/* 8th byte == 0? */
	jne	.Lword_aligned	/* false alarm: all 8 bytes stored, resume */
	decq	%rdi		/* step back onto the NUL just stored */

.Ldone:
	movq	%rdi,%rax	/* %rdi points at the NUL in dst */
	ret
ARCHEND(__stpcpy, scalar)

/*
 * __stpcpy, baseline (SSE2) variant
 *
 * In:     %rdi = dst, %rsi = src
 * Out:    %rax = address of the NUL byte written to dst
 * Clobb:  %rcx, %rdx, %rsi, %rdi, %xmm0, %xmm1, %xmm2, flags
 *
 * The source is scanned in 16-byte chunks using aligned loads, starting
 * at src rounded down to a 16-byte boundary.  Register roles once the
 * setup is done:
 *
 *   %rsi  aligned source cursor
 *   %rdi  dst - src, so that (%rsi, %rdi, 1) is the destination address
 *         matching source address (%rsi)
 *   %rdx  dst as passed in
 *   %ecx  src & 0xf, the number of bytes preceding the string in the
 *         first chunk
 *
 * Strings whose NUL lies in the first or second chunk are handled by the
 * size-classed code at .Lrunt; longer strings go through the main loop
 * and finish with one 16-byte copy that ends exactly on the NUL and may
 * overlap bytes that were already copied.
 */
ARCHENTRY(__stpcpy, baseline)
	mov	%esi, %ecx
	mov	%rdi, %rdx
	sub	%rsi, %rdi		# express destination as distance to source
	and	$~0xf, %rsi		# align source to 16 byte
	movdqa	(%rsi), %xmm0		# head of string with junk before
	pxor	%xmm1, %xmm1
	and	$0xf, %ecx		# misalignment in bytes
	pcmpeqb	%xmm1, %xmm0		# NUL byte present?
	pmovmskb %xmm0, %eax
	shr	%cl, %eax		# clear out matches in junk bytes
	bsf	%eax, %eax		# find match if any
	jnz	.Lrunt

	/* first normal iteration: write head back if it succeeds */
	movdqa	16(%rsi), %xmm0		# 16 bytes of current iteration
	movdqu	(%rsi, %rcx, 1), %xmm2	# first 16 bytes of the string
	pcmpeqb	%xmm0, %xmm1		# NUL byte present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax		# find match if any
	jnz	.Lshorty

	movdqu	%xmm2, (%rdx)		# store beginning of string

	/* main loop, unrolled twice */
	ALIGN_TEXT
0:	movdqa	32(%rsi), %xmm2		# load current iteration
	movdqu	%xmm0, 16(%rsi, %rdi, 1) # write back previous iteration
	pxor	%xmm1, %xmm1
	add	$32, %rsi
	pcmpeqb	%xmm2, %xmm1		# NUL byte present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	1f

	movdqa	16(%rsi), %xmm0		# load current iteration
	movdqu	%xmm2, (%rsi, %rdi, 1)	# write back previous iteration
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm0, %xmm1		# NUL byte present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jz	0b

	/* end of string after main loop has iterated */
	add	$16, %rsi		# advance rsi to second unrolled half
1:	tzcnt	%eax, %eax		# find location of match
					# (behaves as bsf on pre-x86-64-v3 CPUs)
	add	%rsi, %rax		# point to NUL byte
	movdqu	-15(%rax), %xmm0	# last 16 bytes of string
	movdqu	%xmm0, -15(%rax, %rdi, 1) # copied to destination
	add	%rdi, %rax		# point to destination's NUL byte
	ret

	/* NUL encountered in second iteration */
.Lshorty:
	tzcnt	%eax, %eax
	add	$16, %eax		# account for length of first iteration
	sub	%ecx, %eax		# but not the parts before the string

	/*
	 * NUL encountered in first iteration
	 *
	 * On arrival %eax is the offset of the NUL from the start of the
	 * string.  Afterwards %edi is the number of bytes to copy, %rsi
	 * the start of the string and %rax the address of the NUL in dst.
	 */
.Lrunt:	lea	1(%rax), %edi		# string length including NUL byte
	add	%rcx, %rsi		# point to beginning of string
	add	%rdx, %rax		# point to NUL byte

	/*
	 * transfer 16--32 bytes
	 *
	 * %xmm2 has only been loaded when coming from .Lshorty.  From
	 * .Lrunt this code is reached only with a length of exactly 16,
	 * in which case the second store covers the same 16 bytes as the
	 * first and replaces whatever it wrote.
	 */
.L1632:	cmp	$16, %edi
	jb	.L0815

	movdqu	-16(%rsi, %rdi, 1), %xmm0 # load last 16 bytes
	movdqu	%xmm2, (%rdx)		# store first 16 bytes
	movdqu	%xmm0, -15(%rax)	# store last 16 bytes
	ret

	/* transfer 8--15 bytes */
.L0815:	cmp	$8, %edi
	jb	.L0407

	mov	(%rsi), %rcx		# load first 8 bytes
	mov	-8(%rsi, %rdi, 1), %rdi	# load last 8 bytes
	mov	%rcx, (%rdx)		# store to dst
	mov	%rdi, -7(%rax)		# ditto
	ret

	/* transfer 4--7 bytes */
.L0407:	cmp	$4, %edi
	jb	.L0203

	mov	(%rsi), %ecx		# load first 4 bytes
	mov	-4(%rsi, %rdi, 1), %edi	# load last 4 bytes
	mov	%ecx, (%rdx)		# store to dst
	mov	%edi, -3(%rax)		# ditto
	ret

	/* transfer 2--3 bytes */
.L0203:	cmp	$2, %edi
	jb	.L0101

	movzwl	(%rsi), %ecx
	mov	%cx, (%rdx)		# store first two bytes

	/* transfer 0 bytes (last byte is always NUL) */
.L0101:	movb	$0, (%rax)		# store terminating NUL byte
	ret
ARCHEND(__stpcpy, baseline)

	.section .note.GNU-stack,"",%progbits