--- stpcpy.S	(1d386b48a555f61cb7325543adbbb5c3f3407a66)
+++ stpcpy.S	(9fbea870286d53d906ffaf6b15ace8e40019a880)
-/*
- * Adapted by Guillaume Morin <guillaume@morinfr.org> from strcpy.S
- * written by J.T. Conklin <jtc@acorntoolworks.com>
- * Public domain.
+/*-
+ * Copyright (c) 2023, The FreeBSD Foundation
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Portions of this software were developed by Robert Clausecker
+ * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
+ *
+ * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcpy.S
+ * written by J.T. Conklin <jtc@acorntoolworks.com> and
+ * adapted by Guillaume Morin <guillaume@morinfr.org> to implement stpcpy
+ * that was originally dedicated to the public domain
  */
 
 #include <machine/asm.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT	.p2align 4, 0x90
+
+	.weak stpcpy
+	.set stpcpy, __stpcpy
+ARCHFUNCS(__stpcpy)
+	ARCHFUNC(__stpcpy, scalar)
+	ARCHFUNC(__stpcpy, baseline)
+ENDARCHFUNCS(__stpcpy)
+
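The added ARCHFUNCS/ARCHFUNC/ENDARCHFUNCS block selects one of the named
implementations at load time. As a rough illustration of that style of
dispatch, here is a sketch of the generic ELF ifunc pattern in C, with
hypothetical names; this is not FreeBSD's actual amd64_archlevel
machinery, only the same idea (assumes GCC/Clang on an x86-64 ELF
platform):

	#include <stdio.h>

	static char *stpcpy_scalar(char *dst, const char *src)
	{
		while ((*dst = *src++) != '\0')
			dst++;
		return dst;	/* stpcpy returns a pointer to the NUL */
	}

	static char *stpcpy_fancy(char *dst, const char *src)
	{
		return stpcpy_scalar(dst, src);	/* stand-in for a SIMD version */
	}

	/* The resolver runs once, when the dynamic linker binds
	 * my_stpcpy, and returns the implementation to use. */
	static void *resolve_my_stpcpy(void)
	{
		__builtin_cpu_init();
		return __builtin_cpu_supports("sse2")
		    ? (void *)stpcpy_fancy : (void *)stpcpy_scalar;
	}

	char *my_stpcpy(char *, const char *)
	    __attribute__((ifunc("resolve_my_stpcpy")));

	int main(void)
	{
		char buf[16];

		my_stpcpy(buf, "dispatched");
		puts(buf);	/* prints "dispatched" */
		return 0;
	}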
 /*
  * This stpcpy implementation copies a byte at a time until the
  * source pointer is aligned to a word boundary, it then copies by
  * words until it finds a word containing a zero byte, and finally
  * copies by bytes until the end of the string is reached.
  *
  * While this may result in unaligned stores if the source and
  * destination pointers are unaligned with respect to each other,
  * it is still faster than either byte copies or the overhead of
  * an implementation suitable for machines with strict alignment
  * requirements.
  */
 
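The word-at-a-time scan described in the comment above rests on a
classic bit trick: for a 64-bit word v, the value
(v - 0x0101010101010101) & ~v & 0x8080808080808080 is nonzero exactly
when v contains a zero byte, which is what the %r8/%r9 constants below
are for. A minimal C sketch of that test (hypothetical helper name):

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/* Nonzero iff some byte of v is 0x00: subtracting 0x01 from
	 * every byte borrows exactly where a byte was zero, and the
	 * ~v mask discards borrows out of bytes whose high bit was
	 * already set. */
	static int has_zero_byte(uint64_t v)
	{
		return ((v - 0x0101010101010101ULL) & ~v &
		    0x8080808080808080ULL) != 0;
	}

	int main(void)
	{
		uint64_t w;

		memcpy(&w, "abcdefgh", 8);		/* no NUL inside */
		printf("%d\n", has_zero_byte(w));	/* 0 */

		memcpy(&w, "abc\0efgh", 8);		/* NUL at byte 3 */
		printf("%d\n", has_zero_byte(w));	/* 1 */
		return 0;
	}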
-	.globl stpcpy,__stpcpy
-ENTRY(stpcpy)
-__stpcpy:
+ARCHENTRY(__stpcpy, scalar)
 	movabsq	$0x0101010101010101,%r8
 	movabsq	$0x8080808080808080,%r9
 
 	/*
 	 * Align source to a word boundary.
 	 * Consider unrolling loop?
 	 */
 .Lalign:
[... 4 unchanged lines hidden ...]
 	movb	%dl,(%rdi)
 	incq	%rdi
 	testb	%dl,%dl
 	jne	.Lalign
 	movq	%rdi,%rax
 	dec	%rax
 	ret
 
-	.p2align 4
+	ALIGN_TEXT
 .Lloop:
 	movq	%rdx,(%rdi)
 	addq	$8,%rdi
 .Lword_aligned:
 	movq	(%rsi),%rdx
 	movq	%rdx,%rcx
 	addq	$8,%rsi
 	subq	%r8,%rcx
[... 51 unchanged lines hidden ...]
 	incq	%rdi
 	testb	%dl,%dl		/* 8th byte == 0? */
 	jne	.Lword_aligned
 	decq	%rdi
 
 .Ldone:
 	movq	%rdi,%rax
 	ret
-END(stpcpy)
+ARCHEND(__stpcpy, scalar)
 
+ARCHENTRY(__stpcpy, baseline)
+	mov	%esi, %ecx
+	mov	%rdi, %rdx
+	sub	%rsi, %rdi		# express destination as distance to source
+	and	$~0xf, %rsi		# align source to 16 bytes
+	movdqa	(%rsi), %xmm0		# head of string with junk before
+	pxor	%xmm1, %xmm1
+	and	$0xf, %ecx		# misalignment in bytes
+	pcmpeqb	%xmm1, %xmm0		# NUL byte present?
+	pmovmskb %xmm0, %eax
+	shr	%cl, %eax		# clear out matches in junk bytes
+	bsf	%eax, %eax		# find match if any
+	jnz	.Lrunt
+
+	/* first normal iteration: write head back if it succeeds */
+	movdqa	16(%rsi), %xmm0		# 16 bytes of current iteration
+	movdqu	(%rsi, %rcx, 1), %xmm2	# first 16 bytes of the string
+	pcmpeqb	%xmm0, %xmm1		# NUL byte present?
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax		# find match if any
+	jnz	.Lshorty
+
+	movdqu	%xmm2, (%rdx)		# store beginning of string
+
+	/* main loop, unrolled twice */
+	ALIGN_TEXT
+0:	movdqa	32(%rsi), %xmm2		# load current iteration
+	movdqu	%xmm0, 16(%rsi, %rdi, 1) # write back previous iteration
+	pxor	%xmm1, %xmm1
+	add	$32, %rsi
+	pcmpeqb	%xmm2, %xmm1		# NUL byte present?
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	1f
+
+	movdqa	16(%rsi), %xmm0		# load current iteration
+	movdqu	%xmm2, (%rsi, %rdi, 1)	# write back previous iteration
+	pxor	%xmm1, %xmm1
+	pcmpeqb	%xmm0, %xmm1		# NUL byte present?
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jz	0b
+
+	/* end of string after main loop has iterated */
+	add	$16, %rsi		# advance rsi to second unrolled half
+1:	tzcnt	%eax, %eax		# find location of match
+					# (behaves as bsf on pre-x86-64-v3 CPUs)
+	add	%rsi, %rax		# point to NUL byte
+	movdqu	-15(%rax), %xmm0	# last 16 bytes of string
+	movdqu	%xmm0, -15(%rax, %rdi, 1) # copied to destination
+	add	%rdi, %rax		# point to destination's NUL byte
+	ret
+
+	/* NUL encountered in second iteration */
+.Lshorty:
+	tzcnt	%eax, %eax
+	add	$16, %eax		# account for length of first iteration
+	sub	%ecx, %eax		# but not the parts before the string
+
+	/* NUL encountered in first iteration */
+.Lrunt:	lea	1(%rax), %edi		# string length including NUL byte
+	add	%rcx, %rsi		# point to beginning of string
+	add	%rdx, %rax		# point to NUL byte
+
+	/* transfer 16--32 bytes */
+.L1632:	cmp	$16, %edi
+	jb	.L0815
+
+	movdqu	-16(%rsi, %rdi, 1), %xmm0 # load last 16 bytes
+	movdqu	%xmm2, (%rdx)		# store first 16 bytes
+	movdqu	%xmm0, -15(%rax)	# store last 16 bytes
+	ret
+
+	/* transfer 8--15 bytes */
+.L0815:	cmp	$8, %edi
+	jb	.L0407
+
+	mov	(%rsi), %rcx		# load first 8 bytes
+	mov	-8(%rsi, %rdi, 1), %rdi	# load last 8 bytes
+	mov	%rcx, (%rdx)		# store to dst
+	mov	%rdi, -7(%rax)		# ditto
+	ret
+
+	/* transfer 4--7 bytes */
+.L0407:	cmp	$4, %edi
+	jb	.L0203
+
+	mov	(%rsi), %ecx
+	mov	-4(%rsi, %rdi, 1), %edi
+	mov	%ecx, (%rdx)
+	mov	%edi, -3(%rax)
+	ret
+
+	/* transfer 2--3 bytes */
+.L0203:	cmp	$2, %edi
+	jb	.L0101
+
+	movzwl	(%rsi), %ecx
+	mov	%cx, (%rdx)		# store first two bytes
+
+	/* transfer 0 bytes (last byte is always NUL) */
+.L0101:	movb	$0, (%rax)		# store terminating NUL byte
+	ret
+ARCHEND(__stpcpy, baseline)
+
 	.section .note.GNU-stack,"",%progbits
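
For reference, the pcmpeqb/pmovmskb/bsf sequence at the heart of the
added baseline version maps directly onto SSE2 intrinsics. A small C
sketch of just that NUL-scan step, with a hypothetical helper name and
assuming GCC/Clang builtins; it covers neither the dispatch nor the
copy logic:

	#include <emmintrin.h>	/* SSE2: _mm_cmpeq_epi8, _mm_movemask_epi8 */
	#include <stdio.h>

	/* Scan one aligned 16-byte block for a NUL byte; returns the
	 * index of the first NUL (0..15) or -1 if none, mirroring the
	 * pcmpeqb/pmovmskb/bsf sequence above. */
	static int nul_index16(const char *p)	/* p must be 16-byte aligned */
	{
		__m128i chunk = _mm_load_si128((const __m128i *)p);
		__m128i zeros = _mm_setzero_si128();
		int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, zeros));

		return mask != 0 ? __builtin_ctz(mask) : -1;	/* ctz ~ bsf */
	}

	int main(void)
	{
		static const char buf[16] __attribute__((aligned(16))) =
		    "hello, world";	/* NUL at index 12 */

		printf("%d\n", nul_index16(buf));	/* prints 12 */
		return 0;
	}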