--- stpcpy.S	(1d386b48a555f61cb7325543adbbb5c3f3407a66)
+++ stpcpy.S	(9fbea870286d53d906ffaf6b15ace8e40019a880)
-/*
- * Adapted by Guillaume Morin <guillaume@morinfr.org> from strcpy.S
- * written by J.T. Conklin <jtc@acorntoolworks.com>
- * Public domain.
+/*-
+ * Copyright (c) 2023, The FreeBSD Foundation
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Portions of this software were developed by Robert Clausecker
+ * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
+ *
+ * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcpy.S
+ * written by J.T. Conklin <jtc@acorntoolworks.com> and
+ * adapted by Guillaume Morin <guillaume@morinfr.org> to implement stpcpy
+ * that was originally dedicated to the public domain
  */
 
 #include <machine/asm.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT	.p2align 4, 0x90
+
+	.weak stpcpy
+	.set stpcpy, __stpcpy
+ARCHFUNCS(__stpcpy)
+	ARCHFUNC(__stpcpy, scalar)
+	ARCHFUNC(__stpcpy, baseline)
+ENDARCHFUNCS(__stpcpy)
+
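The added ARCHFUNCS/ARCHFUNC/ENDARCHFUNCS block selects one of the named
implementations at load time. As a rough illustration of that style of
dispatch, here is a sketch of the generic ELF ifunc pattern in C, with
hypothetical names; this is not FreeBSD's actual amd64_archlevel
machinery, only the same idea (assumes GCC/Clang on an x86-64 ELF
platform):

	#include <stdio.h>

	static char *stpcpy_scalar(char *dst, const char *src)
	{
		while ((*dst = *src++) != '\0')
			dst++;
		return dst;	/* stpcpy returns a pointer to the NUL */
	}

	static char *stpcpy_fancy(char *dst, const char *src)
	{
		return stpcpy_scalar(dst, src);	/* stand-in for a SIMD version */
	}

	/* The resolver runs once, when the dynamic linker binds
	 * my_stpcpy, and returns the implementation to use. */
	static void *resolve_my_stpcpy(void)
	{
		__builtin_cpu_init();
		return __builtin_cpu_supports("sse2")
		    ? (void *)stpcpy_fancy : (void *)stpcpy_scalar;
	}

	char *my_stpcpy(char *, const char *)
	    __attribute__((ifunc("resolve_my_stpcpy")));

	int main(void)
	{
		char buf[16];

		my_stpcpy(buf, "dispatched");
		puts(buf);	/* prints "dispatched" */
		return 0;
	}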
 /*
  * This stpcpy implementation copies a byte at a time until the
  * source pointer is aligned to a word boundary, it then copies by
  * words until it finds a word containing a zero byte, and finally
  * copies by bytes until the end of the string is reached.
  *
  * While this may result in unaligned stores if the source and
  * destination pointers are unaligned with respect to each other,
  * it is still faster than either byte copies or the overhead of
  * an implementation suitable for machines with strict alignment
  * requirements.
  */
 
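The word-at-a-time scan described in the comment above rests on a
classic bit trick: for a 64-bit word v, the value
(v - 0x0101010101010101) & ~v & 0x8080808080808080 is nonzero exactly
when v contains a zero byte, which is what the %r8/%r9 constants below
are for. A minimal C sketch of that test (hypothetical helper name):

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/* Nonzero iff some byte of v is 0x00: subtracting 0x01 from
	 * every byte borrows exactly where a byte was zero, and the
	 * ~v mask discards borrows out of bytes whose high bit was
	 * already set. */
	static int has_zero_byte(uint64_t v)
	{
		return ((v - 0x0101010101010101ULL) & ~v &
		    0x8080808080808080ULL) != 0;
	}

	int main(void)
	{
		uint64_t w;

		memcpy(&w, "abcdefgh", 8);		/* no NUL inside */
		printf("%d\n", has_zero_byte(w));	/* 0 */

		memcpy(&w, "abc\0efgh", 8);		/* NUL at byte 3 */
		printf("%d\n", has_zero_byte(w));	/* 1 */
		return 0;
	}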
-	.globl stpcpy,__stpcpy
-ENTRY(stpcpy)
-__stpcpy:
+ARCHENTRY(__stpcpy, scalar)
 	movabsq	$0x0101010101010101,%r8
 	movabsq	$0x8080808080808080,%r9
 
 	/*
 	 * Align source to a word boundary.
 	 * Consider unrolling loop?
 	 */
 .Lalign:
[... 4 unchanged lines hidden ...]
 	movb	%dl,(%rdi)
 	incq	%rdi
 	testb	%dl,%dl
 	jne	.Lalign
 	movq	%rdi,%rax
 	dec	%rax
 	ret
 
-	.p2align 4
+	ALIGN_TEXT
 .Lloop:
 	movq	%rdx,(%rdi)
 	addq	$8,%rdi
 .Lword_aligned:
 	movq	(%rsi),%rdx
 	movq	%rdx,%rcx
 	addq	$8,%rsi
 	subq	%r8,%rcx
[... 51 unchanged lines hidden ...]
 	incq	%rdi
 	testb	%dl,%dl		/* 8th byte == 0? */
 	jne	.Lword_aligned
 	decq	%rdi
 
 .Ldone:
 	movq	%rdi,%rax
 	ret
-END(stpcpy)
+ARCHEND(__stpcpy, scalar)
 
+ARCHENTRY(__stpcpy, baseline)
+	mov	%esi, %ecx
+	mov	%rdi, %rdx
+	sub	%rsi, %rdi		# express destination as distance to source
+	and	$~0xf, %rsi		# align source to 16 bytes
+	movdqa	(%rsi), %xmm0		# head of string with junk before
+	pxor	%xmm1, %xmm1
+	and	$0xf, %ecx		# misalignment in bytes
+	pcmpeqb	%xmm1, %xmm0		# NUL byte present?
+	pmovmskb %xmm0, %eax
+	shr	%cl, %eax		# clear out matches in junk bytes
+	bsf	%eax, %eax		# find match if any
+	jnz	.Lrunt
+
+	/* first normal iteration: write head back if it succeeds */
+	movdqa	16(%rsi), %xmm0		# 16 bytes of current iteration
+	movdqu	(%rsi, %rcx, 1), %xmm2	# first 16 bytes of the string
+	pcmpeqb	%xmm0, %xmm1		# NUL byte present?
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax		# find match if any
+	jnz	.Lshorty
+
+	movdqu	%xmm2, (%rdx)		# store beginning of string
+
+	/* main loop, unrolled twice */
+	ALIGN_TEXT
+0:	movdqa	32(%rsi), %xmm2		# load current iteration
+	movdqu	%xmm0, 16(%rsi, %rdi, 1) # write back previous iteration
+	pxor	%xmm1, %xmm1
+	add	$32, %rsi
+	pcmpeqb	%xmm2, %xmm1		# NUL byte present?
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	1f
+
+	movdqa	16(%rsi), %xmm0		# load current iteration
+	movdqu	%xmm2, (%rsi, %rdi, 1)	# write back previous iteration
+	pxor	%xmm1, %xmm1
+	pcmpeqb	%xmm0, %xmm1		# NUL byte present?
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jz	0b
+
+	/* end of string after main loop has iterated */
+	add	$16, %rsi		# advance rsi to second unrolled half
+1:	tzcnt	%eax, %eax		# find location of match
+					# (behaves as bsf on pre-x86-64-v3 CPUs)
+	add	%rsi, %rax		# point to NUL byte
+	movdqu	-15(%rax), %xmm0	# last 16 bytes of string
+	movdqu	%xmm0, -15(%rax, %rdi, 1) # copied to destination
+	add	%rdi, %rax		# point to destination's NUL byte
+	ret
+
+	/* NUL encountered in second iteration */
+.Lshorty:
+	tzcnt	%eax, %eax
+	add	$16, %eax		# account for length of first iteration
+	sub	%ecx, %eax		# but not the parts before the string
+
+	/* NUL encountered in first iteration */
+.Lrunt:	lea	1(%rax), %edi		# string length including NUL byte
+	add	%rcx, %rsi		# point to beginning of string
+	add	%rdx, %rax		# point to NUL byte
+
+	/* transfer 16--32 bytes */
+.L1632:	cmp	$16, %edi
+	jb	.L0815
+
+	movdqu	-16(%rsi, %rdi, 1), %xmm0 # load last 16 bytes
+	movdqu	%xmm2, (%rdx)		# store first 16 bytes
+	movdqu	%xmm0, -15(%rax)	# store last 16 bytes
+	ret
+
+	/* transfer 8--15 bytes */
+.L0815:	cmp	$8, %edi
+	jb	.L0407
+
+	mov	(%rsi), %rcx		# load first 8 bytes
+	mov	-8(%rsi, %rdi, 1), %rdi	# load last 8 bytes
+	mov	%rcx, (%rdx)		# store to dst
+	mov	%rdi, -7(%rax)		# ditto
+	ret
+
+	/* transfer 4--7 bytes */
+.L0407:	cmp	$4, %edi
+	jb	.L0203
+
+	mov	(%rsi), %ecx
+	mov	-4(%rsi, %rdi, 1), %edi
+	mov	%ecx, (%rdx)
+	mov	%edi, -3(%rax)
+	ret
+
+	/* transfer 2--3 bytes */
+.L0203:	cmp	$2, %edi
+	jb	.L0101
+
+	movzwl	(%rsi), %ecx
+	mov	%cx, (%rdx)		# store first two bytes
+
+	/* transfer 0 bytes (last byte is always NUL) */
+.L0101:	movb	$0, (%rax)		# store terminating NUL byte
+	ret
+ARCHEND(__stpcpy, baseline)
+
 	.section .note.GNU-stack,"",%progbits
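
For reference, the pcmpeqb/pmovmskb/bsf sequence at the heart of the
added baseline version maps directly onto SSE2 intrinsics. A small C
sketch of just that NUL-scan step, with a hypothetical helper name and
assuming GCC/Clang builtins; it covers neither the dispatch nor the
copy logic:

	#include <emmintrin.h>	/* SSE2: _mm_cmpeq_epi8, _mm_movemask_epi8 */
	#include <stdio.h>

	/* Scan one aligned 16-byte block for a NUL byte; returns the
	 * index of the first NUL (0..15) or -1 if none, mirroring the
	 * pcmpeqb/pmovmskb/bsf sequence above. */
	static int nul_index16(const char *p)	/* p must be 16-byte aligned */
	{
		__m128i chunk = _mm_load_si128((const __m128i *)p);
		__m128i zeros = _mm_setzero_si128();
		int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, zeros));

		return mask != 0 ? __builtin_ctz(mask) : -1;	/* ctz ~ bsf */
	}

	int main(void)
	{
		static const char buf[16] __attribute__((aligned(16))) =
		    "hello, world";	/* NUL at index 12 */

		printf("%d\n", nul_index16(buf));	/* prints 12 */
		return 0;
	}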