/* Origin: /freebsd/lib/libc/powerpc64/string/bcopy.S (revision 13ec1e3155c7e9bf037b12af186351b7fa9b9450) */
/*-
 * Copyright (c) 2018 Instituto de Pesquisas Eldorado
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the author nor the names of its contributors may
 *    be used to endorse or promote products derived from this software
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <machine/asm.h>
__FBSDID("$FreeBSD$");

/* Phase 2 of the multi-phase copy moves one BLOCK_SIZE-byte block per iteration. */
#define BLOCK_SIZE_BITS			6
#define BLOCK_SIZE			(1 << BLOCK_SIZE_BITS)
#define BLOCK_SIZE_MASK			(BLOCK_SIZE - 1)

/*
 * Minimum 8 byte alignment, to avoid cache-inhibited alignment faults.
 * May be overridden by defining ALIGN_MASK before this point.
 */
#ifndef ALIGN_MASK
#define ALIGN_MASK			0x7
#endif

/* Copies of at least this many bytes take the multi-phase path. */
#define MULTI_PHASE_THRESHOLD		512

/*
 * Entry point name.  Unless FN_NAME was already defined, build __memmove
 * (when MEMMOVE is defined) or __bcopy, and export the public name as a
 * weak reference to it.
 */
#ifndef FN_NAME
#ifdef MEMMOVE
#define FN_NAME				__memmove
WEAK_REFERENCE(__memmove, memmove);
#else
#define FN_NAME				__bcopy
WEAK_REFERENCE(__bcopy, bcopy);
#endif
#endif

/*
 * FN_NAME: copy len bytes between two buffers that may overlap.
 *
 * Register roles once the entry sequence has run:
 *   r3: dst
 *   r4: src
 *   r5: len
 *
 * With MEMMOVE defined the arguments already arrive in that order and the
 * original dst is saved so it can be handed back in r3 on return.  Without
 * MEMMOVE (bcopy) the first two arguments are swapped on entry to obtain
 * the same layout.
 *
 * Scratch registers: r0 and r6-r10.  r14-r21 are used by the default
 * phase 2 loop and are saved/restored around it.
 *
 * No stack frame is allocated; the following slots below r1 are used:
 *    -8(r1)            saved dst (MEMMOVE only)
 *   -16(r1)            16-byte loop increment (16 or -16)
 *   -24(r1)            16-byte loop pre/post adjustment (0 or -15)
 *   -32(r1)            bytes to copy in phase 1
 *   -40(r1)            blocks to copy in phase 2
 *   -48(r1)            bytes to copy in phase 3
 *   -56(r1)..-112(r1)  saved r14-r21
 * NOTE(review): this relies on the area below the stack pointer being
 * usable by a leaf routine -- confirm against the ABI's protected zone.
 *
 * Copy direction: forward when src >= dst, backward otherwise, so that
 * overlapping buffers are handled.  Addresses are compared as unsigned
 * values (cmpld).
 *
 * Paths:
 *   - src and dst differ in their low ALIGN_MASK bits: byte-by-byte copy.
 *   - len < MULTI_PHASE_THRESHOLD: byte copy until src is 16-byte aligned,
 *     then 16 bytes per iteration, then the remaining bytes.
 *   - otherwise: phase 1 (bytes), phase 2 (BLOCK_SIZE-byte blocks),
 *     phase 3 (shares the 16-byte/1-byte loops of the previous path).
 */

ENTRY(FN_NAME)
	cmpld	%r3, %r4		/* src == dst? nothing to do */
	beqlr-
	cmpdi	%r5, 0			/* len == 0? nothing to do */
	beqlr-

#ifdef MEMMOVE
	std	%r3, -8(%r1)		/* save dst */
#else	/* bcopy: swap src/dst */
	mr	%r0, %r3
	mr	%r3, %r4
	mr	%r4, %r0
#endif

	/*
	 * First check for relative alignment, if unaligned copy one byte
	 * at a time.
	 */
	andi.	%r8, %r3, ALIGN_MASK
	andi.	%r7, %r4, ALIGN_MASK
	cmpd	%r7, %r8
	bne	.Lunaligned

	/* Same relative alignment: pick the path by length. */
	cmpldi	%r5, MULTI_PHASE_THRESHOLD
	bge	.Lmulti_phase
	b	.Lfast_copy

.Lunaligned:
	/* forward or backward copy? (unsigned address compare) */
	cmpld	%r4, %r3
	blt	.Lbackward_unaligned

	/* Just need to setup increment and jump to copy */
	li	%r0, 1
	mtctr	%r5			/* copy all len bytes */
	b	.Lsingle_1_loop

.Lbackward_unaligned:
	/* advance src and dst to last byte, set decrement and jump to copy */
	add	%r3, %r3, %r5
	addi	%r3, %r3, -1
	add	%r4, %r4, %r5
	addi	%r4, %r4, -1
	li	%r0, -1
	mtctr	%r5			/* copy all len bytes */
	b	.Lsingle_1_loop

.Lfast_copy:
	/* align src */
	cmpld	%r4, %r3		/* forward or backward copy? */
	blt	.Lbackward_align

	/* Forward: copy single bytes until src is 16-byte aligned. */
	.align 5
.Lalign:
	andi.	%r0, %r4, 15
	beq	.Lsingle_copy
	lbz	%r0, 0(%r4)
	addi	%r4, %r4, 1
	stb	%r0, 0(%r3)
	addi	%r3, %r3, 1
	addi	%r5, %r5, -1
	cmpdi	%r5, 0			/* ran out of bytes while aligning? */
	beq-	.Ldone
	b	.Lalign

.Lbackward_align:
	/* advance src and dst to end (past last byte) */
	add	%r3, %r3, %r5
	add	%r4, %r4, %r5
	/* Backward: copy single bytes until the src end is 16-byte aligned. */
	.align 5
.Lbackward_align_loop:
	andi.	%r0, %r4, 15
	beq	.Lbackward_single_copy
	lbzu	%r0, -1(%r4)
	addi	%r5, %r5, -1
	stbu	%r0, -1(%r3)
	cmpdi	%r5, 0			/* ran out of bytes while aligning? */
	beq-	.Ldone
	b	.Lbackward_align_loop

.Lsingle_copy:
	/* forward copy */
	li	%r0, 1			/* 1-byte increment */
	li	%r8, 16			/* 16-byte increment */
	li	%r9, 0			/* 16-byte pre/post adjustment */
	b	.Lsingle_phase

.Lbackward_single_copy:
	/* backward copy */
	li	%r0, -1			/* 1-byte increment */
	li	%r8, -16		/* 16-byte increment */
	li	%r9, -15		/* 16-byte pre/post adjustment */
	/* point src and dst to last byte */
	addi	%r3, %r3, -1
	addi	%r4, %r4, -1

	/*
	 * Common tail, also used as phase 3 of the multi-phase copy.
	 * On entry: r5 = bytes left, r0 = 1-byte increment, r8 = 16-byte
	 * increment, r9 = pre/post adjustment; r3/r4 point at the next byte
	 * to copy (the last remaining byte when copying backward).
	 */
.Lsingle_phase:
	srdi.	%r6, %r5, 4		/* number of 16-bytes */
	beq	.Lsingle_1

	/*
	 * pre-adjustment: when copying backward, move from the last byte
	 * to the first byte of the 16-byte chunk.
	 */
	add	%r3, %r3, %r9
	add	%r4, %r4, %r9

	mtctr	%r6
	.align 5
.Lsingle_16_loop:
	ld	%r6, 0(%r4)
	ld	%r7, 8(%r4)
	add	%r4, %r4, %r8
	std	%r6, 0(%r3)
	std	%r7, 8(%r3)
	add	%r3, %r3, %r8
	bdnz	.Lsingle_16_loop

	/* post-adjustment: undo the pre-adjustment for the byte loop */
	sub	%r3, %r3, %r9
	sub	%r4, %r4, %r9

.Lsingle_1:
	andi.	%r6, %r5, 0x0f		/* number of 1-bytes */
	beq	.Ldone			/* 1-bytes == 0? done */

	mtctr	%r6
	.align 5
	/* Byte loop; CTR holds the byte count, r0 the increment (+1/-1). */
.Lsingle_1_loop:
	lbz	%r6, 0(%r4)
	add	%r4, %r4, %r0		/* increment */
	stb	%r6, 0(%r3)
	add	%r3, %r3, %r0		/* increment */
	bdnz	.Lsingle_1_loop

.Ldone:
#ifdef MEMMOVE
	ld	%r3, -8(%r1)		/* restore dst */
#endif
	blr


.Lmulti_phase:
	/* set up multi-phase copy parameters */

	/*
	 * r7 = bytes before the aligned section of the buffer.
	 * This is 16 - (src & 15), i.e. in the range 1..16 (16 when src is
	 * already 16-byte aligned).
	 */
	andi.	%r6, %r4, 15
	subfic	%r7, %r6, 16
	/* r8 = bytes in and after the aligned section of the buffer */
	sub	%r8, %r5, %r7
	/* r9 = bytes after the aligned section of the buffer */
	andi.	%r9, %r8, BLOCK_SIZE_MASK
	/* r10 = BLOCKS in the aligned section of the buffer */
	srdi	%r10, %r8, BLOCK_SIZE_BITS

	/* forward or backward copy? (unsigned address compare) */
	cmpld	%r4, %r3
	blt	.Lbackward_multi_copy

	/* set up forward copy parameters */
	std	%r7,  -32(%r1)		/* bytes to copy in phase 1 */
	std	%r10, -40(%r1)		/* BLOCKS to copy in phase 2 */
	std	%r9,  -48(%r1)		/* bytes to copy in phase 3 */

	li	%r0, 1			/* increment for phases 1 and 3 */
	li	%r5, BLOCK_SIZE		/* increment for phase 2 */

	/* op offsets for phase 2 */
	li	%r7,  0
	li	%r8,  16
	li	%r9,  32
	li	%r10, 48

	std	%r8, -16(%r1)		/* 16-byte increment (16) */
	std	%r7, -24(%r1)		/* 16-byte pre/post adjustment (0) */

	b	.Lphase1

.Lbackward_multi_copy:
	/*
	 * set up backward copy parameters: the copy starts at the end of
	 * the buffers, so the leading and trailing byte counts trade places.
	 */
	std	%r9,  -32(%r1)		/* bytes to copy in phase 1 */
	std	%r10, -40(%r1)		/* BLOCKS to copy in phase 2 */
	std	%r7,  -48(%r1)		/* bytes to copy in phase 3 */

	li	%r0, -1			/* increment for phases 1 and 3 */
	add	%r6, %r5, %r0		/* r6 = len - 1 */
	li	%r5, -BLOCK_SIZE	/* increment for phase 2 */
	/* advance src and dst to the last position */
	add	%r3, %r3, %r6
	add	%r4, %r4, %r6

	/* op offsets for phase 2 (relative to the last byte of the block) */
	li	%r7,  -15
	li	%r8,  -31
	li	%r9,  -47
	li	%r10, -63

	add	%r6, %r7, %r0		/* r6 = -16 */
	std	%r6, -16(%r1)		/* 16-byte increment (-16) */
	std	%r7, -24(%r1)		/* 16-byte pre/post adjustment (-15) */

	/* Phase 1: byte copy up to the 16-byte aligned section of src. */
.Lphase1:
	ld	%r6, -32(%r1)		/* bytes to copy in phase 1 */
	cmpldi	%r6, 0			/* r6 == 0? skip phase 1 */
	beq+	.Lphase2

	mtctr	%r6
	.align 5
.Lphase1_loop:
	lbz	%r6, 0(%r4)
	add	%r4, %r4, %r0		/* phase 1 increment */
	stb	%r6, 0(%r3)
	add	%r3, %r3, %r0		/* phase 1 increment */
	bdnz	.Lphase1_loop

	/*
	 * Phase 2: copy BLOCK_SIZE bytes per iteration.
	 * r6 = block count, r5 = block increment, r7-r10 = offsets of the
	 * four 16-byte chunks within a block.  A file including this one
	 * may supply its own loop through FN_PHASE2.
	 */
.Lphase2:
	ld	%r6, -40(%r1)		/* BLOCKS to copy in phase 2 */
	cmpldi	%r6, 0			/* %r6 == 0? skip phase 2 */
	beq	.Lphase3

#ifdef FN_PHASE2
FN_PHASE2
#else
	/* save registers */
	std	%r14, -56(%r1)
	std	%r15, -64(%r1)
	std	%r16, -72(%r1)
	std	%r17, -80(%r1)
	std	%r18, -88(%r1)
	std	%r19, -96(%r1)
	std	%r20, -104(%r1)
	std	%r21, -112(%r1)

	/* r18-r21 = offsets of the second doubleword of each 16-byte chunk */
	addi	%r18, %r7, 8
	addi	%r19, %r8, 8
	addi	%r20, %r9, 8
	addi	%r21, %r10, 8

	mtctr	%r6
	.align 5
.Lphase2_loop:
	/* first 32 bytes of the block */
	ldx	%r14, %r7,  %r4
	ldx	%r15, %r18, %r4
	ldx	%r16, %r8,  %r4
	ldx	%r17, %r19, %r4
	stdx	%r14, %r7,  %r3
	stdx	%r15, %r18, %r3
	stdx	%r16, %r8,  %r3
	stdx	%r17, %r19, %r3

	/* second 32 bytes of the block */
	ldx	%r14, %r9,  %r4
	ldx	%r15, %r20, %r4
	ldx	%r16, %r10, %r4
	ldx	%r17, %r21, %r4
	stdx	%r14, %r9,  %r3
	stdx	%r15, %r20, %r3
	stdx	%r16, %r10, %r3
	stdx	%r17, %r21, %r3

	add	%r4, %r4, %r5		/* phase 2 increment */
	add	%r3, %r3, %r5		/* phase 2 increment */

	bdnz	.Lphase2_loop

	/* restore registers */
	ld	%r14, -56(%r1)
	ld	%r15, -64(%r1)
	ld	%r16, -72(%r1)
	ld	%r17, -80(%r1)
	ld	%r18, -88(%r1)
	ld	%r19, -96(%r1)
	ld	%r20, -104(%r1)
	ld	%r21, -112(%r1)
#endif

.Lphase3:
	/* load registers for transitioning into the single-phase logic */
	ld	%r5, -48(%r1)		/* bytes to copy in phase 3 */
	ld	%r8, -16(%r1)		/* 16-byte increment */
	ld	%r9, -24(%r1)		/* 16-byte pre/post adjustment */
	b	.Lsingle_phase

END(FN_NAME)

/* Emit an empty .note.GNU-stack section: this object does not need an executable stack. */
	.section .note.GNU-stack,"",%progbits