/*-
 * Copyright (c) 2018 Instituto de Pesquisas Eldorado
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the author nor the names of its contributors may
 *    be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <machine/asm.h>
__FBSDID("$FreeBSD$");

#define BLOCK_SIZE_BITS			6
#define BLOCK_SIZE			(1 << BLOCK_SIZE_BITS)
#define BLOCK_SIZE_MASK			(BLOCK_SIZE - 1)

#define MULTI_PHASE_THRESHOLD		512

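/*
 * BLOCK_SIZE is 1 << 6 = 64 bytes, the amount moved per iteration of the
 * phase-2 loop below.  Copies of MULTI_PHASE_THRESHOLD (512) bytes or
 * more take the multi-phase path; shorter copies stay on the simpler
 * single-phase path.
 */
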
#ifndef FN_NAME
#ifdef MEMMOVE
#define FN_NAME				__memmove
WEAK_REFERENCE(__memmove, memmove);
#else
#define FN_NAME				__bcopy
WEAK_REFERENCE(__bcopy, bcopy);
#endif
#endif
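
/*
 * FN_NAME is overridable so this implementation can be reused under other
 * entry points: building with -DMEMMOVE yields memmove, and a wrapper
 * file may pre-define FN_NAME (and FN_PHASE2, below) before including
 * this one, e.g. to supply a vector-accelerated variant.  The consumers
 * of these hooks live outside this file.
 */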

/*
 * Arguments (in the ELF ABI integer argument registers):
 *   memmove(dst, src, len): r3 = dst, r4 = src, r5 = len
 *   bcopy(src, dst, len):   r3 = src, r4 = dst, r5 = len
 * The bcopy prologue swaps r3/r4, so the body always runs with
 * r3 = dst, r4 = src, r5 = len.
 */

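/*
 * Overall control flow, as a C-like sketch (illustrative only, not part
 * of the original source):
 *
 *	if (dst == src || len == 0)
 *		return;
 *	direction = (src < dst) ? backward : forward;
 *	if (len < MULTI_PHASE_THRESHOLD) {
 *		byte-copy until src is 16-byte aligned;	// .Lalign
 *		copy 16 bytes per iteration;		// .Lsingle_16_loop
 *		byte-copy the remaining len & 15;	// .Lsingle_1_loop
 *	} else {
 *		phase 1: byte-copy until src is 16-byte aligned;
 *		phase 2: copy 64-byte blocks;
 *		phase 3: finish the tail via the single-phase code;
 *	}
 */
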
ENTRY(FN_NAME)
	cmpld	%r3, %r4		/* src == dst? nothing to do */
	beqlr-
	cmpdi	%r5, 0			/* len == 0? nothing to do */
	beqlr-

#ifdef MEMMOVE
	std	%r3, -8(%r1)		/* save dst */
#else	/* bcopy: swap src/dst */
	mr	%r0, %r3
	mr	%r3, %r4
	mr	%r4, %r0
#endif
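
/*
 * Note: scratch values live at small negative offsets from %r1 throughout,
 * with no stack frame allocated.  This relies on the 288-byte red zone
 * below the stack pointer that the PowerPC64 ELF ABIs provide for leaf
 * functions; the deepest slot used here is -112(%r1).
 */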

	cmpldi	%r5, MULTI_PHASE_THRESHOLD
	bge	.Lmulti_phase

	/* align src to a 16-byte boundary */
	cmpd	%r4, %r3		/* forward or backward copy? */
	blt	.Lbackward_align

	.align 5
.Lalign:
	andi.	%r0, %r4, 15
	beq	.Lsingle_copy
	lbz	%r0, 0(%r4)
	addi	%r4, %r4, 1
	stb	%r0, 0(%r3)
	addi	%r3, %r3, 1
	addi	%r5, %r5, -1
	cmpdi	%r5, 0
	beq-	.Ldone
	b	.Lalign

.Lbackward_align:
	/* advance src and dst to end (past last byte) */
	add	%r3, %r3, %r5
	add	%r4, %r4, %r5
	.align 5
.Lbackward_align_loop:
	andi.	%r0, %r4, 15
	beq	.Lbackward_single_copy
	lbzu	%r0, -1(%r4)
	addi	%r5, %r5, -1
	stbu	%r0, -1(%r3)
	cmpdi	%r5, 0
	beq-	.Ldone
	b	.Lbackward_align_loop

.Lsingle_copy:
	/* forward copy */
	li	%r0, 1
	li	%r8, 16
	li	%r9, 0
	b	.Lsingle_phase

.Lbackward_single_copy:
	/* backward copy */
	li	%r0, -1
	li	%r8, -16
	li	%r9, -15
	/* point src and dst to last byte */
	addi	%r3, %r3, -1
	addi	%r4, %r4, -1
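
/*
 * At this point r0 is the byte-loop step (+/-1), r8 the 16-byte-loop step
 * (+/-16), and r9 the base bias for the 16-byte loop.  When copying
 * backward the cursors point at the last byte, so biasing them by -15
 * lets the loop address each 16-byte chunk with the same 0(%rN)/8(%rN)
 * offsets as the forward path.
 */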

.Lsingle_phase:
	srdi.	%r6, %r5, 4		/* number of 16-byte chunks */
	beq	.Lsingle_1

	/* pre-adjustment */
	add	%r3, %r3, %r9
	add	%r4, %r4, %r9

	mtctr	%r6
	.align 5
.Lsingle_16_loop:
	ld	%r6, 0(%r4)
	ld	%r7, 8(%r4)
	add	%r4, %r4, %r8
	std	%r6, 0(%r3)
	std	%r7, 8(%r3)
	add	%r3, %r3, %r8
	bdnz	.Lsingle_16_loop

	/* post-adjustment */
	sub	%r3, %r3, %r9
	sub	%r4, %r4, %r9

.Lsingle_1:
	andi.	%r6, %r5, 0x0f		/* residual bytes (len & 15) */
	beq	.Ldone			/* none? done */

	mtctr	%r6
	.align 5
.Lsingle_1_loop:
	lbz	%r6, 0(%r4)
	add	%r4, %r4, %r0		/* increment */
	stb	%r6, 0(%r3)
	add	%r3, %r3, %r0		/* increment */
	bdnz	.Lsingle_1_loop

.Ldone:
#ifdef MEMMOVE
	ld	%r3, -8(%r1)		/* restore dst */
#endif
	blr


.Lmulti_phase:
	/* set up multi-phase copy parameters */

	/* r7 = bytes before the aligned section of the buffer */
	andi.	%r6, %r4, 15
	subfic	%r7, %r6, 16
	/* r8 = bytes in and after the aligned section of the buffer */
	sub	%r8, %r5, %r7
	/* r9 = bytes after the aligned section of the buffer */
	andi.	%r9, %r8, BLOCK_SIZE_MASK
	/* r10 = BLOCKS in the aligned section of the buffer */
	srdi	%r10, %r8, BLOCK_SIZE_BITS
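
	/*
	 * Worked example with hypothetical values: len = 600 and
	 * src & 15 = 5.  Phase 1 copies r7 = 16 - 5 = 11 bytes to align
	 * src; r8 = 600 - 11 = 589 bytes remain, of which phase 2 copies
	 * r10 = 589 >> 6 = 9 blocks (576 bytes) and phase 3 copies the
	 * r9 = 589 & 63 = 13 leftover bytes.
	 */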

	/* forward or backward copy? */
	cmpd	%r4, %r3
	blt	.Lbackward_multi_copy

	/* set up forward copy parameters */
	std	%r7,  -32(%r1)		/* bytes to copy in phase 1 */
	std	%r10, -40(%r1)		/* BLOCKS to copy in phase 2 */
	std	%r9,  -48(%r1)		/* bytes to copy in phase 3 */

	li	%r0, 1			/* increment for phases 1 and 3 */
	li	%r5, BLOCK_SIZE		/* increment for phase 2 */

	/* op offsets for phase 2 */
	li	%r7,  0
	li	%r8,  16
	li	%r9,  32
	li	%r10, 48

	std	%r8, -16(%r1)		/* 16-byte increment (16) */
	std	%r7, -24(%r1)		/* 16-byte pre/post adjustment (0) */

	b	.Lphase1

.Lbackward_multi_copy:
	/* set up backward copy parameters */
	std	%r9,  -32(%r1)		/* bytes to copy in phase 1 */
	std	%r10, -40(%r1)		/* BLOCKS to copy in phase 2 */
	std	%r7,  -48(%r1)		/* bytes to copy in phase 3 */

	li	%r0, -1			/* increment for phases 1 and 3 */
	add	%r6, %r5, %r0		/* r6 = len - 1 */
	li	%r5, -BLOCK_SIZE	/* increment for phase 2 */
	/* advance src and dst to the last position */
	add	%r3, %r3, %r6
	add	%r4, %r4, %r6

	/* op offsets for phase 2 */
	li	%r7,  -15
	li	%r8,  -31
	li	%r9,  -47
	li	%r10, -63

	add	%r6, %r7, %r0		/* r6 = -16 */
	std	%r6, -16(%r1)		/* 16-byte increment (-16) */
	std	%r7, -24(%r1)		/* 16-byte pre/post adjustment (-15) */
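
	/*
	 * The -15/-31/-47/-63 offsets address the start of each of the
	 * four 16-byte chunks in the 64-byte block that ends at the
	 * cursor, mirroring the 0/16/32/48 offsets of the forward path;
	 * the +8 variants set up in phase 2 cover the odd doublewords.
	 */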

.Lphase1:
	ld	%r6, -32(%r1)		/* bytes to copy in phase 1 */
	cmpldi	%r6, 0			/* r6 == 0? skip phase 1 */
	beq+	.Lphase2

	mtctr	%r6
	.align 5
.Lphase1_loop:
	lbz	%r6, 0(%r4)
	add	%r4, %r4, %r0		/* phase 1 increment */
	stb	%r6, 0(%r3)
	add	%r3, %r3, %r0		/* phase 1 increment */
	bdnz	.Lphase1_loop

.Lphase2:
	ld	%r6, -40(%r1)		/* BLOCKS to copy in phase 2 */
	cmpldi	%r6, 0			/* r6 == 0? skip phase 2 */
	beq	.Lphase3

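/*
 * FN_PHASE2 is an override hook: a file that pre-defines it before
 * including this one substitutes its own 64-byte block-copy loop (for
 * instance a vector/VSX implementation) while keeping the alignment and
 * tail handling above and below.
 */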
#ifdef FN_PHASE2
FN_PHASE2
#else
	/* save registers */
	std	%r14, -56(%r1)
	std	%r15, -64(%r1)
	std	%r16, -72(%r1)
	std	%r17, -80(%r1)
	std	%r18, -88(%r1)
	std	%r19, -96(%r1)
	std	%r20, -104(%r1)
	std	%r21, -112(%r1)

	addi	%r18, %r7, 8
	addi	%r19, %r8, 8
	addi	%r20, %r9, 8
	addi	%r21, %r10, 8
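
	/*
	 * r18-r21 = r7-r10 + 8: together the eight offsets address all
	 * eight doublewords of a 64-byte block, so each iteration of the
	 * loop below moves one full block.
	 */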

	mtctr	%r6
	.align 5
.Lphase2_loop:
	ldx	%r14, %r7,  %r4
	ldx	%r15, %r18, %r4
	ldx	%r16, %r8,  %r4
	ldx	%r17, %r19, %r4
	stdx	%r14, %r7,  %r3
	stdx	%r15, %r18, %r3
	stdx	%r16, %r8,  %r3
	stdx	%r17, %r19, %r3

	ldx	%r14, %r9,  %r4
	ldx	%r15, %r20, %r4
	ldx	%r16, %r10, %r4
	ldx	%r17, %r21, %r4
	stdx	%r14, %r9,  %r3
	stdx	%r15, %r20, %r3
	stdx	%r16, %r10, %r3
	stdx	%r17, %r21, %r3

	add	%r4, %r4, %r5		/* phase 2 increment */
	add	%r3, %r3, %r5		/* phase 2 increment */

	bdnz	.Lphase2_loop

	/* restore registers */
	ld	%r14, -56(%r1)
	ld	%r15, -64(%r1)
	ld	%r16, -72(%r1)
	ld	%r17, -80(%r1)
	ld	%r18, -88(%r1)
	ld	%r19, -96(%r1)
	ld	%r20, -104(%r1)
	ld	%r21, -112(%r1)
#endif

.Lphase3:
	/* load registers for transitioning into the single-phase logic */
	ld	%r5, -48(%r1)		/* bytes to copy in phase 3 */
	ld	%r8, -16(%r1)		/* 16-byte increment */
	ld	%r9, -24(%r1)		/* 16-byte pre/post adjustment */
	b	.Lsingle_phase

END(FN_NAME)

	.section .note.GNU-stack,"",%progbits