xref: /freebsd/lib/libc/powerpc64/string/bcopy.S (revision 1d386b48a555f61cb7325543adbbb5c3f3407a66)
/*-
 * Copyright (c) 2018 Instituto de Pesquisas Eldorado
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the author nor the names of its contributors may
 *    be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <machine/asm.h>

#define BLOCK_SIZE_BITS			6
#define BLOCK_SIZE			(1 << BLOCK_SIZE_BITS)
#define BLOCK_SIZE_MASK			(BLOCK_SIZE - 1)

/* Minimum 8 byte alignment, to avoid cache-inhibited alignment faults. */
#ifndef ALIGN_MASK
#define ALIGN_MASK			0x7
#endif

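/*
 * Copies between relatively aligned buffers of at least
 * MULTI_PHASE_THRESHOLD bytes take the three-phase block-copy path
 * (.Lmulti_phase); shorter ones take the single-phase path (.Lfast_copy).
 */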
#define MULTI_PHASE_THRESHOLD		512

#ifndef FN_NAME
#ifdef MEMMOVE
#define FN_NAME				__memmove
WEAK_REFERENCE(__memmove, memmove);
#else
#define FN_NAME				__bcopy
WEAK_REFERENCE(__bcopy, bcopy);
#endif
#endif

/*
 * r3: dst
 * r4: src
 * r5: len
 */

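/*
 * For reference, a sketch of the C-level entry points this file provides
 * (standard prototypes; the copy itself is the assembly below):
 *
 *	void	*memmove(void *dst, const void *src, size_t len);
 *	void	 bcopy(const void *src, void *dst, size_t len);
 *
 * memmove() must return dst, so dst is saved on entry and reloaded at
 * .Ldone; bcopy() takes (src, dst), so r3/r4 are swapped on entry.
 */
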
ENTRY(FN_NAME)
	cmpld	%r3, %r4		/* src == dst? nothing to do */
	beqlr-
	cmpdi	%r5, 0			/* len == 0? nothing to do */
	beqlr-

#ifdef MEMMOVE
	std	%r3, -8(%r1)		/* save dst */
#else	/* bcopy: swap src/dst */
	mr	%r0, %r3
	mr	%r3, %r4
	mr	%r4, %r0
#endif

	/* First check for relative alignment; if unaligned, copy one byte at a time. */
	andi.	%r8, %r3, ALIGN_MASK
	andi.	%r7, %r4, ALIGN_MASK
	cmpd	%r7, %r8
	bne	.Lunaligned

	cmpldi	%r5, MULTI_PHASE_THRESHOLD
	bge	.Lmulti_phase
	b	.Lfast_copy

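/*
 * src and dst are not mutually 8-byte aligned: copy the whole buffer one
 * byte at a time, in the direction that is safe for overlapping regions
 * (forward when dst < src, backward otherwise).
 */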
.Lunaligned:
	/* forward or backward copy? */
	cmpd	%r4, %r3
	blt	.Lbackward_unaligned

	/* Just need to set up the increment and jump to the copy loop */
	li	%r0, 1
	mtctr	%r5
	b	.Lsingle_1_loop

.Lbackward_unaligned:
	/* advance src and dst to the last byte, set the decrement and jump to the copy loop */
	add	%r3, %r3, %r5
	addi	%r3, %r3, -1
	add	%r4, %r4, %r5
	addi	%r4, %r4, -1
	li	%r0, -1
	mtctr	%r5
	b	.Lsingle_1_loop

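/*
 * Single-phase copy (len < MULTI_PHASE_THRESHOLD): align src to a 16-byte
 * boundary one byte at a time, copy 16-byte chunks with paired doubleword
 * loads/stores, then finish the tail one byte at a time.
 */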
.Lfast_copy:
	/* align src */
	cmpd	%r4, %r3		/* forward or backward copy? */
	blt	.Lbackward_align

	.align 5
.Lalign:
	andi.	%r0, %r4, 15
	beq	.Lsingle_copy
	lbz	%r0, 0(%r4)
	addi	%r4, %r4, 1
	stb	%r0, 0(%r3)
	addi	%r3, %r3, 1
	addi	%r5, %r5, -1
	cmpdi	%r5, 0
	beq-	.Ldone
	b	.Lalign

.Lbackward_align:
	/* advance src and dst to end (past last byte) */
	add	%r3, %r3, %r5
	add	%r4, %r4, %r5
	.align 5
.Lbackward_align_loop:
	andi.	%r0, %r4, 15
	beq	.Lbackward_single_copy
	lbzu	%r0, -1(%r4)
	addi	%r5, %r5, -1
	stbu	%r0, -1(%r3)
	cmpdi	%r5, 0
	beq-	.Ldone
	b	.Lbackward_align_loop

.Lsingle_copy:
	/* forward copy */
	li	%r0, 1
	li	%r8, 16
	li	%r9, 0
	b	.Lsingle_phase

.Lbackward_single_copy:
	/* backward copy */
	li	%r0, -1
	li	%r8, -16
	li	%r9, -15
	/* point src and dst to last byte */
	addi	%r3, %r3, -1
	addi	%r4, %r4, -1

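/*
 * Common 16-byte/1-byte copy code: %r0 holds the per-byte step (+1/-1),
 * %r8 the per-chunk step (+16/-16) and %r9 a pre/post adjustment so that
 * the 0(%rX)/8(%rX) offsets in the 16-byte loop address each chunk
 * correctly in both directions.
 */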
.Lsingle_phase:
	srdi.	%r6, %r5, 4		/* number of 16-byte chunks */
	beq	.Lsingle_1

	/* pre-adjustment */
	add	%r3, %r3, %r9
	add	%r4, %r4, %r9

	mtctr	%r6
	.align 5
.Lsingle_16_loop:
	ld	%r6, 0(%r4)
	ld	%r7, 8(%r4)
	add	%r4, %r4, %r8
	std	%r6, 0(%r3)
	std	%r7, 8(%r3)
	add	%r3, %r3, %r8
	bdnz	.Lsingle_16_loop

	/* post-adjustment */
	sub	%r3, %r3, %r9
	sub	%r4, %r4, %r9

.Lsingle_1:
	andi.	%r6, %r5, 0x0f		/* number of trailing bytes */
	beq	.Ldone			/* no trailing bytes? done */

	mtctr	%r6
	.align 5
.Lsingle_1_loop:
	lbz	%r6, 0(%r4)
	add	%r4, %r4, %r0		/* increment */
	stb	%r6, 0(%r3)
	add	%r3, %r3, %r0		/* increment */
	bdnz	.Lsingle_1_loop

.Ldone:
#ifdef MEMMOVE
	ld	%r3, -8(%r1)		/* restore dst */
#endif
	blr

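/*
 * Multi-phase copy (len >= MULTI_PHASE_THRESHOLD), done in three phases:
 *  1. copy the leading (forward) or trailing (backward) odd bytes one at a
 *     time so that phase 2 works on 16-byte aligned source addresses;
 *  2. copy BLOCK_SIZE (64-byte) blocks;
 *  3. copy the remaining (< BLOCK_SIZE) bytes through the single-phase code.
 * The per-direction byte counts, increments and adjustments are staged in
 * the scratch area below the stack pointer so the same phase code serves
 * both forward and backward copies.
 */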
.Lmulti_phase:
	/* set up multi-phase copy parameters */

	/* r7 = bytes before the aligned section of the buffer */
	andi.	%r6, %r4, 15
	subfic	%r7, %r6, 16
	/* r8 = bytes in and after the aligned section of the buffer */
	sub	%r8, %r5, %r7
	/* r9 = bytes after the aligned section of the buffer */
	andi.	%r9, %r8, BLOCK_SIZE_MASK
	/* r10 = BLOCKS in the aligned section of the buffer */
	srdi	%r10, %r8, BLOCK_SIZE_BITS

	/* forward or backward copy? */
	cmpd	%r4, %r3
	blt	.Lbackward_multi_copy

	/* set up forward copy parameters */
	std	%r7,  -32(%r1)		/* bytes to copy in phase 1 */
	std	%r10, -40(%r1)		/* BLOCKS to copy in phase 2 */
	std	%r9,  -48(%r1)		/* bytes to copy in phase 3 */

	li	%r0, 1			/* increment for phases 1 and 3 */
	li	%r5, BLOCK_SIZE		/* increment for phase 2 */

	/* op offsets for phase 2 */
	li	%r7,  0
	li	%r8,  16
	li	%r9,  32
	li	%r10, 48

	std	%r8, -16(%r1)		/* 16-byte increment (16) */
	std	%r7, -24(%r1)		/* 16-byte pre/post adjustment (0) */

	b	.Lphase1

.Lbackward_multi_copy:
	/* set up backward copy parameters */
	std	%r9,  -32(%r1)		/* bytes to copy in phase 1 */
	std	%r10, -40(%r1)		/* BLOCKS to copy in phase 2 */
	std	%r7,  -48(%r1)		/* bytes to copy in phase 3 */

	li	%r0, -1			/* increment for phases 1 and 3 */
	add	%r6, %r5, %r0		/* r6 = len - 1 */
	li	%r5, -BLOCK_SIZE	/* increment for phase 2 */
	/* advance src and dst to the last position */
	add	%r3, %r3, %r6
	add	%r4, %r4, %r6

	/* op offsets for phase 2 */
	li	%r7,  -15
	li	%r8,  -31
	li	%r9,  -47
	li	%r10, -63

	add	%r6, %r7, %r0		/* r6 = -16 */
	std	%r6, -16(%r1)		/* 16-byte increment (-16) */
	std	%r7, -24(%r1)		/* 16-byte pre/post adjustment (-15) */

.Lphase1:
	ld	%r6, -32(%r1)		/* bytes to copy in phase 1 */
	cmpldi	%r6, 0			/* r6 == 0? skip phase 1 */
	beq+	.Lphase2

	mtctr	%r6
	.align 5
.Lphase1_loop:
	lbz	%r6, 0(%r4)
	add	%r4, %r4, %r0		/* phase 1 increment */
	stb	%r6, 0(%r3)
	add	%r3, %r3, %r0		/* phase 1 increment */
	bdnz	.Lphase1_loop

.Lphase2:
	ld	%r6, -40(%r1)		/* BLOCKS to copy in phase 2 */
	cmpldi	%r6, 0			/* %r6 == 0? skip phase 2 */
	beq	.Lphase3

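/*
 * FN_PHASE2 lets an including source file substitute its own phase 2
 * block-copy loop (e.g. a vector/VSX implementation) while reusing the
 * rest of this code; the default below moves one 64-byte block per
 * iteration using eight 8-byte loads and stores.
 */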
#ifdef FN_PHASE2
FN_PHASE2
#else
	/* save registers */
	std	%r14, -56(%r1)
	std	%r15, -64(%r1)
	std	%r16, -72(%r1)
	std	%r17, -80(%r1)
	std	%r18, -88(%r1)
	std	%r19, -96(%r1)
	std	%r20, -104(%r1)
	std	%r21, -112(%r1)

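	/*
	 * r18-r21 are the +8 companions of the r7-r10 offsets, so the eight
	 * ldx/stdx pairs below cover every doubleword of a 64-byte block.
	 */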
	addi	%r18, %r7, 8
	addi	%r19, %r8, 8
	addi	%r20, %r9, 8
	addi	%r21, %r10, 8

	mtctr	%r6
	.align 5
.Lphase2_loop:
	ldx	%r14, %r7,  %r4
	ldx	%r15, %r18, %r4
	ldx	%r16, %r8,  %r4
	ldx	%r17, %r19, %r4
	stdx	%r14, %r7,  %r3
	stdx	%r15, %r18, %r3
	stdx	%r16, %r8,  %r3
	stdx	%r17, %r19, %r3

	ldx	%r14, %r9,  %r4
	ldx	%r15, %r20, %r4
	ldx	%r16, %r10, %r4
	ldx	%r17, %r21, %r4
	stdx	%r14, %r9,  %r3
	stdx	%r15, %r20, %r3
	stdx	%r16, %r10, %r3
	stdx	%r17, %r21, %r3

	add	%r4, %r4, %r5		/* phase 2 increment */
	add	%r3, %r3, %r5		/* phase 2 increment */

	bdnz	.Lphase2_loop

	/* restore registers */
	ld	%r14, -56(%r1)
	ld	%r15, -64(%r1)
	ld	%r16, -72(%r1)
	ld	%r17, -80(%r1)
	ld	%r18, -88(%r1)
	ld	%r19, -96(%r1)
	ld	%r20, -104(%r1)
	ld	%r21, -112(%r1)
#endif

.Lphase3:
	/* load registers for transitioning into the single-phase logic */
	ld	%r5, -48(%r1)		/* bytes to copy in phase 3 */
	ld	%r8, -16(%r1)		/* 16-byte increment */
	ld	%r9, -24(%r1)		/* 16-byte pre/post adjustment */
	b	.Lsingle_phase

END(FN_NAME)

	.section .note.GNU-stack,"",%progbits