/*-
 * Copyright (c) 2018 Instituto de Pesquisas Eldorado
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the author nor the names of its contributors may
 *    be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <machine/asm.h>
#define BLOCK_SIZE_BITS			6
#define BLOCK_SIZE			(1 << BLOCK_SIZE_BITS)
#define BLOCK_SIZE_MASK			(BLOCK_SIZE - 1)

/* Minimum 8-byte alignment, to avoid cache-inhibited alignment faults. */
#ifndef ALIGN_MASK
#define ALIGN_MASK			0x7
#endif

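/*
 * Copies shorter than MULTI_PHASE_THRESHOLD bytes take the single-phase
 * path only; the extra setup of the multi-phase path (parameter spills
 * and non-volatile register saves) does not pay off for short copies.
 */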
#define MULTI_PHASE_THRESHOLD		512

#ifndef FN_NAME
#ifdef MEMMOVE
#define FN_NAME				__memmove
WEAK_REFERENCE(__memmove, memmove);
#else
#define FN_NAME				__bcopy
WEAK_REFERENCE(__bcopy, bcopy);
#endif
#endif
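/*
 * FN_NAME (and FN_PHASE2, below) may be predefined by a wrapper that
 * includes this file, e.g. an ISA-specific variant supplying its own
 * phase 2 block-copy loop; otherwise the plain bcopy/memmove entry
 * points above are used.
 */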

/*
 * r3: dst
 * r4: src
 * r5: len
 */

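/*-
 * Copy strategy used below:
 *  - If src and dst are not mutually aligned (different offsets modulo
 *    ALIGN_MASK + 1), copy one byte at a time in the safe direction.
 *  - If len < MULTI_PHASE_THRESHOLD, take the single-phase path:
 *    byte-copy until src is 16-byte aligned, move 16 bytes per
 *    iteration, then finish the residual bytes.
 *  - Otherwise take the multi-phase path: phase 1 byte-aligns src to
 *    16 bytes, phase 2 moves BLOCK_SIZE-byte blocks, and phase 3 hands
 *    the tail back to the single-phase code.
 * The direction is chosen from the pointer order: forward when
 * src > dst, backward when src < dst, so overlapping buffers are
 * copied correctly.
 */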
ENTRY(FN_NAME)
	cmpld	%r3, %r4		/* src == dst? nothing to do */
	beqlr-
	cmpdi	%r5, 0			/* len == 0? nothing to do */
	beqlr-

#ifdef MEMMOVE
	std	%r3, -8(%r1)		/* save dst */
#else	/* bcopy: swap src/dst */
	mr	%r0, %r3
	mr	%r3, %r4
	mr	%r4, %r0
#endif
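	/*
	 * bcopy() takes (src, dst, len) while memmove() takes
	 * (dst, src, len), hence the swap above; the common code below
	 * expects dst in %r3 and src in %r4.  memmove() must also return
	 * the original dst, which is saved above and reloaded at .Ldone.
	 */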

	/*
	 * Check relative alignment: if src and dst are not mutually
	 * aligned (different offsets modulo ALIGN_MASK + 1), copy one
	 * byte at a time.
	 */
	andi.	%r8, %r3, ALIGN_MASK
	andi.	%r7, %r4, ALIGN_MASK
	cmpd	%r7, %r8
	bne	.Lunaligned

	cmpldi	%r5, MULTI_PHASE_THRESHOLD
	bge	.Lmulti_phase
	b	.Lfast_copy

.Lunaligned:
	/* forward or backward copy? */
	cmpd	%r4, %r3
	blt	.Lbackward_unaligned

	/* Just need to set up the increment and jump to the copy loop */
	li	%r0, 1
	mtctr	%r5
	b	.Lsingle_1_loop

.Lbackward_unaligned:
	/* advance src and dst to the last byte, set decrement, and jump to copy */
	add	%r3, %r3, %r5
	addi	%r3, %r3, -1
	add	%r4, %r4, %r5
	addi	%r4, %r4, -1
	li	%r0, -1
	mtctr	%r5
	b	.Lsingle_1_loop

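/*
 * Single-phase copy: byte-copy until src is 16-byte aligned, then move
 * 16 bytes per iteration with paired ld/std, leaving at most 15
 * residual bytes for the final byte loop.
 */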
.Lfast_copy:
	/* align src */
	cmpd	%r4, %r3		/* forward or backward copy? */
	blt	.Lbackward_align

	.align 5
.Lalign:
	andi.	%r0, %r4, 15
	beq	.Lsingle_copy
	lbz	%r0, 0(%r4)
	addi	%r4, %r4, 1
	stb	%r0, 0(%r3)
	addi	%r3, %r3, 1
	addi	%r5, %r5, -1
	cmpdi	%r5, 0
	beq-	.Ldone
	b	.Lalign

.Lbackward_align:
	/* advance src and dst to end (past last byte) */
	add	%r3, %r3, %r5
	add	%r4, %r4, %r5
	.align 5
.Lbackward_align_loop:
	andi.	%r0, %r4, 15
	beq	.Lbackward_single_copy
	lbzu	%r0, -1(%r4)
	addi	%r5, %r5, -1
	stbu	%r0, -1(%r3)
	cmpdi	%r5, 0
	beq-	.Ldone
	b	.Lbackward_align_loop

.Lsingle_copy:
	/* forward copy */
	li	%r0, 1
	li	%r8, 16
	li	%r9, 0
	b	.Lsingle_phase

.Lbackward_single_copy:
	/* backward copy */
	li	%r0, -1
	li	%r8, -16
	li	%r9, -15
	/* point src and dst to last byte */
	addi	%r3, %r3, -1
	addi	%r4, %r4, -1

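/*-
 * Single-phase parameters at this point:
 *  r0: byte-loop increment (+1 forward, -1 backward)
 *  r8: 16-byte-loop increment (+16 forward, -16 backward)
 *  r9: pre/post adjustment (0 forward, -15 backward), so that the
 *      0(%r4)/8(%r4) offsets in the 16-byte loop address the correct
 *      16 bytes in either direction
 */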
.Lsingle_phase:
	srdi.	%r6, %r5, 4		/* number of 16-byte chunks */
	beq	.Lsingle_1

	/* pre-adjustment */
	add	%r3, %r3, %r9
	add	%r4, %r4, %r9

	mtctr	%r6
	.align 5
.Lsingle_16_loop:
	ld	%r6, 0(%r4)
	ld	%r7, 8(%r4)
	add	%r4, %r4, %r8
	std	%r6, 0(%r3)
	std	%r7, 8(%r3)
	add	%r3, %r3, %r8
	bdnz	.Lsingle_16_loop

	/* post-adjustment */
	sub	%r3, %r3, %r9
	sub	%r4, %r4, %r9

.Lsingle_1:
	andi.	%r6, %r5, 0x0f		/* residual byte count */
	beq	.Ldone			/* no residual bytes? done */

	mtctr	%r6
	.align 5
.Lsingle_1_loop:
	lbz	%r6, 0(%r4)
	add	%r4, %r4, %r0		/* increment */
	stb	%r6, 0(%r3)
	add	%r3, %r3, %r0		/* increment */
	bdnz	.Lsingle_1_loop

.Ldone:
#ifdef MEMMOVE
	ld	%r3, -8(%r1)		/* restore dst */
#endif
	blr

.Lmulti_phase:
	/* set up multi-phase copy parameters */

	/* r7 = bytes before the aligned section of the buffer */
	andi.	%r6, %r4, 15
	subfic	%r7, %r6, 16
	/* r8 = bytes in and after the aligned section of the buffer */
	sub	%r8, %r5, %r7
	/* r9 = bytes after the aligned section of the buffer */
	andi.	%r9, %r8, BLOCK_SIZE_MASK
	/* r10 = BLOCKS in the aligned section of the buffer */
	srdi	%r10, %r8, BLOCK_SIZE_BITS

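	/*
	 * Worked example for a forward copy with len = 600 and
	 * src % 16 == 5: phase 1 copies r7 = 11 bytes to align src,
	 * leaving r8 = 589; phase 2 copies r10 = 589 >> 6 = 9 blocks
	 * (576 bytes); phase 3 copies the remaining r9 = 589 & 63 = 13
	 * bytes.
	 */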
	/* forward or backward copy? */
	cmpd	%r4, %r3
	blt	.Lbackward_multi_copy

	/* set up forward copy parameters */
	std	%r7,  -32(%r1)		/* bytes to copy in phase 1 */
	std	%r10, -40(%r1)		/* BLOCKS to copy in phase 2 */
	std	%r9,  -48(%r1)		/* bytes to copy in phase 3 */

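	/*
	 * The spill slots above (and the -8..-112(%r1) slots used
	 * elsewhere in this function) live below the stack pointer
	 * without allocating a frame, relying on the 288-byte red zone
	 * the ELFv2 ABI guarantees below %r1 for leaf functions.
	 */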
	li	%r0, 1			/* increment for phases 1 and 3 */
	li	%r5, BLOCK_SIZE		/* increment for phase 2 */

	/* op offsets for phase 2 */
	li	%r7,  0
	li	%r8,  16
	li	%r9,  32
	li	%r10, 48

	std	%r8, -16(%r1)		/* 16-byte increment (16) */
	std	%r7, -24(%r1)		/* 16-byte pre/post adjustment (0) */

	b	.Lphase1

.Lbackward_multi_copy:
	/* set up backward copy parameters */
	std	%r9,  -32(%r1)		/* bytes to copy in phase 1 */
	std	%r10, -40(%r1)		/* BLOCKS to copy in phase 2 */
	std	%r7,  -48(%r1)		/* bytes to copy in phase 3 */

	li	%r0, -1			/* increment for phases 1 and 3 */
	add	%r6, %r5, %r0		/* r6 = len - 1 */
	li	%r5, -BLOCK_SIZE	/* increment for phase 2 */
	/* advance src and dst to the last position */
	add	%r3, %r3, %r6
	add	%r4, %r4, %r6

	/* op offsets for phase 2 */
	li	%r7,  -15
	li	%r8,  -31
	li	%r9,  -47
	li	%r10, -63

	add	%r6, %r7, %r0		/* r6 = -16 */
	std	%r6, -16(%r1)		/* 16-byte increment (-16) */
	std	%r7, -24(%r1)		/* 16-byte pre/post adjustment (-15) */

.Lphase1:
	ld	%r6, -32(%r1)		/* bytes to copy in phase 1 */
	cmpldi	%r6, 0			/* r6 == 0? skip phase 1 */
	beq+	.Lphase2

	mtctr	%r6
	.align 5
.Lphase1_loop:
	lbz	%r6, 0(%r4)
	add	%r4, %r4, %r0		/* phase 1 increment */
	stb	%r6, 0(%r3)
	add	%r3, %r3, %r0		/* phase 1 increment */
	bdnz	.Lphase1_loop

.Lphase2:
	ld	%r6, -40(%r1)		/* BLOCKS to copy in phase 2 */
	cmpldi	%r6, 0			/* %r6 == 0? skip phase 2 */
	beq	.Lphase3

#ifdef FN_PHASE2
FN_PHASE2
#else
	/* save registers */
	std	%r14, -56(%r1)
	std	%r15, -64(%r1)
	std	%r16, -72(%r1)
	std	%r17, -80(%r1)
	std	%r18, -88(%r1)
	std	%r19, -96(%r1)
	std	%r20, -104(%r1)
	std	%r21, -112(%r1)

	addi	%r18, %r7, 8
	addi	%r19, %r8, 8
	addi	%r20, %r9, 8
	addi	%r21, %r10, 8

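	/*
	 * r7-r10 and r18-r21 together cover all eight doublewords of a
	 * BLOCK_SIZE-byte (64-byte) block, so each iteration issues
	 * independent ldx/stdx pairs and updates src/dst only once per
	 * block.
	 */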
	mtctr	%r6
	.align 5
.Lphase2_loop:
	ldx	%r14, %r7,  %r4
	ldx	%r15, %r18, %r4
	ldx	%r16, %r8,  %r4
	ldx	%r17, %r19, %r4
	stdx	%r14, %r7,  %r3
	stdx	%r15, %r18, %r3
	stdx	%r16, %r8,  %r3
	stdx	%r17, %r19, %r3

	ldx	%r14, %r9,  %r4
	ldx	%r15, %r20, %r4
	ldx	%r16, %r10, %r4
	ldx	%r17, %r21, %r4
	stdx	%r14, %r9,  %r3
	stdx	%r15, %r20, %r3
	stdx	%r16, %r10, %r3
	stdx	%r17, %r21, %r3

	add	%r4, %r4, %r5		/* phase 2 increment */
	add	%r3, %r3, %r5		/* phase 2 increment */

	bdnz	.Lphase2_loop

	/* restore registers */
	ld	%r14, -56(%r1)
	ld	%r15, -64(%r1)
	ld	%r16, -72(%r1)
	ld	%r17, -80(%r1)
	ld	%r18, -88(%r1)
	ld	%r19, -96(%r1)
	ld	%r20, -104(%r1)
	ld	%r21, -112(%r1)
#endif

.Lphase3:
	/* load registers for transitioning into the single-phase logic */
	ld	%r5, -48(%r1)		/* bytes to copy in phase 3 */
	ld	%r8, -16(%r1)		/* 16-byte increment */
	ld	%r9, -24(%r1)		/* 16-byte pre/post adjustment */
	b	.Lsingle_phase

END(FN_NAME)

	.section .note.GNU-stack,"",%progbits