xref: /linux/arch/powerpc/lib/memcpy_64.S (revision 00a6d7b6762c27d441e9ac8faff36384bc0fc180)
1/*
2 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
9#include <asm/processor.h>
10#include <asm/ppc_asm.h>
11
12	.align	7
/*
 * memcpy - copy r5 bytes from the buffer at r4 to the buffer at r3.
 *
 * Register use on entry, as read by the code below:
 *   r3 = destination pointer (saved at 48(r1) and reloaded into r3
 *        before every blr in this routine)
 *   r4 = source pointer
 *   r5 = byte count
 *
 * This entry sequence classifies the request:
 *   - count < 16                      -> .Lshort_copy
 *   - destination not 8-byte aligned  -> .Ldst_unaligned (unless the
 *                                        bne is patched to a nop)
 *   - otherwise fall through into .Ldst_aligned
 *
 * Condition register fields set up here and consumed later:
 *   cr7 = low four bits of the count, so cr7*4+0 .. cr7*4+3 test the
 *         8, 4, 2 and 1 bits of the length
 *   cr1 = unsigned compare of the count against 16
 *   cr0 = result of (-r3 & 7), i.e. "eq" when the destination is already
 *         8-byte aligned
 *
 * NOTE(review): 48(r1) is a positive offset from the stack pointer, so
 * the slot is not allocated by this routine; presumably the ABI in use
 * lets the callee write there -- confirm for every ABI this is built for.
 */
13_GLOBAL(memcpy)
/*
 * Feature-patched prologue.  Going by the macro names, the std is used
 * when CPU_FTR_VMX_COPY is clear and the branch to memcpy_power7 is used
 * otherwise; the branch is compiled out when SELFTEST is defined.
 */
14BEGIN_FTR_SECTION
15	std	r3,48(r1)	/* save destination pointer for return value */
16FTR_SECTION_ELSE
17#ifndef SELFTEST
18	b	memcpy_power7
19#endif
20ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
/* cr7 <- low four bits of the byte count (tested with bf/bt below). */
21	PPC_MTOCRF(0x01,r5)
/* cr1 <- count compared (unsigned) with 16. */
22	cmpldi	cr1,r5,16
23	neg	r6,r3		# LS 3 bits = # bytes to 8-byte dest bdry
/* r6 = 0..7 bytes needed to align the destination; also sets cr0. */
24	andi.	r6,r6,7
/* Cache touch hint for the first source block. */
25	dcbt	0,r4
/* Fewer than 16 bytes: use the simple bit-by-bit copy. */
26	blt	cr1,.Lshort_copy
27/* Below we want to nop out the bne if we're on a CPU that has the
28   CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
29   cleared.
30   At the time of writing the only CPU that has this combination of bits
31   set is Power6. */
/*
 * The bne tests cr0 from the andi. above: it is taken when r6 != 0,
 * i.e. when the destination is not 8-byte aligned.
 */
32BEGIN_FTR_SECTION
33	nop
34FTR_SECTION_ELSE
35	bne	.Ldst_unaligned
36ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
37                    CPU_FTR_UNALIGNED_LD_STD)
/*
 * .Ldst_aligned - doubleword copy.
 *
 * Reached by falling through from the entry code or by the branch at the
 * end of .Ldst_unaligned.  State expected here, as set by those paths:
 *   r3 = destination, r4 = source, r5 = remaining byte count
 *   cr7 = low four bits of r5, cr1 = r5 compared with 16
 *
 * The loop moves 16 bytes per iteration with loads running one
 * doubleword ahead of the stores.  When the 8 bit of the length is set
 * there is an odd number of doublewords, handled by entering the loop at
 * label 1 instead of label 2.  The remaining 0..7 bytes are copied by
 * .Ldo_tail, and r3 is reloaded from 48(r1) before returning.
 */
38.Ldst_aligned:
/* Bias r3 by -16 so the loop can store with 8(r3) and stdu ..,16(r3). */
39	addi	r3,r3,-16
/*
 * Going by the macro name, this section is active when
 * CPU_FTR_UNALIGNED_LD_STD is clear: r0 = source address & 7, and a
 * non-zero value diverts to the shift-and-merge copy.
 */
40BEGIN_FTR_SECTION
41	andi.	r0,r4,7
42	bne	.Lsrc_unaligned
43END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
/* r7 = number of 16-byte iterations, moved into CTR below. */
44	srdi	r7,r5,4
/* Preload the first source doubleword. */
45	ld	r9,0(r4)
/* Bias r4 by -8 so the loop can use 8(r4) and ldu ..,16(r4). */
46	addi	r4,r4,-8
47	mtctr	r7
/* r5 = trailing byte count (0..7); cr0 "eq" when there is no tail. */
48	andi.	r5,r5,7
/* 8 bit of the length clear: even number of doublewords, enter at 2. */
49	bf	cr7*4+0,2f
/*
 * Odd number of doublewords: shift both pointers forward by 8 and treat
 * the preloaded doubleword as the "second" of a pair (held in r8).
 */
50	addi	r3,r3,8
51	addi	r4,r4,8
52	mr	r8,r9
/* Fewer than 16 bytes left (cr1): only that one doubleword to store. */
53	blt	cr1,3f
541:	ld	r9,8(r4)
55	std	r8,8(r3)
562:	ldu	r8,16(r4)
57	stdu	r9,16(r3)
58	bdnz	1b
/* Store the last doubleword that was loaded but not yet written. */
593:	std	r8,8(r3)
/* cr0 still comes from the andi. above: no tail bytes, so return. */
60	beq	3f
/* Point r3 at the first destination byte not yet written. */
61	addi	r3,r3,16
/*
 * Tail: copy 4, 2 and 1 bytes according to cr7 bits 1, 2 and 3.  The
 * loads use 8(r4) because r4 is still 8 bytes behind the next unread
 * source byte.
 */
62.Ldo_tail:
63	bf	cr7*4+1,1f
64	lwz	r9,8(r4)
65	addi	r4,r4,4
66	stw	r9,0(r3)
67	addi	r3,r3,4
681:	bf	cr7*4+2,2f
69	lhz	r9,8(r4)
70	addi	r4,r4,2
71	sth	r9,0(r3)
72	addi	r3,r3,2
732:	bf	cr7*4+3,3f
74	lbz	r9,8(r4)
75	stb	r9,0(r3)
763:	ld	r3,48(r1)	/* return dest pointer */
77	blr
78
/*
 * .Lsrc_unaligned - copy to an 8-byte aligned destination from a source
 * that is not 8-byte aligned, using only aligned doubleword loads.
 *
 * The only branch to this label in the visible code is in .Ldst_aligned,
 * which leaves:
 *   r0 = source address & 7 (non-zero)
 *   r3 = destination - 16, r4 = source, r5 = remaining byte count
 *   cr7 = low four bits of r5
 *
 * Each destination doubleword is built from two neighbouring aligned
 * source doublewords s[i] and s[i+1] as
 *   (s[i] << r10) | (s[i+1] >> r11),  r10 = 8 * r0,  r11 = 64 - r10.
 * The main loop (labels 1/2) produces two destination doublewords per
 * iteration.  Two entry sequences prime the pipeline depending on the
 * 8 bit of the length (cr7*4+0), and cr6 (doubleword count compared with
 * 3) selects short-circuit exits for very short lengths.
 *
 * NOTE(review): combining "earlier << n" with "later >> (64-n)" appears
 * to assume big-endian byte order -- confirm this file is only built for
 * big-endian targets.
 */
79.Lsrc_unaligned:
/* r6 = number of whole doublewords to store. */
80	srdi	r6,r5,3
81	addi	r5,r5,-16
/* Round the source pointer down to an 8-byte boundary. */
82	subf	r4,r0,r4
/* r7 = (count - 16) / 16 = main loop iterations, moved into CTR below. */
83	srdi	r7,r5,4
/* r10 = left-shift amount in bits. */
84	sldi	r10,r0,3
85	cmpdi	cr6,r6,3
/* r5 = trailing byte count (0..7); cr0 "eq" when there is no tail. */
86	andi.	r5,r5,7
87	mtctr	r7
/* r11 = complementary right-shift amount in bits. */
88	subfic	r11,r10,64
/* r5 = tail bytes + source misalignment; compared with 8 further down. */
89	add	r5,r5,r0
90
/* 8 bit of the length set: odd number of doublewords, prime via 0. */
91	bt	cr7*4+0,0f
92
93	ld	r9,0(r4)	# 3+2n loads, 2+2n stores
94	ld	r0,8(r4)
95	sld	r6,r9,r10
96	ldu	r9,16(r4)
97	srd	r7,r0,r11
98	sld	r8,r0,r10
99	or	r7,r7,r6
/* Fewer than 3 doublewords in total: skip the loop. */
100	blt	cr6,4f
101	ld	r0,8(r4)
102	# s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
103	b	2f
104
1050:	ld	r0,0(r4)	# 4+2n loads, 3+2n stores
106	ldu	r9,8(r4)
107	sld	r8,r0,r10
/* Extra -8 bias on r3 for the odd leading doubleword. */
108	addi	r3,r3,-8
/* Fewer than 3 doublewords in total: only the store at 5 is needed. */
109	blt	cr6,5f
110	ld	r0,8(r4)
111	srd	r12,r9,r11
112	sld	r6,r9,r10
113	ldu	r9,16(r4)
114	or	r12,r8,r12
115	srd	r7,r0,r11
116	sld	r8,r0,r10
117	addi	r3,r3,16
/* Exactly 3 doublewords: skip the loop and drain at 3. */
118	beq	cr6,3f
119
120	# d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
1211:	or	r7,r7,r6
122	ld	r0,8(r4)
123	std	r12,8(r3)
1242:	srd	r12,r9,r11
125	sld	r6,r9,r10
126	ldu	r9,16(r4)
127	or	r12,r8,r12
128	stdu	r7,16(r3)
129	srd	r7,r0,r11
130	sld	r8,r0,r10
131	bdnz	1b
132
/* Drain: store the doublewords still held in registers. */
1333:	std	r12,8(r3)
134	or	r7,r7,r6
1354:	std	r7,16(r3)
1365:	srd	r12,r9,r11
137	or	r12,r8,r12
138	std	r12,24(r3)
/*
 * cr0 still comes from the andi. above: with no tail bytes, return.  The
 * next "4:" label forward of this point is the return sequence at the
 * end of .Lshort_copy.
 */
139	beq	4f
/* cr1 <- (tail bytes + source misalignment) compared with 8. */
140	cmpwi	cr1,r5,8
/* Point r3 at the first destination byte not yet written. */
141	addi	r3,r3,32
/* Left-justify the not yet consumed bytes of the last loaded doubleword. */
142	sld	r9,r9,r10
/* <= 8: every tail byte is already in r9, no further load needed. */
143	ble	cr1,6f
/* Otherwise the tail continues in the next source doubleword; merge it. */
144	ld	r0,8(r4)
145	srd	r7,r0,r11
146	or	r9,r7,r9
1476:
/*
 * Store 4, 2 and 1 tail bytes according to cr7 bits 1, 2 and 3.  Each
 * rotate brings the next most-significant bytes of r9 into the low-order
 * position that stw/sth/stb write.
 */
148	bf	cr7*4+1,1f
149	rotldi	r9,r9,32
150	stw	r9,0(r3)
151	addi	r3,r3,4
1521:	bf	cr7*4+2,2f
153	rotldi	r9,r9,16
154	sth	r9,0(r3)
155	addi	r3,r3,2
1562:	bf	cr7*4+3,3f
157	rotldi	r9,r9,8
158	stb	r9,0(r3)
1593:	ld	r3,48(r1)	/* return dest pointer */
160	blr
161
/*
 * .Ldst_unaligned - copy the 1..7 leading bytes needed to bring the
 * destination up to an 8-byte boundary, then continue in .Ldst_aligned.
 *
 * Entered from the entry code with:
 *   r3 = destination, r4 = source, r5 = byte count
 *   r6 = (-r3) & 7 = bytes to the next 8-byte destination boundary
 *
 * r7 is a running byte offset used by the indexed loads and stores.
 * Before leaving, cr7 and cr1 are recomputed from the reduced count
 * because .Ldst_aligned tests both.
 */
162.Ldst_unaligned:
163	PPC_MTOCRF(0x01,r6)		# put #bytes to 8B bdry into cr7
/* r5 = bytes left once the destination has been aligned. */
164	subf	r5,r6,r5
165	li	r7,0
/* cr1 <- reduced count compared with 16 (tested in .Ldst_aligned). */
166	cmpldi	cr1,r5,16
/* 1 bit of r6 set: copy one byte. */
167	bf	cr7*4+3,1f
168	lbz	r0,0(r4)
169	stb	r0,0(r3)
170	addi	r7,r7,1
/* 2 bit of r6 set: copy a halfword at offset r7. */
1711:	bf	cr7*4+2,2f
172	lhzx	r0,r7,r4
173	sthx	r0,r7,r3
174	addi	r7,r7,2
/* 4 bit of r6 set: copy a word at offset r7 (r7 is not needed after). */
1752:	bf	cr7*4+1,3f
176	lwzx	r0,r7,r4
177	stwx	r0,r7,r3
/* cr7 <- low four bits of the reduced count. */
1783:	PPC_MTOCRF(0x01,r5)
/* Step both pointers past the bytes just copied. */
179	add	r4,r6,r4
180	add	r3,r6,r3
181	b	.Ldst_aligned
182
/*
 * .Lshort_copy - copy fewer than 16 bytes.
 *
 * Entered from the entry code when the byte count is below 16, with
 * r3 = destination, r4 = source and cr7 = low four bits of the count.
 * Each set bit of the count is handled in turn: 8 bytes (as two words),
 * then 4, 2 and 1 bytes, advancing both pointers after each step.
 * The saved destination pointer is reloaded into r3 before returning.
 *
 * The final "4:" label is also the target of a "beq 4f" in
 * .Lsrc_unaligned, so the last two instructions must stay a plain
 * return sequence.
 */
183.Lshort_copy:
/* 8 bit set: copy eight bytes as two words. */
184	bf	cr7*4+0,1f
185	lwz	r0,0(r4)
186	lwz	r9,4(r4)
187	addi	r4,r4,8
188	stw	r0,0(r3)
189	stw	r9,4(r3)
190	addi	r3,r3,8
/* 4 bit set: copy one word. */
1911:	bf	cr7*4+1,2f
192	lwz	r0,0(r4)
193	addi	r4,r4,4
194	stw	r0,0(r3)
195	addi	r3,r3,4
/* 2 bit set: copy one halfword. */
1962:	bf	cr7*4+2,3f
197	lhz	r0,0(r4)
198	addi	r4,r4,2
199	sth	r0,0(r3)
200	addi	r3,r3,2
/* 1 bit set: copy the final byte. */
2013:	bf	cr7*4+3,4f
202	lbz	r0,0(r4)
203	stb	r0,0(r3)
2044:	ld	r3,48(r1)	/* return dest pointer */
205	blr
205	blr
206