xref: /linux/arch/powerpc/lib/memcpy_64.S (revision 2c86cd188f8a5631f3d75a1dea14d22df85189b4)
1/*
2 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
9#include <asm/processor.h>
10#include <asm/ppc_asm.h>
11#include <asm/export.h>
12#include <asm/asm-compat.h>
13#include <asm/feature-fixups.h>
14
15	.align	7
/*
 * memcpy: copy r5 bytes from the buffer at r4 to the buffer at r3.
 *
 * Register use visible in this file:
 *   r3 = destination pointer (also the value handed back to the caller)
 *   r4 = source pointer
 *   r5 = byte count
 *
 * The entry is an alternative feature section keyed on CPU_FTR_VMX_COPY:
 *   - first alternative: a single set-up instruction for the generic code
 *     that follows (little-endian: compare the length with zero into cr7;
 *     big-endian: stash r3 in a slot addressed off r1 so that every exit
 *     can reload it as the return value);
 *   - second alternative: branch to memcpy_power7, which is defined outside
 *     this file.  That branch is only assembled when CONFIG_PPC_BOOK3S_64
 *     is defined and SELFTEST is not.
 * NOTE(review): going by the macro name ALT_FTR_SECTION_END_IFCLR, the first
 * alternative is presumably the one kept when CPU_FTR_VMX_COPY is clear;
 * confirm against asm/feature-fixups.h.
 */
16_GLOBAL_TOC(memcpy)
17BEGIN_FTR_SECTION
18#ifdef __LITTLE_ENDIAN__
	/* cr7.EQ <=> length is zero; consumed by "beqlr cr7" in the byte loop. */
19	cmpdi	cr7,r5,0
20#else
21	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* save destination pointer for return value */
22#endif
23FTR_SECTION_ELSE
24#ifdef CONFIG_PPC_BOOK3S_64
25#ifndef SELFTEST
	/* Hand the whole copy over to the routine defined elsewhere. */
26	b	memcpy_power7
27#endif
28#endif
29ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
30#ifdef __LITTLE_ENDIAN__
31	/* dumb little-endian memcpy that will get replaced at runtime */
	/*
	 * Byte-at-a-time copy.  cr7 holds the result of the
	 * "cmpdi cr7,r5,0" in the entry feature section, so cr7.EQ means
	 * the length is zero.  r3 is never written on this path, so it
	 * still holds the destination pointer at both returns.
	 */
	/*
	 * Pre-bias both pointers by -1: the update-form lbzu/stbu below add
	 * 1 to the base register before each access.  r9 is the working
	 * copy of the destination pointer.
	 */
32	addi r9,r3,-1
33	addi r4,r4,-1
	/* Zero length: return before CTR is loaded with 0. */
34	beqlr cr7
	/* CTR = number of bytes to copy. */
35	mtctr r5
	/* Copy one byte per pass until CTR reaches zero. */
361:	lbzu r10,1(r4)
37	stbu r10,1(r9)
38	bdnz 1b
39	blr
40#else
	/*
	 * Big-endian build: classify the copy and dispatch.
	 *
	 * cr7 receives the low four bits of the length.  As the bf/bt
	 * tests in the rest of this routine use them:
	 *   cr7*4+0 -> 8-byte piece, cr7*4+1 -> 4-byte piece,
	 *   cr7*4+2 -> 2-byte piece, cr7*4+3 -> 1-byte piece.
	 */
41	PPC_MTOCRF(0x01,r5)
	/* cr1 = unsigned comparison of the length with 16. */
42	cmpldi	cr1,r5,16
43	neg	r6,r3		# LS 3 bits = # bytes to 8-byte dest bdry
	/*
	 * r6 = (-dest) & 7.  The record form also sets cr0: "not equal"
	 * means the destination is not 8-byte aligned (tested by the bne
	 * in the feature section below).
	 */
44	andi.	r6,r6,7
	/* Cache-touch hint for the block containing the first source byte. */
45	dcbt	0,r4
	/* Fewer than 16 bytes: use the straight-line small copy. */
46	blt	cr1,.Lshort_copy
47/* Below we want to nop out the bne if we're on a CPU that has the
48   CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
49   cleared.
50   At the time of writing the only CPU that has this combination of bits
51   set is Power6. */
52BEGIN_FTR_SECTION
53	nop
54FTR_SECTION_ELSE
	/* Destination misaligned (cr0 from the andi. above): align it first. */
55	bne	.Ldst_unaligned
56ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
57                    CPU_FTR_UNALIGNED_LD_STD)
	/* Execution continues into the .Ldst_aligned code that follows. */
/*
 * .Ldst_aligned: bulk copy in 8-byte doublewords, 16 bytes per loop pass.
 *
 * Entered by fall-through from the big-endian entry code or by the branch
 * that ends .Ldst_unaligned.  State set up by those paths:
 *   r3 = destination, r4 = source, r5 = bytes still to copy,
 *   cr7 = low four bits of r5, cr1 = unsigned compare of r5 with 16.
 *
 * The loop keeps one doubleword in flight: each load is issued before the
 * store of the previously loaded value (r8 and r9 alternate).  The pointers
 * are biased (r3 by -16, r4 by -8) so that the ldu/stdu update forms can
 * step them by 16 per pass.
 */
58.Ldst_aligned:
59	addi	r3,r3,-16
60BEGIN_FTR_SECTION
	/* r0 = src & 7; non-zero sends us to the shift-and-merge copy. */
61	andi.	r0,r4,7
62	bne	.Lsrc_unaligned
63END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
	/* r7 = length / 16 becomes CTR; r9 = first source doubleword. */
64	srdi	r7,r5,4
65	ld	r9,0(r4)
66	addi	r4,r4,-8
67	mtctr	r7
	/*
	 * r5 = length & 7.  cr0.EQ, tested by "beq 3f" after the loop,
	 * means there is no sub-doubleword tail.
	 */
68	andi.	r5,r5,7
	/* 8s bit of the length clear: even doubleword count, enter at 2:. */
69	bf	cr7*4+0,2f
	/*
	 * Odd doubleword count: skew both pointers by 8 and move the
	 * doubleword already loaded into r8 so that it is stored first.
	 * If the length is below 16 (cr1 LT) it is the only doubleword.
	 */
70	addi	r3,r3,8
71	addi	r4,r4,8
72	mr	r8,r9
73	blt	cr1,3f
	/* Main loop: two loads and two stores (16 bytes) per pass. */
741:	ld	r9,8(r4)
75	std	r8,8(r3)
762:	ldu	r8,16(r4)
77	stdu	r9,16(r3)
78	bdnz	1b
	/* Store the doubleword that is still pending in r8. */
793:	std	r8,8(r3)
	/* No tail bytes (cr0 from the andi. above): go straight to the return. */
80	beq	3f
	/* r3 now addresses the next unwritten destination byte. */
81	addi	r3,r3,16
	/*
	 * Tail of 1..7 bytes, selected by cr7: a 4-byte, then a 2-byte,
	 * then a 1-byte piece.  r4 trails the next unread source byte by
	 * 8, hence the displacement of 8 on every load.
	 */
82.Ldo_tail:
83	bf	cr7*4+1,1f
84	lwz	r9,8(r4)
85	addi	r4,r4,4
86	stw	r9,0(r3)
87	addi	r3,r3,4
881:	bf	cr7*4+2,2f
89	lhz	r9,8(r4)
90	addi	r4,r4,2
91	sth	r9,0(r3)
92	addi	r3,r3,2
932:	bf	cr7*4+3,3f
94	lbz	r9,8(r4)
95	stb	r9,0(r3)
963:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* return dest pointer */
97	blr
98
/*
 * .Lsrc_unaligned: copy when the source is not 8-byte aligned.
 *
 * Entered from .Ldst_aligned with r0 = src & 7 (non-zero), r3 already
 * biased by -16, r5 = length and cr7 = low four bits of the length.
 *
 * The source pointer is rounded down to an 8-byte boundary, so every ld
 * uses an aligned address.  Each destination doubleword is assembled from
 * two neighbouring source doublewords:
 *     d[i] = (s[i] << r10) | (s[i+1] >> r11)
 * with r10 = 8 * (src & 7) and r11 = 64 - r10.
 */
99.Lsrc_unaligned:
	/* r6 = length / 8 = number of whole doublewords to produce. */
100	srdi	r6,r5,3
101	addi	r5,r5,-16
	/* r4 = source rounded down to an 8-byte boundary. */
102	subf	r4,r0,r4
	/* r7 = (length - 16) / 16 becomes the loop count in CTR. */
103	srdi	r7,r5,4
	/* r10 = left-shift amount in bits. */
104	sldi	r10,r0,3
	/* cr6 = doubleword count compared with 3 (selects short exits below). */
105	cmpdi	cr6,r6,3
	/*
	 * r5 = length & 7.  cr0.EQ (no tail bytes) stays live until the
	 * "beq 4f" after label 5:, because nothing in between writes cr0.
	 */
106	andi.	r5,r5,7
107	mtctr	r7
	/* r11 = right-shift amount in bits. */
108	subfic	r11,r10,64
	/* r5 = (length & 7) + (src & 7), compared with 8 in the tail code. */
109	add	r5,r5,r0
110
	/* 8s bit of the length set: odd doubleword count, use the entry at 0:. */
111	bt	cr7*4+0,0f
112
	/* Even doubleword count: prime the pipeline and enter the loop at 2:. */
113	ld	r9,0(r4)	# 3+2n loads, 2+2n stores
114	ld	r0,8(r4)
115	sld	r6,r9,r10
116	ldu	r9,16(r4)
117	srd	r7,r0,r11
118	sld	r8,r0,r10
119	or	r7,r7,r6
	/* Fewer than 3 doublewords: skip the loop and go to the final stores. */
120	blt	cr6,4f
121	ld	r0,8(r4)
122	# s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
123	b	2f
124
	/*
	 * Odd doubleword count: r3 is moved back by 8 so that the fixed
	 * store displacements used below still line up.
	 */
1250:	ld	r0,0(r4)	# 4+2n loads, 3+2n stores
126	ldu	r9,8(r4)
127	sld	r8,r0,r10
128	addi	r3,r3,-8
	/* Fewer than 3 doublewords: only the final store at 5: is needed. */
129	blt	cr6,5f
130	ld	r0,8(r4)
131	srd	r12,r9,r11
132	sld	r6,r9,r10
133	ldu	r9,16(r4)
134	or	r12,r8,r12
135	srd	r7,r0,r11
136	sld	r8,r0,r10
137	addi	r3,r3,16
	/* Exactly 3 doublewords: skip the loop. */
138	beq	cr6,3f
139
	/* Main loop: 16 source bytes merged and 16 destination bytes stored per pass. */
140	# d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
1411:	or	r7,r7,r6
142	ld	r0,8(r4)
143	std	r12,8(r3)
1442:	srd	r12,r9,r11
145	sld	r6,r9,r10
146	ldu	r9,16(r4)
147	or	r12,r8,r12
148	stdu	r7,16(r3)
149	srd	r7,r0,r11
150	sld	r8,r0,r10
151	bdnz	1b
152
	/* Drain the pipeline: store the doublewords still being assembled. */
1533:	std	r12,8(r3)
154	or	r7,r7,r6
1554:	std	r7,16(r3)
1565:	srd	r12,r9,r11
157	or	r12,r8,r12
158	std	r12,24(r3)
	/*
	 * length & 7 == 0: finished.  The nearest forward "4:" is the
	 * return sequence at the end of .Lshort_copy.
	 */
159	beq	4f
	/* cr1: do the tail bytes extend beyond the doubleword held in r9? */
160	cmpwi	cr1,r5,8
	/* r3 = next unwritten destination byte. */
161	addi	r3,r3,32
	/* Left-justify the bytes of r9 that have not been stored yet. */
162	sld	r9,r9,r10
	/* (length & 7) + (src & 7) <= 8: r9 already holds every tail byte. */
163	ble	cr1,6f
	/* Otherwise fetch one more source doubleword and merge its leading bytes. */
164	ld	r0,8(r4)
165	srd	r7,r0,r11
166	or	r9,r7,r9
	/*
	 * r9 holds the tail bytes left-justified.  Each rotldi rotates the
	 * next 4, 2 or 1 bytes into the low-order end of r9, from where
	 * stw/sth/stb write them out; cr7 selects which pieces exist.
	 */
1676:
168	bf	cr7*4+1,1f
169	rotldi	r9,r9,32
170	stw	r9,0(r3)
171	addi	r3,r3,4
1721:	bf	cr7*4+2,2f
173	rotldi	r9,r9,16
174	sth	r9,0(r3)
175	addi	r3,r3,2
1762:	bf	cr7*4+3,3f
177	rotldi	r9,r9,8
178	stb	r9,0(r3)
1793:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* return dest pointer */
180	blr
181
/*
 * .Ldst_unaligned: copy the 1..7 leading bytes needed to bring the
 * destination up to an 8-byte boundary, then continue at .Ldst_aligned.
 *
 * On entry r6 = (-dest) & 7, the number of bytes to that boundary, and
 * r5 = total length.  The bytes are moved as an optional 1-byte, then
 * 2-byte, then 4-byte piece, selected by the low three bits of r6.
 * r7 is the running offset applied to both buffers.
 */
182.Ldst_unaligned:
183	PPC_MTOCRF(0x01,r6)		# put #bytes to 8B bdry into cr7
	/* r5 = bytes that remain once the destination is aligned. */
184	subf	r5,r6,r5
185	li	r7,0
	/* Redo the 16-byte comparison in cr1 for the reduced length. */
186	cmpldi	cr1,r5,16
	/* 1-byte piece. */
187	bf	cr7*4+3,1f
188	lbz	r0,0(r4)
189	stb	r0,0(r3)
190	addi	r7,r7,1
	/* 2-byte piece at offset r7. */
1911:	bf	cr7*4+2,2f
192	lhzx	r0,r7,r4
193	sthx	r0,r7,r3
194	addi	r7,r7,2
	/* 4-byte piece at offset r7. */
1952:	bf	cr7*4+1,3f
196	lwzx	r0,r7,r4
197	stwx	r0,r7,r3
	/* Reload cr7 with the low four bits of the remaining length. */
1983:	PPC_MTOCRF(0x01,r5)
	/* Advance both pointers past the bytes just copied. */
199	add	r4,r6,r4
200	add	r3,r6,r3
201	b	.Ldst_aligned
202
/*
 * .Lshort_copy: straight-line copy for lengths below 16.
 *
 * Reached from the big-endian entry code when cr1 reports length < 16.
 * cr7 holds the low four bits of the length; each bit selects one piece:
 * 8 bytes (moved as two words), then 4, then 2, then 1.  r3 and r4 are
 * advanced after each piece and r3 is reloaded from its save slot before
 * returning.
 *
 * The final label "4:" is also the target of the "beq 4f" in the tail of
 * .Lsrc_unaligned.
 */
203.Lshort_copy:
	/* 8-byte piece, copied as two 4-byte words. */
204	bf	cr7*4+0,1f
205	lwz	r0,0(r4)
206	lwz	r9,4(r4)
207	addi	r4,r4,8
208	stw	r0,0(r3)
209	stw	r9,4(r3)
210	addi	r3,r3,8
	/* 4-byte piece. */
2111:	bf	cr7*4+1,2f
212	lwz	r0,0(r4)
213	addi	r4,r4,4
214	stw	r0,0(r3)
215	addi	r3,r3,4
	/* 2-byte piece. */
2162:	bf	cr7*4+2,3f
217	lhz	r0,0(r4)
218	addi	r4,r4,2
219	sth	r0,0(r3)
220	addi	r3,r3,2
	/* 1-byte piece. */
2213:	bf	cr7*4+3,4f
222	lbz	r0,0(r4)
223	stb	r0,0(r3)
2244:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* return dest pointer */
225	blr
226#endif
227EXPORT_SYMBOL(memcpy)
228