xref: /linux/tools/testing/selftests/powerpc/copyloops/memcpy_64.S (revision 0883c2c06fb5bcf5b9e008270827e63c09a88c1e)
1/*
2 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
9#include <asm/processor.h>
10#include <asm/ppc_asm.h>
11
/*
 * memcpy - copy r5 bytes from the source at r4 to the destination at r3.
 *
 * In:   r3 = destination pointer
 *       r4 = source pointer
 *       r5 = number of bytes to copy
 * Out:  r3 = the destination pointer that was passed in
 *
 * One of two bodies is selected at build time by __LITTLE_ENDIAN__:
 *   - little-endian: a byte-at-a-time loop that never modifies r3;
 *   - otherwise: a doubleword copy that saves r3 below the stack pointer
 *     on entry and reloads it before every blr.
 *
 * The *_FTR_SECTION macros are defined outside this file.
 * NOTE(review): presumably the first alternative below is used when
 * CPU_FTR_VMX_COPY is clear and the branch to memcpy_power7 otherwise -
 * confirm against the macro definitions.
 */
12	.align	7
13_GLOBAL_TOC(memcpy)
14BEGIN_FTR_SECTION
15#ifdef __LITTLE_ENDIAN__
16	cmpdi	cr7,r5,0	/* cr7.eq = (count == 0), tested by beqlr below */
17#else
18	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* save destination pointer for return value */
19#endif
20FTR_SECTION_ELSE
/* The hand-off to memcpy_power7 is compiled out when SELFTEST is defined. */
21#ifndef SELFTEST
22	b	memcpy_power7
23#endif
24ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
25#ifdef __LITTLE_ENDIAN__
26	/* dumb little-endian memcpy that will get replaced at runtime */
27	addi r9,r3,-1	/* r9 = dest - 1; r3 is left intact as the return value */
28	addi r4,r4,-1	/* r4 = src - 1: lbzu and stbu below pre-increment by 1 */
29	beqlr cr7	/* nothing to copy: return */
30	mtctr r5	/* ctr = byte count */
311:	lbzu r10,1(r4)
32	stbu r10,1(r9)
33	bdnz 1b
34	blr
35#else
/*
 * Big-endian build: classify the request, then dispatch.
 *   cr7 = low 4 bits of the length, tested later with bf cr7*4+n
 *         (n = 0, 1, 2, 3 for the 8, 4, 2 and 1 byte pieces)
 *   cr1 = length compared with 16
 *   r6  = bytes needed to bring the destination up to an 8-byte boundary
 */
36	PPC_MTOCRF(0x01,r5)
37	cmpldi	cr1,r5,16
38	neg	r6,r3		# LS 3 bits = # bytes to 8-byte dest bdry
39	andi.	r6,r6,7		/* cr0.eq set when the destination is already 8-byte aligned */
40	dcbt	0,r4		/* cache-touch hint for the start of the source */
41	blt	cr1,.Lshort_copy	/* fewer than 16 bytes */
42/* Below we want to nop out the bne if we're on a CPU that has the
43   CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
44   cleared.
45   At the time of writing the only CPU that has this combination of bits
46   set is Power6. */
47BEGIN_FTR_SECTION
48	nop
49FTR_SECTION_ELSE
50	bne	.Ldst_unaligned	/* uses cr0 from the andi. above */
51ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
52                    CPU_FTR_UNALIGNED_LD_STD)
/*
 * .Ldst_aligned: main doubleword copy.
 *
 * Reached by falling through from the entry code, or from .Ldst_unaligned
 * after it has copied the bytes up to the next 8-byte destination boundary.
 * At this point:
 *   r3  = destination, r4 = source, r5 = bytes left to copy
 *   cr7 = low 4 bits of r5, cr1 = r5 compared with 16
 *
 * Two doublewords are moved per loop iteration, and each load is issued
 * before the store of the previously loaded doubleword.  r3 is biased by
 * -16 and r4 by -8 so that the update-form ldu and stdu can be used.
 */
53.Ldst_aligned:
54	addi	r3,r3,-16
55BEGIN_FTR_SECTION
56	andi.	r0,r4,7		/* r0 = source offset within a doubleword */
57	bne	.Lsrc_unaligned	/* source not 8-byte aligned: shift-and-merge path */
58END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
59	srdi	r7,r5,4		/* r7 = number of 16-byte chunks = loop count */
60	ld	r9,0(r4)	/* first source doubleword */
61	addi	r4,r4,-8
62	mtctr	r7
63	andi.	r5,r5,7		/* r5 = trailing bytes (< 8); cr0.eq if there are none */
64	bf	cr7*4+0,2f	/* 8-byte bit clear: even doubleword count, enter loop at 2: */
	/* Odd doubleword count: carry the first doubleword in r8 instead. */
65	addi	r3,r3,8
66	addi	r4,r4,8
67	mr	r8,r9
68	blt	cr1,3f		/* fewer than 16 bytes: that doubleword is the only one */
691:	ld	r9,8(r4)
70	std	r8,8(r3)
712:	ldu	r8,16(r4)
72	stdu	r9,16(r3)
73	bdnz	1b
743:	std	r8,8(r3)	/* store the last doubleword that was loaded */
75	beq	3f		/* cr0 from the andi. above: no trailing bytes, return */
76	addi	r3,r3,16	/* r3 = first destination byte not yet written */
/*
 * Copy the remaining 4, 2 and 1 byte pieces selected by cr7.  The source is
 * addressed as 8(r4) because r4 still points at the last doubleword loaded.
 */
77.Ldo_tail:
78	bf	cr7*4+1,1f
79	lwz	r9,8(r4)
80	addi	r4,r4,4
81	stw	r9,0(r3)
82	addi	r3,r3,4
831:	bf	cr7*4+2,2f
84	lhz	r9,8(r4)
85	addi	r4,r4,2
86	sth	r9,0(r3)
87	addi	r3,r3,2
882:	bf	cr7*4+3,3f
89	lbz	r9,8(r4)
90	stb	r9,0(r3)
913:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* return dest pointer */
92	blr
93
/*
 * .Lsrc_unaligned: doubleword copy for a source that is not 8-byte aligned.
 *
 * Reached from .Ldst_aligned with:
 *   r0  = source address & 7 (non-zero)
 *   r3  = destination - 16
 *   r4  = source
 *   r5  = bytes left to copy
 *   cr7 = low 4 bits of r5
 *
 * The source is read with doubleword loads starting at r4 - r0, and each
 * output doubleword is assembled from two neighbouring source doublewords
 * as (s[i] << r10) | (s[i+1] >> r11), with r10 = 8 * r0 and r11 = 64 - r10.
 * This code is only built when __LITTLE_ENDIAN__ is not defined.
 *
 * cr0 is set once by the andi. below and is not modified again before the
 * beq that follows label 5:, so that beq tests "no trailing bytes".
 */
94.Lsrc_unaligned:
95	srdi	r6,r5,3		/* r6 = number of whole doublewords to store */
96	addi	r5,r5,-16
97	subf	r4,r0,r4	/* r4 = source rounded down to an 8-byte boundary */
98	srdi	r7,r5,4		/* r7 = loop count for the main loop below */
99	sldi	r10,r0,3	/* r10 = left shift amount in bits */
100	cmpdi	cr6,r6,3	/* cr6: doubleword count compared with 3 */
101	andi.	r5,r5,7		/* r5 = trailing bytes (< 8); sets cr0 */
102	mtctr	r7
103	subfic	r11,r10,64	/* r11 = right shift amount in bits */
104	add	r5,r5,r0	/* r5 = trailing bytes + source offset, compared with 8 below */
105
106	bt	cr7*4+0,0f	/* 8-byte bit set: odd doubleword count, use the 0: prologue */
107
108	ld	r9,0(r4)	# 3+2n loads, 2+2n stores
109	ld	r0,8(r4)
110	sld	r6,r9,r10
111	ldu	r9,16(r4)
112	srd	r7,r0,r11
113	sld	r8,r0,r10
114	or	r7,r7,r6
115	blt	cr6,4f		/* fewer than 3 doublewords: skip the main loop */
116	ld	r0,8(r4)
117	# s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
118	b	2f
119
1200:	ld	r0,0(r4)	# 4+2n loads, 3+2n stores
121	ldu	r9,8(r4)
122	sld	r8,r0,r10
123	addi	r3,r3,-8
124	blt	cr6,5f		/* fewer than 3 doublewords: only the store at 5: is needed */
125	ld	r0,8(r4)
126	srd	r12,r9,r11
127	sld	r6,r9,r10
128	ldu	r9,16(r4)
129	or	r12,r8,r12
130	srd	r7,r0,r11
131	sld	r8,r0,r10
132	addi	r3,r3,16
133	beq	cr6,3f		/* exactly 3 doublewords: skip the main loop */
134
135	# d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
	/* Main loop: two merged doublewords are stored per iteration. */
1361:	or	r7,r7,r6
137	ld	r0,8(r4)
138	std	r12,8(r3)
1392:	srd	r12,r9,r11
140	sld	r6,r9,r10
141	ldu	r9,16(r4)
142	or	r12,r8,r12
143	stdu	r7,16(r3)
144	srd	r7,r0,r11
145	sld	r8,r0,r10
146	bdnz	1b
147
	/* Drain: store the doublewords still held in registers. */
1483:	std	r12,8(r3)
149	or	r7,r7,r6
1504:	std	r7,16(r3)
1515:	srd	r12,r9,r11
152	or	r12,r8,r12
153	std	r12,24(r3)
	/*
	 * No trailing bytes: leave through the next 4: label further down the
	 * file, which is the return sequence at the end of .Lshort_copy.
	 */
154	beq	4f
155	cmpwi	cr1,r5,8	/* do the trailing bytes reach past the last loaded doubleword? */
156	addi	r3,r3,32	/* r3 = first destination byte not yet written */
157	sld	r9,r9,r10	/* unused bytes of the last doubleword, moved to the top of r9 */
158	ble	cr1,6f		/* r9 already holds every trailing byte */
159	ld	r0,8(r4)	/* otherwise merge in one more source doubleword */
160	srd	r7,r0,r11
161	or	r9,r7,r9
	/*
	 * Store the trailing 4, 2 and 1 byte pieces selected by cr7.  Each
	 * rotldi brings the next piece from the top of r9 down to the low end,
	 * where stw, sth and stb take it from.
	 */
1626:
163	bf	cr7*4+1,1f
164	rotldi	r9,r9,32
165	stw	r9,0(r3)
166	addi	r3,r3,4
1671:	bf	cr7*4+2,2f
168	rotldi	r9,r9,16
169	sth	r9,0(r3)
170	addi	r3,r3,2
1712:	bf	cr7*4+3,3f
172	rotldi	r9,r9,8
173	stb	r9,0(r3)
1743:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* return dest pointer */
175	blr
176
/*
 * .Ldst_unaligned: bring the destination up to an 8-byte boundary.
 *
 * Reached from the entry code with r6 = number of bytes (1..7) between the
 * destination and the next 8-byte boundary.  Copies 1, 2 and 4 byte pieces
 * as selected by the low bits of r6, then advances both pointers by r6,
 * reloads cr7 and cr1 for the reduced length and continues at
 * .Ldst_aligned.  r7 is the running byte offset used by the indexed
 * loads and stores.
 */
177.Ldst_unaligned:
178	PPC_MTOCRF(0x01,r6)		# put #bytes to 8B bdry into cr7
179	subf	r5,r6,r5	/* r5 = bytes left once the boundary has been reached */
180	li	r7,0
181	cmpldi	cr1,r5,16	/* cr1: remaining length compared with 16 */
182	bf	cr7*4+3,1f	/* 1-byte piece */
183	lbz	r0,0(r4)
184	stb	r0,0(r3)
185	addi	r7,r7,1
1861:	bf	cr7*4+2,2f	/* 2-byte piece */
187	lhzx	r0,r7,r4
188	sthx	r0,r7,r3
189	addi	r7,r7,2
1902:	bf	cr7*4+1,3f	/* 4-byte piece */
191	lwzx	r0,r7,r4
192	stwx	r0,r7,r3
1933:	PPC_MTOCRF(0x01,r5)	/* cr7 = low 4 bits of the remaining length */
194	add	r4,r6,r4
195	add	r3,r6,r3
196	b	.Ldst_aligned
197
/*
 * .Lshort_copy: copy fewer than 16 bytes.
 *
 * Reached from the entry code with r3 = destination, r4 = source and
 * cr7 = low 4 bits of the length.  Copies an 8-byte piece (as two words),
 * then 4, 2 and 1 byte pieces, each only if its bit is set in cr7, and
 * returns the destination pointer saved on entry.
 */
198.Lshort_copy:
199	bf	cr7*4+0,1f	/* 8-byte piece */
200	lwz	r0,0(r4)
201	lwz	r9,4(r4)
202	addi	r4,r4,8
203	stw	r0,0(r3)
204	stw	r9,4(r3)
205	addi	r3,r3,8
2061:	bf	cr7*4+1,2f	/* 4-byte piece */
207	lwz	r0,0(r4)
208	addi	r4,r4,4
209	stw	r0,0(r3)
210	addi	r3,r3,4
2112:	bf	cr7*4+2,3f	/* 2-byte piece */
212	lhz	r0,0(r4)
213	addi	r4,r4,2
214	sth	r0,0(r3)
215	addi	r3,r3,2
2163:	bf	cr7*4+3,4f	/* 1-byte piece */
217	lbz	r0,0(r4)
218	stb	r0,0(r3)
2194:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* return dest pointer */
220	blr
221#endif
222