xref: /linux/tools/testing/selftests/powerpc/copyloops/copy_mc_64.S (revision c8bfe3fad4f86a029da7157bae9699c816f0c309)
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * Copyright (C) IBM Corporation, 2011
4 * Derived from copyuser_power7.s by Anton Blanchard <anton@au.ibm.com>
5 * Author - Balbir Singh <bsingharora@gmail.com>
6 */
7#include <linux/export.h>
8#include <asm/ppc_asm.h>
9#include <asm/errno.h>
10
/*
 * Fault annotations for the copy code in this file.
 *
 * Each macro places a local numeric label at the point of use and hands
 * that label, together with a fixup label, to EX_TABLE().  It is written
 * directly in front of the access it covers ("err1; lbz r0,0(r4)"), so the
 * label ends up on that instruction.
 *
 *   err1 - fixup is .Ldo_err1
 *   err2 - fixup is .Ldo_err2
 *   err3 - fixup is .Ldone
 */
	.macro err1
100:	EX_TABLE(100b,.Ldo_err1)
	.endm

	.macro err2
200:	EX_TABLE(200b,.Ldo_err2)
	.endm

	.macro err3
300:	EX_TABLE(300b,.Ldone)
	.endm
24
/*
 * Fixup entry for accesses annotated with err2.
 *
 * Reloads r14-r22 from their STK_REG() slots and releases the
 * STACKFRAMESIZE frame, then falls through into .Ldo_err1.
 */
.Ldo_err2:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE
	/* fall through */

/*
 * Fixup entry for accesses annotated with err1.
 *
 * Retries the copy one byte at a time so the exact number of bytes left
 * can be reported: r4 is the load address, r3 the store address and r7
 * the number of bytes to attempt.  CTR counts the bytes down.
 */
.Ldo_err1:
	mtctr	r7
.Lbyte_copy:
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1
	bdnz	.Lbyte_copy
	li	r3,0			/* every byte went through: return 0 */
	blr

/*
 * Fixup for the err3 accesses in the byte loop above.  CTR has not yet
 * been decremented for the byte being copied, so it is the number of
 * bytes that were not copied; return it in r3.
 */
.Ldone:
	mfctr	r3
	blr
51
52
/*
 * copy_mc_generic - copy memory with every access covered by a fixup.
 *
 * In:	r3 = destination address
 *	r4 = source address
 *	r5 = number of bytes to copy
 * Out:	r3 = 0 if the whole copy completed.  If an annotated access faults,
 *	the fixup code (.Ldo_err1/.Ldo_err2) retries byte by byte and
 *	returns either 0 or, via .Ldone, the number of bytes not copied.
 *
 * Scratch: r0, r6-r12, CTR, cr0, cr7.  r14-r22 are used by the 64B/128B
 * blocks and are saved/restored around them in a STACKFRAMESIZE frame.
 *
 * r7 shadows the number of bytes still to be copied; r5 is consumed by
 * the size-bit tests and is not usable for that once masking starts.
 *
 * Invariant relied on by the fixup code: at every err1/err2 annotated
 * load or store, r3, r4 and r7 all describe the start of the chunk being
 * copied.  They are advanced together, and only after the last store of
 * the chunk.  The fixup restarts a byte copy from r3/r4 for r7 bytes, so
 * moving r4 ahead between the loads and the stores would make a faulting
 * store resume from the wrong source offset.
 *
 * Accesses made while the stack frame is live use err2 (whose fixup pops
 * the frame first); all others use err1.
 */
_GLOBAL(copy_mc_generic)
	mr	r7,r5			/* r7 = bytes still to copy */
	cmpldi	r5,16
	blt	.Lshort_copy

.Lcopy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6			/* cr7 = low 4 bits of -source */
	clrldi	r6,r6,(64-3)		/* r6 = bytes needed to align (0-7) */

	bf	cr7*4+3,1f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)
	addi	r4,r4,1
	addi	r3,r3,1
	subi	r7,r7,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
err1;	sth	r0,0(r3)
	addi	r4,r4,2
	addi	r3,r3,2
	subi	r7,r7,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
err1;	stw	r0,0(r3)
	addi	r4,r4,4
	addi	r3,r3,4
	subi	r7,r7,4

3:	sub	r5,r5,r6		/* r5 = length left after alignment */
	cmpldi	r5,128			/* cr0 is tested by the blt below */

	/*
	 * Open a stack frame and save r14-r22 for use as copy registers.
	 * LR is stored at STACKFRAMESIZE+16(r1); it is not reloaded here.
	 */
	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	blt	5f			/* fewer than 128B: skip the loop */
	srdi	r6,r5,7
	mtctr	r6			/* CTR = number of 128B blocks */

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r8,16(r4)
err2;	ld	r9,24(r4)
err2;	ld	r10,32(r4)
err2;	ld	r11,40(r4)
err2;	ld	r12,48(r4)
err2;	ld	r14,56(r4)
err2;	ld	r15,64(r4)
err2;	ld	r16,72(r4)
err2;	ld	r17,80(r4)
err2;	ld	r18,88(r4)
err2;	ld	r19,96(r4)
err2;	ld	r20,104(r4)
err2;	ld	r21,112(r4)
err2;	ld	r22,120(r4)
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r8,16(r3)
err2;	std	r9,24(r3)
err2;	std	r10,32(r3)
err2;	std	r11,40(r3)
err2;	std	r12,48(r3)
err2;	std	r14,56(r3)
err2;	std	r15,64(r3)
err2;	std	r16,72(r3)
err2;	std	r17,80(r3)
err2;	std	r18,88(r3)
err2;	std	r19,96(r3)
err2;	std	r20,104(r3)
err2;	std	r21,112(r3)
err2;	std	r22,120(r3)
	addi	r4,r4,128
	addi	r3,r3,128
	subi	r7,r7,128
	bdnz	4b

	clrldi	r5,r5,(64-7)		/* keep only the sub-128B remainder */

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6			/* cr7 = the 128/64/32/16 bits of r5 */

6:	bf	cr7*4+1,7f
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r8,16(r4)
err2;	ld	r9,24(r4)
err2;	ld	r10,32(r4)
err2;	ld	r11,40(r4)
err2;	ld	r12,48(r4)
err2;	ld	r14,56(r4)
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r8,16(r3)
err2;	std	r9,24(r3)
err2;	std	r10,32(r3)
err2;	std	r11,40(r3)
err2;	std	r12,48(r3)
err2;	std	r14,56(r3)
	addi	r4,r4,64
	addi	r3,r3,64
	subi	r7,r7,64

	/* r14-r22 are no longer needed: restore them and drop the frame */
7:	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 63B to go */
	bf	cr7*4+2,8f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r8,16(r4)
err1;	ld	r9,24(r4)
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r8,16(r3)
err1;	std	r9,24(r3)
	addi	r4,r4,32
	addi	r3,r3,32
	subi	r7,r7,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
	addi	r4,r4,16
	addi	r3,r3,16
	subi	r7,r7,16

9:	clrldi	r5,r5,(64-4)		/* keep only the sub-16B remainder */

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5			/* cr7 = the 8/4/2/1 bits of r5 */
	bf	cr7*4+0,12f
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz	r6,4(r4)
err1;	stw	r0,0(r3)
err1;	stw	r6,4(r3)
	addi	r4,r4,8
	addi	r3,r3,8
	subi	r7,r7,8

12:	bf	cr7*4+1,13f
err1;	lwz	r0,0(r4)
err1;	stw	r0,0(r3)
	addi	r4,r4,4
	addi	r3,r3,4
	subi	r7,r7,4

13:	bf	cr7*4+2,14f
err1;	lhz	r0,0(r4)
err1;	sth	r0,0(r3)
	addi	r4,r4,2
	addi	r3,r3,2
	subi	r7,r7,2

	/* Last byte: nothing follows, so r3/r4/r7 are not advanced */
14:	bf	cr7*4+3,15f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)

15:	li	r3,0			/* complete copy: return 0 */
	blr

EXPORT_SYMBOL_GPL(copy_mc_generic);
243