/* xref: /linux/tools/testing/selftests/powerpc/copyloops/copyuser_power7.S (revision c8bfe3fad4f86a029da7157bae9699c816f0c309) */
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

/*
 * SELFTEST_CASE selects which copy variant a selftest build exercises.
 * A build may predefine it; otherwise it defaults to 0.  Its value is
 * assigned to test_feature immediately before the CPU_FTR_ALTIVEC
 * feature section in __copy_tofrom_user_power7.
 */
#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE	0
#endif

/*
 * Endian-dependent wrappers used by the unaligned VMX copy path.
 *
 *   LVS(VRT,RA,RB)          builds the permute control vector:
 *                           lvsl on big-endian, lvsr on little-endian.
 *   VPERM(VRT,VRA,VRB,VRC)  vperm with the two source vectors passed in
 *                           the given order on big-endian and swapped
 *                           (VRB,VRA) on little-endian.
 *
 * Callers always write VPERM(out, older, newer, control); the swap keeps
 * that call shape identical for both byte orders.
 */
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

/*
 * err1 / err2: exception-table annotations for user-memory accesses.
 *
 * Each macro drops a numeric local label at the current location and
 * passes it, together with a fixup label, to EX_TABLE.  They are written
 * as a statement prefix ("err1;  lbz r0,0(r4)") so the label lands on
 * the instruction that follows.
 *
 *   err1 -> .Ldo_err1  used where no stack frame has been pushed
 *   err2 -> .Ldo_err2  used inside the non-VMX 128B loop, where a frame
 *                      is pushed and r14-r22 are saved in it
 *
 * NOTE(review): EX_TABLE comes from ppc_asm.h; presumably it records a
 * (faulting address, fixup address) pair -- confirm against that header.
 */
	.macro err1
100:
	EX_TABLE(100b,.Ldo_err1)
	.endm

	.macro err2
200:
	EX_TABLE(200b,.Ldo_err2)
	.endm

#ifdef CONFIG_ALTIVEC
/*
 * err3 / err4: exception-table annotations for the VMX copy paths,
 * built the same way as err1/err2 (local label + EX_TABLE entry).
 *
 *   err3 -> .Ldo_err3  VMX path, frame pushed, no non-volatile GPRs saved
 *   err4 -> .Ldo_err4  VMX 128B loops, where r14-r16 are saved in the frame
 */
	.macro err3
300:
	EX_TABLE(300b,.Ldo_err3)
	.endm

	.macro err4
400:
	EX_TABLE(400b,.Ldo_err4)
	.endm


/*
 * VMX fixup handlers.
 *
 * .Ldo_err4 reloads r14-r16 from the frame and falls through to
 * .Ldo_err3, which calls exit_vmx_usercopy, reloads the link register
 * value that .Lvmx_copy stored (found at STACKFRAMESIZE+16(r1) while the
 * frame is pushed) and continues at .Lexit to pop the frame.
 */
.Ldo_err4:
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Ldo_err3:
	bl	CFUNC(exit_vmx_usercopy)
	ld	r0,STACKFRAMESIZE+16(r1)	/* LR was clobbered by the bl above */
	mtlr	r0
	b	.Lexit
#endif /* CONFIG_ALTIVEC */

/*
 * Common fixup tail.  The three labels are entered at different depths
 * and fall through one another:
 *
 *   .Ldo_err2  frame pushed, r14-r22 saved: reload r14-r22
 *   .Lexit     frame pushed:                pop the frame
 *   .Ldo_err1  no frame:                    reload the original r3/r4/r5
 *                                           and branch to
 *                                           __copy_tofrom_user_base
 *
 * The original arguments were stored below the stack pointer at function
 * entry, hence the -STACKFRAMESIZE+STK_REG(...) offsets once r1 is back
 * at its entry value.
 *
 * NOTE(review): __copy_tofrom_user_base is defined elsewhere; presumably
 * it redoes the copy and produces the final return value -- confirm.
 */
.Ldo_err2:
	ld	r22,STK_REG(R22)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
.Ldo_err1:
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	ld	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	ld	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	b	__copy_tofrom_user_base


/*
 * __copy_tofrom_user_power7
 *
 * In:   r3 = destination address (all stores go through r3)
 *       r4 = source address      (all loads go through r4)
 *       r5 = length in bytes
 * Out:  r3 = 0 when the non-VMX path completes; the VMX paths finish by
 *       tail-calling exit_vmx_usercopy, so r3 is whatever that returns.
 *       On a fault the fixup handlers branch to __copy_tofrom_user_base.
 *
 * Dispatch on length:
 *   len < 16                      -> .Lshort_copy
 *   len > 3328 (CONFIG_ALTIVEC,
 *               CPU_FTR_ALTIVEC)  -> .Lvmx_copy
 *   otherwise                     -> fall through to .Lnonvmx_copy
 */
_GLOBAL(__copy_tofrom_user_power7)
	cmpldi	r5,16			/* cr0: len vs 16 */
	cmpldi	cr1,r5,3328		/* cr1: len vs VMX threshold */

	/*
	 * Stash the original arguments below the stack pointer so the fixup
	 * handlers (and .Lvmx_copy after its call) can reload them.
	 */
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

	blt	.Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
	bgt	cr1,.Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif

/*
 * Integer-register (non-VMX) copy.
 *
 *   r3 = destination cursor, r4 = source cursor, r5 = bytes remaining.
 *
 * Stages:
 *   1. copy 1/2/4 bytes as needed so the source becomes 8B aligned
 *   2. if at least 128B remain, copy 128B per iteration using r0,
 *      r6-r12 and r14-r21 (non-volatile registers are saved in a
 *      temporary stack frame)
 *   3. copy 64/32/16B chunks selected by the bits of the remainder
 *   4. .Lshort_copy: copy the final 8/4/2/1 bytes
 *
 * Throughout, "mtocrf 0x01,rX" places the low four bits of rX in cr7 and
 * the following "bf cr7*4+n" tests skip a chunk whose bit is clear
 * (n=3 is the least significant bit).
 *
 * User accesses outside the 128B loop carry err1 (no frame pushed);
 * those inside the loop carry err2 (frame pushed, r14-r22 saved).
 */
.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)		/* r6 = bytes needed to reach 8B alignment */

	bf	cr7*4+3,1f
err1;	lbz	r0,0(r4)
	addi	r4,r4,1
err1;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6		/* account for the alignment bytes */
	cmpldi	r5,128
	blt	5f			/* less than one cacheline left: skip the loop */

	/* Push a frame and save the non-volatile registers used by the loop */
	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7			/* number of whole 128B blocks */
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r7,16(r4)
err2;	ld	r8,24(r4)
err2;	ld	r9,32(r4)
err2;	ld	r10,40(r4)
err2;	ld	r11,48(r4)
err2;	ld	r12,56(r4)
err2;	ld	r14,64(r4)
err2;	ld	r15,72(r4)
err2;	ld	r16,80(r4)
err2;	ld	r17,88(r4)
err2;	ld	r18,96(r4)
err2;	ld	r19,104(r4)
err2;	ld	r20,112(r4)
err2;	ld	r21,120(r4)
	addi	r4,r4,128
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r7,16(r3)
err2;	std	r8,24(r3)
err2;	std	r9,32(r3)
err2;	std	r10,40(r3)
err2;	std	r11,48(r3)
err2;	std	r12,56(r3)
err2;	std	r14,64(r3)
err2;	std	r15,72(r3)
err2;	std	r16,80(r3)
err2;	std	r17,88(r3)
err2;	std	r18,96(r3)
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)		/* r5 = remainder below 128 */

	/* Restore the non-volatile registers and pop the frame */
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4			/* cr7 bits now select 64/32/16B chunks */
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
err1;	ld	r9,32(r4)
err1;	ld	r10,40(r4)
err1;	ld	r11,48(r4)
err1;	ld	r12,56(r4)
	addi	r4,r4,64
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
err1;	std	r9,32(r3)
err1;	std	r10,40(r3)
err1;	std	r11,48(r3)
err1;	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
	addi	r4,r4,32
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
	addi	r4,r4,16
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)		/* r5 = remainder below 16 */

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5			/* cr7 bits select 8/4/2/1 byte pieces */
	bf	cr7*4+0,12f
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz	r6,4(r4)
	addi	r4,r4,8
err1;	stw	r0,0(r3)
err1;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)

15:	li	r3,0			/* success: return 0 */
	blr

/*
 * Reached from .Lvmx_copy when enter_vmx_usercopy returned 0: pop the
 * frame that .Lvmx_copy pushed and fall back to the non-VMX copy.  At
 * that point r3/r4/r5 have already been reloaded with the original
 * arguments.
 */
.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

/*
 * VMX copy, entered from the function prologue when len > 3328.
 *
 *   r3 = destination cursor, r4 = source cursor, r5 = bytes remaining.
 *
 * Sequence:
 *   - push a frame and call enter_vmx_usercopy; a zero return means VMX
 *     must not be used and the code unwinds to the non-VMX path
 *   - program the prefetch streams for source and destination
 *   - if source and destination differ in their low four address bits,
 *     divert to .Lvmx_unaligned_copy
 *   - otherwise: align the destination to 16B, then to 128B, copy whole
 *     128B cachelines with lvx/stvx, then mop up the tail
 *   - pop the frame and tail-call exit_vmx_usercopy
 *
 * err3 marks user accesses made while only the frame is pushed; err4
 * marks those inside the 128B loop, where r14-r16 are also saved.
 *
 * The #ifdef CONFIG_ALTIVEC opened below is closed at the end of
 * .Lvmx_unaligned_copy.
 */
.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
	mflr	r0
	std	r0,16(r1)		/* save LR before pushing the frame */
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	CFUNC(enter_vmx_usercopy)
	cmpwi	cr1,r3,0		/* cr1.eq set if the call returned 0 */
	ld	r0,STACKFRAMESIZE+16(r1)
	/* Reload the original arguments stored at function entry */
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7			/* source rounded down to 128B */
	clrrdi	r9,r3,7			/* destination rounded down to 128B */
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	DCBT_SETUP_STREAMS(r6, r7, r9, r10, r8)

	/* enter_vmx_usercopy returned 0: fall back to the integer copy */
	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)		/* low 4 bits of (src ^ dst) */
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6			/* cr7 bits select 8/4/2/1 byte pieces */
	clrldi	r6,r6,(64-4)		/* r6 = bytes needed to reach 16B alignment */

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7			/* cr7 bits select 64/32/16B chunks */
	clrldi	r6,r6,(64-7)		/* r6 = bytes needed to reach 128B alignment */

	/* Index registers for lvx/stvx at +16/+32/+48 */
	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7			/* number of whole 128B cachelines */

	/* r14-r16 are used as extra index registers; save them first */
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,0,r4
err4;	lvx	v6,r4,r9
err4;	lvx	v5,r4,r10
err4;	lvx	v4,r4,r11
err4;	lvx	v3,r4,r12
err4;	lvx	v2,r4,r14
err4;	lvx	v1,r4,r15
err4;	lvx	v0,r4,r16
	addi	r4,r4,128
err4;	stvx	v7,0,r3
err4;	stvx	v6,r3,r9
err4;	stvx	v5,r3,r10
err4;	stvx	v4,r3,r11
err4;	stvx	v3,r3,r12
err4;	stvx	v2,r3,r14
err4;	stvx	v1,r3,r15
err4;	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6			/* cr7 bits select 64/32/16B chunks */

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5			/* cr7 bits select 8/4/2/1 byte pieces */
	bf	cr7*4+0,12f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

	/* Pop the frame; exit_vmx_usercopy returns directly to our caller */
15:	addi	r1,r1,STACKFRAMESIZE
	b	CFUNC(exit_vmx_usercopy)	/* tail call optimise */

/*
 * VMX copy for the case where source and destination differ in their
 * low four address bits (branched to from .Lvmx_copy with the frame
 * already pushed).
 *
 *   r3 = destination cursor, r4 = source cursor, r5 = bytes remaining.
 *
 * After the destination is 16B aligned, LVS builds a permute control
 * vector (v16) from the source address.  The source is then read one
 * quadword ahead: v0 always holds the most recently loaded quadword, and
 * each output quadword is VPERM(previous, next, v16).  Because of that
 * read-ahead r4 runs 16 bytes in front of the bytes actually consumed;
 * label 11 subtracts the 16 again before the scalar tail.
 *
 * err3 marks user accesses made while only the frame is pushed; err4
 * marks those inside the 128B loop, where r14-r16 are also saved.
 *
 * The closing #endif matches the #ifdef CONFIG_ALTIVEC that follows the
 * .Lvmx_copy label.
 */
.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6			/* cr7 bits select 8/4/2/1 byte pieces */
	clrldi	r6,r6,(64-4)		/* r6 = bytes needed to reach 16B alignment */

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r7,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7			/* cr7 bits select 64/32/16B chunks */
	clrldi	r6,r6,(64-7)		/* r6 = bytes needed to reach 128B alignment */

	/* Index registers for lvx/stvx at +16/+32/+48 */
	li	r9,16
	li	r10,32
	li	r11,48

	LVS(v16,0,r4)		/* Setup permute control vector */
err3;	lvx	v0,0,r4			/* prime v0 with the first source quadword */
	addi	r4,r4,16		/* r4 now runs 16B ahead */

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16
	vor	v0,v1,v1		/* carry the last loaded quadword forward in v0 */

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7			/* number of whole 128B cachelines */

	/* r14-r16 are used as extra index registers; save them first */
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,0,r4
	VPERM(v8,v0,v7,v16)
err4;	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
err4;	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
err4;	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
err4;	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
err4;	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
err4;	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
err4;	lvx	v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
err4;	stvx	v8,0,r3
err4;	stvx	v9,r3,r9
err4;	stvx	v10,r3,r10
err4;	stvx	v11,r3,r11
err4;	stvx	v12,r3,r12
err4;	stvx	v13,r3,r14
err4;	stvx	v14,r3,r15
err4;	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6			/* cr7 bits select 64/32/16B chunks */

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5			/* cr7 bits select 8/4/2/1 byte pieces */
	bf	cr7*4+0,12f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r6,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

	/* Pop the frame; exit_vmx_usercopy returns directly to our caller */
15:	addi	r1,r1,STACKFRAMESIZE
	b	CFUNC(exit_vmx_usercopy)	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */
