xref: /linux/tools/testing/selftests/powerpc/copyloops/memcpy_power7.S (revision c8bfe3fad4f86a029da7157bae9699c816f0c309)
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

/*
 * SELFTEST_CASE feeds `test_feature` in the memcpy_power7 entry code.
 * It may be predefined by the build; the default selects the non-VMX path.
 */
#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE	0
#endif

/*
 * Endian wrappers for the unaligned (permute) copy loop, so its call
 * sites can be written once for both byte orders:
 *
 *   LVS(VRT,RA,RB)          builds the permute control vector from the
 *                           address RA+RB (lvsl on BE, lvsr on LE).
 *   VPERM(VRT,VRA,VRB,VRC)  permutes two source vectors under control
 *                           VRC; the LE variant swaps VRA and VRB.
 */
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

/*
 * memcpy_power7 - forward byte copy.
 *
 * In:   r3 = destination, r4 = source, r5 = length in bytes
 * Out:  r3 = original destination (reloaded from the stack slot written
 *       below before the scalar path returns)
 *
 * Dispatch:
 *   length <  16    -> .Lshort_copy   (8/4/2/1 byte tail only)
 *   length >  4096  -> .Lvmx_copy     (only when CONFIG_ALTIVEC is set and
 *                                      the CPU_FTR_ALTIVEC feature section
 *                                      is enabled)
 *   otherwise       -> .Lnonvmx_copy  (fall through)
 *
 * The destination is stashed below the stack pointer, at the offset that
 * becomes STK_REG(R31) once a frame of STACKFRAMESIZE has been pushed.
 */
_GLOBAL(memcpy_power7)
	cmpldi	r5,16			/* cr0: tiny copy? */
	cmpldi	cr1,r5,4096		/* cr1: large enough for VMX? */
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* keep dest for return */
	blt	.Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
	bgt	cr1, .Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif

/*
 * Scalar (GPR only) copy path.
 *
 * r3 = destination cursor, r4 = source cursor, r5 = bytes remaining.
 * Reached with r5 >= 16 from the entry code, or from
 * .Lunwind_stack_nonvmx_copy with the original arguments restored.
 *
 * Throughout, `mtocrf 0x01,rX` copies the low four bits of rX into cr7,
 * so `bf cr7*4+3` / `+2` / `+1` / `+0` skip a step when the bit worth
 * 1 / 2 / 4 / 8 of rX is clear.
 */
.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)		/* r6 = 0..7 bytes to reach alignment */

	bf	cr7*4+3,1f		/* copy 1 byte */
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f		/* copy 2 bytes */
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f		/* copy 4 bytes */
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6		/* account for the alignment bytes */
	cmpldi	r5,128
	blt	5f			/* less than one cacheline left */

	/*
	 * Push a frame and save the non-volatile GPRs. The loop below uses
	 * r14-r21 as data temporaries; r22 is saved and restored as well
	 * although this path does not modify it. LR goes into the caller's
	 * frame (offset 16 from the old r1) and is not reloaded here, since
	 * this path makes no calls.
	 */
	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7			/* number of whole 128B chunks */
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	ld	r14,64(r4)
	ld	r15,72(r4)
	ld	r16,80(r4)
	ld	r17,88(r4)
	ld	r18,96(r4)
	ld	r19,104(r4)
	ld	r20,112(r4)
	ld	r21,120(r4)
	addi	r4,r4,128
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	std	r14,64(r3)
	std	r15,72(r3)
	std	r16,80(r3)
	std	r17,88(r3)
	std	r18,96(r3)
	std	r19,104(r3)
	std	r20,112(r3)
	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)		/* r5 = remaining length mod 128 */

	/* Restore the non-volatile GPRs and pop the frame */
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4			/* cr7 <- length bits worth 64/32/16 */
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f		/* copy 64 bytes */
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	addi	r4,r4,64
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f		/* copy 32 bytes */
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	addi	r4,r4,32
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f		/* copy 16 bytes */
	ld	r0,0(r4)
	ld	r6,8(r4)
	addi	r4,r4,16
	std	r0,0(r3)
	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)		/* r5 = remaining length mod 16 */

	/* Up to 15B to go: fall through into the short copy tail */
/*
 * Copy the final 0..15 bytes and return.
 *
 * r3 = destination cursor, r4 = source cursor; only the low four bits of
 * r5 are examined. They are moved into cr7, and each of the bits worth
 * 8, 4, 2 and 1 selects one fixed-size step. On exit r3 is reloaded with
 * the destination pointer that the entry code stored below the stack
 * pointer.
 */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f		/* copy 8 bytes */
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f		/* copy 4 bytes */
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f		/* copy 2 bytes */
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f		/* copy 1 byte */
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* return original dest */
	blr

/*
 * Fallback taken from .Lvmx_copy when enter_vmx_ops returned 0: pop the
 * frame that was pushed around that call and run the scalar copy instead.
 * By this point r3/r4/r5 have already been reloaded with the original
 * arguments.
 */
.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

/*
 * VMX (Altivec) copy path, taken for lengths above 4096 bytes.
 *
 * In: r3 = destination, r4 = source, r5 = length; the destination has
 * already been stored at -STACKFRAMESIZE+STK_REG(R31)(r1) by the entry
 * code. A stack frame is pushed here and stays in place until label 15,
 * or until .Lunwind_stack_nonvmx_copy pops it.
 *
 * This block handles the case where source and destination share the same
 * offset within a 16B quadword; otherwise it branches to
 * .Lvmx_unaligned_copy.
 *
 * `mtocrf 0x01,rX` copies the low four bits of rX into cr7, so
 * `bf cr7*4+3` / `+2` / `+1` / `+0` skip a step when the bit worth
 * 1 / 2 / 4 / 8 of rX is clear.
 */
.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
	/* Save LR and the arguments, then ask for permission to use VMX */
	mflr	r0
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	CFUNC(enter_vmx_ops)
	cmpwi	cr1,r3,0		/* cr1.eq set: VMX not available */
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)	/* reload dest, src, length */
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7		/* source rounded down to 128B */
	clrrdi	r9,r3,7		/* destination rounded down to 128B */
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	DCBT_SETUP_STREAMS(r6, r7, r9, r10, r8)

	/* The streams are set up either way; now act on enter_vmx_ops */
	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)		/* low 4 bits differ? */
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)		/* r6 = 0..15 bytes to reach alignment */

	bf	cr7*4+3,1f		/* copy 1 byte */
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f		/* copy 2 bytes */
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f		/* copy 4 bytes */
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f		/* copy 8 bytes */
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4			/* cr7 <- bits worth 64/32/16 of -dest */
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)		/* r6 = bytes to reach 128B alignment */

	/* Index registers for the lvx/stvx offsets */
	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f		/* copy 16 bytes */
	lvx	v1,0,r4
	addi	r4,r4,16
	stvx	v1,0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f		/* copy 32 bytes */
	lvx	v1,0,r4
	lvx	v0,r4,r9
	addi	r4,r4,32
	stvx	v1,0,r3
	stvx	v0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f		/* copy 64 bytes */
	lvx	v3,0,r4
	lvx	v2,r4,r9
	lvx	v1,r4,r10
	lvx	v0,r4,r11
	addi	r4,r4,64
	stvx	v3,0,r3
	stvx	v2,r3,r9
	stvx	v1,r3,r10
	stvx	v0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7			/* number of whole 128B chunks */

	/* r14-r16 are non-volatile; save them before using them as offsets */
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	v7,0,r4
	lvx	v6,r4,r9
	lvx	v5,r4,r10
	lvx	v4,r4,r11
	lvx	v3,r4,r12
	lvx	v2,r4,r14
	lvx	v1,r4,r15
	lvx	v0,r4,r16
	addi	r4,r4,128
	stvx	v7,0,r3
	stvx	v6,r3,r9
	stvx	v5,r3,r10
	stvx	v4,r3,r11
	stvx	v3,r3,r12
	stvx	v2,r3,r14
	stvx	v1,r3,r15
	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4			/* cr7 <- length bits worth 64/32/16 */
	mtocrf	0x01,r6

	bf	cr7*4+1,9f		/* copy 64 bytes */
	lvx	v3,0,r4
	lvx	v2,r4,r9
	lvx	v1,r4,r10
	lvx	v0,r4,r11
	addi	r4,r4,64
	stvx	v3,0,r3
	stvx	v2,r3,r9
	stvx	v1,r3,r10
	stvx	v0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f		/* copy 32 bytes */
	lvx	v1,0,r4
	lvx	v0,r4,r9
	addi	r4,r4,32
	stvx	v1,0,r3
	stvx	v0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f		/* copy 16 bytes */
	lvx	v1,0,r4
	addi	r4,r4,16
	stvx	v1,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f		/* copy 8 bytes */
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f		/* copy 4 bytes */
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f		/* copy 2 bytes */
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f		/* copy 1 byte */
	lbz	r0,0(r4)
	stb	r0,0(r3)

	/*
	 * Pop the frame, put the original destination in r3 and leave via
	 * exit_vmx_ops. LR was restored after enter_vmx_ops, so the callee
	 * returns straight to our caller.
	 * NOTE(review): this relies on exit_vmx_ops handing r3 back
	 * unchanged as the return value - confirm against its definition.
	 */
15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	CFUNC(exit_vmx_ops)		/* tail call optimise */

/*
 * VMX copy for source and destination that do NOT share the same offset
 * within a 16B quadword. Reached from .Lvmx_copy with the stack frame
 * still pushed, VMX enabled and r3/r4/r5 = dest/source/length.
 *
 * Once the destination is 16B aligned, the source is read one quadword
 * ahead: v0 always holds the previously loaded source quadword, and each
 * output quadword is built by VPERM from (previous, newly loaded) under
 * the control vector v16. Because of the look-ahead, r4 runs 16 bytes
 * past the logical source position until label 11 undoes it.
 * lvx ignores the low four bits of its effective address, so the loads
 * themselves are always quadword aligned.
 *
 * `mtocrf 0x01,rX` copies the low four bits of rX into cr7, so
 * `bf cr7*4+3` / `+2` / `+1` / `+0` skip a step when the bit worth
 * 1 / 2 / 4 / 8 of rX is clear.
 */
.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)		/* r6 = 0..15 bytes to reach alignment */

	bf	cr7*4+3,1f		/* copy 1 byte */
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f		/* copy 2 bytes */
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f		/* copy 4 bytes */
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f		/* copy 8 bytes */
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r7,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4			/* cr7 <- bits worth 64/32/16 of -dest */
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)		/* r6 = bytes to reach 128B alignment */

	/* Index registers for the lvx/stvx offsets */
	li	r9,16
	li	r10,32
	li	r11,48

	LVS(v16,0,r4)		/* Setup permute control vector */
	lvx	v0,0,r4		/* prime v0 with the first source quadword */
	addi	r4,r4,16	/* source cursor now runs 16B ahead */

	bf	cr7*4+3,5f		/* copy 16 bytes */
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
	stvx	v8,0,r3
	addi	r3,r3,16
	vor	v0,v1,v1		/* v0 = last quadword loaded */

5:	bf	cr7*4+2,6f		/* copy 32 bytes */
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
	stvx	v8,0,r3
	stvx	v9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f		/* copy 64 bytes */
	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7			/* number of whole 128B chunks */

	/* r14-r16 are non-volatile; save them before using them as offsets */
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	v7,0,r4
	VPERM(v8,v0,v7,v16)
	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
	lvx	v0,r4,r16		/* v0 carries over to the next pass */
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	stvx	v12,r3,r12
	stvx	v13,r3,r14
	stvx	v14,r3,r15
	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4			/* cr7 <- length bits worth 64/32/16 */
	mtocrf	0x01,r6

	bf	cr7*4+1,9f		/* copy 64 bytes */
	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f		/* copy 32 bytes */
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
	stvx	v8,0,r3
	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f		/* copy 16 bytes */
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
	stvx	v8,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f		/* copy 8 bytes */
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f		/* copy 4 bytes */
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f		/* copy 2 bytes */
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f		/* copy 1 byte */
	lbz	r0,0(r4)
	stb	r0,0(r3)

	/*
	 * Pop the frame, put the original destination in r3 and leave via
	 * exit_vmx_ops.
	 * NOTE(review): this relies on exit_vmx_ops handing r3 back
	 * unchanged as the return value - confirm against its definition.
	 */
15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	CFUNC(exit_vmx_ops)		/* tail call optimise */
#endif /* CONFIG_ALTIVEC */
