xref: /linux/tools/testing/selftests/powerpc/copyloops/memcpy_power7.S (revision da1d9caf95def6f0320819cf941c9fd1069ba9e1)
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 *
4 * Copyright (C) IBM Corporation, 2012
5 *
6 * Author: Anton Blanchard <anton@au.ibm.com>
7 */
8#include <asm/ppc_asm.h>
9
/*
 * Default for SELFTEST_CASE when it has not been defined already. The value
 * is assigned to test_feature immediately before the CPU_FTR_ALTIVEC feature
 * section in memcpy_power7.
 */
10#ifndef SELFTEST_CASE
11/* 0 == don't use VMX, 1 == use VMX */
12#define SELFTEST_CASE	0
13#endif
14
/*
 * Endian-dependent wrappers used by the unaligned VMX copy path.
 *
 * LVS builds the permute control vector: lvsl when __BIG_ENDIAN__ is
 * defined, lvsr otherwise.
 * VPERM merges two adjacent quadwords with that control vector; when
 * __BIG_ENDIAN__ is not defined the two source vector operands (VRA, VRB)
 * are passed to vperm in swapped order.
 */
15#ifdef __BIG_ENDIAN__
16#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
17#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
18#else
19#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
20#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
21#endif
22
/*
 * memcpy_power7: copy r5 bytes from the address in r4 to the address in r3.
 *
 * The incoming r3 is stashed in a slot below the stack pointer so that the
 * exit paths can reload it once r3 has been advanced by the copy.
 *
 * Dispatch:
 *   length < 16    -> .Lshort_copy
 *   length > 4096  -> .Lvmx_copy (the branch sits inside a feature section
 *                     keyed on CPU_FTR_ALTIVEC and is only assembled when
 *                     CONFIG_ALTIVEC is defined)
 *   otherwise      -> falls through to .Lnonvmx_copy
 */
23_GLOBAL(memcpy_power7)
24	cmpldi	r5,16		/* cr0: length vs 16 */
25	cmpldi	cr1,r5,4096	/* cr1: length vs 4096, consumed by the bgt below */
26	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* remember original destination */
27	blt	.Lshort_copy
28
29#ifdef CONFIG_ALTIVEC
30test_feature = SELFTEST_CASE
31BEGIN_FTR_SECTION
32	bgt	cr1, .Lvmx_copy
33END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
34#endif
35
/*
 * Scalar (GPR-only) copy path.
 *
 * Reached by fall-through from the entry code and from
 * .Lunwind_stack_nonvmx_copy. On entry r3 = destination, r4 = source,
 * r5 = remaining length.
 *
 * Step 1: copy 1, 2 and/or 4 bytes so that the source pointer becomes
 *         8-byte aligned. mtocrf 0x01 places the low four bits of its
 *         source register in cr7, so each "bf cr7*4+n" tests one bit of the
 *         misalignment count.
 * Step 2: if at least 128 bytes remain, build a stack frame, save the
 *         non-volatile GPRs used as data registers and copy 128 bytes per
 *         iteration.
 *
 * Execution continues at local label 5 (the "up to 127B" tail) that follows
 * this block.
 */
36.Lnonvmx_copy:
37	/* Get the source 8B aligned */
38	neg	r6,r4
39	mtocrf	0x01,r6		/* cr7 = low 4 bits of -source */
40	clrldi	r6,r6,(64-3)	/* r6 = number of bytes needed to reach 8B alignment */
41
42	bf	cr7*4+3,1f	/* bit value 1: copy one byte */
43	lbz	r0,0(r4)
44	addi	r4,r4,1
45	stb	r0,0(r3)
46	addi	r3,r3,1
47
481:	bf	cr7*4+2,2f	/* bit value 2: copy a halfword */
49	lhz	r0,0(r4)
50	addi	r4,r4,2
51	sth	r0,0(r3)
52	addi	r3,r3,2
53
542:	bf	cr7*4+1,3f	/* bit value 4: copy a word */
55	lwz	r0,0(r4)
56	addi	r4,r4,4
57	stw	r0,0(r3)
58	addi	r3,r3,4
59
603:	sub	r5,r5,r6	/* account for the alignment bytes just copied */
61	cmpldi	r5,128
62	blt	5f		/* less than one 128B block left: skip the big loop */
63
	/*
	 * Create a stack frame and save the non-volatile registers.
	 * NOTE(review): r22 is saved and restored here but is not otherwise
	 * referenced on this path, and LR is stored but never reloaded (no call
	 * is made between here and the blr) - confirm whether both are
	 * intentional.
	 */
64	mflr	r0
65	stdu	r1,-STACKFRAMESIZE(r1)
66	std	r14,STK_REG(R14)(r1)
67	std	r15,STK_REG(R15)(r1)
68	std	r16,STK_REG(R16)(r1)
69	std	r17,STK_REG(R17)(r1)
70	std	r18,STK_REG(R18)(r1)
71	std	r19,STK_REG(R19)(r1)
72	std	r20,STK_REG(R20)(r1)
73	std	r21,STK_REG(R21)(r1)
74	std	r22,STK_REG(R22)(r1)
75	std	r0,STACKFRAMESIZE+16(r1)	/* LR goes at offset 16 of the caller's frame */
76
77	srdi	r6,r5,7		/* r6 = number of whole 128B blocks (>= 1 here) */
78	mtctr	r6
79
80	/* Now do cacheline (128B) sized loads and stores. */
81	.align	5
	/* Sixteen 8-byte loads are issued before the sixteen matching stores. */
824:
83	ld	r0,0(r4)
84	ld	r6,8(r4)
85	ld	r7,16(r4)
86	ld	r8,24(r4)
87	ld	r9,32(r4)
88	ld	r10,40(r4)
89	ld	r11,48(r4)
90	ld	r12,56(r4)
91	ld	r14,64(r4)
92	ld	r15,72(r4)
93	ld	r16,80(r4)
94	ld	r17,88(r4)
95	ld	r18,96(r4)
96	ld	r19,104(r4)
97	ld	r20,112(r4)
98	ld	r21,120(r4)
99	addi	r4,r4,128
100	std	r0,0(r3)
101	std	r6,8(r3)
102	std	r7,16(r3)
103	std	r8,24(r3)
104	std	r9,32(r3)
105	std	r10,40(r3)
106	std	r11,48(r3)
107	std	r12,56(r3)
108	std	r14,64(r3)
109	std	r15,72(r3)
110	std	r16,80(r3)
111	std	r17,88(r3)
112	std	r18,96(r3)
113	std	r19,104(r3)
114	std	r20,112(r3)
115	std	r21,120(r3)
116	addi	r3,r3,128
117	bdnz	4b
118
119	clrldi	r5,r5,(64-7)	/* r5 = bytes left after the 128B blocks (0..127) */
120
	/* Restore the saved registers and pop the stack frame. */
121	ld	r14,STK_REG(R14)(r1)
122	ld	r15,STK_REG(R15)(r1)
123	ld	r16,STK_REG(R16)(r1)
124	ld	r17,STK_REG(R17)(r1)
125	ld	r18,STK_REG(R18)(r1)
126	ld	r19,STK_REG(R19)(r1)
127	ld	r20,STK_REG(R20)(r1)
128	ld	r21,STK_REG(R21)(r1)
129	ld	r22,STK_REG(R22)(r1)
130	addi	r1,r1,STACKFRAMESIZE
131
	/*
	 * Scalar tail of the non-VMX path, followed by the common short-copy
	 * code and the function return.
	 *
	 * At local label 5, r5 holds fewer than 128 bytes. r5 >> 4 is moved
	 * into cr7 so that single bits select a 64-, 32- and 16-byte step.
	 * The remaining 0..15 bytes are handled at .Lshort_copy, which is also
	 * the direct target for calls with length < 16; there the low four
	 * bits of r5 select an 8-, 4-, 2- and 1-byte step.
	 *
	 * On exit r3 is reloaded with the destination pointer saved at entry.
	 */
132	/* Up to 127B to go */
1335:	srdi	r6,r5,4
134	mtocrf	0x01,r6		/* cr7 = low 4 bits of (length / 16) */
135
1366:	bf	cr7*4+1,7f	/* 64-byte step */
137	ld	r0,0(r4)
138	ld	r6,8(r4)
139	ld	r7,16(r4)
140	ld	r8,24(r4)
141	ld	r9,32(r4)
142	ld	r10,40(r4)
143	ld	r11,48(r4)
144	ld	r12,56(r4)
145	addi	r4,r4,64
146	std	r0,0(r3)
147	std	r6,8(r3)
148	std	r7,16(r3)
149	std	r8,24(r3)
150	std	r9,32(r3)
151	std	r10,40(r3)
152	std	r11,48(r3)
153	std	r12,56(r3)
154	addi	r3,r3,64
155
156	/* Up to 63B to go */
1577:	bf	cr7*4+2,8f	/* 32-byte step */
158	ld	r0,0(r4)
159	ld	r6,8(r4)
160	ld	r7,16(r4)
161	ld	r8,24(r4)
162	addi	r4,r4,32
163	std	r0,0(r3)
164	std	r6,8(r3)
165	std	r7,16(r3)
166	std	r8,24(r3)
167	addi	r3,r3,32
168
169	/* Up to 31B to go */
1708:	bf	cr7*4+3,9f	/* 16-byte step */
171	ld	r0,0(r4)
172	ld	r6,8(r4)
173	addi	r4,r4,16
174	std	r0,0(r3)
175	std	r6,8(r3)
176	addi	r3,r3,16
177
1789:	clrldi	r5,r5,(64-4)	/* keep only the final 0..15 bytes */
179
180	/* Up to 15B to go */
181.Lshort_copy:
182	mtocrf	0x01,r5		/* cr7 = low 4 bits of the remaining length */
183	bf	cr7*4+0,12f	/* 8-byte step, done as two word copies */
184	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
185	lwz	r6,4(r4)
186	addi	r4,r4,8
187	stw	r0,0(r3)
188	stw	r6,4(r3)
189	addi	r3,r3,8
190
19112:	bf	cr7*4+1,13f	/* 4-byte step */
192	lwz	r0,0(r4)
193	addi	r4,r4,4
194	stw	r0,0(r3)
195	addi	r3,r3,4
196
19713:	bf	cr7*4+2,14f	/* 2-byte step */
198	lhz	r0,0(r4)
199	addi	r4,r4,2
200	sth	r0,0(r3)
201	addi	r3,r3,2
202
20314:	bf	cr7*4+3,15f	/* final single byte */
204	lbz	r0,0(r4)
205	stb	r0,0(r3)
206
20715:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* return the original destination */
208	blr
209
/*
 * Fallback from .Lvmx_copy, taken when enter_vmx_ops returned 0: pop the
 * stack frame that .Lvmx_copy created and continue with the scalar copy.
 * r3, r4 and r5 were already reloaded by .Lvmx_copy before the branch here.
 */
210.Lunwind_stack_nonvmx_copy:
211	addi	r1,r1,STACKFRAMESIZE
212	b	.Lnonvmx_copy
213
/*
 * VMX copy entry (the branch here is taken for lengths above 4096).
 *
 * Saves the source pointer, length and LR, creates a stack frame and calls
 * enter_vmx_ops. The result of that call is compared with zero into cr1,
 * and the arguments are reloaded from the stack (r3 comes from the slot
 * written at function entry). Prefetch streams are then started for both
 * buffers. If enter_vmx_ops returned 0 the code falls back to the scalar
 * path through .Lunwind_stack_nonvmx_copy.
 */
214.Lvmx_copy:
215#ifdef CONFIG_ALTIVEC
216	mflr	r0
217	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)	/* save source */
218	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)	/* save length */
219	std	r0,16(r1)				/* save LR in the caller's frame */
220	stdu	r1,-STACKFRAMESIZE(r1)
221	bl	enter_vmx_ops
222	cmpwi	cr1,r3,0	/* cr1.eq set when enter_vmx_ops returned 0; tested after the prefetch setup */
223	ld	r0,STACKFRAMESIZE+16(r1)
224	ld	r3,STK_REG(R31)(r1)
225	ld	r4,STK_REG(R30)(r1)
226	ld	r5,STK_REG(R29)(r1)
227	mtlr	r0
228
229	/*
230	 * We prefetch both the source and destination using enhanced touch
231	 * instructions. We use a stream ID of 0 for the load side and
232	 * 1 for the store side.
233	 */
234	clrrdi	r6,r4,7		/* source rounded down to a 128B boundary */
235	clrrdi	r9,r3,7		/* destination rounded down to a 128B boundary */
236	ori	r9,r9,1		/* stream=1 */
237
238	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
239	cmpldi	r7,0x3FF
240	ble	1f
241	li	r7,0x3FF
2421:	lis	r0,0x0E00	/* depth=7 */
243	sldi	r7,r7,7
244	or	r7,r7,r0
245	ori	r10,r7,1	/* stream=1 */
246
247	lis	r8,0x8000	/* GO=1 */
248	clrldi	r8,r8,32	/* drop the upper 32 bits produced by lis sign extension */
249
250	dcbt	0,r6,0b01000
251	dcbt	0,r7,0b01010
252	dcbtst	0,r9,0b01000
253	dcbtst	0,r10,0b01010
254	eieio
255	dcbt	0,r8,0b01010	/* GO */
256
257	beq	cr1,.Lunwind_stack_nonvmx_copy	/* enter_vmx_ops returned 0: use the scalar path */
258
	/*
	 * VMX copy for the case where source and destination share the same
	 * offset within a 16-byte quadword (low four bits of r3 ^ r4 are zero).
	 * Otherwise control transfers to .Lvmx_unaligned_copy.
	 *
	 * Phases:
	 *   1. scalar steps of 1/2/4/8 bytes until the destination is 16B aligned
	 *   2. vector steps of 16/32/64 bytes until the destination is 128B aligned
	 *   3. main loop: 128 bytes per iteration through v0-v7
	 *   4. vector tail of 64/32/16 bytes, then scalar tail of 8/4/2/1 bytes
	 *
	 * r9-r12 and r14-r16 hold the constant index offsets 16..112 for
	 * lvx/stvx; r14-r16 are saved in the frame created by .Lvmx_copy and
	 * restored after the main loop.
	 */
259	/*
260	 * If source and destination are not relatively aligned we use a
261	 * slower permute loop.
262	 */
263	xor	r6,r4,r3
264	rldicl.	r6,r6,0,(64-4)	/* keep the low 4 bits and set cr0 */
265	bne	.Lvmx_unaligned_copy
266
267	/* Get the destination 16B aligned */
268	neg	r6,r3
269	mtocrf	0x01,r6		/* cr7 = low 4 bits of -destination */
270	clrldi	r6,r6,(64-4)	/* r6 = bytes needed to reach 16B alignment */
271
272	bf	cr7*4+3,1f
273	lbz	r0,0(r4)
274	addi	r4,r4,1
275	stb	r0,0(r3)
276	addi	r3,r3,1
277
2781:	bf	cr7*4+2,2f
279	lhz	r0,0(r4)
280	addi	r4,r4,2
281	sth	r0,0(r3)
282	addi	r3,r3,2
283
2842:	bf	cr7*4+1,3f
285	lwz	r0,0(r4)
286	addi	r4,r4,4
287	stw	r0,0(r3)
288	addi	r3,r3,4
289
2903:	bf	cr7*4+0,4f
291	ld	r0,0(r4)
292	addi	r4,r4,8
293	std	r0,0(r3)
294	addi	r3,r3,8
295
2964:	sub	r5,r5,r6	/* subtract the bytes used for 16B alignment */
297
298	/* Get the destination 128B aligned */
299	neg	r6,r3
300	srdi	r7,r6,4
301	mtocrf	0x01,r7		/* cr7 bits select the 16/32/64-byte steps below */
302	clrldi	r6,r6,(64-7)	/* r6 = bytes needed to reach 128B alignment */
303
304	li	r9,16
305	li	r10,32
306	li	r11,48
307
308	bf	cr7*4+3,5f	/* 16-byte step */
309	lvx	v1,0,r4
310	addi	r4,r4,16
311	stvx	v1,0,r3
312	addi	r3,r3,16
313
3145:	bf	cr7*4+2,6f	/* 32-byte step */
315	lvx	v1,0,r4
316	lvx	v0,r4,r9
317	addi	r4,r4,32
318	stvx	v1,0,r3
319	stvx	v0,r3,r9
320	addi	r3,r3,32
321
3226:	bf	cr7*4+1,7f	/* 64-byte step */
323	lvx	v3,0,r4
324	lvx	v2,r4,r9
325	lvx	v1,r4,r10
326	lvx	v0,r4,r11
327	addi	r4,r4,64
328	stvx	v3,0,r3
329	stvx	v2,r3,r9
330	stvx	v1,r3,r10
331	stvx	v0,r3,r11
332	addi	r3,r3,64
333
3347:	sub	r5,r5,r6	/* subtract the bytes used for 128B alignment */
335	srdi	r6,r5,7		/* r6 = number of whole 128B blocks */
336
337	std	r14,STK_REG(R14)(r1)
338	std	r15,STK_REG(R15)(r1)
339	std	r16,STK_REG(R16)(r1)
340
341	li	r12,64
342	li	r14,80
343	li	r15,96
344	li	r16,112
345
346	mtctr	r6
347
348	/*
349	 * Now do cacheline sized loads and stores. By this stage the
350	 * cacheline stores are also cacheline aligned.
351	 */
352	.align	5
3538:
354	lvx	v7,0,r4
355	lvx	v6,r4,r9
356	lvx	v5,r4,r10
357	lvx	v4,r4,r11
358	lvx	v3,r4,r12
359	lvx	v2,r4,r14
360	lvx	v1,r4,r15
361	lvx	v0,r4,r16
362	addi	r4,r4,128
363	stvx	v7,0,r3
364	stvx	v6,r3,r9
365	stvx	v5,r3,r10
366	stvx	v4,r3,r11
367	stvx	v3,r3,r12
368	stvx	v2,r3,r14
369	stvx	v1,r3,r15
370	stvx	v0,r3,r16
371	addi	r3,r3,128
372	bdnz	8b
373
374	ld	r14,STK_REG(R14)(r1)
375	ld	r15,STK_REG(R15)(r1)
376	ld	r16,STK_REG(R16)(r1)
377
378	/* Up to 127B to go */
379	clrldi	r5,r5,(64-7)
380	srdi	r6,r5,4
381	mtocrf	0x01,r6		/* cr7 bits select the 64/32/16-byte steps below */
382
383	bf	cr7*4+1,9f
384	lvx	v3,0,r4
385	lvx	v2,r4,r9
386	lvx	v1,r4,r10
387	lvx	v0,r4,r11
388	addi	r4,r4,64
389	stvx	v3,0,r3
390	stvx	v2,r3,r9
391	stvx	v1,r3,r10
392	stvx	v0,r3,r11
393	addi	r3,r3,64
394
3959:	bf	cr7*4+2,10f
396	lvx	v1,0,r4
397	lvx	v0,r4,r9
398	addi	r4,r4,32
399	stvx	v1,0,r3
400	stvx	v0,r3,r9
401	addi	r3,r3,32
402
40310:	bf	cr7*4+3,11f
404	lvx	v1,0,r4
405	addi	r4,r4,16
406	stvx	v1,0,r3
407	addi	r3,r3,16
408
409	/* Up to 15B to go */
41011:	clrldi	r5,r5,(64-4)
411	mtocrf	0x01,r5		/* cr7 bits select the 8/4/2/1-byte steps below */
412	bf	cr7*4+0,12f
413	ld	r0,0(r4)
414	addi	r4,r4,8
415	std	r0,0(r3)
416	addi	r3,r3,8
417
41812:	bf	cr7*4+1,13f
419	lwz	r0,0(r4)
420	addi	r4,r4,4
421	stw	r0,0(r3)
422	addi	r3,r3,4
423
42413:	bf	cr7*4+2,14f
425	lhz	r0,0(r4)
426	addi	r4,r4,2
427	sth	r0,0(r3)
428	addi	r3,r3,2
429
43014:	bf	cr7*4+3,15f
431	lbz	r0,0(r4)
432	stb	r0,0(r3)
433
	/*
	 * Pop the frame created in .Lvmx_copy and reload the original
	 * destination into r3 before branching to exit_vmx_ops.
	 * NOTE(review): presumably exit_vmx_ops hands r3 back to the caller as
	 * the memcpy return value - confirm against its definition.
	 */
43415:	addi	r1,r1,STACKFRAMESIZE
435	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
436	b	exit_vmx_ops		/* tail call optimise */
437
/*
 * VMX copy for the case where source and destination have different offsets
 * within a 16-byte quadword.
 *
 * The destination is aligned exactly as in the aligned path (scalar steps to
 * 16B, then vector steps to 128B). Each stored quadword is assembled with
 * VPERM from two neighbouring source quadwords, using the control vector in
 * v16 produced by LVS.
 *
 * The most recently loaded source quadword is carried in v0 from one step to
 * the next. Because one quadword is preloaded, r4 runs 16 bytes ahead of the
 * data actually consumed; this is undone before the final scalar tail.
 *
 * v8-v15 receive the permuted results. r14-r16 are saved and restored around
 * the main loop, where they hold index offsets.
 */
438.Lvmx_unaligned_copy:
439	/* Get the destination 16B aligned */
440	neg	r6,r3
441	mtocrf	0x01,r6		/* cr7 = low 4 bits of -destination */
442	clrldi	r6,r6,(64-4)	/* r6 = bytes needed to reach 16B alignment */
443
444	bf	cr7*4+3,1f
445	lbz	r0,0(r4)
446	addi	r4,r4,1
447	stb	r0,0(r3)
448	addi	r3,r3,1
449
4501:	bf	cr7*4+2,2f
451	lhz	r0,0(r4)
452	addi	r4,r4,2
453	sth	r0,0(r3)
454	addi	r3,r3,2
455
4562:	bf	cr7*4+1,3f
457	lwz	r0,0(r4)
458	addi	r4,r4,4
459	stw	r0,0(r3)
460	addi	r3,r3,4
461
4623:	bf	cr7*4+0,4f
463	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
464	lwz	r7,4(r4)
465	addi	r4,r4,8
466	stw	r0,0(r3)
467	stw	r7,4(r3)
468	addi	r3,r3,8
469
4704:	sub	r5,r5,r6	/* subtract the bytes used for 16B alignment */
471
472	/* Get the destination 128B aligned */
473	neg	r6,r3
474	srdi	r7,r6,4
475	mtocrf	0x01,r7		/* cr7 bits select the 16/32/64-byte steps below */
476	clrldi	r6,r6,(64-7)	/* r6 = bytes needed to reach 128B alignment */
477
478	li	r9,16
479	li	r10,32
480	li	r11,48
481
482	LVS(v16,0,r4)		/* Setup permute control vector */
483	lvx	v0,0,r4		/* preload the first source quadword */
484	addi	r4,r4,16	/* r4 now runs 16 bytes ahead */
485
486	bf	cr7*4+3,5f	/* 16-byte step */
487	lvx	v1,0,r4
488	VPERM(v8,v0,v1,v16)
489	addi	r4,r4,16
490	stvx	v8,0,r3
491	addi	r3,r3,16
492	vor	v0,v1,v1	/* carry the last loaded quadword forward in v0 */
493
4945:	bf	cr7*4+2,6f	/* 32-byte step */
495	lvx	v1,0,r4
496	VPERM(v8,v0,v1,v16)
497	lvx	v0,r4,r9
498	VPERM(v9,v1,v0,v16)
499	addi	r4,r4,32
500	stvx	v8,0,r3
501	stvx	v9,r3,r9
502	addi	r3,r3,32
503
5046:	bf	cr7*4+1,7f	/* 64-byte step */
505	lvx	v3,0,r4
506	VPERM(v8,v0,v3,v16)
507	lvx	v2,r4,r9
508	VPERM(v9,v3,v2,v16)
509	lvx	v1,r4,r10
510	VPERM(v10,v2,v1,v16)
511	lvx	v0,r4,r11
512	VPERM(v11,v1,v0,v16)
513	addi	r4,r4,64
514	stvx	v8,0,r3
515	stvx	v9,r3,r9
516	stvx	v10,r3,r10
517	stvx	v11,r3,r11
518	addi	r3,r3,64
519
5207:	sub	r5,r5,r6	/* subtract the bytes used for 128B alignment */
521	srdi	r6,r5,7		/* r6 = number of whole 128B blocks */
522
523	std	r14,STK_REG(R14)(r1)
524	std	r15,STK_REG(R15)(r1)
525	std	r16,STK_REG(R16)(r1)
526
527	li	r12,64
528	li	r14,80
529	li	r15,96
530	li	r16,112
531
532	mtctr	r6
533
534	/*
535	 * Now do cacheline sized loads and stores. By this stage the
536	 * cacheline stores are also cacheline aligned.
537	 */
538	.align	5
	/* The last load of each iteration lands in v0, ready for the next VPERM. */
5398:
540	lvx	v7,0,r4
541	VPERM(v8,v0,v7,v16)
542	lvx	v6,r4,r9
543	VPERM(v9,v7,v6,v16)
544	lvx	v5,r4,r10
545	VPERM(v10,v6,v5,v16)
546	lvx	v4,r4,r11
547	VPERM(v11,v5,v4,v16)
548	lvx	v3,r4,r12
549	VPERM(v12,v4,v3,v16)
550	lvx	v2,r4,r14
551	VPERM(v13,v3,v2,v16)
552	lvx	v1,r4,r15
553	VPERM(v14,v2,v1,v16)
554	lvx	v0,r4,r16
555	VPERM(v15,v1,v0,v16)
556	addi	r4,r4,128
557	stvx	v8,0,r3
558	stvx	v9,r3,r9
559	stvx	v10,r3,r10
560	stvx	v11,r3,r11
561	stvx	v12,r3,r12
562	stvx	v13,r3,r14
563	stvx	v14,r3,r15
564	stvx	v15,r3,r16
565	addi	r3,r3,128
566	bdnz	8b
567
568	ld	r14,STK_REG(R14)(r1)
569	ld	r15,STK_REG(R15)(r1)
570	ld	r16,STK_REG(R16)(r1)
571
572	/* Up to 127B to go */
573	clrldi	r5,r5,(64-7)
574	srdi	r6,r5,4
575	mtocrf	0x01,r6		/* cr7 bits select the 64/32/16-byte steps below */
576
577	bf	cr7*4+1,9f
578	lvx	v3,0,r4
579	VPERM(v8,v0,v3,v16)
580	lvx	v2,r4,r9
581	VPERM(v9,v3,v2,v16)
582	lvx	v1,r4,r10
583	VPERM(v10,v2,v1,v16)
584	lvx	v0,r4,r11
585	VPERM(v11,v1,v0,v16)
586	addi	r4,r4,64
587	stvx	v8,0,r3
588	stvx	v9,r3,r9
589	stvx	v10,r3,r10
590	stvx	v11,r3,r11
591	addi	r3,r3,64
592
5939:	bf	cr7*4+2,10f
594	lvx	v1,0,r4
595	VPERM(v8,v0,v1,v16)
596	lvx	v0,r4,r9
597	VPERM(v9,v1,v0,v16)
598	addi	r4,r4,32
599	stvx	v8,0,r3
600	stvx	v9,r3,r9
601	addi	r3,r3,32
602
60310:	bf	cr7*4+3,11f
604	lvx	v1,0,r4
605	VPERM(v8,v0,v1,v16)
606	addi	r4,r4,16
607	stvx	v8,0,r3
608	addi	r3,r3,16
609
610	/* Up to 15B to go */
61111:	clrldi	r5,r5,(64-4)
612	addi	r4,r4,-16	/* Unwind the +16 load offset */
613	mtocrf	0x01,r5		/* cr7 bits select the 8/4/2/1-byte steps below */
614	bf	cr7*4+0,12f
615	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
616	lwz	r6,4(r4)
617	addi	r4,r4,8
618	stw	r0,0(r3)
619	stw	r6,4(r3)
620	addi	r3,r3,8
621
62212:	bf	cr7*4+1,13f
623	lwz	r0,0(r4)
624	addi	r4,r4,4
625	stw	r0,0(r3)
626	addi	r3,r3,4
627
62813:	bf	cr7*4+2,14f
629	lhz	r0,0(r4)
630	addi	r4,r4,2
631	sth	r0,0(r3)
632	addi	r3,r3,2
633
63414:	bf	cr7*4+3,15f
635	lbz	r0,0(r4)
636	stb	r0,0(r3)
637
	/*
	 * Pop the frame created in .Lvmx_copy and reload the original
	 * destination into r3 before branching to exit_vmx_ops.
	 * NOTE(review): presumably exit_vmx_ops hands r3 back to the caller as
	 * the memcpy return value - confirm against its definition.
	 */
63815:	addi	r1,r1,STACKFRAMESIZE
639	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
640	b	exit_vmx_ops		/* tail call optimise */
641#endif /* CONFIG_ALTIVEC */
642