/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

_GLOBAL(memcpy_power7)

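/*
 * LVS/VPERM hide the endian difference in the unaligned (vperm based)
 * copy loop below: big endian uses lvsl and vperm with the inputs in
 * source order, little endian uses lvsr and swaps the two inputs.
 */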
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

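/*
 * r3 = destination, r4 = source, r5 = length.  Copies below 16 bytes
 * use the short copy tail; with Altivec, copies above 4096 bytes take
 * the VMX path; everything else falls through to the scalar loop.  The
 * destination is saved below the stack pointer so it can be reloaded
 * into r3 (the memcpy return value) before returning.
 */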
#ifdef CONFIG_ALTIVEC
	cmpldi	r5,16
	cmpldi	cr1,r5,4096

	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)

	blt	.Lshort_copy
	bgt	cr1,.Lvmx_copy
#else
	cmpldi	r5,16

	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)

	blt	.Lshort_copy
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

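	/*
	 * cr7 now holds the low bits of the negated source address, so
	 * the branches below copy 1, 2 and then 4 bytes as needed to
	 * bring the source up to 8 byte alignment.
	 */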
	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

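	/*
	 * At least 128 bytes to copy: set up a stack frame and save the
	 * non-volatile registers used by the unrolled cacheline loop.
	 */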
	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	ld	r14,64(r4)
	ld	r15,72(r4)
	ld	r16,80(r4)
	ld	r17,88(r4)
	ld	r18,96(r4)
	ld	r19,104(r4)
	ld	r20,112(r4)
	ld	r21,120(r4)
	addi	r4,r4,128
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	std	r14,64(r3)
	std	r15,72(r3)
	std	r16,80(r3)
	std	r17,88(r3)
	std	r18,96(r3)
	std	r19,104(r3)
	std	r20,112(r3)
	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6

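	/* cr7 now holds the remaining length in 16 byte units */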
6:	bf	cr7*4+1,7f
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	addi	r4,r4,64
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	addi	r4,r4,32
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
	ld	r0,0(r4)
	ld	r6,8(r4)
	addi	r4,r4,16
	std	r0,0(r3)
	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	blr

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
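	/*
	 * enter_vmx_copy is a C call and may clobber the volatile
	 * registers, so preserve the source, length and LR across it
	 * (the destination was already saved at entry).
	 */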
	mflr	r0
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	enter_vmx_copy
	cmpwi	cr1,r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

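	/*
	 * With these TH values the dcbt/dcbtst hints describe a data
	 * stream rather than a single line: 0b01000 supplies the start
	 * address and stream ID, 0b01010 supplies the stream properties
	 * built up in r7/r10, and the final dcbt with GO set starts the
	 * configured streams.
	 */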
.machine push
.machine "power4"
	dcbt	r0,r6,0b01000
	dcbt	r0,r7,0b01010
	dcbtst	r0,r9,0b01000
	dcbtst	r0,r10,0b01010
	eieio
	dcbt	r0,r8,0b01010	/* GO */
.machine pop

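	/*
	 * enter_vmx_copy returned 0 (tested into cr1 above) if VMX cannot
	 * be used; fall back to the scalar copy now that the prefetch
	 * streams have been started.
	 */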
	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

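	/*
	 * cr7 now holds how many 16 byte blocks (0-7) are needed to reach
	 * 128 byte alignment; the chain below copies them in 16, 32 and
	 * 64 byte steps, using r9-r11 as load/store index offsets.
	 */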
	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
	lvx	vr1,r0,r4
	addi	r4,r4,16
	stvx	vr1,r0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
	lvx	vr1,r0,r4
	lvx	vr0,r4,r9
	addi	r4,r4,32
	stvx	vr1,r0,r3
	stvx	vr0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	vr3,r0,r4
	lvx	vr2,r4,r9
	lvx	vr1,r4,r10
	lvx	vr0,r4,r11
	addi	r4,r4,64
	stvx	vr3,r0,r3
	stvx	vr2,r3,r9
	stvx	vr1,r3,r10
	stvx	vr0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	vr7,r0,r4
	lvx	vr6,r4,r9
	lvx	vr5,r4,r10
	lvx	vr4,r4,r11
	lvx	vr3,r4,r12
	lvx	vr2,r4,r14
	lvx	vr1,r4,r15
	lvx	vr0,r4,r16
	addi	r4,r4,128
	stvx	vr7,r0,r3
	stvx	vr6,r3,r9
	stvx	vr5,r3,r10
	stvx	vr4,r3,r11
	stvx	vr3,r3,r12
	stvx	vr2,r3,r14
	stvx	vr1,r3,r15
	stvx	vr0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
	lvx	vr3,r0,r4
	lvx	vr2,r4,r9
	lvx	vr1,r4,r10
	lvx	vr0,r4,r11
	addi	r4,r4,64
	stvx	vr3,r0,r3
	stvx	vr2,r3,r9
	stvx	vr1,r3,r10
	stvx	vr0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
	lvx	vr1,r0,r4
	lvx	vr0,r4,r9
	addi	r4,r4,32
	stvx	vr1,r0,r3
	stvx	vr0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
	lvx	vr1,r0,r4
	addi	r4,r4,16
	stvx	vr1,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	exit_vmx_copy		/* tail call optimise */

.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r7,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	LVS(vr16,0,r4)		/* Setup permute control vector */
	lvx	vr0,0,r4
	addi	r4,r4,16

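	/*
	 * vr0 always holds the most recently loaded 16 bytes of source;
	 * each VPERM below combines it with the next load through the
	 * vr16 control vector to produce one aligned 16 byte store.
	 */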
	bf	cr7*4+3,5f
	lvx	vr1,r0,r4
	VPERM(vr8,vr0,vr1,vr16)
	addi	r4,r4,16
	stvx	vr8,r0,r3
	addi	r3,r3,16
	vor	vr0,vr1,vr1

5:	bf	cr7*4+2,6f
	lvx	vr1,r0,r4
	VPERM(vr8,vr0,vr1,vr16)
	lvx	vr0,r4,r9
	VPERM(vr9,vr1,vr0,vr16)
	addi	r4,r4,32
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	vr3,r0,r4
	VPERM(vr8,vr0,vr3,vr16)
	lvx	vr2,r4,r9
	VPERM(vr9,vr3,vr2,vr16)
	lvx	vr1,r4,r10
	VPERM(vr10,vr2,vr1,vr16)
	lvx	vr0,r4,r11
	VPERM(vr11,vr1,vr0,vr16)
	addi	r4,r4,64
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	stvx	vr10,r3,r10
	stvx	vr11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	vr7,r0,r4
	VPERM(vr8,vr0,vr7,vr16)
	lvx	vr6,r4,r9
	VPERM(vr9,vr7,vr6,vr16)
	lvx	vr5,r4,r10
	VPERM(vr10,vr6,vr5,vr16)
	lvx	vr4,r4,r11
	VPERM(vr11,vr5,vr4,vr16)
	lvx	vr3,r4,r12
	VPERM(vr12,vr4,vr3,vr16)
	lvx	vr2,r4,r14
	VPERM(vr13,vr3,vr2,vr16)
	lvx	vr1,r4,r15
	VPERM(vr14,vr2,vr1,vr16)
	lvx	vr0,r4,r16
	VPERM(vr15,vr1,vr0,vr16)
	addi	r4,r4,128
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	stvx	vr10,r3,r10
	stvx	vr11,r3,r11
	stvx	vr12,r3,r12
	stvx	vr13,r3,r14
	stvx	vr14,r3,r15
	stvx	vr15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
	lvx	vr3,r0,r4
	VPERM(vr8,vr0,vr3,vr16)
	lvx	vr2,r4,r9
	VPERM(vr9,vr3,vr2,vr16)
	lvx	vr1,r4,r10
	VPERM(vr10,vr2,vr1,vr16)
	lvx	vr0,r4,r11
	VPERM(vr11,vr1,vr0,vr16)
	addi	r4,r4,64
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	stvx	vr10,r3,r10
	stvx	vr11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
	lvx	vr1,r0,r4
	VPERM(vr8,vr0,vr1,vr16)
	lvx	vr0,r4,r9
	VPERM(vr9,vr1,vr0,vr16)
	addi	r4,r4,32
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
	lvx	vr1,r0,r4
	VPERM(vr8,vr0,vr1,vr16)
	addi	r4,r4,16
	stvx	vr8,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	exit_vmx_copy		/* tail call optimise */
#endif /* CONFIG_ALTIVEC */