/* xref: /linux/arch/powerpc/lib/copyuser_power7.S (revision a66086b8197da8dc83b698642d5947ff850e708d) */
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

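/*
 * STACKFRAMESIZE/STK_REG describe the temporary stack frame used to save
 * the non-volatile GPRs r14-r22.  The 112 byte base offset is assumed to
 * step over the fixed ppc64 ABI frame header plus parameter save area, so
 * STK_REG(rN) is the save slot for GPR N inside that frame.
 */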
#define STACKFRAMESIZE	256
#define STK_REG(i)	(112 + ((i)-14)*8)

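/*
 * The errN macros tag the user access that follows them: each one drops a
 * local label on that instruction and records a __ex_table entry so that a
 * fault there is redirected to the matching .Ldo_errN fixup below.  The
 * different numbers reflect how much state (saved GPRs, stack frame, VMX)
 * has to be unwound at that point in the copy.
 */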
	.macro err1
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Ldo_err1
	.previous
	.endm

	.macro err2
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldo_err2
	.previous
	.endm

#ifdef CONFIG_ALTIVEC
	.macro err3
300:
	.section __ex_table,"a"
	.align 3
	.llong 300b,.Ldo_err3
	.previous
	.endm

	.macro err4
400:
	.section __ex_table,"a"
	.align 3
	.llong 400b,.Ldo_err4
	.previous
	.endm


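/*
 * Fault fixups: unwind whatever state the faulting path had built up
 * (saved non-volatiles, VMX, our stack frame), reload the original
 * dest/src/len that were stashed at 48/56/64(r1) on entry, and hand the
 * copy to __copy_tofrom_user_base, which is expected to redo it and
 * return the number of bytes that could not be copied.
 */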
.Ldo_err4:
	ld	r16,STK_REG(r16)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r14,STK_REG(r14)(r1)
.Ldo_err3:
	bl	.exit_vmx_copy
	ld	r0,STACKFRAMESIZE+16(r1)
	mtlr	r0
	b	.Lexit
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
	ld	r22,STK_REG(r22)(r1)
	ld	r21,STK_REG(r21)(r1)
	ld	r20,STK_REG(r20)(r1)
	ld	r19,STK_REG(r19)(r1)
	ld	r18,STK_REG(r18)(r1)
	ld	r17,STK_REG(r17)(r1)
	ld	r16,STK_REG(r16)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r14,STK_REG(r14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
.Ldo_err1:
	ld	r3,48(r1)
	ld	r4,56(r1)
	ld	r5,64(r1)
	b	__copy_tofrom_user_base


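/*
 * __copy_tofrom_user_power7(dest=r3, src=r4, len=r5)
 *
 * Dispatch: copies under 16 bytes go straight to the byte/word tail,
 * copies over 4096 bytes try the VMX path (when CONFIG_ALTIVEC is set),
 * and everything else uses the scalar cacheline loop below.
 */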
_GLOBAL(__copy_tofrom_user_power7)
#ifdef CONFIG_ALTIVEC
	cmpldi	r5,16
	cmpldi	cr1,r5,4096

	std	r3,48(r1)
	std	r4,56(r1)
	std	r5,64(r1)

	blt	.Lshort_copy
	bgt	cr1,.Lvmx_copy
#else
	cmpldi	r5,16

	std	r3,48(r1)
	std	r4,56(r1)
	std	r5,64(r1)

	blt	.Lshort_copy
#endif

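/*
 * Scalar copy.  neg gives the byte count up to the next 8B boundary and
 * mtocrf copies its low bits into cr7, so each of the 1/2/4 byte
 * alignment steps below can be tested with a single bf before the main
 * loops run on an 8B aligned source.
 */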
.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

	bf	cr7*4+3,1f
err1;	lbz	r0,0(r4)
	addi	r4,r4,1
err1;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

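	/*
	 * At least a full cacheline remains: build a stack frame, save LR
	 * and the non-volatile GPRs r14-r22 so the 128B loop below has
	 * sixteen scratch registers per iteration.
	 */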
	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)
	std	r17,STK_REG(r17)(r1)
	std	r18,STK_REG(r18)(r1)
	std	r19,STK_REG(r19)(r1)
	std	r20,STK_REG(r20)(r1)
	std	r21,STK_REG(r21)(r1)
	std	r22,STK_REG(r22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

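	/* r6 = number of complete 128B cachelines; use it as the loop count */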
	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r7,16(r4)
err2;	ld	r8,24(r4)
err2;	ld	r9,32(r4)
err2;	ld	r10,40(r4)
err2;	ld	r11,48(r4)
err2;	ld	r12,56(r4)
err2;	ld	r14,64(r4)
err2;	ld	r15,72(r4)
err2;	ld	r16,80(r4)
err2;	ld	r17,88(r4)
err2;	ld	r18,96(r4)
err2;	ld	r19,104(r4)
err2;	ld	r20,112(r4)
err2;	ld	r21,120(r4)
	addi	r4,r4,128
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r7,16(r3)
err2;	std	r8,24(r3)
err2;	std	r9,32(r3)
err2;	std	r10,40(r3)
err2;	std	r11,48(r3)
err2;	std	r12,56(r3)
err2;	std	r14,64(r3)
err2;	std	r15,72(r3)
err2;	std	r16,80(r3)
err2;	std	r17,88(r3)
err2;	std	r18,96(r3)
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)
	ld	r17,STK_REG(r17)(r1)
	ld	r18,STK_REG(r18)(r1)
	ld	r19,STK_REG(r19)(r1)
	ld	r20,STK_REG(r20)(r1)
	ld	r21,STK_REG(r21)(r1)
	ld	r22,STK_REG(r22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
err1;	ld	r9,32(r4)
err1;	ld	r10,40(r4)
err1;	ld	r11,48(r4)
err1;	ld	r12,56(r4)
	addi	r4,r4,64
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
err1;	std	r9,32(r3)
err1;	std	r10,40(r3)
err1;	std	r11,48(r3)
err1;	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
	addi	r4,r4,32
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
	addi	r4,r4,16
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz	r6,4(r4)
	addi	r4,r4,8
err1;	stw	r0,0(r3)
err1;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)

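	/*
	 * Everything was copied.  The return value is the number of bytes
	 * that could not be copied, so report 0.
	 */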
15:	li	r3,0
	blr

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
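/*
 * VMX path: enter_vmx_copy is expected to enable Altivec if it is safe and
 * worthwhile to use here, returning non-zero on success; a zero return
 * falls back to the scalar copy.  r3-r5 are reloaded from the save slots
 * because the call clobbers the volatile registers.
 */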
.Lvmx_copy:
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	.enter_vmx_copy
	cmpwi	r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STACKFRAMESIZE+48(r1)
	ld	r4,STACKFRAMESIZE+56(r1)
	ld	r5,STACKFRAMESIZE+64(r1)
	mtlr	r0

	beq	.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
err3;	lvx	vr1,r0,r4
	addi	r4,r4,16
err3;	stvx	vr1,r0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
err3;	lvx	vr1,r0,r4
err3;	lvx	vr0,r4,r9
	addi	r4,r4,32
err3;	stvx	vr1,r0,r3
err3;	stvx	vr0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	vr3,r0,r4
err3;	lvx	vr2,r4,r9
err3;	lvx	vr1,r4,r10
err3;	lvx	vr0,r4,r11
	addi	r4,r4,64
err3;	stvx	vr3,r0,r3
err3;	stvx	vr2,r3,r9
err3;	stvx	vr1,r3,r10
err3;	stvx	vr0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	vr7,r0,r4
err4;	lvx	vr6,r4,r9
err4;	lvx	vr5,r4,r10
err4;	lvx	vr4,r4,r11
err4;	lvx	vr3,r4,r12
err4;	lvx	vr2,r4,r14
err4;	lvx	vr1,r4,r15
err4;	lvx	vr0,r4,r16
	addi	r4,r4,128
err4;	stvx	vr7,r0,r3
err4;	stvx	vr6,r3,r9
err4;	stvx	vr5,r3,r10
err4;	stvx	vr4,r3,r11
err4;	stvx	vr3,r3,r12
err4;	stvx	vr2,r3,r14
err4;	stvx	vr1,r3,r15
err4;	stvx	vr0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	vr3,r0,r4
err3;	lvx	vr2,r4,r9
err3;	lvx	vr1,r4,r10
err3;	lvx	vr0,r4,r11
	addi	r4,r4,64
err3;	stvx	vr3,r0,r3
err3;	stvx	vr2,r3,r9
err3;	stvx	vr1,r3,r10
err3;	stvx	vr0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	vr1,r0,r4
err3;	lvx	vr0,r4,r9
	addi	r4,r4,32
err3;	stvx	vr1,r0,r3
err3;	stvx	vr0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	vr1,r0,r4
	addi	r4,r4,16
err3;	stvx	vr1,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	.exit_vmx_copy		/* tail call optimise */

.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r7,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

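	/*
	 * Software realignment: lvsl builds a permute control vector from
	 * the source misalignment.  Each step then loads naturally aligned
	 * quadwords and vperm merges adjacent pairs into the shifted data,
	 * with vr0 always holding the previous quadword (which is why the
	 * source pointer runs 16 bytes ahead until the tail unwinds it).
	 */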
	lvsl	vr16,0,r4	/* Setup permute control vector */
err3;	lvx	vr0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,5f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	addi	r4,r4,16
err3;	stvx	vr8,r0,r3
	addi	r3,r3,16
	vor	vr0,vr1,vr1

5:	bf	cr7*4+2,6f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
err3;	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	addi	r4,r4,32
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
err3;	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
err3;	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
err3;	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	addi	r4,r4,64
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
err3;	stvx	vr10,r3,r10
err3;	stvx	vr11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	vr7,r0,r4
	vperm	vr8,vr0,vr7,vr16
err4;	lvx	vr6,r4,r9
	vperm	vr9,vr7,vr6,vr16
err4;	lvx	vr5,r4,r10
	vperm	vr10,vr6,vr5,vr16
err4;	lvx	vr4,r4,r11
	vperm	vr11,vr5,vr4,vr16
err4;	lvx	vr3,r4,r12
	vperm	vr12,vr4,vr3,vr16
err4;	lvx	vr2,r4,r14
	vperm	vr13,vr3,vr2,vr16
err4;	lvx	vr1,r4,r15
	vperm	vr14,vr2,vr1,vr16
err4;	lvx	vr0,r4,r16
	vperm	vr15,vr1,vr0,vr16
	addi	r4,r4,128
err4;	stvx	vr8,r0,r3
err4;	stvx	vr9,r3,r9
err4;	stvx	vr10,r3,r10
err4;	stvx	vr11,r3,r11
err4;	stvx	vr12,r3,r12
err4;	stvx	vr13,r3,r14
err4;	stvx	vr14,r3,r15
err4;	stvx	vr15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
err3;	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
err3;	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
err3;	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	addi	r4,r4,64
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
err3;	stvx	vr10,r3,r10
err3;	stvx	vr11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
err3;	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	addi	r4,r4,32
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	addi	r4,r4,16
err3;	stvx	vr8,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r6,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	.exit_vmx_copy		/* tail call optimise */
#endif /* CONFIG_ALTIVEC */