xref: /linux/arch/powerpc/lib/copyuser_power7.S (revision 593d0a3e9f813db910dc50574532914db21d09ff)
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) IBM Corporation, 2011
17 *
18 * Author: Anton Blanchard <anton@au.ibm.com>
19 */
20#include <asm/ppc_asm.h>
21
/*
 * err1: plant local label 100 on the user-memory access that follows
 * the macro invocation and record an __ex_table entry so that a fault
 * at that access branches to the .Ldo_err1 fixup (used while no
 * non-volatile GPRs have been saved and no extra stack frame exists).
 */
22	.macro err1
23100:
24	.section __ex_table,"a"
25	.align 3
26	.llong 100b,.Ldo_err1
27	.previous
28	.endm
29
/*
 * err2: as err1, but faults branch to .Ldo_err2, which first restores
 * the non-volatile GPRs r14-r22 saved by the integer cacheline loop
 * and pops the STACKFRAMESIZE frame.
 */
30	.macro err2
31200:
32	.section __ex_table,"a"
33	.align 3
34	.llong 200b,.Ldo_err2
35	.previous
36	.endm
37
37#ifdef CONFIG_ALTIVEC
/*
 * err3: faults branch to .Ldo_err3, which leaves VMX mode via
 * exit_vmx_usercopy() before retrying with the base copy routine.
 */
38	.macro err3
39300:
40	.section __ex_table,"a"
41	.align 3
42	.llong 300b,.Ldo_err3
43	.previous
44	.endm
45
/*
 * err4: as err3, but .Ldo_err4 additionally restores the non-volatile
 * GPRs r14-r16 saved around the VMX cacheline loops.
 */
46	.macro err4
47400:
48	.section __ex_table,"a"
49	.align 3
50	.llong 400b,.Ldo_err4
51	.previous
52	.endm
53
54
/*
 * VMX fault fixups: restore any saved GPRs, drop out of VMX mode,
 * recover the link register saved at STACKFRAMESIZE+16, then join the
 * common exit path which re-runs the copy with the base routine.
 */
55.Ldo_err4:
56	ld	r16,STK_REG(R16)(r1)
57	ld	r15,STK_REG(R15)(r1)
58	ld	r14,STK_REG(R14)(r1)
59.Ldo_err3:
60	bl	.exit_vmx_usercopy
61	ld	r0,STACKFRAMESIZE+16(r1)
62	mtlr	r0
63	b	.Lexit
64#endif /* CONFIG_ALTIVEC */
66
/*
 * Integer-loop fault fixup: restore the callee-saved GPRs the 128B
 * loop used, pop the frame, then (at .Ldo_err1) reload the original
 * dest/src/len stashed at 48/56/64(r1) on entry and redo the whole
 * copy with __copy_tofrom_user_base, which produces the accurate
 * "bytes not copied" return value.
 */
67.Ldo_err2:
68	ld	r22,STK_REG(R22)(r1)
69	ld	r21,STK_REG(R21)(r1)
70	ld	r20,STK_REG(R20)(r1)
71	ld	r19,STK_REG(R19)(r1)
72	ld	r18,STK_REG(R18)(r1)
73	ld	r17,STK_REG(R17)(r1)
74	ld	r16,STK_REG(R16)(r1)
75	ld	r15,STK_REG(R15)(r1)
76	ld	r14,STK_REG(R14)(r1)
77.Lexit:
78	addi	r1,r1,STACKFRAMESIZE
79.Ldo_err1:
	/* Reload the original arguments saved at entry and retry. */
80	ld	r3,48(r1)
81	ld	r4,56(r1)
82	ld	r5,64(r1)
83	b	__copy_tofrom_user_base
84
85
/*
 * POWER7-optimised user copy.
 * In:  r3 = destination, r4 = source, r5 = byte count.
 * Out: r3 = 0 on success; on a fault the copy is redone via
 *      __copy_tofrom_user_base (see .Ldo_err1).
 *
 * The arguments are stashed in the caller's frame at 48/56/64(r1) so
 * the fault fixups can reload them.  Copies < 16 bytes go straight to
 * the byte/word tail; with ALTIVEC, copies > 4096 bytes (cr1 compare)
 * try the VMX path.
 */
86_GLOBAL(__copy_tofrom_user_power7)
87#ifdef CONFIG_ALTIVEC
88	cmpldi	r5,16
89	cmpldi	cr1,r5,4096
90
91	std	r3,48(r1)
92	std	r4,56(r1)
93	std	r5,64(r1)
94
95	blt	.Lshort_copy
96	bgt	cr1,.Lvmx_copy
97#else
98	cmpldi	r5,16
99
100	std	r3,48(r1)
101	std	r4,56(r1)
102	std	r5,64(r1)
103
104	blt	.Lshort_copy
105#endif
106
107.Lnonvmx_copy:
108	/* Get the source 8B aligned */
	/*
	 * r6 = (-src) & 7 = bytes needed to reach 8B alignment.  The low
	 * bits land in cr7 via mtocrf, so cr7*4+3 tests the 1-byte step,
	 * +2 the 2-byte step and +1 the 4-byte step.
	 */
109	neg	r6,r4
110	mtocrf	0x01,r6
111	clrldi	r6,r6,(64-3)
112
113	bf	cr7*4+3,1f
114err1;	lbz	r0,0(r4)
115	addi	r4,r4,1
116err1;	stb	r0,0(r3)
117	addi	r3,r3,1
118
1191:	bf	cr7*4+2,2f
120err1;	lhz	r0,0(r4)
121	addi	r4,r4,2
122err1;	sth	r0,0(r3)
123	addi	r3,r3,2
124
1252:	bf	cr7*4+1,3f
126err1;	lwz	r0,0(r4)
127	addi	r4,r4,4
128err1;	stw	r0,0(r3)
129	addi	r3,r3,4
130
	/* r5 -= alignment bytes; skip the main loop if < 128B remain. */
1313:	sub	r5,r5,r6
132	cmpldi	r5,128
133	blt	5f
134
	/*
	 * 128B or more to go: save LR and r14-r22 in a new frame so the
	 * unrolled loop can use them (err2 fixups restore them on fault).
	 */
135	mflr	r0
136	stdu	r1,-STACKFRAMESIZE(r1)
137	std	r14,STK_REG(R14)(r1)
138	std	r15,STK_REG(R15)(r1)
139	std	r16,STK_REG(R16)(r1)
140	std	r17,STK_REG(R17)(r1)
141	std	r18,STK_REG(R18)(r1)
142	std	r19,STK_REG(R19)(r1)
143	std	r20,STK_REG(R20)(r1)
144	std	r21,STK_REG(R21)(r1)
145	std	r22,STK_REG(R22)(r1)
146	std	r0,STACKFRAMESIZE+16(r1)
147
	/* ctr = number of full 128B cachelines. */
148	srdi	r6,r5,7
149	mtctr	r6
150
151	/* Now do cacheline (128B) sized loads and stores. */
152	.align	5
1534:
154err2;	ld	r0,0(r4)
155err2;	ld	r6,8(r4)
156err2;	ld	r7,16(r4)
157err2;	ld	r8,24(r4)
158err2;	ld	r9,32(r4)
159err2;	ld	r10,40(r4)
160err2;	ld	r11,48(r4)
161err2;	ld	r12,56(r4)
162err2;	ld	r14,64(r4)
163err2;	ld	r15,72(r4)
164err2;	ld	r16,80(r4)
165err2;	ld	r17,88(r4)
166err2;	ld	r18,96(r4)
167err2;	ld	r19,104(r4)
168err2;	ld	r20,112(r4)
169err2;	ld	r21,120(r4)
170	addi	r4,r4,128
171err2;	std	r0,0(r3)
172err2;	std	r6,8(r3)
173err2;	std	r7,16(r3)
174err2;	std	r8,24(r3)
175err2;	std	r9,32(r3)
176err2;	std	r10,40(r3)
177err2;	std	r11,48(r3)
178err2;	std	r12,56(r3)
179err2;	std	r14,64(r3)
180err2;	std	r15,72(r3)
181err2;	std	r16,80(r3)
182err2;	std	r17,88(r3)
183err2;	std	r18,96(r3)
184err2;	std	r19,104(r3)
185err2;	std	r20,112(r3)
186err2;	std	r21,120(r3)
187	addi	r3,r3,128
188	bdnz	4b
189
	/* r5 = remaining byte count mod 128. */
190	clrldi	r5,r5,(64-7)
191
192	ld	r14,STK_REG(R14)(r1)
193	ld	r15,STK_REG(R15)(r1)
194	ld	r16,STK_REG(R16)(r1)
195	ld	r17,STK_REG(R17)(r1)
196	ld	r18,STK_REG(R18)(r1)
197	ld	r19,STK_REG(R19)(r1)
198	ld	r20,STK_REG(R20)(r1)
199	ld	r21,STK_REG(R21)(r1)
200	ld	r22,STK_REG(R22)(r1)
201	addi	r1,r1,STACKFRAMESIZE
202
203	/* Up to 127B to go */
	/*
	 * r6 = r5 >> 4 into cr7: +1 tests the 64B chunk, +2 the 32B
	 * chunk, +3 the 16B chunk; the final nibble of r5 is handled by
	 * .Lshort_copy below.
	 */
2045:	srdi	r6,r5,4
205	mtocrf	0x01,r6
206
2076:	bf	cr7*4+1,7f
208err1;	ld	r0,0(r4)
209err1;	ld	r6,8(r4)
210err1;	ld	r7,16(r4)
211err1;	ld	r8,24(r4)
212err1;	ld	r9,32(r4)
213err1;	ld	r10,40(r4)
214err1;	ld	r11,48(r4)
215err1;	ld	r12,56(r4)
216	addi	r4,r4,64
217err1;	std	r0,0(r3)
218err1;	std	r6,8(r3)
219err1;	std	r7,16(r3)
220err1;	std	r8,24(r3)
221err1;	std	r9,32(r3)
222err1;	std	r10,40(r3)
223err1;	std	r11,48(r3)
224err1;	std	r12,56(r3)
225	addi	r3,r3,64
226
227	/* Up to 63B to go */
2287:	bf	cr7*4+2,8f
229err1;	ld	r0,0(r4)
230err1;	ld	r6,8(r4)
231err1;	ld	r7,16(r4)
232err1;	ld	r8,24(r4)
233	addi	r4,r4,32
234err1;	std	r0,0(r3)
235err1;	std	r6,8(r3)
236err1;	std	r7,16(r3)
237err1;	std	r8,24(r3)
238	addi	r3,r3,32
239
240	/* Up to 31B to go */
2418:	bf	cr7*4+3,9f
242err1;	ld	r0,0(r4)
243err1;	ld	r6,8(r4)
244	addi	r4,r4,16
245err1;	std	r0,0(r3)
246err1;	std	r6,8(r3)
247	addi	r3,r3,16
248
	/* r5 = remaining byte count mod 16. */
2499:	clrldi	r5,r5,(64-4)
250
251	/* Up to 15B to go */
	/*
	 * cr7 from the low nibble of r5: +0 tests the 8B step, +1 the
	 * 4B step, +2 the 2B step, +3 the final byte.
	 */
252.Lshort_copy:
253	mtocrf	0x01,r5
254	bf	cr7*4+0,12f
255err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
256err1;	lwz	r6,4(r4)
257	addi	r4,r4,8
258err1;	stw	r0,0(r3)
259err1;	stw	r6,4(r3)
260	addi	r3,r3,8
261
26212:	bf	cr7*4+1,13f
263err1;	lwz	r0,0(r4)
264	addi	r4,r4,4
265err1;	stw	r0,0(r3)
266	addi	r3,r3,4
267
26813:	bf	cr7*4+2,14f
269err1;	lhz	r0,0(r4)
270	addi	r4,r4,2
271err1;	sth	r0,0(r3)
272	addi	r3,r3,2
273
27414:	bf	cr7*4+3,15f
275err1;	lbz	r0,0(r4)
276err1;	stb	r0,0(r3)
277
	/* Success: return 0 (no bytes left uncopied). */
27815:	li	r3,0
279	blr
280
/*
 * VMX was unavailable after .Lvmx_copy pushed a frame: pop it and
 * fall back to the integer copy path.
 */
281.Lunwind_stack_nonvmx_copy:
282	addi	r1,r1,STACKFRAMESIZE
283	b	.Lnonvmx_copy
284
#ifdef CONFIG_ALTIVEC
/*
 * Bulk (> 4096 byte) copy: try to use VMX.  enter_vmx_usercopy()
 * returns its result in r3; it is compared against zero here (cr0),
 * the original dest/src/len saved at entry are reloaded, and the beq
 * at the bottom falls back to the integer path when VMX is unusable.
 */
.Lvmx_copy:
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	.enter_vmx_usercopy
	cmpwi	r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STACKFRAMESIZE+48(r1)
	ld	r4,STACKFRAMESIZE+56(r1)
	ld	r5,STACKFRAMESIZE+64(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 *
	 * This sequence was duplicated; the removed first copy did its
	 * length-cap compare in cr0, clobbering the cmpwi result above
	 * that the beq below depends on.  Only the cr1-based sequence is
	 * kept so cr0 still reflects enter_vmx_usercopy()'s return value.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	cr1,r7,0x3FF
	ble	cr1,1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

.machine push
.machine "power4"
	dcbt	r0,r6,0b01000
	dcbt	r0,r7,0b01010
	dcbtst	r0,r9,0b01000
	dcbtst	r0,r10,0b01010
	eieio
	dcbt	r0,r8,0b01010	/* GO */
.machine pop

	/* VMX unusable (enter_vmx_usercopy() returned 0)? Unwind and
	 * use the integer copy loops instead. */
	beq	.Lunwind_stack_nonvmx_copy
361
362	/*
363	 * If source and destination are not relatively aligned we use a
364	 * slower permute loop.
365	 */
366	xor	r6,r4,r3
367	rldicl.	r6,r6,0,(64-4)
368	bne	.Lvmx_unaligned_copy

369
370	/* Get the destination 16B aligned */
	/* r6 = (-dst) & 15; cr7 bits select 1/2/4/8 byte catch-up moves. */
371	neg	r6,r3
372	mtocrf	0x01,r6
373	clrldi	r6,r6,(64-4)
374
375	bf	cr7*4+3,1f
376err3;	lbz	r0,0(r4)
377	addi	r4,r4,1
378err3;	stb	r0,0(r3)
379	addi	r3,r3,1
380
3811:	bf	cr7*4+2,2f
382err3;	lhz	r0,0(r4)
383	addi	r4,r4,2
384err3;	sth	r0,0(r3)
385	addi	r3,r3,2
386
3872:	bf	cr7*4+1,3f
388err3;	lwz	r0,0(r4)
389	addi	r4,r4,4
390err3;	stw	r0,0(r3)
391	addi	r3,r3,4
392
3933:	bf	cr7*4+0,4f
394err3;	ld	r0,0(r4)
395	addi	r4,r4,8
396err3;	std	r0,0(r3)
397	addi	r3,r3,8
398
3994:	sub	r5,r5,r6
400
401	/* Get the destination 128B aligned */
	/* r6 = (-dst) & 127; r7 = r6>>4 into cr7 selects 16/32/64B chunks. */
402	neg	r6,r3
403	srdi	r7,r6,4
404	mtocrf	0x01,r7
405	clrldi	r6,r6,(64-7)
406
	/* Vector offset registers: 16, 32, 48 bytes. */
407	li	r9,16
408	li	r10,32
409	li	r11,48
410
411	bf	cr7*4+3,5f
412err3;	lvx	vr1,r0,r4
413	addi	r4,r4,16
414err3;	stvx	vr1,r0,r3
415	addi	r3,r3,16
416
4175:	bf	cr7*4+2,6f
418err3;	lvx	vr1,r0,r4
419err3;	lvx	vr0,r4,r9
420	addi	r4,r4,32
421err3;	stvx	vr1,r0,r3
422err3;	stvx	vr0,r3,r9
423	addi	r3,r3,32
424
4256:	bf	cr7*4+1,7f
426err3;	lvx	vr3,r0,r4
427err3;	lvx	vr2,r4,r9
428err3;	lvx	vr1,r4,r10
429err3;	lvx	vr0,r4,r11
430	addi	r4,r4,64
431err3;	stvx	vr3,r0,r3
432err3;	stvx	vr2,r3,r9
433err3;	stvx	vr1,r3,r10
434err3;	stvx	vr0,r3,r11
435	addi	r3,r3,64
436
	/* r5 -= alignment bytes; ctr = full 128B cachelines left. */
4377:	sub	r5,r5,r6
438	srdi	r6,r5,7
439
	/* r14-r16 hold extra offsets (80/96/112); err4 restores them. */
440	std	r14,STK_REG(R14)(r1)
441	std	r15,STK_REG(R15)(r1)
442	std	r16,STK_REG(R16)(r1)
443
444	li	r12,64
445	li	r14,80
446	li	r15,96
447	li	r16,112
448
449	mtctr	r6
450
451	/*
452	 * Now do cacheline sized loads and stores. By this stage the
453	 * cacheline stores are also cacheline aligned.
454	 */
455	.align	5
4568:
457err4;	lvx	vr7,r0,r4
458err4;	lvx	vr6,r4,r9
459err4;	lvx	vr5,r4,r10
460err4;	lvx	vr4,r4,r11
461err4;	lvx	vr3,r4,r12
462err4;	lvx	vr2,r4,r14
463err4;	lvx	vr1,r4,r15
464err4;	lvx	vr0,r4,r16
465	addi	r4,r4,128
466err4;	stvx	vr7,r0,r3
467err4;	stvx	vr6,r3,r9
468err4;	stvx	vr5,r3,r10
469err4;	stvx	vr4,r3,r11
470err4;	stvx	vr3,r3,r12
471err4;	stvx	vr2,r3,r14
472err4;	stvx	vr1,r3,r15
473err4;	stvx	vr0,r3,r16
474	addi	r3,r3,128
475	bdnz	8b
476
477	ld	r14,STK_REG(R14)(r1)
478	ld	r15,STK_REG(R15)(r1)
479	ld	r16,STK_REG(R16)(r1)
480
481	/* Up to 127B to go */
	/* r6 = (r5 mod 128) >> 4 into cr7: 64/32/16B vector chunks. */
482	clrldi	r5,r5,(64-7)
483	srdi	r6,r5,4
484	mtocrf	0x01,r6
485
486	bf	cr7*4+1,9f
487err3;	lvx	vr3,r0,r4
488err3;	lvx	vr2,r4,r9
489err3;	lvx	vr1,r4,r10
490err3;	lvx	vr0,r4,r11
491	addi	r4,r4,64
492err3;	stvx	vr3,r0,r3
493err3;	stvx	vr2,r3,r9
494err3;	stvx	vr1,r3,r10
495err3;	stvx	vr0,r3,r11
496	addi	r3,r3,64
497
4989:	bf	cr7*4+2,10f
499err3;	lvx	vr1,r0,r4
500err3;	lvx	vr0,r4,r9
501	addi	r4,r4,32
502err3;	stvx	vr1,r0,r3
503err3;	stvx	vr0,r3,r9
504	addi	r3,r3,32
505
50610:	bf	cr7*4+3,11f
507err3;	lvx	vr1,r0,r4
508	addi	r4,r4,16
509err3;	stvx	vr1,r0,r3
510	addi	r3,r3,16
511
512	/* Up to 15B to go */
51311:	clrldi	r5,r5,(64-4)
514	mtocrf	0x01,r5
515	bf	cr7*4+0,12f
516err3;	ld	r0,0(r4)
517	addi	r4,r4,8
518err3;	std	r0,0(r3)
519	addi	r3,r3,8
520
52112:	bf	cr7*4+1,13f
522err3;	lwz	r0,0(r4)
523	addi	r4,r4,4
524err3;	stw	r0,0(r3)
525	addi	r3,r3,4
526
52713:	bf	cr7*4+2,14f
528err3;	lhz	r0,0(r4)
529	addi	r4,r4,2
530err3;	sth	r0,0(r3)
531	addi	r3,r3,2
532
53314:	bf	cr7*4+3,15f
534err3;	lbz	r0,0(r4)
535err3;	stb	r0,0(r3)
536
	/* Done: pop the frame and leave VMX mode (also sets return value). */
53715:	addi	r1,r1,STACKFRAMESIZE
538	b	.exit_vmx_usercopy	/* tail call optimise */
539
/*
 * Source and destination are differently aligned mod 16: align the
 * destination, then use lvsl/vperm to realign source data in
 * registers.  vr16 is the permute control; vr0 always holds the
 * previous source vector, so r4 runs one vector (16B) ahead and is
 * unwound at label 11 before the scalar tail.
 */
540.Lvmx_unaligned_copy:
541	/* Get the destination 16B aligned */
542	neg	r6,r3
543	mtocrf	0x01,r6
544	clrldi	r6,r6,(64-4)
545
546	bf	cr7*4+3,1f
547err3;	lbz	r0,0(r4)
548	addi	r4,r4,1
549err3;	stb	r0,0(r3)
550	addi	r3,r3,1
551
5521:	bf	cr7*4+2,2f
553err3;	lhz	r0,0(r4)
554	addi	r4,r4,2
555err3;	sth	r0,0(r3)
556	addi	r3,r3,2
557
5582:	bf	cr7*4+1,3f
559err3;	lwz	r0,0(r4)
560	addi	r4,r4,4
561err3;	stw	r0,0(r3)
562	addi	r3,r3,4
563
5643:	bf	cr7*4+0,4f
565err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
566err3;	lwz	r7,4(r4)
567	addi	r4,r4,8
568err3;	stw	r0,0(r3)
569err3;	stw	r7,4(r3)
570	addi	r3,r3,8
571
5724:	sub	r5,r5,r6
573
574	/* Get the destination 128B aligned */
575	neg	r6,r3
576	srdi	r7,r6,4
577	mtocrf	0x01,r7
578	clrldi	r6,r6,(64-7)
579
580	li	r9,16
581	li	r10,32
582	li	r11,48
583
584	lvsl	vr16,0,r4	/* Setup permute control vector */
585err3;	lvx	vr0,0,r4
586	addi	r4,r4,16
587
588	bf	cr7*4+3,5f
589err3;	lvx	vr1,r0,r4
590	vperm	vr8,vr0,vr1,vr16
591	addi	r4,r4,16
592err3;	stvx	vr8,r0,r3
593	addi	r3,r3,16
	/* Carry the latest source vector forward in vr0. */
594	vor	vr0,vr1,vr1
595
5965:	bf	cr7*4+2,6f
597err3;	lvx	vr1,r0,r4
598	vperm	vr8,vr0,vr1,vr16
599err3;	lvx	vr0,r4,r9
600	vperm	vr9,vr1,vr0,vr16
601	addi	r4,r4,32
602err3;	stvx	vr8,r0,r3
603err3;	stvx	vr9,r3,r9
604	addi	r3,r3,32
605
6066:	bf	cr7*4+1,7f
607err3;	lvx	vr3,r0,r4
608	vperm	vr8,vr0,vr3,vr16
609err3;	lvx	vr2,r4,r9
610	vperm	vr9,vr3,vr2,vr16
611err3;	lvx	vr1,r4,r10
612	vperm	vr10,vr2,vr1,vr16
613err3;	lvx	vr0,r4,r11
614	vperm	vr11,vr1,vr0,vr16
615	addi	r4,r4,64
616err3;	stvx	vr8,r0,r3
617err3;	stvx	vr9,r3,r9
618err3;	stvx	vr10,r3,r10
619err3;	stvx	vr11,r3,r11
620	addi	r3,r3,64
621
6227:	sub	r5,r5,r6
623	srdi	r6,r5,7
624
625	std	r14,STK_REG(R14)(r1)
626	std	r15,STK_REG(R15)(r1)
627	std	r16,STK_REG(R16)(r1)
628
629	li	r12,64
630	li	r14,80
631	li	r15,96
632	li	r16,112
633
634	mtctr	r6
635
636	/*
637	 * Now do cacheline sized loads and stores. By this stage the
638	 * cacheline stores are also cacheline aligned.
639	 */
640	.align	5
6418:
642err4;	lvx	vr7,r0,r4
643	vperm	vr8,vr0,vr7,vr16
644err4;	lvx	vr6,r4,r9
645	vperm	vr9,vr7,vr6,vr16
646err4;	lvx	vr5,r4,r10
647	vperm	vr10,vr6,vr5,vr16
648err4;	lvx	vr4,r4,r11
649	vperm	vr11,vr5,vr4,vr16
650err4;	lvx	vr3,r4,r12
651	vperm	vr12,vr4,vr3,vr16
652err4;	lvx	vr2,r4,r14
653	vperm	vr13,vr3,vr2,vr16
654err4;	lvx	vr1,r4,r15
655	vperm	vr14,vr2,vr1,vr16
656err4;	lvx	vr0,r4,r16
657	vperm	vr15,vr1,vr0,vr16
658	addi	r4,r4,128
659err4;	stvx	vr8,r0,r3
660err4;	stvx	vr9,r3,r9
661err4;	stvx	vr10,r3,r10
662err4;	stvx	vr11,r3,r11
663err4;	stvx	vr12,r3,r12
664err4;	stvx	vr13,r3,r14
665err4;	stvx	vr14,r3,r15
666err4;	stvx	vr15,r3,r16
667	addi	r3,r3,128
668	bdnz	8b
669
670	ld	r14,STK_REG(R14)(r1)
671	ld	r15,STK_REG(R15)(r1)
672	ld	r16,STK_REG(R16)(r1)
673
674	/* Up to 127B to go */
675	clrldi	r5,r5,(64-7)
676	srdi	r6,r5,4
677	mtocrf	0x01,r6
678
679	bf	cr7*4+1,9f
680err3;	lvx	vr3,r0,r4
681	vperm	vr8,vr0,vr3,vr16
682err3;	lvx	vr2,r4,r9
683	vperm	vr9,vr3,vr2,vr16
684err3;	lvx	vr1,r4,r10
685	vperm	vr10,vr2,vr1,vr16
686err3;	lvx	vr0,r4,r11
687	vperm	vr11,vr1,vr0,vr16
688	addi	r4,r4,64
689err3;	stvx	vr8,r0,r3
690err3;	stvx	vr9,r3,r9
691err3;	stvx	vr10,r3,r10
692err3;	stvx	vr11,r3,r11
693	addi	r3,r3,64
694
6959:	bf	cr7*4+2,10f
696err3;	lvx	vr1,r0,r4
697	vperm	vr8,vr0,vr1,vr16
698err3;	lvx	vr0,r4,r9
699	vperm	vr9,vr1,vr0,vr16
700	addi	r4,r4,32
701err3;	stvx	vr8,r0,r3
702err3;	stvx	vr9,r3,r9
703	addi	r3,r3,32
704
70510:	bf	cr7*4+3,11f
706err3;	lvx	vr1,r0,r4
707	vperm	vr8,vr0,vr1,vr16
708	addi	r4,r4,16
709err3;	stvx	vr8,r0,r3
710	addi	r3,r3,16
711
712	/* Up to 15B to go */
71311:	clrldi	r5,r5,(64-4)
714	addi	r4,r4,-16	/* Unwind the +16 load offset */
715	mtocrf	0x01,r5
716	bf	cr7*4+0,12f
717err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
718err3;	lwz	r6,4(r4)
719	addi	r4,r4,8
720err3;	stw	r0,0(r3)
721err3;	stw	r6,4(r3)
722	addi	r3,r3,8
723
72412:	bf	cr7*4+1,13f
725err3;	lwz	r0,0(r4)
726	addi	r4,r4,4
727err3;	stw	r0,0(r3)
728	addi	r3,r3,4
729
73013:	bf	cr7*4+2,14f
731err3;	lhz	r0,0(r4)
732	addi	r4,r4,2
733err3;	sth	r0,0(r3)
734	addi	r3,r3,2
735
73614:	bf	cr7*4+3,15f
737err3;	lbz	r0,0(r4)
738err3;	stb	r0,0(r3)
739
	/* Done: pop the frame and leave VMX mode (also sets return value). */
74015:	addi	r1,r1,STACKFRAMESIZE
741	b	.exit_vmx_usercopy	/* tail call optimise */
742#endif /* CONFIG_ALTIVEC */
743