xref: /linux/arch/powerpc/lib/copyuser_power7.S (revision 50e59058a42056063add41c2d900d1b162e2d6d6)
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

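/*
 * Note: on little-endian the realignment code below uses lvsr and swaps the
 * vperm inputs, so the combined effect of LVS/VPERM matches the big-endian
 * lvsl/vperm sequence when merging two misaligned quadwords.
 */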
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

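/*
 * Each errN macro plants a local label on the access that follows it and
 * records an __ex_table fixup entry pointing at the matching .Ldo_errN
 * handler below, so a user access fault branches there.
 */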
	.macro err1
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Ldo_err1
	.previous
	.endm

	.macro err2
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldo_err2
	.previous
	.endm

#ifdef CONFIG_ALTIVEC
	.macro err3
300:
	.section __ex_table,"a"
	.align 3
	.llong 300b,.Ldo_err3
	.previous
	.endm

	.macro err4
400:
	.section __ex_table,"a"
	.align 3
	.llong 400b,.Ldo_err4
	.previous
	.endm


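/*
 * Fault recovery: restore whichever non-volatile registers were live in the
 * faulting loop, leave VMX mode if it was entered, then reload the saved
 * dest/src/len and redo the copy with __copy_tofrom_user_base, whose own
 * fixups work out how many bytes could not be copied.
 */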
.Ldo_err4:
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Ldo_err3:
	bl	exit_vmx_usercopy
	ld	r0,STACKFRAMESIZE+16(r1)
	mtlr	r0
	b	.Lexit
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
	ld	r22,STK_REG(R22)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
.Ldo_err1:
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	ld	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	ld	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	b	__copy_tofrom_user_base


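/*
 * __copy_tofrom_user_power7(to=r3, from=r4, bytes=r5)
 *
 * Returns the number of bytes not copied (0 on success, via the fixup path
 * above on a fault). Copies under 16 bytes go straight to the scalar tail;
 * copies over 4096 bytes take the VMX path when Altivec is available.
 */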
_GLOBAL(__copy_tofrom_user_power7)
#ifdef CONFIG_ALTIVEC
	cmpldi	r5,16
	cmpldi	cr1,r5,4096

	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

	blt	.Lshort_copy
	bgt	cr1,.Lvmx_copy
#else
	cmpldi	r5,16

	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

	blt	.Lshort_copy
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
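	/*
	 * neg puts the distance to the next 8B boundary in the low three
	 * bits of r6; mtocrf 0x01 copies those bits into cr7 so the bf
	 * tests below copy 1, 2 and then 4 bytes as required.
	 */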
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

	bf	cr7*4+3,1f
err1;	lbz	r0,0(r4)
	addi	r4,r4,1
err1;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

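	/* ctr = number of whole 128B blocks; the tail (r5 mod 128) is done at 5: */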
	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r7,16(r4)
err2;	ld	r8,24(r4)
err2;	ld	r9,32(r4)
err2;	ld	r10,40(r4)
err2;	ld	r11,48(r4)
err2;	ld	r12,56(r4)
err2;	ld	r14,64(r4)
err2;	ld	r15,72(r4)
err2;	ld	r16,80(r4)
err2;	ld	r17,88(r4)
err2;	ld	r18,96(r4)
err2;	ld	r19,104(r4)
err2;	ld	r20,112(r4)
err2;	ld	r21,120(r4)
	addi	r4,r4,128
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r7,16(r3)
err2;	std	r8,24(r3)
err2;	std	r9,32(r3)
err2;	std	r10,40(r3)
err2;	std	r11,48(r3)
err2;	std	r12,56(r3)
err2;	std	r14,64(r3)
err2;	std	r15,72(r3)
err2;	std	r16,80(r3)
err2;	std	r17,88(r3)
err2;	std	r18,96(r3)
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
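	/* cr7 = r5 >> 4: its bits select one 64B, one 32B and one 16B copy */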
5:	srdi	r6,r5,4
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
err1;	ld	r9,32(r4)
err1;	ld	r10,40(r4)
err1;	ld	r11,48(r4)
err1;	ld	r12,56(r4)
	addi	r4,r4,64
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
err1;	std	r9,32(r3)
err1;	std	r10,40(r3)
err1;	std	r11,48(r3)
err1;	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
	addi	r4,r4,32
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
	addi	r4,r4,16
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz	r6,4(r4)
	addi	r4,r4,8
err1;	stw	r0,0(r3)
err1;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)

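	/* Everything copied; return 0 */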
15:	li	r3,0
	blr

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	enter_vmx_usercopy
	cmpwi	cr1,r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
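	/*
	 * r6/r9 carry the stream start addresses (with the stream ID in the
	 * low bits on the store side), r7/r10 the length in cachelines plus
	 * the prefetch depth, and r8 the GO bit that starts both streams.
	 */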
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

.machine push
.machine "power4"
	/* setup read stream 0 */
	dcbt	r0,r6,0b01000   /* addr from */
	dcbt	r0,r7,0b01010   /* length and depth from */
	/* setup write stream 1 */
	dcbtst	r0,r9,0b01000   /* addr to */
	dcbtst	r0,r10,0b01010  /* length and depth to */
	eieio
	dcbt	r0,r8,0b01010	/* all streams GO */
.machine pop

	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
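	/*
	 * The low 4 bits of (dest ^ src) are non-zero when the two buffers
	 * are misaligned relative to each other within a 16B quadword.
	 */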
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
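	/*
	 * cr7 = (bytes to the next 128B boundary) >> 4, so its bits select
	 * one 16B, one 32B and one 64B vector copy below.
	 */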
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
err3;	lvx	vr1,r0,r4
	addi	r4,r4,16
err3;	stvx	vr1,r0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
err3;	lvx	vr1,r0,r4
err3;	lvx	vr0,r4,r9
	addi	r4,r4,32
err3;	stvx	vr1,r0,r3
err3;	stvx	vr0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	vr3,r0,r4
err3;	lvx	vr2,r4,r9
err3;	lvx	vr1,r4,r10
err3;	lvx	vr0,r4,r11
	addi	r4,r4,64
err3;	stvx	vr3,r0,r3
err3;	stvx	vr2,r3,r9
err3;	stvx	vr1,r3,r10
err3;	stvx	vr0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	vr7,r0,r4
err4;	lvx	vr6,r4,r9
err4;	lvx	vr5,r4,r10
err4;	lvx	vr4,r4,r11
err4;	lvx	vr3,r4,r12
err4;	lvx	vr2,r4,r14
err4;	lvx	vr1,r4,r15
err4;	lvx	vr0,r4,r16
	addi	r4,r4,128
err4;	stvx	vr7,r0,r3
err4;	stvx	vr6,r3,r9
err4;	stvx	vr5,r3,r10
err4;	stvx	vr4,r3,r11
err4;	stvx	vr3,r3,r12
err4;	stvx	vr2,r3,r14
err4;	stvx	vr1,r3,r15
err4;	stvx	vr0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	vr3,r0,r4
err3;	lvx	vr2,r4,r9
err3;	lvx	vr1,r4,r10
err3;	lvx	vr0,r4,r11
	addi	r4,r4,64
err3;	stvx	vr3,r0,r3
err3;	stvx	vr2,r3,r9
err3;	stvx	vr1,r3,r10
err3;	stvx	vr0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	vr1,r0,r4
err3;	lvx	vr0,r4,r9
	addi	r4,r4,32
err3;	stvx	vr1,r0,r3
err3;	stvx	vr0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	vr1,r0,r4
	addi	r4,r4,16
err3;	stvx	vr1,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	exit_vmx_usercopy	/* tail call optimise */

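/*
 * Source and destination are misaligned relative to each other. LVS builds
 * a permute control vector from the source alignment, and every VPERM below
 * merges two adjacent aligned 16B loads into one destination-aligned
 * quadword.
 */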
.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r7,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	LVS(vr16,0,r4)		/* Setup permute control vector */
err3;	lvx	vr0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,5f
err3;	lvx	vr1,r0,r4
	VPERM(vr8,vr0,vr1,vr16)
	addi	r4,r4,16
err3;	stvx	vr8,r0,r3
	addi	r3,r3,16
	vor	vr0,vr1,vr1

5:	bf	cr7*4+2,6f
err3;	lvx	vr1,r0,r4
	VPERM(vr8,vr0,vr1,vr16)
err3;	lvx	vr0,r4,r9
	VPERM(vr9,vr1,vr0,vr16)
	addi	r4,r4,32
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	vr3,r0,r4
	VPERM(vr8,vr0,vr3,vr16)
err3;	lvx	vr2,r4,r9
	VPERM(vr9,vr3,vr2,vr16)
err3;	lvx	vr1,r4,r10
	VPERM(vr10,vr2,vr1,vr16)
err3;	lvx	vr0,r4,r11
	VPERM(vr11,vr1,vr0,vr16)
	addi	r4,r4,64
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
err3;	stvx	vr10,r3,r10
err3;	stvx	vr11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
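	/*
	 * vr0 carries the most recently loaded source quadword from one
	 * step to the next, so the first VPERM of each step can merge it
	 * with the new data; the remaining VPERMs chain through the loads
	 * of the current step.
	 */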
	.align	5
8:
err4;	lvx	vr7,r0,r4
	VPERM(vr8,vr0,vr7,vr16)
err4;	lvx	vr6,r4,r9
	VPERM(vr9,vr7,vr6,vr16)
err4;	lvx	vr5,r4,r10
	VPERM(vr10,vr6,vr5,vr16)
err4;	lvx	vr4,r4,r11
	VPERM(vr11,vr5,vr4,vr16)
err4;	lvx	vr3,r4,r12
	VPERM(vr12,vr4,vr3,vr16)
err4;	lvx	vr2,r4,r14
	VPERM(vr13,vr3,vr2,vr16)
err4;	lvx	vr1,r4,r15
	VPERM(vr14,vr2,vr1,vr16)
err4;	lvx	vr0,r4,r16
	VPERM(vr15,vr1,vr0,vr16)
	addi	r4,r4,128
err4;	stvx	vr8,r0,r3
err4;	stvx	vr9,r3,r9
err4;	stvx	vr10,r3,r10
err4;	stvx	vr11,r3,r11
err4;	stvx	vr12,r3,r12
err4;	stvx	vr13,r3,r14
err4;	stvx	vr14,r3,r15
err4;	stvx	vr15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	vr3,r0,r4
	VPERM(vr8,vr0,vr3,vr16)
err3;	lvx	vr2,r4,r9
	VPERM(vr9,vr3,vr2,vr16)
err3;	lvx	vr1,r4,r10
	VPERM(vr10,vr2,vr1,vr16)
err3;	lvx	vr0,r4,r11
	VPERM(vr11,vr1,vr0,vr16)
	addi	r4,r4,64
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
err3;	stvx	vr10,r3,r10
err3;	stvx	vr11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	vr1,r0,r4
	VPERM(vr8,vr0,vr1,vr16)
err3;	lvx	vr0,r4,r9
	VPERM(vr9,vr1,vr0,vr16)
	addi	r4,r4,32
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	vr1,r0,r4
	VPERM(vr8,vr0,vr1,vr16)
	addi	r4,r4,16
err3;	stvx	vr8,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r6,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	exit_vmx_usercopy	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */