/* xref: /linux/arch/powerpc/lib/copyuser_power7.S (revision de008c9ba5684f14e83bcf86cd45fb0e4e6c4d82) */
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Optimised copy_to_user/copy_from_user routines for POWER7.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <linux/export.h>
#include <asm/ppc_asm.h>

#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
/*
 * On little endian the realignment control vector comes from lvsr and
 * the two vperm data operands are swapped, so the unaligned permute
 * copy loop below works unchanged for either endianness.
 */
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif
18
/*
 * Each errN macro places a numbered local label on the user access that
 * follows it and registers that address in the exception table, so a
 * fault on the access branches to the matching .Ldo_errN fixup below.
 */
	.macro err1
100:
	EX_TABLE(100b,.Ldo_err1)
	.endm

	.macro err2
200:
	EX_TABLE(200b,.Ldo_err2)
	.endm
28
#ifdef CONFIG_ALTIVEC
	.macro err3
300:
	EX_TABLE(300b,.Ldo_err3)
	.endm

	.macro err4
400:
	EX_TABLE(400b,.Ldo_err4)
	.endm


/* Fault inside the VMX cacheline loop: restore r14-r16 first. */
.Ldo_err4:
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
/*
 * Fault in VMX code: recompute how much was left uncopied from the
 * saved original arguments, tear down the frame and return it in r3.
 */
.Ldo_err3:
	ld      r6,STK_REG(R31)(r1)	/* original destination pointer */
	ld      r5,STK_REG(R29)(r1)	/* original number of bytes */
	subf    r7,r6,r3		/* #bytes copied */
	subf    r3,r7,r5		/* #bytes not copied in r3 */
	ld	r0,STACKFRAMESIZE+16(r1)
	mtlr	r0
	addi    r1,r1,STACKFRAMESIZE
	blr
#endif /* CONFIG_ALTIVEC */
55
/*
 * Fault inside the unrolled GPR loop: restore the non-volatile
 * registers we used, then fall through to pop the frame.
 */
.Ldo_err2:
	ld	r22,STK_REG(R22)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
/*
 * Fault before any frame was created: the original r3/r4/r5 were
 * stashed below the stack pointer on entry.  Reload them and hand the
 * whole copy to the generic routine, which reports the remainder.
 */
.Ldo_err1:
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	ld	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	ld	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	b	__copy_tofrom_user_base
73
74
/*
 * unsigned long __copy_tofrom_user_power7(void *to, const void *from,
 *					   unsigned long n)
 *
 * In:	r3 = destination, r4 = source, r5 = byte count
 * Out:	r3 = number of bytes NOT copied (0 on complete success)
 *
 * Non-VMX path: byte/half/word copies to get the source 8B aligned,
 * then unrolled 128B (cacheline) GPR copies, then the tail.  Every
 * user access is annotated err1/err2 so a fault lands in the fixup
 * code above, which reports the uncopied remainder.
 */
_GLOBAL(__copy_tofrom_user_power7)
	cmpldi	r5,16

	/* Stash the original arguments below the stack pointer so the
	 * err1 fixup can recover them without needing a frame. */
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

	blt	.Lshort_copy


.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

	bf	cr7*4+3,1f
err1;	lbz	r0,0(r4)
	addi	r4,r4,1
err1;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r7,16(r4)
err2;	ld	r8,24(r4)
err2;	ld	r9,32(r4)
err2;	ld	r10,40(r4)
err2;	ld	r11,48(r4)
err2;	ld	r12,56(r4)
err2;	ld	r14,64(r4)
err2;	ld	r15,72(r4)
err2;	ld	r16,80(r4)
err2;	ld	r17,88(r4)
err2;	ld	r18,96(r4)
err2;	ld	r19,104(r4)
err2;	ld	r20,112(r4)
err2;	ld	r21,120(r4)
	addi	r4,r4,128
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r7,16(r3)
err2;	std	r8,24(r3)
err2;	std	r9,32(r3)
err2;	std	r10,40(r3)
err2;	std	r11,48(r3)
err2;	std	r12,56(r3)
err2;	std	r14,64(r3)
err2;	std	r15,72(r3)
err2;	std	r16,80(r3)
err2;	std	r17,88(r3)
err2;	std	r18,96(r3)
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
err1;	ld	r9,32(r4)
err1;	ld	r10,40(r4)
err1;	ld	r11,48(r4)
err1;	ld	r12,56(r4)
	addi	r4,r4,64
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
err1;	std	r9,32(r3)
err1;	std	r10,40(r3)
err1;	std	r11,48(r3)
err1;	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
	addi	r4,r4,32
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
	addi	r4,r4,16
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz	r6,4(r4)
	addi	r4,r4,8
err1;	stw	r0,0(r3)
err1;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)

15:	li	r3,0
	blr
258
#ifdef CONFIG_ALTIVEC
/*
 * VMX (Altivec) copy path.
 * In:	r3 = destination, r4 = source, r5 = byte count
 * Out:	r3 = number of bytes NOT copied (0 on success)
 *
 * Saves the original r3/r5 in the frame so the err3/err4 fixups can
 * compute the uncopied remainder, programs data-stream prefetch for
 * both buffers, then copies with 16B vector ops (128B unrolled in the
 * hot loop).  NOTE(review): callers presumably enable Altivec before
 * branching here — confirm against the enter/exit_vmx_usercopy callers.
 */
_GLOBAL(__copy_tofrom_user_power7_vmx)
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)

	std     r3,STK_REG(R31)(r1)
	std     r5,STK_REG(R29)(r1)
	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	DCBT_SETUP_STREAMS(r6, r7, r9, r10, r8)

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,0,r4
err4;	lvx	v6,r4,r9
err4;	lvx	v5,r4,r10
err4;	lvx	v4,r4,r11
err4;	lvx	v3,r4,r12
err4;	lvx	v2,r4,r14
err4;	lvx	v1,r4,r15
err4;	lvx	v0,r4,r16
	addi	r4,r4,128
err4;	stvx	v7,0,r3
err4;	stvx	v6,r3,r9
err4;	stvx	v5,r3,r10
err4;	stvx	v4,r3,r11
err4;	stvx	v3,r3,r12
err4;	stvx	v2,r3,r14
err4;	stvx	v1,r3,r15
err4;	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	li r3,0
	blr
465
/*
 * Source and destination differ in their low 4 address bits: copy with
 * lvx plus a vperm realignment step.  LVS builds the permute control
 * vector from the source alignment; VPERM (endian-aware, see the macro
 * definitions at the top of the file) merges each pair of adjacent
 * 16B loads into one aligned store.  Note r4 runs 16 bytes ahead of
 * the data actually consumed until the tail unwinds it.
 */
.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r7,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	LVS(v16,0,r4)		/* Setup permute control vector */
err3;	lvx	v0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16
	vor	v0,v1,v1

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,0,r4
	VPERM(v8,v0,v7,v16)
err4;	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
err4;	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
err4;	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
err4;	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
err4;	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
err4;	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
err4;	lvx	v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
err4;	stvx	v8,0,r3
err4;	stvx	v9,r3,r9
err4;	stvx	v10,r3,r10
err4;	stvx	v11,r3,r11
err4;	stvx	v12,r3,r12
err4;	stvx	v13,r3,r14
err4;	stvx	v14,r3,r15
err4;	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r6,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	li r3,0
	blr
EXPORT_SYMBOL(__copy_tofrom_user_power7_vmx)
#endif /* CONFIG_ALTIVEC */