xref: /linux/arch/powerpc/lib/copyuser_64.S (revision b233b28eac0cc37d07c2d007ea08c86c778c5af4)
/*
 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/ppc_asm.h>

	.align	7
_GLOBAL(__copy_tofrom_user)
	/* first check for a whole page copy on a page boundary */
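	/*
	 * r3 = dest, r4 = src, r5 = len.  The cmpdi/andi./crand
	 * sequence folds "len == 4096" (cr6) and "both addresses
	 * page-aligned" (cr0) into cr0, so a single beq below can
	 * take the whole-page fast path.  The original arguments are
	 * stashed below the stack pointer so that the exception
	 * handlers can work out how much was left uncopied.
	 */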
	cmpldi	cr1,r5,16
	cmpdi	cr6,r5,4096
	or	r0,r3,r4
	neg	r6,r3		/* LS 3 bits = # bytes to 8-byte dest bdry */
	andi.	r0,r0,4095
	std	r3,-24(r1)
	crand	cr0*4+2,cr0*4+2,cr6*4+2
	std	r4,-16(r1)
	std	r5,-8(r1)
	dcbt	0,r4
	beq	.Lcopy_page_4K
	andi.	r6,r6,7
	PPC_MTOCRF	0x01,r5		/* put low 4 bits of len into cr7 */
	blt	cr1,.Lshort_copy
/* Below we want to nop out the bne if we're on a CPU that has the
 * CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
 * cleared.
 * At the time of writing the only CPU that has this combination of bits
 * set is Power6.
 */
BEGIN_FTR_SECTION
	nop
FTR_SECTION_ELSE
	bne	.Ldst_unaligned
ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
		    CPU_FTR_UNALIGNED_LD_STD)
.Ldst_aligned:
	addi	r3,r3,-16
BEGIN_FTR_SECTION
	andi.	r0,r4,7
	bne	.Lsrc_unaligned
END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
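/*
 * Here both pointers are 8-byte aligned (or this CPU copes with
 * unaligned ld/std).  The loop below moves 16 bytes per iteration,
 * software-pipelined so that each store uses a value loaded on the
 * previous iteration; cr7 bit 0 (len & 8) decides where the loop is
 * entered, and blt cr1 catches lengths under 16.
 */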
	srdi	r7,r5,4
20:	ld	r9,0(r4)
	addi	r4,r4,-8
	mtctr	r7
	andi.	r5,r5,7
	bf	cr7*4+0,22f
	addi	r3,r3,8
	addi	r4,r4,8
	mr	r8,r9
	blt	cr1,72f
21:	ld	r9,8(r4)
70:	std	r8,8(r3)
22:	ldu	r8,16(r4)
71:	stdu	r9,16(r3)
	bdnz	21b
72:	std	r8,8(r3)
	beq+	3f
	addi	r3,r3,16
23:	ld	r9,8(r4)
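/*
 * Tail of 1-7 bytes: the bytes still to go sit in the high-order end
 * of r9 (this code assumes big-endian), so store the word, halfword
 * and byte pieces from most to least significant, rotating each one
 * down into the low end of r9 first.  cr7 still holds the low bits
 * of the length.
 */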
.Ldo_tail:
	bf	cr7*4+1,1f
	rotldi	r9,r9,32
73:	stw	r9,0(r3)
	addi	r3,r3,4
1:	bf	cr7*4+2,2f
	rotldi	r9,r9,16
74:	sth	r9,0(r3)
	addi	r3,r3,2
2:	bf	cr7*4+3,3f
	rotldi	r9,r9,8
75:	stb	r9,0(r3)
3:	li	r3,0
	blr

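/*
 * Source not 8-byte aligned (and this CPU wants aligned ld/std).
 * r0 = src & 7.  Round the source pointer down to an 8-byte boundary
 * and build each destination doubleword from two adjacent source
 * doublewords: shift one left by r10 = 8*r0 bits, the next right by
 * r11 = 64 - 8*r0 bits, and or the halves together.  The 24: and 28:
 * entry points handle the two loop phases (note the 3+2n versus
 * 4+2n load counts in the comments below).
 */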
.Lsrc_unaligned:
	srdi	r6,r5,3
	addi	r5,r5,-16
	subf	r4,r0,r4
	srdi	r7,r5,4
	sldi	r10,r0,3
	cmpldi	cr6,r6,3
	andi.	r5,r5,7
	mtctr	r7
	subfic	r11,r10,64
	add	r5,r5,r0
	bt	cr7*4+0,28f

24:	ld	r9,0(r4)	/* 3+2n loads, 2+2n stores */
25:	ld	r0,8(r4)
	sld	r6,r9,r10
26:	ldu	r9,16(r4)
	srd	r7,r0,r11
	sld	r8,r0,r10
	or	r7,r7,r6
	blt	cr6,79f
27:	ld	r0,8(r4)
	b	2f

28:	ld	r0,0(r4)	/* 4+2n loads, 3+2n stores */
29:	ldu	r9,8(r4)
	sld	r8,r0,r10
	addi	r3,r3,-8
	blt	cr6,5f
30:	ld	r0,8(r4)
	srd	r12,r9,r11
	sld	r6,r9,r10
31:	ldu	r9,16(r4)
	or	r12,r8,r12
	srd	r7,r0,r11
	sld	r8,r0,r10
	addi	r3,r3,16
	beq	cr6,78f

1:	or	r7,r7,r6
32:	ld	r0,8(r4)
76:	std	r12,8(r3)
2:	srd	r12,r9,r11
	sld	r6,r9,r10
33:	ldu	r9,16(r4)
	or	r12,r8,r12
77:	stdu	r7,16(r3)
	srd	r7,r0,r11
	sld	r8,r0,r10
	bdnz	1b

78:	std	r12,8(r3)
	or	r7,r7,r6
79:	std	r7,16(r3)
5:	srd	r12,r9,r11
	or	r12,r8,r12
80:	std	r12,24(r3)
	bne	6f
	li	r3,0
	blr
6:	cmpwi	cr1,r5,8
	addi	r3,r3,32
	sld	r9,r9,r10
	ble	cr1,.Ldo_tail
34:	ld	r0,8(r4)
	srd	r7,r0,r11
	or	r9,r7,r9
	b	.Ldo_tail

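/*
 * Destination not 8-byte aligned: copy r6 = 1-7 bytes (byte, halfword
 * and/or word as the low bits of r6 in cr7 dictate) up to the
 * boundary, tracking the offset in r7 for the exception handlers,
 * then rejoin the aligned path with the reduced length.
 */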
.Ldst_unaligned:
	PPC_MTOCRF	0x01,r6		/* put #bytes to 8B bdry into cr7 */
	subf	r5,r6,r5
	li	r7,0
	cmpldi	cr1,r5,16
	bf	cr7*4+3,1f
35:	lbz	r0,0(r4)
81:	stb	r0,0(r3)
	addi	r7,r7,1
1:	bf	cr7*4+2,2f
36:	lhzx	r0,r7,r4
82:	sthx	r0,r7,r3
	addi	r7,r7,2
2:	bf	cr7*4+1,3f
37:	lwzx	r0,r7,r4
83:	stwx	r0,r7,r3
3:	PPC_MTOCRF	0x01,r5
	add	r4,r6,r4
	add	r3,r6,r3
	b	.Ldst_aligned

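/*
 * Copies shorter than 16 bytes: cr7 holds the low four bits of the
 * length, so copy 8, then 4, then 2, then 1 bytes as the
 * corresponding bits are set.
 */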
.Lshort_copy:
	bf	cr7*4+0,1f
38:	lwz	r0,0(r4)
39:	lwz	r9,4(r4)
	addi	r4,r4,8
84:	stw	r0,0(r3)
85:	stw	r9,4(r3)
	addi	r3,r3,8
1:	bf	cr7*4+1,2f
40:	lwz	r0,0(r4)
	addi	r4,r4,4
86:	stw	r0,0(r3)
	addi	r3,r3,4
2:	bf	cr7*4+2,3f
41:	lhz	r0,0(r4)
	addi	r4,r4,2
87:	sth	r0,0(r3)
	addi	r3,r3,2
3:	bf	cr7*4+3,4f
42:	lbz	r0,0(r4)
88:	stb	r0,0(r3)
4:	li	r3,0
	blr

/*
 * exception handlers follow
 * we have to return the number of bytes not copied
 * for an exception on a load, we set the rest of the destination to 0
 */

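/*
 * The convention (see the __ex_table below) is that the fixup for a
 * faulting label NN: is label 1NN:.  The fixup entry points
 * deliberately fall through one another, so that each group of
 * faulting points accumulates the right addi adjustments and r3 ends
 * up pointing at the first destination byte that was not modified.
 */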
136:
137:
	add	r3,r3,r7
	b	1f
130:
131:
	addi	r3,r3,8
120:
122:
124:
125:
126:
127:
128:
129:
133:
	addi	r3,r3,8
121:
132:
	addi	r3,r3,8
123:
134:
135:
138:
139:
140:
141:
142:

/*
 * here we have had a fault on a load and r3 points to the first
 * unmodified byte of the destination
 */
1:	ld	r6,-24(r1)	/* original destination */
	ld	r4,-16(r1)	/* original source */
	ld	r5,-8(r1)	/* original length */
	subf	r6,r6,r3	/* #bytes already copied */
	add	r4,r4,r6
	subf	r5,r6,r5	/* #bytes left to go */

/*
 * first see if we can copy any more bytes before hitting another exception
 */
	mtctr	r5
43:	lbz	r0,0(r4)
	addi	r4,r4,1
89:	stb	r0,0(r3)
	addi	r3,r3,1
	bdnz	43b
	li	r3,0		/* huh? all copied successfully this time? */
	blr

/*
 * here we have trapped again, need to clear ctr bytes starting at r3
 */
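/*
 * ctr now holds the number of bytes that could not be copied, which
 * is also the return value.  Zero the destination a byte at a time
 * up to an 8-byte boundary, then by doublewords, then the remaining
 * tail bytes.
 */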
143:	mfctr	r5
	li	r0,0
	mr	r4,r3
	mr	r3,r5		/* return the number of bytes not copied */
1:	andi.	r9,r4,7
	beq	3f
90:	stb	r0,0(r4)
	addic.	r5,r5,-1
	addi	r4,r4,1
	bne	1b
	blr
3:	cmpldi	cr1,r5,8
	srdi	r9,r5,3
	andi.	r5,r5,7
	blt	cr1,93f
	mtctr	r9
91:	std	r0,0(r4)
	addi	r4,r4,8
	bdnz	91b
93:	beqlr
	mtctr	r5
92:	stb	r0,0(r4)
	addi	r4,r4,1
	bdnz	92b
	blr

/*
 * exception handlers for stores: we just need to work
 * out how many bytes weren't copied
 */
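/*
 * Same labelling convention as the load fixups: the handler for a
 * faulting store NN: is 1NN:, and the addi cascade adjusts r3 to the
 * first byte not stored before the remainder is computed from the
 * saved arguments.
 */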
182:
183:
	add	r3,r3,r7
	b	1f
180:
	addi	r3,r3,8
171:
177:
	addi	r3,r3,8
170:
172:
176:
178:
	addi	r3,r3,4
185:
	addi	r3,r3,4
173:
174:
175:
179:
181:
184:
186:
187:
188:
189:
1:
	ld	r6,-24(r1)	/* original destination */
	ld	r5,-8(r1)	/* original length */
	add	r6,r6,r5	/* r6 = end of destination */
	subf	r3,r3,r6	/* #bytes not copied */
190:
191:
192:
	blr			/* #bytes not copied in r3 */

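/*
 * The exception table: each entry pairs the address of a faulting
 * load or store above with the address of its fixup code.
 */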
	.section __ex_table,"a"
	.align	3
	.llong	20b,120b
	.llong	21b,121b
	.llong	70b,170b
	.llong	22b,122b
	.llong	71b,171b
	.llong	72b,172b
	.llong	23b,123b
	.llong	73b,173b
	.llong	74b,174b
	.llong	75b,175b
	.llong	24b,124b
	.llong	25b,125b
	.llong	26b,126b
	.llong	27b,127b
	.llong	28b,128b
	.llong	29b,129b
	.llong	30b,130b
	.llong	31b,131b
	.llong	32b,132b
	.llong	76b,176b
	.llong	33b,133b
	.llong	77b,177b
	.llong	78b,178b
	.llong	79b,179b
	.llong	80b,180b
	.llong	34b,134b
	.llong	35b,135b
	.llong	81b,181b
	.llong	36b,136b
	.llong	82b,182b
	.llong	37b,137b
	.llong	83b,183b
	.llong	38b,138b
	.llong	39b,139b
	.llong	84b,184b
	.llong	85b,185b
	.llong	40b,140b
	.llong	86b,186b
	.llong	41b,141b
	.llong	87b,187b
	.llong	42b,142b
	.llong	88b,188b
	.llong	43b,143b
	.llong	89b,189b
	.llong	90b,190b
	.llong	91b,191b
	.llong	92b,192b

	.text

/*
 * Routine to copy a whole page of data, optimized for POWER4.
 * On POWER4 it is more than 50% faster than the simple loop
 * above (following the .Ldst_aligned label) but it runs slightly
 * slower on POWER3.
 */
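/*
 * The main loop below reads six streams of loads spaced 128 bytes
 * apart (offsets 0, 128, ..., 640 from r4), evidently so that
 * several cache-line fetches can be in flight at once, and it is
 * software-pipelined so the stores of one group overlap the loads of
 * the next.  r20-r31 are saved in the area below the stack pointer
 * that the ABI leaves usable without allocating a frame.
 */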
.Lcopy_page_4K:
	std	r31,-32(r1)
	std	r30,-40(r1)
	std	r29,-48(r1)
	std	r28,-56(r1)
	std	r27,-64(r1)
	std	r26,-72(r1)
	std	r25,-80(r1)
	std	r24,-88(r1)
	std	r23,-96(r1)
	std	r22,-104(r1)
	std	r21,-112(r1)
	std	r20,-120(r1)
	li	r5,4096/32 - 1
	addi	r3,r3,-8
	li	r0,5
0:	addi	r5,r5,-24
	mtctr	r0
20:	ld	r22,640(r4)
21:	ld	r21,512(r4)
22:	ld	r20,384(r4)
23:	ld	r11,256(r4)
24:	ld	r9,128(r4)
25:	ld	r7,0(r4)
26:	ld	r25,648(r4)
27:	ld	r24,520(r4)
28:	ld	r23,392(r4)
29:	ld	r10,264(r4)
30:	ld	r8,136(r4)
31:	ldu	r6,8(r4)
	cmpwi	r5,24
1:
32:	std	r22,648(r3)
33:	std	r21,520(r3)
34:	std	r20,392(r3)
35:	std	r11,264(r3)
36:	std	r9,136(r3)
37:	std	r7,8(r3)
38:	ld	r28,648(r4)
39:	ld	r27,520(r4)
40:	ld	r26,392(r4)
41:	ld	r31,264(r4)
42:	ld	r30,136(r4)
43:	ld	r29,8(r4)
44:	std	r25,656(r3)
45:	std	r24,528(r3)
46:	std	r23,400(r3)
47:	std	r10,272(r3)
48:	std	r8,144(r3)
49:	std	r6,16(r3)
50:	ld	r22,656(r4)
51:	ld	r21,528(r4)
52:	ld	r20,400(r4)
53:	ld	r11,272(r4)
54:	ld	r9,144(r4)
55:	ld	r7,16(r4)
56:	std	r28,664(r3)
57:	std	r27,536(r3)
58:	std	r26,408(r3)
59:	std	r31,280(r3)
60:	std	r30,152(r3)
61:	stdu	r29,24(r3)
62:	ld	r25,664(r4)
63:	ld	r24,536(r4)
64:	ld	r23,408(r4)
65:	ld	r10,280(r4)
66:	ld	r8,152(r4)
67:	ldu	r6,24(r4)
	bdnz	1b
68:	std	r22,648(r3)
69:	std	r21,520(r3)
70:	std	r20,392(r3)
71:	std	r11,264(r3)
72:	std	r9,136(r3)
73:	std	r7,8(r3)
74:	addi	r4,r4,640
75:	addi	r3,r3,648
	bge	0b
	mtctr	r5
76:	ld	r7,0(r4)
77:	ld	r8,8(r4)
78:	ldu	r9,16(r4)
3:
79:	ld	r10,8(r4)
80:	std	r7,8(r3)
81:	ld	r7,16(r4)
82:	std	r8,16(r3)
83:	ld	r8,24(r4)
84:	std	r9,24(r3)
85:	ldu	r9,32(r4)
86:	stdu	r10,32(r3)
	bdnz	3b
4:
87:	ld	r10,8(r4)
88:	std	r7,8(r3)
89:	std	r8,16(r3)
90:	std	r9,24(r3)
91:	std	r10,32(r3)
9:	ld	r20,-120(r1)
	ld	r21,-112(r1)
	ld	r22,-104(r1)
	ld	r23,-96(r1)
	ld	r24,-88(r1)
	ld	r25,-80(r1)
	ld	r26,-72(r1)
	ld	r27,-64(r1)
	ld	r28,-56(r1)
	ld	r29,-48(r1)
	ld	r30,-40(r1)
	ld	r31,-32(r1)
	li	r3,0
	blr

/*
 * on an exception, reset to the beginning and jump back into the
 * standard __copy_tofrom_user
 */
100:	ld	r20,-120(r1)
	ld	r21,-112(r1)
	ld	r22,-104(r1)
	ld	r23,-96(r1)
	ld	r24,-88(r1)
	ld	r25,-80(r1)
	ld	r26,-72(r1)
	ld	r27,-64(r1)
	ld	r28,-56(r1)
	ld	r29,-48(r1)
	ld	r30,-40(r1)
	ld	r31,-32(r1)
	ld	r3,-24(r1)
	ld	r4,-16(r1)
	li	r5,4096
	b	.Ldst_aligned

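/*
 * Every load and store in the page-copy code shares the single fixup
 * at 100: above, which restores the saved registers and original
 * arguments and retries the copy via the standard, exception-safe
 * path.
 */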
	.section __ex_table,"a"
	.align	3
	.llong	20b,100b
	.llong	21b,100b
	.llong	22b,100b
	.llong	23b,100b
	.llong	24b,100b
	.llong	25b,100b
	.llong	26b,100b
	.llong	27b,100b
	.llong	28b,100b
	.llong	29b,100b
	.llong	30b,100b
	.llong	31b,100b
	.llong	32b,100b
	.llong	33b,100b
	.llong	34b,100b
	.llong	35b,100b
	.llong	36b,100b
	.llong	37b,100b
	.llong	38b,100b
	.llong	39b,100b
	.llong	40b,100b
	.llong	41b,100b
	.llong	42b,100b
	.llong	43b,100b
	.llong	44b,100b
	.llong	45b,100b
	.llong	46b,100b
	.llong	47b,100b
	.llong	48b,100b
	.llong	49b,100b
	.llong	50b,100b
	.llong	51b,100b
	.llong	52b,100b
	.llong	53b,100b
	.llong	54b,100b
	.llong	55b,100b
	.llong	56b,100b
	.llong	57b,100b
	.llong	58b,100b
	.llong	59b,100b
	.llong	60b,100b
	.llong	61b,100b
	.llong	62b,100b
	.llong	63b,100b
	.llong	64b,100b
	.llong	65b,100b
	.llong	66b,100b
	.llong	67b,100b
	.llong	68b,100b
	.llong	69b,100b
	.llong	70b,100b
	.llong	71b,100b
	.llong	72b,100b
	.llong	73b,100b
	.llong	74b,100b
	.llong	75b,100b
	.llong	76b,100b
	.llong	77b,100b
	.llong	78b,100b
	.llong	79b,100b
	.llong	80b,100b
	.llong	81b,100b
	.llong	82b,100b
	.llong	83b,100b
	.llong	84b,100b
	.llong	85b,100b
	.llong	86b,100b
	.llong	87b,100b
	.llong	88b,100b
	.llong	89b,100b
	.llong	90b,100b
	.llong	91b,100b
588