xref: /linux/arch/powerpc/lib/copy_32.S (revision 0883c2c06fb5bcf5b9e008270827e63c09a88c1e)
1/*
2 * Memory copy functions for 32-bit PowerPC.
3 *
4 * Copyright (C) 1996-2005 Paul Mackerras.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <asm/processor.h>
12#include <asm/cache.h>
13#include <asm/errno.h>
14#include <asm/ppc_asm.h>
15
/*
 * COPY_16_BYTES: move four words from 4(r4)..16(r4) to 4(r6)..16(r6).
 * All four loads are issued before the four stores.  The last load and
 * the last store are update forms (lwzu/stwu), so r4 and r6 both end up
 * advanced by 16.  Callers keep r4/r6 pointing 4 bytes below the next
 * word to move.  Uses r7-r10 as scratch.
 */
16#define COPY_16_BYTES		\
17	lwz	r7,4(r4);	\
18	lwz	r8,8(r4);	\
19	lwz	r9,12(r4);	\
20	lwzu	r10,16(r4);	\
21	stw	r7,4(r6);	\
22	stw	r8,8(r6);	\
23	stw	r9,12(r6);	\
24	stwu	r10,16(r6)
25
/*
 * COPY_16_BYTES_WITHEX(n): same instruction sequence as COPY_16_BYTES,
 * but every access carries its own numeric label so it can be named in
 * an exception table: 8<n>0..8<n>3 are the four loads, 8<n>4..8<n>7 are
 * the four stores.  The matching table entries and fixup stubs are
 * produced by COPY_16_BYTES_EXCODE(n).
 */
26#define COPY_16_BYTES_WITHEX(n)	\
278 ## n ## 0:			\
28	lwz	r7,4(r4);	\
298 ## n ## 1:			\
30	lwz	r8,8(r4);	\
318 ## n ## 2:			\
32	lwz	r9,12(r4);	\
338 ## n ## 3:			\
34	lwzu	r10,16(r4);	\
358 ## n ## 4:			\
36	stw	r7,4(r6);	\
378 ## n ## 5:			\
38	stw	r8,8(r6);	\
398 ## n ## 6:			\
40	stw	r9,12(r6);	\
418 ## n ## 7:			\
42	stwu	r10,16(r6)
43
/*
 * COPY_16_BYTES_EXCODE(n): fixup side of COPY_16_BYTES_WITHEX(n).
 *
 * Emits two stubs:
 *   9<n>0 - reached for a fault on one of the loads  (8<n>0..8<n>3),
 *           continues at 104f;
 *   9<n>1 - reached for a fault on one of the stores (8<n>4..8<n>7),
 *           continues at 105f.
 * Both first subtract 16*n from r5, i.e. the bytes of the current cache
 * line that the preceding n 16-byte groups already handled.
 *
 * It then switches to __ex_table, emits the eight (faulting address,
 * fixup address) pairs, and switches back to .text.
 */
44#define COPY_16_BYTES_EXCODE(n)			\
459 ## n ## 0:					\
46	addi	r5,r5,-(16 * n);		\
47	b	104f;				\
489 ## n ## 1:					\
49	addi	r5,r5,-(16 * n);		\
50	b	105f;				\
51.section __ex_table,"a";			\
52	.align	2;				\
53	.long	8 ## n ## 0b,9 ## n ## 0b;	\
54	.long	8 ## n ## 1b,9 ## n ## 0b;	\
55	.long	8 ## n ## 2b,9 ## n ## 0b;	\
56	.long	8 ## n ## 3b,9 ## n ## 0b;	\
57	.long	8 ## n ## 4b,9 ## n ## 1b;	\
58	.long	8 ## n ## 5b,9 ## n ## 1b;	\
59	.long	8 ## n ## 6b,9 ## n ## 1b;	\
60	.long	8 ## n ## 7b,9 ## n ## 1b;	\
61	.text
62
63	.text
/*
 * Stabs source-file (N_SO) records giving the directory and the file
 * name; both are anchored at the local label 0: that follows.
 */
64	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
65	.stabs	"copy_32.S",N_SO,0,0,0f
660:
67
/*
 * Cache-line geometry used by the routines below, taken from the
 * L1_CACHE_* definitions:
 *   CACHELINE_BYTES    - size of one line in bytes
 *   LG_CACHELINE_BYTES - used as the shift count that converts between
 *                        bytes and lines
 *   CACHELINE_MASK     - mask selecting the offset within a line
 */
68CACHELINE_BYTES = L1_CACHE_BYTES
69LG_CACHELINE_BYTES = L1_CACHE_SHIFT
70CACHELINE_MASK = (L1_CACHE_BYTES-1)
71
72/*
73 * Use dcbz on the complete cache lines in the destination
74 * to set them to zero.  This requires that the destination
75 * area is cacheable.  -- paulus
76 *
77 * During early init, cache might not be active yet, so dcbz cannot be used.
78 * We therefore skip the optimised block that uses dcbz. This jump is
79 * replaced by a nop once cache is active. This is done in machine_init().
80 */
/*
 * memset, as used below: r3 = destination, r4 = fill value (only the low
 * byte matters), r5 = byte count.
 * r3 is never written, so it still holds the destination on return.
 * Scratch: r0, r4-r9, ctr, cr0.
 *
 * r6 is kept 4 bytes below the next word to store (stwu r4,4(r6)), and
 * later 1 byte below the next byte to store (stbu r4,1(r6)).
 */
81_GLOBAL(memset)
82	rlwimi	r4,r4,8,16,23	/* low halfword = fill byte twice */
83	rlwimi	r4,r4,16,0,15	/* whole word  = fill byte four times */
84
85	addi	r6,r3,-4
86	cmplwi	0,r5,4
87	blt	7f		/* fewer than 4 bytes: byte loop only */
88	stwu	r4,4(r6)	/* first word, at the (possibly unaligned) start */
89	beqlr			/* exactly 4 bytes (cr0 from the cmplwi): done */
90	andi.	r0,r6,3		/* r0 = misalignment of the start address */
91	add	r5,r0,r5	/* round r6 down to a word boundary and ...  */
92	subf	r6,r0,r6	/* ... grow the count by the same amount     */
93	cmplwi	0,r4,0
94	bne	2f	/* Use normal procedure if r4 is not zero */
95_GLOBAL(memset_nocache_branch)
96	b	2f	/* Skip optimised block until cache is enabled */
97
	/* Zero fill with the cache enabled: clear whole lines with dcbz */
98	clrlwi	r7,r6,32-LG_CACHELINE_BYTES	/* r7 = offset of r6 in its line */
99	add	r8,r7,r5
100	srwi	r9,r8,LG_CACHELINE_BYTES
101	addic.	r9,r9,-1	/* total number of complete cachelines */
102	ble	2f		/* none: fall back to the word loop */
103	xori	r0,r7,CACHELINE_MASK & ~3	/* bytes to store so r6+4 hits a line boundary */
104	srwi.	r0,r0,2		/* ... as a word count */
105	beq	3f
106	mtctr	r0
1074:	stwu	r4,4(r6)
108	bdnz	4b
1093:	mtctr	r9
110	li	r7,4		/* dcbz operates on the line at r6+4 */
11110:	dcbz	r7,r6
112	addi	r6,r6,CACHELINE_BYTES
113	bdnz	10b
114	clrlwi	r5,r8,32-LG_CACHELINE_BYTES	/* bytes left after the last full line */
115	addi	r5,r5,4		/* loop at 2: skips one word (bdz before the first store) */
116
	/* Word loop: stores (r5 >> 2) - 1 words, since bdz decrements first */
1172:	srwi	r0,r5,2
118	mtctr	r0
119	bdz	6f
1201:	stwu	r4,4(r6)
121	bdnz	1b
1226:	andi.	r5,r5,3		/* trailing bytes */
1237:	cmpwi	0,r5,0
124	beqlr
125	mtctr	r5
126	addi	r6,r6,3		/* r6 = one byte below the next byte to store */
1278:	stbu	r4,1(r6)
128	bdnz	8b
129	blr
130
131/*
132 * This version uses dcbz on the complete cache lines in the
133 * destination area to reduce memory traffic.  This requires that
134 * the destination area is cacheable.
135 * We only use this version if the source and dest don't overlap.
136 * -- paulus.
137 *
138 * During early init, cache might not be active yet, so dcbz cannot be used.
139 * We therefore jump to generic_memcpy which doesn't use dcbz. This jump is
140 * replaced by a nop once cache is active. This is done in machine_init()
141 */
/*
 * memmove / memcpy, as used below: r3 = destination, r4 = source,
 * r5 = byte count.  r3 is never written on these paths, so it still
 * holds the destination on return.
 * Scratch: r0, r4-r11, ctr, cr0, cr1.
 *
 * memmove hands off to backwards_memcpy when the destination is above
 * the source (unsigned compare) and otherwise falls into memcpy.
 */
142_GLOBAL(memmove)
143	cmplw	0,r3,r4
144	bgt	backwards_memcpy
145	/* fall through */
146
147_GLOBAL(memcpy)
148	b	generic_memcpy		/* dcbz not usable yet; see the comment above */
149	add	r7,r3,r5		/* test if the src & dst overlap */
150	add	r8,r4,r5
151	cmplw	0,r4,r7			/* cr0.lt = src < dst + n */
152	cmplw	1,r3,r8			/* cr1.lt = dst < src + n */
153	crand	0,0,4			/* cr0.lt &= cr1.lt */
154	blt	generic_memcpy		/* if regions overlap */
155
	/* r4/r6 run 4 bytes below the next source/destination word */
156	addi	r4,r4,-4
157	addi	r6,r3,-4
158	neg	r0,r3
159	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
160	beq	58f
161
162	cmplw	0,r5,r0			/* is this more than total to do? */
163	blt	63f			/* if not much to do */
164	andi.	r8,r0,3			/* get it word-aligned first */
165	subf	r5,r0,r5		/* r5 = bytes left once dst is line-aligned */
166	mtctr	r8
167	beq+	61f
16870:	lbz	r9,4(r4)		/* do some bytes */
169	addi	r4,r4,1
170	addi	r6,r6,1
171	stb	r9,3(r6)		/* offset 3: r6 was already bumped */
172	bdnz	70b
17361:	srwi.	r0,r0,2			/* words up to the line boundary */
174	mtctr	r0
175	beq	58f
17672:	lwzu	r9,4(r4)		/* do some words */
177	stwu	r9,4(r6)
178	bdnz	72b
179
18058:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
181	clrlwi	r5,r5,32-LG_CACHELINE_BYTES	/* r5 = bytes after the last full line */
182	li	r11,4			/* dcbz offset: the line starts at r6+4 */
183	mtctr	r0
184	beq	63f
	/* One destination cache line per iteration: dcbz it, then fill it */
18553:
186	dcbz	r11,r6
187	COPY_16_BYTES
188#if L1_CACHE_BYTES >= 32
189	COPY_16_BYTES
190#if L1_CACHE_BYTES >= 64
191	COPY_16_BYTES
192	COPY_16_BYTES
193#if L1_CACHE_BYTES >= 128
194	COPY_16_BYTES
195	COPY_16_BYTES
196	COPY_16_BYTES
197	COPY_16_BYTES
198#endif
199#endif
200#endif
201	bdnz	53b
202
20363:	srwi.	r0,r5,2			/* remaining whole words */
204	mtctr	r0
205	beq	64f
20630:	lwzu	r0,4(r4)
207	stwu	r0,4(r6)
208	bdnz	30b
209
21064:	andi.	r0,r5,3			/* remaining bytes */
211	mtctr	r0
212	beq+	65f
213	addi	r4,r4,3			/* switch to "1 byte below next" for lbzu/stbu */
214	addi	r6,r6,3
21540:	lbzu	r0,1(r4)
216	stbu	r0,1(r6)
217	bdnz	40b
21865:	blr
219
/*
 * generic_memcpy: forward copy that does not use dcbz.
 * As used below: r3 = destination, r4 = source, r5 = byte count.
 * r3 is never written, so it still holds the destination on return.
 * Scratch: r0, r4-r8, ctr, cr0.
 *
 * Copies 8 bytes per iteration once the destination is word aligned,
 * then at most one more word, then the trailing bytes.
 */
220generic_memcpy:
221	srwi.	r7,r5,3			/* r7 = number of 8-byte chunks */
222	addi	r6,r3,-4		/* r4/r6 run 4 bytes below the next word */
223	addi	r4,r4,-4
224	beq	2f			/* if less than 8 bytes to do */
225	andi.	r0,r6,3			/* get dest word aligned */
226	mtctr	r7
227	bne	5f
2281:	lwz	r7,4(r4)
229	lwzu	r8,8(r4)
230	stw	r7,4(r6)
231	stwu	r8,8(r6)
232	bdnz	1b
233	andi.	r5,r5,7			/* bytes left after the 8-byte chunks */
2342:	cmplwi	0,r5,4
235	blt	3f
236	lwzu	r0,4(r4)		/* one more whole word */
237	addi	r5,r5,-4
238	stwu	r0,4(r6)
2393:	cmpwi	0,r5,0
240	beqlr
241	mtctr	r5
242	addi	r4,r4,3			/* switch to "1 byte below next" for lbzu/stbu */
243	addi	r6,r6,3
2444:	lbzu	r0,1(r4)
245	stbu	r0,1(r6)
246	bdnz	4b
247	blr
	/* Destination not word aligned: copy 4 - (dst & 3) single bytes first */
2485:	subfic	r0,r0,4
249	mtctr	r0
2506:	lbz	r7,4(r4)
251	addi	r4,r4,1
252	stb	r7,4(r6)
253	addi	r6,r6,1
254	bdnz	6b
255	subf	r5,r0,r5		/* account for the alignment bytes */
256	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
257	beq	2b			/* fewer than 8 bytes left */
258	mtctr	r7
259	b	1b
260
/*
 * backwards_memcpy: copy from the highest address down to the lowest.
 * As used below: r3 = destination, r4 = source, r5 = byte count.
 * r3 is never written, so it still holds the destination on return.
 * Scratch: r0, r4-r8, ctr, cr0.
 *
 * r4 and r6 start one byte past the end of the source/destination and
 * are pre-decremented by the update-form loads and stores.
 */
261_GLOBAL(backwards_memcpy)
262	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
263	add	r6,r3,r5		/* r6 = end of destination */
264	add	r4,r4,r5		/* r4 = end of source */
265	beq	2f			/* fewer than 8 bytes to do */
266	andi.	r0,r6,3			/* r0 = bytes above the last word boundary */
267	mtctr	r7
268	bne	5f
2691:	lwz	r7,-4(r4)
270	lwzu	r8,-8(r4)
271	stw	r7,-4(r6)
272	stwu	r8,-8(r6)
273	bdnz	1b
274	andi.	r5,r5,7			/* bytes left after the 8-byte chunks */
2752:	cmplwi	0,r5,4
276	blt	3f
277	lwzu	r0,-4(r4)		/* one more whole word */
278	subi	r5,r5,4
279	stwu	r0,-4(r6)
2803:	cmpwi	0,r5,0
281	beqlr
282	mtctr	r5
2834:	lbzu	r0,-1(r4)
284	stbu	r0,-1(r6)
285	bdnz	4b
286	blr
	/* End of destination not word aligned: copy the r0 odd bytes first */
2875:	mtctr	r0
2886:	lbzu	r7,-1(r4)
289	stbu	r7,-1(r6)
290	bdnz	6b
291	subf	r5,r0,r5		/* account for the alignment bytes */
292	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
293	beq	2b			/* fewer than 8 bytes left */
294	mtctr	r7
295	b	1b
296
/*
 * __copy_tofrom_user, as used below: r3 = destination, r4 = source,
 * r5 = byte count.  Returns r3 = 0 when the whole range was copied.
 *
 * Every load and store that may fault carries a numeric label that is
 * listed in __ex_table together with a fixup label; the fixup code
 * further down works out how many bytes were not copied.
 *
 * Layout mirrors the dcbz memcpy path: byte/word copies until the
 * destination is cache-line aligned, then whole lines (dcbz on the
 * destination, dcbt prefetch on the source), then trailing words/bytes.
 * r4/r6 run 4 bytes below the next source/destination location.
 */
297_GLOBAL(__copy_tofrom_user)
298	addi	r4,r4,-4
299	addi	r6,r3,-4
300	neg	r0,r3
301	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
302	beq	58f
303
304	cmplw	0,r5,r0			/* is this more than total to do? */
305	blt	63f			/* if not much to do */
306	andi.	r8,r0,3			/* get it word-aligned first */
307	mtctr	r8
308	beq+	61f
30970:	lbz	r9,4(r4)		/* do some bytes */
31071:	stb	r9,4(r6)
311	addi	r4,r4,1
312	addi	r6,r6,1
313	bdnz	70b
31461:	subf	r5,r0,r5		/* r5 = bytes left once dst is line-aligned */
315	srwi.	r0,r0,2			/* words up to the line boundary */
316	mtctr	r0
317	beq	58f
31872:	lwzu	r9,4(r4)		/* do some words */
31973:	stwu	r9,4(r6)
320	bdnz	72b
321
	/* Fault fixups for the alignment loops above */
322	.section __ex_table,"a"
323	.align	2
324	.long	70b,100f
325	.long	71b,101f
326	.long	72b,102f
327	.long	73b,103f
328	.text
329
33058:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
331	clrlwi	r5,r5,32-LG_CACHELINE_BYTES	/* r5 = bytes after the last full line */
332	li	r11,4			/* dcbz offset: the line starts at r6+4 */
333	beq	63f
334
335	/* Here we decide how far ahead to prefetch the source */
	/* r3 = dcbt offset from r4, r7 = number of lines prefetched ahead */
336	li	r3,4
337	cmpwi	r0,1
338	li	r7,0
339	ble	114f			/* a single line: no read-ahead */
340	li	r7,1
341#if MAX_COPY_PREFETCH > 1
342	/* Heuristically, for large transfers we prefetch
343	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
344	   we prefetch 1 cacheline ahead. */
345	cmpwi	r0,MAX_COPY_PREFETCH
346	ble	112f
347	li	r7,MAX_COPY_PREFETCH
348112:	mtctr	r7
349111:	dcbt	r3,r4
350	addi	r3,r3,CACHELINE_BYTES
351	bdnz	111b
352#else
353	dcbt	r3,r4
354	addi	r3,r3,CACHELINE_BYTES
355#endif /* MAX_COPY_PREFETCH > 1 */
356
	/*
	 * First pass copies r0 - r7 lines; the r7 lines held back in r0 are
	 * copied by a second pass through 114: with r7 = 0 (see the bne
	 * after the loop).
	 */
357114:	subf	r8,r7,r0
358	mr	r0,r7
359	mtctr	r8
360
36153:	dcbt	r3,r4
36254:	dcbz	r11,r6			/* may fault: treated as a write fault (105f) */
363	.section __ex_table,"a"
364	.align	2
365	.long	54b,105f
366	.text
367/* the main body of the cacheline loop */
368	COPY_16_BYTES_WITHEX(0)
369#if L1_CACHE_BYTES >= 32
370	COPY_16_BYTES_WITHEX(1)
371#if L1_CACHE_BYTES >= 64
372	COPY_16_BYTES_WITHEX(2)
373	COPY_16_BYTES_WITHEX(3)
374#if L1_CACHE_BYTES >= 128
375	COPY_16_BYTES_WITHEX(4)
376	COPY_16_BYTES_WITHEX(5)
377	COPY_16_BYTES_WITHEX(6)
378	COPY_16_BYTES_WITHEX(7)
379#endif
380#endif
381#endif
382	bdnz	53b
383	cmpwi	r0,0			/* any lines held back for the second pass? */
384	li	r3,4
385	li	r7,0
386	bne	114b
387
38863:	srwi.	r0,r5,2			/* remaining whole words */
389	mtctr	r0
390	beq	64f
39130:	lwzu	r0,4(r4)
39231:	stwu	r0,4(r6)
393	bdnz	30b
394
39564:	andi.	r0,r5,3			/* remaining bytes */
396	mtctr	r0
397	beq+	65f
39840:	lbz	r0,4(r4)
39941:	stb	r0,4(r6)
400	addi	r4,r4,1
401	addi	r6,r6,1
402	bdnz	40b
40365:	li	r3,0			/* success: nothing left uncopied */
404	blr
405
/*
 * Out-of-line fault fixups for __copy_tofrom_user.
 *
 * Each entry point records the kind of fault in r9 (0 = load, 1 = store)
 * and loads r3 with the shift that turns the loop counter into bytes
 * (0 for byte loops, 2 for word loops, LG_CACHELINE_BYTES for the
 * cache-line loop), adjusting r5 where needed.  The common tail at
 * 99:/106: then forms the residual count in r3.
 */
406/* read fault, initial single-byte copy */
407100:	li	r9,0
408	b	90f
409/* write fault, initial single-byte copy */
410101:	li	r9,1
41190:	subf	r5,r8,r5		/* drop the r8 bytes planned for that loop; ctr adds back the pending ones */
412	li	r3,0
413	b	99f
414/* read fault, initial word copy */
415102:	li	r9,0
416	b	91f
417/* write fault, initial word copy */
418103:	li	r9,1
41991:	li	r3,2			/* ctr counts words */
420	b	99f
421
422/*
423 * this stuff handles faults in the cacheline loop and branches to either
424 * 104f (if in read part) or 105f (if in write part), after updating r5
425 */
426	COPY_16_BYTES_EXCODE(0)
427#if L1_CACHE_BYTES >= 32
428	COPY_16_BYTES_EXCODE(1)
429#if L1_CACHE_BYTES >= 64
430	COPY_16_BYTES_EXCODE(2)
431	COPY_16_BYTES_EXCODE(3)
432#if L1_CACHE_BYTES >= 128
433	COPY_16_BYTES_EXCODE(4)
434	COPY_16_BYTES_EXCODE(5)
435	COPY_16_BYTES_EXCODE(6)
436	COPY_16_BYTES_EXCODE(7)
437#endif
438#endif
439#endif
440
441/* read fault in cacheline loop */
442104:	li	r9,0
443	b	92f
444/* fault on dcbz (effectively a write fault) */
445/* or write fault in cacheline loop */
446105:	li	r9,1
44792:	li	r3,LG_CACHELINE_BYTES	/* counter is in cache lines */
448	mfctr	r8
449	add	r0,r0,r8		/* lines held back in r0 + lines left in ctr */
450	b	106f			/* r0 already holds the count: skip the mfctr */
451/* read fault in final word loop */
452108:	li	r9,0
453	b	93f
454/* write fault in final word loop */
455109:	li	r9,1
45693:	andi.	r5,r5,3			/* bytes that were to follow the word loop */
457	li	r3,2
458	b	99f
459/* read fault in final byte loop */
460110:	li	r9,0
461	b	94f
462/* write fault in final byte loop */
463111:	li	r9,1
46494:	li	r5,0
465	li	r3,0
466/*
467 * At this stage the number of bytes not copied is
468 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
469 */
47099:	mfctr	r0
471106:	slw	r3,r0,r3
472	add.	r3,r3,r5		/* r3 = bytes not copied */
473	beq	120f			/* shouldn't happen */
474	cmpwi	0,r9,0
475	bne	120f			/* write fault: return the residual as is */
476/* for a read fault, first try to continue the copy one byte at a time */
477	mtctr	r3
478130:	lbz	r0,4(r4)
479131:	stb	r0,4(r6)
480	addi	r4,r4,1
481	addi	r6,r6,1
482	bdnz	130b
483/* then clear out the destination: r3 bytes starting at 4(r6) */
484132:	mfctr	r3			/* bytes still uncopied; this is the return value */
485	srwi.	r0,r3,2			/* zero whole words first ... */
486	li	r9,0
487	mtctr	r0
488	beq	113f
489112:	stwu	r9,4(r6)
490	bdnz	112b
491113:	andi.	r0,r3,3			/* ... then the remaining bytes */
492	mtctr	r0
493	beq	120f
494114:	stb	r9,4(r6)
495	addi	r6,r6,1
496	bdnz	114b
497120:	blr
498
	/*
	 * Remaining exception-table entries: the final word/byte loops of
	 * the main path (30, 31, 40, 41) and the accesses made by the fixup
	 * code itself.  A second load fault at 130 goes on to the clearing
	 * code at 132; a fault on any store made here just returns via 120.
	 */
499	.section __ex_table,"a"
500	.align	2
501	.long	30b,108b
502	.long	31b,109b
503	.long	40b,110b
504	.long	41b,111b
505	.long	130b,132b
506	.long	131b,120b
507	.long	112b,120b
508	.long	114b,120b
509	.text
510