xref: /linux/arch/powerpc/lib/copy_32.S (revision 4b132aacb0768ac1e652cf517097ea6f237214b9)
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 */
7#include <linux/export.h>
8#include <asm/processor.h>
9#include <asm/cache.h>
10#include <asm/errno.h>
11#include <asm/ppc_asm.h>
12#include <asm/code-patching-asm.h>
13#include <asm/kasan.h>
14
/*
 * COPY_16_BYTES - copy four words (16 bytes) from 4(r4) to 4(r6).
 *
 * r4 and r6 each point 4 bytes below the next word to transfer.  All four
 * words are loaded before any is stored.  The last load and the last store
 * use the update forms (lwzu/stwu), so both pointers advance by 16.
 * Clobbers r7-r10.
 */
#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)
24
/*
 * COPY_16_BYTES_WITHEX(n) - same instruction sequence as COPY_16_BYTES, but
 * every memory access gets its own numeric label so that an exception table
 * entry can refer to it.
 *
 * Labels are built by token pasting: 8<n>0 .. 8<n>3 mark the four loads and
 * 8<n>4 .. 8<n>7 mark the four stores.  COPY_16_BYTES_EXCODE(n) supplies the
 * matching fixups.  Clobbers r7-r10; r4 and r6 advance by 16.
 */
#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)
42
/*
 * COPY_16_BYTES_EXCODE(n) - fixup code and exception table entries for the
 * accesses labelled by COPY_16_BYTES_WITHEX(n).
 *
 *   9<n>0: fixup for the four loads  (8<n>0 .. 8<n>3), continues at 104f
 *   9<n>1: fixup for the four stores (8<n>4 .. 8<n>7), continues at 105f
 *
 * Both fixups first subtract 16 * n from r5.  Each preceding 16-byte group
 * has already advanced r4/r6 by 16, so this keeps the outstanding byte count
 * in step with the pointers.
 * Must be expanded after the corresponding COPY_16_BYTES_WITHEX(n), because
 * the EX_TABLE entries use backward ("b") label references.
 */
#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
	EX_TABLE(8 ## n ## 0b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 1b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 2b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 3b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 4b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 5b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 6b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 7b,9 ## n ## 1b)
58
59	.text
60
61CACHELINE_BYTES = L1_CACHE_BYTES
62LG_CACHELINE_BYTES = L1_CACHE_SHIFT
63CACHELINE_MASK = (L1_CACHE_BYTES-1)
64
#ifndef CONFIG_KASAN
/*
 * memset16 - store a 16-bit value into r5 consecutive halfwords.
 *
 * In:   r3 = destination, r4 = value (low 16 bits), r5 = halfword count
 * Out:  r3 is never written, so the destination pointer is returned
 * Uses: r0, r4, r6, ctr, cr0
 *
 * Pairs of halfwords are written with word stores; an odd count is
 * finished with a single halfword store.
 */
_GLOBAL(memset16)
	rlwinm.	r0 ,r5, 31, 1, 31	/* r0 = r5 >> 1: number of full words */
	addi	r6, r3, -4		/* bias the pointer for stwu 4(r6) */
	beq-	2f			/* no full word to store */
	rlwimi	r4 ,r4 ,16 ,0 ,15	/* copy the low halfword into the high halfword */
	mtctr	r0
1:	stwu	r4, 4(r6)
	bdnz	1b
2:	andi.	r0, r5, 1		/* odd count: one halfword left */
	beqlr
	sth	r4, 4(r6)
	blr
EXPORT_SYMBOL(memset16)
#endif
80
/*
 * memset - fill r5 bytes starting at r3 with the low byte of r4.
 *
 * In:   r3 = destination, r4 = fill value, r5 = length in bytes
 * Out:  r3 is never written, so the destination pointer is returned
 * Uses: r0, r4-r9, ctr, cr0
 *
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 *
 * During early init, cache might not be active yet, so dcbz cannot be used.
 * We therefore skip the optimised block that uses dcbz. This jump is
 * replaced by a 'bne' once cache is active, so that only a zero fill value
 * reaches the dcbz path. This is done in machine_init()
 */
_GLOBAL_KASAN(memset)
	cmplwi	0,r5,4
	blt	7f			/* fewer than 4 bytes: plain byte loop */

	rlwimi	r4,r4,8,16,23		/* replicate the low byte into the next byte */
	rlwimi	r4,r4,16,0,15		/* replicate the low halfword: r4 = byte x 4 */

	stw	r4,0(r3)		/* first 4 bytes, stored at the start address as given */
	beqlr				/* length was exactly 4 (cr0 from cmplwi above) */
	andi.	r0,r3,3			/* r0 = offset of r3 within its word */
	add	r5,r0,r5		/* r5 = bytes to cover counted from r6 */
	subf	r6,r0,r3		/* r6 = r3 rounded down to a word boundary */
	cmplwi	0,r4,0			/* is the fill value zero? (for the patched branch) */
	/*
	 * Skip optimised bloc until cache is enabled. Will be replaced
	 * by 'bne' during boot to use normal procedure if r4 is not zero
	 */
5:	b	2f
	patch_site	5b, patch__memset_nocache

	clrlwi	r7,r6,32-LG_CACHELINE_BYTES	/* r7 = offset of r6 within its cache line */
	add	r8,r7,r5		/* r8 = end offset relative to that line's start */
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1	/* total number of complete cachelines */
	ble	2f			/* none: use the word loop */
	xori	r0,r7,CACHELINE_MASK & ~3	/* r7 is a multiple of 4: r0 = (CACHELINE_BYTES - 4) - r7 */
	srwi.	r0,r0,2			/* words needed to reach the end of the first line */
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)
	bdnz	4b
3:	mtctr	r9
	li	r7,4			/* r6 + 4 is the start of the next line */
10:	dcbz	r7,r6			/* zero one complete cache line */
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES	/* bytes left in the final partial line */
	addi	r5,r5,4			/* +4 makes up for the word skipped by the bdz below */

2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f			/* the word at r6 is already done: store (r5 / 4) - 1 words */
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3			/* trailing bytes */
	beqlr
	mtctr	r5
	addi	r6,r6,3			/* point at the last byte written, for stbu 1(r6) */
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

7:	cmpwi	0,r5,0			/* short fill: 0..3 bytes */
	beqlr
	mtctr	r5
	addi	r6,r3,-1
9:	stbu	r4,1(r6)
	bdnz	9b
	blr
EXPORT_SYMBOL(memset)
EXPORT_SYMBOL_KASAN(memset)
151
/*
 * memmove / memcpy - copy r5 bytes from r4 to r3.
 *
 * In:   r3 = destination, r4 = source, r5 = length in bytes
 * Out:  the code below never writes r3, so the destination pointer is
 *       still in r3 at the blr
 * Uses: r0, r4-r11, ctr, cr0, cr1
 *
 * memmove branches to backwards_memcpy when the destination address is
 * above the source address (unsigned compare); otherwise it falls through
 * into memcpy.
 *
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 *
 * During early init, cache might not be active yet, so dcbz cannot be used.
 * We therefore jump to generic_memcpy which doesn't use dcbz. This jump is
 * replaced by a nop once cache is active. This is done in machine_init()
 */
_GLOBAL_KASAN(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

_GLOBAL_KASAN(memcpy)
1:	b	generic_memcpy
	patch_site	1b, patch__memcpy_nocache

	add	r7,r3,r5		/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7			/* cr0.lt = src < dst + len */
	cmplw	1,r3,r8			/* cr1.lt = dst < src + len */
	crand	0,0,4			/* cr0.lt &= cr1.lt */
	blt	generic_memcpy		/* if regions overlap */

	addi	r4,r4,-4		/* bias both pointers for the 4(rN) update forms */
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f			/* destination already line aligned */

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	subf	r5,r0,r5		/* r5 = bytes left once the line boundary is reached */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	addi	r6,r6,1
	stb	r9,3(r6)		/* offset 3 because r6 was already incremented */
	bdnz	70b
61:	srwi.	r0,r0,2			/* whole words up to the line boundary */
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES	/* r5 = bytes left after those lines */
	li	r11,4
	mtctr	r0
	beq	63f
53:
	dcbz	r11,r6			/* zero the destination line at r6 + 4 before filling it */
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

63:	srwi.	r0,r5,2			/* remaining whole words */
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3			/* remaining bytes */
	mtctr	r0
	beq+	65f
	addi	r4,r4,3			/* re-bias both pointers for the 1(rN) update forms */
	addi	r6,r6,3
40:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	40b
65:	blr
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(memmove)
EXPORT_SYMBOL_KASAN(memcpy)
EXPORT_SYMBOL_KASAN(memmove)
246
/*
 * generic_memcpy - forward copy of r5 bytes from r4 to r3 without dcbz.
 *
 * In:   r3 = destination, r4 = source, r5 = length in bytes
 * Out:  r3 is never written, so the destination pointer is returned
 * Uses: r0, r4-r8, ctr, cr0
 *
 * Copies 8 bytes per iteration once the destination is word aligned, then
 * at most one further word, then the remaining bytes.
 */
generic_memcpy:
	srwi.	r7,r5,3			/* r7 = number of 8-byte chunks */
	addi	r6,r3,-4		/* bias both pointers for the 4(rN) update forms */
	addi	r4,r4,-4
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f			/* misaligned: copy leading bytes first */
1:	lwz	r7,4(r4)		/* 8 bytes per iteration */
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7			/* bytes left over after the 8-byte chunks */
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)		/* one more whole word */
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3			/* re-bias both pointers for the 1(rN) update forms */
	addi	r6,r6,3
4:	lbzu	r0,1(r4)		/* trailing bytes */
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4			/* r0 = bytes needed to word-align the destination */
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5		/* account for the bytes just copied */
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	beq	2b			/* fewer than 8 bytes left */
	mtctr	r7
	b	1b
287
/*
 * backwards_memcpy - copy r5 bytes from r4 to r3, working from the end of
 * both buffers towards the start.
 *
 * In:   r3 = destination, r4 = source, r5 = length in bytes
 * Out:  r3 is never written, so the destination pointer is returned
 * Uses: r0, r4-r8, ctr, cr0
 *
 * Copies 8 bytes per iteration once the end of the destination is word
 * aligned, then at most one further word, then the remaining bytes.
 */
_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5		/* r6 = one past the end of the destination */
	add	r4,r4,r5		/* r4 = one past the end of the source */
	beq	2f			/* fewer than 8 bytes to do */
	andi.	r0,r6,3			/* r0 = bytes above the last word boundary */
	mtctr	r7
	bne	5f			/* misaligned end: copy trailing bytes first */
1:	lwz	r7,-4(r4)		/* 8 bytes per iteration, descending */
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7			/* bytes left over after the 8-byte chunks */
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)		/* one more whole word */
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)		/* remaining bytes */
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)		/* bring the destination end down to a word boundary */
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5		/* account for the bytes just copied */
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	beq	2b			/* fewer than 8 bytes left */
	mtctr	r7
	b	1b
323
/*
 * __copy_tofrom_user - copy r5 bytes from r4 to r3 with fault recovery.
 *
 * In:   r3 = destination, r4 = source, r5 = length in bytes
 * Out:  r3 = 0 on success, otherwise a count of bytes not copied
 * Uses: r0, r4-r11, ctr, cr0
 *
 * Every load, store and dcbz that may fault carries a numeric label with an
 * EX_TABLE entry naming its fixup.  The fixups record whether the fault was
 * a read (r9 = 0) or a write (r9 = 1) and work out how many bytes are still
 * outstanding.  After a read fault the copy is retried one byte at a time.
 */
_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4		/* bias both pointers for the 4(rN) forms */
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f			/* destination already line aligned */

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5		/* r5 = bytes left once the line boundary is reached */
	srwi.	r0,r0,2			/* whole words up to the line boundary */
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	EX_TABLE(70b,100f)
	EX_TABLE(71b,101f)
	EX_TABLE(72b,102f)
	EX_TABLE(73b,103f)

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES	/* r5 = bytes left after those lines */
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4			/* r3 = dcbt offset from r4 (r3 is set again before return) */
	cmpwi	r0,1
	li	r7,0			/* r7 = number of lines prefetched ahead */
	ble	114f			/* at most one line: no prefetch ahead */
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

	/*
	 * First pass: copy r0 - r7 lines while prefetching ahead.  r0 is then
	 * set to r7, the lines still to copy, which a second pass through
	 * 114: handles with r7 = 0.
	 */
114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6			/* zero the destination line at r6 + 4 before filling it */
	EX_TABLE(54b,105f)
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0			/* any lines left for the second pass? */
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2			/* remaining whole words */
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3			/* remaining bytes */
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0			/* success: nothing left uncopied */
	blr

/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5		/* r5 was not yet reduced: drop the r8 alignment bytes, ctr adds back the uncopied ones */
	li	r3,0			/* ctr counts bytes: shift by 0 */
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2			/* ctr counts words: shift by 2 */
	b	99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES	/* the count is in cache lines */
	mfctr	r8
	add	r0,r0,r8		/* lines left in this pass + lines kept for the second pass */
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3			/* only the trailing bytes follow the word loop */
	li	r3,2			/* ctr counts words: shift by 2 */
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5		/* r3 = bytes not copied */
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f			/* write fault: return the count as it stands */
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/*
 * Reached when the retry loop completes (ctr is 0) or when the load at
 * 130: faults: whatever is left in ctr is the number of bytes not copied.
 * A faulting store at 131: goes straight to 120: and returns r3 as it was
 * computed before the retry.
 */
132:	mfctr	r3
120:	blr

	EX_TABLE(30b,108b)
	EX_TABLE(31b,109b)
	EX_TABLE(40b,110b)
	EX_TABLE(41b,111b)
	EX_TABLE(130b,132b)
	EX_TABLE(131b,120b)

EXPORT_SYMBOL(__copy_tofrom_user)
516