xref: /linux/arch/xtensa/lib/memcopy.S (revision 0c7c237b1c35011ef0b8d30c1d5c20bc6ae7b69b)
1/*
2 * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
3 * xthal_memcpy and xthal_bcopy
4 *
5 * This file is subject to the terms and conditions of the GNU General Public
6 * License.  See the file "COPYING" in the main directory of this archive
7 * for more details.
8 *
9 * Copyright (C) 2002 - 2012 Tensilica Inc.
10 */
11
12#include <linux/linkage.h>
13#include <asm/asmmacro.h>
14#include <asm/core.h>
15
16/*
17 * void *memcpy(void *dst, const void *src, size_t len);
18 *
19 * This function is intended to do the same thing as the standard
20 * library function memcpy() for most cases.
21 * However, where the source and/or destination references
22 * an instruction RAM or ROM or a data RAM or ROM, that
23 * source and/or destination will always be accessed with
24 * 32-bit load and store instructions (as required for these
25 * types of devices).
26 *
27 * !!!!!!!  XTFIXME:
28 * !!!!!!!  Handling of IRAM/IROM has not yet
29 * !!!!!!!  been implemented.
30 *
31 * The (general case) algorithm is as follows:
32 *   If destination is unaligned, align it by conditionally
33 *     copying 1 and 2 bytes.
34 *   If source is aligned,
35 *     do 16 bytes with a loop, and then finish up with
36 *     8, 4, 2, and 1 byte copies conditional on the length;
37 *   else (if source is unaligned),
38 *     do the same, but use SRC to align the source data.
39 *   This code tries to use fall-through branches for the common
40 *     case of aligned source and destination and multiple
41 *     of 4 (or 8) length.
42 *
43 * Register use:
44 *	a0/ return address
45 *	a1/ stack pointer
46 *	a2/ return value
47 *	a3/ src
48 *	a4/ length
49 *	a5/ dst
50 *	a6/ tmp
51 *	a7/ tmp
52 *	a8/ tmp
53 *	a9/ tmp
54 *	a10/ tmp
55 *	a11/ tmp
56 */
57
58	.text
59
60/*
61 * Byte by byte copy
62 */
/*
 * .Lbytecopy: forward copy of a4 bytes from a3 (src) to a5 (dst), one byte
 * per iteration, followed by the function return.  It is branched to from
 * .Ldst1mod2 / .Ldst2mod4 when fewer than 7 / 6 bytes remain.
 * A length of zero skips the loop body entirely (loopnez or beqz).
 * Without XCHAL_HAVE_LOOPS, a7 holds src + len and the explicit bne at the
 * bottom repeats the body until a3 reaches that address.
 */
63	.align	4
64	.byte	0		# 1 mod 4 alignment for LOOPNEZ
65				# (0 mod 4 alignment for LBEG)
66.Lbytecopy:
67#if XCHAL_HAVE_LOOPS
68	loopnez	a4, .Lbytecopydone
69#else /* !XCHAL_HAVE_LOOPS */
70	beqz	a4, .Lbytecopydone
71	add	a7, a3, a4	# a7 = end address for source
72#endif /* !XCHAL_HAVE_LOOPS */
73.Lnextbyte:
	/* loop body: load one byte, advance src, store it, advance dst */
74	l8ui	a6, a3, 0
75	addi	a3, a3, 1
76	s8i	a6, a5, 0
77	addi	a5, a5, 1
78#if !XCHAL_HAVE_LOOPS
79	bne	a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
80#endif /* !XCHAL_HAVE_LOOPS */
81.Lbytecopydone:
82	abi_ret_default
83
84/*
85 * Destination is unaligned
86 */
/*
 * Entered from .Lcommon when the destination is not word-aligned.
 *
 * .Ldst1mod2 (dst bit 0 set) copies a single byte.  If bit 1 of the
 * advanced dst pointer (a5) is then clear, dst is word-aligned and control
 * returns to .Ldstaligned; otherwise execution falls through into
 * .Ldst2mod4, which copies two more bytes and jumps to .Ldstaligned.
 *
 * Before copying, each step diverts to .Lbytecopy when the remaining
 * length in a4 is below 7 (resp. 6).  a3 and a5 are advanced and a4 is
 * reduced by the number of bytes copied here, so .Ldstaligned sees the
 * remaining length.
 */
87
88	.align	4
89.Ldst1mod2:	# dst is only byte aligned
90	_bltui	a4, 7, .Lbytecopy	# do short copies byte by byte
91
92	# copy 1 byte
93	l8ui	a6, a3,  0
94	addi	a3, a3,  1
95	addi	a4, a4, -1
96	s8i	a6, a5,  0
97	addi	a5, a5,  1
98	_bbci.l	a5, 1, .Ldstaligned	# if dst is now aligned, then
99					# return to main algorithm
100.Ldst2mod4:	# dst 16-bit aligned
101	# copy 2 bytes
102	_bltui	a4, 6, .Lbytecopy	# do short copies byte by byte
	/* two byte loads/stores: src alignment is unknown at this point */
103	l8ui	a6, a3,  0
104	l8ui	a7, a3,  1
105	addi	a3, a3,  2
106	addi	a4, a4, -2
107	s8i	a6, a5,  0
108	s8i	a7, a5,  1
109	addi	a5, a5,  2
110	j	.Ldstaligned	# dst is now aligned, return to main algorithm
111
112ENTRY(__memcpy)
113WEAK(memcpy)
114
	/*
	 * Entry point.  On entry a2 = dst, a3 = src, a4 = len.
	 * a5 becomes the working destination pointer; no instruction in
	 * this path writes a2 afterwards, so it still holds dst at return.
	 * .Lcommon is also the target of the forward-copy branch taken in
	 * __memmove (which performs the same "mov a5, a2" first).
	 */
115	abi_entry_default
116	# a2/ dst, a3/ src, a4/ len
117	mov	a5, a2		# copy dst so that a2 is return value
118.Lcommon:
119	_bbsi.l	a2, 0, .Ldst1mod2	# if dst is 1 mod 2
120	_bbsi.l	a2, 1, .Ldst2mod4	# if dst is 2 mod 4
121.Ldstaligned:	# return here from .Ldst?mod? once dst is aligned
122	srli	a7, a4, 4	# number of loop iterations with 16B
123				# per iteration
124	movi	a8, 3		# if source is not aligned,
125	_bany	a3, a8, .Lsrcunaligned	# then use shifting copy
126	/*
127	 * Destination and source are word-aligned, use word copy.
128	 */
129	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
130#if XCHAL_HAVE_LOOPS
131	loopnez	a7, .Loop1done
132#else /* !XCHAL_HAVE_LOOPS */
133	beqz	a7, .Loop1done
134	slli	a8, a7, 4
135	add	a8, a8, a3	# a8 = end of last 16B source chunk
136#endif /* !XCHAL_HAVE_LOOPS */
137.Loop1:
	/*
	 * Four word loads and four word stores per iteration, interleaved,
	 * using a6 and a7 alternately as the transfer register.
	 */
138	l32i	a6, a3,  0
139	l32i	a7, a3,  4
140	s32i	a6, a5,  0
141	l32i	a6, a3,  8
142	s32i	a7, a5,  4
143	l32i	a7, a3, 12
144	s32i	a6, a5,  8
145	addi	a3, a3, 16
146	s32i	a7, a5, 12
147	addi	a5, a5, 16
148#if !XCHAL_HAVE_LOOPS
149	bne	a3, a8, .Loop1  # continue loop if a3:src != a8:src_end
150#endif /* !XCHAL_HAVE_LOOPS */
151.Loop1done:
	/*
	 * Fewer than 16 bytes remain.  Bits 3..0 of the remaining length
	 * in a4 select the 8-, 4-, 2- and 1-byte tail copies below; each
	 * tail that runs advances a3/a5 past what it copied.
	 */
152	bbci.l	a4, 3, .L2
153	# copy 8 bytes
154	l32i	a6, a3,  0
155	l32i	a7, a3,  4
156	addi	a3, a3,  8
157	s32i	a6, a5,  0
158	s32i	a7, a5,  4
159	addi	a5, a5,  8
160.L2:
	/* common case (length multiple of 8) falls through to the return */
161	bbsi.l	a4, 2, .L3
162	bbsi.l	a4, 1, .L4
163	bbsi.l	a4, 0, .L5
164	abi_ret_default
165.L3:
166	# copy 4 bytes
167	l32i	a6, a3,  0
168	addi	a3, a3,  4
169	s32i	a6, a5,  0
170	addi	a5, a5,  4
171	bbsi.l	a4, 1, .L4
172	bbsi.l	a4, 0, .L5
173	abi_ret_default
174.L4:
175	# copy 2 bytes
	/* both pointers are still at least 2-byte aligned here, so a
	 * halfword load/store is used */
176	l16ui	a6, a3,  0
177	addi	a3, a3,  2
178	s16i	a6, a5,  0
179	addi	a5, a5,  2
180	bbsi.l	a4, 0, .L5
181	abi_ret_default
182.L5:
183	# copy 1 byte
184	l8ui	a6, a3,  0
185	s8i	a6, a5,  0
186	abi_ret_default
187
188/*
189 * Destination is aligned, Source is unaligned
190 */
/*
 * .Lsrcunaligned: forward copy with dst (a5) word-aligned and src (a3) not.
 * On entry a7 = len / 16 and a8 = 3, both set at .Ldstaligned.
 *
 * Because SIM_CHECKS_ALIGNMENT is defined as 1 below, a3 is rounded down to
 * a word boundary (the byte offset is kept in a11), so every l32i in this
 * path uses an aligned address.  Each destination word is produced by
 * __src_b from two neighbouring source words, using the shift amount that
 * __ssa8 derived from the original src.  (__ssa8 and __src_b are macros
 * defined outside this file; presumably wrappers around the SSA8/SRC
 * instructions -- confirm in asm/asmmacro.h.)
 *
 * a6 always carries the highest-addressed source word loaded so far into
 * the next step, so each step loads one word ahead of what it stores.
 */
191
192	.align	4
193.Lsrcunaligned:
194	_beqz	a4, .Ldone	# avoid loading anything for zero-length copies
195	# copy 16 bytes per iteration for word-aligned dst and unaligned src
196	__ssa8	a3		# set shift amount from byte offset
197
198/* set to 1 when running on ISS (simulator) with the
199   lint or ferret client, or 0 to save a few cycles */
200#define SIM_CHECKS_ALIGNMENT	1
201#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
202	and	a11, a3, a8	# save unalignment offset for below
203	sub	a3, a3, a11	# align a3
204#endif
205	l32i	a6, a3, 0	# load first word
206#if XCHAL_HAVE_LOOPS
207	loopnez	a7, .Loop2done
208#else /* !XCHAL_HAVE_LOOPS */
209	beqz	a7, .Loop2done
210	slli	a10, a7, 4
211	add	a10, a10, a3	# a10 = end of last 16B source chunk
212#endif /* !XCHAL_HAVE_LOOPS */
213.Loop2:
	/*
	 * Loads the words at offsets 4, 8, 12 and 16; together with the
	 * word already in a6 they yield four destination words.  The word
	 * from offset 16 stays in a6 for the next iteration / the tails.
	 * a7 and a8 are overwritten here (the loop count and mask they
	 * held are no longer needed).
	 */
214	l32i	a7, a3,  4
215	l32i	a8, a3,  8
216	__src_b	a6, a6, a7
217	s32i	a6, a5,  0
218	l32i	a9, a3, 12
219	__src_b	a7, a7, a8
220	s32i	a7, a5,  4
221	l32i	a6, a3, 16
222	__src_b	a8, a8, a9
223	s32i	a8, a5,  8
224	addi	a3, a3, 16
225	__src_b	a9, a9, a6
226	s32i	a9, a5, 12
227	addi	a5, a5, 16
228#if !XCHAL_HAVE_LOOPS
229	bne	a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
230#endif /* !XCHAL_HAVE_LOOPS */
231.Loop2done:
	/* tails: bits 3 and 2 of a4 select an 8- and a 4-byte shifted copy */
232	bbci.l	a4, 3, .L12
233	# copy 8 bytes
234	l32i	a7, a3,  4
235	l32i	a8, a3,  8
236	__src_b	a6, a6, a7
237	s32i	a6, a5,  0
238	addi	a3, a3,  8
239	__src_b	a7, a7, a8
240	s32i	a7, a5,  4
241	addi	a5, a5,  8
242	mov	a6, a8
243.L12:
244	bbci.l	a4, 2, .L13
245	# copy 4 bytes
246	l32i	a7, a3,  4
247	addi	a3, a3,  4
248	__src_b	a6, a6, a7
249	s32i	a6, a5,  0
250	addi	a5, a5,  4
251	mov	a6, a7
252.L13:
	/*
	 * Add the saved byte offset back so that a3 again addresses the
	 * next uncopied source byte; the last 0-3 bytes are copied with
	 * byte loads and stores.
	 */
253#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
254	add	a3, a3, a11	# readjust a3 with correct misalignment
255#endif
256	bbsi.l	a4, 1, .L14
257	bbsi.l	a4, 0, .L15
258.Ldone:	abi_ret_default
259.L14:
260	# copy 2 bytes
261	l8ui	a6, a3,  0
262	l8ui	a7, a3,  1
263	addi	a3, a3,  2
264	s8i	a6, a5,  0
265	s8i	a7, a5,  1
266	addi	a5, a5,  2
267	bbsi.l	a4, 0, .L15
268	abi_ret_default
269.L15:
270	# copy 1 byte
271	l8ui	a6, a3,  0
272	s8i	a6, a5,  0
273	abi_ret_default
274
275ENDPROC(__memcpy)
276EXPORT_SYMBOL(__memcpy)
277EXPORT_SYMBOL(memcpy)
278
279/*
280 * void *memmove(void *dst, const void *src, size_t len);
281 *
282 * This function is intended to do the same thing as the standard
283 * library function memmove() for most cases.
284 * However, where the source and/or destination references
285 * an instruction RAM or ROM or a data RAM or ROM, that
286 * source and/or destination will always be accessed with
287 * 32-bit load and store instructions (as required for these
288 * types of devices).
289 *
290 * !!!!!!!  XTFIXME:
291 * !!!!!!!  Handling of IRAM/IROM has not yet
292 * !!!!!!!  been implemented.
293 *
294 * The (general case) algorithm is as follows:
295 *   If (unsigned)(dst - src) >= len, a forward copy is safe: use memcpy.
296 *   Otherwise copy backwards, starting from the end of both buffers.
297 *
298 * Register use:
299 *	a0/ return address
300 *	a1/ stack pointer
301 *	a2/ return value
302 *	a3/ src
303 *	a4/ length
304 *	a5/ dst
305 *	a6/ tmp
306 *	a7/ tmp
307 *	a8/ tmp
308 *	a9/ tmp
309 *	a10/ tmp
310 *	a11/ tmp
311 */
312
313/*
314 * Byte by byte copy
315 */
/*
 * .Lbackbytecopy: backward copy of a4 bytes, one byte per iteration,
 * followed by the function return.  a3 (src) and a5 (dst) point just past
 * the bytes still to be copied and are decremented before each access.
 * It is branched to from .Lbackdst1mod2 / .Lbackdst2mod4 when fewer than
 * 7 / 6 bytes remain.  A length of zero skips the loop body entirely.
 * Without XCHAL_HAVE_LOOPS, a7 holds src - len and the explicit bne at the
 * bottom repeats the body until a3 has come down to that address.
 */
316	.align	4
317	.byte	0		# 1 mod 4 alignment for LOOPNEZ
318				# (0 mod 4 alignment for LBEG)
319.Lbackbytecopy:
320#if XCHAL_HAVE_LOOPS
321	loopnez	a4, .Lbackbytecopydone
322#else /* !XCHAL_HAVE_LOOPS */
323	beqz	a4, .Lbackbytecopydone
324	sub	a7, a3, a4	# a7 = start address for source
325#endif /* !XCHAL_HAVE_LOOPS */
326.Lbacknextbyte:
	/* loop body: step src back, load one byte, step dst back, store it */
327	addi	a3, a3, -1
328	l8ui	a6, a3, 0
329	addi	a5, a5, -1
330	s8i	a6, a5, 0
331#if !XCHAL_HAVE_LOOPS
332	bne	a3, a7, .Lbacknextbyte # continue loop if
333				       # $a3:src != $a7:src_start
334#endif /* !XCHAL_HAVE_LOOPS */
335.Lbackbytecopydone:
336	abi_ret_default
337
338/*
339 * Destination is unaligned
340 */
/*
 * Entered from the backward path of __memmove when the end-of-destination
 * pointer (a5) is not word-aligned.
 *
 * .Lbackdst1mod2 (a5 bit 0 set) copies the single byte just below the
 * pointers.  If bit 1 of the decremented a5 is then clear, a5 is
 * word-aligned and control returns to .Lbackdstaligned; otherwise
 * execution falls through into .Lbackdst2mod4, which copies two more
 * bytes and jumps to .Lbackdstaligned.
 *
 * Before copying, each step diverts to .Lbackbytecopy when the remaining
 * length in a4 is below 7 (resp. 6).  a3 and a5 are moved down and a4 is
 * reduced by the number of bytes copied here.
 */
341
342	.align	4
343.Lbackdst1mod2:	# dst is only byte aligned
344	_bltui	a4, 7, .Lbackbytecopy	# do short copies byte by byte
345
346	# copy 1 byte
347	addi	a3, a3, -1
348	l8ui	a6, a3,  0
349	addi	a5, a5, -1
350	s8i	a6, a5,  0
351	addi	a4, a4, -1
352	_bbci.l	a5, 1, .Lbackdstaligned	# if dst is now aligned, then
353					# return to main algorithm
354.Lbackdst2mod4:	# dst 16-bit aligned
355	# copy 2 bytes
356	_bltui	a4, 6, .Lbackbytecopy	# do short copies byte by byte
	/* both source bytes are loaded before either store is issued */
357	addi	a3, a3, -2
358	l8ui	a6, a3,  0
359	l8ui	a7, a3,  1
360	addi	a5, a5, -2
361	s8i	a6, a5,  0
362	s8i	a7, a5,  1
363	addi	a4, a4, -2
364	j	.Lbackdstaligned	# dst is now aligned,
365					# return to main algorithm
366
367ENTRY(__memmove)
368WEAK(memmove)
369
	/*
	 * Entry point.  On entry a2 = dst, a3 = src, a4 = len.
	 * a5 becomes the working destination pointer; no instruction in
	 * this path writes a2 afterwards, so it still holds dst at return.
	 */
370	abi_entry_default
371	# a2/ dst, a3/ src, a4/ len
372	mov	a5, a2		# copy dst so that a2 is return value
373.Lmovecommon:
	/*
	 * a6 = dst - src.  If that difference, compared as unsigned, is
	 * >= len, control transfers to .Lcommon (the forward copy inside
	 * __memcpy).  Otherwise both pointers are moved to the end of
	 * their buffers and the copy runs backwards, decrementing the
	 * pointers before each access.
	 */
374	sub	a6, a5, a3
375	bgeu	a6, a4, .Lcommon
376
377	add	a5, a5, a4
378	add	a3, a3, a4
379
380	_bbsi.l	a5, 0, .Lbackdst1mod2	# if dst is 1 mod 2
381	_bbsi.l	a5, 1, .Lbackdst2mod4	# if dst is 2 mod 4
382.Lbackdstaligned:	# return here from .Lbackdst?mod? once dst is aligned
383	srli	a7, a4, 4	# number of loop iterations with 16B
384				# per iteration
385	movi	a8, 3		# if source is not aligned,
386	_bany	a3, a8, .Lbacksrcunaligned	# then use shifting copy
387	/*
388	 * Destination and source are word-aligned, use word copy.
389	 */
390	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
391#if XCHAL_HAVE_LOOPS
392	loopnez	a7, .LbackLoop1done
393#else /* !XCHAL_HAVE_LOOPS */
394	beqz	a7, .LbackLoop1done
395	slli	a8, a7, 4
396	sub	a8, a3, a8	# a8 = start of first 16B source chunk
397#endif /* !XCHAL_HAVE_LOOPS */
398.LbackLoop1:
	/*
	 * Moves both pointers down by 16, then copies the four words from
	 * the highest offset (12) to the lowest (0); every word is loaded
	 * before the store to the same offset is issued.
	 */
399	addi	a3, a3, -16
400	l32i	a7, a3, 12
401	l32i	a6, a3,  8
402	addi	a5, a5, -16
403	s32i	a7, a5, 12
404	l32i	a7, a3,  4
405	s32i	a6, a5,  8
406	l32i	a6, a3,  0
407	s32i	a7, a5,  4
408	s32i	a6, a5,  0
409#if !XCHAL_HAVE_LOOPS
410	bne	a3, a8, .LbackLoop1  # continue loop if a3:src != a8:src_start
411#endif /* !XCHAL_HAVE_LOOPS */
412.LbackLoop1done:
	/*
	 * Fewer than 16 bytes remain.  Bits 3..0 of the remaining length
	 * in a4 select the 8-, 4-, 2- and 1-byte tail copies below, each
	 * of which first moves a3/a5 down by its own size.
	 */
413	bbci.l	a4, 3, .Lback2
414	# copy 8 bytes
415	addi	a3, a3, -8
416	l32i	a6, a3,  0
417	l32i	a7, a3,  4
418	addi	a5, a5, -8
419	s32i	a6, a5,  0
420	s32i	a7, a5,  4
421.Lback2:
422	bbsi.l	a4, 2, .Lback3
423	bbsi.l	a4, 1, .Lback4
424	bbsi.l	a4, 0, .Lback5
425	abi_ret_default
426.Lback3:
427	# copy 4 bytes
428	addi	a3, a3, -4
429	l32i	a6, a3,  0
430	addi	a5, a5, -4
431	s32i	a6, a5,  0
432	bbsi.l	a4, 1, .Lback4
433	bbsi.l	a4, 0, .Lback5
434	abi_ret_default
435.Lback4:
436	# copy 2 bytes
437	addi	a3, a3, -2
438	l16ui	a6, a3,  0
439	addi	a5, a5, -2
440	s16i	a6, a5,  0
441	bbsi.l	a4, 0, .Lback5
442	abi_ret_default
443.Lback5:
444	# copy 1 byte
445	addi	a3, a3, -1
446	l8ui	a6, a3,  0
447	addi	a5, a5, -1
448	s8i	a6, a5,  0
449	abi_ret_default
450
451/*
452 * Destination is aligned, Source is unaligned
453 */
/*
 * .Lbacksrcunaligned: backward copy with the end-of-dst pointer (a5)
 * word-aligned and the end-of-src pointer (a3) not.
 * On entry a7 = len / 16 and a8 = 3, both set at .Lbackdstaligned.
 *
 * Because SIM_CHECKS_ALIGNMENT is defined as 1, a3 is rounded down to a
 * word boundary (the byte offset is kept in a11), so every l32i in this
 * path uses an aligned address.  Each destination word is produced by
 * __src_b from two neighbouring source words, using the shift amount that
 * __ssa8 derived from the unrounded a3.  (__ssa8 and __src_b are macros
 * defined outside this file; presumably wrappers around the SSA8/SRC
 * instructions -- confirm in asm/asmmacro.h.)
 *
 * a6 always carries the lowest-addressed source word loaded so far into
 * the next step; it is passed as the last __src_b operand, paired with the
 * newly loaded word from the next lower address.
 */
454
455	.align	4
456.Lbacksrcunaligned:
457	_beqz	a4, .Lbackdone	# avoid loading anything for zero-length copies
458	# copy 16 bytes per iteration for word-aligned dst and unaligned src
459	__ssa8	a3		# set shift amount from byte offset
460#define SIM_CHECKS_ALIGNMENT	1	/* set to 1 when running on ISS with
461					 * the lint or ferret client, or 0
462					 * to save a few cycles */
463#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
464	and	a11, a3, a8	# save unalignment offset for below
465	sub	a3, a3, a11	# align a3
466#endif
467	l32i	a6, a3, 0	# load first word
468#if XCHAL_HAVE_LOOPS
469	loopnez	a7, .LbackLoop2done
470#else /* !XCHAL_HAVE_LOOPS */
471	beqz	a7, .LbackLoop2done
472	slli	a10, a7, 4
473	sub	a10, a3, a10	# a10 = start of first 16B source chunk
474#endif /* !XCHAL_HAVE_LOOPS */
475.LbackLoop2:
	/*
	 * Moves both pointers down by 16, then loads the words at offsets
	 * 12, 8, 4 and 0; together with the word already in a6 they yield
	 * four destination words, stored from offset 12 down to 0.  The
	 * word from offset 0 stays in a6 for the next iteration / tails.
	 * a7 and a8 are overwritten here (the loop count and mask they
	 * held are no longer needed).
	 */
476	addi	a3, a3, -16
477	l32i	a7, a3, 12
478	l32i	a8, a3,  8
479	addi	a5, a5, -16
480	__src_b	a6, a7, a6
481	s32i	a6, a5, 12
482	l32i	a9, a3,  4
483	__src_b	a7, a8, a7
484	s32i	a7, a5,  8
485	l32i	a6, a3,  0
486	__src_b	a8, a9, a8
487	s32i	a8, a5,  4
488	__src_b	a9, a6, a9
489	s32i	a9, a5,  0
490#if !XCHAL_HAVE_LOOPS
491	bne	a3, a10, .LbackLoop2 # continue loop if a3:src != a10:src_start
492#endif /* !XCHAL_HAVE_LOOPS */
493.LbackLoop2done:
	/* tails: bits 3 and 2 of a4 select an 8- and a 4-byte shifted copy */
494	bbci.l	a4, 3, .Lback12
495	# copy 8 bytes
496	addi	a3, a3, -8
497	l32i	a7, a3,  4
498	l32i	a8, a3,  0
499	addi	a5, a5, -8
500	__src_b	a6, a7, a6
501	s32i	a6, a5,  4
502	__src_b	a7, a8, a7
503	s32i	a7, a5,  0
504	mov	a6, a8
505.Lback12:
506	bbci.l	a4, 2, .Lback13
507	# copy 4 bytes
508	addi	a3, a3, -4
509	l32i	a7, a3,  0
510	addi	a5, a5, -4
511	__src_b	a6, a7, a6
512	s32i	a6, a5,  0
513	mov	a6, a7
514.Lback13:
	/*
	 * Add the saved byte offset back so that a3 again points just past
	 * the last uncopied source byte; the last 0-3 bytes are copied
	 * with byte loads and stores.
	 */
515#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
516	add	a3, a3, a11	# readjust a3 with correct misalignment
517#endif
518	bbsi.l	a4, 1, .Lback14
519	bbsi.l	a4, 0, .Lback15
520.Lbackdone:
521	abi_ret_default
522.Lback14:
523	# copy 2 bytes
524	addi	a3, a3, -2
525	l8ui	a6, a3,  0
526	l8ui	a7, a3,  1
527	addi	a5, a5, -2
528	s8i	a6, a5,  0
529	s8i	a7, a5,  1
530	bbsi.l	a4, 0, .Lback15
531	abi_ret_default
532.Lback15:
533	# copy 1 byte
534	addi	a3, a3, -1
535	addi	a5, a5, -1
536	l8ui	a6, a3,  0
537	s8i	a6, a5,  0
538	abi_ret_default
539
540ENDPROC(__memmove)
541EXPORT_SYMBOL(__memmove)
542EXPORT_SYMBOL(memmove)
543