/*
 * arch/xtensa/lib/memcopy.S -- core memory copy/move routines
 * (memcpy, memmove and bcopy)
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (C) 2002 - 2012 Tensilica Inc.
 */

#include <variant/core.h>

	.macro	src_b	r, w0, w1
#ifdef __XTENSA_EB__
	src	\r, \w0, \w1
#else
	src	\r, \w1, \w0
#endif
	.endm

	.macro	ssa8	r
#ifdef __XTENSA_EB__
	ssa8b	\r
#else
	ssa8l	\r
#endif
	.endm
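
/*
 * Rough C model of how the ssa8/src_b pair is used by the shifting
 * copies below (little-endian case; 'off' stands for the low two bits
 * of the original source address).  Illustrative sketch only, for a
 * nonzero offset:
 *
 *	sar = 8 * off;				# ssa8 (-> ssa8l)
 *	r = (w1 << (32 - sar)) | (w0 >> sar);	# src_b r, w0, w1
 *
 * i.e. each SRC extracts one aligned destination word from two
 * consecutive aligned source words.
 */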

/*
 * void *memcpy(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memcpy() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If the destination is unaligned, align it by conditionally
 *     copying 1 and 2 bytes.
 *   If the source is aligned,
 *     do 16 bytes with a loop, and then finish up with
 *     8, 4, 2, and 1 byte copies conditional on the length;
 *   else (if the source is unaligned),
 *     do the same, but use SRC to align the source data.
 *   This code tries to use fall-through branches for the common
 *     case of aligned source and destination and a length that is
 *     a multiple of 4 (or 8).
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */
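
/*
 * The general case in rough C form, for orientation only (a sketch,
 * not the exact instruction sequence; the assembly below also bails
 * out to a plain byte loop for short copies with an unaligned dst,
 * and overlaps loads and stores inside the main loop):
 *
 *	if (dst & 1) { copy 1 byte;  }	# dst now 2-byte aligned
 *	if (dst & 2) { copy 2 bytes; }	# dst now 4-byte aligned
 *	while (len >= 16) { copy 16 bytes; len -= 16; }
 *	if (len & 8) copy 8 bytes;
 *	if (len & 4) copy 4 bytes;
 *	if (len & 2) copy 2 bytes;
 *	if (len & 1) copy 1 byte;
 */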

	.text

/*
 * Byte by byte copy
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbytecopydone
	add	a7, a3, a4	# a7 = end address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lnextbyte:
	l8ui	a6, a3, 0
	addi	a3, a3, 1
	s8i	a6, a5, 0
	addi	a5, a5, 1
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Lbytecopydone:
	retw

/*
 * Destination is unaligned
 */

	.align	4
.Ldst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbytecopy	# do short copies byte by byte

	# copy 1 byte
	l8ui	a6, a3,  0
	addi	a3, a3,  1
	addi	a4, a4, -1
	s8i	a6, a5,  0
	addi	a5, a5,  1
	_bbci.l	a5, 1, .Ldstaligned	# if dst is now aligned, then
					# return to main algorithm
.Ldst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbytecopy	# do short copies byte by byte
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a3, a3,  2
	addi	a4, a4, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a5, a5,  2
	j	.Ldstaligned	# dst is now aligned, return to main algorithm

	.align	4
	.global	memcpy
	.type   memcpy,@function
memcpy:

	entry	sp, 16		# minimal stack frame
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lcommon:
	_bbsi.l	a2, 0, .Ldst1mod2	# if dst is 1 mod 2
	_bbsi.l	a2, 1, .Ldst2mod4	# if dst is 2 mod 4
.Ldstaligned:	# return here from .Ldst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lsrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop1done
	slli	a8, a7, 4
	add	a8, a8, a3	# a8 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1:
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	s32i	a6, a5,  0
	l32i	a6, a3,  8
	s32i	a7, a5,  4
	l32i	a7, a3, 12
	s32i	a6, a5,  8
	addi	a3, a3, 16
	s32i	a7, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .Loop1  # continue loop if a3:src != a8:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1done:
	bbci.l	a4, 3, .L2
	# copy 8 bytes
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	addi	a3, a3,  8
	s32i	a6, a5,  0
	s32i	a7, a5,  4
	addi	a5, a5,  8
.L2:
	bbsi.l	a4, 2, .L3
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	retw
.L3:
	# copy 4 bytes
	l32i	a6, a3,  0
	addi	a3, a3,  4
	s32i	a6, a5,  0
	addi	a5, a5,  4
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	retw
.L4:
	# copy 2 bytes
	l16ui	a6, a3,  0
	addi	a3, a3,  2
	s16i	a6, a5,  0
	addi	a5, a5,  2
	bbsi.l	a4, 0, .L5
	retw
.L5:
	# copy 1 byte
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	retw

/*
 * Destination is aligned, Source is unaligned
 */

	.align	4
.Lsrcunaligned:
	_beqz	a4, .Ldone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	ssa8	a3		# set shift amount from byte offset
#define SIM_CHECKS_ALIGNMENT	1	/* set to 1 when running on ISS (simulator) with the
					   lint or ferret client, or 0 to save a few cycles */
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop2done
	slli	a10, a7, 4
	add	a10, a10, a3	# a10 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2:
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	src_b	a6, a6, a7
	s32i	a6, a5,  0
	l32i	a9, a3, 12
	src_b	a7, a7, a8
	s32i	a7, a5,  4
	l32i	a6, a3, 16
	src_b	a8, a8, a9
	s32i	a8, a5,  8
	addi	a3, a3, 16
	src_b	a9, a9, a6
	s32i	a9, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2done:
	bbci.l	a4, 3, .L12
	# copy 8 bytes
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	src_b	a6, a6, a7
	s32i	a6, a5,  0
	addi	a3, a3,  8
	src_b	a7, a7, a8
	s32i	a7, a5,  4
	addi	a5, a5,  8
	mov	a6, a8
.L12:
	bbci.l	a4, 2, .L13
	# copy 4 bytes
	l32i	a7, a3,  4
	addi	a3, a3,  4
	src_b	a6, a6, a7
	s32i	a6, a5,  0
	addi	a5, a5,  4
	mov	a6, a7
.L13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .L14
	bbsi.l	a4, 0, .L15
.Ldone:	retw
.L14:
	# copy 2 bytes
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a3, a3,  2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a5, a5,  2
	bbsi.l	a4, 0, .L15
	retw
.L15:
	# copy 1 byte
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	retw


/*
 * void bcopy(const void *src, void *dest, size_t n);
 */
	.align	4
	.global	bcopy
	.type   bcopy,@function
bcopy:
	entry	sp, 16		# minimal stack frame
	# a2=src, a3=dst, a4=len
	mov	a5, a3
	mov	a3, a2
	mov	a2, a5
	j	.Lmovecommon	# go to common code for memmove+bcopy

/*
 * void *memmove(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memmove() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If the end of the source doesn't overlap the destination,
 *     then use memcpy.
 *   Otherwise do memcpy backwards.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */
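
/*
 * Rough C model of the overlap test at .Lmovecommon below; unsigned
 * wrap-around lets a single compare cover both directions (sketch
 * only):
 *
 *	if ((unsigned long)dst - (unsigned long)src >= len)
 *		copy forward (shared memcpy path);
 *	else
 *		copy backward, starting from dst + len and src + len;
 */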

/*
 * Byte by byte copy
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbackbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbackbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbackbytecopydone
	sub	a7, a3, a4	# a7 = start address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lbacknextbyte:
	addi	a3, a3, -1
	l8ui	a6, a3, 0
	addi	a5, a5, -1
	s8i	a6, a5, 0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lbacknextbyte # continue loop if
				       # $a3:src != $a7:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.Lbackbytecopydone:
	retw

/*
 * Destination is unaligned
 */

	.align	4
.Lbackdst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbackbytecopy	# do short copies byte by byte

	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3,  0
	addi	a5, a5, -1
	s8i	a6, a5,  0
	addi	a4, a4, -1
	_bbci.l	a5, 1, .Lbackdstaligned	# if dst is now aligned, then
					# return to main algorithm
.Lbackdst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbackbytecopy	# do short copies byte by byte
	addi	a3, a3, -2
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a5, a5, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a4, a4, -2
	j	.Lbackdstaligned	# dst is now aligned,
					# return to main algorithm

	.align	4
	.global	memmove
	.type   memmove,@function
memmove:

	entry	sp, 16		# minimal stack frame
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lmovecommon:
	sub	a6, a5, a3
	bgeu	a6, a4, .Lcommon

	add	a5, a5, a4
	add	a3, a3, a4

	_bbsi.l	a5, 0, .Lbackdst1mod2	# if dst is 1 mod 2
	_bbsi.l	a5, 1, .Lbackdst2mod4	# if dst is 2 mod 4
.Lbackdstaligned:	# return here from .Lbackdst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lbacksrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .backLoop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .backLoop1done
	slli	a8, a7, 4
	sub	a8, a3, a8	# a8 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop1:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a6, a3,  8
	addi	a5, a5, -16
	s32i	a7, a5, 12
	l32i	a7, a3,  4
	s32i	a6, a5,  8
	l32i	a6, a3,  0
	s32i	a7, a5,  4
	s32i	a6, a5,  0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .backLoop1  # continue loop if a3:src != a8:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop1done:
	bbci.l	a4, 3, .Lback2
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	addi	a5, a5, -8
	s32i	a6, a5,  0
	s32i	a7, a5,  4
.Lback2:
	bbsi.l	a4, 2, .Lback3
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	retw
.Lback3:
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a6, a3,  0
	addi	a5, a5, -4
	s32i	a6, a5,  0
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	retw
.Lback4:
	# copy 2 bytes
	addi	a3, a3, -2
	l16ui	a6, a3,  0
	addi	a5, a5, -2
	s16i	a6, a5,  0
	bbsi.l	a4, 0, .Lback5
	retw
.Lback5:
	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3,  0
	addi	a5, a5, -1
	s8i	a6, a5,  0
	retw

/*
 * Destination is aligned, Source is unaligned
 */

	.align	4
.Lbacksrcunaligned:
	_beqz	a4, .Lbackdone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	ssa8	a3		# set shift amount from byte offset
#define SIM_CHECKS_ALIGNMENT	1	/* set to 1 when running on ISS with
					 * the lint or ferret client, or 0
					 * to save a few cycles */
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .backLoop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .backLoop2done
	slli	a10, a7, 4
	sub	a10, a3, a10	# a10 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop2:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a8, a3,  8
	addi	a5, a5, -16
	src_b	a6, a7, a6
	s32i	a6, a5, 12
	l32i	a9, a3,  4
	src_b	a7, a8, a7
	s32i	a7, a5,  8
	l32i	a6, a3,  0
	src_b	a8, a9, a8
	s32i	a8, a5,  4
	src_b	a9, a6, a9
	s32i	a9, a5,  0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .backLoop2 # continue loop if a3:src != a10:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop2done:
	bbci.l	a4, 3, .Lback12
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a7, a3,  4
	l32i	a8, a3,  0
	addi	a5, a5, -8
	src_b	a6, a7, a6
	s32i	a6, a5,  4
	src_b	a7, a8, a7
	s32i	a7, a5,  0
	mov	a6, a8
.Lback12:
	bbci.l	a4, 2, .Lback13
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a7, a3,  0
	addi	a5, a5, -4
	src_b	a6, a7, a6
	s32i	a6, a5,  0
	mov	a6, a7
.Lback13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .Lback14
	bbsi.l	a4, 0, .Lback15
.Lbackdone:
	retw
.Lback14:
	# copy 2 bytes
	addi	a3, a3, -2
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a5, a5, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	bbsi.l	a4, 0, .Lback15
	retw
.Lback15:
	# copy 1 byte
	addi	a3, a3, -1
	addi	a5, a5, -1
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	retw


/*
 * Local Variables:
 * mode:fundamental
 * comment-start: "# "
 * comment-start-skip: "# *"
 * End:
 */