/*
 * xref: /linux/arch/xtensa/lib/memcopy.S
 * (revision fbb871e220672a8e9e4e7870da5b206fe05904b2)
 *
 * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
 * xthal_memcpy and xthal_bcopy
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (C) 2002 - 2012 Tensilica Inc.
 */

#include <variant/core.h>
#include <asm/asmmacro.h>

/*
 * void *memcpy(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memcpy() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If destination is unaligned, align it by conditionally
 *     copying 1 and 2 bytes.
 *   If source is aligned,
 *     do 16 bytes with a loop, and then finish up with
 *     8, 4, 2, and 1 byte copies conditional on the length;
 *   else (if source is unaligned),
 *     do the same, but use SRC to align the source data.
 *   This code tries to use fall-through branches for the common
 *     case of aligned source and destination and multiple
 *     of 4 (or 8) length.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */

57	.text
58
59/*
60 * Byte by byte copy
61 */
62	.align	4
63	.byte	0		# 1 mod 4 alignment for LOOPNEZ
64				# (0 mod 4 alignment for LBEG)
65.Lbytecopy:
66#if XCHAL_HAVE_LOOPS
67	loopnez	a4, .Lbytecopydone
68#else /* !XCHAL_HAVE_LOOPS */
69	beqz	a4, .Lbytecopydone
70	add	a7, a3, a4	# a7 = end address for source
71#endif /* !XCHAL_HAVE_LOOPS */
72.Lnextbyte:
73	l8ui	a6, a3, 0
74	addi	a3, a3, 1
75	s8i	a6, a5, 0
76	addi	a5, a5, 1
77#if !XCHAL_HAVE_LOOPS
78	bne	a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
79#endif /* !XCHAL_HAVE_LOOPS */
80.Lbytecopydone:
81	retw
82
83/*
84 * Destination is unaligned
85 */
86
87	.align	4
88.Ldst1mod2:	# dst is only byte aligned
89	_bltui	a4, 7, .Lbytecopy	# do short copies byte by byte
90
91	# copy 1 byte
92	l8ui	a6, a3,  0
93	addi	a3, a3,  1
94	addi	a4, a4, -1
95	s8i	a6, a5,  0
96	addi	a5, a5,  1
97	_bbci.l	a5, 1, .Ldstaligned	# if dst is now aligned, then
98					# return to main algorithm
99.Ldst2mod4:	# dst 16-bit aligned
100	# copy 2 bytes
101	_bltui	a4, 6, .Lbytecopy	# do short copies byte by byte
102	l8ui	a6, a3,  0
103	l8ui	a7, a3,  1
104	addi	a3, a3,  2
105	addi	a4, a4, -2
106	s8i	a6, a5,  0
107	s8i	a7, a5,  1
108	addi	a5, a5,  2
109	j	.Ldstaligned	# dst is now aligned, return to main algorithm
110
111	.align	4
112	.global	memcpy
113	.type   memcpy,@function
114memcpy:
115
116	entry	sp, 16		# minimal stack frame
117	# a2/ dst, a3/ src, a4/ len
118	mov	a5, a2		# copy dst so that a2 is return value
119.Lcommon:
120	_bbsi.l	a2, 0, .Ldst1mod2	# if dst is 1 mod 2
121	_bbsi.l	a2, 1, .Ldst2mod4	# if dst is 2 mod 4
122.Ldstaligned:	# return here from .Ldst?mod? once dst is aligned
123	srli	a7, a4, 4	# number of loop iterations with 16B
124				# per iteration
125	movi	a8, 3		# if source is not aligned,
126	_bany	a3, a8, .Lsrcunaligned	# then use shifting copy
127	/*
128	 * Destination and source are word-aligned, use word copy.
129	 */
130	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
131#if XCHAL_HAVE_LOOPS
132	loopnez	a7, .Loop1done
133#else /* !XCHAL_HAVE_LOOPS */
134	beqz	a7, .Loop1done
135	slli	a8, a7, 4
136	add	a8, a8, a3	# a8 = end of last 16B source chunk
137#endif /* !XCHAL_HAVE_LOOPS */
138.Loop1:
139	l32i	a6, a3,  0
140	l32i	a7, a3,  4
141	s32i	a6, a5,  0
142	l32i	a6, a3,  8
143	s32i	a7, a5,  4
144	l32i	a7, a3, 12
145	s32i	a6, a5,  8
146	addi	a3, a3, 16
147	s32i	a7, a5, 12
148	addi	a5, a5, 16
149#if !XCHAL_HAVE_LOOPS
150	bne	a3, a8, .Loop1  # continue loop if a3:src != a8:src_end
151#endif /* !XCHAL_HAVE_LOOPS */
152.Loop1done:
153	bbci.l	a4, 3, .L2
154	# copy 8 bytes
155	l32i	a6, a3,  0
156	l32i	a7, a3,  4
157	addi	a3, a3,  8
158	s32i	a6, a5,  0
159	s32i	a7, a5,  4
160	addi	a5, a5,  8
161.L2:
162	bbsi.l	a4, 2, .L3
163	bbsi.l	a4, 1, .L4
164	bbsi.l	a4, 0, .L5
165	retw
166.L3:
167	# copy 4 bytes
168	l32i	a6, a3,  0
169	addi	a3, a3,  4
170	s32i	a6, a5,  0
171	addi	a5, a5,  4
172	bbsi.l	a4, 1, .L4
173	bbsi.l	a4, 0, .L5
174	retw
175.L4:
176	# copy 2 bytes
177	l16ui	a6, a3,  0
178	addi	a3, a3,  2
179	s16i	a6, a5,  0
180	addi	a5, a5,  2
181	bbsi.l	a4, 0, .L5
182	retw
183.L5:
184	# copy 1 byte
185	l8ui	a6, a3,  0
186	s8i	a6, a5,  0
187	retw
188
189/*
190 * Destination is aligned, Source is unaligned
191 */
192
193	.align	4
194.Lsrcunaligned:
195	_beqz	a4, .Ldone	# avoid loading anything for zero-length copies
196	# copy 16 bytes per iteration for word-aligned dst and unaligned src
197	__ssa8	a3		# set shift amount from byte offset
198
199/* set to 1 when running on ISS (simulator) with the
200   lint or ferret client, or 0 to save a few cycles */
201#define SIM_CHECKS_ALIGNMENT	1
202#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
203	and	a11, a3, a8	# save unalignment offset for below
204	sub	a3, a3, a11	# align a3
205#endif
206	l32i	a6, a3, 0	# load first word
207#if XCHAL_HAVE_LOOPS
208	loopnez	a7, .Loop2done
209#else /* !XCHAL_HAVE_LOOPS */
210	beqz	a7, .Loop2done
211	slli	a10, a7, 4
212	add	a10, a10, a3	# a10 = end of last 16B source chunk
213#endif /* !XCHAL_HAVE_LOOPS */
214.Loop2:
215	l32i	a7, a3,  4
216	l32i	a8, a3,  8
217	__src_b	a6, a6, a7
218	s32i	a6, a5,  0
219	l32i	a9, a3, 12
220	__src_b	a7, a7, a8
221	s32i	a7, a5,  4
222	l32i	a6, a3, 16
223	__src_b	a8, a8, a9
224	s32i	a8, a5,  8
225	addi	a3, a3, 16
226	__src_b	a9, a9, a6
227	s32i	a9, a5, 12
228	addi	a5, a5, 16
229#if !XCHAL_HAVE_LOOPS
230	bne	a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
231#endif /* !XCHAL_HAVE_LOOPS */
232.Loop2done:
233	bbci.l	a4, 3, .L12
234	# copy 8 bytes
235	l32i	a7, a3,  4
236	l32i	a8, a3,  8
237	__src_b	a6, a6, a7
238	s32i	a6, a5,  0
239	addi	a3, a3,  8
240	__src_b	a7, a7, a8
241	s32i	a7, a5,  4
242	addi	a5, a5,  8
243	mov	a6, a8
244.L12:
245	bbci.l	a4, 2, .L13
246	# copy 4 bytes
247	l32i	a7, a3,  4
248	addi	a3, a3,  4
249	__src_b	a6, a6, a7
250	s32i	a6, a5,  0
251	addi	a5, a5,  4
252	mov	a6, a7
253.L13:
254#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
255	add	a3, a3, a11	# readjust a3 with correct misalignment
256#endif
257	bbsi.l	a4, 1, .L14
258	bbsi.l	a4, 0, .L15
259.Ldone:	retw
260.L14:
261	# copy 2 bytes
262	l8ui	a6, a3,  0
263	l8ui	a7, a3,  1
264	addi	a3, a3,  2
265	s8i	a6, a5,  0
266	s8i	a7, a5,  1
267	addi	a5, a5,  2
268	bbsi.l	a4, 0, .L15
269	retw
270.L15:
271	# copy 1 byte
272	l8ui	a6, a3,  0
273	s8i	a6, a5,  0
274	retw
275
276
277/*
278 * void bcopy(const void *src, void *dest, size_t n);
279 */
280	.align	4
281	.global	bcopy
282	.type   bcopy,@function
283bcopy:
284	entry	sp, 16		# minimal stack frame
285	# a2=src, a3=dst, a4=len
286	mov	a5, a3
287	mov	a3, a2
288	mov	a2, a5
289	j	.Lmovecommon	# go to common code for memmove+bcopy

/*
 * void *memmove(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memmove() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If end of source doesn't overlap destination then use memcpy.
 *   Otherwise do memcpy backwards.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */

325/*
326 * Byte by byte copy
327 */
328	.align	4
329	.byte	0		# 1 mod 4 alignment for LOOPNEZ
330				# (0 mod 4 alignment for LBEG)
331.Lbackbytecopy:
332#if XCHAL_HAVE_LOOPS
333	loopnez	a4, .Lbackbytecopydone
334#else /* !XCHAL_HAVE_LOOPS */
335	beqz	a4, .Lbackbytecopydone
336	sub	a7, a3, a4	# a7 = start address for source
337#endif /* !XCHAL_HAVE_LOOPS */
338.Lbacknextbyte:
339	addi	a3, a3, -1
340	l8ui	a6, a3, 0
341	addi	a5, a5, -1
342	s8i	a6, a5, 0
343#if !XCHAL_HAVE_LOOPS
344	bne	a3, a7, .Lbacknextbyte # continue loop if
345				       # $a3:src != $a7:src_start
346#endif /* !XCHAL_HAVE_LOOPS */
347.Lbackbytecopydone:
348	retw
349
350/*
351 * Destination is unaligned
352 */
353
354	.align	4
355.Lbackdst1mod2:	# dst is only byte aligned
356	_bltui	a4, 7, .Lbackbytecopy	# do short copies byte by byte
357
358	# copy 1 byte
359	addi	a3, a3, -1
360	l8ui	a6, a3,  0
361	addi	a5, a5, -1
362	s8i	a6, a5,  0
363	addi	a4, a4, -1
364	_bbci.l	a5, 1, .Lbackdstaligned	# if dst is now aligned, then
365					# return to main algorithm
366.Lbackdst2mod4:	# dst 16-bit aligned
367	# copy 2 bytes
368	_bltui	a4, 6, .Lbackbytecopy	# do short copies byte by byte
369	addi	a3, a3, -2
370	l8ui	a6, a3,  0
371	l8ui	a7, a3,  1
372	addi	a5, a5, -2
373	s8i	a6, a5,  0
374	s8i	a7, a5,  1
375	addi	a4, a4, -2
376	j	.Lbackdstaligned	# dst is now aligned,
377					# return to main algorithm
378
379	.align	4
380	.global	memmove
381	.type   memmove,@function
382memmove:
383
384	entry	sp, 16		# minimal stack frame
385	# a2/ dst, a3/ src, a4/ len
386	mov	a5, a2		# copy dst so that a2 is return value
387.Lmovecommon:
388	sub	a6, a5, a3
389	bgeu	a6, a4, .Lcommon
390
391	add	a5, a5, a4
392	add	a3, a3, a4
393
394	_bbsi.l	a5, 0, .Lbackdst1mod2	# if dst is 1 mod 2
395	_bbsi.l	a5, 1, .Lbackdst2mod4	# if dst is 2 mod 4
396.Lbackdstaligned:	# return here from .Lbackdst?mod? once dst is aligned
397	srli	a7, a4, 4	# number of loop iterations with 16B
398				# per iteration
399	movi	a8, 3		# if source is not aligned,
400	_bany	a3, a8, .Lbacksrcunaligned	# then use shifting copy
401	/*
402	 * Destination and source are word-aligned, use word copy.
403	 */
404	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
405#if XCHAL_HAVE_LOOPS
406	loopnez	a7, .backLoop1done
407#else /* !XCHAL_HAVE_LOOPS */
408	beqz	a7, .backLoop1done
409	slli	a8, a7, 4
410	sub	a8, a3, a8	# a8 = start of first 16B source chunk
411#endif /* !XCHAL_HAVE_LOOPS */
412.backLoop1:
413	addi	a3, a3, -16
414	l32i	a7, a3, 12
415	l32i	a6, a3,  8
416	addi	a5, a5, -16
417	s32i	a7, a5, 12
418	l32i	a7, a3,  4
419	s32i	a6, a5,  8
420	l32i	a6, a3,  0
421	s32i	a7, a5,  4
422	s32i	a6, a5,  0
423#if !XCHAL_HAVE_LOOPS
424	bne	a3, a8, .backLoop1  # continue loop if a3:src != a8:src_start
425#endif /* !XCHAL_HAVE_LOOPS */
426.backLoop1done:
427	bbci.l	a4, 3, .Lback2
428	# copy 8 bytes
429	addi	a3, a3, -8
430	l32i	a6, a3,  0
431	l32i	a7, a3,  4
432	addi	a5, a5, -8
433	s32i	a6, a5,  0
434	s32i	a7, a5,  4
435.Lback2:
436	bbsi.l	a4, 2, .Lback3
437	bbsi.l	a4, 1, .Lback4
438	bbsi.l	a4, 0, .Lback5
439	retw
440.Lback3:
441	# copy 4 bytes
442	addi	a3, a3, -4
443	l32i	a6, a3,  0
444	addi	a5, a5, -4
445	s32i	a6, a5,  0
446	bbsi.l	a4, 1, .Lback4
447	bbsi.l	a4, 0, .Lback5
448	retw
449.Lback4:
450	# copy 2 bytes
451	addi	a3, a3, -2
452	l16ui	a6, a3,  0
453	addi	a5, a5, -2
454	s16i	a6, a5,  0
455	bbsi.l	a4, 0, .Lback5
456	retw
457.Lback5:
458	# copy 1 byte
459	addi	a3, a3, -1
460	l8ui	a6, a3,  0
461	addi	a5, a5, -1
462	s8i	a6, a5,  0
463	retw
464
465/*
466 * Destination is aligned, Source is unaligned
467 */
468
469	.align	4
470.Lbacksrcunaligned:
471	_beqz	a4, .Lbackdone	# avoid loading anything for zero-length copies
472	# copy 16 bytes per iteration for word-aligned dst and unaligned src
473	__ssa8	a3		# set shift amount from byte offset
474#define SIM_CHECKS_ALIGNMENT	1	/* set to 1 when running on ISS with
475					 * the lint or ferret client, or 0
476					 * to save a few cycles */
477#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
478	and	a11, a3, a8	# save unalignment offset for below
479	sub	a3, a3, a11	# align a3
480#endif
481	l32i	a6, a3, 0	# load first word
482#if XCHAL_HAVE_LOOPS
483	loopnez	a7, .backLoop2done
484#else /* !XCHAL_HAVE_LOOPS */
485	beqz	a7, .backLoop2done
486	slli	a10, a7, 4
487	sub	a10, a3, a10	# a10 = start of first 16B source chunk
488#endif /* !XCHAL_HAVE_LOOPS */
489.backLoop2:
490	addi	a3, a3, -16
491	l32i	a7, a3, 12
492	l32i	a8, a3,  8
493	addi	a5, a5, -16
494	__src_b	a6, a7, a6
495	s32i	a6, a5, 12
496	l32i	a9, a3,  4
497	__src_b	a7, a8, a7
498	s32i	a7, a5,  8
499	l32i	a6, a3,  0
500	__src_b	a8, a9, a8
501	s32i	a8, a5,  4
502	__src_b	a9, a6, a9
503	s32i	a9, a5,  0
504#if !XCHAL_HAVE_LOOPS
505	bne	a3, a10, .backLoop2 # continue loop if a3:src != a10:src_start
506#endif /* !XCHAL_HAVE_LOOPS */
507.backLoop2done:
508	bbci.l	a4, 3, .Lback12
509	# copy 8 bytes
510	addi	a3, a3, -8
511	l32i	a7, a3,  4
512	l32i	a8, a3,  0
513	addi	a5, a5, -8
514	__src_b	a6, a7, a6
515	s32i	a6, a5,  4
516	__src_b	a7, a8, a7
517	s32i	a7, a5,  0
518	mov	a6, a8
519.Lback12:
520	bbci.l	a4, 2, .Lback13
521	# copy 4 bytes
522	addi	a3, a3, -4
523	l32i	a7, a3,  0
524	addi	a5, a5, -4
525	__src_b	a6, a7, a6
526	s32i	a6, a5,  0
527	mov	a6, a7
528.Lback13:
529#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
530	add	a3, a3, a11	# readjust a3 with correct misalignment
531#endif
532	bbsi.l	a4, 1, .Lback14
533	bbsi.l	a4, 0, .Lback15
534.Lbackdone:
535	retw
536.Lback14:
537	# copy 2 bytes
538	addi	a3, a3, -2
539	l8ui	a6, a3,  0
540	l8ui	a7, a3,  1
541	addi	a5, a5, -2
542	s8i	a6, a5,  0
543	s8i	a7, a5,  1
544	bbsi.l	a4, 0, .Lback15
545	retw
546.Lback15:
547	# copy 1 byte
548	addi	a3, a3, -1
549	addi	a5, a5, -1
550	l8ui	a6, a3,  0
551	s8i	a6, a5,  0
552	retw
553

/*
 * Local Variables:
 * mode:fundamental
 * comment-start: "# "
 * comment-start-skip: "# *"
 * End:
 */