xref: /linux/arch/xtensa/lib/memcopy.S (revision 9a379e77033f02c4a071891afdf0f0a01eff8ccb)
1/*
2 * arch/xtensa/lib/memcopy.S -- Core HAL library functions
3 * xthal_memcpy and xthal_bcopy
4 *
5 * This file is subject to the terms and conditions of the GNU General Public
6 * License.  See the file "COPYING" in the main directory of this archive
7 * for more details.
8 *
9 * Copyright (C) 2002 - 2012 Tensilica Inc.
10 */
11
12#include <linux/linkage.h>
13#include <variant/core.h>
14#include <asm/asmmacro.h>
15
16/*
17 * void *memcpy(void *dst, const void *src, size_t len);
18 *
19 * This function is intended to do the same thing as the standard
20 * library function memcpy() for most cases.
21 * However, where the source and/or destination references
22 * an instruction RAM or ROM or a data RAM or ROM, that
23 * source and/or destination will always be accessed with
24 * 32-bit load and store instructions (as required for these
25 * types of devices).
26 *
27 * !!!!!!!  XTFIXME:
28 * !!!!!!!  Handling of IRAM/IROM has not yet
29 * !!!!!!!  been implemented.
30 *
31 * The (general case) algorithm is as follows:
32 *   If destination is unaligned, align it by conditionally
33 *     copying 1 and 2 bytes.
34 *   If source is aligned,
35 *     do 16 bytes with a loop, and then finish up with
36 *     8, 4, 2, and 1 byte copies conditional on the length;
37 *   else (if source is unaligned),
38 *     do the same, but use SRC to align the source data.
39 *   This code tries to use fall-through branches for the common
40 *     case of aligned source and destination and multiple
41 *     of 4 (or 8) length.
42 *
43 * Register use:
44 *	a0/ return address
45 *	a1/ stack pointer
46 *	a2/ return value
47 *	a3/ src
48 *	a4/ length
49 *	a5/ dst
50 *	a6/ tmp
51 *	a7/ tmp
52 *	a8/ tmp
53 *	a9/ tmp
54 *	a10/ tmp
55 *	a11/ tmp
56 */
57
58	.text
59
60/*
61 * Byte by byte copy
 *
 * Forward copy of a4 bytes from a3 (src) to a5 (dst), one byte per
 * iteration.  Entered from .Ldst1mod2 / .Ldst2mod4 when the copy is too
 * short for the destination to be aligned first.  a2 is never written
 * here, so whatever the entry code left there is still in place when the
 * fragment finishes the whole function with retw.
 *
 * In:       a3 = src, a4 = byte count (may be 0), a5 = dst
 * Clobbers: a3, a5, a6 (and a7 when XCHAL_HAVE_LOOPS is 0)
62 */
63	.align	4
64	.byte	0		# 1 mod 4 alignment for LOOPNEZ
65				# (0 mod 4 alignment for LBEG)
66.Lbytecopy:
67#if XCHAL_HAVE_LOOPS
68	loopnez	a4, .Lbytecopydone	# run the body a4 times, skip it if a4 == 0
69#else /* !XCHAL_HAVE_LOOPS */
70	beqz	a4, .Lbytecopydone	# nothing to copy
71	add	a7, a3, a4	# a7 = end address for source
72#endif /* !XCHAL_HAVE_LOOPS */
73.Lnextbyte:
74	l8ui	a6, a3, 0	# a6 = *src
75	addi	a3, a3, 1	# src++
76	s8i	a6, a5, 0	# *dst = a6
77	addi	a5, a5, 1	# dst++
78#if !XCHAL_HAVE_LOOPS
79	bne	a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
80#endif /* !XCHAL_HAVE_LOOPS */
81.Lbytecopydone:
82	retw			# return from the function that jumped here
83
84/*
85 * Destination is unaligned
 *
 * Out-of-line fix-up reached from .Lcommon when dst (a5) is not word
 * aligned.  It copies 1 and/or 2 leading bytes so that a5 becomes word
 * aligned, adjusts a3 (src) and a4 (remaining length) to match, and then
 * jumps back to .Ldstaligned.
 *
 * Copies shorter than 7 bytes (dst odd) or 6 bytes (dst 2 mod 4) are
 * handed to .Lbytecopy instead.  With those thresholds at most 3 bytes
 * are consumed here, so at least 4 bytes remain in a4 whenever control
 * returns to .Ldstaligned.
 *
 * Clobbers: a6, a7
86 */
87
88	.align	4
89.Ldst1mod2:	# dst is only byte aligned
90	_bltui	a4, 7, .Lbytecopy	# do short copies byte by byte
91
92	# copy 1 byte
93	l8ui	a6, a3,  0
94	addi	a3, a3,  1
95	addi	a4, a4, -1
96	s8i	a6, a5,  0
97	addi	a5, a5,  1	# a5 is now even
98	_bbci.l	a5, 1, .Ldstaligned	# if dst is now aligned, then
99					# return to main algorithm
					# (otherwise a5 is 2 mod 4: fall through)
100.Ldst2mod4:	# dst 16-bit aligned
101	# copy 2 bytes
102	_bltui	a4, 6, .Lbytecopy	# do short copies byte by byte
					# (only possible on direct entry from
					# .Lcommon; a4 >= 6 on fall-through)
103	l8ui	a6, a3,  0	# byte loads: src alignment is unknown here
104	l8ui	a7, a3,  1
105	addi	a3, a3,  2
106	addi	a4, a4, -2
107	s8i	a6, a5,  0
108	s8i	a7, a5,  1
109	addi	a5, a5,  2
110	j	.Ldstaligned	# dst is now aligned, return to main algorithm
111
112ENTRY(__memcpy)
113WEAK(memcpy)
/*
 * Entry point, dispatch on destination alignment, and the fast path used
 * when both dst and src are word aligned.
 *
 * In:  a2 = dst, a3 = src, a4 = len
 * Out: a2 = dst (a2 is never modified; a5 is the moving dst pointer)
 *
 * NOTE(review): ENTRY/WEAK come from <linux/linkage.h>; presumably memcpy
 * is emitted as a weak symbol for the same address -- confirm there.
 */
114
115	entry	sp, 16		# minimal stack frame
116	# a2/ dst, a3/ src, a4/ len
117	mov	a5, a2		# copy dst so that a2 is return value
118.Lcommon:	# also the target of the forward-copy branch in .Lmovecommon
	# a5 == a2 on every path that reaches this label, so testing a2
	# tests the alignment of the working destination pointer.
119	_bbsi.l	a2, 0, .Ldst1mod2	# if dst is 1 mod 2
120	_bbsi.l	a2, 1, .Ldst2mod4	# if dst is 2 mod 4
121.Ldstaligned:	# return here from .Ldst?mod? once dst is aligned
	# a5 is word aligned from here on; a4 = bytes still to copy.
122	srli	a7, a4, 4	# number of loop iterations with 16B
123				# per iteration
124	movi	a8, 3		# if source is not aligned,
125	_bany	a3, a8, .Lsrcunaligned	# then use shifting copy
					# (a8 = 3 is reused there as a mask)
126	/*
127	 * Destination and source are word-aligned, use word copy.
128	 */
129	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
130#if XCHAL_HAVE_LOOPS
131	loopnez	a7, .Loop1done	# a7 iterations, skipped when len < 16
132#else /* !XCHAL_HAVE_LOOPS */
133	beqz	a7, .Loop1done
134	slli	a8, a7, 4
135	add	a8, a8, a3	# a8 = end of last 16B source chunk
136#endif /* !XCHAL_HAVE_LOOPS */
137.Loop1:
	# Four word loads and stores, alternating between a6 and a7 so that
	# each store is separated from the load that feeds it.
	# a7 (the iteration count) is overwritten here; it is not read again.
138	l32i	a6, a3,  0
139	l32i	a7, a3,  4
140	s32i	a6, a5,  0
141	l32i	a6, a3,  8
142	s32i	a7, a5,  4
143	l32i	a7, a3, 12
144	s32i	a6, a5,  8
145	addi	a3, a3, 16
146	s32i	a7, a5, 12
147	addi	a5, a5, 16
148#if !XCHAL_HAVE_LOOPS
149	bne	a3, a8, .Loop1  # continue loop if a3:src != a8:src_end
150#endif /* !XCHAL_HAVE_LOOPS */
151.Loop1done:
	# Tail: bits 3..0 of a4 select the 8, 4, 2 and 1 byte copies below.
152	bbci.l	a4, 3, .L2	# skip if bit 3 of len is clear
153	# copy 8 bytes
154	l32i	a6, a3,  0
155	l32i	a7, a3,  4
156	addi	a3, a3,  8
157	s32i	a6, a5,  0
158	s32i	a7, a5,  4
159	addi	a5, a5,  8
160.L2:
161	bbsi.l	a4, 2, .L3	# 4 byte tail present
162	bbsi.l	a4, 1, .L4	# 2 byte tail present
163	bbsi.l	a4, 0, .L5	# 1 byte tail present
164	retw			# length was a multiple of 8: done
165.L3:
166	# copy 4 bytes
167	l32i	a6, a3,  0
168	addi	a3, a3,  4
169	s32i	a6, a5,  0
170	addi	a5, a5,  4
171	bbsi.l	a4, 1, .L4
172	bbsi.l	a4, 0, .L5
173	retw
174.L4:
175	# copy 2 bytes
	# (a3 and a5 are still word aligned here, so halfword access is safe)
176	l16ui	a6, a3,  0
177	addi	a3, a3,  2
178	s16i	a6, a5,  0
179	addi	a5, a5,  2
180	bbsi.l	a4, 0, .L5
181	retw
182.L5:
183	# copy 1 byte
184	l8ui	a6, a3,  0
185	s8i	a6, a5,  0
186	retw
187
188/*
189 * Destination is aligned, Source is unaligned
 *
 * Forward "shifting" copy, entered from .Ldstaligned with:
 *   a3 = src (not word aligned), a4 = remaining length,
 *   a5 = dst (word aligned),     a7 = a4 / 16,  a8 = 3.
 *
 * The source is read with aligned 32-bit loads only.  a6 always holds the
 * most recently loaded source word that has not been fully consumed; each
 * __src_b merges it with the next higher word to build one destination
 * word.  In every __src_b below the first source operand is the word at
 * the lower address and the second the word at the higher address.
 *
 * NOTE(review): __ssa8 and __src_b are macros from <asm/asmmacro.h>;
 * presumably they select the endian-specific SSA8/SRC forms -- confirm
 * there.
 *
 * Clobbers: a3, a5, a6-a11
190 */
191
192	.align	4
193.Lsrcunaligned:
194	_beqz	a4, .Ldone	# avoid loading anything for zero-length copies
195	# copy 16 bytes per iteration for word-aligned dst and unaligned src
196	__ssa8	a3		# set shift amount from byte offset
197
198/* set to 1 when running on ISS (simulator) with the
199   lint or ferret client, or 0 to save a few cycles */
200#define SIM_CHECKS_ALIGNMENT	1
	/* As defined above, the #if below is always true in this file. */
201#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
202	and	a11, a3, a8	# save unalignment offset for below
				# (a8 is still 3, set at .Ldstaligned)
203	sub	a3, a3, a11	# align a3
204#endif
205	l32i	a6, a3, 0	# load first word
206#if XCHAL_HAVE_LOOPS
207	loopnez	a7, .Loop2done	# a7 iterations, skipped when len < 16
208#else /* !XCHAL_HAVE_LOOPS */
209	beqz	a7, .Loop2done
210	slli	a10, a7, 4
211	add	a10, a10, a3	# a10 = end of last 16B source chunk
212#endif /* !XCHAL_HAVE_LOOPS */
213.Loop2:
	# Each pass loads the four words at a3+4 .. a3+16 and stores four
	# merged words; the word from a3+16 stays in a6 for the next pass.
214	l32i	a7, a3,  4
215	l32i	a8, a3,  8
216	__src_b	a6, a6, a7
217	s32i	a6, a5,  0
218	l32i	a9, a3, 12
219	__src_b	a7, a7, a8
220	s32i	a7, a5,  4
221	l32i	a6, a3, 16
222	__src_b	a8, a8, a9
223	s32i	a8, a5,  8
224	addi	a3, a3, 16
225	__src_b	a9, a9, a6
226	s32i	a9, a5, 12
227	addi	a5, a5, 16
228#if !XCHAL_HAVE_LOOPS
229	bne	a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
230#endif /* !XCHAL_HAVE_LOOPS */
231.Loop2done:
	# Tail: bits 3..0 of a4 select the 8, 4, 2 and 1 byte copies below.
232	bbci.l	a4, 3, .L12
233	# copy 8 bytes
234	l32i	a7, a3,  4
235	l32i	a8, a3,  8
236	__src_b	a6, a6, a7
237	s32i	a6, a5,  0
238	addi	a3, a3,  8
239	__src_b	a7, a7, a8
240	s32i	a7, a5,  4
241	addi	a5, a5,  8
242	mov	a6, a8		# keep the last loaded word in a6
243.L12:
244	bbci.l	a4, 2, .L13
245	# copy 4 bytes
246	l32i	a7, a3,  4
247	addi	a3, a3,  4
248	__src_b	a6, a6, a7
249	s32i	a6, a5,  0
250	addi	a5, a5,  4
251	mov	a6, a7		# a6 is not read again before being reloaded
252.L13:
253#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
254	add	a3, a3, a11	# readjust a3 with correct misalignment
255#endif
	# a3 is the true (unaligned) source address again; the remaining
	# 0-3 bytes are moved with byte accesses.
256	bbsi.l	a4, 1, .L14
257	bbsi.l	a4, 0, .L15
258.Ldone:	retw
259.L14:
260	# copy 2 bytes
261	l8ui	a6, a3,  0
262	l8ui	a7, a3,  1
263	addi	a3, a3,  2
264	s8i	a6, a5,  0
265	s8i	a7, a5,  1
266	addi	a5, a5,  2
267	bbsi.l	a4, 0, .L15
268	retw
269.L15:
270	# copy 1 byte
271	l8ui	a6, a3,  0
272	s8i	a6, a5,  0
273	retw
274
275ENDPROC(__memcpy)
276
277/*
278 * void bcopy(const void *src, void *dest, size_t n);
 *
 * Same operation as memmove() but with the first two arguments swapped.
 * The arguments are shuffled into the register layout memmove uses
 * (a2 = a5 = dst, a3 = src, a4 = len) and control continues at
 * .Lmovecommon, so overlapping buffers are handled by the same
 * forward/backward selection.  As a consequence a2 holds dst on return.
279 */
280
281ENTRY(bcopy)
282
283	entry	sp, 16		# minimal stack frame
284	# a2=src, a3=dst, a4=len
285	mov	a5, a3		# a5 = dst (working destination pointer)
286	mov	a3, a2		# a3 = src
287	mov	a2, a5		# a2 = dst
288	j	.Lmovecommon	# go to common code for memmove+bcopy
289
290ENDPROC(bcopy)
291
292/*
293 * void *memmove(void *dst, const void *src, size_t len);
294 *
295 * This function is intended to do the same thing as the standard
296 * library function memmove() for most cases.
297 * However, where the source and/or destination references
298 * an instruction RAM or ROM or a data RAM or ROM, that
299 * source and/or destination will always be accessed with
300 * 32-bit load and store instructions (as required for these
301 * types of devices).
302 *
303 * !!!!!!!  XTFIXME:
304 * !!!!!!!  Handling of IRAM/IROM has not yet
305 * !!!!!!!  been implemented.
306 *
307 * The (general case) algorithm is as follows:
308 *   If dst is outside [src, src + len), a forward copy is safe: use memcpy.
309 *   Otherwise copy backwards, starting at the end of both buffers.
310 *
311 * Register use:
312 *	a0/ return address
313 *	a1/ stack pointer
314 *	a2/ return value
315 *	a3/ src
316 *	a4/ length
317 *	a5/ dst
318 *	a6/ tmp
319 *	a7/ tmp
320 *	a8/ tmp
321 *	a9/ tmp
322 *	a10/ tmp
323 *	a11/ tmp
324 */
325
326/*
327 * Byte by byte copy
 *
 * Backward copy of a4 bytes, one byte per iteration.  On entry a3 and a5
 * point one past the last source / destination byte still to be copied;
 * both are decremented before each access.  Entered from .Lbackdst1mod2 /
 * .Lbackdst2mod4 for short copies.  a2 is never written here, and the
 * fragment finishes the whole function with retw.
 *
 * In:       a3 = src end, a4 = byte count, a5 = dst end
 * Clobbers: a3, a5, a6 (and a7 when XCHAL_HAVE_LOOPS is 0)
328 */
329	.align	4
330	.byte	0		# 1 mod 4 alignment for LOOPNEZ
331				# (0 mod 4 alignment for LBEG)
332.Lbackbytecopy:
333#if XCHAL_HAVE_LOOPS
334	loopnez	a4, .Lbackbytecopydone	# run the body a4 times, skip if a4 == 0
335#else /* !XCHAL_HAVE_LOOPS */
336	beqz	a4, .Lbackbytecopydone
337	sub	a7, a3, a4	# a7 = start address for source
338#endif /* !XCHAL_HAVE_LOOPS */
339.Lbacknextbyte:
340	addi	a3, a3, -1	# --src
341	l8ui	a6, a3, 0	# a6 = *src
342	addi	a5, a5, -1	# --dst
343	s8i	a6, a5, 0	# *dst = a6
344#if !XCHAL_HAVE_LOOPS
345	bne	a3, a7, .Lbacknextbyte # continue loop if
346				       # $a3:src != $a7:src_start
347#endif /* !XCHAL_HAVE_LOOPS */
348.Lbackbytecopydone:
349	retw			# return from the function that jumped here
350
351/*
352 * Destination is unaligned
 *
 * Out-of-line fix-up for the backward copy, reached from the dispatch
 * after .Lmovecommon when the end-of-destination pointer (a5) is not word
 * aligned.  It copies 1 and/or 2 trailing bytes so that a5 becomes word
 * aligned, moves a3 (end of source) down by the same amount, reduces a4,
 * and then jumps back to .Lbackdstaligned.
 *
 * Copies shorter than 7 bytes (a5 odd) or 6 bytes (a5 2 mod 4) are handed
 * to .Lbackbytecopy instead.  With those thresholds at most 3 bytes are
 * consumed here, so at least 4 bytes remain in a4 whenever control
 * returns to .Lbackdstaligned.
 *
 * Clobbers: a6, a7
353 */
354
355	.align	4
356.Lbackdst1mod2:	# dst is only byte aligned
357	_bltui	a4, 7, .Lbackbytecopy	# do short copies byte by byte
358
359	# copy 1 byte
360	addi	a3, a3, -1
361	l8ui	a6, a3,  0
362	addi	a5, a5, -1	# a5 is now even
363	s8i	a6, a5,  0
364	addi	a4, a4, -1
365	_bbci.l	a5, 1, .Lbackdstaligned	# if dst is now aligned, then
366					# return to main algorithm
					# (otherwise a5 is 2 mod 4: fall through)
367.Lbackdst2mod4:	# dst 16-bit aligned
368	# copy 2 bytes
369	_bltui	a4, 6, .Lbackbytecopy	# do short copies byte by byte
					# (only possible on direct entry;
					# a4 >= 6 on fall-through)
370	addi	a3, a3, -2
371	l8ui	a6, a3,  0	# byte loads: src alignment is unknown here
372	l8ui	a7, a3,  1
373	addi	a5, a5, -2
374	s8i	a6, a5,  0
375	s8i	a7, a5,  1
376	addi	a4, a4, -2
377	j	.Lbackdstaligned	# dst is now aligned,
378					# return to main algorithm
379
380ENTRY(__memmove)
381WEAK(memmove)
/*
 * Entry point, overlap test, and the backward fast path used when the end
 * pointers of both dst and src are word aligned.
 *
 * In:  a2 = dst, a3 = src, a4 = len
 * Out: a2 = dst (a2 is never modified; a5 is the moving dst pointer)
 *
 * NOTE(review): ENTRY/WEAK come from <linux/linkage.h>; presumably memmove
 * is emitted as a weak symbol for the same address -- confirm there.
 */
382
383	entry	sp, 16		# minimal stack frame
384	# a2/ dst, a3/ src, a4/ len
385	mov	a5, a2		# copy dst so that a2 is return value
386.Lmovecommon:	# bcopy joins here with a2 = a5 = dst, a3 = src, a4 = len
387	sub	a6, a5, a3	# a6 = dst - src (wraps to a large unsigned
				# value when dst < src)
388	bgeu	a6, a4, .Lcommon	# unsigned (dst - src) >= len: a forward
				# copy cannot overwrite unread source
				# bytes, so reuse the memcpy code; a
				# zero length always takes this branch
389
	# Backward copy: a5 and a3 now point one past the end of dst and src
	# and are decremented before each access.
390	add	a5, a5, a4
391	add	a3, a3, a4
392
393	_bbsi.l	a5, 0, .Lbackdst1mod2	# if dst is 1 mod 2
394	_bbsi.l	a5, 1, .Lbackdst2mod4	# if dst is 2 mod 4
395.Lbackdstaligned:	# return here from .Lbackdst?mod? once dst is aligned
	# a5 is word aligned from here on; a4 = bytes still to copy.
396	srli	a7, a4, 4	# number of loop iterations with 16B
397				# per iteration
398	movi	a8, 3		# if source is not aligned,
399	_bany	a3, a8, .Lbacksrcunaligned	# then use shifting copy
					# (a8 = 3 is reused there as a mask)
400	/*
401	 * Destination and source are word-aligned, use word copy.
402	 */
403	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
404#if XCHAL_HAVE_LOOPS
405	loopnez	a7, .backLoop1done	# a7 iterations, skipped when len < 16
406#else /* !XCHAL_HAVE_LOOPS */
407	beqz	a7, .backLoop1done
408	slli	a8, a7, 4
409	sub	a8, a3, a8	# a8 = start of first 16B source chunk
410#endif /* !XCHAL_HAVE_LOOPS */
411.backLoop1:
	# Words are moved from the highest offset (12) down to 0, alternating
	# between a7 and a6.  a7 (the iteration count) is overwritten here;
	# it is not read again.
412	addi	a3, a3, -16
413	l32i	a7, a3, 12
414	l32i	a6, a3,  8
415	addi	a5, a5, -16
416	s32i	a7, a5, 12
417	l32i	a7, a3,  4
418	s32i	a6, a5,  8
419	l32i	a6, a3,  0
420	s32i	a7, a5,  4
421	s32i	a6, a5,  0
422#if !XCHAL_HAVE_LOOPS
423	bne	a3, a8, .backLoop1  # continue loop if a3:src != a8:src_start
424#endif /* !XCHAL_HAVE_LOOPS */
425.backLoop1done:
	# Tail: bits 3..0 of a4 select the 8, 4, 2 and 1 byte copies below.
426	bbci.l	a4, 3, .Lback2	# skip if bit 3 of len is clear
427	# copy 8 bytes
428	addi	a3, a3, -8
429	l32i	a6, a3,  0
430	l32i	a7, a3,  4
431	addi	a5, a5, -8
432	s32i	a6, a5,  0
433	s32i	a7, a5,  4
434.Lback2:
435	bbsi.l	a4, 2, .Lback3	# 4 byte tail present
436	bbsi.l	a4, 1, .Lback4	# 2 byte tail present
437	bbsi.l	a4, 0, .Lback5	# 1 byte tail present
438	retw			# length was a multiple of 8: done
439.Lback3:
440	# copy 4 bytes
441	addi	a3, a3, -4
442	l32i	a6, a3,  0
443	addi	a5, a5, -4
444	s32i	a6, a5,  0
445	bbsi.l	a4, 1, .Lback4
446	bbsi.l	a4, 0, .Lback5
447	retw
448.Lback4:
449	# copy 2 bytes
	# (a3 and a5 are word aligned before the decrement, so the halfword
	# access below is aligned)
450	addi	a3, a3, -2
451	l16ui	a6, a3,  0
452	addi	a5, a5, -2
453	s16i	a6, a5,  0
454	bbsi.l	a4, 0, .Lback5
455	retw
456.Lback5:
457	# copy 1 byte
458	addi	a3, a3, -1
459	l8ui	a6, a3,  0
460	addi	a5, a5, -1
461	s8i	a6, a5,  0
462	retw
463
464/*
465 * Destination is aligned, Source is unaligned
 *
 * Backward "shifting" copy, entered from .Lbackdstaligned with:
 *   a3 = one past the last source byte (not word aligned),
 *   a4 = remaining length,
 *   a5 = one past the last destination byte (word aligned),
 *   a7 = a4 / 16,  a8 = 3.
 *
 * The source is read with aligned 32-bit loads only.  a6 always holds the
 * most recently loaded (lowest-address) source word that has not been
 * fully consumed; each __src_b merges it with the next lower word to
 * build one destination word.  In every __src_b below the first source
 * operand is the word at the lower address and the second the word at
 * the higher address.
 *
 * NOTE(review): __ssa8 and __src_b are macros from <asm/asmmacro.h>;
 * presumably they select the endian-specific SSA8/SRC forms -- confirm
 * there.
 *
 * Clobbers: a3, a5, a6-a11
466 */
467
468	.align	4
469.Lbacksrcunaligned:
470	_beqz	a4, .Lbackdone	# avoid loading anything for zero-length copies
	# NOTE(review): a4 looks non-zero on every path into this label
	# (.Lmovecommon sends zero-length copies to the forward code); the
	# check above mirrors the forward variant -- confirm before removing.
471	# copy 16 bytes per iteration for word-aligned dst and unaligned src
472	__ssa8	a3		# set shift amount from byte offset
473#define SIM_CHECKS_ALIGNMENT	1	/* set to 1 when running on ISS with
474					 * the lint or ferret client, or 0
475					 * to save a few cycles */
	/*
	 * Same value as the earlier definition in the forward-copy code; as
	 * defined, the #if below is always true in this file.
	 */
476#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
477	and	a11, a3, a8	# save unalignment offset for below
				# (a8 is still 3, set at .Lbackdstaligned)
478	sub	a3, a3, a11	# align a3
479#endif
480	l32i	a6, a3, 0	# load first word
				# (the word holding the last source bytes)
481#if XCHAL_HAVE_LOOPS
482	loopnez	a7, .backLoop2done	# a7 iterations, skipped when len < 16
483#else /* !XCHAL_HAVE_LOOPS */
484	beqz	a7, .backLoop2done
485	slli	a10, a7, 4
486	sub	a10, a3, a10	# a10 = start of first 16B source chunk
487#endif /* !XCHAL_HAVE_LOOPS */
488.backLoop2:
	# Each pass steps a3 down by 16, loads the words at a3+12 .. a3+0 and
	# stores four merged words; the word from a3+0 stays in a6 for the
	# next pass.
489	addi	a3, a3, -16
490	l32i	a7, a3, 12
491	l32i	a8, a3,  8
492	addi	a5, a5, -16
493	__src_b	a6, a7, a6
494	s32i	a6, a5, 12
495	l32i	a9, a3,  4
496	__src_b	a7, a8, a7
497	s32i	a7, a5,  8
498	l32i	a6, a3,  0
499	__src_b	a8, a9, a8
500	s32i	a8, a5,  4
501	__src_b	a9, a6, a9
502	s32i	a9, a5,  0
503#if !XCHAL_HAVE_LOOPS
504	bne	a3, a10, .backLoop2 # continue loop if a3:src != a10:src_start
505#endif /* !XCHAL_HAVE_LOOPS */
506.backLoop2done:
	# Tail: bits 3..0 of a4 select the 8, 4, 2 and 1 byte copies below.
507	bbci.l	a4, 3, .Lback12
508	# copy 8 bytes
509	addi	a3, a3, -8
510	l32i	a7, a3,  4
511	l32i	a8, a3,  0
512	addi	a5, a5, -8
513	__src_b	a6, a7, a6
514	s32i	a6, a5,  4
515	__src_b	a7, a8, a7
516	s32i	a7, a5,  0
517	mov	a6, a8		# keep the last loaded word in a6
518.Lback12:
519	bbci.l	a4, 2, .Lback13
520	# copy 4 bytes
521	addi	a3, a3, -4
522	l32i	a7, a3,  0
523	addi	a5, a5, -4
524	__src_b	a6, a7, a6
525	s32i	a6, a5,  0
526	mov	a6, a7		# a6 is not read again before being reloaded
527.Lback13:
528#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
529	add	a3, a3, a11	# readjust a3 with correct misalignment
530#endif
	# a3 is the true (unaligned) source position again; the remaining
	# 0-3 bytes are moved with byte accesses.
531	bbsi.l	a4, 1, .Lback14
532	bbsi.l	a4, 0, .Lback15
533.Lbackdone:
534	retw
535.Lback14:
536	# copy 2 bytes
537	addi	a3, a3, -2
538	l8ui	a6, a3,  0
539	l8ui	a7, a3,  1
540	addi	a5, a5, -2
541	s8i	a6, a5,  0
542	s8i	a7, a5,  1
543	bbsi.l	a4, 0, .Lback15
544	retw
545.Lback15:
546	# copy 1 byte
547	addi	a3, a3, -1
548	addi	a5, a5, -1
549	l8ui	a6, a3,  0
550	s8i	a6, a5,  0
551	retw
552
553ENDPROC(__memmove)
554