xref: /linux/arch/sh/lib/memset-sh4.S (revision 0883c2c06fb5bcf5b9e008270827e63c09a88c1e)
1/*
2 * "memset" implementation for SH4
3 *
4 * Copyright (C) 1999  Niibe Yutaka
5 * Copyright (c) 2009  STMicroelectronics Limited
6 * Author: Stuart Menefy <stuart.menefy:st.com>
7 */
8
9/*
10 *            void *memset(void *s, int c, size_t n);
11 */
12
13#include <linux/linkage.h>
14
/*
 * memset: fill the n bytes starting at s with the low byte of c and
 * return s.
 *
 * Register use, as read from the code below:
 *   r4     write pointer.  Set to s + n on entry and walked back down with
 *          pre-decrement stores, so it is equal to s again at the rts.
 *   r5     fill value; widened at 2: so all four bytes hold the fill byte.
 *   r6     number of bytes still to be set.
 *   r0-r3  scratch.
 *
 * Strategy:
 *   n < 12 (signed)   byte loop only (40:).
 *   otherwise         single bytes down to a 4-byte boundary, then
 *     left < 64       8 bytes per pass (22:), then the tail bytes (40:);
 *     left >= 64      longwords down to a 32-byte boundary, whole 32-byte
 *                     blocks using movca.l (12:), then 22:/40: for the rest.
 *
 * The instruction that follows bt/s, bf/s and rts (written with one extra
 * leading space) is in the branch delay slot: it executes whether or not
 * the branch is taken.
 */
15ENTRY(memset)
16	mov	#12,r0		! small-fill threshold
17	add	r6,r4		! r4 = s + n: the fill runs backward from the end
18	cmp/gt	r6,r0		! T = (12 > n), signed compare
19	bt/s	40f		! if it's too small, set a byte at once
20	 mov	r4,r0		! (delay slot) r0 = end pointer for the alignment test
21	and	#3,r0		! r0 = bytes above the last 4-byte boundary
22	cmp/eq	#0,r0
23	bt/s	2f		! It's aligned
24	 sub	r0,r6		! (delay slot) loop 1: sets these r0 bytes; no-op if r0 == 0
	! Store r0 single bytes so that r4 becomes 4-byte aligned.
251:
26	dt	r0
27	bf/s	1b
28	 mov.b	r5,@-r4	! (delay slot) runs on every pass, including the last
292:				! make VVVV: fill byte replicated into all 4 bytes of r5
30	extu.b	r5,r5		!    V (upper 24 bits cleared)
31	swap.b	r5,r0		!   V0
32	or	r0,r5		!   VV
33	swap.w	r5,r0		! VV00
34	or	r0,r5		! VVVV
35
36	! Check if enough bytes remain to be set to be worth the big loop
37	mov	#0x40, r0	! (MT)
38	cmp/gt	r6,r0		! (MT)  64 > len => slow loop
39
40	bt/s	22f
41	 mov	r6,r0		! (delay slot) r0 = bytes left, as 22: expects
42
43	! align the dst to the cache block size (32 bytes) if necessary
44	mov	r4, r3
45	mov	#~(0x1f), r1
46
47	and	r3, r1		! r1 = r4 rounded down to a 32-byte boundary
48	cmp/eq	r3, r1
49
50	bt/s	11f		! dst is already aligned
51	 sub	r1, r3		! r3-r1 -> r3 (bytes above the boundary)
52	shlr2	r3		! number of loops (one longword each)
53
5410:	mov.l	r5,@-r4
55	dt	r3
56	bf/s	10b
57	 add	#-4, r6		! (delay slot) account for the longword just stored
58
5911:	! dst is 32byte aligned
	! r6 >= 36 here: it was >= 64 above and the alignment loop removes at
	! most 28 bytes, so the block count in r2 is at least 1.
60	mov	r6,r2
61	mov	#-5,r0
62	shld	r0,r2		! number of loops: r2 = r6 >> 5 (32-byte blocks)
63
64	add	#-32, r4	! r4 = base of the highest 32-byte block
65	mov	r5, r0		! movca.l below takes its source from r0
	! Each pass fills one 32-byte block at ascending offsets from r4.
	! NOTE(review): movca.l is the SH-4 store that allocates the cache
	! block without first reading it from memory -- confirm against the
	! CPU manual; that is presumably why dst was aligned to 32 bytes.
6612:
67	movca.l	r0,@r4
68	mov.l	r5,@(4, r4)
69	mov.l	r5,@(8, r4)
70	mov.l	r5,@(12,r4)
71	mov.l	r5,@(16,r4)
72	mov.l	r5,@(20,r4)
73	add	#-0x20, r6	! 32 bytes fewer to go
74	mov.l	r5,@(24,r4)
75	dt	r2
76	mov.l	r5,@(28,r4)
77	bf/s	12b
78	 add	#-32, r4	! (delay slot) step down to the next block
79
80	add	#32, r4		! undo the delay-slot decrement of the final pass
81	mov	#8, r0
82	cmp/ge	r0, r6		! T = (r6 >= 8), signed compare
83	bf	40f		! fewer than 8 bytes left: byte loop only
84
85	mov	r6,r0
	! 8-bytes-per-pass loop.  r0 = r6 >= 8 on both ways in (n >= 12 less at
	! most 3 alignment bytes, or the cmp/ge just above), so the pass count
	! computed below is never zero.
8622:
87	shlr2	r0
88	shlr	r0		! r0 = r6 >> 3
893:
90	dt	r0
91	mov.l	r5,@-r4		! set 8 bytes per pass: first longword here ...
92	bf/s	3b
93	 mov.l	r5,@-r4	! ... second longword in the delay slot
94	!
95	mov	#7,r0
96	and	r0,r6		! r6 = leftover bytes (0..7)
97
98	! fill bytes (length may be zero)
9940:	tst	r6,r6
100	bt	5f		! nothing left to set
1014:
102	dt	r6
103	bf/s	4b
104	 mov.b	r5,@-r4	! (delay slot) one byte per pass
1055:
106	rts
107	 mov	r4,r0		! (delay slot) return value: r4 is back at s
108