xref: /linux/arch/arc/lib/memcpy-archs.S (revision 0883c2c06fb5bcf5b9e008270827e63c09a88c1e)
/*
 * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com)
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
8
9#include <linux/linkage.h>
10
/*
 * Endian-dependent helpers for the unaligned-source paths of memcpy.
 * Every macro expands to at most one instruction:
 *
 *   SHIFT_1 / SHIFT_2     - the two opposite shifts applied to each word
 *                           loaded in the copy loops; the first result is
 *                           OR-ed with the carried bytes and stored, the
 *                           second becomes the new carry
 *   MERGE_1 / MERGE_2     - shifts applied to the initial halfword and byte
 *                           before they are OR-ed together (CASE 1)
 *   EXTRACT_1 / EXTRACT_2 - produce the values written by the trailing
 *                           sth and stb (CASE 1)
 *
 * MERGE_2 expands to nothing on little endian.  Little-endian EXTRACT_1 and
 * big-endian EXTRACT_2 ignore their AMT argument and use a fixed operand.
 */
#if defined(__LITTLE_ENDIAN__)
# define SHIFT_1(DST, SRC, AMT)		asl	DST, SRC, AMT	; shift left
# define SHIFT_2(DST, SRC, AMT)		lsr	DST, SRC, AMT	; shift right
# define MERGE_1(DST, SRC, AMT)		asl	DST, SRC, AMT	; shift left
# define MERGE_2(DST, SRC, AMT)
# define EXTRACT_1(DST, SRC, AMT)	and	DST, SRC, 0xFFFF
# define EXTRACT_2(DST, SRC, AMT)	lsr	DST, SRC, AMT
#else
# define SHIFT_1(DST, SRC, AMT)		lsr	DST, SRC, AMT	; shift right
# define SHIFT_2(DST, SRC, AMT)		asl	DST, SRC, AMT	; shift left
# define MERGE_1(DST, SRC, AMT)		asl	DST, SRC, AMT	; shift left
# define MERGE_2(DST, SRC, AMT)		asl	DST, SRC, AMT	; shift left
# define EXTRACT_1(DST, SRC, AMT)	lsr	DST, SRC, AMT
# define EXTRACT_2(DST, SRC, AMT)	lsr	DST, SRC, 0x08
#endif
26
/*
 * Building blocks of the fully aligned copy loop (CASE 0), selected by
 * CONFIG_ARC_HAS_LL64:
 *
 *   LOADX / STOREX  - post-incrementing load / store of one unit
 *                     (4 bytes with ld/st, 8 bytes with ldd/std)
 *   PREFETCH_READ   - read prefetch ahead of the source pointer
 *   PREFETCH_WRITE  - write prefetch ahead of the destination pointer
 *   ZOLSHFT         - right shift that turns a byte count into iterations
 *                     of the loop, which moves four units per pass
 *                     (16 or 32 bytes)
 *   ZOLAND          - mask giving the bytes left over after that loop
 */
#ifndef CONFIG_ARC_HAS_LL64
# define ZOLSHFT			4
# define ZOLAND				0xF
# define LOADX(REG, PTR)		ld.ab	REG, [PTR, 4]
# define STOREX(REG, PTR)		st.ab	REG, [PTR, 4]
# define PREFETCH_READ(PTR)		prefetch	[PTR, 28]
# define PREFETCH_WRITE(PTR)		prefetchw	[PTR, 32]
#else
# define ZOLSHFT			5
# define ZOLAND				0x1F
# define LOADX(REG, PTR)		ldd.ab	REG, [PTR, 8]
# define STOREX(REG, PTR)		std.ab	REG, [PTR, 8]
# define PREFETCH_READ(PTR)		prefetch	[PTR, 56]
# define PREFETCH_WRITE(PTR)		prefetchw	[PTR, 64]
#endif
42
/*
 * void *memcpy(void *dst, const void *src, size_t n)
 *
 * In:   r0 = dst, r1 = src, r2 = n (bytes)
 * Out:  r0 = dst (never modified; r3 is the running destination pointer)
 * Uses: r3-r10 as scratch (with LL64 the 64-bit loads are expected to fill
 *       register pairs, so r11 as well - confirm against the ISA manual),
 *       lp_count and the status flags.
 *
 * Strategy:
 *   n == 0   return immediately
 *   n <= 8   plain byte loop
 *   n >  8   copy single bytes until the destination is 32-bit aligned,
 *            then dispatch on (src & 3):
 *              CASE 0  source aligned too: unrolled word/dword loop
 *              CASE 1-3 source off by 1/2/3: read aligned words and
 *                      re-assemble them with shifts, carrying the spare
 *                      bytes in r5 from one word to the next
 *            every case finishes with a byte loop for the leftover bytes.
 *
 * All loops are zero-overhead loops (lpnz); each is skipped when the flag
 * setting instruction in front of it produced zero.
 *
 * No write prefetch (prefetchw) is issued.  Earlier revisions issued one on
 * [r0] before the size check and 32/64 bytes ahead of the destination
 * pointer in every unrolled loop, which can reach cache lines past the end
 * of the destination buffer.  A write prefetch takes the line for
 * modification, so it can corrupt memory the caller does not own, e.g. a
 * neighbouring buffer that a device fills by DMA.  Read prefetches do not
 * modify anything and are kept.
 */
ENTRY(memcpy)
	prefetch [r1]			; warm up the first source line
	mov.f	0, r2			; set Z when n is zero
	jz.d	[blink]			; n is zero: nothing to copy
	mov	r3, r0			; (delay slot) r3 = write cursor, r0 stays the return value

	;; n <= 8: go straight to the byte loop
	cmp	r2, 8
	bls.d	@.Lsmallchunk
	mov.f	lp_count, r2		; (delay slot) byte count for the small copy

	;; Copy 4 - (dst & 3) leading bytes so the destination becomes
	;; 32-bit aligned.  Skipped when dst is already aligned (Z set by
	;; and.f; rsub leaves the flags alone).
	and.f	r4, r0, 0x03
	rsub	lp_count, r4, 4
	lpnz	@.Laligndestination
	;; LOOP BEGIN
	ldb.ab	r5, [r1, 1]
	sub	r2, r2, 1		; keep the remaining length up to date
	stb.ab	r5, [r3, 1]
.Laligndestination:

	;; Dispatch on the alignment of the source
	and.f	r4, r1, 0x03
	bnz.d	@.Lsourceunaligned
	;; (delay slot) number of unrolled iterations for CASE 0; the
	;; unaligned cases recompute lp_count before using it
	lsr.f	lp_count, r2, ZOLSHFT

;;; CASE 0: source and destination are both 32-bit aligned.
;;; Four units (16 bytes, or 32 bytes with LL64) per iteration.
	lpnz	@.Lcopy32_64bytes
	;; LOOP BEGIN
	LOADX (r6, r1)
	PREFETCH_READ (r1)
	LOADX (r8, r1)
	LOADX (r10, r1)
	LOADX (r4, r1)
	STOREX (r6, r3)
	STOREX (r8, r3)
	STOREX (r10, r3)
	STOREX (r4, r3)
.Lcopy32_64bytes:

	;; Bytes left over after the unrolled loop: 0-15, or 0-31 with LL64
	and.f	lp_count, r2, ZOLAND
.Lsmallchunk:
	;; Also the entry point for n <= 8, with lp_count = n
	lpnz	@.Lcopyremainingbytes
	;; LOOP BEGIN
	ldb.ab	r5, [r1, 1]
	stb.ab	r5, [r3, 1]
.Lcopyremainingbytes:

	j	[blink]
;;; END CASE 0

.Lsourceunaligned:
	;; r4 = src & 3, i.e. 1, 2 or 3.  sub does not touch the flags, so
	;; both branches below test the result of the cmp.
	cmp	r4, 2
	beq.d	@.LunalignedOffby2
	sub	r2, r2, 1		; (delay slot, every case) one byte accounted for

	bhi.d	@.LunalignedOffby3
	ldb.ab	r5, [r1, 1]		; (delay slot, cases 1 and 3) r5 = first source byte

;;; CASE 1: the source is off by 1.
	;; One byte was read above (source now 16-bit aligned); a halfword
	;; read brings the source to 32-bit alignment.  r5 then carries
	;; three bytes that have been read but not yet written.
	ldh.ab	r6, [r1, 2]
	sub	r2, r2, 2
	;; Two words (8 bytes) per iteration
	lsr.f	lp_count, r2, 3
	MERGE_1 (r6, r6, 8)
	MERGE_2 (r5, r5, 24)
	or	r5, r5, r6

	;; Source and destination pointers are both aligned from here on
	lpnz	@.Lcopy8bytes_1
	;; LOOP BEGIN
	ld.ab	r6, [r1, 4]
	prefetch [r1, 28]		; read prefetch for the next iteration
	ld.ab	r8, [r1, 4]

	SHIFT_1	(r7, r6, 24)		; one new byte joins the three carried ones
	or	r7, r7, r5
	SHIFT_2	(r5, r6, 8)		; the other three bytes become the carry

	SHIFT_1	(r9, r8, 24)
	or	r9, r9, r5
	SHIFT_2	(r5, r8, 8)

	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
.Lcopy8bytes_1:

	;; Flush the three carried bytes: first two as a halfword ...
	EXTRACT_1 (r6, r5, 16)
	sth.ab	r6, [r3, 2]
	;; ... then the last one as a byte
	EXTRACT_2 (r5, r5, 16)
	stb.ab	r5, [r3, 1]

	and.f	lp_count, r2, 0x07	; 0-7 bytes left
	lpnz	@.Lcopybytewise_1
	;; LOOP BEGIN
	ldb.ab	r6, [r1, 1]
	stb.ab	r6, [r3, 1]
.Lcopybytewise_1:
	j	[blink]

.LunalignedOffby2:
;;; CASE 2: the source is off by 2.
	;; A halfword read aligns the source; r5 carries those two bytes.
	ldh.ab	r5, [r1, 2]
	sub	r2, r2, 1		; second byte of the halfword (first one was counted at dispatch)

	;; Two words (8 bytes) per iteration
	lsr.f	lp_count, r2, 3
#ifdef __BIG_ENDIAN__
	;; Only when the loop is going to run: move the carry to the upper
	;; half, where the loop expects it
	asl.nz	r5, r5, 16
#endif
	lpnz	@.Lcopy8bytes_2
	;; LOOP BEGIN
	ld.ab	r6, [r1, 4]
	prefetch [r1, 28]		; read prefetch for the next iteration
	ld.ab	r8, [r1, 4]

	SHIFT_1	(r7, r6, 16)
	or	r7, r7, r5
	SHIFT_2	(r5, r6, 16)

	SHIFT_1	(r9, r8, 16)
	or	r9, r9, r5
	SHIFT_2	(r5, r8, 16)

	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
.Lcopy8bytes_2:

#ifdef __BIG_ENDIAN__
	;; Flags are still those of the lsr.f above (nothing in the loop
	;; sets them): bring the carry back down if the loop ran
	lsr.nz	r5, r5, 16
#endif
	sth.ab	r5, [r3, 2]		; flush the two carried bytes

	and.f	lp_count, r2, 0x07	; 0-7 bytes left
	lpnz	@.Lcopybytewise_2
	;; LOOP BEGIN
	ldb.ab	r6, [r1, 1]
	stb.ab	r6, [r3, 1]
.Lcopybytewise_2:
	j	[blink]

.LunalignedOffby3:
;;; CASE 3: the source is off by 3.
;;; The single byte read in the dispatch delay slot already brought the
;;; source to 32-bit alignment; r5 carries that byte.

	;; Two words (8 bytes) per iteration
	lsr.f	lp_count, r2, 3
#ifdef __BIG_ENDIAN__
	;; Only when the loop is going to run: move the carry to the top byte
	asl.ne	r5, r5, 24
#endif
	lpnz	@.Lcopy8bytes_3
	;; LOOP BEGIN
	ld.ab	r6, [r1, 4]
	prefetch [r1, 28]		; read prefetch for the next iteration
	ld.ab	r8, [r1, 4]

	SHIFT_1	(r7, r6, 8)
	or	r7, r7, r5
	SHIFT_2	(r5, r6, 24)

	SHIFT_1	(r9, r8, 8)
	or	r9, r9, r5
	SHIFT_2	(r5, r8, 24)

	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
.Lcopy8bytes_3:

#ifdef __BIG_ENDIAN__
	;; Flags are still those of the lsr.f above: bring the carry back
	;; down if the loop ran
	lsr.nz	r5, r5, 24
#endif
	stb.ab	r5, [r3, 1]		; flush the carried byte

	and.f	lp_count, r2, 0x07	; 0-7 bytes left
	lpnz	@.Lcopybytewise_3
	;; LOOP BEGIN
	ldb.ab	r6, [r1, 1]
	stb.ab	r6, [r3, 1]
.Lcopybytewise_3:
	j	[blink]

END(memcpy)
237