xref: /linux/arch/arm64/lib/copy_template.S (revision 36110669ddf832e6c9ceba4dd203749d5be31d31)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on the glibc cortex-strings work originally authored by
 * Linaro, which can be found at:
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */


/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 *
 * The load/store mnemonics used by the copy code (ldrb1, strb1, ldrh1,
 * strh1, ldr1, str1, ldp1, stp1) are not defined in this file; they are
 * expected to be macros supplied by the file that includes this template.
 */

/* Arguments, working pointer and scratch registers. */
dstin	.req	x0	/* dest as passed in; only read by the copy code */
src	.req	x1	/* source pointer */
count	.req	x2	/* byte count (n) */
tmp1	.req	x3	/* scratch */
tmp1w	.req	w3	/* 32-bit view of tmp1 */
tmp2	.req	x4	/* scratch: bytes needed to 16-byte align src */
tmp2w	.req	w4	/* 32-bit view of tmp2; not referenced in this file */
dst	.req	x6	/* working destination pointer */

/* Four register pairs; each pair carries 16 bytes through ldp1/stp1. */
A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

42	mov	dst, dstin
43	cmp	count, #16
44	/*When memory length is less than 16, the accessed are not aligned.*/
45	b.lo	.Ltiny15
46
47	neg	tmp2, src
48	ands	tmp2, tmp2, #15/* Bytes to reach alignment. */
49	b.eq	.LSrcAligned
50	sub	count, count, tmp2
51	/*
52	* Copy the leading memory data from src to dst in an increasing
53	* address order.By this way,the risk of overwriting the source
54	* memory data is eliminated when the distance between src and
55	* dst is less than 16. The memory accesses here are alignment.
56	*/
57	tbz	tmp2, #0, 1f
58	ldrb1	tmp1w, src, #1
59	strb1	tmp1w, dst, #1
601:
61	tbz	tmp2, #1, 2f
62	ldrh1	tmp1w, src, #2
63	strh1	tmp1w, dst, #2
642:
65	tbz	tmp2, #2, 3f
66	ldr1	tmp1w, src, #4
67	str1	tmp1w, dst, #4
683:
69	tbz	tmp2, #3, .LSrcAligned
70	ldr1	tmp1, src, #8
71	str1	tmp1, dst, #8
72
.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64
	/*
	* Deal with small copies quickly by dropping straight into the
	* exit block.
	*/
.Ltail63:
	/*
	* Copy up to 48 bytes of data. At this point we only need the
	* bottom 6 bits of count to be accurate.
	*
	* Bits 5:4 of count give the number of whole 16-byte chunks (0..3).
	* The three ldp1/stp1 pairs below form a fall-through ladder that is
	* entered at the rung matching that number.
	*/
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15		/* no 16-byte chunk left */
	cmp	tmp1w, #0x20
	b.eq	1f			/* 32 bytes: last two pairs */
	b.lt	2f			/* 16 bytes: last pair only */
	/* 48 bytes: run all three pairs. */
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
1:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
2:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	/* Fall through to .Ltiny15 for the remaining 0..15 bytes. */
.Ltiny15:
	/*
	* Copy the final 0..15 bytes; only bits 3:0 of count are examined.
	*
	* Prefer to break one ldp/stp into several smaller loads/stores so
	* that memory is accessed in increasing address order, rather than
	* loading/storing 16 bytes from (src-16) to (dst-16) after moving src
	* back to an aligned address, which is what the original cortex
	* memcpy does. Keeping the original approach here would require
	* memmove to guarantee that src is at least 16 bytes above dst;
	* otherwise some source data would be overwritten when memmove calls
	* memcpy directly. To keep memmove simple and to decouple memcpy from
	* memmove, the original approach was dropped.
	*/
	tbz	count, #3, 1f
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8
1:
	tbz	count, #2, 2f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
2:
	tbz	count, #1, 3f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
3:
	tbz	count, #0, .Lexitfunc
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1

	b	.Lexitfunc

.Lcpy_over64:
	/*
	* Bias count by -128. A non-negative result means at least 128
	* bytes remain and the pipelined loop is used. Because 128 is a
	* multiple of 64, the bottom 6 bits of count still hold the number
	* of bytes left over after whole 64-byte blocks.
	*/
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	* Less than 128 bytes to copy, so handle 64 here and then jump
	* to the tail.
	*/
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	ldp1	D_l, D_h, src, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f		/* any bytes beyond this 64-byte block? */
	b.ne	.Ltail63
	b	.Lexitfunc

148	/*
149	* Critical loop.  Start at a new cache line boundary.  Assuming
150	* 64 bytes per line this ensures the entire loop is in one line.
151	*/
152	.p2align	L1_CACHE_SHIFT
153.Lcpy_body_large:
154	/* pre-get 64 bytes data. */
155	ldp1	A_l, A_h, src, #16
156	ldp1	B_l, B_h, src, #16
157	ldp1	C_l, C_h, src, #16
158	ldp1	D_l, D_h, src, #16
1591:
160	/*
161	* interlace the load of next 64 bytes data block with store of the last
162	* loaded 64 bytes data.
163	*/
164	stp1	A_l, A_h, dst, #16
165	ldp1	A_l, A_h, src, #16
166	stp1	B_l, B_h, dst, #16
167	ldp1	B_l, B_h, src, #16
168	stp1	C_l, C_h, dst, #16
169	ldp1	C_l, C_h, src, #16
170	stp1	D_l, D_h, dst, #16
171	ldp1	D_l, D_h, src, #16
172	subs	count, count, #64
173	b.ge	1b
174	stp1	A_l, A_h, dst, #16
175	stp1	B_l, B_h, dst, #16
176	stp1	C_l, C_h, dst, #16
177	stp1	D_l, D_h, dst, #16
178
179	tst	count, #0x3f
180	b.ne	.Ltail63
181.Lexitfunc:
182