xref: /linux/arch/loongarch/lib/memcpy.S (revision 173b0b5b0e865348684c02bd9cb1d22b5d46e458)
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
4 */
5
6#include <linux/export.h>
7#include <asm/alternative-asm.h>
8#include <asm/asm.h>
9#include <asm/asmmacro.h>
10#include <asm/cpu.h>
11#include <asm/regdef.h>
12#include <asm/unwind_hints.h>
13
14.section .noinstr.text, "ax"
/*
 * The code that follows is emitted into the .noinstr.text section
 * ("ax": allocatable and executable).
 */
15
/*
 * memcpy / __memcpy: dispatch stub.
 *
 * The stub holds no copy logic of its own. Through ALTERNATIVE it contains a
 * single branch: "b __memcpy_generic" is the default instruction and
 * "b __memcpy_fast" is the replacement tied to CPU_FEATURE_UAL.
 * (ALTERNATIVE itself is defined outside this file.)
 *
 * Both targets are reached with a plain "b" (no link), so a0/a1/a2 and ra
 * arrive untouched and the target's own "jr ra" returns directly to the
 * caller of memcpy.
 */
16SYM_FUNC_START(memcpy)
17	/*
18	 * Some CPUs support hardware unaligned access
19	 */
20	ALTERNATIVE	"b __memcpy_generic", \
21			"b __memcpy_fast", CPU_FEATURE_UAL
22SYM_FUNC_END(memcpy)
/* __memcpy is a second name for the same stub. */
23SYM_FUNC_ALIAS(__memcpy, memcpy)
24
25EXPORT_SYMBOL(memcpy)
26EXPORT_SYMBOL(__memcpy)
27
/*
 * NOTE(review): presumably keeps kprobes off both names - confirm against
 * the _ASM_NOKPROBE definition.
 */
28_ASM_NOKPROBE(memcpy)
29_ASM_NOKPROBE(__memcpy)
30
31/*
32 * void *__memcpy_generic(void *dst, const void *src, size_t n)
33 *
34 * a0: dst
35 * a1: src
36 * a2: n
37 */
/*
 * Byte-at-a-time copy. Only ld.b/st.b are used, so no memory access is wider
 * than one byte and the alignment of dst/src never matters.
 *
 * Returns the original dst in a0; it is parked in a3 while a0 walks the
 * destination buffer.
 * Registers written: a0, a1, a2, a3, t0.
 */
38SYM_FUNC_START(__memcpy_generic)
39	move	a3, a0			/* a3 = dst, kept for the return value */
40	beqz	a2, 2f			/* n == 0: skip the loop entirely */
41
	/* Copy one byte, advance both pointers, count n down. */
421:	ld.b	t0, a1, 0
43	st.b	t0, a0, 0
44	addi.d	a0, a0, 1
45	addi.d	a1, a1, 1
46	addi.d	a2, a2, -1
	/*
	 * bgt is a signed comparison: the loop repeats while the remaining
	 * count is > 0 when read as a signed 64-bit value.
	 */
47	bgt	a2, zero, 1b
48
492:	move	a0, a3			/* return value = original dst */
50	jr	ra
51SYM_FUNC_END(__memcpy_generic)
52_ASM_NOKPROBE(__memcpy_generic)
53
/*
 * __memcpy_small: copy 0..8 bytes through a computed jump.
 *
 * In this file the only branch to this symbol is in __memcpy_fast, taken when
 * n < 9, with a0 = dst, a1 = src, a2 = n still holding the caller's values.
 *
 * Layout contract (do not disturb):
 *   - The function start and every size handler ("0:" .. "8:") sit on a
 *     32-byte boundary (.align 5).
 *   - The dispatch sequence is 4 instructions, so handler 0 starts 32 bytes
 *     after the pcaddi, handler 1 at +64, and so on.
 *   - "pcaddi t0, 8" produces PC + (8 << 2) = PC + 32, i.e. handler 0, and
 *     adding n << 5 (n * 32) selects handler n.
 *   - Assumes SYM_FUNC_START_NOALIGN places nothing between the .align and
 *     the pcaddi - TODO confirm against the macro definition.
 *   - Every handler has to fit in its 32-byte slot (at most 8 instructions).
 *
 * Handlers use ld.h/ld.w/ld.d and the matching stores at whatever alignment
 * dst/src happen to have. a0 is never written, so it still holds dst on
 * return.
 * Registers written: a2, t0, t1.
 */
54	.align	5
55SYM_FUNC_START_NOALIGN(__memcpy_small)
56	pcaddi	t0, 8			/* t0 = address of handler 0 (PC + 32) */
57	slli.d	a2, a2, 5		/* a2 = n * 32 = offset of handler n */
58	add.d	t0, t0, a2
59	jr	t0			/* jump to handler n */
60
61	.align	5
620:	jr	ra			/* n == 0: nothing to copy */
63
64	.align	5
651:	ld.b	t0, a1, 0		/* n == 1: one byte */
66	st.b	t0, a0, 0
67	jr	ra
68
69	.align	5
702:	ld.h	t0, a1, 0		/* n == 2: one halfword */
71	st.h	t0, a0, 0
72	jr	ra
73
74	.align	5
753:	ld.h	t0, a1, 0		/* n == 3: halfword [0,2) + byte [2] */
76	ld.b	t1, a1, 2
77	st.h	t0, a0, 0
78	st.b	t1, a0, 2
79	jr	ra
80
81	.align	5
824:	ld.w	t0, a1, 0		/* n == 4: one word */
83	st.w	t0, a0, 0
84	jr	ra
85
86	.align	5
875:	ld.w	t0, a1, 0		/* n == 5: word [0,4) + byte [4] */
88	ld.b	t1, a1, 4
89	st.w	t0, a0, 0
90	st.b	t1, a0, 4
91	jr	ra
92
93	.align	5
946:	ld.w	t0, a1, 0		/* n == 6: word [0,4) + halfword [4,6) */
95	ld.h	t1, a1, 4
96	st.w	t0, a0, 0
97	st.h	t1, a0, 4
98	jr	ra
99
100	.align	5
	/*
	 * n == 7: two overlapping words, [0,4) and [3,7). Byte 3 is written
	 * twice with the same value; both loads are done before either store.
	 */
1017:	ld.w	t0, a1, 0
102	ld.w	t1, a1, 3
103	st.w	t0, a0, 0
104	st.w	t1, a0, 3
105	jr	ra
106
107	.align	5
1088:	ld.d	t0, a1, 0		/* n == 8: one doubleword */
109	st.d	t0, a0, 0
110	jr	ra
111SYM_FUNC_END(__memcpy_small)
112_ASM_NOKPROBE(__memcpy_small)
113
114/*
115 * void *__memcpy_fast(void *dst, const void *src, size_t n)
116 *
117 * a0: dst
118 * a1: src
119 * a2: n
120 */
/*
 * Copy routine that memcpy branches to under CPU_FEATURE_UAL. It issues
 * 8-byte loads/stores at addresses that are not necessarily 8-byte aligned.
 *
 * n < 9 is handed to __memcpy_small. For n >= 9:
 *   1. The first 8 and the last 8 source bytes are loaded into a6/a7 before
 *      anything is stored.
 *   2. Both cursors are advanced by 8 - (dst & 7) bytes (1..8), which makes
 *      the destination cursor a5 8-byte aligned; the source cursor a1 gets
 *      the same offset and may remain unaligned.
 *   3. 64-, 32-, 16- and 8-byte pieces are copied while strictly more than
 *      that many source bytes remain past a1.
 *   4. a6 is stored at dst and a7 at dst + n - 8. These two stores cover the
 *      1..8 head bytes skipped in step 2 and the at most 8 tail bytes left
 *      over by step 3; they may overlap each other or the bulk copy.
 *
 * a0 is never written, so it still holds dst on return.
 * Registers written: a1-a7, t0-t7.
 */
121SYM_FUNC_START(__memcpy_fast)
122	sltui	t0, a2, 9		/* t0 = (n < 9), unsigned compare */
123	bnez	t0, __memcpy_small
124
125	add.d	a3, a1, a2		/* a3 = src + n (source end) */
126	add.d	a2, a0, a2		/* a2 = dst + n (destination end) */
127	ld.d	a6, a1, 0		/* a6 = first 8 source bytes */
128	ld.d	a7, a3, -8		/* a7 = last 8 source bytes */
129
130	/* align up destination address */
131	andi	t1, a0, 7		/* t1 = dst & 7 */
132	sub.d	t0, zero, t1
133	addi.d	t0, t0, 8		/* t0 = 8 - (dst & 7), in 1..8 */
134	add.d	a1, a1, t0		/* a1 = source cursor */
135	add.d	a5, a0, t0		/* a5 = destination cursor, 8-byte aligned */
136
137	addi.d	a4, a3, -64		/* a4 = src end - 64 */
138	bgeu	a1, a4, .Llt64		/* 64 or fewer bytes remain: skip the loop */
139
140	/* copy 64 bytes at a time */
141.Lloop64:
142	ld.d	t0, a1, 0
143	ld.d	t1, a1, 8
144	ld.d	t2, a1, 16
145	ld.d	t3, a1, 24
146	ld.d	t4, a1, 32
147	ld.d	t5, a1, 40
148	ld.d	t6, a1, 48
149	ld.d	t7, a1, 56
150	addi.d	a1, a1, 64
151	st.d	t0, a5, 0
152	st.d	t1, a5, 8
153	st.d	t2, a5, 16
154	st.d	t3, a5, 24
155	st.d	t4, a5, 32
156	st.d	t5, a5, 40
157	st.d	t6, a5, 48
158	st.d	t7, a5, 56
159	addi.d	a5, a5, 64
160	bltu	a1, a4, .Lloop64	/* repeat while more than 64 bytes remain */
161
162	/* copy the remaining bytes */
163.Llt64:
	/* At most 64 bytes remain; copy 32 if more than 32 remain. */
164	addi.d	a4, a3, -32
165	bgeu	a1, a4, .Llt32
166	ld.d	t0, a1, 0
167	ld.d	t1, a1, 8
168	ld.d	t2, a1, 16
169	ld.d	t3, a1, 24
170	addi.d	a1, a1, 32
171	st.d	t0, a5, 0
172	st.d	t1, a5, 8
173	st.d	t2, a5, 16
174	st.d	t3, a5, 24
175	addi.d	a5, a5, 32
176
177.Llt32:
	/* At most 32 bytes remain; copy 16 if more than 16 remain. */
178	addi.d	a4, a3, -16
179	bgeu	a1, a4, .Llt16
180	ld.d	t0, a1, 0
181	ld.d	t1, a1, 8
182	addi.d	a1, a1, 16
183	st.d	t0, a5, 0
184	st.d	t1, a5, 8
185	addi.d	a5, a5, 16
186
187.Llt16:
	/*
	 * At most 16 bytes remain; copy 8 if more than 8 remain. The cursors
	 * are not advanced afterwards because nothing reads them again.
	 */
188	addi.d	a4, a3, -8
189	bgeu	a1, a4, .Llt8
190	ld.d	t0, a1, 0
191	st.d	t0, a5, 0
192
193.Llt8:
	/* Head and tail: the 8-byte values captured before any store. */
194	st.d	a6, a0, 0		/* dst[0 .. 8) */
195	st.d	a7, a2, -8		/* dst[n-8 .. n) */
196
197	/* return */
198	jr	ra
199SYM_FUNC_END(__memcpy_fast)
200_ASM_NOKPROBE(__memcpy_fast)
201
/*
 * NOTE(review): annotates __memcpy_small as having a non-standard stack
 * frame - presumably because its computed "jr t0" dispatch cannot be
 * followed by static stack validation. Confirm against the
 * STACK_FRAME_NON_STANDARD definition.
 */
202STACK_FRAME_NON_STANDARD __memcpy_small
203