xref: /linux/arch/loongarch/lib/memcpy.S (revision eb01fe7abbe2d0b38824d2a93fdb4cc3eaf2ccc1)
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
4 */
5
6#include <linux/export.h>
7#include <asm/alternative-asm.h>
8#include <asm/asm.h>
9#include <asm/asmmacro.h>
10#include <asm/cpu.h>
11#include <asm/regdef.h>
12
13.section .noinstr.text, "ax"	/* section flags "ax": allocatable, executable */
14
/*
 * void *memcpy(void *dst, const void *src, size_t n)
 *
 * Entry stub only: it consists of a single branch to one of the two
 * implementations defined later in this file, so a0/a1/a2 reach the chosen
 * implementation unchanged.
 */
15SYM_FUNC_START(memcpy)
16	/*
17	 * Some CPUs support hardware unaligned access
	 *
	 * NOTE(review): presumably ALTERNATIVE (see <asm/alternative-asm.h>)
	 * keeps the first instruction, "b __memcpy_generic", by default and
	 * substitutes the second, "b __memcpy_fast", on CPUs that report
	 * CPU_FEATURE_UAL -- confirm against the macro definition.
18	 */
19	ALTERNATIVE	"b __memcpy_generic", \
20			"b __memcpy_fast", CPU_FEATURE_UAL
21SYM_FUNC_END(memcpy)
22SYM_FUNC_ALIAS(__memcpy, memcpy)	/* __memcpy is a second name for the same stub */
23
/* Both names are passed to EXPORT_SYMBOL. */
24EXPORT_SYMBOL(memcpy)
25EXPORT_SYMBOL(__memcpy)
26
/*
 * Both names are passed to _ASM_NOKPROBE (presumably excluding them from
 * kprobes; see the macro definition).
 */
27_ASM_NOKPROBE(memcpy)
28_ASM_NOKPROBE(__memcpy)
29
30/*
31 * void *__memcpy_generic(void *dst, const void *src, size_t n)
32 *
 * Byte-at-a-time copy. Only ld.b/st.b are used, so no access wider than
 * one byte is issued whatever the alignment of dst and src.
 *
33 * a0: dst
34 * a1: src
35 * a2: n
 *
 * Returns the original dst in a0. Modifies a1, a2, a3 and t0.
36 */
37SYM_FUNC_START(__memcpy_generic)
38	move	a3, a0			/* a0 is advanced below; keep dst for the return value */
39	beqz	a2, 2f			/* n == 0: nothing to copy */
40
411:	ld.b	t0, a1, 0		/* *dst = *src */
42	st.b	t0, a0, 0
43	addi.d	a0, a0, 1		/* dst++ */
44	addi.d	a1, a1, 1		/* src++ */
45	addi.d	a2, a2, -1		/* one byte fewer to go */
	/*
	 * n is a size_t: loop until the remaining count reaches zero. A
	 * signed "greater than zero" test would misread a count with bit 63
	 * set as negative and leave the loop early. The beqz guard above
	 * ensures the count is non-zero on entry, so it always reaches zero.
	 */
46	bnez	a2, 1b
47
482:	move	a0, a3			/* return the original dst */
49	jr	ra
50SYM_FUNC_END(__memcpy_generic)
51_ASM_NOKPROBE(__memcpy_generic)
52
/*
 * __memcpy_small: copy 0..8 bytes through a computed jump.
 *
 * a0: dst
 * a1: src
 * a2: n, used unchecked as a table index; the only caller visible in this
 *     file, __memcpy_fast, branches here only when n < 9
 *
 * Layout: the dispatch sequence and each of the nine cases (labels 0..8)
 * start on a ".align 5" boundary. The dispatch multiplies n by 32 (slli.d
 * by 5), so the code relies on consecutive boundaries being exactly 32
 * bytes apart, i.e. on every case fitting in one 32-byte slot.
 *
 * The cases use halfword/word/doubleword accesses at whatever alignment
 * dst and src happen to have.
 *
 * a0 is never written, so it still holds dst on return.
 * Modifies a2, t0 and t1.
 *
 * NOTE(review): SYM_FUNC_START_NOALIGN presumably adds no alignment or
 * padding of its own, leaving the explicit ".align 5" in charge -- confirm.
 */
53	.align	5
54SYM_FUNC_START_NOALIGN(__memcpy_small)
55	pcaddi	t0, 8			/* t0 = address of this insn + (8 << 2), i.e. slot 0 */
56	slli.d	a2, a2, 5		/* a2 = n * 32 = byte offset of slot n */
57	add.d	t0, t0, a2		/* t0 = address of slot n */
58	jr	t0
59
60	.align	5
610:	jr	ra			/* n == 0: nothing to copy */
62
63	.align	5
641:	ld.b	t0, a1, 0		/* n == 1: one byte */
65	st.b	t0, a0, 0
66	jr	ra
67
68	.align	5
692:	ld.h	t0, a1, 0		/* n == 2: one halfword */
70	st.h	t0, a0, 0
71	jr	ra
72
73	.align	5
743:	ld.h	t0, a1, 0		/* n == 3: halfword + byte at offset 2 */
75	ld.b	t1, a1, 2
76	st.h	t0, a0, 0
77	st.b	t1, a0, 2
78	jr	ra
79
80	.align	5
814:	ld.w	t0, a1, 0		/* n == 4: one word */
82	st.w	t0, a0, 0
83	jr	ra
84
85	.align	5
865:	ld.w	t0, a1, 0		/* n == 5: word + byte at offset 4 */
87	ld.b	t1, a1, 4
88	st.w	t0, a0, 0
89	st.b	t1, a0, 4
90	jr	ra
91
92	.align	5
936:	ld.w	t0, a1, 0		/* n == 6: word + halfword at offset 4 */
94	ld.h	t1, a1, 4
95	st.w	t0, a0, 0
96	st.h	t1, a0, 4
97	jr	ra
98
99	.align	5
1007:	ld.w	t0, a1, 0		/* n == 7: two words, bytes 0-3 and 3-6 (byte 3 copied twice) */
101	ld.w	t1, a1, 3
102	st.w	t0, a0, 0
103	st.w	t1, a0, 3
104	jr	ra
105
106	.align	5
1078:	ld.d	t0, a1, 0		/* n == 8: one doubleword */
108	st.d	t0, a0, 0
109	jr	ra
110SYM_FUNC_END(__memcpy_small)
111_ASM_NOKPROBE(__memcpy_small)
112
113/*
114 * void *__memcpy_fast(void *dst, const void *src, size_t n)
115 *
116 * a0: dst
117 * a1: src
118 * a2: n
 *
 * Strategy:
 *   - n < 9 is handed to __memcpy_small.
 *   - Otherwise the first and last 8 bytes of src are loaded into a6/a7
 *     before anything is stored.
 *   - Both cursors are advanced by 8 - (dst & 7), i.e. 1..8 bytes, so the
 *     destination cursor is 8-byte aligned; the source cursor may not be.
 *   - The middle is copied in 64-byte blocks, then at most one block each
 *     of 32, 16 and 8 bytes, leaving at most 8 bytes uncopied.
 *   - Finally a6 is stored at dst and a7 at dst + n - 8. These two stores
 *     cover the bytes skipped by the alignment step and the leftover tail;
 *     any overlap with the block copies rewrites identical data.
 *
 * Register use after the small-size check:
 *   a3 = src + n (end of source)     a2 = dst + n (end of destination)
 *   a1 = source cursor               a5 = destination cursor
 *   a4 = source address from which the current block size no longer fits
 *   t0-t7 = data in flight
 *
 * a0 is never written, so it still holds dst on return.
119 */
120SYM_FUNC_START(__memcpy_fast)
121	sltui	t0, a2, 9		/* t0 = (n < 9), unsigned compare */
122	bnez	t0, __memcpy_small	/* 0..8 bytes: jump-table path */
123
124	add.d	a3, a1, a2		/* a3 = src + n */
125	add.d	a2, a0, a2		/* a2 = dst + n (n itself is no longer needed) */
126	ld.d	a6, a1, 0		/* a6 = first 8 source bytes */
127	ld.d	a7, a3, -8		/* a7 = last 8 source bytes */
128
129	/* align up destination address */
130	andi	t1, a0, 7		/* t1 = dst & 7 */
131	sub.d	t0, zero, t1
132	addi.d	t0, t0, 8		/* t0 = 8 - (dst & 7): 8 when dst is already aligned */
133	add.d	a1, a1, t0		/* skip the same number of bytes in src ... */
134	add.d	a5, a0, t0		/* ... and dst; a5 is now 8-byte aligned */
135
136	addi.d	a4, a3, -64
137	bgeu	a1, a4, .Llt64		/* 64 bytes or fewer remain: skip the loop */
138
139	/* copy 64 bytes at a time */
140.Lloop64:
141	ld.d	t0, a1, 0
142	ld.d	t1, a1, 8
143	ld.d	t2, a1, 16
144	ld.d	t3, a1, 24
145	ld.d	t4, a1, 32
146	ld.d	t5, a1, 40
147	ld.d	t6, a1, 48
148	ld.d	t7, a1, 56
149	addi.d	a1, a1, 64
150	st.d	t0, a5, 0
151	st.d	t1, a5, 8
152	st.d	t2, a5, 16
153	st.d	t3, a5, 24
154	st.d	t4, a5, 32
155	st.d	t5, a5, 40
156	st.d	t6, a5, 48
157	st.d	t7, a5, 56
158	addi.d	a5, a5, 64
159	bltu	a1, a4, .Lloop64	/* repeat while more than 64 bytes remain */
160
161	/* copy the remaining bytes */
162.Llt64:
163	addi.d	a4, a3, -32
164	bgeu	a1, a4, .Llt32		/* 32 bytes or fewer remain: skip */
165	ld.d	t0, a1, 0
166	ld.d	t1, a1, 8
167	ld.d	t2, a1, 16
168	ld.d	t3, a1, 24
169	addi.d	a1, a1, 32
170	st.d	t0, a5, 0
171	st.d	t1, a5, 8
172	st.d	t2, a5, 16
173	st.d	t3, a5, 24
174	addi.d	a5, a5, 32
175
176.Llt32:
177	addi.d	a4, a3, -16
178	bgeu	a1, a4, .Llt16		/* 16 bytes or fewer remain: skip */
179	ld.d	t0, a1, 0
180	ld.d	t1, a1, 8
181	addi.d	a1, a1, 16
182	st.d	t0, a5, 0
183	st.d	t1, a5, 8
184	addi.d	a5, a5, 16
185
186.Llt16:
187	addi.d	a4, a3, -8
188	bgeu	a1, a4, .Llt8		/* 8 bytes or fewer remain: the tail store covers them */
189	ld.d	t0, a1, 0		/* cursors are not advanced: nothing reads them afterwards */
190	st.d	t0, a5, 0
191
192.Llt8:
193	st.d	a6, a0, 0		/* head: also fills the bytes skipped by the alignment step */
194	st.d	a7, a2, -8		/* tail: the final 8 bytes, ending exactly at dst + n */
195
196	/* return */
197	jr	ra
198SYM_FUNC_END(__memcpy_fast)
199_ASM_NOKPROBE(__memcpy_fast)
200