xref: /freebsd/lib/libc/riscv/string/memcpy.S (revision 25fdd86a4c92b5bdab82db289f3bcd57756778e7)
1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org>
5 */
6
7#include <machine/asm.h>
8
9/*
10 * a0 - void* dst
11 * a1 - const void* src
12 * a2 - size_t len
13 */
14ENTRY(memcpy)
	/*
	 * void *memcpy(void *dst, const void *src, size_t len)
	 *
	 * Strategy: byte-copy a short head to 8-byte-align dst, then either
	 * copy whole dwords (when src and dst share alignment phase) or
	 * merge misaligned source dwords with shifts, and finish the
	 * 0..7-byte tail by jumping into a run of lb/sb pairs (Duff-style).
	 */
15	beqz a2, .Lreturn
16
17	/* diff = (dstv - srcv) & 0b111 */
	/* diff == 0 means src/dst share 8-byte alignment phase (.Lmemcpy8) */
18	sub t0, a0, a1
19	andi t0, t0, 0b111
20
	/* t1 = (len < 8): tiny copies go straight to the byte-copy tail */
21	sltiu t1, a2, 8
22
23	/* we never change a0, because memcpy returns the original dst */
24	mv a3, a0
25
26	/* len < 8 */
27	bnez t1, .Lend
28
	/* head bytes needed to bring dst up to 8-byte alignment */
29	/* t1 = (-dst) & 0b111 */
30	neg t1, a0
31	andi t1, t1, 0b111
32
33	sub a2, a2, t1
34
	/*
	 * Jump t1 lb/sb pairs backwards from .Lduff_start so exactly the
	 * last t1 pairs execute, copying bytes t1-1 .. 0.  Each pair is
	 * assumed to assemble to 8 bytes (two 4-byte instructions) —
	 * NOTE(review): breaks if the assembler ever emits compressed
	 * (RVC) encodings for these lb/sb; confirm build flags.
	 */
35	la t2, .Lduff_start
36	slli t3, t1, 3
37	sub t2, t2, t3
38	jr t2
39	lb t3, 6(a1)
40	sb t3, 6(a3)
41	lb t3, 5(a1)
42	sb t3, 5(a3)
43	lb t3, 4(a1)
44	sb t3, 4(a3)
45	lb t3, 3(a1)
46	sb t3, 3(a3)
47	lb t3, 2(a1)
48	sb t3, 2(a3)
49	lb t3, 1(a1)
50	sb t3, 1(a3)
51	lb t3, 0(a1)
52	sb t3, 0(a3)
53.Lduff_start:
54
	/* advance both pointers past the head; dst (a3) is now 8-aligned */
55	add a1, a1, t1
56	add a3, a3, t1
57
58	beqz a2, .Lreturn
59
	/* same alignment phase: take the aligned dword path */
60	beqz t0, .Lmemcpy8
61
62	/*
63	 * a4 - size_t right_shift
64	 * a5 - size_t left_shift
65	 * a6 - size_t whole (number of dword stores)
66	 */
67
68	/* right_shift = (src & 0b111) * 8; */
69	andi a4, a1, 0b111
70	slli a4, a4, 3
71
72	/* left_shift = 64 - right_shift */
	/*
	 * sll/srl use only the low 6 bits of the shift amount, so the
	 * negation yields (64 - right_shift) mod 64; right_shift != 0 here
	 * because src is misaligned whenever we reach this path.
	 */
73	neg a5, a4
74
75	/* whole = len / 8 */
76	srli a6, a2, 3
77
78	/* len = len % 8 */
79	andi a2, a2, 0b111
80
81	/* t0 - uint64_t* ptr */
82
83	/* ptr = src & ~0b111 */
	/* rounding down keeps the aligned load inside src's page */
84	andi t0, a1, ~0b111
85
86	/* src += whole * 8 */
	/* src now addresses the tail bytes consumed by .Lend below */
87	slli t1, a6, 3
88	add a1, a1, t1
89
90	/*
91	 * t1 - uint64_t low
92	 * t2 - uint64_t high
93	 */
94
95	/* low = *ptr++ */
96	ld t1, (t0)
97	addi t0, t0, 8
98
99	/* low >>= right_shift */
100	srl t1, t1, a4
101
102	beqz a6, .Llmain_skip
	/*
	 * Misaligned main loop: each aligned source dword contributes to
	 * two destination stores; "low" carries the leftover bits between
	 * iterations (little-endian byte order).
	 */
103.Llmain:
104	/* high = *ptr++ */
105	ld t2, (t0)
106	addi t0, t0, 8
107
108	/* whole-- */
109	addi a6, a6, -1
110
111	/* temp = (high << left_shift) | low */
112	sll t3, t2, a5
113	or t3, t3, t1
114
115	/* low = high >> right_shift */
116	srl t1, t2, a4
117
118	/* *dst++ = temp */
119	sd t3, (a3)
120	addi a3, a3, 8
121
122	bnez a6, .Llmain
123
124.Llmain_skip:
125
	/*
	 * Byte-wise tail: jump a2 (0..7) lb/sb pairs backwards from
	 * .Lduff_end so the last a2 pairs run, copying bytes a2-1 .. 0.
	 * Same 8-bytes-per-pair encoding assumption as the head copy.
	 */
126.Lend:
127	la t1, .Lduff_end
128	slli t2, a2, 3
129	sub t1, t1, t2
130	jr t1
131	lb t2, 6(a1)
132	sb t2, 6(a3)
133	lb t2, 5(a1)
134	sb t2, 5(a3)
135	lb t2, 4(a1)
136	sb t2, 4(a3)
137	lb t2, 3(a1)
138	sb t2, 3(a3)
139	lb t2, 2(a1)
140	sb t2, 2(a3)
141	lb t2, 1(a1)
142	sb t2, 1(a3)
143	lb t2, 0(a1)
144	sb t2, 0(a3)
145.Lduff_end:
146
147.Lreturn:
148	ret
149
150/* executed when dst - src is multiple of 8
151 * a0 - void* dst
152 * a1 - const void* src
153 * a2 - size_t len
154 */
155.Lmemcpy8:
156
157	beqz a2, .Lreturn
158
	/*
	 * Take the 64-byte unrolled loop only for len >= 128.
	 * NOTE(review): slti is a *signed* compare, so len >= 2^63 would
	 * skip the unrolled loop; the copy remains correct via .Llmain8.
	 */
159	slti t0, a2, 128
160	bnez t0, .Llmain8_64_skip
161
162	/* a4 - uint64_t* end_unroll */
163
164	/* end_unroll = dst + len / 64 * 64 */
165	andi t0, a2, ~0b111111
166	add a4, a3, t0
167
168	/* len = len % 64 */
169	andi a2, a2, 0b111111
170
	/* 64 bytes per iteration, grouped as 4 loads then 4 stores */
171.Llmain8_64:
172	ld t0, 0(a1)
173	ld t1, 8(a1)
174	ld t2, 16(a1)
175	ld t3, 24(a1)
176	sd t0, 0(a3)
177	sd t1, 8(a3)
178	sd t2, 16(a3)
179	sd t3, 24(a3)
180	ld t0, 32(a1)
181	ld t1, 40(a1)
182	ld t2, 48(a1)
183	ld t3, 56(a1)
184	sd t0, 32(a3)
185	sd t1, 40(a3)
186	sd t2, 48(a3)
187	sd t3, 56(a3)
188	addi a3, a3, 64
189	addi a1, a1, 64
190	bne a3, a4, .Llmain8_64
191.Llmain8_64_skip:
192
193	beqz a2, .Lreturn
194
195	/* a4 - uint64_t* end_align */
196
197	/* end_align = (dst + len) & ~0b111 */
198	add a4, a3, a2
199	andi a4, a4, ~0b111
200
201	/* len = len % 8 */
202	andi a2, a2, 0b111
203
	/* copy any remaining whole dwords one at a time */
204	beq a3, a4, .Llmain8_skip
205.Llmain8:
206	ld t0, (a1)
207	sd t0, (a3)
208	addi a3, a3, 8
209	addi a1, a1, 8
210	bne a3, a4, .Llmain8
211.Llmain8_skip:
212
	/* final 0..7 bytes via the shared byte-copy jump table above */
213	la t1, .Lduff_end
214	slli t2, a2, 3
215	sub t1, t1, t2
216	jr t1
217END(memcpy)
218