xref: /freebsd/lib/libc/riscv/string/memcpy.S (revision 25fdd86a4c92b5bdab82db289f3bcd57756778e7)
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org>
 */
6*25fdd86aSStrahinja Stanišić
7*25fdd86aSStrahinja Stanišić#include <machine/asm.h>
8*25fdd86aSStrahinja Stanišić
/*
 * void *memcpy(void *dst, const void *src, size_t len)
 *
 * a0 - void* dst
 * a1 - const void* src
 * a2 - size_t len
 *
 * Returns the original dst in a0 (a0 is never written).
 * Scratch registers used: a1-a6, t0-t3.
 *
 * Strategy:
 *  - len == 0: return immediately.
 *  - len < 8: copy the bytes through a computed jump into an unrolled
 *    byte-copy table (.Lduff_end).
 *  - len >= 8: copy 0-7 head bytes (table at .Lduff_start) so that the
 *    destination cursor is 8-byte aligned, then
 *      - if src is now 8-byte aligned too, copy dwords (.Lmemcpy8);
 *      - otherwise load aligned dwords from src and stitch neighbouring
 *        dwords together with shifts before storing them;
 *    and finally copy the 0-7 tail bytes through the .Lduff_end table.
 *
 * The shift-based path treats the byte at the lowest address as the least
 * significant one, i.e. it relies on little-endian byte order.  It only
 * issues aligned dword loads that contain at least one source byte, so it
 * may read (but never uses) bytes next to the source buffer that live in
 * the same aligned dword.
 *
 * Both byte-copy tables are entered with "jr (table_end - 8 * count)".
 * This requires every lb/sb pair to occupy exactly 8 bytes, which is why
 * the tables are assembled with compressed instructions disabled.
 */
ENTRY(memcpy)
	beqz a2, .Lreturn

	/* diff = (dstv - srcv) & 0b111 */
	sub t0, a0, a1
	andi t0, t0, 0b111

	sltiu t1, a2, 8

	/* we never change a0, because memcpy returns the original dst */
	mv a3, a0

	/* len < 8 */
	bnez t1, .Lend

	/* t1 = (-dst) & 0b111, number of head bytes needed to align dst */
	neg t1, a0
	andi t1, t1, 0b111

	sub a2, a2, t1

	/*
	 * Jump 8 * t1 bytes before .Lduff_start, so that exactly t1 lb/sb
	 * pairs run.  lla is used because the label is local: it is always
	 * resolved PC-relative, even when this file is assembled as PIC.
	 */
	lla t2, .Lduff_start
	slli t3, t1, 3
	sub t2, t2, t3
	jr t2
	.option push
	.option norvc
	lb t3, 6(a1)
	sb t3, 6(a3)
	lb t3, 5(a1)
	sb t3, 5(a3)
	lb t3, 4(a1)
	sb t3, 4(a3)
	lb t3, 3(a1)
	sb t3, 3(a3)
	lb t3, 2(a1)
	sb t3, 2(a3)
	lb t3, 1(a1)
	sb t3, 1(a3)
	lb t3, 0(a1)
	sb t3, 0(a3)
	.option pop
.Lduff_start:

	add a1, a1, t1
	add a3, a3, t1

	/*
	 * len was >= 8 and at most 7 head bytes were copied, so a2 >= 1
	 * here; no zero-length check is needed.
	 */

	/* diff == 0: src is 8-byte aligned as well */
	beqz t0, .Lmemcpy8

	/*
	 * a4 - size_t right_shift
	 * a5 - size_t left_shift
	 * a6 - size_t whole (number of dword stores)
	 */

	/* right_shift = (src & 0b111) * 8; */
	andi a4, a1, 0b111
	slli a4, a4, 3

	/*
	 * left_shift = 64 - right_shift
	 * (negation is enough: sll only uses the low 6 bits of the amount)
	 */
	neg a5, a4

	/* whole = len / 8 */
	srli a6, a2, 3

	/* len = len % 8 */
	andi a2, a2, 0b111

	/* t0 - uint64_t* ptr */

	/* ptr = src & ~0b111 */
	andi t0, a1, ~0b111

	/* src += whole * 8, a1 now points at the tail bytes */
	slli t1, a6, 3
	add a1, a1, t1

	/*
	 * t1 - uint64_t low
	 * t2 - uint64_t high
	 */

	/* low = *ptr++ */
	ld t1, (t0)
	addi t0, t0, 8

	/* low >>= right_shift */
	srl t1, t1, a4

	beqz a6, .Llmain_skip
.Llmain:
	/* high = *ptr++ */
	ld t2, (t0)
	addi t0, t0, 8

	/* whole-- */
	addi a6, a6, -1

	/* temp = (high << left_shift) | low */
	sll t3, t2, a5
	or t3, t3, t1

	/* low = high >> right_shift */
	srl t1, t2, a4

	/* *dst++ = temp */
	sd t3, (a3)
	addi a3, a3, 8

	bnez a6, .Llmain

.Llmain_skip:

	/*
	 * Tail: copy the remaining a2 (0-7) bytes from a1 to a3 by jumping
	 * 8 * a2 bytes before .Lduff_end.
	 */
.Lend:
	lla t1, .Lduff_end
	slli t2, a2, 3
	sub t1, t1, t2
	jr t1
	.option push
	.option norvc
	lb t2, 6(a1)
	sb t2, 6(a3)
	lb t2, 5(a1)
	sb t2, 5(a3)
	lb t2, 4(a1)
	sb t2, 4(a3)
	lb t2, 3(a1)
	sb t2, 3(a3)
	lb t2, 2(a1)
	sb t2, 2(a3)
	lb t2, 1(a1)
	sb t2, 1(a3)
	lb t2, 0(a1)
	sb t2, 0(a3)
	.option pop
.Lduff_end:

.Lreturn:
	ret

/*
 * executed when dst - src is a multiple of 8, after dst has been aligned
 * a0 - void* dst (original, untouched)
 * a1 - const void* src (current position)
 * a2 - size_t len (remaining, at least 1)
 * a3 - void* dst (current position)
 */
.Lmemcpy8:

	/*
	 * The 64-byte unrolled loop is only used when at least 128 bytes
	 * remain.  len is unsigned, hence sltiu.
	 */
	sltiu t0, a2, 128
	bnez t0, .Llmain8_64_skip

	/* a4 - uint64_t* end_unroll */

	/* end_unroll = dst + len / 64 * 64 */
	andi t0, a2, ~0b111111
	add a4, a3, t0

	/* len = len % 64 */
	andi a2, a2, 0b111111

.Llmain8_64:
	ld t0, 0(a1)
	ld t1, 8(a1)
	ld t2, 16(a1)
	ld t3, 24(a1)
	sd t0, 0(a3)
	sd t1, 8(a3)
	sd t2, 16(a3)
	sd t3, 24(a3)
	ld t0, 32(a1)
	ld t1, 40(a1)
	ld t2, 48(a1)
	ld t3, 56(a1)
	sd t0, 32(a3)
	sd t1, 40(a3)
	sd t2, 48(a3)
	sd t3, 56(a3)
	addi a3, a3, 64
	addi a1, a1, 64
	bne a3, a4, .Llmain8_64
.Llmain8_64_skip:

	/* nothing left when len was a multiple of 64 */
	beqz a2, .Lreturn

	/* a4 - uint64_t* end_align */

	/* end_align = (dst + len) & ~0b111 */
	add a4, a3, a2
	andi a4, a4, ~0b111

	/* len = len % 8 */
	andi a2, a2, 0b111

	beq a3, a4, .Llmain8_skip
.Llmain8:
	ld t0, (a1)
	sd t0, (a3)
	addi a3, a3, 8
	addi a1, a1, 8
	bne a3, a4, .Llmain8
.Llmain8_skip:

	/* copy the 0-7 tail bytes through the table ending at .Lduff_end */
	lla t1, .Lduff_end
	slli t2, a2, 3
	sub t1, t1, t2
	jr t1
END(memcpy)
218