xref: /freebsd/contrib/llvm-project/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
2// See https://llvm.org/LICENSE.txt for license information.
3// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4
5// Routines taken from libc/AOR_v20.02/string/aarch64
6
7#include "../assembly.h"
8
9#ifdef __aarch64__
10
// L(l) pastes its argument onto ".L", so L(copy16) becomes the label .Lcopy16.
// Every branch target inside the routines below is written through this macro.
11#define L(l) .L ## l
12
13//
14//  __arm_sc_memcpy / __arm_sc_memmove
15//
16
// Register aliases used by __arm_sc_memcpy / __arm_sc_memmove.
// dstin, src and count are the three incoming values (x0, x1, x2); the rest
// are scratch.  A_l..F_h are eight 64-bit register pairs used as 16-byte
// copy buffers.  Several aliases deliberately share a register; see the
// notes on G_*, H_* and tmp1 below.
17#define dstin    x0   /* Destination start; never written by the routine.  */
18#define src      x1   /* Source start.  */
19#define count    x2   /* Number of bytes to copy.  */
20#define dst      x3   /* Working destination pointer for the long-copy loops.  */
21#define srcend1  x4   /* src + count.  */
22#define dstend1  x5   /* dstin + count.  */
23#define A_l      x6
24#define A_lw     w6   /* 32-bit view of A_l.  */
25#define A_h      x7
26#define B_l      x8
27#define B_lw     w8   /* 32-bit view of B_l.  */
28#define B_h      x9
29#define C_l      x10
30#define C_lw     w10  /* 32-bit view of C_l.  */
31#define C_h      x11
32#define D_l      x12
33#define D_h      x13
34#define E_l      x14
35#define E_h      x15
36#define F_l      x16
37#define F_h      x17
/* G_* and H_* are not new registers: they reuse count, dst, src and srcend1
   as data buffers at points where those values are no longer needed.  */
38#define G_l      count
39#define G_h      dst
40#define H_l      src
41#define H_h      srcend1
42#define tmp1     x14  /* Same register as E_l; the two are never live together.  */
43
44/* This implementation handles overlaps and supports both memcpy and memmove
45   from a single entry point.  It uses unaligned accesses and branchless
46   sequences to keep the code small and simple and to improve performance.
47
48   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
49   copies of up to 128 bytes, and large copies.  The overhead of the overlap
50   check is negligible since it is only required for large copies.
51
52   Large copies use a software pipelined loop processing 64 bytes per iteration.
53   The destination pointer is 16-byte aligned to minimize unaligned accesses.
54   The loop tail is handled by always copying 64 bytes from the end.
55*/
56
/* __arm_sc_memcpy: copy count (x2) bytes from src (x1) to dstin (x0).

   x0 is never written, so it still holds dstin at every ret.  The routine
   touches only x1-x17 and the condition flags; it uses no vector registers
   and makes no stack accesses.

   Overlapping buffers are supported:
     - For count <= 128 every load is issued before the first store, so the
       result does not depend on how the buffers overlap.
     - For count > 128 the copy runs backwards when dstin lies inside
       [src, src + count), and forwards otherwise.

   All size comparisons are unsigned (b.hi / b.lo / b.ls).  */
57DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memcpy)
58        add     srcend1, src, count     /* One past the last source byte.  */
59        add     dstend1, dstin, count   /* One past the last destination byte.  */
60        cmp     count, 128
61        b.hi    L(copy_long)            /* count > 128.  */
62        cmp     count, 32
63        b.hi    L(copy32_128)           /* 33 <= count <= 128.  */
64
65        /* Small copies: 0..32 bytes.  */
66        cmp     count, 16
67        b.lo    L(copy16)
        /* 16..32 bytes: copy 16 bytes from the start and 16 from the end.
           The two halves overlap in the middle when count < 32.  */
68        ldp     A_l, A_h, [src]
69        ldp     D_l, D_h, [srcend1, -16]
70        stp     A_l, A_h, [dstin]
71        stp     D_l, D_h, [dstend1, -16]
72        ret
73
74        /* Copy 8-15 bytes.  */
75L(copy16):
        /* Reached with count < 16.  Bit 3 of count set means 8..15 bytes:
           copy 8 bytes from each end.  */
76        tbz     count, 3, L(copy8)
77        ldr     A_l, [src]
78        ldr     A_h, [srcend1, -8]
79        str     A_l, [dstin]
80        str     A_h, [dstend1, -8]
81        ret
82
83        .p2align 3
84        /* Copy 4-7 bytes.  */
85L(copy8):
        /* Reached with count < 8.  Bit 2 set means 4..7 bytes: copy 4 bytes
           from each end.  */
86        tbz     count, 2, L(copy4)
87        ldr     A_lw, [src]
88        ldr     B_lw, [srcend1, -4]
89        str     A_lw, [dstin]
90        str     B_lw, [dstend1, -4]
91        ret
92
93        /* Copy 0..3 bytes using a branchless sequence.  */
94L(copy4):
95        cbz     count, L(copy0)         /* count == 0: nothing to copy.  */
        /* count is 1, 2 or 3.  Copy the first byte, the byte at index
           count / 2 and the last byte; these three cover every byte for
           each of the three sizes without any further branch.  */
96        lsr     tmp1, count, 1          /* tmp1 = count / 2 (0 or 1).  */
97        ldrb    A_lw, [src]
98        ldrb    C_lw, [srcend1, -1]
99        ldrb    B_lw, [src, tmp1]
100        strb    A_lw, [dstin]
101        strb    B_lw, [dstin, tmp1]
102        strb    C_lw, [dstend1, -1]
103L(copy0):
104        ret
105
106        .p2align 4
107        /* Medium copies: 33..128 bytes.  */
108L(copy32_128):
        /* Load the first 32 and the last 32 bytes before deciding the size
           class; both sub-cases need them.  */
109        ldp     A_l, A_h, [src]
110        ldp     B_l, B_h, [src, 16]
111        ldp     C_l, C_h, [srcend1, -32]
112        ldp     D_l, D_h, [srcend1, -16]
113        cmp     count, 64
114        b.hi    L(copy128)
        /* 33..64 bytes: first 32 plus last 32 cover the whole range.  */
115        stp     A_l, A_h, [dstin]
116        stp     B_l, B_h, [dstin, 16]
117        stp     C_l, C_h, [dstend1, -32]
118        stp     D_l, D_h, [dstend1, -16]
119        ret
120
121        .p2align 4
122        /* Copy 65..128 bytes.  */
123L(copy128):
124        ldp     E_l, E_h, [src, 32]     /* Source bytes 32..47.  */
125        ldp     F_l, F_h, [src, 48]     /* Source bytes 48..63.  */
126        cmp     count, 96
127        b.ls    L(copy96)               /* 65..96: first 64 + last 32 suffice.  */
        /* 97..128 bytes: also copy the 32 bytes at [end - 64, end - 32).
           G_* and H_* alias count, dst, src and srcend1, so those values
           are destroyed here; only dstin and dstend1 are used afterwards.
           The second ldp overwrites its own base register (H_h is srcend1),
           which is why it must be the last access through srcend1.  */
128        ldp     G_l, G_h, [srcend1, -64]
129        ldp     H_l, H_h, [srcend1, -48]
130        stp     G_l, G_h, [dstend1, -64]
131        stp     H_l, H_h, [dstend1, -48]
132L(copy96):
        /* Store the first 64 bytes and the last 32 bytes.  */
133        stp     A_l, A_h, [dstin]
134        stp     B_l, B_h, [dstin, 16]
135        stp     E_l, E_h, [dstin, 32]
136        stp     F_l, F_h, [dstin, 48]
137        stp     C_l, C_h, [dstend1, -32]
138        stp     D_l, D_h, [dstend1, -16]
139        ret
140
141        .p2align 4
142        /* Copy more than 128 bytes.  */
143L(copy_long):
144        /* Use backwards copy if there is an overlap.  */
145        sub     tmp1, dstin, src
146        cbz     tmp1, L(copy0)          /* dstin == src: return without copying.  */
147        cmp     tmp1, count
148        b.lo    L(copy_long_backwards)  /* Unsigned (dstin - src) < count, i.e.
                                           dstin lies inside [src, srcend1).  */
149
150        /* Copy 16 bytes and then align dst to 16-byte alignment.  */
151
152        ldp     D_l, D_h, [src]         /* First 16 source bytes.  */
153        and     tmp1, dstin, 15         /* Offset of dstin within its 16-byte block.  */
154        bic     dst, dstin, 15          /* dst = dstin rounded down to 16.  */
155        sub     src, src, tmp1          /* Move src back by the same offset so
                                           src and dst stay in step.  */
156        add     count, count, tmp1      /* Count is now 16 too large.  */
157        ldp     A_l, A_h, [src, 16]
158        stp     D_l, D_h, [dstin]       /* Unaligned store of the first 16 bytes.  */
159        ldp     B_l, B_h, [src, 32]
160        ldp     C_l, C_h, [src, 48]
161        ldp     D_l, D_h, [src, 64]!    /* Pre-index with writeback: src += 64.  */
162        subs    count, count, 128 + 16  /* Test and readjust count.  */
163        b.ls    L(copy64_from_end)      /* Nothing left for the loop.  */
        /* Software-pipelined loop, 64 bytes per iteration: each stp writes
           data loaded on the previous iteration (or in the prologue above)
           while the following ldp fetches data for the next one.  Stores go
           through dst, which is 16-byte aligned.  */
164L(loop64):
165        stp     A_l, A_h, [dst, 16]
166        ldp     A_l, A_h, [src, 16]
167        stp     B_l, B_h, [dst, 32]
168        ldp     B_l, B_h, [src, 32]
169        stp     C_l, C_h, [dst, 48]
170        ldp     C_l, C_h, [src, 48]
171        stp     D_l, D_h, [dst, 64]!    /* Writeback: dst += 64.  */
172        ldp     D_l, D_h, [src, 64]!    /* Writeback: src += 64.  */
173        subs    count, count, 64
174        b.hi    L(loop64)
175
176        /* Write the last iteration and copy 64 bytes from the end.  */
177L(copy64_from_end):
        /* A..D still hold 64 loaded-but-unstored bytes.  Their stores are
           interleaved with loads of the final 64 source bytes, which are
           then written relative to dstend1.  E_l shares x14 with tmp1,
           which is no longer needed here.  */
178        ldp     E_l, E_h, [srcend1, -64]
179        stp     A_l, A_h, [dst, 16]
180        ldp     A_l, A_h, [srcend1, -48]
181        stp     B_l, B_h, [dst, 32]
182        ldp     B_l, B_h, [srcend1, -32]
183        stp     C_l, C_h, [dst, 48]
184        ldp     C_l, C_h, [srcend1, -16]
185        stp     D_l, D_h, [dst, 64]
186        stp     E_l, E_h, [dstend1, -64]
187        stp     A_l, A_h, [dstend1, -48]
188        stp     B_l, B_h, [dstend1, -32]
189        stp     C_l, C_h, [dstend1, -16]
190        ret
191
192        .p2align 4
193
194        /* Large backwards copy for overlapping copies.
195           Copy 16 bytes and then align dst to 16-byte alignment.  */
196L(copy_long_backwards):
        /* Mirror image of the forward path: it works down from the end, and
           dstend1 takes the role of the aligned store pointer.  */
197        ldp     D_l, D_h, [srcend1, -16]        /* Last 16 source bytes.  */
198        and     tmp1, dstend1, 15               /* Offset of dstend1 within its 16-byte block.  */
199        sub     srcend1, srcend1, tmp1
200        sub     count, count, tmp1
201        ldp     A_l, A_h, [srcend1, -16]
202        stp     D_l, D_h, [dstend1, -16]        /* Unaligned store of the last 16 bytes.  */
203        ldp     B_l, B_h, [srcend1, -32]
204        ldp     C_l, C_h, [srcend1, -48]
205        ldp     D_l, D_h, [srcend1, -64]!       /* Writeback: srcend1 -= 64.  */
206        sub     dstend1, dstend1, tmp1          /* dstend1 is now 16-byte aligned.  */
207        subs    count, count, 128
208        b.ls    L(copy64_from_start)            /* Nothing left for the loop.  */
209
        /* 64 bytes per iteration, descending; same store/load pipelining as
           the forward loop.  */
210L(loop64_backwards):
211        stp     A_l, A_h, [dstend1, -16]
212        ldp     A_l, A_h, [srcend1, -16]
213        stp     B_l, B_h, [dstend1, -32]
214        ldp     B_l, B_h, [srcend1, -32]
215        stp     C_l, C_h, [dstend1, -48]
216        ldp     C_l, C_h, [srcend1, -48]
217        stp     D_l, D_h, [dstend1, -64]!       /* Writeback: dstend1 -= 64.  */
218        ldp     D_l, D_h, [srcend1, -64]!       /* Writeback: srcend1 -= 64.  */
219        subs    count, count, 64
220        b.hi    L(loop64_backwards)
221
222        /* Write the last iteration and copy 64 bytes from the start.  */
223L(copy64_from_start):
        /* Store the 64 pending bytes in A..D while loading the first 64
           source bytes, then write those at dstin.  G_* alias count and dst;
           neither is read again on this path.  */
224        ldp     G_l, G_h, [src, 48]
225        stp     A_l, A_h, [dstend1, -16]
226        ldp     A_l, A_h, [src, 32]
227        stp     B_l, B_h, [dstend1, -32]
228        ldp     B_l, B_h, [src, 16]
229        stp     C_l, C_h, [dstend1, -48]
230        ldp     C_l, C_h, [src]
231        stp     D_l, D_h, [dstend1, -64]
232        stp     G_l, G_h, [dstin, 48]
233        stp     A_l, A_h, [dstin, 32]
234        stp     B_l, B_h, [dstin, 16]
235        stp     C_l, C_h, [dstin]
236        ret
237END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memcpy)
238
// __arm_sc_memmove shares the __arm_sc_memcpy entry point, which already
// handles overlapping buffers.  The alias macro comes from ../assembly.h and
// is not shown here; presumably it emits __arm_sc_memmove as a second symbol
// for the same code (confirm against assembly.h).
239DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)
240
241
242//
243//  __arm_sc_memset
244//
245
// Register aliases used by __arm_sc_memset.  dstin, count and dst repeat the
// definitions made earlier for the copy routine with identical replacement
// text (x0, x2, x3).
246#define dstin    x0   /* Destination start; never written by the routine.  */
247#define val      x1   /* Fill value, 64-bit view.  */
248#define valw     w1   /* Fill value, 32-bit view of the same register.  */
249#define count    x2   /* Number of bytes to set.  */
250#define dst      x3   /* Working destination pointer for the long paths.  */
251#define dstend2  x4   /* dstin + count.  */
252#define zva_val  x5   /* Scratch for the DCZID_EL0 read.  */
253
/* __arm_sc_memset: set count (x2) bytes starting at dstin (x0) to the low
   byte of valw (w1).

   x0 is never written, so it still holds dstin at every ret.  The routine
   modifies x1-x5, the condition flags and vector register 0 (written as
   z0 in the SVE build and as d0 / v0 otherwise); it makes no stack accesses.

   Size classes (unsigned compares): 0..15, 16..96, and > 96 bytes.  The
   > 96 case uses DC ZVA when the fill byte is zero and count >= 160, and
   plain 64-byte store loops otherwise.  */
254DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memset)
        /* Build a 16-byte pattern of the fill byte in q0.  */
255#ifdef __ARM_FEATURE_SVE
256        mov     z0.b, valw              /* Broadcast the low byte of valw to every byte lane.  */
257#else
        /* Replicate the low byte through val with bit-field inserts
           (8 -> 16 -> 32 -> 64 bits), then copy val into both 64-bit
           halves of v0.  */
258        bfi valw, valw, #8, #8
259        bfi valw, valw, #16, #16
260        bfi val, val, #32, #32
261        fmov d0, val
262        fmov v0.d[1], val
263#endif
264        add     dstend2, dstin, count   /* One past the last byte to set.  */
265
266        cmp     count, 96
267        b.hi    L(set_long)             /* count > 96.  */
268        cmp     count, 16
269        b.hs    L(set_medium)           /* 16 <= count <= 96.  */
270        mov     val, v0.D[0]            /* val = 8 replicated fill bytes.  Needed because
                                           the SVE build did not replicate val itself.  */
271
272        /* Set 0..15 bytes.  */
273        tbz     count, 3, 1f
        /* 8..15 bytes: two 8-byte stores, one from each end (may overlap).  */
274        str     val, [dstin]
275        str     val, [dstend2, -8]
276        ret
277        nop                             /* Unreachable; presumably padding so the code
                                           that follows is aligned (confirm).  */
2781:      tbz     count, 2, 2f
        /* 4..7 bytes: two 4-byte stores, one from each end.  */
279        str     valw, [dstin]
280        str     valw, [dstend2, -4]
281        ret
2822:      cbz     count, 3f               /* count == 0: nothing to set.  */
283        strb    valw, [dstin]           /* count is 1..3: first byte.  */
284        tbz     count, 1, 3f            /* count == 1: done.  */
285        strh    valw, [dstend2, -2]     /* count is 2 or 3: last two bytes.  */
2863:      ret
287
288        /* Set 16..96 bytes (the branch above is taken for count >= 16).  */
289L(set_medium):
290        str     q0, [dstin]             /* First 16 bytes.  */
291        tbnz    count, 6, L(set96)      /* Bit 6 set: 64..96 bytes.  */
292        str     q0, [dstend2, -16]      /* 16..63: last 16 bytes.  */
293        tbz     count, 5, 1f            /* Bit 5 clear: 16..31, already covered.  */
        /* 32..63 bytes: add a second 16-byte store from each end.  */
294        str     q0, [dstin, 16]
295        str     q0, [dstend2, -32]
2961:      ret
297
298        .p2align 4
299        /* Set 64..96 bytes.  Write 64 bytes from the start and
300           32 bytes from the end.  */
301L(set96):
302        str     q0, [dstin, 16]
303        stp     q0, q0, [dstin, 32]
304        stp     q0, q0, [dstend2, -32]
305        ret
306
307        .p2align 4
308L(set_long):
309        and     valw, valw, 255         /* Isolate the fill byte again for the zero test below
                                           (val may hold the replicated pattern).  */
310        bic     dst, dstin, 15          /* dst = dstin rounded down to 16.  */
311        str     q0, [dstin]             /* Unaligned store of the first 16 bytes.  */
312        cmp     count, 160
313        ccmp    valw, 0, 0, hs          /* If count >= 160 compare the fill byte with 0;
                                           otherwise force flags to 0000, which reads as ne.  */
314        b.ne    L(no_zva)               /* Taken when count < 160 or the fill byte is nonzero.  */
315
        /* Unless SKIP_ZVA_CHECK is defined, also require the low five bits
           of DCZID_EL0 to equal 4.  With SKIP_ZVA_CHECK defined the test is
           omitted and the DC ZVA path below is taken unconditionally here.
           NOTE(review): the five bits presumably combine the block-size
           field with the DC ZVA prohibit bit; confirm against the
           DCZID_EL0 description in the Arm architecture manual.  */
316#ifndef SKIP_ZVA_CHECK
317        mrs     zva_val, dczid_el0
318        and     zva_val, zva_val, 31
319        cmp     zva_val, 4              /* ZVA size is 64 bytes.  */
320        b.ne    L(no_zva)
321#endif
        /* Together with the store at [dstin] above, these two stores fill
           every byte from dstin up to dst + 64, so the first 64-byte
           boundary after dst is already reached by ordinary stores.  */
322        str     q0, [dst, 16]
323        stp     q0, q0, [dst, 32]
324        bic     dst, dst, 63            /* dst rounded down to a 64-byte boundary.  */
325        sub     count, dstend2, dst      /* Count is now 64 too large.  */
326        sub     count, count, 128       /* Adjust count and bias for loop.  */
327
328        .p2align 4
329L(zva_loop):
        /* One DC ZVA per iteration at successive 64-byte-aligned addresses.
           This loop is only reached when the fill byte is zero.  */
330        add     dst, dst, 64
331        dc      zva, dst
332        subs    count, count, 64
333        b.hi    L(zva_loop)
        /* Finish with ordinary stores covering the last 64 bytes.  */
334        stp     q0, q0, [dstend2, -64]
335        stp     q0, q0, [dstend2, -32]
336        ret
337
338L(no_zva):
339        sub     count, dstend2, dst      /* Count is 16 too large.  */
340        sub     dst, dst, 16            /* Dst is biased by -32.  */
341        sub     count, count, 64 + 16   /* Adjust count and bias for loop.  */
342L(no_zva_loop):
        /* 64 bytes per iteration through the 16-byte-aligned dst.  */
343        stp     q0, q0, [dst, 32]
344        stp     q0, q0, [dst, 64]!      /* Writeback: dst += 64.  */
345        subs    count, count, 64
346        b.hi    L(no_zva_loop)
        /* Finish with stores covering the last 64 bytes.  */
347        stp     q0, q0, [dstend2, -64]
348        stp     q0, q0, [dstend2, -32]
349        ret
350END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memset)
351
352#endif // __aarch64__
353