// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

// Routines taken from libc/AOR_v20.02/string/aarch64

#include "../assembly.h"

#ifdef __aarch64__

#define L(l) .L ## l

//
// __arm_sc_memcpy / __arm_sc_memmove
//

#define dstin x0
#define src x1
#define count x2
#define dst x3
#define srcend1 x4
#define dstend1 x5
#define A_l x6
#define A_lw w6
#define A_h x7
#define B_l x8
#define B_lw w8
#define B_h x9
#define C_l x10
#define C_lw w10
#define C_h x11
#define D_l x12
#define D_h x13
#define E_l x14
#define E_h x15
#define F_l x16
#define F_h x17
#define G_l count
#define G_h dst
#define H_l src
#define H_h srcend1
#define tmp1 x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point. It uses unaligned accesses and branchless
   sequences to keep the code small, simple and improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies. The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/

DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memcpy)
        add     srcend1, src, count
        add     dstend1, dstin, count
        cmp     count, 128
        b.hi    L(copy_long)
        cmp     count, 32
        b.hi    L(copy32_128)

        /* Small copies: 0..32 bytes. */
        cmp     count, 16
        b.lo    L(copy16)
        ldp     A_l, A_h, [src]
        ldp     D_l, D_h, [srcend1, -16]
        stp     A_l, A_h, [dstin]
        stp     D_l, D_h, [dstend1, -16]
        ret

        /* Copy 8-15 bytes. */
L(copy16):
        tbz     count, 3, L(copy8)
        ldr     A_l, [src]
        ldr     A_h, [srcend1, -8]
        str     A_l, [dstin]
        str     A_h, [dstend1, -8]
        ret

        .p2align 3
        /* Copy 4-7 bytes. */
L(copy8):
        tbz     count, 2, L(copy4)
        ldr     A_lw, [src]
        ldr     B_lw, [srcend1, -4]
        str     A_lw, [dstin]
        str     B_lw, [dstend1, -4]
        ret

        /* Copy 0..3 bytes using a branchless sequence. */
L(copy4):
        cbz     count, L(copy0)
        lsr     tmp1, count, 1
        ldrb    A_lw, [src]
        ldrb    C_lw, [srcend1, -1]
        ldrb    B_lw, [src, tmp1]
        strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
        strb    C_lw, [dstend1, -1]
L(copy0):
        ret

        .p2align 4
        /* Medium copies: 33..128 bytes. */
L(copy32_128):
        ldp     A_l, A_h, [src]
        ldp     B_l, B_h, [src, 16]
        ldp     C_l, C_h, [srcend1, -32]
        ldp     D_l, D_h, [srcend1, -16]
        cmp     count, 64
        b.hi    L(copy128)
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstend1, -32]
        stp     D_l, D_h, [dstend1, -16]
        ret

        .p2align 4
        /* Copy 65..128 bytes. */
L(copy128):
        ldp     E_l, E_h, [src, 32]
        ldp     F_l, F_h, [src, 48]
        cmp     count, 96
        b.ls    L(copy96)
        ldp     G_l, G_h, [srcend1, -64]
        ldp     H_l, H_h, [srcend1, -48]
        stp     G_l, G_h, [dstend1, -64]
        stp     H_l, H_h, [dstend1, -48]
L(copy96):
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     E_l, E_h, [dstin, 32]
        stp     F_l, F_h, [dstin, 48]
        stp     C_l, C_h, [dstend1, -32]
        stp     D_l, D_h, [dstend1, -16]
        ret
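
/* Informal sketch only: the forward large-copy path below behaves roughly
   like the following C-like pseudocode. copy16(), copy64(), dst16 and src16
   are illustrative names, not symbols defined in this file:

     copy16(dstin, src);                  // unaligned 16-byte head
     align dst16 down to 16 bytes and bias src16 by the same amount;
     do
       copy64(dst16 += 64, src16 += 64);  // pipelined: each iteration stores
                                          // the chunk loaded previously
     while (more than 64 bytes remain);
     copy64(dstend1 - 64, srcend1 - 64);  // unaligned 64-byte tail

   The head and tail stores may overlap the loop's stores; that is harmless
   because the overlapping bytes are identical. */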

        .p2align 4
        /* Copy more than 128 bytes. */
L(copy_long):
        /* Use backwards copy if there is an overlap. */
        sub     tmp1, dstin, src
        cbz     tmp1, L(copy0)
        cmp     tmp1, count
        b.lo    L(copy_long_backwards)

        /* Copy 16 bytes and then align dst to 16-byte alignment. */

        ldp     D_l, D_h, [src]
        and     tmp1, dstin, 15
        bic     dst, dstin, 15
        sub     src, src, tmp1
        add     count, count, tmp1      /* Count is now 16 too large. */
        ldp     A_l, A_h, [src, 16]
        stp     D_l, D_h, [dstin]
        ldp     B_l, B_h, [src, 32]
        ldp     C_l, C_h, [src, 48]
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 128 + 16  /* Test and readjust count. */
        b.ls    L(copy64_from_end)
L(loop64):
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [src, 16]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [src, 32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [src, 48]
        stp     D_l, D_h, [dst, 64]!
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 64
        b.hi    L(loop64)

        /* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
        ldp     E_l, E_h, [srcend1, -64]
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [srcend1, -48]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [srcend1, -32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [srcend1, -16]
        stp     D_l, D_h, [dst, 64]
        stp     E_l, E_h, [dstend1, -64]
        stp     A_l, A_h, [dstend1, -48]
        stp     B_l, B_h, [dstend1, -32]
        stp     C_l, C_h, [dstend1, -16]
        ret

        .p2align 4

        /* Large backwards copy for overlapping copies.
           Copy 16 bytes and then align dst to 16-byte alignment. */
L(copy_long_backwards):
        ldp     D_l, D_h, [srcend1, -16]
        and     tmp1, dstend1, 15
        sub     srcend1, srcend1, tmp1
        sub     count, count, tmp1
        ldp     A_l, A_h, [srcend1, -16]
        stp     D_l, D_h, [dstend1, -16]
        ldp     B_l, B_h, [srcend1, -32]
        ldp     C_l, C_h, [srcend1, -48]
        ldp     D_l, D_h, [srcend1, -64]!
        sub     dstend1, dstend1, tmp1
        subs    count, count, 128
        b.ls    L(copy64_from_start)

L(loop64_backwards):
        stp     A_l, A_h, [dstend1, -16]
        ldp     A_l, A_h, [srcend1, -16]
        stp     B_l, B_h, [dstend1, -32]
        ldp     B_l, B_h, [srcend1, -32]
        stp     C_l, C_h, [dstend1, -48]
        ldp     C_l, C_h, [srcend1, -48]
        stp     D_l, D_h, [dstend1, -64]!
        ldp     D_l, D_h, [srcend1, -64]!
        subs    count, count, 64
        b.hi    L(loop64_backwards)

        /* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
        ldp     G_l, G_h, [src, 48]
        stp     A_l, A_h, [dstend1, -16]
        ldp     A_l, A_h, [src, 32]
        stp     B_l, B_h, [dstend1, -32]
        ldp     B_l, B_h, [src, 16]
        stp     C_l, C_h, [dstend1, -48]
        ldp     C_l, C_h, [src]
        stp     D_l, D_h, [dstend1, -64]
        stp     G_l, G_h, [dstin, 48]
        stp     A_l, A_h, [dstin, 32]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstin]
        ret
END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memcpy)

DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)
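
/* __arm_sc_memmove can alias __arm_sc_memcpy because L(copy_long) performs
   the overlap check itself; as a rough C-like sketch (not code from this
   file):

     if ((uintptr_t)(dstin - src) < (uintptr_t)count)
       copy backwards;   // dstin lies inside [src, src + count)
     else
       copy forwards;

   Small and medium copies need no check since they load all source bytes
   before storing any of them. */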

//
// __arm_sc_memset
//

#define dstin x0
#define val x1
#define valw w1
#define count x2
#define dst x3
#define dstend2 x4
#define zva_val x5

DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memset)
#ifdef __ARM_FEATURE_SVE
        mov     z0.b, valw
#else
        bfi     valw, valw, #8, #8
        bfi     valw, valw, #16, #16
        bfi     val, val, #32, #32
        fmov    d0, val
        fmov    v0.d[1], val
#endif
        add     dstend2, dstin, count

        cmp     count, 96
        b.hi    L(set_long)
        cmp     count, 16
        b.hs    L(set_medium)
        mov     val, v0.D[0]

        /* Set 0..15 bytes. */
        tbz     count, 3, 1f
        str     val, [dstin]
        str     val, [dstend2, -8]
        ret
        nop
1:      tbz     count, 2, 2f
        str     valw, [dstin]
        str     valw, [dstend2, -4]
        ret
2:      cbz     count, 3f
        strb    valw, [dstin]
        tbz     count, 1, 3f
        strh    valw, [dstend2, -2]
3:      ret

        /* Set 17..96 bytes. */
L(set_medium):
        str     q0, [dstin]
        tbnz    count, 6, L(set96)
        str     q0, [dstend2, -16]
        tbz     count, 5, 1f
        str     q0, [dstin, 16]
        str     q0, [dstend2, -32]
1:      ret

        .p2align 4
        /* Set 64..96 bytes. Write 64 bytes from the start and
           32 bytes from the end. */
L(set96):
        str     q0, [dstin, 16]
        stp     q0, q0, [dstin, 32]
        stp     q0, q0, [dstend2, -32]
        ret

        .p2align 4
L(set_long):
        and     valw, valw, 255
        bic     dst, dstin, 15
        str     q0, [dstin]
        cmp     count, 160
        ccmp    valw, 0, 0, hs
        b.ne    L(no_zva)

#ifndef SKIP_ZVA_CHECK
        mrs     zva_val, dczid_el0
        and     zva_val, zva_val, 31
        cmp     zva_val, 4              /* ZVA size is 64 bytes. */
        b.ne    L(no_zva)
#endif
        str     q0, [dst, 16]
        stp     q0, q0, [dst, 32]
        bic     dst, dst, 63
        sub     count, dstend2, dst     /* Count is now 64 too large. */
        sub     count, count, 128       /* Adjust count and bias for loop. */

        .p2align 4
L(zva_loop):
        add     dst, dst, 64
        dc      zva, dst
        subs    count, count, 64
        b.hi    L(zva_loop)
        stp     q0, q0, [dstend2, -64]
        stp     q0, q0, [dstend2, -32]
        ret

L(no_zva):
        sub     count, dstend2, dst     /* Count is 16 too large. */
        sub     dst, dst, 16            /* Dst is biased by -32. */
        sub     count, count, 64 + 16   /* Adjust count and bias for loop. */
L(no_zva_loop):
        stp     q0, q0, [dst, 32]
        stp     q0, q0, [dst, 64]!
        subs    count, count, 64
        b.hi    L(no_zva_loop)
        stp     q0, q0, [dstend2, -64]
        stp     q0, q0, [dstend2, -32]
        ret
END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memset)

#endif // __aarch64__