/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2019-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#include "asmdefs.h"

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_lw	w10
#define tmp1	x14

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple, and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration.  The source pointer is 16-byte aligned to minimize unaligned
   accesses.  The loop tail is handled by always copying 64 bytes from the
   end.  */

ENTRY_ALIAS (__memmove_aarch64_simd)
ENTRY (__memcpy_aarch64_simd)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)
	add	srcend, src, count
	cmp	count, 128
	b.hi	L(copy_long)
	add	dstend, dstin, count
	cmp	count, 32
	b.hi	L(copy32_128)
	nop

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldr	A_q, [src]
	ldr	B_q, [srcend, -16]
	str	A_q, [dstin]
	str	B_q, [dstend, -16]
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	ldp	A_q, B_q, [src]
	ldp	C_q, D_q, [srcend, -32]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_q, B_q, [dstin]
	stp	C_q, D_q, [dstend, -32]
	ret

	.p2align 4
	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_q, F_q, [src, 32]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_q, H_q, [srcend, -64]
	stp	G_q, H_q, [dstend, -64]
L(copy96):
	stp	A_q, B_q, [dstin]
	stp	E_q, F_q, [dstin, 32]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 3
	/* Copy more than 128 bytes.  */
L(copy_long):
	add	dstend, dstin, count

	/* Use backwards copy if there is an overlap.  */
	sub	tmp1, dstin, src
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align src to 16-byte alignment.  */
	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_q, B_q, [src, 16]
	str	D_q, [dstin]
	ldp	C_q, D_q, [src, 48]
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)
L(loop64):
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [src, 80]
	stp	C_q, D_q, [dst, 48]
	ldp	C_q, D_q, [src, 112]
	add	src, src, 64
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_q, F_q, [srcend, -64]
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dst, 48]
	stp	E_q, F_q, [dstend, -64]
	stp	A_q, B_q, [dstend, -32]
	ret

	.p2align 4
	nop

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
L(copy_long_backwards):
	cbz	tmp1, L(copy0)
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	bic	srcend, srcend, 15
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	str	B_q, [dstend, -16]
	str	A_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -96]
	str	D_q, [dstend, -48]
	str	C_q, [dstend, -64]!
	ldp	C_q, D_q, [srcend, -128]
	sub	srcend, srcend, 64
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	E_q, F_q, [src, 32]
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [src]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	A_q, B_q, [dstin]
	ret

END (__memcpy_aarch64_simd)
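
/* Illustrative note (not part of the build): the dispatch described in the
   header comment can be modelled in C roughly as below.  This is a sketch of
   the control flow only; the byte loops and fixed-size memcpy calls stand in
   for the SIMD load/store sequences, and the helper names (copy_8_16,
   move_sketch) are made up for illustration.

   #include <stddef.h>
   #include <stdint.h>
   #include <string.h>

   // Any length in 8..16 bytes is covered by two possibly overlapping
   // 8-byte accesses, one anchored at the start and one at the end --
   // the same trick L(copy16) uses with X registers and the 16/32-byte
   // path uses with Q registers.
   static void copy_8_16 (unsigned char *dst, const unsigned char *src,
                          size_t count)
   {
     uint64_t a, b;
     memcpy (&a, src, 8);
     memcpy (&b, src + count - 8, 8);
     memcpy (dst, &a, 8);
     memcpy (dst + count - 8, &b, 8);
   }

   // One entry point serves memcpy and memmove: the unsigned test
   // (dst - src) >= count holds when dst is below src or the buffers are
   // disjoint, so a forward copy is safe; otherwise copy backwards.  This
   // mirrors the sub/cmp/b.lo sequence at L(copy_long).
   static void *move_sketch (void *dstin, const void *srcin, size_t count)
   {
     unsigned char *dst = dstin;
     const unsigned char *src = srcin;
     if ((uintptr_t) dst - (uintptr_t) src >= count)
       for (size_t i = 0; i < count; i++)
         dst[i] = src[i];
     else
       for (size_t i = count; i > 0; i--)
         dst[i - 1] = src[i - 1];
     return dstin;
   }
*/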