/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2019-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#include "asmdefs.h"

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_lw	w10
#define tmp1	x14

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration.  The source pointer is 16-byte aligned to minimize unaligned
   accesses.  The loop tail is handled by always copying 64 bytes from the
   end.

   An illustrative (non-assembled) C sketch of this structure is appended at
   the end of this file.  */

ENTRY_ALIAS (__memmove_aarch64_simd)
ENTRY (__memcpy_aarch64_simd)
	add	srcend, src, count
	cmp	count, 128
	b.hi	L(copy_long)
	add	dstend, dstin, count
	cmp	count, 32
	b.hi	L(copy32_128)
	nop

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldr	A_q, [src]
	ldr	B_q, [srcend, -16]
	str	A_q, [dstin]
	str	B_q, [dstend, -16]
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	ldp	A_q, B_q, [src]
	ldp	C_q, D_q, [srcend, -32]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_q, B_q, [dstin]
	stp	C_q, D_q, [dstend, -32]
	ret

	.p2align 4
	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_q, F_q, [src, 32]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_q, H_q, [srcend, -64]
	stp	G_q, H_q, [dstend, -64]
L(copy96):
	stp	A_q, B_q, [dstin]
	stp	E_q, F_q, [dstin, 32]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 3
	/* Copy more than 128 bytes.  */
L(copy_long):
	add	dstend, dstin, count

	/* Use backwards copy if there is an overlap.  */
	sub	tmp1, dstin, src
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align src to 16-byte alignment.  */
	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_q, B_q, [src, 16]
	str	D_q, [dstin]
	ldp	C_q, D_q, [src, 48]
	subs	count, count, 128 + 16	/* Test and readjust count.  */
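	/* The subtraction removes the extra 16 bytes from the alignment
	   prologue, the 64 bytes already loaded into A_q..D_q and the
	   64-byte tail that L(copy64_from_end) always writes, leaving the
	   byte count for the main loop; if nothing remains, skip straight
	   to the tail.  */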
	b.ls	L(copy64_from_end)
L(loop64):
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [src, 80]
	stp	C_q, D_q, [dst, 48]
	ldp	C_q, D_q, [src, 112]
	add	src, src, 64
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_q, F_q, [srcend, -64]
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dst, 48]
	stp	E_q, F_q, [dstend, -64]
	stp	A_q, B_q, [dstend, -32]
	ret

	.p2align 4
	nop

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
L(copy_long_backwards):
	cbz	tmp1, L(copy0)
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	bic	srcend, srcend, 15
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	str	B_q, [dstend, -16]
	str	A_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -96]
	str	D_q, [dstend, -48]
	str	C_q, [dstend, -64]!
	ldp	C_q, D_q, [srcend, -128]
	sub	srcend, srcend, 64
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	E_q, F_q, [src, 32]
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [src]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	A_q, B_q, [dstin]
	ret

END (__memcpy_aarch64_simd)
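/* The block below is an illustrative C sketch of the structure described at
   the top of this file: the three size classes, the overlap check that only
   the large path needs, and the forward path that always finishes by copying
   64 bytes from the end.  It is guarded out of the build; sketch_memcpy and
   the temporary-buffer copies are inventions of this sketch, and the source
   alignment and fixed-size overlapping accesses of the real code are
   simplified away.  */
#if 0
#include <stddef.h>
#include <stdint.h>
#include <string.h>

void *sketch_memcpy (void *dstin, const void *srcin, size_t count)
{
  unsigned char *dst = dstin;
  const unsigned char *src = srcin;

  if (count <= 32)			/* Small copies: 0..32 bytes.  */
    {
      unsigned char tmp[32];
      memcpy (tmp, src, count);		/* Load everything first...  */
      memcpy (dst, tmp, count);		/* ...then store, so overlaps are safe.  */
      return dstin;
    }

  if (count <= 128)			/* Medium copies: 33..128 bytes.  */
    {
      size_t h = count > 64 ? 64 : 32;	/* Two possibly overlapping halves.  */
      unsigned char head[64], tail[64];
      memcpy (head, src, h);
      memcpy (tail, src + count - h, h);
      memcpy (dst, head, h);
      memcpy (dst + count - h, tail, h);
      return dstin;
    }

  /* Large copies: the copy direction matters only here.  */
  if ((uintptr_t) dst - (uintptr_t) src < count)
    {
      /* dst overlaps and is ahead of src: copy backwards (the real code
	 does this in 64-byte blocks as well).  */
      for (size_t i = count; i-- > 0; )
	dst[i] = src[i];
      return dstin;
    }

  /* Forward: first 16 bytes, then 64-byte blocks, then always the last
     64 bytes taken from the end.  The real code also aligns src to 16
     bytes; that detail is omitted here.  */
  unsigned char head[16], tail[64], block[64];
  memcpy (head, src, 16);
  memcpy (tail, src + count - 64, 64);	/* Read the tail before any stores.  */
  memcpy (dst, head, 16);
  for (size_t i = 16; count - i > 64; i += 64)
    {
      memcpy (block, src + i, 64);
      memcpy (dst + i, block, 64);
    }
  memcpy (dst + count - 64, tail, 64);
  return dstin;
}
#endif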