/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2012-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#include "asmdefs.h"

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
#define tmp1	x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple, and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/

ENTRY_ALIAS (__memmove_aarch64)
ENTRY (__memcpy_aarch64)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Use backwards copy if there is an overlap.  */
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align dst to 16-byte alignment.  */

	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)

L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dst to 16-byte alignment.  */
L(copy_long_backwards):
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret

END (__memcpy_aarch64)
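
/* The C fragment below is not part of the build.  It is a minimal,
   illustrative sketch of two ideas used in the code above, assuming
   unaligned accesses are cheap; the helper names (copy16_unaligned,
   copy_16_to_32, must_copy_backwards) are invented for illustration only
   and do not correspond to symbols in this file.

   #include <stddef.h>
   #include <stdint.h>
   #include <string.h>

   // Unaligned 16-byte copy; compilers typically lower this to a single
   // load/store pair on AArch64, matching the ldp/stp pairs used above.
   static inline void copy16_unaligned (char *d, const char *s)
   {
     memcpy (d, s, 16);
   }

   // 16..32 byte case: one chunk from the start plus one from the end
   // covers the whole range; for counts below 32 the two stores simply
   // overlap in the middle, so no length-dependent loop or branch is
   // needed.  The medium copies and the large-copy tails use the same
   // copy-from-both-ends idea with 32- and 64-byte chunks.
   static void copy_16_to_32 (char *dst, const char *src, size_t count)
   {
     copy16_unaligned (dst, src);
     copy16_unaligned (dst + count - 16, src + count - 16);
   }

   // Overlap test used before the large-copy loops: the unsigned difference
   // dst - src is below count exactly when dst lies inside
   // [src, src + count), i.e. when a forward copy would clobber source
   // bytes not yet read.  The assembly above additionally short-circuits
   // the dst == src case before making this comparison.
   static int must_copy_backwards (const char *dst, const char *src,
                                   size_t count)
   {
     return (uintptr_t) (dst - src) < count;
   }
*/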