/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2019-2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#include "../asmdefs.h"

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_lw    w10
#define tmp1    x14

#define A_q     q0
#define B_q     q1
#define C_q     q2
#define D_q     q3
#define E_q     q4
#define F_q     q5
#define G_q     q6
#define H_q     q7

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple, and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration.  The source pointer is 16-byte aligned to minimize unaligned
   accesses.  The loop tail is handled by always copying 64 bytes from the
   end.  */

ENTRY_ALIAS (__memmove_aarch64_simd)
ENTRY (__memcpy_aarch64_simd)
        PTR_ARG (0)
        PTR_ARG (1)
        SIZE_ARG (2)
        add     srcend, src, count
        add     dstend, dstin, count
        cmp     count, 128
        b.hi    L(copy_long)
        cmp     count, 32
        b.hi    L(copy32_128)

        /* Small copies: 0..32 bytes.  */
        cmp     count, 16
        b.lo    L(copy16)
        ldr     A_q, [src]
        ldr     B_q, [srcend, -16]
        str     A_q, [dstin]
        str     B_q, [dstend, -16]
        ret

        /* Copy 8-15 bytes.  */
L(copy16):
        tbz     count, 3, L(copy8)
        ldr     A_l, [src]
        ldr     A_h, [srcend, -8]
        str     A_l, [dstin]
        str     A_h, [dstend, -8]
        ret

        .p2align 3
        /* Copy 4-7 bytes.  */
L(copy8):
        tbz     count, 2, L(copy4)
        ldr     A_lw, [src]
        ldr     B_lw, [srcend, -4]
        str     A_lw, [dstin]
        str     B_lw, [dstend, -4]
        ret

        /* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
        cbz     count, L(copy0)
        lsr     tmp1, count, 1
        ldrb    A_lw, [src]
        ldrb    C_lw, [srcend, -1]
        ldrb    B_lw, [src, tmp1]
        strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
        strb    C_lw, [dstend, -1]
L(copy0):
        ret

        .p2align 4
        /* Medium copies: 33..128 bytes.  */
L(copy32_128):
        ldp     A_q, B_q, [src]
        ldp     C_q, D_q, [srcend, -32]
        cmp     count, 64
        b.hi    L(copy128)
        stp     A_q, B_q, [dstin]
        stp     C_q, D_q, [dstend, -32]
        ret

        .p2align 4
        /* Copy 65..128 bytes.  */
L(copy128):
        ldp     E_q, F_q, [src, 32]
        cmp     count, 96
        b.ls    L(copy96)
        ldp     G_q, H_q, [srcend, -64]
        stp     G_q, H_q, [dstend, -64]
L(copy96):
        stp     A_q, B_q, [dstin]
        stp     E_q, F_q, [dstin, 32]
        stp     C_q, D_q, [dstend, -32]
        ret

        /* Copy more than 128 bytes.  */
L(copy_long):
        /* Use backwards copy if there is an overlap.  */
        sub     tmp1, dstin, src
        cmp     tmp1, count
        b.lo    L(copy_long_backwards)

        /* Copy 16 bytes and then align src to 16-byte alignment.  */
        ldr     D_q, [src]
        and     tmp1, src, 15
        bic     src, src, 15
        sub     dst, dstin, tmp1
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldp     A_q, B_q, [src, 16]
        str     D_q, [dstin]
        ldp     C_q, D_q, [src, 48]
        subs    count, count, 128 + 16  /* Test and readjust count.  */
        b.ls    L(copy64_from_end)
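        /* Each iteration stores the 64 bytes loaded by the previous
           iteration while loading the next 64 bytes, overlapping loads
           and stores (the software pipelining described above).  */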
L(loop64):
        stp     A_q, B_q, [dst, 16]
        ldp     A_q, B_q, [src, 80]
        stp     C_q, D_q, [dst, 48]
        ldp     C_q, D_q, [src, 112]
        add     src, src, 64
        add     dst, dst, 64
        subs    count, count, 64
        b.hi    L(loop64)

        /* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
        ldp     E_q, F_q, [srcend, -64]
        stp     A_q, B_q, [dst, 16]
        ldp     A_q, B_q, [srcend, -32]
        stp     C_q, D_q, [dst, 48]
        stp     E_q, F_q, [dstend, -64]
        stp     A_q, B_q, [dstend, -32]
        ret

        /* Large backwards copy for overlapping copies.
           Copy 16 bytes and then align srcend to 16-byte alignment.  */
L(copy_long_backwards):
        cbz     tmp1, L(copy0)
        ldr     D_q, [srcend, -16]
        and     tmp1, srcend, 15
        bic     srcend, srcend, 15
        sub     count, count, tmp1
        ldp     A_q, B_q, [srcend, -32]
        str     D_q, [dstend, -16]
        ldp     C_q, D_q, [srcend, -64]
        sub     dstend, dstend, tmp1
        subs    count, count, 128
        b.ls    L(copy64_from_start)

L(loop64_backwards):
        str     B_q, [dstend, -16]
        str     A_q, [dstend, -32]
        ldp     A_q, B_q, [srcend, -96]
        str     D_q, [dstend, -48]
        str     C_q, [dstend, -64]!
        ldp     C_q, D_q, [srcend, -128]
        sub     srcend, srcend, 64
        subs    count, count, 64
        b.hi    L(loop64_backwards)

        /* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
        ldp     E_q, F_q, [src, 32]
        stp     A_q, B_q, [dstend, -32]
        ldp     A_q, B_q, [src]
        stp     C_q, D_q, [dstend, -64]
        stp     E_q, F_q, [dstin, 32]
        stp     A_q, B_q, [dstin]
        ret

END (__memcpy_aarch64_simd)