/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2019-2022, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
 *
 */

#if __ARM_FEATURE_SVE

#include "../asmdefs.h"

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define tmp1    x6
#define vlen    x6

#define A_q     q0
#define B_q     q1
#define C_q     q2
#define D_q     q3
#define E_q     q4
#define F_q     q5
#define G_q     q6
#define H_q     q7

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple, and to improve performance.
   SVE vectors are used to speed up small copies.

   Copies are split into three main cases: small copies of up to 32 bytes,
   medium copies of up to 128 bytes, and large copies.  The overhead of the
   overlap check is negligible since it is only required for large copies.

   Large copies use a software-pipelined loop processing 64 bytes per
   iteration.  The source pointer is 16-byte aligned to minimize unaligned
   accesses.  The loop tail is handled by always copying 64 bytes from the
   end.  */

ENTRY_ALIAS (__memmove_aarch64_sve)
ENTRY (__memcpy_aarch64_sve)
        PTR_ARG (0)
        PTR_ARG (1)
        SIZE_ARG (2)

        cmp     count, 128
        b.hi    L(copy_long)
        cmp     count, 32
        b.hi    L(copy32_128)

        /* Small copies: 0..32 bytes.  */
        whilelo p0.b, xzr, count
        cntb    vlen
        tbnz    vlen, 4, L(vlen128)
        ld1b    z0.b, p0/z, [src]
        st1b    z0.b, p0, [dstin]
        ret

        /* Medium copies: 33..128 bytes.  */
L(copy32_128):
        add     srcend, src, count
        add     dstend, dstin, count
        ldp     A_q, B_q, [src]
        ldp     C_q, D_q, [srcend, -32]
        cmp     count, 64
        b.hi    L(copy128)
        stp     A_q, B_q, [dstin]
        stp     C_q, D_q, [dstend, -32]
        ret

        /* Copy 65..128 bytes.  */
L(copy128):
        ldp     E_q, F_q, [src, 32]
        cmp     count, 96
        b.ls    L(copy96)
        ldp     G_q, H_q, [srcend, -64]
        stp     G_q, H_q, [dstend, -64]
L(copy96):
        stp     A_q, B_q, [dstin]
        stp     E_q, F_q, [dstin, 32]
        stp     C_q, D_q, [dstend, -32]
        ret

        /* Copy more than 128 bytes.  */
L(copy_long):
        add     srcend, src, count
        add     dstend, dstin, count

        /* Use backwards copy if there is an overlap.  */
        sub     tmp1, dstin, src
        cmp     tmp1, count
        b.lo    L(copy_long_backwards)

        /* Copy 16 bytes and then align src to 16-byte alignment.  */
        ldr     D_q, [src]
        and     tmp1, src, 15
        bic     src, src, 15
        sub     dst, dstin, tmp1
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldp     A_q, B_q, [src, 16]
        str     D_q, [dstin]
        ldp     C_q, D_q, [src, 48]
        subs    count, count, 128 + 16  /* Test and readjust count.  */
        b.ls    L(copy64_from_end)
L(loop64):
        stp     A_q, B_q, [dst, 16]
        ldp     A_q, B_q, [src, 80]
        stp     C_q, D_q, [dst, 48]
        ldp     C_q, D_q, [src, 112]
        add     src, src, 64
        add     dst, dst, 64
        subs    count, count, 64
        b.hi    L(loop64)

        /* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
        ldp     E_q, F_q, [srcend, -64]
        stp     A_q, B_q, [dst, 16]
        ldp     A_q, B_q, [srcend, -32]
        stp     C_q, D_q, [dst, 48]
        stp     E_q, F_q, [dstend, -64]
        stp     A_q, B_q, [dstend, -32]
        ret

        /* Small copy when the vector length is not a multiple of 32 bytes
           (e.g. 128-bit SVE): use a second predicated vector to cover up
           to 32 bytes.  */
L(vlen128):
        whilelo p1.b, vlen, count
        ld1b    z0.b, p0/z, [src, 0, mul vl]
        ld1b    z1.b, p1/z, [src, 1, mul vl]
        st1b    z0.b, p0, [dstin, 0, mul vl]
        st1b    z1.b, p1, [dstin, 1, mul vl]
        ret

        /* Large backwards copy for overlapping copies.
           Copy 16 bytes and then align srcend to 16-byte alignment.  */
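        /* This path is reached when dstin - src is (unsigned) less than
           count, i.e. the destination starts inside the source buffer, so
           a forward copy would overwrite source bytes before they are
           read.  Copying from the end towards the start handles any such
           overlap; the dstin == src case simply returns below.  This is
           what allows memmove to share this entry point.  */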
L(copy_long_backwards):
        cbz     tmp1, L(return)
        ldr     D_q, [srcend, -16]
        and     tmp1, srcend, 15
        bic     srcend, srcend, 15
        sub     count, count, tmp1
        ldp     A_q, B_q, [srcend, -32]
        str     D_q, [dstend, -16]
        ldp     C_q, D_q, [srcend, -64]
        sub     dstend, dstend, tmp1
        subs    count, count, 128
        b.ls    L(copy64_from_start)

L(loop64_backwards):
        str     B_q, [dstend, -16]
        str     A_q, [dstend, -32]
        ldp     A_q, B_q, [srcend, -96]
        str     D_q, [dstend, -48]
        str     C_q, [dstend, -64]!
        ldp     C_q, D_q, [srcend, -128]
        sub     srcend, srcend, 64
        subs    count, count, 64
        b.hi    L(loop64_backwards)

        /* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
        ldp     E_q, F_q, [src, 32]
        stp     A_q, B_q, [dstend, -32]
        ldp     A_q, B_q, [src]
        stp     C_q, D_q, [dstend, -64]
        stp     E_q, F_q, [dstin, 32]
        stp     A_q, B_q, [dstin]
L(return):
        ret

END (__memcpy_aarch64_sve)
#endif