/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2019-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
 *
 */

#include "asmdefs.h"

#ifdef HAVE_SVE

.arch armv8-a+sve

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define tmp1	x6
#define vlen	x6

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point. It uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.
   SVE vectors are used to speed up small copies.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies. The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration. The source pointer is 16-byte aligned to minimize unaligned
   accesses. The loop tail is handled by always copying 64 bytes from the
   end. */
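/* Illustrative summary of the size-based dispatch below (editorial sketch,
   derived from the code; VL denotes the SVE vector length in bytes as read
   by cntb):

     count <= 2 * VL         branchless copy via two predicated SVE loads
                             and stores (whilelo + ld1b/st1b).
     2 * VL < count <= 128   overlapping 16-byte LDP/STP pairs taken from
                             both the start and the end of the buffer.
     count > 128             if (dstin - src) < count (unsigned), i.e. the
                             destination overlaps the source ahead of it,
                             copy backwards 64 bytes per iteration with the
                             tail taken from the start; otherwise align src,
                             copy forwards 64 bytes per iteration with the
                             tail taken from the end. */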
ENTRY_ALIAS (__memmove_aarch64_sve)
ENTRY (__memcpy_aarch64_sve)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	cmp	count, 128
	b.hi	L(copy_long)
	cntb	vlen
	cmp	count, vlen, lsl 1
	b.hi	L(copy32_128)

	whilelo	p0.b, xzr, count
	whilelo	p1.b, vlen, count
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p1, [dstin, 1, mul vl]
	ret

	/* Medium copies: 33..128 bytes. */
L(copy32_128):
	add	srcend, src, count
	add	dstend, dstin, count
	ldp	A_q, B_q, [src]
	ldp	C_q, D_q, [srcend, -32]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_q, B_q, [dstin]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy 65..128 bytes. */
L(copy128):
	ldp	E_q, F_q, [src, 32]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_q, H_q, [srcend, -64]
	stp	G_q, H_q, [dstend, -64]
L(copy96):
	stp	A_q, B_q, [dstin]
	stp	E_q, F_q, [dstin, 32]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy more than 128 bytes. */
L(copy_long):
	add	srcend, src, count
	add	dstend, dstin, count

	/* Use backwards copy if there is an overlap. */
	sub	tmp1, dstin, src
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align src to 16-byte alignment. */
	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large. */
	ldp	A_q, B_q, [src, 16]
	str	D_q, [dstin]
	ldp	C_q, D_q, [src, 48]
	subs	count, count, 128 + 16	/* Test and readjust count. */
	b.ls	L(copy64_from_end)
L(loop64):
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [src, 80]
	stp	C_q, D_q, [dst, 48]
	ldp	C_q, D_q, [src, 112]
	add	src, src, 64
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
	ldp	E_q, F_q, [srcend, -64]
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dst, 48]
	stp	E_q, F_q, [dstend, -64]
	stp	A_q, B_q, [dstend, -32]
	ret

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align srcend to 16-byte alignment. */
L(copy_long_backwards):
	cbz	tmp1, L(return)
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	bic	srcend, srcend, 15
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	str	B_q, [dstend, -16]
	str	A_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -96]
	str	D_q, [dstend, -48]
	str	C_q, [dstend, -64]!
	ldp	C_q, D_q, [srcend, -128]
	sub	srcend, srcend, 64
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
	ldp	E_q, F_q, [src, 32]
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [src]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	A_q, B_q, [dstin]
L(return):
	ret

END (__memcpy_aarch64_sve)

#endif