/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2019-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
 *
 */

#include "asmdefs.h"

.arch armv8-a+sve

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define tmp1	x6
#define vlen	x6

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple, and to improve performance.
   SVE vectors are used to speed up small copies.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software-pipelined loop processing 64 bytes per
   iteration.  The source pointer is 16-byte aligned to minimize unaligned
   accesses.  The loop tail is handled by always copying 64 bytes from the
   end.  */

ENTRY_ALIAS (__memmove_aarch64_sve)
ENTRY (__memcpy_aarch64_sve)
	cmp	count, 128
	b.hi	L(copy_long)
	cntb	vlen
	cmp	count, vlen, lsl 1
	b.hi	L(copy32_128)

	/* Small copies: at most two SVE vectors, done with predicated
	   loads and stores so no further branching on count is needed.  */
	whilelo	p0.b, xzr, count
	whilelo	p1.b, vlen, count
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p1, [dstin, 1, mul vl]
	ret

	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	add	srcend, src, count
	add	dstend, dstin, count
	ldp	A_q, B_q, [src]
	ldp	C_q, D_q, [srcend, -32]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_q, B_q, [dstin]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_q, F_q, [src, 32]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_q, H_q, [srcend, -64]
	stp	G_q, H_q, [dstend, -64]
L(copy96):
	stp	A_q, B_q, [dstin]
	stp	E_q, F_q, [dstin, 32]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy more than 128 bytes.  */
L(copy_long):
	add	srcend, src, count
	add	dstend, dstin, count

	/* Use backwards copy if there is an overlap.  */
	sub	tmp1, dstin, src
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align src to 16-byte alignment.  */
	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_q, B_q, [src, 16]
	str	D_q, [dstin]
	ldp	C_q, D_q, [src, 48]
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)
L(loop64):
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [src, 80]
	stp	C_q, D_q, [dst, 48]
	ldp	C_q, D_q, [src, 112]
	add	src, src, 64
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_q, F_q, [srcend, -64]
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dst, 48]
	stp	E_q, F_q, [dstend, -64]
	stp	A_q, B_q, [dstend, -32]
	ret

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
L(copy_long_backwards):
	cbz	tmp1, L(return)
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	bic	srcend, srcend, 15
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	str	B_q, [dstend, -16]
	str	A_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -96]
	str	D_q, [dstend, -48]
	str	C_q, [dstend, -64]!
	ldp	C_q, D_q, [srcend, -128]
	sub	srcend, srcend, 64
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	E_q, F_q, [src, 32]
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [src]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	A_q, B_q, [dstin]
L(return):
	ret

END (__memcpy_aarch64_sve)
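
/* For reference only, not assembled: the predicated small-copy path above can
   also be expressed with ACLE SVE intrinsics.  Below is a minimal C sketch,
   assuming <arm_sve.h> is available; the function name small_copy_sketch is
   hypothetical and merely mirrors the whilelo/ld1b/st1b sequence used for
   copies of at most two vectors.

   #include <arm_sve.h>
   #include <stdint.h>

   static void
   small_copy_sketch (uint8_t *dst, const uint8_t *src, uint64_t count)
   {
     uint64_t vl = svcntb ();                    // bytes per SVE vector (cntb)
     svbool_t p0 = svwhilelt_b8_u64 (0, count);  // lanes 0..count-1 (whilelo)
     svbool_t p1 = svwhilelt_b8_u64 (vl, count); // lanes vl..count-1
     svuint8_t a = svld1_u8 (p0, src);           // inactive lanes read as zero
     svuint8_t b = svld1_u8 (p1, src + vl);
     svst1_u8 (p0, dst, a);                      // inactive lanes not stored
     svst1_u8 (p1, dst + vl, b);
   }
*/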