/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/*
 * Copyright (c) 2015 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

/* Register roles.  This file is run through the C preprocessor, so the
   names below are plain textual aliases for registers.

   dstin/src/count occupy x0/x1/x2, the first three argument registers;
   presumably they map onto memcpy (dst, src, n) -- confirm against the
   C declaration.  dstin (x0) is never written anywhere in this file.

   srcend/dstend are set to one past the last source/destination byte
   on entry.  dst is the working destination pointer of the large-copy
   path.

   A..D are 16-byte register pairs (_l/_h halves) used as copy buffers;
   the _lw/_hw names are the 32-bit views of the same registers.

   Aliasing to keep in mind when editing:
     - tmp1 and B_h are both x9.
     - E_l/E_h reuse src/count (x1/x2).
     - F_l/F_h reuse dst/srcend (x3/x4).
   Loading into one of these destroys the value it aliases, so every
   such load must come after the last use of the aliased value.  */
#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define A_hw	w7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	src
#define E_h	count
#define F_l	dst
#define F_h	srcend
#define tmp1	x9

/* L(name) expands to .Lname, i.e. a label carrying the .L prefix.  */
#define L(l) .L ## l

/* def_fn NAME [p2align=N]
   Switch to .text, align to 2^N bytes (N defaults to 0), declare NAME
   as a global symbol of type function and define the label NAME.  */
	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes which are fully unrolled. Large copies
   of more than 96 bytes align the destination and use an unrolled loop
   processing 64 bytes per iteration.
   Small and medium copies read all data before writing, allowing any
   kind of overlap, and memmove tailcalls memcpy for these cases as
   well as non-overlapping copies.
*/

/* memcpy
   In:       x0 = dstin, x1 = src, x2 = count (byte count; compared
             with unsigned conditions throughout).
   Out:      x0 is left untouched, so it still holds dstin on return.
   Clobbers: x1-x13 and the condition flags.
   Stack:    none; sp is never referenced.
   The entry point is aligned to 64 bytes (p2align=6).  */
def_fn memcpy p2align=6
	prfm	PLDL1KEEP, [src]
	add	srcend, src, count	/* srcend = src + count.  */
	add	dstend, dstin, count	/* dstend = dstin + count.  */
	cmp	count, 16
	b.ls	L(copy16)		/* count <= 16.  */
	cmp	count, 96
	b.hi	L(copy_long)		/* count > 96.  */

	/* Medium copies: 17..96 bytes.
	   tmp1 = count - 1 lies in 16..95, so its bits 6 and 5 select the
	   size class:
	     bit 6 set           -> count in 65..96, handled at L(copy96);
	     bit 5 set           -> count in 33..64, 32 bytes from each end;
	     neither             -> count in 17..32, 16 bytes from each end.
	   The blocks taken from the start and from the end may overlap.  */
	sub	tmp1, count, 1
	ldp	A_l, A_h, [src]
	tbnz	tmp1, 6, L(copy96)
	ldp	D_l, D_h, [srcend, -16]
	tbz	tmp1, 5, 1f
	/* The next load overwrites tmp1 (same register as B_h); tmp1 has
	   already been tested for the last time.  */
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
1:
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Small copies: 0..16 bytes.  */
L(copy16):
	/* 8..16 bytes: one 8-byte word from the start and one from the
	   end, both loaded before either is stored.  */
	cmp	count, 8
	b.lo	1f
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret
	.p2align 4
1:
	/* count < 8 here.  Bit 2 set means 4..7 bytes: one 4-byte word
	   from the start and one from the end.  */
	tbz	count, 2, 1f
	ldr	A_lw, [src]
	ldr	A_hw, [srcend, -4]
	str	A_lw, [dstin]
	str	A_hw, [dstend, -4]
	ret

	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
1:
	cbz	count, 2f		/* Nothing to copy.  */
	lsr	tmp1, count, 1		/* 0 for count==1, 1 for count==2 or 3.  */
	ldrb	A_lw, [src]		/* First byte.  */
	ldrb	A_hw, [srcend, -1]	/* Last byte.  */
	ldrb	B_lw, [src, tmp1]	/* Byte at index count/2.  */
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
2:	ret

	.p2align 4
	/* Reached with count in 65..96 (bit 6 of count - 1 set) and with
	   A_l/A_h already holding the first 16 source bytes.  Copy 64
	   bytes from the start and 32 bytes from the end.
	   E_l/E_h alias src/count and F_l/F_h alias dst/srcend, so the
	   src-relative loads are issued first and the load that
	   overwrites srcend is the last load of all.  */
L(copy96):
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [src, 32]
	ldp	D_l, D_h, [src, 48]
	ldp	E_l, E_h, [srcend, -32]
	ldp	F_l, F_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin, 32]
	stp	D_l, D_h, [dstin, 48]
	stp	E_l, E_h, [dstend, -32]
	stp	F_l, F_h, [dstend, -16]
	ret

	/* Align DST to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores. There are at least 96 bytes
	   to copy, so copy 16 bytes unaligned and then align. The loop
	   copies 64 bytes per iteration and prefetches one iteration ahead. */

	.p2align 4
L(copy_long):
	and	tmp1, dstin, 15		/* tmp1 = misalignment of dstin.  */
	bic	dst, dstin, 15		/* dst = dstin rounded down to 16.  */
	ldp	D_l, D_h, [src]		/* First 16 bytes, stored unaligned below.  */
	sub	src, src, tmp1		/* Move src back by the same amount as dst.  */
	add	count, count, tmp1	/* Count is now 16 too large. */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]	/* Overwrites tmp1 (x9); no longer needed.  */
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!	/* Pre-index: src advances by 64.  */
	/* count equals dstend - dst at this point and A..D hold the 64
	   bytes destined for dst + 16 .. dst + 79.  After the subtraction
	   b.ls is taken when at most 64 bytes remain beyond those.  */
	subs	count, count, 128 + 16	/* Test and readjust count. */
	b.ls	2f
	/* Each iteration stores the 64 bytes loaded by the previous one
	   and loads the next 64.  Both pointers advance by 64 through the
	   pre-indexed accesses at the end of the body.  */
1:
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	1b

	/* Write the last full set of 64 bytes. The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.
	   Each of A, B and C is stored to its dst slot before being
	   reloaded with tail data; E aliases src/count, which are no
	   longer needed.  */
2:
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.size	memcpy, . - memcpy