/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/*
 * Copyright (c) 2015 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses
 *
 */
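/* Overview:
 *
 * The routine dispatches on the requested length:
 *
 *   0..15 bytes   - overlapping 8/4/2/1-byte stores from both ends of the
 *                   buffer, selected by the low bits of count.
 *   16..96 bytes  - overlapping 16-byte (q-register) stores from the start
 *                   and from the end of the buffer.
 *   97+ bytes     - an unrolled loop of paired q-register stores; when the
 *                   fill value is zero and count >= 256, the DC ZVA path is
 *                   tried instead, zeroing one cache block per instruction.
 */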
#define dstin	x0
#define val	x1
#define valw	w1
#define count	x2
#define dst	x3
#define dstend	x4
#define tmp1	x5
#define tmp1w	w5
#define tmp2	x6
#define tmp2w	w6
#define zva_len x7
#define zva_lenw w7

#define L(l) .L ## l

	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

def_fn memset p2align=6

	dup	v0.16B, valw
	add	dstend, dstin, count

	cmp	count, 96
	b.hi	L(set_long)
	cmp	count, 16
	b.hs	L(set_medium)
	mov	val, v0.D[0]

	/* Set 0..15 bytes.  */
	tbz	count, 3, 1f
	str	val, [dstin]
	str	val, [dstend, -8]
	ret
	nop
1:	tbz	count, 2, 2f
	str	valw, [dstin]
	str	valw, [dstend, -4]
	ret
2:	cbz	count, 3f
	strb	valw, [dstin]
	tbz	count, 1, 3f
	strh	valw, [dstend, -2]
3:	ret

	/* Set 16..96 bytes.  */
L(set_medium):
	str	q0, [dstin]
	tbnz	count, 6, L(set96)
	str	q0, [dstend, -16]
	tbz	count, 5, 1f
	str	q0, [dstin, 16]
	str	q0, [dstend, -32]
1:	ret

	.p2align 4
	/* Set 64..96 bytes.  Write 64 bytes from the start and
	   32 bytes from the end.  */
L(set96):
	str	q0, [dstin, 16]
	stp	q0, q0, [dstin, 32]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 3
	nop
L(set_long):
	and	valw, valw, 255
	bic	dst, dstin, 15
	str	q0, [dstin]
	cmp	count, 256
	ccmp	valw, 0, 0, cs
	b.eq	L(try_zva)
L(no_zva):
	sub	count, dstend, dst	/* Count is 16 too large.  */
	add	dst, dst, 16
	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
1:	stp	q0, q0, [dst], 64
	stp	q0, q0, [dst, -32]
L(tail64):
	subs	count, count, 64
	b.hi	1b
2:	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret
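	/* DC ZVA zeroes an implementation-defined block of memory per
	   instruction and is the fast path for large zero-filling calls.
	   DCZID_EL0 describes it: if bit 4 (DZP) is set DC ZVA must not be
	   used, and bits 3:0 (BS) hold the log2 of the block size in 4-byte
	   words, so the block is 4 << BS bytes.  BS == 4 (64 bytes) and
	   BS == 5 (128 bytes) get dedicated loops below; any other size is
	   handled generically in L(zva_other).  */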
	.p2align 3
L(try_zva):
	mrs	tmp1, dczid_el0
	tbnz	tmp1w, 4, L(no_zva)
	and	tmp1w, tmp1w, 15
	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
	b.ne	L(zva_128)

	/* Write the first and last 64 byte aligned block using stp rather
	   than using DC ZVA.  This is faster on some cores.
	 */
L(zva_64):
	str	q0, [dst, 16]
	stp	q0, q0, [dst, 32]
	bic	dst, dst, 63
	stp	q0, q0, [dst, 64]
	stp	q0, q0, [dst, 96]
	sub	count, dstend, dst	/* Count is now 128 too large.  */
	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
	add	dst, dst, 128
	nop
1:	dc	zva, dst
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	1b
	stp	q0, q0, [dst, 0]
	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 3
L(zva_128):
	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
	b.ne	L(zva_other)

	str	q0, [dst, 16]
	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dst, 64]
	stp	q0, q0, [dst, 96]
	bic	dst, dst, 127
	sub	count, dstend, dst	/* Count is now 128 too large.  */
	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
	add	dst, dst, 128
1:	dc	zva, dst
	add	dst, dst, 128
	subs	count, count, 128
	b.hi	1b
	stp	q0, q0, [dstend, -128]
	stp	q0, q0, [dstend, -96]
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

L(zva_other):
	mov	tmp2w, 4
	lsl	zva_lenw, tmp2w, tmp1w
	add	tmp1, zva_len, 64	/* Max alignment bytes written.  */
	cmp	count, tmp1
	blo	L(no_zva)

	sub	tmp2, zva_len, 1
	add	tmp1, dst, zva_len
	add	dst, dst, 16
	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
	beq	2f
1:	stp	q0, q0, [dst], 64
	stp	q0, q0, [dst, -32]
	subs	count, count, 64
	b.hi	1b
2:	mov	dst, tmp1
	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
	subs	count, count, zva_len
	b.lo	4f
3:	dc	zva, dst
	add	dst, dst, zva_len
	subs	count, count, zva_len
	b.hs	3b
4:	add	count, count, zva_len
	b	L(tail64)

	.size	memset, . - memset
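/* A minimal C harness sketch for sanity-checking this routine against a
   byte-by-byte reference.  It assumes this file is assembled and linked so
   that it provides the memset the C code calls (e.g. built with -fno-builtin
   so the compiler does not expand memset inline); the sizes, offsets and
   fill values below are illustrative choices meant to touch the 0..15,
   16..96, unrolled-store and DC ZVA paths, not an exhaustive test.

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	int main (void)
	{
	  enum { BUF = 8192 };
	  size_t sizes[] = { 0, 1, 15, 16, 17, 96, 97, 255, 256, 4096 };
	  unsigned char vals[] = { 0, 0xa5 };	// 0 exercises the DC ZVA path
	  unsigned char *buf = malloc (BUF), *ref = malloc (BUF);

	  if (buf == NULL || ref == NULL)
	    return 1;
	  for (size_t i = 0; i < sizeof (sizes) / sizeof (sizes[0]); i++)
	    for (size_t off = 0; off < 16; off++)	// vary alignment
	      for (size_t v = 0; v < 2; v++)
		{
		  // Pre-fill both buffers with the same marker pattern.
		  for (size_t j = 0; j < BUF; j++)
		    buf[j] = ref[j] = (unsigned char) (j * 251);
		  // Fill with the routine under test and with a byte loop.
		  memset (buf + off, vals[v], sizes[i]);
		  for (size_t j = 0; j < sizes[i]; j++)
		    ref[off + j] = vals[v];
		  if (memcmp (buf, ref, BUF) != 0)
		    {
		      printf ("mismatch: size %zu off %zu val %u\n",
			      sizes[i], off, (unsigned) vals[v]);
		      return 1;
		    }
		}
	  puts ("ok");
	  free (buf);
	  free (ref);
	  return 0;
	}  */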