/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

#include <sys/elf_common.h>

#include <machine/asm.h>

#define dstin		x0
#define val		w1
#define count		x2
#define tmp1		x3
#define tmp1w		w3
#define tmp2		x4
#define tmp2w		w4
#define zva_len_x	x5
#define zva_len		w5
#define zva_bits_x	x6

#define A_l		x7
#define A_lw		w7
#define dst		x8
#define tmp3w		w9

ENTRY(memset)

	mov	dst, dstin		/* Preserve return value. */
	ands	A_lw, val, #255
#ifndef DONT_USE_DC
	b.eq	.Lzero_mem
#endif
	orr	A_lw, A_lw, A_lw, lsl #8
	orr	A_lw, A_lw, A_lw, lsl #16
	orr	A_l, A_l, A_l, lsl #32
.Ltail_maybe_long:
	cmp	count, #64
	b.ge	.Lnot_short
.Ltail_maybe_tiny:
	cmp	count, #15
	b.le	.Ltail15tiny
.Ltail63:
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	add	dst, dst, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	stp	A_l, A_l, [dst, #-48]
1:
	stp	A_l, A_l, [dst, #-32]
2:
	stp	A_l, A_l, [dst, #-16]

.Ltail15:
	and	count, count, #15
	add	dst, dst, count
	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
	ret

.Ltail15tiny:
	/* Set up to 15 bytes.  Does not assume earlier memory
	   being set. */
	tbz	count, #3, 1f
	str	A_l, [dst], #8
1:
	tbz	count, #2, 1f
	str	A_lw, [dst], #4
1:
	tbz	count, #1, 1f
	strh	A_lw, [dst], #2
1:
	tbz	count, #0, 1f
	strb	A_lw, [dst]
1:
	ret

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line. */
	.p2align 6
.Lnot_short:
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	2f
	/* Bring DST to 128-bit (16-byte) alignment.  We know that there's
	 * more than that to set, so we simply store 16 bytes and advance by
	 * the amount required to reach alignment. */
	sub	count, count, tmp2
	stp	A_l, A_l, [dst]
	add	dst, dst, tmp2
	/* There may be less than 63 bytes to go now. */
	cmp	count, #63
	b.le	.Ltail63
2:
	sub	dst, dst, #16		/* Pre-bias. */
	sub	count, count, #64
1:
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	stp	A_l, A_l, [dst, #48]
	stp	A_l, A_l, [dst, #64]!
	subs	count, count, #64
	b.ge	1b
	tst	count, #0x3f
	add	dst, dst, #16
	b.ne	.Ltail63
	ret

	/* For zeroing memory, check to see if we can use the ZVA feature to
	 * zero entire 'cache' lines. */
.Lzero_mem:
	mov	A_l, #0
	cmp	count, #63
	b.le	.Ltail_maybe_tiny
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	1f
	sub	count, count, tmp2
	stp	A_l, A_l, [dst]
	add	dst, dst, tmp2
	cmp	count, #63
	b.le	.Ltail63
1:
	/* For zeroing small amounts of memory, it's not worth setting up
	 * the line-clear code. */
	cmp	count, #128
	b.lt	.Lnot_short

	adrp	tmp2, dczva_line_size
	add	tmp2, tmp2, :lo12:dczva_line_size
	ldr	zva_len, [tmp2]
	cbz	zva_len, .Lnot_short

.Lzero_by_line:
	/* Compute how far we need to go to become suitably aligned.  We're
	 * already at quad-word alignment. */
	cmp	count, zva_len_x
	b.lt	.Lnot_short		/* Not enough to reach alignment. */
	sub	zva_bits_x, zva_len_x, #1
	neg	tmp2, dst
	ands	tmp2, tmp2, zva_bits_x
	b.eq	1f			/* Already aligned. */
	/* Not aligned, check that there's enough to copy after alignment. */
	sub	tmp1, count, tmp2
	cmp	tmp1, #64
	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
	b.lt	.Lnot_short
	/* We know that there's at least 64 bytes to zero and that it's safe
	 * to overrun by 64 bytes. */
	mov	count, tmp1
2:
	stp	A_l, A_l, [dst]
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	subs	tmp2, tmp2, #64
	stp	A_l, A_l, [dst, #48]
	add	dst, dst, #64
	b.ge	2b
	/* We've overrun a bit, so adjust dst downwards. */
	add	dst, dst, tmp2
1:
	sub	count, count, zva_len_x
3:
	dc	zva, dst
	add	dst, dst, zva_len_x
	subs	count, count, zva_len_x
	b.ge	3b
	ands	count, count, zva_bits_x
	b.ne	.Ltail_maybe_long
	ret
END(memset)

GNU_PROPERTY_AARCH64_FEATURE_1_NOTE(GNU_PROPERTY_AARCH64_FEATURE_1_VAL)