/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Fill in the buffer with character c (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - buf
 *	x1 - c
 *	x2 - n
 * Returns:
 *	x0 - buf
 *
 * Registers written: x3-x9 and the condition flags. x0 is never modified;
 * all stores go through the working pointer 'dst' (x8).
 *
 * NOTE(review): the length checks below use signed condition codes
 * (b.ge/b.lt/b.le), so a length with bit 63 set is not handled as a huge
 * unsigned size. Presumably no caller passes such a value - confirm.
 */

dstin		.req	x0
val_x		.req	x1
val		.req	w1
count		.req	x2
tmp1		.req	x3
tmp1w		.req	w3
tmp2		.req	x4
tmp2w		.req	w4
zva_len_x	.req	x5
zva_len		.req	w5
zva_bits_x	.req	x6

A_l		.req	x7
A_lw		.req	w7
dst		.req	x8
tmp3w		.req	w9
tmp3		.req	x9

SYM_FUNC_START_LOCAL(__pi_memset_generic)
	mov	dst, dstin		/* Preserve return value. */
	/* Replicate the low byte of c into all eight bytes of A_l. */
	and	A_lw, val, #255
	orr	A_lw, A_lw, A_lw, lsl #8
	orr	A_lw, A_lw, A_lw, lsl #16
	orr	A_l, A_l, A_l, lsl #32

	cmp	count, #15
	b.hi	.Lover16_proc
	/*
	 * 0..15 bytes: each set bit of count selects one store of that size.
	 * Any of these stores may be unaligned.
	 */
	tbz	count, #3, 1f
	str	A_l, [dst], #8
1:
	tbz	count, #2, 2f
	str	A_lw, [dst], #4
2:
	tbz	count, #1, 3f
	strh	A_lw, [dst], #2
3:
	tbz	count, #0, 4f
	strb	A_lw, [dst]
4:
	ret

.Lover16_proc:
	/* tmp2 = number of bytes up to the next 16-byte boundary (0..15). */
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	.Laligned
	/*
	 * count is at least 16 here, so one (possibly unaligned) stp can
	 * cover the first 16 bytes. Then advance dst to the 16-byte boundary;
	 * the bytes between the boundary and dst + 16 get written again by
	 * the code that follows.
	 */
	stp	A_l, A_l, [dst]		/* Unaligned store. */
	/* Make dst 16-byte aligned. */
	sub	count, count, tmp2
	add	dst, dst, tmp2

.Laligned:
	cbz	A_l, .Lzero_mem		/* Fill value is zero: consider DC ZVA. */

.Ltail_maybe_long:
	cmp	count, #64
	b.ge	.Lnot_short
.Ltail63:
	/*
	 * Only the low six bits of count are used from here on, so this is
	 * also entered with the negative, pre-biased count left by the loops.
	 * Bits 5:4 give the number of whole 16-byte chunks: 0x30 falls
	 * through all three stores, 0x20 enters at 1:, 0x10 enters at 2:.
	 */
	ands	tmp1, count, #0x30
	b.eq	3f
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	stp	A_l, A_l, [dst], #16
1:
	stp	A_l, A_l, [dst], #16
2:
	stp	A_l, A_l, [dst], #16
	/*
	 * Fewer than 16 bytes remain. Write the final 16 bytes of the buffer
	 * with a single stp ending exactly at the end of the buffer. Some
	 * bytes are written twice and the access may be unaligned.
	 */
3:
	ands	count, count, #15
	cbz	count, 4f
	add	dst, dst, count
	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
4:
	ret

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lnot_short:
	/*
	 * dst is biased by -16 so that the last store of each iteration can
	 * use pre-index writeback to advance dst by 64. count is biased by
	 * -64 so that the loop ends when it goes negative.
	 */
	sub	dst, dst, #16		/* Pre-bias. */
	sub	count, count, #64
1:
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	stp	A_l, A_l, [dst, #48]
	stp	A_l, A_l, [dst, #64]!
	subs	count, count, #64
	b.ge	1b
	/* The low six bits of count hold the bytes still to be written. */
	tst	count, #0x3f
	add	dst, dst, #16		/* Undo the pre-bias (flags untouched). */
	b.ne	.Ltail63
.Lexitfunc:
	ret

	/*
	 * For zeroing memory, check to see if we can use the ZVA feature to
	 * zero entire 'cache' lines.
	 */
.Lzero_mem:
	cmp	count, #63
	b.le	.Ltail63
	/*
	 * For zeroing small amounts of memory, it's not worth setting up
	 * the line-clear code.
	 */
	cmp	count, #128
	b.lt	.Lnot_short		/* Past here count is at least 128. */

	mrs	tmp1, dczid_el0
	tbnz	tmp1, #4, .Lnot_short	/* Bit 4 set: do not use DC ZVA. */
	/* zva_len = 4 << DCZID_EL0[3:0], the ZVA block size in bytes. */
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved. */
	lsl	zva_len, tmp3w, zva_len

	ands	tmp3w, zva_len, #63
	/*
	 * Ensure zva_len is not less than 64.
	 * It is not meaningful to use ZVA if the block size is less than 64.
	 */
	b.ne	.Lnot_short
.Lzero_by_line:
	/*
	 * Compute how far we need to go to become suitably aligned. We're
	 * already at quad-word alignment.
	 */
	cmp	count, zva_len_x
	b.lt	.Lnot_short		/* Not enough to reach alignment. */
	sub	zva_bits_x, zva_len_x, #1
	neg	tmp2, dst
	ands	tmp2, tmp2, zva_bits_x	/* tmp2 = bytes up to a ZVA boundary. */
	b.eq	2f			/* Already aligned. */
	/* Not aligned, check that there's enough to zero after alignment. */
	sub	tmp1, count, tmp2
	/*
	 * Guarantee that the length left for ZVA after aligning is at least
	 * 64 bytes and at least one ZVA block, so that neither the alignment
	 * loop nor the DC ZVA loop at 2: runs past the end of the buffer.
	 * If tmp1 < 64 the ccmp loads NZCV = 0b1000 instead of comparing,
	 * which makes the b.lt below taken.
	 */
	cmp	tmp1, #64
	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
	b.lt	.Lnot_short
	/*
	 * We know that there's at least 64 bytes to zero and that it's safe
	 * to overrun by 64 bytes.
	 */
	mov	count, tmp1
1:
	stp	A_l, A_l, [dst]
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	subs	tmp2, tmp2, #64
	stp	A_l, A_l, [dst, #48]
	add	dst, dst, #64
	b.ge	1b
	/* We've overrun a bit, so adjust dst downwards (tmp2 is negative). */
	add	dst, dst, tmp2
2:
	sub	count, count, zva_len_x	/* Pre-bias: loop ends when negative. */
3:
	dc	zva, dst
	add	dst, dst, zva_len_x
	subs	count, count, zva_len_x
	b.ge	3b
	/* Recover the bytes left over after the last whole ZVA block. */
	ands	count, count, zva_bits_x
	b.ne	.Ltail_maybe_long
	ret
SYM_FUNC_END(__pi_memset_generic)

#ifdef CONFIG_AS_HAS_MOPS
	.arch_extension mops
SYM_FUNC_START(__pi_memset)
/*
 * Unless the ARM64_HAS_MOPS alternative is applied, branch to the generic
 * implementation; when it is applied the branch slot is filled with nops
 * and execution falls through to the set-memory instruction sequence.
 */
alternative_if_not ARM64_HAS_MOPS
	b	__pi_memset_generic
alternative_else_nop_endif

	/* The set instructions write back their pointer; keep x0 intact. */
	mov	dst, dstin
	setp	[dst]!, count!, val_x
	setm	[dst]!, count!, val_x
	sete	[dst]!, count!, val_x
	ret
SYM_FUNC_END(__pi_memset)
#else
SYM_FUNC_ALIAS(__pi_memset, __pi_memset_generic)
#endif

SYM_FUNC_ALIAS(__memset, __pi_memset)
EXPORT_SYMBOL(__memset)

SYM_FUNC_ALIAS_WEAK(memset, __pi_memset)
EXPORT_SYMBOL(memset)