1/* Copyright (c) 2013-2015, Linaro Limited 2 All rights reserved. 3 4 Redistribution and use in source and binary forms, with or without 5 modification, are permitted provided that the following conditions are met: 6 * Redistributions of source code must retain the above copyright 7 notice, this list of conditions and the following disclaimer. 8 * Redistributions in binary form must reproduce the above copyright 9 notice, this list of conditions and the following disclaimer in the 10 documentation and/or other materials provided with the distribution. 11 * Neither the name of the Linaro nor the 12 names of its contributors may be used to endorse or promote products 13 derived from this software without specific prior written permission. 14 15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 19 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 20 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 21 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ 26 27/* Assumptions: 28 * 29 * ARMv8-a, AArch64, unaligned accesses, min page size 4k. 30 */ 31 32/* To test the page crossing code path more thoroughly, compile with 33 -DTEST_PAGE_CROSS - this will force all calls through the slower 34 entry path. This option is not intended for production use. */ 35 36/* Arguments and results. */ 37#define srcin x0 38#define len x0 39 40/* Locals and temporaries. */ 41#define src x1 42#define data1 x2 43#define data2 x3 44#define has_nul1 x4 45#define has_nul2 x5 46#define tmp1 x4 47#define tmp2 x5 48#define tmp3 x6 49#define tmp4 x7 50#define zeroones x8 51 52#define L(l) .L ## l 53 54 .macro def_fn f p2align=0 55 .text 56 .p2align \p2align 57 .global \f 58 .type \f, %function 59\f: 60 .endm 61 62 /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 63 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and 64 can be done in parallel across the entire word. A faster check 65 (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives 66 false hits for characters 129..255. */ 67 68#define REP8_01 0x0101010101010101 69#define REP8_7f 0x7f7f7f7f7f7f7f7f 70#define REP8_80 0x8080808080808080 71 72#ifdef TEST_PAGE_CROSS 73# define MIN_PAGE_SIZE 15 74#else 75# define MIN_PAGE_SIZE 4096 76#endif 77 78 /* Since strings are short on average, we check the first 16 bytes 79 of the string for a NUL character. In order to do an unaligned ldp 80 safely we have to do a page cross check first. If there is a NUL 81 byte we calculate the length from the 2 8-byte words using 82 conditional select to reduce branch mispredictions (it is unlikely 83 strlen will be repeatedly called on strings with the same length). 84 85 If the string is longer than 16 bytes, we align src so don't need 86 further page cross checks, and process 32 bytes per iteration 87 using the fast NUL check. If we encounter non-ASCII characters, 88 fallback to a second loop using the full NUL check. 89 90 If the page cross check fails, we read 16 bytes from an aligned 91 address, remove any characters before the string, and continue 92 in the main loop using aligned loads. Since strings crossing a 93 page in the first 16 bytes are rare (probability of 94 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized. 95 96 AArch64 systems have a minimum page size of 4k. We don't bother 97 checking for larger page sizes - the cost of setting up the correct 98 page size is just not worth the extra gain from a small reduction in 99 the cases taking the slow path. Note that we only care about 100 whether the first fetch, which may be misaligned, crosses a page 101 boundary. */ 102 103def_fn strlen p2align=6 104 and tmp1, srcin, MIN_PAGE_SIZE - 1 105 mov zeroones, REP8_01 106 cmp tmp1, MIN_PAGE_SIZE - 16 107 b.gt L(page_cross) 108 ldp data1, data2, [srcin] 109#ifdef __AARCH64EB__ 110 /* For big-endian, carry propagation (if the final byte in the 111 string is 0x01) means we cannot use has_nul1/2 directly. 112 Since we expect strings to be small and early-exit, 113 byte-swap the data now so has_null1/2 will be correct. */ 114 rev data1, data1 115 rev data2, data2 116#endif 117 sub tmp1, data1, zeroones 118 orr tmp2, data1, REP8_7f 119 sub tmp3, data2, zeroones 120 orr tmp4, data2, REP8_7f 121 bics has_nul1, tmp1, tmp2 122 bic has_nul2, tmp3, tmp4 123 ccmp has_nul2, 0, 0, eq 124 beq L(main_loop_entry) 125 126 /* Enter with C = has_nul1 == 0. */ 127 csel has_nul1, has_nul1, has_nul2, cc 128 mov len, 8 129 rev has_nul1, has_nul1 130 clz tmp1, has_nul1 131 csel len, xzr, len, cc 132 add len, len, tmp1, lsr 3 133 ret 134 135 /* The inner loop processes 32 bytes per iteration and uses the fast 136 NUL check. If we encounter non-ASCII characters, use a second 137 loop with the accurate NUL check. */ 138 .p2align 4 139L(main_loop_entry): 140 bic src, srcin, 15 141 sub src, src, 16 142L(main_loop): 143 ldp data1, data2, [src, 32]! 144.Lpage_cross_entry: 145 sub tmp1, data1, zeroones 146 sub tmp3, data2, zeroones 147 orr tmp2, tmp1, tmp3 148 tst tmp2, zeroones, lsl 7 149 bne 1f 150 ldp data1, data2, [src, 16] 151 sub tmp1, data1, zeroones 152 sub tmp3, data2, zeroones 153 orr tmp2, tmp1, tmp3 154 tst tmp2, zeroones, lsl 7 155 beq L(main_loop) 156 add src, src, 16 1571: 158 /* The fast check failed, so do the slower, accurate NUL check. */ 159 orr tmp2, data1, REP8_7f 160 orr tmp4, data2, REP8_7f 161 bics has_nul1, tmp1, tmp2 162 bic has_nul2, tmp3, tmp4 163 ccmp has_nul2, 0, 0, eq 164 beq L(nonascii_loop) 165 166 /* Enter with C = has_nul1 == 0. */ 167L(tail): 168#ifdef __AARCH64EB__ 169 /* For big-endian, carry propagation (if the final byte in the 170 string is 0x01) means we cannot use has_nul1/2 directly. The 171 easiest way to get the correct byte is to byte-swap the data 172 and calculate the syndrome a second time. */ 173 csel data1, data1, data2, cc 174 rev data1, data1 175 sub tmp1, data1, zeroones 176 orr tmp2, data1, REP8_7f 177 bic has_nul1, tmp1, tmp2 178#else 179 csel has_nul1, has_nul1, has_nul2, cc 180#endif 181 sub len, src, srcin 182 rev has_nul1, has_nul1 183 add tmp2, len, 8 184 clz tmp1, has_nul1 185 csel len, len, tmp2, cc 186 add len, len, tmp1, lsr 3 187 ret 188 189L(nonascii_loop): 190 ldp data1, data2, [src, 16]! 191 sub tmp1, data1, zeroones 192 orr tmp2, data1, REP8_7f 193 sub tmp3, data2, zeroones 194 orr tmp4, data2, REP8_7f 195 bics has_nul1, tmp1, tmp2 196 bic has_nul2, tmp3, tmp4 197 ccmp has_nul2, 0, 0, eq 198 bne L(tail) 199 ldp data1, data2, [src, 16]! 200 sub tmp1, data1, zeroones 201 orr tmp2, data1, REP8_7f 202 sub tmp3, data2, zeroones 203 orr tmp4, data2, REP8_7f 204 bics has_nul1, tmp1, tmp2 205 bic has_nul2, tmp3, tmp4 206 ccmp has_nul2, 0, 0, eq 207 beq L(nonascii_loop) 208 b L(tail) 209 210 /* Load 16 bytes from [srcin & ~15] and force the bytes that precede 211 srcin to 0x7f, so we ignore any NUL bytes before the string. 212 Then continue in the aligned loop. */ 213L(page_cross): 214 bic src, srcin, 15 215 ldp data1, data2, [src] 216 lsl tmp1, srcin, 3 217 mov tmp4, -1 218#ifdef __AARCH64EB__ 219 /* Big-endian. Early bytes are at MSB. */ 220 lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ 221#else 222 /* Little-endian. Early bytes are at LSB. */ 223 lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ 224#endif 225 orr tmp1, tmp1, REP8_80 226 orn data1, data1, tmp1 227 orn tmp2, data2, tmp1 228 tst srcin, 8 229 csel data1, data1, tmp4, eq 230 csel data2, data2, tmp2, eq 231 b L(page_cross_entry) 232 233 .size strlen, . - strlen 234