/*
 * strlen - calculate the length of a string.
 *
 * Copyright (c) 2020-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 * Not MTE compatible (reads up to 32 bytes past the NUL, within the page).
 */

#include "asmdefs.h"

/* Register roles for the scalar path (first 32 bytes of the string).
   Note len aliases srcin, and tmp1/tmp2 alias has_nul1/has_nul2 —
   their live ranges do not overlap.  */
#define srcin		x0	/* in:  pointer to the string */
#define len		x0	/* out: computed length */

#define src		x1
#define data1		x2
#define data2		x3
#define has_nul1	x4
#define has_nul2	x5
#define tmp1		x4
#define tmp2		x5
#define tmp3		x6
#define tmp4		x7
#define zeroones	x8

/* Register roles for the SIMD main loop and the page-cross path.
   tmp/tmpw, synd/syndw and shift reuse scalar-path registers.  */
#define maskv		v0
#define maskd		d0
#define dataq1		q1
#define dataq2		q2
#define datav1		v1
#define datav2		v2
#define tmp		x2
#define tmpw		w2
#define synd		x3
#define syndw		w3
#define shift		x4

/* For the first 32 bytes, NUL detection works on the principle that
   (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
   byte is zero, and can be done in parallel across the entire word.  */

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* To test the page crossing code path more thoroughly, compile with
   -DTEST_PAGE_CROSS - this will force all calls through the slower
   entry path.  This option is not intended for production use.  */

#ifdef TEST_PAGE_CROSS
# define MIN_PAGE_SIZE 32
#else
# define MIN_PAGE_SIZE 4096
#endif

/* Core algorithm:

   Since strings are short on average, we check the first 32 bytes of the
   string for a NUL character without aligning the string.  In order to use
   unaligned loads safely we must do a page cross check first.

   If there is a NUL byte we calculate the length from the 2 8-byte words
   using conditional select to reduce branch mispredictions (it is unlikely
   strlen will be repeatedly called on strings with the same length).

   If the string is longer than 32 bytes, align src so we don't need further
   page cross checks, and process 32 bytes per iteration using a fast SIMD
   loop.
   If the page cross check fails, we read 32 bytes from an aligned address,
   and ignore any characters before the string.  If it contains a NUL
   character, return the length, if not, continue in the main loop.  */

/* size_t __strlen_aarch64 (const char *s)
   In:      x0 (srcin) = s
   Out:     x0 (len)   = length of s, excluding the terminating NUL
   Clobbers x1-x8, v0-v2, NZCV.  */
ENTRY (__strlen_aarch64)
	/* If reading 32 bytes from srcin cannot cross a page boundary,
	   the unaligned loads below are safe; otherwise take the slow
	   aligned entry path.  */
	and	tmp1, srcin, MIN_PAGE_SIZE - 1
	cmp	tmp1, MIN_PAGE_SIZE - 32
	b.hi	L(page_cross)

	/* Look for a NUL byte in the first 16 bytes.  */
	ldp	data1, data2, [srcin]
	mov	zeroones, REP8_01

#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul1/2 directly.
	   Since we expect strings to be small and early-exit,
	   byte-swap the data now so has_nul1/2 will be correct.  */
	rev	data1, data1
	rev	data2, data2
#endif
	/* (X - 1) & ~(X | 0x7f) is non-zero iff X contains a NUL byte.  */
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	b.eq	L(bytes16_31)

	/* Find the exact offset of the first NUL byte in the first 16 bytes
	   from the string start.  Enter with C = has_nul1 == 0.  */
	csel	has_nul1, has_nul1, has_nul2, cc	/* pick the word with the NUL */
	mov	len, 8
	rev	has_nul1, has_nul1
	csel	len, xzr, len, cc			/* base offset: 0 or 8 */
	clz	tmp1, has_nul1
	add	len, len, tmp1, lsr 3			/* + leading zero bits / 8 */
	ret

	/* Look for a NUL byte at offset 16..31 in the string.  */
L(bytes16_31):
	ldp	data1, data2, [srcin, 16]
#ifdef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	b.eq	L(loop_entry)

	/* Find the exact offset of the first NUL byte at offset 16..31 from
	   the string start.  Enter with C = has_nul1 == 0.  */
	csel	has_nul1, has_nul1, has_nul2, cc
	mov	len, 24
	rev	has_nul1, has_nul1
	mov	tmp3, 16
	clz	tmp1, has_nul1
	csel	len, tmp3, len, cc			/* base offset: 16 or 24 */
	add	len, len, tmp1, lsr 3
	ret

	/* NOTE(review): padding before loop_entry — presumably keeps the
	   hot loop's entry placement favourable; kept as-is.  */
	nop
L(loop_entry):
	bic	src, srcin, 31		/* align src down to 32 bytes */

	.p2align 5
L(loop):
	/* Process 32 bytes per iteration.  A byte-wise minimum of the two
	   vectors is zero iff either contains a NUL byte.  */
	ldp	dataq1, dataq2, [src, 32]!
	uminp	maskv.16b, datav1.16b, datav2.16b
	uminp	maskv.16b, maskv.16b, maskv.16b
	cmeq	maskv.8b, maskv.8b, 0
	fmov	synd, maskd
	cbz	synd, L(loop)

	/* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */
	cmeq	maskv.16b, datav1.16b, 0
	sub	len, src, srcin
	cbnz	syndw, 1f
	cmeq	maskv.16b, datav2.16b, 0
	add	len, len, 16
1:
	/* Generate a bitmask and compute correct byte offset: shrn narrows
	   to 4 bits per input byte, so the (bit-reversed on LE) leading
	   zero count divided by 4 is the byte index of the NUL.  */
	shrn	maskv.8b, maskv.8h, 4
	fmov	synd, maskd
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	tmp, synd
	add	len, len, tmp, lsr 2
	ret

L(page_cross):
	/* Read 32 bytes from the enclosing 32-byte-aligned block, which
	   cannot fault, then discard any bytes before the string.  */
	bic	src, srcin, 31
	/* Build a per-lane mask 0x03/0x0c/0x30/0xc0 (2 bits per byte) so
	   the addp reductions below yield a 64-bit syndrome covering all
	   32 bytes.  */
	mov	tmpw, 0x0c03
	movk	tmpw, 0xc030, lsl 16
	ld1	{datav1.16b, datav2.16b}, [src]
	dup	maskv.4s, tmpw
	cmeq	datav1.16b, datav1.16b, 0
	cmeq	datav2.16b, datav2.16b, 0
	and	datav1.16b, datav1.16b, maskv.16b
	and	datav2.16b, datav2.16b, maskv.16b
	addp	maskv.16b, datav1.16b, datav2.16b
	addp	maskv.16b, maskv.16b, maskv.16b
	fmov	synd, maskd
	/* Shift out the 2-bit lanes for bytes before srcin: lsr uses the
	   shift amount modulo 64, so 2*srcin reduces to 2*(srcin & 31).  */
	lsl	shift, srcin, 1
	lsr	synd, synd, shift
	cbz	synd, L(loop)	/* no NUL in the first block: enter main loop */

	rbit	synd, synd
	clz	len, synd
	lsr	len, len, 1	/* 2 syndrome bits per byte */
	ret

END (__strlen_aarch64)