/*
 * strlen - calculate the length of a string.
 *
 * Copyright (c) 2020-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 * Not MTE compatible (the aligned paths read whole 32-byte blocks and
 * may therefore read bytes beyond the terminating NUL).
 */

#include "asmdefs.h"

/* Register aliases - scalar path (first 32 bytes of the string).  */
#define srcin		x0	/* Argument: const char *s.  */
#define len		x0	/* Result: the length; aliases srcin.  */

#define src		x1	/* Aligned cursor for the SIMD loop.  */
#define data1		x2
#define data2		x3
#define has_nul1	x4	/* Non-zero iff data1 contains a NUL byte.  */
#define has_nul2	x5	/* Non-zero iff data2 contains a NUL byte.  */
#define tmp1		x4	/* Aliases has_nul1.  */
#define tmp2		x5	/* Aliases has_nul2.  */
#define tmp3		x6
#define tmp4		x7
#define zeroones	x8	/* Holds REP8_01.  */

/* Register aliases - SIMD loop and page-cross path.  Only x0-x8 and
   v0-v2 are used in total, all caller-saved under AAPCS64.  */
#define maskv		v0
#define maskd		d0	/* Low 64 bits of maskv (the syndrome).  */
#define dataq1		q1
#define dataq2		q2
#define datav1		v1
#define datav2		v2
#define tmp		x2
#define tmpw		w2
#define synd		x3
#define syndw		w3
#define shift		x4

/* For the first 32 bytes, NUL detection works on the principle that
   (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
   byte is zero, and can be done in parallel across the entire word.  */

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* To test the page crossing code path more thoroughly, compile with
   -DTEST_PAGE_CROSS - this will force all calls through the slower
   entry path.  This option is not intended for production use.  */

#ifdef TEST_PAGE_CROSS
# define MIN_PAGE_SIZE 32
#else
# define MIN_PAGE_SIZE 4096
#endif

/* Core algorithm:

   Since strings are short on average, we check the first 32 bytes of the
   string for a NUL character without aligning the string.  In order to use
   unaligned loads safely we must do a page cross check first.

   If there is a NUL byte we calculate the length from the 2 8-byte words
   using conditional select to reduce branch mispredictions (it is unlikely
   strlen will be repeatedly called on strings with the same length).

   If the string is longer than 32 bytes, align src so we don't need further
   page cross checks, and process 32 bytes per iteration using a fast SIMD
   loop.

   If the page cross check fails, we read 32 bytes from an aligned address,
   and ignore any characters before the string.  If it contains a NUL
   character, return the length, if not, continue in the main loop.  */

ENTRY (__strlen_aarch64)
	PTR_ARG (0)
	/* If srcin is more than 32 bytes from the end of a minimum-size
	   page, the first 32 bytes can be read unaligned without faulting.  */
	and	tmp1, srcin, MIN_PAGE_SIZE - 1
	cmp	tmp1, MIN_PAGE_SIZE - 32
	b.hi	L(page_cross)

	/* Look for a NUL byte in the first 16 bytes.  */
	ldp	data1, data2, [srcin]
	mov	zeroones, REP8_01

#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul1/2 directly.
	   Since we expect strings to be small and early-exit,
	   byte-swap the data now so has_nul1/2 will be correct.  */
	rev	data1, data1
	rev	data2, data2
#endif
	/* has_nulN = (dataN - REP8_01) & ~(dataN | REP8_7f): each byte is
	   0x80 where dataN had a NUL byte, 0x00 otherwise.  */
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	/* If has_nul1 == 0, compare has_nul2 with 0 (setting C);
	   otherwise force flags to 0 so the branch falls through.  */
	ccmp	has_nul2, 0, 0, eq
	b.eq	L(bytes16_31)

	/* Find the exact offset of the first NUL byte in the first 16 bytes
	   from the string start.  Enter with C = has_nul1 == 0.  */
	csel	has_nul1, has_nul1, has_nul2, cc
	mov	len, 8
	rev	has_nul1, has_nul1	/* Make clz count from the low end.  */
	csel	len, xzr, len, cc	/* Base offset: 0 or 8.  */
	clz	tmp1, has_nul1		/* Bit index of the NUL marker.  */
	add	len, len, tmp1, lsr 3	/* Bits -> bytes.  */
	ret

	/* Look for a NUL byte at offset 16..31 in the string.  */
L(bytes16_31):
	ldp	data1, data2, [srcin, 16]
#ifdef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	b.eq	L(loop_entry)

	/* Find the exact offset of the first NUL byte at offset 16..31 from
	   the string start.  Enter with C = has_nul1 == 0.  */
	csel	has_nul1, has_nul1, has_nul2, cc
	mov	len, 24
	rev	has_nul1, has_nul1
	mov	tmp3, 16
	clz	tmp1, has_nul1
	csel	len, tmp3, len, cc	/* Base offset: 16 or 24.  */
	add	len, len, tmp1, lsr 3	/* Bits -> bytes.  */
	ret

	/* NOTE(review): padding nop - presumably tunes the code layout
	   ahead of the aligned loop; confirm against benchmarks before
	   removing.  */
	nop
L(loop_entry):
	/* Align down to 32 bytes.  The loop pre-increments src by 32, so
	   the first block it loads starts at or before srcin + 32; since
	   bytes srcin..srcin+31 were already checked, nothing is skipped.  */
	bic	src, srcin, 31

	.p2align 5
L(loop):
	ldp	dataq1, dataq2, [src, 32]!	/* Pre-indexed: next 32 bytes.  */
	uminp	maskv.16b, datav1.16b, datav2.16b	/* 32 -> 16 bytes; a lane
						   is 0 iff some source byte was 0.  */
	uminp	maskv.16b, maskv.16b, maskv.16b	/* 16 -> 8 bytes.  */
	cmeq	maskv.8b, maskv.8b, 0
	fmov	synd, maskd		/* 64-bit syndrome: non-zero iff NUL seen.  */
	cbz	synd, L(loop)

	/* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */
	cmeq	maskv.16b, datav1.16b, 0
	sub	len, src, srcin		/* Offset of this 32-byte block.  */
	cbnz	syndw, 1f
	cmeq	maskv.16b, datav2.16b, 0
	add	len, len, 16		/* NUL is in the second 16 bytes.  */
1:
	/* Generate a bitmask and compute correct byte offset.  */
	shrn	maskv.8b, maskv.8h, 4	/* Narrow the compare mask to 4 bits
					   per input byte.  */
	fmov	synd, maskd
#ifndef __AARCH64EB__
	rbit	synd, synd		/* Little-endian: reverse so clz finds
					   the lowest (first) marked byte.  */
#endif
	clz	tmp, synd
	add	len, len, tmp, lsr 2	/* 4 syndrome bits per byte.  */
	ret

L(page_cross):
	bic	src, srcin, 31		/* Read from the aligned 32-byte block
					   containing the string start.  */
	/* Each 32-bit mask lane is 0xc0300c03: together with the two addp
	   reductions below this assigns each of the 32 input bytes its own
	   2 bits of the 64-bit syndrome (cmeq lanes are 0x00/0xff, so the
	   ANDed terms never carry into each other when added).  */
	mov	tmpw, 0x0c03
	movk	tmpw, 0xc030, lsl 16
	ld1	{datav1.16b, datav2.16b}, [src]
	dup	maskv.4s, tmpw
	cmeq	datav1.16b, datav1.16b, 0
	cmeq	datav2.16b, datav2.16b, 0
	and	datav1.16b, datav1.16b, maskv.16b
	and	datav2.16b, datav2.16b, maskv.16b
	addp	maskv.16b, datav1.16b, datav2.16b
	addp	maskv.16b, maskv.16b, maskv.16b
	fmov	synd, maskd
	/* 2 syndrome bits per byte: shift out the bits belonging to bytes
	   before the string start (lsr by register uses shift mod 64, so
	   only srcin's block offset matters).  */
	lsl	shift, srcin, 1
	lsr	synd, synd, shift
	cbz	synd, L(loop)		/* No NUL before the block end.  */

	rbit	synd, synd
	clz	len, synd
	lsr	len, len, 1		/* 2 bits per byte -> byte count.  */
	ret

END (__strlen_aarch64)