1/* strnlen - calculate the length of a string with limit. 2 3 Copyright (c) 2013, Linaro Limited 4 All rights reserved. 5 6 Redistribution and use in source and binary forms, with or without 7 modification, are permitted provided that the following conditions are met: 8 * Redistributions of source code must retain the above copyright 9 notice, this list of conditions and the following disclaimer. 10 * Redistributions in binary form must reproduce the above copyright 11 notice, this list of conditions and the following disclaimer in the 12 documentation and/or other materials provided with the distribution. 13 * Neither the name of the Linaro nor the 14 names of its contributors may be used to endorse or promote products 15 derived from this software without specific prior written permission. 16 17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ 28 29/* Assumptions: 30 * 31 * ARMv8-a, AArch64 32 */ 33 34/* Arguments and results. */ 35#define srcin x0 36#define len x0 37#define limit x1 38 39/* Locals and temporaries. */ 40#define src x2 41#define data1 x3 42#define data2 x4 43#define data2a x5 44#define has_nul1 x6 45#define has_nul2 x7 46#define tmp1 x8 47#define tmp2 x9 48#define tmp3 x10 49#define tmp4 x11 50#define zeroones x12 51#define pos x13 52#define limit_wd x14 53 54 .macro def_fn f p2align=0 55 .text 56 .p2align \p2align 57 .global \f 58 .type \f, %function 59\f: 60 .endm 61 62#define REP8_01 0x0101010101010101 63#define REP8_7f 0x7f7f7f7f7f7f7f7f 64#define REP8_80 0x8080808080808080 65 66 .text 67 .p2align 6 68.Lstart: 69 /* Pre-pad to ensure critical loop begins an icache line. */ 70 .rep 7 71 nop 72 .endr 73 /* Put this code here to avoid wasting more space with pre-padding. */ 74.Lhit_limit: 75 mov len, limit 76 ret 77 78def_fn strnlen 79 cbz limit, .Lhit_limit 80 mov zeroones, #REP8_01 81 bic src, srcin, #15 82 ands tmp1, srcin, #15 83 b.ne .Lmisaligned 84 /* Calculate the number of full and partial words -1. */ 85 sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */ 86 lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */ 87 88 /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 89 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and 90 can be done in parallel across the entire word. */ 91 /* The inner loop deals with two Dwords at a time. This has a 92 slightly higher start-up cost, but we should win quite quickly, 93 especially on cores with a high number of issue slots per 94 cycle, as we get much better parallelism out of the operations. */ 95 96 /* Start of critial section -- keep to one 64Byte cache line. */ 97.Lloop: 98 ldp data1, data2, [src], #16 99.Lrealigned: 100 sub tmp1, data1, zeroones 101 orr tmp2, data1, #REP8_7f 102 sub tmp3, data2, zeroones 103 orr tmp4, data2, #REP8_7f 104 bic has_nul1, tmp1, tmp2 105 bic has_nul2, tmp3, tmp4 106 subs limit_wd, limit_wd, #1 107 orr tmp1, has_nul1, has_nul2 108 ccmp tmp1, #0, #0, pl /* NZCV = 0000 */ 109 b.eq .Lloop 110 /* End of critical section -- keep to one 64Byte cache line. */ 111 112 orr tmp1, has_nul1, has_nul2 113 cbz tmp1, .Lhit_limit /* No null in final Qword. */ 114 115 /* We know there's a null in the final Qword. The easiest thing 116 to do now is work out the length of the string and return 117 MIN (len, limit). */ 118 119 sub len, src, srcin 120 cbz has_nul1, .Lnul_in_data2 121#ifdef __AARCH64EB__ 122 mov data2, data1 123#endif 124 sub len, len, #8 125 mov has_nul2, has_nul1 126.Lnul_in_data2: 127#ifdef __AARCH64EB__ 128 /* For big-endian, carry propagation (if the final byte in the 129 string is 0x01) means we cannot use has_nul directly. The 130 easiest way to get the correct byte is to byte-swap the data 131 and calculate the syndrome a second time. */ 132 rev data2, data2 133 sub tmp1, data2, zeroones 134 orr tmp2, data2, #REP8_7f 135 bic has_nul2, tmp1, tmp2 136#endif 137 sub len, len, #8 138 rev has_nul2, has_nul2 139 clz pos, has_nul2 140 add len, len, pos, lsr #3 /* Bits to bytes. */ 141 cmp len, limit 142 csel len, len, limit, ls /* Return the lower value. */ 143 ret 144 145.Lmisaligned: 146 /* Deal with a partial first word. 147 We're doing two things in parallel here; 148 1) Calculate the number of words (but avoiding overflow if 149 limit is near ULONG_MAX) - to do this we need to work out 150 limit + tmp1 - 1 as a 65-bit value before shifting it; 151 2) Load and mask the initial data words - we force the bytes 152 before the ones we are interested in to 0xff - this ensures 153 early bytes will not hit any zero detection. */ 154 sub limit_wd, limit, #1 155 neg tmp4, tmp1 156 cmp tmp1, #8 157 158 and tmp3, limit_wd, #15 159 lsr limit_wd, limit_wd, #4 160 mov tmp2, #~0 161 162 ldp data1, data2, [src], #16 163 lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */ 164 add tmp3, tmp3, tmp1 165 166#ifdef __AARCH64EB__ 167 /* Big-endian. Early bytes are at MSB. */ 168 lsl tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */ 169#else 170 /* Little-endian. Early bytes are at LSB. */ 171 lsr tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */ 172#endif 173 add limit_wd, limit_wd, tmp3, lsr #4 174 175 orr data1, data1, tmp2 176 orr data2a, data2, tmp2 177 178 csinv data1, data1, xzr, le 179 csel data2, data2, data2a, le 180 b .Lrealigned 181 .size strnlen, . - .Lstart /* Include pre-padding in size. */ 182