/*
 * strncmp - compare two strings
 *
 * Copyright (c) 2013-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64.
 * MTE compatible.
 */

#include "asmdefs.h"

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* Parameters and result.  */
#define src1		x0
#define src2		x1
#define limit		x2
#define result		x0

/* Internal variables.  */
#define data1		x3
#define data1w		w3
#define data2		x4
#define data2w		w4
#define has_nul		x5
#define diff		x6
#define syndrome	x7
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define zeroones	x11
#define pos		x12
#define mask		x13
#define endloop		x14
#define count		mask
#define offset		pos
#define neg_offset	x15

/* Define endian-dependent shift operations.
   On big-endian, early bytes are at the MSB end; on little-endian, the LSB end.
   LS_FW means shifting towards early bytes.
   LS_BK means shifting towards later bytes.  */
#ifdef __AARCH64EB__
#define LS_FW lsl
#define LS_BK lsr
#else
#define LS_FW lsr
#define LS_BK lsl
#endif

ENTRY (__strncmp_aarch64)
	cbz	limit, L(ret0)
	eor	tmp1, src1, src2
	mov	zeroones, #REP8_01
	tst	tmp1, #7
	and	count, src1, #7
	b.ne	L(misaligned8)
	cbnz	count, L(mutual_align)

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
	.p2align 4
L(loop_aligned):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned):
	subs	limit, limit, #8
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	csinv	endloop, diff, xzr, hi	/* Last dword or differences.  */
	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
	ccmp	endloop, #0, #0, eq
	b.eq	L(loop_aligned)
	/* End of main loop.  */

L(full_check):
#ifndef __AARCH64EB__
	orr	syndrome, diff, has_nul
	add	limit, limit, 8	/* Rewind limit to before last subs.  */
L(syndrome_check):
	/* Limit was reached.  Check if the NUL byte or the difference
	   is before the limit.  */
	rev	syndrome, syndrome
	rev	data1, data1
	clz	pos, syndrome
	rev	data2, data2
	lsl	data1, data1, pos
	cmp	limit, pos, lsr #3
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	csel	result, result, xzr, hi
	ret
#else
	/* Not reached the limit, must have found the end or a diff.  */
	tbz	limit, #63, L(not_limit)
	add	tmp1, limit, 8
	cbz	limit, L(not_limit)

	lsl	limit, tmp1, #3	/* Bits -> bytes.  */
	mov	mask, #~0
	lsr	mask, mask, limit
	bic	data1, data1, mask
	bic	data2, data2, mask

	/* Make sure that the NUL byte is marked in the syndrome.  */
	orr	has_nul, has_nul, mask

L(not_limit):
	/* For big-endian we cannot use the trick with the syndrome value
	   as carry-propagation can corrupt the upper bits if the trailing
	   bytes in the string contain 0x01.  */
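	/* Editor's illustration (a sketch, not from the original comments):
	   with a 16-bit X = 0x0100, i.e. a NUL byte below a 0x01 byte,
	     tmp1 = X - 0x0101      = 0xffff   (the borrow ripples upwards)
	     tmp2 = X | 0x7f7f      = 0x7f7f
	     has_nul = tmp1 & ~tmp2 = 0x8080
	   Both bytes get flagged although only the low byte is zero.  On
	   little-endian the spurious flag lands on a *later* string byte and
	   is never reached; on big-endian it would land on an *earlier* one,
	   hence the byte-reversed recomputation below.  */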
	/* However, if there is no NUL byte in the dword, we can generate
	   the result directly.  We can't just subtract the bytes as the
	   MSB might be significant.  */
	cbnz	has_nul, 1f
	cmp	data1, data2
	cset	result, ne
	cneg	result, result, lo
	ret
1:
	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
	rev	tmp3, data1
	sub	tmp1, tmp3, zeroones
	orr	tmp2, tmp3, #REP8_7f
	bic	has_nul, tmp1, tmp2
	rev	has_nul, has_nul
	orr	syndrome, diff, has_nul
	clz	pos, syndrome
	/* The most-significant non-zero bit of the syndrome marks either the
	   first bit that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
L(end_quick):
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#endif

L(mutual_align):
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary.  Round down the addresses and then mask off
	   the bytes that precede the start point.
	   We also need to adjust the limit calculations, but without
	   overflowing if the limit is near ULONG_MAX.  */
	bic	src1, src1, #7
	bic	src2, src2, #7
	ldr	data1, [src1], #8
	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align).  */
	ldr	data2, [src2], #8
	mov	tmp2, #~0
	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
	/* Adjust the limit and ensure it doesn't overflow.  */
	adds	limit, limit, count
	csinv	limit, limit, xzr, lo
	orr	data1, data1, tmp2
	orr	data2, data2, tmp2
	b	L(start_realigned)

	.p2align 4
	/* Don't bother with dwords for up to 16 bytes.  */
L(misaligned8):
	cmp	limit, #16
	b.hs	L(try_misaligned_words)

L(byte_loop):
	/* Perhaps we can do better than this.  */
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	subs	limit, limit, #1
	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.eq	L(byte_loop)
L(done):
	sub	result, data1, data2
	ret

	/* Align SRC1 to a dword boundary by doing a bytewise compare, then
	   run the dword loop.  */
L(try_misaligned_words):
	cbz	count, L(src1_aligned)

	neg	count, count
	and	count, count, #7
	sub	limit, limit, count

L(page_end_loop):
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	cmp	data1w, #1
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.ne	L(done)
	subs	count, count, #1
	b.hi	L(page_end_loop)

	/* The following diagram explains the comparison of misaligned strings.
	   The bytes are shown in natural order.  For little-endian, they are
	   reversed in the registers.  The "x" bytes are before the string.
	   The "|" separates data that is loaded at one time.

	   src1	| a a a a a a a a | b b b c c c c c | . . .
	   src2	| x x x x x a a a | a a a a a b b b | c c c c c . . .

	   After shifting in each step, the data looks like this:
			STEP_A			STEP_B			STEP_C
	   data1	a a a a a a a a		b b b c c c c c		b b b c c c c c
	   data2	a a a a a a a a		b b b 0 0 0 0 0		0 0 0 c c c c c

	   The bytes with "0" are eliminated from the syndrome via mask.

	   Align SRC2 down to 16 bytes.  This way we can read 16 bytes at a
	   time from SRC2.  The comparison happens in 3 steps.  After each step
	   the loop can exit, or read from SRC1 or SRC2.  */
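	/* Editor's sketch (assumption: little-endian; the helper below is
	   hypothetical, not part of this file) of the STEP_A recombination
	   done in L(loop_misaligned): 8 unaligned bytes of SRC2 are built
	   from the two aligned dwords t1/t2 loaded by ldp.  off_bits is
	   (src2 & 7) * 8, a nonzero multiple of 8 below 64 here, since SRC1
	   and SRC2 have different alignments.  The hardware shifts take the
	   register amount modulo 64, which is why the asm can use offset and
	   neg_offset = -offset directly; plain C must write 64 - off_bits.

	     uint64_t
	     load8_unaligned (uint64_t t1, uint64_t t2, unsigned off_bits)
	     {
	       return (t1 >> off_bits) | (t2 << (64 - off_bits));
	     }

	   For big-endian the two shift directions swap, which is exactly
	   what the LS_FW/LS_BK macros encode.  */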
L(src1_aligned):
	/* Calculate the offset from 8-byte alignment to the string start, in
	   bits.  No need to mask the offset since shifts ignore the upper
	   bits.  */
	lsl	offset, src2, #3
	bic	src2, src2, #0xf
	mov	mask, -1
	neg	neg_offset, offset
	ldr	data1, [src1], #8
	ldp	tmp1, tmp2, [src2], #16
	LS_BK	mask, mask, neg_offset
	and	neg_offset, neg_offset, #63	/* Need actual value for cmp later.  */
	/* Skip the first compare if data in tmp1 is irrelevant.  */
	tbnz	offset, 6, L(misaligned_mid_loop)

L(loop_misaligned):
	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.  */
	LS_FW	data2, tmp1, offset
	LS_BK	tmp1, tmp2, neg_offset
	subs	limit, limit, #8
	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs.  */
	sub	has_nul, data1, zeroones
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	orr	tmp3, data1, #REP8_7f
	csinv	endloop, diff, xzr, hi	/* If limit, set to all ones.  */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1.  */
	orr	tmp3, endloop, has_nul
	cbnz	tmp3, L(full_check)

	ldr	data1, [src1], #8
L(misaligned_mid_loop):
	/* STEP_B: Compare first part of data1 to second part of tmp2.  */
	LS_FW	data2, tmp2, offset
#ifdef __AARCH64EB__
	/* For big-endian we do a byte reverse to avoid the carry-propagation
	   problem described above.  This way we can reuse has_nul in the
	   next step and also use the syndrome-value trick at the end.  */
	rev	tmp3, data1
	#define data1_fixed tmp3
#else
	#define data1_fixed data1
#endif
	sub	has_nul, data1_fixed, zeroones
	orr	tmp3, data1_fixed, #REP8_7f
	eor	diff, data2, data1	/* Non-zero if differences found.  */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator.  */
#ifdef __AARCH64EB__
	rev	has_nul, has_nul
#endif
	cmp	limit, neg_offset, lsr #3
	orr	syndrome, diff, has_nul
	bic	syndrome, syndrome, mask	/* Ignore later bytes.  */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones.  */
	cbnz	tmp3, L(syndrome_check)

	/* STEP_C: Compare second part of data1 to first part of tmp1.  */
	ldp	tmp1, tmp2, [src2], #16
	cmp	limit, #8
	LS_BK	data2, tmp1, neg_offset
	eor	diff, data2, data1	/* Non-zero if differences found.  */
	orr	syndrome, diff, has_nul
	and	syndrome, syndrome, mask	/* Ignore earlier bytes.  */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones.  */
	cbnz	tmp3, L(syndrome_check)

	ldr	data1, [src1], #8
	sub	limit, limit, #8
	b	L(loop_misaligned)

#ifdef __AARCH64EB__
L(syndrome_check):
	clz	pos, syndrome
	cmp	pos, limit, lsl #3
	b.lo	L(end_quick)
#endif

L(ret0):
	mov	result, #0
	ret
END(__strncmp_aarch64)
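
/* Editor's note: a hedged C sketch of the little-endian L(syndrome_check)
   endgame above.  The function name and structure are illustrative only;
   the real code keeps everything in registers and folds the limit test
   into cmp/csel.

     #include <stdint.h>

     static int
     syndrome_result (uint64_t data1, uint64_t data2, uint64_t syndrome,
		      uint64_t limit)
     {
       // Caller guarantees syndrome != 0; it holds diff | has_nul.
       uint64_t s = __builtin_bswap64 (syndrome);
       int pos = __builtin_clzll (s);      // bit offset of first NUL/diff
       if (limit <= (uint64_t) (pos >> 3)) // event at or past the limit
	 return 0;
       uint64_t d1 = __builtin_bswap64 (data1) << pos;
       uint64_t d2 = __builtin_bswap64 (data2) << pos;
       // Zero-extend the leading byte of each dword and subtract, as the
       // lsr #56 / sub sequence does.
       return (int) (d1 >> 56) - (int) (d2 >> 56);
     }
*/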