/*
 * strncmp - compare two strings
 *
 * Copyright (c) 2013-2022, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64.
 * MTE compatible.
 */

#include "../asmdefs.h"

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* Parameters and result.  */
#define src1		x0
#define src2		x1
#define limit		x2
#define result		x0

/* Internal variables.  */
#define data1		x3
#define data1w		w3
#define data2		x4
#define data2w		w4
#define has_nul		x5
#define diff		x6
#define syndrome	x7
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define zeroones	x11
#define pos		x12
#define mask		x13
#define endloop		x14
#define count		mask
#define offset		pos
#define neg_offset	x15

/* Define endian dependent shift operations.
   On big-endian early bytes are at MSB and on little-endian LSB.
   LS_FW means shifting towards early bytes.
   LS_BK means shifting towards later bytes.
 */
#ifdef __AARCH64EB__
#define LS_FW lsl
#define LS_BK lsr
#else
#define LS_FW lsr
#define LS_BK lsl
#endif

ENTRY (__strncmp_aarch64)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)
	cbz	limit, L(ret0)
	eor	tmp1, src1, src2
	mov	zeroones, #REP8_01
	tst	tmp1, #7
	and	count, src1, #7
	b.ne	L(misaligned8)
	cbnz	count, L(mutual_align)

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
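	/* For illustration only: a minimal C sketch of the detection idea
	   above, using the same REP8 constants.  The helper name
	   has_zero_byte is hypothetical and not part of this file.

	     #include <stdint.h>

	     // Non-zero iff some byte of x is zero: subtracting 0x01 from
	     // each byte only carries into a byte's MSB if that byte was
	     // zero, and ~(x | 0x7f...7f) keeps just the MSBs of bytes
	     // that started below 0x80.
	     static inline uint64_t has_zero_byte (uint64_t x)
	     {
	       return (x - 0x0101010101010101ULL)
		      & ~(x | 0x7f7f7f7f7f7f7f7fULL);
	     }
	 */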
	.p2align 4
L(loop_aligned):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned):
	subs	limit, limit, #8
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	csinv	endloop, diff, xzr, hi	/* Last Dword or differences.  */
	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
	ccmp	endloop, #0, #0, eq
	b.eq	L(loop_aligned)
	/* End of main loop */

L(full_check):
#ifndef __AARCH64EB__
	orr	syndrome, diff, has_nul
	add	limit, limit, 8	/* Rewind limit to before last subs.  */
L(syndrome_check):
	/* Limit was reached.  Check if the NUL byte or the difference
	   is before the limit.  */
	rev	syndrome, syndrome
	rev	data1, data1
	clz	pos, syndrome
	rev	data2, data2
	lsl	data1, data1, pos
	cmp	limit, pos, lsr #3
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 64-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	csel	result, result, xzr, hi
	ret
#else
	/* Not reached the limit, must have found the end or a diff.  */
	tbz	limit, #63, L(not_limit)
	add	tmp1, limit, 8
	cbz	tmp1, L(not_limit)

	lsl	limit, tmp1, #3	/* Bytes -> bits.  */
	mov	mask, #~0
	lsr	mask, mask, limit
	bic	data1, data1, mask
	bic	data2, data2, mask

	/* Make sure that the NUL byte is marked in the syndrome.  */
	orr	has_nul, has_nul, mask

L(not_limit):
	/* For big-endian we cannot use the trick with the syndrome value
	   as carry-propagation can corrupt the upper bits if the trailing
	   bytes in the string contain 0x01.  */
	/* However, if there is no NUL byte in the dword, we can generate
	   the result directly.  We can't just subtract the bytes as the
	   MSB might be significant.  */
	cbnz	has_nul, 1f
	cmp	data1, data2
	cset	result, ne
	cneg	result, result, lo
	ret
1:
	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
	rev	tmp3, data1
	sub	tmp1, tmp3, zeroones
	orr	tmp2, tmp3, #REP8_7f
	bic	has_nul, tmp1, tmp2
	rev	has_nul, has_nul
	orr	syndrome, diff, has_nul
	clz	pos, syndrome
	/* The most-significant-non-zero bit of the syndrome marks either the
	   first bit that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
L(end_quick):
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 64-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#endif

L(mutual_align):
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary.  Round down the addresses and then mask off
	   the bytes that precede the start point.
	   We also need to adjust the limit calculations, but without
	   overflowing if the limit is near ULONG_MAX.  */
	bic	src1, src1, #7
	bic	src2, src2, #7
	ldr	data1, [src1], #8
	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align).  */
	ldr	data2, [src2], #8
	mov	tmp2, #~0
	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
	/* Adjust the limit and ensure it doesn't overflow.  */
	adds	limit, limit, count
	csinv	limit, limit, xzr, lo
	orr	data1, data1, tmp2
	orr	data2, data2, tmp2
	b	L(start_realigned)

	.p2align 4
	/* Don't bother with dwords for up to 16 bytes.  */
L(misaligned8):
	cmp	limit, #16
	b.hs	L(try_misaligned_words)

L(byte_loop):
	/* Perhaps we can do better than this.  */
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	subs	limit, limit, #1
	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.eq	L(byte_loop)
L(done):
	sub	result, data1, data2
	ret

	/* Align SRC1 to a dword by doing a bytewise compare and then do
	   the dword loop.  */
L(try_misaligned_words):
	cbz	count, L(src1_aligned)

	neg	count, count
	and	count, count, #7
	sub	limit, limit, count

L(page_end_loop):
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	cmp	data1w, #1
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.ne	L(done)
	subs	count, count, #1
	b.hi	L(page_end_loop)

	/* The following diagram explains the comparison of misaligned strings.
	   The bytes are shown in natural order.  For little-endian, it is
	   reversed in the registers.  The "x" bytes are before the string.
	   The "|" separates data that is loaded at one time.
	   src1     | a a a a a a a a | b b b c c c c c | . . .
	   src2     | x x x x x a a a   a a a a a b b b | c c c c c . . .

	   After shifting in each step, the data looks like this:
	                STEP_A              STEP_B              STEP_C
	   data1    a a a a a a a a     b b b c c c c c     b b b c c c c c
	   data2    a a a a a a a a     b b b 0 0 0 0 0     0 0 0 c c c c c

	   The bytes with "0" are eliminated from the syndrome via mask.

	   Align SRC2 down to 16 bytes.  This way we can read 16 bytes at a
	   time from SRC2.  The comparison happens in 3 steps.  After each step
	   the loop can exit, or read from SRC1 or SRC2.  */
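	/* For illustration only: a little-endian C sketch of how STEP_A
	   reassembles 8 unaligned SRC2 bytes from the two aligned dwords
	   loaded by ldp.  The helper name combine_unaligned is
	   hypothetical; lo/hi stand for tmp1/tmp2 and offset is the bit
	   offset of the string within the aligned pair.

	     #include <stdint.h>

	     // On little-endian the early bytes sit in the low bits, so
	     // LS_FW is lsr and LS_BK is lsl.  Precondition: 0 < offset < 64,
	     // which holds here because src2 is never 8-byte aligned on
	     // this path.
	     static inline uint64_t combine_unaligned (uint64_t lo,
						       uint64_t hi,
						       unsigned offset)
	     {
	       return (lo >> offset) | (hi << (64 - offset));
	     }
	 */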
L(src1_aligned):
	/* Calculate offset from 8 byte alignment to string start in bits.  No
	   need to mask offset since shifts are ignoring upper bits.  */
	lsl	offset, src2, #3
	bic	src2, src2, #0xf
	mov	mask, -1
	neg	neg_offset, offset
	ldr	data1, [src1], #8
	ldp	tmp1, tmp2, [src2], #16
	LS_BK	mask, mask, neg_offset
	and	neg_offset, neg_offset, #63	/* Need actual value for cmp later.  */
	/* Skip the first compare if data in tmp1 is irrelevant.  */
	tbnz	offset, 6, L(misaligned_mid_loop)

L(loop_misaligned):
	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.  */
	LS_FW	data2, tmp1, offset
	LS_BK	tmp1, tmp2, neg_offset
	subs	limit, limit, #8
	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs.  */
	sub	has_nul, data1, zeroones
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	orr	tmp3, data1, #REP8_7f
	csinv	endloop, diff, xzr, hi	/* If limit, set to all ones.  */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1.  */
	orr	tmp3, endloop, has_nul
	cbnz	tmp3, L(full_check)

	ldr	data1, [src1], #8
L(misaligned_mid_loop):
	/* STEP_B: Compare first part of data1 to second part of tmp2.  */
	LS_FW	data2, tmp2, offset
#ifdef __AARCH64EB__
	/* For big-endian we do a byte reverse to avoid the carry-propagation
	   problem described above.  This way we can reuse has_nul in the
	   next step and also use the syndrome value trick at the end.  */
	rev	tmp3, data1
	#define data1_fixed tmp3
#else
	#define data1_fixed data1
#endif
	sub	has_nul, data1_fixed, zeroones
	orr	tmp3, data1_fixed, #REP8_7f
	eor	diff, data2, data1	/* Non-zero if differences found.  */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator.  */
#ifdef __AARCH64EB__
	rev	has_nul, has_nul
#endif
	cmp	limit, neg_offset, lsr #3
	orr	syndrome, diff, has_nul
	bic	syndrome, syndrome, mask	/* Ignore later bytes.  */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones.  */
	cbnz	tmp3, L(syndrome_check)

	/* STEP_C: Compare second part of data1 to first part of tmp1.  */
	ldp	tmp1, tmp2, [src2], #16
	cmp	limit, #8
	LS_BK	data2, tmp1, neg_offset
	eor	diff, data2, data1	/* Non-zero if differences found.  */
	orr	syndrome, diff, has_nul
	and	syndrome, syndrome, mask	/* Ignore earlier bytes.  */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones.  */
	cbnz	tmp3, L(syndrome_check)

	ldr	data1, [src1], #8
	sub	limit, limit, #8
	b	L(loop_misaligned)

#ifdef __AARCH64EB__
L(syndrome_check):
	clz	pos, syndrome
	cmp	pos, limit, lsl #3
	b.lo	L(end_quick)
#endif

L(ret0):
	mov	result, #0
	ret
END (__strncmp_aarch64)
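/* For illustration only: a little-endian C sketch of the
   L(syndrome_check) result computation.  The helper name
   syndrome_result is hypothetical; syndrome, data1, data2 and limit
   correspond to the registers of the same names at that label.

     #include <stdint.h>

     static inline int syndrome_result (uint64_t syndrome, uint64_t data1,
					uint64_t data2, uint64_t limit)
     {
       // clz of 0 is well defined (64) in the asm but UB for the C
       // builtin, so handle the "limit hit, no diff or NUL" case first.
       if (syndrome == 0)
	 return 0;
       // Byte-reverse so the first marked byte supplies the most
       // significant set bit; clz then gives its bit position.
       unsigned pos = __builtin_clzll (__builtin_bswap64 (syndrome));
       if ((pos >> 3) >= limit)	// Marked byte is at or past the limit.
	 return 0;
       // Shift the critical byte to the top, then compare the two bytes
       // as unsigned chars via a 64-bit subtraction.
       uint64_t b1 = (__builtin_bswap64 (data1) << pos) >> 56;
       uint64_t b2 = (__builtin_bswap64 (data2) << pos) >> 56;
       return (int) (b1 - b2);
     }
 */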