/*
 * strcmp for ARMv7
 *
 * Copyright (c) 2012-2021, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

#if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1

/* Implementation of strcmp for ARMv7 when DSP instructions are
   available.  Use ldrd to support wider loads, provided the data
   is sufficiently aligned.  Use saturating arithmetic to optimize
   the compares.  */

#include "../asmdefs.h"

/* Build Options:
   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
   byte in the string.  If comparing completely random strings
   the pre-check will save time, since there is a very high
   probability of a mismatch in the first character: we save
   significant overhead if this is the common case.  However,
   if strings are likely to be identical (e.g. because we're
   verifying a hit in a hash table), then this check is largely
   redundant.  */

#define STRCMP_NO_PRECHECK 0

        /* This version uses Thumb-2 code.  */
        .thumb
        .syntax unified

#ifdef __ARM_BIG_ENDIAN
#define S2LO lsl
#define S2LOEQ lsleq
#define S2HI lsr
#define MSB 0x000000ff
#define LSB 0xff000000
#define BYTE0_OFFSET 24
#define BYTE1_OFFSET 16
#define BYTE2_OFFSET 8
#define BYTE3_OFFSET 0
#else /* not __ARM_BIG_ENDIAN */
#define S2LO lsr
#define S2LOEQ lsreq
#define S2HI lsl
#define BYTE0_OFFSET 0
#define BYTE1_OFFSET 8
#define BYTE2_OFFSET 16
#define BYTE3_OFFSET 24
#define MSB 0xff000000
#define LSB 0x000000ff
#endif /* not __ARM_BIG_ENDIAN */

/* Parameters and result.  */
#define src1        r0
#define src2        r1
#define result      r0  /* Overlaps src1.  */

/* Internal variables.  */
#define tmp1        r4
#define tmp2        r5
#define const_m1    r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a      r2
#define data1b      r3
#define data2a      r6
#define data2b      r7
#define syndrome_a  tmp1
#define syndrome_b  tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1       r2
#define data2       r3
#define syndrome    tmp2


        /* Macro to compute and return the result value for word-aligned
           cases.  */
        .macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
        /* If data1 contains a zero byte, then syndrome will contain a 1 in
           bit 7 of that byte.  Otherwise, the highest set bit in the
           syndrome will highlight the first different bit.  It is therefore
           sufficient to extract the eight bits starting with the syndrome
           bit.  */
        clz     tmp1, \synd
        lsl     r1, \d2, tmp1
        .if \restore_r6
        ldrd    r6, r7, [sp, #8]
        .endif
        .cfi_restore 6
        .cfi_restore 7
        lsl     \d1, \d1, tmp1
        .cfi_remember_state
        lsr     result, \d1, #24
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        sub     result, result, r1, lsr #24
        bx      lr
#else
        /* To use the big-endian trick we'd have to reverse all three words.
           That's slower than this approach.  */
        rev     \synd, \synd
        clz     tmp1, \synd
        bic     tmp1, tmp1, #7
        lsr     r1, \d2, tmp1
        .cfi_remember_state
        .if \restore_r6
        ldrd    r6, r7, [sp, #8]
        .endif
        .cfi_restore 6
        .cfi_restore 7
        lsr     \d1, \d1, tmp1
        and     result, \d1, #255
        and     r1, r1, #255
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        sub     result, result, r1

        bx      lr
#endif
        .endm
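
/* A rough C model of the little-endian branch of the macro above,
   assuming GCC/Clang builtins; the helper name is illustrative and the
   code is not part of the original sources or of the build.

   #include <stdint.h>

   // Given the two data words and a non-zero syndrome, return the
   // strcmp result the same way the macro does on little-endian.
   static int epilogue_result (uint32_t data1, uint32_t data2,
                               uint32_t syndrome)
   {
     syndrome = __builtin_bswap32 (syndrome);          // rev
     unsigned shift = __builtin_clz (syndrome) & ~7u;  // clz; bic #7
     unsigned b1 = (data1 >> shift) & 0xff;            // lsr; and #255
     unsigned b2 = (data2 >> shift) & 0xff;
     return (int) b1 - (int) b2;                       // sub
   }  */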

        .p2align 5
L(strcmp_start_addr):
#if STRCMP_NO_PRECHECK == 0
L(fastpath_exit):
        sub     r0, r2, r3
        bx      lr
        nop
#endif
ENTRY_ALIGN (__strcmp_arm, 0)
#if STRCMP_NO_PRECHECK == 0
        ldrb    r2, [src1]
        ldrb    r3, [src2]
        cmp     r2, #1
        it      cs
        cmpcs   r2, r3
        bne     L(fastpath_exit)
#endif
        strd    r4, r5, [sp, #-16]!
        .cfi_def_cfa_offset 16
        .cfi_offset 4, -16
        .cfi_offset 5, -12
        orr     tmp1, src1, src2
        strd    r6, r7, [sp, #8]
        .cfi_offset 6, -8
        .cfi_offset 7, -4
        mvn     const_m1, #0
        lsl     r2, tmp1, #29
        cbz     r2, L(loop_aligned8)

L(not_aligned):
        eor     tmp1, src1, src2
        tst     tmp1, #7
        bne     L(misaligned8)

        /* Deal with mutual misalignment by aligning downwards and then
           masking off the unwanted loaded data to prevent a difference.  */
        and     tmp1, src1, #7
        bic     src1, src1, #7
        and     tmp2, tmp1, #3
        bic     src2, src2, #7
        lsl     tmp2, tmp2, #3  /* Bytes -> bits.  */
        ldrd    data1a, data1b, [src1], #16
        tst     tmp1, #4
        ldrd    data2a, data2b, [src2], #16
        /* In thumb code we can't use MVN with a register shift, but
           we do have ORN.  */
        S2HI    tmp1, const_m1, tmp2
        orn     data1a, data1a, tmp1
        orn     data2a, data2a, tmp1
        beq     L(start_realigned8)
        orn     data1b, data1b, tmp1
        mov     data1a, const_m1
        orn     data2b, data2b, tmp1
        mov     data2a, const_m1
        b       L(start_realigned8)

        /* Unwind the inner loop by a factor of 2, giving 16 bytes per
           pass.  */
        .p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
        .p2align 2      /* Always word aligned.  */
L(loop_aligned8):
        ldrd    data1a, data1b, [src1], #16
        ldrd    data2a, data2b, [src2], #16
L(start_realigned8):
        uadd8   syndrome_b, data1a, const_m1    /* Only want GE bits,  */
        eor     syndrome_a, data1a, data2a
        sel     syndrome_a, syndrome_a, const_m1
        cbnz    syndrome_a, L(diff_in_a)
        uadd8   syndrome_b, data1b, const_m1    /* Only want GE bits.  */
        eor     syndrome_b, data1b, data2b
        sel     syndrome_b, syndrome_b, const_m1
        cbnz    syndrome_b, L(diff_in_b)

        ldrd    data1a, data1b, [src1, #-8]
        ldrd    data2a, data2b, [src2, #-8]
        uadd8   syndrome_b, data1a, const_m1    /* Only want GE bits,  */
        eor     syndrome_a, data1a, data2a
        sel     syndrome_a, syndrome_a, const_m1
        uadd8   syndrome_b, data1b, const_m1    /* Only want GE bits.  */
        eor     syndrome_b, data1b, data2b
        sel     syndrome_b, syndrome_b, const_m1
        /* Can't use CBZ for backwards branch.  */
        orrs    syndrome_b, syndrome_b, syndrome_a      /* Only need if s_a == 0 */
        beq     L(loop_aligned8)

L(diff_found):
        cbnz    syndrome_a, L(diff_in_a)

L(diff_in_b):
        strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

L(diff_in_a):
        .cfi_restore_state
        strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

        .cfi_restore_state
L(misaligned8):
        tst     tmp1, #3
        bne     L(misaligned4)
        ands    tmp1, src1, #3
        bne     L(mutual_align4)
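
/* The 8-byte loop above and the 4-byte loop below rely on the same
   UADD8/EOR/SEL sequence.  A rough C model of what one such sequence
   computes for a 32-bit chunk (illustrative only, not part of the
   build; the helper name is made up):

   #include <stdint.h>

   // Each syndrome byte is 0xff where data1 holds a NUL byte, otherwise
   // data1 ^ data2 for that byte.  UADD8 against 0xffffffff sets GE[i]
   // exactly for the non-zero bytes of data1, and SEL then picks per byte.
   static uint32_t strcmp_syndrome (uint32_t data1, uint32_t data2)
   {
     uint32_t syndrome = 0;
     for (unsigned i = 0; i < 4; i++)
       {
         unsigned b1 = (data1 >> (8 * i)) & 0xff;
         unsigned b2 = (data2 >> (8 * i)) & 0xff;
         unsigned s = (b1 != 0) ? (b1 ^ b2) : 0xff;
         syndrome |= (uint32_t) s << (8 * i);
       }
     return syndrome;   // non-zero iff end-of-string or mismatch
   }  */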

        /* Unrolled by a factor of 2, to reduce the number of post-increment
           operations.  */
L(loop_aligned4):
        ldr     data1, [src1], #8
        ldr     data2, [src2], #8
L(start_realigned4):
        uadd8   syndrome, data1, const_m1       /* Only need GE bits.  */
        eor     syndrome, data1, data2
        sel     syndrome, syndrome, const_m1
        cbnz    syndrome, L(aligned4_done)
        ldr     data1, [src1, #-4]
        ldr     data2, [src2, #-4]
        uadd8   syndrome, data1, const_m1
        eor     syndrome, data1, data2
        sel     syndrome, syndrome, const_m1
        cmp     syndrome, #0
        beq     L(loop_aligned4)

L(aligned4_done):
        strcmp_epilogue_aligned syndrome, data1, data2, 0

L(mutual_align4):
        .cfi_restore_state
        /* Deal with mutual misalignment by aligning downwards and then
           masking off the unwanted loaded data to prevent a difference.  */
        lsl     tmp1, tmp1, #3  /* Bytes -> bits.  */
        bic     src1, src1, #3
        ldr     data1, [src1], #8
        bic     src2, src2, #3
        ldr     data2, [src2], #8

        /* In thumb code we can't use MVN with a register shift, but
           we do have ORN.  */
        S2HI    tmp1, const_m1, tmp1
        orn     data1, data1, tmp1
        orn     data2, data2, tmp1
        b       L(start_realigned4)

L(misaligned4):
        ands    tmp1, src1, #3
        beq     L(src1_aligned)
        sub     src2, src2, tmp1
        bic     src1, src1, #3
        lsls    tmp1, tmp1, #31
        ldr     data1, [src1], #4
        beq     L(aligned_m2)
        bcs     L(aligned_m1)

#if STRCMP_NO_PRECHECK == 1
        ldrb    data2, [src2, #1]
        uxtb    tmp1, data1, ror #BYTE1_OFFSET
        subs    tmp1, tmp1, data2
        bne     L(misaligned_exit)
        cbz     data2, L(misaligned_exit)

L(aligned_m2):
        ldrb    data2, [src2, #2]
        uxtb    tmp1, data1, ror #BYTE2_OFFSET
        subs    tmp1, tmp1, data2
        bne     L(misaligned_exit)
        cbz     data2, L(misaligned_exit)

L(aligned_m1):
        ldrb    data2, [src2, #3]
        uxtb    tmp1, data1, ror #BYTE3_OFFSET
        subs    tmp1, tmp1, data2
        bne     L(misaligned_exit)
        add     src2, src2, #4
        cbnz    data2, L(src1_aligned)
#else  /* STRCMP_NO_PRECHECK */
        /* If we've done the pre-check, then we don't need to check the
           first byte again here.  */
        ldrb    data2, [src2, #2]
        uxtb    tmp1, data1, ror #BYTE2_OFFSET
        subs    tmp1, tmp1, data2
        bne     L(misaligned_exit)
        cbz     data2, L(misaligned_exit)

L(aligned_m2):
        ldrb    data2, [src2, #3]
        uxtb    tmp1, data1, ror #BYTE3_OFFSET
        subs    tmp1, tmp1, data2
        bne     L(misaligned_exit)
        cbnz    data2, L(aligned_m1)
#endif

L(misaligned_exit):
        .cfi_remember_state
        mov     result, tmp1
        ldr     r4, [sp], #16
        .cfi_restore 4
        bx      lr

#if STRCMP_NO_PRECHECK == 0
L(aligned_m1):
        add     src2, src2, #4
#endif
L(src1_aligned):
        .cfi_restore_state
        /* src1 is word aligned, but src2 has no common alignment
           with it.  */
        ldr     data1, [src1], #4
        lsls    tmp1, src2, #31         /* C=src2[1], Z=src2[0].  */

        bic     src2, src2, #3
        ldr     data2, [src2], #4
        bhi     L(overlap1)             /* C=1, Z=0 => src2[1:0] = 0b11.  */
        bcs     L(overlap2)             /* C=1, Z=1 => src2[1:0] = 0b10.  */

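/* In the three overlap cases below each aligned word of src1 lines up
   with the tail of one aligned src2 word and the head of the next.  A
   rough little-endian C model of the src2 value the comparisons amount
   to; the assembly never materialises the combined word, it compares
   the two pieces separately.  The helper is hypothetical and not part
   of the build.

   #include <stdint.h>

   // ofs is src2's offset from a word boundary, 1..3 (overlap3/2/1).
   static uint32_t combine_src2 (uint32_t prev_word, uint32_t next_word,
                                 unsigned ofs)
   {
     return (prev_word >> (8 * ofs)) | (next_word << (8 * (4 - ofs)));
   }  */
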
        /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
L(overlap3):
        bic     tmp1, data1, #MSB
        uadd8   syndrome, data1, const_m1
        eors    syndrome, tmp1, data2, S2LO #8
        sel     syndrome, syndrome, const_m1
        bne     4f
        cbnz    syndrome, 5f
        ldr     data2, [src2], #4
        eor     tmp1, tmp1, data1
        cmp     tmp1, data2, S2HI #24
        bne     6f
        ldr     data1, [src1], #4
        b       L(overlap3)
4:
        S2LO    data2, data2, #8
        b       L(strcmp_tail)

5:
        bics    syndrome, syndrome, #MSB
        bne     L(strcmp_done_equal)

        /* We can only get here if the MSB of data1 contains 0, so
           fast-path the exit.  */
        ldrb    result, [src2]
        .cfi_remember_state
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        /* R6/7 Not used in this sequence.  */
        .cfi_restore 6
        .cfi_restore 7
        neg     result, result
        bx      lr

6:
        .cfi_restore_state
        S2LO    data1, data1, #24
        and     data2, data2, #LSB
        b       L(strcmp_tail)

        .p2align 5,,12  /* Ensure at least 3 instructions in cache line.  */
L(overlap2):
        and     tmp1, data1, const_m1, S2LO #16
        uadd8   syndrome, data1, const_m1
        eors    syndrome, tmp1, data2, S2LO #16
        sel     syndrome, syndrome, const_m1
        bne     4f
        cbnz    syndrome, 5f
        ldr     data2, [src2], #4
        eor     tmp1, tmp1, data1
        cmp     tmp1, data2, S2HI #16
        bne     6f
        ldr     data1, [src1], #4
        b       L(overlap2)
4:
        S2LO    data2, data2, #16
        b       L(strcmp_tail)
5:
        ands    syndrome, syndrome, const_m1, S2LO #16
        bne     L(strcmp_done_equal)

        ldrh    data2, [src2]
        S2LO    data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
        lsl     data2, data2, #16
#endif
        b       L(strcmp_tail)

6:
        S2LO    data1, data1, #16
        and     data2, data2, const_m1, S2LO #16
        b       L(strcmp_tail)

        .p2align 5,,12  /* Ensure at least 3 instructions in cache line.  */
L(overlap1):
        and     tmp1, data1, #LSB
        uadd8   syndrome, data1, const_m1
        eors    syndrome, tmp1, data2, S2LO #24
        sel     syndrome, syndrome, const_m1
        bne     4f
        cbnz    syndrome, 5f
        ldr     data2, [src2], #4
        eor     tmp1, tmp1, data1
        cmp     tmp1, data2, S2HI #8
        bne     6f
        ldr     data1, [src1], #4
        b       L(overlap1)
4:
        S2LO    data2, data2, #24
        b       L(strcmp_tail)
5:
        tst     syndrome, #LSB
        bne     L(strcmp_done_equal)
        ldr     data2, [src2]
6:
        S2LO    data1, data1, #8
        bic     data2, data2, #MSB
        b       L(strcmp_tail)

L(strcmp_done_equal):
        mov     result, #0
        .cfi_remember_state
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        /* R6/7 not used in this sequence.  */
        .cfi_restore 6
        .cfi_restore 7
        bx      lr

L(strcmp_tail):
        .cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
        rev     data1, data1
        rev     data2, data2
        /* Now everything looks big-endian...  */
#endif
        uadd8   tmp1, data1, const_m1
        eor     tmp1, data1, data2
        sel     syndrome, tmp1, const_m1
        clz     tmp1, syndrome
        lsl     data1, data1, tmp1
        lsl     data2, data2, tmp1
        lsr     result, data1, #24
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        /* R6/7 not used in this sequence.  */
        .cfi_restore 6
        .cfi_restore 7
        sub     result, result, data2, lsr #24
        bx      lr

END (__strcmp_arm)

#endif /* __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1 */
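
/* A rough C model of the little-endian path through L(strcmp_tail),
   again illustrative only and assuming GCC/Clang builtins.

   #include <stdint.h>

   // data1/data2 hold the already shifted/masked final word pair.
   static int strcmp_tail_result (uint32_t data1, uint32_t data2)
   {
     data1 = __builtin_bswap32 (data1);   // rev: big-endian view
     data2 = __builtin_bswap32 (data2);
     uint32_t synd = 0;                   // same per-byte rule as UADD8/EOR/SEL
     for (unsigned i = 0; i < 4; i++)
       {
         unsigned b1 = (data1 >> (8 * i)) & 0xff;
         unsigned b2 = (data2 >> (8 * i)) & 0xff;
         synd |= (uint32_t) ((b1 != 0) ? (b1 ^ b2) : 0xff) << (8 * i);
       }
     unsigned shift = __builtin_clz (synd);   // first marked bit
     data1 <<= shift;                         // that bit is now bit 31
     data2 <<= shift;
     return (int) (data1 >> 24) - (int) (data2 >> 24);
   }  */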