/*
 * Copyright (c) 2012-2014 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Implementation of strcmp for ARMv7 when DSP instructions are
   available.  Use ldrd to support wider loads, provided the data
   is sufficiently aligned.  Use saturating arithmetic to optimize
   the compares.  */

/* Build Options:
   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
   byte in the string.  If comparing completely random strings
   the pre-check will save time, since there is a very high
   probability of a mismatch in the first character: we save
   significant overhead if this is the common case.  However,
   if strings are likely to be identical (eg because we're
   verifying a hit in a hash table), then this check is largely
   redundant.  */

#define STRCMP_NO_PRECHECK      0

        /* This version uses Thumb-2 code.  */
        .thumb
        .syntax unified

#ifdef __ARM_BIG_ENDIAN
#define S2LO            lsl
#define S2LOEQ          lsleq
#define S2HI            lsr
#define MSB             0x000000ff
#define LSB             0xff000000
#define BYTE0_OFFSET    24
#define BYTE1_OFFSET    16
#define BYTE2_OFFSET    8
#define BYTE3_OFFSET    0
#else /* not __ARM_BIG_ENDIAN */
#define S2LO            lsr
#define S2LOEQ          lsreq
#define S2HI            lsl
#define BYTE0_OFFSET    0
#define BYTE1_OFFSET    8
#define BYTE2_OFFSET    16
#define BYTE3_OFFSET    24
#define MSB             0xff000000
#define LSB             0x000000ff
#endif /* not __ARM_BIG_ENDIAN */

        .macro def_fn f p2align=0
        .text
        .p2align \p2align
        .global  \f
        .type    \f, %function
\f:
        .endm

/* Parameters and result.  */
#define src1            r0
#define src2            r1
#define result          r0      /* Overlaps src1.  */

/* Internal variables.  */
#define tmp1            r4
#define tmp2            r5
#define const_m1        r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a          r2
#define data1b          r3
#define data2a          r6
#define data2b          r7
#define syndrome_a      tmp1
#define syndrome_b      tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1           r2
#define data2           r3
#define syndrome        tmp2

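/* The heart of the comparison is a per-word "syndrome".  uadd8 of a data
   word with 0xffffffff sets the GE flag for each byte lane that is
   non-zero, and sel then substitutes 0xff for the lanes that held a zero
   byte while keeping the XOR of the two data words everywhere else.  The
   syndrome is therefore non-zero exactly when the word contains either a
   NUL terminator or a mismatch.  A rough, illustrative C model of what the
   uadd8/sel pair computes (not part of the build; the helper name is made
   up):

   static unsigned
   syndrome_model (unsigned w1, unsigned w2)
   {
     unsigned s = 0;
     for (int i = 0; i < 4; i++)
       {
         unsigned b1 = (w1 >> (8 * i)) & 0xff;
         unsigned b2 = (w2 >> (8 * i)) & 0xff;
         s |= (b1 == 0 ? 0xffu : (b1 ^ b2)) << (8 * i);
       }
     return s;
   }  */
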
        /* Macro to compute and return the result value for word-aligned
           cases.  */
        .macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
        /* If data1 contains a zero byte, then syndrome will contain a 1 in
           bit 7 of that byte.  Otherwise, the highest set bit in the
           syndrome will highlight the first different bit.  It is therefore
           sufficient to extract the eight bits starting with the syndrome
           bit.  */
        clz     tmp1, \synd
        lsl     r1, \d2, tmp1
        .if \restore_r6
        ldrd    r6, r7, [sp, #8]
        .endif
        .cfi_restore 6
        .cfi_restore 7
        lsl     \d1, \d1, tmp1
        .cfi_remember_state
        lsr     result, \d1, #24
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        sub     result, result, r1, lsr #24
        bx      lr
#else
        /* To use the big-endian trick we'd have to reverse all three words.
           That's slower than this approach.  */
        rev     \synd, \synd
        clz     tmp1, \synd
        bic     tmp1, tmp1, #7
        lsr     r1, \d2, tmp1
        .cfi_remember_state
        .if \restore_r6
        ldrd    r6, r7, [sp, #8]
        .endif
        .cfi_restore 6
        .cfi_restore 7
        lsr     \d1, \d1, tmp1
        and     result, \d1, #255
        and     r1, r1, #255
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        sub     result, result, r1

        bx      lr
#endif
        .endm

        .text
        .p2align 5
.Lstrcmp_start_addr:
#if STRCMP_NO_PRECHECK == 0
.Lfastpath_exit:
        sub     r0, r2, r3
        bx      lr
        nop
#endif
def_fn  strcmp
#if STRCMP_NO_PRECHECK == 0
        ldrb    r2, [src1]
        ldrb    r3, [src2]
        cmp     r2, #1
        it      cs
        cmpcs   r2, r3
        bne     .Lfastpath_exit
#endif
        .cfi_startproc
        strd    r4, r5, [sp, #-16]!
        .cfi_def_cfa_offset 16
        .cfi_offset 4, -16
        .cfi_offset 5, -12
        orr     tmp1, src1, src2
        strd    r6, r7, [sp, #8]
        .cfi_offset 6, -8
        .cfi_offset 7, -4
        mvn     const_m1, #0
        lsl     r2, tmp1, #29
        cbz     r2, .Lloop_aligned8

.Lnot_aligned:
        eor     tmp1, src1, src2
        tst     tmp1, #7
        bne     .Lmisaligned8

        /* Deal with mutual misalignment by aligning downwards and then
           masking off the unwanted loaded data to prevent a difference.  */
        and     tmp1, src1, #7
        bic     src1, src1, #7
        and     tmp2, tmp1, #3
        bic     src2, src2, #7
        lsl     tmp2, tmp2, #3  /* Bytes -> bits.  */
        ldrd    data1a, data1b, [src1], #16
        tst     tmp1, #4
        ldrd    data2a, data2b, [src2], #16
        /* In thumb code we can't use MVN with a register shift, but
           we do have ORN.  */
        S2HI    tmp1, const_m1, tmp2
        orn     data1a, data1a, tmp1
        orn     data2a, data2a, tmp1
        beq     .Lstart_realigned8
        orn     data1b, data1b, tmp1
        mov     data1a, const_m1
        orn     data2b, data2b, tmp1
        mov     data2a, const_m1
        b       .Lstart_realigned8
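
        /* A rough C model of the realignment fix-up above (illustrative
           only, not part of the build; the helper name is made up and a
           little-endian layout is assumed).  Once a pointer has been
           rounded down to the load boundary, the leading bytes of the
           first loaded word do not belong to the string, so both sides
           force them to 0xff: identical non-zero bytes can neither
           terminate the string nor produce a difference.

           static unsigned
           mask_leading_bytes (unsigned word, unsigned offset)  // offset 0..3
           {
             return word | ((1u << (8 * offset)) - 1);
           }  */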

        /* Unroll the inner loop by a factor of 2, giving 16 bytes per
           pass.  */
        .p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
        .p2align 2      /* Always word aligned.  */
.Lloop_aligned8:
        ldrd    data1a, data1b, [src1], #16
        ldrd    data2a, data2b, [src2], #16
.Lstart_realigned8:
        uadd8   syndrome_b, data1a, const_m1    /* Only want GE bits.  */
        eor     syndrome_a, data1a, data2a
        sel     syndrome_a, syndrome_a, const_m1
        cbnz    syndrome_a, .Ldiff_in_a
        uadd8   syndrome_b, data1b, const_m1    /* Only want GE bits.  */
        eor     syndrome_b, data1b, data2b
        sel     syndrome_b, syndrome_b, const_m1
        cbnz    syndrome_b, .Ldiff_in_b

        ldrd    data1a, data1b, [src1, #-8]
        ldrd    data2a, data2b, [src2, #-8]
        uadd8   syndrome_b, data1a, const_m1    /* Only want GE bits.  */
        eor     syndrome_a, data1a, data2a
        sel     syndrome_a, syndrome_a, const_m1
        uadd8   syndrome_b, data1b, const_m1    /* Only want GE bits.  */
        eor     syndrome_b, data1b, data2b
        sel     syndrome_b, syndrome_b, const_m1
        /* Can't use CBZ for backwards branch.  */
        orrs    syndrome_b, syndrome_b, syndrome_a  /* Only need if s_a == 0.  */
        beq     .Lloop_aligned8

.Ldiff_found:
        cbnz    syndrome_a, .Ldiff_in_a

.Ldiff_in_b:
        strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

.Ldiff_in_a:
        .cfi_restore_state
        strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

        .cfi_restore_state
.Lmisaligned8:
        tst     tmp1, #3
        bne     .Lmisaligned4
        ands    tmp1, src1, #3
        bne     .Lmutual_align4

        /* Unrolled by a factor of 2, to reduce the number of post-increment
           operations.  */
.Lloop_aligned4:
        ldr     data1, [src1], #8
        ldr     data2, [src2], #8
.Lstart_realigned4:
        uadd8   syndrome, data1, const_m1       /* Only need GE bits.  */
        eor     syndrome, data1, data2
        sel     syndrome, syndrome, const_m1
        cbnz    syndrome, .Laligned4_done
        ldr     data1, [src1, #-4]
        ldr     data2, [src2, #-4]
        uadd8   syndrome, data1, const_m1
        eor     syndrome, data1, data2
        sel     syndrome, syndrome, const_m1
        cmp     syndrome, #0
        beq     .Lloop_aligned4

.Laligned4_done:
        strcmp_epilogue_aligned syndrome, data1, data2, 0

.Lmutual_align4:
        .cfi_restore_state
        /* Deal with mutual misalignment by aligning downwards and then
           masking off the unwanted loaded data to prevent a difference.  */
        lsl     tmp1, tmp1, #3  /* Bytes -> bits.  */
        bic     src1, src1, #3
        ldr     data1, [src1], #8
        bic     src2, src2, #3
        ldr     data2, [src2], #8

        /* In thumb code we can't use MVN with a register shift, but
           we do have ORN.  */
        S2HI    tmp1, const_m1, tmp1
        orn     data1, data1, tmp1
        orn     data2, data2, tmp1
        b       .Lstart_realigned4

.Lmisaligned4:
        ands    tmp1, src1, #3
        beq     .Lsrc1_aligned
        sub     src2, src2, tmp1
        bic     src1, src1, #3
        lsls    tmp1, tmp1, #31
        ldr     data1, [src1], #4
        beq     .Laligned_m2
        bcs     .Laligned_m1

#if STRCMP_NO_PRECHECK == 1
        ldrb    data2, [src2, #1]
        uxtb    tmp1, data1, ror #BYTE1_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        cbz     data2, .Lmisaligned_exit

.Laligned_m2:
        ldrb    data2, [src2, #2]
        uxtb    tmp1, data1, ror #BYTE2_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        cbz     data2, .Lmisaligned_exit

.Laligned_m1:
        ldrb    data2, [src2, #3]
        uxtb    tmp1, data1, ror #BYTE3_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        add     src2, src2, #4
        cbnz    data2, .Lsrc1_aligned
#else /* STRCMP_NO_PRECHECK == 0 */
        /* If we've done the pre-check, then we don't need to check the
           first byte again here.  */
        ldrb    data2, [src2, #2]
        uxtb    tmp1, data1, ror #BYTE2_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        cbz     data2, .Lmisaligned_exit

.Laligned_m2:
        ldrb    data2, [src2, #3]
        uxtb    tmp1, data1, ror #BYTE3_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        cbnz    data2, .Laligned_m1
#endif

.Lmisaligned_exit:
        .cfi_remember_state
        mov     result, tmp1
        ldr     r4, [sp], #16
        .cfi_restore 4
        bx      lr

#if STRCMP_NO_PRECHECK == 0
.Laligned_m1:
        add     src2, src2, #4
#endif
.Lsrc1_aligned:
        .cfi_restore_state
        /* src1 is word aligned, but src2 has no common alignment
           with it.  */
        ldr     data1, [src1], #4
        lsls    tmp1, src2, #31         /* C=src2[1], Z=src2[0].  */

        bic     src2, src2, #3
        ldr     data2, [src2], #4
        bhi     .Loverlap1      /* C=1, Z=0 => src2[1:0] = 0b11.  */
        bcs     .Loverlap2      /* C=1, Z=1 => src2[1:0] = 0b10.  */
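
        /* In the .LoverlapN blocks below src1 is word aligned while src2 is
           offset by 4-N bytes, so every aligned word of src1 straddles two
           consecutive aligned loads from src2.  An illustrative little-endian
           C model of the src2 word each loop is effectively comparing
           against (not part of the build; the helper name and the parameter
           k, the byte offset of src2 within its word, are made up):

           static unsigned
           src2_unaligned_word (unsigned lo, unsigned hi, unsigned k)  // k = 1..3
           {
             return (lo >> (8 * k)) | (hi << (8 * (4 - k)));
           }  */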

        /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
.Loverlap3:
        bic     tmp1, data1, #MSB
        uadd8   syndrome, data1, const_m1
        eors    syndrome, tmp1, data2, S2LO #8
        sel     syndrome, syndrome, const_m1
        bne     4f
        cbnz    syndrome, 5f
        ldr     data2, [src2], #4
        eor     tmp1, tmp1, data1
        cmp     tmp1, data2, S2HI #24
        bne     6f
        ldr     data1, [src1], #4
        b       .Loverlap3
4:
        S2LO    data2, data2, #8
        b       .Lstrcmp_tail

5:
        bics    syndrome, syndrome, #MSB
        bne     .Lstrcmp_done_equal

        /* We can only get here if the MSB of data1 contains 0, so
           fast-path the exit.  */
        ldrb    result, [src2]
        .cfi_remember_state
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        /* R6/7 not used in this sequence.  */
        .cfi_restore 6
        .cfi_restore 7
        neg     result, result
        bx      lr

6:
        .cfi_restore_state
        S2LO    data1, data1, #24
        and     data2, data2, #LSB
        b       .Lstrcmp_tail

        .p2align 5,,12  /* Ensure at least 3 instructions in cache line.  */
.Loverlap2:
        and     tmp1, data1, const_m1, S2LO #16
        uadd8   syndrome, data1, const_m1
        eors    syndrome, tmp1, data2, S2LO #16
        sel     syndrome, syndrome, const_m1
        bne     4f
        cbnz    syndrome, 5f
        ldr     data2, [src2], #4
        eor     tmp1, tmp1, data1
        cmp     tmp1, data2, S2HI #16
        bne     6f
        ldr     data1, [src1], #4
        b       .Loverlap2
4:
        S2LO    data2, data2, #16
        b       .Lstrcmp_tail
5:
        ands    syndrome, syndrome, const_m1, S2LO #16
        bne     .Lstrcmp_done_equal

        ldrh    data2, [src2]
        S2LO    data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
        lsl     data2, data2, #16
#endif
        b       .Lstrcmp_tail

6:
        S2LO    data1, data1, #16
        and     data2, data2, const_m1, S2LO #16
        b       .Lstrcmp_tail

        .p2align 5,,12  /* Ensure at least 3 instructions in cache line.  */
.Loverlap1:
        and     tmp1, data1, #LSB
        uadd8   syndrome, data1, const_m1
        eors    syndrome, tmp1, data2, S2LO #24
        sel     syndrome, syndrome, const_m1
        bne     4f
        cbnz    syndrome, 5f
        ldr     data2, [src2], #4
        eor     tmp1, tmp1, data1
        cmp     tmp1, data2, S2HI #8
        bne     6f
        ldr     data1, [src1], #4
        b       .Loverlap1
4:
        S2LO    data2, data2, #24
        b       .Lstrcmp_tail
5:
        tst     syndrome, #LSB
        bne     .Lstrcmp_done_equal
        ldr     data2, [src2]
6:
        S2LO    data1, data1, #8
        bic     data2, data2, #MSB
        b       .Lstrcmp_tail

.Lstrcmp_done_equal:
        mov     result, #0
        .cfi_remember_state
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        /* R6/7 not used in this sequence.  */
        .cfi_restore 6
        .cfi_restore 7
        bx      lr

.Lstrcmp_tail:
        .cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
        rev     data1, data1
        rev     data2, data2
        /* Now everything looks big-endian...  */
#endif
        uadd8   tmp1, data1, const_m1
        eor     tmp1, data1, data2
        sel     syndrome, tmp1, const_m1
        clz     tmp1, syndrome
        lsl     data1, data1, tmp1
        lsl     data2, data2, tmp1
        lsr     result, data1, #24
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        /* R6/7 not used in this sequence.  */
        .cfi_restore 6
        .cfi_restore 7
        sub     result, result, data2, lsr #24
        bx      lr
        .cfi_endproc
        .size   strcmp, . - .Lstrcmp_start_addr
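
/* For reference, a byte-granularity C model of how a non-zero syndrome is
   turned into the return value (illustrative only, not part of the build;
   the helper name is made up and GCC's __builtin_clz is assumed).  The
   inputs are the data and syndrome words arranged big-endian, as in
   strcmp_epilogue_aligned and .Lstrcmp_tail; parts of the assembly use a
   bit-granularity shortcut instead, extracting eight bits starting at the
   first different bit, which yields a result with the same sign.

   static int
   result_model (unsigned be_data1, unsigned be_data2, unsigned be_syndrome)
   {
     int shift = __builtin_clz (be_syndrome) & ~7;  // requires be_syndrome != 0
     unsigned b1 = (be_data1 << shift) >> 24;
     unsigned b2 = (be_data2 << shift) >> 24;
     return (int) b1 - (int) b2;
   }  */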