1/* Copyright (c) 2013, Linaro Limited 2 All rights reserved. 3 4 Redistribution and use in source and binary forms, with or without 5 modification, are permitted provided that the following conditions are met: 6 * Redistributions of source code must retain the above copyright 7 notice, this list of conditions and the following disclaimer. 8 * Redistributions in binary form must reproduce the above copyright 9 notice, this list of conditions and the following disclaimer in the 10 documentation and/or other materials provided with the distribution. 11 * Neither the name of the Linaro nor the 12 names of its contributors may be used to endorse or promote products 13 derived from this software without specific prior written permission. 14 15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 19 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 20 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 21 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ 26 27/* Assumptions: 28 * 29 * ARMv8-a, AArch64 30 */ 31 32 .macro def_fn f p2align=0 33 .text 34 .p2align \p2align 35 .global \f 36 .type \f, %function 37\f: 38 .endm 39 40#define REP8_01 0x0101010101010101 41#define REP8_7f 0x7f7f7f7f7f7f7f7f 42#define REP8_80 0x8080808080808080 43 44/* Parameters and result. */ 45#define src1 x0 46#define src2 x1 47#define limit x2 48#define result x0 49 50/* Internal variables. */ 51#define data1 x3 52#define data1w w3 53#define data2 x4 54#define data2w w4 55#define has_nul x5 56#define diff x6 57#define syndrome x7 58#define tmp1 x8 59#define tmp2 x9 60#define tmp3 x10 61#define zeroones x11 62#define pos x12 63#define limit_wd x13 64#define mask x14 65#define endloop x15 66 67 .text 68 .p2align 6 69 .rep 7 70 nop /* Pad so that the loop below fits a cache line. */ 71 .endr 72def_fn strncmp 73 cbz limit, .Lret0 74 eor tmp1, src1, src2 75 mov zeroones, #REP8_01 76 tst tmp1, #7 77 b.ne .Lmisaligned8 78 ands tmp1, src1, #7 79 b.ne .Lmutual_align 80 /* Calculate the number of full and partial words -1. */ 81 sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ 82 lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ 83 84 /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 85 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and 86 can be done in parallel across the entire word. */ 87 /* Start of performance-critical section -- one 64B cache line. */ 88.Lloop_aligned: 89 ldr data1, [src1], #8 90 ldr data2, [src2], #8 91.Lstart_realigned: 92 subs limit_wd, limit_wd, #1 93 sub tmp1, data1, zeroones 94 orr tmp2, data1, #REP8_7f 95 eor diff, data1, data2 /* Non-zero if differences found. */ 96 csinv endloop, diff, xzr, pl /* Last Dword or differences. */ 97 bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ 98 ccmp endloop, #0, #0, eq 99 b.eq .Lloop_aligned 100 /* End of performance-critical section -- one 64B cache line. */ 101 102 /* Not reached the limit, must have found the end or a diff. */ 103 tbz limit_wd, #63, .Lnot_limit 104 105 /* Limit % 8 == 0 => all bytes significant. */ 106 ands limit, limit, #7 107 b.eq .Lnot_limit 108 109 lsl limit, limit, #3 /* Bits -> bytes. */ 110 mov mask, #~0 111#ifdef __AARCH64EB__ 112 lsr mask, mask, limit 113#else 114 lsl mask, mask, limit 115#endif 116 bic data1, data1, mask 117 bic data2, data2, mask 118 119 /* Make sure that the NUL byte is marked in the syndrome. */ 120 orr has_nul, has_nul, mask 121 122.Lnot_limit: 123 orr syndrome, diff, has_nul 124 125#ifndef __AARCH64EB__ 126 rev syndrome, syndrome 127 rev data1, data1 128 /* The MS-non-zero bit of the syndrome marks either the first bit 129 that is different, or the top bit of the first zero byte. 130 Shifting left now will bring the critical information into the 131 top bits. */ 132 clz pos, syndrome 133 rev data2, data2 134 lsl data1, data1, pos 135 lsl data2, data2, pos 136 /* But we need to zero-extend (char is unsigned) the value and then 137 perform a signed 32-bit subtraction. */ 138 lsr data1, data1, #56 139 sub result, data1, data2, lsr #56 140 ret 141#else 142 /* For big-endian we cannot use the trick with the syndrome value 143 as carry-propagation can corrupt the upper bits if the trailing 144 bytes in the string contain 0x01. */ 145 /* However, if there is no NUL byte in the dword, we can generate 146 the result directly. We can't just subtract the bytes as the 147 MSB might be significant. */ 148 cbnz has_nul, 1f 149 cmp data1, data2 150 cset result, ne 151 cneg result, result, lo 152 ret 1531: 154 /* Re-compute the NUL-byte detection, using a byte-reversed value. */ 155 rev tmp3, data1 156 sub tmp1, tmp3, zeroones 157 orr tmp2, tmp3, #REP8_7f 158 bic has_nul, tmp1, tmp2 159 rev has_nul, has_nul 160 orr syndrome, diff, has_nul 161 clz pos, syndrome 162 /* The MS-non-zero bit of the syndrome marks either the first bit 163 that is different, or the top bit of the first zero byte. 164 Shifting left now will bring the critical information into the 165 top bits. */ 166 lsl data1, data1, pos 167 lsl data2, data2, pos 168 /* But we need to zero-extend (char is unsigned) the value and then 169 perform a signed 32-bit subtraction. */ 170 lsr data1, data1, #56 171 sub result, data1, data2, lsr #56 172 ret 173#endif 174 175.Lmutual_align: 176 /* Sources are mutually aligned, but are not currently at an 177 alignment boundary. Round down the addresses and then mask off 178 the bytes that precede the start point. 179 We also need to adjust the limit calculations, but without 180 overflowing if the limit is near ULONG_MAX. */ 181 bic src1, src1, #7 182 bic src2, src2, #7 183 ldr data1, [src1], #8 184 neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */ 185 ldr data2, [src2], #8 186 mov tmp2, #~0 187 sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ 188#ifdef __AARCH64EB__ 189 /* Big-endian. Early bytes are at MSB. */ 190 lsl tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */ 191#else 192 /* Little-endian. Early bytes are at LSB. */ 193 lsr tmp2, tmp2, tmp3 /* Shift (tmp1 & 63). */ 194#endif 195 and tmp3, limit_wd, #7 196 lsr limit_wd, limit_wd, #3 197 /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */ 198 add limit, limit, tmp1 199 add tmp3, tmp3, tmp1 200 orr data1, data1, tmp2 201 orr data2, data2, tmp2 202 add limit_wd, limit_wd, tmp3, lsr #3 203 b .Lstart_realigned 204 205.Lret0: 206 mov result, #0 207 ret 208 209 .p2align 6 210.Lmisaligned8: 211 sub limit, limit, #1 2121: 213 /* Perhaps we can do better than this. */ 214 ldrb data1w, [src1], #1 215 ldrb data2w, [src2], #1 216 subs limit, limit, #1 217 ccmp data1w, #1, #0, cs /* NZCV = 0b0000. */ 218 ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ 219 b.eq 1b 220 sub result, data1, data2 221 ret 222 .size strncmp, . - strncmp 223