/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
*/

#include <machine/asm.h>
#include <machine/param.h>

	.weak	strcmp
	.set	strcmp, __strcmp
	.text

/*
 * int strcmp(const char *s1, const char *s2)
 *
 * ABI:	AAPCS64
 * In:	x0 = s1, x1 = s2
 * Out:	w0 = difference of the first mismatching bytes (0 if strings equal)
 *
 * Compares 16 bytes per NEON step.  Register roles through the head:
 *	x8/x10	x0/x1 rounded down to a 16-byte boundary
 *	x9/x11	misalignment offsets (x0 & 0xf, x1 & 0xf)
 *	x13	all-ones, shifted to mask off bytes before each string head
 *
 * Byte-wise cmeq results are narrowed with shrn #4 into a 64-bit mask
 * (4 identical bits per input byte) and moved to a GPR with fmov, so
 * ordinary cbz/rbit/clz can locate the first mismatch/NUL.
 */
ENTRY(__strcmp)

	bic	x8, x0, #0xf		// x0 aligned to the boundary
	and	x9, x0, #0xf		// x9 is the offset
	bic	x10, x1, #0xf		// x1 aligned to the boundary
	and	x11, x1, #0xf		// x11 is the offset

	mov	x13, #-1

	/*
	 * Check if either string is located at end of page to avoid crossing
	 * into unmapped page. If so, we load 16 bytes from the nearest
	 * alignment boundary and shift based on the offset.
	 */

	add	x3, x0, #16		// end of head
	add	x4, x1, #16
	eor	x3, x3, x0
	eor	x4, x4, x1		// bits that changed
	orr	x3, x3, x4		// in either str1 or str2
	tbz	w3, #PAGE_SHIFT, .Lbegin // no page bit flipped: heads are safe

	ldr	q0, [x8]		// load aligned head
	ldr	q2, [x10]

	/* offsets scaled by 4: each byte maps to a nibble in the shrn mask */
	lsl	x14, x9, #2
	lsl	x15, x11, #2
	lsl	x3, x13, x14		// string head
	lsl	x4, x13, x15

	cmeq	v5.16b, v0.16b, #0	// NUL bytes in the aligned heads?
	cmeq	v6.16b, v2.16b, #0

	shrn	v5.8b, v5.8h, #4	// narrow to 4 bits per byte
	shrn	v6.8b, v6.8h, #4
	fmov	x5, d5
	fmov	x6, d6

	adrp	x2, shift_data
	add	x2, x2, :lo12:shift_data

	/* heads may cross page boundary, avoid unmapped loads */
	tst	x5, x3			// NUL inside the real head of s1?
	b.eq	0f

	ldr	q4, [x2, x9]		// load permutation table
	tbl	v0.16b, {v0.16b}, v4.16b // shift head into place, zero-fill tail

	b	1f
	.p2align 4
0:
	ldr	q0, [x0]		// load true head
1:
	tst	x6, x4			// NUL inside the real head of s2?
	b.eq	0f

	ldr	q4, [x2, x11]
	tbl	v4.16b, {v2.16b}, v4.16b

	b	1f

	.p2align 4
.Lbegin:
	ldr	q0, [x0]		// load true heads
0:
	ldr	q4, [x1]
1:

	cmeq	v2.16b, v0.16b, #0	// NUL byte present?
	cmeq	v4.16b, v0.16b, v4.16b	// which bytes match?

	orn	v2.16b, v2.16b, v4.16b	// mismatch or NUL byte?

	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2

	cbnz	x5, .Lhead_mismatch

	/*
	 * Heads match and contain no NUL; switch to aligned 16-byte
	 * chunks.  Keep x8 as the running aligned pointer and fold the
	 * other string's position into offsets so the loop needs only
	 * one incrementing register.
	 */
	ldr	q2, [x8, #16]		// load second chunk
	ldr	q3, [x10, #16]
	subs	x9, x9, x11		// is a&0xf >= b&0xf
	b.lo	.Lswapped		// if not swap operands
	sub	x12, x10, x9
	ldr	q0, [x12, #16]!		// misaligned load matching s1's phase
	sub	x10, x10, x8		// x10/x11 become offsets relative to x8
	sub	x11, x10, x9

	cmeq	v1.16b, v3.16b, #0	// does s2's chunk end the string?
	cmeq	v0.16b, v0.16b, v2.16b	// do the chunks match?
	add	x8, x8, #16
	shrn	v1.8b, v1.8h, #4
	fmov	x6, d1
	shrn	v0.8b, v0.8h, #4
	fmov	x5, d0
	cbnz	x6, .Lnulfound
	mvn	x5, x5			// invert: set bits now mean mismatch
	cbnz	x5, .Lmismatch
	add	x8, x8, #16		// advance aligned pointers

	/*
	 * During the main loop, the layout of the two strings is something like:
	 *
	 *          v ------1------ v ------2------ v
	 *     X0:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
	 *     X1:  AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
	 *
	 * where v indicates the alignment boundaries and corresponding chunks
	 * of the strings have the same letters.  Chunk A has been checked in
	 * the previous iteration.  This iteration, we first check that string
	 * X1 doesn't end within region 2, then we compare chunk B between the
	 * two strings.  As X1 is known not to hold a NUL byte in regions 1
	 * and 2 at this point, this also ensures that x0 has not ended yet.
	 */
	.p2align 4
0:
	ldr	q0, [x8, x11]		// s2 chunk in s1's phase
	ldr	q1, [x8, x10]		// s2 chunk, aligned (NUL check)
	ldr	q2, [x8]		// s1 chunk

	cmeq	v1.16b, v1.16b, #0	// end of string?
	cmeq	v0.16b, v0.16b, v2.16b	// do the chunks match?

	shrn	v1.8b, v1.8h, #4
	fmov	x6, d1
	shrn	v0.8b, v0.8h, #4
	fmov	x5, d0
	cbnz	x6, .Lnulfound
	mvn	x5, x5			// any mismatches?
	cbnz	x5, .Lmismatch

	add	x8, x8, #16

	/* second, unrolled copy of the loop body */
	ldr	q0, [x8, x11]
	ldr	q1, [x8, x10]
	ldr	q2, [x8]

	add	x8, x8, #16
	cmeq	v1.16b, v1.16b, #0
	cmeq	v0.16b, v0.16b, v2.16b

	shrn	v1.8b, v1.8h, #4
	fmov	x6, d1
	shrn	v0.8b, v0.8h, #4
	fmov	x5, d0
	cbnz	x6, .Lnulfound2
	mvn	x5, x5
	cbz	x5, 0b

	sub	x8, x8, #16		// roll back second increment
.Lmismatch:
	rbit	x2, x5
	clz	x2, x2			// index of mismatch
	lsr	x2, x2, #2		// nibble mask: 4 bits per byte
	add	x11, x8, x11

	ldrb	w4, [x8, x2]
	ldrb	w5, [x11, x2]
	sub	w0, w4, w5		// byte difference
	ret

	.p2align 4
.Lnulfound2:
	sub	x8, x8, #16		// undo increment of unrolled copy

.Lnulfound:
	mov	x7, x9
	mov	x4, x6			// keep raw NUL mask for the tail

	ubfiz	x7, x7, #2, #4		// x7 = (x7 & 0xf) << 2
	lsl	x6, x6, x7		// adjust NUL mask to indices
	orn	x5, x6, x5		// mismatch at/before the NUL?
	cbnz	x5, .Lmismatch

	/*
	 * (x0) == (x1) and NUL is past the string.
	 * Compare (x1) with the corresponding part
	 * of the other string until the NUL byte.
	 */
	ldr	q0, [x8, x9]
	ldr	q1, [x8, x10]

	cmeq	v1.16b, v0.16b, v1.16b
	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1

	orn	x5, x4, x5		// first NUL or mismatch wins

	rbit	x2, x5
	clz	x2, x2
	lsr	x5, x2, #2

	add	x10, x10, x8		// restore x10 pointer
	add	x8, x8, x9		// point to corresponding chunk

	ldrb	w4, [x8, x5]
	ldrb	w5, [x10, x5]
	sub	w0, w4, w5
	ret

	.p2align 4
.Lhead_mismatch:
	rbit	x2, x5
	clz	x2, x2			// index of mismatch
	lsr	x2, x2, #2
	ldrb	w4, [x0, x2]
	ldrb	w5, [x1, x2]
	sub	w0, w4, w5
	ret

	/*
	 * If (a&0xf) < (b&0xf), we do the same thing but with swapped
	 * operands.  I found that this performs slightly better than
	 * using conditional moves to do the swap branchless.
	 */
	.p2align 4
.Lswapped:
	add	x12, x8, x9		// x9 is negative here (a&0xf - b&0xf)
	ldr	q0, [x12, #16]!
	sub	x8, x8, x10		// offsets now relative to x10
	add	x11, x8, x9
	neg	x9, x9

	cmeq	v1.16b, v2.16b, #0
	cmeq	v0.16b, v0.16b, v3.16b
	add	x10, x10, #16
	shrn	v1.8b, v1.8h, #4
	fmov	x6, d1
	shrn	v0.8b, v0.8h, #4
	fmov	x5, d0
	cbnz	x6, .Lnulfounds
	mvn	x5, x5
	cbnz	x5, .Lmismatchs
	add	x10, x10, #16

	/*
	 * During the main loop, the layout of the two strings is something like:
	 *
	 *          v ------1------ v ------2------ v
	 *     X1:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
	 *     X0:  AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
	 *
	 * where v indicates the alignment boundaries and corresponding chunks
	 * of the strings have the same letters.  Chunk A has been checked in
	 * the previous iteration.  This iteration, we first check that string
	 * X0 doesn't end within region 2, then we compare chunk B between the
	 * two strings.  As X0 is known not to hold a NUL byte in regions 1
	 * and 2 at this point, this also ensures that X1 has not ended yet.
	 */
	.p2align 4
0:
	ldr	q0, [x10, x11]
	ldr	q1, [x10, x8]
	ldr	q2, [x10]

	cmeq	v1.16b, v1.16b, #0
	cmeq	v0.16b, v0.16b, v2.16b

	shrn	v1.8b, v1.8h, #4
	fmov	x6, d1
	shrn	v0.8b, v0.8h, #4
	fmov	x5, d0
	cbnz	x6, .Lnulfounds
	mvn	x5, x5
	cbnz	x5, .Lmismatchs

	add	x10, x10, #16

	/* second, unrolled copy of the loop body */
	ldr	q0, [x10, x11]
	ldr	q1, [x10, x8]
	ldr	q2, [x10]

	add	x10, x10, #16
	cmeq	v1.16b, v1.16b, #0
	cmeq	v0.16b, v0.16b, v2.16b

	shrn	v1.8b, v1.8h, #4
	fmov	x6, d1
	shrn	v0.8b, v0.8h, #4
	fmov	x5, d0
	cbnz	x6, .Lnulfound2s
	mvn	x5, x5
	cbz	x5, 0b

	sub	x10, x10, #16		// roll back second increment

.Lmismatchs:
	rbit	x2, x5
	clz	x2, x2			// index of mismatch
	lsr	x2, x2, #2
	add	x11, x10, x11

	ldrb	w4, [x10, x2]
	ldrb	w5, [x11, x2]
	sub	w0, w5, w4		// operands swapped: difference reversed
	ret

	.p2align 4
.Lnulfound2s:
	sub	x10, x10, #16		// undo increment of unrolled copy
.Lnulfounds:
	mov	x7, x9
	mov	x4, x6			// keep raw NUL mask for the tail

	ubfiz	x7, x7, #2, #4		// x7 = (x7 & 0xf) << 2
	lsl	x6, x6, x7		// adjust NUL mask to indices
	orn	x5, x6, x5		// mismatch at/before the NUL?
	cbnz	x5, .Lmismatchs

	/* strings match through the chunk; compare up to the NUL byte */
	ldr	q0, [x10, x9]
	ldr	q1, [x10, x8]

	cmeq	v1.16b, v0.16b, v1.16b
	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1

	orn	x5, x4, x5		// first NUL or mismatch wins

	rbit	x2, x5
	clz	x2, x2
	lsr	x5, x2, #2

	add	x11, x10, x8		// restore pointer
	add	x10, x10, x9		// point to corresponding chunk

	ldrb	w4, [x10, x5]
	ldrb	w5, [x11, x5]
	sub	w0, w5, w4		// operands swapped: difference reversed
	ret

END(__strcmp)

	/*
	 * tbl permutation table: loading 16 bytes at offset k (0..15)
	 * yields indices k..k+15 followed by 0xff bytes, so tbl shifts
	 * the head down by k and zero-fills the tail (out-of-range
	 * indices select zero).
	 */
	.section .rodata
	.p2align 4
shift_data:
	.byte	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	.fill	16, 1, -1
	.size	shift_data, .-shift_data