/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
 */

#include <machine/asm.h>
#include <machine/param.h>

	.weak	strncmp
	.set	strncmp, __strncmp
	.text

/*
 * int strncmp(const char *s1, const char *s2, size_t n)
 *
 * AArch64 Advanced SIMD implementation processing 16 bytes per step.
 *
 * In:		x0 = s1, x1 = s2, x2 = n (maximum number of bytes compared)
 * Out:		w0 = 0 if n == 0 or if no difference is found before the
 *		first NUL byte or the limit; otherwise the difference of
 *		the first pair of differing bytes (zero-extended bytes).
 * Clobbers:	x2-x16, v0-v6, condition flags.  No stack is used.
 *
 * Mask convention: "cmeq" yields 0xff/0x00 per byte; "shrn ..., #4"
 * narrows that to a 64 bit value holding 4 bits per byte, so after
 * rbit+clz the byte index is the bit index shifted right by 2.
 *
 * Register roles once the head (first 16 bytes) has been handled:
 *	x2	n - 1 rebased to the aligned chunk that is tested for NUL,
 *		i.e. the index of the last byte that may be compared
 *	x9	difference of the two misalignments, (s1 & 15) - (s2 & 15)
 *		(negated in .Lswapped so that it is non-negative)
 *	x13	all-ones constant used to build "at or past index" masks
 *	x16	0xf constant, one mask nibble
 * .Lnormal:	x8 walks the aligned chunks of s1, x10/x11 are offsets
 *		from x8 to the matching aligned/unaligned data of s2.
 * .Lswapped:	same with the roles of s1 and s2 exchanged, x10 walks s2.
 */
ENTRY(__strncmp)

	bic	x8, x0, #0xf			// x0 aligned to the boundary
	and	x9, x0, #0xf			// x9 is the offset
	bic	x10, x1, #0xf			// x1 aligned to the boundary
	and	x11, x1, #0xf			// x11 is the offset

	subs	x2, x2, #1			// x2 = index of last byte
	b.lo	.Lempty				// n == 0: nothing to compare

	mov	x13, #-1			// save constants for later
	mov	x16, #0xf

	/*
	 * Check if either string is located at end of page to avoid crossing
	 * into unmapped page. If so, we load 16 bytes from the nearest
	 * alignment boundary and shift based on the offset.
	 */

	add	x3, x0, #16			// end of head
	add	x4, x1, #16
	eor	x3, x3, x0
	eor	x4, x4, x1			// bits that changed
	orr	x3, x3, x4			// in either str1 or str2
	cmp	x2, #16
	b.lo	.Llt16				// n <= 16: head is all there is
	tbz	w3, #PAGE_SHIFT, .Lbegin	// no page bit flipped: fast path

	ldr	q0, [x8]			// load aligned head
	ldr	q1, [x10]

	lsl	x14, x9, #2
	lsl	x15, x11, #2
	lsl	x3, x13, x14			// string head
	lsl	x4, x13, x15

	cmeq	v5.16b, v0.16b, #0
	cmeq	v6.16b, v1.16b, #0

	shrn	v5.8b, v5.8h, #4
	shrn	v6.8b, v6.8h, #4
	fmov	x5, d5
	fmov	x6, d6

	adrp	x14, shift_data
	add	x14, x14, :lo12:shift_data

	/* heads may cross page boundary, avoid unmapped loads */
	tst	x5, x3				// NUL within the aligned head?
	b.eq	0f

	ldr	q4, [x14, x9]			// load permutation table
	tbl	v0.16b, {v0.16b}, v4.16b	// shift head down, zero fill

	b	1f
	.p2align 4
0:
	ldr	q0, [x0]			// load true head
1:
	tst	x6, x4
	b.eq	0f				// no NUL: load true head of x1

	ldr	q4, [x14, x11]
	tbl	v4.16b, {v1.16b}, v4.16b

	b	1f

	.p2align 4
.Lbegin:
	ldr	q0, [x0]			// load true heads
0:
	ldr	q4, [x1]
1:
	cmeq	v2.16b, v0.16b, #0		// NUL byte present?
	cmeq	v4.16b, v0.16b, v4.16b		// which bytes match?

	orn	v2.16b, v2.16b, v4.16b		// mismatch or NUL byte?

	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2

	cbnz	x5, .Lhead_mismatch
	/* load head and second chunk */
	ldr	q2, [x8, #16]			// load second chunk
	ldr	q3, [x10, #16]

	add	x2, x2, x11			// rebase the limit to the second
	sub	x2, x2, #16			// aligned chunk of x1

	subs	x9, x9, x11			// is a&0xf >= b&0xf
	b.lo	.Lswapped			// if not swap operands
	b	.Lnormal

	.p2align 4
.Llt16:
	/*
	 * Check if either string is located at end of page to avoid crossing
	 * into unmapped page. If so, we load 16 bytes from the nearest
	 * alignment boundary and shift based on the offset.
	 */
	tbz	w3, #PAGE_SHIFT, 2f

	ldr	q0, [x8]			// load aligned head
	ldr	q1, [x10]

	lsl	x14, x9, #2
	lsl	x15, x11, #2
	lsl	x3, x13, x14			// string head
	lsl	x4, x13, x15

	/*
	 * Introduce a null byte match if the limit is within the aligned
	 * chunk.  If it is not, the register shift wraps modulo 64 and the
	 * nibble lands below the string head, where x3/x4 mask it out.
	 */
	add	x14, x2, x9
	add	x15, x2, x11
	lsl	x14, x14, #2
	lsl	x15, x15, #2
	lsl	x14, x16, x14
	lsl	x15, x16, x15

	cmeq	v5.16b, v0.16b, #0
	cmeq	v6.16b, v1.16b, #0

	shrn	v5.8b, v5.8h, #4
	shrn	v6.8b, v6.8h, #4
	fmov	x5, d5
	fmov	x6, d6

	orr	x5, x5, x14			// insert match at limit
	orr	x6, x6, x15

	adrp	x14, shift_data
	add	x14, x14, :lo12:shift_data

	/* heads may cross page boundary, avoid unmapped loads */
	tst	x5, x3
	b.eq	0f

	ldr	q4, [x14, x9]			// load permutation table
	tbl	v0.16b, {v0.16b}, v4.16b

	b	1f
	.p2align 4
0:
	ldr	q0, [x0]			// load true head
1:
	tst	x6, x4
	b.eq	0f

	ldr	q4, [x14, x11]
	tbl	v4.16b, {v1.16b}, v4.16b

	b	1f

	.p2align 4
2:
	ldr	q0, [x0]			// load true heads
0:
	ldr	q4, [x1]
1:

	cmeq	v2.16b, v0.16b, #0		// NUL byte present?
	cmeq	v4.16b, v0.16b, v4.16b		// which bytes match?

	bic	v2.16b, v4.16b, v2.16b		// match and not NUL byte

	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2
	lsl	x4, x2, #2
	lsl	x4, x13, x4			// everything from the limit on
	orn	x5, x4, x5			// mismatch or NUL byte?

.Lhead_mismatch:
	rbit	x3, x5
	clz	x3, x3				// index of mismatch
	lsr	x3, x3, #2			// 4 mask bits per byte
	ldrb	w4, [x0, x3]
	ldrb	w5, [x1, x3]
	sub	w0, w4, w5
	ret

	.p2align 4
.Lnormal:
	sub	x12, x10, x9
	ldr	q0, [x12, #16]			// x1 data matching chunk at x8+16
	sub	x10, x10, x8			// x10 = offset to aligned x1 data
	sub	x11, x10, x9			// x11 = offset to matching x1 data

	cmeq	v1.16b, v3.16b, #0		// NUL present?
	cmeq	v0.16b, v0.16b, v2.16b		// Mismatch between chunks?
	shrn	v1.8b, v1.8h, #4
	shrn	v0.8b, v0.8h, #4
	fmov	x6, d1
	fmov	x5, d0

	add	x8, x8, #32			// advance to next iteration

	lsl	x4, x2, #2
	lsl	x4, x13, x4
	orr	x3, x6, x4			// introduce a null byte match
	cmp	x2, #16				// does the buffer end within x2
	csel	x6, x3, x6, lo
	cbnz	x6, .Lnulfound2			// NUL or end of buffer found?
	mvn	x5, x5
	cbnz	x5, .Lmismatch2
	sub	x2, x2, #16
	cmp	x2, #32				// end of buffer?
	b.lo	.Ltail
	/*
	 * During the main loop, the layout of the two strings is something like:
	 *
	 *          v ------1------ v ------2------ v
	 *     X0:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
	 *     X1: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
	 *
	 * where v indicates the alignment boundaries and corresponding chunks
	 * of the strings have the same letters.  Chunk A has been checked in
	 * the previous iteration.  This iteration, we first check that string
	 * X1 doesn't end within region 2, then we compare chunk B between the
	 * two strings.  As X1 is known not to hold a NUL byte in regions 1
	 * and 2 at this point, this also ensures that x0 has not ended yet.
	 */
	.p2align 4
0:
	ldr	q0, [x8, x11]
	ldr	q1, [x8, x10]
	ldr	q2, [x8]

	cmeq	v1.16b, v1.16b, #0		// end of string?
	cmeq	v0.16b, v0.16b, v2.16b		// do the chunks match?

	shrn	v1.8b, v1.8h, #4
	shrn	v0.8b, v0.8h, #4
	fmov	x6, d1
	fmov	x5, d0
	cbnz	x6, .Lnulfound
	mvn	x5, x5				// any mismatches?
	cbnz	x5, .Lmismatch

	add	x8, x8, #16

	/* main loop unrolled twice */
	ldr	q0, [x8, x11]
	ldr	q1, [x8, x10]
	ldr	q2, [x8]

	add	x8, x8, #16
	cmeq	v1.16b, v1.16b, #0
	cmeq	v0.16b, v0.16b, v2.16b

	shrn	v1.8b, v1.8h, #4
	shrn	v0.8b, v0.8h, #4
	fmov	x6, d1
	fmov	x5, d0
	cbnz	x6, .Lnulfound2
	mvn	x5, x5
	cbnz	x5, .Lmismatch2
	sub	x2, x2, #32
	cmp	x2, #32				// end of buffer?
	b.hs	0b				// if not, keep looping

	/* end of buffer will occur in next 32 bytes */
.Ltail:
	ldr	q0, [x8, x11]
	ldr	q1, [x8, x10]
	ldr	q2, [x8]

	cmeq	v1.16b, v1.16b, #0		// end of string?
	cmeq	v0.16b, v0.16b, v2.16b		// do the chunks match?

	shrn	v1.8b, v1.8h, #4
	shrn	v0.8b, v0.8h, #4
	fmov	x6, d1
	fmov	x5, d0

	/*
	 * If x2 < 16 then we introduce a NUL byte in the
	 * result from CMEQ to avoid comparing further!
	 */

	lsl	x4, x2, #2
	lsl	x4, x13, x4
	orr	x3, x6, x4			// introduce a null byte match
	cmp	x2, #16				// does the buffer end within x2
	csel	x6, x3, x6, lo

	cbnz	x6, .Lnulfound			// NUL or end of string found
	mvn	x5, x5
	cbnz	x5, .Lmismatch

	add	x8, x8, #16

	/* main loop unrolled twice */
	ldr	q0, [x8, x11]
	ldr	q1, [x8, x10]
	ldr	q2, [x8]

	add	x8, x8, #16
	cmeq	v1.16b, v1.16b, #0
	cmeq	v0.16b, v0.16b, v2.16b

	shrn	v1.8b, v1.8h, #4
	shrn	v0.8b, v0.8h, #4
	fmov	x6, d1
	fmov	x5, d0

	ubfiz	x4, x2, #2, #4			// (x2 - 16) << 2
	lsl	x4, x13, x4			// take first half into account
	orr	x6, x6, x4			// introduce a null byte match

.Lnulfound2:
	sub	x8, x8, #16			// roll back second increment

.Lnulfound:
	mov	x4, x6				// keep unshifted NUL/limit mask

	ubfiz	x7, x9, #2, #4
	lsl	x6, x6, x7			// adjust NUL mask to indices

	orn	x5, x6, x5			// NUL/limit or mismatch in chunk?
	cbnz	x5, .Lmismatch

	/*
	 * (x0) == (x1) and NUL is past the string.
	 * Compare (x1) with the corresponding part
	 * of the other string until the NUL byte.
	 */
	ldr	q0, [x8, x9]
	ldr	q1, [x8, x10]

	cmeq	v1.16b, v0.16b, v1.16b
	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1

	orn	x5, x4, x5

	rbit	x3, x5
	clz	x3, x3
	lsr	x5, x3, #2

	add	x10, x10, x8			// restore x10 pointer
	add	x8, x8, x9			// point to corresponding chunk

	ldrb	w4, [x8, x5]
	ldrb	w5, [x10, x5]
	sub	w0, w4, w5
	ret

	.p2align 4
.Lmismatch2:
	sub	x8, x8, #16			// roll back second increment
.Lmismatch:
	rbit	x3, x5
	clz	x3, x3				// index of mismatch
	lsr	x3, x3, #2
	add	x11, x8, x11

	ldrb	w4, [x8, x3]
	ldrb	w5, [x11, x3]
	sub	w0, w4, w5			// byte difference
	ret

	/*
	 * If (a&0xf) < (b&0xf), we do the same thing but with swapped
	 * operands.  I found that this performs slightly better than
	 * using conditional moves to do the swap branchless.
	 */
	.p2align 4
.Lswapped:
	add	x12, x8, x9			// x9 is negative here
	ldr	q0, [x12, #16]			// x0 data matching chunk at x10+16
	sub	x8, x8, x10			// x8 = offset to aligned x0 data
	add	x11, x8, x9			// x11 = offset to matching x0 data
	add	x2, x2, x9			// rebase limit to x0's alignment
	neg	x9, x9				// make the offset difference positive

	cmeq	v1.16b, v2.16b, #0
	cmeq	v0.16b, v0.16b, v3.16b
	shrn	v1.8b, v1.8h, #4
	shrn	v0.8b, v0.8h, #4
	fmov	x6, d1
	fmov	x5, d0

	add	x10, x10, #32

	lsl	x4, x2, #2
	lsl	x4, x13, x4
	orr	x3, x6, x4			// introduce a null byte match
	cmp	x2, #16
	csel	x6, x3, x6, lo
	cbnz	x6, .Lnulfound2s
	mvn	x5, x5
	cbnz	x5, .Lmismatch2s

	sub	x2, x2, #16
	cmp	x2, #32
	b.lo	.Ltails

	/*
	 * During the main loop, the layout of the two strings is something like:
	 *
	 *          v ------1------ v ------2------ v
	 *     X1:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
	 *     X0: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
	 *
	 * where v indicates the alignment boundaries and corresponding chunks
	 * of the strings have the same letters.  Chunk A has been checked in
	 * the previous iteration.  This iteration, we first check that string
	 * X0 doesn't end within region 2, then we compare chunk B between the
	 * two strings.  As X0 is known not to hold a NUL byte in regions 1
	 * and 2 at this point, this also ensures that X1 has not ended yet.
	 */
	.p2align 4
0:
	ldr	q0, [x10, x11]
	ldr	q1, [x10, x8]
	ldr	q2, [x10]

	cmeq	v1.16b, v1.16b, #0
	cmeq	v0.16b, v0.16b, v2.16b

	shrn	v1.8b, v1.8h, #4
	shrn	v0.8b, v0.8h, #4
	fmov	x6, d1
	fmov	x5, d0
	cbnz	x6, .Lnulfounds
	mvn	x5, x5
	cbnz	x5, .Lmismatchs

	add	x10, x10, #16

	/* main loop unrolled twice */
	ldr	q0, [x10, x11]
	ldr	q1, [x10, x8]
	ldr	q2, [x10]

	add	x10, x10, #16
	cmeq	v1.16b, v1.16b, #0
	cmeq	v0.16b, v0.16b, v2.16b

	shrn	v1.8b, v1.8h, #4
	shrn	v0.8b, v0.8h, #4
	fmov	x6, d1
	fmov	x5, d0
	cbnz	x6, .Lnulfound2s
	mvn	x5, x5
	cbnz	x5, .Lmismatch2s
	sub	x2, x2, #32
	cmp	x2, #32				// at least 32 bytes left?
	b.hs	0b				// then keep looping

.Ltails:
	ldr	q0, [x10, x11]
	ldr	q1, [x10, x8]
	ldr	q2, [x10]

	cmeq	v1.16b, v1.16b, #0
	cmeq	v0.16b, v0.16b, v2.16b

	shrn	v1.8b, v1.8h, #4
	shrn	v0.8b, v0.8h, #4
	fmov	x6, d1
	fmov	x5, d0

	/*
	 * If x2 < 16 then we introduce a NUL byte in the
	 * result from CMEQ to avoid comparing further!
	 */

	lsl	x4, x2, #2
	lsl	x4, x13, x4
	orr	x3, x6, x4			// introduce a null byte match
	cmp	x2, #16
	csel	x6, x3, x6, lo

	cbnz	x6, .Lnulfounds
	mvn	x5, x5
	cbnz	x5, .Lmismatchs

	add	x10, x10, #16

	ldr	q0, [x10, x11]
	ldr	q1, [x10, x8]
	ldr	q2, [x10]

	add	x10, x10, #16
	cmeq	v1.16b, v1.16b, #0
	cmeq	v0.16b, v0.16b, v2.16b

	shrn	v1.8b, v1.8h, #4
	shrn	v0.8b, v0.8h, #4
	fmov	x6, d1
	fmov	x5, d0

	ubfiz	x4, x2, #2, #4			// (x2 - 16) << 2
	lsl	x4, x13, x4
	orr	x6, x6, x4			// introduce a null byte match

.Lnulfound2s:
	sub	x10, x10, #16			// roll back second increment
.Lnulfounds:
	mov	x4, x6				// keep unshifted NUL/limit mask

	ubfiz	x7, x9, #2, #4
	lsl	x6, x6, x7			// adjust NUL mask to indices

	orn	x5, x6, x5

	cbnz	x5, .Lmismatchs

	ldr	q0, [x10, x9]
	ldr	q1, [x10, x8]

	cmeq	v1.16b, v0.16b, v1.16b
	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1

	orn	x5, x4, x5

	rbit	x3, x5
	clz	x3, x3
	lsr	x5, x3, #2

	add	x11, x10, x8			// x11 -> x0 data
	add	x10, x10, x9			// x10 -> corresponding x1 data

	ldrb	w4, [x10, x5]
	ldrb	w5, [x11, x5]
	sub	w0, w5, w4			// operands are swapped here
	ret

	.p2align 4
.Lmismatch2s:
	sub	x10, x10, #16			// roll back second increment
.Lmismatchs:
	rbit	x3, x5
	clz	x3, x3				// index of mismatch
	lsr	x3, x3, #2
	add	x11, x10, x11

	ldrb	w4, [x10, x3]
	ldrb	w5, [x11, x3]
	sub	w0, w5, w4			// operands are swapped here
	ret

	.p2align 4
.Lempty:
	mov	x0, #0				// n == 0: strings compare equal
	ret

END(__strncmp)

	.section .rodata
	.p2align 4
	/*
	 * TBL index table: loading 16 bytes at shift_data + k yields the
	 * indices k, k+1, ..., 15 followed by 0xff entries, which TBL turns
	 * into zero bytes.  This shifts an aligned chunk down by k bytes.
	 */
shift_data:
	.byte	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	.fill	16, 1, -1
	.size	shift_data, .-shift_data