/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
*/

#include <machine/asm.h>

	.weak	strlcpy
	.set	strlcpy, __strlcpy
	.text

/*
 * size_t strlcpy(char *dst, const char *src, size_t dstsize)
 *
 * In:	x0 = dst, x1 = src, x2 = dstsize
 * Out:	x0 = length of src (dstsize == 0 tail-calls strlen(src))
 *
 * Register roles once past the dstsize == 0 check:
 *	x2	copy limit, starts out as dstsize - 1
 *	x9	original dst pointer
 *	x10	src cursor, rounded down to a 16 byte boundary
 *	x11	misalignment of src (src & 0xf)
 *	x5, x7	NUL syndromes (see below)
 *
 * NUL detection: cmeq sets every byte that equals NUL to 0xff, shrn #4
 * then narrows the 128 bit result to 64 bits so that each source byte
 * is represented by one nibble.  rbit + clz counts the trailing zero
 * bits of that syndrome and lsr #2 turns the bit count into a byte
 * index.
 *
 * Only caller-saved registers are used (x0-x17, v1-v3); x18 is left
 * alone as it is the AAPCS64 platform register.
 */
ENTRY(__strlcpy)
	subs	x2, x2, #1		// x2 = max number of chars to copy
	b.lo	.L0			// dstsize == 0: just return strlen(src)

	mov	x9, x0			// stash copy of dst pointer
	bic	x10, x1, #0xf		// src aligned
	and	x11, x1, #0xf		// src offset

	ldr	q1, [x10]		// aligned load of the chunk holding src[0]
	cmeq	v1.16b, v1.16b, #0	// NUL found in head?

	mov	x8, #-1			// fill register with 0xfff..fff
	lsl	x12, x11, #2		// 4 syndrome bits per byte
	lsl	x8, x8, x12		// mask of bytes in the string

	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1

	ands	x5, x5, x8		// ignore matches in front of src
	b.ne	.Lhead_nul

	ldr	q3, [x10, #16]		// load second string chunk
	ldr	q2, [x1]		// load true head
	mov	x8, #32
	sub	x8, x8, x11		// x8 = string bytes in the first two chunks

	cmeq	v1.16b, v3.16b, #0	// NUL found in second chunk?

	subs	x2, x2, x8
	b.ls	.Lhead_buf_end		// limit lies within the first two chunks

	/* process second chunk */
	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1
	cbnz	x5, .Lsecond_nul

	/* string didn't end in second chunk and neither did buffer */
	ldr	q1, [x10, #32]		// load next string chunk
	str	q2, [x0]		// deposit head into buffer
	sub	x0, x0, x11		// adjust x0
	str	q3, [x0, #16]		// deposit second chunk
	add	x10, x10, #32		// advance src
	add	x0, x0, #32		// advance dst
	subs	x2, x2, #16		// enough left for another round?
	b.ls	1f

	/* main loop unrolled twice */
	.p2align 4
0:
	cmeq	v2.16b, v1.16b, #0	// NUL found in current chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2

	cbnz	x5, 3f

	str	q1, [x0]
	ldr	q1, [x10, #16]		// load next chunk

	cmp	x2, #16			// more than a full chunk left?
	b.ls	2f

	add	x10, x10, #32		// advance pointers
	add	x0, x0, #32

	cmeq	v2.16b, v1.16b, #0	// NUL found in current chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2
	cbnz	x5, 4f			// process chunk if match

	str	q1, [x0, #-16]
	ldr	q1, [x10]		// load next chunk

	subs	x2, x2, #32
	b.hi	0b

1:
	sub	x10, x10, #16		// undo second advancement
	add	x2, x2, #16
	sub	x0, x0, #16

	/*
	 * 1--16 bytes left in the buffer but string has not ended yet.
	 * Here the chunk held in q1 was loaded from x10 + 16.
	 */
2:
	cmeq	v2.16b, v1.16b, #0	// NUL found in current chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x4, d2

	mov	x6, #0xf
	mov	x7, x4			// keep the real syndrome for the length scan

	lsl	x5, x2, #2		// shift 0xf to the limits position
	lsl	x5, x6, x5
	cmp	x2, #16			// don't induce match if limit >= 16
	csel	x5, x5, xzr, lo
	orr	x8, x4, x5		// treat limit as if terminator present

	rbit	x8, x8			// simulate x86 tzcnt
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2

	add	x0, x0, x8

	ldr	q1, [x10, x8]		// load tail
	str	q1, [x0]		// store tail
	strb	wzr, [x0, #16]		// terminate dst

	/* continue to find the end of the string */
	cbnz	x7, 1f			// NUL already seen in this chunk

	/* we opt for a simpler strlen than the one in libc as the
	 * cmeq, shrn approach is faster for shorter strings.
	 */
	.p2align 4
0:
	ldr	q1, [x10, #32]
	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	cbnz	x7, 2f

	ldr	q1, [x10, #48]
	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	add	x10, x10, #32
	cbz	x7, 0b

1:	sub	x10, x10, #16
2:	rbit	x8, x7
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2

	sub	x10, x10, x1		// return (x10 + 32 + index) - src
	add	x0, x10, #32
	add	x0, x0, x8

	ret

4:
	sub	x10, x10, #16		// undo second advancement
	sub	x0, x0, #16		// undo second advancement

	/* string has ended but buffer has not */
3:
	rbit	x8, x5
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2

	add	x0, x0, x8		// x0 = dst position of the NUL
	add	x10, x10, x8		// x10 = src position of the NUL

	ldr	q1, [x10, #-15]		// last 16 bytes of src, NUL included
	str	q1, [x0, #-15]
	sub	x0, x10, x1		// return strlen(src)

	ret

	/*
	 * The copy limit lies within the first two chunks and the first
	 * chunk holds no NUL.  v1 is the cmeq result for the second chunk.
	 */
.Lhead_buf_end:
	shrn	v1.8b, v1.8h, #4
	fmov	x8, d1

	add	x2, x2, #32		// restore limit (now relative to x10)

	mov	x7, x8			// keep the syndrome for the length scan

	cmp	x2, #16			// should we induce a match or not
	b.lo	0f

	rbit	x8, x8
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2
	add	x8, x8, #16

	cmp	x8, x2
	csel	x8, x8, x2, lo		// copy min(buflen, srclen) bytes
	b	1f
0:
	mov	x8, x2			// limit ends inside the first chunk
1:

	sub	x8, x8, x11		// x8 = number of bytes to copy
	strb	wzr, [x9, x8]		// terminate dst

	/* continue to find the end of the string */
	cbnz	x7, 1f			// NUL already seen in second chunk

	/* we opt for a simpler strlen than the one in libc as the
	 * cmeq, shrn approach is faster for shorter strings.
	 */
	.p2align 4
0:
	ldr	q1, [x10, #32]
	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	cbnz	x7, 2f

	ldr	q1, [x10, #48]
	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	add	x10, x10, #32
	cbz	x7, 0b

1:	sub	x10, x10, #16
2:	rbit	x6, x7
	clz	x6, x6			// index of mismatch
	lsr	x6, x6, #2

	sub	x10, x10, x1		// return (x10 + 32 + index) - src
	add	x0, x10, #32
	add	x0, x0, x6

	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt

	b	.L1732

	/* NUL found in the second chunk, x8 = 32 - src offset */
.Lsecond_nul:
	add	x2, x2, x8		// restore limit

	rbit	x8, x5
	clz	x8, x8			// index of mismatch
	lsr	x5, x8, #2

	sub	x8, x11, #16
	sub	x0, x5, x8		// string length

	cmp	x0, x2			// did we match or hit limit first?
	csel	x8, x2, x0, hi		// x8 = min(length, limit)

	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt

	strb	wzr, [x4]		// terminate dst

	/*
	 * Short copies of x8 bytes: x4 = dst + x8, x5 = src + x8 and the
	 * terminator is already in place.  Each size class copies the
	 * first and the last N bytes with two possibly overlapping moves.
	 */

	/* copy 16-32 bytes */
.L1732:
	cmp	x8, #16
	b.lo	.L0816
	ldp	x16, x17, [x1]
	ldp	x12, x1, [x5, #-16]	// src pointer is dead from here on
	stp	x16, x17, [x9]
	stp	x12, x1, [x4, #-16]
	ret

	/* NUL found in the first chunk */
.Lhead_nul:
	rbit	x8, x5
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2

	sub	x0, x8, x11		// string length
	cmp	x0, x2			// did we match or hit limit first?
	csel	x8, x2, x0, hi		// x8 = min(length, limit)

	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt
	strb	wzr, [x4]		// terminate dst

	/* Copy 8-15 bytes */
.L0816:
	tbz	x8, #3, .L0407
	ldr	x16, [x1]
	ldr	x17, [x5, #-8]
	str	x16, [x9]
	str	x17, [x4, #-8]
	ret

	/* Copy 4-7 bytes */
	.p2align 4
.L0407:
	cmp	x8, #3
	b.ls	.L0203
	ldr	w16, [x1]
	ldr	w17, [x5, #-4]		// x17 rather than the platform register x18
	str	w16, [x9]
	str	w17, [x4, #-4]
	ret

	/* Copy 2-3 bytes */
.L0203:
	tbz	x8, #1, .L0001
	ldrh	w16, [x1]
	ldrh	w17, [x5, #-2]
	strh	w16, [x9]
	strh	w17, [x4, #-2]
	ret

	/*
	 * Copy 0-1 bytes: the first byte is copied unconditionally and the
	 * terminator is stored again afterwards, which overwrites that byte
	 * when x8 == 0.
	 */
.L0001:
	ldrb	w16, [x1]
	strb	w16, [x9]
	strb	wzr, [x4]
	ret

	/* dstsize == 0: nothing may be written, return strlen(src) */
.L0:
	mov	x0, x1
	b	strlen			// tail call, strlen returns to our caller
END(__strlcpy)