/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
*/

#include <machine/asm.h>

	.weak	memccpy
	.set	memccpy, __memccpy
	.text

/*
 * void *memccpy(void *dst, const void *src, int c, size_t len)
 *
 * AArch64 (AAPCS64) Advanced SIMD implementation.  Copies bytes from src
 * to dst up to and including the first occurrence of c, but never more
 * than len bytes.
 *
 * In:	x0 = dst, x1 = src, w2 = c, x3 = len
 * Out:	x0 = pointer just past the copy of c in dst, or NULL if c does
 *	not occur within the first len bytes (len bytes are copied then)
 *
 * The source is scanned in 16 byte chunks aligned to 16 bytes.  Each
 * chunk is compared against c with cmeq and the result is narrowed with
 * shrn #4 into a 64 bit mask holding one nibble per source byte, so a
 * bit index obtained with rbit+clz divided by 4 is a byte index.
 *
 * Register roles on the common path:
 *	v0	c broadcast to all 16 lanes
 *	x6	0xf, a one-byte nibble used to plant the length limit
 *	x8	mask of chunk bytes belonging to the string, later the
 *		index of the last byte to copy
 *	x9	original dst
 *	x10	src rounded down to 16 bytes, later the chunk cursor
 *	x11	src & 15
 *	x12	(src & 15) + len - 1 - 32
 *
 * Only caller-saved registers are used; x18 (platform register) is
 * deliberately left untouched.
 */
ENTRY(__memccpy)
	subs	x3, x3, #1		// x3 = len - 1
	b.lo	.L0			// len == 0: nothing to do

	dup	v0.16b, w2		// broadcast c

	mov	x9, x0			// stash copy of dst pointer
	bic	x10, x1, #0xf		// src aligned
	and	x11, x1, #0xf		// src offset

	ldr	q1, [x10]		// aligned chunk holding the head
	cmeq	v1.16b, v1.16b, v0.16b	// bytewise compare against c

	mov	x8, #-1			// prepare a 0xfff..fff register
	mov	x6, #0xf

	lsl	x12, x11, #2		// 4 mask bits per byte
	lsl	x8, x8, x12		// mask of bytes in the string

	shrn	v1.8b, v1.8h, #4	// 128 bit compare result -> nibble mask
	fmov	x5, d1

	sub	x12, x11, #32
	adds	x12, x12, x3		// src offset + len - 1 - 32
	b.cc	.Lrunt			// buffer ends within the first two chunks

	ands	x8, x8, x5		// ignore matches before the string
	b.eq	0f

	/* match in first chunk */
	rbit	x8, x8
	clz	x8, x8			// bit index of first match
	lsr	x8, x8, #2		// byte index within the chunk

	sub	x8, x8, x11		// ... from beginning of the string

	add	x0, x0, x8
	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt
	add	x0, x0, #1		// return value: one past the match

	b	.L0816			// cnt < 16 here

0:
	ldr	q3, [x10, #16]		// load second string chunk
	ldr	q2, [x1]		// load true head
	cmeq	v1.16b, v3.16b, v0.16b	// char found in second chunk?

	/* process second chunk */
	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1

	cbz	x5, 0f

	/* match in second chunk */
	rbit	x8, x5
	clz	x8, x8			// bit index of first match
	lsr	x8, x8, #2		// byte index within the chunk

	sub	x11, x11, #16		// src offset relative to second chunk
	sub	x8, x8, x11		// adjust for alignment offset
	add	x0, x0, x8		// return value
	add	x0, x0, #1

	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt
	b	.L1732

0:
	/* string didn't end in second chunk and neither did buffer */
	ldr	q1, [x10, #32]		// load next string chunk
	str	q2, [x0]		// deposit head into buffer
	sub	x0, x0, x11		// dst position matching aligned src
	mov	x3, x12			// bytes left past the second chunk, minus 1
	str	q3, [x0, #16]		// deposit second chunk

	add	x10, x10, #32		// advance src
	add	x0, x0, #32		// advance dst
	subs	x3, x3, #16		// enough left for another round?
	b.lo	1f

	/* main loop unrolled twice */
	.p2align 4
0:
	cmeq	v2.16b, v1.16b, v0.16b	// char found in current chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2

	cbnz	x5, 3f

	str	q1, [x0]
	ldr	q1, [x10, #16]		// load next chunk

	cmp	x3, #16			// more than a full chunk left?
	b.lo	2f

	add	x10, x10, #32		// advance pointers
	add	x0, x0, #32

	cmeq	v2.16b, v1.16b, v0.16b	// char found in next chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2
	cbnz	x5, 4f			// process chunk if match

	str	q1, [x0, #-16]
	ldr	q1, [x10]		// load next chunk

	subs	x3, x3, #32
	b.hs	0b

1:
	sub	x10, x10, #16		// undo second advancement
	add	x3, x3, #16
	sub	x0, x0, #16

	/* 1--16 bytes left in the buffer but string has not ended yet */
2:
	cmeq	v2.16b, v1.16b, v0.16b	// char found in final chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x4, d2			// x4 = genuine match mask

	lsl	x5, x3, #2		// shift 0xf to the limits position
	lsl	x5, x6, x5
	orr	x8, x4, x5		// insert match in mask at limit

	rbit	x8, x8			// simulate x86 tzcnt
	clz	x7, x8			// bit index of first match or limit
	lsr	x8, x7, #2		// byte index within the chunk

	lsl	x5, x6, x7		// simulate x86 bt with shifted 0xf

	add	x8, x8, #1
	add	x0, x0, x8

	ldr	q1, [x10, x8]		// load tail (16 bytes ending at the stop byte)
	str	q1, [x0]		// store tail

	add	x0, x0, #16		// one past the last byte written

	tst	x4, x5			// terminator encountered inside buffer?
	csel	x0, x0, xzr, ne		// if yes, return pointer, else NULL
	ret

4:
	sub	x10, x10, #16		// undo second advancement
	sub	x0, x0, #16		// undo second advancement

3:
	rbit	x8, x5
	clz	x8, x8			// bit index of first match
	lsr	x3, x8, #2		// byte index within the chunk

	add	x0, x0, x3		// dst position of the match
	add	x10, x10, x3		// src position of the match
	ldr	q1, [x10, #-15]		// copy the 16 bytes ending at the match
	str	q1, [x0, #-15]
	add	x0, x0, #1		// return value: one past the match
	ret

	/* buffer ends within the first two aligned chunks */
.Lrunt:
	add	x13, x11, x3		// offset of last buffer byte from x10

	mov	x7, x5			// keep a copy of original match mask

	lsl	x4, x12, #2		// shift 0xf to the limits position
	lsl	x4, x6, x4

	cmp	x13, #16		// dont induce match if limit >=16
	csel	x4, x4, xzr, lo
	orr	x5, x5, x4		// insert match in mask at limit

	ands	x8, x8, x5		// if match always fall through
	b.ne	0f

	ldr	q4, [x10, #16]		// load second string chunk
	cmeq	v1.16b, v4.16b, v0.16b	// char found in second chunk?

	/* process second chunk */
	shrn	v1.8b, v1.8h, #4
	fmov	x8, d1
	mov	x7, x8			// genuine match mask of second chunk

	lsl	x4, x12, #2
	lsl	x4, x6, x4
	orr	x8, x8, x4		// induce match in upper bytes of mask

	rbit	x8, x8
	clz	x4, x8			// bit index of first match or limit
	lsr	x8, x4, #2		// byte index within the chunk
	add	x8, x8, #16		// no match in first chunk
	b	1f

0:
	rbit	x8, x8
	clz	x4, x8			// bit index of first match or limit
	lsr	x8, x4, #2		// byte index within the chunk
1:
	add	x0, x0, x8		// return value if terminator found
	sub	x0, x0, x11
	add	x0, x0, #1

	/* check if we encountered a match or the limit first */
	lsl	x5, x6, x4
	ands	x7, x7, x5		// was the terminator present?
	csel	x0, xzr, x0, eq		// return value based on what we matched

	sub	x8, x8, x11		// index of last byte to copy
	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt

	/*
	 * Copy tails.  On entry x8 is the index of the last byte to copy
	 * (x8 + 1 bytes in total), x4 = dst + x8, x5 = src + x8.  Each case
	 * copies an overlapping head and tail of the same width.
	 */

	/* copy 17-32 bytes */
.L1732:
	cmp	x8, #16
	b.lo	.L0816
	add	x5, x5, #1		// ldp offsets are powers of 2
	add	x4, x4, #1
	ldp	x16, x17, [x1]
	ldp	x12, x13, [x5, #-16]
	stp	x16, x17, [x9]
	stp	x12, x13, [x4, #-16]
	ret

	/* Copy 9-16 bytes */
.L0816:
	tbz	x8, #3, .L0407
	ldr	x16, [x1]
	ldr	x17, [x5, #-7]
	str	x16, [x9]
	str	x17, [x4, #-7]
	ret

	/* Copy 4-8 bytes */
	.p2align 4
.L0407:
	cmp	x8, #3
	b.lo	.L0103
	ldr	w16, [x1]
	ldr	w17, [x5, #-3]
	str	w16, [x9]
	str	w17, [x4, #-3]
	ret

	/* Copy 1-3 bytes */
	.p2align 4
.L0103:
	lsr	x14, x8, #1		// index of the middle byte
	ldrb	w16, [x1]
	ldrb	w15, [x5]
	ldrb	w17, [x1, x14]
	strb	w16, [x9]
	strb	w17, [x9, x14]
	strb	w15, [x4]
	ret

	/* len == 0 */
.L0:
	mov	x0, xzr			// return NULL
	ret

END(__memccpy)