/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
*/

#include <machine/asm.h>

	.weak	memccpy
	.set	memccpy, __memccpy
	.text

/*
 * void *memccpy(void *dst, const void *src, int c, size_t len)
 *
 * Copy bytes from src to dst until either the byte c has been copied or
 * len bytes have been copied, whichever comes first.
 *
 * In:	x0 = dst, x1 = src, w2 = c (only the low byte is compared),
 *	x3 = len
 * Out:	x0 = pointer to the byte in dst following the copied c,
 *	     or NULL (0) if c was not found within the first len bytes
 *	     (including len == 0).
 *
 * Register roles inside the routine:
 *	v0	c broadcast to all 16 byte lanes
 *	x3	len - 1, later the remaining-limit counter of the main loop
 *	x6	constant 0xf, one "nibble" of the match mask
 *	x9	original dst
 *	x10	src rounded down to a 16 byte boundary (chunk cursor)
 *	x11	offset of src within its 16 byte chunk
 *	x12	x11 + (len - 1) - 32
 *	x16/x17	temporaries for the short copies
 *
 * Match masks: cmeq yields 0xff in every matching byte lane and
 * shrn #4 narrows that to a 64 bit value holding 4 bits per source byte,
 * so (bit index of lowest set bit) >> 2 is the index of the first match.
 *
 * Source data is scanned with 16 byte loads from 16 byte aligned
 * addresses, so the first load may start before src; the leading bytes
 * are masked off.  Short results are copied with overlapping head/tail
 * loads and stores.
 *
 * x18 is intentionally not used: AAPCS64 reserves it as the platform
 * register.
 */
ENTRY(__memccpy)
	subs	x3, x3, #1		// x3 = len - 1
	b.lo	.L0			// len == 0: nothing to do, return NULL

	dup	v0.16b, w2		// broadcast search char

	mov	x9, x0			// stash copy of dst pointer
	bic	x10, x1, #0xf		// src aligned
	and	x11, x1, #0xf		// src offset

	ldr	q1, [x10]		// first aligned chunk (may start before src)
	cmeq	v1.16b, v1.16b, v0.16b	// bytewise compare against search char

	mov	x8, #-1			// prepare a 0xfff..fff register
	mov	x6, #0xf

	lsl	x12, x11, #2		// 4 mask bits per byte
	lsl	x8, x8, x12		// mask of bytes in the string

	shrn	v1.8b, v1.8h, #4	// narrow to 4 bits per byte
	fmov	x5, d1			// x5 = match mask of first chunk

	sub	x12, x11, #32
	adds	x12, x12, x3		// distance from alignment boundary - 32
	b.cc	.Lrunt			// buffer ends within the first two chunks

	ands	x8, x8, x5		// drop matches located before src
	b.eq	0f

	/* match in first chunk */
	rbit	x8, x8
	clz	x8, x8			// bit index of first match
	lsr	x8, x8, #2		// ... as a byte index within the chunk

	sub	x8, x8, x11		// ... from beginning of the string

	add	x0, x0, x8
	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt
	add	x0, x0, #1		// return value: one past the match

	b	.L0816

0:
	ldr	q3, [x10, #16]		// load second string chunk
	ldr	q2, [x1]		// load true head
	cmeq	v1.16b, v3.16b, v0.16b	// char found in second chunk?

	/* process second chunk */
	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1

	cbz	x5, 0f

	/* match in second chunk */
	rbit	x8, x5
	clz	x8, x8			// bit index of first match
	lsr	x8, x8, #2		// ... as a byte index within the chunk

	sub	x11, x11, #16
	sub	x8, x8, x11		// adjust for alignment offset
	add	x0, x0, x8		// return value
	add	x0, x0, #1

	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt
	b	.L1732

0:
	/* string didn't end in second chunk and neither did buffer */
	ldr	q1, [x10, #32]		// load next string chunk
	str	q2, [x0]		// deposit head into buffer
	sub	x0, x0, x11		// adjust x0 so it tracks the aligned src
	mov	x3, x12			// x3 = index of last buffer byte past chunk 2
	str	q3, [x0, #16]		// deposit second chunk

	add	x10, x10, #32		// advance src
	add	x0, x0, #32		// advance dst
	subs	x3, x3, #16		// enough left for another round?
	b.lo	1f

	/* main loop unrolled twice */
	.p2align 4
0:
	cmeq	v2.16b, v1.16b, v0.16b	// char found in this chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2

	cbnz	x5, 3f

	str	q1, [x0]
	ldr	q1, [x10, #16]		// load next chunk

	cmp	x3, #16			// more than a full chunk left?
	b.lo	2f

	add	x10, x10, #32		// advance pointers
	add	x0, x0, #32

	cmeq	v2.16b, v1.16b, v0.16b	// char found in this chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2
	cbnz	x5, 4f			// process chunk if match

	str	q1, [x0, #-16]
	ldr	q1, [x10]		// load next chunk

	subs	x3, x3, #32
	b.hs	0b

1:
	sub	x10, x10, #16		// undo second advancement
	add	x3, x3, #16
	sub	x0, x0, #16

	/* 1--16 bytes left in the buffer but string has not ended yet */
2:
	cmeq	v2.16b, v1.16b, v0.16b	// char found in last chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x4, d2			// x4 = genuine match mask

	lsl	x5, x3, #2		// shift 0xf to the limits position
	lsl	x5, x6, x5
	orr	x8, x4, x5		// insert match in mask at limit

	rbit	x8, x8			// simulate x86 tzcnt
	clz	x7, x8			// bit index of match or limit
	lsr	x8, x7, #2		// ... as a byte index

	lsl	x5, x6, x7		// simulate x86 bt with shifted 0xf

	add	x8, x8, #1
	add	x0, x0, x8

	ldr	q1, [x10, x8]		// load tail (16 bytes ending at the stop byte)
	str	q1, [x0]		// store tail

	add	x0, x0, #16		// one past the last byte written

	tst	x4, x5			// terminator encountered inside buffer?
	csel	x0, x0, xzr, ne		// if yes, return pointer, else NULL
	ret

4:
	sub	x10, x10, #16		// undo second advancement
	sub	x0, x0, #16		// undo second advancement

3:
	rbit	x8, x5
	clz	x8, x8			// bit index of first match
	lsr	x3, x8, #2		// ... as a byte index within the chunk

	add	x0, x0, x3		// dst address of the matching byte
	add	x10, x10, x3		// src address of the matching byte
	ldr	q1, [x10, #-15]		// 16 bytes ending at the match
	str	q1, [x0, #-15]
	add	x0, x0, #1		// return value: one past the match
	ret

	/* buffer ends within the first two aligned chunks */
.Lrunt:
	add	x13, x11, x3		// chunk-relative index of last buffer byte

	mov	x7, x5			// keep a copy of original match mask

	lsl	x4, x12, #2		// shift 0xf to the limits position
	lsl	x4, x6, x4

	cmp	x13, #16		// don't induce match if limit >= 16
	csel	x4, x4, xzr, lo
	orr	x5, x5, x4		// insert match in mask at limit

	ands	x8, x8, x5		// if match always fall through
	b.ne	0f

	ldr	q4, [x10, #16]		// load second string chunk
	cmeq	v1.16b, v4.16b, v0.16b	// char found in second chunk?

	/* process second chunk */
	shrn	v1.8b, v1.8h, #4
	fmov	x8, d1
	mov	x7, x8			// genuine matches of the second chunk

	lsl	x4, x12, #2
	lsl	x4, x6, x4
	orr	x8, x8, x4		// induce match in upper bytes of mask

	rbit	x8, x8
	clz	x4, x8			// bit index of match or limit
	lsr	x8, x4, #2		// ... as a byte index
	add	x8, x8, #16		// no match in first chunk
	b	1f

0:
	rbit	x8, x8
	clz	x4, x8			// bit index of match or limit
	lsr	x8, x4, #2		// ... as a byte index
1:
	add	x0, x0, x8		// return value if terminator found
	sub	x0, x0, x11
	add	x0, x0, #1

	/* check if we encountered a match or the limit first */
	lsl	x5, x6, x4
	ands	x7, x7, x5		// was the terminator present?
	csel	x0, xzr, x0, eq		// return value based on what we matched

	sub	x8, x8, x11		// x8 = number of bytes to copy - 1
	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt

	/* copy 17-32 bytes */
.L1732:
	cmp	x8, #16
	b.lo	.L0816
	add	x5, x5, #1		// ldp offsets are powers of 2
	add	x4, x4, #1
	ldp	x16, x17, [x1]		// first 16 bytes
	ldp	x12, x13, [x5, #-16]	// last 16 bytes (may overlap the first)
	stp	x16, x17, [x9]
	stp	x12, x13, [x4, #-16]
	ret

	/* Copy 9-16 bytes */
.L0816:
	tbz	x8, #3, .L0407
	ldr	x16, [x1]		// first 8 bytes
	ldr	x17, [x5, #-7]		// last 8 bytes (may overlap the first)
	str	x16, [x9]
	str	x17, [x4, #-7]
	ret

	/* Copy 4-8 bytes */
	.p2align 4
.L0407:
	cmp	x8, #3
	b.lo	.L0103
	ldr	w16, [x1]		// first 4 bytes
	ldr	w17, [x5, #-3]		// last 4 bytes (may overlap the first)
	str	w16, [x9]
	str	w17, [x4, #-3]
	ret

	/* Copy 1-3 bytes */
	.p2align 4
.L0103:
	lsr	x14, x8, #1		// index of the middle byte (0 or 1)
	ldrb	w16, [x1]		// first byte
	ldrb	w15, [x5]		// last byte
	ldrb	w17, [x1, x14]		// middle byte
	strb	w16, [x9]
	strb	w17, [x9, x14]
	strb	w15, [x4]
	ret

	/* len == 0: return NULL */
.L0:
	mov	x0, xzr
	ret

END(__memccpy)