/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
*/

#include <machine/asm.h>

	.weak	strlcpy
	.set	strlcpy, __strlcpy
	.text

/*
 * size_t strlcpy(char *dst, const char *src, size_t dstsize)
 *
 * Target: AArch64 (A64) with Advanced SIMD, AAPCS64 calling convention.
 *
 * In:	x0 = dst, x1 = src, x2 = dstsize
 * Out:	x0 = length of src, not counting the terminating NUL
 *
 * Copies at most dstsize - 1 bytes of src to dst and NUL terminates
 * dst.  If dstsize is 0 nothing is written and the call is forwarded
 * to strlen(src).
 *
 * Register roles once the head has been examined:
 *	x9  = original dst pointer
 *	x10 = src rounded down to a 16 byte boundary (chunk pointer)
 *	x11 = src & 0xf (offset of src inside its first chunk)
 *	x2  = remaining limit, rebased as the copy proceeds
 *
 * NUL detection: cmeq yields 0xff for each NUL byte, shrn by 4 packs
 * that into a 64 bit mask with 4 bits per byte, and rbit + clz + lsr 2
 * turns the mask into the index of the first NUL inside the chunk.
 *
 * The first load is taken from the aligned chunk containing src, so it
 * may read bytes that precede src inside that 16 byte chunk; they are
 * masked out before the NUL test.
 *
 * Clobbers: x0-x2, x4-x12, x16, x17, v1-v3 and the condition flags.
 * The platform register x18 is deliberately left untouched.
 * Leaf function; the stack is not used.
 */
ENTRY(__strlcpy)
	subs	x2, x2, #1		// limit = dstsize - 1 (room for the NUL)
	b.lo	.L0			// dstsize == 0: only the length is wanted

	mov	x9, x0			// stash copy of dst pointer
	bic	x10, x1, #0xf		// src aligned
	and	x11, x1, #0xf		// src offset

	ldr	q1, [x10]
	cmeq	v1.16b, v1.16b, #0	// NUL found in head?

	mov	x8, #-1			// fill register with 0xfff..fff
	lsl	x12, x11, #2		// 4 mask bits per byte
	lsl	x8, x8, x12		// mask of bytes in the string

	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1

	ands	x5, x5, x8		// ignore matches in front of src
	b.ne	.Lhead_nul

	ldr	q3, [x10, #16]		// load second string chunk
	ldr	q2, [x1]		// load true head
	mov	x8, #32
	sub	x8, x8, x11		// bytes of string in the first two chunks

	cmeq	v1.16b, v3.16b, #0	// NUL found in second chunk?

	subs	x2, x2, x8
	b.ls	.Lhead_buf_end		// buffer ends within the first two chunks

	/* process second chunk */
	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1
	cbnz	x5, .Lsecond_nul

	/* string did not end in second chunk and neither did buffer */
	ldr	q1, [x10, #32]		// load next string chunk
	str	q2, [x0]		// deposit head into buffer
	sub	x0, x0, x11		// adjust x0
	str	q3, [x0, #16]		// deposit second chunk
	add	x10, x10, #32		// advance src
	add	x0, x0, #32		// advance dst
	subs	x2, x2, #16		// enough left for another round?
	b.ls	1f

	/* main loop unrolled twice */
	.p2align 4
0:
	cmeq	v2.16b, v1.16b, #0	// NUL found in current chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2

	cbnz	x5, 3f

	str	q1, [x0]
	ldr	q1, [x10, #16]		// load next chunk

	cmp	x2, #16			// more than a full chunk left?
	b.ls	2f

	add	x10, x10, #32		// advance pointers
	add	x0, x0, #32

	cmeq	v2.16b, v1.16b, #0	// NUL found in next chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2
	cbnz	x5, 4f			// process chunk if match

	str	q1, [x0, #-16]
	ldr	q1, [x10]		// load next chunk

	subs	x2, x2, #32
	b.hi	0b

1:
	sub	x10, x10, #16		// undo second advancement
	add	x2, x2, #16
	sub	x0, x0, #16

	/* 1--16 bytes left in the buffer but string has not ended yet */
2:
	cmeq	v2.16b, v1.16b, #0	// NUL found in last chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x4, d2

	mov	x6, #0xf
	mov	x7, x4			// keep the real NUL mask for the length scan

	lsl	x5, x2, #2		// shift 0xf to the limits position
	lsl	x5, x6, x5
	cmp	x2, #16			// do not induce match if limit >= 16
	csel	x5, x5, xzr, lo
	orr	x8, x4, x5		// treat limit as if terminator present

	rbit	x8, x8			// simulate x86 tzcnt
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2

	add	x0, x0, x8

	ldr	q1, [x10, x8]		// load tail
	str	q1, [x0]		// store tail
	strb	wzr, [x0, #16]		// terminate dst

	/* continue to find the end of the string */
	cbnz	x7, 1f

	/* we opt for a simpler strlen than the one in libc as the
	 * cmeq, shrn approach is faster for shorter strings.
	 */
	.p2align 4
0:
	ldr	q1, [x10, #32]
	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	cbnz	x7, 2f

	ldr	q1, [x10, #48]
	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	add	x10, x10, #32
	cbz	x7, 0b

1:	sub	x10, x10, #16
2:	rbit	x8, x7
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2

	sub	x10, x10, x1		// chunk offset relative to src
	add	x0, x10, #32
	add	x0, x0, x8		// return value: length of src

	ret

4:
	sub	x10, x10, #16		// undo second advancement
	sub	x0, x0, #16		// undo second advancement

	/* string has ended but buffer has not */
3:
	rbit	x8, x5
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2

	add	x0, x0, x8		// dst position of the NUL
	add	x10, x10, x8		// src position of the NUL

	ldr	q1, [x10, #-15]		// last 16 bytes, NUL included
	str	q1, [x0, #-15]
	sub	x0, x10, x1		// return value: length of src

	ret

.Lhead_buf_end:
	shrn	v1.8b, v1.8h, #4
	fmov	x8, d1

	add	x2, x2, #32		// restore limit

	mov	x7, x8			// keep NUL mask of the second chunk

	cmp	x2, #16			// should we induce a match or not
	b.lo	0f

	rbit	x8, x8
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2
	add	x8, x8, #16

	cmp	x8, x2
	csel	x8, x8, x2, lo		// copy min(buflen, srclen) bytes
	b	1f
0:
	mov	x8, x2			// buffer ends inside the head chunk
1:

	sub	x8, x8, x11		// number of bytes to copy
	strb	wzr, [x9, x8]		// terminate dst

	/* continue to find the end of the string */
	cbnz	x7, 1f

	/* we opt for a simpler strlen than the one in libc as the
	 * cmeq, shrn approach is faster for shorter strings.
	 */
	.p2align 4
0:
	ldr	q1, [x10, #32]
	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	cbnz	x7, 2f

	ldr	q1, [x10, #48]
	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
	shrn	v1.8b, v1.8h, #4
	fmov	x7, d1
	add	x10, x10, #32
	cbz	x7, 0b

1:	sub	x10, x10, #16
2:	rbit	x6, x7
	clz	x6, x6			// index of mismatch
	lsr	x6, x6, #2

	sub	x10, x10, x1		// chunk offset relative to src
	add	x0, x10, #32
	add	x0, x0, x6		// return value: length of src

	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt

	b	.L1732

.Lsecond_nul:
	add	x2, x2, x8		// restore limit

	rbit	x8, x5
	clz	x8, x8			// index of mismatch
	lsr	x5, x8, #2

	sub	x8, x11, #16
	sub	x0, x5, x8		// string length

	cmp	x0, x2			// did we match or hit limit first?
	csel	x8, x2, x0, hi

	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt

	strb	wzr, [x4]		// terminate dst

	/*
	 * Tail copies: x8 = byte count, x1/x9 = start of src/dst,
	 * x5/x4 = one past the last byte of src/dst.  Each size class
	 * copies one block from the start and one ending at the end;
	 * the two may overlap.
	 */

	/* copy 16-32 bytes */
.L1732:
	cmp	x8, #16
	b.lo	.L0816
	ldp	x16, x17, [x1]
	ldp	x12, x1, [x5, #-16]	// x1 is no longer needed, reuse it
	stp	x16, x17, [x9]
	stp	x12, x1, [x4, #-16]
	ret

.Lhead_nul:
	rbit	x8, x5
	clz	x8, x8			// index of mismatch
	lsr	x8, x8, #2

	sub	x0, x8, x11		// string length
	cmp	x0, x2			// did we match or hit limit first?
	csel	x8, x2, x0, hi

	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt
	strb	wzr, [x4]		// terminate dst

	/* copy 8-15 bytes */
.L0816:
	tbz	x8, #3, .L0407
	ldr	x16, [x1]
	ldr	x17, [x5, #-8]
	str	x16, [x9]
	str	x17, [x4, #-8]
	ret

	/* copy 4-7 bytes */
	.p2align 4
.L0407:
	cmp	x8, #3
	b.ls	.L0203
	ldr	w16, [x1]
	ldr	w17, [x5, #-4]		// x17, not x18: x18 is the platform register
	str	w16, [x9]
	str	w17, [x4, #-4]
	ret

	/* copy 2-3 bytes */
.L0203:
	tbz	x8, 1, .L0001
	ldrh	w16, [x1]
	ldrh	w17, [x5, #-2]
	strh	w16, [x9]
	strh	w17, [x4, #-2]
	ret

	/* copy 0-1 bytes; the NUL is stored last so it wins when cnt is 0 */
.L0001:
	ldrb	w16, [x1]
	strb	w16, [x9]
	strb	wzr, [x4]
	ret

	/* dstsize was 0: write nothing, tail call strlen(src) */
.L0:
	mov	x0, x1
	b	strlen
END(__strlcpy)