/*
 * strcpy/stpcpy - copy a string returning pointer to start/end.
 *
 * Copyright (c) 2020-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "asmdefs.h"

#define dstin    x0
#define srcin    x1
#define result   x0

#define src      x2
#define dst      x3
#define len      x4
#define synd     x4
#define tmp      x5
#define shift    x5
#define data1    x6
#define dataw1   w6
#define data2    x7
#define dataw2   w7

#define dataq    q0
#define vdata    v0
#define vhas_nul v1
#define vend     v2
#define dend     d2
#define dataq2   q1

#ifdef BUILD_STPCPY
# define STRCPY __stpcpy_aarch64
# define IFSTPCPY(X,...) X,__VA_ARGS__
#else
# define STRCPY __strcpy_aarch64
# define IFSTPCPY(X,...)
#endif

/*
   Core algorithm:
   For each 16-byte chunk we calculate a 64-bit nibble mask value with four
   bits per byte. We keep 4 bits of every comparison byte using the
   shift-right-and-narrow-by-4 (SHRN) instruction. Since the bits in the
   nibble mask reflect the order in which bytes occur in the original string,
   counting leading zeros identifies exactly which byte matched.  */

ENTRY (STRCPY)
        /* Align src down to 16 bytes so the first load cannot cross a
           granule/page boundary, then shift the syndrome bits belonging to
           bytes before srcin out (4 syndrome bits per byte).  */
        bic     src, srcin, 15
        ld1     {vdata.16b}, [src]
        cmeq    vhas_nul.16b, vdata.16b, 0
        lsl     shift, srcin, 2
        shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
        fmov    synd, dend
        lsr     synd, synd, shift
        cbnz    synd, L(tail)

        /* No NUL in the first chunk - check the second 16 bytes.  */
        ldr     dataq, [src, 16]!
        cmeq    vhas_nul.16b, vdata.16b, 0
        shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
        fmov    synd, dend
        cbz     synd, L(start_loop)

        /* NUL in the second chunk: len = strlen (srcin), 1..31.  */
#ifndef __AARCH64EB__
        rbit    synd, synd
#endif
        sub     tmp, src, srcin
        clz     len, synd
        add     len, tmp, len, lsr 2
        tbz     len, 4, L(less16)
        /* 16..31 bytes: copy 16 from the start and 16 ending at the NUL.  */
        sub     tmp, len, 15
        ldr     dataq, [srcin]
        ldr     dataq2, [srcin, tmp]
        str     dataq, [dstin]
        str     dataq2, [dstin, tmp]
        IFSTPCPY (add result, dstin, len)
        ret

L(tail):
        /* NUL in the first chunk: len = strlen (srcin) < 16.  */
        rbit    synd, synd
        clz     len, synd
        lsr     len, len, 2
L(less16):
        tbz     len, 3, L(less8)
        /* 8..15 bytes: two overlapping 8-byte copies.  */
        sub     tmp, len, 7
        ldr     data1, [srcin]
        ldr     data2, [srcin, tmp]
        str     data1, [dstin]
        str     data2, [dstin, tmp]
        IFSTPCPY (add result, dstin, len)
        ret

        .p2align 4
L(less8):
        subs    tmp, len, 3
        b.lo    L(less4)
        /* 3..7 bytes: two overlapping 4-byte copies.  */
        ldr     dataw1, [srcin]
        ldr     dataw2, [srcin, tmp]
        str     dataw1, [dstin]
        str     dataw2, [dstin, tmp]
        IFSTPCPY (add result, dstin, len)
        ret

L(less4):
        cbz     len, L(zerobyte)
        /* 1..2 bytes: copy 2 bytes, then store the NUL explicitly.  */
        ldrh    dataw1, [srcin]
        strh    dataw1, [dstin]
L(zerobyte):
        strb    wzr, [dstin, len]
        IFSTPCPY (add result, dstin, len)
        ret

        .p2align 4
L(start_loop):
        /* Copy the first 16 bytes unconditionally, then loop over aligned
           32-byte source blocks (tmp = srcin - dstin).  */
        sub     tmp, srcin, dstin
        ldr     dataq2, [srcin]
        sub     dst, src, tmp
        str     dataq2, [dstin]
L(loop):
        str     dataq, [dst], 32
        ldr     dataq, [src, 16]
        cmeq    vhas_nul.16b, vdata.16b, 0
        umaxp   vend.16b, vhas_nul.16b, vhas_nul.16b
        fmov    synd, dend
        cbnz    synd, L(loopend)
        str     dataq, [dst, -16]
        ldr     dataq, [src, 32]!
        cmeq    vhas_nul.16b, vdata.16b, 0
        umaxp   vend.16b, vhas_nul.16b, vhas_nul.16b
        fmov    synd, dend
        cbz     synd, L(loop)
        add     dst, dst, 16
L(loopend):
        shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
        fmov    synd, dend
        sub     dst, dst, 31
#ifndef __AARCH64EB__
        rbit    synd, synd
#endif
        clz     len, synd
        lsr     len, len, 2
        add     dst, dst, len
        /* dst now points 15 bytes before the destination NUL: copy the last
           16 bytes, ending exactly at the NUL.  */
        ldr     dataq, [dst, tmp]
        str     dataq, [dst]
        IFSTPCPY (add result, dst, 15)
        ret

END (STRCPY)
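
/* Illustrative sketch (not assembled; kept in a comment). The C fragment
   below, written with ACLE NEON intrinsics, mirrors the cmeq/shrn/fmov/
   rbit/clz sequence above on a little-endian target; the helper name
   nul_index_16 is hypothetical and for exposition only. Byte i of a chunk
   maps to syndrome bits [4i+3:4i], so the index of the lowest set nibble is
   the index of the first NUL byte, or 16 if the chunk contains none.

   #include <arm_neon.h>
   #include <stdint.h>

   static inline unsigned nul_index_16 (const uint8_t *chunk)
   {
     uint8x16_t data = vld1q_u8 (chunk);
     uint8x16_t cmp = vceqzq_u8 (data);          // 0xff where byte == 0
     // Shift each 16-bit lane right by 4 and narrow to 8 bits: this keeps
     // one syndrome nibble per comparison byte.
     uint8x8_t nibbles = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);
     uint64_t synd = vget_lane_u64 (vreinterpret_u64_u8 (nibbles), 0);
     if (synd == 0)
       return 16;                                // no NUL in this chunk
     return __builtin_ctzll (synd) >> 2;         // first set nibble -> byte
   }

   A count-trailing-zeros stands in for the rbit+clz pair used in the
   assembly; on little-endian both yield the first matching byte index.  */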
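
/* Illustrative sketch (not assembled; kept in a comment). The tail paths
   above avoid byte-by-byte loops by issuing two overlapping power-of-two
   copies: one anchored at the start of the string and one ending exactly at
   the NUL. The hypothetical helper below shows the 8..15 byte case handled
   at L(less16), assuming len = strlen (src) and memcpy for unaligned access.

   #include <stdint.h>
   #include <stddef.h>
   #include <string.h>

   static inline void copy_tail_8_15 (uint8_t *dst, const uint8_t *src,
                                      size_t len)   // 8 <= len <= 15
   {
     uint64_t head, tail;
     memcpy (&head, src, 8);              // bytes 0..7
     memcpy (&tail, src + len - 7, 8);    // bytes len-7..len, includes NUL
     memcpy (dst, &head, 8);
     memcpy (dst + len - 7, &tail, 8);
   }

   The two 8-byte stores overlap whenever len < 15, which is harmless and
   cheaper than a variable-length byte loop.  */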