/*
 * strcpy/stpcpy - copy a string returning pointer to start/end.
 *
 * Copyright (c) 2020-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "asmdefs.h"

#define dstin		x0
#define srcin		x1
#define result		x0

#define src		x2
#define dst		x3
#define len		x4
#define synd		x4
#define tmp		x5
#define shift		x5
#define data1		x6
#define dataw1		w6
#define data2		x7
#define dataw2		w7

#define dataq		q0
#define vdata		v0
#define vhas_nul	v1
#define vend		v2
#define dend		d2
#define dataq2		q1

#ifdef BUILD_STPCPY
# define STRCPY __stpcpy_aarch64
# define IFSTPCPY(X,...) X,__VA_ARGS__
#else
# define STRCPY __strcpy_aarch64
# define IFSTPCPY(X,...)
#endif

/*
   Core algorithm:
   For each 16-byte chunk we calculate a 64-bit nibble mask value with four
   bits per byte.  We take 4 bits of every comparison byte with a shift-right-
   and-narrow-by-4 (shrn) instruction.  Since the bits in the nibble mask
   reflect the order in which bytes occur in the original string, counting the
   leading zeros of the (bit-reversed, on little-endian) mask identifies
   exactly which byte matched.  A C sketch of this syndrome computation
   follows END at the bottom of this file.  */

ENTRY (STRCPY)
	PTR_ARG (0)
	PTR_ARG (1)
	/* Load the 16-byte aligned chunk containing the start of the string
	   and compute the NUL syndrome.  lsr uses only the low 6 bits of
	   shift, so the nibbles of bytes before srcin are discarded.  */
	bic	src, srcin, 15
	ld1	{vdata.16b}, [src]
	cmeq	vhas_nul.16b, vdata.16b, 0
	lsl	shift, srcin, 2
	shrn	vend.8b, vhas_nul.8h, 4
	fmov	synd, dend
	lsr	synd, synd, shift
	cbnz	synd, L(tail)

	/* Check the next 16 bytes; no NUL means there are at least 32 bytes
	   to copy, so enter the main loop.  */
	ldr	dataq, [src, 16]!
	cmeq	vhas_nul.16b, vdata.16b, 0
	shrn	vend.8b, vhas_nul.8h, 4
	fmov	synd, dend
	cbz	synd, L(start_loop)

	/* The NUL is in the second chunk.  Compute len, the string length.
	   Copy 16..31 bytes plus the NUL with two possibly overlapping
	   16-byte accesses, or branch to the shorter copies.  */
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	sub	tmp, src, srcin
	clz	len, synd
	add	len, tmp, len, lsr 2
	tbz	len, 4, L(less16)
	sub	tmp, len, 15
	ldr	dataq, [srcin]
	ldr	dataq2, [srcin, tmp]
	str	dataq, [dstin]
	str	dataq2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

	/* The NUL is in the first chunk, so the string is shorter than
	   16 bytes: convert the syndrome to the length.  */
L(tail):
	rbit	synd, synd
	clz	len, synd
	lsr	len, len, 2
	/* Copy 8..15 bytes plus the NUL with two overlapping 8-byte
	   accesses.  */
L(less16):
	tbz	len, 3, L(less8)
	sub	tmp, len, 7
	ldr	data1, [srcin]
	ldr	data2, [srcin, tmp]
	str	data1, [dstin]
	str	data2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

	.p2align 4
	/* Copy 3..7 bytes plus the NUL with two overlapping 4-byte accesses;
	   shorter strings go to L(less4).  */
L(less8):
	subs	tmp, len, 3
	b.lo	L(less4)
	ldr	dataw1, [srcin]
	ldr	dataw2, [srcin, tmp]
	str	dataw1, [dstin]
	str	dataw2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

	/* Copy 0..2 bytes: two bytes (which may already include the NUL),
	   then store the NUL explicitly.  */
L(less4):
	cbz	len, L(zerobyte)
	ldrh	dataw1, [srcin]
	strh	dataw1, [dstin]
L(zerobyte):
	strb	wzr, [dstin, len]
	IFSTPCPY (add result, dstin, len)
	ret

	.p2align 4
	/* tmp = srcin - dstin, so [dst, tmp] addresses the source byte that
	   corresponds to dst.  Copy the first 16 bytes unaligned, then loop
	   over aligned 32-byte chunks.  */
L(start_loop):
	sub	tmp, srcin, dstin
	ldr	dataq2, [srcin]
	sub	dst, src, tmp
	str	dataq2, [dstin]
L(loop):
	str	dataq, [dst], 32
	ldr	dataq, [src, 16]
	cmeq	vhas_nul.16b, vdata.16b, 0
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b	/* reduce to 64 bits */
	fmov	synd, dend
	cbnz	synd, L(loopend)
	str	dataq, [dst, -16]
	ldr	dataq, [src, 32]!
	cmeq	vhas_nul.16b, vdata.16b, 0
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbz	synd, L(loop)
	add	dst, dst, 16
	/* Find the NUL in the final chunk, then copy the last 16 bytes so
	   that the store ends exactly at the NUL, overlapping bytes that
	   were already written.  */
L(loopend):
	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
	fmov	synd, dend
	sub	dst, dst, 31
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	len, synd
	lsr	len, len, 2
	add	dst, dst, len
	ldr	dataq, [dst, tmp]
	str	dataq, [dst]
	IFSTPCPY (add result, dst, 15)	/* dst + 15 is the address of the NUL.  */
	ret

END (STRCPY)
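
/* Illustration only (not assembled): a minimal C sketch of the nibble-mask
   syndrome computed by the cmeq/shrn/fmov sequence above.  It assumes
   GCC/Clang NEON intrinsics and a little-endian target; the helper name
   first_nul_index is hypothetical and not part of this routine.

   #include <arm_neon.h>
   #include <stdint.h>

   // Index of the first NUL byte in a 16-byte chunk, or 16 if none.
   static inline unsigned first_nul_index (const uint8_t *chunk)
   {
       uint8x16_t data = vld1q_u8 (chunk);                 // ld1 {vdata.16b}
       uint8x16_t cmp  = vceqq_u8 (data, vdupq_n_u8 (0));  // cmeq ..., 0
       // Shift each 16-bit lane right by 4 and narrow to 8 bits: the 128-bit
       // comparison becomes a 64-bit mask with one nibble per source byte.
       uint8x8_t  nib  = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);
       uint64_t   synd = vget_lane_u64 (vreinterpret_u64_u8 (nib), 0);
       if (synd == 0)
           return 16;                                      // no NUL in chunk
       // Byte i owns bits [4*i, 4*i+3]; counting trailing zeros here is the
       // little-endian equivalent of the rbit + clz pair used above.
       return (unsigned) (__builtin_ctzll (synd) >> 2);
   }
*/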