/*
 * strcpy/stpcpy - copy a string returning pointer to start/end.
 *
 * Copyright (c) 2020-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "asmdefs.h"

/* Arguments and return value.  result aliases dstin (both are x0); in this
   file x0 is only written inside IFSTPCPY, so the strcpy build leaves dstin
   in x0.  (PTR_ARG comes from asmdefs.h and is not visible here.)  */
#define dstin		x0
#define srcin		x1
#define result		x0

/* Scratch registers.  Note the deliberate sharing: len/synd are both x4 and
   tmp/shift are both x5, so each pair must never be live at the same time.  */
#define src		x2
#define dst		x3
#define len		x4
#define synd		x4
#define tmp		x5
#define shift		x5
#define data1		x6
#define dataw1		w6
#define data2		x7
#define dataw2		w7

/* SIMD registers.  dataq/vdata are views of register 0, dataq2/vhas_nul are
   views of register 1, and vend/dend are views of register 2.  */
#define dataq		q0
#define vdata		v0
#define vhas_nul	v1
#define vend		v2
#define dend		d2
#define dataq2		q1

/* IFSTPCPY (insn) emits insn only in the stpcpy build.  The operands of the
   instruction contain commas, so they arrive as separate macro arguments;
   X,__VA_ARGS__ joins them back together.  */
#ifdef BUILD_STPCPY
# define STRCPY __stpcpy_aarch64
# define IFSTPCPY(X,...) X,__VA_ARGS__
#else
# define STRCPY __strcpy_aarch64
# define IFSTPCPY(X,...)
#endif

/*
   Core algorithm:
   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
   per byte. We take 4 bits of every comparison byte with shift right and narrow
   by 4 instruction. Since the bits in the nibble mask reflect the order in
   which things occur in the original string, counting leading zeros identifies
   exactly which byte matched.  */

/* STRCPY (dstin = x0, srcin = x1)

   Copies the NUL-terminated string at srcin, terminator included, to dstin.
   Returns in x0: dstin for the strcpy build; with BUILD_STPCPY, the address
   of the NUL that was written to the destination.

   Strings whose NUL lies in the first two 16-byte aligned source chunks are
   copied with a pair of possibly overlapping fixed-size loads/stores (or
   byte/halfword stores for lengths below 3).  Longer strings go through
   L(loop), which handles two aligned source chunks per iteration.  */
ENTRY (STRCPY)
	PTR_ARG (0)
	PTR_ARG (1)
	/* Check the aligned 16-byte block that contains srcin.  */
	bic	src, srcin, 15
	ld1	{vdata.16b}, [src]
	cmeq	vhas_nul.16b, vdata.16b, 0
	/* Only the low 6 bits of a register shift amount are used, so this is
	   effectively 4 * (srcin & 15): four mask bits per byte to skip.  */
	lsl	shift, srcin, 2
	shrn	vend.8b, vhas_nul.8h, 4
	fmov	synd, dend
	/* Drop the nibbles of the bytes that precede srcin.  */
	lsr	synd, synd, shift
	cbnz	synd, L(tail)

	/* No NUL yet: advance src to the next aligned chunk and check it.  */
	ldr	dataq, [src, 16]!
	cmeq	vhas_nul.16b, vdata.16b, 0
	shrn	vend.8b, vhas_nul.8h, 4
	fmov	synd, dend
	cbz	synd, L(start_loop)

	/* NUL is in the second chunk.  len = (src - srcin) + index of the NUL
	   within the chunk, i.e. the string length (1..31).  */
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	sub	tmp, src, srcin
	clz	len, synd
	add	len, tmp, len, lsr 2
	tbz	len, 4, L(less16)
	/* 16 <= len <= 31: copy the first 16 bytes and the 16 bytes that end at
	   the NUL (offset len - 15); the two ranges may overlap.  */
	sub	tmp, len, 15
	ldr	dataq, [srcin]
	ldr	dataq2, [srcin, tmp]
	str	dataq, [dstin]
	str	dataq2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

	/* NUL found in the first chunk; synd has already been shifted so that
	   its lowest nibble corresponds to the byte at srcin.
	   NOTE(review): rbit is unconditional here but little-endian-only for
	   chunks loaded with ldr q - presumably because the ld1 lane order does
	   not depend on endianness; confirm against the Arm ARM.  */
L(tail):
	rbit	synd, synd
	clz	len, synd
	lsr	len, len, 2
	/* len < 16 on both paths into L(less16).  */
L(less16):
	tbz	len, 3, L(less8)
	/* 8 <= len <= 15: two 8-byte copies, the second ending at the NUL.  */
	sub	tmp, len, 7
	ldr	data1, [srcin]
	ldr	data2, [srcin, tmp]
	str	data1, [dstin]
	str	data2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

	.p2align 4
L(less8):
	/* len < 8.  subs borrows (b.lo taken) when len < 3.  */
	subs	tmp, len, 3
	b.lo	L(less4)
	/* 3 <= len <= 7: two 4-byte copies, the second ending at the NUL.  */
	ldr	dataw1, [srcin]
	ldr	dataw2, [srcin, tmp]
	str	dataw1, [dstin]
	str	dataw2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

L(less4):
	/* len is 0, 1 or 2.  For 1 and 2 copy two bytes, then (always) store
	   the terminator explicitly at dstin + len.  */
	cbz	len, L(zerobyte)
	ldrh	dataw1, [srcin]
	strh	dataw1, [dstin]
L(zerobyte):
	strb	wzr, [dstin, len]
	IFSTPCPY (add result, dstin, len)
	ret

	.p2align 4
L(start_loop):
	/* No NUL in the first two chunks.  tmp = srcin - dstin is kept for the
	   whole loop so that a destination address plus tmp gives the matching
	   source address.  dst = src - tmp is the destination address that
	   corresponds to the aligned source pointer.  */
	sub	tmp, srcin, dstin
	ldr	dataq2, [srcin]
	sub	dst, src, tmp
	/* Copy the first 16 (possibly unaligned) bytes of the string.  */
	str	dataq2, [dstin]
L(loop):
	/* dataq holds a NUL-free chunk from [src]: store it, then check the
	   chunk at src + 16.  dst is post-incremented by 32.  */
	str	dataq, [dst], 32
	ldr	dataq, [src, 16]
	cmeq	vhas_nul.16b, vdata.16b, 0
	/* Pairwise max folds the 128-bit compare result into 64 bits that are
	   non-zero iff some byte matched.  */
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbnz	synd, L(loopend)
	/* Second half of the iteration: store the chunk from src + 16 and check
	   the chunk at src + 32, advancing src by 32.  */
	str	dataq, [dst, -16]
	ldr	dataq, [src, 32]!
	cmeq	vhas_nul.16b, vdata.16b, 0
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbz	synd, L(loop)
	/* Make dst - 16 the destination of the chunk holding the NUL, matching
	   the state on the early exit above.  */
	add	dst, dst, 16
L(loopend):
	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
	fmov	synd, dend
	/* dst = (destination of the final chunk) - 15.  */
	sub	dst, dst, 31
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	len, synd
	lsr	len, len, 2
	/* After adding the NUL index, dst + 15 is where the NUL goes.  Copy the
	   16 source bytes that end at the NUL; this may overlap bytes that were
	   already stored.  */
	add	dst, dst, len
	ldr	dataq, [dst, tmp]
	str	dataq, [dst]
	IFSTPCPY (add result, dst, 15)
	ret

END (STRCPY)