/*
 * memrchr - find last character in a memory zone.
 *
 * Copyright (c) 2020-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "asmdefs.h"

/* Register roles (integer).  */
#define srcin		x0	/* in:  base pointer */
#define chrin		w1	/* in:  character to find (low byte used) */
#define cntin		x2	/* in:  byte count */
#define result		x0	/* out: pointer to last match, or NULL */

#define src		x3	/* current 16-byte-aligned chunk pointer */
#define cntrem		x4	/* bytes remaining before srcin is reached */
#define synd		x5	/* nibble-mask syndrome of the comparison */
#define shift		x6	/* bit shift discarding bytes past the end */
#define tmp		x7	/* scratch for final address computation */
#define end		x8	/* srcin + cntin (one past last byte) */
#define endm1		x9	/* end - 1 (address of last byte) */

/* Register roles (SIMD).  */
#define vrepchr		v0	/* chrin replicated into all 16 lanes */
#define qdata		q1
#define vdata		v1	/* current 16-byte chunk */
#define vhas_chr	v2	/* per-byte 0xff/0x00 match mask */
#define vend		v3	/* narrowed 64-bit syndrome */
#define dend		d3	/* low 64 bits of vend, moved to synd */

/*
   Core algorithm:
   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
   per byte. We take 4 bits of every comparison byte with shift right and narrow
   by 4 instruction. Since the bits in the nibble mask reflect the order in
   which things occur in the original string, counting leading zeros identifies
   exactly which byte matched.  */

/*
 * void *__memrchr_aarch64 (const void *srcin, int chrin, size_t cntin)
 *
 * Scans backwards from srcin + cntin - 1 for the last occurrence of
 * (unsigned char) chrin; returns its address in x0, or 0 if not found.
 * Loads are always from 16-byte-aligned addresses within the buffer's
 * aligned enclosing chunks, which keeps the routine MTE compatible.
 */
ENTRY (__memrchr_aarch64)
	PTR_ARG (0)
	add	end, srcin, cntin
	sub	endm1, end, 1
	bic	src, endm1, 15		/* align down to the chunk holding the last byte */
	cbz	cntin, L(nomatch)	/* empty buffer: nothing to find */
	ld1	{vdata.16b}, [src]
	dup	vrepchr.16b, chrin
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	neg	shift, end, lsl 2	/* 4 syndrome bits per byte */
	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
	fmov	synd, dend
	lsl	synd, synd, shift	/* drop bytes at/after end from the syndrome */
	cbz	synd, L(start_loop)

	/* Match in the first (highest) chunk: convert leading-zero count
	   back to a byte offset from endm1, then validate it is within
	   the first cntin bytes.  */
	clz	synd, synd
	sub	result, endm1, synd, lsr 2
	cmp	cntin, synd, lsr 2
	csel	result, result, xzr, hi
	ret

	nop
L(start_loop):
	subs	cntrem, src, srcin	/* bytes left below the chunk just scanned */
	b.ls	L(nomatch)

	/* Make sure that it won't overread by a 16-byte chunk */
	sub	cntrem, cntrem, 1
	tbz	cntrem, 4, L(loop32_2)	/* odd number of chunks: enter at second load */
	add	src, src, 16

	.p2align 5
L(loop32):
	ldr	qdata, [src, -32]!
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
	fmov	synd, dend
	cbnz	synd, L(end)

L(loop32_2):
	ldr	qdata, [src, -16]
	subs	cntrem, cntrem, 32
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	b.lo	L(end_2)		/* fewer than 32 bytes remained: last iteration */
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
	fmov	synd, dend
	cbz	synd, L(loop32)
L(end_2):
	sub	src, src, 16
L(end):
	/* Recompute the precise nibble-mask syndrome for the matching chunk
	   (the loop only used a cheap any-match test).  */
	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
	fmov	synd, dend

	add	tmp, src, 15
#ifdef __AARCH64EB__
	rbit	synd, synd		/* big-endian: reverse so clz finds the last match */
#endif
	clz	synd, synd
	sub	tmp, tmp, synd, lsr 2	/* 4 bits per byte -> byte offset */
	cmp	tmp, srcin
	csel	result, tmp, xzr, hs	/* reject matches before the buffer start */
	ret

L(nomatch):
	mov	result, 0
	ret

END (__memrchr_aarch64)