/*
 * memchr - find a character in a memory zone
 *
 * Copyright (c) 2020-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "asmdefs.h"

/* Incoming arguments and result (AAPCS64: x0-x2 in, x0 out).  */
#define srcin		x0	/* in:  pointer to the buffer to scan */
#define chrin		w1	/* in:  byte value to search for */
#define cntin		x2	/* in:  number of bytes to scan */
#define result		x0	/* out: address of first match, or NULL */

/* Scratch registers.  */
#define src		x3	/* 16-byte-aligned load pointer */
#define cntrem		x4	/* bytes remaining to be examined */
#define synd		x5	/* match syndrome: 4 bits per input byte */
#define shift		x6	/* bit shift discarding pre-buffer bytes */
#define tmp		x7

/* Vector registers.  */
#define vrepchr		v0	/* chrin replicated into all 16 lanes */
#define qdata		q1	/* current 16-byte chunk (q view for ldr) */
#define vdata		v1	/* current 16-byte chunk (v view) */
#define vhas_chr	v2	/* per-byte comparison result */
#define vend		v3	/* narrowed syndrome vector */
#define dend		d3	/* low 64 bits of vend, moved to synd */

/*
   Core algorithm:
   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
   per byte. We take 4 bits of every comparison byte with shift right and narrow
   by 4 instruction. Since the bits in the nibble mask reflect the order in
   which things occur in the original string, counting leading zeros identifies
   exactly which byte matched.
*/ 4131914882SAlex Richardson 4231914882SAlex RichardsonENTRY (__memchr_aarch64_mte) 4331914882SAlex Richardson PTR_ARG (0) 4431914882SAlex Richardson SIZE_ARG (2) 4531914882SAlex Richardson bic src, srcin, 15 4631914882SAlex Richardson cbz cntin, L(nomatch) 4731914882SAlex Richardson ld1 {vdata.16b}, [src] 4831914882SAlex Richardson dup vrepchr.16b, chrin 4931914882SAlex Richardson cmeq vhas_chr.16b, vdata.16b, vrepchr.16b 5031914882SAlex Richardson lsl shift, srcin, 2 51*072a4ba8SAndrew Turner shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ 5231914882SAlex Richardson fmov synd, dend 5331914882SAlex Richardson lsr synd, synd, shift 5431914882SAlex Richardson cbz synd, L(start_loop) 5531914882SAlex Richardson 5631914882SAlex Richardson rbit synd, synd 5731914882SAlex Richardson clz synd, synd 5831914882SAlex Richardson cmp cntin, synd, lsr 2 59*072a4ba8SAndrew Turner add result, srcin, synd, lsr 2 6031914882SAlex Richardson csel result, result, xzr, hi 6131914882SAlex Richardson ret 6231914882SAlex Richardson 63*072a4ba8SAndrew Turner .p2align 3 6431914882SAlex RichardsonL(start_loop): 6531914882SAlex Richardson sub tmp, src, srcin 66*072a4ba8SAndrew Turner add tmp, tmp, 17 6731914882SAlex Richardson subs cntrem, cntin, tmp 68*072a4ba8SAndrew Turner b.lo L(nomatch) 6931914882SAlex Richardson 7031914882SAlex Richardson /* Make sure that it won't overread by a 16-byte chunk */ 71*072a4ba8SAndrew Turner tbz cntrem, 4, L(loop32_2) 72*072a4ba8SAndrew Turner sub src, src, 16 7331914882SAlex Richardson .p2align 4 7431914882SAlex RichardsonL(loop32): 75*072a4ba8SAndrew Turner ldr qdata, [src, 32]! 
7631914882SAlex Richardson cmeq vhas_chr.16b, vdata.16b, vrepchr.16b 7731914882SAlex Richardson umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ 7831914882SAlex Richardson fmov synd, dend 7931914882SAlex Richardson cbnz synd, L(end) 8031914882SAlex Richardson 8131914882SAlex RichardsonL(loop32_2): 82*072a4ba8SAndrew Turner ldr qdata, [src, 16] 8331914882SAlex Richardson cmeq vhas_chr.16b, vdata.16b, vrepchr.16b 84*072a4ba8SAndrew Turner subs cntrem, cntrem, 32 85*072a4ba8SAndrew Turner b.lo L(end_2) 8631914882SAlex Richardson umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ 8731914882SAlex Richardson fmov synd, dend 8831914882SAlex Richardson cbz synd, L(loop32) 89*072a4ba8SAndrew TurnerL(end_2): 90*072a4ba8SAndrew Turner add src, src, 16 9131914882SAlex RichardsonL(end): 92*072a4ba8SAndrew Turner shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ 93*072a4ba8SAndrew Turner sub cntrem, src, srcin 9431914882SAlex Richardson fmov synd, dend 95*072a4ba8SAndrew Turner sub cntrem, cntin, cntrem 9631914882SAlex Richardson#ifndef __AARCH64EB__ 9731914882SAlex Richardson rbit synd, synd 9831914882SAlex Richardson#endif 9931914882SAlex Richardson clz synd, synd 10031914882SAlex Richardson cmp cntrem, synd, lsr 2 10131914882SAlex Richardson add result, src, synd, lsr 2 10231914882SAlex Richardson csel result, result, xzr, hi 10331914882SAlex Richardson ret 10431914882SAlex Richardson 10531914882SAlex RichardsonL(nomatch): 10631914882SAlex Richardson mov result, 0 10731914882SAlex Richardson ret 10831914882SAlex Richardson 10931914882SAlex RichardsonEND (__memchr_aarch64_mte) 11031914882SAlex Richardson 111