/*
 * memchr - find a character in a memory zone
 *
 * Copyright (c) 2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "../asmdefs.h"

#define srcin		x0
#define chrin		w1
#define cntin		x2
#define result		x0

#define src		x3
#define cntrem		x4
#define synd		x5
#define shift		x6
#define tmp		x7
#define wtmp		w7

#define vrepchr		v0
#define qdata		q1
#define vdata		v1
#define vhas_chr	v2
#define vrepmask	v3
#define vend		v4
#define dend		d4

/*
   Core algorithm:

   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
   requested character. Bits 4-7 must be zero. Bits 4-7 are set likewise for
   odd bytes so that adjacent bytes can be merged. Since the bits in the
   syndrome reflect the order in which things occur in the original buffer,
   counting trailing zeros identifies exactly which byte matched.  */
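
/* Illustrative sketch (not part of the original source): the scalar
   equivalent of the syndrome lookup, assuming the 4-bits-per-byte layout
   described above. The helper name and the use of compiler builtins are
   assumptions made for illustration only.

	#include <stdint.h>

	// Return the index of the first matching byte in a 16-byte chunk,
	// given a nonzero syndrome in which a match at byte i sets bits
	// [4*i, 4*i + 3]. Dividing the trailing-zero count by 4 recovers i.
	static inline unsigned first_match_index (uint64_t synd)
	{
		return (unsigned) __builtin_ctzll (synd) >> 2;
	}

   AArch64 has no count-trailing-zeros instruction, which is why the code
   below bit-reverses the syndrome (rbit) and then counts leading zeros
   (clz) instead.  */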

ENTRY (__memchr_aarch64_mte)
	PTR_ARG (0)
	SIZE_ARG (2)
	bic	src, srcin, 15		/* Align src down to 16 bytes.  */
	cbz	cntin, L(nomatch)
	ld1	{vdata.16b}, [src]
	dup	vrepchr.16b, chrin
	mov	wtmp, 0xf00f		/* Per halfword: bits 0-3 for even bytes, 4-7 for odd.  */
	dup	vrepmask.8h, wtmp
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	lsl	shift, srcin, 2		/* Syndrome bit offset of srcin; shifts are taken mod 64.  */
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
	fmov	synd, dend
	lsr	synd, synd, shift	/* Discard matches before srcin.  */
	cbz	synd, L(start_loop)

	rbit	synd, synd
	clz	synd, synd
	add	result, srcin, synd, lsr 2
	cmp	cntin, synd, lsr 2
	csel	result, result, xzr, hi	/* Return NULL if the match is past cntin bytes.  */
	ret

L(start_loop):
	sub	tmp, src, srcin
	add	tmp, tmp, 16
	subs	cntrem, cntin, tmp
	b.ls	L(nomatch)

	/* Choose the loop entry point so that it won't overread by a
	   16-byte chunk.  */
	add	tmp, cntrem, 15
	tbnz	tmp, 4, L(loop32_2)

	.p2align 4
L(loop32):
	ldr	qdata, [src, 16]!
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
	fmov	synd, dend
	cbnz	synd, L(end)

L(loop32_2):
	ldr	qdata, [src, 16]!
	subs	cntrem, cntrem, 32
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	b.ls	L(end)
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
	fmov	synd, dend
	cbz	synd, L(loop32)
L(end):
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
	fmov	synd, dend
	add	tmp, srcin, cntin
	sub	cntrem, tmp, src	/* Valid bytes from the current chunk to the end.  */
#ifndef __AARCH64EB__
	rbit	synd, synd		/* On little-endian, rbit + clz counts trailing zeros.  */
#endif
	clz	synd, synd
	cmp	cntrem, synd, lsr 2
	add	result, src, synd, lsr 2
	csel	result, result, xzr, hi	/* Return NULL if the match is out of range.  */
	ret

L(nomatch):
	mov	result, 0
	ret

END (__memchr_aarch64_mte)
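
/* Usage sketch (hypothetical test harness, not part of this file): the
   entry point follows the standard memchr contract, so a C caller can
   exercise it directly.

	#include <stddef.h>
	#include <stdio.h>

	extern void *__memchr_aarch64_mte (const void *s, int c, size_t n);

	int main (void)
	{
		const char buf[] = "hello, world";
		// Expect a pointer to the 'w' at offset 7, or NULL if absent.
		void *p = __memchr_aarch64_mte (buf, 'w', sizeof (buf) - 1);
		printf ("%td\n", p ? (char *) p - buf : -1);
		return 0;
	}
 */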