/*
 * memrchr - find last character in a memory zone.
 *
 * Copyright (c) 2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "../asmdefs.h"

#define srcin		x0
#define chrin		w1
#define cntin		x2
#define result		x0

#define src		x3
#define cntrem		x4
#define synd		x5
#define shift		x6
#define tmp		x7
#define wtmp		w7
#define end		x8
#define endm1		x9

#define vrepchr		v0
#define qdata		q1
#define vdata		v1
#define vhas_chr	v2
#define vrepmask	v3
#define vend		v4
#define dend		d4

/*
   Core algorithm:

   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
   requested character. Bits 4-7 must be zero. Bits 4-7 are set likewise for
   odd bytes so that adjacent bytes can be merged. Since the bits in the
   syndrome reflect the order in which things occur in the original string,
   counting leading zeros identifies exactly which byte matched.  */
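/* Illustration only, not part of the build: a hedged scalar C model of the
   syndrome computation described above.  The helper name and the use of
   __builtin_clzll are assumptions made for this sketch; the real code builds
   the syndrome with cmeq/and/addp and locates the last match with clz below.

   #include <stdint.h>

   // Return the index of the last byte in a 16-byte chunk equal to c,
   // or -1 if there is none, using a 4-bit-per-byte syndrome.
   static inline int last_match_in_chunk (const unsigned char *chunk,
					  unsigned char c)
   {
     uint64_t synd = 0;
     for (int i = 0; i < 16; i++)
       if (chunk[i] == c)
	 synd |= (uint64_t) 0xf << (4 * i);	// four syndrome bits per byte
     if (synd == 0)
       return -1;				// no match in this chunk
     int clz = __builtin_clzll (synd);		// leading zeros find the last match
     return 15 - (clz >> 2);
   }  */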

ENTRY (__memrchr_aarch64)
	PTR_ARG (0)
	add	end, srcin, cntin	/* end points one past the last byte.  */
	sub	endm1, end, 1
	bic	src, endm1, 15		/* 16-byte chunk holding the last byte.  */
	cbz	cntin, L(nomatch)
	ld1	{vdata.16b}, [src]
	dup	vrepchr.16b, chrin
	mov	wtmp, 0xf00f		/* Mask giving 4 syndrome bits per byte.  */
	dup	vrepmask.8h, wtmp
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	neg	shift, end, lsl 2
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	lsl	synd, synd, shift	/* Discard bytes past the buffer end.  */
	cbz	synd, L(start_loop)

	clz	synd, synd		/* clz/4 = distance back from the last byte.  */
	sub	result, endm1, synd, lsr 2
	cmp	cntin, synd, lsr 2
	csel	result, result, xzr, hi	/* NULL if the match is before srcin.  */
	ret

L(start_loop):
	sub	tmp, end, src
	subs	cntrem, cntin, tmp	/* Bytes remaining below the first chunk.  */
	b.ls	L(nomatch)

	/* Enter the loop at the right point so that it won't over-read by a
	   16-byte chunk.  */
	add	tmp, cntrem, 15
	tbnz	tmp, 4, L(loop32_2)

	.p2align 4
L(loop32):
	ldr	qdata, [src, -16]!
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbnz	synd, L(end)

L(loop32_2):
	ldr	qdata, [src, -16]!
	subs	cntrem, cntrem, 32	/* Stop once the count is exhausted.  */
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	b.ls	L(end)
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbz	synd, L(loop32)
L(end):
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend

	add	tmp, src, 15		/* Address of the last byte in the chunk.  */
#ifdef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	synd, synd
	sub	tmp, tmp, synd, lsr 2
	cmp	tmp, srcin
	csel	result, tmp, xzr, hs	/* NULL if the match is below srcin.  */
	ret

L(nomatch):
	mov	result, 0
	ret

END (__memrchr_aarch64)
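/* Illustration only, not part of the build: a hedged usage sketch that checks
   __memrchr_aarch64 against a straightforward reference loop.  The C prototype
   is an assumption based on the memrchr semantics implemented above.

   #include <assert.h>
   #include <stddef.h>

   void *__memrchr_aarch64 (const void *s, int c, size_t n);

   // Reference: scan backwards for the last occurrence of c.
   static void *memrchr_ref (const void *s, int c, size_t n)
   {
     const unsigned char *p = s;
     while (n--)
       if (p[n] == (unsigned char) c)
	 return (void *) (p + n);
     return NULL;
   }

   int main (void)
   {
     const char buf[] = "abracadabra";
     assert (__memrchr_aarch64 (buf, 'a', 11) == memrchr_ref (buf, 'a', 11));
     assert (__memrchr_aarch64 (buf, 'z', 11) == NULL);
     return 0;
   }  */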