/*
 * memrchr - find last character in a memory zone.
 *
 * Copyright (c) 2020-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "asmdefs.h"

#define srcin		x0
#define chrin		w1
#define cntin		x2
#define result		x0

#define src		x3
#define cntrem		x4
#define synd		x5
#define shift		x6
#define tmp		x7
#define end		x8
#define endm1		x9

#define vrepchr		v0
#define qdata		q1
#define vdata		v1
#define vhas_chr	v2
#define vend		v3
#define dend		d3

/*
   Core algorithm:
   For each 16-byte chunk we calculate a 64-bit nibble mask value with four
   bits per byte.  Each comparison byte is reduced to 4 bits by the shift
   right and narrow by 4 (shrn) instruction.  Since the bits in the nibble
   mask reflect the order in which bytes occur in the original string,
   counting leading zeros identifies exactly which byte matched.  */

ENTRY (__memrchr_aarch64)
	add	end, srcin, cntin
	sub	endm1, end, 1
	bic	src, endm1, 15	/* 16-byte chunk holding the last byte.  */
	cbz	cntin, L(nomatch)
	ld1	{vdata.16b}, [src]
	dup	vrepchr.16b, chrin
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	neg	shift, end, lsl 2
	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
	fmov	synd, dend
	/* Shift out the nibbles of bytes at or beyond end, so the top
	   nibble of synd corresponds to the last byte of the buffer.  */
	lsl	synd, synd, shift
	cbz	synd, L(start_loop)

	clz	synd, synd
	sub	result, endm1, synd, lsr 2	/* CLZ/4 = distance from endm1.  */
	cmp	cntin, synd, lsr 2
	csel	result, result, xzr, hi	/* NULL if match precedes srcin.  */
	ret

	nop
L(start_loop):
	subs	cntrem, src, srcin	/* Bytes below the chunk just checked.  */
	b.ls	L(nomatch)

	/* Adjust the loop entry so the 32-byte loop does not over-read
	   by a 16-byte chunk.  */
	sub	cntrem, cntrem, 1
	tbz	cntrem, 4, L(loop32_2)
	add	src, src, 16

	.p2align 5
L(loop32):
	ldr	qdata, [src, -32]!
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
	fmov	synd, dend
	cbnz	synd, L(end)

L(loop32_2):
	ldr	qdata, [src, -16]
	subs	cntrem, cntrem, 32
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	b.lo	L(end_2)
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
	fmov	synd, dend
	cbz	synd, L(loop32)
L(end_2):
	sub	src, src, 16
L(end):
	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
	fmov	synd, dend

	add	tmp, src, 15	/* Last byte of the matching chunk.  */
#ifdef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	synd, synd
	sub	tmp, tmp, synd, lsr 2
	cmp	tmp, srcin
	csel	result, tmp, xzr, hs	/* NULL if match precedes srcin.  */
	ret

L(nomatch):
	mov	result, 0
	ret

END (__memrchr_aarch64)
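
/* Illustration only, not part of the build: a minimal portable C model of
   the nibble-mask technique described in the comment above.  The helper
   names (syndrome, memrchr_nibble) are hypothetical.  The SIMD code gets
   the same syndrome from CMEQ + SHRN and locates the match with CLZ; here
   both steps are spelled out as scalar loops.  Unlike the assembly, which
   loads aligned 16-byte chunks and masks out-of-range nibbles, this sketch
   simply takes an exact-length tail chunk.  */
#if 0
#include <stddef.h>
#include <stdint.h>

// Build the 64-bit syndrome for up to 16 bytes: nibble i is 0xf when
// chunk[i] equals c.  This models CMEQ producing 0xff per matching byte,
// then SHRN #4 keeping 4 bits per byte, in little-endian byte order.
static uint64_t syndrome (const unsigned char *chunk, size_t len,
			  unsigned char c)
{
	uint64_t synd = 0;
	for (size_t i = 0; i < len; i++)
		if (chunk[i] == c)
			synd |= (uint64_t) 0xf << (4 * i);
	return synd;
}

// Reference memrchr: scan 16-byte chunks from the end of the buffer.
// The highest set bit of a nonzero syndrome marks the last match, so
// its bit index divided by 4 is the byte index within the chunk.
static void *memrchr_nibble (const void *s, int c, size_t n)
{
	const unsigned char *base = s;
	unsigned char ch = (unsigned char) c;

	while (n > 0) {
		size_t len = (n & 15) ? (n & 15) : 16;	// partial tail first
		const unsigned char *chunk = base + n - len;
		uint64_t synd = syndrome (chunk, len, ch);
		if (synd != 0) {
			int msb = 63;
			while (((synd >> msb) & 1) == 0)
				msb--;	// scalar stand-in for CLZ
			return (void *) (chunk + msb / 4);
		}
		n -= len;
	}
	return NULL;
}
#endif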
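
/* Also illustration only: a C model of how the entry path discards the
   nibbles of bytes beyond the buffer in the first, aligned chunk.
   "neg shift, end, lsl 2" computes -(end * 4), and because AArch64
   register shifts use only the low 6 bits of the count, the following
   "lsl synd, synd, shift" shifts left by (64 - 4 * (end % 16)) mod 64,
   moving the nibble of the last in-bounds byte up to bits 60-63.  The
   helper name (mask_first_chunk) is hypothetical.  */
#if 0
#include <stdint.h>

// Mask the syndrome of the chunk at (end - 1) & ~15 so only nibbles of
// in-bounds bytes remain.  After this, clz (synd) / 4 is the distance of
// the last match back from end - 1.
static uint64_t mask_first_chunk (uint64_t synd, uint64_t end)
{
	unsigned shift = (unsigned) (-(end << 2)) & 63;
	return synd << shift;
}
#endif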