/*
 * memrchr - find last character in a memory zone.
 *
 * Copyright (c) 2020-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "asmdefs.h"

#define srcin		x0
#define chrin		w1
#define cntin		x2
#define result		x0

#define src		x3
#define cntrem		x4
#define synd		x5
#define shift		x6
#define tmp		x7
#define end		x8
#define endm1		x9

#define vrepchr		v0
#define qdata		q1
#define vdata		v1
#define vhas_chr	v2
#define vend		v3
#define dend		d3

/*
   Core algorithm:
   For each 16-byte chunk we calculate a 64-bit nibble mask value with four
   bits per byte. We take 4 bits of every comparison byte with a
   shift-right-and-narrow-by-4 instruction (shrn). Since the bits in the
   nibble mask reflect the order in which bytes occur in the original buffer,
   counting leading zeros identifies exactly which byte matched.  (A commented
   C sketch of this mask is appended at the end of this file.)  */

ENTRY (__memrchr_aarch64)
	PTR_ARG (0)
	add	end, srcin, cntin
	sub	endm1, end, 1
	bic	src, endm1, 15
	cbz	cntin, L(nomatch)
	ld1	{vdata.16b}, [src]
	dup	vrepchr.16b, chrin
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	neg	shift, end, lsl 2
	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
	fmov	synd, dend
	lsl	synd, synd, shift	/* Discard bits for bytes past the end.  */
	cbz	synd, L(start_loop)

	/* Match in the last 16-byte chunk: compute its address and return
	   NULL if it lies before the start of the buffer.  */
	clz	synd, synd
	sub	result, endm1, synd, lsr 2
	cmp	cntin, synd, lsr 2
	csel	result, result, xzr, hi
	ret

	nop
L(start_loop):
	subs	cntrem, src, srcin	/* Bytes left before the aligned chunk.  */
	b.ls	L(nomatch)

	/* Make sure that the loop won't over-read by a 16-byte chunk.  */
	sub	cntrem, cntrem, 1
	tbz	cntrem, 4, L(loop32_2)
	add	src, src, 16

	.p2align 5
L(loop32):
	ldr	qdata, [src, -32]!
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
	fmov	synd, dend
	cbnz	synd, L(end)

L(loop32_2):
	ldr	qdata, [src, -16]
	subs	cntrem, cntrem, 32
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	b.lo	L(end_2)
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
	fmov	synd, dend
	cbz	synd, L(loop32)
L(end_2):
	sub	src, src, 16
L(end):
	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
	fmov	synd, dend

	add	tmp, src, 15	/* Last byte of the chunk holding the match.  */
#ifdef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	synd, synd
	sub	tmp, tmp, synd, lsr 2
	cmp	tmp, srcin
	csel	result, tmp, xzr, hs	/* NULL if the match precedes the buffer.  */
	ret

L(nomatch):
	mov	result, 0
	ret

END (__memrchr_aarch64)
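
/* Reference sketch (comment only, never assembled): a small C model of the
   nibble-mask trick described in the core-algorithm comment above.  The
   helper names nibble_mask and last_match_index are illustrative only and
   are not part of this library.

   #include <stdint.h>

   // Illustrative helper: models cmeq + shrn #4 for one 16-byte chunk.
   // Produces a 64-bit mask with four bits per byte, set wherever the
   // byte equals c.
   static uint64_t nibble_mask (const unsigned char *chunk, unsigned char c)
   {
     uint64_t mask = 0;
     for (int i = 0; i < 16; i++)
       if (chunk[i] == c)
         mask |= (uint64_t) 0xf << (4 * i);
     return mask;
   }

   // Because bit order mirrors byte order, counting leading zeros locates
   // the last matching byte: index 15 - clz/4 (mask must be non-zero).
   static int last_match_index (uint64_t mask)
   {
     return 15 - __builtin_clzll (mask) / 4;
   }

   Four bits per byte is what a single shrn #4 from 16-bit lanes naturally
   produces, and the byte index is still recoverable with one clz and a
   shift by 2; that is the trade the code above exploits.  */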