/*
 * memrchr - find the last occurrence of a character in a memory zone.
 *
 * Copyright (c) 2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "../asmdefs.h"

#define srcin		x0
#define chrin		w1
#define cntin		x2
#define result		x0

#define src		x3
#define cntrem		x4
#define synd		x5
#define shift		x6
#define tmp		x7
#define wtmp		w7
#define end		x8
#define endm1		x9

#define vrepchr		v0
#define qdata		q1
#define vdata		v1
#define vhas_chr	v2
#define vrepmask	v3
#define vend		v4
#define dend		d4

/*
   Core algorithm:

   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
   requested character. Bits 4-7 must be zero. Bits 4-7 are set likewise for
   odd bytes so that adjacent bytes can be merged. Since the bits in the
   syndrome reflect the order in which things occur in the original string,
   counting leading zeros identifies exactly which byte matched. */

ENTRY (__memrchr_aarch64)
	PTR_ARG (0)
	add	end, srcin, cntin
	sub	endm1, end, 1
	bic	src, endm1, 15		/* Align down to the 16-byte chunk holding the last byte.  */
	cbz	cntin, L(nomatch)
	ld1	{vdata.16b}, [src]
	dup	vrepchr.16b, chrin
	mov	wtmp, 0xf00f
	dup	vrepmask.8h, wtmp
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	neg	shift, end, lsl 2
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
	fmov	synd, dend
	lsl	synd, synd, shift	/* Discard bits for bytes past the end of the buffer.  */
	cbz	synd, L(start_loop)

	clz	synd, synd
	sub	result, endm1, synd, lsr 2	/* Address of the last match in the chunk.  */
	cmp	cntin, synd, lsr 2
	csel	result, result, xzr, hi	/* Return NULL if the match is before srcin.  */
	ret

L(start_loop):
	sub	tmp, end, src
	subs	cntrem, cntin, tmp	/* Bytes remaining below the chunk searched above.  */
	b.ls	L(nomatch)

	/* Make sure the loop below cannot overread by a 16-byte chunk.  */
	add	tmp, cntrem, 15
	tbnz	tmp, 4, L(loop32_2)

	.p2align 4
L(loop32):
	ldr	qdata, [src, -16]!
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
	fmov	synd, dend
	cbnz	synd, L(end)

L(loop32_2):
	ldr	qdata, [src, -16]!
	subs	cntrem, cntrem, 32
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	b.ls	L(end)
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
	fmov	synd, dend
	cbz	synd, L(loop32)
L(end):
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
	fmov	synd, dend

	add	tmp, src, 15		/* Last byte of the current chunk.  */
#ifdef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	synd, synd
	sub	tmp, tmp, synd, lsr 2	/* Address of the last match in the chunk.  */
	cmp	tmp, srcin
	csel	result, tmp, xzr, hs	/* Return NULL if the match is before srcin.  */
	ret

L(nomatch):
	mov	result, 0
	ret

END (__memrchr_aarch64)
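
/* The following is an illustrative C model of a single 16-byte probe, kept in
   a comment so it does not affect the build.  It is a minimal sketch of the
   syndrome computation used above, not the implementation itself: the helper
   name probe_chunk, its bounds arguments and the explicit bounds check (which
   stands in for the shift-based masking done with "lsl synd, synd, shift" in
   the first-chunk path) are assumptions made for illustration.  It assumes a
   little-endian AArch64 compiler with <arm_neon.h>.

   // C sketch (GCC/Clang, AArch64, little-endian); hypothetical helper.
   #include <arm_neon.h>
   #include <stddef.h>
   #include <stdint.h>

   // Probe one 16-byte chunk for the last occurrence of c, returning a
   // pointer into [buf_start, buf_end) or NULL if this chunk has no match.
   static const uint8_t *
   probe_chunk (const uint8_t *chunk, const uint8_t *buf_start,
                const uint8_t *buf_end, uint8_t c)
   {
     uint8x16_t data = vld1q_u8 (chunk);
     uint8x16_t repchr = vdupq_n_u8 (c);
     // 0x0f in even bytes, 0xf0 in odd bytes: same as dup vrepmask.8h, 0xf00f.
     uint8x16_t repmask = vreinterpretq_u8_u16 (vdupq_n_u16 (0xf00f));
     uint8x16_t has_chr = vandq_u8 (vceqq_u8 (data, repchr), repmask);
     // addp folds 128 bits down to 64: four syndrome bits per input byte.
     uint8x16_t folded = vpaddq_u8 (has_chr, has_chr);
     uint64_t synd = vgetq_lane_u64 (vreinterpretq_u64_u8 (folded), 0);
     if (synd == 0)
       return NULL;
     // Bits 4*i to 4*i+3 describe byte i, so the leading-zero count of the
     // syndrome gives the distance back from the end of the chunk.
     const uint8_t *match = chunk + 15 - (__builtin_clzll (synd) >> 2);
     return (match >= buf_start && match < buf_end) ? match : NULL;
   }

   A caller scanning backwards would apply this probe to the chunk holding the
   last byte first, then to each earlier 16-byte chunk until a match is found
   or the buffer is exhausted; the loop above performs the same walk unrolled
   to 32 bytes per iteration.  */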