/*
 * strrchr - find last position of a character in a string.
 *
 * Copyright (c) 2020-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "asmdefs.h"

#define srcin		x0
#define chrin		w1
#define result		x0

#define src		x2
#define tmp		x3
#define synd		x3
#define shift		x4
#define src_match	x4
#define nul_match	x5
#define chr_match	x6

#define vrepchr		v0
#define vdata		v1
#define vhas_nul	v2
#define vhas_chr	v3
#define vrepmask	v4
#define vend		v5
#define dend		d5

/* Core algorithm.

   For each 16-byte chunk we calculate a 64-bit syndrome value, with
   four bits per byte (LSB is always in bits 0 and 1, for both big
   and little-endian systems).  For each tuple, bits 0-1 are set if
   the relevant byte matched the requested character; bits 2-3 are set
   if the relevant byte matched the NUL end of string.  */

ENTRY (__strrchr_aarch64_mte)
	PTR_ARG (0)
	bic	src, srcin, 15
	dup	vrepchr.16b, chrin
	movi	vrepmask.16b, 0x33
	ld1	{vdata.16b}, [src]
	cmeq	vhas_nul.16b, vdata.16b, 0
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	shrn	vend.8b, vhas_nul.8h, 4
	/* Clear the syndrome bits of the bytes before srcin (src was
	   aligned down to 16 bytes).  */
	lsl	shift, srcin, 2
	fmov	synd, dend
	lsr	synd, synd, shift
	lsl	synd, synd, shift
	ands	nul_match, synd, 0xcccccccccccccccc
	bne	L(tail)
	cbnz	synd, L(loop2_start)

	/* No character match and no NUL in the valid part of the first
	   chunk: scan 32 bytes per iteration until either is found.  */
	.p2align 4
L(loop1):
	ldr	q1, [src, 16]
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	/* vhas_chr >= vdata: set if the byte matched chrin (0xff) or is 0.  */
	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbnz	synd, L(loop1_end)
	ldr	q1, [src, 32]!
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbz	synd, L(loop1)
	sub	src, src, 16
L(loop1_end):
	add	src, src, 16
	cmeq	vhas_nul.16b, vdata.16b, 0
#ifdef __AARCH64EB__
	bif	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	shrn	vend.8b, vhas_nul.8h, 4
	fmov	synd, dend
	rbit	synd, synd
#else
	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	shrn	vend.8b, vhas_nul.8h, 4
	fmov	synd, dend
#endif
	ands	nul_match, synd, 0xcccccccccccccccc
	beq	L(loop2_start)
L(tail):
	/* The chunk at src contains the NUL.  Keep only character matches
	   before it; the last one is the result, or NULL if there is none.  */
	sub	nul_match, nul_match, 1
	and	chr_match, synd, 0x3333333333333333
	ands	chr_match, chr_match, nul_match
	add	result, src, 15
	clz	tmp, chr_match
	sub	result, result, tmp, lsr 2
	csel	result, result, xzr, ne
	ret

	/* The character has been seen but no NUL yet: remember the most
	   recent chunk containing a match and keep scanning for the NUL.  */
	.p2align 4
	nop
	nop
L(loop2_start):
	add	src, src, 16
	bic	vrepmask.8h, 0xf0

L(loop2):
	cmp	synd, 0
	csel	src_match, src, src_match, ne
	csel	chr_match, synd, chr_match, ne
	ld1	{vdata.16b}, [src], 16
	cmeq	vhas_nul.16b, vdata.16b, 0
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	tst	synd, 0xcccccccccccccccc
	beq	L(loop2)

	bic	vhas_nul.8h, 0x0f, lsl 8
	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	and	nul_match, synd, 0xcccccccccccccccc
	sub	nul_match, nul_match, 1
	and	tmp, synd, 0x3333333333333333
	ands	tmp, tmp, nul_match
	csel	chr_match, tmp, chr_match, ne
	csel	src_match, src, src_match, ne
	sub	src_match, src_match, 1
	clz	tmp, chr_match
	sub	result, src_match, tmp, lsr 2
	ret

END (__strrchr_aarch64_mte)
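
/* Illustrative reference model only (kept inside this comment so it is not
   assembled or built): a minimal scalar C sketch of the syndrome scheme
   described above, assuming the little-endian nibble layout.  The helper
   names syndrome() and last_match_in_chunk() are hypothetical and are not
   part of this file or of the library.  The real routine builds the same
   nibbles with cmeq/bit/shrn instead of a byte loop.

	#include <stddef.h>
	#include <stdint.h>

	// Build the 64-bit syndrome for one 16-byte chunk.  Nibble i
	// describes chunk[i]: bits 0-1 are set on a character match,
	// bits 2-3 are set on a NUL.
	static uint64_t syndrome (const unsigned char *chunk, unsigned char c)
	{
		uint64_t synd = 0;
		for (int i = 0; i < 16; i++)
		{
			uint64_t nib = 0;
			if (chunk[i] == c)
				nib |= 0x3;	// extracted by the 0x3333... mask
			if (chunk[i] == 0)
				nib |= 0xc;	// extracted by the 0xcccc... mask
			synd |= nib << (4 * i);
		}
		return synd;
	}

	// Mirror of the L(tail) sequence: the chunk contains the NUL, so
	// mask off matches beyond it and return the last remaining one.
	static const char *last_match_in_chunk (const char *chunk, uint64_t synd)
	{
		uint64_t nul_match = synd & 0xccccccccccccccccULL;
		uint64_t chr_match = synd & 0x3333333333333333ULL;
		// nul_match - 1 sets all bits below the first NUL bit; a match
		// on the terminator itself (c == 0) survives because its chr
		// bits sit below its NUL bits, as strrchr requires.
		chr_match &= nul_match - 1;
		if (chr_match == 0)
			return NULL;
		int lz = __builtin_clzll (chr_match);	// like the clz insn
		return chunk + 15 - lz / 4;	// lz/4 = bytes from the chunk end
	}

   This mirrors the 0x3333.../0xcccc... masks and the "add 15, clz, lsr 2"
   step used at L(tail) and at the end of L(loop2).  */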