/*
 * strrchr - find last position of a character in a string.
 *
 * Copyright (c) 2020-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "asmdefs.h"

#define srcin		x0
#define chrin		w1
#define result		x0

#define src		x2
#define tmp		x3
#define synd		x3
#define shift		x4
#define src_match	x4
#define nul_match	x5
#define chr_match	x6

#define vrepchr		v0
#define vdata		v1
#define vhas_nul	v2
#define vhas_chr	v3
#define vrepmask	v4
#define vend		v5
#define dend		d5

/* Core algorithm.

   For each 16-byte chunk we calculate a 64-bit syndrome value, with
   four bits per byte (LSB is always in bits 0 and 1, for both big
   and little-endian systems).  For each tuple, bits 0-1 are set if
   the relevant byte matched the requested character; bits 2-3 are set
   if the relevant byte matched the NUL end of string.  */

ENTRY (__strrchr_aarch64_mte)
	/* Work on the 16-byte aligned block containing the start of the
	   string.  */
	bic	src, srcin, 15
	dup	vrepchr.16b, chrin
	movi	vrepmask.16b, 0x33
	ld1	{vdata.16b}, [src]
	cmeq	vhas_nul.16b, vdata.16b, 0
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	shrn	vend.8b, vhas_nul.8h, 4
	/* Clear the syndrome entries (4 bits per byte) of any bytes that
	   precede the start of the string.  */
	lsl	shift, srcin, 2
	fmov	synd, dend
	lsr	synd, synd, shift
	lsl	synd, synd, shift
	ands	nul_match, synd, 0xcccccccccccccccc
	bne	L(tail)
	cbnz	synd, L(loop2_start)

	/* Scan 32 bytes per iteration until a NUL or matching byte is
	   found.  */
	.p2align 4
L(loop1):
	ldr	q1, [src, 16]
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	/* vhas_nul is non-zero for bytes that are NUL or match chrin
	   (vhas_chr is 0xff on a match).  */
	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbnz	synd, L(loop1_end)
	ldr	q1, [src, 32]!
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbz	synd, L(loop1)
	sub	src, src, 16
L(loop1_end):
	add	src, src, 16
	cmeq	vhas_nul.16b, vdata.16b, 0
#ifdef __AARCH64EB__
	bif	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	shrn	vend.8b, vhas_nul.8h, 4
	fmov	synd, dend
	rbit	synd, synd
#else
	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	shrn	vend.8b, vhas_nul.8h, 4
	fmov	synd, dend
#endif
	ands	nul_match, synd, 0xcccccccccccccccc
	beq	L(loop2_start)
L(tail):
	/* nul_match - 1 sets every syndrome bit below the first NUL, so the
	   AND keeps only matches that occur before the end of the string.  */
	sub	nul_match, nul_match, 1
	and	chr_match, synd, 0x3333333333333333
	ands	chr_match, chr_match, nul_match
	/* The last match is the highest set bit: clz / 4 is its distance in
	   bytes from the end of the block.  Return NULL if there was no
	   match.  */
	add	result, src, 15
	clz	tmp, chr_match
	sub	result, result, tmp, lsr 2
	csel	result, result, xzr, ne
	ret

	.p2align 4
	nop
	nop
	/* A match was found but no NUL yet: remember the most recent chunk
	   containing a match and keep scanning until the end of the
	   string.  */
L(loop2_start):
	add	src, src, 16
	bic	vrepmask.8h, 0xf0

L(loop2):
	cmp	synd, 0
	csel	src_match, src, src_match, ne
	csel	chr_match, synd, chr_match, ne
	ld1	{vdata.16b}, [src], 16
	cmeq	vhas_nul.16b, vdata.16b, 0
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	tst	synd, 0xcccccccccccccccc
	beq	L(loop2)

	bic	vhas_nul.8h, 0x0f, lsl 8
	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	and	nul_match, synd, 0xcccccccccccccccc
	sub	nul_match, nul_match, 1
	and	tmp, synd, 0x3333333333333333
	ands	tmp, tmp, nul_match
	csel	chr_match, tmp, chr_match, ne
	csel	src_match, src, src_match, ne
	sub	src_match, src_match, 1
	clz	tmp, chr_match
	sub	result, src_match, tmp, lsr 2
	ret

END (__strrchr_aarch64_mte)
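
/* Reference sketch (kept inside a comment, not part of the build): a scalar
   C model of how one 16-byte chunk's syndrome is built and decoded, assuming
   the chunk contains the terminating NUL (the case handled at L(tail) above).
   The helper name strrchr_chunk and the use of <stdint.h>, <stddef.h> and
   __builtin_clzll are illustrative assumptions, not part of this file.

   static const char *strrchr_chunk (const char *chunk, unsigned char c)
   {
     uint64_t synd = 0;
     for (int i = 0; i < 16; i++)
       {
	 unsigned char b = chunk[i];
	 if (b == c)
	   synd |= 0x3ull << (4 * i);	// character match -> bits 0-1
	 if (b == 0)
	   synd |= 0xcull << (4 * i);	// NUL -> bits 2-3
       }
     uint64_t nul_match = synd & 0xccccccccccccccccull;
     uint64_t chr_match = synd & 0x3333333333333333ull;
     // nul_match - 1 sets every bit below the first NUL bit, so this keeps
     // only matches that occur before the end of the string.
     chr_match &= nul_match - 1;
     if (chr_match == 0)
       return NULL;
     // The highest surviving bit is the last match; its bit index divided
     // by 4 is the byte index within the chunk.
     int bit = 63 - __builtin_clzll (chr_match);
     return chunk + bit / 4;
   }

   The vector code reaches the same result with NEON compares, a BIT merge
   under the 0x33 mask, and a SHRN by 4 that narrows each pair of compare
   bytes into one syndrome byte.  */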