/*
 * memchr - find a character in a memory zone
 *
 * Copyright (c) 2020-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "asmdefs.h"

#define srcin		x0
#define chrin		w1
#define cntin		x2
#define result		x0

#define src		x3
#define cntrem		x4
#define synd		x5
#define shift		x6
#define tmp		x7

#define vrepchr		v0
#define qdata		q1
#define vdata		v1
#define vhas_chr	v2
#define vend		v3
#define dend		d3

/*
   Core algorithm:
   For each 16-byte chunk we calculate a 64-bit nibble mask value with four
   bits per byte. We take 4 bits of every comparison byte with the
   shift-right-and-narrow-by-4 (shrn) instruction. Since the bits in the
   nibble mask reflect the order in which things occur in the original
   string, counting leading zeros identifies exactly which byte matched.  */

ENTRY (__memchr_aarch64_mte)
	bic	src, srcin, 15		/* Align down to 16 bytes.  */
	cbz	cntin, L(nomatch)
	ld1	{vdata.16b}, [src]
	dup	vrepchr.16b, chrin
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	lsl	shift, srcin, 2		/* 4 syndrome bits per byte.  */
	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
	fmov	synd, dend
	lsr	synd, synd, shift	/* Remove bits of bytes before srcin.  */
	cbz	synd, L(start_loop)

	rbit	synd, synd
	clz	synd, synd		/* Nibble index of the first match.  */
	cmp	cntin, synd, lsr 2
	add	result, srcin, synd, lsr 2
	csel	result, result, xzr, hi	/* NULL if match is beyond cntin.  */
	ret

	.p2align 3
L(start_loop):
	/* cntrem = bytes left after the first chunk, biased by 1 so that
	   b.lo also catches the case of no bytes left.  */
	sub	tmp, src, srcin
	add	tmp, tmp, 17
	subs	cntrem, cntin, tmp
	b.lo	L(nomatch)

	/* Make sure that it won't overread by a 16-byte chunk.  */
	tbz	cntrem, 4, L(loop32_2)
	sub	src, src, 16
	.p2align 4
L(loop32):
	ldr	qdata, [src, 32]!
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
	fmov	synd, dend
	cbnz	synd, L(end)

L(loop32_2):
	ldr	qdata, [src, 16]
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	subs	cntrem, cntrem, 32
	b.lo	L(end_2)
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
	fmov	synd, dend
	cbz	synd, L(loop32)
L(end_2):
	add	src, src, 16
L(end):
	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
	sub	cntrem, src, srcin
	fmov	synd, dend
	sub	cntrem, cntin, cntrem	/* Bytes left from src onwards.  */
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	synd, synd
	cmp	cntrem, synd, lsr 2
	add	result, src, synd, lsr 2
	csel	result, result, xzr, hi	/* NULL if match is beyond cntin.  */
	ret

L(nomatch):
	mov	result, 0
	ret

END (__memchr_aarch64_mte)
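
/* The C sketch below is illustrative only and is not assembled or built:
   it models the nibble-mask syndrome technique described in the header
   comment using ACLE NEON intrinsics, for the little-endian case.  The
   name memchr_nibble_mask_ref is hypothetical, and the sketch omits the
   MTE-safe aligned first load and the unrolled 32-byte loop of the real
   routine, handling residual bytes with a plain scalar tail instead.

   #include <arm_neon.h>
   #include <stddef.h>
   #include <stdint.h>

   static const void *
   memchr_nibble_mask_ref (const void *s, int c, size_t n)
   {
     const unsigned char *p = s;
     uint8x16_t repchr = vdupq_n_u8 ((unsigned char) c);
     while (n >= 16)
       {
         uint8x16_t data = vld1q_u8 (p);
         // 0xff in each byte that equals c, 0x00 elsewhere.
         uint8x16_t match = vceqq_u8 (data, repchr);
         // Shift right and narrow by 4 (shrn): keep 4 bits per
         // comparison byte, giving a 64-bit mask in source-byte order.
         uint8x8_t mask = vshrn_n_u16 (vreinterpretq_u16_u8 (match), 4);
         uint64_t synd = vget_lane_u64 (vreinterpret_u64_u8 (mask), 0);
         if (synd != 0)
           // Trailing zero count (rbit + clz in the assembly) divided
           // by 4 is the index of the first matching byte.
           return p + (__builtin_ctzll (synd) >> 2);
         p += 16;
         n -= 16;
       }
     while (n--)
       {
         if (*p == (unsigned char) c)
           return p;
         p++;
       }
     return NULL;
   }  */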
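
/* Also illustrative, not part of the build: a C model of the entry
   sequence (the bic/lsl/lsr instructions above).  The first load is
   aligned down to 16 bytes, so it never crosses a 16-byte MTE tag
   granule boundary, and the syndrome bits belonging to bytes before
   srcin are then shifted out.  first_chunk_synd_ref is a hypothetical
   name; reading bytes before srcin this way is only well-defined at the
   assembly level, not in portable C.

   #include <arm_neon.h>
   #include <stdint.h>

   static uint64_t
   first_chunk_synd_ref (const unsigned char *srcin, unsigned char c)
   {
     // bic src, srcin, 15: align down to the containing 16-byte chunk.
     const unsigned char *src =
       (const unsigned char *) ((uintptr_t) srcin & ~(uintptr_t) 15);
     uint8x16_t match = vceqq_u8 (vld1q_u8 (src), vdupq_n_u8 (c));
     uint8x8_t mask = vshrn_n_u16 (vreinterpretq_u16_u8 (match), 4);
     uint64_t synd = vget_lane_u64 (vreinterpret_u64_u8 (mask), 0);
     // lsl shift, srcin, 2 then lsr synd, synd, shift: each byte owns
     // 4 syndrome bits, so drop the 4 * (srcin & 15) low garbage bits
     // (the assembly relies on lsr taking the shift amount mod 64).
     return synd >> (((uintptr_t) srcin & 15) * 4);
   }  */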