/*
 * memchr - find a character in a memory zone
 *
 * Copyright (c) 2020-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "asmdefs.h"

#define srcin		x0
#define chrin		w1
#define cntin		x2
#define result		x0

#define src		x3
#define cntrem		x4
#define synd		x5
#define shift		x6
#define tmp		x7

#define vrepchr		v0
#define qdata		q1
#define vdata		v1
#define vhas_chr	v2
#define vend		v3
#define dend		d3

/*
   Core algorithm:
   For each 16-byte chunk we calculate a 64-bit nibble mask value with four
   bits per byte. We take 4 bits of every comparison byte with a shift right
   and narrow by 4 instruction. Since the bits in the nibble mask reflect the
   order in which things occur in the original string, counting leading zeros
   identifies exactly which byte matched.  */

ENTRY (__memchr_aarch64_mte)
	PTR_ARG (0)
	SIZE_ARG (2)
	bic	src, srcin, 15		/* Align src down to 16 bytes.  */
	cbz	cntin, L(nomatch)
	ld1	{vdata.16b}, [src]
	dup	vrepchr.16b, chrin
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	lsl	shift, srcin, 2		/* Four syndrome bits per byte.  */
	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
	fmov	synd, dend
	lsr	synd, synd, shift	/* Discard bits for bytes before srcin.  */
	cbz	synd, L(start_loop)

	/* Match in the first chunk: the byte offset is ctz (synd) / 4.
	   Return NULL if the match lies beyond the requested count.  */
	rbit	synd, synd
	clz	synd, synd
	cmp	cntin, synd, lsr 2
	add	result, srcin, synd, lsr 2
	csel	result, result, xzr, hi
	ret

	.p2align 3
L(start_loop):
	/* tmp is the number of bytes already examined plus one, so
	   cntrem becomes the remaining byte count minus one.  */
	sub	tmp, src, srcin
	add	tmp, tmp, 17
	subs	cntrem, cntin, tmp
	b.lo	L(nomatch)

	/* Make sure that it won't overread by a 16-byte chunk.  */
	tbz	cntrem, 4, L(loop32_2)
	sub	src, src, 16
	.p2align 4
L(loop32):
	ldr	qdata, [src, 32]!
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
	fmov	synd, dend
	cbnz	synd, L(end)

L(loop32_2):
	ldr	qdata, [src, 16]
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	subs	cntrem, cntrem, 32
	b.lo	L(end_2)
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
	fmov	synd, dend
	cbz	synd, L(loop32)
L(end_2):
	add	src, src, 16
L(end):
	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
	sub	cntrem, src, srcin
	fmov	synd, dend
	sub	cntrem, cntin, cntrem	/* Bytes of the count left at src.  */
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	synd, synd
	cmp	cntrem, synd, lsr 2
	add	result, src, synd, lsr 2
	csel	result, result, xzr, hi	/* NULL if the match is past cntin.  */
	ret

L(nomatch):
	mov	result, 0
	ret

END (__memchr_aarch64_mte)
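
/* Worked example (illustrative, not part of the build): searching for 'X'
   in the chunk "aaaaaXaaaaaaaaaa" sets all four syndrome bits of byte 5,
   giving synd = 0x0000000000f00000.  Its trailing-zero count is 20, and
   20 / 4 = 5, the byte offset of the match.  The code computes this with
   rbit + clz and then divides by 4 with the "lsr 2" folded into the cmp
   and add instructions.  */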
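
/* Illustrative reference, not assembled: a minimal C sketch of the
   nibble-mask syndrome used above, assuming GCC/Clang with <arm_neon.h>
   on little-endian AArch64.  The helper names (chunk_syndrome,
   syndrome_to_match) are hypothetical and exist only for this sketch.

   #include <arm_neon.h>
   #include <stddef.h>
   #include <stdint.h>

   // Build the 64-bit syndrome for one 16-byte chunk: four bits per
   // byte, set where the byte equals c (the CMEQ + SHRN #4 pair above).
   static uint64_t chunk_syndrome (const uint8_t *p, uint8_t c)
   {
     uint8x16_t data = vld1q_u8 (p);
     uint8x16_t eq = vceqq_u8 (data, vdupq_n_u8 (c)); // 0xff or 0x00
     uint8x8_t nibbles = vshrn_n_u16 (vreinterpretq_u16_u8 (eq), 4);
     return vget_lane_u64 (vreinterpret_u64_u8 (nibbles), 0);
   }

   // Decode a non-zero syndrome: the first match is at byte
   // ctz (synd) / 4, the analogue of the RBIT + CLZ + "lsr 2" sequence.
   static const uint8_t *syndrome_to_match (const uint8_t *p, uint64_t synd)
   {
     return synd ? p + (__builtin_ctzll (synd) >> 2) : NULL;
   }  */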