/*
 * memchr - find a character in a memory zone
 *
 * Copyright (c) 2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "../asmdefs.h"

#define srcin		x0
#define chrin		w1
#define cntin		x2
#define result		x0

#define src		x3
#define cntrem		x4
#define synd		x5
#define shift		x6
#define tmp		x7
#define wtmp		w7

#define vrepchr		v0
#define qdata		q1
#define vdata		v1
#define vhas_chr	v2
#define vrepmask	v3
#define vend		v4
#define dend		d4

/*
   Core algorithm:

   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
   requested character; bits 4-7 must be zero. Bits 4-7 are set likewise for
   odd bytes so that adjacent bytes can be merged. Since the bits in the
   syndrome reflect the order in which things occur in the original data,
   counting trailing zeros identifies exactly which byte matched. A scalar C
   model of this computation is sketched in a comment at the end of this
   file.  */

ENTRY (__memchr_aarch64_mte)
	PTR_ARG (0)
	SIZE_ARG (2)
	bic	src, srcin, 15		/* Align src down to a 16-byte boundary.  */
	cbz	cntin, L(nomatch)
	ld1	{vdata.16b}, [src]
	dup	vrepchr.16b, chrin
	mov	wtmp, 0xf00f		/* Per-halfword nibble mask used to build the syndrome.  */
	dup	vrepmask.8h, wtmp
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	lsl	shift, srcin, 2
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
	fmov	synd, dend
	lsr	synd, synd, shift	/* Discard bits of bytes before srcin (lsr uses shift % 64).  */
	cbz	synd, L(start_loop)

	rbit	synd, synd
	clz	synd, synd
	add	result, srcin, synd, lsr 2
	cmp	cntin, synd, lsr 2
	csel	result, result, xzr, hi	/* Return NULL if the match is beyond cntin bytes.  */
	ret

L(start_loop):
	sub	tmp, src, srcin
	add	tmp, tmp, 16		/* Bytes of the first chunk at or after srcin.  */
	subs	cntrem, cntin, tmp
	b.ls	L(nomatch)

	/* Make sure that the loop below won't overread by a 16-byte chunk:
	   enter it at the second half when an odd number of 16-byte chunks
	   remain.  */
	add	tmp, cntrem, 15
	tbnz	tmp, 4, L(loop32_2)

	.p2align 4
L(loop32):
	ldr	qdata, [src, 16]!
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
	fmov	synd, dend
	cbnz	synd, L(end)

L(loop32_2):
	ldr	qdata, [src, 16]!
	subs	cntrem, cntrem, 32
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	b.ls	L(end)
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
	fmov	synd, dend
	cbz	synd, L(loop32)
L(end):
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
	fmov	synd, dend
	add	tmp, srcin, cntin
	sub	cntrem, tmp, src	/* cntrem = bytes from the current chunk to the buffer end.  */
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	synd, synd
	cmp	cntrem, synd, lsr 2
	add	result, src, synd, lsr 2
	csel	result, result, xzr, hi	/* Return NULL if the match is past the buffer end.  */
	ret

L(nomatch):
	mov	result, 0
	ret

END (__memchr_aarch64_mte)
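
/* Illustrative only, not part of the build: a minimal scalar C model of the
   syndrome computation described in the "Core algorithm" comment above,
   assuming a little-endian host.  The names syndrome16 and first_match16 are
   hypothetical and do not exist in this library.

   #include <stdint.h>

   // Build the 64-bit syndrome for one 16-byte chunk.  A matching even byte
   // contributes a 0x0f nibble and a matching odd byte a 0xf0 nibble,
   // mirroring the cmeq / and-with-0xf00f / addp sequence above, so byte k
   // of the chunk owns syndrome bits 4k..4k+3.
   static uint64_t syndrome16 (const unsigned char *chunk, unsigned char c)
   {
     uint64_t synd = 0;
     for (int i = 0; i < 16; i++)
       {
	 unsigned char match = (chunk[i] == c) ? 0xff : 0x00;
	 unsigned char nibble = match & ((i & 1) ? 0xf0 : 0x0f);
	 synd |= (uint64_t) nibble << (4 * (i & ~1));
       }
     return synd;
   }

   // Index of the first matching byte in the chunk, or -1.  Counting
   // trailing zeros of the syndrome and dividing by four recovers the byte
   // position, just as the rbit/clz pair does in the assembly (AArch64 has
   // no count-trailing-zeros instruction).
   static int first_match16 (const unsigned char *chunk, unsigned char c)
   {
     uint64_t synd = syndrome16 (chunk, c);
     return synd ? (int) (__builtin_ctzll (synd) / 4) : -1;
   }
*/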