/*
 * memchr - find a character in a memory zone
 *
 * Copyright (c) 2014-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

#include "asmdefs.h"

/* Arguments and results.  */
#define srcin		x0
#define chrin		w1
#define cntin		x2

#define result		x0

#define src		x3
#define tmp		x4
#define wtmp2		w5
#define synd		x6
#define soff		x9
#define cntrem		x10

#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_chr1	v3
#define vhas_chr2	v4
#define vrepmask	v5
#define vend		v6

/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
 * per byte. For each tuple, bit 0 is set if the relevant byte matched the
 * requested character and bit 1 is not used (faster than using a 32-bit
 * syndrome). Since the bits in the syndrome reflect exactly the order in which
 * things occur in the original string, counting trailing zeros allows us to
 * identify exactly which byte has matched.
 */

ENTRY (__memchr_aarch64)
	/* Do not dereference srcin if there are no bytes to compare.  */
	cbz	cntin, L(zero_length)
	/*
	 * Magic constant 0x40100401 allows us to identify which lane matches
	 * the requested byte.
	 */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	/* Work with aligned 32-byte chunks */
	bic	src, srcin, #31
	dup	vrepmask.4s, wtmp2
	ands	soff, srcin, #31
	and	cntrem, cntin, #31
	b.eq	L(loop)

	/*
	 * Input string is not 32-byte aligned. We calculate the syndrome
	 * value for the aligned 32-byte block containing the first bytes
	 * and mask the irrelevant part.
	 */

	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	sub	tmp, soff, #32
	adds	cntin, cntin, tmp
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Clear the soff*2 lower bits */
	lsl	tmp, soff, #1
	lsr	synd, synd, tmp
	lsl	synd, synd, tmp
	/* The first block can also be the last */
	b.ls	L(masklast)
	/* Have we found something already? */
	cbnz	synd, L(tail)

L(loop):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	subs	cntin, cntin, #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* If we're out of data we finish regardless of the result */
	b.ls	L(end)
	/* Use a fast check for the termination condition */
	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
	addp	vend.2d, vend.2d, vend.2d
	mov	synd, vend.d[0]
	/* We're not out of data, loop if we haven't found the character */
	cbz	synd, L(loop)

L(end):
	/* Termination condition found, let's calculate the syndrome value */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Only do the clear for the last possible block */
	b.hs	L(tail)

L(masklast):
	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
	add	tmp, cntrem, soff
	and	tmp, tmp, #31
	sub	tmp, tmp, #32
	neg	tmp, tmp, lsl #1
	lsl	synd, synd, tmp
	lsr	synd, synd, tmp

L(tail):
	/* Count the trailing zeros using bit reversing */
	rbit	synd, synd
	/* Compensate for the last post-increment */
	sub	src, src, #32
	/* Check that we have found a character */
	cmp	synd, #0
	/* And count the leading zeros */
	clz	synd, synd
	/* Compute the potential result */
	add	result, src, synd, lsr #1
	/* Select result or NULL */
	csel	result, xzr, result, eq
	ret

L(zero_length):
	mov	result, #0
	ret

END (__memchr_aarch64)
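
/*
 * Illustrative reference model (not assembled; a minimal sketch, assuming a
 * C compiler that provides the GCC/Clang __builtin_ctzll builtin).  It
 * mirrors the syndrome idea described above for full 32-byte chunks only:
 * two bits per byte, bit 2*i set when byte i matches, and the trailing-zero
 * count halved gives the matching offset (the assembly obtains the same
 * count via rbit+clz).  The names chunk_syndrome and memchr_model are
 * hypothetical and not part of this routine; alignment handling and the
 * tail masking done above are omitted.
 */
#if 0
#include <stddef.h>
#include <stdint.h>

/* Build the 64-bit syndrome for one 32-byte chunk.  */
static uint64_t
chunk_syndrome (const unsigned char *chunk, unsigned char c)
{
  uint64_t synd = 0;
  for (int i = 0; i < 32; i++)
    if (chunk[i] == c)
      synd |= 1ULL << (2 * i);	/* bit 2*i marks a match at byte i */
  return synd;
}

/* Scalar memchr built on the syndrome, with a plain byte loop as the tail.  */
static void *
memchr_model (const void *s, int c, size_t n)
{
  const unsigned char *p = (const unsigned char *) s;
  unsigned char ch = (unsigned char) c;
  size_t i = 0;
  for (; i + 32 <= n; i += 32)
    {
      uint64_t synd = chunk_syndrome (p + i, ch);
      if (synd != 0)
	/* Trailing zeros / 2 == index of the first matching byte.  */
	return (void *) (p + i + (__builtin_ctzll (synd) >> 1));
    }
  for (; i < n; i++)
    if (p[i] == ch)
      return (void *) (p + i);
  return NULL;
}
#endif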