/*
 * memchr - find a character in a memory zone
 *
 * Copyright (c) 2014, ARM Limited
 * All rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the company nor the names of its contributors
 *       may be used to endorse or promote products derived from this
 *       software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

/*
 * C equivalent:  void *memchr (const void *srcin, int chrin, size_t cntin)
 *
 * ABI:      AAPCS64
 * In:       x0 = srcin  - buffer to search (dereferenced only if cntin != 0)
 *           w1 = chrin  - byte value to match (low 8 bits used by dup .16b)
 *           x2 = cntin  - number of bytes to search
 * Out:      x0 = result - pointer to first matching byte, or NULL (0)
 * Clobbers: x3, x4, x6, x9, x10, w5, v0-v6, NZCV
 *           (all caller-saved under AAPCS64; no stack usage, leaf function)
 */

/* Arguments and results.  */
#define srcin		x0
#define chrin		w1
#define cntin		x2

#define result		x0

/* Scratch registers.  */
#define src		x3	/* 32-byte-aligned cursor into the buffer */
#define tmp		x4
#define wtmp2		w5
#define synd		x6	/* 64-bit syndrome: 2 bits per input byte */
#define soff		x9	/* srcin % 32: misalignment of the start */
#define cntrem		x10	/* cntin % 32: odd bytes in the last block */

#define vrepchr		v0	/* chrin replicated into all 16 byte lanes */
#define vdata1		v1	/* current 32-byte chunk, low half */
#define vdata2		v2	/* current 32-byte chunk, high half */
#define vhas_chr1	v3	/* per-byte match mask for vdata1 */
#define vhas_chr2	v4	/* per-byte match mask for vdata2 */
#define vrepmask	v5	/* 0x40100401 replicated per 32-bit lane */
#define vend		v6	/* pairwise-add reduction scratch */

/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
 * per byte. For each tuple, bit 0 is set if the relevant byte matched the
 * requested character and bit 1 is not used (faster than using a 32bit
 * syndrome). Since the bits in the syndrome reflect exactly the order in which
 * things occur in the original string, counting trailing zeros allows to
 * identify exactly which byte has matched.
 */

	/* Emit the section/alignment/label boilerplate for a global
	   function \f, optionally .p2align'd to 2^\p2align bytes.  */
	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

def_fn memchr
	/* Do not dereference srcin if no bytes to compare.  */
	cbz	cntin, .Lzero_length
	/*
	 * Magic constant 0x40100401 allows us to identify which lane matches
	 * the requested byte.  ANDed with the all-ones/all-zeros CMEQ result
	 * it leaves one distinct bit per byte, so the two ADDP reductions
	 * below (which sum adjacent byte pairs) pack 32 byte lanes into a
	 * 64-bit syndrome without the bits colliding.
	 */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	/* Work with aligned 32-byte chunks */
	bic	src, srcin, #31
	dup	vrepmask.4s, wtmp2
	ands	soff, srcin, #31	/* soff = misalignment; sets Z for b.eq */
	and	cntrem, cntin, #31
	b.eq	.Lloop			/* already 32-byte aligned: main loop */

	/*
	 * Input string is not 32-byte aligned. We calculate the syndrome
	 * value for the aligned 32 bytes block containing the first bytes
	 * and mask the irrelevant part.
	 */

	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	/* cntin += soff - 32: bytes still to search after this block.
	   LS after the adds (C==0 || Z==1) <=> cntin + soff <= 32, i.e. the
	   buffer ends inside this very first block.  */
	sub	tmp, soff, #32
	adds	cntin, cntin, tmp
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Clear the soff*2 lower bits: bytes before srcin must not match.  */
	lsl	tmp, soff, #1
	lsr	synd, synd, tmp
	lsl	synd, synd, tmp
	/* The first block can also be the last (flags still from adds).  */
	b.ls	.Lmasklast
	/* Have we found something already? */
	cbnz	synd, .Ltail

.Lloop:
	/* Invariant: src is 32-byte aligned; cntin > 0 bytes remain.  */
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	subs	cntin, cntin, #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* If we're out of data we finish regardless of the result.
	   LS here <=> cntin was <= 32 before the subs.  */
	b.ls	.Lend
	/* Use a fast check for the termination condition: any set byte in
	   either match mask makes the reduced doubleword non-zero.  */
	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
	addp	vend.2d, vend.2d, vend.2d
	mov	synd, vend.d[0]
	/* We're not out of data, loop if we haven't found the character.  */
	cbz	synd, .Lloop

.Lend:
	/* Termination condition found, let's calculate the syndrome value.
	   NZCV is still from the subs above: none of the vector ops, mov or
	   cbz in between touches the flags.  */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Only do the clear for the last possible block: HI means data
	   remains (we fell through from cbz with a match), so no masking.  */
	b.hi	.Ltail

.Lmasklast:
	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits: bytes past
	   the end of the buffer must not count as matches.  If the end is
	   exactly 32-byte aligned the computed shift is 64, which AArch64
	   register shifts take mod 64, i.e. shift by 0 - correct, since no
	   masking is needed in that case.  */
	add	tmp, cntrem, soff
	and	tmp, tmp, #31
	sub	tmp, tmp, #32
	neg	tmp, tmp, lsl #1
	lsl	synd, synd, tmp
	lsr	synd, synd, tmp

.Ltail:
	/* Count the trailing zeros using bit reversing */
	rbit	synd, synd
	/* Compensate the last post-increment */
	sub	src, src, #32
	/* Check that we have found a character */
	cmp	synd, #0
	/* And count the leading zeros (of the reversed value, i.e. trailing
	   zeros of the syndrome; clz does not disturb the flags) */
	clz	synd, synd
	/* Compute the potential result: two syndrome bits per byte, hence
	   the lsr #1 to convert bit index to byte offset.  */
	add	result, src, synd, lsr #1
	/* Select result or NULL (EQ <=> empty syndrome, no match) */
	csel	result, xzr, result, eq
	ret

.Lzero_length:
	mov	result, #0
	ret

	.size	memchr, . - memchr