/*
 * memchr - find a character in a memory zone
 *
 * Copyright (c) 2014-2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

#include "../asmdefs.h"

/* Arguments and results. */
#define srcin		x0
#define chrin		w1
#define cntin		x2

#define result		x0

#define src		x3
#define tmp		x4
#define wtmp2		w5
#define synd		x6
#define soff		x9
#define cntrem		x10

#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_chr1	v3
#define vhas_chr2	v4
#define vrepmask	v5
#define vend		v6

/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
 * per byte. For each tuple, bit 0 is set if the relevant byte matched the
 * requested character and bit 1 is not used (faster than using a 32-bit
 * syndrome). Since the bits in the syndrome reflect exactly the order in which
 * things occur in the original string, counting trailing zeros identifies
 * exactly which byte has matched.
 */

ENTRY (__memchr_aarch64)
	PTR_ARG (0)
	SIZE_ARG (2)
	/* Do not dereference srcin if no bytes to compare. */
	cbz	cntin, L(zero_length)
	/*
	 * Magic constant 0x40100401 allows us to identify which lane matches
	 * the requested byte.
	 */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	/* Work with aligned 32-byte chunks */
	bic	src, srcin, #31
	dup	vrepmask.4s, wtmp2
	ands	soff, srcin, #31
	and	cntrem, cntin, #31
	b.eq	L(loop)

	/*
	 * Input string is not 32-byte aligned. We calculate the syndrome
	 * value for the aligned 32-byte block containing the first bytes
	 * and mask the irrelevant part.
	 */

	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	sub	tmp, soff, #32
	adds	cntin, cntin, tmp
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Clear the soff*2 lower bits */
	lsl	tmp, soff, #1
	lsr	synd, synd, tmp
	lsl	synd, synd, tmp
	/* The first block can also be the last */
	b.ls	L(masklast)
	/* Have we found something already? */
	cbnz	synd, L(tail)

L(loop):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	subs	cntin, cntin, #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* If we're out of data we finish regardless of the result */
	b.ls	L(end)
	/* Use a fast check for the termination condition */
	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
	addp	vend.2d, vend.2d, vend.2d
	mov	synd, vend.d[0]
	/* We're not out of data, loop if we haven't found the character */
	cbz	synd, L(loop)

L(end):
	/* Termination condition found, let's calculate the syndrome value */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Only do the clear for the last possible block */
	b.hs	L(tail)

L(masklast):
	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
	add	tmp, cntrem, soff
	and	tmp, tmp, #31
	sub	tmp, tmp, #32
	neg	tmp, tmp, lsl #1
	lsl	synd, synd, tmp
	lsr	synd, synd, tmp

L(tail):
	/* Count the trailing zeros using bit reversing */
	rbit	synd, synd
	/* Compensate the last post-increment */
	sub	src, src, #32
	/* Check that we have found a character */
	cmp	synd, #0
	/* And count the leading zeros */
	clz	synd, synd
	/* Compute the potential result */
	add	result, src, synd, lsr #1
	/* Select result or NULL */
	csel	result, xzr, result, eq
	ret

L(zero_length):
	mov	result, #0
	ret

END (__memchr_aarch64)
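
/*
 * The scalar sketch below is not part of the original routine; it is an
 * illustrative reference model, in C, of the syndrome scheme described in
 * the "Core algorithm" comment above: two bits per byte of a 32-byte chunk,
 * bit 0 of each pair set on a match, and a trailing-zero count halved to
 * recover the offset of the first matching byte. The helper name is
 * hypothetical, and __builtin_ctzll assumes a GCC/Clang-style compiler.
 *
 *   #include <stddef.h>
 *   #include <stdint.h>
 *
 *   // Return the first match of c in one 32-byte chunk, or NULL.
 *   static const unsigned char *
 *   syndrome_memchr_chunk (const unsigned char *chunk, unsigned char c)
 *   {
 *     uint64_t synd = 0;
 *
 *     // Bit 2*i corresponds to chunk[i]; bit 2*i+1 stays unused, mirroring
 *     // the layout the vector code builds with vrepmask and the addp steps.
 *     for (size_t i = 0; i < 32; i++)
 *       if (chunk[i] == c)
 *         synd |= (uint64_t) 1 << (2 * i);
 *
 *     if (synd == 0)
 *       return NULL;
 *
 *     // ctz(synd) / 2 is the index of the first matching byte; the
 *     // rbit + clz + "lsr #1" sequence in L(tail) computes the same thing.
 *     return chunk + (__builtin_ctzll (synd) >> 1);
 *   }
 */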