/*
   strchrnul - find a character or nul in a string

   Copyright (c) 2014, ARM Limited
   All rights Reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the company nor the names of its contributors
         may be used to endorse or promote products derived from this
         software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

/* Arguments and results.  */
#define srcin		x0
#define chrin		w1

#define result		x0

/* Locals and temporaries.  */
#define src		x2
#define	tmp1		x3
#define wtmp2		w4
#define tmp3		x5

#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_nul1	v3
#define vhas_nul2	v4
#define vhas_chr1	v5
#define vhas_chr2	v6
#define vrepmask	v7
#define vend1		v16

/* Core algorithm.

   For each 32-byte hunk we calculate a 64-bit syndrome value, with
   two bits per byte (LSB is always in bits 0 and 1, for both big
   and little-endian systems).  For each tuple, bit 0 is set iff
   the relevant byte matched the requested character or nul.  Since the
   bits in the syndrome reflect exactly the order in which things occur
   in the original string a count_trailing_zeros() operation will
   identify exactly which byte is causing the termination.  */

/* Emit the boilerplate for a global function symbol F in .text:
   alignment of 2^P2ALIGN bytes (default 2^0), ELF type %function,
   followed by the entry label itself.  */
	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

/* char *strchrnul (const char *s, int c)

   In:       x0 = s, w1 = c (only the low 8 bits are compared).
   Out:      x0 = address of the first byte of s equal to c, or of the
             terminating nul when c does not occur before it.
   Clobbers: x2, x3, x4, x5, v0-v7, v16 and the NZCV flags.  All of these
             are caller-saved under AAPCS64.  Leaf function; the stack is
             not touched.

   Memory is only ever read as whole 32-byte hunks at 32-byte-aligned
   addresses, so bytes that precede s or follow the terminator inside
   such a hunk are loaded but never influence the result.  */
def_fn strchrnul
	/* Magic constant 0x40100401 to allow us to identify which lane
	   matches the termination condition.  Its bytes are 0x01, 0x04,
	   0x10, 0x40: each byte of a group of four keeps a distinct
	   even-numbered bit, so two rounds of pairwise byte addition fold
	   four input bytes into one syndrome byte without collisions.  */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
	dup	vrepmask.4s, wtmp2
	ands	tmp1, srcin, #31	/* tmp1 = offset of s inside its hunk.  */
	b.eq	.Lloop

	/* Input string is not 32-byte aligned.  Rather than forcing
	   the padding bytes to a safe value, we calculate the syndrome
	   for all the bytes, but then mask off those bits of the
	   syndrome that are related to the padding.  */
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	neg	tmp1, tmp1
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	orr	vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b
	orr	vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	/* tmp1 = -2 * offset.  A register shift uses the amount modulo 64,
	   so the lsr below shifts all-ones right by 64 - 2 * offset and
	   leaves exactly the low 2 * offset bits set: the syndrome bits
	   that belong to the padding bytes in front of the string.  */
	lsl	tmp1, tmp1, #1
	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	mov	tmp3, #~0
	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
	lsr	tmp1, tmp3, tmp1

	mov	tmp3, vend1.d[0]
	bic	tmp1, tmp3, tmp1	// Mask padding bits.
	cbnz	tmp1, .Ltail

.Lloop:
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* Use a fast check for the termination condition.  Every byte of
	   the combined vector is 0x00 or 0xff, so the sum of its two
	   64-bit halves is zero only when no byte in the hunk matched.  */
	orr	vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
	orr	vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
	orr	vend1.16b, vhas_chr1.16b, vhas_chr2.16b
	addp	vend1.2d, vend1.2d, vend1.2d
	mov	tmp1, vend1.d[0]
	cbz	tmp1, .Lloop

	/* Termination condition found.  Now need to establish exactly why
	   we terminated.  */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b		// 256->128
	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64

	mov	tmp1, vend1.d[0]
.Ltail:
	/* On entry tmp1 holds a non-zero syndrome and src points 32 bytes
	   past the hunk that produced it (post-indexed load).  */
	/* Count the trailing zeros, by bit reversing...  */
	rbit	tmp1, tmp1
	/* Re-bias source.  */
	sub	src, src, #32
	clz	tmp1, tmp1	/* ... and counting the leading zeros.  */
	/* tmp1 is twice the offset into the fragment.  */
	add	result, src, tmp1, lsr #1
	ret

	.size	strchrnul, . - strchrnul