/*
 * strchrnul - find a character or nul in a string
 *
 * Copyright (c) 2014-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

#include "asmdefs.h"

/* Arguments and results.  */
#define srcin		x0
#define chrin		w1

#define result		x0

/* Locals and temporaries.  */
#define src		x2
#define tmp1		x3
#define wtmp2		w4
#define tmp3		x5

#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_nul1	v3
#define vhas_nul2	v4
#define vhas_chr1	v5
#define vhas_chr2	v6
#define vrepmask	v7
#define vend1		v16

/* Core algorithm.

   For each 32-byte hunk we calculate a 64-bit syndrome value, with
   two bits per byte (LSB is always in bits 0 and 1, for both big
   and little-endian systems).  For each tuple, bit 0 is set iff
   the relevant byte matched the requested character or nul.  Since the
   bits in the syndrome reflect exactly the order in which things occur
   in the original string, a count_trailing_zeros() operation will
   identify exactly which byte is causing the termination.  */

ENTRY (__strchrnul_aarch64)
	PTR_ARG (0)
	/* Magic constant 0x40100401 to allow us to identify which lane
	   matches the termination condition.  */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
	dup	vrepmask.4s, wtmp2
	ands	tmp1, srcin, #31
	b.eq	L(loop)

	/* Input string is not 32-byte aligned.  Rather than forcing
	   the padding bytes to a safe value, we calculate the syndrome
	   for all the bytes, but then mask off those bits of the
	   syndrome that are related to the padding.  */
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	neg	tmp1, tmp1
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	cmhs	vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
	cmhs	vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
	and	vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
	lsl	tmp1, tmp1, #1
	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	mov	tmp3, #~0
	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
	lsr	tmp1, tmp3, tmp1

	mov	tmp3, vend1.d[0]
	bic	tmp1, tmp3, tmp1	// Mask padding bits.
	cbnz	tmp1, L(tail)

	.p2align 4
L(loop):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	cmhs	vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
	cmhs	vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
	orr	vend1.16b, vhas_nul1.16b, vhas_nul2.16b
	umaxp	vend1.16b, vend1.16b, vend1.16b
	mov	tmp1, vend1.d[0]
	cbz	tmp1, L(loop)

	/* Termination condition found.  Now need to establish exactly why
	   we terminated.  */
	and	vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64

	mov	tmp1, vend1.d[0]
L(tail):
	/* Count the trailing zeros, by bit reversing...  */
	rbit	tmp1, tmp1
	/* Re-bias source.  */
	sub	src, src, #32
	clz	tmp1, tmp1	/* ... and counting the leading zeros.  */
	/* tmp1 is twice the offset into the fragment.  */
	add	result, src, tmp1, lsr #1
	ret

END (__strchrnul_aarch64)
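
/* A minimal, scalar C sketch of the syndrome idea described in the "Core
   algorithm" comment above, for illustration only; it does not reproduce the
   NEON data flow.  The name strchrnul_sketch, the CHUNK constant and the use
   of GCC/Clang's __builtin_ctzll are assumptions of the sketch.  Unlike the
   assembly, which only ever reads whole aligned 32-byte hunks, this version
   may read past the terminating nul and across a page boundary, so it is a
   model of the logic rather than a safe replacement.

   #include <stdint.h>

   #define CHUNK 32   // bytes examined per iteration, matching the loop above

   static char *strchrnul_sketch (const char *s, int c)
   {
     const unsigned char chr = (unsigned char) c;
     const unsigned char *p = (const unsigned char *) s;

     for (;;)
       {
         uint64_t syndrome = 0;

         // Two bits per byte: bit 2*i is set iff byte i equals chr or nul.
         // The NEON code derives the same layout from the 0x40100401 mask
         // and two ADDP reductions.
         for (int i = 0; i < CHUNK; i++)
           if (p[i] == chr || p[i] == 0)
             syndrome |= 1ull << (2 * i);

         // The first set bit identifies the first terminating byte.
         if (syndrome != 0)
           return (char *) p + (__builtin_ctzll (syndrome) >> 1);

         p += CHUNK;
       }
   }
*/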
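
/* A sketch of how the 0x40100401 mask and the two ADDP reductions above
   compress 32 byte-wise match flags into the 64-bit syndrome.  Here match[i]
   is 0xff when input byte i equals the target character or nul (the combined
   CMEQ/CMHS result).  The function name is illustrative, and the final
   memcpy assumes a little-endian host; the assembly reads vend1.d[0]
   directly and, as noted above, works for either endianness.

   #include <stdint.h>
   #include <string.h>

   static uint64_t syndrome_from_matches (const uint8_t match[32])
   {
     // 0x40100401 replicated across .4s lanes, viewed as bytes in ascending
     // address order: every byte of the mask is a distinct 2-bit weight.
     static const uint8_t repmask[4] = { 0x01, 0x04, 0x10, 0x40 };
     uint8_t weighted[32], step1[16], step2[8];

     for (int i = 0; i < 32; i++)                 // AND with vrepmask
       weighted[i] = match[i] & repmask[i % 4];

     for (int i = 0; i < 16; i++)                 // ADDP: 256 bits -> 128
       step1[i] = weighted[2 * i] + weighted[2 * i + 1];

     for (int i = 0; i < 8; i++)                  // ADDP: 128 -> 64 (low half)
       step2[i] = step1[2 * i] + step1[2 * i + 1];

     uint64_t syndrome;
     memcpy (&syndrome, step2, sizeof syndrome);  // like reading vend1.d[0]
     return syndrome;                             // bit 2*i set iff match[i]
   }
*/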
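
/* A sketch of the remaining scalar steps, combined into one illustrative
   helper (the name, parameters and the NULL-means-keep-scanning convention
   are assumptions of the sketch).  For the unaligned prologue, m = srcin & 31
   padding bytes precede the real string in the aligned hunk, and the
   NEG/LSL/LSR/BIC sequence clears the low 2*m syndrome bits so a stray match
   in the padding cannot terminate the search.  For the tail, AArch64 has no
   count-trailing-zeros instruction, so RBIT+CLZ is used instead, and since
   the syndrome carries two bits per byte the count is halved to obtain the
   byte offset within the hunk.

   #include <stdint.h>

   static char *first_hit (uint64_t syndrome, const char *hunk_base,
                           unsigned m)   // m = srcin & 31, zero when aligned
   {
     if (m != 0)                                  // prologue only: mask padding
       syndrome &= ~(~0ull >> (64 - 2 * m));      // BIC tmp1, tmp3, tmp1

     if (syndrome == 0)
       return 0;                                  // no hit yet: keep scanning

     // RBIT+CLZ act as a 64-bit count-trailing-zeros; halve it because the
     // syndrome holds two bits per input byte.
     return (char *) hunk_base + (__builtin_ctzll (syndrome) >> 1);
   }
*/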