1/* 2 strcpy/stpcpy - copy a string returning pointer to start/end. 3 4 Copyright (c) 2013, 2014, 2015 ARM Ltd. 5 All Rights Reserved. 6 7 Redistribution and use in source and binary forms, with or without 8 modification, are permitted provided that the following conditions are met: 9 * Redistributions of source code must retain the above copyright 10 notice, this list of conditions and the following disclaimer. 11 * Redistributions in binary form must reproduce the above copyright 12 notice, this list of conditions and the following disclaimer in the 13 documentation and/or other materials provided with the distribution. 14 * Neither the name of the company nor the names of its contributors 15 may be used to endorse or promote products derived from this 16 software without specific prior written permission. 17 18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ 29 30/* Assumptions: 31 * 32 * ARMv8-a, AArch64, unaligned accesses, min page size 4k. 33 */ 34 35/* To build as stpcpy, define BUILD_STPCPY before compiling this file. 36 37 To test the page crossing code path more thoroughly, compile with 38 -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower 39 entry path. This option is not intended for production use. */ 40 41/* Arguments and results. */ 42#define dstin x0 43#define srcin x1 44 45/* Locals and temporaries. */ 46#define src x2 47#define dst x3 48#define data1 x4 49#define data1w w4 50#define data2 x5 51#define data2w w5 52#define has_nul1 x6 53#define has_nul2 x7 54#define tmp1 x8 55#define tmp2 x9 56#define tmp3 x10 57#define tmp4 x11 58#define zeroones x12 59#define data1a x13 60#define data2a x14 61#define pos x15 62#define len x16 63#define to_align x17 64 65#ifdef BUILD_STPCPY 66#define STRCPY stpcpy 67#else 68#define STRCPY strcpy 69#endif 70 71 .macro def_fn f p2align=0 72 .text 73 .p2align \p2align 74 .global \f 75 .type \f, %function 76\f: 77 .endm 78 79 /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 80 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and 81 can be done in parallel across the entire word. */ 82 83#define REP8_01 0x0101010101010101 84#define REP8_7f 0x7f7f7f7f7f7f7f7f 85#define REP8_80 0x8080808080808080 86 87 /* AArch64 systems have a minimum page size of 4k. We can do a quick 88 page size check for crossing this boundary on entry and if we 89 do not, then we can short-circuit much of the entry code. We 90 expect early page-crossing strings to be rare (probability of 91 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite 92 predictable, even with random strings. 93 94 We don't bother checking for larger page sizes, the cost of setting 95 up the correct page size is just not worth the extra gain from 96 a small reduction in the cases taking the slow path. Note that 97 we only care about whether the first fetch, which may be 98 misaligned, crosses a page boundary - after that we move to aligned 99 fetches for the remainder of the string. */ 100 101#ifdef STRCPY_TEST_PAGE_CROSS 102 /* Make everything that isn't Qword aligned look like a page cross. */ 103#define MIN_PAGE_P2 4 104#else 105#define MIN_PAGE_P2 12 106#endif 107 108#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2) 109 110def_fn STRCPY p2align=6 111 /* For moderately short strings, the fastest way to do the copy is to 112 calculate the length of the string in the same way as strlen, then 113 essentially do a memcpy of the result. This avoids the need for 114 multiple byte copies and further means that by the time we 115 reach the bulk copy loop we know we can always use DWord 116 accesses. We expect strcpy to rarely be called repeatedly 117 with the same source string, so branch prediction is likely to 118 always be difficult - we mitigate against this by preferring 119 conditional select operations over branches whenever this is 120 feasible. */ 121 and tmp2, srcin, #(MIN_PAGE_SIZE - 1) 122 mov zeroones, #REP8_01 123 and to_align, srcin, #15 124 cmp tmp2, #(MIN_PAGE_SIZE - 16) 125 neg tmp1, to_align 126 /* The first fetch will straddle a (possible) page boundary iff 127 srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte 128 aligned string will never fail the page align check, so will 129 always take the fast path. */ 130 b.gt .Lpage_cross 131 132.Lpage_cross_ok: 133 ldp data1, data2, [srcin] 134#ifdef __AARCH64EB__ 135 /* Because we expect the end to be found within 16 characters 136 (profiling shows this is the most common case), it's worth 137 swapping the bytes now to save having to recalculate the 138 termination syndrome later. We preserve data1 and data2 139 so that we can re-use the values later on. */ 140 rev tmp2, data1 141 sub tmp1, tmp2, zeroones 142 orr tmp2, tmp2, #REP8_7f 143 bics has_nul1, tmp1, tmp2 144 b.ne .Lfp_le8 145 rev tmp4, data2 146 sub tmp3, tmp4, zeroones 147 orr tmp4, tmp4, #REP8_7f 148#else 149 sub tmp1, data1, zeroones 150 orr tmp2, data1, #REP8_7f 151 bics has_nul1, tmp1, tmp2 152 b.ne .Lfp_le8 153 sub tmp3, data2, zeroones 154 orr tmp4, data2, #REP8_7f 155#endif 156 bics has_nul2, tmp3, tmp4 157 b.eq .Lbulk_entry 158 159 /* The string is short (<=16 bytes). We don't know exactly how 160 short though, yet. Work out the exact length so that we can 161 quickly select the optimal copy strategy. */ 162.Lfp_gt8: 163 rev has_nul2, has_nul2 164 clz pos, has_nul2 165 mov tmp2, #56 166 add dst, dstin, pos, lsr #3 /* Bits to bytes. */ 167 sub pos, tmp2, pos 168#ifdef __AARCH64EB__ 169 lsr data2, data2, pos 170#else 171 lsl data2, data2, pos 172#endif 173 str data2, [dst, #1] 174 str data1, [dstin] 175#ifdef BUILD_STPCPY 176 add dstin, dst, #8 177#endif 178 ret 179 180.Lfp_le8: 181 rev has_nul1, has_nul1 182 clz pos, has_nul1 183 add dst, dstin, pos, lsr #3 /* Bits to bytes. */ 184 subs tmp2, pos, #24 /* Pos in bits. */ 185 b.lt .Lfp_lt4 186#ifdef __AARCH64EB__ 187 mov tmp2, #56 188 sub pos, tmp2, pos 189 lsr data2, data1, pos 190 lsr data1, data1, #32 191#else 192 lsr data2, data1, tmp2 193#endif 194 /* 4->7 bytes to copy. */ 195 str data2w, [dst, #-3] 196 str data1w, [dstin] 197#ifdef BUILD_STPCPY 198 mov dstin, dst 199#endif 200 ret 201.Lfp_lt4: 202 cbz pos, .Lfp_lt2 203 /* 2->3 bytes to copy. */ 204#ifdef __AARCH64EB__ 205 lsr data1, data1, #48 206#endif 207 strh data1w, [dstin] 208 /* Fall-through, one byte (max) to go. */ 209.Lfp_lt2: 210 /* Null-terminated string. Last character must be zero! */ 211 strb wzr, [dst] 212#ifdef BUILD_STPCPY 213 mov dstin, dst 214#endif 215 ret 216 217 .p2align 6 218 /* Aligning here ensures that the entry code and main loop all lies 219 within one 64-byte cache line. */ 220.Lbulk_entry: 221 sub to_align, to_align, #16 222 stp data1, data2, [dstin] 223 sub src, srcin, to_align 224 sub dst, dstin, to_align 225 b .Lentry_no_page_cross 226 227 /* The inner loop deals with two Dwords at a time. This has a 228 slightly higher start-up cost, but we should win quite quickly, 229 especially on cores with a high number of issue slots per 230 cycle, as we get much better parallelism out of the operations. */ 231.Lmain_loop: 232 stp data1, data2, [dst], #16 233.Lentry_no_page_cross: 234 ldp data1, data2, [src], #16 235 sub tmp1, data1, zeroones 236 orr tmp2, data1, #REP8_7f 237 sub tmp3, data2, zeroones 238 orr tmp4, data2, #REP8_7f 239 bic has_nul1, tmp1, tmp2 240 bics has_nul2, tmp3, tmp4 241 ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ 242 b.eq .Lmain_loop 243 244 /* Since we know we are copying at least 16 bytes, the fastest way 245 to deal with the tail is to determine the location of the 246 trailing NUL, then (re)copy the 16 bytes leading up to that. */ 247 cmp has_nul1, #0 248#ifdef __AARCH64EB__ 249 /* For big-endian, carry propagation (if the final byte in the 250 string is 0x01) means we cannot use has_nul directly. The 251 easiest way to get the correct byte is to byte-swap the data 252 and calculate the syndrome a second time. */ 253 csel data1, data1, data2, ne 254 rev data1, data1 255 sub tmp1, data1, zeroones 256 orr tmp2, data1, #REP8_7f 257 bic has_nul1, tmp1, tmp2 258#else 259 csel has_nul1, has_nul1, has_nul2, ne 260#endif 261 rev has_nul1, has_nul1 262 clz pos, has_nul1 263 add tmp1, pos, #72 264 add pos, pos, #8 265 csel pos, pos, tmp1, ne 266 add src, src, pos, lsr #3 267 add dst, dst, pos, lsr #3 268 ldp data1, data2, [src, #-32] 269 stp data1, data2, [dst, #-16] 270#ifdef BUILD_STPCPY 271 sub dstin, dst, #1 272#endif 273 ret 274 275.Lpage_cross: 276 bic src, srcin, #15 277 /* Start by loading two words at [srcin & ~15], then forcing the 278 bytes that precede srcin to 0xff. This means they never look 279 like termination bytes. */ 280 ldp data1, data2, [src] 281 lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ 282 tst to_align, #7 283 csetm tmp2, ne 284#ifdef __AARCH64EB__ 285 lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ 286#else 287 lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ 288#endif 289 orr data1, data1, tmp2 290 orr data2a, data2, tmp2 291 cmp to_align, #8 292 csinv data1, data1, xzr, lt 293 csel data2, data2, data2a, lt 294 sub tmp1, data1, zeroones 295 orr tmp2, data1, #REP8_7f 296 sub tmp3, data2, zeroones 297 orr tmp4, data2, #REP8_7f 298 bic has_nul1, tmp1, tmp2 299 bics has_nul2, tmp3, tmp4 300 ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ 301 b.eq .Lpage_cross_ok 302 /* We now need to make data1 and data2 look like they've been 303 loaded directly from srcin. Do a rotate on the 128-bit value. */ 304 lsl tmp1, to_align, #3 /* Bytes->bits. */ 305 neg tmp2, to_align, lsl #3 306#ifdef __AARCH64EB__ 307 lsl data1a, data1, tmp1 308 lsr tmp4, data2, tmp2 309 lsl data2, data2, tmp1 310 orr tmp4, tmp4, data1a 311 cmp to_align, #8 312 csel data1, tmp4, data2, lt 313 rev tmp2, data1 314 rev tmp4, data2 315 sub tmp1, tmp2, zeroones 316 orr tmp2, tmp2, #REP8_7f 317 sub tmp3, tmp4, zeroones 318 orr tmp4, tmp4, #REP8_7f 319#else 320 lsr data1a, data1, tmp1 321 lsl tmp4, data2, tmp2 322 lsr data2, data2, tmp1 323 orr tmp4, tmp4, data1a 324 cmp to_align, #8 325 csel data1, tmp4, data2, lt 326 sub tmp1, data1, zeroones 327 orr tmp2, data1, #REP8_7f 328 sub tmp3, data2, zeroones 329 orr tmp4, data2, #REP8_7f 330#endif 331 bic has_nul1, tmp1, tmp2 332 cbnz has_nul1, .Lfp_le8 333 bic has_nul2, tmp3, tmp4 334 b .Lfp_gt8 335 336 .size STRCPY, . - STRCPY 337