/* memcmp - compare memory

   Copyright (c) 2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 */

	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

/* Parameters and result.  */
#define src1		x0
#define src2		x1
#define limit		x2
#define result		x0

/* Internal variables.  */
#define data1		x3
#define data1w		w3
#define data2		x4
#define data2w		w4
#define has_nul		x5
#define diff		x6
#define endloop		x7
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define pos		x11
#define limit_wd	x12
#define mask		x13

def_fn memcmp p2align=6
	cbz	limit, .Lret0
	eor	tmp1, src1, src2
	tst	tmp1, #7
	b.ne	.Lmisaligned8
	ands	tmp1, src1, #7
	b.ne	.Lmutual_align
	add	limit_wd, limit, #7
	lsr	limit_wd, limit_wd, #3
	/* Start of performance-critical section  -- one 64B cache line.  */
.Lloop_aligned:
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
.Lstart_realigned:
	subs	limit_wd, limit_wd, #1
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	csinv	endloop, diff, xzr, ne	/* Last Dword or differences.  */
	cbz	endloop, .Lloop_aligned
	/* End of performance-critical section  -- one 64B cache line.  */

	/* Not reached the limit, must have found a diff.  */
	cbnz	limit_wd, .Lnot_limit

	/* Limit % 8 == 0 => all bytes significant.  */
	ands	limit, limit, #7
	b.eq	.Lnot_limit

	lsl	limit, limit, #3	/* Bytes -> bits.  */
	mov	mask, #~0
#ifdef __AARCH64EB__
	lsr	mask, mask, limit
#else
	lsl	mask, mask, limit
#endif
	bic	data1, data1, mask
	bic	data2, data2, mask

	orr	diff, diff, mask
.Lnot_limit:

#ifndef __AARCH64EB__
	rev	diff, diff
	rev	data1, data1
	rev	data2, data2
#endif
	/* The MS-non-zero bit of DIFF marks either the first bit
	   that is different, or the end of the significant data.
	   Shifting left now will bring the critical information into the
	   top bits.  */
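	/* For illustration (little-endian case, hypothetical data):
	   comparing "ABC..." with "AbC...", the loaded words differ in
	   byte 1 ('B' = 0x42 vs 'b' = 0x62).  The REVs above move memory
	   byte 0 to the top of each register, so the first differing
	   memory byte becomes the most significant non-zero byte of DIFF.
	   CLZ counts the identical leading bits, the LSLs discard them,
	   and the LSR #56 below extracts 8-bit values that order the same
	   way as the first differing bytes (here 0x42 < 0x62, giving the
	   required negative result).  */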
	clz	pos, diff
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret

.Lmutual_align:
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary.  Round down the addresses and then mask off
	   the bytes that precede the start point.  */
	bic	src1, src1, #7
	bic	src2, src2, #7
	add	limit, limit, tmp1	/* Adjust the limit for the extra.  */
	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
	ldr	data1, [src1], #8
	neg	tmp1, tmp1		/* Bits to alignment -64.  */
	ldr	data2, [src2], #8
	mov	tmp2, #~0
#ifdef __AARCH64EB__
	/* Big-endian.  Early bytes are at MSB.  */
	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
#else
	/* Little-endian.  Early bytes are at LSB.  */
	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
#endif
	add	limit_wd, limit, #7
	orr	data1, data1, tmp2
	orr	data2, data2, tmp2
	lsr	limit_wd, limit_wd, #3
	b	.Lstart_realigned

.Lret0:
	mov	result, #0
	ret

	.p2align 6
.Lmisaligned8:
	sub	limit, limit, #1
1:
	/* Perhaps we can do better than this.  */
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	subs	limit, limit, #1
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.eq	1b
	sub	result, data1, data2
	ret
	.size	memcmp, . - memcmp
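
/* For illustration only: a minimal C harness sketch, not part of this
   file.  It assumes the object assembled from this source is linked in
   so that the memcmp symbol above is the one called, and that the
   compiler does not replace the calls with a builtin (e.g. -fno-builtin).
   It checks the sign convention of the result and the equal-buffers case:

	#include <stdio.h>
	#include <string.h>

	int main (void)
	{
	  char a[16] = "hello, world!!";
	  char b[16] = "hello, World!!";
	  printf ("%d\n", memcmp (a, b, 16) > 0);   // 'w' > 'W': prints 1
	  printf ("%d\n", memcmp (a, a, 16) == 0);  // equal: prints 1
	  return 0;
	}
*/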