/* memcmp - compare memory
 *
 * Copyright (c) 2013-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 */

#include "asmdefs.h"

#define src1		x0
#define src2		x1
#define limit		x2
#define result		w0

#define data1		x3
#define data1w		w3
#define data2		x4
#define data2w		w4
#define data3		x5
#define data3w		w5
#define data4		x6
#define data4w		w6
#define tmp		x6
#define src1end		x7
#define src2end		x8


ENTRY (__memcmp_aarch64)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	cmp	limit, 16
	b.lo	L(less16)
	ldp	data1, data3, [src1]
	ldp	data2, data4, [src2]
	ccmp	data1, data2, 0, ne
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)

	add	src1end, src1, limit
	add	src2end, src2, limit
	cmp	limit, 32
	b.ls	L(last_bytes)
	cmp	limit, 160
	b.hs	L(loop_align)
	sub	limit, limit, 32

	.p2align 4
L(loop32):
	ldp	data1, data3, [src1, 16]
	ldp	data2, data4, [src2, 16]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)
	cmp	limit, 16
	b.ls	L(last_bytes)

	ldp	data1, data3, [src1, 32]
	ldp	data2, data4, [src2, 32]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)
	add	src1, src1, 32
	add	src2, src2, 32
L(last64):
	subs	limit, limit, 32
	b.hi	L(loop32)

	/* Compare last 1-16 bytes using unaligned access. */
L(last_bytes):
	ldp	data1, data3, [src1end, -16]
	ldp	data2, data4, [src2end, -16]
L(return2):
	cmp	data1, data2
	csel	data1, data1, data3, ne
	csel	data2, data2, data4, ne

	/* Compare data bytes and set return value to 0, -1 or 1. */
L(return):
#ifndef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	cmp	data1, data2
	cset	result, ne
	cneg	result, result, lo
	ret

	.p2align 4
L(less16):
	add	src1end, src1, limit
	add	src2end, src2, limit
	tbz	limit, 3, L(less8)
	ldr	data1, [src1]
	ldr	data2, [src2]
	ldr	data3, [src1end, -8]
	ldr	data4, [src2end, -8]
	b	L(return2)

	.p2align 4
L(less8):
	tbz	limit, 2, L(less4)
	ldr	data1w, [src1]
	ldr	data2w, [src2]
	ldr	data3w, [src1end, -4]
	ldr	data4w, [src2end, -4]
	b	L(return2)

L(less4):
	tbz	limit, 1, L(less2)
	ldrh	data1w, [src1]
	ldrh	data2w, [src2]
	cmp	data1w, data2w
	b.ne	L(return)
L(less2):
	mov	result, 0
	tbz	limit, 0, L(return_zero)
	ldrb	data1w, [src1end, -1]
	ldrb	data2w, [src2end, -1]
	sub	result, data1w, data2w
L(return_zero):
	ret

L(loop_align):
	ldp	data1, data3, [src1, 16]
	ldp	data2, data4, [src2, 16]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)

	/* Align src2 and adjust src1, src2 and limit. */
	and	tmp, src2, 15
	sub	tmp, tmp, 16
	sub	src2, src2, tmp
	add	limit, limit, tmp
	sub	src1, src1, tmp
	sub	limit, limit, 64 + 16

	.p2align 4
L(loop64):
	ldr	q0, [src1, 16]
	ldr	q1, [src2, 16]
	subs	limit, limit, 64
	ldr	q2, [src1, 32]
	ldr	q3, [src2, 32]
	eor	v0.16b, v0.16b, v1.16b
	eor	v1.16b, v2.16b, v3.16b
	ldr	q2, [src1, 48]
	ldr	q3, [src2, 48]
	umaxp	v0.16b, v0.16b, v1.16b
	ldr	q4, [src1, 64]!
	ldr	q5, [src2, 64]!
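	/* Combine the XORed differences of all four 16-byte chunks.  After
	   the pairwise umaxp reductions, each byte of tmp is non-zero if
	   and only if the corresponding aligned 8 bytes of the 64 bytes
	   just compared differ.  */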
	eor	v1.16b, v2.16b, v3.16b
	eor	v2.16b, v4.16b, v5.16b
	umaxp	v1.16b, v1.16b, v2.16b
	umaxp	v0.16b, v0.16b, v1.16b
	umaxp	v0.16b, v0.16b, v0.16b
	fmov	tmp, d0
	ccmp	tmp, 0, 0, hi
	b.eq	L(loop64)

	/* If equal, process last 1-64 bytes using scalar loop. */
	add	limit, limit, 64 + 16
	cbz	tmp, L(last64)

	/* Determine the 8-byte aligned offset of the first difference. */
#ifdef __AARCH64EB__
	rev16	tmp, tmp
#endif
	rev	tmp, tmp
	clz	tmp, tmp
	bic	tmp, tmp, 7
	sub	tmp, tmp, 48
	ldr	data1, [src1, tmp]
	ldr	data2, [src2, tmp]
#ifndef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	mov	result, 1
	cmp	data1, data2
	cneg	result, result, lo
	ret

END (__memcmp_aarch64)