/* memcmp - compare memory
 *
 * Copyright (c) 2013-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 */

#include "asmdefs.h"

#define src1	x0
#define src2	x1
#define limit	x2
#define result	w0

#define data1	x3
#define data1w	w3
#define data2	x4
#define data2w	w4
#define data3	x5
#define data3w	w5
#define data4	x6
#define data4w	w6
#define tmp	x6
#define src1end	x7
#define src2end	x8


ENTRY (__memcmp_aarch64)
	cmp	limit, 16
	b.lo	L(less16)
	ldp	data1, data3, [src1]
	ldp	data2, data4, [src2]
	ccmp	data1, data2, 0, ne
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)

	add	src1end, src1, limit
	add	src2end, src2, limit
	cmp	limit, 32
	b.ls	L(last_bytes)
	cmp	limit, 160
	b.hs	L(loop_align)
	sub	limit, limit, 32

	.p2align 4
L(loop32):
	ldp	data1, data3, [src1, 16]
	ldp	data2, data4, [src2, 16]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)
	cmp	limit, 16
	b.ls	L(last_bytes)

	ldp	data1, data3, [src1, 32]
	ldp	data2, data4, [src2, 32]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)
	add	src1, src1, 32
	add	src2, src2, 32
L(last64):
	subs	limit, limit, 32
	b.hi	L(loop32)

	/* Compare last 1-16 bytes using unaligned access.  */
L(last_bytes):
	ldp	data1, data3, [src1end, -16]
	ldp	data2, data4, [src2end, -16]
L(return2):
	cmp	data1, data2
	csel	data1, data1, data3, ne
	csel	data2, data2, data4, ne

	/* Compare data bytes and set return value to 0, -1 or 1.  */
L(return):
#ifndef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	cmp	data1, data2
	cset	result, ne
	cneg	result, result, lo
	ret

	.p2align 4
L(less16):
	add	src1end, src1, limit
	add	src2end, src2, limit
	tbz	limit, 3, L(less8)
	ldr	data1, [src1]
	ldr	data2, [src2]
	ldr	data3, [src1end, -8]
	ldr	data4, [src2end, -8]
	b	L(return2)

	.p2align 4
L(less8):
	tbz	limit, 2, L(less4)
	ldr	data1w, [src1]
	ldr	data2w, [src2]
	ldr	data3w, [src1end, -4]
	ldr	data4w, [src2end, -4]
	b	L(return2)

L(less4):
	tbz	limit, 1, L(less2)
	ldrh	data1w, [src1]
	ldrh	data2w, [src2]
	cmp	data1w, data2w
	b.ne	L(return)
L(less2):
	mov	result, 0
	tbz	limit, 0, L(return_zero)
	ldrb	data1w, [src1end, -1]
	ldrb	data2w, [src2end, -1]
	sub	result, data1w, data2w
L(return_zero):
	ret

L(loop_align):
	ldp	data1, data3, [src1, 16]
	ldp	data2, data4, [src2, 16]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)

	/* Align src2 and adjust src1, src2 and limit.  */
	and	tmp, src2, 15
	sub	tmp, tmp, 16
	sub	src2, src2, tmp
	add	limit, limit, tmp
	sub	src1, src1, tmp
	sub	limit, limit, 64 + 16

	.p2align 4
L(loop64):
	ldr	q0, [src1, 16]
	ldr	q1, [src2, 16]
	subs	limit, limit, 64
	ldr	q2, [src1, 32]
	ldr	q3, [src2, 32]
	eor	v0.16b, v0.16b, v1.16b
	eor	v1.16b, v2.16b, v3.16b
	ldr	q2, [src1, 48]
	ldr	q3, [src2, 48]
	umaxp	v0.16b, v0.16b, v1.16b
	ldr	q4, [src1, 64]!
	ldr	q5, [src2, 64]!
	eor	v1.16b, v2.16b, v3.16b
	eor	v2.16b, v4.16b, v5.16b
	umaxp	v1.16b, v1.16b, v2.16b
	umaxp	v0.16b, v0.16b, v1.16b
	umaxp	v0.16b, v0.16b, v0.16b
	fmov	tmp, d0
	ccmp	tmp, 0, 0, hi
	b.eq	L(loop64)

	/* If equal, process last 1-64 bytes using scalar loop.  */
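	/* Each byte of tmp is a horizontal max of 8 XOR-ed bytes, so it is
	   non-zero iff the corresponding 8-byte chunk of the last 64-byte
	   block differed.  Undo the 64 + 16 bias applied to limit before
	   entering loop64.  */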
	add	limit, limit, 64 + 16
	cbz	tmp, L(last64)

	/* Determine the 8-byte aligned offset of the first difference.  */
#ifdef __AARCH64EB__
	rev16	tmp, tmp
#endif
	rev	tmp, tmp
	clz	tmp, tmp
	bic	tmp, tmp, 7
	sub	tmp, tmp, 48
	ldr	data1, [src1, tmp]
	ldr	data2, [src2, tmp]
#ifndef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	mov	result, 1
	cmp	data1, data2
	cneg	result, result, lo
	ret

END (__memcmp_aarch64)
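/* Callers see the standard memcmp contract (a sketch, assuming the usual
   optimized-routines export of this entry point):

       int __memcmp_aarch64 (const void *s1, const void *s2, size_t n);

   The return value is 0 when the first n bytes match, negative when the
   first differing byte of s1 is lower, and positive otherwise.  */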