xref: /freebsd/contrib/arm-optimized-routines/string/aarch64/memcmp.S (revision dd21556857e8d40f66bf5ad54754d9d52669ebf7)
1/* memcmp - compare memory
2 *
3 * Copyright (c) 2013-2022, Arm Limited.
4 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
5 */
6
7/* Assumptions:
8 *
9 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
10 */
11
12#include "asmdefs.h"
13
/* Incoming arguments.  The result is written to w0, the low half of the
   register that holds src1, so src1 must be dead before result is set.  */
14#define src1	x0
15#define src2	x1
16#define limit	x2
17#define result	w0
18
/* Scratch registers for the bytes being compared: data1/data3 are loaded
   from src1 and data2/data4 from src2.  The w-names are the 32-bit views
   used by the short-length paths.  */
19#define data1	x3
20#define data1w	w3
21#define data2	x4
22#define data2w	w4
23#define data3	x5
24#define data3w	w5
25#define data4	x6
26#define data4w	w6
/* tmp shares x6 with data4; it is only used where data4 is no longer live.  */
27#define tmp	x6
/* One-past-the-end pointers (src + limit) for loading the final bytes.  */
28#define src1end	x7
29#define src2end	x8
30
31
/* __memcmp_aarch64: compare the first `limit` bytes at src1 and src2.

   In:   x0 = src1, x1 = src2, x2 = limit (byte count, compared unsigned).
   Out:  w0 = 0 when every compared byte matches; otherwise negative when
	 the first differing byte of src1 is the smaller one (as an unsigned
	 byte) and positive when it is the larger.  Most paths return exactly
	 -1 or 1; the single-byte tail in L(less2) returns the byte
	 difference instead.
   Uses: x3-x8, v0-v5 and the condition flags.  The visible body makes no
	 stack accesses and no calls.

   Dispatch on limit:
     0-15    L(less16): overlapping head/tail loads picked from limit's bits.
     16      the first 16 bytes only.
     17-32   the first 16 bytes, then the last 16 bytes (may overlap).
     33-159  L(loop32): scalar loop, 32 bytes per iteration.
     160+    L(loop_align) and L(loop64): align src2, then compare 64 bytes
	     per iteration with Advanced SIMD.  */
32ENTRY (__memcmp_aarch64)
33	cmp	limit, 16
34	b.lo	L(less16)
35	ldp	data1, data3, [src1]
36	ldp	data2, data4, [src2]
	/* The flags still hold limit vs 16.  When limit == 16 neither ccmp
	   performs its compare: each writes NZCV = 0, which reads as NE, so
	   the branch is always taken and L(return2) produces the final
	   answer.  When limit > 16 the branch is taken only if the first 16
	   bytes differ.  */
37	ccmp	data1, data2, 0, ne
38	ccmp	data3, data4, 0, eq
39	b.ne	L(return2)
40
	/* The first 16 bytes match.  Form the end pointers for the tail load.  */
41	add	src1end, src1, limit
42	add	src2end, src2, limit
43	cmp	limit, 32
44	b.ls	L(last_bytes)
45	cmp	limit, 160
46	b.hs	L(loop_align)
	/* From here on limit holds (src1end - src1) - 32.  */
47	sub	limit, limit, 32
48
49	.p2align 4
	/* Scalar loop.  On entry bytes [src1, src1 + 16) are known equal and
	   limit = (src1end - src1) - 32.  */
50L(loop32):
51	ldp	data1, data3, [src1, 16]
52	ldp	data2, data4, [src2, 16]
53	cmp	data1, data2
54	ccmp	data3, data4, 0, eq
55	b.ne	L(return2)
	/* At most 16 bytes lie beyond offset 32: the tail load covers them.  */
56	cmp	limit, 16
57	b.ls	L(last_bytes)
58
59	ldp	data1, data3, [src1, 32]
60	ldp	data2, data4, [src2, 32]
61	cmp	data1, data2
62	ccmp	data3, data4, 0, eq
63	b.ne	L(return2)
64	add	src1, src1, 32
65	add	src2, src2, 32
	/* Also entered from the SIMD loop with limit = src1end - src1.
	   Continue only while more than 32 bytes remain from src1.  */
66L(last64):
67	subs	limit, limit, 32
68	b.hi	L(loop32)
69
70	/* Compare last 1-16 bytes using unaligned access.  */
	/* The 16-byte load may overlap bytes that were already compared;
	   every path reaching here has found those bytes equal.  */
71L(last_bytes):
72	ldp	data1, data3, [src1end, -16]
73	ldp	data2, data4, [src2end, -16]
	/* Keep the first pair if it differs, otherwise fall back to the
	   second pair (which may be equal as well).  */
74L(return2):
75	cmp	data1, data2
76	csel	data1, data1, data3, ne
77	csel	data2, data2, data4, ne
78
79	/* Compare data bytes and set return value to 0, -1 or 1.  */
	/* On little-endian the byte reversal moves the lowest-addressed byte
	   to the most significant position, so the unsigned register compare
	   orders the data the same way a byte-by-byte compare would.  */
80L(return):
81#ifndef __AARCH64EB__
82	rev	data1, data1
83	rev	data2, data2
84#endif
85	cmp	data1, data2
86	cset	result, ne
87	cneg	result, result, lo
88	ret
89
90	.p2align 4
	/* limit is 0-15.  Bit 3 set means 8-15 bytes: compare the first 8
	   and the last 8 bytes, which overlap when limit < 16.  */
91L(less16):
92	add	src1end, src1, limit
93	add	src2end, src2, limit
94	tbz	limit, 3, L(less8)
95	ldr	data1, [src1]
96	ldr	data2, [src2]
97	ldr	data3, [src1end, -8]
98	ldr	data4, [src2end, -8]
99	b	L(return2)
100
101	.p2align 4
	/* limit is 0-7.  Bit 2 set means 4-7 bytes: compare the first 4 and
	   the last 4 bytes.  The 32-bit loads clear the upper register
	   halves, so the 64-bit compares in L(return2) remain valid.  */
102L(less8):
103	tbz	limit, 2, L(less4)
104	ldr	data1w, [src1]
105	ldr	data2w, [src2]
106	ldr	data3w, [src1end, -4]
107	ldr	data4w, [src2end, -4]
108	b	L(return2)
109
	/* limit is 0-3.  Bit 1 set means 2-3 bytes: compare the first two
	   bytes, then fall through to check a possible odd last byte.  */
110L(less4):
111	tbz	limit, 1, L(less2)
112	ldrh	data1w, [src1]
113	ldrh	data2w, [src2]
114	cmp	data1w, data2w
115	b.ne	L(return)
	/* Everything compared so far matches.  result is w0, the low half of
	   src1 (x0), so the remaining byte is addressed through the end
	   pointers.  With bit 0 clear nothing is left and 0 is returned;
	   otherwise the return value is the difference of the last bytes.  */
116L(less2):
117	mov	result, 0
118	tbz	limit, 0, L(return_zero)
119	ldrb	data1w, [src1end, -1]
120	ldrb	data2w, [src2end, -1]
121	sub	result, data1w, data2w
122L(return_zero):
123	ret
124
	/* limit >= 160 and the first 16 bytes match.  Compare bytes 16-31 so
	   that everything skipped by the alignment step below is covered.  */
125L(loop_align):
126	ldp	data1, data3, [src1, 16]
127	ldp	data2, data4, [src2, 16]
128	cmp	data1, data2
129	ccmp	data3, data4, 0, eq
130	b.ne	L(return2)
131
132	/* Align src2 and adjust src1, src2 and limit.  */
	/* tmp = (src2 & 15) - 16 lies in [-16, -1].  Subtracting it advances
	   src2 by 1-16 bytes to a 16-byte boundary; src1 advances by the
	   same amount and limit shrinks to match.  limit is then biased by
	   64 + 16 so that the subs in the loop stays "hi" only while a
	   further iteration, which reads up to src1 + 80, stays in bounds.  */
133	and	tmp, src2, 15
134	sub	tmp, tmp, 16
135	sub	src2, src2, tmp
136	add	limit, limit, tmp
137	sub	src1, src1, tmp
138	sub	limit, limit, 64 + 16
139
140	.p2align 4
	/* Each iteration XORs the four 16-byte blocks at offsets 16, 32, 48
	   and 64 and folds them with pairwise unsigned max down to 8 bytes
	   in d0.  Each of those bytes is nonzero exactly when the matching
	   8-byte chunk differs.  The last pair of loads also advances src1
	   and src2 by 64.  The flags written by subs are consumed by the
	   ccmp at the end; nothing in between modifies them.  */
141L(loop64):
142	ldr	q0, [src1, 16]
143	ldr	q1, [src2, 16]
144	subs	limit, limit, 64
145	ldr	q2, [src1, 32]
146	ldr	q3, [src2, 32]
147	eor	v0.16b, v0.16b, v1.16b
148	eor	v1.16b, v2.16b, v3.16b
149	ldr	q2, [src1, 48]
150	ldr	q3, [src2, 48]
151	umaxp	v0.16b, v0.16b, v1.16b
152	ldr	q4, [src1, 64]!
153	ldr	q5, [src2, 64]!
154	eor	v1.16b, v2.16b, v3.16b
155	eor	v2.16b, v4.16b, v5.16b
156	umaxp	v1.16b, v1.16b, v2.16b
157	umaxp	v0.16b, v0.16b, v1.16b
158	umaxp	v0.16b, v0.16b, v0.16b
159	fmov	tmp, d0
	/* Loop while limit is still "hi" and no chunk differed.  When limit
	   is not "hi" the ccmp writes NZCV = 0 (NE), which ends the loop.  */
160	ccmp	tmp, 0, 0, hi
161	b.eq	L(loop64)
162
163	/* If equal, process last 1-64 bytes using scalar loop.  */
	/* Undo the bias first so that limit = src1end - src1, which is what
	   L(last64) expects.  */
164	add	limit, limit, 64 + 16
165	cbz	tmp, L(last64)
166
167	/* Determine the 8-byte aligned offset of the first difference.  */
	/* Little-endian case: after rev the byte for the lowest-addressed
	   chunk is the most significant, so clz rounded down to a multiple
	   of 8 is 8 * (chunk index), i.e. the chunk's byte offset within the
	   64-byte window.  The window started at the updated src1 - 48,
	   hence the subtraction.
	   NOTE(review): the extra rev16 on big-endian builds has not been
	   traced here - confirm before relying on this description.  */
168#ifdef __AARCH64EB__
169	rev16	tmp, tmp
170#endif
171	rev	tmp, tmp
172	clz	tmp, tmp
173	bic	tmp, tmp, 7
174	sub	tmp, tmp, 48
175	ldr	data1, [src1, tmp]
176	ldr	data2, [src2, tmp]
177#ifndef __AARCH64EB__
178	rev	data1, data1
179	rev	data2, data2
180#endif
	/* The selected chunks are known to differ, so the result is 1, or -1
	   when src1's chunk orders lower.  src1 (x0) is no longer needed
	   once the loads above have been issued, so w0 can take the result.  */
181	mov	result, 1
182	cmp	data1, data2
183	cneg	result, result, lo
184	ret
185
186END (__memcmp_aarch64)
187