/* memcmp - compare memory
 *
 * Copyright (c) 2013-2021, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Extracted from:
 *   /freebsd/contrib/arm-optimized-routines/string/aarch64/memcmp.S
 *   (revision 3dd5524264095ed8612c28908e13f80668eff2f9)
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 */

#include "../asmdefs.h"

/* Register aliases.

   x0-x2 carry the three arguments.  The return value lives in w0, which is
   the low half of src1: once result has been written, src1 must no longer
   be used as an address (the code uses src1end or finishes its loads first).  */
#define src1	x0
#define src2	x1
#define limit	x2
#define result	w0

/* Scratch registers for the data being compared.  data1/data3 come from
   src1, data2/data4 from src2.  The w-forms are the 32-bit views used by
   the loads that are narrower than 8 bytes.  */
#define data1	x3
#define data1w	w3
#define data2	x4
#define data2w	w4
#define data3	x5
#define data3w	w5
#define data4	x6
#define data4w	w6
/* tmp shares x6 with data4; it is only used at points where data4 is dead.  */
#define tmp	x6
/* One-past-the-end pointers (src + limit) of the two buffers.  */
#define src1end	x7
#define src2end	x8


/* __memcmp_aarch64 - compare the first limit bytes of two buffers.

   In:       x0 = src1, x1 = src2, x2 = limit (byte count).
   Out:      w0 = 0 if all limit bytes are equal; negative if, at the first
	     differing position, the byte from src1 is lower (as unsigned)
	     than the byte from src2; positive otherwise.  Most paths return
	     exactly -1 or 1; the trailing single-byte compare in L(less2)
	     returns the difference of the two bytes instead.
   Clobbers: x0-x8, NZCV, and v0-v5 when limit >= 160.  The stack is not used.

   Strategy by size:
     limit <  16   overlapping 8/4-byte loads, or 2-byte + 1-byte loads.
     limit == 16   a single pair of 16-byte loads.
     17 .. 32      first 16 bytes, then the (possibly overlapping) last 16.
     33 .. 159     scalar loop, 32 bytes per iteration, then the last 16.
     >= 160        src2 is rounded up to a 16-byte boundary, then a vector
		   loop handles 64 bytes per iteration; the remainder goes
		   through the scalar loop.

   Only bytes inside [src, src + limit) are loaded; tails are handled by
   loads that end exactly at src + limit and may overlap bytes that were
   already found equal.  */
ENTRY (__memcmp_aarch64)
	/* Argument-marking macros from asmdefs.h (not visible in this file);
	   presumably they normalise pointer/size arguments - TODO confirm.  */
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	cmp	limit, 16
	b.lo	L(less16)
	ldp	data1, data3, [src1]
	ldp	data2, data4, [src2]
	/* The flags still come from "cmp limit, 16".  When limit == 16 the
	   first ccmp does not compare but forces NZCV = 0 (ne); the second
	   ccmp then does the same, so the branch is always taken and
	   L(return2) produces the final result for a 16-byte compare.
	   When limit > 16 the pair evaluates
	   "data1 != data2 || data3 != data4".  */
	ccmp	data1, data2, 0, ne
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)

	/* The first 16 bytes are equal and limit > 16.  */
	add	src1end, src1, limit
	add	src2end, src2, limit
	cmp	limit, 32
	b.ls	L(last_bytes)
	cmp	limit, 160
	b.hs	L(loop_align)
	/* From here on limit counts the bytes beyond src + 32.  */
	sub	limit, limit, 32

	/* Scalar loop, unrolled to 32 bytes per iteration.  At L(loop32) all
	   bytes below src + 16 are known equal and limit is the number of
	   bytes beyond src + 32.  */
	.p2align 4
L(loop32):
	ldp	data1, data3, [src1, 16]
	ldp	data2, data4, [src2, 16]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)
	/* At most 16 bytes remain past src + 32: finish with the tail.  */
	cmp	limit, 16
	b.ls	L(last_bytes)

	ldp	data1, data3, [src1, 32]
	ldp	data2, data4, [src2, 32]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)
	add	src1, src1, 32
	add	src2, src2, 32
	/* Also entered from the vector loop.  Here limit = src1end - src1 and
	   the bytes below src + 16 are known equal, so limit <= 32 means no
	   more than 16 bytes are still unchecked.  */
L(last64):
	subs	limit, limit, 32
	b.hi	L(loop32)

	/* Compare last 1-16 bytes using unaligned access.  */
	/* The loads end exactly at src + limit and may overlap bytes that
	   have already compared equal, which cannot change the outcome.  */
L(last_bytes):
	ldp	data1, data3, [src1end, -16]
	ldp	data2, data4, [src2end, -16]
	/* Select the first pair that differs: (data1, data2) if they differ,
	   otherwise (data3, data4).  */
L(return2):
	cmp	data1, data2
	csel	data1, data1, data3, ne
	csel	data2, data2, data4, ne

	/* Compare data bytes and set return value to 0, -1 or 1.  */
	/* On little-endian the byte reversal moves the byte from the lowest
	   address into the most significant position, so that one unsigned
	   register compare orders the data the way a byte-by-byte compare
	   would.  cset gives 0/1 for equal/different; cneg flips the 1 to -1
	   when data1 is lower.  */
L(return):
#ifndef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	cmp	data1, data2
	cset	result, ne
	cneg	result, result, lo
	ret

	/* limit < 16.  The bits of limit select the load width; each case
	   loads the first and the last N bytes, which overlap when limit is
	   not exactly 2 * N.  */
	.p2align 4
L(less16):
	add	src1end, src1, limit
	add	src2end, src2, limit
	tbz	limit, 3, L(less8)
	/* 8 <= limit <= 15: first 8 and last 8 bytes.  */
	ldr	data1, [src1]
	ldr	data2, [src2]
	ldr	data3, [src1end, -8]
	ldr	data4, [src2end, -8]
	b	L(return2)

	.p2align 4
L(less8):
	tbz	limit, 2, L(less4)
	/* 4 <= limit <= 7: first 4 and last 4 bytes.  Writing the w register
	   zeroes the upper half, so the 64-bit compares in L(return2) and
	   L(return) remain valid.  */
	ldr	data1w, [src1]
	ldr	data2w, [src2]
	ldr	data3w, [src1end, -4]
	ldr	data4w, [src2end, -4]
	b	L(return2)

L(less4):
	tbz	limit, 1, L(less2)
	/* 2 <= limit <= 3: compare the first two bytes.  */
	ldrh	data1w, [src1]
	ldrh	data2w, [src2]
	cmp	data1w, data2w
	b.ne	L(return)
	/* limit is 0 or 1, or it is 2 or 3 with equal leading halfwords.  An
	   even limit is done (return 0); an odd limit has exactly one byte
	   left, the last one.  Writing result overwrites the low half of
	   src1, which is why the byte is addressed through src1end.  This
	   path returns the byte difference rather than -1/1.  */
L(less2):
	mov	result, 0
	tbz	limit, 0, L(return_zero)
	ldrb	data1w, [src1end, -1]
	ldrb	data2w, [src2end, -1]
	sub	result, data1w, data2w
L(return_zero):
	ret

	/* limit >= 160.  Bytes 0-15 are already known equal; check bytes
	   16-31 so that the pointers can be advanced by up to 16 bytes
	   without skipping unchecked data.  */
L(loop_align):
	ldp	data1, data3, [src1, 16]
	ldp	data2, data4, [src2, 16]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)

	/* Align src2 and adjust src1, src2 and limit.  */
	/* tmp = (src2 & 15) - 16 is in -16 .. -1, so subtracting it advances
	   both pointers by 1-16 bytes and leaves src2 on a 16-byte boundary;
	   src1 moves by the same amount and may stay unaligned.  After the
	   final subtraction limit is the number of bytes beyond src + 80,
	   i.e. beyond the 64-byte window [src + 16, src + 80) that the first
	   loop iteration reads.  */
	and	tmp, src2, 15
	sub	tmp, tmp, 16
	sub	src2, src2, tmp
	add	limit, limit, tmp
	sub	src1, src1, tmp
	sub	limit, limit, 64 + 16

	/* Vector loop, 64 bytes per iteration.  Each 16-byte pair is XORed
	   (non-zero bytes mark differences) and the four results are folded
	   with pairwise unsigned maximum until the low 64 bits of v0 hold
	   eight bytes, each summarising one 8-byte chunk of the window
	   (non-zero iff that chunk differs).  The last pair of loads uses
	   pre-index writeback to advance both pointers by 64.  */
	.p2align 4
L(loop64):
	ldr	q0, [src1, 16]
	ldr	q1, [src2, 16]
	subs	limit, limit, 64
	ldr	q2, [src1, 32]
	ldr	q3, [src2, 32]
	eor	v0.16b, v0.16b, v1.16b
	eor	v1.16b, v2.16b, v3.16b
	ldr	q2, [src1, 48]
	ldr	q3, [src2, 48]
	umaxp	v0.16b, v0.16b, v1.16b
	ldr	q4, [src1, 64]!
	ldr	q5, [src2, 64]!
	eor	v1.16b, v2.16b, v3.16b
	eor	v2.16b, v4.16b, v5.16b
	umaxp	v1.16b, v1.16b, v2.16b
	umaxp	v0.16b, v0.16b, v1.16b
	umaxp	v0.16b, v0.16b, v0.16b
	fmov	tmp, d0
	/* Continue only if the subs above left more than 64 bytes before the
	   subtraction (hi) and no difference was seen (tmp == 0).  If the
	   count ran out, ccmp forces "ne" and the loop exits.  */
	ccmp	tmp, 0, 0, hi
	b.eq	L(loop64)

	/* If equal, process last 1-64 bytes using scalar loop.  */
	/* Undo the bias applied before the loop so that limit is again
	   src1end - src1, as L(last64) expects.  */
	add	limit, limit, 64 + 16
	cbz	tmp, L(last64)

	/* Determine the 8-byte aligned offset of the first difference.  */
	/* On big-endian the two chunk summaries of each 16-byte vector come
	   out in swapped order; rev16 puts them back in address order.  The
	   rev then places the summary of the lowest-addressed chunk in the
	   most significant byte, so clz rounded down to a multiple of 8 is
	   8 * (index of the first differing chunk), i.e. its byte offset in
	   the window.  The window started at src - 48 because the pointers
	   have already been advanced by 64, hence the - 48.  */
#ifdef __AARCH64EB__
	rev16	tmp, tmp
#endif
	rev	tmp, tmp
	clz	tmp, tmp
	bic	tmp, tmp, 7
	sub	tmp, tmp, 48
	ldr	data1, [src1, tmp]
	ldr	data2, [src2, tmp]
#ifndef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	/* The chunk is known to differ, so the result is 1 or -1.  result is
	   written only after both loads because it overwrites src1.  */
	mov	result, 1
	cmp	data1, data2
	cneg	result, result, lo
	ret

END (__memcmp_aarch64)