xref: /linux/tools/testing/selftests/powerpc/stringloops/memcmp_64.S (revision c8bfe3fad4f86a029da7157bae9699c816f0c309)
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * Author: Anton Blanchard <anton@au.ibm.com>
4 * Copyright 2015 IBM Corporation.
5 */
6#include <linux/export.h>
7#include <asm/ppc_asm.h>
8#include <asm/ppc-opcode.h>
9
/*
 * Index registers for the indexed loads below. They only hold the byte
 * offsets 8/16/24 after an explicit "li offN,N"; r6 in particular is also
 * used as a plain scratch register in several places of this file.
 */
10#define off8	r6
11#define off16	r7
12#define off24	r8
13
/*
 * Data registers for the doublewords being compared. rD..rH alias r27..r31;
 * the 32-bytes-per-iteration loop stores those below r1 before using them
 * and reloads them on its exit paths (.Ltail / .Lout).
 */
14#define rA	r9
15#define rB	r10
16#define rC	r11
17#define rD	r27
18#define rE	r28
19#define rF	r29
20#define rG	r30
21#define rH	r31
22
/*
 * Endian-dependent load and permute helpers.
 *
 * Little-endian builds use the byte-reversed indexed loads (lhbrx / lwbrx /
 * ldbrx), big-endian builds the plain indexed loads. Presumably this puts
 * the lowest-addressed byte into the most significant position on both, so
 * that the unsigned cmpld of two loaded doublewords orders them like a
 * bytewise compare; the +1/-1 result paths below rely on that.
 * LH and LW are not used in the visible part of this file.
 *
 * LVS selects lvsr (LE) or lvsl (BE) to build the permute control vector,
 * and VPERM swaps its two source vectors on LE.
 */
23#ifdef __LITTLE_ENDIAN__
24#define LH	lhbrx
25#define LW	lwbrx
26#define LD	ldbrx
27#define LVS	lvsr
28#define VPERM(_VRT,_VRA,_VRB,_VRC) \
29	vperm _VRT,_VRB,_VRA,_VRC
30#else
31#define LH	lhzx
32#define LW	lwzx
33#define LD	ldx
34#define LVS	lvsl
35#define VPERM(_VRT,_VRA,_VRB,_VRC) \
36	vperm _VRT,_VRA,_VRB,_VRC
37#endif
38
/* Minimum remaining length (bytes) at which the VMX compare paths are tried. */
39#define VMX_THRESH 4096
/*
 * ENTER_VMX_OPS: call enter_vmx_ops() while preserving r3, r4, r5 and LR.
 *
 * r3-r5 are stored into the R31/R30/R29 slots of the frame that the stdu is
 * about to create, and LR is stored at 16(r1) of the current frame. After the
 * call everything is reloaded, the frame is popped and LR is restored.
 *
 * Result: cr1 = return value of enter_vmx_ops() compared with 0. The users
 * below branch to their non-VMX code on "beq cr1".
 * Clobbers r0 and cr1 directly, plus whatever the callee clobbers.
 * NOTE(review): STACKFRAMESIZE, STK_REG and CFUNC are not defined in this
 * file; presumably they come from the included asm headers.
 */
40#define ENTER_VMX_OPS	\
41	mflr    r0;	\
42	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
43	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
44	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
45	std     r0,16(r1); \
46	stdu    r1,-STACKFRAMESIZE(r1); \
47	bl      CFUNC(enter_vmx_ops); \
48	cmpwi   cr1,r3,0; \
49	ld      r0,STACKFRAMESIZE+16(r1); \
50	ld      r3,STK_REG(R31)(r1); \
51	ld      r4,STK_REG(R30)(r1); \
52	ld      r5,STK_REG(R29)(r1); \
53	addi	r1,r1,STACKFRAMESIZE; \
54	mtlr    r0
55
/*
 * EXIT_VMX_OPS: call exit_vmx_ops() while preserving r3, r4, r5 and LR.
 *
 * Same save/call/restore sequence as ENTER_VMX_OPS, except that the return
 * value of the callee is ignored and no CR field is set by the macro itself.
 * Because r5 survives the call, one user below parks a copy of cr0 in r5
 * across this macro.
 * Clobbers r0 directly, plus whatever the callee clobbers.
 */
56#define EXIT_VMX_OPS \
57	mflr    r0; \
58	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
59	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
60	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
61	std     r0,16(r1); \
62	stdu    r1,-STACKFRAMESIZE(r1); \
63	bl      CFUNC(exit_vmx_ops); \
64	ld      r0,STACKFRAMESIZE+16(r1); \
65	ld      r3,STK_REG(R31)(r1); \
66	ld      r4,STK_REG(R30)(r1); \
67	ld      r5,STK_REG(R29)(r1); \
68	addi	r1,r1,STACKFRAMESIZE; \
69	mtlr    r0
70
71/*
72 * LD_VSR_CROSS16B loads the 2nd aligned 16 bytes for _vaddr, which is not
73 * aligned to a 16 bytes boundary, and permutes them with the 1st 16 bytes.
74 *
75 *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
76 *    ^                                  ^                                 ^
77 * 0xbbbb10                          0xbbbb20                          0xbbbb30
78 *                                 ^
79 *                                _vaddr
80 *
81 *
82 * _vmask is the mask generated by LVS
83 * _v1st_qw is the 1st aligned QW of current addr which is already loaded.
84 *   for example: 0xyyyyyyyyyyyyy012 for big endian
85 * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded.
86 *   for example: 0x3456789abcdefzzz for big endian
87 * The permute result is saved in _v_res.
88 *   for example: 0x0123456789abcdef for big endian.
 *
 * The load is indexed with off16, so the caller must have loaded 16 into
 * off16 beforehand. On exit _v2nd_qw holds the newly loaded QW, which the
 * loop below reuses as the 1st QW of its next step.
89 */
90#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
91        lvx     _v2nd_qw,_vaddr,off16; \
92        VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
93
94/*
95 * There are 2 categories for memcmp:
96 * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
97 * are named like .Lsameoffset_xxxx
98 * 2) src/dst has different offset to the 8 bytes boundary. The handlers
99 * are named like .Ldiffoffset_xxxx
 *
 * In:  r3 = s1, r4 = s2, r5 = number of bytes to compare.
 * Out: r3 = 0 if no difference was found, otherwise a positive value when
 *      s1 compares greater and a negative value when it compares smaller
 *      (the byte loop returns the byte difference, the doubleword paths
 *      return +1 / -1).
100 */
101_GLOBAL_TOC(memcmp)
	/* cr1 <- is the length zero? Consumed by "beq cr1" below. */
102	cmpdi	cr1,r5,0
103
104	/* cr0 <- whether s1 and s2 share the same offset within an
105	 * 8 bytes word; consumed at .Lno_short to pick the handler family.
106	 */
107	xor	r6,r3,r4
108	andi.	r6,r6,7
109
110	/* cr6 <- is the length greater than 7? Lengths of 1 ~ 7 bytes
111	 * fall through into the byte loop at .Lshort.
112	 */
113	cmpdi   cr6,r5,7
114
115	beq	cr1,.Lzero
116	bgt	cr6,.Lno_short
117
/*
 * Byte-at-a-time compare, unrolled four times.
 * In: r3/r4 = current s1/s2 pointers, r5 = bytes left. r5 is loaded into
 * CTR and is expected to be non-zero; the visible branches to this label
 * test for zero first.
 * A mismatch leaves the byte difference in rC and exits via .Lnon_zero.
 */
118.Lshort:
119	mtctr	r5
1201:	lbz	rA,0(r3)
121	lbz	rB,0(r4)
122	subf.	rC,rB,rA	/* rC = rA - rB, cr0 <- rC compared with 0 */
123	bne	.Lnon_zero
124	bdz	.Lzero		/* CTR exhausted: everything matched */
125
126	lbz	rA,1(r3)
127	lbz	rB,1(r4)
128	subf.	rC,rB,rA
129	bne	.Lnon_zero
130	bdz	.Lzero
131
132	lbz	rA,2(r3)
133	lbz	rB,2(r4)
134	subf.	rC,rB,rA
135	bne	.Lnon_zero
136	bdz	.Lzero
137
138	lbz	rA,3(r3)
139	lbz	rB,3(r4)
140	subf.	rC,rB,rA
141	bne	.Lnon_zero
142
	/* four bytes done: advance and let bdnz count the 4th byte */
143	addi	r3,r3,4
144	addi	r4,r4,4
145
146	bdnz	1b
147
	/* common "buffers are equal" exit: return 0 */
148.Lzero:
149	li	r3,0
150	blr
151
/*
 * Length is greater than 7 here. Touch the first cache block of both
 * buffers, then dispatch on cr0, which still holds the result of the
 * "andi. r6,r6,7" done on (s1 ^ s2) at function entry.
 */
152.Lno_short:
153	dcbt	0,r3
154	dcbt	0,r4
155	bne	.Ldiffoffset_8bytes_make_align_start
156
157
158.Lsameoffset_8bytes_make_align_start:
159	/* attempt to compare bytes not aligned with 8 bytes so that
160	 * rest comparison can run based on 8 bytes alignment.
161	 */
	/* cr0 <- is s1 (and therefore s2) already 8 bytes aligned? */
162	andi.   r6,r3,7
163
164	/* Try to compare the first double word which is not 8 bytes aligned:
165	 * load the first double word at (src & ~7UL) and shift left appropriate
166	 * bits before comparison.
167	 */
	/* r6 = (r3 & 7) * 8 = number of bits in front of the first byte;
	 * this non-record form leaves the cr0 set by the andi. above intact.
	 */
168	rlwinm  r6,r3,3,26,28
169	beq     .Lsameoffset_8bytes_aligned
	/* round both pointers down to their 8 bytes boundary */
170	clrrdi	r3,r3,3
171	clrrdi	r4,r4,3
172	LD	rA,0,r3
173	LD	rB,0,r4
	/* shift the bytes located before the original start out of both */
174	sld	rA,rA,r6
175	sld	rB,rB,r6
176	cmpld	cr0,rA,rB
177	srwi	r6,r6,3		/* r6 = number of bytes skipped */
178	bne	cr0,.LcmpAB_lightweight
179	subfic  r6,r6,8		/* r6 = number of bytes just compared */
180	subf.	r5,r6,r5	/* r5 -= r6, cr0 <- anything left? */
181	addi	r3,r3,8
182	addi	r4,r4,8
183	beq	.Lzero
184
185.Lsameoffset_8bytes_aligned:
186	/* now we are aligned with 8 bytes.
187	 * Use .Llong loop if left cmp bytes are equal or greater than 32B.
188	 */
189	cmpdi   cr6,r5,31
190	bgt	cr6,.Llong
191
/*
 * Shared tail: compare the r5 (1 ~ 31) bytes left at r3/r4. Also entered
 * from the VMX paths and from the different-offset path, where r4 may be
 * unaligned.
 */
192.Lcmp_lt32bytes:
193	/* compare 1 ~ 31 bytes, at least r3 addr is 8 bytes aligned now */
194	cmpdi   cr5,r5,7
195	srdi    r0,r5,3		/* r0 = number of whole doublewords */
196	ble	cr5,.Lcmp_rest_lt8bytes
197
198	/* handle 8 ~ 31 bytes */
199	clrldi  r5,r5,61	/* r5 = r5 & 7: bytes left after the loop */
200	mtctr   r0
2012:
202	LD	rA,0,r3
203	LD	rB,0,r4
204	cmpld	cr0,rA,rB
205	addi	r3,r3,8
206	addi	r4,r4,8
207	bne	cr0,.LcmpAB_lightweight
208	bdnz	2b
209
210	cmpwi   r5,0
211	beq	.Lzero
212
213.Lcmp_rest_lt8bytes:
214	/*
215	 * Here we have less than 8 bytes to compare. At least s1 is aligned to
216	 * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a
217	 * page boundary, otherwise we might read past the end of the buffer and
218	 * trigger a page fault. We use 4K as the conservative minimum page
219	 * size. If we detect that case we go to the byte-by-byte loop.
220	 *
221	 * Otherwise the next double word is loaded from s1 and s2, and shifted
222	 * right to compare the appropriate bits.
223	 */
224	clrldi	r6,r4,(64-12)	// r6 = r4 & 0xfff
225	cmpdi	r6,0xff8
226	bgt	.Lshort
227
	/* r6 = (8 - r5) * 8 = number of bits that lie beyond the length */
228	subfic  r6,r5,8
229	slwi	r6,r6,3
230	LD	rA,0,r3
231	LD	rB,0,r4
	/* shift the bytes beyond the length out of both doublewords */
232	srd	rA,rA,r6
233	srd	rB,rB,r6
234	cmpld	cr0,rA,rB
235	bne	cr0,.LcmpAB_lightweight
236	b	.Lzero
237
	/* mismatch exit of the byte loop: return rC = (s1 byte - s2 byte) */
238.Lnon_zero:
239	mr	r3,rC
240	blr
241
/*
 * Compare 32 bytes per iteration with four pairs of doubleword loads.
 * In: r3/r4 = current s1/s2, r5 = bytes left (32 or more on every visible
 * path into here).
 *
 * The loop is software pipelined: the loads of the next 32 bytes are issued
 * while the compares of the previous ones are still being evaluated.
 * cr0, cr1, cr6 and cr7 carry the results for the rA/rB, rC/rD, rE/rF and
 * rG/rH pairs, and they are tested in address order.
 */
242.Llong:
243#ifdef CONFIG_ALTIVEC
244BEGIN_FTR_SECTION
245	/* Try to use vmx loop if length is equal or greater than 4K */
246	cmpldi  cr6,r5,VMX_THRESH
247	bge	cr6,.Lsameoffset_vmx_cmp
248END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
249
	/* also the fallback target when enter_vmx_ops() returned 0 */
250.Llong_novmx_cmp:
251#endif
252	/* At least s1 addr is aligned with 8 bytes */
253	li	off8,8
254	li	off16,16
255	li	off24,24
256
	/* rD..rH live in r27..r31: park the old values below r1. No frame
	 * is allocated; no call is made before they are reloaded at
	 * .Ltail / .Lout.
	 */
257	std	r31,-8(r1)
258	std	r30,-16(r1)
259	std	r29,-24(r1)
260	std	r28,-32(r1)
261	std	r27,-40(r1)
262
263	srdi	r0,r5,5		/* CTR = number of 32 bytes chunks */
264	mtctr	r0
265	andi.	r5,r5,31	/* r5 = bytes left for .Ltail */
266
	/* load chunk 0 completely, start comparing it */
267	LD	rA,0,r3
268	LD	rB,0,r4
269
270	LD	rC,off8,r3
271	LD	rD,off8,r4
272
273	LD	rE,off16,r3
274	LD	rF,off16,r4
275
276	LD	rG,off24,r3
277	LD	rH,off24,r4
278	cmpld	cr0,rA,rB
279
280	addi	r3,r3,32
281	addi	r4,r4,32
282
283	bdz	.Lfirst32	/* only one chunk: finish its compares there */
284
	/* load chunk 1 while finishing the compares of chunk 0 */
285	LD	rA,0,r3
286	LD	rB,0,r4
287	cmpld	cr1,rC,rD
288
289	LD	rC,off8,r3
290	LD	rD,off8,r4
291	cmpld	cr6,rE,rF
292
293	LD	rE,off16,r3
294	LD	rF,off16,r4
295	cmpld	cr7,rG,rH
296	bne	cr0,.LcmpAB
297
298	LD	rG,off24,r3
299	LD	rH,off24,r4
300	cmpld	cr0,rA,rB
301	bne	cr1,.LcmpCD
302
303	addi	r3,r3,32
304	addi	r4,r4,32
305
306	bdz	.Lsecond32
307
308	.balign	16
309
	/* steady state: on entry cr6/cr7 are pending from the chunk before
	 * the previous one, cr0 is pending from the previous chunk, and
	 * rC..rH of the previous chunk are still to be compared.
	 */
3101:	LD	rA,0,r3
311	LD	rB,0,r4
312	cmpld	cr1,rC,rD
313	bne	cr6,.LcmpEF
314
315	LD	rC,off8,r3
316	LD	rD,off8,r4
317	cmpld	cr6,rE,rF
318	bne	cr7,.LcmpGH
319
320	LD	rE,off16,r3
321	LD	rF,off16,r4
322	cmpld	cr7,rG,rH
323	bne	cr0,.LcmpAB
324
325	LD	rG,off24,r3
326	LD	rH,off24,r4
327	cmpld	cr0,rA,rB
328	bne	cr1,.LcmpCD
329
330	addi	r3,r3,32
331	addi	r4,r4,32
332
333	bdnz	1b
334
	/* drain the pipeline: no more loads, only the outstanding compares */
335.Lsecond32:
336	cmpld	cr1,rC,rD
337	bne	cr6,.LcmpEF
338
339	cmpld	cr6,rE,rF
340	bne	cr7,.LcmpGH
341
342	cmpld	cr7,rG,rH
343	bne	cr0,.LcmpAB
344
345	bne	cr1,.LcmpCD
346	bne	cr6,.LcmpEF
347	bne	cr7,.LcmpGH
348
	/* all whole chunks matched: reload r27..r31, then handle the
	 * remaining r5 (0 ~ 31) bytes with the byte loop.
	 */
349.Ltail:
350	ld	r31,-8(r1)
351	ld	r30,-16(r1)
352	ld	r29,-24(r1)
353	ld	r28,-32(r1)
354	ld	r27,-40(r1)
355
356	cmpdi	r5,0
357	beq	.Lzero
358	b	.Lshort
359
/*
 * Reached from the long loop when there is exactly one 32 bytes chunk:
 * cr0 already holds the rA/rB result, the other three pairs are compared
 * here.
 */
360.Lfirst32:
361	cmpld	cr1,rC,rD
362	cmpld	cr6,rE,rF
363	cmpld	cr7,rG,rH
364
365	bne	cr0,.LcmpAB
366	bne	cr1,.LcmpCD
367	bne	cr6,.LcmpEF
368	bne	cr7,.LcmpGH
369
370	b	.Ltail
371
/*
 * Mismatch exits of the long loop. Each one turns the unsigned compare
 * result held in its CR field into the return value: +1 if the s1
 * doubleword was greater, -1 otherwise, then leaves through .Lout.
 */
372.LcmpAB:
373	li	r3,1
374	bgt	cr0,.Lout
375	li	r3,-1
376	b	.Lout
377
378.LcmpCD:
379	li	r3,1
380	bgt	cr1,.Lout
381	li	r3,-1
382	b	.Lout
383
384.LcmpEF:
385	li	r3,1
386	bgt	cr6,.Lout
387	li	r3,-1
388	b	.Lout
389
390.LcmpGH:
391	li	r3,1
392	bgt	cr7,.Lout
393	li	r3,-1
394
	/* reload r27..r31 parked below r1 by the long loop, then return */
395.Lout:
396	ld	r31,-8(r1)
397	ld	r30,-16(r1)
398	ld	r29,-24(r1)
399	ld	r28,-32(r1)
400	ld	r27,-40(r1)
401	blr
402
/*
 * Mismatch exit for the paths that never saved r27..r31.
 * In: cr0 = result of "cmpld cr0,<s1 dword>,<s2 dword>".
 * Out: r3 = +1 if greater, -1 otherwise.
 */
403.LcmpAB_lightweight:   /* skip NV GPRS restore */
404	li	r3,1
405	bgtlr
406	li	r3,-1
407	blr
408
409#ifdef CONFIG_ALTIVEC
410.Lsameoffset_vmx_cmp:
411	/* Enter with src/dst addrs has the same offset with 8 bytes
412	 * align boundary.
413	 *
414	 * There is an optimization based on following fact: memcmp()
415	 * tends to fail early, within the first 32 bytes.
416	 * Before applying VMX instructions which will lead to 32x128bits
417	 * VMX regs load/restore penalty, we compare the first 32 bytes
418	 * so that we can catch the ~80% fail cases.
419	 */
420
	/* scalar pre-check: 4 doublewords, r5 reduced by 8 for each match */
421	li	r0,4
422	mtctr	r0
423.Lsameoffset_prechk_32B_loop:
424	LD	rA,0,r3
425	LD	rB,0,r4
426	cmpld	cr0,rA,rB
427	addi	r3,r3,8
428	addi	r4,r4,8
429	bne     cr0,.LcmpAB_lightweight
430	addi	r5,r5,-8
431	bdnz	.Lsameoffset_prechk_32B_loop
432
	/* cr1 "equal" means enter_vmx_ops() returned 0: continue with the
	 * scalar long loop on the bytes that are left.
	 */
433	ENTER_VMX_OPS
434	beq     cr1,.Llong_novmx_cmp
435
	/* NOTE(review): label 3 is not referenced in the visible code. */
4363:
437	/* need to check whether r4 has the same offset with r3
438	 * for 16 bytes boundary.
439	 */
440	xor	r0,r3,r4
441	andi.	r0,r0,0xf
442	bne	.Ldiffoffset_vmx_cmp_start
443
444	/* Both pointers share the same offset within 16 bytes. If they
445	 * are not 16 bytes aligned yet, compare one more doubleword first.
	 */
446	andi.	rA,r3,8		/* cr0 <- is bit 3 of the address clear? */
447	LD	rA,0,r3		/* overwrites rA, cr0 is kept */
448	beq	4f
449	LD	rB,0,r4
450	cmpld	cr0,rA,rB
451	addi	r3,r3,8
452	addi	r4,r4,8
453	addi	r5,r5,-8
454
455	beq	cr0,4f
456	/* Mismatch: keep cr0 in r5 across EXIT_VMX_OPS (which preserves
	 * r5), then let the lightweight exit turn it into +1 / -1.
	 */
457	mfocrf  r5,128
458	EXIT_VMX_OPS
459	mtocrf  128,r5
460	b	.LcmpAB_lightweight
461
4624:
463	/* compare 32 bytes for each loop */
464	srdi	r0,r5,5		/* CTR = number of 32 bytes chunks */
465	mtctr	r0
466	clrldi  r5,r5,59	/* r5 = r5 & 31: bytes left after the loop */
467	li	off16,16
468
469.balign 16
	/* VCMPEQUD_RC comes from the included headers; presumably it is the
	 * record form vector compare whose "all elements equal" outcome sets
	 * the LT bit of cr6, so "bnl cr6" leaves the loop on a difference.
	 */
4705:
471	lvx 	v0,0,r3
472	lvx 	v1,0,r4
473	VCMPEQUD_RC(v0,v0,v1)
474	bnl	cr6,7f
475	lvx 	v0,off16,r3
476	lvx 	v1,off16,r4
477	VCMPEQUD_RC(v0,v0,v1)
478	bnl	cr6,6f
479	addi	r3,r3,32
480	addi	r4,r4,32
481	bdnz	5b
482
	/* all whole chunks matched: leave VMX, finish the 0 ~ 31 bytes left */
483	EXIT_VMX_OPS
484	cmpdi	r5,0
485	beq	.Lzero
486	b	.Lcmp_lt32bytes
487
	/* difference is in the second 16 bytes of the chunk: step over the
	 * first 16 and share the code at 7.
	 */
4886:
489	addi	r3,r3,16
490	addi	r4,r4,16
491
4927:
493	/* The 16 bytes at r3/r4 differ: leave VMX and compare them again
	 * as two doublewords to work out the sign of the result.
	 */
494	EXIT_VMX_OPS
495	LD	rA,0,r3
496	LD	rB,0,r4
497	cmpld	cr0,rA,rB
498	li	off8,8		/* off8 (r6) is not known to hold 8 here */
499	bne	cr0,.LcmpAB_lightweight
500
501	LD	rA,off8,r3
502	LD	rB,off8,r4
503	cmpld	cr0,rA,rB
504	bne	cr0,.LcmpAB_lightweight
505	b	.Lzero
506#endif
507
/*
 * s1 and s2 have different offsets within an 8 bytes word. Reached from
 * .Lno_short, so more than 7 bytes are left. Compare the bytes from s1 up
 * to its next 8 bytes boundary so that s1 becomes aligned; s2 stays
 * unaligned.
 */
508.Ldiffoffset_8bytes_make_align_start:
509	/* now try to align s1 with 8 bytes */
	/*
	 * r6 = (r3 & 7) * 8 = number of bits in front of s1 inside its
	 * aligned doubleword. The record form is required here: on entry cr0
	 * still holds the "offsets differ" result of the function entry, so
	 * with a plain rlwinm the beq below could never be taken and an
	 * already aligned s1 was pushed through the unaligned sequence.
	 */
510	rlwinm. r6,r3,3,26,28
511	beq     .Ldiffoffset_align_s1_8bytes
512
513	clrrdi	r3,r3,3		/* round s1 down to its 8 bytes boundary */
514	LD	rA,0,r3
515	LD	rB,0,r4  /* unaligned load */
	/* rA: clear the bytes in front of s1, leaving the valid bytes in
	 * the low positions. rB: move its leading bytes to the same low
	 * positions, dropping the bytes s1 has no counterpart for.
	 */
516	sld	rA,rA,r6
517	srd	rA,rA,r6
518	srd	rB,rB,r6
519	cmpld	cr0,rA,rB
520	srwi	r6,r6,3		/* r6 = number of bytes skipped in s1 */
521	bne	cr0,.LcmpAB_lightweight
522
523	subfic  r6,r6,8		/* r6 = number of bytes just compared */
524	subf.	r5,r6,r5	/* r5 -= r6, cr0 <- anything left? */
525	addi	r3,r3,8		/* s1: next aligned doubleword */
526	add	r4,r4,r6	/* s2: advance by the bytes compared */
527
528	beq	.Lzero
529
530.Ldiffoffset_align_s1_8bytes:
531	/* now s1 is aligned with 8 bytes. */
532#ifdef CONFIG_ALTIVEC
533BEGIN_FTR_SECTION
534	/* only do vmx ops when the size equal or greater than 4K bytes */
	/* NOTE(review): signed cmpdi here, while the matching test at .Llong
	 * uses the unsigned cmpldi.
	 */
535	cmpdi	cr5,r5,VMX_THRESH
536	bge	cr5,.Ldiffoffset_vmx_cmp
537END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
538
	/* also the fallback target when enter_vmx_ops() returned 0 */
539.Ldiffoffset_novmx_cmp:
540#endif
541
542
	/* 31 bytes or less: shared short tail; otherwise the scalar
	 * 32-bytes-per-iteration loop, whose loads from s2 are then unaligned.
	 */
543	cmpdi   cr5,r5,31
544	ble	cr5,.Lcmp_lt32bytes
545
546#ifdef CONFIG_ALTIVEC
547	b	.Llong_novmx_cmp
548#else
549	b	.Llong
550#endif
551
552#ifdef CONFIG_ALTIVEC
/*
 * VMX compare for s1/s2 with different offsets. In: r3 (8 bytes aligned)
 * and r4 = current s1/s2, r5 = bytes left (VMX_THRESH or more).
 * s1 is read with aligned lvx loads; the matching 16 bytes of s2 are built
 * from two aligned quadwords with LD_VSR_CROSS16B.
 */
553.Ldiffoffset_vmx_cmp:
554	/* perform a 32 bytes pre-checking before
555	 * enable VMX operations.
556	 */
557	li	r0,4
558	mtctr	r0
559.Ldiffoffset_prechk_32B_loop:
560	LD	rA,0,r3
561	LD	rB,0,r4
562	cmpld	cr0,rA,rB
563	addi	r3,r3,8
564	addi	r4,r4,8
565	bne     cr0,.LcmpAB_lightweight
566	addi	r5,r5,-8
567	bdnz	.Ldiffoffset_prechk_32B_loop
568
	/* cr1 "equal" means enter_vmx_ops() returned 0: use scalar code */
569	ENTER_VMX_OPS
570	beq     cr1,.Ldiffoffset_novmx_cmp
571
	/* also entered from the same-offset VMX path when the two pointers
	 * differ in their offset within 16 bytes.
	 */
572.Ldiffoffset_vmx_cmp_start:
573	/* Firstly try to align r3 with 16 bytes */
574	andi.   r6,r3,0xf	/* r6 = offset of s1 within 16 bytes */
575	li	off16,16	/* needed by LD_VSR_CROSS16B; keeps cr0 */
576	beq     .Ldiffoffset_vmx_s1_16bytes_align
577
	/* s1 is not 16 bytes aligned: build the 16 bytes starting at r3
	 * (v9) and at r4 (v10) from two aligned quadwords each.
	 */
578	LVS	v3,0,r3
579	LVS	v4,0,r4
580
581	lvx     v5,0,r3
582	lvx     v6,0,r4
583	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
584	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
585
	/* VCMPEQUB_RC comes from the included headers; presumably the
	 * record form byte compare, with cr6 LT set when all bytes match.
	 */
586	VCMPEQUB_RC(v7,v9,v10)
587	bnl	cr6,.Ldiffoffset_vmx_diff_found
588
	/* advance by (16 - offset) bytes only, which makes r3 16 bytes
	 * aligned; the rest of the 16 bytes just compared is compared again
	 * by the loop below.
	 */
589	subfic  r6,r6,16
590	subf    r5,r6,r5
591	add     r3,r3,r6
592	add     r4,r4,r6
593
594.Ldiffoffset_vmx_s1_16bytes_align:
595	/* now s1 is aligned with 16 bytes */
596	lvx     v6,0,r4		/* 1st aligned QW of s2 */
597	LVS	v4,0,r4		/* permute control for the s2 offset */
598	srdi	r6,r5,5  /* loop for 32 bytes each */
599	clrldi  r5,r5,59	/* r5 = r5 & 31: bytes left after the loop */
600	mtctr	r6
601
602.balign	16
603.Ldiffoffset_vmx_32bytesloop:
604	/* the first qw of r4 was saved in v6 */
605	lvx	v9,0,r3
606	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
607	VCMPEQUB_RC(v7,v9,v10)
608	vor	v6,v8,v8	/* v6 = v8: 2nd QW becomes the next 1st QW */
609	bnl	cr6,.Ldiffoffset_vmx_diff_found
610
611	addi	r3,r3,16
612	addi	r4,r4,16
613
614	lvx	v9,0,r3
615	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
616	VCMPEQUB_RC(v7,v9,v10)
617	vor	v6,v8,v8
618	bnl	cr6,.Ldiffoffset_vmx_diff_found
619
620	addi	r3,r3,16
621	addi	r4,r4,16
622
623	bdnz	.Ldiffoffset_vmx_32bytesloop
624
	/* all whole chunks matched: leave VMX, finish the 0 ~ 31 bytes left */
625	EXIT_VMX_OPS
626
627	cmpdi	r5,0
628	beq	.Lzero
629	b	.Lcmp_lt32bytes
630
631.Ldiffoffset_vmx_diff_found:
632	EXIT_VMX_OPS
633	/* The difference lies within the 16 bytes at r3/r4: compare exactly
	 * those again with the scalar code to get the sign of the result.
	 */
634	li	r5,16
635	b	.Lcmp_lt32bytes
636
637#endif
638EXPORT_SYMBOL(memcmp)
639