/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"

#ifndef MEMMOVE
# define MEMMOVE		memmove
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)		\
	.type name,  @function;		\
	.globl name;		\
	.p2align 4;		\
name:		\
	cfi_startproc
#endif

#ifndef ALIAS_SYMBOL
# define ALIAS_SYMBOL(alias, original) \
	.globl alias; \
	.equ alias, original
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

#define CFI_PUSH(REG)		\
	cfi_adjust_cfa_offset (4);		\
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG)		\
	cfi_adjust_cfa_offset (-4);		\
	cfi_restore (REG)

#define PUSH(REG)	push REG;
#define POP(REG)	pop REG;

#define ENTRANCE	PUSH (%rbx);
#define RETURN_END	POP (%rbx); ret
#define RETURN		RETURN_END;

	.section .text.sse2,"ax",@progbits
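
/* Note on the entry contract (SysV AMD64 ABI): %rdi = dst, %rsi = src,
	%rdx = length in bytes; the original dst pointer is returned in %rax.
	%rbx is callee-saved, which is why ENTRANCE/RETURN push and pop it
	around the body.  */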
ENTRY (MEMMOVE)
	ENTRANCE
	mov	%rdi, %rax

/* Check whether we should copy backward or forward.  */
	cmp	%rsi, %rdi
	je	L(mm_return)
	jg	L(mm_len_0_or_more_backward)

/* Now do checks for lengths. We handle [0..16], [17..32], [33..64],
	[65..128] and larger lengths separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_forward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_forward)

/* Copy [17..32] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)
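
/* Note: in the small-size blocks above and below, all head and tail loads
	are performed before any store, so the stores may overlap in the middle
	for lengths below the block maximum.  This is what lets one or two
	pairs of 16-byte moves cover a whole size range.  */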

L(mm_len_32_or_more_forward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_forward)

/* Copy [33..64] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_forward)

/* Copy [65..128] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_forward):
/* Align the destination address.  First save the leading (possibly
	unaligned) 64 bytes of the block.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3

	lea	64(%rdi), %r8
	and	$-64, %r8  /* r8 now aligned to the next 64-byte boundary */
	sub	%rdi, %rsi /* rsi = src - dst = diff */

	movdqu	(%r8, %rsi), %xmm4
	movdqu	16(%r8, %rsi), %xmm5
	movdqu	32(%r8, %rsi), %xmm6
	movdqu	48(%r8, %rsi), %xmm7

	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqa	%xmm4, (%r8)
	movaps	%xmm5, 16(%r8)
	movaps	%xmm6, 32(%r8)
	movaps	%xmm7, 48(%r8)
	add	$64, %r8

	lea	(%rdi, %rdx), %rbx
	and	$-64, %rbx
	cmp	%r8, %rbx
	jbe	L(mm_copy_remaining_forward)

	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
	jae	L(mm_large_page_loop_forward)
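
/* Note: at this point everything in dst below %r8 has already been copied
	and %r8 is 64-byte aligned; %rbx is the 64-byte-aligned end of dst
	(rounded down); %rsi holds src - dst, so (%r8, %rsi) addresses the
	matching source byte.  The loop below copies aligned 64-byte chunks
	until %r8 reaches %rbx.  Copies of at least SHARED_CACHE_SIZE_HALF
	bytes (defined in cache.h) take the non-temporal loop further down
	instead.  */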

	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%r8, %rsi)

	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movdqa	%xmm0, (%r8)
	movaps	%xmm1, 16(%r8)
	movaps	%xmm2, 32(%r8)
	movaps	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_main_loop_forward)
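
/* Note: in the loop above, prefetcht0 pulls the source data 128 bytes ahead
	of the current position into the caches; loads use movdqu because only
	the destination was aligned, while the movdqa/movaps stores rely on the
	64-byte destination alignment set up earlier.  */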

L(mm_copy_remaining_forward):
	add	%rdi, %rdx
	sub	%r8, %rdx
/* Everything up to %r8 in dst has been copied and %rdx now holds the number
	of bytes left to copy.  Compute the matching source position in %r9
	(%rsi still holds src - dst).  */
	lea	(%r8, %rsi), %r9

L(mm_remaining_0_64_bytes_forward):
	cmp	$32, %rdx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmp	$16, %rdx
	ja	L(mm_remaining_17_32_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)

	cmpb	$8, %dl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %dl
	.p2align 4,,5
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %dl
	.p2align 4,,1
	ja	L(mm_remaining_3_4_bytes_forward)
	movzbl	-1(%r9,%rdx), %esi
	movzbl	(%r9), %ebx
	movb	%sil, -1(%r8,%rdx)
	movb	%bl, (%r8)
	jmp	L(mm_return)

L(mm_remaining_33_64_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	16(%r9), %xmm1
	movdqu	-32(%r9, %rdx), %xmm2
	movdqu	-16(%r9, %rdx), %xmm3
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, 16(%r8)
	movdqu	%xmm2, -32(%r8, %rdx)
	movdqu	%xmm3, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_17_32_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	-16(%r9, %rdx), %xmm1
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_5_8_bytes_forward):
	movl	(%r9), %esi
	movl	-4(%r9,%rdx), %ebx
	movl	%esi, (%r8)
	movl	%ebx, -4(%r8,%rdx)
	jmp	L(mm_return)

L(mm_remaining_9_16_bytes_forward):
	mov	(%r9), %rsi
	mov	-8(%r9, %rdx), %rbx
	mov	%rsi, (%r8)
	mov	%rbx, -8(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%r9,%rdx), %esi
	movzwl	(%r9), %ebx
	movw	%si, -2(%r8,%rdx)
	movw	%bx, (%r8)
	jmp	L(mm_return)

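/* Note: dispatch on the low bits of the length (0 <= %rdx <= 16 here):
	bit 3 or 4 set -> 8..16 bytes, else bit 2 set -> 4..7 bytes, else a
	zero length returns, else bit 1 set -> 2..3 bytes, else a single byte.
	Each case copies a head and a tail that may overlap.  */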
L(mm_len_0_16_bytes_forward):
	testb	$24, %dl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %dl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %esi
	movb	%bl, -1(%rdi,%rdx)
	movb	%sil, (%rdi)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %esi
	movw	%bx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %esi
	movl	%ebx, (%rdi)
	movl	%esi, -4(%rdi,%rdx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
	mov	(%rsi), %rbx
	mov	-8(%rsi, %rdx), %rsi
	mov	%rbx, (%rdi)
	mov	%rsi, -8(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_recalc_len):
/* Compute in %rdx how many bytes are left to copy after
	the main loop stops.  */
	mov	%rbx, %rdx
	sub	%rdi, %rdx
/* The code for copying backwards.  */
L(mm_len_0_or_more_backward):

/* Now do checks for lengths. We handle [0..16], [17..32], [33..64],
	[65..128] and larger lengths separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_backward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_backward)

/* Copy [17..32] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_backward)

/* Copy [33..64] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_backward)

/* Copy [65..128] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_backward):
/* Align the destination address.  We first save the last 64 bytes of the
	source so that they are not clobbered by the stores below when the
	regions overlap.  */
	movdqu	-16(%rsi, %rdx), %xmm0
	movdqu	-32(%rsi, %rdx), %xmm1
	movdqu	-48(%rsi, %rdx), %xmm2
	movdqu	-64(%rsi, %rdx), %xmm3

	lea	(%rdi, %rdx), %r9
	and	$-64, %r9 /* r9 = aligned dst */

	mov	%rsi, %r8
	sub	%rdi, %r8 /* r8 = src - dst, diff */

	movdqu	-16(%r9, %r8), %xmm4
	movdqu	-32(%r9, %r8), %xmm5
	movdqu	-48(%r9, %r8), %xmm6
	movdqu	-64(%r9, %r8), %xmm7

	movdqu	%xmm0, -16(%rdi, %rdx)
	movdqu	%xmm1, -32(%rdi, %rdx)
	movdqu	%xmm2, -48(%rdi, %rdx)
	movdqu	%xmm3, -64(%rdi, %rdx)
	movdqa	%xmm4, -16(%r9)
	movaps	%xmm5, -32(%r9)
	movaps	%xmm6, -48(%r9)
	movaps	%xmm7, -64(%r9)
	lea	-64(%r9), %r9

	lea	64(%rdi), %rbx
	and	$-64, %rbx

	cmp	%r9, %rbx
	jae	L(mm_recalc_len)

	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
	jae	L(mm_large_page_loop_backward)
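
/* Note: mirror image of the forward setup.  Everything in dst from %r9
	upwards has already been copied (%r9 is 64-byte aligned), %rbx is the
	first 64-byte-aligned address above %rdi, and %r8 holds src - dst.
	The loop below copies aligned 64-byte chunks downwards until %r9
	reaches %rbx, after which the head is finished via L(mm_recalc_len);
	lengths of at least SHARED_CACHE_SIZE_HALF again use the non-temporal
	variant.  */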

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%r9, %r8)

	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movdqa	%xmm0, -64(%r9)
	movaps	%xmm1, -48(%r9)
	movaps	%xmm2, -32(%r9)
	movaps	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_main_loop_backward)
	jmp	L(mm_recalc_len)

/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
	testb	$24, %dl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %dl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %ecx
	movb	%bl, -1(%rdi,%rdx)
	movb	%cl, (%rdi)
	jmp	L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %ecx
	movw	%bx, -2(%rdi,%rdx)
	movw	%cx, (%rdi)
	jmp	L(mm_return)

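/* Note: for 8..16 bytes this copies the last 8 bytes with two dword moves,
	shrinks the length by 8, and re-enters the 0..16 dispatch above to
	finish the remaining head.  */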
L(mm_len_9_16_bytes_backward):
	movl	-4(%rsi,%rdx), %ebx
	movl	-8(%rsi,%rdx), %ecx
	movl	%ebx, -4(%rdi,%rdx)
	movl	%ecx, -8(%rdi,%rdx)
	sub	$8, %rdx
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %ecx
	movl	%ebx, (%rdi)
	movl	%ecx, -4(%rdi,%rdx)

L(mm_return):
	RETURN

/* Big length copy forward part.  */
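/* Note: movntdq performs non-temporal (write-combining) stores that bypass
	the caches, which avoids displacing useful cache contents once the copy
	is large relative to the shared cache (the SHARED_CACHE_SIZE_HALF
	threshold above).  The sfence after each loop orders the weakly-ordered
	non-temporal stores with respect to later stores.  */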

	.p2align 4
L(mm_large_page_loop_forward):
	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movntdq	%xmm0, (%r8)
	movntdq	%xmm1, 16(%r8)
	movntdq	%xmm2, 32(%r8)
	movntdq	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_large_page_loop_forward)
	sfence
	jmp	L(mm_copy_remaining_forward)

/* Big length copy backward part.  */
	.p2align 4
L(mm_large_page_loop_backward):
	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movntdq	%xmm0, -64(%r9)
	movntdq	%xmm1, -48(%r9)
	movntdq	%xmm2, -32(%r9)
	movntdq	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_large_page_loop_backward)
	sfence
	jmp	L(mm_recalc_len)

END (MEMMOVE)

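/* memcpy is provided as an alias of this routine: because memmove also
	handles non-overlapping buffers, one implementation can back both
	symbols.  */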
ALIAS_SYMBOL(memcpy, MEMMOVE)