xref: /linux/arch/x86/lib/memmove_64.S (revision 7f81907b7e3f93dfed2e903af52659baa4944341)
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Normally compiler builtins are used, but sometimes the compiler calls out
 * of line code. Based on asm-i386/string.h.
 *
 * This assembly file is re-written from memmove_64.c file.
 *	- Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
 */
#include <linux/export.h>
#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>

/*
 * memmove is used as a plain symbol name by the alias and export at the
 * end of this file; drop any preprocessor macro of that name first.
 */
#undef memmove

.section .noinstr.text, "ax"

/*
 * Implement memmove(). This can handle overlap between src and dst.
 *
 * Input:
 * rdi: dest
 * rsi: src
 * rdx: count
 *
 * Output:
 * rax: dest
 *
 * Besides rax, the code below writes rcx, rdx, rsi, rdi, r8-r11 and the
 * flags (not every path touches every one of them). The backward
 * "rep movsq" path sets the direction flag with std and clears it again
 * with cld before it returns.
 *
 * Copy direction (both tests are signed comparisons):
 *  - src >= dest, or src + count <= dest: copy forward.
 *  - otherwise (src < dest < src + count): copy backward.
 *
 * Fewer than 32 remaining bytes are always finished by the code at
 * label 1, which loads both ends of the region before it stores anything.
 */
SYM_TYPED_FUNC_START(__memmove)

	/* The return value is the unmodified dest. */
	mov %rdi, %rax

	/* Decide forward/backward copy mode */
	cmp %rdi, %rsi
	jge .Lmemmove_begin_forward
	/* src < dest: go backward only if src + count lies above dest. */
	mov %rsi, %r8
	add %rdx, %r8
	cmp %rdi, %r8
	jg 2f

/* CHECK_LEN: counts below 32 bytes go straight to the tail code at 1. */
#define CHECK_LEN	cmp $0x20, %rdx; jb 1f
/* MEMMOVE_BYTES: copy all rdx bytes with one rep movsb and return. */
#define MEMMOVE_BYTES	movq %rdx, %rcx; rep movsb; RET
.Lmemmove_begin_forward:
	/*
	 * As laid out in the ALTERNATIVE_2 arguments:
	 *  - first sequence (no feature attached): length check only;
	 *  - X86_FEATURE_ERMS: length check, then rep movsb and return;
	 *  - X86_FEATURE_FSRM: rep movsb and return for every length.
	 */
	ALTERNATIVE_2 __stringify(CHECK_LEN), \
		      __stringify(CHECK_LEN; MEMMOVE_BYTES), X86_FEATURE_ERMS, \
		      __stringify(MEMMOVE_BYTES), X86_FEATURE_FSRM

	/*
	 * The movsq instruction has a high startup latency,
	 * so sizes below 680 bytes are copied through general registers.
	 */
	cmp $680, %rdx
	jb 3f
	/*
	 * The movsq instruction is only good for the aligned case:
	 * use it only when the low bytes of src and dest are equal.
	 */
	cmpb %dil, %sil
	je 4f
3:
	/*
	 * Bias the count by -32 so that the borrow of the subtraction at
	 * the top of the loop signals that fewer than 32 bytes will remain
	 * after the current iteration.
	 */
	sub $0x20, %rdx
	/*
	 * We gobble 32 bytes forward in each loop.
	 */
5:
	sub $0x20, %rdx
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq 2*8(%rsi), %r9
	movq 3*8(%rsi), %r8
	leaq 4*8(%rsi), %rsi

	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, 2*8(%rdi)
	movq %r8, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	/*
	 * mov and lea leave the flags alone, so jae still tests the
	 * carry (borrow) of the sub at label 5.
	 */
	jae 5b
	/* Undo the bias: rdx = bytes still to copy (0..31). */
	addq $0x20, %rdx
	jmp 1f
	/*
	 * Handle data forward by movsq.
	 *
	 * The last 8 source bytes are loaded into r11 before the copy and
	 * stored to dest + count - 8 afterwards; this covers the count % 8
	 * bytes that rep movsq (count / 8 quadwords) does not move.
	 */
	.p2align 4
4:
	movq %rdx, %rcx
	movq -8(%rsi, %rdx), %r11
	lea -8(%rdi, %rdx), %r10
	shrq $3, %rcx
	rep movsq
	movq %r11, (%r10)
	jmp 13f
.Lmemmove_end_forward:

	/*
	 * Handle data backward by movsq.
	 *
	 * Mirror image of the forward case: the first 8 source bytes are
	 * saved in r11, rsi/rdi are moved to the last quadword of each
	 * region, and the copy runs downwards with the direction flag set.
	 * The saved quadword is stored to the original dest (r10) last.
	 */
	.p2align 4
7:
	movq %rdx, %rcx
	movq (%rsi), %r11
	movq %rdi, %r10
	leaq -8(%rsi, %rdx), %rsi
	leaq -8(%rdi, %rdx), %rdi
	shrq $3, %rcx
	std
	rep movsq
	cld
	movq %r11, (%r10)
	jmp 13f

	/*
	 * Start to prepare for backward copy.
	 * Same size and alignment tests as on the forward side.
	 */
	.p2align 4
2:
	cmp $0x20, %rdx
	jb 1f
	cmp $680, %rdx
	jb 6f
	cmpb %dil, %sil
	je 7b
6:
	/*
	 * Calculate copy position to tail.
	 * rdx is biased by -32 exactly as in the forward loop.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * We gobble 32 bytes backward in each loop.
	 */
8:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r11
	movq -2*8(%rsi), %r10
	movq -3*8(%rsi), %r9
	movq -4*8(%rsi), %r8
	leaq -4*8(%rsi), %rsi

	movq %r11, -1*8(%rdi)
	movq %r10, -2*8(%rdi)
	movq %r9, -3*8(%rdi)
	movq %r8, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	/* Flags are still those of the subq at label 8. */
	jae 8b
	/*
	 * Calculate copy position to head.
	 * rdx becomes the number of bytes left (0..31); moving rsi/rdi
	 * down by that amount points them at the start of the part that
	 * has not been copied yet.
	 */
	addq $0x20, %rdx
	subq %rdx, %rsi
	subq %rdx, %rdi
1:
	/*
	 * Tail: rdx < 32 bytes starting at rsi/rdi. Each size class loads
	 * the first and the last chunk of the region (they may overlap
	 * each other) and only then stores both.
	 */
	cmpq $16, %rdx
	jb 9f
	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq -2*8(%rsi, %rdx), %r9
	movq -1*8(%rsi, %rdx), %r8
	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, -2*8(%rdi, %rdx)
	movq %r8, -1*8(%rdi, %rdx)
	jmp 13f
	.p2align 4
9:
	cmpq $8, %rdx
	jb 10f
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r11
	movq -1*8(%rsi, %rdx), %r10
	movq %r11, 0*8(%rdi)
	movq %r10, -1*8(%rdi, %rdx)
	jmp 13f
10:
	cmpq $4, %rdx
	jb 11f
	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %r11d
	movl -4(%rsi, %rdx), %r10d
	movl %r11d, (%rdi)
	movl %r10d, -4(%rdi, %rdx)
	jmp 13f
11:
	cmp $2, %rdx
	jb 12f
	/*
	 * Move data from 2 bytes to 3 bytes.
	 */
	movw (%rsi), %r11w
	movw -2(%rsi, %rdx), %r10w
	movw %r11w, (%rdi)
	movw %r10w, -2(%rdi, %rdx)
	jmp 13f
12:
	cmp $1, %rdx
	jb 13f
	/*
	 * Move data for 1 byte.
	 */
	movb (%rsi), %r11b
	movb %r11b, (%rdi)
13:
	RET
SYM_FUNC_END(__memmove)
EXPORT_SYMBOL(__memmove)

/* Make memmove available under its plain name as an alias of __memmove. */
SYM_FUNC_ALIAS_MEMFUNC(memmove, __memmove)
EXPORT_SYMBOL(memmove)