xref: /linux/arch/x86/lib/memcpy_64.S (revision 3b812ecce736432e6b55e77028ea387eb1517d24)
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>

/*
 * We build a jump to memcpy_orig by default, which gets NOPped out on
 * the majority of x86 CPUs, i.e. those that set REP_GOOD. On CPUs that
 * additionally have the enhanced REP MOVSB/STOSB feature (ERMS), those
 * NOPs are instead patched into a jmp to memcpy_erms, which does the
 * whole copy with a single REP MOVSB.
 */
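
/*
 * For orientation, a rough C sketch of the dispatch that the ALTERNATIVE_2
 * patching below effectively implements. Illustrative only: the real
 * mechanism patches the code once at boot rather than branching per call,
 * boot_cpu_has() is used merely to name the feature checks, and
 * memcpy_rep_movsq() is a made-up name for the REP MOVSQ body of memcpy:
 *
 *	if (boot_cpu_has(X86_FEATURE_ERMS))
 *		return memcpy_erms(dst, src, len);
 *	if (!boot_cpu_has(X86_FEATURE_REP_GOOD))
 *		return memcpy_orig(dst, src, len);
 *	return memcpy_rep_movsq(dst, src, len);
 */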

.weak memcpy

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 */
ENTRY(__memcpy)
ENTRY(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

	movq %rdi, %rax		/* return the original destination */
	movq %rdx, %rcx
	shrq $3, %rcx		/* number of whole 8-byte words */
	andl $7, %edx		/* remaining 0..7 tail bytes */
	rep movsq
	movl %edx, %ecx
	rep movsb		/* copy the tail byte by byte */
	ret
ENDPROC(memcpy)
ENDPROC(__memcpy)
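
/*
 * Hedged C sketch of the REP_GOOD path above, for readers less used to the
 * string instructions. Illustrative only; the function and variable names
 * are made up for the sketch and do not exist in the tree. The first loop
 * corresponds to the REP MOVSQ, the second to the trailing REP MOVSB:
 *
 *	void *memcpy_rep_movsq(void *dst, const void *src, size_t len)
 *	{
 *		u64 *d = dst;
 *		const u64 *s = src;
 *		size_t qwords = len >> 3;
 *		size_t tail = len & 7;
 *		unsigned char *db;
 *		const unsigned char *sb;
 *
 *		while (qwords--)
 *			*d++ = *s++;
 *		db = (unsigned char *)d;
 *		sb = (const unsigned char *)s;
 *		while (tail--)
 *			*db++ = *sb++;
 *		return dst;
 *	}
 */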

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
ENTRY(memcpy_erms)
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	ret
ENDPROC(memcpy_erms)

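/*
 * Rough equivalent of memcpy_erms() in GCC extended inline assembly, for
 * reference. This is a sketch under the assumption that dst/src/len live in
 * ordinary C variables; memcpy_erms_sketch() is a made-up name and is not
 * used anywhere in the tree:
 *
 *	static inline void *memcpy_erms_sketch(void *dst, const void *src,
 *					       size_t len)
 *	{
 *		void *ret = dst;
 *
 *		asm volatile("rep movsb"
 *			     : "+D" (dst), "+S" (src), "+c" (len)
 *			     : : "memory");
 *		return ret;
 *	}
 *
 * On ERMS hardware the single byte-granular REP MOVSB is handled by
 * optimized microcode, which is why it can beat the unrolled loops.
 */
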
ENTRY(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * Check whether a memory false dependence could occur between the
	 * loads and stores, then jump to the corresponding copy mode.
	 */
	cmp  %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20,	%rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi),	%r8
	movq 1*8(%rsi),	%r9
	movq 2*8(%rsi),	%r10
	movq 3*8(%rsi),	%r11
	leaq 4*8(%rsi),	%rsi

	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	2*8(%rdi)
	movq %r11,	3*8(%rdi)
	leaq 4*8(%rdi),	%rdi
	jae  .Lcopy_forward_loop
	addl $0x20,	%edx
	jmp  .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx,	%rsi
	addq %rdx,	%rdi
	subq $0x20,	%rdx
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20,	%rdx
	movq -1*8(%rsi),	%r8
	movq -2*8(%rsi),	%r9
	movq -3*8(%rsi),	%r10
	movq -4*8(%rsi),	%r11
	leaq -4*8(%rsi),	%rsi
	movq %r8,		-1*8(%rdi)
	movq %r9,		-2*8(%rdi)
	movq %r10,		-3*8(%rdi)
	movq %r11,		-4*8(%rdi)
	leaq -4*8(%rdi),	%rdi
	jae  .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20,	%edx
	subq %rdx,	%rsi
	subq %rdx,	%rdi
.Lhandle_tail:
	cmpl $16,	%edx
	jb   .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi),	%r9
	movq -2*8(%rsi, %rdx),	%r10
	movq -1*8(%rsi, %rdx),	%r11
	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	-2*8(%rdi, %rdx)
	movq %r11,	-1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8,	%edx
	jb   .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi),	%r8
	movq -1*8(%rsi, %rdx),	%r9
	movq %r8,	0*8(%rdi)
	movq %r9,	-1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4,	%edx
	jb   .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	retq
ENDPROC(memcpy_orig)
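
/*
 * Hedged C sketch of the overall structure of memcpy_orig() above.
 * Illustrative only: memcpy_orig_sketch(), copy_32_bytes() and copy_tail()
 * are names invented for the sketch, and the block copies are written as
 * plain calls rather than the exact register scheduling used by the
 * assembly:
 *
 *	void *memcpy_orig_sketch(void *dst, const void *src, size_t n)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *		bool backward = (signed char)(unsigned long)src <
 *				(signed char)(unsigned long)dst;
 *
 *		if (n >= 32 && !backward) {
 *			for (; n >= 32; n -= 32, d += 32, s += 32)
 *				copy_32_bytes(d, s);
 *		} else if (n >= 32) {
 *			while (n >= 32) {
 *				n -= 32;
 *				copy_32_bytes(d + n, s + n);
 *			}
 *		}
 *		copy_tail(d, s, n);
 *		return dst;
 *	}
 *
 * copy_32_bytes() stands for the 4x8-byte register block moves, and
 * copy_tail() for the 0..31-byte size classes handled at .Lhandle_tail.
 */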

#ifndef CONFIG_UML
/*
 * memcpy_mcsafe - memory copy with machine check exception handling
 * Note that we only catch machine checks when reading the source addresses.
 * Writes to the target are posted and don't generate machine checks.
 */
ENTRY(memcpy_mcsafe)
	cmpl $8, %edx
	/* Less than 8 bytes? Go to byte copy loop */
	jb .L_no_whole_words

	/* Check for bad alignment of source */
	testl $7, %esi
	/* Already aligned */
	jz .L_8byte_aligned

	/* Copy one byte at a time until source is 8-byte aligned */
	movl %esi, %ecx
	andl $7, %ecx
	subl $8, %ecx
	negl %ecx		/* %ecx = 8 - (src & 7): bytes to reach alignment */
	subl %ecx, %edx
.L_copy_leading_bytes:
	movb (%rsi), %al
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_copy_leading_bytes

.L_8byte_aligned:
	/* Figure out how many whole cache lines (64 bytes each) to copy */
	movl %edx, %ecx
	andl $63, %edx
	shrl $6, %ecx
	jz .L_no_whole_cache_lines

	/* Loop copying whole cache lines */
.L_cache_w0: movq (%rsi), %r8
.L_cache_w1: movq 1*8(%rsi), %r9
.L_cache_w2: movq 2*8(%rsi), %r10
.L_cache_w3: movq 3*8(%rsi), %r11
	movq %r8, (%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
.L_cache_w4: movq 4*8(%rsi), %r8
.L_cache_w5: movq 5*8(%rsi), %r9
.L_cache_w6: movq 6*8(%rsi), %r10
.L_cache_w7: movq 7*8(%rsi), %r11
	movq %r8, 4*8(%rdi)
	movq %r9, 5*8(%rdi)
	movq %r10, 6*8(%rdi)
	movq %r11, 7*8(%rdi)
	leaq 64(%rsi), %rsi
	leaq 64(%rdi), %rdi
	decl %ecx
	jnz .L_cache_w0

	/* Are there any trailing 8-byte words? */
.L_no_whole_cache_lines:
	movl %edx, %ecx
	andl $7, %edx
	shrl $3, %ecx
	jz .L_no_whole_words

	/* Copy trailing words */
.L_copy_trailing_words:
	movq (%rsi), %r8
	movq %r8, (%rdi)
	leaq 8(%rsi), %rsi
	leaq 8(%rdi), %rdi
	decl %ecx
	jnz .L_copy_trailing_words

	/* Any trailing bytes? */
.L_no_whole_words:
	andl %edx, %edx
	jz .L_done_memcpy_trap

	/* Copy trailing bytes */
	movl %edx, %ecx
.L_copy_trailing_bytes:
	movb (%rsi), %al
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_copy_trailing_bytes

	/* Copy successful. Return zero */
.L_done_memcpy_trap:
	xorq %rax, %rax
	ret
ENDPROC(memcpy_mcsafe)
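
/*
 * Hedged C sketch of the copy phases of memcpy_mcsafe() above, plus a usage
 * note. Illustrative only: memcpy_mcsafe_sketch() is a made-up name, the
 * 64-byte cache-line and trailing-word loops are merged into one qword loop,
 * and the sketch ignores the exception-table machinery that makes the real
 * routine recoverable:
 *
 *	int memcpy_mcsafe_sketch(void *dst, const void *src, size_t n)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *		size_t lead = (8 - ((unsigned long)s & 7)) & 7;
 *
 *		if (n >= 8)
 *			for (; lead; lead--, n--)
 *				*d++ = *s++;
 *		for (; n >= 8; n -= 8, d += 8, s += 8)
 *			*(u64 *)d = *(const u64 *)s;
 *		while (n--)
 *			*d++ = *s++;
 *		return 0;
 *	}
 *
 * A machine check on any of the tagged loads diverts execution to the fixup
 * below, which returns non-zero, so a typical caller checks the return value:
 *
 *	if (memcpy_mcsafe(dst, src, len))
 *		return -EIO;
 *
 * (-EIO here is just an example of a caller-chosen error code.)
 */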

	.section .fixup, "ax"
	/* Return a non-zero value for any failure */
.L_memcpy_mcsafe_fail:
	mov	$1, %rax
	ret

	.previous

	_ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
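
/*
 * For orientation, a hedged sketch: each _ASM_EXTABLE_FAULT() entry above
 * records, roughly, a tuple of relative offsets
 *
 *	struct exception_table_entry {
 *		int insn;	the instruction that may take the fault
 *		int fixup;	where to resume, here .L_memcpy_mcsafe_fail
 *		int handler;	the exception handler class to invoke
 *	};
 *
 * so a machine check raised by one of the tagged loads is turned into a
 * non-zero return from memcpy_mcsafe() instead of an unrecoverable fault.
 */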
#endif
