xref: /linux/tools/arch/x86/lib/memcpy_64.S (revision e5c86679d5e864947a52fb31e45a425dea3e7fa9)
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>

/*
 * We build a jump to memcpy_orig by default, which gets NOPped out on
 * the majority of x86 CPUs (those that set REP_GOOD). In addition, on
 * CPUs which have the enhanced REP MOVSB/STOSB feature (ERMS), those
 * NOPs are changed to a jmp to memcpy_erms, which does the copy with
 * REP; MOVSB.
 */
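/*
 * Illustrative sketch (not the kernel's implementation): the ALTERNATIVE_2
 * patching below amounts to picking one of three routines once at boot,
 * based on CPU feature bits, rather than branching on every call.
 * boot_cpu_has(), X86_FEATURE_ERMS and X86_FEATURE_REP_GOOD are real kernel
 * names; pick_memcpy() is a hypothetical helper used only for this sketch,
 * in kernel context:
 *
 *	#include <stddef.h>
 *
 *	typedef void *(*memcpy_fn)(void *dst, const void *src, size_t len);
 *
 *	extern void *memcpy_orig(void *dst, const void *src, size_t len);
 *	extern void *memcpy_erms(void *dst, const void *src, size_t len);
 *	extern void *memcpy(void *dst, const void *src, size_t len);
 *
 *	static memcpy_fn pick_memcpy(void)
 *	{
 *		if (boot_cpu_has(X86_FEATURE_ERMS))
 *			return memcpy_erms;	// single REP MOVSB
 *		if (boot_cpu_has(X86_FEATURE_REP_GOOD))
 *			return memcpy;		// REP MOVSQ + byte tail
 *		return memcpy_orig;		// unrolled 64-bit moves
 *	}
 */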

.weak memcpy

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 */
ENTRY(__memcpy)
ENTRY(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

	movq %rdi, %rax
	movq %rdx, %rcx
	shrq $3, %rcx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
ENDPROC(memcpy)
ENDPROC(__memcpy)
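/*
 * Rough C equivalent of the REP_GOOD path above, shown only as a sketch
 * (the real code relies on the hardware REP string instructions): the count
 * is split into 8-byte words for REP MOVSQ and a 0-7 byte tail for
 * REP MOVSB, and the original destination is returned in rax. The function
 * name is made up for illustration:
 *
 *	#include <stddef.h>
 *
 *	static void *memcpy_rep_sketch(void *dst, const void *src, size_t len)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *		size_t words = len >> 3;	// rep movsq count (rcx)
 *		size_t tail  = len & 7;		// rep movsb count (edx -> ecx)
 *		size_t i;
 *
 *		for (i = 0; i < words * 8; i++)		// what rep movsq copies
 *			d[i] = s[i];
 *		for (; i < words * 8 + tail; i++)	// what rep movsb copies
 *			d[i] = s[i];
 *		return dst;				// rax = original destination
 *	}
 */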

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
ENTRY(memcpy_erms)
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	ret
ENDPROC(memcpy_erms)
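/*
 * Sketch only: with ERMS the whole copy is a single REP MOVSB and the CPU
 * optimizes the string operation internally. Functionally it is just a
 * byte loop:
 *
 *	for (size_t i = 0; i < len; i++)
 *		((unsigned char *)dst)[i] = ((const unsigned char *)src)[i];
 *	return dst;
 */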

ENTRY(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * Check whether a memory false dependence could occur, then jump
	 * to the corresponding copy mode.
	 */
	cmp  %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20,	%rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi),	%r8
	movq 1*8(%rsi),	%r9
	movq 2*8(%rsi),	%r10
	movq 3*8(%rsi),	%r11
	leaq 4*8(%rsi),	%rsi

	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	2*8(%rdi)
	movq %r11,	3*8(%rdi)
	leaq 4*8(%rdi),	%rdi
	jae  .Lcopy_forward_loop
	addl $0x20,	%edx
	jmp  .Lhandle_tail
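	/*
	 * Illustrative sketch of the forward path, assuming len >= 0x20
	 * (the function and variable names are made up, and strict-aliasing
	 * and alignment concerns are ignored for brevity):
	 *
	 *	#include <stddef.h>
	 *	#include <stdint.h>
	 *
	 *	static void copy_forward_sketch(unsigned char *d,
	 *					const unsigned char *s,
	 *					size_t len)
	 *	{
	 *		size_t blocks = len >> 5;	// full 32-byte blocks
	 *
	 *		while (blocks--) {
	 *			uint64_t r0 = ((const uint64_t *)s)[0];
	 *			uint64_t r1 = ((const uint64_t *)s)[1];
	 *			uint64_t r2 = ((const uint64_t *)s)[2];
	 *			uint64_t r3 = ((const uint64_t *)s)[3];
	 *
	 *			((uint64_t *)d)[0] = r0;
	 *			((uint64_t *)d)[1] = r1;
	 *			((uint64_t *)d)[2] = r2;
	 *			((uint64_t *)d)[3] = r3;
	 *			s += 32;
	 *			d += 32;
	 *		}
	 *		// len & 31 bytes remain; the tail code below handles them
	 *	}
	 */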

.Lcopy_backward:
	/*
	 * Calculate the copy position from the tail.
	 */
	addq %rdx,	%rsi
	addq %rdx,	%rdi
	subq $0x20,	%rdx
	/*
	 * At most 3 ALU operations can issue in one cycle, so pad with
	 * NOPs within the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20,	%rdx
	movq -1*8(%rsi),	%r8
	movq -2*8(%rsi),	%r9
	movq -3*8(%rsi),	%r10
	movq -4*8(%rsi),	%r11
	leaq -4*8(%rsi),	%rsi
	movq %r8,		-1*8(%rdi)
	movq %r9,		-2*8(%rdi)
	movq %r10,		-3*8(%rdi)
	movq %r11,		-4*8(%rdi)
	leaq -4*8(%rdi),	%rdi
	jae  .Lcopy_backward_loop

	/*
	 * Calculate the copy position back to the head.
	 */
	addl $0x20,	%edx
	subq %rdx,	%rsi
	subq %rdx,	%rdi
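	/*
	 * Sketch of the backward path (made-up names, same includes and
	 * caveats as the forward sketch): the pointers are first moved to
	 * the end of both buffers, full 32-byte blocks are copied from high
	 * to low addresses, and the remaining front bytes are left for the
	 * shared tail code:
	 *
	 *	static void copy_backward_sketch(unsigned char *d,
	 *					 const unsigned char *s,
	 *					 size_t len)
	 *	{
	 *		size_t blocks = len >> 5;
	 *		const unsigned char *se = s + len;	// one past the end
	 *		unsigned char *de = d + len;
	 *
	 *		while (blocks--) {
	 *			se -= 32;
	 *			de -= 32;
	 *			uint64_t r0 = ((const uint64_t *)se)[0];
	 *			uint64_t r1 = ((const uint64_t *)se)[1];
	 *			uint64_t r2 = ((const uint64_t *)se)[2];
	 *			uint64_t r3 = ((const uint64_t *)se)[3];
	 *			((uint64_t *)de)[0] = r0;
	 *			((uint64_t *)de)[1] = r1;
	 *			((uint64_t *)de)[2] = r2;
	 *			((uint64_t *)de)[3] = r3;
	 *		}
	 *		// the first len & 31 bytes go to the tail code below
	 *	}
	 */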
.Lhandle_tail:
	cmpl $16,	%edx
	jb   .Lless_16bytes

	/*
	 * Copy 16 to 31 bytes: the first two and the last two 8-byte
	 * words, which may overlap in the middle.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi),	%r9
	movq -2*8(%rsi, %rdx),	%r10
	movq -1*8(%rsi, %rdx),	%r11
	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	-2*8(%rdi, %rdx)
	movq %r11,	-1*8(%rdi, %rdx)
	retq
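	/*
	 * The tail cases all use the same trick: load both the head and the
	 * tail of the remaining range before storing anything, so the two
	 * halves may overlap without clobbering unread source data and no
	 * byte loop is needed. Hypothetical sketch for the 16-31 byte case
	 * (same includes and caveats as the earlier sketches); the 8-15 and
	 * 4-7 byte cases below follow the same pattern with narrower loads:
	 *
	 *	static void copy_tail_16_31_sketch(unsigned char *d,
	 *					   const unsigned char *s,
	 *					   size_t len)
	 *	{
	 *		uint64_t a = ((const uint64_t *)s)[0];		// bytes 0-7
	 *		uint64_t b = ((const uint64_t *)s)[1];		// bytes 8-15
	 *		uint64_t c = *(const uint64_t *)(s + len - 16);	// last 16 bytes,
	 *		uint64_t e = *(const uint64_t *)(s + len - 8);	// possibly overlapping
	 *
	 *		((uint64_t *)d)[0] = a;
	 *		((uint64_t *)d)[1] = b;
	 *		*(uint64_t *)(d + len - 16) = c;
	 *		*(uint64_t *)(d + len - 8)  = e;
	 *	}
	 */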
	.p2align 4
.Lless_16bytes:
	cmpl $8,	%edx
	jb   .Lless_8bytes
	/*
	 * Copy 8 to 15 bytes: first and last 8-byte word, possibly
	 * overlapping.
	 */
	movq 0*8(%rsi),	%r8
	movq -1*8(%rsi, %rdx),	%r9
	movq %r8,	0*8(%rdi)
	movq %r9,	-1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4,	%edx
	jb   .Lless_3bytes

	/*
	 * Copy 4 to 7 bytes: first and last 4-byte word, possibly
	 * overlapping.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Copy 1 to 3 bytes.
	 */
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	retq
ENDPROC(memcpy_orig)

#ifndef CONFIG_UML
/*
 * memcpy_mcsafe_unrolled - memory copy with machine check exception handling
 * Note that machine checks are only caught when reading the source
 * addresses; writes to the target are posted and don't generate machine
 * checks.
 */
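/*
 * Caller-visible behaviour, sketched as a hypothetical C prototype and
 * usage (the zero/-EFAULT return values are what the code below and its
 * fixup section implement; the exact declaration and the caller policy
 * shown here are assumptions for illustration):
 *
 *	// returns 0 on success, -EFAULT if a machine check fired while
 *	// reading from src; dst may be partially written on failure
 *	extern int memcpy_mcsafe_unrolled(void *dst, const void *src, size_t len);
 *
 *	if (memcpy_mcsafe_unrolled(buf, pmem_addr, len))
 *		return -EIO;	// hypothetical caller policy on a poisoned source
 */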
ENTRY(memcpy_mcsafe_unrolled)
	cmpl $8, %edx
	/* Less than 8 bytes? Go to byte copy loop */
	jb .L_no_whole_words

	/* Check for bad alignment of source */
	testl $7, %esi
	/* Already aligned */
	jz .L_8byte_aligned

	/* Copy one byte at a time until source is 8-byte aligned */
	movl %esi, %ecx
	andl $7, %ecx
	subl $8, %ecx
	negl %ecx
	subl %ecx, %edx
.L_copy_leading_bytes:
	movb (%rsi), %al
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_copy_leading_bytes
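	/*
	 * Sketch of the leading-byte count computed above (made-up variable
	 * names, dst/src as unsigned char pointers; the source is known to
	 * be unaligned at this point, so the result is 1-7):
	 *
	 *	size_t lead = 8 - ((uintptr_t)src & 7);	// ecx = 8 - (esi & 7)
	 *	len -= lead;				// edx -= ecx
	 *	while (lead--)				// .L_copy_leading_bytes
	 *		*dst++ = *src++;
	 */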

.L_8byte_aligned:
	/* Figure out how many whole cache lines (64 bytes) to copy */
	movl %edx, %ecx
	andl $63, %edx
	shrl $6, %ecx
	jz .L_no_whole_cache_lines

	/* Loop copying whole cache lines */
.L_cache_w0: movq (%rsi), %r8
.L_cache_w1: movq 1*8(%rsi), %r9
.L_cache_w2: movq 2*8(%rsi), %r10
.L_cache_w3: movq 3*8(%rsi), %r11
	movq %r8, (%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
.L_cache_w4: movq 4*8(%rsi), %r8
.L_cache_w5: movq 5*8(%rsi), %r9
.L_cache_w6: movq 6*8(%rsi), %r10
.L_cache_w7: movq 7*8(%rsi), %r11
	movq %r8, 4*8(%rdi)
	movq %r9, 5*8(%rdi)
	movq %r10, 6*8(%rdi)
	movq %r11, 7*8(%rdi)
	leaq 64(%rsi), %rsi
	leaq 64(%rdi), %rdi
	decl %ecx
	jnz .L_cache_w0
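	/*
	 * Sketch of the cache-line loop above (same caveats as the earlier
	 * sketches): the remaining length is split into whole 64-byte lines
	 * and a sub-line remainder, and each line is copied with 64-bit
	 * loads and stores (simplified here to eight loads then eight
	 * stores); every load site above has a fixup entry so a machine
	 * check on the source turns into an -EFAULT return:
	 *
	 *	size_t lines = len >> 6;	// ecx = whole cache lines
	 *	len &= 63;			// edx = bytes left after the lines
	 *	while (lines--) {
	 *		uint64_t tmp[8];
	 *		for (int i = 0; i < 8; i++)
	 *			tmp[i] = ((const uint64_t *)src)[i];	// may take a #MC
	 *		for (int i = 0; i < 8; i++)
	 *			((uint64_t *)dst)[i] = tmp[i];
	 *		src += 64;
	 *		dst += 64;
	 *	}
	 */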

	/* Are there any trailing 8-byte words? */
.L_no_whole_cache_lines:
	movl %edx, %ecx
	andl $7, %edx
	shrl $3, %ecx
	jz .L_no_whole_words

	/* Copy trailing words */
.L_copy_trailing_words:
	movq (%rsi), %r8
	movq %r8, (%rdi)
	leaq 8(%rsi), %rsi
	leaq 8(%rdi), %rdi
	decl %ecx
	jnz .L_copy_trailing_words

	/* Any trailing bytes? */
.L_no_whole_words:
	andl %edx, %edx
	jz .L_done_memcpy_trap

	/* Copy trailing bytes */
	movl %edx, %ecx
.L_copy_trailing_bytes:
	movb (%rsi), %al
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_copy_trailing_bytes

	/* Copy successful. Return zero */
.L_done_memcpy_trap:
	xorq %rax, %rax
	ret
ENDPROC(memcpy_mcsafe_unrolled)

	.section .fixup, "ax"
	/* Return -EFAULT for any failure */
.L_memcpy_mcsafe_fail:
	mov	$-EFAULT, %rax
	ret

	.previous

	_ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
#endif