xref: /linux/arch/x86/lib/memcpy_64.S (revision 5c35a02c545a7bbe77f3a1ae337d9e29beed079b)
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/mcsafe_test.h>
#include <asm/alternative-asm.h>
#include <asm/export.h>

/*
 * We build a jump to memcpy_orig by default; it gets NOPped out on
 * the majority of x86 CPUs, which set REP_GOOD. On CPUs that also
 * have the enhanced REP MOVSB/STOSB feature (ERMS), those NOPs are
 * instead changed to a jmp to memcpy_erms, which does the whole copy
 * with a single REP MOVSB.
 */
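/*
 * For orientation, the ALTERNATIVE_2 patching below effectively selects
 * one of three implementations at boot.  A hedged C-style sketch of the
 * resulting dispatch (illustration only: memcpy_rep_good() is a made-up
 * name for the inline REP MOVSQ body below, and in reality the choice is
 * patched in place by apply_alternatives(), not branched at runtime):
 *
 *	void *memcpy_sketch(void *dst, const void *src, size_t len)
 *	{
 *		if (boot_cpu_has(X86_FEATURE_ERMS))
 *			return memcpy_erms(dst, src, len);	// REP MOVSB
 *		if (boot_cpu_has(X86_FEATURE_REP_GOOD))
 *			return memcpy_rep_good(dst, src, len);	// REP MOVSQ + MOVSB
 *		return memcpy_orig(dst, src, len);		// unrolled copy
 *	}
 */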

.weak memcpy

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 */
ENTRY(__memcpy)
ENTRY(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

	movq %rdi, %rax
	movq %rdx, %rcx
	shrq $3, %rcx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
ENDPROC(memcpy)
ENDPROC(__memcpy)
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(__memcpy)
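/*
 * For reference, a minimal C sketch of what the REP_GOOD body above does
 * (memcpy_rep_good() is only an illustrative name; this models REP MOVSQ
 * on the quadword count followed by REP MOVSB on the remainder):
 *
 *	void *memcpy_rep_good(void *dst, const void *src, size_t len)
 *	{
 *		unsigned long *dq = dst;
 *		const unsigned long *sq = src;
 *		unsigned char *db;
 *		const unsigned char *sb;
 *		size_t quads = len >> 3;	// shrq $3, %rcx
 *		size_t rest  = len & 7;		// andl $7, %edx
 *
 *		while (quads--)			// rep movsq
 *			*dq++ = *sq++;
 *		db = (unsigned char *)dq;
 *		sb = (const unsigned char *)sq;
 *		while (rest--)			// rep movsb
 *			*db++ = *sb++;
 *		return dst;			// rax = original destination
 *	}
 */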

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
ENTRY(memcpy_erms)
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	ret
ENDPROC(memcpy_erms)
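/*
 * memcpy_erms() above is semantically just the byte-by-byte copy
 * sketched below; on ERMS CPUs the single REP MOVSB is microcoded to
 * move data in much larger chunks internally (hedged model only,
 * memcpy_erms_model() is not a real symbol):
 *
 *	void *memcpy_erms_model(void *dst, const void *src, size_t len)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		while (len--)			// rep movsb
 *			*d++ = *s++;
 *		return dst;
 *	}
 */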

ENTRY(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * Check whether a memory false dependence (destination stores
	 * aliasing subsequent source loads) could occur, then jump to
	 * the corresponding forward or backward copy mode.
	 */
	cmp  %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20,	%rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi),	%r8
	movq 1*8(%rsi),	%r9
	movq 2*8(%rsi),	%r10
	movq 3*8(%rsi),	%r11
	leaq 4*8(%rsi),	%rsi

	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	2*8(%rdi)
	movq %r11,	3*8(%rdi)
	leaq 4*8(%rdi),	%rdi
	jae  .Lcopy_forward_loop
	addl $0x20,	%edx
	jmp  .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx,	%rsi
	addq %rdx,	%rdi
	subq $0x20,	%rdx
	/*
	 * At most 3 ALU operations can issue in one cycle,
	 * so pad with NOPs to keep the loop in the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20,	%rdx
	movq -1*8(%rsi),	%r8
	movq -2*8(%rsi),	%r9
	movq -3*8(%rsi),	%r10
	movq -4*8(%rsi),	%r11
	leaq -4*8(%rsi),	%rsi
	movq %r8,		-1*8(%rdi)
	movq %r9,		-2*8(%rdi)
	movq %r10,		-3*8(%rdi)
	movq %r11,		-4*8(%rdi)
	leaq -4*8(%rdi),	%rdi
	jae  .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20,	%edx
	subq %rdx,	%rsi
	subq %rdx,	%rdi
.Lhandle_tail:
	cmpl $16,	%edx
	jb   .Lless_16bytes

	/*
	 * Move 16 to 31 bytes: two overlapping 2x8-byte moves.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi),	%r9
	movq -2*8(%rsi, %rdx),	%r10
	movq -1*8(%rsi, %rdx),	%r11
	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	-2*8(%rdi, %rdx)
	movq %r11,	-1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8,	%edx
	jb   .Lless_8bytes
	/*
	 * Move 8 to 15 bytes: two overlapping 8-byte moves.
	 */
	movq 0*8(%rsi),	%r8
	movq -1*8(%rsi, %rdx),	%r9
	movq %r8,	0*8(%rdi)
	movq %r9,	-1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4,	%edx
	jb   .Lless_3bytes

	/*
	 * Move 4 to 7 bytes: two overlapping 4-byte moves.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Move 1 to 3 bytes.
	 */
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	retq
ENDPROC(memcpy_orig)
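/*
 * The tail handling in memcpy_orig above avoids a byte loop by using two
 * overlapping moves: one anchored at the start of the region and one
 * anchored at its end, with all loads issued before the stores.  A hedged
 * C sketch of that trick for the 8..15-byte case (copy_8_to_15() is an
 * illustrative helper, not a symbol defined in the kernel):
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void copy_8_to_15(void *dst, const void *src, size_t len)
 *	{
 *		uint64_t head, tail;
 *
 *		// Caller guarantees 8 <= len <= 15.
 *		memcpy(&head, src, 8);				// bytes 0..7
 *		memcpy(&tail, (const char *)src + len - 8, 8);	// last 8 bytes
 *		memcpy(dst, &head, 8);
 *		memcpy((char *)dst + len - 8, &tail, 8);
 *	}
 *
 * Because both loads happen before either store, the copy is correct even
 * though the two 8-byte windows overlap in the middle; the 16..31-byte and
 * 4..7-byte cases use the same idea with different widths.
 */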

#ifndef CONFIG_UML

MCSAFE_TEST_CTL

/*
 * __memcpy_mcsafe - memory copy with machine check exception handling
 * Note that we only catch machine checks when reading the source addresses.
 * Writes to target are posted and don't generate machine checks.
 */
ENTRY(__memcpy_mcsafe)
	cmpl $8, %edx
	/* Less than 8 bytes? Go to byte copy loop */
	jb .L_no_whole_words

	/* Check for bad alignment of source */
	testl $7, %esi
	/* Already aligned */
	jz .L_8byte_aligned

	/* Copy one byte at a time until source is 8-byte aligned */
	movl %esi, %ecx
	andl $7, %ecx
	subl $8, %ecx
	negl %ecx
	subl %ecx, %edx
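	/*
	 * At this point %ecx = 8 - (src & 7), i.e. the number of leading
	 * bytes needed to reach 8-byte source alignment, and that count
	 * has already been subtracted from the remaining length in %edx.
	 * Worked example: src & 7 == 3  ->  %ecx = 8 - 3 = 5 leading bytes.
	 */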
.L_read_leading_bytes:
	movb (%rsi), %al
	MCSAFE_TEST_SRC %rsi 1 .E_leading_bytes
	MCSAFE_TEST_DST %rdi 1 .E_leading_bytes
.L_write_leading_bytes:
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_read_leading_bytes

.L_8byte_aligned:
	movl %edx, %ecx
	andl $7, %edx
	shrl $3, %ecx
	jz .L_no_whole_words

.L_read_words:
	movq (%rsi), %r8
	MCSAFE_TEST_SRC %rsi 8 .E_read_words
	MCSAFE_TEST_DST %rdi 8 .E_write_words
.L_write_words:
	movq %r8, (%rdi)
	addq $8, %rsi
	addq $8, %rdi
	decl %ecx
	jnz .L_read_words

	/* Any trailing bytes? */
.L_no_whole_words:
	andl %edx, %edx
	jz .L_done_memcpy_trap

	/* Copy trailing bytes */
	movl %edx, %ecx
.L_read_trailing_bytes:
	movb (%rsi), %al
	MCSAFE_TEST_SRC %rsi 1 .E_trailing_bytes
	MCSAFE_TEST_DST %rdi 1 .E_trailing_bytes
.L_write_trailing_bytes:
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_read_trailing_bytes

	/* Copy successful. Return zero */
.L_done_memcpy_trap:
	xorq %rax, %rax
	ret
ENDPROC(__memcpy_mcsafe)
EXPORT_SYMBOL_GPL(__memcpy_mcsafe)
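/*
 * Summary of the __memcpy_mcsafe() contract, plus a hedged C-level model
 * (illustration only: memcpy_mcsafe_model() is a made-up name, and the
 * real fault handling is done via the exception table entries and the
 * .fixup code below, not a per-byte check):
 *
 *	// Model: fault_at is the offset of the first byte whose access
 *	// would trap, or (unsigned long)-1 if no fault occurs.  Returns 0
 *	// on success, or the number of bytes NOT copied on a fault.
 *	unsigned long memcpy_mcsafe_model(char *dst, const char *src,
 *					  unsigned long len,
 *					  unsigned long fault_at)
 *	{
 *		unsigned long done;
 *
 *		for (done = 0; done < len; done++) {
 *			if (done == fault_at)
 *				return len - done;	// bytes not copied
 *			dst[done] = src[done];
 *		}
 *		return 0;				// full copy succeeded
 *	}
 *
 * The real code copies leading bytes to align the source, then whole
 * 8-byte words, then trailing bytes, so the fixup handlers reconstruct
 * the not-copied count from whichever phase faulted.
 */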

	.section .fixup, "ax"
	/*
	 * Return number of bytes not copied for any failure. Note that
	 * there is no "tail" handling since the source buffer is 8-byte
	 * aligned and poison is cacheline aligned.
	 */
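	/*
	 * Register state at each fixup entry point below:
	 *  - .E_read_words:     %ecx = whole 8-byte words not yet copied,
	 *                       %edx = trailing byte count; %ecx is scaled
	 *                       to bytes (shll $3) and we fall through.
	 *  - .E_leading_bytes:  %ecx = bytes left in the current phase,
	 *                       %edx = bytes remaining after it; their sum
	 *                       is the not-copied total.
	 *  - .E_trailing_bytes: %ecx is already the not-copied total.
	 * Worked example: a read fault with 2 whole words and 5 trailing
	 * bytes left returns 2*8 + 5 = 21.
	 */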
.E_read_words:
	shll	$3, %ecx
.E_leading_bytes:
	addl	%edx, %ecx
.E_trailing_bytes:
	mov	%ecx, %eax
	ret

	/*
	 * For write fault handling, given the destination is unaligned,
	 * we handle faults on multi-byte writes with a byte-by-byte
	 * copy up to the write-protected page.
	 */
.E_write_words:
	shll	$3, %ecx
	addl	%edx, %ecx
	movl	%ecx, %edx
	jmp mcsafe_handle_tail

	.previous

	_ASM_EXTABLE_FAULT(.L_read_leading_bytes, .E_leading_bytes)
	_ASM_EXTABLE_FAULT(.L_read_words, .E_read_words)
	_ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .E_trailing_bytes)
	_ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes)
	_ASM_EXTABLE(.L_write_words, .E_write_words)
	_ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes)
#endif