xref: /linux/arch/x86/lib/copy_user_64.S (revision a1c3be890440a1769ed6f822376a3e3ab0d42994)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * Functions to copy from and to user space.
 */
8
9#include <linux/linkage.h>
10#include <asm/current.h>
11#include <asm/asm-offsets.h>
12#include <asm/thread_info.h>
13#include <asm/cpufeatures.h>
14#include <asm/alternative-asm.h>
15#include <asm/asm.h>
16#include <asm/smap.h>
17#include <asm/export.h>
18#include <asm/trapnr.h>
19
/*
 * ALIGN_DESTINATION - copy single bytes until the destination is
 * 8-byte aligned.
 *
 * Input:
 * rdi destination
 * rsi source
 * edx count
 *
 * Output:
 * rdi/rsi advanced past the bytes copied here, edx reduced by the
 * same amount.  Falls through with rdi 8-byte aligned.
 *
 * Clobbers ecx, al and the flags.
 *
 * All users in this file branch away when edx < 8 before expanding
 * the macro, so the at most 7 bytes copied here never exceed the count.
 */
.macro ALIGN_DESTINATION
	/* check for bad alignment of destination */
	movl %edi,%ecx
	andl $7,%ecx
	jz 102f				/* already aligned */
	subl $8,%ecx
	negl %ecx			/* ecx = 8 - (edi & 7) bytes to copy */
	subl %ecx,%edx			/* take them off the remaining count */
100:	movb (%rsi),%al
101:	movb %al,(%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz 100b
102:
	.section .fixup,"ax"
	/* fault at 100/101: put the uncopied alignment bytes back into edx */
103:	addl %ecx,%edx			/* ecx is zerorest also */
	jmp .Lcopy_user_handle_tail
	.previous

	_ASM_EXTABLE_CPY(100b, 103b)
	_ASM_EXTABLE_CPY(101b, 103b)
	.endm
43
/*
 * copy_user_generic_unrolled - memory copy with exception handling.
 * This version is for CPUs like P4 that don't have efficient micro
 * code for rep movsq
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 *
 * Only the low 32 bits of the count (edx) are looked at.  The copy runs
 * in up to four phases: byte copy until the destination is 8-byte
 * aligned (ALIGN_DESTINATION), a 64-byte unrolled loop, an 8-byte loop
 * and a final byte loop.  Counts below 8 go straight to the byte loop.
 *
 * rcx, rdx, rsi, rdi and r8-r11 are modified.
 *
 * .L_copy_short_string is also jumped to from
 * copy_user_enhanced_fast_string for copies shorter than 64 bytes.
 */
SYM_FUNC_START(copy_user_generic_unrolled)
	ASM_STAC
	cmpl $8,%edx
	jb 20f		/* less than 8 bytes, go to byte copy loop */
	ALIGN_DESTINATION
	/* ecx = number of 64-byte chunks, edx = bytes left after them */
	movl %edx,%ecx
	andl $63,%edx
	shrl $6,%ecx
	jz .L_copy_short_string
	/* 64 bytes per iteration, as two groups of four loads + four stores */
1:	movq (%rsi),%r8
2:	movq 1*8(%rsi),%r9
3:	movq 2*8(%rsi),%r10
4:	movq 3*8(%rsi),%r11
5:	movq %r8,(%rdi)
6:	movq %r9,1*8(%rdi)
7:	movq %r10,2*8(%rdi)
8:	movq %r11,3*8(%rdi)
9:	movq 4*8(%rsi),%r8
10:	movq 5*8(%rsi),%r9
11:	movq 6*8(%rsi),%r10
12:	movq 7*8(%rsi),%r11
13:	movq %r8,4*8(%rdi)
14:	movq %r9,5*8(%rdi)
15:	movq %r10,6*8(%rdi)
16:	movq %r11,7*8(%rdi)
	leaq 64(%rsi),%rsi
	leaq 64(%rdi),%rdi
	decl %ecx
	jnz 1b
.L_copy_short_string:
	/* ecx = number of 8-byte words, edx = trailing bytes (0..7) */
	movl %edx,%ecx
	andl $7,%edx
	shrl $3,%ecx
	jz 20f
18:	movq (%rsi),%r8
19:	movq %r8,(%rdi)
	leaq 8(%rsi),%rsi
	leaq 8(%rdi),%rdi
	decl %ecx
	jnz 18b
	/* byte loop for the last edx bytes; skipped when nothing is left */
20:	andl %edx,%edx
	jz 23f
	movl %edx,%ecx
21:	movb (%rsi),%al
22:	movb %al,(%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz 21b
23:	xor %eax,%eax
	ASM_CLAC
	ret

	.section .fixup,"ax"
	/*
	 * Rebuild the number of bytes still to copy in edx and hand over to
	 * .Lcopy_user_handle_tail.  rsi/rdi are only advanced after a whole
	 * chunk has been moved, so the chunk that faulted counts as uncopied.
	 */
30:	shll $6,%ecx			/* 64-byte loop: ecx * 64 ... */
	addl %ecx,%edx			/* ... plus the sub-64 remainder */
	jmp 60f
40:	leal (%rdx,%rcx,8),%edx		/* 8-byte loop: ecx * 8 + tail bytes */
	jmp 60f
50:	movl %ecx,%edx			/* byte loop: ecx bytes left */
60:	jmp .Lcopy_user_handle_tail /* ecx is zerorest also */
	.previous

	_ASM_EXTABLE_CPY(1b, 30b)
	_ASM_EXTABLE_CPY(2b, 30b)
	_ASM_EXTABLE_CPY(3b, 30b)
	_ASM_EXTABLE_CPY(4b, 30b)
	_ASM_EXTABLE_CPY(5b, 30b)
	_ASM_EXTABLE_CPY(6b, 30b)
	_ASM_EXTABLE_CPY(7b, 30b)
	_ASM_EXTABLE_CPY(8b, 30b)
	_ASM_EXTABLE_CPY(9b, 30b)
	_ASM_EXTABLE_CPY(10b, 30b)
	_ASM_EXTABLE_CPY(11b, 30b)
	_ASM_EXTABLE_CPY(12b, 30b)
	_ASM_EXTABLE_CPY(13b, 30b)
	_ASM_EXTABLE_CPY(14b, 30b)
	_ASM_EXTABLE_CPY(15b, 30b)
	_ASM_EXTABLE_CPY(16b, 30b)
	_ASM_EXTABLE_CPY(18b, 40b)
	_ASM_EXTABLE_CPY(19b, 40b)
	_ASM_EXTABLE_CPY(21b, 50b)
	_ASM_EXTABLE_CPY(22b, 50b)
SYM_FUNC_END(copy_user_generic_unrolled)
EXPORT_SYMBOL(copy_user_generic_unrolled)
142
/* Some CPUs run faster using the string copy instructions.
 * This is also a lot simpler. Use them when possible.
 *
 * Only 4GB of copy is supported. This shouldn't be a problem
 * because the kernel normally only writes from/to page sized chunks
 * even if user space passed a longer buffer.
 * And more would be dangerous because both Intel and AMD have
 * errata with rep movsq > 4GB. If someone feels the need to fix
 * this please consider this.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 *
 * Counts of 8 or more first align the destination with
 * ALIGN_DESTINATION, then move whole 8-byte words with rep movsq and
 * the remaining 0..7 bytes with rep movsb.  Counts below 8 use only
 * the rep movsb at label 2.
 */
SYM_FUNC_START(copy_user_generic_string)
	ASM_STAC
	cmpl $8,%edx
	jb 2f		/* less than 8 bytes, go to byte copy loop */
	ALIGN_DESTINATION
	/* ecx = number of 8-byte words, edx = trailing bytes (0..7) */
	movl %edx,%ecx
	shrl $3,%ecx
	andl $7,%edx
1:	rep
	movsq
2:	movl %edx,%ecx
3:	rep
	movsb
	xorl %eax,%eax
	ASM_CLAC
	ret

	.section .fixup,"ax"
	/* fault in rep movsq: words left * 8 + trailing bytes, then as 12 */
11:	leal (%rdx,%rcx,8),%ecx
	/* fault in rep movsb: ecx already holds the bytes left */
12:	movl %ecx,%edx		/* ecx is zerorest also */
	jmp .Lcopy_user_handle_tail
	.previous

	_ASM_EXTABLE_CPY(1b, 11b)
	_ASM_EXTABLE_CPY(3b, 12b)
SYM_FUNC_END(copy_user_generic_string)
EXPORT_SYMBOL(copy_user_generic_string)
188
/*
 * Some CPUs are adding enhanced REP MOVSB/STOSB instructions.
 * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 *
 * Copies shorter than 64 bytes are not done here: they jump to
 * .L_copy_short_string inside copy_user_generic_unrolled, which also
 * provides the return path and the exception table entries for them.
 */
SYM_FUNC_START(copy_user_enhanced_fast_string)
	ASM_STAC
	cmpl $64,%edx
	jb .L_copy_short_string	/* less than 64 bytes, avoid the costly 'rep' */
	movl %edx,%ecx
1:	rep
	movsb
	xorl %eax,%eax
	ASM_CLAC
	ret

	.section .fixup,"ax"
	/* fault in rep movsb: ecx holds the bytes that were not copied */
12:	movl %ecx,%edx		/* ecx is zerorest also */
	jmp .Lcopy_user_handle_tail
	.previous

	_ASM_EXTABLE_CPY(1b, 12b)
SYM_FUNC_END(copy_user_enhanced_fast_string)
EXPORT_SYMBOL(copy_user_enhanced_fast_string)
220
/*
 * Try to copy last bytes after a fault in one of the copy routines.
 * Nothing is zeroed here; the result only reports how many bytes
 * could not be copied.
 * Since protection fault in copy_from/to_user is not a normal situation,
 * it is not necessary to optimize tail handling.
 * Don't try to copy the tail if machine check happened
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 * eax compared against X86_TRAP_MC; presumably the trap number left
 *     there by the _ASM_EXTABLE_CPY handler (not defined in this
 *     file - confirm against its definition)
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 */
SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail)
	movl %edx,%ecx
	cmp $X86_TRAP_MC,%eax		/* check if X86_TRAP_MC */
	je 3f
1:	rep movsb
	/* also reached via the extable when 1 faults; ecx = bytes left */
2:	mov %ecx,%eax
	ASM_CLAC
	ret

	/*
	 * Return zero to pretend that this copy succeeded. This
	 * is counter-intuitive, but needed to prevent the code
	 * in lib/iov_iter.c from retrying and running back into
	 * the poison cache line again. The machine check handler
	 * will ensure that a SIGBUS is sent to the task.
	 */
3:	xorl %eax,%eax
	ASM_CLAC
	ret

	_ASM_EXTABLE_CPY(1b, 2b)
SYM_CODE_END(.Lcopy_user_handle_tail)
257
/*
 * __copy_user_nocache - Uncached memory copy with exception handling
 * This will force destination out of cache for more performance.
 *
 * Note: Cached memory copy is used when destination or size is not
 * naturally aligned. That is:
 *  - Require 8-byte alignment when size is 8 bytes or larger.
 *  - Require 4-byte alignment when size is 4 bytes.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count (only the low 32 bits, edx, are looked at)
 *
 * Output:
 * eax 0 if successful; after a fault, whatever
 *     .Lcopy_user_handle_tail returns.
 *
 * rcx, rdx, rsi, rdi and r8-r11 are modified.  The non-temporal stores
 * are followed by an sfence on both the normal and the fixup path.
 */
SYM_FUNC_START(__copy_user_nocache)
	ASM_STAC

	/* If size is less than 8 bytes, go to 4-byte copy */
	cmpl $8,%edx
	jb .L_4b_nocache_copy_entry

	/* If destination is not 8-byte aligned, "cache" copy to align it */
	ALIGN_DESTINATION

	/* Set 64-byte (2 x 4x8-byte) copy count and remainder */
	movl %edx,%ecx
	andl $63,%edx
	shrl $6,%ecx
	jz .L_8b_nocache_copy_entry	/* jump if count is 0 */

	/* Perform 64-byte nocache loop-copy, as two 4x8-byte groups */
.L_4x8b_nocache_copy_loop:
1:	movq (%rsi),%r8
2:	movq 1*8(%rsi),%r9
3:	movq 2*8(%rsi),%r10
4:	movq 3*8(%rsi),%r11
5:	movnti %r8,(%rdi)
6:	movnti %r9,1*8(%rdi)
7:	movnti %r10,2*8(%rdi)
8:	movnti %r11,3*8(%rdi)
9:	movq 4*8(%rsi),%r8
10:	movq 5*8(%rsi),%r9
11:	movq 6*8(%rsi),%r10
12:	movq 7*8(%rsi),%r11
13:	movnti %r8,4*8(%rdi)
14:	movnti %r9,5*8(%rdi)
15:	movnti %r10,6*8(%rdi)
16:	movnti %r11,7*8(%rdi)
	leaq 64(%rsi),%rsi
	leaq 64(%rdi),%rdi
	decl %ecx
	jnz .L_4x8b_nocache_copy_loop

	/* Set 8-byte copy count and remainder */
.L_8b_nocache_copy_entry:
	movl %edx,%ecx
	andl $7,%edx
	shrl $3,%ecx
	jz .L_4b_nocache_copy_entry	/* jump if count is 0 */

	/* Perform 8-byte nocache loop-copy */
.L_8b_nocache_copy_loop:
20:	movq (%rsi),%r8
21:	movnti %r8,(%rdi)
	leaq 8(%rsi),%rsi
	leaq 8(%rdi),%rdi
	decl %ecx
	jnz .L_8b_nocache_copy_loop

	/* If no byte left, we're done */
.L_4b_nocache_copy_entry:
	andl %edx,%edx
	jz .L_finish_copy

	/* If destination is not 4-byte aligned, go to byte copy: */
	movl %edi,%ecx
	andl $3,%ecx
	jnz .L_1b_cache_copy_entry

	/* Set 4-byte copy count (1 or 0) and remainder */
	movl %edx,%ecx
	andl $3,%edx
	shrl $2,%ecx
	jz .L_1b_cache_copy_entry	/* jump if count is 0 */

	/* Perform 4-byte nocache copy: */
30:	movl (%rsi),%r8d
31:	movnti %r8d,(%rdi)
	leaq 4(%rsi),%rsi
	leaq 4(%rdi),%rdi

	/* If no bytes left, we're done: */
	andl %edx,%edx
	jz .L_finish_copy

	/* Perform byte "cache" loop-copy for the remainder */
.L_1b_cache_copy_entry:
	movl %edx,%ecx
.L_1b_cache_copy_loop:
40:	movb (%rsi),%al
41:	movb %al,(%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_1b_cache_copy_loop

	/* Finished copying; fence the prior stores */
.L_finish_copy:
	xorl %eax,%eax
	ASM_CLAC
	sfence
	ret

	.section .fixup,"ax"
	/*
	 * Rebuild the number of bytes still to copy in edx, fence the
	 * non-temporal stores already issued, then hand over to
	 * .Lcopy_user_handle_tail.  rsi/rdi are only advanced after a whole
	 * chunk has been moved, so the chunk that faulted counts as uncopied.
	 */
.L_fixup_4x8b_copy:
	shll $6,%ecx			/* 64-byte loop: ecx * 64 ... */
	addl %ecx,%edx			/* ... plus the sub-64 remainder */
	jmp .L_fixup_handle_tail
.L_fixup_8b_copy:
	lea (%rdx,%rcx,8),%rdx		/* 8-byte loop: ecx * 8 + tail bytes */
	jmp .L_fixup_handle_tail
.L_fixup_4b_copy:
	lea (%rdx,%rcx,4),%rdx		/* 4-byte copy: ecx * 4 + tail bytes */
	jmp .L_fixup_handle_tail
.L_fixup_1b_copy:
	movl %ecx,%edx			/* byte loop: ecx bytes left */
.L_fixup_handle_tail:
	sfence
	jmp .Lcopy_user_handle_tail
	.previous

	_ASM_EXTABLE_CPY(1b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(2b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(3b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(4b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(5b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(6b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(7b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(8b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(9b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(10b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(11b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(12b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(13b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(14b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(15b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(16b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_CPY(20b, .L_fixup_8b_copy)
	_ASM_EXTABLE_CPY(21b, .L_fixup_8b_copy)
	_ASM_EXTABLE_CPY(30b, .L_fixup_4b_copy)
	_ASM_EXTABLE_CPY(31b, .L_fixup_4b_copy)
	_ASM_EXTABLE_CPY(40b, .L_fixup_1b_copy)
	_ASM_EXTABLE_CPY(41b, .L_fixup_1b_copy)
SYM_FUNC_END(__copy_user_nocache)
EXPORT_SYMBOL(__copy_user_nocache)
408