xref: /linux/arch/x86/lib/copy_user_64.S (revision e6f2a617ac53bc0753b885ffb94379ff48b2e2df)
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
4 * Copyright 2002 Andi Kleen, SuSE Labs.
5 *
6 * Functions to copy from and to user space.
7 */
8
9#include <linux/linkage.h>
10#include <asm/current.h>
11#include <asm/asm-offsets.h>
12#include <asm/thread_info.h>
13#include <asm/cpufeatures.h>
14#include <asm/alternative-asm.h>
15#include <asm/asm.h>
16#include <asm/smap.h>
17#include <asm/export.h>
18
/*
 * ALIGN_DESTINATION - advance a copy until the destination is 8-byte aligned.
 *
 * Expects rdi = destination, rsi = source, edx = byte count. Every user in
 * this file branches away for counts below 8 before expanding the macro,
 * so the up-to-7 bytes consumed here never exceed edx.
 *
 * When rdi is not 8-byte aligned, 8 - (rdi & 7) bytes are copied one at a
 * time, rsi/rdi are advanced past them and edx is reduced accordingly.
 * eax (al) and ecx are used as scratch.
 *
 * Both byte accesses have exception-table entries pointing at label 103,
 * which adds the bytes this loop has not copied yet (ecx) back onto edx and
 * hands the remaining count to .Lcopy_user_handle_tail.
 */
.macro ALIGN_DESTINATION
	movl	%edi,%ecx
	andl	$7,%ecx			/* ecx = misalignment of rdi */
	jz	102f			/* already aligned, nothing to do */
	negl	%ecx
	addl	$8,%ecx			/* ecx = 8 - misalignment */
	subl	%ecx,%edx		/* those bytes come off the count */
100:	movb	(%rsi),%al
101:	movb	%al,(%rdi)
	incq	%rsi
	incq	%rdi
	decl	%ecx
	jnz	100b
102:
	.section .fixup,"ax"
103:	addl	%ecx,%edx		/* edx = bytes still to copy */
	jmp	.Lcopy_user_handle_tail
	.previous

	_ASM_EXTABLE_UA(100b, 103b)
	_ASM_EXTABLE_UA(101b, 103b)
	.endm
42
/*
 * copy_user_generic_unrolled - memory copy with exception handling, done
 * with plain integer loads and stores.
 * This version is for CPUs like P4 that don't have efficient microcode
 * for rep movsq.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count (only the low 32 bits, edx, are used)
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 *
 * Scratch registers: rcx, r8-r11 (rsi, rdi and rdx are advanced/consumed).
 *
 * Numeric labels mark the instructions that may fault; the named
 * .Lunrolled_fixup_* labels recompute the number of bytes left in edx
 * before handing over to .Lcopy_user_handle_tail.
 */
SYM_FUNC_START(copy_user_generic_unrolled)
	ASM_STAC
	cmpl	$8,%edx
	jb	.Lunrolled_bytes	/* less than 8 bytes: byte loop only */
	ALIGN_DESTINATION

	/* ecx = number of 64-byte blocks, edx = leftover bytes (0..63) */
	movl	%edx,%ecx
	andl	$63,%edx
	shrl	$6,%ecx
	jz	.L_copy_short_string

	/*
	 * One 64-byte block per iteration, as two batches of four quadwords.
	 * rsi/rdi only move once the whole block is done, so a fault anywhere
	 * in the block makes the tail handler redo it from its start.
	 */
.Lunrolled_64b_loop:
1:	movq	(%rsi),%r8
2:	movq	1*8(%rsi),%r9
3:	movq	2*8(%rsi),%r10
4:	movq	3*8(%rsi),%r11
5:	movq	%r8,(%rdi)
6:	movq	%r9,1*8(%rdi)
7:	movq	%r10,2*8(%rdi)
8:	movq	%r11,3*8(%rdi)
9:	movq	4*8(%rsi),%r8
10:	movq	5*8(%rsi),%r9
11:	movq	6*8(%rsi),%r10
12:	movq	7*8(%rsi),%r11
13:	movq	%r8,4*8(%rdi)
14:	movq	%r9,5*8(%rdi)
15:	movq	%r10,6*8(%rdi)
16:	movq	%r11,7*8(%rdi)
	leaq	64(%rsi),%rsi
	leaq	64(%rdi),%rdi
	decl	%ecx
	jnz	.Lunrolled_64b_loop

	/*
	 * Fewer than 64 bytes left in edx. copy_user_enhanced_fast_string
	 * also jumps here for its short copies.
	 * ecx = number of quadwords, edx = leftover bytes (0..7).
	 */
.L_copy_short_string:
	movl	%edx,%ecx
	andl	$7,%edx
	shrl	$3,%ecx
	jz	.Lunrolled_bytes
.Lunrolled_8b_loop:
18:	movq	(%rsi),%r8
19:	movq	%r8,(%rdi)
	leaq	8(%rsi),%rsi
	leaq	8(%rdi),%rdi
	decl	%ecx
	jnz	.Lunrolled_8b_loop

	/* Copy whatever is left in edx one byte at a time. */
.Lunrolled_bytes:
	andl	%edx,%edx
	jz	.Lunrolled_done
	movl	%edx,%ecx
.Lunrolled_1b_loop:
21:	movb	(%rsi),%al
22:	movb	%al,(%rdi)
	incq	%rsi
	incq	%rdi
	decl	%ecx
	jnz	.Lunrolled_1b_loop

.Lunrolled_done:
	xorl	%eax,%eax		/* everything copied */
	ASM_CLAC
	ret

	.section .fixup,"ax"
	/* Fault in the 64-byte loop: edx += 64 * (blocks left, incl. this one) */
.Lunrolled_fixup_64b:
	shll	$6,%ecx
	addl	%ecx,%edx
	jmp	.Lunrolled_fixup_tail
	/* Fault in the 8-byte loop: edx += 8 * (quadwords left) */
.Lunrolled_fixup_8b:
	leal	(%rdx,%rcx,8),%edx
	jmp	.Lunrolled_fixup_tail
	/* Fault in the byte loop: ecx already is the byte count left */
.Lunrolled_fixup_1b:
	movl	%ecx,%edx
.Lunrolled_fixup_tail:
	jmp	.Lcopy_user_handle_tail
	.previous

	_ASM_EXTABLE_UA(1b, .Lunrolled_fixup_64b)
	_ASM_EXTABLE_UA(2b, .Lunrolled_fixup_64b)
	_ASM_EXTABLE_UA(3b, .Lunrolled_fixup_64b)
	_ASM_EXTABLE_UA(4b, .Lunrolled_fixup_64b)
	_ASM_EXTABLE_UA(5b, .Lunrolled_fixup_64b)
	_ASM_EXTABLE_UA(6b, .Lunrolled_fixup_64b)
	_ASM_EXTABLE_UA(7b, .Lunrolled_fixup_64b)
	_ASM_EXTABLE_UA(8b, .Lunrolled_fixup_64b)
	_ASM_EXTABLE_UA(9b, .Lunrolled_fixup_64b)
	_ASM_EXTABLE_UA(10b, .Lunrolled_fixup_64b)
	_ASM_EXTABLE_UA(11b, .Lunrolled_fixup_64b)
	_ASM_EXTABLE_UA(12b, .Lunrolled_fixup_64b)
	_ASM_EXTABLE_UA(13b, .Lunrolled_fixup_64b)
	_ASM_EXTABLE_UA(14b, .Lunrolled_fixup_64b)
	_ASM_EXTABLE_UA(15b, .Lunrolled_fixup_64b)
	_ASM_EXTABLE_UA(16b, .Lunrolled_fixup_64b)
	_ASM_EXTABLE_UA(18b, .Lunrolled_fixup_8b)
	_ASM_EXTABLE_UA(19b, .Lunrolled_fixup_8b)
	_ASM_EXTABLE_UA(21b, .Lunrolled_fixup_1b)
	_ASM_EXTABLE_UA(22b, .Lunrolled_fixup_1b)
SYM_FUNC_END(copy_user_generic_unrolled)
EXPORT_SYMBOL(copy_user_generic_unrolled)
141
/*
 * copy_user_generic_string - memory copy with exception handling, built on
 * the string copy instructions.
 *
 * Some CPUs run faster using the string copy instructions, and this is
 * also a lot simpler than the unrolled variant. Use it when possible.
 *
 * Only 4GB of copy is supported (the count is taken from edx). This
 * shouldn't be a problem because the kernel normally only writes from/to
 * page sized chunks even if user space passed a longer buffer.
 * And more would be dangerous because both Intel and AMD have
 * errata with rep movsq > 4GB. If someone feels the need to lift this
 * limit, please take those errata into account.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 */
SYM_FUNC_START(copy_user_generic_string)
	ASM_STAC
	cmpl	$8,%edx
	jb	.Lstring_bytes		/* less than 8 bytes: rep movsb only */
	ALIGN_DESTINATION

	/* ecx = number of quadwords, edx = leftover bytes (0..7) */
	movl	%edx,%ecx
	shrl	$3,%ecx
	andl	$7,%edx
.Lstring_movsq:
	rep movsq
.Lstring_bytes:
	movl	%edx,%ecx
.Lstring_movsb:
	rep movsb
	xorl	%eax,%eax		/* everything copied */
	ASM_CLAC
	ret

	.section .fixup,"ax"
	/* rep movsq faulted with rcx quadwords left: bytes = edx + 8 * rcx */
.Lstring_fixup_movsq:
	leal	(%rdx,%rcx,8),%ecx
	/* rep movsb faulted (or fall-through from above): ecx = bytes left */
.Lstring_fixup_movsb:
	movl	%ecx,%edx
	jmp	.Lcopy_user_handle_tail
	.previous

	_ASM_EXTABLE_UA(.Lstring_movsq, .Lstring_fixup_movsq)
	_ASM_EXTABLE_UA(.Lstring_movsb, .Lstring_fixup_movsb)
SYM_FUNC_END(copy_user_generic_string)
EXPORT_SYMBOL(copy_user_generic_string)
187
/*
 * copy_user_enhanced_fast_string - memory copy with exception handling
 * using a single rep movsb.
 *
 * Some CPUs are adding enhanced REP MOVSB/STOSB instructions.
 * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
 * Copies shorter than 64 bytes skip the 'rep' and are handed to the
 * .L_copy_short_string code inside copy_user_generic_unrolled instead.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count (only edx is used)
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 */
SYM_FUNC_START(copy_user_enhanced_fast_string)
	ASM_STAC
	cmpl	$64,%edx
	jb	.L_copy_short_string	/* less than 64 bytes, avoid the costly 'rep' */
	movl	%edx,%ecx
.Lerms_movsb:
	rep movsb
	xorl	%eax,%eax		/* everything copied */
	ASM_CLAC
	ret

	.section .fixup,"ax"
	/* rep movsb faulted: ecx holds the bytes not copied yet */
.Lerms_fixup:
	movl	%ecx,%edx
	jmp	.Lcopy_user_handle_tail
	.previous

	_ASM_EXTABLE_UA(.Lerms_movsb, .Lerms_fixup)
SYM_FUNC_END(copy_user_enhanced_fast_string)
EXPORT_SYMBOL(copy_user_enhanced_fast_string)
219
/*
 * .Lcopy_user_handle_tail - common landing point of the copy fixups.
 *
 * Retries the remaining bytes with a byte-granular rep movsb so that as
 * much as possible gets copied, then returns how many bytes are still
 * left. Nothing is zeroed here.
 * Since a protection fault in copy_from/to_user is not a normal situation,
 * it is not necessary to optimize tail handling.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count (bytes still to copy; only edx is used)
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 */
SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail)
	movl	%edx,%ecx
.Lhandle_tail_movsb:
	rep movsb
	/* Reached normally with ecx == 0, or via the extable with ecx left */
.Lhandle_tail_done:
	movl	%ecx,%eax
	ASM_CLAC
	ret

	_ASM_EXTABLE_UA(.Lhandle_tail_movsb, .Lhandle_tail_done)
SYM_CODE_END(.Lcopy_user_handle_tail)
242
/*
 * __copy_user_nocache - Uncached memory copy with exception handling
 * The aligned bulk of the data is written with non-temporal (movnti)
 * stores, which is meant to keep the destination out of the cache for
 * more performance.
 *
 * Note: Cached (ordinary byte) copy is used when destination or size is
 * not naturally aligned. That is:
 *  - 8-byte movnti requires an 8-byte aligned destination and a size of
 *    8 bytes or larger; ALIGN_DESTINATION byte-copies up to that point.
 *  - a single 4-byte movnti is used when 4..7 bytes remain and the
 *    destination is 4-byte aligned.
 *  - anything else is copied byte by byte.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count (only edx is used)
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 */
SYM_FUNC_START(__copy_user_nocache)
	ASM_STAC

	/* Less than 8 bytes: skip straight to the 4-byte/byte handling */
	cmpl	$8,%edx
	jb	.Lnc_4b_entry

	/* Byte-copy ("cached") until the destination is 8-byte aligned */
	ALIGN_DESTINATION

	/* ecx = number of 64-byte blocks, edx = leftover bytes (0..63) */
	movl	%edx,%ecx
	andl	$63,%edx
	shrl	$6,%ecx
	jz	.Lnc_8b_entry		/* no full block */

	/* 64 bytes per iteration: two batches of 4 loads + 4 movnti stores */
.Lnc_64b_loop:
1:	movq	(%rsi),%r8
2:	movq	1*8(%rsi),%r9
3:	movq	2*8(%rsi),%r10
4:	movq	3*8(%rsi),%r11
5:	movnti	%r8,(%rdi)
6:	movnti	%r9,1*8(%rdi)
7:	movnti	%r10,2*8(%rdi)
8:	movnti	%r11,3*8(%rdi)
9:	movq	4*8(%rsi),%r8
10:	movq	5*8(%rsi),%r9
11:	movq	6*8(%rsi),%r10
12:	movq	7*8(%rsi),%r11
13:	movnti	%r8,4*8(%rdi)
14:	movnti	%r9,5*8(%rdi)
15:	movnti	%r10,6*8(%rdi)
16:	movnti	%r11,7*8(%rdi)
	leaq	64(%rsi),%rsi
	leaq	64(%rdi),%rdi
	decl	%ecx
	jnz	.Lnc_64b_loop

	/* ecx = number of quadwords, edx = leftover bytes (0..7) */
.Lnc_8b_entry:
	movl	%edx,%ecx
	andl	$7,%edx
	shrl	$3,%ecx
	jz	.Lnc_4b_entry		/* no full quadword */

	/* One quadword per iteration, stored with movnti */
.Lnc_8b_loop:
20:	movq	(%rsi),%r8
21:	movnti	%r8,(%rdi)
	leaq	8(%rsi),%rsi
	leaq	8(%rdi),%rdi
	decl	%ecx
	jnz	.Lnc_8b_loop

	/* At most 7 bytes remain in edx; done if there are none */
.Lnc_4b_entry:
	andl	%edx,%edx
	jz	.Lnc_done

	/* Destination not 4-byte aligned: byte copy for all of it */
	movl	%edi,%ecx
	andl	$3,%ecx
	jnz	.Lnc_1b_entry

	/* ecx = 4-byte count (1 or 0), edx = leftover bytes (0..3) */
	movl	%edx,%ecx
	andl	$3,%edx
	shrl	$2,%ecx
	jz	.Lnc_1b_entry		/* fewer than 4 bytes */

	/* The single 4-byte movnti copy */
30:	movl	(%rsi),%r8d
31:	movnti	%r8d,(%rdi)
	leaq	4(%rsi),%rsi
	leaq	4(%rdi),%rdi

	/* Done if that consumed everything */
	andl	%edx,%edx
	jz	.Lnc_done

	/* Ordinary ("cached") byte loop for the remainder in edx */
.Lnc_1b_entry:
	movl	%edx,%ecx
.Lnc_1b_loop:
40:	movb	(%rsi),%al
41:	movb	%al,(%rdi)
	incq	%rsi
	incq	%rdi
	decl	%ecx
	jnz	.Lnc_1b_loop

	/* Finished copying; fence the prior stores */
.Lnc_done:
	xorl	%eax,%eax
	ASM_CLAC
	sfence
	ret

	.section .fixup,"ax"
	/* Fault in the 64-byte loop: edx += 64 * (blocks left, incl. this one) */
.Lnc_fixup_64b:
	shll	$6,%ecx
	addl	%ecx,%edx
	jmp	.Lnc_fixup_tail
	/* Fault in the 8-byte loop: edx += 8 * (quadwords left) */
.Lnc_fixup_8b:
	leaq	(%rdx,%rcx,8),%rdx
	jmp	.Lnc_fixup_tail
	/* Fault in the 4-byte copy: edx += 4 * ecx */
.Lnc_fixup_4b:
	leaq	(%rdx,%rcx,4),%rdx
	jmp	.Lnc_fixup_tail
	/* Fault in the byte loop: ecx already is the byte count left */
.Lnc_fixup_1b:
	movl	%ecx,%edx
	/* Fence the movnti stores issued so far, then retry byte-wise */
.Lnc_fixup_tail:
	sfence
	jmp	.Lcopy_user_handle_tail
	.previous

	_ASM_EXTABLE_UA(1b, .Lnc_fixup_64b)
	_ASM_EXTABLE_UA(2b, .Lnc_fixup_64b)
	_ASM_EXTABLE_UA(3b, .Lnc_fixup_64b)
	_ASM_EXTABLE_UA(4b, .Lnc_fixup_64b)
	_ASM_EXTABLE_UA(5b, .Lnc_fixup_64b)
	_ASM_EXTABLE_UA(6b, .Lnc_fixup_64b)
	_ASM_EXTABLE_UA(7b, .Lnc_fixup_64b)
	_ASM_EXTABLE_UA(8b, .Lnc_fixup_64b)
	_ASM_EXTABLE_UA(9b, .Lnc_fixup_64b)
	_ASM_EXTABLE_UA(10b, .Lnc_fixup_64b)
	_ASM_EXTABLE_UA(11b, .Lnc_fixup_64b)
	_ASM_EXTABLE_UA(12b, .Lnc_fixup_64b)
	_ASM_EXTABLE_UA(13b, .Lnc_fixup_64b)
	_ASM_EXTABLE_UA(14b, .Lnc_fixup_64b)
	_ASM_EXTABLE_UA(15b, .Lnc_fixup_64b)
	_ASM_EXTABLE_UA(16b, .Lnc_fixup_64b)
	_ASM_EXTABLE_UA(20b, .Lnc_fixup_8b)
	_ASM_EXTABLE_UA(21b, .Lnc_fixup_8b)
	_ASM_EXTABLE_UA(30b, .Lnc_fixup_4b)
	_ASM_EXTABLE_UA(31b, .Lnc_fixup_4b)
	_ASM_EXTABLE_UA(40b, .Lnc_fixup_1b)
	_ASM_EXTABLE_UA(41b, .Lnc_fixup_1b)
SYM_FUNC_END(__copy_user_nocache)
EXPORT_SYMBOL(__copy_user_nocache)
393