xref: /linux/arch/x86/lib/copy_user_64.S (revision b45e0c30bc58fb6fcaa42f1d1d813cefb8ab4117)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * Functions to copy from and to user space.
 */
8
9#include <linux/linkage.h>
10#include <asm/current.h>
11#include <asm/asm-offsets.h>
12#include <asm/thread_info.h>
13#include <asm/cpufeatures.h>
14#include <asm/alternative-asm.h>
15#include <asm/asm.h>
16#include <asm/smap.h>
17#include <asm/export.h>
18
/*
 * ALIGN_DESTINATION - byte-copy until the destination is 8-byte aligned.
 *
 * In/out:
 * rdi destination, advanced past the bytes copied here
 * rsi source, advanced by the same amount
 * edx count, reduced by the number of bytes copied here
 *
 * Clobbers al, ecx and the flags.
 *
 * Up to 7 bytes are copied. Each use of this macro visible in this file
 * is preceded by a check that the count is at least 8, so the
 * subtraction from %edx below does not underflow there.
 *
 * A fault on either byte access lands on 103, which adds the alignment
 * bytes that are still pending back onto %edx and leaves the rest of
 * the work to .Lcopy_user_handle_tail.
 */
.macro ALIGN_DESTINATION
	/* check for bad alignment of destination */
	movl %edi,%ecx
	andl $7,%ecx
	jz 102f				/* already aligned */
	subl $8,%ecx
	negl %ecx			/* ecx = 8 - (rdi & 7) */
	subl %ecx,%edx			/* these bytes are copied right here */
100:	movb (%rsi),%al
101:	movb %al,(%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz 100b
102:
	.section .fixup,"ax"
103:	addl %ecx,%edx			/* edx = total bytes still uncopied */
	jmp .Lcopy_user_handle_tail
	.previous

	_ASM_EXTABLE_UA(100b, 103b)
	_ASM_EXTABLE_UA(101b, 103b)
	.endm

/*
 * copy_user_generic_unrolled - memory copy with exception handling.
 * This version is for CPUs like P4 that don't have efficient micro
 * code for rep movsq
 *
 * Once the destination is 8-byte aligned the copy proceeds in three
 * stages: unrolled 64-byte blocks, then 8-byte words, then single
 * bytes. Every load and store has an exception table entry; the fixup
 * it points at recomputes the number of outstanding bytes in %edx and
 * jumps to .Lcopy_user_handle_tail.
 *
 * Only the low 32 bits of the count are examined.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 */
ENTRY(copy_user_generic_unrolled)
	ASM_STAC
	cmpl $8,%edx
	jb 20f		/* less than 8 bytes, go to byte copy loop */
	ALIGN_DESTINATION
	movl %edx,%ecx
	andl $63,%edx		/* edx = bytes left over after the 64-byte blocks */
	shrl $6,%ecx		/* ecx = number of 64-byte blocks */
	jz .L_copy_short_string
1:	movq (%rsi),%r8
2:	movq 1*8(%rsi),%r9
3:	movq 2*8(%rsi),%r10
4:	movq 3*8(%rsi),%r11
5:	movq %r8,(%rdi)
6:	movq %r9,1*8(%rdi)
7:	movq %r10,2*8(%rdi)
8:	movq %r11,3*8(%rdi)
9:	movq 4*8(%rsi),%r8
10:	movq 5*8(%rsi),%r9
11:	movq 6*8(%rsi),%r10
12:	movq 7*8(%rsi),%r11
13:	movq %r8,4*8(%rdi)
14:	movq %r9,5*8(%rdi)
15:	movq %r10,6*8(%rdi)
16:	movq %r11,7*8(%rdi)
	leaq 64(%rsi),%rsi
	leaq 64(%rdi),%rdi
	decl %ecx
	jnz 1b
	/* Also entered from copy_user_enhanced_fast_string for short copies. */
.L_copy_short_string:
	movl %edx,%ecx
	andl $7,%edx		/* edx = bytes left over after the 8-byte words */
	shrl $3,%ecx		/* ecx = number of 8-byte words */
	jz 20f
18:	movq (%rsi),%r8
19:	movq %r8,(%rdi)
	leaq 8(%rsi),%rsi
	leaq 8(%rdi),%rdi
	decl %ecx
	jnz 18b
20:	andl %edx,%edx
	jz 23f
	movl %edx,%ecx
21:	movb (%rsi),%al
22:	movb %al,(%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz 21b
23:	xor %eax,%eax
	ASM_CLAC
	ret

	/*
	 * Fixups: rsi/rdi are only advanced after a whole block, word or
	 * byte has been moved, so on a fault they still point at the start
	 * of the unit that faulted and %ecx still counts that unit.
	 */
	.section .fixup,"ax"
30:	shll $6,%ecx		/* 64-byte blocks left -> bytes */
	addl %ecx,%edx
	jmp 60f
40:	leal (%rdx,%rcx,8),%edx	/* 8-byte words left -> bytes, plus remainder */
	jmp 60f
50:	movl %ecx,%edx		/* bytes left in the byte loop */
60:	jmp .Lcopy_user_handle_tail
	.previous

	_ASM_EXTABLE_UA(1b, 30b)
	_ASM_EXTABLE_UA(2b, 30b)
	_ASM_EXTABLE_UA(3b, 30b)
	_ASM_EXTABLE_UA(4b, 30b)
	_ASM_EXTABLE_UA(5b, 30b)
	_ASM_EXTABLE_UA(6b, 30b)
	_ASM_EXTABLE_UA(7b, 30b)
	_ASM_EXTABLE_UA(8b, 30b)
	_ASM_EXTABLE_UA(9b, 30b)
	_ASM_EXTABLE_UA(10b, 30b)
	_ASM_EXTABLE_UA(11b, 30b)
	_ASM_EXTABLE_UA(12b, 30b)
	_ASM_EXTABLE_UA(13b, 30b)
	_ASM_EXTABLE_UA(14b, 30b)
	_ASM_EXTABLE_UA(15b, 30b)
	_ASM_EXTABLE_UA(16b, 30b)
	_ASM_EXTABLE_UA(18b, 40b)
	_ASM_EXTABLE_UA(19b, 40b)
	_ASM_EXTABLE_UA(21b, 50b)
	_ASM_EXTABLE_UA(22b, 50b)
ENDPROC(copy_user_generic_unrolled)
EXPORT_SYMBOL(copy_user_generic_unrolled)

/* Some CPUs run faster using the string copy instructions.
 * This is also a lot simpler. Use them when possible.
 *
 * Only 4GB of copy is supported. This shouldn't be a problem
 * because the kernel normally only writes from/to page sized chunks
 * even if user space passed a longer buffer.
 * And more would be dangerous because both Intel and AMD have
 * errata with rep movsq > 4GB. If someone feels the need to fix
 * this please consider this.
 *
 * After aligning the destination, whole 8-byte words are moved with
 * rep movsq and the remaining 0-7 bytes with rep movsb. Counts below
 * 8 skip straight to the rep movsb.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 */
ENTRY(copy_user_generic_string)
	ASM_STAC
	cmpl $8,%edx
	jb 2f		/* less than 8 bytes, go to byte copy loop */
	ALIGN_DESTINATION
	movl %edx,%ecx
	shrl $3,%ecx		/* ecx = number of 8-byte words */
	andl $7,%edx		/* edx = trailing bytes */
1:	rep
	movsq
2:	movl %edx,%ecx
3:	rep
	movsb
	xorl %eax,%eax
	ASM_CLAC
	ret

	.section .fixup,"ax"
	/* fault in rep movsq: words left * 8 + trailing bytes */
11:	leal (%rdx,%rcx,8),%ecx
	/* fault in rep movsb (or fall through from 11): ecx = bytes left */
12:	movl %ecx,%edx
	jmp .Lcopy_user_handle_tail
	.previous

	_ASM_EXTABLE_UA(1b, 11b)
	_ASM_EXTABLE_UA(3b, 12b)
ENDPROC(copy_user_generic_string)
EXPORT_SYMBOL(copy_user_generic_string)

/*
 * Some CPUs are adding enhanced REP MOVSB/STOSB instructions.
 * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
 *
 * Copies of fewer than 64 bytes do not use rep movsb; they branch to
 * .L_copy_short_string inside copy_user_generic_unrolled, which
 * finishes the copy and returns on this function's behalf.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 */
ENTRY(copy_user_enhanced_fast_string)
	ASM_STAC
	cmpl $64,%edx
	jb .L_copy_short_string	/* less than 64 bytes, avoid the costly 'rep' */
	movl %edx,%ecx
1:	rep
	movsb
	xorl %eax,%eax
	ASM_CLAC
	ret

	.section .fixup,"ax"
12:	movl %ecx,%edx		/* ecx = bytes rep movsb had left to copy */
	jmp .Lcopy_user_handle_tail
	.previous

	_ASM_EXTABLE_UA(1b, 12b)
ENDPROC(copy_user_enhanced_fast_string)
EXPORT_SYMBOL(copy_user_enhanced_fast_string)

/*
 * Try to copy the last bytes and report how many could not be copied.
 * Since protection fault in copy_from/to_user is not a normal situation,
 * it is not necessary to optimize tail handling.
 *
 * The remaining bytes are retried with rep movsb. If that faults as
 * well, the exception table entry resumes at label 2 and whatever
 * count is left in %ecx becomes the return value. Nothing is zeroed
 * here.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 */
ALIGN;
.Lcopy_user_handle_tail:
	movl %edx,%ecx
1:	rep movsb
2:	mov %ecx,%eax
	ASM_CLAC
	ret

	_ASM_EXTABLE_UA(1b, 2b)
END(.Lcopy_user_handle_tail)

/*
 * __copy_user_nocache - Uncached memory copy with exception handling
 * This will force destination out of cache for more performance.
 *
 * Note: Cached memory copy is used when destination or size is not
 * naturally aligned. That is:
 *  - Require 8-byte alignment when size is 8 bytes or larger.
 *  - Require 4-byte alignment when size is 4 bytes.
 *
 * Stores in the 64-byte, 8-byte and 4-byte stages use movnti; the
 * alignment prologue and the trailing byte loop use ordinary movb
 * stores. An sfence is issued on both the success path and the fault
 * path before control leaves this function.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count (only the low 32 bits are examined)
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 */
ENTRY(__copy_user_nocache)
	ASM_STAC

	/* If size is less than 8 bytes, go to 4-byte copy */
	cmpl $8,%edx
	jb .L_4b_nocache_copy_entry

	/* If destination is not 8-byte aligned, "cache" copy to align it */
	ALIGN_DESTINATION

	/* Set 64-byte block count (ecx) and remainder (edx) */
	movl %edx,%ecx
	andl $63,%edx
	shrl $6,%ecx
	jz .L_8b_nocache_copy_entry	/* jump if count is 0 */

	/* Perform nocache loop-copy, 64 bytes (two groups of 4x8) per pass */
.L_4x8b_nocache_copy_loop:
1:	movq (%rsi),%r8
2:	movq 1*8(%rsi),%r9
3:	movq 2*8(%rsi),%r10
4:	movq 3*8(%rsi),%r11
5:	movnti %r8,(%rdi)
6:	movnti %r9,1*8(%rdi)
7:	movnti %r10,2*8(%rdi)
8:	movnti %r11,3*8(%rdi)
9:	movq 4*8(%rsi),%r8
10:	movq 5*8(%rsi),%r9
11:	movq 6*8(%rsi),%r10
12:	movq 7*8(%rsi),%r11
13:	movnti %r8,4*8(%rdi)
14:	movnti %r9,5*8(%rdi)
15:	movnti %r10,6*8(%rdi)
16:	movnti %r11,7*8(%rdi)
	leaq 64(%rsi),%rsi
	leaq 64(%rdi),%rdi
	decl %ecx
	jnz .L_4x8b_nocache_copy_loop

	/* Set 8-byte copy count and remainder */
.L_8b_nocache_copy_entry:
	movl %edx,%ecx
	andl $7,%edx
	shrl $3,%ecx
	jz .L_4b_nocache_copy_entry	/* jump if count is 0 */

	/* Perform 8-byte nocache loop-copy */
.L_8b_nocache_copy_loop:
20:	movq (%rsi),%r8
21:	movnti %r8,(%rdi)
	leaq 8(%rsi),%rsi
	leaq 8(%rdi),%rdi
	decl %ecx
	jnz .L_8b_nocache_copy_loop

	/* If no byte left, we're done */
.L_4b_nocache_copy_entry:
	andl %edx,%edx
	jz .L_finish_copy

	/* If destination is not 4-byte aligned, go to byte copy: */
	movl %edi,%ecx
	andl $3,%ecx
	jnz .L_1b_cache_copy_entry

	/* Set 4-byte copy count (1 or 0) and remainder */
	movl %edx,%ecx
	andl $3,%edx
	shrl $2,%ecx
	jz .L_1b_cache_copy_entry	/* jump if count is 0 */

	/* Perform 4-byte nocache copy: */
30:	movl (%rsi),%r8d
31:	movnti %r8d,(%rdi)
	leaq 4(%rsi),%rsi
	leaq 4(%rdi),%rdi

	/* If no bytes left, we're done: */
	andl %edx,%edx
	jz .L_finish_copy

	/* Perform byte "cache" loop-copy for the remainder */
.L_1b_cache_copy_entry:
	movl %edx,%ecx
.L_1b_cache_copy_loop:
40:	movb (%rsi),%al
41:	movb %al,(%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_1b_cache_copy_loop

	/* Finished copying; fence the prior stores */
.L_finish_copy:
	xorl %eax,%eax
	ASM_CLAC
	sfence
	ret

	/*
	 * Fixups: turn the unit count left in %ecx back into bytes, add
	 * the remainder held in %edx, fence the stores already issued and
	 * let .Lcopy_user_handle_tail deal with what is left.
	 */
	.section .fixup,"ax"
.L_fixup_4x8b_copy:
	shll $6,%ecx			/* 64-byte blocks left -> bytes */
	addl %ecx,%edx
	jmp .L_fixup_handle_tail
.L_fixup_8b_copy:
	lea (%rdx,%rcx,8),%rdx		/* 8-byte words left -> bytes */
	jmp .L_fixup_handle_tail
.L_fixup_4b_copy:
	lea (%rdx,%rcx,4),%rdx		/* 4-byte words left -> bytes */
	jmp .L_fixup_handle_tail
.L_fixup_1b_copy:
	movl %ecx,%edx			/* bytes left in the byte loop */
.L_fixup_handle_tail:
	sfence
	jmp .Lcopy_user_handle_tail
	.previous

	_ASM_EXTABLE_UA(1b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(2b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(3b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(4b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(5b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(6b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(7b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(8b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(9b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(10b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(11b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(12b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(13b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(14b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(15b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(16b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(20b, .L_fixup_8b_copy)
	_ASM_EXTABLE_UA(21b, .L_fixup_8b_copy)
	_ASM_EXTABLE_UA(30b, .L_fixup_4b_copy)
	_ASM_EXTABLE_UA(31b, .L_fixup_4b_copy)
	_ASM_EXTABLE_UA(40b, .L_fixup_1b_copy)
	_ASM_EXTABLE_UA(41b, .L_fixup_1b_copy)
ENDPROC(__copy_user_nocache)
EXPORT_SYMBOL(__copy_user_nocache)
