xref: /linux/arch/x86/lib/copy_user_64.S (revision bd628c1bed7902ec1f24ba0fe70758949146abbe)
/*
 * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
 * Copyright 2002 Andi Kleen, SuSE Labs.
 * Subject to the GNU Public License v2.
 *
 * Functions to copy from and to user space.
 */
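
/*
 * These are the raw backends normally reached through copy_from_user()
 * and copy_to_user(); the variant used on a given CPU is typically
 * patched in via the alternatives mechanism based on feature bits such
 * as X86_FEATURE_REP_GOOD and X86_FEATURE_ERMS (see copy_user_generic()
 * in asm/uaccess_64.h).  All variants return the number of bytes that
 * could NOT be copied, so a typical caller checks for a non-zero
 * result, e.g. (illustrative only):
 *
 *	if (copy_from_user(dst, usrc, len))
 *		return -EFAULT;
 */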

#include <linux/linkage.h>
#include <asm/current.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/export.h>

/*
 * copy_user_generic_unrolled - memory copy with exception handling.
 * This version is for CPUs like P4 that don't have efficient
 * microcode for rep movsq.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 */
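/*
 * Rough C-level shape of the code below (illustrative only; ignores
 * destination alignment and exception handling):
 *
 *	while (count >= 64) { copy 8 quadwords;  count -= 64; }
 *	while (count >= 8)  { copy a quadword;   count -= 8;  }
 *	while (count)       { copy a byte;       count--;     }
 */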
ENTRY(copy_user_generic_unrolled)
	ASM_STAC
	cmpl $8,%edx
	jb 20f		/* less than 8 bytes, go to byte copy loop */
	ALIGN_DESTINATION
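	/* Split the count: ecx = number of 64-byte blocks, edx = remainder (0..63) */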
	movl %edx,%ecx
	andl $63,%edx
	shrl $6,%ecx
	jz .L_copy_short_string
1:	movq (%rsi),%r8
2:	movq 1*8(%rsi),%r9
3:	movq 2*8(%rsi),%r10
4:	movq 3*8(%rsi),%r11
5:	movq %r8,(%rdi)
6:	movq %r9,1*8(%rdi)
7:	movq %r10,2*8(%rdi)
8:	movq %r11,3*8(%rdi)
9:	movq 4*8(%rsi),%r8
10:	movq 5*8(%rsi),%r9
11:	movq 6*8(%rsi),%r10
12:	movq 7*8(%rsi),%r11
13:	movq %r8,4*8(%rdi)
14:	movq %r9,5*8(%rdi)
15:	movq %r10,6*8(%rdi)
16:	movq %r11,7*8(%rdi)
	leaq 64(%rsi),%rsi
	leaq 64(%rdi),%rdi
	decl %ecx
	jnz 1b
.L_copy_short_string:
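	/* Split what is left: ecx = number of quadwords, edx = trailing bytes (0..7) */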
	movl %edx,%ecx
	andl $7,%edx
	shrl $3,%ecx
	jz 20f
18:	movq (%rsi),%r8
19:	movq %r8,(%rdi)
	leaq 8(%rsi),%rsi
	leaq 8(%rdi),%rdi
	decl %ecx
	jnz 18b
20:	andl %edx,%edx
	jz 23f
	movl %edx,%ecx
21:	movb (%rsi),%al
22:	movb %al,(%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz 21b
23:	xor %eax,%eax
	ASM_CLAC
	ret

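	/*
	 * Exception fixups: recompute how many bytes are still uncopied
	 * and hand off to copy_user_handle_tail.  A fault in the 64-byte
	 * loop leaves the block count in ecx (bytes left = ecx*64 + edx),
	 * the quadword loop leaves ecx*8 + edx, and the byte loop leaves
	 * ecx bytes.
	 */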
	.section .fixup,"ax"
30:	shll $6,%ecx
	addl %ecx,%edx
	jmp 60f
40:	leal (%rdx,%rcx,8),%edx
	jmp 60f
50:	movl %ecx,%edx
60:	jmp copy_user_handle_tail /* ecx is zerorest also */
	.previous

	_ASM_EXTABLE_UA(1b, 30b)
	_ASM_EXTABLE_UA(2b, 30b)
	_ASM_EXTABLE_UA(3b, 30b)
	_ASM_EXTABLE_UA(4b, 30b)
	_ASM_EXTABLE_UA(5b, 30b)
	_ASM_EXTABLE_UA(6b, 30b)
	_ASM_EXTABLE_UA(7b, 30b)
	_ASM_EXTABLE_UA(8b, 30b)
	_ASM_EXTABLE_UA(9b, 30b)
	_ASM_EXTABLE_UA(10b, 30b)
	_ASM_EXTABLE_UA(11b, 30b)
	_ASM_EXTABLE_UA(12b, 30b)
	_ASM_EXTABLE_UA(13b, 30b)
	_ASM_EXTABLE_UA(14b, 30b)
	_ASM_EXTABLE_UA(15b, 30b)
	_ASM_EXTABLE_UA(16b, 30b)
	_ASM_EXTABLE_UA(18b, 40b)
	_ASM_EXTABLE_UA(19b, 40b)
	_ASM_EXTABLE_UA(21b, 50b)
	_ASM_EXTABLE_UA(22b, 50b)
ENDPROC(copy_user_generic_unrolled)
EXPORT_SYMBOL(copy_user_generic_unrolled)

/* Some CPUs run faster using the string copy instructions.
 * This is also a lot simpler. Use them when possible.
 *
 * Only 4GB of copy is supported. This shouldn't be a problem
 * because the kernel normally only writes from/to page-sized chunks
 * even if user space passed a longer buffer.
 * Copying more would also be dangerous because both Intel and AMD have
 * errata with rep movsq > 4GB. Anyone who feels the need to lift this
 * limit should take those errata into account.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 */
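/*
 * Rough structure (illustrative): copy count/8 quadwords with "rep movsq",
 * then the remaining count%8 bytes with "rep movsb".  Note that only the
 * low 32 bits of the count (%ecx/%edx) are used below, consistent with
 * the 4GB limit described above.
 */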
ENTRY(copy_user_generic_string)
	ASM_STAC
	cmpl $8,%edx
	jb 2f		/* less than 8 bytes, go to byte copy loop */
	ALIGN_DESTINATION
	movl %edx,%ecx
	shrl $3,%ecx
	andl $7,%edx
1:	rep
	movsq
2:	movl %edx,%ecx
3:	rep
	movsb
	xorl %eax,%eax
	ASM_CLAC
	ret

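	/*
	 * Exception fixups: a fault during "rep movsq" leaves the remaining
	 * quadword count in rcx, so bytes left = ecx*8 + edx; a fault during
	 * "rep movsb" leaves the remaining byte count directly in ecx.
	 */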
	.section .fixup,"ax"
11:	leal (%rdx,%rcx,8),%ecx
12:	movl %ecx,%edx		/* ecx is zerorest also */
	jmp copy_user_handle_tail
	.previous

	_ASM_EXTABLE_UA(1b, 11b)
	_ASM_EXTABLE_UA(3b, 12b)
ENDPROC(copy_user_generic_string)
EXPORT_SYMBOL(copy_user_generic_string)

/*
 * Some CPUs support enhanced REP MOVSB/STOSB instructions.
 * It is recommended to use enhanced REP MOVSB/STOSB when it is available.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 */
ENTRY(copy_user_enhanced_fast_string)
	ASM_STAC
	cmpl $64,%edx
	jb .L_copy_short_string	/* less than 64 bytes, avoid the costly 'rep' */
	movl %edx,%ecx
1:	rep
	movsb
	xorl %eax,%eax
	ASM_CLAC
	ret

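	/*
	 * Exception fixup: a fault during "rep movsb" leaves the number of
	 * bytes not yet copied in ecx; pass it on as the tail count.
	 */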
	.section .fixup,"ax"
12:	movl %ecx,%edx		/* ecx is zerorest also */
	jmp copy_user_handle_tail
	.previous

	_ASM_EXTABLE_UA(1b, 12b)
ENDPROC(copy_user_enhanced_fast_string)
EXPORT_SYMBOL(copy_user_enhanced_fast_string)

/*
 * copy_user_nocache - Uncached memory copy with exception handling
 * This keeps the destination data out of the cache for better performance.
 *
 * Note: a cached memory copy is used when the destination or size is not
 * naturally aligned. That is:
 *  - 8-byte alignment is required when the size is 8 bytes or larger.
 *  - 4-byte alignment is required when the size is 4 bytes.
 */
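/*
 * Only the naturally aligned 8-byte and 4-byte chunks below are written
 * with non-temporal movnti stores; the unaligned head and any sub-4-byte
 * tail are copied with ordinary cached stores.
 */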
ENTRY(__copy_user_nocache)
	ASM_STAC

	/* If size is less than 8 bytes, go to 4-byte copy */
	cmpl $8,%edx
	jb .L_4b_nocache_copy_entry

	/* If destination is not 8-byte aligned, "cache" copy to align it */
	ALIGN_DESTINATION

	/* Set 4x8-byte copy count and remainder */
	movl %edx,%ecx
	andl $63,%edx
	shrl $6,%ecx
	jz .L_8b_nocache_copy_entry	/* jump if count is 0 */

	/* Perform 4x8-byte nocache loop-copy */
.L_4x8b_nocache_copy_loop:
1:	movq (%rsi),%r8
2:	movq 1*8(%rsi),%r9
3:	movq 2*8(%rsi),%r10
4:	movq 3*8(%rsi),%r11
5:	movnti %r8,(%rdi)
6:	movnti %r9,1*8(%rdi)
7:	movnti %r10,2*8(%rdi)
8:	movnti %r11,3*8(%rdi)
9:	movq 4*8(%rsi),%r8
10:	movq 5*8(%rsi),%r9
11:	movq 6*8(%rsi),%r10
12:	movq 7*8(%rsi),%r11
13:	movnti %r8,4*8(%rdi)
14:	movnti %r9,5*8(%rdi)
15:	movnti %r10,6*8(%rdi)
16:	movnti %r11,7*8(%rdi)
	leaq 64(%rsi),%rsi
	leaq 64(%rdi),%rdi
	decl %ecx
	jnz .L_4x8b_nocache_copy_loop

	/* Set 8-byte copy count and remainder */
.L_8b_nocache_copy_entry:
	movl %edx,%ecx
	andl $7,%edx
	shrl $3,%ecx
	jz .L_4b_nocache_copy_entry	/* jump if count is 0 */

	/* Perform 8-byte nocache loop-copy */
.L_8b_nocache_copy_loop:
20:	movq (%rsi),%r8
21:	movnti %r8,(%rdi)
	leaq 8(%rsi),%rsi
	leaq 8(%rdi),%rdi
	decl %ecx
	jnz .L_8b_nocache_copy_loop

	/* If no bytes are left, we're done */
.L_4b_nocache_copy_entry:
	andl %edx,%edx
	jz .L_finish_copy

	/* If destination is not 4-byte aligned, go to byte copy: */
	movl %edi,%ecx
	andl $3,%ecx
	jnz .L_1b_cache_copy_entry

	/* Set 4-byte copy count (1 or 0) and remainder */
	movl %edx,%ecx
	andl $3,%edx
	shrl $2,%ecx
	jz .L_1b_cache_copy_entry	/* jump if count is 0 */

	/* Perform 4-byte nocache copy: */
30:	movl (%rsi),%r8d
31:	movnti %r8d,(%rdi)
	leaq 4(%rsi),%rsi
	leaq 4(%rdi),%rdi

	/* If no bytes left, we're done: */
	andl %edx,%edx
	jz .L_finish_copy

	/* Perform byte "cache" loop-copy for the remainder */
.L_1b_cache_copy_entry:
	movl %edx,%ecx
.L_1b_cache_copy_loop:
40:	movb (%rsi),%al
41:	movb %al,(%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_1b_cache_copy_loop

	/* Finished copying; fence the prior stores */
.L_finish_copy:
	xorl %eax,%eax
	ASM_CLAC
	sfence
	ret

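	/*
	 * Exception fixups: convert the loop counter back into a byte count
	 * (64, 8 or 4 bytes per iteration, or the raw byte count in the byte
	 * loop), fence the non-temporal stores already issued, and let
	 * copy_user_handle_tail copy the rest.
	 */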
	.section .fixup,"ax"
.L_fixup_4x8b_copy:
	shll $6,%ecx
	addl %ecx,%edx
	jmp .L_fixup_handle_tail
.L_fixup_8b_copy:
	lea (%rdx,%rcx,8),%rdx
	jmp .L_fixup_handle_tail
.L_fixup_4b_copy:
	lea (%rdx,%rcx,4),%rdx
	jmp .L_fixup_handle_tail
.L_fixup_1b_copy:
	movl %ecx,%edx
.L_fixup_handle_tail:
	sfence
	jmp copy_user_handle_tail
	.previous

	_ASM_EXTABLE_UA(1b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(2b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(3b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(4b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(5b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(6b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(7b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(8b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(9b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(10b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(11b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(12b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(13b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(14b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(15b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(16b, .L_fixup_4x8b_copy)
	_ASM_EXTABLE_UA(20b, .L_fixup_8b_copy)
	_ASM_EXTABLE_UA(21b, .L_fixup_8b_copy)
	_ASM_EXTABLE_UA(30b, .L_fixup_4b_copy)
	_ASM_EXTABLE_UA(31b, .L_fixup_4b_copy)
	_ASM_EXTABLE_UA(40b, .L_fixup_1b_copy)
	_ASM_EXTABLE_UA(41b, .L_fixup_1b_copy)
ENDPROC(__copy_user_nocache)
EXPORT_SYMBOL(__copy_user_nocache)
