/* SPDX-License-Identifier: GPL-2.0-only */
/* xref: /linux/arch/x86/lib/copy_user_uncached_64.S (revision 24168c5e6dfbdd5b414f048f47f75d64533296ca) */
/*
 * Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org>
 */

#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/asm.h>

/*
 * copy_user_nocache - Uncached memory copy with exception handling
 *
 * This copies from user space into kernel space, but the kernel
 * space accesses can take a machine check exception, so they too
 * need exception handling.
 *
 * Note: only 32-bit and 64-bit stores have non-temporal versions,
 * and we only use aligned versions. Any unaligned parts at the
 * start or end of the copy will be done using normal cached stores.
 *
 * Input:
 * rdi destination
 * rsi source
 * edx count
 *
 * Output:
 * rax uncopied bytes or 0 if successful.
 *
 * Clobbers:
 * rdx, rsi, rdi, r8-r11 and the flags, in addition to rax.
 *
 * Throughout the function %edx holds the number of bytes that have
 * not been copied yet, so every exit simply returns it in %eax.
 */
SYM_FUNC_START(__copy_user_nocache)
	/* If destination is not 8-byte aligned, we'll have to align it */
	testb $7,%dil
	jne .Lalign

	/* Destination is 8-byte aligned from here on */
.Lis_aligned:
	cmp $64,%edx
	jb .Lquadwords

	/*
	 * Main loop: 64 bytes per iteration, as two groups of four
	 * 8-byte loads each followed by four non-temporal 8-byte
	 * stores. The pointers and the count are only updated at
	 * the end of an iteration, which is what the fixups for
	 * faults in the middle of the loop rely on.
	 */
	.p2align 4,0x90
.Lunrolled:
10:	movq (%rsi),%r8
11:	movq 8(%rsi),%r9
12:	movq 16(%rsi),%r10
13:	movq 24(%rsi),%r11
20:	movnti %r8,(%rdi)
21:	movnti %r9,8(%rdi)
22:	movnti %r10,16(%rdi)
23:	movnti %r11,24(%rdi)
30:	movq 32(%rsi),%r8
31:	movq 40(%rsi),%r9
32:	movq 48(%rsi),%r10
33:	movq 56(%rsi),%r11
40:	movnti %r8,32(%rdi)
41:	movnti %r9,40(%rdi)
42:	movnti %r10,48(%rdi)
43:	movnti %r11,56(%rdi)

	addq $64,%rsi
	addq $64,%rdi
	sub $64,%edx
	cmp $64,%edx
	jae .Lunrolled

/*
 * First set of user mode loads have been done
 * without any stores, so if they fail, we can
 * just try the non-unrolled loop.
 */
_ASM_EXTABLE_UA(10b, .Lquadwords)
_ASM_EXTABLE_UA(11b, .Lquadwords)
_ASM_EXTABLE_UA(12b, .Lquadwords)
_ASM_EXTABLE_UA(13b, .Lquadwords)

/*
 * The second set of user mode loads have been
 * done with 32 bytes stored to the destination,
 * so we need to take that into account before
 * falling back to the non-unrolled loop.
 */
_ASM_EXTABLE_UA(30b, .Lfixup32)
_ASM_EXTABLE_UA(31b, .Lfixup32)
_ASM_EXTABLE_UA(32b, .Lfixup32)
_ASM_EXTABLE_UA(33b, .Lfixup32)

/*
 * An exception on a write means that we're
 * done, but we need to update the count
 * depending on where in the unrolled loop
 * we were.
 */
_ASM_EXTABLE_UA(20b, .Ldone0)
_ASM_EXTABLE_UA(21b, .Ldone8)
_ASM_EXTABLE_UA(22b, .Ldone16)
_ASM_EXTABLE_UA(23b, .Ldone24)
_ASM_EXTABLE_UA(40b, .Ldone32)
_ASM_EXTABLE_UA(41b, .Ldone40)
_ASM_EXTABLE_UA(42b, .Ldone48)
_ASM_EXTABLE_UA(43b, .Ldone56)

	/* Non-unrolled loop: one quadword at a time while 8+ bytes remain */
.Lquadwords:
	cmp $8,%edx
	jb .Llong
50:	movq (%rsi),%rax
51:	movnti %rax,(%rdi)
	addq $8,%rsi
	addq $8,%rdi
	sub $8,%edx
	jmp .Lquadwords

/*
 * If we fail on the last full quadword, we will
 * not try to do any byte-wise cached accesses.
 * We will try to do one more 4-byte uncached
 * one, though.
 */
_ASM_EXTABLE_UA(50b, .Llast4)
_ASM_EXTABLE_UA(51b, .Ldone0)

	/*
	 * Fewer than 8 bytes are left, so the low three bits of the
	 * count select the 4-, 2- and 1-byte tail pieces in turn.
	 */
.Llong:
	test $4,%dl
	je .Lword
60:	movl (%rsi),%eax
61:	movnti %eax,(%rdi)
	addq $4,%rsi
	addq $4,%rdi
	sub $4,%edx
.Lword:
	/*
	 * No non-temporal stores are issued past this point: fence
	 * the earlier ones before the regular cached tail stores.
	 */
	sfence
	test $2,%dl
	je .Lbyte
70:	movw (%rsi),%ax
71:	movw %ax,(%rdi)
	addq $2,%rsi
	addq $2,%rdi
	sub $2,%edx
.Lbyte:
	test $1,%dl
	je .Ldone
80:	movb (%rsi),%al
81:	movb %al,(%rdi)
	/* Last byte: the pointers are not needed any more */
	dec %edx
.Ldone:
	mov %edx,%eax
	RET

/*
 * If we fail on the last four bytes, we won't
 * bother with any fixups. It's dead, Jim. Note
 * that there's no need for 'sfence' for any
 * of this, since the exception will have been
 * serializing.
 */
_ASM_EXTABLE_UA(60b, .Ldone)
_ASM_EXTABLE_UA(61b, .Ldone)
_ASM_EXTABLE_UA(70b, .Ldone)
_ASM_EXTABLE_UA(71b, .Ldone)
_ASM_EXTABLE_UA(80b, .Ldone)
_ASM_EXTABLE_UA(81b, .Ldone)

/*
 * This is the "head needs aligning" case when
 * the destination isn't 8-byte aligned. The
 * 4-byte case can be done uncached, but any
 * smaller alignment is done with regular stores.
 *
 * Each step first checks that enough bytes are
 * left; if not, the copy is finished by the
 * matching tail code above.
 */
.Lalign:
	test $1,%dil
	je .Lalign_word
	test %edx,%edx
	je .Ldone
90:	movb (%rsi),%al
91:	movb %al,(%rdi)
	inc %rsi
	inc %rdi
	dec %edx
.Lalign_word:
	test $2,%dil
	je .Lalign_long
	cmp $2,%edx
	jb .Lbyte
92:	movw (%rsi),%ax
93:	movw %ax,(%rdi)
	addq $2,%rsi
	addq $2,%rdi
	sub $2,%edx
.Lalign_long:
	test $4,%dil
	je .Lis_aligned
	cmp $4,%edx
	jb .Lword
94:	movl (%rsi),%eax
95:	movnti %eax,(%rdi)
	addq $4,%rsi
	addq $4,%rdi
	sub $4,%edx
	jmp .Lis_aligned

/*
 * If we fail on the initial alignment accesses,
 * we're all done. Again, no point in trying to
 * do byte-by-byte probing if the 4-byte load
 * fails - we're not doing any uncached accesses
 * any more.
 */
_ASM_EXTABLE_UA(90b, .Ldone)
_ASM_EXTABLE_UA(91b, .Ldone)
_ASM_EXTABLE_UA(92b, .Ldone)
_ASM_EXTABLE_UA(93b, .Ldone)
_ASM_EXTABLE_UA(94b, .Ldone)
_ASM_EXTABLE_UA(95b, .Ldone)

/*
 * Exception table fixups for faults in the middle
 *
 * .LdoneNN is the fixup for the unrolled-loop store at
 * offset NN: the NN bytes before it were already stored
 * in this iteration, but the count has not been updated
 * yet. Each label takes 8 off the count and falls through
 * to the next one, so entering at .LdoneNN subtracts NN.
 */
.Ldone56: sub $8,%edx
.Ldone48: sub $8,%edx
.Ldone40: sub $8,%edx
.Ldone32: sub $8,%edx
.Ldone24: sub $8,%edx
.Ldone16: sub $8,%edx
.Ldone8: sub $8,%edx
.Ldone0:
	mov %edx,%eax
	RET

/*
 * A load in the second half of the unrolled loop faulted:
 * skip the 32 bytes that were already stored and carry on
 * with the non-unrolled loop.
 */
.Lfixup32:
	addq $32,%rsi
	addq $32,%rdi
	sub $32,%edx
	jmp .Lquadwords

/*
 * The 8-byte load in the non-unrolled loop faulted: try
 * one last 4-byte uncached copy from the same position,
 * then return whatever is left.
 */
.Llast4:
52:	movl (%rsi),%eax
53:	movnti %eax,(%rdi)
	sfence
	sub $4,%edx
	mov %edx,%eax
	RET
_ASM_EXTABLE_UA(52b, .Ldone0)
_ASM_EXTABLE_UA(53b, .Ldone0)

SYM_FUNC_END(__copy_user_nocache)
EXPORT_SYMBOL(__copy_user_nocache)