/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org>
 */

#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/asm.h>

/*
 * copy_user_nocache - Uncached memory copy with exception handling
 *
 * This copies from user space into kernel space, but the kernel
 * space accesses can take a machine check exception, so they too
 * need exception handling.
 *
 * Note: only 32-bit and 64-bit stores have non-temporal versions,
 * and we only use aligned versions. Any unaligned parts at the
 * start or end of the copy will be done using normal cached stores.
 *
 * Input:
 * rdi destination
 * rsi source
 * edx count
 *
 * Output:
 * rax uncopied bytes or 0 if successful.
 */
SYM_FUNC_START(__copy_user_nocache)
	/* If the destination is not 8-byte aligned, we'll have to align it */
	testb $7,%dil
	jne .Lalign

.Lis_aligned:
	cmp $64,%edx
	jb .Lquadwords

	.p2align 4,0x90
.Lunrolled:
10:	movq (%rsi),%r8
11:	movq 8(%rsi),%r9
12:	movq 16(%rsi),%r10
13:	movq 24(%rsi),%r11
20:	movnti %r8,(%rdi)
21:	movnti %r9,8(%rdi)
22:	movnti %r10,16(%rdi)
23:	movnti %r11,24(%rdi)
30:	movq 32(%rsi),%r8
31:	movq 40(%rsi),%r9
32:	movq 48(%rsi),%r10
33:	movq 56(%rsi),%r11
40:	movnti %r8,32(%rdi)
41:	movnti %r9,40(%rdi)
42:	movnti %r10,48(%rdi)
43:	movnti %r11,56(%rdi)

	addq $64,%rsi
	addq $64,%rdi
	sub $64,%edx
	cmp $64,%edx
	jae .Lunrolled

/*
 * The first set of user mode loads has been done
 * without any stores, so if they fail, we can
 * just try the non-unrolled loop.
 */
_ASM_EXTABLE_UA(10b, .Lquadwords)
_ASM_EXTABLE_UA(11b, .Lquadwords)
_ASM_EXTABLE_UA(12b, .Lquadwords)
_ASM_EXTABLE_UA(13b, .Lquadwords)

/*
 * The second set of user mode loads has been
 * done with 32 bytes already stored to the destination,
 * so we need to take that into account before
 * falling back to the non-unrolled loop.
 */
_ASM_EXTABLE_UA(30b, .Lfixup32)
_ASM_EXTABLE_UA(31b, .Lfixup32)
_ASM_EXTABLE_UA(32b, .Lfixup32)
_ASM_EXTABLE_UA(33b, .Lfixup32)

/*
 * An exception on a write means that we're
 * done, but we need to update the count
 * depending on where in the unrolled loop
 * we were.
 */
_ASM_EXTABLE_UA(20b, .Ldone0)
_ASM_EXTABLE_UA(21b, .Ldone8)
_ASM_EXTABLE_UA(22b, .Ldone16)
_ASM_EXTABLE_UA(23b, .Ldone24)
_ASM_EXTABLE_UA(40b, .Ldone32)
_ASM_EXTABLE_UA(41b, .Ldone40)
_ASM_EXTABLE_UA(42b, .Ldone48)
_ASM_EXTABLE_UA(43b, .Ldone56)

.Lquadwords:
	cmp $8,%edx
	jb .Llong
50:	movq (%rsi),%rax
51:	movnti %rax,(%rdi)
	addq $8,%rsi
	addq $8,%rdi
	sub $8,%edx
	jmp .Lquadwords

/*
 * If we fail on the last full quadword, we will
 * not try to do any byte-wise cached accesses.
 * We will try to do one more 4-byte uncached
 * one, though.
 */
_ASM_EXTABLE_UA(50b, .Llast4)
_ASM_EXTABLE_UA(51b, .Ldone0)

.Llong:
	test $4,%dl
	je .Lword
60:	movl (%rsi),%eax
61:	movnti %eax,(%rdi)
	addq $4,%rsi
	addq $4,%rdi
	sub $4,%edx
.Lword:
	sfence
	test $2,%dl
	je .Lbyte
70:	movw (%rsi),%ax
71:	movw %ax,(%rdi)
	addq $2,%rsi
	addq $2,%rdi
	sub $2,%edx
.Lbyte:
	test $1,%dl
	je .Ldone
80:	movb (%rsi),%al
81:	movb %al,(%rdi)
	dec %edx
.Ldone:
	mov %edx,%eax
	RET
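
/*
 * Worked example of the store-fault accounting set up above: if the
 * 'movnti %r10,48(%rdi)' store at label 42 faults, its exception entry
 * points at .Ldone48 below, which falls through six 'sub $8,%edx'
 * steps. That removes the 48 bytes already stored for the current
 * 64-byte block (%edx has not yet been decremented for that block),
 * so %eax reports only the bytes that were never copied.
 */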

/*
 * If we fail on the last four bytes, we won't
 * bother with any fixups. It's dead, Jim. Note
 * that there's no need for 'sfence' for any
 * of this, since the exception will have been
 * serializing.
 */
_ASM_EXTABLE_UA(60b, .Ldone)
_ASM_EXTABLE_UA(61b, .Ldone)
_ASM_EXTABLE_UA(70b, .Ldone)
_ASM_EXTABLE_UA(71b, .Ldone)
_ASM_EXTABLE_UA(80b, .Ldone)
_ASM_EXTABLE_UA(81b, .Ldone)

/*
 * This is the "head needs aligning" case when
 * the destination isn't 8-byte aligned. The
 * 4-byte case can be done uncached, but any
 * smaller alignment is done with regular stores.
 */
.Lalign:
	test $1,%dil
	je .Lalign_word
	test %edx,%edx
	je .Ldone
90:	movb (%rsi),%al
91:	movb %al,(%rdi)
	inc %rsi
	inc %rdi
	dec %edx
.Lalign_word:
	test $2,%dil
	je .Lalign_long
	cmp $2,%edx
	jb .Lbyte
92:	movw (%rsi),%ax
93:	movw %ax,(%rdi)
	addq $2,%rsi
	addq $2,%rdi
	sub $2,%edx
.Lalign_long:
	test $4,%dil
	je .Lis_aligned
	cmp $4,%edx
	jb .Lword
94:	movl (%rsi),%eax
95:	movnti %eax,(%rdi)
	addq $4,%rsi
	addq $4,%rdi
	sub $4,%edx
	jmp .Lis_aligned

/*
 * If we fail on the initial alignment accesses,
 * we're all done. Again, no point in trying to
 * do byte-by-byte probing if the 4-byte load
 * fails - we're not doing any uncached accesses
 * any more.
 */
_ASM_EXTABLE_UA(90b, .Ldone)
_ASM_EXTABLE_UA(91b, .Ldone)
_ASM_EXTABLE_UA(92b, .Ldone)
_ASM_EXTABLE_UA(93b, .Ldone)
_ASM_EXTABLE_UA(94b, .Ldone)
_ASM_EXTABLE_UA(95b, .Ldone)

/*
 * Exception table fixups for faults in the middle of the copy
 */
.Ldone56: sub $8,%edx
.Ldone48: sub $8,%edx
.Ldone40: sub $8,%edx
.Ldone32: sub $8,%edx
.Ldone24: sub $8,%edx
.Ldone16: sub $8,%edx
.Ldone8: sub $8,%edx
.Ldone0:
	mov %edx,%eax
	RET

.Lfixup32:
	addq $32,%rsi
	addq $32,%rdi
	sub $32,%edx
	jmp .Lquadwords

.Llast4:
52:	movl (%rsi),%eax
53:	movnti %eax,(%rdi)
	sfence
	sub $4,%edx
	mov %edx,%eax
	RET
_ASM_EXTABLE_UA(52b, .Ldone0)
_ASM_EXTABLE_UA(53b, .Ldone0)

SYM_FUNC_END(__copy_user_nocache)
EXPORT_SYMBOL(__copy_user_nocache)
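
/*
 * Usage sketch (illustrative only): the C fragment below shows one way a
 * caller could consume the "uncopied bytes" return value documented at the
 * top of this file. The prototype is inferred from the register interface
 * above (rdi/rsi/edx in, rax out) rather than copied from a header, and the
 * wrapper name example_copy_from_user_nocache() is made up for the example.
 * The sketch also assumes the caller has already dealt with enabling user
 * access, since this routine issues no stac/clac of its own.
 *
 *	unsigned long __copy_user_nocache(void *dst, const void __user *src,
 *					  unsigned size);
 *
 *	static int example_copy_from_user_nocache(void *dst,
 *						  const void __user *src,
 *						  unsigned size)
 *	{
 *		unsigned long left;
 *
 *		if (!access_ok(src, size))
 *			return -EFAULT;
 *
 *		left = __copy_user_nocache(dst, src, size);
 *		if (left) {
 *			memset((char *)dst + (size - left), 0, left);
 *			return -EFAULT;
 *		}
 *		return 0;
 *	}
 *
 * On a partial copy the memset() clears the tail of the kernel buffer that
 * was never written, mirroring how the generic copy_from_user() helpers
 * treat short copies.
 */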