1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (C) 2009 Sunplus Core Technology Co., Ltd. 4 * Lennox Wu <lennox.wu@sunplusct.com> 5 * Chen Liqin <liqin.chen@sunplusct.com> 6 * Copyright (C) 2012 Regents of the University of California 7 */ 8 9 10 #include <linux/mm.h> 11 #include <linux/kernel.h> 12 #include <linux/interrupt.h> 13 #include <linux/perf_event.h> 14 #include <linux/signal.h> 15 #include <linux/uaccess.h> 16 #include <linux/kprobes.h> 17 #include <linux/kfence.h> 18 #include <linux/entry-common.h> 19 20 #include <asm/ptrace.h> 21 #include <asm/tlbflush.h> 22 23 #define CREATE_TRACE_POINTS 24 #include <trace/events/exceptions.h> 25 26 #include "../kernel/head.h" 27 28 static void show_pte(unsigned long addr) 29 { 30 pgd_t *pgdp, pgd; 31 p4d_t *p4dp, p4d; 32 pud_t *pudp, pud; 33 pmd_t *pmdp, pmd; 34 pte_t *ptep, pte; 35 struct mm_struct *mm = current->mm; 36 37 if (!mm) 38 mm = &init_mm; 39 40 pr_alert("Current %s pgtable: %luK pagesize, %d-bit VAs, pgdp=0x%016llx\n", 41 current->comm, PAGE_SIZE / SZ_1K, VA_BITS, 42 mm == &init_mm ? (u64)__pa_symbol(mm->pgd) : virt_to_phys(mm->pgd)); 43 44 pgdp = pgd_offset(mm, addr); 45 pgd = pgdp_get(pgdp); 46 pr_alert("[%016lx] pgd=%016lx", addr, pgd_val(pgd)); 47 if (pgd_none(pgd) || pgd_bad(pgd) || pgd_leaf(pgd)) 48 goto out; 49 50 p4dp = p4d_offset(pgdp, addr); 51 p4d = p4dp_get(p4dp); 52 pr_cont(", p4d=%016lx", p4d_val(p4d)); 53 if (p4d_none(p4d) || p4d_bad(p4d) || p4d_leaf(p4d)) 54 goto out; 55 56 pudp = pud_offset(p4dp, addr); 57 pud = pudp_get(pudp); 58 pr_cont(", pud=%016lx", pud_val(pud)); 59 if (pud_none(pud) || pud_bad(pud) || pud_leaf(pud)) 60 goto out; 61 62 pmdp = pmd_offset(pudp, addr); 63 pmd = pmdp_get(pmdp); 64 pr_cont(", pmd=%016lx", pmd_val(pmd)); 65 if (pmd_none(pmd) || pmd_bad(pmd) || pmd_leaf(pmd)) 66 goto out; 67 68 ptep = pte_offset_map(pmdp, addr); 69 if (!ptep) 70 goto out; 71 72 pte = ptep_get(ptep); 73 pr_cont(", pte=%016lx", pte_val(pte)); 74 pte_unmap(ptep); 75 out: 76 pr_cont("\n"); 77 } 78 79 static void die_kernel_fault(const char *msg, unsigned long addr, 80 struct pt_regs *regs) 81 { 82 bust_spinlocks(1); 83 84 pr_alert("Unable to handle kernel %s at virtual address " REG_FMT "\n", msg, 85 addr); 86 87 bust_spinlocks(0); 88 show_pte(addr); 89 die(regs, "Oops"); 90 make_task_dead(SIGKILL); 91 } 92 93 static inline void no_context(struct pt_regs *regs, unsigned long addr) 94 { 95 const char *msg; 96 97 /* Are we prepared to handle this kernel fault? */ 98 if (fixup_exception(regs)) 99 return; 100 101 /* 102 * Oops. The kernel tried to access some bad page. We'll have to 103 * terminate things with extreme prejudice. 104 */ 105 if (addr < PAGE_SIZE) 106 msg = "NULL pointer dereference"; 107 else { 108 if (kfence_handle_page_fault(addr, regs->cause == EXC_STORE_PAGE_FAULT, regs)) 109 return; 110 111 msg = "paging request"; 112 } 113 114 die_kernel_fault(msg, addr, regs); 115 } 116 117 static inline void mm_fault_error(struct pt_regs *regs, unsigned long addr, vm_fault_t fault) 118 { 119 if (!user_mode(regs)) { 120 no_context(regs, addr); 121 return; 122 } 123 124 if (fault & VM_FAULT_OOM) { 125 /* 126 * We ran out of memory, call the OOM killer, and return the userspace 127 * (which will retry the fault, or kill us if we got oom-killed). 128 */ 129 pagefault_out_of_memory(); 130 return; 131 } else if (fault & (VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) { 132 /* Kernel mode? Handle exceptions or die */ 133 do_trap(regs, SIGBUS, BUS_ADRERR, addr); 134 return; 135 } else if (fault & VM_FAULT_SIGSEGV) { 136 do_trap(regs, SIGSEGV, SEGV_MAPERR, addr); 137 return; 138 } 139 140 BUG(); 141 } 142 143 static inline void 144 bad_area_nosemaphore(struct pt_regs *regs, int code, unsigned long addr) 145 { 146 /* 147 * Something tried to access memory that isn't in our memory map. 148 * Fix it, but check if it's kernel or user first. 149 */ 150 /* User mode accesses just cause a SIGSEGV */ 151 if (user_mode(regs)) { 152 do_trap(regs, SIGSEGV, code, addr); 153 return; 154 } 155 156 no_context(regs, addr); 157 } 158 159 static inline void 160 bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, 161 unsigned long addr) 162 { 163 mmap_read_unlock(mm); 164 165 bad_area_nosemaphore(regs, code, addr); 166 } 167 168 static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long addr) 169 { 170 pgd_t *pgd, *pgd_k; 171 pud_t *pud_k; 172 p4d_t *p4d_k; 173 pmd_t *pmd_k; 174 pte_t *pte_k; 175 int index; 176 unsigned long pfn; 177 178 /* User mode accesses just cause a SIGSEGV */ 179 if (user_mode(regs)) 180 return do_trap(regs, SIGSEGV, code, addr); 181 182 /* 183 * Synchronize this task's top level page-table 184 * with the 'reference' page table. 185 * 186 * Do _not_ use "tsk->active_mm->pgd" here. 187 * We might be inside an interrupt in the middle 188 * of a task switch. 189 */ 190 index = pgd_index(addr); 191 pfn = csr_read(CSR_SATP) & SATP_PPN; 192 pgd = (pgd_t *)pfn_to_virt(pfn) + index; 193 pgd_k = init_mm.pgd + index; 194 195 if (!pgd_present(pgdp_get(pgd_k))) { 196 no_context(regs, addr); 197 return; 198 } 199 set_pgd(pgd, pgdp_get(pgd_k)); 200 201 p4d_k = p4d_offset(pgd_k, addr); 202 if (!p4d_present(p4dp_get(p4d_k))) { 203 no_context(regs, addr); 204 return; 205 } 206 207 pud_k = pud_offset(p4d_k, addr); 208 if (!pud_present(pudp_get(pud_k))) { 209 no_context(regs, addr); 210 return; 211 } 212 if (pud_leaf(pudp_get(pud_k))) 213 goto flush_tlb; 214 215 /* 216 * Since the vmalloc area is global, it is unnecessary 217 * to copy individual PTEs 218 */ 219 pmd_k = pmd_offset(pud_k, addr); 220 if (!pmd_present(pmdp_get(pmd_k))) { 221 no_context(regs, addr); 222 return; 223 } 224 if (pmd_leaf(pmdp_get(pmd_k))) 225 goto flush_tlb; 226 227 /* 228 * Make sure the actual PTE exists as well to 229 * catch kernel vmalloc-area accesses to non-mapped 230 * addresses. If we don't do this, this will just 231 * silently loop forever. 232 */ 233 pte_k = pte_offset_kernel(pmd_k, addr); 234 if (!pte_present(ptep_get(pte_k))) { 235 no_context(regs, addr); 236 return; 237 } 238 239 /* 240 * The kernel assumes that TLBs don't cache invalid 241 * entries, but in RISC-V, SFENCE.VMA specifies an 242 * ordering constraint, not a cache flush; it is 243 * necessary even after writing invalid entries. 244 */ 245 flush_tlb: 246 local_flush_tlb_page(addr); 247 } 248 249 static inline bool access_error(unsigned long cause, struct vm_area_struct *vma) 250 { 251 switch (cause) { 252 case EXC_INST_PAGE_FAULT: 253 if (!(vma->vm_flags & VM_EXEC)) { 254 return true; 255 } 256 break; 257 case EXC_LOAD_PAGE_FAULT: 258 /* Write implies read */ 259 if (!(vma->vm_flags & (VM_READ | VM_WRITE))) { 260 return true; 261 } 262 break; 263 case EXC_STORE_PAGE_FAULT: 264 if (!(vma->vm_flags & VM_WRITE)) { 265 return true; 266 } 267 break; 268 default: 269 panic("%s: unhandled cause %lu", __func__, cause); 270 } 271 return false; 272 } 273 274 /* 275 * This routine handles page faults. It determines the address and the 276 * problem, and then passes it off to one of the appropriate routines. 277 */ 278 void handle_page_fault(struct pt_regs *regs) 279 { 280 struct task_struct *tsk; 281 struct vm_area_struct *vma; 282 struct mm_struct *mm; 283 unsigned long addr, cause; 284 unsigned int flags = FAULT_FLAG_DEFAULT; 285 int code = SEGV_MAPERR; 286 vm_fault_t fault; 287 288 cause = regs->cause; 289 addr = regs->badaddr; 290 291 tsk = current; 292 mm = tsk->mm; 293 294 if (kprobe_page_fault(regs, cause)) 295 return; 296 297 if (user_mode(regs)) 298 trace_page_fault_user(addr, regs, cause); 299 else 300 trace_page_fault_kernel(addr, regs, cause); 301 302 /* 303 * Fault-in kernel-space virtual memory on-demand. 304 * The 'reference' page table is init_mm.pgd. 305 * 306 * NOTE! We MUST NOT take any locks for this case. We may 307 * be in an interrupt or a critical region, and should 308 * only copy the information from the master page table, 309 * nothing more. 310 */ 311 if ((!IS_ENABLED(CONFIG_MMU) || !IS_ENABLED(CONFIG_64BIT)) && 312 unlikely(addr >= VMALLOC_START && addr < VMALLOC_END)) { 313 vmalloc_fault(regs, code, addr); 314 return; 315 } 316 317 /* Enable interrupts if they were enabled in the parent context. */ 318 if (!regs_irqs_disabled(regs)) 319 local_irq_enable(); 320 321 /* 322 * If we're in an interrupt, have no user context, or are running 323 * in an atomic region, then we must not take the fault. 324 */ 325 if (unlikely(faulthandler_disabled() || !mm)) { 326 tsk->thread.bad_cause = cause; 327 no_context(regs, addr); 328 return; 329 } 330 331 if (user_mode(regs)) 332 flags |= FAULT_FLAG_USER; 333 334 if (!user_mode(regs) && addr < TASK_SIZE && unlikely(!(regs->status & SR_SUM))) { 335 if (fixup_exception(regs)) 336 return; 337 338 die_kernel_fault("access to user memory without uaccess routines", addr, regs); 339 } 340 341 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); 342 343 if (cause == EXC_STORE_PAGE_FAULT) 344 flags |= FAULT_FLAG_WRITE; 345 else if (cause == EXC_INST_PAGE_FAULT) 346 flags |= FAULT_FLAG_INSTRUCTION; 347 if (!(flags & FAULT_FLAG_USER)) 348 goto lock_mmap; 349 350 vma = lock_vma_under_rcu(mm, addr); 351 if (!vma) 352 goto lock_mmap; 353 354 if (unlikely(access_error(cause, vma))) { 355 vma_end_read(vma); 356 count_vm_vma_lock_event(VMA_LOCK_SUCCESS); 357 tsk->thread.bad_cause = cause; 358 bad_area_nosemaphore(regs, SEGV_ACCERR, addr); 359 return; 360 } 361 362 fault = handle_mm_fault(vma, addr, flags | FAULT_FLAG_VMA_LOCK, regs); 363 if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) 364 vma_end_read(vma); 365 366 if (!(fault & VM_FAULT_RETRY)) { 367 count_vm_vma_lock_event(VMA_LOCK_SUCCESS); 368 goto done; 369 } 370 count_vm_vma_lock_event(VMA_LOCK_RETRY); 371 if (fault & VM_FAULT_MAJOR) 372 flags |= FAULT_FLAG_TRIED; 373 374 if (fault_signal_pending(fault, regs)) { 375 if (!user_mode(regs)) 376 no_context(regs, addr); 377 return; 378 } 379 lock_mmap: 380 381 retry: 382 vma = lock_mm_and_find_vma(mm, addr, regs); 383 if (unlikely(!vma)) { 384 tsk->thread.bad_cause = cause; 385 bad_area_nosemaphore(regs, code, addr); 386 return; 387 } 388 389 /* 390 * Ok, we have a good vm_area for this memory access, so 391 * we can handle it. 392 */ 393 code = SEGV_ACCERR; 394 395 if (unlikely(access_error(cause, vma))) { 396 tsk->thread.bad_cause = cause; 397 bad_area(regs, mm, code, addr); 398 return; 399 } 400 401 /* 402 * If for any reason at all we could not handle the fault, 403 * make sure we exit gracefully rather than endlessly redo 404 * the fault. 405 */ 406 fault = handle_mm_fault(vma, addr, flags, regs); 407 408 /* 409 * If we need to retry but a fatal signal is pending, handle the 410 * signal first. We do not need to release the mmap_lock because it 411 * would already be released in __lock_page_or_retry in mm/filemap.c. 412 */ 413 if (fault_signal_pending(fault, regs)) { 414 if (!user_mode(regs)) 415 no_context(regs, addr); 416 return; 417 } 418 419 /* The fault is fully completed (including releasing mmap lock) */ 420 if (fault & VM_FAULT_COMPLETED) 421 return; 422 423 if (unlikely(fault & VM_FAULT_RETRY)) { 424 flags |= FAULT_FLAG_TRIED; 425 426 /* 427 * No need to mmap_read_unlock(mm) as we would 428 * have already released it in __lock_page_or_retry 429 * in mm/filemap.c. 430 */ 431 goto retry; 432 } 433 434 mmap_read_unlock(mm); 435 436 done: 437 if (unlikely(fault & VM_FAULT_ERROR)) { 438 tsk->thread.bad_cause = cause; 439 mm_fault_error(regs, addr, fault); 440 return; 441 } 442 return; 443 } 444