// SPDX-License-Identifier: GPL-2.0-only
/*
 * Based on arch/arm/mm/fault.c
 *
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 1995-2004 Russell King
 * Copyright (C) 2012 ARM Ltd.
 */

#include <linux/acpi.h>
#include <linux/bitfield.h>
#include <linux/extable.h>
#include <linux/kfence.h>
#include <linux/signal.h>
#include <linux/mm.h>
#include <linux/hardirq.h>
#include <linux/init.h>
#include <linux/kasan.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
#include <linux/highmem.h>
#include <linux/perf_event.h>
#include <linux/pkeys.h>
#include <linux/preempt.h>
#include <linux/hugetlb.h>

#include <asm/acpi.h>
#include <asm/bug.h>
#include <asm/cmpxchg.h>
#include <asm/cpufeature.h>
#include <asm/efi.h>
#include <asm/exception.h>
#include <asm/daifflags.h>
#include <asm/debug-monitors.h>
#include <asm/esr.h>
#include <asm/kprobes.h>
#include <asm/mte.h>
#include <asm/processor.h>
#include <asm/sysreg.h>
#include <asm/system_misc.h>
#include <asm/tlbflush.h>
#include <asm/traps.h>
#include <asm/virt.h>

struct fault_info {
	int	(*fn)(unsigned long far, unsigned long esr,
		      struct pt_regs *regs);
	int	sig;
	int	code;
	const char *name;
};

static const struct fault_info fault_info[];

static inline const struct fault_info *esr_to_fault_info(unsigned long esr)
{
	return fault_info + (esr & ESR_ELx_FSC);
}

static void data_abort_decode(unsigned long esr)
{
	unsigned long iss2 = ESR_ELx_ISS2(esr);

	pr_alert("Data abort info:\n");

	if (esr & ESR_ELx_ISV) {
		pr_alert("  Access size = %u byte(s)\n",
			 1U << ((esr & ESR_ELx_SAS) >> ESR_ELx_SAS_SHIFT));
		pr_alert("  SSE = %lu, SRT = %lu\n",
			 (esr & ESR_ELx_SSE) >> ESR_ELx_SSE_SHIFT,
			 (esr & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT);
		pr_alert("  SF = %lu, AR = %lu\n",
			 (esr & ESR_ELx_SF) >> ESR_ELx_SF_SHIFT,
			 (esr & ESR_ELx_AR) >> ESR_ELx_AR_SHIFT);
	} else {
		pr_alert("  ISV = 0, ISS = 0x%08lx, ISS2 = 0x%08lx\n",
			 esr & ESR_ELx_ISS_MASK, iss2);
	}

	pr_alert("  CM = %lu, WnR = %lu, TnD = %lu, TagAccess = %lu\n",
		 (esr & ESR_ELx_CM) >> ESR_ELx_CM_SHIFT,
		 (esr & ESR_ELx_WNR) >> ESR_ELx_WNR_SHIFT,
		 (iss2 & ESR_ELx_TnD) >> ESR_ELx_TnD_SHIFT,
		 (iss2 & ESR_ELx_TagAccess) >> ESR_ELx_TagAccess_SHIFT);

	pr_alert("  GCS = %ld, Overlay = %lu, DirtyBit = %lu, Xs = %llu\n",
		 (iss2 & ESR_ELx_GCS) >> ESR_ELx_GCS_SHIFT,
		 (iss2 & ESR_ELx_Overlay) >> ESR_ELx_Overlay_SHIFT,
		 (iss2 & ESR_ELx_DirtyBit) >> ESR_ELx_DirtyBit_SHIFT,
		 (iss2 & ESR_ELx_Xs_MASK) >> ESR_ELx_Xs_SHIFT);
}

static void mem_abort_decode(unsigned long esr)
{
	pr_alert("Mem abort info:\n");

	pr_alert("  ESR = 0x%016lx\n", esr);
	pr_alert("  EC = 0x%02lx: %s, IL = %u bits\n",
		 ESR_ELx_EC(esr), esr_get_class_string(esr),
		 (esr & ESR_ELx_IL) ? 32 : 16);
	pr_alert("  SET = %lu, FnV = %lu\n",
		 (esr & ESR_ELx_SET_MASK) >> ESR_ELx_SET_SHIFT,
		 (esr & ESR_ELx_FnV) >> ESR_ELx_FnV_SHIFT);
	pr_alert("  EA = %lu, S1PTW = %lu\n",
		 (esr & ESR_ELx_EA) >> ESR_ELx_EA_SHIFT,
		 (esr & ESR_ELx_S1PTW) >> ESR_ELx_S1PTW_SHIFT);
	pr_alert("  FSC = 0x%02lx: %s\n", (esr & ESR_ELx_FSC),
		 esr_to_fault_info(esr)->name);

	if (esr_is_data_abort(esr))
		data_abort_decode(esr);
}

static inline unsigned long mm_to_pgd_phys(struct mm_struct *mm)
{
	/* Either init_pg_dir or swapper_pg_dir */
	if (mm == &init_mm)
		return __pa_symbol(mm->pgd);

	return (unsigned long)virt_to_phys(mm->pgd);
}

/*
 * Dump out the page tables associated with 'addr' in the currently active mm.
 */
static void show_pte(unsigned long addr)
{
	struct mm_struct *mm;
	pgd_t *pgdp;
	pgd_t pgd;

	if (is_ttbr0_addr(addr)) {
		/* TTBR0 */
		mm = current->active_mm;
		if (mm == &init_mm) {
			pr_alert("[%016lx] user address but active_mm is swapper\n",
				 addr);
			return;
		}
	} else if (is_ttbr1_addr(addr)) {
		/* TTBR1 */
		mm = &init_mm;
	} else {
		pr_alert("[%016lx] address between user and kernel address ranges\n",
			 addr);
		return;
	}

	pr_alert("%s pgtable: %luk pages, %llu-bit VAs, pgdp=%016lx\n",
		 mm == &init_mm ? "swapper" : "user", PAGE_SIZE / SZ_1K,
		 vabits_actual, mm_to_pgd_phys(mm));
	pgdp = pgd_offset(mm, addr);
	pgd = READ_ONCE(*pgdp);
	pr_alert("[%016lx] pgd=%016llx", addr, pgd_val(pgd));

	do {
		p4d_t *p4dp, p4d;
		pud_t *pudp, pud;
		pmd_t *pmdp, pmd;
		pte_t *ptep, pte;

		if (pgd_none(pgd) || pgd_bad(pgd))
			break;

		p4dp = p4d_offset(pgdp, addr);
		p4d = READ_ONCE(*p4dp);
		pr_cont(", p4d=%016llx", p4d_val(p4d));
		if (p4d_none(p4d) || p4d_bad(p4d))
			break;

		pudp = pud_offset(p4dp, addr);
		pud = READ_ONCE(*pudp);
		pr_cont(", pud=%016llx", pud_val(pud));
		if (pud_none(pud) || pud_bad(pud))
			break;

		pmdp = pmd_offset(pudp, addr);
		pmd = READ_ONCE(*pmdp);
		pr_cont(", pmd=%016llx", pmd_val(pmd));
		if (pmd_none(pmd) || pmd_bad(pmd))
			break;

		ptep = pte_offset_map(pmdp, addr);
		if (!ptep)
			break;

		pte = __ptep_get(ptep);
		pr_cont(", pte=%016llx", pte_val(pte));
		pte_unmap(ptep);
	} while (0);

	pr_cont("\n");
}

/*
 * This function sets the access flags (dirty, accessed), as well as write
 * permission, and only to a more permissive setting.
 *
 * It needs to cope with hardware update of the accessed/dirty state by other
 * agents in the system and can safely skip the __sync_icache_dcache() call as,
 * like __set_ptes(), the PTE is never changed from no-exec to exec here.
 *
 * Returns whether or not the PTE actually changed.
 */
int __ptep_set_access_flags_anysz(struct vm_area_struct *vma,
				  unsigned long address, pte_t *ptep,
				  pte_t entry, int dirty, unsigned long pgsize)
{
	pteval_t old_pteval, pteval;
	pte_t pte = __ptep_get(ptep);
	int level;

	if (pte_same(pte, entry))
		return 0;

	/* only preserve the access flags and write permission */
	pte_val(entry) &= PTE_RDONLY | PTE_AF | PTE_WRITE | PTE_DIRTY;

	/*
	 * Setting the flags must be done atomically to avoid racing with the
	 * hardware update of the access/dirty state. The PTE_RDONLY bit must
	 * be set to the most permissive (lowest value) of *ptep and entry
	 * (calculated as: a & b == ~(~a | ~b)).
	 */
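	/*
	 * Illustrative note (added, not part of the original source): for
	 * the PTE_RDONLY bit the XOR/OR/XOR sequence below computes a
	 * logical AND, e.g. a read-only *ptep (PTE_RDONLY set) merged with
	 * a writable 'entry' (PTE_RDONLY clear) leaves PTE_RDONLY clear,
	 * while the PTE_AF/PTE_WRITE/PTE_DIRTY bits are simply OR-ed in.
	 */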
	pte_val(entry) ^= PTE_RDONLY;
	pteval = pte_val(pte);
	do {
		old_pteval = pteval;
		pteval ^= PTE_RDONLY;
		pteval |= pte_val(entry);
		pteval ^= PTE_RDONLY;
		pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval);
	} while (pteval != old_pteval);

	/*
	 * Invalidate the local stale read-only entry. Remote stale entries
	 * may still cause page faults and be invalidated via
	 * flush_tlb_fix_spurious_fault().
	 */
	if (dirty) {
		switch (pgsize) {
		case PAGE_SIZE:
			level = 3;
			break;
		case PMD_SIZE:
			level = 2;
			break;
#ifndef __PAGETABLE_PMD_FOLDED
		case PUD_SIZE:
			level = 1;
			break;
#endif
		default:
			level = TLBI_TTL_UNKNOWN;
			WARN_ON(1);
		}

		__flush_tlb_range(vma, address, address + pgsize, pgsize, level,
				  TLBF_NOWALKCACHE | TLBF_NOBROADCAST);
	}
	return 1;
}

static bool is_el1_instruction_abort(unsigned long esr)
{
	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR;
}

static bool is_el1_data_abort(unsigned long esr)
{
	return ESR_ELx_EC(esr) == ESR_ELx_EC_DABT_CUR;
}

static inline bool is_el1_permission_fault(unsigned long addr, unsigned long esr,
					   struct pt_regs *regs)
{
	if (!is_el1_data_abort(esr) && !is_el1_instruction_abort(esr))
		return false;

	if (esr_fsc_is_permission_fault(esr))
		return true;

	if (is_ttbr0_addr(addr) && system_uses_ttbr0_pan())
		return esr_fsc_is_translation_fault(esr) &&
			(regs->pstate & PSR_PAN_BIT);

	return false;
}

static bool is_pkvm_stage2_abort(unsigned int esr)
{
	/*
	 * S1PTW should only ever be set in ESR_EL1 if the pkvm hypervisor
	 * injected a stage-2 abort -- see host_inject_mem_abort().
	 */
	return is_pkvm_initialized() && (esr & ESR_ELx_S1PTW);
}

static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr,
							unsigned long esr,
							struct pt_regs *regs)
{
	unsigned long flags;
	u64 par, dfsc;

	if (!is_el1_data_abort(esr) || !esr_fsc_is_translation_fault(esr))
		return false;

	local_irq_save(flags);
	asm volatile("at s1e1r, %0" :: "r" (addr));
	isb();
	par = read_sysreg_par();
	local_irq_restore(flags);

	/*
	 * If we now have a valid translation, treat the translation fault as
	 * spurious.
	 */
	if (!(par & SYS_PAR_EL1_F)) {
		if (is_pkvm_stage2_abort(esr)) {
			par &= SYS_PAR_EL1_PA;
			return pkvm_force_reclaim_guest_page(par);
		}

		return true;
	}

	/*
	 * If we got a different type of fault from the AT instruction,
	 * treat the translation fault as spurious.
	 */
	dfsc = FIELD_GET(SYS_PAR_EL1_FST, par);
	return !esr_fsc_is_translation_fault(dfsc);
}

static void die_kernel_fault(const char *msg, unsigned long addr,
			     unsigned long esr, struct pt_regs *regs)
{
	bust_spinlocks(1);

	pr_alert("Unable to handle kernel %s at virtual address %016lx\n", msg,
		 addr);

	kasan_non_canonical_hook(addr);

	mem_abort_decode(esr);

	show_pte(addr);
	die("Oops", regs, esr);
	bust_spinlocks(0);
	make_task_dead(SIGKILL);
}

#ifdef CONFIG_KASAN_HW_TAGS
static void report_tag_fault(unsigned long addr, unsigned long esr,
			     struct pt_regs *regs)
{
	/*
	 * SAS bits aren't set for all faults reported in EL1, so we can't
	 * find out access size.
	 */
	bool is_write = !!(esr & ESR_ELx_WNR);
	kasan_report((void *)addr, 0, is_write, regs->pc);
}
#else
/* Tag faults aren't enabled without CONFIG_KASAN_HW_TAGS. */
static inline void report_tag_fault(unsigned long addr, unsigned long esr,
				    struct pt_regs *regs) { }
#endif

static void do_tag_recovery(unsigned long addr, unsigned long esr,
			    struct pt_regs *regs)
{

	report_tag_fault(addr, esr, regs);

	/*
	 * Disable MTE Tag Checking on the local CPU for the current EL.
	 * It will be done lazily on the other CPUs when they hit a
	 * tag fault.
	 */
	sysreg_clear_set(sctlr_el1, SCTLR_EL1_TCF_MASK,
			 SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF, NONE));
	isb();
}

static bool is_el1_mte_sync_tag_check_fault(unsigned long esr)
{
	unsigned long fsc = esr & ESR_ELx_FSC;

	if (!is_el1_data_abort(esr))
		return false;

	if (fsc == ESR_ELx_FSC_MTE)
		return true;

	return false;
}

static void __do_kernel_fault(unsigned long addr, unsigned long esr,
			      struct pt_regs *regs)
{
	const char *msg;

	/*
	 * Are we prepared to handle this kernel fault?
	 * We are almost certainly not prepared to handle instruction faults.
	 */
	if (!is_el1_instruction_abort(esr) && fixup_exception(regs, esr))
		return;

	if (is_spurious_el1_translation_fault(addr, esr, regs)) {
		WARN_RATELIMIT(!is_pkvm_stage2_abort(esr),
			       "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr);
		return;
	}

	if (is_el1_mte_sync_tag_check_fault(esr)) {
		do_tag_recovery(addr, esr, regs);

		return;
	}

	if (is_el1_permission_fault(addr, esr, regs)) {
		if (esr & ESR_ELx_WNR)
			msg = "write to read-only memory";
		else if (is_el1_instruction_abort(esr))
			msg = "execute from non-executable memory";
		else
			msg = "read from unreadable memory";
	} else if (addr < PAGE_SIZE) {
		msg = "NULL pointer dereference";
	} else if (is_pkvm_stage2_abort(esr)) {
		msg = "access to hypervisor-protected memory";
	} else {
		if (esr_fsc_is_translation_fault(esr) &&
		    kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
			return;

		msg = "paging request";
	}

	if (efi_runtime_fixup_exception(regs, msg))
		return;

	die_kernel_fault(msg, addr, esr, regs);
}

static void set_thread_esr(unsigned long address, unsigned long esr)
{
	current->thread.fault_address = address;

	/*
	 * If the faulting address is in the kernel, we must sanitize the ESR.
	 * From userspace's point of view, kernel-only mappings don't exist
	 * at all, so we report them as level 0 translation faults.
	 * (This is not quite the way that "no mapping there at all" behaves:
	 * an alignment fault not caused by the memory type would take
	 * precedence over translation fault for a real access to empty
	 * space. Unfortunately we can't easily distinguish "alignment fault
	 * not caused by memory type" from "alignment fault caused by memory
	 * type", so we ignore this wrinkle and just return the translation
	 * fault.)
	 */
	if (!is_ttbr0_addr(current->thread.fault_address)) {
		switch (ESR_ELx_EC(esr)) {
		case ESR_ELx_EC_DABT_LOW:
			/*
			 * These bits provide only information about the
			 * faulting instruction, which userspace knows already.
			 * We explicitly clear bits which are architecturally
			 * RES0 in case they are given meanings in future.
			 * We always report the ESR as if the fault was taken
			 * to EL1 and so ISV and the bits in ISS[23:14] are
			 * clear. (In fact it always will be a fault to EL1.)
			 */
			esr &= ESR_ELx_EC_MASK | ESR_ELx_IL |
				ESR_ELx_CM | ESR_ELx_WNR;
			esr |= ESR_ELx_FSC_FAULT;
			break;
		case ESR_ELx_EC_IABT_LOW:
			/*
			 * Claim a level 0 translation fault.
			 * All other bits are architecturally RES0 for faults
			 * reported with that DFSC value, so we clear them.
			 */
			esr &= ESR_ELx_EC_MASK | ESR_ELx_IL;
			esr |= ESR_ELx_FSC_FAULT;
			break;
		default:
			/*
			 * This should never happen (entry.S only brings us
			 * into this code for insn and data aborts from a lower
			 * exception level). Fail safe by not providing an ESR
			 * context record at all.
			 */
			WARN(1, "ESR 0x%lx is not DABT or IABT from EL0\n", esr);
			esr = 0;
			break;
		}
	}

	current->thread.fault_code = esr;
}

static void do_bad_area(unsigned long far, unsigned long esr,
			struct pt_regs *regs)
{
	unsigned long addr = untagged_addr(far);

	/*
	 * If we are in kernel mode at this point, we have no context to
	 * handle this fault with.
	 */
	if (user_mode(regs)) {
		const struct fault_info *inf = esr_to_fault_info(esr);

		set_thread_esr(addr, esr);
		arm64_force_sig_fault(inf->sig, inf->code, far, inf->name);
	} else {
		__do_kernel_fault(addr, esr, regs);
	}
}

static bool fault_from_pkey(struct vm_area_struct *vma, unsigned int mm_flags)
{
	if (!system_supports_poe())
		return false;

	/*
	 * We do not check whether an Overlay fault has occurred because we
	 * cannot make a decision based solely on its value:
	 *
	 * - If Overlay is set, a fault did occur due to POE, but it may be
	 *   spurious in those cases where we update POR_EL0 without ISB (e.g.
	 *   on context-switch). We would then need to manually check POR_EL0
	 *   against vma_pkey(vma), which is exactly what
	 *   arch_vma_access_permitted() does.
	 *
	 * - If Overlay is not set, we may still need to report a pkey fault.
	 *   This is the case if an access was made within a mapping but with no
	 *   page mapped, and POR_EL0 forbids the access (according to
	 *   vma_pkey()). Such access will result in a SIGSEGV regardless
	 *   because core code checks arch_vma_access_permitted(), but in order
	 *   to report the correct error code - SEGV_PKUERR - we must handle
	 *   that case here.
	 */
	return !arch_vma_access_permitted(vma,
			mm_flags & FAULT_FLAG_WRITE,
			mm_flags & FAULT_FLAG_INSTRUCTION,
			false);
}

static bool is_gcs_fault(unsigned long esr)
{
	if (!esr_is_data_abort(esr))
		return false;

	return ESR_ELx_ISS2(esr) & ESR_ELx_GCS;
}

static bool is_el0_instruction_abort(unsigned long esr)
{
	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW;
}

/*
 * Note: not valid for EL1 DC IVAC, but we never use that such that it
 * should fault. EL0 cannot issue DC IVAC (undef).
 */
static bool is_write_abort(unsigned long esr)
{
	return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
}

static bool is_invalid_gcs_access(struct vm_area_struct *vma, u64 esr)
{
	if (!system_supports_gcs())
		return false;

	if (unlikely(is_gcs_fault(esr))) {
		/* GCS accesses must be performed on a GCS page */
		if (!(vma->vm_flags & VM_SHADOW_STACK))
			return true;
	} else if (unlikely(vma->vm_flags & VM_SHADOW_STACK)) {
		/* Only GCS operations can write to a GCS page */
		return esr_is_data_abort(esr) && is_write_abort(esr);
	}

	return false;
}

static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
				   struct pt_regs *regs)
{
	const struct fault_info *inf;
	struct mm_struct *mm = current->mm;
	vm_fault_t fault;
	vm_flags_t vm_flags;
	unsigned int mm_flags = FAULT_FLAG_DEFAULT;
	unsigned long addr = untagged_addr(far);
	struct vm_area_struct *vma;
	int si_code;
	int pkey = -1;

	if (kprobe_page_fault(regs, esr))
		return 0;

	/*
	 * If we're in an interrupt or have no user context, we must not take
	 * the fault.
	 */
	if (faulthandler_disabled() || !mm)
		goto no_context;

	if (user_mode(regs))
		mm_flags |= FAULT_FLAG_USER;

	/*
	 * vm_flags tells us what bits we must have in vma->vm_flags for the
	 * fault to be benign; __do_page_fault() would check
	 * vma->vm_flags & vm_flags and return an error if the intersection
	 * is empty.
	 */
	if (is_el0_instruction_abort(esr)) {
		/* It was exec fault */
		vm_flags = VM_EXEC;
		mm_flags |= FAULT_FLAG_INSTRUCTION;
	} else if (is_gcs_fault(esr)) {
		/*
		 * The GCS permission on a page implies both read and
		 * write so always handle any GCS fault as a write fault;
		 * we need to trigger CoW even for GCS reads.
		 */
		vm_flags = VM_WRITE;
		mm_flags |= FAULT_FLAG_WRITE;
	} else if (is_write_abort(esr)) {
		/* It was write fault */
		vm_flags = VM_WRITE;
		mm_flags |= FAULT_FLAG_WRITE;
	} else {
		/* It was read fault */
		vm_flags = VM_READ;
		/* Write implies read */
		vm_flags |= VM_WRITE;
		/* If EPAN is absent then exec implies read */
		if (!alternative_has_cap_unlikely(ARM64_HAS_EPAN))
			vm_flags |= VM_EXEC;
	}

	if (is_ttbr0_addr(addr) && is_el1_permission_fault(addr, esr, regs)) {
		if (is_el1_instruction_abort(esr))
			die_kernel_fault("execution of user memory",
					 addr, esr, regs);

		if (!insn_may_access_user(regs->pc, esr))
			die_kernel_fault("access to user memory outside uaccess routines",
					 addr, esr, regs);
	}

	if (is_pkvm_stage2_abort(esr)) {
		if (!user_mode(regs))
			goto no_context;
		arm64_force_sig_fault(SIGSEGV, SEGV_ACCERR, far, "stage-2 fault");
		return 0;
	}

	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);

	if (!(mm_flags & FAULT_FLAG_USER))
		goto lock_mmap;

	vma = lock_vma_under_rcu(mm, addr);
	if (!vma)
		goto lock_mmap;

	if (is_invalid_gcs_access(vma, esr)) {
		vma_end_read(vma);
		fault = 0;
		si_code = SEGV_ACCERR;
		goto bad_area;
	}

	if (!(vma->vm_flags & vm_flags)) {
		vma_end_read(vma);
		fault = 0;
		si_code = SEGV_ACCERR;
		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
		goto bad_area;
	}

	if (fault_from_pkey(vma, mm_flags)) {
		pkey = vma_pkey(vma);
		vma_end_read(vma);
		fault = 0;
		si_code = SEGV_PKUERR;
		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
		goto bad_area;
	}

	fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs);
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		vma_end_read(vma);

	if (!(fault & VM_FAULT_RETRY)) {
		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
		goto done;
	}
	count_vm_vma_lock_event(VMA_LOCK_RETRY);
	if (fault & VM_FAULT_MAJOR)
		mm_flags |= FAULT_FLAG_TRIED;

	/* Quick path to respond to signals */
	if (fault_signal_pending(fault, regs)) {
		if (!user_mode(regs))
			goto no_context;
		return 0;
	}
lock_mmap:

retry:
	vma = lock_mm_and_find_vma(mm, addr, regs);
	if (unlikely(!vma)) {
		fault = 0;
		si_code = SEGV_MAPERR;
		goto bad_area;
	}

	if (!(vma->vm_flags & vm_flags)) {
		mmap_read_unlock(mm);
		fault = 0;
		si_code = SEGV_ACCERR;
		goto bad_area;
	}

	if (fault_from_pkey(vma, mm_flags)) {
		pkey = vma_pkey(vma);
		mmap_read_unlock(mm);
		fault = 0;
		si_code = SEGV_PKUERR;
		goto bad_area;
	}

	fault = handle_mm_fault(vma, addr, mm_flags, regs);

	/* Quick path to respond to signals */
	if (fault_signal_pending(fault, regs)) {
		if (!user_mode(regs))
			goto no_context;
		return 0;
	}

	/* The fault is fully completed (including releasing mmap lock) */
	if (fault & VM_FAULT_COMPLETED)
		return 0;

	if (fault & VM_FAULT_RETRY) {
		mm_flags |= FAULT_FLAG_TRIED;
		goto retry;
	}
	mmap_read_unlock(mm);

done:
	/* Handle the "normal" (no error) case first. */
	if (likely(!(fault & VM_FAULT_ERROR)))
		return 0;

	si_code = SEGV_MAPERR;
bad_area:
	/*
	 * If we are in kernel mode at this point, we have no context to
	 * handle this fault with.
	 */
	if (!user_mode(regs))
		goto no_context;

	if (fault & VM_FAULT_OOM) {
		/*
		 * We ran out of memory, call the OOM killer, and return to
		 * userspace (which will retry the fault, or kill us if we got
		 * oom-killed).
		 */
		pagefault_out_of_memory();
		return 0;
	}

	inf = esr_to_fault_info(esr);
	set_thread_esr(addr, esr);
	if (fault & VM_FAULT_SIGBUS) {
		/*
		 * We had some memory, but were unable to successfully fix up
		 * this page fault.
		 */
		arm64_force_sig_fault(SIGBUS, BUS_ADRERR, far, inf->name);
	} else if (fault & (VM_FAULT_HWPOISON_LARGE | VM_FAULT_HWPOISON)) {
		unsigned int lsb;

		lsb = PAGE_SHIFT;
		if (fault & VM_FAULT_HWPOISON_LARGE)
			lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));

		arm64_force_sig_mceerr(BUS_MCEERR_AR, far, lsb, inf->name);
	} else {
		/*
		 * The pkey value that we return to userspace can be different
		 * from the pkey that caused the fault.
		 *
		 * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4);
		 * 2. T1 : set POR_EL0 to deny access to pkey=4, touches page
		 * 3. T1 : faults...
		 * 4. T2 : mprotect_key(foo, PAGE_SIZE, pkey=5);
		 * 5. T1 : enters fault handler, takes mmap_lock, etc...
		 * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
		 *	   faulted on a pte with its pkey=4.
		 */
		/* Something tried to access memory that is outside the memory map */
		if (si_code == SEGV_PKUERR)
			arm64_force_sig_fault_pkey(far, inf->name, pkey);
		else
			arm64_force_sig_fault(SIGSEGV, si_code, far, inf->name);
	}

	return 0;

no_context:
	__do_kernel_fault(addr, esr, regs);
	return 0;
}

static int __kprobes do_translation_fault(unsigned long far,
					  unsigned long esr,
					  struct pt_regs *regs)
{
	unsigned long addr = untagged_addr(far);

	if (is_ttbr0_addr(addr))
		return do_page_fault(far, esr, regs);

	do_bad_area(far, esr, regs);
	return 0;
}

static int do_alignment_fault(unsigned long far, unsigned long esr,
			      struct pt_regs *regs)
{
	if (IS_ENABLED(CONFIG_COMPAT_ALIGNMENT_FIXUPS) &&
	    compat_user_mode(regs))
		return do_compat_alignment_fixup(far, regs);
	do_bad_area(far, esr, regs);
	return 0;
}

static int do_bad(unsigned long far, unsigned long esr, struct pt_regs *regs)
{
	return 1; /* "fault" */
}

static int do_sea(unsigned long far, unsigned long esr, struct pt_regs *regs)
{
	const struct fault_info *inf;
	unsigned long siaddr;

	inf = esr_to_fault_info(esr);

	if (user_mode(regs) && apei_claim_sea(regs) == 0) {
		/*
		 * APEI claimed this as a firmware-first notification.
		 * Some processing deferred to task_work before ret_to_user().
		 */
		return 0;
	}

	if (esr & ESR_ELx_FnV) {
		siaddr = 0;
	} else {
		/*
		 * The architecture specifies that the tag bits of FAR_EL1 are
		 * UNKNOWN for synchronous external aborts. Mask them out now
		 * so that userspace doesn't see them.
		 */
		siaddr = untagged_addr(far);
	}
	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_STILL_OK);
	arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);

	return 0;
}

static int do_tag_check_fault(unsigned long far, unsigned long esr,
			      struct pt_regs *regs)
{
	/*
	 * The architecture specifies that bits 63:60 of FAR_EL1 are UNKNOWN
	 * for tag check faults. Set them to corresponding bits in the untagged
	 * address if ARM64_MTE_FAR isn't supported.
	 * Otherwise, bits 63:60 of FAR_EL1 are not UNKNOWN.
	 */
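	/*
	 * Added clarification (assumption: MTE_TAG_MASK covers the logical
	 * tag in bits 59:56 and __untagged_addr() sign-extends from bit 55,
	 * as on current kernels): the statement below rebuilds bits 63:60
	 * from the untagged address while preserving the MTE tag bits that
	 * were reported in FAR_EL1.
	 */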
	if (!cpus_have_cap(ARM64_MTE_FAR))
		far = (__untagged_addr(far) & ~MTE_TAG_MASK) | (far & MTE_TAG_MASK);

	do_bad_area(far, esr, regs);
	return 0;
}

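/*
 * Note (added for clarity): this table is indexed by the 6-bit ESR_ELx.FSC
 * value (0-63); see esr_to_fault_info() above.
 */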
static const struct fault_info fault_info[] = {
	{ do_bad,		SIGKILL, SI_KERNEL,	"ttbr address size fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"level 1 address size fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"level 2 address size fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"level 3 address size fault" },
	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 0 translation fault" },
	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 1 translation fault" },
	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 2 translation fault" },
	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 3 translation fault" },
	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 0 access flag fault" },
	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 1 access flag fault" },
	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 2 access flag fault" },
	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 3 access flag fault" },
	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 0 permission fault" },
	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 1 permission fault" },
	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 2 permission fault" },
	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 3 permission fault" },
	{ do_sea,		SIGBUS,  BUS_OBJERR,	"synchronous external abort" },
	{ do_tag_check_fault,	SIGSEGV, SEGV_MTESERR,	"synchronous tag check fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 18" },
	{ do_sea,		SIGKILL, SI_KERNEL,	"level -1 (translation table walk)" },
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 0 (translation table walk)" },
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 1 (translation table walk)" },
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 2 (translation table walk)" },
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 3 (translation table walk)" },
	{ do_sea,		SIGBUS,  BUS_OBJERR,	"synchronous parity or ECC error" },	// Reserved when RAS is implemented
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 25" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 26" },
	{ do_sea,		SIGKILL, SI_KERNEL,	"level -1 synchronous parity error (translation table walk)" },	// Reserved when RAS is implemented
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 0 synchronous parity error (translation table walk)" },	// Reserved when RAS is implemented
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 1 synchronous parity error (translation table walk)" },	// Reserved when RAS is implemented
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 2 synchronous parity error (translation table walk)" },	// Reserved when RAS is implemented
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 3 synchronous parity error (translation table walk)" },	// Reserved when RAS is implemented
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 32" },
	{ do_alignment_fault,	SIGBUS,  BUS_ADRALN,	"alignment fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 34" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 35" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 36" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 37" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 38" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 39" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 40" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"level -1 address size fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 42" },
	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level -1 translation fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 44" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 45" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 46" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 47" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"TLB conflict abort" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"Unsupported atomic hardware update fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 50" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 51" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"implementation fault (lockdown abort)" },
	{ do_bad,		SIGBUS,  BUS_OBJERR,	"implementation fault (unsupported exclusive)" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 54" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 55" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 56" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 57" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 58" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 59" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 60" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"section domain fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"page domain fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 63" },
};

void do_mem_abort(unsigned long far, unsigned long esr, struct pt_regs *regs)
{
	const struct fault_info *inf = esr_to_fault_info(esr);
	unsigned long addr = untagged_addr(far);

	if (!inf->fn(far, esr, regs))
		return;

	if (!user_mode(regs))
		die_kernel_fault(inf->name, addr, esr, regs);

	/*
	 * At this point we have an unrecognized fault type whose tag bits may
	 * have been defined as UNKNOWN. Therefore we only expose the untagged
	 * address to the signal handler.
	 */
	arm64_notify_die(inf->name, regs, inf->sig, inf->code, addr, esr);
}
NOKPROBE_SYMBOL(do_mem_abort);

void do_sp_pc_abort(unsigned long addr, unsigned long esr, struct pt_regs *regs)
{
	arm64_notify_die("SP/PC alignment exception", regs, SIGBUS, BUS_ADRALN,
			 addr, esr);
}
NOKPROBE_SYMBOL(do_sp_pc_abort);

/*
 * Used during anonymous page fault handling.
 */
struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
					     unsigned long vaddr)
{
	gfp_t flags = GFP_HIGHUSER_MOVABLE | __GFP_ZERO;

	/*
	 * If the page is mapped with PROT_MTE, initialise the tags at the
	 * point of allocation and page zeroing as this is usually faster than
	 * separate DC ZVA and STGM.
	 */
	if (vma->vm_flags & VM_MTE)
		flags |= __GFP_ZEROTAGS;

	return vma_alloc_folio(flags, 0, vma, vaddr);
}

bool tag_clear_highpages(struct page *page, int numpages)
{
	/*
	 * Check if MTE is supported and fall back to clear_highpage().
	 * get_huge_zero_folio() unconditionally passes __GFP_ZEROTAGS and
	 * post_alloc_hook() will invoke tag_clear_highpages().
	 */
	if (!system_supports_mte())
		return false;

	/* Newly allocated pages shouldn't have been tagged yet */
	for (int i = 0; i < numpages; i++, page++) {
		WARN_ON_ONCE(!try_page_mte_tagging(page));
		mte_zero_clear_page_tags(page_address(page));
		set_page_mte_tagged(page);
	}
	return true;
}