// SPDX-License-Identifier: GPL-2.0-only
/*
 * Based on arch/arm/mm/fault.c
 *
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 1995-2004 Russell King
 * Copyright (C) 2012 ARM Ltd.
 */

#include <linux/acpi.h>
#include <linux/bitfield.h>
#include <linux/extable.h>
#include <linux/kfence.h>
#include <linux/signal.h>
#include <linux/mm.h>
#include <linux/hardirq.h>
#include <linux/init.h>
#include <linux/kasan.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
#include <linux/highmem.h>
#include <linux/perf_event.h>
#include <linux/pkeys.h>
#include <linux/preempt.h>
#include <linux/hugetlb.h>

#include <asm/acpi.h>
#include <asm/bug.h>
#include <asm/cmpxchg.h>
#include <asm/cpufeature.h>
#include <asm/efi.h>
#include <asm/exception.h>
#include <asm/daifflags.h>
#include <asm/debug-monitors.h>
#include <asm/esr.h>
#include <asm/kprobes.h>
#include <asm/mte.h>
#include <asm/processor.h>
#include <asm/sysreg.h>
#include <asm/system_misc.h>
#include <asm/tlbflush.h>
#include <asm/traps.h>

struct fault_info {
	int (*fn)(unsigned long far, unsigned long esr,
		  struct pt_regs *regs);
	int sig;
	int code;
	const char *name;
};

static const struct fault_info fault_info[];

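/*
 * The fault status code (FSC) occupies the low six bits of the ESR and
 * indexes directly into the 64-entry fault_info[] table defined below.
 */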
static inline const struct fault_info *esr_to_fault_info(unsigned long esr)
{
	return fault_info + (esr & ESR_ELx_FSC);
}

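/*
 * Decode and print the data-abort-specific syndrome fields: the
 * register-transfer ISS bits when ISV is set (or the raw ISS/ISS2 values
 * otherwise), followed by the cache maintenance, write-not-read and
 * ISS2 (MTE/GCS/permission-overlay) bits.
 */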
static void data_abort_decode(unsigned long esr)
{
	unsigned long iss2 = ESR_ELx_ISS2(esr);

	pr_alert("Data abort info:\n");

	if (esr & ESR_ELx_ISV) {
		pr_alert(" Access size = %u byte(s)\n",
			 1U << ((esr & ESR_ELx_SAS) >> ESR_ELx_SAS_SHIFT));
		pr_alert(" SSE = %lu, SRT = %lu\n",
			 (esr & ESR_ELx_SSE) >> ESR_ELx_SSE_SHIFT,
			 (esr & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT);
		pr_alert(" SF = %lu, AR = %lu\n",
			 (esr & ESR_ELx_SF) >> ESR_ELx_SF_SHIFT,
			 (esr & ESR_ELx_AR) >> ESR_ELx_AR_SHIFT);
	} else {
		pr_alert(" ISV = 0, ISS = 0x%08lx, ISS2 = 0x%08lx\n",
			 esr & ESR_ELx_ISS_MASK, iss2);
	}

	pr_alert(" CM = %lu, WnR = %lu, TnD = %lu, TagAccess = %lu\n",
		 (esr & ESR_ELx_CM) >> ESR_ELx_CM_SHIFT,
		 (esr & ESR_ELx_WNR) >> ESR_ELx_WNR_SHIFT,
		 (iss2 & ESR_ELx_TnD) >> ESR_ELx_TnD_SHIFT,
		 (iss2 & ESR_ELx_TagAccess) >> ESR_ELx_TagAccess_SHIFT);

	pr_alert(" GCS = %ld, Overlay = %lu, DirtyBit = %lu, Xs = %llu\n",
		 (iss2 & ESR_ELx_GCS) >> ESR_ELx_GCS_SHIFT,
		 (iss2 & ESR_ELx_Overlay) >> ESR_ELx_Overlay_SHIFT,
		 (iss2 & ESR_ELx_DirtyBit) >> ESR_ELx_DirtyBit_SHIFT,
		 (iss2 & ESR_ELx_Xs_MASK) >> ESR_ELx_Xs_SHIFT);
}

static void mem_abort_decode(unsigned long esr)
{
	pr_alert("Mem abort info:\n");

	pr_alert(" ESR = 0x%016lx\n", esr);
	pr_alert(" EC = 0x%02lx: %s, IL = %u bits\n",
		 ESR_ELx_EC(esr), esr_get_class_string(esr),
		 (esr & ESR_ELx_IL) ? 32 : 16);
	pr_alert(" SET = %lu, FnV = %lu\n",
		 (esr & ESR_ELx_SET_MASK) >> ESR_ELx_SET_SHIFT,
		 (esr & ESR_ELx_FnV) >> ESR_ELx_FnV_SHIFT);
	pr_alert(" EA = %lu, S1PTW = %lu\n",
		 (esr & ESR_ELx_EA) >> ESR_ELx_EA_SHIFT,
		 (esr & ESR_ELx_S1PTW) >> ESR_ELx_S1PTW_SHIFT);
	pr_alert(" FSC = 0x%02lx: %s\n", (esr & ESR_ELx_FSC),
		 esr_to_fault_info(esr)->name);

	if (esr_is_data_abort(esr))
		data_abort_decode(esr);
}

static inline unsigned long mm_to_pgd_phys(struct mm_struct *mm)
{
	/* Either init_pg_dir or swapper_pg_dir */
	if (mm == &init_mm)
		return __pa_symbol(mm->pgd);

	return (unsigned long)virt_to_phys(mm->pgd);
}

/*
 * Dump out the page tables associated with 'addr' in the currently active mm.
 */
static void show_pte(unsigned long addr)
{
	struct mm_struct *mm;
	pgd_t *pgdp;
	pgd_t pgd;

	if (is_ttbr0_addr(addr)) {
		/* TTBR0 */
		mm = current->active_mm;
		if (mm == &init_mm) {
			pr_alert("[%016lx] user address but active_mm is swapper\n",
				 addr);
			return;
		}
	} else if (is_ttbr1_addr(addr)) {
		/* TTBR1 */
		mm = &init_mm;
	} else {
		pr_alert("[%016lx] address between user and kernel address ranges\n",
			 addr);
		return;
	}

	pr_alert("%s pgtable: %luk pages, %llu-bit VAs, pgdp=%016lx\n",
		 mm == &init_mm ? "swapper" : "user", PAGE_SIZE / SZ_1K,
		 vabits_actual, mm_to_pgd_phys(mm));
	pgdp = pgd_offset(mm, addr);
	pgd = READ_ONCE(*pgdp);
	pr_alert("[%016lx] pgd=%016llx", addr, pgd_val(pgd));

	do {
		p4d_t *p4dp, p4d;
		pud_t *pudp, pud;
		pmd_t *pmdp, pmd;
		pte_t *ptep, pte;

		if (pgd_none(pgd) || pgd_bad(pgd))
			break;

		p4dp = p4d_offset(pgdp, addr);
		p4d = READ_ONCE(*p4dp);
		pr_cont(", p4d=%016llx", p4d_val(p4d));
		if (p4d_none(p4d) || p4d_bad(p4d))
			break;

		pudp = pud_offset(p4dp, addr);
		pud = READ_ONCE(*pudp);
		pr_cont(", pud=%016llx", pud_val(pud));
		if (pud_none(pud) || pud_bad(pud))
			break;

		pmdp = pmd_offset(pudp, addr);
		pmd = READ_ONCE(*pmdp);
		pr_cont(", pmd=%016llx", pmd_val(pmd));
		if (pmd_none(pmd) || pmd_bad(pmd))
			break;

		ptep = pte_offset_map(pmdp, addr);
		if (!ptep)
			break;

		pte = __ptep_get(ptep);
		pr_cont(", pte=%016llx", pte_val(pte));
		pte_unmap(ptep);
	} while(0);

	pr_cont("\n");
}

/*
 * This function sets the access flags (dirty, accessed), as well as write
 * permission, and only to a more permissive setting.
 *
 * It needs to cope with hardware update of the accessed/dirty state by other
 * agents in the system and can safely skip the __sync_icache_dcache() call as,
 * like __set_ptes(), the PTE is never changed from no-exec to exec here.
 *
 * Returns whether or not the PTE actually changed.
 */
int __ptep_set_access_flags(struct vm_area_struct *vma,
			    unsigned long address, pte_t *ptep,
			    pte_t entry, int dirty)
{
	pteval_t old_pteval, pteval;
	pte_t pte = __ptep_get(ptep);

	if (pte_same(pte, entry))
		return 0;

	/* only preserve the access flags and write permission */
	pte_val(entry) &= PTE_RDONLY | PTE_AF | PTE_WRITE | PTE_DIRTY;

	/*
	 * Setting the flags must be done atomically to avoid racing with the
	 * hardware update of the access/dirty state. The PTE_RDONLY bit must
	 * be set to the most permissive (lowest value) of *ptep and entry
	 * (calculated as: a & b == ~(~a | ~b)).
	 */
	pte_val(entry) ^= PTE_RDONLY;
	pteval = pte_val(pte);
	do {
		old_pteval = pteval;
		pteval ^= PTE_RDONLY;
		pteval |= pte_val(entry);
		pteval ^= PTE_RDONLY;
		pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval);
	} while (pteval != old_pteval);

	/*
	 * Invalidate the local stale read-only entry. Remote stale entries
	 * may still cause page faults and be invalidated via
	 * flush_tlb_fix_spurious_fault().
	 */
	if (dirty)
		local_flush_tlb_page(vma, address);
	return 1;
}

static bool is_el1_instruction_abort(unsigned long esr)
{
	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR;
}

static bool is_el1_data_abort(unsigned long esr)
{
	return ESR_ELx_EC(esr) == ESR_ELx_EC_DABT_CUR;
}

static inline bool is_el1_permission_fault(unsigned long addr, unsigned long esr,
					   struct pt_regs *regs)
{
	if (!is_el1_data_abort(esr) && !is_el1_instruction_abort(esr))
		return false;

	if (esr_fsc_is_permission_fault(esr))
		return true;

	if (is_ttbr0_addr(addr) && system_uses_ttbr0_pan())
		return esr_fsc_is_translation_fault(esr) &&
			(regs->pstate & PSR_PAN_BIT);

	return false;
}

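/*
 * An EL1 translation fault can race with a concurrent page table update.
 * Re-walk the tables with AT S1E1R: if the address now translates, or
 * faults for a different reason, treat the original fault as spurious.
 */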
static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr,
							unsigned long esr,
							struct pt_regs *regs)
{
	unsigned long flags;
	u64 par, dfsc;

	if (!is_el1_data_abort(esr) || !esr_fsc_is_translation_fault(esr))
		return false;

	local_irq_save(flags);
	asm volatile("at s1e1r, %0" :: "r" (addr));
	isb();
	par = read_sysreg_par();
	local_irq_restore(flags);

	/*
	 * If we now have a valid translation, treat the translation fault as
	 * spurious.
	 */
	if (!(par & SYS_PAR_EL1_F))
		return true;

	/*
	 * If we got a different type of fault from the AT instruction,
	 * treat the translation fault as spurious.
	 */
	dfsc = FIELD_GET(SYS_PAR_EL1_FST, par);
	return !esr_fsc_is_translation_fault(dfsc);
}

static void die_kernel_fault(const char *msg, unsigned long addr,
			     unsigned long esr, struct pt_regs *regs)
{
	bust_spinlocks(1);

	pr_alert("Unable to handle kernel %s at virtual address %016lx\n", msg,
		 addr);

	kasan_non_canonical_hook(addr);

	mem_abort_decode(esr);

	show_pte(addr);
	die("Oops", regs, esr);
	bust_spinlocks(0);
	make_task_dead(SIGKILL);
}

#ifdef CONFIG_KASAN_HW_TAGS
static void report_tag_fault(unsigned long addr, unsigned long esr,
			     struct pt_regs *regs)
{
	/*
	 * SAS bits aren't set for all faults reported in EL1, so we can't
	 * find out access size.
	 */
	bool is_write = !!(esr & ESR_ELx_WNR);
	kasan_report((void *)addr, 0, is_write, regs->pc);
}
#else
/* Tag faults aren't enabled without CONFIG_KASAN_HW_TAGS. */
static inline void report_tag_fault(unsigned long addr, unsigned long esr,
				    struct pt_regs *regs) { }
#endif

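/*
 * A synchronous MTE tag check fault taken from the kernel is reported
 * through KASAN and then suppressed by disabling tag checking on the
 * local CPU, so that execution can continue.
 */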
static void do_tag_recovery(unsigned long addr, unsigned long esr,
			    struct pt_regs *regs)
{

	report_tag_fault(addr, esr, regs);

	/*
	 * Disable MTE Tag Checking on the local CPU for the current EL.
	 * It will be done lazily on the other CPUs when they will hit a
	 * tag fault.
	 */
	sysreg_clear_set(sctlr_el1, SCTLR_EL1_TCF_MASK,
			 SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF, NONE));
	isb();
}

static bool is_el1_mte_sync_tag_check_fault(unsigned long esr)
{
	unsigned long fsc = esr & ESR_ELx_FSC;

	if (!is_el1_data_abort(esr))
		return false;

	if (fsc == ESR_ELx_FSC_MTE)
		return true;

	return false;
}

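/*
 * Handle a fault taken from kernel mode: try the exception fixup tables,
 * spurious translation fault detection and MTE tag recovery first;
 * anything left over is a fatal kernel fault.
 */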
static void __do_kernel_fault(unsigned long addr, unsigned long esr,
			      struct pt_regs *regs)
{
	const char *msg;

	/*
	 * Are we prepared to handle this kernel fault?
	 * We are almost certainly not prepared to handle instruction faults.
	 */
	if (!is_el1_instruction_abort(esr) && fixup_exception(regs, esr))
		return;

	if (WARN_RATELIMIT(is_spurious_el1_translation_fault(addr, esr, regs),
	    "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr))
		return;

	if (is_el1_mte_sync_tag_check_fault(esr)) {
		do_tag_recovery(addr, esr, regs);

		return;
	}

	if (is_el1_permission_fault(addr, esr, regs)) {
		if (esr & ESR_ELx_WNR)
			msg = "write to read-only memory";
		else if (is_el1_instruction_abort(esr))
			msg = "execute from non-executable memory";
		else
			msg = "read from unreadable memory";
	} else if (addr < PAGE_SIZE) {
		msg = "NULL pointer dereference";
	} else {
		if (esr_fsc_is_translation_fault(esr) &&
		    kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
			return;

		msg = "paging request";
	}

	if (efi_runtime_fixup_exception(regs, msg))
		return;

	die_kernel_fault(msg, addr, esr, regs);
}

static void set_thread_esr(unsigned long address, unsigned long esr)
{
	current->thread.fault_address = address;

	/*
	 * If the faulting address is in the kernel, we must sanitize the ESR.
	 * From userspace's point of view, kernel-only mappings don't exist
	 * at all, so we report them as level 0 translation faults.
	 * (This is not quite the way that "no mapping there at all" behaves:
	 * an alignment fault not caused by the memory type would take
	 * precedence over translation fault for a real access to empty
	 * space. Unfortunately we can't easily distinguish "alignment fault
	 * not caused by memory type" from "alignment fault caused by memory
	 * type", so we ignore this wrinkle and just return the translation
	 * fault.)
	 */
	if (!is_ttbr0_addr(current->thread.fault_address)) {
		switch (ESR_ELx_EC(esr)) {
		case ESR_ELx_EC_DABT_LOW:
			/*
			 * These bits provide only information about the
			 * faulting instruction, which userspace knows already.
			 * We explicitly clear bits which are architecturally
			 * RES0 in case they are given meanings in future.
			 * We always report the ESR as if the fault was taken
			 * to EL1 and so ISV and the bits in ISS[23:14] are
			 * clear. (In fact it always will be a fault to EL1.)
			 */
			esr &= ESR_ELx_EC_MASK | ESR_ELx_IL |
				ESR_ELx_CM | ESR_ELx_WNR;
			esr |= ESR_ELx_FSC_FAULT;
			break;
		case ESR_ELx_EC_IABT_LOW:
			/*
			 * Claim a level 0 translation fault.
			 * All other bits are architecturally RES0 for faults
			 * reported with that DFSC value, so we clear them.
			 */
			esr &= ESR_ELx_EC_MASK | ESR_ELx_IL;
			esr |= ESR_ELx_FSC_FAULT;
			break;
		default:
			/*
			 * This should never happen (entry.S only brings us
			 * into this code for insn and data aborts from a lower
			 * exception level). Fail safe by not providing an ESR
			 * context record at all.
			 */
			WARN(1, "ESR 0x%lx is not DABT or IABT from EL0\n", esr);
			esr = 0;
			break;
		}
	}

	current->thread.fault_code = esr;
}

static void do_bad_area(unsigned long far, unsigned long esr,
			struct pt_regs *regs)
{
	unsigned long addr = untagged_addr(far);

	/*
	 * If we are in kernel mode at this point, we have no context to
	 * handle this fault with.
	 */
	if (user_mode(regs)) {
		const struct fault_info *inf = esr_to_fault_info(esr);

		set_thread_esr(addr, esr);
		arm64_force_sig_fault(inf->sig, inf->code, far, inf->name);
	} else {
		__do_kernel_fault(addr, esr, regs);
	}
}

static bool fault_from_pkey(struct vm_area_struct *vma, unsigned int mm_flags)
{
	if (!system_supports_poe())
		return false;

	/*
	 * We do not check whether an Overlay fault has occurred because we
	 * cannot make a decision based solely on its value:
	 *
	 * - If Overlay is set, a fault did occur due to POE, but it may be
	 *   spurious in those cases where we update POR_EL0 without ISB (e.g.
	 *   on context-switch). We would then need to manually check POR_EL0
	 *   against vma_pkey(vma), which is exactly what
	 *   arch_vma_access_permitted() does.
	 *
	 * - If Overlay is not set, we may still need to report a pkey fault.
	 *   This is the case if an access was made within a mapping but with no
	 *   page mapped, and POR_EL0 forbids the access (according to
	 *   vma_pkey()). Such access will result in a SIGSEGV regardless
	 *   because core code checks arch_vma_access_permitted(), but in order
	 *   to report the correct error code - SEGV_PKUERR - we must handle
	 *   that case here.
	 */
	return !arch_vma_access_permitted(vma,
			mm_flags & FAULT_FLAG_WRITE,
			mm_flags & FAULT_FLAG_INSTRUCTION,
			false);
}

static bool is_gcs_fault(unsigned long esr)
{
	if (!esr_is_data_abort(esr))
		return false;

	return ESR_ELx_ISS2(esr) & ESR_ELx_GCS;
}

static bool is_el0_instruction_abort(unsigned long esr)
{
	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW;
}

/*
 * Note: not valid for EL1 DC IVAC, but we never use that such that it
 * should fault. EL0 cannot issue DC IVAC (undef).
 */
static bool is_write_abort(unsigned long esr)
{
	return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
}

static bool is_invalid_gcs_access(struct vm_area_struct *vma, u64 esr)
{
	if (!system_supports_gcs())
		return false;

	if (unlikely(is_gcs_fault(esr))) {
		/* GCS accesses must be performed on a GCS page */
		if (!(vma->vm_flags & VM_SHADOW_STACK))
			return true;
	} else if (unlikely(vma->vm_flags & VM_SHADOW_STACK)) {
		/* Only GCS operations can write to a GCS page */
		return esr_is_data_abort(esr) && is_write_abort(esr);
	}

	return false;
}

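/*
 * Handle a page fault against the current mm. The required vma->vm_flags
 * are derived from the ESR, and the fault is first attempted under the
 * per-VMA lock before falling back to the mmap_lock path.
 */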
static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
				   struct pt_regs *regs)
{
	const struct fault_info *inf;
	struct mm_struct *mm = current->mm;
	vm_fault_t fault;
	vm_flags_t vm_flags;
	unsigned int mm_flags = FAULT_FLAG_DEFAULT;
	unsigned long addr = untagged_addr(far);
	struct vm_area_struct *vma;
	int si_code;
	int pkey = -1;

	if (kprobe_page_fault(regs, esr))
		return 0;

	/*
	 * If we're in an interrupt or have no user context, we must not take
	 * the fault.
	 */
	if (faulthandler_disabled() || !mm)
		goto no_context;

	if (user_mode(regs))
		mm_flags |= FAULT_FLAG_USER;

	/*
	 * vm_flags tells us what bits we must have in vma->vm_flags
	 * for the fault to be benign; __do_page_fault() checks
	 * vma->vm_flags & vm_flags and returns an error if the
	 * intersection is empty.
	 */
	if (is_el0_instruction_abort(esr)) {
		/* It was exec fault */
		vm_flags = VM_EXEC;
		mm_flags |= FAULT_FLAG_INSTRUCTION;
	} else if (is_gcs_fault(esr)) {
		/*
		 * The GCS permission on a page implies both read and
		 * write so always handle any GCS fault as a write fault;
		 * we need to trigger CoW even for GCS reads.
		 */
		vm_flags = VM_WRITE;
		mm_flags |= FAULT_FLAG_WRITE;
	} else if (is_write_abort(esr)) {
		/* It was write fault */
		vm_flags = VM_WRITE;
		mm_flags |= FAULT_FLAG_WRITE;
	} else {
		/* It was read fault */
		vm_flags = VM_READ;
		/* Write implies read */
		vm_flags |= VM_WRITE;
		/* If EPAN is absent then exec implies read */
		if (!alternative_has_cap_unlikely(ARM64_HAS_EPAN))
			vm_flags |= VM_EXEC;
	}

	if (is_ttbr0_addr(addr) && is_el1_permission_fault(addr, esr, regs)) {
		if (is_el1_instruction_abort(esr))
			die_kernel_fault("execution of user memory",
					 addr, esr, regs);

		if (!insn_may_access_user(regs->pc, esr))
			die_kernel_fault("access to user memory outside uaccess routines",
					 addr, esr, regs);
	}

	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);

	if (!(mm_flags & FAULT_FLAG_USER))
		goto lock_mmap;

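	/*
	 * Fast path: try to resolve the fault under the per-VMA lock,
	 * without taking the mmap_lock.
	 */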
	vma = lock_vma_under_rcu(mm, addr);
	if (!vma)
		goto lock_mmap;

	if (is_invalid_gcs_access(vma, esr)) {
		vma_end_read(vma);
		fault = 0;
		si_code = SEGV_ACCERR;
		goto bad_area;
	}

	if (!(vma->vm_flags & vm_flags)) {
		vma_end_read(vma);
		fault = 0;
		si_code = SEGV_ACCERR;
		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
		goto bad_area;
	}

	if (fault_from_pkey(vma, mm_flags)) {
		pkey = vma_pkey(vma);
		vma_end_read(vma);
		fault = 0;
		si_code = SEGV_PKUERR;
		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
		goto bad_area;
	}

	fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs);
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		vma_end_read(vma);

	if (!(fault & VM_FAULT_RETRY)) {
		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
		goto done;
	}
	count_vm_vma_lock_event(VMA_LOCK_RETRY);
	if (fault & VM_FAULT_MAJOR)
		mm_flags |= FAULT_FLAG_TRIED;

	/* Quick path to respond to signals */
	if (fault_signal_pending(fault, regs)) {
		if (!user_mode(regs))
			goto no_context;
		return 0;
	}
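	/*
	 * Slow path: handle the fault with the mmap_lock held, used when
	 * the per-VMA lock fast path was not applicable or did not
	 * complete the fault.
	 */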
lock_mmap:

retry:
	vma = lock_mm_and_find_vma(mm, addr, regs);
	if (unlikely(!vma)) {
		fault = 0;
		si_code = SEGV_MAPERR;
		goto bad_area;
	}

	if (!(vma->vm_flags & vm_flags)) {
		mmap_read_unlock(mm);
		fault = 0;
		si_code = SEGV_ACCERR;
		goto bad_area;
	}

	if (fault_from_pkey(vma, mm_flags)) {
		pkey = vma_pkey(vma);
		mmap_read_unlock(mm);
		fault = 0;
		si_code = SEGV_PKUERR;
		goto bad_area;
	}

	fault = handle_mm_fault(vma, addr, mm_flags, regs);

	/* Quick path to respond to signals */
	if (fault_signal_pending(fault, regs)) {
		if (!user_mode(regs))
			goto no_context;
		return 0;
	}

	/* The fault is fully completed (including releasing mmap lock) */
	if (fault & VM_FAULT_COMPLETED)
		return 0;

	if (fault & VM_FAULT_RETRY) {
		mm_flags |= FAULT_FLAG_TRIED;
		goto retry;
	}
	mmap_read_unlock(mm);

done:
	/* Handle the "normal" (no error) case first. */
	if (likely(!(fault & VM_FAULT_ERROR)))
		return 0;

	si_code = SEGV_MAPERR;
bad_area:
	/*
	 * If we are in kernel mode at this point, we have no context to
	 * handle this fault with.
	 */
	if (!user_mode(regs))
		goto no_context;

	if (fault & VM_FAULT_OOM) {
		/*
		 * We ran out of memory, call the OOM killer, and return to
		 * userspace (which will retry the fault, or kill us if we got
		 * oom-killed).
		 */
		pagefault_out_of_memory();
		return 0;
	}

	inf = esr_to_fault_info(esr);
	set_thread_esr(addr, esr);
	if (fault & VM_FAULT_SIGBUS) {
		/*
		 * We had some memory, but were unable to successfully fix up
		 * this page fault.
		 */
		arm64_force_sig_fault(SIGBUS, BUS_ADRERR, far, inf->name);
	} else if (fault & (VM_FAULT_HWPOISON_LARGE | VM_FAULT_HWPOISON)) {
		unsigned int lsb;

		lsb = PAGE_SHIFT;
		if (fault & VM_FAULT_HWPOISON_LARGE)
			lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));

		arm64_force_sig_mceerr(BUS_MCEERR_AR, far, lsb, inf->name);
	} else {
		/*
		 * The pkey value that we return to userspace can be different
		 * from the pkey that caused the fault.
		 *
		 * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4);
		 * 2. T1 : set POR_EL0 to deny access to pkey=4, touches page
		 * 3. T1 : faults...
		 * 4. T2 : mprotect_key(foo, PAGE_SIZE, pkey=5);
		 * 5. T1 : enters fault handler, takes mmap_lock, etc...
		 * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
		 *         faulted on a pte with its pkey=4.
		 */
		/* Something tried to access memory that is outside our memory map */
		if (si_code == SEGV_PKUERR)
			arm64_force_sig_fault_pkey(far, inf->name, pkey);
		else
			arm64_force_sig_fault(SIGSEGV, si_code, far, inf->name);
	}

	return 0;

no_context:
	__do_kernel_fault(addr, esr, regs);
	return 0;
}

static int __kprobes do_translation_fault(unsigned long far,
					  unsigned long esr,
					  struct pt_regs *regs)
{
	unsigned long addr = untagged_addr(far);

	if (is_ttbr0_addr(addr))
		return do_page_fault(far, esr, regs);

	do_bad_area(far, esr, regs);
	return 0;
}

static int do_alignment_fault(unsigned long far, unsigned long esr,
			      struct pt_regs *regs)
{
	if (IS_ENABLED(CONFIG_COMPAT_ALIGNMENT_FIXUPS) &&
	    compat_user_mode(regs))
		return do_compat_alignment_fixup(far, regs);
	do_bad_area(far, esr, regs);
	return 0;
}

static int do_bad(unsigned long far, unsigned long esr, struct pt_regs *regs)
{
	return 1; /* "fault" */
}

static int do_sea(unsigned long far, unsigned long esr, struct pt_regs *regs)
{
	const struct fault_info *inf;
	unsigned long siaddr;

	inf = esr_to_fault_info(esr);

	if (user_mode(regs) && apei_claim_sea(regs) == 0) {
		/*
		 * APEI claimed this as a firmware-first notification.
		 * Some processing deferred to task_work before ret_to_user().
		 */
		return 0;
	}

	if (esr & ESR_ELx_FnV) {
		siaddr = 0;
	} else {
		/*
		 * The architecture specifies that the tag bits of FAR_EL1 are
		 * UNKNOWN for synchronous external aborts. Mask them out now
		 * so that userspace doesn't see them.
		 */
		siaddr = untagged_addr(far);
	}
	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_STILL_OK);
	arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);

	return 0;
}

static int do_tag_check_fault(unsigned long far, unsigned long esr,
			      struct pt_regs *regs)
{
	/*
	 * The architecture specifies that bits 63:60 of FAR_EL1 are UNKNOWN
	 * for tag check faults. Set them to corresponding bits in the untagged
	 * address if ARM64_MTE_FAR isn't supported.
	 * Otherwise, bits 63:60 of FAR_EL1 are not UNKNOWN.
	 */
	if (!cpus_have_cap(ARM64_MTE_FAR))
		far = (__untagged_addr(far) & ~MTE_TAG_MASK) | (far & MTE_TAG_MASK);

	do_bad_area(far, esr, regs);
	return 0;
}

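/*
 * Fault handlers indexed by the 6-bit fault status code (FSC) of the ESR.
 */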
static const struct fault_info fault_info[] = {
	{ do_bad,		SIGKILL, SI_KERNEL,	"ttbr address size fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"level 1 address size fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"level 2 address size fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"level 3 address size fault" },
	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 0 translation fault" },
	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 1 translation fault" },
	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 2 translation fault" },
	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 3 translation fault" },
	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 0 access flag fault" },
	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 1 access flag fault" },
	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 2 access flag fault" },
	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 3 access flag fault" },
	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 0 permission fault" },
	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 1 permission fault" },
	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 2 permission fault" },
	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 3 permission fault" },
	{ do_sea,		SIGBUS,  BUS_OBJERR,	"synchronous external abort" },
	{ do_tag_check_fault,	SIGSEGV, SEGV_MTESERR,	"synchronous tag check fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 18" },
	{ do_sea,		SIGKILL, SI_KERNEL,	"level -1 (translation table walk)" },
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 0 (translation table walk)" },
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 1 (translation table walk)" },
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 2 (translation table walk)" },
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 3 (translation table walk)" },
	{ do_sea,		SIGBUS,  BUS_OBJERR,	"synchronous parity or ECC error" },	// Reserved when RAS is implemented
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 25" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 26" },
	{ do_sea,		SIGKILL, SI_KERNEL,	"level -1 synchronous parity error (translation table walk)" },	// Reserved when RAS is implemented
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 0 synchronous parity error (translation table walk)" },	// Reserved when RAS is implemented
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 1 synchronous parity error (translation table walk)" },	// Reserved when RAS is implemented
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 2 synchronous parity error (translation table walk)" },	// Reserved when RAS is implemented
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 3 synchronous parity error (translation table walk)" },	// Reserved when RAS is implemented
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 32" },
	{ do_alignment_fault,	SIGBUS,  BUS_ADRALN,	"alignment fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 34" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 35" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 36" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 37" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 38" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 39" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 40" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"level -1 address size fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 42" },
	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level -1 translation fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 44" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 45" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 46" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 47" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"TLB conflict abort" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"Unsupported atomic hardware update fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 50" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 51" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"implementation fault (lockdown abort)" },
	{ do_bad,		SIGBUS,  BUS_OBJERR,	"implementation fault (unsupported exclusive)" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 54" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 55" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 56" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 57" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 58" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 59" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 60" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"section domain fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"page domain fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 63" },
};

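/*
 * Top-level dispatch for synchronous aborts: hand the fault to the handler
 * selected by its fault status code; if that handler cannot deal with it,
 * die in the kernel case or deliver the recorded signal to userspace.
 */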
void do_mem_abort(unsigned long far, unsigned long esr, struct pt_regs *regs)
{
	const struct fault_info *inf = esr_to_fault_info(esr);
	unsigned long addr = untagged_addr(far);

	if (!inf->fn(far, esr, regs))
		return;

	if (!user_mode(regs))
		die_kernel_fault(inf->name, addr, esr, regs);

	/*
	 * At this point we have an unrecognized fault type whose tag bits may
	 * have been defined as UNKNOWN. Therefore we only expose the untagged
	 * address to the signal handler.
	 */
	arm64_notify_die(inf->name, regs, inf->sig, inf->code, addr, esr);
}
NOKPROBE_SYMBOL(do_mem_abort);

void do_sp_pc_abort(unsigned long addr, unsigned long esr, struct pt_regs *regs)
{
	arm64_notify_die("SP/PC alignment exception", regs, SIGBUS, BUS_ADRALN,
			 addr, esr);
}
NOKPROBE_SYMBOL(do_sp_pc_abort);

/*
 * Used during anonymous page fault handling.
 */
struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
					     unsigned long vaddr)
{
	gfp_t flags = GFP_HIGHUSER_MOVABLE | __GFP_ZERO;

	/*
	 * If the page is mapped with PROT_MTE, initialise the tags at the
	 * point of allocation and page zeroing as this is usually faster than
	 * separate DC ZVA and STGM.
	 */
	if (vma->vm_flags & VM_MTE)
		flags |= __GFP_ZEROTAGS;

	return vma_alloc_folio(flags, 0, vma, vaddr);
}

bool tag_clear_highpages(struct page *page, int numpages)
{
	/*
	 * Check if MTE is supported and fall back to clear_highpage().
	 * get_huge_zero_folio() unconditionally passes __GFP_ZEROTAGS and
	 * post_alloc_hook() will invoke tag_clear_highpages().
	 */
	if (!system_supports_mte())
		return false;

	/* Newly allocated pages, shouldn't have been tagged yet */
	for (int i = 0; i < numpages; i++, page++) {
		WARN_ON_ONCE(!try_page_mte_tagging(page));
		mte_zero_clear_page_tags(page_address(page));
		set_page_mte_tagged(page);
	}
	return true;
}