// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/context_tracking.h>
#include <linux/err.h>
#include <linux/compat.h>
#include <linux/sched/debug.h> /* for show_regs */

#include <asm/asm-prototypes.h>
#include <asm/kup.h>
#include <asm/cputime.h>
#include <asm/hw_irq.h>
#include <asm/interrupt.h>
#include <asm/kprobes.h>
#include <asm/paca.h>
#include <asm/ptrace.h>
#include <asm/reg.h>
#include <asm/signal.h>
#include <asm/switch_to.h>
#include <asm/syscall.h>
#include <asm/time.h>
#include <asm/unistd.h>

#if defined(CONFIG_PPC_ADV_DEBUG_REGS) && defined(CONFIG_PPC32)
unsigned long global_dbcr0[NR_CPUS];
#endif

typedef long (*syscall_fn)(long, long, long, long, long, long);

#ifdef CONFIG_PPC_BOOK3S_64
DEFINE_STATIC_KEY_FALSE(interrupt_exit_not_reentrant);
static inline bool exit_must_hard_disable(void)
{
	return static_branch_unlikely(&interrupt_exit_not_reentrant);
}
#else
static inline bool exit_must_hard_disable(void)
{
	return true;
}
#endif

/*
 * local irqs must be disabled. Returns false if the caller must re-enable
 * them, check for new work, and try again.
 *
 * This should be called with local irqs disabled, but if they were previously
 * enabled when the interrupt handler returns (indicating a process-context /
 * synchronous interrupt) then irqs_enabled should be true.
 *
 * If restartable is true then EE/RI can be left on because interrupts are
 * handled with a restart sequence.
 */
static notrace __always_inline bool prep_irq_for_enabled_exit(bool restartable)
{
	/* This must be done with RI=1 because tracing may touch vmaps */
	trace_hardirqs_on();

	if (exit_must_hard_disable() || !restartable)
		__hard_EE_RI_disable();

#ifdef CONFIG_PPC64
	/* This pattern matches prep_irq_for_idle */
	if (unlikely(lazy_irq_pending_nocheck())) {
		if (exit_must_hard_disable() || !restartable) {
			local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
			__hard_RI_enable();
		}
		trace_hardirqs_off();

		return false;
	}
#endif
	return true;
}

/* Has to run notrace because it is entered not completely "reconciled" */
notrace long system_call_exception(long r3, long r4, long r5,
				   long r6, long r7, long r8,
				   unsigned long r0, struct pt_regs *regs)
{
	syscall_fn f;

	kuep_lock();

	regs->orig_gpr3 = r3;

	if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
		BUG_ON(irq_soft_mask_return() != IRQS_ALL_DISABLED);

	trace_hardirqs_off(); /* finish reconciling */

	CT_WARN_ON(ct_state() == CONTEXT_KERNEL);
	user_exit_irqoff();

	if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x))
		BUG_ON(!(regs->msr & MSR_RI));
	BUG_ON(!(regs->msr & MSR_PR));
	BUG_ON(arch_irq_disabled_regs(regs));

#ifdef CONFIG_PPC_PKEY
	if (mmu_has_feature(MMU_FTR_PKEY)) {
		unsigned long amr, iamr;
		bool flush_needed = false;
		/*
		 * When entering from userspace we mostly have the AMR/IAMR
		 * different from kernel default values. Hence don't compare.
		 */
		amr = mfspr(SPRN_AMR);
		iamr = mfspr(SPRN_IAMR);
		regs->amr = amr;
		regs->iamr = iamr;
		if (mmu_has_feature(MMU_FTR_BOOK3S_KUAP)) {
			mtspr(SPRN_AMR, AMR_KUAP_BLOCKED);
			flush_needed = true;
		}
		if (mmu_has_feature(MMU_FTR_BOOK3S_KUEP)) {
			mtspr(SPRN_IAMR, AMR_KUEP_BLOCKED);
			flush_needed = true;
		}
		if (flush_needed)
			isync();
	} else
#endif
		kuap_assert_locked();

	booke_restore_dbcr0();

	account_cpu_user_entry();

	account_stolen_time();

	/*
	 * This is not required for the syscall exit path, but makes the
	 * stack frame look nicer. If this was initialised in the first stack
	 * frame, or if the unwinder was taught that the first stack frame
	 * always returns to user with IRQS_ENABLED, this store could be
	 * avoided!
	 */
	irq_soft_mask_regs_set_state(regs, IRQS_ENABLED);

	local_irq_enable();

	if (unlikely(current_thread_info()->flags & _TIF_SYSCALL_DOTRACE)) {
		if (unlikely(trap_is_unsupported_scv(regs))) {
			/* Unsupported scv vector */
			_exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
			return regs->gpr[3];
		}
		/*
		 * We use the return value of do_syscall_trace_enter() as the
		 * syscall number. If the syscall was rejected for any reason,
		 * do_syscall_trace_enter() returns an invalid syscall number
		 * and the test against NR_syscalls will fail, so the return
		 * value to be used is in regs->gpr[3].
		 */
		r0 = do_syscall_trace_enter(regs);
		if (unlikely(r0 >= NR_syscalls))
			return regs->gpr[3];
		r3 = regs->gpr[3];
		r4 = regs->gpr[4];
		r5 = regs->gpr[5];
		r6 = regs->gpr[6];
		r7 = regs->gpr[7];
		r8 = regs->gpr[8];

	} else if (unlikely(r0 >= NR_syscalls)) {
		if (unlikely(trap_is_unsupported_scv(regs))) {
			/* Unsupported scv vector */
			_exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
			return regs->gpr[3];
		}
		return -ENOSYS;
	}

	/* May be faster to do array_index_nospec? */
	barrier_nospec();

	if (unlikely(is_compat_task())) {
		f = (void *)compat_sys_call_table[r0];

		r3 &= 0x00000000ffffffffULL;
		r4 &= 0x00000000ffffffffULL;
		r5 &= 0x00000000ffffffffULL;
		r6 &= 0x00000000ffffffffULL;
		r7 &= 0x00000000ffffffffULL;
		r8 &= 0x00000000ffffffffULL;

	} else {
		f = (void *)sys_call_table[r0];
	}

	return f(r3, r4, r5, r6, r7, r8);
}

static notrace void booke_load_dbcr0(void)
{
#ifdef CONFIG_PPC_ADV_DEBUG_REGS
	unsigned long dbcr0 = current->thread.debug.dbcr0;

	if (likely(!(dbcr0 & DBCR0_IDM)))
		return;

	/*
	 * Check to see if the dbcr0 register is set up to debug.
	 * Use the internal debug mode bit to do this.
	 */
	mtmsr(mfmsr() & ~MSR_DE);
	if (IS_ENABLED(CONFIG_PPC32)) {
		isync();
		global_dbcr0[smp_processor_id()] = mfspr(SPRN_DBCR0);
	}
	mtspr(SPRN_DBCR0, dbcr0);
	mtspr(SPRN_DBSR, -1);
#endif
}

static void check_return_regs_valid(struct pt_regs *regs)
{
#ifdef CONFIG_PPC_BOOK3S_64
	unsigned long trap, srr0, srr1;
	static bool warned;
	u8 *validp;
	char *h;

	if (trap_is_scv(regs))
		return;

	trap = regs->trap;
	// EE in HV mode sets HSRRs like 0xea0
	if (cpu_has_feature(CPU_FTR_HVMODE) && trap == INTERRUPT_EXTERNAL)
		trap = 0xea0;

	switch (trap) {
	case 0x980:
	case INTERRUPT_H_DATA_STORAGE:
	case 0xe20:
	case 0xe40:
	case INTERRUPT_HMI:
	case 0xe80:
	case 0xea0:
	case INTERRUPT_H_FAC_UNAVAIL:
	case 0x1200:
	case 0x1500:
	case 0x1600:
	case 0x1800:
		validp = &local_paca->hsrr_valid;
		if (!*validp)
			return;

		srr0 = mfspr(SPRN_HSRR0);
		srr1 = mfspr(SPRN_HSRR1);
		h = "H";

		break;
	default:
		validp = &local_paca->srr_valid;
		if (!*validp)
			return;

		srr0 = mfspr(SPRN_SRR0);
		srr1 = mfspr(SPRN_SRR1);
		h = "";
		break;
	}

	if (srr0 == regs->nip && srr1 == regs->msr)
		return;

	/*
	 * An NMI / soft-NMI interrupt may have come in after we found
	 * srr_valid and before the SRRs are loaded. The interrupt then
	 * comes in and clobbers SRRs and clears srr_valid. Then we load
	 * the SRRs here and test them above and find they don't match.
	 *
	 * Test validity again after that, to catch such false positives.
	 *
	 * This test in general will have some window for false negatives
	 * and may not catch and fix all such cases if an NMI comes in
	 * later and clobbers SRRs without clearing srr_valid, but hopefully
	 * such things will get caught most of the time, statistically
	 * enough to be able to get a warning out.
	 */
	barrier();

	if (!*validp)
		return;

	if (!warned) {
		warned = true;
		printk("%sSRR0 was: %lx should be: %lx\n", h, srr0, regs->nip);
		printk("%sSRR1 was: %lx should be: %lx\n", h, srr1, regs->msr);
		show_regs(regs);
	}

	*validp = 0; /* fixup */
#endif
}

static notrace unsigned long
interrupt_exit_user_prepare_main(unsigned long ret, struct pt_regs *regs)
{
	unsigned long ti_flags;

again:
	ti_flags = READ_ONCE(current_thread_info()->flags);
	while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) {
		local_irq_enable();
		if (ti_flags & _TIF_NEED_RESCHED) {
			schedule();
		} else {
			/*
			 * SIGPENDING must restore signal handler function
			 * argument GPRs, and some non-volatiles (e.g., r1).
			 * Restore all for now. This could be made lighter.
			 */
			if (ti_flags & _TIF_SIGPENDING)
				ret |= _TIF_RESTOREALL;
			do_notify_resume(regs, ti_flags);
		}
		local_irq_disable();
		ti_flags = READ_ONCE(current_thread_info()->flags);
	}

	if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && IS_ENABLED(CONFIG_PPC_FPU)) {
		if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) &&
		    unlikely((ti_flags & _TIF_RESTORE_TM))) {
			restore_tm_state(regs);
		} else {
			unsigned long mathflags = MSR_FP;

			if (cpu_has_feature(CPU_FTR_VSX))
				mathflags |= MSR_VEC | MSR_VSX;
			else if (cpu_has_feature(CPU_FTR_ALTIVEC))
				mathflags |= MSR_VEC;

			/*
			 * If userspace MSR has all available FP bits set,
			 * then they are live and there is no need to restore.
			 * If not, it means the regs were given up and
			 * restore_math may decide to restore them (to avoid
			 * taking an FP fault).
			 */
			if ((regs->msr & mathflags) != mathflags)
				restore_math(regs);
		}
	}

	check_return_regs_valid(regs);

	user_enter_irqoff();
	if (!prep_irq_for_enabled_exit(true)) {
		user_exit_irqoff();
		local_irq_enable();
		local_irq_disable();
		goto again;
	}

#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
	local_paca->tm_scratch = regs->msr;
#endif

	booke_load_dbcr0();

	account_cpu_user_exit();

	/* Restore user access locks last */
	kuap_user_restore(regs);
	kuep_unlock();

	return ret;
}

/*
 * This should be called after a syscall returns, with r3 the return value
 * from the syscall. If this function returns non-zero, the system call
 * exit assembly should additionally load all GPR registers and CTR and XER
 * from the interrupt frame.
 *
 * The function graph tracer can not trace the return side of this function,
 * because RI=0 and soft mask state is "unreconciled", so it is marked notrace.
 */
notrace unsigned long syscall_exit_prepare(unsigned long r3,
					   struct pt_regs *regs,
					   long scv)
{
	unsigned long ti_flags;
	unsigned long ret = 0;
	bool is_not_scv = !IS_ENABLED(CONFIG_PPC_BOOK3S_64) || !scv;

	CT_WARN_ON(ct_state() == CONTEXT_USER);

	kuap_assert_locked();

	regs->result = r3;

	/* Check whether the syscall is issued inside a restartable sequence */
	rseq_syscall(regs);

	ti_flags = current_thread_info()->flags;

	if (unlikely(r3 >= (unsigned long)-MAX_ERRNO) && is_not_scv) {
		if (likely(!(ti_flags & (_TIF_NOERROR | _TIF_RESTOREALL)))) {
			r3 = -r3;
			regs->ccr |= 0x10000000; /* Set SO bit in CR */
		}
	}

	if (unlikely(ti_flags & _TIF_PERSYSCALL_MASK)) {
		if (ti_flags & _TIF_RESTOREALL)
			ret = _TIF_RESTOREALL;
		else
			regs->gpr[3] = r3;
		clear_bits(_TIF_PERSYSCALL_MASK, &current_thread_info()->flags);
	} else {
		regs->gpr[3] = r3;
	}

	if (unlikely(ti_flags & _TIF_SYSCALL_DOTRACE)) {
		do_syscall_trace_leave(regs);
		ret |= _TIF_RESTOREALL;
	}

	local_irq_disable();
	ret = interrupt_exit_user_prepare_main(ret, regs);

#ifdef CONFIG_PPC64
	regs->exit_result = ret;
#endif

	return ret;
}

#ifdef CONFIG_PPC64
notrace unsigned long syscall_exit_restart(unsigned long r3, struct pt_regs *regs)
{
	/*
	 * This is called when detecting a soft-pending interrupt as well as
	 * an alternate-return interrupt. So we can't just have the alternate
	 * return path clear SRR1[MSR] and set PACA_IRQ_HARD_DIS (unless
	 * the soft-pending case were to fix things up as well). RI might be
	 * disabled, in which case it gets re-enabled by __hard_irq_disable().
	 */
	__hard_irq_disable();
	local_paca->irq_happened |= PACA_IRQ_HARD_DIS;

#ifdef CONFIG_PPC_BOOK3S_64
	set_kuap(AMR_KUAP_BLOCKED);
#endif

	trace_hardirqs_off();
	user_exit_irqoff();
	account_cpu_user_entry();

	BUG_ON(!user_mode(regs));

	regs->exit_result = interrupt_exit_user_prepare_main(regs->exit_result, regs);

	return regs->exit_result;
}
#endif

notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs)
{
	unsigned long ret;

	if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x))
		BUG_ON(!(regs->msr & MSR_RI));
	BUG_ON(!(regs->msr & MSR_PR));
	BUG_ON(arch_irq_disabled_regs(regs));
	CT_WARN_ON(ct_state() == CONTEXT_USER);

	/*
	 * We don't need to restore AMR on the way back to userspace for KUAP.
	 * AMR can only have been unlocked if we interrupted the kernel.
	 */
	kuap_assert_locked();

	local_irq_disable();

	ret = interrupt_exit_user_prepare_main(0, regs);

#ifdef CONFIG_PPC64
	regs->exit_result = ret;
#endif

	return ret;
}

void preempt_schedule_irq(void);

notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs)
{
	unsigned long flags;
	unsigned long ret = 0;
	unsigned long kuap;
	bool stack_store = current_thread_info()->flags &
			   _TIF_EMULATE_STACK_STORE;

	if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x) &&
	    unlikely(!(regs->msr & MSR_RI)))
		unrecoverable_exception(regs);
	BUG_ON(regs->msr & MSR_PR);
	/*
	 * CT_WARN_ON comes here via program_check_exception,
	 * so avoid recursion.
	 */
	if (TRAP(regs) != INTERRUPT_PROGRAM)
		CT_WARN_ON(ct_state() == CONTEXT_USER);

	kuap = kuap_get_and_assert_locked();

	local_irq_save(flags);

	if (!arch_irq_disabled_regs(regs)) {
		/* Returning to a kernel context with local irqs enabled. */
		WARN_ON_ONCE(!(regs->msr & MSR_EE));
again:
		if (IS_ENABLED(CONFIG_PREEMPT)) {
			/* Return to preemptible kernel context */
			if (unlikely(current_thread_info()->flags & _TIF_NEED_RESCHED)) {
				if (preempt_count() == 0)
					preempt_schedule_irq();
			}
		}

		check_return_regs_valid(regs);

		/*
		 * Stack store exit can't be restarted because the interrupt
		 * stack frame might have been clobbered.
		 */
		if (!prep_irq_for_enabled_exit(unlikely(stack_store))) {
			/*
			 * Replay pending soft-masked interrupts now. Don't
			 * just local_irq_enable(); local_irq_disable(); because
			 * if we are returning from an asynchronous interrupt
			 * here, another one might hit after irqs are enabled,
			 * and it would exit via this same path allowing
			 * another to fire, and so on unbounded.
			 */
			hard_irq_disable();
			replay_soft_interrupts();
			/* Took an interrupt, may have more exit work to do. */
			goto again;
		}
#ifdef CONFIG_PPC64
		/*
		 * An interrupt may clear MSR[EE] and set this concurrently,
		 * but it will be marked pending and the exit will be retried.
		 * This leaves a racy window where MSR[EE]=0 and HARD_DIS is
		 * clear, until interrupt_exit_kernel_restart() calls
		 * hard_irq_disable(), which will set HARD_DIS again.
		 */
		local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS;

	} else {
		check_return_regs_valid(regs);

		if (unlikely(stack_store))
			__hard_EE_RI_disable();
		/*
		 * Returning to a kernel context with local irqs disabled.
		 * Here, if EE was enabled in the interrupted context, enable
		 * it on return as well. A problem exists here where a soft
		 * masked interrupt may have cleared MSR[EE] and set HARD_DIS
		 * here, and it will still exist on return to the caller. This
		 * will be resolved by the masked interrupt firing again.
		 */
		if (regs->msr & MSR_EE)
			local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS;
#endif /* CONFIG_PPC64 */
	}

	if (unlikely(stack_store)) {
		clear_bits(_TIF_EMULATE_STACK_STORE, &current_thread_info()->flags);
		ret = 1;
	}

#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
	local_paca->tm_scratch = regs->msr;
#endif

	/*
	 * 64s does not want to mfspr(SPRN_AMR) here, because this comes after
	 * mtmsr, which would cause Read-After-Write stalls. Hence, take the
	 * AMR value from the check above.
	 */
	kuap_kernel_restore(regs, kuap);

	return ret;
}

#ifdef CONFIG_PPC64
notrace unsigned long interrupt_exit_user_restart(struct pt_regs *regs)
{
	__hard_irq_disable();
	local_paca->irq_happened |= PACA_IRQ_HARD_DIS;

#ifdef CONFIG_PPC_BOOK3S_64
	set_kuap(AMR_KUAP_BLOCKED);
#endif

	trace_hardirqs_off();
	user_exit_irqoff();
	account_cpu_user_entry();

	BUG_ON(!user_mode(regs));

	regs->exit_result |= interrupt_exit_user_prepare(regs);

	return regs->exit_result;
}

/*
 * No real need to return a value here because the stack store case does not
 * get restarted.
 */
notrace unsigned long interrupt_exit_kernel_restart(struct pt_regs *regs)
{
	__hard_irq_disable();
	local_paca->irq_happened |= PACA_IRQ_HARD_DIS;

#ifdef CONFIG_PPC_BOOK3S_64
	set_kuap(AMR_KUAP_BLOCKED);
#endif

	if (regs->softe == IRQS_ENABLED)
		trace_hardirqs_off();

	BUG_ON(user_mode(regs));

	return interrupt_exit_kernel_prepare(regs);
}
#endif
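
/*
 * Illustrative sketch only (not part of this file or of any kernel build):
 * a minimal userspace program showing the error convention that
 * syscall_exit_prepare() above implements for the "sc" entry point, where
 * an error is reported by setting CR0[SO] and placing the positive errno
 * value in r3 (scv syscalls instead return negative errno values and skip
 * the SO-bit handling, per the is_not_scv check). The helper name
 * sc_syscall1() and the exact clobber list are assumptions made for the
 * example; it assumes a powerpc64 Linux userspace toolchain.
 */
#if 0	/* example only, never built */
#include <stdio.h>
#include <sys/syscall.h>	/* SYS_close */

static long sc_syscall1(long nr, long arg1, int *failed)
{
	register long r0 asm("r0") = nr;	/* syscall number */
	register long r3 asm("r3") = arg1;	/* first argument / return value */
	long cr;

	asm volatile("sc\n\t"
		     "mfcr %2"			/* capture CR to inspect CR0[SO] */
		     : "+r" (r0), "+r" (r3), "=r" (cr)
		     :
		     : "memory", "cr0", "cr1", "cr5", "cr6", "cr7",
		       "ctr", "xer", "lr", "r4", "r5", "r6", "r7",
		       "r8", "r9", "r10", "r11", "r12");

	/* Same bit that syscall_exit_prepare() sets via regs->ccr |= 0x10000000 */
	*failed = (cr & 0x10000000) != 0;
	return r3;
}

int main(void)
{
	int failed;
	long ret = sc_syscall1(SYS_close, -1, &failed);

	/* Expect failed=1 and ret=EBADF (positive) from the sc ABI. */
	printf("close(-1): failed=%d ret=%ld\n", failed, ret);
	return 0;
}
#endif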