// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/context_tracking.h>
#include <linux/err.h>
#include <linux/compat.h>
#include <linux/sched/debug.h> /* for show_regs */

#include <asm/asm-prototypes.h>
#include <asm/kup.h>
#include <asm/cputime.h>
#include <asm/hw_irq.h>
#include <asm/interrupt.h>
#include <asm/kprobes.h>
#include <asm/paca.h>
#include <asm/ptrace.h>
#include <asm/reg.h>
#include <asm/signal.h>
#include <asm/switch_to.h>
#include <asm/syscall.h>
#include <asm/time.h>
#include <asm/unistd.h>

#if defined(CONFIG_PPC_ADV_DEBUG_REGS) && defined(CONFIG_PPC32)
unsigned long global_dbcr0[NR_CPUS];
#endif

typedef long (*syscall_fn)(long, long, long, long, long, long);

#ifdef CONFIG_PPC_BOOK3S_64
DEFINE_STATIC_KEY_FALSE(interrupt_exit_not_reentrant);
static inline bool exit_must_hard_disable(void)
{
	return static_branch_unlikely(&interrupt_exit_not_reentrant);
}
#else
static inline bool exit_must_hard_disable(void)
{
	return true;
}
#endif

/*
 * local irqs must be disabled. Returns false if the caller must re-enable
 * them, check for new work, and try again.
 *
 * This should be called with local irqs disabled, but if they were previously
 * enabled when the interrupt handler returns (indicating a process-context /
 * synchronous interrupt) then irqs_enabled should be true.
 *
 * If restartable is true then EE/RI can be left on because interrupts are
 * handled with a restart sequence.
 */
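/*
 * For illustration, a false return is handled by the exit paths below with a
 * retry loop roughly like this (simplified from
 * interrupt_exit_user_prepare_main()):
 *
 *	again:
 *		... do exit work, prepare state for return to user ...
 *		if (!prep_irq_for_enabled_exit(true)) {
 *			local_irq_enable();
 *			local_irq_disable();
 *			goto again;
 *		}
 */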
static notrace __always_inline bool prep_irq_for_enabled_exit(bool restartable)
{
	/* This must be done with RI=1 because tracing may touch vmaps */
	trace_hardirqs_on();

	if (exit_must_hard_disable() || !restartable)
		__hard_EE_RI_disable();

#ifdef CONFIG_PPC64
	/* This pattern matches prep_irq_for_idle */
	if (unlikely(lazy_irq_pending_nocheck())) {
		if (exit_must_hard_disable() || !restartable) {
			local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
			__hard_RI_enable();
		}
		trace_hardirqs_off();

		return false;
	}
#endif
	return true;
}

/* Has to run notrace because it is entered not completely "reconciled" */
notrace long system_call_exception(long r3, long r4, long r5,
				   long r6, long r7, long r8,
				   unsigned long r0, struct pt_regs *regs)
{
	syscall_fn f;

	kuep_lock();

	regs->orig_gpr3 = r3;

	if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
		BUG_ON(irq_soft_mask_return() != IRQS_ALL_DISABLED);

	trace_hardirqs_off(); /* finish reconciling */

	CT_WARN_ON(ct_state() == CONTEXT_KERNEL);
	user_exit_irqoff();

	BUG_ON(regs_is_unrecoverable(regs));
	BUG_ON(!(regs->msr & MSR_PR));
	BUG_ON(arch_irq_disabled_regs(regs));

#ifdef CONFIG_PPC_PKEY
	if (mmu_has_feature(MMU_FTR_PKEY)) {
		unsigned long amr, iamr;
		bool flush_needed = false;
		/*
		 * When entering from userspace we mostly have the AMR/IAMR
		 * different from kernel default values. Hence don't compare.
		 */
		amr = mfspr(SPRN_AMR);
		iamr = mfspr(SPRN_IAMR);
		regs->amr = amr;
		regs->iamr = iamr;
		if (mmu_has_feature(MMU_FTR_BOOK3S_KUAP)) {
			mtspr(SPRN_AMR, AMR_KUAP_BLOCKED);
			flush_needed = true;
		}
		if (mmu_has_feature(MMU_FTR_BOOK3S_KUEP)) {
			mtspr(SPRN_IAMR, AMR_KUEP_BLOCKED);
			flush_needed = true;
		}
		if (flush_needed)
			isync();
	} else
#endif
		kuap_assert_locked();

	booke_restore_dbcr0();

	account_cpu_user_entry();

	account_stolen_time();

	/*
	 * This is not required for the syscall exit path, but makes the
	 * stack frame look nicer. If this was initialised in the first stack
	 * frame, or if the unwinder was taught the first stack frame always
	 * returns to user with IRQS_ENABLED, this store could be avoided!
	 */
	irq_soft_mask_regs_set_state(regs, IRQS_ENABLED);

	local_irq_enable();

	if (unlikely(current_thread_info()->flags & _TIF_SYSCALL_DOTRACE)) {
		if (unlikely(trap_is_unsupported_scv(regs))) {
			/* Unsupported scv vector */
			_exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
			return regs->gpr[3];
		}
		/*
		 * We use the return value of do_syscall_trace_enter() as the
		 * syscall number. If the syscall was rejected for any reason
		 * do_syscall_trace_enter() returns an invalid syscall number
		 * and the test against NR_syscalls will fail and the return
		 * value to be used is in regs->gpr[3].
		 */
		r0 = do_syscall_trace_enter(regs);
		if (unlikely(r0 >= NR_syscalls))
			return regs->gpr[3];
		r3 = regs->gpr[3];
		r4 = regs->gpr[4];
		r5 = regs->gpr[5];
		r6 = regs->gpr[6];
		r7 = regs->gpr[7];
		r8 = regs->gpr[8];

	} else if (unlikely(r0 >= NR_syscalls)) {
		if (unlikely(trap_is_unsupported_scv(regs))) {
			/* Unsupported scv vector */
			_exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
			return regs->gpr[3];
		}
		return -ENOSYS;
	}

	/* May be faster to do array_index_nospec? */
	barrier_nospec();

	if (unlikely(is_compat_task())) {
		f = (void *)compat_sys_call_table[r0];

		r3 &= 0x00000000ffffffffULL;
		r4 &= 0x00000000ffffffffULL;
		r5 &= 0x00000000ffffffffULL;
		r6 &= 0x00000000ffffffffULL;
		r7 &= 0x00000000ffffffffULL;
		r8 &= 0x00000000ffffffffULL;

	} else {
		f = (void *)sys_call_table[r0];
	}

	return f(r3, r4, r5, r6, r7, r8);
}

static notrace void booke_load_dbcr0(void)
{
#ifdef CONFIG_PPC_ADV_DEBUG_REGS
	unsigned long dbcr0 = current->thread.debug.dbcr0;

	if (likely(!(dbcr0 & DBCR0_IDM)))
		return;

	/*
	 * Check to see if the dbcr0 register is set up to debug.
	 * Use the internal debug mode bit to do this.
	 */
	mtmsr(mfmsr() & ~MSR_DE);
	if (IS_ENABLED(CONFIG_PPC32)) {
		isync();
		global_dbcr0[smp_processor_id()] = mfspr(SPRN_DBCR0);
	}
	mtspr(SPRN_DBCR0, dbcr0);
	mtspr(SPRN_DBSR, -1);
#endif
}
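
/*
 * On Book3S-64, exits that return with [h]rfid may skip reloading SRR0/1
 * (or HSRR0/1) when the paca's srr_valid/hsrr_valid flag says they already
 * hold the right values. Verify that assumption here: if the live registers
 * no longer match regs->nip/regs->msr, warn once, dump regs, and clear the
 * valid flag so the registers are reloaded before the return.
 */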
static void check_return_regs_valid(struct pt_regs *regs)
{
#ifdef CONFIG_PPC_BOOK3S_64
	unsigned long trap, srr0, srr1;
	static bool warned;
	u8 *validp;
	char *h;

	if (trap_is_scv(regs))
		return;

	trap = regs->trap;
	// EE in HV mode sets HSRRs like 0xea0
	if (cpu_has_feature(CPU_FTR_HVMODE) && trap == INTERRUPT_EXTERNAL)
		trap = 0xea0;

	switch (trap) {
	case 0x980:
	case INTERRUPT_H_DATA_STORAGE:
	case 0xe20:
	case 0xe40:
	case INTERRUPT_HMI:
	case 0xe80:
	case 0xea0:
	case INTERRUPT_H_FAC_UNAVAIL:
	case 0x1200:
	case 0x1500:
	case 0x1600:
	case 0x1800:
		validp = &local_paca->hsrr_valid;
		if (!*validp)
			return;

		srr0 = mfspr(SPRN_HSRR0);
		srr1 = mfspr(SPRN_HSRR1);
		h = "H";

		break;
	default:
		validp = &local_paca->srr_valid;
		if (!*validp)
			return;

		srr0 = mfspr(SPRN_SRR0);
		srr1 = mfspr(SPRN_SRR1);
		h = "";
		break;
	}

	if (srr0 == regs->nip && srr1 == regs->msr)
		return;

	/*
	 * An NMI / soft-NMI interrupt may have come in after we found
	 * srr_valid and before the SRRs are loaded. The interrupt then
	 * comes in and clobbers SRRs and clears srr_valid. Then we load
	 * the SRRs here and test them above and find they don't match.
	 *
	 * Test validity again after that, to catch such false positives.
	 *
	 * This test in general will have some window for false negatives
	 * and may not catch and fix all such cases if an NMI comes in
	 * later and clobbers SRRs without clearing srr_valid, but hopefully
	 * such things will get caught most of the time, statistically
	 * enough to be able to get a warning out.
	 */
	barrier();

	if (!*validp)
		return;

	if (!warned) {
		warned = true;
		printk("%sSRR0 was: %lx should be: %lx\n", h, srr0, regs->nip);
		printk("%sSRR1 was: %lx should be: %lx\n", h, srr1, regs->msr);
		show_regs(regs);
	}

	*validp = 0; /* fixup */
#endif
}
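
/*
 * Common exit-to-user preparation, shared by the syscall and interrupt
 * return paths: loop handling _TIF_USER_WORK_MASK work (reschedule,
 * signals/notifications) with interrupts enabled, restore TM or FP/VEC/VSX
 * state where needed, then attempt the interrupts-disabled exit, retrying
 * if a soft-masked interrupt becomes pending at the last moment. The
 * returned flags (e.g. _TIF_RESTOREALL) are passed back to the asm exit
 * code.
 */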
static notrace unsigned long
interrupt_exit_user_prepare_main(unsigned long ret, struct pt_regs *regs)
{
	unsigned long ti_flags;

again:
	ti_flags = READ_ONCE(current_thread_info()->flags);
	while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) {
		local_irq_enable();
		if (ti_flags & _TIF_NEED_RESCHED) {
			schedule();
		} else {
			/*
			 * SIGPENDING must restore signal handler function
			 * argument GPRs, and some non-volatiles (e.g., r1).
			 * Restore all for now. This could be made lighter.
			 */
			if (ti_flags & _TIF_SIGPENDING)
				ret |= _TIF_RESTOREALL;
			do_notify_resume(regs, ti_flags);
		}
		local_irq_disable();
		ti_flags = READ_ONCE(current_thread_info()->flags);
	}

	if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && IS_ENABLED(CONFIG_PPC_FPU)) {
		if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) &&
				unlikely((ti_flags & _TIF_RESTORE_TM))) {
			restore_tm_state(regs);
		} else {
			unsigned long mathflags = MSR_FP;

			if (cpu_has_feature(CPU_FTR_VSX))
				mathflags |= MSR_VEC | MSR_VSX;
			else if (cpu_has_feature(CPU_FTR_ALTIVEC))
				mathflags |= MSR_VEC;

			/*
			 * If userspace MSR has all available FP bits set,
			 * then they are live and no need to restore. If not,
			 * it means the regs were given up and restore_math
			 * may decide to restore them (to avoid taking an FP
			 * fault).
			 */
			if ((regs->msr & mathflags) != mathflags)
				restore_math(regs);
		}
	}

	check_return_regs_valid(regs);

	user_enter_irqoff();
	if (!prep_irq_for_enabled_exit(true)) {
		user_exit_irqoff();
		local_irq_enable();
		local_irq_disable();
		goto again;
	}

#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
	local_paca->tm_scratch = regs->msr;
#endif

	booke_load_dbcr0();

	account_cpu_user_exit();

	/* Restore user access locks last */
	kuap_user_restore(regs);
	kuep_unlock();

	return ret;
}

/*
 * This should be called after a syscall returns, with r3 the return value
 * from the syscall. If this function returns non-zero, the system call
 * exit assembly should additionally load all GPR registers and CTR and XER
 * from the interrupt frame.
 *
 * The function graph tracer can not trace the return side of this function,
 * because RI=0 and soft mask state is "unreconciled", so it is marked notrace.
 */
notrace unsigned long syscall_exit_prepare(unsigned long r3,
					   struct pt_regs *regs,
					   long scv)
{
	unsigned long ti_flags;
	unsigned long ret = 0;
	bool is_not_scv = !IS_ENABLED(CONFIG_PPC_BOOK3S_64) || !scv;

	CT_WARN_ON(ct_state() == CONTEXT_USER);

	kuap_assert_locked();

	regs->result = r3;

	/* Check whether the syscall is issued inside a restartable sequence */
	rseq_syscall(regs);

	ti_flags = current_thread_info()->flags;

	if (unlikely(r3 >= (unsigned long)-MAX_ERRNO) && is_not_scv) {
		if (likely(!(ti_flags & (_TIF_NOERROR | _TIF_RESTOREALL)))) {
			r3 = -r3;
			regs->ccr |= 0x10000000; /* Set SO bit in CR */
		}
	}

	if (unlikely(ti_flags & _TIF_PERSYSCALL_MASK)) {
		if (ti_flags & _TIF_RESTOREALL)
			ret = _TIF_RESTOREALL;
		else
			regs->gpr[3] = r3;
		clear_bits(_TIF_PERSYSCALL_MASK, &current_thread_info()->flags);
	} else {
		regs->gpr[3] = r3;
	}

	if (unlikely(ti_flags & _TIF_SYSCALL_DOTRACE)) {
		do_syscall_trace_leave(regs);
		ret |= _TIF_RESTOREALL;
	}

	local_irq_disable();
	ret = interrupt_exit_user_prepare_main(ret, regs);

#ifdef CONFIG_PPC64
	regs->exit_result = ret;
#endif

	return ret;
}
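
/*
 * Restart entry point for the system call exit: reached from the low-level
 * exit code when the restartable exit sequence could not complete (an
 * interrupt was taken, or a soft-masked interrupt was found pending), so
 * the exit work must be redone with interrupts hard disabled. The flags
 * previously computed by syscall_exit_prepare() are carried across the
 * restart in regs->exit_result.
 */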
#ifdef CONFIG_PPC64
notrace unsigned long syscall_exit_restart(unsigned long r3, struct pt_regs *regs)
{
	/*
	 * This is called when detecting a soft-pending interrupt as well as
	 * an alternate-return interrupt. So we can't just have the alternate
	 * return path clear SRR1[MSR] and set PACA_IRQ_HARD_DIS (unless
	 * the soft-pending case were to fix things up as well). RI might be
	 * disabled, in which case it gets re-enabled by __hard_irq_disable().
	 */
	__hard_irq_disable();
	local_paca->irq_happened |= PACA_IRQ_HARD_DIS;

#ifdef CONFIG_PPC_BOOK3S_64
	set_kuap(AMR_KUAP_BLOCKED);
#endif

	trace_hardirqs_off();
	user_exit_irqoff();
	account_cpu_user_entry();

	BUG_ON(!user_mode(regs));

	regs->exit_result = interrupt_exit_user_prepare_main(regs->exit_result, regs);

	return regs->exit_result;
}
#endif

notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs)
{
	unsigned long ret;

	BUG_ON(regs_is_unrecoverable(regs));
	BUG_ON(arch_irq_disabled_regs(regs));
	CT_WARN_ON(ct_state() == CONTEXT_USER);

	/*
	 * We don't need to restore AMR on the way back to userspace for KUAP.
	 * AMR can only have been unlocked if we interrupted the kernel.
	 */
	kuap_assert_locked();

	local_irq_disable();

	ret = interrupt_exit_user_prepare_main(0, regs);

#ifdef CONFIG_PPC64
	regs->exit_result = ret;
#endif

	return ret;
}

void preempt_schedule_irq(void);

notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs)
{
	unsigned long flags;
	unsigned long ret = 0;
	unsigned long kuap;
	bool stack_store = current_thread_info()->flags &
						_TIF_EMULATE_STACK_STORE;

	if (regs_is_unrecoverable(regs))
		unrecoverable_exception(regs);
	/*
	 * CT_WARN_ON comes here via program_check_exception,
	 * so avoid recursion.
	 */
	if (TRAP(regs) != INTERRUPT_PROGRAM)
		CT_WARN_ON(ct_state() == CONTEXT_USER);

	kuap = kuap_get_and_assert_locked();

	local_irq_save(flags);

	if (!arch_irq_disabled_regs(regs)) {
		/* Returning to a kernel context with local irqs enabled. */
		WARN_ON_ONCE(!(regs->msr & MSR_EE));
again:
		if (IS_ENABLED(CONFIG_PREEMPT)) {
			/* Return to preemptible kernel context */
			if (unlikely(current_thread_info()->flags & _TIF_NEED_RESCHED)) {
				if (preempt_count() == 0)
					preempt_schedule_irq();
			}
		}

		check_return_regs_valid(regs);

		/*
		 * Stack store exit can't be restarted because the interrupt
		 * stack frame might have been clobbered.
		 */
		if (!prep_irq_for_enabled_exit(unlikely(stack_store))) {
			/*
			 * Replay pending soft-masked interrupts now. Don't
			 * just local_irq_enable(); local_irq_disable(); because
			 * if we are returning from an asynchronous interrupt
			 * here, another one might hit after irqs are enabled,
			 * and it would exit via this same path allowing
			 * another to fire, and so on unbounded.
			 */
			hard_irq_disable();
			replay_soft_interrupts();
			/* Took an interrupt, may have more exit work to do. */
			goto again;
		}
#ifdef CONFIG_PPC64
		/*
		 * An interrupt may clear MSR[EE] and set this concurrently,
		 * but it will be marked pending and the exit will be retried.
		 * This leaves a racy window where MSR[EE]=0 and HARD_DIS is
		 * clear, until interrupt_exit_kernel_restart() calls
		 * hard_irq_disable(), which will set HARD_DIS again.
		 */
		local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS;

	} else {
		check_return_regs_valid(regs);

		if (unlikely(stack_store))
			__hard_EE_RI_disable();
		/*
		 * Returning to a kernel context with local irqs disabled.
		 * Here, if EE was enabled in the interrupted context, enable
		 * it on return as well. A problem exists here where a soft
		 * masked interrupt may have cleared MSR[EE] and set HARD_DIS
		 * here, and it will still exist on return to the caller. This
		 * will be resolved by the masked interrupt firing again.
		 */
		if (regs->msr & MSR_EE)
			local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS;
#endif /* CONFIG_PPC64 */
	}

	if (unlikely(stack_store)) {
		clear_bits(_TIF_EMULATE_STACK_STORE, &current_thread_info()->flags);
		ret = 1;
	}

#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
	local_paca->tm_scratch = regs->msr;
#endif

	/*
	 * 64s does not want to mfspr(SPRN_AMR) here, because this comes after
	 * mtmsr, which would cause Read-After-Write stalls. Hence, take the
	 * AMR value from the check above.
	 */
	kuap_kernel_restore(regs, kuap);

	return ret;
}
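
/*
 * As with syscall_exit_restart() above, these are the restart entry points
 * for the interrupt return paths: re-establish a hard-disabled (and, on
 * Book3S-64, KUAP-blocked) state and redo the exit preparation.
 */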
#ifdef CONFIG_PPC64
notrace unsigned long interrupt_exit_user_restart(struct pt_regs *regs)
{
	__hard_irq_disable();
	local_paca->irq_happened |= PACA_IRQ_HARD_DIS;

#ifdef CONFIG_PPC_BOOK3S_64
	set_kuap(AMR_KUAP_BLOCKED);
#endif

	trace_hardirqs_off();
	user_exit_irqoff();
	account_cpu_user_entry();

	BUG_ON(!user_mode(regs));

	regs->exit_result |= interrupt_exit_user_prepare(regs);

	return regs->exit_result;
}

/*
 * No real need to return a value here because the stack store case does not
 * get restarted.
 */
notrace unsigned long interrupt_exit_kernel_restart(struct pt_regs *regs)
{
	__hard_irq_disable();
	local_paca->irq_happened |= PACA_IRQ_HARD_DIS;

#ifdef CONFIG_PPC_BOOK3S_64
	set_kuap(AMR_KUAP_BLOCKED);
#endif

	if (regs->softe == IRQS_ENABLED)
		trace_hardirqs_off();

	BUG_ON(user_mode(regs));

	return interrupt_exit_kernel_prepare(regs);
}
#endif