// SPDX-License-Identifier: GPL-2.0

#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/resume_user_mode.h>
#include <linux/highmem.h>
#include <linux/jump_label.h>
#include <linux/livepatch.h>
#include <linux/audit.h>
#include <linux/tick.h>

#include "common.h"

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

/* See comment for enter_from_user_mode() in entry-common.h */
static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
{
	arch_check_user_regs(regs);
	lockdep_hardirqs_off(CALLER_ADDR0);

	CT_WARN_ON(ct_state() != CONTEXT_USER);
	user_exit_irqoff();

	instrumentation_begin();
	trace_hardirqs_off_finish();
	instrumentation_end();
}

void noinstr enter_from_user_mode(struct pt_regs *regs)
{
	__enter_from_user_mode(regs);
}

static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
	if (unlikely(audit_context())) {
		unsigned long args[6];

		syscall_get_arguments(current, regs, args);
		audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
	}
}

static long syscall_trace_enter(struct pt_regs *regs, long syscall,
				unsigned long work)
{
	long ret = 0;

	/*
	 * Handle Syscall User Dispatch. This must come first, since
	 * the ABI here can be something that doesn't make sense for
	 * other syscall_work features.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (syscall_user_dispatch(regs))
			return -1L;
	}

	/* Handle ptrace */
	if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
		ret = ptrace_report_syscall_entry(regs);
		if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
			return -1L;
	}

	/* Do seccomp after ptrace, to catch any tracer changes. */
	if (work & SYSCALL_WORK_SECCOMP) {
		ret = __secure_computing(NULL);
		if (ret == -1L)
			return ret;
	}

	/* Either of the above might have changed the syscall number */
	syscall = syscall_get_nr(current, regs);

	if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
		trace_sys_enter(regs, syscall);

	syscall_enter_audit(regs, syscall);

	return ret ? : syscall;
}
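/*
 * Shared syscall entry work handling for syscall_enter_from_user_mode()
 * and syscall_enter_from_user_mode_work(). If any SYSCALL_WORK_ENTER bit
 * is set, the slow path in syscall_trace_enter() is taken.
 */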
static __always_inline long
__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);

	if (work & SYSCALL_WORK_ENTER)
		syscall = syscall_trace_enter(regs, syscall, work);

	return syscall;
}

long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
	return __syscall_enter_from_user_work(regs, syscall);
}

noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
	long ret;

	__enter_from_user_mode(regs);

	instrumentation_begin();
	local_irq_enable();
	ret = __syscall_enter_from_user_work(regs, syscall);
	instrumentation_end();

	return ret;
}

noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
{
	__enter_from_user_mode(regs);
	instrumentation_begin();
	local_irq_enable();
	instrumentation_end();
}

/* See comment for exit_to_user_mode() in entry-common.h */
static __always_inline void __exit_to_user_mode(void)
{
	instrumentation_begin();
	trace_hardirqs_on_prepare();
	lockdep_hardirqs_on_prepare(CALLER_ADDR0);
	instrumentation_end();

	user_enter_irqoff();
	arch_exit_to_user_mode();
	lockdep_hardirqs_on(CALLER_ADDR0);
}

void noinstr exit_to_user_mode(void)
{
	__exit_to_user_mode();
}

/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }

#ifdef CONFIG_RT_DELAYED_SIGNALS
static inline void raise_delayed_signal(void)
{
	if (unlikely(current->forced_info.si_signo)) {
		force_sig_info(&current->forced_info);
		current->forced_info.si_signo = 0;
	}
}
#else
static inline void raise_delayed_signal(void) { }
#endif

static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
					    unsigned long ti_work)
{
	/*
	 * Before returning to user space ensure that all pending work
	 * items have been completed.
	 */
	while (ti_work & EXIT_TO_USER_MODE_WORK) {

		local_irq_enable_exit_to_user(ti_work);

		if (ti_work & _TIF_NEED_RESCHED)
			schedule();

		raise_delayed_signal();

		if (ti_work & _TIF_UPROBE)
			uprobe_notify_resume(regs);

		if (ti_work & _TIF_PATCH_PENDING)
			klp_update_patch_state(current);

		if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
			arch_do_signal_or_restart(regs);

		if (ti_work & _TIF_NOTIFY_RESUME)
			resume_user_mode_work(regs);

		/* Architecture specific TIF work */
		arch_exit_to_user_mode_work(regs, ti_work);

		/*
		 * Disable interrupts and reevaluate the work flags as they
		 * might have changed while interrupts and preemption were
		 * enabled above.
		 */
		local_irq_disable_exit_to_user();

		/* Check if any of the above work has queued a deferred wakeup */
		tick_nohz_user_enter_prepare();

		ti_work = read_thread_flags();
	}

	/* Return the latest work state for arch_exit_to_user_mode() */
	return ti_work;
}
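/*
 * Last preparation step before returning to user space. Runs with
 * interrupts disabled, handles pending TIF work via exit_to_user_mode_loop()
 * and performs the architecture specific preparation and final sanity
 * checks.
 */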
static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
	unsigned long ti_work = read_thread_flags();

	lockdep_assert_irqs_disabled();

	/* Flush pending rcuog wakeup before the last need_resched() check */
	tick_nohz_user_enter_prepare();

	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
		ti_work = exit_to_user_mode_loop(regs, ti_work);

	arch_exit_to_user_mode_prepare(regs, ti_work);

	/* Ensure that the address limit is intact and no locks are held */
	addr_limit_user_check();
	kmap_assert_nomap();
	lockdep_assert_irqs_disabled();
	lockdep_sys_exit();
}

/*
 * If SYSCALL_EMU is set, then the only reason to report is when
 * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
 * instruction has already been reported in syscall_enter_from_user_mode().
 */
static inline bool report_single_step(unsigned long work)
{
	if (work & SYSCALL_WORK_SYSCALL_EMU)
		return false;

	return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
}

static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
{
	bool step;

	/*
	 * If the syscall was rolled back due to syscall user dispatching,
	 * then the tracers below are not invoked for the same reason as
	 * the entry side was not invoked in syscall_trace_enter(): The ABI
	 * of these syscalls is unknown.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (unlikely(current->syscall_dispatch.on_dispatch)) {
			current->syscall_dispatch.on_dispatch = false;
			return;
		}
	}

	audit_syscall_exit(regs);

	if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
		trace_sys_exit(regs, syscall_get_return_value(current, regs));

	step = report_single_step(work);
	if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
		ptrace_report_syscall_exit(regs, step);
}

/*
 * Syscall specific exit to user mode preparation. Runs with interrupts
 * enabled.
 */
static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
	unsigned long nr = syscall_get_nr(current, regs);

	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);

	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
		if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
			local_irq_enable();
	}

	rseq_syscall(regs);

	/*
	 * Do one-time syscall specific work. If these work items are
	 * enabled, we want to run them exactly once per syscall exit with
	 * interrupts enabled.
	 */
	if (unlikely(work & SYSCALL_WORK_EXIT))
		syscall_exit_work(regs, work);
}

static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
	syscall_exit_to_user_mode_prepare(regs);
	local_irq_disable_exit_to_user();
	exit_to_user_mode_prepare(regs);
}

void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
	__syscall_exit_to_user_mode_work(regs);
}
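/*
 * Combined syscall exit work and return to user mode. The work runs
 * inside an instrumentation_begin()/end() pair; the final transition in
 * __exit_to_user_mode() must remain noinstr.
 */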
__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	__syscall_exit_to_user_mode_work(regs);
	instrumentation_end();
	__exit_to_user_mode();
}

noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
	__enter_from_user_mode(regs);
}

noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	exit_to_user_mode_prepare(regs);
	instrumentation_end();
	__exit_to_user_mode();
}

noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
	irqentry_state_t ret = {
		.exit_rcu = false,
	};

	if (user_mode(regs)) {
		irqentry_enter_from_user_mode(regs);
		return ret;
	}

	/*
	 * If this entry hit the idle task, invoke rcu_irq_enter() whether
	 * RCU is watching or not.
	 *
	 * Interrupts can nest when the first interrupt invokes softirq
	 * processing on return which enables interrupts.
	 *
	 * Scheduler ticks in the idle task can mark quiescent state and
	 * terminate a grace period, if and only if the timer interrupt is
	 * not nested into another interrupt.
	 *
	 * Checking for rcu_is_watching() here would prevent the nesting
	 * interrupt from invoking rcu_irq_enter(). If that nested interrupt
	 * is the tick then rcu_flavor_sched_clock_irq() would wrongfully
	 * assume that it is the first interrupt and eventually claim
	 * quiescent state and end grace periods prematurely.
	 *
	 * Unconditionally invoke rcu_irq_enter() so RCU state stays
	 * consistent.
	 *
	 * TINY_RCU does not support EQS, so let the compiler eliminate
	 * this part when enabled.
	 */
	if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
		/*
		 * If RCU is not watching then the same careful
		 * sequence vs. lockdep and tracing is required
		 * as in irqentry_enter_from_user_mode().
		 */
		lockdep_hardirqs_off(CALLER_ADDR0);
		rcu_irq_enter();
		instrumentation_begin();
		trace_hardirqs_off_finish();
		instrumentation_end();

		ret.exit_rcu = true;
		return ret;
	}

	/*
	 * If RCU is watching then RCU only wants to check whether it needs
	 * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
	 * already contains a warning when RCU is not watching, so no point
	 * in having another one here.
	 */
	lockdep_hardirqs_off(CALLER_ADDR0);
	instrumentation_begin();
	rcu_irq_enter_check_tick();
	trace_hardirqs_off_finish();
	instrumentation_end();

	return ret;
}
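/*
 * Conditional reschedule on return from interrupt to kernel mode. Only
 * reschedules when the preempt count is zero, after sanity checking RCU
 * and the thread stack.
 */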
void raw_irqentry_exit_cond_resched(void)
{
	if (!preempt_count()) {
		/* Sanity check RCU and thread stack */
		rcu_irq_exit_check_preempt();
		if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
			WARN_ON_ONCE(!on_thread_stack());
		if (need_resched())
			preempt_schedule_irq();
	}
}
#ifdef CONFIG_PREEMPT_DYNAMIC
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
void dynamic_irqentry_exit_cond_resched(void)
{
	if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
		return;
	raw_irqentry_exit_cond_resched();
}
#endif
#endif

noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
	lockdep_assert_irqs_disabled();

	/* Check whether this returns to user mode */
	if (user_mode(regs)) {
		irqentry_exit_to_user_mode(regs);
	} else if (!regs_irqs_disabled(regs)) {
		/*
		 * If RCU was not watching on entry this needs to be done
		 * carefully and needs the same ordering of lockdep/tracing
		 * and RCU as the return to user mode path.
		 */
		if (state.exit_rcu) {
			instrumentation_begin();
			/* Tell the tracer that IRET will enable interrupts */
			trace_hardirqs_on_prepare();
			lockdep_hardirqs_on_prepare(CALLER_ADDR0);
			instrumentation_end();
			rcu_irq_exit();
			lockdep_hardirqs_on(CALLER_ADDR0);
			return;
		}

		instrumentation_begin();
		if (IS_ENABLED(CONFIG_PREEMPTION))
			irqentry_exit_cond_resched();

		/* Covers both tracing and lockdep */
		trace_hardirqs_on();
		instrumentation_end();
	} else {
		/*
		 * IRQ flags state is correct already. Just tell RCU if it
		 * was not watching on entry.
		 */
		if (state.exit_rcu)
			rcu_irq_exit();
	}
}

irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
{
	irqentry_state_t irq_state;

	irq_state.lockdep = lockdep_hardirqs_enabled();

	__nmi_enter();
	lockdep_hardirqs_off(CALLER_ADDR0);
	lockdep_hardirq_enter();
	rcu_nmi_enter();

	instrumentation_begin();
	trace_hardirqs_off_finish();
	ftrace_nmi_enter();
	instrumentation_end();

	return irq_state;
}

void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
{
	instrumentation_begin();
	ftrace_nmi_exit();
	if (irq_state.lockdep) {
		trace_hardirqs_on_prepare();
		lockdep_hardirqs_on_prepare(CALLER_ADDR0);
	}
	instrumentation_end();

	rcu_nmi_exit();
	lockdep_hardirq_exit();
	if (irq_state.lockdep)
		lockdep_hardirqs_on(CALLER_ADDR0);
	__nmi_exit();
}