// SPDX-License-Identifier: GPL-2.0

#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/resume_user_mode.h>
#include <linux/highmem.h>
#include <linux/jump_label.h>
#include <linux/livepatch.h>
#include <linux/audit.h>
#include <linux/tick.h>

#include "common.h"

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

/* See comment for enter_from_user_mode() in entry-common.h */
static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
{
        arch_enter_from_user_mode(regs);
        lockdep_hardirqs_off(CALLER_ADDR0);

        CT_WARN_ON(ct_state() != CONTEXT_USER);
        user_exit_irqoff();

        instrumentation_begin();
        trace_hardirqs_off_finish();
        instrumentation_end();
}

void noinstr enter_from_user_mode(struct pt_regs *regs)
{
        __enter_from_user_mode(regs);
}

static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
        if (unlikely(audit_context())) {
                unsigned long args[6];

                syscall_get_arguments(current, regs, args);
                audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
        }
}
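/*
 * Syscall entry work: syscall user dispatch, ptrace, seccomp, the entry
 * tracepoint and audit. Returns the syscall number to run, which ptrace
 * or seccomp may have changed, or -1 to skip the syscall.
 */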
static long syscall_trace_enter(struct pt_regs *regs, long syscall,
                                unsigned long work)
{
        long ret = 0;

        /*
         * Handle Syscall User Dispatch. This must come first, since
         * the ABI here can be something that doesn't make sense for
         * other syscall_work features.
         */
        if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
                if (syscall_user_dispatch(regs))
                        return -1L;
        }

        /* Handle ptrace */
        if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
                ret = ptrace_report_syscall_entry(regs);
                if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
                        return -1L;
        }

        /* Do seccomp after ptrace, to catch any tracer changes. */
        if (work & SYSCALL_WORK_SECCOMP) {
                ret = __secure_computing(NULL);
                if (ret == -1L)
                        return ret;
        }

        /* Either of the above might have changed the syscall number */
        syscall = syscall_get_nr(current, regs);

        if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
                trace_sys_enter(regs, syscall);

        syscall_enter_audit(regs, syscall);

        return ret ? : syscall;
}

static __always_inline long
__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
{
        unsigned long work = READ_ONCE(current_thread_info()->syscall_work);

        if (work & SYSCALL_WORK_ENTER)
                syscall = syscall_trace_enter(regs, syscall, work);

        return syscall;
}

long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
        return __syscall_enter_from_user_work(regs, syscall);
}

noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
        long ret;

        __enter_from_user_mode(regs);

        instrumentation_begin();
        local_irq_enable();
        ret = __syscall_enter_from_user_work(regs, syscall);
        instrumentation_end();

        return ret;
}

noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
{
        __enter_from_user_mode(regs);
        instrumentation_begin();
        local_irq_enable();
        instrumentation_end();
}

/* See comment for exit_to_user_mode() in entry-common.h */
static __always_inline void __exit_to_user_mode(void)
{
        instrumentation_begin();
        trace_hardirqs_on_prepare();
        lockdep_hardirqs_on_prepare();
        instrumentation_end();

        user_enter_irqoff();
        arch_exit_to_user_mode();
        lockdep_hardirqs_on(CALLER_ADDR0);
}

void noinstr exit_to_user_mode(void)
{
        __exit_to_user_mode();
}

/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }

static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
                                            unsigned long ti_work)
{
        /*
         * Before returning to user space ensure that all pending work
         * items have been completed.
         */
        while (ti_work & EXIT_TO_USER_MODE_WORK) {

                local_irq_enable_exit_to_user(ti_work);

                if (ti_work & _TIF_NEED_RESCHED)
                        schedule();

                if (ti_work & _TIF_UPROBE)
                        uprobe_notify_resume(regs);

                if (ti_work & _TIF_PATCH_PENDING)
                        klp_update_patch_state(current);

                if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
                        arch_do_signal_or_restart(regs);

                if (ti_work & _TIF_NOTIFY_RESUME)
                        resume_user_mode_work(regs);

                /* Architecture specific TIF work */
                arch_exit_to_user_mode_work(regs, ti_work);

                /*
                 * Disable interrupts and reevaluate the work flags as they
                 * might have changed while interrupts and preemption were
                 * enabled above.
                 */
                local_irq_disable_exit_to_user();

                /* Check if any of the above work has queued a deferred wakeup */
                tick_nohz_user_enter_prepare();

                ti_work = read_thread_flags();
        }

        /* Return the latest work state for arch_exit_to_user_mode() */
        return ti_work;
}

static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
        unsigned long ti_work = read_thread_flags();

        lockdep_assert_irqs_disabled();

        /* Flush pending rcuog wakeup before the last need_resched() check */
        tick_nohz_user_enter_prepare();

        if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
                ti_work = exit_to_user_mode_loop(regs, ti_work);

        arch_exit_to_user_mode_prepare(regs, ti_work);

        /* Ensure that the address limit is intact and no locks are held */
        addr_limit_user_check();
        kmap_assert_nomap();
        lockdep_assert_irqs_disabled();
        lockdep_sys_exit();
}

/*
 * If SYSCALL_EMU is set, then the only reason to report is when
 * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
 * instruction has already been reported in syscall_enter_from_user_mode().
 */
static inline bool report_single_step(unsigned long work)
{
        if (work & SYSCALL_WORK_SYSCALL_EMU)
                return false;

        return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
}
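/*
 * One-time syscall exit work: audit, the exit tracepoint and ptrace
 * exit/single-step reporting. Skipped when the syscall was handled by
 * syscall user dispatch.
 */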
static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
{
        bool step;

        /*
         * If the syscall was rolled back due to syscall user dispatching,
         * then the tracers below are not invoked for the same reason as
         * the entry side was not invoked in syscall_trace_enter(): The ABI
         * of these syscalls is unknown.
         */
        if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
                if (unlikely(current->syscall_dispatch.on_dispatch)) {
                        current->syscall_dispatch.on_dispatch = false;
                        return;
                }
        }

        audit_syscall_exit(regs);

        if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
                trace_sys_exit(regs, syscall_get_return_value(current, regs));

        step = report_single_step(work);
        if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
                ptrace_report_syscall_exit(regs, step);
}

/*
 * Syscall specific exit to user mode preparation. Runs with interrupts
 * enabled.
 */
static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
{
        unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
        unsigned long nr = syscall_get_nr(current, regs);

        CT_WARN_ON(ct_state() != CONTEXT_KERNEL);

        if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
                if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
                        local_irq_enable();
        }

        rseq_syscall(regs);

        /*
         * Do one-time syscall specific work. If these work items are
         * enabled, we want to run them exactly once per syscall exit with
         * interrupts enabled.
         */
        if (unlikely(work & SYSCALL_WORK_EXIT))
                syscall_exit_work(regs, work);
}

static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
        syscall_exit_to_user_mode_prepare(regs);
        local_irq_disable_exit_to_user();
        exit_to_user_mode_prepare(regs);
}

void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
        __syscall_exit_to_user_mode_work(regs);
}

__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
{
        instrumentation_begin();
        __syscall_exit_to_user_mode_work(regs);
        instrumentation_end();
        __exit_to_user_mode();
}

noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
        __enter_from_user_mode(regs);
}

noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
{
        instrumentation_begin();
        exit_to_user_mode_prepare(regs);
        instrumentation_end();
        __exit_to_user_mode();
}

noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
        irqentry_state_t ret = {
                .exit_rcu = false,
        };

        if (user_mode(regs)) {
                irqentry_enter_from_user_mode(regs);
                return ret;
        }

        /*
         * If this entry hit the idle task, invoke rcu_irq_enter() whether
         * RCU is watching or not.
         *
         * Interrupts can nest when the first interrupt invokes softirq
         * processing on return, which enables interrupts.
         *
         * Scheduler ticks in the idle task can mark quiescent state and
         * terminate a grace period, if and only if the timer interrupt is
         * not nested into another interrupt.
         *
         * Checking for rcu_is_watching() here would prevent the nesting
         * interrupt from invoking rcu_irq_enter(). If that nested interrupt
         * is the tick then rcu_flavor_sched_clock_irq() would wrongfully
         * assume that it is the first interrupt and eventually claim
         * quiescent state and end grace periods prematurely.
         *
         * Unconditionally invoke rcu_irq_enter() so RCU state stays
         * consistent.
         *
         * TINY_RCU does not support EQS, so let the compiler eliminate
         * this part when enabled.
         */
        if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
                /*
                 * If RCU is not watching then the same careful
                 * sequence vs. lockdep and tracing is required
                 * as in irqentry_enter_from_user_mode().
                 */
                lockdep_hardirqs_off(CALLER_ADDR0);
                rcu_irq_enter();
                instrumentation_begin();
                trace_hardirqs_off_finish();
                instrumentation_end();

                ret.exit_rcu = true;
                return ret;
        }

        /*
         * If RCU is watching then RCU only wants to check whether it needs
         * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
         * already contains a warning when RCU is not watching, so no point
         * in having another one here.
         */
        lockdep_hardirqs_off(CALLER_ADDR0);
        instrumentation_begin();
        rcu_irq_enter_check_tick();
        trace_hardirqs_off_finish();
        instrumentation_end();

        return ret;
}
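/*
 * Reschedule on return from interrupt to kernel mode when the interrupted
 * context is preemptible and a reschedule is pending.
 */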
void raw_irqentry_exit_cond_resched(void)
{
        if (!preempt_count()) {
                /* Sanity check RCU and thread stack */
                rcu_irq_exit_check_preempt();
                if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
                        WARN_ON_ONCE(!on_thread_stack());
                if (need_resched())
                        preempt_schedule_irq();
        }
}
#ifdef CONFIG_PREEMPT_DYNAMIC
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
void dynamic_irqentry_exit_cond_resched(void)
{
        if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
                return;
        raw_irqentry_exit_cond_resched();
}
#endif
#endif

noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
        lockdep_assert_irqs_disabled();

        /* Check whether this returns to user mode */
        if (user_mode(regs)) {
                irqentry_exit_to_user_mode(regs);
        } else if (!regs_irqs_disabled(regs)) {
                /*
                 * If RCU was not watching on entry this needs to be done
                 * carefully and needs the same ordering of lockdep/tracing
                 * and RCU as the return to user mode path.
                 */
                if (state.exit_rcu) {
                        instrumentation_begin();
                        /* Tell the tracer that IRET will enable interrupts */
                        trace_hardirqs_on_prepare();
                        lockdep_hardirqs_on_prepare();
                        instrumentation_end();
                        rcu_irq_exit();
                        lockdep_hardirqs_on(CALLER_ADDR0);
                        return;
                }

                instrumentation_begin();
                if (IS_ENABLED(CONFIG_PREEMPTION))
                        irqentry_exit_cond_resched();

                /* Covers both tracing and lockdep */
                trace_hardirqs_on();
                instrumentation_end();
        } else {
                /*
                 * IRQ flags state is correct already. Just tell RCU if it
                 * was not watching on entry.
                 */
                if (state.exit_rcu)
                        rcu_irq_exit();
        }
}
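/*
 * NMI entry/exit bookkeeping: the lockdep, RCU and ftrace state saved on
 * entry is restored symmetrically on exit.
 */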
irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
{
        irqentry_state_t irq_state;

        irq_state.lockdep = lockdep_hardirqs_enabled();

        __nmi_enter();
        lockdep_hardirqs_off(CALLER_ADDR0);
        lockdep_hardirq_enter();
        rcu_nmi_enter();

        instrumentation_begin();
        trace_hardirqs_off_finish();
        ftrace_nmi_enter();
        instrumentation_end();

        return irq_state;
}

void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
{
        instrumentation_begin();
        ftrace_nmi_exit();
        if (irq_state.lockdep) {
                trace_hardirqs_on_prepare();
                lockdep_hardirqs_on_prepare();
        }
        instrumentation_end();

        rcu_nmi_exit();
        lockdep_hardirq_exit();
        if (irq_state.lockdep)
                lockdep_hardirqs_on(CALLER_ADDR0);
        __nmi_exit();
}