// SPDX-License-Identifier: GPL-2.0

#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/resume_user_mode.h>
#include <linux/highmem.h>
#include <linux/jump_label.h>
#include <linux/kmsan.h>
#include <linux/livepatch.h>
#include <linux/audit.h>
#include <linux/tick.h>

#include "common.h"

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
	if (unlikely(audit_context())) {
		unsigned long args[6];

		syscall_get_arguments(current, regs, args);
		audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
	}
}

long syscall_trace_enter(struct pt_regs *regs, long syscall,
			 unsigned long work)
{
	long ret = 0;

	/*
	 * Handle Syscall User Dispatch. This must come first, since
	 * the ABI here can be something that doesn't make sense for
	 * other syscall_work features.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (syscall_user_dispatch(regs))
			return -1L;
	}

	/* Handle ptrace */
	if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
		ret = ptrace_report_syscall_entry(regs);
		if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
			return -1L;
	}

	/* Do seccomp after ptrace, to catch any tracer changes. */
	if (work & SYSCALL_WORK_SECCOMP) {
		ret = __secure_computing(NULL);
		if (ret == -1L)
			return ret;
	}

	/* Either of the above might have changed the syscall number */
	syscall = syscall_get_nr(current, regs);

	if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) {
		trace_sys_enter(regs, syscall);
		/*
		 * Probes or BPF hooks in the tracepoint may have changed the
		 * system call number as well.
		 */
		syscall = syscall_get_nr(current, regs);
	}

	syscall_enter_audit(regs, syscall);

	return ret ? : syscall;
}

noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
{
	enter_from_user_mode(regs);
	instrumentation_begin();
	local_irq_enable();
	instrumentation_end();
}
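
/*
 * For illustration only: architecture syscall entry code is expected to
 * drive the entry work through the wrappers in <linux/entry-common.h>,
 * either in one step:
 *
 *	nr = syscall_enter_from_user_mode(regs, nr);
 *
 * or, when architecture specific work has to run with interrupts enabled
 * before the generic entry work, in two steps:
 *
 *	syscall_enter_from_user_mode_prepare(regs);
 *	<architecture specific work>
 *	nr = syscall_enter_from_user_mode_work(regs, nr);
 *
 * Both variants end up in syscall_trace_enter() above, which returns the
 * original or a modified syscall number; -1 means the syscall must be
 * skipped.
 */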

/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }

/**
 * exit_to_user_mode_loop - do any pending work before leaving to user space
 * @regs: Pointer to pt_regs on entry stack
 * @ti_work: TIF work flags as read by the caller
 */
__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
						     unsigned long ti_work)
{
	/*
	 * Before returning to user space ensure that all pending work
	 * items have been completed.
	 */
	while (ti_work & EXIT_TO_USER_MODE_WORK) {

		local_irq_enable_exit_to_user(ti_work);

		if (ti_work & _TIF_NEED_RESCHED)
			schedule();

		if (ti_work & _TIF_UPROBE)
			uprobe_notify_resume(regs);

		if (ti_work & _TIF_PATCH_PENDING)
			klp_update_patch_state(current);

		if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
			arch_do_signal_or_restart(regs);

		if (ti_work & _TIF_NOTIFY_RESUME)
			resume_user_mode_work(regs);

		/* Architecture specific TIF work */
		arch_exit_to_user_mode_work(regs, ti_work);

		/*
		 * Disable interrupts and reevaluate the work flags as they
		 * might have changed while interrupts and preemption were
		 * enabled above.
		 */
		local_irq_disable_exit_to_user();

		/* Check if any of the above work has queued a deferred wakeup */
		tick_nohz_user_enter_prepare();

		ti_work = read_thread_flags();
	}

	/* Return the latest work state for arch_exit_to_user_mode() */
	return ti_work;
}

/*
 * If SYSCALL_EMU is set, then the only reason to report is when
 * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
 * instruction has already been reported in syscall_enter_from_user_mode().
 */
static inline bool report_single_step(unsigned long work)
{
	if (work & SYSCALL_WORK_SYSCALL_EMU)
		return false;

	return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
}

static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
{
	bool step;

	/*
	 * If the syscall was rolled back due to syscall user dispatching,
	 * then the tracers below are not invoked for the same reason as
	 * the entry side was not invoked in syscall_trace_enter(): The ABI
	 * of these syscalls is unknown.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (unlikely(current->syscall_dispatch.on_dispatch)) {
			current->syscall_dispatch.on_dispatch = false;
			return;
		}
	}

	audit_syscall_exit(regs);

	if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
		trace_sys_exit(regs, syscall_get_return_value(current, regs));

	step = report_single_step(work);
	if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
		ptrace_report_syscall_exit(regs, step);
}

/*
 * Syscall specific exit to user mode preparation. Runs with interrupts
 * enabled.
 */
static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
	unsigned long nr = syscall_get_nr(current, regs);

	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);

	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
		if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
			local_irq_enable();
	}

	rseq_syscall(regs);

	/*
	 * Do one-time syscall specific work. If these work items are
	 * enabled, we want to run them exactly once per syscall exit with
	 * interrupts enabled.
	 */
	if (unlikely(work & SYSCALL_WORK_EXIT))
		syscall_exit_work(regs, work);
}

static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
	syscall_exit_to_user_mode_prepare(regs);
	local_irq_disable_exit_to_user();
	exit_to_user_mode_prepare(regs);
}

void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
	__syscall_exit_to_user_mode_work(regs);
}

__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	__syscall_exit_to_user_mode_work(regs);
	instrumentation_end();
	exit_to_user_mode();
}
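
/*
 * For illustration only: a typical architecture syscall path pairs the
 * entry and exit wrappers around the actual syscall dispatch. Roughly,
 * where arch_do_syscall() and the dispatch step are placeholders, not
 * real interfaces:
 *
 *	noinstr void arch_do_syscall(struct pt_regs *regs, long nr)
 *	{
 *		nr = syscall_enter_from_user_mode(regs, nr);
 *
 *		instrumentation_begin();
 *		if (nr != -1)
 *			<dispatch syscall nr and store its return value in regs>
 *		instrumentation_end();
 *
 *		syscall_exit_to_user_mode(regs);
 *	}
 *
 * Architectures which need to do additional work before the final
 * transition can call syscall_exit_to_user_mode_work() instead and then
 * invoke exit_to_user_mode() themselves as the very last step before the
 * low level assembly code returns to user space.
 */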

noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
	enter_from_user_mode(regs);
}

noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	exit_to_user_mode_prepare(regs);
	instrumentation_end();
	exit_to_user_mode();
}

noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
	irqentry_state_t ret = {
		.exit_rcu = false,
	};

	if (user_mode(regs)) {
		irqentry_enter_from_user_mode(regs);
		return ret;
	}

	/*
	 * If this entry hit the idle task invoke ct_irq_enter() whether
	 * RCU is watching or not.
	 *
	 * Interrupts can nest when the first interrupt invokes softirq
	 * processing on return which enables interrupts.
	 *
	 * Scheduler ticks in the idle task can mark quiescent state and
	 * terminate a grace period, if and only if the timer interrupt is
	 * not nested into another interrupt.
	 *
	 * Checking for rcu_is_watching() here would prevent the nesting
	 * interrupt from invoking ct_irq_enter(). If that nested interrupt
	 * is the tick then rcu_flavor_sched_clock_irq() would wrongfully
	 * assume that it is the first interrupt and eventually claim
	 * quiescent state and end grace periods prematurely.
	 *
	 * Unconditionally invoke ct_irq_enter() so RCU state stays
	 * consistent.
	 *
	 * TINY_RCU does not support EQS, so let the compiler eliminate
	 * this part when enabled.
	 */
	if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
		/*
		 * If RCU is not watching then the same careful
		 * sequence vs. lockdep and tracing is required
		 * as in irqentry_enter_from_user_mode().
		 */
		lockdep_hardirqs_off(CALLER_ADDR0);
		ct_irq_enter();
		instrumentation_begin();
		kmsan_unpoison_entry_regs(regs);
		trace_hardirqs_off_finish();
		instrumentation_end();

		ret.exit_rcu = true;
		return ret;
	}

	/*
	 * If RCU is watching then RCU only wants to check whether it needs
	 * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
	 * already contains a warning when RCU is not watching, so no point
	 * in having another one here.
	 */
	lockdep_hardirqs_off(CALLER_ADDR0);
	instrumentation_begin();
	kmsan_unpoison_entry_regs(regs);
	rcu_irq_enter_check_tick();
	trace_hardirqs_off_finish();
	instrumentation_end();

	return ret;
}

void raw_irqentry_exit_cond_resched(void)
{
	if (!preempt_count()) {
		/* Sanity check RCU and thread stack */
		rcu_irq_exit_check_preempt();
		if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
			WARN_ON_ONCE(!on_thread_stack());
		if (need_resched())
			preempt_schedule_irq();
	}
}
#ifdef CONFIG_PREEMPT_DYNAMIC
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
void dynamic_irqentry_exit_cond_resched(void)
{
	if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
		return;
	raw_irqentry_exit_cond_resched();
}
#endif
#endif

noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
	lockdep_assert_irqs_disabled();

	/* Check whether this returns to user mode */
	if (user_mode(regs)) {
		irqentry_exit_to_user_mode(regs);
	} else if (!regs_irqs_disabled(regs)) {
		/*
		 * If RCU was not watching on entry this needs to be done
		 * carefully and needs the same ordering of lockdep/tracing
		 * and RCU as the return to user mode path.
		 */
		if (state.exit_rcu) {
			instrumentation_begin();
			/* Tell the tracer that IRET will enable interrupts */
			trace_hardirqs_on_prepare();
			lockdep_hardirqs_on_prepare();
			instrumentation_end();
			ct_irq_exit();
			lockdep_hardirqs_on(CALLER_ADDR0);
			return;
		}

		instrumentation_begin();
		if (IS_ENABLED(CONFIG_PREEMPTION))
			irqentry_exit_cond_resched();

		/* Covers both tracing and lockdep */
		trace_hardirqs_on();
		instrumentation_end();
	} else {
		/*
		 * IRQ flags state is correct already. Just tell RCU if it
		 * was not watching on entry.
		 */
		if (state.exit_rcu)
			ct_irq_exit();
	}
}
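
/*
 * For illustration only: interrupt and exception handlers built on the
 * generic entry code bracket their work with irqentry_enter() and
 * irqentry_exit(), keeping instrumentable code inside an explicit
 * instrumentation section. arch_irq_entry() is a placeholder name:
 *
 *	noinstr void arch_irq_entry(struct pt_regs *regs)
 *	{
 *		irqentry_state_t state = irqentry_enter(regs);
 *
 *		instrumentation_begin();
 *		<run the actual interrupt handler>
 *		instrumentation_end();
 *
 *		irqentry_exit(regs, state);
 *	}
 *
 * The state returned by irqentry_enter() must be handed back unmodified to
 * irqentry_exit() so that the RCU and lockdep state established on entry
 * is unwound correctly.
 */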

irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
{
	irqentry_state_t irq_state;

	irq_state.lockdep = lockdep_hardirqs_enabled();

	__nmi_enter();
	lockdep_hardirqs_off(CALLER_ADDR0);
	lockdep_hardirq_enter();
	ct_nmi_enter();

	instrumentation_begin();
	kmsan_unpoison_entry_regs(regs);
	trace_hardirqs_off_finish();
	ftrace_nmi_enter();
	instrumentation_end();

	return irq_state;
}

void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
{
	instrumentation_begin();
	ftrace_nmi_exit();
	if (irq_state.lockdep) {
		trace_hardirqs_on_prepare();
		lockdep_hardirqs_on_prepare();
	}
	instrumentation_end();

	ct_nmi_exit();
	lockdep_hardirq_exit();
	if (irq_state.lockdep)
		lockdep_hardirqs_on(CALLER_ADDR0);
	__nmi_exit();
}
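
/*
 * For illustration only: NMI-like entries use the dedicated pair above,
 * which accounts NMI nesting and uses the NMI aware context tracking
 * (ct_nmi_enter()/ct_nmi_exit()). arch_nmi_entry() and nmi_handler() are
 * placeholder names:
 *
 *	noinstr void arch_nmi_entry(struct pt_regs *regs)
 *	{
 *		irqentry_state_t irq_state = irqentry_nmi_enter(regs);
 *
 *		instrumentation_begin();
 *		nmi_handler(regs);
 *		instrumentation_end();
 *
 *		irqentry_nmi_exit(regs, irq_state);
 *	}
 */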