xref: /linux/kernel/entry/common.c (revision 37a93dd5c49b5fda807fd204edf2547c3493319c)
// SPDX-License-Identifier: GPL-2.0

#include <linux/irq-entry-common.h>
#include <linux/resume_user_mode.h>
#include <linux/highmem.h>
#include <linux/jump_label.h>
#include <linux/kmsan.h>
#include <linux/livepatch.h>
#include <linux/tick.h>

/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }

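/*
 * When the generic TIF bits are used, _TIF_RSEQ is masked out of the inner
 * work loop below: rseq work is instead picked up after the inner loop by
 * rseq_exit_to_user_mode_restart() in exit_to_user_mode_loop().
 */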
#ifdef CONFIG_HAVE_GENERIC_TIF_BITS
#define EXIT_TO_USER_MODE_WORK_LOOP	(EXIT_TO_USER_MODE_WORK & ~_TIF_RSEQ)
#else
#define EXIT_TO_USER_MODE_WORK_LOOP	(EXIT_TO_USER_MODE_WORK)
#endif

/* TIF bits which prevent a time slice extension. */
#ifdef CONFIG_PREEMPT_RT
/*
 * Since rseq slice ext has a direct correlation to the worst-case
 * scheduling latency (schedule is delayed after all), only have it affect
 * LAZY reschedules on PREEMPT_RT for now.
 *
 * However, since this delay is only applicable to userspace, a value
 * for rseq_slice_extension_nsec that is strictly shorter than the worst-case
 * kernel space preempt_disable() region should mean the scheduling latency
 * is not affected, even for !LAZY.
 *
 * Unfortunately, since this value depends on the hardware at hand, it cannot
 * be pre-determined in any sensible way. Hence punt on this problem for now.
 */
# define TIF_SLICE_EXT_SCHED	(_TIF_NEED_RESCHED_LAZY)
#else
# define TIF_SLICE_EXT_SCHED	(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
#endif
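/*
 * Any other pending exit work denies the grant of a slice extension; see
 * the rseq_grant_slice_extension() call in __exit_to_user_mode_loop().
 */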
#define TIF_SLICE_EXT_DENY	(EXIT_TO_USER_MODE_WORK & ~TIF_SLICE_EXT_SCHED)

static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs,
							      unsigned long ti_work)
{
	/*
	 * Before returning to user space ensure that all pending work
	 * items have been completed.
	 */
	while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) {

		local_irq_enable_exit_to_user(ti_work);

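		/*
		 * On a reschedule request, first try to grant the task a
		 * time slice extension instead of scheduling. The grant is
		 * refused when any other exit work (TIF_SLICE_EXT_DENY) is
		 * pending.
		 */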
		if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
			if (!rseq_grant_slice_extension(ti_work & TIF_SLICE_EXT_DENY))
				schedule();
		}

		if (ti_work & _TIF_UPROBE)
			uprobe_notify_resume(regs);

		if (ti_work & _TIF_PATCH_PENDING)
			klp_update_patch_state(current);

		if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
			arch_do_signal_or_restart(regs);

		if (ti_work & _TIF_NOTIFY_RESUME)
			resume_user_mode_work(regs);

		/* Architecture specific TIF work */
		arch_exit_to_user_mode_work(regs, ti_work);

		/*
		 * Disable interrupts and reevaluate the work flags as they
		 * might have changed while interrupts and preemption were
		 * enabled above.
		 */
		local_irq_disable_exit_to_user();

		/* Check if any of the above work has queued a deferred wakeup */
		tick_nohz_user_enter_prepare();

		ti_work = read_thread_flags();
	}

	/* Return the latest work state for arch_exit_to_user_mode() */
	return ti_work;
}

/**
 * exit_to_user_mode_loop - do any pending work before leaving to user space
 * @regs:	Pointer to pt_regs on entry stack
 * @ti_work:	TIF work flags as read by the caller
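 *
 * A typical caller (sketch, modelled on the generic exit-to-user-mode path)
 * reads the TIF flags with interrupts disabled and only drops into the loop
 * when work is pending:
 *
 *	ti_work = read_thread_flags();
 *	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
 *		ti_work = exit_to_user_mode_loop(regs, ti_work);
 *
 * Return: The latest TIF work flags for arch_exit_to_user_mode().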
 */
__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
						     unsigned long ti_work)
{
	for (;;) {
		ti_work = __exit_to_user_mode_loop(regs, ti_work);

		if (likely(!rseq_exit_to_user_mode_restart(regs, ti_work)))
			return ti_work;
		ti_work = read_thread_flags();
	}
}

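/*
 * irqentry_enter() establishes the interrupt/exception entry state. Entries
 * from user mode are handed to irqentry_enter_from_user_mode(); for kernel
 * mode entries it makes sure RCU is watching and that lockdep and tracing
 * observe the hardirqs-off state in the correct order. The returned state
 * tells irqentry_exit() whether ct_irq_exit() is required on the way out.
 */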
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
	irqentry_state_t ret = {
		.exit_rcu = false,
	};

	if (user_mode(regs)) {
		irqentry_enter_from_user_mode(regs);
		return ret;
	}

	/*
	 * If this entry hit the idle task, invoke ct_irq_enter() whether
	 * RCU is watching or not.
	 *
	 * Interrupts can nest when the first interrupt invokes softirq
	 * processing on return which enables interrupts.
	 *
	 * Scheduler ticks in the idle task can mark quiescent state and
	 * terminate a grace period if and only if the timer interrupt is
	 * not nested into another interrupt.
	 *
	 * Checking for rcu_is_watching() here would prevent the nesting
	 * interrupt from invoking ct_irq_enter(). If that nested interrupt is
	 * the tick then rcu_flavor_sched_clock_irq() would wrongfully
	 * assume that it is the first interrupt and eventually claim
	 * quiescent state and end grace periods prematurely.
	 *
	 * Unconditionally invoke ct_irq_enter() so RCU state stays
	 * consistent.
	 *
	 * TINY_RCU does not support EQS, so let the compiler eliminate
	 * this part when enabled.
	 */
	if (!IS_ENABLED(CONFIG_TINY_RCU) &&
	    (is_idle_task(current) || arch_in_rcu_eqs())) {
		/*
		 * If RCU is not watching then the same careful
		 * sequence vs. lockdep and tracing is required
		 * as in irqentry_enter_from_user_mode().
		 */
		lockdep_hardirqs_off(CALLER_ADDR0);
		ct_irq_enter();
		instrumentation_begin();
		kmsan_unpoison_entry_regs(regs);
		trace_hardirqs_off_finish();
		instrumentation_end();

		ret.exit_rcu = true;
		return ret;
	}

	/*
	 * If RCU is watching then RCU only wants to check whether it needs
	 * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
	 * already contains a warning when RCU is not watching, so no point
	 * in having another one here.
	 */
	lockdep_hardirqs_off(CALLER_ADDR0);
	instrumentation_begin();
	kmsan_unpoison_entry_regs(regs);
	rcu_irq_enter_check_tick();
	trace_hardirqs_off_finish();
	instrumentation_end();

	return ret;
}

/**
 * arch_irqentry_exit_need_resched - Architecture specific need resched function
 *
 * Invoked from raw_irqentry_exit_cond_resched() to check if resched is needed.
 * Defaults to return true.
 *
 * The main purpose is to permit an architecture to avoid preemption of a task
 * from an IRQ.
 */
static inline bool arch_irqentry_exit_need_resched(void);

#ifndef arch_irqentry_exit_need_resched
static inline bool arch_irqentry_exit_need_resched(void) { return true; }
#endif
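
/*
 * An architecture can opt out of IRQ-exit preemption by providing its own
 * arch_irqentry_exit_need_resched() and defining the name so that the
 * fallback above is not used. A minimal sketch, with a made-up condition:
 *
 *	static inline bool arch_irqentry_exit_need_resched(void)
 *	{
 *		return !arch_irq_exit_preemption_blocked();
 *	}
 *	#define arch_irqentry_exit_need_resched arch_irqentry_exit_need_resched
 *
 * arch_irq_exit_preemption_blocked() is a placeholder for whatever
 * arch-specific condition applies, not an existing helper.
 * raw_irqentry_exit_cond_resched() below consults this hook before calling
 * preempt_schedule_irq().
 */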

void raw_irqentry_exit_cond_resched(void)
{
	if (!preempt_count()) {
		/* Sanity check RCU and thread stack */
		rcu_irq_exit_check_preempt();
		if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
			WARN_ON_ONCE(!on_thread_stack());
		if (need_resched() && arch_irqentry_exit_need_resched())
			preempt_schedule_irq();
	}
}
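
/*
 * With CONFIG_PREEMPT_DYNAMIC the preemption point above can be patched in
 * and out at runtime: either through a static call which defaults to
 * raw_irqentry_exit_cond_resched(), or through a default-true static key
 * that dynamic_irqentry_exit_cond_resched() checks before calling it.
 */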
#ifdef CONFIG_PREEMPT_DYNAMIC
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
void dynamic_irqentry_exit_cond_resched(void)
{
	if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
		return;
	raw_irqentry_exit_cond_resched();
}
#endif
#endif

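/*
 * irqentry_exit() is the counterpart to irqentry_enter() and must be passed
 * the state which irqentry_enter() returned. Three cases are distinguished:
 * return to user mode, return to kernel mode with interrupts about to be
 * re-enabled (where preemption may happen), and return to kernel mode with
 * interrupts staying disabled.
 */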
noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
	lockdep_assert_irqs_disabled();

	/* Check whether this returns to user mode */
	if (user_mode(regs)) {
		irqentry_exit_to_user_mode(regs);
	} else if (!regs_irqs_disabled(regs)) {
		/*
		 * If RCU was not watching on entry this needs to be done
		 * carefully and needs the same ordering of lockdep/tracing
		 * and RCU as the return to user mode path.
		 */
		if (state.exit_rcu) {
			instrumentation_begin();
			/* Tell the tracer that IRET will enable interrupts */
			trace_hardirqs_on_prepare();
			lockdep_hardirqs_on_prepare();
			instrumentation_end();
			ct_irq_exit();
			lockdep_hardirqs_on(CALLER_ADDR0);
			return;
		}

		instrumentation_begin();
		if (IS_ENABLED(CONFIG_PREEMPTION))
			irqentry_exit_cond_resched();

		/* Covers both tracing and lockdep */
		trace_hardirqs_on();
		instrumentation_end();
	} else {
		/*
		 * IRQ flags state is correct already. Just tell RCU if it
		 * was not watching on entry.
		 */
		if (state.exit_rcu)
			ct_irq_exit();
	}
}

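/*
 * NMI entry: snapshot the lockdep hardirq state, enter NMI context and
 * switch context tracking/RCU to NMI mode before any instrumentable code
 * runs. The returned state is consumed by irqentry_nmi_exit().
 */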
irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
{
	irqentry_state_t irq_state;

	irq_state.lockdep = lockdep_hardirqs_enabled();

	__nmi_enter();
	lockdep_hardirqs_off(CALLER_ADDR0);
	lockdep_hardirq_enter();
	ct_nmi_enter();

	instrumentation_begin();
	kmsan_unpoison_entry_regs(regs);
	trace_hardirqs_off_finish();
	ftrace_nmi_enter();
	instrumentation_end();

	return irq_state;
}

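/*
 * NMI exit: unwind irqentry_nmi_enter() in reverse order and only restore
 * the lockdep hardirqs-enabled state if it was enabled when the NMI hit.
 */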
void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
{
	instrumentation_begin();
	ftrace_nmi_exit();
	if (irq_state.lockdep) {
		trace_hardirqs_on_prepare();
		lockdep_hardirqs_on_prepare();
	}
	instrumentation_end();

	ct_nmi_exit();
	lockdep_hardirq_exit();
	if (irq_state.lockdep)
		lockdep_hardirqs_on(CALLER_ADDR0);
	__nmi_exit();
}