xref: /linux/kernel/trace/trace_sched_wakeup.c (revision 2d6ffcca623a9a16df6cdfbe8250b7a5904a5f5e)
/*
 * trace task wakeup timings
 *
 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
 *
 * Based on code from the latency_tracer, that is:
 *
 *  Copyright (C) 2004-2006 Ingo Molnar
 *  Copyright (C) 2004 William Lee Irwin III
 */
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/debugfs.h>
#include <linux/kallsyms.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
#include <linux/marker.h>

#include "trace.h"

static struct trace_array	*wakeup_trace;
static int __read_mostly	tracer_enabled;

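/*
 * The task whose wakeup latency is currently being measured, along
 * with the cpu it was woken on and its priority at wakeup time.
 */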
static struct task_struct	*wakeup_task;
static int			wakeup_cpu;
static unsigned			wakeup_prio = -1;

static DEFINE_SPINLOCK(wakeup_lock);

static void __wakeup_reset(struct trace_array *tr);

#ifdef CONFIG_FTRACE
/*
 * wakeup uses its own tracer function to keep the overhead down:
 */
static void
wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
{
	struct trace_array *tr = wakeup_trace;
	struct trace_array_cpu *data;
	unsigned long flags;
	long disabled;
	int resched;
	int cpu;

	if (likely(!wakeup_task))
		return;

	resched = need_resched();
	preempt_disable_notrace();

	cpu = raw_smp_processor_id();
	data = tr->data[cpu];
	disabled = atomic_inc_return(&data->disabled);
	if (unlikely(disabled != 1))
		goto out;

	spin_lock_irqsave(&wakeup_lock, flags);

	if (unlikely(!wakeup_task))
		goto unlock;

	/*
	 * The task can't disappear because it needs to
	 * wake up first, and we have the wakeup_lock.
	 */
	if (task_cpu(wakeup_task) != cpu)
		goto unlock;

	trace_function(tr, data, ip, parent_ip, flags);

 unlock:
	spin_unlock_irqrestore(&wakeup_lock, flags);

 out:
	atomic_dec(&data->disabled);

	/*
	 * To prevent recursion from the scheduler, if the
	 * resched flag was set before we entered, then
	 * don't reschedule.
	 */
	if (resched)
		preempt_enable_no_resched_notrace();
	else
		preempt_enable_notrace();
}

static struct ftrace_ops trace_ops __read_mostly =
{
	.func = wakeup_tracer_call,
};
#endif /* CONFIG_FTRACE */

/*
 * Should this new latency be reported/recorded?
 */
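/*
 * With a non-zero tracing_thresh, any latency at or above the
 * threshold is recorded; otherwise only a new maximum latency is.
 */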
static int report_latency(cycle_t delta)
{
	if (tracing_thresh) {
		if (delta < tracing_thresh)
			return 0;
	} else {
		if (delta <= tracing_max_latency)
			return 0;
	}
	return 1;
}

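/*
 * Called via the kernel_sched_schedule marker on every context switch.
 * If the task being switched in is the one we are waiting for, record
 * the wakeup-to-schedule latency and, when it beats the current
 * maximum, save the snapshot with update_max_tr() before resetting
 * for the next wakeup.
 */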
static void notrace
wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
	struct task_struct *next)
{
	unsigned long latency = 0, t0 = 0, t1 = 0;
	struct trace_array **ptr = private;
	struct trace_array *tr = *ptr;
	struct trace_array_cpu *data;
	cycle_t T0, T1, delta;
	unsigned long flags;
	long disabled;
	int cpu;

	if (unlikely(!tracer_enabled))
		return;

	/*
	 * When we start a new trace, we set wakeup_task to NULL
	 * and then set tracer_enabled = 1. We want to make sure
	 * that another CPU does not see tracer_enabled = 1 paired
	 * with a stale wakeup_task, which might happen to be the
	 * same task as next.
	 */
	smp_rmb();

	if (next != wakeup_task)
		return;

	/* The task we were waiting for is being scheduled in */
	data = tr->data[wakeup_cpu];

	/* disable local data, not wakeup_cpu data */
	cpu = raw_smp_processor_id();
	disabled = atomic_inc_return(&tr->data[cpu]->disabled);
	if (likely(disabled != 1))
		goto out;

	spin_lock_irqsave(&wakeup_lock, flags);

	/* We may have raced with another CPU before grabbing wakeup_lock */
	if (unlikely(!tracer_enabled || next != wakeup_task))
		goto out_unlock;

	trace_function(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags);

	/*
	 * usecs conversion is slow so we try to delay the conversion
	 * as long as possible:
	 */
	T0 = data->preempt_timestamp;
	T1 = ftrace_now(cpu);
	delta = T1-T0;

	if (!report_latency(delta))
		goto out_unlock;

	latency = nsecs_to_usecs(delta);

	tracing_max_latency = delta;
	t0 = nsecs_to_usecs(T0);
	t1 = nsecs_to_usecs(T1);

	update_max_tr(tr, wakeup_task, wakeup_cpu);

out_unlock:
	__wakeup_reset(tr);
	spin_unlock_irqrestore(&wakeup_lock, flags);
out:
	atomic_dec(&tr->data[cpu]->disabled);
}

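/*
 * Marker probe for kernel_sched_schedule: pull the rq, prev and next
 * pointers out of the vararg list (skipping the leading pid/state
 * arguments) and hand them to wakeup_sched_switch().
 */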
static notrace void
sched_switch_callback(void *probe_data, void *call_data,
		      const char *format, va_list *args)
{
	struct task_struct *prev;
	struct task_struct *next;
	struct rq *__rq;

	/* skip prev_pid %d next_pid %d prev_state %ld */
	(void)va_arg(*args, int);
	(void)va_arg(*args, int);
	(void)va_arg(*args, long);
	__rq = va_arg(*args, typeof(__rq));
	prev = va_arg(*args, typeof(prev));
	next = va_arg(*args, typeof(next));

	tracing_record_cmdline(prev);

	/*
	 * Even though wakeup_sched_switch() is local, it still needs
	 * the trace_array pointer that was passed in as probe_data.
	 */
	wakeup_sched_switch(probe_data, __rq, prev, next);
}

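/*
 * Reset the per-cpu trace buffers and drop the reference on the task
 * we were tracing. Callers must hold wakeup_lock.
 */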
static void __wakeup_reset(struct trace_array *tr)
{
	struct trace_array_cpu *data;
	int cpu;

	assert_spin_locked(&wakeup_lock);

	for_each_possible_cpu(cpu) {
		data = tr->data[cpu];
		tracing_reset(data);
	}

	wakeup_cpu = -1;
	wakeup_prio = -1;

	if (wakeup_task)
		put_task_struct(wakeup_task);

	wakeup_task = NULL;
}

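/* Locked wrapper around __wakeup_reset() */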
static void wakeup_reset(struct trace_array *tr)
{
	unsigned long flags;

	spin_lock_irqsave(&wakeup_lock, flags);
	__wakeup_reset(tr);
	spin_unlock_irqrestore(&wakeup_lock, flags);
}

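/*
 * Start tracing a new wakeup if the woken task is an RT task with a
 * higher priority (lower prio value) than both the currently running
 * task and any wakeup we are already tracing.
 */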
static void
wakeup_check_start(struct trace_array *tr, struct task_struct *p,
		   struct task_struct *curr)
{
	int cpu = smp_processor_id();
	unsigned long flags;
	long disabled;

	if (likely(!rt_task(p)) ||
			p->prio >= wakeup_prio ||
			p->prio >= curr->prio)
		return;

	disabled = atomic_inc_return(&tr->data[cpu]->disabled);
	if (unlikely(disabled != 1))
		goto out;

	/* interrupts should be off from try_to_wake_up */
	spin_lock(&wakeup_lock);

	/* check for races. */
	if (!tracer_enabled || p->prio >= wakeup_prio)
		goto out_locked;

	/* reset the trace */
	__wakeup_reset(tr);

	wakeup_cpu = task_cpu(p);
	wakeup_prio = p->prio;

	wakeup_task = p;
	get_task_struct(wakeup_task);

	local_save_flags(flags);

	tr->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu);
	trace_function(tr, tr->data[wakeup_cpu],
		       CALLER_ADDR1, CALLER_ADDR2, flags);

out_locked:
	spin_unlock(&wakeup_lock);
out:
	atomic_dec(&tr->data[cpu]->disabled);
}

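/*
 * Marker probe for kernel_sched_wakeup and kernel_sched_wakeup_new:
 * pull the woken task and rq->curr out of the vararg list and let
 * wakeup_check_start() decide whether to begin a new trace.
 */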
static notrace void
wake_up_callback(void *probe_data, void *call_data,
		 const char *format, va_list *args)
{
	struct trace_array **ptr = probe_data;
	struct trace_array *tr = *ptr;
	struct task_struct *curr;
	struct task_struct *task;
	struct rq *__rq;

	if (likely(!tracer_enabled))
		return;

	/* Skip pid %d state %ld */
	(void)va_arg(*args, int);
	(void)va_arg(*args, long);
	/* now get the meat: "rq %p task %p rq->curr %p" */
	__rq = va_arg(*args, typeof(__rq));
	task = va_arg(*args, typeof(task));
	curr = va_arg(*args, typeof(curr));

	tracing_record_cmdline(task);
	tracing_record_cmdline(curr);

	wakeup_check_start(tr, task, curr);
}

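/*
 * Hook up the wakeup and sched_switch marker probes plus the function
 * tracer, then flip tracer_enabled. Probes that were already added are
 * unregistered again if a later registration fails.
 */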
static void start_wakeup_tracer(struct trace_array *tr)
{
	int ret;

	ret = marker_probe_register("kernel_sched_wakeup",
			"pid %d state %ld ## rq %p task %p rq->curr %p",
			wake_up_callback,
			&wakeup_trace);
	if (ret) {
		pr_info("wakeup trace: Couldn't add marker"
			" probe to kernel_sched_wakeup\n");
		return;
	}

	ret = marker_probe_register("kernel_sched_wakeup_new",
			"pid %d state %ld ## rq %p task %p rq->curr %p",
			wake_up_callback,
			&wakeup_trace);
	if (ret) {
		pr_info("wakeup trace: Couldn't add marker"
			" probe to kernel_sched_wakeup_new\n");
		goto fail_deprobe;
	}

	ret = marker_probe_register("kernel_sched_schedule",
		"prev_pid %d next_pid %d prev_state %ld "
		"## rq %p prev %p next %p",
		sched_switch_callback,
		&wakeup_trace);
	if (ret) {
		pr_info("sched trace: Couldn't add marker"
			" probe to kernel_sched_schedule\n");
		goto fail_deprobe_wake_new;
	}

	wakeup_reset(tr);

	/*
	 * Don't let the tracer_enabled = 1 show up before
	 * the wakeup_task is reset. This may be overkill since
	 * wakeup_reset does a spin_unlock after setting the
	 * wakeup_task to NULL, but I want to be safe.
	 * This is a slow path anyway.
	 */
	smp_wmb();

	register_ftrace_function(&trace_ops);

	tracer_enabled = 1;

	return;
fail_deprobe_wake_new:
	marker_probe_unregister("kernel_sched_wakeup_new",
				wake_up_callback,
				&wakeup_trace);
fail_deprobe:
	marker_probe_unregister("kernel_sched_wakeup",
				wake_up_callback,
				&wakeup_trace);
}

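/* Tear everything down in the reverse order of start_wakeup_tracer() */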
static void stop_wakeup_tracer(struct trace_array *tr)
{
	tracer_enabled = 0;
	unregister_ftrace_function(&trace_ops);
	marker_probe_unregister("kernel_sched_schedule",
				sched_switch_callback,
				&wakeup_trace);
	marker_probe_unregister("kernel_sched_wakeup_new",
				wake_up_callback,
				&wakeup_trace);
	marker_probe_unregister("kernel_sched_wakeup",
				wake_up_callback,
				&wakeup_trace);
}

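/*
 * Hooks wired into struct tracer below: start/stop tracing as the
 * tracer is selected, toggled via ctrl, or its trace file is opened
 * and closed for reading.
 */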
static void wakeup_tracer_init(struct trace_array *tr)
{
	wakeup_trace = tr;

	if (tr->ctrl)
		start_wakeup_tracer(tr);
}

static void wakeup_tracer_reset(struct trace_array *tr)
{
	if (tr->ctrl) {
		stop_wakeup_tracer(tr);
		/* make sure we put back any tasks we are tracing */
		wakeup_reset(tr);
	}
}

static void wakeup_tracer_ctrl_update(struct trace_array *tr)
{
	if (tr->ctrl)
		start_wakeup_tracer(tr);
	else
		stop_wakeup_tracer(tr);
}

static void wakeup_tracer_open(struct trace_iterator *iter)
{
	/* stop the trace while dumping */
	if (iter->tr->ctrl)
		stop_wakeup_tracer(iter->tr);
}

static void wakeup_tracer_close(struct trace_iterator *iter)
{
	/* forget about any processes we were recording */
	if (iter->tr->ctrl)
		start_wakeup_tracer(iter->tr);
}

static struct tracer wakeup_tracer __read_mostly =
{
	.name		= "wakeup",
	.init		= wakeup_tracer_init,
	.reset		= wakeup_tracer_reset,
	.open		= wakeup_tracer_open,
	.close		= wakeup_tracer_close,
	.ctrl_update	= wakeup_tracer_ctrl_update,
	.print_max	= 1,
#ifdef CONFIG_FTRACE_SELFTEST
	.selftest    = trace_selftest_startup_wakeup,
#endif
};

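/* Register this tracer with the tracing core at boot */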
__init static int init_wakeup_tracer(void)
{
	int ret;

	ret = register_tracer(&wakeup_tracer);
	if (ret)
		return ret;

	return 0;
}
device_initcall(init_wakeup_tracer);