xref: /linux/kernel/trace/trace_sched_switch.c (revision 69050f8d6d075dc01af7a5f2f550a8067510366f)
// SPDX-License-Identifier: GPL-2.0
/*
 * trace context switch
 *
 * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
 *
 */
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/uaccess.h>
#include <linux/kmemleak.h>
#include <linux/ftrace.h>
#include <trace/events/sched.h>

#include "trace.h"

#define RECORD_CMDLINE	1
#define RECORD_TGID	2

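/*
 * The cmdline and tgid recorders share the sched_switch and sched_wakeup
 * probes below. The reference counts track how many users have requested
 * each kind of recording, and sched_register_mutex serializes probe
 * registration against unregistration.
 */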
static int		sched_cmdline_ref;
static int		sched_tgid_ref;
static DEFINE_MUTEX(sched_register_mutex);

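/*
 * Tracepoint probe for sched_switch: record the pid->comm and/or
 * pid->tgid mappings of both tasks involved in the context switch,
 * depending on which recorders currently have users.
 */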
static void
probe_sched_switch(void *ignore, bool preempt,
		   struct task_struct *prev, struct task_struct *next,
		   unsigned int prev_state)
{
	int flags;

	flags = (RECORD_TGID * !!sched_tgid_ref) +
		(RECORD_CMDLINE * !!sched_cmdline_ref);

	if (!flags)
		return;
	tracing_record_taskinfo_sched_switch(prev, next, flags);
}

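/*
 * Tracepoint probe for sched_wakeup and sched_wakeup_new: record the
 * task info of both the waker (current) and the task being woken.
 */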
static void
probe_sched_wakeup(void *ignore, struct task_struct *wakee)
{
	int flags;

	flags = (RECORD_TGID * !!sched_tgid_ref) +
		(RECORD_CMDLINE * !!sched_cmdline_ref);

	if (!flags)
		return;
	tracing_record_taskinfo_sched_switch(current, wakee, flags);
}

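/*
 * Attach the probes to the sched_wakeup, sched_wakeup_new and
 * sched_switch tracepoints. On failure, unwind whatever was already
 * registered so the probes are either all active or all inactive.
 */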
static int tracing_sched_register(void)
{
	int ret;

	ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL);
	if (ret) {
		pr_info("wakeup trace: Couldn't activate tracepoint probe to kernel_sched_wakeup\n");
		return ret;
	}

	ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
	if (ret) {
		pr_info("wakeup trace: Couldn't activate tracepoint probe to kernel_sched_wakeup_new\n");
		goto fail_deprobe;
	}

	ret = register_trace_sched_switch(probe_sched_switch, NULL);
	if (ret) {
		pr_info("sched trace: Couldn't activate tracepoint probe to kernel_sched_switch\n");
		goto fail_deprobe_wake_new;
	}

	return ret;
fail_deprobe_wake_new:
	unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
fail_deprobe:
	unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
	return ret;
}

static void tracing_sched_unregister(void)
{
	unregister_trace_sched_switch(probe_sched_switch, NULL);
	unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
	unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
}

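/*
 * Take a reference on the requested recorder. The tracepoint probes are
 * registered only when the very first user (of either kind) shows up.
 */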
static void tracing_start_sched_switch(int ops)
{
	bool sched_register;

	mutex_lock(&sched_register_mutex);
	sched_register = (!sched_cmdline_ref && !sched_tgid_ref);

	switch (ops) {
	case RECORD_CMDLINE:
		sched_cmdline_ref++;
		break;

	case RECORD_TGID:
		sched_tgid_ref++;
		break;
	}

	if (sched_register && (sched_cmdline_ref || sched_tgid_ref))
		tracing_sched_register();
	mutex_unlock(&sched_register_mutex);
}

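/*
 * Drop a reference on the requested recorder and unregister the
 * tracepoint probes once the last user (of either kind) goes away.
 */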
static void tracing_stop_sched_switch(int ops)
{
	mutex_lock(&sched_register_mutex);

	switch (ops) {
	case RECORD_CMDLINE:
		sched_cmdline_ref--;
		break;

	case RECORD_TGID:
		sched_tgid_ref--;
		break;
	}

	if (!sched_cmdline_ref && !sched_tgid_ref)
		tracing_sched_unregister();
	mutex_unlock(&sched_register_mutex);
}

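/*
 * Helpers for the tracing core to start and stop comm (cmdline) and tgid
 * recording, e.g. when the corresponding trace options or events are
 * enabled and disabled.
 */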
void tracing_start_cmdline_record(void)
{
	tracing_start_sched_switch(RECORD_CMDLINE);
}

void tracing_stop_cmdline_record(void)
{
	tracing_stop_sched_switch(RECORD_CMDLINE);
}

void tracing_start_tgid_record(void)
{
	tracing_start_sched_switch(RECORD_TGID);
}

void tracing_stop_tgid_record(void)
{
	tracing_stop_sched_switch(RECORD_TGID);
}

/*
 * The tgid_map array maps from pid to tgid; i.e. the value stored at index i
 * is the tgid last observed corresponding to pid=i.
 */
static int *tgid_map;

/* The maximum valid index into tgid_map. */
static size_t tgid_map_max;

#define SAVED_CMDLINES_DEFAULT 128
#define NO_CMDLINE_MAP UINT_MAX
/*
 * Preemption must be disabled before acquiring trace_cmdline_lock.
 * The various trace_arrays' max_lock must be acquired in a context
 * where interrupts are disabled.
 */
static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
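/*
 * Saved cmdlines are kept in a fixed number of slots, reused in FIFO
 * order once all slots are taken. map_pid_to_cmdline maps a (masked)
 * pid to its slot, map_cmdline_to_pid maps a slot back to the pid that
 * owns it (so stale mappings can be detected), and saved_cmdlines holds
 * the TASK_COMM_LEN comm strings themselves.
 */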
struct saved_cmdlines_buffer {
	unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
	unsigned *map_cmdline_to_pid;
	unsigned cmdline_num;
	int cmdline_idx;
	char saved_cmdlines[];
};
static struct saved_cmdlines_buffer *savedcmd;

/* Holds the size of a cmdline and pid element */
#define SAVED_CMDLINE_MAP_ELEMENT_SIZE(s)			\
	(TASK_COMM_LEN + sizeof((s)->map_cmdline_to_pid[0]))

static inline char *get_saved_cmdlines(int idx)
{
	return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN];
}

static inline void set_cmdline(int idx, const char *cmdline)
{
	strscpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
}

static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
{
	int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN);

	kmemleak_free(s);
	free_pages((unsigned long)s, order);
}

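/*
 * The buffer is a single page-order allocation: the struct header,
 * followed by the saved_cmdlines strings, followed by the
 * map_cmdline_to_pid array. Since the allocation is rounded up to a
 * power of two pages, the number of cmdline slots is recomputed to use
 * the whole allocation.
 */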
static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val)
{
	struct saved_cmdlines_buffer *s;
	struct page *page;
	int orig_size, size;
	int order;

	/* Figure out how much is needed to hold the given number of cmdlines */
	orig_size = sizeof(*s) + val * SAVED_CMDLINE_MAP_ELEMENT_SIZE(s);
	order = get_order(orig_size);
	size = 1 << (order + PAGE_SHIFT);
	page = alloc_pages(GFP_KERNEL, order);
	if (!page)
		return NULL;

	s = page_address(page);
	kmemleak_alloc(s, size, 1, GFP_KERNEL);
	memset(s, 0, sizeof(*s));

	/* Round up to actual allocation */
	val = (size - sizeof(*s)) / SAVED_CMDLINE_MAP_ELEMENT_SIZE(s);
	s->cmdline_num = val;

	/* Place map_cmdline_to_pid array right after saved_cmdlines */
	s->map_cmdline_to_pid = (unsigned *)&s->saved_cmdlines[val * TASK_COMM_LEN];

	memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
	       sizeof(s->map_pid_to_cmdline));
	memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
	       val * sizeof(*s->map_cmdline_to_pid));

	return s;
}

int trace_create_savedcmd(void)
{
	savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT);

	return savedcmd ? 0 : -ENOMEM;
}

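/*
 * Record the pid->comm mapping of @tsk. Returns 1 on success (including
 * the idle task, which is never recorded) and 0 if the mapping could not
 * be recorded because trace_cmdline_lock was contended.
 */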
int trace_save_cmdline(struct task_struct *tsk)
{
	unsigned tpid, idx;

	/* treat recording of idle task as a success */
	if (!tsk->pid)
		return 1;

	BUILD_BUG_ON(!is_power_of_2(PID_MAX_DEFAULT));

	tpid = tsk->pid & (PID_MAX_DEFAULT - 1);

	/*
	 * It's not the end of the world if we don't get
	 * the lock, but we also don't want to spin
	 * nor do we want to disable interrupts,
	 * so if we miss here, then better luck next time.
	 *
	 * This is called within the scheduler and wakeup paths, so
	 * interrupts had better be disabled and the run queue lock held.
	 */
	lockdep_assert_preemption_disabled();
	if (!arch_spin_trylock(&trace_cmdline_lock))
		return 0;

	idx = savedcmd->map_pid_to_cmdline[tpid];
	if (idx == NO_CMDLINE_MAP) {
		idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num;

		savedcmd->map_pid_to_cmdline[tpid] = idx;
		savedcmd->cmdline_idx = idx;
	}

	savedcmd->map_cmdline_to_pid[idx] = tsk->pid;
	set_cmdline(idx, tsk->comm);

	arch_spin_unlock(&trace_cmdline_lock);

	return 1;
}

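/*
 * Resolve a pid to its saved comm. Because map_pid_to_cmdline is indexed
 * by the masked pid, map_cmdline_to_pid is checked to make sure the slot
 * still belongs to this pid; otherwise "<...>" is returned. Callers must
 * hold trace_cmdline_lock.
 */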
static void __trace_find_cmdline(int pid, char comm[])
{
	unsigned map;
	int tpid;

	if (!pid) {
		strcpy(comm, "<idle>");
		return;
	}

	if (WARN_ON_ONCE(pid < 0)) {
		strcpy(comm, "<XXX>");
		return;
	}

	tpid = pid & (PID_MAX_DEFAULT - 1);
	map = savedcmd->map_pid_to_cmdline[tpid];
	if (map != NO_CMDLINE_MAP) {
		tpid = savedcmd->map_cmdline_to_pid[map];
		if (tpid == pid) {
			strscpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN);
			return;
		}
	}
	strcpy(comm, "<...>");
}

void trace_find_cmdline(int pid, char comm[])
{
	preempt_disable();
	arch_spin_lock(&trace_cmdline_lock);

	__trace_find_cmdline(pid, comm);

	arch_spin_unlock(&trace_cmdline_lock);
	preempt_enable();
}

static int *trace_find_tgid_ptr(int pid)
{
	/*
	 * Pairs with the smp_store_release in set_tracer_flag() to ensure that
	 * if we observe a non-NULL tgid_map then we also observe the correct
	 * tgid_map_max.
	 */
	int *map = smp_load_acquire(&tgid_map);

	if (unlikely(!map || pid > tgid_map_max))
		return NULL;

	return &map[pid];
}

int trace_find_tgid(int pid)
{
	int *ptr = trace_find_tgid_ptr(pid);

	return ptr ? *ptr : 0;
}

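/*
 * Record the pid->tgid mapping of @tsk in tgid_map. Returns 0 if the map
 * is not allocated or the pid is out of range, so the caller can retry.
 */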
static int trace_save_tgid(struct task_struct *tsk)
{
	int *ptr;

	/* treat recording of idle task as a success */
	if (!tsk->pid)
		return 1;

	ptr = trace_find_tgid_ptr(tsk->pid);
	if (!ptr)
		return 0;

	*ptr = tsk->tgid;
	return 1;
}

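/*
 * Nothing needs to be recorded unless one of the record flags is set and
 * the per-cpu trace_taskinfo_save flag indicates new task info is pending.
 */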
static bool tracing_record_taskinfo_skip(int flags)
{
	if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID))))
		return true;
	if (!__this_cpu_read(trace_taskinfo_save))
		return true;
	return false;
}

/**
 * tracing_record_taskinfo - record the task info of a task
 *
 * @task:  task to record
 * @flags: TRACE_RECORD_CMDLINE for recording comm
 *         TRACE_RECORD_TGID for recording tgid
 */
void tracing_record_taskinfo(struct task_struct *task, int flags)
{
	bool done;

	if (tracing_record_taskinfo_skip(flags))
		return;

	/*
	 * Record as much task information as possible. If some fail, continue
	 * to try to record the others.
	 */
	done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task);
	done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task);

	/* If recording any information failed, retry again soon. */
	if (!done)
		return;

	__this_cpu_write(trace_taskinfo_save, false);
}

/**
 * tracing_record_taskinfo_sched_switch - record task info for sched_switch
 *
 * @prev: previous task during sched_switch
 * @next: next task during sched_switch
 * @flags: TRACE_RECORD_CMDLINE for recording comm
 *         TRACE_RECORD_TGID for recording tgid
 */
void tracing_record_taskinfo_sched_switch(struct task_struct *prev,
					  struct task_struct *next, int flags)
{
	bool done;

	if (tracing_record_taskinfo_skip(flags))
		return;

	/*
	 * Record as much task information as possible. If some fail, continue
	 * to try to record the others.
	 */
	done  = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev);
	done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next);
	done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev);
	done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next);

	/* If recording any information failed, retry again soon. */
	if (!done)
		return;

	__this_cpu_write(trace_taskinfo_save, false);
}

/* Helpers to record a specific task information */
void tracing_record_cmdline(struct task_struct *task)
{
	tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE);
}

void tracing_record_tgid(struct task_struct *task)
{
	tracing_record_taskinfo(task, TRACE_RECORD_TGID);
}

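/*
 * Allocate tgid_map with one entry per possible pid, sized by the
 * initial pid namespace's pid_max. Called when tgid recording is first
 * enabled (see the pairing note in trace_find_tgid_ptr()).
 */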
int trace_alloc_tgid_map(void)
{
	int *map;

	if (tgid_map)
		return 0;

	tgid_map_max = init_pid_ns.pid_max;
	map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map), GFP_KERNEL);
	if (!map)
		return -ENOMEM;

	/*
	 * Pairs with smp_load_acquire() in
	 * trace_find_tgid_ptr() to ensure that if it observes
	 * the tgid_map we just allocated then it also observes
	 * the corresponding tgid_map_max value.
	 */
	smp_store_release(&tgid_map, map);
	return 0;
}

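/*
 * seq_file iterator for the saved_tgids file: the position is used
 * directly as a pid, so iteration ends once the pid runs past
 * tgid_map_max and trace_find_tgid_ptr() returns NULL.
 */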
static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos)
{
	int pid = ++(*pos);

	return trace_find_tgid_ptr(pid);
}

static void *saved_tgids_start(struct seq_file *m, loff_t *pos)
{
	int pid = *pos;

	return trace_find_tgid_ptr(pid);
}

static void saved_tgids_stop(struct seq_file *m, void *v)
{
}

static int saved_tgids_show(struct seq_file *m, void *v)
{
	int *entry = (int *)v;
	int pid = entry - tgid_map;
	int tgid = *entry;

	if (tgid == 0)
		return SEQ_SKIP;

	seq_printf(m, "%d %d\n", pid, tgid);
	return 0;
}

static const struct seq_operations tracing_saved_tgids_seq_ops = {
	.start		= saved_tgids_start,
	.stop		= saved_tgids_stop,
	.next		= saved_tgids_next,
	.show		= saved_tgids_show,
};

static int tracing_saved_tgids_open(struct inode *inode, struct file *filp)
{
	int ret;

	ret = tracing_check_open_get_tr(NULL);
	if (ret)
		return ret;

	return seq_open(filp, &tracing_saved_tgids_seq_ops);
}

const struct file_operations tracing_saved_tgids_fops = {
	.open		= tracing_saved_tgids_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

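/*
 * seq_file iterator for the saved_cmdlines file: walk the
 * map_cmdline_to_pid array, skipping slots that never held a pid, while
 * holding trace_cmdline_lock across the whole traversal.
 */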
static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos)
{
	unsigned int *ptr = v;

	if (*pos || m->count)
		ptr++;

	(*pos)++;

	for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num];
	     ptr++) {
		if (*ptr == -1 || *ptr == NO_CMDLINE_MAP)
			continue;

		return ptr;
	}

	return NULL;
}

static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos)
{
	void *v;
	loff_t l = 0;

	preempt_disable();
	arch_spin_lock(&trace_cmdline_lock);

	v = &savedcmd->map_cmdline_to_pid[0];
	while (l <= *pos) {
		v = saved_cmdlines_next(m, v, &l);
		if (!v)
			return NULL;
	}

	return v;
}

static void saved_cmdlines_stop(struct seq_file *m, void *v)
{
	arch_spin_unlock(&trace_cmdline_lock);
	preempt_enable();
}

static int saved_cmdlines_show(struct seq_file *m, void *v)
{
	char buf[TASK_COMM_LEN];
	unsigned int *pid = v;

	__trace_find_cmdline(*pid, buf);
	seq_printf(m, "%d %s\n", *pid, buf);
	return 0;
}

static const struct seq_operations tracing_saved_cmdlines_seq_ops = {
	.start		= saved_cmdlines_start,
	.next		= saved_cmdlines_next,
	.stop		= saved_cmdlines_stop,
	.show		= saved_cmdlines_show,
};

static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp)
{
	int ret;

	ret = tracing_check_open_get_tr(NULL);
	if (ret)
		return ret;

	return seq_open(filp, &tracing_saved_cmdlines_seq_ops);
}

const struct file_operations tracing_saved_cmdlines_fops = {
	.open		= tracing_saved_cmdlines_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

static ssize_t
tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
				 size_t cnt, loff_t *ppos)
{
	char buf[64];
	int r;

	preempt_disable();
	arch_spin_lock(&trace_cmdline_lock);
	r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num);
	arch_spin_unlock(&trace_cmdline_lock);
	preempt_enable();

	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}

void trace_free_saved_cmdlines_buffer(void)
{
	free_saved_cmdlines_buffer(savedcmd);
}

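/*
 * Replace the saved cmdlines buffer with one holding @val entries. The
 * old buffer is swapped out under trace_cmdline_lock and freed outside
 * it; previously recorded mappings are lost on resize.
 */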
static int tracing_resize_saved_cmdlines(unsigned int val)
{
	struct saved_cmdlines_buffer *s, *savedcmd_temp;

	s = allocate_cmdlines_buffer(val);
	if (!s)
		return -ENOMEM;

	preempt_disable();
	arch_spin_lock(&trace_cmdline_lock);
	savedcmd_temp = savedcmd;
	savedcmd = s;
	arch_spin_unlock(&trace_cmdline_lock);
	preempt_enable();
	free_saved_cmdlines_buffer(savedcmd_temp);

	return 0;
}

static ssize_t
tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf,
				  size_t cnt, loff_t *ppos)
{
	unsigned long val;
	int ret;

	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
	if (ret)
		return ret;

	/* must have at least 1 entry and at most PID_MAX_DEFAULT entries */
	if (!val || val > PID_MAX_DEFAULT)
		return -EINVAL;

	ret = tracing_resize_saved_cmdlines((unsigned int)val);
	if (ret < 0)
		return ret;

	*ppos += cnt;

	return cnt;
}

const struct file_operations tracing_saved_cmdlines_size_fops = {
	.open		= tracing_open_generic,
	.read		= tracing_saved_cmdlines_size_read,
	.write		= tracing_saved_cmdlines_size_write,
};