1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * trace context switch
4 *
5 * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
6 *
7 */
8 #include <linux/module.h>
9 #include <linux/kallsyms.h>
10 #include <linux/uaccess.h>
11 #include <linux/kmemleak.h>
12 #include <linux/ftrace.h>
13 #include <trace/events/sched.h>
14
15 #include "trace.h"
16
/* Bit flags describing what taskinfo the probes should record */
#define RECORD_CMDLINE 1
#define RECORD_TGID 2

/*
 * Reference counts of users that want comm (cmdline) respectively tgid
 * recording.  sched_register_mutex serializes the probe registration /
 * unregistration done on the 0<->1 transitions of these counts.
 */
static int sched_cmdline_ref;
static int sched_tgid_ref;
static DEFINE_MUTEX(sched_register_mutex);
23
24 static void
probe_sched_switch(void * ignore,bool preempt,struct task_struct * prev,struct task_struct * next,unsigned int prev_state)25 probe_sched_switch(void *ignore, bool preempt,
26 struct task_struct *prev, struct task_struct *next,
27 unsigned int prev_state)
28 {
29 int flags;
30
31 flags = (RECORD_TGID * !!sched_tgid_ref) +
32 (RECORD_CMDLINE * !!sched_cmdline_ref);
33
34 if (!flags)
35 return;
36 tracing_record_taskinfo_sched_switch(prev, next, flags);
37 }
38
39 static void
probe_sched_wakeup(void * ignore,struct task_struct * wakee)40 probe_sched_wakeup(void *ignore, struct task_struct *wakee)
41 {
42 int flags;
43
44 flags = (RECORD_TGID * !!sched_tgid_ref) +
45 (RECORD_CMDLINE * !!sched_cmdline_ref);
46
47 if (!flags)
48 return;
49 tracing_record_taskinfo_sched_switch(current, wakee, flags);
50 }
51
tracing_sched_register(void)52 static int tracing_sched_register(void)
53 {
54 int ret;
55
56 ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL);
57 if (ret) {
58 pr_info("wakeup trace: Couldn't activate tracepoint"
59 " probe to kernel_sched_wakeup\n");
60 return ret;
61 }
62
63 ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
64 if (ret) {
65 pr_info("wakeup trace: Couldn't activate tracepoint"
66 " probe to kernel_sched_wakeup_new\n");
67 goto fail_deprobe;
68 }
69
70 ret = register_trace_sched_switch(probe_sched_switch, NULL);
71 if (ret) {
72 pr_info("sched trace: Couldn't activate tracepoint"
73 " probe to kernel_sched_switch\n");
74 goto fail_deprobe_wake_new;
75 }
76
77 return ret;
78 fail_deprobe_wake_new:
79 unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
80 fail_deprobe:
81 unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
82 return ret;
83 }
84
/*
 * Detach all three sched probes, in the reverse of the order in which
 * tracing_sched_register() attached them.
 */
static void tracing_sched_unregister(void)
{
	unregister_trace_sched_switch(probe_sched_switch, NULL);
	unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
	unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
}
91
tracing_start_sched_switch(int ops)92 static void tracing_start_sched_switch(int ops)
93 {
94 bool sched_register;
95
96 mutex_lock(&sched_register_mutex);
97 sched_register = (!sched_cmdline_ref && !sched_tgid_ref);
98
99 switch (ops) {
100 case RECORD_CMDLINE:
101 sched_cmdline_ref++;
102 break;
103
104 case RECORD_TGID:
105 sched_tgid_ref++;
106 break;
107 }
108
109 if (sched_register && (sched_cmdline_ref || sched_tgid_ref))
110 tracing_sched_register();
111 mutex_unlock(&sched_register_mutex);
112 }
113
tracing_stop_sched_switch(int ops)114 static void tracing_stop_sched_switch(int ops)
115 {
116 mutex_lock(&sched_register_mutex);
117
118 switch (ops) {
119 case RECORD_CMDLINE:
120 sched_cmdline_ref--;
121 break;
122
123 case RECORD_TGID:
124 sched_tgid_ref--;
125 break;
126 }
127
128 if (!sched_cmdline_ref && !sched_tgid_ref)
129 tracing_sched_unregister();
130 mutex_unlock(&sched_register_mutex);
131 }
132
/* Take a reference on cmdline recording, registering probes if needed. */
void tracing_start_cmdline_record(void)
{
	tracing_start_sched_switch(RECORD_CMDLINE);
}
137
/* Drop a reference on cmdline recording, unregistering probes if last. */
void tracing_stop_cmdline_record(void)
{
	tracing_stop_sched_switch(RECORD_CMDLINE);
}
142
/* Take a reference on tgid recording, registering probes if needed. */
void tracing_start_tgid_record(void)
{
	tracing_start_sched_switch(RECORD_TGID);
}
147
/* Drop a reference on tgid recording, unregistering probes if last. */
void tracing_stop_tgid_record(void)
{
	tracing_stop_sched_switch(RECORD_TGID);
}
152
/*
 * The tgid_map array maps from pid to tgid; i.e. the value stored at index i
 * is the tgid last observed corresponding to pid=i.
 */
static int *tgid_map;

/* The maximum valid index into tgid_map. */
static size_t tgid_map_max;

#define SAVED_CMDLINES_DEFAULT 128
/* Sentinel marking an unused map entry (UINT_MAX, stored as unsigned). */
#define NO_CMDLINE_MAP UINT_MAX
/*
 * Preemption must be disabled before acquiring trace_cmdline_lock.
 * The various trace_arrays' max_lock must be acquired in a context
 * where interrupt is disabled.
 */
static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
/*
 * Storage for the pid <-> saved comm mapping.  The struct, the comm
 * strings and the map_cmdline_to_pid array all live in one page-order
 * allocation (see allocate_cmdlines_buffer()): map_cmdline_to_pid points
 * right past the flexible saved_cmdlines array.
 */
struct saved_cmdlines_buffer {
	unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];	/* pid -> slot index */
	unsigned *map_cmdline_to_pid;	/* slot index -> pid */
	unsigned cmdline_num;		/* number of comm slots */
	int cmdline_idx;		/* most recently assigned slot */
	char saved_cmdlines[];		/* cmdline_num * TASK_COMM_LEN bytes */
};
static struct saved_cmdlines_buffer *savedcmd;

/* Holds the size of a cmdline and pid element */
#define SAVED_CMDLINE_MAP_ELEMENT_SIZE(s)			\
	(TASK_COMM_LEN + sizeof((s)->map_cmdline_to_pid[0]))
182
get_saved_cmdlines(int idx)183 static inline char *get_saved_cmdlines(int idx)
184 {
185 return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN];
186 }
187
set_cmdline(int idx,const char * cmdline)188 static inline void set_cmdline(int idx, const char *cmdline)
189 {
190 strscpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
191 }
192
/*
 * Free a buffer returned by allocate_cmdlines_buffer().
 *
 * NOTE(review): the size recomputed here uses plain TASK_COMM_LEN per
 * entry, omitting the map_cmdline_to_pid portion that the allocation
 * counted via SAVED_CMDLINE_MAP_ELEMENT_SIZE().  Since cmdline_num was
 * rounded up to fill the allocation and the comm part dominates each
 * element, get_order() appears to land on the same order either way —
 * confirm before relying on or changing this.
 */
static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
{
	int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN);

	/* Unregister from kmemleak before the pages go back. */
	kmemleak_free(s);
	free_pages((unsigned long)s, order);
}
200
/*
 * Allocate a saved_cmdlines_buffer holding at least @val cmdlines.
 *
 * The struct, the comm strings and the map_cmdline_to_pid array share a
 * single power-of-two page allocation; the slot count is then grown to
 * consume the rounding slack.  Returns NULL on allocation failure.
 */
static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val)
{
	struct saved_cmdlines_buffer *s;
	struct page *page;
	int orig_size, size;
	int order;

	/* Figure out how much is needed to hold the given number of cmdlines */
	orig_size = sizeof(*s) + val * SAVED_CMDLINE_MAP_ELEMENT_SIZE(s);
	order = get_order(orig_size);
	size = 1 << (order + PAGE_SHIFT);
	page = alloc_pages(GFP_KERNEL, order);
	if (!page)
		return NULL;

	s = page_address(page);
	/* alloc_pages() memory is invisible to kmemleak; register it. */
	kmemleak_alloc(s, size, 1, GFP_KERNEL);
	memset(s, 0, sizeof(*s));

	/* Round up to actual allocation */
	val = (size - sizeof(*s)) / SAVED_CMDLINE_MAP_ELEMENT_SIZE(s);
	s->cmdline_num = val;

	/* Place map_cmdline_to_pid array right after saved_cmdlines */
	s->map_cmdline_to_pid = (unsigned *)&s->saved_cmdlines[val * TASK_COMM_LEN];

	/* Filling with 0xff bytes sets every unsigned entry to NO_CMDLINE_MAP. */
	memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
	       sizeof(s->map_pid_to_cmdline));
	memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
	       val * sizeof(*s->map_cmdline_to_pid));

	return s;
}
234
trace_create_savedcmd(void)235 int trace_create_savedcmd(void)
236 {
237 savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT);
238
239 return savedcmd ? 0 : -ENOMEM;
240 }
241
/*
 * Save @tsk's comm in the saved_cmdlines buffer, keyed by its pid.
 *
 * Returns 1 on success (the idle task is treated as success without
 * storing anything) and 0 when the cmdline lock could not be taken,
 * in which case the caller is expected to retry later.
 */
int trace_save_cmdline(struct task_struct *tsk)
{
	unsigned tpid, idx;

	/* treat recording of idle task as a success */
	if (!tsk->pid)
		return 1;

	BUILD_BUG_ON(!is_power_of_2(PID_MAX_DEFAULT));

	/* Fold the pid into the power-of-two index range of the table. */
	tpid = tsk->pid & (PID_MAX_DEFAULT - 1);

	/*
	 * It's not the end of the world if we don't get
	 * the lock, but we also don't want to spin
	 * nor do we want to disable interrupts,
	 * so if we miss here, then better luck next time.
	 *
	 * This is called within the scheduler and wake up, so interrupts
	 * had better been disabled and run queue lock been held.
	 */
	lockdep_assert_preemption_disabled();
	if (!arch_spin_trylock(&trace_cmdline_lock))
		return 0;

	idx = savedcmd->map_pid_to_cmdline[tpid];
	if (idx == NO_CMDLINE_MAP) {
		/* No slot for this pid yet: take the next one round-robin. */
		idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num;

		savedcmd->map_pid_to_cmdline[tpid] = idx;
		savedcmd->cmdline_idx = idx;
	}

	savedcmd->map_cmdline_to_pid[idx] = tsk->pid;
	set_cmdline(idx, tsk->comm);

	arch_spin_unlock(&trace_cmdline_lock);

	return 1;
}
282
__trace_find_cmdline(int pid,char comm[])283 static void __trace_find_cmdline(int pid, char comm[])
284 {
285 unsigned map;
286 int tpid;
287
288 if (!pid) {
289 strcpy(comm, "<idle>");
290 return;
291 }
292
293 if (WARN_ON_ONCE(pid < 0)) {
294 strcpy(comm, "<XXX>");
295 return;
296 }
297
298 tpid = pid & (PID_MAX_DEFAULT - 1);
299 map = savedcmd->map_pid_to_cmdline[tpid];
300 if (map != NO_CMDLINE_MAP) {
301 tpid = savedcmd->map_cmdline_to_pid[map];
302 if (tpid == pid) {
303 strscpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN);
304 return;
305 }
306 }
307 strcpy(comm, "<...>");
308 }
309
/*
 * trace_find_cmdline - copy the comm last saved for @pid into @comm
 *
 * Takes trace_cmdline_lock (with preemption disabled first, as the lock
 * requires) around the lookup.  @comm must hold TASK_COMM_LEN bytes.
 */
void trace_find_cmdline(int pid, char comm[])
{
	preempt_disable();
	arch_spin_lock(&trace_cmdline_lock);

	__trace_find_cmdline(pid, comm);

	arch_spin_unlock(&trace_cmdline_lock);
	preempt_enable();
}
320
/*
 * Return a pointer to the tgid_map slot for @pid, or NULL when the map
 * is not allocated or @pid is out of range.  A negative @pid also yields
 * NULL: it converts to a huge value in the unsigned comparison against
 * the size_t tgid_map_max.
 */
static int *trace_find_tgid_ptr(int pid)
{
	/*
	 * Pairs with the smp_store_release in set_tracer_flag() (and in
	 * trace_alloc_tgid_map() below) to ensure that
	 * if we observe a non-NULL tgid_map then we also observe the correct
	 * tgid_map_max.
	 */
	int *map = smp_load_acquire(&tgid_map);

	if (unlikely(!map || pid > tgid_map_max))
		return NULL;

	return &map[pid];
}
335
/* Return the tgid last recorded for @pid, or 0 if none is known. */
int trace_find_tgid(int pid)
{
	int *slot = trace_find_tgid_ptr(pid);

	if (!slot)
		return 0;
	return *slot;
}
342
trace_save_tgid(struct task_struct * tsk)343 static int trace_save_tgid(struct task_struct *tsk)
344 {
345 int *ptr;
346
347 /* treat recording of idle task as a success */
348 if (!tsk->pid)
349 return 1;
350
351 ptr = trace_find_tgid_ptr(tsk->pid);
352 if (!ptr)
353 return 0;
354
355 *ptr = tsk->tgid;
356 return 1;
357 }
358
tracing_record_taskinfo_skip(int flags)359 static bool tracing_record_taskinfo_skip(int flags)
360 {
361 if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID))))
362 return true;
363 if (!__this_cpu_read(trace_taskinfo_save))
364 return true;
365 return false;
366 }
367
368 /**
369 * tracing_record_taskinfo - record the task info of a task
370 *
371 * @task: task to record
372 * @flags: TRACE_RECORD_CMDLINE for recording comm
373 * TRACE_RECORD_TGID for recording tgid
374 */
tracing_record_taskinfo(struct task_struct * task,int flags)375 void tracing_record_taskinfo(struct task_struct *task, int flags)
376 {
377 bool done;
378
379 if (tracing_record_taskinfo_skip(flags))
380 return;
381
382 /*
383 * Record as much task information as possible. If some fail, continue
384 * to try to record the others.
385 */
386 done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task);
387 done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task);
388
389 /* If recording any information failed, retry again soon. */
390 if (!done)
391 return;
392
393 __this_cpu_write(trace_taskinfo_save, false);
394 }
395
396 /**
397 * tracing_record_taskinfo_sched_switch - record task info for sched_switch
398 *
399 * @prev: previous task during sched_switch
400 * @next: next task during sched_switch
401 * @flags: TRACE_RECORD_CMDLINE for recording comm
402 * TRACE_RECORD_TGID for recording tgid
403 */
tracing_record_taskinfo_sched_switch(struct task_struct * prev,struct task_struct * next,int flags)404 void tracing_record_taskinfo_sched_switch(struct task_struct *prev,
405 struct task_struct *next, int flags)
406 {
407 bool done;
408
409 if (tracing_record_taskinfo_skip(flags))
410 return;
411
412 /*
413 * Record as much task information as possible. If some fail, continue
414 * to try to record the others.
415 */
416 done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev);
417 done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next);
418 done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev);
419 done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next);
420
421 /* If recording any information failed, retry again soon. */
422 if (!done)
423 return;
424
425 __this_cpu_write(trace_taskinfo_save, false);
426 }
427
/* Helpers to record a specific task information */

/* Record only the comm of @task. */
void tracing_record_cmdline(struct task_struct *task)
{
	tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE);
}
433
/* Record only the tgid of @task. */
void tracing_record_tgid(struct task_struct *task)
{
	tracing_record_taskinfo(task, TRACE_RECORD_TGID);
}
438
/*
 * Allocate the pid -> tgid map, sized by the init pid namespace's
 * pid_max.  Idempotent: returns 0 immediately if the map already exists.
 * Returns -ENOMEM when the allocation fails.
 */
int trace_alloc_tgid_map(void)
{
	int *map;

	if (tgid_map)
		return 0;

	/* Valid indices are 0..pid_max inclusive, hence the +1 below. */
	tgid_map_max = init_pid_ns.pid_max;
	map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
		       GFP_KERNEL);
	if (!map)
		return -ENOMEM;

	/*
	 * Pairs with smp_load_acquire() in
	 * trace_find_tgid_ptr() to ensure that if it observes
	 * the tgid_map we just allocated then it also observes
	 * the corresponding tgid_map_max value.
	 */
	smp_store_release(&tgid_map, map);
	return 0;
}
461
/* seq_file ->next(): the position is the pid itself; step forward. */
static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos)
{
	int next_pid;

	next_pid = ++(*pos);
	return trace_find_tgid_ptr(next_pid);
}
468
/* seq_file ->start(): position doubles as pid; out of range gives NULL. */
static void *saved_tgids_start(struct seq_file *m, loff_t *pos)
{
	return trace_find_tgid_ptr(*pos);
}
475
/* seq_file ->stop(): nothing to release — this iterator takes no locks. */
static void saved_tgids_stop(struct seq_file *m, void *v)
{
}
479
saved_tgids_show(struct seq_file * m,void * v)480 static int saved_tgids_show(struct seq_file *m, void *v)
481 {
482 int *entry = (int *)v;
483 int pid = entry - tgid_map;
484 int tgid = *entry;
485
486 if (tgid == 0)
487 return SEQ_SKIP;
488
489 seq_printf(m, "%d %d\n", pid, tgid);
490 return 0;
491 }
492
/* seq_file iterator over every recorded pid/tgid pair in tgid_map. */
static const struct seq_operations tracing_saved_tgids_seq_ops = {
	.start		= saved_tgids_start,
	.stop		= saved_tgids_stop,
	.next		= saved_tgids_next,
	.show		= saved_tgids_show,
};
499
tracing_saved_tgids_open(struct inode * inode,struct file * filp)500 static int tracing_saved_tgids_open(struct inode *inode, struct file *filp)
501 {
502 int ret;
503
504 ret = tracing_check_open_get_tr(NULL);
505 if (ret)
506 return ret;
507
508 return seq_open(filp, &tracing_saved_tgids_seq_ops);
509 }
510
511
/* File operations for the read-only "saved_tgids" tracefs file. */
const struct file_operations tracing_saved_tgids_fops = {
	.open		= tracing_saved_tgids_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
518
/*
 * seq_file ->next(): advance to the next in-use entry of
 * map_cmdline_to_pid, or return NULL when the table is exhausted.
 * Also used by ->start() to skip to the current position.
 */
static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos)
{
	unsigned int *ptr = v;

	/* Stay on the current entry for the initial probe (pos 0, empty buffer). */
	if (*pos || m->count)
		ptr++;

	(*pos)++;

	/* Scan forward for the next slot that actually holds a pid. */
	for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num];
	     ptr++) {
		/*
		 * NO_CMDLINE_MAP (UINT_MAX) marks an unused slot.  The old
		 * code also compared against -1, but *ptr is unsigned so
		 * that converts to the very same constant — one test is
		 * enough.
		 */
		if (*ptr == NO_CMDLINE_MAP)
			continue;

		return ptr;
	}

	return NULL;
}
538
/*
 * seq_file ->start(): take trace_cmdline_lock (preemption disabled
 * first, as the lock requires) and walk to the *pos'th in-use entry.
 * May return NULL with the lock still held — seq_file is expected to
 * call ->stop(), which releases it, regardless.
 */
static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos)
{
	void *v;
	loff_t l = 0;

	preempt_disable();
	arch_spin_lock(&trace_cmdline_lock);

	v = &savedcmd->map_cmdline_to_pid[0];
	while (l <= *pos) {
		v = saved_cmdlines_next(m, v, &l);
		if (!v)
			return NULL;
	}

	return v;
}
556
/* seq_file ->stop(): drop the lock taken in saved_cmdlines_start(). */
static void saved_cmdlines_stop(struct seq_file *m, void *v)
{
	arch_spin_unlock(&trace_cmdline_lock);
	preempt_enable();
}
562
saved_cmdlines_show(struct seq_file * m,void * v)563 static int saved_cmdlines_show(struct seq_file *m, void *v)
564 {
565 char buf[TASK_COMM_LEN];
566 unsigned int *pid = v;
567
568 __trace_find_cmdline(*pid, buf);
569 seq_printf(m, "%d %s\n", *pid, buf);
570 return 0;
571 }
572
/* seq_file iterator over every saved pid/comm pair in savedcmd. */
static const struct seq_operations tracing_saved_cmdlines_seq_ops = {
	.start		= saved_cmdlines_start,
	.next		= saved_cmdlines_next,
	.stop		= saved_cmdlines_stop,
	.show		= saved_cmdlines_show,
};
579
tracing_saved_cmdlines_open(struct inode * inode,struct file * filp)580 static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp)
581 {
582 int ret;
583
584 ret = tracing_check_open_get_tr(NULL);
585 if (ret)
586 return ret;
587
588 return seq_open(filp, &tracing_saved_cmdlines_seq_ops);
589 }
590
/* File operations for the read-only "saved_cmdlines" tracefs file. */
const struct file_operations tracing_saved_cmdlines_fops = {
	.open		= tracing_saved_cmdlines_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
597
598 static ssize_t
tracing_saved_cmdlines_size_read(struct file * filp,char __user * ubuf,size_t cnt,loff_t * ppos)599 tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
600 size_t cnt, loff_t *ppos)
601 {
602 char buf[64];
603 int r;
604
605 preempt_disable();
606 arch_spin_lock(&trace_cmdline_lock);
607 r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num);
608 arch_spin_unlock(&trace_cmdline_lock);
609 preempt_enable();
610
611 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
612 }
613
/*
 * Release the global savedcmd buffer.
 * NOTE(review): savedcmd is left dangling afterwards (not set to NULL);
 * callers must not use the cmdline APIs past this point.
 */
void trace_free_saved_cmdlines_buffer(void)
{
	free_saved_cmdlines_buffer(savedcmd);
}
618
/*
 * Replace the global savedcmd buffer with a new one holding @val entries.
 * Previously saved cmdlines are discarded, not migrated.  The pointer
 * swap happens under trace_cmdline_lock so readers never see a
 * half-initialized buffer; the old buffer is freed after dropping the
 * lock.  Returns 0 on success or -ENOMEM.
 */
static int tracing_resize_saved_cmdlines(unsigned int val)
{
	struct saved_cmdlines_buffer *s, *savedcmd_temp;

	s = allocate_cmdlines_buffer(val);
	if (!s)
		return -ENOMEM;

	preempt_disable();
	arch_spin_lock(&trace_cmdline_lock);
	savedcmd_temp = savedcmd;
	savedcmd = s;
	arch_spin_unlock(&trace_cmdline_lock);
	preempt_enable();
	free_saved_cmdlines_buffer(savedcmd_temp);

	return 0;
}
637
638 static ssize_t
tracing_saved_cmdlines_size_write(struct file * filp,const char __user * ubuf,size_t cnt,loff_t * ppos)639 tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf,
640 size_t cnt, loff_t *ppos)
641 {
642 unsigned long val;
643 int ret;
644
645 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
646 if (ret)
647 return ret;
648
649 /* must have at least 1 entry or less than PID_MAX_DEFAULT */
650 if (!val || val > PID_MAX_DEFAULT)
651 return -EINVAL;
652
653 ret = tracing_resize_saved_cmdlines((unsigned int)val);
654 if (ret < 0)
655 return ret;
656
657 *ppos += cnt;
658
659 return cnt;
660 }
661
/* File operations for the read/write "saved_cmdlines_size" tracefs file. */
const struct file_operations tracing_saved_cmdlines_size_fops = {
	.open		= tracing_open_generic,
	.read		= tracing_saved_cmdlines_size_read,
	.write		= tracing_saved_cmdlines_size_write,
};
667