xref: /linux/kernel/trace/trace_osnoise.c (revision 9cb99c598643ba78638dfd668cf020544159cf70)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * OS Noise Tracer: computes the OS Noise suffered by a running thread.
4  * Timerlat Tracer: measures the wakeup latency of a timer triggered IRQ and thread.
5  *
6  * Based on "hwlat_detector" tracer by:
7  *   Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com>
8  *   Copyright (C) 2013-2016 Steven Rostedt, Red Hat, Inc. <srostedt@redhat.com>
9  *   With feedback from Clark Williams <williams@redhat.com>
10  *
11  * And also based on the rtsl tracer presented on:
12  *  DE OLIVEIRA, Daniel Bristot, et al. Demystifying the real-time linux
13  *  scheduling latency. In: 32nd Euromicro Conference on Real-Time Systems
14  *  (ECRTS 2020). Schloss Dagstuhl-Leibniz-Zentrum fur Informatik, 2020.
15  *
16  * Copyright (C) 2021 Daniel Bristot de Oliveira, Red Hat, Inc. <bristot@redhat.com>
17  */
18 
19 #include <linux/kthread.h>
20 #include <linux/tracefs.h>
21 #include <linux/uaccess.h>
22 #include <linux/cpumask.h>
23 #include <linux/delay.h>
24 #include <linux/sched/clock.h>
25 #include <uapi/linux/sched/types.h>
26 #include <linux/sched.h>
27 #include <linux/string.h>
28 #include "trace.h"
29 
30 #ifdef CONFIG_X86_LOCAL_APIC
31 #include <asm/trace/irq_vectors.h>
32 #undef TRACE_INCLUDE_PATH
33 #undef TRACE_INCLUDE_FILE
34 #endif /* CONFIG_X86_LOCAL_APIC */
35 
36 #include <trace/events/irq.h>
37 #include <trace/events/sched.h>
38 
39 #define CREATE_TRACE_POINTS
40 #include <trace/events/osnoise.h>
41 
/*
 * Default values.
 *
 * BANNER is the "osnoise: " prefix string. Each numeric default carries its
 * human-readable equivalent in the trailing comment; the 1000000 <-> 1s and
 * 1000 <-> 1ms pairs imply that periods and runtimes are in microseconds.
 */
#define BANNER			"osnoise: "
#define DEFAULT_SAMPLE_PERIOD	1000000			/* 1s */
#define DEFAULT_SAMPLE_RUNTIME	1000000			/* 1s */

#define DEFAULT_TIMERLAT_PERIOD	1000			/* 1ms */
#define DEFAULT_TIMERLAT_PRIO	95			/* FIFO 95 */
51 
/*
 * osnoise/options entries.
 *
 * Every enumerator is the index of one option. OSN_MAX is not an option, it
 * only counts the entries and sizes osnoise_options_str[].
 */
enum osnoise_options_index {
	OSN_DEFAULTS = 0,
	OSN_WORKLOAD,
	OSN_PANIC_ON_STOP,
	OSN_PREEMPT_DISABLE,
	OSN_IRQ_DISABLE,
	OSN_TIMERLAT_ALIGN,
	OSN_MAX
};

/*
 * Option names, indexed by enum osnoise_options_index. The designated
 * initializers tie each name to its enumerator.
 */
static const char * const osnoise_options_str[OSN_MAX] = {
	[OSN_DEFAULTS]		= "DEFAULTS",
	[OSN_WORKLOAD]		= "OSNOISE_WORKLOAD",
	[OSN_PANIC_ON_STOP]	= "PANIC_ON_STOP",
	[OSN_PREEMPT_DISABLE]	= "OSNOISE_PREEMPT_DISABLE",
	[OSN_IRQ_DISABLE]	= "OSNOISE_IRQ_DISABLE",
	[OSN_TIMERLAT_ALIGN]	= "TIMERLAT_ALIGN",
};
72 
/*
 * Default option mask. 0x2 has only bit 1 set and OSN_WORKLOAD is index 1, so
 * presumably options are stored as bit positions and OSNOISE_WORKLOAD is the
 * only option enabled by default -- TODO confirm against the option accessors.
 */
#define OSN_DEFAULT_OPTIONS		0x2
static unsigned long osnoise_options	= OSN_DEFAULT_OPTIONS;
75 
/*
 * trace_array of the enabled osnoise/timerlat instances.
 *
 * One osnoise_instance is allocated per registered trace_array and linked
 * into osnoise_instances. The list is walked under rcu_read_lock() by the
 * recording paths and modified with the _rcu list helpers.
 */
struct osnoise_instance {
	struct list_head	list;	/* link in osnoise_instances */
	struct trace_array	*tr;	/* the registered trace instance */
};

/* Head of the registered instances list; its initialization is not shown here. */
static struct list_head osnoise_instances;
85 
86 static void osnoise_print(const char *fmt, ...)
87 {
88 	struct osnoise_instance *inst;
89 	struct trace_array *tr;
90 	va_list ap;
91 
92 	rcu_read_lock();
93 	list_for_each_entry_rcu(inst, &osnoise_instances, list) {
94 		tr = inst->tr;
95 		va_start(ap, fmt);
96 		trace_array_vprintk(tr, _RET_IP_, fmt, ap);
97 		va_end(ap);
98 	}
99 	rcu_read_unlock();
100 }
101 
102 static bool osnoise_has_registered_instances(void)
103 {
104 	return !!list_first_or_null_rcu(&osnoise_instances,
105 					struct osnoise_instance,
106 					list);
107 }
108 
109 /*
110  * osnoise_instance_registered - check if a tr is already registered
111  */
112 static int osnoise_instance_registered(struct trace_array *tr)
113 {
114 	struct osnoise_instance *inst;
115 	int found = 0;
116 
117 	rcu_read_lock();
118 	list_for_each_entry_rcu(inst, &osnoise_instances, list) {
119 		if (inst->tr == tr)
120 			found = 1;
121 	}
122 	rcu_read_unlock();
123 
124 	return found;
125 }
126 
127 /*
128  * osnoise_register_instance - register a new trace instance
129  *
130  * Register a trace_array *tr in the list of instances running
131  * osnoise/timerlat tracers.
132  */
133 static int osnoise_register_instance(struct trace_array *tr)
134 {
135 	struct osnoise_instance *inst;
136 
137 	/*
138 	 * register/unregister serialization is provided by trace's
139 	 * trace_types_lock.
140 	 */
141 	lockdep_assert_held(&trace_types_lock);
142 	trace_array_init_printk(tr);
143 
144 	inst = kmalloc_obj(*inst);
145 	if (!inst)
146 		return -ENOMEM;
147 
148 	INIT_LIST_HEAD_RCU(&inst->list);
149 	inst->tr = tr;
150 	list_add_tail_rcu(&inst->list, &osnoise_instances);
151 
152 	return 0;
153 }
154 
155 /*
156  *  osnoise_unregister_instance - unregister a registered trace instance
157  *
158  * Remove the trace_array *tr from the list of instances running
159  * osnoise/timerlat tracers.
160  */
161 static void osnoise_unregister_instance(struct trace_array *tr)
162 {
163 	struct osnoise_instance *inst;
164 	int found = 0;
165 
166 	/*
167 	 * register/unregister serialization is provided by trace's
168 	 * trace_types_lock.
169 	 */
170 	list_for_each_entry_rcu(inst, &osnoise_instances, list,
171 				lockdep_is_held(&trace_types_lock)) {
172 		if (inst->tr == tr) {
173 			list_del_rcu(&inst->list);
174 			found = 1;
175 			break;
176 		}
177 	}
178 
179 	if (!found)
180 		return;
181 
182 	kvfree_rcu_mightsleep(inst);
183 }
184 
/*
 * NMI runtime info.
 */
struct osn_nmi {
	u64	count;		/* incremented on every NMI entry while sampling */
	u64	delta_start;	/* timestamp taken at NMI entry, used at exit */
};
192 
/*
 * IRQ runtime info.
 */
struct osn_irq {
	u64	count;		/* incremented on every IRQ entry while sampling */
	u64	arrival_time;	/* entry timestamp, reported by the irq_noise event */
	u64	delta_start;	/* start of the window; pushed forward by NMI time */
};
201 
/*
 * Context identifiers. NOTE(review): their users are not visible in this part
 * of the file; presumably they fill the "context" field of timerlat samples.
 */
#define IRQ_CONTEXT	0
#define THREAD_CONTEXT	1
#define THREAD_URET	2
/*
 * softirq runtime info.
 */
struct osn_softirq {
	u64	count;		/* incremented on every softirq entry while sampling */
	u64	arrival_time;	/* entry timestamp, reported by the softirq_noise event */
	u64	delta_start;	/* start of the window; pushed forward by IRQ/NMI time */
};
213 
/*
 * thread runtime info.
 *
 * Same layout as the IRQ and softirq counterparts. delta_start is pushed
 * forward by NMI, IRQ and softirq durations; the code that sets count and
 * arrival_time is not visible in this part of the file.
 */
struct osn_thread {
	u64	count;
	u64	arrival_time;
	u64	delta_start;
};
222 
/*
 * Runtime information: this structure saves the runtime information used by
 * one sampling thread.
 */
struct osnoise_variables {
	struct task_struct	*kthread;
	bool			sampling;	/* callbacks return early when false */
	pid_t			pid;
	struct osn_nmi		nmi;
	struct osn_irq		irq;
	struct osn_softirq	softirq;
	struct osn_thread	thread;
	/*
	 * Bumped on NMI, IRQ and softirq entry; the *_int_safe_*() helpers
	 * re-read it to detect an interruption while they sample the time.
	 */
	local_t			int_counter;
};
237 
/*
 * Per-cpu runtime information: one struct osnoise_variables per CPU.
 */
static DEFINE_PER_CPU(struct osnoise_variables, per_cpu_osnoise_var);
242 
243 /*
244  * this_cpu_osn_var - Return the per-cpu osnoise_variables on its relative CPU
245  */
246 static inline struct osnoise_variables *this_cpu_osn_var(void)
247 {
248 	return this_cpu_ptr(&per_cpu_osnoise_var);
249 }
250 
/*
 * Protect the interface. Also taken by tlat_var_reset() while it cancels
 * the timers and clears the per-cpu timerlat variables.
 */
static struct mutex interface_lock;
255 
256 #ifdef CONFIG_TIMERLAT_TRACER
/*
 * Runtime information for the timer mode.
 *
 * Only the uses of kthread, timer and tracing_thread are visible in this
 * part of the file; the remaining fields are used further down.
 */
struct timerlat_variables {
	struct task_struct	*kthread;	/* non-NULL: timer is cancelled on reset */
	struct hrtimer		timer;
	u64			rel_period;
	u64			abs_period;
	bool			tracing_thread;	/* false: softirq/thread exit is not traced */
	u64			count;
	bool			uthread_migrate;
};
269 
/* Per-cpu timerlat runtime information. */
static DEFINE_PER_CPU(struct timerlat_variables, per_cpu_timerlat_var);

/*
 * timerlat wake-up offset for next thread with TIMERLAT_ALIGN set.
 * Set back to 0 by tlat_var_reset().
 */
static atomic64_t align_next;
276 
277 /*
278  * this_cpu_tmr_var - Return the per-cpu timerlat_variables on its relative CPU
279  */
280 static inline struct timerlat_variables *this_cpu_tmr_var(void)
281 {
282 	return this_cpu_ptr(&per_cpu_timerlat_var);
283 }
284 
285 /*
286  * tlat_var_reset - Reset the values of the given timerlat_variables
287  */
288 static inline void tlat_var_reset(void)
289 {
290 	struct timerlat_variables *tlat_var;
291 	int cpu;
292 
293 	/* Synchronize with the timerlat interfaces */
294 	mutex_lock(&interface_lock);
295 
296 	/*
297 	 * So far, all the values are initialized as 0, so
298 	 * zeroing the structure is perfect.
299 	 */
300 	for_each_online_cpu(cpu) {
301 		tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu);
302 		if (tlat_var->kthread)
303 			hrtimer_cancel(&tlat_var->timer);
304 		memset(tlat_var, 0, sizeof(*tlat_var));
305 	}
306 	/*
307 	 * Reset also align_next, to be filled by a new offset by the first timerlat
308 	 * thread that wakes up, if TIMERLAT_ALIGN is set.
309 	 */
310 	atomic64_set(&align_next, 0);
311 
312 	mutex_unlock(&interface_lock);
313 }
314 #else /* CONFIG_TIMERLAT_TRACER */
/* Without the timerlat tracer there is nothing to reset. */
#define tlat_var_reset()	do {} while (0)
316 #endif /* CONFIG_TIMERLAT_TRACER */
317 
318 /*
319  * osn_var_reset - Reset the values of the given osnoise_variables
320  */
321 static inline void osn_var_reset(void)
322 {
323 	struct osnoise_variables *osn_var;
324 	int cpu;
325 
326 	/*
327 	 * So far, all the values are initialized as 0, so
328 	 * zeroing the structure is perfect.
329 	 */
330 	for_each_online_cpu(cpu) {
331 		osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu);
332 		memset(osn_var, 0, sizeof(*osn_var));
333 	}
334 }
335 
/*
 * osn_var_reset_all - Reset the value of all per-cpu osnoise_variables
 *
 * Resets the osnoise variables first and then the timerlat ones; the latter
 * is a no-op when CONFIG_TIMERLAT_TRACER is not set.
 */
static inline void osn_var_reset_all(void)
{
	osn_var_reset();
	tlat_var_reset();
}
344 
/*
 * Tells NMIs to call back to the osnoise tracer to record timestamps.
 * Not static: it is visible outside this file.
 */
bool trace_osnoise_callback_enabled;
349 
/*
 * Tracer data.
 *
 * Single global configuration/state object. Fields not listed in the
 * initializer (tainted) start as zero/false.
 */
static struct osnoise_data {
	u64	sample_period;		/* total sampling period */
	u64	sample_runtime;		/* active sampling portion of period */
	u64	stop_tracing;		/* stop trace in the internal operation (loop/irq) */
	u64	stop_tracing_total;	/* stop trace in the final operation (report/thread) */
#ifdef CONFIG_TIMERLAT_TRACER
	u64	timerlat_period;	/* timerlat period */
	u64	timerlat_align_us;	/* timerlat alignment */
	u64	print_stack;		/* print IRQ stack if total > */
	int	timerlat_tracer;	/* timerlat tracer */
#endif
	bool	tainted;		/* info users and developers about a problem */
} osnoise_data = {
	.sample_period			= DEFAULT_SAMPLE_PERIOD,
	.sample_runtime			= DEFAULT_SAMPLE_RUNTIME,
	.stop_tracing			= 0,
	.stop_tracing_total		= 0,
#ifdef CONFIG_TIMERLAT_TRACER
	.print_stack			= 0,	/* 0 disables the stack dump */
	.timerlat_period		= DEFAULT_TIMERLAT_PERIOD,
	.timerlat_align_us		= 0,
	.timerlat_tracer		= 0,
#endif
};
377 
378 #ifdef CONFIG_TIMERLAT_TRACER
379 static inline bool timerlat_enabled(void)
380 {
381 	return osnoise_data.timerlat_tracer;
382 }
383 
384 static inline int timerlat_softirq_exit(struct osnoise_variables *osn_var)
385 {
386 	struct timerlat_variables *tlat_var = this_cpu_tmr_var();
387 	/*
388 	 * If the timerlat is enabled, but the irq handler did
389 	 * not run yet enabling timerlat_tracer, do not trace.
390 	 */
391 	if (!tlat_var->tracing_thread) {
392 		osn_var->softirq.arrival_time = 0;
393 		osn_var->softirq.delta_start = 0;
394 		return 0;
395 	}
396 	return 1;
397 }
398 
399 static inline int timerlat_thread_exit(struct osnoise_variables *osn_var)
400 {
401 	struct timerlat_variables *tlat_var = this_cpu_tmr_var();
402 	/*
403 	 * If the timerlat is enabled, but the irq handler did
404 	 * not run yet enabling timerlat_tracer, do not trace.
405 	 */
406 	if (!tlat_var->tracing_thread) {
407 		osn_var->thread.delta_start = 0;
408 		osn_var->thread.arrival_time = 0;
409 		return 0;
410 	}
411 	return 1;
412 }
413 #else /* CONFIG_TIMERLAT_TRACER */
/*
 * Stubs used when CONFIG_TIMERLAT_TRACER is not set: timerlat is never
 * enabled and the exit helpers always tell the caller to trace (return 1).
 */
static inline bool timerlat_enabled(void)
{
	return false;
}

static inline int timerlat_softirq_exit(struct osnoise_variables *osn_var)
{
	return 1;
}
static inline int timerlat_thread_exit(struct osnoise_variables *osn_var)
{
	return 1;
}
427 #endif
428 
429 #ifdef CONFIG_PREEMPT_RT
430 /*
431  * Print the osnoise header info.
432  */
433 static void print_osnoise_headers(struct seq_file *s)
434 {
435 	if (osnoise_data.tainted)
436 		seq_puts(s, "# osnoise is tainted!\n");
437 
438 	seq_puts(s, "#                                _-------=> irqs-off\n");
439 	seq_puts(s, "#                               / _------=> need-resched\n");
440 	seq_puts(s, "#                              | / _-----=> need-resched-lazy\n");
441 	seq_puts(s, "#                              || / _----=> hardirq/softirq\n");
442 	seq_puts(s, "#                              ||| / _---=> preempt-depth\n");
443 	seq_puts(s, "#                              |||| / _--=> preempt-lazy-depth\n");
444 	seq_puts(s, "#                              ||||| / _-=> migrate-disable\n");
445 
446 	seq_puts(s, "#                              |||||| /          ");
447 	seq_puts(s, "                                     MAX\n");
448 
449 	seq_puts(s, "#                              ||||| /                         ");
450 	seq_puts(s, "                    SINGLE      Interference counters:\n");
451 
452 	seq_puts(s, "#                              |||||||               RUNTIME   ");
453 	seq_puts(s, "   NOISE  %% OF CPU  NOISE    +-----------------------------+\n");
454 
455 	seq_puts(s, "#           TASK-PID      CPU# |||||||   TIMESTAMP    IN US    ");
456 	seq_puts(s, "   IN US  AVAILABLE  IN US     HW    NMI    IRQ   SIRQ THREAD\n");
457 
458 	seq_puts(s, "#              | |         |   |||||||      |           |      ");
459 	seq_puts(s, "       |    |            |      |      |      |      |      |\n");
460 }
461 #else /* CONFIG_PREEMPT_RT */
462 static void print_osnoise_headers(struct seq_file *s)
463 {
464 	if (osnoise_data.tainted)
465 		seq_puts(s, "# osnoise is tainted!\n");
466 
467 	seq_puts(s, "#                                _-----=> irqs-off\n");
468 	seq_puts(s, "#                               / _----=> need-resched\n");
469 	seq_puts(s, "#                              | / _---=> hardirq/softirq\n");
470 	seq_puts(s, "#                              || / _--=> preempt-depth\n");
471 	seq_puts(s, "#                              ||| / _-=> migrate-disable     ");
472 	seq_puts(s, "                    MAX\n");
473 	seq_puts(s, "#                              |||| /     delay               ");
474 	seq_puts(s, "                    SINGLE      Interference counters:\n");
475 
476 	seq_puts(s, "#                              |||||               RUNTIME   ");
477 	seq_puts(s, "   NOISE  %% OF CPU  NOISE    +-----------------------------+\n");
478 
479 	seq_puts(s, "#           TASK-PID      CPU# |||||   TIMESTAMP    IN US    ");
480 	seq_puts(s, "   IN US  AVAILABLE  IN US     HW    NMI    IRQ   SIRQ THREAD\n");
481 
482 	seq_puts(s, "#              | |         |   |||||      |           |      ");
483 	seq_puts(s, "       |    |            |      |      |      |      |      |\n");
484 }
485 #endif /* CONFIG_PREEMPT_RT */
486 
/*
 * osnoise_taint - report an osnoise error.
 *
 * Prints @msg to every registered instance and sets osnoise_data.tainted.
 * @msg is handed to osnoise_print() as the format string, so it must not
 * contain unintended conversion specifiers.
 */
#define osnoise_taint(msg) ({							\
	osnoise_print(msg);							\
	osnoise_data.tainted = true;						\
})
494 
495 /*
496  * Record an osnoise_sample into the tracer buffer.
497  */
498 static void
499 __record_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer)
500 {
501 	struct ring_buffer_event *event;
502 	struct osnoise_entry *entry;
503 
504 	event = trace_buffer_lock_reserve(buffer, TRACE_OSNOISE, sizeof(*entry),
505 					  tracing_gen_ctx());
506 	if (!event)
507 		return;
508 	entry	= ring_buffer_event_data(event);
509 	entry->runtime		= sample->runtime;
510 	entry->noise		= sample->noise;
511 	entry->max_sample	= sample->max_sample;
512 	entry->hw_count		= sample->hw_count;
513 	entry->nmi_count	= sample->nmi_count;
514 	entry->irq_count	= sample->irq_count;
515 	entry->softirq_count	= sample->softirq_count;
516 	entry->thread_count	= sample->thread_count;
517 
518 	trace_buffer_unlock_commit_nostack(buffer, event);
519 }
520 
521 /*
522  * Record an osnoise_sample on all osnoise instances and fire trace event.
523  */
524 static void record_osnoise_sample(struct osnoise_sample *sample)
525 {
526 	struct osnoise_instance *inst;
527 	struct trace_buffer *buffer;
528 
529 	trace_osnoise_sample(sample);
530 
531 	rcu_read_lock();
532 	list_for_each_entry_rcu(inst, &osnoise_instances, list) {
533 		buffer = inst->tr->array_buffer.buffer;
534 		__record_osnoise_sample(sample, buffer);
535 	}
536 	rcu_read_unlock();
537 }
538 
539 #ifdef CONFIG_TIMERLAT_TRACER
540 /*
541  * Print the timerlat header info.
542  */
543 #ifdef CONFIG_PREEMPT_RT
/*
 * print_timerlat_headers - write the timerlat column legend to @s
 * (PREEMPT_RT column layout).
 *
 * The legend is kept as a NULL-terminated table of fragments that are
 * written in order; a fragment without a trailing newline continues on the
 * next one.
 */
static void print_timerlat_headers(struct seq_file *s)
{
	static const char * const legend[] = {
		"#                                _-------=> irqs-off\n",
		"#                               / _------=> need-resched\n",
		"#                              | / _-----=> need-resched-lazy\n",
		"#                              || / _----=> hardirq/softirq\n",
		"#                              ||| / _---=> preempt-depth\n",
		"#                              |||| / _--=> preempt-lazy-depth\n",
		"#                              ||||| / _-=> migrate-disable\n",
		"#                              |||||| /\n",
		"#                              |||||||             ACTIVATION\n",
		"#           TASK-PID      CPU# |||||||   TIMESTAMP    ID     ",
		"       CONTEXT                LATENCY\n",
		"#              | |         |   |||||||      |         |      ",
		"            |                       |\n",
		NULL
	};
	const char * const *line;

	for (line = legend; *line; line++)
		seq_puts(s, *line);
}
560 #else /* CONFIG_PREEMPT_RT */
/*
 * print_timerlat_headers - write the timerlat column legend to @s
 * (non PREEMPT_RT column layout).
 *
 * The legend is kept as a NULL-terminated table of fragments that are
 * written in order; a fragment without a trailing newline continues on the
 * next one.
 */
static void print_timerlat_headers(struct seq_file *s)
{
	static const char * const legend[] = {
		"#                                _-----=> irqs-off\n",
		"#                               / _----=> need-resched\n",
		"#                              | / _---=> hardirq/softirq\n",
		"#                              || / _--=> preempt-depth\n",
		"#                              ||| / _-=> migrate-disable\n",
		"#                              |||| /     delay\n",
		"#                              |||||            ACTIVATION\n",
		"#           TASK-PID      CPU# |||||   TIMESTAMP   ID      ",
		"      CONTEXT                 LATENCY\n",
		"#              | |         |   |||||      |         |      ",
		"            |                       |\n",
		NULL
	};
	const char * const *line;

	for (line = legend; *line; line++)
		seq_puts(s, *line);
}
575 #endif /* CONFIG_PREEMPT_RT */
576 
577 static void
578 __record_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer)
579 {
580 	struct ring_buffer_event *event;
581 	struct timerlat_entry *entry;
582 
583 	event = trace_buffer_lock_reserve(buffer, TRACE_TIMERLAT, sizeof(*entry),
584 					  tracing_gen_ctx());
585 	if (!event)
586 		return;
587 	entry	= ring_buffer_event_data(event);
588 	entry->seqnum			= sample->seqnum;
589 	entry->context			= sample->context;
590 	entry->timer_latency		= sample->timer_latency;
591 
592 	trace_buffer_unlock_commit_nostack(buffer, event);
593 }
594 
595 /*
596  * Record an timerlat_sample into the tracer buffer.
597  */
598 static void record_timerlat_sample(struct timerlat_sample *sample)
599 {
600 	struct osnoise_instance *inst;
601 	struct trace_buffer *buffer;
602 
603 	trace_timerlat_sample(sample);
604 
605 	rcu_read_lock();
606 	list_for_each_entry_rcu(inst, &osnoise_instances, list) {
607 		buffer = inst->tr->array_buffer.buffer;
608 		__record_timerlat_sample(sample, buffer);
609 	}
610 	rcu_read_unlock();
611 }
612 
613 #ifdef CONFIG_STACKTRACE
614 
/* Maximum number of return addresses kept in a saved stack trace. */
#define	MAX_CALLS	256

/*
 * Stack trace will take place only at IRQ level, so, no need
 * to control nesting here.
 */
struct trace_stack {
	int		stack_size;	/* nr_entries * sizeof(unsigned long), in bytes */
	int		nr_entries;	/* entries stored by stack_trace_save() */
	unsigned long	calls[MAX_CALLS];
};

/* One saved stack per CPU, written by timerlat_save_stack(). */
static DEFINE_PER_CPU(struct trace_stack, trace_stack);
628 
629 /*
630  * timerlat_save_stack - save a stack trace without printing
631  *
632  * Save the current stack trace without printing. The
633  * stack will be printed later, after the end of the measurement.
634  */
635 static void timerlat_save_stack(int skip)
636 {
637 	unsigned int size, nr_entries;
638 	struct trace_stack *fstack;
639 
640 	fstack = this_cpu_ptr(&trace_stack);
641 
642 	size = ARRAY_SIZE(fstack->calls);
643 
644 	nr_entries = stack_trace_save(fstack->calls, size, skip);
645 
646 	fstack->stack_size = nr_entries * sizeof(unsigned long);
647 	fstack->nr_entries = nr_entries;
648 
649 	return;
650 
651 }
652 
653 static void
654 __timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, unsigned int size)
655 {
656 	struct ring_buffer_event *event;
657 	struct stack_entry *entry;
658 
659 	event = trace_buffer_lock_reserve(buffer, TRACE_STACK, sizeof(*entry) + size,
660 					  tracing_gen_ctx());
661 	if (!event)
662 		return;
663 
664 	entry = ring_buffer_event_data(event);
665 
666 	entry->size = fstack->nr_entries;
667 	memcpy(&entry->caller, fstack->calls, size);
668 
669 	trace_buffer_unlock_commit_nostack(buffer, event);
670 }
671 
672 /*
673  * timerlat_dump_stack - dump a stack trace previously saved
674  */
675 static void timerlat_dump_stack(u64 latency)
676 {
677 	struct osnoise_instance *inst;
678 	struct trace_buffer *buffer;
679 	struct trace_stack *fstack;
680 	unsigned int size;
681 
682 	/*
683 	 * trace only if latency > print_stack config, if enabled.
684 	 */
685 	if (!osnoise_data.print_stack || osnoise_data.print_stack > latency)
686 		return;
687 
688 	preempt_disable_notrace();
689 	fstack = this_cpu_ptr(&trace_stack);
690 	size = fstack->stack_size;
691 
692 	rcu_read_lock();
693 	list_for_each_entry_rcu(inst, &osnoise_instances, list) {
694 		buffer = inst->tr->array_buffer.buffer;
695 		__timerlat_dump_stack(buffer, fstack, size);
696 
697 	}
698 	rcu_read_unlock();
699 	preempt_enable_notrace();
700 }
701 #else /* CONFIG_STACKTRACE */
/*
 * Without CONFIG_STACKTRACE the stack helpers compile away. Macro parameters
 * are plain identifiers: a typed parameter such as "u64 latency" is not a
 * valid parameter list and breaks the build of this configuration.
 */
#define timerlat_dump_stack(latency) do {} while (0)
#define timerlat_save_stack(a) do {} while (0)
704 #endif /* CONFIG_STACKTRACE */
705 #endif /* CONFIG_TIMERLAT_TRACER */
706 
707 /*
708  * Macros to encapsulate the time capturing infrastructure.
709  */
710 #define time_get()	trace_clock_local()
711 #define time_to_us(x)	div_u64(x, 1000)
712 #define time_sub(a, b)	((a) - (b))
713 
714 /*
715  * cond_move_irq_delta_start - Forward the delta_start of a running IRQ
716  *
717  * If an IRQ is preempted by an NMI, its delta_start is pushed forward
718  * to discount the NMI interference.
719  *
720  * See get_int_safe_duration().
721  */
722 static inline void
723 cond_move_irq_delta_start(struct osnoise_variables *osn_var, u64 duration)
724 {
725 	if (osn_var->irq.delta_start)
726 		osn_var->irq.delta_start += duration;
727 }
728 
729 #ifndef CONFIG_PREEMPT_RT
730 /*
731  * cond_move_softirq_delta_start - Forward the delta_start of a running softirq.
732  *
733  * If a softirq is preempted by an IRQ or NMI, its delta_start is pushed
734  * forward to discount the interference.
735  *
736  * See get_int_safe_duration().
737  */
738 static inline void
739 cond_move_softirq_delta_start(struct osnoise_variables *osn_var, u64 duration)
740 {
741 	if (osn_var->softirq.delta_start)
742 		osn_var->softirq.delta_start += duration;
743 }
744 #else /* CONFIG_PREEMPT_RT */
/* With CONFIG_PREEMPT_RT there is no softirq window to push forward. */
#define cond_move_softirq_delta_start(osn_var, duration) do {} while (0)
746 #endif
747 
748 /*
749  * cond_move_thread_delta_start - Forward the delta_start of a running thread
750  *
751  * If a noisy thread is preempted by an softirq, IRQ or NMI, its delta_start
752  * is pushed forward to discount the interference.
753  *
754  * See get_int_safe_duration().
755  */
756 static inline void
757 cond_move_thread_delta_start(struct osnoise_variables *osn_var, u64 duration)
758 {
759 	if (osn_var->thread.delta_start)
760 		osn_var->thread.delta_start += duration;
761 }
762 
/*
 * get_int_safe_duration - Get the duration of a window
 *
 * The irq, softirq and thread variables need to have their duration without
 * the interference from higher priority interrupts. Instead of keeping a
 * variable to discount the interrupt interference from these variables, the
 * starting time of these variables is pushed forward with the interrupt's
 * duration. In this way, a single variable is used to:
 *
 *   - Know if a given window is being measured.
 *   - Account its duration.
 *   - Discount the interference.
 *
 * To avoid getting inconsistent values, e.g.,:
 *
 *	now = time_get()
 *		--->	interrupt!
 *			delta_start += int duration;
 *		<---
 *	duration = now - delta_start;
 *
 *	result: negative duration if the variable duration before the
 *	interrupt was smaller than the interrupt execution.
 *
 * A counter of interrupts is used. If the counter increased, try
 * to capture an interference safe duration.
 *
 * The window is closed on return: *delta_start is set to 0. A negative
 * result is still returned to the caller, after tainting the tracer.
 */
static inline s64
get_int_safe_duration(struct osnoise_variables *osn_var, u64 *delta_start)
{
	u64 int_counter, now;
	s64 duration;

	/* Retry until no interrupt was accounted between the two reads. */
	do {
		int_counter = local_read(&osn_var->int_counter);
		/* synchronize with interrupts */
		barrier();

		now = time_get();
		duration = (now - *delta_start);

		/* synchronize with interrupts */
		barrier();
	} while (int_counter != local_read(&osn_var->int_counter));

	/*
	 * This is an evidence of race conditions that cause
	 * a value to be "discounted" too much.
	 */
	if (duration < 0)
		osnoise_taint("Negative duration!\n");

	*delta_start = 0;

	return duration;
}
819 
/*
 * set_int_safe_time - Save the current time on *time, aware of interference
 *
 * Get the time, taking into consideration a possible interference from
 * higher priority interrupts: the read is repeated until the interrupt
 * counter is the same before and after it.
 *
 * See get_int_safe_duration() for an explanation.
 *
 * Returns the interrupt counter value observed around the final read.
 */
static u64
set_int_safe_time(struct osnoise_variables *osn_var, u64 *time)
{
	u64 int_counter;

	do {
		int_counter = local_read(&osn_var->int_counter);
		/* synchronize with interrupts */
		barrier();

		*time = time_get();

		/* synchronize with interrupts */
		barrier();
	} while (int_counter != local_read(&osn_var->int_counter));

	return int_counter;
}
847 
848 #ifdef CONFIG_TIMERLAT_TRACER
/*
 * copy_int_safe_time - Copy *src into *dst aware of interference
 *
 * The copy is repeated until the interrupt counter is the same before and
 * after it, so *dst never holds a value taken while an interrupt was being
 * accounted.
 *
 * Returns the interrupt counter value observed around the final copy.
 */
static u64
copy_int_safe_time(struct osnoise_variables *osn_var, u64 *dst, u64 *src)
{
	u64 int_counter;

	do {
		int_counter = local_read(&osn_var->int_counter);
		/* synchronize with interrupts */
		barrier();

		*dst = *src;

		/* synchronize with interrupts */
		barrier();
	} while (int_counter != local_read(&osn_var->int_counter));

	return int_counter;
}
870 #endif /* CONFIG_TIMERLAT_TRACER */
871 
/*
 * trace_osnoise_callback - NMI entry/exit callback
 *
 * This function is called at the entry and exit NMI code. The bool enter
 * distinguishes between either case. This function is used to note a NMI
 * occurrence, compute the noise caused by the NMI, and to remove the noise
 * it is potentially causing on other interference variables.
 *
 * Nothing is done unless this CPU is sampling. With
 * CONFIG_GENERIC_SCHED_CLOCK only the NMI count is updated: no timestamp is
 * taken and no nmi_noise event is emitted.
 */
void trace_osnoise_callback(bool enter)
{
	struct osnoise_variables *osn_var = this_cpu_osn_var();
	u64 duration;

	if (!osn_var->sampling)
		return;

	/*
	 * Currently trace_clock_local() calls sched_clock() and the
	 * generic version is not NMI safe.
	 */
	if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) {
		if (enter) {
			osn_var->nmi.delta_start = time_get();
			/* Let the int-safe helpers know they were interrupted. */
			local_inc(&osn_var->int_counter);
		} else {
			duration = time_get() - osn_var->nmi.delta_start;

			trace_nmi_noise(osn_var->nmi.delta_start, duration);

			/* Do not charge the NMI time to the windows it preempted. */
			cond_move_irq_delta_start(osn_var, duration);
			cond_move_softirq_delta_start(osn_var, duration);
			cond_move_thread_delta_start(osn_var, duration);
		}
	}

	if (enter)
		osn_var->nmi.count++;
}
910 
/*
 * osnoise_trace_irq_entry - Note the starting of an IRQ
 *
 * Save the starting time of an IRQ. As IRQs are non-preemptive to other IRQs,
 * it is safe to use a single variable (osn_var->irq) to save the statistics.
 * The arrival_time is used to report... the arrival time. The delta_start
 * is used to compute the duration at the IRQ exit handler. See
 * cond_move_irq_delta_start().
 *
 * @id is not used here. Nothing is done unless this CPU is sampling.
 */
void osnoise_trace_irq_entry(int id)
{
	struct osnoise_variables *osn_var = this_cpu_osn_var();

	if (!osn_var->sampling)
		return;
	/*
	 * This value will be used in the report, but not to compute
	 * the execution time, so it is safe to get it unsafe.
	 */
	osn_var->irq.arrival_time = time_get();
	set_int_safe_time(osn_var, &osn_var->irq.delta_start);
	osn_var->irq.count++;

	/* Let the int-safe helpers of lower contexts know they were interrupted. */
	local_inc(&osn_var->int_counter);
}
936 
/*
 * osnoise_trace_irq_exit - Note the end of an IRQ, save data and trace
 *
 * Computes the duration of the IRQ noise, and traces it with @id and @desc.
 * Also discounts the IRQ duration from the softirq and thread windows that
 * could be currently being accounted.
 *
 * Nothing is done unless this CPU is sampling.
 */
void osnoise_trace_irq_exit(int id, const char *desc)
{
	struct osnoise_variables *osn_var = this_cpu_osn_var();
	s64 duration;

	if (!osn_var->sampling)
		return;

	duration = get_int_safe_duration(osn_var, &osn_var->irq.delta_start);
	trace_irq_noise(id, desc, osn_var->irq.arrival_time, duration);
	osn_var->irq.arrival_time = 0;
	cond_move_softirq_delta_start(osn_var, duration);
	cond_move_thread_delta_start(osn_var, duration);
}
957 
/*
 * trace_irqentry_callback - Callback to the irq:irq_entry traceevent
 *
 * Used to note the starting of an IRQ occurrence. Only @irq is forwarded;
 * @data and @action are unused.
 */
static void trace_irqentry_callback(void *data, int irq,
				    struct irqaction *action)
{
	osnoise_trace_irq_entry(irq);
}
968 
/*
 * trace_irqexit_callback - Callback to the irq:irq_exit traceevent
 *
 * Used to note the end of an IRQ occurrence. The action name is passed as
 * the description; @data and @ret are unused.
 */
static void trace_irqexit_callback(void *data, int irq,
				   struct irqaction *action, int ret)
{
	osnoise_trace_irq_exit(irq, action->name);
}
979 
/*
 * arch specific register function.
 *
 * Weak default that registers nothing and reports success (0); an
 * architecture may provide its own version.
 */
int __weak osnoise_arch_register(void)
{
	return 0;
}
987 
988 /*
989  * arch specific unregister function.
990  */
991 void __weak osnoise_arch_unregister(void)
992 {
993 	return;
994 }
995 
996 /*
997  * hook_irq_events - Hook IRQ handling events
998  *
999  * This function hooks the IRQ related callbacks to the respective trace
1000  * events.
1001  */
1002 static int hook_irq_events(void)
1003 {
1004 	int ret;
1005 
1006 	ret = register_trace_irq_handler_entry(trace_irqentry_callback, NULL);
1007 	if (ret)
1008 		goto out_err;
1009 
1010 	ret = register_trace_irq_handler_exit(trace_irqexit_callback, NULL);
1011 	if (ret)
1012 		goto out_unregister_entry;
1013 
1014 	ret = osnoise_arch_register();
1015 	if (ret)
1016 		goto out_irq_exit;
1017 
1018 	return 0;
1019 
1020 out_irq_exit:
1021 	unregister_trace_irq_handler_exit(trace_irqexit_callback, NULL);
1022 out_unregister_entry:
1023 	unregister_trace_irq_handler_entry(trace_irqentry_callback, NULL);
1024 out_err:
1025 	return -EINVAL;
1026 }
1027 
/*
 * unhook_irq_events - Unhook IRQ handling events
 *
 * This function unhooks the IRQ related callbacks from the respective trace
 * events, in the reverse order of hook_irq_events(): arch specific part
 * first, then the exit callback, then the entry callback.
 */
static void unhook_irq_events(void)
{
	osnoise_arch_unregister();
	unregister_trace_irq_handler_exit(trace_irqexit_callback, NULL);
	unregister_trace_irq_handler_entry(trace_irqentry_callback, NULL);
}
1040 
1041 #ifndef CONFIG_PREEMPT_RT
/*
 * trace_softirq_entry_callback - Note the starting of a softirq
 *
 * Save the starting time of a softirq. As softirqs are non-preemptive to
 * other softirqs, it is safe to use a single variable (osn_var->softirq)
 * to save the statistics. The arrival_time is used to report... the
 * arrival time. The delta_start is used to compute the duration at the
 * softirq exit handler. See cond_move_softirq_delta_start().
 *
 * @data and @vec_nr are not used here. Nothing is done unless this CPU is
 * sampling.
 */
static void trace_softirq_entry_callback(void *data, unsigned int vec_nr)
{
	struct osnoise_variables *osn_var = this_cpu_osn_var();

	if (!osn_var->sampling)
		return;
	/*
	 * This value will be used in the report, but not to compute
	 * the execution time, so it is safe to get it unsafe.
	 */
	osn_var->softirq.arrival_time = time_get();
	set_int_safe_time(osn_var, &osn_var->softirq.delta_start);
	osn_var->softirq.count++;

	/* Let the int-safe helpers of the thread context know they were interrupted. */
	local_inc(&osn_var->int_counter);
}
1067 
1068 /*
1069  * trace_softirq_exit_callback - Note the end of an softirq
1070  *
1071  * Computes the duration of the softirq noise, and trace it. Also discounts the
1072  * interference from other sources of noise could be currently being accounted.
1073  */
1074 static void trace_softirq_exit_callback(void *data, unsigned int vec_nr)
1075 {
1076 	struct osnoise_variables *osn_var = this_cpu_osn_var();
1077 	s64 duration;
1078 
1079 	if (!osn_var->sampling)
1080 		return;
1081 
1082 	if (unlikely(timerlat_enabled()))
1083 		if (!timerlat_softirq_exit(osn_var))
1084 			return;
1085 
1086 	duration = get_int_safe_duration(osn_var, &osn_var->softirq.delta_start);
1087 	trace_softirq_noise(vec_nr, osn_var->softirq.arrival_time, duration);
1088 	cond_move_thread_delta_start(osn_var, duration);
1089 	osn_var->softirq.arrival_time = 0;
1090 }
1091 
1092 /*
1093  * hook_softirq_events - Hook softirq handling events
1094  *
1095  * This function hooks the softirq related callbacks to the respective trace
1096  * events.
1097  */
1098 static int hook_softirq_events(void)
1099 {
1100 	int ret;
1101 
1102 	ret = register_trace_softirq_entry(trace_softirq_entry_callback, NULL);
1103 	if (ret)
1104 		goto out_err;
1105 
1106 	ret = register_trace_softirq_exit(trace_softirq_exit_callback, NULL);
1107 	if (ret)
1108 		goto out_unreg_entry;
1109 
1110 	return 0;
1111 
1112 out_unreg_entry:
1113 	unregister_trace_softirq_entry(trace_softirq_entry_callback, NULL);
1114 out_err:
1115 	return -EINVAL;
1116 }
1117 
1118 /*
1119  * unhook_softirq_events - Unhook softirq handling events
1120  *
1121  * This function hooks the softirq related callbacks to the respective trace
1122  * events.
1123  */
1124 static void unhook_softirq_events(void)
1125 {
1126 	unregister_trace_softirq_entry(trace_softirq_entry_callback, NULL);
1127 	unregister_trace_softirq_exit(trace_softirq_exit_callback, NULL);
1128 }
1129 #else /* CONFIG_PREEMPT_RT */
/*
 * Softirqs run as threads in the PREEMPT_RT mode, so there is nothing to
 * hook here: hooking always succeeds and unhooking is a no-op.
 */
static int hook_softirq_events(void)
{
	return 0;
}

static void unhook_softirq_events(void)
{
}
1140 #endif
1141 
1142 /*
1143  * thread_entry - Record the starting of a thread noise window
1144  *
1145  * It saves the context switch time for a noisy thread, and increments
1146  * the interference counters.
1147  */
1148 static void
1149 thread_entry(struct osnoise_variables *osn_var, struct task_struct *t)
1150 {
1151 	if (!osn_var->sampling)
1152 		return;
1153 	/*
1154 	 * The arrival time will be used in the report, but not to compute
1155 	 * the execution time, so it is safe to get it unsafe.
1156 	 */
1157 	osn_var->thread.arrival_time = time_get();
1158 
1159 	set_int_safe_time(osn_var, &osn_var->thread.delta_start);
1160 
1161 	osn_var->thread.count++;
1162 	local_inc(&osn_var->int_counter);
1163 }
1164 
1165 /*
1166  * thread_exit - Report the end of a thread noise window
1167  *
1168  * It computes the total noise from a thread, tracing if needed.
1169  */
1170 static void
1171 thread_exit(struct osnoise_variables *osn_var, struct task_struct *t)
1172 {
1173 	s64 duration;
1174 
1175 	if (!osn_var->sampling)
1176 		return;
1177 
1178 	if (unlikely(timerlat_enabled()))
1179 		if (!timerlat_thread_exit(osn_var))
1180 			return;
1181 
1182 	duration = get_int_safe_duration(osn_var, &osn_var->thread.delta_start);
1183 
1184 	trace_thread_noise(t, osn_var->thread.arrival_time, duration);
1185 
1186 	osn_var->thread.arrival_time = 0;
1187 }
1188 
1189 #ifdef CONFIG_TIMERLAT_TRACER
1190 /*
1191  * osnoise_stop_exception - Stop tracing and the tracer.
1192  */
1193 static __always_inline void osnoise_stop_exception(char *msg, int cpu)
1194 {
1195 	struct osnoise_instance *inst;
1196 	struct trace_array *tr;
1197 
1198 	rcu_read_lock();
1199 	list_for_each_entry_rcu(inst, &osnoise_instances, list) {
1200 		tr = inst->tr;
1201 		trace_array_printk(tr, _THIS_IP_,
1202 				   "stop tracing hit on cpu %d due to exception: %s\n",
1203 				   smp_processor_id(),
1204 				   msg);
1205 
1206 		if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options))
1207 			panic("tracer hit on cpu %d due to exception: %s\n",
1208 			      smp_processor_id(),
1209 			      msg);
1210 
1211 		tracer_tracing_off(tr);
1212 	}
1213 	rcu_read_unlock();
1214 }
1215 
1216 /*
1217  * trace_sched_migrate_callback - sched:sched_migrate_task trace event handler
1218  *
1219  * his function is hooked to the sched:sched_migrate_task trace event, and monitors
1220  * timerlat user-space thread migration.
1221  */
1222 static void trace_sched_migrate_callback(void *data, struct task_struct *p, int dest_cpu)
1223 {
1224 	struct osnoise_variables *osn_var;
1225 	long cpu = task_cpu(p);
1226 
1227 	osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu);
1228 	if (osn_var->pid == p->pid && dest_cpu != cpu) {
1229 		per_cpu_ptr(&per_cpu_timerlat_var, cpu)->uthread_migrate = 1;
1230 		osnoise_taint("timerlat user-thread migrated\n");
1231 		osnoise_stop_exception("timerlat user-thread migrated", cpu);
1232 	}
1233 }
1234 
1235 static bool monitor_enabled;
1236 
1237 static int register_migration_monitor(void)
1238 {
1239 	int ret = 0;
1240 
1241 	/*
1242 	 * Timerlat thread migration check is only required when running timerlat in user-space.
1243 	 * Thus, enable callback only if timerlat is set with no workload.
1244 	 */
1245 	if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options)) {
1246 		if (WARN_ON_ONCE(monitor_enabled))
1247 			return 0;
1248 
1249 		ret = register_trace_sched_migrate_task(trace_sched_migrate_callback, NULL);
1250 		if (!ret)
1251 			monitor_enabled = true;
1252 	}
1253 
1254 	return ret;
1255 }
1256 
1257 static void unregister_migration_monitor(void)
1258 {
1259 	if (!monitor_enabled)
1260 		return;
1261 
1262 	unregister_trace_sched_migrate_task(trace_sched_migrate_callback, NULL);
1263 	monitor_enabled = false;
1264 }
1265 #else
/* Without the timerlat tracer there is no migration monitor to manage. */
static int register_migration_monitor(void)
{
	return 0;
}

static void unregister_migration_monitor(void)
{
}
1271 #endif
1272 /*
1273  * trace_sched_switch - sched:sched_switch trace event handler
1274  *
1275  * This function is hooked to the sched:sched_switch trace event, and it is
1276  * used to record the beginning and to report the end of a thread noise window.
1277  */
1278 static void
1279 trace_sched_switch_callback(void *data, bool preempt,
1280 			    struct task_struct *p,
1281 			    struct task_struct *n,
1282 			    unsigned int prev_state)
1283 {
1284 	struct osnoise_variables *osn_var = this_cpu_osn_var();
1285 	int workload = test_bit(OSN_WORKLOAD, &osnoise_options);
1286 
1287 	if ((p->pid != osn_var->pid) || !workload)
1288 		thread_exit(osn_var, p);
1289 
1290 	if ((n->pid != osn_var->pid) || !workload)
1291 		thread_entry(osn_var, n);
1292 }
1293 
1294 /*
1295  * hook_thread_events - Hook the instrumentation for thread noise
1296  *
1297  * Hook the osnoise tracer callbacks to handle the noise from other
1298  * threads on the necessary kernel events.
1299  */
1300 static int hook_thread_events(void)
1301 {
1302 	int ret;
1303 
1304 	ret = register_trace_sched_switch(trace_sched_switch_callback, NULL);
1305 	if (ret)
1306 		return -EINVAL;
1307 
1308 	ret = register_migration_monitor();
1309 	if (ret)
1310 		goto out_unreg;
1311 
1312 	return 0;
1313 
1314 out_unreg:
1315 	unregister_trace_sched_switch(trace_sched_switch_callback, NULL);
1316 	return -EINVAL;
1317 }
1318 
1319 /*
1320  * unhook_thread_events - unhook the instrumentation for thread noise
1321  *
1322  * Unook the osnoise tracer callbacks to handle the noise from other
1323  * threads on the necessary kernel events.
1324  */
1325 static void unhook_thread_events(void)
1326 {
1327 	unregister_trace_sched_switch(trace_sched_switch_callback, NULL);
1328 	unregister_migration_monitor();
1329 }
1330 
1331 /*
1332  * save_osn_sample_stats - Save the osnoise_sample statistics
1333  *
1334  * Save the osnoise_sample statistics before the sampling phase. These
1335  * values will be used later to compute the diff betwneen the statistics
1336  * before and after the osnoise sampling.
1337  */
1338 static void
1339 save_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s)
1340 {
1341 	s->nmi_count = osn_var->nmi.count;
1342 	s->irq_count = osn_var->irq.count;
1343 	s->softirq_count = osn_var->softirq.count;
1344 	s->thread_count = osn_var->thread.count;
1345 }
1346 
1347 /*
1348  * diff_osn_sample_stats - Compute the osnoise_sample statistics
1349  *
1350  * After a sample period, compute the difference on the osnoise_sample
1351  * statistics. The struct osnoise_sample *s contains the statistics saved via
1352  * save_osn_sample_stats() before the osnoise sampling.
1353  */
1354 static void
1355 diff_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s)
1356 {
1357 	s->nmi_count = osn_var->nmi.count - s->nmi_count;
1358 	s->irq_count = osn_var->irq.count - s->irq_count;
1359 	s->softirq_count = osn_var->softirq.count - s->softirq_count;
1360 	s->thread_count = osn_var->thread.count - s->thread_count;
1361 }
1362 
1363 /*
1364  * osnoise_stop_tracing - Stop tracing and the tracer.
1365  */
1366 static __always_inline void osnoise_stop_tracing(void)
1367 {
1368 	struct osnoise_instance *inst;
1369 	struct trace_array *tr;
1370 
1371 	rcu_read_lock();
1372 	list_for_each_entry_rcu(inst, &osnoise_instances, list) {
1373 		tr = inst->tr;
1374 		trace_array_printk(tr, _THIS_IP_,
1375 				   "stop tracing hit on cpu %d\n", smp_processor_id());
1376 
1377 		if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options))
1378 			panic("tracer hit stop condition on CPU %d\n", smp_processor_id());
1379 
1380 		tracer_tracing_off(tr);
1381 	}
1382 	rcu_read_unlock();
1383 }
1384 
1385 /*
1386  * osnoise_has_tracing_on - Check if there is at least one instance on
1387  */
1388 static __always_inline int osnoise_has_tracing_on(void)
1389 {
1390 	struct osnoise_instance *inst;
1391 	int trace_is_on = 0;
1392 
1393 	rcu_read_lock();
1394 	list_for_each_entry_rcu(inst, &osnoise_instances, list)
1395 		trace_is_on += tracer_tracing_is_on(inst->tr);
1396 	rcu_read_unlock();
1397 
1398 	return trace_is_on;
1399 }
1400 
1401 /*
1402  * notify_new_max_latency - Notify a new max latency via fsnotify interface.
1403  */
1404 static void notify_new_max_latency(u64 latency)
1405 {
1406 	struct osnoise_instance *inst;
1407 	struct trace_array *tr;
1408 
1409 	rcu_read_lock();
1410 	list_for_each_entry_rcu(inst, &osnoise_instances, list) {
1411 		tr = inst->tr;
1412 		if (tracer_tracing_is_on(tr) && tr->max_latency < latency) {
1413 			tr->max_latency = latency;
1414 			latency_fsnotify(tr);
1415 		}
1416 	}
1417 	rcu_read_unlock();
1418 }
1419 
1420 /*
1421  * run_osnoise - Sample the time and look for osnoise
1422  *
1423  * Used to capture the time, looking for potential osnoise latency repeatedly.
1424  * Different from hwlat_detector, it is called with preemption and interrupts
1425  * enabled. This allows irqs, softirqs and threads to run, interfering on the
1426  * osnoise sampling thread, as they would do with a regular thread.
1427  */
1428 static int run_osnoise(void)
1429 {
1430 	bool disable_irq = test_bit(OSN_IRQ_DISABLE, &osnoise_options);
1431 	struct osnoise_variables *osn_var = this_cpu_osn_var();
1432 	u64 start, sample, last_sample;
1433 	u64 last_int_count, int_count;
1434 	s64 noise = 0, max_noise = 0;
1435 	s64 total, last_total = 0;
1436 	struct osnoise_sample s;
1437 	bool disable_preemption;
1438 	unsigned int threshold;
1439 	u64 runtime, stop_in;
1440 	u64 sum_noise = 0;
1441 	int hw_count = 0;
1442 	int ret = -1;
1443 
1444 	/*
1445 	 * Disabling preemption is only required if IRQs are enabled,
1446 	 * and the options is set on.
1447 	 */
1448 	disable_preemption = !disable_irq && test_bit(OSN_PREEMPT_DISABLE, &osnoise_options);
1449 
1450 	/*
1451 	 * Considers the current thread as the workload.
1452 	 */
1453 	osn_var->pid = current->pid;
1454 
1455 	/*
1456 	 * Save the current stats for the diff
1457 	 */
1458 	save_osn_sample_stats(osn_var, &s);
1459 
1460 	/*
1461 	 * if threshold is 0, use the default value of 1 us.
1462 	 */
1463 	threshold = tracing_thresh ? : 1000;
1464 
1465 	/*
1466 	 * Apply PREEMPT and IRQ disabled options.
1467 	 */
1468 	if (disable_irq)
1469 		local_irq_disable();
1470 
1471 	if (disable_preemption)
1472 		preempt_disable();
1473 
1474 	/*
1475 	 * Make sure NMIs see sampling first
1476 	 */
1477 	osn_var->sampling = true;
1478 	barrier();
1479 
1480 	/*
1481 	 * Transform the *_us config to nanoseconds to avoid the
1482 	 * division on the main loop.
1483 	 */
1484 	runtime = osnoise_data.sample_runtime * NSEC_PER_USEC;
1485 	stop_in = osnoise_data.stop_tracing * NSEC_PER_USEC;
1486 
1487 	/*
1488 	 * Start timestamp
1489 	 */
1490 	start = time_get();
1491 
1492 	/*
1493 	 * "previous" loop.
1494 	 */
1495 	last_int_count = set_int_safe_time(osn_var, &last_sample);
1496 
1497 	do {
1498 		/*
1499 		 * Get sample!
1500 		 */
1501 		int_count = set_int_safe_time(osn_var, &sample);
1502 
1503 		noise = time_sub(sample, last_sample);
1504 
1505 		/*
1506 		 * This shouldn't happen.
1507 		 */
1508 		if (noise < 0) {
1509 			osnoise_taint("negative noise!");
1510 			goto out;
1511 		}
1512 
1513 		/*
1514 		 * Sample runtime.
1515 		 */
1516 		total = time_sub(sample, start);
1517 
1518 		/*
1519 		 * Check for possible overflows.
1520 		 */
1521 		if (total < last_total) {
1522 			osnoise_taint("total overflow!");
1523 			break;
1524 		}
1525 
1526 		last_total = total;
1527 
1528 		if (noise >= threshold) {
1529 			int interference = int_count - last_int_count;
1530 
1531 			if (noise > max_noise)
1532 				max_noise = noise;
1533 
1534 			if (!interference)
1535 				hw_count++;
1536 
1537 			sum_noise += noise;
1538 
1539 			trace_sample_threshold(last_sample, noise, interference);
1540 
1541 			if (osnoise_data.stop_tracing)
1542 				if (noise > stop_in)
1543 					osnoise_stop_tracing();
1544 		}
1545 
1546 		/*
1547 		 * In some cases, notably when running on a nohz_full CPU with
1548 		 * a stopped tick PREEMPT_RCU or PREEMPT_LAZY have no way to
1549 		 * account for QSs. This will eventually cause unwarranted
1550 		 * noise as RCU forces preemption as the means of ending the
1551 		 * current grace period.  We avoid this by calling
1552 		 * rcu_momentary_eqs(), which performs a zero duration EQS
1553 		 * allowing RCU to end the current grace period. This call
1554 		 * shouldn't be wrapped inside an RCU critical section.
1555 		 *
1556 		 * Normally QSs for other cases are handled through cond_resched().
1557 		 * For simplicity, however, we call rcu_momentary_eqs() for all
1558 		 * configurations here.
1559 		 */
1560 		if (!disable_irq)
1561 			local_irq_disable();
1562 
1563 		rcu_momentary_eqs();
1564 
1565 		if (!disable_irq)
1566 			local_irq_enable();
1567 
1568 		/*
1569 		 * For the non-preemptive kernel config: let threads runs, if
1570 		 * they so wish, unless set not do to so.
1571 		 */
1572 		if (!disable_irq && !disable_preemption)
1573 			cond_resched();
1574 
1575 		last_sample = sample;
1576 		last_int_count = int_count;
1577 
1578 	} while (total < runtime && !kthread_should_stop());
1579 
1580 	/*
1581 	 * Finish the above in the view for interrupts.
1582 	 */
1583 	barrier();
1584 
1585 	osn_var->sampling = false;
1586 
1587 	/*
1588 	 * Make sure sampling data is no longer updated.
1589 	 */
1590 	barrier();
1591 
1592 	/*
1593 	 * Return to the preemptive state.
1594 	 */
1595 	if (disable_preemption)
1596 		preempt_enable();
1597 
1598 	if (disable_irq)
1599 		local_irq_enable();
1600 
1601 	/*
1602 	 * Save noise info.
1603 	 */
1604 	s.noise = time_to_us(sum_noise);
1605 	s.runtime = time_to_us(total);
1606 	s.max_sample = time_to_us(max_noise);
1607 	s.hw_count = hw_count;
1608 
1609 	/* Save interference stats info */
1610 	diff_osn_sample_stats(osn_var, &s);
1611 
1612 	record_osnoise_sample(&s);
1613 
1614 	notify_new_max_latency(max_noise);
1615 
1616 	if (osnoise_data.stop_tracing_total)
1617 		if (s.noise > osnoise_data.stop_tracing_total)
1618 			osnoise_stop_tracing();
1619 
1620 	return 0;
1621 out:
1622 	return ret;
1623 }
1624 
1625 static struct cpumask osnoise_cpumask;
1626 static struct cpumask save_cpumask;
1627 static struct cpumask kthread_cpumask;
1628 
1629 /*
1630  * osnoise_sleep - sleep until the next period
1631  */
1632 static void osnoise_sleep(bool skip_period)
1633 {
1634 	u64 interval;
1635 	ktime_t wake_time;
1636 
1637 	mutex_lock(&interface_lock);
1638 	if (skip_period)
1639 		interval = osnoise_data.sample_period;
1640 	else
1641 		interval = osnoise_data.sample_period - osnoise_data.sample_runtime;
1642 	mutex_unlock(&interface_lock);
1643 
1644 	/*
1645 	 * differently from hwlat_detector, the osnoise tracer can run
1646 	 * without a pause because preemption is on.
1647 	 */
1648 	if (!interval) {
1649 		/* Let synchronize_rcu_tasks() make progress */
1650 		cond_resched_tasks_rcu_qs();
1651 		return;
1652 	}
1653 
1654 	wake_time = ktime_add_us(ktime_get(), interval);
1655 	__set_current_state(TASK_INTERRUPTIBLE);
1656 
1657 	while (schedule_hrtimeout(&wake_time, HRTIMER_MODE_ABS)) {
1658 		if (kthread_should_stop())
1659 			break;
1660 	}
1661 }
1662 
1663 /*
1664  * osnoise_migration_pending - checks if the task needs to migrate
1665  *
1666  * osnoise/timerlat threads are per-cpu. If there is a pending request to
1667  * migrate the thread away from the current CPU, something bad has happened.
1668  * Play the good citizen and leave.
1669  *
1670  * Returns 0 if it is safe to continue, 1 otherwise.
1671  */
1672 static inline int osnoise_migration_pending(void)
1673 {
1674 	if (!current->migration_pending)
1675 		return 0;
1676 
1677 	/*
1678 	 * If migration is pending, there is a task waiting for the
1679 	 * tracer to enable migration. The tracer does not allow migration,
1680 	 * thus: taint and leave to unblock the blocked thread.
1681 	 */
1682 	osnoise_taint("migration requested to osnoise threads, leaving.");
1683 
1684 	/*
1685 	 * Unset this thread from the threads managed by the interface.
1686 	 * The tracers are responsible for cleaning their env before
1687 	 * exiting.
1688 	 */
1689 	mutex_lock(&interface_lock);
1690 	this_cpu_osn_var()->kthread = NULL;
1691 	cpumask_clear_cpu(smp_processor_id(), &kthread_cpumask);
1692 	mutex_unlock(&interface_lock);
1693 
1694 	return 1;
1695 }
1696 
1697 /*
1698  * osnoise_main - The osnoise detection kernel thread
1699  *
1700  * Calls run_osnoise() function to measure the osnoise for the configured runtime,
1701  * every period.
1702  */
1703 static int osnoise_main(void *data)
1704 {
1705 	unsigned long flags;
1706 
1707 	/*
1708 	 * This thread was created pinned to the CPU using PF_NO_SETAFFINITY.
1709 	 * The problem is that cgroup does not allow PF_NO_SETAFFINITY thread.
1710 	 *
1711 	 * To work around this limitation, disable migration and remove the
1712 	 * flag.
1713 	 */
1714 	migrate_disable();
1715 	raw_spin_lock_irqsave(&current->pi_lock, flags);
1716 	current->flags &= ~(PF_NO_SETAFFINITY);
1717 	raw_spin_unlock_irqrestore(&current->pi_lock, flags);
1718 
1719 	while (!kthread_should_stop()) {
1720 		if (osnoise_migration_pending())
1721 			break;
1722 
1723 		/* skip a period if tracing is off on all instances */
1724 		if (!osnoise_has_tracing_on()) {
1725 			osnoise_sleep(true);
1726 			continue;
1727 		}
1728 
1729 		run_osnoise();
1730 		osnoise_sleep(false);
1731 	}
1732 
1733 	migrate_enable();
1734 	return 0;
1735 }
1736 
1737 #ifdef CONFIG_TIMERLAT_TRACER
1738 /*
1739  * timerlat_irq - hrtimer handler for timerlat.
1740  */
1741 static enum hrtimer_restart timerlat_irq(struct hrtimer *timer)
1742 {
1743 	struct osnoise_variables *osn_var = this_cpu_osn_var();
1744 	struct timerlat_variables *tlat;
1745 	struct timerlat_sample s;
1746 	u64 now;
1747 	u64 diff;
1748 
1749 	/*
1750 	 * I am not sure if the timer was armed for this CPU. So, get
1751 	 * the timerlat struct from the timer itself, not from this
1752 	 * CPU.
1753 	 */
1754 	tlat = container_of(timer, struct timerlat_variables, timer);
1755 
1756 	now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer));
1757 
1758 	/*
1759 	 * Enable the osnoise: events for thread an softirq.
1760 	 */
1761 	tlat->tracing_thread = true;
1762 
1763 	osn_var->thread.arrival_time = time_get();
1764 
1765 	/*
1766 	 * A hardirq is running: the timer IRQ. It is for sure preempting
1767 	 * a thread, and potentially preempting a softirq.
1768 	 *
1769 	 * At this point, it is not interesting to know the duration of the
1770 	 * preempted thread (and maybe softirq), but how much time they will
1771 	 * delay the beginning of the execution of the timer thread.
1772 	 *
1773 	 * To get the correct (net) delay added by the softirq, its delta_start
1774 	 * is set as the IRQ one. In this way, at the return of the IRQ, the delta
1775 	 * start of the sofitrq will be zeroed, accounting then only the time
1776 	 * after that.
1777 	 *
1778 	 * The thread follows the same principle. However, if a softirq is
1779 	 * running, the thread needs to receive the softirq delta_start. The
1780 	 * reason being is that the softirq will be the last to be unfolded,
1781 	 * resseting the thread delay to zero.
1782 	 *
1783 	 * The PREEMPT_RT is a special case, though. As softirqs run as threads
1784 	 * on RT, moving the thread is enough.
1785 	 */
1786 	if (!IS_ENABLED(CONFIG_PREEMPT_RT) && osn_var->softirq.delta_start) {
1787 		copy_int_safe_time(osn_var, &osn_var->thread.delta_start,
1788 				   &osn_var->softirq.delta_start);
1789 
1790 		copy_int_safe_time(osn_var, &osn_var->softirq.delta_start,
1791 				    &osn_var->irq.delta_start);
1792 	} else {
1793 		copy_int_safe_time(osn_var, &osn_var->thread.delta_start,
1794 				    &osn_var->irq.delta_start);
1795 	}
1796 
1797 	/*
1798 	 * Compute the current time with the expected time.
1799 	 */
1800 	diff = now - tlat->abs_period;
1801 
1802 	tlat->count++;
1803 	s.seqnum = tlat->count;
1804 	s.timer_latency = diff;
1805 	s.context = IRQ_CONTEXT;
1806 
1807 	record_timerlat_sample(&s);
1808 
1809 	if (osnoise_data.stop_tracing) {
1810 		if (time_to_us(diff) >= osnoise_data.stop_tracing) {
1811 
1812 			/*
1813 			 * At this point, if stop_tracing is set and <= print_stack,
1814 			 * print_stack is set and would be printed in the thread handler.
1815 			 *
1816 			 * Thus, print the stack trace as it is helpful to define the
1817 			 * root cause of an IRQ latency.
1818 			 */
1819 			if (osnoise_data.stop_tracing <= osnoise_data.print_stack) {
1820 				timerlat_save_stack(0);
1821 				timerlat_dump_stack(time_to_us(diff));
1822 			}
1823 
1824 			osnoise_stop_tracing();
1825 			notify_new_max_latency(diff);
1826 
1827 			wake_up_process(tlat->kthread);
1828 
1829 			return HRTIMER_NORESTART;
1830 		}
1831 	}
1832 
1833 	wake_up_process(tlat->kthread);
1834 
1835 	if (osnoise_data.print_stack)
1836 		timerlat_save_stack(0);
1837 
1838 	return HRTIMER_NORESTART;
1839 }
1840 
1841 /*
1842  * wait_next_period - Wait for the next period for timerlat
1843  */
1844 static int wait_next_period(struct timerlat_variables *tlat)
1845 {
1846 	ktime_t next_abs_period, now;
1847 	u64 rel_period = osnoise_data.timerlat_period * 1000;
1848 
1849 	now = hrtimer_cb_get_time(&tlat->timer);
1850 	next_abs_period = ns_to_ktime(tlat->abs_period + rel_period);
1851 
1852 	/*
1853 	 * Save the next abs_period.
1854 	 */
1855 	tlat->abs_period = (u64) ktime_to_ns(next_abs_period);
1856 
1857 	/*
1858 	 * Align thread in the first cycle on each CPU to the set alignment
1859 	 * if TIMERLAT_ALIGN is set.
1860 	 *
1861 	 * This is done by using an atomic64_t to store the next absolute period.
1862 	 * The first thread that wakes up will set the atomic64_t to its
1863 	 * absolute period, and the other threads will increment it by
1864 	 * the alignment value.
1865 	 */
1866 	if (test_bit(OSN_TIMERLAT_ALIGN, &osnoise_options) && !tlat->count
1867 	    && atomic64_cmpxchg_relaxed(&align_next, 0, tlat->abs_period)) {
1868 		/*
1869 		 * A thread has already set align_next, use it and increment it
1870 		 * to be used by the next thread that wakes up after this one.
1871 		 */
1872 		tlat->abs_period = atomic64_add_return_relaxed(
1873 			osnoise_data.timerlat_align_us * 1000, &align_next);
1874 		next_abs_period = ns_to_ktime(tlat->abs_period);
1875 	}
1876 
1877 	/*
1878 	 * If the new abs_period is in the past, skip the activation.
1879 	 */
1880 	while (ktime_compare(now, next_abs_period) > 0) {
1881 		next_abs_period = ns_to_ktime(tlat->abs_period + rel_period);
1882 		tlat->abs_period = (u64) ktime_to_ns(next_abs_period);
1883 	}
1884 
1885 	set_current_state(TASK_INTERRUPTIBLE);
1886 
1887 	hrtimer_start(&tlat->timer, next_abs_period, HRTIMER_MODE_ABS_PINNED_HARD);
1888 	schedule();
1889 	return 1;
1890 }
1891 
1892 /*
1893  * timerlat_main- Timerlat main
1894  */
1895 static int timerlat_main(void *data)
1896 {
1897 	struct osnoise_variables *osn_var = this_cpu_osn_var();
1898 	struct timerlat_variables *tlat = this_cpu_tmr_var();
1899 	struct timerlat_sample s;
1900 	struct sched_param sp;
1901 	unsigned long flags;
1902 	u64 now, diff;
1903 
1904 	/*
1905 	 * Make the thread RT, that is how cyclictest is usually used.
1906 	 */
1907 	sp.sched_priority = DEFAULT_TIMERLAT_PRIO;
1908 	sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1909 
1910 	/*
1911 	 * This thread was created pinned to the CPU using PF_NO_SETAFFINITY.
1912 	 * The problem is that cgroup does not allow PF_NO_SETAFFINITY thread.
1913 	 *
1914 	 * To work around this limitation, disable migration and remove the
1915 	 * flag.
1916 	 */
1917 	migrate_disable();
1918 	raw_spin_lock_irqsave(&current->pi_lock, flags);
1919 	current->flags &= ~(PF_NO_SETAFFINITY);
1920 	raw_spin_unlock_irqrestore(&current->pi_lock, flags);
1921 
1922 	tlat->count = 0;
1923 	tlat->tracing_thread = false;
1924 
1925 	hrtimer_setup(&tlat->timer, timerlat_irq, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
1926 	tlat->kthread = current;
1927 	osn_var->pid = current->pid;
1928 	/*
1929 	 * Annotate the arrival time.
1930 	 */
1931 	tlat->abs_period = hrtimer_cb_get_time(&tlat->timer);
1932 
1933 	wait_next_period(tlat);
1934 
1935 	osn_var->sampling = 1;
1936 
1937 	while (!kthread_should_stop()) {
1938 
1939 		now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer));
1940 		diff = now - tlat->abs_period;
1941 
1942 		s.seqnum = tlat->count;
1943 		s.timer_latency = diff;
1944 		s.context = THREAD_CONTEXT;
1945 
1946 		record_timerlat_sample(&s);
1947 
1948 		notify_new_max_latency(diff);
1949 
1950 		timerlat_dump_stack(time_to_us(diff));
1951 
1952 		tlat->tracing_thread = false;
1953 		if (osnoise_data.stop_tracing_total)
1954 			if (time_to_us(diff) >= osnoise_data.stop_tracing_total)
1955 				osnoise_stop_tracing();
1956 
1957 		if (osnoise_migration_pending())
1958 			break;
1959 
1960 		wait_next_period(tlat);
1961 	}
1962 
1963 	hrtimer_cancel(&tlat->timer);
1964 	migrate_enable();
1965 	return 0;
1966 }
1967 #else /* CONFIG_TIMERLAT_TRACER */
/* Stub used when the timerlat tracer is not built: does nothing. */
static int timerlat_main(void *data)
{
	return 0;
}
1972 #endif /* CONFIG_TIMERLAT_TRACER */
1973 
1974 /*
1975  * stop_kthread - stop a workload thread
1976  */
1977 static void stop_kthread(unsigned int cpu)
1978 {
1979 	struct task_struct *kthread;
1980 
1981 	kthread = xchg_relaxed(&(per_cpu(per_cpu_osnoise_var, cpu).kthread), NULL);
1982 	if (kthread) {
1983 		if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask) &&
1984 		    !WARN_ON(!test_bit(OSN_WORKLOAD, &osnoise_options))) {
1985 			kthread_stop(kthread);
1986 		} else if (!WARN_ON(test_bit(OSN_WORKLOAD, &osnoise_options))) {
1987 			/*
1988 			 * This is a user thread waiting on the timerlat_fd. We need
1989 			 * to close all users, and the best way to guarantee this is
1990 			 * by killing the thread. NOTE: this is a purpose specific file.
1991 			 */
1992 			kill_pid(kthread->thread_pid, SIGKILL, 1);
1993 			put_task_struct(kthread);
1994 		}
1995 	} else {
1996 		/* if no workload, just return */
1997 		if (!test_bit(OSN_WORKLOAD, &osnoise_options)) {
1998 			/*
1999 			 * This is set in the osnoise tracer case.
2000 			 */
2001 			per_cpu(per_cpu_osnoise_var, cpu).sampling = false;
2002 			barrier();
2003 		}
2004 	}
2005 }
2006 
2007 /*
2008  * stop_per_cpu_kthread - Stop per-cpu threads
2009  *
2010  * Stop the osnoise sampling htread. Use this on unload and at system
2011  * shutdown.
2012  */
2013 static void stop_per_cpu_kthreads(void)
2014 {
2015 	int cpu;
2016 
2017 	cpus_read_lock();
2018 
2019 	for_each_online_cpu(cpu)
2020 		stop_kthread(cpu);
2021 
2022 	cpus_read_unlock();
2023 }
2024 
2025 /*
2026  * start_kthread - Start a workload thread
2027  */
2028 static int start_kthread(unsigned int cpu)
2029 {
2030 	struct task_struct *kthread;
2031 	void *main = osnoise_main;
2032 	char comm[24];
2033 
2034 	/* Do not start a new thread if it is already running */
2035 	if (per_cpu(per_cpu_osnoise_var, cpu).kthread)
2036 		return 0;
2037 
2038 	if (timerlat_enabled()) {
2039 		snprintf(comm, 24, "timerlat/%d", cpu);
2040 		main = timerlat_main;
2041 	} else {
2042 		/* if no workload, just return */
2043 		if (!test_bit(OSN_WORKLOAD, &osnoise_options)) {
2044 			per_cpu(per_cpu_osnoise_var, cpu).sampling = true;
2045 			barrier();
2046 			return 0;
2047 		}
2048 		snprintf(comm, 24, "osnoise/%d", cpu);
2049 	}
2050 
2051 	kthread = kthread_run_on_cpu(main, NULL, cpu, comm);
2052 
2053 	if (IS_ERR(kthread)) {
2054 		pr_err(BANNER "could not start sampling thread\n");
2055 		return -ENOMEM;
2056 	}
2057 
2058 	per_cpu(per_cpu_osnoise_var, cpu).kthread = kthread;
2059 	cpumask_set_cpu(cpu, &kthread_cpumask);
2060 
2061 	return 0;
2062 }
2063 
2064 /*
2065  * start_per_cpu_kthread - Kick off per-cpu osnoise sampling kthreads
2066  *
2067  * This starts the kernel thread that will look for osnoise on many
2068  * cpus.
2069  */
2070 static int start_per_cpu_kthreads(void)
2071 {
2072 	struct cpumask *current_mask = &save_cpumask;
2073 	int retval = 0;
2074 	int cpu;
2075 
2076 	if (!test_bit(OSN_WORKLOAD, &osnoise_options)) {
2077 		if (timerlat_enabled())
2078 			return 0;
2079 	}
2080 
2081 	cpus_read_lock();
2082 	/*
2083 	 * Run only on online CPUs in which osnoise is allowed to run.
2084 	 */
2085 	cpumask_and(current_mask, cpu_online_mask, &osnoise_cpumask);
2086 
2087 	for_each_possible_cpu(cpu) {
2088 		if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask)) {
2089 			struct task_struct *kthread;
2090 
2091 			kthread = xchg_relaxed(&(per_cpu(per_cpu_osnoise_var, cpu).kthread), NULL);
2092 			if (!WARN_ON(!kthread))
2093 				kthread_stop(kthread);
2094 		}
2095 	}
2096 
2097 	for_each_cpu(cpu, current_mask) {
2098 		retval = start_kthread(cpu);
2099 		if (retval) {
2100 			cpus_read_unlock();
2101 			stop_per_cpu_kthreads();
2102 			return retval;
2103 		}
2104 	}
2105 
2106 	cpus_read_unlock();
2107 
2108 	return retval;
2109 }
2110 
2111 #ifdef CONFIG_HOTPLUG_CPU
2112 static void osnoise_hotplug_workfn(struct work_struct *dummy)
2113 {
2114 	unsigned int cpu = smp_processor_id();
2115 
2116 	guard(mutex)(&trace_types_lock);
2117 
2118 	if (!osnoise_has_registered_instances())
2119 		return;
2120 
2121 	guard(cpus_read_lock)();
2122 	guard(mutex)(&interface_lock);
2123 
2124 	if (!cpu_online(cpu))
2125 		return;
2126 
2127 	if (!cpumask_test_cpu(cpu, &osnoise_cpumask))
2128 		return;
2129 
2130 	start_kthread(cpu);
2131 }
2132 
2133 static DECLARE_WORK(osnoise_hotplug_work, osnoise_hotplug_workfn);
2134 
/*
 * osnoise_cpu_init - CPU hotplug online callback function
 * @cpu: the CPU that came online
 *
 * Queues osnoise_hotplug_work on @cpu; the thread itself is started later
 * by osnoise_hotplug_workfn(). Always returns 0.
 */
static int osnoise_cpu_init(unsigned int cpu)
{
	schedule_work_on(cpu, &osnoise_hotplug_work);
	return 0;
}
2143 
/*
 * osnoise_cpu_die - CPU hotplug offline callback function
 * @cpu: the CPU that is going offline
 *
 * Calls stop_kthread() for @cpu. Always returns 0.
 */
static int osnoise_cpu_die(unsigned int cpu)
{
	stop_kthread(cpu);
	return 0;
}
2152 
2153 static void osnoise_init_hotplug_support(void)
2154 {
2155 	int ret;
2156 
2157 	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "trace/osnoise:online",
2158 				osnoise_cpu_init, osnoise_cpu_die);
2159 	if (ret < 0)
2160 		pr_warn(BANNER "Error to init cpu hotplug support\n");
2161 
2162 	return;
2163 }
2164 #else /* CONFIG_HOTPLUG_CPU */
/* Without CONFIG_HOTPLUG_CPU there are no hotplug callbacks to register. */
static void osnoise_init_hotplug_support(void)
{
	return;
}
2169 #endif /* CONFIG_HOTPLUG_CPU */
2170 
2171 /*
2172  * seq file functions for the osnoise/options file.
2173  */
2174 static void *s_options_start(struct seq_file *s, loff_t *pos)
2175 {
2176 	int option = *pos;
2177 
2178 	mutex_lock(&interface_lock);
2179 
2180 	if (option >= OSN_MAX)
2181 		return NULL;
2182 
2183 	return pos;
2184 }
2185 
2186 static void *s_options_next(struct seq_file *s, void *v, loff_t *pos)
2187 {
2188 	int option = ++(*pos);
2189 
2190 	if (option >= OSN_MAX)
2191 		return NULL;
2192 
2193 	return pos;
2194 }
2195 
2196 static int s_options_show(struct seq_file *s, void *v)
2197 {
2198 	loff_t *pos = v;
2199 	int option = *pos;
2200 
2201 	if (option == OSN_DEFAULTS) {
2202 		if (osnoise_options == OSN_DEFAULT_OPTIONS)
2203 			seq_printf(s, "%s", osnoise_options_str[option]);
2204 		else
2205 			seq_printf(s, "NO_%s", osnoise_options_str[option]);
2206 		goto out;
2207 	}
2208 
2209 	if (test_bit(option, &osnoise_options))
2210 		seq_printf(s, "%s", osnoise_options_str[option]);
2211 	else
2212 		seq_printf(s, "NO_%s", osnoise_options_str[option]);
2213 
2214 out:
2215 	if (option != OSN_MAX)
2216 		seq_puts(s, " ");
2217 
2218 	return 0;
2219 }
2220 
/*
 * s_options_stop - finish the iteration
 *
 * Terminates the output line and drops interface_lock, which was taken
 * in s_options_start().
 */
static void s_options_stop(struct seq_file *s, void *v)
{
	seq_puts(s, "\n");
	mutex_unlock(&interface_lock);
}
2226 
/* seq_file iterator used to read the osnoise/options file. */
static const struct seq_operations osnoise_options_seq_ops = {
	.start		= s_options_start,
	.next		= s_options_next,
	.show		= s_options_show,
	.stop		= s_options_stop
};
2233 
2234 static int osnoise_options_open(struct inode *inode, struct file *file)
2235 {
2236 	return seq_open(file, &osnoise_options_seq_ops);
2237 };
2238 
2239 /**
2240  * osnoise_options_write - Write function for "options" entry
2241  * @filp: The active open file structure
2242  * @ubuf: The user buffer that contains the value to write
2243  * @cnt: The maximum number of bytes to write to "file"
2244  * @ppos: The current position in @file
2245  *
2246  * Writing the option name sets the option, writing the "NO_"
2247  * prefix in front of the option name disables it.
2248  *
2249  * Writing "DEFAULTS" resets the option values to the default ones.
2250  */
2251 static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf,
2252 				     size_t cnt, loff_t *ppos)
2253 {
2254 	int running, option, enable, retval;
2255 	char buf[256], *option_str;
2256 
2257 	if (cnt >= 256)
2258 		return -EINVAL;
2259 
2260 	if (copy_from_user(buf, ubuf, cnt))
2261 		return -EFAULT;
2262 
2263 	buf[cnt] = 0;
2264 
2265 	if (strncmp(buf, "NO_", 3)) {
2266 		option_str = strstrip(buf);
2267 		enable = true;
2268 	} else {
2269 		option_str = strstrip(&buf[3]);
2270 		enable = false;
2271 	}
2272 
2273 	option = match_string(osnoise_options_str, OSN_MAX, option_str);
2274 	if (option < 0)
2275 		return -EINVAL;
2276 
2277 	/*
2278 	 * trace_types_lock is taken to avoid concurrency on start/stop.
2279 	 */
2280 	mutex_lock(&trace_types_lock);
2281 	running = osnoise_has_registered_instances();
2282 	if (running)
2283 		stop_per_cpu_kthreads();
2284 
2285 	/*
2286 	 * avoid CPU hotplug operations that might read options.
2287 	 */
2288 	cpus_read_lock();
2289 	mutex_lock(&interface_lock);
2290 
2291 	retval = cnt;
2292 
2293 	if (enable) {
2294 		if (option == OSN_DEFAULTS)
2295 			osnoise_options = OSN_DEFAULT_OPTIONS;
2296 		else
2297 			set_bit(option, &osnoise_options);
2298 	} else {
2299 		if (option == OSN_DEFAULTS)
2300 			retval = -EINVAL;
2301 		else
2302 			clear_bit(option, &osnoise_options);
2303 	}
2304 
2305 	mutex_unlock(&interface_lock);
2306 	cpus_read_unlock();
2307 
2308 	if (running)
2309 		start_per_cpu_kthreads();
2310 	mutex_unlock(&trace_types_lock);
2311 
2312 	return retval;
2313 }
2314 
2315 /*
2316  * osnoise_cpus_read - Read function for reading the "cpus" file
2317  * @filp: The active open file structure
2318  * @ubuf: The userspace provided buffer to read value into
2319  * @cnt: The maximum number of bytes to read
2320  * @ppos: The current "file" position
2321  *
2322  * Prints the "cpus" output into the user-provided buffer.
2323  */
2324 static ssize_t
2325 osnoise_cpus_read(struct file *filp, char __user *ubuf, size_t count,
2326 		  loff_t *ppos)
2327 {
2328 	char *mask_str __free(kfree) = NULL;
2329 	int len;
2330 
2331 	guard(mutex)(&interface_lock);
2332 
2333 	len = snprintf(NULL, 0, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask)) + 1;
2334 	mask_str = kmalloc(len, GFP_KERNEL);
2335 	if (!mask_str)
2336 		return -ENOMEM;
2337 
2338 	len = snprintf(mask_str, len, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask));
2339 	if (len >= count)
2340 		return -EINVAL;
2341 
2342 	count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len);
2343 
2344 	return count;
2345 }
2346 
2347 /*
2348  * osnoise_cpus_write - Write function for "cpus" entry
2349  * @filp: The active open file structure
2350  * @ubuf: The user buffer that contains the value to write
2351  * @count: The maximum number of bytes to write to "file"
2352  * @ppos: The current position in @file
2353  *
2354  * This function provides a write implementation for the "cpus"
2355  * interface to the osnoise trace. By default, it lists all  CPUs,
2356  * in this way, allowing osnoise threads to run on any online CPU
2357  * of the system. It serves to restrict the execution of osnoise to the
2358  * set of CPUs writing via this interface. Why not use "tracing_cpumask"?
2359  * Because the user might be interested in tracing what is running on
2360  * other CPUs. For instance, one might run osnoise in one HT CPU
2361  * while observing what is running on the sibling HT CPU.
2362  */
2363 static ssize_t
2364 osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count,
2365 		   loff_t *ppos)
2366 {
2367 	cpumask_var_t osnoise_cpumask_new;
2368 	int running, err;
2369 	char *buf __free(kfree) = NULL;
2370 
2371 	if (count < 1)
2372 		return 0;
2373 
2374 	buf = memdup_user_nul(ubuf, count);
2375 	if (IS_ERR(buf))
2376 		return PTR_ERR(buf);
2377 
2378 	if (!zalloc_cpumask_var(&osnoise_cpumask_new, GFP_KERNEL))
2379 		return -ENOMEM;
2380 
2381 	err = cpulist_parse(buf, osnoise_cpumask_new);
2382 	if (err)
2383 		goto err_free;
2384 
2385 	/*
2386 	 * trace_types_lock is taken to avoid concurrency on start/stop.
2387 	 */
2388 	mutex_lock(&trace_types_lock);
2389 	running = osnoise_has_registered_instances();
2390 	if (running)
2391 		stop_per_cpu_kthreads();
2392 
2393 	/*
2394 	 * osnoise_cpumask is read by CPU hotplug operations.
2395 	 */
2396 	cpus_read_lock();
2397 	mutex_lock(&interface_lock);
2398 
2399 	cpumask_copy(&osnoise_cpumask, osnoise_cpumask_new);
2400 
2401 	mutex_unlock(&interface_lock);
2402 	cpus_read_unlock();
2403 
2404 	if (running)
2405 		start_per_cpu_kthreads();
2406 	mutex_unlock(&trace_types_lock);
2407 
2408 	free_cpumask_var(osnoise_cpumask_new);
2409 	return count;
2410 
2411 err_free:
2412 	free_cpumask_var(osnoise_cpumask_new);
2413 
2414 	return err;
2415 }
2416 
2417 #ifdef CONFIG_TIMERLAT_TRACER
2418 static int timerlat_fd_open(struct inode *inode, struct file *file)
2419 {
2420 	struct osnoise_variables *osn_var;
2421 	struct timerlat_variables *tlat;
2422 	long cpu = (long) inode->i_cdev;
2423 
2424 	mutex_lock(&interface_lock);
2425 
2426 	/*
2427 	 * This file is accessible only if timerlat is enabled, and
2428 	 * NO_OSNOISE_WORKLOAD is set.
2429 	 */
2430 	if (!timerlat_enabled() || test_bit(OSN_WORKLOAD, &osnoise_options)) {
2431 		mutex_unlock(&interface_lock);
2432 		return -EINVAL;
2433 	}
2434 
2435 	migrate_disable();
2436 
2437 	osn_var = this_cpu_osn_var();
2438 
2439 	/*
2440 	 * The osn_var->pid holds the single access to this file.
2441 	 */
2442 	if (osn_var->pid) {
2443 		mutex_unlock(&interface_lock);
2444 		migrate_enable();
2445 		return -EBUSY;
2446 	}
2447 
2448 	/*
2449 	 * timerlat tracer is a per-cpu tracer. Check if the user-space too
2450 	 * is pinned to a single CPU. The tracer laters monitor if the task
2451 	 * migrates and then disables tracer if it does. However, it is
2452 	 * worth doing this basic acceptance test to avoid obviusly wrong
2453 	 * setup.
2454 	 */
2455 	if (current->nr_cpus_allowed > 1 ||  cpu != smp_processor_id()) {
2456 		mutex_unlock(&interface_lock);
2457 		migrate_enable();
2458 		return -EPERM;
2459 	}
2460 
2461 	/*
2462 	 * From now on, it is good to go.
2463 	 */
2464 	file->private_data = inode->i_cdev;
2465 
2466 	get_task_struct(current);
2467 
2468 	osn_var->kthread = current;
2469 	osn_var->pid = current->pid;
2470 
2471 	/*
2472 	 * Setup is done.
2473 	 */
2474 	mutex_unlock(&interface_lock);
2475 
2476 	tlat = this_cpu_tmr_var();
2477 	tlat->count = 0;
2478 
2479 	hrtimer_setup(&tlat->timer, timerlat_irq, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
2480 
2481 	migrate_enable();
2482 	return 0;
2483 };
2484 
/*
 * timerlat_fd_read - Read function for "timerlat_fd" file
 * @file: The active open file structure
 * @ubuf: The userspace provided buffer (not written by this function)
 * @count: The maximum number of bytes to read (unused)
 * @ppos: The current "file" position (unused)
 *
 * A read() is one timerlat cycle for a user-space workload: it records
 * the latency of the return from the previous wakeup (THREAD_URET), waits
 * for the next period, and records the latency of the new wakeup
 * (THREAD_CONTEXT).
 *
 * Returns 0 after a cycle (also when woken by something other than the
 * timer), or -EINVAL if the task is no longer on the CPU the file belongs
 * to, or was flagged as migrated before.
 */
static ssize_t
timerlat_fd_read(struct file *file, char __user *ubuf, size_t count,
		  loff_t *ppos)
{
	long cpu = (long) file->private_data;
	struct osnoise_variables *osn_var;
	struct timerlat_variables *tlat;
	struct timerlat_sample s;
	s64 diff;
	u64 now;

	migrate_disable();

	tlat = this_cpu_tmr_var();

	/*
	 * While in user-space, the thread is migratable. There is nothing
	 * we can do about it.
	 * So, if the thread is running on another CPU, stop the machinery.
	 */
	if (cpu == smp_processor_id()) {
		if (tlat->uthread_migrate) {
			migrate_enable();
			return -EINVAL;
		}
	} else {
		/* Flag the CPU the file belongs to, not the one we run on. */
		per_cpu_ptr(&per_cpu_timerlat_var, cpu)->uthread_migrate = 1;
		osnoise_taint("timerlat user thread migrate\n");
		osnoise_stop_tracing();
		migrate_enable();
		return -EINVAL;
	}

	osn_var = this_cpu_osn_var();

	/*
	 * The timerlat in user-space runs in a different order:
	 * the read() starts from the execution of the previous occurrence,
	 * sleeping for the next occurrence.
	 *
	 * So, skip if we are entering on read() before the first wakeup
	 * from timerlat IRQ:
	 */
	if (likely(osn_var->sampling)) {
		now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer));
		diff = now - tlat->abs_period;

		/*
		 * it was not a timer firing, but some other signal?
		 */
		if (diff < 0)
			goto out;

		s.seqnum = tlat->count;
		s.timer_latency = diff;
		s.context = THREAD_URET;

		record_timerlat_sample(&s);

		notify_new_max_latency(diff);

		tlat->tracing_thread = false;
		if (osnoise_data.stop_tracing_total) {
			if (time_to_us(diff) >= osnoise_data.stop_tracing_total) {
				timerlat_dump_stack(time_to_us(diff));
				osnoise_stop_tracing();
			}
		}
	} else {
		/* First read() on this file: set up the state and start sampling. */
		tlat->tracing_thread = false;
		tlat->kthread = current;

		/* Annotate now to drift new period */
		tlat->abs_period = hrtimer_cb_get_time(&tlat->timer);

		osn_var->sampling = 1;
	}

	/* wait for the next period */
	wait_next_period(tlat);

	/* This is the wakeup from this cycle */
	now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer));
	diff = now - tlat->abs_period;

	/*
	 * it was not a timer firing, but some other signal?
	 */
	if (diff < 0)
		goto out;

	s.seqnum = tlat->count;
	s.timer_latency = diff;
	s.context = THREAD_CONTEXT;

	record_timerlat_sample(&s);

	if (osnoise_data.stop_tracing_total) {
		if (time_to_us(diff) >= osnoise_data.stop_tracing_total) {
			timerlat_dump_stack(time_to_us(diff));
			notify_new_max_latency(diff);
			osnoise_stop_tracing();
		}
	}

out:
	migrate_enable();
	return 0;
}
2603 
2604 static int timerlat_fd_release(struct inode *inode, struct file *file)
2605 {
2606 	struct osnoise_variables *osn_var;
2607 	struct timerlat_variables *tlat_var;
2608 	long cpu = (long) file->private_data;
2609 
2610 	migrate_disable();
2611 	mutex_lock(&interface_lock);
2612 
2613 	osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu);
2614 	tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu);
2615 
2616 	if (tlat_var->kthread)
2617 		hrtimer_cancel(&tlat_var->timer);
2618 	memset(tlat_var, 0, sizeof(*tlat_var));
2619 
2620 	osn_var->sampling = 0;
2621 	osn_var->pid = 0;
2622 
2623 	/*
2624 	 * We are leaving, not being stopped... see stop_kthread();
2625 	 */
2626 	if (osn_var->kthread) {
2627 		put_task_struct(osn_var->kthread);
2628 		osn_var->kthread = NULL;
2629 	}
2630 
2631 	mutex_unlock(&interface_lock);
2632 	migrate_enable();
2633 	return 0;
2634 }
2635 #endif
2636 
/*
 * osnoise/runtime_us: cannot be greater than the period.
 *
 * The upper bound points at the live sample_period value, so it follows
 * whatever period is currently configured. No lower bound is set.
 */
static struct trace_min_max_param osnoise_runtime = {
	.lock	= &interface_lock,
	.val	= &osnoise_data.sample_runtime,
	.max	= &osnoise_data.sample_period,
	.min	= NULL,
};
2646 
/*
 * osnoise/period_us: cannot be smaller than the runtime.
 *
 * The lower bound points at the live sample_runtime value, so it follows
 * whatever runtime is currently configured. No upper bound is set.
 */
static struct trace_min_max_param osnoise_period = {
	.lock	= &interface_lock,
	.val	= &osnoise_data.sample_period,
	.max	= NULL,
	.min	= &osnoise_data.sample_runtime,
};
2656 
/*
 * osnoise/stop_tracing_us: no limit (neither .min nor .max is set).
 */
static struct trace_min_max_param osnoise_stop_tracing_in = {
	.lock	= &interface_lock,
	.val	= &osnoise_data.stop_tracing,
	.max	= NULL,
	.min	= NULL,
};
2666 
/*
 * osnoise/stop_tracing_total_us: no limit (neither .min nor .max is set).
 */
static struct trace_min_max_param osnoise_stop_tracing_total = {
	.lock	= &interface_lock,
	.val	= &osnoise_data.stop_tracing_total,
	.max	= NULL,
	.min	= NULL,
};
2676 
2677 #ifdef CONFIG_TIMERLAT_TRACER
/*
 * osnoise/print_stack: print the stacktrace of the IRQ handler if the total
 * latency is higher than val. No bounds are enforced on the value.
 */
static struct trace_min_max_param osnoise_print_stack = {
	.lock	= &interface_lock,
	.val	= &osnoise_data.print_stack,
	.max	= NULL,
	.min	= NULL,
};
2688 
/*
 * osnoise/timerlat_period: min 100 us, max 1 s
 *
 * Both bounds are expressed in microseconds.
 */
static u64 timerlat_min_period = 100;
static u64 timerlat_max_period = 1000000;
static struct trace_min_max_param timerlat_period = {
	.lock	= &interface_lock,
	.val	= &osnoise_data.timerlat_period,
	.max	= &timerlat_max_period,
	.min	= &timerlat_min_period,
};
2700 
/*
 * osnoise/timerlat_align_us: align the first wakeup of all timerlat
 * threads to a common boundary (in us). 0 means disabled.
 * No bounds are enforced on the value.
 */
static struct trace_min_max_param timerlat_align_us = {
	.lock	= &interface_lock,
	.val	= &osnoise_data.timerlat_align_us,
	.max	= NULL,
	.min	= NULL,
};
2711 
/* File operations of the per-cpu "timerlat_fd" file. */
static const struct file_operations timerlat_fd_fops = {
	.open		= timerlat_fd_open,
	.read		= timerlat_fd_read,
	.release	= timerlat_fd_release,
	.llseek		= generic_file_llseek,
};
2718 #endif
2719 
/* File operations of the osnoise/cpus file. */
static const struct file_operations cpus_fops = {
	.open		= tracing_open_generic,
	.read		= osnoise_cpus_read,
	.write		= osnoise_cpus_write,
	.llseek		= generic_file_llseek,
};
2726 
/*
 * File operations of the osnoise/options file: reads go through the
 * seq_file iterator, writes through osnoise_options_write().
 */
static const struct file_operations osnoise_options_fops = {
	.open		= osnoise_options_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
	.write		= osnoise_options_write
};
2734 
2735 #ifdef CONFIG_TIMERLAT_TRACER
2736 #ifdef CONFIG_STACKTRACE
2737 static int init_timerlat_stack_tracefs(struct dentry *top_dir)
2738 {
2739 	struct dentry *tmp;
2740 
2741 	tmp = tracefs_create_file("print_stack", TRACE_MODE_WRITE, top_dir,
2742 				  &osnoise_print_stack, &trace_min_max_fops);
2743 	if (!tmp)
2744 		return -ENOMEM;
2745 
2746 	return 0;
2747 }
2748 #else /* CONFIG_STACKTRACE */
/* Without CONFIG_STACKTRACE there is no "print_stack" file to create. */
static int init_timerlat_stack_tracefs(struct dentry *top_dir)
{
	return 0;
}
2753 #endif /* CONFIG_STACKTRACE */
2754 
2755 static int osnoise_create_cpu_timerlat_fd(struct dentry *top_dir)
2756 {
2757 	struct dentry *timerlat_fd;
2758 	struct dentry *per_cpu;
2759 	struct dentry *cpu_dir;
2760 	char cpu_str[30]; /* see trace.c: tracing_init_tracefs_percpu() */
2761 	long cpu;
2762 
2763 	/*
2764 	 * Why not using tracing instance per_cpu/ dir?
2765 	 *
2766 	 * Because osnoise/timerlat have a single workload, having
2767 	 * multiple files like these are waste of memory.
2768 	 */
2769 	per_cpu = tracefs_create_dir("per_cpu", top_dir);
2770 	if (!per_cpu)
2771 		return -ENOMEM;
2772 
2773 	for_each_possible_cpu(cpu) {
2774 		snprintf(cpu_str, 30, "cpu%ld", cpu);
2775 		cpu_dir = tracefs_create_dir(cpu_str, per_cpu);
2776 		if (!cpu_dir)
2777 			goto out_clean;
2778 
2779 		timerlat_fd = trace_create_file("timerlat_fd", TRACE_MODE_READ,
2780 						cpu_dir, NULL, &timerlat_fd_fops);
2781 		if (!timerlat_fd)
2782 			goto out_clean;
2783 
2784 		/* Record the CPU */
2785 		d_inode(timerlat_fd)->i_cdev = (void *)(cpu);
2786 	}
2787 
2788 	return 0;
2789 
2790 out_clean:
2791 	tracefs_remove(per_cpu);
2792 	return -ENOMEM;
2793 }
2794 
2795 /*
2796  * init_timerlat_tracefs - A function to initialize the timerlat interface files
2797  */
2798 static int init_timerlat_tracefs(struct dentry *top_dir)
2799 {
2800 	struct dentry *tmp;
2801 	int retval;
2802 
2803 	tmp = tracefs_create_file("timerlat_period_us", TRACE_MODE_WRITE, top_dir,
2804 				  &timerlat_period, &trace_min_max_fops);
2805 	if (!tmp)
2806 		return -ENOMEM;
2807 
2808 	tmp = tracefs_create_file("timerlat_align_us", TRACE_MODE_WRITE, top_dir,
2809 				  &timerlat_align_us, &trace_min_max_fops);
2810 	if (!tmp)
2811 		return -ENOMEM;
2812 
2813 	retval = osnoise_create_cpu_timerlat_fd(top_dir);
2814 	if (retval)
2815 		return retval;
2816 
2817 	return init_timerlat_stack_tracefs(top_dir);
2818 }
2819 #else /* CONFIG_TIMERLAT_TRACER */
/* Without CONFIG_TIMERLAT_TRACER there are no timerlat files to create. */
static int init_timerlat_tracefs(struct dentry *top_dir)
{
	return 0;
}
2824 #endif /* CONFIG_TIMERLAT_TRACER */
2825 
2826 /*
2827  * init_tracefs - A function to initialize the tracefs interface files
2828  *
2829  * This function creates entries in tracefs for "osnoise" and "timerlat".
2830  * It creates these directories in the tracing directory, and within that
2831  * directory the use can change and view the configs.
2832  */
2833 static int init_tracefs(void)
2834 {
2835 	struct dentry *top_dir;
2836 	struct dentry *tmp;
2837 	int ret;
2838 
2839 	ret = tracing_init_dentry();
2840 	if (ret)
2841 		return -ENOMEM;
2842 
2843 	top_dir = tracefs_create_dir("osnoise", NULL);
2844 	if (!top_dir)
2845 		return 0;
2846 
2847 	tmp = tracefs_create_file("period_us", TRACE_MODE_WRITE, top_dir,
2848 				  &osnoise_period, &trace_min_max_fops);
2849 	if (!tmp)
2850 		goto err;
2851 
2852 	tmp = tracefs_create_file("runtime_us", TRACE_MODE_WRITE, top_dir,
2853 				  &osnoise_runtime, &trace_min_max_fops);
2854 	if (!tmp)
2855 		goto err;
2856 
2857 	tmp = tracefs_create_file("stop_tracing_us", TRACE_MODE_WRITE, top_dir,
2858 				  &osnoise_stop_tracing_in, &trace_min_max_fops);
2859 	if (!tmp)
2860 		goto err;
2861 
2862 	tmp = tracefs_create_file("stop_tracing_total_us", TRACE_MODE_WRITE, top_dir,
2863 				  &osnoise_stop_tracing_total, &trace_min_max_fops);
2864 	if (!tmp)
2865 		goto err;
2866 
2867 	tmp = trace_create_file("cpus", TRACE_MODE_WRITE, top_dir, NULL, &cpus_fops);
2868 	if (!tmp)
2869 		goto err;
2870 
2871 	tmp = trace_create_file("options", TRACE_MODE_WRITE, top_dir, NULL,
2872 				&osnoise_options_fops);
2873 	if (!tmp)
2874 		goto err;
2875 
2876 	ret = init_timerlat_tracefs(top_dir);
2877 	if (ret)
2878 		goto err;
2879 
2880 	return 0;
2881 
2882 err:
2883 	tracefs_remove(top_dir);
2884 	return -ENOMEM;
2885 }
2886 
2887 static int osnoise_hook_events(void)
2888 {
2889 	int retval;
2890 
2891 	/*
2892 	 * Trace is already hooked, we are re-enabling from
2893 	 * a stop_tracing_*.
2894 	 */
2895 	if (trace_osnoise_callback_enabled)
2896 		return 0;
2897 
2898 	retval = hook_irq_events();
2899 	if (retval)
2900 		return -EINVAL;
2901 
2902 	retval = hook_softirq_events();
2903 	if (retval)
2904 		goto out_unhook_irq;
2905 
2906 	retval = hook_thread_events();
2907 	/*
2908 	 * All fine!
2909 	 */
2910 	if (!retval)
2911 		return 0;
2912 
2913 	unhook_softirq_events();
2914 out_unhook_irq:
2915 	unhook_irq_events();
2916 	return -EINVAL;
2917 }
2918 
/*
 * osnoise_unhook_events - unhook the thread, softirq and irq events
 *
 * The groups are released in the reverse of the order used by
 * osnoise_hook_events().
 */
static void osnoise_unhook_events(void)
{
	unhook_thread_events();
	unhook_softirq_events();
	unhook_irq_events();
}
2925 
2926 /*
2927  * osnoise_workload_start - start the workload and hook to events
2928  */
2929 static int osnoise_workload_start(void)
2930 {
2931 	int retval;
2932 
2933 	/*
2934 	 * Instances need to be registered after calling workload
2935 	 * start. Hence, if there is already an instance, the
2936 	 * workload was already registered. Otherwise, this
2937 	 * code is on the way to register the first instance,
2938 	 * and the workload will start.
2939 	 */
2940 	if (osnoise_has_registered_instances())
2941 		return 0;
2942 
2943 	osn_var_reset_all();
2944 
2945 	retval = osnoise_hook_events();
2946 	if (retval)
2947 		return retval;
2948 
2949 	/*
2950 	 * Make sure that ftrace_nmi_enter/exit() see reset values
2951 	 * before enabling trace_osnoise_callback_enabled.
2952 	 */
2953 	barrier();
2954 	trace_osnoise_callback_enabled = true;
2955 
2956 	retval = start_per_cpu_kthreads();
2957 	if (retval) {
2958 		trace_osnoise_callback_enabled = false;
2959 		/*
2960 		 * Make sure that ftrace_nmi_enter/exit() see
2961 		 * trace_osnoise_callback_enabled as false before continuing.
2962 		 */
2963 		barrier();
2964 
2965 		osnoise_unhook_events();
2966 		return retval;
2967 	}
2968 
2969 	return 0;
2970 }
2971 
2972 /*
2973  * osnoise_workload_stop - stop the workload and unhook the events
2974  */
2975 static void osnoise_workload_stop(void)
2976 {
2977 	/*
2978 	 * Instances need to be unregistered before calling
2979 	 * stop. Hence, if there is a registered instance, more
2980 	 * than one instance is running, and the workload will not
2981 	 * yet stop. Otherwise, this code is on the way to disable
2982 	 * the last instance, and the workload can stop.
2983 	 */
2984 	if (osnoise_has_registered_instances())
2985 		return;
2986 
2987 	/*
2988 	 * If callbacks were already disabled in a previous stop
2989 	 * call, there is no need to disable then again.
2990 	 *
2991 	 * For instance, this happens when tracing is stopped via:
2992 	 * echo 0 > tracing_on
2993 	 * echo nop > current_tracer.
2994 	 */
2995 	if (!trace_osnoise_callback_enabled)
2996 		return;
2997 
2998 	trace_osnoise_callback_enabled = false;
2999 	/*
3000 	 * Make sure that ftrace_nmi_enter/exit() see
3001 	 * trace_osnoise_callback_enabled as false before continuing.
3002 	 */
3003 	barrier();
3004 
3005 	stop_per_cpu_kthreads();
3006 
3007 	osnoise_unhook_events();
3008 }
3009 
3010 static void osnoise_tracer_start(struct trace_array *tr)
3011 {
3012 	int retval;
3013 
3014 	/*
3015 	 * If the instance is already registered, there is no need to
3016 	 * register it again.
3017 	 */
3018 	if (osnoise_instance_registered(tr))
3019 		return;
3020 
3021 	retval = osnoise_workload_start();
3022 	if (retval)
3023 		pr_err(BANNER "Error starting osnoise tracer\n");
3024 
3025 	osnoise_register_instance(tr);
3026 }
3027 
/*
 * osnoise_tracer_stop - unregister @tr and stop the workload
 * @tr: the trace instance being stopped
 *
 * The instance is unregistered first: osnoise_workload_stop() only stops
 * the workload when no instance is left registered.
 */
static void osnoise_tracer_stop(struct trace_array *tr)
{
	osnoise_unregister_instance(tr);
	osnoise_workload_stop();
}
3033 
/*
 * osnoise_tracer_init - init callback of the osnoise tracer
 * @tr: the trace instance selecting the tracer
 *
 * Returns -EBUSY while timerlat is enabled. Otherwise it resets the
 * instance's max_latency, starts the tracer and returns 0.
 */
static int osnoise_tracer_init(struct trace_array *tr)
{
	/*
	 * Only allow osnoise tracer if timerlat tracer is not running
	 * already.
	 */
	if (timerlat_enabled())
		return -EBUSY;

	tr->max_latency = 0;

	osnoise_tracer_start(tr);
	return 0;
}
3048 
/* osnoise_tracer_reset - reset callback: same as stopping the tracer. */
static void osnoise_tracer_reset(struct trace_array *tr)
{
	osnoise_tracer_stop(tr);
}
3053 
/* The "osnoise" tracer; it may be used by multiple trace instances. */
static struct tracer osnoise_tracer __read_mostly = {
	.name		= "osnoise",
	.init		= osnoise_tracer_init,
	.reset		= osnoise_tracer_reset,
	.start		= osnoise_tracer_start,
	.stop		= osnoise_tracer_stop,
	.print_header	= print_osnoise_headers,
	.allow_instances = true,
};
3063 
3064 #ifdef CONFIG_TIMERLAT_TRACER
3065 static void timerlat_tracer_start(struct trace_array *tr)
3066 {
3067 	int retval;
3068 
3069 	/*
3070 	 * If the instance is already registered, there is no need to
3071 	 * register it again.
3072 	 */
3073 	if (osnoise_instance_registered(tr))
3074 		return;
3075 
3076 	retval = osnoise_workload_start();
3077 	if (retval)
3078 		pr_err(BANNER "Error starting timerlat tracer\n");
3079 
3080 	osnoise_register_instance(tr);
3081 
3082 	return;
3083 }
3084 
3085 static void timerlat_tracer_stop(struct trace_array *tr)
3086 {
3087 	int cpu;
3088 
3089 	osnoise_unregister_instance(tr);
3090 
3091 	/*
3092 	 * Instruct the threads to stop only if this is the last instance.
3093 	 */
3094 	if (!osnoise_has_registered_instances()) {
3095 		for_each_online_cpu(cpu)
3096 			per_cpu(per_cpu_osnoise_var, cpu).sampling = 0;
3097 	}
3098 
3099 	osnoise_workload_stop();
3100 }
3101 
3102 static int timerlat_tracer_init(struct trace_array *tr)
3103 {
3104 	/*
3105 	 * Only allow timerlat tracer if osnoise tracer is not running already.
3106 	 */
3107 	if (osnoise_has_registered_instances() && !osnoise_data.timerlat_tracer)
3108 		return -EBUSY;
3109 
3110 	/*
3111 	 * If this is the first instance, set timerlat_tracer to block
3112 	 * osnoise tracer start.
3113 	 */
3114 	if (!osnoise_has_registered_instances())
3115 		osnoise_data.timerlat_tracer = 1;
3116 
3117 	tr->max_latency = 0;
3118 	timerlat_tracer_start(tr);
3119 
3120 	return 0;
3121 }
3122 
/*
 * timerlat_tracer_reset - reset callback of the timerlat tracer
 * @tr: the trace instance leaving the tracer
 *
 * Stops the tracer for @tr and, once no instance is left, clears the
 * timerlat_tracer flag set by timerlat_tracer_init().
 */
static void timerlat_tracer_reset(struct trace_array *tr)
{
	timerlat_tracer_stop(tr);

	/*
	 * If this is the last instance, reset timerlat_tracer allowing
	 * osnoise to be started.
	 */
	if (!osnoise_has_registered_instances())
		osnoise_data.timerlat_tracer = 0;
}
3134 
/* The "timerlat" tracer; it may be used by multiple trace instances. */
static struct tracer timerlat_tracer __read_mostly = {
	.name		= "timerlat",
	.init		= timerlat_tracer_init,
	.reset		= timerlat_tracer_reset,
	.start		= timerlat_tracer_start,
	.stop		= timerlat_tracer_stop,
	.print_header	= print_timerlat_headers,
	.allow_instances = true,
};
3144 
/* Registers the timerlat tracer; returns the result of register_tracer(). */
__init static int init_timerlat_tracer(void)
{
	return register_tracer(&timerlat_tracer);
}
3149 #else /* CONFIG_TIMERLAT_TRACER */
/* Without CONFIG_TIMERLAT_TRACER there is no timerlat tracer to register. */
__init static int init_timerlat_tracer(void)
{
	return 0;
}
3154 #endif /* CONFIG_TIMERLAT_TRACER */
3155 
/*
 * init_osnoise_tracer - register the tracers and create their interface
 *
 * Run as a late initcall. Returns the register_tracer() error if either
 * tracer cannot be registered, 0 otherwise.
 */
__init static int init_osnoise_tracer(void)
{
	int ret;

	mutex_init(&interface_lock);

	/* By default, every CPU is allowed to run the workload. */
	cpumask_copy(&osnoise_cpumask, cpu_all_mask);

	ret = register_tracer(&osnoise_tracer);
	if (ret) {
		pr_err(BANNER "Error registering osnoise!\n");
		return ret;
	}

	/*
	 * NOTE(review): the osnoise tracer stays registered if this
	 * fails - confirm that is intended.
	 */
	ret = init_timerlat_tracer();
	if (ret) {
		pr_err(BANNER "Error registering timerlat!\n");
		return ret;
	}

	osnoise_init_hotplug_support();

	INIT_LIST_HEAD_RCU(&osnoise_instances);

	/* NOTE(review): the return value of init_tracefs() is not checked. */
	init_tracefs();

	return 0;
}
late_initcall(init_osnoise_tracer);
3185