// SPDX-License-Identifier: GPL-2.0
/*
 * delaytop.c - system-wide delay monitoring tool.
 *
 * This tool provides real-time monitoring and statistics of
 * system, container, and task-level delays, including CPU,
 * memory, IO, and IRQ. It presents a top-like interactive view
 * and can output delay information for the whole system, specific
 * containers (cgroups), or individual tasks (PIDs).
 *
 * Key features:
 *	- Collects per-task delay accounting statistics via taskstats.
 *	- Collects system-wide PSI information.
 *	- Supports sorting and filtering.
 *	- Supports both interactive (screen refresh) and one-shot output.
 *
 * Copyright (C) Fan Yu, ZTE Corp. 2025
 * Copyright (C) Wang Yaxin, ZTE Corp. 2025
 *
 * Compile with
 *	gcc -I/usr/src/linux/include delaytop.c -o delaytop
 */
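
/*
 * Example invocations (illustrative only; the PID and cgroup path are
 * placeholders, not values taken from this file):
 *	./delaytop				# interactive mode, 2s refresh (default)
 *	./delaytop -o -P 50			# one snapshot of the top 50 tasks
 *	./delaytop -d 1 -n 10 -p 1234		# ten 1-second updates for PID 1234
 *	./delaytop -C /sys/fs/cgroup/foo	# also report cgroupstats for a cgroup
 */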

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <fcntl.h>
#include <getopt.h>
#include <signal.h>
#include <time.h>
#include <dirent.h>
#include <ctype.h>
#include <stdbool.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/select.h>
#include <termios.h>
#include <limits.h>
#include <linux/genetlink.h>
#include <linux/taskstats.h>
#include <linux/cgroupstats.h>

#define PSI_CPU_SOME "/proc/pressure/cpu"
#define PSI_CPU_FULL	"/proc/pressure/cpu"
#define PSI_MEMORY_SOME "/proc/pressure/memory"
#define PSI_MEMORY_FULL "/proc/pressure/memory"
#define PSI_IO_SOME "/proc/pressure/io"
#define PSI_IO_FULL "/proc/pressure/io"
#define PSI_IRQ_FULL	"/proc/pressure/irq"

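/*
 * Generic netlink messages used below are laid out as:
 *	struct nlmsghdr | struct genlmsghdr | attributes (struct nlattr + payload)
 * NLA_NEXT/NLA_DATA/NLA_PAYLOAD step over and unpack individual attributes;
 * GENLMSG_DATA/GENLMSG_PAYLOAD locate the attribute area of a message.
 */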
#define NLA_NEXT(na)			((struct nlattr *)((char *)(na) + NLA_ALIGN((na)->nla_len)))
#define NLA_DATA(na)			((void *)((char *)(na) + NLA_HDRLEN))
#define NLA_PAYLOAD(len)		(len - NLA_HDRLEN)

#define GENLMSG_DATA(glh)		((void *)(NLMSG_DATA(glh) + GENL_HDRLEN))
#define GENLMSG_PAYLOAD(glh)	(NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN)

#define TASK_COMM_LEN	16
#define MAX_MSG_SIZE	1024
#define MAX_TASKS		1000
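/*
 * SET_TASK_STAT copies one delay-accounting field from the local 'stats'
 * (struct taskstats) into the global tasks[] array; BOOL_FPRINT wraps
 * fprintf() and evaluates to true only if the write succeeded.
 */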
#define SET_TASK_STAT(task_count, field) tasks[task_count].field = stats.field
#define BOOL_FPRINT(stream, fmt, ...) \
({ \
	int ret = fprintf(stream, fmt, ##__VA_ARGS__); \
	ret >= 0; \
})
#define PSI_LINE_FORMAT "%-12s %6.1f%%/%6.1f%%/%6.1f%%/%8llu(ms)\n"

/* Program settings structure */
struct config {
	int delay;				/* Update interval in seconds */
	int iterations;			/* Number of iterations, 0 == infinite */
	int max_processes;		/* Maximum number of processes to show */
	char sort_field;		/* Field to sort by */
	int output_one_time;	/* Output once and exit */
	int monitor_pid;		/* Monitor specific PID */
	char *container_path;	/* Path to container cgroup */
};

/* PSI statistics structure */
struct psi_stats {
	double cpu_some_avg10, cpu_some_avg60, cpu_some_avg300;
	unsigned long long cpu_some_total;
	double cpu_full_avg10, cpu_full_avg60, cpu_full_avg300;
	unsigned long long cpu_full_total;
	double memory_some_avg10, memory_some_avg60, memory_some_avg300;
	unsigned long long memory_some_total;
	double memory_full_avg10, memory_full_avg60, memory_full_avg300;
	unsigned long long memory_full_total;
	double io_some_avg10, io_some_avg60, io_some_avg300;
	unsigned long long io_some_total;
	double io_full_avg10, io_full_avg60, io_full_avg300;
	unsigned long long io_full_total;
	double irq_full_avg10, irq_full_avg60, irq_full_avg300;
	unsigned long long irq_full_total;
};

/* Task delay information structure */
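/* Field names mirror the delay accounting members of struct taskstats */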
struct task_info {
	int pid;
	int tgid;
	char command[TASK_COMM_LEN];
	unsigned long long cpu_count;
	unsigned long long cpu_delay_total;
	unsigned long long blkio_count;
	unsigned long long blkio_delay_total;
	unsigned long long swapin_count;
	unsigned long long swapin_delay_total;
	unsigned long long freepages_count;
	unsigned long long freepages_delay_total;
	unsigned long long thrashing_count;
	unsigned long long thrashing_delay_total;
	unsigned long long compact_count;
	unsigned long long compact_delay_total;
	unsigned long long wpcopy_count;
	unsigned long long wpcopy_delay_total;
	unsigned long long irq_count;
	unsigned long long irq_delay_total;
};

/* Container statistics structure */
struct container_stats {
	int nr_sleeping;		/* Number of sleeping processes */
	int nr_running;			/* Number of running processes */
	int nr_stopped;			/* Number of stopped processes */
	int nr_uninterruptible; /* Number of uninterruptible processes */
	int nr_io_wait;			/* Number of processes in IO wait */
};

/* Global variables */
static struct config cfg;
static struct psi_stats psi;
static struct task_info tasks[MAX_TASKS];
static int task_count;
static int running = 1;
static struct container_stats container_stats;

/* Netlink socket variables */
static int nl_sd = -1;
static int family_id;

/* Set terminal to non-canonical mode for q-to-quit */
static struct termios orig_termios;
static void enable_raw_mode(void)
{
	struct termios raw;

	tcgetattr(STDIN_FILENO, &orig_termios);
	raw = orig_termios;
	raw.c_lflag &= ~(ICANON | ECHO);
	tcsetattr(STDIN_FILENO, TCSAFLUSH, &raw);
}
static void disable_raw_mode(void)
{
	tcsetattr(STDIN_FILENO, TCSAFLUSH, &orig_termios);
}

/* Display usage information and command line options */
static void usage(void)
{
	printf("Usage: delaytop [Options]\n"
	"Options:\n"
	"  -h, --help				Show this help message and exit\n"
	"  -d, --delay=SECONDS	  Set refresh interval (default: 2 seconds, min: 1)\n"
	"  -n, --iterations=COUNT	Set number of updates (default: 0 = infinite)\n"
	"  -P, --processes=NUMBER	Set maximum number of processes to show (default: 20, max: 1000)\n"
	"  -o, --once				Display once and exit\n"
	"  -p, --pid=PID			Monitor only the specified PID\n"
	"  -C, --container=PATH	 Monitor the container at specified cgroup path\n");
	exit(0);
}

/* Parse command line arguments and set configuration */
static void parse_args(int argc, char **argv)
{
	int c;
	struct option long_options[] = {
		{"help", no_argument, 0, 'h'},
		{"delay", required_argument, 0, 'd'},
		{"iterations", required_argument, 0, 'n'},
		{"pid", required_argument, 0, 'p'},
		{"once", no_argument, 0, 'o'},
		{"processes", required_argument, 0, 'P'},
		{"container", required_argument, 0, 'C'},
		{0, 0, 0, 0}
	};

	/* Set defaults */
	cfg.delay = 2;
	cfg.iterations = 0;
	cfg.max_processes = 20;
	cfg.sort_field = 'c';	/* Default sort by CPU delay */
	cfg.output_one_time = 0;
	cfg.monitor_pid = 0;	/* 0 means monitor all PIDs */
	cfg.container_path = NULL;

	while (1) {
		int option_index = 0;

		c = getopt_long(argc, argv, "hd:n:p:oP:C:", long_options, &option_index);
		if (c == -1)
			break;

		switch (c) {
		case 'h':
			usage();
			break;
		case 'd':
			cfg.delay = atoi(optarg);
			if (cfg.delay < 1) {
				fprintf(stderr, "Error: delay must be >= 1.\n");
				exit(1);
			}
			break;
		case 'n':
			cfg.iterations = atoi(optarg);
			if (cfg.iterations < 0) {
				fprintf(stderr, "Error: iterations must be >= 0.\n");
				exit(1);
			}
			break;
		case 'p':
			cfg.monitor_pid = atoi(optarg);
			if (cfg.monitor_pid < 1) {
				fprintf(stderr, "Error: pid must be >= 1.\n");
				exit(1);
			}
			break;
		case 'o':
			cfg.output_one_time = 1;
			break;
		case 'P':
			cfg.max_processes = atoi(optarg);
			if (cfg.max_processes < 1) {
				fprintf(stderr, "Error: processes must be >= 1.\n");
				exit(1);
			}
			if (cfg.max_processes > MAX_TASKS) {
				fprintf(stderr, "Warning: processes capped to %d.\n",
					MAX_TASKS);
				cfg.max_processes = MAX_TASKS;
			}
			break;
		case 'C':
			cfg.container_path = strdup(optarg);
			break;
		default:
			fprintf(stderr, "Try 'delaytop --help' for more information.\n");
			exit(1);
		}
	}
}

/* Create a raw netlink socket and bind */
static int create_nl_socket(void)
{
	int fd;
	struct sockaddr_nl local;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
	if (fd < 0)
		return -1;

	memset(&local, 0, sizeof(local));
	local.nl_family = AF_NETLINK;

	if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0) {
		fprintf(stderr, "Failed to bind socket when creating nl_socket\n");
		close(fd);
		return -1;
	}

	return fd;
}

/* Send a command via netlink */
static int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid,
			 __u8 genl_cmd, __u16 nla_type,
			 void *nla_data, int nla_len)
{
	struct sockaddr_nl nladdr;
	struct nlattr *na;
	int r, buflen;
	char *buf;

	struct {
		struct nlmsghdr n;
		struct genlmsghdr g;
		char buf[MAX_MSG_SIZE];
	} msg;

	msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
	msg.n.nlmsg_type = nlmsg_type;
	msg.n.nlmsg_flags = NLM_F_REQUEST;
	msg.n.nlmsg_seq = 0;
	msg.n.nlmsg_pid = nlmsg_pid;
	msg.g.cmd = genl_cmd;
	msg.g.version = 0x1;
	na = (struct nlattr *) GENLMSG_DATA(&msg);
	na->nla_type = nla_type;
	na->nla_len = nla_len + NLA_HDRLEN;
	memcpy(NLA_DATA(na), nla_data, nla_len);
	msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);

	buf = (char *) &msg;
	buflen = msg.n.nlmsg_len;
	memset(&nladdr, 0, sizeof(nladdr));
	nladdr.nl_family = AF_NETLINK;
	while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr,
					sizeof(nladdr))) < buflen) {
		if (r > 0) {
			buf += r;
			buflen -= r;
		} else if (errno != EAGAIN)
			return -1;
	}
	return 0;
}

/* Get family ID for taskstats via netlink */
static int get_family_id(int sd)
{
	struct {
		struct nlmsghdr n;
		struct genlmsghdr g;
		char buf[256];
	} ans;

	int id = 0, rc;
	struct nlattr *na;
	int rep_len;
	char name[100];

	strncpy(name, TASKSTATS_GENL_NAME, sizeof(name) - 1);
	name[sizeof(name) - 1] = '\0';
	rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY,
			CTRL_ATTR_FAMILY_NAME, (void *)name,
			strlen(TASKSTATS_GENL_NAME)+1);
	if (rc < 0) {
		fprintf(stderr, "Failed to send cmd for family id\n");
		return 0;
	}

	rep_len = recv(sd, &ans, sizeof(ans), 0);
	if (ans.n.nlmsg_type == NLMSG_ERROR ||
		(rep_len < 0) || !NLMSG_OK((&ans.n), rep_len)) {
		fprintf(stderr, "Failed to receive response for family id\n");
		return 0;
	}

	na = (struct nlattr *) GENLMSG_DATA(&ans);
	na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len));
	if (na->nla_type == CTRL_ATTR_FAMILY_ID)
		id = *(__u16 *) NLA_DATA(na);
	return id;
}

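/*
 * Read PSI (Pressure Stall Information) from /proc/pressure/{cpu,memory,io,irq}
 * into the global psi structure. Each file contains lines of the form
 * (values illustrative, 'total' in microseconds):
 *	some avg10=0.00 avg60=0.12 avg300=0.06 total=123456
 *	full avg10=0.00 avg60=0.00 avg300=0.00 total=12345
 */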
static void read_psi_stats(void)
{
	FILE *fp;
	char line[256];
	int ret = 0;
	/* Zero all fields */
	memset(&psi, 0, sizeof(psi));
	/* CPU pressure */
	fp = fopen(PSI_CPU_SOME, "r");
	if (fp) {
		while (fgets(line, sizeof(line), fp)) {
			if (strncmp(line, "some", 4) == 0) {
				ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu",
							&psi.cpu_some_avg10, &psi.cpu_some_avg60,
							&psi.cpu_some_avg300, &psi.cpu_some_total);
				if (ret != 4)
					fprintf(stderr, "Failed to parse CPU some PSI data\n");
			} else if (strncmp(line, "full", 4) == 0) {
				ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
						&psi.cpu_full_avg10, &psi.cpu_full_avg60,
						&psi.cpu_full_avg300, &psi.cpu_full_total);
				if (ret != 4)
					fprintf(stderr, "Failed to parse CPU full PSI data\n");
			}
		}
		fclose(fp);
	}
	/* Memory pressure */
	fp = fopen(PSI_MEMORY_SOME, "r");
	if (fp) {
		while (fgets(line, sizeof(line), fp)) {
			if (strncmp(line, "some", 4) == 0) {
				ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu",
						&psi.memory_some_avg10, &psi.memory_some_avg60,
						&psi.memory_some_avg300, &psi.memory_some_total);
				if (ret != 4)
					fprintf(stderr, "Failed to parse Memory some PSI data\n");
			} else if (strncmp(line, "full", 4) == 0) {
				ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
						&psi.memory_full_avg10, &psi.memory_full_avg60,
						&psi.memory_full_avg300, &psi.memory_full_total);
				if (ret != 4)
					fprintf(stderr, "Failed to parse Memory full PSI data\n");
			}
		}
		fclose(fp);
	}
	/* IO pressure */
	fp = fopen(PSI_IO_SOME, "r");
	if (fp) {
		while (fgets(line, sizeof(line), fp)) {
			if (strncmp(line, "some", 4) == 0) {
				ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu",
						&psi.io_some_avg10, &psi.io_some_avg60,
						&psi.io_some_avg300, &psi.io_some_total);
				if (ret != 4)
					fprintf(stderr, "Failed to parse IO some PSI data\n");
			} else if (strncmp(line, "full", 4) == 0) {
				ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
						&psi.io_full_avg10, &psi.io_full_avg60,
						&psi.io_full_avg300, &psi.io_full_total);
				if (ret != 4)
					fprintf(stderr, "Failed to parse IO full PSI data\n");
			}
		}
		fclose(fp);
	}
	/* IRQ pressure (only full) */
	fp = fopen(PSI_IRQ_FULL, "r");
	if (fp) {
		while (fgets(line, sizeof(line), fp)) {
			if (strncmp(line, "full", 4) == 0) {
				ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
						&psi.irq_full_avg10, &psi.irq_full_avg60,
						&psi.irq_full_avg300, &psi.irq_full_total);
				if (ret != 4)
					fprintf(stderr, "Failed to parse IRQ full PSI data\n");
			}
		}
		fclose(fp);
	}
}

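/* Read a task's command name from /proc/<pid>/comm; returns 0 on success, -1 on failure */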
static int read_comm(int pid, char *comm_buf, size_t buf_size)
{
	char path[64];
	int ret = -1;
	size_t len;
	FILE *fp;

	snprintf(path, sizeof(path), "/proc/%d/comm", pid);
	fp = fopen(path, "r");
	if (!fp) {
		fprintf(stderr, "Failed to open comm file /proc/%d/comm\n", pid);
		return ret;
	}

	if (fgets(comm_buf, buf_size, fp)) {
		len = strlen(comm_buf);
		if (len > 0 && comm_buf[len - 1] == '\n')
			comm_buf[len - 1] = '\0';
		ret = 0;
	}

	fclose(fp);

	return ret;
}

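/*
 * Query taskstats for a single PID over the netlink socket and, on success,
 * append its delay counters to the global tasks[] array.
 */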
static void fetch_and_fill_task_info(int pid, const char *comm)
{
	struct {
		struct nlmsghdr n;
		struct genlmsghdr g;
		char buf[MAX_MSG_SIZE];
	} resp;
	struct taskstats stats;
	struct nlattr *nested;
	struct nlattr *na;
	int nested_len;
	int nl_len;
	int rc;

	/* Send request for task stats */
	if (send_cmd(nl_sd, family_id, getpid(), TASKSTATS_CMD_GET,
				 TASKSTATS_CMD_ATTR_PID, &pid, sizeof(pid)) < 0) {
		fprintf(stderr, "Failed to send request for task stats\n");
		return;
	}

	/* Receive response */
	rc = recv(nl_sd, &resp, sizeof(resp), 0);
	if (rc < 0 || resp.n.nlmsg_type == NLMSG_ERROR) {
		fprintf(stderr, "Failed to receive response for task stats\n");
		return;
	}

	/* Parse response */
	nl_len = GENLMSG_PAYLOAD(&resp.n);
	na = (struct nlattr *) GENLMSG_DATA(&resp);
	while (nl_len > 0) {
		if (na->nla_type == TASKSTATS_TYPE_AGGR_PID) {
			nested = (struct nlattr *) NLA_DATA(na);
			nested_len = NLA_PAYLOAD(na->nla_len);
			while (nested_len > 0) {
				if (nested->nla_type == TASKSTATS_TYPE_STATS) {
					memcpy(&stats, NLA_DATA(nested), sizeof(stats));
					if (task_count < MAX_TASKS) {
						tasks[task_count].pid = pid;
						tasks[task_count].tgid = pid;
						strncpy(tasks[task_count].command, comm,
							TASK_COMM_LEN - 1);
						tasks[task_count].command[TASK_COMM_LEN - 1] = '\0';
						SET_TASK_STAT(task_count, cpu_count);
						SET_TASK_STAT(task_count, cpu_delay_total);
						SET_TASK_STAT(task_count, blkio_count);
						SET_TASK_STAT(task_count, blkio_delay_total);
						SET_TASK_STAT(task_count, swapin_count);
						SET_TASK_STAT(task_count, swapin_delay_total);
						SET_TASK_STAT(task_count, freepages_count);
						SET_TASK_STAT(task_count, freepages_delay_total);
						SET_TASK_STAT(task_count, thrashing_count);
						SET_TASK_STAT(task_count, thrashing_delay_total);
						SET_TASK_STAT(task_count, compact_count);
						SET_TASK_STAT(task_count, compact_delay_total);
						SET_TASK_STAT(task_count, wpcopy_count);
						SET_TASK_STAT(task_count, wpcopy_delay_total);
						SET_TASK_STAT(task_count, irq_count);
						SET_TASK_STAT(task_count, irq_delay_total);
						task_count++;
					}
					break;
				}
				nested_len -= NLA_ALIGN(nested->nla_len);
				nested = NLA_NEXT(nested);
			}
		}
		nl_len -= NLA_ALIGN(na->nla_len);
		na = NLA_NEXT(na);
	}
	return;
}

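/*
 * Populate tasks[]: either query the single PID given with -p, or walk
 * /proc and query every numeric entry, up to MAX_TASKS tasks.
 */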
static void get_task_delays(void)
{
	char comm[TASK_COMM_LEN];
	struct dirent *entry;
	DIR *dir;
	int pid;

	task_count = 0;
	if (cfg.monitor_pid > 0) {
		if (read_comm(cfg.monitor_pid, comm, sizeof(comm)) == 0)
			fetch_and_fill_task_info(cfg.monitor_pid, comm);
		return;
	}

	dir = opendir("/proc");
	if (!dir) {
		fprintf(stderr, "Error opening /proc directory\n");
		return;
	}

	while ((entry = readdir(dir)) != NULL && task_count < MAX_TASKS) {
		if (!isdigit(entry->d_name[0]))
			continue;
		pid = atoi(entry->d_name);
		if (pid == 0)
			continue;
		if (read_comm(pid, comm, sizeof(comm)) != 0)
			continue;
		fetch_and_fill_task_info(pid, comm);
	}
	closedir(dir);
}

/* Calculate average delay in milliseconds */
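/* taskstats delay totals are in nanoseconds; 'count' is the number of delay events */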
static double average_ms(unsigned long long total, unsigned long long count)
{
	if (count == 0)
		return 0;
	return (double)total / 1000000.0 / count;
}

/* Comparison function for sorting tasks */
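/*
 * Only CPU-delay sorting ('c') is implemented; any other sort_field
 * falls back to comparing total CPU delay.
 */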
static int compare_tasks(const void *a, const void *b)
{
	const struct task_info *t1 = (const struct task_info *)a;
	const struct task_info *t2 = (const struct task_info *)b;
	double avg1, avg2;

	switch (cfg.sort_field) {
	case 'c': /* CPU */
		avg1 = average_ms(t1->cpu_delay_total, t1->cpu_count);
		avg2 = average_ms(t2->cpu_delay_total, t2->cpu_count);
		if (avg1 != avg2)
			return avg2 > avg1 ? 1 : -1;
		return t2->cpu_delay_total > t1->cpu_delay_total ? 1 : -1;

	default:
		return t2->cpu_delay_total > t1->cpu_delay_total ? 1 : -1;
	}
}

/* Sort tasks by selected field */
static void sort_tasks(void)
{
	if (task_count > 0)
		qsort(tasks, task_count, sizeof(struct task_info), compare_tasks);
}

/* Get container statistics via cgroupstats */
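/*
 * The cgroup directory passed with -C is opened read-only and its file
 * descriptor is handed to the kernel via CGROUPSTATS_CMD_ATTR_FD.
 */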
static void get_container_stats(void)
{
	int rc, cfd;
	struct {
		struct nlmsghdr n;
		struct genlmsghdr g;
		char buf[MAX_MSG_SIZE];
	} req, resp;
	struct nlattr *na;
	int nl_len;
	struct cgroupstats stats;

	/* Check if container path is set */
	if (!cfg.container_path)
		return;

	/* Open container cgroup */
	cfd = open(cfg.container_path, O_RDONLY);
	if (cfd < 0) {
		fprintf(stderr, "Error opening container path: %s\n", cfg.container_path);
		return;
	}

	/* Send request for container stats */
	if (send_cmd(nl_sd, family_id, getpid(), CGROUPSTATS_CMD_GET,
				CGROUPSTATS_CMD_ATTR_FD, &cfd, sizeof(__u32)) < 0) {
		fprintf(stderr, "Failed to send request for container stats\n");
		close(cfd);
		return;
	}

	/* Receive response */
	rc = recv(nl_sd, &resp, sizeof(resp), 0);
	if (rc < 0 || resp.n.nlmsg_type == NLMSG_ERROR) {
		fprintf(stderr, "Failed to receive response for container stats\n");
		close(cfd);
		return;
	}

	/* Parse response */
	nl_len = GENLMSG_PAYLOAD(&resp.n);
	na = (struct nlattr *) GENLMSG_DATA(&resp);
	while (nl_len > 0) {
		if (na->nla_type == CGROUPSTATS_TYPE_CGROUP_STATS) {
			/* Get the cgroupstats structure */
			memcpy(&stats, NLA_DATA(na), sizeof(stats));

			/* Fill container stats */
			container_stats.nr_sleeping = stats.nr_sleeping;
			container_stats.nr_running = stats.nr_running;
			container_stats.nr_stopped = stats.nr_stopped;
			container_stats.nr_uninterruptible = stats.nr_uninterruptible;
			container_stats.nr_io_wait = stats.nr_io_wait;
			break;
		}
		nl_len -= NLA_ALIGN(na->nla_len);
		na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len));
	}

	close(cfd);
}

/* Display results to stdout or log file */
static void display_results(void)
{
	time_t now = time(NULL);
	struct tm *tm_now = localtime(&now);
	FILE *out = stdout;
	char timestamp[32];
	bool suc = true;
	int i, count;

	/* Clear terminal screen */
	suc &= BOOL_FPRINT(out, "\033[H\033[J");

	/* PSI output (one-line, no cat style) */
	suc &= BOOL_FPRINT(out, "System Pressure Information: (avg10/avg60/avg300/total)\n");
	suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
		"CPU some:",
		psi.cpu_some_avg10,
		psi.cpu_some_avg60,
		psi.cpu_some_avg300,
		psi.cpu_some_total / 1000);
	suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
		"CPU full:",
		psi.cpu_full_avg10,
		psi.cpu_full_avg60,
		psi.cpu_full_avg300,
		psi.cpu_full_total / 1000);
	suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
		"Memory full:",
		psi.memory_full_avg10,
		psi.memory_full_avg60,
		psi.memory_full_avg300,
		psi.memory_full_total / 1000);
	suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
		"Memory some:",
		psi.memory_some_avg10,
		psi.memory_some_avg60,
		psi.memory_some_avg300,
		psi.memory_some_total / 1000);
	suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
		"IO full:",
		psi.io_full_avg10,
		psi.io_full_avg60,
		psi.io_full_avg300,
		psi.io_full_total / 1000);
	suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
		"IO some:",
		psi.io_some_avg10,
		psi.io_some_avg60,
		psi.io_some_avg300,
		psi.io_some_total / 1000);
	suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
		"IRQ full:",
		psi.irq_full_avg10,
		psi.irq_full_avg60,
		psi.irq_full_avg300,
		psi.irq_full_total / 1000);

	if (cfg.container_path) {
		suc &= BOOL_FPRINT(out, "Container Information (%s):\n", cfg.container_path);
		suc &= BOOL_FPRINT(out, "Processes: running=%d, sleeping=%d, ",
			container_stats.nr_running, container_stats.nr_sleeping);
		suc &= BOOL_FPRINT(out, "stopped=%d, uninterruptible=%d, io_wait=%d\n\n",
			container_stats.nr_stopped, container_stats.nr_uninterruptible,
			container_stats.nr_io_wait);
	}
	suc &= BOOL_FPRINT(out, "Top %d processes (sorted by CPU delay):\n",
			cfg.max_processes);
	suc &= BOOL_FPRINT(out, "%5s  %5s  %-17s", "PID", "TGID", "COMMAND");
	suc &= BOOL_FPRINT(out, "%7s %7s %7s %7s %7s %7s %7s %7s\n",
		"CPU(ms)", "IO(ms)", "SWAP(ms)", "RCL(ms)",
		"THR(ms)", "CMP(ms)", "WP(ms)", "IRQ(ms)");

	suc &= BOOL_FPRINT(out, "-----------------------------------------------");
	suc &= BOOL_FPRINT(out, "----------------------------------------------\n");
	count = task_count < cfg.max_processes ? task_count : cfg.max_processes;

	for (i = 0; i < count; i++) {
		suc &= BOOL_FPRINT(out, "%5d  %5d  %-15s",
			tasks[i].pid, tasks[i].tgid, tasks[i].command);
		suc &= BOOL_FPRINT(out, "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f\n",
			average_ms(tasks[i].cpu_delay_total, tasks[i].cpu_count),
			average_ms(tasks[i].blkio_delay_total, tasks[i].blkio_count),
			average_ms(tasks[i].swapin_delay_total, tasks[i].swapin_count),
			average_ms(tasks[i].freepages_delay_total, tasks[i].freepages_count),
			average_ms(tasks[i].thrashing_delay_total, tasks[i].thrashing_count),
			average_ms(tasks[i].compact_delay_total, tasks[i].compact_count),
			average_ms(tasks[i].wpcopy_delay_total, tasks[i].wpcopy_count),
			average_ms(tasks[i].irq_delay_total, tasks[i].irq_count));
	}

	suc &= BOOL_FPRINT(out, "\n");

	if (!suc)
		perror("Error writing to output");
}

/* Main function */
int main(int argc, char **argv)
{
	int iterations = 0;
	int use_q_quit = 0;

	/* Parse command line arguments */
	parse_args(argc, argv);

	/* Setup netlink socket */
	nl_sd = create_nl_socket();
	if (nl_sd < 0) {
		fprintf(stderr, "Error creating netlink socket\n");
		exit(1);
	}

	/* Get family ID for taskstats via netlink */
	family_id = get_family_id(nl_sd);
	if (!family_id) {
		fprintf(stderr, "Error getting taskstats family ID\n");
		close(nl_sd);
		exit(1);
	}

	if (!cfg.output_one_time) {
		use_q_quit = 1;
		enable_raw_mode();
		printf("Press 'q' to quit.\n");
		fflush(stdout);
	}

	/* Main loop */
	while (running) {
		/* Read PSI statistics */
		read_psi_stats();

		/* Get container stats if container path provided */
		if (cfg.container_path)
			get_container_stats();

		/* Get task delays */
		get_task_delays();

		/* Sort tasks */
		sort_tasks();

		/* Display results to stdout or log file */
		display_results();

		/* Check for iterations */
		if (cfg.iterations > 0 && ++iterations >= cfg.iterations)
			break;

		/* Exit if output_one_time is set */
		if (cfg.output_one_time)
			break;

		/* Check for 'q' key to quit */
		if (use_q_quit) {
			struct timeval tv = {cfg.delay, 0};
			fd_set readfds;

			FD_ZERO(&readfds);
			FD_SET(STDIN_FILENO, &readfds);
			int r = select(STDIN_FILENO+1, &readfds, NULL, NULL, &tv);

			if (r > 0 && FD_ISSET(STDIN_FILENO, &readfds)) {
				char ch = 0;

				read(STDIN_FILENO, &ch, 1);
				if (ch == 'q' || ch == 'Q') {
					running = 0;
					break;
				}
			}
		} else {
			sleep(cfg.delay);
		}
	}

	/* Restore terminal mode */
	if (use_q_quit)
		disable_raw_mode();

	/* Cleanup */
	close(nl_sd);
	if (cfg.container_path)
		free(cfg.container_path);

	return 0;
}