xref: /linux/tools/accounting/delaytop.c (revision 5e57515d81f9003555b7a4d246e02f1ee9c74ffa)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * delaytop.c - system-wide delay monitoring tool.
4  *
 * This tool provides real-time monitoring and statistics of
 * system, container, and task-level delays, including CPU,
 * memory, IO, and IRQ. It supports an interactive (top-like)
 * mode as well as one-shot output, and can report delay
 * information for the whole system, specific containers
 * (cgroups), or individual tasks (PIDs).
10  *
11  * Key features:
12  *	- Collects per-task delay accounting statistics via taskstats.
13  *	- Collects system-wide PSI information.
14  *	- Supports sorting, filtering.
 *	- Supports interactive mode (periodic screen refresh) and one-shot output.
16  *
17  * Copyright (C) Fan Yu, ZTE Corp. 2025
18  * Copyright (C) Wang Yaxin, ZTE Corp. 2025
19  *
20  * Compile with
21  *	gcc -I/usr/src/linux/include delaytop.c -o delaytop
22  */
23 
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <errno.h>
28 #include <unistd.h>
29 #include <fcntl.h>
30 #include <getopt.h>
31 #include <signal.h>
32 #include <time.h>
33 #include <dirent.h>
34 #include <ctype.h>
35 #include <stdbool.h>
36 #include <sys/types.h>
37 #include <sys/stat.h>
38 #include <sys/socket.h>
39 #include <sys/select.h>
40 #include <termios.h>
41 #include <limits.h>
42 #include <linux/genetlink.h>
43 #include <linux/taskstats.h>
44 #include <linux/cgroupstats.h>
45 #include <stddef.h>
46 
/*
 * PSI (pressure stall information) files. The "some" and "full" lines
 * of a resource live in the same file, so both names of a resource
 * expand to the same path.
 */
#define PSI_CPU_SOME	"/proc/pressure/cpu"
#define PSI_CPU_FULL	"/proc/pressure/cpu"
#define PSI_MEMORY_SOME	"/proc/pressure/memory"
#define PSI_MEMORY_FULL	"/proc/pressure/memory"
#define PSI_IO_SOME	"/proc/pressure/io"
#define PSI_IO_FULL	"/proc/pressure/io"
#define PSI_IRQ_FULL	"/proc/pressure/irq"

/* Netlink attribute helpers; every argument is parenthesized */
#define NLA_NEXT(na)			((struct nlattr *)((char *)(na) + NLA_ALIGN((na)->nla_len)))
#define NLA_DATA(na)			((void *)((char *)(na) + NLA_HDRLEN))
#define NLA_PAYLOAD(len)		((len) - NLA_HDRLEN)

/* Cast to char * first: arithmetic on void * is a GNU extension */
#define GENLMSG_DATA(glh)		((void *)((char *)NLMSG_DATA(glh) + GENL_HDRLEN))
#define GENLMSG_PAYLOAD(glh)	(NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN)

#define TASK_COMM_LEN	16
#define MAX_MSG_SIZE	1024
#define MAX_TASKS		1000
#define MAX_BUF_LEN		256

/* Copy one field from the local 'stats' into tasks[task_count] */
#define SET_TASK_STAT(task_count, field) \
	(tasks[(task_count)].field = stats.field)

/* fprintf() wrapper that evaluates to true when the write succeeded */
#define BOOL_FPRINT(stream, fmt, ...) \
({ \
	int bool_fprint_ret_ = fprintf(stream, fmt, ##__VA_ARGS__); \
	bool_fprint_ret_ >= 0; \
})

/* Average delay in ms of <field>_delay_total over <field>_count */
#define TASK_AVG(task, field) average_ms((task).field##_delay_total, (task).field##_count)
#define PSI_LINE_FORMAT "%-12s %6.1f%%/%6.1f%%/%6.1f%%/%8llu(ms)\n"
#define DELAY_FMT_DEFAULT "%8.2f %8.2f %8.2f %8.2f\n"
#define DELAY_FMT_MEMVERBOSE "%8.2f %8.2f %8.2f %8.2f %8.2f %8.2f\n"

/* Build one struct field_desc entry for the delay field 'name' */
#define SORT_FIELD(name, cmd, modes) \
	{#name, #cmd, \
	offsetof(struct task_info, name##_delay_total), \
	offsetof(struct task_info, name##_count), \
	modes}
/* Table terminator: recognised by its NULL name */
#define END_FIELD {NULL, NULL, 0, 0, 0}

/* Display mode types */
#define MODE_TYPE_ALL	(0xFFFFFFFF)
#define MODE_DEFAULT	(1 << 0)
#define MODE_MEMVERBOSE	(1 << 1)
87 
/*
 * PSI statistics: for each resource and each line kind ("some"/"full")
 * the three running averages and the total, as parsed from the
 * /proc/pressure files.
 */
struct psi_stats {
	/* CPU */
	double cpu_some_avg10;
	double cpu_some_avg60;
	double cpu_some_avg300;
	unsigned long long cpu_some_total;
	double cpu_full_avg10;
	double cpu_full_avg60;
	double cpu_full_avg300;
	unsigned long long cpu_full_total;
	/* Memory */
	double memory_some_avg10;
	double memory_some_avg60;
	double memory_some_avg300;
	unsigned long long memory_some_total;
	double memory_full_avg10;
	double memory_full_avg60;
	double memory_full_avg300;
	unsigned long long memory_full_total;
	/* IO */
	double io_some_avg10;
	double io_some_avg60;
	double io_some_avg300;
	unsigned long long io_some_total;
	double io_full_avg10;
	double io_full_avg60;
	double io_full_avg300;
	unsigned long long io_full_total;
	/* IRQ (only a "full" line is parsed) */
	double irq_full_avg10;
	double irq_full_avg60;
	double irq_full_avg300;
	unsigned long long irq_full_total;
};
105 
106 /* Task delay information structure */
107 struct task_info {
108 	int pid;
109 	int tgid;
110 	char command[TASK_COMM_LEN];
111 	unsigned long long cpu_count;
112 	unsigned long long cpu_delay_total;
113 	unsigned long long blkio_count;
114 	unsigned long long blkio_delay_total;
115 	unsigned long long swapin_count;
116 	unsigned long long swapin_delay_total;
117 	unsigned long long freepages_count;
118 	unsigned long long freepages_delay_total;
119 	unsigned long long thrashing_count;
120 	unsigned long long thrashing_delay_total;
121 	unsigned long long compact_count;
122 	unsigned long long compact_delay_total;
123 	unsigned long long wpcopy_count;
124 	unsigned long long wpcopy_delay_total;
125 	unsigned long long irq_count;
126 	unsigned long long irq_delay_total;
127 	unsigned long long mem_count;
128 	unsigned long long mem_delay_total;
129 };
130 
/* Per-container process counts, filled from a cgroupstats reply */
struct container_stats {
	/* Number of sleeping processes */
	int nr_sleeping;
	/* Number of running processes */
	int nr_running;
	/* Number of stopped processes */
	int nr_stopped;
	/* Number of uninterruptible processes */
	int nr_uninterruptible;
	/* Number of processes in IO wait */
	int nr_io_wait;
};
139 
/* Describes one sortable delay field of struct task_info */
struct field_desc {
	/* Field name for cmdline argument */
	const char *name;
	/* Interactive command */
	const char *cmd_char;
	/* Offset of total delay in task_info */
	unsigned long total_offset;
	/* Offset of count in task_info */
	unsigned long count_offset;
	/* Supported display modes */
	size_t supported_modes;
};
148 
/* Program settings, filled in by parse_args() */
struct config {
	/* Update interval in seconds */
	int delay;
	/* Number of iterations, 0 == infinite */
	int iterations;
	/* Maximum number of processes to show */
	int max_processes;
	/* Output once and exit */
	int output_one_time;
	/* Monitor specific PID */
	int monitor_pid;
	/* Path to container cgroup */
	char *container_path;
	/* Current sort field */
	const struct field_desc *sort_field;
	/* Current display mode */
	size_t display_mode;
};
160 
161 /* Global variables */
162 static struct config cfg;
163 static struct psi_stats psi;
164 static struct task_info tasks[MAX_TASKS];
165 static int task_count;
166 static int running = 1;
167 static struct container_stats container_stats;
168 static const struct field_desc sort_fields[] = {
169 	SORT_FIELD(cpu,		c,	MODE_DEFAULT),
170 	SORT_FIELD(blkio,	i,	MODE_DEFAULT),
171 	SORT_FIELD(irq,		q,	MODE_DEFAULT),
172 	SORT_FIELD(mem,		m,	MODE_DEFAULT | MODE_MEMVERBOSE),
173 	SORT_FIELD(swapin,	s,	MODE_MEMVERBOSE),
174 	SORT_FIELD(freepages,	r,	MODE_MEMVERBOSE),
175 	SORT_FIELD(thrashing,	t,	MODE_MEMVERBOSE),
176 	SORT_FIELD(compact,	p,	MODE_MEMVERBOSE),
177 	SORT_FIELD(wpcopy,	w,	MODE_MEMVERBOSE),
178 	END_FIELD
179 };
180 static int sort_selected;
181 
182 /* Netlink socket variables */
183 static int nl_sd = -1;
184 static int family_id;
185 
/*
 * Terminal handling for interactive use.
 *
 * orig_termios holds the settings captured by enable_raw_mode();
 * raw_mode_enabled records whether they were actually captured and
 * changed, so disable_raw_mode() never writes back stale settings.
 */
static struct termios orig_termios;
static int raw_mode_enabled;

/*
 * Put stdin into non-canonical, no-echo mode so single keypresses can
 * be read without waiting for Enter. Leaves the terminal untouched if
 * its current settings cannot be read (e.g. stdin is not a terminal).
 */
static void enable_raw_mode(void)
{
	struct termios raw;

	if (tcgetattr(STDIN_FILENO, &orig_termios) < 0)
		return;

	raw = orig_termios;
	raw.c_lflag &= ~(ICANON | ECHO);
	if (tcsetattr(STDIN_FILENO, TCSAFLUSH, &raw) == 0)
		raw_mode_enabled = 1;
}

/* Restore the settings saved by enable_raw_mode(), if any were saved */
static void disable_raw_mode(void)
{
	if (!raw_mode_enabled)
		return;

	tcsetattr(STDIN_FILENO, TCSAFLUSH, &orig_termios);
	raw_mode_enabled = 0;
}
201 
202 /* Find field descriptor by command line */
203 static const struct field_desc *get_field_by_cmd_char(char ch)
204 {
205 	const struct field_desc *field;
206 
207 	for (field = sort_fields; field->name != NULL; field++) {
208 		if (field->cmd_char[0] == ch)
209 			return field;
210 	}
211 
212 	return NULL;
213 }
214 
215 /* Find field descriptor by name with string comparison */
216 static const struct field_desc *get_field_by_name(const char *name)
217 {
218 	const struct field_desc *field;
219 	size_t field_len;
220 
221 	for (field = sort_fields; field->name != NULL; field++) {
222 		field_len = strlen(field->name);
223 		if (field_len != strlen(name))
224 			continue;
225 		if (strncmp(field->name, name, field_len) == 0)
226 			return field;
227 	}
228 
229 	return NULL;
230 }
231 
232 /* Find display name for a field descriptor */
233 static const char *get_name_by_field(const struct field_desc *field)
234 {
235 	return field ? field->name : "UNKNOWN";
236 }
237 
238 /* Generate string of available field names */
239 static void display_available_fields(size_t mode)
240 {
241 	const struct field_desc *field;
242 	char buf[MAX_BUF_LEN];
243 
244 	buf[0] = '\0';
245 
246 	for (field = sort_fields; field->name != NULL; field++) {
247 		if (!(field->supported_modes & mode))
248 			continue;
249 		strncat(buf, "|", MAX_BUF_LEN - strlen(buf) - 1);
250 		strncat(buf, field->name, MAX_BUF_LEN - strlen(buf) - 1);
251 		buf[MAX_BUF_LEN - 1] = '\0';
252 	}
253 
254 	fprintf(stderr, "Available fields: %s\n", buf);
255 }
256 
/* Print the command line help to stdout and exit with status 0 */
static void usage(void)
{
	static const char help_text[] =
		"Usage: delaytop [Options]\n"
		"Options:\n"
		"  -h, --help               Show this help message and exit\n"
		"  -d, --delay=SECONDS      Set refresh interval (default: 2 seconds, min: 1)\n"
		"  -n, --iterations=COUNT   Set number of updates (default: 0 = infinite)\n"
		"  -P, --processes=NUMBER   Set maximum number of processes to show (default: 20, max: 1000)\n"
		"  -o, --once               Display once and exit\n"
		"  -p, --pid=PID            Monitor only the specified PID\n"
		"  -C, --container=PATH     Monitor the container at specified cgroup path\n"
		"  -s, --sort=FIELD         Sort by delay field (default: cpu)\n"
		"  -M, --memverbose         Display memory detailed information\n";

	/* The text contains no conversion specifiers, so fputs() suffices */
	fputs(help_text, stdout);
	exit(0);
}
273 
274 /* Parse command line arguments and set configuration */
275 static void parse_args(int argc, char **argv)
276 {
277 	int c;
278 	const struct field_desc *field;
279 	struct option long_options[] = {
280 		{"help", no_argument, 0, 'h'},
281 		{"delay", required_argument, 0, 'd'},
282 		{"iterations", required_argument, 0, 'n'},
283 		{"pid", required_argument, 0, 'p'},
284 		{"once", no_argument, 0, 'o'},
285 		{"processes", required_argument, 0, 'P'},
286 		{"sort", required_argument, 0, 's'},
287 		{"container", required_argument, 0, 'C'},
288 		{"memverbose", no_argument, 0, 'M'},
289 		{0, 0, 0, 0}
290 	};
291 
292 	/* Set defaults */
293 	cfg.delay = 2;
294 	cfg.iterations = 0;
295 	cfg.max_processes = 20;
296 	cfg.sort_field = &sort_fields[0];	/* Default sorted by CPU delay */
297 	cfg.output_one_time = 0;
298 	cfg.monitor_pid = 0;	/* 0 means monitor all PIDs */
299 	cfg.container_path = NULL;
300 	cfg.display_mode = MODE_DEFAULT;
301 
302 	while (1) {
303 		int option_index = 0;
304 
305 		c = getopt_long(argc, argv, "hd:n:p:oP:C:s:M", long_options, &option_index);
306 		if (c == -1)
307 			break;
308 
309 		switch (c) {
310 		case 'h':
311 			usage();
312 			break;
313 		case 'd':
314 			cfg.delay = atoi(optarg);
315 			if (cfg.delay < 1) {
316 				fprintf(stderr, "Error: delay must be >= 1.\n");
317 				exit(1);
318 			}
319 			break;
320 		case 'n':
321 			cfg.iterations = atoi(optarg);
322 			if (cfg.iterations < 0) {
323 				fprintf(stderr, "Error: iterations must be >= 0.\n");
324 				exit(1);
325 			}
326 			break;
327 		case 'p':
328 			cfg.monitor_pid = atoi(optarg);
329 			if (cfg.monitor_pid < 1) {
330 				fprintf(stderr, "Error: pid must be >= 1.\n");
331 				exit(1);
332 			}
333 			break;
334 		case 'o':
335 			cfg.output_one_time = 1;
336 			break;
337 		case 'P':
338 			cfg.max_processes = atoi(optarg);
339 			if (cfg.max_processes < 1) {
340 				fprintf(stderr, "Error: processes must be >= 1.\n");
341 				exit(1);
342 			}
343 			if (cfg.max_processes > MAX_TASKS) {
344 				fprintf(stderr, "Warning: processes capped to %d.\n",
345 					MAX_TASKS);
346 				cfg.max_processes = MAX_TASKS;
347 			}
348 			break;
349 		case 'C':
350 			cfg.container_path = strdup(optarg);
351 			break;
352 		case 's':
353 			if (strlen(optarg) == 0) {
354 				fprintf(stderr, "Error: empty sort field\n");
355 				exit(1);
356 			}
357 
358 			field = get_field_by_name(optarg);
359 			/* Show available fields if invalid option provided */
360 			if (!field) {
361 				fprintf(stderr, "Error: invalid sort field '%s'\n", optarg);
362 				display_available_fields(MODE_TYPE_ALL);
363 				exit(1);
364 			}
365 
366 			cfg.sort_field = field;
367 			break;
368 		case 'M':
369 			cfg.display_mode = MODE_MEMVERBOSE;
370 			cfg.sort_field = get_field_by_name("mem");
371 			break;
372 		default:
373 			fprintf(stderr, "Try 'delaytop --help' for more information.\n");
374 			exit(1);
375 		}
376 	}
377 }
378 
379 /* Calculate average delay in milliseconds for overall memory */
380 static void set_mem_delay_total(struct task_info *t)
381 {
382 	t->mem_delay_total = t->swapin_delay_total +
383 		t->freepages_delay_total +
384 		t->thrashing_delay_total +
385 		t->compact_delay_total +
386 		t->wpcopy_delay_total;
387 }
388 
389 static void set_mem_count(struct task_info *t)
390 {
391 	t->mem_count = t->swapin_count +
392 		t->freepages_count +
393 		t->thrashing_count +
394 		t->compact_count +
395 		t->wpcopy_count;
396 }
397 
/*
 * Open a raw NETLINK_GENERIC socket and bind it to a zeroed netlink
 * address. Returns the descriptor, or -1 if socket() or bind() fails.
 */
static int create_nl_socket(void)
{
	struct sockaddr_nl addr;
	int sock;

	sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
	if (sock < 0)
		return -1;

	memset(&addr, 0, sizeof(addr));
	addr.nl_family = AF_NETLINK;

	if (bind(sock, (struct sockaddr *) &addr, sizeof(addr)) >= 0)
		return sock;

	fprintf(stderr, "Failed to bind socket when create nl_socket\n");
	close(sock);
	return -1;
}
419 
420 /* Send a command via netlink */
421 static int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid,
422 			 __u8 genl_cmd, __u16 nla_type,
423 			 void *nla_data, int nla_len)
424 {
425 	struct sockaddr_nl nladdr;
426 	struct nlattr *na;
427 	int r, buflen;
428 	char *buf;
429 
430 	struct {
431 		struct nlmsghdr n;
432 		struct genlmsghdr g;
433 		char buf[MAX_MSG_SIZE];
434 	} msg;
435 
436 	msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
437 	msg.n.nlmsg_type = nlmsg_type;
438 	msg.n.nlmsg_flags = NLM_F_REQUEST;
439 	msg.n.nlmsg_seq = 0;
440 	msg.n.nlmsg_pid = nlmsg_pid;
441 	msg.g.cmd = genl_cmd;
442 	msg.g.version = 0x1;
443 	na = (struct nlattr *) GENLMSG_DATA(&msg);
444 	na->nla_type = nla_type;
445 	na->nla_len = nla_len + NLA_HDRLEN;
446 	memcpy(NLA_DATA(na), nla_data, nla_len);
447 	msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
448 
449 	buf = (char *) &msg;
450 	buflen = msg.n.nlmsg_len;
451 	memset(&nladdr, 0, sizeof(nladdr));
452 	nladdr.nl_family = AF_NETLINK;
453 	while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr,
454 					sizeof(nladdr))) < buflen) {
455 		if (r > 0) {
456 			buf += r;
457 			buflen -= r;
458 		} else if (errno != EAGAIN)
459 			return -1;
460 	}
461 	return 0;
462 }
463 
/*
 * Ask the generic netlink controller for the family ID registered
 * under TASKSTATS_GENL_NAME.
 *
 * Returns the family ID, or 0 if the request could not be sent, the
 * reply was an error or malformed, or it carried no family ID.
 */
static int get_family_id(int sd)
{
	struct {
		struct nlmsghdr n;
		struct genlmsghdr g;
		char buf[256];
	} ans;

	char name[] = TASKSTATS_GENL_NAME;
	struct nlattr *na;
	__u16 id16;
	int remaining;
	int rep_len;
	int id = 0;

	if (send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY,
		     CTRL_ATTR_FAMILY_NAME, (void *)name, sizeof(name)) < 0) {
		fprintf(stderr, "Failed to send cmd for family id\n");
		return 0;
	}

	/* Check the receive result before looking at the reply contents */
	rep_len = recv(sd, &ans, sizeof(ans), 0);
	if (rep_len < 0 || !NLMSG_OK((&ans.n), rep_len) ||
	    ans.n.nlmsg_type == NLMSG_ERROR) {
		fprintf(stderr, "Failed to receive response for family id\n");
		return 0;
	}

	/* Walk the attributes, never stepping outside the received data */
	remaining = GENLMSG_PAYLOAD(&ans.n);
	na = (struct nlattr *) GENLMSG_DATA(&ans);
	while (remaining >= NLA_HDRLEN) {
		if (na->nla_len < NLA_HDRLEN || na->nla_len > remaining)
			break;
		if (na->nla_type == CTRL_ATTR_FAMILY_ID &&
		    na->nla_len >= NLA_HDRLEN + (int)sizeof(id16)) {
			memcpy(&id16, NLA_DATA(na), sizeof(id16));
			id = id16;
			break;
		}
		remaining -= NLA_ALIGN(na->nla_len);
		na = NLA_NEXT(na);
	}

	return id;
}
501 
502 static void read_psi_stats(void)
503 {
504 	FILE *fp;
505 	char line[256];
506 	int ret = 0;
507 	/* Zero all fields */
508 	memset(&psi, 0, sizeof(psi));
509 	/* CPU pressure */
510 	fp = fopen(PSI_CPU_SOME, "r");
511 	if (fp) {
512 		while (fgets(line, sizeof(line), fp)) {
513 			if (strncmp(line, "some", 4) == 0) {
514 				ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu",
515 							&psi.cpu_some_avg10, &psi.cpu_some_avg60,
516 							&psi.cpu_some_avg300, &psi.cpu_some_total);
517 				if (ret != 4)
518 					fprintf(stderr, "Failed to parse CPU some PSI data\n");
519 			} else if (strncmp(line, "full", 4) == 0) {
520 				ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
521 						&psi.cpu_full_avg10, &psi.cpu_full_avg60,
522 						&psi.cpu_full_avg300, &psi.cpu_full_total);
523 				if (ret != 4)
524 					fprintf(stderr, "Failed to parse CPU full PSI data\n");
525 			}
526 		}
527 		fclose(fp);
528 	}
529 	/* Memory pressure */
530 	fp = fopen(PSI_MEMORY_SOME, "r");
531 	if (fp) {
532 		while (fgets(line, sizeof(line), fp)) {
533 			if (strncmp(line, "some", 4) == 0) {
534 				ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu",
535 						&psi.memory_some_avg10, &psi.memory_some_avg60,
536 						&psi.memory_some_avg300, &psi.memory_some_total);
537 				if (ret != 4)
538 					fprintf(stderr, "Failed to parse Memory some PSI data\n");
539 			} else if (strncmp(line, "full", 4) == 0) {
540 				ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
541 						&psi.memory_full_avg10, &psi.memory_full_avg60,
542 						&psi.memory_full_avg300, &psi.memory_full_total);
543 			}
544 				if (ret != 4)
545 					fprintf(stderr, "Failed to parse Memory full PSI data\n");
546 		}
547 		fclose(fp);
548 	}
549 	/* IO pressure */
550 	fp = fopen(PSI_IO_SOME, "r");
551 	if (fp) {
552 		while (fgets(line, sizeof(line), fp)) {
553 			if (strncmp(line, "some", 4) == 0) {
554 				ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu",
555 						&psi.io_some_avg10, &psi.io_some_avg60,
556 						&psi.io_some_avg300, &psi.io_some_total);
557 				if (ret != 4)
558 					fprintf(stderr, "Failed to parse IO some PSI data\n");
559 			} else if (strncmp(line, "full", 4) == 0) {
560 				ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
561 						&psi.io_full_avg10, &psi.io_full_avg60,
562 						&psi.io_full_avg300, &psi.io_full_total);
563 				if (ret != 4)
564 					fprintf(stderr, "Failed to parse IO full PSI data\n");
565 			}
566 		}
567 		fclose(fp);
568 	}
569 	/* IRQ pressure (only full) */
570 	fp = fopen(PSI_IRQ_FULL, "r");
571 	if (fp) {
572 		while (fgets(line, sizeof(line), fp)) {
573 			if (strncmp(line, "full", 4) == 0) {
574 				ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
575 						&psi.irq_full_avg10, &psi.irq_full_avg60,
576 						&psi.irq_full_avg300, &psi.irq_full_total);
577 				if (ret != 4)
578 					fprintf(stderr, "Failed to parse IRQ full PSI data\n");
579 			}
580 		}
581 		fclose(fp);
582 	}
583 }
584 
/*
 * Read the command name of 'pid' from /proc/<pid>/comm into comm_buf
 * (at most buf_size - 1 characters), without the trailing newline.
 * Returns 0 on success, -1 if the file cannot be opened or read.
 */
static int read_comm(int pid, char *comm_buf, size_t buf_size)
{
	char path[64];
	FILE *fp;
	int ret;

	snprintf(path, sizeof(path), "/proc/%d/comm", pid);
	fp = fopen(path, "r");
	if (fp == NULL) {
		fprintf(stderr, "Failed to open comm file /proc/%d/comm\n", pid);
		return -1;
	}

	if (fgets(comm_buf, buf_size, fp) == NULL) {
		ret = -1;
	} else {
		/* fgets() stops at a newline, so one can only be the last char */
		comm_buf[strcspn(comm_buf, "\n")] = '\0';
		ret = 0;
	}

	fclose(fp);

	return ret;
}
610 
611 static void fetch_and_fill_task_info(int pid, const char *comm)
612 {
613 	struct {
614 		struct nlmsghdr n;
615 		struct genlmsghdr g;
616 		char buf[MAX_MSG_SIZE];
617 	} resp;
618 	struct taskstats stats;
619 	struct nlattr *nested;
620 	struct nlattr *na;
621 	int nested_len;
622 	int nl_len;
623 	int rc;
624 
625 	/* Send request for task stats */
626 	if (send_cmd(nl_sd, family_id, getpid(), TASKSTATS_CMD_GET,
627 				 TASKSTATS_CMD_ATTR_PID, &pid, sizeof(pid)) < 0) {
628 		fprintf(stderr, "Failed to send request for task stats\n");
629 		return;
630 	}
631 
632 	/* Receive response */
633 	rc = recv(nl_sd, &resp, sizeof(resp), 0);
634 	if (rc < 0 || resp.n.nlmsg_type == NLMSG_ERROR) {
635 		fprintf(stderr, "Failed to receive response for task stats\n");
636 		return;
637 	}
638 
639 	/* Parse response */
640 	nl_len = GENLMSG_PAYLOAD(&resp.n);
641 	na = (struct nlattr *) GENLMSG_DATA(&resp);
642 	while (nl_len > 0) {
643 		if (na->nla_type == TASKSTATS_TYPE_AGGR_PID) {
644 			nested = (struct nlattr *) NLA_DATA(na);
645 			nested_len = NLA_PAYLOAD(na->nla_len);
646 			while (nested_len > 0) {
647 				if (nested->nla_type == TASKSTATS_TYPE_STATS) {
648 					memcpy(&stats, NLA_DATA(nested), sizeof(stats));
649 					if (task_count < MAX_TASKS) {
650 						tasks[task_count].pid = pid;
651 						tasks[task_count].tgid = pid;
652 						strncpy(tasks[task_count].command, comm,
653 							TASK_COMM_LEN - 1);
654 						tasks[task_count].command[TASK_COMM_LEN - 1] = '\0';
655 						SET_TASK_STAT(task_count, cpu_count);
656 						SET_TASK_STAT(task_count, cpu_delay_total);
657 						SET_TASK_STAT(task_count, blkio_count);
658 						SET_TASK_STAT(task_count, blkio_delay_total);
659 						SET_TASK_STAT(task_count, swapin_count);
660 						SET_TASK_STAT(task_count, swapin_delay_total);
661 						SET_TASK_STAT(task_count, freepages_count);
662 						SET_TASK_STAT(task_count, freepages_delay_total);
663 						SET_TASK_STAT(task_count, thrashing_count);
664 						SET_TASK_STAT(task_count, thrashing_delay_total);
665 						SET_TASK_STAT(task_count, compact_count);
666 						SET_TASK_STAT(task_count, compact_delay_total);
667 						SET_TASK_STAT(task_count, wpcopy_count);
668 						SET_TASK_STAT(task_count, wpcopy_delay_total);
669 						SET_TASK_STAT(task_count, irq_count);
670 						SET_TASK_STAT(task_count, irq_delay_total);
671 						set_mem_count(&tasks[task_count]);
672 						set_mem_delay_total(&tasks[task_count]);
673 						task_count++;
674 					}
675 					break;
676 				}
677 				nested_len -= NLA_ALIGN(nested->nla_len);
678 				nested = NLA_NEXT(nested);
679 			}
680 		}
681 		nl_len -= NLA_ALIGN(na->nla_len);
682 		na = NLA_NEXT(na);
683 	}
684 	return;
685 }
686 
687 static void get_task_delays(void)
688 {
689 	char comm[TASK_COMM_LEN];
690 	struct dirent *entry;
691 	DIR *dir;
692 	int pid;
693 
694 	task_count = 0;
695 	if (cfg.monitor_pid > 0) {
696 		if (read_comm(cfg.monitor_pid, comm, sizeof(comm)) == 0)
697 			fetch_and_fill_task_info(cfg.monitor_pid, comm);
698 		return;
699 	}
700 
701 	dir = opendir("/proc");
702 	if (!dir) {
703 		fprintf(stderr, "Error opening /proc directory\n");
704 		return;
705 	}
706 
707 	while ((entry = readdir(dir)) != NULL && task_count < MAX_TASKS) {
708 		if (!isdigit(entry->d_name[0]))
709 			continue;
710 		pid = atoi(entry->d_name);
711 		if (pid == 0)
712 			continue;
713 		if (read_comm(pid, comm, sizeof(comm)) != 0)
714 			continue;
715 		fetch_and_fill_task_info(pid, comm);
716 	}
717 	closedir(dir);
718 }
719 
/*
 * Average of 'total' over 'count' events, with 'total' scaled down by
 * 1000000 (used to show the delay totals as milliseconds).
 * Returns 0 when count is 0.
 */
static double average_ms(unsigned long long total, unsigned long long count)
{
	return count ? (double)total / 1000000.0 / count : 0;
}
727 
728 /* Comparison function for sorting tasks */
729 static int compare_tasks(const void *a, const void *b)
730 {
731 	const struct task_info *t1 = (const struct task_info *)a;
732 	const struct task_info *t2 = (const struct task_info *)b;
733 	unsigned long long total1;
734 	unsigned long long total2;
735 	unsigned long count1;
736 	unsigned long count2;
737 	double avg1, avg2;
738 
739 	total1 = *(unsigned long long *)((char *)t1 + cfg.sort_field->total_offset);
740 	total2 = *(unsigned long long *)((char *)t2 + cfg.sort_field->total_offset);
741 	count1 = *(unsigned long *)((char *)t1 + cfg.sort_field->count_offset);
742 	count2 = *(unsigned long *)((char *)t2 + cfg.sort_field->count_offset);
743 
744 	avg1 = average_ms(total1, count1);
745 	avg2 = average_ms(total2, count2);
746 	if (avg1 != avg2)
747 		return avg2 > avg1 ? 1 : -1;
748 
749 	return 0;
750 }
751 
752 /* Sort tasks by selected field */
753 static void sort_tasks(void)
754 {
755 	if (task_count > 0)
756 		qsort(tasks, task_count, sizeof(struct task_info), compare_tasks);
757 }
758 
759 /* Get container statistics via cgroupstats */
760 static void get_container_stats(void)
761 {
762 	int rc, cfd;
763 	struct {
764 		struct nlmsghdr n;
765 		struct genlmsghdr g;
766 		char buf[MAX_MSG_SIZE];
767 	} req, resp;
768 	struct nlattr *na;
769 	int nl_len;
770 	struct cgroupstats stats;
771 
772 	/* Check if container path is set */
773 	if (!cfg.container_path)
774 		return;
775 
776 	/* Open container cgroup */
777 	cfd = open(cfg.container_path, O_RDONLY);
778 	if (cfd < 0) {
779 		fprintf(stderr, "Error opening container path: %s\n", cfg.container_path);
780 		return;
781 	}
782 
783 	/* Send request for container stats */
784 	if (send_cmd(nl_sd, family_id, getpid(), CGROUPSTATS_CMD_GET,
785 				CGROUPSTATS_CMD_ATTR_FD, &cfd, sizeof(__u32)) < 0) {
786 		fprintf(stderr, "Failed to send request for container stats\n");
787 		close(cfd);
788 		return;
789 	}
790 
791 	/* Receive response */
792 	rc = recv(nl_sd, &resp, sizeof(resp), 0);
793 	if (rc < 0 || resp.n.nlmsg_type == NLMSG_ERROR) {
794 		fprintf(stderr, "Failed to receive response for container stats\n");
795 		close(cfd);
796 		return;
797 	}
798 
799 	/* Parse response */
800 	nl_len = GENLMSG_PAYLOAD(&resp.n);
801 	na = (struct nlattr *) GENLMSG_DATA(&resp);
802 	while (nl_len > 0) {
803 		if (na->nla_type == CGROUPSTATS_TYPE_CGROUP_STATS) {
804 			/* Get the cgroupstats structure */
805 			memcpy(&stats, NLA_DATA(na), sizeof(stats));
806 
807 			/* Fill container stats */
808 			container_stats.nr_sleeping = stats.nr_sleeping;
809 			container_stats.nr_running = stats.nr_running;
810 			container_stats.nr_stopped = stats.nr_stopped;
811 			container_stats.nr_uninterruptible = stats.nr_uninterruptible;
812 			container_stats.nr_io_wait = stats.nr_io_wait;
813 			break;
814 		}
815 		nl_len -= NLA_ALIGN(na->nla_len);
816 		na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len));
817 	}
818 
819 	close(cfd);
820 }
821 
822 /* Display results to stdout or log file */
823 static void display_results(void)
824 {
825 	time_t now = time(NULL);
826 	struct tm *tm_now = localtime(&now);
827 	FILE *out = stdout;
828 	char timestamp[32];
829 	bool suc = true;
830 	int i, count;
831 
832 	/* Clear terminal screen */
833 	suc &= BOOL_FPRINT(out, "\033[H\033[J");
834 
835 	/* PSI output (one-line, no cat style) */
836 	suc &= BOOL_FPRINT(out, "System Pressure Information: (avg10/avg60/avg300/total)\n");
837 	suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
838 		"CPU some:",
839 		psi.cpu_some_avg10,
840 		psi.cpu_some_avg60,
841 		psi.cpu_some_avg300,
842 		psi.cpu_some_total / 1000);
843 	suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
844 		"CPU full:",
845 		psi.cpu_full_avg10,
846 		psi.cpu_full_avg60,
847 		psi.cpu_full_avg300,
848 		psi.cpu_full_total / 1000);
849 	suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
850 		"Memory full:",
851 		psi.memory_full_avg10,
852 		psi.memory_full_avg60,
853 		psi.memory_full_avg300,
854 		psi.memory_full_total / 1000);
855 	suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
856 		"Memory some:",
857 		psi.memory_some_avg10,
858 		psi.memory_some_avg60,
859 		psi.memory_some_avg300,
860 		psi.memory_some_total / 1000);
861 	suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
862 		"IO full:",
863 		psi.io_full_avg10,
864 		psi.io_full_avg60,
865 		psi.io_full_avg300,
866 		psi.io_full_total / 1000);
867 	suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
868 		"IO some:",
869 		psi.io_some_avg10,
870 		psi.io_some_avg60,
871 		psi.io_some_avg300,
872 		psi.io_some_total / 1000);
873 	suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
874 		"IRQ full:",
875 		psi.irq_full_avg10,
876 		psi.irq_full_avg60,
877 		psi.irq_full_avg300,
878 		psi.irq_full_total / 1000);
879 
880 	if (cfg.container_path) {
881 		suc &= BOOL_FPRINT(out, "Container Information (%s):\n", cfg.container_path);
882 		suc &= BOOL_FPRINT(out, "Processes: running=%d, sleeping=%d, ",
883 			container_stats.nr_running, container_stats.nr_sleeping);
884 		suc &= BOOL_FPRINT(out, "stopped=%d, uninterruptible=%d, io_wait=%d\n\n",
885 			container_stats.nr_stopped, container_stats.nr_uninterruptible,
886 			container_stats.nr_io_wait);
887 	}
888 
889 	/* Interacive command */
890 	suc &= BOOL_FPRINT(out, "[o]sort [M]memverbose [q]quit\n");
891 	if (sort_selected) {
892 		if (cfg.display_mode == MODE_MEMVERBOSE)
893 			suc &= BOOL_FPRINT(out,
894 				"sort selection: [m]MEM [r]RCL [t]THR [p]CMP [w]WP\n");
895 		else
896 			suc &= BOOL_FPRINT(out,
897 				"sort selection: [c]CPU [i]IO [m]MEM [q]IRQ\n");
898 	}
899 
900 	/* Task delay output */
901 	suc &= BOOL_FPRINT(out, "Top %d processes (sorted by %s delay):\n",
902 			cfg.max_processes, get_name_by_field(cfg.sort_field));
903 
904 	suc &= BOOL_FPRINT(out, "%8s  %8s  %-17s", "PID", "TGID", "COMMAND");
905 	if (cfg.display_mode == MODE_MEMVERBOSE) {
906 		suc &= BOOL_FPRINT(out, "%8s %8s %8s %8s %8s %8s\n",
907 			"MEM(ms)", "SWAP(ms)", "RCL(ms)",
908 			"THR(ms)", "CMP(ms)", "WP(ms)");
909 		suc &= BOOL_FPRINT(out, "-----------------------");
910 		suc &= BOOL_FPRINT(out, "-----------------------");
911 		suc &= BOOL_FPRINT(out, "-----------------------");
912 		suc &= BOOL_FPRINT(out, "---------------------\n");
913 	} else {
914 		suc &= BOOL_FPRINT(out, "%8s %8s %8s %8s\n",
915 			"CPU(ms)", "IO(ms)", "IRQ(ms)", "MEM(ms)");
916 		suc &= BOOL_FPRINT(out, "-----------------------");
917 		suc &= BOOL_FPRINT(out, "-----------------------");
918 		suc &= BOOL_FPRINT(out, "--------------------------\n");
919 	}
920 
921 	count = task_count < cfg.max_processes ? task_count : cfg.max_processes;
922 
923 	for (i = 0; i < count; i++) {
924 		suc &= BOOL_FPRINT(out, "%8d  %8d  %-15s",
925 			tasks[i].pid, tasks[i].tgid, tasks[i].command);
926 		if (cfg.display_mode == MODE_MEMVERBOSE) {
927 			suc &= BOOL_FPRINT(out, DELAY_FMT_MEMVERBOSE,
928 				TASK_AVG(tasks[i], mem),
929 				TASK_AVG(tasks[i], swapin),
930 				TASK_AVG(tasks[i], freepages),
931 				TASK_AVG(tasks[i], thrashing),
932 				TASK_AVG(tasks[i], compact),
933 				TASK_AVG(tasks[i], wpcopy));
934 		} else {
935 			suc &= BOOL_FPRINT(out, DELAY_FMT_DEFAULT,
936 				TASK_AVG(tasks[i], cpu),
937 				TASK_AVG(tasks[i], blkio),
938 				TASK_AVG(tasks[i], irq),
939 				TASK_AVG(tasks[i], mem));
940 		}
941 	}
942 
943 	suc &= BOOL_FPRINT(out, "\n");
944 
945 	if (!suc)
946 		perror("Error writing to output");
947 }
948 
949 /* Check for keyboard input with timeout based on cfg.delay */
950 static char check_for_keypress(void)
951 {
952 	struct timeval tv = {cfg.delay, 0};
953 	fd_set readfds;
954 	char ch = 0;
955 
956 	FD_ZERO(&readfds);
957 	FD_SET(STDIN_FILENO, &readfds);
958 	int r = select(STDIN_FILENO + 1, &readfds, NULL, NULL, &tv);
959 
960 	if (r > 0 && FD_ISSET(STDIN_FILENO, &readfds)) {
961 		read(STDIN_FILENO, &ch, 1);
962 		return ch;
963 	}
964 
965 	return 0;
966 }
967 
968 #define MAX_MODE_SIZE 2
969 static void toggle_display_mode(void)
970 {
971 	static const size_t modes[MAX_MODE_SIZE] = {MODE_DEFAULT, MODE_MEMVERBOSE};
972 	static size_t cur_index;
973 
974 	cur_index = (cur_index + 1) % MAX_MODE_SIZE;
975 	cfg.display_mode = modes[cur_index];
976 }
977 
978 /* Handle keyboard input: sorting selection, mode toggle, or quit */
979 static void handle_keypress(char ch, int *running)
980 {
981 	const struct field_desc *field;
982 
983 	/* Change sort field */
984 	if (sort_selected) {
985 		field = get_field_by_cmd_char(ch);
986 		if (field && (field->supported_modes & cfg.display_mode))
987 			cfg.sort_field = field;
988 
989 		sort_selected = 0;
990 	/* Handle mode changes or quit */
991 	} else {
992 		switch (ch) {
993 		case 'o':
994 			sort_selected = 1;
995 			break;
996 		case 'M':
997 			toggle_display_mode();
998 			for (field = sort_fields; field->name != NULL; field++) {
999 				if (field->supported_modes & cfg.display_mode) {
1000 					cfg.sort_field = field;
1001 					break;
1002 				}
1003 			}
1004 			break;
1005 		case 'q':
1006 		case 'Q':
1007 			*running = 0;
1008 			break;
1009 		default:
1010 			break;
1011 		}
1012 	}
1013 }
1014 
1015 /* Main function */
1016 int main(int argc, char **argv)
1017 {
1018 	const struct field_desc *field;
1019 	int iterations = 0;
1020 	char keypress;
1021 
1022 	/* Parse command line arguments */
1023 	parse_args(argc, argv);
1024 
1025 	/* Setup netlink socket */
1026 	nl_sd = create_nl_socket();
1027 	if (nl_sd < 0) {
1028 		fprintf(stderr, "Error creating netlink socket\n");
1029 		exit(1);
1030 	}
1031 
1032 	/* Get family ID for taskstats via netlink */
1033 	family_id = get_family_id(nl_sd);
1034 	if (!family_id) {
1035 		fprintf(stderr, "Error getting taskstats family ID\n");
1036 		close(nl_sd);
1037 		exit(1);
1038 	}
1039 
1040 	/* Set terminal to non-canonical mode for interaction */
1041 	enable_raw_mode();
1042 
1043 	/* Main loop */
1044 	while (running) {
1045 		/* Auto-switch sort field when not matching display mode */
1046 		if (!(cfg.sort_field->supported_modes & cfg.display_mode)) {
1047 			for (field = sort_fields; field->name != NULL; field++) {
1048 				if (field->supported_modes & cfg.display_mode) {
1049 					cfg.sort_field = field;
1050 					printf("Auto-switched sort field to: %s\n", field->name);
1051 					break;
1052 				}
1053 			}
1054 		}
1055 
1056 		/* Read PSI statistics */
1057 		read_psi_stats();
1058 
1059 		/* Get container stats if container path provided */
1060 		if (cfg.container_path)
1061 			get_container_stats();
1062 
1063 		/* Get task delays */
1064 		get_task_delays();
1065 
1066 		/* Sort tasks */
1067 		sort_tasks();
1068 
1069 		/* Display results to stdout or log file */
1070 		display_results();
1071 
1072 		/* Check for iterations */
1073 		if (cfg.iterations > 0 && ++iterations >= cfg.iterations)
1074 			break;
1075 
1076 		/* Exit if output_one_time is set */
1077 		if (cfg.output_one_time)
1078 			break;
1079 
1080 		/* Keypress for interactive usage */
1081 		keypress = check_for_keypress();
1082 		if (keypress)
1083 			handle_keypress(keypress, &running);
1084 	}
1085 
1086 	/* Restore terminal mode */
1087 	disable_raw_mode();
1088 
1089 	/* Cleanup */
1090 	close(nl_sd);
1091 	if (cfg.container_path)
1092 		free(cfg.container_path);
1093 
1094 	return 0;
1095 }
1096