xref: /linux/tools/accounting/delaytop.c (revision 99d9c55f88e69ebbfc90e05ce7c320bdb3901d03)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * delaytop.c - system-wide delay monitoring tool.
4  *
 * This tool provides real-time monitoring and statistics of
 * system, container, and task-level delays, including CPU,
 * memory, IO, and IRQ. It supports both an interactive (top-like)
 * mode and one-shot output, and can report delay information for
 * the whole system, specific containers (cgroups), or individual
 * tasks (PIDs).
10  *
11  * Key features:
12  *	- Collects per-task delay accounting statistics via taskstats.
13  *	- Collects system-wide PSI information.
14  *	- Supports sorting, filtering.
 *	- Supports both interactive (screen refresh) and one-shot output.
16  *
17  * Copyright (C) Fan Yu, ZTE Corp. 2025
18  * Copyright (C) Wang Yaxin, ZTE Corp. 2025
19  *
20  * Compile with
21  *	gcc -I/usr/src/linux/include delaytop.c -o delaytop
22  */
23 
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <errno.h>
28 #include <unistd.h>
29 #include <fcntl.h>
30 #include <getopt.h>
31 #include <signal.h>
32 #include <time.h>
33 #include <dirent.h>
34 #include <ctype.h>
35 #include <stdbool.h>
36 #include <sys/types.h>
37 #include <sys/stat.h>
38 #include <sys/socket.h>
39 #include <sys/select.h>
40 #include <termios.h>
41 #include <limits.h>
42 #include <linux/genetlink.h>
43 #include <linux/taskstats.h>
44 #include <linux/cgroupstats.h>
45 #include <stddef.h>
46 
47 #define PSI_CPU_SOME "/proc/pressure/cpu"
48 #define PSI_CPU_FULL	"/proc/pressure/cpu"
49 #define PSI_MEMORY_SOME "/proc/pressure/memory"
50 #define PSI_MEMORY_FULL "/proc/pressure/memory"
51 #define PSI_IO_SOME "/proc/pressure/io"
52 #define PSI_IO_FULL "/proc/pressure/io"
53 #define PSI_IRQ_FULL	"/proc/pressure/irq"
54 
55 #define NLA_NEXT(na)			((struct nlattr *)((char *)(na) + NLA_ALIGN((na)->nla_len)))
56 #define NLA_DATA(na)			((void *)((char *)(na) + NLA_HDRLEN))
57 #define NLA_PAYLOAD(len)		(len - NLA_HDRLEN)
58 
59 #define GENLMSG_DATA(glh)		((void *)(NLMSG_DATA(glh) + GENL_HDRLEN))
60 #define GENLMSG_PAYLOAD(glh)	(NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN)
61 
62 #define TASK_COMM_LEN	16
63 #define MAX_MSG_SIZE	1024
64 #define MAX_TASKS		1000
65 #define MAX_BUF_LEN		256
66 #define SET_TASK_STAT(task_count, field) tasks[task_count].field = stats.field
67 #define BOOL_FPRINT(stream, fmt, ...) \
68 ({ \
69 	int ret = fprintf(stream, fmt, ##__VA_ARGS__); \
70 	ret >= 0; \
71 })
72 #define TASK_AVG(task, field) average_ms((task).field##_delay_total, (task).field##_count)
73 #define PSI_LINE_FORMAT "%-12s %6.1f%%/%6.1f%%/%6.1f%%/%8llu(ms)\n"
74 #define DELAY_FMT_DEFAULT "%8.2f %8.2f %8.2f %8.2f\n"
75 #define DELAY_FMT_MEMVERBOSE "%8.2f %8.2f %8.2f %8.2f %8.2f %8.2f\n"
76 #define SORT_FIELD(name, modes) \
77 	{#name, \
78 	offsetof(struct task_info, name##_delay_total), \
79 	offsetof(struct task_info, name##_count), \
80 	modes}
81 #define END_FIELD {NULL, 0, 0}
82 
83 /* Display mode types */
84 #define MODE_TYPE_ALL	(0xFFFFFFFF)
85 #define MODE_DEFAULT	(1 << 0)
86 #define MODE_MEMVERBOSE	(1 << 1)
87 
88 /* PSI statistics structure */
89 struct psi_stats {
90 	double cpu_some_avg10, cpu_some_avg60, cpu_some_avg300;
91 	unsigned long long cpu_some_total;
92 	double cpu_full_avg10, cpu_full_avg60, cpu_full_avg300;
93 	unsigned long long cpu_full_total;
94 	double memory_some_avg10, memory_some_avg60, memory_some_avg300;
95 	unsigned long long memory_some_total;
96 	double memory_full_avg10, memory_full_avg60, memory_full_avg300;
97 	unsigned long long memory_full_total;
98 	double io_some_avg10, io_some_avg60, io_some_avg300;
99 	unsigned long long io_some_total;
100 	double io_full_avg10, io_full_avg60, io_full_avg300;
101 	unsigned long long io_full_total;
102 	double irq_full_avg10, irq_full_avg60, irq_full_avg300;
103 	unsigned long long irq_full_total;
104 };
105 
106 /* Task delay information structure */
107 struct task_info {
108 	int pid;
109 	int tgid;
110 	char command[TASK_COMM_LEN];
111 	unsigned long long cpu_count;
112 	unsigned long long cpu_delay_total;
113 	unsigned long long blkio_count;
114 	unsigned long long blkio_delay_total;
115 	unsigned long long swapin_count;
116 	unsigned long long swapin_delay_total;
117 	unsigned long long freepages_count;
118 	unsigned long long freepages_delay_total;
119 	unsigned long long thrashing_count;
120 	unsigned long long thrashing_delay_total;
121 	unsigned long long compact_count;
122 	unsigned long long compact_delay_total;
123 	unsigned long long wpcopy_count;
124 	unsigned long long wpcopy_delay_total;
125 	unsigned long long irq_count;
126 	unsigned long long irq_delay_total;
127 	unsigned long long mem_count;
128 	unsigned long long mem_delay_total;
129 };
130 
131 /* Container statistics structure */
132 struct container_stats {
133 	int nr_sleeping;		/* Number of sleeping processes */
134 	int nr_running;			/* Number of running processes */
135 	int nr_stopped;			/* Number of stopped processes */
136 	int nr_uninterruptible; /* Number of uninterruptible processes */
137 	int nr_io_wait;			/* Number of processes in IO wait */
138 };
139 
140 /* Delay field structure */
141 struct field_desc {
142 	const char *name;	/* Field name for cmdline argument */
143 	unsigned long total_offset; /* Offset of total delay in task_info */
144 	unsigned long count_offset; /* Offset of count in task_info */
145 	size_t supported_modes; /* Supported display modes */
146 };
147 
148 /* Program settings structure */
149 struct config {
150 	int delay;				/* Update interval in seconds */
151 	int iterations;			/* Number of iterations, 0 == infinite */
152 	int max_processes;		/* Maximum number of processes to show */
153 	int output_one_time;	/* Output once and exit */
154 	int monitor_pid;		/* Monitor specific PID */
155 	char *container_path;	/* Path to container cgroup */
156 	const struct field_desc *sort_field;	/* Current sort field */
157 	size_t display_mode;	/* Current display mode */
158 };
159 
160 /* Global variables */
161 static struct config cfg;
162 static struct psi_stats psi;
163 static struct task_info tasks[MAX_TASKS];
164 static int task_count;
165 static int running = 1;
166 static struct container_stats container_stats;
167 static const struct field_desc sort_fields[] = {
168 	SORT_FIELD(cpu,		MODE_DEFAULT),
169 	SORT_FIELD(blkio,	MODE_DEFAULT),
170 	SORT_FIELD(irq,		MODE_DEFAULT),
171 	SORT_FIELD(mem,		MODE_DEFAULT | MODE_MEMVERBOSE),
172 	SORT_FIELD(swapin,	MODE_MEMVERBOSE),
173 	SORT_FIELD(freepages,	MODE_MEMVERBOSE),
174 	SORT_FIELD(thrashing,	MODE_MEMVERBOSE),
175 	SORT_FIELD(compact,	MODE_MEMVERBOSE),
176 	SORT_FIELD(wpcopy,	MODE_MEMVERBOSE),
177 	END_FIELD
178 };
179 
180 /* Netlink socket variables */
181 static int nl_sd = -1;
182 static int family_id;
183 
184 /* Set terminal to non-canonical mode for q-to-quit */
185 static struct termios orig_termios;
186 static void enable_raw_mode(void)
187 {
188 	struct termios raw;
189 
190 	tcgetattr(STDIN_FILENO, &orig_termios);
191 	raw = orig_termios;
192 	raw.c_lflag &= ~(ICANON | ECHO);
193 	tcsetattr(STDIN_FILENO, TCSAFLUSH, &raw);
194 }
195 static void disable_raw_mode(void)
196 {
197 	tcsetattr(STDIN_FILENO, TCSAFLUSH, &orig_termios);
198 }
199 
200 /* Find field descriptor by name with string comparison */
201 static const struct field_desc *get_field_by_name(const char *name)
202 {
203 	const struct field_desc *field;
204 	size_t field_len;
205 
206 	for (field = sort_fields; field->name != NULL; field++) {
207 		field_len = strlen(field->name);
208 		if (field_len != strlen(name))
209 			continue;
210 		if (strncmp(field->name, name, field_len) == 0)
211 			return field;
212 	}
213 
214 	return NULL;
215 }
216 
217 /* Find display name for a field descriptor */
218 static const char *get_name_by_field(const struct field_desc *field)
219 {
220 	return field ? field->name : "UNKNOWN";
221 }
222 
223 /* Generate string of available field names */
224 static void display_available_fields(size_t mode)
225 {
226 	const struct field_desc *field;
227 	char buf[MAX_BUF_LEN];
228 
229 	buf[0] = '\0';
230 
231 	for (field = sort_fields; field->name != NULL; field++) {
232 		if (!(field->supported_modes & mode))
233 			continue;
234 		strncat(buf, "|", MAX_BUF_LEN - strlen(buf) - 1);
235 		strncat(buf, field->name, MAX_BUF_LEN - strlen(buf) - 1);
236 		buf[MAX_BUF_LEN - 1] = '\0';
237 	}
238 
239 	fprintf(stderr, "Available fields: %s\n", buf);
240 }
241 
242 /* Display usage information and command line options */
243 static void usage(void)
244 {
245 	printf("Usage: delaytop [Options]\n"
246 	"Options:\n"
247 	"  -h, --help               Show this help message and exit\n"
248 	"  -d, --delay=SECONDS      Set refresh interval (default: 2 seconds, min: 1)\n"
249 	"  -n, --iterations=COUNT   Set number of updates (default: 0 = infinite)\n"
250 	"  -P, --processes=NUMBER   Set maximum number of processes to show (default: 20, max: 1000)\n"
251 	"  -o, --once               Display once and exit\n"
252 	"  -p, --pid=PID            Monitor only the specified PID\n"
253 	"  -C, --container=PATH     Monitor the container at specified cgroup path\n"
254 	"  -s, --sort=FIELD         Sort by delay field (default: cpu)\n"
255 	"  -M, --memverbose         Display memory detailed information\n");
256 	exit(0);
257 }
258 
259 /* Parse command line arguments and set configuration */
260 static void parse_args(int argc, char **argv)
261 {
262 	int c;
263 	const struct field_desc *field;
264 	struct option long_options[] = {
265 		{"help", no_argument, 0, 'h'},
266 		{"delay", required_argument, 0, 'd'},
267 		{"iterations", required_argument, 0, 'n'},
268 		{"pid", required_argument, 0, 'p'},
269 		{"once", no_argument, 0, 'o'},
270 		{"processes", required_argument, 0, 'P'},
271 		{"sort", required_argument, 0, 's'},
272 		{"container", required_argument, 0, 'C'},
273 		{"memverbose", no_argument, 0, 'M'},
274 		{0, 0, 0, 0}
275 	};
276 
277 	/* Set defaults */
278 	cfg.delay = 2;
279 	cfg.iterations = 0;
280 	cfg.max_processes = 20;
281 	cfg.sort_field = &sort_fields[0];	/* Default sorted by CPU delay */
282 	cfg.output_one_time = 0;
283 	cfg.monitor_pid = 0;	/* 0 means monitor all PIDs */
284 	cfg.container_path = NULL;
285 	cfg.display_mode = MODE_DEFAULT;
286 
287 	while (1) {
288 		int option_index = 0;
289 
290 		c = getopt_long(argc, argv, "hd:n:p:oP:C:s:M", long_options, &option_index);
291 		if (c == -1)
292 			break;
293 
294 		switch (c) {
295 		case 'h':
296 			usage();
297 			break;
298 		case 'd':
299 			cfg.delay = atoi(optarg);
300 			if (cfg.delay < 1) {
301 				fprintf(stderr, "Error: delay must be >= 1.\n");
302 				exit(1);
303 			}
304 			break;
305 		case 'n':
306 			cfg.iterations = atoi(optarg);
307 			if (cfg.iterations < 0) {
308 				fprintf(stderr, "Error: iterations must be >= 0.\n");
309 				exit(1);
310 			}
311 			break;
312 		case 'p':
313 			cfg.monitor_pid = atoi(optarg);
314 			if (cfg.monitor_pid < 1) {
315 				fprintf(stderr, "Error: pid must be >= 1.\n");
316 				exit(1);
317 			}
318 			break;
319 		case 'o':
320 			cfg.output_one_time = 1;
321 			break;
322 		case 'P':
323 			cfg.max_processes = atoi(optarg);
324 			if (cfg.max_processes < 1) {
325 				fprintf(stderr, "Error: processes must be >= 1.\n");
326 				exit(1);
327 			}
328 			if (cfg.max_processes > MAX_TASKS) {
329 				fprintf(stderr, "Warning: processes capped to %d.\n",
330 					MAX_TASKS);
331 				cfg.max_processes = MAX_TASKS;
332 			}
333 			break;
334 		case 'C':
335 			cfg.container_path = strdup(optarg);
336 			break;
337 		case 's':
338 			if (strlen(optarg) == 0) {
339 				fprintf(stderr, "Error: empty sort field\n");
340 				exit(1);
341 			}
342 
343 			field = get_field_by_name(optarg);
344 			/* Show available fields if invalid option provided */
345 			if (!field) {
346 				fprintf(stderr, "Error: invalid sort field '%s'\n", optarg);
347 				display_available_fields(MODE_TYPE_ALL);
348 				exit(1);
349 			}
350 
351 			cfg.sort_field = field;
352 			break;
353 		case 'M':
354 			cfg.display_mode = MODE_MEMVERBOSE;
355 			cfg.sort_field = get_field_by_name("mem");
356 			break;
357 		default:
358 			fprintf(stderr, "Try 'delaytop --help' for more information.\n");
359 			exit(1);
360 		}
361 	}
362 }
363 
364 /* Calculate average delay in milliseconds for overall memory */
365 static void set_mem_delay_total(struct task_info *t)
366 {
367 	t->mem_delay_total = t->swapin_delay_total +
368 		t->freepages_delay_total +
369 		t->thrashing_delay_total +
370 		t->compact_delay_total +
371 		t->wpcopy_delay_total;
372 }
373 
374 static void set_mem_count(struct task_info *t)
375 {
376 	t->mem_count = t->swapin_count +
377 		t->freepages_count +
378 		t->thrashing_count +
379 		t->compact_count +
380 		t->wpcopy_count;
381 }
382 
383 /* Create a raw netlink socket and bind */
384 static int create_nl_socket(void)
385 {
386 	int fd;
387 	struct sockaddr_nl local;
388 
389 	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
390 	if (fd < 0)
391 		return -1;
392 
393 	memset(&local, 0, sizeof(local));
394 	local.nl_family = AF_NETLINK;
395 
396 	if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0) {
397 		fprintf(stderr, "Failed to bind socket when create nl_socket\n");
398 		close(fd);
399 		return -1;
400 	}
401 
402 	return fd;
403 }
404 
405 /* Send a command via netlink */
406 static int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid,
407 			 __u8 genl_cmd, __u16 nla_type,
408 			 void *nla_data, int nla_len)
409 {
410 	struct sockaddr_nl nladdr;
411 	struct nlattr *na;
412 	int r, buflen;
413 	char *buf;
414 
415 	struct {
416 		struct nlmsghdr n;
417 		struct genlmsghdr g;
418 		char buf[MAX_MSG_SIZE];
419 	} msg;
420 
421 	msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
422 	msg.n.nlmsg_type = nlmsg_type;
423 	msg.n.nlmsg_flags = NLM_F_REQUEST;
424 	msg.n.nlmsg_seq = 0;
425 	msg.n.nlmsg_pid = nlmsg_pid;
426 	msg.g.cmd = genl_cmd;
427 	msg.g.version = 0x1;
428 	na = (struct nlattr *) GENLMSG_DATA(&msg);
429 	na->nla_type = nla_type;
430 	na->nla_len = nla_len + NLA_HDRLEN;
431 	memcpy(NLA_DATA(na), nla_data, nla_len);
432 	msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
433 
434 	buf = (char *) &msg;
435 	buflen = msg.n.nlmsg_len;
436 	memset(&nladdr, 0, sizeof(nladdr));
437 	nladdr.nl_family = AF_NETLINK;
438 	while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr,
439 					sizeof(nladdr))) < buflen) {
440 		if (r > 0) {
441 			buf += r;
442 			buflen -= r;
443 		} else if (errno != EAGAIN)
444 			return -1;
445 	}
446 	return 0;
447 }
448 
449 /* Get family ID for taskstats via netlink */
450 static int get_family_id(int sd)
451 {
452 	struct {
453 		struct nlmsghdr n;
454 		struct genlmsghdr g;
455 		char buf[256];
456 	} ans;
457 
458 	int id = 0, rc;
459 	struct nlattr *na;
460 	int rep_len;
461 	char name[100];
462 
463 	strncpy(name, TASKSTATS_GENL_NAME, sizeof(name) - 1);
464 	name[sizeof(name) - 1] = '\0';
465 	rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY,
466 			CTRL_ATTR_FAMILY_NAME, (void *)name,
467 			strlen(TASKSTATS_GENL_NAME)+1);
468 	if (rc < 0) {
469 		fprintf(stderr, "Failed to send cmd for family id\n");
470 		return 0;
471 	}
472 
473 	rep_len = recv(sd, &ans, sizeof(ans), 0);
474 	if (ans.n.nlmsg_type == NLMSG_ERROR ||
475 		(rep_len < 0) || !NLMSG_OK((&ans.n), rep_len)) {
476 		fprintf(stderr, "Failed to receive response for family id\n");
477 		return 0;
478 	}
479 
480 	na = (struct nlattr *) GENLMSG_DATA(&ans);
481 	na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len));
482 	if (na->nla_type == CTRL_ATTR_FAMILY_ID)
483 		id = *(__u16 *) NLA_DATA(na);
484 	return id;
485 }
486 
487 static void read_psi_stats(void)
488 {
489 	FILE *fp;
490 	char line[256];
491 	int ret = 0;
492 	/* Zero all fields */
493 	memset(&psi, 0, sizeof(psi));
494 	/* CPU pressure */
495 	fp = fopen(PSI_CPU_SOME, "r");
496 	if (fp) {
497 		while (fgets(line, sizeof(line), fp)) {
498 			if (strncmp(line, "some", 4) == 0) {
499 				ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu",
500 							&psi.cpu_some_avg10, &psi.cpu_some_avg60,
501 							&psi.cpu_some_avg300, &psi.cpu_some_total);
502 				if (ret != 4)
503 					fprintf(stderr, "Failed to parse CPU some PSI data\n");
504 			} else if (strncmp(line, "full", 4) == 0) {
505 				ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
506 						&psi.cpu_full_avg10, &psi.cpu_full_avg60,
507 						&psi.cpu_full_avg300, &psi.cpu_full_total);
508 				if (ret != 4)
509 					fprintf(stderr, "Failed to parse CPU full PSI data\n");
510 			}
511 		}
512 		fclose(fp);
513 	}
514 	/* Memory pressure */
515 	fp = fopen(PSI_MEMORY_SOME, "r");
516 	if (fp) {
517 		while (fgets(line, sizeof(line), fp)) {
518 			if (strncmp(line, "some", 4) == 0) {
519 				ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu",
520 						&psi.memory_some_avg10, &psi.memory_some_avg60,
521 						&psi.memory_some_avg300, &psi.memory_some_total);
522 				if (ret != 4)
523 					fprintf(stderr, "Failed to parse Memory some PSI data\n");
524 			} else if (strncmp(line, "full", 4) == 0) {
525 				ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
526 						&psi.memory_full_avg10, &psi.memory_full_avg60,
527 						&psi.memory_full_avg300, &psi.memory_full_total);
528 			}
529 				if (ret != 4)
530 					fprintf(stderr, "Failed to parse Memory full PSI data\n");
531 		}
532 		fclose(fp);
533 	}
534 	/* IO pressure */
535 	fp = fopen(PSI_IO_SOME, "r");
536 	if (fp) {
537 		while (fgets(line, sizeof(line), fp)) {
538 			if (strncmp(line, "some", 4) == 0) {
539 				ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu",
540 						&psi.io_some_avg10, &psi.io_some_avg60,
541 						&psi.io_some_avg300, &psi.io_some_total);
542 				if (ret != 4)
543 					fprintf(stderr, "Failed to parse IO some PSI data\n");
544 			} else if (strncmp(line, "full", 4) == 0) {
545 				ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
546 						&psi.io_full_avg10, &psi.io_full_avg60,
547 						&psi.io_full_avg300, &psi.io_full_total);
548 				if (ret != 4)
549 					fprintf(stderr, "Failed to parse IO full PSI data\n");
550 			}
551 		}
552 		fclose(fp);
553 	}
554 	/* IRQ pressure (only full) */
555 	fp = fopen(PSI_IRQ_FULL, "r");
556 	if (fp) {
557 		while (fgets(line, sizeof(line), fp)) {
558 			if (strncmp(line, "full", 4) == 0) {
559 				ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
560 						&psi.irq_full_avg10, &psi.irq_full_avg60,
561 						&psi.irq_full_avg300, &psi.irq_full_total);
562 				if (ret != 4)
563 					fprintf(stderr, "Failed to parse IRQ full PSI data\n");
564 			}
565 		}
566 		fclose(fp);
567 	}
568 }
569 
570 static int read_comm(int pid, char *comm_buf, size_t buf_size)
571 {
572 	char path[64];
573 	int ret = -1;
574 	size_t len;
575 	FILE *fp;
576 
577 	snprintf(path, sizeof(path), "/proc/%d/comm", pid);
578 	fp = fopen(path, "r");
579 	if (!fp) {
580 		fprintf(stderr, "Failed to open comm file /proc/%d/comm\n", pid);
581 		return ret;
582 	}
583 
584 	if (fgets(comm_buf, buf_size, fp)) {
585 		len = strlen(comm_buf);
586 		if (len > 0 && comm_buf[len - 1] == '\n')
587 			comm_buf[len - 1] = '\0';
588 		ret = 0;
589 	}
590 
591 	fclose(fp);
592 
593 	return ret;
594 }
595 
596 static void fetch_and_fill_task_info(int pid, const char *comm)
597 {
598 	struct {
599 		struct nlmsghdr n;
600 		struct genlmsghdr g;
601 		char buf[MAX_MSG_SIZE];
602 	} resp;
603 	struct taskstats stats;
604 	struct nlattr *nested;
605 	struct nlattr *na;
606 	int nested_len;
607 	int nl_len;
608 	int rc;
609 
610 	/* Send request for task stats */
611 	if (send_cmd(nl_sd, family_id, getpid(), TASKSTATS_CMD_GET,
612 				 TASKSTATS_CMD_ATTR_PID, &pid, sizeof(pid)) < 0) {
613 		fprintf(stderr, "Failed to send request for task stats\n");
614 		return;
615 	}
616 
617 	/* Receive response */
618 	rc = recv(nl_sd, &resp, sizeof(resp), 0);
619 	if (rc < 0 || resp.n.nlmsg_type == NLMSG_ERROR) {
620 		fprintf(stderr, "Failed to receive response for task stats\n");
621 		return;
622 	}
623 
624 	/* Parse response */
625 	nl_len = GENLMSG_PAYLOAD(&resp.n);
626 	na = (struct nlattr *) GENLMSG_DATA(&resp);
627 	while (nl_len > 0) {
628 		if (na->nla_type == TASKSTATS_TYPE_AGGR_PID) {
629 			nested = (struct nlattr *) NLA_DATA(na);
630 			nested_len = NLA_PAYLOAD(na->nla_len);
631 			while (nested_len > 0) {
632 				if (nested->nla_type == TASKSTATS_TYPE_STATS) {
633 					memcpy(&stats, NLA_DATA(nested), sizeof(stats));
634 					if (task_count < MAX_TASKS) {
635 						tasks[task_count].pid = pid;
636 						tasks[task_count].tgid = pid;
637 						strncpy(tasks[task_count].command, comm,
638 							TASK_COMM_LEN - 1);
639 						tasks[task_count].command[TASK_COMM_LEN - 1] = '\0';
640 						SET_TASK_STAT(task_count, cpu_count);
641 						SET_TASK_STAT(task_count, cpu_delay_total);
642 						SET_TASK_STAT(task_count, blkio_count);
643 						SET_TASK_STAT(task_count, blkio_delay_total);
644 						SET_TASK_STAT(task_count, swapin_count);
645 						SET_TASK_STAT(task_count, swapin_delay_total);
646 						SET_TASK_STAT(task_count, freepages_count);
647 						SET_TASK_STAT(task_count, freepages_delay_total);
648 						SET_TASK_STAT(task_count, thrashing_count);
649 						SET_TASK_STAT(task_count, thrashing_delay_total);
650 						SET_TASK_STAT(task_count, compact_count);
651 						SET_TASK_STAT(task_count, compact_delay_total);
652 						SET_TASK_STAT(task_count, wpcopy_count);
653 						SET_TASK_STAT(task_count, wpcopy_delay_total);
654 						SET_TASK_STAT(task_count, irq_count);
655 						SET_TASK_STAT(task_count, irq_delay_total);
656 						set_mem_count(&tasks[task_count]);
657 						set_mem_delay_total(&tasks[task_count]);
658 						task_count++;
659 					}
660 					break;
661 				}
662 				nested_len -= NLA_ALIGN(nested->nla_len);
663 				nested = NLA_NEXT(nested);
664 			}
665 		}
666 		nl_len -= NLA_ALIGN(na->nla_len);
667 		na = NLA_NEXT(na);
668 	}
669 	return;
670 }
671 
672 static void get_task_delays(void)
673 {
674 	char comm[TASK_COMM_LEN];
675 	struct dirent *entry;
676 	DIR *dir;
677 	int pid;
678 
679 	task_count = 0;
680 	if (cfg.monitor_pid > 0) {
681 		if (read_comm(cfg.monitor_pid, comm, sizeof(comm)) == 0)
682 			fetch_and_fill_task_info(cfg.monitor_pid, comm);
683 		return;
684 	}
685 
686 	dir = opendir("/proc");
687 	if (!dir) {
688 		fprintf(stderr, "Error opening /proc directory\n");
689 		return;
690 	}
691 
692 	while ((entry = readdir(dir)) != NULL && task_count < MAX_TASKS) {
693 		if (!isdigit(entry->d_name[0]))
694 			continue;
695 		pid = atoi(entry->d_name);
696 		if (pid == 0)
697 			continue;
698 		if (read_comm(pid, comm, sizeof(comm)) != 0)
699 			continue;
700 		fetch_and_fill_task_info(pid, comm);
701 	}
702 	closedir(dir);
703 }
704 
705 /* Calculate average delay in milliseconds */
706 static double average_ms(unsigned long long total, unsigned long long count)
707 {
708 	if (count == 0)
709 		return 0;
710 	return (double)total / 1000000.0 / count;
711 }
712 
713 /* Comparison function for sorting tasks */
714 static int compare_tasks(const void *a, const void *b)
715 {
716 	const struct task_info *t1 = (const struct task_info *)a;
717 	const struct task_info *t2 = (const struct task_info *)b;
718 	unsigned long long total1;
719 	unsigned long long total2;
720 	unsigned long count1;
721 	unsigned long count2;
722 	double avg1, avg2;
723 
724 	total1 = *(unsigned long long *)((char *)t1 + cfg.sort_field->total_offset);
725 	total2 = *(unsigned long long *)((char *)t2 + cfg.sort_field->total_offset);
726 	count1 = *(unsigned long *)((char *)t1 + cfg.sort_field->count_offset);
727 	count2 = *(unsigned long *)((char *)t2 + cfg.sort_field->count_offset);
728 
729 	avg1 = average_ms(total1, count1);
730 	avg2 = average_ms(total2, count2);
731 	if (avg1 != avg2)
732 		return avg2 > avg1 ? 1 : -1;
733 
734 	return 0;
735 }
736 
737 /* Sort tasks by selected field */
738 static void sort_tasks(void)
739 {
740 	if (task_count > 0)
741 		qsort(tasks, task_count, sizeof(struct task_info), compare_tasks);
742 }
743 
744 /* Get container statistics via cgroupstats */
745 static void get_container_stats(void)
746 {
747 	int rc, cfd;
748 	struct {
749 		struct nlmsghdr n;
750 		struct genlmsghdr g;
751 		char buf[MAX_MSG_SIZE];
752 	} req, resp;
753 	struct nlattr *na;
754 	int nl_len;
755 	struct cgroupstats stats;
756 
757 	/* Check if container path is set */
758 	if (!cfg.container_path)
759 		return;
760 
761 	/* Open container cgroup */
762 	cfd = open(cfg.container_path, O_RDONLY);
763 	if (cfd < 0) {
764 		fprintf(stderr, "Error opening container path: %s\n", cfg.container_path);
765 		return;
766 	}
767 
768 	/* Send request for container stats */
769 	if (send_cmd(nl_sd, family_id, getpid(), CGROUPSTATS_CMD_GET,
770 				CGROUPSTATS_CMD_ATTR_FD, &cfd, sizeof(__u32)) < 0) {
771 		fprintf(stderr, "Failed to send request for container stats\n");
772 		close(cfd);
773 		return;
774 	}
775 
776 	/* Receive response */
777 	rc = recv(nl_sd, &resp, sizeof(resp), 0);
778 	if (rc < 0 || resp.n.nlmsg_type == NLMSG_ERROR) {
779 		fprintf(stderr, "Failed to receive response for container stats\n");
780 		close(cfd);
781 		return;
782 	}
783 
784 	/* Parse response */
785 	nl_len = GENLMSG_PAYLOAD(&resp.n);
786 	na = (struct nlattr *) GENLMSG_DATA(&resp);
787 	while (nl_len > 0) {
788 		if (na->nla_type == CGROUPSTATS_TYPE_CGROUP_STATS) {
789 			/* Get the cgroupstats structure */
790 			memcpy(&stats, NLA_DATA(na), sizeof(stats));
791 
792 			/* Fill container stats */
793 			container_stats.nr_sleeping = stats.nr_sleeping;
794 			container_stats.nr_running = stats.nr_running;
795 			container_stats.nr_stopped = stats.nr_stopped;
796 			container_stats.nr_uninterruptible = stats.nr_uninterruptible;
797 			container_stats.nr_io_wait = stats.nr_io_wait;
798 			break;
799 		}
800 		nl_len -= NLA_ALIGN(na->nla_len);
801 		na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len));
802 	}
803 
804 	close(cfd);
805 }
806 
807 /* Display results to stdout or log file */
808 static void display_results(void)
809 {
810 	time_t now = time(NULL);
811 	struct tm *tm_now = localtime(&now);
812 	FILE *out = stdout;
813 	char timestamp[32];
814 	bool suc = true;
815 	int i, count;
816 
817 	/* Clear terminal screen */
818 	suc &= BOOL_FPRINT(out, "\033[H\033[J");
819 
820 	/* PSI output (one-line, no cat style) */
821 	suc &= BOOL_FPRINT(out, "System Pressure Information: (avg10/avg60/avg300/total)\n");
822 	suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
823 		"CPU some:",
824 		psi.cpu_some_avg10,
825 		psi.cpu_some_avg60,
826 		psi.cpu_some_avg300,
827 		psi.cpu_some_total / 1000);
828 	suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
829 		"CPU full:",
830 		psi.cpu_full_avg10,
831 		psi.cpu_full_avg60,
832 		psi.cpu_full_avg300,
833 		psi.cpu_full_total / 1000);
834 	suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
835 		"Memory full:",
836 		psi.memory_full_avg10,
837 		psi.memory_full_avg60,
838 		psi.memory_full_avg300,
839 		psi.memory_full_total / 1000);
840 	suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
841 		"Memory some:",
842 		psi.memory_some_avg10,
843 		psi.memory_some_avg60,
844 		psi.memory_some_avg300,
845 		psi.memory_some_total / 1000);
846 	suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
847 		"IO full:",
848 		psi.io_full_avg10,
849 		psi.io_full_avg60,
850 		psi.io_full_avg300,
851 		psi.io_full_total / 1000);
852 	suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
853 		"IO some:",
854 		psi.io_some_avg10,
855 		psi.io_some_avg60,
856 		psi.io_some_avg300,
857 		psi.io_some_total / 1000);
858 	suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
859 		"IRQ full:",
860 		psi.irq_full_avg10,
861 		psi.irq_full_avg60,
862 		psi.irq_full_avg300,
863 		psi.irq_full_total / 1000);
864 
865 	if (cfg.container_path) {
866 		suc &= BOOL_FPRINT(out, "Container Information (%s):\n", cfg.container_path);
867 		suc &= BOOL_FPRINT(out, "Processes: running=%d, sleeping=%d, ",
868 			container_stats.nr_running, container_stats.nr_sleeping);
869 		suc &= BOOL_FPRINT(out, "stopped=%d, uninterruptible=%d, io_wait=%d\n\n",
870 			container_stats.nr_stopped, container_stats.nr_uninterruptible,
871 			container_stats.nr_io_wait);
872 	}
873 	/* Task delay output */
874 	suc &= BOOL_FPRINT(out, "Top %d processes (sorted by %s delay):\n",
875 			cfg.max_processes, get_name_by_field(cfg.sort_field));
876 
877 	suc &= BOOL_FPRINT(out, "%8s  %8s  %-17s", "PID", "TGID", "COMMAND");
878 	if (cfg.display_mode == MODE_MEMVERBOSE) {
879 		suc &= BOOL_FPRINT(out, "%8s %8s %8s %8s %8s %8s\n",
880 			"MEM(ms)", "SWAP(ms)", "RCL(ms)",
881 			"THR(ms)", "CMP(ms)", "WP(ms)");
882 		suc &= BOOL_FPRINT(out, "-----------------------");
883 		suc &= BOOL_FPRINT(out, "-----------------------");
884 		suc &= BOOL_FPRINT(out, "-----------------------");
885 		suc &= BOOL_FPRINT(out, "---------------------\n");
886 	} else {
887 		suc &= BOOL_FPRINT(out, "%8s %8s %8s %8s\n",
888 			"CPU(ms)", "IO(ms)", "IRQ(ms)", "MEM(ms)");
889 		suc &= BOOL_FPRINT(out, "-----------------------");
890 		suc &= BOOL_FPRINT(out, "-----------------------");
891 		suc &= BOOL_FPRINT(out, "--------------------------\n");
892 	}
893 
894 	count = task_count < cfg.max_processes ? task_count : cfg.max_processes;
895 
896 	for (i = 0; i < count; i++) {
897 		suc &= BOOL_FPRINT(out, "%8d  %8d  %-15s",
898 			tasks[i].pid, tasks[i].tgid, tasks[i].command);
899 		if (cfg.display_mode == MODE_MEMVERBOSE) {
900 			suc &= BOOL_FPRINT(out, DELAY_FMT_MEMVERBOSE,
901 				TASK_AVG(tasks[i], mem),
902 				TASK_AVG(tasks[i], swapin),
903 				TASK_AVG(tasks[i], freepages),
904 				TASK_AVG(tasks[i], thrashing),
905 				TASK_AVG(tasks[i], compact),
906 				TASK_AVG(tasks[i], wpcopy));
907 		} else {
908 			suc &= BOOL_FPRINT(out, DELAY_FMT_DEFAULT,
909 				TASK_AVG(tasks[i], cpu),
910 				TASK_AVG(tasks[i], blkio),
911 				TASK_AVG(tasks[i], irq),
912 				TASK_AVG(tasks[i], mem));
913 		}
914 	}
915 
916 	suc &= BOOL_FPRINT(out, "\n");
917 
918 	if (!suc)
919 		perror("Error writing to output");
920 }
921 
922 /* Main function */
923 int main(int argc, char **argv)
924 {
925 	int iterations = 0;
926 	int use_q_quit = 0;
927 
928 	/* Parse command line arguments */
929 	parse_args(argc, argv);
930 
931 	/* Setup netlink socket */
932 	nl_sd = create_nl_socket();
933 	if (nl_sd < 0) {
934 		fprintf(stderr, "Error creating netlink socket\n");
935 		exit(1);
936 	}
937 
938 	/* Get family ID for taskstats via netlink */
939 	family_id = get_family_id(nl_sd);
940 	if (!family_id) {
941 		fprintf(stderr, "Error getting taskstats family ID\n");
942 		close(nl_sd);
943 		exit(1);
944 	}
945 
946 	if (!cfg.output_one_time) {
947 		use_q_quit = 1;
948 		enable_raw_mode();
949 		printf("Press 'q' to quit.\n");
950 		fflush(stdout);
951 	}
952 
953 	/* Main loop */
954 	while (running) {
955 		/* Exit when sort field do not match display mode */
956 		if (!(cfg.sort_field->supported_modes & cfg.display_mode)) {
957 			fprintf(stderr, "Sort field not supported in this mode\n");
958 			display_available_fields(cfg.display_mode);
959 			break;
960 		}
961 
962 		/* Read PSI statistics */
963 		read_psi_stats();
964 
965 		/* Get container stats if container path provided */
966 		if (cfg.container_path)
967 			get_container_stats();
968 
969 		/* Get task delays */
970 		get_task_delays();
971 
972 		/* Sort tasks */
973 		sort_tasks();
974 
975 		/* Display results to stdout or log file */
976 		display_results();
977 
978 		/* Check for iterations */
979 		if (cfg.iterations > 0 && ++iterations >= cfg.iterations)
980 			break;
981 
982 		/* Exit if output_one_time is set */
983 		if (cfg.output_one_time)
984 			break;
985 
986 		/* Check for 'q' key to quit */
987 		if (use_q_quit) {
988 			struct timeval tv = {cfg.delay, 0};
989 			fd_set readfds;
990 
991 			FD_ZERO(&readfds);
992 			FD_SET(STDIN_FILENO, &readfds);
993 			int r = select(STDIN_FILENO+1, &readfds, NULL, NULL, &tv);
994 
995 			if (r > 0 && FD_ISSET(STDIN_FILENO, &readfds)) {
996 				char ch = 0;
997 
998 				read(STDIN_FILENO, &ch, 1);
999 				if (ch == 'q' || ch == 'Q') {
1000 					running = 0;
1001 					break;
1002 				}
1003 			}
1004 		} else {
1005 			sleep(cfg.delay);
1006 		}
1007 	}
1008 
1009 	/* Restore terminal mode */
1010 	if (use_q_quit)
1011 		disable_raw_mode();
1012 
1013 	/* Cleanup */
1014 	close(nl_sd);
1015 	if (cfg.container_path)
1016 		free(cfg.container_path);
1017 
1018 	return 0;
1019 }
1020