xref: /linux/tools/accounting/delaytop.c (revision e406d57be7bd2a4e73ea512c1ae36a40a44e499e)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * delaytop.c - system-wide delay monitoring tool.
4  *
5  * This tool provides real-time monitoring and statistics of
6  * system, container, and task-level delays, including CPU,
 * memory, IO, and IRQ. It supports an interactive (top-like) mode
 * and can output delay information for the whole system, specific
 * containers (cgroups), or individual tasks (PIDs).
10  *
11  * Key features:
12  *	- Collects per-task delay accounting statistics via taskstats.
13  *	- Collects system-wide PSI information.
14  *	- Supports sorting, filtering.
 *	- Supports both interactive (screen refresh) and one-shot output.
16  *
17  * Copyright (C) Fan Yu, ZTE Corp. 2025
18  * Copyright (C) Wang Yaxin, ZTE Corp. 2025
19  *
20  * Compile with
21  *	gcc -I/usr/src/linux/include delaytop.c -o delaytop
22  */
23 
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <errno.h>
28 #include <unistd.h>
29 #include <fcntl.h>
30 #include <getopt.h>
31 #include <signal.h>
32 #include <time.h>
33 #include <dirent.h>
34 #include <ctype.h>
35 #include <stdbool.h>
36 #include <sys/types.h>
37 #include <sys/stat.h>
38 #include <sys/socket.h>
39 #include <sys/select.h>
40 #include <termios.h>
41 #include <limits.h>
42 #include <linux/genetlink.h>
43 #include <linux/taskstats.h>
44 #include <linux/cgroupstats.h>
45 #include <stddef.h>
46 
/* PSI (Pressure Stall Information) procfs entries read by read_psi_stats() */
#define PSI_PATH	"/proc/pressure"
#define PSI_CPU_PATH	"/proc/pressure/cpu"
#define PSI_MEMORY_PATH	"/proc/pressure/memory"
#define PSI_IO_PATH	"/proc/pressure/io"
#define PSI_IRQ_PATH	"/proc/pressure/irq"

/* Netlink attribute helpers: next attribute, attribute payload, payload size */
#define NLA_NEXT(na)			((struct nlattr *)((char *)(na) + NLA_ALIGN((na)->nla_len)))
#define NLA_DATA(na)			((void *)((char *)(na) + NLA_HDRLEN))
#define NLA_PAYLOAD(len)		((len) - NLA_HDRLEN)

/* Generic netlink helpers; byte arithmetic is done on char * (not void *) */
#define GENLMSG_DATA(glh)		((void *)((char *)NLMSG_DATA(glh) + GENL_HDRLEN))
#define GENLMSG_PAYLOAD(glh)	(NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN)

#define TASK_COMM_LEN	16
#define MAX_MSG_SIZE	1024
#define MAX_TASKS		1000
#define MAX_BUF_LEN		256
/* Copy one field from a local 'struct taskstats stats' into tasks[task_count] */
#define SET_TASK_STAT(task_count, field) (tasks[task_count].field = stats.field)
/*
 * fprintf() wrapper that evaluates to true on success, false on error.
 * The temporary has a macro-specific name so that a caller passing its
 * own variable called 'ret' as an argument is not captured by it.
 */
#define BOOL_FPRINT(stream, fmt, ...) \
({ \
	int bool_fprint_ret_ = fprintf(stream, fmt, ##__VA_ARGS__); \
	bool_fprint_ret_ >= 0; \
})
/* Average delay in ms of <field> for a struct task_info */
#define TASK_AVG(task, field) average_ms((task).field##_delay_total, (task).field##_count)
#define PSI_LINE_FORMAT "%-12s %6.1f%%/%6.1f%%/%6.1f%%/%8llu(ms)\n"
#define DELAY_FMT_DEFAULT "%8.2f %8.2f %8.2f %8.2f\n"
#define DELAY_FMT_MEMVERBOSE "%8.2f %8.2f %8.2f %8.2f %8.2f %8.2f\n"
/* Build one struct field_desc initializer from a task_info field prefix */
#define SORT_FIELD(name, cmd, modes) \
	{#name, #cmd, \
	offsetof(struct task_info, name##_delay_total), \
	offsetof(struct task_info, name##_count), \
	modes}
/* Table terminator: every member of struct field_desc zeroed, name == NULL */
#define END_FIELD {NULL, NULL, 0, 0, 0}

/* Display mode types */
#define MODE_TYPE_ALL	(0xFFFFFFFF)
#define MODE_DEFAULT	(1 << 0)
#define MODE_MEMVERBOSE	(1 << 1)
85 
/*
 * System-wide PSI figures, one group per resource and per "some"/"full"
 * line: the three running averages and the cumulative total as parsed
 * from the files under /proc/pressure by read_psi_stats().
 */
struct psi_stats {
	/* /proc/pressure/cpu */
	double cpu_some_avg10;
	double cpu_some_avg60;
	double cpu_some_avg300;
	unsigned long long cpu_some_total;
	double cpu_full_avg10;
	double cpu_full_avg60;
	double cpu_full_avg300;
	unsigned long long cpu_full_total;
	/* /proc/pressure/memory */
	double memory_some_avg10;
	double memory_some_avg60;
	double memory_some_avg300;
	unsigned long long memory_some_total;
	double memory_full_avg10;
	double memory_full_avg60;
	double memory_full_avg300;
	unsigned long long memory_full_total;
	/* /proc/pressure/io */
	double io_some_avg10;
	double io_some_avg60;
	double io_some_avg300;
	unsigned long long io_some_total;
	double io_full_avg10;
	double io_full_avg60;
	double io_full_avg300;
	unsigned long long io_full_total;
	/* /proc/pressure/irq (only the "full" line is parsed) */
	double irq_full_avg10;
	double irq_full_avg60;
	double irq_full_avg300;
	unsigned long long irq_full_total;
};
103 
/*
 * Task delay information structure.
 *
 * One entry of the tasks[] table. The <name>_count/<name>_delay_total
 * pairs up to irq are copied from struct taskstats by
 * fetch_and_fill_task_info(); the mem pair is derived by set_mem_count()
 * and set_mem_delay_total(). average_ms() divides a total by 1000000.0
 * and by the count to obtain milliseconds. The <name>_count and
 * <name>_delay_total member names are relied on by the SORT_FIELD and
 * TASK_AVG macros.
 */
struct task_info {
	int pid;
	int tgid;	/* currently filled with the same value as pid */
	char command[TASK_COMM_LEN];
	unsigned long long cpu_count;
	unsigned long long cpu_delay_total;
	unsigned long long blkio_count;
	unsigned long long blkio_delay_total;
	unsigned long long swapin_count;
	unsigned long long swapin_delay_total;
	unsigned long long freepages_count;
	unsigned long long freepages_delay_total;
	unsigned long long thrashing_count;
	unsigned long long thrashing_delay_total;
	unsigned long long compact_count;
	unsigned long long compact_delay_total;
	unsigned long long wpcopy_count;
	unsigned long long wpcopy_delay_total;
	unsigned long long irq_count;
	unsigned long long irq_delay_total;
	/* Sum of the swapin, freepages, thrashing, compact and wpcopy values */
	unsigned long long mem_count;
	unsigned long long mem_delay_total;
};
128 
/*
 * Container statistics structure.
 *
 * Process state counters for the cgroup given with -C/--container,
 * copied from struct cgroupstats by get_container_stats().
 */
struct container_stats {
	int nr_sleeping;		/* Number of sleeping processes */
	int nr_running;			/* Number of running processes */
	int nr_stopped;			/* Number of stopped processes */
	int nr_uninterruptible; /* Number of uninterruptible processes */
	int nr_io_wait;			/* Number of processes in IO wait */
};
137 
/*
 * Delay field structure.
 *
 * Describes one sortable delay column. Entries are built with the
 * SORT_FIELD macro and the sort_fields[] table ends with an entry whose
 * name is NULL.
 */
struct field_desc {
	const char *name;	/* Field name for cmdline argument */
	const char *cmd_char;	/* Interactive command */
	unsigned long total_offset; /* Offset of total delay in task_info */
	unsigned long count_offset; /* Offset of count in task_info */
	size_t supported_modes; /* Supported display modes */
};
146 
/*
 * Program settings structure.
 *
 * Defaults are assigned at the start of parse_args() and then
 * overridden by command line options.
 */
struct config {
	int delay;				/* Update interval in seconds */
	int iterations;			/* Number of iterations, 0 == infinite */
	int max_processes;		/* Maximum number of processes to show */
	int output_one_time;	/* Output once and exit */
	int monitor_pid;		/* Monitor specific PID */
	char *container_path;	/* Path to container cgroup */
	const struct field_desc *sort_field;	/* Current sort field */
	size_t display_mode;	/* Current display mode */
};
158 
/* Global variables */
static struct config cfg;	/* Settings filled in by parse_args() */
static struct psi_stats psi;	/* Last values read by read_psi_stats() */
static struct task_info tasks[MAX_TASKS];	/* Per-task delay table */
static int task_count;	/* Number of valid entries in tasks[] */
static int running = 1;
static struct container_stats container_stats;
/*
 * Sortable delay fields: name, interactive key and the display modes in
 * which the field is offered. Terminated by END_FIELD (name == NULL).
 */
static const struct field_desc sort_fields[] = {
	SORT_FIELD(cpu,		c,	MODE_DEFAULT),
	SORT_FIELD(blkio,	i,	MODE_DEFAULT),
	SORT_FIELD(irq,		q,	MODE_DEFAULT),
	SORT_FIELD(mem,		m,	MODE_DEFAULT | MODE_MEMVERBOSE),
	SORT_FIELD(swapin,	s,	MODE_MEMVERBOSE),
	SORT_FIELD(freepages,	r,	MODE_MEMVERBOSE),
	SORT_FIELD(thrashing,	t,	MODE_MEMVERBOSE),
	SORT_FIELD(compact,	p,	MODE_MEMVERBOSE),
	SORT_FIELD(wpcopy,	w,	MODE_MEMVERBOSE),
	END_FIELD
};
/* When non-zero, display_results() prints the "sort selection" key line */
static int sort_selected;

/* Netlink socket variables */
static int nl_sd = -1;
static int family_id;
/* Terminal settings saved by enable_raw_mode() for disable_raw_mode() */
static struct termios orig_termios;

/*
 * Set terminal to non-canonical mode for q-to-quit.
 *
 * Saves the current stdin settings in orig_termios, then disables
 * canonical (line-buffered) input and echo so single key presses can be
 * read immediately. If the current settings cannot be read (for example
 * stdin is not a terminal), nothing is changed.
 */
static void enable_raw_mode(void)
{
	struct termios raw;

	/* Never push a modified copy of settings that were not actually read */
	if (tcgetattr(STDIN_FILENO, &orig_termios) < 0)
		return;

	raw = orig_termios;
	raw.c_lflag &= ~(ICANON | ECHO);
	tcsetattr(STDIN_FILENO, TCSAFLUSH, &raw);
}
disable_raw_mode(void)195 static void disable_raw_mode(void)
196 {
197 	tcsetattr(STDIN_FILENO, TCSAFLUSH, &orig_termios);
198 }
199 
200 /* Find field descriptor by command line */
get_field_by_cmd_char(char ch)201 static const struct field_desc *get_field_by_cmd_char(char ch)
202 {
203 	const struct field_desc *field;
204 
205 	for (field = sort_fields; field->name != NULL; field++) {
206 		if (field->cmd_char[0] == ch)
207 			return field;
208 	}
209 
210 	return NULL;
211 }
212 
213 /* Find field descriptor by name with string comparison */
get_field_by_name(const char * name)214 static const struct field_desc *get_field_by_name(const char *name)
215 {
216 	const struct field_desc *field;
217 	size_t field_len;
218 
219 	for (field = sort_fields; field->name != NULL; field++) {
220 		field_len = strlen(field->name);
221 		if (field_len != strlen(name))
222 			continue;
223 		if (strncmp(field->name, name, field_len) == 0)
224 			return field;
225 	}
226 
227 	return NULL;
228 }
229 
230 /* Find display name for a field descriptor */
get_name_by_field(const struct field_desc * field)231 static const char *get_name_by_field(const struct field_desc *field)
232 {
233 	return field ? field->name : "UNKNOWN";
234 }
235 
236 /* Generate string of available field names */
display_available_fields(size_t mode)237 static void display_available_fields(size_t mode)
238 {
239 	const struct field_desc *field;
240 	char buf[MAX_BUF_LEN];
241 
242 	buf[0] = '\0';
243 
244 	for (field = sort_fields; field->name != NULL; field++) {
245 		if (!(field->supported_modes & mode))
246 			continue;
247 		strncat(buf, "|", MAX_BUF_LEN - strlen(buf) - 1);
248 		strncat(buf, field->name, MAX_BUF_LEN - strlen(buf) - 1);
249 		buf[MAX_BUF_LEN - 1] = '\0';
250 	}
251 
252 	fprintf(stderr, "Available fields: %s\n", buf);
253 }
254 
/* Print the command line help text to stdout and exit with status 0 */
static void usage(void)
{
	static const char help_text[] =
		"Usage: delaytop [Options]\n"
		"Options:\n"
		"  -h, --help               Show this help message and exit\n"
		"  -d, --delay=SECONDS      Set refresh interval (default: 2 seconds, min: 1)\n"
		"  -n, --iterations=COUNT   Set number of updates (default: 0 = infinite)\n"
		"  -P, --processes=NUMBER   Set maximum number of processes to show (default: 20, max: 1000)\n"
		"  -o, --once               Display once and exit\n"
		"  -p, --pid=PID            Monitor only the specified PID\n"
		"  -C, --container=PATH     Monitor the container at specified cgroup path\n"
		"  -s, --sort=FIELD         Sort by delay field (default: cpu)\n"
		"  -M, --memverbose         Display memory detailed information\n";

	fputs(help_text, stdout);
	exit(0);
}
271 
272 /* Parse command line arguments and set configuration */
parse_args(int argc,char ** argv)273 static void parse_args(int argc, char **argv)
274 {
275 	int c;
276 	const struct field_desc *field;
277 	struct option long_options[] = {
278 		{"help", no_argument, 0, 'h'},
279 		{"delay", required_argument, 0, 'd'},
280 		{"iterations", required_argument, 0, 'n'},
281 		{"pid", required_argument, 0, 'p'},
282 		{"once", no_argument, 0, 'o'},
283 		{"processes", required_argument, 0, 'P'},
284 		{"sort", required_argument, 0, 's'},
285 		{"container", required_argument, 0, 'C'},
286 		{"memverbose", no_argument, 0, 'M'},
287 		{0, 0, 0, 0}
288 	};
289 
290 	/* Set defaults */
291 	cfg.delay = 2;
292 	cfg.iterations = 0;
293 	cfg.max_processes = 20;
294 	cfg.sort_field = &sort_fields[0];	/* Default sorted by CPU delay */
295 	cfg.output_one_time = 0;
296 	cfg.monitor_pid = 0;	/* 0 means monitor all PIDs */
297 	cfg.container_path = NULL;
298 	cfg.display_mode = MODE_DEFAULT;
299 
300 	while (1) {
301 		int option_index = 0;
302 
303 		c = getopt_long(argc, argv, "hd:n:p:oP:C:s:M", long_options, &option_index);
304 		if (c == -1)
305 			break;
306 
307 		switch (c) {
308 		case 'h':
309 			usage();
310 			break;
311 		case 'd':
312 			cfg.delay = atoi(optarg);
313 			if (cfg.delay < 1) {
314 				fprintf(stderr, "Error: delay must be >= 1.\n");
315 				exit(1);
316 			}
317 			break;
318 		case 'n':
319 			cfg.iterations = atoi(optarg);
320 			if (cfg.iterations < 0) {
321 				fprintf(stderr, "Error: iterations must be >= 0.\n");
322 				exit(1);
323 			}
324 			break;
325 		case 'p':
326 			cfg.monitor_pid = atoi(optarg);
327 			if (cfg.monitor_pid < 1) {
328 				fprintf(stderr, "Error: pid must be >= 1.\n");
329 				exit(1);
330 			}
331 			break;
332 		case 'o':
333 			cfg.output_one_time = 1;
334 			break;
335 		case 'P':
336 			cfg.max_processes = atoi(optarg);
337 			if (cfg.max_processes < 1) {
338 				fprintf(stderr, "Error: processes must be >= 1.\n");
339 				exit(1);
340 			}
341 			if (cfg.max_processes > MAX_TASKS) {
342 				fprintf(stderr, "Warning: processes capped to %d.\n",
343 					MAX_TASKS);
344 				cfg.max_processes = MAX_TASKS;
345 			}
346 			break;
347 		case 'C':
348 			cfg.container_path = strdup(optarg);
349 			break;
350 		case 's':
351 			if (strlen(optarg) == 0) {
352 				fprintf(stderr, "Error: empty sort field\n");
353 				exit(1);
354 			}
355 
356 			field = get_field_by_name(optarg);
357 			/* Show available fields if invalid option provided */
358 			if (!field) {
359 				fprintf(stderr, "Error: invalid sort field '%s'\n", optarg);
360 				display_available_fields(MODE_TYPE_ALL);
361 				exit(1);
362 			}
363 
364 			cfg.sort_field = field;
365 			break;
366 		case 'M':
367 			cfg.display_mode = MODE_MEMVERBOSE;
368 			cfg.sort_field = get_field_by_name("mem");
369 			break;
370 		default:
371 			fprintf(stderr, "Try 'delaytop --help' for more information.\n");
372 			exit(1);
373 		}
374 	}
375 }
376 
377 /* Calculate average delay in milliseconds for overall memory */
set_mem_delay_total(struct task_info * t)378 static void set_mem_delay_total(struct task_info *t)
379 {
380 	t->mem_delay_total = t->swapin_delay_total +
381 		t->freepages_delay_total +
382 		t->thrashing_delay_total +
383 		t->compact_delay_total +
384 		t->wpcopy_delay_total;
385 }
386 
set_mem_count(struct task_info * t)387 static void set_mem_count(struct task_info *t)
388 {
389 	t->mem_count = t->swapin_count +
390 		t->freepages_count +
391 		t->thrashing_count +
392 		t->compact_count +
393 		t->wpcopy_count;
394 }
395 
396 /* Create a raw netlink socket and bind */
create_nl_socket(void)397 static int create_nl_socket(void)
398 {
399 	int fd;
400 	struct sockaddr_nl local;
401 
402 	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
403 	if (fd < 0)
404 		return -1;
405 
406 	memset(&local, 0, sizeof(local));
407 	local.nl_family = AF_NETLINK;
408 
409 	if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0) {
410 		fprintf(stderr, "Failed to bind socket when create nl_socket\n");
411 		close(fd);
412 		return -1;
413 	}
414 
415 	return fd;
416 }
417 
418 /* Send a command via netlink */
send_cmd(int sd,__u16 nlmsg_type,__u32 nlmsg_pid,__u8 genl_cmd,__u16 nla_type,void * nla_data,int nla_len)419 static int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid,
420 			 __u8 genl_cmd, __u16 nla_type,
421 			 void *nla_data, int nla_len)
422 {
423 	struct sockaddr_nl nladdr;
424 	struct nlattr *na;
425 	int r, buflen;
426 	char *buf;
427 
428 	struct {
429 		struct nlmsghdr n;
430 		struct genlmsghdr g;
431 		char buf[MAX_MSG_SIZE];
432 	} msg;
433 
434 	msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
435 	msg.n.nlmsg_type = nlmsg_type;
436 	msg.n.nlmsg_flags = NLM_F_REQUEST;
437 	msg.n.nlmsg_seq = 0;
438 	msg.n.nlmsg_pid = nlmsg_pid;
439 	msg.g.cmd = genl_cmd;
440 	msg.g.version = 0x1;
441 	na = (struct nlattr *) GENLMSG_DATA(&msg);
442 	na->nla_type = nla_type;
443 	na->nla_len = nla_len + NLA_HDRLEN;
444 	memcpy(NLA_DATA(na), nla_data, nla_len);
445 	msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
446 
447 	buf = (char *) &msg;
448 	buflen = msg.n.nlmsg_len;
449 	memset(&nladdr, 0, sizeof(nladdr));
450 	nladdr.nl_family = AF_NETLINK;
451 	while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr,
452 					sizeof(nladdr))) < buflen) {
453 		if (r > 0) {
454 			buf += r;
455 			buflen -= r;
456 		} else if (errno != EAGAIN)
457 			return -1;
458 	}
459 	return 0;
460 }
461 
462 /* Get family ID for taskstats via netlink */
get_family_id(int sd)463 static int get_family_id(int sd)
464 {
465 	struct {
466 		struct nlmsghdr n;
467 		struct genlmsghdr g;
468 		char buf[256];
469 	} ans;
470 
471 	int id = 0, rc;
472 	struct nlattr *na;
473 	int rep_len;
474 	char name[100];
475 
476 	strncpy(name, TASKSTATS_GENL_NAME, sizeof(name) - 1);
477 	name[sizeof(name) - 1] = '\0';
478 	rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY,
479 			CTRL_ATTR_FAMILY_NAME, (void *)name,
480 			strlen(TASKSTATS_GENL_NAME)+1);
481 	if (rc < 0) {
482 		fprintf(stderr, "Failed to send cmd for family id\n");
483 		return 0;
484 	}
485 
486 	rep_len = recv(sd, &ans, sizeof(ans), 0);
487 	if (ans.n.nlmsg_type == NLMSG_ERROR ||
488 		(rep_len < 0) || !NLMSG_OK((&ans.n), rep_len)) {
489 		fprintf(stderr, "Failed to receive response for family id\n");
490 		return 0;
491 	}
492 
493 	na = (struct nlattr *) GENLMSG_DATA(&ans);
494 	na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len));
495 	if (na->nla_type == CTRL_ATTR_FAMILY_ID)
496 		id = *(__u16 *) NLA_DATA(na);
497 	return id;
498 }
499 
read_psi_stats(void)500 static int read_psi_stats(void)
501 {
502 	FILE *fp;
503 	char line[256];
504 	int ret = 0;
505 	int error_count = 0;
506 
507 	/* Check if PSI path exists */
508 	if (access(PSI_PATH, F_OK) != 0) {
509 		fprintf(stderr, "Error: PSI interface not found at %s\n", PSI_PATH);
510 		fprintf(stderr, "Please ensure your kernel supports PSI (Pressure Stall Information)\n");
511 		return -1;
512 	}
513 
514 	/* Zero all fields */
515 	memset(&psi, 0, sizeof(psi));
516 
517 	/* CPU pressure */
518 	fp = fopen(PSI_CPU_PATH, "r");
519 	if (fp) {
520 		while (fgets(line, sizeof(line), fp)) {
521 			if (strncmp(line, "some", 4) == 0) {
522 				ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu",
523 							&psi.cpu_some_avg10, &psi.cpu_some_avg60,
524 							&psi.cpu_some_avg300, &psi.cpu_some_total);
525 				if (ret != 4) {
526 					fprintf(stderr, "Failed to parse CPU some PSI data\n");
527 					error_count++;
528 				}
529 			} else if (strncmp(line, "full", 4) == 0) {
530 				ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
531 						&psi.cpu_full_avg10, &psi.cpu_full_avg60,
532 						&psi.cpu_full_avg300, &psi.cpu_full_total);
533 				if (ret != 4) {
534 					fprintf(stderr, "Failed to parse CPU full PSI data\n");
535 					error_count++;
536 				}
537 			}
538 		}
539 		fclose(fp);
540 	} else {
541 		fprintf(stderr, "Warning: Failed to open %s\n", PSI_CPU_PATH);
542 		error_count++;
543 	}
544 
545 	/* Memory pressure */
546 	fp = fopen(PSI_MEMORY_PATH, "r");
547 	if (fp) {
548 		while (fgets(line, sizeof(line), fp)) {
549 			if (strncmp(line, "some", 4) == 0) {
550 				ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu",
551 						&psi.memory_some_avg10, &psi.memory_some_avg60,
552 						&psi.memory_some_avg300, &psi.memory_some_total);
553 				if (ret != 4) {
554 					fprintf(stderr, "Failed to parse Memory some PSI data\n");
555 					error_count++;
556 				}
557 			} else if (strncmp(line, "full", 4) == 0) {
558 				ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
559 						&psi.memory_full_avg10, &psi.memory_full_avg60,
560 						&psi.memory_full_avg300, &psi.memory_full_total);
561 				if (ret != 4) {
562 					fprintf(stderr, "Failed to parse Memory full PSI data\n");
563 					error_count++;
564 				}
565 			}
566 		}
567 		fclose(fp);
568 	} else {
569 		fprintf(stderr, "Warning: Failed to open %s\n", PSI_MEMORY_PATH);
570 		error_count++;
571 	}
572 
573 	/* IO pressure */
574 	fp = fopen(PSI_IO_PATH, "r");
575 	if (fp) {
576 		while (fgets(line, sizeof(line), fp)) {
577 			if (strncmp(line, "some", 4) == 0) {
578 				ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu",
579 						&psi.io_some_avg10, &psi.io_some_avg60,
580 						&psi.io_some_avg300, &psi.io_some_total);
581 				if (ret != 4) {
582 					fprintf(stderr, "Failed to parse IO some PSI data\n");
583 					error_count++;
584 				}
585 			} else if (strncmp(line, "full", 4) == 0) {
586 				ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
587 						&psi.io_full_avg10, &psi.io_full_avg60,
588 						&psi.io_full_avg300, &psi.io_full_total);
589 				if (ret != 4) {
590 					fprintf(stderr, "Failed to parse IO full PSI data\n");
591 					error_count++;
592 				}
593 			}
594 		}
595 		fclose(fp);
596 	} else {
597 		fprintf(stderr, "Warning: Failed to open %s\n", PSI_IO_PATH);
598 		error_count++;
599 	}
600 
601 	/* IRQ pressure (only full) */
602 	fp = fopen(PSI_IRQ_PATH, "r");
603 	if (fp) {
604 		while (fgets(line, sizeof(line), fp)) {
605 			if (strncmp(line, "full", 4) == 0) {
606 				ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
607 						&psi.irq_full_avg10, &psi.irq_full_avg60,
608 						&psi.irq_full_avg300, &psi.irq_full_total);
609 				if (ret != 4) {
610 					fprintf(stderr, "Failed to parse IRQ full PSI data\n");
611 					error_count++;
612 				}
613 			}
614 		}
615 		fclose(fp);
616 	} else {
617 		fprintf(stderr, "Warning: Failed to open %s\n", PSI_IRQ_PATH);
618 		error_count++;
619 	}
620 
621 	/* Return error count: 0 means success, >0 means warnings, -1 means fatal error */
622 	if (error_count > 0) {
623 		fprintf(stderr, "PSI stats reading completed with %d warnings\n", error_count);
624 		return error_count;
625 	}
626 
627 	return 0;
628 }
629 
/*
 * Read the first line of /proc/<pid>/comm into @comm_buf (at most
 * @buf_size - 1 characters) and drop its trailing newline, if any.
 *
 * Returns 0 on success, -1 if the file cannot be opened (a message is
 * printed to stderr) or nothing could be read from it.
 */
static int read_comm(int pid, char *comm_buf, size_t buf_size)
{
	char proc_path[64];
	char *newline;
	FILE *comm_file;
	int status;

	snprintf(proc_path, sizeof(proc_path), "/proc/%d/comm", pid);
	comm_file = fopen(proc_path, "r");
	if (comm_file == NULL) {
		fprintf(stderr, "Failed to open comm file /proc/%d/comm\n", pid);
		return -1;
	}

	if (fgets(comm_buf, buf_size, comm_file) == NULL) {
		status = -1;
	} else {
		/* fgets() stops after a newline, so it can only be the last character */
		newline = strchr(comm_buf, '\n');
		if (newline != NULL)
			*newline = '\0';
		status = 0;
	}

	fclose(comm_file);

	return status;
}
655 
fetch_and_fill_task_info(int pid,const char * comm)656 static void fetch_and_fill_task_info(int pid, const char *comm)
657 {
658 	struct {
659 		struct nlmsghdr n;
660 		struct genlmsghdr g;
661 		char buf[MAX_MSG_SIZE];
662 	} resp;
663 	struct taskstats stats;
664 	struct nlattr *nested;
665 	struct nlattr *na;
666 	int nested_len;
667 	int nl_len;
668 	int rc;
669 
670 	/* Send request for task stats */
671 	if (send_cmd(nl_sd, family_id, getpid(), TASKSTATS_CMD_GET,
672 				 TASKSTATS_CMD_ATTR_PID, &pid, sizeof(pid)) < 0) {
673 		fprintf(stderr, "Failed to send request for task stats\n");
674 		return;
675 	}
676 
677 	/* Receive response */
678 	rc = recv(nl_sd, &resp, sizeof(resp), 0);
679 	if (rc < 0 || resp.n.nlmsg_type == NLMSG_ERROR) {
680 		fprintf(stderr, "Failed to receive response for task stats\n");
681 		return;
682 	}
683 
684 	/* Parse response */
685 	nl_len = GENLMSG_PAYLOAD(&resp.n);
686 	na = (struct nlattr *) GENLMSG_DATA(&resp);
687 	while (nl_len > 0) {
688 		if (na->nla_type == TASKSTATS_TYPE_AGGR_PID) {
689 			nested = (struct nlattr *) NLA_DATA(na);
690 			nested_len = NLA_PAYLOAD(na->nla_len);
691 			while (nested_len > 0) {
692 				if (nested->nla_type == TASKSTATS_TYPE_STATS) {
693 					memcpy(&stats, NLA_DATA(nested), sizeof(stats));
694 					if (task_count < MAX_TASKS) {
695 						tasks[task_count].pid = pid;
696 						tasks[task_count].tgid = pid;
697 						strncpy(tasks[task_count].command, comm,
698 							TASK_COMM_LEN - 1);
699 						tasks[task_count].command[TASK_COMM_LEN - 1] = '\0';
700 						SET_TASK_STAT(task_count, cpu_count);
701 						SET_TASK_STAT(task_count, cpu_delay_total);
702 						SET_TASK_STAT(task_count, blkio_count);
703 						SET_TASK_STAT(task_count, blkio_delay_total);
704 						SET_TASK_STAT(task_count, swapin_count);
705 						SET_TASK_STAT(task_count, swapin_delay_total);
706 						SET_TASK_STAT(task_count, freepages_count);
707 						SET_TASK_STAT(task_count, freepages_delay_total);
708 						SET_TASK_STAT(task_count, thrashing_count);
709 						SET_TASK_STAT(task_count, thrashing_delay_total);
710 						SET_TASK_STAT(task_count, compact_count);
711 						SET_TASK_STAT(task_count, compact_delay_total);
712 						SET_TASK_STAT(task_count, wpcopy_count);
713 						SET_TASK_STAT(task_count, wpcopy_delay_total);
714 						SET_TASK_STAT(task_count, irq_count);
715 						SET_TASK_STAT(task_count, irq_delay_total);
716 						set_mem_count(&tasks[task_count]);
717 						set_mem_delay_total(&tasks[task_count]);
718 						task_count++;
719 					}
720 					break;
721 				}
722 				nested_len -= NLA_ALIGN(nested->nla_len);
723 				nested = NLA_NEXT(nested);
724 			}
725 		}
726 		nl_len -= NLA_ALIGN(na->nla_len);
727 		na = NLA_NEXT(na);
728 	}
729 	return;
730 }
731 
get_task_delays(void)732 static void get_task_delays(void)
733 {
734 	char comm[TASK_COMM_LEN];
735 	struct dirent *entry;
736 	DIR *dir;
737 	int pid;
738 
739 	task_count = 0;
740 	if (cfg.monitor_pid > 0) {
741 		if (read_comm(cfg.monitor_pid, comm, sizeof(comm)) == 0)
742 			fetch_and_fill_task_info(cfg.monitor_pid, comm);
743 		return;
744 	}
745 
746 	dir = opendir("/proc");
747 	if (!dir) {
748 		fprintf(stderr, "Error opening /proc directory\n");
749 		return;
750 	}
751 
752 	while ((entry = readdir(dir)) != NULL && task_count < MAX_TASKS) {
753 		if (!isdigit(entry->d_name[0]))
754 			continue;
755 		pid = atoi(entry->d_name);
756 		if (pid == 0)
757 			continue;
758 		if (read_comm(pid, comm, sizeof(comm)) != 0)
759 			continue;
760 		fetch_and_fill_task_info(pid, comm);
761 	}
762 	closedir(dir);
763 }
764 
/*
 * Average delay per event in milliseconds: @total scaled down by
 * 1000000.0 and divided by @count. Yields 0 when @count is 0.
 */
static double average_ms(unsigned long long total, unsigned long long count)
{
	return count ? (double)total / 1000000.0 / count : 0;
}
772 
773 /* Comparison function for sorting tasks */
compare_tasks(const void * a,const void * b)774 static int compare_tasks(const void *a, const void *b)
775 {
776 	const struct task_info *t1 = (const struct task_info *)a;
777 	const struct task_info *t2 = (const struct task_info *)b;
778 	unsigned long long total1;
779 	unsigned long long total2;
780 	unsigned long count1;
781 	unsigned long count2;
782 	double avg1, avg2;
783 
784 	total1 = *(unsigned long long *)((char *)t1 + cfg.sort_field->total_offset);
785 	total2 = *(unsigned long long *)((char *)t2 + cfg.sort_field->total_offset);
786 	count1 = *(unsigned long *)((char *)t1 + cfg.sort_field->count_offset);
787 	count2 = *(unsigned long *)((char *)t2 + cfg.sort_field->count_offset);
788 
789 	avg1 = average_ms(total1, count1);
790 	avg2 = average_ms(total2, count2);
791 	if (avg1 != avg2)
792 		return avg2 > avg1 ? 1 : -1;
793 
794 	return 0;
795 }
796 
797 /* Sort tasks by selected field */
sort_tasks(void)798 static void sort_tasks(void)
799 {
800 	if (task_count > 0)
801 		qsort(tasks, task_count, sizeof(struct task_info), compare_tasks);
802 }
803 
804 /* Get container statistics via cgroupstats */
get_container_stats(void)805 static void get_container_stats(void)
806 {
807 	int rc, cfd;
808 	struct {
809 		struct nlmsghdr n;
810 		struct genlmsghdr g;
811 		char buf[MAX_MSG_SIZE];
812 	} req, resp;
813 	struct nlattr *na;
814 	int nl_len;
815 	struct cgroupstats stats;
816 
817 	/* Check if container path is set */
818 	if (!cfg.container_path)
819 		return;
820 
821 	/* Open container cgroup */
822 	cfd = open(cfg.container_path, O_RDONLY);
823 	if (cfd < 0) {
824 		fprintf(stderr, "Error opening container path: %s\n", cfg.container_path);
825 		return;
826 	}
827 
828 	/* Send request for container stats */
829 	if (send_cmd(nl_sd, family_id, getpid(), CGROUPSTATS_CMD_GET,
830 				CGROUPSTATS_CMD_ATTR_FD, &cfd, sizeof(__u32)) < 0) {
831 		fprintf(stderr, "Failed to send request for container stats\n");
832 		close(cfd);
833 		return;
834 	}
835 
836 	/* Receive response */
837 	rc = recv(nl_sd, &resp, sizeof(resp), 0);
838 	if (rc < 0 || resp.n.nlmsg_type == NLMSG_ERROR) {
839 		fprintf(stderr, "Failed to receive response for container stats\n");
840 		close(cfd);
841 		return;
842 	}
843 
844 	/* Parse response */
845 	nl_len = GENLMSG_PAYLOAD(&resp.n);
846 	na = (struct nlattr *) GENLMSG_DATA(&resp);
847 	while (nl_len > 0) {
848 		if (na->nla_type == CGROUPSTATS_TYPE_CGROUP_STATS) {
849 			/* Get the cgroupstats structure */
850 			memcpy(&stats, NLA_DATA(na), sizeof(stats));
851 
852 			/* Fill container stats */
853 			container_stats.nr_sleeping = stats.nr_sleeping;
854 			container_stats.nr_running = stats.nr_running;
855 			container_stats.nr_stopped = stats.nr_stopped;
856 			container_stats.nr_uninterruptible = stats.nr_uninterruptible;
857 			container_stats.nr_io_wait = stats.nr_io_wait;
858 			break;
859 		}
860 		nl_len -= NLA_ALIGN(na->nla_len);
861 		na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len));
862 	}
863 
864 	close(cfd);
865 }
866 
867 /* Display results to stdout or log file */
display_results(int psi_ret)868 static void display_results(int psi_ret)
869 {
870 	time_t now = time(NULL);
871 	struct tm *tm_now = localtime(&now);
872 	FILE *out = stdout;
873 	char timestamp[32];
874 	bool suc = true;
875 	int i, count;
876 
877 	/* Clear terminal screen */
878 	suc &= BOOL_FPRINT(out, "\033[H\033[J");
879 
880 	/* PSI output (one-line, no cat style) */
881 	suc &= BOOL_FPRINT(out, "System Pressure Information: (avg10/avg60vg300/total)\n");
882 	if (psi_ret) {
883 		suc &= BOOL_FPRINT(out, "  PSI not found: check if psi=1 enabled in cmdline\n");
884 	} else {
885 		suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
886 			"CPU some:",
887 			psi.cpu_some_avg10,
888 			psi.cpu_some_avg60,
889 			psi.cpu_some_avg300,
890 			psi.cpu_some_total / 1000);
891 		suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
892 			"CPU full:",
893 			psi.cpu_full_avg10,
894 			psi.cpu_full_avg60,
895 			psi.cpu_full_avg300,
896 			psi.cpu_full_total / 1000);
897 		suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
898 			"Memory full:",
899 			psi.memory_full_avg10,
900 			psi.memory_full_avg60,
901 			psi.memory_full_avg300,
902 			psi.memory_full_total / 1000);
903 		suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
904 			"Memory some:",
905 			psi.memory_some_avg10,
906 			psi.memory_some_avg60,
907 			psi.memory_some_avg300,
908 			psi.memory_some_total / 1000);
909 		suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
910 			"IO full:",
911 			psi.io_full_avg10,
912 			psi.io_full_avg60,
913 			psi.io_full_avg300,
914 			psi.io_full_total / 1000);
915 		suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
916 			"IO some:",
917 			psi.io_some_avg10,
918 			psi.io_some_avg60,
919 			psi.io_some_avg300,
920 			psi.io_some_total / 1000);
921 		suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
922 			"IRQ full:",
923 			psi.irq_full_avg10,
924 			psi.irq_full_avg60,
925 			psi.irq_full_avg300,
926 			psi.irq_full_total / 1000);
927 	}
928 
929 	if (cfg.container_path) {
930 		suc &= BOOL_FPRINT(out, "Container Information (%s):\n", cfg.container_path);
931 		suc &= BOOL_FPRINT(out, "Processes: running=%d, sleeping=%d, ",
932 			container_stats.nr_running, container_stats.nr_sleeping);
933 		suc &= BOOL_FPRINT(out, "stopped=%d, uninterruptible=%d, io_wait=%d\n\n",
934 			container_stats.nr_stopped, container_stats.nr_uninterruptible,
935 			container_stats.nr_io_wait);
936 	}
937 
938 	/* Interacive command */
939 	suc &= BOOL_FPRINT(out, "[o]sort [M]memverbose [q]quit\n");
940 	if (sort_selected) {
941 		if (cfg.display_mode == MODE_MEMVERBOSE)
942 			suc &= BOOL_FPRINT(out,
943 				"sort selection: [m]MEM [r]RCL [t]THR [p]CMP [w]WP\n");
944 		else
945 			suc &= BOOL_FPRINT(out,
946 				"sort selection: [c]CPU [i]IO [m]MEM [q]IRQ\n");
947 	}
948 
949 	/* Task delay output */
950 	suc &= BOOL_FPRINT(out, "Top %d processes (sorted by %s delay):\n",
951 			cfg.max_processes, get_name_by_field(cfg.sort_field));
952 
953 	suc &= BOOL_FPRINT(out, "%8s  %8s  %-17s", "PID", "TGID", "COMMAND");
954 	if (cfg.display_mode == MODE_MEMVERBOSE) {
955 		suc &= BOOL_FPRINT(out, "%8s %8s %8s %8s %8s %8s\n",
956 			"MEM(ms)", "SWAP(ms)", "RCL(ms)",
957 			"THR(ms)", "CMP(ms)", "WP(ms)");
958 		suc &= BOOL_FPRINT(out, "-----------------------");
959 		suc &= BOOL_FPRINT(out, "-----------------------");
960 		suc &= BOOL_FPRINT(out, "-----------------------");
961 		suc &= BOOL_FPRINT(out, "---------------------\n");
962 	} else {
963 		suc &= BOOL_FPRINT(out, "%8s %8s %8s %8s\n",
964 			"CPU(ms)", "IO(ms)", "IRQ(ms)", "MEM(ms)");
965 		suc &= BOOL_FPRINT(out, "-----------------------");
966 		suc &= BOOL_FPRINT(out, "-----------------------");
967 		suc &= BOOL_FPRINT(out, "--------------------------\n");
968 	}
969 
970 	count = task_count < cfg.max_processes ? task_count : cfg.max_processes;
971 
972 	for (i = 0; i < count; i++) {
973 		suc &= BOOL_FPRINT(out, "%8d  %8d  %-15s",
974 			tasks[i].pid, tasks[i].tgid, tasks[i].command);
975 		if (cfg.display_mode == MODE_MEMVERBOSE) {
976 			suc &= BOOL_FPRINT(out, DELAY_FMT_MEMVERBOSE,
977 				TASK_AVG(tasks[i], mem),
978 				TASK_AVG(tasks[i], swapin),
979 				TASK_AVG(tasks[i], freepages),
980 				TASK_AVG(tasks[i], thrashing),
981 				TASK_AVG(tasks[i], compact),
982 				TASK_AVG(tasks[i], wpcopy));
983 		} else {
984 			suc &= BOOL_FPRINT(out, DELAY_FMT_DEFAULT,
985 				TASK_AVG(tasks[i], cpu),
986 				TASK_AVG(tasks[i], blkio),
987 				TASK_AVG(tasks[i], irq),
988 				TASK_AVG(tasks[i], mem));
989 		}
990 	}
991 
992 	suc &= BOOL_FPRINT(out, "\n");
993 
994 	if (!suc)
995 		perror("Error writing to output");
996 }
997 
998 /* Check for keyboard input with timeout based on cfg.delay */
check_for_keypress(void)999 static char check_for_keypress(void)
1000 {
1001 	struct timeval tv = {cfg.delay, 0};
1002 	fd_set readfds;
1003 	char ch = 0;
1004 
1005 	FD_ZERO(&readfds);
1006 	FD_SET(STDIN_FILENO, &readfds);
1007 	int r = select(STDIN_FILENO + 1, &readfds, NULL, NULL, &tv);
1008 
1009 	if (r > 0 && FD_ISSET(STDIN_FILENO, &readfds)) {
1010 		read(STDIN_FILENO, &ch, 1);
1011 		return ch;
1012 	}
1013 
1014 	return 0;
1015 }
1016 
1017 #define MAX_MODE_SIZE 2
toggle_display_mode(void)1018 static void toggle_display_mode(void)
1019 {
1020 	static const size_t modes[MAX_MODE_SIZE] = {MODE_DEFAULT, MODE_MEMVERBOSE};
1021 	static size_t cur_index;
1022 
1023 	cur_index = (cur_index + 1) % MAX_MODE_SIZE;
1024 	cfg.display_mode = modes[cur_index];
1025 }
1026 
1027 /* Handle keyboard input: sorting selection, mode toggle, or quit */
handle_keypress(char ch,int * running)1028 static void handle_keypress(char ch, int *running)
1029 {
1030 	const struct field_desc *field;
1031 
1032 	/* Change sort field */
1033 	if (sort_selected) {
1034 		field = get_field_by_cmd_char(ch);
1035 		if (field && (field->supported_modes & cfg.display_mode))
1036 			cfg.sort_field = field;
1037 
1038 		sort_selected = 0;
1039 	/* Handle mode changes or quit */
1040 	} else {
1041 		switch (ch) {
1042 		case 'o':
1043 			sort_selected = 1;
1044 			break;
1045 		case 'M':
1046 			toggle_display_mode();
1047 			for (field = sort_fields; field->name != NULL; field++) {
1048 				if (field->supported_modes & cfg.display_mode) {
1049 					cfg.sort_field = field;
1050 					break;
1051 				}
1052 			}
1053 			break;
1054 		case 'q':
1055 		case 'Q':
1056 			*running = 0;
1057 			break;
1058 		default:
1059 			break;
1060 		}
1061 	}
1062 }
1063 
1064 /* Main function */
main(int argc,char ** argv)1065 int main(int argc, char **argv)
1066 {
1067 	const struct field_desc *field;
1068 	int iterations = 0;
1069 	int psi_ret = 0;
1070 	char keypress;
1071 
1072 	/* Parse command line arguments */
1073 	parse_args(argc, argv);
1074 
1075 	/* Setup netlink socket */
1076 	nl_sd = create_nl_socket();
1077 	if (nl_sd < 0) {
1078 		fprintf(stderr, "Error creating netlink socket\n");
1079 		exit(1);
1080 	}
1081 
1082 	/* Get family ID for taskstats via netlink */
1083 	family_id = get_family_id(nl_sd);
1084 	if (!family_id) {
1085 		fprintf(stderr, "Error getting taskstats family ID\n");
1086 		close(nl_sd);
1087 		exit(1);
1088 	}
1089 
1090 	/* Set terminal to non-canonical mode for interaction */
1091 	enable_raw_mode();
1092 
1093 	/* Main loop */
1094 	while (running) {
1095 		/* Auto-switch sort field when not matching display mode */
1096 		if (!(cfg.sort_field->supported_modes & cfg.display_mode)) {
1097 			for (field = sort_fields; field->name != NULL; field++) {
1098 				if (field->supported_modes & cfg.display_mode) {
1099 					cfg.sort_field = field;
1100 					printf("Auto-switched sort field to: %s\n", field->name);
1101 					break;
1102 				}
1103 			}
1104 		}
1105 
1106 		/* Read PSI statistics */
1107 		psi_ret = read_psi_stats();
1108 
1109 		/* Get container stats if container path provided */
1110 		if (cfg.container_path)
1111 			get_container_stats();
1112 
1113 		/* Get task delays */
1114 		get_task_delays();
1115 
1116 		/* Sort tasks */
1117 		sort_tasks();
1118 
1119 		/* Display results to stdout or log file */
1120 		display_results(psi_ret);
1121 
1122 		/* Check for iterations */
1123 		if (cfg.iterations > 0 && ++iterations >= cfg.iterations)
1124 			break;
1125 
1126 		/* Exit if output_one_time is set */
1127 		if (cfg.output_one_time)
1128 			break;
1129 
1130 		/* Keypress for interactive usage */
1131 		keypress = check_for_keypress();
1132 		if (keypress)
1133 			handle_keypress(keypress, &running);
1134 	}
1135 
1136 	/* Restore terminal mode */
1137 	disable_raw_mode();
1138 
1139 	/* Cleanup */
1140 	close(nl_sd);
1141 	if (cfg.container_path)
1142 		free(cfg.container_path);
1143 
1144 	return 0;
1145 }
1146