1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * delaytop.c - system-wide delay monitoring tool.
4 *
5 * This tool provides real-time monitoring and statistics of
6 * system, container, and task-level delays, including CPU,
 * memory, IO, and IRQ. It supports an interactive (top-like)
 * mode as well as one-shot output, and can report delay
 * information for the whole system, specific containers
 * (cgroups), or individual tasks (PIDs).
10 *
11 * Key features:
12 * - Collects per-task delay accounting statistics via taskstats.
13 * - Collects system-wide PSI information.
 * - Supports sorting and filtering.
 * - Supports interactive use (periodic screen refresh).
16 *
17 * Copyright (C) Fan Yu, ZTE Corp. 2025
18 * Copyright (C) Wang Yaxin, ZTE Corp. 2025
19 *
20 * Compile with
21 * gcc -I/usr/src/linux/include delaytop.c -o delaytop
22 */
23
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <errno.h>
28 #include <unistd.h>
29 #include <fcntl.h>
30 #include <getopt.h>
31 #include <signal.h>
32 #include <time.h>
33 #include <dirent.h>
34 #include <ctype.h>
35 #include <stdbool.h>
36 #include <sys/types.h>
37 #include <sys/stat.h>
38 #include <sys/socket.h>
39 #include <sys/select.h>
40 #include <termios.h>
41 #include <limits.h>
42 #include <linux/genetlink.h>
43 #include <linux/taskstats.h>
44 #include <linux/cgroupstats.h>
45 #include <stddef.h>
46
/* PSI (Pressure Stall Information) files read by read_psi_stats() */
#define PSI_PATH	"/proc/pressure"
#define PSI_CPU_PATH	"/proc/pressure/cpu"
#define PSI_MEMORY_PATH	"/proc/pressure/memory"
#define PSI_IO_PATH	"/proc/pressure/io"
#define PSI_IRQ_PATH	"/proc/pressure/irq"

/* Netlink attribute helpers: next attribute, attribute payload, payload size */
#define NLA_NEXT(na)	((struct nlattr *)((char *)(na) + NLA_ALIGN((na)->nla_len)))
#define NLA_DATA(na)	((void *)((char *)(na) + NLA_HDRLEN))
/* The argument is parenthesized so compound expressions expand correctly */
#define NLA_PAYLOAD(len)	((len) - NLA_HDRLEN)

/*
 * Generic netlink helpers. The cast to char * avoids arithmetic on the
 * void * returned by NLMSG_DATA(), which is not valid ISO C.
 */
#define GENLMSG_DATA(glh)	((void *)((char *)NLMSG_DATA(glh) + GENL_HDRLEN))
#define GENLMSG_PAYLOAD(glh)	(NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN)

#define TASK_COMM_LEN	16	/* size of task_info.command, including NUL */
#define MAX_MSG_SIZE	1024	/* attribute buffer size of netlink messages */
#define MAX_TASKS	1000	/* capacity of the global tasks[] array */
#define MAX_BUF_LEN	256
/* Copy one field from a local `stats` into tasks[task_count]; both must be in scope */
#define SET_TASK_STAT(task_count, field) tasks[task_count].field = stats.field
/* fprintf() wrapper that evaluates to non-zero on success (GNU statement expression) */
#define BOOL_FPRINT(stream, fmt, ...) \
({ \
	int ret = fprintf(stream, fmt, ##__VA_ARGS__); \
	ret >= 0; \
})
/* Average delay in ms of one delay class of a task, e.g. TASK_AVG(t, cpu) */
#define TASK_AVG(task, field) average_ms((task).field##_delay_total, (task).field##_count)
#define PSI_LINE_FORMAT "%-12s %6.1f%%/%6.1f%%/%6.1f%%/%8llu(ms)\n"
#define DELAY_FMT_DEFAULT "%8.2f %8.2f %8.2f %8.2f\n"
#define DELAY_FMT_MEMVERBOSE "%8.2f %8.2f %8.2f %8.2f %8.2f %8.2f\n"
/* Build one struct field_desc initializer for the delay class `name` */
#define SORT_FIELD(name, cmd, modes) \
	{#name, #cmd, \
	 offsetof(struct task_info, name##_delay_total), \
	 offsetof(struct task_info, name##_count), \
	 modes}
/* Table terminator: name == NULL, every other member zero */
#define END_FIELD {NULL, NULL, 0, 0, 0}

/* Display mode types (bit flags) */
#define MODE_TYPE_ALL	(0xFFFFFFFF)
#define MODE_DEFAULT	(1 << 0)
#define MODE_MEMVERBOSE	(1 << 1)
85
/*
 * PSI statistics structure.
 *
 * One group of avg10/avg60/avg300/total per pressure file line; the values
 * are filled by parsing the "some" and "full" lines of /proc/pressure/*.
 * IRQ pressure only has a "full" group.
 */
struct psi_stats {
	/* CPU, "some" line */
	double cpu_some_avg10;
	double cpu_some_avg60;
	double cpu_some_avg300;
	unsigned long long cpu_some_total;
	/* CPU, "full" line */
	double cpu_full_avg10;
	double cpu_full_avg60;
	double cpu_full_avg300;
	unsigned long long cpu_full_total;
	/* Memory, "some" line */
	double memory_some_avg10;
	double memory_some_avg60;
	double memory_some_avg300;
	unsigned long long memory_some_total;
	/* Memory, "full" line */
	double memory_full_avg10;
	double memory_full_avg60;
	double memory_full_avg300;
	unsigned long long memory_full_total;
	/* IO, "some" line */
	double io_some_avg10;
	double io_some_avg60;
	double io_some_avg300;
	unsigned long long io_some_total;
	/* IO, "full" line */
	double io_full_avg10;
	double io_full_avg60;
	double io_full_avg300;
	unsigned long long io_full_total;
	/* IRQ, "full" line */
	double irq_full_avg10;
	double irq_full_avg60;
	double irq_full_avg300;
	unsigned long long irq_full_total;
};
103
104 /* Task delay information structure */
105 struct task_info {
106 int pid;
107 int tgid;
108 char command[TASK_COMM_LEN];
109 unsigned long long cpu_count;
110 unsigned long long cpu_delay_total;
111 unsigned long long blkio_count;
112 unsigned long long blkio_delay_total;
113 unsigned long long swapin_count;
114 unsigned long long swapin_delay_total;
115 unsigned long long freepages_count;
116 unsigned long long freepages_delay_total;
117 unsigned long long thrashing_count;
118 unsigned long long thrashing_delay_total;
119 unsigned long long compact_count;
120 unsigned long long compact_delay_total;
121 unsigned long long wpcopy_count;
122 unsigned long long wpcopy_delay_total;
123 unsigned long long irq_count;
124 unsigned long long irq_delay_total;
125 unsigned long long mem_count;
126 unsigned long long mem_delay_total;
127 };
128
/*
 * Container statistics structure: per-state process counts of one cgroup,
 * copied from the cgroupstats reply.
 */
struct container_stats {
	int nr_sleeping;	/* processes that are sleeping */
	int nr_running;		/* processes that are running */
	int nr_stopped;		/* processes that are stopped */
	int nr_uninterruptible;	/* processes in uninterruptible state */
	int nr_io_wait;		/* processes waiting for IO */
};
137
/*
 * Delay field structure: describes one sortable delay class and where its
 * total/count pair lives inside struct task_info.
 */
struct field_desc {
	/* Name accepted by the --sort command line option */
	const char *name;
	/* Key (first character is used) selecting this field interactively */
	const char *cmd_char;
	/* Byte offset of the <name>_delay_total member in struct task_info */
	unsigned long total_offset;
	/* Byte offset of the <name>_count member in struct task_info */
	unsigned long count_offset;
	/* Bit mask of MODE_* display modes in which this field can be used */
	size_t supported_modes;
};
146
/* Program settings structure, filled in from the command line */
struct config {
	/* Refresh interval in seconds */
	int delay;
	/* Number of refreshes to perform; 0 means run until stopped */
	int iterations;
	/* Upper bound on the number of task rows printed */
	int max_processes;
	/* Non-zero: print a single report and exit */
	int output_one_time;
	/* PID to monitor exclusively; 0 means all PIDs */
	int monitor_pid;
	/* Path of the container cgroup to report on, or NULL */
	char *container_path;
	/* Field the task list is currently sorted by */
	const struct field_desc *sort_field;
	/* Current display mode (one of the MODE_* flags) */
	size_t display_mode;
};
158
159 /* Global variables */
160 static struct config cfg;
161 static struct psi_stats psi;
162 static struct task_info tasks[MAX_TASKS];
163 static int task_count;
164 static int running = 1;
165 static struct container_stats container_stats;
166 static const struct field_desc sort_fields[] = {
167 SORT_FIELD(cpu, c, MODE_DEFAULT),
168 SORT_FIELD(blkio, i, MODE_DEFAULT),
169 SORT_FIELD(irq, q, MODE_DEFAULT),
170 SORT_FIELD(mem, m, MODE_DEFAULT | MODE_MEMVERBOSE),
171 SORT_FIELD(swapin, s, MODE_MEMVERBOSE),
172 SORT_FIELD(freepages, r, MODE_MEMVERBOSE),
173 SORT_FIELD(thrashing, t, MODE_MEMVERBOSE),
174 SORT_FIELD(compact, p, MODE_MEMVERBOSE),
175 SORT_FIELD(wpcopy, w, MODE_MEMVERBOSE),
176 END_FIELD
177 };
178 static int sort_selected;
179
180 /* Netlink socket variables */
181 static int nl_sd = -1;
182 static int family_id;
183
/* Terminal settings captured by enable_raw_mode() for later restoration */
static struct termios orig_termios;

/*
 * Put stdin into non-canonical, no-echo mode so that single keypresses can
 * be read without waiting for Enter. The previous settings are saved in
 * orig_termios.
 */
static void enable_raw_mode(void)
{
	struct termios raw_settings;

	tcgetattr(STDIN_FILENO, &orig_termios);

	/* Start from the saved settings and drop line buffering and echo */
	raw_settings = orig_termios;
	raw_settings.c_lflag &= ~ICANON;
	raw_settings.c_lflag &= ~ECHO;

	tcsetattr(STDIN_FILENO, TCSAFLUSH, &raw_settings);
}
disable_raw_mode(void)195 static void disable_raw_mode(void)
196 {
197 tcsetattr(STDIN_FILENO, TCSAFLUSH, &orig_termios);
198 }
199
200 /* Find field descriptor by command line */
get_field_by_cmd_char(char ch)201 static const struct field_desc *get_field_by_cmd_char(char ch)
202 {
203 const struct field_desc *field;
204
205 for (field = sort_fields; field->name != NULL; field++) {
206 if (field->cmd_char[0] == ch)
207 return field;
208 }
209
210 return NULL;
211 }
212
213 /* Find field descriptor by name with string comparison */
get_field_by_name(const char * name)214 static const struct field_desc *get_field_by_name(const char *name)
215 {
216 const struct field_desc *field;
217 size_t field_len;
218
219 for (field = sort_fields; field->name != NULL; field++) {
220 field_len = strlen(field->name);
221 if (field_len != strlen(name))
222 continue;
223 if (strncmp(field->name, name, field_len) == 0)
224 return field;
225 }
226
227 return NULL;
228 }
229
230 /* Find display name for a field descriptor */
get_name_by_field(const struct field_desc * field)231 static const char *get_name_by_field(const struct field_desc *field)
232 {
233 return field ? field->name : "UNKNOWN";
234 }
235
236 /* Generate string of available field names */
display_available_fields(size_t mode)237 static void display_available_fields(size_t mode)
238 {
239 const struct field_desc *field;
240 char buf[MAX_BUF_LEN];
241
242 buf[0] = '\0';
243
244 for (field = sort_fields; field->name != NULL; field++) {
245 if (!(field->supported_modes & mode))
246 continue;
247 strncat(buf, "|", MAX_BUF_LEN - strlen(buf) - 1);
248 strncat(buf, field->name, MAX_BUF_LEN - strlen(buf) - 1);
249 buf[MAX_BUF_LEN - 1] = '\0';
250 }
251
252 fprintf(stderr, "Available fields: %s\n", buf);
253 }
254
/* Print the command line help text to stdout and terminate with status 0 */
static void usage(void)
{
	static const char help_text[] =
		"Usage: delaytop [Options]\n"
		"Options:\n"
		" -h, --help Show this help message and exit\n"
		" -d, --delay=SECONDS Set refresh interval (default: 2 seconds, min: 1)\n"
		" -n, --iterations=COUNT Set number of updates (default: 0 = infinite)\n"
		" -P, --processes=NUMBER Set maximum number of processes to show (default: 20, max: 1000)\n"
		" -o, --once Display once and exit\n"
		" -p, --pid=PID Monitor only the specified PID\n"
		" -C, --container=PATH Monitor the container at specified cgroup path\n"
		" -s, --sort=FIELD Sort by delay field (default: cpu)\n"
		" -M, --memverbose Display memory detailed information\n";

	fputs(help_text, stdout);
	exit(0);
}
271
272 /* Parse command line arguments and set configuration */
parse_args(int argc,char ** argv)273 static void parse_args(int argc, char **argv)
274 {
275 int c;
276 const struct field_desc *field;
277 struct option long_options[] = {
278 {"help", no_argument, 0, 'h'},
279 {"delay", required_argument, 0, 'd'},
280 {"iterations", required_argument, 0, 'n'},
281 {"pid", required_argument, 0, 'p'},
282 {"once", no_argument, 0, 'o'},
283 {"processes", required_argument, 0, 'P'},
284 {"sort", required_argument, 0, 's'},
285 {"container", required_argument, 0, 'C'},
286 {"memverbose", no_argument, 0, 'M'},
287 {0, 0, 0, 0}
288 };
289
290 /* Set defaults */
291 cfg.delay = 2;
292 cfg.iterations = 0;
293 cfg.max_processes = 20;
294 cfg.sort_field = &sort_fields[0]; /* Default sorted by CPU delay */
295 cfg.output_one_time = 0;
296 cfg.monitor_pid = 0; /* 0 means monitor all PIDs */
297 cfg.container_path = NULL;
298 cfg.display_mode = MODE_DEFAULT;
299
300 while (1) {
301 int option_index = 0;
302
303 c = getopt_long(argc, argv, "hd:n:p:oP:C:s:M", long_options, &option_index);
304 if (c == -1)
305 break;
306
307 switch (c) {
308 case 'h':
309 usage();
310 break;
311 case 'd':
312 cfg.delay = atoi(optarg);
313 if (cfg.delay < 1) {
314 fprintf(stderr, "Error: delay must be >= 1.\n");
315 exit(1);
316 }
317 break;
318 case 'n':
319 cfg.iterations = atoi(optarg);
320 if (cfg.iterations < 0) {
321 fprintf(stderr, "Error: iterations must be >= 0.\n");
322 exit(1);
323 }
324 break;
325 case 'p':
326 cfg.monitor_pid = atoi(optarg);
327 if (cfg.monitor_pid < 1) {
328 fprintf(stderr, "Error: pid must be >= 1.\n");
329 exit(1);
330 }
331 break;
332 case 'o':
333 cfg.output_one_time = 1;
334 break;
335 case 'P':
336 cfg.max_processes = atoi(optarg);
337 if (cfg.max_processes < 1) {
338 fprintf(stderr, "Error: processes must be >= 1.\n");
339 exit(1);
340 }
341 if (cfg.max_processes > MAX_TASKS) {
342 fprintf(stderr, "Warning: processes capped to %d.\n",
343 MAX_TASKS);
344 cfg.max_processes = MAX_TASKS;
345 }
346 break;
347 case 'C':
348 cfg.container_path = strdup(optarg);
349 break;
350 case 's':
351 if (strlen(optarg) == 0) {
352 fprintf(stderr, "Error: empty sort field\n");
353 exit(1);
354 }
355
356 field = get_field_by_name(optarg);
357 /* Show available fields if invalid option provided */
358 if (!field) {
359 fprintf(stderr, "Error: invalid sort field '%s'\n", optarg);
360 display_available_fields(MODE_TYPE_ALL);
361 exit(1);
362 }
363
364 cfg.sort_field = field;
365 break;
366 case 'M':
367 cfg.display_mode = MODE_MEMVERBOSE;
368 cfg.sort_field = get_field_by_name("mem");
369 break;
370 default:
371 fprintf(stderr, "Try 'delaytop --help' for more information.\n");
372 exit(1);
373 }
374 }
375 }
376
377 /* Calculate average delay in milliseconds for overall memory */
set_mem_delay_total(struct task_info * t)378 static void set_mem_delay_total(struct task_info *t)
379 {
380 t->mem_delay_total = t->swapin_delay_total +
381 t->freepages_delay_total +
382 t->thrashing_delay_total +
383 t->compact_delay_total +
384 t->wpcopy_delay_total;
385 }
386
set_mem_count(struct task_info * t)387 static void set_mem_count(struct task_info *t)
388 {
389 t->mem_count = t->swapin_count +
390 t->freepages_count +
391 t->thrashing_count +
392 t->compact_count +
393 t->wpcopy_count;
394 }
395
/*
 * Create a raw generic netlink socket and bind it.
 *
 * Returns the socket descriptor, or -1 if the socket could not be created
 * or bound (a message is printed only for the bind failure).
 */
static int create_nl_socket(void)
{
	struct sockaddr_nl addr;
	int sock;

	sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
	if (sock < 0)
		return -1;

	/* All address fields other than the family stay zero */
	memset(&addr, 0, sizeof(addr));
	addr.nl_family = AF_NETLINK;

	if (bind(sock, (struct sockaddr *)&addr, sizeof(addr)) >= 0)
		return sock;

	fprintf(stderr, "Failed to bind socket when create nl_socket\n");
	close(sock);
	return -1;
}
417
418 /* Send a command via netlink */
send_cmd(int sd,__u16 nlmsg_type,__u32 nlmsg_pid,__u8 genl_cmd,__u16 nla_type,void * nla_data,int nla_len)419 static int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid,
420 __u8 genl_cmd, __u16 nla_type,
421 void *nla_data, int nla_len)
422 {
423 struct sockaddr_nl nladdr;
424 struct nlattr *na;
425 int r, buflen;
426 char *buf;
427
428 struct {
429 struct nlmsghdr n;
430 struct genlmsghdr g;
431 char buf[MAX_MSG_SIZE];
432 } msg;
433
434 msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
435 msg.n.nlmsg_type = nlmsg_type;
436 msg.n.nlmsg_flags = NLM_F_REQUEST;
437 msg.n.nlmsg_seq = 0;
438 msg.n.nlmsg_pid = nlmsg_pid;
439 msg.g.cmd = genl_cmd;
440 msg.g.version = 0x1;
441 na = (struct nlattr *) GENLMSG_DATA(&msg);
442 na->nla_type = nla_type;
443 na->nla_len = nla_len + NLA_HDRLEN;
444 memcpy(NLA_DATA(na), nla_data, nla_len);
445 msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
446
447 buf = (char *) &msg;
448 buflen = msg.n.nlmsg_len;
449 memset(&nladdr, 0, sizeof(nladdr));
450 nladdr.nl_family = AF_NETLINK;
451 while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr,
452 sizeof(nladdr))) < buflen) {
453 if (r > 0) {
454 buf += r;
455 buflen -= r;
456 } else if (errno != EAGAIN)
457 return -1;
458 }
459 return 0;
460 }
461
/*
 * Get family ID for taskstats via netlink.
 *
 * Sends CTRL_CMD_GETFAMILY for TASKSTATS_GENL_NAME on @sd and looks for the
 * CTRL_ATTR_FAMILY_ID attribute in the reply.
 *
 * Returns the family ID, or 0 when the request failed, the reply was
 * invalid, or it carried no family ID.
 */
static int get_family_id(int sd)
{
	struct {
		struct nlmsghdr n;
		struct genlmsghdr g;
		char buf[256];
	} ans;

	int id = 0, rc;
	struct nlattr *na;
	int rep_len;
	int attr_len;
	__u16 family;
	char name[100];

	strncpy(name, TASKSTATS_GENL_NAME, sizeof(name) - 1);
	name[sizeof(name) - 1] = '\0';
	rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY,
		      CTRL_ATTR_FAMILY_NAME, (void *)name,
		      strlen(TASKSTATS_GENL_NAME)+1);
	if (rc < 0) {
		fprintf(stderr, "Failed to send cmd for family id\n");
		return 0;
	}

	rep_len = recv(sd, &ans, sizeof(ans), 0);
	/* Check the receive result first: ans is only valid if recv() worked */
	if (rep_len < 0 || !NLMSG_OK((&ans.n), rep_len) ||
	    ans.n.nlmsg_type == NLMSG_ERROR) {
		fprintf(stderr, "Failed to receive response for family id\n");
		return 0;
	}

	/* Walk the attributes, never stepping outside the received payload */
	attr_len = GENLMSG_PAYLOAD(&ans.n);
	na = (struct nlattr *) GENLMSG_DATA(&ans);
	while (attr_len >= NLA_HDRLEN) {
		if (na->nla_len < NLA_HDRLEN || na->nla_len > attr_len)
			break;	/* malformed attribute */
		if (na->nla_type == CTRL_ATTR_FAMILY_ID &&
		    NLA_PAYLOAD(na->nla_len) >= (int)sizeof(family)) {
			memcpy(&family, NLA_DATA(na), sizeof(family));
			id = family;
			break;
		}
		attr_len -= NLA_ALIGN(na->nla_len);
		na = NLA_NEXT(na);
	}
	return id;
}
499
read_psi_stats(void)500 static int read_psi_stats(void)
501 {
502 FILE *fp;
503 char line[256];
504 int ret = 0;
505 int error_count = 0;
506
507 /* Check if PSI path exists */
508 if (access(PSI_PATH, F_OK) != 0) {
509 fprintf(stderr, "Error: PSI interface not found at %s\n", PSI_PATH);
510 fprintf(stderr, "Please ensure your kernel supports PSI (Pressure Stall Information)\n");
511 return -1;
512 }
513
514 /* Zero all fields */
515 memset(&psi, 0, sizeof(psi));
516
517 /* CPU pressure */
518 fp = fopen(PSI_CPU_PATH, "r");
519 if (fp) {
520 while (fgets(line, sizeof(line), fp)) {
521 if (strncmp(line, "some", 4) == 0) {
522 ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu",
523 &psi.cpu_some_avg10, &psi.cpu_some_avg60,
524 &psi.cpu_some_avg300, &psi.cpu_some_total);
525 if (ret != 4) {
526 fprintf(stderr, "Failed to parse CPU some PSI data\n");
527 error_count++;
528 }
529 } else if (strncmp(line, "full", 4) == 0) {
530 ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
531 &psi.cpu_full_avg10, &psi.cpu_full_avg60,
532 &psi.cpu_full_avg300, &psi.cpu_full_total);
533 if (ret != 4) {
534 fprintf(stderr, "Failed to parse CPU full PSI data\n");
535 error_count++;
536 }
537 }
538 }
539 fclose(fp);
540 } else {
541 fprintf(stderr, "Warning: Failed to open %s\n", PSI_CPU_PATH);
542 error_count++;
543 }
544
545 /* Memory pressure */
546 fp = fopen(PSI_MEMORY_PATH, "r");
547 if (fp) {
548 while (fgets(line, sizeof(line), fp)) {
549 if (strncmp(line, "some", 4) == 0) {
550 ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu",
551 &psi.memory_some_avg10, &psi.memory_some_avg60,
552 &psi.memory_some_avg300, &psi.memory_some_total);
553 if (ret != 4) {
554 fprintf(stderr, "Failed to parse Memory some PSI data\n");
555 error_count++;
556 }
557 } else if (strncmp(line, "full", 4) == 0) {
558 ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
559 &psi.memory_full_avg10, &psi.memory_full_avg60,
560 &psi.memory_full_avg300, &psi.memory_full_total);
561 if (ret != 4) {
562 fprintf(stderr, "Failed to parse Memory full PSI data\n");
563 error_count++;
564 }
565 }
566 }
567 fclose(fp);
568 } else {
569 fprintf(stderr, "Warning: Failed to open %s\n", PSI_MEMORY_PATH);
570 error_count++;
571 }
572
573 /* IO pressure */
574 fp = fopen(PSI_IO_PATH, "r");
575 if (fp) {
576 while (fgets(line, sizeof(line), fp)) {
577 if (strncmp(line, "some", 4) == 0) {
578 ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu",
579 &psi.io_some_avg10, &psi.io_some_avg60,
580 &psi.io_some_avg300, &psi.io_some_total);
581 if (ret != 4) {
582 fprintf(stderr, "Failed to parse IO some PSI data\n");
583 error_count++;
584 }
585 } else if (strncmp(line, "full", 4) == 0) {
586 ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
587 &psi.io_full_avg10, &psi.io_full_avg60,
588 &psi.io_full_avg300, &psi.io_full_total);
589 if (ret != 4) {
590 fprintf(stderr, "Failed to parse IO full PSI data\n");
591 error_count++;
592 }
593 }
594 }
595 fclose(fp);
596 } else {
597 fprintf(stderr, "Warning: Failed to open %s\n", PSI_IO_PATH);
598 error_count++;
599 }
600
601 /* IRQ pressure (only full) */
602 fp = fopen(PSI_IRQ_PATH, "r");
603 if (fp) {
604 while (fgets(line, sizeof(line), fp)) {
605 if (strncmp(line, "full", 4) == 0) {
606 ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu",
607 &psi.irq_full_avg10, &psi.irq_full_avg60,
608 &psi.irq_full_avg300, &psi.irq_full_total);
609 if (ret != 4) {
610 fprintf(stderr, "Failed to parse IRQ full PSI data\n");
611 error_count++;
612 }
613 }
614 }
615 fclose(fp);
616 } else {
617 fprintf(stderr, "Warning: Failed to open %s\n", PSI_IRQ_PATH);
618 error_count++;
619 }
620
621 /* Return error count: 0 means success, >0 means warnings, -1 means fatal error */
622 if (error_count > 0) {
623 fprintf(stderr, "PSI stats reading completed with %d warnings\n", error_count);
624 return error_count;
625 }
626
627 return 0;
628 }
629
/*
 * Read the command name of @pid from /proc/<pid>/comm into @comm_buf.
 *
 * @comm_buf: destination buffer
 * @buf_size: size of @comm_buf in bytes
 *
 * The trailing newline, if any, is removed. Returns 0 on success and -1 if
 * the file cannot be opened (a message is printed) or nothing can be read.
 */
static int read_comm(int pid, char *comm_buf, size_t buf_size)
{
	char proc_path[64];
	FILE *comm_file;
	int status = -1;

	snprintf(proc_path, sizeof(proc_path), "/proc/%d/comm", pid);

	comm_file = fopen(proc_path, "r");
	if (comm_file == NULL) {
		fprintf(stderr, "Failed to open comm file /proc/%d/comm\n", pid);
		return -1;
	}

	if (fgets(comm_buf, buf_size, comm_file) != NULL) {
		/* fgets() stops at the first newline, so it can only be last */
		comm_buf[strcspn(comm_buf, "\n")] = '\0';
		status = 0;
	}

	fclose(comm_file);
	return status;
}
655
fetch_and_fill_task_info(int pid,const char * comm)656 static void fetch_and_fill_task_info(int pid, const char *comm)
657 {
658 struct {
659 struct nlmsghdr n;
660 struct genlmsghdr g;
661 char buf[MAX_MSG_SIZE];
662 } resp;
663 struct taskstats stats;
664 struct nlattr *nested;
665 struct nlattr *na;
666 int nested_len;
667 int nl_len;
668 int rc;
669
670 /* Send request for task stats */
671 if (send_cmd(nl_sd, family_id, getpid(), TASKSTATS_CMD_GET,
672 TASKSTATS_CMD_ATTR_PID, &pid, sizeof(pid)) < 0) {
673 fprintf(stderr, "Failed to send request for task stats\n");
674 return;
675 }
676
677 /* Receive response */
678 rc = recv(nl_sd, &resp, sizeof(resp), 0);
679 if (rc < 0 || resp.n.nlmsg_type == NLMSG_ERROR) {
680 fprintf(stderr, "Failed to receive response for task stats\n");
681 return;
682 }
683
684 /* Parse response */
685 nl_len = GENLMSG_PAYLOAD(&resp.n);
686 na = (struct nlattr *) GENLMSG_DATA(&resp);
687 while (nl_len > 0) {
688 if (na->nla_type == TASKSTATS_TYPE_AGGR_PID) {
689 nested = (struct nlattr *) NLA_DATA(na);
690 nested_len = NLA_PAYLOAD(na->nla_len);
691 while (nested_len > 0) {
692 if (nested->nla_type == TASKSTATS_TYPE_STATS) {
693 memcpy(&stats, NLA_DATA(nested), sizeof(stats));
694 if (task_count < MAX_TASKS) {
695 tasks[task_count].pid = pid;
696 tasks[task_count].tgid = pid;
697 strncpy(tasks[task_count].command, comm,
698 TASK_COMM_LEN - 1);
699 tasks[task_count].command[TASK_COMM_LEN - 1] = '\0';
700 SET_TASK_STAT(task_count, cpu_count);
701 SET_TASK_STAT(task_count, cpu_delay_total);
702 SET_TASK_STAT(task_count, blkio_count);
703 SET_TASK_STAT(task_count, blkio_delay_total);
704 SET_TASK_STAT(task_count, swapin_count);
705 SET_TASK_STAT(task_count, swapin_delay_total);
706 SET_TASK_STAT(task_count, freepages_count);
707 SET_TASK_STAT(task_count, freepages_delay_total);
708 SET_TASK_STAT(task_count, thrashing_count);
709 SET_TASK_STAT(task_count, thrashing_delay_total);
710 SET_TASK_STAT(task_count, compact_count);
711 SET_TASK_STAT(task_count, compact_delay_total);
712 SET_TASK_STAT(task_count, wpcopy_count);
713 SET_TASK_STAT(task_count, wpcopy_delay_total);
714 SET_TASK_STAT(task_count, irq_count);
715 SET_TASK_STAT(task_count, irq_delay_total);
716 set_mem_count(&tasks[task_count]);
717 set_mem_delay_total(&tasks[task_count]);
718 task_count++;
719 }
720 break;
721 }
722 nested_len -= NLA_ALIGN(nested->nla_len);
723 nested = NLA_NEXT(nested);
724 }
725 }
726 nl_len -= NLA_ALIGN(na->nla_len);
727 na = NLA_NEXT(na);
728 }
729 return;
730 }
731
get_task_delays(void)732 static void get_task_delays(void)
733 {
734 char comm[TASK_COMM_LEN];
735 struct dirent *entry;
736 DIR *dir;
737 int pid;
738
739 task_count = 0;
740 if (cfg.monitor_pid > 0) {
741 if (read_comm(cfg.monitor_pid, comm, sizeof(comm)) == 0)
742 fetch_and_fill_task_info(cfg.monitor_pid, comm);
743 return;
744 }
745
746 dir = opendir("/proc");
747 if (!dir) {
748 fprintf(stderr, "Error opening /proc directory\n");
749 return;
750 }
751
752 while ((entry = readdir(dir)) != NULL && task_count < MAX_TASKS) {
753 if (!isdigit(entry->d_name[0]))
754 continue;
755 pid = atoi(entry->d_name);
756 if (pid == 0)
757 continue;
758 if (read_comm(pid, comm, sizeof(comm)) != 0)
759 continue;
760 fetch_and_fill_task_info(pid, comm);
761 }
762 closedir(dir);
763 }
764
/*
 * Average delay per event in milliseconds: @total is divided by 1000000
 * and then by @count. Returns 0 when @count is 0.
 */
static double average_ms(unsigned long long total, unsigned long long count)
{
	return count ? (double)total / 1000000.0 / count : 0;
}
772
773 /* Comparison function for sorting tasks */
compare_tasks(const void * a,const void * b)774 static int compare_tasks(const void *a, const void *b)
775 {
776 const struct task_info *t1 = (const struct task_info *)a;
777 const struct task_info *t2 = (const struct task_info *)b;
778 unsigned long long total1;
779 unsigned long long total2;
780 unsigned long count1;
781 unsigned long count2;
782 double avg1, avg2;
783
784 total1 = *(unsigned long long *)((char *)t1 + cfg.sort_field->total_offset);
785 total2 = *(unsigned long long *)((char *)t2 + cfg.sort_field->total_offset);
786 count1 = *(unsigned long *)((char *)t1 + cfg.sort_field->count_offset);
787 count2 = *(unsigned long *)((char *)t2 + cfg.sort_field->count_offset);
788
789 avg1 = average_ms(total1, count1);
790 avg2 = average_ms(total2, count2);
791 if (avg1 != avg2)
792 return avg2 > avg1 ? 1 : -1;
793
794 return 0;
795 }
796
797 /* Sort tasks by selected field */
sort_tasks(void)798 static void sort_tasks(void)
799 {
800 if (task_count > 0)
801 qsort(tasks, task_count, sizeof(struct task_info), compare_tasks);
802 }
803
804 /* Get container statistics via cgroupstats */
get_container_stats(void)805 static void get_container_stats(void)
806 {
807 int rc, cfd;
808 struct {
809 struct nlmsghdr n;
810 struct genlmsghdr g;
811 char buf[MAX_MSG_SIZE];
812 } req, resp;
813 struct nlattr *na;
814 int nl_len;
815 struct cgroupstats stats;
816
817 /* Check if container path is set */
818 if (!cfg.container_path)
819 return;
820
821 /* Open container cgroup */
822 cfd = open(cfg.container_path, O_RDONLY);
823 if (cfd < 0) {
824 fprintf(stderr, "Error opening container path: %s\n", cfg.container_path);
825 return;
826 }
827
828 /* Send request for container stats */
829 if (send_cmd(nl_sd, family_id, getpid(), CGROUPSTATS_CMD_GET,
830 CGROUPSTATS_CMD_ATTR_FD, &cfd, sizeof(__u32)) < 0) {
831 fprintf(stderr, "Failed to send request for container stats\n");
832 close(cfd);
833 return;
834 }
835
836 /* Receive response */
837 rc = recv(nl_sd, &resp, sizeof(resp), 0);
838 if (rc < 0 || resp.n.nlmsg_type == NLMSG_ERROR) {
839 fprintf(stderr, "Failed to receive response for container stats\n");
840 close(cfd);
841 return;
842 }
843
844 /* Parse response */
845 nl_len = GENLMSG_PAYLOAD(&resp.n);
846 na = (struct nlattr *) GENLMSG_DATA(&resp);
847 while (nl_len > 0) {
848 if (na->nla_type == CGROUPSTATS_TYPE_CGROUP_STATS) {
849 /* Get the cgroupstats structure */
850 memcpy(&stats, NLA_DATA(na), sizeof(stats));
851
852 /* Fill container stats */
853 container_stats.nr_sleeping = stats.nr_sleeping;
854 container_stats.nr_running = stats.nr_running;
855 container_stats.nr_stopped = stats.nr_stopped;
856 container_stats.nr_uninterruptible = stats.nr_uninterruptible;
857 container_stats.nr_io_wait = stats.nr_io_wait;
858 break;
859 }
860 nl_len -= NLA_ALIGN(na->nla_len);
861 na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len));
862 }
863
864 close(cfd);
865 }
866
867 /* Display results to stdout or log file */
display_results(int psi_ret)868 static void display_results(int psi_ret)
869 {
870 time_t now = time(NULL);
871 struct tm *tm_now = localtime(&now);
872 FILE *out = stdout;
873 char timestamp[32];
874 bool suc = true;
875 int i, count;
876
877 /* Clear terminal screen */
878 suc &= BOOL_FPRINT(out, "\033[H\033[J");
879
880 /* PSI output (one-line, no cat style) */
881 suc &= BOOL_FPRINT(out, "System Pressure Information: (avg10/avg60vg300/total)\n");
882 if (psi_ret) {
883 suc &= BOOL_FPRINT(out, " PSI not found: check if psi=1 enabled in cmdline\n");
884 } else {
885 suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
886 "CPU some:",
887 psi.cpu_some_avg10,
888 psi.cpu_some_avg60,
889 psi.cpu_some_avg300,
890 psi.cpu_some_total / 1000);
891 suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
892 "CPU full:",
893 psi.cpu_full_avg10,
894 psi.cpu_full_avg60,
895 psi.cpu_full_avg300,
896 psi.cpu_full_total / 1000);
897 suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
898 "Memory full:",
899 psi.memory_full_avg10,
900 psi.memory_full_avg60,
901 psi.memory_full_avg300,
902 psi.memory_full_total / 1000);
903 suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
904 "Memory some:",
905 psi.memory_some_avg10,
906 psi.memory_some_avg60,
907 psi.memory_some_avg300,
908 psi.memory_some_total / 1000);
909 suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
910 "IO full:",
911 psi.io_full_avg10,
912 psi.io_full_avg60,
913 psi.io_full_avg300,
914 psi.io_full_total / 1000);
915 suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
916 "IO some:",
917 psi.io_some_avg10,
918 psi.io_some_avg60,
919 psi.io_some_avg300,
920 psi.io_some_total / 1000);
921 suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT,
922 "IRQ full:",
923 psi.irq_full_avg10,
924 psi.irq_full_avg60,
925 psi.irq_full_avg300,
926 psi.irq_full_total / 1000);
927 }
928
929 if (cfg.container_path) {
930 suc &= BOOL_FPRINT(out, "Container Information (%s):\n", cfg.container_path);
931 suc &= BOOL_FPRINT(out, "Processes: running=%d, sleeping=%d, ",
932 container_stats.nr_running, container_stats.nr_sleeping);
933 suc &= BOOL_FPRINT(out, "stopped=%d, uninterruptible=%d, io_wait=%d\n\n",
934 container_stats.nr_stopped, container_stats.nr_uninterruptible,
935 container_stats.nr_io_wait);
936 }
937
938 /* Interacive command */
939 suc &= BOOL_FPRINT(out, "[o]sort [M]memverbose [q]quit\n");
940 if (sort_selected) {
941 if (cfg.display_mode == MODE_MEMVERBOSE)
942 suc &= BOOL_FPRINT(out,
943 "sort selection: [m]MEM [r]RCL [t]THR [p]CMP [w]WP\n");
944 else
945 suc &= BOOL_FPRINT(out,
946 "sort selection: [c]CPU [i]IO [m]MEM [q]IRQ\n");
947 }
948
949 /* Task delay output */
950 suc &= BOOL_FPRINT(out, "Top %d processes (sorted by %s delay):\n",
951 cfg.max_processes, get_name_by_field(cfg.sort_field));
952
953 suc &= BOOL_FPRINT(out, "%8s %8s %-17s", "PID", "TGID", "COMMAND");
954 if (cfg.display_mode == MODE_MEMVERBOSE) {
955 suc &= BOOL_FPRINT(out, "%8s %8s %8s %8s %8s %8s\n",
956 "MEM(ms)", "SWAP(ms)", "RCL(ms)",
957 "THR(ms)", "CMP(ms)", "WP(ms)");
958 suc &= BOOL_FPRINT(out, "-----------------------");
959 suc &= BOOL_FPRINT(out, "-----------------------");
960 suc &= BOOL_FPRINT(out, "-----------------------");
961 suc &= BOOL_FPRINT(out, "---------------------\n");
962 } else {
963 suc &= BOOL_FPRINT(out, "%8s %8s %8s %8s\n",
964 "CPU(ms)", "IO(ms)", "IRQ(ms)", "MEM(ms)");
965 suc &= BOOL_FPRINT(out, "-----------------------");
966 suc &= BOOL_FPRINT(out, "-----------------------");
967 suc &= BOOL_FPRINT(out, "--------------------------\n");
968 }
969
970 count = task_count < cfg.max_processes ? task_count : cfg.max_processes;
971
972 for (i = 0; i < count; i++) {
973 suc &= BOOL_FPRINT(out, "%8d %8d %-15s",
974 tasks[i].pid, tasks[i].tgid, tasks[i].command);
975 if (cfg.display_mode == MODE_MEMVERBOSE) {
976 suc &= BOOL_FPRINT(out, DELAY_FMT_MEMVERBOSE,
977 TASK_AVG(tasks[i], mem),
978 TASK_AVG(tasks[i], swapin),
979 TASK_AVG(tasks[i], freepages),
980 TASK_AVG(tasks[i], thrashing),
981 TASK_AVG(tasks[i], compact),
982 TASK_AVG(tasks[i], wpcopy));
983 } else {
984 suc &= BOOL_FPRINT(out, DELAY_FMT_DEFAULT,
985 TASK_AVG(tasks[i], cpu),
986 TASK_AVG(tasks[i], blkio),
987 TASK_AVG(tasks[i], irq),
988 TASK_AVG(tasks[i], mem));
989 }
990 }
991
992 suc &= BOOL_FPRINT(out, "\n");
993
994 if (!suc)
995 perror("Error writing to output");
996 }
997
998 /* Check for keyboard input with timeout based on cfg.delay */
check_for_keypress(void)999 static char check_for_keypress(void)
1000 {
1001 struct timeval tv = {cfg.delay, 0};
1002 fd_set readfds;
1003 char ch = 0;
1004
1005 FD_ZERO(&readfds);
1006 FD_SET(STDIN_FILENO, &readfds);
1007 int r = select(STDIN_FILENO + 1, &readfds, NULL, NULL, &tv);
1008
1009 if (r > 0 && FD_ISSET(STDIN_FILENO, &readfds)) {
1010 read(STDIN_FILENO, &ch, 1);
1011 return ch;
1012 }
1013
1014 return 0;
1015 }
1016
1017 #define MAX_MODE_SIZE 2
toggle_display_mode(void)1018 static void toggle_display_mode(void)
1019 {
1020 static const size_t modes[MAX_MODE_SIZE] = {MODE_DEFAULT, MODE_MEMVERBOSE};
1021 static size_t cur_index;
1022
1023 cur_index = (cur_index + 1) % MAX_MODE_SIZE;
1024 cfg.display_mode = modes[cur_index];
1025 }
1026
1027 /* Handle keyboard input: sorting selection, mode toggle, or quit */
handle_keypress(char ch,int * running)1028 static void handle_keypress(char ch, int *running)
1029 {
1030 const struct field_desc *field;
1031
1032 /* Change sort field */
1033 if (sort_selected) {
1034 field = get_field_by_cmd_char(ch);
1035 if (field && (field->supported_modes & cfg.display_mode))
1036 cfg.sort_field = field;
1037
1038 sort_selected = 0;
1039 /* Handle mode changes or quit */
1040 } else {
1041 switch (ch) {
1042 case 'o':
1043 sort_selected = 1;
1044 break;
1045 case 'M':
1046 toggle_display_mode();
1047 for (field = sort_fields; field->name != NULL; field++) {
1048 if (field->supported_modes & cfg.display_mode) {
1049 cfg.sort_field = field;
1050 break;
1051 }
1052 }
1053 break;
1054 case 'q':
1055 case 'Q':
1056 *running = 0;
1057 break;
1058 default:
1059 break;
1060 }
1061 }
1062 }
1063
1064 /* Main function */
main(int argc,char ** argv)1065 int main(int argc, char **argv)
1066 {
1067 const struct field_desc *field;
1068 int iterations = 0;
1069 int psi_ret = 0;
1070 char keypress;
1071
1072 /* Parse command line arguments */
1073 parse_args(argc, argv);
1074
1075 /* Setup netlink socket */
1076 nl_sd = create_nl_socket();
1077 if (nl_sd < 0) {
1078 fprintf(stderr, "Error creating netlink socket\n");
1079 exit(1);
1080 }
1081
1082 /* Get family ID for taskstats via netlink */
1083 family_id = get_family_id(nl_sd);
1084 if (!family_id) {
1085 fprintf(stderr, "Error getting taskstats family ID\n");
1086 close(nl_sd);
1087 exit(1);
1088 }
1089
1090 /* Set terminal to non-canonical mode for interaction */
1091 enable_raw_mode();
1092
1093 /* Main loop */
1094 while (running) {
1095 /* Auto-switch sort field when not matching display mode */
1096 if (!(cfg.sort_field->supported_modes & cfg.display_mode)) {
1097 for (field = sort_fields; field->name != NULL; field++) {
1098 if (field->supported_modes & cfg.display_mode) {
1099 cfg.sort_field = field;
1100 printf("Auto-switched sort field to: %s\n", field->name);
1101 break;
1102 }
1103 }
1104 }
1105
1106 /* Read PSI statistics */
1107 psi_ret = read_psi_stats();
1108
1109 /* Get container stats if container path provided */
1110 if (cfg.container_path)
1111 get_container_stats();
1112
1113 /* Get task delays */
1114 get_task_delays();
1115
1116 /* Sort tasks */
1117 sort_tasks();
1118
1119 /* Display results to stdout or log file */
1120 display_results(psi_ret);
1121
1122 /* Check for iterations */
1123 if (cfg.iterations > 0 && ++iterations >= cfg.iterations)
1124 break;
1125
1126 /* Exit if output_one_time is set */
1127 if (cfg.output_one_time)
1128 break;
1129
1130 /* Keypress for interactive usage */
1131 keypress = check_for_keypress();
1132 if (keypress)
1133 handle_keypress(keypress, &running);
1134 }
1135
1136 /* Restore terminal mode */
1137 disable_raw_mode();
1138
1139 /* Cleanup */
1140 close(nl_sd);
1141 if (cfg.container_path)
1142 free(cfg.container_path);
1143
1144 return 0;
1145 }
1146