1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * delaytop.c - system-wide delay monitoring tool. 4 * 5 * This tool provides real-time monitoring and statistics of 6 * system, container, and task-level delays, including CPU, 7 * memory, IO, and IRQ. It supports both interactive (top-like), 8 * and can output delay information for the whole system, specific 9 * containers (cgroups), or individual tasks (PIDs). 10 * 11 * Key features: 12 * - Collects per-task delay accounting statistics via taskstats. 13 * - Collects system-wide PSI information. 14 * - Supports sorting, filtering. 15 * - Supports both interactive (screen refresh). 16 * 17 * Copyright (C) Fan Yu, ZTE Corp. 2025 18 * Copyright (C) Wang Yaxin, ZTE Corp. 2025 19 * 20 * Compile with 21 * gcc -I/usr/src/linux/include delaytop.c -o delaytop 22 */ 23 24 #include <stdio.h> 25 #include <stdlib.h> 26 #include <string.h> 27 #include <errno.h> 28 #include <unistd.h> 29 #include <fcntl.h> 30 #include <getopt.h> 31 #include <signal.h> 32 #include <time.h> 33 #include <dirent.h> 34 #include <ctype.h> 35 #include <stdbool.h> 36 #include <sys/types.h> 37 #include <sys/stat.h> 38 #include <sys/socket.h> 39 #include <sys/select.h> 40 #include <termios.h> 41 #include <limits.h> 42 #include <linux/genetlink.h> 43 #include <linux/taskstats.h> 44 #include <linux/cgroupstats.h> 45 #include <stddef.h> 46 47 #define PSI_PATH "/proc/pressure" 48 #define PSI_CPU_PATH "/proc/pressure/cpu" 49 #define PSI_MEMORY_PATH "/proc/pressure/memory" 50 #define PSI_IO_PATH "/proc/pressure/io" 51 #define PSI_IRQ_PATH "/proc/pressure/irq" 52 53 #define NLA_NEXT(na) ((struct nlattr *)((char *)(na) + NLA_ALIGN((na)->nla_len))) 54 #define NLA_DATA(na) ((void *)((char *)(na) + NLA_HDRLEN)) 55 #define NLA_PAYLOAD(len) (len - NLA_HDRLEN) 56 57 #define GENLMSG_DATA(glh) ((void *)(NLMSG_DATA(glh) + GENL_HDRLEN)) 58 #define GENLMSG_PAYLOAD(glh) (NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN) 59 60 #define TASK_COMM_LEN 16 61 #define MAX_MSG_SIZE 1024 62 #define MAX_TASKS 1000 63 #define MAX_BUF_LEN 256 64 #define SET_TASK_STAT(task_count, field) tasks[task_count].field = stats.field 65 #define BOOL_FPRINT(stream, fmt, ...) \ 66 ({ \ 67 int ret = fprintf(stream, fmt, ##__VA_ARGS__); \ 68 ret >= 0; \ 69 }) 70 #define TASK_AVG(task, field) average_ms((task).field##_delay_total, (task).field##_count) 71 #define PSI_LINE_FORMAT "%-12s %6.1f%%/%6.1f%%/%6.1f%%/%8llu(ms)\n" 72 #define DELAY_FMT_DEFAULT "%8.2f %8.2f %8.2f %8.2f\n" 73 #define DELAY_FMT_MEMVERBOSE "%8.2f %8.2f %8.2f %8.2f %8.2f %8.2f\n" 74 #define SORT_FIELD(name, cmd, modes) \ 75 {#name, #cmd, \ 76 offsetof(struct task_info, name##_delay_total), \ 77 offsetof(struct task_info, name##_count), \ 78 modes} 79 #define END_FIELD {NULL, 0, 0} 80 81 /* Display mode types */ 82 #define MODE_TYPE_ALL (0xFFFFFFFF) 83 #define MODE_DEFAULT (1 << 0) 84 #define MODE_MEMVERBOSE (1 << 1) 85 86 /* PSI statistics structure */ 87 struct psi_stats { 88 double cpu_some_avg10, cpu_some_avg60, cpu_some_avg300; 89 unsigned long long cpu_some_total; 90 double cpu_full_avg10, cpu_full_avg60, cpu_full_avg300; 91 unsigned long long cpu_full_total; 92 double memory_some_avg10, memory_some_avg60, memory_some_avg300; 93 unsigned long long memory_some_total; 94 double memory_full_avg10, memory_full_avg60, memory_full_avg300; 95 unsigned long long memory_full_total; 96 double io_some_avg10, io_some_avg60, io_some_avg300; 97 unsigned long long io_some_total; 98 double io_full_avg10, io_full_avg60, io_full_avg300; 99 unsigned long long io_full_total; 100 double irq_full_avg10, irq_full_avg60, irq_full_avg300; 101 unsigned long long irq_full_total; 102 }; 103 104 /* Task delay information structure */ 105 struct task_info { 106 int pid; 107 int tgid; 108 char command[TASK_COMM_LEN]; 109 unsigned long long cpu_count; 110 unsigned long long cpu_delay_total; 111 unsigned long long blkio_count; 112 unsigned long long blkio_delay_total; 113 unsigned long long swapin_count; 114 unsigned long long swapin_delay_total; 115 unsigned long long freepages_count; 116 unsigned long long freepages_delay_total; 117 unsigned long long thrashing_count; 118 unsigned long long thrashing_delay_total; 119 unsigned long long compact_count; 120 unsigned long long compact_delay_total; 121 unsigned long long wpcopy_count; 122 unsigned long long wpcopy_delay_total; 123 unsigned long long irq_count; 124 unsigned long long irq_delay_total; 125 unsigned long long mem_count; 126 unsigned long long mem_delay_total; 127 }; 128 129 /* Container statistics structure */ 130 struct container_stats { 131 int nr_sleeping; /* Number of sleeping processes */ 132 int nr_running; /* Number of running processes */ 133 int nr_stopped; /* Number of stopped processes */ 134 int nr_uninterruptible; /* Number of uninterruptible processes */ 135 int nr_io_wait; /* Number of processes in IO wait */ 136 }; 137 138 /* Delay field structure */ 139 struct field_desc { 140 const char *name; /* Field name for cmdline argument */ 141 const char *cmd_char; /* Interactive command */ 142 unsigned long total_offset; /* Offset of total delay in task_info */ 143 unsigned long count_offset; /* Offset of count in task_info */ 144 size_t supported_modes; /* Supported display modes */ 145 }; 146 147 /* Program settings structure */ 148 struct config { 149 int delay; /* Update interval in seconds */ 150 int iterations; /* Number of iterations, 0 == infinite */ 151 int max_processes; /* Maximum number of processes to show */ 152 int output_one_time; /* Output once and exit */ 153 int monitor_pid; /* Monitor specific PID */ 154 char *container_path; /* Path to container cgroup */ 155 const struct field_desc *sort_field; /* Current sort field */ 156 size_t display_mode; /* Current display mode */ 157 }; 158 159 /* Global variables */ 160 static struct config cfg; 161 static struct psi_stats psi; 162 static struct task_info tasks[MAX_TASKS]; 163 static int task_count; 164 static int running = 1; 165 static struct container_stats container_stats; 166 static const struct field_desc sort_fields[] = { 167 SORT_FIELD(cpu, c, MODE_DEFAULT), 168 SORT_FIELD(blkio, i, MODE_DEFAULT), 169 SORT_FIELD(irq, q, MODE_DEFAULT), 170 SORT_FIELD(mem, m, MODE_DEFAULT | MODE_MEMVERBOSE), 171 SORT_FIELD(swapin, s, MODE_MEMVERBOSE), 172 SORT_FIELD(freepages, r, MODE_MEMVERBOSE), 173 SORT_FIELD(thrashing, t, MODE_MEMVERBOSE), 174 SORT_FIELD(compact, p, MODE_MEMVERBOSE), 175 SORT_FIELD(wpcopy, w, MODE_MEMVERBOSE), 176 END_FIELD 177 }; 178 static int sort_selected; 179 180 /* Netlink socket variables */ 181 static int nl_sd = -1; 182 static int family_id; 183 184 /* Set terminal to non-canonical mode for q-to-quit */ 185 static struct termios orig_termios; 186 static void enable_raw_mode(void) 187 { 188 struct termios raw; 189 190 tcgetattr(STDIN_FILENO, &orig_termios); 191 raw = orig_termios; 192 raw.c_lflag &= ~(ICANON | ECHO); 193 tcsetattr(STDIN_FILENO, TCSAFLUSH, &raw); 194 } 195 static void disable_raw_mode(void) 196 { 197 tcsetattr(STDIN_FILENO, TCSAFLUSH, &orig_termios); 198 } 199 200 /* Find field descriptor by command line */ 201 static const struct field_desc *get_field_by_cmd_char(char ch) 202 { 203 const struct field_desc *field; 204 205 for (field = sort_fields; field->name != NULL; field++) { 206 if (field->cmd_char[0] == ch) 207 return field; 208 } 209 210 return NULL; 211 } 212 213 /* Find field descriptor by name with string comparison */ 214 static const struct field_desc *get_field_by_name(const char *name) 215 { 216 const struct field_desc *field; 217 size_t field_len; 218 219 for (field = sort_fields; field->name != NULL; field++) { 220 field_len = strlen(field->name); 221 if (field_len != strlen(name)) 222 continue; 223 if (strncmp(field->name, name, field_len) == 0) 224 return field; 225 } 226 227 return NULL; 228 } 229 230 /* Find display name for a field descriptor */ 231 static const char *get_name_by_field(const struct field_desc *field) 232 { 233 return field ? field->name : "UNKNOWN"; 234 } 235 236 /* Generate string of available field names */ 237 static void display_available_fields(size_t mode) 238 { 239 const struct field_desc *field; 240 char buf[MAX_BUF_LEN]; 241 242 buf[0] = '\0'; 243 244 for (field = sort_fields; field->name != NULL; field++) { 245 if (!(field->supported_modes & mode)) 246 continue; 247 strncat(buf, "|", MAX_BUF_LEN - strlen(buf) - 1); 248 strncat(buf, field->name, MAX_BUF_LEN - strlen(buf) - 1); 249 buf[MAX_BUF_LEN - 1] = '\0'; 250 } 251 252 fprintf(stderr, "Available fields: %s\n", buf); 253 } 254 255 /* Display usage information and command line options */ 256 static void usage(void) 257 { 258 printf("Usage: delaytop [Options]\n" 259 "Options:\n" 260 " -h, --help Show this help message and exit\n" 261 " -d, --delay=SECONDS Set refresh interval (default: 2 seconds, min: 1)\n" 262 " -n, --iterations=COUNT Set number of updates (default: 0 = infinite)\n" 263 " -P, --processes=NUMBER Set maximum number of processes to show (default: 20, max: 1000)\n" 264 " -o, --once Display once and exit\n" 265 " -p, --pid=PID Monitor only the specified PID\n" 266 " -C, --container=PATH Monitor the container at specified cgroup path\n" 267 " -s, --sort=FIELD Sort by delay field (default: cpu)\n" 268 " -M, --memverbose Display memory detailed information\n"); 269 exit(0); 270 } 271 272 /* Parse command line arguments and set configuration */ 273 static void parse_args(int argc, char **argv) 274 { 275 int c; 276 const struct field_desc *field; 277 struct option long_options[] = { 278 {"help", no_argument, 0, 'h'}, 279 {"delay", required_argument, 0, 'd'}, 280 {"iterations", required_argument, 0, 'n'}, 281 {"pid", required_argument, 0, 'p'}, 282 {"once", no_argument, 0, 'o'}, 283 {"processes", required_argument, 0, 'P'}, 284 {"sort", required_argument, 0, 's'}, 285 {"container", required_argument, 0, 'C'}, 286 {"memverbose", no_argument, 0, 'M'}, 287 {0, 0, 0, 0} 288 }; 289 290 /* Set defaults */ 291 cfg.delay = 2; 292 cfg.iterations = 0; 293 cfg.max_processes = 20; 294 cfg.sort_field = &sort_fields[0]; /* Default sorted by CPU delay */ 295 cfg.output_one_time = 0; 296 cfg.monitor_pid = 0; /* 0 means monitor all PIDs */ 297 cfg.container_path = NULL; 298 cfg.display_mode = MODE_DEFAULT; 299 300 while (1) { 301 int option_index = 0; 302 303 c = getopt_long(argc, argv, "hd:n:p:oP:C:s:M", long_options, &option_index); 304 if (c == -1) 305 break; 306 307 switch (c) { 308 case 'h': 309 usage(); 310 break; 311 case 'd': 312 cfg.delay = atoi(optarg); 313 if (cfg.delay < 1) { 314 fprintf(stderr, "Error: delay must be >= 1.\n"); 315 exit(1); 316 } 317 break; 318 case 'n': 319 cfg.iterations = atoi(optarg); 320 if (cfg.iterations < 0) { 321 fprintf(stderr, "Error: iterations must be >= 0.\n"); 322 exit(1); 323 } 324 break; 325 case 'p': 326 cfg.monitor_pid = atoi(optarg); 327 if (cfg.monitor_pid < 1) { 328 fprintf(stderr, "Error: pid must be >= 1.\n"); 329 exit(1); 330 } 331 break; 332 case 'o': 333 cfg.output_one_time = 1; 334 break; 335 case 'P': 336 cfg.max_processes = atoi(optarg); 337 if (cfg.max_processes < 1) { 338 fprintf(stderr, "Error: processes must be >= 1.\n"); 339 exit(1); 340 } 341 if (cfg.max_processes > MAX_TASKS) { 342 fprintf(stderr, "Warning: processes capped to %d.\n", 343 MAX_TASKS); 344 cfg.max_processes = MAX_TASKS; 345 } 346 break; 347 case 'C': 348 cfg.container_path = strdup(optarg); 349 break; 350 case 's': 351 if (strlen(optarg) == 0) { 352 fprintf(stderr, "Error: empty sort field\n"); 353 exit(1); 354 } 355 356 field = get_field_by_name(optarg); 357 /* Show available fields if invalid option provided */ 358 if (!field) { 359 fprintf(stderr, "Error: invalid sort field '%s'\n", optarg); 360 display_available_fields(MODE_TYPE_ALL); 361 exit(1); 362 } 363 364 cfg.sort_field = field; 365 break; 366 case 'M': 367 cfg.display_mode = MODE_MEMVERBOSE; 368 cfg.sort_field = get_field_by_name("mem"); 369 break; 370 default: 371 fprintf(stderr, "Try 'delaytop --help' for more information.\n"); 372 exit(1); 373 } 374 } 375 } 376 377 /* Calculate average delay in milliseconds for overall memory */ 378 static void set_mem_delay_total(struct task_info *t) 379 { 380 t->mem_delay_total = t->swapin_delay_total + 381 t->freepages_delay_total + 382 t->thrashing_delay_total + 383 t->compact_delay_total + 384 t->wpcopy_delay_total; 385 } 386 387 static void set_mem_count(struct task_info *t) 388 { 389 t->mem_count = t->swapin_count + 390 t->freepages_count + 391 t->thrashing_count + 392 t->compact_count + 393 t->wpcopy_count; 394 } 395 396 /* Create a raw netlink socket and bind */ 397 static int create_nl_socket(void) 398 { 399 int fd; 400 struct sockaddr_nl local; 401 402 fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC); 403 if (fd < 0) 404 return -1; 405 406 memset(&local, 0, sizeof(local)); 407 local.nl_family = AF_NETLINK; 408 409 if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0) { 410 fprintf(stderr, "Failed to bind socket when create nl_socket\n"); 411 close(fd); 412 return -1; 413 } 414 415 return fd; 416 } 417 418 /* Send a command via netlink */ 419 static int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid, 420 __u8 genl_cmd, __u16 nla_type, 421 void *nla_data, int nla_len) 422 { 423 struct sockaddr_nl nladdr; 424 struct nlattr *na; 425 int r, buflen; 426 char *buf; 427 428 struct { 429 struct nlmsghdr n; 430 struct genlmsghdr g; 431 char buf[MAX_MSG_SIZE]; 432 } msg; 433 434 msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); 435 msg.n.nlmsg_type = nlmsg_type; 436 msg.n.nlmsg_flags = NLM_F_REQUEST; 437 msg.n.nlmsg_seq = 0; 438 msg.n.nlmsg_pid = nlmsg_pid; 439 msg.g.cmd = genl_cmd; 440 msg.g.version = 0x1; 441 na = (struct nlattr *) GENLMSG_DATA(&msg); 442 na->nla_type = nla_type; 443 na->nla_len = nla_len + NLA_HDRLEN; 444 memcpy(NLA_DATA(na), nla_data, nla_len); 445 msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len); 446 447 buf = (char *) &msg; 448 buflen = msg.n.nlmsg_len; 449 memset(&nladdr, 0, sizeof(nladdr)); 450 nladdr.nl_family = AF_NETLINK; 451 while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr, 452 sizeof(nladdr))) < buflen) { 453 if (r > 0) { 454 buf += r; 455 buflen -= r; 456 } else if (errno != EAGAIN) 457 return -1; 458 } 459 return 0; 460 } 461 462 /* Get family ID for taskstats via netlink */ 463 static int get_family_id(int sd) 464 { 465 struct { 466 struct nlmsghdr n; 467 struct genlmsghdr g; 468 char buf[256]; 469 } ans; 470 471 int id = 0, rc; 472 struct nlattr *na; 473 int rep_len; 474 char name[100]; 475 476 strncpy(name, TASKSTATS_GENL_NAME, sizeof(name) - 1); 477 name[sizeof(name) - 1] = '\0'; 478 rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY, 479 CTRL_ATTR_FAMILY_NAME, (void *)name, 480 strlen(TASKSTATS_GENL_NAME)+1); 481 if (rc < 0) { 482 fprintf(stderr, "Failed to send cmd for family id\n"); 483 return 0; 484 } 485 486 rep_len = recv(sd, &ans, sizeof(ans), 0); 487 if (ans.n.nlmsg_type == NLMSG_ERROR || 488 (rep_len < 0) || !NLMSG_OK((&ans.n), rep_len)) { 489 fprintf(stderr, "Failed to receive response for family id\n"); 490 return 0; 491 } 492 493 na = (struct nlattr *) GENLMSG_DATA(&ans); 494 na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len)); 495 if (na->nla_type == CTRL_ATTR_FAMILY_ID) 496 id = *(__u16 *) NLA_DATA(na); 497 return id; 498 } 499 500 static int read_psi_stats(void) 501 { 502 FILE *fp; 503 char line[256]; 504 int ret = 0; 505 int error_count = 0; 506 507 /* Check if PSI path exists */ 508 if (access(PSI_PATH, F_OK) != 0) { 509 fprintf(stderr, "Error: PSI interface not found at %s\n", PSI_PATH); 510 fprintf(stderr, "Please ensure your kernel supports PSI (Pressure Stall Information)\n"); 511 return -1; 512 } 513 514 /* Zero all fields */ 515 memset(&psi, 0, sizeof(psi)); 516 517 /* CPU pressure */ 518 fp = fopen(PSI_CPU_PATH, "r"); 519 if (fp) { 520 while (fgets(line, sizeof(line), fp)) { 521 if (strncmp(line, "some", 4) == 0) { 522 ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu", 523 &psi.cpu_some_avg10, &psi.cpu_some_avg60, 524 &psi.cpu_some_avg300, &psi.cpu_some_total); 525 if (ret != 4) { 526 fprintf(stderr, "Failed to parse CPU some PSI data\n"); 527 error_count++; 528 } 529 } else if (strncmp(line, "full", 4) == 0) { 530 ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu", 531 &psi.cpu_full_avg10, &psi.cpu_full_avg60, 532 &psi.cpu_full_avg300, &psi.cpu_full_total); 533 if (ret != 4) { 534 fprintf(stderr, "Failed to parse CPU full PSI data\n"); 535 error_count++; 536 } 537 } 538 } 539 fclose(fp); 540 } else { 541 fprintf(stderr, "Warning: Failed to open %s\n", PSI_CPU_PATH); 542 error_count++; 543 } 544 545 /* Memory pressure */ 546 fp = fopen(PSI_MEMORY_PATH, "r"); 547 if (fp) { 548 while (fgets(line, sizeof(line), fp)) { 549 if (strncmp(line, "some", 4) == 0) { 550 ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu", 551 &psi.memory_some_avg10, &psi.memory_some_avg60, 552 &psi.memory_some_avg300, &psi.memory_some_total); 553 if (ret != 4) { 554 fprintf(stderr, "Failed to parse Memory some PSI data\n"); 555 error_count++; 556 } 557 } else if (strncmp(line, "full", 4) == 0) { 558 ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu", 559 &psi.memory_full_avg10, &psi.memory_full_avg60, 560 &psi.memory_full_avg300, &psi.memory_full_total); 561 if (ret != 4) { 562 fprintf(stderr, "Failed to parse Memory full PSI data\n"); 563 error_count++; 564 } 565 } 566 } 567 fclose(fp); 568 } else { 569 fprintf(stderr, "Warning: Failed to open %s\n", PSI_MEMORY_PATH); 570 error_count++; 571 } 572 573 /* IO pressure */ 574 fp = fopen(PSI_IO_PATH, "r"); 575 if (fp) { 576 while (fgets(line, sizeof(line), fp)) { 577 if (strncmp(line, "some", 4) == 0) { 578 ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu", 579 &psi.io_some_avg10, &psi.io_some_avg60, 580 &psi.io_some_avg300, &psi.io_some_total); 581 if (ret != 4) { 582 fprintf(stderr, "Failed to parse IO some PSI data\n"); 583 error_count++; 584 } 585 } else if (strncmp(line, "full", 4) == 0) { 586 ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu", 587 &psi.io_full_avg10, &psi.io_full_avg60, 588 &psi.io_full_avg300, &psi.io_full_total); 589 if (ret != 4) { 590 fprintf(stderr, "Failed to parse IO full PSI data\n"); 591 error_count++; 592 } 593 } 594 } 595 fclose(fp); 596 } else { 597 fprintf(stderr, "Warning: Failed to open %s\n", PSI_IO_PATH); 598 error_count++; 599 } 600 601 /* IRQ pressure (only full) */ 602 fp = fopen(PSI_IRQ_PATH, "r"); 603 if (fp) { 604 while (fgets(line, sizeof(line), fp)) { 605 if (strncmp(line, "full", 4) == 0) { 606 ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu", 607 &psi.irq_full_avg10, &psi.irq_full_avg60, 608 &psi.irq_full_avg300, &psi.irq_full_total); 609 if (ret != 4) { 610 fprintf(stderr, "Failed to parse IRQ full PSI data\n"); 611 error_count++; 612 } 613 } 614 } 615 fclose(fp); 616 } else { 617 fprintf(stderr, "Warning: Failed to open %s\n", PSI_IRQ_PATH); 618 error_count++; 619 } 620 621 /* Return error count: 0 means success, >0 means warnings, -1 means fatal error */ 622 if (error_count > 0) { 623 fprintf(stderr, "PSI stats reading completed with %d warnings\n", error_count); 624 return error_count; 625 } 626 627 return 0; 628 } 629 630 static int read_comm(int pid, char *comm_buf, size_t buf_size) 631 { 632 char path[64]; 633 int ret = -1; 634 size_t len; 635 FILE *fp; 636 637 snprintf(path, sizeof(path), "/proc/%d/comm", pid); 638 fp = fopen(path, "r"); 639 if (!fp) { 640 fprintf(stderr, "Failed to open comm file /proc/%d/comm\n", pid); 641 return ret; 642 } 643 644 if (fgets(comm_buf, buf_size, fp)) { 645 len = strlen(comm_buf); 646 if (len > 0 && comm_buf[len - 1] == '\n') 647 comm_buf[len - 1] = '\0'; 648 ret = 0; 649 } 650 651 fclose(fp); 652 653 return ret; 654 } 655 656 static void fetch_and_fill_task_info(int pid, const char *comm) 657 { 658 struct { 659 struct nlmsghdr n; 660 struct genlmsghdr g; 661 char buf[MAX_MSG_SIZE]; 662 } resp; 663 struct taskstats stats; 664 struct nlattr *nested; 665 struct nlattr *na; 666 int nested_len; 667 int nl_len; 668 int rc; 669 670 /* Send request for task stats */ 671 if (send_cmd(nl_sd, family_id, getpid(), TASKSTATS_CMD_GET, 672 TASKSTATS_CMD_ATTR_PID, &pid, sizeof(pid)) < 0) { 673 fprintf(stderr, "Failed to send request for task stats\n"); 674 return; 675 } 676 677 /* Receive response */ 678 rc = recv(nl_sd, &resp, sizeof(resp), 0); 679 if (rc < 0 || resp.n.nlmsg_type == NLMSG_ERROR) { 680 fprintf(stderr, "Failed to receive response for task stats\n"); 681 return; 682 } 683 684 /* Parse response */ 685 nl_len = GENLMSG_PAYLOAD(&resp.n); 686 na = (struct nlattr *) GENLMSG_DATA(&resp); 687 while (nl_len > 0) { 688 if (na->nla_type == TASKSTATS_TYPE_AGGR_PID) { 689 nested = (struct nlattr *) NLA_DATA(na); 690 nested_len = NLA_PAYLOAD(na->nla_len); 691 while (nested_len > 0) { 692 if (nested->nla_type == TASKSTATS_TYPE_STATS) { 693 memcpy(&stats, NLA_DATA(nested), sizeof(stats)); 694 if (task_count < MAX_TASKS) { 695 tasks[task_count].pid = pid; 696 tasks[task_count].tgid = pid; 697 strncpy(tasks[task_count].command, comm, 698 TASK_COMM_LEN - 1); 699 tasks[task_count].command[TASK_COMM_LEN - 1] = '\0'; 700 SET_TASK_STAT(task_count, cpu_count); 701 SET_TASK_STAT(task_count, cpu_delay_total); 702 SET_TASK_STAT(task_count, blkio_count); 703 SET_TASK_STAT(task_count, blkio_delay_total); 704 SET_TASK_STAT(task_count, swapin_count); 705 SET_TASK_STAT(task_count, swapin_delay_total); 706 SET_TASK_STAT(task_count, freepages_count); 707 SET_TASK_STAT(task_count, freepages_delay_total); 708 SET_TASK_STAT(task_count, thrashing_count); 709 SET_TASK_STAT(task_count, thrashing_delay_total); 710 SET_TASK_STAT(task_count, compact_count); 711 SET_TASK_STAT(task_count, compact_delay_total); 712 SET_TASK_STAT(task_count, wpcopy_count); 713 SET_TASK_STAT(task_count, wpcopy_delay_total); 714 SET_TASK_STAT(task_count, irq_count); 715 SET_TASK_STAT(task_count, irq_delay_total); 716 set_mem_count(&tasks[task_count]); 717 set_mem_delay_total(&tasks[task_count]); 718 task_count++; 719 } 720 break; 721 } 722 nested_len -= NLA_ALIGN(nested->nla_len); 723 nested = NLA_NEXT(nested); 724 } 725 } 726 nl_len -= NLA_ALIGN(na->nla_len); 727 na = NLA_NEXT(na); 728 } 729 return; 730 } 731 732 static void get_task_delays(void) 733 { 734 char comm[TASK_COMM_LEN]; 735 struct dirent *entry; 736 DIR *dir; 737 int pid; 738 739 task_count = 0; 740 if (cfg.monitor_pid > 0) { 741 if (read_comm(cfg.monitor_pid, comm, sizeof(comm)) == 0) 742 fetch_and_fill_task_info(cfg.monitor_pid, comm); 743 return; 744 } 745 746 dir = opendir("/proc"); 747 if (!dir) { 748 fprintf(stderr, "Error opening /proc directory\n"); 749 return; 750 } 751 752 while ((entry = readdir(dir)) != NULL && task_count < MAX_TASKS) { 753 if (!isdigit(entry->d_name[0])) 754 continue; 755 pid = atoi(entry->d_name); 756 if (pid == 0) 757 continue; 758 if (read_comm(pid, comm, sizeof(comm)) != 0) 759 continue; 760 fetch_and_fill_task_info(pid, comm); 761 } 762 closedir(dir); 763 } 764 765 /* Calculate average delay in milliseconds */ 766 static double average_ms(unsigned long long total, unsigned long long count) 767 { 768 if (count == 0) 769 return 0; 770 return (double)total / 1000000.0 / count; 771 } 772 773 /* Comparison function for sorting tasks */ 774 static int compare_tasks(const void *a, const void *b) 775 { 776 const struct task_info *t1 = (const struct task_info *)a; 777 const struct task_info *t2 = (const struct task_info *)b; 778 unsigned long long total1; 779 unsigned long long total2; 780 unsigned long count1; 781 unsigned long count2; 782 double avg1, avg2; 783 784 total1 = *(unsigned long long *)((char *)t1 + cfg.sort_field->total_offset); 785 total2 = *(unsigned long long *)((char *)t2 + cfg.sort_field->total_offset); 786 count1 = *(unsigned long *)((char *)t1 + cfg.sort_field->count_offset); 787 count2 = *(unsigned long *)((char *)t2 + cfg.sort_field->count_offset); 788 789 avg1 = average_ms(total1, count1); 790 avg2 = average_ms(total2, count2); 791 if (avg1 != avg2) 792 return avg2 > avg1 ? 1 : -1; 793 794 return 0; 795 } 796 797 /* Sort tasks by selected field */ 798 static void sort_tasks(void) 799 { 800 if (task_count > 0) 801 qsort(tasks, task_count, sizeof(struct task_info), compare_tasks); 802 } 803 804 /* Get container statistics via cgroupstats */ 805 static void get_container_stats(void) 806 { 807 int rc, cfd; 808 struct { 809 struct nlmsghdr n; 810 struct genlmsghdr g; 811 char buf[MAX_MSG_SIZE]; 812 } req, resp; 813 struct nlattr *na; 814 int nl_len; 815 struct cgroupstats stats; 816 817 /* Check if container path is set */ 818 if (!cfg.container_path) 819 return; 820 821 /* Open container cgroup */ 822 cfd = open(cfg.container_path, O_RDONLY); 823 if (cfd < 0) { 824 fprintf(stderr, "Error opening container path: %s\n", cfg.container_path); 825 return; 826 } 827 828 /* Send request for container stats */ 829 if (send_cmd(nl_sd, family_id, getpid(), CGROUPSTATS_CMD_GET, 830 CGROUPSTATS_CMD_ATTR_FD, &cfd, sizeof(__u32)) < 0) { 831 fprintf(stderr, "Failed to send request for container stats\n"); 832 close(cfd); 833 return; 834 } 835 836 /* Receive response */ 837 rc = recv(nl_sd, &resp, sizeof(resp), 0); 838 if (rc < 0 || resp.n.nlmsg_type == NLMSG_ERROR) { 839 fprintf(stderr, "Failed to receive response for container stats\n"); 840 close(cfd); 841 return; 842 } 843 844 /* Parse response */ 845 nl_len = GENLMSG_PAYLOAD(&resp.n); 846 na = (struct nlattr *) GENLMSG_DATA(&resp); 847 while (nl_len > 0) { 848 if (na->nla_type == CGROUPSTATS_TYPE_CGROUP_STATS) { 849 /* Get the cgroupstats structure */ 850 memcpy(&stats, NLA_DATA(na), sizeof(stats)); 851 852 /* Fill container stats */ 853 container_stats.nr_sleeping = stats.nr_sleeping; 854 container_stats.nr_running = stats.nr_running; 855 container_stats.nr_stopped = stats.nr_stopped; 856 container_stats.nr_uninterruptible = stats.nr_uninterruptible; 857 container_stats.nr_io_wait = stats.nr_io_wait; 858 break; 859 } 860 nl_len -= NLA_ALIGN(na->nla_len); 861 na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len)); 862 } 863 864 close(cfd); 865 } 866 867 /* Display results to stdout or log file */ 868 static void display_results(int psi_ret) 869 { 870 time_t now = time(NULL); 871 struct tm *tm_now = localtime(&now); 872 FILE *out = stdout; 873 char timestamp[32]; 874 bool suc = true; 875 int i, count; 876 877 /* Clear terminal screen */ 878 suc &= BOOL_FPRINT(out, "\033[H\033[J"); 879 880 /* PSI output (one-line, no cat style) */ 881 suc &= BOOL_FPRINT(out, "System Pressure Information: (avg10/avg60vg300/total)\n"); 882 if (psi_ret) { 883 suc &= BOOL_FPRINT(out, " PSI not found: check if psi=1 enabled in cmdline\n"); 884 } else { 885 suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, 886 "CPU some:", 887 psi.cpu_some_avg10, 888 psi.cpu_some_avg60, 889 psi.cpu_some_avg300, 890 psi.cpu_some_total / 1000); 891 suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, 892 "CPU full:", 893 psi.cpu_full_avg10, 894 psi.cpu_full_avg60, 895 psi.cpu_full_avg300, 896 psi.cpu_full_total / 1000); 897 suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, 898 "Memory full:", 899 psi.memory_full_avg10, 900 psi.memory_full_avg60, 901 psi.memory_full_avg300, 902 psi.memory_full_total / 1000); 903 suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, 904 "Memory some:", 905 psi.memory_some_avg10, 906 psi.memory_some_avg60, 907 psi.memory_some_avg300, 908 psi.memory_some_total / 1000); 909 suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, 910 "IO full:", 911 psi.io_full_avg10, 912 psi.io_full_avg60, 913 psi.io_full_avg300, 914 psi.io_full_total / 1000); 915 suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, 916 "IO some:", 917 psi.io_some_avg10, 918 psi.io_some_avg60, 919 psi.io_some_avg300, 920 psi.io_some_total / 1000); 921 suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, 922 "IRQ full:", 923 psi.irq_full_avg10, 924 psi.irq_full_avg60, 925 psi.irq_full_avg300, 926 psi.irq_full_total / 1000); 927 } 928 929 if (cfg.container_path) { 930 suc &= BOOL_FPRINT(out, "Container Information (%s):\n", cfg.container_path); 931 suc &= BOOL_FPRINT(out, "Processes: running=%d, sleeping=%d, ", 932 container_stats.nr_running, container_stats.nr_sleeping); 933 suc &= BOOL_FPRINT(out, "stopped=%d, uninterruptible=%d, io_wait=%d\n\n", 934 container_stats.nr_stopped, container_stats.nr_uninterruptible, 935 container_stats.nr_io_wait); 936 } 937 938 /* Interacive command */ 939 suc &= BOOL_FPRINT(out, "[o]sort [M]memverbose [q]quit\n"); 940 if (sort_selected) { 941 if (cfg.display_mode == MODE_MEMVERBOSE) 942 suc &= BOOL_FPRINT(out, 943 "sort selection: [m]MEM [r]RCL [t]THR [p]CMP [w]WP\n"); 944 else 945 suc &= BOOL_FPRINT(out, 946 "sort selection: [c]CPU [i]IO [m]MEM [q]IRQ\n"); 947 } 948 949 /* Task delay output */ 950 suc &= BOOL_FPRINT(out, "Top %d processes (sorted by %s delay):\n", 951 cfg.max_processes, get_name_by_field(cfg.sort_field)); 952 953 suc &= BOOL_FPRINT(out, "%8s %8s %-17s", "PID", "TGID", "COMMAND"); 954 if (cfg.display_mode == MODE_MEMVERBOSE) { 955 suc &= BOOL_FPRINT(out, "%8s %8s %8s %8s %8s %8s\n", 956 "MEM(ms)", "SWAP(ms)", "RCL(ms)", 957 "THR(ms)", "CMP(ms)", "WP(ms)"); 958 suc &= BOOL_FPRINT(out, "-----------------------"); 959 suc &= BOOL_FPRINT(out, "-----------------------"); 960 suc &= BOOL_FPRINT(out, "-----------------------"); 961 suc &= BOOL_FPRINT(out, "---------------------\n"); 962 } else { 963 suc &= BOOL_FPRINT(out, "%8s %8s %8s %8s\n", 964 "CPU(ms)", "IO(ms)", "IRQ(ms)", "MEM(ms)"); 965 suc &= BOOL_FPRINT(out, "-----------------------"); 966 suc &= BOOL_FPRINT(out, "-----------------------"); 967 suc &= BOOL_FPRINT(out, "--------------------------\n"); 968 } 969 970 count = task_count < cfg.max_processes ? task_count : cfg.max_processes; 971 972 for (i = 0; i < count; i++) { 973 suc &= BOOL_FPRINT(out, "%8d %8d %-15s", 974 tasks[i].pid, tasks[i].tgid, tasks[i].command); 975 if (cfg.display_mode == MODE_MEMVERBOSE) { 976 suc &= BOOL_FPRINT(out, DELAY_FMT_MEMVERBOSE, 977 TASK_AVG(tasks[i], mem), 978 TASK_AVG(tasks[i], swapin), 979 TASK_AVG(tasks[i], freepages), 980 TASK_AVG(tasks[i], thrashing), 981 TASK_AVG(tasks[i], compact), 982 TASK_AVG(tasks[i], wpcopy)); 983 } else { 984 suc &= BOOL_FPRINT(out, DELAY_FMT_DEFAULT, 985 TASK_AVG(tasks[i], cpu), 986 TASK_AVG(tasks[i], blkio), 987 TASK_AVG(tasks[i], irq), 988 TASK_AVG(tasks[i], mem)); 989 } 990 } 991 992 suc &= BOOL_FPRINT(out, "\n"); 993 994 if (!suc) 995 perror("Error writing to output"); 996 } 997 998 /* Check for keyboard input with timeout based on cfg.delay */ 999 static char check_for_keypress(void) 1000 { 1001 struct timeval tv = {cfg.delay, 0}; 1002 fd_set readfds; 1003 char ch = 0; 1004 1005 FD_ZERO(&readfds); 1006 FD_SET(STDIN_FILENO, &readfds); 1007 int r = select(STDIN_FILENO + 1, &readfds, NULL, NULL, &tv); 1008 1009 if (r > 0 && FD_ISSET(STDIN_FILENO, &readfds)) { 1010 read(STDIN_FILENO, &ch, 1); 1011 return ch; 1012 } 1013 1014 return 0; 1015 } 1016 1017 #define MAX_MODE_SIZE 2 1018 static void toggle_display_mode(void) 1019 { 1020 static const size_t modes[MAX_MODE_SIZE] = {MODE_DEFAULT, MODE_MEMVERBOSE}; 1021 static size_t cur_index; 1022 1023 cur_index = (cur_index + 1) % MAX_MODE_SIZE; 1024 cfg.display_mode = modes[cur_index]; 1025 } 1026 1027 /* Handle keyboard input: sorting selection, mode toggle, or quit */ 1028 static void handle_keypress(char ch, int *running) 1029 { 1030 const struct field_desc *field; 1031 1032 /* Change sort field */ 1033 if (sort_selected) { 1034 field = get_field_by_cmd_char(ch); 1035 if (field && (field->supported_modes & cfg.display_mode)) 1036 cfg.sort_field = field; 1037 1038 sort_selected = 0; 1039 /* Handle mode changes or quit */ 1040 } else { 1041 switch (ch) { 1042 case 'o': 1043 sort_selected = 1; 1044 break; 1045 case 'M': 1046 toggle_display_mode(); 1047 for (field = sort_fields; field->name != NULL; field++) { 1048 if (field->supported_modes & cfg.display_mode) { 1049 cfg.sort_field = field; 1050 break; 1051 } 1052 } 1053 break; 1054 case 'q': 1055 case 'Q': 1056 *running = 0; 1057 break; 1058 default: 1059 break; 1060 } 1061 } 1062 } 1063 1064 /* Main function */ 1065 int main(int argc, char **argv) 1066 { 1067 const struct field_desc *field; 1068 int iterations = 0; 1069 int psi_ret = 0; 1070 char keypress; 1071 1072 /* Parse command line arguments */ 1073 parse_args(argc, argv); 1074 1075 /* Setup netlink socket */ 1076 nl_sd = create_nl_socket(); 1077 if (nl_sd < 0) { 1078 fprintf(stderr, "Error creating netlink socket\n"); 1079 exit(1); 1080 } 1081 1082 /* Get family ID for taskstats via netlink */ 1083 family_id = get_family_id(nl_sd); 1084 if (!family_id) { 1085 fprintf(stderr, "Error getting taskstats family ID\n"); 1086 close(nl_sd); 1087 exit(1); 1088 } 1089 1090 /* Set terminal to non-canonical mode for interaction */ 1091 enable_raw_mode(); 1092 1093 /* Main loop */ 1094 while (running) { 1095 /* Auto-switch sort field when not matching display mode */ 1096 if (!(cfg.sort_field->supported_modes & cfg.display_mode)) { 1097 for (field = sort_fields; field->name != NULL; field++) { 1098 if (field->supported_modes & cfg.display_mode) { 1099 cfg.sort_field = field; 1100 printf("Auto-switched sort field to: %s\n", field->name); 1101 break; 1102 } 1103 } 1104 } 1105 1106 /* Read PSI statistics */ 1107 psi_ret = read_psi_stats(); 1108 1109 /* Get container stats if container path provided */ 1110 if (cfg.container_path) 1111 get_container_stats(); 1112 1113 /* Get task delays */ 1114 get_task_delays(); 1115 1116 /* Sort tasks */ 1117 sort_tasks(); 1118 1119 /* Display results to stdout or log file */ 1120 display_results(psi_ret); 1121 1122 /* Check for iterations */ 1123 if (cfg.iterations > 0 && ++iterations >= cfg.iterations) 1124 break; 1125 1126 /* Exit if output_one_time is set */ 1127 if (cfg.output_one_time) 1128 break; 1129 1130 /* Keypress for interactive usage */ 1131 keypress = check_for_keypress(); 1132 if (keypress) 1133 handle_keypress(keypress, &running); 1134 } 1135 1136 /* Restore terminal mode */ 1137 disable_raw_mode(); 1138 1139 /* Cleanup */ 1140 close(nl_sd); 1141 if (cfg.container_path) 1142 free(cfg.container_path); 1143 1144 return 0; 1145 } 1146