1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * delaytop.c - task delay monitoring tool. 4 * 5 * This tool provides real-time monitoring and statistics of 6 * system, container, and task-level delays, including CPU, 7 * memory, IO, and IRQ and delay accounting. It supports both 8 * interactive (top-like), and can output delay information 9 * for the whole system, specific containers (cgroups), or 10 * individual tasks (PIDs). 11 * 12 * Key features: 13 * - Collects per-task delay accounting statistics via taskstats. 14 * - Supports sorting, filtering. 15 * - Supports both interactive (screen refresh). 16 * 17 * Copyright (C) Fan Yu, ZTE Corp. 2025 18 * Copyright (C) Wang Yaxin, ZTE Corp. 2025 19 * 20 * Compile with 21 * gcc -I/usr/src/linux/include delaytop.c -o delaytop 22 */ 23 24 #include <stdio.h> 25 #include <stdlib.h> 26 #include <string.h> 27 #include <errno.h> 28 #include <unistd.h> 29 #include <fcntl.h> 30 #include <getopt.h> 31 #include <signal.h> 32 #include <time.h> 33 #include <dirent.h> 34 #include <ctype.h> 35 #include <sys/types.h> 36 #include <sys/stat.h> 37 #include <sys/socket.h> 38 #include <sys/select.h> 39 #include <termios.h> 40 #include <limits.h> 41 #include <linux/genetlink.h> 42 #include <linux/taskstats.h> 43 #include <linux/cgroupstats.h> 44 #include <ncurses.h> 45 46 #define NLA_NEXT(na) ((struct nlattr *)((char *)(na) + NLA_ALIGN((na)->nla_len))) 47 #define NLA_DATA(na) ((void *)((char *)(na) + NLA_HDRLEN)) 48 #define NLA_PAYLOAD(len) (len - NLA_HDRLEN) 49 50 #define GENLMSG_DATA(glh) ((void *)(NLMSG_DATA(glh) + GENL_HDRLEN)) 51 #define GENLMSG_PAYLOAD(glh) (NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN) 52 53 #define TASK_COMM_LEN 16 54 #define MAX_MSG_SIZE 1024 55 #define MAX_TASKS 1000 56 #define SET_TASK_STAT(task_count, field) tasks[task_count].field = stats.field 57 58 /* Program settings structure */ 59 struct config { 60 int delay; /* Update interval in seconds */ 61 int iterations; /* Number of iterations, 0 == infinite */ 62 int max_processes; /* Maximum number of processes to show */ 63 char sort_field; /* Field to sort by */ 64 int output_one_time; /* Output once and exit */ 65 int monitor_pid; /* Monitor specific PID */ 66 char *container_path; /* Path to container cgroup */ 67 }; 68 69 /* Task delay information structure */ 70 struct task_info { 71 int pid; 72 int tgid; 73 char command[TASK_COMM_LEN]; 74 unsigned long long cpu_count; 75 unsigned long long cpu_delay_total; 76 unsigned long long blkio_count; 77 unsigned long long blkio_delay_total; 78 unsigned long long swapin_count; 79 unsigned long long swapin_delay_total; 80 unsigned long long freepages_count; 81 unsigned long long freepages_delay_total; 82 unsigned long long thrashing_count; 83 unsigned long long thrashing_delay_total; 84 unsigned long long compact_count; 85 unsigned long long compact_delay_total; 86 unsigned long long wpcopy_count; 87 unsigned long long wpcopy_delay_total; 88 unsigned long long irq_count; 89 unsigned long long irq_delay_total; 90 }; 91 92 /* Container statistics structure */ 93 struct container_stats { 94 int nr_sleeping; /* Number of sleeping processes */ 95 int nr_running; /* Number of running processes */ 96 int nr_stopped; /* Number of stopped processes */ 97 int nr_uninterruptible; /* Number of uninterruptible processes */ 98 int nr_io_wait; /* Number of processes in IO wait */ 99 }; 100 101 /* Global variables */ 102 static struct config cfg; 103 static struct task_info tasks[MAX_TASKS]; 104 static int task_count; 105 static int running = 1; 106 static struct container_stats container_stats; 107 108 /* Netlink socket variables */ 109 static int nl_sd = -1; 110 static int family_id; 111 112 /* Set terminal to non-canonical mode for q-to-quit */ 113 static struct termios orig_termios; 114 static void enable_raw_mode(void) 115 { 116 struct termios raw; 117 118 tcgetattr(STDIN_FILENO, &orig_termios); 119 raw = orig_termios; 120 raw.c_lflag &= ~(ICANON | ECHO); 121 tcsetattr(STDIN_FILENO, TCSAFLUSH, &raw); 122 } 123 static void disable_raw_mode(void) 124 { 125 tcsetattr(STDIN_FILENO, TCSAFLUSH, &orig_termios); 126 } 127 128 /* Display usage information and command line options */ 129 static void usage(void) 130 { 131 printf("Usage: delaytop [Options]\n" 132 "Options:\n" 133 " -h, --help Show this help message and exit\n" 134 " -d, --delay=SECONDS Set refresh interval (default: 2 seconds, min: 1)\n" 135 " -n, --iterations=COUNT Set number of updates (default: 0 = infinite)\n" 136 " -P, --processes=NUMBER Set maximum number of processes to show (default: 20, max: 1000)\n" 137 " -o, --once Display once and exit\n" 138 " -p, --pid=PID Monitor only the specified PID\n" 139 " -C, --container=PATH Monitor the container at specified cgroup path\n"); 140 exit(0); 141 } 142 143 /* Parse command line arguments and set configuration */ 144 static void parse_args(int argc, char **argv) 145 { 146 int c; 147 struct option long_options[] = { 148 {"help", no_argument, 0, 'h'}, 149 {"delay", required_argument, 0, 'd'}, 150 {"iterations", required_argument, 0, 'n'}, 151 {"pid", required_argument, 0, 'p'}, 152 {"once", no_argument, 0, 'o'}, 153 {"processes", required_argument, 0, 'P'}, 154 {"container", required_argument, 0, 'C'}, 155 {0, 0, 0, 0} 156 }; 157 158 /* Set defaults */ 159 cfg.delay = 2; 160 cfg.iterations = 0; 161 cfg.max_processes = 20; 162 cfg.sort_field = 'c'; /* Default sort by CPU delay */ 163 cfg.output_one_time = 0; 164 cfg.monitor_pid = 0; /* 0 means monitor all PIDs */ 165 cfg.container_path = NULL; 166 167 while (1) { 168 int option_index = 0; 169 170 c = getopt_long(argc, argv, "hd:n:p:oP:C:", long_options, &option_index); 171 if (c == -1) 172 break; 173 174 switch (c) { 175 case 'h': 176 usage(); 177 break; 178 case 'd': 179 cfg.delay = atoi(optarg); 180 if (cfg.delay < 1) { 181 fprintf(stderr, "Error: delay must be >= 1.\n"); 182 exit(1); 183 } 184 break; 185 case 'n': 186 cfg.iterations = atoi(optarg); 187 if (cfg.iterations < 0) { 188 fprintf(stderr, "Error: iterations must be >= 0.\n"); 189 exit(1); 190 } 191 break; 192 case 'p': 193 cfg.monitor_pid = atoi(optarg); 194 if (cfg.monitor_pid < 1) { 195 fprintf(stderr, "Error: pid must be >= 1.\n"); 196 exit(1); 197 } 198 break; 199 case 'o': 200 cfg.output_one_time = 1; 201 break; 202 case 'P': 203 cfg.max_processes = atoi(optarg); 204 if (cfg.max_processes < 1) { 205 fprintf(stderr, "Error: processes must be >= 1.\n"); 206 exit(1); 207 } 208 if (cfg.max_processes > MAX_TASKS) { 209 fprintf(stderr, "Warning: processes capped to %d.\n", 210 MAX_TASKS); 211 cfg.max_processes = MAX_TASKS; 212 } 213 break; 214 case 'C': 215 cfg.container_path = strdup(optarg); 216 break; 217 default: 218 fprintf(stderr, "Try 'delaytop --help' for more information.\n"); 219 exit(1); 220 } 221 } 222 } 223 224 /* Create a raw netlink socket and bind */ 225 static int create_nl_socket(void) 226 { 227 int fd; 228 struct sockaddr_nl local; 229 230 fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC); 231 if (fd < 0) 232 return -1; 233 234 memset(&local, 0, sizeof(local)); 235 local.nl_family = AF_NETLINK; 236 237 if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0) { 238 close(fd); 239 return -1; 240 } 241 242 return fd; 243 } 244 245 /* Send a command via netlink */ 246 static int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid, 247 __u8 genl_cmd, __u16 nla_type, 248 void *nla_data, int nla_len) 249 { 250 struct sockaddr_nl nladdr; 251 struct nlattr *na; 252 int r, buflen; 253 char *buf; 254 255 struct { 256 struct nlmsghdr n; 257 struct genlmsghdr g; 258 char buf[MAX_MSG_SIZE]; 259 } msg; 260 261 msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); 262 msg.n.nlmsg_type = nlmsg_type; 263 msg.n.nlmsg_flags = NLM_F_REQUEST; 264 msg.n.nlmsg_seq = 0; 265 msg.n.nlmsg_pid = nlmsg_pid; 266 msg.g.cmd = genl_cmd; 267 msg.g.version = 0x1; 268 na = (struct nlattr *) GENLMSG_DATA(&msg); 269 na->nla_type = nla_type; 270 na->nla_len = nla_len + NLA_HDRLEN; 271 memcpy(NLA_DATA(na), nla_data, nla_len); 272 msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len); 273 274 buf = (char *) &msg; 275 buflen = msg.n.nlmsg_len; 276 memset(&nladdr, 0, sizeof(nladdr)); 277 nladdr.nl_family = AF_NETLINK; 278 while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr, 279 sizeof(nladdr))) < buflen) { 280 if (r > 0) { 281 buf += r; 282 buflen -= r; 283 } else if (errno != EAGAIN) 284 return -1; 285 } 286 return 0; 287 } 288 289 /* Get family ID for taskstats via netlink */ 290 static int get_family_id(int sd) 291 { 292 struct { 293 struct nlmsghdr n; 294 struct genlmsghdr g; 295 char buf[256]; 296 } ans; 297 298 int id = 0, rc; 299 struct nlattr *na; 300 int rep_len; 301 char name[100]; 302 303 strncpy(name, TASKSTATS_GENL_NAME, sizeof(name) - 1); 304 name[sizeof(name) - 1] = '\0'; 305 rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY, 306 CTRL_ATTR_FAMILY_NAME, (void *)name, 307 strlen(TASKSTATS_GENL_NAME)+1); 308 if (rc < 0) 309 return 0; 310 311 rep_len = recv(sd, &ans, sizeof(ans), 0); 312 if (ans.n.nlmsg_type == NLMSG_ERROR || 313 (rep_len < 0) || !NLMSG_OK((&ans.n), rep_len)) 314 return 0; 315 316 na = (struct nlattr *) GENLMSG_DATA(&ans); 317 na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len)); 318 if (na->nla_type == CTRL_ATTR_FAMILY_ID) 319 id = *(__u16 *) NLA_DATA(na); 320 return id; 321 } 322 323 static int read_comm(int pid, char *comm_buf, size_t buf_size) 324 { 325 char path[64]; 326 size_t len; 327 FILE *fp; 328 329 snprintf(path, sizeof(path), "/proc/%d/comm", pid); 330 fp = fopen(path, "r"); 331 if (!fp) 332 return -1; 333 if (fgets(comm_buf, buf_size, fp)) { 334 len = strlen(comm_buf); 335 if (len > 0 && comm_buf[len - 1] == '\n') 336 comm_buf[len - 1] = '\0'; 337 } else { 338 fclose(fp); 339 return -1; 340 } 341 fclose(fp); 342 return 0; 343 } 344 345 static int fetch_and_fill_task_info(int pid, const char *comm) 346 { 347 struct { 348 struct nlmsghdr n; 349 struct genlmsghdr g; 350 char buf[MAX_MSG_SIZE]; 351 } resp; 352 struct taskstats stats; 353 struct nlattr *nested; 354 struct nlattr *na; 355 int nested_len; 356 int nl_len; 357 int rc; 358 359 if (send_cmd(nl_sd, family_id, getpid(), TASKSTATS_CMD_GET, 360 TASKSTATS_CMD_ATTR_PID, &pid, sizeof(pid)) < 0) { 361 return -1; 362 } 363 rc = recv(nl_sd, &resp, sizeof(resp), 0); 364 if (rc < 0 || resp.n.nlmsg_type == NLMSG_ERROR) 365 return -1; 366 nl_len = GENLMSG_PAYLOAD(&resp.n); 367 na = (struct nlattr *) GENLMSG_DATA(&resp); 368 while (nl_len > 0) { 369 if (na->nla_type == TASKSTATS_TYPE_AGGR_PID) { 370 nested = (struct nlattr *) NLA_DATA(na); 371 nested_len = NLA_PAYLOAD(na->nla_len); 372 while (nested_len > 0) { 373 if (nested->nla_type == TASKSTATS_TYPE_STATS) { 374 memcpy(&stats, NLA_DATA(nested), sizeof(stats)); 375 if (task_count < MAX_TASKS) { 376 tasks[task_count].pid = pid; 377 tasks[task_count].tgid = pid; 378 strncpy(tasks[task_count].command, comm, 379 TASK_COMM_LEN - 1); 380 tasks[task_count].command[TASK_COMM_LEN - 1] = '\0'; 381 SET_TASK_STAT(task_count, cpu_count); 382 SET_TASK_STAT(task_count, cpu_delay_total); 383 SET_TASK_STAT(task_count, blkio_count); 384 SET_TASK_STAT(task_count, blkio_delay_total); 385 SET_TASK_STAT(task_count, swapin_count); 386 SET_TASK_STAT(task_count, swapin_delay_total); 387 SET_TASK_STAT(task_count, freepages_count); 388 SET_TASK_STAT(task_count, freepages_delay_total); 389 SET_TASK_STAT(task_count, thrashing_count); 390 SET_TASK_STAT(task_count, thrashing_delay_total); 391 SET_TASK_STAT(task_count, compact_count); 392 SET_TASK_STAT(task_count, compact_delay_total); 393 SET_TASK_STAT(task_count, wpcopy_count); 394 SET_TASK_STAT(task_count, wpcopy_delay_total); 395 SET_TASK_STAT(task_count, irq_count); 396 SET_TASK_STAT(task_count, irq_delay_total); 397 task_count++; 398 } 399 break; 400 } 401 nested_len -= NLA_ALIGN(nested->nla_len); 402 nested = NLA_NEXT(nested); 403 } 404 } 405 nl_len -= NLA_ALIGN(na->nla_len); 406 na = NLA_NEXT(na); 407 } 408 return 0; 409 } 410 411 static void get_task_delays(void) 412 { 413 char comm[TASK_COMM_LEN]; 414 struct dirent *entry; 415 DIR *dir; 416 int pid; 417 418 task_count = 0; 419 if (cfg.monitor_pid > 0) { 420 if (read_comm(cfg.monitor_pid, comm, sizeof(comm)) == 0) 421 fetch_and_fill_task_info(cfg.monitor_pid, comm); 422 return; 423 } 424 425 dir = opendir("/proc"); 426 if (!dir) { 427 fprintf(stderr, "Error opening /proc directory\n"); 428 return; 429 } 430 431 while ((entry = readdir(dir)) != NULL && task_count < MAX_TASKS) { 432 if (!isdigit(entry->d_name[0])) 433 continue; 434 pid = atoi(entry->d_name); 435 if (pid == 0) 436 continue; 437 if (read_comm(pid, comm, sizeof(comm)) != 0) 438 continue; 439 fetch_and_fill_task_info(pid, comm); 440 } 441 closedir(dir); 442 } 443 444 /* Calculate average delay in milliseconds */ 445 static double average_ms(unsigned long long total, unsigned long long count) 446 { 447 if (count == 0) 448 return 0; 449 return (double)total / 1000000.0 / count; 450 } 451 452 /* Comparison function for sorting tasks */ 453 static int compare_tasks(const void *a, const void *b) 454 { 455 const struct task_info *t1 = (const struct task_info *)a; 456 const struct task_info *t2 = (const struct task_info *)b; 457 double avg1, avg2; 458 459 switch (cfg.sort_field) { 460 case 'c': /* CPU */ 461 avg1 = average_ms(t1->cpu_delay_total, t1->cpu_count); 462 avg2 = average_ms(t2->cpu_delay_total, t2->cpu_count); 463 if (avg1 != avg2) 464 return avg2 > avg1 ? 1 : -1; 465 return t2->cpu_delay_total > t1->cpu_delay_total ? 1 : -1; 466 467 default: 468 return t2->cpu_delay_total > t1->cpu_delay_total ? 1 : -1; 469 } 470 } 471 472 /* Sort tasks by selected field */ 473 static void sort_tasks(void) 474 { 475 if (task_count > 0) 476 qsort(tasks, task_count, sizeof(struct task_info), compare_tasks); 477 } 478 479 /* Get container statistics via cgroupstats */ 480 static void get_container_stats(void) 481 { 482 int rc, cfd; 483 struct { 484 struct nlmsghdr n; 485 struct genlmsghdr g; 486 char buf[MAX_MSG_SIZE]; 487 } req, resp; 488 struct nlattr *na; 489 int nl_len; 490 struct cgroupstats stats; 491 492 /* Check if container path is set */ 493 if (!cfg.container_path) 494 return; 495 496 /* Open container cgroup */ 497 cfd = open(cfg.container_path, O_RDONLY); 498 if (cfd < 0) { 499 fprintf(stderr, "Error opening container path: %s\n", cfg.container_path); 500 return; 501 } 502 503 /* Send request for container stats */ 504 if (send_cmd(nl_sd, family_id, getpid(), CGROUPSTATS_CMD_GET, 505 CGROUPSTATS_CMD_ATTR_FD, &cfd, sizeof(__u32)) < 0) { 506 fprintf(stderr, "Failed to send request for container stats\n"); 507 close(cfd); 508 return; 509 } 510 511 /* Receive response */ 512 rc = recv(nl_sd, &resp, sizeof(resp), 0); 513 if (rc < 0 || resp.n.nlmsg_type == NLMSG_ERROR) { 514 fprintf(stderr, "Failed to receive response for container stats\n"); 515 close(cfd); 516 return; 517 } 518 519 /* Parse response */ 520 nl_len = GENLMSG_PAYLOAD(&resp.n); 521 na = (struct nlattr *) GENLMSG_DATA(&resp); 522 while (nl_len > 0) { 523 if (na->nla_type == CGROUPSTATS_TYPE_CGROUP_STATS) { 524 /* Get the cgroupstats structure */ 525 memcpy(&stats, NLA_DATA(na), sizeof(stats)); 526 527 /* Fill container stats */ 528 container_stats.nr_sleeping = stats.nr_sleeping; 529 container_stats.nr_running = stats.nr_running; 530 container_stats.nr_stopped = stats.nr_stopped; 531 container_stats.nr_uninterruptible = stats.nr_uninterruptible; 532 container_stats.nr_io_wait = stats.nr_io_wait; 533 break; 534 } 535 nl_len -= NLA_ALIGN(na->nla_len); 536 na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len)); 537 } 538 539 close(cfd); 540 } 541 542 /* Display results to stdout or log file */ 543 static void display_results(void) 544 { 545 time_t now = time(NULL); 546 struct tm *tm_now = localtime(&now); 547 char timestamp[32]; 548 int i, count; 549 FILE *out = stdout; 550 551 fprintf(out, "\033[H\033[J"); 552 553 if (cfg.container_path) { 554 fprintf(out, "Container Information (%s):\n", cfg.container_path); 555 fprintf(out, "Processes: running=%d, sleeping=%d, ", 556 container_stats.nr_running, container_stats.nr_sleeping); 557 fprintf(out, "stopped=%d, uninterruptible=%d, io_wait=%d\n\n", 558 container_stats.nr_stopped, container_stats.nr_uninterruptible, 559 container_stats.nr_io_wait); 560 } 561 fprintf(out, "Top %d processes (sorted by CPU delay):\n\n", 562 cfg.max_processes); 563 fprintf(out, " PID TGID COMMAND CPU(ms) IO(ms) "); 564 fprintf(out, "SWAP(ms) RCL(ms) THR(ms) CMP(ms) WP(ms) IRQ(ms)\n"); 565 fprintf(out, "-----------------------------------------------"); 566 fprintf(out, "----------------------------------------------\n"); 567 count = task_count < cfg.max_processes ? task_count : cfg.max_processes; 568 569 for (i = 0; i < count; i++) { 570 fprintf(out, "%5d %5d %-15s ", 571 tasks[i].pid, tasks[i].tgid, tasks[i].command); 572 fprintf(out, "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f\n", 573 average_ms(tasks[i].cpu_delay_total, tasks[i].cpu_count), 574 average_ms(tasks[i].blkio_delay_total, tasks[i].blkio_count), 575 average_ms(tasks[i].swapin_delay_total, tasks[i].swapin_count), 576 average_ms(tasks[i].freepages_delay_total, tasks[i].freepages_count), 577 average_ms(tasks[i].thrashing_delay_total, tasks[i].thrashing_count), 578 average_ms(tasks[i].compact_delay_total, tasks[i].compact_count), 579 average_ms(tasks[i].wpcopy_delay_total, tasks[i].wpcopy_count), 580 average_ms(tasks[i].irq_delay_total, tasks[i].irq_count)); 581 } 582 583 fprintf(out, "\n"); 584 } 585 586 /* Main function */ 587 int main(int argc, char **argv) 588 { 589 int iterations = 0; 590 int use_q_quit = 0; 591 592 /* Parse command line arguments */ 593 parse_args(argc, argv); 594 595 /* Setup netlink socket */ 596 nl_sd = create_nl_socket(); 597 if (nl_sd < 0) { 598 fprintf(stderr, "Error creating netlink socket\n"); 599 exit(1); 600 } 601 602 /* Get family ID for taskstats via netlink */ 603 family_id = get_family_id(nl_sd); 604 if (!family_id) { 605 fprintf(stderr, "Error getting taskstats family ID\n"); 606 close(nl_sd); 607 exit(1); 608 } 609 610 if (!cfg.output_one_time) { 611 use_q_quit = 1; 612 enable_raw_mode(); 613 printf("Press 'q' to quit.\n"); 614 fflush(stdout); 615 } 616 617 /* Main loop */ 618 while (running) { 619 /* Get container stats if container path provided */ 620 if (cfg.container_path) 621 get_container_stats(); 622 623 /* Get task delays */ 624 get_task_delays(); 625 626 /* Sort tasks */ 627 sort_tasks(); 628 629 /* Display results to stdout or log file */ 630 display_results(); 631 632 /* Check for iterations */ 633 if (cfg.iterations > 0 && ++iterations >= cfg.iterations) 634 break; 635 636 /* Exit if output_one_time is set */ 637 if (cfg.output_one_time) 638 break; 639 640 /* Check for 'q' key to quit */ 641 if (use_q_quit) { 642 struct timeval tv = {cfg.delay, 0}; 643 fd_set readfds; 644 645 FD_ZERO(&readfds); 646 FD_SET(STDIN_FILENO, &readfds); 647 int r = select(STDIN_FILENO+1, &readfds, NULL, NULL, &tv); 648 649 if (r > 0 && FD_ISSET(STDIN_FILENO, &readfds)) { 650 char ch = 0; 651 652 read(STDIN_FILENO, &ch, 1); 653 if (ch == 'q' || ch == 'Q') { 654 running = 0; 655 break; 656 } 657 } 658 } else { 659 sleep(cfg.delay); 660 } 661 } 662 663 /* Restore terminal mode */ 664 if (use_q_quit) 665 disable_raw_mode(); 666 667 /* Cleanup */ 668 close(nl_sd); 669 if (cfg.container_path) 670 free(cfg.container_path); 671 672 return 0; 673 } 674