1 // SPDX-License-Identifier: GPL-2.0 2 /* getdelays.c 3 * 4 * Utility to get per-pid and per-tgid delay accounting statistics 5 * Also illustrates usage of the taskstats interface 6 * 7 * Copyright (C) Shailabh Nagar, IBM Corp. 2005 8 * Copyright (C) Balbir Singh, IBM Corp. 2006 9 * Copyright (c) Jay Lan, SGI. 2006 10 * 11 * Compile with 12 * gcc -I/usr/src/linux/include getdelays.c -o getdelays 13 */ 14 15 #include <stdio.h> 16 #include <stdlib.h> 17 #include <errno.h> 18 #include <unistd.h> 19 #include <poll.h> 20 #include <string.h> 21 #include <fcntl.h> 22 #include <sys/types.h> 23 #include <sys/stat.h> 24 #include <sys/socket.h> 25 #include <sys/wait.h> 26 #include <signal.h> 27 28 #include <linux/genetlink.h> 29 #include <linux/taskstats.h> 30 #include <linux/cgroupstats.h> 31 32 /* 33 * Generic macros for dealing with netlink sockets. Might be duplicated 34 * elsewhere. It is recommended that commercial grade applications use 35 * libnl or libnetlink and use the interfaces provided by the library 36 */ 37 #define GENLMSG_DATA(glh) ((void *)(NLMSG_DATA(glh) + GENL_HDRLEN)) 38 #define GENLMSG_PAYLOAD(glh) (NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN) 39 #define NLA_DATA(na) ((void *)((char*)(na) + NLA_HDRLEN)) 40 #define NLA_PAYLOAD(len) (len - NLA_HDRLEN) 41 42 #define err(code, fmt, arg...) \ 43 do { \ 44 fprintf(stderr, fmt, ##arg); \ 45 exit(code); \ 46 } while (0) 47 48 int rcvbufsz; 49 char name[100]; 50 int dbg; 51 int print_delays; 52 int print_io_accounting; 53 int print_task_context_switch_counts; 54 55 #define PRINTF(fmt, arg...) { \ 56 if (dbg) { \ 57 printf(fmt, ##arg); \ 58 } \ 59 } 60 61 /* Maximum size of response requested or message sent */ 62 #define MAX_MSG_SIZE 1024 63 /* Maximum number of cpus expected to be specified in a cpumask */ 64 #define MAX_CPUS 32 65 66 struct msgtemplate { 67 struct nlmsghdr n; 68 struct genlmsghdr g; 69 char buf[MAX_MSG_SIZE]; 70 }; 71 72 char cpumask[100+6*MAX_CPUS]; 73 74 static void usage(void) 75 { 76 fprintf(stderr, "getdelays [-dilv] [-w logfile] [-r bufsize] " 77 "[-m cpumask] [-t tgid] [-p pid]\n"); 78 fprintf(stderr, " -d: print delayacct stats\n"); 79 fprintf(stderr, " -i: print IO accounting (works only with -p)\n"); 80 fprintf(stderr, " -l: listen forever\n"); 81 fprintf(stderr, " -v: debug on\n"); 82 fprintf(stderr, " -C: container path\n"); 83 } 84 85 /* 86 * Create a raw netlink socket and bind 87 */ 88 static int create_nl_socket(int protocol) 89 { 90 int fd; 91 struct sockaddr_nl local; 92 93 fd = socket(AF_NETLINK, SOCK_RAW, protocol); 94 if (fd < 0) 95 return -1; 96 97 if (rcvbufsz) 98 if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, 99 &rcvbufsz, sizeof(rcvbufsz)) < 0) { 100 fprintf(stderr, "Unable to set socket rcv buf size to %d\n", 101 rcvbufsz); 102 goto error; 103 } 104 105 memset(&local, 0, sizeof(local)); 106 local.nl_family = AF_NETLINK; 107 108 if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0) 109 goto error; 110 111 return fd; 112 error: 113 close(fd); 114 return -1; 115 } 116 117 118 static int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid, 119 __u8 genl_cmd, __u16 nla_type, 120 void *nla_data, int nla_len) 121 { 122 struct nlattr *na; 123 struct sockaddr_nl nladdr; 124 int r, buflen; 125 char *buf; 126 127 struct msgtemplate msg; 128 129 msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); 130 msg.n.nlmsg_type = nlmsg_type; 131 msg.n.nlmsg_flags = NLM_F_REQUEST; 132 msg.n.nlmsg_seq = 0; 133 msg.n.nlmsg_pid = nlmsg_pid; 134 msg.g.cmd = genl_cmd; 135 msg.g.version = 0x1; 136 na = (struct nlattr *) GENLMSG_DATA(&msg); 137 na->nla_type = nla_type; 138 na->nla_len = nla_len + NLA_HDRLEN; 139 memcpy(NLA_DATA(na), nla_data, nla_len); 140 msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len); 141 142 buf = (char *) &msg; 143 buflen = msg.n.nlmsg_len ; 144 memset(&nladdr, 0, sizeof(nladdr)); 145 nladdr.nl_family = AF_NETLINK; 146 while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr, 147 sizeof(nladdr))) < buflen) { 148 if (r > 0) { 149 buf += r; 150 buflen -= r; 151 } else if (errno != EAGAIN) 152 return -1; 153 } 154 return 0; 155 } 156 157 158 /* 159 * Probe the controller in genetlink to find the family id 160 * for the TASKSTATS family 161 */ 162 static int get_family_id(int sd) 163 { 164 struct { 165 struct nlmsghdr n; 166 struct genlmsghdr g; 167 char buf[256]; 168 } ans; 169 170 int id = 0, rc; 171 struct nlattr *na; 172 int rep_len; 173 174 strcpy(name, TASKSTATS_GENL_NAME); 175 rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY, 176 CTRL_ATTR_FAMILY_NAME, (void *)name, 177 strlen(TASKSTATS_GENL_NAME)+1); 178 if (rc < 0) 179 return 0; /* sendto() failure? */ 180 181 rep_len = recv(sd, &ans, sizeof(ans), 0); 182 if (ans.n.nlmsg_type == NLMSG_ERROR || 183 (rep_len < 0) || !NLMSG_OK((&ans.n), rep_len)) 184 return 0; 185 186 na = (struct nlattr *) GENLMSG_DATA(&ans); 187 na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len)); 188 if (na->nla_type == CTRL_ATTR_FAMILY_ID) { 189 id = *(__u16 *) NLA_DATA(na); 190 } 191 return id; 192 } 193 194 #define average_ms(t, c) (t / 1000000ULL / (c ? c : 1)) 195 #define delay_ms(t) (t / 1000000ULL) 196 197 /* 198 * Version compatibility note: 199 * Field availability depends on taskstats version (t->version), 200 * corresponding to TASKSTATS_VERSION in kernel headers 201 * see include/uapi/linux/taskstats.h 202 * 203 * Version feature mapping: 204 * version >= 11 - supports COMPACT statistics 205 * version >= 13 - supports WPCOPY statistics 206 * version >= 14 - supports IRQ statistics 207 * version >= 16 - supports *_max and *_min delay statistics 208 * 209 * Always verify version before accessing version-dependent fields 210 * to maintain backward compatibility. 211 */ 212 #define PRINT_CPU_DELAY(version, t) \ 213 do { \ 214 if (version >= 16) { \ 215 printf("%-10s%15s%15s%15s%15s%15s%15s%15s\n", \ 216 "CPU", "count", "real total", "virtual total", \ 217 "delay total", "delay average", "delay max", "delay min"); \ 218 printf(" %15llu%15llu%15llu%15llu%15.3fms%13.6fms%13.6fms\n", \ 219 (unsigned long long)(t)->cpu_count, \ 220 (unsigned long long)(t)->cpu_run_real_total, \ 221 (unsigned long long)(t)->cpu_run_virtual_total, \ 222 (unsigned long long)(t)->cpu_delay_total, \ 223 average_ms((double)(t)->cpu_delay_total, (t)->cpu_count), \ 224 delay_ms((double)(t)->cpu_delay_max), \ 225 delay_ms((double)(t)->cpu_delay_min)); \ 226 } else { \ 227 printf("%-10s%15s%15s%15s%15s%15s\n", \ 228 "CPU", "count", "real total", "virtual total", \ 229 "delay total", "delay average"); \ 230 printf(" %15llu%15llu%15llu%15llu%15.3fms\n", \ 231 (unsigned long long)(t)->cpu_count, \ 232 (unsigned long long)(t)->cpu_run_real_total, \ 233 (unsigned long long)(t)->cpu_run_virtual_total, \ 234 (unsigned long long)(t)->cpu_delay_total, \ 235 average_ms((double)(t)->cpu_delay_total, (t)->cpu_count)); \ 236 } \ 237 } while (0) 238 #define PRINT_FILED_DELAY(name, version, t, count, total, max, min) \ 239 do { \ 240 if (version >= 16) { \ 241 printf("%-10s%15s%15s%15s%15s%15s\n", \ 242 name, "count", "delay total", "delay average", \ 243 "delay max", "delay min"); \ 244 printf(" %15llu%15llu%15.3fms%13.6fms%13.6fms\n", \ 245 (unsigned long long)(t)->count, \ 246 (unsigned long long)(t)->total, \ 247 average_ms((double)(t)->total, (t)->count), \ 248 delay_ms((double)(t)->max), \ 249 delay_ms((double)(t)->min)); \ 250 } else { \ 251 printf("%-10s%15s%15s%15s\n", \ 252 name, "count", "delay total", "delay average"); \ 253 printf(" %15llu%15llu%15.3fms\n", \ 254 (unsigned long long)(t)->count, \ 255 (unsigned long long)(t)->total, \ 256 average_ms((double)(t)->total, (t)->count)); \ 257 } \ 258 } while (0) 259 260 static void print_delayacct(struct taskstats *t) 261 { 262 printf("\n\n"); 263 264 PRINT_CPU_DELAY(t->version, t); 265 266 PRINT_FILED_DELAY("IO", t->version, t, 267 blkio_count, blkio_delay_total, 268 blkio_delay_max, blkio_delay_min); 269 270 PRINT_FILED_DELAY("SWAP", t->version, t, 271 swapin_count, swapin_delay_total, 272 swapin_delay_max, swapin_delay_min); 273 274 PRINT_FILED_DELAY("RECLAIM", t->version, t, 275 freepages_count, freepages_delay_total, 276 freepages_delay_max, freepages_delay_min); 277 278 PRINT_FILED_DELAY("THRASHING", t->version, t, 279 thrashing_count, thrashing_delay_total, 280 thrashing_delay_max, thrashing_delay_min); 281 282 if (t->version >= 11) { 283 PRINT_FILED_DELAY("COMPACT", t->version, t, 284 compact_count, compact_delay_total, 285 compact_delay_max, compact_delay_min); 286 } 287 288 if (t->version >= 13) { 289 PRINT_FILED_DELAY("WPCOPY", t->version, t, 290 wpcopy_count, wpcopy_delay_total, 291 wpcopy_delay_max, wpcopy_delay_min); 292 } 293 294 if (t->version >= 14) { 295 PRINT_FILED_DELAY("IRQ", t->version, t, 296 irq_count, irq_delay_total, 297 irq_delay_max, irq_delay_min); 298 } 299 } 300 301 static void task_context_switch_counts(struct taskstats *t) 302 { 303 printf("\n\nTask %15s%15s\n" 304 " %15llu%15llu\n", 305 "voluntary", "nonvoluntary", 306 (unsigned long long)t->nvcsw, (unsigned long long)t->nivcsw); 307 } 308 309 static void print_cgroupstats(struct cgroupstats *c) 310 { 311 printf("sleeping %llu, blocked %llu, running %llu, stopped %llu, " 312 "uninterruptible %llu\n", (unsigned long long)c->nr_sleeping, 313 (unsigned long long)c->nr_io_wait, 314 (unsigned long long)c->nr_running, 315 (unsigned long long)c->nr_stopped, 316 (unsigned long long)c->nr_uninterruptible); 317 } 318 319 320 static void print_ioacct(struct taskstats *t) 321 { 322 printf("%s: read=%llu, write=%llu, cancelled_write=%llu\n", 323 t->ac_comm, 324 (unsigned long long)t->read_bytes, 325 (unsigned long long)t->write_bytes, 326 (unsigned long long)t->cancelled_write_bytes); 327 } 328 329 int main(int argc, char *argv[]) 330 { 331 int c, rc, rep_len, aggr_len, len2; 332 int cmd_type = TASKSTATS_CMD_ATTR_UNSPEC; 333 __u16 id; 334 __u32 mypid; 335 336 struct nlattr *na; 337 int nl_sd = -1; 338 int len = 0; 339 pid_t tid = 0; 340 pid_t rtid = 0; 341 342 int fd = 0; 343 int write_file = 0; 344 int maskset = 0; 345 char *logfile = NULL; 346 int loop = 0; 347 int containerset = 0; 348 char *containerpath = NULL; 349 int cfd = 0; 350 int forking = 0; 351 sigset_t sigset; 352 353 struct msgtemplate msg; 354 355 while (!forking) { 356 c = getopt(argc, argv, "qdiw:r:m:t:p:vlC:c:"); 357 if (c < 0) 358 break; 359 360 switch (c) { 361 case 'd': 362 printf("print delayacct stats ON\n"); 363 print_delays = 1; 364 break; 365 case 'i': 366 printf("printing IO accounting\n"); 367 print_io_accounting = 1; 368 break; 369 case 'q': 370 printf("printing task/process context switch rates\n"); 371 print_task_context_switch_counts = 1; 372 break; 373 case 'C': 374 containerset = 1; 375 containerpath = optarg; 376 break; 377 case 'w': 378 logfile = strdup(optarg); 379 printf("write to file %s\n", logfile); 380 write_file = 1; 381 break; 382 case 'r': 383 rcvbufsz = atoi(optarg); 384 printf("receive buf size %d\n", rcvbufsz); 385 if (rcvbufsz < 0) 386 err(1, "Invalid rcv buf size\n"); 387 break; 388 case 'm': 389 strncpy(cpumask, optarg, sizeof(cpumask)); 390 cpumask[sizeof(cpumask) - 1] = '\0'; 391 maskset = 1; 392 printf("cpumask %s maskset %d\n", cpumask, maskset); 393 break; 394 case 't': 395 tid = atoi(optarg); 396 if (!tid) 397 err(1, "Invalid tgid\n"); 398 cmd_type = TASKSTATS_CMD_ATTR_TGID; 399 break; 400 case 'p': 401 tid = atoi(optarg); 402 if (!tid) 403 err(1, "Invalid pid\n"); 404 cmd_type = TASKSTATS_CMD_ATTR_PID; 405 break; 406 case 'c': 407 408 /* Block SIGCHLD for sigwait() later */ 409 if (sigemptyset(&sigset) == -1) 410 err(1, "Failed to empty sigset"); 411 if (sigaddset(&sigset, SIGCHLD)) 412 err(1, "Failed to set sigchld in sigset"); 413 sigprocmask(SIG_BLOCK, &sigset, NULL); 414 415 /* fork/exec a child */ 416 tid = fork(); 417 if (tid < 0) 418 err(1, "Fork failed\n"); 419 if (tid == 0) 420 if (execvp(argv[optind - 1], 421 &argv[optind - 1]) < 0) 422 exit(-1); 423 424 /* Set the command type and avoid further processing */ 425 cmd_type = TASKSTATS_CMD_ATTR_PID; 426 forking = 1; 427 break; 428 case 'v': 429 printf("debug on\n"); 430 dbg = 1; 431 break; 432 case 'l': 433 printf("listen forever\n"); 434 loop = 1; 435 break; 436 default: 437 usage(); 438 exit(-1); 439 } 440 } 441 442 if (write_file) { 443 fd = open(logfile, O_WRONLY | O_CREAT | O_TRUNC, 444 S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); 445 if (fd == -1) { 446 perror("Cannot open output file\n"); 447 exit(1); 448 } 449 } 450 451 nl_sd = create_nl_socket(NETLINK_GENERIC); 452 if (nl_sd < 0) 453 err(1, "error creating Netlink socket\n"); 454 455 456 mypid = getpid(); 457 id = get_family_id(nl_sd); 458 if (!id) { 459 fprintf(stderr, "Error getting family id, errno %d\n", errno); 460 goto err; 461 } 462 PRINTF("family id %d\n", id); 463 464 if (maskset) { 465 rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET, 466 TASKSTATS_CMD_ATTR_REGISTER_CPUMASK, 467 &cpumask, strlen(cpumask) + 1); 468 PRINTF("Sent register cpumask, retval %d\n", rc); 469 if (rc < 0) { 470 fprintf(stderr, "error sending register cpumask\n"); 471 goto err; 472 } 473 } 474 475 if (tid && containerset) { 476 fprintf(stderr, "Select either -t or -C, not both\n"); 477 goto err; 478 } 479 480 /* 481 * If we forked a child, wait for it to exit. Cannot use waitpid() 482 * as all the delicious data would be reaped as part of the wait 483 */ 484 if (tid && forking) { 485 int sig_received; 486 sigwait(&sigset, &sig_received); 487 } 488 489 if (tid) { 490 rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET, 491 cmd_type, &tid, sizeof(__u32)); 492 PRINTF("Sent pid/tgid, retval %d\n", rc); 493 if (rc < 0) { 494 fprintf(stderr, "error sending tid/tgid cmd\n"); 495 goto done; 496 } 497 } 498 499 if (containerset) { 500 cfd = open(containerpath, O_RDONLY); 501 if (cfd < 0) { 502 perror("error opening container file"); 503 goto err; 504 } 505 rc = send_cmd(nl_sd, id, mypid, CGROUPSTATS_CMD_GET, 506 CGROUPSTATS_CMD_ATTR_FD, &cfd, sizeof(__u32)); 507 if (rc < 0) { 508 perror("error sending cgroupstats command"); 509 goto err; 510 } 511 } 512 if (!maskset && !tid && !containerset) { 513 usage(); 514 goto err; 515 } 516 517 do { 518 rep_len = recv(nl_sd, &msg, sizeof(msg), 0); 519 PRINTF("received %d bytes\n", rep_len); 520 521 if (rep_len < 0) { 522 fprintf(stderr, "nonfatal reply error: errno %d\n", 523 errno); 524 continue; 525 } 526 if (msg.n.nlmsg_type == NLMSG_ERROR || 527 !NLMSG_OK((&msg.n), rep_len)) { 528 struct nlmsgerr *err = NLMSG_DATA(&msg); 529 fprintf(stderr, "fatal reply error, errno %d\n", 530 err->error); 531 goto done; 532 } 533 534 PRINTF("nlmsghdr size=%zu, nlmsg_len=%d, rep_len=%d\n", 535 sizeof(struct nlmsghdr), msg.n.nlmsg_len, rep_len); 536 537 538 rep_len = GENLMSG_PAYLOAD(&msg.n); 539 540 na = (struct nlattr *) GENLMSG_DATA(&msg); 541 len = 0; 542 while (len < rep_len) { 543 len += NLA_ALIGN(na->nla_len); 544 switch (na->nla_type) { 545 case TASKSTATS_TYPE_AGGR_TGID: 546 /* Fall through */ 547 case TASKSTATS_TYPE_AGGR_PID: 548 aggr_len = NLA_PAYLOAD(na->nla_len); 549 len2 = 0; 550 /* For nested attributes, na follows */ 551 na = (struct nlattr *) NLA_DATA(na); 552 while (len2 < aggr_len) { 553 switch (na->nla_type) { 554 case TASKSTATS_TYPE_PID: 555 rtid = *(int *) NLA_DATA(na); 556 if (print_delays) 557 printf("PID\t%d\n", rtid); 558 break; 559 case TASKSTATS_TYPE_TGID: 560 rtid = *(int *) NLA_DATA(na); 561 if (print_delays) 562 printf("TGID\t%d\n", rtid); 563 break; 564 case TASKSTATS_TYPE_STATS: 565 if (print_delays) 566 print_delayacct((struct taskstats *) NLA_DATA(na)); 567 if (print_io_accounting) 568 print_ioacct((struct taskstats *) NLA_DATA(na)); 569 if (print_task_context_switch_counts) 570 task_context_switch_counts((struct taskstats *) NLA_DATA(na)); 571 if (fd) { 572 if (write(fd, NLA_DATA(na), na->nla_len) < 0) { 573 err(1,"write error\n"); 574 } 575 } 576 if (!loop) 577 goto done; 578 break; 579 case TASKSTATS_TYPE_NULL: 580 break; 581 default: 582 fprintf(stderr, "Unknown nested" 583 " nla_type %d\n", 584 na->nla_type); 585 break; 586 } 587 len2 += NLA_ALIGN(na->nla_len); 588 na = (struct nlattr *)((char *)na + 589 NLA_ALIGN(na->nla_len)); 590 } 591 break; 592 593 case CGROUPSTATS_TYPE_CGROUP_STATS: 594 print_cgroupstats(NLA_DATA(na)); 595 break; 596 default: 597 fprintf(stderr, "Unknown nla_type %d\n", 598 na->nla_type); 599 case TASKSTATS_TYPE_NULL: 600 break; 601 } 602 na = (struct nlattr *) (GENLMSG_DATA(&msg) + len); 603 } 604 } while (loop); 605 done: 606 if (maskset) { 607 rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET, 608 TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK, 609 &cpumask, strlen(cpumask) + 1); 610 printf("Sent deregister mask, retval %d\n", rc); 611 if (rc < 0) 612 err(rc, "error sending deregister cpumask\n"); 613 } 614 err: 615 close(nl_sd); 616 if (fd) 617 close(fd); 618 if (cfd) 619 close(cfd); 620 return 0; 621 } 622