1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * taskstats.c - Export per-task statistics to userland 4 * 5 * Copyright (C) Shailabh Nagar, IBM Corp. 2006 6 * (C) Balbir Singh, IBM Corp. 2006 7 */ 8 9 #include <linux/kernel.h> 10 #include <linux/taskstats_kern.h> 11 #include <linux/tsacct_kern.h> 12 #include <linux/acct.h> 13 #include <linux/delayacct.h> 14 #include <linux/cpumask.h> 15 #include <linux/percpu.h> 16 #include <linux/slab.h> 17 #include <linux/cgroupstats.h> 18 #include <linux/cgroup.h> 19 #include <linux/fs.h> 20 #include <linux/file.h> 21 #include <linux/pid_namespace.h> 22 #include <net/genetlink.h> 23 #include <linux/atomic.h> 24 #include <linux/sched/cputime.h> 25 26 /* 27 * Maximum length of a cpumask that can be specified in 28 * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute 29 */ 30 #define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) 31 32 static DEFINE_PER_CPU(__u32, taskstats_seqnum); 33 static int family_registered; 34 struct kmem_cache *taskstats_cache; 35 36 static struct genl_family family; 37 38 static const struct nla_policy taskstats_cmd_get_policy[] = { 39 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, 40 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, 41 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 42 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 43 44 static const struct nla_policy cgroupstats_cmd_get_policy[] = { 45 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, 46 }; 47 48 struct listener { 49 struct list_head list; 50 pid_t pid; 51 char valid; 52 }; 53 54 struct listener_list { 55 struct rw_semaphore sem; 56 struct list_head list; 57 }; 58 static DEFINE_PER_CPU(struct listener_list, listener_array); 59 60 enum actions { 61 REGISTER, 62 DEREGISTER, 63 CPU_DONT_CARE 64 }; 65 66 static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, 67 size_t size) 68 { 69 struct sk_buff *skb; 70 void *reply; 71 72 /* 73 * If new attributes are added, please revisit this allocation 74 */ 75 skb = genlmsg_new(size, GFP_KERNEL); 76 if (!skb) 77 return -ENOMEM; 78 79 if (!info) { 80 int seq = this_cpu_inc_return(taskstats_seqnum) - 1; 81 82 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); 83 } else 84 reply = genlmsg_put_reply(skb, info, &family, 0, cmd); 85 if (reply == NULL) { 86 nlmsg_free(skb); 87 return -EINVAL; 88 } 89 90 *skbp = skb; 91 return 0; 92 } 93 94 /* 95 * Send taskstats data in @skb to listener with nl_pid @pid 96 */ 97 static int send_reply(struct sk_buff *skb, struct genl_info *info) 98 { 99 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 100 void *reply = genlmsg_data(genlhdr); 101 102 genlmsg_end(skb, reply); 103 104 return genlmsg_reply(skb, info); 105 } 106 107 /* 108 * Send taskstats data in @skb to listeners registered for @cpu's exit data 109 */ 110 static void send_cpu_listeners(struct sk_buff *skb, 111 struct listener_list *listeners) 112 { 113 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 114 struct listener *s, *tmp; 115 struct sk_buff *skb_next, *skb_cur = skb; 116 void *reply = genlmsg_data(genlhdr); 117 int delcount = 0; 118 119 genlmsg_end(skb, reply); 120 121 down_read(&listeners->sem); 122 list_for_each_entry(s, &listeners->list, list) { 123 int rc; 124 125 skb_next = NULL; 126 if (!list_is_last(&s->list, &listeners->list)) { 127 skb_next = skb_clone(skb_cur, GFP_KERNEL); 128 if (!skb_next) 129 break; 130 } 131 rc = genlmsg_unicast(&init_net, skb_cur, s->pid); 132 if (rc == -ECONNREFUSED) { 133 s->valid = 0; 134 delcount++; 135 } 136 skb_cur = skb_next; 137 } 138 up_read(&listeners->sem); 139 140 if (skb_cur) 141 nlmsg_free(skb_cur); 142 143 if (!delcount) 144 return; 145 146 /* Delete invalidated entries */ 147 down_write(&listeners->sem); 148 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 149 if (!s->valid) { 150 list_del(&s->list); 151 kfree(s); 152 } 153 } 154 up_write(&listeners->sem); 155 } 156 157 static void exe_add_tsk(struct taskstats *stats, struct task_struct *tsk) 158 { 159 /* No idea if I'm allowed to access that here, now. */ 160 struct file *exe_file = get_task_exe_file(tsk); 161 162 if (exe_file) { 163 /* Following cp_new_stat64() in stat.c . */ 164 stats->ac_exe_dev = 165 huge_encode_dev(exe_file->f_inode->i_sb->s_dev); 166 stats->ac_exe_inode = exe_file->f_inode->i_ino; 167 fput(exe_file); 168 } else { 169 stats->ac_exe_dev = 0; 170 stats->ac_exe_inode = 0; 171 } 172 } 173 174 static void fill_stats(struct user_namespace *user_ns, 175 struct pid_namespace *pid_ns, 176 struct task_struct *tsk, struct taskstats *stats) 177 { 178 memset(stats, 0, sizeof(*stats)); 179 /* 180 * Each accounting subsystem adds calls to its functions to 181 * fill in relevant parts of struct taskstsats as follows 182 * 183 * per-task-foo(stats, tsk); 184 */ 185 186 delayacct_add_tsk(stats, tsk); 187 188 /* fill in basic acct fields */ 189 stats->version = TASKSTATS_VERSION; 190 stats->nvcsw = tsk->nvcsw; 191 stats->nivcsw = tsk->nivcsw; 192 bacct_add_tsk(user_ns, pid_ns, stats, tsk); 193 194 /* fill in extended acct fields */ 195 xacct_add_tsk(stats, tsk); 196 197 /* add executable info */ 198 exe_add_tsk(stats, tsk); 199 } 200 201 static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) 202 { 203 struct task_struct *tsk; 204 205 tsk = find_get_task_by_vpid(pid); 206 if (!tsk) 207 return -ESRCH; 208 fill_stats(current_user_ns(), task_active_pid_ns(current), tsk, stats); 209 put_task_struct(tsk); 210 return 0; 211 } 212 213 static void tgid_stats_add_task(struct taskstats *stats, 214 struct task_struct *tsk, u64 now_ns) 215 { 216 u64 delta, utime, stime; 217 218 /* 219 * Each accounting subsystem calls its functions here to 220 * accumulate its per-task stats for tsk, into the per-tgid structure 221 * 222 * per-task-foo(stats, tsk); 223 */ 224 delayacct_add_tsk(stats, tsk); 225 226 /* calculate task elapsed time in nsec */ 227 delta = now_ns - tsk->start_time; 228 /* Convert to micro seconds */ 229 do_div(delta, NSEC_PER_USEC); 230 stats->ac_etime += delta; 231 232 task_cputime(tsk, &utime, &stime); 233 stats->ac_utime += div_u64(utime, NSEC_PER_USEC); 234 stats->ac_stime += div_u64(stime, NSEC_PER_USEC); 235 236 stats->nvcsw += tsk->nvcsw; 237 stats->nivcsw += tsk->nivcsw; 238 } 239 240 static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) 241 { 242 struct task_struct *tsk, *first; 243 unsigned long flags; 244 int rc = -ESRCH; 245 u64 now_ns; 246 247 /* 248 * Add additional stats from live tasks except zombie thread group 249 * leaders who are already counted with the dead tasks 250 */ 251 rcu_read_lock(); 252 first = find_task_by_vpid(tgid); 253 254 if (!first || !lock_task_sighand(first, &flags)) 255 goto out; 256 257 if (first->signal->stats) 258 memcpy(stats, first->signal->stats, sizeof(*stats)); 259 else 260 memset(stats, 0, sizeof(*stats)); 261 262 now_ns = ktime_get_ns(); 263 for_each_thread(first, tsk) { 264 if (tsk->exit_state) 265 continue; 266 267 tgid_stats_add_task(stats, tsk, now_ns); 268 } 269 270 unlock_task_sighand(first, &flags); 271 rc = 0; 272 out: 273 rcu_read_unlock(); 274 275 stats->version = TASKSTATS_VERSION; 276 /* 277 * Accounting subsystems can also add calls here to modify 278 * fields of taskstats. 279 */ 280 return rc; 281 } 282 283 static void fill_tgid_exit(struct task_struct *tsk) 284 { 285 unsigned long flags; 286 u64 now_ns; 287 288 spin_lock_irqsave(&tsk->sighand->siglock, flags); 289 if (!tsk->signal->stats) 290 goto ret; 291 292 now_ns = ktime_get_ns(); 293 tgid_stats_add_task(tsk->signal->stats, tsk, now_ns); 294 ret: 295 spin_unlock_irqrestore(&tsk->sighand->siglock, flags); 296 return; 297 } 298 299 static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) 300 { 301 struct listener_list *listeners; 302 struct listener *s, *tmp, *s2; 303 unsigned int cpu; 304 int ret = 0; 305 306 if (!cpumask_subset(mask, cpu_possible_mask)) 307 return -EINVAL; 308 309 if (current_user_ns() != &init_user_ns) 310 return -EINVAL; 311 312 if (task_active_pid_ns(current) != &init_pid_ns) 313 return -EINVAL; 314 315 if (isadd == REGISTER) { 316 for_each_cpu(cpu, mask) { 317 s = kmalloc_node(sizeof(struct listener), 318 GFP_KERNEL, cpu_to_node(cpu)); 319 if (!s) { 320 ret = -ENOMEM; 321 goto cleanup; 322 } 323 s->pid = pid; 324 s->valid = 1; 325 326 listeners = &per_cpu(listener_array, cpu); 327 down_write(&listeners->sem); 328 list_for_each_entry(s2, &listeners->list, list) { 329 if (s2->pid == pid && s2->valid) 330 goto exists; 331 } 332 list_add(&s->list, &listeners->list); 333 s = NULL; 334 exists: 335 up_write(&listeners->sem); 336 kfree(s); /* nop if NULL */ 337 } 338 return 0; 339 } 340 341 /* Deregister or cleanup */ 342 cleanup: 343 for_each_cpu(cpu, mask) { 344 listeners = &per_cpu(listener_array, cpu); 345 down_write(&listeners->sem); 346 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 347 if (s->pid == pid) { 348 list_del(&s->list); 349 kfree(s); 350 break; 351 } 352 } 353 up_write(&listeners->sem); 354 } 355 return ret; 356 } 357 358 static int parse(struct nlattr *na, struct cpumask *mask) 359 { 360 char *data; 361 int len; 362 int ret; 363 364 if (na == NULL) 365 return 1; 366 len = nla_len(na); 367 if (len > TASKSTATS_CPUMASK_MAXLEN) 368 return -E2BIG; 369 if (len < 1) 370 return -EINVAL; 371 data = kmalloc(len, GFP_KERNEL); 372 if (!data) 373 return -ENOMEM; 374 nla_strscpy(data, na, len); 375 ret = cpulist_parse(data, mask); 376 kfree(data); 377 return ret; 378 } 379 380 static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) 381 { 382 struct nlattr *na, *ret; 383 int aggr; 384 385 aggr = (type == TASKSTATS_TYPE_PID) 386 ? TASKSTATS_TYPE_AGGR_PID 387 : TASKSTATS_TYPE_AGGR_TGID; 388 389 na = nla_nest_start_noflag(skb, aggr); 390 if (!na) 391 goto err; 392 393 if (nla_put(skb, type, sizeof(pid), &pid) < 0) { 394 nla_nest_cancel(skb, na); 395 goto err; 396 } 397 ret = nla_reserve_64bit(skb, TASKSTATS_TYPE_STATS, 398 sizeof(struct taskstats), TASKSTATS_TYPE_NULL); 399 if (!ret) { 400 nla_nest_cancel(skb, na); 401 goto err; 402 } 403 nla_nest_end(skb, na); 404 405 return nla_data(ret); 406 err: 407 return NULL; 408 } 409 410 static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 411 { 412 int rc = 0; 413 struct sk_buff *rep_skb; 414 struct cgroupstats *stats; 415 struct nlattr *na; 416 size_t size; 417 u32 fd; 418 419 na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; 420 if (!na) 421 return -EINVAL; 422 423 fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); 424 CLASS(fd, f)(fd); 425 if (fd_empty(f)) 426 return 0; 427 428 size = nla_total_size(sizeof(struct cgroupstats)); 429 430 rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb, 431 size); 432 if (rc < 0) 433 return rc; 434 435 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, 436 sizeof(struct cgroupstats)); 437 if (na == NULL) { 438 nlmsg_free(rep_skb); 439 return -EMSGSIZE; 440 } 441 442 stats = nla_data(na); 443 memset(stats, 0, sizeof(*stats)); 444 445 rc = cgroupstats_build(stats, fd_file(f)->f_path.dentry); 446 if (rc < 0) { 447 nlmsg_free(rep_skb); 448 return rc; 449 } 450 451 return send_reply(rep_skb, info); 452 } 453 454 static int cmd_attr_register_cpumask(struct genl_info *info) 455 { 456 cpumask_var_t mask; 457 int rc; 458 459 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 460 return -ENOMEM; 461 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); 462 if (rc < 0) 463 goto out; 464 rc = add_del_listener(info->snd_portid, mask, REGISTER); 465 out: 466 free_cpumask_var(mask); 467 return rc; 468 } 469 470 static int cmd_attr_deregister_cpumask(struct genl_info *info) 471 { 472 cpumask_var_t mask; 473 int rc; 474 475 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 476 return -ENOMEM; 477 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); 478 if (rc < 0) 479 goto out; 480 rc = add_del_listener(info->snd_portid, mask, DEREGISTER); 481 out: 482 free_cpumask_var(mask); 483 return rc; 484 } 485 486 static size_t taskstats_packet_size(void) 487 { 488 size_t size; 489 490 size = nla_total_size(sizeof(u32)) + 491 nla_total_size_64bit(sizeof(struct taskstats)) + 492 nla_total_size(0); 493 494 return size; 495 } 496 497 static int cmd_attr_pid(struct genl_info *info) 498 { 499 struct taskstats *stats; 500 struct sk_buff *rep_skb; 501 size_t size; 502 u32 pid; 503 int rc; 504 505 size = taskstats_packet_size(); 506 507 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 508 if (rc < 0) 509 return rc; 510 511 rc = -EINVAL; 512 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); 513 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); 514 if (!stats) 515 goto err; 516 517 rc = fill_stats_for_pid(pid, stats); 518 if (rc < 0) 519 goto err; 520 return send_reply(rep_skb, info); 521 err: 522 nlmsg_free(rep_skb); 523 return rc; 524 } 525 526 static int cmd_attr_tgid(struct genl_info *info) 527 { 528 struct taskstats *stats; 529 struct sk_buff *rep_skb; 530 size_t size; 531 u32 tgid; 532 int rc; 533 534 size = taskstats_packet_size(); 535 536 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 537 if (rc < 0) 538 return rc; 539 540 rc = -EINVAL; 541 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); 542 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); 543 if (!stats) 544 goto err; 545 546 rc = fill_stats_for_tgid(tgid, stats); 547 if (rc < 0) 548 goto err; 549 return send_reply(rep_skb, info); 550 err: 551 nlmsg_free(rep_skb); 552 return rc; 553 } 554 555 static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 556 { 557 if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK]) 558 return cmd_attr_register_cpumask(info); 559 else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK]) 560 return cmd_attr_deregister_cpumask(info); 561 else if (info->attrs[TASKSTATS_CMD_ATTR_PID]) 562 return cmd_attr_pid(info); 563 else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) 564 return cmd_attr_tgid(info); 565 else 566 return -EINVAL; 567 } 568 569 static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) 570 { 571 struct signal_struct *sig = tsk->signal; 572 struct taskstats *stats_new, *stats; 573 574 /* Pairs with smp_store_release() below. */ 575 stats = smp_load_acquire(&sig->stats); 576 if (stats || thread_group_empty(tsk)) 577 return stats; 578 579 /* No problem if kmem_cache_zalloc() fails */ 580 stats_new = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); 581 582 spin_lock_irq(&tsk->sighand->siglock); 583 stats = sig->stats; 584 if (!stats) { 585 /* 586 * Pairs with smp_store_release() above and order the 587 * kmem_cache_zalloc(). 588 */ 589 smp_store_release(&sig->stats, stats_new); 590 stats = stats_new; 591 stats_new = NULL; 592 } 593 spin_unlock_irq(&tsk->sighand->siglock); 594 595 if (stats_new) 596 kmem_cache_free(taskstats_cache, stats_new); 597 598 return stats; 599 } 600 601 /* Send pid data out on exit */ 602 void taskstats_exit(struct task_struct *tsk, int group_dead) 603 { 604 int rc; 605 struct listener_list *listeners; 606 struct taskstats *stats; 607 struct sk_buff *rep_skb; 608 size_t size; 609 int is_thread_group; 610 611 if (!family_registered) 612 return; 613 614 /* 615 * Size includes space for nested attributes 616 */ 617 size = taskstats_packet_size(); 618 619 is_thread_group = !!taskstats_tgid_alloc(tsk); 620 if (is_thread_group) { 621 /* PID + STATS + TGID + STATS */ 622 size = 2 * size; 623 /* fill the tsk->signal->stats structure */ 624 fill_tgid_exit(tsk); 625 } 626 627 listeners = raw_cpu_ptr(&listener_array); 628 if (list_empty(&listeners->list)) 629 return; 630 631 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size); 632 if (rc < 0) 633 return; 634 635 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, 636 task_pid_nr_ns(tsk, &init_pid_ns)); 637 if (!stats) 638 goto err; 639 640 fill_stats(&init_user_ns, &init_pid_ns, tsk, stats); 641 if (group_dead) 642 stats->ac_flag |= AGROUP; 643 644 /* 645 * Doesn't matter if tsk is the leader or the last group member leaving 646 */ 647 if (!is_thread_group || !group_dead) 648 goto send; 649 650 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, 651 task_tgid_nr_ns(tsk, &init_pid_ns)); 652 if (!stats) 653 goto err; 654 655 memcpy(stats, tsk->signal->stats, sizeof(*stats)); 656 stats->version = TASKSTATS_VERSION; 657 658 send: 659 send_cpu_listeners(rep_skb, listeners); 660 return; 661 err: 662 nlmsg_free(rep_skb); 663 } 664 665 static const struct genl_ops taskstats_ops[] = { 666 { 667 .cmd = TASKSTATS_CMD_GET, 668 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 669 .doit = taskstats_user_cmd, 670 .policy = taskstats_cmd_get_policy, 671 .maxattr = ARRAY_SIZE(taskstats_cmd_get_policy) - 1, 672 .flags = GENL_ADMIN_PERM, 673 }, 674 { 675 .cmd = CGROUPSTATS_CMD_GET, 676 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 677 .doit = cgroupstats_user_cmd, 678 .policy = cgroupstats_cmd_get_policy, 679 .maxattr = ARRAY_SIZE(cgroupstats_cmd_get_policy) - 1, 680 }, 681 }; 682 683 static struct genl_family family __ro_after_init = { 684 .name = TASKSTATS_GENL_NAME, 685 .version = TASKSTATS_GENL_VERSION, 686 .module = THIS_MODULE, 687 .ops = taskstats_ops, 688 .n_ops = ARRAY_SIZE(taskstats_ops), 689 .resv_start_op = CGROUPSTATS_CMD_GET + 1, 690 .netnsok = true, 691 }; 692 693 /* Needed early in initialization */ 694 void __init taskstats_init_early(void) 695 { 696 unsigned int i; 697 698 taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC); 699 for_each_possible_cpu(i) { 700 INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); 701 init_rwsem(&(per_cpu(listener_array, i).sem)); 702 } 703 } 704 705 static int __init taskstats_init(void) 706 { 707 int rc; 708 709 rc = genl_register_family(&family); 710 if (rc) 711 return rc; 712 713 family_registered = 1; 714 pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); 715 return 0; 716 } 717 718 /* 719 * late initcall ensures initialization of statistics collection 720 * mechanisms precedes initialization of the taskstats interface 721 */ 722 late_initcall(taskstats_init); 723