1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * taskstats.c - Export per-task statistics to userland 4 * 5 * Copyright (C) Shailabh Nagar, IBM Corp. 2006 6 * (C) Balbir Singh, IBM Corp. 2006 7 */ 8 9 #include <linux/kernel.h> 10 #include <linux/taskstats_kern.h> 11 #include <linux/tsacct_kern.h> 12 #include <linux/delayacct.h> 13 #include <linux/cpumask.h> 14 #include <linux/percpu.h> 15 #include <linux/slab.h> 16 #include <linux/cgroupstats.h> 17 #include <linux/cgroup.h> 18 #include <linux/fs.h> 19 #include <linux/file.h> 20 #include <linux/pid_namespace.h> 21 #include <net/genetlink.h> 22 #include <linux/atomic.h> 23 #include <linux/sched/cputime.h> 24 25 /* 26 * Maximum length of a cpumask that can be specified in 27 * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute 28 */ 29 #define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) 30 31 static DEFINE_PER_CPU(__u32, taskstats_seqnum); 32 static int family_registered; 33 struct kmem_cache *taskstats_cache; 34 35 static struct genl_family family; 36 37 static const struct nla_policy taskstats_cmd_get_policy[] = { 38 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, 39 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, 40 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 41 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 42 43 static const struct nla_policy cgroupstats_cmd_get_policy[] = { 44 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, 45 }; 46 47 struct listener { 48 struct list_head list; 49 pid_t pid; 50 char valid; 51 }; 52 53 struct listener_list { 54 struct rw_semaphore sem; 55 struct list_head list; 56 }; 57 static DEFINE_PER_CPU(struct listener_list, listener_array); 58 59 enum actions { 60 REGISTER, 61 DEREGISTER, 62 CPU_DONT_CARE 63 }; 64 65 static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, 66 size_t size) 67 { 68 struct sk_buff *skb; 69 void *reply; 70 71 /* 72 * If new attributes are added, please revisit this allocation 73 */ 74 skb = genlmsg_new(size, GFP_KERNEL); 75 if (!skb) 76 return -ENOMEM; 77 78 if (!info) { 79 int seq = this_cpu_inc_return(taskstats_seqnum) - 1; 80 81 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); 82 } else 83 reply = genlmsg_put_reply(skb, info, &family, 0, cmd); 84 if (reply == NULL) { 85 nlmsg_free(skb); 86 return -EINVAL; 87 } 88 89 *skbp = skb; 90 return 0; 91 } 92 93 /* 94 * Send taskstats data in @skb to listener with nl_pid @pid 95 */ 96 static int send_reply(struct sk_buff *skb, struct genl_info *info) 97 { 98 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 99 void *reply = genlmsg_data(genlhdr); 100 101 genlmsg_end(skb, reply); 102 103 return genlmsg_reply(skb, info); 104 } 105 106 /* 107 * Send taskstats data in @skb to listeners registered for @cpu's exit data 108 */ 109 static void send_cpu_listeners(struct sk_buff *skb, 110 struct listener_list *listeners) 111 { 112 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 113 struct listener *s, *tmp; 114 struct sk_buff *skb_next, *skb_cur = skb; 115 void *reply = genlmsg_data(genlhdr); 116 int rc, delcount = 0; 117 118 genlmsg_end(skb, reply); 119 120 rc = 0; 121 down_read(&listeners->sem); 122 list_for_each_entry(s, &listeners->list, list) { 123 skb_next = NULL; 124 if (!list_is_last(&s->list, &listeners->list)) { 125 skb_next = skb_clone(skb_cur, GFP_KERNEL); 126 if (!skb_next) 127 break; 128 } 129 rc = genlmsg_unicast(&init_net, skb_cur, s->pid); 130 if (rc == -ECONNREFUSED) { 131 s->valid = 0; 132 delcount++; 133 } 134 skb_cur = skb_next; 135 } 136 up_read(&listeners->sem); 137 138 if (skb_cur) 139 nlmsg_free(skb_cur); 140 141 if (!delcount) 142 return; 143 144 /* Delete invalidated entries */ 145 down_write(&listeners->sem); 146 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 147 if (!s->valid) { 148 list_del(&s->list); 149 kfree(s); 150 } 151 } 152 up_write(&listeners->sem); 153 } 154 155 static void fill_stats(struct user_namespace *user_ns, 156 struct pid_namespace *pid_ns, 157 struct task_struct *tsk, struct taskstats *stats) 158 { 159 memset(stats, 0, sizeof(*stats)); 160 /* 161 * Each accounting subsystem adds calls to its functions to 162 * fill in relevant parts of struct taskstsats as follows 163 * 164 * per-task-foo(stats, tsk); 165 */ 166 167 delayacct_add_tsk(stats, tsk); 168 169 /* fill in basic acct fields */ 170 stats->version = TASKSTATS_VERSION; 171 stats->nvcsw = tsk->nvcsw; 172 stats->nivcsw = tsk->nivcsw; 173 bacct_add_tsk(user_ns, pid_ns, stats, tsk); 174 175 /* fill in extended acct fields */ 176 xacct_add_tsk(stats, tsk); 177 } 178 179 static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) 180 { 181 struct task_struct *tsk; 182 183 tsk = find_get_task_by_vpid(pid); 184 if (!tsk) 185 return -ESRCH; 186 fill_stats(current_user_ns(), task_active_pid_ns(current), tsk, stats); 187 put_task_struct(tsk); 188 return 0; 189 } 190 191 static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) 192 { 193 struct task_struct *tsk, *first; 194 unsigned long flags; 195 int rc = -ESRCH; 196 u64 delta, utime, stime; 197 u64 start_time; 198 199 /* 200 * Add additional stats from live tasks except zombie thread group 201 * leaders who are already counted with the dead tasks 202 */ 203 rcu_read_lock(); 204 first = find_task_by_vpid(tgid); 205 206 if (!first || !lock_task_sighand(first, &flags)) 207 goto out; 208 209 if (first->signal->stats) 210 memcpy(stats, first->signal->stats, sizeof(*stats)); 211 else 212 memset(stats, 0, sizeof(*stats)); 213 214 tsk = first; 215 start_time = ktime_get_ns(); 216 do { 217 if (tsk->exit_state) 218 continue; 219 /* 220 * Accounting subsystem can call its functions here to 221 * fill in relevant parts of struct taskstsats as follows 222 * 223 * per-task-foo(stats, tsk); 224 */ 225 delayacct_add_tsk(stats, tsk); 226 227 /* calculate task elapsed time in nsec */ 228 delta = start_time - tsk->start_time; 229 /* Convert to micro seconds */ 230 do_div(delta, NSEC_PER_USEC); 231 stats->ac_etime += delta; 232 233 task_cputime(tsk, &utime, &stime); 234 stats->ac_utime += div_u64(utime, NSEC_PER_USEC); 235 stats->ac_stime += div_u64(stime, NSEC_PER_USEC); 236 237 stats->nvcsw += tsk->nvcsw; 238 stats->nivcsw += tsk->nivcsw; 239 } while_each_thread(first, tsk); 240 241 unlock_task_sighand(first, &flags); 242 rc = 0; 243 out: 244 rcu_read_unlock(); 245 246 stats->version = TASKSTATS_VERSION; 247 /* 248 * Accounting subsystems can also add calls here to modify 249 * fields of taskstats. 250 */ 251 return rc; 252 } 253 254 static void fill_tgid_exit(struct task_struct *tsk) 255 { 256 unsigned long flags; 257 258 spin_lock_irqsave(&tsk->sighand->siglock, flags); 259 if (!tsk->signal->stats) 260 goto ret; 261 262 /* 263 * Each accounting subsystem calls its functions here to 264 * accumalate its per-task stats for tsk, into the per-tgid structure 265 * 266 * per-task-foo(tsk->signal->stats, tsk); 267 */ 268 delayacct_add_tsk(tsk->signal->stats, tsk); 269 ret: 270 spin_unlock_irqrestore(&tsk->sighand->siglock, flags); 271 return; 272 } 273 274 static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) 275 { 276 struct listener_list *listeners; 277 struct listener *s, *tmp, *s2; 278 unsigned int cpu; 279 int ret = 0; 280 281 if (!cpumask_subset(mask, cpu_possible_mask)) 282 return -EINVAL; 283 284 if (current_user_ns() != &init_user_ns) 285 return -EINVAL; 286 287 if (task_active_pid_ns(current) != &init_pid_ns) 288 return -EINVAL; 289 290 if (isadd == REGISTER) { 291 for_each_cpu(cpu, mask) { 292 s = kmalloc_node(sizeof(struct listener), 293 GFP_KERNEL, cpu_to_node(cpu)); 294 if (!s) { 295 ret = -ENOMEM; 296 goto cleanup; 297 } 298 s->pid = pid; 299 s->valid = 1; 300 301 listeners = &per_cpu(listener_array, cpu); 302 down_write(&listeners->sem); 303 list_for_each_entry(s2, &listeners->list, list) { 304 if (s2->pid == pid && s2->valid) 305 goto exists; 306 } 307 list_add(&s->list, &listeners->list); 308 s = NULL; 309 exists: 310 up_write(&listeners->sem); 311 kfree(s); /* nop if NULL */ 312 } 313 return 0; 314 } 315 316 /* Deregister or cleanup */ 317 cleanup: 318 for_each_cpu(cpu, mask) { 319 listeners = &per_cpu(listener_array, cpu); 320 down_write(&listeners->sem); 321 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 322 if (s->pid == pid) { 323 list_del(&s->list); 324 kfree(s); 325 break; 326 } 327 } 328 up_write(&listeners->sem); 329 } 330 return ret; 331 } 332 333 static int parse(struct nlattr *na, struct cpumask *mask) 334 { 335 char *data; 336 int len; 337 int ret; 338 339 if (na == NULL) 340 return 1; 341 len = nla_len(na); 342 if (len > TASKSTATS_CPUMASK_MAXLEN) 343 return -E2BIG; 344 if (len < 1) 345 return -EINVAL; 346 data = kmalloc(len, GFP_KERNEL); 347 if (!data) 348 return -ENOMEM; 349 nla_strscpy(data, na, len); 350 ret = cpulist_parse(data, mask); 351 kfree(data); 352 return ret; 353 } 354 355 static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) 356 { 357 struct nlattr *na, *ret; 358 int aggr; 359 360 aggr = (type == TASKSTATS_TYPE_PID) 361 ? TASKSTATS_TYPE_AGGR_PID 362 : TASKSTATS_TYPE_AGGR_TGID; 363 364 na = nla_nest_start_noflag(skb, aggr); 365 if (!na) 366 goto err; 367 368 if (nla_put(skb, type, sizeof(pid), &pid) < 0) { 369 nla_nest_cancel(skb, na); 370 goto err; 371 } 372 ret = nla_reserve_64bit(skb, TASKSTATS_TYPE_STATS, 373 sizeof(struct taskstats), TASKSTATS_TYPE_NULL); 374 if (!ret) { 375 nla_nest_cancel(skb, na); 376 goto err; 377 } 378 nla_nest_end(skb, na); 379 380 return nla_data(ret); 381 err: 382 return NULL; 383 } 384 385 static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 386 { 387 int rc = 0; 388 struct sk_buff *rep_skb; 389 struct cgroupstats *stats; 390 struct nlattr *na; 391 size_t size; 392 u32 fd; 393 struct fd f; 394 395 na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; 396 if (!na) 397 return -EINVAL; 398 399 fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); 400 f = fdget(fd); 401 if (!f.file) 402 return 0; 403 404 size = nla_total_size(sizeof(struct cgroupstats)); 405 406 rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb, 407 size); 408 if (rc < 0) 409 goto err; 410 411 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, 412 sizeof(struct cgroupstats)); 413 if (na == NULL) { 414 nlmsg_free(rep_skb); 415 rc = -EMSGSIZE; 416 goto err; 417 } 418 419 stats = nla_data(na); 420 memset(stats, 0, sizeof(*stats)); 421 422 rc = cgroupstats_build(stats, f.file->f_path.dentry); 423 if (rc < 0) { 424 nlmsg_free(rep_skb); 425 goto err; 426 } 427 428 rc = send_reply(rep_skb, info); 429 430 err: 431 fdput(f); 432 return rc; 433 } 434 435 static int cmd_attr_register_cpumask(struct genl_info *info) 436 { 437 cpumask_var_t mask; 438 int rc; 439 440 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 441 return -ENOMEM; 442 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); 443 if (rc < 0) 444 goto out; 445 rc = add_del_listener(info->snd_portid, mask, REGISTER); 446 out: 447 free_cpumask_var(mask); 448 return rc; 449 } 450 451 static int cmd_attr_deregister_cpumask(struct genl_info *info) 452 { 453 cpumask_var_t mask; 454 int rc; 455 456 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 457 return -ENOMEM; 458 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); 459 if (rc < 0) 460 goto out; 461 rc = add_del_listener(info->snd_portid, mask, DEREGISTER); 462 out: 463 free_cpumask_var(mask); 464 return rc; 465 } 466 467 static size_t taskstats_packet_size(void) 468 { 469 size_t size; 470 471 size = nla_total_size(sizeof(u32)) + 472 nla_total_size_64bit(sizeof(struct taskstats)) + 473 nla_total_size(0); 474 475 return size; 476 } 477 478 static int cmd_attr_pid(struct genl_info *info) 479 { 480 struct taskstats *stats; 481 struct sk_buff *rep_skb; 482 size_t size; 483 u32 pid; 484 int rc; 485 486 size = taskstats_packet_size(); 487 488 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 489 if (rc < 0) 490 return rc; 491 492 rc = -EINVAL; 493 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); 494 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); 495 if (!stats) 496 goto err; 497 498 rc = fill_stats_for_pid(pid, stats); 499 if (rc < 0) 500 goto err; 501 return send_reply(rep_skb, info); 502 err: 503 nlmsg_free(rep_skb); 504 return rc; 505 } 506 507 static int cmd_attr_tgid(struct genl_info *info) 508 { 509 struct taskstats *stats; 510 struct sk_buff *rep_skb; 511 size_t size; 512 u32 tgid; 513 int rc; 514 515 size = taskstats_packet_size(); 516 517 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 518 if (rc < 0) 519 return rc; 520 521 rc = -EINVAL; 522 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); 523 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); 524 if (!stats) 525 goto err; 526 527 rc = fill_stats_for_tgid(tgid, stats); 528 if (rc < 0) 529 goto err; 530 return send_reply(rep_skb, info); 531 err: 532 nlmsg_free(rep_skb); 533 return rc; 534 } 535 536 static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 537 { 538 if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK]) 539 return cmd_attr_register_cpumask(info); 540 else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK]) 541 return cmd_attr_deregister_cpumask(info); 542 else if (info->attrs[TASKSTATS_CMD_ATTR_PID]) 543 return cmd_attr_pid(info); 544 else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) 545 return cmd_attr_tgid(info); 546 else 547 return -EINVAL; 548 } 549 550 static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) 551 { 552 struct signal_struct *sig = tsk->signal; 553 struct taskstats *stats_new, *stats; 554 555 /* Pairs with smp_store_release() below. */ 556 stats = smp_load_acquire(&sig->stats); 557 if (stats || thread_group_empty(tsk)) 558 return stats; 559 560 /* No problem if kmem_cache_zalloc() fails */ 561 stats_new = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); 562 563 spin_lock_irq(&tsk->sighand->siglock); 564 stats = sig->stats; 565 if (!stats) { 566 /* 567 * Pairs with smp_store_release() above and order the 568 * kmem_cache_zalloc(). 569 */ 570 smp_store_release(&sig->stats, stats_new); 571 stats = stats_new; 572 stats_new = NULL; 573 } 574 spin_unlock_irq(&tsk->sighand->siglock); 575 576 if (stats_new) 577 kmem_cache_free(taskstats_cache, stats_new); 578 579 return stats; 580 } 581 582 /* Send pid data out on exit */ 583 void taskstats_exit(struct task_struct *tsk, int group_dead) 584 { 585 int rc; 586 struct listener_list *listeners; 587 struct taskstats *stats; 588 struct sk_buff *rep_skb; 589 size_t size; 590 int is_thread_group; 591 592 if (!family_registered) 593 return; 594 595 /* 596 * Size includes space for nested attributes 597 */ 598 size = taskstats_packet_size(); 599 600 is_thread_group = !!taskstats_tgid_alloc(tsk); 601 if (is_thread_group) { 602 /* PID + STATS + TGID + STATS */ 603 size = 2 * size; 604 /* fill the tsk->signal->stats structure */ 605 fill_tgid_exit(tsk); 606 } 607 608 listeners = raw_cpu_ptr(&listener_array); 609 if (list_empty(&listeners->list)) 610 return; 611 612 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size); 613 if (rc < 0) 614 return; 615 616 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, 617 task_pid_nr_ns(tsk, &init_pid_ns)); 618 if (!stats) 619 goto err; 620 621 fill_stats(&init_user_ns, &init_pid_ns, tsk, stats); 622 623 /* 624 * Doesn't matter if tsk is the leader or the last group member leaving 625 */ 626 if (!is_thread_group || !group_dead) 627 goto send; 628 629 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, 630 task_tgid_nr_ns(tsk, &init_pid_ns)); 631 if (!stats) 632 goto err; 633 634 memcpy(stats, tsk->signal->stats, sizeof(*stats)); 635 636 send: 637 send_cpu_listeners(rep_skb, listeners); 638 return; 639 err: 640 nlmsg_free(rep_skb); 641 } 642 643 static const struct genl_ops taskstats_ops[] = { 644 { 645 .cmd = TASKSTATS_CMD_GET, 646 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 647 .doit = taskstats_user_cmd, 648 .policy = taskstats_cmd_get_policy, 649 .maxattr = ARRAY_SIZE(taskstats_cmd_get_policy) - 1, 650 .flags = GENL_ADMIN_PERM, 651 }, 652 { 653 .cmd = CGROUPSTATS_CMD_GET, 654 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 655 .doit = cgroupstats_user_cmd, 656 .policy = cgroupstats_cmd_get_policy, 657 .maxattr = ARRAY_SIZE(cgroupstats_cmd_get_policy) - 1, 658 }, 659 }; 660 661 static struct genl_family family __ro_after_init = { 662 .name = TASKSTATS_GENL_NAME, 663 .version = TASKSTATS_GENL_VERSION, 664 .module = THIS_MODULE, 665 .ops = taskstats_ops, 666 .n_ops = ARRAY_SIZE(taskstats_ops), 667 }; 668 669 /* Needed early in initialization */ 670 void __init taskstats_init_early(void) 671 { 672 unsigned int i; 673 674 taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC); 675 for_each_possible_cpu(i) { 676 INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); 677 init_rwsem(&(per_cpu(listener_array, i).sem)); 678 } 679 } 680 681 static int __init taskstats_init(void) 682 { 683 int rc; 684 685 rc = genl_register_family(&family); 686 if (rc) 687 return rc; 688 689 family_registered = 1; 690 pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); 691 return 0; 692 } 693 694 /* 695 * late initcall ensures initialization of statistics collection 696 * mechanisms precedes initialization of the taskstats interface 697 */ 698 late_initcall(taskstats_init); 699