// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * taskstats.c - Export per-task statistics to userland
 *
 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
 *           (C) Balbir Singh, IBM Corp. 2006
 */

#include <linux/kernel.h>
#include <linux/taskstats_kern.h>
#include <linux/tsacct_kern.h>
#include <linux/delayacct.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/cgroupstats.h>
#include <linux/cgroup.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pid_namespace.h>
#include <net/genetlink.h>
#include <linux/atomic.h>
#include <linux/sched/cputime.h>

/*
 * Maximum length of a cpumask that can be specified in
 * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute
 */
#define TASKSTATS_CPUMASK_MAXLEN	(100+6*NR_CPUS)

static DEFINE_PER_CPU(__u32, taskstats_seqnum);
static int family_registered;
struct kmem_cache *taskstats_cache;

static struct genl_family family;

static const struct nla_policy taskstats_cmd_get_policy[] = {
	[TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 },
	[TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
	[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
	[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },
};

static const struct nla_policy cgroupstats_cmd_get_policy[] = {
	[CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
};

struct listener {
	struct list_head list;
	pid_t pid;
	char valid;
};

struct listener_list {
	struct rw_semaphore sem;
	struct list_head list;
};
static DEFINE_PER_CPU(struct listener_list, listener_array);

enum actions {
	REGISTER,
	DEREGISTER,
	CPU_DONT_CARE
};

static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
				size_t size)
{
	struct sk_buff *skb;
	void *reply;

	/*
	 * If new attributes are added, please revisit this allocation
	 */
	skb = genlmsg_new(size, GFP_KERNEL);
	if (!skb)
		return -ENOMEM;

	if (!info) {
		int seq = this_cpu_inc_return(taskstats_seqnum) - 1;

		reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);
	} else
		reply = genlmsg_put_reply(skb, info, &family, 0, cmd);
	if (reply == NULL) {
		nlmsg_free(skb);
		return -EINVAL;
	}

	*skbp = skb;
	return 0;
}

/*
 * Send taskstats data in @skb back to the requester identified by @info
 */
static int send_reply(struct sk_buff *skb, struct genl_info *info)
{
	struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
	void *reply = genlmsg_data(genlhdr);

	genlmsg_end(skb, reply);

	return genlmsg_reply(skb, info);
}

/*
 * Send taskstats data in @skb to listeners registered for @cpu's exit data
 */
static void send_cpu_listeners(struct sk_buff *skb,
					struct listener_list *listeners)
{
	struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
	struct listener *s, *tmp;
	struct sk_buff *skb_next, *skb_cur = skb;
	void *reply = genlmsg_data(genlhdr);
	int delcount = 0;

	genlmsg_end(skb, reply);

	down_read(&listeners->sem);
	list_for_each_entry(s, &listeners->list, list) {
		int rc;

		skb_next = NULL;
		if (!list_is_last(&s->list, &listeners->list)) {
			skb_next = skb_clone(skb_cur, GFP_KERNEL);
			if (!skb_next)
				break;
		}
		rc = genlmsg_unicast(&init_net, skb_cur, s->pid);
		if (rc == -ECONNREFUSED) {
			s->valid = 0;
			delcount++;
		}
		skb_cur = skb_next;
	}
	up_read(&listeners->sem);

	if (skb_cur)
		nlmsg_free(skb_cur);

	if (!delcount)
		return;

	/* Delete invalidated entries */
	down_write(&listeners->sem);
	list_for_each_entry_safe(s, tmp, &listeners->list, list) {
		if (!s->valid) {
			list_del(&s->list);
			kfree(s);
		}
	}
	up_write(&listeners->sem);
}
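
/*
 * fill_stats - collect the complete accounting picture for one task:
 * delay accounting, basic accounting (bacct) and extended accounting
 * (xacct) each fill in their own portion of *stats below.
 */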
static void fill_stats(struct user_namespace *user_ns,
		       struct pid_namespace *pid_ns,
		       struct task_struct *tsk, struct taskstats *stats)
{
	memset(stats, 0, sizeof(*stats));
	/*
	 * Each accounting subsystem adds calls to its functions to
	 * fill in relevant parts of struct taskstats as follows
	 *
	 *	per-task-foo(stats, tsk);
	 */

	delayacct_add_tsk(stats, tsk);

	/* fill in basic acct fields */
	stats->version = TASKSTATS_VERSION;
	stats->nvcsw = tsk->nvcsw;
	stats->nivcsw = tsk->nivcsw;
	bacct_add_tsk(user_ns, pid_ns, stats, tsk);

	/* fill in extended acct fields */
	xacct_add_tsk(stats, tsk);
}

static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
{
	struct task_struct *tsk;

	tsk = find_get_task_by_vpid(pid);
	if (!tsk)
		return -ESRCH;
	fill_stats(current_user_ns(), task_active_pid_ns(current), tsk, stats);
	put_task_struct(tsk);
	return 0;
}

static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
{
	struct task_struct *tsk, *first;
	unsigned long flags;
	int rc = -ESRCH;
	u64 delta, utime, stime;
	u64 start_time;

	/*
	 * Add additional stats from live tasks except zombie thread group
	 * leaders who are already counted with the dead tasks
	 */
	rcu_read_lock();
	first = find_task_by_vpid(tgid);

	if (!first || !lock_task_sighand(first, &flags))
		goto out;

	if (first->signal->stats)
		memcpy(stats, first->signal->stats, sizeof(*stats));
	else
		memset(stats, 0, sizeof(*stats));

	tsk = first;
	start_time = ktime_get_ns();
	do {
		if (tsk->exit_state)
			continue;
		/*
		 * Accounting subsystem can call its functions here to
		 * fill in relevant parts of struct taskstats as follows
		 *
		 *	per-task-foo(stats, tsk);
		 */
		delayacct_add_tsk(stats, tsk);

		/* calculate task elapsed time in nsec */
		delta = start_time - tsk->start_time;
		/* Convert to microseconds */
		do_div(delta, NSEC_PER_USEC);
		stats->ac_etime += delta;

		task_cputime(tsk, &utime, &stime);
		stats->ac_utime += div_u64(utime, NSEC_PER_USEC);
		stats->ac_stime += div_u64(stime, NSEC_PER_USEC);

		stats->nvcsw += tsk->nvcsw;
		stats->nivcsw += tsk->nivcsw;
	} while_each_thread(first, tsk);

	unlock_task_sighand(first, &flags);
	rc = 0;
out:
	rcu_read_unlock();

	stats->version = TASKSTATS_VERSION;
	/*
	 * Accounting subsystems can also add calls here to modify
	 * fields of taskstats.
	 */
	return rc;
}
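
/*
 * fill_tgid_exit - fold the exiting task's delay accounting into the
 * shared per-thread-group taskstats (tsk->signal->stats), if allocated.
 * Runs under siglock, so it cannot race with a concurrent
 * TASKSTATS_CMD_ATTR_TGID reader, which takes the same lock.
 */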
static void fill_tgid_exit(struct task_struct *tsk)
{
	unsigned long flags;

	spin_lock_irqsave(&tsk->sighand->siglock, flags);
	if (!tsk->signal->stats)
		goto ret;

	/*
	 * Each accounting subsystem calls its functions here to
	 * accumulate its per-task stats for tsk, into the per-tgid structure
	 *
	 *	per-task-foo(tsk->signal->stats, tsk);
	 */
	delayacct_add_tsk(tsk->signal->stats, tsk);
ret:
	spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
	return;
}

static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
{
	struct listener_list *listeners;
	struct listener *s, *tmp, *s2;
	unsigned int cpu;
	int ret = 0;

	if (!cpumask_subset(mask, cpu_possible_mask))
		return -EINVAL;

	if (current_user_ns() != &init_user_ns)
		return -EINVAL;

	if (task_active_pid_ns(current) != &init_pid_ns)
		return -EINVAL;

	if (isadd == REGISTER) {
		for_each_cpu(cpu, mask) {
			s = kmalloc_node(sizeof(struct listener),
					GFP_KERNEL, cpu_to_node(cpu));
			if (!s) {
				ret = -ENOMEM;
				goto cleanup;
			}
			s->pid = pid;
			s->valid = 1;

			listeners = &per_cpu(listener_array, cpu);
			down_write(&listeners->sem);
			list_for_each_entry(s2, &listeners->list, list) {
				if (s2->pid == pid && s2->valid)
					goto exists;
			}
			list_add(&s->list, &listeners->list);
			s = NULL;
exists:
			up_write(&listeners->sem);
			kfree(s); /* nop if NULL */
		}
		return 0;
	}

	/* Deregister or cleanup */
cleanup:
	for_each_cpu(cpu, mask) {
		listeners = &per_cpu(listener_array, cpu);
		down_write(&listeners->sem);
		list_for_each_entry_safe(s, tmp, &listeners->list, list) {
			if (s->pid == pid) {
				list_del(&s->list);
				kfree(s);
				break;
			}
		}
		up_write(&listeners->sem);
	}
	return ret;
}

static int parse(struct nlattr *na, struct cpumask *mask)
{
	char *data;
	int len;
	int ret;

	if (na == NULL)
		return 1;
	len = nla_len(na);
	if (len > TASKSTATS_CPUMASK_MAXLEN)
		return -E2BIG;
	if (len < 1)
		return -EINVAL;
	data = kmalloc(len, GFP_KERNEL);
	if (!data)
		return -ENOMEM;
	nla_strscpy(data, na, len);
	ret = cpulist_parse(data, mask);
	kfree(data);
	return ret;
}
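
/*
 * mk_reply - reserve the nested reply attribute for one pid/tgid: a
 * TASKSTATS_TYPE_AGGR_{PID,TGID} nest holding the u32 id followed by an
 * empty struct taskstats. Returns a pointer into the skb at which the
 * caller fills in the stats, or NULL if the skb ran out of room.
 */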
static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
{
	struct nlattr *na, *ret;
	int aggr;

	aggr = (type == TASKSTATS_TYPE_PID)
			? TASKSTATS_TYPE_AGGR_PID
			: TASKSTATS_TYPE_AGGR_TGID;

	na = nla_nest_start_noflag(skb, aggr);
	if (!na)
		goto err;

	if (nla_put(skb, type, sizeof(pid), &pid) < 0) {
		nla_nest_cancel(skb, na);
		goto err;
	}
	ret = nla_reserve_64bit(skb, TASKSTATS_TYPE_STATS,
				sizeof(struct taskstats), TASKSTATS_TYPE_NULL);
	if (!ret) {
		nla_nest_cancel(skb, na);
		goto err;
	}
	nla_nest_end(skb, na);

	return nla_data(ret);
err:
	return NULL;
}

static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
{
	int rc = 0;
	struct sk_buff *rep_skb;
	struct cgroupstats *stats;
	struct nlattr *na;
	size_t size;
	u32 fd;
	struct fd f;

	na = info->attrs[CGROUPSTATS_CMD_ATTR_FD];
	if (!na)
		return -EINVAL;

	fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]);
	f = fdget(fd);
	if (!f.file)
		return 0;

	size = nla_total_size(sizeof(struct cgroupstats));

	rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb,
				size);
	if (rc < 0)
		goto err;

	na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
			 sizeof(struct cgroupstats));
	if (na == NULL) {
		nlmsg_free(rep_skb);
		rc = -EMSGSIZE;
		goto err;
	}

	stats = nla_data(na);
	memset(stats, 0, sizeof(*stats));

	rc = cgroupstats_build(stats, f.file->f_path.dentry);
	if (rc < 0) {
		nlmsg_free(rep_skb);
		goto err;
	}

	rc = send_reply(rep_skb, info);

err:
	fdput(f);
	return rc;
}

static int cmd_attr_register_cpumask(struct genl_info *info)
{
	cpumask_var_t mask;
	int rc;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;
	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
	if (rc < 0)
		goto out;
	rc = add_del_listener(info->snd_portid, mask, REGISTER);
out:
	free_cpumask_var(mask);
	return rc;
}

static int cmd_attr_deregister_cpumask(struct genl_info *info)
{
	cpumask_var_t mask;
	int rc;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;
	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
	if (rc < 0)
		goto out;
	rc = add_del_listener(info->snd_portid, mask, DEREGISTER);
out:
	free_cpumask_var(mask);
	return rc;
}

static size_t taskstats_packet_size(void)
{
	size_t size;

	size = nla_total_size(sizeof(u32)) +
	       nla_total_size_64bit(sizeof(struct taskstats)) +
	       nla_total_size(0);

	return size;
}

static int cmd_attr_pid(struct genl_info *info)
{
	struct taskstats *stats;
	struct sk_buff *rep_skb;
	size_t size;
	u32 pid;
	int rc;

	size = taskstats_packet_size();

	rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
	if (rc < 0)
		return rc;

	rc = -EINVAL;
	pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
	stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
	if (!stats)
		goto err;

	rc = fill_stats_for_pid(pid, stats);
	if (rc < 0)
		goto err;
	return send_reply(rep_skb, info);
err:
	nlmsg_free(rep_skb);
	return rc;
}
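
/*
 * cmd_attr_tgid mirrors cmd_attr_pid but replies with statistics
 * aggregated across the whole thread group: live threads are summed on
 * the fly by fill_stats_for_tgid() and already-exited ones come from
 * the accumulated signal->stats.
 */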
static int cmd_attr_tgid(struct genl_info *info)
{
	struct taskstats *stats;
	struct sk_buff *rep_skb;
	size_t size;
	u32 tgid;
	int rc;

	size = taskstats_packet_size();

	rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
	if (rc < 0)
		return rc;

	rc = -EINVAL;
	tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
	stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
	if (!stats)
		goto err;

	rc = fill_stats_for_tgid(tgid, stats);
	if (rc < 0)
		goto err;
	return send_reply(rep_skb, info);
err:
	nlmsg_free(rep_skb);
	return rc;
}

static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
{
	if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK])
		return cmd_attr_register_cpumask(info);
	else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK])
		return cmd_attr_deregister_cpumask(info);
	else if (info->attrs[TASKSTATS_CMD_ATTR_PID])
		return cmd_attr_pid(info);
	else if (info->attrs[TASKSTATS_CMD_ATTR_TGID])
		return cmd_attr_tgid(info);
	else
		return -EINVAL;
}

static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
{
	struct signal_struct *sig = tsk->signal;
	struct taskstats *stats_new, *stats;

	/* Pairs with smp_store_release() below. */
	stats = smp_load_acquire(&sig->stats);
	if (stats || thread_group_empty(tsk))
		return stats;

	/* No problem if kmem_cache_zalloc() fails */
	stats_new = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);

	spin_lock_irq(&tsk->sighand->siglock);
	stats = sig->stats;
	if (!stats) {
		/*
		 * Pairs with smp_load_acquire() above and orders the
		 * kmem_cache_zalloc().
		 */
		smp_store_release(&sig->stats, stats_new);
		stats = stats_new;
		stats_new = NULL;
	}
	spin_unlock_irq(&tsk->sighand->siglock);

	if (stats_new)
		kmem_cache_free(taskstats_cache, stats_new);

	return stats;
}

/* Send pid data out on exit */
void taskstats_exit(struct task_struct *tsk, int group_dead)
{
	int rc;
	struct listener_list *listeners;
	struct taskstats *stats;
	struct sk_buff *rep_skb;
	size_t size;
	int is_thread_group;

	if (!family_registered)
		return;

	/*
	 * Size includes space for nested attributes
	 */
	size = taskstats_packet_size();

	is_thread_group = !!taskstats_tgid_alloc(tsk);
	if (is_thread_group) {
		/* PID + STATS + TGID + STATS */
		size = 2 * size;
		/* fill the tsk->signal->stats structure */
		fill_tgid_exit(tsk);
	}

	listeners = raw_cpu_ptr(&listener_array);
	if (list_empty(&listeners->list))
		return;

	rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size);
	if (rc < 0)
		return;

	stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID,
			 task_pid_nr_ns(tsk, &init_pid_ns));
	if (!stats)
		goto err;

	fill_stats(&init_user_ns, &init_pid_ns, tsk, stats);

	/*
	 * Doesn't matter if tsk is the leader or the last group member leaving
	 */
	if (!is_thread_group || !group_dead)
		goto send;

	stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID,
			 task_tgid_nr_ns(tsk, &init_pid_ns));
	if (!stats)
		goto err;

	memcpy(stats, tsk->signal->stats, sizeof(*stats));

send:
	send_cpu_listeners(rep_skb, listeners);
	return;
err:
	nlmsg_free(rep_skb);
}
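
/*
 * Command table: TASKSTATS_CMD_GET is restricted to CAP_NET_ADMIN via
 * GENL_ADMIN_PERM, while CGROUPSTATS_CMD_GET carries no such
 * restriction. Both commands opt out of strict attribute validation.
 */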
static const struct genl_ops taskstats_ops[] = {
	{
		.cmd = TASKSTATS_CMD_GET,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit = taskstats_user_cmd,
		.policy = taskstats_cmd_get_policy,
		.maxattr = ARRAY_SIZE(taskstats_cmd_get_policy) - 1,
		.flags = GENL_ADMIN_PERM,
	},
	{
		.cmd = CGROUPSTATS_CMD_GET,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.doit = cgroupstats_user_cmd,
		.policy = cgroupstats_cmd_get_policy,
		.maxattr = ARRAY_SIZE(cgroupstats_cmd_get_policy) - 1,
	},
};

static struct genl_family family __ro_after_init = {
	.name = TASKSTATS_GENL_NAME,
	.version = TASKSTATS_GENL_VERSION,
	.module = THIS_MODULE,
	.ops = taskstats_ops,
	.n_ops = ARRAY_SIZE(taskstats_ops),
};

/* Needed early in initialization */
void __init taskstats_init_early(void)
{
	unsigned int i;

	taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC);
	for_each_possible_cpu(i) {
		INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
		init_rwsem(&(per_cpu(listener_array, i).sem));
	}
}

static int __init taskstats_init(void)
{
	int rc;

	rc = genl_register_family(&family);
	if (rc)
		return rc;

	family_registered = 1;
	pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
	return 0;
}

/*
 * late initcall ensures initialization of statistics collection
 * mechanisms precedes initialization of the taskstats interface
 */
late_initcall(taskstats_init);
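
/*
 * Illustrative userspace sketch (not part of this file; assumes the
 * libnl-genl-3 library): a client resolves the family by
 * TASKSTATS_GENL_NAME and issues TASKSTATS_CMD_GET for one pid. A
 * complete client lives in tools/accounting/getdelays.c.
 *
 *	struct nl_sock *sk = nl_socket_alloc();
 *	genl_connect(sk);
 *	int fam = genl_ctrl_resolve(sk, TASKSTATS_GENL_NAME);
 *	struct nl_msg *msg = nlmsg_alloc();
 *	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, fam, 0, 0,
 *		    TASKSTATS_CMD_GET, TASKSTATS_GENL_VERSION);
 *	nla_put_u32(msg, TASKSTATS_CMD_ATTR_PID, pid);
 *	nl_send_auto(sk, msg);
 *	// the reply carries a TASKSTATS_TYPE_AGGR_PID nest holding the
 *	// u32 pid followed by the struct taskstats payload
 */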