1 /* 2 * taskstats.c - Export per-task statistics to userland 3 * 4 * Copyright (C) Shailabh Nagar, IBM Corp. 2006 5 * (C) Balbir Singh, IBM Corp. 2006 6 * 7 * This program is free software; you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License as published by 9 * the Free Software Foundation; either version 2 of the License, or 10 * (at your option) any later version. 11 * 12 * This program is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * GNU General Public License for more details. 16 * 17 */ 18 19 #include <linux/kernel.h> 20 #include <linux/taskstats_kern.h> 21 #include <linux/tsacct_kern.h> 22 #include <linux/delayacct.h> 23 #include <linux/cpumask.h> 24 #include <linux/percpu.h> 25 #include <linux/slab.h> 26 #include <linux/cgroupstats.h> 27 #include <linux/cgroup.h> 28 #include <linux/fs.h> 29 #include <linux/file.h> 30 #include <linux/pid_namespace.h> 31 #include <net/genetlink.h> 32 #include <linux/atomic.h> 33 34 /* 35 * Maximum length of a cpumask that can be specified in 36 * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute 37 */ 38 #define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) 39 40 static DEFINE_PER_CPU(__u32, taskstats_seqnum); 41 static int family_registered; 42 struct kmem_cache *taskstats_cache; 43 44 static struct genl_family family = { 45 .id = GENL_ID_GENERATE, 46 .name = TASKSTATS_GENL_NAME, 47 .version = TASKSTATS_GENL_VERSION, 48 .maxattr = TASKSTATS_CMD_ATTR_MAX, 49 }; 50 51 static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { 52 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, 53 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, 54 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 55 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 56 57 /* 58 * We have to use TASKSTATS_CMD_ATTR_MAX here, it is the maxattr in the family. 59 * Make sure they are always aligned. 60 */ 61 static const struct nla_policy cgroupstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { 62 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, 63 }; 64 65 struct listener { 66 struct list_head list; 67 pid_t pid; 68 char valid; 69 }; 70 71 struct listener_list { 72 struct rw_semaphore sem; 73 struct list_head list; 74 }; 75 static DEFINE_PER_CPU(struct listener_list, listener_array); 76 77 enum actions { 78 REGISTER, 79 DEREGISTER, 80 CPU_DONT_CARE 81 }; 82 83 static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, 84 size_t size) 85 { 86 struct sk_buff *skb; 87 void *reply; 88 89 /* 90 * If new attributes are added, please revisit this allocation 91 */ 92 skb = genlmsg_new(size, GFP_KERNEL); 93 if (!skb) 94 return -ENOMEM; 95 96 if (!info) { 97 int seq = this_cpu_inc_return(taskstats_seqnum) - 1; 98 99 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); 100 } else 101 reply = genlmsg_put_reply(skb, info, &family, 0, cmd); 102 if (reply == NULL) { 103 nlmsg_free(skb); 104 return -EINVAL; 105 } 106 107 *skbp = skb; 108 return 0; 109 } 110 111 /* 112 * Send taskstats data in @skb to listener with nl_pid @pid 113 */ 114 static int send_reply(struct sk_buff *skb, struct genl_info *info) 115 { 116 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 117 void *reply = genlmsg_data(genlhdr); 118 119 genlmsg_end(skb, reply); 120 121 return genlmsg_reply(skb, info); 122 } 123 124 /* 125 * Send taskstats data in @skb to listeners registered for @cpu's exit data 126 */ 127 static void send_cpu_listeners(struct sk_buff *skb, 128 struct listener_list *listeners) 129 { 130 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 131 struct listener *s, *tmp; 132 struct sk_buff *skb_next, *skb_cur = skb; 133 void *reply = genlmsg_data(genlhdr); 134 int rc, delcount = 0; 135 136 genlmsg_end(skb, reply); 137 138 rc = 0; 139 down_read(&listeners->sem); 140 list_for_each_entry(s, &listeners->list, list) { 141 skb_next = NULL; 142 if (!list_is_last(&s->list, &listeners->list)) { 143 skb_next = skb_clone(skb_cur, GFP_KERNEL); 144 if (!skb_next) 145 break; 146 } 147 rc = genlmsg_unicast(&init_net, skb_cur, s->pid); 148 if (rc == -ECONNREFUSED) { 149 s->valid = 0; 150 delcount++; 151 } 152 skb_cur = skb_next; 153 } 154 up_read(&listeners->sem); 155 156 if (skb_cur) 157 nlmsg_free(skb_cur); 158 159 if (!delcount) 160 return; 161 162 /* Delete invalidated entries */ 163 down_write(&listeners->sem); 164 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 165 if (!s->valid) { 166 list_del(&s->list); 167 kfree(s); 168 } 169 } 170 up_write(&listeners->sem); 171 } 172 173 static void fill_stats(struct user_namespace *user_ns, 174 struct pid_namespace *pid_ns, 175 struct task_struct *tsk, struct taskstats *stats) 176 { 177 memset(stats, 0, sizeof(*stats)); 178 /* 179 * Each accounting subsystem adds calls to its functions to 180 * fill in relevant parts of struct taskstsats as follows 181 * 182 * per-task-foo(stats, tsk); 183 */ 184 185 delayacct_add_tsk(stats, tsk); 186 187 /* fill in basic acct fields */ 188 stats->version = TASKSTATS_VERSION; 189 stats->nvcsw = tsk->nvcsw; 190 stats->nivcsw = tsk->nivcsw; 191 bacct_add_tsk(user_ns, pid_ns, stats, tsk); 192 193 /* fill in extended acct fields */ 194 xacct_add_tsk(stats, tsk); 195 } 196 197 static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) 198 { 199 struct task_struct *tsk; 200 201 rcu_read_lock(); 202 tsk = find_task_by_vpid(pid); 203 if (tsk) 204 get_task_struct(tsk); 205 rcu_read_unlock(); 206 if (!tsk) 207 return -ESRCH; 208 fill_stats(current_user_ns(), task_active_pid_ns(current), tsk, stats); 209 put_task_struct(tsk); 210 return 0; 211 } 212 213 static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) 214 { 215 struct task_struct *tsk, *first; 216 unsigned long flags; 217 int rc = -ESRCH; 218 219 /* 220 * Add additional stats from live tasks except zombie thread group 221 * leaders who are already counted with the dead tasks 222 */ 223 rcu_read_lock(); 224 first = find_task_by_vpid(tgid); 225 226 if (!first || !lock_task_sighand(first, &flags)) 227 goto out; 228 229 if (first->signal->stats) 230 memcpy(stats, first->signal->stats, sizeof(*stats)); 231 else 232 memset(stats, 0, sizeof(*stats)); 233 234 tsk = first; 235 do { 236 if (tsk->exit_state) 237 continue; 238 /* 239 * Accounting subsystem can call its functions here to 240 * fill in relevant parts of struct taskstsats as follows 241 * 242 * per-task-foo(stats, tsk); 243 */ 244 delayacct_add_tsk(stats, tsk); 245 246 stats->nvcsw += tsk->nvcsw; 247 stats->nivcsw += tsk->nivcsw; 248 } while_each_thread(first, tsk); 249 250 unlock_task_sighand(first, &flags); 251 rc = 0; 252 out: 253 rcu_read_unlock(); 254 255 stats->version = TASKSTATS_VERSION; 256 /* 257 * Accounting subsystems can also add calls here to modify 258 * fields of taskstats. 259 */ 260 return rc; 261 } 262 263 static void fill_tgid_exit(struct task_struct *tsk) 264 { 265 unsigned long flags; 266 267 spin_lock_irqsave(&tsk->sighand->siglock, flags); 268 if (!tsk->signal->stats) 269 goto ret; 270 271 /* 272 * Each accounting subsystem calls its functions here to 273 * accumalate its per-task stats for tsk, into the per-tgid structure 274 * 275 * per-task-foo(tsk->signal->stats, tsk); 276 */ 277 delayacct_add_tsk(tsk->signal->stats, tsk); 278 ret: 279 spin_unlock_irqrestore(&tsk->sighand->siglock, flags); 280 return; 281 } 282 283 static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) 284 { 285 struct listener_list *listeners; 286 struct listener *s, *tmp, *s2; 287 unsigned int cpu; 288 int ret = 0; 289 290 if (!cpumask_subset(mask, cpu_possible_mask)) 291 return -EINVAL; 292 293 if (current_user_ns() != &init_user_ns) 294 return -EINVAL; 295 296 if (task_active_pid_ns(current) != &init_pid_ns) 297 return -EINVAL; 298 299 if (isadd == REGISTER) { 300 for_each_cpu(cpu, mask) { 301 s = kmalloc_node(sizeof(struct listener), 302 GFP_KERNEL, cpu_to_node(cpu)); 303 if (!s) { 304 ret = -ENOMEM; 305 goto cleanup; 306 } 307 s->pid = pid; 308 s->valid = 1; 309 310 listeners = &per_cpu(listener_array, cpu); 311 down_write(&listeners->sem); 312 list_for_each_entry(s2, &listeners->list, list) { 313 if (s2->pid == pid && s2->valid) 314 goto exists; 315 } 316 list_add(&s->list, &listeners->list); 317 s = NULL; 318 exists: 319 up_write(&listeners->sem); 320 kfree(s); /* nop if NULL */ 321 } 322 return 0; 323 } 324 325 /* Deregister or cleanup */ 326 cleanup: 327 for_each_cpu(cpu, mask) { 328 listeners = &per_cpu(listener_array, cpu); 329 down_write(&listeners->sem); 330 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 331 if (s->pid == pid) { 332 list_del(&s->list); 333 kfree(s); 334 break; 335 } 336 } 337 up_write(&listeners->sem); 338 } 339 return ret; 340 } 341 342 static int parse(struct nlattr *na, struct cpumask *mask) 343 { 344 char *data; 345 int len; 346 int ret; 347 348 if (na == NULL) 349 return 1; 350 len = nla_len(na); 351 if (len > TASKSTATS_CPUMASK_MAXLEN) 352 return -E2BIG; 353 if (len < 1) 354 return -EINVAL; 355 data = kmalloc(len, GFP_KERNEL); 356 if (!data) 357 return -ENOMEM; 358 nla_strlcpy(data, na, len); 359 ret = cpulist_parse(data, mask); 360 kfree(data); 361 return ret; 362 } 363 364 static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) 365 { 366 struct nlattr *na, *ret; 367 int aggr; 368 369 aggr = (type == TASKSTATS_TYPE_PID) 370 ? TASKSTATS_TYPE_AGGR_PID 371 : TASKSTATS_TYPE_AGGR_TGID; 372 373 na = nla_nest_start(skb, aggr); 374 if (!na) 375 goto err; 376 377 if (nla_put(skb, type, sizeof(pid), &pid) < 0) { 378 nla_nest_cancel(skb, na); 379 goto err; 380 } 381 ret = nla_reserve_64bit(skb, TASKSTATS_TYPE_STATS, 382 sizeof(struct taskstats), TASKSTATS_TYPE_NULL); 383 if (!ret) { 384 nla_nest_cancel(skb, na); 385 goto err; 386 } 387 nla_nest_end(skb, na); 388 389 return nla_data(ret); 390 err: 391 return NULL; 392 } 393 394 static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 395 { 396 int rc = 0; 397 struct sk_buff *rep_skb; 398 struct cgroupstats *stats; 399 struct nlattr *na; 400 size_t size; 401 u32 fd; 402 struct fd f; 403 404 na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; 405 if (!na) 406 return -EINVAL; 407 408 fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); 409 f = fdget(fd); 410 if (!f.file) 411 return 0; 412 413 size = nla_total_size(sizeof(struct cgroupstats)); 414 415 rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb, 416 size); 417 if (rc < 0) 418 goto err; 419 420 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, 421 sizeof(struct cgroupstats)); 422 if (na == NULL) { 423 nlmsg_free(rep_skb); 424 rc = -EMSGSIZE; 425 goto err; 426 } 427 428 stats = nla_data(na); 429 memset(stats, 0, sizeof(*stats)); 430 431 rc = cgroupstats_build(stats, f.file->f_path.dentry); 432 if (rc < 0) { 433 nlmsg_free(rep_skb); 434 goto err; 435 } 436 437 rc = send_reply(rep_skb, info); 438 439 err: 440 fdput(f); 441 return rc; 442 } 443 444 static int cmd_attr_register_cpumask(struct genl_info *info) 445 { 446 cpumask_var_t mask; 447 int rc; 448 449 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 450 return -ENOMEM; 451 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); 452 if (rc < 0) 453 goto out; 454 rc = add_del_listener(info->snd_portid, mask, REGISTER); 455 out: 456 free_cpumask_var(mask); 457 return rc; 458 } 459 460 static int cmd_attr_deregister_cpumask(struct genl_info *info) 461 { 462 cpumask_var_t mask; 463 int rc; 464 465 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 466 return -ENOMEM; 467 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); 468 if (rc < 0) 469 goto out; 470 rc = add_del_listener(info->snd_portid, mask, DEREGISTER); 471 out: 472 free_cpumask_var(mask); 473 return rc; 474 } 475 476 static size_t taskstats_packet_size(void) 477 { 478 size_t size; 479 480 size = nla_total_size(sizeof(u32)) + 481 nla_total_size_64bit(sizeof(struct taskstats)) + 482 nla_total_size(0); 483 484 return size; 485 } 486 487 static int cmd_attr_pid(struct genl_info *info) 488 { 489 struct taskstats *stats; 490 struct sk_buff *rep_skb; 491 size_t size; 492 u32 pid; 493 int rc; 494 495 size = taskstats_packet_size(); 496 497 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 498 if (rc < 0) 499 return rc; 500 501 rc = -EINVAL; 502 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); 503 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); 504 if (!stats) 505 goto err; 506 507 rc = fill_stats_for_pid(pid, stats); 508 if (rc < 0) 509 goto err; 510 return send_reply(rep_skb, info); 511 err: 512 nlmsg_free(rep_skb); 513 return rc; 514 } 515 516 static int cmd_attr_tgid(struct genl_info *info) 517 { 518 struct taskstats *stats; 519 struct sk_buff *rep_skb; 520 size_t size; 521 u32 tgid; 522 int rc; 523 524 size = taskstats_packet_size(); 525 526 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 527 if (rc < 0) 528 return rc; 529 530 rc = -EINVAL; 531 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); 532 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); 533 if (!stats) 534 goto err; 535 536 rc = fill_stats_for_tgid(tgid, stats); 537 if (rc < 0) 538 goto err; 539 return send_reply(rep_skb, info); 540 err: 541 nlmsg_free(rep_skb); 542 return rc; 543 } 544 545 static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 546 { 547 if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK]) 548 return cmd_attr_register_cpumask(info); 549 else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK]) 550 return cmd_attr_deregister_cpumask(info); 551 else if (info->attrs[TASKSTATS_CMD_ATTR_PID]) 552 return cmd_attr_pid(info); 553 else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) 554 return cmd_attr_tgid(info); 555 else 556 return -EINVAL; 557 } 558 559 static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) 560 { 561 struct signal_struct *sig = tsk->signal; 562 struct taskstats *stats; 563 564 if (sig->stats || thread_group_empty(tsk)) 565 goto ret; 566 567 /* No problem if kmem_cache_zalloc() fails */ 568 stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); 569 570 spin_lock_irq(&tsk->sighand->siglock); 571 if (!sig->stats) { 572 sig->stats = stats; 573 stats = NULL; 574 } 575 spin_unlock_irq(&tsk->sighand->siglock); 576 577 if (stats) 578 kmem_cache_free(taskstats_cache, stats); 579 ret: 580 return sig->stats; 581 } 582 583 /* Send pid data out on exit */ 584 void taskstats_exit(struct task_struct *tsk, int group_dead) 585 { 586 int rc; 587 struct listener_list *listeners; 588 struct taskstats *stats; 589 struct sk_buff *rep_skb; 590 size_t size; 591 int is_thread_group; 592 593 if (!family_registered) 594 return; 595 596 /* 597 * Size includes space for nested attributes 598 */ 599 size = taskstats_packet_size(); 600 601 is_thread_group = !!taskstats_tgid_alloc(tsk); 602 if (is_thread_group) { 603 /* PID + STATS + TGID + STATS */ 604 size = 2 * size; 605 /* fill the tsk->signal->stats structure */ 606 fill_tgid_exit(tsk); 607 } 608 609 listeners = raw_cpu_ptr(&listener_array); 610 if (list_empty(&listeners->list)) 611 return; 612 613 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size); 614 if (rc < 0) 615 return; 616 617 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, 618 task_pid_nr_ns(tsk, &init_pid_ns)); 619 if (!stats) 620 goto err; 621 622 fill_stats(&init_user_ns, &init_pid_ns, tsk, stats); 623 624 /* 625 * Doesn't matter if tsk is the leader or the last group member leaving 626 */ 627 if (!is_thread_group || !group_dead) 628 goto send; 629 630 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, 631 task_tgid_nr_ns(tsk, &init_pid_ns)); 632 if (!stats) 633 goto err; 634 635 memcpy(stats, tsk->signal->stats, sizeof(*stats)); 636 637 send: 638 send_cpu_listeners(rep_skb, listeners); 639 return; 640 err: 641 nlmsg_free(rep_skb); 642 } 643 644 static const struct genl_ops taskstats_ops[] = { 645 { 646 .cmd = TASKSTATS_CMD_GET, 647 .doit = taskstats_user_cmd, 648 .policy = taskstats_cmd_get_policy, 649 .flags = GENL_ADMIN_PERM, 650 }, 651 { 652 .cmd = CGROUPSTATS_CMD_GET, 653 .doit = cgroupstats_user_cmd, 654 .policy = cgroupstats_cmd_get_policy, 655 }, 656 }; 657 658 /* Needed early in initialization */ 659 void __init taskstats_init_early(void) 660 { 661 unsigned int i; 662 663 taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC); 664 for_each_possible_cpu(i) { 665 INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); 666 init_rwsem(&(per_cpu(listener_array, i).sem)); 667 } 668 } 669 670 static int __init taskstats_init(void) 671 { 672 int rc; 673 674 rc = genl_register_family_with_ops(&family, taskstats_ops); 675 if (rc) 676 return rc; 677 678 family_registered = 1; 679 pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); 680 return 0; 681 } 682 683 /* 684 * late initcall ensures initialization of statistics collection 685 * mechanisms precedes initialization of the taskstats interface 686 */ 687 late_initcall(taskstats_init); 688