1 /* 2 * taskstats.c - Export per-task statistics to userland 3 * 4 * Copyright (C) Shailabh Nagar, IBM Corp. 2006 5 * (C) Balbir Singh, IBM Corp. 2006 6 * 7 * This program is free software; you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License as published by 9 * the Free Software Foundation; either version 2 of the License, or 10 * (at your option) any later version. 11 * 12 * This program is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * GNU General Public License for more details. 16 * 17 */ 18 19 #include <linux/kernel.h> 20 #include <linux/taskstats_kern.h> 21 #include <linux/tsacct_kern.h> 22 #include <linux/delayacct.h> 23 #include <linux/cpumask.h> 24 #include <linux/percpu.h> 25 #include <linux/slab.h> 26 #include <linux/cgroupstats.h> 27 #include <linux/cgroup.h> 28 #include <linux/fs.h> 29 #include <linux/file.h> 30 #include <net/genetlink.h> 31 #include <asm/atomic.h> 32 33 /* 34 * Maximum length of a cpumask that can be specified in 35 * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute 36 */ 37 #define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) 38 39 static DEFINE_PER_CPU(__u32, taskstats_seqnum); 40 static int family_registered; 41 struct kmem_cache *taskstats_cache; 42 43 static struct genl_family family = { 44 .id = GENL_ID_GENERATE, 45 .name = TASKSTATS_GENL_NAME, 46 .version = TASKSTATS_GENL_VERSION, 47 .maxattr = TASKSTATS_CMD_ATTR_MAX, 48 }; 49 50 static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { 51 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, 52 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, 53 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 54 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 55 56 static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = { 57 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, 58 }; 59 60 struct listener { 61 struct list_head list; 62 pid_t pid; 63 char valid; 64 }; 65 66 struct listener_list { 67 struct rw_semaphore sem; 68 struct list_head list; 69 }; 70 static DEFINE_PER_CPU(struct listener_list, listener_array); 71 72 enum actions { 73 REGISTER, 74 DEREGISTER, 75 CPU_DONT_CARE 76 }; 77 78 static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, 79 size_t size) 80 { 81 struct sk_buff *skb; 82 void *reply; 83 84 /* 85 * If new attributes are added, please revisit this allocation 86 */ 87 skb = genlmsg_new(size, GFP_KERNEL); 88 if (!skb) 89 return -ENOMEM; 90 91 if (!info) { 92 int seq = get_cpu_var(taskstats_seqnum)++; 93 put_cpu_var(taskstats_seqnum); 94 95 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); 96 } else 97 reply = genlmsg_put_reply(skb, info, &family, 0, cmd); 98 if (reply == NULL) { 99 nlmsg_free(skb); 100 return -EINVAL; 101 } 102 103 *skbp = skb; 104 return 0; 105 } 106 107 /* 108 * Send taskstats data in @skb to listener with nl_pid @pid 109 */ 110 static int send_reply(struct sk_buff *skb, struct genl_info *info) 111 { 112 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 113 void *reply = genlmsg_data(genlhdr); 114 int rc; 115 116 rc = genlmsg_end(skb, reply); 117 if (rc < 0) { 118 nlmsg_free(skb); 119 return rc; 120 } 121 122 return genlmsg_reply(skb, info); 123 } 124 125 /* 126 * Send taskstats data in @skb to listeners registered for @cpu's exit data 127 */ 128 static void send_cpu_listeners(struct sk_buff *skb, 129 struct listener_list *listeners) 130 { 131 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 132 struct listener *s, *tmp; 133 struct sk_buff *skb_next, *skb_cur = skb; 134 void *reply = genlmsg_data(genlhdr); 135 int rc, delcount = 0; 136 137 rc = genlmsg_end(skb, reply); 138 if (rc < 0) { 139 nlmsg_free(skb); 140 return; 141 } 142 143 rc = 0; 144 down_read(&listeners->sem); 145 list_for_each_entry(s, &listeners->list, list) { 146 skb_next = NULL; 147 if (!list_is_last(&s->list, &listeners->list)) { 148 skb_next = skb_clone(skb_cur, GFP_KERNEL); 149 if (!skb_next) 150 break; 151 } 152 rc = genlmsg_unicast(&init_net, skb_cur, s->pid); 153 if (rc == -ECONNREFUSED) { 154 s->valid = 0; 155 delcount++; 156 } 157 skb_cur = skb_next; 158 } 159 up_read(&listeners->sem); 160 161 if (skb_cur) 162 nlmsg_free(skb_cur); 163 164 if (!delcount) 165 return; 166 167 /* Delete invalidated entries */ 168 down_write(&listeners->sem); 169 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 170 if (!s->valid) { 171 list_del(&s->list); 172 kfree(s); 173 } 174 } 175 up_write(&listeners->sem); 176 } 177 178 static void fill_stats(struct task_struct *tsk, struct taskstats *stats) 179 { 180 memset(stats, 0, sizeof(*stats)); 181 /* 182 * Each accounting subsystem adds calls to its functions to 183 * fill in relevant parts of struct taskstsats as follows 184 * 185 * per-task-foo(stats, tsk); 186 */ 187 188 delayacct_add_tsk(stats, tsk); 189 190 /* fill in basic acct fields */ 191 stats->version = TASKSTATS_VERSION; 192 stats->nvcsw = tsk->nvcsw; 193 stats->nivcsw = tsk->nivcsw; 194 bacct_add_tsk(stats, tsk); 195 196 /* fill in extended acct fields */ 197 xacct_add_tsk(stats, tsk); 198 } 199 200 static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) 201 { 202 struct task_struct *tsk; 203 204 rcu_read_lock(); 205 tsk = find_task_by_vpid(pid); 206 if (tsk) 207 get_task_struct(tsk); 208 rcu_read_unlock(); 209 if (!tsk) 210 return -ESRCH; 211 fill_stats(tsk, stats); 212 put_task_struct(tsk); 213 return 0; 214 } 215 216 static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) 217 { 218 struct task_struct *tsk, *first; 219 unsigned long flags; 220 int rc = -ESRCH; 221 222 /* 223 * Add additional stats from live tasks except zombie thread group 224 * leaders who are already counted with the dead tasks 225 */ 226 rcu_read_lock(); 227 first = find_task_by_vpid(tgid); 228 229 if (!first || !lock_task_sighand(first, &flags)) 230 goto out; 231 232 if (first->signal->stats) 233 memcpy(stats, first->signal->stats, sizeof(*stats)); 234 else 235 memset(stats, 0, sizeof(*stats)); 236 237 tsk = first; 238 do { 239 if (tsk->exit_state) 240 continue; 241 /* 242 * Accounting subsystem can call its functions here to 243 * fill in relevant parts of struct taskstsats as follows 244 * 245 * per-task-foo(stats, tsk); 246 */ 247 delayacct_add_tsk(stats, tsk); 248 249 stats->nvcsw += tsk->nvcsw; 250 stats->nivcsw += tsk->nivcsw; 251 } while_each_thread(first, tsk); 252 253 unlock_task_sighand(first, &flags); 254 rc = 0; 255 out: 256 rcu_read_unlock(); 257 258 stats->version = TASKSTATS_VERSION; 259 /* 260 * Accounting subsystems can also add calls here to modify 261 * fields of taskstats. 262 */ 263 return rc; 264 } 265 266 static void fill_tgid_exit(struct task_struct *tsk) 267 { 268 unsigned long flags; 269 270 spin_lock_irqsave(&tsk->sighand->siglock, flags); 271 if (!tsk->signal->stats) 272 goto ret; 273 274 /* 275 * Each accounting subsystem calls its functions here to 276 * accumalate its per-task stats for tsk, into the per-tgid structure 277 * 278 * per-task-foo(tsk->signal->stats, tsk); 279 */ 280 delayacct_add_tsk(tsk->signal->stats, tsk); 281 ret: 282 spin_unlock_irqrestore(&tsk->sighand->siglock, flags); 283 return; 284 } 285 286 static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) 287 { 288 struct listener_list *listeners; 289 struct listener *s, *tmp; 290 unsigned int cpu; 291 292 if (!cpumask_subset(mask, cpu_possible_mask)) 293 return -EINVAL; 294 295 if (isadd == REGISTER) { 296 for_each_cpu(cpu, mask) { 297 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, 298 cpu_to_node(cpu)); 299 if (!s) 300 goto cleanup; 301 s->pid = pid; 302 INIT_LIST_HEAD(&s->list); 303 s->valid = 1; 304 305 listeners = &per_cpu(listener_array, cpu); 306 down_write(&listeners->sem); 307 list_add(&s->list, &listeners->list); 308 up_write(&listeners->sem); 309 } 310 return 0; 311 } 312 313 /* Deregister or cleanup */ 314 cleanup: 315 for_each_cpu(cpu, mask) { 316 listeners = &per_cpu(listener_array, cpu); 317 down_write(&listeners->sem); 318 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 319 if (s->pid == pid) { 320 list_del(&s->list); 321 kfree(s); 322 break; 323 } 324 } 325 up_write(&listeners->sem); 326 } 327 return 0; 328 } 329 330 static int parse(struct nlattr *na, struct cpumask *mask) 331 { 332 char *data; 333 int len; 334 int ret; 335 336 if (na == NULL) 337 return 1; 338 len = nla_len(na); 339 if (len > TASKSTATS_CPUMASK_MAXLEN) 340 return -E2BIG; 341 if (len < 1) 342 return -EINVAL; 343 data = kmalloc(len, GFP_KERNEL); 344 if (!data) 345 return -ENOMEM; 346 nla_strlcpy(data, na, len); 347 ret = cpulist_parse(data, mask); 348 kfree(data); 349 return ret; 350 } 351 352 static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) 353 { 354 struct nlattr *na, *ret; 355 int aggr; 356 357 /* If we don't pad, we end up with alignment on a 4 byte boundary. 358 * This causes lots of runtime warnings on systems requiring 8 byte 359 * alignment */ 360 u32 pids[2] = { pid, 0 }; 361 int pid_size = ALIGN(sizeof(pid), sizeof(long)); 362 363 aggr = (type == TASKSTATS_TYPE_PID) 364 ? TASKSTATS_TYPE_AGGR_PID 365 : TASKSTATS_TYPE_AGGR_TGID; 366 367 na = nla_nest_start(skb, aggr); 368 if (!na) 369 goto err; 370 if (nla_put(skb, type, pid_size, pids) < 0) 371 goto err; 372 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); 373 if (!ret) 374 goto err; 375 nla_nest_end(skb, na); 376 377 return nla_data(ret); 378 err: 379 return NULL; 380 } 381 382 static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 383 { 384 int rc = 0; 385 struct sk_buff *rep_skb; 386 struct cgroupstats *stats; 387 struct nlattr *na; 388 size_t size; 389 u32 fd; 390 struct file *file; 391 int fput_needed; 392 393 na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; 394 if (!na) 395 return -EINVAL; 396 397 fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); 398 file = fget_light(fd, &fput_needed); 399 if (!file) 400 return 0; 401 402 size = nla_total_size(sizeof(struct cgroupstats)); 403 404 rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb, 405 size); 406 if (rc < 0) 407 goto err; 408 409 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, 410 sizeof(struct cgroupstats)); 411 stats = nla_data(na); 412 memset(stats, 0, sizeof(*stats)); 413 414 rc = cgroupstats_build(stats, file->f_dentry); 415 if (rc < 0) { 416 nlmsg_free(rep_skb); 417 goto err; 418 } 419 420 rc = send_reply(rep_skb, info); 421 422 err: 423 fput_light(file, fput_needed); 424 return rc; 425 } 426 427 static int cmd_attr_register_cpumask(struct genl_info *info) 428 { 429 cpumask_var_t mask; 430 int rc; 431 432 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 433 return -ENOMEM; 434 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); 435 if (rc < 0) 436 goto out; 437 rc = add_del_listener(info->snd_pid, mask, REGISTER); 438 out: 439 free_cpumask_var(mask); 440 return rc; 441 } 442 443 static int cmd_attr_deregister_cpumask(struct genl_info *info) 444 { 445 cpumask_var_t mask; 446 int rc; 447 448 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 449 return -ENOMEM; 450 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); 451 if (rc < 0) 452 goto out; 453 rc = add_del_listener(info->snd_pid, mask, DEREGISTER); 454 out: 455 free_cpumask_var(mask); 456 return rc; 457 } 458 459 static int cmd_attr_pid(struct genl_info *info) 460 { 461 struct taskstats *stats; 462 struct sk_buff *rep_skb; 463 size_t size; 464 u32 pid; 465 int rc; 466 467 size = nla_total_size(sizeof(u32)) + 468 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 469 470 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 471 if (rc < 0) 472 return rc; 473 474 rc = -EINVAL; 475 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); 476 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); 477 if (!stats) 478 goto err; 479 480 rc = fill_stats_for_pid(pid, stats); 481 if (rc < 0) 482 goto err; 483 return send_reply(rep_skb, info); 484 err: 485 nlmsg_free(rep_skb); 486 return rc; 487 } 488 489 static int cmd_attr_tgid(struct genl_info *info) 490 { 491 struct taskstats *stats; 492 struct sk_buff *rep_skb; 493 size_t size; 494 u32 tgid; 495 int rc; 496 497 size = nla_total_size(sizeof(u32)) + 498 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 499 500 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 501 if (rc < 0) 502 return rc; 503 504 rc = -EINVAL; 505 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); 506 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); 507 if (!stats) 508 goto err; 509 510 rc = fill_stats_for_tgid(tgid, stats); 511 if (rc < 0) 512 goto err; 513 return send_reply(rep_skb, info); 514 err: 515 nlmsg_free(rep_skb); 516 return rc; 517 } 518 519 static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 520 { 521 if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK]) 522 return cmd_attr_register_cpumask(info); 523 else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK]) 524 return cmd_attr_deregister_cpumask(info); 525 else if (info->attrs[TASKSTATS_CMD_ATTR_PID]) 526 return cmd_attr_pid(info); 527 else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) 528 return cmd_attr_tgid(info); 529 else 530 return -EINVAL; 531 } 532 533 static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) 534 { 535 struct signal_struct *sig = tsk->signal; 536 struct taskstats *stats; 537 538 if (sig->stats || thread_group_empty(tsk)) 539 goto ret; 540 541 /* No problem if kmem_cache_zalloc() fails */ 542 stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); 543 544 spin_lock_irq(&tsk->sighand->siglock); 545 if (!sig->stats) { 546 sig->stats = stats; 547 stats = NULL; 548 } 549 spin_unlock_irq(&tsk->sighand->siglock); 550 551 if (stats) 552 kmem_cache_free(taskstats_cache, stats); 553 ret: 554 return sig->stats; 555 } 556 557 /* Send pid data out on exit */ 558 void taskstats_exit(struct task_struct *tsk, int group_dead) 559 { 560 int rc; 561 struct listener_list *listeners; 562 struct taskstats *stats; 563 struct sk_buff *rep_skb; 564 size_t size; 565 int is_thread_group; 566 567 if (!family_registered) 568 return; 569 570 /* 571 * Size includes space for nested attributes 572 */ 573 size = nla_total_size(sizeof(u32)) + 574 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 575 576 is_thread_group = !!taskstats_tgid_alloc(tsk); 577 if (is_thread_group) { 578 /* PID + STATS + TGID + STATS */ 579 size = 2 * size; 580 /* fill the tsk->signal->stats structure */ 581 fill_tgid_exit(tsk); 582 } 583 584 listeners = &__raw_get_cpu_var(listener_array); 585 if (list_empty(&listeners->list)) 586 return; 587 588 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size); 589 if (rc < 0) 590 return; 591 592 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid); 593 if (!stats) 594 goto err; 595 596 fill_stats(tsk, stats); 597 598 /* 599 * Doesn't matter if tsk is the leader or the last group member leaving 600 */ 601 if (!is_thread_group || !group_dead) 602 goto send; 603 604 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid); 605 if (!stats) 606 goto err; 607 608 memcpy(stats, tsk->signal->stats, sizeof(*stats)); 609 610 send: 611 send_cpu_listeners(rep_skb, listeners); 612 return; 613 err: 614 nlmsg_free(rep_skb); 615 } 616 617 static struct genl_ops taskstats_ops = { 618 .cmd = TASKSTATS_CMD_GET, 619 .doit = taskstats_user_cmd, 620 .policy = taskstats_cmd_get_policy, 621 }; 622 623 static struct genl_ops cgroupstats_ops = { 624 .cmd = CGROUPSTATS_CMD_GET, 625 .doit = cgroupstats_user_cmd, 626 .policy = cgroupstats_cmd_get_policy, 627 }; 628 629 /* Needed early in initialization */ 630 void __init taskstats_init_early(void) 631 { 632 unsigned int i; 633 634 taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC); 635 for_each_possible_cpu(i) { 636 INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); 637 init_rwsem(&(per_cpu(listener_array, i).sem)); 638 } 639 } 640 641 static int __init taskstats_init(void) 642 { 643 int rc; 644 645 rc = genl_register_family(&family); 646 if (rc) 647 return rc; 648 649 rc = genl_register_ops(&family, &taskstats_ops); 650 if (rc < 0) 651 goto err; 652 653 rc = genl_register_ops(&family, &cgroupstats_ops); 654 if (rc < 0) 655 goto err_cgroup_ops; 656 657 family_registered = 1; 658 printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); 659 return 0; 660 err_cgroup_ops: 661 genl_unregister_ops(&family, &taskstats_ops); 662 err: 663 genl_unregister_family(&family); 664 return rc; 665 } 666 667 /* 668 * late initcall ensures initialization of statistics collection 669 * mechanisms precedes initialization of the taskstats interface 670 */ 671 late_initcall(taskstats_init); 672