/*
 * taskstats.c - Export per-task statistics to userland
 *
 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
 *           (C) Balbir Singh,   IBM Corp. 2006
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 */

#include <linux/kernel.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <net/genetlink.h>
#include <asm/atomic.h>

/*
 * Maximum length of a cpumask that can be specified in
 * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute
 */
#define TASKSTATS_CPUMASK_MAXLEN	(100+6*NR_CPUS)

static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
static int family_registered;
kmem_cache_t *taskstats_cache;

static struct genl_family family = {
	.id		= GENL_ID_GENERATE,
	.name		= TASKSTATS_GENL_NAME,
	.version	= TASKSTATS_GENL_VERSION,
	.maxattr	= TASKSTATS_CMD_ATTR_MAX,
};

static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1]
__read_mostly = {
	[TASKSTATS_CMD_ATTR_PID]  = { .type = NLA_U32 },
	[TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
	[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
	[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },
};

struct listener {
	struct list_head list;
	pid_t pid;
	char valid;
};

struct listener_list {
	struct rw_semaphore sem;
	struct list_head list;
};
static DEFINE_PER_CPU(struct listener_list, listener_array);

enum actions {
	REGISTER,
	DEREGISTER,
	CPU_DONT_CARE
};
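
/*
 * Userspace view of this interface, for reference: a minimal sketch,
 * not part of the kernel build; the struct name and buffer size are
 * illustrative assumptions.  A client opens a NETLINK_GENERIC socket,
 * resolves the family id by sending CTRL_CMD_GETFAMILY with
 * CTRL_ATTR_FAMILY_NAME = TASKSTATS_GENL_NAME to GENL_ID_CTRL, and
 * then issues TASKSTATS_CMD_GET:
 *
 *	struct {
 *		struct nlmsghdr n;
 *		struct genlmsghdr g;
 *		char buf[1024];
 *	} req;
 *
 *	req.n.nlmsg_type = family_id;	(from CTRL_ATTR_FAMILY_ID)
 *	req.n.nlmsg_flags = NLM_F_REQUEST;
 *	req.g.cmd = TASKSTATS_CMD_GET;
 *
 * followed by one attribute accepted by taskstats_cmd_get_policy
 * above: TASKSTATS_CMD_ATTR_PID/TGID (u32) for a one-shot query, or a
 * cpu list string in TASKSTATS_CMD_ATTR_REGISTER_CPUMASK to start
 * receiving per-CPU exit events.  Replies come back as
 * TASKSTATS_CMD_NEW messages.
 */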
static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
			void **replyp, size_t size)
{
	struct sk_buff *skb;
	void *reply;

	/*
	 * If new attributes are added, please revisit this allocation
	 */
	skb = nlmsg_new(size);
	if (!skb)
		return -ENOMEM;

	if (!info) {
		int seq = get_cpu_var(taskstats_seqnum)++;
		put_cpu_var(taskstats_seqnum);

		reply = genlmsg_put(skb, 0, seq,
				family.id, 0, 0,
				cmd, family.version);
	} else
		reply = genlmsg_put(skb, info->snd_pid, info->snd_seq,
				family.id, 0, 0,
				cmd, family.version);
	if (reply == NULL) {
		nlmsg_free(skb);
		return -EINVAL;
	}

	*skbp = skb;
	*replyp = reply;
	return 0;
}

/*
 * Send taskstats data in @skb to listener with nl_pid @pid
 */
static int send_reply(struct sk_buff *skb, pid_t pid)
{
	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
	void *reply = genlmsg_data(genlhdr);
	int rc;

	rc = genlmsg_end(skb, reply);
	if (rc < 0) {
		nlmsg_free(skb);
		return rc;
	}

	return genlmsg_unicast(skb, pid);
}

/*
 * Send taskstats data in @skb to listeners registered for @cpu's exit data
 */
static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
{
	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
	struct listener_list *listeners;
	struct listener *s, *tmp;
	struct sk_buff *skb_next, *skb_cur = skb;
	void *reply = genlmsg_data(genlhdr);
	int rc, delcount = 0;

	rc = genlmsg_end(skb, reply);
	if (rc < 0) {
		nlmsg_free(skb);
		return;
	}

	rc = 0;
	listeners = &per_cpu(listener_array, cpu);
	down_read(&listeners->sem);
	list_for_each_entry(s, &listeners->list, list) {
		skb_next = NULL;
		if (!list_is_last(&s->list, &listeners->list)) {
			skb_next = skb_clone(skb_cur, GFP_KERNEL);
			if (!skb_next)
				break;
		}
		rc = genlmsg_unicast(skb_cur, s->pid);
		if (rc == -ECONNREFUSED) {
			s->valid = 0;
			delcount++;
		}
		skb_cur = skb_next;
	}
	up_read(&listeners->sem);

	if (skb_cur)
		nlmsg_free(skb_cur);

	if (!delcount)
		return;

	/* Delete invalidated entries */
	down_write(&listeners->sem);
	list_for_each_entry_safe(s, tmp, &listeners->list, list) {
		if (!s->valid) {
			list_del(&s->list);
			kfree(s);
		}
	}
	up_write(&listeners->sem);
}

static int fill_pid(pid_t pid, struct task_struct *pidtsk,
		struct taskstats *stats)
{
	int rc = 0;
	struct task_struct *tsk = pidtsk;

	if (!pidtsk) {
		read_lock(&tasklist_lock);
		tsk = find_task_by_pid(pid);
		if (!tsk) {
			read_unlock(&tasklist_lock);
			return -ESRCH;
		}
		get_task_struct(tsk);
		read_unlock(&tasklist_lock);
	} else
		get_task_struct(tsk);

	/*
	 * Each accounting subsystem adds calls to its functions to
	 * fill in relevant parts of struct taskstats as follows
	 *
	 *	per-task-foo(stats, tsk);
	 */

	delayacct_add_tsk(stats, tsk);
	stats->version = TASKSTATS_VERSION;

	/* Define err: label here if needed */
	put_task_struct(tsk);
	return rc;
}

static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
		struct taskstats *stats)
{
	struct task_struct *tsk, *first;
	unsigned long flags;

	/*
	 * Add additional stats from live tasks except zombie thread group
	 * leaders who are already counted with the dead tasks
	 */
	first = tgidtsk;
	if (!first) {
		read_lock(&tasklist_lock);
		first = find_task_by_pid(tgid);
		if (!first) {
			read_unlock(&tasklist_lock);
			return -ESRCH;
		}
		get_task_struct(first);
		read_unlock(&tasklist_lock);
	} else
		get_task_struct(first);

	/* Start with stats from dead tasks */
	spin_lock_irqsave(&first->signal->stats_lock, flags);
	if (first->signal->stats)
		memcpy(stats, first->signal->stats, sizeof(*stats));
	spin_unlock_irqrestore(&first->signal->stats_lock, flags);

	tsk = first;
	read_lock(&tasklist_lock);
	do {
		if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk))
			continue;
		/*
		 * Accounting subsystem can call its functions here to
		 * fill in relevant parts of struct taskstats as follows
		 *
		 *	per-task-foo(stats, tsk);
		 */
		delayacct_add_tsk(stats, tsk);

	} while_each_thread(first, tsk);
	read_unlock(&tasklist_lock);
	stats->version = TASKSTATS_VERSION;

	/*
	 * Accounting subsystems can also add calls here to modify
	 * fields of taskstats.
	 */

	/* Drop the reference taken on first above */
	put_task_struct(first);
	return 0;
}
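
/*
 * Accumulate the exiting task's per-task statistics into the
 * cumulative per-tgid structure (tsk->signal->stats), if the thread
 * group has one; called on the exit path before the stats are sent.
 */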
static void fill_tgid_exit(struct task_struct *tsk)
{
	unsigned long flags;

	spin_lock_irqsave(&tsk->signal->stats_lock, flags);
	if (!tsk->signal->stats)
		goto ret;

	/*
	 * Each accounting subsystem calls its functions here to
	 * accumulate its per-task stats for tsk, into the per-tgid structure
	 *
	 *	per-task-foo(tsk->signal->stats, tsk);
	 */
	delayacct_add_tsk(tsk->signal->stats, tsk);
ret:
	spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
	return;
}

static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
{
	struct listener_list *listeners;
	struct listener *s, *tmp;
	unsigned int cpu;
	cpumask_t mask = *maskp;

	if (!cpus_subset(mask, cpu_possible_map))
		return -EINVAL;

	if (isadd == REGISTER) {
		for_each_cpu_mask(cpu, mask) {
			s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
					 cpu_to_node(cpu));
			if (!s)
				goto cleanup;
			s->pid = pid;
			INIT_LIST_HEAD(&s->list);
			s->valid = 1;

			listeners = &per_cpu(listener_array, cpu);
			down_write(&listeners->sem);
			list_add(&s->list, &listeners->list);
			up_write(&listeners->sem);
		}
		return 0;
	}

	/* Deregister or cleanup */
cleanup:
	for_each_cpu_mask(cpu, mask) {
		listeners = &per_cpu(listener_array, cpu);
		down_write(&listeners->sem);
		list_for_each_entry_safe(s, tmp, &listeners->list, list) {
			if (s->pid == pid) {
				list_del(&s->list);
				kfree(s);
				break;
			}
		}
		up_write(&listeners->sem);
	}
	return 0;
}

static int parse(struct nlattr *na, cpumask_t *mask)
{
	char *data;
	int len;
	int ret;

	if (na == NULL)
		return 1;
	len = nla_len(na);
	if (len > TASKSTATS_CPUMASK_MAXLEN)
		return -E2BIG;
	if (len < 1)
		return -EINVAL;
	data = kmalloc(len, GFP_KERNEL);
	if (!data)
		return -ENOMEM;
	nla_strlcpy(data, na, len);
	ret = cpulist_parse(data, *mask);
	kfree(data);
	return ret;
}
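
/*
 * Handle TASKSTATS_CMD_GET: register/deregister the sender as a
 * listener for the cpus given in a cpumask attribute, or build and
 * unicast a TASKSTATS_CMD_NEW reply carrying the stats of the
 * pid/tgid named in the request.
 */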
static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
{
	int rc = 0;
	struct sk_buff *rep_skb;
	struct taskstats stats;
	void *reply;
	size_t size;
	struct nlattr *na;
	cpumask_t mask;

	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
	if (rc < 0)
		return rc;
	if (rc == 0)
		return add_del_listener(info->snd_pid, &mask, REGISTER);

	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask);
	if (rc < 0)
		return rc;
	if (rc == 0)
		return add_del_listener(info->snd_pid, &mask, DEREGISTER);

	/*
	 * Size includes space for nested attributes
	 */
	size = nla_total_size(sizeof(u32)) +
		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);

	memset(&stats, 0, sizeof(stats));
	rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
	if (rc < 0)
		return rc;

	if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
		u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
		rc = fill_pid(pid, NULL, &stats);
		if (rc < 0)
			goto err;

		na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
		NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid);
		NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
				stats);
	} else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
		u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
		rc = fill_tgid(tgid, NULL, &stats);
		if (rc < 0)
			goto err;

		na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
		NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid);
		NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
				stats);
	} else {
		rc = -EINVAL;
		goto err;
	}

	nla_nest_end(rep_skb, na);

	return send_reply(rep_skb, info->snd_pid);

nla_put_failure:
	return genlmsg_cancel(rep_skb, reply);
err:
	nlmsg_free(rep_skb);
	return rc;
}

void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
{
	struct listener_list *listeners;
	struct taskstats *tmp;

	/*
	 * This is the cpu on which the task is exiting currently and will
	 * be the one for which the exit event is sent, even if the cpu
	 * on which this function is running changes later.
	 */
	*mycpu = raw_smp_processor_id();

	*ptidstats = NULL;
	tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
	if (!tmp)
		return;

	listeners = &per_cpu(listener_array, *mycpu);
	down_read(&listeners->sem);
	if (!list_empty(&listeners->list)) {
		*ptidstats = tmp;
		tmp = NULL;
	}
	up_read(&listeners->sem);
	/* Return unused memory to the cache it came from, not via kfree() */
	if (tmp)
		kmem_cache_free(taskstats_cache, tmp);
}
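
/*
 * Layout of the exit message built below: a TASKSTATS_TYPE_AGGR_PID
 * nest holding TASKSTATS_TYPE_PID (u32) and TASKSTATS_TYPE_STATS
 * (struct taskstats), followed, when the last member of a thread
 * group exits, by an equivalent TASKSTATS_TYPE_AGGR_TGID nest with
 * the group-wide totals.
 */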
/* Send pid data out on exit */
void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
			int group_dead, unsigned int mycpu)
{
	int rc;
	struct sk_buff *rep_skb;
	void *reply;
	size_t size;
	int is_thread_group;
	struct nlattr *na;
	unsigned long flags;

	if (!family_registered || !tidstats)
		return;

	spin_lock_irqsave(&tsk->signal->stats_lock, flags);
	is_thread_group = tsk->signal->stats ? 1 : 0;
	spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);

	rc = 0;
	/*
	 * Size includes space for nested attributes
	 */
	size = nla_total_size(sizeof(u32)) +
		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);

	if (is_thread_group)
		size = 2 * size;	/* PID + STATS + TGID + STATS */

	rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
	if (rc < 0)
		goto ret;

	rc = fill_pid(tsk->pid, tsk, tidstats);
	if (rc < 0)
		goto err_skb;

	na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
	NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid);
	NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
			*tidstats);
	nla_nest_end(rep_skb, na);

	if (!is_thread_group)
		goto send;

	/*
	 * tsk has/had a thread group, so fill in tsk->signal->stats.
	 * It doesn't matter whether tsk is the group leader or the last
	 * group member leaving.
	 */

	fill_tgid_exit(tsk);
	if (!group_dead)
		goto send;

	na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
	NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid);
	/* No locking needed for tsk->signal->stats since group is dead */
	NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
			*tsk->signal->stats);
	nla_nest_end(rep_skb, na);

send:
	send_cpu_listeners(rep_skb, mycpu);
	return;

nla_put_failure:
	genlmsg_cancel(rep_skb, reply);
	goto ret;
err_skb:
	nlmsg_free(rep_skb);
ret:
	return;
}

static struct genl_ops taskstats_ops = {
	.cmd		= TASKSTATS_CMD_GET,
	.doit		= taskstats_user_cmd,
	.policy		= taskstats_cmd_get_policy,
};

/* Needed early in initialization */
void __init taskstats_init_early(void)
{
	unsigned int i;

	taskstats_cache = kmem_cache_create("taskstats_cache",
						sizeof(struct taskstats),
						0, SLAB_PANIC, NULL, NULL);
	for_each_possible_cpu(i) {
		INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
		init_rwsem(&(per_cpu(listener_array, i).sem));
	}
}

static int __init taskstats_init(void)
{
	int rc;

	rc = genl_register_family(&family);
	if (rc)
		return rc;

	rc = genl_register_ops(&family, &taskstats_ops);
	if (rc < 0)
		goto err;

	family_registered = 1;
	return 0;
err:
	genl_unregister_family(&family);
	return rc;
}

/*
 * late initcall ensures initialization of statistics collection
 * mechanisms precedes initialization of the taskstats interface
 */
late_initcall(taskstats_init);