/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

#include <sys/atomic.h>
#include <sys/callb.h>
#include <sys/cmn_err.h>
#include <sys/exacct.h>
#include <sys/id_space.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/modhash.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/project.h>
#include <sys/rctl.h>
#include <sys/systm.h>
#include <sys/task.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/zone.h>
#include <sys/cpuvar.h>
#include <sys/fss.h>
#include <sys/class.h>

/*
 * Tasks
 *
 * A task is a collection of processes, associated with a common project ID
 * and related by a common initial parent.  The task primarily represents a
 * natural process sequence with known resource usage, although it can also
 * be viewed as a convenient grouping of processes for signal delivery,
 * processor binding, and administrative operations.
 *
 * Membership and observership
 *   We can conceive of situations where processes outside of the task may
 *   wish to examine the resource usage of the task.  Similarly, a number of
 *   the administrative operations on a task can be performed by processes
 *   that are not members of the task.  Accordingly, we must design a locking
 *   strategy where observers of the task, who wish to examine or operate on
 *   the task, and members of the task, who can perform the mentioned
 *   operations as well as leave the task, see a consistent and correct
 *   representation of the task at all times.
 *
 * Locking
 *   Because task membership is a new relation between processes, its locking
 *   becomes an additional responsibility of the pidlock/p_lock locking
 *   sequence; however, tasks closely resemble sessions and the session
 *   locking model is mostly appropriate for the interaction of tasks,
 *   processes, and procfs.
 *
 *   kmutex_t task_hash_lock
 *     task_hash_lock is a global lock protecting the contents of the task
 *     ID-to-task pointer hash.  Holders of task_hash_lock must not attempt
 *     to acquire pidlock or p_lock.
 *   uint_t tk_hold_count
 *     tk_hold_count, the number of members and observers of the current
 *     task, must be manipulated atomically.
 *   proc_t *tk_memb_list
 *   proc_t *p_tasknext
 *   proc_t *p_taskprev
 *     The task's membership list is protected by pidlock, and is therefore
 *     always acquired before any of its members' p_lock mutexes.  The p_task
 *     member of the proc structure is protected by pidlock or p_lock for
 *     reading, and by both pidlock and p_lock for modification, as is done
 *     for p_sessp.  The key point is that only the process itself can modify
 *     its own p_task; no other entity on the system may do so.  (/proc will
 *     use prlock() to prevent the process from leaving, as opposed to
 *     pidlock.)
 *   kmutex_t tk_usage_lock
 *     tk_usage_lock is a per-task lock protecting the contents of the task
 *     usage structure and the tk_nlwps counter for the task.max-lwps
 *     resource control.
 */
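
/*
 * For illustration, a hedged sketch of the observer protocol described
 * above; the variable names are hypothetical.  An observer reads another
 * process's p_task while holding pidlock (or that process's p_lock), and
 * takes a hold before dropping the lock so that the task cannot vanish
 * underneath it:
 *
 *	task_t *tk;
 *
 *	mutex_enter(&pidlock);
 *	tk = p->p_task;
 *	task_hold(tk);
 *	mutex_exit(&pidlock);
 *
 *	(examine tk; note that task_hash_lock must not be held here)
 *
 *	task_rele(tk);
 */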

int task_hash_size = 256;
static kmutex_t task_hash_lock;
static mod_hash_t *task_hash;

static id_space_t *taskid_space;	/* global taskid space */
static kmem_cache_t *task_cache;	/* kmem cache for task structures */

rctl_hndl_t rc_task_lwps;
rctl_hndl_t rc_task_nprocs;
rctl_hndl_t rc_task_cpu_time;

/*
 * Resource usage is committed using task queues; if taskq_dispatch() fails
 * due to resource constraints, the task is placed on a list for background
 * processing by the task_commit_thread() backup thread.
 */
static kmutex_t task_commit_lock;	/* protects list pointers and cv */
static kcondvar_t task_commit_cv;	/* wakeup task_commit_thread */
static task_t *task_commit_head = NULL;
static task_t *task_commit_tail = NULL;
kthread_t *task_commit_thread;

static void task_commit(void);
static kstat_t *task_kstat_create(task_t *, zone_t *);
static void task_kstat_delete(task_t *);

/*
 * static rctl_qty_t task_lwps_usage(rctl_t *r, proc_t *p)
 *
 * Overview
 *   task_lwps_usage() is the usage operation for the resource control
 *   associated with the number of LWPs in a task.
 *
 * Return values
 *   The number of LWPs in the given task is returned.
 *
 * Caller's context
 *   p->p_lock must be held across the call.
 */
/*ARGSUSED*/
static rctl_qty_t
task_lwps_usage(rctl_t *r, proc_t *p)
{
	task_t *t;
	rctl_qty_t nlwps;

	ASSERT(MUTEX_HELD(&p->p_lock));

	t = p->p_task;
	mutex_enter(&p->p_zone->zone_nlwps_lock);
	nlwps = t->tk_nlwps;
	mutex_exit(&p->p_zone->zone_nlwps_lock);

	return (nlwps);
}

/*
 * static int task_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
 *   rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
 *
 * Overview
 *   task_lwps_test() is the test-if-valid-increment operation for the
 *   resource control associated with the number of LWPs in a task.
 *
 * Return values
 *   0 if the threshold limit was not passed, 1 if the limit was passed.
 *
 * Caller's context
 *   p->p_lock must be held across the call.
 */
/*ARGSUSED*/
static int
task_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
    rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t nlwps;

	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_TASK);
	if (e->rcep_p.task == NULL)
		return (0);

	ASSERT(MUTEX_HELD(&(e->rcep_p.task->tk_zone->zone_nlwps_lock)));
	nlwps = e->rcep_p.task->tk_nlwps;

	if (nlwps + incr > rcntl->rcv_value)
		return (1);

	return (0);
}

/*ARGSUSED*/
static int
task_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_TASK);
	if (e->rcep_p.task == NULL)
		return (0);

	e->rcep_p.task->tk_nlwps_ctl = nv;
	return (0);
}
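
/*
 * For illustration, a hedged sketch of how the task.max-lwps control is
 * consulted from the caller's side; this is modeled on the lwp_create()
 * path, but the failure handling shown here is hypothetical:
 *
 *	mutex_enter(&p->p_lock);
 *	mutex_enter(&p->p_zone->zone_nlwps_lock);
 *	if (p->p_task->tk_nlwps >= p->p_task->tk_nlwps_ctl &&
 *	    (rctl_test(rc_task_lwps, p->p_task->tk_rctls, p, 1, 0) &
 *	    RCT_DENY)) {
 *		(refuse to create the new LWP)
 *	}
 *	mutex_exit(&p->p_zone->zone_nlwps_lock);
 *	mutex_exit(&p->p_lock);
 */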

/*ARGSUSED*/
static rctl_qty_t
task_nprocs_usage(rctl_t *r, proc_t *p)
{
	task_t *t;
	rctl_qty_t nprocs;

	ASSERT(MUTEX_HELD(&p->p_lock));

	t = p->p_task;
	mutex_enter(&p->p_zone->zone_nlwps_lock);
	nprocs = t->tk_nprocs;
	mutex_exit(&p->p_zone->zone_nlwps_lock);

	return (nprocs);
}

/*ARGSUSED*/
static int
task_nprocs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
    rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t nprocs;

	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_TASK);
	if (e->rcep_p.task == NULL)
		return (0);

	ASSERT(MUTEX_HELD(&(e->rcep_p.task->tk_zone->zone_nlwps_lock)));
	nprocs = e->rcep_p.task->tk_nprocs;

	if (nprocs + incr > rcntl->rcv_value)
		return (1);

	return (0);
}

/*ARGSUSED*/
static int
task_nprocs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_TASK);
	if (e->rcep_p.task == NULL)
		return (0);

	e->rcep_p.task->tk_nprocs_ctl = nv;
	return (0);
}

/*
 * static rctl_qty_t task_cpu_time_usage(rctl_t *r, proc_t *p)
 *
 * Overview
 *   task_cpu_time_usage() is the usage operation for the resource control
 *   associated with the total accrued CPU seconds for a task.
 *
 * Return values
 *   The number of CPU seconds consumed by the task is returned.
 *
 * Caller's context
 *   p->p_lock must be held across the call.
 */
/*ARGSUSED*/
static rctl_qty_t
task_cpu_time_usage(rctl_t *r, proc_t *p)
{
	task_t *t = p->p_task;

	ASSERT(MUTEX_HELD(&p->p_lock));
	return (t->tk_cpu_time);
}

/*
 * rctl_qty_t task_cpu_time_incr(task_t *t, rctl_qty_t incr)
 *
 * Overview
 *   task_cpu_time_incr() increments the amount of CPU time used
 *   by this task.
 *
 * Return values
 *   The task's updated CPU usage in seconds, if the increment carries the
 *   accumulated tick count across a one-second boundary; 0 otherwise.
 *
 * Caller's context
 *   This is called by the clock tick accounting function to charge
 *   CPU time to a task.
 */
rctl_qty_t
task_cpu_time_incr(task_t *t, rctl_qty_t incr)
{
	rctl_qty_t ret = 0;

	mutex_enter(&t->tk_cpu_time_lock);
	t->tk_cpu_ticks += incr;
	if (t->tk_cpu_ticks >= hz) {
		t->tk_cpu_time += t->tk_cpu_ticks / hz;
		t->tk_cpu_ticks = t->tk_cpu_ticks % hz;
		ret = t->tk_cpu_time;
	}
	mutex_exit(&t->tk_cpu_time_lock);

	return (ret);
}

/*
 * static int task_cpu_time_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
 *   struct rctl_val *rcntl, rctl_qty_t incr, uint_t flags)
 *
 * Overview
 *   task_cpu_time_test() is the test-if-valid-increment operation for the
 *   resource control associated with the total accrued CPU seconds for a
 *   task.
 *
 * Return values
 *   0 if the threshold limit was not passed, 1 if the limit was passed.
 *
 * Caller's context
 *   p->p_lock must be held across the call.
 */
/*ARGSUSED*/
static int
task_cpu_time_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
    struct rctl_val *rcntl, rctl_qty_t incr, uint_t flags)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_TASK);
	if (e->rcep_p.task == NULL)
		return (0);

	if (incr >= rcntl->rcv_value)
		return (1);

	return (0);
}
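
/*
 * A hedged sketch of the tick-accounting path described above: the clock
 * tick handler charges ticks to the running process's task and, once a
 * full second has accumulated, retests the task.max-cpu-time control.
 * The variable names and the exact caller context are illustrative only;
 * the real caller's sequence may differ in detail.
 *
 *	rctl_qty_t secs;
 *	rctl_entity_p_t e;
 *
 *	secs = task_cpu_time_incr(p->p_task, ticks);
 *	if (secs != 0) {
 *		mutex_enter(&p->p_lock);
 *		e.rcep_p.task = p->p_task;
 *		e.rcep_t = RCENTITY_TASK;
 *		(void) rctl_test_entity(rc_task_cpu_time,
 *		    p->p_task->tk_rctls, p, &e, secs, RCA_UNSAFE_SIGINFO);
 *		mutex_exit(&p->p_lock);
 *	}
 */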

static task_t *
task_find(taskid_t id, zoneid_t zoneid)
{
	task_t *tk;

	ASSERT(MUTEX_HELD(&task_hash_lock));

	if (mod_hash_find(task_hash, (mod_hash_key_t)(uintptr_t)id,
	    (mod_hash_val_t *)&tk) == MH_ERR_NOTFOUND ||
	    (zoneid != ALL_ZONES && zoneid != tk->tk_zone->zone_id))
		return (NULL);

	return (tk);
}

/*
 * task_hold_by_id(), task_hold_by_id_zone()
 *
 * Overview
 *   task_hold_by_id() is used to take a reference on a task by its task id,
 *   supporting the various system call interfaces for obtaining resource
 *   data, delivering signals, and so forth.
 *
 * Return values
 *   Returns a pointer to the task_t with taskid_t id.  The task is returned
 *   with its hold count incremented by one.  Returns NULL if there is no
 *   task with the requested id.
 *
 * Caller's context
 *   Caller must not be holding task_hash_lock.  No restrictions on context.
 */
task_t *
task_hold_by_id_zone(taskid_t id, zoneid_t zoneid)
{
	task_t *tk;

	mutex_enter(&task_hash_lock);
	if ((tk = task_find(id, zoneid)) != NULL)
		atomic_inc_32(&tk->tk_hold_count);
	mutex_exit(&task_hash_lock);

	return (tk);
}

task_t *
task_hold_by_id(taskid_t id)
{
	zoneid_t zoneid;

	if (INGLOBALZONE(curproc))
		zoneid = ALL_ZONES;
	else
		zoneid = getzoneid();
	return (task_hold_by_id_zone(id, zoneid));
}

/*
 * void task_hold(task_t *)
 *
 * Overview
 *   task_hold() is used to take an additional reference to the given task.
 *
 * Return values
 *   None.
 *
 * Caller's context
 *   No restriction on context.
 */
void
task_hold(task_t *tk)
{
	atomic_inc_32(&tk->tk_hold_count);
}
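
/*
 * A hedged usage sketch: system-call code resolving a task ID supplied by
 * userland, examining the task, and dropping its hold.  The surrounding
 * error handling is hypothetical.
 *
 *	task_t *tk;
 *
 *	if ((tk = task_hold_by_id(id)) == NULL)
 *		return (set_errno(ESRCH));
 *
 *	(read usage, deliver signals, and so forth)
 *
 *	task_rele(tk);	(task_hash_lock must not be held here)
 */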

/*
 * void task_rele(task_t *)
 *
 * Overview
 *   task_rele() relinquishes a reference on the given task, which was
 *   acquired via task_hold() or task_hold_by_id().  If this is the last
 *   member or observer of the task, dispatch it for commitment via the
 *   accounting subsystem.
 *
 * Return values
 *   None.
 *
 * Caller's context
 *   Caller must not be holding the task_hash_lock.
 */
void
task_rele(task_t *tk)
{
	mutex_enter(&task_hash_lock);
	if (atomic_add_32_nv(&tk->tk_hold_count, -1) > 0) {
		mutex_exit(&task_hash_lock);
		return;
	}

	ASSERT(tk->tk_nprocs == 0);

	mutex_enter(&tk->tk_zone->zone_nlwps_lock);
	tk->tk_proj->kpj_ntasks--;
	mutex_exit(&tk->tk_zone->zone_nlwps_lock);

	task_kstat_delete(tk);

	if (mod_hash_destroy(task_hash,
	    (mod_hash_key_t)(uintptr_t)tk->tk_tkid) != 0)
		panic("unable to delete task %d", tk->tk_tkid);
	mutex_exit(&task_hash_lock);

	/*
	 * At this point, there are no members or observers of the task, so we
	 * can safely send it on for commitment to the accounting subsystem.
	 * The task will be destroyed in task_end() subsequent to commitment.
	 * Since we may be called with pidlock held, taskq_dispatch() cannot
	 * sleep.  Commitment is handled by a backup thread in case
	 * dispatching the task fails.
	 */
	if (taskq_dispatch(exacct_queue, exacct_commit_task, tk,
	    TQ_NOSLEEP | TQ_NOQUEUE) == TASKQID_INVALID) {
		mutex_enter(&task_commit_lock);
		if (task_commit_head == NULL) {
			task_commit_head = task_commit_tail = tk;
		} else {
			task_commit_tail->tk_commit_next = tk;
			task_commit_tail = tk;
		}
		cv_signal(&task_commit_cv);
		mutex_exit(&task_commit_lock);
	}
}

/*
 * task_t *task_create(projid_t, zone_t *)
 *
 * Overview
 *   A process constructing a new task calls task_create() to construct and
 *   preinitialize the task for the appropriate destination project.  Only
 *   one task, the primordial task0, is not created with task_create().
 *
 * Return values
 *   A pointer to the newly created, preinitialized task.
 *
 * Caller's context
 *   Caller's context should be safe for KM_SLEEP allocations.
 *   The caller should appropriately bump the kpj_ntasks counter on the
 *   project that contains this task.
 */
task_t *
task_create(projid_t projid, zone_t *zone)
{
	task_t *tk = kmem_cache_alloc(task_cache, KM_SLEEP);
	task_t *ancestor_tk;
	taskid_t tkid;
	task_usage_t *tu = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
	mod_hash_hndl_t hndl;
	rctl_set_t *set = rctl_set_create();
	rctl_alloc_gp_t *gp;
	rctl_entity_p_t e;

	bzero(tk, sizeof (task_t));

	tk->tk_tkid = tkid = id_alloc(taskid_space);
	tk->tk_nlwps = 0;
	tk->tk_nlwps_ctl = INT_MAX;
	tk->tk_nprocs = 0;
	tk->tk_nprocs_ctl = INT_MAX;
	tk->tk_usage = tu;
	tk->tk_inherited = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
	tk->tk_proj = project_hold_by_id(projid, zone, PROJECT_HOLD_INSERT);
	tk->tk_flags = TASK_NORMAL;
	tk->tk_commit_next = NULL;

	/*
	 * Copy ancestor task's resource controls.
	 */
	zone_task_hold(zone);
	mutex_enter(&curproc->p_lock);
	ancestor_tk = curproc->p_task;
	task_hold(ancestor_tk);
	tk->tk_zone = zone;
	mutex_exit(&curproc->p_lock);

	for (;;) {
		gp = rctl_set_dup_prealloc(ancestor_tk->tk_rctls);

		mutex_enter(&ancestor_tk->tk_rctls->rcs_lock);
		if (rctl_set_dup_ready(ancestor_tk->tk_rctls, gp))
			break;

		mutex_exit(&ancestor_tk->tk_rctls->rcs_lock);

		rctl_prealloc_destroy(gp);
	}

	/*
	 * At this point, curproc does not have the appropriate linkage
	 * through the task to the project.  So, rctl_set_dup should only
	 * copy the rctls, and leave the callbacks for later.
	 */
	e.rcep_p.task = tk;
	e.rcep_t = RCENTITY_TASK;
	tk->tk_rctls = rctl_set_dup(ancestor_tk->tk_rctls, curproc, curproc,
	    &e, set, gp, RCD_DUP);
	mutex_exit(&ancestor_tk->tk_rctls->rcs_lock);

	rctl_prealloc_destroy(gp);

	/*
	 * Record the ancestor task's ID for use by extended accounting.
	 */
	tu->tu_anctaskid = ancestor_tk->tk_tkid;
	task_rele(ancestor_tk);

	/*
	 * Put the new task structure in the hash table.
	 */
	(void) mod_hash_reserve(task_hash, &hndl);
	mutex_enter(&task_hash_lock);
	ASSERT(task_find(tkid, zone->zone_id) == NULL);
	if (mod_hash_insert_reserve(task_hash, (mod_hash_key_t)(uintptr_t)tkid,
	    (mod_hash_val_t *)tk, hndl) != 0) {
		mod_hash_cancel(task_hash, &hndl);
		panic("unable to insert task %d(%p)", tkid, (void *)tk);
	}
	mutex_exit(&task_hash_lock);

	tk->tk_nprocs_kstat = task_kstat_create(tk, zone);
	return (tk);
}

/*
 * void task_attach(task_t *, proc_t *)
 *
 * Overview
 *   task_attach() is used to attach a process to a task; this operation is
 *   only performed as a result of a fork() or settaskid() system call.  The
 *   proc_t's p_tasknext and p_taskprev fields will be set such that the
 *   proc_t is a member of the doubly-linked list of proc_t's that make up
 *   the task.
 *
 * Return values
 *   None.
 *
 * Caller's context
 *   pidlock and p->p_lock must be held on entry.
 */
void
task_attach(task_t *tk, proc_t *p)
{
	proc_t *first, *prev;

	ASSERT(tk != NULL);
	ASSERT(p != NULL);
	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&p->p_lock));

	if (tk->tk_memb_list == NULL) {
		p->p_tasknext = p;
		p->p_taskprev = p;
	} else {
		first = tk->tk_memb_list;
		prev = first->p_taskprev;
		first->p_taskprev = p;
		p->p_tasknext = first;
		p->p_taskprev = prev;
		prev->p_tasknext = p;
	}
	tk->tk_memb_list = p;
	task_hold(tk);
	p->p_task = tk;
}

/*
 * task_begin()
 *
 * Overview
 *   A process constructing a new task calls task_begin() to initialize the
 *   task, by attaching itself as a member.
 *
 * Return values
 *   None.
 *
 * Caller's context
 *   pidlock and p_lock must be held across the call to task_begin().
 */
void
task_begin(task_t *tk, proc_t *p)
{
	timestruc_t ts;
	task_usage_t *tu;
	rctl_entity_p_t e;

	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&p->p_lock));

	mutex_enter(&tk->tk_usage_lock);
	tu = tk->tk_usage;
	gethrestime(&ts);
	tu->tu_startsec = (uint64_t)ts.tv_sec;
	tu->tu_startnsec = (uint64_t)ts.tv_nsec;
	mutex_exit(&tk->tk_usage_lock);

	/*
	 * Join process to the task as a member.
	 */
	task_attach(tk, p);

	/*
	 * Now that the linkage from process to task is complete, do the
	 * required callback for the task rctl set.
	 */
	e.rcep_p.task = tk;
	e.rcep_t = RCENTITY_TASK;
	(void) rctl_set_dup(NULL, NULL, p, &e, tk->tk_rctls, NULL,
	    RCD_CALLBACK);
}
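
/*
 * For illustration, a hedged sketch of the fork()-side attach described
 * above: the parent, holding pidlock and the child's p_lock per the
 * contract documented at task_attach(), adds the child to its own task.
 * The variable names are hypothetical and the per-task LWP/process
 * counter updates are elided.
 *
 *	mutex_enter(&pidlock);
 *	mutex_enter(&cp->p_lock);
 *	task_attach(curproc->p_task, cp);	(takes a hold on the task)
 *	mutex_exit(&cp->p_lock);
 *	mutex_exit(&pidlock);
 */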

/*
 * void task_detach(proc_t *)
 *
 * Overview
 *   task_detach() removes the specified process from its task.  task_detach()
 *   sets the process's task membership to NULL, in anticipation of a final
 *   exit or of joining a new task.  Because task_rele() requires a context
 *   safe for KM_SLEEP allocations, a task_detach() is followed by a
 *   subsequent task_rele() once appropriate context is available.
 *
 *   Because task_detach() involves relinquishing the process's membership in
 *   the project, any observational rctls the process may have had on the
 *   task or project are destroyed.
 *
 * Return values
 *   None.
 *
 * Caller's context
 *   pidlock and p_lock held across task_detach().
 */
void
task_detach(proc_t *p)
{
	task_t *tk = p->p_task;

	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(p->p_task != NULL);
	ASSERT(tk->tk_memb_list != NULL);

	if (tk->tk_memb_list == p)
		tk->tk_memb_list = p->p_tasknext;
	if (tk->tk_memb_list == p)
		tk->tk_memb_list = NULL;
	p->p_taskprev->p_tasknext = p->p_tasknext;
	p->p_tasknext->p_taskprev = p->p_taskprev;

	rctl_set_tearoff(p->p_task->tk_rctls, p);
	rctl_set_tearoff(p->p_task->tk_proj->kpj_rctls, p);

	p->p_task = NULL;
	p->p_tasknext = p->p_taskprev = NULL;
}
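
/*
 * A hedged sketch of the detach-then-release split described above,
 * modeled on the exit path: detach under pidlock and p_lock, then drop
 * the hold later from a context that is safe for KM_SLEEP allocations.
 * Names are illustrative.
 *
 *	mutex_enter(&pidlock);
 *	mutex_enter(&p->p_lock);
 *	tk = p->p_task;
 *	task_detach(p);
 *	mutex_exit(&p->p_lock);
 *	mutex_exit(&pidlock);
 *
 *	(later, from a KM_SLEEP-safe context)
 *
 *	task_rele(tk);
 */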

/*
 * task_change(task_t *, proc_t *)
 *
 * Overview
 *   task_change() removes the specified process from its current task.  The
 *   process is then attached to the specified task.  This routine is called
 *   from settaskid() when a process is being moved to a new task.
 *
 * Return values
 *   None.
 *
 * Caller's context
 *   pidlock and p_lock held across task_change().
 */
void
task_change(task_t *newtk, proc_t *p)
{
	task_t *oldtk = p->p_task;

	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(oldtk != NULL);
	ASSERT(oldtk->tk_memb_list != NULL);

	mutex_enter(&oldtk->tk_zone->zone_nlwps_lock);
	oldtk->tk_nlwps -= p->p_lwpcnt;
	oldtk->tk_nprocs--;
	mutex_exit(&oldtk->tk_zone->zone_nlwps_lock);

	mutex_enter(&newtk->tk_zone->zone_nlwps_lock);
	newtk->tk_nlwps += p->p_lwpcnt;
	newtk->tk_nprocs++;
	mutex_exit(&newtk->tk_zone->zone_nlwps_lock);

	task_detach(p);
	task_begin(newtk, p);
	exacct_move_mstate(p, oldtk, newtk);
}

/*
 * task_end()
 *
 * Overview
 *   task_end() contains the actions executed once the final member of a
 *   task has released the task, and all actions connected with the task,
 *   such as committing an accounting record to a file, are completed.  It
 *   is called by the known last consumer of the task information.
 *   Additionally, task_end() must never refer to any process in the system.
 *
 * Return values
 *   None.
 *
 * Caller's context
 *   No restrictions on context, beyond that given above.
 */
void
task_end(task_t *tk)
{
	ASSERT(tk->tk_hold_count == 0);

	project_rele(tk->tk_proj);
	kmem_free(tk->tk_usage, sizeof (task_usage_t));
	kmem_free(tk->tk_inherited, sizeof (task_usage_t));
	if (tk->tk_prevusage != NULL)
		kmem_free(tk->tk_prevusage, sizeof (task_usage_t));
	if (tk->tk_zoneusage != NULL)
		kmem_free(tk->tk_zoneusage, sizeof (task_usage_t));
	rctl_set_free(tk->tk_rctls);
	id_free(taskid_space, tk->tk_tkid);
	zone_task_rele(tk->tk_zone);
	kmem_cache_free(task_cache, tk);
}

static void
changeproj(proc_t *p, kproject_t *kpj, zone_t *zone, void *projbuf,
    void *zonebuf)
{
	kproject_t *oldkpj;
	kthread_t *t;

	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&p->p_lock));

	if ((t = p->p_tlist) != NULL) {
		do {
			(void) project_hold(kpj);

			thread_lock(t);
			oldkpj = ttoproj(t);

			/*
			 * Kick this thread so that it doesn't sit
			 * on a wrong wait queue.
			 */
			if (ISWAITING(t))
				setrun_locked(t);

			/*
			 * The thread wants to go on the project wait queue,
			 * but the waitq is changing.
			 */
			if (t->t_schedflag & TS_PROJWAITQ)
				t->t_schedflag &= ~TS_PROJWAITQ;

			t->t_proj = kpj;
			t->t_pre_sys = 1;	/* For cred update */
			thread_unlock(t);
			fss_changeproj(t, kpj, zone, projbuf, zonebuf);

			project_rele(oldkpj);
		} while ((t = t->t_forw) != p->p_tlist);
	}
}

/*
 * task_join()
 *
 * Overview
 *   task_join() contains the actions that must be executed when the first
 *   member (curproc) of a newly created task joins it.  It may never fail.
 *
 *   The caller must make sure holdlwps() is called so that all other lwps
 *   are stopped prior to calling this function.
 *
 *   NB: It returns with curproc->p_lock held.
 *
 * Return values
 *   Pointer to the old task.
 *
 * Caller's context
 *   cpu_lock must be held entering the function.  It will acquire pidlock,
 *   p_crlock and p_lock during execution.
 */
task_t *
task_join(task_t *tk, uint_t flags)
{
	proc_t *p = ttoproc(curthread);
	task_t *prev_tk;
	void *projbuf, *zonebuf;
	zone_t *zone = tk->tk_zone;
	projid_t projid = tk->tk_proj->kpj_id;
	cred_t *oldcr;

	/*
	 * We can't know for sure if holdlwps() was called, but we can check
	 * to ensure we're single-threaded.
	 */
	ASSERT(curthread == p->p_agenttp || p->p_lwprcnt == 1);

	/*
	 * Changing the credential is always hard because we cannot
	 * allocate memory when holding locks, but we don't know whether
	 * we need to change it.  We first get a reference to the current
	 * cred if we need to change it.  Then we create a credential
	 * with an updated project id.  Finally we install it: first we
	 * release the reference taken on the old p_cred when we acquired
	 * p_crlock the first time, and then we release the reference to
	 * the p_cred we displaced when we acquired the lock the second
	 * time.
	 */
	mutex_enter(&p->p_crlock);
	if (crgetprojid(p->p_cred) == projid)
		oldcr = NULL;
	else
		crhold(oldcr = p->p_cred);
	mutex_exit(&p->p_crlock);

	if (oldcr != NULL) {
		cred_t *newcr = crdup(oldcr);
		crsetprojid(newcr, projid);
		crfree(oldcr);

		mutex_enter(&p->p_crlock);
		oldcr = p->p_cred;
		p->p_cred = newcr;
		mutex_exit(&p->p_crlock);
		crfree(oldcr);
	}

	/*
	 * Make sure that the number of processor sets is constant
	 * across this operation.
	 */
	ASSERT(MUTEX_HELD(&cpu_lock));

	projbuf = fss_allocbuf(FSS_NPSET_BUF, FSS_ALLOC_PROJ);
	zonebuf = fss_allocbuf(FSS_NPSET_BUF, FSS_ALLOC_ZONE);

	mutex_enter(&pidlock);
	mutex_enter(&p->p_lock);

	prev_tk = p->p_task;
	task_change(tk, p);

	/*
	 * Now move threads one by one to their new project.
	 */
	changeproj(p, tk->tk_proj, zone, projbuf, zonebuf);
	if (flags & TASK_FINAL)
		p->p_task->tk_flags |= TASK_FINAL;

	mutex_exit(&pidlock);

	fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
	fss_freebuf(projbuf, FSS_ALLOC_PROJ);
	return (prev_tk);
}
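
/*
 * A hedged, condensed sketch of a settaskid()-style sequence tying
 * task_create() and task_join() together.  The holdlwps() bookkeeping
 * and error handling are elided, and the names are illustrative.
 *
 *	task_t *tk, *oldtk;
 *
 *	tk = task_create(projid, curproc->p_zone);
 *	mutex_enter(&cpu_lock);
 *	mutex_enter(&tk->tk_zone->zone_nlwps_lock);
 *	tk->tk_proj->kpj_ntasks++;	(per task_create()'s contract)
 *	mutex_exit(&tk->tk_zone->zone_nlwps_lock);
 *	oldtk = task_join(tk, 0);	(returns with p_lock held)
 *	mutex_exit(&curproc->p_lock);
 *	mutex_exit(&cpu_lock);
 *	task_rele(oldtk);
 */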

/*
 * rctl ops vectors
 */
static rctl_ops_t task_lwps_ops = {
	rcop_no_action,
	task_lwps_usage,
	task_lwps_set,
	task_lwps_test
};

static rctl_ops_t task_procs_ops = {
	rcop_no_action,
	task_nprocs_usage,
	task_nprocs_set,
	task_nprocs_test
};

static rctl_ops_t task_cpu_time_ops = {
	rcop_no_action,
	task_cpu_time_usage,
	rcop_no_set,
	task_cpu_time_test
};

/*
 * void task_init(void)
 *
 * Overview
 *   task_init() initializes task-related hashes, caches, and the task id
 *   space.  Additionally, task_init() establishes p0 as a member of task0.
 *   Called by main().
 *
 * Return values
 *   None.
 *
 * Caller's context
 *   task_init() must be called prior to MP startup.
 */
void
task_init(void)
{
	proc_t *p = &p0;
	mod_hash_hndl_t hndl;
	rctl_set_t *set;
	rctl_alloc_gp_t *gp;
	rctl_entity_p_t e;

	/*
	 * Initialize task_cache and taskid_space.
	 */
	task_cache = kmem_cache_create("task_cache", sizeof (task_t),
	    0, NULL, NULL, NULL, NULL, NULL, 0);
	taskid_space = id_space_create("taskid_space", 0, MAX_TASKID);

	/*
	 * Initialize task hash table.
	 */
	task_hash = mod_hash_create_idhash("task_hash", task_hash_size,
	    mod_hash_null_valdtor);

	/*
	 * Initialize task-based rctls.
	 */
	rc_task_lwps = rctl_register("task.max-lwps", RCENTITY_TASK,
	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_COUNT, INT_MAX, INT_MAX,
	    &task_lwps_ops);
	rc_task_nprocs = rctl_register("task.max-processes", RCENTITY_TASK,
	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_COUNT, INT_MAX, INT_MAX,
	    &task_procs_ops);
	rc_task_cpu_time = rctl_register("task.max-cpu-time", RCENTITY_TASK,
	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_DENY_NEVER |
	    RCTL_GLOBAL_CPU_TIME | RCTL_GLOBAL_INFINITE |
	    RCTL_GLOBAL_UNOBSERVABLE | RCTL_GLOBAL_SECONDS, UINT64_MAX,
	    UINT64_MAX, &task_cpu_time_ops);

	/*
	 * Create task0 and place p0 in it as a member.
	 */
	task0p = kmem_cache_alloc(task_cache, KM_SLEEP);
	bzero(task0p, sizeof (task_t));

	task0p->tk_tkid = id_alloc(taskid_space);
	task0p->tk_usage = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
	task0p->tk_inherited = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
	task0p->tk_proj = project_hold_by_id(0, &zone0,
	    PROJECT_HOLD_INSERT);
	task0p->tk_flags = TASK_NORMAL;
	task0p->tk_nlwps = p->p_lwpcnt;
	task0p->tk_nprocs = 1;
	task0p->tk_zone = global_zone;
	task0p->tk_commit_next = NULL;

	set = rctl_set_create();
	gp = rctl_set_init_prealloc(RCENTITY_TASK);
	mutex_enter(&curproc->p_lock);
	e.rcep_p.task = task0p;
	e.rcep_t = RCENTITY_TASK;
	task0p->tk_rctls = rctl_set_init(RCENTITY_TASK, curproc, &e, set, gp);
	mutex_exit(&curproc->p_lock);
	rctl_prealloc_destroy(gp);

	(void) mod_hash_reserve(task_hash, &hndl);
	mutex_enter(&task_hash_lock);
	ASSERT(task_find(task0p->tk_tkid, GLOBAL_ZONEID) == NULL);
	if (mod_hash_insert_reserve(task_hash,
	    (mod_hash_key_t)(uintptr_t)task0p->tk_tkid,
	    (mod_hash_val_t *)task0p, hndl) != 0) {
		mod_hash_cancel(task_hash, &hndl);
		panic("unable to insert task %d(%p)", task0p->tk_tkid,
		    (void *)task0p);
	}
	mutex_exit(&task_hash_lock);

	task0p->tk_memb_list = p;

	task0p->tk_nprocs_kstat = task_kstat_create(task0p, task0p->tk_zone);

	/*
	 * Initialize task pointers for p0, including the doubly-linked list
	 * of task members.
	 */
	p->p_task = task0p;
	p->p_taskprev = p->p_tasknext = p;
	task_hold(task0p);
}

static int
task_nprocs_kstat_update(kstat_t *ksp, int rw)
{
	task_t *tk = ksp->ks_private;
	task_kstat_t *ktk = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	ktk->ktk_usage.value.ui64 = tk->tk_nprocs;
	ktk->ktk_value.value.ui64 = tk->tk_nprocs_ctl;
	return (0);
}

static kstat_t *
task_kstat_create(task_t *tk, zone_t *zone)
{
	kstat_t *ksp;
	task_kstat_t *ktk;
	char *zonename = zone->zone_name;

	ksp = rctl_kstat_create_task(tk, "nprocs", KSTAT_TYPE_NAMED,
	    sizeof (task_kstat_t) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (ksp == NULL)
		return (NULL);

	ktk = ksp->ks_data = kmem_alloc(sizeof (task_kstat_t), KM_SLEEP);
	ksp->ks_data_size += strlen(zonename) + 1;
	kstat_named_init(&ktk->ktk_zonename, "zonename", KSTAT_DATA_STRING);
	kstat_named_setstr(&ktk->ktk_zonename, zonename);
	kstat_named_init(&ktk->ktk_usage, "usage", KSTAT_DATA_UINT64);
	kstat_named_init(&ktk->ktk_value, "value", KSTAT_DATA_UINT64);
	ksp->ks_update = task_nprocs_kstat_update;
	ksp->ks_private = tk;
	kstat_install(ksp);

	return (ksp);
}

static void
task_kstat_delete(task_t *tk)
{
	void *data;

	if (tk->tk_nprocs_kstat != NULL) {
		data = tk->tk_nprocs_kstat->ks_data;
		kstat_delete(tk->tk_nprocs_kstat);
		kmem_free(data, sizeof (task_kstat_t));
		tk->tk_nprocs_kstat = NULL;
	}
}
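
/*
 * For illustration, a hedged userland sketch of reading the kstat
 * published above via libkstat.  The module name "caps" and the
 * "nprocs_<taskid>" naming are assumptions about what
 * rctl_kstat_create_task() produces, not guarantees made by this file;
 * the task ID 42 is hypothetical.
 *
 *	kstat_ctl_t *kc = kstat_open();
 *	kstat_t *ksp = kstat_lookup(kc, "caps", -1, "nprocs_42");
 *	kstat_named_t *kn;
 *
 *	if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1 &&
 *	    (kn = kstat_data_lookup(ksp, "usage")) != NULL)
 *		(void) printf("%llu\n", kn->value.ui64);
 *	(void) kstat_close(kc);
 */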

void
task_commit_thread_init(void)
{
	mutex_init(&task_commit_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&task_commit_cv, NULL, CV_DEFAULT, NULL);
	task_commit_thread = thread_create(NULL, 0, task_commit, NULL, 0,
	    &p0, TS_RUN, minclsyspri);
}

/*
 * Backup thread to commit task resource usage when taskq_dispatch() fails.
 */
static void
task_commit(void)
{
	callb_cpr_t cprinfo;

	CALLB_CPR_INIT(&cprinfo, &task_commit_lock, callb_generic_cpr,
	    "task_commit_thread");

	mutex_enter(&task_commit_lock);

	for (;;) {
		while (task_commit_head == NULL) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&task_commit_cv, &task_commit_lock);
			CALLB_CPR_SAFE_END(&cprinfo, &task_commit_lock);
		}
		while (task_commit_head != NULL) {
			task_t *tk;

			tk = task_commit_head;
			task_commit_head = task_commit_head->tk_commit_next;
			if (task_commit_head == NULL)
				task_commit_tail = NULL;
			mutex_exit(&task_commit_lock);
			exacct_commit_task(tk);
			mutex_enter(&task_commit_lock);
		}
	}
}