xref: /titanic_51/usr/src/uts/common/os/task.c (revision 381a2a9a387f449fab7d0c7e97c4184c26963abf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/atomic.h>
30 #include <sys/cmn_err.h>
31 #include <sys/exacct.h>
32 #include <sys/id_space.h>
33 #include <sys/kmem.h>
34 #include <sys/modhash.h>
35 #include <sys/mutex.h>
36 #include <sys/proc.h>
37 #include <sys/project.h>
38 #include <sys/rctl.h>
39 #include <sys/systm.h>
40 #include <sys/task.h>
41 #include <sys/time.h>
42 #include <sys/types.h>
43 #include <sys/zone.h>
44 #include <sys/cpuvar.h>
45 #include <sys/fss.h>
46 #include <sys/class.h>
47 #include <sys/project.h>
48 
49 /*
50  * Tasks
51  *
52  *   A task is a collection of processes, associated with a common project ID
53  *   and related by a common initial parent.  The task primarily represents a
54  *   natural process sequence with known resource usage, although it can also be
55  *   viewed as a convenient grouping of processes for signal delivery, processor
56  *   binding, and administrative operations.
57  *
58  * Membership and observership
59  *   We can conceive of situations where processes outside of the task may wish
60  *   to examine the resource usage of the task.  Similarly, a number of the
61  *   administrative operations on a task can be performed by processes who are
62  *   not members of the task.  Accordingly, we must design a locking strategy
63  *   where observers of the task, who wish to examine or operate on the task,
64  *   and members of the task, who can perform the mentioned operations, as
65  *   well as leave the task, see a consistent and correct representation of
66  *   the task at all times.
67  *
68  * Locking
69  *   Because the task membership is a new relation between processes, its
70  *   locking becomes an additional responsibility of the pidlock/p_lock locking
71  *   sequence; however, tasks closely resemble sessions and the session locking
72  *   model is mostly appropriate for the interaction of tasks, processes, and
73  *   procfs.
74  *
75  *   kmutex_t task_hash_lock
76  *     task_hash_lock is a global lock protecting the contents of the task
77  *     ID-to-task pointer hash.  Holders of task_hash_lock must not attempt to
78  *     acquire pidlock or p_lock.
79  *   uint_t tk_hold_count
80  *     tk_hold_count, the number of members and observers of the current task,
81  *     must be manipulated atomically.
82  *   proc_t *tk_memb_list
83  *   proc_t *p_tasknext
84  *   proc_t *p_taskprev
85  *     The task's membership list is protected by pidlock, which is therefore
86  *     always acquired before any of its members' p_lock mutexes.  The p_task
87  *     member of the proc structure is protected by pidlock or p_lock for
88  *     reading, and by both pidlock and p_lock for modification, as is done for
89  *     p_sessp.  The key point is that only the process can modify its p_task,
90  *     and no other entity on the system.  (/proc will use prlock() to prevent
91  *     the process from leaving, as opposed to pidlock.)
92  *   kmutex_t tk_usage_lock
93  *     tk_usage_lock is a per-task lock protecting the contents of the task
94  *     usage structure.  The tk_nlwps counter for the task.max-lwps resource
95  *     control is instead manipulated under the zone's zone_nlwps_lock.
96  */
97 
98 int task_hash_size = 256;
99 static kmutex_t task_hash_lock;
100 static mod_hash_t *task_hash;
101 
102 static id_space_t *taskid_space;	/* global taskid space */
103 static kmem_cache_t *task_cache;	/* kmem cache for task structures */
104 
105 rctl_hndl_t rc_task_lwps;
106 rctl_hndl_t rc_task_cpu_time;
107 
108 /*
109  * static rctl_qty_t task_usage_lwps(void *taskp)
110  *
111  * Overview
112  *   task_usage_lwps() is the usage operation for the resource control
113  *   associated with the number of LWPs in a task.
114  *
115  * Return values
116  *   The number of LWPs in the given task is returned.
117  *
118  * Caller's context
119  *   The p->p_lock must be held across the call.
120  */
121 /*ARGSUSED*/
122 static rctl_qty_t
123 task_lwps_usage(rctl_t *r, proc_t *p)
124 {
125 	task_t *t;
126 	rctl_qty_t nlwps;
127 
128 	ASSERT(MUTEX_HELD(&p->p_lock));
129 
130 	t = p->p_task;
131 	mutex_enter(&p->p_zone->zone_nlwps_lock);
132 	nlwps = t->tk_nlwps;
133 	mutex_exit(&p->p_zone->zone_nlwps_lock);
134 
135 	return (nlwps);
136 }
137 
138 /*
139  * static int task_test_lwps(void *taskp, rctl_val_t *, int64_t incr,
140  *   int flags)
141  *
142  * Overview
143  *   task_test_lwps() is the test-if-valid-increment for the resource control
144  *   for the number of processes in a task.
145  *
146  * Return values
147  *   0 if the threshold limit was not passed, 1 if the limit was passed.
148  *
149  * Caller's context
150  *   p->p_lock must be held across the call.
151  */
152 /*ARGSUSED*/
153 static int
154 task_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
155     rctl_qty_t incr,
156     uint_t flags)
157 {
158 	rctl_qty_t nlwps;
159 
160 	ASSERT(MUTEX_HELD(&p->p_lock));
161 	ASSERT(e->rcep_t == RCENTITY_TASK);
162 	if (e->rcep_p.task == NULL)
163 		return (0);
164 
165 	ASSERT(MUTEX_HELD(&(e->rcep_p.task->tk_zone->zone_nlwps_lock)));
166 	nlwps = e->rcep_p.task->tk_nlwps;
167 
168 	if (nlwps + incr > rcntl->rcv_value)
169 		return (1);
170 
171 	return (0);
172 }
173 /*ARGSUSED*/
174 static int
175 task_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv) {
176 
177 	ASSERT(MUTEX_HELD(&p->p_lock));
178 	ASSERT(e->rcep_t == RCENTITY_TASK);
179 	if (e->rcep_p.task == NULL)
180 		return (0);
181 
182 	e->rcep_p.task->tk_nlwps_ctl = nv;
183 	return (0);
184 }
185 
186 /*
187  * static rctl_qty_t task_usage_cpu_secs(void *taskp)
188  *
189  * Overview
190  *   task_usage_cpu_secs() is the usage operation for the resource control
191  *   associated with the total accrued CPU seconds for a task.
192  *
193  * Return values
194  *   The number of CPU seconds consumed by the task is returned.
195  *
196  * Caller's context
197  *   The given task must be held across the call.
198  */
199 /*ARGSUSED*/
200 static rctl_qty_t
201 task_cpu_time_usage(rctl_t *r, proc_t *p)
202 {
203 	task_t *t = p->p_task;
204 
205 	ASSERT(MUTEX_HELD(&p->p_lock));
206 	return (t->tk_cpu_time / hz);
207 }
208 
209 /*
210  * static int task_test_cpu_secs(void *taskp, rctl_val_t *, int64_t incr,
211  *   int flags)
212  *
213  * Overview
214  *   task_test_cpu_secs() is the test-if-valid-increment for the resource
215  *   control for the total accrued CPU seconds for a task.
216  *
217  * Return values
218  *   0 if the threshold limit was not passed, 1 if the limit was passed.
219  *
220  * Caller's context
221  *   The given task must be held across the call.
222  */
223 /*ARGSUSED*/
224 static int
225 task_cpu_time_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
226     struct rctl_val *rcntl, rctl_qty_t incr, uint_t flags)
227 {
228 	task_t *t;
229 
230 	ASSERT(MUTEX_HELD(&p->p_lock));
231 	ASSERT(e->rcep_t == RCENTITY_TASK);
232 	if (e->rcep_p.task == NULL)
233 		return (0);
234 
235 	t = e->rcep_p.task;
236 	if ((t->tk_cpu_time + incr) / hz >= rcntl->rcv_value)
237 		return (1);
238 
239 	return (0);
240 }
241 
242 static task_t *
243 task_find(taskid_t id, zoneid_t zoneid)
244 {
245 	task_t *tk;
246 
247 	ASSERT(MUTEX_HELD(&task_hash_lock));
248 
249 	if (mod_hash_find(task_hash, (mod_hash_key_t)(uintptr_t)id,
250 	    (mod_hash_val_t *)&tk) == MH_ERR_NOTFOUND ||
251 	    (zoneid != ALL_ZONES && zoneid != tk->tk_zone->zone_id))
252 		return (NULL);
253 
254 	return (tk);
255 }
256 
257 /*
258  * task_hold_by_id(), task_hold_by_id_zone()
259  *
260  * Overview
261  *   task_hold_by_id() is used to take a reference on a task by its task id,
262  *   supporting the various system call interfaces for obtaining resource data,
263  *   delivering signals, and so forth.
264  *
265  * Return values
266  *   Returns a pointer to the task_t with taskid_t id.  The task is returned
267  *   with its hold count incremented by one.  Returns NULL if there
268  *   is no task with the requested id.
269  *
270  * Caller's context
271  *   Caller must not be holding task_hash_lock.  No restrictions on context.
272  */
273 task_t *
274 task_hold_by_id_zone(taskid_t id, zoneid_t zoneid)
275 {
276 	task_t *tk;
277 
278 	mutex_enter(&task_hash_lock);
279 	if ((tk = task_find(id, zoneid)) != NULL)
280 		atomic_add_32(&tk->tk_hold_count, 1);
281 	mutex_exit(&task_hash_lock);
282 
283 	return (tk);
284 }
285 
286 task_t *
287 task_hold_by_id(taskid_t id)
288 {
289 	zoneid_t zoneid;
290 
291 	if (INGLOBALZONE(curproc))
292 		zoneid = ALL_ZONES;
293 	else
294 		zoneid = getzoneid();
295 	return (task_hold_by_id_zone(id, zoneid));
296 }
297 
298 /*
299  * void task_hold(task_t *)
300  *
301  * Overview
302  *   task_hold() is used to take an additional reference to the given task.
303  *
304  * Return values
305  *   None.
306  *
307  * Caller's context
308  *   No restriction on context.
309  */
310 void
311 task_hold(task_t *tk)
312 {
313 	atomic_add_32(&tk->tk_hold_count, 1);
314 }
315 
316 /*
317  * void task_rele(task_t *)
318  *
319  * Overview
320  *   task_rele() relinquishes a reference on the given task, which was acquired
321  *   via task_hold() or task_hold_by_id().  If this is the last member or
322  *   observer of the task, dispatch it for commitment via the accounting
323  *   subsystem.
324  *
325  * Return values
326  *   None.
327  *
328  * Caller's context
329  *   Caller must not be holding the task_hash_lock.
330  *   Caller's context must be acceptable for KM_SLEEP allocations.
331  */
332 void
333 task_rele(task_t *tk)
334 {
335 	mutex_enter(&task_hash_lock);
336 	if (atomic_add_32_nv(&tk->tk_hold_count, -1) > 0) {
337 		mutex_exit(&task_hash_lock);
338 		return;
339 	}
340 
341 	mutex_enter(&tk->tk_zone->zone_nlwps_lock);
342 	tk->tk_proj->kpj_ntasks--;
343 	mutex_exit(&tk->tk_zone->zone_nlwps_lock);
344 
345 	if (mod_hash_destroy(task_hash,
346 	    (mod_hash_key_t)(uintptr_t)tk->tk_tkid) != 0)
347 		panic("unable to delete task %d", tk->tk_tkid);
348 	mutex_exit(&task_hash_lock);
349 
350 	/*
351 	 * At this point, there are no members or observers of the task, so we
352 	 * can safely send it on for commitment to the accounting subsystem.
353 	 * The task will be destroyed in task_end() subsequent to commitment.
354 	 */
355 	(void) taskq_dispatch(exacct_queue, exacct_commit_task, tk, KM_SLEEP);
356 }
357 
358 /*
359  * task_t *task_create(projid_t, zone *)
360  *
361  * Overview
362  *   A process constructing a new task calls task_create() to construct and
363  *   preinitialize the task for the appropriate destination project.  Only one
364  *   task, the primordial task0, is not created with task_create().
365  *
366  * Return values
367  *   None.
368  *
369  * Caller's context
370  *   Caller's context should be safe for KM_SLEEP allocations.
371  *   The caller should appropriately bump the kpj_ntasks counter on the
372  *   project that contains this task.
373  */
374 task_t *
375 task_create(projid_t projid, zone_t *zone)
376 {
377 	task_t *tk = kmem_cache_alloc(task_cache, KM_SLEEP);
378 	task_t *ancestor_tk;
379 	taskid_t tkid;
380 	task_usage_t *tu = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
381 	mod_hash_hndl_t hndl;
382 	rctl_set_t *set = rctl_set_create();
383 	rctl_alloc_gp_t *gp;
384 	rctl_entity_p_t e;
385 
386 	bzero(tk, sizeof (task_t));
387 
388 	tk->tk_tkid = tkid = id_alloc(taskid_space);
389 	tk->tk_nlwps = 0;
390 	tk->tk_nlwps_ctl = INT_MAX;
391 	tk->tk_usage = tu;
392 	tk->tk_proj = project_hold_by_id(projid, zone->zone_id,
393 	    PROJECT_HOLD_INSERT);
394 	tk->tk_flags = TASK_NORMAL;
395 
396 	/*
397 	 * Copy ancestor task's resource controls.
398 	 */
399 	zone_task_hold(zone);
400 	mutex_enter(&curproc->p_lock);
401 	ancestor_tk = curproc->p_task;
402 	task_hold(ancestor_tk);
403 	tk->tk_zone = zone;
404 	mutex_exit(&curproc->p_lock);
405 
406 	for (;;) {
407 		gp = rctl_set_dup_prealloc(ancestor_tk->tk_rctls);
408 
409 		mutex_enter(&ancestor_tk->tk_rctls->rcs_lock);
410 		if (rctl_set_dup_ready(ancestor_tk->tk_rctls, gp))
411 			break;
412 
413 		mutex_exit(&ancestor_tk->tk_rctls->rcs_lock);
414 
415 		rctl_prealloc_destroy(gp);
416 	}
417 
418 	/*
419 	 * At this point, curproc does not have the appropriate linkage
420 	 * through the task to the project. So, rctl_set_dup should only
421 	 * copy the rctls, and leave the callbacks for later.
422 	 */
423 	e.rcep_p.task = tk;
424 	e.rcep_t = RCENTITY_TASK;
425 	tk->tk_rctls = rctl_set_dup(ancestor_tk->tk_rctls, curproc, curproc, &e,
426 	    set, gp, RCD_DUP);
427 	mutex_exit(&ancestor_tk->tk_rctls->rcs_lock);
428 
429 	rctl_prealloc_destroy(gp);
430 
431 	/*
432 	 * Record the ancestor task's ID for use by extended accounting.
433 	 */
434 	tu->tu_anctaskid = ancestor_tk->tk_tkid;
435 	task_rele(ancestor_tk);
436 
437 	/*
438 	 * Put new task structure in the hash table.
439 	 */
440 	(void) mod_hash_reserve(task_hash, &hndl);
441 	mutex_enter(&task_hash_lock);
442 	ASSERT(task_find(tkid, getzoneid()) == NULL);
443 	if (mod_hash_insert_reserve(task_hash, (mod_hash_key_t)(uintptr_t)tkid,
444 	    (mod_hash_val_t *)tk, hndl) != 0) {
445 		mod_hash_cancel(task_hash, &hndl);
446 		panic("unable to insert task %d(%p)", tkid, (void *)tk);
447 	}
448 	mutex_exit(&task_hash_lock);
449 
450 	return (tk);
451 }
452 
453 /*
454  * void task_attach(task_t *, proc_t *)
455  *
456  * Overview
457  *   task_attach() is used to attach a process to a task; this operation is only
458  *   performed as a result of a fork() or settaskid() system call.  The proc_t's
459  *   p_tasknext and p_taskprev fields will be set such that the proc_t is a
460  *   member of the doubly-linked list of proc_t's that make up the task.
461  *
462  * Return values
463  *   None.
464  *
465  * Caller's context
466  *   pidlock and p->p_lock must be held on entry.
467  */
468 void
469 task_attach(task_t *tk, proc_t *p)
470 {
471 	proc_t *first, *prev;
472 	rctl_entity_p_t e;
473 	ASSERT(tk != NULL);
474 	ASSERT(p != NULL);
475 	ASSERT(MUTEX_HELD(&pidlock));
476 	ASSERT(MUTEX_HELD(&p->p_lock));
477 
478 	if (tk->tk_memb_list == NULL) {
479 		p->p_tasknext = p;
480 		p->p_taskprev = p;
481 	} else {
482 		first = tk->tk_memb_list;
483 		prev = first->p_taskprev;
484 		first->p_taskprev = p;
485 		p->p_tasknext = first;
486 		p->p_taskprev = prev;
487 		prev->p_tasknext = p;
488 	}
489 	tk->tk_memb_list = p;
490 	task_hold(tk);
491 	p->p_task = tk;
492 
493 	/*
494 	 * Now that the linkage from process to task and project is
495 	 * complete, do the required callbacks for the task and project
496 	 * rctl sets.
497 	 */
498 	e.rcep_p.proj = tk->tk_proj;
499 	e.rcep_t = RCENTITY_PROJECT;
500 	(void) rctl_set_dup(NULL, NULL, p, &e, tk->tk_proj->kpj_rctls, NULL,
501 	    RCD_CALLBACK);
502 
503 	e.rcep_p.task = tk;
504 	e.rcep_t = RCENTITY_TASK;
505 	(void) rctl_set_dup(NULL, NULL, p, &e, tk->tk_rctls, NULL,
506 	    RCD_CALLBACK);
507 
508 }
509 
510 /*
511  * task_begin()
512  *
513  * Overview
514  *   A process constructing a new task calls task_begin() to initialize the
515  *   task, by attaching itself as a member.
516  *
517  * Return values
518  *   None.
519  *
520  * Caller's context
521  *   pidlock and p_lock must be held across the call to task_begin().
522  */
523 void
524 task_begin(task_t *tk, proc_t *p)
525 {
526 	timestruc_t ts;
527 	task_usage_t *tu;
528 
529 	ASSERT(MUTEX_HELD(&pidlock));
530 	ASSERT(MUTEX_HELD(&p->p_lock));
531 
532 	mutex_enter(&tk->tk_usage_lock);
533 	tu = tk->tk_usage;
534 	gethrestime(&ts);
535 	tu->tu_startsec = (uint64_t)ts.tv_sec;
536 	tu->tu_startnsec = (uint64_t)ts.tv_nsec;
537 	mutex_exit(&tk->tk_usage_lock);
538 
539 	/*
540 	 * Join process to the task as a member.
541 	 */
542 	task_attach(tk, p);
543 }
544 
545 /*
546  * void task_detach(proc_t *)
547  *
548  * Overview
549  *   task_detach() removes the specified process from its task.  task_detach
550  *   sets the process's task membership to NULL, in anticipation of a final exit
551  *   or of joining a new task.  Because task_rele() requires a context safe for
552  *   KM_SLEEP allocations, a task_detach() is followed by a subsequent
553  *   task_rele() once appropriate context is available.
554  *
555  *   Because task_detach() involves relinquishing the process's membership in
556  *   the project, any observational rctls the process may have had on the task
557  *   or project are destroyed.
558  *
559  * Return values
560  *   None.
561  *
562  * Caller's context
563  *   pidlock and p_lock held across task_detach().
564  */
565 void
566 task_detach(proc_t *p)
567 {
568 	task_t *tk = p->p_task;
569 
570 	ASSERT(MUTEX_HELD(&pidlock));
571 	ASSERT(MUTEX_HELD(&p->p_lock));
572 	ASSERT(p->p_task != NULL);
573 	ASSERT(tk->tk_memb_list != NULL);
574 
575 	if (tk->tk_memb_list == p)
576 		tk->tk_memb_list = p->p_tasknext;
577 	if (tk->tk_memb_list == p)
578 		tk->tk_memb_list = NULL;
579 	p->p_taskprev->p_tasknext = p->p_tasknext;
580 	p->p_tasknext->p_taskprev = p->p_taskprev;
581 
582 	rctl_set_tearoff(p->p_task->tk_rctls, p);
583 	rctl_set_tearoff(p->p_task->tk_proj->kpj_rctls, p);
584 
585 	p->p_task = NULL;
586 	p->p_tasknext = p->p_taskprev = NULL;
587 }
588 
589 /*
590  * task_change(task_t *, proc_t *)
591  *
592  * Overview
593  *   task_change() removes the specified process from its current task.  The
594  *   process is then attached to the specified task.  This routine is called
595  *   from settaskid() when process is being moved to a new task.
596  *
597  * Return values
598  *   None.
599  *
600  * Caller's context
601  *   pidlock and p_lock held across task_change()
602  */
603 void
604 task_change(task_t *newtk, proc_t *p)
605 {
606 	task_t *oldtk = p->p_task;
607 
608 	ASSERT(MUTEX_HELD(&pidlock));
609 	ASSERT(MUTEX_HELD(&p->p_lock));
610 	ASSERT(oldtk != NULL);
611 	ASSERT(oldtk->tk_memb_list != NULL);
612 
613 	mutex_enter(&p->p_zone->zone_nlwps_lock);
614 	oldtk->tk_nlwps -= p->p_lwpcnt;
615 	mutex_exit(&p->p_zone->zone_nlwps_lock);
616 
617 	mutex_enter(&newtk->tk_zone->zone_nlwps_lock);
618 	newtk->tk_nlwps += p->p_lwpcnt;
619 	mutex_exit(&newtk->tk_zone->zone_nlwps_lock);
620 
621 	task_detach(p);
622 	task_begin(newtk, p);
623 }
624 
625 /*
626  * task_end()
627  *
628  * Overview
629  *   task_end() contains the actions executed once the final member of
630  *   a task has released the task, and all actions connected with the task, such
631  *   as committing an accounting record to a file, are completed.  It is called
632  *   by the known last consumer of the task information.  Additionally,
633  *   task_end() must never refer to any process in the system.
634  *
635  * Return values
636  *   None.
637  *
638  * Caller's context
639  *   No restrictions on context, beyond that given above.
640  */
641 void
642 task_end(task_t *tk)
643 {
644 	ASSERT(tk->tk_hold_count == 0);
645 
646 	project_rele(tk->tk_proj);
647 	kmem_free(tk->tk_usage, sizeof (task_usage_t));
648 	if (tk->tk_prevusage != NULL)
649 		kmem_free(tk->tk_prevusage, sizeof (task_usage_t));
650 	if (tk->tk_zoneusage != NULL)
651 		kmem_free(tk->tk_zoneusage, sizeof (task_usage_t));
652 	rctl_set_free(tk->tk_rctls);
653 	id_free(taskid_space, tk->tk_tkid);
654 	zone_task_rele(tk->tk_zone);
655 	kmem_cache_free(task_cache, tk);
656 }
657 
658 static void
659 changeproj(proc_t *p, kproject_t *kpj, zone_t *zone, void *projbuf,
660     void *zonebuf)
661 {
662 	kproject_t *oldkpj;
663 	kthread_t *t;
664 
665 	ASSERT(MUTEX_HELD(&pidlock));
666 	ASSERT(MUTEX_HELD(&p->p_lock));
667 
668 	if ((t = p->p_tlist) != NULL) {
669 		do {
670 			(void) project_hold(kpj);
671 
672 			thread_lock(t);
673 			oldkpj = ttoproj(t);
674 			t->t_proj = kpj;
675 			t->t_pre_sys = 1;		/* For cred update */
676 			thread_unlock(t);
677 			fss_changeproj(t, kpj, zone, projbuf, zonebuf);
678 
679 			project_rele(oldkpj);
680 		} while ((t = t->t_forw) != p->p_tlist);
681 	}
682 }
683 
684 /*
685  * task_join()
686  *
687  * Overview
688  *   task_join() contains the actions that must be executed when the first
689  *   member (curproc) of a newly created task joins it.  It may never fail.
690  *
691  *   The caller must make sure holdlwps() is called so that all other lwps are
692  *   stopped prior to calling this function.
693  *
694  *   NB: It returns with curproc->p_lock held.
695  *
696  * Return values
697  *   Pointer to the old task.
698  *
699  * Caller's context
700  *   cpu_lock must be held entering the function.  It will acquire pidlock,
701  *   p_crlock and p_lock during execution.
702  */
703 task_t *
704 task_join(task_t *tk, uint_t flags)
705 {
706 	proc_t *p = ttoproc(curthread);
707 	task_t *prev_tk;
708 	void *projbuf, *zonebuf;
709 	zone_t *zone = tk->tk_zone;
710 	projid_t projid = tk->tk_proj->kpj_id;
711 	cred_t *oldcr;
712 
713 	/*
714 	 * We can't know for sure if holdlwps() was called, but we can check to
715 	 * ensure we're single-threaded.
716 	 */
717 	ASSERT(curthread == p->p_agenttp || p->p_lwprcnt == 1);
718 
719 	/*
720 	 * Changing the credential is always hard because we cannot
721 	 * allocate memory when holding locks but we don't know whether
722 	 * we need to change it.  We first get a reference to the current
723 	 * cred if we need to change it.  Then we create a credential
724 	 * with an updated project id.  Finally we install it, first
725 	 * releasing the reference we had on the p_cred at the time we
726 	 * acquired the lock the first time and later we release the
727 	 * reference to p_cred at the time we acquired the lock the
728 	 * second time.
729 	 */
730 	mutex_enter(&p->p_crlock);
731 	if (crgetprojid(p->p_cred) == projid)
732 		oldcr = NULL;
733 	else
734 		crhold(oldcr = p->p_cred);
735 	mutex_exit(&p->p_crlock);
736 
737 	if (oldcr != NULL) {
738 		cred_t *newcr = crdup(oldcr);
739 		crsetprojid(newcr, projid);
740 		crfree(oldcr);
741 
742 		mutex_enter(&p->p_crlock);
743 		oldcr = p->p_cred;
744 		p->p_cred = newcr;
745 		mutex_exit(&p->p_crlock);
746 		crfree(oldcr);
747 	}
748 
749 	/*
750 	 * Make sure that the number of processor sets is constant
751 	 * across this operation.
752 	 */
753 	ASSERT(MUTEX_HELD(&cpu_lock));
754 
755 	projbuf = fss_allocbuf(FSS_NPSET_BUF, FSS_ALLOC_PROJ);
756 	zonebuf = fss_allocbuf(FSS_NPSET_BUF, FSS_ALLOC_ZONE);
757 
758 	mutex_enter(&pidlock);
759 	mutex_enter(&p->p_lock);
760 
761 	prev_tk = p->p_task;
762 	task_change(tk, p);
763 
764 	/*
765 	 * Now move threads one by one to their new project.
766 	 */
767 	changeproj(p, tk->tk_proj, zone, projbuf, zonebuf);
768 	if (flags & TASK_FINAL)
769 		p->p_task->tk_flags |= TASK_FINAL;
770 
771 	mutex_exit(&pidlock);
772 
773 	fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
774 	fss_freebuf(projbuf, FSS_ALLOC_PROJ);
775 	return (prev_tk);
776 }
777 
778 /*
779  * rctl ops vectors
780  */
781 static rctl_ops_t task_lwps_ops = {
782 	rcop_no_action,
783 	task_lwps_usage,
784 	task_lwps_set,
785 	task_lwps_test
786 };
787 
788 static rctl_ops_t task_cpu_time_ops = {
789 	rcop_no_action,
790 	task_cpu_time_usage,
791 	rcop_no_set,
792 	task_cpu_time_test
793 };
794 
795 /*ARGSUSED*/
796 /*
797  * void task_init(void)
798  *
799  * Overview
800  *   task_init() initializes task-related hashes, caches, and the task id
801  *   space.  Additionally, task_init() establishes p0 as a member of task0.
802  *   Called by main().
803  *
804  * Return values
805  *   None.
806  *
807  * Caller's context
808  *   task_init() must be called prior to MP startup.
809  */
810 void
811 task_init(void)
812 {
813 	proc_t *p = &p0;
814 	mod_hash_hndl_t hndl;
815 	rctl_set_t *set;
816 	rctl_alloc_gp_t *gp;
817 	rctl_entity_p_t e;
818 	/*
819 	 * Initialize task_cache and taskid_space.
820 	 */
821 	task_cache = kmem_cache_create("task_cache", sizeof (task_t),
822 	    0, NULL, NULL, NULL, NULL, NULL, 0);
823 	taskid_space = id_space_create("taskid_space", 0, MAX_TASKID);
824 
825 	/*
826 	 * Initialize task hash table.
827 	 */
828 	task_hash = mod_hash_create_idhash("task_hash", task_hash_size,
829 	    mod_hash_null_valdtor);
830 
831 	/*
832 	 * Initialize task-based rctls.
833 	 */
834 	rc_task_lwps = rctl_register("task.max-lwps", RCENTITY_TASK,
835 	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_COUNT, INT_MAX, INT_MAX,
836 	    &task_lwps_ops);
837 	rc_task_cpu_time = rctl_register("task.max-cpu-time", RCENTITY_TASK,
838 	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_DENY_NEVER |
839 	    RCTL_GLOBAL_CPU_TIME | RCTL_GLOBAL_INFINITE |
840 	    RCTL_GLOBAL_UNOBSERVABLE | RCTL_GLOBAL_SECONDS, UINT64_MAX,
841 	    UINT64_MAX, &task_cpu_time_ops);
842 
843 	/*
844 	 * Create task0 and place p0 in it as a member.
845 	 */
846 	task0p = kmem_cache_alloc(task_cache, KM_SLEEP);
847 	bzero(task0p, sizeof (task_t));
848 
849 	task0p->tk_tkid = id_alloc(taskid_space);
850 	task0p->tk_usage = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
851 	task0p->tk_proj = project_hold_by_id(0, GLOBAL_ZONEID,
852 	    PROJECT_HOLD_INSERT);
853 	task0p->tk_flags = TASK_NORMAL;
854 	task0p->tk_nlwps = p->p_lwpcnt;
855 	task0p->tk_zone = global_zone;
856 
857 	set = rctl_set_create();
858 	gp = rctl_set_init_prealloc(RCENTITY_TASK);
859 	mutex_enter(&curproc->p_lock);
860 	e.rcep_p.task = task0p;
861 	e.rcep_t = RCENTITY_TASK;
862 	task0p->tk_rctls = rctl_set_init(RCENTITY_TASK, curproc, &e, set, gp);
863 	mutex_exit(&curproc->p_lock);
864 	rctl_prealloc_destroy(gp);
865 
866 	(void) mod_hash_reserve(task_hash, &hndl);
867 	mutex_enter(&task_hash_lock);
868 	ASSERT(task_find(task0p->tk_tkid, GLOBAL_ZONEID) == NULL);
869 	if (mod_hash_insert_reserve(task_hash,
870 	    (mod_hash_key_t)(uintptr_t)task0p->tk_tkid,
871 	    (mod_hash_val_t *)task0p, hndl) != 0) {
872 		mod_hash_cancel(task_hash, &hndl);
873 		panic("unable to insert task %d(%p)", task0p->tk_tkid,
874 		    (void *)task0p);
875 	}
876 	mutex_exit(&task_hash_lock);
877 
878 	task0p->tk_memb_list = p;
879 
880 	/*
881 	 * Initialize task pointers for p0, including doubly linked list of task
882 	 * members.
883 	 */
884 	p->p_task = task0p;
885 	p->p_taskprev = p->p_tasknext = p;
886 	task_hold(task0p);
887 }
888