/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

#include <sys/atomic.h>
#include <sys/callb.h>
#include <sys/cmn_err.h>
#include <sys/exacct.h>
#include <sys/id_space.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/modhash.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/project.h>
#include <sys/rctl.h>
#include <sys/systm.h>
#include <sys/task.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/zone.h>
#include <sys/cpuvar.h>
#include <sys/fss.h>
#include <sys/class.h>

/*
 * Tasks
 *
 * A task is a collection of processes, associated with a common project ID
 * and related by a common initial parent.  The task primarily represents a
 * natural process sequence with known resource usage, although it can also be
 * viewed as a convenient grouping of processes for signal delivery, processor
 * binding, and administrative operations.
 *
 * Membership and observership
 *   We can conceive of situations where processes outside of the task may
 *   wish to examine the resource usage of the task.  Similarly, a number of
 *   the administrative operations on a task can be performed by processes
 *   that are not members of the task.  Accordingly, we must design a locking
 *   strategy where observers of the task, who wish to examine or operate on
 *   the task, and members of the task, who can perform the mentioned
 *   operations as well as leave the task, see a consistent and correct
 *   representation of the task at all times.
 *
 * Locking
 *   Because task membership is a new relation between processes, its locking
 *   becomes an additional responsibility of the pidlock/p_lock locking
 *   sequence; however, tasks closely resemble sessions and the session
 *   locking model is mostly appropriate for the interaction of tasks,
 *   processes, and procfs.
 *
 * kmutex_t task_hash_lock
 *   task_hash_lock is a global lock protecting the contents of the task
 *   ID-to-task pointer hash.  Holders of task_hash_lock must not attempt to
 *   acquire pidlock or p_lock.
 * uint_t tk_hold_count
 *   tk_hold_count, the number of members and observers of the current task,
 *   must be manipulated atomically.
 * proc_t *tk_memb_list
 * proc_t *p_tasknext
 * proc_t *p_taskprev
 *   The task's membership list is protected by pidlock, and is therefore
 *   always acquired before any of its members' p_lock mutexes.  The p_task
 *   member of the proc structure is protected by pidlock or p_lock for
 *   reading, and by both pidlock and p_lock for modification, as is done for
 *   p_sessp.  The key point is that only the process itself can modify its
 *   p_task, and no other entity on the system can.  (/proc will use prlock()
 *   to prevent the process from leaving, as opposed to pidlock.)
 * kmutex_t tk_usage_lock
 *   tk_usage_lock is a per-task lock protecting the contents of the task
 *   usage structure and the tk_nlwps counter for the task.max-lwps resource
 *   control.
 */
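
/*
 * As an illustration of the strategy above (a sketch, not lifted from any
 * particular caller): an observer typically takes a reference through the
 * hash, examines the task, and then drops the reference, e.g.
 *
 *	task_t *tk = task_hold_by_id(id);	(takes task_hash_lock)
 *	if (tk != NULL) {
 *		mutex_enter(&tk->tk_usage_lock);
 *		... read tk->tk_usage ...
 *		mutex_exit(&tk->tk_usage_lock);
 *		task_rele(tk);			(may commit the task)
 *	}
 */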

int task_hash_size = 256;
static kmutex_t task_hash_lock;
static mod_hash_t *task_hash;

static id_space_t *taskid_space;	/* global taskid space */
static kmem_cache_t *task_cache;	/* kmem cache for task structures */

rctl_hndl_t rc_task_lwps;
rctl_hndl_t rc_task_nprocs;
rctl_hndl_t rc_task_cpu_time;

/*
 * Resource usage is committed using task queues; if taskq_dispatch() fails
 * due to resource constraints, the task is placed on a list for background
 * processing by the task_commit_thread() backup thread.
 */
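
/*
 * A note on the backlog structure: the list is a singly-linked FIFO threaded
 * through tk_commit_next; task_rele() appends at task_commit_tail and
 * task_commit() consumes from task_commit_head.
 */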
static kmutex_t task_commit_lock;	/* protects list pointers and cv */
static kcondvar_t task_commit_cv;	/* wakeup task_commit_thread */
static task_t *task_commit_head = NULL;
static task_t *task_commit_tail = NULL;
kthread_t *task_commit_thread;

static void task_commit(void);
static kstat_t *task_kstat_create(task_t *, zone_t *);
static void task_kstat_delete(task_t *);

/*
 * static rctl_qty_t task_lwps_usage(rctl_t *r, proc_t *p)
 *
 * Overview
 *   task_lwps_usage() is the usage operation for the resource control
 *   associated with the number of LWPs in a task.
 *
 * Return values
 *   The number of LWPs in the given task is returned.
 *
 * Caller's context
 *   The p->p_lock must be held across the call.
 */
/*ARGSUSED*/
static rctl_qty_t
task_lwps_usage(rctl_t *r, proc_t *p)
{
	task_t *t;
	rctl_qty_t nlwps;

	ASSERT(MUTEX_HELD(&p->p_lock));

	t = p->p_task;
	mutex_enter(&p->p_zone->zone_nlwps_lock);
	nlwps = t->tk_nlwps;
	mutex_exit(&p->p_zone->zone_nlwps_lock);

	return (nlwps);
}

/*
 * static int task_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
 *   rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
 *
 * Overview
 *   task_lwps_test() is the test-if-valid-increment for the resource control
 *   for the number of LWPs in a task.
 *
 * Return values
 *   0 if the threshold limit was not passed, 1 if the limit was passed.
 *
 * Caller's context
 *   p->p_lock must be held across the call.
 */
/*ARGSUSED*/
static int
task_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
    rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t nlwps;

	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_TASK);
	if (e->rcep_p.task == NULL)
		return (0);

	ASSERT(MUTEX_HELD(&(e->rcep_p.task->tk_zone->zone_nlwps_lock)));
	nlwps = e->rcep_p.task->tk_nlwps;

	if (nlwps + incr > rcntl->rcv_value)
		return (1);

	return (0);
}

/*ARGSUSED*/
static int
task_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_TASK);
	if (e->rcep_p.task == NULL)
		return (0);

	e->rcep_p.task->tk_nlwps_ctl = nv;
	return (0);
}

/*ARGSUSED*/
static rctl_qty_t
task_nprocs_usage(rctl_t *r, proc_t *p)
{
	task_t *t;
	rctl_qty_t nprocs;

	ASSERT(MUTEX_HELD(&p->p_lock));

	t = p->p_task;
	mutex_enter(&p->p_zone->zone_nlwps_lock);
	nprocs = t->tk_nprocs;
	mutex_exit(&p->p_zone->zone_nlwps_lock);

	return (nprocs);
}

/*ARGSUSED*/
static int
task_nprocs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
    rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t nprocs;

	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_TASK);
	if (e->rcep_p.task == NULL)
		return (0);

	ASSERT(MUTEX_HELD(&(e->rcep_p.task->tk_zone->zone_nlwps_lock)));
	nprocs = e->rcep_p.task->tk_nprocs;

	if (nprocs + incr > rcntl->rcv_value)
		return (1);

	return (0);
}

/*ARGSUSED*/
static int
task_nprocs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_TASK);
	if (e->rcep_p.task == NULL)
		return (0);

	e->rcep_p.task->tk_nprocs_ctl = nv;
	return (0);
}

/*
 * static rctl_qty_t task_cpu_time_usage(rctl_t *r, proc_t *p)
 *
 * Overview
 *   task_cpu_time_usage() is the usage operation for the resource control
 *   associated with the total accrued CPU seconds for a task.
 *
 * Return values
 *   The number of CPU seconds consumed by the task is returned.
 *
 * Caller's context
 *   p->p_lock must be held across the call.
 */
/*ARGSUSED*/
static rctl_qty_t
task_cpu_time_usage(rctl_t *r, proc_t *p)
{
	task_t *t = p->p_task;

	ASSERT(MUTEX_HELD(&p->p_lock));
	return (t->tk_cpu_time);
}

/*
 * rctl_qty_t task_cpu_time_incr(task_t *t, rctl_qty_t incr)
 *
 * Overview
 *   task_cpu_time_incr() increments the amount of CPU time used
 *   by this task.
 *
 * Return values
 *   The task's updated CPU time, in seconds, if a second or more of time
 *   has accumulated; 0 otherwise.
 *
 * Caller's context
 *   This is called by the clock tick accounting function to charge
 *   CPU time to a task.
 */
rctl_qty_t
task_cpu_time_incr(task_t *t, rctl_qty_t incr)
{
	rctl_qty_t ret = 0;

	mutex_enter(&t->tk_cpu_time_lock);
	t->tk_cpu_ticks += incr;
	if (t->tk_cpu_ticks >= hz) {
		t->tk_cpu_time += t->tk_cpu_ticks / hz;
		t->tk_cpu_ticks = t->tk_cpu_ticks % hz;
		ret = t->tk_cpu_time;
	}
	mutex_exit(&t->tk_cpu_time_lock);

	return (ret);
}
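
/*
 * For example (a worked sketch, with hz == 100): if tk_cpu_ticks is 90 and
 * 30 ticks are charged, tk_cpu_ticks becomes 120, so tk_cpu_time advances by
 * one second, 20 ticks are carried over, and the updated tk_cpu_time is
 * returned to the caller.
 */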

/*
 * static int task_cpu_time_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
 *   struct rctl_val *rcntl, rctl_qty_t incr, uint_t flags)
 *
 * Overview
 *   task_cpu_time_test() is the test-if-valid-increment for the resource
 *   control for the total accrued CPU seconds for a task.
 *
 * Return values
 *   0 if the threshold limit was not passed, 1 if the limit was passed.
 *
 * Caller's context
 *   p->p_lock must be held across the call.
 */
/*ARGSUSED*/
static int
task_cpu_time_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
    struct rctl_val *rcntl, rctl_qty_t incr, uint_t flags)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_TASK);
	if (e->rcep_p.task == NULL)
		return (0);

	if (incr >= rcntl->rcv_value)
		return (1);

	return (0);
}

static task_t *
task_find(taskid_t id, zoneid_t zoneid)
{
	task_t *tk;

	ASSERT(MUTEX_HELD(&task_hash_lock));

	if (mod_hash_find(task_hash, (mod_hash_key_t)(uintptr_t)id,
	    (mod_hash_val_t *)&tk) == MH_ERR_NOTFOUND ||
	    (zoneid != ALL_ZONES && zoneid != tk->tk_zone->zone_id))
		return (NULL);

	return (tk);
}

/*
 * task_hold_by_id(), task_hold_by_id_zone()
 *
 * Overview
 *   task_hold_by_id() is used to take a reference on a task by its task id,
 *   supporting the various system call interfaces for obtaining resource
 *   data, delivering signals, and so forth.
 *
 * Return values
 *   Returns a pointer to the task_t with taskid_t id.  The task is returned
 *   with its hold count incremented by one.  Returns NULL if there is no
 *   task with the requested id.
 *
 * Caller's context
 *   Caller must not be holding task_hash_lock.  No restrictions on context.
 */
task_t *
task_hold_by_id_zone(taskid_t id, zoneid_t zoneid)
{
	task_t *tk;

	mutex_enter(&task_hash_lock);
	if ((tk = task_find(id, zoneid)) != NULL)
		atomic_inc_32(&tk->tk_hold_count);
	mutex_exit(&task_hash_lock);

	return (tk);
}

task_t *
task_hold_by_id(taskid_t id)
{
	zoneid_t zoneid;

	if (INGLOBALZONE(curproc))
		zoneid = ALL_ZONES;
	else
		zoneid = getzoneid();
	return (task_hold_by_id_zone(id, zoneid));
}

/*
 * void task_hold(task_t *)
 *
 * Overview
 *   task_hold() is used to take an additional reference to the given task.
 *
 * Return values
 *   None.
 *
 * Caller's context
 *   No restriction on context.
 */
void
task_hold(task_t *tk)
{
	atomic_inc_32(&tk->tk_hold_count);
}

/*
 * void task_rele(task_t *)
 *
 * Overview
 *   task_rele() relinquishes a reference on the given task, which was
 *   acquired via task_hold() or task_hold_by_id().  If this is the last
 *   member or observer of the task, dispatch it for commitment via the
 *   accounting subsystem.
 *
 * Return values
 *   None.
 *
 * Caller's context
 *   Caller must not be holding the task_hash_lock.
 */
void
task_rele(task_t *tk)
{
	mutex_enter(&task_hash_lock);
	if (atomic_add_32_nv(&tk->tk_hold_count, -1) > 0) {
		mutex_exit(&task_hash_lock);
		return;
	}

	ASSERT(tk->tk_nprocs == 0);

	mutex_enter(&tk->tk_zone->zone_nlwps_lock);
	tk->tk_proj->kpj_ntasks--;
	mutex_exit(&tk->tk_zone->zone_nlwps_lock);

	task_kstat_delete(tk);

	if (mod_hash_destroy(task_hash,
	    (mod_hash_key_t)(uintptr_t)tk->tk_tkid) != 0)
		panic("unable to delete task %d", tk->tk_tkid);
	mutex_exit(&task_hash_lock);

	/*
	 * At this point, there are no members or observers of the task, so we
	 * can safely send it on for commitment to the accounting subsystem.
	 * The task will be destroyed in task_end() subsequent to commitment.
	 * Since we may be called with pidlock held, taskq_dispatch() cannot
	 * sleep.  Commitment is handled by a backup thread in case
	 * dispatching the task fails.
	 */
	if (taskq_dispatch(exacct_queue, exacct_commit_task, tk,
	    TQ_NOSLEEP | TQ_NOQUEUE) == TASKQID_INVALID) {
		mutex_enter(&task_commit_lock);
		if (task_commit_head == NULL) {
			task_commit_head = task_commit_tail = tk;
		} else {
			task_commit_tail->tk_commit_next = tk;
			task_commit_tail = tk;
		}
		cv_signal(&task_commit_cv);
		mutex_exit(&task_commit_lock);
	}
}

/*
 * task_t *task_create(projid_t, zone_t *)
 *
 * Overview
 *   A process constructing a new task calls task_create() to construct and
 *   preinitialize the task for the appropriate destination project.  Only
 *   one task, the primordial task0, is not created with task_create().
 *
 * Return values
 *   Returns a pointer to the newly created, preinitialized task.
 *
 * Caller's context
 *   Caller's context should be safe for KM_SLEEP allocations.  The caller
 *   should appropriately bump the kpj_ntasks counter on the project that
 *   contains this task.
 */
task_t *
task_create(projid_t projid, zone_t *zone)
{
	task_t *tk = kmem_cache_alloc(task_cache, KM_SLEEP);
	task_t *ancestor_tk;
	taskid_t tkid;
	task_usage_t *tu = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
	mod_hash_hndl_t hndl;
	rctl_set_t *set = rctl_set_create();
	rctl_alloc_gp_t *gp;
	rctl_entity_p_t e;

	bzero(tk, sizeof (task_t));

	tk->tk_tkid = tkid = id_alloc(taskid_space);
	tk->tk_nlwps = 0;
	tk->tk_nlwps_ctl = INT_MAX;
	tk->tk_nprocs = 0;
	tk->tk_nprocs_ctl = INT_MAX;
	tk->tk_usage = tu;
	tk->tk_inherited = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
	tk->tk_proj = project_hold_by_id(projid, zone, PROJECT_HOLD_INSERT);
	tk->tk_flags = TASK_NORMAL;
	tk->tk_commit_next = NULL;

	/*
	 * Copy ancestor task's resource controls.
	 */
	zone_task_hold(zone);
	mutex_enter(&curproc->p_lock);
	ancestor_tk = curproc->p_task;
	task_hold(ancestor_tk);
	tk->tk_zone = zone;
	mutex_exit(&curproc->p_lock);

	for (;;) {
		gp = rctl_set_dup_prealloc(ancestor_tk->tk_rctls);

		mutex_enter(&ancestor_tk->tk_rctls->rcs_lock);
		if (rctl_set_dup_ready(ancestor_tk->tk_rctls, gp))
			break;

		mutex_exit(&ancestor_tk->tk_rctls->rcs_lock);

		rctl_prealloc_destroy(gp);
	}

	/*
	 * At this point, curproc does not have the appropriate linkage
	 * through the task to the project.  So, rctl_set_dup should only
	 * copy the rctls, and leave the callbacks for later.
	 */
	e.rcep_p.task = tk;
	e.rcep_t = RCENTITY_TASK;
	tk->tk_rctls = rctl_set_dup(ancestor_tk->tk_rctls, curproc, curproc,
	    &e, set, gp, RCD_DUP);
	mutex_exit(&ancestor_tk->tk_rctls->rcs_lock);

	rctl_prealloc_destroy(gp);

	/*
	 * Record the ancestor task's ID for use by extended accounting.
	 */
	tu->tu_anctaskid = ancestor_tk->tk_tkid;
	task_rele(ancestor_tk);

	/*
	 * Put new task structure in the hash table.
	 */
	(void) mod_hash_reserve(task_hash, &hndl);
	mutex_enter(&task_hash_lock);
	ASSERT(task_find(tkid, zone->zone_id) == NULL);
	if (mod_hash_insert_reserve(task_hash, (mod_hash_key_t)(uintptr_t)tkid,
	    (mod_hash_val_t *)tk, hndl) != 0) {
		mod_hash_cancel(task_hash, &hndl);
		panic("unable to insert task %d(%p)", tkid, (void *)tk);
	}
	mutex_exit(&task_hash_lock);

	tk->tk_nprocs_kstat = task_kstat_create(tk, zone);
	return (tk);
}

/*
 * void task_attach(task_t *, proc_t *)
 *
 * Overview
 *   task_attach() is used to attach a process to a task; this operation is
 *   only performed as a result of a fork() or settaskid() system call.  The
 *   proc_t's p_tasknext and p_taskprev fields will be set such that the
 *   proc_t is a member of the doubly-linked list of proc_t's that make up
 *   the task.
 *
 * Return values
 *   None.
 *
 * Caller's context
 *   pidlock and p->p_lock must be held on entry.
 */
void
task_attach(task_t *tk, proc_t *p)
{
	proc_t *first, *prev;

	ASSERT(tk != NULL);
	ASSERT(p != NULL);
	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&p->p_lock));

	if (tk->tk_memb_list == NULL) {
		p->p_tasknext = p;
		p->p_taskprev = p;
	} else {
		first = tk->tk_memb_list;
		prev = first->p_taskprev;
		first->p_taskprev = p;
		p->p_tasknext = first;
		p->p_taskprev = prev;
		prev->p_tasknext = p;
	}
	tk->tk_memb_list = p;
	task_hold(tk);
	p->p_task = tk;
}
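
/*
 * For example (a sketch): attaching p to a task whose circular membership
 * list currently holds A and B (tk_memb_list == A, A->p_taskprev == B)
 * links p in ahead of A, giving the ring p -> A -> B -> p with
 * tk_memb_list == p.
 */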

/*
 * task_begin()
 *
 * Overview
 *   A process constructing a new task calls task_begin() to initialize the
 *   task, by attaching itself as a member.
 *
 * Return values
 *   None.
 *
 * Caller's context
 *   pidlock and p_lock must be held across the call to task_begin().
 */
void
task_begin(task_t *tk, proc_t *p)
{
	timestruc_t ts;
	task_usage_t *tu;
	rctl_entity_p_t e;

	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&p->p_lock));

	mutex_enter(&tk->tk_usage_lock);
	tu = tk->tk_usage;
	gethrestime(&ts);
	tu->tu_startsec = (uint64_t)ts.tv_sec;
	tu->tu_startnsec = (uint64_t)ts.tv_nsec;
	mutex_exit(&tk->tk_usage_lock);

	/*
	 * Join process to the task as a member.
	 */
	task_attach(tk, p);

	/*
	 * Now that the linkage from process to task is complete, do the
	 * required callback for the task rctl set.
	 */
	e.rcep_p.task = tk;
	e.rcep_t = RCENTITY_TASK;
	(void) rctl_set_dup(NULL, NULL, p, &e, tk->tk_rctls, NULL,
	    RCD_CALLBACK);
}

/*
 * void task_detach(proc_t *)
 *
 * Overview
 *   task_detach() removes the specified process from its task.
 *   task_detach() sets the process's task membership to NULL, in
 *   anticipation of a final exit or of joining a new task.  Because
 *   task_rele() requires a context safe for KM_SLEEP allocations, a
 *   task_detach() is followed by a subsequent task_rele() once appropriate
 *   context is available.
 *
 *   Because task_detach() involves relinquishing the process's membership
 *   in the project, any observational rctls the process may have had on the
 *   task or project are destroyed.
 *
 * Return values
 *   None.
 *
 * Caller's context
 *   pidlock and p_lock held across task_detach().
 */
void
task_detach(proc_t *p)
{
	task_t *tk = p->p_task;

	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(p->p_task != NULL);
	ASSERT(tk->tk_memb_list != NULL);

	if (tk->tk_memb_list == p)
		tk->tk_memb_list = p->p_tasknext;
	if (tk->tk_memb_list == p)
		tk->tk_memb_list = NULL;
	p->p_taskprev->p_tasknext = p->p_tasknext;
	p->p_tasknext->p_taskprev = p->p_taskprev;

	rctl_set_tearoff(p->p_task->tk_rctls, p);
	rctl_set_tearoff(p->p_task->tk_proj->kpj_rctls, p);

	p->p_task = NULL;
	p->p_tasknext = p->p_taskprev = NULL;
}

/*
 * task_change(task_t *, proc_t *)
 *
 * Overview
 *   task_change() removes the specified process from its current task.  The
 *   process is then attached to the specified task.  This routine is called
 *   from settaskid() when a process is being moved to a new task.
 *
 * Return values
 *   None.
 *
 * Caller's context
 *   pidlock and p_lock held across task_change().
 */
void
task_change(task_t *newtk, proc_t *p)
{
	task_t *oldtk = p->p_task;

	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(oldtk != NULL);
	ASSERT(oldtk->tk_memb_list != NULL);

	mutex_enter(&oldtk->tk_zone->zone_nlwps_lock);
	oldtk->tk_nlwps -= p->p_lwpcnt;
	oldtk->tk_nprocs--;
	mutex_exit(&oldtk->tk_zone->zone_nlwps_lock);

	mutex_enter(&newtk->tk_zone->zone_nlwps_lock);
	newtk->tk_nlwps += p->p_lwpcnt;
	newtk->tk_nprocs++;
	mutex_exit(&newtk->tk_zone->zone_nlwps_lock);

	task_detach(p);
	task_begin(newtk, p);
	exacct_move_mstate(p, oldtk, newtk);
}

/*
 * task_end()
 *
 * Overview
 *   task_end() contains the actions executed once the final member of a
 *   task has released the task, and all actions connected with the task,
 *   such as committing an accounting record to a file, are completed.  It
 *   is called by the known last consumer of the task information.
 *   Additionally, task_end() must never refer to any process in the system.
 *
 * Return values
 *   None.
 *
 * Caller's context
 *   No restrictions on context, beyond that given above.
 */
void
task_end(task_t *tk)
{
	ASSERT(tk->tk_hold_count == 0);

	project_rele(tk->tk_proj);
	kmem_free(tk->tk_usage, sizeof (task_usage_t));
	kmem_free(tk->tk_inherited, sizeof (task_usage_t));
	if (tk->tk_prevusage != NULL)
		kmem_free(tk->tk_prevusage, sizeof (task_usage_t));
	if (tk->tk_zoneusage != NULL)
		kmem_free(tk->tk_zoneusage, sizeof (task_usage_t));
	rctl_set_free(tk->tk_rctls);
	id_free(taskid_space, tk->tk_tkid);
	zone_task_rele(tk->tk_zone);
	kmem_cache_free(task_cache, tk);
}

static void
changeproj(proc_t *p, kproject_t *kpj, zone_t *zone, void *projbuf,
    void *zonebuf)
{
	kproject_t *oldkpj;
	kthread_t *t;

	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&p->p_lock));

	if ((t = p->p_tlist) != NULL) {
		do {
			(void) project_hold(kpj);

			thread_lock(t);
			oldkpj = ttoproj(t);

			/*
			 * Kick this thread so that it doesn't sit
			 * on a wrong wait queue.
			 */
			if (ISWAITING(t))
				setrun_locked(t);

			/*
			 * The thread wants to go on the project wait queue,
			 * but the waitq is changing.
			 */
			if (t->t_schedflag & TS_PROJWAITQ)
				t->t_schedflag &= ~TS_PROJWAITQ;

			t->t_proj = kpj;
			t->t_pre_sys = 1;	/* For cred update */
			thread_unlock(t);
			fss_changeproj(t, kpj, zone, projbuf, zonebuf);

			project_rele(oldkpj);
		} while ((t = t->t_forw) != p->p_tlist);
	}
}

/*
 * task_join()
 *
 * Overview
 *   task_join() contains the actions that must be executed when the first
 *   member (curproc) of a newly created task joins it.  It may never fail.
 *
 *   The caller must make sure holdlwps() is called so that all other lwps
 *   are stopped prior to calling this function.
 *
 *   NB: It returns with curproc->p_lock held.
 *
 * Return values
 *   Pointer to the old task.
 *
 * Caller's context
 *   cpu_lock must be held entering the function.  It will acquire pidlock,
 *   p_crlock and p_lock during execution.
 */
task_t *
task_join(task_t *tk, uint_t flags)
{
	proc_t *p = ttoproc(curthread);
	task_t *prev_tk;
	void *projbuf, *zonebuf;
	zone_t *zone = tk->tk_zone;
	projid_t projid = tk->tk_proj->kpj_id;
	cred_t *oldcr;

	/*
	 * We can't know for sure if holdlwps() was called, but we can check
	 * to ensure we're single-threaded.
	 */
	ASSERT(curthread == p->p_agenttp || p->p_lwprcnt == 1);

	/*
	 * Changing the credential is always hard because we cannot allocate
	 * memory while holding locks, yet we don't know in advance whether
	 * the credential needs to change.  We first take a reference on the
	 * current cred if it needs to change.  Then we create a credential
	 * with an updated project id.  Finally we install it: we release the
	 * reference taken on p_cred while holding p_crlock the first time,
	 * and then release the reference to the old p_cred taken while
	 * holding p_crlock the second time.
	 */
	mutex_enter(&p->p_crlock);
	if (crgetprojid(p->p_cred) == projid)
		oldcr = NULL;
	else
		crhold(oldcr = p->p_cred);
	mutex_exit(&p->p_crlock);

	if (oldcr != NULL) {
		cred_t *newcr = crdup(oldcr);
		crsetprojid(newcr, projid);
		crfree(oldcr);

		mutex_enter(&p->p_crlock);
		oldcr = p->p_cred;
		p->p_cred = newcr;
		mutex_exit(&p->p_crlock);
		crfree(oldcr);
	}

	/*
	 * Make sure that the number of processor sets is constant
	 * across this operation.
	 */
	ASSERT(MUTEX_HELD(&cpu_lock));

	projbuf = fss_allocbuf(FSS_NPSET_BUF, FSS_ALLOC_PROJ);
	zonebuf = fss_allocbuf(FSS_NPSET_BUF, FSS_ALLOC_ZONE);

	mutex_enter(&pidlock);
	mutex_enter(&p->p_lock);

	prev_tk = p->p_task;
	task_change(tk, p);

	/*
	 * Now move threads one by one to their new project.
	 */
	changeproj(p, tk->tk_proj, zone, projbuf, zonebuf);
	if (flags & TASK_FINAL)
		p->p_task->tk_flags |= TASK_FINAL;

	mutex_exit(&pidlock);

	fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
	fss_freebuf(projbuf, FSS_ALLOC_PROJ);
	return (prev_tk);
}
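
/*
 * A sketch of the expected calling sequence (loosely modeled on settaskid();
 * the real caller's locking and error handling differ):
 *
 *	... stop other lwps via holdlwps() ...
 *	mutex_enter(&cpu_lock);
 *	oldtk = task_join(newtk, 0);
 *	mutex_exit(&cpu_lock);
 *	mutex_exit(&curproc->p_lock);	(task_join() returned holding it)
 *	... continue the held lwps ...
 *	task_rele(oldtk);
 */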

/*
 * rctl ops vectors
 */
static rctl_ops_t task_lwps_ops = {
	rcop_no_action,
	task_lwps_usage,
	task_lwps_set,
	task_lwps_test
};

static rctl_ops_t task_procs_ops = {
	rcop_no_action,
	task_nprocs_usage,
	task_nprocs_set,
	task_nprocs_test
};

static rctl_ops_t task_cpu_time_ops = {
	rcop_no_action,
	task_cpu_time_usage,
	rcop_no_set,
	task_cpu_time_test
};

/*
 * void task_init(void)
 *
 * Overview
 *   task_init() initializes task-related hashes, caches, and the task id
 *   space.  Additionally, task_init() establishes p0 as a member of task0.
 *   Called by main().
 *
 * Return values
 *   None.
 *
 * Caller's context
 *   task_init() must be called prior to MP startup.
 */
void
task_init(void)
{
	proc_t *p = &p0;
	mod_hash_hndl_t hndl;
	rctl_set_t *set;
	rctl_alloc_gp_t *gp;
	rctl_entity_p_t e;

	/*
	 * Initialize task_cache and taskid_space.
	 */
	task_cache = kmem_cache_create("task_cache", sizeof (task_t),
	    0, NULL, NULL, NULL, NULL, NULL, 0);
	taskid_space = id_space_create("taskid_space", 0, MAX_TASKID);

	/*
	 * Initialize task hash table.
	 */
	task_hash = mod_hash_create_idhash("task_hash", task_hash_size,
	    mod_hash_null_valdtor);

	/*
	 * Initialize task-based rctls.
	 */
	rc_task_lwps = rctl_register("task.max-lwps", RCENTITY_TASK,
	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_COUNT, INT_MAX, INT_MAX,
	    &task_lwps_ops);
	rc_task_nprocs = rctl_register("task.max-processes", RCENTITY_TASK,
	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_COUNT, INT_MAX, INT_MAX,
	    &task_procs_ops);
	rc_task_cpu_time = rctl_register("task.max-cpu-time", RCENTITY_TASK,
	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_DENY_NEVER |
	    RCTL_GLOBAL_CPU_TIME | RCTL_GLOBAL_INFINITE |
	    RCTL_GLOBAL_UNOBSERVABLE | RCTL_GLOBAL_SECONDS, UINT64_MAX,
	    UINT64_MAX, &task_cpu_time_ops);

	/*
	 * Create task0 and place p0 in it as a member.
	 */
	task0p = kmem_cache_alloc(task_cache, KM_SLEEP);
	bzero(task0p, sizeof (task_t));

	task0p->tk_tkid = id_alloc(taskid_space);
	task0p->tk_usage = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
	task0p->tk_inherited = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
	task0p->tk_proj = project_hold_by_id(0, &zone0,
	    PROJECT_HOLD_INSERT);
	task0p->tk_flags = TASK_NORMAL;
	task0p->tk_nlwps = p->p_lwpcnt;
	task0p->tk_nprocs = 1;
	task0p->tk_zone = global_zone;
	task0p->tk_commit_next = NULL;

	set = rctl_set_create();
	gp = rctl_set_init_prealloc(RCENTITY_TASK);
	mutex_enter(&curproc->p_lock);
	e.rcep_p.task = task0p;
	e.rcep_t = RCENTITY_TASK;
	task0p->tk_rctls = rctl_set_init(RCENTITY_TASK, curproc, &e, set, gp);
	mutex_exit(&curproc->p_lock);
	rctl_prealloc_destroy(gp);

	(void) mod_hash_reserve(task_hash, &hndl);
	mutex_enter(&task_hash_lock);
	ASSERT(task_find(task0p->tk_tkid, GLOBAL_ZONEID) == NULL);
	if (mod_hash_insert_reserve(task_hash,
	    (mod_hash_key_t)(uintptr_t)task0p->tk_tkid,
	    (mod_hash_val_t *)task0p, hndl) != 0) {
		mod_hash_cancel(task_hash, &hndl);
		panic("unable to insert task %d(%p)", task0p->tk_tkid,
		    (void *)task0p);
	}
	mutex_exit(&task_hash_lock);

	task0p->tk_memb_list = p;

	task0p->tk_nprocs_kstat = task_kstat_create(task0p, task0p->tk_zone);

	/*
	 * Initialize task pointers for p0, including doubly linked list of
	 * task members.
	 */
	p->p_task = task0p;
	p->p_taskprev = p->p_tasknext = p;
	task_hold(task0p);
}

static int
task_nprocs_kstat_update(kstat_t *ksp, int rw)
{
	task_t *tk = ksp->ks_private;
	task_kstat_t *ktk = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	ktk->ktk_usage.value.ui64 = tk->tk_nprocs;
	ktk->ktk_value.value.ui64 = tk->tk_nprocs_ctl;
	return (0);
}

static kstat_t *
task_kstat_create(task_t *tk, zone_t *zone)
{
	kstat_t *ksp;
	task_kstat_t *ktk;
	char *zonename = zone->zone_name;

	ksp = rctl_kstat_create_task(tk, "nprocs", KSTAT_TYPE_NAMED,
	    sizeof (task_kstat_t) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (ksp == NULL)
		return (NULL);

	ktk = ksp->ks_data = kmem_alloc(sizeof (task_kstat_t), KM_SLEEP);
	ksp->ks_data_size += strlen(zonename) + 1;
	kstat_named_init(&ktk->ktk_zonename, "zonename", KSTAT_DATA_STRING);
	kstat_named_setstr(&ktk->ktk_zonename, zonename);
	kstat_named_init(&ktk->ktk_usage, "usage", KSTAT_DATA_UINT64);
	kstat_named_init(&ktk->ktk_value, "value", KSTAT_DATA_UINT64);
	ksp->ks_update = task_nprocs_kstat_update;
	ksp->ks_private = tk;
	kstat_install(ksp);

	return (ksp);
}
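
/*
 * A userland consumer would read this kstat via libkstat, e.g. (a sketch;
 * the module and instance naming are determined by rctl_kstat_create_task(),
 * which lives elsewhere, so the lookup arguments shown here are illustrative
 * only):
 *
 *	kstat_ctl_t *kc = kstat_open();
 *	kstat_t *ksp = kstat_lookup(kc, "caps", -1, NULL);
 *	if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1) {
 *		kstat_named_t *kn = kstat_data_lookup(ksp, "usage");
 *		...
 *	}
 *	(void) kstat_close(kc);
 */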

static void
task_kstat_delete(task_t *tk)
{
	void *data;

	if (tk->tk_nprocs_kstat != NULL) {
		data = tk->tk_nprocs_kstat->ks_data;
		kstat_delete(tk->tk_nprocs_kstat);
		kmem_free(data, sizeof (task_kstat_t));
		tk->tk_nprocs_kstat = NULL;
	}
}

void
task_commit_thread_init(void)
{
	mutex_init(&task_commit_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&task_commit_cv, NULL, CV_DEFAULT, NULL);
	task_commit_thread = thread_create(NULL, 0, task_commit, NULL, 0,
	    &p0, TS_RUN, minclsyspri);
}

/*
 * Backup thread to commit task resource usage when taskq_dispatch() fails.
 */
static void
task_commit(void)
{
	callb_cpr_t cprinfo;

	CALLB_CPR_INIT(&cprinfo, &task_commit_lock, callb_generic_cpr,
	    "task_commit_thread");

	mutex_enter(&task_commit_lock);

	for (;;) {
		while (task_commit_head == NULL) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&task_commit_cv, &task_commit_lock);
			CALLB_CPR_SAFE_END(&cprinfo, &task_commit_lock);
		}
		while (task_commit_head != NULL) {
			task_t *tk;

			tk = task_commit_head;
			task_commit_head = task_commit_head->tk_commit_next;
			if (task_commit_head == NULL)
				task_commit_tail = NULL;
			mutex_exit(&task_commit_lock);
			exacct_commit_task(tk);
			mutex_enter(&task_commit_lock);
		}
	}
}
