xref: /illumos-gate/usr/src/uts/common/os/rctl.c (revision b3783300013fa93b98278c901b855062f538f7e2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <sys/atomic.h>
26 #include <sys/cmn_err.h>
27 #include <sys/id_space.h>
28 #include <sys/kmem.h>
29 #include <sys/kstat.h>
30 #include <sys/log.h>
31 #include <sys/modctl.h>
32 #include <sys/modhash.h>
33 #include <sys/mutex.h>
34 #include <sys/proc.h>
35 #include <sys/procset.h>
36 #include <sys/project.h>
37 #include <sys/resource.h>
38 #include <sys/rctl.h>
39 #include <sys/siginfo.h>
40 #include <sys/strlog.h>
41 #include <sys/systm.h>
42 #include <sys/task.h>
43 #include <sys/types.h>
44 #include <sys/policy.h>
45 #include <sys/zone.h>
46 
47 /*
48  * Resource controls (rctls)
49  *
50  *   The rctl subsystem provides a mechanism for kernel components to
51  *   register their individual resource controls with the system as a whole,
52  *   such that those controls can subscribe to specific actions while being
53  *   associated with the various process-model entities provided by the kernel:
54  *   the process, the task, the project, and the zone.  (In principle, only
55  *   minor modifications would be required to connect the resource control
56  *   functionality to non-process-model entities associated with the system.)
57  *
58  *   Subsystems register their rctls via rctl_register().  Subsystems
59  *   also wishing to provide additional limits on a given rctl can modify
60  *   them once they have the rctl handle.  Each subsystem should store the
61  *   handle to their rctl for direct access.
62  *
63  *   A primary dictionary, rctl_dict, contains a hash of id to the default
64  *   control definition for each controlled resource-entity pair on the system.
65  *   A secondary dictionary, rctl_dict_by_name, contains a hash of name to
66  *   resource control handles.  The resource control handles are distributed by
67  *   the rctl_ids ID space.  The handles are private and not to be
68  *   advertised to userland; all userland interactions are via the rctl
69  *   names.
70  *
71  *   Entities inherit their rctls from their predecessor.  Since projects have
72  *   no ancestor, they inherit their rctls from the rctl dict for project
73  *   rctls.  It is expected that project controls will be set to their
74  *   appropriate values shortly after project creation, presumably from a
75  *   policy source such as the project database.
76  *
77  * Data structures
78  *   The rctl_set_t attached to each of the process model entities is a simple
79  *   hash table keyed on the rctl handle assigned at registration.  The entries
80  *   in the hash table are rctl_t's, whose relationship with the active control
81  *   values on that resource and with the global state of the resource we
82  *   illustrate below:
83  *
84  *   rctl_dict[key] --> rctl_dict_entry
85  *			   ^
86  *			   |
87  *			+--+---+
88  *   rctl_set[key] ---> | rctl | --> value <-> value <-> system value --> NULL
89  *			+--+---+		 ^
90  *			   |			 |
91  *			   +------- cursor ------+
92  *
93  *   That is, the rctl contains a back pointer to the global resource control
94  *   state for this resource, which is also available in the rctl_dict hash
95  *   table mentioned earlier.  The rctl contains two pointers to resource
96  *   control values:  one, values, indicates the entire sequence of control
97  *   values; the other, cursor, indicates the currently active control
98  *   value--the next value to be enforced.  The value list itself is an open,
99  *   doubly-linked list, the last non-NULL member of which is the system value
100  *   for that resource (being the theoretical/conventional maximum allowable
101  *   value for the resource on this OS instance).
102  *
103  * Ops Vector
104  *   Subsystems publishing rctls need not provide instances of all of the
105  *   functions specified by the ops vector.  In particular, if general
106  *   rctl_*() entry points are not being called, certain functions can be
107  *   omitted.  These align as follows:
108  *
109  *   rctl_set()
110  *     You may wish to provide a set callback if locking circumstances prevent
111  *     it or if the performance cost of requesting the enforced value from the
112  *     resource control is prohibitively expensive.  For instance, the currently
113  *     enforced file size limit is stored on the process in the p_fsz_ctl to
114  *     maintain read()/write() performance.
115  *
116  *   rctl_test()
117  *     You must provide a test callback if you are using the rctl_test()
118  *     interface.  An action callback is optional.
119  *
120  *   rctl_action()
121  *     You may wish to provide an action callback.
122  *
123  * Registration
124  *   New resource controls can be added to a running instance by loaded modules
125  *   via registration.  (The current implementation does not support unloadable
126  *   modules; this functionality can be added if needed, via an
127  *   activation/deactivation interface involving the manipulation of the
128  *   ops vector for the resource control(s) needing to support unloading.)
129  *
130  * Control value ordering
131  *   Because the rctl_val chain on each rctl must be navigable in a
132  *   deterministic way, we have to define an ordering on the rctl_val_t's.  The
133  *   defined order is (flags & [maximal], value, flags & [deny-action],
134  *   privilege).
135  *
136  * Locking
137  *   rctl_dict_lock must be acquired prior to rctl_lists_lock.  Since
138  *   rctl_dict_lock or rctl_lists_lock can be called at the enforcement point
139  *   of any subsystem, holding subsystem locks, it is at all times inappropriate
140  *   to call kmem_alloc(., KM_SLEEP) while holding either of these locks.
141  *   Traversing any of the various resource control entity lists requires
142  *   holding rctl_lists_lock.
143  *
144  *   Each individual resource control set associated with an entity must have
145  *   its rcs_lock held for the duration of any operations that would add
146  *   resource controls or control values to the set.
147  *
148  *   The locking subsequence of interest is: p_lock, rctl_dict_lock,
149  *   rctl_lists_lock, entity->rcs_lock.
150  *
151  * The project(5) database and project entity resource controls
152  *   A special case is made for RCENTITY_PROJECT values set through the
153  *   setproject(3PROJECT) interface.  setproject() makes use of a private
154  *   interface, setprojrctl(), which passes through an array of resource control
155  *   blocks that need to be set while holding the entity->rcs_lock.  This
156  *   ensures that the act of modifying a project's resource controls is
157  *   "atomic" within the kernel.
158  *
159  *   Within the rctl sub-system, we provide two interfaces that are only used by
160  *   the setprojrctl() code path - rctl_local_insert_all() and
161  *   rctl_local_replace_all().  rctl_local_insert_all() will ensure that the
162  *   resource values specified in *new_values are applied.
163  *   rctl_local_replace_all() will purge the current rctl->rc_projdb and
164  *   rctl->rc_values entries, and apply the *new_values.
165  *
166  *   These functions modify not only the linked list of active resource controls
167  *   (rctl->rc_values), but also a "cached" linked list (rctl->rc_projdb) of
168  *   values set through these interfaces.  To clarify:
169  *
170  *      rctl->rc_values - a linked list of rctl_val_t.  These are the active
171  *      resource values associated with this rctl, and may have been set by
172  *      setrctl() - via prctl(1), or by setprojrctl() - via
173  *      setproject(3PROJECT).
174  *
175  *      rctl->rc_projdb - a linked list of rctl_val_t.  These reflect the
176  *      resource values set by the setprojrctl() code path.  rc_projdb is not
177  *      referenced by any other component of the rctl sub-system.
178  *
179  *   As various locks are held when calling these functions, we ensure that all
180  *   the possible memory allocations are performed prior to calling the
181  *   function.  *alloc_values is a linked list of uninitialized rctl_val_t,
182  *   which may be used to duplicate a new resource control value (passed in as
183  *   one of the members of the *new_values linked list), in order to populate
184  *   rctl->rc_values.
185  */
186 
/*
 * NOTE(review): presumably the upper bound on handles handed out by rctl_ids;
 * the use of this variable is not visible in this part of the file -- confirm.
 */
id_t max_rctl_hndl = 32768;
/* Modulus applied to the key by rctl_dict_hash_by_id(). */
int rctl_dict_size = 64;
/* Number of hash chains in each rctl_set_t (see rctl_set_create()). */
int rctl_set_size = 8;
/* Held around rctl_dict_by_name lookups; ordered before rctl_lists_lock. */
kmutex_t rctl_dict_lock;
/* Primary dictionary: hash of id to default control definition. */
mod_hash_t *rctl_dict;
/* Secondary dictionary: hash of name to resource control handle/entry. */
mod_hash_t *rctl_dict_by_name;
/* ID space from which resource control handles are distributed. */
id_space_t *rctl_ids;
kmem_cache_t *rctl_cache;	/* kmem cache for rctl structures */
kmem_cache_t *rctl_val_cache;	/* kmem cache for rctl values */

/* Must be held while traversing any of the per-entity lists below. */
kmutex_t rctl_lists_lock;
/* One list of dictionary entries per entity type, linked via rcd_next. */
rctl_dict_entry_t *rctl_lists[RC_MAX_ENTITY + 1];
199 
/*
 * Default resource control operations and ops vector
 *   To be used if the particular rcontrol has no specific actions defined, or
 *   if the subsystem providing the control is quiescing (in preparation for
 *   unloading, presumably.)
 *
 *   Resource controls with callbacks should fill the unused operations with the
 *   appropriate default impotent callback.
 */

/*
 * rcop_no_action() is the do-nothing action callback: it ignores all of its
 * arguments and has no effect.
 */
/*ARGSUSED*/
void
rcop_no_action(struct rctl *r, struct proc *p, rctl_entity_p_t *e)
{
	/* Intentionally empty. */
}
214 
/*
 * rcop_no_usage() is the default usage callback: it ignores its arguments
 * and always reports a usage of zero.
 */
/*ARGSUSED*/
rctl_qty_t
rcop_no_usage(struct rctl *r, struct proc *p)
{
	return (0);
}
221 
/*
 * rcop_no_set() is the default set callback: it ignores its arguments,
 * records nothing, and always returns 0.
 */
/*ARGSUSED*/
int
rcop_no_set(struct rctl *r, struct proc *p, rctl_entity_p_t *e, rctl_qty_t l)
{
	return (0);
}
228 
/*
 * rcop_no_test() is the default test callback: it ignores its arguments and
 * always returns 0.
 */
/*ARGSUSED*/
int
rcop_no_test(struct rctl *r, struct proc *p, rctl_entity_p_t *e,
    struct rctl_val *rv, rctl_qty_t i, uint_t f)
{
	return (0);
}
236 
/*
 * Ops vector made up entirely of the do-nothing callbacks above.  The
 * initializer is positional, so the order of the entries must match the
 * member order of rctl_ops_t.
 */
rctl_ops_t rctl_default_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	rcop_no_test
};
243 
244 /*
245  * Default "absolute" resource control operation and ops vector
246  *   Useful if there is no usage associated with the
247  *   resource control.
248  */
249 /*ARGSUSED*/
250 int
251 rcop_absolute_test(struct rctl *r, struct proc *p, rctl_entity_p_t *e,
252     struct rctl_val *rv, rctl_qty_t i, uint_t f)
253 {
254 	return (i > rv->rcv_value);
255 }
256 
/*
 * Ops vector identical to rctl_default_ops except that the last (test) entry
 * is rcop_absolute_test().  The initializer is positional.
 */
rctl_ops_t rctl_absolute_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	rcop_absolute_test
};
263 
264 /*ARGSUSED*/
265 static uint_t
266 rctl_dict_hash_by_id(void *hash_data, mod_hash_key_t key)
267 {
268 	return ((uint_t)(uintptr_t)key % rctl_dict_size);
269 }
270 
271 static int
272 rctl_dict_id_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
273 {
274 	uint_t u1 = (uint_t)(uintptr_t)key1;
275 	uint_t u2 = (uint_t)(uintptr_t)key2;
276 
277 	if (u1 > u2)
278 		return (1);
279 
280 	if (u1 == u2)
281 		return (0);
282 
283 	return (-1);
284 }
285 
286 static void
287 rctl_dict_val_dtor(mod_hash_val_t val)
288 {
289 	rctl_dict_entry_t *kr = (rctl_dict_entry_t *)val;
290 
291 	kmem_free(kr, sizeof (rctl_dict_entry_t));
292 }
293 
294 /*
295  * size_t rctl_build_name_buf()
296  *
297  * Overview
298  *   rctl_build_name_buf() walks all active resource controls in the dictionary,
299  *   building a buffer of continguous NUL-terminated strings.
300  *
301  * Return values
302  *   The size of the buffer is returned, the passed pointer's contents are
303  *   modified to that of the location of the buffer.
304  *
305  * Caller's context
306  *   Caller must be in a context suitable for KM_SLEEP allocations.
307  */
308 size_t
309 rctl_build_name_buf(char **rbufp)
310 {
311 	size_t req_size, cpy_size;
312 	char *rbufloc;
313 	int i;
314 
315 rctl_rebuild_name_buf:
316 	req_size = cpy_size = 0;
317 
318 	/*
319 	 * Calculate needed buffer length.
320 	 */
321 	mutex_enter(&rctl_lists_lock);
322 	for (i = 0; i < RC_MAX_ENTITY + 1; i++) {
323 		rctl_dict_entry_t *rde;
324 
325 		for (rde = rctl_lists[i];
326 		    rde != NULL;
327 		    rde = rde->rcd_next)
328 			req_size += strlen(rde->rcd_name) + 1;
329 	}
330 	mutex_exit(&rctl_lists_lock);
331 
332 	rbufloc = *rbufp = kmem_alloc(req_size, KM_SLEEP);
333 
334 	/*
335 	 * Copy rctl names into our buffer.  If the copy length exceeds the
336 	 * allocate length (due to registration changes), stop copying, free the
337 	 * buffer, and start again.
338 	 */
339 	mutex_enter(&rctl_lists_lock);
340 	for (i = 0; i < RC_MAX_ENTITY + 1; i++) {
341 		rctl_dict_entry_t *rde;
342 
343 		for (rde = rctl_lists[i];
344 		    rde != NULL;
345 		    rde = rde->rcd_next) {
346 			size_t length = strlen(rde->rcd_name) + 1;
347 
348 			cpy_size += length;
349 
350 			if (cpy_size > req_size) {
351 				kmem_free(*rbufp, req_size);
352 				mutex_exit(&rctl_lists_lock);
353 				goto rctl_rebuild_name_buf;
354 			}
355 
356 			bcopy(rde->rcd_name, rbufloc, length);
357 			rbufloc += length;
358 		}
359 	}
360 	mutex_exit(&rctl_lists_lock);
361 
362 	return (req_size);
363 }
364 
365 /*
366  * rctl_dict_entry_t *rctl_dict_lookup(const char *)
367  *
368  * Overview
369  *   rctl_dict_lookup() returns the resource control dictionary entry for the
370  *   named resource control.
371  *
372  * Return values
373  *   A pointer to the appropriate resource control dictionary entry, or NULL if
374  *   no such named entry exists.
375  *
376  * Caller's context
377  *   Caller must not be holding rctl_dict_lock.
378  */
379 rctl_dict_entry_t *
380 rctl_dict_lookup(const char *name)
381 {
382 	rctl_dict_entry_t *rde;
383 
384 	mutex_enter(&rctl_dict_lock);
385 
386 	if (mod_hash_find(rctl_dict_by_name, (mod_hash_key_t)name,
387 	    (mod_hash_val_t *)&rde) == MH_ERR_NOTFOUND) {
388 		mutex_exit(&rctl_dict_lock);
389 		return (NULL);
390 	}
391 
392 	mutex_exit(&rctl_dict_lock);
393 
394 	return (rde);
395 }
396 
397 /*
398  * rctl_hndl_t rctl_hndl_lookup(const char *)
399  *
400  * Overview
401  *   rctl_hndl_lookup() returns the resource control id (the "handle") for the
402  *   named resource control.
403  *
404  * Return values
405  *   The appropriate id, or -1 if no such named entry exists.
406  *
407  * Caller's context
408  *   Caller must not be holding rctl_dict_lock.
409  */
410 rctl_hndl_t
411 rctl_hndl_lookup(const char *name)
412 {
413 	rctl_dict_entry_t *rde;
414 
415 	if ((rde = rctl_dict_lookup(name)) == NULL)
416 		return (-1);
417 
418 	return (rde->rcd_id);
419 }
420 
421 /*
422  * rctl_dict_entry_t * rctl_dict_lookup_hndl(rctl_hndl_t)
423  *
424  * Overview
425  *   rctl_dict_lookup_hndl() completes the public lookup functions, by returning
426  *   the resource control dictionary entry matching a given resource control id.
427  *
428  * Return values
429  *   A pointer to the matching resource control dictionary entry, or NULL if the
430  *   id does not match any existing entries.
431  *
432  * Caller's context
433  *   Caller must not be holding rctl_lists_lock.
434  */
435 rctl_dict_entry_t *
436 rctl_dict_lookup_hndl(rctl_hndl_t hndl)
437 {
438 	uint_t i;
439 
440 	mutex_enter(&rctl_lists_lock);
441 	for (i = 0; i < RC_MAX_ENTITY + 1; i++) {
442 		rctl_dict_entry_t *rde;
443 
444 		for (rde = rctl_lists[i];
445 		    rde != NULL;
446 		    rde = rde->rcd_next)
447 			if (rde->rcd_id == hndl) {
448 				mutex_exit(&rctl_lists_lock);
449 				return (rde);
450 			}
451 	}
452 	mutex_exit(&rctl_lists_lock);
453 
454 	return (NULL);
455 }
456 
457 /*
458  * void rctl_add_default_limit(const char *name, rctl_qty_t value,
459  *     rctl_priv_t privilege, uint_t action)
460  *
461  * Overview
462  *   Create a default limit with specified value, privilege, and action.
463  *
464  * Return value
465  *   No value returned.
466  */
467 void
468 rctl_add_default_limit(const char *name, rctl_qty_t value,
469     rctl_priv_t privilege, uint_t action)
470 {
471 	rctl_val_t *dval;
472 	rctl_dict_entry_t *rde;
473 
474 	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
475 	bzero(dval, sizeof (rctl_val_t));
476 	dval->rcv_value = value;
477 	dval->rcv_privilege = privilege;
478 	dval->rcv_flagaction = action;
479 	dval->rcv_action_recip_pid = -1;
480 
481 	rde = rctl_dict_lookup(name);
482 	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);
483 }
484 
485 /*
486  * void rctl_add_legacy_limit(const char *name, const char *mname,
487  *     const char *lname, rctl_qty_t dflt)
488  *
489  * Overview
490  *   Create a default privileged limit, using the value obtained from
491  *   /etc/system if it exists and is greater than the specified default
492  *   value.  Exists primarily for System V IPC.
493  *
494  * Return value
495  *   No value returned.
496  */
497 void
498 rctl_add_legacy_limit(const char *name, const char *mname, const char *lname,
499     rctl_qty_t dflt, rctl_qty_t max)
500 {
501 	rctl_qty_t qty;
502 
503 	if (!mod_sysvar(mname, lname, &qty) || (qty < dflt))
504 		qty = dflt;
505 
506 	if (qty > max)
507 		qty = max;
508 
509 	rctl_add_default_limit(name, qty, RCPRIV_PRIVILEGED, RCTL_LOCAL_DENY);
510 }
511 
512 rctl_set_t *
513 rctl_entity_obtain_rset(rctl_dict_entry_t *rcd, struct proc *p)
514 {
515 	rctl_set_t *rset = NULL;
516 
517 	if (rcd == NULL)
518 		return (NULL);
519 
520 	switch (rcd->rcd_entity) {
521 	case RCENTITY_PROCESS:
522 		rset = p->p_rctls;
523 		break;
524 	case RCENTITY_TASK:
525 		ASSERT(MUTEX_HELD(&p->p_lock));
526 		if (p->p_task != NULL)
527 			rset = p->p_task->tk_rctls;
528 		break;
529 	case RCENTITY_PROJECT:
530 		ASSERT(MUTEX_HELD(&p->p_lock));
531 		if (p->p_task != NULL &&
532 		    p->p_task->tk_proj != NULL)
533 			rset = p->p_task->tk_proj->kpj_rctls;
534 		break;
535 	case RCENTITY_ZONE:
536 		ASSERT(MUTEX_HELD(&p->p_lock));
537 		if (p->p_zone != NULL)
538 			rset = p->p_zone->zone_rctls;
539 		break;
540 	default:
541 		panic("unknown rctl entity type %d seen", rcd->rcd_entity);
542 		break;
543 	}
544 
545 	return (rset);
546 }
547 
548 static void
549 rctl_entity_obtain_entity_p(rctl_entity_t entity, struct proc *p,
550     rctl_entity_p_t *e)
551 {
552 	e->rcep_p.proc = NULL;
553 	e->rcep_t = entity;
554 
555 	switch (entity) {
556 	case RCENTITY_PROCESS:
557 		e->rcep_p.proc = p;
558 		break;
559 	case RCENTITY_TASK:
560 		ASSERT(MUTEX_HELD(&p->p_lock));
561 		if (p->p_task != NULL)
562 			e->rcep_p.task = p->p_task;
563 		break;
564 	case RCENTITY_PROJECT:
565 		ASSERT(MUTEX_HELD(&p->p_lock));
566 		if (p->p_task != NULL &&
567 		    p->p_task->tk_proj != NULL)
568 			e->rcep_p.proj = p->p_task->tk_proj;
569 		break;
570 	case RCENTITY_ZONE:
571 		ASSERT(MUTEX_HELD(&p->p_lock));
572 		if (p->p_zone != NULL)
573 			e->rcep_p.zone = p->p_zone;
574 		break;
575 	default:
576 		panic("unknown rctl entity type %d seen", entity);
577 		break;
578 	}
579 }
580 
581 static void
582 rctl_gp_alloc(rctl_alloc_gp_t *rcgp)
583 {
584 	uint_t i;
585 
586 	if (rcgp->rcag_nctls > 0) {
587 		rctl_t *prev = kmem_cache_alloc(rctl_cache, KM_SLEEP);
588 		rctl_t *rctl = prev;
589 
590 		rcgp->rcag_ctls = prev;
591 
592 		for (i = 1; i < rcgp->rcag_nctls; i++) {
593 			rctl = kmem_cache_alloc(rctl_cache, KM_SLEEP);
594 			prev->rc_next = rctl;
595 			prev = rctl;
596 		}
597 
598 		rctl->rc_next = NULL;
599 	}
600 
601 	if (rcgp->rcag_nvals > 0) {
602 		rctl_val_t *prev = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
603 		rctl_val_t *rval = prev;
604 
605 		rcgp->rcag_vals = prev;
606 
607 		for (i = 1; i < rcgp->rcag_nvals; i++) {
608 			rval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
609 			prev->rcv_next = rval;
610 			prev = rval;
611 		}
612 
613 		rval->rcv_next = NULL;
614 	}
615 
616 }
617 
618 static rctl_val_t *
619 rctl_gp_detach_val(rctl_alloc_gp_t *rcgp)
620 {
621 	rctl_val_t *rval = rcgp->rcag_vals;
622 
623 	ASSERT(rcgp->rcag_nvals > 0);
624 	rcgp->rcag_nvals--;
625 	rcgp->rcag_vals = rval->rcv_next;
626 
627 	rval->rcv_next = NULL;
628 
629 	return (rval);
630 }
631 
632 static rctl_t *
633 rctl_gp_detach_ctl(rctl_alloc_gp_t *rcgp)
634 {
635 	rctl_t *rctl = rcgp->rcag_ctls;
636 
637 	ASSERT(rcgp->rcag_nctls > 0);
638 	rcgp->rcag_nctls--;
639 	rcgp->rcag_ctls = rctl->rc_next;
640 
641 	rctl->rc_next = NULL;
642 
643 	return (rctl);
644 
645 }
646 
647 static void
648 rctl_gp_free(rctl_alloc_gp_t *rcgp)
649 {
650 	rctl_val_t *rval = rcgp->rcag_vals;
651 	rctl_t *rctl = rcgp->rcag_ctls;
652 
653 	while (rval != NULL) {
654 		rctl_val_t *next = rval->rcv_next;
655 
656 		kmem_cache_free(rctl_val_cache, rval);
657 		rval = next;
658 	}
659 
660 	while (rctl != NULL) {
661 		rctl_t *next = rctl->rc_next;
662 
663 		kmem_cache_free(rctl_cache, rctl);
664 		rctl = next;
665 	}
666 }
667 
668 /*
669  * void rctl_prealloc_destroy(rctl_alloc_gp_t *)
670  *
671  * Overview
672  *   Release all unused memory allocated via one of the "prealloc" functions:
673  *   rctl_set_init_prealloc, rctl_set_dup_prealloc, or rctl_rlimit_set_prealloc.
674  *
675  * Return values
676  *   None.
677  *
678  * Caller's context
679  *   No restrictions on context.
680  */
681 void
682 rctl_prealloc_destroy(rctl_alloc_gp_t *gp)
683 {
684 	rctl_gp_free(gp);
685 	kmem_free(gp, sizeof (rctl_alloc_gp_t));
686 }
687 
688 /*
689  * int rctl_val_cmp(rctl_val_t *, rctl_val_t *, int)
690  *
691  * Overview
692  *   This function defines an ordering to rctl_val_t's in order to allow
693  *   for correct placement in value lists. When the imprecise flag is set,
694  *   the action recipient is ignored. This is to facilitate insert,
695  *   delete, and replace operations by rctlsys.
696  *
697  * Return values
698  *   0 if the val_t's are are considered identical
699  *   -1 if a is ordered lower than b
700  *   1 if a is lowered higher than b
701  *
702  * Caller's context
703  *   No restrictions on context.
704  */
705 int
706 rctl_val_cmp(rctl_val_t *a, rctl_val_t *b, int imprecise)
707 {
708 	if ((a->rcv_flagaction & RCTL_LOCAL_MAXIMAL) <
709 	    (b->rcv_flagaction & RCTL_LOCAL_MAXIMAL))
710 		return (-1);
711 
712 	if ((a->rcv_flagaction & RCTL_LOCAL_MAXIMAL) >
713 	    (b->rcv_flagaction & RCTL_LOCAL_MAXIMAL))
714 		return (1);
715 
716 	if (a->rcv_value < b->rcv_value)
717 		return (-1);
718 
719 	if (a->rcv_value > b->rcv_value)
720 		return (1);
721 
722 	if ((a->rcv_flagaction & RCTL_LOCAL_DENY) <
723 	    (b->rcv_flagaction & RCTL_LOCAL_DENY))
724 		return (-1);
725 
726 	if ((a->rcv_flagaction & RCTL_LOCAL_DENY) >
727 	    (b->rcv_flagaction & RCTL_LOCAL_DENY))
728 		return (1);
729 
730 	if (a->rcv_privilege < b->rcv_privilege)
731 		return (-1);
732 
733 	if (a->rcv_privilege > b->rcv_privilege)
734 		return (1);
735 
736 	if (imprecise)
737 		return (0);
738 
739 	if (a->rcv_action_recip_pid < b->rcv_action_recip_pid)
740 		return (-1);
741 
742 	if (a->rcv_action_recip_pid > b->rcv_action_recip_pid)
743 		return (1);
744 
745 	return (0);
746 }
747 
748 static rctl_val_t *
749 rctl_val_list_find(rctl_val_t **head, rctl_val_t *cval)
750 {
751 	rctl_val_t *rval = *head;
752 
753 	while (rval != NULL) {
754 		if (rctl_val_cmp(cval, rval, 0) == 0)
755 			return (rval);
756 
757 		rval = rval->rcv_next;
758 	}
759 
760 	return (NULL);
761 
762 }
763 
764 /*
765  * int rctl_val_list_insert(rctl_val_t **, rctl_val_t *)
766  *
767  * Overview
768  *   This function inserts the rctl_val_t into the value list provided.
769  *   The insert is always successful unless if the value is a duplicate
770  *   of one already in the list.
771  *
772  * Return values
773  *    1 if the value was a duplicate of an existing value in the list.
774  *    0 if the insert was successful.
775  */
776 int
777 rctl_val_list_insert(rctl_val_t **root, rctl_val_t *rval)
778 {
779 	rctl_val_t *prev;
780 	int equiv;
781 
782 	rval->rcv_next = NULL;
783 	rval->rcv_prev = NULL;
784 
785 	if (*root == NULL) {
786 		*root = rval;
787 		return (0);
788 	}
789 
790 	equiv = rctl_val_cmp(rval, *root, 0);
791 
792 	if (equiv == 0)
793 		return (1);
794 
795 	if (equiv < 0) {
796 		rval->rcv_next = *root;
797 		rval->rcv_next->rcv_prev = rval;
798 		*root = rval;
799 
800 		return (0);
801 	}
802 
803 	prev = *root;
804 	while (prev->rcv_next != NULL &&
805 	    (equiv = rctl_val_cmp(rval, prev->rcv_next, 0)) > 0) {
806 		prev = prev->rcv_next;
807 	}
808 
809 	if (equiv == 0)
810 		return (1);
811 
812 	rval->rcv_next = prev->rcv_next;
813 	if (rval->rcv_next != NULL)
814 		rval->rcv_next->rcv_prev = rval;
815 	prev->rcv_next = rval;
816 	rval->rcv_prev = prev;
817 
818 	return (0);
819 }
820 
821 static int
822 rctl_val_list_delete(rctl_val_t **root, rctl_val_t *rval)
823 {
824 	rctl_val_t *prev;
825 
826 	if (*root == NULL)
827 		return (-1);
828 
829 	prev = *root;
830 	if (rctl_val_cmp(rval, prev, 0) == 0) {
831 		*root = prev->rcv_next;
832 		if (*root != NULL)
833 			(*root)->rcv_prev = NULL;
834 
835 		kmem_cache_free(rctl_val_cache, prev);
836 
837 		return (0);
838 	}
839 
840 	while (prev->rcv_next != NULL &&
841 	    rctl_val_cmp(rval, prev->rcv_next, 0) != 0) {
842 		prev = prev->rcv_next;
843 	}
844 
845 	if (prev->rcv_next == NULL) {
846 		/*
847 		 * If we navigate the entire list and cannot find a match, then
848 		 * return failure.
849 		 */
850 		return (-1);
851 	}
852 
853 	prev = prev->rcv_next;
854 	prev->rcv_prev->rcv_next = prev->rcv_next;
855 	if (prev->rcv_next != NULL)
856 		prev->rcv_next->rcv_prev = prev->rcv_prev;
857 
858 	kmem_cache_free(rctl_val_cache, prev);
859 
860 	return (0);
861 }
862 
863 static rctl_val_t *
864 rctl_val_list_dup(rctl_val_t *rval, rctl_alloc_gp_t *ragp, struct proc *oldp,
865     struct proc *newp)
866 {
867 	rctl_val_t *head = NULL;
868 
869 	for (; rval != NULL; rval = rval->rcv_next) {
870 		rctl_val_t *dval = rctl_gp_detach_val(ragp);
871 
872 		bcopy(rval, dval, sizeof (rctl_val_t));
873 		dval->rcv_prev = dval->rcv_next = NULL;
874 
875 		if (oldp == NULL ||
876 		    rval->rcv_action_recipient == NULL ||
877 		    rval->rcv_action_recipient == oldp) {
878 			if (rval->rcv_privilege == RCPRIV_BASIC) {
879 				dval->rcv_action_recipient = newp;
880 				dval->rcv_action_recip_pid = newp->p_pid;
881 			} else {
882 				dval->rcv_action_recipient = NULL;
883 				dval->rcv_action_recip_pid = -1;
884 			}
885 
886 			(void) rctl_val_list_insert(&head, dval);
887 		} else {
888 			kmem_cache_free(rctl_val_cache, dval);
889 		}
890 	}
891 
892 	return (head);
893 }
894 
895 static void
896 rctl_val_list_reset(rctl_val_t *rval)
897 {
898 	for (; rval != NULL; rval = rval->rcv_next)
899 		rval->rcv_firing_time = 0;
900 }
901 
902 static uint_t
903 rctl_val_list_count(rctl_val_t *rval)
904 {
905 	uint_t n = 0;
906 
907 	for (; rval != NULL; rval = rval->rcv_next)
908 		n++;
909 
910 	return (n);
911 }
912 
913 
914 static void
915 rctl_val_list_free(rctl_val_t *rval)
916 {
917 	while (rval != NULL) {
918 		rctl_val_t *next = rval->rcv_next;
919 
920 		kmem_cache_free(rctl_val_cache, rval);
921 
922 		rval = next;
923 	}
924 }
925 
926 /*
927  * rctl_qty_t rctl_model_maximum(rctl_dict_entry_t *, struct proc *)
928  *
929  * Overview
930  *   In cases where the operating system supports more than one process
931  *   addressing model, the operating system capabilities will exceed those of
932  *   one or more of these models.  Processes in a less capable model must have
933  *   their resources accurately controlled, without diluting those of their
934  *   descendants reached via exec().  rctl_model_maximum() returns the governing
935  *   value for the specified process with respect to a resource control, such
936  *   that the value can used for the RCTLOP_SET callback or compatability
937  *   support.
938  *
939  * Return values
940  *   The maximum value for the given process for the specified resource control.
941  *
942  * Caller's context
943  *   No restrictions on context.
944  */
945 rctl_qty_t
946 rctl_model_maximum(rctl_dict_entry_t *rde, struct proc *p)
947 {
948 	if (p->p_model == DATAMODEL_NATIVE)
949 		return (rde->rcd_max_native);
950 
951 	return (rde->rcd_max_ilp32);
952 }
953 
954 /*
955  * rctl_qty_t rctl_model_value(rctl_dict_entry_t *, struct proc *, rctl_qty_t)
956  *
957  * Overview
958  *   Convenience function wrapping the rctl_model_maximum() functionality.
959  *
960  * Return values
961  *   The lesser of the process's maximum value and the given value for the
962  *   specified resource control.
963  *
964  * Caller's context
965  *   No restrictions on context.
966  */
967 rctl_qty_t
968 rctl_model_value(rctl_dict_entry_t *rde, struct proc *p, rctl_qty_t value)
969 {
970 	rctl_qty_t max = rctl_model_maximum(rde, p);
971 
972 	return (value < max ? value : max);
973 }
974 
975 static void
976 rctl_set_insert(rctl_set_t *set, rctl_hndl_t hndl, rctl_t *rctl)
977 {
978 	uint_t index = hndl % rctl_set_size;
979 	rctl_t *next_ctl, *prev_ctl;
980 
981 	ASSERT(MUTEX_HELD(&set->rcs_lock));
982 
983 	rctl->rc_next = NULL;
984 
985 	if (set->rcs_ctls[index] == NULL) {
986 		set->rcs_ctls[index] = rctl;
987 		return;
988 	}
989 
990 	if (hndl < set->rcs_ctls[index]->rc_id) {
991 		rctl->rc_next = set->rcs_ctls[index];
992 		set->rcs_ctls[index] = rctl;
993 
994 		return;
995 	}
996 
997 	for (next_ctl = set->rcs_ctls[index]->rc_next,
998 	    prev_ctl = set->rcs_ctls[index];
999 	    next_ctl != NULL;
1000 	    prev_ctl = next_ctl,
1001 	    next_ctl = next_ctl->rc_next) {
1002 		if (next_ctl->rc_id > hndl) {
1003 			rctl->rc_next = next_ctl;
1004 			prev_ctl->rc_next = rctl;
1005 
1006 			return;
1007 		}
1008 	}
1009 
1010 	rctl->rc_next = next_ctl;
1011 	prev_ctl->rc_next = rctl;
1012 }
1013 
1014 /*
1015  * rctl_set_t *rctl_set_create()
1016  *
1017  * Overview
1018  *   Create an empty resource control set, suitable for attaching to a
1019  *   controlled entity.
1020  *
1021  * Return values
1022  *   A pointer to the newly created set.
1023  *
1024  * Caller's context
1025  *   Safe for KM_SLEEP allocations.
1026  */
1027 rctl_set_t *
1028 rctl_set_create()
1029 {
1030 	rctl_set_t *rset = kmem_zalloc(sizeof (rctl_set_t), KM_SLEEP);
1031 
1032 	mutex_init(&rset->rcs_lock, NULL, MUTEX_DEFAULT, NULL);
1033 	rset->rcs_ctls = kmem_zalloc(rctl_set_size * sizeof (rctl_t *),
1034 	    KM_SLEEP);
1035 	rset->rcs_entity = -1;
1036 
1037 	return (rset);
1038 }
1039 
1040 /*
1041  * rctl_gp_alloc_t *rctl_set_init_prealloc(rctl_entity_t)
1042  *
1043  * Overview
1044  *    rctl_set_init_prealloc() examines the globally defined resource controls
1045  *    and their default values and returns a resource control allocation group
1046  *    populated with sufficient controls and values to form a representative
1047  *    resource control set for the specified entity.
1048  *
1049  * Return values
1050  *    A pointer to the newly created allocation group.
1051  *
1052  * Caller's context
1053  *    Caller must be in a context suitable for KM_SLEEP allocations.
1054  */
1055 rctl_alloc_gp_t *
1056 rctl_set_init_prealloc(rctl_entity_t entity)
1057 {
1058 	rctl_dict_entry_t *rde;
1059 	rctl_alloc_gp_t *ragp = kmem_zalloc(sizeof (rctl_alloc_gp_t), KM_SLEEP);
1060 
1061 	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
1062 
1063 	if (rctl_lists[entity] == NULL)
1064 		return (ragp);
1065 
1066 	mutex_enter(&rctl_lists_lock);
1067 
1068 	for (rde = rctl_lists[entity]; rde != NULL; rde = rde->rcd_next) {
1069 		ragp->rcag_nctls++;
1070 		ragp->rcag_nvals += rctl_val_list_count(rde->rcd_default_value);
1071 	}
1072 
1073 	mutex_exit(&rctl_lists_lock);
1074 
1075 	rctl_gp_alloc(ragp);
1076 
1077 	return (ragp);
1078 }
1079 
1080 /*
1081  * rctl_set_t *rctl_set_init(rctl_entity_t)
1082  *
1083  * Overview
1084  *   rctl_set_create() creates a resource control set, initialized with the
1085  *   system infinite values on all registered controls, for attachment to a
1086  *   system entity requiring resource controls, such as a process or a task.
1087  *
1088  * Return values
1089  *   A pointer to the newly filled set.
1090  *
1091  * Caller's context
1092  *   Caller must be holding p_lock on entry so that RCTLOP_SET() functions
1093  *   may modify task and project members based on the proc structure
1094  *   they are passed.
1095  */
1096 rctl_set_t *
1097 rctl_set_init(rctl_entity_t entity, struct proc *p, rctl_entity_p_t *e,
1098     rctl_set_t *rset, rctl_alloc_gp_t *ragp)
1099 {
1100 	rctl_dict_entry_t *rde;
1101 
1102 	ASSERT(MUTEX_HELD(&p->p_lock));
1103 	ASSERT(e);
1104 	rset->rcs_entity = entity;
1105 
1106 	if (rctl_lists[entity] == NULL)
1107 		return (rset);
1108 
1109 	mutex_enter(&rctl_lists_lock);
1110 	mutex_enter(&rset->rcs_lock);
1111 
1112 	for (rde = rctl_lists[entity]; rde != NULL; rde = rde->rcd_next) {
1113 		rctl_t *rctl = rctl_gp_detach_ctl(ragp);
1114 
1115 		rctl->rc_dict_entry = rde;
1116 		rctl->rc_id = rde->rcd_id;
1117 		rctl->rc_projdb = NULL;
1118 
1119 		rctl->rc_values = rctl_val_list_dup(rde->rcd_default_value,
1120 		    ragp, NULL, p);
1121 		rctl->rc_cursor = rctl->rc_values;
1122 
1123 		ASSERT(rctl->rc_cursor != NULL);
1124 
1125 		rctl_set_insert(rset, rde->rcd_id, rctl);
1126 
1127 		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1128 		    rctl->rc_cursor->rcv_value));
1129 	}
1130 
1131 	mutex_exit(&rset->rcs_lock);
1132 	mutex_exit(&rctl_lists_lock);
1133 
1134 	return (rset);
1135 }
1136 
1137 static rctl_t *
1138 rctl_dup(rctl_t *rctl, rctl_alloc_gp_t *ragp, struct proc *oldp,
1139     struct proc *newp)
1140 {
1141 	rctl_t *dup = rctl_gp_detach_ctl(ragp);
1142 	rctl_val_t *dval;
1143 
1144 	dup->rc_id = rctl->rc_id;
1145 	dup->rc_dict_entry = rctl->rc_dict_entry;
1146 	dup->rc_next = NULL;
1147 	dup->rc_cursor = NULL;
1148 	dup->rc_values = rctl_val_list_dup(rctl->rc_values, ragp, oldp, newp);
1149 
1150 	for (dval = dup->rc_values;
1151 	    dval != NULL; dval = dval->rcv_next) {
1152 		if (rctl_val_cmp(rctl->rc_cursor, dval, 0) >= 0) {
1153 			dup->rc_cursor = dval;
1154 			break;
1155 		}
1156 	}
1157 
1158 	if (dup->rc_cursor == NULL)
1159 		dup->rc_cursor = dup->rc_values;
1160 
1161 	return (dup);
1162 }
1163 
1164 static void
1165 rctl_set_fill_alloc_gp(rctl_set_t *set, rctl_alloc_gp_t *ragp)
1166 {
1167 	uint_t i;
1168 
1169 	bzero(ragp, sizeof (rctl_alloc_gp_t));
1170 
1171 	for (i = 0; i < rctl_set_size; i++) {
1172 		rctl_t *r = set->rcs_ctls[i];
1173 
1174 		while (r != NULL) {
1175 			ragp->rcag_nctls++;
1176 
1177 			ragp->rcag_nvals += rctl_val_list_count(r->rc_values);
1178 
1179 			r = r->rc_next;
1180 		}
1181 	}
1182 }
1183 
1184 /*
1185  * rctl_alloc_gp_t *rctl_set_dup_prealloc(rctl_set_t *)
1186  *
1187  * Overview
1188  *   Given a resource control set, allocate a sufficiently large allocation
1189  *   group to contain a duplicate of the set.
1190  *
1191  * Return value
1192  *   A pointer to the newly created allocation group.
1193  *
1194  * Caller's context
1195  *   Safe for KM_SLEEP allocations.
1196  */
1197 rctl_alloc_gp_t *
1198 rctl_set_dup_prealloc(rctl_set_t *set)
1199 {
1200 	rctl_alloc_gp_t *ragp = kmem_zalloc(sizeof (rctl_alloc_gp_t), KM_SLEEP);
1201 
1202 	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
1203 
1204 	mutex_enter(&set->rcs_lock);
1205 	rctl_set_fill_alloc_gp(set, ragp);
1206 	mutex_exit(&set->rcs_lock);
1207 
1208 	rctl_gp_alloc(ragp);
1209 
1210 	return (ragp);
1211 }
1212 
1213 /*
1214  * int rctl_set_dup_ready(rctl_set_t *, rctl_alloc_gp_t *)
1215  *
1216  * Overview
1217  *   Verify that the allocation group provided is large enough to allow a
1218  *   duplicate of the given resource control set to be constructed from its
1219  *   contents.
1220  *
1221  * Return values
1222  *   1 if the allocation group is sufficiently large, 0 otherwise.
1223  *
1224  * Caller's context
1225  *   rcs_lock must be held prior to entry.
1226  */
1227 int
1228 rctl_set_dup_ready(rctl_set_t *set, rctl_alloc_gp_t *ragp)
1229 {
1230 	rctl_alloc_gp_t curr_gp;
1231 
1232 	ASSERT(MUTEX_HELD(&set->rcs_lock));
1233 
1234 	rctl_set_fill_alloc_gp(set, &curr_gp);
1235 
1236 	if (curr_gp.rcag_nctls <= ragp->rcag_nctls &&
1237 	    curr_gp.rcag_nvals <= ragp->rcag_nvals)
1238 		return (1);
1239 
1240 	return (0);
1241 }
1242 
1243 /*
1244  * rctl_set_t *rctl_set_dup(rctl_set_t *, struct proc *, struct proc *,
1245  *   rctl_set_t *, rctl_alloc_gp_t *, int)
1246  *
1247  * Overview
1248  *   Make a duplicate of the resource control set.  The proc pointers are those
1249  *   of the owning process and of the process associated with the entity
1250  *   receiving the duplicate.
1251  *
1252  *   Duplication is a 3 stage process. Stage 1 is memory allocation for
1253  *   the duplicate set, which is taken care of by rctl_set_dup_prealloc().
1254  *   Stage 2 consists of copying all rctls and values from the old set into
1255  *   the new. Stage 3 completes the duplication by performing the appropriate
1256  *   callbacks for each rctl in the new set.
1257  *
1258  *   Stages 2 and 3 are handled by calling rctl_set_dup with the RCD_DUP and
1259  *   RCD_CALLBACK functions, respectively. The RCD_CALLBACK flag may only
1260  *   be supplied if the newp proc structure reflects the new task and
1261  *   project linkage.
1262  *
1263  * Return value
1264  *   A pointer to the duplicate set.
1265  *
1266  * Caller's context
1267  *   The rcs_lock of the set to be duplicated must be held prior to entry.
1268  */
1269 rctl_set_t *
1270 rctl_set_dup(rctl_set_t *set, struct proc *oldp, struct proc *newp,
1271     rctl_entity_p_t *e, rctl_set_t *dup, rctl_alloc_gp_t *ragp, int flag)
1272 {
1273 	uint_t i;
1274 	rctl_set_t	*iter;
1275 
1276 	ASSERT((flag & RCD_DUP) || (flag & RCD_CALLBACK));
1277 	ASSERT(e);
1278 	/*
1279 	 * When copying the old set, iterate over that. Otherwise, when
1280 	 * only callbacks have been requested, iterate over the dup set.
1281 	 */
1282 	if (flag & RCD_DUP) {
1283 		ASSERT(MUTEX_HELD(&set->rcs_lock));
1284 		iter = set;
1285 		dup->rcs_entity = set->rcs_entity;
1286 	} else {
1287 		iter = dup;
1288 	}
1289 
1290 	mutex_enter(&dup->rcs_lock);
1291 
1292 	for (i = 0; i < rctl_set_size; i++) {
1293 		rctl_t *r = iter->rcs_ctls[i];
1294 		rctl_t *d;
1295 
1296 		while (r != NULL) {
1297 			if (flag & RCD_DUP) {
1298 				d = rctl_dup(r, ragp, oldp, newp);
1299 				rctl_set_insert(dup, r->rc_id, d);
1300 			} else {
1301 				d = r;
1302 			}
1303 
1304 			if (flag & RCD_CALLBACK)
1305 				RCTLOP_SET(d, newp, e,
1306 				    rctl_model_value(d->rc_dict_entry, newp,
1307 				    d->rc_cursor->rcv_value));
1308 
1309 			r = r->rc_next;
1310 		}
1311 	}
1312 
1313 	mutex_exit(&dup->rcs_lock);
1314 
1315 	return (dup);
1316 }
1317 
1318 /*
1319  * void rctl_set_free(rctl_set_t *)
1320  *
1321  * Overview
1322  *   Delete resource control set and all attached values.
1323  *
1324  * Return values
1325  *   No value returned.
1326  *
1327  * Caller's context
1328  *   No restrictions on context.
1329  */
1330 void
1331 rctl_set_free(rctl_set_t *set)
1332 {
1333 	uint_t i;
1334 
1335 	mutex_enter(&set->rcs_lock);
1336 	for (i = 0; i < rctl_set_size; i++) {
1337 		rctl_t *r = set->rcs_ctls[i];
1338 
1339 		while (r != NULL) {
1340 			rctl_val_t *v = r->rc_values;
1341 			rctl_t *n = r->rc_next;
1342 
1343 			kmem_cache_free(rctl_cache, r);
1344 
1345 			rctl_val_list_free(v);
1346 
1347 			r = n;
1348 		}
1349 	}
1350 	mutex_exit(&set->rcs_lock);
1351 
1352 	kmem_free(set->rcs_ctls, sizeof (rctl_t *) * rctl_set_size);
1353 	kmem_free(set, sizeof (rctl_set_t));
1354 }
1355 
1356 /*
1357  * void rctl_set_reset(rctl_set_t *)
1358  *
1359  * Overview
1360  *   Resets all rctls within the set such that the lowest value becomes active.
1361  *
1362  * Return values
1363  *   No value returned.
1364  *
1365  * Caller's context
1366  *   No restrictions on context.
1367  */
1368 void
1369 rctl_set_reset(rctl_set_t *set, struct proc *p, rctl_entity_p_t *e)
1370 {
1371 	uint_t i;
1372 
1373 	ASSERT(e);
1374 
1375 	mutex_enter(&set->rcs_lock);
1376 	for (i = 0; i < rctl_set_size; i++) {
1377 		rctl_t *r = set->rcs_ctls[i];
1378 
1379 		while (r != NULL) {
1380 			r->rc_cursor = r->rc_values;
1381 			rctl_val_list_reset(r->rc_cursor);
1382 			RCTLOP_SET(r, p, e, rctl_model_value(r->rc_dict_entry,
1383 			    p, r->rc_cursor->rcv_value));
1384 
1385 			ASSERT(r->rc_cursor != NULL);
1386 
1387 			r = r->rc_next;
1388 		}
1389 	}
1390 
1391 	mutex_exit(&set->rcs_lock);
1392 }
1393 
1394 /*
1395  * void rctl_set_tearoff(rctl_set *, struct proc *)
1396  *
1397  * Overview
1398  *   Tear off any resource control values on this set with an action recipient
1399  *   equal to the specified process (as they are becoming invalid with the
1400  *   process's departure from this set as an observer).
1401  *
1402  * Return values
1403  *   No value returned.
1404  *
1405  * Caller's context
1406  *   No restrictions on context
1407  */
1408 void
1409 rctl_set_tearoff(rctl_set_t *set, struct proc *p)
1410 {
1411 	uint_t i;
1412 
1413 	mutex_enter(&set->rcs_lock);
1414 	for (i = 0; i < rctl_set_size; i++) {
1415 		rctl_t *r = set->rcs_ctls[i];
1416 
1417 		while (r != NULL) {
1418 			rctl_val_t *rval;
1419 
1420 tearoff_rewalk_list:
1421 			rval = r->rc_values;
1422 
1423 			while (rval != NULL) {
1424 				if (rval->rcv_privilege == RCPRIV_BASIC &&
1425 				    rval->rcv_action_recipient == p) {
1426 					if (r->rc_cursor == rval)
1427 						r->rc_cursor = rval->rcv_next;
1428 
1429 					(void) rctl_val_list_delete(
1430 					    &r->rc_values, rval);
1431 
1432 					goto tearoff_rewalk_list;
1433 				}
1434 
1435 				rval = rval->rcv_next;
1436 			}
1437 
1438 			ASSERT(r->rc_cursor != NULL);
1439 
1440 			r = r->rc_next;
1441 		}
1442 	}
1443 
1444 	mutex_exit(&set->rcs_lock);
1445 }
1446 
1447 int
1448 rctl_set_find(rctl_set_t *set, rctl_hndl_t hndl, rctl_t **rctl)
1449 {
1450 	uint_t index = hndl % rctl_set_size;
1451 	rctl_t *curr_ctl;
1452 
1453 	ASSERT(MUTEX_HELD(&set->rcs_lock));
1454 
1455 	for (curr_ctl = set->rcs_ctls[index]; curr_ctl != NULL;
1456 	    curr_ctl = curr_ctl->rc_next) {
1457 		if (curr_ctl->rc_id == hndl) {
1458 			*rctl = curr_ctl;
1459 
1460 			return (0);
1461 		}
1462 	}
1463 
1464 	return (-1);
1465 }
1466 
1467 /*
1468  * rlim64_t rctl_enforced_value(rctl_hndl_t, rctl_set_t *, struct proc *)
1469  *
1470  * Overview
1471  *   Given a process, get the next enforced value on the rctl of the specified
1472  *   handle.
1473  *
1474  * Return value
1475  *   The enforced value.
1476  *
1477  * Caller's context
1478  *   For controls on process collectives, p->p_lock must be held across the
1479  *   operation.
1480  */
1481 /*ARGSUSED*/
1482 rctl_qty_t
1483 rctl_enforced_value(rctl_hndl_t hndl, rctl_set_t *rset, struct proc *p)
1484 {
1485 	rctl_t *rctl;
1486 	rlim64_t ret;
1487 
1488 	mutex_enter(&rset->rcs_lock);
1489 
1490 	if (rctl_set_find(rset, hndl, &rctl) == -1)
1491 		panic("unknown resource control handle %d requested", hndl);
1492 	else
1493 		ret = rctl_model_value(rctl->rc_dict_entry, p,
1494 		    rctl->rc_cursor->rcv_value);
1495 
1496 	mutex_exit(&rset->rcs_lock);
1497 
1498 	return (ret);
1499 }
1500 
1501 /*
1502  * int rctl_global_get(const char *, rctl_dict_entry_t *)
1503  *
1504  * Overview
1505  *   Copy a sanitized version of the global rctl for a given resource control
1506  *   name.  (By sanitization, we mean that the unsafe data pointers have been
1507  *   zeroed.)
1508  *
1509  * Return value
1510  *   -1 if name not defined, 0 otherwise.
1511  *
1512  * Caller's context
1513  *   No restrictions on context.  rctl_dict_lock must not be held.
1514  */
1515 int
1516 rctl_global_get(const char *name, rctl_dict_entry_t *drde)
1517 {
1518 	rctl_dict_entry_t *rde = rctl_dict_lookup(name);
1519 
1520 	if (rde == NULL)
1521 		return (-1);
1522 
1523 	bcopy(rde, drde, sizeof (rctl_dict_entry_t));
1524 
1525 	drde->rcd_next = NULL;
1526 	drde->rcd_ops = NULL;
1527 
1528 	return (0);
1529 }
1530 
1531 /*
1532  * int rctl_global_set(const char *, rctl_dict_entry_t *)
1533  *
1534  * Overview
1535  *   Transfer the settable fields of the named rctl to the global rctl matching
1536  *   the given resource control name.
1537  *
1538  * Return value
1539  *   -1 if name not defined, 0 otherwise.
1540  *
1541  * Caller's context
1542  *   No restrictions on context.  rctl_dict_lock must not be held.
1543  */
1544 int
1545 rctl_global_set(const char *name, rctl_dict_entry_t *drde)
1546 {
1547 	rctl_dict_entry_t *rde = rctl_dict_lookup(name);
1548 
1549 	if (rde == NULL)
1550 		return (-1);
1551 
1552 	rde->rcd_flagaction = drde->rcd_flagaction;
1553 	rde->rcd_syslog_level = drde->rcd_syslog_level;
1554 	rde->rcd_strlog_flags = drde->rcd_strlog_flags;
1555 
1556 	return (0);
1557 }
1558 
1559 static int
1560 rctl_local_op(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval,
1561     int (*cbop)(rctl_hndl_t, struct proc *p, rctl_entity_p_t *e, rctl_t *,
1562     rctl_val_t *, rctl_val_t *), struct proc *p)
1563 {
1564 	rctl_t *rctl;
1565 	rctl_set_t *rset;
1566 	rctl_entity_p_t e;
1567 	int ret = 0;
1568 	rctl_dict_entry_t *rde = rctl_dict_lookup_hndl(hndl);
1569 
1570 	ASSERT(MUTEX_HELD(&p->p_lock));
1571 
1572 	rset = rctl_entity_obtain_rset(rde, p);
1573 
1574 	if (rset == NULL) {
1575 		return (-1);
1576 	}
1577 	rctl_entity_obtain_entity_p(rset->rcs_entity, p, &e);
1578 
1579 	mutex_enter(&rset->rcs_lock);
1580 
1581 	/* using rctl's hndl, get rctl from local set */
1582 	if (rctl_set_find(rset, hndl, &rctl) == -1) {
1583 		mutex_exit(&rset->rcs_lock);
1584 		return (-1);
1585 	}
1586 
1587 	ret = cbop(hndl, p, &e, rctl, oval, nval);
1588 
1589 	mutex_exit(&rset->rcs_lock);
1590 	return (ret);
1591 }
1592 
1593 /*ARGSUSED*/
1594 static int
1595 rctl_local_get_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1596     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1597 {
1598 	if (oval == NULL) {
1599 		/*
1600 		 * RCTL_FIRST
1601 		 */
1602 		bcopy(rctl->rc_values, nval, sizeof (rctl_val_t));
1603 	} else {
1604 		/*
1605 		 * RCTL_NEXT
1606 		 */
1607 		rctl_val_t *tval = rctl_val_list_find(&rctl->rc_values, oval);
1608 
1609 		if (tval == NULL)
1610 			return (ESRCH);
1611 		else if (tval->rcv_next == NULL)
1612 			return (ENOENT);
1613 		else
1614 			bcopy(tval->rcv_next, nval, sizeof (rctl_val_t));
1615 	}
1616 
1617 	return (0);
1618 }
1619 
1620 /*
1621  * int rctl_local_get(rctl_hndl_t, rctl_val_t *)
1622  *
1623  * Overview
1624  *   Get the rctl value for the given flags.
1625  *
1626  * Return values
1627  *   0 for successful get, errno otherwise.
1628  */
1629 int
1630 rctl_local_get(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval,
1631     struct proc *p)
1632 {
1633 	return (rctl_local_op(hndl, oval, nval, rctl_local_get_cb, p));
1634 }
1635 
1636 /*ARGSUSED*/
1637 static int
1638 rctl_local_delete_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1639     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1640 {
1641 	if ((oval = rctl_val_list_find(&rctl->rc_values, nval)) == NULL)
1642 		return (ESRCH);
1643 
1644 	if (rctl->rc_cursor == oval) {
1645 		rctl->rc_cursor = oval->rcv_next;
1646 		rctl_val_list_reset(rctl->rc_cursor);
1647 		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1648 		    rctl->rc_cursor->rcv_value));
1649 
1650 		ASSERT(rctl->rc_cursor != NULL);
1651 	}
1652 
1653 	(void) rctl_val_list_delete(&rctl->rc_values, oval);
1654 
1655 	return (0);
1656 }
1657 
1658 /*
1659  * int rctl_local_delete(rctl_hndl_t, rctl_val_t *)
1660  *
1661  * Overview
1662  *   Delete the rctl value for the given flags.
1663  *
1664  * Return values
1665  *   0 for successful delete, errno otherwise.
1666  */
1667 int
1668 rctl_local_delete(rctl_hndl_t hndl, rctl_val_t *val, struct proc *p)
1669 {
1670 	return (rctl_local_op(hndl, NULL, val, rctl_local_delete_cb, p));
1671 }
1672 
1673 /*
1674  * rctl_local_insert_cb()
1675  *
1676  * Overview
1677  *   Insert a new value into the rctl's val list. If an error occurs,
1678  *   the val list must be left in the same state as when the function
1679  *   was entered.
1680  *
1681  * Return Values
1682  *   0 for successful insert, EINVAL if the value is duplicated in the
1683  *   existing list.
1684  */
1685 /*ARGSUSED*/
1686 static int
1687 rctl_local_insert_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1688     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1689 {
1690 	/*
1691 	 * Before inserting, confirm there are no duplicates of this value
1692 	 * and flag level. If there is a duplicate, flag an error and do
1693 	 * nothing.
1694 	 */
1695 	if (rctl_val_list_insert(&rctl->rc_values, nval) != 0)
1696 		return (EINVAL);
1697 
1698 	if (rctl_val_cmp(nval, rctl->rc_cursor, 0) < 0) {
1699 		rctl->rc_cursor = nval;
1700 		rctl_val_list_reset(rctl->rc_cursor);
1701 		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1702 		    rctl->rc_cursor->rcv_value));
1703 
1704 		ASSERT(rctl->rc_cursor != NULL);
1705 	}
1706 
1707 	return (0);
1708 }
1709 
1710 /*
1711  * int rctl_local_insert(rctl_hndl_t, rctl_val_t *)
1712  *
1713  * Overview
1714  *   Insert the rctl value into the appropriate rctl set for the calling
1715  *   process, given the handle.
1716  */
1717 int
1718 rctl_local_insert(rctl_hndl_t hndl, rctl_val_t *val, struct proc *p)
1719 {
1720 	return (rctl_local_op(hndl, NULL, val, rctl_local_insert_cb, p));
1721 }
1722 
1723 /*
1724  * rctl_local_insert_all_cb()
1725  *
1726  * Overview
1727  *   Called for RCENTITY_PROJECT rctls only, via rctlsys_projset().
1728  *
1729  *   Inserts new values from the project database (new_values).  alloc_values
1730  *   should be a linked list of pre-allocated rctl_val_t, which are used to
1731  *   populate (rc_projdb).
1732  *
1733  *   Should the *new_values linked list match the contents of the rctl's
1734  *   rp_projdb then we do nothing.
1735  *
1736  * Return Values
1737  *   0 is always returned.
1738  */
1739 /*ARGSUSED*/
1740 static int
1741 rctl_local_insert_all_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1742     rctl_t *rctl, rctl_val_t *new_values, rctl_val_t *alloc_values)
1743 {
1744 	rctl_val_t *val;
1745 	rctl_val_t *tmp_val;
1746 	rctl_val_t *next;
1747 	int modified = 0;
1748 
1749 	/*
1750 	 * If this the first time we've set this project rctl, then we delete
1751 	 * all the privilege values.  These privilege values have been set by
1752 	 * rctl_add_default_limit().
1753 	 *
1754 	 * We save some cycles here by not calling rctl_val_list_delete().
1755 	 */
1756 	if (rctl->rc_projdb == NULL) {
1757 		val = rctl->rc_values;
1758 
1759 		while (val != NULL) {
1760 			if (val->rcv_privilege == RCPRIV_PRIVILEGED) {
1761 				if (val->rcv_prev != NULL)
1762 					val->rcv_prev->rcv_next = val->rcv_next;
1763 				else
1764 					rctl->rc_values = val->rcv_next;
1765 
1766 				if (val->rcv_next != NULL)
1767 					val->rcv_next->rcv_prev = val->rcv_prev;
1768 
1769 				tmp_val = val;
1770 				val = val->rcv_next;
1771 				kmem_cache_free(rctl_val_cache, tmp_val);
1772 			} else {
1773 				val = val->rcv_next;
1774 			}
1775 		}
1776 		modified = 1;
1777 	}
1778 
1779 	/*
1780 	 * Delete active values previously set through the project database.
1781 	 */
1782 	val = rctl->rc_projdb;
1783 
1784 	while (val != NULL) {
1785 
1786 		/* Is the old value found in the new values? */
1787 		if (rctl_val_list_find(&new_values, val) == NULL) {
1788 
1789 			/*
1790 			 * Delete from the active values if it originated from
1791 			 * the project database.
1792 			 */
1793 			if (((tmp_val = rctl_val_list_find(&rctl->rc_values,
1794 			    val)) != NULL) &&
1795 			    (tmp_val->rcv_flagaction & RCTL_LOCAL_PROJDB)) {
1796 				(void) rctl_val_list_delete(&rctl->rc_values,
1797 				    tmp_val);
1798 			}
1799 
1800 			tmp_val = val->rcv_next;
1801 			(void) rctl_val_list_delete(&rctl->rc_projdb, val);
1802 			val = tmp_val;
1803 			modified = 1;
1804 
1805 		} else
1806 			val = val->rcv_next;
1807 	}
1808 
1809 	/*
1810 	 * Insert new values from the project database.
1811 	 */
1812 	while (new_values != NULL) {
1813 		next = new_values->rcv_next;
1814 
1815 		/*
1816 		 * Insert this new value into the rc_projdb, and duplicate this
1817 		 * entry to the active list.
1818 		 */
1819 		if (rctl_val_list_insert(&rctl->rc_projdb, new_values) == 0) {
1820 
1821 			tmp_val = alloc_values->rcv_next;
1822 			bcopy(new_values, alloc_values, sizeof (rctl_val_t));
1823 			alloc_values->rcv_next = tmp_val;
1824 
1825 			if (rctl_val_list_insert(&rctl->rc_values,
1826 			    alloc_values) == 0) {
1827 				/* inserted move alloc_values on */
1828 				alloc_values = tmp_val;
1829 				modified = 1;
1830 			}
1831 		} else {
1832 			/*
1833 			 * Unlike setrctl() we don't want to return an error on
1834 			 * a duplicate entry; we are concerned solely with
1835 			 * ensuring that all the values specified are set.
1836 			 */
1837 			kmem_cache_free(rctl_val_cache, new_values);
1838 		}
1839 		new_values = next;
1840 	}
1841 
1842 	/* Teardown any unused rctl_val_t */
1843 	while (alloc_values != NULL) {
1844 		tmp_val = alloc_values;
1845 		alloc_values = alloc_values->rcv_next;
1846 		kmem_cache_free(rctl_val_cache, tmp_val);
1847 	}
1848 
1849 	/* Reset the cursor if rctl values have been modified */
1850 	if (modified) {
1851 		rctl->rc_cursor = rctl->rc_values;
1852 		rctl_val_list_reset(rctl->rc_cursor);
1853 		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1854 		    rctl->rc_cursor->rcv_value));
1855 	}
1856 
1857 	return (0);
1858 }
1859 
1860 int
1861 rctl_local_insert_all(rctl_hndl_t hndl, rctl_val_t *new_values,
1862     rctl_val_t *alloc_values, struct proc *p)
1863 {
1864 	return (rctl_local_op(hndl, new_values, alloc_values,
1865 	    rctl_local_insert_all_cb, p));
1866 }
1867 
1868 /*
1869  * rctl_local_replace_all_cb()
1870  *
1871  * Overview
1872  *   Called for RCENTITY_PROJECT rctls only, via rctlsys_projset().
1873  *
1874  *   Clears the active rctl values (rc_values), and stored values from the
1875  *   previous insertions from the project database (rc_projdb).
1876  *
1877  *   Inserts new values from the project database (new_values).  alloc_values
1878  *   should be a linked list of pre-allocated rctl_val_t, which are used to
1879  *   populate (rc_projdb).
1880  *
1881  * Return Values
1882  *   0 is always returned.
1883  */
1884 /*ARGSUSED*/
1885 static int
1886 rctl_local_replace_all_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1887     rctl_t *rctl, rctl_val_t *new_values, rctl_val_t *alloc_values)
1888 {
1889 	rctl_val_t *val;
1890 	rctl_val_t *next;
1891 	rctl_val_t *tmp_val;
1892 
1893 	/* Delete all the privilege vaules */
1894 	val = rctl->rc_values;
1895 
1896 	while (val != NULL) {
1897 		if (val->rcv_privilege == RCPRIV_PRIVILEGED) {
1898 			if (val->rcv_prev != NULL)
1899 				val->rcv_prev->rcv_next = val->rcv_next;
1900 			else
1901 				rctl->rc_values = val->rcv_next;
1902 
1903 			if (val->rcv_next != NULL)
1904 				val->rcv_next->rcv_prev = val->rcv_prev;
1905 
1906 			tmp_val = val;
1907 			val = val->rcv_next;
1908 			kmem_cache_free(rctl_val_cache, tmp_val);
1909 		} else {
1910 			val = val->rcv_next;
1911 		}
1912 	}
1913 
1914 	/* Delete the contents of rc_projdb */
1915 	val = rctl->rc_projdb;
1916 	while (val != NULL) {
1917 
1918 		tmp_val = val;
1919 		val = val->rcv_next;
1920 		kmem_cache_free(rctl_val_cache, tmp_val);
1921 	}
1922 	rctl->rc_projdb = NULL;
1923 
1924 	/*
1925 	 * Insert new values from the project database.
1926 	 */
1927 	while (new_values != NULL) {
1928 		next = new_values->rcv_next;
1929 
1930 		if (rctl_val_list_insert(&rctl->rc_projdb, new_values) == 0) {
1931 			tmp_val = alloc_values->rcv_next;
1932 			bcopy(new_values, alloc_values, sizeof (rctl_val_t));
1933 			alloc_values->rcv_next = tmp_val;
1934 
1935 			if (rctl_val_list_insert(&rctl->rc_values,
1936 			    alloc_values) == 0) {
1937 				/* inserted, so move alloc_values on */
1938 				alloc_values = tmp_val;
1939 			}
1940 		} else {
1941 			/*
1942 			 * Unlike setrctl() we don't want to return an error on
1943 			 * a duplicate entry; we are concerned solely with
1944 			 * ensuring that all the values specified are set.
1945 			 */
1946 			kmem_cache_free(rctl_val_cache, new_values);
1947 		}
1948 
1949 		new_values = next;
1950 	}
1951 
1952 	/* Teardown any unused rctl_val_t */
1953 	while (alloc_values != NULL) {
1954 		tmp_val = alloc_values;
1955 		alloc_values = alloc_values->rcv_next;
1956 		kmem_cache_free(rctl_val_cache, tmp_val);
1957 	}
1958 
1959 	/* Always reset the cursor */
1960 	rctl->rc_cursor = rctl->rc_values;
1961 	rctl_val_list_reset(rctl->rc_cursor);
1962 	RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1963 	    rctl->rc_cursor->rcv_value));
1964 
1965 	return (0);
1966 }
1967 
1968 int
1969 rctl_local_replace_all(rctl_hndl_t hndl, rctl_val_t *new_values,
1970     rctl_val_t *alloc_values, struct proc *p)
1971 {
1972 	return (rctl_local_op(hndl, new_values, alloc_values,
1973 	    rctl_local_replace_all_cb, p));
1974 }
1975 
1976 static int
1977 rctl_local_replace_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1978     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1979 {
1980 	int ret;
1981 	rctl_val_t *tmp;
1982 
1983 	/* Verify that old will be delete-able */
1984 	tmp = rctl_val_list_find(&rctl->rc_values, oval);
1985 	if (tmp == NULL)
1986 		return (ESRCH);
1987 	/*
1988 	 * Caller should verify that value being deleted is not the
1989 	 * system value.
1990 	 */
1991 	ASSERT(tmp->rcv_privilege != RCPRIV_SYSTEM);
1992 
1993 	/*
1994 	 * rctl_local_insert_cb() does the job of flagging an error
1995 	 * for any duplicate values. So, call rctl_local_insert_cb()
1996 	 * for the new value first, then do deletion of the old value.
1997 	 * Since this is a callback function to rctl_local_op, we can
1998 	 * count on rcs_lock being held at this point. This guarantees
1999 	 * that there is at no point a visible list which contains both
2000 	 * new and old values.
2001 	 */
2002 	if (ret = rctl_local_insert_cb(hndl, p, e, rctl, NULL, nval))
2003 		return (ret);
2004 
2005 	ret = rctl_local_delete_cb(hndl, p, e, rctl, NULL, oval);
2006 	ASSERT(ret == 0);
2007 	return (0);
2008 }
2009 
2010 /*
2011  * int rctl_local_replace(rctl_hndl_t, void *, int, uint64_t *)
2012  *
2013  * Overview
2014  *   Replace the rctl value with a new one.
2015  *
2016  * Return values
2017  *   0 for successful replace, errno otherwise.
2018  */
2019 int
2020 rctl_local_replace(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval,
2021     struct proc *p)
2022 {
2023 	return (rctl_local_op(hndl, oval, nval, rctl_local_replace_cb, p));
2024 }
2025 
2026 /*
2027  * int rctl_rlimit_get(rctl_hndl_t, struct proc *, struct rlimit64 *)
2028  *
2029  * Overview
2030  *   To support rlimit compatibility, we need a function which takes a 64-bit
2031  *   rlimit and encodes it as appropriate rcontrol values on the given rcontrol.
2032  *   This operation is only intended for legacy rlimits.
2033  */
2034 int
2035 rctl_rlimit_get(rctl_hndl_t rc, struct proc *p, struct rlimit64 *rlp64)
2036 {
2037 	rctl_t *rctl;
2038 	rctl_val_t *rval;
2039 	rctl_set_t *rset = p->p_rctls;
2040 	int soft_limit_seen = 0;
2041 	int test_for_deny = 1;
2042 
2043 	mutex_enter(&rset->rcs_lock);
2044 	if (rctl_set_find(rset, rc, &rctl) == -1) {
2045 		mutex_exit(&rset->rcs_lock);
2046 		return (-1);
2047 	}
2048 
2049 	rval = rctl->rc_values;
2050 
2051 	if (rctl->rc_dict_entry->rcd_flagaction & (RCTL_GLOBAL_DENY_NEVER |
2052 	    RCTL_GLOBAL_DENY_ALWAYS))
2053 		test_for_deny = 0;
2054 
2055 	/*
2056 	 * 1.  Find the first control value with the RCTL_LOCAL_DENY bit set.
2057 	 */
2058 	while (rval != NULL && rval->rcv_privilege != RCPRIV_SYSTEM) {
2059 		if (test_for_deny &&
2060 		    (rval->rcv_flagaction & RCTL_LOCAL_DENY) == 0) {
2061 			rval = rval->rcv_next;
2062 			continue;
2063 		}
2064 
2065 		/*
2066 		 * 2.  If this is an RCPRIV_BASIC value, then we've found the
2067 		 * effective soft limit and should set rlim_cur.  We should then
2068 		 * continue looking for another control value with the DENY bit
2069 		 * set.
2070 		 */
2071 		if (rval->rcv_privilege == RCPRIV_BASIC) {
2072 			if (soft_limit_seen) {
2073 				rval = rval->rcv_next;
2074 				continue;
2075 			}
2076 
2077 			if ((rval->rcv_flagaction & RCTL_LOCAL_MAXIMAL) == 0 &&
2078 			    rval->rcv_value < rctl_model_maximum(
2079 			    rctl->rc_dict_entry, p))
2080 				rlp64->rlim_cur = rval->rcv_value;
2081 			else
2082 				rlp64->rlim_cur = RLIM64_INFINITY;
2083 			soft_limit_seen = 1;
2084 
2085 			rval = rval->rcv_next;
2086 			continue;
2087 		}
2088 
2089 		/*
2090 		 * 3.  This is an RCPRIV_PRIVILEGED value.  If we haven't found
2091 		 * a soft limit candidate, then we've found the effective hard
2092 		 * and soft limits and should set both  If we had found a soft
2093 		 * limit, then this is only the hard limit and we need only set
2094 		 * rlim_max.
2095 		 */
2096 		if ((rval->rcv_flagaction & RCTL_LOCAL_MAXIMAL) == 0 &&
2097 		    rval->rcv_value < rctl_model_maximum(rctl->rc_dict_entry,
2098 		    p))
2099 			rlp64->rlim_max = rval->rcv_value;
2100 		else
2101 			rlp64->rlim_max = RLIM64_INFINITY;
2102 		if (!soft_limit_seen)
2103 			rlp64->rlim_cur = rlp64->rlim_max;
2104 
2105 		mutex_exit(&rset->rcs_lock);
2106 		return (0);
2107 	}
2108 
2109 	if (rval == NULL) {
2110 		/*
2111 		 * This control sequence is corrupt, as it is not terminated by
2112 		 * a system privileged control value.
2113 		 */
2114 		mutex_exit(&rset->rcs_lock);
2115 		return (-1);
2116 	}
2117 
2118 	/*
2119 	 * 4.  If we run into a RCPRIV_SYSTEM value, then the hard limit (and
2120 	 * the soft, if we haven't a soft candidate) should be the value of the
2121 	 * system control value.
2122 	 */
2123 	if ((rval->rcv_flagaction & RCTL_LOCAL_MAXIMAL) == 0 &&
2124 	    rval->rcv_value < rctl_model_maximum(rctl->rc_dict_entry, p))
2125 		rlp64->rlim_max = rval->rcv_value;
2126 	else
2127 		rlp64->rlim_max = RLIM64_INFINITY;
2128 
2129 	if (!soft_limit_seen)
2130 		rlp64->rlim_cur = rlp64->rlim_max;
2131 
2132 	mutex_exit(&rset->rcs_lock);
2133 	return (0);
2134 }
2135 
2136 /*
2137  * rctl_alloc_gp_t *rctl_rlimit_set_prealloc(uint_t)
2138  *
2139  * Overview
2140  *   Before making a series of calls to rctl_rlimit_set(), we must have a
2141  *   preallocated batch of resource control values, as rctl_rlimit_set() can
2142  *   potentially consume two resource control values per call.
2143  *
2144  * Return values
2145  *   A populated resource control allocation group with 2n resource control
2146  *   values.
2147  *
2148  * Caller's context
2149  *   Must be safe for KM_SLEEP allocations.
2150  */
2151 rctl_alloc_gp_t *
2152 rctl_rlimit_set_prealloc(uint_t n)
2153 {
2154 	rctl_alloc_gp_t *gp = kmem_zalloc(sizeof (rctl_alloc_gp_t), KM_SLEEP);
2155 
2156 	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
2157 
2158 	gp->rcag_nvals = 2 * n;
2159 
2160 	rctl_gp_alloc(gp);
2161 
2162 	return (gp);
2163 }
2164 
/*
 * int rctl_rlimit_set(rctl_hndl_t, struct proc *, struct rlimit64 *,
 *   rctl_alloc_gp_t *, int, int, const cred_t *)
 *
 * Overview
 *   To support rlimit compatibility, we need a function which takes a 64-bit
 *   rlimit and encodes it as appropriate rcontrol values on the given rcontrol.
 *   This operation is only intended for legacy rlimits.
 *
 *   The implementation of rctl_rlimit_set() is a bit clever, as it tries to
 *   minimize the number of values placed on the value sequence in various
 *   cases.  Furthermore, we don't allow multiple identical privilege-action
 *   values on the same sequence.  (That is, we don't want a sequence like
 *   "while (1) { rlim.rlim_cur++; setrlimit(..., rlim); }" to exhaust kernel
 *   memory.)  So we want to delete any values with the same privilege value and
 *   action.
 *
 * Arguments
 *   rc         - handle of the control to modify on p's process rctl set.
 *   p          - process whose p_rctls set is updated.
 *   rlp64      - requested soft (rlim_cur) and hard (rlim_max) limits.
 *   ragp       - allocation group supplying the new value(s); one value is
 *                always detached and a second is detached only when a basic
 *                (soft) value is installed.  See rctl_rlimit_set_prealloc().
 *   flagaction - local flags/action stored on the installed value(s).
 *   signal     - signal number stored on the installed value(s).
 *   cr         - credential checked with secpolicy_resource() when the hard
 *                limit is being raised above a finite current hard limit.
 *
 * Return values
 *   0 for successful set, errno otherwise. Errno will be either EINVAL
 *   or EPERM, in keeping with defined errnos for ulimit() and setrlimit()
 *   system calls.  EINVAL is returned when rlim_cur exceeds rlim_max, when
 *   rctl_rlimit_get() fails, or when the control is not found on the set.
 */
/*ARGSUSED*/
int
rctl_rlimit_set(rctl_hndl_t rc, struct proc *p, struct rlimit64 *rlp64,
    rctl_alloc_gp_t *ragp, int flagaction, int signal, const cred_t *cr)
{
	rctl_t *rctl;
	rctl_val_t *rval, *rval_priv, *rval_basic;
	rctl_set_t *rset = p->p_rctls;
	rctl_qty_t max;
	rctl_entity_p_t e;
	struct rlimit64 cur_rl;

	e.rcep_t = RCENTITY_PROCESS;
	e.rcep_p.proc = p;

	/* A soft limit above the hard limit is never acceptable. */
	if (rlp64->rlim_cur > rlp64->rlim_max)
		return (EINVAL);

	/* Fetch the limits currently in force for the privilege check. */
	if (rctl_rlimit_get(rc, p, &cur_rl) == -1)
		return (EINVAL);

	/*
	 * If we are not privileged, we can only lower the hard limit.
	 */
	if ((rlp64->rlim_max > cur_rl.rlim_max) &&
	    cur_rl.rlim_max != RLIM64_INFINITY &&
	    secpolicy_resource(cr) != 0)
		return (EPERM);

	mutex_enter(&rset->rcs_lock);

	if (rctl_set_find(rset, rc, &rctl) == -1) {
		mutex_exit(&rset->rcs_lock);
		return (EINVAL);
	}

	rval_priv = rctl_gp_detach_val(ragp);

	rval = rctl->rc_values;

	/*
	 * Walk the value sequence up to (but not including) the first
	 * RCPRIV_SYSTEM value, deleting every basic value and every value
	 * whose flag bits outside RCTL_LOCAL_ACTION_MASK match those of the
	 * requested flagaction.  The successor is captured before any
	 * deletion so the walk can continue.
	 */
	while (rval != NULL) {
		rctl_val_t *next = rval->rcv_next;

		if (rval->rcv_privilege == RCPRIV_SYSTEM)
			break;

		if ((rval->rcv_privilege == RCPRIV_BASIC) ||
		    (rval->rcv_flagaction & ~RCTL_LOCAL_ACTION_MASK) ==
		    (flagaction & ~RCTL_LOCAL_ACTION_MASK)) {
			/*
			 * If the cursor points at the value being removed,
			 * advance it to the successor and push that value
			 * through the control's set operation first.
			 */
			if (rctl->rc_cursor == rval) {
				rctl->rc_cursor = rval->rcv_next;
				rctl_val_list_reset(rctl->rc_cursor);
				RCTLOP_SET(rctl, p, &e, rctl_model_value(
				    rctl->rc_dict_entry, p,
				    rctl->rc_cursor->rcv_value));
			}
			(void) rctl_val_list_delete(&rctl->rc_values, rval);
		}

		rval = next;
	}

	/*
	 * Build the privileged value that represents the hard limit.  An
	 * infinite hard limit is encoded as the control's native maximum with
	 * RCTL_LOCAL_MAXIMAL set.
	 */
	rval_priv->rcv_privilege = RCPRIV_PRIVILEGED;
	rval_priv->rcv_flagaction = flagaction;
	if (rlp64->rlim_max == RLIM64_INFINITY) {
		rval_priv->rcv_flagaction |= RCTL_LOCAL_MAXIMAL;
		max = rctl->rc_dict_entry->rcd_max_native;
	} else {
		max = rlp64->rlim_max;
	}
	rval_priv->rcv_value = max;
	rval_priv->rcv_action_signal = signal;
	rval_priv->rcv_action_recipient = NULL;
	rval_priv->rcv_action_recip_pid = -1;
	rval_priv->rcv_firing_time = 0;
	rval_priv->rcv_prev = rval_priv->rcv_next = NULL;

	/* Install the privileged value and make it the current cursor. */
	(void) rctl_val_list_insert(&rctl->rc_values, rval_priv);
	rctl->rc_cursor = rval_priv;
	rctl_val_list_reset(rctl->rc_cursor);
	RCTLOP_SET(rctl, p, &e, rctl_model_value(rctl->rc_dict_entry, p,
	    rctl->rc_cursor->rcv_value));

	/*
	 * A basic value for the soft limit is only needed when the soft limit
	 * is finite and strictly below the hard limit just installed.  When
	 * present it becomes the cursor, and its action recipient is p.
	 */
	if (rlp64->rlim_cur != RLIM64_INFINITY && rlp64->rlim_cur < max) {
		rval_basic = rctl_gp_detach_val(ragp);

		rval_basic->rcv_privilege = RCPRIV_BASIC;
		rval_basic->rcv_value = rlp64->rlim_cur;
		rval_basic->rcv_flagaction = flagaction;
		rval_basic->rcv_action_signal = signal;
		rval_basic->rcv_action_recipient = p;
		rval_basic->rcv_action_recip_pid = p->p_pid;
		rval_basic->rcv_firing_time = 0;
		rval_basic->rcv_prev = rval_basic->rcv_next = NULL;

		(void) rctl_val_list_insert(&rctl->rc_values, rval_basic);
		rctl->rc_cursor = rval_basic;
		rctl_val_list_reset(rctl->rc_cursor);
		RCTLOP_SET(rctl, p, &e, rctl_model_value(rctl->rc_dict_entry, p,
		    rctl->rc_cursor->rcv_value));
	}

	ASSERT(rctl->rc_cursor != NULL);

	mutex_exit(&rset->rcs_lock);
	return (0);
}
2294 
2295 
2296 /*
2297  * rctl_hndl_t rctl_register(const char *, rctl_entity_t, int, rlim64_t,
2298  *   rlim64_t, rctl_ops_t *)
2299  *
2300  * Overview
2301  *   rctl_register() performs a look-up in the dictionary of rctls
2302  *   active on the system; if a rctl of that name is absent, an entry is
2303  *   made into the dictionary.  The rctl is returned with its reference
2304  *   count incremented by one.  If the rctl name already exists, we panic.
2305  *   (Were the resource control system to support dynamic loading and unloading,
2306  *   which it is structured for, duplicate registration should lead to load
2307  *   failure instead of panicking.)
2308  *
2309  *   Each registered rctl has a requirement that a RCPRIV_SYSTEM limit be
2310  *   defined.  This limit contains the highest possible value for this quantity
2311  *   on the system.  Furthermore, the registered control must provide infinite
2312  *   values for all applicable address space models supported by the operating
2313  *   system.  Attempts to set resource control values beyond the system limit
2314  *   will fail.
2315  *
2316  * Return values
2317  *   The rctl's ID.
2318  *
2319  * Caller's context
2320  *   Caller must be in a context suitable for KM_SLEEP allocations.
2321  */
2322 rctl_hndl_t
2323 rctl_register(
2324     const char *name,
2325     rctl_entity_t entity,
2326     int global_flags,
2327     rlim64_t max_native,
2328     rlim64_t max_ilp32,
2329     rctl_ops_t *ops)
2330 {
2331 	rctl_t *rctl = kmem_cache_alloc(rctl_cache, KM_SLEEP);
2332 	rctl_val_t *rctl_val = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2333 	rctl_dict_entry_t *rctl_de = kmem_zalloc(sizeof (rctl_dict_entry_t),
2334 	    KM_SLEEP);
2335 	rctl_t *old_rctl;
2336 	rctl_hndl_t rhndl;
2337 	int localflags;
2338 
2339 	ASSERT(ops != NULL);
2340 
2341 	bzero(rctl, sizeof (rctl_t));
2342 	bzero(rctl_val, sizeof (rctl_val_t));
2343 
2344 	if (global_flags & RCTL_GLOBAL_DENY_NEVER)
2345 		localflags = RCTL_LOCAL_MAXIMAL;
2346 	else
2347 		localflags = RCTL_LOCAL_MAXIMAL | RCTL_LOCAL_DENY;
2348 
2349 	rctl_val->rcv_privilege = RCPRIV_SYSTEM;
2350 	rctl_val->rcv_value = max_native;
2351 	rctl_val->rcv_flagaction = localflags;
2352 	rctl_val->rcv_action_signal = 0;
2353 	rctl_val->rcv_action_recipient = NULL;
2354 	rctl_val->rcv_action_recip_pid = -1;
2355 	rctl_val->rcv_firing_time = 0;
2356 	rctl_val->rcv_next = NULL;
2357 	rctl_val->rcv_prev = NULL;
2358 
2359 	rctl_de->rcd_name = (char *)name;
2360 	rctl_de->rcd_default_value = rctl_val;
2361 	rctl_de->rcd_max_native = max_native;
2362 	rctl_de->rcd_max_ilp32 = max_ilp32;
2363 	rctl_de->rcd_entity = entity;
2364 	rctl_de->rcd_ops = ops;
2365 	rctl_de->rcd_flagaction = global_flags;
2366 
2367 	rctl->rc_dict_entry = rctl_de;
2368 	rctl->rc_values = rctl_val;
2369 
2370 	/*
2371 	 * 1.  Take global lock, validate nonexistence of name, get ID.
2372 	 */
2373 	mutex_enter(&rctl_dict_lock);
2374 
2375 	if (mod_hash_find(rctl_dict_by_name, (mod_hash_key_t)name,
2376 	    (mod_hash_val_t *)&rhndl) != MH_ERR_NOTFOUND)
2377 		panic("duplicate registration of rctl %s", name);
2378 
2379 	rhndl = rctl_de->rcd_id = rctl->rc_id =
2380 	    (rctl_hndl_t)id_alloc(rctl_ids);
2381 
2382 	/*
2383 	 * 2.  Insert name-entry pair in rctl_dict_by_name.
2384 	 */
2385 	if (mod_hash_insert(rctl_dict_by_name, (mod_hash_key_t)name,
2386 	    (mod_hash_val_t)rctl_de))
2387 		panic("unable to insert rctl dict entry for %s (%u)", name,
2388 		    (uint_t)rctl->rc_id);
2389 
2390 	/*
2391 	 * 3.  Insert ID-rctl_t * pair in rctl_dict.
2392 	 */
2393 	if (mod_hash_find(rctl_dict, (mod_hash_key_t)(uintptr_t)rctl->rc_id,
2394 	    (mod_hash_val_t *)&old_rctl) != MH_ERR_NOTFOUND)
2395 		panic("duplicate rctl ID %u registered", rctl->rc_id);
2396 
2397 	if (mod_hash_insert(rctl_dict, (mod_hash_key_t)(uintptr_t)rctl->rc_id,
2398 	    (mod_hash_val_t)rctl))
2399 		panic("unable to insert rctl %s/%u (%p)", name,
2400 		    (uint_t)rctl->rc_id, (void *)rctl);
2401 
2402 	/*
2403 	 * 3a. Insert rctl_dict_entry_t * in appropriate entity list.
2404 	 */
2405 
2406 	mutex_enter(&rctl_lists_lock);
2407 
2408 	switch (entity) {
2409 	case RCENTITY_ZONE:
2410 	case RCENTITY_PROJECT:
2411 	case RCENTITY_TASK:
2412 	case RCENTITY_PROCESS:
2413 		rctl_de->rcd_next = rctl_lists[entity];
2414 		rctl_lists[entity] = rctl_de;
2415 		break;
2416 	default:
2417 		panic("registering unknown rctl entity %d (%s)", entity,
2418 		    name);
2419 		break;
2420 	}
2421 
2422 	mutex_exit(&rctl_lists_lock);
2423 
2424 	/*
2425 	 * 4.  Drop lock.
2426 	 */
2427 	mutex_exit(&rctl_dict_lock);
2428 
2429 	return (rhndl);
2430 }
2431 
2432 /*
2433  * static int rctl_global_action(rctl_t *r, rctl_set_t *rset, struct proc *p,
2434  *    rctl_val_t *v)
2435  *
2436  * Overview
2437  *   rctl_global_action() takes, in according with the flags on the rctl_dict
2438  *   entry for the given control, the appropriate actions on the exceeded
2439  *   control value.  Additionally, rctl_global_action() updates the firing time
2440  *   on the exceeded value.
2441  *
2442  * Return values
2443  *   A bitmask reflecting the actions actually taken.
2444  *
2445  * Caller's context
2446  *   No restrictions on context.
2447  */
2448 /*ARGSUSED*/
2449 static int
2450 rctl_global_action(rctl_t *r, rctl_set_t *rset, struct proc *p, rctl_val_t *v)
2451 {
2452 	rctl_dict_entry_t *rde = r->rc_dict_entry;
2453 	const char *pr, *en, *idstr;
2454 	id_t id;
2455 	enum {
2456 		SUFFIX_NONE,	/* id consumed directly */
2457 		SUFFIX_NUMERIC,	/* id consumed in suffix */
2458 		SUFFIX_STRING	/* idstr consumed in suffix */
2459 	} suffix = SUFFIX_NONE;
2460 	int ret = 0;
2461 
2462 	v->rcv_firing_time = gethrtime();
2463 
2464 	switch (v->rcv_privilege) {
2465 	case RCPRIV_BASIC:
2466 		pr = "basic";
2467 		break;
2468 	case RCPRIV_PRIVILEGED:
2469 		pr = "privileged";
2470 		break;
2471 	case RCPRIV_SYSTEM:
2472 		pr = "system";
2473 		break;
2474 	default:
2475 		pr = "unknown";
2476 		break;
2477 	}
2478 
2479 	switch (rde->rcd_entity) {
2480 	case RCENTITY_PROCESS:
2481 		en = "process";
2482 		id = p->p_pid;
2483 		suffix = SUFFIX_NONE;
2484 		break;
2485 	case RCENTITY_TASK:
2486 		en = "task";
2487 		id = p->p_task->tk_tkid;
2488 		suffix = SUFFIX_NUMERIC;
2489 		break;
2490 	case RCENTITY_PROJECT:
2491 		en = "project";
2492 		id = p->p_task->tk_proj->kpj_id;
2493 		suffix = SUFFIX_NUMERIC;
2494 		break;
2495 	case RCENTITY_ZONE:
2496 		en = "zone";
2497 		idstr = p->p_zone->zone_name;
2498 		suffix = SUFFIX_STRING;
2499 		break;
2500 	default:
2501 		en = "unknown entity associated with process";
2502 		id = p->p_pid;
2503 		suffix = SUFFIX_NONE;
2504 		break;
2505 	}
2506 
2507 	if (rde->rcd_flagaction & RCTL_GLOBAL_SYSLOG) {
2508 		switch (suffix) {
2509 		default:
2510 		case SUFFIX_NONE:
2511 			(void) strlog(0, 0, 0,
2512 			    rde->rcd_strlog_flags | log_global.lz_active,
2513 			    "%s rctl %s (value %llu) exceeded by %s %d.",
2514 			    pr, rde->rcd_name, v->rcv_value, en, id);
2515 			break;
2516 		case SUFFIX_NUMERIC:
2517 			(void) strlog(0, 0, 0,
2518 			    rde->rcd_strlog_flags | log_global.lz_active,
2519 			    "%s rctl %s (value %llu) exceeded by process %d"
2520 			    " in %s %d.",
2521 			    pr, rde->rcd_name, v->rcv_value, p->p_pid,
2522 			    en, id);
2523 			break;
2524 		case SUFFIX_STRING:
2525 			(void) strlog(0, 0, 0,
2526 			    rde->rcd_strlog_flags | log_global.lz_active,
2527 			    "%s rctl %s (value %llu) exceeded by process %d"
2528 			    " in %s %s.",
2529 			    pr, rde->rcd_name, v->rcv_value, p->p_pid,
2530 			    en, idstr);
2531 			break;
2532 		}
2533 	}
2534 
2535 	if (rde->rcd_flagaction & RCTL_GLOBAL_DENY_ALWAYS)
2536 		ret |= RCT_DENY;
2537 
2538 	return (ret);
2539 }
2540 
/*
 * static int rctl_local_action(rctl_t *r, rctl_set_t *rset, struct proc *p,
 *    rctl_val_t *v, uint_t safety)
 *
 * Overview
 *   Carries out the local actions recorded in the flags of value v on
 *   control r:  signal delivery (RCTL_LOCAL_SIGNAL) and denial
 *   (RCTL_LOCAL_DENY).  The recipient, recipient pid, signal and flags are
 *   copied out of v on entry, before any lock is dropped.
 *
 *   The safety argument selects how much may be done:
 *     RCA_UNSAFE_ALL - no signal is sent; only RCT_DENY is reported when
 *                      RCTL_LOCAL_DENY is set.
 *     RCA_SAFE       - rcs_lock and p_lock are dropped so that a sigqueue_t
 *                      can be allocated with KM_SLEEP, then reacquired; the
 *                      signal is queued with siginfo.
 *     other values   - the signal is posted with sigtoproc(), without
 *                      siginfo.
 *
 * Return values
 *   A bitmask of RCT_SIGNAL, RCT_DENY and RCT_LK_ABANDONED.  When
 *   RCT_LK_ABANDONED is set, this function has returned holding p_lock but
 *   NOT rset->rcs_lock; the caller must reacquire the set lock itself.
 *
 * Caller's context
 *   The function releases and reacquires both p->p_lock and rset->rcs_lock,
 *   so it appears to expect both to be held on entry (to be confirmed against
 *   callers).
 */
static int
rctl_local_action(rctl_t *r, rctl_set_t *rset, struct proc *p, rctl_val_t *v,
    uint_t safety)
{
	int ret = 0;
	sigqueue_t *sqp = NULL;
	rctl_dict_entry_t *rde = r->rc_dict_entry;
	int unobservable = (rde->rcd_flagaction & RCTL_GLOBAL_UNOBSERVABLE);

	/*
	 * Snapshot everything we need from v up front; the locks may be
	 * dropped below.
	 */
	proc_t *recipient = v->rcv_action_recipient;
	id_t recip_pid = v->rcv_action_recip_pid;
	int recip_signal = v->rcv_action_signal;
	uint_t flagaction = v->rcv_flagaction;

	/* Nothing but the deny decision is permitted in this mode. */
	if (safety == RCA_UNSAFE_ALL) {
		if (flagaction & RCTL_LOCAL_DENY) {
			ret |= RCT_DENY;
		}
		return (ret);
	}

	if (flagaction & RCTL_LOCAL_SIGNAL) {
		/*
		 * We can build a siginfo only in the case that it is
		 * safe for us to drop p_lock.  (For asynchronous
		 * checks this is currently not true.)
		 */
		if (safety == RCA_SAFE) {
			mutex_exit(&rset->rcs_lock);
			mutex_exit(&p->p_lock);
			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
			mutex_enter(&p->p_lock);
			mutex_enter(&rset->rcs_lock);

			sqp->sq_info.si_signo = recip_signal;
			sqp->sq_info.si_code = SI_RCTL;
			sqp->sq_info.si_errno = 0;
			sqp->sq_info.si_entity = (int)rde->rcd_entity;
		}

		if (recipient == NULL || recipient == p) {
			/* The violating process itself is the target. */
			ret |= RCT_SIGNAL;

			if (sqp == NULL) {
				sigtoproc(p, NULL, recip_signal);
			} else if (p == curproc) {
				/*
				 * Then this is a synchronous test and we can
				 * direct the signal at the violating thread.
				 */
				sigaddqa(curproc, curthread, sqp);
			} else {
				sigaddqa(p, NULL, sqp);
			}
		} else if (!unobservable) {
			proc_t *rp;

			/*
			 * The target is some other process.  Release our
			 * locks before taking pidlock and the recipient's
			 * p_lock.
			 */
			mutex_exit(&rset->rcs_lock);
			mutex_exit(&p->p_lock);

			mutex_enter(&pidlock);
			if ((rp = prfind(recip_pid)) == recipient) {
				/*
				 * Recipient process is still alive, but may not
				 * be in this task or project any longer.  In
				 * this case, the recipient's resource control
				 * set pertinent to this control will have
				 * changed--and we will not deliver the signal,
				 * as the recipient process is trying to tear
				 * itself off of its former set.
				 */
				mutex_enter(&rp->p_lock);
				mutex_exit(&pidlock);

				if (rctl_entity_obtain_rset(rde, rp) == rset) {
					ret |= RCT_SIGNAL;

					if (sqp == NULL)
						sigtoproc(rp, NULL,
						    recip_signal);
					else
						sigaddqa(rp, NULL, sqp);
				} else if (sqp) {
					/* Not delivered; release the siginfo. */
					kmem_free(sqp, sizeof (sigqueue_t));
				}
				mutex_exit(&rp->p_lock);
			} else {
				/*
				 * No process with that pid, or the pid now
				 * belongs to a different proc_t.
				 */
				mutex_exit(&pidlock);
				if (sqp)
					kmem_free(sqp, sizeof (sigqueue_t));
			}

			mutex_enter(&p->p_lock);
			/*
			 * Since we dropped p_lock, we may no longer be in the
			 * same task or project as we were at entry.  It is thus
			 * unsafe for us to reacquire the set lock at this
			 * point; callers of rctl_local_action() must handle
			 * this possibility.
			 */
			ret |= RCT_LK_ABANDONED;
		} else if (sqp) {
			/*
			 * Unobservable control with a foreign recipient:  no
			 * signal is sent, so release the unused siginfo.
			 */
			kmem_free(sqp, sizeof (sigqueue_t));
		}
	}

	/* Denial is only reported when the violator is also the recipient. */
	if ((flagaction & RCTL_LOCAL_DENY) &&
	    (recipient == NULL || recipient == p)) {
		ret |= RCT_DENY;
	}

	return (ret);
}
2654 
2655 /*
2656  * int rctl_action(rctl_hndl_t, rctl_set_t *, struct proc *, uint_t)
2657  *
2658  * Overview
2659  *   Take the action associated with the enforced value (as defined by
2660  *   rctl_get_enforced_value()) being exceeded or encountered.  Possibly perform
2661  *   a restricted subset of the available actions, if circumstances dictate that
2662  *   we cannot safely allocate memory (for a sigqueue_t) or guarantee process
2663  *   persistence across the duration of the function (an asynchronous action).
2664  *
2665  * Return values
2666  *   Actions taken, according to the rctl_test bitmask.
2667  *
2668  * Caller's context
2669  *   Safe to acquire rcs_lock.
2670  */
2671 int
2672 rctl_action(rctl_hndl_t hndl, rctl_set_t *rset, struct proc *p, uint_t safety)
2673 {
2674 	return (rctl_action_entity(hndl, rset, p, NULL, safety));
2675 }
2676 
2677 int
2678 rctl_action_entity(rctl_hndl_t hndl, rctl_set_t *rset, struct proc *p,
2679     rctl_entity_p_t *e, uint_t safety)
2680 {
2681 	int ret = RCT_NONE;
2682 	rctl_t *lrctl;
2683 	rctl_entity_p_t e_tmp;
2684 
2685 rctl_action_acquire:
2686 	mutex_enter(&rset->rcs_lock);
2687 	if (rctl_set_find(rset, hndl, &lrctl) == -1) {
2688 		mutex_exit(&rset->rcs_lock);
2689 		return (ret);
2690 	}
2691 
2692 	if (e == NULL) {
2693 		rctl_entity_obtain_entity_p(lrctl->rc_dict_entry->rcd_entity,
2694 		    p, &e_tmp);
2695 		e = &e_tmp;
2696 	}
2697 
2698 	if ((ret & RCT_LK_ABANDONED) == 0) {
2699 		ret |= rctl_global_action(lrctl, rset, p, lrctl->rc_cursor);
2700 
2701 		RCTLOP_ACTION(lrctl, p, e);
2702 
2703 		ret |= rctl_local_action(lrctl, rset, p,
2704 		    lrctl->rc_cursor, safety);
2705 
2706 		if (ret & RCT_LK_ABANDONED)
2707 			goto rctl_action_acquire;
2708 	}
2709 
2710 	ret &= ~RCT_LK_ABANDONED;
2711 
2712 	if (!(ret & RCT_DENY) &&
2713 	    lrctl->rc_cursor->rcv_next != NULL) {
2714 		lrctl->rc_cursor = lrctl->rc_cursor->rcv_next;
2715 
2716 		RCTLOP_SET(lrctl, p, e, rctl_model_value(lrctl->rc_dict_entry,
2717 		    p, lrctl->rc_cursor->rcv_value));
2718 
2719 	}
2720 	mutex_exit(&rset->rcs_lock);
2721 
2722 	return (ret);
2723 }
2724 
2725 /*
2726  * int rctl_test(rctl_hndl_t, rctl_set_t *, struct proc *, rctl_qty_t, uint_t)
2727  *
2728  * Overview
2729  *   Increment the resource associated with the given handle, returning zero if
2730  *   the incremented value does not exceed the threshold for the current limit
2731  *   on the resource.
2732  *
2733  * Return values
2734  *   Actions taken, according to the rctl_test bitmask.
2735  *
2736  * Caller's context
2737  *   p_lock held by caller.
2738  */
2739 /*ARGSUSED*/
2740 int
2741 rctl_test(rctl_hndl_t rhndl, rctl_set_t *rset, struct proc *p,
2742     rctl_qty_t incr, uint_t flags)
2743 {
2744 	return (rctl_test_entity(rhndl, rset, p, NULL, incr, flags));
2745 }
2746 
2747 int
2748 rctl_test_entity(rctl_hndl_t rhndl, rctl_set_t *rset, struct proc *p,
2749     rctl_entity_p_t *e, rctl_qty_t incr, uint_t flags)
2750 {
2751 	rctl_t *lrctl;
2752 	int ret = RCT_NONE;
2753 	rctl_entity_p_t e_tmp;
2754 	if (p == &p0) {
2755 		/*
2756 		 * We don't enforce rctls on the kernel itself.
2757 		 */
2758 		return (ret);
2759 	}
2760 
2761 rctl_test_acquire:
2762 	ASSERT(MUTEX_HELD(&p->p_lock));
2763 
2764 	mutex_enter(&rset->rcs_lock);
2765 
2766 	/*
2767 	 * Dereference from rctl_set.  We don't enforce newly loaded controls
2768 	 * that haven't been set on this entity (since the only valid value is
2769 	 * the infinite system value).
2770 	 */
2771 	if (rctl_set_find(rset, rhndl, &lrctl) == -1) {
2772 		mutex_exit(&rset->rcs_lock);
2773 		return (ret);
2774 	}
2775 
2776 	/*
2777 	 * This control is currently unenforced:  maximal value on control
2778 	 * supporting infinitely available resource.
2779 	 */
2780 	if ((lrctl->rc_dict_entry->rcd_flagaction & RCTL_GLOBAL_INFINITE) &&
2781 	    (lrctl->rc_cursor->rcv_flagaction & RCTL_LOCAL_MAXIMAL)) {
2782 
2783 		mutex_exit(&rset->rcs_lock);
2784 		return (ret);
2785 	}
2786 
2787 	/*
2788 	 * If we have been called by rctl_test, look up the entity pointer
2789 	 * from the proc pointer.
2790 	 */
2791 	if (e == NULL) {
2792 		rctl_entity_obtain_entity_p(lrctl->rc_dict_entry->rcd_entity,
2793 		    p, &e_tmp);
2794 		e = &e_tmp;
2795 	}
2796 
2797 	/*
2798 	 * Get enforced rctl value and current usage.  Test the increment
2799 	 * with the current usage against the enforced value--take action as
2800 	 * necessary.
2801 	 */
2802 	while (RCTLOP_TEST(lrctl, p, e, lrctl->rc_cursor, incr, flags)) {
2803 		if ((ret & RCT_LK_ABANDONED) == 0) {
2804 			ret |= rctl_global_action(lrctl, rset, p,
2805 			    lrctl->rc_cursor);
2806 
2807 			RCTLOP_ACTION(lrctl, p, e);
2808 
2809 			ret |= rctl_local_action(lrctl, rset, p,
2810 			    lrctl->rc_cursor, flags);
2811 
2812 			if (ret & RCT_LK_ABANDONED)
2813 				goto rctl_test_acquire;
2814 		}
2815 
2816 		ret &= ~RCT_LK_ABANDONED;
2817 
2818 		if ((ret & RCT_DENY) == RCT_DENY ||
2819 		    lrctl->rc_cursor->rcv_next == NULL) {
2820 			ret |= RCT_DENY;
2821 			break;
2822 		}
2823 
2824 		lrctl->rc_cursor = lrctl->rc_cursor->rcv_next;
2825 		RCTLOP_SET(lrctl, p, e, rctl_model_value(lrctl->rc_dict_entry,
2826 		    p, lrctl->rc_cursor->rcv_value));
2827 	}
2828 
2829 	mutex_exit(&rset->rcs_lock);
2830 
2831 	return (ret);
2832 }
2833 
2834 /*
2835  * void rctl_init(void)
2836  *
2837  * Overview
2838  *   Initialize the rctl subsystem, including the primoridal rctls
2839  *   provided by the system.  New subsystem-specific rctls should _not_ be
2840  *   initialized here.  (Do it in your own file.)
2841  *
2842  * Return values
2843  *   None.
2844  *
2845  * Caller's context
2846  *   Safe for KM_SLEEP allocations.  Must be called prior to any process model
2847  *   initialization.
2848  */
2849 void
2850 rctl_init(void)
2851 {
2852 	rctl_cache = kmem_cache_create("rctl_cache", sizeof (rctl_t),
2853 	    0, NULL, NULL, NULL, NULL, NULL, 0);
2854 	rctl_val_cache = kmem_cache_create("rctl_val_cache",
2855 	    sizeof (rctl_val_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
2856 
2857 	rctl_dict = mod_hash_create_extended("rctl_dict",
2858 	    rctl_dict_size, mod_hash_null_keydtor, rctl_dict_val_dtor,
2859 	    rctl_dict_hash_by_id, NULL, rctl_dict_id_cmp, KM_SLEEP);
2860 	rctl_dict_by_name = mod_hash_create_strhash(
2861 	    "rctl_handles_by_name", rctl_dict_size,
2862 	    mod_hash_null_valdtor);
2863 	rctl_ids = id_space_create("rctl_ids", 1, max_rctl_hndl);
2864 	bzero(rctl_lists, (RC_MAX_ENTITY + 1) * sizeof (rctl_dict_entry_t *));
2865 
2866 	rctlproc_init();
2867 }
2868 
2869 /*
2870  * rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
2871  *     int chargeproc)
2872  *
2873  * Increments the amount of locked memory on a project, and
2874  * zone. If proj is non-NULL the project must be held by the
2875  * caller; if it is NULL the proj and zone of proc_t p are used.
2876  * If chargeproc is non-zero, then the charged amount is cached
2877  * on p->p_locked_mem so that the charge can be migrated when a
2878  * process changes projects.
2879  *
2880  * Return values
2881  *    0 - success
2882  *    EAGAIN - attempting to increment locked memory is denied by one
2883  *      or more resource entities.
2884  */
2885 int
2886 rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
2887     int chargeproc)
2888 {
2889 	kproject_t *projp;
2890 	zone_t *zonep;
2891 	rctl_entity_p_t e;
2892 	int ret = 0;
2893 
2894 	ASSERT(p != NULL);
2895 	ASSERT(MUTEX_HELD(&p->p_lock));
2896 	if (proj != NULL) {
2897 		projp = proj;
2898 		zonep = proj->kpj_zone;
2899 	} else {
2900 		projp = p->p_task->tk_proj;
2901 		zonep = p->p_zone;
2902 	}
2903 
2904 	mutex_enter(&zonep->zone_mem_lock);
2905 
2906 	e.rcep_p.proj = projp;
2907 	e.rcep_t = RCENTITY_PROJECT;
2908 
2909 	/* check for overflow */
2910 	if ((projp->kpj_data.kpd_locked_mem + inc) <
2911 	    projp->kpj_data.kpd_locked_mem) {
2912 		ret = EAGAIN;
2913 		goto out;
2914 	}
2915 	if (projp->kpj_data.kpd_locked_mem + inc >
2916 	    projp->kpj_data.kpd_locked_mem_ctl) {
2917 		if (rctl_test_entity(rc_project_locked_mem, projp->kpj_rctls,
2918 		    p, &e, inc, 0) & RCT_DENY) {
2919 			ret = EAGAIN;
2920 			goto out;
2921 		}
2922 	}
2923 	e.rcep_p.zone = zonep;
2924 	e.rcep_t = RCENTITY_ZONE;
2925 
2926 	/* Check for overflow */
2927 	if ((zonep->zone_locked_mem + inc) < zonep->zone_locked_mem) {
2928 		ret = EAGAIN;
2929 		goto out;
2930 	}
2931 	if (zonep->zone_locked_mem + inc > zonep->zone_locked_mem_ctl) {
2932 		if (rctl_test_entity(rc_zone_locked_mem, zonep->zone_rctls,
2933 		    p, &e, inc, 0) & RCT_DENY) {
2934 			ret = EAGAIN;
2935 			goto out;
2936 		}
2937 	}
2938 
2939 	zonep->zone_locked_mem += inc;
2940 	projp->kpj_data.kpd_locked_mem += inc;
2941 	if (chargeproc != 0) {
2942 		p->p_locked_mem += inc;
2943 	}
2944 out:
2945 	mutex_exit(&zonep->zone_mem_lock);
2946 	return (ret);
2947 }
2948 
2949 /*
2950  * rctl_decr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
2951  *     int creditproc)
2952  *
2953  * Decrements the amount of locked memory on a project and
2954  * zone.  If proj is non-NULL the project must be held by the
2955  * caller; if it is NULL the proj and zone of proc_t p are used.
2956  * If creditproc is non-zero, then the quantity of locked memory
2957  * is subtracted from p->p_locked_mem.
2958  *
2959  * Return values
2960  *   none
2961  */
2962 void
2963 rctl_decr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
2964     int creditproc)
2965 {
2966 	kproject_t *projp;
2967 	zone_t *zonep;
2968 
2969 	if (proj != NULL) {
2970 		projp = proj;
2971 		zonep = proj->kpj_zone;
2972 	} else {
2973 		ASSERT(p != NULL);
2974 		ASSERT(MUTEX_HELD(&p->p_lock));
2975 		projp = p->p_task->tk_proj;
2976 		zonep = p->p_zone;
2977 	}
2978 
2979 	mutex_enter(&zonep->zone_mem_lock);
2980 	zonep->zone_locked_mem -= inc;
2981 	projp->kpj_data.kpd_locked_mem -= inc;
2982 	if (creditproc != 0) {
2983 		ASSERT(p != NULL);
2984 		ASSERT(MUTEX_HELD(&p->p_lock));
2985 		p->p_locked_mem -= inc;
2986 	}
2987 	mutex_exit(&zonep->zone_mem_lock);
2988 }
2989 
2990 /*
2991  * rctl_incr_swap(proc_t *, zone_t *, size_t)
2992  *
2993  * Overview
2994  *   Increments the swap charge on the specified zone.
2995  *
2996  * Return values
2997  *   0 on success.  EAGAIN if swap increment fails due an rctl value
2998  *   on the zone.
2999  *
3000  * Callers context
3001  *   p_lock held on specified proc.
3002  *   swap must be even multiple of PAGESIZE
3003  */
3004 int
3005 rctl_incr_swap(proc_t *proc, zone_t *zone, size_t swap)
3006 {
3007 	rctl_entity_p_t e;
3008 
3009 	ASSERT(MUTEX_HELD(&proc->p_lock));
3010 	ASSERT((swap & PAGEOFFSET) == 0);
3011 	e.rcep_p.zone = zone;
3012 	e.rcep_t = RCENTITY_ZONE;
3013 
3014 	mutex_enter(&zone->zone_mem_lock);
3015 
3016 	/* Check for overflow */
3017 	if ((zone->zone_max_swap + swap) < zone->zone_max_swap) {
3018 		mutex_exit(&zone->zone_mem_lock);
3019 		return (EAGAIN);
3020 	}
3021 	if ((zone->zone_max_swap + swap) >
3022 	    zone->zone_max_swap_ctl) {
3023 
3024 		if (rctl_test_entity(rc_zone_max_swap, zone->zone_rctls,
3025 		    proc, &e, swap, 0) & RCT_DENY) {
3026 			mutex_exit(&zone->zone_mem_lock);
3027 			return (EAGAIN);
3028 		}
3029 	}
3030 	zone->zone_max_swap += swap;
3031 	mutex_exit(&zone->zone_mem_lock);
3032 	return (0);
3033 }
3034 
3035 /*
3036  * rctl_decr_swap(zone_t *, size_t)
3037  *
3038  * Overview
3039  *   Decrements the swap charge on the specified zone.
3040  *
3041  * Return values
3042  *   None
3043  *
3044  * Callers context
3045  *   swap must be even multiple of PAGESIZE
3046  */
3047 void
3048 rctl_decr_swap(zone_t *zone, size_t swap)
3049 {
3050 	ASSERT((swap & PAGEOFFSET) == 0);
3051 	mutex_enter(&zone->zone_mem_lock);
3052 	ASSERT(zone->zone_max_swap >= swap);
3053 	zone->zone_max_swap -= swap;
3054 	mutex_exit(&zone->zone_mem_lock);
3055 }
3056 
3057 /*
3058  * rctl_incr_lofi(proc_t *, zone_t *, size_t)
3059  *
3060  * Overview
3061  *   Increments the number of lofi devices for the zone.
3062  *
3063  * Return values
3064  *   0 on success.  EAGAIN if increment fails due an rctl value
3065  *   on the zone.
3066  *
3067  * Callers context
3068  *   p_lock held on specified proc.
3069  */
3070 int
3071 rctl_incr_lofi(proc_t *proc, zone_t *zone, size_t incr)
3072 {
3073 	rctl_entity_p_t e;
3074 
3075 	ASSERT(MUTEX_HELD(&proc->p_lock));
3076 	ASSERT(incr > 0);
3077 
3078 	e.rcep_p.zone = zone;
3079 	e.rcep_t = RCENTITY_ZONE;
3080 
3081 	mutex_enter(&zone->zone_rctl_lock);
3082 
3083 	/* Check for overflow */
3084 	if ((zone->zone_max_lofi + incr) < zone->zone_max_lofi) {
3085 		mutex_exit(&zone->zone_rctl_lock);
3086 		return (EAGAIN);
3087 	}
3088 	if ((zone->zone_max_lofi + incr) > zone->zone_max_lofi_ctl) {
3089 		if (rctl_test_entity(rc_zone_max_lofi, zone->zone_rctls,
3090 		    proc, &e, incr, 0) & RCT_DENY) {
3091 			mutex_exit(&zone->zone_rctl_lock);
3092 			return (EAGAIN);
3093 		}
3094 	}
3095 	zone->zone_max_lofi += incr;
3096 	mutex_exit(&zone->zone_rctl_lock);
3097 	return (0);
3098 }
3099 
3100 /*
3101  * rctl_decr_lofi(zone_t *, size_t)
3102  *
3103  * Overview
3104  *   Decrements the number of lofi devices for the zone.
3105  */
3106 void
3107 rctl_decr_lofi(zone_t *zone, size_t decr)
3108 {
3109 	mutex_enter(&zone->zone_rctl_lock);
3110 	ASSERT(zone->zone_max_lofi >= decr);
3111 	zone->zone_max_lofi -= decr;
3112 	mutex_exit(&zone->zone_rctl_lock);
3113 }
3114 
3115 /*
3116  * Create resource kstat
3117  */
3118 static kstat_t *
3119 rctl_kstat_create_common(char *ks_name, int ks_instance, char *ks_class,
3120     uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags, int ks_zoneid)
3121 {
3122 	kstat_t *ksp = NULL;
3123 	char name[KSTAT_STRLEN];
3124 
3125 	(void) snprintf(name, KSTAT_STRLEN, "%s_%d", ks_name, ks_instance);
3126 
3127 	if ((ksp = kstat_create_zone("caps", ks_zoneid,
3128 	    name, ks_class, ks_type,
3129 	    ks_ndata, ks_flags, ks_zoneid)) != NULL) {
3130 		if (ks_zoneid != GLOBAL_ZONEID)
3131 			kstat_zone_add(ksp, GLOBAL_ZONEID);
3132 	}
3133 	return (ksp);
3134 }
3135 
3136 /*
3137  * Create zone-specific resource kstat
3138  */
3139 kstat_t *
3140 rctl_kstat_create_zone(zone_t *zone, char *ks_name, uchar_t ks_type,
3141     uint_t ks_ndata, uchar_t ks_flags)
3142 {
3143 	char name[KSTAT_STRLEN];
3144 
3145 	(void) snprintf(name, KSTAT_STRLEN, "%s_zone", ks_name);
3146 
3147 	return (rctl_kstat_create_common(name, zone->zone_id, "zone_caps",
3148 	    ks_type, ks_ndata, ks_flags, zone->zone_id));
3149 }
3150 
3151 /*
3152  * Create project-specific resource kstat
3153  */
3154 kstat_t *
3155 rctl_kstat_create_project(kproject_t *kpj, char *ks_name, uchar_t ks_type,
3156     uint_t ks_ndata, uchar_t ks_flags)
3157 {
3158 	char name[KSTAT_STRLEN];
3159 
3160 	(void) snprintf(name, KSTAT_STRLEN, "%s_project", ks_name);
3161 
3162 	return (rctl_kstat_create_common(name, kpj->kpj_id, "project_caps",
3163 	    ks_type, ks_ndata, ks_flags, kpj->kpj_zoneid));
3164 }
3165 
3166 /*
3167  * Create task-specific resource kstat
3168  */
3169 kstat_t *
3170 rctl_kstat_create_task(task_t *tk, char *ks_name, uchar_t ks_type,
3171     uint_t ks_ndata, uchar_t ks_flags)
3172 {
3173 	char name[KSTAT_STRLEN];
3174 
3175 	(void) snprintf(name, KSTAT_STRLEN, "%s_task", ks_name);
3176 
3177 	return (rctl_kstat_create_common(name, tk->tk_tkid, "task_caps",
3178 	    ks_type, ks_ndata, ks_flags, tk->tk_proj->kpj_zoneid));
3179 }
3180