xref: /illumos-gate/usr/src/uts/common/os/rctl.c (revision 71e32251703c729dbbebef2101770135584fd8d4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/atomic.h>
29 #include <sys/cmn_err.h>
30 #include <sys/id_space.h>
31 #include <sys/kmem.h>
32 #include <sys/kstat.h>
33 #include <sys/log.h>
34 #include <sys/modctl.h>
35 #include <sys/modhash.h>
36 #include <sys/mutex.h>
37 #include <sys/proc.h>
38 #include <sys/procset.h>
39 #include <sys/project.h>
40 #include <sys/resource.h>
41 #include <sys/rctl.h>
42 #include <sys/siginfo.h>
43 #include <sys/strlog.h>
44 #include <sys/systm.h>
45 #include <sys/task.h>
46 #include <sys/types.h>
47 #include <sys/policy.h>
48 #include <sys/zone.h>
49 
50 /*
51  * Resource controls (rctls)
52  *
53  *   The rctl subsystem provides a mechanism for kernel components to
54  *   register their individual resource controls with the system as a whole,
55  *   such that those controls can subscribe to specific actions while being
56  *   associated with the various process-model entities provided by the kernel:
57  *   the process, the task, the project, and the zone.  (In principle, only
58  *   minor modifications would be required to connect the resource control
59  *   functionality to non-process-model entities associated with the system.)
60  *
61  *   Subsystems register their rctls via rctl_register().  Subsystems
62  *   also wishing to provide additional limits on a given rctl can modify
63  *   them once they have the rctl handle.  Each subsystem should store the
64  *   handle to their rctl for direct access.
65  *
66  *   A primary dictionary, rctl_dict, contains a hash of id to the default
67  *   control definition for each controlled resource-entity pair on the system.
68  *   A secondary dictionary, rctl_dict_by_name, contains a hash of name to
69  *   resource control handles.  The resource control handles are distributed by
70  *   the rctl_ids ID space.  The handles are private and not to be
71  *   advertised to userland; all userland interactions are via the rctl
72  *   names.
73  *
74  *   Entities inherit their rctls from their predecessor.  Since projects have
75  *   no ancestor, they inherit their rctls from the rctl dict for project
76  *   rctls.  It is expected that project controls will be set to their
77  *   appropriate values shortly after project creation, presumably from a
78  *   policy source such as the project database.
79  *
80  * Data structures
81  *   The rctl_set_t attached to each of the process model entities is a simple
82  *   hash table keyed on the rctl handle assigned at registration.  The entries
83  *   in the hash table are rctl_t's, whose relationship with the active control
84  *   values on that resource and with the global state of the resource we
85  *   illustrate below:
86  *
87  *   rctl_dict[key] --> rctl_dict_entry
88  *			   ^
89  *			   |
90  *			+--+---+
91  *   rctl_set[key] ---> | rctl | --> value <-> value <-> system value --> NULL
92  *			+--+---+		 ^
93  *			   |			 |
94  *			   +------- cursor ------+
95  *
96  *   That is, the rctl contains a back pointer to the global resource control
97  *   state for this resource, which is also available in the rctl_dict hash
98  *   table mentioned earlier.  The rctl contains two pointers to resource
99  *   control values:  one, values, indicates the entire sequence of control
100  *   values; the other, cursor, indicates the currently active control
101  *   value--the next value to be enforced.  The value list itself is an open,
102  *   doubly-linked list, the last non-NULL member of which is the system value
103  *   for that resource (being the theoretical/conventional maximum allowable
104  *   value for the resource on this OS instance).
105  *
106  * Ops Vector
107  *   Subsystems publishing rctls need not provide instances of all of the
108  *   functions specified by the ops vector.  In particular, if general
109  *   rctl_*() entry points are not being called, certain functions can be
110  *   omitted.  These align as follows:
111  *
112  *   rctl_set()
113  *     You may wish to provide a set callback if locking circumstances prevent
114  *     it or if the performance cost of requesting the enforced value from the
115  *     resource control is prohibitively expensive.  For instance, the currently
116  *     enforced file size limit is stored on the process in the p_fsz_ctl to
117  *     maintain read()/write() performance.
118  *
119  *   rctl_test()
120  *     You must provide a test callback if you are using the rctl_test()
121  *     interface.  An action callback is optional.
122  *
123  *   rctl_action()
124  *     You may wish to provide an action callback.
125  *
126  * Registration
127  *   New resource controls can be added to a running instance by loaded modules
128  *   via registration.  (The current implementation does not support unloadable
129  *   modules; this functionality can be added if needed, via an
130  *   activation/deactivation interface involving the manipulation of the
131  *   ops vector for the resource control(s) needing to support unloading.)
132  *
133  * Control value ordering
134  *   Because the rctl_val chain on each rctl must be navigable in a
135  *   deterministic way, we have to define an ordering on the rctl_val_t's.  The
136  *   defined order is (flags & [maximal], value, flags & [deny-action],
137  *   privilege).
138  *
139  * Locking
140  *   rctl_dict_lock must be acquired prior to rctl_lists_lock.  Since
141  *   rctl_dict_lock or rctl_lists_lock can be taken at the enforcement point
142  *   of any subsystem, holding subsystem locks, it is at all times inappropriate
143  *   to call kmem_alloc(., KM_SLEEP) while holding either of these locks.
144  *   Traversing any of the various resource control entity lists requires
145  *   holding rctl_lists_lock.
146  *
147  *   Each individual resource control set associated with an entity must have
148  *   its rcs_lock held for the duration of any operations that would add
149  *   resource controls or control values to the set.
150  *
151  *   The locking subsequence of interest is: p_lock, rctl_dict_lock,
152  *   rctl_lists_lock, entity->rcs_lock.
153  *
154  * The projects(4) database and project entity resource controls
155  *   A special case is made for RCENTITY_PROJECT values set through the
156  *   setproject(3PROJECT) interface.  setproject() makes use of a private
157  *   interface, setprojrctl(), which passes through an array of resource control
158  *   blocks that need to be set while holding the entity->rcs_lock.  This
159  *   ensures that the act of modifying a project's resource controls is
160  *   "atomic" within the kernel.
161  *
162  *   Within the rctl sub-system, we provide two interfaces that are only used by
163  *   the setprojrctl() code path - rctl_local_insert_all() and
164  *   rctl_local_replace_all().  rctl_local_insert_all() will ensure that the
165  *   resource values specified in *new_values are applied.
166  *   rctl_local_replace_all() will purge the current rctl->rc_projdb and
167  *   rctl->rc_values entries, and apply the *new_values.
168  *
169  *   These functions modify not only the linked list of active resource controls
170  *   (rctl->rc_values), but also a "cached" linked list (rctl->rc_projdb) of
171  *   values set through these interfaces.  To clarify:
172  *
173  *      rctl->rc_values - a linked list of rctl_val_t.  These are the active
174  *      resource values associated with this rctl, and may have been set by
175  *      setrctl() - via prctl(1M), or by setprojrctl() - via
176  *      setproject(3PROJECT).
177  *
178  *      rctl->rc_projdb - a linked list of rctl_val_t.  These reflect the
179  *      resource values set by the setprojrctl() code path.  rc_projdb is not
180  *      referenced by any other component of the rctl sub-system.
181  *
182  *   As various locks are held when calling these functions, we ensure that all
183  *   the possible memory allocations are performed prior to calling the
184  *   function.  *alloc_values is a linked list of uninitialized rctl_val_t,
185  *   which may be used to duplicate a new resource control value (passed in as
186  *   one of the members of the *new_values linked list), in order to populate
187  *   rctl->rc_values.
188  */
189 
190 id_t max_rctl_hndl = 32768;
191 int rctl_dict_size = 64;
192 int rctl_set_size = 8;
193 kmutex_t rctl_dict_lock;
194 mod_hash_t *rctl_dict;
195 mod_hash_t *rctl_dict_by_name;
196 id_space_t *rctl_ids;
197 kmem_cache_t *rctl_cache;	/* kmem cache for rctl structures */
198 kmem_cache_t *rctl_val_cache;	/* kmem cache for rctl values */
199 
200 kmutex_t rctl_lists_lock;
201 rctl_dict_entry_t *rctl_lists[RC_MAX_ENTITY + 1];
202 
203 /*
204  * Default resource control operations and ops vector
205  *   To be used if the particular rcontrol has no specific actions defined, or
206  *   if the subsystem providing the control is quiescing (in preparation for
207  *   unloading, presumably.)
208  *
209  *   Resource controls with callbacks should fill the unused operations with the
210  *   appropriate default impotent callback.
211  */
212 /*ARGSUSED*/
213 void
214 rcop_no_action(struct rctl *r, struct proc *p, rctl_entity_p_t *e)
215 {
216 }
217 
218 /*ARGSUSED*/
219 rctl_qty_t
220 rcop_no_usage(struct rctl *r, struct proc *p)
221 {
222 	return (0);
223 }
224 
225 /*ARGSUSED*/
226 int
227 rcop_no_set(struct rctl *r, struct proc *p, rctl_entity_p_t *e, rctl_qty_t l)
228 {
229 	return (0);
230 }
231 
232 /*ARGSUSED*/
233 int
234 rcop_no_test(struct rctl *r, struct proc *p, rctl_entity_p_t *e,
235     struct rctl_val *rv, rctl_qty_t i, uint_t f)
236 {
237 	return (0);
238 }
239 
240 rctl_ops_t rctl_default_ops = {
241 	rcop_no_action,
242 	rcop_no_usage,
243 	rcop_no_set,
244 	rcop_no_test
245 };
246 
247 /*
248  * Default "absolute" resource control operation and ops vector
249  *   Useful if there is no usage associated with the
250  *   resource control.
251  */
252 /*ARGSUSED*/
253 int
254 rcop_absolute_test(struct rctl *r, struct proc *p, rctl_entity_p_t *e,
255     struct rctl_val *rv, rctl_qty_t i, uint_t f)
256 {
257 	return (i > rv->rcv_value);
258 }
259 
260 rctl_ops_t rctl_absolute_ops = {
261 	rcop_no_action,
262 	rcop_no_usage,
263 	rcop_no_set,
264 	rcop_absolute_test
265 };
266 
267 /*ARGSUSED*/
268 static uint_t
269 rctl_dict_hash_by_id(void *hash_data, mod_hash_key_t key)
270 {
271 	return ((uint_t)(uintptr_t)key % rctl_dict_size);
272 }
273 
274 static int
275 rctl_dict_id_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
276 {
277 	uint_t u1 = (uint_t)(uintptr_t)key1;
278 	uint_t u2 = (uint_t)(uintptr_t)key2;
279 
280 	if (u1 > u2)
281 		return (1);
282 
283 	if (u1 == u2)
284 		return (0);
285 
286 	return (-1);
287 }
288 
289 static void
290 rctl_dict_val_dtor(mod_hash_val_t val)
291 {
292 	rctl_dict_entry_t *kr = (rctl_dict_entry_t *)val;
293 
294 	kmem_free(kr, sizeof (rctl_dict_entry_t));
295 }
296 
297 /*
298  * size_t rctl_build_name_buf()
299  *
300  * Overview
301  *   rctl_build_name_buf() walks all active resource controls in the dictionary,
302  *   building a buffer of continguous NUL-terminated strings.
303  *
304  * Return values
305  *   The size of the buffer is returned, the passed pointer's contents are
306  *   modified to that of the location of the buffer.
307  *
308  * Caller's context
309  *   Caller must be in a context suitable for KM_SLEEP allocations.
310  */
311 size_t
312 rctl_build_name_buf(char **rbufp)
313 {
314 	size_t req_size, cpy_size;
315 	char *rbufloc;
316 	int i;
317 
318 rctl_rebuild_name_buf:
319 	req_size = cpy_size = 0;
320 
321 	/*
322 	 * Calculate needed buffer length.
323 	 */
324 	mutex_enter(&rctl_lists_lock);
325 	for (i = 0; i < RC_MAX_ENTITY + 1; i++) {
326 		rctl_dict_entry_t *rde;
327 
328 		for (rde = rctl_lists[i];
329 		    rde != NULL;
330 		    rde = rde->rcd_next)
331 			req_size += strlen(rde->rcd_name) + 1;
332 	}
333 	mutex_exit(&rctl_lists_lock);
334 
335 	rbufloc = *rbufp = kmem_alloc(req_size, KM_SLEEP);
336 
337 	/*
338 	 * Copy rctl names into our buffer.  If the copy length exceeds the
339 	 * allocate length (due to registration changes), stop copying, free the
340 	 * buffer, and start again.
341 	 */
342 	mutex_enter(&rctl_lists_lock);
343 	for (i = 0; i < RC_MAX_ENTITY + 1; i++) {
344 		rctl_dict_entry_t *rde;
345 
346 		for (rde = rctl_lists[i];
347 		    rde != NULL;
348 		    rde = rde->rcd_next) {
349 			size_t length = strlen(rde->rcd_name) + 1;
350 
351 			cpy_size += length;
352 
353 			if (cpy_size > req_size) {
354 				kmem_free(*rbufp, req_size);
355 				mutex_exit(&rctl_lists_lock);
356 				goto rctl_rebuild_name_buf;
357 			}
358 
359 			bcopy(rde->rcd_name, rbufloc, length);
360 			rbufloc += length;
361 		}
362 	}
363 	mutex_exit(&rctl_lists_lock);
364 
365 	return (req_size);
366 }
367 
368 /*
369  * rctl_dict_entry_t *rctl_dict_lookup(const char *)
370  *
371  * Overview
372  *   rctl_dict_lookup() returns the resource control dictionary entry for the
373  *   named resource control.
374  *
375  * Return values
376  *   A pointer to the appropriate resource control dictionary entry, or NULL if
377  *   no such named entry exists.
378  *
379  * Caller's context
380  *   Caller must not be holding rctl_dict_lock.
381  */
382 rctl_dict_entry_t *
383 rctl_dict_lookup(const char *name)
384 {
385 	rctl_dict_entry_t *rde;
386 
387 	mutex_enter(&rctl_dict_lock);
388 
389 	if (mod_hash_find(rctl_dict_by_name, (mod_hash_key_t)name,
390 	    (mod_hash_val_t *)&rde) == MH_ERR_NOTFOUND) {
391 		mutex_exit(&rctl_dict_lock);
392 		return (NULL);
393 	}
394 
395 	mutex_exit(&rctl_dict_lock);
396 
397 	return (rde);
398 }
399 
400 /*
401  * rctl_hndl_t rctl_hndl_lookup(const char *)
402  *
403  * Overview
404  *   rctl_hndl_lookup() returns the resource control id (the "handle") for the
405  *   named resource control.
406  *
407  * Return values
408  *   The appropriate id, or -1 if no such named entry exists.
409  *
410  * Caller's context
411  *   Caller must not be holding rctl_dict_lock.
412  */
413 rctl_hndl_t
414 rctl_hndl_lookup(const char *name)
415 {
416 	rctl_dict_entry_t *rde;
417 
418 	if ((rde = rctl_dict_lookup(name)) == NULL)
419 		return (-1);
420 
421 	return (rde->rcd_id);
422 }
423 
424 /*
425  * rctl_dict_entry_t * rctl_dict_lookup_hndl(rctl_hndl_t)
426  *
427  * Overview
428  *   rctl_dict_lookup_hndl() completes the public lookup functions, by returning
429  *   the resource control dictionary entry matching a given resource control id.
430  *
431  * Return values
432  *   A pointer to the matching resource control dictionary entry, or NULL if the
433  *   id does not match any existing entries.
434  *
435  * Caller's context
436  *   Caller must not be holding rctl_lists_lock.
437  */
438 rctl_dict_entry_t *
439 rctl_dict_lookup_hndl(rctl_hndl_t hndl)
440 {
441 	uint_t i;
442 
443 	mutex_enter(&rctl_lists_lock);
444 	for (i = 0; i < RC_MAX_ENTITY + 1; i++) {
445 		rctl_dict_entry_t *rde;
446 
447 		for (rde = rctl_lists[i];
448 		    rde != NULL;
449 		    rde = rde->rcd_next)
450 			if (rde->rcd_id == hndl) {
451 				mutex_exit(&rctl_lists_lock);
452 				return (rde);
453 			}
454 	}
455 	mutex_exit(&rctl_lists_lock);
456 
457 	return (NULL);
458 }
459 
460 /*
461  * void rctl_add_default_limit(const char *name, rctl_qty_t value,
462  *     rctl_priv_t privilege, uint_t action)
463  *
464  * Overview
465  *   Create a default limit with specified value, privilege, and action.
466  *
467  * Return value
468  *   No value returned.
469  */
470 void
471 rctl_add_default_limit(const char *name, rctl_qty_t value,
472     rctl_priv_t privilege, uint_t action)
473 {
474 	rctl_val_t *dval;
475 	rctl_dict_entry_t *rde;
476 
477 	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
478 	bzero(dval, sizeof (rctl_val_t));
479 	dval->rcv_value = value;
480 	dval->rcv_privilege = privilege;
481 	dval->rcv_flagaction = action;
482 	dval->rcv_action_recip_pid = -1;
483 
484 	rde = rctl_dict_lookup(name);
485 	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);
486 }
487 
488 /*
489  * void rctl_add_legacy_limit(const char *name, const char *mname,
490  *     const char *lname, rctl_qty_t dflt)
491  *
492  * Overview
493  *   Create a default privileged limit, using the value obtained from
494  *   /etc/system if it exists and is greater than the specified default
495  *   value.  Exists primarily for System V IPC.
496  *
497  * Return value
498  *   No value returned.
499  */
500 void
501 rctl_add_legacy_limit(const char *name, const char *mname, const char *lname,
502     rctl_qty_t dflt, rctl_qty_t max)
503 {
504 	rctl_qty_t qty;
505 
506 	if (!mod_sysvar(mname, lname, &qty) || (qty < dflt))
507 		qty = dflt;
508 
509 	if (qty > max)
510 		qty = max;
511 
512 	rctl_add_default_limit(name, qty, RCPRIV_PRIVILEGED, RCTL_LOCAL_DENY);
513 }
514 
515 static rctl_set_t *
516 rctl_entity_obtain_rset(rctl_dict_entry_t *rcd, struct proc *p)
517 {
518 	rctl_set_t *rset = NULL;
519 
520 	if (rcd == NULL)
521 		return (NULL);
522 
523 	switch (rcd->rcd_entity) {
524 	case RCENTITY_PROCESS:
525 		rset = p->p_rctls;
526 		break;
527 	case RCENTITY_TASK:
528 		ASSERT(MUTEX_HELD(&p->p_lock));
529 		if (p->p_task != NULL)
530 			rset = p->p_task->tk_rctls;
531 		break;
532 	case RCENTITY_PROJECT:
533 		ASSERT(MUTEX_HELD(&p->p_lock));
534 		if (p->p_task != NULL &&
535 		    p->p_task->tk_proj != NULL)
536 			rset = p->p_task->tk_proj->kpj_rctls;
537 		break;
538 	case RCENTITY_ZONE:
539 		ASSERT(MUTEX_HELD(&p->p_lock));
540 		if (p->p_zone != NULL)
541 			rset = p->p_zone->zone_rctls;
542 		break;
543 	default:
544 		panic("unknown rctl entity type %d seen", rcd->rcd_entity);
545 		break;
546 	}
547 
548 	return (rset);
549 }
550 
551 static void
552 rctl_entity_obtain_entity_p(rctl_entity_t entity, struct proc *p,
553     rctl_entity_p_t *e)
554 {
555 	e->rcep_p.proc = NULL;
556 	e->rcep_t = entity;
557 
558 	switch (entity) {
559 	case RCENTITY_PROCESS:
560 		e->rcep_p.proc = p;
561 		break;
562 	case RCENTITY_TASK:
563 		ASSERT(MUTEX_HELD(&p->p_lock));
564 		if (p->p_task != NULL)
565 			e->rcep_p.task = p->p_task;
566 		break;
567 	case RCENTITY_PROJECT:
568 		ASSERT(MUTEX_HELD(&p->p_lock));
569 		if (p->p_task != NULL &&
570 		    p->p_task->tk_proj != NULL)
571 			e->rcep_p.proj = p->p_task->tk_proj;
572 		break;
573 	case RCENTITY_ZONE:
574 		ASSERT(MUTEX_HELD(&p->p_lock));
575 		if (p->p_zone != NULL)
576 			e->rcep_p.zone = p->p_zone;
577 		break;
578 	default:
579 		panic("unknown rctl entity type %d seen", entity);
580 		break;
581 	}
582 }
583 
584 static void
585 rctl_gp_alloc(rctl_alloc_gp_t *rcgp)
586 {
587 	uint_t i;
588 
589 	if (rcgp->rcag_nctls > 0) {
590 		rctl_t *prev = kmem_cache_alloc(rctl_cache, KM_SLEEP);
591 		rctl_t *rctl = prev;
592 
593 		rcgp->rcag_ctls = prev;
594 
595 		for (i = 1; i < rcgp->rcag_nctls; i++) {
596 			rctl = kmem_cache_alloc(rctl_cache, KM_SLEEP);
597 			prev->rc_next = rctl;
598 			prev = rctl;
599 		}
600 
601 		rctl->rc_next = NULL;
602 	}
603 
604 	if (rcgp->rcag_nvals > 0) {
605 		rctl_val_t *prev = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
606 		rctl_val_t *rval = prev;
607 
608 		rcgp->rcag_vals = prev;
609 
610 		for (i = 1; i < rcgp->rcag_nvals; i++) {
611 			rval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
612 			prev->rcv_next = rval;
613 			prev = rval;
614 		}
615 
616 		rval->rcv_next = NULL;
617 	}
618 
619 }
620 
621 static rctl_val_t *
622 rctl_gp_detach_val(rctl_alloc_gp_t *rcgp)
623 {
624 	rctl_val_t *rval = rcgp->rcag_vals;
625 
626 	ASSERT(rcgp->rcag_nvals > 0);
627 	rcgp->rcag_nvals--;
628 	rcgp->rcag_vals = rval->rcv_next;
629 
630 	rval->rcv_next = NULL;
631 
632 	return (rval);
633 }
634 
635 static rctl_t *
636 rctl_gp_detach_ctl(rctl_alloc_gp_t *rcgp)
637 {
638 	rctl_t *rctl = rcgp->rcag_ctls;
639 
640 	ASSERT(rcgp->rcag_nctls > 0);
641 	rcgp->rcag_nctls--;
642 	rcgp->rcag_ctls = rctl->rc_next;
643 
644 	rctl->rc_next = NULL;
645 
646 	return (rctl);
647 
648 }
649 
650 static void
651 rctl_gp_free(rctl_alloc_gp_t *rcgp)
652 {
653 	rctl_val_t *rval = rcgp->rcag_vals;
654 	rctl_t *rctl = rcgp->rcag_ctls;
655 
656 	while (rval != NULL) {
657 		rctl_val_t *next = rval->rcv_next;
658 
659 		kmem_cache_free(rctl_val_cache, rval);
660 		rval = next;
661 	}
662 
663 	while (rctl != NULL) {
664 		rctl_t *next = rctl->rc_next;
665 
666 		kmem_cache_free(rctl_cache, rctl);
667 		rctl = next;
668 	}
669 }
670 
671 /*
672  * void rctl_prealloc_destroy(rctl_alloc_gp_t *)
673  *
674  * Overview
675  *   Release all unused memory allocated via one of the "prealloc" functions:
676  *   rctl_set_init_prealloc, rctl_set_dup_prealloc, or rctl_rlimit_set_prealloc.
677  *
678  * Return values
679  *   None.
680  *
681  * Caller's context
682  *   No restrictions on context.
683  */
684 void
685 rctl_prealloc_destroy(rctl_alloc_gp_t *gp)
686 {
687 	rctl_gp_free(gp);
688 	kmem_free(gp, sizeof (rctl_alloc_gp_t));
689 }
690 
691 /*
692  * int rctl_val_cmp(rctl_val_t *, rctl_val_t *, int)
693  *
694  * Overview
695  *   This function defines an ordering to rctl_val_t's in order to allow
696  *   for correct placement in value lists. When the imprecise flag is set,
697  *   the action recipient is ignored. This is to facilitate insert,
698  *   delete, and replace operations by rctlsys.
699  *
700  * Return values
701  *   0 if the val_t's are are considered identical
702  *   -1 if a is ordered lower than b
703  *   1 if a is lowered higher than b
704  *
705  * Caller's context
706  *   No restrictions on context.
707  */
708 int
709 rctl_val_cmp(rctl_val_t *a, rctl_val_t *b, int imprecise)
710 {
711 	if ((a->rcv_flagaction & RCTL_LOCAL_MAXIMAL) <
712 	    (b->rcv_flagaction & RCTL_LOCAL_MAXIMAL))
713 		return (-1);
714 
715 	if ((a->rcv_flagaction & RCTL_LOCAL_MAXIMAL) >
716 	    (b->rcv_flagaction & RCTL_LOCAL_MAXIMAL))
717 		return (1);
718 
719 	if (a->rcv_value < b->rcv_value)
720 		return (-1);
721 
722 	if (a->rcv_value > b->rcv_value)
723 		return (1);
724 
725 	if ((a->rcv_flagaction & RCTL_LOCAL_DENY) <
726 	    (b->rcv_flagaction & RCTL_LOCAL_DENY))
727 		return (-1);
728 
729 	if ((a->rcv_flagaction & RCTL_LOCAL_DENY) >
730 	    (b->rcv_flagaction & RCTL_LOCAL_DENY))
731 		return (1);
732 
733 	if (a->rcv_privilege < b->rcv_privilege)
734 		return (-1);
735 
736 	if (a->rcv_privilege > b->rcv_privilege)
737 		return (1);
738 
739 	if (imprecise)
740 		return (0);
741 
742 	if (a->rcv_action_recip_pid < b->rcv_action_recip_pid)
743 		return (-1);
744 
745 	if (a->rcv_action_recip_pid > b->rcv_action_recip_pid)
746 		return (1);
747 
748 	return (0);
749 }
750 
751 static rctl_val_t *
752 rctl_val_list_find(rctl_val_t **head, rctl_val_t *cval)
753 {
754 	rctl_val_t *rval = *head;
755 
756 	while (rval != NULL) {
757 		if (rctl_val_cmp(cval, rval, 0) == 0)
758 			return (rval);
759 
760 		rval = rval->rcv_next;
761 	}
762 
763 	return (NULL);
764 
765 }
766 
767 /*
768  * int rctl_val_list_insert(rctl_val_t **, rctl_val_t *)
769  *
770  * Overview
771  *   This function inserts the rctl_val_t into the value list provided.
772  *   The insert is always successful unless if the value is a duplicate
773  *   of one already in the list.
774  *
775  * Return values
776  *    1 if the value was a duplicate of an existing value in the list.
777  *    0 if the insert was successful.
778  */
779 int
780 rctl_val_list_insert(rctl_val_t **root, rctl_val_t *rval)
781 {
782 	rctl_val_t *prev;
783 	int equiv;
784 
785 	rval->rcv_next = NULL;
786 	rval->rcv_prev = NULL;
787 
788 	if (*root == NULL) {
789 		*root = rval;
790 		return (0);
791 	}
792 
793 	equiv = rctl_val_cmp(rval, *root, 0);
794 
795 	if (equiv == 0)
796 		return (1);
797 
798 	if (equiv < 0) {
799 		rval->rcv_next = *root;
800 		rval->rcv_next->rcv_prev = rval;
801 		*root = rval;
802 
803 		return (0);
804 	}
805 
806 	prev = *root;
807 	while (prev->rcv_next != NULL &&
808 	    (equiv = rctl_val_cmp(rval, prev->rcv_next, 0)) > 0) {
809 		prev = prev->rcv_next;
810 	}
811 
812 	if (equiv == 0)
813 		return (1);
814 
815 	rval->rcv_next = prev->rcv_next;
816 	if (rval->rcv_next != NULL)
817 		rval->rcv_next->rcv_prev = rval;
818 	prev->rcv_next = rval;
819 	rval->rcv_prev = prev;
820 
821 	return (0);
822 }
823 
824 static int
825 rctl_val_list_delete(rctl_val_t **root, rctl_val_t *rval)
826 {
827 	rctl_val_t *prev;
828 
829 	if (*root == NULL)
830 		return (-1);
831 
832 	prev = *root;
833 	if (rctl_val_cmp(rval, prev, 0) == 0) {
834 		*root = prev->rcv_next;
835 		if (*root != NULL)
836 			(*root)->rcv_prev = NULL;
837 
838 		kmem_cache_free(rctl_val_cache, prev);
839 
840 		return (0);
841 	}
842 
843 	while (prev->rcv_next != NULL &&
844 	    rctl_val_cmp(rval, prev->rcv_next, 0) != 0) {
845 		prev = prev->rcv_next;
846 	}
847 
848 	if (prev->rcv_next == NULL) {
849 		/*
850 		 * If we navigate the entire list and cannot find a match, then
851 		 * return failure.
852 		 */
853 		return (-1);
854 	}
855 
856 	prev = prev->rcv_next;
857 	prev->rcv_prev->rcv_next = prev->rcv_next;
858 	if (prev->rcv_next != NULL)
859 		prev->rcv_next->rcv_prev = prev->rcv_prev;
860 
861 	kmem_cache_free(rctl_val_cache, prev);
862 
863 	return (0);
864 }
865 
866 static rctl_val_t *
867 rctl_val_list_dup(rctl_val_t *rval, rctl_alloc_gp_t *ragp, struct proc *oldp,
868     struct proc *newp)
869 {
870 	rctl_val_t *head = NULL;
871 
872 	for (; rval != NULL; rval = rval->rcv_next) {
873 		rctl_val_t *dval = rctl_gp_detach_val(ragp);
874 
875 		bcopy(rval, dval, sizeof (rctl_val_t));
876 		dval->rcv_prev = dval->rcv_next = NULL;
877 
878 		if (oldp == NULL ||
879 		    rval->rcv_action_recipient == NULL ||
880 		    rval->rcv_action_recipient == oldp) {
881 			if (rval->rcv_privilege == RCPRIV_BASIC) {
882 				dval->rcv_action_recipient = newp;
883 				dval->rcv_action_recip_pid = newp->p_pid;
884 			} else {
885 				dval->rcv_action_recipient = NULL;
886 				dval->rcv_action_recip_pid = -1;
887 			}
888 
889 			(void) rctl_val_list_insert(&head, dval);
890 		} else {
891 			kmem_cache_free(rctl_val_cache, dval);
892 		}
893 	}
894 
895 	return (head);
896 }
897 
898 static void
899 rctl_val_list_reset(rctl_val_t *rval)
900 {
901 	for (; rval != NULL; rval = rval->rcv_next)
902 		rval->rcv_firing_time = 0;
903 }
904 
905 static uint_t
906 rctl_val_list_count(rctl_val_t *rval)
907 {
908 	uint_t n = 0;
909 
910 	for (; rval != NULL; rval = rval->rcv_next)
911 		n++;
912 
913 	return (n);
914 }
915 
916 
917 static void
918 rctl_val_list_free(rctl_val_t *rval)
919 {
920 	while (rval != NULL) {
921 		rctl_val_t *next = rval->rcv_next;
922 
923 		kmem_cache_free(rctl_val_cache, rval);
924 
925 		rval = next;
926 	}
927 }
928 
929 /*
930  * rctl_qty_t rctl_model_maximum(rctl_dict_entry_t *, struct proc *)
931  *
932  * Overview
933  *   In cases where the operating system supports more than one process
934  *   addressing model, the operating system capabilities will exceed those of
935  *   one or more of these models.  Processes in a less capable model must have
936  *   their resources accurately controlled, without diluting those of their
937  *   descendants reached via exec().  rctl_model_maximum() returns the governing
938  *   value for the specified process with respect to a resource control, such
939  *   that the value can used for the RCTLOP_SET callback or compatability
940  *   support.
941  *
942  * Return values
943  *   The maximum value for the given process for the specified resource control.
944  *
945  * Caller's context
946  *   No restrictions on context.
947  */
948 rctl_qty_t
949 rctl_model_maximum(rctl_dict_entry_t *rde, struct proc *p)
950 {
951 	if (p->p_model == DATAMODEL_NATIVE)
952 		return (rde->rcd_max_native);
953 
954 	return (rde->rcd_max_ilp32);
955 }
956 
957 /*
958  * rctl_qty_t rctl_model_value(rctl_dict_entry_t *, struct proc *, rctl_qty_t)
959  *
960  * Overview
961  *   Convenience function wrapping the rctl_model_maximum() functionality.
962  *
963  * Return values
964  *   The lesser of the process's maximum value and the given value for the
965  *   specified resource control.
966  *
967  * Caller's context
968  *   No restrictions on context.
969  */
970 rctl_qty_t
971 rctl_model_value(rctl_dict_entry_t *rde, struct proc *p, rctl_qty_t value)
972 {
973 	rctl_qty_t max = rctl_model_maximum(rde, p);
974 
975 	return (value < max ? value : max);
976 }
977 
978 static void
979 rctl_set_insert(rctl_set_t *set, rctl_hndl_t hndl, rctl_t *rctl)
980 {
981 	uint_t index = hndl % rctl_set_size;
982 	rctl_t *next_ctl, *prev_ctl;
983 
984 	ASSERT(MUTEX_HELD(&set->rcs_lock));
985 
986 	rctl->rc_next = NULL;
987 
988 	if (set->rcs_ctls[index] == NULL) {
989 		set->rcs_ctls[index] = rctl;
990 		return;
991 	}
992 
993 	if (hndl < set->rcs_ctls[index]->rc_id) {
994 		rctl->rc_next = set->rcs_ctls[index];
995 		set->rcs_ctls[index] = rctl;
996 
997 		return;
998 	}
999 
1000 	for (next_ctl = set->rcs_ctls[index]->rc_next,
1001 	    prev_ctl = set->rcs_ctls[index];
1002 	    next_ctl != NULL;
1003 	    prev_ctl = next_ctl,
1004 	    next_ctl = next_ctl->rc_next) {
1005 		if (next_ctl->rc_id > hndl) {
1006 			rctl->rc_next = next_ctl;
1007 			prev_ctl->rc_next = rctl;
1008 
1009 			return;
1010 		}
1011 	}
1012 
1013 	rctl->rc_next = next_ctl;
1014 	prev_ctl->rc_next = rctl;
1015 }
1016 
1017 /*
1018  * rctl_set_t *rctl_set_create()
1019  *
1020  * Overview
1021  *   Create an empty resource control set, suitable for attaching to a
1022  *   controlled entity.
1023  *
1024  * Return values
1025  *   A pointer to the newly created set.
1026  *
1027  * Caller's context
1028  *   Safe for KM_SLEEP allocations.
1029  */
1030 rctl_set_t *
1031 rctl_set_create()
1032 {
1033 	rctl_set_t *rset = kmem_zalloc(sizeof (rctl_set_t), KM_SLEEP);
1034 
1035 	mutex_init(&rset->rcs_lock, NULL, MUTEX_DEFAULT, NULL);
1036 	rset->rcs_ctls = kmem_zalloc(rctl_set_size * sizeof (rctl_t *),
1037 	    KM_SLEEP);
1038 	rset->rcs_entity = -1;
1039 
1040 	return (rset);
1041 }
1042 
1043 /*
1044  * rctl_gp_alloc_t *rctl_set_init_prealloc(rctl_entity_t)
1045  *
1046  * Overview
1047  *    rctl_set_init_prealloc() examines the globally defined resource controls
1048  *    and their default values and returns a resource control allocation group
1049  *    populated with sufficient controls and values to form a representative
1050  *    resource control set for the specified entity.
1051  *
1052  * Return values
1053  *    A pointer to the newly created allocation group.
1054  *
1055  * Caller's context
1056  *    Caller must be in a context suitable for KM_SLEEP allocations.
1057  */
1058 rctl_alloc_gp_t *
1059 rctl_set_init_prealloc(rctl_entity_t entity)
1060 {
1061 	rctl_dict_entry_t *rde;
1062 	rctl_alloc_gp_t *ragp = kmem_zalloc(sizeof (rctl_alloc_gp_t), KM_SLEEP);
1063 
1064 	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
1065 
1066 	if (rctl_lists[entity] == NULL)
1067 		return (ragp);
1068 
1069 	mutex_enter(&rctl_lists_lock);
1070 
1071 	for (rde = rctl_lists[entity]; rde != NULL; rde = rde->rcd_next) {
1072 		ragp->rcag_nctls++;
1073 		ragp->rcag_nvals += rctl_val_list_count(rde->rcd_default_value);
1074 	}
1075 
1076 	mutex_exit(&rctl_lists_lock);
1077 
1078 	rctl_gp_alloc(ragp);
1079 
1080 	return (ragp);
1081 }
1082 
1083 /*
1084  * rctl_set_t *rctl_set_init(rctl_entity_t)
1085  *
1086  * Overview
1087  *   rctl_set_create() creates a resource control set, initialized with the
1088  *   system infinite values on all registered controls, for attachment to a
1089  *   system entity requiring resource controls, such as a process or a task.
1090  *
1091  * Return values
1092  *   A pointer to the newly filled set.
1093  *
1094  * Caller's context
1095  *   Caller must be holding p_lock on entry so that RCTLOP_SET() functions
1096  *   may modify task and project members based on the proc structure
1097  *   they are passed.
1098  */
1099 rctl_set_t *
1100 rctl_set_init(rctl_entity_t entity, struct proc *p, rctl_entity_p_t *e,
1101     rctl_set_t *rset, rctl_alloc_gp_t *ragp)
1102 {
1103 	rctl_dict_entry_t *rde;
1104 
1105 	ASSERT(MUTEX_HELD(&p->p_lock));
1106 	ASSERT(e);
1107 	rset->rcs_entity = entity;
1108 
1109 	if (rctl_lists[entity] == NULL)
1110 		return (rset);
1111 
1112 	mutex_enter(&rctl_lists_lock);
1113 	mutex_enter(&rset->rcs_lock);
1114 
1115 	for (rde = rctl_lists[entity]; rde != NULL; rde = rde->rcd_next) {
1116 		rctl_t *rctl = rctl_gp_detach_ctl(ragp);
1117 
1118 		rctl->rc_dict_entry = rde;
1119 		rctl->rc_id = rde->rcd_id;
1120 		rctl->rc_projdb = NULL;
1121 
1122 		rctl->rc_values = rctl_val_list_dup(rde->rcd_default_value,
1123 		    ragp, NULL, p);
1124 		rctl->rc_cursor = rctl->rc_values;
1125 
1126 		ASSERT(rctl->rc_cursor != NULL);
1127 
1128 		rctl_set_insert(rset, rde->rcd_id, rctl);
1129 
1130 		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1131 		    rctl->rc_cursor->rcv_value));
1132 	}
1133 
1134 	mutex_exit(&rset->rcs_lock);
1135 	mutex_exit(&rctl_lists_lock);
1136 
1137 	return (rset);
1138 }
1139 
1140 static rctl_t *
1141 rctl_dup(rctl_t *rctl, rctl_alloc_gp_t *ragp, struct proc *oldp,
1142     struct proc *newp)
1143 {
1144 	rctl_t *dup = rctl_gp_detach_ctl(ragp);
1145 	rctl_val_t *dval;
1146 
1147 	dup->rc_id = rctl->rc_id;
1148 	dup->rc_dict_entry = rctl->rc_dict_entry;
1149 	dup->rc_next = NULL;
1150 	dup->rc_cursor = NULL;
1151 	dup->rc_values = rctl_val_list_dup(rctl->rc_values, ragp, oldp, newp);
1152 
1153 	for (dval = dup->rc_values;
1154 	    dval != NULL; dval = dval->rcv_next) {
1155 		if (rctl_val_cmp(rctl->rc_cursor, dval, 0) >= 0) {
1156 			dup->rc_cursor = dval;
1157 			break;
1158 		}
1159 	}
1160 
1161 	if (dup->rc_cursor == NULL)
1162 		dup->rc_cursor = dup->rc_values;
1163 
1164 	return (dup);
1165 }
1166 
1167 static void
1168 rctl_set_fill_alloc_gp(rctl_set_t *set, rctl_alloc_gp_t *ragp)
1169 {
1170 	uint_t i;
1171 
1172 	bzero(ragp, sizeof (rctl_alloc_gp_t));
1173 
1174 	for (i = 0; i < rctl_set_size; i++) {
1175 		rctl_t *r = set->rcs_ctls[i];
1176 
1177 		while (r != NULL) {
1178 			ragp->rcag_nctls++;
1179 
1180 			ragp->rcag_nvals += rctl_val_list_count(r->rc_values);
1181 
1182 			r = r->rc_next;
1183 		}
1184 	}
1185 }
1186 
1187 /*
1188  * rctl_alloc_gp_t *rctl_set_dup_prealloc(rctl_set_t *)
1189  *
1190  * Overview
1191  *   Given a resource control set, allocate a sufficiently large allocation
1192  *   group to contain a duplicate of the set.
1193  *
1194  * Return value
1195  *   A pointer to the newly created allocation group.
1196  *
1197  * Caller's context
1198  *   Safe for KM_SLEEP allocations.
1199  */
1200 rctl_alloc_gp_t *
1201 rctl_set_dup_prealloc(rctl_set_t *set)
1202 {
1203 	rctl_alloc_gp_t *ragp = kmem_zalloc(sizeof (rctl_alloc_gp_t), KM_SLEEP);
1204 
1205 	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
1206 
1207 	mutex_enter(&set->rcs_lock);
1208 	rctl_set_fill_alloc_gp(set, ragp);
1209 	mutex_exit(&set->rcs_lock);
1210 
1211 	rctl_gp_alloc(ragp);
1212 
1213 	return (ragp);
1214 }
1215 
1216 /*
1217  * int rctl_set_dup_ready(rctl_set_t *, rctl_alloc_gp_t *)
1218  *
1219  * Overview
1220  *   Verify that the allocation group provided is large enough to allow a
1221  *   duplicate of the given resource control set to be constructed from its
1222  *   contents.
1223  *
1224  * Return values
1225  *   1 if the allocation group is sufficiently large, 0 otherwise.
1226  *
1227  * Caller's context
1228  *   rcs_lock must be held prior to entry.
1229  */
1230 int
1231 rctl_set_dup_ready(rctl_set_t *set, rctl_alloc_gp_t *ragp)
1232 {
1233 	rctl_alloc_gp_t curr_gp;
1234 
1235 	ASSERT(MUTEX_HELD(&set->rcs_lock));
1236 
1237 	rctl_set_fill_alloc_gp(set, &curr_gp);
1238 
1239 	if (curr_gp.rcag_nctls <= ragp->rcag_nctls &&
1240 	    curr_gp.rcag_nvals <= ragp->rcag_nvals)
1241 		return (1);
1242 
1243 	return (0);
1244 }
1245 
1246 /*
1247  * rctl_set_t *rctl_set_dup(rctl_set_t *, struct proc *, struct proc *,
1248  *   rctl_set_t *, rctl_alloc_gp_t *, int)
1249  *
1250  * Overview
1251  *   Make a duplicate of the resource control set.  The proc pointers are those
1252  *   of the owning process and of the process associated with the entity
1253  *   receiving the duplicate.
1254  *
1255  *   Duplication is a 3 stage process. Stage 1 is memory allocation for
1256  *   the duplicate set, which is taken care of by rctl_set_dup_prealloc().
1257  *   Stage 2 consists of copying all rctls and values from the old set into
1258  *   the new. Stage 3 completes the duplication by performing the appropriate
1259  *   callbacks for each rctl in the new set.
1260  *
1261  *   Stages 2 and 3 are handled by calling rctl_set_dup with the RCD_DUP and
1262  *   RCD_CALLBACK functions, respectively. The RCD_CALLBACK flag may only
1263  *   be supplied if the newp proc structure reflects the new task and
1264  *   project linkage.
1265  *
1266  * Return value
1267  *   A pointer to the duplicate set.
1268  *
1269  * Caller's context
1270  *   The rcs_lock of the set to be duplicated must be held prior to entry.
1271  */
1272 rctl_set_t *
1273 rctl_set_dup(rctl_set_t *set, struct proc *oldp, struct proc *newp,
1274     rctl_entity_p_t *e, rctl_set_t *dup, rctl_alloc_gp_t *ragp, int flag)
1275 {
1276 	uint_t i;
1277 	rctl_set_t	*iter;
1278 
1279 	ASSERT((flag & RCD_DUP) || (flag & RCD_CALLBACK));
1280 	ASSERT(e);
1281 	/*
1282 	 * When copying the old set, iterate over that. Otherwise, when
1283 	 * only callbacks have been requested, iterate over the dup set.
1284 	 */
1285 	if (flag & RCD_DUP) {
1286 		ASSERT(MUTEX_HELD(&set->rcs_lock));
1287 		iter = set;
1288 		dup->rcs_entity = set->rcs_entity;
1289 	} else {
1290 		iter = dup;
1291 	}
1292 
1293 	mutex_enter(&dup->rcs_lock);
1294 
1295 	for (i = 0; i < rctl_set_size; i++) {
1296 		rctl_t *r = iter->rcs_ctls[i];
1297 		rctl_t *d;
1298 
1299 		while (r != NULL) {
1300 			if (flag & RCD_DUP) {
1301 				d = rctl_dup(r, ragp, oldp, newp);
1302 				rctl_set_insert(dup, r->rc_id, d);
1303 			} else {
1304 				d = r;
1305 			}
1306 
1307 			if (flag & RCD_CALLBACK)
1308 				RCTLOP_SET(d, newp, e,
1309 				    rctl_model_value(d->rc_dict_entry, newp,
1310 				    d->rc_cursor->rcv_value));
1311 
1312 			r = r->rc_next;
1313 		}
1314 	}
1315 
1316 	mutex_exit(&dup->rcs_lock);
1317 
1318 	return (dup);
1319 }
1320 
1321 /*
1322  * void rctl_set_free(rctl_set_t *)
1323  *
1324  * Overview
1325  *   Delete resource control set and all attached values.
1326  *
1327  * Return values
1328  *   No value returned.
1329  *
1330  * Caller's context
1331  *   No restrictions on context.
1332  */
1333 void
1334 rctl_set_free(rctl_set_t *set)
1335 {
1336 	uint_t i;
1337 
1338 	mutex_enter(&set->rcs_lock);
1339 	for (i = 0; i < rctl_set_size; i++) {
1340 		rctl_t *r = set->rcs_ctls[i];
1341 
1342 		while (r != NULL) {
1343 			rctl_val_t *v = r->rc_values;
1344 			rctl_t *n = r->rc_next;
1345 
1346 			kmem_cache_free(rctl_cache, r);
1347 
1348 			rctl_val_list_free(v);
1349 
1350 			r = n;
1351 		}
1352 	}
1353 	mutex_exit(&set->rcs_lock);
1354 
1355 	kmem_free(set->rcs_ctls, sizeof (rctl_t *) * rctl_set_size);
1356 	kmem_free(set, sizeof (rctl_set_t));
1357 }
1358 
1359 /*
1360  * void rctl_set_reset(rctl_set_t *)
1361  *
1362  * Overview
1363  *   Resets all rctls within the set such that the lowest value becomes active.
1364  *
1365  * Return values
1366  *   No value returned.
1367  *
1368  * Caller's context
1369  *   No restrictions on context.
1370  */
1371 void
1372 rctl_set_reset(rctl_set_t *set, struct proc *p, rctl_entity_p_t *e)
1373 {
1374 	uint_t i;
1375 
1376 	ASSERT(e);
1377 
1378 	mutex_enter(&set->rcs_lock);
1379 	for (i = 0; i < rctl_set_size; i++) {
1380 		rctl_t *r = set->rcs_ctls[i];
1381 
1382 		while (r != NULL) {
1383 			r->rc_cursor = r->rc_values;
1384 			rctl_val_list_reset(r->rc_cursor);
1385 			RCTLOP_SET(r, p, e, rctl_model_value(r->rc_dict_entry,
1386 			    p, r->rc_cursor->rcv_value));
1387 
1388 			ASSERT(r->rc_cursor != NULL);
1389 
1390 			r = r->rc_next;
1391 		}
1392 	}
1393 
1394 	mutex_exit(&set->rcs_lock);
1395 }
1396 
1397 /*
1398  * void rctl_set_tearoff(rctl_set *, struct proc *)
1399  *
1400  * Overview
1401  *   Tear off any resource control values on this set with an action recipient
1402  *   equal to the specified process (as they are becoming invalid with the
1403  *   process's departure from this set as an observer).
1404  *
1405  * Return values
1406  *   No value returned.
1407  *
1408  * Caller's context
1409  *   No restrictions on context
1410  */
1411 void
1412 rctl_set_tearoff(rctl_set_t *set, struct proc *p)
1413 {
1414 	uint_t i;
1415 
1416 	mutex_enter(&set->rcs_lock);
1417 	for (i = 0; i < rctl_set_size; i++) {
1418 		rctl_t *r = set->rcs_ctls[i];
1419 
1420 		while (r != NULL) {
1421 			rctl_val_t *rval;
1422 
1423 tearoff_rewalk_list:
1424 			rval = r->rc_values;
1425 
1426 			while (rval != NULL) {
1427 				if (rval->rcv_privilege == RCPRIV_BASIC &&
1428 				    rval->rcv_action_recipient == p) {
1429 					if (r->rc_cursor == rval)
1430 						r->rc_cursor = rval->rcv_next;
1431 
1432 					(void) rctl_val_list_delete(
1433 					    &r->rc_values, rval);
1434 
1435 					goto tearoff_rewalk_list;
1436 				}
1437 
1438 				rval = rval->rcv_next;
1439 			}
1440 
1441 			ASSERT(r->rc_cursor != NULL);
1442 
1443 			r = r->rc_next;
1444 		}
1445 	}
1446 
1447 	mutex_exit(&set->rcs_lock);
1448 }
1449 
1450 static int
1451 rctl_set_find(rctl_set_t *set, rctl_hndl_t hndl, rctl_t **rctl)
1452 {
1453 	uint_t index = hndl % rctl_set_size;
1454 	rctl_t *curr_ctl;
1455 
1456 	ASSERT(MUTEX_HELD(&set->rcs_lock));
1457 
1458 	for (curr_ctl = set->rcs_ctls[index]; curr_ctl != NULL;
1459 	    curr_ctl = curr_ctl->rc_next) {
1460 		if (curr_ctl->rc_id == hndl) {
1461 			*rctl = curr_ctl;
1462 
1463 			return (0);
1464 		}
1465 	}
1466 
1467 	return (-1);
1468 }
1469 
1470 /*
1471  * rlim64_t rctl_enforced_value(rctl_hndl_t, rctl_set_t *, struct proc *)
1472  *
1473  * Overview
1474  *   Given a process, get the next enforced value on the rctl of the specified
1475  *   handle.
1476  *
1477  * Return value
1478  *   The enforced value.
1479  *
1480  * Caller's context
1481  *   For controls on process collectives, p->p_lock must be held across the
1482  *   operation.
1483  */
1484 /*ARGSUSED*/
1485 rctl_qty_t
1486 rctl_enforced_value(rctl_hndl_t hndl, rctl_set_t *rset, struct proc *p)
1487 {
1488 	rctl_t *rctl;
1489 	rlim64_t ret;
1490 
1491 	mutex_enter(&rset->rcs_lock);
1492 
1493 	if (rctl_set_find(rset, hndl, &rctl) == -1)
1494 		panic("unknown resource control handle %d requested", hndl);
1495 	else
1496 		ret = rctl_model_value(rctl->rc_dict_entry, p,
1497 		    rctl->rc_cursor->rcv_value);
1498 
1499 	mutex_exit(&rset->rcs_lock);
1500 
1501 	return (ret);
1502 }
1503 
1504 /*
1505  * int rctl_global_get(const char *, rctl_dict_entry_t *)
1506  *
1507  * Overview
1508  *   Copy a sanitized version of the global rctl for a given resource control
1509  *   name.  (By sanitization, we mean that the unsafe data pointers have been
1510  *   zeroed.)
1511  *
1512  * Return value
1513  *   -1 if name not defined, 0 otherwise.
1514  *
1515  * Caller's context
1516  *   No restrictions on context.  rctl_dict_lock must not be held.
1517  */
1518 int
1519 rctl_global_get(const char *name, rctl_dict_entry_t *drde)
1520 {
1521 	rctl_dict_entry_t *rde = rctl_dict_lookup(name);
1522 
1523 	if (rde == NULL)
1524 		return (-1);
1525 
1526 	bcopy(rde, drde, sizeof (rctl_dict_entry_t));
1527 
1528 	drde->rcd_next = NULL;
1529 	drde->rcd_ops = NULL;
1530 
1531 	return (0);
1532 }
1533 
1534 /*
1535  * int rctl_global_set(const char *, rctl_dict_entry_t *)
1536  *
1537  * Overview
1538  *   Transfer the settable fields of the named rctl to the global rctl matching
1539  *   the given resource control name.
1540  *
1541  * Return value
1542  *   -1 if name not defined, 0 otherwise.
1543  *
1544  * Caller's context
1545  *   No restrictions on context.  rctl_dict_lock must not be held.
1546  */
1547 int
1548 rctl_global_set(const char *name, rctl_dict_entry_t *drde)
1549 {
1550 	rctl_dict_entry_t *rde = rctl_dict_lookup(name);
1551 
1552 	if (rde == NULL)
1553 		return (-1);
1554 
1555 	rde->rcd_flagaction = drde->rcd_flagaction;
1556 	rde->rcd_syslog_level = drde->rcd_syslog_level;
1557 	rde->rcd_strlog_flags = drde->rcd_strlog_flags;
1558 
1559 	return (0);
1560 }
1561 
1562 static int
1563 rctl_local_op(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval,
1564     int (*cbop)(rctl_hndl_t, struct proc *p, rctl_entity_p_t *e, rctl_t *,
1565     rctl_val_t *, rctl_val_t *), struct proc *p)
1566 {
1567 	rctl_t *rctl;
1568 	rctl_set_t *rset;
1569 	rctl_entity_p_t e;
1570 	int ret = 0;
1571 	rctl_dict_entry_t *rde = rctl_dict_lookup_hndl(hndl);
1572 
1573 local_op_retry:
1574 
1575 	ASSERT(MUTEX_HELD(&p->p_lock));
1576 
1577 	rset = rctl_entity_obtain_rset(rde, p);
1578 
1579 	if (rset == NULL) {
1580 		return (-1);
1581 	}
1582 	rctl_entity_obtain_entity_p(rset->rcs_entity, p, &e);
1583 
1584 	mutex_enter(&rset->rcs_lock);
1585 
1586 	/* using rctl's hndl, get rctl from local set */
1587 	if (rctl_set_find(rset, hndl, &rctl) == -1) {
1588 		mutex_exit(&rset->rcs_lock);
1589 		return (-1);
1590 	}
1591 
1592 	ret = cbop(hndl, p, &e, rctl, oval, nval);
1593 
1594 	mutex_exit(&rset->rcs_lock);
1595 	return (ret);
1596 }
1597 
1598 /*ARGSUSED*/
1599 static int
1600 rctl_local_get_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1601     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1602 {
1603 	if (oval == NULL) {
1604 		/*
1605 		 * RCTL_FIRST
1606 		 */
1607 		bcopy(rctl->rc_values, nval, sizeof (rctl_val_t));
1608 	} else {
1609 		/*
1610 		 * RCTL_NEXT
1611 		 */
1612 		rctl_val_t *tval = rctl_val_list_find(&rctl->rc_values, oval);
1613 
1614 		if (tval == NULL)
1615 			return (ESRCH);
1616 		else if (tval->rcv_next == NULL)
1617 			return (ENOENT);
1618 		else
1619 			bcopy(tval->rcv_next, nval, sizeof (rctl_val_t));
1620 	}
1621 
1622 	return (0);
1623 }
1624 
1625 /*
1626  * int rctl_local_get(rctl_hndl_t, rctl_val_t *)
1627  *
1628  * Overview
1629  *   Get the rctl value for the given flags.
1630  *
1631  * Return values
1632  *   0 for successful get, errno otherwise.
1633  */
1634 int
1635 rctl_local_get(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval,
1636     struct proc *p)
1637 {
1638 	return (rctl_local_op(hndl, oval, nval, rctl_local_get_cb, p));
1639 }
1640 
1641 /*ARGSUSED*/
1642 static int
1643 rctl_local_delete_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1644     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1645 {
1646 	if ((oval = rctl_val_list_find(&rctl->rc_values, nval)) == NULL)
1647 		return (ESRCH);
1648 
1649 	if (rctl->rc_cursor == oval) {
1650 		rctl->rc_cursor = oval->rcv_next;
1651 		rctl_val_list_reset(rctl->rc_cursor);
1652 		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1653 		    rctl->rc_cursor->rcv_value));
1654 
1655 		ASSERT(rctl->rc_cursor != NULL);
1656 	}
1657 
1658 	(void) rctl_val_list_delete(&rctl->rc_values, oval);
1659 
1660 	return (0);
1661 }
1662 
1663 /*
1664  * int rctl_local_delete(rctl_hndl_t, rctl_val_t *)
1665  *
1666  * Overview
1667  *   Delete the rctl value for the given flags.
1668  *
1669  * Return values
1670  *   0 for successful delete, errno otherwise.
1671  */
1672 int
1673 rctl_local_delete(rctl_hndl_t hndl, rctl_val_t *val, struct proc *p)
1674 {
1675 	return (rctl_local_op(hndl, NULL, val, rctl_local_delete_cb, p));
1676 }
1677 
1678 /*
1679  * rctl_local_insert_cb()
1680  *
1681  * Overview
1682  *   Insert a new value into the rctl's val list. If an error occurs,
1683  *   the val list must be left in the same state as when the function
1684  *   was entered.
1685  *
1686  * Return Values
1687  *   0 for successful insert, EINVAL if the value is duplicated in the
1688  *   existing list.
1689  */
1690 /*ARGSUSED*/
1691 static int
1692 rctl_local_insert_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1693     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1694 {
1695 	/*
1696 	 * Before inserting, confirm there are no duplicates of this value
1697 	 * and flag level. If there is a duplicate, flag an error and do
1698 	 * nothing.
1699 	 */
1700 	if (rctl_val_list_insert(&rctl->rc_values, nval) != 0)
1701 		return (EINVAL);
1702 
1703 	if (rctl_val_cmp(nval, rctl->rc_cursor, 0) < 0) {
1704 		rctl->rc_cursor = nval;
1705 		rctl_val_list_reset(rctl->rc_cursor);
1706 		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1707 		    rctl->rc_cursor->rcv_value));
1708 
1709 		ASSERT(rctl->rc_cursor != NULL);
1710 	}
1711 
1712 	return (0);
1713 }
1714 
1715 /*
1716  * int rctl_local_insert(rctl_hndl_t, rctl_val_t *)
1717  *
1718  * Overview
1719  *   Insert the rctl value into the appropriate rctl set for the calling
1720  *   process, given the handle.
1721  */
1722 int
1723 rctl_local_insert(rctl_hndl_t hndl, rctl_val_t *val, struct proc *p)
1724 {
1725 	return (rctl_local_op(hndl, NULL, val, rctl_local_insert_cb, p));
1726 }
1727 
1728 /*
1729  * rctl_local_insert_all_cb()
1730  *
1731  * Overview
1732  *   Called for RCENTITY_PROJECT rctls only, via rctlsys_projset().
1733  *
1734  *   Inserts new values from the project database (new_values).  alloc_values
1735  *   should be a linked list of pre-allocated rctl_val_t, which are used to
1736  *   populate (rc_projdb).
1737  *
1738  *   Should the *new_values linked list match the contents of the rctl's
1739  *   rp_projdb then we do nothing.
1740  *
1741  * Return Values
1742  *   0 is always returned.
1743  */
1744 /*ARGSUSED*/
1745 static int
1746 rctl_local_insert_all_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1747     rctl_t *rctl, rctl_val_t *new_values, rctl_val_t *alloc_values)
1748 {
1749 	rctl_val_t *val;
1750 	rctl_val_t *tmp_val;
1751 	rctl_val_t *next;
1752 	int modified = 0;
1753 
1754 	/*
1755 	 * If this the first time we've set this project rctl, then we delete
1756 	 * all the privilege values.  These privilege values have been set by
1757 	 * rctl_add_default_limit().
1758 	 *
1759 	 * We save some cycles here by not calling rctl_val_list_delete().
1760 	 */
1761 	if (rctl->rc_projdb == NULL) {
1762 		val = rctl->rc_values;
1763 
1764 		while (val != NULL) {
1765 			if (val->rcv_privilege == RCPRIV_PRIVILEGED) {
1766 				if (val->rcv_prev != NULL)
1767 					val->rcv_prev->rcv_next = val->rcv_next;
1768 				else
1769 					rctl->rc_values = val->rcv_next;
1770 
1771 				if (val->rcv_next != NULL)
1772 					val->rcv_next->rcv_prev = val->rcv_prev;
1773 
1774 				tmp_val = val;
1775 				val = val->rcv_next;
1776 				kmem_cache_free(rctl_val_cache, tmp_val);
1777 			} else {
1778 				val = val->rcv_next;
1779 			}
1780 		}
1781 		modified = 1;
1782 	}
1783 
1784 	/*
1785 	 * Delete active values previously set through the project database.
1786 	 */
1787 	val = rctl->rc_projdb;
1788 
1789 	while (val != NULL) {
1790 
1791 		/* Is the old value found in the new values? */
1792 		if (rctl_val_list_find(&new_values, val) == NULL) {
1793 
1794 			/*
1795 			 * Delete from the active values if it originated from
1796 			 * the project database.
1797 			 */
1798 			if (((tmp_val = rctl_val_list_find(&rctl->rc_values,
1799 			    val)) != NULL) &&
1800 			    (tmp_val->rcv_flagaction & RCTL_LOCAL_PROJDB)) {
1801 				(void) rctl_val_list_delete(&rctl->rc_values,
1802 				    tmp_val);
1803 			}
1804 
1805 			tmp_val = val->rcv_next;
1806 			(void) rctl_val_list_delete(&rctl->rc_projdb, val);
1807 			val = tmp_val;
1808 			modified = 1;
1809 
1810 		} else
1811 			val = val->rcv_next;
1812 	}
1813 
1814 	/*
1815 	 * Insert new values from the project database.
1816 	 */
1817 	while (new_values != NULL) {
1818 		next = new_values->rcv_next;
1819 
1820 		/*
1821 		 * Insert this new value into the rc_projdb, and duplicate this
1822 		 * entry to the active list.
1823 		 */
1824 		if (rctl_val_list_insert(&rctl->rc_projdb, new_values) == 0) {
1825 
1826 			tmp_val = alloc_values->rcv_next;
1827 			bcopy(new_values, alloc_values, sizeof (rctl_val_t));
1828 			alloc_values->rcv_next = tmp_val;
1829 
1830 			if (rctl_val_list_insert(&rctl->rc_values,
1831 				alloc_values) == 0) {
1832 				/* inserted move alloc_values on */
1833 				alloc_values = tmp_val;
1834 				modified = 1;
1835 			}
1836 		} else {
1837 			/*
1838 			 * Unlike setrctl() we don't want to return an error on
1839 			 * a duplicate entry; we are concerned solely with
1840 			 * ensuring that all the values specified are set.
1841 			 */
1842 			kmem_cache_free(rctl_val_cache, new_values);
1843 		}
1844 		new_values = next;
1845 	}
1846 
1847 	/* Teardown any unused rctl_val_t */
1848 	while (alloc_values != NULL) {
1849 		tmp_val = alloc_values;
1850 		alloc_values = alloc_values->rcv_next;
1851 		kmem_cache_free(rctl_val_cache, tmp_val);
1852 	}
1853 
1854 	/* Reset the cursor if rctl values have been modified */
1855 	if (modified) {
1856 		rctl->rc_cursor = rctl->rc_values;
1857 		rctl_val_list_reset(rctl->rc_cursor);
1858 		RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1859 		    rctl->rc_cursor->rcv_value));
1860 	}
1861 
1862 	return (0);
1863 }
1864 
1865 int
1866 rctl_local_insert_all(rctl_hndl_t hndl, rctl_val_t *new_values,
1867     rctl_val_t *alloc_values, struct proc *p)
1868 {
1869 	return (rctl_local_op(hndl, new_values, alloc_values,
1870 	    rctl_local_insert_all_cb, p));
1871 }
1872 
1873 /*
1874  * rctl_local_replace_all_cb()
1875  *
1876  * Overview
1877  *   Called for RCENTITY_PROJECT rctls only, via rctlsys_projset().
1878  *
1879  *   Clears the active rctl values (rc_values), and stored values from the
1880  *   previous insertions from the project database (rc_projdb).
1881  *
1882  *   Inserts new values from the project database (new_values).  alloc_values
1883  *   should be a linked list of pre-allocated rctl_val_t, which are used to
1884  *   populate (rc_projdb).
1885  *
1886  * Return Values
1887  *   0 is always returned.
1888  */
1889 /*ARGSUSED*/
1890 static int
1891 rctl_local_replace_all_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1892     rctl_t *rctl, rctl_val_t *new_values, rctl_val_t *alloc_values)
1893 {
1894 	rctl_val_t *val;
1895 	rctl_val_t *next;
1896 	rctl_val_t *tmp_val;
1897 
1898 	/* Delete all the privilege vaules */
1899 	val = rctl->rc_values;
1900 
1901 	while (val != NULL) {
1902 		if (val->rcv_privilege == RCPRIV_PRIVILEGED) {
1903 			if (val->rcv_prev != NULL)
1904 				val->rcv_prev->rcv_next = val->rcv_next;
1905 			else
1906 				rctl->rc_values = val->rcv_next;
1907 
1908 			if (val->rcv_next != NULL)
1909 				val->rcv_next->rcv_prev = val->rcv_prev;
1910 
1911 			tmp_val = val;
1912 			val = val->rcv_next;
1913 			kmem_cache_free(rctl_val_cache, tmp_val);
1914 		} else {
1915 			val = val->rcv_next;
1916 		}
1917 	}
1918 
1919 	/* Delete the contents of rc_projdb */
1920 	val = rctl->rc_projdb;
1921 	while (val != NULL) {
1922 
1923 		tmp_val = val;
1924 		val = val->rcv_next;
1925 		kmem_cache_free(rctl_val_cache, tmp_val);
1926 	}
1927 	rctl->rc_projdb = NULL;
1928 
1929 	/*
1930 	 * Insert new values from the project database.
1931 	 */
1932 	while (new_values != NULL) {
1933 		next = new_values->rcv_next;
1934 
1935 		if (rctl_val_list_insert(&rctl->rc_projdb, new_values) == 0) {
1936 			tmp_val = alloc_values->rcv_next;
1937 			bcopy(new_values, alloc_values, sizeof (rctl_val_t));
1938 			alloc_values->rcv_next = tmp_val;
1939 
1940 			if (rctl_val_list_insert(&rctl->rc_values,
1941 				alloc_values) == 0) {
1942 				/* inserted, so move alloc_values on */
1943 				alloc_values = tmp_val;
1944 			}
1945 		} else {
1946 			/*
1947 			 * Unlike setrctl() we don't want to return an error on
1948 			 * a duplicate entry; we are concerned solely with
1949 			 * ensuring that all the values specified are set.
1950 			 */
1951 			kmem_cache_free(rctl_val_cache, new_values);
1952 		}
1953 
1954 		new_values = next;
1955 	}
1956 
1957 	/* Teardown any unused rctl_val_t */
1958 	while (alloc_values != NULL) {
1959 		tmp_val = alloc_values;
1960 		alloc_values = alloc_values->rcv_next;
1961 		kmem_cache_free(rctl_val_cache, tmp_val);
1962 	}
1963 
1964 	/* Always reset the cursor */
1965 	rctl->rc_cursor = rctl->rc_values;
1966 	rctl_val_list_reset(rctl->rc_cursor);
1967 	RCTLOP_SET(rctl, p, e, rctl_model_value(rctl->rc_dict_entry, p,
1968 	    rctl->rc_cursor->rcv_value));
1969 
1970 	return (0);
1971 }
1972 
1973 int
1974 rctl_local_replace_all(rctl_hndl_t hndl, rctl_val_t *new_values,
1975     rctl_val_t *alloc_values, struct proc *p)
1976 {
1977 	return (rctl_local_op(hndl, new_values, alloc_values,
1978 	    rctl_local_replace_all_cb, p));
1979 }
1980 
1981 static int
1982 rctl_local_replace_cb(rctl_hndl_t hndl, struct proc *p, rctl_entity_p_t *e,
1983     rctl_t *rctl, rctl_val_t *oval, rctl_val_t *nval)
1984 {
1985 	int ret;
1986 	rctl_val_t *tmp;
1987 
1988 	/* Verify that old will be delete-able */
1989 	tmp = rctl_val_list_find(&rctl->rc_values, oval);
1990 	if (tmp == NULL)
1991 		return (ESRCH);
1992 	/*
1993 	 * Caller should verify that value being deleted is not the
1994 	 * system value.
1995 	 */
1996 	ASSERT(tmp->rcv_privilege != RCPRIV_SYSTEM);
1997 
1998 	/*
1999 	 * rctl_local_insert_cb() does the job of flagging an error
2000 	 * for any duplicate values. So, call rctl_local_insert_cb()
2001 	 * for the new value first, then do deletion of the old value.
2002 	 * Since this is a callback function to rctl_local_op, we can
2003 	 * count on rcs_lock being held at this point. This guarantees
2004 	 * that there is at no point a visible list which contains both
2005 	 * new and old values.
2006 	 */
2007 	if (ret = rctl_local_insert_cb(hndl, p, e, rctl, NULL, nval))
2008 		return (ret);
2009 
2010 	ret = rctl_local_delete_cb(hndl, p, e, rctl, NULL, oval);
2011 	ASSERT(ret == 0);
2012 	return (0);
2013 }
2014 
2015 /*
2016  * int rctl_local_replace(rctl_hndl_t, void *, int, uint64_t *)
2017  *
2018  * Overview
2019  *   Replace the rctl value with a new one.
2020  *
2021  * Return values
2022  *   0 for successful replace, errno otherwise.
2023  */
2024 int
2025 rctl_local_replace(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval,
2026     struct proc *p)
2027 {
2028 	return (rctl_local_op(hndl, oval, nval, rctl_local_replace_cb, p));
2029 }
2030 
2031 /*
2032  * int rctl_rlimit_get(rctl_hndl_t, struct proc *, struct rlimit64 *)
2033  *
2034  * Overview
2035  *   To support rlimit compatibility, we need a function which takes a 64-bit
2036  *   rlimit and encodes it as appropriate rcontrol values on the given rcontrol.
2037  *   This operation is only intended for legacy rlimits.
2038  */
2039 int
2040 rctl_rlimit_get(rctl_hndl_t rc, struct proc *p, struct rlimit64 *rlp64)
2041 {
2042 	rctl_t *rctl;
2043 	rctl_val_t *rval;
2044 	rctl_set_t *rset = p->p_rctls;
2045 	int soft_limit_seen = 0;
2046 	int test_for_deny = 1;
2047 
2048 	mutex_enter(&rset->rcs_lock);
2049 	if (rctl_set_find(rset, rc, &rctl) == -1) {
2050 		mutex_exit(&rset->rcs_lock);
2051 		return (-1);
2052 	}
2053 
2054 	rval = rctl->rc_values;
2055 
2056 	if (rctl->rc_dict_entry->rcd_flagaction & (RCTL_GLOBAL_DENY_NEVER |
2057 	    RCTL_GLOBAL_DENY_ALWAYS))
2058 		test_for_deny = 0;
2059 
2060 	/*
2061 	 * 1.  Find the first control value with the RCTL_LOCAL_DENY bit set.
2062 	 */
2063 	while (rval != NULL && rval->rcv_privilege != RCPRIV_SYSTEM) {
2064 		if (test_for_deny &&
2065 		    (rval->rcv_flagaction & RCTL_LOCAL_DENY) == 0) {
2066 			rval = rval->rcv_next;
2067 			continue;
2068 		}
2069 
2070 		/*
2071 		 * 2.  If this is an RCPRIV_BASIC value, then we've found the
2072 		 * effective soft limit and should set rlim_cur.  We should then
2073 		 * continue looking for another control value with the DENY bit
2074 		 * set.
2075 		 */
2076 		if (rval->rcv_privilege == RCPRIV_BASIC) {
2077 			if (soft_limit_seen) {
2078 				rval = rval->rcv_next;
2079 				continue;
2080 			}
2081 
2082 			if ((rval->rcv_flagaction & RCTL_LOCAL_MAXIMAL) == 0 &&
2083 			    rval->rcv_value < rctl_model_maximum(
2084 			    rctl->rc_dict_entry, p))
2085 				rlp64->rlim_cur = rval->rcv_value;
2086 			else
2087 				rlp64->rlim_cur = RLIM64_INFINITY;
2088 			soft_limit_seen = 1;
2089 
2090 			rval = rval->rcv_next;
2091 			continue;
2092 		}
2093 
2094 		/*
2095 		 * 3.  This is an RCPRIV_PRIVILEGED value.  If we haven't found
2096 		 * a soft limit candidate, then we've found the effective hard
2097 		 * and soft limits and should set both  If we had found a soft
2098 		 * limit, then this is only the hard limit and we need only set
2099 		 * rlim_max.
2100 		 */
2101 		if ((rval->rcv_flagaction & RCTL_LOCAL_MAXIMAL) == 0 &&
2102 		    rval->rcv_value < rctl_model_maximum(rctl->rc_dict_entry,
2103 		    p))
2104 			rlp64->rlim_max = rval->rcv_value;
2105 		else
2106 			rlp64->rlim_max = RLIM64_INFINITY;
2107 		if (!soft_limit_seen)
2108 			rlp64->rlim_cur = rlp64->rlim_max;
2109 
2110 		mutex_exit(&rset->rcs_lock);
2111 		return (0);
2112 	}
2113 
2114 	if (rval == NULL) {
2115 		/*
2116 		 * This control sequence is corrupt, as it is not terminated by
2117 		 * a system privileged control value.
2118 		 */
2119 		mutex_exit(&rset->rcs_lock);
2120 		return (-1);
2121 	}
2122 
2123 	/*
2124 	 * 4.  If we run into a RCPRIV_SYSTEM value, then the hard limit (and
2125 	 * the soft, if we haven't a soft candidate) should be the value of the
2126 	 * system control value.
2127 	 */
2128 	if ((rval->rcv_flagaction & RCTL_LOCAL_MAXIMAL) == 0 &&
2129 	    rval->rcv_value < rctl_model_maximum(rctl->rc_dict_entry, p))
2130 		rlp64->rlim_max = rval->rcv_value;
2131 	else
2132 		rlp64->rlim_max = RLIM64_INFINITY;
2133 
2134 	if (!soft_limit_seen)
2135 		rlp64->rlim_cur = rlp64->rlim_max;
2136 
2137 	mutex_exit(&rset->rcs_lock);
2138 	return (0);
2139 }
2140 
2141 /*
2142  * rctl_alloc_gp_t *rctl_rlimit_set_prealloc(uint_t)
2143  *
2144  * Overview
2145  *   Before making a series of calls to rctl_rlimit_set(), we must have a
2146  *   preallocated batch of resource control values, as rctl_rlimit_set() can
2147  *   potentially consume two resource control values per call.
2148  *
2149  * Return values
2150  *   A populated resource control allocation group with 2n resource control
2151  *   values.
2152  *
2153  * Caller's context
2154  *   Must be safe for KM_SLEEP allocations.
2155  */
2156 rctl_alloc_gp_t *
2157 rctl_rlimit_set_prealloc(uint_t n)
2158 {
2159 	rctl_alloc_gp_t *gp = kmem_zalloc(sizeof (rctl_alloc_gp_t), KM_SLEEP);
2160 
2161 	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
2162 
2163 	gp->rcag_nvals = 2 * n;
2164 
2165 	rctl_gp_alloc(gp);
2166 
2167 	return (gp);
2168 }
2169 
2170 /*
 * int rctl_rlimit_set(rctl_hndl_t, struct proc *, struct rlimit64 *,
 *   rctl_alloc_gp_t *, int, int, const cred_t *)
2173  *
2174  * Overview
2175  *   To support rlimit compatibility, we need a function which takes a 64-bit
2176  *   rlimit and encodes it as appropriate rcontrol values on the given rcontrol.
2177  *   This operation is only intended for legacy rlimits.
2178  *
2179  *   The implementation of rctl_rlimit_set() is a bit clever, as it tries to
2180  *   minimize the number of values placed on the value sequence in various
2181  *   cases.  Furthermore, we don't allow multiple identical privilege-action
2182  *   values on the same sequence.  (That is, we don't want a sequence like
2183  *   "while (1) { rlim.rlim_cur++; setrlimit(..., rlim); }" to exhaust kernel
2184  *   memory.)  So we want to delete any values with the same privilege value and
2185  *   action.
2186  *
2187  * Return values
2188  *   0 for successful set, errno otherwise. Errno will be either EINVAL
2189  *   or EPERM, in keeping with defined errnos for ulimit() and setrlimit()
2190  *   system calls.
2191  */
2192 /*ARGSUSED*/
2193 int
2194 rctl_rlimit_set(rctl_hndl_t rc, struct proc *p, struct rlimit64 *rlp64,
2195     rctl_alloc_gp_t *ragp, int flagaction, int signal, const cred_t *cr)
2196 {
2197 	rctl_t *rctl;
2198 	rctl_val_t *rval, *rval_priv, *rval_basic;
2199 	rctl_set_t *rset = p->p_rctls;
2200 	rctl_qty_t max;
2201 	rctl_entity_p_t e;
2202 	struct rlimit64 cur_rl;
2203 
2204 	e.rcep_t = RCENTITY_PROCESS;
2205 	e.rcep_p.proc = p;
2206 
2207 	if (rlp64->rlim_cur > rlp64->rlim_max)
2208 		return (EINVAL);
2209 
2210 	if (rctl_rlimit_get(rc, p, &cur_rl) == -1)
2211 		return (EINVAL);
2212 
2213 	/*
2214 	 * If we are not privileged, we can only lower the hard limit.
2215 	 */
2216 	if ((rlp64->rlim_max > cur_rl.rlim_max) &&
2217 	    cur_rl.rlim_max != RLIM64_INFINITY &&
2218 	    secpolicy_resource(cr) != 0)
2219 		return (EPERM);
2220 
2221 	mutex_enter(&rset->rcs_lock);
2222 
2223 	if (rctl_set_find(rset, rc, &rctl) == -1) {
2224 		mutex_exit(&rset->rcs_lock);
2225 		return (EINVAL);
2226 	}
2227 
2228 	rval_priv = rctl_gp_detach_val(ragp);
2229 
2230 	rval = rctl->rc_values;
2231 
2232 	while (rval != NULL) {
2233 		rctl_val_t *next = rval->rcv_next;
2234 
2235 		if (rval->rcv_privilege == RCPRIV_SYSTEM)
2236 			break;
2237 
2238 		if ((rval->rcv_privilege == RCPRIV_BASIC) ||
2239 		    (rval->rcv_flagaction & ~RCTL_LOCAL_ACTION_MASK) ==
2240 		    (flagaction & ~RCTL_LOCAL_ACTION_MASK)) {
2241 			if (rctl->rc_cursor == rval) {
2242 				rctl->rc_cursor = rval->rcv_next;
2243 				rctl_val_list_reset(rctl->rc_cursor);
2244 				RCTLOP_SET(rctl, p, &e, rctl_model_value(
2245 				    rctl->rc_dict_entry, p,
2246 				    rctl->rc_cursor->rcv_value));
2247 			}
2248 			(void) rctl_val_list_delete(&rctl->rc_values, rval);
2249 		}
2250 
2251 		rval = next;
2252 	}
2253 
2254 	rval_priv->rcv_privilege = RCPRIV_PRIVILEGED;
2255 	rval_priv->rcv_flagaction = flagaction;
2256 	if (rlp64->rlim_max == RLIM64_INFINITY) {
2257 		rval_priv->rcv_flagaction |= RCTL_LOCAL_MAXIMAL;
2258 		max = rctl->rc_dict_entry->rcd_max_native;
2259 	} else {
2260 		max = rlp64->rlim_max;
2261 	}
2262 	rval_priv->rcv_value = max;
2263 	rval_priv->rcv_action_signal = signal;
2264 	rval_priv->rcv_action_recipient = NULL;
2265 	rval_priv->rcv_action_recip_pid = -1;
2266 	rval_priv->rcv_firing_time = 0;
2267 	rval_priv->rcv_prev = rval_priv->rcv_next = NULL;
2268 
2269 	(void) rctl_val_list_insert(&rctl->rc_values, rval_priv);
2270 	rctl->rc_cursor = rval_priv;
2271 	rctl_val_list_reset(rctl->rc_cursor);
2272 	RCTLOP_SET(rctl, p, &e, rctl_model_value(rctl->rc_dict_entry, p,
2273 	    rctl->rc_cursor->rcv_value));
2274 
2275 	if (rlp64->rlim_cur != RLIM64_INFINITY && rlp64->rlim_cur < max) {
2276 		rval_basic = rctl_gp_detach_val(ragp);
2277 
2278 		rval_basic->rcv_privilege = RCPRIV_BASIC;
2279 		rval_basic->rcv_value = rlp64->rlim_cur;
2280 		rval_basic->rcv_flagaction = flagaction;
2281 		rval_basic->rcv_action_signal = signal;
2282 		rval_basic->rcv_action_recipient = p;
2283 		rval_basic->rcv_action_recip_pid = p->p_pid;
2284 		rval_basic->rcv_firing_time = 0;
2285 		rval_basic->rcv_prev = rval_basic->rcv_next = NULL;
2286 
2287 		(void) rctl_val_list_insert(&rctl->rc_values, rval_basic);
2288 		rctl->rc_cursor = rval_basic;
2289 		rctl_val_list_reset(rctl->rc_cursor);
2290 		RCTLOP_SET(rctl, p, &e, rctl_model_value(rctl->rc_dict_entry, p,
2291 		    rctl->rc_cursor->rcv_value));
2292 	}
2293 
2294 	ASSERT(rctl->rc_cursor != NULL);
2295 
2296 	mutex_exit(&rset->rcs_lock);
2297 	return (0);
2298 }
2299 
2300 
2301 /*
2302  * rctl_hndl_t rctl_register(const char *, rctl_entity_t, int, rlim64_t,
2303  *   rlim64_t, rctl_ops_t *)
2304  *
2305  * Overview
2306  *   rctl_register() performs a look-up in the dictionary of rctls
2307  *   active on the system; if a rctl of that name is absent, an entry is
2308  *   made into the dictionary.  The rctl is returned with its reference
2309  *   count incremented by one.  If the rctl name already exists, we panic.
2310  *   (Were the resource control system to support dynamic loading and unloading,
2311  *   which it is structured for, duplicate registration should lead to load
2312  *   failure instead of panicking.)
2313  *
2314  *   Each registered rctl has a requirement that a RCPRIV_SYSTEM limit be
2315  *   defined.  This limit contains the highest possible value for this quantity
2316  *   on the system.  Furthermore, the registered control must provide infinite
2317  *   values for all applicable address space models supported by the operating
2318  *   system.  Attempts to set resource control values beyond the system limit
2319  *   will fail.
2320  *
2321  * Return values
2322  *   The rctl's ID.
2323  *
2324  * Caller's context
2325  *   Caller must be in a context suitable for KM_SLEEP allocations.
2326  */
2327 rctl_hndl_t
2328 rctl_register(
2329     const char *name,
2330     rctl_entity_t entity,
2331     int global_flags,
2332     rlim64_t max_native,
2333     rlim64_t max_ilp32,
2334     rctl_ops_t *ops)
2335 {
2336 	rctl_t *rctl = kmem_cache_alloc(rctl_cache, KM_SLEEP);
2337 	rctl_val_t *rctl_val = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2338 	rctl_dict_entry_t *rctl_de = kmem_zalloc(sizeof (rctl_dict_entry_t),
2339 	    KM_SLEEP);
2340 	rctl_t *old_rctl;
2341 	rctl_hndl_t rhndl;
2342 	int localflags;
2343 
2344 	ASSERT(ops != NULL);
2345 
2346 	bzero(rctl, sizeof (rctl_t));
2347 	bzero(rctl_val, sizeof (rctl_val_t));
2348 
2349 	if (global_flags & RCTL_GLOBAL_DENY_NEVER)
2350 		localflags = RCTL_LOCAL_MAXIMAL;
2351 	else
2352 		localflags = RCTL_LOCAL_MAXIMAL | RCTL_LOCAL_DENY;
2353 
2354 	rctl_val->rcv_privilege = RCPRIV_SYSTEM;
2355 	rctl_val->rcv_value = max_native;
2356 	rctl_val->rcv_flagaction = localflags;
2357 	rctl_val->rcv_action_signal = 0;
2358 	rctl_val->rcv_action_recipient = NULL;
2359 	rctl_val->rcv_action_recip_pid = -1;
2360 	rctl_val->rcv_firing_time = 0;
2361 	rctl_val->rcv_next = NULL;
2362 	rctl_val->rcv_prev = NULL;
2363 
2364 	rctl_de->rcd_name = (char *)name;
2365 	rctl_de->rcd_default_value = rctl_val;
2366 	rctl_de->rcd_max_native = max_native;
2367 	rctl_de->rcd_max_ilp32 = max_ilp32;
2368 	rctl_de->rcd_entity = entity;
2369 	rctl_de->rcd_ops = ops;
2370 	rctl_de->rcd_flagaction = global_flags;
2371 
2372 	rctl->rc_dict_entry = rctl_de;
2373 	rctl->rc_values = rctl_val;
2374 
2375 	/*
2376 	 * 1.  Take global lock, validate nonexistence of name, get ID.
2377 	 */
2378 	mutex_enter(&rctl_dict_lock);
2379 
2380 	if (mod_hash_find(rctl_dict_by_name, (mod_hash_key_t)name,
2381 	    (mod_hash_val_t *)&rhndl) != MH_ERR_NOTFOUND)
2382 		panic("duplicate registration of rctl %s", name);
2383 
2384 	rhndl = rctl_de->rcd_id = rctl->rc_id =
2385 	    (rctl_hndl_t)id_alloc(rctl_ids);
2386 
2387 	/*
2388 	 * 2.  Insert name-entry pair in rctl_dict_by_name.
2389 	 */
2390 	if (mod_hash_insert(rctl_dict_by_name, (mod_hash_key_t)name,
2391 	    (mod_hash_val_t)rctl_de))
2392 		panic("unable to insert rctl dict entry for %s (%u)", name,
2393 		    (uint_t)rctl->rc_id);
2394 
2395 	/*
2396 	 * 3.  Insert ID-rctl_t * pair in rctl_dict.
2397 	 */
2398 	if (mod_hash_find(rctl_dict, (mod_hash_key_t)(uintptr_t)rctl->rc_id,
2399 	    (mod_hash_val_t *)&old_rctl) != MH_ERR_NOTFOUND)
2400 		panic("duplicate rctl ID %u registered", rctl->rc_id);
2401 
2402 	if (mod_hash_insert(rctl_dict, (mod_hash_key_t)(uintptr_t)rctl->rc_id,
2403 	    (mod_hash_val_t)rctl))
2404 		panic("unable to insert rctl %s/%u (%p)", name,
2405 		    (uint_t)rctl->rc_id, rctl);
2406 
2407 	/*
2408 	 * 3a. Insert rctl_dict_entry_t * in appropriate entity list.
2409 	 */
2410 
2411 	mutex_enter(&rctl_lists_lock);
2412 
2413 	switch (entity) {
2414 	case RCENTITY_ZONE:
2415 	case RCENTITY_PROJECT:
2416 	case RCENTITY_TASK:
2417 	case RCENTITY_PROCESS:
2418 		rctl_de->rcd_next = rctl_lists[entity];
2419 		rctl_lists[entity] = rctl_de;
2420 		break;
2421 	default:
2422 		panic("registering unknown rctl entity %d (%s)", entity,
2423 		    name);
2424 		break;
2425 	}
2426 
2427 	mutex_exit(&rctl_lists_lock);
2428 
2429 	/*
2430 	 * 4.  Drop lock.
2431 	 */
2432 	mutex_exit(&rctl_dict_lock);
2433 
2434 	return (rhndl);
2435 }
2436 
2437 /*
2438  * static int rctl_global_action(rctl_t *r, rctl_set_t *rset, struct proc *p,
2439  *    rctl_val_t *v)
2440  *
2441  * Overview
2442  *   rctl_global_action() takes, in according with the flags on the rctl_dict
2443  *   entry for the given control, the appropriate actions on the exceeded
2444  *   control value.  Additionally, rctl_global_action() updates the firing time
2445  *   on the exceeded value.
2446  *
2447  * Return values
2448  *   A bitmask reflecting the actions actually taken.
2449  *
2450  * Caller's context
2451  *   No restrictions on context.
2452  */
2453 /*ARGSUSED*/
2454 static int
2455 rctl_global_action(rctl_t *r, rctl_set_t *rset, struct proc *p, rctl_val_t *v)
2456 {
2457 	rctl_dict_entry_t *rde = r->rc_dict_entry;
2458 	const char *pr, *en, *idstr;
2459 	id_t id;
2460 	enum {
2461 		SUFFIX_NONE,	/* id consumed directly */
2462 		SUFFIX_NUMERIC,	/* id consumed in suffix */
2463 		SUFFIX_STRING	/* idstr consumed in suffix */
2464 	} suffix = SUFFIX_NONE;
2465 	int ret = 0;
2466 
2467 	v->rcv_firing_time = gethrtime();
2468 
2469 	switch (v->rcv_privilege) {
2470 	case RCPRIV_BASIC:
2471 		pr = "basic";
2472 		break;
2473 	case RCPRIV_PRIVILEGED:
2474 		pr = "privileged";
2475 		break;
2476 	case RCPRIV_SYSTEM:
2477 		pr = "system";
2478 		break;
2479 	default:
2480 		pr = "unknown";
2481 		break;
2482 	}
2483 
2484 	switch (rde->rcd_entity) {
2485 	case RCENTITY_PROCESS:
2486 		en = "process";
2487 		id = p->p_pid;
2488 		suffix = SUFFIX_NONE;
2489 		break;
2490 	case RCENTITY_TASK:
2491 		en = "task";
2492 		id = p->p_task->tk_tkid;
2493 		suffix = SUFFIX_NUMERIC;
2494 		break;
2495 	case RCENTITY_PROJECT:
2496 		en = "project";
2497 		id = p->p_task->tk_proj->kpj_id;
2498 		suffix = SUFFIX_NUMERIC;
2499 		break;
2500 	case RCENTITY_ZONE:
2501 		en = "zone";
2502 		idstr = p->p_zone->zone_name;
2503 		suffix = SUFFIX_STRING;
2504 		break;
2505 	default:
2506 		en = "unknown entity associated with process";
2507 		id = p->p_pid;
2508 		suffix = SUFFIX_NONE;
2509 		break;
2510 	}
2511 
2512 	if (rde->rcd_flagaction & RCTL_GLOBAL_SYSLOG) {
2513 		switch (suffix) {
2514 		default:
2515 		case SUFFIX_NONE:
2516 			(void) strlog(0, 0, 0,
2517 			    rde->rcd_strlog_flags | log_global.lz_active,
2518 			    "%s rctl %s (value %llu) exceeded by %s %d.",
2519 			    pr, rde->rcd_name, v->rcv_value, en, id);
2520 			break;
2521 		case SUFFIX_NUMERIC:
2522 			(void) strlog(0, 0, 0,
2523 			    rde->rcd_strlog_flags | log_global.lz_active,
2524 			    "%s rctl %s (value %llu) exceeded by process %d"
2525 			    " in %s %d.",
2526 			    pr, rde->rcd_name, v->rcv_value, p->p_pid,
2527 			    en, id);
2528 			break;
2529 		case SUFFIX_STRING:
2530 			(void) strlog(0, 0, 0,
2531 			    rde->rcd_strlog_flags | log_global.lz_active,
2532 			    "%s rctl %s (value %llu) exceeded by process %d"
2533 			    " in %s %s.",
2534 			    pr, rde->rcd_name, v->rcv_value, p->p_pid,
2535 			    en, idstr);
2536 			break;
2537 		}
2538 	}
2539 
2540 	if (rde->rcd_flagaction & RCTL_GLOBAL_DENY_ALWAYS)
2541 		ret |= RCT_DENY;
2542 
2543 	return (ret);
2544 }
2545 
2546 static int
2547 rctl_local_action(rctl_t *r, rctl_set_t *rset, struct proc *p, rctl_val_t *v,
2548     uint_t safety)
2549 {
2550 	int ret = 0;
2551 	sigqueue_t *sqp = NULL;
2552 	rctl_dict_entry_t *rde = r->rc_dict_entry;
2553 	int unobservable = (rde->rcd_flagaction & RCTL_GLOBAL_UNOBSERVABLE);
2554 
2555 	proc_t *recipient = v->rcv_action_recipient;
2556 	id_t recip_pid = v->rcv_action_recip_pid;
2557 	int recip_signal = v->rcv_action_signal;
2558 	uint_t flagaction = v->rcv_flagaction;
2559 
2560 	if (safety == RCA_UNSAFE_ALL) {
2561 		if (flagaction & RCTL_LOCAL_DENY) {
2562 			ret |= RCT_DENY;
2563 		}
2564 		return (ret);
2565 	}
2566 
2567 	if (flagaction & RCTL_LOCAL_SIGNAL) {
2568 		/*
2569 		 * We can build a siginfo only in the case that it is
2570 		 * safe for us to drop p_lock.  (For asynchronous
2571 		 * checks this is currently not true.)
2572 		 */
2573 		if (safety == RCA_SAFE) {
2574 			mutex_exit(&rset->rcs_lock);
2575 			mutex_exit(&p->p_lock);
2576 			sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
2577 			mutex_enter(&p->p_lock);
2578 			mutex_enter(&rset->rcs_lock);
2579 
2580 			sqp->sq_info.si_signo = recip_signal;
2581 			sqp->sq_info.si_code = SI_RCTL;
2582 			sqp->sq_info.si_errno = 0;
2583 			sqp->sq_info.si_entity = (int)rde->rcd_entity;
2584 		}
2585 
2586 		if (recipient == NULL || recipient == p) {
2587 			ret |= RCT_SIGNAL;
2588 
2589 			if (sqp == NULL) {
2590 				sigtoproc(p, NULL, recip_signal);
2591 			} else if (p == curproc) {
2592 				/*
2593 				 * Then this is a synchronous test and we can
2594 				 * direct the signal at the violating thread.
2595 				 */
2596 				sigaddqa(curproc, curthread, sqp);
2597 			} else {
2598 				sigaddqa(p, NULL, sqp);
2599 			}
2600 		} else if (!unobservable) {
2601 			proc_t *rp;
2602 
2603 			mutex_exit(&rset->rcs_lock);
2604 			mutex_exit(&p->p_lock);
2605 
2606 			mutex_enter(&pidlock);
2607 			if ((rp = prfind(recip_pid)) == recipient) {
2608 				/*
2609 				 * Recipient process is still alive, but may not
2610 				 * be in this task or project any longer.  In
2611 				 * this case, the recipient's resource control
2612 				 * set pertinent to this control will have
2613 				 * changed--and we will not deliver the signal,
2614 				 * as the recipient process is trying to tear
2615 				 * itself off of its former set.
2616 				 */
2617 				mutex_enter(&rp->p_lock);
2618 				mutex_exit(&pidlock);
2619 
2620 				if (rctl_entity_obtain_rset(rde, rp) == rset) {
2621 					ret |= RCT_SIGNAL;
2622 
2623 					if (sqp == NULL)
2624 						sigtoproc(rp, NULL,
2625 						    recip_signal);
2626 					else
2627 						sigaddqa(rp, NULL, sqp);
2628 				} else if (sqp) {
2629 					kmem_free(sqp, sizeof (sigqueue_t));
2630 				}
2631 				mutex_exit(&rp->p_lock);
2632 			} else {
2633 				mutex_exit(&pidlock);
2634 				if (sqp)
2635 					kmem_free(sqp, sizeof (sigqueue_t));
2636 			}
2637 
2638 			mutex_enter(&p->p_lock);
2639 			/*
2640 			 * Since we dropped p_lock, we may no longer be in the
2641 			 * same task or project as we were at entry.  It is thus
2642 			 * unsafe for us to reacquire the set lock at this
2643 			 * point; callers of rctl_local_action() must handle
2644 			 * this possibility.
2645 			 */
2646 			ret |= RCT_LK_ABANDONED;
2647 		} else if (sqp) {
2648 			kmem_free(sqp, sizeof (sigqueue_t));
2649 		}
2650 	}
2651 
2652 	if ((flagaction & RCTL_LOCAL_DENY) &&
2653 	    (recipient == NULL || recipient == p)) {
2654 		ret |= RCT_DENY;
2655 	}
2656 
2657 	return (ret);
2658 }
2659 
2660 /*
2661  * int rctl_action(rctl_hndl_t, rctl_set_t *, struct proc *, uint_t)
2662  *
2663  * Overview
2664  *   Take the action associated with the enforced value (as defined by
2665  *   rctl_get_enforced_value()) being exceeded or encountered.  Possibly perform
2666  *   a restricted subset of the available actions, if circumstances dictate that
2667  *   we cannot safely allocate memory (for a sigqueue_t) or guarantee process
2668  *   persistence across the duration of the function (an asynchronous action).
2669  *
2670  * Return values
2671  *   Actions taken, according to the rctl_test bitmask.
2672  *
2673  * Caller's context
2674  *   Safe to acquire rcs_lock.
2675  */
2676 int
2677 rctl_action(rctl_hndl_t hndl, rctl_set_t *rset, struct proc *p, uint_t safety)
2678 {
2679 	return (rctl_action_entity(hndl, rset, p, NULL, safety));
2680 }
2681 
2682 int
2683 rctl_action_entity(rctl_hndl_t hndl, rctl_set_t *rset, struct proc *p,
2684     rctl_entity_p_t *e, uint_t safety)
2685 {
2686 	int ret = RCT_NONE;
2687 	rctl_t *lrctl;
2688 	rctl_entity_p_t e_tmp;
2689 
2690 rctl_action_acquire:
2691 	mutex_enter(&rset->rcs_lock);
2692 	if (rctl_set_find(rset, hndl, &lrctl) == -1) {
2693 		mutex_exit(&rset->rcs_lock);
2694 		return (ret);
2695 	}
2696 
2697 	if (e == NULL) {
2698 		rctl_entity_obtain_entity_p(lrctl->rc_dict_entry->rcd_entity,
2699 		p, &e_tmp);
2700 		e = &e_tmp;
2701 	}
2702 
2703 	if ((ret & RCT_LK_ABANDONED) == 0) {
2704 		ret |= rctl_global_action(lrctl, rset, p, lrctl->rc_cursor);
2705 
2706 		RCTLOP_ACTION(lrctl, p, e);
2707 
2708 		ret |= rctl_local_action(lrctl, rset, p,
2709 		    lrctl->rc_cursor, safety);
2710 
2711 		if (ret & RCT_LK_ABANDONED)
2712 			goto rctl_action_acquire;
2713 	}
2714 
2715 	ret &= ~RCT_LK_ABANDONED;
2716 
2717 	if (!(ret & RCT_DENY) &&
2718 	    lrctl->rc_cursor->rcv_next != NULL) {
2719 		lrctl->rc_cursor = lrctl->rc_cursor->rcv_next;
2720 
2721 		RCTLOP_SET(lrctl, p, e, rctl_model_value(lrctl->rc_dict_entry,
2722 		    p, lrctl->rc_cursor->rcv_value));
2723 
2724 	}
2725 	mutex_exit(&rset->rcs_lock);
2726 
2727 	return (ret);
2728 }
2729 
2730 /*
2731  * int rctl_test(rctl_hndl_t, rctl_set_t *, struct proc *, rctl_qty_t, uint_t)
2732  *
2733  * Overview
2734  *   Increment the resource associated with the given handle, returning zero if
2735  *   the incremented value does not exceed the threshold for the current limit
2736  *   on the resource.
2737  *
2738  * Return values
2739  *   Actions taken, according to the rctl_test bitmask.
2740  *
2741  * Caller's context
2742  *   p_lock held by caller.
2743  */
2744 /*ARGSUSED*/
2745 int
2746 rctl_test(rctl_hndl_t rhndl, rctl_set_t *rset, struct proc *p,
2747     rctl_qty_t incr, uint_t flags)
2748 {
2749 	return (rctl_test_entity(rhndl, rset, p, NULL, incr, flags));
2750 }
2751 
2752 int
2753 rctl_test_entity(rctl_hndl_t rhndl, rctl_set_t *rset, struct proc *p,
2754     rctl_entity_p_t *e, rctl_qty_t incr, uint_t flags)
2755 {
2756 	rctl_t *lrctl;
2757 	int ret = RCT_NONE;
2758 	rctl_entity_p_t e_tmp;
2759 	if (p == &p0) {
2760 		/*
2761 		 * We don't enforce rctls on the kernel itself.
2762 		 */
2763 		return (ret);
2764 	}
2765 
2766 rctl_test_acquire:
2767 	ASSERT(MUTEX_HELD(&p->p_lock));
2768 
2769 	mutex_enter(&rset->rcs_lock);
2770 
2771 	/*
2772 	 * Dereference from rctl_set.  We don't enforce newly loaded controls
2773 	 * that haven't been set on this entity (since the only valid value is
2774 	 * the infinite system value).
2775 	 */
2776 	if (rctl_set_find(rset, rhndl, &lrctl) == -1) {
2777 		mutex_exit(&rset->rcs_lock);
2778 		return (ret);
2779 	}
2780 
2781 	/*
2782 	 * This control is currently unenforced:  maximal value on control
2783 	 * supporting infinitely available resource.
2784 	 */
2785 	if ((lrctl->rc_dict_entry->rcd_flagaction & RCTL_GLOBAL_INFINITE) &&
2786 	    (lrctl->rc_cursor->rcv_flagaction & RCTL_LOCAL_MAXIMAL)) {
2787 
2788 		mutex_exit(&rset->rcs_lock);
2789 		return (ret);
2790 	}
2791 
2792 	/*
2793 	 * If we have been called by rctl_test, look up the entity pointer
2794 	 * from the proc pointer.
2795 	 */
2796 	if (e == NULL) {
2797 		rctl_entity_obtain_entity_p(lrctl->rc_dict_entry->rcd_entity,
2798 		p, &e_tmp);
2799 		e = &e_tmp;
2800 	}
2801 
2802 	/*
2803 	 * Get enforced rctl value and current usage.  Test the increment
2804 	 * with the current usage against the enforced value--take action as
2805 	 * necessary.
2806 	 */
2807 	while (RCTLOP_TEST(lrctl, p, e, lrctl->rc_cursor, incr, flags)) {
2808 		if ((ret & RCT_LK_ABANDONED) == 0) {
2809 			ret |= rctl_global_action(lrctl, rset, p,
2810 			    lrctl->rc_cursor);
2811 
2812 			RCTLOP_ACTION(lrctl, p, e);
2813 
2814 			ret |= rctl_local_action(lrctl, rset, p,
2815 			    lrctl->rc_cursor, flags);
2816 
2817 			if (ret & RCT_LK_ABANDONED)
2818 				goto rctl_test_acquire;
2819 		}
2820 
2821 		ret &= ~RCT_LK_ABANDONED;
2822 
2823 		if ((ret & RCT_DENY) == RCT_DENY ||
2824 		    lrctl->rc_cursor->rcv_next == NULL) {
2825 			ret |= RCT_DENY;
2826 			break;
2827 		}
2828 
2829 		lrctl->rc_cursor = lrctl->rc_cursor->rcv_next;
2830 		RCTLOP_SET(lrctl, p, e, rctl_model_value(lrctl->rc_dict_entry,
2831 		    p, lrctl->rc_cursor->rcv_value));
2832 	}
2833 
2834 	mutex_exit(&rset->rcs_lock);
2835 
2836 	return (ret);
2837 }
2838 
2839 /*
2840  * void rctl_init(void)
2841  *
2842  * Overview
2843  *   Initialize the rctl subsystem, including the primoridal rctls
2844  *   provided by the system.  New subsystem-specific rctls should _not_ be
2845  *   initialized here.  (Do it in your own file.)
2846  *
2847  * Return values
2848  *   None.
2849  *
2850  * Caller's context
2851  *   Safe for KM_SLEEP allocations.  Must be called prior to any process model
2852  *   initialization.
2853  */
2854 void
2855 rctl_init(void)
2856 {
2857 	rctl_cache = kmem_cache_create("rctl_cache", sizeof (rctl_t),
2858 	    0, NULL, NULL, NULL, NULL, NULL, 0);
2859 	rctl_val_cache = kmem_cache_create("rctl_val_cache",
2860 	    sizeof (rctl_val_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
2861 
2862 	rctl_dict = mod_hash_create_extended("rctl_dict",
2863 	    rctl_dict_size, mod_hash_null_keydtor, rctl_dict_val_dtor,
2864 	    rctl_dict_hash_by_id, NULL, rctl_dict_id_cmp, KM_SLEEP);
2865 	rctl_dict_by_name = mod_hash_create_strhash(
2866 	    "rctl_handles_by_name", rctl_dict_size,
2867 	    mod_hash_null_valdtor);
2868 	rctl_ids = id_space_create("rctl_ids", 1, max_rctl_hndl);
2869 	bzero(rctl_lists, (RC_MAX_ENTITY + 1) * sizeof (rctl_dict_entry_t *));
2870 
2871 	rctlproc_init();
2872 }
2873 
2874 /*
2875  * rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc)
2876  *
2877  * Increments the amount of locked memory on a project, and
2878  * zone. If proj is NULL, the proj and zone of proc_t p is used.  If
2879  * chargeproc is non-zero, then the charged amount is cached on p->p_locked_mem
2880  * so that the charge can be migrated when a process changes projects.
2881  *
2882  * Return values
2883  *    0 - success
2884  *    EAGAIN - attempting to increment locked memory is denied by one
2885  *      or more resource entities.
2886  */
2887 int
2888 rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
2889     int chargeproc)
2890 {
2891 	kproject_t *projp;
2892 	zone_t *zonep;
2893 	rctl_entity_p_t e;
2894 	int ret = 0;
2895 
2896 	ASSERT(p != NULL);
2897 	ASSERT(MUTEX_HELD(&p->p_lock));
2898 	if (proj != NULL) {
2899 		projp = proj;
2900 		zonep = zone_find_by_id(projp->kpj_zoneid);
2901 	} else {
2902 		projp = p->p_task->tk_proj;
2903 		zonep = p->p_zone;
2904 	}
2905 
2906 	mutex_enter(&zonep->zone_mem_lock);
2907 
2908 	e.rcep_p.proj = projp;
2909 	e.rcep_t = RCENTITY_PROJECT;
2910 	if (projp->kpj_data.kpd_locked_mem + inc >
2911 	    projp->kpj_data.kpd_locked_mem_ctl) {
2912 		if (rctl_test_entity(rc_project_locked_mem, projp->kpj_rctls,
2913 		    p, &e, inc, 0) & RCT_DENY) {
2914 			ret = EAGAIN;
2915 			goto out;
2916 		}
2917 	}
2918 	e.rcep_p.zone = zonep;
2919 	e.rcep_t = RCENTITY_ZONE;
2920 	if (zonep->zone_locked_mem + inc > zonep->zone_locked_mem_ctl) {
2921 		if (rctl_test_entity(rc_zone_locked_mem, zonep->zone_rctls,
2922 		    p, &e, inc, 0) & RCT_DENY) {
2923 			ret = EAGAIN;
2924 			goto out;
2925 		}
2926 	}
2927 
2928 	zonep->zone_locked_mem += inc;
2929 	projp->kpj_data.kpd_locked_mem += inc;
2930 	if (chargeproc != 0) {
2931 		p->p_locked_mem += inc;
2932 	}
2933 out:
2934 	mutex_exit(&zonep->zone_mem_lock);
2935 	if (proj != NULL)
2936 		zone_rele(zonep);
2937 	return (ret);
2938 }
2939 
2940 /*
2941  * rctl_decr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc)
2942  *
2943  * Decrements the amount of locked memory on a project and
2944  * zone.  If proj is NULL, the proj and zone of proc_t p is used.  If
2945  * creditproc is non-zero, then the quantity of locked memory is subtracted
2946  * from p->p_locked_mem.
2947  *
2948  * Return values
2949  *   none
2950  */
2951 void
2952 rctl_decr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc,
2953     int creditproc)
2954 {
2955 	kproject_t *projp;
2956 	zone_t *zonep;
2957 
2958 	if (proj != NULL) {
2959 		projp = proj;
2960 		zonep = zone_find_by_id(projp->kpj_zoneid);
2961 	} else {
2962 		ASSERT(p != NULL);
2963 		ASSERT(MUTEX_HELD(&p->p_lock));
2964 		projp = p->p_task->tk_proj;
2965 		zonep = p->p_zone;
2966 	}
2967 
2968 	mutex_enter(&zonep->zone_mem_lock);
2969 	zonep->zone_locked_mem -= inc;
2970 	projp->kpj_data.kpd_locked_mem -= inc;
2971 	if (creditproc != 0) {
2972 		ASSERT(p != NULL);
2973 		ASSERT(MUTEX_HELD(&p->p_lock));
2974 		p->p_locked_mem -= inc;
2975 	}
2976 	mutex_exit(&zonep->zone_mem_lock);
2977 	if (proj != NULL)
2978 		zone_rele(zonep);
2979 }
2980 
2981 /*
2982  * rctl_incr_swap(proc_t *, zone_t *, size_t)
2983  *
2984  * Overview
2985  *   Increments the swap charge on the specified zone.
2986  *
2987  * Return values
2988  *   0 on success.  EAGAIN if swap increment fails due an rctl value
2989  *   on the zone.
2990  *
2991  * Callers context
2992  *   p_lock held on specified proc.
2993  *   swap must be even multiple of PAGESIZE
2994  */
2995 int
2996 rctl_incr_swap(proc_t *proc, zone_t *zone, size_t swap)
2997 {
2998 	rctl_entity_p_t e;
2999 
3000 	ASSERT(MUTEX_HELD(&proc->p_lock));
3001 	ASSERT((swap & PAGEOFFSET) == 0);
3002 	e.rcep_p.zone = zone;
3003 	e.rcep_t = RCENTITY_ZONE;
3004 
3005 	mutex_enter(&zone->zone_mem_lock);
3006 
3007 	if ((zone->zone_max_swap + swap) >
3008 	    zone->zone_max_swap_ctl) {
3009 
3010 		if (rctl_test_entity(rc_zone_max_swap, zone->zone_rctls,
3011 		    proc, &e, swap, 0) & RCT_DENY) {
3012 			mutex_exit(&zone->zone_mem_lock);
3013 			return (EAGAIN);
3014 		}
3015 	}
3016 	zone->zone_max_swap += swap;
3017 	mutex_exit(&zone->zone_mem_lock);
3018 	return (0);
3019 }
3020 
3021 /*
3022  * rctl_decr_swap(zone_t *, size_t)
3023  *
3024  * Overview
3025  *   Decrements the swap charge on the specified zone.
3026  *
3027  * Return values
3028  *   None
3029  *
3030  * Callers context
3031  *   swap must be even multiple of PAGESIZE
3032  */
3033 void
3034 rctl_decr_swap(zone_t *zone, size_t swap)
3035 {
3036 	ASSERT((swap & PAGEOFFSET) == 0);
3037 	mutex_enter(&zone->zone_mem_lock);
3038 	ASSERT(zone->zone_max_swap >= swap);
3039 	zone->zone_max_swap -= swap;
3040 	mutex_exit(&zone->zone_mem_lock);
3041 }
3042 
3043 /*
3044  * Create resource kstat
3045  */
3046 static kstat_t *
3047 rctl_kstat_create_common(char *ks_name, int ks_instance, char *ks_class,
3048     uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags, int ks_zoneid)
3049 {
3050 	kstat_t *ksp = NULL;
3051 	char name[KSTAT_STRLEN];
3052 
3053 	(void) snprintf(name, KSTAT_STRLEN, "%s_%d", ks_name, ks_instance);
3054 
3055 	if ((ksp = kstat_create_zone("caps", ks_zoneid,
3056 		name, ks_class, ks_type,
3057 		ks_ndata, ks_flags, ks_zoneid)) != NULL) {
3058 		if (ks_zoneid != GLOBAL_ZONEID)
3059 			kstat_zone_add(ksp, GLOBAL_ZONEID);
3060 	}
3061 	return (ksp);
3062 }
3063 
3064 /*
3065  * Create zone-specific resource kstat
3066  */
3067 kstat_t *
3068 rctl_kstat_create_zone(zone_t *zone, char *ks_name, uchar_t ks_type,
3069     uint_t ks_ndata, uchar_t ks_flags)
3070 {
3071 	char name[KSTAT_STRLEN];
3072 
3073 	(void) snprintf(name, KSTAT_STRLEN, "%s_zone", ks_name);
3074 
3075 	return (rctl_kstat_create_common(name, zone->zone_id, "zone_caps",
3076 	    ks_type, ks_ndata, ks_flags, zone->zone_id));
3077 }
3078 
3079 /*
3080  * Create project-specific resource kstat
3081  */
3082 kstat_t *
3083 rctl_kstat_create_project(kproject_t *kpj, char *ks_name, uchar_t ks_type,
3084     uint_t ks_ndata, uchar_t ks_flags)
3085 {
3086 	char name[KSTAT_STRLEN];
3087 
3088 	(void) snprintf(name, KSTAT_STRLEN, "%s_project", ks_name);
3089 
3090 	return (rctl_kstat_create_common(name, kpj->kpj_id, "project_caps",
3091 	    ks_type, ks_ndata, ks_flags, kpj->kpj_zoneid));
3092 }
3093